/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <assert.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>

#include "anv_private.h"

#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"
#include "common/intel_genX_state_brw.h"
#include "common/intel_guardband.h"
#include "common/intel_tiled_render.h"
#include "compiler/brw_prim.h"

const uint32_t genX(vk_to_intel_blend)[] = {
   [VK_BLEND_FACTOR_ZERO]                    = BLENDFACTOR_ZERO,
   [VK_BLEND_FACTOR_ONE]                     = BLENDFACTOR_ONE,
   [VK_BLEND_FACTOR_SRC_COLOR]               = BLENDFACTOR_SRC_COLOR,
   [VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR]     = BLENDFACTOR_INV_SRC_COLOR,
   [VK_BLEND_FACTOR_DST_COLOR]               = BLENDFACTOR_DST_COLOR,
   [VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR]     = BLENDFACTOR_INV_DST_COLOR,
   [VK_BLEND_FACTOR_SRC_ALPHA]               = BLENDFACTOR_SRC_ALPHA,
   [VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA]     = BLENDFACTOR_INV_SRC_ALPHA,
   [VK_BLEND_FACTOR_DST_ALPHA]               = BLENDFACTOR_DST_ALPHA,
   [VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA]     = BLENDFACTOR_INV_DST_ALPHA,
   [VK_BLEND_FACTOR_CONSTANT_COLOR]          = BLENDFACTOR_CONST_COLOR,
   [VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR]= BLENDFACTOR_INV_CONST_COLOR,
   [VK_BLEND_FACTOR_CONSTANT_ALPHA]          = BLENDFACTOR_CONST_ALPHA,
   [VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA]= BLENDFACTOR_INV_CONST_ALPHA,
   [VK_BLEND_FACTOR_SRC_ALPHA_SATURATE]      = BLENDFACTOR_SRC_ALPHA_SATURATE,
   [VK_BLEND_FACTOR_SRC1_COLOR]              = BLENDFACTOR_SRC1_COLOR,
   [VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR]    = BLENDFACTOR_INV_SRC1_COLOR,
   [VK_BLEND_FACTOR_SRC1_ALPHA]              = BLENDFACTOR_SRC1_ALPHA,
   [VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA]    = BLENDFACTOR_INV_SRC1_ALPHA,
};

static const uint32_t genX(vk_to_intel_blend_op)[] = {
   [VK_BLEND_OP_ADD]                         = BLENDFUNCTION_ADD,
   [VK_BLEND_OP_SUBTRACT]                    = BLENDFUNCTION_SUBTRACT,
   [VK_BLEND_OP_REVERSE_SUBTRACT]            = BLENDFUNCTION_REVERSE_SUBTRACT,
   [VK_BLEND_OP_MIN]                         = BLENDFUNCTION_MIN,
   [VK_BLEND_OP_MAX]                         = BLENDFUNCTION_MAX,
};

static void
genX(streamout_prologue)(struct anv_cmd_buffer *cmd_buffer)
{
#if INTEL_WA_16013994831_GFX_VER
   /* Wa_16013994831 - Disable preemption during streamout and re-enable it
    * if XFB is not used by the current pipeline.
    *
    * Although this workaround applies to Gfx12+, we already disable object
    * level preemption for another reason in genX_state.c so we can skip this
    * for Gfx12.
    */
   if (!intel_needs_workaround(cmd_buffer->device->info, 16013994831))
      return;

   struct anv_graphics_pipeline *pipeline =
      anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
   if (pipeline->uses_xfb) {
      genX(cmd_buffer_set_preemption)(cmd_buffer, false);
      return;
   }

   if (!cmd_buffer->state.gfx.object_preemption)
      genX(cmd_buffer_set_preemption)(cmd_buffer, true);
#endif
}

#if GFX_VER >= 12
static uint32_t
get_cps_state_offset(struct anv_cmd_buffer *cmd_buffer, bool cps_enabled,
                     const struct vk_fragment_shading_rate_state *fsr)
{
   struct anv_device *device = cmd_buffer->device;

   if (!cps_enabled)
      return device->cps_states.offset;

   uint32_t offset;
   static const uint32_t size_index[] = {
      [1] = 0,
      [2] = 1,
      [4] = 2,
   };
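
   /* The device's cps_states buffer is a flattened table of CPS_STATE
    * groups, each group holding one entry per viewport (MAX_VIEWPORTS).
    * Group 0 is the disabled state (hence the "1 +" below); the remaining
    * groups are indexed by the combiner ops (on Gfx12.5+) and the
    * width/height indices above, matching the strides in the offset
    * computation below.
    */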

#if GFX_VERx10 >= 125
   offset =
      1 + /* skip disabled */
      fsr->combiner_ops[0] * 5 * 3 * 3 +
      fsr->combiner_ops[1] * 3 * 3 +
      size_index[fsr->fragment_size.width] * 3 +
      size_index[fsr->fragment_size.height];
#else
   offset =
      1 + /* skip disabled */
      size_index[fsr->fragment_size.width] * 3 +
      size_index[fsr->fragment_size.height];
#endif

   offset *= MAX_VIEWPORTS * GENX(CPS_STATE_length) * 4;

   return device->cps_states.offset + offset;
}
#endif /* GFX_VER >= 12 */

static bool
has_ds_feedback_loop(const struct vk_dynamic_graphics_state *dyn)
{
   return dyn->feedback_loops & (VK_IMAGE_ASPECT_DEPTH_BIT |
                                 VK_IMAGE_ASPECT_STENCIL_BIT);
}

UNUSED static bool
want_stencil_pma_fix(struct anv_cmd_buffer *cmd_buffer,
                     const struct vk_dynamic_graphics_state *dyn,
                     const struct vk_depth_stencil_state *ds)
{
   if (GFX_VER > 9)
      return false;
   assert(GFX_VER == 9);

   /* From the Skylake PRM Vol. 2c CACHE_MODE_1::STC PMA Optimization Enable:
    *
    *    Clearing this bit will force the STC cache to wait for pending
    *    retirement of pixels at the HZ-read stage and do the STC-test for
    *    Non-promoted, R-computed and Computed depth modes instead of
    *    postponing the STC-test to RCPFE.
    *
    *    STC_TEST_EN = 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE &&
    *                  3DSTATE_WM_DEPTH_STENCIL::StencilTestEnable
    *
    *    STC_WRITE_EN = 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE &&
    *                   (3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable &&
    *                    3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE)
    *
    *    COMP_STC_EN = STC_TEST_EN &&
    *                  3DSTATE_PS_EXTRA::PixelShaderComputesStencil
    *
    *    SW parses the pipeline states to generate the following logical
    *    signal indicating if PMA FIX can be enabled.
    *
    *    STC_PMA_OPT =
    *       3DSTATE_WM::ForceThreadDispatch != 1 &&
    *       !(3DSTATE_RASTER::ForceSampleCount != NUMRASTSAMPLES_0) &&
    *       3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL &&
    *       3DSTATE_DEPTH_BUFFER::HIZ Enable &&
    *       !(3DSTATE_WM::EDSC_Mode == 2) &&
    *       3DSTATE_PS_EXTRA::PixelShaderValid &&
    *       !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
    *         3DSTATE_WM_HZ_OP::DepthBufferResolve ||
    *         3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
    *         3DSTATE_WM_HZ_OP::StencilBufferClear) &&
    *       (COMP_STC_EN || STC_WRITE_EN) &&
    *       ((3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
    *         3DSTATE_WM::ForceKillPix == ON ||
    *         3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
    *         3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
    *         3DSTATE_PS_BLEND::AlphaTestEnable ||
    *         3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) ||
    *        (3DSTATE_PS_EXTRA::Pixel Shader Computed Depth mode != PSCDEPTH_OFF))
    */

   /* These are always true:
    *    3DSTATE_WM::ForceThreadDispatch != 1 &&
    *    !(3DSTATE_RASTER::ForceSampleCount != NUMRASTSAMPLES_0)
    */

   /* We only enable the PMA fix if we know for certain that HiZ is enabled.
    * If we don't know whether HiZ is enabled or not, we disable the PMA fix
    * and there is no harm.
    *
    * (3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL) &&
    * 3DSTATE_DEPTH_BUFFER::HIZ Enable
    */
   if (!cmd_buffer->state.hiz_enabled)
      return false;

   /* We can't possibly know if HiZ is enabled without the depth attachment */
   ASSERTED const struct anv_image_view *d_iview =
      cmd_buffer->state.gfx.depth_att.iview;
   assert(d_iview && d_iview->image->planes[0].aux_usage == ISL_AUX_USAGE_HIZ);

   /* 3DSTATE_PS_EXTRA::PixelShaderValid */
   struct anv_graphics_pipeline *pipeline =
      anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
   if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT))
      return false;

   /* !(3DSTATE_WM::EDSC_Mode == 2) */
   const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
   if (wm_prog_data->early_fragment_tests)
      return false;

   /* We never use anv_pipeline for HiZ ops so this is trivially true:
    *    !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
    *      3DSTATE_WM_HZ_OP::DepthBufferResolve ||
    *      3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
    *      3DSTATE_WM_HZ_OP::StencilBufferClear)
    */

   /* 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE &&
    * 3DSTATE_WM_DEPTH_STENCIL::StencilTestEnable
    */
   const bool stc_test_en = ds->stencil.test_enable;

   /* 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE &&
    * (3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable &&
    *  3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE)
    */
   const bool stc_write_en = ds->stencil.write_enable;

   /* STC_TEST_EN && 3DSTATE_PS_EXTRA::PixelShaderComputesStencil */
   const bool comp_stc_en = stc_test_en && wm_prog_data->computed_stencil;

   /* COMP_STC_EN || STC_WRITE_EN */
   if (!(comp_stc_en || stc_write_en))
      return false;

   /* (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
    *  3DSTATE_WM::ForceKillPix == ON ||
    *  3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
    *  3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
    *  3DSTATE_PS_BLEND::AlphaTestEnable ||
    *  3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) ||
    * (3DSTATE_PS_EXTRA::Pixel Shader Computed Depth mode != PSCDEPTH_OFF)
    */
   return pipeline->kill_pixel ||
          pipeline->rp_has_ds_self_dep ||
          has_ds_feedback_loop(dyn) ||
          wm_prog_data->computed_depth_mode != PSCDEPTH_OFF;
}

static void
genX(rasterization_mode)(VkPolygonMode raster_mode,
                         VkLineRasterizationModeKHR line_mode,
                         float line_width,
                         uint32_t *api_mode,
                         bool *msaa_rasterization_enable)
{
   if (raster_mode == VK_POLYGON_MODE_LINE) {
      /* Unfortunately, configuring our line rasterization hardware on gfx8
       * and later is rather painful.  Instead of giving us bits to tell the
       * hardware what line mode to use like we had on gfx7, we now have an
       * arcane combination of API Mode and MSAA enable bits which do things
       * in a table which are expected to magically put the hardware into the
       * right mode for your API.  Sadly, Vulkan isn't any of the APIs the
       * hardware people thought of so nothing works the way you want it to.
       *
       * Look at the table titled "Multisample Rasterization Modes" in Vol 7
       * of the Skylake PRM for more details.
       */
      switch (line_mode) {
      case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_EXT:
         *api_mode = DX101;
#if GFX_VER <= 9
         /* Prior to ICL, the algorithm the HW uses to draw wide lines
          * doesn't quite match what the CTS expects, at least for rectangular
          * lines, so we set this to false here, making it draw parallelograms
          * instead, which work well enough.
          */
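         /* Note: 1.0078125 is 1 + 1/128, presumably one step of the HW's
          * fixed-point line-width encoding, so MSAA rasterization stays
          * enabled only for lines no wider than one pixel.
          */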
         *msaa_rasterization_enable = line_width < 1.0078125;
#else
         *msaa_rasterization_enable = true;
#endif
         break;

      case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_EXT:
      case VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT:
         *api_mode = DX9OGL;
         *msaa_rasterization_enable = false;
         break;

      default:
         unreachable("Unsupported line rasterization mode");
      }
   } else {
      *api_mode = DX101;
      *msaa_rasterization_enable = true;
   }
}

static bool
is_src1_blend_factor(enum GENX(3D_Color_Buffer_Blend_Factor) factor)
{
   return factor == BLENDFACTOR_SRC1_COLOR ||
          factor == BLENDFACTOR_SRC1_ALPHA ||
          factor == BLENDFACTOR_INV_SRC1_COLOR ||
          factor == BLENDFACTOR_INV_SRC1_ALPHA;
}

#if GFX_VERx10 == 125
/**
 * Return the dimensions of the current rendering area, defined as the
 * bounding box of all present color, depth and stencil attachments.
 */
UNUSED static bool
calculate_render_area(struct anv_cmd_buffer *cmd_buffer,
                      unsigned *width, unsigned *height)
{
   struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;

   *width = gfx->render_area.offset.x + gfx->render_area.extent.width;
   *height = gfx->render_area.offset.y + gfx->render_area.extent.height;

   for (unsigned i = 0; i < gfx->color_att_count; i++) {
      struct anv_attachment *att = &gfx->color_att[i];
      if (att->iview) {
         *width = MAX2(*width, att->iview->vk.extent.width);
         *height = MAX2(*height, att->iview->vk.extent.height);
      }
   }

   const struct anv_image_view *const z_view = gfx->depth_att.iview;
   if (z_view) {
      *width = MAX2(*width, z_view->vk.extent.width);
      *height = MAX2(*height, z_view->vk.extent.height);
   }

   const struct anv_image_view *const s_view = gfx->stencil_att.iview;
   if (s_view) {
      *width = MAX2(*width, s_view->vk.extent.width);
      *height = MAX2(*height, s_view->vk.extent.height);
   }

   return *width && *height;
}

/* Calculate TBIMR tiling parameters adequate for the current pipeline
 * setup.  Return true if TBIMR should be enabled.
 */
UNUSED static bool
calculate_tile_dimensions(struct anv_cmd_buffer *cmd_buffer,
                          unsigned fb_width, unsigned fb_height,
                          unsigned *tile_width, unsigned *tile_height)
{
   const struct anv_device *device = cmd_buffer->device;
   struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;

   assert(GFX_VER == 12);
   const unsigned aux_scale = ISL_MAIN_TO_CCS_SIZE_RATIO_XE;

   unsigned pixel_size = 0;

   /* Perform a rough calculation of the tile cache footprint of the
    * pixel pipeline, approximating it as the sum of the amount of
    * memory used per pixel by every render target, depth, stencil and
    * auxiliary surfaces bound to the pipeline.
    */
   for (uint32_t i = 0; i < gfx->color_att_count; i++) {
      struct anv_attachment *att = &gfx->color_att[i];

      if (att->iview) {
         const struct anv_image *image = att->iview->image;
         const unsigned p = anv_image_aspect_to_plane(image,
                                                      VK_IMAGE_ASPECT_COLOR_BIT);
         const struct anv_image_plane *plane = &image->planes[p];

         pixel_size += intel_calculate_surface_pixel_size(
            &plane->primary_surface.isl);

         if (isl_aux_usage_has_mcs(att->aux_usage))
            pixel_size += intel_calculate_surface_pixel_size(
               &plane->aux_surface.isl);

         if (isl_aux_usage_has_ccs(att->aux_usage))
            pixel_size += DIV_ROUND_UP(intel_calculate_surface_pixel_size(
                                          &plane->primary_surface.isl),
                                       aux_scale);
      }
   }

   const struct anv_image_view *const z_view = gfx->depth_att.iview;
   if (z_view) {
      const struct anv_image *image = z_view->image;
      assert(image->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT);
      const unsigned p = anv_image_aspect_to_plane(image,
                                                   VK_IMAGE_ASPECT_DEPTH_BIT);
      const struct anv_image_plane *plane = &image->planes[p];

      pixel_size += intel_calculate_surface_pixel_size(
         &plane->primary_surface.isl);

      if (isl_aux_usage_has_hiz(image->planes[p].aux_usage))
         pixel_size += intel_calculate_surface_pixel_size(
            &plane->aux_surface.isl);

      if (isl_aux_usage_has_ccs(image->planes[p].aux_usage))
         pixel_size += DIV_ROUND_UP(intel_calculate_surface_pixel_size(
                                       &plane->primary_surface.isl),
                                    aux_scale);
   }

   const struct anv_image_view *const s_view = gfx->stencil_att.iview;
   if (s_view && s_view != z_view) {
      const struct anv_image *image = s_view->image;
      assert(image->vk.aspects & VK_IMAGE_ASPECT_STENCIL_BIT);
      const unsigned p = anv_image_aspect_to_plane(image,
                                                   VK_IMAGE_ASPECT_STENCIL_BIT);
      const struct anv_image_plane *plane = &image->planes[p];

      pixel_size += intel_calculate_surface_pixel_size(
         &plane->primary_surface.isl);
   }

   if (!pixel_size)
      return false;

   /* Compute a tile layout that allows reasonable utilization of the
    * tile cache based on the per-pixel cache footprint estimated
    * above.
    */
   intel_calculate_tile_dimensions(device->info, cmd_buffer->state.current_l3_config,
                                   32, 32, fb_width, fb_height,
                                   pixel_size, tile_width, tile_height);

   /* Perform TBIMR tile passes only if the framebuffer covers more
    * than a single tile.
    */
   return *tile_width < fb_width || *tile_height < fb_height;
}
#endif

/**
 * This function takes the vulkan runtime values & dirty states and updates
 * the values in anv_gfx_dynamic_state, flagging HW instructions for
 * reemission if the values are changing.
 *
 * Nothing is emitted in the batch buffer.
 *
 * Returns a mask for state that we want to leave dirty afterwards.
 */
anv_cmd_dirty_mask_t
genX(cmd_buffer_flush_gfx_runtime_state)(struct anv_cmd_buffer *cmd_buffer)
{
   UNUSED struct anv_device *device = cmd_buffer->device;
   struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
   const struct anv_graphics_pipeline *pipeline =
      anv_pipeline_to_graphics(gfx->base.pipeline);
   const struct vk_dynamic_graphics_state *dyn =
      &cmd_buffer->vk.dynamic_graphics_state;
   struct anv_gfx_dynamic_state *hw_state = &gfx->dyn_state;
   const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
   struct anv_instance *instance = cmd_buffer->device->physical->instance;
   anv_cmd_dirty_mask_t dirty_state_mask = 0;

#define GET(field) hw_state->field
#define SET(bit, field, value)                               \
   do {                                                      \
      __typeof(hw_state->field) __v = value;                 \
      if (hw_state->field != __v) {                          \
         hw_state->field = __v;                              \
         BITSET_SET(hw_state->dirty, ANV_GFX_STATE_##bit);   \
      }                                                      \
   } while (0)
#define SET_STAGE(bit, field, value, stage)                  \
   do {                                                      \
      __typeof(hw_state->field) __v = value;                 \
      if (!anv_pipeline_has_stage(pipeline,                  \
                                  MESA_SHADER_##stage)) {    \
         hw_state->field = __v;                              \
         break;                                              \
      }                                                      \
      if (hw_state->field != __v) {                          \
         hw_state->field = __v;                              \
         BITSET_SET(hw_state->dirty, ANV_GFX_STATE_##bit);   \
      }                                                      \
   } while (0)
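
/* For example, SET(RASTER, raster.CullMode, value) writes
 * hw_state->raster.CullMode and flags ANV_GFX_STATE_RASTER for re-emission
 * only when the value actually changes, while SET_STAGE() skips the dirty
 * tracking entirely when the given shader stage is not part of the
 * pipeline.
 */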

#define SETUP_PROVOKING_VERTEX(bit, cmd, mode)                         \
   switch (mode) {                                                     \
   case VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT:                     \
      SET(bit, cmd.TriangleStripListProvokingVertexSelect, 0);         \
      SET(bit, cmd.LineStripListProvokingVertexSelect,     0);         \
      SET(bit, cmd.TriangleFanProvokingVertexSelect,       1);         \
      break;                                                           \
   case VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT:                      \
      SET(bit, cmd.TriangleStripListProvokingVertexSelect, 2);         \
      SET(bit, cmd.LineStripListProvokingVertexSelect,     1);         \
      SET(bit, cmd.TriangleFanProvokingVertexSelect,       2);         \
      break;                                                           \
   default:                                                            \
      unreachable("Invalid provoking vertex mode");                    \
   }                                                                   \

   UNUSED bool fs_msaa_changed = false;
   if ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_ALPHA_TO_COVERAGE_ENABLE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_FSR)) {
      enum intel_msaa_flags fs_msaa_flags = 0;

      if (wm_prog_data) {
         /* If we have any dynamic bits here, we might need to update the
          * value in the push constant for the shader.
          */
         if (wm_prog_data->coarse_pixel_dispatch == BRW_SOMETIMES ||
             wm_prog_data->persample_dispatch == BRW_SOMETIMES ||
             wm_prog_data->alpha_to_coverage == BRW_SOMETIMES) {
            fs_msaa_flags = INTEL_MSAA_FLAG_ENABLE_DYNAMIC;

            if (dyn->ms.rasterization_samples > 1) {
               fs_msaa_flags |= INTEL_MSAA_FLAG_MULTISAMPLE_FBO;

               if (wm_prog_data->sample_shading) {
                  assert(wm_prog_data->persample_dispatch != BRW_NEVER);
                  fs_msaa_flags |= INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH;
               }
               if ((pipeline->sample_shading_enable &&
                    (pipeline->min_sample_shading * dyn->ms.rasterization_samples) > 1) ||
                   wm_prog_data->sample_shading) {
                  fs_msaa_flags |= INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH |
                                   INTEL_MSAA_FLAG_PERSAMPLE_INTERP;
               }
            }

            if (wm_prog_data->coarse_pixel_dispatch == BRW_SOMETIMES &&
                !(fs_msaa_flags & INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH)) {
               fs_msaa_flags |= INTEL_MSAA_FLAG_COARSE_PI_MSG |
                                INTEL_MSAA_FLAG_COARSE_RT_WRITES;
            }

            if (wm_prog_data->alpha_to_coverage == BRW_SOMETIMES &&
                dyn->ms.alpha_to_coverage_enable)
               fs_msaa_flags |= INTEL_MSAA_FLAG_ALPHA_TO_COVERAGE;

            /* Check the last push constant value and update */
            if (gfx->base.push_constants.gfx.fs_msaa_flags != fs_msaa_flags) {
               gfx->base.push_constants.gfx.fs_msaa_flags = fs_msaa_flags;
               cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT;
               gfx->base.push_constants_data_dirty = true;
            }
         }
      }

      if (fs_msaa_flags != gfx->fs_msaa_flags) {
         gfx->fs_msaa_flags = fs_msaa_flags;
         gfx->dirty |= ANV_CMD_DIRTY_FS_MSAA_FLAGS;
      }
   }

   if ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) ||
       (gfx->dirty & ANV_CMD_DIRTY_FS_MSAA_FLAGS) ||
       (gfx->dirty & ANV_CMD_DIRTY_COARSE_PIXEL_ACTIVE)) {
      if (wm_prog_data) {
         const struct anv_shader_bin *fs_bin =
            pipeline->base.shaders[MESA_SHADER_FRAGMENT];

         struct GENX(3DSTATE_PS) ps = {};
         intel_set_ps_dispatch_state(&ps, device->info, wm_prog_data,
                                     MAX2(dyn->ms.rasterization_samples, 1),
                                     gfx->fs_msaa_flags);

         SET(PS, ps.KernelStartPointer0,
             fs_bin->kernel.offset +
             brw_wm_prog_data_prog_offset(wm_prog_data, ps, 0));
         SET(PS, ps.KernelStartPointer1,
             fs_bin->kernel.offset +
             brw_wm_prog_data_prog_offset(wm_prog_data, ps, 1));
#if GFX_VER < 20
         SET(PS, ps.KernelStartPointer2,
             fs_bin->kernel.offset +
             brw_wm_prog_data_prog_offset(wm_prog_data, ps, 2));
#endif

         SET(PS, ps.DispatchGRFStartRegisterForConstantSetupData0,
             brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 0));
         SET(PS, ps.DispatchGRFStartRegisterForConstantSetupData1,
             brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 1));
#if GFX_VER < 20
         SET(PS, ps.DispatchGRFStartRegisterForConstantSetupData2,
             brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 2));
#endif

#if GFX_VER < 20
         SET(PS, ps._8PixelDispatchEnable,  ps._8PixelDispatchEnable);
         SET(PS, ps._16PixelDispatchEnable, ps._16PixelDispatchEnable);
         SET(PS, ps._32PixelDispatchEnable, ps._32PixelDispatchEnable);
#else
         SET(PS, ps.Kernel0Enable,            ps.Kernel0Enable);
         SET(PS, ps.Kernel1Enable,            ps.Kernel1Enable);
         SET(PS, ps.Kernel0SIMDWidth,         ps.Kernel0SIMDWidth);
         SET(PS, ps.Kernel1SIMDWidth,         ps.Kernel1SIMDWidth);
         SET(PS, ps.Kernel0PolyPackingPolicy, ps.Kernel0PolyPackingPolicy);
#endif

         SET(PS, ps.PositionXYOffsetSelect,
             !wm_prog_data->uses_pos_offset ? POSOFFSET_NONE :
             brw_wm_prog_data_is_persample(wm_prog_data, gfx->fs_msaa_flags) ?
             POSOFFSET_SAMPLE : POSOFFSET_CENTROID);

         SET(PS_EXTRA, ps_extra.PixelShaderIsPerSample,
             brw_wm_prog_data_is_persample(wm_prog_data, gfx->fs_msaa_flags));
#if GFX_VER >= 11
         const bool uses_coarse_pixel =
            brw_wm_prog_data_is_coarse(wm_prog_data, gfx->fs_msaa_flags);
         SET(PS_EXTRA, ps_extra.PixelShaderIsPerCoarsePixel, uses_coarse_pixel);
#endif
#if GFX_VERx10 >= 125
         enum anv_coarse_pixel_state cps_state = uses_coarse_pixel ?
            ANV_COARSE_PIXEL_STATE_ENABLED : ANV_COARSE_PIXEL_STATE_DISABLED;
         bool cps_state_toggled =
            genX(cmd_buffer_set_coarse_pixel_active)(cmd_buffer, cps_state);
         if (cps_state_toggled)
            dirty_state_mask |= ANV_CMD_DIRTY_COARSE_PIXEL_ACTIVE;

         const bool needs_ps_dependency =
            /* TODO: We should only require this when the last geometry shader
             *       uses a fragment shading rate that is not constant.
             */
            uses_coarse_pixel || cps_state_toggled;
         SET(PS_EXTRA, ps_extra.EnablePSDependencyOnCPsizeChange, needs_ps_dependency);
#endif
         SET(WM, wm.BarycentricInterpolationMode,
             wm_prog_data_barycentric_modes(wm_prog_data, gfx->fs_msaa_flags));
      } else {
#if GFX_VER < 20
         SET(PS, ps._8PixelDispatchEnable,  false);
         SET(PS, ps._16PixelDispatchEnable, false);
         SET(PS, ps._32PixelDispatchEnable, false);
#else
         SET(PS, ps.Kernel0Enable, false);
         SET(PS, ps.Kernel1Enable, false);
#endif
      }
   }

   if ((gfx->dirty & (ANV_CMD_DIRTY_PIPELINE |
                      ANV_CMD_DIRTY_XFB_ENABLE |
                      ANV_CMD_DIRTY_OCCLUSION_QUERY_ACTIVE)) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_RASTERIZER_DISCARD_ENABLE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_RASTERIZATION_STREAM) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_PROVOKING_VERTEX)) {
      SET(STREAMOUT, so.RenderingDisable, dyn->rs.rasterizer_discard_enable);
      SET(STREAMOUT, so.RenderStreamSelect, dyn->rs.rasterization_stream);

#if INTEL_NEEDS_WA_18022508906
      /* Wa_18022508906:
       *
       * SKL PRMs, Volume 7: 3D-Media-GPGPU, Stream Output Logic (SOL) Stage:
       *
       * SOL_INT::Render_Enable =
       *   (3DSTATE_STREAMOUT::Force_Rendering == Force_On) ||
       *   (
       *     (3DSTATE_STREAMOUT::Force_Rendering != Force_Off) &&
       *     !(3DSTATE_GS::Enable && 3DSTATE_GS::Output Vertex Size == 0) &&
       *     !3DSTATE_STREAMOUT::API_Render_Disable &&
       *     (
       *       3DSTATE_DEPTH_STENCIL_STATE::Stencil_TestEnable ||
       *       3DSTATE_DEPTH_STENCIL_STATE::Depth_TestEnable ||
       *       3DSTATE_DEPTH_STENCIL_STATE::Depth_WriteEnable ||
       *       3DSTATE_PS_EXTRA::PS_Valid ||
       *       3DSTATE_WM::Legacy Depth_Buffer_Clear ||
       *       3DSTATE_WM::Legacy Depth_Buffer_Resolve_Enable ||
       *       3DSTATE_WM::Legacy Hierarchical_Depth_Buffer_Resolve_Enable
       *     )
       *   )
       *
       * If SOL_INT::Render_Enable is false, the SO stage will not forward any
       * topologies down the pipeline, which is not what we want for occlusion
       * queries.
       *
       * Here we force rendering to get SOL_INT::Render_Enable when occlusion
       * queries are active.
       */
      SET(STREAMOUT, so.ForceRendering,
          (!GET(so.RenderingDisable) && gfx->n_occlusion_queries > 0) ?
          Force_on : 0);
#endif

      switch (dyn->rs.provoking_vertex) {
      case VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT:
         SET(STREAMOUT, so.ReorderMode, LEADING);
         SET_STAGE(GS, gs.ReorderMode, LEADING, GEOMETRY);
         break;

      case VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT:
         SET(STREAMOUT, so.ReorderMode, TRAILING);
         SET_STAGE(GS, gs.ReorderMode, TRAILING, GEOMETRY);
         break;

      default:
         unreachable("Invalid provoking vertex mode");
      }
   }

   if ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_TOPOLOGY)) {
      uint32_t topology;
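      /* With tessellation enabled, the IA topology must be a PATCHLIST
       * whose control-point count comes from dynamic state; everything
       * else translates directly through the lookup table.
       */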
      if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL))
         topology = _3DPRIM_PATCHLIST(dyn->ts.patch_control_points);
      else
         topology = genX(vk_to_intel_primitive_type)[dyn->ia.primitive_topology];

      gfx->primitive_topology = topology;

      SET(VF_TOPOLOGY, vft.PrimitiveTopologyType, topology);
   }

   if ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDINGS_VALID) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDING_STRIDES))
      BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VERTEX_INPUT);

#if GFX_VER >= 11
   if (cmd_buffer->device->vk.enabled_extensions.KHR_fragment_shading_rate &&
       ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) ||
        (gfx->dirty & ANV_CMD_DIRTY_FS_MSAA_FLAGS) ||
        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_FSR))) {
      const bool cps_enable = wm_prog_data &&
         brw_wm_prog_data_is_coarse(wm_prog_data, gfx->fs_msaa_flags);
#if GFX_VER == 11
      SET(CPS, cps.CoarsePixelShadingMode,
               cps_enable ? CPS_MODE_CONSTANT : CPS_MODE_NONE);
      SET(CPS, cps.MinCPSizeX, dyn->fsr.fragment_size.width);
      SET(CPS, cps.MinCPSizeY, dyn->fsr.fragment_size.height);
#elif GFX_VER >= 12
      SET(CPS, cps.CoarsePixelShadingStateArrayPointer,
               get_cps_state_offset(cmd_buffer, cps_enable, &dyn->fsr));
#endif
   }
#endif /* GFX_VER >= 11 */

   if ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_TS_DOMAIN_ORIGIN)) {
      const struct brw_tes_prog_data *tes_prog_data = get_tes_prog_data(pipeline);

      if (tes_prog_data && anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
         if (dyn->ts.domain_origin == VK_TESSELLATION_DOMAIN_ORIGIN_LOWER_LEFT) {
            SET(TE, te.OutputTopology, tes_prog_data->output_topology);
         } else {
            /* When the origin is upper-left, we have to flip the winding order */
            if (tes_prog_data->output_topology == OUTPUT_TRI_CCW) {
               SET(TE, te.OutputTopology, OUTPUT_TRI_CW);
            } else if (tes_prog_data->output_topology == OUTPUT_TRI_CW) {
               SET(TE, te.OutputTopology, OUTPUT_TRI_CCW);
            } else {
               SET(TE, te.OutputTopology, tes_prog_data->output_topology);
            }
         }
      } else {
         SET(TE, te.OutputTopology, OUTPUT_POINT);
      }
   }

   if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_WIDTH))
      SET(SF, sf.LineWidth, dyn->rs.line.width);

   if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_PROVOKING_VERTEX)) {
      SETUP_PROVOKING_VERTEX(SF, sf, dyn->rs.provoking_vertex);
      SETUP_PROVOKING_VERTEX(CLIP, clip, dyn->rs.provoking_vertex);
   }

   if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS)) {
      /**
       * From the Vulkan Spec:
       *
       *    "VK_DEPTH_BIAS_REPRESENTATION_FLOAT_EXT specifies that the depth
       *     bias representation is a factor of constant r equal to 1."
       *
       * From the SKL PRMs, Volume 7: 3D-Media-GPGPU, Depth Offset:
       *
       *    "When UNORM Depth Buffer is at Output Merger (or no Depth Buffer):
       *
       *     Bias = GlobalDepthOffsetConstant * r + GlobalDepthOffsetScale * MaxDepthSlope
       *
       *     Where r is the minimum representable value > 0 in the depth
       *     buffer format, converted to float32 (note: If state bit Legacy
       *     Global Depth Bias Enable is set, the r term will be forced to
       *     1.0)"
       *
       * When VK_DEPTH_BIAS_REPRESENTATION_FLOAT_EXT is set, enable
       * LegacyGlobalDepthBiasEnable.
       */
      SET(SF, sf.LegacyGlobalDepthBiasEnable,
          dyn->rs.depth_bias.representation ==
          VK_DEPTH_BIAS_REPRESENTATION_FLOAT_EXT);
   }

   if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE))
      SET(CLIP, clip.APIMode, dyn->vp.depth_clip_negative_one_to_one ? APIMODE_OGL : APIMODE_D3D);

   if ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) ||
       (gfx->dirty & ANV_CMD_DIRTY_RENDER_TARGETS) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_TOPOLOGY) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_CULL_MODE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_FRONT_FACE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_ENABLE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_POLYGON_MODE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_MODE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_WIDTH) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_CLIP_ENABLE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_CLAMP_ENABLE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_CONSERVATIVE_MODE)) {
      /* Take dynamic primitive topology into account with
       *    3DSTATE_RASTER::APIMode
       *    3DSTATE_RASTER::DXMultisampleRasterizationEnable
       *    3DSTATE_RASTER::AntialiasingEnable
       */
      uint32_t api_mode = 0;
      bool msaa_raster_enable = false;

      const VkLineRasterizationModeKHR line_mode =
         anv_line_rasterization_mode(dyn->rs.line.mode,
                                     dyn->ms.rasterization_samples);

      const VkPolygonMode dynamic_raster_mode =
         genX(raster_polygon_mode)(pipeline,
                                   dyn->rs.polygon_mode,
                                   dyn->ia.primitive_topology);

      genX(rasterization_mode)(dynamic_raster_mode,
                               line_mode, dyn->rs.line.width,
                               &api_mode, &msaa_raster_enable);

      /* From the Broadwell PRM, Volume 2, documentation for
       * 3DSTATE_RASTER, "Antialiasing Enable":
       *
       * "This field must be disabled if any of the render targets
       * have integer (UINT or SINT) surface format."
       *
       * Additionally internal documentation for Gfx12+ states:
       *
       * "This bit MUST not be set when NUM_MULTISAMPLES > 1 OR
       *  FORCED_SAMPLE_COUNT > 1."
       */
      const bool aa_enable =
         anv_rasterization_aa_mode(dynamic_raster_mode, line_mode) &&
         !gfx->has_uint_rt &&
         !(GFX_VER >= 12 && gfx->samples > 1);

      const bool depth_clip_enable =
         vk_rasterization_state_depth_clip_enable(&dyn->rs);

      const bool xy_clip_test_enable =
         (dynamic_raster_mode == VK_POLYGON_MODE_FILL);

      SET(CLIP, clip.ViewportXYClipTestEnable, xy_clip_test_enable);

      SET(RASTER, raster.APIMode, api_mode);
      SET(RASTER, raster.DXMultisampleRasterizationEnable, msaa_raster_enable);
      SET(RASTER, raster.AntialiasingEnable, aa_enable);
      SET(RASTER, raster.CullMode, genX(vk_to_intel_cullmode)[dyn->rs.cull_mode]);
      SET(RASTER, raster.FrontWinding, genX(vk_to_intel_front_face)[dyn->rs.front_face]);
      SET(RASTER, raster.GlobalDepthOffsetEnableSolid, dyn->rs.depth_bias.enable);
      SET(RASTER, raster.GlobalDepthOffsetEnableWireframe, dyn->rs.depth_bias.enable);
      SET(RASTER, raster.GlobalDepthOffsetEnablePoint, dyn->rs.depth_bias.enable);
      SET(RASTER, raster.GlobalDepthOffsetConstant, dyn->rs.depth_bias.constant);
      SET(RASTER, raster.GlobalDepthOffsetScale, dyn->rs.depth_bias.slope);
      SET(RASTER, raster.GlobalDepthOffsetClamp, dyn->rs.depth_bias.clamp);
      SET(RASTER, raster.FrontFaceFillMode, genX(vk_to_intel_fillmode)[dyn->rs.polygon_mode]);
      SET(RASTER, raster.BackFaceFillMode, genX(vk_to_intel_fillmode)[dyn->rs.polygon_mode]);
      SET(RASTER, raster.ViewportZFarClipTestEnable, depth_clip_enable);
      SET(RASTER, raster.ViewportZNearClipTestEnable, depth_clip_enable);
      SET(RASTER, raster.ConservativeRasterizationEnable,
                  dyn->rs.conservative_mode !=
                  VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT);
   }

   if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES)) {
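      /* The field takes log2 of the sample count; __builtin_ffs(x) - 1
       * computes exactly that for the power-of-two counts Vulkan allows
       * (1 -> 0, 2 -> 1, 4 -> 2, ...), with MAX2() guarding the
       * zero-sample case.
       */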
      SET(MULTISAMPLE, ms.NumberofMultisamples,
          __builtin_ffs(MAX2(dyn->ms.rasterization_samples, 1)) - 1);
   }

   if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_SAMPLE_MASK)) {
      /* From the Vulkan 1.0 spec:
       *    If pSampleMask is NULL, it is treated as if the mask has all bits
       *    enabled, i.e. no coverage is removed from fragments.
       *
       * 3DSTATE_SAMPLE_MASK.SampleMask is 16 bits.
       */
      SET(SAMPLE_MASK, sm.SampleMask, dyn->ms.sample_mask & 0xffff);
   }

   if ((gfx->dirty & ANV_CMD_DIRTY_RENDER_TARGETS) ||
#if GFX_VER == 9
       /* For the PMA fix */
       (gfx->dirty & ANV_CMD_DIRTY_PIPELINE) ||
#endif
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_TEST_ENABLE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_WRITE_ENABLE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_COMPARE_OP) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_OP) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE)) {
      VkImageAspectFlags ds_aspects = 0;
      if (gfx->depth_att.vk_format != VK_FORMAT_UNDEFINED)
         ds_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT;
      if (gfx->stencil_att.vk_format != VK_FORMAT_UNDEFINED)
         ds_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;

      struct vk_depth_stencil_state opt_ds = dyn->ds;
      vk_optimize_depth_stencil_state(&opt_ds, ds_aspects, true);

      SET(WM_DEPTH_STENCIL, ds.DoubleSidedStencilEnable, true);

      SET(WM_DEPTH_STENCIL, ds.StencilTestMask,
                            opt_ds.stencil.front.compare_mask & 0xff);
      SET(WM_DEPTH_STENCIL, ds.StencilWriteMask,
                            opt_ds.stencil.front.write_mask & 0xff);

      SET(WM_DEPTH_STENCIL, ds.BackfaceStencilTestMask, opt_ds.stencil.back.compare_mask & 0xff);
      SET(WM_DEPTH_STENCIL, ds.BackfaceStencilWriteMask, opt_ds.stencil.back.write_mask & 0xff);

      SET(WM_DEPTH_STENCIL, ds.StencilReferenceValue,
                            opt_ds.stencil.front.reference & 0xff);
      SET(WM_DEPTH_STENCIL, ds.BackfaceStencilReferenceValue,
                            opt_ds.stencil.back.reference & 0xff);

      SET(WM_DEPTH_STENCIL, ds.DepthTestEnable, opt_ds.depth.test_enable);
      SET(WM_DEPTH_STENCIL, ds.DepthBufferWriteEnable, opt_ds.depth.write_enable);
      SET(WM_DEPTH_STENCIL, ds.DepthTestFunction,
                            genX(vk_to_intel_compare_op)[opt_ds.depth.compare_op]);
      SET(WM_DEPTH_STENCIL, ds.StencilTestEnable, opt_ds.stencil.test_enable);
      SET(WM_DEPTH_STENCIL, ds.StencilBufferWriteEnable, opt_ds.stencil.write_enable);
      SET(WM_DEPTH_STENCIL, ds.StencilFailOp,
                            genX(vk_to_intel_stencil_op)[opt_ds.stencil.front.op.fail]);
      SET(WM_DEPTH_STENCIL, ds.StencilPassDepthPassOp,
                            genX(vk_to_intel_stencil_op)[opt_ds.stencil.front.op.pass]);
      SET(WM_DEPTH_STENCIL, ds.StencilPassDepthFailOp,
                            genX(vk_to_intel_stencil_op)[opt_ds.stencil.front.op.depth_fail]);
      SET(WM_DEPTH_STENCIL, ds.StencilTestFunction,
                            genX(vk_to_intel_compare_op)[opt_ds.stencil.front.op.compare]);
      SET(WM_DEPTH_STENCIL, ds.BackfaceStencilFailOp,
                            genX(vk_to_intel_stencil_op)[opt_ds.stencil.back.op.fail]);
      SET(WM_DEPTH_STENCIL, ds.BackfaceStencilPassDepthPassOp,
                            genX(vk_to_intel_stencil_op)[opt_ds.stencil.back.op.pass]);
      SET(WM_DEPTH_STENCIL, ds.BackfaceStencilPassDepthFailOp,
                            genX(vk_to_intel_stencil_op)[opt_ds.stencil.back.op.depth_fail]);
      SET(WM_DEPTH_STENCIL, ds.BackfaceStencilTestFunction,
                            genX(vk_to_intel_compare_op)[opt_ds.stencil.back.op.compare]);

#if GFX_VER == 9
      const bool pma = want_stencil_pma_fix(cmd_buffer, dyn, &opt_ds);
      SET(PMA_FIX, pma_fix, pma);
#endif

#if INTEL_WA_18019816803_GFX_VER
      if (intel_needs_workaround(cmd_buffer->device->info, 18019816803)) {
         bool ds_write_state = opt_ds.depth.write_enable || opt_ds.stencil.write_enable;
         SET(WA_18019816803, ds_write_state, ds_write_state);
      }
#endif
   }

#if INTEL_WA_14018283232_GFX_VER
   if (intel_needs_workaround(cmd_buffer->device->info, 14018283232) &&
       ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) ||
        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE))) {
      SET(WA_14018283232, wa_14018283232_toggle,
          dyn->ds.depth.bounds_test.enable &&
          wm_prog_data &&
          wm_prog_data->uses_kill);
   }
#endif

#if GFX_VER >= 12
   if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_BOUNDS)) {
      SET(DEPTH_BOUNDS, db.DepthBoundsTestEnable, dyn->ds.depth.bounds_test.enable);
      /* Only look at updating the bounds if testing is enabled */
      if (dyn->ds.depth.bounds_test.enable) {
         SET(DEPTH_BOUNDS, db.DepthBoundsTestMinValue, dyn->ds.depth.bounds_test.min);
         SET(DEPTH_BOUNDS, db.DepthBoundsTestMaxValue, dyn->ds.depth.bounds_test.max);
      }
   }
#endif

   if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_STIPPLE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_STIPPLE_ENABLE)) {
      SET(LINE_STIPPLE, ls.LineStipplePattern, dyn->rs.line.stipple.pattern);
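      /* The HW wants both the repeat count and its reciprocal; clamping the
       * factor to at least 1 avoids a division by zero when no stipple
       * factor has been set.
       */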
      SET(LINE_STIPPLE, ls.LineStippleInverseRepeatCount,
                        1.0f / MAX2(1, dyn->rs.line.stipple.factor));
      SET(LINE_STIPPLE, ls.LineStippleRepeatCount, dyn->rs.line.stipple.factor);

      SET(WM,           wm.LineStippleEnable, dyn->rs.line.stipple.enable);
   }

   if ((gfx->dirty & ANV_CMD_DIRTY_RESTART_INDEX) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_RESTART_ENABLE)) {
      SET(VF, vf.IndexedDrawCutIndexEnable, dyn->ia.primitive_restart_enable);
      SET(VF, vf.CutIndex, gfx->restart_index);
   }

   if (gfx->dirty & ANV_CMD_DIRTY_INDEX_BUFFER)
      BITSET_SET(hw_state->dirty, ANV_GFX_STATE_INDEX_BUFFER);

#if GFX_VERx10 >= 125
   if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_RESTART_ENABLE))
      SET(VFG, vfg.ListCutIndexEnable, dyn->ia.primitive_restart_enable);
#endif

   if (cmd_buffer->device->vk.enabled_extensions.EXT_sample_locations &&
       (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_SAMPLE_LOCATIONS) ||
        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_SAMPLE_LOCATIONS_ENABLE)))
      BITSET_SET(hw_state->dirty, ANV_GFX_STATE_SAMPLE_PATTERN);

   if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_ATTACHMENT_FEEDBACK_LOOP_ENABLE)) {
      SET_STAGE(PS_EXTRA, ps_extra.PixelShaderKillsPixel,
                wm_prog_data && (pipeline->rp_has_ds_self_dep ||
                                 has_ds_feedback_loop(dyn) ||
                                 wm_prog_data->uses_kill),
                FRAGMENT);
   }

#if GFX_VERx10 >= 125
   if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) {
      SET_STAGE(PS_EXTRA, ps_extra.PixelShaderHasUAV,
                wm_prog_data && wm_prog_data->has_side_effects,
                FRAGMENT);
   }
#else
   if (cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_PIPELINE |
                                      ANV_CMD_DIRTY_OCCLUSION_QUERY_ACTIVE)) {
      /* Prior to Gfx12.5 the HW seems to avoid spawning fragment shaders even
       * if 3DSTATE_PS_EXTRA::PixelShaderKillsPixel=true when
       * 3DSTATE_PS_BLEND::HasWriteableRT=false. This is causing problems with
       * occlusion queries with 0 attachments. There are no CTS tests
       * exercising this but zink+anv fails a bunch of tests like piglit
       * arb_framebuffer_no_attachments-query.
       *
       * Here we choose to tweak the PixelShaderHasUAV to make sure the
       * fragment shaders are run properly.
       */
      SET_STAGE(PS_EXTRA, ps_extra.PixelShaderHasUAV,
                wm_prog_data && (wm_prog_data->has_side_effects ||
                                 (gfx->color_att_count == 0 &&
                                  gfx->n_occlusion_queries > 0)),
                FRAGMENT);
   }
#endif

   if ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) ||
       (gfx->dirty & ANV_CMD_DIRTY_RENDER_TARGETS) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_LOGIC_OP) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_LOGIC_OP_ENABLE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_ALPHA_TO_ONE_ENABLE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_WRITE_MASKS) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_ENABLES) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_EQUATIONS)) {
      const uint8_t color_writes = dyn->cb.color_write_enables;
      bool has_writeable_rt =
         anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT) &&
         !anv_cmd_buffer_all_color_write_masked(cmd_buffer);

      SET(BLEND_STATE, blend.AlphaToCoverageEnable,
                       dyn->ms.alpha_to_coverage_enable);
      SET(BLEND_STATE, blend.AlphaToOneEnable,
                       dyn->ms.alpha_to_one_enable);
      SET(BLEND_STATE, blend.ColorDitherEnable,
          cmd_buffer->state.gfx.rendering_flags & VK_RENDERING_ENABLE_LEGACY_DITHERING_BIT_EXT);

      bool independent_alpha_blend = false;
      /* Wa_14018912822, check if we set these during RT setup. */
      bool color_blend_zero = false;
      bool alpha_blend_zero = false;
      for (uint32_t i = 0; i < MAX_RTS; i++) {
         /* Disable anything above the current number of color attachments. */
         bool write_disabled = i >= gfx->color_att_count ||
                               (color_writes & BITFIELD_BIT(i)) == 0;

         SET(BLEND_STATE, blend.rts[i].WriteDisableAlpha,
                          write_disabled ||
                          (dyn->cb.attachments[i].write_mask &
                           VK_COLOR_COMPONENT_A_BIT) == 0);
         SET(BLEND_STATE, blend.rts[i].WriteDisableRed,
                          write_disabled ||
                          (dyn->cb.attachments[i].write_mask &
                           VK_COLOR_COMPONENT_R_BIT) == 0);
         SET(BLEND_STATE, blend.rts[i].WriteDisableGreen,
                          write_disabled ||
                          (dyn->cb.attachments[i].write_mask &
                           VK_COLOR_COMPONENT_G_BIT) == 0);
         SET(BLEND_STATE, blend.rts[i].WriteDisableBlue,
                          write_disabled ||
                          (dyn->cb.attachments[i].write_mask &
                           VK_COLOR_COMPONENT_B_BIT) == 0);
         /* Vulkan specification 1.2.168, VkLogicOp:
          *
          *   "Logical operations are controlled by the logicOpEnable and
          *   logicOp members of VkPipelineColorBlendStateCreateInfo. If
          *   logicOpEnable is VK_TRUE, then a logical operation selected by
          *   logicOp is applied between each color attachment and the
          *   fragment’s corresponding output value, and blending of all
          *   attachments is treated as if it were disabled."
          *
          * From the Broadwell PRM Volume 2d: Command Reference: Structures:
          * BLEND_STATE_ENTRY:
          *
          *   "Enabling LogicOp and Color Buffer Blending at the same time is
          *   UNDEFINED"
          */
         SET(BLEND_STATE, blend.rts[i].LogicOpFunction,
                          genX(vk_to_intel_logic_op)[dyn->cb.logic_op]);
         SET(BLEND_STATE, blend.rts[i].LogicOpEnable, dyn->cb.logic_op_enable);

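         /* Pre- and post-blend clamping to the render target format range
          * is presumably what gives the clamping behavior Vulkan requires
          * for fixed-point attachments.
          */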
1129          SET(BLEND_STATE, blend.rts[i].ColorClampRange, COLORCLAMP_RTFORMAT);
1130          SET(BLEND_STATE, blend.rts[i].PreBlendColorClampEnable, true);
1131          SET(BLEND_STATE, blend.rts[i].PostBlendColorClampEnable, true);
1132 
1133          /* Setup blend equation. */
1134          SET(BLEND_STATE, blend.rts[i].ColorBlendFunction,
1135                           genX(vk_to_intel_blend_op)[
1136                              dyn->cb.attachments[i].color_blend_op]);
1137          SET(BLEND_STATE, blend.rts[i].AlphaBlendFunction,
1138                           genX(vk_to_intel_blend_op)[
1139                              dyn->cb.attachments[i].alpha_blend_op]);
1140 
1141          if (dyn->cb.attachments[i].src_color_blend_factor !=
1142              dyn->cb.attachments[i].src_alpha_blend_factor ||
1143              dyn->cb.attachments[i].dst_color_blend_factor !=
1144              dyn->cb.attachments[i].dst_alpha_blend_factor ||
1145              dyn->cb.attachments[i].color_blend_op !=
1146              dyn->cb.attachments[i].alpha_blend_op) {
1147             independent_alpha_blend = true;
1148          }
1149 
1150          /* The Dual Source Blending documentation says:
1151           *
1152           * "If SRC1 is included in a src/dst blend factor and
1153           * a DualSource RT Write message is not used, results
1154           * are UNDEFINED. (This reflects the same restriction in DX APIs,
1155           * where undefined results are produced if “o1” is not written
1156           * by a PS – there are no default values defined)."
1157           *
1158           * There is no way to gracefully fix this undefined situation
1159           * so we just disable the blending to prevent possible issues.
1160           */
1161          if (wm_prog_data && !wm_prog_data->dual_src_blend &&
1162              anv_is_dual_src_blend_equation(&dyn->cb.attachments[i])) {
1163             SET(BLEND_STATE, blend.rts[i].ColorBufferBlendEnable, false);
1164          } else {
1165             SET(BLEND_STATE, blend.rts[i].ColorBufferBlendEnable,
1166                              !dyn->cb.logic_op_enable &&
1167                              dyn->cb.attachments[i].blend_enable);
1168          }
1169 
1170          /* Our hardware applies the blend factor prior to the blend function
1171           * regardless of what function is used.  Technically, this means the
1172           * hardware can do MORE than GL or Vulkan specify.  However, it also
1173           * means that, for MIN and MAX, we have to stomp the blend factor to
1174           * ONE to make it a no-op.
1175           */
1176          uint32_t SourceBlendFactor;
1177          uint32_t DestinationBlendFactor;
1178          uint32_t SourceAlphaBlendFactor;
1179          uint32_t DestinationAlphaBlendFactor;
1180          if (dyn->cb.attachments[i].color_blend_op == VK_BLEND_OP_MIN ||
1181              dyn->cb.attachments[i].color_blend_op == VK_BLEND_OP_MAX) {
1182             SourceBlendFactor = BLENDFACTOR_ONE;
1183             DestinationBlendFactor = BLENDFACTOR_ONE;
1184          } else {
1185             SourceBlendFactor = genX(vk_to_intel_blend)[
1186                dyn->cb.attachments[i].src_color_blend_factor];
1187             DestinationBlendFactor = genX(vk_to_intel_blend)[
1188                dyn->cb.attachments[i].dst_color_blend_factor];
1189          }
1190 
1191          if (dyn->cb.attachments[i].alpha_blend_op == VK_BLEND_OP_MIN ||
1192              dyn->cb.attachments[i].alpha_blend_op == VK_BLEND_OP_MAX) {
1193             SourceAlphaBlendFactor = BLENDFACTOR_ONE;
1194             DestinationAlphaBlendFactor = BLENDFACTOR_ONE;
1195          } else {
1196             SourceAlphaBlendFactor = genX(vk_to_intel_blend)[
1197                dyn->cb.attachments[i].src_alpha_blend_factor];
1198             DestinationAlphaBlendFactor = genX(vk_to_intel_blend)[
1199                dyn->cb.attachments[i].dst_alpha_blend_factor];
1200          }
1201 
1202          /* Replace any Src1 blend factor with BLENDFACTOR_ONE if dual
1203           * source blending is not enabled.
1204           */
1205          if (wm_prog_data && !wm_prog_data->dual_src_blend) {
1206             if (is_src1_blend_factor(SourceBlendFactor))
1207                SourceBlendFactor = BLENDFACTOR_ONE;
1208             if (is_src1_blend_factor(DestinationBlendFactor))
1209                DestinationBlendFactor = BLENDFACTOR_ONE;
1210          }
1211 
1212          if (instance->intel_enable_wa_14018912822 &&
1213              intel_needs_workaround(cmd_buffer->device->info, 14018912822) &&
1214              dyn->ms.rasterization_samples > 1) {
1215             if (DestinationBlendFactor == BLENDFACTOR_ZERO) {
1216                DestinationBlendFactor = BLENDFACTOR_CONST_COLOR;
1217                color_blend_zero = true;
1218             }
1219             if (DestinationAlphaBlendFactor == BLENDFACTOR_ZERO) {
1220                DestinationAlphaBlendFactor = BLENDFACTOR_CONST_ALPHA;
1221                alpha_blend_zero = true;
1222             }
1223          }
1224 
1225          SET(BLEND_STATE, blend.rts[i].SourceBlendFactor, SourceBlendFactor);
1226          SET(BLEND_STATE, blend.rts[i].DestinationBlendFactor, DestinationBlendFactor);
1227          SET(BLEND_STATE, blend.rts[i].SourceAlphaBlendFactor, SourceAlphaBlendFactor);
1228          SET(BLEND_STATE, blend.rts[i].DestinationAlphaBlendFactor, DestinationAlphaBlendFactor);
1229       }
1230       gfx->color_blend_zero = color_blend_zero;
1231       gfx->alpha_blend_zero = alpha_blend_zero;
1232 
1233       SET(BLEND_STATE, blend.IndependentAlphaBlendEnable, independent_alpha_blend);
1234 
1235       /* Program 3DSTATE_PS_BLEND to be consistent with the first
1236        * BLEND_STATE_ENTRY.
1237        */
1238       SET(PS_BLEND, ps_blend.HasWriteableRT, has_writeable_rt);
1239       SET(PS_BLEND, ps_blend.ColorBufferBlendEnable, GET(blend.rts[0].ColorBufferBlendEnable));
1240       SET(PS_BLEND, ps_blend.SourceAlphaBlendFactor, GET(blend.rts[0].SourceAlphaBlendFactor));
1241       SET(PS_BLEND, ps_blend.DestinationAlphaBlendFactor, gfx->alpha_blend_zero ?
1242                                                           BLENDFACTOR_CONST_ALPHA :
1243                                                           GET(blend.rts[0].DestinationAlphaBlendFactor));
1244       SET(PS_BLEND, ps_blend.SourceBlendFactor, GET(blend.rts[0].SourceBlendFactor));
1245       SET(PS_BLEND, ps_blend.DestinationBlendFactor, gfx->color_blend_zero ?
1246                                                      BLENDFACTOR_CONST_COLOR :
1247                                                      GET(blend.rts[0].DestinationBlendFactor));
1248       SET(PS_BLEND, ps_blend.AlphaTestEnable, false);
1249       SET(PS_BLEND, ps_blend.IndependentAlphaBlendEnable, GET(blend.IndependentAlphaBlendEnable));
1250       SET(PS_BLEND, ps_blend.AlphaToCoverageEnable, dyn->ms.alpha_to_coverage_enable);
1251    }
1252 
1253    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS)) {
1254       SET(CC_STATE, cc.BlendConstantColorRed,
1255                     gfx->color_blend_zero ? 0.0f : dyn->cb.blend_constants[0]);
1256       SET(CC_STATE, cc.BlendConstantColorGreen,
1257                     gfx->color_blend_zero ? 0.0f : dyn->cb.blend_constants[1]);
1258       SET(CC_STATE, cc.BlendConstantColorBlue,
1259                     gfx->color_blend_zero ? 0.0f : dyn->cb.blend_constants[2]);
1260       SET(CC_STATE, cc.BlendConstantColorAlpha,
1261                     gfx->alpha_blend_zero ? 0.0f : dyn->cb.blend_constants[3]);
1262    }
1263 
1264    if ((gfx->dirty & ANV_CMD_DIRTY_RENDER_AREA) ||
1265        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS) ||
1266        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_SCISSORS) ||
1267        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_CLAMP_ENABLE) ||
1268        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE)) {
1269       struct anv_instance *instance = cmd_buffer->device->physical->instance;
1270       const VkViewport *viewports = dyn->vp.viewports;
1271 
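      /* VK_EXT_depth_clip_control: with negativeOneToOne the NDC depth range
       * is [-1, 1] instead of [0, 1], so halve the scale/offset used in the
       * viewport depth transform below.
       */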
1272       const float scale = dyn->vp.depth_clip_negative_one_to_one ? 0.5f : 1.0f;
1273 
1274       /* From the Vulkan 1.0.45 spec:
1275        *
1276        *    "If the last active vertex processing stage shader entry point's
1277        *     interface does not include a variable decorated with
1278        *     ViewportIndex, then the first viewport is used."
1279        *
1280        * This could mean that we might need to set the MaximumVPIndex based on
1281        * the pipeline's last stage, but if the last shader doesn't write the
1282        * viewport index and the VUE header is used, the compiler will force
1283        * the value to 0 (which is what the spec requires above). Otherwise it
1284        * seems like the HW should be pulling 0 if the VUE header is not
1285        * present.
1286        *
1287        * Avoiding a check on the pipeline seems to prevent additional
1288        * emissions of 3DSTATE_CLIP which appear to impact performance on
1289        * Assassin's Creed Valhalla.
1290        */
1291       SET(CLIP, clip.MaximumVPIndex, dyn->vp.viewport_count > 0 ?
1292                                      dyn->vp.viewport_count - 1 : 0);
1293 
1294       for (uint32_t i = 0; i < dyn->vp.viewport_count; i++) {
1295          const VkViewport *vp = &viewports[i];
1296 
1297          /* The gfx7 state struct has just the matrix and guardband fields;
1298           * the gfx8 struct adds the min/max viewport fields. */
1299          struct GENX(SF_CLIP_VIEWPORT) sfv = {
1300             .ViewportMatrixElementm00 = vp->width / 2,
1301             .ViewportMatrixElementm11 = vp->height / 2,
1302             .ViewportMatrixElementm22 = (vp->maxDepth - vp->minDepth) * scale,
1303             .ViewportMatrixElementm30 = vp->x + vp->width / 2,
1304             .ViewportMatrixElementm31 = vp->y + vp->height / 2,
1305             .ViewportMatrixElementm32 = dyn->vp.depth_clip_negative_one_to_one ?
1306                (vp->minDepth + vp->maxDepth) * scale : vp->minDepth,
1307             .XMinClipGuardband = -1.0f,
1308             .XMaxClipGuardband = 1.0f,
1309             .YMinClipGuardband = -1.0f,
1310             .YMaxClipGuardband = 1.0f,
1311             .XMinViewPort = vp->x,
1312             .XMaxViewPort = vp->x + vp->width - 1,
1313             .YMinViewPort = MIN2(vp->y, vp->y + vp->height),
1314             .YMaxViewPort = MAX2(vp->y, vp->y + vp->height) - 1,
1315          };
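         /* The MIN2/MAX2 on the Y viewport extents above handle
          * negative-height (flipped) viewports as allowed by
          * VK_KHR_maintenance1.
          */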
1316 
1317          /* Fix depth test misrenderings by lowering the translated depth range. */
1318          if (instance->lower_depth_range_rate != 1.0f)
1319             sfv.ViewportMatrixElementm32 *= instance->lower_depth_range_rate;
1320 
1321          const uint32_t fb_size_max = 1 << 14;
1322          uint32_t x_min = 0, x_max = fb_size_max;
1323          uint32_t y_min = 0, y_max = fb_size_max;
1324 
1325          /* If we have a valid renderArea, include that */
1326          if (gfx->render_area.extent.width > 0 &&
1327              gfx->render_area.extent.height > 0) {
1328             x_min = MAX2(x_min, gfx->render_area.offset.x);
1329             x_max = MIN2(x_max, gfx->render_area.offset.x +
1330                                 gfx->render_area.extent.width);
1331             y_min = MAX2(y_min, gfx->render_area.offset.y);
1332             y_max = MIN2(y_max, gfx->render_area.offset.y +
1333                                 gfx->render_area.extent.height);
1334          }
1335 
1336          /* The client is required to have enough scissors for whatever it
1337           * sets as ViewportIndex, but it's possible that it has more
1338           * viewports set from a previous command. Also, from the Vulkan
1339           * 1.3.207 spec:
1340           *
1341           *    "The application must ensure (using scissor if necessary) that
1342           *    all rendering is contained within the render area."
1343           *
1344           * If the client doesn't set a scissor, that basically means it
1345           * guarantees everything is in-bounds already. If we end up using a
1346           * guardband of [-1, 1] in that case, there shouldn't be much loss.
1347           * It's theoretically possible that it could do all its clipping
1348           * with clip planes, but that'd be a bit odd.
1349           */
1350          if (i < dyn->vp.scissor_count) {
1351             const VkRect2D *scissor = &dyn->vp.scissors[i];
1352             x_min = MAX2(x_min, scissor->offset.x);
1353             x_max = MIN2(x_max, scissor->offset.x + scissor->extent.width);
1354             y_min = MAX2(y_min, scissor->offset.y);
1355             y_max = MIN2(y_max, scissor->offset.y + scissor->extent.height);
1356          }
1357 
1358          /* Only bother calculating the guardband if our known render area is
1359           * less than the maximum size. Otherwise, it will calculate [-1, 1]
1360           * anyway but possibly with precision loss.
1361           */
1362          if (x_min > 0 || x_max < fb_size_max ||
1363              y_min > 0 || y_max < fb_size_max) {
1364             intel_calculate_guardband_size(x_min, x_max, y_min, y_max,
1365                                            sfv.ViewportMatrixElementm00,
1366                                            sfv.ViewportMatrixElementm11,
1367                                            sfv.ViewportMatrixElementm30,
1368                                            sfv.ViewportMatrixElementm31,
1369                                            &sfv.XMinClipGuardband,
1370                                            &sfv.XMaxClipGuardband,
1371                                            &sfv.YMinClipGuardband,
1372                                            &sfv.YMaxClipGuardband);
1373          }
1374 
1375 #define SET_VP(bit, state, field)                                        \
1376          do {                                                           \
1377             if (hw_state->state.field != sfv.field) {                   \
1378                hw_state->state.field = sfv.field;                       \
1379                BITSET_SET(hw_state->dirty,                              \
1380                           ANV_GFX_STATE_##bit);                         \
1381             }                                                           \
1382          } while (0)
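         /* Compare-and-set each packed field so VIEWPORT_SF_CLIP is only
          * flagged dirty when a value actually changed.
          */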
1383          SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], ViewportMatrixElementm00);
1384          SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], ViewportMatrixElementm11);
1385          SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], ViewportMatrixElementm22);
1386          SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], ViewportMatrixElementm30);
1387          SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], ViewportMatrixElementm31);
1388          SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], ViewportMatrixElementm32);
1389          SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], XMinClipGuardband);
1390          SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], XMaxClipGuardband);
1391          SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], YMinClipGuardband);
1392          SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], YMaxClipGuardband);
1393          SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], XMinViewPort);
1394          SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], XMaxViewPort);
1395          SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], YMinViewPort);
1396          SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], YMaxViewPort);
1397 #undef SET_VP
1398 
1399          const bool depth_range_unrestricted =
1400             cmd_buffer->device->vk.enabled_extensions.EXT_depth_range_unrestricted;
1401 
1402          float min_depth_limit = depth_range_unrestricted ? -FLT_MAX : 0.0;
1403          float max_depth_limit = depth_range_unrestricted ? FLT_MAX : 1.0;
1404 
1405          float min_depth = dyn->rs.depth_clamp_enable ?
1406                            MIN2(vp->minDepth, vp->maxDepth) : min_depth_limit;
1407          float max_depth = dyn->rs.depth_clamp_enable ?
1408                            MAX2(vp->minDepth, vp->maxDepth) : max_depth_limit;
1409 
1410          SET(VIEWPORT_CC, vp_cc.elem[i].MinimumDepth, min_depth);
1411          SET(VIEWPORT_CC, vp_cc.elem[i].MaximumDepth, max_depth);
1412       }
1413 
1414       /* If the HW state is already considered dirty or the previously
1415        * programmed viewport count is smaller than what we need, update the
1416        * viewport count and ensure the HW state is dirty. Otherwise, if the
1417        * number of viewports programmed previously was larger than what we
1418        * need now, there is no need to reemit; we can keep the old values.
1419        */
1420       if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_SF_CLIP) ||
1421           hw_state->vp_sf_clip.count < dyn->vp.viewport_count) {
1422          hw_state->vp_sf_clip.count = dyn->vp.viewport_count;
1423          BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_SF_CLIP);
1424       }
1425       if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_CC) ||
1426           hw_state->vp_cc.count < dyn->vp.viewport_count) {
1427          hw_state->vp_cc.count = dyn->vp.viewport_count;
1428          BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_CC);
1429       }
1430    }
1431 
1432    if ((gfx->dirty & ANV_CMD_DIRTY_RENDER_AREA) ||
1433        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_SCISSORS) ||
1434        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS)) {
1435       const VkRect2D *scissors = dyn->vp.scissors;
1436       const VkViewport *viewports = dyn->vp.viewports;
1437 
1438       for (uint32_t i = 0; i < dyn->vp.scissor_count; i++) {
1439          const VkRect2D *s = &scissors[i];
1440          const VkViewport *vp = &viewports[i];
1441 
1442          const int max = 0xffff;
1443 
1444          uint32_t y_min = MAX2(s->offset.y, MIN2(vp->y, vp->y + vp->height));
1445          uint32_t x_min = MAX2(s->offset.x, vp->x);
1446          int64_t y_max = MIN2(s->offset.y + s->extent.height - 1,
1447                               MAX2(vp->y, vp->y + vp->height) - 1);
1448          int64_t x_max = MIN2(s->offset.x + s->extent.width - 1,
1449                               vp->x + vp->width - 1);
1450 
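         /* Clamp the inclusive maxima to 2^14 - 1 (INT16_MAX >> 1),
          * presumably in line with the 16K fb_size_max used for the
          * guardband computation above.
          */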
1451          y_max = CLAMP(y_max, 0, INT16_MAX >> 1);
1452          x_max = CLAMP(x_max, 0, INT16_MAX >> 1);
1453 
1454          /* Do this math using int64_t so overflow gets clamped correctly. */
1455          if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
1456             y_min = CLAMP((uint64_t) y_min, gfx->render_area.offset.y, max);
1457             x_min = CLAMP((uint64_t) x_min, gfx->render_area.offset.x, max);
1458             y_max = CLAMP((uint64_t) y_max, 0,
1459                           gfx->render_area.offset.y +
1460                           gfx->render_area.extent.height - 1);
1461             x_max = CLAMP((uint64_t) x_max, 0,
1462                           gfx->render_area.offset.x +
1463                           gfx->render_area.extent.width - 1);
1464          }
1465 
1466          if (s->extent.width <= 0 || s->extent.height <= 0) {
1467             /* Since xmax and ymax are inclusive, we have to have xmax < xmin
1468              * or ymax < ymin for empty clips. In case clip x, y, width, and
1469              * height are all 0, the clamps below produce 0 for xmin, ymin,
1470              * xmax, ymax, which isn't what we want. Just special-case empty
1471              * clips and produce a canonical empty clip.
1472              */
1473             SET(SCISSOR, scissor.elem[i].ScissorRectangleYMin, 1);
1474             SET(SCISSOR, scissor.elem[i].ScissorRectangleXMin, 1);
1475             SET(SCISSOR, scissor.elem[i].ScissorRectangleYMax, 0);
1476             SET(SCISSOR, scissor.elem[i].ScissorRectangleXMax, 0);
1477          } else {
1478             SET(SCISSOR, scissor.elem[i].ScissorRectangleYMin, y_min);
1479             SET(SCISSOR, scissor.elem[i].ScissorRectangleXMin, x_min);
1480             SET(SCISSOR, scissor.elem[i].ScissorRectangleYMax, y_max);
1481             SET(SCISSOR, scissor.elem[i].ScissorRectangleXMax, x_max);
1482          }
1483       }
1484 
1485       /* If the HW state is already considered dirty or the previously
1486        * programmed scissor count is smaller than what we need, update the
1487        * scissor count and ensure the HW state is dirty. Otherwise, if the
1488        * number of scissors programmed previously was larger than what we
1489        * need now, there is no need to reemit; we can keep the old values.
1490        */
1491       if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SCISSOR) ||
1492           hw_state->scissor.count < dyn->vp.scissor_count) {
1493          hw_state->scissor.count = dyn->vp.scissor_count;
1494          BITSET_SET(hw_state->dirty, ANV_GFX_STATE_SCISSOR);
1495       }
1496    }
1497 
1498 #if GFX_VERx10 == 125
1499    if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_RENDER_TARGETS)) {
1500       unsigned fb_width, fb_height, tile_width, tile_height;
1501 
1502       if (cmd_buffer->device->physical->instance->enable_tbimr &&
1503           calculate_render_area(cmd_buffer, &fb_width, &fb_height) &&
1504           calculate_tile_dimensions(cmd_buffer, fb_width, fb_height,
1505                                     &tile_width, &tile_height)) {
1506          /* Use a batch size of 128 polygons per slice as recommended
1507           * by BSpec 68436 "TBIMR Programming".
1508           */
1509          const unsigned num_slices = cmd_buffer->device->info->num_slices;
1510          const unsigned batch_size = DIV_ROUND_UP(num_slices, 2) * 256;
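         /* TBIMRBatchSize appears to be encoded as log2 of the polygon count
          * minus 5 (so 32 polygons -> 0), hence the util_logbase2() below.
          */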
1511 
1512          SET(TBIMR_TILE_PASS_INFO, tbimr.TileRectangleHeight, tile_height);
1513          SET(TBIMR_TILE_PASS_INFO, tbimr.TileRectangleWidth, tile_width);
1514          SET(TBIMR_TILE_PASS_INFO, tbimr.VerticalTileCount,
1515              DIV_ROUND_UP(fb_height, tile_height));
1516          SET(TBIMR_TILE_PASS_INFO, tbimr.HorizontalTileCount,
1517              DIV_ROUND_UP(fb_width, tile_width));
1518          SET(TBIMR_TILE_PASS_INFO, tbimr.TBIMRBatchSize,
1519              util_logbase2(batch_size) - 5);
1520          SET(TBIMR_TILE_PASS_INFO, tbimr.TileBoxCheck, true);
1521          SET(TBIMR_TILE_PASS_INFO, use_tbimr, true);
1522       } else {
1523          hw_state->use_tbimr = false;
1524       }
1525    }
1526 #endif
1527 
1528    struct anv_push_constants *push = &cmd_buffer->state.gfx.base.push_constants;
1529 
1530    /* If the pipeline uses a dynamic value of patch_control_points and either
1531     * the pipeline changed or the dynamic value changed, check the value and
1532     * reemit if needed.
1533     */
1534    if (pipeline->dynamic_patch_control_points &&
1535        ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) ||
1536         BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_TS_PATCH_CONTROL_POINTS)) &&
1537        push->gfx.tcs_input_vertices != dyn->ts.patch_control_points) {
1538       push->gfx.tcs_input_vertices = dyn->ts.patch_control_points;
1539       cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT;
1540       gfx->base.push_constants_data_dirty = true;
1541    }
1542 
1543 #undef GET
1544 #undef SET
1545 #undef SET_STAGE
1546 
1547    vk_dynamic_graphics_state_clear_dirty(&cmd_buffer->vk.dynamic_graphics_state);
1548 
1549    return dirty_state_mask;
1550 }
1551 
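/* Wa_18020335297: emit a degenerate draw (reject-all clipping, one tiny
 * triangle-list draw per slice) after the CC viewport pointer has been
 * reprogrammed. The caller re-dirties every instruction touched here so the
 * real state gets restored afterwards.
 */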
1552 static void
1553 emit_wa_18020335297_dummy_draw(struct anv_cmd_buffer *cmd_buffer)
1554 {
1555 #if GFX_VERx10 >= 125
1556    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VFG), vfg) {
1557       vfg.DistributionMode = RR_STRICT;
1558    }
1559    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF), vf) {
1560       vf.GeometryDistributionEnable = true;
1561    }
1562 #endif
1563 
1564 #if GFX_VER >= 12
1565    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr) {
1566       pr.ReplicaMask = 1;
1567    }
1568 #endif
1569 
1570    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_RASTER), rr) {
1571       rr.CullMode = CULLMODE_NONE;
1572       rr.FrontFaceFillMode = FILL_MODE_SOLID;
1573       rr.BackFaceFillMode = FILL_MODE_SOLID;
1574    }
1575 
1576    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_STATISTICS), zero);
1577    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_SGVS), zero);
1578 
1579 #if GFX_VER >= 11
1580    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_SGVS_2), zero);
1581 #endif
1582 
1583    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CLIP), clip) {
1584       clip.ClipEnable = true;
1585       clip.ClipMode = CLIPMODE_REJECT_ALL;
1586    }
1587 
1588    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VS), zero);
1589    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_GS), zero);
1590    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_HS), zero);
1591    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_TE), zero);
1592    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DS), zero);
1593    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_STREAMOUT), zero);
1594 
1595    uint32_t *vertex_elements = anv_batch_emitn(&cmd_buffer->batch, 1 + 2 * 2,
1596                                                GENX(3DSTATE_VERTEX_ELEMENTS));
1597    uint32_t *ve_pack_dest = &vertex_elements[1];
1598 
1599    for (int i = 0; i < 2; i++) {
1600       struct GENX(VERTEX_ELEMENT_STATE) element = {
1601          .Valid = true,
1602          .SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT,
1603          .Component0Control = VFCOMP_STORE_0,
1604          .Component1Control = VFCOMP_STORE_0,
1605          .Component2Control = i == 0 ? VFCOMP_STORE_0 : VFCOMP_STORE_1_FP,
1606          .Component3Control = i == 0 ? VFCOMP_STORE_0 : VFCOMP_STORE_1_FP,
1607       };
1608       GENX(VERTEX_ELEMENT_STATE_pack)(NULL, ve_pack_dest, &element);
1609       ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
1610    }
1611 
1612    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_TOPOLOGY), topo) {
1613       topo.PrimitiveTopologyType = _3DPRIM_TRILIST;
1614    }
1615 
1616    /* Emit one dummy draw per slice. */
1617    for (unsigned i = 0; i < cmd_buffer->device->info->num_slices; i++) {
1618       anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
1619          prim.VertexCountPerInstance = 3;
1620          prim.PrimitiveTopologyType = _3DPRIM_TRILIST;
1621          prim.InstanceCount = 1;
1622          prim.VertexAccessType = SEQUENTIAL;
1623       }
1624    }
1625 }
1626 
1627 #if INTEL_WA_14018283232_GFX_VER
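/* Wa_14018283232: emit an immediate RESOURCE_BARRIER that signals at the
 * color stage and waits at the pixel stage.
 */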
1628 void
1629 genX(batch_emit_wa_14018283232)(struct anv_batch *batch)
1630 {
1631    anv_batch_emit(batch, GENX(RESOURCE_BARRIER), barrier) {
1632       barrier.ResourceBarrierBody = (struct GENX(RESOURCE_BARRIER_BODY)) {
1633          .BarrierType = RESOURCE_BARRIER_TYPE_IMMEDIATE,
1634          .SignalStage = RESOURCE_BARRIER_STAGE_COLOR,
1635          .WaitStage = RESOURCE_BARRIER_STAGE_PIXEL,
1636       };
1637    }
1638 }
1639 #endif
1640 
1641 /**
1642  * This function handles dirty state emission to the batch buffer.
1643  */
1644 static void
1645 cmd_buffer_gfx_state_emission(struct anv_cmd_buffer *cmd_buffer)
1646 {
1647    struct anv_device *device = cmd_buffer->device;
1648    struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
1649    struct anv_graphics_pipeline *pipeline =
1650       anv_pipeline_to_graphics(gfx->base.pipeline);
1651    const struct vk_dynamic_graphics_state *dyn =
1652       &cmd_buffer->vk.dynamic_graphics_state;
1653    struct anv_gfx_dynamic_state *hw_state = &gfx->dyn_state;
1654    const bool protected = cmd_buffer->vk.pool->flags &
1655                           VK_COMMAND_POOL_CREATE_PROTECTED_BIT;
1656 
1657    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_URB)) {
1658       genX(urb_workaround)(cmd_buffer, &pipeline->urb_cfg);
1659 
1660       anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.urb);
1661 
1662       memcpy(&gfx->urb_cfg, &pipeline->urb_cfg,
1663              sizeof(struct intel_urb_config));
1664    }
1665 
1666    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_PRIMITIVE_REPLICATION))
1667       anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.primitive_replication);
1668 
1669    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VF_SGVS_INSTANCING))
1670       anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.vf_sgvs_instancing);
1671 
1672    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VF_SGVS))
1673       anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.vf_sgvs);
1674 
1675 #if GFX_VER >= 11
1676    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VF_SGVS_2))
1677       anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.vf_sgvs_2);
1678 #endif
1679 
1680    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VS)) {
1681       anv_batch_emit_pipeline_state_protected(&cmd_buffer->batch, pipeline,
1682                                               final.vs, protected);
1683    }
1684 
1685    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_HS)) {
1686       anv_batch_emit_pipeline_state_protected(&cmd_buffer->batch, pipeline,
1687                                               final.hs, protected);
1688    }
1689 
1690    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_DS)) {
1691       anv_batch_emit_pipeline_state_protected(&cmd_buffer->batch, pipeline,
1692                                               final.ds, protected);
1693    }
1694 
1695    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VF_STATISTICS))
1696       anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.vf_statistics);
1697 
1698    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SBE))
1699       anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.sbe);
1700 
1701    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SBE_SWIZ))
1702       anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.sbe_swiz);
1703 
1704    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SO_DECL_LIST)) {
1705       /* Wa_16011773973:
1706        * If SOL is enabled and SO_DECL state has to be programmed,
1707        *    1. Send 3D State SOL state with SOL disabled
1708        *    2. Send SO_DECL NP state
1709        *    3. Send 3D State SOL with SOL Enabled
1710        */
1711       if (intel_needs_workaround(device->info, 16011773973) &&
1712           pipeline->uses_xfb)
1713          anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_STREAMOUT), so);
1714 
1715       anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline,
1716                                     final.so_decl_list);
1717 
1718 #if GFX_VER >= 11 && GFX_VER < 20
1719       /* ICL PRMs, Volume 2a - Command Reference: Instructions,
1720        * 3DSTATE_SO_DECL_LIST:
1721        *
1722        *    "Workaround: This command must be followed by a PIPE_CONTROL with
1723        *     CS Stall bit set."
1724        *
1725        * On DG2+ also known as Wa_1509820217.
1726        */
1727       genx_batch_emit_pipe_control(&cmd_buffer->batch, device->info,
1728                                    cmd_buffer->state.current_pipeline,
1729                                    ANV_PIPE_CS_STALL_BIT);
1730 #endif
1731    }
1732 
1733    if (device->vk.enabled_extensions.EXT_mesh_shader) {
1734       if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_MESH_CONTROL)) {
1735          anv_batch_emit_pipeline_state_protected(&cmd_buffer->batch, pipeline,
1736                                                  final.mesh_control, protected);
1737       }
1738 
1739       if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_MESH_SHADER))
1740          anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.mesh_shader);
1741 
1742       if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_MESH_DISTRIB))
1743          anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.mesh_distrib);
1744 
1745       if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_TASK_CONTROL)) {
1746          anv_batch_emit_pipeline_state_protected(&cmd_buffer->batch, pipeline,
1747                                                  final.task_control, protected);
1748       }
1749 
1750       if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_TASK_SHADER))
1751          anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.task_shader);
1752 
1753       if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_TASK_REDISTRIB))
1754          anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.task_redistrib);
1755 
1756       if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SBE_MESH))
1757          anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.sbe_mesh);
1758 
1759       if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_CLIP_MESH))
1760          anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.clip_mesh);
1761    } else {
1762       assert(!BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_MESH_CONTROL) &&
1763              !BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_MESH_SHADER) &&
1764              !BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_MESH_DISTRIB) &&
1765              !BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_TASK_CONTROL) &&
1766              !BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_TASK_SHADER) &&
1767              !BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_TASK_REDISTRIB) &&
1768              !BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_CLIP_MESH) &&
1769              !BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SBE_MESH));
1770    }
1771 
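/* Helpers copying the values tracked in hw_state into the packed HW
 * structures: INIT() for designated initializers, SET() for instruction
 * fields inside anv_batch_emit*() blocks.
 */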
1772 #define INIT(category, name) \
1773    .name = hw_state->category.name
1774 #define SET(s, category, name) \
1775    s.name = hw_state->category.name
1776 
1777    /* Now the potentially dynamic instructions */
1778 
1779    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_PS)) {
1780       anv_batch_emit_merge_protected(&cmd_buffer->batch, GENX(3DSTATE_PS),
1781                                      pipeline, partial.ps, ps, protected) {
1782          SET(ps, ps, KernelStartPointer0);
1783          SET(ps, ps, KernelStartPointer1);
1784          SET(ps, ps, DispatchGRFStartRegisterForConstantSetupData0);
1785          SET(ps, ps, DispatchGRFStartRegisterForConstantSetupData1);
1786 
1787 #if GFX_VER < 20
1788          SET(ps, ps, KernelStartPointer2);
1789          SET(ps, ps, DispatchGRFStartRegisterForConstantSetupData2);
1790 
1791          SET(ps, ps, _8PixelDispatchEnable);
1792          SET(ps, ps, _16PixelDispatchEnable);
1793          SET(ps, ps, _32PixelDispatchEnable);
1794 #else
1795          SET(ps, ps, Kernel0Enable);
1796          SET(ps, ps, Kernel1Enable);
1797          SET(ps, ps, Kernel0SIMDWidth);
1798          SET(ps, ps, Kernel1SIMDWidth);
1799          SET(ps, ps, Kernel0PolyPackingPolicy);
1800 #endif
1801          SET(ps, ps, PositionXYOffsetSelect);
1802       }
1803    }
1804 
1805    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_PS_EXTRA)) {
1806       anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_PS_EXTRA),
1807                            pipeline, partial.ps_extra, pse) {
1808          SET(pse, ps_extra, PixelShaderHasUAV);
1809          SET(pse, ps_extra, PixelShaderIsPerSample);
1810 #if GFX_VER >= 11
1811          SET(pse, ps_extra, PixelShaderIsPerCoarsePixel);
1812 #endif
1813 #if GFX_VERx10 >= 125
1814          SET(pse, ps_extra, EnablePSDependencyOnCPsizeChange);
1815 #endif
1816          SET(pse, ps_extra, PixelShaderKillsPixel);
1817       }
1818    }
1819 
1820    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_CLIP)) {
1821       anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_CLIP),
1822                            pipeline, partial.clip, clip) {
1823          SET(clip, clip, APIMode);
1824          SET(clip, clip, ViewportXYClipTestEnable);
1825          SET(clip, clip, TriangleStripListProvokingVertexSelect);
1826          SET(clip, clip, LineStripListProvokingVertexSelect);
1827          SET(clip, clip, TriangleFanProvokingVertexSelect);
1828          SET(clip, clip, MaximumVPIndex);
1829       }
1830    }
1831 
1832    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_STREAMOUT)) {
1833       genX(streamout_prologue)(cmd_buffer);
1834 
1835       anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_STREAMOUT),
1836                            pipeline, partial.so, so) {
1837          SET(so, so, RenderingDisable);
1838          SET(so, so, RenderStreamSelect);
1839          SET(so, so, ReorderMode);
1840          SET(so, so, ForceRendering);
1841       }
1842    }
1843 
1844    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_SF_CLIP)) {
1845       struct anv_state sf_clip_state =
1846          anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
1847                                             hw_state->vp_sf_clip.count * 64, 64);
1848 
1849       for (uint32_t i = 0; i < hw_state->vp_sf_clip.count; i++) {
1850          struct GENX(SF_CLIP_VIEWPORT) sfv = {
1851             INIT(vp_sf_clip.elem[i], ViewportMatrixElementm00),
1852             INIT(vp_sf_clip.elem[i], ViewportMatrixElementm11),
1853             INIT(vp_sf_clip.elem[i], ViewportMatrixElementm22),
1854             INIT(vp_sf_clip.elem[i], ViewportMatrixElementm30),
1855             INIT(vp_sf_clip.elem[i], ViewportMatrixElementm31),
1856             INIT(vp_sf_clip.elem[i], ViewportMatrixElementm32),
1857             INIT(vp_sf_clip.elem[i], XMinClipGuardband),
1858             INIT(vp_sf_clip.elem[i], XMaxClipGuardband),
1859             INIT(vp_sf_clip.elem[i], YMinClipGuardband),
1860             INIT(vp_sf_clip.elem[i], YMaxClipGuardband),
1861             INIT(vp_sf_clip.elem[i], XMinViewPort),
1862             INIT(vp_sf_clip.elem[i], XMaxViewPort),
1863             INIT(vp_sf_clip.elem[i], YMinViewPort),
1864             INIT(vp_sf_clip.elem[i], YMaxViewPort),
1865          };
1866          GENX(SF_CLIP_VIEWPORT_pack)(NULL, sf_clip_state.map + i * 64, &sfv);
1867       }
1868 
1869       anv_batch_emit(&cmd_buffer->batch,
1870                      GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP), clip) {
1871          clip.SFClipViewportPointer = sf_clip_state.offset;
1872       }
1873    }
1874 
1875    /* Force CC_VIEWPORT reallocation on Gfx9 when reprogramming
1876     * 3DSTATE_VIEWPORT_STATE_POINTERS_CC:
1877     *    https://gitlab.freedesktop.org/mesa/mesa/-/issues/11647
1878     */
1879    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_CC) ||
1880        (GFX_VER == 9 &&
1881         BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_CC_PTR))) {
1882       hw_state->vp_cc.state =
1883          anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
1884                                             hw_state->vp_cc.count * 8, 32);
1885 
1886       for (uint32_t i = 0; i < hw_state->vp_cc.count; i++) {
1887          struct GENX(CC_VIEWPORT) cc_viewport = {
1888             INIT(vp_cc.elem[i], MinimumDepth),
1889             INIT(vp_cc.elem[i], MaximumDepth),
1890          };
1891          GENX(CC_VIEWPORT_pack)(NULL, hw_state->vp_cc.state.map + i * 8,
1892                                 &cc_viewport);
1893       }
1894 
1895       /* Dirty the pointers to reemit 3DSTATE_VIEWPORT_STATE_POINTERS_CC below
1896        */
1897       BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_CC_PTR);
1898    }
1899 
1900    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_CC_PTR)) {
1901       anv_batch_emit(&cmd_buffer->batch,
1902                      GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), cc) {
1903          cc.CCViewportPointer = hw_state->vp_cc.state.offset;
1904       }
1905       cmd_buffer->state.gfx.viewport_set = true;
1906    }
1907 
1908    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SCISSOR)) {
1909       /* Wa_1409725701:
1910        *
1911        *    "The viewport-specific state used by the SF unit (SCISSOR_RECT) is
1912        *    stored as an array of up to 16 elements. The location of first
1913        *    element of the array, as specified by Pointer to SCISSOR_RECT,
1914        *    should be aligned to a 64-byte boundary.
1915        */
1916       struct anv_state scissor_state =
1917          anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
1918                                             hw_state->scissor.count * 8, 64);
1919 
1920       for (uint32_t i = 0; i < hw_state->scissor.count; i++) {
1921          struct GENX(SCISSOR_RECT) scissor = {
1922             INIT(scissor.elem[i], ScissorRectangleYMin),
1923             INIT(scissor.elem[i], ScissorRectangleXMin),
1924             INIT(scissor.elem[i], ScissorRectangleYMax),
1925             INIT(scissor.elem[i], ScissorRectangleXMax),
1926          };
1927          GENX(SCISSOR_RECT_pack)(NULL, scissor_state.map + i * 8, &scissor);
1928       }
1929 
1930       anv_batch_emit(&cmd_buffer->batch,
1931                      GENX(3DSTATE_SCISSOR_STATE_POINTERS), ssp) {
1932          ssp.ScissorRectPointer = scissor_state.offset;
1933       }
1934    }
1935 
1936    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VF_TOPOLOGY)) {
1937       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_TOPOLOGY), vft) {
1938          SET(vft, vft, PrimitiveTopologyType);
1939       }
1940    }
1941 
1942    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VERTEX_INPUT)) {
1943       const uint32_t ve_count =
1944          pipeline->vs_input_elements + pipeline->svgs_count;
1945       const uint32_t num_dwords = 1 + 2 * MAX2(1, ve_count);
1946       uint32_t *p = anv_batch_emitn(&cmd_buffer->batch, num_dwords,
1947                                     GENX(3DSTATE_VERTEX_ELEMENTS));
1948 
1949       if (p) {
1950          if (ve_count == 0) {
1951             memcpy(p + 1, cmd_buffer->device->physical->empty_vs_input,
1952                    sizeof(cmd_buffer->device->physical->empty_vs_input));
1953          } else if (ve_count == pipeline->vertex_input_elems) {
1954             /* MESA_VK_DYNAMIC_VI is not dynamic for this pipeline, so
1955              * everything is in pipeline->vertex_input_data and we can just
1956              * memcpy
1957              */
1958             memcpy(p + 1, pipeline->vertex_input_data, 4 * 2 * ve_count);
1959             anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline,
1960                                           final.vf_instancing);
1961          } else {
1962             assert(pipeline->final.vf_instancing.len == 0);
1963             /* Use dyn->vi to emit the dynamic VERTEX_ELEMENT_STATE input. */
1964             genX(emit_vertex_input)(&cmd_buffer->batch, p + 1,
1965                                     pipeline, dyn->vi, false /* emit_in_pipeline */);
1966             /* Then append the VERTEX_ELEMENT_STATE for the draw parameters */
1967             memcpy(p + 1 + 2 * pipeline->vs_input_elements,
1968                    pipeline->vertex_input_data,
1969                    4 * 2 * pipeline->vertex_input_elems);
1970          }
1971       }
1972    }
1973 
1974    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_TE)) {
1975       anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_TE),
1976                            pipeline, partial.te, te) {
1977          SET(te, te, OutputTopology);
1978       }
1979    }
1980 
1981    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_GS)) {
1982       anv_batch_emit_merge_protected(&cmd_buffer->batch, GENX(3DSTATE_GS),
1983                                      pipeline, partial.gs, gs, protected) {
1984          SET(gs, gs, ReorderMode);
1985       }
1986    }
1987 
1988    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_CPS)) {
1989 #if GFX_VER == 11
1990       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CPS), cps) {
1991          SET(cps, cps, CoarsePixelShadingMode);
1992          SET(cps, cps, MinCPSizeX);
1993          SET(cps, cps, MinCPSizeY);
1994       }
1995 #elif GFX_VER >= 12
1996       /* TODO: we can optimize this flush in the following cases:
1997        *
1998        *    In the case where the last geometry shader emits a value that is
1999        *    not constant, we can avoid this stall because we can synchronize
2000        *    the pixel shader internally with
2001        *    3DSTATE_PS::EnablePSDependencyOnCPsizeChange.
2002        *
2003        *    If we know that the previous pipeline and the current one are
2004        *    using the same fragment shading rate.
2005        */
2006       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
2007 #if GFX_VERx10 >= 125
2008          pc.PSSStallSyncEnable = true;
2009 #else
2010          pc.PSDSyncEnable = true;
2011 #endif
2012       }
2013 
2014       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CPS_POINTERS), cps) {
2015          SET(cps, cps, CoarsePixelShadingStateArrayPointer);
2016       }
2017 #endif
2018    }
2019 
2020    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SF)) {
2021       anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_SF),
2022                            pipeline, partial.sf, sf) {
2023          SET(sf, sf, LineWidth);
2024          SET(sf, sf, TriangleStripListProvokingVertexSelect);
2025          SET(sf, sf, LineStripListProvokingVertexSelect);
2026          SET(sf, sf, TriangleFanProvokingVertexSelect);
2027          SET(sf, sf, LegacyGlobalDepthBiasEnable);
2028       }
2029    }
2030 
2031    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_RASTER)) {
2032       anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_RASTER),
2033                            pipeline, partial.raster, raster) {
2034          SET(raster, raster, APIMode);
2035          SET(raster, raster, DXMultisampleRasterizationEnable);
2036          SET(raster, raster, AntialiasingEnable);
2037          SET(raster, raster, CullMode);
2038          SET(raster, raster, FrontWinding);
2039          SET(raster, raster, GlobalDepthOffsetEnableSolid);
2040          SET(raster, raster, GlobalDepthOffsetEnableWireframe);
2041          SET(raster, raster, GlobalDepthOffsetEnablePoint);
2042          SET(raster, raster, GlobalDepthOffsetConstant);
2043          SET(raster, raster, GlobalDepthOffsetScale);
2044          SET(raster, raster, GlobalDepthOffsetClamp);
2045          SET(raster, raster, FrontFaceFillMode);
2046          SET(raster, raster, BackFaceFillMode);
2047          SET(raster, raster, ViewportZFarClipTestEnable);
2048          SET(raster, raster, ViewportZNearClipTestEnable);
2049          SET(raster, raster, ConservativeRasterizationEnable);
2050       }
2051    }
2052 
2053    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_MULTISAMPLE)) {
2054       anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_MULTISAMPLE),
2055                            pipeline, partial.ms, ms) {
2056          SET(ms, ms, NumberofMultisamples);
2057       }
2058    }
2059 
2060    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_CC_STATE)) {
2061       hw_state->cc.state =
2062          anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
2063                                             GENX(COLOR_CALC_STATE_length) * 4,
2064                                             64);
2065       struct GENX(COLOR_CALC_STATE) cc = {
2066          INIT(cc, BlendConstantColorRed),
2067          INIT(cc, BlendConstantColorGreen),
2068          INIT(cc, BlendConstantColorBlue),
2069          INIT(cc, BlendConstantColorAlpha),
2070       };
2071       GENX(COLOR_CALC_STATE_pack)(NULL, hw_state->cc.state.map, &cc);
2072 
2073       /* Dirty the pointers to reemit 3DSTATE_CC_STATE_POINTERS below
2074        */
2075       BITSET_SET(hw_state->dirty, ANV_GFX_STATE_CC_STATE_PTR);
2076    }
2077 
2078    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_CC_STATE_PTR)) {
2079       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CC_STATE_POINTERS), ccp) {
2080          ccp.ColorCalcStatePointer = hw_state->cc.state.offset;
2081          ccp.ColorCalcStatePointerValid = true;
2082       }
2083    }
2084 
2085    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SAMPLE_MASK)) {
2086       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_SAMPLE_MASK), sm) {
2087          SET(sm, sm, SampleMask);
2088       }
2089    }
2090 
2091    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_WM_DEPTH_STENCIL)) {
2092       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_WM_DEPTH_STENCIL), ds) {
2093          SET(ds, ds, DoubleSidedStencilEnable);
2094          SET(ds, ds, StencilTestMask);
2095          SET(ds, ds, StencilWriteMask);
2096          SET(ds, ds, BackfaceStencilTestMask);
2097          SET(ds, ds, BackfaceStencilWriteMask);
2098          SET(ds, ds, StencilReferenceValue);
2099          SET(ds, ds, BackfaceStencilReferenceValue);
2100          SET(ds, ds, DepthTestEnable);
2101          SET(ds, ds, DepthBufferWriteEnable);
2102          SET(ds, ds, DepthTestFunction);
2103          SET(ds, ds, StencilTestEnable);
2104          SET(ds, ds, StencilBufferWriteEnable);
2105          SET(ds, ds, StencilFailOp);
2106          SET(ds, ds, StencilPassDepthPassOp);
2107          SET(ds, ds, StencilPassDepthFailOp);
2108          SET(ds, ds, StencilTestFunction);
2109          SET(ds, ds, BackfaceStencilFailOp);
2110          SET(ds, ds, BackfaceStencilPassDepthPassOp);
2111          SET(ds, ds, BackfaceStencilPassDepthFailOp);
2112          SET(ds, ds, BackfaceStencilTestFunction);
2113       }
2114    }
2115 
2116 #if GFX_VER >= 12
2117    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_DEPTH_BOUNDS)) {
2118       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DEPTH_BOUNDS), db) {
2119          SET(db, db, DepthBoundsTestEnable);
2120          SET(db, db, DepthBoundsTestMinValue);
2121          SET(db, db, DepthBoundsTestMaxValue);
2122       }
2123    }
2124 #endif
2125 
2126    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_LINE_STIPPLE)) {
2127       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_LINE_STIPPLE), ls) {
2128          SET(ls, ls, LineStipplePattern);
2129          SET(ls, ls, LineStippleInverseRepeatCount);
2130          SET(ls, ls, LineStippleRepeatCount);
2131       }
2132 #if GFX_VER >= 11
2133       /* ICL PRMs, Volume 2a - Command Reference: Instructions,
2134        * 3DSTATE_LINE_STIPPLE:
2135        *
2136        *    "Workaround: This command must be followed by a PIPE_CONTROL with
2137        *     CS Stall bit set."
2138        */
2139       genx_batch_emit_pipe_control(&cmd_buffer->batch,
2140                                    cmd_buffer->device->info,
2141                                    cmd_buffer->state.current_pipeline,
2142                                    ANV_PIPE_CS_STALL_BIT);
2143 #endif
2144    }
2145 
2146    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VF)) {
2147       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF), vf) {
2148 #if GFX_VERx10 >= 125
2149          vf.GeometryDistributionEnable = true;
2150 #endif
2151          SET(vf, vf, IndexedDrawCutIndexEnable);
2152          SET(vf, vf, CutIndex);
2153       }
2154    }
2155 
2156    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_INDEX_BUFFER)) {
2157       struct anv_buffer *buffer = gfx->index_buffer;
2158       uint32_t offset = gfx->index_offset;
2159       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_INDEX_BUFFER), ib) {
2160          ib.IndexFormat           = gfx->index_type;
2161          ib.MOCS                  = anv_mocs(cmd_buffer->device,
2162                                              buffer ? buffer->address.bo : NULL,
2163                                              ISL_SURF_USAGE_INDEX_BUFFER_BIT);
2164 #if GFX_VER >= 12
2165          ib.L3BypassDisable       = true;
2166 #endif
2167          if (buffer) {
2168             ib.BufferStartingAddress = anv_address_add(buffer->address, offset);
2169             ib.BufferSize            = gfx->index_size;
2170          }
2171       }
2172    }
2173 
2174 #if GFX_VERx10 >= 125
2175    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VFG)) {
2176       anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_VFG),
2177                            pipeline, partial.vfg, vfg) {
2178          SET(vfg, vfg, ListCutIndexEnable);
2179       }
2180    }
2181 #endif
2182 
2183    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SAMPLE_PATTERN)) {
2184       genX(emit_sample_pattern)(&cmd_buffer->batch,
2185                                 dyn->ms.sample_locations_enable ?
2186                                 dyn->ms.sample_locations : NULL);
2187    }
2188 
2189    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_WM)) {
2190       anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_WM),
2191                            pipeline, partial.wm, wm) {
2192          SET(wm, wm, LineStippleEnable);
2193          SET(wm, wm, BarycentricInterpolationMode);
2194       }
2195    }
2196 
2197    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_PS_BLEND)) {
2198       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_PS_BLEND), blend) {
2199          SET(blend, ps_blend, HasWriteableRT);
2200          SET(blend, ps_blend, ColorBufferBlendEnable);
2201          SET(blend, ps_blend, SourceAlphaBlendFactor);
2202          SET(blend, ps_blend, DestinationAlphaBlendFactor);
2203          SET(blend, ps_blend, SourceBlendFactor);
2204          SET(blend, ps_blend, DestinationBlendFactor);
2205          SET(blend, ps_blend, AlphaTestEnable);
2206          SET(blend, ps_blend, IndependentAlphaBlendEnable);
2207          SET(blend, ps_blend, AlphaToCoverageEnable);
2208       }
2209    }
2210 
2211    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_BLEND_STATE)) {
2212       const uint32_t num_dwords = GENX(BLEND_STATE_length) +
2213          GENX(BLEND_STATE_ENTRY_length) * MAX_RTS;
2214       hw_state->blend.state =
2215          anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
2216                                             num_dwords * 4,
2217                                             64);
2218 
2219       uint32_t *dws = hw_state->blend.state.map;
2220 
2221       struct GENX(BLEND_STATE) blend_state = {
2222          INIT(blend, AlphaToCoverageEnable),
2223          INIT(blend, AlphaToOneEnable),
2224          INIT(blend, IndependentAlphaBlendEnable),
2225          INIT(blend, ColorDitherEnable),
2226       };
2227       GENX(BLEND_STATE_pack)(NULL, dws, &blend_state);
2228 
2229       /* Jump to blend entries. */
2230       dws += GENX(BLEND_STATE_length);
2231       for (uint32_t i = 0; i < MAX_RTS; i++) {
2232          struct GENX(BLEND_STATE_ENTRY) entry = {
2233             INIT(blend.rts[i], WriteDisableAlpha),
2234             INIT(blend.rts[i], WriteDisableRed),
2235             INIT(blend.rts[i], WriteDisableGreen),
2236             INIT(blend.rts[i], WriteDisableBlue),
2237             INIT(blend.rts[i], LogicOpFunction),
2238             INIT(blend.rts[i], LogicOpEnable),
2239             INIT(blend.rts[i], ColorBufferBlendEnable),
2240             INIT(blend.rts[i], ColorClampRange),
2241             INIT(blend.rts[i], PreBlendColorClampEnable),
2242             INIT(blend.rts[i], PostBlendColorClampEnable),
2243             INIT(blend.rts[i], SourceBlendFactor),
2244             INIT(blend.rts[i], DestinationBlendFactor),
2245             INIT(blend.rts[i], ColorBlendFunction),
2246             INIT(blend.rts[i], SourceAlphaBlendFactor),
2247             INIT(blend.rts[i], DestinationAlphaBlendFactor),
2248             INIT(blend.rts[i], AlphaBlendFunction),
2249          };
2250 
2251          GENX(BLEND_STATE_ENTRY_pack)(NULL, dws, &entry);
2252          dws += GENX(BLEND_STATE_ENTRY_length);
2253       }
2254 
2255       /* Dirty the pointers to reemit 3DSTATE_BLEND_STATE_POINTERS below */
2256       BITSET_SET(hw_state->dirty, ANV_GFX_STATE_BLEND_STATE_PTR);
2257    }
2258 
2259    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_BLEND_STATE_PTR)) {
2260       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_BLEND_STATE_POINTERS), bsp) {
2261          bsp.BlendStatePointer      = hw_state->blend.state.offset;
2262          bsp.BlendStatePointerValid = true;
2263       }
2264    }
2265 
2266 #if INTEL_WA_18019816803_GFX_VER
2267    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_WA_18019816803)) {
2268       genx_batch_emit_pipe_control(&cmd_buffer->batch, cmd_buffer->device->info,
2269                                    cmd_buffer->state.current_pipeline,
2270                                    ANV_PIPE_PSS_STALL_SYNC_BIT);
2271    }
2272 #endif
2273 
2274 #if INTEL_WA_14018283232_GFX_VER
2275    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_WA_14018283232))
2276       genX(batch_emit_wa_14018283232)(&cmd_buffer->batch);
2277 #endif
2278 
2279 #if GFX_VER == 9
2280    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_PMA_FIX))
2281       genX(cmd_buffer_enable_pma_fix)(cmd_buffer, hw_state->pma_fix);
2282 #endif
2283 
2284 #if GFX_VERx10 >= 125
2285    if (hw_state->use_tbimr &&
2286        BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_TBIMR_TILE_PASS_INFO)) {
2287       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_TBIMR_TILE_PASS_INFO),
2288                      tbimr) {
2289          SET(tbimr, tbimr, TileRectangleHeight);
2290          SET(tbimr, tbimr, TileRectangleWidth);
2291          SET(tbimr, tbimr, VerticalTileCount);
2292          SET(tbimr, tbimr, HorizontalTileCount);
2293          SET(tbimr, tbimr, TBIMRBatchSize);
2294          SET(tbimr, tbimr, TileBoxCheck);
2295       }
2296    }
2297 #endif
2298 
2299 #undef INIT
2300 #undef SET
2301 
2302    BITSET_ZERO(hw_state->dirty);
2303 }
2304 
2305 /**
2306  * This function handles possible state workarounds and emits the dirty
2307  * instructions to the batch buffer.
2308  */
2309 void
2310 genX(cmd_buffer_flush_gfx_hw_state)(struct anv_cmd_buffer *cmd_buffer)
2311 {
2312    struct anv_device *device = cmd_buffer->device;
2313    struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
2314    struct anv_graphics_pipeline *pipeline =
2315       anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
2316    struct anv_gfx_dynamic_state *hw_state = &gfx->dyn_state;
2317 
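   /* With INTEL_DEBUG=reemit, mark all gfx state dirty so every instruction
    * is reemitted on each flush.
    */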
2318    if (INTEL_DEBUG(DEBUG_REEMIT)) {
2319       BITSET_OR(gfx->dyn_state.dirty, gfx->dyn_state.dirty,
2320                 device->gfx_dirty_state);
2321    }
2322 
2323    /**
2324     * Put potential workarounds here if you need to reemit an instruction
2325     * because another one is changing.
2326     */
2327 
2328    /* Since Wa_16011773973 will disable 3DSTATE_STREAMOUT, we need to reemit
2329     * it after.
2330     */
2331    if (intel_needs_workaround(device->info, 16011773973) &&
2332        pipeline->uses_xfb &&
2333        BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SO_DECL_LIST)) {
2334       BITSET_SET(hw_state->dirty, ANV_GFX_STATE_STREAMOUT);
2335    }
2336 
2337    /* Gfx11 undocumented issue:
2338     * https://gitlab.freedesktop.org/mesa/mesa/-/issues/9781
2339     */
2340 #if GFX_VER == 11
2341    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_BLEND_STATE))
2342       BITSET_SET(hw_state->dirty, ANV_GFX_STATE_MULTISAMPLE);
2343 #endif
2344 
2345    /* Wa_18020335297 - Apply the WA when viewport ptr is reprogrammed. */
2346    if (intel_needs_workaround(device->info, 18020335297) &&
2347        (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_CC) ||
2348         BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_CC_PTR)) &&
2349        cmd_buffer->state.gfx.viewport_set) {
2350       /* For mesh, we implement the WA using a CS stall. This is for
2351        * simplicity and takes care of the possible interaction with Wa_16014390852.
2352        */
2353       if (anv_pipeline_is_mesh(pipeline)) {
2354          genx_batch_emit_pipe_control(&cmd_buffer->batch, device->info,
2355                                       _3D, ANV_PIPE_CS_STALL_BIT);
2356       } else {
2357          /* Mask off all instructions that we program. */
2358          BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_VFG);
2359          BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_VF);
2360          BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_PRIMITIVE_REPLICATION);
2361          BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_RASTER);
2362          BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_VF_STATISTICS);
2363          BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_VF_SGVS);
2364          BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_VF_SGVS_2);
2365          BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_CLIP);
2366          BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_STREAMOUT);
2367          BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_VERTEX_INPUT);
2368          BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_VF_TOPOLOGY);
2369 
2370          BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_VS);
2371          BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_GS);
2372          BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_HS);
2373          BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_TE);
2374          BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_DS);
2375 
2376          cmd_buffer_gfx_state_emission(cmd_buffer);
2377 
2378          emit_wa_18020335297_dummy_draw(cmd_buffer);
2379 
2380          /* Dirty all emitted WA state to make sure that the current real
2381           * state is restored.
2382           */
2383          BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VFG);
2384          BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF);
2385          BITSET_SET(hw_state->dirty, ANV_GFX_STATE_PRIMITIVE_REPLICATION);
2386          BITSET_SET(hw_state->dirty, ANV_GFX_STATE_RASTER);
2387          BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_STATISTICS);
2388          BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_SGVS);
2389          BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_SGVS_2);
2390          BITSET_SET(hw_state->dirty, ANV_GFX_STATE_CLIP);
2391          BITSET_SET(hw_state->dirty, ANV_GFX_STATE_STREAMOUT);
2392          BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VERTEX_INPUT);
2393          BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_TOPOLOGY);
2394 
2395          BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VS);
2396          BITSET_SET(hw_state->dirty, ANV_GFX_STATE_GS);
2397          BITSET_SET(hw_state->dirty, ANV_GFX_STATE_HS);
2398          BITSET_SET(hw_state->dirty, ANV_GFX_STATE_TE);
2399          BITSET_SET(hw_state->dirty, ANV_GFX_STATE_DS);
2400       }
2401    }
2402 
2403    cmd_buffer_gfx_state_emission(cmd_buffer);
2404 }
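
/* Illustrative call pattern (a sketch only; the exact draw-time callers are
 * not shown in this file): a caller is expected to update gfx->dyn_state,
 * set the matching ANV_GFX_STATE_* dirty bits, then flush, roughly:
 *
 *    BITSET_SET(cmd_buffer->state.gfx.dyn_state.dirty,
 *               ANV_GFX_STATE_BLEND_STATE);
 *    genX(cmd_buffer_flush_gfx_hw_state)(cmd_buffer);
 */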
2405 
2406 void
2407 genX(cmd_buffer_enable_pma_fix)(struct anv_cmd_buffer *cmd_buffer, bool enable)
2408 {
2409    if (!anv_cmd_buffer_is_render_queue(cmd_buffer))
2410       return;
2411 
2412    if (cmd_buffer->state.pma_fix_enabled == enable)
2413       return;
2414 
2415    cmd_buffer->state.pma_fix_enabled = enable;
2416 
2417    /* According to the Broadwell PIPE_CONTROL documentation, software should
2418     * emit a PIPE_CONTROL with the CS Stall and Depth Cache Flush bits set
2419     * prior to the LRI.  If stencil buffer writes are enabled, then a Render
2420     * Cache Flush is also necessary.
2421     *
2422     * The Skylake docs say to use a depth stall rather than a command
2423     * streamer stall.  However, the hardware seems to violently disagree.
2424     * A full command streamer stall seems to be needed in both cases.
2425     */
2426    genx_batch_emit_pipe_control
2427       (&cmd_buffer->batch, cmd_buffer->device->info,
2428        cmd_buffer->state.current_pipeline,
2429        ANV_PIPE_DEPTH_CACHE_FLUSH_BIT |
2430        ANV_PIPE_CS_STALL_BIT |
2431 #if GFX_VER >= 12
2432        ANV_PIPE_TILE_CACHE_FLUSH_BIT |
2433 #endif
2434        ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT);
2435 
2436 #if GFX_VER == 9
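   /* Write CACHE_MODE_0 with an LRI to toggle the STC PMA optimization;
    * the companion mask bit restricts the write to that single field.
    */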
2437    uint32_t cache_mode;
2438    anv_pack_struct(&cache_mode, GENX(CACHE_MODE_0),
2439                    .STCPMAOptimizationEnable = enable,
2440                    .STCPMAOptimizationEnableMask = true);
2441    anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
2442       lri.RegisterOffset   = GENX(CACHE_MODE_0_num);
2443       lri.DataDWord        = cache_mode;
2444    }
2445 
2446 #endif /* GFX_VER == 9 */
2447 
2448    /* After the LRI, a PIPE_CONTROL with both the Depth Stall and Depth Cache
2449     * Flush bits is often necessary.  We do it regardless because it's easier.
2450     * The render cache flush is also necessary if stencil writes are enabled.
2451     *
2452     * Again, the Skylake docs give a different set of flushes but the BDW
2453     * flushes seem to work just as well.
2454     */
2455    genx_batch_emit_pipe_control
2456       (&cmd_buffer->batch, cmd_buffer->device->info,
2457        cmd_buffer->state.current_pipeline,
2458        ANV_PIPE_DEPTH_STALL_BIT |
2459        ANV_PIPE_DEPTH_CACHE_FLUSH_BIT |
2460 #if GFX_VER >= 12
2461        ANV_PIPE_TILE_CACHE_FLUSH_BIT |
2462 #endif
2463        ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT);
2464 }
2465