/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/* NOTE: the exact set of system headers here is an assumption (the original
 * names were lost); these cover what this file uses directly: assert(),
 * bool, FLT_MAX, fixed-width integer types and memcpy().
 */
#include <assert.h>
#include <stdbool.h>
#include <float.h>
#include <stdint.h>
#include <string.h>

#include "anv_private.h"

#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"

#include "common/intel_genX_state_brw.h"
#include "common/intel_guardband.h"
#include "common/intel_tiled_render.h"

#include "compiler/brw_prim.h"

const uint32_t genX(vk_to_intel_blend)[] = {
   [VK_BLEND_FACTOR_ZERO]                    = BLENDFACTOR_ZERO,
   [VK_BLEND_FACTOR_ONE]                     = BLENDFACTOR_ONE,
   [VK_BLEND_FACTOR_SRC_COLOR]               = BLENDFACTOR_SRC_COLOR,
   [VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR]     = BLENDFACTOR_INV_SRC_COLOR,
   [VK_BLEND_FACTOR_DST_COLOR]               = BLENDFACTOR_DST_COLOR,
   [VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR]     = BLENDFACTOR_INV_DST_COLOR,
   [VK_BLEND_FACTOR_SRC_ALPHA]               = BLENDFACTOR_SRC_ALPHA,
   [VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA]     = BLENDFACTOR_INV_SRC_ALPHA,
   [VK_BLEND_FACTOR_DST_ALPHA]               = BLENDFACTOR_DST_ALPHA,
   [VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA]     = BLENDFACTOR_INV_DST_ALPHA,
   [VK_BLEND_FACTOR_CONSTANT_COLOR]          = BLENDFACTOR_CONST_COLOR,
   [VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR]= BLENDFACTOR_INV_CONST_COLOR,
   [VK_BLEND_FACTOR_CONSTANT_ALPHA]          = BLENDFACTOR_CONST_ALPHA,
   [VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA]= BLENDFACTOR_INV_CONST_ALPHA,
   [VK_BLEND_FACTOR_SRC_ALPHA_SATURATE]      = BLENDFACTOR_SRC_ALPHA_SATURATE,
   [VK_BLEND_FACTOR_SRC1_COLOR]              = BLENDFACTOR_SRC1_COLOR,
   [VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR]    = BLENDFACTOR_INV_SRC1_COLOR,
   [VK_BLEND_FACTOR_SRC1_ALPHA]              = BLENDFACTOR_SRC1_ALPHA,
   [VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA]    = BLENDFACTOR_INV_SRC1_ALPHA,
};

static const uint32_t genX(vk_to_intel_blend_op)[] = {
   [VK_BLEND_OP_ADD]                         = BLENDFUNCTION_ADD,
   [VK_BLEND_OP_SUBTRACT]                    = BLENDFUNCTION_SUBTRACT,
   [VK_BLEND_OP_REVERSE_SUBTRACT]            = BLENDFUNCTION_REVERSE_SUBTRACT,
   [VK_BLEND_OP_MIN]                         = BLENDFUNCTION_MIN,
   [VK_BLEND_OP_MAX]                         = BLENDFUNCTION_MAX,
};

static void
genX(streamout_prologue)(struct anv_cmd_buffer *cmd_buffer)
{
#if INTEL_WA_16013994831_GFX_VER
   /* Wa_16013994831 - Disable preemption during streamout and re-enable it
    * when the current pipeline does not use XFB.
    *
    * Although this workaround applies to Gfx12+, we already disable object
    * level preemption for another reason in genX_state.c, so we can skip it
    * for Gfx12.
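    *
    * Concretely: when intel_needs_workaround() says the workaround does not
    * apply to this device we return early below; otherwise preemption is
    * disabled while the bound pipeline uses XFB and re-enabled once it no
    * longer does.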
*/ if (!intel_needs_workaround(cmd_buffer->device->info, 16013994831)) return; struct anv_graphics_pipeline *pipeline = anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline); if (pipeline->uses_xfb) { genX(cmd_buffer_set_preemption)(cmd_buffer, false); return; } if (!cmd_buffer->state.gfx.object_preemption) genX(cmd_buffer_set_preemption)(cmd_buffer, true); #endif } #if GFX_VER >= 12 static uint32_t get_cps_state_offset(struct anv_cmd_buffer *cmd_buffer, bool cps_enabled, const struct vk_fragment_shading_rate_state *fsr) { struct anv_device *device = cmd_buffer->device; if (!cps_enabled) return device->cps_states.offset; uint32_t offset; static const uint32_t size_index[] = { [1] = 0, [2] = 1, [4] = 2, }; #if GFX_VERx10 >= 125 offset = 1 + /* skip disabled */ fsr->combiner_ops[0] * 5 * 3 * 3 + fsr->combiner_ops[1] * 3 * 3 + size_index[fsr->fragment_size.width] * 3 + size_index[fsr->fragment_size.height]; #else offset = 1 + /* skip disabled */ size_index[fsr->fragment_size.width] * 3 + size_index[fsr->fragment_size.height]; #endif offset *= MAX_VIEWPORTS * GENX(CPS_STATE_length) * 4; return device->cps_states.offset + offset; } #endif /* GFX_VER >= 12 */ static bool has_ds_feedback_loop(const struct vk_dynamic_graphics_state *dyn) { return dyn->feedback_loops & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT); } UNUSED static bool want_stencil_pma_fix(struct anv_cmd_buffer *cmd_buffer, const struct vk_dynamic_graphics_state *dyn, const struct vk_depth_stencil_state *ds) { if (GFX_VER > 9) return false; assert(GFX_VER == 9); /* From the Skylake PRM Vol. 2c CACHE_MODE_1::STC PMA Optimization Enable: * * Clearing this bit will force the STC cache to wait for pending * retirement of pixels at the HZ-read stage and do the STC-test for * Non-promoted, R-computed and Computed depth modes instead of * postponing the STC-test to RCPFE. * * STC_TEST_EN = 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE && * 3DSTATE_WM_DEPTH_STENCIL::StencilTestEnable * * STC_WRITE_EN = 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE && * (3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable && * 3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE) * * COMP_STC_EN = STC_TEST_EN && * 3DSTATE_PS_EXTRA::PixelShaderComputesStencil * * SW parses the pipeline states to generate the following logical * signal indicating if PMA FIX can be enabled. * * STC_PMA_OPT = * 3DSTATE_WM::ForceThreadDispatch != 1 && * !(3DSTATE_RASTER::ForceSampleCount != NUMRASTSAMPLES_0) && * 3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL && * 3DSTATE_DEPTH_BUFFER::HIZ Enable && * !(3DSTATE_WM::EDSC_Mode == 2) && * 3DSTATE_PS_EXTRA::PixelShaderValid && * !(3DSTATE_WM_HZ_OP::DepthBufferClear || * 3DSTATE_WM_HZ_OP::DepthBufferResolve || * 3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable || * 3DSTATE_WM_HZ_OP::StencilBufferClear) && * (COMP_STC_EN || STC_WRITE_EN) && * ((3DSTATE_PS_EXTRA::PixelShaderKillsPixels || * 3DSTATE_WM::ForceKillPix == ON || * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget || * 3DSTATE_PS_BLEND::AlphaToCoverageEnable || * 3DSTATE_PS_BLEND::AlphaTestEnable || * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) || * (3DSTATE_PS_EXTRA::Pixel Shader Computed Depth mode != PSCDEPTH_OFF)) */ /* These are always true: * 3DSTATE_WM::ForceThreadDispatch != 1 && * !(3DSTATE_RASTER::ForceSampleCount != NUMRASTSAMPLES_0) */ /* We only enable the PMA fix if we know for certain that HiZ is enabled. * If we don't know whether HiZ is enabled or not, we disable the PMA fix * and there is no harm. 
* * (3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL) && * 3DSTATE_DEPTH_BUFFER::HIZ Enable */ if (!cmd_buffer->state.hiz_enabled) return false; /* We can't possibly know if HiZ is enabled without the depth attachment */ ASSERTED const struct anv_image_view *d_iview = cmd_buffer->state.gfx.depth_att.iview; assert(d_iview && d_iview->image->planes[0].aux_usage == ISL_AUX_USAGE_HIZ); /* 3DSTATE_PS_EXTRA::PixelShaderValid */ struct anv_graphics_pipeline *pipeline = anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline); if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) return false; /* !(3DSTATE_WM::EDSC_Mode == 2) */ const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline); if (wm_prog_data->early_fragment_tests) return false; /* We never use anv_pipeline for HiZ ops so this is trivially true: * !(3DSTATE_WM_HZ_OP::DepthBufferClear || * 3DSTATE_WM_HZ_OP::DepthBufferResolve || * 3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable || * 3DSTATE_WM_HZ_OP::StencilBufferClear) */ /* 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE && * 3DSTATE_WM_DEPTH_STENCIL::StencilTestEnable */ const bool stc_test_en = ds->stencil.test_enable; /* 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE && * (3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable && * 3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE) */ const bool stc_write_en = ds->stencil.write_enable; /* STC_TEST_EN && 3DSTATE_PS_EXTRA::PixelShaderComputesStencil */ const bool comp_stc_en = stc_test_en && wm_prog_data->computed_stencil; /* COMP_STC_EN || STC_WRITE_EN */ if (!(comp_stc_en || stc_write_en)) return false; /* (3DSTATE_PS_EXTRA::PixelShaderKillsPixels || * 3DSTATE_WM::ForceKillPix == ON || * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget || * 3DSTATE_PS_BLEND::AlphaToCoverageEnable || * 3DSTATE_PS_BLEND::AlphaTestEnable || * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) || * (3DSTATE_PS_EXTRA::Pixel Shader Computed Depth mode != PSCDEPTH_OFF) */ return pipeline->kill_pixel || pipeline->rp_has_ds_self_dep || has_ds_feedback_loop(dyn) || wm_prog_data->computed_depth_mode != PSCDEPTH_OFF; } static void genX(rasterization_mode)(VkPolygonMode raster_mode, VkLineRasterizationModeKHR line_mode, float line_width, uint32_t *api_mode, bool *msaa_rasterization_enable) { if (raster_mode == VK_POLYGON_MODE_LINE) { /* Unfortunately, configuring our line rasterization hardware on gfx8 * and later is rather painful. Instead of giving us bits to tell the * hardware what line mode to use like we had on gfx7, we now have an * arcane combination of API Mode and MSAA enable bits which do things * in a table which are expected to magically put the hardware into the * right mode for your API. Sadly, Vulkan isn't any of the APIs the * hardware people thought of so nothing works the way you want it to. * * Look at the table titled "Multisample Rasterization Modes" in Vol 7 * of the Skylake PRM for more details. */ switch (line_mode) { case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_EXT: *api_mode = DX101; #if GFX_VER <= 9 /* Prior to ICL, the algorithm the HW uses to draw wide lines * doesn't quite match what the CTS expects, at least for rectangular * lines, so we set this to false here, making it draw parallelograms * instead, which work well enough. 
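          *
          * (The 1.0078125 threshold below is 1 + 1/128, presumably the
          * smallest representable width above 1.0 given the line width
          * field's fractional precision, so only lines strictly wider than
          * one pixel take the parallelogram path.)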
*/ *msaa_rasterization_enable = line_width < 1.0078125; #else *msaa_rasterization_enable = true; #endif break; case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_EXT: case VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT: *api_mode = DX9OGL; *msaa_rasterization_enable = false; break; default: unreachable("Unsupported line rasterization mode"); } } else { *api_mode = DX101; *msaa_rasterization_enable = true; } } static bool is_src1_blend_factor(enum GENX(3D_Color_Buffer_Blend_Factor) factor) { return factor == BLENDFACTOR_SRC1_COLOR || factor == BLENDFACTOR_SRC1_ALPHA || factor == BLENDFACTOR_INV_SRC1_COLOR || factor == BLENDFACTOR_INV_SRC1_ALPHA; } #if GFX_VERx10 == 125 /** * Return the dimensions of the current rendering area, defined as the * bounding box of all present color, depth and stencil attachments. */ UNUSED static bool calculate_render_area(struct anv_cmd_buffer *cmd_buffer, unsigned *width, unsigned *height) { struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx; *width = gfx->render_area.offset.x + gfx->render_area.extent.width; *height = gfx->render_area.offset.y + gfx->render_area.extent.height; for (unsigned i = 0; i < gfx->color_att_count; i++) { struct anv_attachment *att = &gfx->color_att[i]; if (att->iview) { *width = MAX2(*width, att->iview->vk.extent.width); *height = MAX2(*height, att->iview->vk.extent.height); } } const struct anv_image_view *const z_view = gfx->depth_att.iview; if (z_view) { *width = MAX2(*width, z_view->vk.extent.width); *height = MAX2(*height, z_view->vk.extent.height); } const struct anv_image_view *const s_view = gfx->stencil_att.iview; if (s_view) { *width = MAX2(*width, s_view->vk.extent.width); *height = MAX2(*height, s_view->vk.extent.height); } return *width && *height; } /* Calculate TBIMR tiling parameters adequate for the current pipeline * setup. Return true if TBIMR should be enabled. */ UNUSED static bool calculate_tile_dimensions(struct anv_cmd_buffer *cmd_buffer, unsigned fb_width, unsigned fb_height, unsigned *tile_width, unsigned *tile_height) { const struct anv_device *device = cmd_buffer->device; struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx; assert(GFX_VER == 12); const unsigned aux_scale = ISL_MAIN_TO_CCS_SIZE_RATIO_XE; unsigned pixel_size = 0; /* Perform a rough calculation of the tile cache footprint of the * pixel pipeline, approximating it as the sum of the amount of * memory used per pixel by every render target, depth, stencil and * auxiliary surfaces bound to the pipeline. 
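    *
    * Illustrative example (made-up numbers): if
    * intel_calculate_surface_pixel_size() reports 4 bytes/px for an RGBA8
    * color target, CCS adds DIV_ROUND_UP(4, 256) = 1 more on Xe
    * (ISL_MAIN_TO_CCS_SIZE_RATIO_XE), for 5 total from that attachment.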
    */
   for (uint32_t i = 0; i < gfx->color_att_count; i++) {
      struct anv_attachment *att = &gfx->color_att[i];
      if (att->iview) {
         const struct anv_image *image = att->iview->image;
         const unsigned p = anv_image_aspect_to_plane(image,
                                                      VK_IMAGE_ASPECT_COLOR_BIT);
         const struct anv_image_plane *plane = &image->planes[p];
         pixel_size += intel_calculate_surface_pixel_size(
            &plane->primary_surface.isl);
         if (isl_aux_usage_has_mcs(att->aux_usage))
            pixel_size += intel_calculate_surface_pixel_size(
               &plane->aux_surface.isl);
         if (isl_aux_usage_has_ccs(att->aux_usage))
            pixel_size += DIV_ROUND_UP(intel_calculate_surface_pixel_size(
                                          &plane->primary_surface.isl),
                                       aux_scale);
      }
   }

   const struct anv_image_view *const z_view = gfx->depth_att.iview;
   if (z_view) {
      const struct anv_image *image = z_view->image;
      assert(image->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT);
      const unsigned p = anv_image_aspect_to_plane(image,
                                                   VK_IMAGE_ASPECT_DEPTH_BIT);
      const struct anv_image_plane *plane = &image->planes[p];
      pixel_size += intel_calculate_surface_pixel_size(
         &plane->primary_surface.isl);
      if (isl_aux_usage_has_hiz(image->planes[p].aux_usage))
         pixel_size += intel_calculate_surface_pixel_size(
            &plane->aux_surface.isl);
      if (isl_aux_usage_has_ccs(image->planes[p].aux_usage))
         pixel_size += DIV_ROUND_UP(intel_calculate_surface_pixel_size(
                                       &plane->primary_surface.isl),
                                    aux_scale);
   }

   const struct anv_image_view *const s_view = gfx->stencil_att.iview;
   if (s_view && s_view != z_view) {
      const struct anv_image *image = s_view->image;
      assert(image->vk.aspects & VK_IMAGE_ASPECT_STENCIL_BIT);
      const unsigned p = anv_image_aspect_to_plane(image,
                                                   VK_IMAGE_ASPECT_STENCIL_BIT);
      const struct anv_image_plane *plane = &image->planes[p];
      pixel_size += intel_calculate_surface_pixel_size(
         &plane->primary_surface.isl);
   }

   if (!pixel_size)
      return false;

   /* Compute a tile layout that allows reasonable utilization of the
    * tile cache based on the per-pixel cache footprint estimated
    * above.
    */
   intel_calculate_tile_dimensions(device->info,
                                   cmd_buffer->state.current_l3_config,
                                   32, 32, fb_width, fb_height,
                                   pixel_size, tile_width, tile_height);

   /* Perform TBIMR tile passes only if the framebuffer covers more
    * than a single tile.
    */
   return *tile_width < fb_width || *tile_height < fb_height;
}
#endif

/**
 * This function takes the Vulkan runtime values & dirty states and updates
 * the values in anv_gfx_dynamic_state, flagging HW instructions for
 * reemission if the values are changing.
 *
 * Nothing is emitted in the batch buffer.
 *
 * Returns a mask for state that we want to leave dirty afterwards.
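 *
 * The SET()/SET_STAGE() helpers defined below do the change detection: each
 * compares the incoming value against the shadow copy held in
 * anv_gfx_dynamic_state and only sets the matching ANV_GFX_STATE_* dirty bit
 * when the value actually differs.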
*/ anv_cmd_dirty_mask_t genX(cmd_buffer_flush_gfx_runtime_state)(struct anv_cmd_buffer *cmd_buffer) { UNUSED struct anv_device *device = cmd_buffer->device; struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx; const struct anv_graphics_pipeline *pipeline = anv_pipeline_to_graphics(gfx->base.pipeline); const struct vk_dynamic_graphics_state *dyn = &cmd_buffer->vk.dynamic_graphics_state; struct anv_gfx_dynamic_state *hw_state = &gfx->dyn_state; const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline); struct anv_instance *instance = cmd_buffer->device->physical->instance; anv_cmd_dirty_mask_t dirty_state_mask = 0; #define GET(field) hw_state->field #define SET(bit, field, value) \ do { \ __typeof(hw_state->field) __v = value; \ if (hw_state->field != __v) { \ hw_state->field = __v; \ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_##bit); \ } \ } while (0) #define SET_STAGE(bit, field, value, stage) \ do { \ __typeof(hw_state->field) __v = value; \ if (!anv_pipeline_has_stage(pipeline, \ MESA_SHADER_##stage)) { \ hw_state->field = __v; \ break; \ } \ if (hw_state->field != __v) { \ hw_state->field = __v; \ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_##bit); \ } \ } while (0) #define SETUP_PROVOKING_VERTEX(bit, cmd, mode) \ switch (mode) { \ case VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT: \ SET(bit, cmd.TriangleStripListProvokingVertexSelect, 0); \ SET(bit, cmd.LineStripListProvokingVertexSelect, 0); \ SET(bit, cmd.TriangleFanProvokingVertexSelect, 1); \ break; \ case VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT: \ SET(bit, cmd.TriangleStripListProvokingVertexSelect, 2); \ SET(bit, cmd.LineStripListProvokingVertexSelect, 1); \ SET(bit, cmd.TriangleFanProvokingVertexSelect, 2); \ break; \ default: \ unreachable("Invalid provoking vertex mode"); \ } \ UNUSED bool fs_msaa_changed = false; if ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) || BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_ALPHA_TO_COVERAGE_ENABLE) || BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES) || BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_FSR)) { enum intel_msaa_flags fs_msaa_flags = 0; if (wm_prog_data) { /* If we have any dynamic bits here, we might need to update the * value in the push constant for the shader. 
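          *
          * (BRW_SOMETIMES marks a shader compiled to work in either mode,
          * with the actual mode chosen at draw time through the
          * fs_msaa_flags push constant; BRW_NEVER/BRW_ALWAYS variants are
          * baked in and need no dynamic flag.)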
*/ if (wm_prog_data->coarse_pixel_dispatch == BRW_SOMETIMES || wm_prog_data->persample_dispatch == BRW_SOMETIMES || wm_prog_data->alpha_to_coverage == BRW_SOMETIMES) { fs_msaa_flags = INTEL_MSAA_FLAG_ENABLE_DYNAMIC; if (dyn->ms.rasterization_samples > 1) { fs_msaa_flags |= INTEL_MSAA_FLAG_MULTISAMPLE_FBO; if (wm_prog_data->sample_shading) { assert(wm_prog_data->persample_dispatch != BRW_NEVER); fs_msaa_flags |= INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH; } if ((pipeline->sample_shading_enable && (pipeline->min_sample_shading * dyn->ms.rasterization_samples) > 1) || wm_prog_data->sample_shading) { fs_msaa_flags |= INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH | INTEL_MSAA_FLAG_PERSAMPLE_INTERP; } } if (wm_prog_data->coarse_pixel_dispatch == BRW_SOMETIMES && !(fs_msaa_flags & INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH)) { fs_msaa_flags |= INTEL_MSAA_FLAG_COARSE_PI_MSG | INTEL_MSAA_FLAG_COARSE_RT_WRITES; } if (wm_prog_data->alpha_to_coverage == BRW_SOMETIMES && dyn->ms.alpha_to_coverage_enable) fs_msaa_flags |= INTEL_MSAA_FLAG_ALPHA_TO_COVERAGE; /* Check the last push constant value and update */ if (gfx->base.push_constants.gfx.fs_msaa_flags != fs_msaa_flags) { gfx->base.push_constants.gfx.fs_msaa_flags = fs_msaa_flags; cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT; gfx->base.push_constants_data_dirty = true; } } } if (fs_msaa_flags != gfx->fs_msaa_flags) { gfx->fs_msaa_flags = fs_msaa_flags; gfx->dirty |= ANV_CMD_DIRTY_FS_MSAA_FLAGS; } } if ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) || (gfx->dirty & ANV_CMD_DIRTY_FS_MSAA_FLAGS) || (gfx->dirty & ANV_CMD_DIRTY_COARSE_PIXEL_ACTIVE)) { if (wm_prog_data) { const struct anv_shader_bin *fs_bin = pipeline->base.shaders[MESA_SHADER_FRAGMENT]; struct GENX(3DSTATE_PS) ps = {}; intel_set_ps_dispatch_state(&ps, device->info, wm_prog_data, MAX2(dyn->ms.rasterization_samples, 1), gfx->fs_msaa_flags); SET(PS, ps.KernelStartPointer0, fs_bin->kernel.offset + brw_wm_prog_data_prog_offset(wm_prog_data, ps, 0)); SET(PS, ps.KernelStartPointer1, fs_bin->kernel.offset + brw_wm_prog_data_prog_offset(wm_prog_data, ps, 1)); #if GFX_VER < 20 SET(PS, ps.KernelStartPointer2, fs_bin->kernel.offset + brw_wm_prog_data_prog_offset(wm_prog_data, ps, 2)); #endif SET(PS, ps.DispatchGRFStartRegisterForConstantSetupData0, brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 0)); SET(PS, ps.DispatchGRFStartRegisterForConstantSetupData1, brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 1)); #if GFX_VER < 20 SET(PS, ps.DispatchGRFStartRegisterForConstantSetupData2, brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 2)); #endif #if GFX_VER < 20 SET(PS, ps._8PixelDispatchEnable, ps._8PixelDispatchEnable); SET(PS, ps._16PixelDispatchEnable, ps._16PixelDispatchEnable); SET(PS, ps._32PixelDispatchEnable, ps._32PixelDispatchEnable); #else SET(PS, ps.Kernel0Enable, ps.Kernel0Enable); SET(PS, ps.Kernel1Enable, ps.Kernel1Enable); SET(PS, ps.Kernel0SIMDWidth, ps.Kernel0SIMDWidth); SET(PS, ps.Kernel1SIMDWidth, ps.Kernel1SIMDWidth); SET(PS, ps.Kernel0PolyPackingPolicy, ps.Kernel0PolyPackingPolicy); #endif SET(PS, ps.PositionXYOffsetSelect, !wm_prog_data->uses_pos_offset ? POSOFFSET_NONE : brw_wm_prog_data_is_persample(wm_prog_data, gfx->fs_msaa_flags) ? 
POSOFFSET_SAMPLE : POSOFFSET_CENTROID); SET(PS_EXTRA, ps_extra.PixelShaderIsPerSample, brw_wm_prog_data_is_persample(wm_prog_data, gfx->fs_msaa_flags)); #if GFX_VER >= 11 const bool uses_coarse_pixel = brw_wm_prog_data_is_coarse(wm_prog_data, gfx->fs_msaa_flags); SET(PS_EXTRA, ps_extra.PixelShaderIsPerCoarsePixel, uses_coarse_pixel); #endif #if GFX_VERx10 >= 125 enum anv_coarse_pixel_state cps_state = uses_coarse_pixel ? ANV_COARSE_PIXEL_STATE_ENABLED : ANV_COARSE_PIXEL_STATE_DISABLED; bool cps_state_toggled = genX(cmd_buffer_set_coarse_pixel_active)(cmd_buffer, cps_state); if (cps_state_toggled) dirty_state_mask |= ANV_CMD_DIRTY_COARSE_PIXEL_ACTIVE; const bool needs_ps_dependency = /* TODO: We should only require this when the last geometry shader * uses a fragment shading rate that is not constant. */ uses_coarse_pixel || cps_state_toggled; SET(PS_EXTRA, ps_extra.EnablePSDependencyOnCPsizeChange, needs_ps_dependency); #endif SET(WM, wm.BarycentricInterpolationMode, wm_prog_data_barycentric_modes(wm_prog_data, gfx->fs_msaa_flags)); } else { #if GFX_VER < 20 SET(PS, ps._8PixelDispatchEnable, false); SET(PS, ps._16PixelDispatchEnable, false); SET(PS, ps._32PixelDispatchEnable, false); #else SET(PS, ps.Kernel0Enable, false); SET(PS, ps.Kernel1Enable, false); #endif } } if ((gfx->dirty & (ANV_CMD_DIRTY_PIPELINE | ANV_CMD_DIRTY_XFB_ENABLE | ANV_CMD_DIRTY_OCCLUSION_QUERY_ACTIVE)) || BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_RASTERIZER_DISCARD_ENABLE) || BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_RASTERIZATION_STREAM) || BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_PROVOKING_VERTEX)) { SET(STREAMOUT, so.RenderingDisable, dyn->rs.rasterizer_discard_enable); SET(STREAMOUT, so.RenderStreamSelect, dyn->rs.rasterization_stream); #if INTEL_NEEDS_WA_18022508906 /* Wa_18022508906 : * * SKL PRMs, Volume 7: 3D-Media-GPGPU, Stream Output Logic (SOL) Stage: * * SOL_INT::Render_Enable = * (3DSTATE_STREAMOUT::Force_Rending == Force_On) || * ( * (3DSTATE_STREAMOUT::Force_Rending != Force_Off) && * !(3DSTATE_GS::Enable && 3DSTATE_GS::Output Vertex Size == 0) && * !3DSTATE_STREAMOUT::API_Render_Disable && * ( * 3DSTATE_DEPTH_STENCIL_STATE::Stencil_TestEnable || * 3DSTATE_DEPTH_STENCIL_STATE::Depth_TestEnable || * 3DSTATE_DEPTH_STENCIL_STATE::Depth_WriteEnable || * 3DSTATE_PS_EXTRA::PS_Valid || * 3DSTATE_WM::Legacy Depth_Buffer_Clear || * 3DSTATE_WM::Legacy Depth_Buffer_Resolve_Enable || * 3DSTATE_WM::Legacy Hierarchical_Depth_Buffer_Resolve_Enable * ) * ) * * If SOL_INT::Render_Enable is false, the SO stage will not forward any * topologies down the pipeline. Which is not what we want for occlusion * queries. * * Here we force rendering to get SOL_INT::Render_Enable when occlusion * queries are active. */ SET(STREAMOUT, so.ForceRendering, (!GET(so.RenderingDisable) && gfx->n_occlusion_queries > 0) ? 
Force_on : 0); #endif switch (dyn->rs.provoking_vertex) { case VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT: SET(STREAMOUT, so.ReorderMode, LEADING); SET_STAGE(GS, gs.ReorderMode, LEADING, GEOMETRY); break; case VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT: SET(STREAMOUT, so.ReorderMode, TRAILING); SET_STAGE(GS, gs.ReorderMode, TRAILING, GEOMETRY); break; default: unreachable("Invalid provoking vertex mode"); } } if ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) || BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_TOPOLOGY)) { uint32_t topology; if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) topology = _3DPRIM_PATCHLIST(dyn->ts.patch_control_points); else topology = genX(vk_to_intel_primitive_type)[dyn->ia.primitive_topology]; gfx->primitive_topology = topology; SET(VF_TOPOLOGY, vft.PrimitiveTopologyType, topology); } if ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) || BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI) || BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDINGS_VALID) || BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDING_STRIDES)) BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VERTEX_INPUT); #if GFX_VER >= 11 if (cmd_buffer->device->vk.enabled_extensions.KHR_fragment_shading_rate && ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) || (gfx->dirty & ANV_CMD_DIRTY_FS_MSAA_FLAGS) || BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_FSR))) { const bool cps_enable = wm_prog_data && brw_wm_prog_data_is_coarse(wm_prog_data, gfx->fs_msaa_flags); #if GFX_VER == 11 SET(CPS, cps.CoarsePixelShadingMode, cps_enable ? CPS_MODE_CONSTANT : CPS_MODE_NONE); SET(CPS, cps.MinCPSizeX, dyn->fsr.fragment_size.width); SET(CPS, cps.MinCPSizeY, dyn->fsr.fragment_size.height); #elif GFX_VER >= 12 SET(CPS, cps.CoarsePixelShadingStateArrayPointer, get_cps_state_offset(cmd_buffer, cps_enable, &dyn->fsr)); #endif } #endif /* GFX_VER >= 11 */ if ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) || BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_TS_DOMAIN_ORIGIN)) { const struct brw_tes_prog_data *tes_prog_data = get_tes_prog_data(pipeline); if (tes_prog_data && anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) { if (dyn->ts.domain_origin == VK_TESSELLATION_DOMAIN_ORIGIN_LOWER_LEFT) { SET(TE, te.OutputTopology, tes_prog_data->output_topology); } else { /* When the origin is upper-left, we have to flip the winding order */ if (tes_prog_data->output_topology == OUTPUT_TRI_CCW) { SET(TE, te.OutputTopology, OUTPUT_TRI_CW); } else if (tes_prog_data->output_topology == OUTPUT_TRI_CW) { SET(TE, te.OutputTopology, OUTPUT_TRI_CCW); } else { SET(TE, te.OutputTopology, tes_prog_data->output_topology); } } } else { SET(TE, te.OutputTopology, OUTPUT_POINT); } } if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_WIDTH)) SET(SF, sf.LineWidth, dyn->rs.line.width); if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_PROVOKING_VERTEX)) { SETUP_PROVOKING_VERTEX(SF, sf, dyn->rs.provoking_vertex); SETUP_PROVOKING_VERTEX(CLIP, clip, dyn->rs.provoking_vertex); } if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS)) { /** * From the Vulkan Spec: * * "VK_DEPTH_BIAS_REPRESENTATION_FLOAT_EXT specifies that the depth * bias representation is a factor of constant r equal to 1." 
       *
       * From the SKL PRMs, Volume 7: 3D-Media-GPGPU, Depth Offset:
       *
       *    "When UNORM Depth Buffer is at Output Merger (or no Depth Buffer):
       *
       *     Bias = GlobalDepthOffsetConstant * r +
       *            GlobalDepthOffsetScale * MaxDepthSlope
       *
       *     Where r is the minimum representable value > 0 in the depth
       *     buffer format, converted to float32 (note: If state bit Legacy
       *     Global Depth Bias Enable is set, the r term will be forced to
       *     1.0)"
       *
       * When VK_DEPTH_BIAS_REPRESENTATION_FLOAT_EXT is set, enable
       * LegacyGlobalDepthBiasEnable.
       */
      SET(SF, sf.LegacyGlobalDepthBiasEnable,
          dyn->rs.depth_bias.representation ==
          VK_DEPTH_BIAS_REPRESENTATION_FLOAT_EXT);
   }

   if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE))
      SET(CLIP, clip.APIMode,
          dyn->vp.depth_clip_negative_one_to_one ? APIMODE_OGL : APIMODE_D3D);

   if ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) ||
       (gfx->dirty & ANV_CMD_DIRTY_RENDER_TARGETS) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_TOPOLOGY) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_CULL_MODE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_FRONT_FACE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_ENABLE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_POLYGON_MODE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_MODE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_WIDTH) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_CLIP_ENABLE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_CLAMP_ENABLE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_CONSERVATIVE_MODE)) {
      /* Take dynamic primitive topology into account with
       *    3DSTATE_RASTER::APIMode
       *    3DSTATE_RASTER::DXMultisampleRasterizationEnable
       *    3DSTATE_RASTER::AntialiasingEnable
       */
      uint32_t api_mode = 0;
      bool msaa_raster_enable = false;

      const VkLineRasterizationModeKHR line_mode =
         anv_line_rasterization_mode(dyn->rs.line.mode,
                                     dyn->ms.rasterization_samples);

      const VkPolygonMode dynamic_raster_mode =
         genX(raster_polygon_mode)(pipeline,
                                   dyn->rs.polygon_mode,
                                   dyn->ia.primitive_topology);

      genX(rasterization_mode)(dynamic_raster_mode,
                               line_mode, dyn->rs.line.width,
                               &api_mode, &msaa_raster_enable);

      /* From the Broadwell PRM, Volume 2, documentation for
       * 3DSTATE_RASTER, "Antialiasing Enable":
       *
       *    "This field must be disabled if any of the render targets
       *     have integer (UINT or SINT) surface format."
       *
       * Additionally internal documentation for Gfx12+ states:
       *
       *    "This bit MUST not be set when NUM_MULTISAMPLES > 1 OR
       *     FORCED_SAMPLE_COUNT > 1."
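       *
       * The aa_enable expression below encodes exactly those rules: the
       * line mode must want antialiasing, no bound render target may be
       * UINT/SINT (gfx->has_uint_rt), and on Gfx12+ multisampling must be
       * off.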
*/ const bool aa_enable = anv_rasterization_aa_mode(dynamic_raster_mode, line_mode) && !gfx->has_uint_rt && !(GFX_VER >= 12 && gfx->samples > 1); const bool depth_clip_enable = vk_rasterization_state_depth_clip_enable(&dyn->rs); const bool xy_clip_test_enable = (dynamic_raster_mode == VK_POLYGON_MODE_FILL); SET(CLIP, clip.ViewportXYClipTestEnable, xy_clip_test_enable); SET(RASTER, raster.APIMode, api_mode); SET(RASTER, raster.DXMultisampleRasterizationEnable, msaa_raster_enable); SET(RASTER, raster.AntialiasingEnable, aa_enable); SET(RASTER, raster.CullMode, genX(vk_to_intel_cullmode)[dyn->rs.cull_mode]); SET(RASTER, raster.FrontWinding, genX(vk_to_intel_front_face)[dyn->rs.front_face]); SET(RASTER, raster.GlobalDepthOffsetEnableSolid, dyn->rs.depth_bias.enable); SET(RASTER, raster.GlobalDepthOffsetEnableWireframe, dyn->rs.depth_bias.enable); SET(RASTER, raster.GlobalDepthOffsetEnablePoint, dyn->rs.depth_bias.enable); SET(RASTER, raster.GlobalDepthOffsetConstant, dyn->rs.depth_bias.constant); SET(RASTER, raster.GlobalDepthOffsetScale, dyn->rs.depth_bias.slope); SET(RASTER, raster.GlobalDepthOffsetClamp, dyn->rs.depth_bias.clamp); SET(RASTER, raster.FrontFaceFillMode, genX(vk_to_intel_fillmode)[dyn->rs.polygon_mode]); SET(RASTER, raster.BackFaceFillMode, genX(vk_to_intel_fillmode)[dyn->rs.polygon_mode]); SET(RASTER, raster.ViewportZFarClipTestEnable, depth_clip_enable); SET(RASTER, raster.ViewportZNearClipTestEnable, depth_clip_enable); SET(RASTER, raster.ConservativeRasterizationEnable, dyn->rs.conservative_mode != VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT); } if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES)) { SET(MULTISAMPLE, ms.NumberofMultisamples, __builtin_ffs(MAX2(dyn->ms.rasterization_samples, 1)) - 1); } if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_SAMPLE_MASK)) { /* From the Vulkan 1.0 spec: * If pSampleMask is NULL, it is treated as if the mask has all bits * enabled, i.e. no coverage is removed from fragments. * * 3DSTATE_SAMPLE_MASK.SampleMask is 16 bits. 
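       *
       * The Vulkan mask is a 32-bit word per 32 samples, but since no
       * configuration here runs more than 16 samples, keeping only the low
       * 16 bits below discards nothing meaningful.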
*/ SET(SAMPLE_MASK, sm.SampleMask, dyn->ms.sample_mask & 0xffff); } if ((gfx->dirty & ANV_CMD_DIRTY_RENDER_TARGETS) || #if GFX_VER == 9 /* For the PMA fix */ (gfx->dirty & ANV_CMD_DIRTY_PIPELINE) || #endif BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_TEST_ENABLE) || BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_WRITE_ENABLE) || BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_COMPARE_OP) || BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE) || BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_OP) || BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK) || BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK) || BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE)) { VkImageAspectFlags ds_aspects = 0; if (gfx->depth_att.vk_format != VK_FORMAT_UNDEFINED) ds_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT; if (gfx->stencil_att.vk_format != VK_FORMAT_UNDEFINED) ds_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT; struct vk_depth_stencil_state opt_ds = dyn->ds; vk_optimize_depth_stencil_state(&opt_ds, ds_aspects, true); SET(WM_DEPTH_STENCIL, ds.DoubleSidedStencilEnable, true); SET(WM_DEPTH_STENCIL, ds.StencilTestMask, opt_ds.stencil.front.compare_mask & 0xff); SET(WM_DEPTH_STENCIL, ds.StencilWriteMask, opt_ds.stencil.front.write_mask & 0xff); SET(WM_DEPTH_STENCIL, ds.BackfaceStencilTestMask, opt_ds.stencil.back.compare_mask & 0xff); SET(WM_DEPTH_STENCIL, ds.BackfaceStencilWriteMask, opt_ds.stencil.back.write_mask & 0xff); SET(WM_DEPTH_STENCIL, ds.StencilReferenceValue, opt_ds.stencil.front.reference & 0xff); SET(WM_DEPTH_STENCIL, ds.BackfaceStencilReferenceValue, opt_ds.stencil.back.reference & 0xff); SET(WM_DEPTH_STENCIL, ds.DepthTestEnable, opt_ds.depth.test_enable); SET(WM_DEPTH_STENCIL, ds.DepthBufferWriteEnable, opt_ds.depth.write_enable); SET(WM_DEPTH_STENCIL, ds.DepthTestFunction, genX(vk_to_intel_compare_op)[opt_ds.depth.compare_op]); SET(WM_DEPTH_STENCIL, ds.StencilTestEnable, opt_ds.stencil.test_enable); SET(WM_DEPTH_STENCIL, ds.StencilBufferWriteEnable, opt_ds.stencil.write_enable); SET(WM_DEPTH_STENCIL, ds.StencilFailOp, genX(vk_to_intel_stencil_op)[opt_ds.stencil.front.op.fail]); SET(WM_DEPTH_STENCIL, ds.StencilPassDepthPassOp, genX(vk_to_intel_stencil_op)[opt_ds.stencil.front.op.pass]); SET(WM_DEPTH_STENCIL, ds.StencilPassDepthFailOp, genX(vk_to_intel_stencil_op)[opt_ds.stencil.front.op.depth_fail]); SET(WM_DEPTH_STENCIL, ds.StencilTestFunction, genX(vk_to_intel_compare_op)[opt_ds.stencil.front.op.compare]); SET(WM_DEPTH_STENCIL, ds.BackfaceStencilFailOp, genX(vk_to_intel_stencil_op)[opt_ds.stencil.back.op.fail]); SET(WM_DEPTH_STENCIL, ds.BackfaceStencilPassDepthPassOp, genX(vk_to_intel_stencil_op)[opt_ds.stencil.back.op.pass]); SET(WM_DEPTH_STENCIL, ds.BackfaceStencilPassDepthFailOp, genX(vk_to_intel_stencil_op)[opt_ds.stencil.back.op.depth_fail]); SET(WM_DEPTH_STENCIL, ds.BackfaceStencilTestFunction, genX(vk_to_intel_compare_op)[opt_ds.stencil.back.op.compare]); #if GFX_VER == 9 const bool pma = want_stencil_pma_fix(cmd_buffer, dyn, &opt_ds); SET(PMA_FIX, pma_fix, pma); #endif #if INTEL_WA_18019816803_GFX_VER if (intel_needs_workaround(cmd_buffer->device->info, 18019816803)) { bool ds_write_state = opt_ds.depth.write_enable || opt_ds.stencil.write_enable; SET(WA_18019816803, ds_write_state, ds_write_state); } #endif } #if INTEL_WA_14018283232_GFX_VER if (intel_needs_workaround(cmd_buffer->device->info, 14018283232) && ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) || BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE))) { SET(WA_14018283232, 
wa_14018283232_toggle, dyn->ds.depth.bounds_test.enable && wm_prog_data && wm_prog_data->uses_kill); } #endif #if GFX_VER >= 12 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE) || BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_BOUNDS)) { SET(DEPTH_BOUNDS, db.DepthBoundsTestEnable, dyn->ds.depth.bounds_test.enable); /* Only look at updating the bounds if testing is enabled */ if (dyn->ds.depth.bounds_test.enable) { SET(DEPTH_BOUNDS, db.DepthBoundsTestMinValue, dyn->ds.depth.bounds_test.min); SET(DEPTH_BOUNDS, db.DepthBoundsTestMaxValue, dyn->ds.depth.bounds_test.max); } } #endif if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_STIPPLE) || BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_STIPPLE_ENABLE)) { SET(LINE_STIPPLE, ls.LineStipplePattern, dyn->rs.line.stipple.pattern); SET(LINE_STIPPLE, ls.LineStippleInverseRepeatCount, 1.0f / MAX2(1, dyn->rs.line.stipple.factor)); SET(LINE_STIPPLE, ls.LineStippleRepeatCount, dyn->rs.line.stipple.factor); SET(WM, wm.LineStippleEnable, dyn->rs.line.stipple.enable); } if ((gfx->dirty & ANV_CMD_DIRTY_RESTART_INDEX) || BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_RESTART_ENABLE)) { SET(VF, vf.IndexedDrawCutIndexEnable, dyn->ia.primitive_restart_enable); SET(VF, vf.CutIndex, gfx->restart_index); } if (gfx->dirty & ANV_CMD_DIRTY_INDEX_BUFFER) BITSET_SET(hw_state->dirty, ANV_GFX_STATE_INDEX_BUFFER); #if GFX_VERx10 >= 125 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_RESTART_ENABLE)) SET(VFG, vfg.ListCutIndexEnable, dyn->ia.primitive_restart_enable); #endif if (cmd_buffer->device->vk.enabled_extensions.EXT_sample_locations && (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_SAMPLE_LOCATIONS) || BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_SAMPLE_LOCATIONS_ENABLE))) BITSET_SET(hw_state->dirty, ANV_GFX_STATE_SAMPLE_PATTERN); if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) || BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_ATTACHMENT_FEEDBACK_LOOP_ENABLE)) { SET_STAGE(PS_EXTRA, ps_extra.PixelShaderKillsPixel, wm_prog_data && (pipeline->rp_has_ds_self_dep || has_ds_feedback_loop(dyn) || wm_prog_data->uses_kill), FRAGMENT); } #if GFX_VERx10 >= 125 if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) { SET_STAGE(PS_EXTRA, ps_extra.PixelShaderHasUAV, wm_prog_data && wm_prog_data->has_side_effects, FRAGMENT); } #else if (cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_PIPELINE | ANV_CMD_DIRTY_OCCLUSION_QUERY_ACTIVE)) { /* Prior to Gfx12.5 the HW seems to avoid spawning fragment shaders even * if 3DSTATE_PS_EXTRA::PixelShaderKillsPixel=true when * 3DSTATE_PS_BLEND::HasWriteableRT=false. This is causing problems with * occlusion queries with 0 attachments. There are no CTS tests * exercising this but zink+anv fails a bunch of tests like piglit * arb_framebuffer_no_attachments-query. * * Here we choose to tweak the PixelShaderHasUAV to make sure the * fragment shaders are run properly. 
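       *
       * In other words: with zero color attachments and an occlusion query
       * outstanding, we claim the shader has a UAV so the hardware still
       * spawns fragment shaders and the query keeps counting.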
*/ SET_STAGE(PS_EXTRA, ps_extra.PixelShaderHasUAV, wm_prog_data && (wm_prog_data->has_side_effects || (gfx->color_att_count == 0 && gfx->n_occlusion_queries > 0)), FRAGMENT); } #endif if ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) || (gfx->dirty & ANV_CMD_DIRTY_RENDER_TARGETS) || BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_LOGIC_OP) || BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES) || BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_LOGIC_OP_ENABLE) || BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_ALPHA_TO_ONE_ENABLE) || BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_WRITE_MASKS) || BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_ENABLES) || BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_EQUATIONS)) { const uint8_t color_writes = dyn->cb.color_write_enables; bool has_writeable_rt = anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT) && !anv_cmd_buffer_all_color_write_masked(cmd_buffer); SET(BLEND_STATE, blend.AlphaToCoverageEnable, dyn->ms.alpha_to_coverage_enable); SET(BLEND_STATE, blend.AlphaToOneEnable, dyn->ms.alpha_to_one_enable); SET(BLEND_STATE, blend.ColorDitherEnable, cmd_buffer->state.gfx.rendering_flags & VK_RENDERING_ENABLE_LEGACY_DITHERING_BIT_EXT); bool independent_alpha_blend = false; /* Wa_14018912822, check if we set these during RT setup. */ bool color_blend_zero = false; bool alpha_blend_zero = false; for (uint32_t i = 0; i < MAX_RTS; i++) { /* Disable anything above the current number of color attachments. */ bool write_disabled = i >= gfx->color_att_count || (color_writes & BITFIELD_BIT(i)) == 0; SET(BLEND_STATE, blend.rts[i].WriteDisableAlpha, write_disabled || (dyn->cb.attachments[i].write_mask & VK_COLOR_COMPONENT_A_BIT) == 0); SET(BLEND_STATE, blend.rts[i].WriteDisableRed, write_disabled || (dyn->cb.attachments[i].write_mask & VK_COLOR_COMPONENT_R_BIT) == 0); SET(BLEND_STATE, blend.rts[i].WriteDisableGreen, write_disabled || (dyn->cb.attachments[i].write_mask & VK_COLOR_COMPONENT_G_BIT) == 0); SET(BLEND_STATE, blend.rts[i].WriteDisableBlue, write_disabled || (dyn->cb.attachments[i].write_mask & VK_COLOR_COMPONENT_B_BIT) == 0); /* Vulkan specification 1.2.168, VkLogicOp: * * "Logical operations are controlled by the logicOpEnable and * logicOp members of VkPipelineColorBlendStateCreateInfo. If * logicOpEnable is VK_TRUE, then a logical operation selected by * logicOp is applied between each color attachment and the * fragment’s corresponding output value, and blending of all * attachments is treated as if it were disabled." * * From the Broadwell PRM Volume 2d: Command Reference: Structures: * BLEND_STATE_ENTRY: * * "Enabling LogicOp and Color Buffer Blending at the same time is * UNDEFINED" */ SET(BLEND_STATE, blend.rts[i].LogicOpFunction, genX(vk_to_intel_logic_op)[dyn->cb.logic_op]); SET(BLEND_STATE, blend.rts[i].LogicOpEnable, dyn->cb.logic_op_enable); SET(BLEND_STATE, blend.rts[i].ColorClampRange, COLORCLAMP_RTFORMAT); SET(BLEND_STATE, blend.rts[i].PreBlendColorClampEnable, true); SET(BLEND_STATE, blend.rts[i].PostBlendColorClampEnable, true); /* Setup blend equation. 
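          *
          * Color and alpha are tracked separately; whenever any factor or
          * op differs between the two, independent_alpha_blend is raised
          * below and ends up in BLEND_STATE::IndependentAlphaBlendEnable.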
          */
         SET(BLEND_STATE, blend.rts[i].ColorBlendFunction,
             genX(vk_to_intel_blend_op)[
                dyn->cb.attachments[i].color_blend_op]);
         SET(BLEND_STATE, blend.rts[i].AlphaBlendFunction,
             genX(vk_to_intel_blend_op)[
                dyn->cb.attachments[i].alpha_blend_op]);

         if (dyn->cb.attachments[i].src_color_blend_factor !=
             dyn->cb.attachments[i].src_alpha_blend_factor ||
             dyn->cb.attachments[i].dst_color_blend_factor !=
             dyn->cb.attachments[i].dst_alpha_blend_factor ||
             dyn->cb.attachments[i].color_blend_op !=
             dyn->cb.attachments[i].alpha_blend_op) {
            independent_alpha_blend = true;
         }

         /* The Dual Source Blending documentation says:
          *
          * "If SRC1 is included in a src/dst blend factor and
          * a DualSource RT Write message is not used, results
          * are UNDEFINED. (This reflects the same restriction in DX APIs,
          * where undefined results are produced if “o1” is not written
          * by a PS – there are no default values defined)."
          *
          * There is no way to gracefully fix this undefined situation
          * so we just disable the blending to prevent possible issues.
          */
         if (wm_prog_data && !wm_prog_data->dual_src_blend &&
             anv_is_dual_src_blend_equation(&dyn->cb.attachments[i])) {
            SET(BLEND_STATE, blend.rts[i].ColorBufferBlendEnable, false);
         } else {
            SET(BLEND_STATE, blend.rts[i].ColorBufferBlendEnable,
                !dyn->cb.logic_op_enable &&
                dyn->cb.attachments[i].blend_enable);
         }

         /* Our hardware applies the blend factor prior to the blend function
          * regardless of what function is used. Technically, this means the
          * hardware can do MORE than GL or Vulkan specify. However, it also
          * means that, for MIN and MAX, we have to stomp the blend factor to
          * ONE to make it a no-op.
          */
         uint32_t SourceBlendFactor;
         uint32_t DestinationBlendFactor;
         uint32_t SourceAlphaBlendFactor;
         uint32_t DestinationAlphaBlendFactor;
         if (dyn->cb.attachments[i].color_blend_op == VK_BLEND_OP_MIN ||
             dyn->cb.attachments[i].color_blend_op == VK_BLEND_OP_MAX) {
            SourceBlendFactor = BLENDFACTOR_ONE;
            DestinationBlendFactor = BLENDFACTOR_ONE;
         } else {
            SourceBlendFactor = genX(vk_to_intel_blend)[
               dyn->cb.attachments[i].src_color_blend_factor];
            DestinationBlendFactor = genX(vk_to_intel_blend)[
               dyn->cb.attachments[i].dst_color_blend_factor];
         }

         if (dyn->cb.attachments[i].alpha_blend_op == VK_BLEND_OP_MIN ||
             dyn->cb.attachments[i].alpha_blend_op == VK_BLEND_OP_MAX) {
            SourceAlphaBlendFactor = BLENDFACTOR_ONE;
            DestinationAlphaBlendFactor = BLENDFACTOR_ONE;
         } else {
            SourceAlphaBlendFactor = genX(vk_to_intel_blend)[
               dyn->cb.attachments[i].src_alpha_blend_factor];
            DestinationAlphaBlendFactor = genX(vk_to_intel_blend)[
               dyn->cb.attachments[i].dst_alpha_blend_factor];
         }

         /* Replace any Src1 value by 1.0 if dual source blending is not
          * enabled.
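          *
          * (For example, a pipeline whose fragment shader only writes o0
          * but whose blend state names VK_BLEND_FACTOR_SRC1_COLOR gets that
          * factor quietly flattened to BLENDFACTOR_ONE here.)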
*/ if (wm_prog_data && !wm_prog_data->dual_src_blend) { if (is_src1_blend_factor(SourceBlendFactor)) SourceBlendFactor = BLENDFACTOR_ONE; if (is_src1_blend_factor(DestinationBlendFactor)) DestinationBlendFactor = BLENDFACTOR_ONE; } if (instance->intel_enable_wa_14018912822 && intel_needs_workaround(cmd_buffer->device->info, 14018912822) && dyn->ms.rasterization_samples > 1) { if (DestinationBlendFactor == BLENDFACTOR_ZERO) { DestinationBlendFactor = BLENDFACTOR_CONST_COLOR; color_blend_zero = true; } if (DestinationAlphaBlendFactor == BLENDFACTOR_ZERO) { DestinationAlphaBlendFactor = BLENDFACTOR_CONST_ALPHA; alpha_blend_zero = true; } } SET(BLEND_STATE, blend.rts[i].SourceBlendFactor, SourceBlendFactor); SET(BLEND_STATE, blend.rts[i].DestinationBlendFactor, DestinationBlendFactor); SET(BLEND_STATE, blend.rts[i].SourceAlphaBlendFactor, SourceAlphaBlendFactor); SET(BLEND_STATE, blend.rts[i].DestinationAlphaBlendFactor, DestinationAlphaBlendFactor); } gfx->color_blend_zero = color_blend_zero; gfx->alpha_blend_zero = alpha_blend_zero; SET(BLEND_STATE, blend.IndependentAlphaBlendEnable, independent_alpha_blend); /* 3DSTATE_PS_BLEND to be consistent with the rest of the * BLEND_STATE_ENTRY. */ SET(PS_BLEND, ps_blend.HasWriteableRT, has_writeable_rt); SET(PS_BLEND, ps_blend.ColorBufferBlendEnable, GET(blend.rts[0].ColorBufferBlendEnable)); SET(PS_BLEND, ps_blend.SourceAlphaBlendFactor, GET(blend.rts[0].SourceAlphaBlendFactor)); SET(PS_BLEND, ps_blend.DestinationAlphaBlendFactor, gfx->alpha_blend_zero ? BLENDFACTOR_CONST_ALPHA : GET(blend.rts[0].DestinationAlphaBlendFactor)); SET(PS_BLEND, ps_blend.SourceBlendFactor, GET(blend.rts[0].SourceBlendFactor)); SET(PS_BLEND, ps_blend.DestinationBlendFactor, gfx->color_blend_zero ? BLENDFACTOR_CONST_COLOR : GET(blend.rts[0].DestinationBlendFactor)); SET(PS_BLEND, ps_blend.AlphaTestEnable, false); SET(PS_BLEND, ps_blend.IndependentAlphaBlendEnable, GET(blend.IndependentAlphaBlendEnable)); SET(PS_BLEND, ps_blend.AlphaToCoverageEnable, dyn->ms.alpha_to_coverage_enable); } if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS)) { SET(CC_STATE, cc.BlendConstantColorRed, gfx->color_blend_zero ? 0.0f : dyn->cb.blend_constants[0]); SET(CC_STATE, cc.BlendConstantColorGreen, gfx->color_blend_zero ? 0.0f : dyn->cb.blend_constants[1]); SET(CC_STATE, cc.BlendConstantColorBlue, gfx->color_blend_zero ? 0.0f : dyn->cb.blend_constants[2]); SET(CC_STATE, cc.BlendConstantColorAlpha, gfx->alpha_blend_zero ? 0.0f : dyn->cb.blend_constants[3]); } if ((gfx->dirty & ANV_CMD_DIRTY_RENDER_AREA) || BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS) || BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_SCISSORS) || BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_CLAMP_ENABLE) || BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE)) { struct anv_instance *instance = cmd_buffer->device->physical->instance; const VkViewport *viewports = dyn->vp.viewports; const float scale = dyn->vp.depth_clip_negative_one_to_one ? 0.5f : 1.0f; /* From the Vulkan 1.0.45 spec: * * "If the last active vertex processing stage shader entry point's * interface does not include a variable decorated with * ViewportIndex, then the first viewport is used." * * This could mean that we might need to set the MaximumVPIndex based on * the pipeline's last stage, but if the last shader doesn't write the * viewport index and the VUE header is used, the compiler will force * the value to 0 (which is what the spec requires above). 
      Otherwise it
       * seems like the HW should be pulling 0 if the VUE header is not
       * present.
       *
       * Avoiding a check on the pipeline seems to prevent additional
       * emissions of 3DSTATE_CLIP which appear to impact performance on
       * Assassin's Creed Valhalla.
       */
      SET(CLIP, clip.MaximumVPIndex,
          dyn->vp.viewport_count > 0 ? dyn->vp.viewport_count - 1 : 0);

      for (uint32_t i = 0; i < dyn->vp.viewport_count; i++) {
         const VkViewport *vp = &viewports[i];

         /* The gfx7 state struct has just the matrix and guardband fields, the
          * gfx8 struct adds the min/max viewport fields. */
         struct GENX(SF_CLIP_VIEWPORT) sfv = {
            .ViewportMatrixElementm00 = vp->width / 2,
            .ViewportMatrixElementm11 = vp->height / 2,
            .ViewportMatrixElementm22 = (vp->maxDepth - vp->minDepth) * scale,
            .ViewportMatrixElementm30 = vp->x + vp->width / 2,
            .ViewportMatrixElementm31 = vp->y + vp->height / 2,
            .ViewportMatrixElementm32 = dyn->vp.depth_clip_negative_one_to_one ?
               (vp->minDepth + vp->maxDepth) * scale : vp->minDepth,
            .XMinClipGuardband = -1.0f,
            .XMaxClipGuardband = 1.0f,
            .YMinClipGuardband = -1.0f,
            .YMaxClipGuardband = 1.0f,
            .XMinViewPort = vp->x,
            .XMaxViewPort = vp->x + vp->width - 1,
            .YMinViewPort = MIN2(vp->y, vp->y + vp->height),
            .YMaxViewPort = MAX2(vp->y, vp->y + vp->height) - 1,
         };

         /* Fix depth test misrenderings by lowering translated depth range */
         if (instance->lower_depth_range_rate != 1.0f)
            sfv.ViewportMatrixElementm32 *= instance->lower_depth_range_rate;

         const uint32_t fb_size_max = 1 << 14;
         uint32_t x_min = 0, x_max = fb_size_max;
         uint32_t y_min = 0, y_max = fb_size_max;

         /* If we have a valid renderArea, include that */
         if (gfx->render_area.extent.width > 0 &&
             gfx->render_area.extent.height > 0) {
            x_min = MAX2(x_min, gfx->render_area.offset.x);
            x_max = MIN2(x_max, gfx->render_area.offset.x +
                                gfx->render_area.extent.width);
            y_min = MAX2(y_min, gfx->render_area.offset.y);
            y_max = MIN2(y_max, gfx->render_area.offset.y +
                                gfx->render_area.extent.height);
         }

         /* The client is required to have enough scissors for whatever it
          * sets as ViewportIndex but it's possible that they've got more
          * viewports set from a previous command. Also, from the Vulkan
          * 1.3.207:
          *
          *    "The application must ensure (using scissor if necessary) that
          *     all rendering is contained within the render area."
          *
          * If the client doesn't set a scissor, that basically means it
          * guarantees everything is in-bounds already. If we end up using a
          * guardband of [-1, 1] in that case, there shouldn't be much loss.
          * It's theoretically possible that they could do all their clipping
          * with clip planes but that'd be a bit odd.
          */
         if (i < dyn->vp.scissor_count) {
            const VkRect2D *scissor = &dyn->vp.scissors[i];
            x_min = MAX2(x_min, scissor->offset.x);
            x_max = MIN2(x_max, scissor->offset.x + scissor->extent.width);
            y_min = MAX2(y_min, scissor->offset.y);
            y_max = MIN2(y_max, scissor->offset.y + scissor->extent.height);
         }

         /* Only bother calculating the guardband if our known render area is
          * less than the maximum size. Otherwise, it will calculate [-1, 1]
          * anyway but possibly with precision loss.
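          *
          * In the common case it is the scissor/render-area intersection
          * computed above that shrinks [x_min, x_max] x [y_min, y_max]
          * below the 16K x 16K maximum (fb_size_max = 1 << 14) and makes
          * this computation worthwhile.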
          */
         if (x_min > 0 || x_max < fb_size_max ||
             y_min > 0 || y_max < fb_size_max) {
            intel_calculate_guardband_size(x_min, x_max, y_min, y_max,
                                           sfv.ViewportMatrixElementm00,
                                           sfv.ViewportMatrixElementm11,
                                           sfv.ViewportMatrixElementm30,
                                           sfv.ViewportMatrixElementm31,
                                           &sfv.XMinClipGuardband,
                                           &sfv.XMaxClipGuardband,
                                           &sfv.YMinClipGuardband,
                                           &sfv.YMaxClipGuardband);
         }

#define SET_VP(bit, state, field)                                       \
         do {                                                           \
            if (hw_state->state.field != sfv.field) {                   \
               hw_state->state.field = sfv.field;                      \
               BITSET_SET(hw_state->dirty,                              \
                          ANV_GFX_STATE_##bit);                         \
            }                                                           \
         } while (0)
         SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], ViewportMatrixElementm00);
         SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], ViewportMatrixElementm11);
         SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], ViewportMatrixElementm22);
         SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], ViewportMatrixElementm30);
         SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], ViewportMatrixElementm31);
         SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], ViewportMatrixElementm32);
         SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], XMinClipGuardband);
         SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], XMaxClipGuardband);
         SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], YMinClipGuardband);
         SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], YMaxClipGuardband);
         SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], XMinViewPort);
         SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], XMaxViewPort);
         SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], YMinViewPort);
         SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], YMaxViewPort);
#undef SET_VP

         const bool depth_range_unrestricted =
            cmd_buffer->device->vk.enabled_extensions.EXT_depth_range_unrestricted;

         float min_depth_limit = depth_range_unrestricted ? -FLT_MAX : 0.0;
         float max_depth_limit = depth_range_unrestricted ? FLT_MAX : 1.0;

         float min_depth = dyn->rs.depth_clamp_enable ?
                           MIN2(vp->minDepth, vp->maxDepth) : min_depth_limit;
         float max_depth = dyn->rs.depth_clamp_enable ?
                           MAX2(vp->minDepth, vp->maxDepth) : max_depth_limit;

         SET(VIEWPORT_CC, vp_cc.elem[i].MinimumDepth, min_depth);
         SET(VIEWPORT_CC, vp_cc.elem[i].MaximumDepth, max_depth);
      }

      /* If the HW state is already considered dirty or the number of
       * viewports programmed previously is smaller than what we need, update
       * the viewport count and ensure the HW state is dirty. Otherwise, if
       * the number of viewports programmed previously was larger than what
       * we need now, there is no need to reemit; we can just keep the old
       * programmed values.
       */
      if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_SF_CLIP) ||
          hw_state->vp_sf_clip.count < dyn->vp.viewport_count) {
         hw_state->vp_sf_clip.count = dyn->vp.viewport_count;
         BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_SF_CLIP);
      }
      if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_CC) ||
          hw_state->vp_cc.count < dyn->vp.viewport_count) {
         hw_state->vp_cc.count = dyn->vp.viewport_count;
         BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_CC);
      }
   }

   if ((gfx->dirty & ANV_CMD_DIRTY_RENDER_AREA) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_SCISSORS) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS)) {
      const VkRect2D *scissors = dyn->vp.scissors;
      const VkViewport *viewports = dyn->vp.viewports;

      for (uint32_t i = 0; i < dyn->vp.scissor_count; i++) {
         const VkRect2D *s = &scissors[i];
         const VkViewport *vp = &viewports[i];

         const int max = 0xffff;

         uint32_t y_min = MAX2(s->offset.y, MIN2(vp->y, vp->y + vp->height));
         uint32_t x_min = MAX2(s->offset.x, vp->x);
         int64_t y_max = MIN2(s->offset.y + s->extent.height - 1,
                              MAX2(vp->y, vp->y + vp->height) - 1);
         int64_t x_max = MIN2(s->offset.x + s->extent.width - 1,
                              vp->x + vp->width - 1);

         y_max = CLAMP(y_max, 0, INT16_MAX >> 1);
         x_max = CLAMP(x_max, 0, INT16_MAX >> 1);

         /* Do this math using int64_t so overflow gets clamped correctly. */
         if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
            y_min = CLAMP((uint64_t) y_min, gfx->render_area.offset.y, max);
            x_min = CLAMP((uint64_t) x_min, gfx->render_area.offset.x, max);
            y_max = CLAMP((uint64_t) y_max, 0,
                          gfx->render_area.offset.y +
                          gfx->render_area.extent.height - 1);
            x_max = CLAMP((uint64_t) x_max, 0,
                          gfx->render_area.offset.x +
                          gfx->render_area.extent.width - 1);
         }

         if (s->extent.width <= 0 || s->extent.height <= 0) {
            /* Since xmax and ymax are inclusive, we have to have xmax < xmin
             * or ymax < ymin for empty clips. In case clip x, y, width height
             * are all 0, the clamps below produce 0 for xmin, ymin, xmax,
             * ymax, which isn't what we want. Just special case empty clips
             * and produce a canonical empty clip.
             */
            SET(SCISSOR, scissor.elem[i].ScissorRectangleYMin, 1);
            SET(SCISSOR, scissor.elem[i].ScissorRectangleXMin, 1);
            SET(SCISSOR, scissor.elem[i].ScissorRectangleYMax, 0);
            SET(SCISSOR, scissor.elem[i].ScissorRectangleXMax, 0);
         } else {
            SET(SCISSOR, scissor.elem[i].ScissorRectangleYMin, y_min);
            SET(SCISSOR, scissor.elem[i].ScissorRectangleXMin, x_min);
            SET(SCISSOR, scissor.elem[i].ScissorRectangleYMax, y_max);
            SET(SCISSOR, scissor.elem[i].ScissorRectangleXMax, x_max);
         }
      }

      /* If the HW state is already considered dirty or the number of
       * scissors programmed previously is smaller than what we need, update
       * the scissor count and ensure the HW state is dirty. Otherwise, if
       * the number of scissors programmed previously was larger than what we
       * need now, there is no need to reemit; we can just keep the old
       * programmed values.
       */
      if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SCISSOR) ||
          hw_state->scissor.count < dyn->vp.scissor_count) {
         hw_state->scissor.count = dyn->vp.scissor_count;
         BITSET_SET(hw_state->dirty, ANV_GFX_STATE_SCISSOR);
      }
   }

#if GFX_VERx10 == 125
   if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_RENDER_TARGETS)) {
      unsigned fb_width, fb_height, tile_width, tile_height;

      if (cmd_buffer->device->physical->instance->enable_tbimr &&
          calculate_render_area(cmd_buffer, &fb_width, &fb_height) &&
          calculate_tile_dimensions(cmd_buffer, fb_width, fb_height,
                                    &tile_width, &tile_height)) {
         /* Use a batch size of 128 polygons per slice as recommended
          * by BSpec 68436 "TBIMR Programming".
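          *
          * With the formula below a 2-slice part, for instance, gets
          * DIV_ROUND_UP(2, 2) * 256 = 256 polygons per batch (128 per
          * slice), and TBIMRBatchSize is programmed as
          * util_logbase2(256) - 5 = 3.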
*/ const unsigned num_slices = cmd_buffer->device->info->num_slices; const unsigned batch_size = DIV_ROUND_UP(num_slices, 2) * 256; SET(TBIMR_TILE_PASS_INFO, tbimr.TileRectangleHeight, tile_height); SET(TBIMR_TILE_PASS_INFO, tbimr.TileRectangleWidth, tile_width); SET(TBIMR_TILE_PASS_INFO, tbimr.VerticalTileCount, DIV_ROUND_UP(fb_height, tile_height)); SET(TBIMR_TILE_PASS_INFO, tbimr.HorizontalTileCount, DIV_ROUND_UP(fb_width, tile_width)); SET(TBIMR_TILE_PASS_INFO, tbimr.TBIMRBatchSize, util_logbase2(batch_size) - 5); SET(TBIMR_TILE_PASS_INFO, tbimr.TileBoxCheck, true); SET(TBIMR_TILE_PASS_INFO, use_tbimr, true); } else { hw_state->use_tbimr = false; } } #endif struct anv_push_constants *push = &cmd_buffer->state.gfx.base.push_constants; /* If the pipeline uses a dynamic value of patch_control_points and either * the pipeline change or the dynamic value change, check the value and * reemit if needed. */ if (pipeline->dynamic_patch_control_points && ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) || BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_TS_PATCH_CONTROL_POINTS)) && push->gfx.tcs_input_vertices != dyn->ts.patch_control_points) { push->gfx.tcs_input_vertices = dyn->ts.patch_control_points; cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT; gfx->base.push_constants_data_dirty = true; } #undef GET #undef SET #undef SET_STAGE vk_dynamic_graphics_state_clear_dirty(&cmd_buffer->vk.dynamic_graphics_state); return dirty_state_mask; } static void emit_wa_18020335297_dummy_draw(struct anv_cmd_buffer *cmd_buffer) { #if GFX_VERx10 >= 125 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VFG), vfg) { vfg.DistributionMode = RR_STRICT; } anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF), vf) { vf.GeometryDistributionEnable = true; } #endif #if GFX_VER >= 12 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr) { pr.ReplicaMask = 1; } #endif anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_RASTER), rr) { rr.CullMode = CULLMODE_NONE; rr.FrontFaceFillMode = FILL_MODE_SOLID; rr.BackFaceFillMode = FILL_MODE_SOLID; } anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_STATISTICS), zero); anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_SGVS), zero); #if GFX_VER >= 11 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_SGVS_2), zero); #endif anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CLIP), clip) { clip.ClipEnable = true; clip.ClipMode = CLIPMODE_REJECT_ALL; } anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VS), zero); anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_GS), zero); anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_HS), zero); anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_TE), zero); anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DS), zero); anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_STREAMOUT), zero); uint32_t *vertex_elements = anv_batch_emitn(&cmd_buffer->batch, 1 + 2 * 2, GENX(3DSTATE_VERTEX_ELEMENTS)); uint32_t *ve_pack_dest = &vertex_elements[1]; for (int i = 0; i < 2; i++) { struct GENX(VERTEX_ELEMENT_STATE) element = { .Valid = true, .SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT, .Component0Control = VFCOMP_STORE_0, .Component1Control = VFCOMP_STORE_0, .Component2Control = i == 0 ? VFCOMP_STORE_0 : VFCOMP_STORE_1_FP, .Component3Control = i == 0 ? 
      };
      GENX(VERTEX_ELEMENT_STATE_pack)(NULL, ve_pack_dest, &element);
      ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
   }

   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_TOPOLOGY), topo) {
      topo.PrimitiveTopologyType = _3DPRIM_TRILIST;
   }

   /* Emit a dummy draw per slice. */
   for (unsigned i = 0; i < cmd_buffer->device->info->num_slices; i++) {
      anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
         prim.VertexCountPerInstance = 3;
         prim.PrimitiveTopologyType = _3DPRIM_TRILIST;
         prim.InstanceCount = 1;
         prim.VertexAccessType = SEQUENTIAL;
      }
   }
}

#if INTEL_WA_14018283232_GFX_VER
void
genX(batch_emit_wa_14018283232)(struct anv_batch *batch)
{
   anv_batch_emit(batch, GENX(RESOURCE_BARRIER), barrier) {
      barrier.ResourceBarrierBody = (struct GENX(RESOURCE_BARRIER_BODY)) {
         .BarrierType = RESOURCE_BARRIER_TYPE_IMMEDIATE,
         .SignalStage = RESOURCE_BARRIER_STAGE_COLOR,
         .WaitStage = RESOURCE_BARRIER_STAGE_PIXEL,
      };
   }
}
#endif

/**
 * This function handles dirty state emission to the batch buffer.
 */
static void
cmd_buffer_gfx_state_emission(struct anv_cmd_buffer *cmd_buffer)
{
   struct anv_device *device = cmd_buffer->device;
   struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
   struct anv_graphics_pipeline *pipeline =
      anv_pipeline_to_graphics(gfx->base.pipeline);
   const struct vk_dynamic_graphics_state *dyn =
      &cmd_buffer->vk.dynamic_graphics_state;
   struct anv_gfx_dynamic_state *hw_state = &gfx->dyn_state;
   const bool protected =
      cmd_buffer->vk.pool->flags & VK_COMMAND_POOL_CREATE_PROTECTED_BIT;

   if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_URB)) {
      genX(urb_workaround)(cmd_buffer, &pipeline->urb_cfg);

      anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.urb);

      memcpy(&gfx->urb_cfg, &pipeline->urb_cfg,
             sizeof(struct intel_urb_config));
   }

   if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_PRIMITIVE_REPLICATION))
      anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline,
                                    final.primitive_replication);

   if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VF_SGVS_INSTANCING))
      anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline,
                                    final.vf_sgvs_instancing);

   if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VF_SGVS))
      anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline,
                                    final.vf_sgvs);

#if GFX_VER >= 11
   if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VF_SGVS_2))
      anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline,
                                    final.vf_sgvs_2);
#endif

   if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VS)) {
      anv_batch_emit_pipeline_state_protected(&cmd_buffer->batch, pipeline,
                                              final.vs, protected);
   }

   if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_HS)) {
      anv_batch_emit_pipeline_state_protected(&cmd_buffer->batch, pipeline,
                                              final.hs, protected);
   }

   if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_DS)) {
      anv_batch_emit_pipeline_state_protected(&cmd_buffer->batch, pipeline,
                                              final.ds, protected);
   }

   if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VF_STATISTICS))
      anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline,
                                    final.vf_statistics);

   if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SBE))
      anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.sbe);

   if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SBE_SWIZ))
      anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline,
                                    final.sbe_swiz);

   if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SO_DECL_LIST)) {
      /* Wa_16011773973:
       * If SOL is enabled and SO_DECL state has to be programmed,
       *    1. Send 3D State SOL state with SOL disabled
       *    2. Send SO_DECL NP state
       *    3. Send 3D State SOL with SOL Enabled
       */
      if (intel_needs_workaround(device->info, 16011773973) &&
          pipeline->uses_xfb)
         anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_STREAMOUT), so);

      anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline,
                                    final.so_decl_list);

#if GFX_VER >= 11 && GFX_VER < 20
      /* ICL PRMs, Volume 2a - Command Reference: Instructions,
       * 3DSTATE_SO_DECL_LIST:
       *
       *    "Workaround: This command must be followed by a PIPE_CONTROL
       *     with CS Stall bit set."
       *
       * On DG2+ also known as Wa_1509820217.
       */
      genx_batch_emit_pipe_control(&cmd_buffer->batch, device->info,
                                   cmd_buffer->state.current_pipeline,
                                   ANV_PIPE_CS_STALL_BIT);
#endif
   }

   if (device->vk.enabled_extensions.EXT_mesh_shader) {
      if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_MESH_CONTROL)) {
         anv_batch_emit_pipeline_state_protected(&cmd_buffer->batch, pipeline,
                                                 final.mesh_control, protected);
      }

      if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_MESH_SHADER))
         anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline,
                                       final.mesh_shader);

      if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_MESH_DISTRIB))
         anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline,
                                       final.mesh_distrib);

      if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_TASK_CONTROL)) {
         anv_batch_emit_pipeline_state_protected(&cmd_buffer->batch, pipeline,
                                                 final.task_control, protected);
      }

      if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_TASK_SHADER))
         anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline,
                                       final.task_shader);

      if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_TASK_REDISTRIB))
         anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline,
                                       final.task_redistrib);

      if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SBE_MESH))
         anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline,
                                       final.sbe_mesh);

      if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_CLIP_MESH))
         anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline,
                                       final.clip_mesh);
   } else {
      assert(!BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_MESH_CONTROL) &&
             !BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_MESH_SHADER) &&
             !BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_MESH_DISTRIB) &&
             !BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_TASK_CONTROL) &&
             !BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_TASK_SHADER) &&
             !BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_TASK_REDISTRIB) &&
             !BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_CLIP_MESH) &&
             !BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SBE_MESH));
   }

#define INIT(category, name) \
   .name = hw_state->category.name
#define SET(s, category, name) \
   s.name = hw_state->category.name

   /* Now the potentially dynamic instructions */

   if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_PS)) {
      anv_batch_emit_merge_protected(&cmd_buffer->batch, GENX(3DSTATE_PS),
                                     pipeline, partial.ps, ps, protected) {
         SET(ps, ps, KernelStartPointer0);
         SET(ps, ps, KernelStartPointer1);
         SET(ps, ps, DispatchGRFStartRegisterForConstantSetupData0);
         SET(ps, ps, DispatchGRFStartRegisterForConstantSetupData1);

#if GFX_VER < 20
         SET(ps, ps, KernelStartPointer2);
         SET(ps, ps, DispatchGRFStartRegisterForConstantSetupData2);

         SET(ps, ps, _8PixelDispatchEnable);
         SET(ps, ps, _16PixelDispatchEnable);
         SET(ps, ps, _32PixelDispatchEnable);
#else
         SET(ps, ps, Kernel0Enable);
         SET(ps, ps, Kernel1Enable);
         SET(ps, ps, Kernel0SIMDWidth);
         SET(ps, ps, Kernel1SIMDWidth);
         SET(ps, ps, Kernel0PolyPackingPolicy);
#endif
         SET(ps, ps, PositionXYOffsetSelect);
      }
   }

   if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_PS_EXTRA)) {
      anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_PS_EXTRA),
                           pipeline, partial.ps_extra, pse) {
         SET(pse, ps_extra, PixelShaderHasUAV);
         SET(pse, ps_extra, PixelShaderIsPerSample);
#if GFX_VER >= 11
         SET(pse, ps_extra, PixelShaderIsPerCoarsePixel);
#endif
#if GFX_VERx10 >= 125
         SET(pse, ps_extra, EnablePSDependencyOnCPsizeChange);
#endif
         SET(pse, ps_extra, PixelShaderKillsPixel);
      }
   }

   if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_CLIP)) {
      anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_CLIP),
                           pipeline, partial.clip, clip) {
         SET(clip, clip, APIMode);
         SET(clip, clip, ViewportXYClipTestEnable);
         SET(clip, clip, TriangleStripListProvokingVertexSelect);
         SET(clip, clip, LineStripListProvokingVertexSelect);
         SET(clip, clip, TriangleFanProvokingVertexSelect);
         SET(clip, clip, MaximumVPIndex);
      }
   }

   if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_STREAMOUT)) {
      genX(streamout_prologue)(cmd_buffer);

      anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_STREAMOUT),
                           pipeline, partial.so, so) {
         SET(so, so, RenderingDisable);
         SET(so, so, RenderStreamSelect);
         SET(so, so, ReorderMode);
         SET(so, so, ForceRendering);
      }
   }

   if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_SF_CLIP)) {
      struct anv_state sf_clip_state =
         anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
                                            hw_state->vp_sf_clip.count * 64,
                                            64);

      for (uint32_t i = 0; i < hw_state->vp_sf_clip.count; i++) {
         struct GENX(SF_CLIP_VIEWPORT) sfv = {
            INIT(vp_sf_clip.elem[i], ViewportMatrixElementm00),
            INIT(vp_sf_clip.elem[i], ViewportMatrixElementm11),
            INIT(vp_sf_clip.elem[i], ViewportMatrixElementm22),
            INIT(vp_sf_clip.elem[i], ViewportMatrixElementm30),
            INIT(vp_sf_clip.elem[i], ViewportMatrixElementm31),
            INIT(vp_sf_clip.elem[i], ViewportMatrixElementm32),
            INIT(vp_sf_clip.elem[i], XMinClipGuardband),
            INIT(vp_sf_clip.elem[i], XMaxClipGuardband),
            INIT(vp_sf_clip.elem[i], YMinClipGuardband),
            INIT(vp_sf_clip.elem[i], YMaxClipGuardband),
            INIT(vp_sf_clip.elem[i], XMinViewPort),
            INIT(vp_sf_clip.elem[i], XMaxViewPort),
            INIT(vp_sf_clip.elem[i], YMinViewPort),
            INIT(vp_sf_clip.elem[i], YMaxViewPort),
         };
         GENX(SF_CLIP_VIEWPORT_pack)(NULL, sf_clip_state.map + i * 64, &sfv);
      }

      anv_batch_emit(&cmd_buffer->batch,
                     GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP), clip) {
         clip.SFClipViewportPointer = sf_clip_state.offset;
      }
   }

   /* Force CC_VIEWPORT reallocation on Gfx9 when reprogramming
    * 3DSTATE_VIEWPORT_STATE_POINTERS_CC:
    * https://gitlab.freedesktop.org/mesa/mesa/-/issues/11647
    */
   if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_CC) ||
       (GFX_VER == 9 &&
        BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_CC_PTR))) {
      hw_state->vp_cc.state =
         anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
                                            hw_state->vp_cc.count * 8, 32);

      for (uint32_t i = 0; i < hw_state->vp_cc.count; i++) {
         struct GENX(CC_VIEWPORT) cc_viewport = {
            INIT(vp_cc.elem[i], MinimumDepth),
            INIT(vp_cc.elem[i], MaximumDepth),
         };
         GENX(CC_VIEWPORT_pack)(NULL, hw_state->vp_cc.state.map + i * 8,
                                &cc_viewport);
      }

      /* Dirty the pointers to reemit 3DSTATE_VIEWPORT_STATE_POINTERS_CC
       * below.
       */
      BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_CC_PTR);
   }

   if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_CC_PTR)) {
      anv_batch_emit(&cmd_buffer->batch,
                     GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), cc) {
         cc.CCViewportPointer = hw_state->vp_cc.state.offset;
      }
      cmd_buffer->state.gfx.viewport_set = true;
   }

   if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SCISSOR)) {
      /* Wa_1409725701:
       *
       *    "The viewport-specific state used by the SF unit (SCISSOR_RECT)
       *     is stored as an array of up to 16 elements. The location of
       *     first element of the array, as specified by Pointer to
       *     SCISSOR_RECT, should be aligned to a 64-byte boundary."
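       *
       * The allocation below honors this by requesting 64-byte alignment
       * from the dynamic state heap; each SCISSOR_RECT entry itself is
       * GENX(SCISSOR_RECT_length) * 4 = 8 bytes (two dwords), which is why
       * the packing loop advances by i * 8.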
       */
      struct anv_state scissor_state =
         anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
                                            hw_state->scissor.count * 8, 64);

      for (uint32_t i = 0; i < hw_state->scissor.count; i++) {
         struct GENX(SCISSOR_RECT) scissor = {
            INIT(scissor.elem[i], ScissorRectangleYMin),
            INIT(scissor.elem[i], ScissorRectangleXMin),
            INIT(scissor.elem[i], ScissorRectangleYMax),
            INIT(scissor.elem[i], ScissorRectangleXMax),
         };
         GENX(SCISSOR_RECT_pack)(NULL, scissor_state.map + i * 8, &scissor);
      }

      anv_batch_emit(&cmd_buffer->batch,
                     GENX(3DSTATE_SCISSOR_STATE_POINTERS), ssp) {
         ssp.ScissorRectPointer = scissor_state.offset;
      }
   }

   if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VF_TOPOLOGY)) {
      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_TOPOLOGY), vft) {
         SET(vft, vft, PrimitiveTopologyType);
      }
   }

   if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VERTEX_INPUT)) {
      const uint32_t ve_count =
         pipeline->vs_input_elements + pipeline->svgs_count;
      const uint32_t num_dwords = 1 + 2 * MAX2(1, ve_count);
      uint32_t *p = anv_batch_emitn(&cmd_buffer->batch, num_dwords,
                                    GENX(3DSTATE_VERTEX_ELEMENTS));

      if (p) {
         if (ve_count == 0) {
            memcpy(p + 1, cmd_buffer->device->physical->empty_vs_input,
                   sizeof(cmd_buffer->device->physical->empty_vs_input));
         } else if (ve_count == pipeline->vertex_input_elems) {
            /* MESA_VK_DYNAMIC_VI is not dynamic for this pipeline, so
             * everything is in pipeline->vertex_input_data and we can just
             * memcpy.
             */
            memcpy(p + 1, pipeline->vertex_input_data, 4 * 2 * ve_count);
            anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline,
                                          final.vf_instancing);
         } else {
            assert(pipeline->final.vf_instancing.len == 0);
            /* Use dyn->vi to emit the dynamic VERTEX_ELEMENT_STATE input. */
            genX(emit_vertex_input)(&cmd_buffer->batch, p + 1,
                                    pipeline, dyn->vi,
                                    false /* emit_in_pipeline */);
            /* Then append the VERTEX_ELEMENT_STATE for the draw parameters */
            memcpy(p + 1 + 2 * pipeline->vs_input_elements,
                   pipeline->vertex_input_data,
                   4 * 2 * pipeline->vertex_input_elems);
         }
      }
   }

   if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_TE)) {
      anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_TE),
                           pipeline, partial.te, te) {
         SET(te, te, OutputTopology);
      }
   }

   if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_GS)) {
      anv_batch_emit_merge_protected(&cmd_buffer->batch, GENX(3DSTATE_GS),
                                     pipeline, partial.gs, gs, protected) {
         SET(gs, gs, ReorderMode);
      }
   }

   if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_CPS)) {
#if GFX_VER == 11
      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CPS), cps) {
         SET(cps, cps, CoarsePixelShadingMode);
         SET(cps, cps, MinCPSizeX);
         SET(cps, cps, MinCPSizeY);
      }
#elif GFX_VER >= 12
      /* TODO: we can optimize this flush in the following cases:
       *
       *    In the case where the last geometry shader emits a value that is
       *    not constant, we can avoid this stall because we can synchronize
       *    the pixel shader internally with
       *    3DSTATE_PS::EnablePSDependencyOnCPsizeChange.
       *
       *    If we know that the previous pipeline and the current one are
       *    using the same fragment shading rate.
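       *
       * For now we unconditionally emit the pixel shader dispatch stall
       * below (PSSStallSyncEnable on Gfx12.5+, PSDSyncEnable otherwise)
       * before switching the CPS state pointer, so the new coarse pixel
       * size cannot apply while earlier pixel shader work is in flight.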
       */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
#if GFX_VERx10 >= 125
         pc.PSSStallSyncEnable = true;
#else
         pc.PSDSyncEnable = true;
#endif
      }

      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CPS_POINTERS), cps) {
         SET(cps, cps, CoarsePixelShadingStateArrayPointer);
      }
#endif
   }

   if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SF)) {
      anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_SF),
                           pipeline, partial.sf, sf) {
         SET(sf, sf, LineWidth);
         SET(sf, sf, TriangleStripListProvokingVertexSelect);
         SET(sf, sf, LineStripListProvokingVertexSelect);
         SET(sf, sf, TriangleFanProvokingVertexSelect);
         SET(sf, sf, LegacyGlobalDepthBiasEnable);
      }
   }

   if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_RASTER)) {
      anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_RASTER),
                           pipeline, partial.raster, raster) {
         SET(raster, raster, APIMode);
         SET(raster, raster, DXMultisampleRasterizationEnable);
         SET(raster, raster, AntialiasingEnable);
         SET(raster, raster, CullMode);
         SET(raster, raster, FrontWinding);
         SET(raster, raster, GlobalDepthOffsetEnableSolid);
         SET(raster, raster, GlobalDepthOffsetEnableWireframe);
         SET(raster, raster, GlobalDepthOffsetEnablePoint);
         SET(raster, raster, GlobalDepthOffsetConstant);
         SET(raster, raster, GlobalDepthOffsetScale);
         SET(raster, raster, GlobalDepthOffsetClamp);
         SET(raster, raster, FrontFaceFillMode);
         SET(raster, raster, BackFaceFillMode);
         SET(raster, raster, ViewportZFarClipTestEnable);
         SET(raster, raster, ViewportZNearClipTestEnable);
         SET(raster, raster, ConservativeRasterizationEnable);
      }
   }

   if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_MULTISAMPLE)) {
      anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_MULTISAMPLE),
                           pipeline, partial.ms, ms) {
         SET(ms, ms, NumberofMultisamples);
      }
   }

   if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_CC_STATE)) {
      hw_state->cc.state =
         anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
                                            GENX(COLOR_CALC_STATE_length) * 4,
                                            64);
      struct GENX(COLOR_CALC_STATE) cc = {
         INIT(cc, BlendConstantColorRed),
         INIT(cc, BlendConstantColorGreen),
         INIT(cc, BlendConstantColorBlue),
         INIT(cc, BlendConstantColorAlpha),
      };
      GENX(COLOR_CALC_STATE_pack)(NULL, hw_state->cc.state.map, &cc);

      /* Dirty the pointers to reemit 3DSTATE_CC_STATE_POINTERS below. */
      BITSET_SET(hw_state->dirty, ANV_GFX_STATE_CC_STATE_PTR);
   }

   if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_CC_STATE_PTR)) {
      anv_batch_emit(&cmd_buffer->batch,
                     GENX(3DSTATE_CC_STATE_POINTERS), ccp) {
         ccp.ColorCalcStatePointer = hw_state->cc.state.offset;
         ccp.ColorCalcStatePointerValid = true;
      }
   }

   if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SAMPLE_MASK)) {
      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_SAMPLE_MASK), sm) {
         SET(sm, sm, SampleMask);
      }
   }

   if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_WM_DEPTH_STENCIL)) {
      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_WM_DEPTH_STENCIL), ds) {
         SET(ds, ds, DoubleSidedStencilEnable);
         SET(ds, ds, StencilTestMask);
         SET(ds, ds, StencilWriteMask);
         SET(ds, ds, BackfaceStencilTestMask);
         SET(ds, ds, BackfaceStencilWriteMask);
         SET(ds, ds, StencilReferenceValue);
         SET(ds, ds, BackfaceStencilReferenceValue);
         SET(ds, ds, DepthTestEnable);
         SET(ds, ds, DepthBufferWriteEnable);
         SET(ds, ds, DepthTestFunction);
         SET(ds, ds, StencilTestEnable);
         SET(ds, ds, StencilBufferWriteEnable);
         SET(ds, ds, StencilFailOp);
         SET(ds, ds, StencilPassDepthPassOp);
         SET(ds, ds, StencilPassDepthFailOp);
         SET(ds, ds, StencilTestFunction);
         SET(ds, ds, BackfaceStencilFailOp);
         SET(ds, ds, BackfaceStencilPassDepthPassOp);
         SET(ds, ds, BackfaceStencilPassDepthFailOp);
         SET(ds, ds, BackfaceStencilTestFunction);
      }
   }

#if GFX_VER >= 12
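   /* The 3DSTATE_DEPTH_BOUNDS instruction only exists on Gfx12+, hence the
    * guard above; earlier gens have no dynamic depth bounds state to emit
    * here.
    */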
   if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_DEPTH_BOUNDS)) {
      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DEPTH_BOUNDS), db) {
         SET(db, db, DepthBoundsTestEnable);
         SET(db, db, DepthBoundsTestMinValue);
         SET(db, db, DepthBoundsTestMaxValue);
      }
   }
#endif

   if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_LINE_STIPPLE)) {
      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_LINE_STIPPLE), ls) {
         SET(ls, ls, LineStipplePattern);
         SET(ls, ls, LineStippleInverseRepeatCount);
         SET(ls, ls, LineStippleRepeatCount);
      }
#if GFX_VER >= 11
      /* ICL PRMs, Volume 2a - Command Reference: Instructions,
       * 3DSTATE_LINE_STIPPLE:
       *
       *    "Workaround: This command must be followed by a PIPE_CONTROL
       *     with CS Stall bit set."
       */
      genx_batch_emit_pipe_control(&cmd_buffer->batch,
                                   cmd_buffer->device->info,
                                   cmd_buffer->state.current_pipeline,
                                   ANV_PIPE_CS_STALL_BIT);
#endif
   }

   if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VF)) {
      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF), vf) {
#if GFX_VERx10 >= 125
         vf.GeometryDistributionEnable = true;
#endif
         SET(vf, vf, IndexedDrawCutIndexEnable);
         SET(vf, vf, CutIndex);
      }
   }

   if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_INDEX_BUFFER)) {
      struct anv_buffer *buffer = gfx->index_buffer;
      uint32_t offset = gfx->index_offset;
      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_INDEX_BUFFER), ib) {
         ib.IndexFormat = gfx->index_type;
         ib.MOCS = anv_mocs(cmd_buffer->device,
                            buffer ? buffer->address.bo : NULL,
                            ISL_SURF_USAGE_INDEX_BUFFER_BIT);
#if GFX_VER >= 12
         ib.L3BypassDisable = true;
#endif
         if (buffer) {
            ib.BufferStartingAddress = anv_address_add(buffer->address,
                                                       offset);
            ib.BufferSize = gfx->index_size;
         }
      }
   }

#if GFX_VERx10 >= 125
   if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VFG)) {
      anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_VFG),
                           pipeline, partial.vfg, vfg) {
         SET(vfg, vfg, ListCutIndexEnable);
      }
   }
#endif

   if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SAMPLE_PATTERN)) {
      genX(emit_sample_pattern)(&cmd_buffer->batch,
                                dyn->ms.sample_locations_enable ?
                                dyn->ms.sample_locations : NULL);
   }

   if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_WM)) {
      anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_WM),
                           pipeline, partial.wm, wm) {
         SET(wm, wm, LineStippleEnable);
         SET(wm, wm, BarycentricInterpolationMode);
      }
   }

   if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_PS_BLEND)) {
      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_PS_BLEND), blend) {
         SET(blend, ps_blend, HasWriteableRT);
         SET(blend, ps_blend, ColorBufferBlendEnable);
         SET(blend, ps_blend, SourceAlphaBlendFactor);
         SET(blend, ps_blend, DestinationAlphaBlendFactor);
         SET(blend, ps_blend, SourceBlendFactor);
         SET(blend, ps_blend, DestinationBlendFactor);
         SET(blend, ps_blend, AlphaTestEnable);
         SET(blend, ps_blend, IndependentAlphaBlendEnable);
         SET(blend, ps_blend, AlphaToCoverageEnable);
      }
   }

   if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_BLEND_STATE)) {
      const uint32_t num_dwords = GENX(BLEND_STATE_length) +
         GENX(BLEND_STATE_ENTRY_length) * MAX_RTS;
      hw_state->blend.state =
         anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
                                            num_dwords * 4, 64);

      uint32_t *dws = hw_state->blend.state.map;

      struct GENX(BLEND_STATE) blend_state = {
         INIT(blend, AlphaToCoverageEnable),
         INIT(blend, AlphaToOneEnable),
         INIT(blend, IndependentAlphaBlendEnable),
         INIT(blend, ColorDitherEnable),
      };
      GENX(BLEND_STATE_pack)(NULL, dws, &blend_state);

      /* Jump to blend entries.
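       *
       * The buffer packed above holds the GENX(BLEND_STATE) header dwords
       * immediately followed by one GENX(BLEND_STATE_ENTRY) per render
       * target, so we advance by GENX(BLEND_STATE_length) here and by
       * GENX(BLEND_STATE_ENTRY_length) after each entry below.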
       */
      dws += GENX(BLEND_STATE_length);

      for (uint32_t i = 0; i < MAX_RTS; i++) {
         struct GENX(BLEND_STATE_ENTRY) entry = {
            INIT(blend.rts[i], WriteDisableAlpha),
            INIT(blend.rts[i], WriteDisableRed),
            INIT(blend.rts[i], WriteDisableGreen),
            INIT(blend.rts[i], WriteDisableBlue),
            INIT(blend.rts[i], LogicOpFunction),
            INIT(blend.rts[i], LogicOpEnable),
            INIT(blend.rts[i], ColorBufferBlendEnable),
            INIT(blend.rts[i], ColorClampRange),
            INIT(blend.rts[i], PreBlendColorClampEnable),
            INIT(blend.rts[i], PostBlendColorClampEnable),
            INIT(blend.rts[i], SourceBlendFactor),
            INIT(blend.rts[i], DestinationBlendFactor),
            INIT(blend.rts[i], ColorBlendFunction),
            INIT(blend.rts[i], SourceAlphaBlendFactor),
            INIT(blend.rts[i], DestinationAlphaBlendFactor),
            INIT(blend.rts[i], AlphaBlendFunction),
         };
         GENX(BLEND_STATE_ENTRY_pack)(NULL, dws, &entry);
         dws += GENX(BLEND_STATE_ENTRY_length);
      }

      /* Dirty the pointers to reemit 3DSTATE_BLEND_STATE_POINTERS below. */
      BITSET_SET(hw_state->dirty, ANV_GFX_STATE_BLEND_STATE_PTR);
   }

   if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_BLEND_STATE_PTR)) {
      anv_batch_emit(&cmd_buffer->batch,
                     GENX(3DSTATE_BLEND_STATE_POINTERS), bsp) {
         bsp.BlendStatePointer = hw_state->blend.state.offset;
         bsp.BlendStatePointerValid = true;
      }
   }

#if INTEL_WA_18019816803_GFX_VER
   if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_WA_18019816803)) {
      genx_batch_emit_pipe_control(&cmd_buffer->batch,
                                   cmd_buffer->device->info,
                                   cmd_buffer->state.current_pipeline,
                                   ANV_PIPE_PSS_STALL_SYNC_BIT);
   }
#endif

#if INTEL_WA_14018283232_GFX_VER
   if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_WA_14018283232))
      genX(batch_emit_wa_14018283232)(&cmd_buffer->batch);
#endif

#if GFX_VER == 9
   if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_PMA_FIX))
      genX(cmd_buffer_enable_pma_fix)(cmd_buffer, hw_state->pma_fix);
#endif

#if GFX_VERx10 >= 125
   if (hw_state->use_tbimr &&
       BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_TBIMR_TILE_PASS_INFO)) {
      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_TBIMR_TILE_PASS_INFO),
                     tbimr) {
         SET(tbimr, tbimr, TileRectangleHeight);
         SET(tbimr, tbimr, TileRectangleWidth);
         SET(tbimr, tbimr, VerticalTileCount);
         SET(tbimr, tbimr, HorizontalTileCount);
         SET(tbimr, tbimr, TBIMRBatchSize);
         SET(tbimr, tbimr, TileBoxCheck);
      }
   }
#endif

#undef INIT
#undef SET

   BITSET_ZERO(hw_state->dirty);
}

/**
 * This function handles possible state workarounds and emits the dirty
 * instructions to the batch buffer.
 */
void
genX(cmd_buffer_flush_gfx_hw_state)(struct anv_cmd_buffer *cmd_buffer)
{
   struct anv_device *device = cmd_buffer->device;
   struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
   struct anv_graphics_pipeline *pipeline =
      anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
   struct anv_gfx_dynamic_state *hw_state = &gfx->dyn_state;

   if (INTEL_DEBUG(DEBUG_REEMIT)) {
      BITSET_OR(gfx->dyn_state.dirty, gfx->dyn_state.dirty,
                device->gfx_dirty_state);
   }

   /**
    * Put potential workarounds here if you need to reemit an instruction
    * because another one is changing.
    */

   /* Since Wa_16011773973 will disable 3DSTATE_STREAMOUT, we need to reemit
    * it afterwards.
    */
   if (intel_needs_workaround(device->info, 16011773973) &&
       pipeline->uses_xfb &&
       BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SO_DECL_LIST)) {
      BITSET_SET(hw_state->dirty, ANV_GFX_STATE_STREAMOUT);
   }

   /* Gfx11 undocumented issue:
    * https://gitlab.freedesktop.org/mesa/mesa/-/issues/9781
    */
#if GFX_VER == 11
   if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_BLEND_STATE))
      BITSET_SET(hw_state->dirty, ANV_GFX_STATE_MULTISAMPLE);
#endif

   /* Wa_18020335297 - Apply the WA when viewport ptr is reprogrammed.
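    *
    * The WA itself is implemented by emit_wa_18020335297_dummy_draw()
    * above: it programs a minimal pipeline with clipping set to
    * CLIPMODE_REJECT_ALL and issues one dummy 3-vertex draw per slice.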
    */
   if (intel_needs_workaround(device->info, 18020335297) &&
       (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_CC) ||
        BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_CC_PTR)) &&
       cmd_buffer->state.gfx.viewport_set) {
      /* For mesh, we implement the WA using a CS stall. This is for
       * simplicity and takes care of possible interaction with
       * Wa_16014390852.
       */
      if (anv_pipeline_is_mesh(pipeline)) {
         genx_batch_emit_pipe_control(&cmd_buffer->batch, device->info, _3D,
                                      ANV_PIPE_CS_STALL_BIT);
      } else {
         /* Mask off all instructions that we program. */
         BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_VFG);
         BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_VF);
         BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_PRIMITIVE_REPLICATION);
         BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_RASTER);
         BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_VF_STATISTICS);
         BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_VF_SGVS);
         BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_VF_SGVS_2);
         BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_CLIP);
         BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_STREAMOUT);
         BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_VERTEX_INPUT);
         BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_VF_TOPOLOGY);
         BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_VS);
         BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_GS);
         BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_HS);
         BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_TE);
         BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_DS);

         cmd_buffer_gfx_state_emission(cmd_buffer);

         emit_wa_18020335297_dummy_draw(cmd_buffer);

         /* Dirty all emitted WA state to make sure that the current real
          * state is restored.
          */
         BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VFG);
         BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF);
         BITSET_SET(hw_state->dirty, ANV_GFX_STATE_PRIMITIVE_REPLICATION);
         BITSET_SET(hw_state->dirty, ANV_GFX_STATE_RASTER);
         BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_STATISTICS);
         BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_SGVS);
         BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_SGVS_2);
         BITSET_SET(hw_state->dirty, ANV_GFX_STATE_CLIP);
         BITSET_SET(hw_state->dirty, ANV_GFX_STATE_STREAMOUT);
         BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VERTEX_INPUT);
         BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_TOPOLOGY);
         BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VS);
         BITSET_SET(hw_state->dirty, ANV_GFX_STATE_GS);
         BITSET_SET(hw_state->dirty, ANV_GFX_STATE_HS);
         BITSET_SET(hw_state->dirty, ANV_GFX_STATE_TE);
         BITSET_SET(hw_state->dirty, ANV_GFX_STATE_DS);
      }
   }

   cmd_buffer_gfx_state_emission(cmd_buffer);
}

void
genX(cmd_buffer_enable_pma_fix)(struct anv_cmd_buffer *cmd_buffer,
                                bool enable)
{
   if (!anv_cmd_buffer_is_render_queue(cmd_buffer))
      return;

   if (cmd_buffer->state.pma_fix_enabled == enable)
      return;

   cmd_buffer->state.pma_fix_enabled = enable;

   /* According to the Broadwell PIPE_CONTROL documentation, software should
    * emit a PIPE_CONTROL with the CS Stall and Depth Cache Flush bits set
    * prior to the LRI. If stencil buffer writes are enabled, then a Render
    * Cache Flush is also necessary.
    *
    * The Skylake docs say to use a depth stall rather than a command
    * streamer stall. However, the hardware seems to violently disagree.
    * A full command streamer stall seems to be needed in both cases.
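    *
    * The resulting sequence below is: a PIPE_CONTROL (CS stall plus cache
    * flushes), an MI_LOAD_REGISTER_IMM to CACHE_MODE_0 on Gfx9, then a
    * second PIPE_CONTROL (depth stall plus cache flushes).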
    */
   genx_batch_emit_pipe_control
      (&cmd_buffer->batch, cmd_buffer->device->info,
       cmd_buffer->state.current_pipeline,
       ANV_PIPE_DEPTH_CACHE_FLUSH_BIT |
       ANV_PIPE_CS_STALL_BIT |
#if GFX_VER >= 12
       ANV_PIPE_TILE_CACHE_FLUSH_BIT |
#endif
       ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT);

#if GFX_VER == 9
   uint32_t cache_mode;
   anv_pack_struct(&cache_mode, GENX(CACHE_MODE_0),
                   .STCPMAOptimizationEnable = enable,
                   .STCPMAOptimizationEnableMask = true);
   anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
      lri.RegisterOffset = GENX(CACHE_MODE_0_num);
      lri.DataDWord = cache_mode;
   }
#endif /* GFX_VER == 9 */

   /* After the LRI, a PIPE_CONTROL with both the Depth Stall and Depth Cache
    * Flush bits is often necessary. We do it regardless because it's easier.
    * The render cache flush is also necessary if stencil writes are enabled.
    *
    * Again, the Skylake docs give a different set of flushes but the BDW
    * flushes seem to work just as well.
    */
   genx_batch_emit_pipe_control
      (&cmd_buffer->batch, cmd_buffer->device->info,
       cmd_buffer->state.current_pipeline,
       ANV_PIPE_DEPTH_STALL_BIT |
       ANV_PIPE_DEPTH_CACHE_FLUSH_BIT |
#if GFX_VER >= 12
       ANV_PIPE_TILE_CACHE_FLUSH_BIT |
#endif
       ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT);
}