/*
 * Copyright © 2022 Collabora Ltd. and Red Hat Inc.
 * SPDX-License-Identifier: MIT
 */
#include "nvk_buffer.h"
#include "nvk_entrypoints.h"
#include "nvk_cmd_buffer.h"
#include "nvk_device.h"
#include "nvk_format.h"
#include "nvk_image.h"
#include "nvk_image_view.h"
#include "nvk_mme.h"
#include "nvk_physical_device.h"
#include "nvk_shader.h"

#include "util/bitpack_helpers.h"
#include "vk_format.h"
#include "vk_render_pass.h"
#include "vk_standard_sample_locations.h"

#include "nv_push_cl902d.h"
#include "nv_push_cl9097.h"
#include "nv_push_cl90b5.h"
#include "nv_push_cl90c0.h"
#include "nv_push_cla097.h"
#include "nv_push_clb097.h"
#include "nv_push_clb197.h"
#include "nv_push_clc397.h"
#include "nv_push_clc597.h"
#include "drf.h"

static inline uint16_t
nvk_cmd_buffer_3d_cls(struct nvk_cmd_buffer *cmd)
{
   struct nvk_device *dev = nvk_cmd_buffer_device(cmd);
   struct nvk_physical_device *pdev = nvk_device_physical(dev);
   return pdev->info.cls_eng3d;
}

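/* Write a privileged (PRI) GPU register from an MME macro.
 *
 * After a wait-for-idle, this stores { done = 0, value, mask } in the
 * FALCON_0..FALCON_2 shadow scratch registers and then writes the register
 * offset to SET_FALCON04, which (on firmware that supports this interface)
 * performs the masked register write.  The firmware is expected to set
 * FALCON_0 to 1 once it has finished, so we spin on that scratch register,
 * issuing NO_OPERATION methods while we wait.
 */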
static void
mme_set_priv_reg(struct mme_builder *b,
                 struct mme_value value,
                 struct mme_value mask,
                 struct mme_value reg)
{
   mme_mthd(b, NV9097_WAIT_FOR_IDLE);
   mme_emit(b, mme_zero());

   mme_mthd(b, NVK_SET_MME_SCRATCH(FALCON_0));
   mme_emit(b, mme_zero());
   mme_emit(b, value);
   mme_emit(b, mask);

   mme_mthd(b, NV9097_SET_FALCON04);
   mme_emit(b, reg);

   struct mme_value loop_cond = mme_mov(b, mme_zero());
   mme_while(b, ine, loop_cond, mme_imm(1)) {
      mme_state_to(b, loop_cond, NVK_SET_MME_SCRATCH(FALCON_0));
      mme_mthd(b, NV9097_NO_OPERATION);
      mme_emit(b, mme_zero());
   };
}

void
nvk_mme_set_priv_reg(struct mme_builder *b)
{
   struct mme_value value = mme_load(b);
   struct mme_value mask = mme_load(b);
   struct mme_value reg = mme_load(b);

   mme_set_priv_reg(b, value, mask, reg);
}

void
nvk_mme_set_conservative_raster_state(struct mme_builder *b)
{
   struct mme_value new_state = mme_load(b);
   struct mme_value old_state =
      nvk_mme_load_scratch(b, CONSERVATIVE_RASTER_STATE);

   mme_if(b, ine, new_state, old_state) {
      nvk_mme_store_scratch(b, CONSERVATIVE_RASTER_STATE, new_state);
      mme_set_priv_reg(b, new_state, mme_imm(BITFIELD_RANGE(23, 2)),
                       mme_imm(0x418800));
   }
}

#define NVK_DRAW_CB0_SIZE sizeof(struct nvk_root_descriptor_table)

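/* Points SET_CONSTANT_BUFFER_SELECTOR_A..C at the root descriptor table
 * (cb0).  The table's GPU address is stashed in the CB0_ADDR_HI/LO shadow
 * scratch registers by nvk_push_draw_state_init() so any macro can re-select
 * cb0 without knowing the address.
 */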
void
nvk_mme_select_cb0(struct mme_builder *b)
{
   struct mme_value addr_hi = nvk_mme_load_scratch(b, CB0_ADDR_HI);
   struct mme_value addr_lo = nvk_mme_load_scratch(b, CB0_ADDR_LO);

   mme_mthd(b, NV9097_SET_CONSTANT_BUFFER_SELECTOR_A);
   mme_emit(b, mme_imm(NVK_DRAW_CB0_SIZE));
   mme_emit(b, addr_hi);
   mme_emit(b, addr_lo);
}

static uint32_t nvk_mme_anti_alias_init(void);

VkResult
nvk_push_draw_state_init(struct nvk_queue *queue, struct nv_push *p)
{
   struct nvk_device *dev = nvk_queue_device(queue);
   struct nvk_physical_device *pdev = nvk_device_physical(dev);

   /* 3D state */
   P_MTHD(p, NV9097, SET_OBJECT);
   P_NV9097_SET_OBJECT(p, {
      .class_id = pdev->info.cls_eng3d,
      .engine_id = 0,
   });

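   /* Upload all of our MME macros into MME instruction RAM.  Each macro gets
    * a start-address RAM entry pointing at its first dword; the instructions
    * themselves are packed back to back, with mme_pos tracking the next free
    * slot in instruction RAM.
    */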
   for (uint32_t mme = 0, mme_pos = 0; mme < NVK_MME_COUNT; mme++) {
      size_t size;
      uint32_t *dw = nvk_build_mme(&pdev->info, mme, &size);
      if (dw == NULL)
         return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);

      assert(size % sizeof(uint32_t) == 0);
      const uint32_t num_dw = size / sizeof(uint32_t);

      P_MTHD(p, NV9097, LOAD_MME_START_ADDRESS_RAM_POINTER);
      P_NV9097_LOAD_MME_START_ADDRESS_RAM_POINTER(p, mme);
      P_NV9097_LOAD_MME_START_ADDRESS_RAM(p, mme_pos);

      P_1INC(p, NV9097, LOAD_MME_INSTRUCTION_RAM_POINTER);
      P_NV9097_LOAD_MME_INSTRUCTION_RAM_POINTER(p, mme_pos);
      P_INLINE_ARRAY(p, dw, num_dw);

      mme_pos += num_dw;

      free(dw);
   }

   if (pdev->info.cls_eng3d >= TURING_A)
      P_IMMD(p, NVC597, SET_MME_DATA_FIFO_CONFIG, FIFO_SIZE_SIZE_4KB);

   /* Enable FP helper invocation memory loads
    *
    * For generations with firmware support for our `SET_PRIV_REG` mme method,
    * we simply use that. On older generations we'll let the kernel do it.
    * Starting with GSP we have to do it via the firmware anyway.
    *
    * This clears bit 3 of gr_gpcs_tpcs_sm_disp_ctrl
    *
    * Without it,
    * dEQP-VK.subgroups.vote.frag_helper.subgroupallequal_bvec2_fragment will
    * occasionally fail.
    */
   if (pdev->info.cls_eng3d >= MAXWELL_B) {
      unsigned reg = pdev->info.cls_eng3d >= VOLTA_A ? 0x419ba4 : 0x419f78;
      P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_SET_PRIV_REG));
      P_INLINE_DATA(p, 0);
      P_INLINE_DATA(p, BITFIELD_BIT(3));
      P_INLINE_DATA(p, reg);
   }

   /* Disable Out Of Range Address exceptions
    *
    * From the SPH documentation:
    *
    *    "The SPH fields StoreReqStart and StoreReqEnd set a range of
    *    attributes whose corresponding Odmap values of ST or ST_LAST are
    *    treated as ST_REQ. Normally, for an attribute whose Omap bit is TRUE
    *    and Odmap value is ST, when the shader writes data to this output, it
    *    can not count on being able to read it back, since the next
    *    downstream shader might have its Imap bit FALSE, thereby causing the
    *    Bmap bit to be FALSE. By including a ST type of attribute in the
    *    range of StoreReqStart and StoreReqEnd, the attribute’s Odmap value
    *    is treated as ST_REQ, so an Omap bit being TRUE causes the Bmap bit
    *    to be TRUE. This guarantees the shader program can output the value
    *    and then read it back later. This will save register space."
    *
    * It's unclear exactly what's going on but this seems to imply that the
    * hardware actually ANDs the output mask of one shader stage together with
    * the input mask of the subsequent shader stage to determine which values
    * are actually used.
    *
    * In the case where we have an empty fragment shader, it seems the
    * hardware doesn't allocate any output memory for the final geometry stage
    * at all, so any writes to outputs from the final shader stage generate an
    * Out Of Range Address exception.  We could fix this by eliminating unused
    * outputs via cross-stage linking but that won't work in the case of
    * VK_EXT_shader_object and VK_EXT_graphics_pipeline_library fast-link.
    * Instead, the easiest solution is to just disable the exception.
    *
    * NOTE (Faith):
    *
    *    The above analysis is 100% conjecture on my part based on a creative
    *    reading of the SPH docs and what I saw when trying to run certain
    *    OpenGL CTS tests on NVK + Zink.  Without access to NVIDIA HW
    *    engineers, I have no way of verifying this analysis.
    *
    *    The CTS test in question is:
    *
    *    KHR-GL46.tessellation_shader.tessellation_control_to_tessellation_evaluation.gl_tessLevel
    *
    * This should also prevent any issues with array overruns on I/O arrays.
    * Before, they would get an exception and kill the context, whereas now
    * they should gently get ignored.
    *
    * This clears bit 14 of gr_gpcs_tpcs_sms_hww_warp_esr_report_mask
    */
   if (pdev->info.cls_eng3d >= MAXWELL_B) {
      unsigned reg = pdev->info.cls_eng3d >= VOLTA_A ? 0x419ea8 : 0x419e44;
      P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_SET_PRIV_REG));
      P_INLINE_DATA(p, 0);
      P_INLINE_DATA(p, BITFIELD_BIT(14));
      P_INLINE_DATA(p, reg);
   }

   /* Set CONSERVATIVE_RASTER_STATE to an invalid value, to ensure the
    * hardware reg is always set the first time conservative rasterization
    * is enabled.
    */
   P_IMMD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_CONSERVATIVE_RASTER_STATE),
                     ~0);

   /* Initialize tessellation parameters */
   P_IMMD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_TESS_PARAMS), 0);
   P_IMMD(p, NV9097, SET_TESSELLATION_PARAMETERS, {});

   P_IMMD(p, NV9097, SET_RENDER_ENABLE_C, MODE_TRUE);

   P_IMMD(p, NV9097, SET_Z_COMPRESSION, ENABLE_TRUE);
   P_MTHD(p, NV9097, SET_COLOR_COMPRESSION(0));
   for (unsigned i = 0; i < 8; i++)
      P_NV9097_SET_COLOR_COMPRESSION(p, i, ENABLE_TRUE);

   P_IMMD(p, NV9097, SET_CT_SELECT, { .target_count = 1 });

//   P_MTHD(cmd->push, NVC0_3D, CSAA_ENABLE);
//   P_INLINE_DATA(cmd->push, 0);

   P_IMMD(p, NV9097, SET_ALIASED_LINE_WIDTH_ENABLE, V_TRUE);

   P_IMMD(p, NV9097, SET_DA_PRIMITIVE_RESTART_VERTEX_ARRAY, ENABLE_FALSE);

   P_IMMD(p, NV9097, SET_BLEND_SEPARATE_FOR_ALPHA, ENABLE_TRUE);
   P_IMMD(p, NV9097, SET_SINGLE_CT_WRITE_CONTROL, ENABLE_TRUE);
   P_IMMD(p, NV9097, SET_SINGLE_ROP_CONTROL, ENABLE_FALSE);
   P_IMMD(p, NV9097, SET_TWO_SIDED_STENCIL_TEST, ENABLE_TRUE);

   P_IMMD(p, NV9097, SET_SHADE_MODE, V_OGL_SMOOTH);

   P_IMMD(p, NV9097, SET_API_VISIBLE_CALL_LIMIT, V__128);

   P_IMMD(p, NV9097, SET_ZCULL_STATS, ENABLE_TRUE);

   P_IMMD(p, NV9097, SET_L1_CONFIGURATION,
                     DIRECTLY_ADDRESSABLE_MEMORY_SIZE_48KB);

   P_IMMD(p, NV9097, SET_REDUCE_COLOR_THRESHOLDS_ENABLE, V_FALSE);
   P_IMMD(p, NV9097, SET_REDUCE_COLOR_THRESHOLDS_UNORM8, {
      .all_covered_all_hit_once = 0xff,
   });
   P_MTHD(p, NV9097, SET_REDUCE_COLOR_THRESHOLDS_UNORM10);
   P_NV9097_SET_REDUCE_COLOR_THRESHOLDS_UNORM10(p, {
      .all_covered_all_hit_once = 0xff,
   });
   P_NV9097_SET_REDUCE_COLOR_THRESHOLDS_UNORM16(p, {
      .all_covered_all_hit_once = 0xff,
   });
   P_NV9097_SET_REDUCE_COLOR_THRESHOLDS_FP11(p, {
      .all_covered_all_hit_once = 0x3f,
   });
   P_NV9097_SET_REDUCE_COLOR_THRESHOLDS_FP16(p, {
      .all_covered_all_hit_once = 0xff,
   });
   P_NV9097_SET_REDUCE_COLOR_THRESHOLDS_SRGB8(p, {
      .all_covered_all_hit_once = 0xff,
   });

   if (pdev->info.cls_eng3d < VOLTA_A)
      P_IMMD(p, NV9097, SET_ALPHA_FRACTION, 0x3f);

   P_IMMD(p, NV9097, CHECK_SPH_VERSION, {
      .current = 3,
      .oldest_supported = 3,
   });
   P_IMMD(p, NV9097, CHECK_AAM_VERSION, {
      .current = 2,
      .oldest_supported = 2,
   });

   if (pdev->info.cls_eng3d < MAXWELL_A)
      P_IMMD(p, NV9097, SET_SHADER_SCHEDULING, MODE_OLDEST_THREAD_FIRST);

   P_IMMD(p, NV9097, SET_L2_CACHE_CONTROL_FOR_ROP_PREFETCH_READ_REQUESTS,
                     POLICY_EVICT_NORMAL);
   P_IMMD(p, NV9097, SET_L2_CACHE_CONTROL_FOR_ROP_NONINTERLOCKED_READ_REQUESTS,
                     POLICY_EVICT_NORMAL);
   P_IMMD(p, NV9097, SET_L2_CACHE_CONTROL_FOR_ROP_INTERLOCKED_READ_REQUESTS,
                     POLICY_EVICT_NORMAL);
   P_IMMD(p, NV9097, SET_L2_CACHE_CONTROL_FOR_ROP_NONINTERLOCKED_WRITE_REQUESTS,
                     POLICY_EVICT_NORMAL);
   P_IMMD(p, NV9097, SET_L2_CACHE_CONTROL_FOR_ROP_INTERLOCKED_WRITE_REQUESTS,
                     POLICY_EVICT_NORMAL);

   P_IMMD(p, NV9097, SET_BLEND_PER_FORMAT_ENABLE, SNORM8_UNORM16_SNORM16_TRUE);

   P_IMMD(p, NV9097, SET_ATTRIBUTE_DEFAULT, {
      .color_front_diffuse    = COLOR_FRONT_DIFFUSE_VECTOR_0001,
      .color_front_specular   = COLOR_FRONT_SPECULAR_VECTOR_0001,
      .generic_vector         = GENERIC_VECTOR_VECTOR_0001,
      .fixed_fnc_texture      = FIXED_FNC_TEXTURE_VECTOR_0001,
      .dx9_color0             = DX9_COLOR0_VECTOR_0001,
      .dx9_color1_to_color15  = DX9_COLOR1_TO_COLOR15_VECTOR_0000,
   });

   P_IMMD(p, NV9097, SET_DA_OUTPUT, VERTEX_ID_USES_ARRAY_START_TRUE);

   P_IMMD(p, NV9097, SET_RENDER_ENABLE_CONTROL,
                     CONDITIONAL_LOAD_CONSTANT_BUFFER_FALSE);

   P_IMMD(p, NV9097, SET_PS_OUTPUT_SAMPLE_MASK_USAGE, {
      .enable                       = ENABLE_TRUE,
      .qualify_by_anti_alias_enable = QUALIFY_BY_ANTI_ALIAS_ENABLE_ENABLE,
   });

   if (pdev->info.cls_eng3d < VOLTA_A)
      P_IMMD(p, NV9097, SET_PRIM_CIRCULAR_BUFFER_THROTTLE, 0x3fffff);

   P_IMMD(p, NV9097, SET_BLEND_OPT_CONTROL, ALLOW_FLOAT_PIXEL_KILLS_TRUE);
   P_IMMD(p, NV9097, SET_BLEND_FLOAT_OPTION, ZERO_TIMES_ANYTHING_IS_ZERO_TRUE);
   P_IMMD(p, NV9097, SET_BLEND_STATE_PER_TARGET, ENABLE_TRUE);

   if (pdev->info.cls_eng3d < MAXWELL_A)
      P_IMMD(p, NV9097, SET_MAX_TI_WARPS_PER_BATCH, 3);

   if (pdev->info.cls_eng3d >= KEPLER_A &&
       pdev->info.cls_eng3d < MAXWELL_A) {
      P_IMMD(p, NVA097, SET_TEXTURE_INSTRUCTION_OPERAND,
                        ORDERING_KEPLER_ORDER);
   }

   P_IMMD(p, NV9097, SET_ALPHA_TEST, ENABLE_FALSE);
   P_IMMD(p, NV9097, SET_TWO_SIDED_LIGHT, ENABLE_FALSE);
   P_IMMD(p, NV9097, SET_COLOR_CLAMP, ENABLE_TRUE);
   P_IMMD(p, NV9097, SET_PS_SATURATE, {
      .output0 = OUTPUT0_FALSE,
      .output1 = OUTPUT1_FALSE,
      .output2 = OUTPUT2_FALSE,
      .output3 = OUTPUT3_FALSE,
      .output4 = OUTPUT4_FALSE,
      .output5 = OUTPUT5_FALSE,
      .output6 = OUTPUT6_FALSE,
      .output7 = OUTPUT7_FALSE,
   });

   P_IMMD(p, NV9097, SET_POINT_SIZE, fui(1.0));
   P_IMMD(p, NV9097, SET_ATTRIBUTE_POINT_SIZE, { .enable = ENABLE_TRUE });

   /* From the Vulkan spec's point rasterization:
    *
    * "Point rasterization produces a fragment for each fragment area group of
    * framebuffer pixels with one or more sample points that intersect a region
    * centered at the point’s (xf,yf).
    * This region is a square with side equal to the current point size.
    * ... (xf,yf) is the exact, unrounded framebuffer coordinate of the vertex
    * for the point"
    *
    * So it seems we always need square points with PointCoords like OpenGL
    * point sprites.
    *
    * From the OpenGL compatibility spec, basic point rasterization:
    * "If point sprites are enabled, then point rasterization produces a
    * fragment for each framebuffer pixel whose center lies inside a square
    * centered at the point’s (xw, yw), with side length equal to the current
    * point size.
    * ... and xw and yw are the exact, unrounded window coordinates of the
    * vertex for the point"
    *
    * And point multisample rasterization:
    * "This region is a circle having diameter equal to the current point width
    * if POINT_SPRITE is disabled, or a square with side equal to the current
    * point width if POINT_SPRITE is enabled."
    */
   P_IMMD(p, NV9097, SET_POINT_SPRITE, ENABLE_TRUE);
   P_IMMD(p, NV9097, SET_POINT_SPRITE_SELECT, {
      .rmode      = RMODE_ZERO,
      .origin     = ORIGIN_TOP,
      .texture0   = TEXTURE0_PASSTHROUGH,
      .texture1   = TEXTURE1_PASSTHROUGH,
      .texture2   = TEXTURE2_PASSTHROUGH,
      .texture3   = TEXTURE3_PASSTHROUGH,
      .texture4   = TEXTURE4_PASSTHROUGH,
      .texture5   = TEXTURE5_PASSTHROUGH,
      .texture6   = TEXTURE6_PASSTHROUGH,
      .texture7   = TEXTURE7_PASSTHROUGH,
      .texture8   = TEXTURE8_PASSTHROUGH,
      .texture9   = TEXTURE9_PASSTHROUGH,
   });

   /* OpenGL's GL_POINT_SMOOTH */
   P_IMMD(p, NV9097, SET_ANTI_ALIASED_POINT, ENABLE_FALSE);

   if (pdev->info.cls_eng3d >= MAXWELL_B)
      P_IMMD(p, NVB197, SET_FILL_VIA_TRIANGLE, MODE_DISABLED);

   P_IMMD(p, NV9097, SET_POLY_SMOOTH, ENABLE_FALSE);

   P_IMMD(p, NV9097, SET_VIEWPORT_PIXEL, CENTER_AT_HALF_INTEGERS);

   P_IMMD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_ANTI_ALIAS),
          nvk_mme_anti_alias_init());

   /* Enable multisample rasterization even for single-sample rasterization;
    * this way we get strict lines and rectangular line support.  For more
    * detail, see the DirectX rasterization rules.
    */
   P_IMMD(p, NV9097, SET_ANTI_ALIAS_ENABLE, V_TRUE);

   if (pdev->info.cls_eng3d >= MAXWELL_B) {
      P_IMMD(p, NVB197, SET_POST_PS_INITIAL_COVERAGE, true);
      P_IMMD(p, NVB197, SET_OFFSET_RENDER_TARGET_INDEX,
                        BY_VIEWPORT_INDEX_FALSE);
   }

   /* TODO: Vertex runout */

   P_IMMD(p, NV9097, SET_WINDOW_ORIGIN, {
      .mode    = MODE_UPPER_LEFT,
      .flip_y  = FLIP_Y_FALSE,
   });

   P_MTHD(p, NV9097, SET_WINDOW_OFFSET_X);
   P_NV9097_SET_WINDOW_OFFSET_X(p, 0);
   P_NV9097_SET_WINDOW_OFFSET_Y(p, 0);

   P_IMMD(p, NV9097, SET_ACTIVE_ZCULL_REGION, 0x3f);
   P_IMMD(p, NV9097, SET_WINDOW_CLIP_ENABLE, V_FALSE);
   P_IMMD(p, NV9097, SET_CLIP_ID_TEST, ENABLE_FALSE);

//   P_IMMD(p, NV9097, X_X_X_SET_CLEAR_CONTROL, {
//      .respect_stencil_mask   = RESPECT_STENCIL_MASK_FALSE,
//      .use_clear_rect         = USE_CLEAR_RECT_FALSE,
//   });

   P_IMMD(p, NV9097, SET_VIEWPORT_SCALE_OFFSET, ENABLE_TRUE);

   P_IMMD(p, NV9097, SET_VIEWPORT_CLIP_CONTROL, {
      .min_z_zero_max_z_one      = MIN_Z_ZERO_MAX_Z_ONE_FALSE,
      .pixel_min_z               = PIXEL_MIN_Z_CLAMP,
      .pixel_max_z               = PIXEL_MAX_Z_CLAMP,
      .geometry_guardband        = GEOMETRY_GUARDBAND_SCALE_256,
      .line_point_cull_guardband = LINE_POINT_CULL_GUARDBAND_SCALE_256,
      .geometry_clip             = GEOMETRY_CLIP_WZERO_CLIP,
      .geometry_guardband_z      = GEOMETRY_GUARDBAND_Z_SAME_AS_XY_GUARDBAND,
   });

   for (unsigned i = 0; i < 16; i++)
      P_IMMD(p, NV9097, SET_SCISSOR_ENABLE(i), V_FALSE);

   P_IMMD(p, NV9097, SET_CT_MRT_ENABLE, V_TRUE);

   if (pdev->info.cls_eng3d < VOLTA_A) {
      uint64_t shader_base_addr =
         nvk_heap_contiguous_base_address(&dev->shader_heap);

      P_MTHD(p, NV9097, SET_PROGRAM_REGION_A);
      P_NV9097_SET_PROGRAM_REGION_A(p, shader_base_addr >> 32);
      P_NV9097_SET_PROGRAM_REGION_B(p, shader_base_addr);
   }

   for (uint32_t group = 0; group < 5; group++) {
      for (uint32_t slot = 0; slot < 16; slot++) {
         P_IMMD(p, NV9097, BIND_GROUP_CONSTANT_BUFFER(group), {
            .valid = VALID_FALSE,
            .shader_slot = slot,
         });
      }
   }

//   P_MTHD(cmd->push, NVC0_3D, MACRO_GP_SELECT);
//   P_INLINE_DATA(cmd->push, 0x40);
   P_IMMD(p, NV9097, SET_RT_LAYER, {
      .v = 0,
      .control = CONTROL_V_SELECTS_LAYER,
   });
//   P_MTHD(cmd->push, NVC0_3D, MACRO_TEP_SELECT);
//   P_INLINE_DATA(cmd->push, 0x30);

   P_IMMD(p, NV9097, SET_POINT_CENTER_MODE, V_OGL);
   P_IMMD(p, NV9097, SET_EDGE_FLAG, V_TRUE);
   P_IMMD(p, NV9097, SET_SAMPLER_BINDING, V_INDEPENDENTLY);

   uint64_t zero_addr = dev->zero_page->va->addr;
   P_MTHD(p, NV9097, SET_VERTEX_STREAM_SUBSTITUTE_A);
   P_NV9097_SET_VERTEX_STREAM_SUBSTITUTE_A(p, zero_addr >> 32);
   P_NV9097_SET_VERTEX_STREAM_SUBSTITUTE_B(p, zero_addr);

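   /* Start with all 32 vertex streams disabled.  The VB_ENABLES shadow
    * scratch register is cleared to match; judging by its name, it mirrors
    * which vertex buffers are currently bound for use by draw-time MMEs.
    */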
   P_MTHD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_VB_ENABLES));
   P_NV9097_SET_MME_SHADOW_SCRATCH(p, NVK_MME_SCRATCH_VB_ENABLES, 0);
   for (uint32_t b = 0; b < 32; b++) {
      P_IMMD(p, NV9097, SET_VERTEX_STREAM_A_FORMAT(b), {
         .enable = false,
      });
   }

   if (pdev->info.cls_eng3d >= FERMI_A &&
       pdev->info.cls_eng3d < MAXWELL_A) {
      assert(dev->vab_memory);
      uint64_t vab_addr = dev->vab_memory->va->addr;
      P_MTHD(p, NV9097, SET_VAB_MEMORY_AREA_A);
      P_NV9097_SET_VAB_MEMORY_AREA_A(p, vab_addr >> 32);
      P_NV9097_SET_VAB_MEMORY_AREA_B(p, vab_addr);
      P_NV9097_SET_VAB_MEMORY_AREA_C(p, SIZE_BYTES_256K);
   }

   if (pdev->info.cls_eng3d == MAXWELL_A)
      P_IMMD(p, NVB097, SET_SELECT_MAXWELL_TEXTURE_HEADERS, V_TRUE);

   /* Store the address to CB0 in a pair of state registers */
   uint64_t cb0_addr = queue->draw_cb0->va->addr;
   P_MTHD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_CB0_ADDR_HI));
   P_NV9097_SET_MME_SHADOW_SCRATCH(p, NVK_MME_SCRATCH_CB0_ADDR_HI, cb0_addr >> 32);
   P_NV9097_SET_MME_SHADOW_SCRATCH(p, NVK_MME_SCRATCH_CB0_ADDR_LO, cb0_addr);

   /* Store the address to the zero page in a pair of state registers */
   P_MTHD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_ZERO_ADDR_HI));
   P_NV9097_SET_MME_SHADOW_SCRATCH(p, NVK_MME_SCRATCH_ZERO_ADDR_HI, zero_addr >> 32);
   P_NV9097_SET_MME_SHADOW_SCRATCH(p, NVK_MME_SCRATCH_ZERO_ADDR_LO, zero_addr);

   /* We leave CB0 selected by default */
   P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_SELECT_CB0));
   P_INLINE_DATA(p, 0);

   /* Bind CB0 to all shader groups */
   for (uint32_t group = 0; group < 5; group++) {
      P_IMMD(p, NV9097, BIND_GROUP_CONSTANT_BUFFER(group), {
         .valid = VALID_TRUE,
         .shader_slot = 0,
      });
   }

   /* Zero out CB0 */
   P_1INC(p, NV9097, LOAD_CONSTANT_BUFFER_OFFSET);
   P_NV9097_LOAD_CONSTANT_BUFFER_OFFSET(p, 0);
   for (uint32_t dw = 0; dw < NVK_DRAW_CB0_SIZE / 4; dw++)
      P_INLINE_DATA(p, 0);

   /* These are shadowed in cb0 so they need to be zeroed as well for
    * consistency.
    */
   P_IMMD(p, NV9097, SET_GLOBAL_BASE_INSTANCE_INDEX, 0);
   P_MTHD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_CB0_FIRST_VERTEX));
   P_NV9097_SET_MME_SHADOW_SCRATCH(p, NVK_MME_SCRATCH_CB0_FIRST_VERTEX, 0);
   P_NV9097_SET_MME_SHADOW_SCRATCH(p, NVK_MME_SCRATCH_CB0_DRAW_INDEX, 0);
   P_NV9097_SET_MME_SHADOW_SCRATCH(p, NVK_MME_SCRATCH_CB0_VIEW_INDEX, 0);

   return VK_SUCCESS;
}

static void
nvk_cmd_buffer_dirty_render_pass(struct nvk_cmd_buffer *cmd)
{
   struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;

   /* These depend on color attachment count */
   BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES);
   BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_ENABLES);
   BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_EQUATIONS);
   BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_WRITE_MASKS);

   /* These depend on the depth/stencil format */
   BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_TEST_ENABLE);
   BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_WRITE_ENABLE);
   BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE);
   BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE);

   /* This may depend on render targets for ESO */
   BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES);

   /* This may depend on render targets */
   BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_COLOR_ATTACHMENT_MAP);
}

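/* Flushes a dirty byte range of the root descriptor table to cb0.  The range
 * is rounded out to whole dwords and uploaded inline with
 * LOAD_CONSTANT_BUFFER, relying on cb0 being left selected by
 * NVK_MME_SELECT_CB0.
 */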
static void
nvk_cmd_flush_gfx_root_desc(struct nvk_cmd_buffer *cmd,
                            struct nvk_descriptor_state *desc,
                            size_t offset, size_t size)
{
   const uint32_t start_dw = offset / 4;
   const uint32_t end_dw = DIV_ROUND_UP(offset + size, 4);
   const uint32_t len_dw = end_dw - start_dw;

   struct nv_push *p = nvk_cmd_buffer_push(cmd, 2 + len_dw);
   P_1INC(p, NV9097, LOAD_CONSTANT_BUFFER_OFFSET);
   P_NV9097_LOAD_CONSTANT_BUFFER_OFFSET(p, start_dw * 4);

   const uint32_t *root_dw = (uint32_t *)desc->root;
   P_INLINE_ARRAY(p, &root_dw[start_dw], len_dw);
}

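/* At the start of a primary command buffer, invalidate the sampler, texture
 * header, and constant caches, since we assume nothing about what previously
 * executed work may have left cached.  Secondary command buffers inherit
 * whatever the primary has already set up.
 */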
void
nvk_cmd_buffer_begin_graphics(struct nvk_cmd_buffer *cmd,
                              const VkCommandBufferBeginInfo *pBeginInfo)
{
   if (cmd->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
      struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
      P_MTHD(p, NV9097, INVALIDATE_SAMPLER_CACHE_NO_WFI);
      P_NV9097_INVALIDATE_SAMPLER_CACHE_NO_WFI(p, {
         .lines = LINES_ALL,
      });
      P_NV9097_INVALIDATE_TEXTURE_HEADER_CACHE_NO_WFI(p, {
         .lines = LINES_ALL,
      });

      P_IMMD(p, NVA097, INVALIDATE_SHADER_CACHES_NO_WFI, {
         .constant = CONSTANT_TRUE,
      });
   }

   cmd->state.gfx.descriptors.flush_root = nvk_cmd_flush_gfx_root_desc;

   if (cmd->vk.level != VK_COMMAND_BUFFER_LEVEL_PRIMARY &&
       (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT)) {
      char gcbiar_data[VK_GCBIARR_DATA_SIZE(NVK_MAX_RTS)];
      const VkRenderingInfo *resume_info =
         vk_get_command_buffer_inheritance_as_rendering_resume(cmd->vk.level,
                                                               pBeginInfo,
                                                               gcbiar_data);
      if (resume_info) {
         nvk_CmdBeginRendering(nvk_cmd_buffer_to_handle(cmd), resume_info);
      } else {
         const VkCommandBufferInheritanceRenderingInfo *inheritance_info =
            vk_get_command_buffer_inheritance_rendering_info(cmd->vk.level,
                                                             pBeginInfo);
         assert(inheritance_info);

         struct nvk_rendering_state *render = &cmd->state.gfx.render;
         render->flags = inheritance_info->flags;
         render->area = (VkRect2D) { };
         render->layer_count = 0;
         render->view_mask = inheritance_info->viewMask;
         render->samples = inheritance_info->rasterizationSamples;

         render->color_att_count = inheritance_info->colorAttachmentCount;
         for (uint32_t i = 0; i < render->color_att_count; i++) {
            render->color_att[i].vk_format =
               inheritance_info->pColorAttachmentFormats[i];
         }
         render->depth_att.vk_format =
            inheritance_info->depthAttachmentFormat;
         render->stencil_att.vk_format =
            inheritance_info->stencilAttachmentFormat;

         const VkRenderingAttachmentLocationInfoKHR att_loc_info_default = {
            .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_LOCATION_INFO_KHR,
            .colorAttachmentCount = inheritance_info->colorAttachmentCount,
         };
         const VkRenderingAttachmentLocationInfoKHR *att_loc_info =
            vk_get_command_buffer_rendering_attachment_location_info(
               cmd->vk.level, pBeginInfo);
         if (att_loc_info == NULL)
            att_loc_info = &att_loc_info_default;

         vk_cmd_set_rendering_attachment_locations(&cmd->vk, att_loc_info);

         nvk_cmd_buffer_dirty_render_pass(cmd);
      }
   }

   cmd->state.gfx.shaders_dirty = ~0;
}

void
nvk_cmd_invalidate_graphics_state(struct nvk_cmd_buffer *cmd)
{
   vk_dynamic_graphics_state_dirty_all(&cmd->vk.dynamic_graphics_state);

   /* From the Vulkan 1.3.275 spec:
    *
    *    "...There is one exception to this rule - if the primary command
    *    buffer is inside a render pass instance, then the render pass and
    *    subpass state is not disturbed by executing secondary command
    *    buffers."
    *
    * We need to reset everything EXCEPT the render pass state.
    */
   struct nvk_rendering_state render_save = cmd->state.gfx.render;
   memset(&cmd->state.gfx, 0, sizeof(cmd->state.gfx));
   cmd->state.gfx.render = render_save;

   /* We need to keep the flush_root callback */
   cmd->state.gfx.descriptors.flush_root = nvk_cmd_flush_gfx_root_desc;

   cmd->state.gfx.shaders_dirty = ~0;
}

static void
nvk_attachment_init(struct nvk_attachment *att,
                    const VkRenderingAttachmentInfo *info)
{
   if (info == NULL || info->imageView == VK_NULL_HANDLE) {
      *att = (struct nvk_attachment) { .iview = NULL, };
      return;
   }

   VK_FROM_HANDLE(nvk_image_view, iview, info->imageView);
   *att = (struct nvk_attachment) {
      .vk_format = iview->vk.format,
      .iview = iview,
   };

   if (info->resolveMode != VK_RESOLVE_MODE_NONE) {
      VK_FROM_HANDLE(nvk_image_view, res_iview, info->resolveImageView);
      att->resolve_mode = info->resolveMode;
      att->resolve_iview = res_iview;
   }

   att->store_op = info->storeOp;
}

static uint32_t
nil_to_nv9097_samples_mode(enum nil_sample_layout sample_layout)
{
#define MODE(S) [NIL_SAMPLE_LAYOUT_##S] = NV9097_SET_ANTI_ALIAS_SAMPLES_MODE_##S
   uint16_t nil_to_nv9097[] = {
      MODE(1X1),
      MODE(2X1),
      MODE(2X2),
      MODE(4X2),
      MODE(4X4),
   };
#undef MODE
   assert(sample_layout < ARRAY_SIZE(nil_to_nv9097));

   return nil_to_nv9097[sample_layout];
}

VKAPI_ATTR void VKAPI_CALL
nvk_GetRenderingAreaGranularityKHR(
    VkDevice device,
    const VkRenderingAreaInfoKHR *pRenderingAreaInfo,
    VkExtent2D *pGranularity)
{
   *pGranularity = (VkExtent2D) { .width = 1, .height = 1 };
}

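/* Returns true if every used color attachment is linear (pitch) rather than
 * tiled.  When all attachments are linear we can render to them directly;
 * otherwise any linear color attachment is redirected to its tiled shadow
 * image and copied in/out around the render pass (see nvk_CmdBeginRendering
 * and nvk_CmdEndRendering).
 */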
static bool
nvk_rendering_all_linear(const struct nvk_rendering_state *render)
{
   /* Depth and stencil are never linear */
   if (render->depth_att.iview || render->stencil_att.iview)
      return false;

   for (uint32_t i = 0; i < render->color_att_count; i++) {
      const struct nvk_image_view *iview = render->color_att[i].iview;
      if (iview == NULL)
         continue;

      const struct nvk_image *image = (struct nvk_image *)iview->vk.image;
      const uint8_t ip = iview->planes[0].image_plane;
      const struct nil_image_level *level =
         &image->planes[ip].nil.levels[iview->vk.base_mip_level];

      if (level->tiling.is_tiled)
         return false;
   }

   return true;
}

VKAPI_ATTR void VKAPI_CALL
nvk_CmdBeginRendering(VkCommandBuffer commandBuffer,
                      const VkRenderingInfo *pRenderingInfo)
{
   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
   struct nvk_rendering_state *render = &cmd->state.gfx.render;

   memset(render, 0, sizeof(*render));

   render->flags = pRenderingInfo->flags;
   render->area = pRenderingInfo->renderArea;
   render->view_mask = pRenderingInfo->viewMask;
   render->layer_count = pRenderingInfo->layerCount;
   render->samples = 0;

   const uint32_t layer_count =
      render->view_mask ? util_last_bit(render->view_mask) :
                          render->layer_count;

   render->color_att_count = pRenderingInfo->colorAttachmentCount;
   for (uint32_t i = 0; i < render->color_att_count; i++) {
      nvk_attachment_init(&render->color_att[i],
                          &pRenderingInfo->pColorAttachments[i]);
   }

   nvk_attachment_init(&render->depth_att,
                       pRenderingInfo->pDepthAttachment);
   nvk_attachment_init(&render->stencil_att,
                       pRenderingInfo->pStencilAttachment);

   render->all_linear = nvk_rendering_all_linear(render);

   const VkRenderingAttachmentLocationInfoKHR ral_info = {
      .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_LOCATION_INFO_KHR,
      .colorAttachmentCount = pRenderingInfo->colorAttachmentCount,
   };
   vk_cmd_set_rendering_attachment_locations(&cmd->vk, &ral_info);

   nvk_cmd_buffer_dirty_render_pass(cmd);

   struct nv_push *p = nvk_cmd_buffer_push(cmd, NVK_MAX_RTS * 12 + 29);

   P_IMMD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_VIEW_MASK),
          render->view_mask);

   P_MTHD(p, NV9097, SET_SURFACE_CLIP_HORIZONTAL);
   P_NV9097_SET_SURFACE_CLIP_HORIZONTAL(p, {
      .x       = render->area.offset.x,
      .width   = render->area.extent.width,
   });
   P_NV9097_SET_SURFACE_CLIP_VERTICAL(p, {
      .y       = render->area.offset.y,
      .height  = render->area.extent.height,
   });

   enum nil_sample_layout sample_layout = NIL_SAMPLE_LAYOUT_INVALID;

   /* We always emit SET_COLOR_TARGET_A(i) for every color target, regardless
    * of the number of targets in the render pass.  This ensures that we have
    * no left over pointers from previous render passes in the hardware.  This
    * also allows us to point at any render target with SET_CT_SELECT and know
    * that it's either a valid render target or NULL.
    */
   for (uint32_t i = 0; i < NVK_MAX_RTS; i++) {
      if (render->color_att[i].iview) {
         const struct nvk_image_view *iview = render->color_att[i].iview;
         const struct nvk_image *image = (struct nvk_image *)iview->vk.image;
         /* Rendering to multi-planar images is valid for a specific single
          * plane only, so assert that what we have is a single plane, obtain
          * its index, and begin rendering.
          */
         assert(iview->plane_count == 1);
         const uint8_t ip = iview->planes[0].image_plane;
         const struct nvk_image_plane *plane = &image->planes[ip];

         if (!render->all_linear && !plane->nil.levels[0].tiling.is_tiled)
            plane = &image->linear_tiled_shadow;

         const struct nil_image *nil_image = &plane->nil;
         const struct nil_image_level *level =
            &nil_image->levels[iview->vk.base_mip_level];
         struct nil_Extent4D_Samples level_extent_sa =
            nil_image_level_extent_sa(nil_image, iview->vk.base_mip_level);

         assert(sample_layout == NIL_SAMPLE_LAYOUT_INVALID ||
                sample_layout == nil_image->sample_layout);
         sample_layout = nil_image->sample_layout;
         render->samples = image->vk.samples;

         uint64_t addr = nvk_image_plane_base_address(plane) + level->offset_B;

         if (nil_image->dim == NIL_IMAGE_DIM_3D) {
            addr += nil_image_level_z_offset_B(nil_image,
                                               iview->vk.base_mip_level,
                                               iview->vk.base_array_layer);
            assert(layer_count <= iview->vk.extent.depth);
         } else {
            addr += iview->vk.base_array_layer *
                    (uint64_t)nil_image->array_stride_B;
            assert(layer_count <= iview->vk.layer_count);
         }

         P_MTHD(p, NV9097, SET_COLOR_TARGET_A(i));
         P_NV9097_SET_COLOR_TARGET_A(p, i, addr >> 32);
         P_NV9097_SET_COLOR_TARGET_B(p, i, addr);

         if (level->tiling.is_tiled) {
            const enum pipe_format p_format =
               vk_format_to_pipe_format(iview->vk.format);

            /* We use the stride for depth/stencil targets because the Z/S
             * hardware has no concept of a tile width.  Instead, we just set
             * the width to the stride divided by bpp.
             */
            const uint32_t row_stride_el =
               level->row_stride_B / util_format_get_blocksize(p_format);
            P_NV9097_SET_COLOR_TARGET_WIDTH(p, i, row_stride_el);
            P_NV9097_SET_COLOR_TARGET_HEIGHT(p, i, level_extent_sa.height);
            const uint8_t ct_format = nil_format_to_color_target(p_format);
            P_NV9097_SET_COLOR_TARGET_FORMAT(p, i, ct_format);

            P_NV9097_SET_COLOR_TARGET_MEMORY(p, i, {
               .block_width   = BLOCK_WIDTH_ONE_GOB,
               .block_height  = level->tiling.y_log2,
               .block_depth   = level->tiling.z_log2,
               .layout        = LAYOUT_BLOCKLINEAR,
               .third_dimension_control = (nil_image->dim == NIL_IMAGE_DIM_3D) ?
                  THIRD_DIMENSION_CONTROL_THIRD_DIMENSION_DEFINES_DEPTH_SIZE :
                  THIRD_DIMENSION_CONTROL_THIRD_DIMENSION_DEFINES_ARRAY_SIZE,
            });

            P_NV9097_SET_COLOR_TARGET_THIRD_DIMENSION(p, i, layer_count);
            P_NV9097_SET_COLOR_TARGET_ARRAY_PITCH(p, i,
               nil_image->array_stride_B >> 2);
            P_NV9097_SET_COLOR_TARGET_LAYER(p, i, 0);
         } else {
            /* NVIDIA can only render to 2D linear images */
            assert(nil_image->dim == NIL_IMAGE_DIM_2D);
            /* NVIDIA can only render to non-multisampled images */
            assert(sample_layout == NIL_SAMPLE_LAYOUT_1X1);
            /* NVIDIA doesn't support linear array images */
            assert(iview->vk.base_array_layer == 0 && layer_count == 1);

            uint32_t pitch = level->row_stride_B;
            const enum pipe_format p_format =
               vk_format_to_pipe_format(iview->vk.format);
            /* When memory layout is set to LAYOUT_PITCH, the WIDTH field
             * takes the row pitch.
             */
            P_NV9097_SET_COLOR_TARGET_WIDTH(p, i, pitch);
            P_NV9097_SET_COLOR_TARGET_HEIGHT(p, i, level_extent_sa.height);

            const uint8_t ct_format = nil_format_to_color_target(p_format);
            P_NV9097_SET_COLOR_TARGET_FORMAT(p, i, ct_format);

            P_NV9097_SET_COLOR_TARGET_MEMORY(p, i, {
               .layout = LAYOUT_PITCH,
               .third_dimension_control =
                  THIRD_DIMENSION_CONTROL_THIRD_DIMENSION_DEFINES_ARRAY_SIZE,
            });

            P_NV9097_SET_COLOR_TARGET_THIRD_DIMENSION(p, i, 1);
            P_NV9097_SET_COLOR_TARGET_ARRAY_PITCH(p, i, 0);
            P_NV9097_SET_COLOR_TARGET_LAYER(p, i, 0);
         }

         P_IMMD(p, NV9097, SET_COLOR_COMPRESSION(i), nil_image->compressed);
      } else {
         P_MTHD(p, NV9097, SET_COLOR_TARGET_A(i));
         P_NV9097_SET_COLOR_TARGET_A(p, i, 0);
         P_NV9097_SET_COLOR_TARGET_B(p, i, 0);
         P_NV9097_SET_COLOR_TARGET_WIDTH(p, i, 64);
         P_NV9097_SET_COLOR_TARGET_HEIGHT(p, i, 0);
         P_NV9097_SET_COLOR_TARGET_FORMAT(p, i, V_DISABLED);
         P_NV9097_SET_COLOR_TARGET_MEMORY(p, i, {
            .layout        = LAYOUT_BLOCKLINEAR,
         });
         P_NV9097_SET_COLOR_TARGET_THIRD_DIMENSION(p, i, layer_count);
         P_NV9097_SET_COLOR_TARGET_ARRAY_PITCH(p, i, 0);
         P_NV9097_SET_COLOR_TARGET_LAYER(p, i, 0);

         P_IMMD(p, NV9097, SET_COLOR_COMPRESSION(i), ENABLE_TRUE);
      }
   }

   if (render->depth_att.iview || render->stencil_att.iview) {
      struct nvk_image_view *iview = render->depth_att.iview ?
                                     render->depth_att.iview :
                                     render->stencil_att.iview;
      const struct nvk_image *image = (struct nvk_image *)iview->vk.image;
      /* Depth/stencil are always single-plane */
      assert(iview->plane_count == 1);
      const uint8_t ip = iview->planes[0].image_plane;
      struct nil_image nil_image = image->planes[ip].nil;

      uint64_t addr = nvk_image_base_address(image, ip);
      uint32_t mip_level = iview->vk.base_mip_level;
      uint32_t base_array_layer = iview->vk.base_array_layer;

      if (nil_image.dim == NIL_IMAGE_DIM_3D) {
         uint64_t level_offset_B;
         nil_image = nil_image_3d_level_as_2d_array(&nil_image, mip_level,
                                                    &level_offset_B);
         addr += level_offset_B;
         mip_level = 0;
         base_array_layer = 0;
         assert(layer_count <= iview->vk.extent.depth);
      } else {
         assert(layer_count <= iview->vk.layer_count);
      }

      const struct nil_image_level *level = &nil_image.levels[mip_level];
      addr += level->offset_B;

      assert(sample_layout == NIL_SAMPLE_LAYOUT_INVALID ||
             sample_layout == nil_image.sample_layout);
      sample_layout = nil_image.sample_layout;
      render->samples = image->vk.samples;

      P_MTHD(p, NV9097, SET_ZT_A);
      P_NV9097_SET_ZT_A(p, addr >> 32);
      P_NV9097_SET_ZT_B(p, addr);
      const enum pipe_format p_format =
         vk_format_to_pipe_format(iview->vk.format);
      const uint8_t zs_format = nil_format_to_depth_stencil(p_format);
      P_NV9097_SET_ZT_FORMAT(p, zs_format);
      assert(level->tiling.is_tiled);
      assert(level->tiling.z_log2 == 0);
      P_NV9097_SET_ZT_BLOCK_SIZE(p, {
         .width = WIDTH_ONE_GOB,
         .height = level->tiling.y_log2,
         .depth = DEPTH_ONE_GOB,
      });
      P_NV9097_SET_ZT_ARRAY_PITCH(p, nil_image.array_stride_B >> 2);

      P_IMMD(p, NV9097, SET_ZT_SELECT, 1 /* target_count */);

      struct nil_Extent4D_Samples level_extent_sa =
         nil_image_level_extent_sa(&nil_image, mip_level);

      /* We use the stride for depth/stencil targets because the Z/S hardware
       * has no concept of a tile width.  Instead, we just set the width to
       * the stride divided by bpp.
       */
      const uint32_t row_stride_el =
         level->row_stride_B / util_format_get_blocksize(p_format);

      P_MTHD(p, NV9097, SET_ZT_SIZE_A);
      P_NV9097_SET_ZT_SIZE_A(p, row_stride_el);
      P_NV9097_SET_ZT_SIZE_B(p, level_extent_sa.height);
      P_NV9097_SET_ZT_SIZE_C(p, {
         .third_dimension  = base_array_layer + layer_count,
         .control          = CONTROL_THIRD_DIMENSION_DEFINES_ARRAY_SIZE,
      });

      P_IMMD(p, NV9097, SET_ZT_LAYER, base_array_layer);

      P_IMMD(p, NV9097, SET_Z_COMPRESSION, nil_image.compressed);

      if (nvk_cmd_buffer_3d_cls(cmd) >= MAXWELL_B) {
         P_IMMD(p, NVC597, SET_ZT_SPARSE, {
            .enable = ENABLE_FALSE,
         });
      }
   } else {
      P_IMMD(p, NV9097, SET_ZT_SELECT, 0 /* target_count */);
   }

   /* From the Vulkan 1.3.275 spec:
    *
    *    "It is legal for a subpass to use no color or depth/stencil
    *    attachments, either because it has no attachment references or
    *    because all of them are VK_ATTACHMENT_UNUSED. This kind of subpass
    *    can use shader side effects such as image stores and atomics to
    *    produce an output. In this case, the subpass continues to use the
    *    width, height, and layers of the framebuffer to define the dimensions
    *    of the rendering area, and the rasterizationSamples from each
    *    pipeline’s VkPipelineMultisampleStateCreateInfo to define the number
    *    of samples used in rasterization;"
    *
    * In the case where we have attachments, we emit SET_ANTI_ALIAS here
    * because SET_COLOR_TARGET_* and SET_ZT_* don't have any other way of
    * specifying the sample layout and we want to ensure it matches.  When
    * we don't have any attachments, we defer SET_ANTI_ALIAS to draw time
    * where we base it on dynamic rasterizationSamples.
    */
   if (sample_layout != NIL_SAMPLE_LAYOUT_INVALID) {
      P_IMMD(p, NV9097, SET_ANTI_ALIAS,
             nil_to_nv9097_samples_mode(sample_layout));
   }

   if (render->flags & VK_RENDERING_RESUMING_BIT)
      return;

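   /* For linear color attachments that are being rendered via their tiled
    * shadow image, pre-fill the shadow from the linear image when the
    * attachment is loaded with VK_ATTACHMENT_LOAD_OP_LOAD.
    */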
   for (uint32_t i = 0; i < pRenderingInfo->colorAttachmentCount; i++) {
      const struct nvk_image_view *iview = render->color_att[i].iview;
      if (iview == NULL)
         continue;

      const struct nvk_image *image = (struct nvk_image *)iview->vk.image;
      assert(iview->plane_count == 1);
      const uint8_t ip = iview->planes[0].image_plane;
      const struct nvk_image_plane *plane = &image->planes[ip];

      const VkAttachmentLoadOp load_op =
         pRenderingInfo->pColorAttachments[i].loadOp;
      if (!render->all_linear && !plane->nil.levels[0].tiling.is_tiled &&
          load_op == VK_ATTACHMENT_LOAD_OP_LOAD)
         nvk_linear_render_copy(cmd, iview, render->area, true);
   }

   uint32_t clear_count = 0;
   VkClearAttachment clear_att[NVK_MAX_RTS + 1];
   for (uint32_t i = 0; i < pRenderingInfo->colorAttachmentCount; i++) {
      const VkRenderingAttachmentInfo *att_info =
         &pRenderingInfo->pColorAttachments[i];
      if (att_info->imageView == VK_NULL_HANDLE ||
          att_info->loadOp != VK_ATTACHMENT_LOAD_OP_CLEAR)
         continue;

      clear_att[clear_count++] = (VkClearAttachment) {
         .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
         .colorAttachment = i,
         .clearValue = att_info->clearValue,
      };
   }

   clear_att[clear_count] = (VkClearAttachment) { .aspectMask = 0, };
   if (pRenderingInfo->pDepthAttachment != NULL &&
       pRenderingInfo->pDepthAttachment->imageView != VK_NULL_HANDLE &&
       pRenderingInfo->pDepthAttachment->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR) {
      clear_att[clear_count].aspectMask |= VK_IMAGE_ASPECT_DEPTH_BIT;
      clear_att[clear_count].clearValue.depthStencil.depth =
         pRenderingInfo->pDepthAttachment->clearValue.depthStencil.depth;
   }
   if (pRenderingInfo->pStencilAttachment != NULL &&
       pRenderingInfo->pStencilAttachment->imageView != VK_NULL_HANDLE &&
       pRenderingInfo->pStencilAttachment->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR) {
      clear_att[clear_count].aspectMask |= VK_IMAGE_ASPECT_STENCIL_BIT;
      clear_att[clear_count].clearValue.depthStencil.stencil =
         pRenderingInfo->pStencilAttachment->clearValue.depthStencil.stencil;
   }
   if (clear_att[clear_count].aspectMask != 0)
      clear_count++;

   if (clear_count > 0) {
      const VkClearRect clear_rect = {
         .rect = render->area,
         .baseArrayLayer = 0,
         .layerCount = render->view_mask ? 1 : render->layer_count,
      };

      P_MTHD(p, NV9097, SET_RENDER_ENABLE_OVERRIDE);
      P_NV9097_SET_RENDER_ENABLE_OVERRIDE(p, MODE_ALWAYS_RENDER);

      nvk_CmdClearAttachments(nvk_cmd_buffer_to_handle(cmd),
                              clear_count, clear_att, 1, &clear_rect);
      p = nvk_cmd_buffer_push(cmd, 2);
      P_MTHD(p, NV9097, SET_RENDER_ENABLE_OVERRIDE);
      P_NV9097_SET_RENDER_ENABLE_OVERRIDE(p, MODE_USE_RENDER_ENABLE);
   }

   /* TODO: Attachment clears */
}

VKAPI_ATTR void VKAPI_CALL
nvk_CmdEndRendering(VkCommandBuffer commandBuffer)
{
   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
   struct nvk_rendering_state *render = &cmd->state.gfx.render;
   if (!(render->flags & VK_RENDERING_SUSPENDING_BIT)) {
      for (uint32_t i = 0; i < render->color_att_count; i++) {
         struct nvk_image_view *iview = render->color_att[i].iview;
         if (iview == NULL)
            continue;

         struct nvk_image *image = (struct nvk_image *)iview->vk.image;
         const uint8_t ip = iview->planes[0].image_plane;
         const struct nvk_image_plane *plane = &image->planes[ip];
         if (!render->all_linear && !plane->nil.levels[0].tiling.is_tiled &&
             render->color_att[i].store_op == VK_ATTACHMENT_STORE_OP_STORE)
            nvk_linear_render_copy(cmd, iview, render->area, false);
      }
   }

   bool need_resolve = false;

   /* Translate render state back to VK for meta */
   VkRenderingAttachmentInfo vk_color_att[NVK_MAX_RTS];
   for (uint32_t i = 0; i < render->color_att_count; i++) {
      if (render->color_att[i].resolve_mode != VK_RESOLVE_MODE_NONE)
         need_resolve = true;

      vk_color_att[i] = (VkRenderingAttachmentInfo) {
         .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO,
         .imageView = nvk_image_view_to_handle(render->color_att[i].iview),
         .imageLayout = VK_IMAGE_LAYOUT_GENERAL,
         .resolveMode = render->color_att[i].resolve_mode,
         .resolveImageView =
            nvk_image_view_to_handle(render->color_att[i].resolve_iview),
         .resolveImageLayout = VK_IMAGE_LAYOUT_GENERAL,
      };
   }

   const VkRenderingAttachmentInfo vk_depth_att = {
      .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO,
      .imageView = nvk_image_view_to_handle(render->depth_att.iview),
      .imageLayout = VK_IMAGE_LAYOUT_GENERAL,
      .resolveMode = render->depth_att.resolve_mode,
      .resolveImageView =
         nvk_image_view_to_handle(render->depth_att.resolve_iview),
      .resolveImageLayout = VK_IMAGE_LAYOUT_GENERAL,
   };
   if (render->depth_att.resolve_mode != VK_RESOLVE_MODE_NONE)
      need_resolve = true;

   const VkRenderingAttachmentInfo vk_stencil_att = {
      .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO,
      .imageView = nvk_image_view_to_handle(render->stencil_att.iview),
      .imageLayout = VK_IMAGE_LAYOUT_GENERAL,
      .resolveMode = render->stencil_att.resolve_mode,
      .resolveImageView =
         nvk_image_view_to_handle(render->stencil_att.resolve_iview),
      .resolveImageLayout = VK_IMAGE_LAYOUT_GENERAL,
   };
   if (render->stencil_att.resolve_mode != VK_RESOLVE_MODE_NONE)
      need_resolve = true;

   const VkRenderingInfo vk_render = {
      .sType = VK_STRUCTURE_TYPE_RENDERING_INFO,
      .renderArea = render->area,
      .layerCount = render->layer_count,
      .viewMask = render->view_mask,
      .colorAttachmentCount = render->color_att_count,
      .pColorAttachments = vk_color_att,
      .pDepthAttachment = &vk_depth_att,
      .pStencilAttachment = &vk_stencil_att,
   };

   if (render->flags & VK_RENDERING_SUSPENDING_BIT)
      need_resolve = false;

   memset(render, 0, sizeof(*render));

   if (need_resolve) {
      struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
      P_IMMD(p, NVA097, INVALIDATE_TEXTURE_DATA_CACHE, {
         .lines = LINES_ALL,
      });

      nvk_meta_resolve_rendering(cmd, &vk_render);
   }
}

void
nvk_cmd_bind_graphics_shader(struct nvk_cmd_buffer *cmd,
                             const gl_shader_stage stage,
                             struct nvk_shader *shader)
{
   assert(stage < ARRAY_SIZE(cmd->state.gfx.shaders));
   if (cmd->state.gfx.shaders[stage] == shader)
      return;

   cmd->state.gfx.shaders[stage] = shader;
   cmd->state.gfx.shaders_dirty |= BITFIELD_BIT(stage);
}

static uint32_t
mesa_to_nv9097_shader_type(gl_shader_stage stage)
{
   static const uint32_t mesa_to_nv9097[] = {
      [MESA_SHADER_VERTEX]    = NV9097_SET_PIPELINE_SHADER_TYPE_VERTEX,
      [MESA_SHADER_TESS_CTRL] = NV9097_SET_PIPELINE_SHADER_TYPE_TESSELLATION_INIT,
      [MESA_SHADER_TESS_EVAL] = NV9097_SET_PIPELINE_SHADER_TYPE_TESSELLATION,
      [MESA_SHADER_GEOMETRY]  = NV9097_SET_PIPELINE_SHADER_TYPE_GEOMETRY,
      [MESA_SHADER_FRAGMENT]  = NV9097_SET_PIPELINE_SHADER_TYPE_PIXEL,
   };
   assert(stage < ARRAY_SIZE(mesa_to_nv9097));
   return mesa_to_nv9097[stage];
}

static uint32_t
nvk_pipeline_bind_group(gl_shader_stage stage)
{
   return stage;
}

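/* Packs tessellation state into the layout used by the TESS_PARAMS shadow
 * scratch register, which matches SET_TESSELLATION_PARAMETERS with one extra
 * bit appended:
 *
 *    bits  3:0   domain  (nak_ts_domain)
 *    bits  7:4   spacing (nak_ts_spacing)
 *    bits 11:8   prims   (nak_ts_prims)
 *    bit  12     lower_left (NVK-internal, stripped before hitting HW)
 *
 * The value is returned as a val/mask pair for NVK_MME_SET_TESS_PARAMS.
 */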
static uint32_t
nvk_mme_tess_params(enum nak_ts_domain domain,
                    enum nak_ts_spacing spacing,
                    enum nak_ts_prims prims)
{
   /* This is laid out the same as SET_TESSELLATION_PARAMETERS, only with an
    * extra bit for lower_left
    */
   uint16_t params = ((uint16_t)domain << 0) |
                     ((uint16_t)spacing << 4) |
                     ((uint16_t)prims << 8);
   return nvk_mme_val_mask(params, 0x0fff);
}

static uint32_t
nvk_mme_tess_lower_left(bool lower_left)
{
   return nvk_mme_val_mask((uint16_t)lower_left << 12, 1u << 12);
}

void
nvk_mme_set_tess_params(struct mme_builder *b)
{
   struct mme_value val_mask = mme_load(b);
   struct mme_value old_params = nvk_mme_load_scratch(b, TESS_PARAMS);
   struct mme_value params = nvk_mme_set_masked(b, old_params, val_mask);
   mme_free_reg(b, val_mask);

   mme_if(b, ine, params, old_params) {
      nvk_mme_store_scratch(b, TESS_PARAMS, params);

      /* lower_left lives at bit 12 */
      struct mme_value lower_left = mme_merge(b, mme_zero(), params, 0, 1, 12);

      /* Only the bottom 12 bits are valid to put in HW */
      mme_merge_to(b, params, mme_zero(), params, 0, 12, 0);

      /* If we're using a lower-left orientation, we need to flip triangles
       * between CW and CCW.
       */
      mme_if(b, ine, lower_left, mme_zero()) {
         struct mme_value prims_cw = mme_imm(NAK_TS_PRIMS_TRIANGLES_CW);
         struct mme_value prims_ccw = mme_imm(NAK_TS_PRIMS_TRIANGLES_CCW);

         struct mme_value prims = mme_merge(b, mme_zero(), params, 0, 4, 8);
         mme_if(b, ieq, prims, prims_cw) {
            mme_merge_to(b, params, params, prims_ccw, 8, 4, 0);
         }
         mme_if(b, ieq, prims, prims_ccw) {
            mme_merge_to(b, params, params, prims_cw, 8, 4, 0);
         }
         mme_free_reg(b, prims);
      }
      mme_free_reg(b, lower_left);

      mme_mthd(b, NV9097_SET_TESSELLATION_PARAMETERS);
      mme_emit(b, params);
   }
}

1318 const struct nvk_mme_test_case nvk_mme_set_tess_params_tests[] = {{
1319    /* This case doesn't change the state so it should do nothing */
1320    .init = (struct nvk_mme_mthd_data[]) {
1321       { NVK_SET_MME_SCRATCH(TESS_PARAMS), 0 },
1322       { }
1323    },
1324    .params = (uint32_t[]) { 0xffff0000 },
1325    .expected = (struct nvk_mme_mthd_data[]) {
1326       { }
1327    },
1328 }, {
1329    /* TRIANGLE, INTEGER, TRIANGLES_CW, lower_left = false */
1330    .init = (struct nvk_mme_mthd_data[]) {
1331       { NVK_SET_MME_SCRATCH(TESS_PARAMS), 0 },
1332       { }
1333    },
1334    .params = (uint32_t[]) { 0xffff0201 },
1335    .expected = (struct nvk_mme_mthd_data[]) {
1336       { NVK_SET_MME_SCRATCH(TESS_PARAMS), 0x0201 },
1337       { NV9097_SET_TESSELLATION_PARAMETERS, 0x0201 },
1338       { }
1339    },
1340 }, {
1341    /* TRIANGLE, INTEGER, TRIANGLES_CW, lower_left = true */
1342    .init = (struct nvk_mme_mthd_data[]) {
1343       { NVK_SET_MME_SCRATCH(TESS_PARAMS), 0x0201 },
1344       { }
1345    },
1346    .params = (uint32_t[]) { 0x10001000 },
1347    .expected = (struct nvk_mme_mthd_data[]) {
1348       { NVK_SET_MME_SCRATCH(TESS_PARAMS), 0x1201 },
1349       { NV9097_SET_TESSELLATION_PARAMETERS, 0x0301 },
1350       { }
1351    },
1352 }, {
1353    /* TRIANGLE, INTEGER, TRIANGLES_CCW, lower_left = true */
1354    .init = (struct nvk_mme_mthd_data[]) {
1355       { NVK_SET_MME_SCRATCH(TESS_PARAMS), 0x0301 },
1356       { }
1357    },
1358    .params = (uint32_t[]) { 0x10001000 },
1359    .expected = (struct nvk_mme_mthd_data[]) {
1360       { NVK_SET_MME_SCRATCH(TESS_PARAMS), 0x1301 },
1361       { NV9097_SET_TESSELLATION_PARAMETERS, 0x0201 },
1362       { }
1363    },
1364 }, {}};
1365 
1366 static uint32_t nvk_mme_anti_alias_min_sample_shading(float mss);
1367 
1368 static void
1369 nvk_flush_shaders(struct nvk_cmd_buffer *cmd)
1370 {
1371    if (cmd->state.gfx.shaders_dirty == 0)
1372       return;
1373 
1374    /* Map shader types to shaders */
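   /* (The 3D class exposes six pipeline shader slots, hence the array size.) */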
1375    struct nvk_shader *type_shader[6] = { NULL, };
1376    uint32_t types_dirty = 0;
1377 
1378    const uint32_t gfx_stages = BITFIELD_BIT(MESA_SHADER_VERTEX) |
1379                                BITFIELD_BIT(MESA_SHADER_TESS_CTRL) |
1380                                BITFIELD_BIT(MESA_SHADER_TESS_EVAL) |
1381                                BITFIELD_BIT(MESA_SHADER_GEOMETRY) |
1382                                BITFIELD_BIT(MESA_SHADER_FRAGMENT);
1383 
1384    u_foreach_bit(stage, cmd->state.gfx.shaders_dirty & gfx_stages) {
1385       uint32_t type = mesa_to_nv9097_shader_type(stage);
1386       types_dirty |= BITFIELD_BIT(type);
1387 
1388       /* Only copy non-NULL shaders because mesh/task alias with vertex and
1389        * tessellation stages.
1390        */
1391       struct nvk_shader *shader = cmd->state.gfx.shaders[stage];
1392       if (shader != NULL) {
1393          assert(type < ARRAY_SIZE(type_shader));
1394          assert(type_shader[type] == NULL);
1395          type_shader[type] = shader;
1396 
1397          const struct nvk_cbuf_map *cbuf_map = &shader->cbuf_map;
1398          struct nvk_cbuf_group *cbuf_group =
1399             &cmd->state.gfx.cbuf_groups[nvk_cbuf_binding_for_stage(stage)];
1400          for (uint32_t i = 0; i < cbuf_map->cbuf_count; i++) {
1401             if (memcmp(&cbuf_group->cbufs[i], &cbuf_map->cbufs[i],
1402                        sizeof(cbuf_group->cbufs[i])) != 0) {
1403                cbuf_group->cbufs[i] = cbuf_map->cbufs[i];
1404                cbuf_group->dirty |= BITFIELD_BIT(i);
1405             }
1406          }
1407       }
1408    }
1409 
1410    u_foreach_bit(type, types_dirty) {
1411       struct nvk_shader *shader = type_shader[type];
1412 
1413       /* We always map index == type */
1414       const uint32_t idx = type;
1415 
1416       struct nv_push *p = nvk_cmd_buffer_push(cmd, 10);
1417       P_IMMD(p, NV9097, SET_PIPELINE_SHADER(idx), {
1418          .enable  = shader != NULL,
1419          .type    = type,
1420       });
1421 
1422       if (shader == NULL)
1423          continue;
1424 
1425       uint64_t addr = shader->hdr_addr;
1426       if (nvk_cmd_buffer_3d_cls(cmd) >= VOLTA_A) {
1427          P_MTHD(p, NVC397, SET_PIPELINE_PROGRAM_ADDRESS_A(idx));
1428          P_NVC397_SET_PIPELINE_PROGRAM_ADDRESS_A(p, idx, addr >> 32);
1429          P_NVC397_SET_PIPELINE_PROGRAM_ADDRESS_B(p, idx, addr);
1430       } else {
1431          assert(addr < 0xffffffff);
1432          P_IMMD(p, NV9097, SET_PIPELINE_PROGRAM(idx), addr);
1433       }
1434 
1435       P_MTHD(p, NVC397, SET_PIPELINE_REGISTER_COUNT(idx));
1436       P_NVC397_SET_PIPELINE_REGISTER_COUNT(p, idx, shader->info.num_gprs);
1437       P_NVC397_SET_PIPELINE_BINDING(p, idx,
1438          nvk_pipeline_bind_group(shader->info.stage));
1439 
1440       if (shader->info.stage == MESA_SHADER_TESS_EVAL) {
1441          P_1INC(p, NVB197, CALL_MME_MACRO(NVK_MME_SET_TESS_PARAMS));
1442          P_INLINE_DATA(p, nvk_mme_tess_params(shader->info.ts.domain,
1443                                               shader->info.ts.spacing,
1444                                               shader->info.ts.prims));
1445       }
1446 
1447       if (shader->info.stage == MESA_SHADER_FRAGMENT) {
1448          p = nvk_cmd_buffer_push(cmd, 11);
1449 
1450          P_MTHD(p, NVC397, SET_SUBTILING_PERF_KNOB_A);
1451          P_NV9097_SET_SUBTILING_PERF_KNOB_A(p, {
1452             .fraction_of_spm_register_file_per_subtile         = 0x10,
1453             .fraction_of_spm_pixel_output_buffer_per_subtile   = 0x40,
1454             .fraction_of_spm_triangle_ram_per_subtile          = 0x16,
1455             .fraction_of_max_quads_per_subtile                 = 0x20,
1456          });
1457          P_NV9097_SET_SUBTILING_PERF_KNOB_B(p, 0x20);
1458 
1459          P_IMMD(p, NV9097, SET_API_MANDATED_EARLY_Z,
1460                 shader->info.fs.early_fragment_tests);
1461 
1462          if (nvk_cmd_buffer_3d_cls(cmd) >= MAXWELL_B) {
1463             P_IMMD(p, NVB197, SET_POST_Z_PS_IMASK,
1464                    shader->info.fs.post_depth_coverage);
1465          } else {
1466             assert(!shader->info.fs.post_depth_coverage);
1467          }
1468 
1469          P_IMMD(p, NV9097, SET_ZCULL_BOUNDS, {
1470             .z_min_unbounded_enable = shader->info.fs.writes_depth,
1471             .z_max_unbounded_enable = shader->info.fs.writes_depth,
1472          });
1473 
1474          P_1INC(p, NVB197, CALL_MME_MACRO(NVK_MME_SET_ANTI_ALIAS));
1475          P_INLINE_DATA(p,
1476             nvk_mme_anti_alias_min_sample_shading(shader->min_sample_shading));
1477       }
1478    }
1479 
1480    const uint32_t vtg_stages = BITFIELD_BIT(MESA_SHADER_VERTEX) |
1481                                BITFIELD_BIT(MESA_SHADER_TESS_EVAL) |
1482                                BITFIELD_BIT(MESA_SHADER_GEOMETRY);
1483    const uint32_t vtgm_stages = vtg_stages | BITFIELD_BIT(MESA_SHADER_MESH);
1484 
1485    if (cmd->state.gfx.shaders_dirty & vtg_stages) {
1486       struct nak_xfb_info *xfb = NULL;
1487       u_foreach_bit(stage, vtg_stages) {
1488          if (cmd->state.gfx.shaders[stage] != NULL)
1489             xfb = &cmd->state.gfx.shaders[stage]->info.vtg.xfb;
1490       }
1491 
1492       if (xfb == NULL) {
1493          struct nv_push *p = nvk_cmd_buffer_push(cmd, 8);
1494          for (uint8_t b = 0; b < 4; b++)
1495             P_IMMD(p, NV9097, SET_STREAM_OUT_CONTROL_COMPONENT_COUNT(b), 0);
1496       } else {
1497          for (uint8_t b = 0; b < ARRAY_SIZE(xfb->attr_count); b++) {
1498             const uint8_t attr_count = xfb->attr_count[b];
1499             /* upload packed varying indices in multiples of 4 bytes */
1500             const uint32_t n = DIV_ROUND_UP(attr_count, 4);
1501 
1502             struct nv_push *p = nvk_cmd_buffer_push(cmd, 5 + n);
1503 
1504             P_MTHD(p, NV9097, SET_STREAM_OUT_CONTROL_STREAM(b));
1505             P_NV9097_SET_STREAM_OUT_CONTROL_STREAM(p, b, xfb->stream[b]);
1506             P_NV9097_SET_STREAM_OUT_CONTROL_COMPONENT_COUNT(p, b, attr_count);
1507             P_NV9097_SET_STREAM_OUT_CONTROL_STRIDE(p, b, xfb->stride[b]);
1508 
1509             if (n > 0) {
1510                P_MTHD(p, NV9097, SET_STREAM_OUT_LAYOUT_SELECT(b, 0));
1511                P_INLINE_ARRAY(p, (const uint32_t*)xfb->attr_index[b], n);
1512             }
1513          }
1514       }
1515    }
1516 
1517    if (cmd->state.gfx.shaders_dirty & vtgm_stages) {
1518       struct nvk_shader *last_vtgm = NULL;
1519       u_foreach_bit(stage, vtgm_stages) {
1520          if (cmd->state.gfx.shaders[stage] != NULL)
1521             last_vtgm = cmd->state.gfx.shaders[stage];
1522       }
1523 
1524       struct nv_push *p = nvk_cmd_buffer_push(cmd, 6);
1525 
1526       P_IMMD(p, NV9097, SET_RT_LAYER, {
1527          .v       = 0,
1528          .control = last_vtgm->info.vtg.writes_layer ?
1529                     CONTROL_GEOMETRY_SHADER_SELECTS_LAYER :
1530                     CONTROL_V_SELECTS_LAYER,
1531       });
1532 
1533       const uint8_t clip_enable = last_vtgm->info.vtg.clip_enable;
1534       const uint8_t cull_enable = last_vtgm->info.vtg.cull_enable;
1535       P_IMMD(p, NV9097, SET_USER_CLIP_ENABLE, {
1536          .plane0 = ((clip_enable | cull_enable) >> 0) & 1,
1537          .plane1 = ((clip_enable | cull_enable) >> 1) & 1,
1538          .plane2 = ((clip_enable | cull_enable) >> 2) & 1,
1539          .plane3 = ((clip_enable | cull_enable) >> 3) & 1,
1540          .plane4 = ((clip_enable | cull_enable) >> 4) & 1,
1541          .plane5 = ((clip_enable | cull_enable) >> 5) & 1,
1542          .plane6 = ((clip_enable | cull_enable) >> 6) & 1,
1543          .plane7 = ((clip_enable | cull_enable) >> 7) & 1,
1544       });
1545       P_IMMD(p, NV9097, SET_USER_CLIP_OP, {
1546          .plane0 = (cull_enable >> 0) & 1,
1547          .plane1 = (cull_enable >> 1) & 1,
1548          .plane2 = (cull_enable >> 2) & 1,
1549          .plane3 = (cull_enable >> 3) & 1,
1550          .plane4 = (cull_enable >> 4) & 1,
1551          .plane5 = (cull_enable >> 5) & 1,
1552          .plane6 = (cull_enable >> 6) & 1,
1553          .plane7 = (cull_enable >> 7) & 1,
1554       });
1555    }
1556 
1557    cmd->state.gfx.shaders_dirty = 0;
1558 }
1559 
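/* Takes a 32-bit mask of enabled vertex buffers, XORs it against the saved
 * VB_ENABLES scratch, and rewrites the enable bit (bit 12, per the merge
 * below) of SET_VERTEX_STREAM_A_FORMAT only for the bindings whose enable
 * actually changed.
 */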
1560 void
1561 nvk_mme_set_vb_enables(struct mme_builder *b)
1562 {
1563    struct mme_value enables = mme_load(b);
1564    struct mme_value old_enables = nvk_mme_load_scratch(b, VB_ENABLES);
1565    nvk_mme_store_scratch(b, VB_ENABLES, enables);
1566 
1567    struct mme_value changed = mme_xor(b, enables, old_enables);
1568    mme_free_reg(b, old_enables);
1569 
1570    struct mme_value vb_idx4 = mme_mov(b, mme_zero());
1571    mme_while(b, ine, changed, mme_zero()) {
1572       mme_if(b, ine, mme_and(b, changed, mme_imm(1)), mme_zero()) {
1573          struct mme_value state =
1574             mme_state_arr(b, NV9097_SET_VERTEX_STREAM_A_FORMAT(0), vb_idx4);
1575          mme_merge_to(b, state, state, enables, 12, 1, 0);
1576          mme_mthd_arr(b, NV9097_SET_VERTEX_STREAM_A_FORMAT(0), vb_idx4);
1577          mme_emit(b, state);
1578       }
1579       mme_add_to(b, vb_idx4, vb_idx4, mme_imm(4));
1580       mme_srl_to(b, changed, changed, mme_imm(1));
1581       mme_srl_to(b, enables, enables, mme_imm(1));
1582    }
1583 }
1584 
1585 static uint32_t
1586 nvk_mme_vb_stride(uint32_t vb_idx, uint32_t stride)
1587 {
1588    assert(stride < (1 << 12));
1589    assert(vb_idx < (1 << 5));
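   /* e.g. binding 3 with a 32-byte stride packs to 0x00030020 */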
1590    return (vb_idx << 16) | stride;
1591 }
1592 
1593 void
1594 nvk_mme_set_vb_stride(struct mme_builder *b)
1595 {
1596    /* Param is laid out as
1597     *
1598     *    bits 0..11  : stride
1599     *    bits 16..20 : VB index
1600     */
1601    struct mme_value param = mme_load(b);
1602 
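   /* vb_idx4 = vb_idx * 4: consecutive vertex streams are four dwords apart
    * in the method space, so the array index steps by 4 (the same trick
    * nvk_mme_set_vb_enables() uses above).
    */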
1603    struct mme_value vb_idx4 = mme_merge(b, mme_zero(), param, 2, 5, 16);
1604 
1605    struct mme_value state =
1606       mme_state_arr(b, NV9097_SET_VERTEX_STREAM_A_FORMAT(0), vb_idx4);
1607    struct mme_value new_state = mme_merge(b, state, param, 0, 12, 0);
1608    mme_if(b, ine, state, new_state) {
1609       mme_mthd_arr(b, NV9097_SET_VERTEX_STREAM_A_FORMAT(0), vb_idx4);
1610       mme_emit(b, new_state);
1611    }
1612 }
1613 
1614 static void
1615 nvk_flush_vi_state(struct nvk_cmd_buffer *cmd)
1616 {
1617    struct nvk_device *dev = nvk_cmd_buffer_device(cmd);
1618    struct nvk_physical_device *pdev = nvk_device_physical(dev);
1619    const struct vk_dynamic_graphics_state *dyn =
1620       &cmd->vk.dynamic_graphics_state;
1621 
1622    struct nv_push *p = nvk_cmd_buffer_push(cmd, 258);
1623 
1624    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDINGS_VALID)) {
1625       P_1INC(p, NVB197, CALL_MME_MACRO(NVK_MME_SET_VB_ENABLES));
1626       P_INLINE_DATA(p, dyn->vi->bindings_valid);
1627    }
1628 
1629    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI) ||
1630        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDINGS_VALID)) {
1631       u_foreach_bit(a, dyn->vi->attributes_valid) {
1632          const struct nvk_va_format *fmt =
1633             nvk_get_va_format(pdev, dyn->vi->attributes[a].format);
1634 
1635          P_IMMD(p, NV9097, SET_VERTEX_ATTRIBUTE_A(a), {
1636             .stream                 = dyn->vi->attributes[a].binding,
1637             .offset                 = dyn->vi->attributes[a].offset,
1638             .component_bit_widths   = fmt->bit_widths,
1639             .numerical_type         = fmt->type,
1640             .swap_r_and_b           = fmt->swap_rb,
1641          });
1642       }
1643 
1644       u_foreach_bit(b, dyn->vi->bindings_valid) {
1645          const bool instanced = dyn->vi->bindings[b].input_rate ==
1646                                 VK_VERTEX_INPUT_RATE_INSTANCE;
1647          P_IMMD(p, NV9097, SET_VERTEX_STREAM_INSTANCE_A(b), instanced);
1648          P_IMMD(p, NV9097, SET_VERTEX_STREAM_A_FREQUENCY(b),
1649             dyn->vi->bindings[b].divisor);
1650       }
1651    }
1652 
1653    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDINGS_VALID) ||
1654        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDING_STRIDES)) {
1655       u_foreach_bit(b, dyn->vi->bindings_valid) {
1656          assert(dyn->vi_binding_strides[b] < (1 << 12));
1657          P_1INC(p, NVB197, CALL_MME_MACRO(NVK_MME_SET_VB_STRIDE));
1658          P_INLINE_DATA(p, nvk_mme_vb_stride(b, dyn->vi_binding_strides[b]));
1659       }
1660    }
1661 }
1662 
1663 static uint32_t
1664 vk_to_nv9097_primitive_topology(VkPrimitiveTopology prim)
1665 {
1666    switch (prim) {
1667    case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
1668       return NV9097_BEGIN_OP_POINTS;
1669    case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
1670       return NV9097_BEGIN_OP_LINES;
1671    case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
1672       return NV9097_BEGIN_OP_LINE_STRIP;
1673    case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
1674 #pragma GCC diagnostic push
1675 #pragma GCC diagnostic ignored "-Wswitch"
1676    case VK_PRIMITIVE_TOPOLOGY_META_RECT_LIST_MESA:
1677 #pragma GCC diagnostic pop
1678       return NV9097_BEGIN_OP_TRIANGLES;
1679    case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
1680       return NV9097_BEGIN_OP_TRIANGLE_STRIP;
1681    case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
1682       return NV9097_BEGIN_OP_TRIANGLE_FAN;
1683    case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
1684       return NV9097_BEGIN_OP_LINELIST_ADJCY;
1685    case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
1686       return NV9097_BEGIN_OP_LINESTRIP_ADJCY;
1687    case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
1688       return NV9097_BEGIN_OP_TRIANGLELIST_ADJCY;
1689    case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
1690       return NV9097_BEGIN_OP_TRIANGLESTRIP_ADJCY;
1691    case VK_PRIMITIVE_TOPOLOGY_PATCH_LIST:
1692       return NV9097_BEGIN_OP_PATCH;
1693    default:
1694       unreachable("Invalid primitive topology");
1695    }
1696 }
1697 
1698 static void
1699 nvk_flush_ia_state(struct nvk_cmd_buffer *cmd)
1700 {
1701    const struct vk_dynamic_graphics_state *dyn =
1702       &cmd->vk.dynamic_graphics_state;
1703 
1704    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_TOPOLOGY)) {
1705       uint32_t begin;
1706       V_NV9097_BEGIN(begin, {
1707          .op = vk_to_nv9097_primitive_topology(dyn->ia.primitive_topology),
1708          .primitive_id = NV9097_BEGIN_PRIMITIVE_ID_FIRST,
1709          .instance_id = NV9097_BEGIN_INSTANCE_ID_FIRST,
1710          .split_mode = SPLIT_MODE_NORMAL_BEGIN_NORMAL_END,
1711       });
1712 
1713       struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
1714       P_MTHD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_DRAW_BEGIN));
1715       P_INLINE_DATA(p, begin);
1716    }
1717 
1718    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_RESTART_ENABLE)) {
1719       struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
1720       P_IMMD(p, NV9097, SET_DA_PRIMITIVE_RESTART,
1721              dyn->ia.primitive_restart_enable);
1722    }
1723 }
1724 
1725 static void
1726 nvk_flush_ts_state(struct nvk_cmd_buffer *cmd)
1727 {
1728    const struct vk_dynamic_graphics_state *dyn =
1729       &cmd->vk.dynamic_graphics_state;
1730    struct nv_push *p = nvk_cmd_buffer_push(cmd, 4);
1731 
1732    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_TS_PATCH_CONTROL_POINTS)) {
1733       /* The hardware gets grumpy if we set this to 0 so make sure we set it
1734        * to at least 1 in case it's dirty but uninitialized.
1735        */
1736       P_IMMD(p, NV9097, SET_PATCH, MAX2(1, dyn->ts.patch_control_points));
1737    }
1738 
1739    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_TS_DOMAIN_ORIGIN)) {
1740       P_1INC(p, NVB197, CALL_MME_MACRO(NVK_MME_SET_TESS_PARAMS));
1741       P_INLINE_DATA(p, nvk_mme_tess_lower_left(
1742          dyn->ts.domain_origin == VK_TESSELLATION_DOMAIN_ORIGIN_LOWER_LEFT));
1743    }
1744 }
1745 
1746 static void
1747 nvk_flush_vp_state(struct nvk_cmd_buffer *cmd)
1748 {
1749    const struct vk_dynamic_graphics_state *dyn =
1750       &cmd->vk.dynamic_graphics_state;
1751 
1752    struct nv_push *p =
1753       nvk_cmd_buffer_push(cmd, 18 * dyn->vp.viewport_count + 4 * NVK_MAX_VIEWPORTS);
1754 
1755    /* Nothing to do for MESA_VK_DYNAMIC_VP_VIEWPORT_COUNT */
1756 
1757    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS) ||
1758        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE)) {
1759       for (uint32_t i = 0; i < dyn->vp.viewport_count; i++) {
1760          const VkViewport *vp = &dyn->vp.viewports[i];
1761 
1762          /* These exactly match the spec values.  Nvidia hardware oddities
1763           * are accounted for later.
1764           */
1765          const float o_x = vp->x + 0.5f * vp->width;
1766          const float o_y = vp->y + 0.5f * vp->height;
1767          const float o_z = !dyn->vp.depth_clip_negative_one_to_one ?
1768                            vp->minDepth :
1769                            (vp->maxDepth + vp->minDepth) * 0.5f;
1770 
1771          const float p_x = vp->width;
1772          const float p_y = vp->height;
1773          const float p_z = !dyn->vp.depth_clip_negative_one_to_one ?
1774                            vp->maxDepth - vp->minDepth :
1775                            (vp->maxDepth - vp->minDepth) * 0.5f;
1776 
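         /* The hardware applies the usual window = offset + scale * NDC
          * transform, so the spec's half-width/half-height become the X/Y
          * scales and o_* the offsets.
          */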
1777          P_MTHD(p, NV9097, SET_VIEWPORT_SCALE_X(i));
1778          P_NV9097_SET_VIEWPORT_SCALE_X(p, i, fui(0.5f * p_x));
1779          P_NV9097_SET_VIEWPORT_SCALE_Y(p, i, fui(0.5f * p_y));
1780          P_NV9097_SET_VIEWPORT_SCALE_Z(p, i, fui(p_z));
1781 
1782          P_NV9097_SET_VIEWPORT_OFFSET_X(p, i, fui(o_x));
1783          P_NV9097_SET_VIEWPORT_OFFSET_Y(p, i, fui(o_y));
1784          P_NV9097_SET_VIEWPORT_OFFSET_Z(p, i, fui(o_z));
1785 
1786          float xmin = vp->x;
1787          float xmax = vp->x + vp->width;
1788          float ymin = MIN2(vp->y, vp->y + vp->height);
1789          float ymax = MAX2(vp->y, vp->y + vp->height);
1790          float zmin = MIN2(vp->minDepth, vp->maxDepth);
1791          float zmax = MAX2(vp->minDepth, vp->maxDepth);
1792          assert(xmin <= xmax && ymin <= ymax);
1793 
1794          const float max_dim = (float)0xffff;
1795          xmin = CLAMP(xmin, 0, max_dim);
1796          xmax = CLAMP(xmax, 0, max_dim);
1797          ymin = CLAMP(ymin, 0, max_dim);
1798          ymax = CLAMP(ymax, 0, max_dim);
1799 
1800          P_MTHD(p, NV9097, SET_VIEWPORT_CLIP_HORIZONTAL(i));
1801          P_NV9097_SET_VIEWPORT_CLIP_HORIZONTAL(p, i, {
1802             .x0      = xmin,
1803             .width   = xmax - xmin,
1804          });
1805          P_NV9097_SET_VIEWPORT_CLIP_VERTICAL(p, i, {
1806             .y0      = ymin,
1807             .height  = ymax - ymin,
1808          });
1809 
1810          if (nvk_cmd_buffer_3d_cls(cmd) >= VOLTA_A) {
1811             P_NV9097_SET_VIEWPORT_CLIP_MIN_Z(p, i, fui(zmin));
1812             P_NV9097_SET_VIEWPORT_CLIP_MAX_Z(p, i, fui(zmax));
1813          } else {
1814             P_1INC(p, NVB197, CALL_MME_MACRO(NVK_MME_SET_VIEWPORT_MIN_MAX_Z));
1815             P_INLINE_DATA(p, i);
1816             P_INLINE_DATA(p, fui(zmin));
1817             P_INLINE_DATA(p, fui(zmax));
1818          }
1819 
1820          if (nvk_cmd_buffer_3d_cls(cmd) >= MAXWELL_B) {
1821             P_IMMD(p, NVB197, SET_VIEWPORT_COORDINATE_SWIZZLE(i), {
1822                .x = X_POS_X,
1823                .y = Y_POS_Y,
1824                .z = Z_POS_Z,
1825                .w = W_POS_W,
1826             });
1827          }
1828       }
1829    }
1830 
1831    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE)) {
1832       P_IMMD(p, NV9097, SET_VIEWPORT_Z_CLIP,
1833              dyn->vp.depth_clip_negative_one_to_one ?
1834              RANGE_NEGATIVE_W_TO_POSITIVE_W :
1835              RANGE_ZERO_TO_POSITIVE_W);
1836    }
1837 
1838    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_SCISSOR_COUNT)) {
1839       for (unsigned i = dyn->vp.scissor_count; i < NVK_MAX_VIEWPORTS; i++)
1840          P_IMMD(p, NV9097, SET_SCISSOR_ENABLE(i), V_FALSE);
1841    }
1842 
1843    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_SCISSORS)) {
1844       for (unsigned i = 0; i < dyn->vp.scissor_count; i++) {
1845          const VkRect2D *s = &dyn->vp.scissors[i];
1846 
1847          const uint32_t xmin = MIN2(16384, s->offset.x);
1848          const uint32_t xmax = MIN2(16384, s->offset.x + s->extent.width);
1849          const uint32_t ymin = MIN2(16384, s->offset.y);
1850          const uint32_t ymax = MIN2(16384, s->offset.y + s->extent.height);
1851 
1852          P_MTHD(p, NV9097, SET_SCISSOR_ENABLE(i));
1853          P_NV9097_SET_SCISSOR_ENABLE(p, i, V_TRUE);
1854          P_NV9097_SET_SCISSOR_HORIZONTAL(p, i, {
1855             .xmin = xmin,
1856             .xmax = xmax,
1857          });
1858          P_NV9097_SET_SCISSOR_VERTICAL(p, i, {
1859             .ymin = ymin,
1860             .ymax = ymax,
1861          });
1862       }
1863    }
1864 }
1865 
1866 static uint32_t
1867 vk_to_nv9097_polygon_mode(VkPolygonMode vk_mode)
1868 {
1869    ASSERTED uint16_t vk_to_nv9097[] = {
1870       [VK_POLYGON_MODE_FILL]  = NV9097_SET_FRONT_POLYGON_MODE_V_FILL,
1871       [VK_POLYGON_MODE_LINE]  = NV9097_SET_FRONT_POLYGON_MODE_V_LINE,
1872       [VK_POLYGON_MODE_POINT] = NV9097_SET_FRONT_POLYGON_MODE_V_POINT,
1873    };
1874    assert(vk_mode < ARRAY_SIZE(vk_to_nv9097));
1875 
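   /* The V_* values are the classic GL polygon-mode tokens (POINT = 0x1b00,
    * LINE = 0x1b01, FILL = 0x1b02) while VkPolygonMode counts FILL, LINE,
    * POINT from zero, hence the (2 - vk_mode); the assert checks the trick
    * against the table.
    */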
1876    uint32_t nv9097_mode = 0x1b00 | (2 - vk_mode);
1877    assert(nv9097_mode == vk_to_nv9097[vk_mode]);
1878    return nv9097_mode;
1879 }
1880 
1881 static uint32_t
1882 vk_to_nv9097_cull_mode(VkCullModeFlags vk_cull_mode)
1883 {
1884    static const uint16_t vk_to_nv9097[] = {
1885       [VK_CULL_MODE_FRONT_BIT]      = NV9097_OGL_SET_CULL_FACE_V_FRONT,
1886       [VK_CULL_MODE_BACK_BIT]       = NV9097_OGL_SET_CULL_FACE_V_BACK,
1887       [VK_CULL_MODE_FRONT_AND_BACK] = NV9097_OGL_SET_CULL_FACE_V_FRONT_AND_BACK,
1888    };
1889    assert(vk_cull_mode < ARRAY_SIZE(vk_to_nv9097));
1890    return vk_to_nv9097[vk_cull_mode];
1891 }
1892 
1893 static uint32_t
1894 vk_to_nv9097_front_face(VkFrontFace vk_face)
1895 {
1896    /* Vulkan and OpenGL are backwards here because Vulkan assumes the D3D
1897     * convention in which framebuffer coordinates always start in the upper
1898     * left while OpenGL has framebuffer coordinates starting in the lower
1899     * left.  Therefore, we want the reverse of the hardware enum name.
1900     */
1901    ASSERTED static const uint16_t vk_to_nv9097[] = {
1902       [VK_FRONT_FACE_COUNTER_CLOCKWISE]   = NV9097_OGL_SET_FRONT_FACE_V_CCW,
1903       [VK_FRONT_FACE_CLOCKWISE]           = NV9097_OGL_SET_FRONT_FACE_V_CW,
1904    };
1905    assert(vk_face < ARRAY_SIZE(vk_to_nv9097));
1906 
1907    uint32_t nv9097_face = 0x900 | (1 - vk_face);
1908    assert(nv9097_face == vk_to_nv9097[vk_face]);
1909    return nv9097_face;
1910 }
1911 
1912 static uint32_t
1913 vk_to_nv9097_provoking_vertex(VkProvokingVertexModeEXT vk_mode)
1914 {
1915    STATIC_ASSERT(VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT ==
1916                  NV9097_SET_PROVOKING_VERTEX_V_FIRST);
1917    STATIC_ASSERT(VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT ==
1918                  NV9097_SET_PROVOKING_VERTEX_V_LAST);
1919    return vk_mode;
1920 }
1921 
1922 void
1923 nvk_mme_set_viewport_min_max_z(struct mme_builder *b)
1924 {
1925    struct mme_value vp_idx = mme_load(b);
1926    struct mme_value min_z = mme_load(b);
1927    struct mme_value max_z = mme_load(b);
1928 
1929    /* Multiply by 2 because it's an array with stride 8 */
1930    mme_sll_to(b, vp_idx, vp_idx, mme_imm(1));
1931    mme_mthd_arr(b, NVK_SET_MME_SCRATCH(VIEWPORT0_MIN_Z), vp_idx);
1932    mme_emit(b, min_z);
1933    mme_emit(b, max_z);
1934 
1935    struct mme_value z_clamp = nvk_mme_load_scratch(b, Z_CLAMP);
1936    mme_if(b, ine, z_clamp, mme_zero()) {
1937       /* Multiply by 2 again because this array has stride 16 */
1938       mme_sll_to(b, vp_idx, vp_idx, mme_imm(1));
1939       mme_mthd_arr(b, NV9097_SET_VIEWPORT_CLIP_MIN_Z(0), vp_idx);
1940       mme_emit(b, min_z);
1941       mme_emit(b, max_z);
1942    }
1943 }
1944 
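/* Pre-Volta companion to NVK_MME_SET_VIEWPORT_MIN_MAX_Z: when depth clamp is
 * turned on, it replays the saved per-viewport min/max Z into
 * SET_VIEWPORT_CLIP_MIN/MAX_Z; when it's turned off, it resets every
 * viewport's clip range to -/+INF.
 */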
1945 void
1946 nvk_mme_set_z_clamp(struct mme_builder *b)
1947 {
1948    struct mme_value z_clamp = mme_load(b);
1949    struct mme_value old_z_clamp = nvk_mme_load_scratch(b, Z_CLAMP);
1950    mme_if(b, ine, z_clamp, old_z_clamp) {
1951       nvk_mme_store_scratch(b, Z_CLAMP, z_clamp);
1952 
1953       mme_if(b, ine, z_clamp, mme_zero()) {
1954          struct mme_value i_2 = mme_mov(b, mme_zero());
1955          mme_while(b, ine, i_2, mme_imm(NVK_MAX_VIEWPORTS * 2)) {
1956             struct mme_value min_z =
1957                mme_state_arr(b, NVK_SET_MME_SCRATCH(VIEWPORT0_MIN_Z), i_2);
1958             struct mme_value max_z =
1959                mme_state_arr(b, NVK_SET_MME_SCRATCH(VIEWPORT0_MAX_Z), i_2);
1960 
1961             struct mme_value i_4 = mme_sll(b, i_2, mme_imm(1));
1962             mme_mthd_arr(b, NV9097_SET_VIEWPORT_CLIP_MIN_Z(0), i_4);
1963             mme_emit(b, min_z);
1964             mme_emit(b, max_z);
1965 
1966             mme_free_reg(b, i_4);
1967             mme_free_reg(b, min_z);
1968             mme_free_reg(b, max_z);
1969 
1970             mme_add_to(b, i_2, i_2, mme_imm(2));
1971          }
1972          mme_free_reg(b, i_2);
1973       }
1974       mme_if(b, ieq, z_clamp, mme_zero()) {
1975          struct mme_value i_4 = mme_mov(b, mme_zero());
1976          mme_while(b, ine, i_4, mme_imm(NVK_MAX_VIEWPORTS * 4)) {
1977             mme_mthd_arr(b, NV9097_SET_VIEWPORT_CLIP_MIN_Z(0), i_4);
1978             mme_emit(b, mme_imm(fui(-INFINITY)));
1979             mme_emit(b, mme_imm(fui(INFINITY)));
1980 
1981             mme_add_to(b, i_4, i_4, mme_imm(4));
1982          }
1983          mme_free_reg(b, i_4);
1984       }
1985    }
1986 }
1987 
1988 static void
1989 nvk_flush_rs_state(struct nvk_cmd_buffer *cmd)
1990 {
1991    struct nv_push *p = nvk_cmd_buffer_push(cmd, 46);
1992 
1993    const struct vk_dynamic_graphics_state *dyn =
1994       &cmd->vk.dynamic_graphics_state;
1995 
1996    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_RASTERIZER_DISCARD_ENABLE))
1997       P_IMMD(p, NV9097, SET_RASTER_ENABLE, !dyn->rs.rasterizer_discard_enable);
1998 
1999    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_CLIP_ENABLE) ||
2000        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_CLAMP_ENABLE)) {
2001       const bool z_clamp = dyn->rs.depth_clamp_enable;
2002       const bool z_clip = vk_rasterization_state_depth_clip_enable(&dyn->rs);
2003       P_IMMD(p, NVC397, SET_VIEWPORT_CLIP_CONTROL, {
2004          /* We only set Z clip range if clamp is requested.  Otherwise, we
2005           * leave it set to -/+INF and clamp using the guardband below.
2006           */
2007          .min_z_zero_max_z_one = MIN_Z_ZERO_MAX_Z_ONE_FALSE,
2008          .z_clip_range = nvk_cmd_buffer_3d_cls(cmd) >= VOLTA_A
2009                          ? (z_clamp ? Z_CLIP_RANGE_MIN_Z_MAX_Z
2010                                     : Z_CLIP_RANGE_MINUS_INF_PLUS_INF)
2011                          : Z_CLIP_RANGE_USE_FIELD_MIN_Z_ZERO_MAX_Z_ONE,
2012 
2013          .pixel_min_z = PIXEL_MIN_Z_CLAMP,
2014          .pixel_max_z = PIXEL_MAX_Z_CLAMP,
2015 
2016          .geometry_guardband = GEOMETRY_GUARDBAND_SCALE_256,
2017          .line_point_cull_guardband = LINE_POINT_CULL_GUARDBAND_SCALE_256,
2018          .geometry_clip = z_clip ? GEOMETRY_CLIP_FRUSTUM_XYZ_CLIP
2019                                  : GEOMETRY_CLIP_FRUSTUM_XY_CLIP,
2020 
2021          /* We clip depth with the geometry clipper to ensure that it gets
2022           * clipped before depth bias is applied.  If we leave it up to the
2023          * rasterizer clipper (pixel_min/max_z = CLIP), it will clip too late
2024           * in the pipeline.  This can be seen in two different ways:
2025           *
2026           *  - When depth bias is enabled, the bias is applied post-clipping.
2027           *    If we clip in the rasterizer, it will clip according to the
2028           *    post-bias depth which is wrong.
2029           *
2030           *  - If the fragment shader overrides the depth by writing to
2031           *    gl_FragDepth, it should be clipped according to the original
2032          *    geometry, not according to gl_FragDepth.
2033           *
2034           * In order to always get the geometry clipper, we need to set a
2035           * tight guardband (geometry_guardband_z = SCALE_1).
2036           */
2037          .geometry_guardband_z = z_clip ? GEOMETRY_GUARDBAND_Z_SCALE_1
2038                                         : GEOMETRY_GUARDBAND_Z_SCALE_256,
2039       });
2040 
2041       /* Pre-Volta, we don't have SET_VIEWPORT_CLIP_CONTROL::z_clip_range.
2042        * Instead, we have to emulate it by smashing VIEWPORT_CLIP_MIN/MAX_Z
2043        * based on whether or not z_clamp is set. This is done by a pair of
2044        * macros, one of which is called here and the other is called in
2045        * viewport setup.
2046        */
2047       if (nvk_cmd_buffer_3d_cls(cmd) < VOLTA_A) {
2048          P_1INC(p, NVB197, CALL_MME_MACRO(NVK_MME_SET_Z_CLAMP));
2049          P_INLINE_DATA(p, z_clamp);
2050       }
2051    }
2052 
2053    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_POLYGON_MODE)) {
2054       uint32_t polygon_mode = vk_to_nv9097_polygon_mode(dyn->rs.polygon_mode);
2055       P_MTHD(p, NV9097, SET_FRONT_POLYGON_MODE);
2056       P_NV9097_SET_FRONT_POLYGON_MODE(p, polygon_mode);
2057       P_NV9097_SET_BACK_POLYGON_MODE(p, polygon_mode);
2058    }
2059 
2060    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_CULL_MODE)) {
2061       P_IMMD(p, NV9097, OGL_SET_CULL, dyn->rs.cull_mode != VK_CULL_MODE_NONE);
2062 
2063       if (dyn->rs.cull_mode != VK_CULL_MODE_NONE) {
2064          uint32_t face = vk_to_nv9097_cull_mode(dyn->rs.cull_mode);
2065          P_IMMD(p, NV9097, OGL_SET_CULL_FACE, face);
2066       }
2067    }
2068 
2069    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_FRONT_FACE)) {
2070       P_IMMD(p, NV9097, OGL_SET_FRONT_FACE,
2071          vk_to_nv9097_front_face(dyn->rs.front_face));
2072    }
2073 
2074    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_PROVOKING_VERTEX)) {
2075       P_IMMD(p, NV9097, SET_PROVOKING_VERTEX,
2076              vk_to_nv9097_provoking_vertex(dyn->rs.provoking_vertex));
2077    }
2078 
2079    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_ENABLE)) {
2080       P_MTHD(p, NV9097, SET_POLY_OFFSET_POINT);
2081       P_NV9097_SET_POLY_OFFSET_POINT(p, dyn->rs.depth_bias.enable);
2082       P_NV9097_SET_POLY_OFFSET_LINE(p, dyn->rs.depth_bias.enable);
2083       P_NV9097_SET_POLY_OFFSET_FILL(p, dyn->rs.depth_bias.enable);
2084    }
2085 
2086    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS)) {
2087       switch (dyn->rs.depth_bias.representation) {
2088       case VK_DEPTH_BIAS_REPRESENTATION_LEAST_REPRESENTABLE_VALUE_FORMAT_EXT:
2089          P_IMMD(p, NV9097, SET_DEPTH_BIAS_CONTROL,
2090                 DEPTH_FORMAT_DEPENDENT_TRUE);
2091          break;
2092       case VK_DEPTH_BIAS_REPRESENTATION_LEAST_REPRESENTABLE_VALUE_FORCE_UNORM_EXT:
2093          P_IMMD(p, NV9097, SET_DEPTH_BIAS_CONTROL,
2094                 DEPTH_FORMAT_DEPENDENT_FALSE);
2095          break;
2096       case VK_DEPTH_BIAS_REPRESENTATION_FLOAT_EXT:
2097       default:
2098          unreachable("Unsupported depth bias representation");
2099       }
2100       /* TODO: The blob multiplies by 2 for some reason. We don't. */
2101       P_IMMD(p, NV9097, SET_DEPTH_BIAS, fui(dyn->rs.depth_bias.constant));
2102       P_IMMD(p, NV9097, SET_SLOPE_SCALE_DEPTH_BIAS, fui(dyn->rs.depth_bias.slope));
2103       P_IMMD(p, NV9097, SET_DEPTH_BIAS_CLAMP, fui(dyn->rs.depth_bias.clamp));
2104    }
2105 
2106    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_WIDTH)) {
2107       P_MTHD(p, NV9097, SET_LINE_WIDTH_FLOAT);
2108       P_NV9097_SET_LINE_WIDTH_FLOAT(p, fui(dyn->rs.line.width));
2109       P_NV9097_SET_ALIASED_LINE_WIDTH_FLOAT(p, fui(dyn->rs.line.width));
2110    }
2111 
2112    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_MODE)) {
2113       switch (dyn->rs.line.mode) {
2114       case VK_LINE_RASTERIZATION_MODE_DEFAULT_KHR:
2115       case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_KHR:
2116          P_IMMD(p, NV9097, SET_LINE_MULTISAMPLE_OVERRIDE, ENABLE_FALSE);
2117          P_IMMD(p, NV9097, SET_ANTI_ALIASED_LINE, ENABLE_FALSE);
2118          break;
2119 
2120       case VK_LINE_RASTERIZATION_MODE_BRESENHAM_KHR:
2121          P_IMMD(p, NV9097, SET_LINE_MULTISAMPLE_OVERRIDE, ENABLE_TRUE);
2122          P_IMMD(p, NV9097, SET_ANTI_ALIASED_LINE, ENABLE_FALSE);
2123          break;
2124 
2125       case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_KHR:
2126          P_IMMD(p, NV9097, SET_LINE_MULTISAMPLE_OVERRIDE, ENABLE_TRUE);
2127          P_IMMD(p, NV9097, SET_ANTI_ALIASED_LINE, ENABLE_TRUE);
2128          break;
2129 
2130       default:
2131          unreachable("Invalid line rasterization mode");
2132       }
2133    }
2134 
2135    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_STIPPLE_ENABLE))
2136       P_IMMD(p, NV9097, SET_LINE_STIPPLE, dyn->rs.line.stipple.enable);
2137 
2138    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_STIPPLE)) {
2139       /* map factor from [1,256] to [0, 255] */
2140       uint32_t stipple_factor = CLAMP(dyn->rs.line.stipple.factor, 1, 256) - 1;
2141       P_IMMD(p, NV9097, SET_LINE_STIPPLE_PARAMETERS, {
2142          .factor  = stipple_factor,
2143          .pattern = dyn->rs.line.stipple.pattern,
2144       });
2145    }
2146 
2147    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_RASTERIZATION_STREAM))
2148       P_IMMD(p, NV9097, SET_RASTER_INPUT, dyn->rs.rasterization_stream);
2149 
2150    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_CONSERVATIVE_MODE) ||
2151        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_EXTRA_PRIMITIVE_OVERESTIMATION_SIZE)) {
2152       if (nvk_cmd_buffer_3d_cls(cmd) < MAXWELL_B) {
2153          assert(dyn->rs.conservative_mode == VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT);
2154       } else if (dyn->rs.conservative_mode == VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT) {
2155          P_IMMD(p, NVB197, SET_CONSERVATIVE_RASTER, ENABLE_FALSE);
2156       } else {
2157          uint32_t extra_overestimate =
2158             MIN2(3, dyn->rs.extra_primitive_overestimation_size * 4);
2159 
2160          if (nvk_cmd_buffer_3d_cls(cmd) >= VOLTA_A) {
2161             P_IMMD(p, NVC397, SET_CONSERVATIVE_RASTER_CONTROL, {
2162                .extra_prim_bloat = extra_overestimate,
2163                .copy_inner_to_outer =
2164                   (dyn->rs.conservative_mode == VK_CONSERVATIVE_RASTERIZATION_MODE_UNDERESTIMATE_EXT),
2165                .triangle_snap_mode = TRIANGLE_SNAP_MODE_MODE_PRE_SNAP,
2166                .line_and_point_snap_mode = LINE_AND_POINT_SNAP_MODE_MODE_PRE_SNAP,
2167                .uncertainty_region_size = UNCERTAINTY_REGION_SIZE_SIZE_512,
2168             });
2169          } else {
2170             P_1INC(p, NVB197, CALL_MME_MACRO(NVK_MME_SET_CONSERVATIVE_RASTER_STATE));
2171             P_INLINE_DATA(p, extra_overestimate << 23);
2172          }
2173          P_IMMD(p, NVB197, SET_CONSERVATIVE_RASTER, ENABLE_TRUE);
2174       }
2175    }
2176 }
2177 
2178 static uint32_t
2179 nvk_mme_anti_alias_init(void)
2180 {
2181    /* This is a valid value, but one we never actually set, so it ensures
2182     * that the macro will actually run the first time we set anything.
2183     */
2184    return 0xf;
2185 }
2186 
2187 static uint32_t
2188 nvk_mme_anti_alias_min_sample_shading(float mss)
2189 {
2190    /* The value we want to compute in the MME is
2191     *
2192     *    passes = next_pow2(samples * minSampleShading)
2193     *
2194     * Since samples is already a power of two,
2195     *
2196     *    passes_log2 = log2_ceil(samples * minSampleShading)
2197     *                = log2_ceil(samples / (1.0 / minSampleShading))
2198     *                = samples_log2 - log2_floor(1.0 / minSampleShading)
2199     *
2200     * if we assume (1.0 / min_sample_shading) >= 1.0.  This last bit is
2201     * something we can compute in the MME as long as the float math on the
2202     * right-hand side happens on the CPU.
2203     */
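   /* Worked example, matching the "8 samples, minSampleShading = 0.5" test
    * below: rcp_mss = 2.0 so rcp_mss_log2 = 1; with samples_log2 = 3 the
    * macro computes passes_log2 = 2 and programs 4 passes.
    */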
2204    float rcp_mss = CLAMP(1.0 / mss, 1.0f, 16.0f);
2205    uint32_t rcp_mss_log2 = util_logbase2(floorf(rcp_mss));
2206 
2207    assert(rcp_mss_log2 != nvk_mme_anti_alias_init());
2208 
2209    return nvk_mme_val_mask(rcp_mss_log2 << 0, 0x000f);
2210 }
2211 
2212 static uint32_t
2213 nvk_mme_anti_alias_samples(uint32_t samples)
2214 {
2215    assert(util_is_power_of_two_or_zero(samples));
2216    const uint32_t samples_log2 = util_logbase2(MAX2(1, samples));
2217 
2218    return nvk_mme_val_mask(samples_log2 << 4, 0x00f0);
2219 }
2220 
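/* The ANTI_ALIAS scratch packs rcp_mss_log2 in bits 0..3 (from
 * nvk_mme_anti_alias_min_sample_shading()) and samples_log2 in bits 4..7
 * (from nvk_mme_anti_alias_samples()).  Whenever either half changes, this
 * macro recomputes the pass count and the per-sample masks.
 */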
2221 void
2222 nvk_mme_set_anti_alias(struct mme_builder *b)
2223 {
2224    struct mme_value val_mask = mme_load(b);
2225    struct mme_value old_anti_alias = nvk_mme_load_scratch(b, ANTI_ALIAS);
2226    struct mme_value anti_alias =
2227       nvk_mme_set_masked(b, old_anti_alias, val_mask);
2228    mme_free_reg(b, val_mask);
2229 
2230    mme_if(b, ine, anti_alias, old_anti_alias) {
2231       mme_free_reg(b, old_anti_alias);
2232       nvk_mme_store_scratch(b, ANTI_ALIAS, anti_alias);
2233 
2234       struct mme_value rcp_mss_log2 =
2235          mme_merge(b, mme_zero(), anti_alias, 0, 4, 0);
2236       struct mme_value samples_log2 =
2237          mme_merge(b, mme_zero(), anti_alias, 0, 4, 4);
2238       mme_free_reg(b, anti_alias);
2239 
2240       /* We've already done all the hard work on the CPU in
2241        * nvk_mme_anti_alias_min_sample_shading().  All we have to do here is add the two
2242        * log2 values and clamp so we don't get negative.
2243        */
2244       struct mme_value passes_log2 = mme_sub(b, samples_log2, rcp_mss_log2);
2245       mme_free_reg(b, rcp_mss_log2);
2246 
2247       /* passes = MAX(passes, 1) */
2248       struct mme_value neg = mme_srl(b, passes_log2, mme_imm(31));
2249       mme_if(b, ine, neg, mme_zero()) {
2250          mme_mov_to(b, passes_log2, mme_zero());
2251       }
2252       mme_free_reg(b, neg);
2253 
2254       /*
2255        * NV9097_SET_HYBRID_ANTI_ALIAS_CONTROL {
2256        *    ...
2257        *    .centroid = passes > 1 ? CENTROID_PER_PASS
2258        *                           : CENTROID_PER_FRAGMENT,
2259        * }
2260        */
2261       struct mme_value aac = mme_mov(b,
2262          mme_imm(NV9097_SET_HYBRID_ANTI_ALIAS_CONTROL_CENTROID_PER_FRAGMENT
2263                  << DRF_LO(NV9097_SET_HYBRID_ANTI_ALIAS_CONTROL_CENTROID)));
2264       mme_if(b, ine, passes_log2, mme_zero()) {
2265          mme_mov_to(b, aac,
2266             mme_imm(NV9097_SET_HYBRID_ANTI_ALIAS_CONTROL_CENTROID_PER_PASS
2267                     << DRF_LO(NV9097_SET_HYBRID_ANTI_ALIAS_CONTROL_CENTROID)));
2268       }
2269 
2270       struct mme_value passes = mme_sll(b, mme_imm(1), passes_log2);
2271       mme_merge_to(b, aac, aac, passes, 0, 4, 0);
2272       mme_free_reg(b, passes);
2273 
2274       mme_mthd(b, NV9097_SET_HYBRID_ANTI_ALIAS_CONTROL);
2275       mme_emit(b, aac);
2276       mme_free_reg(b, aac);
2277 
2278       /* Now we need to emit sample masks per-sample:
2279        *
2280        *    struct nak_sample_mask push_sm[NVK_MAX_SAMPLES];
2281        *    uint32_t samples_per_pass = samples / passes;
2282        *    uint32_t sample_mask = BITFIELD_MASK(samples_per_pass);
2283        *    for (uint32_t s = 0; s < NVK_MAX_SAMPLES;) {
2284        *       push_sm[s] = (struct nak_sample_mask) {
2285        *          .sample_mask = sample_mask,
2286        *       };
2287        *
2288        *       s++;
2289        *
2290        *       if (s % samples_per_pass == 0)
2291        *          sample_mask <<= samples_per_pass;
2292        *    }
2293        *
2294        * Annoyingly, we have to pack these in pairs
2295        */
2296       STATIC_ASSERT(sizeof(struct nak_sample_mask) == 2);
2297 
2298       mme_mthd(b, NV9097_LOAD_CONSTANT_BUFFER_OFFSET);
2299       mme_emit(b, mme_imm(nvk_root_descriptor_offset(draw.sample_masks)));
2300       mme_mthd(b, NV9097_LOAD_CONSTANT_BUFFER(0));
2301 
2304       struct mme_value samples_per_pass_log2 =
2305          mme_sub(b, samples_log2, passes_log2);
2306       mme_free_reg(b, samples_log2);
2307       mme_free_reg(b, passes_log2);
2308 
2309       mme_if(b, ieq, samples_per_pass_log2, mme_zero()) {
2310          /* One sample per pass, we can just blast it out */
2311          for (uint32_t i = 0; i < NVK_MAX_SAMPLES; i += 2) {
2312             uint32_t mask0 = 1 << i;
2313             uint32_t mask1 = 1 << (i + 1);
2314             mme_emit(b, mme_imm(mask0 | (mask1 << 16)));
2315          }
2316       }
2317 
2318       mme_if(b, ine, samples_per_pass_log2, mme_zero()) {
2319          struct mme_value samples_per_pass =
2320             mme_sll(b, mme_imm(1), samples_per_pass_log2);
2321 
2322          /* sample_mask = (1 << samples_per_pass) - 1 */
2323          struct mme_value sample_mask =
2324             mme_sll(b, mme_imm(1), samples_per_pass);
2325          mme_sub_to(b, sample_mask, sample_mask, mme_imm(1));
2326 
2327          struct mme_value mod_mask = mme_sub(b, samples_per_pass, mme_imm(1));
2328 
2329          struct mme_value s = mme_mov(b, mme_zero());
2330          mme_while(b, ine, s, mme_imm(NVK_MAX_SAMPLES)) {
2331             /* Since samples_per_pass >= 2, we know that both masks in the pair
2332              * will be the same.
2333              */
2334             struct mme_value packed =
2335                mme_merge(b, sample_mask, sample_mask, 16, 16, 0);
2336             mme_emit(b, packed);
2337             mme_free_reg(b, packed);
2338 
2339             mme_add_to(b, s, s, mme_imm(2));
2340 
2341             /* if (s % samples_per_pass == 0) */
2342             struct mme_value mod = mme_and(b, s, mod_mask);
2343             mme_if(b, ieq, mod, mme_zero()) {
2344                mme_sll_to(b, sample_mask, sample_mask, samples_per_pass);
2345             }
2346          }
2347       }
2348    }
2349 }
2350 
2351 const struct nvk_mme_test_case nvk_mme_set_anti_alias_tests[] = {{
2352    /* This case doesn't change the state so it should do nothing */
2353    .init = (struct nvk_mme_mthd_data[]) {
2354       { NVK_SET_MME_SCRATCH(ANTI_ALIAS), 0 },
2355       { }
2356    },
2357    .params = (uint32_t[]) { 0xffff0000 },
2358    .expected = (struct nvk_mme_mthd_data[]) {
2359       { }
2360    },
2361 }, {
2362    /* Single sample, minSampleShading = 1.0 */
2363    .init = (struct nvk_mme_mthd_data[]) {
2364       { NVK_SET_MME_SCRATCH(ANTI_ALIAS), 0xf },
2365       { }
2366    },
2367    .params = (uint32_t[]) { 0xffff0000 },
2368    .expected = (struct nvk_mme_mthd_data[]) {
2369       { NVK_SET_MME_SCRATCH(ANTI_ALIAS), 0 },
2370       { NV9097_SET_HYBRID_ANTI_ALIAS_CONTROL, 0x1 },
2371       { NV9097_LOAD_CONSTANT_BUFFER_OFFSET,
2372         nvk_root_descriptor_offset(draw.sample_masks) },
2373       { NV9097_LOAD_CONSTANT_BUFFER(0), 0x020001 },
2374       { NV9097_LOAD_CONSTANT_BUFFER(1), 0x080004 },
2375       { NV9097_LOAD_CONSTANT_BUFFER(2), 0x200010 },
2376       { NV9097_LOAD_CONSTANT_BUFFER(3), 0x800040 },
2377       { }
2378    },
2379 }, {
2380    /* Single sample, minSampleShading = 0.25 */
2381    .init = (struct nvk_mme_mthd_data[]) {
2382       { NVK_SET_MME_SCRATCH(ANTI_ALIAS), 0xf },
2383       { }
2384    },
2385    .params = (uint32_t[]) { 0xffff0002 },
2386    .expected = (struct nvk_mme_mthd_data[]) {
2387       { NVK_SET_MME_SCRATCH(ANTI_ALIAS), 0x2 },
2388       { NV9097_SET_HYBRID_ANTI_ALIAS_CONTROL, 0x1 },
2389       { NV9097_LOAD_CONSTANT_BUFFER_OFFSET,
2390         nvk_root_descriptor_offset(draw.sample_masks) },
2391       { NV9097_LOAD_CONSTANT_BUFFER(0), 0x020001 },
2392       { NV9097_LOAD_CONSTANT_BUFFER(1), 0x080004 },
2393       { NV9097_LOAD_CONSTANT_BUFFER(2), 0x200010 },
2394       { NV9097_LOAD_CONSTANT_BUFFER(3), 0x800040 },
2395       { }
2396    },
2397 }, {
2398    /* 8 samples, minSampleShading = 0.5 */
2399    .init = (struct nvk_mme_mthd_data[]) {
2400       { NVK_SET_MME_SCRATCH(ANTI_ALIAS), 0x1 },
2401       { }
2402    },
2403    .params = (uint32_t[]) { 0x00f00030 },
2404    .expected = (struct nvk_mme_mthd_data[]) {
2405       { NVK_SET_MME_SCRATCH(ANTI_ALIAS), 0x31 },
2406       { NV9097_SET_HYBRID_ANTI_ALIAS_CONTROL, 0x14 },
2407       { NV9097_LOAD_CONSTANT_BUFFER_OFFSET,
2408         nvk_root_descriptor_offset(draw.sample_masks) },
2409       { NV9097_LOAD_CONSTANT_BUFFER(0), 0x030003 },
2410       { NV9097_LOAD_CONSTANT_BUFFER(1), 0x0c000c },
2411       { NV9097_LOAD_CONSTANT_BUFFER(2), 0x300030 },
2412       { NV9097_LOAD_CONSTANT_BUFFER(3), 0xc000c0 },
2413       { }
2414    },
2415 }, {
2416    /* 8 samples, minSampleShading = 0.25 */
2417    .init = (struct nvk_mme_mthd_data[]) {
2418       { NVK_SET_MME_SCRATCH(ANTI_ALIAS), 0x30 },
2419       { }
2420    },
2421    .params = (uint32_t[]) { 0x000f0002 },
2422    .expected = (struct nvk_mme_mthd_data[]) {
2423       { NVK_SET_MME_SCRATCH(ANTI_ALIAS), 0x32 },
2424       { NV9097_SET_HYBRID_ANTI_ALIAS_CONTROL, 0x12 },
2425       { NV9097_LOAD_CONSTANT_BUFFER_OFFSET,
2426         nvk_root_descriptor_offset(draw.sample_masks) },
2427       { NV9097_LOAD_CONSTANT_BUFFER(0), 0x0f000f },
2428       { NV9097_LOAD_CONSTANT_BUFFER(1), 0x0f000f },
2429       { NV9097_LOAD_CONSTANT_BUFFER(2), 0xf000f0 },
2430       { NV9097_LOAD_CONSTANT_BUFFER(3), 0xf000f0 },
2431       { }
2432    },
2433 }, {}};
2434 
2435 static VkSampleLocationEXT
2436 vk_sample_location(const struct vk_sample_locations_state *sl,
2437                    uint32_t x, uint32_t y, uint32_t s)
2438 {
2439    x = x % sl->grid_size.width;
2440    y = y % sl->grid_size.height;
2441 
2442    return sl->locations[(x + y * sl->grid_size.width) * sl->per_pixel + s];
2443 }
2444 
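/* NAK sample locations are unsigned 0.4 fixed-point, i.e. 1/16th-of-a-pixel
 * precision clamped to [0, 15/16].
 */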
2445 static struct nak_sample_location
2446 vk_to_nak_sample_location(VkSampleLocationEXT loc)
2447 {
2448    return (struct nak_sample_location) {
2449       .x_u4 = util_bitpack_ufixed_clamp(loc.x, 0, 3, 4),
2450       .y_u4 = util_bitpack_ufixed_clamp(loc.y, 0, 3, 4),
2451    };
2452 }
2453 
2454 static void
2455 nvk_flush_ms_state(struct nvk_cmd_buffer *cmd)
2456 {
2457    const struct nvk_rendering_state *render = &cmd->state.gfx.render;
2458    const struct vk_dynamic_graphics_state *dyn =
2459       &cmd->vk.dynamic_graphics_state;
2460 
2461    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES)) {
2462       struct nv_push *p = nvk_cmd_buffer_push(cmd, 4);
2463 
2464       /* When we don't have any attachments, we can't know the sample count
2465        * from the render pass so we need to emit SET_ANTI_ALIAS here.  See the
2466        * comment in nvk_BeginRendering() for more details.
2467        */
2468       if (render->samples == 0) {
2469          /* Multisample information MAY be missing (rasterizationSamples == 0)
2470           * if rasterizer discard is enabled.  However, this isn't valid in
2471           * the hardware so always use at least one sample.
2472           */
2473          const uint32_t samples = MAX2(1, dyn->ms.rasterization_samples);
2474          enum nil_sample_layout layout = nil_choose_sample_layout(samples);
2475          P_IMMD(p, NV9097, SET_ANTI_ALIAS, nil_to_nv9097_samples_mode(layout));
2476       } else {
2477          /* Multisample information MAY be missing (rasterizationSamples == 0)
2478           * if rasterizer discard is enabled.
2479           */
2480          assert(dyn->ms.rasterization_samples == 0 ||
2481                 dyn->ms.rasterization_samples == render->samples);
2482       }
2483 
2484       P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_SET_ANTI_ALIAS));
2485       P_INLINE_DATA(p,
2486          nvk_mme_anti_alias_samples(dyn->ms.rasterization_samples));
2487    }
2488 
2489    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_ALPHA_TO_COVERAGE_ENABLE) ||
2490        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_ALPHA_TO_ONE_ENABLE)) {
2491       struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
2492       P_IMMD(p, NV9097, SET_ANTI_ALIAS_ALPHA_CONTROL, {
2493          .alpha_to_coverage = dyn->ms.alpha_to_coverage_enable,
2494          .alpha_to_one      = dyn->ms.alpha_to_one_enable,
2495       });
2496    }
2497 
2498    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES) ||
2499        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_SAMPLE_LOCATIONS) ||
2500        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_SAMPLE_LOCATIONS_ENABLE)) {
2501       const struct vk_sample_locations_state *sl;
2502       if (dyn->ms.sample_locations_enable) {
2503          sl = dyn->ms.sample_locations;
2504       } else {
2505          const uint32_t samples = MAX2(1, dyn->ms.rasterization_samples);
2506          sl = vk_standard_sample_locations_state(samples);
2507       }
2508 
2509       struct nak_sample_location push_sl[NVK_MAX_SAMPLES];
2510       for (uint32_t i = 0; i < sl->per_pixel; i++)
2511          push_sl[i] = vk_to_nak_sample_location(sl->locations[i]);
2512 
2513       nvk_descriptor_state_set_root_array(cmd, &cmd->state.gfx.descriptors,
2514                                           draw.sample_locations,
2515                                           0, NVK_MAX_SAMPLES, push_sl);
2516 
2517       if (nvk_cmd_buffer_3d_cls(cmd) >= MAXWELL_B) {
2518          struct nak_sample_location loc[16];
2519          for (uint32_t n = 0; n < ARRAY_SIZE(loc); n++) {
2520             const uint32_t s = n % sl->per_pixel;
2521             const uint32_t px = n / sl->per_pixel;
2522             const uint32_t x = px % 2;
2523             const uint32_t y = px / 2;
2524 
2525             loc[n] = vk_to_nak_sample_location(vk_sample_location(sl, x, y, s));
2526          }
2527 
2528          struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
2529 
2530          P_MTHD(p, NVB197, SET_ANTI_ALIAS_SAMPLE_POSITIONS(0));
2531          for (uint32_t i = 0; i < 4; i++) {
2532             P_NVB197_SET_ANTI_ALIAS_SAMPLE_POSITIONS(p, i, {
2533                .x0 = loc[i * 4 + 0].x_u4,
2534                .y0 = loc[i * 4 + 0].y_u4,
2535                .x1 = loc[i * 4 + 1].x_u4,
2536                .y1 = loc[i * 4 + 1].y_u4,
2537                .x2 = loc[i * 4 + 2].x_u4,
2538                .y2 = loc[i * 4 + 2].y_u4,
2539                .x3 = loc[i * 4 + 3].x_u4,
2540                .y3 = loc[i * 4 + 3].y_u4,
2541             });
2542          }
2543       }
2544    }
2545 
2546    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_SAMPLE_MASK)) {
2547       struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
2548       P_MTHD(p, NV9097, SET_SAMPLE_MASK_X0_Y0);
2549       P_NV9097_SET_SAMPLE_MASK_X0_Y0(p, dyn->ms.sample_mask & 0xffff);
2550       P_NV9097_SET_SAMPLE_MASK_X1_Y0(p, dyn->ms.sample_mask & 0xffff);
2551       P_NV9097_SET_SAMPLE_MASK_X0_Y1(p, dyn->ms.sample_mask & 0xffff);
2552       P_NV9097_SET_SAMPLE_MASK_X1_Y1(p, dyn->ms.sample_mask & 0xffff);
2553    }
2554 }
2555 
2556 static uint32_t
2557 vk_to_nv9097_compare_op(VkCompareOp vk_op)
2558 {
2559    ASSERTED static const uint16_t vk_to_nv9097[] = {
2560       [VK_COMPARE_OP_NEVER]            = NV9097_SET_DEPTH_FUNC_V_OGL_NEVER,
2561       [VK_COMPARE_OP_LESS]             = NV9097_SET_DEPTH_FUNC_V_OGL_LESS,
2562       [VK_COMPARE_OP_EQUAL]            = NV9097_SET_DEPTH_FUNC_V_OGL_EQUAL,
2563       [VK_COMPARE_OP_LESS_OR_EQUAL]    = NV9097_SET_DEPTH_FUNC_V_OGL_LEQUAL,
2564       [VK_COMPARE_OP_GREATER]          = NV9097_SET_DEPTH_FUNC_V_OGL_GREATER,
2565       [VK_COMPARE_OP_NOT_EQUAL]        = NV9097_SET_DEPTH_FUNC_V_OGL_NOTEQUAL,
2566       [VK_COMPARE_OP_GREATER_OR_EQUAL] = NV9097_SET_DEPTH_FUNC_V_OGL_GEQUAL,
2567       [VK_COMPARE_OP_ALWAYS]           = NV9097_SET_DEPTH_FUNC_V_OGL_ALWAYS,
2568    };
2569    assert(vk_op < ARRAY_SIZE(vk_to_nv9097));
2570 
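   /* The hardware depth funcs are the classic GL compare tokens
    * (OGL_NEVER = 0x0200 through OGL_ALWAYS = 0x0207) and VkCompareOp uses
    * the same ordering from zero, so ORing in 0x200 is enough; the assert
    * validates the trick against the table.
    */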
2571    uint32_t nv9097_op = 0x200 | vk_op;
2572    assert(nv9097_op == vk_to_nv9097[vk_op]);
2573    return nv9097_op;
2574 }
2575 
2576 static uint32_t
2577 vk_to_nv9097_stencil_op(VkStencilOp vk_op)
2578 {
2579 #define OP(vk, nv) [VK_STENCIL_OP_##vk] = NV9097_SET_STENCIL_OP_FAIL_V_##nv
2580    ASSERTED static const uint16_t vk_to_nv9097[] = {
2581       OP(KEEP,                D3D_KEEP),
2582       OP(ZERO,                D3D_ZERO),
2583       OP(REPLACE,             D3D_REPLACE),
2584       OP(INCREMENT_AND_CLAMP, D3D_INCRSAT),
2585       OP(DECREMENT_AND_CLAMP, D3D_DECRSAT),
2586       OP(INVERT,              D3D_INVERT),
2587       OP(INCREMENT_AND_WRAP,  D3D_INCR),
2588       OP(DECREMENT_AND_WRAP,  D3D_DECR),
2589    };
2590    assert(vk_op < ARRAY_SIZE(vk_to_nv9097));
2591 #undef OP
2592 
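   /* The D3D-style stencil ops are 1-based in the same order as VkStencilOp,
    * hence the +1; the assert validates it against the table.
    */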
2593    uint32_t nv9097_op = vk_op + 1;
2594    assert(nv9097_op == vk_to_nv9097[vk_op]);
2595    return nv9097_op;
2596 }
2597 
2598 static void
2599 nvk_flush_ds_state(struct nvk_cmd_buffer *cmd)
2600 {
2601    struct nv_push *p = nvk_cmd_buffer_push(cmd, 35);
2602 
2603    const struct nvk_rendering_state *render = &cmd->state.gfx.render;
2604    const struct vk_dynamic_graphics_state *dyn =
2605       &cmd->vk.dynamic_graphics_state;
2606 
2607    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_TEST_ENABLE)) {
2608       bool enable = dyn->ds.depth.test_enable &&
2609                     render->depth_att.vk_format != VK_FORMAT_UNDEFINED;
2610       P_IMMD(p, NV9097, SET_DEPTH_TEST, enable);
2611    }
2612 
2613    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_WRITE_ENABLE)) {
2614       bool enable = dyn->ds.depth.write_enable &&
2615                     render->depth_att.vk_format != VK_FORMAT_UNDEFINED;
2616       P_IMMD(p, NV9097, SET_DEPTH_WRITE, enable);
2617    }
2618 
2619    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_COMPARE_OP)) {
2620       const uint32_t func = vk_to_nv9097_compare_op(dyn->ds.depth.compare_op);
2621       P_IMMD(p, NV9097, SET_DEPTH_FUNC, func);
2622    }
2623 
2624    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE)) {
2625       bool enable = dyn->ds.depth.bounds_test.enable &&
2626                     render->depth_att.vk_format != VK_FORMAT_UNDEFINED;
2627       P_IMMD(p, NV9097, SET_DEPTH_BOUNDS_TEST, enable);
2628    }
2629 
2630    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_BOUNDS)) {
2631       P_MTHD(p, NV9097, SET_DEPTH_BOUNDS_MIN);
2632       P_NV9097_SET_DEPTH_BOUNDS_MIN(p, fui(dyn->ds.depth.bounds_test.min));
2633       P_NV9097_SET_DEPTH_BOUNDS_MAX(p, fui(dyn->ds.depth.bounds_test.max));
2634    }
2635 
2636    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE)) {
2637       bool enable = dyn->ds.stencil.test_enable &&
2638                     render->stencil_att.vk_format != VK_FORMAT_UNDEFINED;
2639       P_IMMD(p, NV9097, SET_STENCIL_TEST, enable);
2640    }
2641 
2642    const struct vk_stencil_test_face_state *front = &dyn->ds.stencil.front;
2643    const struct vk_stencil_test_face_state *back = &dyn->ds.stencil.back;
2644    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_OP)) {
2645       P_MTHD(p, NV9097, SET_STENCIL_OP_FAIL);
2646       P_NV9097_SET_STENCIL_OP_FAIL(p, vk_to_nv9097_stencil_op(front->op.fail));
2647       P_NV9097_SET_STENCIL_OP_ZFAIL(p, vk_to_nv9097_stencil_op(front->op.depth_fail));
2648       P_NV9097_SET_STENCIL_OP_ZPASS(p, vk_to_nv9097_stencil_op(front->op.pass));
2649       P_NV9097_SET_STENCIL_FUNC(p, vk_to_nv9097_compare_op(front->op.compare));
2650 
2651       P_MTHD(p, NV9097, SET_BACK_STENCIL_OP_FAIL);
2652       P_NV9097_SET_BACK_STENCIL_OP_FAIL(p, vk_to_nv9097_stencil_op(back->op.fail));
2653       P_NV9097_SET_BACK_STENCIL_OP_ZFAIL(p, vk_to_nv9097_stencil_op(back->op.depth_fail));
2654       P_NV9097_SET_BACK_STENCIL_OP_ZPASS(p, vk_to_nv9097_stencil_op(back->op.pass));
2655       P_NV9097_SET_BACK_STENCIL_FUNC(p, vk_to_nv9097_compare_op(back->op.compare));
2656    }
2657 
2658    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK)) {
2659       P_IMMD(p, NV9097, SET_STENCIL_FUNC_MASK, front->compare_mask);
2660       P_IMMD(p, NV9097, SET_BACK_STENCIL_FUNC_MASK, back->compare_mask);
2661    }
2662 
2663    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK)) {
2664       P_IMMD(p, NV9097, SET_STENCIL_MASK, front->write_mask);
2665       P_IMMD(p, NV9097, SET_BACK_STENCIL_MASK, back->write_mask);
2666    }
2667 
2668    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE)) {
2669       P_IMMD(p, NV9097, SET_STENCIL_FUNC_REF, front->reference);
2670       P_IMMD(p, NV9097, SET_BACK_STENCIL_FUNC_REF, back->reference);
2671    }
2672 }
2673 
2674 static uint32_t
2675 vk_to_nv9097_logic_op(VkLogicOp vk_op)
2676 {
2677    ASSERTED uint16_t vk_to_nv9097[] = {
2678       [VK_LOGIC_OP_CLEAR]           = NV9097_SET_LOGIC_OP_FUNC_V_CLEAR,
2679       [VK_LOGIC_OP_AND]             = NV9097_SET_LOGIC_OP_FUNC_V_AND,
2680       [VK_LOGIC_OP_AND_REVERSE]     = NV9097_SET_LOGIC_OP_FUNC_V_AND_REVERSE,
2681       [VK_LOGIC_OP_COPY]            = NV9097_SET_LOGIC_OP_FUNC_V_COPY,
2682       [VK_LOGIC_OP_AND_INVERTED]    = NV9097_SET_LOGIC_OP_FUNC_V_AND_INVERTED,
2683       [VK_LOGIC_OP_NO_OP]           = NV9097_SET_LOGIC_OP_FUNC_V_NOOP,
2684       [VK_LOGIC_OP_XOR]             = NV9097_SET_LOGIC_OP_FUNC_V_XOR,
2685       [VK_LOGIC_OP_OR]              = NV9097_SET_LOGIC_OP_FUNC_V_OR,
2686       [VK_LOGIC_OP_NOR]             = NV9097_SET_LOGIC_OP_FUNC_V_NOR,
2687       [VK_LOGIC_OP_EQUIVALENT]      = NV9097_SET_LOGIC_OP_FUNC_V_EQUIV,
2688       [VK_LOGIC_OP_INVERT]          = NV9097_SET_LOGIC_OP_FUNC_V_INVERT,
2689       [VK_LOGIC_OP_OR_REVERSE]      = NV9097_SET_LOGIC_OP_FUNC_V_OR_REVERSE,
2690       [VK_LOGIC_OP_COPY_INVERTED]   = NV9097_SET_LOGIC_OP_FUNC_V_COPY_INVERTED,
2691       [VK_LOGIC_OP_OR_INVERTED]     = NV9097_SET_LOGIC_OP_FUNC_V_OR_INVERTED,
2692       [VK_LOGIC_OP_NAND]            = NV9097_SET_LOGIC_OP_FUNC_V_NAND,
2693       [VK_LOGIC_OP_SET]             = NV9097_SET_LOGIC_OP_FUNC_V_SET,
2694    };
2695    assert(vk_op < ARRAY_SIZE(vk_to_nv9097));
2696 
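   /* The NV9097 logic op funcs are the GL logic-op enums (GL_CLEAR == 0x1500
    * through GL_SET == 0x150F) and VkLogicOp uses the same ordering starting
    * at 0, so ORing with 0x1500 is enough; the table above only backs the
    * debug assert.
    */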
2697    uint32_t nv9097_op = 0x1500 | vk_op;
2698    assert(nv9097_op == vk_to_nv9097[vk_op]);
2699    return nv9097_op;
2700 }
2701 
2702 static uint32_t
2703 vk_to_nv9097_blend_op(VkBlendOp vk_op)
2704 {
2705 #define OP(vk, nv) [VK_BLEND_OP_##vk] = NV9097_SET_BLEND_COLOR_OP_V_OGL_##nv
2706    ASSERTED uint16_t vk_to_nv9097[] = {
2707       OP(ADD,              FUNC_ADD),
2708       OP(SUBTRACT,         FUNC_SUBTRACT),
2709       OP(REVERSE_SUBTRACT, FUNC_REVERSE_SUBTRACT),
2710       OP(MIN,              MIN),
2711       OP(MAX,              MAX),
2712    };
2713    assert(vk_op < ARRAY_SIZE(vk_to_nv9097));
2714 #undef OP
2715 
2716    return vk_to_nv9097[vk_op];
2717 }
2718 
2719 static uint32_t
2720 vk_to_nv9097_blend_factor(VkBlendFactor vk_factor)
2721 {
2722 #define FACTOR(vk, nv) [VK_BLEND_FACTOR_##vk] = \
2723    NV9097_SET_BLEND_COLOR_SOURCE_COEFF_V_##nv
2724    ASSERTED uint16_t vk_to_nv9097[] = {
2725       FACTOR(ZERO,                     OGL_ZERO),
2726       FACTOR(ONE,                      OGL_ONE),
2727       FACTOR(SRC_COLOR,                OGL_SRC_COLOR),
2728       FACTOR(ONE_MINUS_SRC_COLOR,      OGL_ONE_MINUS_SRC_COLOR),
2729       FACTOR(DST_COLOR,                OGL_DST_COLOR),
2730       FACTOR(ONE_MINUS_DST_COLOR,      OGL_ONE_MINUS_DST_COLOR),
2731       FACTOR(SRC_ALPHA,                OGL_SRC_ALPHA),
2732       FACTOR(ONE_MINUS_SRC_ALPHA,      OGL_ONE_MINUS_SRC_ALPHA),
2733       FACTOR(DST_ALPHA,                OGL_DST_ALPHA),
2734       FACTOR(ONE_MINUS_DST_ALPHA,      OGL_ONE_MINUS_DST_ALPHA),
2735       FACTOR(CONSTANT_COLOR,           OGL_CONSTANT_COLOR),
2736       FACTOR(ONE_MINUS_CONSTANT_COLOR, OGL_ONE_MINUS_CONSTANT_COLOR),
2737       FACTOR(CONSTANT_ALPHA,           OGL_CONSTANT_ALPHA),
2738       FACTOR(ONE_MINUS_CONSTANT_ALPHA, OGL_ONE_MINUS_CONSTANT_ALPHA),
2739       FACTOR(SRC_ALPHA_SATURATE,       OGL_SRC_ALPHA_SATURATE),
2740       FACTOR(SRC1_COLOR,               OGL_SRC1COLOR),
2741       FACTOR(ONE_MINUS_SRC1_COLOR,     OGL_INVSRC1COLOR),
2742       FACTOR(SRC1_ALPHA,               OGL_SRC1ALPHA),
2743       FACTOR(ONE_MINUS_SRC1_ALPHA,     OGL_INVSRC1ALPHA),
2744    };
2745    assert(vk_factor < ARRAY_SIZE(vk_to_nv9097));
2746 #undef FACTOR
2747 
2748    return vk_to_nv9097[vk_factor];
2749 }
2750 
2751 void
2752 nvk_mme_set_write_mask(struct mme_builder *b)
2753 {
2754    struct mme_value count = mme_load(b);
2755    struct mme_value mask = mme_load(b);
2756 
2757    /*
2758     * mask is a bit field
2759     *
2760     * attachment index 88887777666655554444333322221111
2761     * component        abgrabgrabgrabgrabgrabgrabgrabgr
2762    */
2763 
2764    struct mme_value common_mask = mme_mov(b, mme_imm(1));
2765    struct mme_value first = mme_and(b, mask, mme_imm(BITFIELD_RANGE(0, 4)));
2766    struct mme_value i = mme_mov(b, mme_zero());
2767 
2768    mme_while(b, ine, i, count) {
2769       /*
2770          We call NV9097_SET_CT_WRITE per attachment. It needs a value as:
2771          0x0000 0000 0000 0000 000a 000b 000g 000r
2772 
2773          So for i=0 a mask of
2774          0x0000 0000 0000 0000 0000 0000 0000 1111
2775          becomes
2776          0x0000 0000 0000 0000 0001 0001 0001 0001
2777       */
2778 
2779       struct mme_value val = mme_merge(b, mme_zero(), mask, 0, 1, 0);
2780       mme_merge_to(b, val, val, mask, 4, 1, 1);
2781       mme_merge_to(b, val, val, mask, 8, 1, 2);
2782       mme_merge_to(b, val, val, mask, 12, 1, 3);
2783 
2784       mme_mthd_arr(b, NV9097_SET_CT_WRITE(0), i);
2785       mme_emit(b, val);
2786       mme_free_reg(b, val);
2787 
2788       /* Check if all masks are common */
2789       struct mme_value temp = mme_and(b, mask, mme_imm(BITFIELD_RANGE(0, 4)));
2790       mme_if(b, ine, first, temp) {
2791          mme_mov_to(b, common_mask, mme_zero());
2792       }
2793       mme_free_reg(b, temp);
2794 
2795       mme_srl_to(b, mask, mask, mme_imm(4));
2796 
2797       mme_add_to(b, i, i, mme_imm(1));
2798    }
2799 
2800    mme_mthd(b, NV9097_SET_SINGLE_CT_WRITE_CONTROL);
2801    mme_emit(b, common_mask);
2802 }
2803 
2804 static void
2805 nvk_flush_cb_state(struct nvk_cmd_buffer *cmd)
2806 {
2807    struct nvk_rendering_state *render = &cmd->state.gfx.render;
2808    const struct vk_dynamic_graphics_state *dyn =
2809       &cmd->vk.dynamic_graphics_state;
2810 
2811    struct nv_push *p =
2812       nvk_cmd_buffer_push(cmd, 15 + 10 * render->color_att_count);
2813 
2814    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_LOGIC_OP_ENABLE))
2815       P_IMMD(p, NV9097, SET_LOGIC_OP, dyn->cb.logic_op_enable);
2816 
2817    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_LOGIC_OP)) {
2818       const uint32_t func = vk_to_nv9097_logic_op(dyn->cb.logic_op);
2819       P_IMMD(p, NV9097, SET_LOGIC_OP_FUNC, func);
2820    }
2821 
2822    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_ENABLES)) {
2823       for (uint8_t a = 0; a < render->color_att_count; a++) {
2824          P_IMMD(p, NV9097, SET_BLEND(a), dyn->cb.attachments[a].blend_enable);
2825       }
2826    }
2827 
2828    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_EQUATIONS)) {
2829       for (uint8_t a = 0; a < render->color_att_count; a++) {
2830          const struct vk_color_blend_attachment_state *att =
2831             &dyn->cb.attachments[a];
2832          P_MTHD(p, NV9097, SET_BLEND_PER_TARGET_SEPARATE_FOR_ALPHA(a));
2833          P_NV9097_SET_BLEND_PER_TARGET_SEPARATE_FOR_ALPHA(p, a, ENABLE_TRUE);
2834          P_NV9097_SET_BLEND_PER_TARGET_COLOR_OP(p, a,
2835                vk_to_nv9097_blend_op(att->color_blend_op));
2836          P_NV9097_SET_BLEND_PER_TARGET_COLOR_SOURCE_COEFF(p, a,
2837                vk_to_nv9097_blend_factor(att->src_color_blend_factor));
2838          P_NV9097_SET_BLEND_PER_TARGET_COLOR_DEST_COEFF(p, a,
2839                vk_to_nv9097_blend_factor(att->dst_color_blend_factor));
2840          P_NV9097_SET_BLEND_PER_TARGET_ALPHA_OP(p, a,
2841                vk_to_nv9097_blend_op(att->alpha_blend_op));
2842          P_NV9097_SET_BLEND_PER_TARGET_ALPHA_SOURCE_COEFF(p, a,
2843                vk_to_nv9097_blend_factor(att->src_alpha_blend_factor));
2844          P_NV9097_SET_BLEND_PER_TARGET_ALPHA_DEST_COEFF(p, a,
2845                vk_to_nv9097_blend_factor(att->dst_alpha_blend_factor));
2846       }
2847    }
2848 
2849    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_WRITE_MASKS) ||
2850        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES) ||
2851        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RP_ATTACHMENTS) ||
2852        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_COLOR_ATTACHMENT_MAP)) {
2853       uint32_t color_write_enables = 0x0;
2854       for (uint8_t a = 0; a < render->color_att_count; a++) {
2855          if (dyn->cb.color_write_enables & BITFIELD_BIT(a))
2856             color_write_enables |= 0xf << (4 * a);
2857       }
2858 
2859       uint32_t cb_att_write_mask = 0x0;
2860       for (uint8_t a = 0; a < render->color_att_count; a++)
2861          cb_att_write_mask |= dyn->cb.attachments[a].write_mask << (a * 4);
2862 
2863       uint32_t rp_att_write_mask = 0x0;
2864       for (uint8_t a = 0; a < MESA_VK_MAX_COLOR_ATTACHMENTS; a++) {
2865          if (dyn->rp.attachments & (MESA_VK_RP_ATTACHMENT_COLOR_0_BIT << a))
2866             rp_att_write_mask |= 0xf << (4 * a);
2867       }
2868 
2869       uint32_t att_has_loc_mask = 0x0;
2870       for (uint8_t a = 0; a < MESA_VK_MAX_COLOR_ATTACHMENTS; a++) {
2871          if (dyn->cal.color_map[a] != MESA_VK_ATTACHMENT_UNUSED)
2872             att_has_loc_mask |= 0xf << (4 * a);
2873       }
2874 
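      /* A channel is only written when it is enabled by the attachment's
       * write mask, by the color-write-enable bit, by the render-pass
       * attachment mask, and it maps to a valid location in the color
       * attachment map, so AND all four masks before handing them to the
       * macro.
       */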
2875       P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_SET_WRITE_MASK));
2876       P_INLINE_DATA(p, render->color_att_count);
2877       P_INLINE_DATA(p, color_write_enables &
2878                        cb_att_write_mask &
2879                        rp_att_write_mask &
2880                        att_has_loc_mask);
2881    }
2882 
2883    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_COLOR_ATTACHMENT_MAP)) {
2884       int8_t loc_att[NVK_MAX_RTS] = { -1, -1, -1, -1, -1, -1, -1, -1};
2885       uint8_t max_loc = 0;
2886       uint32_t att_used = 0;
2887       for (uint8_t a = 0; a < MESA_VK_MAX_COLOR_ATTACHMENTS; a++) {
2888          if (dyn->cal.color_map[a] == MESA_VK_ATTACHMENT_UNUSED)
2889             continue;
2890 
2891          att_used |= BITFIELD_BIT(a);
2892 
2893          assert(dyn->cal.color_map[a] < NVK_MAX_RTS);
2894          loc_att[dyn->cal.color_map[a]] = a;
2895          max_loc = MAX2(max_loc, dyn->cal.color_map[a]);
2896       }
2897 
2898       for (uint8_t l = 0; l < NVK_MAX_RTS; l++) {
2899          if (loc_att[l] >= 0)
2900             continue;
2901 
2902          /* Just grab any color attachment.  The way we set up color targets
2903           * in BeginRenderPass ensures that every color target is either the
2904           * valid color target referenced by this render pass or a valid NULL
2905           * target.  If we end up mapping to some other target in this render
2906           * pass, the handling of att_has_loc_mask above will ensure that no
2907           * color writes actually happen.
2908           */
2909          uint8_t a = ffs(~att_used) - 1;
2910          att_used |= BITFIELD_BIT(a);
2911          loc_att[l] = a;
2912       }
2913 
2914       P_IMMD(p, NV9097, SET_CT_SELECT, {
2915          .target_count = max_loc + 1,
2916          .target0 = loc_att[0],
2917          .target1 = loc_att[1],
2918          .target2 = loc_att[2],
2919          .target3 = loc_att[3],
2920          .target4 = loc_att[4],
2921          .target5 = loc_att[5],
2922          .target6 = loc_att[6],
2923          .target7 = loc_att[7],
2924       });
2925    }
2926 
2927    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS)) {
2928       P_MTHD(p, NV9097, SET_BLEND_CONST_RED);
2929       P_NV9097_SET_BLEND_CONST_RED(p,     fui(dyn->cb.blend_constants[0]));
2930       P_NV9097_SET_BLEND_CONST_GREEN(p,   fui(dyn->cb.blend_constants[1]));
2931       P_NV9097_SET_BLEND_CONST_BLUE(p,    fui(dyn->cb.blend_constants[2]));
2932       P_NV9097_SET_BLEND_CONST_ALPHA(p,   fui(dyn->cb.blend_constants[3]));
2933    }
2934 }
2935 
2936 static void
2937 nvk_flush_dynamic_state(struct nvk_cmd_buffer *cmd)
2938 {
2939    struct vk_dynamic_graphics_state *dyn =
2940       &cmd->vk.dynamic_graphics_state;
2941 
2942    if (!vk_dynamic_graphics_state_any_dirty(dyn))
2943       return;
2944 
2945    nvk_flush_vi_state(cmd);
2946    nvk_flush_ia_state(cmd);
2947    nvk_flush_ts_state(cmd);
2948    nvk_flush_vp_state(cmd);
2949    nvk_flush_rs_state(cmd);
2950 
2951    /* MESA_VK_DYNAMIC_FSR */
2952 
2953    nvk_flush_ms_state(cmd);
2954    nvk_flush_ds_state(cmd);
2955    nvk_flush_cb_state(cmd);
2956 
2957    vk_dynamic_graphics_state_clear_dirty(dyn);
2958 }
2959 
2960 void
2961 nvk_mme_bind_cbuf_desc(struct mme_builder *b)
2962 {
2963    /* The bottom 4 bits are the group, the remaining bits are the slot */
2964    struct mme_value group_slot = mme_load(b);
2965 
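   /* On Turing+, the caller passes the GPU address of the cbuf descriptor
    * and its contents are pulled in through the MME FIFO.  On older hardware
    * the caller pushes the descriptor contents inline in the pushbuf right
    * after the parameters, so plain mme_load() picks them up.
    */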
2966    struct mme_value addr_lo, addr_hi, size;
2967    if (nvk_use_bindless_cbuf(b->devinfo)) {
2968       if (b->devinfo->cls_eng3d >= TURING_A) {
2969          struct mme_value64 addr = mme_load_addr64(b);
2970          mme_tu104_read_fifoed(b, addr, mme_imm(2));
2971       }
2972 
2973       /* Load the descriptor */
2974       struct mme_value desc_lo = mme_load(b);
2975       struct mme_value desc_hi = mme_load(b);
2976 
2977       /* The bottom 45 bits are addr >> 4 */
2978       addr_lo = mme_merge(b, mme_zero(), desc_lo, 4, 28, 0);
2979       addr_hi = mme_merge(b, mme_zero(), desc_lo, 0, 4, 28);
2980       mme_merge_to(b, addr_hi, addr_hi, desc_hi, 4, 13, 0);
2981 
2982       /* The top 19 bits are size >> 4 */
2983       size = mme_merge(b, mme_zero(), desc_hi, 4, 19, 13);
2984 
2985       mme_free_reg(b, desc_hi);
2986       mme_free_reg(b, desc_lo);
2987    } else {
2988       if (b->devinfo->cls_eng3d >= TURING_A) {
2989          struct mme_value64 addr = mme_load_addr64(b);
2990          mme_tu104_read_fifoed(b, addr, mme_imm(3));
2991       }
2992 
2993       /* Load the descriptor */
2994       addr_lo = mme_load(b);
2995       addr_hi = mme_load(b);
2996       size = mme_load(b);
2997    }
2998 
2999    struct mme_value cb = mme_alloc_reg(b);
3000    mme_if(b, ieq, size, mme_zero()) {
3001       /* Bottom bit is the valid bit, 8:4 are shader slot */
3002       mme_merge_to(b, cb, mme_zero(), group_slot, 4, 5, 4);
3003    }
3004 
3005    mme_if(b, ine, size, mme_zero()) {
3006       /* size = min(size, NVK_MAX_CBUF_SIZE) */
3007       assert(util_is_power_of_two_nonzero(NVK_MAX_CBUF_SIZE));
3008       struct mme_value is_large =
3009          mme_and(b, size, mme_imm(~(NVK_MAX_CBUF_SIZE - 1)));
3010       mme_if(b, ine, is_large, mme_zero()) {
3011          mme_mov_to(b, size, mme_imm(NVK_MAX_CBUF_SIZE));
3012       }
3013 
3014       mme_mthd(b, NV9097_SET_CONSTANT_BUFFER_SELECTOR_A);
3015       mme_emit(b, size);
3016       mme_emit(b, addr_hi);
3017       mme_emit(b, addr_lo);
3018 
3019       /* Bottom bit is the valid bit, 8:4 are shader slot */
3020       mme_merge_to(b, cb, mme_imm(1), group_slot, 4, 5, 4);
3021    }
3022 
3023    mme_free_reg(b, addr_hi);
3024    mme_free_reg(b, addr_lo);
3025    mme_free_reg(b, size);
3026 
3027    /* The group comes in the bottom 4 bits in group_slot and we need to
3028     * combine it with the method.  However, unlike most array methods with a
3029     * stride of 1 dword, BIND_GROUP_CONSTANT_BUFFER has a stride of 32B or 8
3030     * dwords.  This means we need to also shift by 3.
3031     */
3032    struct mme_value group = mme_merge(b, mme_imm(0), group_slot, 3, 4, 0);
3033    mme_mthd_arr(b, NV9097_BIND_GROUP_CONSTANT_BUFFER(0), group);
3034    mme_emit(b, cb);
3035 }
3036 
3037 static void
3038 nvk_flush_descriptors(struct nvk_cmd_buffer *cmd)
3039 {
3040    struct nvk_device *dev = nvk_cmd_buffer_device(cmd);
3041    struct nvk_physical_device *pdev = nvk_device_physical(dev);
3042    const uint32_t min_cbuf_alignment = nvk_min_cbuf_alignment(&pdev->info);
3043    struct nvk_descriptor_state *desc = &cmd->state.gfx.descriptors;
3044 
3045    nvk_cmd_buffer_flush_push_descriptors(cmd, desc);
3046 
3047    /* Find cbuf maps for the 5 cbuf groups */
3048    const struct nvk_shader *cbuf_shaders[5] = { NULL, };
3049    for (gl_shader_stage stage = 0; stage < MESA_SHADER_STAGES; stage++) {
3050       const struct nvk_shader *shader = cmd->state.gfx.shaders[stage];
3051       if (shader == NULL)
3052          continue;
3053 
3054       uint32_t group = nvk_cbuf_binding_for_stage(stage);
3055       assert(group < ARRAY_SIZE(cbuf_shaders));
3056       cbuf_shaders[group] = shader;
3057    }
3058 
3059    bool bound_any_cbuf = false;
3060    for (uint32_t g = 0; g < ARRAY_SIZE(cbuf_shaders); g++) {
3061       if (cbuf_shaders[g] == NULL)
3062          continue;
3063 
3064       const struct nvk_shader *shader = cbuf_shaders[g];
3065       const struct nvk_cbuf_map *cbuf_map = &shader->cbuf_map;
3066       struct nvk_cbuf_group *group = &cmd->state.gfx.cbuf_groups[g];
3067 
3068       /* We only bother to re-bind cbufs that are in use */
3069       const uint32_t rebind =
3070          group->dirty & BITFIELD_MASK(cbuf_map->cbuf_count);
3071       if (!rebind)
3072          continue;
3073 
3074       u_foreach_bit(c, rebind) {
3075          const struct nvk_cbuf *cbuf = &group->cbufs[c];
3076 
3077          /* We bind these at the very end */
3078          if (cbuf->type == NVK_CBUF_TYPE_ROOT_DESC)
3079             continue;
3080 
3081          bound_any_cbuf = true;
3082 
3083          struct nvk_buffer_address ba;
3084          if (nvk_cmd_buffer_get_cbuf_addr(cmd, desc, shader, cbuf, &ba)) {
3085             assert(ba.base_addr % min_cbuf_alignment == 0);
3086             ba.size = align(ba.size, min_cbuf_alignment);
3087             ba.size = MIN2(ba.size, NVK_MAX_CBUF_SIZE);
3088 
3089             struct nv_push *p = nvk_cmd_buffer_push(cmd, 6);
3090 
3091             if (ba.size > 0) {
3092                P_MTHD(p, NV9097, SET_CONSTANT_BUFFER_SELECTOR_A);
3093                P_NV9097_SET_CONSTANT_BUFFER_SELECTOR_A(p, ba.size);
3094                P_NV9097_SET_CONSTANT_BUFFER_SELECTOR_B(p, ba.base_addr >> 32);
3095                P_NV9097_SET_CONSTANT_BUFFER_SELECTOR_C(p, ba.base_addr);
3096             }
3097 
3098             P_IMMD(p, NV9097, BIND_GROUP_CONSTANT_BUFFER(g), {
3099                .valid = ba.size > 0,
3100                .shader_slot = c,
3101             });
3102          } else {
3103             uint64_t desc_addr =
3104                nvk_cmd_buffer_get_cbuf_descriptor_addr(cmd, desc, cbuf);
3105 
3106             if (nvk_cmd_buffer_3d_cls(cmd) >= TURING_A) {
3107                struct nv_push *p = nvk_cmd_buffer_push(cmd, 4);
3108 
3109                P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_BIND_CBUF_DESC));
3110                P_INLINE_DATA(p, g | (c << 4));
3111                P_INLINE_DATA(p, desc_addr >> 32);
3112                P_INLINE_DATA(p, desc_addr);
3113             } else {
3114                struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
3115 
3116                P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_BIND_CBUF_DESC));
3117                P_INLINE_DATA(p, g | (c << 4));
3118 
3119                nv_push_update_count(p, 3);
3120                nvk_cmd_buffer_push_indirect(cmd, desc_addr, 12);
3121             }
3122          }
3123       }
3124 
3125       group->dirty &= ~rebind;
3126    }
3127 
3128    /* We bind all root descriptors last so that CONSTANT_BUFFER_SELECTOR is
3129     * always left pointing at the root descriptor table.  This way draw
3130     * parameters and similar MME root table updates always hit the root
3131     * descriptor table and not some random UBO.
3132     */
3133    if (bound_any_cbuf) {
3134       struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
3135       P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_SELECT_CB0));
3136       P_INLINE_DATA(p, 0);
3137    }
3138 }
3139 
3140 static void
3141 nvk_flush_gfx_state(struct nvk_cmd_buffer *cmd)
3142 {
3143    nvk_flush_shaders(cmd);
3144    nvk_flush_dynamic_state(cmd);
3145    nvk_flush_descriptors(cmd);
3146 }
3147 
3148 void
3149 nvk_mme_bind_ib(struct mme_builder *b)
3150 {
3151    struct mme_value64 addr = mme_load_addr64(b);
3152    struct mme_value size_B = mme_load(b);
3153 
3154    struct mme_value addr_or = mme_or(b, addr.lo, addr.hi);
3155    mme_if(b, ieq, addr_or, mme_zero()) {
3156       mme_mov_to(b, size_B, mme_zero());
3157    }
3158    mme_free_reg(b, addr_or);
3159 
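   /* A NULL index buffer comes in as a zero address and is treated as a
    * zero-sized binding.  Turing+ has an explicit size method, so a size of
    * zero is enough; older hardware only has a limit address, so point the
    * binding at the address saved in the ZERO_ADDR scratch registers to keep
    * it valid.
    */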
3160    if (b->devinfo->cls_eng3d < TURING_A) {
3161       mme_if(b, ieq, size_B, mme_zero()) {
3162          nvk_mme_load_scratch_to(b, addr.hi, ZERO_ADDR_HI);
3163          nvk_mme_load_scratch_to(b, addr.lo, ZERO_ADDR_LO);
3164       }
3165    }
3166 
3167    mme_mthd(b, NV9097_SET_INDEX_BUFFER_A);
3168    mme_emit(b, addr.hi);
3169    mme_emit(b, addr.lo);
3170 
3171    if (b->devinfo->cls_eng3d >= TURING_A) {
3172       mme_mthd(b, NVC597_SET_INDEX_BUFFER_SIZE_A);
3173       mme_emit(b, mme_zero());
3174       mme_emit(b, size_B);
3175    } else {
3176       /* Convert to an end address */
3177       mme_add64_to(b, addr, addr, mme_value64(size_B, mme_zero()));
3178       mme_add64_to(b, addr, addr, mme_imm64(-1));
3179 
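      /* No new mme_mthd() is needed here: the emits for SET_INDEX_BUFFER_A
       * above auto-increment the method, so these two land on
       * SET_INDEX_BUFFER_C and _D.
       */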
3180       /* mme_mthd(b, NV9097_SET_INDEX_BUFFER_C); */
3181       mme_emit(b, addr.hi);
3182       mme_emit(b, addr.lo);
3183    }
3184    mme_free_reg64(b, addr);
3185    mme_free_reg(b, size_B);
3186 
3187    struct mme_value fmt = mme_load(b);
3188    struct mme_value restart = mme_mov(b, mme_imm(UINT32_MAX));
3189    struct mme_value index_type = mme_mov(b,
3190       mme_imm(NVC597_SET_INDEX_BUFFER_E_INDEX_SIZE_FOUR_BYTES));
3191 
3192    /* The Vulkan and D3D enums don't overlap so we can handle both at the same
3193     * time with one MME macro.
3194     */
3195    UNUSED static const uint32_t DXGI_FORMAT_R32_UINT = 42;
3196    static const uint32_t DXGI_FORMAT_R16_UINT = 57;
3197    static const uint32_t DXGI_FORMAT_R8_UINT = 62;
3198 
3199    mme_if(b, ieq, fmt, mme_imm(VK_INDEX_TYPE_UINT16)) {
3200       mme_mov_to(b, restart, mme_imm(UINT16_MAX));
3201       mme_mov_to(b, index_type,
3202                  mme_imm(NVC597_SET_INDEX_BUFFER_E_INDEX_SIZE_TWO_BYTES));
3203    }
3204 
3205    mme_if(b, ieq, fmt, mme_imm(DXGI_FORMAT_R16_UINT)) {
3206       mme_mov_to(b, restart, mme_imm(UINT16_MAX));
3207       mme_mov_to(b, index_type,
3208                  mme_imm(NVC597_SET_INDEX_BUFFER_E_INDEX_SIZE_TWO_BYTES));
3209    }
3210 
3211    mme_if(b, ieq, fmt, mme_imm(VK_INDEX_TYPE_UINT8_KHR)) {
3212       mme_mov_to(b, restart, mme_imm(UINT8_MAX));
3213       mme_mov_to(b, index_type,
3214                  mme_imm(NVC597_SET_INDEX_BUFFER_E_INDEX_SIZE_ONE_BYTE));
3215    }
3216 
3217    mme_if(b, ieq, fmt, mme_imm(DXGI_FORMAT_R8_UINT)) {
3218       mme_mov_to(b, restart, mme_imm(UINT8_MAX));
3219       mme_mov_to(b, index_type,
3220                  mme_imm(NVC597_SET_INDEX_BUFFER_E_INDEX_SIZE_ONE_BYTE));
3221    }
3222 
3223    mme_mthd(b, NV9097_SET_DA_PRIMITIVE_RESTART_INDEX);
3224    mme_emit(b, restart);
3225 
3226    mme_mthd(b, NV9097_SET_INDEX_BUFFER_E);
3227    mme_emit(b, index_type);
3228 }
3229 
3230 VKAPI_ATTR void VKAPI_CALL
3231 nvk_CmdBindIndexBuffer2KHR(VkCommandBuffer commandBuffer,
3232                            VkBuffer _buffer,
3233                            VkDeviceSize offset,
3234                            VkDeviceSize size,
3235                            VkIndexType indexType)
3236 {
3237    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
3238    VK_FROM_HANDLE(nvk_buffer, buffer, _buffer);
3239    struct nvk_addr_range addr_range =
3240       nvk_buffer_addr_range(buffer, offset, size);
3241 
3242    struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
3243    P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_BIND_IB));
3244    P_INLINE_DATA(p, addr_range.addr >> 32);
3245    P_INLINE_DATA(p, addr_range.addr);
3246    assert(addr_range.range <= UINT32_MAX);
3247    P_INLINE_DATA(p, addr_range.range);
3248    P_INLINE_DATA(p, indexType);
3249 }
3250 
3251 void
3252 nvk_mme_bind_vb(struct mme_builder *b)
3253 {
3254    struct mme_value vb_idx = mme_load(b);
3255    struct mme_value64 addr = mme_load_addr64(b);
3256    struct mme_value size_B = mme_load(b);
3257 
3258    struct mme_value addr_or = mme_or(b, addr.lo, addr.hi);
3259    mme_if(b, ieq, addr_or, mme_zero()) {
3260       mme_mov_to(b, size_B, mme_zero());
3261    }
3262    mme_free_reg(b, addr_or);
3263 
3264    if (b->devinfo->cls_eng3d < TURING_A) {
3265       mme_if(b, ieq, size_B, mme_zero()) {
3266          nvk_mme_load_scratch_to(b, addr.hi, ZERO_ADDR_HI);
3267          nvk_mme_load_scratch_to(b, addr.lo, ZERO_ADDR_LO);
3268       }
3269    }
3270 
3271    struct mme_value vb_idx4 = mme_sll(b, vb_idx, mme_imm(2));
3272    mme_mthd_arr(b, NV9097_SET_VERTEX_STREAM_A_LOCATION_A(0), vb_idx4);
3273    mme_free_reg(b, vb_idx4);
3274    mme_emit(b, addr.hi);
3275    mme_emit(b, addr.lo);
3276 
3277    if (b->devinfo->cls_eng3d >= TURING_A) {
3278       struct mme_value vb_idx2 = mme_sll(b, vb_idx, mme_imm(1));
3279       mme_mthd_arr(b, NVC597_SET_VERTEX_STREAM_SIZE_A(0), vb_idx2);
3280       mme_emit(b, mme_zero());
3281       mme_emit(b, size_B);
3282    } else {
3283       /* Convert to an end address */
3284       mme_add64_to(b, addr, addr, mme_value64(size_B, mme_zero()));
3285       mme_add64_to(b, addr, addr, mme_imm64(-1));
3286 
3287       struct mme_value vb_idx2 = mme_sll(b, vb_idx, mme_imm(1));
3288       mme_mthd_arr(b, NV9097_SET_VERTEX_STREAM_LIMIT_A_A(0), vb_idx2);
3289       mme_emit(b, addr.hi);
3290       mme_emit(b, addr.lo);
3291    }
3292 }
3293 
3294 static void
3295 nvk_mme_bind_vb_test_check(const struct nv_device_info *devinfo,
3296                            const struct nvk_mme_test_case *test,
3297                            const struct nvk_mme_mthd_data *results)
3298 {
3299    const uint32_t vb_idx = test->params[0];
3300    const uint32_t addr_hi = test->params[1];
3301    const uint32_t addr_lo = test->params[2];
3302 
3303    uint32_t size_B = test->params[3];
3304    if (addr_hi == 0 && addr_lo == 0)
3305       size_B = 0;
3306 
3307    assert(results[0].mthd == NV9097_SET_VERTEX_STREAM_A_LOCATION_A(vb_idx));
3308    assert(results[1].mthd == NV9097_SET_VERTEX_STREAM_A_LOCATION_B(vb_idx));
3309 
3310    if (devinfo->cls_eng3d >= TURING_A) {
3311       assert(results[0].data == addr_hi);
3312       assert(results[1].data == addr_lo);
3313 
3314       assert(results[2].mthd == NVC597_SET_VERTEX_STREAM_SIZE_A(3));
3315       assert(results[3].mthd == NVC597_SET_VERTEX_STREAM_SIZE_B(3));
3316       assert(results[2].data == 0);
3317       assert(results[3].data == size_B);
3318    } else {
3319       uint64_t addr = ((uint64_t)addr_hi << 32) | addr_lo;
3320       if (size_B == 0)
3321          addr = ((uint64_t)test->init[0].data << 32) | test->init[1].data;
3322 
3323       assert(results[0].data == addr >> 32);
3324       assert(results[1].data == (uint32_t)addr);
3325 
3326       const uint64_t limit = (addr + size_B) - 1;
3327       assert(results[2].mthd == NV9097_SET_VERTEX_STREAM_LIMIT_A_A(3));
3328       assert(results[3].mthd == NV9097_SET_VERTEX_STREAM_LIMIT_A_B(3));
3329       assert(results[2].data == limit >> 32);
3330       assert(results[3].data == (uint32_t)limit);
3331    }
3332 }
3333 
3334 const struct nvk_mme_test_case nvk_mme_bind_vb_tests[] = {{
3335    .params = (uint32_t[]) { 3, 0xff3, 0xff4ab000, 0x10000 },
3336    .check = nvk_mme_bind_vb_test_check,
3337 }, {
3338    .init = (struct nvk_mme_mthd_data[]) {
3339       { NVK_SET_MME_SCRATCH(ZERO_ADDR_HI), 0xff3 },
3340       { NVK_SET_MME_SCRATCH(ZERO_ADDR_LO), 0xff356000 },
3341       { }
3342    },
3343    .params = (uint32_t[]) { 3, 0xff3, 0xff4ab000, 0 },
3344    .check = nvk_mme_bind_vb_test_check,
3345 }, {
3346    .init = (struct nvk_mme_mthd_data[]) {
3347       { NVK_SET_MME_SCRATCH(ZERO_ADDR_HI), 0xff3 },
3348       { NVK_SET_MME_SCRATCH(ZERO_ADDR_LO), 0xff356000 },
3349       { }
3350    },
3351    .params = (uint32_t[]) { 3, 0, 0, 0x800 },
3352    .check = nvk_mme_bind_vb_test_check,
3353 }, {}};
3354 
3355 void
3356 nvk_cmd_bind_vertex_buffer(struct nvk_cmd_buffer *cmd, uint32_t vb_idx,
3357                            struct nvk_addr_range addr_range)
3358 {
3359    /* Used for meta save/restore */
3360    if (vb_idx == 0)
3361       cmd->state.gfx.vb0 = addr_range;
3362 
3363    struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
3364    P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_BIND_VB));
3365    P_INLINE_DATA(p, vb_idx);
3366    P_INLINE_DATA(p, addr_range.addr >> 32);
3367    P_INLINE_DATA(p, addr_range.addr);
3368    assert(addr_range.range <= UINT32_MAX);
3369    P_INLINE_DATA(p, addr_range.range);
3370 }
3371 
3372 VKAPI_ATTR void VKAPI_CALL
3373 nvk_CmdBindVertexBuffers2(VkCommandBuffer commandBuffer,
3374                           uint32_t firstBinding,
3375                           uint32_t bindingCount,
3376                           const VkBuffer *pBuffers,
3377                           const VkDeviceSize *pOffsets,
3378                           const VkDeviceSize *pSizes,
3379                           const VkDeviceSize *pStrides)
3380 {
3381    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
3382 
3383    if (pStrides) {
3384       vk_cmd_set_vertex_binding_strides(&cmd->vk, firstBinding,
3385                                         bindingCount, pStrides);
3386    }
3387 
3388    for (uint32_t i = 0; i < bindingCount; i++) {
3389       VK_FROM_HANDLE(nvk_buffer, buffer, pBuffers[i]);
3390       uint32_t idx = firstBinding + i;
3391 
3392       uint64_t size = pSizes ? pSizes[i] : VK_WHOLE_SIZE;
3393       const struct nvk_addr_range addr_range =
3394          nvk_buffer_addr_range(buffer, pOffsets[i], size);
3395 
3396       nvk_cmd_bind_vertex_buffer(cmd, idx, addr_range);
3397    }
3398 }
3399 
3400 static void
3401 nvk_mme_set_cb0_mthd(struct mme_builder *b,
3402                      uint16_t cb0_offset,
3403                      uint16_t mthd,
3404                      struct mme_value val)
3405 {
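   /* Keep the 3D method state and its copy in cb0 (the root descriptor
    * table) in sync.  On Turing+ the current method state can be read back,
    * so both updates are skipped when the value hasn't changed.
    */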
3406    if (b->devinfo->cls_eng3d >= TURING_A) {
3407       struct mme_value old = mme_state(b, mthd);
3408       mme_if(b, ine, old, val) {
3409          mme_mthd(b, mthd);
3410          mme_emit(b, val);
3411 
3412          mme_mthd(b, NV9097_LOAD_CONSTANT_BUFFER_OFFSET);
3413          mme_emit(b, mme_imm(cb0_offset));
3414          mme_mthd(b, NV9097_LOAD_CONSTANT_BUFFER(0));
3415          mme_emit(b, val);
3416       }
3417       mme_free_reg(b, old);
3418    } else {
3419       /* Fermi is really tight on registers. Don't bother with the if and set
3420        * both unconditionally for now.
3421        */
3422       mme_mthd(b, mthd);
3423       mme_emit(b, val);
3424 
3425       mme_mthd(b, NV9097_LOAD_CONSTANT_BUFFER_OFFSET);
3426       mme_emit(b, mme_imm(cb0_offset));
3427       mme_mthd(b, NV9097_LOAD_CONSTANT_BUFFER(0));
3428       mme_emit(b, val);
3429    }
3430 }
3431 
3432 static void
3433 nvk_mme_set_cb0_scratch(struct mme_builder *b,
3434                         uint16_t cb0_offset,
3435                         enum nvk_mme_scratch scratch,
3436                         struct mme_value val)
3437 {
3438    const uint16_t mthd = NV9097_SET_MME_SHADOW_SCRATCH(scratch);
3439    nvk_mme_set_cb0_mthd(b, cb0_offset, mthd, val);
3440 }
3441 
3442 struct mme_draw_params {
3443    struct mme_value base_vertex;
3444    struct mme_value first_vertex;
3445    struct mme_value first_instance;
3446    struct mme_value draw_index;
3447 };
3448 
3449 static void
3450 nvk_mme_build_set_draw_params(struct mme_builder *b,
3451                               const struct mme_draw_params *p)
3452 {
3453    nvk_mme_set_cb0_scratch(b, nvk_root_descriptor_offset(draw.base_vertex),
3454                            NVK_MME_SCRATCH_CB0_FIRST_VERTEX,
3455                            p->first_vertex);
3456    nvk_mme_set_cb0_mthd(b, nvk_root_descriptor_offset(draw.base_instance),
3457                         NV9097_SET_GLOBAL_BASE_INSTANCE_INDEX,
3458                         p->first_instance);
3459    nvk_mme_set_cb0_scratch(b, nvk_root_descriptor_offset(draw.draw_index),
3460                            NVK_MME_SCRATCH_CB0_DRAW_INDEX,
3461                            p->draw_index);
3462    nvk_mme_set_cb0_scratch(b, nvk_root_descriptor_offset(draw.view_index),
3463                            NVK_MME_SCRATCH_CB0_VIEW_INDEX,
3464                            mme_zero());
3465 
3466    mme_mthd(b, NV9097_SET_GLOBAL_BASE_VERTEX_INDEX);
3467    mme_emit(b, p->base_vertex);
3468    mme_mthd(b, NV9097_SET_VERTEX_ID_BASE);
3469    mme_emit(b, p->base_vertex);
3470 }
3471 
3472 static void
3473 nvk_mme_emit_view_index(struct mme_builder *b, struct mme_value view_index)
3474 {
3475    /* Set the push constant */
3476    nvk_mme_set_cb0_scratch(b, nvk_root_descriptor_offset(draw.view_index),
3477                            NVK_MME_SCRATCH_CB0_VIEW_INDEX,
3478                            view_index);
3479 
3480    /* Set the layer to the view index */
3481    STATIC_ASSERT(DRF_LO(NV9097_SET_RT_LAYER_V) == 0);
3482    STATIC_ASSERT(NV9097_SET_RT_LAYER_CONTROL_V_SELECTS_LAYER == 0);
3483    mme_mthd(b, NV9097_SET_RT_LAYER);
3484    mme_emit(b, view_index);
3485 }
3486 
3487 static void
3488 nvk_mme_build_draw_loop(struct mme_builder *b,
3489                         struct mme_value instance_count,
3490                         struct mme_value first_vertex,
3491                         struct mme_value vertex_count)
3492 {
3493    struct mme_value begin = nvk_mme_load_scratch(b, DRAW_BEGIN);
3494 
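   /* Replay the draw once per instance.  After the first iteration, flip
    * BEGIN's INSTANCE_ID field to SUBSEQUENT so the hardware increments the
    * instance index instead of resetting it.
    */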
3495    mme_loop(b, instance_count) {
3496       mme_mthd(b, NV9097_BEGIN);
3497       mme_emit(b, begin);
3498 
3499       mme_mthd(b, NV9097_SET_VERTEX_ARRAY_START);
3500       mme_emit(b, first_vertex);
3501       mme_emit(b, vertex_count);
3502 
3503       mme_mthd(b, NV9097_END);
3504       mme_emit(b, mme_zero());
3505 
3506       mme_set_field_enum(b, begin, NV9097_BEGIN_INSTANCE_ID, SUBSEQUENT);
3507    }
3508 
3509    mme_free_reg(b, begin);
3510 }
3511 
3512 static void
3513 nvk_mme_build_draw(struct mme_builder *b,
3514                    struct mme_value draw_index)
3515 {
3516    /* These are in VkDrawIndirectCommand order */
3517    struct mme_value vertex_count = mme_load(b);
3518    struct mme_value instance_count = mme_load(b);
3519    struct mme_value first_vertex = mme_load(b);
3520    struct mme_value first_instance = mme_load(b);
3521 
3522    struct mme_draw_params params = {
3523       .first_vertex = first_vertex,
3524       .first_instance = first_instance,
3525       .draw_index = draw_index,
3526    };
3527    nvk_mme_build_set_draw_params(b, &params);
3528 
3529    mme_free_reg(b, first_instance);
3530 
3531    if (b->devinfo->cls_eng3d < TURING_A)
3532       nvk_mme_spill(b, DRAW_IDX, draw_index);
3533 
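   /* With multiview disabled (a view mask of zero), emit a single draw loop.
    * Otherwise, walk all 32 possible views and emit a draw loop for each
    * view that is enabled in the view mask.
    */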
3534    struct mme_value view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
3535    mme_if(b, ieq, view_mask, mme_zero()) {
3536       mme_free_reg(b, view_mask);
3537 
3538       nvk_mme_build_draw_loop(b, instance_count,
3539                               first_vertex, vertex_count);
3540    }
3541 
3542    view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
3543    mme_if(b, ine, view_mask, mme_zero()) {
3544       mme_free_reg(b, view_mask);
3545 
3546       struct mme_value view = mme_mov(b, mme_zero());
3547       mme_while(b, ine, view, mme_imm(32)) {
3548          view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
3549          struct mme_value has_view = mme_bfe(b, view_mask, view, 1);
3550          mme_free_reg(b, view_mask);
3551          mme_if(b, ine, has_view, mme_zero()) {
3552             mme_free_reg(b, has_view);
3553             nvk_mme_emit_view_index(b, view);
3554             nvk_mme_build_draw_loop(b, instance_count,
3555                                     first_vertex, vertex_count);
3556          }
3557 
3558          mme_add_to(b, view, view, mme_imm(1));
3559       }
3560       mme_free_reg(b, view);
3561    }
3562 
3563    mme_free_reg(b, instance_count);
3564    mme_free_reg(b, first_vertex);
3565    mme_free_reg(b, vertex_count);
3566 
3567    if (b->devinfo->cls_eng3d < TURING_A)
3568       nvk_mme_unspill(b, DRAW_IDX, draw_index);
3569 }
3570 
3571 void
3572 nvk_mme_draw(struct mme_builder *b)
3573 {
3574    struct mme_value draw_index = mme_load(b);
3575    nvk_mme_build_draw(b, draw_index);
3576 }
3577 
3578 VKAPI_ATTR void VKAPI_CALL
3579 nvk_CmdDraw(VkCommandBuffer commandBuffer,
3580             uint32_t vertexCount,
3581             uint32_t instanceCount,
3582             uint32_t firstVertex,
3583             uint32_t firstInstance)
3584 {
3585    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
3586 
3587    nvk_flush_gfx_state(cmd);
3588 
3589    struct nv_push *p = nvk_cmd_buffer_push(cmd, 6);
3590    P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW));
3591    P_INLINE_DATA(p, 0 /* draw_index */);
3592    P_INLINE_DATA(p, vertexCount);
3593    P_INLINE_DATA(p, instanceCount);
3594    P_INLINE_DATA(p, firstVertex);
3595    P_INLINE_DATA(p, firstInstance);
3596 }
3597 
3598 VKAPI_ATTR void VKAPI_CALL
3599 nvk_CmdDrawMultiEXT(VkCommandBuffer commandBuffer,
3600                     uint32_t drawCount,
3601                     const VkMultiDrawInfoEXT *pVertexInfo,
3602                     uint32_t instanceCount,
3603                     uint32_t firstInstance,
3604                     uint32_t stride)
3605 {
3606    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
3607 
3608    nvk_flush_gfx_state(cmd);
3609 
3610    for (uint32_t draw_index = 0; draw_index < drawCount; draw_index++) {
3611       struct nv_push *p = nvk_cmd_buffer_push(cmd, 6);
3612       P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW));
3613       P_INLINE_DATA(p, draw_index);
3614       P_INLINE_DATA(p, pVertexInfo->vertexCount);
3615       P_INLINE_DATA(p, instanceCount);
3616       P_INLINE_DATA(p, pVertexInfo->firstVertex);
3617       P_INLINE_DATA(p, firstInstance);
3618 
3619       pVertexInfo = ((void *)pVertexInfo) + stride;
3620    }
3621 }
3622 
3623 static void
3624 nvk_mme_build_draw_indexed_loop(struct mme_builder *b,
3625                                 struct mme_value instance_count,
3626                                 struct mme_value first_index,
3627                                 struct mme_value index_count)
3628 {
3629    struct mme_value begin = nvk_mme_load_scratch(b, DRAW_BEGIN);
3630 
3631    mme_loop(b, instance_count) {
3632       mme_mthd(b, NV9097_BEGIN);
3633       mme_emit(b, begin);
3634 
3635       mme_mthd(b, NV9097_SET_INDEX_BUFFER_F);
3636       mme_emit(b, first_index);
3637       mme_emit(b, index_count);
3638 
3639       mme_mthd(b, NV9097_END);
3640       mme_emit(b, mme_zero());
3641 
3642       mme_set_field_enum(b, begin, NV9097_BEGIN_INSTANCE_ID, SUBSEQUENT);
3643    }
3644 
3645    mme_free_reg(b, begin);
3646 }
3647 
3648 static void
3649 nvk_mme_build_draw_indexed(struct mme_builder *b,
3650                            struct mme_value draw_index)
3651 {
3652    /* These are in VkDrawIndexedIndirectCommand order */
3653    struct mme_value index_count = mme_load(b);
3654    struct mme_value instance_count = mme_load(b);
3655    struct mme_value first_index = mme_load(b);
3656    struct mme_value vertex_offset = mme_load(b);
3657    struct mme_value first_instance = mme_load(b);
3658 
3659    struct mme_draw_params params = {
3660       .base_vertex = vertex_offset,
3661       .first_vertex = vertex_offset,
3662       .first_instance = first_instance,
3663       .draw_index = draw_index,
3664    };
3665    nvk_mme_build_set_draw_params(b, &params);
3666 
3667    mme_free_reg(b, vertex_offset);
3668    mme_free_reg(b, first_instance);
3669 
3670    if (b->devinfo->cls_eng3d < TURING_A)
3671       nvk_mme_spill(b, DRAW_IDX, draw_index);
3672 
3673    struct mme_value view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
3674    mme_if(b, ieq, view_mask, mme_zero()) {
3675       mme_free_reg(b, view_mask);
3676 
3677       nvk_mme_build_draw_indexed_loop(b, instance_count,
3678                                       first_index, index_count);
3679    }
3680 
3681    view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
3682    mme_if(b, ine, view_mask, mme_zero()) {
3683       mme_free_reg(b, view_mask);
3684 
3685       struct mme_value view = mme_mov(b, mme_zero());
3686       mme_while(b, ine, view, mme_imm(32)) {
3687          view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
3688          struct mme_value has_view = mme_bfe(b, view_mask, view, 1);
3689          mme_free_reg(b, view_mask);
3690          mme_if(b, ine, has_view, mme_zero()) {
3691             mme_free_reg(b, has_view);
3692             nvk_mme_emit_view_index(b, view);
3693             nvk_mme_build_draw_indexed_loop(b, instance_count,
3694                                             first_index, index_count);
3695          }
3696 
3697          mme_add_to(b, view, view, mme_imm(1));
3698       }
3699       mme_free_reg(b, view);
3700    }
3701 
3702    mme_free_reg(b, instance_count);
3703    mme_free_reg(b, first_index);
3704    mme_free_reg(b, index_count);
3705 
3706    if (b->devinfo->cls_eng3d < TURING_A)
3707       nvk_mme_unspill(b, DRAW_IDX, draw_index);
3708 }
3709 
3710 void
3711 nvk_mme_draw_indexed(struct mme_builder *b)
3712 {
3713    struct mme_value draw_index = mme_load(b);
3714    nvk_mme_build_draw_indexed(b, draw_index);
3715 }
3716 
3717 VKAPI_ATTR void VKAPI_CALL
3718 nvk_CmdDrawIndexed(VkCommandBuffer commandBuffer,
3719                    uint32_t indexCount,
3720                    uint32_t instanceCount,
3721                    uint32_t firstIndex,
3722                    int32_t vertexOffset,
3723                    uint32_t firstInstance)
3724 {
3725    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
3726 
3727    nvk_flush_gfx_state(cmd);
3728 
3729    struct nv_push *p = nvk_cmd_buffer_push(cmd, 7);
3730    P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDEXED));
3731    P_INLINE_DATA(p, 0 /* draw_index */);
3732    P_INLINE_DATA(p, indexCount);
3733    P_INLINE_DATA(p, instanceCount);
3734    P_INLINE_DATA(p, firstIndex);
3735    P_INLINE_DATA(p, vertexOffset);
3736    P_INLINE_DATA(p, firstInstance);
3737 }
3738 
3739 VKAPI_ATTR void VKAPI_CALL
3740 nvk_CmdDrawMultiIndexedEXT(VkCommandBuffer commandBuffer,
3741                            uint32_t drawCount,
3742                            const VkMultiDrawIndexedInfoEXT *pIndexInfo,
3743                            uint32_t instanceCount,
3744                            uint32_t firstInstance,
3745                            uint32_t stride,
3746                            const int32_t *pVertexOffset)
3747 {
3748    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
3749 
3750    nvk_flush_gfx_state(cmd);
3751 
3752    for (uint32_t draw_index = 0; draw_index < drawCount; draw_index++) {
3753       const uint32_t vertex_offset =
3754          pVertexOffset != NULL ? *pVertexOffset : pIndexInfo->vertexOffset;
3755 
3756       struct nv_push *p = nvk_cmd_buffer_push(cmd, 7);
3757       P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDEXED));
3758       P_INLINE_DATA(p, draw_index);
3759       P_INLINE_DATA(p, pIndexInfo->indexCount);
3760       P_INLINE_DATA(p, instanceCount);
3761       P_INLINE_DATA(p, pIndexInfo->firstIndex);
3762       P_INLINE_DATA(p, vertex_offset);
3763       P_INLINE_DATA(p, firstInstance);
3764 
3765       pIndexInfo = ((void *)pIndexInfo) + stride;
3766    }
3767 }
3768 
3769 void
3770 nvk_mme_draw_indirect(struct mme_builder *b)
3771 {
3772    if (b->devinfo->cls_eng3d >= TURING_A) {
3773       struct mme_value64 draw_addr = mme_load_addr64(b);
3774       struct mme_value draw_count = mme_load(b);
3775       struct mme_value stride = mme_load(b);
3776 
3777       struct mme_value draw = mme_mov(b, mme_zero());
3778       mme_while(b, ult, draw, draw_count) {
3779          mme_tu104_read_fifoed(b, draw_addr, mme_imm(4));
3780 
3781          nvk_mme_build_draw(b, draw);
3782 
3783          mme_add_to(b, draw, draw, mme_imm(1));
3784          mme_add64_to(b, draw_addr, draw_addr, mme_value64(stride, mme_zero()));
3785       }
3786    } else {
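      /* Pre-Turing, the macro cannot fetch the draw records itself
       * (mme_tu104_read_fifoed is Turing+), so they are fed to it through
       * the command stream instead: the caller pushes the draw count, the
       * per-record padding in dwords, and then the packed
       * VkDrawIndirectCommand data.
       */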
3787       struct mme_value draw_count = mme_load(b);
3788       nvk_mme_load_to_scratch(b, DRAW_PAD_DW);
3789 
3790       struct mme_value draw = mme_mov(b, mme_zero());
3791       mme_while(b, ine, draw, draw_count) {
3792          nvk_mme_spill(b, DRAW_COUNT, draw_count);
3793 
3794          nvk_mme_build_draw(b, draw);
3795          mme_add_to(b, draw, draw, mme_imm(1));
3796 
3797          struct mme_value pad_dw = nvk_mme_load_scratch(b, DRAW_PAD_DW);
3798          mme_loop(b, pad_dw) {
3799             mme_free_reg(b, mme_load(b));
3800          }
3801          mme_free_reg(b, pad_dw);
3802 
3803          nvk_mme_unspill(b, DRAW_COUNT, draw_count);
3804       }
3805    }
3806 }
3807 
3808 VKAPI_ATTR void VKAPI_CALL
3809 nvk_CmdDrawIndirect(VkCommandBuffer commandBuffer,
3810                     VkBuffer _buffer,
3811                     VkDeviceSize offset,
3812                     uint32_t drawCount,
3813                     uint32_t stride)
3814 {
3815    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
3816    VK_FROM_HANDLE(nvk_buffer, buffer, _buffer);
3817 
3818    /* From the Vulkan 1.3.238 spec:
3819     *
3820     *    VUID-vkCmdDrawIndirect-drawCount-00476
3821     *
3822     *    "If drawCount is greater than 1, stride must be a multiple of 4 and
3823     *    must be greater than or equal to sizeof(VkDrawIndirectCommand)"
3824     *
3825     * and
3826     *
3827     *    "If drawCount is less than or equal to one, stride is ignored."
3828     */
3829    if (drawCount > 1) {
3830       assert(stride % 4 == 0);
3831       assert(stride >= sizeof(VkDrawIndirectCommand));
3832    } else {
3833       stride = sizeof(VkDrawIndirectCommand);
3834    }
3835 
3836    nvk_flush_gfx_state(cmd);
3837 
3838    if (nvk_cmd_buffer_3d_cls(cmd) >= TURING_A) {
3839       struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
3840       P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDIRECT));
3841       uint64_t draw_addr = nvk_buffer_address(buffer, offset);
3842       P_INLINE_DATA(p, draw_addr >> 32);
3843       P_INLINE_DATA(p, draw_addr);
3844       P_INLINE_DATA(p, drawCount);
3845       P_INLINE_DATA(p, stride);
3846    } else {
3847       const uint32_t max_draws_per_push =
3848          ((NV_PUSH_MAX_COUNT - 3) * 4) / stride;
3849 
3850       uint64_t draw_addr = nvk_buffer_address(buffer, offset);
3851       while (drawCount) {
3852          const uint32_t count = MIN2(drawCount, max_draws_per_push);
3853 
3854          struct nv_push *p = nvk_cmd_buffer_push(cmd, 3);
3855          P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDIRECT));
3856          P_INLINE_DATA(p, count);
3857          P_INLINE_DATA(p, (stride - sizeof(VkDrawIndirectCommand)) / 4);
3858 
3859          uint64_t range = count * (uint64_t)stride;
3860          nv_push_update_count(p, range / 4);
3861          nvk_cmd_buffer_push_indirect(cmd, draw_addr, range);
3862 
3863          draw_addr += range;
3864          drawCount -= count;
3865       }
3866    }
3867 }
3868 
3869 void
3870 nvk_mme_draw_indexed_indirect(struct mme_builder *b)
3871 {
3872    if (b->devinfo->cls_eng3d >= TURING_A) {
3873       struct mme_value64 draw_addr = mme_load_addr64(b);
3874       struct mme_value draw_count = mme_load(b);
3875       struct mme_value stride = mme_load(b);
3876 
3877       struct mme_value draw = mme_mov(b, mme_zero());
3878       mme_while(b, ult, draw, draw_count) {
3879          mme_tu104_read_fifoed(b, draw_addr, mme_imm(5));
3880 
3881          nvk_mme_build_draw_indexed(b, draw);
3882 
3883          mme_add_to(b, draw, draw, mme_imm(1));
3884          mme_add64_to(b, draw_addr, draw_addr, mme_value64(stride, mme_zero()));
3885       }
3886    } else {
3887       struct mme_value draw_count = mme_load(b);
3888       nvk_mme_load_to_scratch(b, DRAW_PAD_DW);
3889 
3890       struct mme_value draw = mme_mov(b, mme_zero());
3891       mme_while(b, ine, draw, draw_count) {
3892          nvk_mme_spill(b, DRAW_COUNT, draw_count);
3893 
3894          nvk_mme_build_draw_indexed(b, draw);
3895          mme_add_to(b, draw, draw, mme_imm(1));
3896 
3897          struct mme_value pad_dw = nvk_mme_load_scratch(b, DRAW_PAD_DW);
3898          mme_loop(b, pad_dw) {
3899             mme_free_reg(b, mme_load(b));
3900          }
3901          mme_free_reg(b, pad_dw);
3902 
3903          nvk_mme_unspill(b, DRAW_COUNT, draw_count);
3904       }
3905    }
3906 }
3907 
3908 VKAPI_ATTR void VKAPI_CALL
3909 nvk_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer,
3910                            VkBuffer _buffer,
3911                            VkDeviceSize offset,
3912                            uint32_t drawCount,
3913                            uint32_t stride)
3914 {
3915    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
3916    VK_FROM_HANDLE(nvk_buffer, buffer, _buffer);
3917 
3918    /* From the Vulkan 1.3.238 spec:
3919     *
3920     *    VUID-vkCmdDrawIndexedIndirect-drawCount-00528
3921     *
3922     *    "If drawCount is greater than 1, stride must be a multiple of 4 and
3923     *    must be greater than or equal to sizeof(VkDrawIndexedIndirectCommand)"
3924     *
3925     * and
3926     *
3927     *    "If drawCount is less than or equal to one, stride is ignored."
3928     */
3929    if (drawCount > 1) {
3930       assert(stride % 4 == 0);
3931       assert(stride >= sizeof(VkDrawIndexedIndirectCommand));
3932    } else {
3933       stride = sizeof(VkDrawIndexedIndirectCommand);
3934    }
3935 
3936    nvk_flush_gfx_state(cmd);
3937 
3938    if (nvk_cmd_buffer_3d_cls(cmd) >= TURING_A) {
3939       struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
3940       P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDEXED_INDIRECT));
3941       uint64_t draw_addr = nvk_buffer_address(buffer, offset);
3942       P_INLINE_DATA(p, draw_addr >> 32);
3943       P_INLINE_DATA(p, draw_addr);
3944       P_INLINE_DATA(p, drawCount);
3945       P_INLINE_DATA(p, stride);
3946    } else {
3947       const uint32_t max_draws_per_push =
3948          ((NV_PUSH_MAX_COUNT - 3) * 4) / stride;
3949 
3950       uint64_t draw_addr = nvk_buffer_address(buffer, offset);
3951       while (drawCount) {
3952          const uint32_t count = MIN2(drawCount, max_draws_per_push);
3953 
3954          struct nv_push *p = nvk_cmd_buffer_push(cmd, 3);
3955          P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDEXED_INDIRECT));
3956          P_INLINE_DATA(p, count);
3957          P_INLINE_DATA(p, (stride - sizeof(VkDrawIndexedIndirectCommand)) / 4);
3958 
3959          uint64_t range = count * (uint64_t)stride;
3960          nv_push_update_count(p, range / 4);
3961          nvk_cmd_buffer_push_indirect(cmd, draw_addr, range);
3962 
3963          draw_addr += range;
3964          drawCount -= count;
3965       }
3966    }
3967 }
3968 
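/* MME macro implementing vkCmdDrawIndirectCount (Turing+ only).  Parameters:
 * indirect buffer address (2 dwords), count buffer address (2 dwords),
 * maxDrawCount, and stride.  The actual draw count is fetched from memory
 * through the MME FIFO and clamped to maxDrawCount, then one 4-dword
 * VkDrawIndirectCommand is streamed in per draw.
 */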
3969 void
3970 nvk_mme_draw_indirect_count(struct mme_builder *b)
3971 {
3972    if (b->devinfo->cls_eng3d < TURING_A)
3973       return;
3974 
3975    struct mme_value64 draw_addr = mme_load_addr64(b);
3976    struct mme_value64 draw_count_addr = mme_load_addr64(b);
3977    struct mme_value draw_max = mme_load(b);
3978    struct mme_value stride = mme_load(b);
3979 
3980    mme_tu104_read_fifoed(b, draw_count_addr, mme_imm(1));
3981    mme_free_reg64(b, draw_count_addr);
3982    struct mme_value draw_count_buf = mme_load(b);
3983 
3984    mme_if(b, ule, draw_count_buf, draw_max) {
3985       mme_mov_to(b, draw_max, draw_count_buf);
3986    }
3987    mme_free_reg(b, draw_count_buf);
3988 
3989    struct mme_value draw = mme_mov(b, mme_zero());
3990    mme_while(b, ult, draw, draw_max) {
3991       mme_tu104_read_fifoed(b, draw_addr, mme_imm(4));
3992 
3993       nvk_mme_build_draw(b, draw);
3994 
3995       mme_add_to(b, draw, draw, mme_imm(1));
3996       mme_add64_to(b, draw_addr, draw_addr, mme_value64(stride, mme_zero()));
3997    }
3998 }
3999 
4000 VKAPI_ATTR void VKAPI_CALL
4001 nvk_CmdDrawIndirectCount(VkCommandBuffer commandBuffer,
4002                          VkBuffer _buffer,
4003                          VkDeviceSize offset,
4004                          VkBuffer countBuffer,
4005                          VkDeviceSize countBufferOffset,
4006                          uint32_t maxDrawCount,
4007                          uint32_t stride)
4008 {
4009    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
4010    VK_FROM_HANDLE(nvk_buffer, buffer, _buffer);
4011    VK_FROM_HANDLE(nvk_buffer, count_buffer, countBuffer);
4012 
4013    /* TODO: Indirect count draw pre-Turing */
4014    assert(nvk_cmd_buffer_3d_cls(cmd) >= TURING_A);
4015 
4016    nvk_flush_gfx_state(cmd);
4017 
4018    struct nv_push *p = nvk_cmd_buffer_push(cmd, 7);
4019    P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDIRECT_COUNT));
4020    uint64_t draw_addr = nvk_buffer_address(buffer, offset);
4021    P_INLINE_DATA(p, draw_addr >> 32);
4022    P_INLINE_DATA(p, draw_addr);
4023    uint64_t draw_count_addr = nvk_buffer_address(count_buffer,
4024                                                  countBufferOffset);
4025    P_INLINE_DATA(p, draw_count_addr >> 32);
4026    P_INLINE_DATA(p, draw_count_addr);
4027    P_INLINE_DATA(p, maxDrawCount);
4028    P_INLINE_DATA(p, stride);
4029 }
4030 
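/* Indexed variant of nvk_mme_draw_indirect_count: same parameter layout, but
 * each record is a 5-dword VkDrawIndexedIndirectCommand.
 */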
4031 void
4032 nvk_mme_draw_indexed_indirect_count(struct mme_builder *b)
4033 {
4034    if (b->devinfo->cls_eng3d < TURING_A)
4035       return;
4036 
4037    struct mme_value64 draw_addr = mme_load_addr64(b);
4038    struct mme_value64 draw_count_addr = mme_load_addr64(b);
4039    struct mme_value draw_max = mme_load(b);
4040    struct mme_value stride = mme_load(b);
4041 
4042    mme_tu104_read_fifoed(b, draw_count_addr, mme_imm(1));
4043    mme_free_reg64(b, draw_count_addr);
4044    struct mme_value draw_count_buf = mme_load(b);
4045 
4046    mme_if(b, ule, draw_count_buf, draw_max) {
4047       mme_mov_to(b, draw_max, draw_count_buf);
4048    }
4049    mme_free_reg(b, draw_count_buf);
4050 
4051    struct mme_value draw = mme_mov(b, mme_zero());
4052    mme_while(b, ult, draw, draw_max) {
4053       mme_tu104_read_fifoed(b, draw_addr, mme_imm(5));
4054 
4055       nvk_mme_build_draw_indexed(b, draw);
4056 
4057       mme_add_to(b, draw, draw, mme_imm(1));
4058       mme_add64_to(b, draw_addr, draw_addr, mme_value64(stride, mme_zero()));
4059    }
4060 }
4061 
4062 VKAPI_ATTR void VKAPI_CALL
4063 nvk_CmdDrawIndexedIndirectCount(VkCommandBuffer commandBuffer,
4064                                 VkBuffer _buffer,
4065                                 VkDeviceSize offset,
4066                                 VkBuffer countBuffer,
4067                                 VkDeviceSize countBufferOffset,
4068                                 uint32_t maxDrawCount,
4069                                 uint32_t stride)
4070 {
4071    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
4072    VK_FROM_HANDLE(nvk_buffer, buffer, _buffer);
4073    VK_FROM_HANDLE(nvk_buffer, count_buffer, countBuffer);
4074 
4075    /* TODO: Indexed indirect count draw pre-Turing */
4076    assert(nvk_cmd_buffer_3d_cls(cmd) >= TURING_A);
4077 
4078    nvk_flush_gfx_state(cmd);
4079 
4080    struct nv_push *p = nvk_cmd_buffer_push(cmd, 7);
4081    P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDEXED_INDIRECT_COUNT));
4082    uint64_t draw_addr = nvk_buffer_address(buffer, offset);
4083    P_INLINE_DATA(p, draw_addr >> 32);
4084    P_INLINE_DATA(p, draw_addr);
4085    uint64_t draw_count_addr = nvk_buffer_address(count_buffer,
4086                                                  countBufferOffset);
4087    P_INLINE_DATA(p, draw_count_addr >> 32);
4088    P_INLINE_DATA(p, draw_count_addr);
4089    P_INLINE_DATA(p, maxDrawCount);
4090    P_INLINE_DATA(p, stride);
4091 }
4092 
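/* Emits instance_count DRAW_AUTO draws using the transform feedback byte
 * count in `counter`.  After the first iteration, the INSTANCE_ID field of
 * BEGIN is switched to SUBSEQUENT so later draws continue the instance index
 * instead of restarting at the first instance.
 */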
4093 static void
4094 nvk_mme_xfb_draw_indirect_loop(struct mme_builder *b,
4095                                struct mme_value instance_count,
4096                                struct mme_value counter)
4097 {
4098    struct mme_value begin = nvk_mme_load_scratch(b, DRAW_BEGIN);
4099 
4100    mme_loop(b, instance_count) {
4101       mme_mthd(b, NV9097_BEGIN);
4102       mme_emit(b, begin);
4103 
4104       mme_mthd(b, NV9097_DRAW_AUTO);
4105       mme_emit(b, counter);
4106 
4107       mme_mthd(b, NV9097_END);
4108       mme_emit(b, mme_zero());
4109 
4110       mme_set_field_enum(b, begin, NV9097_BEGIN_INSTANCE_ID, SUBSEQUENT);
4111    }
4112 
4113    mme_free_reg(b, begin);
4114 }
4115 
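/* MME macro behind vkCmdDrawIndirectByteCountEXT.  Parameters: instanceCount
 * and firstInstance, followed by the XFB counter: on Turing+ the counter
 * buffer address is passed and the value is fetched through the MME FIFO; on
 * older hardware the counter dword is chained into the push buffer directly
 * from memory.  If a nonzero view mask is set, the draw loop is repeated for
 * every enabled view.
 */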
4116 void
4117 nvk_mme_xfb_draw_indirect(struct mme_builder *b)
4118 {
4119    struct mme_value instance_count = mme_load(b);
4120    struct mme_value first_instance = mme_load(b);
4121 
4122    if (b->devinfo->cls_eng3d >= TURING_A) {
4123       struct mme_value64 counter_addr = mme_load_addr64(b);
4124       mme_tu104_read_fifoed(b, counter_addr, mme_imm(1));
4125       mme_free_reg(b, counter_addr.lo);
4126       mme_free_reg(b, counter_addr.hi);
4127    }
4128    struct mme_value counter = mme_load(b);
4129 
4130    struct mme_draw_params params = {
4131       .first_instance = first_instance,
4132    };
4133    nvk_mme_build_set_draw_params(b, &params);
4134 
4135    mme_free_reg(b, first_instance);
4136 
4137    struct mme_value view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
4138    mme_if(b, ieq, view_mask, mme_zero()) {
4139       mme_free_reg(b, view_mask);
4140 
4141       nvk_mme_xfb_draw_indirect_loop(b, instance_count, counter);
4142    }
4143 
4144    view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
4145    mme_if(b, ine, view_mask, mme_zero()) {
4146       mme_free_reg(b, view_mask);
4147 
4148       struct mme_value view = mme_mov(b, mme_zero());
4149       mme_while(b, ine, view, mme_imm(32)) {
4150          view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
4151          struct mme_value has_view = mme_bfe(b, view_mask, view, 1);
4152          mme_free_reg(b, view_mask);
4153          mme_if(b, ine, has_view, mme_zero()) {
4154             mme_free_reg(b, has_view);
4155             nvk_mme_emit_view_index(b, view);
4156             nvk_mme_xfb_draw_indirect_loop(b, instance_count, counter);
4157          }
4158 
4159          mme_add_to(b, view, view, mme_imm(1));
4160       }
4161    }
4162 
4163    mme_free_reg(b, instance_count);
4164    mme_free_reg(b, counter);
4165 }
4166 
4167 VKAPI_ATTR void VKAPI_CALL
4168 nvk_CmdDrawIndirectByteCountEXT(VkCommandBuffer commandBuffer,
4169                                 uint32_t instanceCount,
4170                                 uint32_t firstInstance,
4171                                 VkBuffer counterBuffer,
4172                                 VkDeviceSize counterBufferOffset,
4173                                 uint32_t counterOffset,
4174                                 uint32_t vertexStride)
4175 {
4176    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
4177    VK_FROM_HANDLE(nvk_buffer, counter_buffer, counterBuffer);
4178 
4179    nvk_flush_gfx_state(cmd);
4180 
4181    uint64_t counter_addr = nvk_buffer_address(counter_buffer,
4182                                               counterBufferOffset);
4183 
4184    if (nvk_cmd_buffer_3d_cls(cmd) >= TURING_A) {
4185       struct nv_push *p = nvk_cmd_buffer_push(cmd, 9);
4186       P_IMMD(p, NV9097, SET_DRAW_AUTO_START, counterOffset);
4187       P_IMMD(p, NV9097, SET_DRAW_AUTO_STRIDE, vertexStride);
4188 
4189       P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_XFB_DRAW_INDIRECT));
4190       P_INLINE_DATA(p, instanceCount);
4191       P_INLINE_DATA(p, firstInstance);
4192       P_INLINE_DATA(p, counter_addr >> 32);
4193       P_INLINE_DATA(p, counter_addr);
4194    } else {
4195       struct nv_push *p = nvk_cmd_buffer_push(cmd, 8);
4196       P_IMMD(p, NV9097, SET_DRAW_AUTO_START, counterOffset);
4197       P_IMMD(p, NV9097, SET_DRAW_AUTO_STRIDE, vertexStride);
4198 
4199       P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_XFB_DRAW_INDIRECT));
4200       P_INLINE_DATA(p, instanceCount);
4201       P_INLINE_DATA(p, firstInstance);
4202       nv_push_update_count(p, 1);
4203       nvk_cmd_buffer_push_indirect(cmd, counter_addr, 4);
4204    }
4205 }
4206 
4207 VKAPI_ATTR void VKAPI_CALL
4208 nvk_CmdBindTransformFeedbackBuffersEXT(VkCommandBuffer commandBuffer,
4209                                        uint32_t firstBinding,
4210                                        uint32_t bindingCount,
4211                                        const VkBuffer *pBuffers,
4212                                        const VkDeviceSize *pOffsets,
4213                                        const VkDeviceSize *pSizes)
4214 {
4215    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
4216 
4217    for (uint32_t i = 0; i < bindingCount; i++) {
4218       VK_FROM_HANDLE(nvk_buffer, buffer, pBuffers[i]);
4219       uint32_t idx = firstBinding + i;
4220       uint64_t size = pSizes ? pSizes[i] : VK_WHOLE_SIZE;
4221       struct nvk_addr_range addr_range =
4222          nvk_buffer_addr_range(buffer, pOffsets[i], size);
4223       assert(addr_range.range <= UINT32_MAX);
4224 
4225       struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
4226 
4227       P_MTHD(p, NV9097, SET_STREAM_OUT_BUFFER_ENABLE(idx));
4228       P_NV9097_SET_STREAM_OUT_BUFFER_ENABLE(p, idx, V_TRUE);
4229       P_NV9097_SET_STREAM_OUT_BUFFER_ADDRESS_A(p, idx, addr_range.addr >> 32);
4230       P_NV9097_SET_STREAM_OUT_BUFFER_ADDRESS_B(p, idx, addr_range.addr);
4231       P_NV9097_SET_STREAM_OUT_BUFFER_SIZE(p, idx, (uint32_t)addr_range.range);
4232    }
4233 
4234    // TODO: do we need to set SET_STREAM_OUT_BUFFER_ENABLE to V_FALSE?
4235 }
4236 
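/* MME macro used by vkCmdBeginTransformFeedbackEXT to restore a stream-out
 * write pointer.  The first parameter is the buffer index pre-multiplied by
 * the 8-dword register stride; the counter value is either fetched from
 * memory through the MME FIFO (Turing+) or chained into the push buffer by
 * the CPU.
 */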
4237 void
4238 nvk_mme_xfb_counter_load(struct mme_builder *b)
4239 {
4240    struct mme_value buffer = mme_load(b);
4241 
4242    struct mme_value counter;
4243    if (b->devinfo->cls_eng3d >= TURING_A) {
4244       struct mme_value64 counter_addr = mme_load_addr64(b);
4245 
4246       mme_tu104_read_fifoed(b, counter_addr, mme_imm(1));
4247       mme_free_reg(b, counter_addr.lo);
4248       mme_free_reg(b, counter_addr.hi);
4249 
4250       counter = mme_load(b);
4251    } else {
4252       counter = mme_load(b);
4253    }
4254 
4255    mme_mthd_arr(b, NV9097_SET_STREAM_OUT_BUFFER_LOAD_WRITE_POINTER(0), buffer);
4256    mme_emit(b, counter);
4257 
4258    mme_free_reg(b, counter);
4259    mme_free_reg(b, buffer);
4260 }
4261 
4262 VKAPI_ATTR void VKAPI_CALL
4263 nvk_CmdBeginTransformFeedbackEXT(VkCommandBuffer commandBuffer,
4264                                  uint32_t firstCounterBuffer,
4265                                  uint32_t counterBufferCount,
4266                                  const VkBuffer *pCounterBuffers,
4267                                  const VkDeviceSize *pCounterBufferOffsets)
4268 {
4269    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
4270    const uint32_t max_buffers = 4;
4271 
4272    struct nv_push *p = nvk_cmd_buffer_push(cmd, 2 + 2 * max_buffers);
4273 
4274    P_IMMD(p, NV9097, SET_STREAM_OUTPUT, ENABLE_TRUE);
4275    for (uint32_t i = 0; i < max_buffers; ++i) {
4276       P_IMMD(p, NV9097, SET_STREAM_OUT_BUFFER_LOAD_WRITE_POINTER(i), 0);
4277    }
4278 
4279    for (uint32_t i = 0; i < counterBufferCount; ++i) {
4280       if (pCounterBuffers[i] == VK_NULL_HANDLE)
4281          continue;
4282 
4283       VK_FROM_HANDLE(nvk_buffer, buffer, pCounterBuffers[i]);
4284       // index of counter buffer corresponds to index of transform feedback buffer
4285       uint32_t cb_idx = firstCounterBuffer + i;
4286       uint64_t offset = pCounterBufferOffsets ? pCounterBufferOffsets[i] : 0;
4287       uint64_t cb_addr = nvk_buffer_address(buffer, offset);
4288 
4289       if (nvk_cmd_buffer_3d_cls(cmd) >= TURING_A) {
4290          struct nv_push *p = nvk_cmd_buffer_push(cmd, 4);
4291          P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_XFB_COUNTER_LOAD));
4292          /* The STREAM_OUT_BUFFER_LOAD_WRITE_POINTER registers have an 8-dword stride */
4293          P_INLINE_DATA(p, cb_idx * 8);
4294          P_INLINE_DATA(p, cb_addr >> 32);
4295          P_INLINE_DATA(p, cb_addr);
4296       } else {
4297          struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
4298          P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_XFB_COUNTER_LOAD));
4299          P_INLINE_DATA(p, cb_idx);
4300          nv_push_update_count(p, 1);
4301          nvk_cmd_buffer_push_indirect(cmd, cb_addr, 4);
4302       }
4303    }
4304 }
4305 
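/* Writes the current streaming byte count of each stream back into its
 * counter buffer via a report semaphore, so that a later
 * vkCmdBeginTransformFeedbackEXT can resume the stream where this one
 * stopped.
 */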
4306 VKAPI_ATTR void VKAPI_CALL
4307 nvk_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer,
4308                                uint32_t firstCounterBuffer,
4309                                uint32_t counterBufferCount,
4310                                const VkBuffer *pCounterBuffers,
4311                                const VkDeviceSize *pCounterBufferOffsets)
4312 {
4313    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
4314 
4315    struct nv_push *p = nvk_cmd_buffer_push(cmd, 5 * counterBufferCount + 2);
4316 
4317    P_IMMD(p, NV9097, SET_STREAM_OUTPUT, ENABLE_FALSE);
4318 
4319    for (uint32_t i = 0; i < counterBufferCount; ++i) {
4320       if (pCounterBuffers[i] == VK_NULL_HANDLE)
4321          continue;
4322 
4323       VK_FROM_HANDLE(nvk_buffer, buffer, pCounterBuffers[i]);
4324       // index of counter buffer corresponds to index of transform feedback buffer
4325       uint32_t cb_idx = firstCounterBuffer + i;
4326       uint64_t offset = pCounterBufferOffsets ? pCounterBufferOffsets[i] : 0;
4327       uint64_t cb_addr = nvk_buffer_address(buffer, offset);
4328 
4329       P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A);
4330       P_NV9097_SET_REPORT_SEMAPHORE_A(p, cb_addr >> 32);
4331       P_NV9097_SET_REPORT_SEMAPHORE_B(p, cb_addr);
4332       P_NV9097_SET_REPORT_SEMAPHORE_C(p, 0);
4333       P_NV9097_SET_REPORT_SEMAPHORE_D(p, {
4334          .operation = OPERATION_REPORT_ONLY,
4335          .pipeline_location = PIPELINE_LOCATION_STREAMING_OUTPUT,
4336          .report = REPORT_STREAMING_BYTE_COUNT,
4337          .sub_report = cb_idx,
4338          .structure_size = STRUCTURE_SIZE_ONE_WORD,
4339       });
4340    }
4341 }
4342 
4343 VKAPI_ATTR void VKAPI_CALL
4344 nvk_CmdBeginConditionalRenderingEXT(VkCommandBuffer commandBuffer,
4345                                     const VkConditionalRenderingBeginInfoEXT *pConditionalRenderingBegin)
4346 {
4347    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
4348    VK_FROM_HANDLE(nvk_buffer, buffer, pConditionalRenderingBegin->buffer);
4349 
4350    uint64_t addr = nvk_buffer_address(buffer, pConditionalRenderingBegin->offset);
4351    bool inverted = pConditionalRenderingBegin->flags &
4352       VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT;
4353 
4354    /* From the Vulkan 1.3.280 spec:
4355     *
4356     *    "If the 32-bit value at offset in buffer memory is zero,
4357     *     then the rendering commands are discarded,
4358     *     otherwise they are executed as normal."
4359     *
4360     * The hardware compares a 64-bit value, so we are required to copy it.
4361     */
4362    uint64_t tmp_addr;
4363    VkResult result = nvk_cmd_buffer_cond_render_alloc(cmd, &tmp_addr);
4364    if (result != VK_SUCCESS) {
4365       vk_command_buffer_set_error(&cmd->vk, result);
4366       return;
4367    }
4368 
4369    struct nv_push *p = nvk_cmd_buffer_push(cmd, 26);
4370 
4371    P_MTHD(p, NV90B5, OFFSET_IN_UPPER);
4372    P_NV90B5_OFFSET_IN_UPPER(p, addr >> 32);
4373    P_NV90B5_OFFSET_IN_LOWER(p, addr & 0xffffffff);
4374    P_NV90B5_OFFSET_OUT_UPPER(p, tmp_addr >> 32);
4375    P_NV90B5_OFFSET_OUT_LOWER(p, tmp_addr & 0xffffffff);
4376    P_NV90B5_PITCH_IN(p, 4);
4377    P_NV90B5_PITCH_OUT(p, 4);
4378    P_NV90B5_LINE_LENGTH_IN(p, 4);
4379    P_NV90B5_LINE_COUNT(p, 1);
4380 
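   /* The copy-engine remap widens the 32-bit condition value to the 64 bits
    * that SET_RENDER_ENABLE compares: each one-byte source component is
    * written to two destination components, so the copied 64-bit word is
    * nonzero exactly when the original 32-bit value is nonzero.
    */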
4381    P_IMMD(p, NV90B5, SET_REMAP_COMPONENTS, {
4382       .dst_x = DST_X_SRC_X,
4383       .dst_y = DST_Y_SRC_X,
4384       .dst_z = DST_Z_NO_WRITE,
4385       .dst_w = DST_W_NO_WRITE,
4386       .component_size = COMPONENT_SIZE_ONE,
4387       .num_src_components = NUM_SRC_COMPONENTS_ONE,
4388       .num_dst_components = NUM_DST_COMPONENTS_TWO,
4389    });
4390 
4391    P_IMMD(p, NV90B5, LAUNCH_DMA, {
4392       .data_transfer_type = DATA_TRANSFER_TYPE_PIPELINED,
4393       .multi_line_enable = MULTI_LINE_ENABLE_TRUE,
4394       .flush_enable = FLUSH_ENABLE_TRUE,
4395       .src_memory_layout = SRC_MEMORY_LAYOUT_PITCH,
4396       .dst_memory_layout = DST_MEMORY_LAYOUT_PITCH,
4397       .remap_enable = REMAP_ENABLE_TRUE,
4398    });
4399 
4400    P_MTHD(p, NV9097, SET_RENDER_ENABLE_A);
4401    P_NV9097_SET_RENDER_ENABLE_A(p, tmp_addr >> 32);
4402    P_NV9097_SET_RENDER_ENABLE_B(p, tmp_addr & 0xfffffff0);
4403    P_NV9097_SET_RENDER_ENABLE_C(p, inverted ? MODE_RENDER_IF_EQUAL : MODE_RENDER_IF_NOT_EQUAL);
4404 
4405    P_MTHD(p, NV90C0, SET_RENDER_ENABLE_A);
4406    P_NV90C0_SET_RENDER_ENABLE_A(p, tmp_addr >> 32);
4407    P_NV90C0_SET_RENDER_ENABLE_B(p, tmp_addr & 0xfffffff0);
4408    P_NV90C0_SET_RENDER_ENABLE_C(p, inverted ? MODE_RENDER_IF_EQUAL : MODE_RENDER_IF_NOT_EQUAL);
4409 }
4410 
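/* Unconditionally re-enables rendering on both the 3D (NV9097) and compute
 * (NV90C0) classes, ending the conditional rendering range started above.
 */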
4411 VKAPI_ATTR void VKAPI_CALL
4412 nvk_CmdEndConditionalRenderingEXT(VkCommandBuffer commandBuffer)
4413 {
4414    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
4415 
4416    struct nv_push *p = nvk_cmd_buffer_push(cmd, 12);
4417    P_MTHD(p, NV9097, SET_RENDER_ENABLE_A);
4418    P_NV9097_SET_RENDER_ENABLE_A(p, 0);
4419    P_NV9097_SET_RENDER_ENABLE_B(p, 0);
4420    P_NV9097_SET_RENDER_ENABLE_C(p, MODE_TRUE);
4421 
4422    P_MTHD(p, NV90C0, SET_RENDER_ENABLE_A);
4423    P_NV90C0_SET_RENDER_ENABLE_A(p, 0);
4424    P_NV90C0_SET_RENDER_ENABLE_B(p, 0);
4425    P_NV90C0_SET_RENDER_ENABLE_C(p, MODE_TRUE);
4426 }
4427