1 /*
2 * Copyright © 2022 Collabora Ltd. and Red Hat Inc.
3 * SPDX-License-Identifier: MIT
4 */
5 #include "nvk_buffer.h"
6 #include "nvk_entrypoints.h"
7 #include "nvk_cmd_buffer.h"
8 #include "nvk_device.h"
9 #include "nvk_format.h"
10 #include "nvk_image.h"
11 #include "nvk_image_view.h"
12 #include "nvk_mme.h"
13 #include "nvk_physical_device.h"
14 #include "nvk_shader.h"
15
16 #include "util/bitpack_helpers.h"
17 #include "vk_format.h"
18 #include "vk_render_pass.h"
19 #include "vk_standard_sample_locations.h"
20
21 #include "nv_push_cl902d.h"
22 #include "nv_push_cl9097.h"
23 #include "nv_push_cl90b5.h"
24 #include "nv_push_cl90c0.h"
25 #include "nv_push_cla097.h"
26 #include "nv_push_clb097.h"
27 #include "nv_push_clb197.h"
28 #include "nv_push_clc397.h"
29 #include "nv_push_clc597.h"
30 #include "drf.h"
31
32 static inline uint16_t
33 nvk_cmd_buffer_3d_cls(struct nvk_cmd_buffer *cmd)
34 {
35 struct nvk_device *dev = nvk_cmd_buffer_device(cmd);
36 struct nvk_physical_device *pdev = nvk_device_physical(dev);
37 return pdev->info.cls_eng3d;
38 }
39
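/* Ask the FALCON firmware to update a privileged register: stage
 * zero/value/mask in the FALCON_0..FALCON_2 scratch registers, kick the
 * request off via SET_FALCON04 with the register address, then spin on the
 * FALCON_0 scratch until the firmware writes back a completion flag. (The
 * firmware-side handshake described here is inferred from the sequence
 * below.)
 */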
40 static void
41 mme_set_priv_reg(struct mme_builder *b,
42 struct mme_value value,
43 struct mme_value mask,
44 struct mme_value reg)
45 {
46 mme_mthd(b, NV9097_WAIT_FOR_IDLE);
47 mme_emit(b, mme_zero());
48
49 mme_mthd(b, NVK_SET_MME_SCRATCH(FALCON_0));
50 mme_emit(b, mme_zero());
51 mme_emit(b, value);
52 mme_emit(b, mask);
53
54 mme_mthd(b, NV9097_SET_FALCON04);
55 mme_emit(b, reg);
56
57 struct mme_value loop_cond = mme_mov(b, mme_zero());
58 mme_while(b, ine, loop_cond, mme_imm(1)) {
59 mme_state_to(b, loop_cond, NVK_SET_MME_SCRATCH(FALCON_0));
60 mme_mthd(b, NV9097_NO_OPERATION);
61 mme_emit(b, mme_zero());
62 };
63 }
64
65 void
66 nvk_mme_set_priv_reg(struct mme_builder *b)
67 {
68 struct mme_value value = mme_load(b);
69 struct mme_value mask = mme_load(b);
70 struct mme_value reg = mme_load(b);
71
72 mme_set_priv_reg(b, value, mask, reg);
73 }
74
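/* Updates the conservative rasterization control register, but only when the
 * requested state actually changes: the last value written is cached in the
 * CONSERVATIVE_RASTER_STATE scratch register so the comparatively expensive
 * priv-reg write (idle wait plus firmware poll) can be skipped otherwise.
 */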
75 void
76 nvk_mme_set_conservative_raster_state(struct mme_builder *b)
77 {
78 struct mme_value new_state = mme_load(b);
79 struct mme_value old_state =
80 nvk_mme_load_scratch(b, CONSERVATIVE_RASTER_STATE);
81
82 mme_if(b, ine, new_state, old_state) {
83 nvk_mme_store_scratch(b, CONSERVATIVE_RASTER_STATE, new_state);
84 mme_set_priv_reg(b, new_state, mme_imm(BITFIELD_RANGE(23, 2)),
85 mme_imm(0x418800));
86 }
87 }
88
89 #define NVK_DRAW_CB0_SIZE sizeof(struct nvk_root_descriptor_table)
90
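/* Points the constant buffer selector at the root descriptor table (cb0),
 * whose GPU address was stashed in the CB0_ADDR_HI/LO scratch registers by
 * nvk_push_draw_state_init().
 */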
91 void
92 nvk_mme_select_cb0(struct mme_builder *b)
93 {
94 struct mme_value addr_hi = nvk_mme_load_scratch(b, CB0_ADDR_HI);
95 struct mme_value addr_lo = nvk_mme_load_scratch(b, CB0_ADDR_LO);
96
97 mme_mthd(b, NV9097_SET_CONSTANT_BUFFER_SELECTOR_A);
98 mme_emit(b, mme_imm(NVK_DRAW_CB0_SIZE));
99 mme_emit(b, addr_hi);
100 mme_emit(b, addr_lo);
101 }
102
103 static uint32_t nvk_mme_anti_alias_init(void);
104
105 VkResult
106 nvk_push_draw_state_init(struct nvk_queue *queue, struct nv_push *p)
107 {
108 struct nvk_device *dev = nvk_queue_device(queue);
109 struct nvk_physical_device *pdev = nvk_device_physical(dev);
110
111 /* 3D state */
112 P_MTHD(p, NV9097, SET_OBJECT);
113 P_NV9097_SET_OBJECT(p, {
114 .class_id = pdev->info.cls_eng3d,
115 .engine_id = 0,
116 });
117
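/* Upload all MME macros back-to-back into MME instruction RAM, recording
 * each macro's start offset in the start-address RAM so it can later be
 * invoked with CALL_MME_MACRO.
 */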
118 for (uint32_t mme = 0, mme_pos = 0; mme < NVK_MME_COUNT; mme++) {
119 size_t size;
120 uint32_t *dw = nvk_build_mme(&pdev->info, mme, &size);
121 if (dw == NULL)
122 return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
123
124 assert(size % sizeof(uint32_t) == 0);
125 const uint32_t num_dw = size / sizeof(uint32_t);
126
127 P_MTHD(p, NV9097, LOAD_MME_START_ADDRESS_RAM_POINTER);
128 P_NV9097_LOAD_MME_START_ADDRESS_RAM_POINTER(p, mme);
129 P_NV9097_LOAD_MME_START_ADDRESS_RAM(p, mme_pos);
130
131 P_1INC(p, NV9097, LOAD_MME_INSTRUCTION_RAM_POINTER);
132 P_NV9097_LOAD_MME_INSTRUCTION_RAM_POINTER(p, mme_pos);
133 P_INLINE_ARRAY(p, dw, num_dw);
134
135 mme_pos += num_dw;
136
137 free(dw);
138 }
139
140 if (pdev->info.cls_eng3d >= TURING_A)
141 P_IMMD(p, NVC597, SET_MME_DATA_FIFO_CONFIG, FIFO_SIZE_SIZE_4KB);
142
143 /* Enable FP helper invocation memory loads
144 *
145 * For generations with firmware support for our `SET_PRIV_REG` mme method
146 * we simply use that. On older generations we'll let the kernel do it.
147 * Starting with GSP we have to do it via the firmware anyway.
148 *
149 * This clears bit 3 of gr_gpcs_tpcs_sm_disp_ctrl
150 *
151 * Without it,
152 * dEQP-VK.subgroups.vote.frag_helper.subgroupallequal_bvec2_fragment will
153 * occasionally fail.
154 */
155 if (pdev->info.cls_eng3d >= MAXWELL_B) {
156 unsigned reg = pdev->info.cls_eng3d >= VOLTA_A ? 0x419ba4 : 0x419f78;
157 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_SET_PRIV_REG));
158 P_INLINE_DATA(p, 0);
159 P_INLINE_DATA(p, BITFIELD_BIT(3));
160 P_INLINE_DATA(p, reg);
161 }
162
163 /* Disable Out Of Range Address exceptions
164 *
165 * From the SPH documentation:
166 *
167 * "The SPH fields StoreReqStart and StoreReqEnd set a range of
168 * attributes whose corresponding Odmap values of ST or ST_LAST are
169 * treated as ST_REQ. Normally, for an attribute whose Omap bit is TRUE
170 * and Odmap value is ST, when the shader writes data to this output, it
171 * can not count on being able to read it back, since the next
172 * downstream shader might have its Imap bit FALSE, thereby causing the
173 * Bmap bit to be FALSE. By including a ST type of attribute in the
174 * range of StoreReqStart and StoreReqEnd, the attribute’s Odmap value
175 * is treated as ST_REQ, so an Omap bit being TRUE causes the Bmap bit
176 * to be TRUE. This guarantees the shader program can output the value
177 * and then read it back later. This will save register space."
178 *
179 * It's unclear exactly what's going on but this seems to imply that the
180 * hardware actually ANDs the output mask of one shader stage together with
181 * the input mask of the subsequent shader stage to determine which values
182 * are actually used.
183 *
184 * In the case where we have an empty fragment shader, it seems the hardware
185 * doesn't allocate any output memory for the final geometry stage at all, so
186 * any writes to outputs from the final shader stage generate an Out Of
187 * Range Address exception. We could fix this by eliminating unused
188 * outputs via cross-stage linking but that won't work in the case of
189 * VK_EXT_shader_object and VK_EXT_graphics_pipeline_library fast-link.
190 * Instead, the easiest solution is to just disable the exception.
191 *
192 * NOTE (Faith):
193 *
194 * The above analysis is 100% conjecture on my part, based on a creative
195 * reading of the SPH docs and what I saw when trying to run certain
196 * OpenGL CTS tests on NVK + Zink. Without access to NVIDIA HW
197 * engineers, I have no way of verifying this analysis.
198 *
199 * The CTS test in question is:
200 *
201 * KHR-GL46.tessellation_shader.tessellation_control_to_tessellation_evaluation.gl_tessLevel
202 *
203 * This should also prevent any issues with array overruns on I/O arrays.
204 * Before, they would get an exception and kill the context whereas now
205 * they should gently get ignored.
206 *
207 * This clears bit 14 of gr_gpcs_tpcs_sms_hww_warp_esr_report_mask
208 */
209 if (pdev->info.cls_eng3d >= MAXWELL_B) {
210 unsigned reg = pdev->info.cls_eng3d >= VOLTA_A ? 0x419ea8 : 0x419e44;
211 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_SET_PRIV_REG));
212 P_INLINE_DATA(p, 0);
213 P_INLINE_DATA(p, BITFIELD_BIT(14));
214 P_INLINE_DATA(p, reg);
215 }
216
217 /* Set CONSERVATIVE_RASTER_STATE to an invalid value, to ensure the
218 * hardware reg is always set the first time conservative rasterization
219 * is enabled */
220 P_IMMD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_CONSERVATIVE_RASTER_STATE),
221 ~0);
222
223 /* Initialize tessellation parameters */
224 P_IMMD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_TESS_PARAMS), 0);
225 P_IMMD(p, NV9097, SET_TESSELLATION_PARAMETERS, {});
226
227 P_IMMD(p, NV9097, SET_RENDER_ENABLE_C, MODE_TRUE);
228
229 P_IMMD(p, NV9097, SET_Z_COMPRESSION, ENABLE_TRUE);
230 P_MTHD(p, NV9097, SET_COLOR_COMPRESSION(0));
231 for (unsigned i = 0; i < 8; i++)
232 P_NV9097_SET_COLOR_COMPRESSION(p, i, ENABLE_TRUE);
233
234 P_IMMD(p, NV9097, SET_CT_SELECT, { .target_count = 1 });
235
236 // P_MTHD(cmd->push, NVC0_3D, CSAA_ENABLE);
237 // P_INLINE_DATA(cmd->push, 0);
238
239 P_IMMD(p, NV9097, SET_ALIASED_LINE_WIDTH_ENABLE, V_TRUE);
240
241 P_IMMD(p, NV9097, SET_DA_PRIMITIVE_RESTART_VERTEX_ARRAY, ENABLE_FALSE);
242
243 P_IMMD(p, NV9097, SET_BLEND_SEPARATE_FOR_ALPHA, ENABLE_TRUE);
244 P_IMMD(p, NV9097, SET_SINGLE_CT_WRITE_CONTROL, ENABLE_TRUE);
245 P_IMMD(p, NV9097, SET_SINGLE_ROP_CONTROL, ENABLE_FALSE);
246 P_IMMD(p, NV9097, SET_TWO_SIDED_STENCIL_TEST, ENABLE_TRUE);
247
248 P_IMMD(p, NV9097, SET_SHADE_MODE, V_OGL_SMOOTH);
249
250 P_IMMD(p, NV9097, SET_API_VISIBLE_CALL_LIMIT, V__128);
251
252 P_IMMD(p, NV9097, SET_ZCULL_STATS, ENABLE_TRUE);
253
254 P_IMMD(p, NV9097, SET_L1_CONFIGURATION,
255 DIRECTLY_ADDRESSABLE_MEMORY_SIZE_48KB);
256
257 P_IMMD(p, NV9097, SET_REDUCE_COLOR_THRESHOLDS_ENABLE, V_FALSE);
258 P_IMMD(p, NV9097, SET_REDUCE_COLOR_THRESHOLDS_UNORM8, {
259 .all_covered_all_hit_once = 0xff,
260 });
261 P_MTHD(p, NV9097, SET_REDUCE_COLOR_THRESHOLDS_UNORM10);
262 P_NV9097_SET_REDUCE_COLOR_THRESHOLDS_UNORM10(p, {
263 .all_covered_all_hit_once = 0xff,
264 });
265 P_NV9097_SET_REDUCE_COLOR_THRESHOLDS_UNORM16(p, {
266 .all_covered_all_hit_once = 0xff,
267 });
268 P_NV9097_SET_REDUCE_COLOR_THRESHOLDS_FP11(p, {
269 .all_covered_all_hit_once = 0x3f,
270 });
271 P_NV9097_SET_REDUCE_COLOR_THRESHOLDS_FP16(p, {
272 .all_covered_all_hit_once = 0xff,
273 });
274 P_NV9097_SET_REDUCE_COLOR_THRESHOLDS_SRGB8(p, {
275 .all_covered_all_hit_once = 0xff,
276 });
277
278 if (pdev->info.cls_eng3d < VOLTA_A)
279 P_IMMD(p, NV9097, SET_ALPHA_FRACTION, 0x3f);
280
281 P_IMMD(p, NV9097, CHECK_SPH_VERSION, {
282 .current = 3,
283 .oldest_supported = 3,
284 });
285 P_IMMD(p, NV9097, CHECK_AAM_VERSION, {
286 .current = 2,
287 .oldest_supported = 2,
288 });
289
290 if (pdev->info.cls_eng3d < MAXWELL_A)
291 P_IMMD(p, NV9097, SET_SHADER_SCHEDULING, MODE_OLDEST_THREAD_FIRST);
292
293 P_IMMD(p, NV9097, SET_L2_CACHE_CONTROL_FOR_ROP_PREFETCH_READ_REQUESTS,
294 POLICY_EVICT_NORMAL);
295 P_IMMD(p, NV9097, SET_L2_CACHE_CONTROL_FOR_ROP_NONINTERLOCKED_READ_REQUESTS,
296 POLICY_EVICT_NORMAL);
297 P_IMMD(p, NV9097, SET_L2_CACHE_CONTROL_FOR_ROP_INTERLOCKED_READ_REQUESTS,
298 POLICY_EVICT_NORMAL);
299 P_IMMD(p, NV9097, SET_L2_CACHE_CONTROL_FOR_ROP_NONINTERLOCKED_WRITE_REQUESTS,
300 POLICY_EVICT_NORMAL);
301 P_IMMD(p, NV9097, SET_L2_CACHE_CONTROL_FOR_ROP_INTERLOCKED_WRITE_REQUESTS,
302 POLICY_EVICT_NORMAL);
303
304 P_IMMD(p, NV9097, SET_BLEND_PER_FORMAT_ENABLE, SNORM8_UNORM16_SNORM16_TRUE);
305
306 P_IMMD(p, NV9097, SET_ATTRIBUTE_DEFAULT, {
307 .color_front_diffuse = COLOR_FRONT_DIFFUSE_VECTOR_0001,
308 .color_front_specular = COLOR_FRONT_SPECULAR_VECTOR_0001,
309 .generic_vector = GENERIC_VECTOR_VECTOR_0001,
310 .fixed_fnc_texture = FIXED_FNC_TEXTURE_VECTOR_0001,
311 .dx9_color0 = DX9_COLOR0_VECTOR_0001,
312 .dx9_color1_to_color15 = DX9_COLOR1_TO_COLOR15_VECTOR_0000,
313 });
314
315 P_IMMD(p, NV9097, SET_DA_OUTPUT, VERTEX_ID_USES_ARRAY_START_TRUE);
316
317 P_IMMD(p, NV9097, SET_RENDER_ENABLE_CONTROL,
318 CONDITIONAL_LOAD_CONSTANT_BUFFER_FALSE);
319
320 P_IMMD(p, NV9097, SET_PS_OUTPUT_SAMPLE_MASK_USAGE, {
321 .enable = ENABLE_TRUE,
322 .qualify_by_anti_alias_enable = QUALIFY_BY_ANTI_ALIAS_ENABLE_ENABLE,
323 });
324
325 if (pdev->info.cls_eng3d < VOLTA_A)
326 P_IMMD(p, NV9097, SET_PRIM_CIRCULAR_BUFFER_THROTTLE, 0x3fffff);
327
328 P_IMMD(p, NV9097, SET_BLEND_OPT_CONTROL, ALLOW_FLOAT_PIXEL_KILLS_TRUE);
329 P_IMMD(p, NV9097, SET_BLEND_FLOAT_OPTION, ZERO_TIMES_ANYTHING_IS_ZERO_TRUE);
330 P_IMMD(p, NV9097, SET_BLEND_STATE_PER_TARGET, ENABLE_TRUE);
331
332 if (pdev->info.cls_eng3d < MAXWELL_A)
333 P_IMMD(p, NV9097, SET_MAX_TI_WARPS_PER_BATCH, 3);
334
335 if (pdev->info.cls_eng3d >= KEPLER_A &&
336 pdev->info.cls_eng3d < MAXWELL_A) {
337 P_IMMD(p, NVA097, SET_TEXTURE_INSTRUCTION_OPERAND,
338 ORDERING_KEPLER_ORDER);
339 }
340
341 P_IMMD(p, NV9097, SET_ALPHA_TEST, ENABLE_FALSE);
342 P_IMMD(p, NV9097, SET_TWO_SIDED_LIGHT, ENABLE_FALSE);
343 P_IMMD(p, NV9097, SET_COLOR_CLAMP, ENABLE_TRUE);
344 P_IMMD(p, NV9097, SET_PS_SATURATE, {
345 .output0 = OUTPUT0_FALSE,
346 .output1 = OUTPUT1_FALSE,
347 .output2 = OUTPUT2_FALSE,
348 .output3 = OUTPUT3_FALSE,
349 .output4 = OUTPUT4_FALSE,
350 .output5 = OUTPUT5_FALSE,
351 .output6 = OUTPUT6_FALSE,
352 .output7 = OUTPUT7_FALSE,
353 });
354
355 P_IMMD(p, NV9097, SET_POINT_SIZE, fui(1.0));
356 P_IMMD(p, NV9097, SET_ATTRIBUTE_POINT_SIZE, { .enable = ENABLE_TRUE });
357
358 /* From the Vulkan spec's description of point rasterization:
359 * "Point rasterization produces a fragment for each fragment area group of
360 * framebuffer pixels with one or more sample points that intersect a region
361 * centered at the point’s (xf,yf).
362 * This region is a square with side equal to the current point size.
363 * ... (xf,yf) is the exact, unrounded framebuffer coordinate of the vertex
364 * for the point"
365 *
366 * So it seems we always need square points with PointCoords like OpenGL
367 * point sprites.
368 *
369 * From OpenGL compatibility spec:
370 * Basic point rasterization:
371 * "If point sprites are enabled, then point rasterization produces a
372 * fragment for each framebuffer pixel whose center lies inside a square
373 * centered at the point’s (xw, yw), with side length equal to the current
374 * point size.
375 * ... and xw and yw are the exact, unrounded window coordinates of the
376 * vertex for the point"
377 *
378 * And Point multisample rasterization:
379 * "This region is a circle having diameter equal to the current point width
380 * if POINT_SPRITE is disabled, or a square with side equal to the current
381 * point width if POINT_SPRITE is enabled."
382 */
383 P_IMMD(p, NV9097, SET_POINT_SPRITE, ENABLE_TRUE);
384 P_IMMD(p, NV9097, SET_POINT_SPRITE_SELECT, {
385 .rmode = RMODE_ZERO,
386 .origin = ORIGIN_TOP,
387 .texture0 = TEXTURE0_PASSTHROUGH,
388 .texture1 = TEXTURE1_PASSTHROUGH,
389 .texture2 = TEXTURE2_PASSTHROUGH,
390 .texture3 = TEXTURE3_PASSTHROUGH,
391 .texture4 = TEXTURE4_PASSTHROUGH,
392 .texture5 = TEXTURE5_PASSTHROUGH,
393 .texture6 = TEXTURE6_PASSTHROUGH,
394 .texture7 = TEXTURE7_PASSTHROUGH,
395 .texture8 = TEXTURE8_PASSTHROUGH,
396 .texture9 = TEXTURE9_PASSTHROUGH,
397 });
398
399 /* OpenGL's GL_POINT_SMOOTH */
400 P_IMMD(p, NV9097, SET_ANTI_ALIASED_POINT, ENABLE_FALSE);
401
402 if (pdev->info.cls_eng3d >= MAXWELL_B)
403 P_IMMD(p, NVB197, SET_FILL_VIA_TRIANGLE, MODE_DISABLED);
404
405 P_IMMD(p, NV9097, SET_POLY_SMOOTH, ENABLE_FALSE);
406
407 P_IMMD(p, NV9097, SET_VIEWPORT_PIXEL, CENTER_AT_HALF_INTEGERS);
408
409 P_IMMD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_ANTI_ALIAS),
410 nvk_mme_anti_alias_init());
411
412 /* Enable multisample rasterization even for single-sample rasterization;
413 * this way we get strict lines and rectangular line support.
414 * For more info, see the DirectX rasterization rules.
415 */
416 P_IMMD(p, NV9097, SET_ANTI_ALIAS_ENABLE, V_TRUE);
417
418 if (pdev->info.cls_eng3d >= MAXWELL_B) {
419 P_IMMD(p, NVB197, SET_POST_PS_INITIAL_COVERAGE, true);
420 P_IMMD(p, NVB197, SET_OFFSET_RENDER_TARGET_INDEX,
421 BY_VIEWPORT_INDEX_FALSE);
422 }
423
424 /* TODO: Vertex runout */
425
426 P_IMMD(p, NV9097, SET_WINDOW_ORIGIN, {
427 .mode = MODE_UPPER_LEFT,
428 .flip_y = FLIP_Y_FALSE,
429 });
430
431 P_MTHD(p, NV9097, SET_WINDOW_OFFSET_X);
432 P_NV9097_SET_WINDOW_OFFSET_X(p, 0);
433 P_NV9097_SET_WINDOW_OFFSET_Y(p, 0);
434
435 P_IMMD(p, NV9097, SET_ACTIVE_ZCULL_REGION, 0x3f);
436 P_IMMD(p, NV9097, SET_WINDOW_CLIP_ENABLE, V_FALSE);
437 P_IMMD(p, NV9097, SET_CLIP_ID_TEST, ENABLE_FALSE);
438
439 // P_IMMD(p, NV9097, X_X_X_SET_CLEAR_CONTROL, {
440 // .respect_stencil_mask = RESPECT_STENCIL_MASK_FALSE,
441 // .use_clear_rect = USE_CLEAR_RECT_FALSE,
442 // });
443
444 P_IMMD(p, NV9097, SET_VIEWPORT_SCALE_OFFSET, ENABLE_TRUE);
445
446 P_IMMD(p, NV9097, SET_VIEWPORT_CLIP_CONTROL, {
447 .min_z_zero_max_z_one = MIN_Z_ZERO_MAX_Z_ONE_FALSE,
448 .pixel_min_z = PIXEL_MIN_Z_CLAMP,
449 .pixel_max_z = PIXEL_MAX_Z_CLAMP,
450 .geometry_guardband = GEOMETRY_GUARDBAND_SCALE_256,
451 .line_point_cull_guardband = LINE_POINT_CULL_GUARDBAND_SCALE_256,
452 .geometry_clip = GEOMETRY_CLIP_WZERO_CLIP,
453 .geometry_guardband_z = GEOMETRY_GUARDBAND_Z_SAME_AS_XY_GUARDBAND,
454 });
455
456 for (unsigned i = 0; i < 16; i++)
457 P_IMMD(p, NV9097, SET_SCISSOR_ENABLE(i), V_FALSE);
458
459 P_IMMD(p, NV9097, SET_CT_MRT_ENABLE, V_TRUE);
460
461 if (pdev->info.cls_eng3d < VOLTA_A) {
462 uint64_t shader_base_addr =
463 nvk_heap_contiguous_base_address(&dev->shader_heap);
464
465 P_MTHD(p, NV9097, SET_PROGRAM_REGION_A);
466 P_NV9097_SET_PROGRAM_REGION_A(p, shader_base_addr >> 32);
467 P_NV9097_SET_PROGRAM_REGION_B(p, shader_base_addr);
468 }
469
470 for (uint32_t group = 0; group < 5; group++) {
471 for (uint32_t slot = 0; slot < 16; slot++) {
472 P_IMMD(p, NV9097, BIND_GROUP_CONSTANT_BUFFER(group), {
473 .valid = VALID_FALSE,
474 .shader_slot = slot,
475 });
476 }
477 }
478
479 // P_MTHD(cmd->push, NVC0_3D, MACRO_GP_SELECT);
480 // P_INLINE_DATA(cmd->push, 0x40);
481 P_IMMD(p, NV9097, SET_RT_LAYER, {
482 .v = 0,
483 .control = CONTROL_V_SELECTS_LAYER,
484 });
485 // P_MTHD(cmd->push, NVC0_3D, MACRO_TEP_SELECT;
486 // P_INLINE_DATA(cmd->push, 0x30);
487
488 P_IMMD(p, NV9097, SET_POINT_CENTER_MODE, V_OGL);
489 P_IMMD(p, NV9097, SET_EDGE_FLAG, V_TRUE);
490 P_IMMD(p, NV9097, SET_SAMPLER_BINDING, V_INDEPENDENTLY);
491
492 uint64_t zero_addr = dev->zero_page->va->addr;
493 P_MTHD(p, NV9097, SET_VERTEX_STREAM_SUBSTITUTE_A);
494 P_NV9097_SET_VERTEX_STREAM_SUBSTITUTE_A(p, zero_addr >> 32);
495 P_NV9097_SET_VERTEX_STREAM_SUBSTITUTE_B(p, zero_addr);
496
497 P_MTHD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_VB_ENABLES));
498 P_NV9097_SET_MME_SHADOW_SCRATCH(p, NVK_MME_SCRATCH_VB_ENABLES, 0);
499 for (uint32_t b = 0; b < 32; b++) {
500 P_IMMD(p, NV9097, SET_VERTEX_STREAM_A_FORMAT(b), {
501 .enable = false,
502 });
503 }
504
505 if (pdev->info.cls_eng3d >= FERMI_A &&
506 pdev->info.cls_eng3d < MAXWELL_A) {
507 assert(dev->vab_memory);
508 uint64_t vab_addr = dev->vab_memory->va->addr;
509 P_MTHD(p, NV9097, SET_VAB_MEMORY_AREA_A);
510 P_NV9097_SET_VAB_MEMORY_AREA_A(p, vab_addr >> 32);
511 P_NV9097_SET_VAB_MEMORY_AREA_B(p, vab_addr);
512 P_NV9097_SET_VAB_MEMORY_AREA_C(p, SIZE_BYTES_256K);
513 }
514
515 if (pdev->info.cls_eng3d == MAXWELL_A)
516 P_IMMD(p, NVB097, SET_SELECT_MAXWELL_TEXTURE_HEADERS, V_TRUE);
517
518 /* Store the address to CB0 in a pair of state registers */
519 uint64_t cb0_addr = queue->draw_cb0->va->addr;
520 P_MTHD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_CB0_ADDR_HI));
521 P_NV9097_SET_MME_SHADOW_SCRATCH(p, NVK_MME_SCRATCH_CB0_ADDR_HI, cb0_addr >> 32);
522 P_NV9097_SET_MME_SHADOW_SCRATCH(p, NVK_MME_SCRATCH_CB0_ADDR_LO, cb0_addr);
523
524 /* Store the address to the zero page in a pair of state registers */
525 P_MTHD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_ZERO_ADDR_HI));
526 P_NV9097_SET_MME_SHADOW_SCRATCH(p, NVK_MME_SCRATCH_ZERO_ADDR_HI, zero_addr >> 32);
527 P_NV9097_SET_MME_SHADOW_SCRATCH(p, NVK_MME_SCRATCH_ZERO_ADDR_LO, zero_addr);
528
529 /* We leave CB0 selected by default */
530 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_SELECT_CB0));
531 P_INLINE_DATA(p, 0);
532
533 /* Bind CB0 to all shader groups */
534 for (uint32_t group = 0; group < 5; group++) {
535 P_IMMD(p, NV9097, BIND_GROUP_CONSTANT_BUFFER(group), {
536 .valid = VALID_TRUE,
537 .shader_slot = 0,
538 });
539 }
540
541 /* Zero out CB0 */
542 P_1INC(p, NV9097, LOAD_CONSTANT_BUFFER_OFFSET);
543 P_NV9097_LOAD_CONSTANT_BUFFER_OFFSET(p, 0);
544 for (uint32_t dw = 0; dw < NVK_DRAW_CB0_SIZE / 4; dw++)
545 P_INLINE_DATA(p, 0);
546
547 /* These are shadowed in cb0 so they need to be zeroed as well for
548 * consistency.
549 */
550 P_IMMD(p, NV9097, SET_GLOBAL_BASE_INSTANCE_INDEX, 0);
551 P_MTHD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_CB0_FIRST_VERTEX));
552 P_NV9097_SET_MME_SHADOW_SCRATCH(p, NVK_MME_SCRATCH_CB0_FIRST_VERTEX, 0);
553 P_NV9097_SET_MME_SHADOW_SCRATCH(p, NVK_MME_SCRATCH_CB0_DRAW_INDEX, 0);
554 P_NV9097_SET_MME_SHADOW_SCRATCH(p, NVK_MME_SCRATCH_CB0_VIEW_INDEX, 0);
555
556 return VK_SUCCESS;
557 }
558
559 static void
560 nvk_cmd_buffer_dirty_render_pass(struct nvk_cmd_buffer *cmd)
561 {
562 struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
563
564 /* These depend on color attachment count */
565 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES);
566 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_ENABLES);
567 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_EQUATIONS);
568 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_WRITE_MASKS);
569
570 /* These depend on the depth/stencil format */
571 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_TEST_ENABLE);
572 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_WRITE_ENABLE);
573 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE);
574 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE);
575
576 /* This may depend on render targets for ESO */
577 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES);
578
579 /* This may depend on render targets */
580 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_COLOR_ATTACHMENT_MAP);
581 }
582
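/* Flushes a dirty byte range of the root descriptor table into the currently
 * selected constant buffer, which NVK leaves pointed at cb0. The range is
 * widened to dword granularity since LOAD_CONSTANT_BUFFER works in dwords.
 */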
583 static void
584 nvk_cmd_flush_gfx_root_desc(struct nvk_cmd_buffer *cmd,
585 struct nvk_descriptor_state *desc,
586 size_t offset, size_t size)
587 {
588 const uint32_t start_dw = offset / 4;
589 const uint32_t end_dw = DIV_ROUND_UP(offset + size, 4);
590 const uint32_t len_dw = end_dw - start_dw;
591
592 struct nv_push *p = nvk_cmd_buffer_push(cmd, 2 + len_dw);
593 P_1INC(p, NV9097, LOAD_CONSTANT_BUFFER_OFFSET);
594 P_NV9097_LOAD_CONSTANT_BUFFER_OFFSET(p, start_dw * 4);
595
596 const uint32_t *root_dw = (uint32_t *)desc->root;
597 P_INLINE_ARRAY(p, &root_dw[start_dw], len_dw);
598 }
599
600 void
601 nvk_cmd_buffer_begin_graphics(struct nvk_cmd_buffer *cmd,
602 const VkCommandBufferBeginInfo *pBeginInfo)
603 {
604 if (cmd->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
605 struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
606 P_MTHD(p, NV9097, INVALIDATE_SAMPLER_CACHE_NO_WFI);
607 P_NV9097_INVALIDATE_SAMPLER_CACHE_NO_WFI(p, {
608 .lines = LINES_ALL,
609 });
610 P_NV9097_INVALIDATE_TEXTURE_HEADER_CACHE_NO_WFI(p, {
611 .lines = LINES_ALL,
612 });
613
614 P_IMMD(p, NVA097, INVALIDATE_SHADER_CACHES_NO_WFI, {
615 .constant = CONSTANT_TRUE,
616 });
617 }
618
619 cmd->state.gfx.descriptors.flush_root = nvk_cmd_flush_gfx_root_desc;
620
621 if (cmd->vk.level != VK_COMMAND_BUFFER_LEVEL_PRIMARY &&
622 (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT)) {
623 char gcbiar_data[VK_GCBIARR_DATA_SIZE(NVK_MAX_RTS)];
624 const VkRenderingInfo *resume_info =
625 vk_get_command_buffer_inheritance_as_rendering_resume(cmd->vk.level,
626 pBeginInfo,
627 gcbiar_data);
628 if (resume_info) {
629 nvk_CmdBeginRendering(nvk_cmd_buffer_to_handle(cmd), resume_info);
630 } else {
631 const VkCommandBufferInheritanceRenderingInfo *inheritance_info =
632 vk_get_command_buffer_inheritance_rendering_info(cmd->vk.level,
633 pBeginInfo);
634 assert(inheritance_info);
635
636 struct nvk_rendering_state *render = &cmd->state.gfx.render;
637 render->flags = inheritance_info->flags;
638 render->area = (VkRect2D) { };
639 render->layer_count = 0;
640 render->view_mask = inheritance_info->viewMask;
641 render->samples = inheritance_info->rasterizationSamples;
642
643 render->color_att_count = inheritance_info->colorAttachmentCount;
644 for (uint32_t i = 0; i < render->color_att_count; i++) {
645 render->color_att[i].vk_format =
646 inheritance_info->pColorAttachmentFormats[i];
647 }
648 render->depth_att.vk_format =
649 inheritance_info->depthAttachmentFormat;
650 render->stencil_att.vk_format =
651 inheritance_info->stencilAttachmentFormat;
652
653 const VkRenderingAttachmentLocationInfoKHR att_loc_info_default = {
654 .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_LOCATION_INFO_KHR,
655 .colorAttachmentCount = inheritance_info->colorAttachmentCount,
656 };
657 const VkRenderingAttachmentLocationInfoKHR *att_loc_info =
658 vk_get_command_buffer_rendering_attachment_location_info(
659 cmd->vk.level, pBeginInfo);
660 if (att_loc_info == NULL)
661 att_loc_info = &att_loc_info_default;
662
663 vk_cmd_set_rendering_attachment_locations(&cmd->vk, att_loc_info);
664
665 nvk_cmd_buffer_dirty_render_pass(cmd);
666 }
667 }
668
669 cmd->state.gfx.shaders_dirty = ~0;
670 }
671
672 void
673 nvk_cmd_invalidate_graphics_state(struct nvk_cmd_buffer *cmd)
674 {
675 vk_dynamic_graphics_state_dirty_all(&cmd->vk.dynamic_graphics_state);
676
677 /* From the Vulkan 1.3.275 spec:
678 *
679 * "...There is one exception to this rule - if the primary command
680 * buffer is inside a render pass instance, then the render pass and
681 * subpass state is not disturbed by executing secondary command
682 * buffers."
683 *
684 * We need to reset everything EXCEPT the render pass state.
685 */
686 struct nvk_rendering_state render_save = cmd->state.gfx.render;
687 memset(&cmd->state.gfx, 0, sizeof(cmd->state.gfx));
688 cmd->state.gfx.render = render_save;
689
690 /* We need to keep the flush_root callback */
691 cmd->state.gfx.descriptors.flush_root = nvk_cmd_flush_gfx_root_desc;
692
693 cmd->state.gfx.shaders_dirty = ~0;
694 }
695
696 static void
697 nvk_attachment_init(struct nvk_attachment *att,
698 const VkRenderingAttachmentInfo *info)
699 {
700 if (info == NULL || info->imageView == VK_NULL_HANDLE) {
701 *att = (struct nvk_attachment) { .iview = NULL, };
702 return;
703 }
704
705 VK_FROM_HANDLE(nvk_image_view, iview, info->imageView);
706 *att = (struct nvk_attachment) {
707 .vk_format = iview->vk.format,
708 .iview = iview,
709 };
710
711 if (info->resolveMode != VK_RESOLVE_MODE_NONE) {
712 VK_FROM_HANDLE(nvk_image_view, res_iview, info->resolveImageView);
713 att->resolve_mode = info->resolveMode;
714 att->resolve_iview = res_iview;
715 }
716
717 att->store_op = info->storeOp;
718 }
719
720 static uint32_t
721 nil_to_nv9097_samples_mode(enum nil_sample_layout sample_layout)
722 {
723 #define MODE(S) [NIL_SAMPLE_LAYOUT_##S] = NV9097_SET_ANTI_ALIAS_SAMPLES_MODE_##S
724 uint16_t nil_to_nv9097[] = {
725 MODE(1X1),
726 MODE(2X1),
727 MODE(2X2),
728 MODE(4X2),
729 MODE(4X4),
730 };
731 #undef MODE
732 assert(sample_layout < ARRAY_SIZE(nil_to_nv9097));
733
734 return nil_to_nv9097[sample_layout];
735 }
736
737 VKAPI_ATTR void VKAPI_CALL
738 nvk_GetRenderingAreaGranularityKHR(
739 VkDevice device,
740 const VkRenderingAreaInfoKHR *pRenderingAreaInfo,
741 VkExtent2D *pGranularity)
742 {
743 *pGranularity = (VkExtent2D) { .width = 1, .height = 1 };
744 }
745
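/* Returns true if every attachment is a linear image (and there is no
 * depth/stencil attachment). When tiled and linear attachments are mixed,
 * nvk_CmdBeginRendering() instead renders to a tiled shadow copy of each
 * linear image and nvk_linear_render_copy() syncs it with the real image at
 * load/store time.
 */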
746 static bool
747 nvk_rendering_all_linear(const struct nvk_rendering_state *render)
748 {
749 /* Depth and stencil are never linear */
750 if (render->depth_att.iview || render->stencil_att.iview)
751 return false;
752
753 for (uint32_t i = 0; i < render->color_att_count; i++) {
754 const struct nvk_image_view *iview = render->color_att[i].iview;
755 if (iview == NULL)
756 continue;
757
758 const struct nvk_image *image = (struct nvk_image *)iview->vk.image;
759 const uint8_t ip = iview->planes[0].image_plane;
760 const struct nil_image_level *level =
761 &image->planes[ip].nil.levels[iview->vk.base_mip_level];
762
763 if (level->tiling.is_tiled)
764 return false;
765 }
766
767 return true;
768 }
769
770 VKAPI_ATTR void VKAPI_CALL
771 nvk_CmdBeginRendering(VkCommandBuffer commandBuffer,
772 const VkRenderingInfo *pRenderingInfo)
773 {
774 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
775 struct nvk_rendering_state *render = &cmd->state.gfx.render;
776
777 memset(render, 0, sizeof(*render));
778
779 render->flags = pRenderingInfo->flags;
780 render->area = pRenderingInfo->renderArea;
781 render->view_mask = pRenderingInfo->viewMask;
782 render->layer_count = pRenderingInfo->layerCount;
783 render->samples = 0;
784
785 const uint32_t layer_count =
786 render->view_mask ? util_last_bit(render->view_mask) :
787 render->layer_count;
788
789 render->color_att_count = pRenderingInfo->colorAttachmentCount;
790 for (uint32_t i = 0; i < render->color_att_count; i++) {
791 nvk_attachment_init(&render->color_att[i],
792 &pRenderingInfo->pColorAttachments[i]);
793 }
794
795 nvk_attachment_init(&render->depth_att,
796 pRenderingInfo->pDepthAttachment);
797 nvk_attachment_init(&render->stencil_att,
798 pRenderingInfo->pStencilAttachment);
799
800 render->all_linear = nvk_rendering_all_linear(render);
801
802 const VkRenderingAttachmentLocationInfoKHR ral_info = {
803 .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_LOCATION_INFO_KHR,
804 .colorAttachmentCount = pRenderingInfo->colorAttachmentCount,
805 };
806 vk_cmd_set_rendering_attachment_locations(&cmd->vk, &ral_info);
807
808 nvk_cmd_buffer_dirty_render_pass(cmd);
809
810 struct nv_push *p = nvk_cmd_buffer_push(cmd, NVK_MAX_RTS * 12 + 29);
811
812 P_IMMD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_VIEW_MASK),
813 render->view_mask);
814
815 P_MTHD(p, NV9097, SET_SURFACE_CLIP_HORIZONTAL);
816 P_NV9097_SET_SURFACE_CLIP_HORIZONTAL(p, {
817 .x = render->area.offset.x,
818 .width = render->area.extent.width,
819 });
820 P_NV9097_SET_SURFACE_CLIP_VERTICAL(p, {
821 .y = render->area.offset.y,
822 .height = render->area.extent.height,
823 });
824
825 enum nil_sample_layout sample_layout = NIL_SAMPLE_LAYOUT_INVALID;
826
827 /* We always emit SET_COLOR_TARGET_A(i) for every color target, regardless
828 * of the number of targets in the render pass. This ensures that we have
829 * no left over pointers from previous render passes in the hardware. This
830 * also allows us to point at any render target with SET_CT_SELECT and know
831 * that it's either a valid render target or NULL.
832 */
833 for (uint32_t i = 0; i < NVK_MAX_RTS; i++) {
834 if (render->color_att[i].iview) {
835 const struct nvk_image_view *iview = render->color_att[i].iview;
836 const struct nvk_image *image = (struct nvk_image *)iview->vk.image;
837 /* Rendering to multi-planar images is valid for a specific single
838 * plane only, so assert that what we have is a single plane, obtain
839 * its index, and begin rendering.
840 */
841 assert(iview->plane_count == 1);
842 const uint8_t ip = iview->planes[0].image_plane;
843 const struct nvk_image_plane *plane = &image->planes[ip];
844
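/* If this attachment is linear but we can't do an all-linear render, render
 * to the image's tiled shadow copy instead (see nvk_rendering_all_linear());
 * it gets synced with the real image at load/store time below and in
 * nvk_CmdEndRendering().
 */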
845 if (!render->all_linear && !plane->nil.levels[0].tiling.is_tiled)
846 plane = &image->linear_tiled_shadow;
847
848 const struct nil_image *nil_image = &plane->nil;
849 const struct nil_image_level *level =
850 &nil_image->levels[iview->vk.base_mip_level];
851 struct nil_Extent4D_Samples level_extent_sa =
852 nil_image_level_extent_sa(nil_image, iview->vk.base_mip_level);
853
854 assert(sample_layout == NIL_SAMPLE_LAYOUT_INVALID ||
855 sample_layout == nil_image->sample_layout);
856 sample_layout = nil_image->sample_layout;
857 render->samples = image->vk.samples;
858
859 uint64_t addr = nvk_image_plane_base_address(plane) + level->offset_B;
860
861 if (nil_image->dim == NIL_IMAGE_DIM_3D) {
862 addr += nil_image_level_z_offset_B(nil_image,
863 iview->vk.base_mip_level,
864 iview->vk.base_array_layer);
865 assert(layer_count <= iview->vk.extent.depth);
866 } else {
867 addr += iview->vk.base_array_layer *
868 (uint64_t)nil_image->array_stride_B;
869 assert(layer_count <= iview->vk.layer_count);
870 }
871
872 P_MTHD(p, NV9097, SET_COLOR_TARGET_A(i));
873 P_NV9097_SET_COLOR_TARGET_A(p, i, addr >> 32);
874 P_NV9097_SET_COLOR_TARGET_B(p, i, addr);
875
876 if (level->tiling.is_tiled) {
877 const enum pipe_format p_format =
878 vk_format_to_pipe_format(iview->vk.format);
879
880 /* We use the stride for depth/stencil targets because the Z/S
881 * hardware has no concept of a tile width. Instead, we just set
882 * the width to the stride divided by bpp.
883 */
884 const uint32_t row_stride_el =
885 level->row_stride_B / util_format_get_blocksize(p_format);
886 P_NV9097_SET_COLOR_TARGET_WIDTH(p, i, row_stride_el);
887 P_NV9097_SET_COLOR_TARGET_HEIGHT(p, i, level_extent_sa.height);
888 const uint8_t ct_format = nil_format_to_color_target(p_format);
889 P_NV9097_SET_COLOR_TARGET_FORMAT(p, i, ct_format);
890
891 P_NV9097_SET_COLOR_TARGET_MEMORY(p, i, {
892 .block_width = BLOCK_WIDTH_ONE_GOB,
893 .block_height = level->tiling.y_log2,
894 .block_depth = level->tiling.z_log2,
895 .layout = LAYOUT_BLOCKLINEAR,
896 .third_dimension_control = (nil_image->dim == NIL_IMAGE_DIM_3D) ?
897 THIRD_DIMENSION_CONTROL_THIRD_DIMENSION_DEFINES_DEPTH_SIZE :
898 THIRD_DIMENSION_CONTROL_THIRD_DIMENSION_DEFINES_ARRAY_SIZE,
899 });
900
901 P_NV9097_SET_COLOR_TARGET_THIRD_DIMENSION(p, i, layer_count);
902 P_NV9097_SET_COLOR_TARGET_ARRAY_PITCH(p, i,
903 nil_image->array_stride_B >> 2);
904 P_NV9097_SET_COLOR_TARGET_LAYER(p, i, 0);
905 } else {
906 /* NVIDIA can only render to 2D linear images */
907 assert(nil_image->dim == NIL_IMAGE_DIM_2D);
908 /* NVIDIA can only render to non-multisampled images */
909 assert(sample_layout == NIL_SAMPLE_LAYOUT_1X1);
910 /* NVIDIA doesn't support linear array images */
911 assert(iview->vk.base_array_layer == 0 && layer_count == 1);
912
913 uint32_t pitch = level->row_stride_B;
914 const enum pipe_format p_format =
915 vk_format_to_pipe_format(iview->vk.format);
916 /* When memory layout is set to LAYOUT_PITCH, the WIDTH field
917 * takes row pitch
918 */
919 P_NV9097_SET_COLOR_TARGET_WIDTH(p, i, pitch);
920 P_NV9097_SET_COLOR_TARGET_HEIGHT(p, i, level_extent_sa.height);
921
922 const uint8_t ct_format = nil_format_to_color_target(p_format);
923 P_NV9097_SET_COLOR_TARGET_FORMAT(p, i, ct_format);
924
925 P_NV9097_SET_COLOR_TARGET_MEMORY(p, i, {
926 .layout = LAYOUT_PITCH,
927 .third_dimension_control =
928 THIRD_DIMENSION_CONTROL_THIRD_DIMENSION_DEFINES_ARRAY_SIZE,
929 });
930
931 P_NV9097_SET_COLOR_TARGET_THIRD_DIMENSION(p, i, 1);
932 P_NV9097_SET_COLOR_TARGET_ARRAY_PITCH(p, i, 0);
933 P_NV9097_SET_COLOR_TARGET_LAYER(p, i, 0);
934 }
935
936 P_IMMD(p, NV9097, SET_COLOR_COMPRESSION(i), nil_image->compressed);
937 } else {
938 P_MTHD(p, NV9097, SET_COLOR_TARGET_A(i));
939 P_NV9097_SET_COLOR_TARGET_A(p, i, 0);
940 P_NV9097_SET_COLOR_TARGET_B(p, i, 0);
941 P_NV9097_SET_COLOR_TARGET_WIDTH(p, i, 64);
942 P_NV9097_SET_COLOR_TARGET_HEIGHT(p, i, 0);
943 P_NV9097_SET_COLOR_TARGET_FORMAT(p, i, V_DISABLED);
944 P_NV9097_SET_COLOR_TARGET_MEMORY(p, i, {
945 .layout = LAYOUT_BLOCKLINEAR,
946 });
947 P_NV9097_SET_COLOR_TARGET_THIRD_DIMENSION(p, i, layer_count);
948 P_NV9097_SET_COLOR_TARGET_ARRAY_PITCH(p, i, 0);
949 P_NV9097_SET_COLOR_TARGET_LAYER(p, i, 0);
950
951 P_IMMD(p, NV9097, SET_COLOR_COMPRESSION(i), ENABLE_TRUE);
952 }
953 }
954
955 if (render->depth_att.iview || render->stencil_att.iview) {
956 struct nvk_image_view *iview = render->depth_att.iview ?
957 render->depth_att.iview :
958 render->stencil_att.iview;
959 const struct nvk_image *image = (struct nvk_image *)iview->vk.image;
960 /* Depth/stencil are always single-plane */
961 assert(iview->plane_count == 1);
962 const uint8_t ip = iview->planes[0].image_plane;
963 struct nil_image nil_image = image->planes[ip].nil;
964
965 uint64_t addr = nvk_image_base_address(image, ip);
966 uint32_t mip_level = iview->vk.base_mip_level;
967 uint32_t base_array_layer = iview->vk.base_array_layer;
968
969 if (nil_image.dim == NIL_IMAGE_DIM_3D) {
970 uint64_t level_offset_B;
971 nil_image = nil_image_3d_level_as_2d_array(&nil_image, mip_level,
972 &level_offset_B);
973 addr += level_offset_B;
974 mip_level = 0;
975 base_array_layer = 0;
976 assert(layer_count <= iview->vk.extent.depth);
977 } else {
978 assert(layer_count <= iview->vk.layer_count);
979 }
980
981 const struct nil_image_level *level = &nil_image.levels[mip_level];
982 addr += level->offset_B;
983
984 assert(sample_layout == NIL_SAMPLE_LAYOUT_INVALID ||
985 sample_layout == nil_image.sample_layout);
986 sample_layout = nil_image.sample_layout;
987 render->samples = image->vk.samples;
988
989 P_MTHD(p, NV9097, SET_ZT_A);
990 P_NV9097_SET_ZT_A(p, addr >> 32);
991 P_NV9097_SET_ZT_B(p, addr);
992 const enum pipe_format p_format =
993 vk_format_to_pipe_format(iview->vk.format);
994 const uint8_t zs_format = nil_format_to_depth_stencil(p_format);
995 P_NV9097_SET_ZT_FORMAT(p, zs_format);
996 assert(level->tiling.is_tiled);
997 assert(level->tiling.z_log2 == 0);
998 P_NV9097_SET_ZT_BLOCK_SIZE(p, {
999 .width = WIDTH_ONE_GOB,
1000 .height = level->tiling.y_log2,
1001 .depth = DEPTH_ONE_GOB,
1002 });
1003 P_NV9097_SET_ZT_ARRAY_PITCH(p, nil_image.array_stride_B >> 2);
1004
1005 P_IMMD(p, NV9097, SET_ZT_SELECT, 1 /* target_count */);
1006
1007 struct nil_Extent4D_Samples level_extent_sa =
1008 nil_image_level_extent_sa(&nil_image, mip_level);
1009
1010 /* We use the stride for depth/stencil targets because the Z/S hardware
1011 * has no concept of a tile width. Instead, we just set the width to
1012 * the stride divided by bpp.
1013 */
1014 const uint32_t row_stride_el =
1015 level->row_stride_B / util_format_get_blocksize(p_format);
1016
1017 P_MTHD(p, NV9097, SET_ZT_SIZE_A);
1018 P_NV9097_SET_ZT_SIZE_A(p, row_stride_el);
1019 P_NV9097_SET_ZT_SIZE_B(p, level_extent_sa.height);
1020 P_NV9097_SET_ZT_SIZE_C(p, {
1021 .third_dimension = base_array_layer + layer_count,
1022 .control = CONTROL_THIRD_DIMENSION_DEFINES_ARRAY_SIZE,
1023 });
1024
1025 P_IMMD(p, NV9097, SET_ZT_LAYER, base_array_layer);
1026
1027 P_IMMD(p, NV9097, SET_Z_COMPRESSION, nil_image.compressed);
1028
1029 if (nvk_cmd_buffer_3d_cls(cmd) >= MAXWELL_B) {
1030 P_IMMD(p, NVC597, SET_ZT_SPARSE, {
1031 .enable = ENABLE_FALSE,
1032 });
1033 }
1034 } else {
1035 P_IMMD(p, NV9097, SET_ZT_SELECT, 0 /* target_count */);
1036 }
1037
1038 /* From the Vulkan 1.3.275 spec:
1039 *
1040 * "It is legal for a subpass to use no color or depth/stencil
1041 * attachments, either because it has no attachment references or
1042 * because all of them are VK_ATTACHMENT_UNUSED. This kind of subpass
1043 * can use shader side effects such as image stores and atomics to
1044 * produce an output. In this case, the subpass continues to use the
1045 * width, height, and layers of the framebuffer to define the dimensions
1046 * of the rendering area, and the rasterizationSamples from each
1047 * pipeline’s VkPipelineMultisampleStateCreateInfo to define the number
1048 * of samples used in rasterization;"
1049 *
1050 * In the case where we have attachments, we emit SET_ANTI_ALIAS here
1051 * because SET_COLOR_TARGET_* and SET_ZT_* don't have any other way of
1052 * specifying the sample layout and we want to ensure it matches. When
1053 * we don't have any attachments, we defer SET_ANTI_ALIAS to draw time
1054 * where we base it on dynamic rasterizationSamples.
1055 */
1056 if (sample_layout != NIL_SAMPLE_LAYOUT_INVALID) {
1057 P_IMMD(p, NV9097, SET_ANTI_ALIAS,
1058 nil_to_nv9097_samples_mode(sample_layout));
1059 }
1060
1061 if (render->flags & VK_RENDERING_RESUMING_BIT)
1062 return;
1063
1064 for (uint32_t i = 0; i < pRenderingInfo->colorAttachmentCount; i++) {
1065 const struct nvk_image_view *iview = render->color_att[i].iview;
1066 if (iview == NULL)
1067 continue;
1068
1069 const struct nvk_image *image = (struct nvk_image *)iview->vk.image;
1070 assert(iview->plane_count == 1);
1071 const uint8_t ip = iview->planes[0].image_plane;
1072 const struct nvk_image_plane *plane = &image->planes[ip];
1073
1074 const VkAttachmentLoadOp load_op =
1075 pRenderingInfo->pColorAttachments[i].loadOp;
1076 if (!render->all_linear && !plane->nil.levels[0].tiling.is_tiled &&
1077 load_op == VK_ATTACHMENT_LOAD_OP_LOAD)
1078 nvk_linear_render_copy(cmd, iview, render->area, true);
1079 }
1080
1081 uint32_t clear_count = 0;
1082 VkClearAttachment clear_att[NVK_MAX_RTS + 1];
1083 for (uint32_t i = 0; i < pRenderingInfo->colorAttachmentCount; i++) {
1084 const VkRenderingAttachmentInfo *att_info =
1085 &pRenderingInfo->pColorAttachments[i];
1086 if (att_info->imageView == VK_NULL_HANDLE ||
1087 att_info->loadOp != VK_ATTACHMENT_LOAD_OP_CLEAR)
1088 continue;
1089
1090 clear_att[clear_count++] = (VkClearAttachment) {
1091 .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
1092 .colorAttachment = i,
1093 .clearValue = att_info->clearValue,
1094 };
1095 }
1096
1097 clear_att[clear_count] = (VkClearAttachment) { .aspectMask = 0, };
1098 if (pRenderingInfo->pDepthAttachment != NULL &&
1099 pRenderingInfo->pDepthAttachment->imageView != VK_NULL_HANDLE &&
1100 pRenderingInfo->pDepthAttachment->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR) {
1101 clear_att[clear_count].aspectMask |= VK_IMAGE_ASPECT_DEPTH_BIT;
1102 clear_att[clear_count].clearValue.depthStencil.depth =
1103 pRenderingInfo->pDepthAttachment->clearValue.depthStencil.depth;
1104 }
1105 if (pRenderingInfo->pStencilAttachment != NULL &&
1106 pRenderingInfo->pStencilAttachment->imageView != VK_NULL_HANDLE &&
1107 pRenderingInfo->pStencilAttachment->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR) {
1108 clear_att[clear_count].aspectMask |= VK_IMAGE_ASPECT_STENCIL_BIT;
1109 clear_att[clear_count].clearValue.depthStencil.stencil =
1110 pRenderingInfo->pStencilAttachment->clearValue.depthStencil.stencil;
1111 }
1112 if (clear_att[clear_count].aspectMask != 0)
1113 clear_count++;
1114
1115 if (clear_count > 0) {
1116 const VkClearRect clear_rect = {
1117 .rect = render->area,
1118 .baseArrayLayer = 0,
1119 .layerCount = render->view_mask ? 1 : render->layer_count,
1120 };
1121
1122 P_MTHD(p, NV9097, SET_RENDER_ENABLE_OVERRIDE);
1123 P_NV9097_SET_RENDER_ENABLE_OVERRIDE(p, MODE_ALWAYS_RENDER);
1124
1125 nvk_CmdClearAttachments(nvk_cmd_buffer_to_handle(cmd),
1126 clear_count, clear_att, 1, &clear_rect);
1127 p = nvk_cmd_buffer_push(cmd, 2);
1128 P_MTHD(p, NV9097, SET_RENDER_ENABLE_OVERRIDE);
1129 P_NV9097_SET_RENDER_ENABLE_OVERRIDE(p, MODE_USE_RENDER_ENABLE);
1130 }
1131
1132 /* TODO: Attachment clears */
1133 }
1134
1135 VKAPI_ATTR void VKAPI_CALL
1136 nvk_CmdEndRendering(VkCommandBuffer commandBuffer)
1137 {
1138 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
1139 struct nvk_rendering_state *render = &cmd->state.gfx.render;
1140
1141 if (!(render->flags & VK_RENDERING_SUSPENDING_BIT)) {
1142 for (uint32_t i = 0; i < render->color_att_count; i++) {
1143 struct nvk_image_view *iview = render->color_att[i].iview;
1144 if (iview == NULL)
1145 continue;
1146
1147 struct nvk_image *image = (struct nvk_image *)iview->vk.image;
1148 const uint8_t ip = iview->planes[0].image_plane;
1149 const struct nvk_image_plane *plane = &image->planes[ip];
1150 if (!render->all_linear && !plane->nil.levels[0].tiling.is_tiled &&
1151 render->color_att[i].store_op == VK_ATTACHMENT_STORE_OP_STORE)
1152 nvk_linear_render_copy(cmd, iview, render->area, false);
1153 }
1154 }
1155
1156 bool need_resolve = false;
1157
1158 /* Translate render state back to VK for meta */
1159 VkRenderingAttachmentInfo vk_color_att[NVK_MAX_RTS];
1160 for (uint32_t i = 0; i < render->color_att_count; i++) {
1161 if (render->color_att[i].resolve_mode != VK_RESOLVE_MODE_NONE)
1162 need_resolve = true;
1163
1164 vk_color_att[i] = (VkRenderingAttachmentInfo) {
1165 .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO,
1166 .imageView = nvk_image_view_to_handle(render->color_att[i].iview),
1167 .imageLayout = VK_IMAGE_LAYOUT_GENERAL,
1168 .resolveMode = render->color_att[i].resolve_mode,
1169 .resolveImageView =
1170 nvk_image_view_to_handle(render->color_att[i].resolve_iview),
1171 .resolveImageLayout = VK_IMAGE_LAYOUT_GENERAL,
1172 };
1173 }
1174
1175 const VkRenderingAttachmentInfo vk_depth_att = {
1176 .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO,
1177 .imageView = nvk_image_view_to_handle(render->depth_att.iview),
1178 .imageLayout = VK_IMAGE_LAYOUT_GENERAL,
1179 .resolveMode = render->depth_att.resolve_mode,
1180 .resolveImageView =
1181 nvk_image_view_to_handle(render->depth_att.resolve_iview),
1182 .resolveImageLayout = VK_IMAGE_LAYOUT_GENERAL,
1183 };
1184 if (render->depth_att.resolve_mode != VK_RESOLVE_MODE_NONE)
1185 need_resolve = true;
1186
1187 const VkRenderingAttachmentInfo vk_stencil_att = {
1188 .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO,
1189 .imageView = nvk_image_view_to_handle(render->stencil_att.iview),
1190 .imageLayout = VK_IMAGE_LAYOUT_GENERAL,
1191 .resolveMode = render->stencil_att.resolve_mode,
1192 .resolveImageView =
1193 nvk_image_view_to_handle(render->stencil_att.resolve_iview),
1194 .resolveImageLayout = VK_IMAGE_LAYOUT_GENERAL,
1195 };
1196 if (render->stencil_att.resolve_mode != VK_RESOLVE_MODE_NONE)
1197 need_resolve = true;
1198
1199 const VkRenderingInfo vk_render = {
1200 .sType = VK_STRUCTURE_TYPE_RENDERING_INFO,
1201 .renderArea = render->area,
1202 .layerCount = render->layer_count,
1203 .viewMask = render->view_mask,
1204 .colorAttachmentCount = render->color_att_count,
1205 .pColorAttachments = vk_color_att,
1206 .pDepthAttachment = &vk_depth_att,
1207 .pStencilAttachment = &vk_stencil_att,
1208 };
1209
1210 if (render->flags & VK_RENDERING_SUSPENDING_BIT)
1211 need_resolve = false;
1212
1213 memset(render, 0, sizeof(*render));
1214
1215 if (need_resolve) {
1216 struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
1217 P_IMMD(p, NVA097, INVALIDATE_TEXTURE_DATA_CACHE, {
1218 .lines = LINES_ALL,
1219 });
1220
1221 nvk_meta_resolve_rendering(cmd, &vk_render);
1222 }
1223 }
1224
1225 void
1226 nvk_cmd_bind_graphics_shader(struct nvk_cmd_buffer *cmd,
1227 const gl_shader_stage stage,
1228 struct nvk_shader *shader)
1229 {
1230 assert(stage < ARRAY_SIZE(cmd->state.gfx.shaders));
1231 if (cmd->state.gfx.shaders[stage] == shader)
1232 return;
1233
1234 cmd->state.gfx.shaders[stage] = shader;
1235 cmd->state.gfx.shaders_dirty |= BITFIELD_BIT(stage);
1236 }
1237
1238 static uint32_t
1239 mesa_to_nv9097_shader_type(gl_shader_stage stage)
1240 {
1241 static const uint32_t mesa_to_nv9097[] = {
1242 [MESA_SHADER_VERTEX] = NV9097_SET_PIPELINE_SHADER_TYPE_VERTEX,
1243 [MESA_SHADER_TESS_CTRL] = NV9097_SET_PIPELINE_SHADER_TYPE_TESSELLATION_INIT,
1244 [MESA_SHADER_TESS_EVAL] = NV9097_SET_PIPELINE_SHADER_TYPE_TESSELLATION,
1245 [MESA_SHADER_GEOMETRY] = NV9097_SET_PIPELINE_SHADER_TYPE_GEOMETRY,
1246 [MESA_SHADER_FRAGMENT] = NV9097_SET_PIPELINE_SHADER_TYPE_PIXEL,
1247 };
1248 assert(stage < ARRAY_SIZE(mesa_to_nv9097));
1249 return mesa_to_nv9097[stage];
1250 }
1251
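/* Bind group for a graphics pipeline stage; each stage simply uses its own
 * gl_shader_stage index as its group.
 */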
1252 static uint32_t
1253 nvk_pipeline_bind_group(gl_shader_stage stage)
1254 {
1255 return stage;
1256 }
1257
1258 static uint32_t
1259 nvk_mme_tess_params(enum nak_ts_domain domain,
1260 enum nak_ts_spacing spacing,
1261 enum nak_ts_prims prims)
1262 {
1263 /* This is laid out the same as SET_TESSELLATION_PARAMETERS, only with an
1264 * extra bit for lower_left
1265 */
1266 uint16_t params = ((uint16_t)domain << 0) |
1267 ((uint16_t)spacing << 4) |
1268 ((uint16_t)prims << 8);
1269 return nvk_mme_val_mask(params, 0x0fff);
1270 }
1271
1272 static uint32_t
1273 nvk_mme_tess_lower_left(bool lower_left)
1274 {
1275 return nvk_mme_val_mask((uint16_t)lower_left << 12, 1u << 12);
1276 }
1277
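/* MME macro that merges a (value, mask) pair into the shadowed tessellation
 * parameters and, if anything changed, re-emits SET_TESSELLATION_PARAMETERS.
 * The lower_left bit (bit 12) is not part of the hardware method; instead it
 * flips TRIANGLES_CW <-> TRIANGLES_CCW before the parameters are written.
 */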
1278 void
1279 nvk_mme_set_tess_params(struct mme_builder *b)
1280 {
1281 struct mme_value val_mask = mme_load(b);
1282 struct mme_value old_params = nvk_mme_load_scratch(b, TESS_PARAMS);
1283 struct mme_value params = nvk_mme_set_masked(b, old_params, val_mask);
1284 mme_free_reg(b, val_mask);
1285
1286 mme_if(b, ine, params, old_params) {
1287 nvk_mme_store_scratch(b, TESS_PARAMS, params);
1288
1289 /* lower_left lives at bit 12 */
1290 struct mme_value lower_left = mme_merge(b, mme_zero(), params, 0, 1, 12);
1291
1292 /* Only the bottom 12 bits are valid to put in HW */
1293 mme_merge_to(b, params, mme_zero(), params, 0, 12, 0);
1294
1295 /* If we're using a lower-left orientation, we need to flip triangles
1296 * between CW and CCW.
1297 */
1298 mme_if(b, ine, lower_left, mme_zero()) {
1299 struct mme_value prims_cw = mme_imm(NAK_TS_PRIMS_TRIANGLES_CW);
1300 struct mme_value prims_ccw = mme_imm(NAK_TS_PRIMS_TRIANGLES_CCW);
1301
1302 struct mme_value prims = mme_merge(b, mme_zero(), params, 0, 4, 8);
1303 mme_if(b, ieq, prims, prims_cw) {
1304 mme_merge_to(b, params, params, prims_ccw, 8, 4, 0);
1305 }
1306 mme_if(b, ieq, prims, prims_ccw) {
1307 mme_merge_to(b, params, params, prims_cw, 8, 4, 0);
1308 }
1309 mme_free_reg(b, prims);
1310 }
1311 mme_free_reg(b, lower_left);
1312
1313 mme_mthd(b, NV9097_SET_TESSELLATION_PARAMETERS);
1314 mme_emit(b, params);
1315 }
1316 }
1317
1318 const struct nvk_mme_test_case nvk_mme_set_tess_params_tests[] = {{
1319 /* This case doesn't change the state so it should do nothing */
1320 .init = (struct nvk_mme_mthd_data[]) {
1321 { NVK_SET_MME_SCRATCH(TESS_PARAMS), 0 },
1322 { }
1323 },
1324 .params = (uint32_t[]) { 0xffff0000 },
1325 .expected = (struct nvk_mme_mthd_data[]) {
1326 { }
1327 },
1328 }, {
1329 /* TRIANGLE, INTEGER, TRIANGLES_CW, lower_left = false */
1330 .init = (struct nvk_mme_mthd_data[]) {
1331 { NVK_SET_MME_SCRATCH(TESS_PARAMS), 0 },
1332 { }
1333 },
1334 .params = (uint32_t[]) { 0xffff0201 },
1335 .expected = (struct nvk_mme_mthd_data[]) {
1336 { NVK_SET_MME_SCRATCH(TESS_PARAMS), 0x0201 },
1337 { NV9097_SET_TESSELLATION_PARAMETERS, 0x0201 },
1338 { }
1339 },
1340 }, {
1341 /* TRIANGLE, INTEGER, TRIANGLES_CW, lower_left = true */
1342 .init = (struct nvk_mme_mthd_data[]) {
1343 { NVK_SET_MME_SCRATCH(TESS_PARAMS), 0x0201 },
1344 { }
1345 },
1346 .params = (uint32_t[]) { 0x10001000 },
1347 .expected = (struct nvk_mme_mthd_data[]) {
1348 { NVK_SET_MME_SCRATCH(TESS_PARAMS), 0x1201 },
1349 { NV9097_SET_TESSELLATION_PARAMETERS, 0x0301 },
1350 { }
1351 },
1352 }, {
1353 /* TRIANGLE, INTEGER, TRIANGLES_CCW, lower_left = true */
1354 .init = (struct nvk_mme_mthd_data[]) {
1355 { NVK_SET_MME_SCRATCH(TESS_PARAMS), 0x0301 },
1356 { }
1357 },
1358 .params = (uint32_t[]) { 0x10001000 },
1359 .expected = (struct nvk_mme_mthd_data[]) {
1360 { NVK_SET_MME_SCRATCH(TESS_PARAMS), 0x1301 },
1361 { NV9097_SET_TESSELLATION_PARAMETERS, 0x0201 },
1362 { }
1363 },
1364 }, {}};
1365
1366 static uint32_t nvk_mme_anti_alias_min_sample_shading(float mss);
1367
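/* Emits all hardware state that depends on the currently bound graphics
 * shaders: per-type SET_PIPELINE_* bindings, tessellation parameters,
 * fragment-shader controls, transform feedback layout, and the
 * layer/user-clip state derived from the last vertex/tess/geometry stage.
 */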
1368 static void
1369 nvk_flush_shaders(struct nvk_cmd_buffer *cmd)
1370 {
1371 if (cmd->state.gfx.shaders_dirty == 0)
1372 return;
1373
1374 /* Map shader types to shaders */
1375 struct nvk_shader *type_shader[6] = { NULL, };
1376 uint32_t types_dirty = 0;
1377
1378 const uint32_t gfx_stages = BITFIELD_BIT(MESA_SHADER_VERTEX) |
1379 BITFIELD_BIT(MESA_SHADER_TESS_CTRL) |
1380 BITFIELD_BIT(MESA_SHADER_TESS_EVAL) |
1381 BITFIELD_BIT(MESA_SHADER_GEOMETRY) |
1382 BITFIELD_BIT(MESA_SHADER_FRAGMENT);
1383
1384 u_foreach_bit(stage, cmd->state.gfx.shaders_dirty & gfx_stages) {
1385 uint32_t type = mesa_to_nv9097_shader_type(stage);
1386 types_dirty |= BITFIELD_BIT(type);
1387
1388 /* Only copy non-NULL shaders because mesh/task alias with vertex and
1389 * tessellation stages.
1390 */
1391 struct nvk_shader *shader = cmd->state.gfx.shaders[stage];
1392 if (shader != NULL) {
1393 assert(type < ARRAY_SIZE(type_shader));
1394 assert(type_shader[type] == NULL);
1395 type_shader[type] = shader;
1396
1397 const struct nvk_cbuf_map *cbuf_map = &shader->cbuf_map;
1398 struct nvk_cbuf_group *cbuf_group =
1399 &cmd->state.gfx.cbuf_groups[nvk_cbuf_binding_for_stage(stage)];
1400 for (uint32_t i = 0; i < cbuf_map->cbuf_count; i++) {
1401 if (memcmp(&cbuf_group->cbufs[i], &cbuf_map->cbufs[i],
1402 sizeof(cbuf_group->cbufs[i])) != 0) {
1403 cbuf_group->cbufs[i] = cbuf_map->cbufs[i];
1404 cbuf_group->dirty |= BITFIELD_BIT(i);
1405 }
1406 }
1407 }
1408 }
1409
1410 u_foreach_bit(type, types_dirty) {
1411 struct nvk_shader *shader = type_shader[type];
1412
1413 /* We always map index == type */
1414 const uint32_t idx = type;
1415
1416 struct nv_push *p = nvk_cmd_buffer_push(cmd, 10);
1417 P_IMMD(p, NV9097, SET_PIPELINE_SHADER(idx), {
1418 .enable = shader != NULL,
1419 .type = type,
1420 });
1421
1422 if (shader == NULL)
1423 continue;
1424
1425 uint64_t addr = shader->hdr_addr;
1426 if (nvk_cmd_buffer_3d_cls(cmd) >= VOLTA_A) {
1427 P_MTHD(p, NVC397, SET_PIPELINE_PROGRAM_ADDRESS_A(idx));
1428 P_NVC397_SET_PIPELINE_PROGRAM_ADDRESS_A(p, idx, addr >> 32);
1429 P_NVC397_SET_PIPELINE_PROGRAM_ADDRESS_B(p, idx, addr);
1430 } else {
1431 assert(addr < 0xffffffff);
1432 P_IMMD(p, NV9097, SET_PIPELINE_PROGRAM(idx), addr);
1433 }
1434
1435 P_MTHD(p, NVC397, SET_PIPELINE_REGISTER_COUNT(idx));
1436 P_NVC397_SET_PIPELINE_REGISTER_COUNT(p, idx, shader->info.num_gprs);
1437 P_NVC397_SET_PIPELINE_BINDING(p, idx,
1438 nvk_pipeline_bind_group(shader->info.stage));
1439
1440 if (shader->info.stage == MESA_SHADER_TESS_EVAL) {
1441 P_1INC(p, NVB197, CALL_MME_MACRO(NVK_MME_SET_TESS_PARAMS));
1442 P_INLINE_DATA(p, nvk_mme_tess_params(shader->info.ts.domain,
1443 shader->info.ts.spacing,
1444 shader->info.ts.prims));
1445 }
1446
1447 if (shader->info.stage == MESA_SHADER_FRAGMENT) {
1448 p = nvk_cmd_buffer_push(cmd, 11);
1449
1450 P_MTHD(p, NVC397, SET_SUBTILING_PERF_KNOB_A);
1451 P_NV9097_SET_SUBTILING_PERF_KNOB_A(p, {
1452 .fraction_of_spm_register_file_per_subtile = 0x10,
1453 .fraction_of_spm_pixel_output_buffer_per_subtile = 0x40,
1454 .fraction_of_spm_triangle_ram_per_subtile = 0x16,
1455 .fraction_of_max_quads_per_subtile = 0x20,
1456 });
1457 P_NV9097_SET_SUBTILING_PERF_KNOB_B(p, 0x20);
1458
1459 P_IMMD(p, NV9097, SET_API_MANDATED_EARLY_Z,
1460 shader->info.fs.early_fragment_tests);
1461
1462 if (nvk_cmd_buffer_3d_cls(cmd) >= MAXWELL_B) {
1463 P_IMMD(p, NVB197, SET_POST_Z_PS_IMASK,
1464 shader->info.fs.post_depth_coverage);
1465 } else {
1466 assert(!shader->info.fs.post_depth_coverage);
1467 }
1468
1469 P_IMMD(p, NV9097, SET_ZCULL_BOUNDS, {
1470 .z_min_unbounded_enable = shader->info.fs.writes_depth,
1471 .z_max_unbounded_enable = shader->info.fs.writes_depth,
1472 });
1473
1474 P_1INC(p, NVB197, CALL_MME_MACRO(NVK_MME_SET_ANTI_ALIAS));
1475 P_INLINE_DATA(p,
1476 nvk_mme_anti_alias_min_sample_shading(shader->min_sample_shading));
1477 }
1478 }
1479
1480 const uint32_t vtg_stages = BITFIELD_BIT(MESA_SHADER_VERTEX) |
1481 BITFIELD_BIT(MESA_SHADER_TESS_EVAL) |
1482 BITFIELD_BIT(MESA_SHADER_GEOMETRY);
1483 const uint32_t vtgm_stages = vtg_stages | BITFIELD_BIT(MESA_SHADER_MESH);
1484
1485 if (cmd->state.gfx.shaders_dirty & vtg_stages) {
1486 struct nak_xfb_info *xfb = NULL;
1487 u_foreach_bit(stage, vtg_stages) {
1488 if (cmd->state.gfx.shaders[stage] != NULL)
1489 xfb = &cmd->state.gfx.shaders[stage]->info.vtg.xfb;
1490 }
1491
1492 if (xfb == NULL) {
1493 struct nv_push *p = nvk_cmd_buffer_push(cmd, 8);
1494 for (uint8_t b = 0; b < 4; b++)
1495 P_IMMD(p, NV9097, SET_STREAM_OUT_CONTROL_COMPONENT_COUNT(b), 0);
1496 } else {
1497 for (uint8_t b = 0; b < ARRAY_SIZE(xfb->attr_count); b++) {
1498 const uint8_t attr_count = xfb->attr_count[b];
1499 /* upload packed varying indices in multiples of 4 bytes */
1500 const uint32_t n = DIV_ROUND_UP(attr_count, 4);
1501
1502 struct nv_push *p = nvk_cmd_buffer_push(cmd, 5 + n);
1503
1504 P_MTHD(p, NV9097, SET_STREAM_OUT_CONTROL_STREAM(b));
1505 P_NV9097_SET_STREAM_OUT_CONTROL_STREAM(p, b, xfb->stream[b]);
1506 P_NV9097_SET_STREAM_OUT_CONTROL_COMPONENT_COUNT(p, b, attr_count);
1507 P_NV9097_SET_STREAM_OUT_CONTROL_STRIDE(p, b, xfb->stride[b]);
1508
1509 if (n > 0) {
1510 P_MTHD(p, NV9097, SET_STREAM_OUT_LAYOUT_SELECT(b, 0));
1511 P_INLINE_ARRAY(p, (const uint32_t*)xfb->attr_index[b], n);
1512 }
1513 }
1514 }
1515 }
1516
1517 if (cmd->state.gfx.shaders_dirty & vtgm_stages) {
1518 struct nvk_shader *last_vtgm = NULL;
1519 u_foreach_bit(stage, vtgm_stages) {
1520 if (cmd->state.gfx.shaders[stage] != NULL)
1521 last_vtgm = cmd->state.gfx.shaders[stage];
1522 }
1523
1524 struct nv_push *p = nvk_cmd_buffer_push(cmd, 6);
1525
1526 P_IMMD(p, NV9097, SET_RT_LAYER, {
1527 .v = 0,
1528 .control = last_vtgm->info.vtg.writes_layer ?
1529 CONTROL_GEOMETRY_SHADER_SELECTS_LAYER :
1530 CONTROL_V_SELECTS_LAYER,
1531 });
1532
1533 const uint8_t clip_enable = last_vtgm->info.vtg.clip_enable;
1534 const uint8_t cull_enable = last_vtgm->info.vtg.cull_enable;
1535 P_IMMD(p, NV9097, SET_USER_CLIP_ENABLE, {
1536 .plane0 = ((clip_enable | cull_enable) >> 0) & 1,
1537 .plane1 = ((clip_enable | cull_enable) >> 1) & 1,
1538 .plane2 = ((clip_enable | cull_enable) >> 2) & 1,
1539 .plane3 = ((clip_enable | cull_enable) >> 3) & 1,
1540 .plane4 = ((clip_enable | cull_enable) >> 4) & 1,
1541 .plane5 = ((clip_enable | cull_enable) >> 5) & 1,
1542 .plane6 = ((clip_enable | cull_enable) >> 6) & 1,
1543 .plane7 = ((clip_enable | cull_enable) >> 7) & 1,
1544 });
1545 P_IMMD(p, NV9097, SET_USER_CLIP_OP, {
1546 .plane0 = (cull_enable >> 0) & 1,
1547 .plane1 = (cull_enable >> 1) & 1,
1548 .plane2 = (cull_enable >> 2) & 1,
1549 .plane3 = (cull_enable >> 3) & 1,
1550 .plane4 = (cull_enable >> 4) & 1,
1551 .plane5 = (cull_enable >> 5) & 1,
1552 .plane6 = (cull_enable >> 6) & 1,
1553 .plane7 = (cull_enable >> 7) & 1,
1554 });
1555 }
1556
1557 cmd->state.gfx.shaders_dirty = 0;
1558 }
1559
1560 void
1561 nvk_mme_set_vb_enables(struct mme_builder *b)
1562 {
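/* Descriptive note on the logic below: the parameter is a bitmask with one
 * enable bit per vertex buffer.  We XOR it against the previously stored
 * mask and, for each binding whose enable actually changed, read back the
 * current SET_VERTEX_STREAM_A_FORMAT value and patch the enable bit (bit 12,
 * just above the 12-bit stride field) before writing it back.
 */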
1563 struct mme_value enables = mme_load(b);
1564 struct mme_value old_enables = nvk_mme_load_scratch(b, VB_ENABLES);
1565 nvk_mme_store_scratch(b, VB_ENABLES, enables);
1566
1567 struct mme_value changed = mme_xor(b, enables, old_enables);
1568 mme_free_reg(b, old_enables);
1569
1570 struct mme_value vb_idx4 = mme_mov(b, mme_zero());
1571 mme_while(b, ine, changed, mme_zero()) {
1572 mme_if(b, ine, mme_and(b, changed, mme_imm(1)), mme_zero()) {
1573 struct mme_value state =
1574 mme_state_arr(b, NV9097_SET_VERTEX_STREAM_A_FORMAT(0), vb_idx4);
1575 mme_merge_to(b, state, state, enables, 12, 1, 0);
1576 mme_mthd_arr(b, NV9097_SET_VERTEX_STREAM_A_FORMAT(0), vb_idx4);
1577 mme_emit(b, state);
1578 }
1579 mme_add_to(b, vb_idx4, vb_idx4, mme_imm(4));
1580 mme_srl_to(b, changed, changed, mme_imm(1));
1581 mme_srl_to(b, enables, enables, mme_imm(1));
1582 }
1583 }
1584
1585 static uint32_t
1586 nvk_mme_vb_stride(uint32_t vb_idx, uint32_t stride)
1587 {
1588 assert(stride < (1 << 12));
1589 assert(vb_idx < (1 << 5));
1590 return (vb_idx << 16) | stride;
1591 }
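/* For example (illustrative values only): nvk_mme_vb_stride(3, 16) packs to
 * (3 << 16) | 16 = 0x00030010, which NVK_MME_SET_VB_STRIDE below unpacks back
 * into the vertex-buffer index and the 12-bit stride.
 */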
1592
1593 void
1594 nvk_mme_set_vb_stride(struct mme_builder *b)
1595 {
1596 /* Param is laid out as
1597 *
1598 * bits 0..11 : stride
1599 * bits 16..20 : VB index
1600 */
1601 struct mme_value param = mme_load(b);
1602
1603 struct mme_value vb_idx4 = mme_merge(b, mme_zero(), param, 2, 5, 16);
1604
1605 struct mme_value state =
1606 mme_state_arr(b, NV9097_SET_VERTEX_STREAM_A_FORMAT(0), vb_idx4);
1607 struct mme_value new_state = mme_merge(b, state, param, 0, 12, 0);
1608 mme_if(b, ine, state, new_state) {
1609 mme_mthd_arr(b, NV9097_SET_VERTEX_STREAM_A_FORMAT(0), vb_idx4);
1610 mme_emit(b, new_state);
1611 }
1612 }
1613
1614 static void
1615 nvk_flush_vi_state(struct nvk_cmd_buffer *cmd)
1616 {
1617 struct nvk_device *dev = nvk_cmd_buffer_device(cmd);
1618 struct nvk_physical_device *pdev = nvk_device_physical(dev);
1619 const struct vk_dynamic_graphics_state *dyn =
1620 &cmd->vk.dynamic_graphics_state;
1621
1622 struct nv_push *p = nvk_cmd_buffer_push(cmd, 258);
1623
1624 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDINGS_VALID)) {
1625 P_1INC(p, NVB197, CALL_MME_MACRO(NVK_MME_SET_VB_ENABLES));
1626 P_INLINE_DATA(p, dyn->vi->bindings_valid);
1627 }
1628
1629 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI) ||
1630 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDINGS_VALID)) {
1631 u_foreach_bit(a, dyn->vi->attributes_valid) {
1632 const struct nvk_va_format *fmt =
1633 nvk_get_va_format(pdev, dyn->vi->attributes[a].format);
1634
1635 P_IMMD(p, NV9097, SET_VERTEX_ATTRIBUTE_A(a), {
1636 .stream = dyn->vi->attributes[a].binding,
1637 .offset = dyn->vi->attributes[a].offset,
1638 .component_bit_widths = fmt->bit_widths,
1639 .numerical_type = fmt->type,
1640 .swap_r_and_b = fmt->swap_rb,
1641 });
1642 }
1643
1644 u_foreach_bit(b, dyn->vi->bindings_valid) {
1645 const bool instanced = dyn->vi->bindings[b].input_rate ==
1646 VK_VERTEX_INPUT_RATE_INSTANCE;
1647 P_IMMD(p, NV9097, SET_VERTEX_STREAM_INSTANCE_A(b), instanced);
1648 P_IMMD(p, NV9097, SET_VERTEX_STREAM_A_FREQUENCY(b),
1649 dyn->vi->bindings[b].divisor);
1650 }
1651 }
1652
1653 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDINGS_VALID) ||
1654 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDING_STRIDES)) {
1655 u_foreach_bit(b, dyn->vi->bindings_valid) {
1656 assert(dyn->vi_binding_strides[b] < (1 << 12));
1657 P_1INC(p, NVB197, CALL_MME_MACRO(NVK_MME_SET_VB_STRIDE));
1658 P_INLINE_DATA(p, nvk_mme_vb_stride(b, dyn->vi_binding_strides[b]));
1659 }
1660 }
1661 }
1662
1663 static uint32_t
1664 vk_to_nv9097_primitive_topology(VkPrimitiveTopology prim)
1665 {
1666 switch (prim) {
1667 case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
1668 return NV9097_BEGIN_OP_POINTS;
1669 case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
1670 return NV9097_BEGIN_OP_LINES;
1671 case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
1672 return NV9097_BEGIN_OP_LINE_STRIP;
1673 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
1674 #pragma GCC diagnostic push
1675 #pragma GCC diagnostic ignored "-Wswitch"
1676 case VK_PRIMITIVE_TOPOLOGY_META_RECT_LIST_MESA:
1677 #pragma GCC diagnostic pop
1678 return NV9097_BEGIN_OP_TRIANGLES;
1679 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
1680 return NV9097_BEGIN_OP_TRIANGLE_STRIP;
1681 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
1682 return NV9097_BEGIN_OP_TRIANGLE_FAN;
1683 case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
1684 return NV9097_BEGIN_OP_LINELIST_ADJCY;
1685 case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
1686 return NV9097_BEGIN_OP_LINESTRIP_ADJCY;
1687 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
1688 return NV9097_BEGIN_OP_TRIANGLELIST_ADJCY;
1689 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
1690 return NV9097_BEGIN_OP_TRIANGLESTRIP_ADJCY;
1691 case VK_PRIMITIVE_TOPOLOGY_PATCH_LIST:
1692 return NV9097_BEGIN_OP_PATCH;
1693 default:
1694 unreachable("Invalid primitive topology");
1695 }
1696 }
1697
1698 static void
1699 nvk_flush_ia_state(struct nvk_cmd_buffer *cmd)
1700 {
1701 const struct vk_dynamic_graphics_state *dyn =
1702 &cmd->vk.dynamic_graphics_state;
1703
1704 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_TOPOLOGY)) {
1705 uint32_t begin;
1706 V_NV9097_BEGIN(begin, {
1707 .op = vk_to_nv9097_primitive_topology(dyn->ia.primitive_topology),
1708 .primitive_id = NV9097_BEGIN_PRIMITIVE_ID_FIRST,
1709 .instance_id = NV9097_BEGIN_INSTANCE_ID_FIRST,
1710 .split_mode = SPLIT_MODE_NORMAL_BEGIN_NORMAL_END,
1711 });
1712
1713 struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
1714 P_MTHD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_DRAW_BEGIN));
1715 P_INLINE_DATA(p, begin);
1716 }
1717
1718 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_RESTART_ENABLE)) {
1719 struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
1720 P_IMMD(p, NV9097, SET_DA_PRIMITIVE_RESTART,
1721 dyn->ia.primitive_restart_enable);
1722 }
1723 }
1724
1725 static void
1726 nvk_flush_ts_state(struct nvk_cmd_buffer *cmd)
1727 {
1728 const struct vk_dynamic_graphics_state *dyn =
1729 &cmd->vk.dynamic_graphics_state;
1730 struct nv_push *p = nvk_cmd_buffer_push(cmd, 4);
1731
1732 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_TS_PATCH_CONTROL_POINTS)) {
1733 /* The hardware gets grumpy if we set this to 0 so make sure we set it
1734 * to at least 1 in case it's dirty but uninitialized.
1735 */
1736 P_IMMD(p, NV9097, SET_PATCH, MAX2(1, dyn->ts.patch_control_points));
1737 }
1738
1739 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_TS_DOMAIN_ORIGIN)) {
1740 P_1INC(p, NVB197, CALL_MME_MACRO(NVK_MME_SET_TESS_PARAMS));
1741 P_INLINE_DATA(p, nvk_mme_tess_lower_left(
1742 dyn->ts.domain_origin == VK_TESSELLATION_DOMAIN_ORIGIN_LOWER_LEFT));
1743 }
1744 }
1745
1746 static void
1747 nvk_flush_vp_state(struct nvk_cmd_buffer *cmd)
1748 {
1749 const struct vk_dynamic_graphics_state *dyn =
1750 &cmd->vk.dynamic_graphics_state;
1751
1752 struct nv_push *p =
1753 nvk_cmd_buffer_push(cmd, 18 * dyn->vp.viewport_count + 4 * NVK_MAX_VIEWPORTS);
1754
1755 /* Nothing to do for MESA_VK_DYNAMIC_VP_VIEWPORT_COUNT */
1756
1757 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS) ||
1758 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE)) {
1759 for (uint32_t i = 0; i < dyn->vp.viewport_count; i++) {
1760 const VkViewport *vp = &dyn->vp.viewports[i];
1761
1762 /* These exactly match the spec values. Nvidia hardware oddities
1763 * are accounted for later.
1764 */
1765 const float o_x = vp->x + 0.5f * vp->width;
1766 const float o_y = vp->y + 0.5f * vp->height;
1767 const float o_z = !dyn->vp.depth_clip_negative_one_to_one ?
1768 vp->minDepth :
1769 (vp->maxDepth + vp->minDepth) * 0.5f;
1770
1771 const float p_x = vp->width;
1772 const float p_y = vp->height;
1773 const float p_z = !dyn->vp.depth_clip_negative_one_to_one ?
1774 vp->maxDepth - vp->minDepth :
1775 (vp->maxDepth - vp->minDepth) * 0.5f;
1776
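/* For reference, the Vulkan viewport equations these values feed are
 *
 *    x_f = (p_x / 2) * x_d + o_x
 *    y_f = (p_y / 2) * y_d + o_y
 *    z_f =  p_z      * z_d + o_z
 *
 * which is why SET_VIEWPORT_SCALE_X/Y below are programmed with half the
 * width/height while SCALE_Z takes p_z directly.
 */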
1777 P_MTHD(p, NV9097, SET_VIEWPORT_SCALE_X(i));
1778 P_NV9097_SET_VIEWPORT_SCALE_X(p, i, fui(0.5f * p_x));
1779 P_NV9097_SET_VIEWPORT_SCALE_Y(p, i, fui(0.5f * p_y));
1780 P_NV9097_SET_VIEWPORT_SCALE_Z(p, i, fui(p_z));
1781
1782 P_NV9097_SET_VIEWPORT_OFFSET_X(p, i, fui(o_x));
1783 P_NV9097_SET_VIEWPORT_OFFSET_Y(p, i, fui(o_y));
1784 P_NV9097_SET_VIEWPORT_OFFSET_Z(p, i, fui(o_z));
1785
1786 float xmin = vp->x;
1787 float xmax = vp->x + vp->width;
1788 float ymin = MIN2(vp->y, vp->y + vp->height);
1789 float ymax = MAX2(vp->y, vp->y + vp->height);
1790 float zmin = MIN2(vp->minDepth, vp->maxDepth);
1791 float zmax = MAX2(vp->minDepth, vp->maxDepth);
1792 assert(xmin <= xmax && ymin <= ymax);
1793
1794 const float max_dim = (float)0xffff;
1795 xmin = CLAMP(xmin, 0, max_dim);
1796 xmax = CLAMP(xmax, 0, max_dim);
1797 ymin = CLAMP(ymin, 0, max_dim);
1798 ymax = CLAMP(ymax, 0, max_dim);
1799
1800 P_MTHD(p, NV9097, SET_VIEWPORT_CLIP_HORIZONTAL(i));
1801 P_NV9097_SET_VIEWPORT_CLIP_HORIZONTAL(p, i, {
1802 .x0 = xmin,
1803 .width = xmax - xmin,
1804 });
1805 P_NV9097_SET_VIEWPORT_CLIP_VERTICAL(p, i, {
1806 .y0 = ymin,
1807 .height = ymax - ymin,
1808 });
1809
1810 if (nvk_cmd_buffer_3d_cls(cmd) >= VOLTA_A) {
1811 P_NV9097_SET_VIEWPORT_CLIP_MIN_Z(p, i, fui(zmin));
1812 P_NV9097_SET_VIEWPORT_CLIP_MAX_Z(p, i, fui(zmax));
1813 } else {
1814 P_1INC(p, NVB197, CALL_MME_MACRO(NVK_MME_SET_VIEWPORT_MIN_MAX_Z));
1815 P_INLINE_DATA(p, i);
1816 P_INLINE_DATA(p, fui(zmin));
1817 P_INLINE_DATA(p, fui(zmax));
1818 }
1819
1820 if (nvk_cmd_buffer_3d_cls(cmd) >= MAXWELL_B) {
1821 P_IMMD(p, NVB197, SET_VIEWPORT_COORDINATE_SWIZZLE(i), {
1822 .x = X_POS_X,
1823 .y = Y_POS_Y,
1824 .z = Z_POS_Z,
1825 .w = W_POS_W,
1826 });
1827 }
1828 }
1829 }
1830
1831 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE)) {
1832 P_IMMD(p, NV9097, SET_VIEWPORT_Z_CLIP,
1833 dyn->vp.depth_clip_negative_one_to_one ?
1834 RANGE_NEGATIVE_W_TO_POSITIVE_W :
1835 RANGE_ZERO_TO_POSITIVE_W);
1836 }
1837
1838 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_SCISSOR_COUNT)) {
1839 for (unsigned i = dyn->vp.scissor_count; i < NVK_MAX_VIEWPORTS; i++)
1840 P_IMMD(p, NV9097, SET_SCISSOR_ENABLE(i), V_FALSE);
1841 }
1842
1843 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_SCISSORS)) {
1844 for (unsigned i = 0; i < dyn->vp.scissor_count; i++) {
1845 const VkRect2D *s = &dyn->vp.scissors[i];
1846
1847 const uint32_t xmin = MIN2(16384, s->offset.x);
1848 const uint32_t xmax = MIN2(16384, s->offset.x + s->extent.width);
1849 const uint32_t ymin = MIN2(16384, s->offset.y);
1850 const uint32_t ymax = MIN2(16384, s->offset.y + s->extent.height);
1851
1852 P_MTHD(p, NV9097, SET_SCISSOR_ENABLE(i));
1853 P_NV9097_SET_SCISSOR_ENABLE(p, i, V_TRUE);
1854 P_NV9097_SET_SCISSOR_HORIZONTAL(p, i, {
1855 .xmin = xmin,
1856 .xmax = xmax,
1857 });
1858 P_NV9097_SET_SCISSOR_VERTICAL(p, i, {
1859 .ymin = ymin,
1860 .ymax = ymax,
1861 });
1862 }
1863 }
1864 }
1865
1866 static uint32_t
1867 vk_to_nv9097_polygon_mode(VkPolygonMode vk_mode)
1868 {
1869 ASSERTED uint16_t vk_to_nv9097[] = {
1870 [VK_POLYGON_MODE_FILL] = NV9097_SET_FRONT_POLYGON_MODE_V_FILL,
1871 [VK_POLYGON_MODE_LINE] = NV9097_SET_FRONT_POLYGON_MODE_V_LINE,
1872 [VK_POLYGON_MODE_POINT] = NV9097_SET_FRONT_POLYGON_MODE_V_POINT,
1873 };
1874 assert(vk_mode < ARRAY_SIZE(vk_to_nv9097));
1875
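/* The hardware polygon-mode values are the GL enums (GL_POINT = 0x1b00,
 * GL_LINE = 0x1b01, GL_FILL = 0x1b02) while VkPolygonMode counts the other
 * way (FILL = 0, LINE = 1, POINT = 2), so 0x1b00 | (2 - vk_mode) converts
 * directly; the table above only backs the assert.
 */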
1876 uint32_t nv9097_mode = 0x1b00 | (2 - vk_mode);
1877 assert(nv9097_mode == vk_to_nv9097[vk_mode]);
1878 return nv9097_mode;
1879 }
1880
1881 static uint32_t
1882 vk_to_nv9097_cull_mode(VkCullModeFlags vk_cull_mode)
1883 {
1884 static const uint16_t vk_to_nv9097[] = {
1885 [VK_CULL_MODE_FRONT_BIT] = NV9097_OGL_SET_CULL_FACE_V_FRONT,
1886 [VK_CULL_MODE_BACK_BIT] = NV9097_OGL_SET_CULL_FACE_V_BACK,
1887 [VK_CULL_MODE_FRONT_AND_BACK] = NV9097_OGL_SET_CULL_FACE_V_FRONT_AND_BACK,
1888 };
1889 assert(vk_cull_mode < ARRAY_SIZE(vk_to_nv9097));
1890 return vk_to_nv9097[vk_cull_mode];
1891 }
1892
1893 static uint32_t
1894 vk_to_nv9097_front_face(VkFrontFace vk_face)
1895 {
1896 /* Vulkan and OpenGL are backwards here because Vulkan assumes the D3D
1897 * convention in which framebuffer coordinates always start in the upper
1898 * left while OpenGL has framebuffer coordinates starting in the lower
1899 * left. Therefore, we want the reverse of the hardware enum name.
1900 */
1901 ASSERTED static const uint16_t vk_to_nv9097[] = {
1902 [VK_FRONT_FACE_COUNTER_CLOCKWISE] = NV9097_OGL_SET_FRONT_FACE_V_CCW,
1903 [VK_FRONT_FACE_CLOCKWISE] = NV9097_OGL_SET_FRONT_FACE_V_CW,
1904 };
1905 assert(vk_face < ARRAY_SIZE(vk_to_nv9097));
1906
1907 uint32_t nv9097_face = 0x900 | (1 - vk_face);
1908 assert(nv9097_face == vk_to_nv9097[vk_face]);
1909 return nv9097_face;
1910 }
1911
1912 static uint32_t
1913 vk_to_nv9097_provoking_vertex(VkProvokingVertexModeEXT vk_mode)
1914 {
1915 STATIC_ASSERT(VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT ==
1916 NV9097_SET_PROVOKING_VERTEX_V_FIRST);
1917 STATIC_ASSERT(VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT ==
1918 NV9097_SET_PROVOKING_VERTEX_V_LAST);
1919 return vk_mode;
1920 }
1921
1922 void
1923 nvk_mme_set_viewport_min_max_z(struct mme_builder *b)
1924 {
1925 struct mme_value vp_idx = mme_load(b);
1926 struct mme_value min_z = mme_load(b);
1927 struct mme_value max_z = mme_load(b);
1928
1929 /* Multiply by 2 because it's an array with stride 8 */
1930 mme_sll_to(b, vp_idx, vp_idx, mme_imm(1));
1931 mme_mthd_arr(b, NVK_SET_MME_SCRATCH(VIEWPORT0_MIN_Z), vp_idx);
1932 mme_emit(b, min_z);
1933 mme_emit(b, max_z);
1934
1935 struct mme_value z_clamp = nvk_mme_load_scratch(b, Z_CLAMP);
1936 mme_if(b, ine, z_clamp, mme_zero()) {
1937 /* Multiply by 2 again because this array has stride 16 */
1938 mme_sll_to(b, vp_idx, vp_idx, mme_imm(1));
1939 mme_mthd_arr(b, NV9097_SET_VIEWPORT_CLIP_MIN_Z(0), vp_idx);
1940 mme_emit(b, min_z);
1941 mme_emit(b, max_z);
1942 }
1943 }
1944
1945 void
1946 nvk_mme_set_z_clamp(struct mme_builder *b)
1947 {
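/* When depth clamp is enabled, replay the per-viewport MIN_Z/MAX_Z values
 * saved in scratch by NVK_MME_SET_VIEWPORT_MIN_MAX_Z.  When it is disabled,
 * smash every viewport's clip range to -/+INF so clamping is left to the
 * guardband (see nvk_flush_rs_state()).  Either way, only do the work when
 * the Z_CLAMP flag actually changes.
 */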
1948 struct mme_value z_clamp = mme_load(b);
1949 struct mme_value old_z_clamp = nvk_mme_load_scratch(b, Z_CLAMP);
1950 mme_if(b, ine, z_clamp, old_z_clamp) {
1951 nvk_mme_store_scratch(b, Z_CLAMP, z_clamp);
1952
1953 mme_if(b, ine, z_clamp, mme_zero()) {
1954 struct mme_value i_2 = mme_mov(b, mme_zero());
1955 mme_while(b, ine, i_2, mme_imm(NVK_MAX_VIEWPORTS * 2)) {
1956 struct mme_value min_z =
1957 mme_state_arr(b, NVK_SET_MME_SCRATCH(VIEWPORT0_MIN_Z), i_2);
1958 struct mme_value max_z =
1959 mme_state_arr(b, NVK_SET_MME_SCRATCH(VIEWPORT0_MAX_Z), i_2);
1960
1961 struct mme_value i_4 = mme_sll(b, i_2, mme_imm(1));
1962 mme_mthd_arr(b, NV9097_SET_VIEWPORT_CLIP_MIN_Z(0), i_4);
1963 mme_emit(b, min_z);
1964 mme_emit(b, max_z);
1965
1966 mme_free_reg(b, i_4);
1967 mme_free_reg(b, min_z);
1968 mme_free_reg(b, max_z);
1969
1970 mme_add_to(b, i_2, i_2, mme_imm(2));
1971 }
1972 mme_free_reg(b, i_2);
1973 }
1974 mme_if(b, ieq, z_clamp, mme_zero()) {
1975 struct mme_value i_4 = mme_mov(b, mme_zero());
1976 mme_while(b, ine, i_4, mme_imm(NVK_MAX_VIEWPORTS * 4)) {
1977 mme_mthd_arr(b, NV9097_SET_VIEWPORT_CLIP_MIN_Z(0), i_4);
1978 mme_emit(b, mme_imm(fui(-INFINITY)));
1979 mme_emit(b, mme_imm(fui(INFINITY)));
1980
1981 mme_add_to(b, i_4, i_4, mme_imm(4));
1982 }
1983 mme_free_reg(b, i_4);
1984 }
1985 }
1986 }
1987
1988 static void
1989 nvk_flush_rs_state(struct nvk_cmd_buffer *cmd)
1990 {
1991 struct nv_push *p = nvk_cmd_buffer_push(cmd, 46);
1992
1993 const struct vk_dynamic_graphics_state *dyn =
1994 &cmd->vk.dynamic_graphics_state;
1995
1996 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_RASTERIZER_DISCARD_ENABLE))
1997 P_IMMD(p, NV9097, SET_RASTER_ENABLE, !dyn->rs.rasterizer_discard_enable);
1998
1999 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_CLIP_ENABLE) ||
2000 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_CLAMP_ENABLE)) {
2001 const bool z_clamp = dyn->rs.depth_clamp_enable;
2002 const bool z_clip = vk_rasterization_state_depth_clip_enable(&dyn->rs);
2003 P_IMMD(p, NVC397, SET_VIEWPORT_CLIP_CONTROL, {
2004 /* We only set Z clip range if clamp is requested. Otherwise, we
2005 * leave it set to -/+INF and clamp using the guardband below.
2006 */
2007 .min_z_zero_max_z_one = MIN_Z_ZERO_MAX_Z_ONE_FALSE,
2008 .z_clip_range = nvk_cmd_buffer_3d_cls(cmd) >= VOLTA_A
2009 ? (z_clamp ? Z_CLIP_RANGE_MIN_Z_MAX_Z
2010 : Z_CLIP_RANGE_MINUS_INF_PLUS_INF)
2011 : Z_CLIP_RANGE_USE_FIELD_MIN_Z_ZERO_MAX_Z_ONE,
2012
2013 .pixel_min_z = PIXEL_MIN_Z_CLAMP,
2014 .pixel_max_z = PIXEL_MAX_Z_CLAMP,
2015
2016 .geometry_guardband = GEOMETRY_GUARDBAND_SCALE_256,
2017 .line_point_cull_guardband = LINE_POINT_CULL_GUARDBAND_SCALE_256,
2018 .geometry_clip = z_clip ? GEOMETRY_CLIP_FRUSTUM_XYZ_CLIP
2019 : GEOMETRY_CLIP_FRUSTUM_XY_CLIP,
2020
2021 /* We clip depth with the geometry clipper to ensure that it gets
2022 * clipped before depth bias is applied. If we leave it up to the
2023 * rasterizer clipper (pixel_min/max_z = CLIP), it will clip too late
2024 * in the pipeline. This can be seen in two different ways:
2025 *
2026 * - When depth bias is enabled, the bias is applied post-clipping.
2027 * If we clip in the rasterizer, it will clip according to the
2028 * post-bias depth which is wrong.
2029 *
2030 * - If the fragment shader overrides the depth by writing to
2031 * gl_FragDepth, it should be clipped according to the original
2032 * geometry, not according to gl_FragDepth.
2033 *
2034 * In order to always get the geometry clipper, we need to set a
2035 * tight guardband (geometry_guardband_z = SCALE_1).
2036 */
2037 .geometry_guardband_z = z_clip ? GEOMETRY_GUARDBAND_Z_SCALE_1
2038 : GEOMETRY_GUARDBAND_Z_SCALE_256,
2039 });
2040
2041 /* Pre-Volta, we don't have SET_VIEWPORT_CLIP_CONTROL::z_clip_range.
2042 * Instead, we have to emulate it by smashing VIEWPORT_CLIP_MIN/MAX_Z
2043 * based on whether or not z_clamp is set. This is done by a pair of
2044 * macros, one of which is called here and the other is called in
2045 * viewport setup.
2046 */
2047 if (nvk_cmd_buffer_3d_cls(cmd) < VOLTA_A) {
2048 P_1INC(p, NVB197, CALL_MME_MACRO(NVK_MME_SET_Z_CLAMP));
2049 P_INLINE_DATA(p, z_clamp);
2050 }
2051 }
2052
2053 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_POLYGON_MODE)) {
2054 uint32_t polygon_mode = vk_to_nv9097_polygon_mode(dyn->rs.polygon_mode);
2055 P_MTHD(p, NV9097, SET_FRONT_POLYGON_MODE);
2056 P_NV9097_SET_FRONT_POLYGON_MODE(p, polygon_mode);
2057 P_NV9097_SET_BACK_POLYGON_MODE(p, polygon_mode);
2058 }
2059
2060 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_CULL_MODE)) {
2061 P_IMMD(p, NV9097, OGL_SET_CULL, dyn->rs.cull_mode != VK_CULL_MODE_NONE);
2062
2063 if (dyn->rs.cull_mode != VK_CULL_MODE_NONE) {
2064 uint32_t face = vk_to_nv9097_cull_mode(dyn->rs.cull_mode);
2065 P_IMMD(p, NV9097, OGL_SET_CULL_FACE, face);
2066 }
2067 }
2068
2069 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_FRONT_FACE)) {
2070 P_IMMD(p, NV9097, OGL_SET_FRONT_FACE,
2071 vk_to_nv9097_front_face(dyn->rs.front_face));
2072 }
2073
2074 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_PROVOKING_VERTEX)) {
2075 P_IMMD(p, NV9097, SET_PROVOKING_VERTEX,
2076 vk_to_nv9097_provoking_vertex(dyn->rs.provoking_vertex));
2077 }
2078
2079 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_ENABLE)) {
2080 P_MTHD(p, NV9097, SET_POLY_OFFSET_POINT);
2081 P_NV9097_SET_POLY_OFFSET_POINT(p, dyn->rs.depth_bias.enable);
2082 P_NV9097_SET_POLY_OFFSET_LINE(p, dyn->rs.depth_bias.enable);
2083 P_NV9097_SET_POLY_OFFSET_FILL(p, dyn->rs.depth_bias.enable);
2084 }
2085
2086 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS)) {
2087 switch (dyn->rs.depth_bias.representation) {
2088 case VK_DEPTH_BIAS_REPRESENTATION_LEAST_REPRESENTABLE_VALUE_FORMAT_EXT:
2089 P_IMMD(p, NV9097, SET_DEPTH_BIAS_CONTROL,
2090 DEPTH_FORMAT_DEPENDENT_TRUE);
2091 break;
2092 case VK_DEPTH_BIAS_REPRESENTATION_LEAST_REPRESENTABLE_VALUE_FORCE_UNORM_EXT:
2093 P_IMMD(p, NV9097, SET_DEPTH_BIAS_CONTROL,
2094 DEPTH_FORMAT_DEPENDENT_FALSE);
2095 break;
2096 case VK_DEPTH_BIAS_REPRESENTATION_FLOAT_EXT:
2097 default:
2098 unreachable("Unsupported depth bias representation");
2099 }
2100 /* TODO: The blob multiplies by 2 for some reason. We don't. */
2101 P_IMMD(p, NV9097, SET_DEPTH_BIAS, fui(dyn->rs.depth_bias.constant));
2102 P_IMMD(p, NV9097, SET_SLOPE_SCALE_DEPTH_BIAS, fui(dyn->rs.depth_bias.slope));
2103 P_IMMD(p, NV9097, SET_DEPTH_BIAS_CLAMP, fui(dyn->rs.depth_bias.clamp));
2104 }
2105
2106 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_WIDTH)) {
2107 P_MTHD(p, NV9097, SET_LINE_WIDTH_FLOAT);
2108 P_NV9097_SET_LINE_WIDTH_FLOAT(p, fui(dyn->rs.line.width));
2109 P_NV9097_SET_ALIASED_LINE_WIDTH_FLOAT(p, fui(dyn->rs.line.width));
2110 }
2111
2112 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_MODE)) {
2113 switch (dyn->rs.line.mode) {
2114 case VK_LINE_RASTERIZATION_MODE_DEFAULT_KHR:
2115 case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_KHR:
2116 P_IMMD(p, NV9097, SET_LINE_MULTISAMPLE_OVERRIDE, ENABLE_FALSE);
2117 P_IMMD(p, NV9097, SET_ANTI_ALIASED_LINE, ENABLE_FALSE);
2118 break;
2119
2120 case VK_LINE_RASTERIZATION_MODE_BRESENHAM_KHR:
2121 P_IMMD(p, NV9097, SET_LINE_MULTISAMPLE_OVERRIDE, ENABLE_TRUE);
2122 P_IMMD(p, NV9097, SET_ANTI_ALIASED_LINE, ENABLE_FALSE);
2123 break;
2124
2125 case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_KHR:
2126 P_IMMD(p, NV9097, SET_LINE_MULTISAMPLE_OVERRIDE, ENABLE_TRUE);
2127 P_IMMD(p, NV9097, SET_ANTI_ALIASED_LINE, ENABLE_TRUE);
2128 break;
2129
2130 default:
2131 unreachable("Invalid line rasterization mode");
2132 }
2133 }
2134
2135 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_STIPPLE_ENABLE))
2136 P_IMMD(p, NV9097, SET_LINE_STIPPLE, dyn->rs.line.stipple.enable);
2137
2138 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_STIPPLE)) {
2139 /* map factor from [1,256] to [0, 255] */
2140 uint32_t stipple_factor = CLAMP(dyn->rs.line.stipple.factor, 1, 256) - 1;
2141 P_IMMD(p, NV9097, SET_LINE_STIPPLE_PARAMETERS, {
2142 .factor = stipple_factor,
2143 .pattern = dyn->rs.line.stipple.pattern,
2144 });
2145 }
2146
2147 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_RASTERIZATION_STREAM))
2148 P_IMMD(p, NV9097, SET_RASTER_INPUT, dyn->rs.rasterization_stream);
2149
2150 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_CONSERVATIVE_MODE) ||
2151 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_EXTRA_PRIMITIVE_OVERESTIMATION_SIZE)) {
2152 if (nvk_cmd_buffer_3d_cls(cmd) < MAXWELL_B) {
2153 assert(dyn->rs.conservative_mode == VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT);
2154 } else if (dyn->rs.conservative_mode == VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT) {
2155 P_IMMD(p, NVB197, SET_CONSERVATIVE_RASTER, ENABLE_FALSE);
2156 } else {
2157 uint32_t extra_overestimate =
2158 MIN2(3, dyn->rs.extra_primitive_overestimation_size * 4);
2159
2160 if (nvk_cmd_buffer_3d_cls(cmd) >= VOLTA_A) {
2161 P_IMMD(p, NVC397, SET_CONSERVATIVE_RASTER_CONTROL, {
2162 .extra_prim_bloat = extra_overestimate,
2163 .copy_inner_to_outer =
2164 (dyn->rs.conservative_mode == VK_CONSERVATIVE_RASTERIZATION_MODE_UNDERESTIMATE_EXT),
2165 .triangle_snap_mode = TRIANGLE_SNAP_MODE_MODE_PRE_SNAP,
2166 .line_and_point_snap_mode = LINE_AND_POINT_SNAP_MODE_MODE_PRE_SNAP,
2167 .uncertainty_region_size = UNCERTAINTY_REGION_SIZE_SIZE_512,
2168 });
2169 } else {
2170 P_1INC(p, NVB197, CALL_MME_MACRO(NVK_MME_SET_CONSERVATIVE_RASTER_STATE));
2171 P_INLINE_DATA(p, extra_overestimate << 23);
2172 }
2173 P_IMMD(p, NVB197, SET_CONSERVATIVE_RASTER, ENABLE_TRUE);
2174 }
2175 }
2176 }
2177
2178 static uint32_t
2179 nvk_mme_anti_alias_init(void)
2180 {
2181 /* This is a valid value but we never set it so it ensures that the macro
2182 * will actually run the first time we set anything.
2183 */
2184 return 0xf;
2185 }
2186
2187 static uint32_t
2188 nvk_mme_anti_alias_min_sample_shading(float mss)
2189 {
2190 /* The value we want to compute in the MME is
2191 *
2192 * passes = next_pow2(samples * minSampleShading)
2193 *
2194 * Since samples is already a power of two,
2195 *
2196 * passes_log2 = log2_ceil(samples * minSampleShading)
2197 * = log2_ceil(samples / (1.0 / minSampleShading))
2198 * = samples_log2 - log2_floor(1.0 / minSampleShading)
2199 *
2200 * if we assume (1.0 / min_sample_shading) >= 1.0. This last bit is
2201 * something we can compute in the MME as long as the float math on the
2202 * right-hand side happens on the CPU.
2203 */
2204 float rcp_mss = CLAMP(1.0 / mss, 1.0f, 16.0f);
2205 uint32_t rcp_mss_log2 = util_logbase2(floorf(rcp_mss));
2206
2207 assert(rcp_mss_log2 != nvk_mme_anti_alias_init());
2208
2209 return nvk_mme_val_mask(rcp_mss_log2 << 0, 0x000f);
2210 }
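/* Worked example (illustrative): with 8 samples and minSampleShading = 0.5,
 * rcp_mss = 2 so rcp_mss_log2 = 1.  The macro later computes
 * passes_log2 = samples_log2 - rcp_mss_log2 = 3 - 1 = 2, i.e. 4 shading
 * passes, which matches next_pow2(8 * 0.5) = 4.
 */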
2211
2212 static uint32_t
2213 nvk_mme_anti_alias_samples(uint32_t samples)
2214 {
2215 assert(util_is_power_of_two_or_zero(samples));
2216 const uint32_t samples_log2 = util_logbase2(MAX2(1, samples));
2217
2218 return nvk_mme_val_mask(samples_log2 << 4, 0x00f0);
2219 }
2220
2221 void
2222 nvk_mme_set_anti_alias(struct mme_builder *b)
2223 {
2224 struct mme_value val_mask = mme_load(b);
2225 struct mme_value old_anti_alias = nvk_mme_load_scratch(b, ANTI_ALIAS);
2226 struct mme_value anti_alias =
2227 nvk_mme_set_masked(b, old_anti_alias, val_mask);
2228 mme_free_reg(b, val_mask);
2229
2230 mme_if(b, ine, anti_alias, old_anti_alias) {
2231 mme_free_reg(b, old_anti_alias);
2232 nvk_mme_store_scratch(b, ANTI_ALIAS, anti_alias);
2233
2234 struct mme_value rcp_mss_log2 =
2235 mme_merge(b, mme_zero(), anti_alias, 0, 4, 0);
2236 struct mme_value samples_log2 =
2237 mme_merge(b, mme_zero(), anti_alias, 0, 4, 4);
2238 mme_free_reg(b, anti_alias);
2239
2240 /* We've already done all the hard work on the CPU in
2241 * nvk_mme_anti_alias_min_sample_shading(). All we have to do here is
2242 * subtract the two log2 values and clamp so we don't go negative.
2243 */
2244 struct mme_value passes_log2 = mme_sub(b, samples_log2, rcp_mss_log2);
2245 mme_free_reg(b, rcp_mss_log2);
2246
2247 /* passes = MAX(passes, 1) */
2248 struct mme_value neg = mme_srl(b, passes_log2, mme_imm(31));
2249 mme_if(b, ine, neg, mme_zero()) {
2250 mme_mov_to(b, passes_log2, mme_zero());
2251 }
2252 mme_free_reg(b, neg);
2253
2254 /*
2255 * NV9097_SET_HYBRID_ANTI_ALIAS_CONTROL {
2256 * ...
2257 * .centroid = passes > 1 ? CENTROID_PER_PASS
2258 * : CENTROID_PER_FRAGMENT,
2259 * }
2260 */
2261 struct mme_value aac = mme_mov(b,
2262 mme_imm(NV9097_SET_HYBRID_ANTI_ALIAS_CONTROL_CENTROID_PER_FRAGMENT
2263 << DRF_LO(NV9097_SET_HYBRID_ANTI_ALIAS_CONTROL_CENTROID)));
2264 mme_if(b, ine, passes_log2, mme_zero()) {
2265 mme_mov_to(b, aac,
2266 mme_imm(NV9097_SET_HYBRID_ANTI_ALIAS_CONTROL_CENTROID_PER_PASS
2267 << DRF_LO(NV9097_SET_HYBRID_ANTI_ALIAS_CONTROL_CENTROID)));
2268 }
2269
2270 struct mme_value passes = mme_sll(b, mme_imm(1), passes_log2);
2271 mme_merge_to(b, aac, aac, passes, 0, 4, 0);
2272 mme_free_reg(b, passes);
2273
2274 mme_mthd(b, NV9097_SET_HYBRID_ANTI_ALIAS_CONTROL);
2275 mme_emit(b, aac);
2276 mme_free_reg(b, aac);
2277
2278 /* Now we need to emit sample masks per-sample:
2279 *
2280 * struct nak_sample_mask push_sm[NVK_MAX_SAMPLES];
2281 * uint32_t samples_per_pass = samples / passes;
2282 * uint32_t sample_mask = BITFIELD_MASK(samples_per_pass);
2283 * for (uint32_t s = 0; s < NVK_MAX_SAMPLES;) {
2284 * push_sm[s] = (struct nak_sample_mask) {
2285 * .sample_mask = sample_mask,
2286 * };
2287 *
2288 * s++;
2289 *
2290 * if (s % samples_per_pass == 0)
2291 * sample_mask <<= samples_per_pass;
2292 * }
2293 *
2294 * Annoyingly, we have to pack these in pairs
2295 */
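/* Worked example (illustrative): 8 samples in 4 passes gives
 * samples_per_pass = 2, so the per-sample masks are
 * 0x3, 0x3, 0xc, 0xc, 0x30, 0x30, 0xc0, 0xc0 and the packed pairs written
 * below are 0x030003, 0x0c000c, 0x300030, 0xc000c0 (compare the
 * "8 samples, minSampleShading = 0.5" case in nvk_mme_set_anti_alias_tests).
 */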
2296 STATIC_ASSERT(sizeof(struct nak_sample_mask) == 2);
2297
2298 mme_mthd(b, NV9097_LOAD_CONSTANT_BUFFER_OFFSET);
2299 mme_emit(b, mme_imm(nvk_root_descriptor_offset(draw.sample_masks)));
2300 mme_mthd(b, NV9097_LOAD_CONSTANT_BUFFER(0));
2301
2304 struct mme_value samples_per_pass_log2 =
2305 mme_sub(b, samples_log2, passes_log2);
2306 mme_free_reg(b, samples_log2);
2307 mme_free_reg(b, passes_log2);
2308
2309 mme_if(b, ieq, samples_per_pass_log2, mme_zero()) {
2310 /* One sample per pass, we can just blast it out */
2311 for (uint32_t i = 0; i < NVK_MAX_SAMPLES; i += 2) {
2312 uint32_t mask0 = 1 << i;
2313 uint32_t mask1 = 1 << (i + 1);
2314 mme_emit(b, mme_imm(mask0 | (mask1 << 16)));
2315 }
2316 }
2317
2318 mme_if(b, ine, samples_per_pass_log2, mme_zero()) {
2319 struct mme_value samples_per_pass =
2320 mme_sll(b, mme_imm(1), samples_per_pass_log2);
2321
2322 /* sample_mask = (1 << samples_per_pass) - 1 */
2323 struct mme_value sample_mask =
2324 mme_sll(b, mme_imm(1), samples_per_pass);
2325 mme_sub_to(b, sample_mask, sample_mask, mme_imm(1));
2326
2327 struct mme_value mod_mask = mme_sub(b, samples_per_pass, mme_imm(1));
2328
2329 struct mme_value s = mme_mov(b, mme_zero());
2330 mme_while(b, ine, s, mme_imm(NVK_MAX_SAMPLES)) {
2331 /* Since samples_per_pass >= 2, we know that both masks in the pair
2332 * will be the same.
2333 */
2334 struct mme_value packed =
2335 mme_merge(b, sample_mask, sample_mask, 16, 16, 0);
2336 mme_emit(b, packed);
2337 mme_free_reg(b, packed);
2338
2339 mme_add_to(b, s, s, mme_imm(2));
2340
2341 /* if (s % samples_per_pass == 0) */
2342 struct mme_value mod = mme_and(b, s, mod_mask);
2343 mme_if(b, ieq, mod, mme_zero()) {
2344 mme_sll_to(b, sample_mask, sample_mask, samples_per_pass);
2345 }
2346 }
2347 }
2348 }
2349 }
2350
2351 const struct nvk_mme_test_case nvk_mme_set_anti_alias_tests[] = {{
2352 /* This case doesn't change the state so it should do nothing */
2353 .init = (struct nvk_mme_mthd_data[]) {
2354 { NVK_SET_MME_SCRATCH(ANTI_ALIAS), 0 },
2355 { }
2356 },
2357 .params = (uint32_t[]) { 0xffff0000 },
2358 .expected = (struct nvk_mme_mthd_data[]) {
2359 { }
2360 },
2361 }, {
2362 /* Single sample, minSampleShading = 1.0 */
2363 .init = (struct nvk_mme_mthd_data[]) {
2364 { NVK_SET_MME_SCRATCH(ANTI_ALIAS), 0xf },
2365 { }
2366 },
2367 .params = (uint32_t[]) { 0xffff0000 },
2368 .expected = (struct nvk_mme_mthd_data[]) {
2369 { NVK_SET_MME_SCRATCH(ANTI_ALIAS), 0 },
2370 { NV9097_SET_HYBRID_ANTI_ALIAS_CONTROL, 0x1 },
2371 { NV9097_LOAD_CONSTANT_BUFFER_OFFSET,
2372 nvk_root_descriptor_offset(draw.sample_masks) },
2373 { NV9097_LOAD_CONSTANT_BUFFER(0), 0x020001 },
2374 { NV9097_LOAD_CONSTANT_BUFFER(1), 0x080004 },
2375 { NV9097_LOAD_CONSTANT_BUFFER(2), 0x200010 },
2376 { NV9097_LOAD_CONSTANT_BUFFER(3), 0x800040 },
2377 { }
2378 },
2379 }, {
2380 /* Single sample, minSampleShading = 0.25 */
2381 .init = (struct nvk_mme_mthd_data[]) {
2382 { NVK_SET_MME_SCRATCH(ANTI_ALIAS), 0xf },
2383 { }
2384 },
2385 .params = (uint32_t[]) { 0xffff0002 },
2386 .expected = (struct nvk_mme_mthd_data[]) {
2387 { NVK_SET_MME_SCRATCH(ANTI_ALIAS), 0x2 },
2388 { NV9097_SET_HYBRID_ANTI_ALIAS_CONTROL, 0x1 },
2389 { NV9097_LOAD_CONSTANT_BUFFER_OFFSET,
2390 nvk_root_descriptor_offset(draw.sample_masks) },
2391 { NV9097_LOAD_CONSTANT_BUFFER(0), 0x020001 },
2392 { NV9097_LOAD_CONSTANT_BUFFER(1), 0x080004 },
2393 { NV9097_LOAD_CONSTANT_BUFFER(2), 0x200010 },
2394 { NV9097_LOAD_CONSTANT_BUFFER(3), 0x800040 },
2395 { }
2396 },
2397 }, {
2398 /* 8 samples, minSampleShading = 0.5 */
2399 .init = (struct nvk_mme_mthd_data[]) {
2400 { NVK_SET_MME_SCRATCH(ANTI_ALIAS), 0x1 },
2401 { }
2402 },
2403 .params = (uint32_t[]) { 0x00f00030 },
2404 .expected = (struct nvk_mme_mthd_data[]) {
2405 { NVK_SET_MME_SCRATCH(ANTI_ALIAS), 0x31 },
2406 { NV9097_SET_HYBRID_ANTI_ALIAS_CONTROL, 0x14 },
2407 { NV9097_LOAD_CONSTANT_BUFFER_OFFSET,
2408 nvk_root_descriptor_offset(draw.sample_masks) },
2409 { NV9097_LOAD_CONSTANT_BUFFER(0), 0x030003 },
2410 { NV9097_LOAD_CONSTANT_BUFFER(1), 0x0c000c },
2411 { NV9097_LOAD_CONSTANT_BUFFER(2), 0x300030 },
2412 { NV9097_LOAD_CONSTANT_BUFFER(3), 0xc000c0 },
2413 { }
2414 },
2415 }, {
2416 /* 8 samples, minSampleShading = 0.25 */
2417 .init = (struct nvk_mme_mthd_data[]) {
2418 { NVK_SET_MME_SCRATCH(ANTI_ALIAS), 0x30 },
2419 { }
2420 },
2421 .params = (uint32_t[]) { 0x000f0002 },
2422 .expected = (struct nvk_mme_mthd_data[]) {
2423 { NVK_SET_MME_SCRATCH(ANTI_ALIAS), 0x32 },
2424 { NV9097_SET_HYBRID_ANTI_ALIAS_CONTROL, 0x12 },
2425 { NV9097_LOAD_CONSTANT_BUFFER_OFFSET,
2426 nvk_root_descriptor_offset(draw.sample_masks) },
2427 { NV9097_LOAD_CONSTANT_BUFFER(0), 0x0f000f },
2428 { NV9097_LOAD_CONSTANT_BUFFER(1), 0x0f000f },
2429 { NV9097_LOAD_CONSTANT_BUFFER(2), 0xf000f0 },
2430 { NV9097_LOAD_CONSTANT_BUFFER(3), 0xf000f0 },
2431 { }
2432 },
2433 }, {}};
2434
2435 static VkSampleLocationEXT
2436 vk_sample_location(const struct vk_sample_locations_state *sl,
2437 uint32_t x, uint32_t y, uint32_t s)
2438 {
2439 x = x % sl->grid_size.width;
2440 y = y % sl->grid_size.height;
2441
2442 return sl->locations[(x + y * sl->grid_size.width) * sl->per_pixel + s];
2443 }
2444
2445 static struct nak_sample_location
2446 vk_to_nak_sample_location(VkSampleLocationEXT loc)
2447 {
2448 return (struct nak_sample_location) {
2449 .x_u4 = util_bitpack_ufixed_clamp(loc.x, 0, 3, 4),
2450 .y_u4 = util_bitpack_ufixed_clamp(loc.y, 0, 3, 4),
2451 };
2452 }
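/* For example, the standard center location (0.5, 0.5) packs to
 * x_u4 = y_u4 = 8: a 4-bit unsigned fixed-point value with 4 fractional
 * bits, i.e. position = value / 16.
 */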
2453
2454 static void
2455 nvk_flush_ms_state(struct nvk_cmd_buffer *cmd)
2456 {
2457 const struct nvk_rendering_state *render = &cmd->state.gfx.render;
2458 const struct vk_dynamic_graphics_state *dyn =
2459 &cmd->vk.dynamic_graphics_state;
2460
2461 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES)) {
2462 struct nv_push *p = nvk_cmd_buffer_push(cmd, 4);
2463
2464 /* When we don't have any attachments, we can't know the sample count
2465 * from the render pass so we need to emit SET_ANTI_ALIAS here. See the
2466 * comment in nvk_BeginRendering() for more details.
2467 */
2468 if (render->samples == 0) {
2469 /* Multisample information MAY be missing (rasterizationSamples == 0)
2470 * if rasterizer discard is enabled. However, this isn't valid in
2471 * the hardware so always use at least one sample.
2472 */
2473 const uint32_t samples = MAX2(1, dyn->ms.rasterization_samples);
2474 enum nil_sample_layout layout = nil_choose_sample_layout(samples);
2475 P_IMMD(p, NV9097, SET_ANTI_ALIAS, nil_to_nv9097_samples_mode(layout));
2476 } else {
2477 /* Multisample information MAY be missing (rasterizationSamples == 0)
2478 * if rasterizer discard is enabled.
2479 */
2480 assert(dyn->ms.rasterization_samples == 0 ||
2481 dyn->ms.rasterization_samples == render->samples);
2482 }
2483
2484 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_SET_ANTI_ALIAS));
2485 P_INLINE_DATA(p,
2486 nvk_mme_anti_alias_samples(dyn->ms.rasterization_samples));
2487 }
2488
2489 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_ALPHA_TO_COVERAGE_ENABLE) ||
2490 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_ALPHA_TO_ONE_ENABLE)) {
2491 struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
2492 P_IMMD(p, NV9097, SET_ANTI_ALIAS_ALPHA_CONTROL, {
2493 .alpha_to_coverage = dyn->ms.alpha_to_coverage_enable,
2494 .alpha_to_one = dyn->ms.alpha_to_one_enable,
2495 });
2496 }
2497
2498 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES) ||
2499 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_SAMPLE_LOCATIONS) ||
2500 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_SAMPLE_LOCATIONS_ENABLE)) {
2501 const struct vk_sample_locations_state *sl;
2502 if (dyn->ms.sample_locations_enable) {
2503 sl = dyn->ms.sample_locations;
2504 } else {
2505 const uint32_t samples = MAX2(1, dyn->ms.rasterization_samples);
2506 sl = vk_standard_sample_locations_state(samples);
2507 }
2508
2509 struct nak_sample_location push_sl[NVK_MAX_SAMPLES];
2510 for (uint32_t i = 0; i < sl->per_pixel; i++)
2511 push_sl[i] = vk_to_nak_sample_location(sl->locations[i]);
2512
2513 nvk_descriptor_state_set_root_array(cmd, &cmd->state.gfx.descriptors,
2514 draw.sample_locations,
2515 0, NVK_MAX_SAMPLES, push_sl);
2516
2517 if (nvk_cmd_buffer_3d_cls(cmd) >= MAXWELL_B) {
2518 struct nak_sample_location loc[16];
2519 for (uint32_t n = 0; n < ARRAY_SIZE(loc); n++) {
2520 const uint32_t s = n % sl->per_pixel;
2521 const uint32_t px = n / sl->per_pixel;
2522 const uint32_t x = px % 2;
2523 const uint32_t y = px / 2;
2524
2525 loc[n] = vk_to_nak_sample_location(vk_sample_location(sl, x, y, s));
2526 }
2527
2528 struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
2529
2530 P_MTHD(p, NVB197, SET_ANTI_ALIAS_SAMPLE_POSITIONS(0));
2531 for (uint32_t i = 0; i < 4; i++) {
2532 P_NVB197_SET_ANTI_ALIAS_SAMPLE_POSITIONS(p, i, {
2533 .x0 = loc[i * 4 + 0].x_u4,
2534 .y0 = loc[i * 4 + 0].y_u4,
2535 .x1 = loc[i * 4 + 1].x_u4,
2536 .y1 = loc[i * 4 + 1].y_u4,
2537 .x2 = loc[i * 4 + 2].x_u4,
2538 .y2 = loc[i * 4 + 2].y_u4,
2539 .x3 = loc[i * 4 + 3].x_u4,
2540 .y3 = loc[i * 4 + 3].y_u4,
2541 });
2542 }
2543 }
2544 }
2545
2546 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_SAMPLE_MASK)) {
2547 struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
2548 P_MTHD(p, NV9097, SET_SAMPLE_MASK_X0_Y0);
2549 P_NV9097_SET_SAMPLE_MASK_X0_Y0(p, dyn->ms.sample_mask & 0xffff);
2550 P_NV9097_SET_SAMPLE_MASK_X1_Y0(p, dyn->ms.sample_mask & 0xffff);
2551 P_NV9097_SET_SAMPLE_MASK_X0_Y1(p, dyn->ms.sample_mask & 0xffff);
2552 P_NV9097_SET_SAMPLE_MASK_X1_Y1(p, dyn->ms.sample_mask & 0xffff);
2553 }
2554 }
2555
2556 static uint32_t
2557 vk_to_nv9097_compare_op(VkCompareOp vk_op)
2558 {
2559 ASSERTED static const uint16_t vk_to_nv9097[] = {
2560 [VK_COMPARE_OP_NEVER] = NV9097_SET_DEPTH_FUNC_V_OGL_NEVER,
2561 [VK_COMPARE_OP_LESS] = NV9097_SET_DEPTH_FUNC_V_OGL_LESS,
2562 [VK_COMPARE_OP_EQUAL] = NV9097_SET_DEPTH_FUNC_V_OGL_EQUAL,
2563 [VK_COMPARE_OP_LESS_OR_EQUAL] = NV9097_SET_DEPTH_FUNC_V_OGL_LEQUAL,
2564 [VK_COMPARE_OP_GREATER] = NV9097_SET_DEPTH_FUNC_V_OGL_GREATER,
2565 [VK_COMPARE_OP_NOT_EQUAL] = NV9097_SET_DEPTH_FUNC_V_OGL_NOTEQUAL,
2566 [VK_COMPARE_OP_GREATER_OR_EQUAL] = NV9097_SET_DEPTH_FUNC_V_OGL_GEQUAL,
2567 [VK_COMPARE_OP_ALWAYS] = NV9097_SET_DEPTH_FUNC_V_OGL_ALWAYS,
2568 };
2569 assert(vk_op < ARRAY_SIZE(vk_to_nv9097));
2570
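/* The OGL depth funcs are the GL comparison enums (GL_NEVER = 0x200 through
 * GL_ALWAYS = 0x207) and VkCompareOp uses the same ordering starting at 0,
 * so OR'ing in 0x200 is the whole conversion; the table above exists only
 * for the assert.
 */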
2571 uint32_t nv9097_op = 0x200 | vk_op;
2572 assert(nv9097_op == vk_to_nv9097[vk_op]);
2573 return nv9097_op;
2574 }
2575
2576 static uint32_t
2577 vk_to_nv9097_stencil_op(VkStencilOp vk_op)
2578 {
2579 #define OP(vk, nv) [VK_STENCIL_OP_##vk] = NV9097_SET_STENCIL_OP_FAIL_V_##nv
2580 ASSERTED static const uint16_t vk_to_nv9097[] = {
2581 OP(KEEP, D3D_KEEP),
2582 OP(ZERO, D3D_ZERO),
2583 OP(REPLACE, D3D_REPLACE),
2584 OP(INCREMENT_AND_CLAMP, D3D_INCRSAT),
2585 OP(DECREMENT_AND_CLAMP, D3D_DECRSAT),
2586 OP(INVERT, D3D_INVERT),
2587 OP(INCREMENT_AND_WRAP, D3D_INCR),
2588 OP(DECREMENT_AND_WRAP, D3D_DECR),
2589 };
2590 assert(vk_op < ARRAY_SIZE(vk_to_nv9097));
2591 #undef OP
2592
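/* The D3D stencil-op encoding starts at 1 (D3D_KEEP = 1 ... D3D_DECR = 8)
 * and VkStencilOp uses the same order starting at 0, so vk_op + 1 is the
 * whole conversion; the table above exists only for the assert.
 */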
2593 uint32_t nv9097_op = vk_op + 1;
2594 assert(nv9097_op == vk_to_nv9097[vk_op]);
2595 return nv9097_op;
2596 }
2597
2598 static void
2599 nvk_flush_ds_state(struct nvk_cmd_buffer *cmd)
2600 {
2601 struct nv_push *p = nvk_cmd_buffer_push(cmd, 35);
2602
2603 const struct nvk_rendering_state *render = &cmd->state.gfx.render;
2604 const struct vk_dynamic_graphics_state *dyn =
2605 &cmd->vk.dynamic_graphics_state;
2606
2607 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_TEST_ENABLE)) {
2608 bool enable = dyn->ds.depth.test_enable &&
2609 render->depth_att.vk_format != VK_FORMAT_UNDEFINED;
2610 P_IMMD(p, NV9097, SET_DEPTH_TEST, enable);
2611 }
2612
2613 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_WRITE_ENABLE)) {
2614 bool enable = dyn->ds.depth.write_enable &&
2615 render->depth_att.vk_format != VK_FORMAT_UNDEFINED;
2616 P_IMMD(p, NV9097, SET_DEPTH_WRITE, enable);
2617 }
2618
2619 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_COMPARE_OP)) {
2620 const uint32_t func = vk_to_nv9097_compare_op(dyn->ds.depth.compare_op);
2621 P_IMMD(p, NV9097, SET_DEPTH_FUNC, func);
2622 }
2623
2624 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE)) {
2625 bool enable = dyn->ds.depth.bounds_test.enable &&
2626 render->depth_att.vk_format != VK_FORMAT_UNDEFINED;
2627 P_IMMD(p, NV9097, SET_DEPTH_BOUNDS_TEST, enable);
2628 }
2629
2630 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_BOUNDS)) {
2631 P_MTHD(p, NV9097, SET_DEPTH_BOUNDS_MIN);
2632 P_NV9097_SET_DEPTH_BOUNDS_MIN(p, fui(dyn->ds.depth.bounds_test.min));
2633 P_NV9097_SET_DEPTH_BOUNDS_MAX(p, fui(dyn->ds.depth.bounds_test.max));
2634 }
2635
2636 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE)) {
2637 bool enable = dyn->ds.stencil.test_enable &&
2638 render->stencil_att.vk_format != VK_FORMAT_UNDEFINED;
2639 P_IMMD(p, NV9097, SET_STENCIL_TEST, enable);
2640 }
2641
2642 const struct vk_stencil_test_face_state *front = &dyn->ds.stencil.front;
2643 const struct vk_stencil_test_face_state *back = &dyn->ds.stencil.back;
2644 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_OP)) {
2645 P_MTHD(p, NV9097, SET_STENCIL_OP_FAIL);
2646 P_NV9097_SET_STENCIL_OP_FAIL(p, vk_to_nv9097_stencil_op(front->op.fail));
2647 P_NV9097_SET_STENCIL_OP_ZFAIL(p, vk_to_nv9097_stencil_op(front->op.depth_fail));
2648 P_NV9097_SET_STENCIL_OP_ZPASS(p, vk_to_nv9097_stencil_op(front->op.pass));
2649 P_NV9097_SET_STENCIL_FUNC(p, vk_to_nv9097_compare_op(front->op.compare));
2650
2651 P_MTHD(p, NV9097, SET_BACK_STENCIL_OP_FAIL);
2652 P_NV9097_SET_BACK_STENCIL_OP_FAIL(p, vk_to_nv9097_stencil_op(back->op.fail));
2653 P_NV9097_SET_BACK_STENCIL_OP_ZFAIL(p, vk_to_nv9097_stencil_op(back->op.depth_fail));
2654 P_NV9097_SET_BACK_STENCIL_OP_ZPASS(p, vk_to_nv9097_stencil_op(back->op.pass));
2655 P_NV9097_SET_BACK_STENCIL_FUNC(p, vk_to_nv9097_compare_op(back->op.compare));
2656 }
2657
2658 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK)) {
2659 P_IMMD(p, NV9097, SET_STENCIL_FUNC_MASK, front->compare_mask);
2660 P_IMMD(p, NV9097, SET_BACK_STENCIL_FUNC_MASK, back->compare_mask);
2661 }
2662
2663 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK)) {
2664 P_IMMD(p, NV9097, SET_STENCIL_MASK, front->write_mask);
2665 P_IMMD(p, NV9097, SET_BACK_STENCIL_MASK, back->write_mask);
2666 }
2667
2668 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE)) {
2669 P_IMMD(p, NV9097, SET_STENCIL_FUNC_REF, front->reference);
2670 P_IMMD(p, NV9097, SET_BACK_STENCIL_FUNC_REF, back->reference);
2671 }
2672 }
2673
2674 static uint32_t
2675 vk_to_nv9097_logic_op(VkLogicOp vk_op)
2676 {
2677 ASSERTED uint16_t vk_to_nv9097[] = {
2678 [VK_LOGIC_OP_CLEAR] = NV9097_SET_LOGIC_OP_FUNC_V_CLEAR,
2679 [VK_LOGIC_OP_AND] = NV9097_SET_LOGIC_OP_FUNC_V_AND,
2680 [VK_LOGIC_OP_AND_REVERSE] = NV9097_SET_LOGIC_OP_FUNC_V_AND_REVERSE,
2681 [VK_LOGIC_OP_COPY] = NV9097_SET_LOGIC_OP_FUNC_V_COPY,
2682 [VK_LOGIC_OP_AND_INVERTED] = NV9097_SET_LOGIC_OP_FUNC_V_AND_INVERTED,
2683 [VK_LOGIC_OP_NO_OP] = NV9097_SET_LOGIC_OP_FUNC_V_NOOP,
2684 [VK_LOGIC_OP_XOR] = NV9097_SET_LOGIC_OP_FUNC_V_XOR,
2685 [VK_LOGIC_OP_OR] = NV9097_SET_LOGIC_OP_FUNC_V_OR,
2686 [VK_LOGIC_OP_NOR] = NV9097_SET_LOGIC_OP_FUNC_V_NOR,
2687 [VK_LOGIC_OP_EQUIVALENT] = NV9097_SET_LOGIC_OP_FUNC_V_EQUIV,
2688 [VK_LOGIC_OP_INVERT] = NV9097_SET_LOGIC_OP_FUNC_V_INVERT,
2689 [VK_LOGIC_OP_OR_REVERSE] = NV9097_SET_LOGIC_OP_FUNC_V_OR_REVERSE,
2690 [VK_LOGIC_OP_COPY_INVERTED] = NV9097_SET_LOGIC_OP_FUNC_V_COPY_INVERTED,
2691 [VK_LOGIC_OP_OR_INVERTED] = NV9097_SET_LOGIC_OP_FUNC_V_OR_INVERTED,
2692 [VK_LOGIC_OP_NAND] = NV9097_SET_LOGIC_OP_FUNC_V_NAND,
2693 [VK_LOGIC_OP_SET] = NV9097_SET_LOGIC_OP_FUNC_V_SET,
2694 };
2695 assert(vk_op < ARRAY_SIZE(vk_to_nv9097));
2696
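/* As with the other enum helpers, the hardware values are the GL logic-op
 * enums (GL_CLEAR = 0x1500 ... GL_SET = 0x150f) in the same order as
 * VkLogicOp, so 0x1500 | vk_op is the whole conversion and the table is
 * assert-only.
 */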
2697 uint32_t nv9097_op = 0x1500 | vk_op;
2698 assert(nv9097_op == vk_to_nv9097[vk_op]);
2699 return nv9097_op;
2700 }
2701
2702 static uint32_t
2703 vk_to_nv9097_blend_op(VkBlendOp vk_op)
2704 {
2705 #define OP(vk, nv) [VK_BLEND_OP_##vk] = NV9097_SET_BLEND_COLOR_OP_V_OGL_##nv
2706 ASSERTED uint16_t vk_to_nv9097[] = {
2707 OP(ADD, FUNC_ADD),
2708 OP(SUBTRACT, FUNC_SUBTRACT),
2709 OP(REVERSE_SUBTRACT, FUNC_REVERSE_SUBTRACT),
2710 OP(MIN, MIN),
2711 OP(MAX, MAX),
2712 };
2713 assert(vk_op < ARRAY_SIZE(vk_to_nv9097));
2714 #undef OP
2715
2716 return vk_to_nv9097[vk_op];
2717 }
2718
2719 static uint32_t
2720 vk_to_nv9097_blend_factor(VkBlendFactor vk_factor)
2721 {
2722 #define FACTOR(vk, nv) [VK_BLEND_FACTOR_##vk] = \
2723 NV9097_SET_BLEND_COLOR_SOURCE_COEFF_V_##nv
2724 ASSERTED uint16_t vk_to_nv9097[] = {
2725 FACTOR(ZERO, OGL_ZERO),
2726 FACTOR(ONE, OGL_ONE),
2727 FACTOR(SRC_COLOR, OGL_SRC_COLOR),
2728 FACTOR(ONE_MINUS_SRC_COLOR, OGL_ONE_MINUS_SRC_COLOR),
2729 FACTOR(DST_COLOR, OGL_DST_COLOR),
2730 FACTOR(ONE_MINUS_DST_COLOR, OGL_ONE_MINUS_DST_COLOR),
2731 FACTOR(SRC_ALPHA, OGL_SRC_ALPHA),
2732 FACTOR(ONE_MINUS_SRC_ALPHA, OGL_ONE_MINUS_SRC_ALPHA),
2733 FACTOR(DST_ALPHA, OGL_DST_ALPHA),
2734 FACTOR(ONE_MINUS_DST_ALPHA, OGL_ONE_MINUS_DST_ALPHA),
2735 FACTOR(CONSTANT_COLOR, OGL_CONSTANT_COLOR),
2736 FACTOR(ONE_MINUS_CONSTANT_COLOR, OGL_ONE_MINUS_CONSTANT_COLOR),
2737 FACTOR(CONSTANT_ALPHA, OGL_CONSTANT_ALPHA),
2738 FACTOR(ONE_MINUS_CONSTANT_ALPHA, OGL_ONE_MINUS_CONSTANT_ALPHA),
2739 FACTOR(SRC_ALPHA_SATURATE, OGL_SRC_ALPHA_SATURATE),
2740 FACTOR(SRC1_COLOR, OGL_SRC1COLOR),
2741 FACTOR(ONE_MINUS_SRC1_COLOR, OGL_INVSRC1COLOR),
2742 FACTOR(SRC1_ALPHA, OGL_SRC1ALPHA),
2743 FACTOR(ONE_MINUS_SRC1_ALPHA, OGL_INVSRC1ALPHA),
2744 };
2745 assert(vk_factor < ARRAY_SIZE(vk_to_nv9097));
2746 #undef FACTOR
2747
2748 return vk_to_nv9097[vk_factor];
2749 }
2750
2751 void
2752 nvk_mme_set_write_mask(struct mme_builder *b)
2753 {
2754 struct mme_value count = mme_load(b);
2755 struct mme_value mask = mme_load(b);
2756
2757 /*
2758 * mask is a bit field
2759 *
2760 * attachment index 88887777666655554444333322221111
2761 * component abgrabgrabgrabgrabgrabgrabgrabgr
2762 */
2763
2764 struct mme_value common_mask = mme_mov(b, mme_imm(1));
2765 struct mme_value first = mme_and(b, mask, mme_imm(BITFIELD_RANGE(0, 4)));
2766 struct mme_value i = mme_mov(b, mme_zero());
2767
2768 mme_while(b, ine, i, count) {
2769 /*
2770 We call NV9097_SET_CT_WRITE per attachment. It needs a value as:
2771 0x0000 0000 0000 0000 000a 000b 000g 000r
2772
2773 So for i=0 a mask of
2774 0x0000 0000 0000 0000 0000 0000 0000 1111
2775 becomes
2776 0x0000 0000 0000 0000 0001 0001 0001 0001
2777 */
2778
2779 struct mme_value val = mme_merge(b, mme_zero(), mask, 0, 1, 0);
2780 mme_merge_to(b, val, val, mask, 4, 1, 1);
2781 mme_merge_to(b, val, val, mask, 8, 1, 2);
2782 mme_merge_to(b, val, val, mask, 12, 1, 3);
2783
2784 mme_mthd_arr(b, NV9097_SET_CT_WRITE(0), i);
2785 mme_emit(b, val);
2786 mme_free_reg(b, val);
2787
2788 /* Check if all masks are common */
2789 struct mme_value temp = mme_and(b, mask, mme_imm(BITFIELD_RANGE(0, 4)));
2790 mme_if(b, ine, first, temp) {
2791 mme_mov_to(b, common_mask, mme_zero());
2792 }
2793 mme_free_reg(b, temp);
2794
2795 mme_srl_to(b, mask, mask, mme_imm(4));
2796
2797 mme_add_to(b, i, i, mme_imm(1));
2798 }
2799
2800 mme_mthd(b, NV9097_SET_SINGLE_CT_WRITE_CONTROL);
2801 mme_emit(b, common_mask);
2802 }
2803
2804 static void
2805 nvk_flush_cb_state(struct nvk_cmd_buffer *cmd)
2806 {
2807 struct nvk_rendering_state *render = &cmd->state.gfx.render;
2808 const struct vk_dynamic_graphics_state *dyn =
2809 &cmd->vk.dynamic_graphics_state;
2810
2811 struct nv_push *p =
2812 nvk_cmd_buffer_push(cmd, 15 + 10 * render->color_att_count);
2813
2814 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_LOGIC_OP_ENABLE))
2815 P_IMMD(p, NV9097, SET_LOGIC_OP, dyn->cb.logic_op_enable);
2816
2817 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_LOGIC_OP)) {
2818 const uint32_t func = vk_to_nv9097_logic_op(dyn->cb.logic_op);
2819 P_IMMD(p, NV9097, SET_LOGIC_OP_FUNC, func);
2820 }
2821
2822 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_ENABLES)) {
2823 for (uint8_t a = 0; a < render->color_att_count; a++) {
2824 P_IMMD(p, NV9097, SET_BLEND(a), dyn->cb.attachments[a].blend_enable);
2825 }
2826 }
2827
2828 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_EQUATIONS)) {
2829 for (uint8_t a = 0; a < render->color_att_count; a++) {
2830 const struct vk_color_blend_attachment_state *att =
2831 &dyn->cb.attachments[a];
2832 P_MTHD(p, NV9097, SET_BLEND_PER_TARGET_SEPARATE_FOR_ALPHA(a));
2833 P_NV9097_SET_BLEND_PER_TARGET_SEPARATE_FOR_ALPHA(p, a, ENABLE_TRUE);
2834 P_NV9097_SET_BLEND_PER_TARGET_COLOR_OP(p, a,
2835 vk_to_nv9097_blend_op(att->color_blend_op));
2836 P_NV9097_SET_BLEND_PER_TARGET_COLOR_SOURCE_COEFF(p, a,
2837 vk_to_nv9097_blend_factor(att->src_color_blend_factor));
2838 P_NV9097_SET_BLEND_PER_TARGET_COLOR_DEST_COEFF(p, a,
2839 vk_to_nv9097_blend_factor(att->dst_color_blend_factor));
2840 P_NV9097_SET_BLEND_PER_TARGET_ALPHA_OP(p, a,
2841 vk_to_nv9097_blend_op(att->alpha_blend_op));
2842 P_NV9097_SET_BLEND_PER_TARGET_ALPHA_SOURCE_COEFF(p, a,
2843 vk_to_nv9097_blend_factor(att->src_alpha_blend_factor));
2844 P_NV9097_SET_BLEND_PER_TARGET_ALPHA_DEST_COEFF(p, a,
2845 vk_to_nv9097_blend_factor(att->dst_alpha_blend_factor));
2846 }
2847 }
2848
2849 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_WRITE_MASKS) ||
2850 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES) ||
2851 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RP_ATTACHMENTS) ||
2852 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_COLOR_ATTACHMENT_MAP)) {
2853 uint32_t color_write_enables = 0x0;
2854 for (uint8_t a = 0; a < render->color_att_count; a++) {
2855 if (dyn->cb.color_write_enables & BITFIELD_BIT(a))
2856 color_write_enables |= 0xf << (4 * a);
2857 }
2858
2859 uint32_t cb_att_write_mask = 0x0;
2860 for (uint8_t a = 0; a < render->color_att_count; a++)
2861 cb_att_write_mask |= dyn->cb.attachments[a].write_mask << (a * 4);
2862
2863 uint32_t rp_att_write_mask = 0x0;
2864 for (uint8_t a = 0; a < MESA_VK_MAX_COLOR_ATTACHMENTS; a++) {
2865 if (dyn->rp.attachments & (MESA_VK_RP_ATTACHMENT_COLOR_0_BIT << a))
2866 rp_att_write_mask |= 0xf << (4 * a);
2867 }
2868
2869 uint32_t att_has_loc_mask = 0x0;
2870 for (uint8_t a = 0; a < MESA_VK_MAX_COLOR_ATTACHMENTS; a++) {
2871 if (dyn->cal.color_map[a] != MESA_VK_ATTACHMENT_UNUSED)
2872 att_has_loc_mask |= 0xf << (4 * a);
2873 }
2874
2875 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_SET_WRITE_MASK));
2876 P_INLINE_DATA(p, render->color_att_count);
2877 P_INLINE_DATA(p, color_write_enables &
2878 cb_att_write_mask &
2879 rp_att_write_mask &
2880 att_has_loc_mask);
2881 }
2882
2883 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_COLOR_ATTACHMENT_MAP)) {
2884 int8_t loc_att[NVK_MAX_RTS] = { -1, -1, -1, -1, -1, -1, -1, -1};
2885 uint8_t max_loc = 0;
2886 uint32_t att_used = 0;
2887 for (uint8_t a = 0; a < MESA_VK_MAX_COLOR_ATTACHMENTS; a++) {
2888 if (dyn->cal.color_map[a] == MESA_VK_ATTACHMENT_UNUSED)
2889 continue;
2890
2891 att_used |= BITFIELD_BIT(a);
2892
2893 assert(dyn->cal.color_map[a] < NVK_MAX_RTS);
2894 loc_att[dyn->cal.color_map[a]] = a;
2895 max_loc = MAX2(max_loc, dyn->cal.color_map[a]);
2896 }
2897
2898 for (uint8_t l = 0; l < NVK_MAX_RTS; l++) {
2899 if (loc_att[l] >= 0)
2900 continue;
2901
2902 /* Just grab any color attachment. The way we set up color targets
2903 * in BeginRenderPass ensures that every color target is either the
2904 * valid color target referenced by this render pass or a valid NULL
2905 * target. If we end up mapping to some other target in this render
2906 * pass, the handling of att_has_loc_mask above will ensure that no
2907 * color writes actually happen.
2908 */
2909 uint8_t a = ffs(~att_used) - 1;
2910 att_used |= BITFIELD_BIT(a);
2911 loc_att[l] = a;
2912 }
2913
2914 P_IMMD(p, NV9097, SET_CT_SELECT, {
2915 .target_count = max_loc + 1,
2916 .target0 = loc_att[0],
2917 .target1 = loc_att[1],
2918 .target2 = loc_att[2],
2919 .target3 = loc_att[3],
2920 .target4 = loc_att[4],
2921 .target5 = loc_att[5],
2922 .target6 = loc_att[6],
2923 .target7 = loc_att[7],
2924 });
2925 }
2926
2927 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS)) {
2928 P_MTHD(p, NV9097, SET_BLEND_CONST_RED);
2929 P_NV9097_SET_BLEND_CONST_RED(p, fui(dyn->cb.blend_constants[0]));
2930 P_NV9097_SET_BLEND_CONST_GREEN(p, fui(dyn->cb.blend_constants[1]));
2931 P_NV9097_SET_BLEND_CONST_BLUE(p, fui(dyn->cb.blend_constants[2]));
2932 P_NV9097_SET_BLEND_CONST_ALPHA(p, fui(dyn->cb.blend_constants[3]));
2933 }
2934 }
2935
2936 static void
2937 nvk_flush_dynamic_state(struct nvk_cmd_buffer *cmd)
2938 {
2939 struct vk_dynamic_graphics_state *dyn =
2940 &cmd->vk.dynamic_graphics_state;
2941
2942 if (!vk_dynamic_graphics_state_any_dirty(dyn))
2943 return;
2944
2945 nvk_flush_vi_state(cmd);
2946 nvk_flush_ia_state(cmd);
2947 nvk_flush_ts_state(cmd);
2948 nvk_flush_vp_state(cmd);
2949 nvk_flush_rs_state(cmd);
2950
2951 /* MESA_VK_DYNAMIC_FSR */
2952
2953 nvk_flush_ms_state(cmd);
2954 nvk_flush_ds_state(cmd);
2955 nvk_flush_cb_state(cmd);
2956
2957 vk_dynamic_graphics_state_clear_dirty(dyn);
2958 }
2959
2960 void
2961 nvk_mme_bind_cbuf_desc(struct mme_builder *b)
2962 {
2963 /* First 4 bits are group, later bits are slot */
2964 struct mme_value group_slot = mme_load(b);
2965
2966 struct mme_value addr_lo, addr_hi, size;
2967 if (nvk_use_bindless_cbuf(b->devinfo)) {
2968 if (b->devinfo->cls_eng3d >= TURING_A) {
2969 struct mme_value64 addr = mme_load_addr64(b);
2970 mme_tu104_read_fifoed(b, addr, mme_imm(2));
2971 }
2972
2973 /* Load the descriptor */
2974 struct mme_value desc_lo = mme_load(b);
2975 struct mme_value desc_hi = mme_load(b);
2976
2977 /* The bottom 45 bits are addr >> 4 */
2978 addr_lo = mme_merge(b, mme_zero(), desc_lo, 4, 28, 0);
2979 addr_hi = mme_merge(b, mme_zero(), desc_lo, 0, 4, 28);
2980 mme_merge_to(b, addr_hi, addr_hi, desc_hi, 4, 13, 0);
2981
2982 /* The top 19 bits are size >> 4 */
2983 size = mme_merge(b, mme_zero(), desc_hi, 4, 19, 13);
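/* Putting the two comments above together, the 64-bit bindless cbuf
 * descriptor is laid out as
 *
 *    bits  0..44 : base address >> 4
 *    bits 45..63 : size >> 4
 *
 * and the merges above re-shift both fields left by 4 to recover the
 * byte-granular address and size.
 */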
2984
2985 mme_free_reg(b, desc_hi);
2986 mme_free_reg(b, desc_lo);
2987 } else {
2988 if (b->devinfo->cls_eng3d >= TURING_A) {
2989 struct mme_value64 addr = mme_load_addr64(b);
2990 mme_tu104_read_fifoed(b, addr, mme_imm(3));
2991 }
2992
2993 /* Load the descriptor */
2994 addr_lo = mme_load(b);
2995 addr_hi = mme_load(b);
2996 size = mme_load(b);
2997 }
2998
2999 struct mme_value cb = mme_alloc_reg(b);
3000 mme_if(b, ieq, size, mme_zero()) {
3001 /* Bottom bit is the valid bit, 8:4 are shader slot */
3002 mme_merge_to(b, cb, mme_zero(), group_slot, 4, 5, 4);
3003 }
3004
3005 mme_if(b, ine, size, mme_zero()) {
3006 /* size = min(size, NVK_MAX_CBUF_SIZE) */
3007 assert(util_is_power_of_two_nonzero(NVK_MAX_CBUF_SIZE));
3008 struct mme_value is_large =
3009 mme_and(b, size, mme_imm(~(NVK_MAX_CBUF_SIZE - 1)));
3010 mme_if(b, ine, is_large, mme_zero()) {
3011 mme_mov_to(b, size, mme_imm(NVK_MAX_CBUF_SIZE));
3012 }
3013
3014 mme_mthd(b, NV9097_SET_CONSTANT_BUFFER_SELECTOR_A);
3015 mme_emit(b, size);
3016 mme_emit(b, addr_hi);
3017 mme_emit(b, addr_lo);
3018
3019 /* Bottom bit is the valid bit, 8:4 are shader slot */
3020 mme_merge_to(b, cb, mme_imm(1), group_slot, 4, 5, 4);
3021 }
3022
3023 mme_free_reg(b, addr_hi);
3024 mme_free_reg(b, addr_lo);
3025 mme_free_reg(b, size);
3026
3027 /* The group comes in the bottom 4 bits in group_slot and we need to
3028 * combine it with the method. However, unlike most array methods with a
3029 * stride of 1 dword, BIND_GROUP_CONSTANT_BUFFER has a stride of 32B or 8
3030 * dwords. This means we need to also shift by 3.
3031 */
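   /* For example (illustrative values): group_slot = 0x25 encodes group 5,
    * slot 2, and the merge below yields 5 << 3 = 40 dwords, which lands on
    * NV9097_BIND_GROUP_CONSTANT_BUFFER(5).
    */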
3032 struct mme_value group = mme_merge(b, mme_imm(0), group_slot, 3, 4, 0);
3033 mme_mthd_arr(b, NV9097_BIND_GROUP_CONSTANT_BUFFER(0), group);
3034 mme_emit(b, cb);
3035 }
3036
3037 static void
3038 nvk_flush_descriptors(struct nvk_cmd_buffer *cmd)
3039 {
3040 struct nvk_device *dev = nvk_cmd_buffer_device(cmd);
3041 struct nvk_physical_device *pdev = nvk_device_physical(dev);
3042 const uint32_t min_cbuf_alignment = nvk_min_cbuf_alignment(&pdev->info);
3043 struct nvk_descriptor_state *desc = &cmd->state.gfx.descriptors;
3044
3045 nvk_cmd_buffer_flush_push_descriptors(cmd, desc);
3046
3047 /* Find cbuf maps for the 5 cbuf groups */
3048 const struct nvk_shader *cbuf_shaders[5] = { NULL, };
3049 for (gl_shader_stage stage = 0; stage < MESA_SHADER_STAGES; stage++) {
3050 const struct nvk_shader *shader = cmd->state.gfx.shaders[stage];
3051 if (shader == NULL)
3052 continue;
3053
3054 uint32_t group = nvk_cbuf_binding_for_stage(stage);
3055 assert(group < ARRAY_SIZE(cbuf_shaders));
3056 cbuf_shaders[group] = shader;
3057 }
3058
3059 bool bound_any_cbuf = false;
3060 for (uint32_t g = 0; g < ARRAY_SIZE(cbuf_shaders); g++) {
3061 if (cbuf_shaders[g] == NULL)
3062 continue;
3063
3064 const struct nvk_shader *shader = cbuf_shaders[g];
3065 const struct nvk_cbuf_map *cbuf_map = &shader->cbuf_map;
3066 struct nvk_cbuf_group *group = &cmd->state.gfx.cbuf_groups[g];
3067
3068 /* We only bother to re-bind cbufs that are in use */
3069 const uint32_t rebind =
3070 group->dirty & BITFIELD_MASK(cbuf_map->cbuf_count);
3071 if (!rebind)
3072 continue;
3073
3074 u_foreach_bit(c, rebind) {
3075 const struct nvk_cbuf *cbuf = &group->cbufs[c];
3076
3077 /* We bind these at the very end */
3078 if (cbuf->type == NVK_CBUF_TYPE_ROOT_DESC)
3079 continue;
3080
3081 bound_any_cbuf = true;
3082
3083 struct nvk_buffer_address ba;
3084 if (nvk_cmd_buffer_get_cbuf_addr(cmd, desc, shader, cbuf, &ba)) {
3085 assert(ba.base_addr % min_cbuf_alignment == 0);
3086 ba.size = align(ba.size, min_cbuf_alignment);
3087 ba.size = MIN2(ba.size, NVK_MAX_CBUF_SIZE);
3088
3089 struct nv_push *p = nvk_cmd_buffer_push(cmd, 6);
3090
3091 if (ba.size > 0) {
3092 P_MTHD(p, NV9097, SET_CONSTANT_BUFFER_SELECTOR_A);
3093 P_NV9097_SET_CONSTANT_BUFFER_SELECTOR_A(p, ba.size);
3094 P_NV9097_SET_CONSTANT_BUFFER_SELECTOR_B(p, ba.base_addr >> 32);
3095 P_NV9097_SET_CONSTANT_BUFFER_SELECTOR_C(p, ba.base_addr);
3096 }
3097
3098 P_IMMD(p, NV9097, BIND_GROUP_CONSTANT_BUFFER(g), {
3099 .valid = ba.size > 0,
3100 .shader_slot = c,
3101 });
3102 } else {
3103 uint64_t desc_addr =
3104 nvk_cmd_buffer_get_cbuf_descriptor_addr(cmd, desc, cbuf);
3105
3106 if (nvk_cmd_buffer_3d_cls(cmd) >= TURING_A) {
3107 struct nv_push *p = nvk_cmd_buffer_push(cmd, 4);
3108
3109 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_BIND_CBUF_DESC));
3110 P_INLINE_DATA(p, g | (c << 4));
3111 P_INLINE_DATA(p, desc_addr >> 32);
3112 P_INLINE_DATA(p, desc_addr);
3113 } else {
3114 struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
3115
3116 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_BIND_CBUF_DESC));
3117 P_INLINE_DATA(p, g | (c << 4));
3118
3119 nv_push_update_count(p, 3);
3120 nvk_cmd_buffer_push_indirect(cmd, desc_addr, 12);
3121 }
3122 }
3123 }
3124
3125 group->dirty &= ~rebind;
3126 }
3127
3128 /* We bind all root descriptors last so that CONSTANT_BUFFER_SELECTOR is
3129 * always left pointing at the root descriptor table. This way draw
3130 * parameters and similar MME root table updates always hit the root
3131 * descriptor table and not some random UBO.
3132 */
3133 if (bound_any_cbuf) {
3134 struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
3135 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_SELECT_CB0));
3136 P_INLINE_DATA(p, 0);
3137 }
3138 }
3139
3140 static void
3141 nvk_flush_gfx_state(struct nvk_cmd_buffer *cmd)
3142 {
3143 nvk_flush_shaders(cmd);
3144 nvk_flush_dynamic_state(cmd);
3145 nvk_flush_descriptors(cmd);
3146 }
3147
3148 void
3149 nvk_mme_bind_ib(struct mme_builder *b)
3150 {
3151 struct mme_value64 addr = mme_load_addr64(b);
3152 struct mme_value size_B = mme_load(b);
3153
3154 struct mme_value addr_or = mme_or(b, addr.lo, addr.hi);
3155 mme_if(b, ieq, addr_or, mme_zero()) {
3156 mme_mov_to(b, size_B, mme_zero());
3157 }
3158 mme_free_reg(b, addr_or);
3159
3160 if (b->devinfo->cls_eng3d < TURING_A) {
3161 mme_if(b, ieq, size_B, mme_zero()) {
3162 nvk_mme_load_scratch_to(b, addr.hi, ZERO_ADDR_HI);
3163 nvk_mme_load_scratch_to(b, addr.lo, ZERO_ADDR_LO);
3164 }
3165 }
3166
3167 mme_mthd(b, NV9097_SET_INDEX_BUFFER_A);
3168 mme_emit(b, addr.hi);
3169 mme_emit(b, addr.lo);
3170
3171 if (b->devinfo->cls_eng3d >= TURING_A) {
3172 mme_mthd(b, NVC597_SET_INDEX_BUFFER_SIZE_A);
3173 mme_emit(b, mme_zero());
3174 mme_emit(b, size_B);
3175 } else {
3176 /* Convert to an end address */
3177 mme_add64_to(b, addr, addr, mme_value64(size_B, mme_zero()));
3178 mme_add64_to(b, addr, addr, mme_imm64(-1));
3179
3180 mme_mthd(b, NV9097_SET_INDEX_BUFFER_C);
3181 mme_emit(b, addr.hi);
3182 mme_emit(b, addr.lo);
3183 }
3184 mme_free_reg64(b, addr);
3185 mme_free_reg(b, size_B);
3186
3187 struct mme_value fmt = mme_load(b);
3188 struct mme_value restart = mme_mov(b, mme_imm(UINT32_MAX));
3189 struct mme_value index_type = mme_mov(b,
3190 mme_imm(NVC597_SET_INDEX_BUFFER_E_INDEX_SIZE_FOUR_BYTES));
3191
3192 /* The Vulkan and D3D enums don't overlap so we can handle both at the same
3193 * time with one MME macro.
3194 */
3195 UNUSED static const uint32_t DXGI_FORMAT_R32_UINT = 42;
3196 static const uint32_t DXGI_FORMAT_R16_UINT = 57;
3197 static const uint32_t DXGI_FORMAT_R8_UINT = 62;
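   /* For reference (values from the Vulkan headers): VK_INDEX_TYPE_UINT16 = 0,
    * VK_INDEX_TYPE_UINT32 = 1, and VK_INDEX_TYPE_UINT8_KHR = 1000265000, none
    * of which collide with the DXGI values above.
    */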
3198
3199 mme_if(b, ieq, fmt, mme_imm(VK_INDEX_TYPE_UINT16)) {
3200 mme_mov_to(b, restart, mme_imm(UINT16_MAX));
3201 mme_mov_to(b, index_type,
3202 mme_imm(NVC597_SET_INDEX_BUFFER_E_INDEX_SIZE_TWO_BYTES));
3203 }
3204
3205 mme_if(b, ieq, fmt, mme_imm(DXGI_FORMAT_R16_UINT)) {
3206 mme_mov_to(b, restart, mme_imm(UINT16_MAX));
3207 mme_mov_to(b, index_type,
3208 mme_imm(NVC597_SET_INDEX_BUFFER_E_INDEX_SIZE_TWO_BYTES));
3209 }
3210
3211 mme_if(b, ieq, fmt, mme_imm(VK_INDEX_TYPE_UINT8_KHR)) {
3212 mme_mov_to(b, restart, mme_imm(UINT8_MAX));
3213 mme_mov_to(b, index_type,
3214 mme_imm(NVC597_SET_INDEX_BUFFER_E_INDEX_SIZE_ONE_BYTE));
3215 }
3216
3217 mme_if(b, ieq, fmt, mme_imm(DXGI_FORMAT_R8_UINT)) {
3218 mme_mov_to(b, restart, mme_imm(UINT8_MAX));
3219 mme_mov_to(b, index_type,
3220 mme_imm(NVC597_SET_INDEX_BUFFER_E_INDEX_SIZE_ONE_BYTE));
3221 }
3222
3223 mme_mthd(b, NV9097_SET_DA_PRIMITIVE_RESTART_INDEX);
3224 mme_emit(b, restart);
3225
3226 mme_mthd(b, NV9097_SET_INDEX_BUFFER_E);
3227 mme_emit(b, index_type);
3228 }
3229
3230 VKAPI_ATTR void VKAPI_CALL
3231 nvk_CmdBindIndexBuffer2KHR(VkCommandBuffer commandBuffer,
3232 VkBuffer _buffer,
3233 VkDeviceSize offset,
3234 VkDeviceSize size,
3235 VkIndexType indexType)
3236 {
3237 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
3238 VK_FROM_HANDLE(nvk_buffer, buffer, _buffer);
3239 struct nvk_addr_range addr_range =
3240 nvk_buffer_addr_range(buffer, offset, size);
3241
3242 struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
3243 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_BIND_IB));
3244 P_INLINE_DATA(p, addr_range.addr >> 32);
3245 P_INLINE_DATA(p, addr_range.addr);
3246 assert(addr_range.range <= UINT32_MAX);
3247 P_INLINE_DATA(p, addr_range.range);
3248 P_INLINE_DATA(p, indexType);
3249 }
3250
3251 void
3252 nvk_mme_bind_vb(struct mme_builder *b)
3253 {
3254 struct mme_value vb_idx = mme_load(b);
3255 struct mme_value64 addr = mme_load_addr64(b);
3256 struct mme_value size_B = mme_load(b);
3257
3258 struct mme_value addr_or = mme_or(b, addr.lo, addr.hi);
3259 mme_if(b, ieq, addr_or, mme_zero()) {
3260 mme_mov_to(b, size_B, mme_zero());
3261 }
3262 mme_free_reg(b, addr_or);
3263
3264 if (b->devinfo->cls_eng3d < TURING_A) {
3265 mme_if(b, ieq, size_B, mme_zero()) {
3266 nvk_mme_load_scratch_to(b, addr.hi, ZERO_ADDR_HI);
3267 nvk_mme_load_scratch_to(b, addr.lo, ZERO_ADDR_LO);
3268 }
3269 }
3270
3271 struct mme_value vb_idx4 = mme_sll(b, vb_idx, mme_imm(2));
3272 mme_mthd_arr(b, NV9097_SET_VERTEX_STREAM_A_LOCATION_A(0), vb_idx4);
3273 mme_free_reg(b, vb_idx4);
3274 mme_emit(b, addr.hi);
3275 mme_emit(b, addr.lo);
3276
3277 if (b->devinfo->cls_eng3d >= TURING_A) {
3278 struct mme_value vb_idx2 = mme_sll(b, vb_idx, mme_imm(1));
3279 mme_mthd_arr(b, NVC597_SET_VERTEX_STREAM_SIZE_A(0), vb_idx2);
3280 mme_emit(b, mme_zero());
3281 mme_emit(b, size_B);
3282 } else {
3283 /* Convert to an end address */
3284 mme_add64_to(b, addr, addr, mme_value64(size_B, mme_zero()));
3285 mme_add64_to(b, addr, addr, mme_imm64(-1));
3286
3287 struct mme_value vb_idx2 = mme_sll(b, vb_idx, mme_imm(1));
3288 mme_mthd_arr(b, NV9097_SET_VERTEX_STREAM_LIMIT_A_A(0), vb_idx2);
3289 mme_emit(b, addr.hi);
3290 mme_emit(b, addr.lo);
3291 }
3292 }
3293
3294 static void
3295 nvk_mme_bind_vb_test_check(const struct nv_device_info *devinfo,
3296 const struct nvk_mme_test_case *test,
3297 const struct nvk_mme_mthd_data *results)
3298 {
3299 const uint32_t vb_idx = test->params[0];
3300 const uint32_t addr_hi = test->params[1];
3301 const uint32_t addr_lo = test->params[2];
3302
3303 uint32_t size_B = test->params[3];
3304 if (addr_hi == 0 && addr_lo == 0)
3305 size_B = 0;
3306
3307 assert(results[0].mthd == NV9097_SET_VERTEX_STREAM_A_LOCATION_A(vb_idx));
3308 assert(results[1].mthd == NV9097_SET_VERTEX_STREAM_A_LOCATION_B(vb_idx));
3309
3310 if (devinfo->cls_eng3d >= TURING_A) {
3311 assert(results[0].data == addr_hi);
3312 assert(results[1].data == addr_lo);
3313
3314 assert(results[2].mthd == NVC597_SET_VERTEX_STREAM_SIZE_A(3));
3315 assert(results[3].mthd == NVC597_SET_VERTEX_STREAM_SIZE_B(3));
3316 assert(results[2].data == 0);
3317 assert(results[3].data == size_B);
3318 } else {
3319 uint64_t addr = ((uint64_t)addr_hi << 32) | addr_lo;
3320 if (size_B == 0)
3321 addr = ((uint64_t)test->init[0].data << 32) | test->init[1].data;
3322
3323 assert(results[0].data == addr >> 32);
3324 assert(results[1].data == (uint32_t)addr);
3325
3326 const uint64_t limit = (addr + size_B) - 1;
3327 assert(results[2].mthd == NV9097_SET_VERTEX_STREAM_LIMIT_A_A(3));
3328 assert(results[3].mthd == NV9097_SET_VERTEX_STREAM_LIMIT_A_B(3));
3329 assert(results[2].data == limit >> 32);
3330 assert(results[3].data == (uint32_t)limit);
3331 }
3332 }
3333
3334 const struct nvk_mme_test_case nvk_mme_bind_vb_tests[] = {{
3335 .params = (uint32_t[]) { 3, 0xff3, 0xff4ab000, 0x10000 },
3336 .check = nvk_mme_bind_vb_test_check,
3337 }, {
3338 .init = (struct nvk_mme_mthd_data[]) {
3339 { NVK_SET_MME_SCRATCH(ZERO_ADDR_HI), 0xff3 },
3340 { NVK_SET_MME_SCRATCH(ZERO_ADDR_LO), 0xff356000 },
3341 { }
3342 },
3343 .params = (uint32_t[]) { 3, 0xff3, 0xff4ab000, 0 },
3344 .check = nvk_mme_bind_vb_test_check,
3345 }, {
3346 .init = (struct nvk_mme_mthd_data[]) {
3347 { NVK_SET_MME_SCRATCH(ZERO_ADDR_HI), 0xff3 },
3348 { NVK_SET_MME_SCRATCH(ZERO_ADDR_LO), 0xff356000 },
3349 { }
3350 },
3351 .params = (uint32_t[]) { 3, 0, 0, 0x800 },
3352 .check = nvk_mme_bind_vb_test_check,
3353 }, {}};
3354
3355 void
3356 nvk_cmd_bind_vertex_buffer(struct nvk_cmd_buffer *cmd, uint32_t vb_idx,
3357 struct nvk_addr_range addr_range)
3358 {
3359 /* Used for meta save/restore */
3360 if (vb_idx == 0)
3361 cmd->state.gfx.vb0 = addr_range;
3362
3363 struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
3364 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_BIND_VB));
3365 P_INLINE_DATA(p, vb_idx);
3366 P_INLINE_DATA(p, addr_range.addr >> 32);
3367 P_INLINE_DATA(p, addr_range.addr);
3368 assert(addr_range.range <= UINT32_MAX);
3369 P_INLINE_DATA(p, addr_range.range);
3370 }
3371
3372 VKAPI_ATTR void VKAPI_CALL
3373 nvk_CmdBindVertexBuffers2(VkCommandBuffer commandBuffer,
3374 uint32_t firstBinding,
3375 uint32_t bindingCount,
3376 const VkBuffer *pBuffers,
3377 const VkDeviceSize *pOffsets,
3378 const VkDeviceSize *pSizes,
3379 const VkDeviceSize *pStrides)
3380 {
3381 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
3382
3383 if (pStrides) {
3384 vk_cmd_set_vertex_binding_strides(&cmd->vk, firstBinding,
3385 bindingCount, pStrides);
3386 }
3387
3388 for (uint32_t i = 0; i < bindingCount; i++) {
3389 VK_FROM_HANDLE(nvk_buffer, buffer, pBuffers[i]);
3390 uint32_t idx = firstBinding + i;
3391
3392 uint64_t size = pSizes ? pSizes[i] : VK_WHOLE_SIZE;
3393 const struct nvk_addr_range addr_range =
3394 nvk_buffer_addr_range(buffer, pOffsets[i], size);
3395
3396 nvk_cmd_bind_vertex_buffer(cmd, idx, addr_range);
3397 }
3398 }
3399
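/* Emits mthd with val and mirrors the same value into cb0 (the root
 * descriptor table) at cb0_offset via LOAD_CONSTANT_BUFFER.  On Turing+ the
 * shadowed method value is checked first so redundant updates are skipped.
 * This relies on CONSTANT_BUFFER_SELECTOR already pointing at cb0 (cf. the
 * comment in nvk_flush_descriptors).
 */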
3400 static void
3401 nvk_mme_set_cb0_mthd(struct mme_builder *b,
3402 uint16_t cb0_offset,
3403 uint16_t mthd,
3404 struct mme_value val)
3405 {
3406 if (b->devinfo->cls_eng3d >= TURING_A) {
3407 struct mme_value old = mme_state(b, mthd);
3408 mme_if(b, ine, old, val) {
3409 mme_mthd(b, mthd);
3410 mme_emit(b, val);
3411
3412 mme_mthd(b, NV9097_LOAD_CONSTANT_BUFFER_OFFSET);
3413 mme_emit(b, mme_imm(cb0_offset));
3414 mme_mthd(b, NV9097_LOAD_CONSTANT_BUFFER(0));
3415 mme_emit(b, val);
3416 }
3417 mme_free_reg(b, old);
3418 } else {
3419 /* Fermi is really tight on registers. Don't bother with the if and set
3420 * both unconditionally for now.
3421 */
3422 mme_mthd(b, mthd);
3423 mme_emit(b, val);
3424
3425 mme_mthd(b, NV9097_LOAD_CONSTANT_BUFFER_OFFSET);
3426 mme_emit(b, mme_imm(cb0_offset));
3427 mme_mthd(b, NV9097_LOAD_CONSTANT_BUFFER(0));
3428 mme_emit(b, val);
3429 }
3430 }
3431
3432 static void
3433 nvk_mme_set_cb0_scratch(struct mme_builder *b,
3434 uint16_t cb0_offset,
3435 enum nvk_mme_scratch scratch,
3436 struct mme_value val)
3437 {
3438 const uint16_t mthd = NV9097_SET_MME_SHADOW_SCRATCH(scratch);
3439 nvk_mme_set_cb0_mthd(b, cb0_offset, mthd, val);
3440 }
3441
3442 struct mme_draw_params {
3443 struct mme_value base_vertex;
3444 struct mme_value first_vertex;
3445 struct mme_value first_instance;
3446 struct mme_value draw_index;
3447 };
3448
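/* Writes the per-draw parameters both to the corresponding 3D methods or
 * scratch registers and to the root descriptor table (cb0), so shaders can
 * read the base vertex, base instance, draw index, and view index.
 */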
3449 static void
3450 nvk_mme_build_set_draw_params(struct mme_builder *b,
3451 const struct mme_draw_params *p)
3452 {
3453 nvk_mme_set_cb0_scratch(b, nvk_root_descriptor_offset(draw.base_vertex),
3454 NVK_MME_SCRATCH_CB0_FIRST_VERTEX,
3455 p->first_vertex);
3456 nvk_mme_set_cb0_mthd(b, nvk_root_descriptor_offset(draw.base_instance),
3457 NV9097_SET_GLOBAL_BASE_INSTANCE_INDEX,
3458 p->first_instance);
3459 nvk_mme_set_cb0_scratch(b, nvk_root_descriptor_offset(draw.draw_index),
3460 NVK_MME_SCRATCH_CB0_DRAW_INDEX,
3461 p->draw_index);
3462 nvk_mme_set_cb0_scratch(b, nvk_root_descriptor_offset(draw.view_index),
3463 NVK_MME_SCRATCH_CB0_VIEW_INDEX,
3464 mme_zero());
3465
3466 mme_mthd(b, NV9097_SET_GLOBAL_BASE_VERTEX_INDEX);
3467 mme_emit(b, p->base_vertex);
3468 mme_mthd(b, NV9097_SET_VERTEX_ID_BASE);
3469 mme_emit(b, p->base_vertex);
3470 }
3471
3472 static void
3473 nvk_mme_emit_view_index(struct mme_builder *b, struct mme_value view_index)
3474 {
3475 /* Set the push constant */
3476 nvk_mme_set_cb0_scratch(b, nvk_root_descriptor_offset(draw.view_index),
3477 NVK_MME_SCRATCH_CB0_VIEW_INDEX,
3478 view_index);
3479
3480 /* Set the layer to the view index */
3481 STATIC_ASSERT(DRF_LO(NV9097_SET_RT_LAYER_V) == 0);
3482 STATIC_ASSERT(NV9097_SET_RT_LAYER_CONTROL_V_SELECTS_LAYER == 0);
3483 mme_mthd(b, NV9097_SET_RT_LAYER);
3484 mme_emit(b, view_index);
3485 }
3486
3487 static void
3488 nvk_mme_build_draw_loop(struct mme_builder *b,
3489 struct mme_value instance_count,
3490 struct mme_value first_vertex,
3491 struct mme_value vertex_count)
3492 {
3493 struct mme_value begin = nvk_mme_load_scratch(b, DRAW_BEGIN);
3494
3495 mme_loop(b, instance_count) {
3496 mme_mthd(b, NV9097_BEGIN);
3497 mme_emit(b, begin);
3498
3499 mme_mthd(b, NV9097_SET_VERTEX_ARRAY_START);
3500 mme_emit(b, first_vertex);
3501 mme_emit(b, vertex_count);
3502
3503 mme_mthd(b, NV9097_END);
3504 mme_emit(b, mme_zero());
3505
3506 mme_set_field_enum(b, begin, NV9097_BEGIN_INSTANCE_ID, SUBSEQUENT);
3507 }
3508
3509 mme_free_reg(b, begin);
3510 }
3511
3512 static void
3513 nvk_mme_build_draw(struct mme_builder *b,
3514 struct mme_value draw_index)
3515 {
3516 /* These are in VkDrawIndirectCommand order */
3517 struct mme_value vertex_count = mme_load(b);
3518 struct mme_value instance_count = mme_load(b);
3519 struct mme_value first_vertex = mme_load(b);
3520 struct mme_value first_instance = mme_load(b);
3521
3522 struct mme_draw_params params = {
3523 .first_vertex = first_vertex,
3524 .first_instance = first_instance,
3525 .draw_index = draw_index,
3526 };
3527 nvk_mme_build_set_draw_params(b, &params);
3528
3529 mme_free_reg(b, first_instance);
3530
3531 if (b->devinfo->cls_eng3d < TURING_A)
3532 nvk_mme_spill(b, DRAW_IDX, draw_index);
3533
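   /* Multiview handling: if VIEW_MASK is zero, emit the draw loop once;
    * otherwise walk all 32 possible view bits and, for each bit set in the
    * mask, select that view's index/RT layer and replay the draw loop.
    */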
3534 struct mme_value view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
3535 mme_if(b, ieq, view_mask, mme_zero()) {
3536 mme_free_reg(b, view_mask);
3537
3538 nvk_mme_build_draw_loop(b, instance_count,
3539 first_vertex, vertex_count);
3540 }
3541
3542 view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
3543 mme_if(b, ine, view_mask, mme_zero()) {
3544 mme_free_reg(b, view_mask);
3545
3546 struct mme_value view = mme_mov(b, mme_zero());
3547 mme_while(b, ine, view, mme_imm(32)) {
3548 view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
3549 struct mme_value has_view = mme_bfe(b, view_mask, view, 1);
3550 mme_free_reg(b, view_mask);
3551 mme_if(b, ine, has_view, mme_zero()) {
3552 mme_free_reg(b, has_view);
3553 nvk_mme_emit_view_index(b, view);
3554 nvk_mme_build_draw_loop(b, instance_count,
3555 first_vertex, vertex_count);
3556 }
3557
3558 mme_add_to(b, view, view, mme_imm(1));
3559 }
3560 mme_free_reg(b, view);
3561 }
3562
3563 mme_free_reg(b, instance_count);
3564 mme_free_reg(b, first_vertex);
3565 mme_free_reg(b, vertex_count);
3566
3567 if (b->devinfo->cls_eng3d < TURING_A)
3568 nvk_mme_unspill(b, DRAW_IDX, draw_index);
3569 }
3570
3571 void
3572 nvk_mme_draw(struct mme_builder *b)
3573 {
3574 struct mme_value draw_index = mme_load(b);
3575 nvk_mme_build_draw(b, draw_index);
3576 }
3577
3578 VKAPI_ATTR void VKAPI_CALL
3579 nvk_CmdDraw(VkCommandBuffer commandBuffer,
3580 uint32_t vertexCount,
3581 uint32_t instanceCount,
3582 uint32_t firstVertex,
3583 uint32_t firstInstance)
3584 {
3585 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
3586
3587 nvk_flush_gfx_state(cmd);
3588
3589 struct nv_push *p = nvk_cmd_buffer_push(cmd, 6);
3590 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW));
3591 P_INLINE_DATA(p, 0 /* draw_index */);
3592 P_INLINE_DATA(p, vertexCount);
3593 P_INLINE_DATA(p, instanceCount);
3594 P_INLINE_DATA(p, firstVertex);
3595 P_INLINE_DATA(p, firstInstance);
3596 }
3597
3598 VKAPI_ATTR void VKAPI_CALL
3599 nvk_CmdDrawMultiEXT(VkCommandBuffer commandBuffer,
3600 uint32_t drawCount,
3601 const VkMultiDrawInfoEXT *pVertexInfo,
3602 uint32_t instanceCount,
3603 uint32_t firstInstance,
3604 uint32_t stride)
3605 {
3606 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
3607
3608 nvk_flush_gfx_state(cmd);
3609
3610 for (uint32_t draw_index = 0; draw_index < drawCount; draw_index++) {
3611 struct nv_push *p = nvk_cmd_buffer_push(cmd, 6);
3612 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW));
3613 P_INLINE_DATA(p, draw_index);
3614 P_INLINE_DATA(p, pVertexInfo->vertexCount);
3615 P_INLINE_DATA(p, instanceCount);
3616 P_INLINE_DATA(p, pVertexInfo->firstVertex);
3617 P_INLINE_DATA(p, firstInstance);
3618
3619 pVertexInfo = ((void *)pVertexInfo) + stride;
3620 }
3621 }
3622
3623 static void
3624 nvk_mme_build_draw_indexed_loop(struct mme_builder *b,
3625 struct mme_value instance_count,
3626 struct mme_value first_index,
3627 struct mme_value index_count)
3628 {
3629 struct mme_value begin = nvk_mme_load_scratch(b, DRAW_BEGIN);
3630
3631 mme_loop(b, instance_count) {
3632 mme_mthd(b, NV9097_BEGIN);
3633 mme_emit(b, begin);
3634
3635 mme_mthd(b, NV9097_SET_INDEX_BUFFER_F);
3636 mme_emit(b, first_index);
3637 mme_emit(b, index_count);
3638
3639 mme_mthd(b, NV9097_END);
3640 mme_emit(b, mme_zero());
3641
3642 mme_set_field_enum(b, begin, NV9097_BEGIN_INSTANCE_ID, SUBSEQUENT);
3643 }
3644
3645 mme_free_reg(b, begin);
3646 }
3647
3648 static void
3649 nvk_mme_build_draw_indexed(struct mme_builder *b,
3650 struct mme_value draw_index)
3651 {
3652 /* These are in VkDrawIndexedIndirectCommand order */
3653 struct mme_value index_count = mme_load(b);
3654 struct mme_value instance_count = mme_load(b);
3655 struct mme_value first_index = mme_load(b);
3656 struct mme_value vertex_offset = mme_load(b);
3657 struct mme_value first_instance = mme_load(b);
3658
3659 struct mme_draw_params params = {
3660 .base_vertex = vertex_offset,
3661 .first_vertex = vertex_offset,
3662 .first_instance = first_instance,
3663 .draw_index = draw_index,
3664 };
3665 nvk_mme_build_set_draw_params(b, &params);
3666
3667 mme_free_reg(b, vertex_offset);
3668 mme_free_reg(b, first_instance);
3669
3670 if (b->devinfo->cls_eng3d < TURING_A)
3671 nvk_mme_spill(b, DRAW_IDX, draw_index);
3672
3673 struct mme_value view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
3674 mme_if(b, ieq, view_mask, mme_zero()) {
3675 mme_free_reg(b, view_mask);
3676
3677 nvk_mme_build_draw_indexed_loop(b, instance_count,
3678 first_index, index_count);
3679 }
3680
3681 view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
3682 mme_if(b, ine, view_mask, mme_zero()) {
3683 mme_free_reg(b, view_mask);
3684
3685 struct mme_value view = mme_mov(b, mme_zero());
3686 mme_while(b, ine, view, mme_imm(32)) {
3687 view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
3688 struct mme_value has_view = mme_bfe(b, view_mask, view, 1);
3689 mme_free_reg(b, view_mask);
3690 mme_if(b, ine, has_view, mme_zero()) {
3691 mme_free_reg(b, has_view);
3692 nvk_mme_emit_view_index(b, view);
3693 nvk_mme_build_draw_indexed_loop(b, instance_count,
3694 first_index, index_count);
3695 }
3696
3697 mme_add_to(b, view, view, mme_imm(1));
3698 }
3699 mme_free_reg(b, view);
3700 }
3701
3702 mme_free_reg(b, instance_count);
3703 mme_free_reg(b, first_index);
3704 mme_free_reg(b, index_count);
3705
3706 if (b->devinfo->cls_eng3d < TURING_A)
3707 nvk_mme_unspill(b, DRAW_IDX, draw_index);
3708 }
3709
3710 void
3711 nvk_mme_draw_indexed(struct mme_builder *b)
3712 {
3713 struct mme_value draw_index = mme_load(b);
3714 nvk_mme_build_draw_indexed(b, draw_index);
3715 }
3716
3717 VKAPI_ATTR void VKAPI_CALL
3718 nvk_CmdDrawIndexed(VkCommandBuffer commandBuffer,
3719 uint32_t indexCount,
3720 uint32_t instanceCount,
3721 uint32_t firstIndex,
3722 int32_t vertexOffset,
3723 uint32_t firstInstance)
3724 {
3725 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
3726
3727 nvk_flush_gfx_state(cmd);
3728
3729 struct nv_push *p = nvk_cmd_buffer_push(cmd, 7);
3730 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDEXED));
3731 P_INLINE_DATA(p, 0 /* draw_index */);
3732 P_INLINE_DATA(p, indexCount);
3733 P_INLINE_DATA(p, instanceCount);
3734 P_INLINE_DATA(p, firstIndex);
3735 P_INLINE_DATA(p, vertexOffset);
3736 P_INLINE_DATA(p, firstInstance);
3737 }
3738
3739 VKAPI_ATTR void VKAPI_CALL
3740 nvk_CmdDrawMultiIndexedEXT(VkCommandBuffer commandBuffer,
3741 uint32_t drawCount,
3742 const VkMultiDrawIndexedInfoEXT *pIndexInfo,
3743 uint32_t instanceCount,
3744 uint32_t firstInstance,
3745 uint32_t stride,
3746 const int32_t *pVertexOffset)
3747 {
3748 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
3749
3750 nvk_flush_gfx_state(cmd);
3751
3752 for (uint32_t draw_index = 0; draw_index < drawCount; draw_index++) {
3753 const uint32_t vertex_offset =
3754 pVertexOffset != NULL ? *pVertexOffset : pIndexInfo->vertexOffset;
3755
3756 struct nv_push *p = nvk_cmd_buffer_push(cmd, 7);
3757 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDEXED));
3758 P_INLINE_DATA(p, draw_index);
3759 P_INLINE_DATA(p, pIndexInfo->indexCount);
3760 P_INLINE_DATA(p, instanceCount);
3761 P_INLINE_DATA(p, pIndexInfo->firstIndex);
3762 P_INLINE_DATA(p, vertex_offset);
3763 P_INLINE_DATA(p, firstInstance);
3764
3765 pIndexInfo = ((void *)pIndexInfo) + stride;
3766 }
3767 }
3768
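/* NVK_MME_DRAW_INDIRECT: on Turing+ the macro receives the GPU address of
 * the VkDrawIndirectCommand array and reads each record with
 * mme_tu104_read_fifoed.  Pre-Turing, nvk_CmdDrawIndirect streams the
 * records through the pushbuf instead, and the macro skips DRAW_PAD_DW
 * padding dwords between records.
 */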
3769 void
3770 nvk_mme_draw_indirect(struct mme_builder *b)
3771 {
3772 if (b->devinfo->cls_eng3d >= TURING_A) {
3773 struct mme_value64 draw_addr = mme_load_addr64(b);
3774 struct mme_value draw_count = mme_load(b);
3775 struct mme_value stride = mme_load(b);
3776
3777 struct mme_value draw = mme_mov(b, mme_zero());
3778 mme_while(b, ult, draw, draw_count) {
3779 mme_tu104_read_fifoed(b, draw_addr, mme_imm(4));
3780
3781 nvk_mme_build_draw(b, draw);
3782
3783 mme_add_to(b, draw, draw, mme_imm(1));
3784 mme_add64_to(b, draw_addr, draw_addr, mme_value64(stride, mme_zero()));
3785 }
3786 } else {
3787 struct mme_value draw_count = mme_load(b);
3788 nvk_mme_load_to_scratch(b, DRAW_PAD_DW);
3789
3790 struct mme_value draw = mme_mov(b, mme_zero());
3791 mme_while(b, ine, draw, draw_count) {
3792 nvk_mme_spill(b, DRAW_COUNT, draw_count);
3793
3794 nvk_mme_build_draw(b, draw);
3795 mme_add_to(b, draw, draw, mme_imm(1));
3796
3797 struct mme_value pad_dw = nvk_mme_load_scratch(b, DRAW_PAD_DW);
3798 mme_loop(b, pad_dw) {
3799 mme_free_reg(b, mme_load(b));
3800 }
3801 mme_free_reg(b, pad_dw);
3802
3803 nvk_mme_unspill(b, DRAW_COUNT, draw_count);
3804 }
3805 }
3806 }
3807
3808 VKAPI_ATTR void VKAPI_CALL
3809 nvk_CmdDrawIndirect(VkCommandBuffer commandBuffer,
3810 VkBuffer _buffer,
3811 VkDeviceSize offset,
3812 uint32_t drawCount,
3813 uint32_t stride)
3814 {
3815 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
3816 VK_FROM_HANDLE(nvk_buffer, buffer, _buffer);
3817
3818 /* From the Vulkan 1.3.238 spec:
3819 *
3820 * VUID-vkCmdDrawIndirect-drawCount-00476
3821 *
3822 * "If drawCount is greater than 1, stride must be a multiple of 4 and
3823 * must be greater than or equal to sizeof(VkDrawIndirectCommand)"
3824 *
3825 * and
3826 *
3827 * "If drawCount is less than or equal to one, stride is ignored."
3828 */
3829 if (drawCount > 1) {
3830 assert(stride % 4 == 0);
3831 assert(stride >= sizeof(VkDrawIndirectCommand));
3832 } else {
3833 stride = sizeof(VkDrawIndirectCommand);
3834 }
3835
3836 nvk_flush_gfx_state(cmd);
3837
3838 if (nvk_cmd_buffer_3d_cls(cmd) >= TURING_A) {
3839 struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
3840 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDIRECT));
3841 uint64_t draw_addr = nvk_buffer_address(buffer, offset);
3842 P_INLINE_DATA(p, draw_addr >> 32);
3843 P_INLINE_DATA(p, draw_addr);
3844 P_INLINE_DATA(p, drawCount);
3845 P_INLINE_DATA(p, stride);
3846 } else {
3847 const uint32_t max_draws_per_push =
3848 ((NV_PUSH_MAX_COUNT - 3) * 4) / stride;
3849
3850 uint64_t draw_addr = nvk_buffer_address(buffer, offset);
3851 while (drawCount) {
3852 const uint32_t count = MIN2(drawCount, max_draws_per_push);
3853
3854 struct nv_push *p = nvk_cmd_buffer_push(cmd, 3);
3855 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDIRECT));
3856 P_INLINE_DATA(p, count);
3857 P_INLINE_DATA(p, (stride - sizeof(VkDrawIndirectCommand)) / 4);
3858
3859 uint64_t range = count * (uint64_t)stride;
3860 nv_push_update_count(p, range / 4);
3861 nvk_cmd_buffer_push_indirect(cmd, draw_addr, range);
3862
3863 draw_addr += range;
3864 drawCount -= count;
3865 }
3866 }
3867 }
3868
3869 void
3870 nvk_mme_draw_indexed_indirect(struct mme_builder *b)
3871 {
3872 if (b->devinfo->cls_eng3d >= TURING_A) {
3873 struct mme_value64 draw_addr = mme_load_addr64(b);
3874 struct mme_value draw_count = mme_load(b);
3875 struct mme_value stride = mme_load(b);
3876
3877 struct mme_value draw = mme_mov(b, mme_zero());
3878 mme_while(b, ult, draw, draw_count) {
3879 mme_tu104_read_fifoed(b, draw_addr, mme_imm(5));
3880
3881 nvk_mme_build_draw_indexed(b, draw);
3882
3883 mme_add_to(b, draw, draw, mme_imm(1));
3884 mme_add64_to(b, draw_addr, draw_addr, mme_value64(stride, mme_zero()));
3885 }
3886 } else {
3887 struct mme_value draw_count = mme_load(b);
3888 nvk_mme_load_to_scratch(b, DRAW_PAD_DW);
3889
3890 struct mme_value draw = mme_mov(b, mme_zero());
3891 mme_while(b, ine, draw, draw_count) {
3892 nvk_mme_spill(b, DRAW_COUNT, draw_count);
3893
3894 nvk_mme_build_draw_indexed(b, draw);
3895 mme_add_to(b, draw, draw, mme_imm(1));
3896
3897 struct mme_value pad_dw = nvk_mme_load_scratch(b, DRAW_PAD_DW);
3898 mme_loop(b, pad_dw) {
3899 mme_free_reg(b, mme_load(b));
3900 }
3901 mme_free_reg(b, pad_dw);
3902
3903 nvk_mme_unspill(b, DRAW_COUNT, draw_count);
3904 }
3905 }
3906 }
3907
3908 VKAPI_ATTR void VKAPI_CALL
3909 nvk_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer,
3910 VkBuffer _buffer,
3911 VkDeviceSize offset,
3912 uint32_t drawCount,
3913 uint32_t stride)
3914 {
3915 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
3916 VK_FROM_HANDLE(nvk_buffer, buffer, _buffer);
3917
3918 /* From the Vulkan 1.3.238 spec:
3919 *
3920 * VUID-vkCmdDrawIndexedIndirect-drawCount-00528
3921 *
3922 * "If drawCount is greater than 1, stride must be a multiple of 4 and
3923 * must be greater than or equal to sizeof(VkDrawIndexedIndirectCommand)"
3924 *
3925 * and
3926 *
3927 * "If drawCount is less than or equal to one, stride is ignored."
3928 */
3929 if (drawCount > 1) {
3930 assert(stride % 4 == 0);
3931 assert(stride >= sizeof(VkDrawIndexedIndirectCommand));
3932 } else {
3933 stride = sizeof(VkDrawIndexedIndirectCommand);
3934 }
3935
3936 nvk_flush_gfx_state(cmd);
3937
3938 if (nvk_cmd_buffer_3d_cls(cmd) >= TURING_A) {
3939 struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
3940 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDEXED_INDIRECT));
3941 uint64_t draw_addr = nvk_buffer_address(buffer, offset);
3942 P_INLINE_DATA(p, draw_addr >> 32);
3943 P_INLINE_DATA(p, draw_addr);
3944 P_INLINE_DATA(p, drawCount);
3945 P_INLINE_DATA(p, stride);
3946 } else {
3947 const uint32_t max_draws_per_push =
3948 ((NV_PUSH_MAX_COUNT - 3) * 4) / stride;
3949
3950 uint64_t draw_addr = nvk_buffer_address(buffer, offset);
3951 while (drawCount) {
3952 const uint32_t count = MIN2(drawCount, max_draws_per_push);
3953
3954 struct nv_push *p = nvk_cmd_buffer_push(cmd, 3);
3955 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDEXED_INDIRECT));
3956 P_INLINE_DATA(p, count);
3957 P_INLINE_DATA(p, (stride - sizeof(VkDrawIndexedIndirectCommand)) / 4);
3958
3959 uint64_t range = count * (uint64_t)stride;
3960 nv_push_update_count(p, range / 4);
3961 nvk_cmd_buffer_push_indirect(cmd, draw_addr, range);
3962
3963 draw_addr += range;
3964 drawCount -= count;
3965 }
3966 }
3967 }
3968
3969 void
3970 nvk_mme_draw_indirect_count(struct mme_builder *b)
3971 {
3972 if (b->devinfo->cls_eng3d < TURING_A)
3973 return;
3974
3975 struct mme_value64 draw_addr = mme_load_addr64(b);
3976 struct mme_value64 draw_count_addr = mme_load_addr64(b);
3977 struct mme_value draw_max = mme_load(b);
3978 struct mme_value stride = mme_load(b);
3979
3980 mme_tu104_read_fifoed(b, draw_count_addr, mme_imm(1));
3981 mme_free_reg64(b, draw_count_addr);
3982 struct mme_value draw_count_buf = mme_load(b);
3983
3984 mme_if(b, ule, draw_count_buf, draw_max) {
3985 mme_mov_to(b, draw_max, draw_count_buf);
3986 }
3987 mme_free_reg(b, draw_count_buf);
3988
3989 struct mme_value draw = mme_mov(b, mme_zero());
3990 mme_while(b, ult, draw, draw_max) {
3991 mme_tu104_read_fifoed(b, draw_addr, mme_imm(4));
3992
3993 nvk_mme_build_draw(b, draw);
3994
3995 mme_add_to(b, draw, draw, mme_imm(1));
3996 mme_add64_to(b, draw_addr, draw_addr, mme_value64(stride, mme_zero()));
3997 }
3998 }
3999
4000 VKAPI_ATTR void VKAPI_CALL
4001 nvk_CmdDrawIndirectCount(VkCommandBuffer commandBuffer,
4002 VkBuffer _buffer,
4003 VkDeviceSize offset,
4004 VkBuffer countBuffer,
4005 VkDeviceSize countBufferOffset,
4006 uint32_t maxDrawCount,
4007 uint32_t stride)
4008 {
4009 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
4010 VK_FROM_HANDLE(nvk_buffer, buffer, _buffer);
4011 VK_FROM_HANDLE(nvk_buffer, count_buffer, countBuffer);
4012
4013 /* TODO: Indirect count draw pre-Turing */
4014 assert(nvk_cmd_buffer_3d_cls(cmd) >= TURING_A);
4015
4016 nvk_flush_gfx_state(cmd);
4017
4018 struct nv_push *p = nvk_cmd_buffer_push(cmd, 7);
4019 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDIRECT_COUNT));
4020 uint64_t draw_addr = nvk_buffer_address(buffer, offset);
4021 P_INLINE_DATA(p, draw_addr >> 32);
4022 P_INLINE_DATA(p, draw_addr);
4023 uint64_t draw_count_addr = nvk_buffer_address(count_buffer,
4024 countBufferOffset);
4025 P_INLINE_DATA(p, draw_count_addr >> 32);
4026 P_INLINE_DATA(p, draw_count_addr);
4027 P_INLINE_DATA(p, maxDrawCount);
4028 P_INLINE_DATA(p, stride);
4029 }
4030
4031 void
4032 nvk_mme_draw_indexed_indirect_count(struct mme_builder *b)
4033 {
4034 if (b->devinfo->cls_eng3d < TURING_A)
4035 return;
4036
4037 struct mme_value64 draw_addr = mme_load_addr64(b);
4038 struct mme_value64 draw_count_addr = mme_load_addr64(b);
4039 struct mme_value draw_max = mme_load(b);
4040 struct mme_value stride = mme_load(b);
4041
4042 mme_tu104_read_fifoed(b, draw_count_addr, mme_imm(1));
4043 mme_free_reg64(b, draw_count_addr);
4044 struct mme_value draw_count_buf = mme_load(b);
4045
4046 mme_if(b, ule, draw_count_buf, draw_max) {
4047 mme_mov_to(b, draw_max, draw_count_buf);
4048 }
4049 mme_free_reg(b, draw_count_buf);
4050
4051 struct mme_value draw = mme_mov(b, mme_zero());
4052 mme_while(b, ult, draw, draw_max) {
4053 mme_tu104_read_fifoed(b, draw_addr, mme_imm(5));
4054
4055 nvk_mme_build_draw_indexed(b, draw);
4056
4057 mme_add_to(b, draw, draw, mme_imm(1));
4058 mme_add64_to(b, draw_addr, draw_addr, mme_value64(stride, mme_zero()));
4059 }
4060 }
4061
4062 VKAPI_ATTR void VKAPI_CALL
4063 nvk_CmdDrawIndexedIndirectCount(VkCommandBuffer commandBuffer,
4064 VkBuffer _buffer,
4065 VkDeviceSize offset,
4066 VkBuffer countBuffer,
4067 VkDeviceSize countBufferOffset,
4068 uint32_t maxDrawCount,
4069 uint32_t stride)
4070 {
4071 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
4072 VK_FROM_HANDLE(nvk_buffer, buffer, _buffer);
4073 VK_FROM_HANDLE(nvk_buffer, count_buffer, countBuffer);
4074
4075 /* TODO: Indexed indirect count draw pre-Turing */
4076 assert(nvk_cmd_buffer_3d_cls(cmd) >= TURING_A);
4077
4078 nvk_flush_gfx_state(cmd);
4079
4080 struct nv_push *p = nvk_cmd_buffer_push(cmd, 7);
4081 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDEXED_INDIRECT_COUNT));
4082 uint64_t draw_addr = nvk_buffer_address(buffer, offset);
4083 P_INLINE_DATA(p, draw_addr >> 32);
4084 P_INLINE_DATA(p, draw_addr);
4085 uint64_t draw_count_addr = nvk_buffer_address(count_buffer,
4086 countBufferOffset);
4087 P_INLINE_DATA(p, draw_count_addr >> 32);
4088 P_INLINE_DATA(p, draw_count_addr);
4089 P_INLINE_DATA(p, maxDrawCount);
4090 P_INLINE_DATA(p, stride);
4091 }
4092
4093 static void
4094 nvk_mme_xfb_draw_indirect_loop(struct mme_builder *b,
4095 struct mme_value instance_count,
4096 struct mme_value counter)
4097 {
4098 struct mme_value begin = nvk_mme_load_scratch(b, DRAW_BEGIN);
4099
4100 mme_loop(b, instance_count) {
4101 mme_mthd(b, NV9097_BEGIN);
4102 mme_emit(b, begin);
4103
4104 mme_mthd(b, NV9097_DRAW_AUTO);
4105 mme_emit(b, counter);
4106
4107 mme_mthd(b, NV9097_END);
4108 mme_emit(b, mme_zero());
4109
4110 mme_set_field_enum(b, begin, NV9097_BEGIN_INSTANCE_ID, SUBSEQUENT);
4111 }
4112
4113 mme_free_reg(b, begin);
4114 }
4115
4116 void
4117 nvk_mme_xfb_draw_indirect(struct mme_builder *b)
4118 {
4119 struct mme_value instance_count = mme_load(b);
4120 struct mme_value first_instance = mme_load(b);
4121
4122 if (b->devinfo->cls_eng3d >= TURING_A) {
4123 struct mme_value64 counter_addr = mme_load_addr64(b);
4124 mme_tu104_read_fifoed(b, counter_addr, mme_imm(1));
4125 mme_free_reg(b, counter_addr.lo);
4126 mme_free_reg(b, counter_addr.hi);
4127 }
4128 struct mme_value counter = mme_load(b);
4129
4130 struct mme_draw_params params = {
4131 .first_instance = first_instance,
4132 };
4133 nvk_mme_build_set_draw_params(b, &params);
4134
4135 mme_free_reg(b, first_instance);
4136
4137 struct mme_value view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
4138 mme_if(b, ieq, view_mask, mme_zero()) {
4139 mme_free_reg(b, view_mask);
4140
4141 nvk_mme_xfb_draw_indirect_loop(b, instance_count, counter);
4142 }
4143
4144 view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
4145 mme_if(b, ine, view_mask, mme_zero()) {
4146 mme_free_reg(b, view_mask);
4147
4148 struct mme_value view = mme_mov(b, mme_zero());
4149 mme_while(b, ine, view, mme_imm(32)) {
4150 view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
4151 struct mme_value has_view = mme_bfe(b, view_mask, view, 1);
4152 mme_free_reg(b, view_mask);
4153 mme_if(b, ine, has_view, mme_zero()) {
4154 mme_free_reg(b, has_view);
4155 nvk_mme_emit_view_index(b, view);
4156 nvk_mme_xfb_draw_indirect_loop(b, instance_count, counter);
4157 }
4158
4159 mme_add_to(b, view, view, mme_imm(1));
4160 }
4161 }
4162
4163 mme_free_reg(b, instance_count);
4164 mme_free_reg(b, counter);
4165 }
4166
4167 VKAPI_ATTR void VKAPI_CALL
4168 nvk_CmdDrawIndirectByteCountEXT(VkCommandBuffer commandBuffer,
4169 uint32_t instanceCount,
4170 uint32_t firstInstance,
4171 VkBuffer counterBuffer,
4172 VkDeviceSize counterBufferOffset,
4173 uint32_t counterOffset,
4174 uint32_t vertexStride)
4175 {
4176 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
4177 VK_FROM_HANDLE(nvk_buffer, counter_buffer, counterBuffer);
4178
4179 nvk_flush_gfx_state(cmd);
4180
4181 uint64_t counter_addr = nvk_buffer_address(counter_buffer,
4182 counterBufferOffset);
4183
4184 if (nvk_cmd_buffer_3d_cls(cmd) >= TURING_A) {
4185 struct nv_push *p = nvk_cmd_buffer_push(cmd, 9);
4186 P_IMMD(p, NV9097, SET_DRAW_AUTO_START, counterOffset);
4187 P_IMMD(p, NV9097, SET_DRAW_AUTO_STRIDE, vertexStride);
4188
4189 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_XFB_DRAW_INDIRECT));
4190 P_INLINE_DATA(p, instanceCount);
4191 P_INLINE_DATA(p, firstInstance);
4192 P_INLINE_DATA(p, counter_addr >> 32);
4193 P_INLINE_DATA(p, counter_addr);
4194 } else {
4195 struct nv_push *p = nvk_cmd_buffer_push(cmd, 8);
4196 P_IMMD(p, NV9097, SET_DRAW_AUTO_START, counterOffset);
4197 P_IMMD(p, NV9097, SET_DRAW_AUTO_STRIDE, vertexStride);
4198
4199 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_XFB_DRAW_INDIRECT));
4200 P_INLINE_DATA(p, instanceCount);
4201 P_INLINE_DATA(p, firstInstance);
4202 nv_push_update_count(p, 1);
4203 nvk_cmd_buffer_push_indirect(cmd, counter_addr, 4);
4204 }
4205 }
4206
4207 VKAPI_ATTR void VKAPI_CALL
4208 nvk_CmdBindTransformFeedbackBuffersEXT(VkCommandBuffer commandBuffer,
4209 uint32_t firstBinding,
4210 uint32_t bindingCount,
4211 const VkBuffer *pBuffers,
4212 const VkDeviceSize *pOffsets,
4213 const VkDeviceSize *pSizes)
4214 {
4215 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
4216
4217 for (uint32_t i = 0; i < bindingCount; i++) {
4218 VK_FROM_HANDLE(nvk_buffer, buffer, pBuffers[i]);
4219 uint32_t idx = firstBinding + i;
4220 uint64_t size = pSizes ? pSizes[i] : VK_WHOLE_SIZE;
4221 struct nvk_addr_range addr_range =
4222 nvk_buffer_addr_range(buffer, pOffsets[i], size);
4223 assert(addr_range.range <= UINT32_MAX);
4224
4225 struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
4226
4227 P_MTHD(p, NV9097, SET_STREAM_OUT_BUFFER_ENABLE(idx));
4228 P_NV9097_SET_STREAM_OUT_BUFFER_ENABLE(p, idx, V_TRUE);
4229 P_NV9097_SET_STREAM_OUT_BUFFER_ADDRESS_A(p, idx, addr_range.addr >> 32);
4230 P_NV9097_SET_STREAM_OUT_BUFFER_ADDRESS_B(p, idx, addr_range.addr);
4231 P_NV9097_SET_STREAM_OUT_BUFFER_SIZE(p, idx, (uint32_t)addr_range.range);
4232 }
4233
4234 // TODO: do we need to SET_STREAM_OUT_BUFFER_ENABLE V_FALSE ?
4235 }
4236
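/* Loads an XFB counter into SET_STREAM_OUT_BUFFER_LOAD_WRITE_POINTER.  The
 * first parameter is the dword offset of that method for the target buffer
 * (buffer index * 8); the counter value is then either read from memory
 * (Turing+, via mme_tu104_read_fifoed) or pushed inline by the CPU.
 */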
4237 void
4238 nvk_mme_xfb_counter_load(struct mme_builder *b)
4239 {
4240 struct mme_value buffer = mme_load(b);
4241
4242 struct mme_value counter;
4243 if (b->devinfo->cls_eng3d >= TURING_A) {
4244 struct mme_value64 counter_addr = mme_load_addr64(b);
4245
4246 mme_tu104_read_fifoed(b, counter_addr, mme_imm(1));
4247 mme_free_reg(b, counter_addr.lo);
4248 mme_free_reg(b, counter_addr.hi);
4249
4250 counter = mme_load(b);
4251 } else {
4252 counter = mme_load(b);
4253 }
4254
4255 mme_mthd_arr(b, NV9097_SET_STREAM_OUT_BUFFER_LOAD_WRITE_POINTER(0), buffer);
4256 mme_emit(b, counter);
4257
4258 mme_free_reg(b, counter);
4259 mme_free_reg(b, buffer);
4260 }
4261
4262 VKAPI_ATTR void VKAPI_CALL
4263 nvk_CmdBeginTransformFeedbackEXT(VkCommandBuffer commandBuffer,
4264 uint32_t firstCounterBuffer,
4265 uint32_t counterBufferCount,
4266 const VkBuffer *pCounterBuffers,
4267 const VkDeviceSize *pCounterBufferOffsets)
4268 {
4269 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
4270 const uint32_t max_buffers = 4;
4271
4272 struct nv_push *p = nvk_cmd_buffer_push(cmd, 2 + 2 * max_buffers);
4273
4274 P_IMMD(p, NV9097, SET_STREAM_OUTPUT, ENABLE_TRUE);
4275 for (uint32_t i = 0; i < max_buffers; ++i) {
4276 P_IMMD(p, NV9097, SET_STREAM_OUT_BUFFER_LOAD_WRITE_POINTER(i), 0);
4277 }
4278
4279 for (uint32_t i = 0; i < counterBufferCount; ++i) {
4280 if (pCounterBuffers[i] == VK_NULL_HANDLE)
4281 continue;
4282
4283 VK_FROM_HANDLE(nvk_buffer, buffer, pCounterBuffers[i]);
4284 // index of counter buffer corresponds to index of transform buffer
4285 uint32_t cb_idx = firstCounterBuffer + i;
4286 uint64_t offset = pCounterBufferOffsets ? pCounterBufferOffsets[i] : 0;
4287 uint64_t cb_addr = nvk_buffer_address(buffer, offset);
4288
4289 if (nvk_cmd_buffer_3d_cls(cmd) >= TURING_A) {
4290 struct nv_push *p = nvk_cmd_buffer_push(cmd, 4);
4291 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_XFB_COUNTER_LOAD));
4292 /* The STREAM_OUT_BUFFER_LOAD_WRITE_POINTER registers have an 8 dword stride */
4293 P_INLINE_DATA(p, cb_idx * 8);
4294 P_INLINE_DATA(p, cb_addr >> 32);
4295 P_INLINE_DATA(p, cb_addr);
4296 } else {
4297 struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
4298 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_XFB_COUNTER_LOAD));
4299 P_INLINE_DATA(p, cb_idx);
4300 nv_push_update_count(p, 1);
4301 nvk_cmd_buffer_push_indirect(cmd, cb_addr, 4);
4302 }
4303 }
4304 }
4305
4306 VKAPI_ATTR void VKAPI_CALL
4307 nvk_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer,
4308 uint32_t firstCounterBuffer,
4309 uint32_t counterBufferCount,
4310 const VkBuffer *pCounterBuffers,
4311 const VkDeviceSize *pCounterBufferOffsets)
4312 {
4313 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
4314
4315 struct nv_push *p = nvk_cmd_buffer_push(cmd, 5 * counterBufferCount + 2);
4316
4317 P_IMMD(p, NV9097, SET_STREAM_OUTPUT, ENABLE_FALSE);
4318
4319 for (uint32_t i = 0; i < counterBufferCount; ++i) {
4320 if (pCounterBuffers[i] == VK_NULL_HANDLE)
4321 continue;
4322
4323 VK_FROM_HANDLE(nvk_buffer, buffer, pCounterBuffers[i]);
4324 // index of counter buffer corresponds to index of transform buffer
4325 uint32_t cb_idx = firstCounterBuffer + i;
4326 uint64_t offset = pCounterBufferOffsets ? pCounterBufferOffsets[i] : 0;
4327 uint64_t cb_addr = nvk_buffer_address(buffer, offset);
4328
4329 P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A);
4330 P_NV9097_SET_REPORT_SEMAPHORE_A(p, cb_addr >> 32);
4331 P_NV9097_SET_REPORT_SEMAPHORE_B(p, cb_addr);
4332 P_NV9097_SET_REPORT_SEMAPHORE_C(p, 0);
4333 P_NV9097_SET_REPORT_SEMAPHORE_D(p, {
4334 .operation = OPERATION_REPORT_ONLY,
4335 .pipeline_location = PIPELINE_LOCATION_STREAMING_OUTPUT,
4336 .report = REPORT_STREAMING_BYTE_COUNT,
4337 .sub_report = cb_idx,
4338 .structure_size = STRUCTURE_SIZE_ONE_WORD,
4339 });
4340 }
4341 }
4342
4343 VKAPI_ATTR void VKAPI_CALL
4344 nvk_CmdBeginConditionalRenderingEXT(VkCommandBuffer commandBuffer,
4345 const VkConditionalRenderingBeginInfoEXT *pConditionalRenderingBegin)
4346 {
4347 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
4348 VK_FROM_HANDLE(nvk_buffer, buffer, pConditionalRenderingBegin->buffer);
4349
4350 uint64_t addr = nvk_buffer_address(buffer, pConditionalRenderingBegin->offset);
4351 bool inverted = pConditionalRenderingBegin->flags &
4352 VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT;
4353
4354 /* From the Vulkan 1.3.280 spec:
4355 *
4356 * "If the 32-bit value at offset in buffer memory is zero,
4357 * then the rendering commands are discarded,
4358 * otherwise they are executed as normal."
4359 *
4360 * The hardware compares a 64-bit value, so we are required to copy it.
4361 */
4362 uint64_t tmp_addr;
4363 VkResult result = nvk_cmd_buffer_cond_render_alloc(cmd, &tmp_addr);
4364 if (result != VK_SUCCESS) {
4365 vk_command_buffer_set_error(&cmd->vk, result);
4366 return;
4367 }
4368
4369 struct nv_push *p = nvk_cmd_buffer_push(cmd, 26);
4370
4371 P_MTHD(p, NV90B5, OFFSET_IN_UPPER);
4372 P_NV90B5_OFFSET_IN_UPPER(p, addr >> 32);
4373 P_NV90B5_OFFSET_IN_LOWER(p, addr & 0xffffffff);
4374 P_NV90B5_OFFSET_OUT_UPPER(p, tmp_addr >> 32);
4375 P_NV90B5_OFFSET_OUT_LOWER(p, tmp_addr & 0xffffffff);
4376 P_NV90B5_PITCH_IN(p, 4);
4377 P_NV90B5_PITCH_OUT(p, 4);
4378 P_NV90B5_LINE_LENGTH_IN(p, 4);
4379 P_NV90B5_LINE_COUNT(p, 1);
4380
4381 P_IMMD(p, NV90B5, SET_REMAP_COMPONENTS, {
4382 .dst_x = DST_X_SRC_X,
4383 .dst_y = DST_Y_SRC_X,
4384 .dst_z = DST_Z_NO_WRITE,
4385 .dst_w = DST_W_NO_WRITE,
4386 .component_size = COMPONENT_SIZE_ONE,
4387 .num_src_components = NUM_SRC_COMPONENTS_ONE,
4388 .num_dst_components = NUM_DST_COMPONENTS_TWO,
4389 });
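   /* With dst_x = dst_y = SRC_X and two destination components, the copy
    * writes the 32-bit condition value into both halves of the 8-byte
    * destination, so the 64-bit word at tmp_addr is zero exactly when the
    * 32-bit value in the buffer is zero.
    */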
4390
4391 P_IMMD(p, NV90B5, LAUNCH_DMA, {
4392 .data_transfer_type = DATA_TRANSFER_TYPE_PIPELINED,
4393 .multi_line_enable = MULTI_LINE_ENABLE_TRUE,
4394 .flush_enable = FLUSH_ENABLE_TRUE,
4395 .src_memory_layout = SRC_MEMORY_LAYOUT_PITCH,
4396 .dst_memory_layout = DST_MEMORY_LAYOUT_PITCH,
4397 .remap_enable = REMAP_ENABLE_TRUE,
4398 });
4399
4400 P_MTHD(p, NV9097, SET_RENDER_ENABLE_A);
4401 P_NV9097_SET_RENDER_ENABLE_A(p, tmp_addr >> 32);
4402 P_NV9097_SET_RENDER_ENABLE_B(p, tmp_addr & 0xfffffff0);
4403 P_NV9097_SET_RENDER_ENABLE_C(p, inverted ? MODE_RENDER_IF_EQUAL : MODE_RENDER_IF_NOT_EQUAL);
4404
4405 P_MTHD(p, NV90C0, SET_RENDER_ENABLE_A);
4406 P_NV90C0_SET_RENDER_ENABLE_A(p, tmp_addr >> 32);
4407 P_NV90C0_SET_RENDER_ENABLE_B(p, tmp_addr & 0xfffffff0);
4408 P_NV90C0_SET_RENDER_ENABLE_C(p, inverted ? MODE_RENDER_IF_EQUAL : MODE_RENDER_IF_NOT_EQUAL);
4409 }
4410
4411 VKAPI_ATTR void VKAPI_CALL
4412 nvk_CmdEndConditionalRenderingEXT(VkCommandBuffer commandBuffer)
4413 {
4414 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
4415
4416 struct nv_push *p = nvk_cmd_buffer_push(cmd, 12);
4417 P_MTHD(p, NV9097, SET_RENDER_ENABLE_A);
4418 P_NV9097_SET_RENDER_ENABLE_A(p, 0);
4419 P_NV9097_SET_RENDER_ENABLE_B(p, 0);
4420 P_NV9097_SET_RENDER_ENABLE_C(p, MODE_TRUE);
4421
4422 P_MTHD(p, NV90C0, SET_RENDER_ENABLE_A);
4423 P_NV90C0_SET_RENDER_ENABLE_A(p, 0);
4424 P_NV90C0_SET_RENDER_ENABLE_B(p, 0);
4425 P_NV90C0_SET_RENDER_ENABLE_C(p, MODE_TRUE);
4426 }
4427