/*
 * Copyright © 2023 Imagination Technologies Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <vulkan/vulkan_core.h>

#include "c11/threads.h"
#include "hwdef/rogue_hw_utils.h"
#include "pvr_bo.h"
#include "pvr_csb.h"
#include "pvr_csb_enum_helpers.h"
#include "pvr_device_info.h"
#include "pvr_formats.h"
#include "pvr_hw_pass.h"
#include "pvr_job_common.h"
#include "pvr_pds.h"
#include "pvr_private.h"
#include "pvr_shader_factory.h"
#include "pvr_spm.h"
#include "pvr_static_shaders.h"
#include "pvr_tex_state.h"
#include "pvr_types.h"
#include "pvr_uscgen.h"
#include "util/bitscan.h"
#include "util/macros.h"
#include "util/simple_mtx.h"
#include "util/u_atomic.h"
#include "vk_alloc.h"
#include "vk_log.h"

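/* A device-wide scratch buffer into which tile data is stored during a
 * partial render (PR). It is ref-counted and shared between framebuffers;
 * the store keeps its own reference (head_ref) alive so the buffer can be
 * reused by subsequently created framebuffers.
 */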
struct pvr_spm_scratch_buffer {
   uint32_t ref_count;
   struct pvr_bo *bo;
   uint64_t size;
};

void pvr_spm_init_scratch_buffer_store(struct pvr_device *device)
{
   struct pvr_spm_scratch_buffer_store *store =
      &device->spm_scratch_buffer_store;

   simple_mtx_init(&store->mtx, mtx_plain);
   store->head_ref = NULL;
}

void pvr_spm_finish_scratch_buffer_store(struct pvr_device *device)
{
   struct pvr_spm_scratch_buffer_store *store =
      &device->spm_scratch_buffer_store;

   /* Either a framebuffer was never created so no scratch buffer was ever
    * created or all framebuffers have been freed so only the store's reference
    * remains.
    */
   assert(!store->head_ref || p_atomic_read(&store->head_ref->ref_count) == 1);

   simple_mtx_destroy(&store->mtx);

   if (store->head_ref) {
      pvr_bo_free(device, store->head_ref->bo);
      vk_free(&device->vk.alloc, store->head_ref);
   }
}

uint64_t
pvr_spm_scratch_buffer_calc_required_size(const struct pvr_render_pass *pass,
                                          uint32_t framebuffer_width,
                                          uint32_t framebuffer_height)
{
   uint64_t dwords_per_pixel;
   uint64_t buffer_size;

   /* If we're allocating an SPM scratch buffer we'll have a minimum of 1 output
    * reg and/or tile_buffer.
    */
   uint32_t nr_tile_buffers = 1;
   uint32_t nr_output_regs = 1;

   for (uint32_t i = 0; i < pass->hw_setup->render_count; i++) {
      const struct pvr_renderpass_hwsetup_render *hw_render =
         &pass->hw_setup->renders[i];

      nr_tile_buffers = MAX2(nr_tile_buffers, hw_render->tile_buffers_count);
      nr_output_regs = MAX2(nr_output_regs, hw_render->output_regs_count);
   }

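   /* Conservative upper bound: every sample of every pixel may need
    * nr_output_regs dwords for each of the nr_tile_buffers tile buffers.
    */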
   dwords_per_pixel =
      (uint64_t)pass->max_sample_count * nr_output_regs * nr_tile_buffers;

   buffer_size = ALIGN_POT((uint64_t)framebuffer_width,
                           PVRX(CR_PBE_WORD0_MRT0_LINESTRIDE_ALIGNMENT));
   buffer_size *=
      (uint64_t)framebuffer_height * PVR_DW_TO_BYTES(dwords_per_pixel);

   return buffer_size;
}

static VkResult
pvr_spm_scratch_buffer_alloc(struct pvr_device *device,
                             uint64_t size,
                             struct pvr_spm_scratch_buffer **const buffer_out)
{
   const uint32_t cache_line_size =
      rogue_get_slc_cache_line_size(&device->pdevice->dev_info);
   struct pvr_spm_scratch_buffer *scratch_buffer;
   struct pvr_bo *bo;
   VkResult result;

   result = pvr_bo_alloc(device,
                         device->heaps.general_heap,
                         size,
                         cache_line_size,
                         0,
                         &bo);
   if (result != VK_SUCCESS) {
      *buffer_out = NULL;
      return result;
   }

   scratch_buffer = vk_alloc(&device->vk.alloc,
                             sizeof(*scratch_buffer),
                             4,
                             VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (!scratch_buffer) {
      pvr_bo_free(device, bo);
      *buffer_out = NULL;
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
   }

   *scratch_buffer = (struct pvr_spm_scratch_buffer){
      .bo = bo,
      .size = size,
   };

   *buffer_out = scratch_buffer;

   return VK_SUCCESS;
}

static void
pvr_spm_scratch_buffer_release_locked(struct pvr_device *device,
                                      struct pvr_spm_scratch_buffer *buffer)
{
   struct pvr_spm_scratch_buffer_store *store =
      &device->spm_scratch_buffer_store;

   simple_mtx_assert_locked(&store->mtx);

   if (p_atomic_dec_zero(&buffer->ref_count)) {
      pvr_bo_free(device, buffer->bo);
      vk_free(&device->vk.alloc, buffer);
   }
}

void pvr_spm_scratch_buffer_release(struct pvr_device *device,
                                    struct pvr_spm_scratch_buffer *buffer)
{
   struct pvr_spm_scratch_buffer_store *store =
      &device->spm_scratch_buffer_store;

   simple_mtx_lock(&store->mtx);

   pvr_spm_scratch_buffer_release_locked(device, buffer);

   simple_mtx_unlock(&store->mtx);
}

static void pvr_spm_scratch_buffer_store_set_head_ref_locked(
   struct pvr_spm_scratch_buffer_store *store,
   struct pvr_spm_scratch_buffer *buffer)
{
   simple_mtx_assert_locked(&store->mtx);
   assert(!store->head_ref);

   p_atomic_inc(&buffer->ref_count);
   store->head_ref = buffer;
}

static void pvr_spm_scratch_buffer_store_release_head_ref_locked(
   struct pvr_device *device,
   struct pvr_spm_scratch_buffer_store *store)
{
   simple_mtx_assert_locked(&store->mtx);

   pvr_spm_scratch_buffer_release_locked(device, store->head_ref);

   store->head_ref = NULL;
}

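/**
 * \brief Returns a scratch buffer of at least \p size bytes.
 *
 * The returned buffer has its reference count incremented, so the caller must
 * release it with pvr_spm_scratch_buffer_release(). The store's cached buffer
 * is reused when it is large enough; otherwise it is replaced with a new,
 * larger allocation.
 */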
VkResult pvr_spm_scratch_buffer_get_buffer(
   struct pvr_device *device,
   uint64_t size,
   struct pvr_spm_scratch_buffer **const buffer_out)
{
   struct pvr_spm_scratch_buffer_store *store =
      &device->spm_scratch_buffer_store;
   struct pvr_spm_scratch_buffer *buffer;

   simple_mtx_lock(&store->mtx);

   /* When a render requires a PR the fw will wait for other renders to end,
    * free the PB space, unschedule any other vert/frag jobs and solely run the
    * PR on the whole device until completion.
    * Thus we can safely use the same scratch buffer across multiple
    * framebuffers as the scratch buffer is only used during PRs and only one PR
    * can ever be executed at any one time.
    */
   if (store->head_ref && store->head_ref->size >= size) {
      buffer = store->head_ref;
   } else {
      VkResult result;

      if (store->head_ref)
         pvr_spm_scratch_buffer_store_release_head_ref_locked(device, store);

      result = pvr_spm_scratch_buffer_alloc(device, size, &buffer);
      if (result != VK_SUCCESS) {
         simple_mtx_unlock(&store->mtx);
         *buffer_out = NULL;

         return result;
      }

      pvr_spm_scratch_buffer_store_set_head_ref_locked(store, buffer);
   }

   p_atomic_inc(&buffer->ref_count);
   simple_mtx_unlock(&store->mtx);
   *buffer_out = buffer;

   return VK_SUCCESS;
}

VkResult pvr_device_init_spm_load_state(struct pvr_device *device)
{
   const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
   uint32_t pds_texture_aligned_offsets[PVR_SPM_LOAD_PROGRAM_COUNT];
   uint32_t pds_kick_aligned_offsets[PVR_SPM_LOAD_PROGRAM_COUNT];
   uint32_t usc_aligned_offsets[PVR_SPM_LOAD_PROGRAM_COUNT];
   uint32_t pds_allocation_size = 0;
   uint32_t usc_allocation_size = 0;
   struct pvr_suballoc_bo *pds_bo;
   struct pvr_suballoc_bo *usc_bo;
   uint8_t *mem_ptr;
   VkResult result;

   static_assert(PVR_SPM_LOAD_PROGRAM_COUNT == ARRAY_SIZE(spm_load_collection),
                 "Size mismatch");

   /* TODO: We don't need to upload all the programs since the set contains
    * programs for devices with 8 output regs as well. We can save some memory
    * by not uploading them on devices without the feature.
    * It's likely that once the compiler is hooked up we'll be using the shader
    * cache and generate the shaders as needed so this todo will be unnecessary.
    */

   /* Upload USC shaders. */

   for (uint32_t i = 0; i < ARRAY_SIZE(spm_load_collection); i++) {
      usc_aligned_offsets[i] = usc_allocation_size;
      usc_allocation_size += ALIGN_POT(spm_load_collection[i].size, 4);
   }

   result = pvr_bo_suballoc(&device->suballoc_usc,
                            usc_allocation_size,
                            4,
                            false,
                            &usc_bo);
   if (result != VK_SUCCESS)
      return result;

   mem_ptr = (uint8_t *)pvr_bo_suballoc_get_map_addr(usc_bo);

   for (uint32_t i = 0; i < ARRAY_SIZE(spm_load_collection); i++) {
      memcpy(mem_ptr + usc_aligned_offsets[i],
             spm_load_collection[i].code,
             spm_load_collection[i].size);
   }

   /* Upload PDS programs. */

   for (uint32_t i = 0; i < ARRAY_SIZE(spm_load_collection); i++) {
      struct pvr_pds_pixel_shader_sa_program pds_texture_program = {
         /* DMA for clear colors and tile buffer address parts. */
         .num_texture_dma_kicks = 1,
      };
      struct pvr_pds_kickusc_program pds_kick_program = { 0 };

      /* TODO: This looks a bit odd and isn't consistent with other code where
       * we're getting the size of the PDS program. Can we improve this?
       */
      pvr_pds_set_sizes_pixel_shader_uniform_texture_code(&pds_texture_program);
      pvr_pds_set_sizes_pixel_shader_sa_texture_data(&pds_texture_program,
                                                     dev_info);

      /* TODO: Looking at the pvr_pds_generate_...() functions and the run-time
       * behavior the data size is always the same here. Should we try saving
       * some memory by adjusting things based on that?
       */
      device->spm_load_state.load_program[i].pds_texture_program_data_size =
         pds_texture_program.data_size;

      pds_texture_aligned_offsets[i] = pds_allocation_size;
      /* FIXME: Figure out the define for alignment of 16. */
      pds_allocation_size +=
         ALIGN_POT(PVR_DW_TO_BYTES(pds_texture_program.code_size), 16);

      pvr_pds_set_sizes_pixel_shader(&pds_kick_program);

      pds_kick_aligned_offsets[i] = pds_allocation_size;
      /* FIXME: Figure out the define for alignment of 16. */
      pds_allocation_size +=
         ALIGN_POT(PVR_DW_TO_BYTES(pds_kick_program.code_size +
                                   pds_kick_program.data_size),
                   16);
   }

   /* FIXME: Figure out the define for alignment of 16. */
   result = pvr_bo_suballoc(&device->suballoc_pds,
                            pds_allocation_size,
                            16,
                            false,
                            &pds_bo);
   if (result != VK_SUCCESS) {
      pvr_bo_suballoc_free(usc_bo);
      return result;
   }

   mem_ptr = (uint8_t *)pvr_bo_suballoc_get_map_addr(pds_bo);

   for (uint32_t i = 0; i < ARRAY_SIZE(spm_load_collection); i++) {
      struct pvr_pds_pixel_shader_sa_program pds_texture_program = {
         /* DMA for clear colors and tile buffer address parts. */
         .num_texture_dma_kicks = 1,
      };
      const pvr_dev_addr_t usc_program_dev_addr =
         PVR_DEV_ADDR_OFFSET(usc_bo->dev_addr, usc_aligned_offsets[i]);
      struct pvr_pds_kickusc_program pds_kick_program = { 0 };
      enum PVRX(PDSINST_DOUTU_SAMPLE_RATE) sample_rate;

      pvr_pds_generate_pixel_shader_sa_code_segment(
         &pds_texture_program,
         (uint32_t *)(mem_ptr + pds_texture_aligned_offsets[i]));

      if (spm_load_collection[i].info->msaa_sample_count > 1)
         sample_rate = PVRX(PDSINST_DOUTU_SAMPLE_RATE_FULL);
      else
         sample_rate = PVRX(PDSINST_DOUTU_SAMPLE_RATE_INSTANCE);

      pvr_pds_setup_doutu(&pds_kick_program.usc_task_control,
                          usc_program_dev_addr.addr,
                          spm_load_collection[i].info->temps_required,
                          sample_rate,
                          false);

      /* Generates both code and data. */
      pvr_pds_generate_pixel_shader_program(
         &pds_kick_program,
         (uint32_t *)(mem_ptr + pds_kick_aligned_offsets[i]));

      device->spm_load_state.load_program[i].pds_pixel_program_offset =
         PVR_DEV_ADDR_OFFSET(pds_bo->dev_addr, pds_kick_aligned_offsets[i]);
      device->spm_load_state.load_program[i].pds_uniform_program_offset =
         PVR_DEV_ADDR_OFFSET(pds_bo->dev_addr, pds_texture_aligned_offsets[i]);

      /* TODO: From looking at the pvr_pds_generate_...() functions, it seems
       * like temps_used is always 1. Should we remove this and hard code it
       * with a define in the PDS code?
       */
      device->spm_load_state.load_program[i].pds_texture_program_temps_count =
         pds_texture_program.temps_used;
   }

   device->spm_load_state.usc_programs = usc_bo;
   device->spm_load_state.pds_programs = pds_bo;

   return VK_SUCCESS;
}

void pvr_device_finish_spm_load_state(struct pvr_device *device)
{
   pvr_bo_suballoc_free(device->spm_load_state.pds_programs);
   pvr_bo_suballoc_free(device->spm_load_state.usc_programs);
}

static inline enum PVRX(PBESTATE_PACKMODE)
pvr_spm_get_pbe_packmode(uint32_t dword_count)
{
   switch (dword_count) {
   case 1:
      return PVRX(PBESTATE_PACKMODE_U32);
   case 2:
      return PVRX(PBESTATE_PACKMODE_U32U32);
   case 3:
      return PVRX(PBESTATE_PACKMODE_U32U32U32);
   case 4:
      return PVRX(PBESTATE_PACKMODE_U32U32U32U32);
   default:
      unreachable("Unsupported dword_count");
   }
}

/**
 * \brief Sets up PBE registers and state values for a single render output.
 *
 * On a PR we want to store tile data to the scratch buffer, so we need to set
 * up the Pixel Back End (PBE) to write the data to the scratch buffer. This
 * function sets up the PBE state and register values required to do so for a
 * single resource, whether it be a tile buffer or the output register set.
 *
 * \return Size of the data saved into the scratch buffer in bytes.
 */
static uint64_t pvr_spm_setup_pbe_state(
   const struct pvr_device_info *dev_info,
   const VkExtent2D *framebuffer_size,
   uint32_t dword_count,
   enum pvr_pbe_source_start_pos source_start,
   uint32_t sample_count,
   pvr_dev_addr_t scratch_buffer_addr,
   uint32_t pbe_state_words_out[static const ROGUE_NUM_PBESTATE_STATE_WORDS],
   uint64_t pbe_reg_words_out[static const ROGUE_NUM_PBESTATE_REG_WORDS])
{
   const uint32_t stride =
      ALIGN_POT(framebuffer_size->width,
                PVRX(PBESTATE_REG_WORD0_LINESTRIDE_UNIT_SIZE));

   const struct pvr_pbe_surf_params surface_params = {
      .swizzle = {
         [0] = PIPE_SWIZZLE_X,
         [1] = PIPE_SWIZZLE_Y,
         [2] = PIPE_SWIZZLE_Z,
         [3] = PIPE_SWIZZLE_W,
      },
      .pbe_packmode = pvr_spm_get_pbe_packmode(dword_count),
      .source_format = PVRX(PBESTATE_SOURCE_FORMAT_8_PER_CHANNEL),
      .addr = scratch_buffer_addr,
      .mem_layout = PVR_MEMLAYOUT_LINEAR,
      .stride = stride,
   };
   const struct pvr_pbe_render_params render_params = {
      .max_x_clip = framebuffer_size->width - 1,
      .max_y_clip = framebuffer_size->height - 1,
      .source_start = source_start,
   };

   pvr_pbe_pack_state(dev_info,
                      &surface_params,
                      &render_params,
                      pbe_state_words_out,
                      pbe_reg_words_out);

   return (uint64_t)stride * framebuffer_size->height * sample_count *
          PVR_DW_TO_BYTES(dword_count);
}

static inline void pvr_set_pbe_all_valid_mask(struct usc_mrt_desc *desc)
{
   for (uint32_t i = 0; i < ARRAY_SIZE(desc->valid_mask); i++)
      desc->valid_mask[i] = ~0;
}

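/* Advance the device address _addr by _offset bytes, in place. */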
#define PVR_DEV_ADDR_ADVANCE(_addr, _offset) \
   _addr = PVR_DEV_ADDR_OFFSET(_addr, _offset)

/**
 * \brief Sets up PBE registers, PBE state values and MRT data for a single
 * render output requiring 8 dwords to be written.
 *
 * On a PR we want to store tile data to the scratch buffer, so we need to set
 * up the Pixel Back End (PBE) to write the data to the scratch buffer, as
 * well as set up the Multiple Render Target (MRT) info so the compiler knows
 * what data needs to be stored (output regs or tile buffers) and can generate
 * the appropriate EOT shader.
 *
 * This function is only usable on devices with the eight_output_registers
 * feature, which is what requires 8 dwords to be stored.
 *
 * \return Size of the data saved into the scratch buffer in bytes.
 */
static uint64_t pvr_spm_setup_pbe_eight_dword_write(
   const struct pvr_device_info *dev_info,
   const VkExtent2D *framebuffer_size,
   uint32_t sample_count,
   enum usc_mrt_resource_type source_type,
   uint32_t tile_buffer_idx,
   pvr_dev_addr_t scratch_buffer_addr,
   uint32_t pbe_state_word_0_out[static const ROGUE_NUM_PBESTATE_STATE_WORDS],
   uint32_t pbe_state_word_1_out[static const ROGUE_NUM_PBESTATE_STATE_WORDS],
   uint64_t pbe_reg_word_0_out[static const ROGUE_NUM_PBESTATE_REG_WORDS],
   uint64_t pbe_reg_word_1_out[static const ROGUE_NUM_PBESTATE_REG_WORDS],
   uint32_t *render_target_used_out)
{
   const uint32_t max_pbe_write_size_dw = 4;
   uint32_t render_target_used = 0;
   uint64_t mem_stored;

   assert(PVR_HAS_FEATURE(dev_info, eight_output_registers));
   assert(source_type != USC_MRT_RESOURCE_TYPE_INVALID);

   /* To store 8 dwords we need to split this into two
    * ROGUE_PBESTATE_PACKMODE_U32U32U32U32 stores with the second one using
    * PVR_PBE_STARTPOS_BIT128 as the source offset to store the last 4 dwords.
    */

   mem_stored = pvr_spm_setup_pbe_state(dev_info,
                                        framebuffer_size,
                                        max_pbe_write_size_dw,
                                        PVR_PBE_STARTPOS_BIT0,
                                        sample_count,
                                        scratch_buffer_addr,
                                        pbe_state_word_0_out,
                                        pbe_reg_word_0_out);

   PVR_DEV_ADDR_ADVANCE(scratch_buffer_addr, mem_stored);

   render_target_used++;

   mem_stored += pvr_spm_setup_pbe_state(dev_info,
                                         framebuffer_size,
                                         max_pbe_write_size_dw,
                                         PVR_PBE_STARTPOS_BIT128,
                                         sample_count,
                                         scratch_buffer_addr,
                                         pbe_state_word_1_out,
                                         pbe_reg_word_1_out);

   PVR_DEV_ADDR_ADVANCE(scratch_buffer_addr, mem_stored);

   render_target_used++;
   *render_target_used_out = render_target_used;

   return mem_stored;
}

/**
 * \brief Create and upload the EOT PDS program.
 *
 * Essentially performs a DOUTU of the USC EOT shader.
 */
/* TODO: See if we can dedup this with
 * pvr_sub_cmd_gfx_per_job_fragment_programs_create_and_upload().
 */
static VkResult pvr_pds_pixel_event_program_create_and_upload(
   struct pvr_device *device,
   const struct pvr_suballoc_bo *usc_eot_program,
   uint32_t usc_temp_count,
   struct pvr_pds_upload *const pds_upload_out)
{
   const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
   struct pvr_pds_event_program program = { 0 };
   uint32_t *staging_buffer;
   VkResult result;

   pvr_pds_setup_doutu(&program.task_control,
                       usc_eot_program->dev_addr.addr,
                       usc_temp_count,
                       PVRX(PDSINST_DOUTU_SAMPLE_RATE_INSTANCE),
                       false);

   staging_buffer =
      vk_alloc(&device->vk.alloc,
               PVR_DW_TO_BYTES(device->pixel_event_data_size_in_dwords),
               8,
               VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
   if (!staging_buffer)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   pvr_pds_generate_pixel_event_data_segment(&program,
                                             staging_buffer,
                                             dev_info);

   result = pvr_gpu_upload_pds(device,
                               staging_buffer,
                               device->pixel_event_data_size_in_dwords,
                               4,
                               NULL,
                               0,
                               0,
                               4,
                               pds_upload_out);
   vk_free(&device->vk.alloc, staging_buffer);
   return result;
}

/**
 * \brief Sets up the End of Tile (EOT) program for SPM.
 *
 * This sets up an EOT program to store the render pass's on-chip and
 * off-chip tile data to the SPM scratch buffer on the EOT event.
 */
VkResult
pvr_spm_init_eot_state(struct pvr_device *device,
                       struct pvr_spm_eot_state *spm_eot_state,
                       const struct pvr_framebuffer *framebuffer,
                       const struct pvr_renderpass_hwsetup_render *hw_render,
                       uint32_t *emit_count_out)
{
   const VkExtent2D framebuffer_size = {
      .width = framebuffer->width,
      .height = framebuffer->height,
   };
   uint32_t pbe_state_words[PVR_MAX_COLOR_ATTACHMENTS]
                           [ROGUE_NUM_PBESTATE_STATE_WORDS];
   const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
   uint32_t total_render_target_used = 0;
   struct pvr_pds_upload pds_eot_program;
   struct util_dynarray usc_shader_binary;
   uint32_t usc_temp_count;
   VkResult result;

   pvr_dev_addr_t next_scratch_buffer_addr =
      framebuffer->scratch_buffer->bo->vma->dev_addr;
   uint64_t mem_stored;

   /* TODO: See if instead of having a separate path for devices with 8 output
    * regs we can instead do this in a loop and dedup some stuff.
    */
   assert(util_is_power_of_two_or_zero(hw_render->output_regs_count) &&
          hw_render->output_regs_count <= 8);
   if (hw_render->output_regs_count == 8) {
      uint32_t render_targets_used;

      /* Store on-chip tile data (i.e. output regs). */

      mem_stored = pvr_spm_setup_pbe_eight_dword_write(
         dev_info,
         &framebuffer_size,
         hw_render->sample_count,
         USC_MRT_RESOURCE_TYPE_OUTPUT_REG,
         0,
         next_scratch_buffer_addr,
         pbe_state_words[total_render_target_used],
         pbe_state_words[total_render_target_used + 1],
         spm_eot_state->pbe_reg_words[total_render_target_used],
         spm_eot_state->pbe_reg_words[total_render_target_used + 1],
         &render_targets_used);

      PVR_DEV_ADDR_ADVANCE(next_scratch_buffer_addr, mem_stored);
      total_render_target_used += render_targets_used;

      /* Store off-chip tile data (i.e. tile buffers). */

      for (uint32_t i = 0; i < hw_render->tile_buffers_count; i++) {
         assert(!"Add support for tile buffers in EOT");
         pvr_finishme("Add support for tile buffers in EOT");

         /* `+ 1` since we have 2 emits per tile buffer. */
         assert(total_render_target_used + 1 < PVR_MAX_COLOR_ATTACHMENTS);

         mem_stored = pvr_spm_setup_pbe_eight_dword_write(
            dev_info,
            &framebuffer_size,
            hw_render->sample_count,
            USC_MRT_RESOURCE_TYPE_MEMORY,
            i,
            next_scratch_buffer_addr,
            pbe_state_words[total_render_target_used],
            pbe_state_words[total_render_target_used + 1],
            spm_eot_state->pbe_reg_words[total_render_target_used],
            spm_eot_state->pbe_reg_words[total_render_target_used + 1],
            &render_targets_used);

         PVR_DEV_ADDR_ADVANCE(next_scratch_buffer_addr, mem_stored);
         total_render_target_used += render_targets_used;
      }
   } else {
      /* Store on-chip tile data (i.e. output regs). */

      mem_stored = pvr_spm_setup_pbe_state(
         dev_info,
         &framebuffer_size,
         hw_render->output_regs_count,
         PVR_PBE_STARTPOS_BIT0,
         hw_render->sample_count,
         next_scratch_buffer_addr,
         pbe_state_words[total_render_target_used],
         spm_eot_state->pbe_reg_words[total_render_target_used]);

      PVR_DEV_ADDR_ADVANCE(next_scratch_buffer_addr, mem_stored);

      total_render_target_used++;

      /* Store off-chip tile data (i.e. tile buffers). */

      for (uint32_t i = 0; i < hw_render->tile_buffers_count; i++) {
         assert(!"Add support for tile buffers in EOT");
         pvr_finishme("Add support for tile buffers in EOT");

         assert(total_render_target_used < PVR_MAX_COLOR_ATTACHMENTS);

         mem_stored = pvr_spm_setup_pbe_state(
            dev_info,
            &framebuffer_size,
            hw_render->output_regs_count,
            PVR_PBE_STARTPOS_BIT0,
            hw_render->sample_count,
            next_scratch_buffer_addr,
            pbe_state_words[total_render_target_used],
            spm_eot_state->pbe_reg_words[total_render_target_used]);

         PVR_DEV_ADDR_ADVANCE(next_scratch_buffer_addr, mem_stored);

         total_render_target_used++;
      }
   }

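   /* The PBE state words are consumed by the EOT shader generated below
    * (which performs the emits), while the PBE reg words stashed in
    * spm_eot_state are consumed by the HW at render time.
    */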
   pvr_uscgen_eot("SPM EOT",
                  total_render_target_used,
                  pbe_state_words[0],
                  &usc_temp_count,
                  &usc_shader_binary);

   /* TODO: Create a #define in the compiler code to replace the 16. */
   result = pvr_gpu_upload_usc(device,
                               usc_shader_binary.data,
                               usc_shader_binary.size,
                               16,
                               &spm_eot_state->usc_eot_program);

   util_dynarray_fini(&usc_shader_binary);

   if (result != VK_SUCCESS)
      return result;

   result = pvr_pds_pixel_event_program_create_and_upload(
      device,
      spm_eot_state->usc_eot_program,
      usc_temp_count,
      &pds_eot_program);
   if (result != VK_SUCCESS) {
      pvr_bo_suballoc_free(spm_eot_state->usc_eot_program);
      return result;
   }

   spm_eot_state->pixel_event_program_data_upload = pds_eot_program.pvr_bo;
   spm_eot_state->pixel_event_program_data_offset = pds_eot_program.data_offset;

   *emit_count_out = total_render_target_used;

   return VK_SUCCESS;
}

void pvr_spm_finish_eot_state(struct pvr_device *device,
                              struct pvr_spm_eot_state *spm_eot_state)
{
   pvr_bo_suballoc_free(spm_eot_state->pixel_event_program_data_upload);
   pvr_bo_suballoc_free(spm_eot_state->usc_eot_program);
}

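/* Note that a dword_count of 3 never occurs here: callers derive the count
 * from a power-of-two output register count clamped to at most 4 (see
 * pvr_spm_init_bgobj_state()), so only 1, 2 and 4 dword formats are needed.
 */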
static VkFormat pvr_get_format_from_dword_count(uint32_t dword_count)
{
   switch (dword_count) {
   case 1:
      return VK_FORMAT_R32_UINT;
   case 2:
      return VK_FORMAT_R32G32_UINT;
   case 4:
      return VK_FORMAT_R32G32B32A32_UINT;
   default:
      unreachable("Invalid dword_count");
   }
}

static VkResult pvr_spm_setup_texture_state_words(
   struct pvr_device *device,
   uint32_t dword_count,
   const VkExtent2D framebuffer_size,
   uint32_t sample_count,
   pvr_dev_addr_t scratch_buffer_addr,
   uint64_t image_descriptor[static const ROGUE_NUM_TEXSTATE_IMAGE_WORDS],
   uint64_t *mem_used_out)
{
   /* We can ignore the framebuffer's layer count since we only support
    * writing to layer 0.
    */
   struct pvr_texture_state_info info = {
      .format = pvr_get_format_from_dword_count(dword_count),
      .mem_layout = PVR_MEMLAYOUT_LINEAR,

      .type = VK_IMAGE_VIEW_TYPE_2D,
      .tex_state_type = PVR_TEXTURE_STATE_STORAGE,
      .extent = {
         .width = framebuffer_size.width,
         .height = framebuffer_size.height,
      },

      .mip_levels = 1,

      .sample_count = sample_count,
      .stride = framebuffer_size.width,

      .addr = scratch_buffer_addr,
   };
   const uint64_t aligned_fb_width =
      ALIGN_POT(framebuffer_size.width,
                PVRX(CR_PBE_WORD0_MRT0_LINESTRIDE_ALIGNMENT));
   const uint64_t fb_area = aligned_fb_width * framebuffer_size.height;
   const uint8_t *format_swizzle;
   VkResult result;

   format_swizzle = pvr_get_format_swizzle(info.format);
   memcpy(info.swizzle, format_swizzle, sizeof(info.swizzle));

   result = pvr_pack_tex_state(device, &info, image_descriptor);
   if (result != VK_SUCCESS)
      return result;

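   /* This should match the amount of data the PBE stores for the same
    * parameters (see pvr_spm_setup_pbe_state()), since the SPM load programs
    * read back exactly what the EOT programs wrote.
    */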
   *mem_used_out = fb_area * PVR_DW_TO_BYTES(dword_count) * sample_count;

   return VK_SUCCESS;
}

/* FIXME: Can we dedup this with pvr_load_op_pds_data_create_and_upload() ? */
static VkResult pvr_pds_bgnd_program_create_and_upload(
   struct pvr_device *device,
   uint32_t texture_program_data_size_in_dwords,
   const struct pvr_bo *consts_buffer,
   uint32_t const_shared_regs,
   struct pvr_pds_upload *pds_upload_out)
{
   const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
   struct pvr_pds_pixel_shader_sa_program texture_program = { 0 };
   uint32_t staging_buffer_size;
   uint32_t *staging_buffer;
   VkResult result;

   pvr_csb_pack (&texture_program.texture_dma_address[0],
                 PDSINST_DOUT_FIELDS_DOUTD_SRC0,
                 doutd_src0) {
      doutd_src0.sbase = consts_buffer->vma->dev_addr;
   }

   pvr_csb_pack (&texture_program.texture_dma_control[0],
                 PDSINST_DOUT_FIELDS_DOUTD_SRC1,
                 doutd_src1) {
      doutd_src1.dest = PVRX(PDSINST_DOUTD_DEST_COMMON_STORE);
      doutd_src1.bsize = const_shared_regs;
   }

   texture_program.num_texture_dma_kicks += 1;

#if MESA_DEBUG
   pvr_pds_set_sizes_pixel_shader_sa_texture_data(&texture_program, dev_info);
   assert(texture_program_data_size_in_dwords == texture_program.data_size);
#endif

   staging_buffer_size = PVR_DW_TO_BYTES(texture_program_data_size_in_dwords);

   staging_buffer = vk_alloc(&device->vk.alloc,
                             staging_buffer_size,
                             8,
                             VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
   if (!staging_buffer)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   pvr_pds_generate_pixel_shader_sa_texture_state_data(&texture_program,
                                                       staging_buffer,
                                                       dev_info);

   /* FIXME: Figure out the define for alignment of 16. */
   result = pvr_gpu_upload_pds(device,
                               &staging_buffer[0],
                               texture_program_data_size_in_dwords,
                               16,
                               NULL,
                               0,
                               0,
                               16,
                               pds_upload_out);
   vk_free(&device->vk.alloc, staging_buffer);

   return result;
}

VkResult
pvr_spm_init_bgobj_state(struct pvr_device *device,
                         struct pvr_spm_bgobj_state *spm_bgobj_state,
                         const struct pvr_framebuffer *framebuffer,
                         const struct pvr_renderpass_hwsetup_render *hw_render,
                         uint32_t emit_count)
{
   const uint32_t spm_load_program_idx =
      pvr_get_spm_load_program_index(hw_render->sample_count,
                                     hw_render->tile_buffers_count,
                                     hw_render->output_regs_count);
   const VkExtent2D framebuffer_size = {
      .width = framebuffer->width,
      .height = framebuffer->height,
   };
   pvr_dev_addr_t next_scratch_buffer_addr =
      framebuffer->scratch_buffer->bo->vma->dev_addr;
   struct pvr_spm_per_load_program_state *load_program_state;
   struct pvr_pds_upload pds_texture_data_upload;
   const struct pvr_shader_factory_info *info;
   union pvr_sampler_descriptor *descriptor;
   uint64_t consts_buffer_size;
   uint32_t dword_count;
   uint32_t *mem_ptr;
   VkResult result;

   assert(spm_load_program_idx < ARRAY_SIZE(spm_load_collection));
   info = spm_load_collection[spm_load_program_idx].info;

   consts_buffer_size = PVR_DW_TO_BYTES(info->const_shared_regs);

   /* TODO: Remove this check, along with the pvr_finishme(), once the zeroed
    * shaders are replaced by the real shaders.
    */
   if (!consts_buffer_size)
      return VK_SUCCESS;

   pvr_finishme("Remove consts buffer size check");

   result = pvr_bo_alloc(device,
                         device->heaps.general_heap,
                         consts_buffer_size,
                         sizeof(uint32_t),
                         PVR_BO_ALLOC_FLAG_CPU_MAPPED,
                         &spm_bgobj_state->consts_buffer);
   if (result != VK_SUCCESS)
      return result;

   mem_ptr = spm_bgobj_state->consts_buffer->bo->map;

   if (info->driver_const_location_map) {
      const uint32_t *const const_map = info->driver_const_location_map;

      for (uint32_t i = 0; i < PVR_SPM_LOAD_CONST_COUNT; i += 2) {
         pvr_dev_addr_t tile_buffer_addr;

         if (const_map[i] == PVR_SPM_LOAD_DEST_UNUSED) {
#if MESA_DEBUG
            for (uint32_t j = i; j < PVR_SPM_LOAD_CONST_COUNT; j++)
               assert(const_map[j] == PVR_SPM_LOAD_DEST_UNUSED);
#endif
            break;
         }

         tile_buffer_addr =
            device->tile_buffer_state.buffers[i / 2]->vma->dev_addr;

         assert(const_map[i] == const_map[i + 1] + 1);
         mem_ptr[const_map[i]] = tile_buffer_addr.addr >> 32;
         mem_ptr[const_map[i + 1]] = (uint32_t)tile_buffer_addr.addr;
      }
   }

   /* TODO: The 32 comes from how the shaders are compiled. We should
    * unhardcode it when this is hooked up to the compiler.
    */
   descriptor = (union pvr_sampler_descriptor *)(mem_ptr + 32);
   *descriptor = (union pvr_sampler_descriptor){ 0 };

   pvr_csb_pack (&descriptor->data.sampler_word, TEXSTATE_SAMPLER, sampler) {
      sampler.non_normalized_coords = true;
      sampler.addrmode_v = PVRX(TEXSTATE_ADDRMODE_CLAMP_TO_EDGE);
      sampler.addrmode_u = PVRX(TEXSTATE_ADDRMODE_CLAMP_TO_EDGE);
      sampler.minfilter = PVRX(TEXSTATE_FILTER_POINT);
      sampler.magfilter = PVRX(TEXSTATE_FILTER_POINT);
      sampler.maxlod = PVRX(TEXSTATE_CLAMP_MIN);
      sampler.minlod = PVRX(TEXSTATE_CLAMP_MIN);
      sampler.dadjust = PVRX(TEXSTATE_DADJUST_ZERO_UINT);
   }

   /* Even if we might have 8 output regs we can only pack and write 4 dwords
    * using R32G32B32A32_UINT.
    */
   if (hw_render->tile_buffers_count > 0)
      dword_count = 4;
   else
      dword_count = MIN2(hw_render->output_regs_count, 4);

   for (uint32_t i = 0; i < emit_count; i++) {
      uint64_t *mem_ptr_u64 = (uint64_t *)mem_ptr;
      uint64_t mem_used = 0;

      STATIC_ASSERT(ROGUE_NUM_TEXSTATE_IMAGE_WORDS * sizeof(uint64_t) /
                       sizeof(uint32_t) ==
                    PVR_IMAGE_DESCRIPTOR_SIZE);
      mem_ptr_u64 += i * ROGUE_NUM_TEXSTATE_IMAGE_WORDS;

      result = pvr_spm_setup_texture_state_words(device,
                                                 dword_count,
                                                 framebuffer_size,
                                                 hw_render->sample_count,
                                                 next_scratch_buffer_addr,
                                                 mem_ptr_u64,
                                                 &mem_used);
      if (result != VK_SUCCESS)
         goto err_free_consts_buffer;

      PVR_DEV_ADDR_ADVANCE(next_scratch_buffer_addr, mem_used);
   }

   assert(spm_load_program_idx <
          ARRAY_SIZE(device->spm_load_state.load_program));
   load_program_state =
      &device->spm_load_state.load_program[spm_load_program_idx];

   result = pvr_pds_bgnd_program_create_and_upload(
      device,
      load_program_state->pds_texture_program_data_size,
      spm_bgobj_state->consts_buffer,
      info->const_shared_regs,
      &pds_texture_data_upload);
   if (result != VK_SUCCESS)
      goto err_free_consts_buffer;

   spm_bgobj_state->pds_texture_data_upload = pds_texture_data_upload.pvr_bo;

   /* TODO: Is it worth deduplicating this with pvr_pds_bgnd_pack_state()? */

   /* clang-format off */
   pvr_csb_pack (&spm_bgobj_state->pds_reg_values[0],
                 CR_PDS_BGRND0_BASE,
                 value) {
   /* clang-format on */
      value.shader_addr = load_program_state->pds_pixel_program_offset;
      value.texunicode_addr = load_program_state->pds_uniform_program_offset;
   }

   /* clang-format off */
   pvr_csb_pack (&spm_bgobj_state->pds_reg_values[1],
                 CR_PDS_BGRND1_BASE,
                 value) {
   /* clang-format on */
      value.texturedata_addr =
         PVR_DEV_ADDR(pds_texture_data_upload.data_offset);
   }

   /* clang-format off */
   pvr_csb_pack (&spm_bgobj_state->pds_reg_values[2],
                 CR_PDS_BGRND3_SIZEINFO,
                 value) {
   /* clang-format on */
      value.usc_sharedsize =
         DIV_ROUND_UP(info->const_shared_regs,
                      PVRX(CR_PDS_BGRND3_SIZEINFO_USC_SHAREDSIZE_UNIT_SIZE));
      value.pds_texturestatesize = DIV_ROUND_UP(
         pds_texture_data_upload.data_size,
         PVRX(CR_PDS_BGRND3_SIZEINFO_PDS_TEXTURESTATESIZE_UNIT_SIZE));
      value.pds_tempsize =
         DIV_ROUND_UP(load_program_state->pds_texture_program_temps_count,
                      PVRX(CR_PDS_BGRND3_SIZEINFO_PDS_TEMPSIZE_UNIT_SIZE));
   }

   return VK_SUCCESS;

err_free_consts_buffer:
   pvr_bo_free(device, spm_bgobj_state->consts_buffer);

   return result;
}

void pvr_spm_finish_bgobj_state(struct pvr_device *device,
                                struct pvr_spm_bgobj_state *spm_bgobj_state)
{
   pvr_bo_suballoc_free(spm_bgobj_state->pds_texture_data_upload);
   pvr_bo_free(device, spm_bgobj_state->consts_buffer);
}

#undef PVR_DEV_ADDR_ADVANCE