1 /*
2  * Copyright © 2022 Imagination Technologies Ltd.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a copy
5  * of this software and associated documentation files (the "Software"), to deal
6  * in the Software without restriction, including without limitation the rights
7  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8  * copies of the Software, and to permit persons to whom the Software is
9  * furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  */
23 
24 #include <assert.h>
25 #include <stdbool.h>
26 #include <stdint.h>
27 #include <vulkan/vulkan.h>
28 
29 #include "hwdef/rogue_hw_defs.h"
30 #include "hwdef/rogue_hw_utils.h"
31 #include "pvr_bo.h"
32 #include "pvr_csb.h"
 33 #include "pvr_csb_enum_helpers.h"
 34 #include "pvr_debug.h"
36 #include "pvr_job_common.h"
37 #include "pvr_job_context.h"
38 #include "pvr_job_render.h"
39 #include "pvr_pds.h"
40 #include "pvr_private.h"
41 #include "pvr_rogue_fw.h"
42 #include "pvr_types.h"
43 #include "pvr_winsys.h"
44 #include "util/compiler.h"
45 #include "util/format/format_utils.h"
46 #include "util/macros.h"
47 #include "util/u_math.h"
48 #include "vk_alloc.h"
49 #include "vk_log.h"
50 #include "vk_util.h"
51 
52 #define ROGUE_BIF_PM_FREELIST_BASE_ADDR_ALIGNSIZE 16U
53 
54 /* FIXME: Is there a hardware define we can use instead? */
55 /* 1 DWord per PM physical page stored in the free list */
56 #define ROGUE_FREE_LIST_ENTRY_SIZE ((uint32_t)sizeof(uint32_t))
57 
58 /* FIXME: The three defines below, for the number of PC, PD and PT entries in a
59  * 4KB page, come from rgxmmudefs_km.h (meaning they're part of the
60  * auto-generated hwdefs). Should these be defined in rogue_mmu.xml? Keeping in
61  * mind that we probably only need these three values. */
62 #define ROGUE_NUM_PC_ENTRIES_PER_PAGE 0x400U
63 
64 #define ROGUE_NUM_PD_ENTRIES_PER_PAGE 0x200U
65 
66 #define ROGUE_NUM_PT_ENTRIES_PER_PAGE 0x200U
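
/* Note (derivation assumed, for illustration only): these counts are
 * consistent with a 4 KiB MMU page holding 4-byte page catalog entries and
 * 8-byte page directory/table entries (0x1000 / 4 = 0x400 and
 * 0x1000 / 8 = 0x200). The authoritative values come from rgxmmudefs_km.h as
 * noted above.
 */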
67 
68 struct pvr_free_list {
69    struct pvr_device *device;
70 
71    uint64_t size;
72 
73    struct pvr_bo *bo;
74 
75    struct pvr_winsys_free_list *ws_free_list;
76 };
77 
78 struct pvr_rt_dataset {
79    struct pvr_device *device;
80 
81    /* RT dataset information */
82    uint32_t width;
83    uint32_t height;
84    uint32_t samples;
85    uint32_t layers;
86 
87    struct pvr_free_list *global_free_list;
88    struct pvr_free_list *local_free_list;
89 
90    struct pvr_bo *vheap_rtc_bo;
91    pvr_dev_addr_t vheap_dev_addr;
92    pvr_dev_addr_t rtc_dev_addr;
93 
94    struct pvr_bo *tpc_bo;
95    uint64_t tpc_stride;
96    uint64_t tpc_size;
97 
98    struct pvr_winsys_rt_dataset *ws_rt_dataset;
99 
100    /* RT data information */
101    struct pvr_bo *mta_mlist_bo;
102 
103    struct pvr_bo *rgn_headers_bo;
104    uint64_t rgn_headers_stride;
105 
106    bool need_frag;
107 
108    uint8_t rt_data_idx;
109 
110    struct {
111       pvr_dev_addr_t mta_dev_addr;
112       pvr_dev_addr_t mlist_dev_addr;
113       pvr_dev_addr_t rgn_headers_dev_addr;
114    } rt_datas[ROGUE_NUM_RTDATAS];
115 };
116 
117 VkResult pvr_free_list_create(struct pvr_device *device,
118                               uint32_t initial_size,
119                               uint32_t max_size,
120                               uint32_t grow_size,
121                               uint32_t grow_threshold,
122                               struct pvr_free_list *parent_free_list,
123                               struct pvr_free_list **const free_list_out)
124 {
125    const struct pvr_device_runtime_info *runtime_info =
126       &device->pdevice->dev_runtime_info;
127    struct pvr_winsys_free_list *parent_ws_free_list =
128       parent_free_list ? parent_free_list->ws_free_list : NULL;
129    const uint64_t bo_flags = PVR_BO_ALLOC_FLAG_GPU_UNCACHED |
130                              PVR_BO_ALLOC_FLAG_PM_FW_PROTECT;
131    struct pvr_free_list *free_list;
132    uint32_t cache_line_size;
133    uint32_t initial_num_pages;
134    uint32_t grow_num_pages;
135    uint32_t max_num_pages;
136    uint64_t addr_alignment;
137    uint64_t size_alignment;
138    uint64_t size;
139    VkResult result;
140 
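   /* Sanity-check the caller-supplied sizes. grow_threshold is assumed to be
    * a percentage of free list usage at which the free list is grown, hence
    * the <= 100 check below.
    */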
141    assert((initial_size + grow_size) <= max_size);
142    assert(max_size != 0);
143    assert(grow_threshold <= 100);
144 
145    /* Make sure the free list is created with at least a single page. */
146    if (initial_size == 0)
147       initial_size = ROGUE_BIF_PM_PHYSICAL_PAGE_SIZE;
148 
149    /* The freelist sizes must respect the PM freelist base address alignment
150     * requirement. As the freelist entries are cached by the SLC, it's also
151     * necessary to ensure the sizes respect the SLC cache line size to avoid
152     * invalid entries appearing in the cache, which would be problematic after
153     * a grow operation, as the SLC entries aren't invalidated. We do this by
154     * making sure the freelist values are appropriately aligned.
155     *
156     * To calculate the alignment, we first take the largest of the freelist
157     * base address alignment and the SLC cache line size. We then divide this
158     * by the freelist entry size to determine the number of freelist entries
159     * required by the PM. Finally, as each entry holds a single PM physical
160     * page, we multiply the number of entries by the page size.
161     *
162     * As an example, if the base address alignment is 16 bytes, the SLC cache
163     * line size is 64 bytes and the freelist entry size is 4 bytes then 16
164     * entries are required, as we take the SLC cacheline size (being the larger
165     * of the two values) and divide this by 4. If the PM page size is 4096
166     * bytes then we end up with an alignment of 65536 bytes.
167     */
168    cache_line_size = rogue_get_slc_cache_line_size(&device->pdevice->dev_info);
169 
170    addr_alignment =
171       MAX2(ROGUE_BIF_PM_FREELIST_BASE_ADDR_ALIGNSIZE, cache_line_size);
172    size_alignment = (addr_alignment / ROGUE_FREE_LIST_ENTRY_SIZE) *
173                     ROGUE_BIF_PM_PHYSICAL_PAGE_SIZE;
174 
175    assert(util_is_power_of_two_nonzero64(size_alignment));
176 
177    initial_size = align64(initial_size, size_alignment);
178    max_size = align64(max_size, size_alignment);
179    grow_size = align64(grow_size, size_alignment);
180 
181    /* Make sure the 'max' size doesn't exceed what the firmware supports and
182     * adjust the other sizes accordingly.
183     */
184    if (max_size > runtime_info->max_free_list_size) {
185       max_size = runtime_info->max_free_list_size;
186       assert(align64(max_size, size_alignment) == max_size);
187    }
188 
189    if (initial_size > max_size)
190       initial_size = max_size;
191 
192    if (initial_size == max_size)
193       grow_size = 0;
194 
195    initial_num_pages = initial_size >> ROGUE_BIF_PM_PHYSICAL_PAGE_SHIFT;
196    max_num_pages = max_size >> ROGUE_BIF_PM_PHYSICAL_PAGE_SHIFT;
197    grow_num_pages = grow_size >> ROGUE_BIF_PM_PHYSICAL_PAGE_SHIFT;
198 
199    /* Calculate the size of the buffer needed to store the free list entries
200     * based on the maximum number of pages we can have.
201     */
202    size = max_num_pages * ROGUE_FREE_LIST_ENTRY_SIZE;
203    assert(align64(size, addr_alignment) == size);
204 
205    free_list = vk_alloc(&device->vk.alloc,
206                         sizeof(*free_list),
207                         8,
208                         VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
209    if (!free_list)
210       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
211 
212    /* FIXME: The memory is mapped GPU uncached, but this seems to contradict
213     * the comment above about aligning to the SLC cache line size.
214     */
215    result = pvr_bo_alloc(device,
216                          device->heaps.general_heap,
217                          size,
218                          addr_alignment,
219                          bo_flags,
220                          &free_list->bo);
221    if (result != VK_SUCCESS)
222       goto err_vk_free_free_list;
223 
224    result = device->ws->ops->free_list_create(device->ws,
225                                               free_list->bo->vma,
226                                               initial_num_pages,
227                                               max_num_pages,
228                                               grow_num_pages,
229                                               grow_threshold,
230                                               parent_ws_free_list,
231                                               &free_list->ws_free_list);
232    if (result != VK_SUCCESS)
233       goto err_pvr_bo_free_bo;
234 
235    free_list->device = device;
236    free_list->size = size;
237 
238    *free_list_out = free_list;
239 
240    return VK_SUCCESS;
241 
242 err_pvr_bo_free_bo:
243    pvr_bo_free(device, free_list->bo);
244 
245 err_vk_free_free_list:
246    vk_free(&device->vk.alloc, free_list);
247 
248    return result;
249 }
250 
251 void pvr_free_list_destroy(struct pvr_free_list *free_list)
252 {
253    struct pvr_device *device = free_list->device;
254 
255    device->ws->ops->free_list_destroy(free_list->ws_free_list);
256    pvr_bo_free(device, free_list->bo);
257    vk_free(&device->vk.alloc, free_list);
258 }
259 
260 static inline void pvr_get_samples_in_xy(uint32_t samples,
261                                          uint32_t *const x_out,
262                                          uint32_t *const y_out)
263 {
264    switch (samples) {
265    case 1:
266       *x_out = 1;
267       *y_out = 1;
268       break;
269    case 2:
270       *x_out = 1;
271       *y_out = 2;
272       break;
273    case 4:
274       *x_out = 2;
275       *y_out = 2;
276       break;
277    case 8:
278       *x_out = 2;
279       *y_out = 4;
280       break;
281    default:
282       unreachable("Unsupported number of samples");
283    }
284 }
285 
286 void pvr_rt_mtile_info_init(const struct pvr_device_info *dev_info,
287                             struct pvr_rt_mtile_info *info,
288                             uint32_t width,
289                             uint32_t height,
290                             uint32_t samples)
291 {
292    uint32_t samples_in_x;
293    uint32_t samples_in_y;
294 
295    pvr_get_samples_in_xy(samples, &samples_in_x, &samples_in_y);
296 
297    info->tile_size_x = PVR_GET_FEATURE_VALUE(dev_info, tile_size_x, 1);
298    info->tile_size_y = PVR_GET_FEATURE_VALUE(dev_info, tile_size_y, 1);
299 
300    info->num_tiles_x = DIV_ROUND_UP(width, info->tile_size_x);
301    info->num_tiles_y = DIV_ROUND_UP(height, info->tile_size_y);
302 
303    rogue_get_num_macrotiles_xy(dev_info, &info->mtiles_x, &info->mtiles_y);
304 
305    if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) {
306       assert(PVR_GET_FEATURE_VALUE(dev_info,
307                                    simple_parameter_format_version,
308                                    0) == 2);
309       /* Set up 16 macrotiles with a multiple of 2x2 tiles per macrotile,
310        * which is aligned to a tile group.
311        */
312       info->mtile_x1 = DIV_ROUND_UP(info->num_tiles_x, 8) * 2;
313       info->mtile_y1 = DIV_ROUND_UP(info->num_tiles_y, 8) * 2;
314       info->mtile_x2 = 0;
315       info->mtile_y2 = 0;
316       info->mtile_x3 = 0;
317       info->mtile_y3 = 0;
318       info->x_tile_max = ALIGN_POT(info->num_tiles_x, 2) - 1;
319       info->y_tile_max = ALIGN_POT(info->num_tiles_y, 2) - 1;
320    } else {
321       /* Set up 16 macrotiles with a multiple of 4x4 tiles per macrotile. */
322       info->mtile_x1 = ALIGN_POT(DIV_ROUND_UP(info->num_tiles_x, 4), 4);
323       info->mtile_y1 = ALIGN_POT(DIV_ROUND_UP(info->num_tiles_y, 4), 4);
324       info->mtile_x2 = info->mtile_x1 * 2;
325       info->mtile_y2 = info->mtile_y1 * 2;
326       info->mtile_x3 = info->mtile_x1 * 3;
327       info->mtile_y3 = info->mtile_y1 * 3;
328       info->x_tile_max = info->num_tiles_x - 1;
329       info->y_tile_max = info->num_tiles_y - 1;
330    }
331 
332    info->tiles_per_mtile_x = info->mtile_x1 * samples_in_x;
333    info->tiles_per_mtile_y = info->mtile_y1 * samples_in_y;
334 }
335 
336 /* Note that the unit of the return value depends on the GPU. For cores with the
337  * simple_internal_parameter_format feature the returned size is interpreted as
338  * the number of region headers. For cores without this feature it's interpreted
339  * as the size in dwords.
340  */
341 static uint64_t
342 pvr_rt_get_isp_region_size(struct pvr_device *device,
343                            const struct pvr_rt_mtile_info *mtile_info)
344 {
345    const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
346    uint64_t rgn_size =
347       (uint64_t)mtile_info->tiles_per_mtile_x * mtile_info->tiles_per_mtile_y;
348 
349    if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) {
350       uint32_t version;
351 
352       rgn_size *= (uint64_t)mtile_info->mtiles_x * mtile_info->mtiles_y;
353 
354       if (PVR_FEATURE_VALUE(dev_info,
355                             simple_parameter_format_version,
356                             &version)) {
357          version = 0;
358       }
359 
360       if (version == 2) {
361          /* One region header per 2x2 tile group. */
362          rgn_size /= (2U * 2U);
363       }
364    } else {
365       const uint64_t single_rgn_header_size =
366          rogue_get_region_header_size(dev_info);
367 
368       /* Round up to next dword to prevent IPF overrun and convert to bytes.
369        */
370       rgn_size = DIV_ROUND_UP(rgn_size * single_rgn_header_size, 4);
371    }
372 
373    return rgn_size;
374 }
375 
376 static VkResult pvr_rt_vheap_rtc_data_init(struct pvr_device *device,
377                                            struct pvr_rt_dataset *rt_dataset,
378                                            uint32_t layers)
379 {
380    uint64_t vheap_size;
381    uint32_t alignment;
382    uint64_t rtc_size;
383    VkResult result;
384 
385    vheap_size = ROGUE_CR_PM_VHEAP_TABLE_SIZE * ROGUE_PM_VHEAP_ENTRY_SIZE;
386 
387    if (layers > 1) {
388       uint64_t rtc_entries;
389 
390       vheap_size = ALIGN_POT(vheap_size, PVRX(CR_TA_RTC_ADDR_BASE_ALIGNMENT));
391 
392       rtc_entries = ROGUE_NUM_TEAC + ROGUE_NUM_TE + ROGUE_NUM_VCE;
393       if (PVR_HAS_QUIRK(&device->pdevice->dev_info, 48545))
394          rtc_entries += ROGUE_NUM_TE;
395 
396       rtc_size = rtc_entries * ROGUE_RTC_SIZE_IN_BYTES;
397    } else {
398       rtc_size = 0;
399    }
400 
401    alignment = MAX2(PVRX(CR_PM_VHEAP_TABLE_BASE_ADDR_ALIGNMENT),
402                     PVRX(CR_TA_RTC_ADDR_BASE_ALIGNMENT));
403 
404    result = pvr_bo_alloc(device,
405                          device->heaps.general_heap,
406                          vheap_size + rtc_size,
407                          alignment,
408                          PVR_BO_ALLOC_FLAG_GPU_UNCACHED,
409                          &rt_dataset->vheap_rtc_bo);
410    if (result != VK_SUCCESS)
411       return result;
412 
413    rt_dataset->vheap_dev_addr = rt_dataset->vheap_rtc_bo->vma->dev_addr;
414 
415    if (rtc_size > 0) {
416       rt_dataset->rtc_dev_addr =
417          PVR_DEV_ADDR_OFFSET(rt_dataset->vheap_dev_addr, vheap_size);
418    } else {
419       rt_dataset->rtc_dev_addr = PVR_DEV_ADDR_INVALID;
420    }
421 
422    return VK_SUCCESS;
423 }
424 
425 static void pvr_rt_vheap_rtc_data_fini(struct pvr_rt_dataset *rt_dataset)
426 {
427    rt_dataset->rtc_dev_addr = PVR_DEV_ADDR_INVALID;
428 
429    pvr_bo_free(rt_dataset->device, rt_dataset->vheap_rtc_bo);
430    rt_dataset->vheap_rtc_bo = NULL;
431 }
432 
433 static void
434 pvr_rt_get_tail_ptr_stride_size(const struct pvr_device *device,
435                                 const struct pvr_rt_mtile_info *mtile_info,
436                                 uint32_t layers,
437                                 uint64_t *const stride_out,
438                                 uint64_t *const size_out)
439 {
440    uint32_t max_num_mtiles;
441    uint32_t num_mtiles_x;
442    uint32_t num_mtiles_y;
443    uint32_t version;
444    uint64_t size;
445 
446    num_mtiles_x = mtile_info->mtiles_x * mtile_info->tiles_per_mtile_x;
447    num_mtiles_y = mtile_info->mtiles_y * mtile_info->tiles_per_mtile_y;
448 
449    max_num_mtiles = MAX2(util_next_power_of_two64(num_mtiles_x),
450                          util_next_power_of_two64(num_mtiles_y));
451 
452    size = (uint64_t)max_num_mtiles * max_num_mtiles;
453 
454    if (PVR_FEATURE_VALUE(&device->pdevice->dev_info,
455                          simple_parameter_format_version,
456                          &version)) {
457       version = 0;
458    }
459 
460    if (version == 2) {
461       /* One tail pointer cache entry per 2x2 tile group. */
462       size /= (2U * 2U);
463    }
464 
465    size *= ROGUE_TAIL_POINTER_SIZE;
466 
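   /* For layered rendering, each layer's tail pointer data is padded to a
    * whole number of PM physical pages, so the stride handed back to the
    * caller is in units of pages per layer (inferred from the division
    * below).
    */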
467    if (layers > 1) {
468       size = ALIGN_POT(size, ROGUE_BIF_PM_PHYSICAL_PAGE_SIZE);
469 
470       *stride_out = size / ROGUE_BIF_PM_PHYSICAL_PAGE_SIZE;
471       *size_out = size * layers;
472    } else {
473       *stride_out = 0;
474       *size_out = size;
475    }
476 }
477 
478 static VkResult pvr_rt_tpc_data_init(struct pvr_device *device,
479                                      struct pvr_rt_dataset *rt_dataset,
480                                      const struct pvr_rt_mtile_info *mtile_info,
481                                      uint32_t layers)
482 {
483    uint64_t tpc_size;
484 
485    pvr_rt_get_tail_ptr_stride_size(device,
486                                    mtile_info,
487                                    layers,
488                                    &rt_dataset->tpc_stride,
489                                    &rt_dataset->tpc_size);
490    tpc_size = ALIGN_POT(rt_dataset->tpc_size, ROGUE_TE_TPC_CACHE_LINE_SIZE);
491 
492    return pvr_bo_alloc(device,
493                        device->heaps.general_heap,
494                        tpc_size,
495                        PVRX(CR_TE_TPC_ADDR_BASE_ALIGNMENT),
496                        PVR_BO_ALLOC_FLAG_GPU_UNCACHED,
497                        &rt_dataset->tpc_bo);
498 }
499 
500 static void pvr_rt_tpc_data_fini(struct pvr_rt_dataset *rt_dataset)
501 {
502    pvr_bo_free(rt_dataset->device, rt_dataset->tpc_bo);
503    rt_dataset->tpc_bo = NULL;
504 }
505 
506 static uint32_t
507 pvr_rt_get_mlist_size(const struct pvr_free_list *global_free_list,
508                       const struct pvr_free_list *local_free_list)
509 {
510    uint32_t num_pte_pages;
511    uint32_t num_pde_pages;
512    uint32_t num_pce_pages;
513    uint64_t total_pages;
514    uint32_t mlist_size;
515 
516    assert(global_free_list->size + local_free_list->size <=
517           ROGUE_PM_MAX_PB_VIRT_ADDR_SPACE);
518 
519    total_pages = (global_free_list->size + local_free_list->size) >>
520                  ROGUE_BIF_PM_PHYSICAL_PAGE_SHIFT;
521 
522    /* Calculate the total number of physical pages required to hold the page
523     * table, directory and catalog entries for the freelist pages.
524     */
525    num_pte_pages = DIV_ROUND_UP(total_pages, ROGUE_NUM_PT_ENTRIES_PER_PAGE);
526    num_pde_pages = DIV_ROUND_UP(num_pte_pages, ROGUE_NUM_PD_ENTRIES_PER_PAGE);
527    num_pce_pages = DIV_ROUND_UP(num_pde_pages, ROGUE_NUM_PC_ENTRIES_PER_PAGE);
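   /* Illustrative example (assumed sizes): a combined 64 MiB PB holds 16384
    * PM physical pages; with 0x200 PT entries per page that needs 32 PT
    * pages, which in turn fit within a single PD page and a single PC page.
    */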
528 
529    /* Calculate the MList size, considering that the total number of pages in
530     * the PB is shared among all the PM address spaces.
531     */
532    mlist_size = (num_pce_pages + num_pde_pages + num_pte_pages) *
533                 ROGUE_NUM_PM_ADDRESS_SPACES * ROGUE_MLIST_ENTRY_STRIDE;
534 
535    return ALIGN_POT(mlist_size, ROGUE_BIF_PM_PHYSICAL_PAGE_SIZE);
536 }
537 
538 static void pvr_rt_get_region_headers_stride_size(
539    const struct pvr_device *device,
540    const struct pvr_rt_mtile_info *mtile_info,
541    uint32_t layers,
542    uint64_t *const stride_out,
543    uint64_t *const size_out)
544 {
545    const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
546    const uint32_t single_rgn_header_size =
547       rogue_get_region_header_size(dev_info);
548    uint64_t rgn_headers_size;
549    uint32_t num_tiles_x;
550    uint32_t num_tiles_y;
551    uint32_t group_size;
552    uint32_t version;
553 
554    if (PVR_FEATURE_VALUE(dev_info, simple_parameter_format_version, &version))
555       version = 0;
556 
557    group_size = version == 2 ? 2 : 1;
558 
559    num_tiles_x = mtile_info->mtiles_x * mtile_info->tiles_per_mtile_x;
560    num_tiles_y = mtile_info->mtiles_y * mtile_info->tiles_per_mtile_y;
561 
562    rgn_headers_size = (uint64_t)num_tiles_x / group_size;
563    /* Careful here. We want the division to happen first. */
564    rgn_headers_size *= num_tiles_y / group_size;
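   /* For example, with 9x9 tiles and a group size of 2 each dimension
    * truncates to 4 groups, giving 16 region headers per layer rather than
    * the 20 that dividing the 81-tile product by 4 would give.
    */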
565    rgn_headers_size *= single_rgn_header_size;
566 
567    if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) {
568       rgn_headers_size =
569          ALIGN_POT(rgn_headers_size, PVRX(CR_TE_PSGREGION_ADDR_BASE_ALIGNMENT));
570    }
571 
572    if (layers > 1) {
573       rgn_headers_size =
574          ALIGN_POT(rgn_headers_size, PVRX(CR_TE_PSG_REGION_STRIDE_UNIT_SIZE));
575    }
576 
577    *stride_out = rgn_headers_size;
578    *size_out = rgn_headers_size * layers;
579 }
580 
581 static VkResult
582 pvr_rt_mta_mlist_data_init(struct pvr_device *device,
583                            struct pvr_rt_dataset *rt_dataset,
584                            const struct pvr_free_list *global_free_list,
585                            const struct pvr_free_list *local_free_list,
586                            const struct pvr_rt_mtile_info *mtile_info)
587 {
588    const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
589    const uint32_t mlist_size =
590       pvr_rt_get_mlist_size(global_free_list, local_free_list);
591    uint32_t mta_size = rogue_get_macrotile_array_size(dev_info);
592    const uint32_t num_rt_datas = ARRAY_SIZE(rt_dataset->rt_datas);
593    uint32_t rt_datas_mlist_size;
594    uint32_t rt_datas_mta_size;
595    pvr_dev_addr_t dev_addr;
596    VkResult result;
597 
598    /* Allocate memory for macrotile array and Mlist for all RT datas.
599     *
600     * Allocation layout: MTA[0..N] + Mlist alignment padding + Mlist[0..N].
601     *
602     * N is number of RT datas.
603     */
604    rt_datas_mta_size = ALIGN_POT(mta_size * num_rt_datas,
605                                  PVRX(CR_PM_MLIST0_BASE_ADDR_ALIGNMENT));
606    rt_datas_mlist_size = mlist_size * num_rt_datas;
607 
608    result = pvr_bo_alloc(device,
609                          device->heaps.general_heap,
610                          rt_datas_mta_size + rt_datas_mlist_size,
611                          PVRX(CR_PM_MTILE_ARRAY_BASE_ADDR_ALIGNMENT),
612                          PVR_BO_ALLOC_FLAG_GPU_UNCACHED,
613                          &rt_dataset->mta_mlist_bo);
614    if (result != VK_SUCCESS)
615       return result;
616 
617    dev_addr = rt_dataset->mta_mlist_bo->vma->dev_addr;
618 
619    for (uint32_t i = 0; i < num_rt_datas; i++) {
620       if (mta_size != 0) {
621          rt_dataset->rt_datas[i].mta_dev_addr = dev_addr;
622          dev_addr = PVR_DEV_ADDR_OFFSET(dev_addr, mta_size);
623       } else {
624          rt_dataset->rt_datas[i].mta_dev_addr = PVR_DEV_ADDR_INVALID;
625       }
626    }
627 
628    dev_addr = PVR_DEV_ADDR_OFFSET(rt_dataset->mta_mlist_bo->vma->dev_addr,
629                                   rt_datas_mta_size);
630 
631    for (uint32_t i = 0; i < num_rt_datas; i++) {
632       if (mlist_size != 0) {
633          rt_dataset->rt_datas[i].mlist_dev_addr = dev_addr;
634          dev_addr = PVR_DEV_ADDR_OFFSET(dev_addr, mlist_size);
635       } else {
636          rt_dataset->rt_datas[i].mlist_dev_addr = PVR_DEV_ADDR_INVALID;
637       }
638    }
639 
640    return VK_SUCCESS;
641 }
642 
643 static void pvr_rt_mta_mlist_data_fini(struct pvr_rt_dataset *rt_dataset)
644 {
645    for (uint32_t i = 0; i < ARRAY_SIZE(rt_dataset->rt_datas); i++) {
646       rt_dataset->rt_datas[i].mlist_dev_addr = PVR_DEV_ADDR_INVALID;
647       rt_dataset->rt_datas[i].mta_dev_addr = PVR_DEV_ADDR_INVALID;
648    }
649 
650    pvr_bo_free(rt_dataset->device, rt_dataset->mta_mlist_bo);
651    rt_dataset->mta_mlist_bo = NULL;
652 }
653 
654 static VkResult
655 pvr_rt_rgn_headers_data_init(struct pvr_device *device,
656                              struct pvr_rt_dataset *rt_dataset,
657                              const struct pvr_rt_mtile_info *mtile_info,
658                              uint32_t layers)
659 {
660    const uint32_t num_rt_datas = ARRAY_SIZE(rt_dataset->rt_datas);
661    uint64_t rgn_headers_size;
662    pvr_dev_addr_t dev_addr;
663    VkResult result;
664 
665    pvr_rt_get_region_headers_stride_size(device,
666                                          mtile_info,
667                                          layers,
668                                          &rt_dataset->rgn_headers_stride,
669                                          &rgn_headers_size);
670 
671    result = pvr_bo_alloc(device,
672                          device->heaps.rgn_hdr_heap,
673                          rgn_headers_size * num_rt_datas,
674                          PVRX(CR_TE_PSGREGION_ADDR_BASE_ALIGNMENT),
675                          PVR_BO_ALLOC_FLAG_GPU_UNCACHED,
676                          &rt_dataset->rgn_headers_bo);
677    if (result != VK_SUCCESS)
678       return result;
679 
680    dev_addr = rt_dataset->rgn_headers_bo->vma->dev_addr;
681 
682    for (uint32_t i = 0; i < num_rt_datas; i++) {
683       rt_dataset->rt_datas[i].rgn_headers_dev_addr = dev_addr;
684       dev_addr = PVR_DEV_ADDR_OFFSET(dev_addr, rgn_headers_size);
685    }
686 
687    return VK_SUCCESS;
688 }
689 
690 static void pvr_rt_rgn_headers_data_fini(struct pvr_rt_dataset *rt_dataset)
691 {
692    for (uint32_t i = 0; i < ARRAY_SIZE(rt_dataset->rt_datas); i++)
693       rt_dataset->rt_datas[i].rgn_headers_dev_addr = PVR_DEV_ADDR_INVALID;
694 
695    pvr_bo_free(rt_dataset->device, rt_dataset->rgn_headers_bo);
696    rt_dataset->rgn_headers_bo = NULL;
697 }
698 
699 static VkResult pvr_rt_datas_init(struct pvr_device *device,
700                                   struct pvr_rt_dataset *rt_dataset,
701                                   const struct pvr_free_list *global_free_list,
702                                   const struct pvr_free_list *local_free_list,
703                                   const struct pvr_rt_mtile_info *mtile_info,
704                                   uint32_t layers)
705 {
706    VkResult result;
707 
708    result = pvr_rt_mta_mlist_data_init(device,
709                                        rt_dataset,
710                                        global_free_list,
711                                        local_free_list,
712                                        mtile_info);
713    if (result != VK_SUCCESS)
714       return result;
715 
716    result =
717       pvr_rt_rgn_headers_data_init(device, rt_dataset, mtile_info, layers);
718    if (result != VK_SUCCESS)
719       goto err_pvr_rt_mta_mlist_data_fini;
720 
721    return VK_SUCCESS;
722 
723 err_pvr_rt_mta_mlist_data_fini:
724    pvr_rt_mta_mlist_data_fini(rt_dataset);
725 
726    return result;
727 }
728 
729 static void pvr_rt_datas_fini(struct pvr_rt_dataset *rt_dataset)
730 {
731    pvr_rt_rgn_headers_data_fini(rt_dataset);
732    pvr_rt_mta_mlist_data_fini(rt_dataset);
733 }
734 
735 static void pvr_rt_dataset_ws_create_info_init(
736    struct pvr_rt_dataset *rt_dataset,
737    const struct pvr_rt_mtile_info *mtile_info,
738    struct pvr_winsys_rt_dataset_create_info *create_info)
739 {
740    struct pvr_device *device = rt_dataset->device;
741    const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
742 
743    memset(create_info, 0, sizeof(*create_info));
744 
745    /* Local freelist. */
746    create_info->local_free_list = rt_dataset->local_free_list->ws_free_list;
747 
748    create_info->width = rt_dataset->width;
749    create_info->height = rt_dataset->height;
750    create_info->samples = rt_dataset->samples;
751    create_info->layers = rt_dataset->layers;
752 
753    /* ISP register values. */
754    if (PVR_HAS_ERN(dev_info, 42307) &&
755        !(PVR_HAS_FEATURE(dev_info, roguexe) && mtile_info->tile_size_x == 16)) {
756       float value;
757 
758       if (rt_dataset->width != 0) {
759          value =
760             ROGUE_ISP_MERGE_LOWER_LIMIT_NUMERATOR / (float)rt_dataset->width;
761          create_info->isp_merge_lower_x = fui(value);
762 
763          value =
764             ROGUE_ISP_MERGE_UPPER_LIMIT_NUMERATOR / (float)rt_dataset->width;
765          create_info->isp_merge_upper_x = fui(value);
766       }
767 
768       if (rt_dataset->height != 0) {
769          value =
770             ROGUE_ISP_MERGE_LOWER_LIMIT_NUMERATOR / (float)rt_dataset->height;
771          create_info->isp_merge_lower_y = fui(value);
772 
773          value =
774             ROGUE_ISP_MERGE_UPPER_LIMIT_NUMERATOR / (float)rt_dataset->height;
775          create_info->isp_merge_upper_y = fui(value);
776       }
777 
778       value = ((float)rt_dataset->width * ROGUE_ISP_MERGE_SCALE_FACTOR) /
779               (ROGUE_ISP_MERGE_UPPER_LIMIT_NUMERATOR -
780                ROGUE_ISP_MERGE_LOWER_LIMIT_NUMERATOR);
781       create_info->isp_merge_scale_x = fui(value);
782 
783       value = ((float)rt_dataset->height * ROGUE_ISP_MERGE_SCALE_FACTOR) /
784               (ROGUE_ISP_MERGE_UPPER_LIMIT_NUMERATOR -
785                ROGUE_ISP_MERGE_LOWER_LIMIT_NUMERATOR);
786       create_info->isp_merge_scale_y = fui(value);
787    }
788 
789    /* Allocations and associated information. */
790    create_info->vheap_table_dev_addr = rt_dataset->vheap_dev_addr;
791    create_info->rtc_dev_addr = rt_dataset->rtc_dev_addr;
792 
793    create_info->tpc_dev_addr = rt_dataset->tpc_bo->vma->dev_addr;
794    create_info->tpc_stride = rt_dataset->tpc_stride;
795    create_info->tpc_size = rt_dataset->tpc_size;
796 
797    STATIC_ASSERT(ARRAY_SIZE(create_info->rt_datas) ==
798                  ARRAY_SIZE(rt_dataset->rt_datas));
799    for (uint32_t i = 0; i < ARRAY_SIZE(create_info->rt_datas); i++) {
800       create_info->rt_datas[i].pm_mlist_dev_addr =
801          rt_dataset->rt_datas[i].mlist_dev_addr;
802       create_info->rt_datas[i].macrotile_array_dev_addr =
803          rt_dataset->rt_datas[i].mta_dev_addr;
804       create_info->rt_datas[i].rgn_header_dev_addr =
805          rt_dataset->rt_datas[i].rgn_headers_dev_addr;
806    }
807 
808    create_info->rgn_header_size =
809       pvr_rt_get_isp_region_size(device, mtile_info);
810 }
811 
812 VkResult
813 pvr_render_target_dataset_create(struct pvr_device *device,
814                                  uint32_t width,
815                                  uint32_t height,
816                                  uint32_t samples,
817                                  uint32_t layers,
818                                  struct pvr_rt_dataset **const rt_dataset_out)
819 {
820    struct pvr_device_runtime_info *runtime_info =
821       &device->pdevice->dev_runtime_info;
822    const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
823    struct pvr_winsys_rt_dataset_create_info rt_dataset_create_info;
824    struct pvr_rt_mtile_info mtile_info;
825    struct pvr_rt_dataset *rt_dataset;
826    VkResult result;
827 
828    assert(device->global_free_list);
829    assert(width <= rogue_get_render_size_max_x(dev_info));
830    assert(height <= rogue_get_render_size_max_y(dev_info));
831    assert(layers > 0 && layers <= PVR_MAX_FRAMEBUFFER_LAYERS);
832 
833    pvr_rt_mtile_info_init(dev_info, &mtile_info, width, height, samples);
834 
835    rt_dataset = vk_zalloc(&device->vk.alloc,
836                           sizeof(*rt_dataset),
837                           8,
838                           VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
839    if (!rt_dataset)
840       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
841 
842    rt_dataset->device = device;
843    rt_dataset->width = width;
844    rt_dataset->height = height;
845    rt_dataset->samples = samples;
846    rt_dataset->layers = layers;
847    rt_dataset->global_free_list = device->global_free_list;
848 
849    /* The maximum supported free list size is based on the assumption that this
850     * freelist (the "local" freelist) is always the minimum size required by
851     * the hardware. See the documentation of ROGUE_FREE_LIST_MAX_SIZE for more
852     * details.
853     */
854    result = pvr_free_list_create(device,
855                                  runtime_info->min_free_list_size,
856                                  runtime_info->min_free_list_size,
857                                  0 /* grow_size */,
858                                  0 /* grow_threshold */,
859                                  rt_dataset->global_free_list,
860                                  &rt_dataset->local_free_list);
861    if (result != VK_SUCCESS)
862       goto err_vk_free_rt_dataset;
863 
864    result = pvr_rt_vheap_rtc_data_init(device, rt_dataset, layers);
865    if (result != VK_SUCCESS)
866       goto err_pvr_free_list_destroy;
867 
868    result = pvr_rt_tpc_data_init(device, rt_dataset, &mtile_info, layers);
869    if (result != VK_SUCCESS)
870       goto err_pvr_rt_vheap_rtc_data_fini;
871 
872    result = pvr_rt_datas_init(device,
873                               rt_dataset,
874                               rt_dataset->global_free_list,
875                               rt_dataset->local_free_list,
876                               &mtile_info,
877                               layers);
878    if (result != VK_SUCCESS)
879       goto err_pvr_rt_tpc_data_fini;
880 
881    /* rt_dataset must be fully initialized by this point since
882     * pvr_rt_dataset_ws_create_info_init() depends on this.
883     */
884    pvr_rt_dataset_ws_create_info_init(rt_dataset,
885                                       &mtile_info,
886                                       &rt_dataset_create_info);
887 
888    result =
889       device->ws->ops->render_target_dataset_create(device->ws,
890                                                     &rt_dataset_create_info,
891                                                     dev_info,
892                                                     &rt_dataset->ws_rt_dataset);
893    if (result != VK_SUCCESS)
894       goto err_pvr_rt_datas_fini;
895 
896    *rt_dataset_out = rt_dataset;
897 
898    return VK_SUCCESS;
899 
900 err_pvr_rt_datas_fini:
901    pvr_rt_datas_fini(rt_dataset);
902 
903 err_pvr_rt_tpc_data_fini:
904    pvr_rt_tpc_data_fini(rt_dataset);
905 
906 err_pvr_rt_vheap_rtc_data_fini:
907    pvr_rt_vheap_rtc_data_fini(rt_dataset);
908 
909 err_pvr_free_list_destroy:
910    pvr_free_list_destroy(rt_dataset->local_free_list);
911 
912 err_vk_free_rt_dataset:
913    vk_free(&device->vk.alloc, rt_dataset);
914 
915    return result;
916 }
917 
918 void pvr_render_target_dataset_destroy(struct pvr_rt_dataset *rt_dataset)
919 {
920    struct pvr_device *device = rt_dataset->device;
921 
922    device->ws->ops->render_target_dataset_destroy(rt_dataset->ws_rt_dataset);
923 
924    pvr_rt_datas_fini(rt_dataset);
925    pvr_rt_tpc_data_fini(rt_dataset);
926    pvr_rt_vheap_rtc_data_fini(rt_dataset);
927 
928    pvr_free_list_destroy(rt_dataset->local_free_list);
929 
930    vk_free(&device->vk.alloc, rt_dataset);
931 }
932 
933 static void pvr_geom_state_stream_init(struct pvr_render_ctx *ctx,
934                                        struct pvr_render_job *job,
935                                        struct pvr_winsys_geometry_state *state)
936 {
937    const struct pvr_device *const device = ctx->device;
938    const struct pvr_device_info *const dev_info = &device->pdevice->dev_info;
939 
940    uint32_t *stream_ptr = (uint32_t *)state->fw_stream;
941    uint32_t *stream_len_ptr = stream_ptr;
942 
943    /* Leave space for stream header. */
944    stream_ptr += pvr_cmd_length(KMD_STREAM_HDR);
945 
946    pvr_csb_pack ((uint64_t *)stream_ptr, CR_VDM_CTRL_STREAM_BASE, value) {
947       value.addr = job->ctrl_stream_addr;
948    }
949    stream_ptr += pvr_cmd_length(CR_VDM_CTRL_STREAM_BASE);
950 
951    pvr_csb_pack ((uint64_t *)stream_ptr,
952                  CR_TPU_BORDER_COLOUR_TABLE_VDM,
953                  value) {
954       value.border_colour_table_address =
955          device->border_color_table.table->vma->dev_addr;
956    }
957    stream_ptr += pvr_cmd_length(CR_TPU_BORDER_COLOUR_TABLE_VDM);
958 
959    pvr_csb_pack (stream_ptr, CR_PPP_CTRL, value) {
960       value.wclampen = true;
961       value.fixed_point_format = 1;
962    }
963    stream_ptr += pvr_cmd_length(CR_PPP_CTRL);
964 
965    pvr_csb_pack (stream_ptr, CR_TE_PSG, value) {
966       value.completeonterminate = job->geometry_terminate;
967 
968       value.region_stride = job->rt_dataset->rgn_headers_stride /
969                             PVRX(CR_TE_PSG_REGION_STRIDE_UNIT_SIZE);
970 
971       value.forcenewstate = PVR_HAS_QUIRK(dev_info, 52942);
972    }
973    stream_ptr += pvr_cmd_length(CR_TE_PSG);
974 
975    /* Set up the USC common size for the context switch resume/load program
976     * (ctx->ctx_switch.programs[i].sr->pds_load_program), which was created
977     * as part of the render context.
978     */
979    pvr_csb_pack (stream_ptr, VDMCTRL_PDS_STATE0, value) {
980       /* Calculate the size in bytes. */
981       const uint16_t shared_registers_size = job->max_shared_registers * 4;
982 
983       value.usc_common_size =
984          DIV_ROUND_UP(shared_registers_size,
985                       PVRX(VDMCTRL_PDS_STATE0_USC_COMMON_SIZE_UNIT_SIZE));
986    }
987    stream_ptr += pvr_cmd_length(VDMCTRL_PDS_STATE0);
988 
989    /* clang-format off */
990    pvr_csb_pack (stream_ptr, KMD_STREAM_VIEW_IDX, value);
991    /* clang-format on */
992    stream_ptr += pvr_cmd_length(KMD_STREAM_VIEW_IDX);
993 
994    state->fw_stream_len = (uint8_t *)stream_ptr - (uint8_t *)state->fw_stream;
995    assert(state->fw_stream_len <= ARRAY_SIZE(state->fw_stream));
996 
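   /* Patch the stream header, which was skipped over above, now that the
    * final stream length is known.
    */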
997    pvr_csb_pack ((uint64_t *)stream_len_ptr, KMD_STREAM_HDR, value) {
998       value.length = state->fw_stream_len;
999    }
1000 }
1001 
1002 static void
1003 pvr_geom_state_stream_ext_init(struct pvr_render_ctx *ctx,
1004                                struct pvr_render_job *job,
1005                                struct pvr_winsys_geometry_state *state)
1006 {
1007    const struct pvr_device_info *dev_info = &ctx->device->pdevice->dev_info;
1008 
1009    uint32_t main_stream_len =
1010       pvr_csb_unpack((uint64_t *)state->fw_stream, KMD_STREAM_HDR).length;
1011    uint32_t *ext_stream_ptr =
1012       (uint32_t *)((uint8_t *)state->fw_stream + main_stream_len);
1013    uint32_t *header0_ptr;
1014 
1015    header0_ptr = ext_stream_ptr;
1016    ext_stream_ptr += pvr_cmd_length(KMD_STREAM_EXTHDR_GEOM0);
1017 
1018    pvr_csb_pack (header0_ptr, KMD_STREAM_EXTHDR_GEOM0, header0) {
1019       if (PVR_HAS_QUIRK(dev_info, 49927)) {
1020          header0.has_brn49927 = true;
1021 
1022          /* The set up of CR_TPU must be identical to
1023           * pvr_render_job_ws_fragment_state_stream_ext_init().
1024           */
1025          pvr_csb_pack (ext_stream_ptr, CR_TPU, value) {
1026             value.tag_cem_4k_face_packing = true;
1027          }
1028          ext_stream_ptr += pvr_cmd_length(CR_TPU);
1029       }
1030    }
1031 
1032    if ((*header0_ptr & PVRX(KMD_STREAM_EXTHDR_DATA_MASK)) != 0) {
1033       state->fw_stream_len =
1034          (uint8_t *)ext_stream_ptr - (uint8_t *)state->fw_stream;
1035       assert(state->fw_stream_len <= ARRAY_SIZE(state->fw_stream));
1036    }
1037 }
1038 
1039 static void
1040 pvr_geom_state_flags_init(const struct pvr_render_job *const job,
1041                           struct pvr_winsys_geometry_state_flags *flags)
1042 {
1043    *flags = (struct pvr_winsys_geometry_state_flags){
1044       .is_first_geometry = !job->rt_dataset->need_frag,
1045       .is_last_geometry = job->geometry_terminate,
1046       .use_single_core = job->frag_uses_atomic_ops,
1047    };
1048 }
1049 
1050 static void
1051 pvr_render_job_ws_geometry_state_init(struct pvr_render_ctx *ctx,
1052                                       struct pvr_render_job *job,
1053                                       struct vk_sync *wait,
1054                                       struct pvr_winsys_geometry_state *state)
1055 {
1056    pvr_geom_state_stream_init(ctx, job, state);
1057    pvr_geom_state_stream_ext_init(ctx, job, state);
1058 
1059    state->wait = wait;
1060    pvr_geom_state_flags_init(job, &state->flags);
1061 }
1062 
1063 static inline uint32_t pvr_frag_km_stream_pbe_reg_words_offset(
1064    const struct pvr_device_info *const dev_info)
1065 {
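   /* Byte offset of the PBE register words within the firmware fragment
    * stream. The accumulation below mirrors the packing order in
    * pvr_frag_state_stream_init(), which asserts that the two stay in sync.
    */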
1066    uint32_t offset = 0;
1067 
1068    offset += pvr_cmd_length(KMD_STREAM_HDR);
1069    offset += pvr_cmd_length(CR_ISP_SCISSOR_BASE);
1070    offset += pvr_cmd_length(CR_ISP_DBIAS_BASE);
1071    offset += pvr_cmd_length(CR_ISP_OCLQRY_BASE);
1072    offset += pvr_cmd_length(CR_ISP_ZLSCTL);
1073    offset += pvr_cmd_length(CR_ISP_ZLOAD_BASE);
1074    offset += pvr_cmd_length(CR_ISP_STENCIL_LOAD_BASE);
1075 
1076    if (PVR_HAS_FEATURE(dev_info, requires_fb_cdc_zls_setup))
1077       offset += pvr_cmd_length(CR_FB_CDC_ZLS);
1078 
1079    return PVR_DW_TO_BYTES(offset);
1080 }
1081 
1082 #define DWORDS_PER_U64 2
1083 
1084 static inline uint32_t pvr_frag_km_stream_pds_eot_data_addr_offset(
1085    const struct pvr_device_info *const dev_info)
1086 {
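   /* Byte offset of the PDS EOT data address within the firmware fragment
    * stream, again mirroring the packing order in
    * pvr_frag_state_stream_init() and checked by an assert there.
    */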
1087    uint32_t offset = 0;
1088 
1089    offset += pvr_frag_km_stream_pbe_reg_words_offset(dev_info) / 4U;
1090    offset +=
1091       PVR_MAX_COLOR_ATTACHMENTS * ROGUE_NUM_PBESTATE_REG_WORDS * DWORDS_PER_U64;
1092    offset += pvr_cmd_length(CR_TPU_BORDER_COLOUR_TABLE_PDM);
1093    offset += ROGUE_NUM_CR_PDS_BGRND_WORDS * DWORDS_PER_U64;
1094    offset += ROGUE_NUM_CR_PDS_BGRND_WORDS * DWORDS_PER_U64;
1095    offset += PVRX(KMD_STREAM_USC_CLEAR_REGISTER_COUNT) *
1096              pvr_cmd_length(CR_USC_CLEAR_REGISTER);
1097    offset += pvr_cmd_length(CR_USC_PIXEL_OUTPUT_CTRL);
1098    offset += pvr_cmd_length(CR_ISP_BGOBJDEPTH);
1099    offset += pvr_cmd_length(CR_ISP_BGOBJVALS);
1100    offset += pvr_cmd_length(CR_ISP_AA);
1101    offset += pvr_cmd_length(CR_ISP_CTL);
1102    offset += pvr_cmd_length(CR_EVENT_PIXEL_PDS_INFO);
1103 
1104    if (PVR_HAS_FEATURE(dev_info, cluster_grouping))
1105       offset += pvr_cmd_length(KMD_STREAM_PIXEL_PHANTOM);
1106 
1107    offset += pvr_cmd_length(KMD_STREAM_VIEW_IDX);
1108 
1109    return PVR_DW_TO_BYTES(offset);
1110 }
1111 
1112 static void pvr_frag_state_stream_init(struct pvr_render_ctx *ctx,
1113                                        struct pvr_render_job *job,
1114                                        struct pvr_winsys_fragment_state *state)
1115 {
1116    const struct pvr_device *const device = ctx->device;
1117    const struct pvr_physical_device *const pdevice = device->pdevice;
1118    const struct pvr_device_runtime_info *dev_runtime_info =
1119       &pdevice->dev_runtime_info;
1120    const struct pvr_device_info *dev_info = &pdevice->dev_info;
1121    const enum PVRX(CR_ISP_AA_MODE_TYPE)
1122       isp_aa_mode = pvr_cr_isp_aa_mode_type(job->samples);
1123 
1124    enum PVRX(CR_ZLS_FORMAT_TYPE) zload_format = PVRX(CR_ZLS_FORMAT_TYPE_F32Z);
1125    uint32_t *stream_ptr = (uint32_t *)state->fw_stream;
1126    uint32_t *stream_len_ptr = stream_ptr;
1127    uint32_t pixel_ctl;
1128    uint32_t isp_ctl;
1129 
1130    /* Leave space for stream header. */
1131    stream_ptr += pvr_cmd_length(KMD_STREAM_HDR);
1132 
1133    /* FIXME: pass in the number of samples rather than isp_aa_mode? */
1134    pvr_setup_tiles_in_flight(dev_info,
1135                              dev_runtime_info,
1136                              isp_aa_mode,
1137                              job->pixel_output_width,
1138                              false,
1139                              job->max_tiles_in_flight,
1140                              &isp_ctl,
1141                              &pixel_ctl);
1142 
1143    pvr_csb_pack ((uint64_t *)stream_ptr, CR_ISP_SCISSOR_BASE, value) {
1144       value.addr = job->scissor_table_addr;
1145    }
1146    stream_ptr += pvr_cmd_length(CR_ISP_SCISSOR_BASE);
1147 
1148    pvr_csb_pack ((uint64_t *)stream_ptr, CR_ISP_DBIAS_BASE, value) {
1149       value.addr = job->depth_bias_table_addr;
1150    }
1151    stream_ptr += pvr_cmd_length(CR_ISP_DBIAS_BASE);
1152 
1153    pvr_csb_pack ((uint64_t *)stream_ptr, CR_ISP_OCLQRY_BASE, value) {
1154       const struct pvr_sub_cmd_gfx *sub_cmd =
1155          container_of(job, const struct pvr_sub_cmd_gfx, job);
1156 
1157       if (sub_cmd->query_pool)
1158          value.addr = sub_cmd->query_pool->result_buffer->dev_addr;
1159       else
1160          value.addr = PVR_DEV_ADDR_INVALID;
1161    }
1162    stream_ptr += pvr_cmd_length(CR_ISP_OCLQRY_BASE);
1163 
1164    pvr_csb_pack ((uint64_t *)stream_ptr, CR_ISP_ZLSCTL, value) {
1165       if (job->has_depth_attachment || job->has_stencil_attachment) {
1166          uint32_t alignment_x;
1167          uint32_t alignment_y;
1168 
1169          if (job->ds.has_alignment_transfers) {
1170             rogue_get_zls_tile_size_xy(dev_info, &alignment_x, &alignment_y);
1171          } else {
1172             alignment_x = ROGUE_IPF_TILE_SIZE_PIXELS;
1173             alignment_y = ROGUE_IPF_TILE_SIZE_PIXELS;
1174          }
1175 
1176          rogue_get_isp_num_tiles_xy(
1177             dev_info,
1178             job->samples,
1179             ALIGN_POT(job->ds.physical_extent.width, alignment_x),
1180             ALIGN_POT(job->ds.physical_extent.height, alignment_y),
1181             &value.zlsextent_x_z,
1182             &value.zlsextent_y_z);
1183 
1184          value.zlsextent_x_z -= 1;
1185          value.zlsextent_y_z -= 1;
1186 
1187          if (job->ds.memlayout == PVR_MEMLAYOUT_TWIDDLED &&
1188              !job->ds.has_alignment_transfers) {
1189             value.loadtwiddled = true;
1190             value.storetwiddled = true;
1191          }
1192 
1193          value.zloadformat = job->ds.zls_format;
1194          value.zstoreformat = job->ds.zls_format;
1195 
1196          zload_format = value.zloadformat;
1197       }
1198 
1199       if (job->has_depth_attachment) {
1200          value.zloaden = job->ds.load.d;
1201          value.zstoreen = job->ds.store.d;
1202       }
1203 
1204       if (job->has_stencil_attachment) {
1205          value.sloaden = job->ds.load.s;
1206          value.sstoreen = job->ds.store.s;
1207       }
1208 
1209       value.forcezload = value.zloaden || value.sloaden;
1210       value.forcezstore = value.zstoreen || value.sstoreen;
1211    }
1212    stream_ptr += pvr_cmd_length(CR_ISP_ZLSCTL);
1213 
1214    pvr_csb_pack ((uint64_t *)stream_ptr, CR_ISP_ZLOAD_BASE, value) {
1215       if (job->has_depth_attachment)
1216          value.addr = job->ds.addr;
1217    }
1218    stream_ptr += pvr_cmd_length(CR_ISP_ZLOAD_BASE);
1219 
1220    pvr_csb_pack ((uint64_t *)stream_ptr, CR_ISP_STENCIL_LOAD_BASE, value) {
1221       if (job->has_stencil_attachment) {
1222          value.addr = job->ds.addr;
1223 
1224          /* Enable separate stencil. This should be enabled iff the buffer set
1225           * in CR_ISP_STENCIL_LOAD_BASE does not contain a depth component.
1226           */
1227          assert(job->has_depth_attachment ||
1228                 !pvr_zls_format_type_is_packed(job->ds.zls_format));
1229          value.enable = !job->has_depth_attachment;
1230       }
1231    }
1232    stream_ptr += pvr_cmd_length(CR_ISP_STENCIL_LOAD_BASE);
1233 
1234    if (PVR_HAS_FEATURE(dev_info, requires_fb_cdc_zls_setup)) {
1235       /* Currently no support for FBC, so just go ahead and set the default
1236        * values.
1237        */
1238       pvr_csb_pack ((uint64_t *)stream_ptr, CR_FB_CDC_ZLS, value) {
1239          value.fbdc_depth_fmt = PVRX(TEXSTATE_FORMAT_F32);
1240          value.fbdc_stencil_fmt = PVRX(TEXSTATE_FORMAT_U8);
1241       }
1242       stream_ptr += pvr_cmd_length(CR_FB_CDC_ZLS);
1243    }
1244 
1245    /* Make sure that the pvr_frag_km_...() function is returning the correct
1246     * offset.
1247     */
1248    assert((uint8_t *)stream_ptr - (uint8_t *)state->fw_stream ==
1249           pvr_frag_km_stream_pbe_reg_words_offset(dev_info));
1250 
1251    STATIC_ASSERT(ARRAY_SIZE(job->pbe_reg_words) == PVR_MAX_COLOR_ATTACHMENTS);
1252    STATIC_ASSERT(ARRAY_SIZE(job->pbe_reg_words[0]) ==
1253                  ROGUE_NUM_PBESTATE_REG_WORDS);
1254    STATIC_ASSERT(sizeof(job->pbe_reg_words[0][0]) == sizeof(uint64_t));
1255    memcpy(stream_ptr, job->pbe_reg_words, sizeof(job->pbe_reg_words));
1256    stream_ptr +=
1257       PVR_MAX_COLOR_ATTACHMENTS * ROGUE_NUM_PBESTATE_REG_WORDS * DWORDS_PER_U64;
1258 
1259    pvr_csb_pack ((uint64_t *)stream_ptr,
1260                  CR_TPU_BORDER_COLOUR_TABLE_PDM,
1261                  value) {
1262       value.border_colour_table_address =
1263          device->border_color_table.table->vma->dev_addr;
1264    }
1265    stream_ptr += pvr_cmd_length(CR_TPU_BORDER_COLOUR_TABLE_PDM);
1266 
1267    STATIC_ASSERT(ARRAY_SIZE(job->pds_bgnd_reg_values) ==
1268                  ROGUE_NUM_CR_PDS_BGRND_WORDS);
1269    STATIC_ASSERT(sizeof(job->pds_bgnd_reg_values[0]) == sizeof(uint64_t));
1270    memcpy(stream_ptr,
1271           job->pds_bgnd_reg_values,
1272           sizeof(job->pds_bgnd_reg_values));
1273    stream_ptr += ROGUE_NUM_CR_PDS_BGRND_WORDS * DWORDS_PER_U64;
1274 
1275    STATIC_ASSERT(ARRAY_SIZE(job->pds_pr_bgnd_reg_values) ==
1276                  ROGUE_NUM_CR_PDS_BGRND_WORDS);
1277    STATIC_ASSERT(sizeof(job->pds_pr_bgnd_reg_values[0]) == sizeof(uint64_t));
1278    memcpy(stream_ptr,
1279           job->pds_pr_bgnd_reg_values,
1280           sizeof(job->pds_pr_bgnd_reg_values));
1281    stream_ptr += ROGUE_NUM_CR_PDS_BGRND_WORDS * DWORDS_PER_U64;
1282 
1283 #undef DWORDS_PER_U64
1284 
1285    memset(stream_ptr,
1286           0,
1287           PVRX(KMD_STREAM_USC_CLEAR_REGISTER_COUNT) *
1288              PVR_DW_TO_BYTES(pvr_cmd_length(CR_USC_CLEAR_REGISTER)));
1289    stream_ptr += PVRX(KMD_STREAM_USC_CLEAR_REGISTER_COUNT) *
1290                  pvr_cmd_length(CR_USC_CLEAR_REGISTER);
1291 
1292    *stream_ptr = pixel_ctl;
1293    stream_ptr += pvr_cmd_length(CR_USC_PIXEL_OUTPUT_CTRL);
1294 
1295    pvr_csb_pack (stream_ptr, CR_ISP_BGOBJDEPTH, value) {
1296       const float depth_clear = job->ds_clear_value.depth;
1297 
1298       /* This is valid even when we don't have a depth attachment because:
1299        *  - zload_format is set to a sensible default above, and
1300        *  - job->depth_clear_value is set to a sensible default in that case.
1301        */
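      /* For example, a depth clear of 1.0 packs as 0x3f800000 (fui) for F32Z,
       * and as 0xffff / 0xffffff for the 16-bit / 24-bit integer formats.
       */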
1302       switch (zload_format) {
1303       case PVRX(CR_ZLS_FORMAT_TYPE_F32Z):
1304          value.value = fui(depth_clear);
1305          break;
1306 
1307       case PVRX(CR_ZLS_FORMAT_TYPE_16BITINT):
1308          value.value = _mesa_float_to_unorm(depth_clear, 16);
1309          break;
1310 
1311       case PVRX(CR_ZLS_FORMAT_TYPE_24BITINT):
1312          value.value = _mesa_float_to_unorm(depth_clear, 24);
1313          break;
1314 
1315       default:
1316          unreachable("Unsupported depth format");
1317       }
1318    }
1319    stream_ptr += pvr_cmd_length(CR_ISP_BGOBJDEPTH);
1320 
1321    pvr_csb_pack (stream_ptr, CR_ISP_BGOBJVALS, value) {
1322       value.enablebgtag = job->enable_bg_tag;
1323 
1324       value.mask = true;
1325 
1326       value.stencil = job->ds_clear_value.stencil & 0xFF;
1327    }
1328    stream_ptr += pvr_cmd_length(CR_ISP_BGOBJVALS);
1329 
1330    pvr_csb_pack (stream_ptr, CR_ISP_AA, value) {
1331       value.mode = isp_aa_mode;
1332    }
1333    stream_ptr += pvr_cmd_length(CR_ISP_AA);
1334 
1335    pvr_csb_pack (stream_ptr, CR_ISP_CTL, value) {
1336       value.sample_pos = true;
1337       value.process_empty_tiles = job->process_empty_tiles;
1338 
1339       /* For integer depth formats we'll convert the specified floating point
1340        * depth bias values and specify them as integers. In this mode a depth
1341        * bias factor of 1.0 equates to 1 ULP of increase to the depth value.
1342        */
1343       value.dbias_is_int = PVR_HAS_ERN(dev_info, 42307) &&
1344                            pvr_zls_format_type_is_int(job->ds.zls_format);
1345    }
1346    /* FIXME: When pvr_setup_tiles_in_flight() is refactored it might be
1347     * possible to fully pack CR_ISP_CTL above rather than having to OR in part
1348     * of the value.
1349     */
1350    *stream_ptr |= isp_ctl;
1351    stream_ptr += pvr_cmd_length(CR_ISP_CTL);
1352 
1353    pvr_csb_pack (stream_ptr, CR_EVENT_PIXEL_PDS_INFO, value) {
1354       value.const_size =
1355          DIV_ROUND_UP(ctx->device->pixel_event_data_size_in_dwords,
1356                       PVRX(CR_EVENT_PIXEL_PDS_INFO_CONST_SIZE_UNIT_SIZE));
1357       value.temp_stride = 0;
1358       value.usc_sr_size =
1359          DIV_ROUND_UP(PVR_STATE_PBE_DWORDS,
1360                       PVRX(CR_EVENT_PIXEL_PDS_INFO_USC_SR_SIZE_UNIT_SIZE));
1361    }
1362    stream_ptr += pvr_cmd_length(CR_EVENT_PIXEL_PDS_INFO);
1363 
1364    if (PVR_HAS_FEATURE(dev_info, cluster_grouping)) {
1365       pvr_csb_pack (stream_ptr, KMD_STREAM_PIXEL_PHANTOM, value) {
1366          /* Each phantom has its own MCU, so atomicity can only be guaranteed
1367           * when all work items are processed on the same phantom. This means
1368           * we need to disable all USCs other than those of the first
1369           * phantom, which has 4 clusters. Note that we only need to do this
1370           * for atomic operations in fragment shaders, since the hardware
1371           * prevents the TA from running on more than one phantom anyway.
1372           */
1373          /* Note that leaving the phantom fields zeroed (as csbgen does by
1374           * default) leaves the phantoms in their default state (i.e.
1375           * enabled) rather than disabling them.
1376           */
1377          if (PVR_HAS_FEATURE(dev_info, slc_mcu_cache_controls) &&
1378              dev_runtime_info->num_phantoms > 1 && job->frag_uses_atomic_ops) {
1379             value.phantom_0 = PVRX(KMD_STREAM_PIXEL_PHANTOM_STATE_ENABLED);
1380          }
1381       }
1382       stream_ptr += pvr_cmd_length(KMD_STREAM_PIXEL_PHANTOM);
1383    }
1384 
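   /* The view index is left at its default value of zero. */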
1385    /* clang-format off */
1386    pvr_csb_pack (stream_ptr, KMD_STREAM_VIEW_IDX, value);
1387    /* clang-format on */
1388    stream_ptr += pvr_cmd_length(KMD_STREAM_VIEW_IDX);
1389 
1390    /* Make sure that the pvr_frag_km_...() function is returning the correct
1391     * offset.
1392     */
1393    assert((uint8_t *)stream_ptr - (uint8_t *)state->fw_stream ==
1394           pvr_frag_km_stream_pds_eot_data_addr_offset(dev_info));
1395 
1396    pvr_csb_pack (stream_ptr, CR_EVENT_PIXEL_PDS_DATA, value) {
1397       value.addr = PVR_DEV_ADDR(job->pds_pixel_event_data_offset);
1398    }
1399    stream_ptr += pvr_cmd_length(CR_EVENT_PIXEL_PDS_DATA);
1400 
1401    if (PVR_HAS_FEATURE(dev_info, gpu_multicore_support)) {
1402       pvr_finishme(
1403          "Emit isp_oclqry_stride when feature gpu_multicore_support is present");
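      /* Placeholder: leave isp_oclqry_stride as zero for now. */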
1404       *stream_ptr = 0;
1405       stream_ptr++;
1406    }
1407 
1408    if (PVR_HAS_FEATURE(dev_info, zls_subtile)) {
1409       pvr_csb_pack (stream_ptr, CR_ISP_ZLS_PIXELS, value) {
1410          if (job->has_depth_attachment) {
1411             if (job->ds.has_alignment_transfers) {
1412                value.x = job->ds.physical_extent.width - 1;
1413                value.y = job->ds.physical_extent.height - 1;
1414             } else {
1415                value.x = job->ds.stride - 1;
1416                value.y = job->ds.height - 1;
1417             }
1418          }
1419       }
1420       stream_ptr += pvr_cmd_length(CR_ISP_ZLS_PIXELS);
1421    }
1422 
1423    /* zls_stride */
1424    *stream_ptr = job->has_depth_attachment ? job->ds.layer_size : 0;
1425    stream_ptr++;
1426 
1427    /* sls_stride */
1428    *stream_ptr = job->has_stencil_attachment ? job->ds.layer_size : 0;
1429    stream_ptr++;
1430 
1431    if (PVR_HAS_FEATURE(dev_info, gpu_multicore_support)) {
1432       pvr_finishme(
1433          "Emit execute_count when feature gpu_multicore_support is present");
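      /* Placeholder: leave execute_count as zero for now. */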
1434       *stream_ptr = 0;
1435       stream_ptr++;
1436    }
1437 
1438    state->fw_stream_len = (uint8_t *)stream_ptr - (uint8_t *)state->fw_stream;
1439    assert(state->fw_stream_len <= ARRAY_SIZE(state->fw_stream));
1440 
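   /* Write the final stream length into the stream header; it is read back
    * later by pvr_frag_state_stream_ext_init().
    */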
1441    pvr_csb_pack ((uint64_t *)stream_len_ptr, KMD_STREAM_HDR, value) {
1442       value.length = state->fw_stream_len;
1443    }
1444 }
1445 
1446 #undef DWORDS_PER_U64
1447 
1448 static void
1449 pvr_frag_state_stream_ext_init(struct pvr_render_ctx *ctx,
1450                                struct pvr_render_job *job,
1451                                struct pvr_winsys_fragment_state *state)
1452 {
1453    const struct pvr_device_info *dev_info = &ctx->device->pdevice->dev_info;
1454 
1455    uint32_t main_stream_len =
1456       pvr_csb_unpack((uint64_t *)state->fw_stream, KMD_STREAM_HDR).length;
1457    uint32_t *ext_stream_ptr =
1458       (uint32_t *)((uint8_t *)state->fw_stream + main_stream_len);
1459    uint32_t *header0_ptr;
1460 
1461    header0_ptr = ext_stream_ptr;
1462    ext_stream_ptr += pvr_cmd_length(KMD_STREAM_EXTHDR_FRAG0);
1463 
1464    pvr_csb_pack (header0_ptr, KMD_STREAM_EXTHDR_FRAG0, header0) {
1465       if (PVR_HAS_QUIRK(dev_info, 49927)) {
1466          header0.has_brn49927 = true;
1467 
1468          /* The setup of CR_TPU must be identical to
1469           * pvr_render_job_ws_geometry_state_stream_ext_init().
1470           */
1471          pvr_csb_pack (ext_stream_ptr, CR_TPU, value) {
1472             value.tag_cem_4k_face_packing = true;
1473          }
1474          ext_stream_ptr += pvr_cmd_length(CR_TPU);
1475       }
1476    }
1477 
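   /* Only include the extension stream in the overall length if the extension
    * header actually carries data; otherwise it is omitted entirely.
    */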
1478    if ((*header0_ptr & PVRX(KMD_STREAM_EXTHDR_DATA_MASK)) != 0) {
1479       state->fw_stream_len =
1480          (uint8_t *)ext_stream_ptr - (uint8_t *)state->fw_stream;
1481       assert(state->fw_stream_len <= ARRAY_SIZE(state->fw_stream));
1482    }
1483 }
1484 
1485 static void
1486 pvr_frag_state_flags_init(const struct pvr_render_job *const job,
1487                           struct pvr_winsys_fragment_state_flags *flags)
1488 {
1489    *flags = (struct pvr_winsys_fragment_state_flags){
1490       .has_depth_buffer = job->has_depth_attachment,
1491       .has_stencil_buffer = job->has_stencil_attachment,
1492       .prevent_cdm_overlap = job->disable_compute_overlap,
1493       .use_single_core = job->frag_uses_atomic_ops,
1494       .get_vis_results = job->get_vis_results,
1495       .has_spm_scratch_buffer = job->requires_spm_scratch_buffer,
1496    };
1497 }
1498 
1499 static void
1500 pvr_render_job_ws_fragment_state_init(struct pvr_render_ctx *ctx,
1501                                       struct pvr_render_job *job,
1502                                       struct vk_sync *wait,
1503                                       struct pvr_winsys_fragment_state *state)
1504 {
1505    pvr_frag_state_stream_init(ctx, job, state);
1506    pvr_frag_state_stream_ext_init(ctx, job, state);
1507 
1508    state->wait = wait;
1509    pvr_frag_state_flags_init(job, &state->flags);
1510 }
1511 
1512 /**
1513  * \brief Sets up the fragment state for a Partial Render (PR) based on the
1514  * state for a normal fragment job.
1515  *
1516  * The state of a fragment PR is almost the same as that of a normal
1517  * fragment job, apart from the PBE words and the EOT program, both of which
1518  * are necessary for the render to use the SPM scratch buffer instead of the
1519  * final render targets.
1520  *
1521  * By basing the fragment PR state on that of the normal fragment job,
1522  * repacking of the same words can be avoided, as we mostly end up doing
1523  * copies instead.
1524  */
1525 static void pvr_render_job_ws_fragment_pr_init_based_on_fragment_state(
1526    const struct pvr_render_ctx *ctx,
1527    struct pvr_render_job *job,
1528    struct vk_sync *wait,
1529    struct pvr_winsys_fragment_state *frag,
1530    struct pvr_winsys_fragment_state *state)
1531 {
1532    const struct pvr_device_info *const dev_info =
1533       &ctx->device->pdevice->dev_info;
1534    const uint32_t pbe_reg_byte_offset =
1535       pvr_frag_km_stream_pbe_reg_words_offset(dev_info);
1536    const uint32_t eot_data_addr_byte_offset =
1537       pvr_frag_km_stream_pds_eot_data_addr_offset(dev_info);
1538 
1539    /* Massive copy :( */
1540    *state = *frag;
1541 
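   /* Replace the PBE register words with the partial render specific ones so
    * that the render targets the SPM scratch buffer.
    */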
1542    assert(state->fw_stream_len >=
1543           pbe_reg_byte_offset + sizeof(job->pr_pbe_reg_words));
1544    memcpy(&state->fw_stream[pbe_reg_byte_offset],
1545           job->pr_pbe_reg_words,
1546           sizeof(job->pr_pbe_reg_words));
1547 
1548    /* TODO: Update this when csbgen is byte instead of dword granular. */
1549    assert(state->fw_stream_len >=
1550           eot_data_addr_byte_offset +
1551              PVR_DW_TO_BYTES(pvr_cmd_length(CR_EVENT_PIXEL_PDS_DATA)));
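   /* Likewise, point the EOT pixel event at the PR-specific PDS event data. */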
1552    pvr_csb_pack ((uint32_t *)&state->fw_stream[eot_data_addr_byte_offset],
1553                  CR_EVENT_PIXEL_PDS_DATA,
1554                  eot_pds_data) {
1555       eot_pds_data.addr = PVR_DEV_ADDR(job->pr_pds_pixel_event_data_offset);
1556    }
1557 }
1558 
1559 static void pvr_render_job_ws_submit_info_init(
1560    struct pvr_render_ctx *ctx,
1561    struct pvr_render_job *job,
1562    struct vk_sync *wait_geom,
1563    struct vk_sync *wait_frag,
1564    struct pvr_winsys_render_submit_info *submit_info)
1565 {
1566    memset(submit_info, 0, sizeof(*submit_info));
1567 
1568    submit_info->rt_dataset = job->rt_dataset->ws_rt_dataset;
1569    submit_info->rt_data_idx = job->rt_dataset->rt_data_idx;
1570 
1571    submit_info->frame_num = ctx->device->global_queue_present_count;
1572    submit_info->job_num = ctx->device->global_cmd_buffer_submit_count;
1573 
1574    pvr_render_job_ws_geometry_state_init(ctx,
1575                                          job,
1576                                          wait_geom,
1577                                          &submit_info->geometry);
1578 
1579    submit_info->has_fragment_job = job->run_frag;
1580 
1581    /* TODO: Move the job setup from queue submit into cmd_buffer if possible. */
1582 
1583    /* TODO: See if it's worth skipping the fragment state setup and setting
1584     * up the PR state directly if `!job->run_frag`. For now we always set it up.
1585     */
1586    pvr_render_job_ws_fragment_state_init(ctx,
1587                                          job,
1588                                          wait_frag,
1589                                          &submit_info->fragment);
1590 
1591    /* TODO: In some cases we could eliminate the PR and use the fragment state
1592     * directly if we enter SPM. There's likely some performance improvement to
1593     * be had there. For now we always set up the PR.
1594     */
1595    pvr_render_job_ws_fragment_pr_init_based_on_fragment_state(
1596       ctx,
1597       job,
1598       wait_frag,
1599       &submit_info->fragment,
1600       &submit_info->fragment_pr);
1601 }
1602 
1603 VkResult pvr_render_job_submit(struct pvr_render_ctx *ctx,
1604                                struct pvr_render_job *job,
1605                                struct vk_sync *wait_geom,
1606                                struct vk_sync *wait_frag,
1607                                struct vk_sync *signal_sync_geom,
1608                                struct vk_sync *signal_sync_frag)
1609 {
1610    struct pvr_rt_dataset *rt_dataset = job->rt_dataset;
1611    struct pvr_winsys_render_submit_info submit_info;
1612    struct pvr_device *device = ctx->device;
1613    VkResult result;
1614 
1615    pvr_render_job_ws_submit_info_init(ctx,
1616                                       job,
1617                                       wait_geom,
1618                                       wait_frag,
1619                                       &submit_info);
1620 
1621    if (PVR_IS_DEBUG_SET(DUMP_CONTROL_STREAM)) {
1622       /* FIXME: This isn't an ideal method of accessing the information we
1623        * need, but it's considered good enough for a debug code path. It can be
1624        * streamlined and made more correct if/when pvr_render_job becomes a
1625        * subclass of pvr_sub_cmd.
1626        */
1627       const struct pvr_sub_cmd *sub_cmd =
1628          container_of(job, const struct pvr_sub_cmd, gfx.job);
1629 
1630       pvr_csb_dump(&sub_cmd->gfx.control_stream,
1631                    submit_info.frame_num,
1632                    submit_info.job_num);
1633    }
1634 
1635    result = device->ws->ops->render_submit(ctx->ws_ctx,
1636                                            &submit_info,
1637                                            &device->pdevice->dev_info,
1638                                            signal_sync_geom,
1639                                            signal_sync_frag);
1640    if (result != VK_SUCCESS)
1641       return result;
1642 
1643    if (job->run_frag) {
1644       /* Move to the next render target data now that a fragment job has been
1645        * successfully submitted. This allows the next geometry job to be
1646        * submitted and run in parallel with it.
1647        */
1648       rt_dataset->rt_data_idx =
1649          (rt_dataset->rt_data_idx + 1) % ARRAY_SIZE(rt_dataset->rt_datas);
1650 
1651       rt_dataset->need_frag = false;
1652    } else {
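      /* No fragment job was submitted, so the current render target data
       * still needs a fragment job before it can be advanced.
       */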
1653       rt_dataset->need_frag = true;
1654    }
1655 
1656    return VK_SUCCESS;
1657 }
1658