xref: /aosp_15_r20/external/mesa3d/src/imagination/vulkan/pvr_job_context.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright © 2022 Imagination Technologies Ltd.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a copy
5  * of this software and associated documentation files (the "Software"), to deal
6  * in the Software without restriction, including without limitation the rights
7  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8  * copies of the Software, and to permit persons to whom the Software is
9  * furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  */
23 
24 #include <assert.h>
25 #include <stdbool.h>
26 #include <stddef.h>
27 #include <stdint.h>
28 #include <stdio.h>
29 #include <vulkan/vulkan.h>
30 
31 #include "hwdef/rogue_hw_utils.h"
32 #include "pvr_bo.h"
33 #include "pvr_cdm_load_sr.h"
34 #include "pvr_common.h"
35 #include "pvr_csb.h"
36 #include "pvr_job_context.h"
37 #include "pvr_pds.h"
38 #include "pvr_private.h"
39 #include "pvr_transfer_frag_store.h"
40 #include "pvr_types.h"
41 #include "pvr_uscgen.h"
42 #include "pvr_vdm_load_sr.h"
43 #include "pvr_vdm_store_sr.h"
44 #include "pvr_winsys.h"
45 #include "util/macros.h"
46 #include "util/os_file.h"
47 #include "util/u_dynarray.h"
48 #include "vk_alloc.h"
49 #include "vk_log.h"
50 
51 /* TODO: Is there some way to ensure the Vulkan driver doesn't exceed this
52  * value when constructing the control stream?
53  */
54 /* The VDM callstack is used by the hardware to implement control stream links
55  * with a return, i.e. sub-control streams/subroutines. This value specifies the
56  * maximum callstack depth.
57  */
58 #define PVR_VDM_CALLSTACK_MAX_DEPTH 1U
59 
60 #define ROGUE_PDS_TASK_PROGRAM_SIZE 256U
61 
pvr_ctx_reset_cmd_init(struct pvr_device * device,struct pvr_reset_cmd * const reset_cmd)62 static VkResult pvr_ctx_reset_cmd_init(struct pvr_device *device,
63                                        struct pvr_reset_cmd *const reset_cmd)
64 {
65    const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
66 
67    /* The reset framework depends on compute support in the hw. */
68    assert(PVR_HAS_FEATURE(dev_info, compute));
69 
70    if (PVR_HAS_QUIRK(dev_info, 51764))
71       pvr_finishme("Missing reset support for brn51764");
72 
73    if (PVR_HAS_QUIRK(dev_info, 58839))
74       pvr_finishme("Missing reset support for brn58839");
75 
76    return VK_SUCCESS;
77 }
78 
/* Tear down the per-context reset command set up by pvr_ctx_reset_cmd_init().
 * Currently a no-op (init allocates nothing yet); both parameters are
 * intentionally unused until the TODO below is addressed.
 */
static void pvr_ctx_reset_cmd_fini(struct pvr_device *device,
                                   struct pvr_reset_cmd *reset_cmd)

{
   /* TODO: reset command cleanup. */
}
85 
/* Create and upload the PDS program used on a render context store to save
 * the persistent temp (PT) registers into the given buffer.
 *
 * \param device         Device to create the program for.
 * \param pt_bo          Buffer the persistent temps are stored into.
 * \param pt_bo_size     Size of \p pt_bo in bytes; must be a multiple of 4.
 * \param pds_upload_out Uploaded program info; the caller owns
 *                       pds_upload_out->pvr_bo on success.
 */
static VkResult pvr_pds_pt_store_program_create_and_upload(
   struct pvr_device *device,
   struct pvr_bo *pt_bo,
   uint32_t pt_bo_size,
   struct pvr_pds_upload *const pds_upload_out)
{
   struct pvr_pds_stream_out_terminate_program program = { 0 };
   const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
   const uint32_t cache_line_size = rogue_get_slc_cache_line_size(dev_info);
   size_t staging_buffer_size;
   uint32_t *staging_buffer;
   uint32_t *data_buffer;
   uint32_t *code_buffer;
   VkResult result;

   /* Check the bo size can be converted to dwords without any rounding. */
   assert(pt_bo_size % 4 == 0);

   program.pds_persistent_temp_size_to_store = pt_bo_size / 4;
   program.dev_address_for_storing_persistent_temp = pt_bo->vma->dev_addr.addr;

   /* First pass: only computes the data/code segment sizes used to size the
    * staging buffer below.
    */
   pvr_pds_generate_stream_out_terminate_program(&program,
                                                 NULL,
                                                 PDS_GENERATE_SIZES,
                                                 dev_info);

   staging_buffer_size = (program.stream_out_terminate_pds_data_size +
                          program.stream_out_terminate_pds_code_size) *
                         sizeof(*staging_buffer);

   staging_buffer = vk_zalloc(&device->vk.alloc,
                              staging_buffer_size,
                              8,
                              VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
   if (!staging_buffer)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   /* Second and third passes: emit the data segment, then the code segment
    * immediately after it (the generator returns the next free dword).
    */
   data_buffer = staging_buffer;
   code_buffer =
      pvr_pds_generate_stream_out_terminate_program(&program,
                                                    data_buffer,
                                                    PDS_GENERATE_DATA_SEGMENT,
                                                    dev_info);
   pvr_pds_generate_stream_out_terminate_program(&program,
                                                 code_buffer,
                                                 PDS_GENERATE_CODE_SEGMENT,
                                                 dev_info);

   /* This PDS program is passed to the HW via the PPP state words. These only
    * allow the data segment address to be specified and expect the code
    * segment to immediately follow. Assume the code alignment is the same as
    * the data.
    */
   result =
      pvr_gpu_upload_pds(device,
                         data_buffer,
                         program.stream_out_terminate_pds_data_size,
                         PVRX(TA_STATE_STREAM_OUT1_PDS_DATA_SIZE_UNIT_SIZE),
                         code_buffer,
                         program.stream_out_terminate_pds_code_size,
                         PVRX(TA_STATE_STREAM_OUT1_PDS_DATA_SIZE_UNIT_SIZE),
                         cache_line_size,
                         pds_upload_out);

   vk_free(&device->vk.alloc, staging_buffer);

   return result;
}
154 
/* Create and upload the PDS program used on a render context resume to load
 * the persistent temp (PT) registers back from the given buffer.
 *
 * Mirrors pvr_pds_pt_store_program_create_and_upload(), using the stream-out
 * init program (single buffer) rather than the terminate program.
 *
 * \param device         Device to create the program for.
 * \param pt_bo          Buffer the persistent temps are loaded from.
 * \param pt_bo_size     Size of \p pt_bo in bytes; must be a multiple of 4.
 * \param pds_upload_out Uploaded program info; the caller owns
 *                       pds_upload_out->pvr_bo on success.
 */
static VkResult pvr_pds_pt_resume_program_create_and_upload(
   struct pvr_device *device,
   struct pvr_bo *pt_bo,
   uint32_t pt_bo_size,
   struct pvr_pds_upload *const pds_upload_out)
{
   struct pvr_pds_stream_out_init_program program = { 0 };
   const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
   const uint32_t cache_line_size = rogue_get_slc_cache_line_size(dev_info);
   size_t staging_buffer_size;
   uint32_t *staging_buffer;
   uint32_t *data_buffer;
   uint32_t *code_buffer;
   VkResult result;

   /* Check the bo size can be converted to dwords without any rounding. */
   assert(pt_bo_size % 4 == 0);

   /* Single buffer holding the whole PT state. */
   program.num_buffers = 1;
   program.pds_buffer_data_size[0] = pt_bo_size / 4;
   program.dev_address_for_buffer_data[0] = pt_bo->vma->dev_addr.addr;

   /* First pass: only computes the data/code segment sizes used to size the
    * staging buffer below.
    */
   pvr_pds_generate_stream_out_init_program(&program,
                                            NULL,
                                            false,
                                            PDS_GENERATE_SIZES,
                                            dev_info);

   staging_buffer_size = (program.stream_out_init_pds_data_size +
                          program.stream_out_init_pds_code_size) *
                         sizeof(*staging_buffer);

   staging_buffer = vk_zalloc(&device->vk.alloc,
                              staging_buffer_size,
                              8,
                              VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
   if (!staging_buffer)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   /* Second and third passes: emit the data segment, then the code segment
    * immediately after it (the generator returns the next free dword).
    */
   data_buffer = staging_buffer;
   code_buffer =
      pvr_pds_generate_stream_out_init_program(&program,
                                               data_buffer,
                                               false,
                                               PDS_GENERATE_DATA_SEGMENT,
                                               dev_info);
   pvr_pds_generate_stream_out_init_program(&program,
                                            code_buffer,
                                            false,
                                            PDS_GENERATE_CODE_SEGMENT,
                                            dev_info);

   /* This PDS program is passed to the HW via the PPP state words. These only
    * allow the data segment address to be specified and expect the code
    * segment to immediately follow. Assume the code alignment is the same as
    * the data.
    */
   result =
      pvr_gpu_upload_pds(device,
                         data_buffer,
                         program.stream_out_init_pds_data_size,
                         PVRX(TA_STATE_STREAM_OUT1_PDS_DATA_SIZE_UNIT_SIZE),
                         code_buffer,
                         program.stream_out_init_pds_code_size,
                         PVRX(TA_STATE_STREAM_OUT1_PDS_DATA_SIZE_UNIT_SIZE),
                         cache_line_size,
                         pds_upload_out);

   vk_free(&device->vk.alloc, staging_buffer);

   return result;
}
227 
228 static VkResult
pvr_render_job_pt_programs_setup(struct pvr_device * device,struct rogue_pt_programs * pt_programs)229 pvr_render_job_pt_programs_setup(struct pvr_device *device,
230                                  struct rogue_pt_programs *pt_programs)
231 {
232    VkResult result;
233 
234    result = pvr_bo_alloc(device,
235                          device->heaps.pds_heap,
236                          ROGUE_LLS_PDS_PERSISTENT_TEMPS_BUFFER_SIZE,
237                          ROGUE_LLS_PDS_PERSISTENT_TEMPS_BUFFER_ALIGNMENT,
238                          PVR_BO_ALLOC_FLAG_CPU_ACCESS,
239                          &pt_programs->store_resume_state_bo);
240    if (result != VK_SUCCESS)
241       return result;
242 
243    result = pvr_pds_pt_store_program_create_and_upload(
244       device,
245       pt_programs->store_resume_state_bo,
246       ROGUE_LLS_PDS_PERSISTENT_TEMPS_BUFFER_SIZE,
247       &pt_programs->pds_store_program);
248    if (result != VK_SUCCESS)
249       goto err_free_store_resume_state_bo;
250 
251    result = pvr_pds_pt_resume_program_create_and_upload(
252       device,
253       pt_programs->store_resume_state_bo,
254       ROGUE_LLS_PDS_PERSISTENT_TEMPS_BUFFER_SIZE,
255       &pt_programs->pds_resume_program);
256    if (result != VK_SUCCESS)
257       goto err_free_pds_store_program;
258 
259    return VK_SUCCESS;
260 
261 err_free_pds_store_program:
262    pvr_bo_suballoc_free(pt_programs->pds_store_program.pvr_bo);
263 
264 err_free_store_resume_state_bo:
265    pvr_bo_free(device, pt_programs->store_resume_state_bo);
266 
267    return result;
268 }
269 
/* Release the PT store/resume programs and their shared state buffer.
 * Inverse of pvr_render_job_pt_programs_setup(); frees in reverse setup
 * order.
 */
static void
pvr_render_job_pt_programs_cleanup(struct pvr_device *device,
                                   struct rogue_pt_programs *pt_programs)
{
   pvr_bo_suballoc_free(pt_programs->pds_resume_program.pvr_bo);
   pvr_bo_suballoc_free(pt_programs->pds_store_program.pvr_bo);
   pvr_bo_free(device, pt_programs->store_resume_state_bo);
}
278 
/* Fill in a shared-register (SR) store/load PDS program description.
 *
 * The same task layout is used for both storing and loading shared
 * registers: a DOUTW writing two 64-bit constants (the SR buffer address
 * and the address just past the reserved region), followed by a DOUTU
 * kicking the USC program at usc_program_upload_offset.
 */
static void pvr_pds_ctx_sr_program_setup(
   bool cc_enable,
   uint64_t usc_program_upload_offset,
   uint8_t usc_temps,
   pvr_dev_addr_t sr_addr,
   struct pvr_pds_shared_storing_program *const program_out)
{
   struct pvr_pds_shared_storing_program program = { 0 };

   program.cc_enable = cc_enable;
   program.doutw_control.dest_store = PDS_UNIFIED_STORE;
   program.doutw_control.num_const64 = 2;
   program.doutw_control.doutw_data[0] = sr_addr.addr;
   program.doutw_control.doutw_data[1] =
      sr_addr.addr + ROGUE_LLS_SHARED_REGS_RESERVE_SIZE;
   program.doutw_control.last_instruction = false;

   pvr_pds_setup_doutu(&program.usc_task.usc_task_control,
                       usc_program_upload_offset,
                       usc_temps,
                       PVRX(PDSINST_DOUTU_SAMPLE_RATE_INSTANCE),
                       false);

   *program_out = program;
}
306 
/* Note: pvr_pds_compute_ctx_sr_program_create_and_upload() is very similar to
 * this. If there is a problem here it's likely that the same problem exists
 * there so don't forget to update the compute function.
 */
/* Create and upload the PDS program that stores/loads the shared registers
 * on a render (VDM) context switch.
 *
 * \param device                     Device to create the program for.
 * \param usc_program_upload_offset  Heap offset of the USC SR program the
 *                                   PDS task kicks.
 * \param usc_temps                  Temp register count of that USC program.
 * \param sr_addr                    Device address of the SR state buffer.
 * \param pds_upload_out             Uploaded program info; the caller owns
 *                                   pds_upload_out->pvr_bo on success.
 */
static VkResult pvr_pds_render_ctx_sr_program_create_and_upload(
   struct pvr_device *device,
   uint64_t usc_program_upload_offset,
   uint8_t usc_temps,
   pvr_dev_addr_t sr_addr,
   struct pvr_pds_upload *const pds_upload_out)
{
   const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
   const uint32_t cache_line_size = rogue_get_slc_cache_line_size(dev_info);
   /* Data segment granularity, in dwords. */
   const uint32_t pds_data_alignment =
      PVRX(VDMCTRL_PDS_STATE0_PDS_DATA_SIZE_UNIT_SIZE) / 4U;

   /* FIXME: pvr_pds_generate_shared_storing_program() doesn't return the data
    * and code size when using the PDS_GENERATE_SIZES mode.
    */
   /* Fixed-size stack staging buffer since the SIZES pass is unusable. */
   STATIC_ASSERT(ROGUE_PDS_TASK_PROGRAM_SIZE % 4 == 0);
   uint32_t staging_buffer[ROGUE_PDS_TASK_PROGRAM_SIZE / 4U] = { 0 };
   struct pvr_pds_shared_storing_program program;
   ASSERTED uint32_t *buffer_end;
   uint32_t code_offset;

   /* cc_enable is always false for the render path. */
   pvr_pds_ctx_sr_program_setup(false,
                                usc_program_upload_offset,
                                usc_temps,
                                sr_addr,
                                &program);

   /* Emit the data segment at the start of the staging buffer; this also
    * fills in program.data_size/code_size.
    */
   pvr_pds_generate_shared_storing_program(&program,
                                           &staging_buffer[0],
                                           PDS_GENERATE_DATA_SEGMENT,
                                           dev_info);

   /* Code segment starts at the next aligned dword after the data. */
   code_offset = ALIGN_POT(program.data_size, pds_data_alignment);

   buffer_end =
      pvr_pds_generate_shared_storing_program(&program,
                                              &staging_buffer[code_offset],
                                              PDS_GENERATE_CODE_SEGMENT,
                                              dev_info);

   /* Make sure we didn't overrun the fixed-size staging buffer. */
   assert((uint32_t)(buffer_end - staging_buffer) * sizeof(staging_buffer[0]) <
          ROGUE_PDS_TASK_PROGRAM_SIZE);

   return pvr_gpu_upload_pds(device,
                             &staging_buffer[0],
                             program.data_size,
                             PVRX(VDMCTRL_PDS_STATE1_PDS_DATA_ADDR_ALIGNMENT),
                             &staging_buffer[code_offset],
                             program.code_size,
                             PVRX(VDMCTRL_PDS_STATE2_PDS_CODE_ADDR_ALIGNMENT),
                             cache_line_size,
                             pds_upload_out);
}
364 
365 /* Note: pvr_pds_render_ctx_sr_program_create_and_upload() is very similar to
366  * this. If there is a problem here it's likely that the same problem exists
367  * there so don't forget to update the render_ctx function.
368  */
pvr_pds_compute_ctx_sr_program_create_and_upload(struct pvr_device * device,bool is_loading_program,uint64_t usc_program_upload_offset,uint8_t usc_temps,pvr_dev_addr_t sr_addr,struct pvr_pds_upload * const pds_upload_out)369 static VkResult pvr_pds_compute_ctx_sr_program_create_and_upload(
370    struct pvr_device *device,
371    bool is_loading_program,
372    uint64_t usc_program_upload_offset,
373    uint8_t usc_temps,
374    pvr_dev_addr_t sr_addr,
375    struct pvr_pds_upload *const pds_upload_out)
376 {
377    const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
378    const uint32_t cache_line_size = rogue_get_slc_cache_line_size(dev_info);
379    const uint32_t pds_data_alignment =
380       PVRX(VDMCTRL_PDS_STATE0_PDS_DATA_SIZE_UNIT_SIZE) / 4U;
381 
382    /* FIXME: pvr_pds_generate_shared_storing_program() doesn't return the data
383     * and code size when using the PDS_GENERATE_SIZES mode.
384     */
385    STATIC_ASSERT(ROGUE_PDS_TASK_PROGRAM_SIZE % 4 == 0);
386    uint32_t staging_buffer[ROGUE_PDS_TASK_PROGRAM_SIZE / 4U] = { 0 };
387    struct pvr_pds_shared_storing_program program;
388    uint32_t *buffer_ptr;
389    uint32_t code_offset;
390 
391    pvr_pds_ctx_sr_program_setup(PVR_HAS_ERN(dev_info, 35421),
392                                 usc_program_upload_offset,
393                                 usc_temps,
394                                 sr_addr,
395                                 &program);
396 
397    if (is_loading_program && PVR_NEED_SW_COMPUTE_PDS_BARRIER(dev_info)) {
398       pvr_pds_generate_compute_shared_loading_program(&program,
399                                                       &staging_buffer[0],
400                                                       PDS_GENERATE_DATA_SEGMENT,
401                                                       dev_info);
402    } else {
403       pvr_pds_generate_shared_storing_program(&program,
404                                               &staging_buffer[0],
405                                               PDS_GENERATE_DATA_SEGMENT,
406                                               dev_info);
407    }
408 
409    code_offset = ALIGN_POT(program.data_size, pds_data_alignment);
410 
411    buffer_ptr =
412       pvr_pds_generate_compute_barrier_conditional(&staging_buffer[code_offset],
413                                                    PDS_GENERATE_CODE_SEGMENT);
414 
415    if (is_loading_program && PVR_NEED_SW_COMPUTE_PDS_BARRIER(dev_info)) {
416       buffer_ptr = pvr_pds_generate_compute_shared_loading_program(
417          &program,
418          buffer_ptr,
419          PDS_GENERATE_CODE_SEGMENT,
420          dev_info);
421    } else {
422       buffer_ptr =
423          pvr_pds_generate_shared_storing_program(&program,
424                                                  buffer_ptr,
425                                                  PDS_GENERATE_CODE_SEGMENT,
426                                                  dev_info);
427    }
428 
429    assert((uint32_t)(buffer_ptr - staging_buffer) * sizeof(staging_buffer[0]) <
430           ROGUE_PDS_TASK_PROGRAM_SIZE);
431 
432    STATIC_ASSERT(PVRX(CR_CDM_CONTEXT_PDS0_DATA_ADDR_ALIGNMENT) ==
433                  PVRX(CR_CDM_CONTEXT_LOAD_PDS0_DATA_ADDR_ALIGNMENT));
434 
435    STATIC_ASSERT(PVRX(CR_CDM_CONTEXT_PDS0_CODE_ADDR_ALIGNMENT) ==
436                  PVRX(CR_CDM_CONTEXT_LOAD_PDS0_CODE_ADDR_ALIGNMENT));
437 
438    return pvr_gpu_upload_pds(
439       device,
440       &staging_buffer[0],
441       program.data_size,
442       PVRX(CR_CDM_CONTEXT_PDS0_DATA_ADDR_ALIGNMENT),
443       &staging_buffer[code_offset],
444       (uint32_t)(buffer_ptr - &staging_buffer[code_offset]),
445       PVRX(CR_CDM_CONTEXT_PDS0_CODE_ADDR_ALIGNMENT),
446       cache_line_size,
447       pds_upload_out);
448 }
449 
/* Which context type pvr_ctx_sr_programs_setup() is building shared-register
 * store/load programs for; this selects between the VDM (render) and CDM
 * (compute) program variants.
 */
enum pvr_ctx_sr_program_target {
   PVR_CTX_SR_RENDER_TARGET,
   PVR_CTX_SR_COMPUTE_TARGET,
};
454 
/* Set up the shared-register (SR) store/load programs for a context.
 *
 * Allocates the SR state buffer, uploads the USC store/load programs, then
 * creates the PDS programs that kick them, picking the render (VDM) or
 * compute (CDM) variant based on \p target. On failure everything set up so
 * far is released via the goto chain.
 */
static VkResult pvr_ctx_sr_programs_setup(struct pvr_device *device,
                                          enum pvr_ctx_sr_program_target target,
                                          struct rogue_sr_programs *sr_programs)
{
   /* USC SR register space plus the reserved region in front of it. */
   const uint64_t store_load_state_bo_size =
      PVRX(LLS_USC_SHARED_REGS_BUFFER_SIZE) +
      ROGUE_LLS_SHARED_REGS_RESERVE_SIZE;
   const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
   const uint32_t cache_line_size = rogue_get_slc_cache_line_size(dev_info);
   uint64_t usc_store_program_upload_offset;
   uint64_t usc_load_program_upload_offset;
   const uint8_t *usc_load_sr_code;
   uint32_t usc_load_sr_code_size;
   VkResult result;

   /* Note that this is being used for both compute and render ctx. There is no
    * compute equivalent define for the VDMCTRL unit size.
    */
   /* 4 blocks (16 dwords / 64 bytes) in USC to prevent fragmentation. */
   sr_programs->usc.unified_size =
      DIV_ROUND_UP(64, PVRX(VDMCTRL_PDS_STATE0_USC_UNIFIED_SIZE_UNIT_SIZE));

   /* NOTE(review): other pvr_bo_alloc() calls in this file pass
    * PVR_BO_ALLOC_FLAG_CPU_ACCESS; confirm the winsys-level flag is
    * intentional here.
    */
   result = pvr_bo_alloc(device,
                         device->heaps.pds_heap,
                         store_load_state_bo_size,
                         cache_line_size,
                         PVR_WINSYS_BO_FLAG_CPU_ACCESS,
                         &sr_programs->store_load_state_bo);
   if (result != VK_SUCCESS)
      return result;

   /* USC state update: SR state store. */

   assert(sizeof(pvr_vdm_store_sr_code) < ROGUE_USC_TASK_PROGRAM_SIZE);

   result = pvr_gpu_upload_usc(device,
                               pvr_vdm_store_sr_code,
                               sizeof(pvr_vdm_store_sr_code),
                               cache_line_size,
                               &sr_programs->usc.store_program_bo);
   if (result != VK_SUCCESS)
      goto err_free_store_load_state_bo;

   /* PDS programs take the USC address as an offset into the USC heap. */
   usc_store_program_upload_offset =
      sr_programs->usc.store_program_bo->dev_addr.addr -
      device->heaps.usc_heap->base_addr.addr;

   /* USC state update: SR state load. */

   /* BRN 62269 requires the CDM-specific load program on compute contexts. */
   if (target == PVR_CTX_SR_COMPUTE_TARGET && PVR_HAS_QUIRK(dev_info, 62269)) {
      STATIC_ASSERT(sizeof(pvr_cdm_load_sr_code) < ROGUE_USC_TASK_PROGRAM_SIZE);

      usc_load_sr_code = pvr_cdm_load_sr_code;
      usc_load_sr_code_size = sizeof(pvr_cdm_load_sr_code);
   } else {
      STATIC_ASSERT(sizeof(pvr_vdm_load_sr_code) < ROGUE_USC_TASK_PROGRAM_SIZE);

      usc_load_sr_code = pvr_vdm_load_sr_code;
      usc_load_sr_code_size = sizeof(pvr_vdm_load_sr_code);
   }

   result = pvr_gpu_upload_usc(device,
                               usc_load_sr_code,
                               usc_load_sr_code_size,
                               cache_line_size,
                               &sr_programs->usc.load_program_bo);
   if (result != VK_SUCCESS)
      goto err_free_usc_store_program_bo;

   usc_load_program_upload_offset =
      sr_programs->usc.load_program_bo->dev_addr.addr -
      device->heaps.usc_heap->base_addr.addr;

   /* FIXME: The number of USC temps should be output alongside
    * pvr_vdm_store_sr_code rather than hard coded.
    */
   /* Create and upload the PDS load and store programs. Point them to the
    * appropriate USC load and store programs.
    */
   switch (target) {
   case PVR_CTX_SR_RENDER_TARGET:
      /* PDS state update: SR state store. */
      result = pvr_pds_render_ctx_sr_program_create_and_upload(
         device,
         usc_store_program_upload_offset,
         8,
         sr_programs->store_load_state_bo->vma->dev_addr,
         &sr_programs->pds.store_program);
      if (result != VK_SUCCESS)
         goto err_free_usc_load_program_bo;

      /* PDS state update: SR state load. */
      result = pvr_pds_render_ctx_sr_program_create_and_upload(
         device,
         usc_load_program_upload_offset,
         20,
         sr_programs->store_load_state_bo->vma->dev_addr,
         &sr_programs->pds.load_program);
      if (result != VK_SUCCESS)
         goto err_free_pds_store_program_bo;

      break;

   case PVR_CTX_SR_COMPUTE_TARGET:
      /* PDS state update: SR state store. */
      result = pvr_pds_compute_ctx_sr_program_create_and_upload(
         device,
         false,
         usc_store_program_upload_offset,
         8,
         sr_programs->store_load_state_bo->vma->dev_addr,
         &sr_programs->pds.store_program);
      if (result != VK_SUCCESS)
         goto err_free_usc_load_program_bo;

      /* PDS state update: SR state load. */
      result = pvr_pds_compute_ctx_sr_program_create_and_upload(
         device,
         true,
         usc_load_program_upload_offset,
         20,
         sr_programs->store_load_state_bo->vma->dev_addr,
         &sr_programs->pds.load_program);
      if (result != VK_SUCCESS)
         goto err_free_pds_store_program_bo;

      break;

   default:
      unreachable("Invalid target.");
      break;
   }

   return VK_SUCCESS;

err_free_pds_store_program_bo:
   pvr_bo_suballoc_free(sr_programs->pds.store_program.pvr_bo);

err_free_usc_load_program_bo:
   pvr_bo_suballoc_free(sr_programs->usc.load_program_bo);

err_free_usc_store_program_bo:
   pvr_bo_suballoc_free(sr_programs->usc.store_program_bo);

err_free_store_load_state_bo:
   pvr_bo_free(device, sr_programs->store_load_state_bo);

   return result;
}
604 
/* Release the SR programs and their state buffer.
 * Inverse of pvr_ctx_sr_programs_setup(); frees in reverse setup order.
 */
static void pvr_ctx_sr_programs_cleanup(struct pvr_device *device,
                                        struct rogue_sr_programs *sr_programs)
{
   pvr_bo_suballoc_free(sr_programs->pds.load_program.pvr_bo);
   pvr_bo_suballoc_free(sr_programs->pds.store_program.pvr_bo);
   pvr_bo_suballoc_free(sr_programs->usc.load_program_bo);
   pvr_bo_suballoc_free(sr_programs->usc.store_program_bo);
   pvr_bo_free(device, sr_programs->store_load_state_bo);
}
614 
615 static VkResult
pvr_render_ctx_switch_programs_setup(struct pvr_device * device,struct pvr_render_ctx_programs * programs)616 pvr_render_ctx_switch_programs_setup(struct pvr_device *device,
617                                      struct pvr_render_ctx_programs *programs)
618 {
619    VkResult result;
620 
621    result = pvr_render_job_pt_programs_setup(device, &programs->pt);
622    if (result != VK_SUCCESS)
623       return result;
624 
625    result = pvr_ctx_sr_programs_setup(device,
626                                       PVR_CTX_SR_RENDER_TARGET,
627                                       &programs->sr);
628    if (result != VK_SUCCESS)
629       goto err_pt_programs_cleanup;
630 
631    return VK_SUCCESS;
632 
633 err_pt_programs_cleanup:
634    pvr_render_job_pt_programs_cleanup(device, &programs->pt);
635 
636    return result;
637 }
638 
/* Release one set of render context-switch programs.
 * Inverse of pvr_render_ctx_switch_programs_setup(); frees in reverse
 * setup order (SR programs, then PT programs).
 */
static void
pvr_render_ctx_switch_programs_cleanup(struct pvr_device *device,
                                       struct pvr_render_ctx_programs *programs)
{
   pvr_ctx_sr_programs_cleanup(device, &programs->sr);
   pvr_render_job_pt_programs_cleanup(device, &programs->pt);
}
646 
pvr_render_ctx_switch_init(struct pvr_device * device,struct pvr_render_ctx * ctx)647 static VkResult pvr_render_ctx_switch_init(struct pvr_device *device,
648                                            struct pvr_render_ctx *ctx)
649 {
650    struct pvr_render_ctx_switch *ctx_switch = &ctx->ctx_switch;
651    const uint64_t vdm_state_bo_flags = PVR_BO_ALLOC_FLAG_GPU_UNCACHED |
652                                        PVR_BO_ALLOC_FLAG_CPU_ACCESS;
653    const uint64_t geom_state_bo_flags = PVR_BO_ALLOC_FLAG_GPU_UNCACHED |
654                                         PVR_BO_ALLOC_FLAG_CPU_ACCESS;
655    VkResult result;
656    uint32_t i;
657 
658    result = pvr_bo_alloc(device,
659                          device->heaps.general_heap,
660                          ROGUE_LLS_VDM_CONTEXT_RESUME_BUFFER_SIZE,
661                          ROGUE_LLS_VDM_CONTEXT_RESUME_BUFFER_ALIGNMENT,
662                          vdm_state_bo_flags,
663                          &ctx_switch->vdm_state_bo);
664    if (result != VK_SUCCESS)
665       return result;
666 
667    result = pvr_bo_alloc(device,
668                          device->heaps.general_heap,
669                          ROGUE_LLS_TA_STATE_BUFFER_SIZE,
670                          ROGUE_LLS_TA_STATE_BUFFER_ALIGNMENT,
671                          geom_state_bo_flags,
672                          &ctx_switch->geom_state_bo);
673    if (result != VK_SUCCESS)
674       goto err_pvr_bo_free_vdm_state_bo;
675 
676    for (i = 0; i < ARRAY_SIZE(ctx_switch->programs); i++) {
677       result =
678          pvr_render_ctx_switch_programs_setup(device, &ctx_switch->programs[i]);
679       if (result != VK_SUCCESS)
680          goto err_programs_cleanup;
681    }
682 
683    return VK_SUCCESS;
684 
685 err_programs_cleanup:
686    for (uint32_t j = 0; j < i; j++)
687       pvr_render_ctx_switch_programs_cleanup(device, &ctx_switch->programs[j]);
688 
689    pvr_bo_free(device, ctx_switch->geom_state_bo);
690 
691 err_pvr_bo_free_vdm_state_bo:
692    pvr_bo_free(device, ctx_switch->vdm_state_bo);
693 
694    return result;
695 }
696 
/* Tear down the context-switch state of a render context.
 * Inverse of pvr_render_ctx_switch_init(): cleans up every program set,
 * then frees the geometry and VDM state buffers.
 */
static void pvr_render_ctx_switch_fini(struct pvr_device *device,
                                       struct pvr_render_ctx *ctx)
{
   struct pvr_render_ctx_switch *ctx_switch = &ctx->ctx_switch;

   for (uint32_t i = 0; i < ARRAY_SIZE(ctx_switch->programs); i++)
      pvr_render_ctx_switch_programs_cleanup(device, &ctx_switch->programs[i]);

   pvr_bo_free(device, ctx_switch->geom_state_bo);
   pvr_bo_free(device, ctx_switch->vdm_state_bo);
}
708 
/* Pack the VDMCTRL_PDS_STATE0/1 words describing the given PDS program for
 * submission via the VDM control stream.
 *
 * \param pds_program      Uploaded PDS program (data size in dwords, data
 *                         segment offset).
 * \param usc_target       USC target the program's USC task runs on.
 * \param usc_unified_size Unified store allocation, in hw unit sizes.
 * \param state0_out       Receives the packed VDMCTRL_PDS_STATE0 word.
 * \param state1_out       Receives the packed VDMCTRL_PDS_STATE1 word.
 */
static void
pvr_rogue_get_vdmctrl_pds_state_words(struct pvr_pds_upload *pds_program,
                                      enum PVRX(VDMCTRL_USC_TARGET) usc_target,
                                      uint8_t usc_unified_size,
                                      uint32_t *const state0_out,
                                      uint32_t *const state1_out)
{
   pvr_csb_pack (state0_out, VDMCTRL_PDS_STATE0, state) {
      /* Convert the data size from dwords to bytes. */
      const uint32_t pds_data_size = PVR_DW_TO_BYTES(pds_program->data_size);

      state.dm_target = PVRX(VDMCTRL_DM_TARGET_VDM);
      state.usc_target = usc_target;
      state.usc_common_size = 0;
      state.usc_unified_size = usc_unified_size;
      state.pds_temp_size = 0;

      /* The hw field is in unit sizes; the byte size must divide evenly. */
      assert(pds_data_size % PVRX(VDMCTRL_PDS_STATE0_PDS_DATA_SIZE_UNIT_SIZE) ==
             0);
      state.pds_data_size =
         pds_data_size / PVRX(VDMCTRL_PDS_STATE0_PDS_DATA_SIZE_UNIT_SIZE);
   };

   pvr_csb_pack (state1_out, VDMCTRL_PDS_STATE1, state) {
      state.pds_data_addr = PVR_DEV_ADDR(pds_program->data_offset);
      state.sd_type = PVRX(VDMCTRL_SD_TYPE_PDS);
      state.sd_next_type = PVRX(VDMCTRL_SD_TYPE_PDS);
   }
}
738 
/* Packs the TA_STATE_STREAM_OUT1/2 words referencing a stream-out PDS
 * program.
 *
 * pds_program: the uploaded PDS program (data_size in dwords, data_offset is
 *    the device offset of its data segment).
 * stream_out1_out: receives the packed TA_STATE_STREAM_OUT1 word.
 * stream_out2_out: receives the packed TA_STATE_STREAM_OUT2 word.
 */
static void
pvr_rogue_get_geom_state_stream_out_words(struct pvr_pds_upload *pds_program,
                                          uint32_t *const stream_out1_out,
                                          uint32_t *const stream_out2_out)
{
   pvr_csb_pack (stream_out1_out, TA_STATE_STREAM_OUT1, state) {
      /* Convert the data size from dwords to bytes. */
      const uint32_t pds_data_size = PVR_DW_TO_BYTES(pds_program->data_size);

      state.sync = true;

      /* The field is expressed in hardware units, so the byte size must be a
       * whole multiple of the unit size.
       */
      assert(pds_data_size %
                PVRX(TA_STATE_STREAM_OUT1_PDS_DATA_SIZE_UNIT_SIZE) ==
             0);
      state.pds_data_size =
         pds_data_size / PVRX(TA_STATE_STREAM_OUT1_PDS_DATA_SIZE_UNIT_SIZE);

      state.pds_temp_size = 0;
   }

   pvr_csb_pack (stream_out2_out, TA_STATE_STREAM_OUT2, state) {
      state.pds_data_addr = PVR_DEV_ADDR(pds_program->data_offset);
   }
}
763 
/* Fills in the winsys render context static state: the VDM/TA context state
 * base addresses, then, for each entry in ctx->ctx_switch.programs, the
 * context store and resume task words built from that entry's SR (shared
 * register) and PT programs.
 */
static void pvr_render_ctx_ws_static_state_init(
   struct pvr_render_ctx *ctx,
   struct pvr_winsys_render_ctx_static_state *static_state)
{
   /* Destination cursors: q_dst for 64-bit words, d_dst for 32-bit words. */
   uint64_t *q_dst;
   uint32_t *d_dst;

   q_dst = &static_state->vdm_ctx_state_base_addr;
   pvr_csb_pack (q_dst, CR_VDM_CONTEXT_STATE_BASE, base) {
      base.addr = ctx->ctx_switch.vdm_state_bo->vma->dev_addr;
   }

   q_dst = &static_state->geom_ctx_state_base_addr;
   pvr_csb_pack (q_dst, CR_TA_CONTEXT_STATE_BASE, base) {
      base.addr = ctx->ctx_switch.geom_state_bo->vma->dev_addr;
   }

   for (uint32_t i = 0; i < ARRAY_SIZE(ctx->ctx_switch.programs); i++) {
      struct rogue_pt_programs *pt_prog = &ctx->ctx_switch.programs[i].pt;
      struct rogue_sr_programs *sr_prog = &ctx->ctx_switch.programs[i].sr;

      /* Context store state. */
      q_dst = &static_state->geom_state[i].vdm_ctx_store_task0;
      pvr_csb_pack (q_dst, CR_VDM_CONTEXT_STORE_TASK0, task0) {
         pvr_rogue_get_vdmctrl_pds_state_words(&sr_prog->pds.store_program,
                                               PVRX(VDMCTRL_USC_TARGET_ANY),
                                               sr_prog->usc.unified_size,
                                               &task0.pds_state0,
                                               &task0.pds_state1);
      }

      d_dst = &static_state->geom_state[i].vdm_ctx_store_task1;
      pvr_csb_pack (d_dst, CR_VDM_CONTEXT_STORE_TASK1, task1) {
         pvr_csb_pack (&task1.pds_state2, VDMCTRL_PDS_STATE2, state) {
            state.pds_code_addr =
               PVR_DEV_ADDR(sr_prog->pds.store_program.code_offset);
         }
      }

      q_dst = &static_state->geom_state[i].vdm_ctx_store_task2;
      pvr_csb_pack (q_dst, CR_VDM_CONTEXT_STORE_TASK2, task2) {
         pvr_rogue_get_geom_state_stream_out_words(&pt_prog->pds_store_program,
                                                   &task2.stream_out1,
                                                   &task2.stream_out2);
      }

      /* Context resume state. Mirrors the store state above, but points at
       * the load/resume programs and targets all USC instances.
       */
      q_dst = &static_state->geom_state[i].vdm_ctx_resume_task0;
      pvr_csb_pack (q_dst, CR_VDM_CONTEXT_RESUME_TASK0, task0) {
         pvr_rogue_get_vdmctrl_pds_state_words(&sr_prog->pds.load_program,
                                               PVRX(VDMCTRL_USC_TARGET_ALL),
                                               sr_prog->usc.unified_size,
                                               &task0.pds_state0,
                                               &task0.pds_state1);
      }

      d_dst = &static_state->geom_state[i].vdm_ctx_resume_task1;
      pvr_csb_pack (d_dst, CR_VDM_CONTEXT_RESUME_TASK1, task1) {
         pvr_csb_pack (&task1.pds_state2, VDMCTRL_PDS_STATE2, state) {
            state.pds_code_addr =
               PVR_DEV_ADDR(sr_prog->pds.load_program.code_offset);
         }
      }

      q_dst = &static_state->geom_state[i].vdm_ctx_resume_task2;
      pvr_csb_pack (q_dst, CR_VDM_CONTEXT_RESUME_TASK2, task2) {
         pvr_rogue_get_geom_state_stream_out_words(&pt_prog->pds_resume_program,
                                                   &task2.stream_out1,
                                                   &task2.stream_out2);
      }
   }
}
836 
pvr_render_ctx_ws_create_info_init(struct pvr_render_ctx * ctx,enum pvr_winsys_ctx_priority priority,struct pvr_winsys_render_ctx_create_info * create_info)837 static void pvr_render_ctx_ws_create_info_init(
838    struct pvr_render_ctx *ctx,
839    enum pvr_winsys_ctx_priority priority,
840    struct pvr_winsys_render_ctx_create_info *create_info)
841 {
842    create_info->priority = priority;
843    create_info->vdm_callstack_addr = ctx->vdm_callstack_bo->vma->dev_addr;
844 
845    pvr_render_ctx_ws_static_state_init(ctx, &create_info->static_state);
846 }
847 
pvr_render_ctx_create(struct pvr_device * device,enum pvr_winsys_ctx_priority priority,struct pvr_render_ctx ** const ctx_out)848 VkResult pvr_render_ctx_create(struct pvr_device *device,
849                                enum pvr_winsys_ctx_priority priority,
850                                struct pvr_render_ctx **const ctx_out)
851 {
852    const uint64_t vdm_callstack_size =
853       sizeof(uint64_t) * PVR_VDM_CALLSTACK_MAX_DEPTH;
854    struct pvr_winsys_render_ctx_create_info create_info;
855    struct pvr_render_ctx *ctx;
856    VkResult result;
857 
858    ctx = vk_alloc(&device->vk.alloc,
859                   sizeof(*ctx),
860                   8,
861                   VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
862    if (!ctx)
863       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
864 
865    ctx->device = device;
866 
867    result = pvr_bo_alloc(device,
868                          device->heaps.general_heap,
869                          vdm_callstack_size,
870                          PVRX(CR_VDM_CALL_STACK_POINTER_ADDR_ALIGNMENT),
871                          0,
872                          &ctx->vdm_callstack_bo);
873    if (result != VK_SUCCESS)
874       goto err_vk_free_ctx;
875 
876    result = pvr_render_ctx_switch_init(device, ctx);
877    if (result != VK_SUCCESS)
878       goto err_free_vdm_callstack_bo;
879 
880    result = pvr_ctx_reset_cmd_init(device, &ctx->reset_cmd);
881    if (result != VK_SUCCESS)
882       goto err_render_ctx_switch_fini;
883 
884    /* ctx must be fully initialized by this point since
885     * pvr_render_ctx_ws_create_info_init() depends on this.
886     */
887    pvr_render_ctx_ws_create_info_init(ctx, priority, &create_info);
888 
889    result = device->ws->ops->render_ctx_create(device->ws,
890                                                &create_info,
891                                                &ctx->ws_ctx);
892    if (result != VK_SUCCESS)
893       goto err_render_ctx_reset_cmd_fini;
894 
895    *ctx_out = ctx;
896 
897    return VK_SUCCESS;
898 
899 err_render_ctx_reset_cmd_fini:
900    pvr_ctx_reset_cmd_fini(device, &ctx->reset_cmd);
901 
902 err_render_ctx_switch_fini:
903    pvr_render_ctx_switch_fini(device, ctx);
904 
905 err_free_vdm_callstack_bo:
906    pvr_bo_free(device, ctx->vdm_callstack_bo);
907 
908 err_vk_free_ctx:
909    vk_free(&device->vk.alloc, ctx);
910 
911    return result;
912 }
913 
pvr_render_ctx_destroy(struct pvr_render_ctx * ctx)914 void pvr_render_ctx_destroy(struct pvr_render_ctx *ctx)
915 {
916    struct pvr_device *device = ctx->device;
917 
918    device->ws->ops->render_ctx_destroy(ctx->ws_ctx);
919 
920    pvr_ctx_reset_cmd_fini(device, &ctx->reset_cmd);
921    pvr_render_ctx_switch_fini(device, ctx);
922    pvr_bo_free(device, ctx->vdm_callstack_bo);
923    vk_free(&device->vk.alloc, ctx);
924 }
925 
/* Generates the PDS fence/terminate program and uploads it to the GPU.
 *
 * The generator is invoked twice over the same staging buffer: once for the
 * data segment and once for the code segment, with the code segment placed
 * at a data-size-aligned offset. The upload result is written to
 * pds_upload_out.
 */
static VkResult pvr_pds_sr_fence_terminate_program_create_and_upload(
   struct pvr_device *device,
   struct pvr_pds_upload *const pds_upload_out)
{
   /* Alignment for the code segment offset, in dwords. */
   const uint32_t pds_data_alignment =
      PVRX(VDMCTRL_PDS_STATE0_PDS_DATA_SIZE_UNIT_SIZE) / 4U;
   const struct pvr_device_runtime_info *dev_runtime_info =
      &device->pdevice->dev_runtime_info;
   ASSERTED const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
   /* Staging buffer holds both segments; sized in dwords. */
   uint32_t staging_buffer[PVRX(PDS_TASK_PROGRAM_SIZE) >> 2U];
   struct pvr_pds_fence_program program = { 0 };
   ASSERTED uint32_t *buffer_end;
   uint32_t code_offset;
   uint32_t data_size;

   /* SW_COMPUTE_PDS_BARRIER is not supported with 2 or more phantoms. */
   assert(!(PVR_NEED_SW_COMPUTE_PDS_BARRIER(dev_info) &&
            dev_runtime_info->num_phantoms >= 2));

   /* First pass: generate the data segment at the start of the buffer. */
   pvr_pds_generate_fence_terminate_program(&program,
                                            staging_buffer,
                                            PDS_GENERATE_DATA_SEGMENT,
                                            &device->pdevice->dev_info);

   /* FIXME: pvr_pds_generate_fence_terminate_program() zeros out the data_size
    * when we generate the code segment. Implement
    * PDS_GENERATE_CODEDATA_SEGMENTS? Or wait for the pds gen api to change?
    * This behavior doesn't seem consistent with the rest of the api. For now
    * we store the size in a variable.
    */
   data_size = program.data_size;
   code_offset = ALIGN_POT(program.data_size, pds_data_alignment);

   /* Second pass: generate the code segment after the (aligned) data. */
   buffer_end =
      pvr_pds_generate_fence_terminate_program(&program,
                                               &staging_buffer[code_offset],
                                               PDS_GENERATE_CODE_SEGMENT,
                                               &device->pdevice->dev_info);

   /* Both segments together must fit within the staging buffer. */
   assert((uint64_t)(buffer_end - staging_buffer) * sizeof(staging_buffer[0]) <
          ROGUE_PDS_TASK_PROGRAM_SIZE);

   return pvr_gpu_upload_pds(device,
                             staging_buffer,
                             data_size,
                             PVRX(CR_CDM_TERMINATE_PDS_DATA_ADDR_ALIGNMENT),
                             &staging_buffer[code_offset],
                             program.code_size,
                             PVRX(CR_CDM_TERMINATE_PDS_CODE_ADDR_ALIGNMENT),
                             0,
                             pds_upload_out);
}
978 
/* Fills in the winsys compute context static state from the context-switch
 * programs: CDM context store words (SR store programs), terminate words
 * (fence/terminate program) and resume words (SR load programs).
 */
static void pvr_compute_ctx_ws_static_state_init(
   const struct pvr_device_info *const dev_info,
   const struct pvr_compute_ctx *const ctx,
   struct pvr_winsys_compute_ctx_static_state *const static_state)
{
   const struct pvr_compute_ctx_switch *const ctx_switch = &ctx->ctx_switch;

   /* CR_CDM_CONTEXT_... use state store program info. */

   pvr_csb_pack (&static_state->cdm_ctx_store_pds0,
                 CR_CDM_CONTEXT_PDS0,
                 state) {
      state.data_addr =
         PVR_DEV_ADDR(ctx_switch->sr[0].pds.store_program.data_offset);
      state.code_addr =
         PVR_DEV_ADDR(ctx_switch->sr[0].pds.store_program.code_offset);
   }

   /* "_b" variant uses the second SR program set (sr[1]). */
   pvr_csb_pack (&static_state->cdm_ctx_store_pds0_b,
                 CR_CDM_CONTEXT_PDS0,
                 state) {
      state.data_addr =
         PVR_DEV_ADDR(ctx_switch->sr[1].pds.store_program.data_offset);
      state.code_addr =
         PVR_DEV_ADDR(ctx_switch->sr[1].pds.store_program.code_offset);
   }

   pvr_csb_pack (&static_state->cdm_ctx_store_pds1,
                 CR_CDM_CONTEXT_PDS1,
                 state) {
      /* Data size converted from dwords to bytes. */
      const uint32_t store_program_data_size =
         PVR_DW_TO_BYTES(ctx_switch->sr[0].pds.store_program.data_size);

      state.pds_seq_dep = true;
      state.usc_seq_dep = false;
      state.target = true;
      state.unified_size = ctx_switch->sr[0].usc.unified_size;
      state.common_shared = false;
      state.common_size = 0;
      state.temp_size = 0;

      /* The field is expressed in hardware units, so the byte size must be a
       * whole multiple of the unit size.
       */
      assert(store_program_data_size %
                PVRX(VDMCTRL_PDS_STATE0_PDS_DATA_SIZE_UNIT_SIZE) ==
             0);
      state.data_size = store_program_data_size /
                        PVRX(VDMCTRL_PDS_STATE0_PDS_DATA_SIZE_UNIT_SIZE);

      state.fence = true;
   }

   /* CR_CDM_TERMINATE_... use fence terminate info. */

   pvr_csb_pack (&static_state->cdm_ctx_terminate_pds,
                 CR_CDM_TERMINATE_PDS,
                 state) {
      state.data_addr =
         PVR_DEV_ADDR(ctx_switch->sr_fence_terminate_program.data_offset);
      state.code_addr =
         PVR_DEV_ADDR(ctx_switch->sr_fence_terminate_program.code_offset);
   }

   pvr_csb_pack (&static_state->cdm_ctx_terminate_pds1,
                 CR_CDM_TERMINATE_PDS1,
                 state) {
      /* Convert the data size from dwords to bytes. */
      const uint32_t fence_terminate_program_data_size =
         PVR_DW_TO_BYTES(ctx_switch->sr_fence_terminate_program.data_size);

      state.pds_seq_dep = true;
      state.usc_seq_dep = false;
      state.target = !PVR_HAS_FEATURE(dev_info, compute_morton_capable);
      state.unified_size = 0;
      /* Common store is for shareds -- this will free the partitions. */
      state.common_shared = true;
      state.common_size = 0;
      state.temp_size = 0;

      assert(fence_terminate_program_data_size %
                PVRX(VDMCTRL_PDS_STATE0_PDS_DATA_SIZE_UNIT_SIZE) ==
             0);
      state.data_size = fence_terminate_program_data_size /
                        PVRX(VDMCTRL_PDS_STATE0_PDS_DATA_SIZE_UNIT_SIZE);
      state.fence = true;
   }

   /* CR_CDM_RESUME_... use state load program info. */

   pvr_csb_pack (&static_state->cdm_ctx_resume_pds0,
                 CR_CDM_CONTEXT_LOAD_PDS0,
                 state) {
      state.data_addr =
         PVR_DEV_ADDR(ctx_switch->sr[0].pds.load_program.data_offset);
      state.code_addr =
         PVR_DEV_ADDR(ctx_switch->sr[0].pds.load_program.code_offset);
   }

   pvr_csb_pack (&static_state->cdm_ctx_resume_pds0_b,
                 CR_CDM_CONTEXT_LOAD_PDS0,
                 state) {
      state.data_addr =
         PVR_DEV_ADDR(ctx_switch->sr[1].pds.load_program.data_offset);
      state.code_addr =
         PVR_DEV_ADDR(ctx_switch->sr[1].pds.load_program.code_offset);
   }
}
1084 
pvr_compute_ctx_ws_create_info_init(const struct pvr_compute_ctx * const ctx,enum pvr_winsys_ctx_priority priority,struct pvr_winsys_compute_ctx_create_info * const create_info)1085 static void pvr_compute_ctx_ws_create_info_init(
1086    const struct pvr_compute_ctx *const ctx,
1087    enum pvr_winsys_ctx_priority priority,
1088    struct pvr_winsys_compute_ctx_create_info *const create_info)
1089 {
1090    create_info->priority = priority;
1091 
1092    pvr_compute_ctx_ws_static_state_init(&ctx->device->pdevice->dev_info,
1093                                         ctx,
1094                                         &create_info->static_state);
1095 }
1096 
pvr_compute_ctx_create(struct pvr_device * const device,enum pvr_winsys_ctx_priority priority,struct pvr_compute_ctx ** const ctx_out)1097 VkResult pvr_compute_ctx_create(struct pvr_device *const device,
1098                                 enum pvr_winsys_ctx_priority priority,
1099                                 struct pvr_compute_ctx **const ctx_out)
1100 {
1101    struct pvr_winsys_compute_ctx_create_info create_info;
1102    struct pvr_compute_ctx *ctx;
1103    VkResult result;
1104 
1105    ctx = vk_alloc(&device->vk.alloc,
1106                   sizeof(*ctx),
1107                   8,
1108                   VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
1109    if (!ctx)
1110       return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
1111 
1112    ctx->device = device;
1113 
1114    result = pvr_bo_alloc(
1115       device,
1116       device->heaps.general_heap,
1117       rogue_get_cdm_context_resume_buffer_size(&device->pdevice->dev_info),
1118       rogue_get_cdm_context_resume_buffer_alignment(&device->pdevice->dev_info),
1119       PVR_WINSYS_BO_FLAG_CPU_ACCESS | PVR_WINSYS_BO_FLAG_GPU_UNCACHED,
1120       &ctx->ctx_switch.compute_state_bo);
1121    if (result != VK_SUCCESS)
1122       goto err_free_ctx;
1123 
1124    /* TODO: Change this so that enabling storage to B doesn't change the array
1125     * size. Instead of looping we could unroll this and have the second
1126     * programs setup depending on the B enable. Doing it that way would make
1127     * things more obvious.
1128     */
1129    for (uint32_t i = 0; i < ARRAY_SIZE(ctx->ctx_switch.sr); i++) {
1130       result = pvr_ctx_sr_programs_setup(device,
1131                                          PVR_CTX_SR_COMPUTE_TARGET,
1132                                          &ctx->ctx_switch.sr[i]);
1133       if (result != VK_SUCCESS) {
1134          for (uint32_t j = 0; j < i; j++)
1135             pvr_ctx_sr_programs_cleanup(device, &ctx->ctx_switch.sr[j]);
1136 
1137          goto err_free_state_buffer;
1138       }
1139    }
1140 
1141    result = pvr_pds_sr_fence_terminate_program_create_and_upload(
1142       device,
1143       &ctx->ctx_switch.sr_fence_terminate_program);
1144    if (result != VK_SUCCESS)
1145       goto err_free_sr_programs;
1146 
1147    pvr_compute_ctx_ws_create_info_init(ctx, priority, &create_info);
1148 
1149    result = pvr_ctx_reset_cmd_init(device, &ctx->reset_cmd);
1150    if (result != VK_SUCCESS)
1151       goto err_free_pds_fence_terminate_program;
1152 
1153    result = device->ws->ops->compute_ctx_create(device->ws,
1154                                                 &create_info,
1155                                                 &ctx->ws_ctx);
1156    if (result != VK_SUCCESS)
1157       goto err_fini_reset_cmd;
1158 
1159    *ctx_out = ctx;
1160 
1161    return VK_SUCCESS;
1162 
1163 err_fini_reset_cmd:
1164    pvr_ctx_reset_cmd_fini(device, &ctx->reset_cmd);
1165 
1166 err_free_pds_fence_terminate_program:
1167    pvr_bo_suballoc_free(ctx->ctx_switch.sr_fence_terminate_program.pvr_bo);
1168 
1169 err_free_sr_programs:
1170    for (uint32_t i = 0; i < ARRAY_SIZE(ctx->ctx_switch.sr); ++i)
1171       pvr_ctx_sr_programs_cleanup(device, &ctx->ctx_switch.sr[i]);
1172 
1173 err_free_state_buffer:
1174    pvr_bo_free(device, ctx->ctx_switch.compute_state_bo);
1175 
1176 err_free_ctx:
1177    vk_free(&device->vk.alloc, ctx);
1178 
1179    return result;
1180 }
1181 
pvr_compute_ctx_destroy(struct pvr_compute_ctx * const ctx)1182 void pvr_compute_ctx_destroy(struct pvr_compute_ctx *const ctx)
1183 {
1184    struct pvr_device *device = ctx->device;
1185 
1186    device->ws->ops->compute_ctx_destroy(ctx->ws_ctx);
1187 
1188    pvr_ctx_reset_cmd_fini(device, &ctx->reset_cmd);
1189 
1190    pvr_bo_suballoc_free(ctx->ctx_switch.sr_fence_terminate_program.pvr_bo);
1191    for (uint32_t i = 0; i < ARRAY_SIZE(ctx->ctx_switch.sr); ++i)
1192       pvr_ctx_sr_programs_cleanup(device, &ctx->ctx_switch.sr[i]);
1193 
1194    pvr_bo_free(device, ctx->ctx_switch.compute_state_bo);
1195 
1196    vk_free(&device->vk.alloc, ctx);
1197 }
1198 
/* Populates the winsys transfer context create info. Unlike the render and
 * compute variants, only the priority is needed.
 */
static void pvr_transfer_ctx_ws_create_info_init(
   enum pvr_winsys_ctx_priority priority,
   struct pvr_winsys_transfer_ctx_create_info *const create_info)
{
   create_info->priority = priority;
}
1205 
pvr_transfer_eot_shaders_init(struct pvr_device * device,struct pvr_transfer_ctx * ctx)1206 static VkResult pvr_transfer_eot_shaders_init(struct pvr_device *device,
1207                                               struct pvr_transfer_ctx *ctx)
1208 {
1209    uint64_t rt_pbe_regs[PVR_TRANSFER_MAX_RENDER_TARGETS];
1210 
1211    /* Setup start indexes of the shared registers that will contain the PBE
1212     * state words for each render target. These must match the indexes used in
1213     * pvr_pds_generate_pixel_event(), which is used to generate the
1214     * corresponding PDS program in pvr_pbe_setup_emit() via
1215     * pvr_pds_generate_pixel_event_data_segment() and
1216     * pvr_pds_generate_pixel_event_code_segment().
1217     */
1218    /* TODO: store the shared register information somewhere so that it can be
1219     * shared with pvr_pbe_setup_emit() rather than having the shared register
1220     * indexes and number of shared registers hard coded in
1221     * pvr_pds_generate_pixel_event().
1222     */
1223    for (uint32_t i = 0; i < ARRAY_SIZE(rt_pbe_regs); i++)
1224       rt_pbe_regs[i] = i * PVR_STATE_PBE_DWORDS;
1225 
1226    STATIC_ASSERT(ARRAY_SIZE(rt_pbe_regs) == ARRAY_SIZE(ctx->usc_eot_bos));
1227 
1228    for (uint32_t i = 0; i < ARRAY_SIZE(ctx->usc_eot_bos); i++) {
1229       const uint32_t cache_line_size =
1230          rogue_get_slc_cache_line_size(&device->pdevice->dev_info);
1231       const unsigned rt_count = i + 1;
1232       struct util_dynarray eot_bin;
1233       VkResult result;
1234 
1235       pvr_uscgen_tq_eot(rt_count, rt_pbe_regs, &eot_bin);
1236 
1237       result = pvr_gpu_upload_usc(device,
1238                                   util_dynarray_begin(&eot_bin),
1239                                   eot_bin.size,
1240                                   cache_line_size,
1241                                   &ctx->usc_eot_bos[i]);
1242       util_dynarray_fini(&eot_bin);
1243       if (result != VK_SUCCESS) {
1244          for (uint32_t j = 0; j < i; j++)
1245             pvr_bo_suballoc_free(ctx->usc_eot_bos[j]);
1246 
1247          return result;
1248       }
1249    }
1250 
1251    return VK_SUCCESS;
1252 }
1253 
pvr_transfer_eot_shaders_fini(struct pvr_device * device,struct pvr_transfer_ctx * ctx)1254 static void pvr_transfer_eot_shaders_fini(struct pvr_device *device,
1255                                           struct pvr_transfer_ctx *ctx)
1256 {
1257    for (uint32_t i = 0; i < ARRAY_SIZE(ctx->usc_eot_bos); i++)
1258       pvr_bo_suballoc_free(ctx->usc_eot_bos[i]);
1259 }
1260 
pvr_transfer_ctx_shaders_init(struct pvr_device * device,struct pvr_transfer_ctx * ctx)1261 static VkResult pvr_transfer_ctx_shaders_init(struct pvr_device *device,
1262                                               struct pvr_transfer_ctx *ctx)
1263 {
1264    VkResult result;
1265 
1266    result = pvr_transfer_frag_store_init(device, &ctx->frag_store);
1267    if (result != VK_SUCCESS)
1268       goto err_out;
1269 
1270    result = pvr_transfer_eot_shaders_init(device, ctx);
1271    if (result != VK_SUCCESS)
1272       goto err_frag_store_fini;
1273 
1274    return VK_SUCCESS;
1275 
1276 err_frag_store_fini:
1277    pvr_transfer_frag_store_fini(device, &ctx->frag_store);
1278 
1279 err_out:
1280    return result;
1281 }
1282 
/* Releases the transfer context's shader resources in reverse order of
 * pvr_transfer_ctx_shaders_init(): EOT shaders first, then the fragment
 * store.
 */
static void pvr_transfer_ctx_shaders_fini(struct pvr_device *device,
                                          struct pvr_transfer_ctx *ctx)
{
   pvr_transfer_eot_shaders_fini(device, ctx);
   pvr_transfer_frag_store_fini(device, &ctx->frag_store);
}
1289 
/* Creates a transfer context.
 *
 * The context is zero-allocated (the error path below relies on unwritten
 * pds_unitex_code[][].pvr_bo entries being NULL), the reset command and
 * winsys context are set up, the transfer shaders are initialized, and the
 * PDS uniform/texture state code segment array is uploaded. On failure
 * everything acquired so far is released and the error is returned. On
 * success *ctx_out receives the new context, freed with
 * pvr_transfer_ctx_destroy().
 */
VkResult pvr_transfer_ctx_create(struct pvr_device *const device,
                                 enum pvr_winsys_ctx_priority priority,
                                 struct pvr_transfer_ctx **const ctx_out)
{
   struct pvr_winsys_transfer_ctx_create_info create_info;
   struct pvr_transfer_ctx *ctx;
   VkResult result;

   ctx = vk_zalloc(&device->vk.alloc,
                   sizeof(*ctx),
                   8U,
                   VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
   if (!ctx)
      return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

   ctx->device = device;

   result = pvr_ctx_reset_cmd_init(device, &ctx->reset_cmd);
   if (result != VK_SUCCESS)
      goto err_free_ctx;

   pvr_transfer_ctx_ws_create_info_init(priority, &create_info);

   result = device->ws->ops->transfer_ctx_create(device->ws,
                                                 &create_info,
                                                 &ctx->ws_ctx);
   if (result != VK_SUCCESS)
      goto err_fini_reset_cmd;

   result = pvr_transfer_ctx_shaders_init(device, ctx);
   if (result != VK_SUCCESS)
      goto err_destroy_transfer_ctx;

   /* Create the PDS Uniform/Tex state code segment array. */
   for (uint32_t i = 0U; i < ARRAY_SIZE(ctx->pds_unitex_code); i++) {
      for (uint32_t j = 0U; j < ARRAY_SIZE(ctx->pds_unitex_code[0U]); j++) {
         /* The (0, 0) combination is not used; skip it. */
         if (i == 0U && j == 0U)
            continue;

         result = pvr_pds_unitex_state_program_create_and_upload(
            device,
            NULL,
            i,
            j,
            &ctx->pds_unitex_code[i][j]);
         if (result != VK_SUCCESS) {
            goto err_free_pds_unitex_bos;
         }
      }
   }

   *ctx_out = ctx;

   return VK_SUCCESS;

err_free_pds_unitex_bos:
   /* Only entries uploaded before the failure have a non-NULL pvr_bo
    * (ctx was zero-allocated), so this frees exactly what was created.
    */
   for (uint32_t i = 0U; i < ARRAY_SIZE(ctx->pds_unitex_code); i++) {
      for (uint32_t j = 0U; j < ARRAY_SIZE(ctx->pds_unitex_code[0U]); j++) {
         if (!ctx->pds_unitex_code[i][j].pvr_bo)
            continue;

         pvr_bo_suballoc_free(ctx->pds_unitex_code[i][j].pvr_bo);
      }
   }

   pvr_transfer_ctx_shaders_fini(device, ctx);

err_destroy_transfer_ctx:
   device->ws->ops->transfer_ctx_destroy(ctx->ws_ctx);

err_fini_reset_cmd:
   pvr_ctx_reset_cmd_fini(device, &ctx->reset_cmd);

err_free_ctx:
   vk_free(&device->vk.alloc, ctx);

   return result;
}
1368 
pvr_transfer_ctx_destroy(struct pvr_transfer_ctx * const ctx)1369 void pvr_transfer_ctx_destroy(struct pvr_transfer_ctx *const ctx)
1370 {
1371    struct pvr_device *device = ctx->device;
1372 
1373    for (uint32_t i = 0U; i < ARRAY_SIZE(ctx->pds_unitex_code); i++) {
1374       for (uint32_t j = 0U; j < ARRAY_SIZE(ctx->pds_unitex_code[0U]); j++) {
1375          if (!ctx->pds_unitex_code[i][j].pvr_bo)
1376             continue;
1377 
1378          pvr_bo_suballoc_free(ctx->pds_unitex_code[i][j].pvr_bo);
1379       }
1380    }
1381 
1382    pvr_transfer_ctx_shaders_fini(device, ctx);
1383    device->ws->ops->transfer_ctx_destroy(ctx->ws_ctx);
1384    pvr_ctx_reset_cmd_fini(device, &ctx->reset_cmd);
1385    vk_free(&device->vk.alloc, ctx);
1386 }
1387