1 /*
2  * Copyright (C) 2023 Collabora Ltd.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  */
23 
24 #include "decode.h"
25 
26 #include "drm-uapi/panthor_drm.h"
27 
28 #include "genxml/cs_builder.h"
29 #include "panfrost/lib/genxml/cs_builder.h"
30 
31 #include "pan_blitter.h"
32 #include "pan_cmdstream.h"
33 #include "pan_context.h"
34 #include "pan_csf.h"
35 #include "pan_job.h"
36 
37 #if PAN_ARCH < 10
38 #error "CSF helpers are only used for gen >= 10"
39 #endif
40 
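/*
 * cs_builder allocation callback: called whenever the command stream needs a
 * new chunk of memory. Chunks are carved out of the batch's CS chunk pool and
 * the capacity is expressed in 64-bit command words (hence the "* 8" byte
 * conversion). A NULL cookie denotes a self-contained queue that must not
 * grow, which the assert below guards against.
 */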
41 static struct cs_buffer
42 csf_alloc_cs_buffer(void *cookie)
43 {
44    assert(cookie && "Self-contained queues can't be extended.");
45 
46    struct panfrost_batch *batch = cookie;
47    unsigned capacity = 4096;
48 
49    struct panfrost_ptr ptr =
50       pan_pool_alloc_aligned(&batch->csf.cs_chunk_pool.base, capacity * 8, 64);
51 
52    return (struct cs_buffer){
53       .cpu = ptr.cpu,
54       .gpu = ptr.gpu,
55       .capacity = capacity,
56    };
57 }
58 
59 void
60 GENX(csf_cleanup_batch)(struct panfrost_batch *batch)
61 {
62    free(batch->csf.cs.builder);
63 
64    panfrost_pool_cleanup(&batch->csf.cs_chunk_pool);
65 }
66 
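/*
 * Per-batch CS setup: create the pool backing the command-stream chunks,
 * initialize a cs_builder on an initial chunk from that pool, request the
 * hardware resources the stream may use (compute/tiler/IDVS/fragment), and
 * pre-allocate the framebuffer and local-storage descriptors.
 */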
67 void
68 GENX(csf_init_batch)(struct panfrost_batch *batch)
69 {
70    struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
71 
72    /* Initialize the CS chunk pool. */
73    panfrost_pool_init(&batch->csf.cs_chunk_pool, NULL, dev, 0, 32768,
74                       "CS chunk pool", false, true);
75 
76    /* Allocate and bind the command queue */
77    struct cs_buffer queue = csf_alloc_cs_buffer(batch);
78    const struct cs_builder_conf conf = {
79       .nr_registers = 96,
80       .nr_kernel_registers = 4,
81       .alloc_buffer = csf_alloc_cs_buffer,
82       .cookie = batch,
83    };
84 
85    /* Setup the queue builder */
86    batch->csf.cs.builder = malloc(sizeof(struct cs_builder));
87    cs_builder_init(batch->csf.cs.builder, &conf, queue);
88    cs_req_res(batch->csf.cs.builder,
89               CS_COMPUTE_RES | CS_TILER_RES | CS_IDVS_RES | CS_FRAG_RES);
90 
91    /* Set up entries */
92    struct cs_builder *b = batch->csf.cs.builder;
93    cs_set_scoreboard_entry(b, 2, 0);
94 
95    batch->framebuffer = pan_pool_alloc_desc_aggregate(
96       &batch->pool.base, PAN_DESC(FRAMEBUFFER), PAN_DESC(ZS_CRC_EXTENSION),
97       PAN_DESC_ARRAY(MAX2(batch->key.nr_cbufs, 1), RENDER_TARGET));
98    batch->tls = pan_pool_alloc_desc(&batch->pool.base, LOCAL_STORAGE);
99 }
100 
101 static void
102 csf_prepare_qsubmit(struct panfrost_context *ctx,
103                     struct drm_panthor_queue_submit *submit, uint8_t queue,
104                     uint64_t cs_start, uint32_t cs_size,
105                     struct drm_panthor_sync_op *syncs, uint32_t sync_count)
106 {
107    struct panfrost_device *dev = pan_device(ctx->base.screen);
108 
109    *submit = (struct drm_panthor_queue_submit){
110       .queue_index = queue,
111       .stream_addr = cs_start,
112       .stream_size = cs_size,
113       .latest_flush = panthor_kmod_get_flush_id(dev->kmod.dev),
114       .syncs = DRM_PANTHOR_OBJ_ARRAY(sync_count, syncs),
115    };
116 }
117 
118 static void
119 csf_prepare_gsubmit(struct panfrost_context *ctx,
120                     struct drm_panthor_group_submit *gsubmit,
121                     struct drm_panthor_queue_submit *qsubmits,
122                     uint32_t qsubmit_count)
123 {
124    *gsubmit = (struct drm_panthor_group_submit){
125       .group_handle = ctx->csf.group_handle,
126       .queue_submits = DRM_PANTHOR_OBJ_ARRAY(qsubmit_count, qsubmits),
127    };
128 }
129 
130 static int
131 csf_submit_gsubmit(struct panfrost_context *ctx,
132                    struct drm_panthor_group_submit *gsubmit)
133 {
134    struct panfrost_device *dev = pan_device(ctx->base.screen);
135    int ret = 0;
136 
137    if (!ctx->is_noop) {
138       ret = drmIoctl(panfrost_device_fd(dev), DRM_IOCTL_PANTHOR_GROUP_SUBMIT,
139                      gsubmit);
140    }
141 
142    if (ret)
143       return errno;
144 
145    return 0;
146 }
147 
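/*
 * Batch epilogue: wait for all scoreboard slots, optionally store the CS
 * error status when PAN_DBG_SYNC is set, issue a synchronous cache clean,
 * and seal the command stream so it can be submitted.
 */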
148 static void
149 csf_emit_batch_end(struct panfrost_batch *batch)
150 {
151    struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
152    struct cs_builder *b = batch->csf.cs.builder;
153 
154    /* Barrier to let everything finish */
155    cs_wait_slots(b, BITFIELD_MASK(8), false);
156 
157    if (dev->debug & PAN_DBG_SYNC) {
158       /* Get the CS state */
159       batch->csf.cs.state = pan_pool_alloc_aligned(&batch->pool.base, 8, 8);
160       memset(batch->csf.cs.state.cpu, ~0, 8);
161       cs_move64_to(b, cs_reg64(b, 90), batch->csf.cs.state.gpu);
162       cs_store_state(b, cs_reg64(b, 90), 0, MALI_CS_STATE_ERROR_STATUS,
163                      cs_now());
164    }
165 
166    /* Flush caches now that we're done (synchronous) */
167    struct cs_index flush_id = cs_reg32(b, 74);
168    cs_move32_to(b, flush_id, 0);
169    cs_flush_caches(b, MALI_CS_FLUSH_MODE_CLEAN, MALI_CS_FLUSH_MODE_CLEAN, true,
170                    flush_id, cs_defer(0, 0));
171    cs_wait_slot(b, 0, false);
172 
173    /* Finish the command stream */
174    assert(cs_is_valid(batch->csf.cs.builder));
175    cs_finish(batch->csf.cs.builder);
176 }
177 
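/*
 * Collect the wait operations for a submission: one wait per sync point
 * attached to an external BO (folded into a single wait on the VM timeline
 * when the BO uses the shared VM syncobj), plus a wait on the imported
 * in-fence if the context holds one.
 */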
178 static int
179 csf_submit_collect_wait_ops(struct panfrost_batch *batch,
180                             struct util_dynarray *syncops,
181                             uint32_t vm_sync_handle)
182 {
183    struct panfrost_context *ctx = batch->ctx;
184    struct panfrost_device *dev = pan_device(ctx->base.screen);
185    uint64_t vm_sync_wait_point = 0, bo_sync_point;
186    uint32_t bo_sync_handle;
187    int ret;
188 
189    /* We don't wait on BOs attached to the various batch pools, because those
190     * are private to the batch, and are guaranteed to be idle at allocation
191     * time. We need to iterate over other BOs accessed by the batch though,
192     * to add the corresponding wait operations.
193     */
194    util_dynarray_foreach(&batch->bos, pan_bo_access, ptr) {
195       unsigned i = ptr - util_dynarray_element(&batch->bos, pan_bo_access, 0);
196       pan_bo_access flags = *ptr;
197 
198       if (!flags)
199          continue;
200 
201       /* Update the BO access flags so that panfrost_bo_wait() knows
202        * about all pending accesses.
203        * We only keep the READ/WRITE info since this is all the BO
204        * wait logic cares about.
205        * We also preserve existing flags as this batch might not
206        * be the first one to access the BO.
207        */
208       struct panfrost_bo *bo = pan_lookup_bo(dev, i);
209 
210       ret = panthor_kmod_bo_get_sync_point(bo->kmod_bo, &bo_sync_handle,
211                                            &bo_sync_point,
212                                            !(flags & PAN_BO_ACCESS_WRITE));
213       if (ret)
214          return ret;
215 
216       if (bo_sync_handle == vm_sync_handle) {
217          vm_sync_wait_point = MAX2(vm_sync_wait_point, bo_sync_point);
218          continue;
219       }
220 
221       assert(bo_sync_point == 0 || !bo->kmod_bo->exclusive_vm);
222 
223       struct drm_panthor_sync_op waitop = {
224          .flags =
225             DRM_PANTHOR_SYNC_OP_WAIT |
226             (bo_sync_point ? DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_TIMELINE_SYNCOBJ
227                            : DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_SYNCOBJ),
228          .handle = bo_sync_handle,
229          .timeline_value = bo_sync_point,
230       };
231 
232       util_dynarray_append(syncops, struct drm_panthor_sync_op, waitop);
233    }
234 
235    if (vm_sync_wait_point > 0) {
236       struct drm_panthor_sync_op waitop = {
237          .flags = DRM_PANTHOR_SYNC_OP_WAIT |
238                   DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_TIMELINE_SYNCOBJ,
239          .handle = vm_sync_handle,
240          .timeline_value = vm_sync_wait_point,
241       };
242 
243       util_dynarray_append(syncops, struct drm_panthor_sync_op, waitop);
244    }
245 
246    if (ctx->in_sync_fd >= 0) {
247       ret = drmSyncobjImportSyncFile(panfrost_device_fd(dev), ctx->in_sync_obj,
248                                      ctx->in_sync_fd);
249       if (ret)
250          return ret;
251 
252       struct drm_panthor_sync_op waitop = {
253          .flags =
254             DRM_PANTHOR_SYNC_OP_WAIT | DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_SYNCOBJ,
255          .handle = ctx->in_sync_obj,
256       };
257 
258       util_dynarray_append(syncops, struct drm_panthor_sync_op, waitop);
259 
260       close(ctx->in_sync_fd);
261       ctx->in_sync_fd = -1;
262    }
263 
264    return 0;
265 }
266 
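/*
 * After a successful submission, attach the VM timeline point that the job
 * will signal to every BO touched by the batch (pool BOs, CS chunks and
 * user-visible resources), then mirror that point into the context syncobj.
 */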
267 static int
268 csf_attach_sync_points(struct panfrost_batch *batch, uint32_t vm_sync_handle,
269                        uint64_t vm_sync_signal_point)
270 {
271    struct panfrost_context *ctx = batch->ctx;
272    struct panfrost_device *dev = pan_device(ctx->base.screen);
273    int ret;
274 
275    /* There should be no invisible allocation on CSF. */
276    assert(batch->invisible_pool.bos.size == 0);
277 
278    /* Attach sync points to batch-private BOs first. We assume BOs can
279     * be written by the GPU to keep things simple.
280     */
281    util_dynarray_foreach(&batch->pool.bos, struct panfrost_bo *, bo) {
282       (*bo)->gpu_access |= PAN_BO_ACCESS_RW;
283       ret = panthor_kmod_bo_attach_sync_point((*bo)->kmod_bo, vm_sync_handle,
284                                               vm_sync_signal_point, true);
285       if (ret)
286          return ret;
287    }
288 
289    util_dynarray_foreach(&batch->csf.cs_chunk_pool.bos, struct panfrost_bo *,
290                          bo) {
291       (*bo)->gpu_access |= PAN_BO_ACCESS_RW;
292       ret = panthor_kmod_bo_attach_sync_point((*bo)->kmod_bo, vm_sync_handle,
293                                               vm_sync_signal_point, true);
294       if (ret)
295          return ret;
296    }
297 
298    /* Attach the VM sync point to all resources accessed by the batch. */
299    util_dynarray_foreach(&batch->bos, pan_bo_access, ptr) {
300       unsigned i = ptr - util_dynarray_element(&batch->bos, pan_bo_access, 0);
301       pan_bo_access flags = *ptr;
302 
303       if (!flags)
304          continue;
305 
306       struct panfrost_bo *bo = pan_lookup_bo(dev, i);
307 
308       bo->gpu_access |= flags & (PAN_BO_ACCESS_RW);
309       ret = panthor_kmod_bo_attach_sync_point(bo->kmod_bo, vm_sync_handle,
310                                               vm_sync_signal_point,
311                                               flags & PAN_BO_ACCESS_WRITE);
312       if (ret)
313          return ret;
314    }
315 
316    /* And finally transfer the VM sync point to the context syncobj. */
317    return drmSyncobjTransfer(panfrost_device_fd(dev), ctx->syncobj, 0,
318                              vm_sync_handle, vm_sync_signal_point, 0);
319 }
320 
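/*
 * Called when a group submit fails: query the group state and, if the group
 * is in an error state, re-create the context. If the VM itself became
 * unusable there is nothing we can do, since it is shared by all contexts.
 */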
321 static void
322 csf_check_ctx_state_and_reinit(struct panfrost_context *ctx)
323 {
324    struct panfrost_device *dev = pan_device(ctx->base.screen);
325    struct drm_panthor_group_get_state state = {
326       .group_handle = ctx->csf.group_handle,
327    };
328    int ret;
329 
330    ret = drmIoctl(panfrost_device_fd(dev), DRM_IOCTL_PANTHOR_GROUP_GET_STATE,
331                   &state);
332    if (ret) {
333       mesa_loge("DRM_IOCTL_PANTHOR_GROUP_GET_STATE failed (err=%d)", errno);
334       return;
335    }
336 
337    /* Context is still usable. This was a transient error. */
338    if (state.state == 0)
339       return;
340 
341    /* If the VM is unusable, we can't do much, as this is shared between all
342     * contexts, and restoring the VM state is non-trivial.
343     */
344    if (pan_kmod_vm_query_state(dev->kmod.vm) != PAN_KMOD_VM_USABLE) {
345       mesa_loge("VM became unusable, we can't reset the context");
346       assert(!"VM became unusable, we can't reset the context");
347    }
348 
349    panfrost_context_reinit(ctx);
350 }
351 
352 static void
353 csf_submit_wait_and_dump(struct panfrost_batch *batch,
354                          const struct drm_panthor_group_submit *gsubmit,
355                          uint32_t vm_sync_handle, uint64_t vm_sync_signal_point)
356 {
357    struct panfrost_context *ctx = batch->ctx;
358    struct panfrost_device *dev = pan_device(ctx->base.screen);
359    bool wait = (dev->debug & (PAN_DBG_TRACE | PAN_DBG_SYNC)) && !ctx->is_noop;
360    bool dump = (dev->debug & PAN_DBG_TRACE);
361    bool crash = false;
362 
363    if (!wait && !dump)
364       return;
365 
366    /* Wait so we can get errors reported back */
367    if (wait) {
368       int ret =
369          drmSyncobjTimelineWait(panfrost_device_fd(dev), &vm_sync_handle,
370                                 &vm_sync_signal_point, 1, INT64_MAX, 0, NULL);
371       assert(ret >= 0);
372    }
373 
374    /* Jobs won't be complete if we're blackhole rendering; that's ok */
375    if (!ctx->is_noop && (dev->debug & PAN_DBG_SYNC) &&
376        *((uint64_t *)batch->csf.cs.state.cpu) != 0) {
377       crash = true;
378       dump = true;
379    }
380 
381    if (dump) {
382       const struct drm_panthor_queue_submit *qsubmits =
383          (void *)(uintptr_t)gsubmit->queue_submits.array;
384 
385       for (unsigned i = 0; i < gsubmit->queue_submits.count; i++) {
386          uint32_t regs[256] = {0};
387          pandecode_cs(dev->decode_ctx, qsubmits[i].stream_addr,
388                       qsubmits[i].stream_size, panfrost_device_gpu_id(dev),
389                       regs);
390       }
391 
392       if (dev->debug & PAN_DBG_DUMP)
393          pandecode_dump_mappings(dev->decode_ctx);
394    }
395 
396    if (crash) {
397       fprintf(stderr, "Incomplete job or timeout\n");
398       fflush(NULL);
399       abort();
400    }
401 }
402 
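/*
 * Batch submission flow: close the command stream, collect wait operations
 * for external BOs and the in-fence, reserve the next point on the VM sync
 * timeline, submit the group, then either attach the new sync point to the
 * batch BOs or, on failure, check whether the context needs to be re-created.
 */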
403 int
404 GENX(csf_submit_batch)(struct panfrost_batch *batch)
405 {
406    /* Close the batch before submitting. */
407    csf_emit_batch_end(batch);
408 
409    uint64_t cs_start = cs_root_chunk_gpu_addr(batch->csf.cs.builder);
410    uint32_t cs_size = cs_root_chunk_size(batch->csf.cs.builder);
411    struct panfrost_context *ctx = batch->ctx;
412    struct panfrost_device *dev = pan_device(ctx->base.screen);
413    uint32_t vm_sync_handle = panthor_kmod_vm_sync_handle(dev->kmod.vm);
414    struct util_dynarray syncops;
415    int ret;
416 
417    util_dynarray_init(&syncops, NULL);
418 
419    ret = csf_submit_collect_wait_ops(batch, &syncops, vm_sync_handle);
420    if (ret)
421       goto out_free_syncops;
422 
423    uint64_t vm_sync_cur_point = panthor_kmod_vm_sync_lock(dev->kmod.vm);
424    uint64_t vm_sync_signal_point = vm_sync_cur_point + 1;
425 
426    struct drm_panthor_sync_op signalop = {
427       .flags = DRM_PANTHOR_SYNC_OP_SIGNAL |
428                DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_TIMELINE_SYNCOBJ,
429       .handle = vm_sync_handle,
430       .timeline_value = vm_sync_signal_point,
431    };
432 
433    util_dynarray_append(&syncops, struct drm_panthor_sync_op, signalop);
434 
435    struct drm_panthor_queue_submit qsubmit;
436    struct drm_panthor_group_submit gsubmit;
437 
438    csf_prepare_qsubmit(
439       ctx, &qsubmit, 0, cs_start, cs_size, util_dynarray_begin(&syncops),
440       util_dynarray_num_elements(&syncops, struct drm_panthor_sync_op));
441    csf_prepare_gsubmit(ctx, &gsubmit, &qsubmit, 1);
442    ret = csf_submit_gsubmit(ctx, &gsubmit);
443    panthor_kmod_vm_sync_unlock(dev->kmod.vm,
444                                ret ? vm_sync_cur_point : vm_sync_signal_point);
445 
446    if (!ret) {
447       csf_submit_wait_and_dump(batch, &gsubmit, vm_sync_handle,
448                                vm_sync_signal_point);
449       ret = csf_attach_sync_points(batch, vm_sync_handle, vm_sync_signal_point);
450    } else {
451       csf_check_ctx_state_and_reinit(batch->ctx);
452    }
453 
454 out_free_syncops:
455    util_dynarray_fini(&syncops);
456    return ret;
457 }
458 
459 void
460 GENX(csf_preload_fb)(struct panfrost_batch *batch, struct pan_fb_info *fb)
461 {
462    struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
463 
464    GENX(pan_preload_fb)
465    (&dev->blitter, &batch->pool.base, fb, 0, batch->tls.gpu, NULL);
466 }
467 
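/*
 * Emit the fragment part of the batch: end tiling if any draws were
 * recorded, point r40 at the framebuffer descriptor, load the bounding box
 * into r42/r43, kick the fragment job, then return the heap chunks freed by
 * rendering to the tiler heap context.
 */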
468 void
469 GENX(csf_emit_fragment_job)(struct panfrost_batch *batch,
470                             const struct pan_fb_info *pfb)
471 {
472    struct cs_builder *b = batch->csf.cs.builder;
473 
474    if (batch->draw_count > 0) {
475       /* Finish tiling and wait for IDVS and tiling */
476       cs_finish_tiling(b, false);
477       cs_wait_slot(b, 2, false);
478       cs_vt_end(b, cs_now());
479    }
480 
481    /* Set up the fragment job */
482    cs_move64_to(b, cs_reg64(b, 40), batch->framebuffer.gpu);
483    cs_move32_to(b, cs_reg32(b, 42), (batch->miny << 16) | batch->minx);
484    cs_move32_to(b, cs_reg32(b, 43),
485                 ((batch->maxy - 1) << 16) | (batch->maxx - 1));
486 
487    /* Run the fragment job and wait */
488    cs_run_fragment(b, false, MALI_TILE_RENDER_ORDER_Z_ORDER, false);
489    cs_wait_slot(b, 2, false);
490 
491    /* Gather freed heap chunks and add them to the heap context free list
492     * so they can be re-used next time the tiler heap runs out of chunks.
493     * That's what cs_finish_fragment() is all about. The list of freed
494     * chunks is in the tiler context descriptor
495     * (completed_{top,bottom} fields). */
496    if (batch->draw_count > 0) {
497       assert(batch->tiler_ctx.valhall.desc);
498       cs_move64_to(b, cs_reg64(b, 90), batch->tiler_ctx.valhall.desc);
499       cs_load_to(b, cs_reg_tuple(b, 86, 4), cs_reg64(b, 90), BITFIELD_MASK(4),
500                  40);
501       cs_wait_slot(b, 0, false);
502       cs_finish_fragment(b, true, cs_reg64(b, 86), cs_reg64(b, 88), cs_now());
503    }
504 }
505 
506 static void
507 csf_emit_shader_regs(struct panfrost_batch *batch, enum pipe_shader_type stage,
508                      mali_ptr shader)
509 {
510    mali_ptr resources = panfrost_emit_resources(batch, stage);
511 
512    assert(stage == PIPE_SHADER_VERTEX || stage == PIPE_SHADER_FRAGMENT ||
513           stage == PIPE_SHADER_COMPUTE);
514 
515    unsigned offset = (stage == PIPE_SHADER_FRAGMENT) ? 4 : 0;
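   /* Vertex/compute state lands in registers 0/8/16 (resource table, FAU
    * pointer, shader program), fragment state in 4/12/20, hence the register
    * offset. nr_push_uniforms counts 32-bit words while FAUs are packed as
    * 64-bit slots, so the count is halved and then folded into the top bits
    * of the FAU pointer (<< 56) below.
    */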
516    unsigned fau_count = DIV_ROUND_UP(batch->nr_push_uniforms[stage], 2);
517 
518    struct cs_builder *b = batch->csf.cs.builder;
519    cs_move64_to(b, cs_reg64(b, 0 + offset), resources);
520    cs_move64_to(b, cs_reg64(b, 8 + offset),
521                 batch->push_uniforms[stage] | ((uint64_t)fau_count << 56));
522    cs_move64_to(b, cs_reg64(b, 16 + offset), shader);
523 }
524 
525 void
526 GENX(csf_launch_grid)(struct panfrost_batch *batch,
527                       const struct pipe_grid_info *info)
528 {
529    /* Empty compute programs are invalid and don't make sense */
530    if (batch->rsd[PIPE_SHADER_COMPUTE] == 0)
531       return;
532 
533    struct panfrost_context *ctx = batch->ctx;
534    struct panfrost_device *dev = pan_device(ctx->base.screen);
535    struct panfrost_compiled_shader *cs = ctx->prog[PIPE_SHADER_COMPUTE];
536    struct cs_builder *b = batch->csf.cs.builder;
537 
538    csf_emit_shader_regs(batch, PIPE_SHADER_COMPUTE,
539                         batch->rsd[PIPE_SHADER_COMPUTE]);
540 
541    cs_move64_to(b, cs_reg64(b, 24), batch->tls.gpu);
542 
543    /* Global attribute offset */
544    cs_move32_to(b, cs_reg32(b, 32), 0);
545 
546    /* Compute workgroup size */
547    uint32_t wg_size[4];
548    pan_pack(wg_size, COMPUTE_SIZE_WORKGROUP, cfg) {
549       cfg.workgroup_size_x = info->block[0];
550       cfg.workgroup_size_y = info->block[1];
551       cfg.workgroup_size_z = info->block[2];
552 
553       /* Workgroups may be merged if the shader does not use barriers
554        * or shared memory. This condition is checked against the
555        * static shared_size at compile-time. We need to check the
556        * variable shared size at launch_grid time, because the
557        * compiler doesn't know about that.
558        */
559       cfg.allow_merging_workgroups = cs->info.cs.allow_merging_workgroups &&
560                                      (info->variable_shared_mem == 0);
561    }
562 
563    cs_move32_to(b, cs_reg32(b, 33), wg_size[0]);
564 
565    /* Offset */
566    for (unsigned i = 0; i < 3; ++i)
567       cs_move32_to(b, cs_reg32(b, 34 + i), 0);
568 
569    unsigned threads_per_wg = info->block[0] * info->block[1] * info->block[2];
570    unsigned max_thread_cnt = panfrost_compute_max_thread_count(
571       &dev->kmod.props, cs->info.work_reg_count);
572 
573    if (info->indirect) {
574       /* Load size in workgroups per dimension from memory */
575       struct cs_index address = cs_reg64(b, 64);
576       cs_move64_to(
577          b, address,
578          pan_resource(info->indirect)->image.data.base + info->indirect_offset);
579 
580       struct cs_index grid_xyz = cs_reg_tuple(b, 37, 3);
581       cs_load_to(b, grid_xyz, address, BITFIELD_MASK(3), 0);
582 
583       /* Wait for the load */
584       cs_wait_slot(b, 0, false);
585 
586       /* Copy to FAU */
587       for (unsigned i = 0; i < 3; ++i) {
588          if (batch->num_wg_sysval[i]) {
589             cs_move64_to(b, address, batch->num_wg_sysval[i]);
590             cs_store(b, cs_extract32(b, grid_xyz, i), address, BITFIELD_MASK(1),
591                      0);
592          }
593       }
594 
595       /* Wait for the stores */
596       cs_wait_slot(b, 0, false);
597 
598       cs_run_compute_indirect(b, DIV_ROUND_UP(max_thread_cnt, threads_per_wg),
599                               false, cs_shader_res_sel(0, 0, 0, 0));
600    } else {
601       /* Set size in workgroups per dimension immediately */
602       for (unsigned i = 0; i < 3; ++i)
603          cs_move32_to(b, cs_reg32(b, 37 + i), info->grid[i]);
604 
605       /* Pick the task_axis and task_increment to maximize thread utilization. */
606       unsigned task_axis = MALI_TASK_AXIS_X;
607       unsigned threads_per_task = threads_per_wg;
608       unsigned task_increment = 0;
609 
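      /* Worked example (hypothetical numbers): with an 8x8x1 workgroup
       * (threads_per_wg = 64), max_thread_cnt = 512 and grid = (16, 16, 1),
       * the first iteration sees 64 * 16 >= 512, so the task axis stays X
       * and task_increment = 512 / 64 = 8 workgroups per task.
       */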
610       for (unsigned i = 0; i < 3; i++) {
611          if (threads_per_task * info->grid[i] >= max_thread_cnt) {
612             /* We reached our thread limit; stop at the current axis and
613              * calculate the increment so it doesn't exceed the per-core
614              * thread capacity.
615              */
616             task_increment = max_thread_cnt / threads_per_task;
617             break;
618          } else if (task_axis == MALI_TASK_AXIS_Z) {
619             /* We reached the Z axis, and there's still room to stuff more
620              * threads. Pick the current axis grid size as our increment
621              * as there's no point using something bigger.
622              */
623             task_increment = info->grid[i];
624             break;
625          }
626 
627          threads_per_task *= info->grid[i];
628          task_axis++;
629       }
630 
631       assert(task_axis <= MALI_TASK_AXIS_Z);
632       assert(task_increment > 0);
633       cs_run_compute(b, task_increment, task_axis, false,
634                      cs_shader_res_sel(0, 0, 0, 0));
635    }
636 }
637 
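/*
 * Transform feedback is emitted as a compute-style dispatch: one invocation
 * per captured vertex (count in r37) and instance_count instances (r38),
 * reusing whatever vertex-stage shader and resources are currently bound to
 * the batch.
 */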
638 void
639 GENX(csf_launch_xfb)(struct panfrost_batch *batch,
640                      const struct pipe_draw_info *info, unsigned count)
641 {
642    struct cs_builder *b = batch->csf.cs.builder;
643 
644    cs_move64_to(b, cs_reg64(b, 24), batch->tls.gpu);
645 
646    /* TODO: Indexing. Also, attribute_offset is a legacy feature.. */
647    cs_move32_to(b, cs_reg32(b, 32), batch->ctx->offset_start);
648 
649    /* Compute workgroup size */
650    uint32_t wg_size[4];
651    pan_pack(wg_size, COMPUTE_SIZE_WORKGROUP, cfg) {
652       cfg.workgroup_size_x = 1;
653       cfg.workgroup_size_y = 1;
654       cfg.workgroup_size_z = 1;
655 
656       /* Transform feedback shaders do not use barriers or
657        * shared memory, so we may merge workgroups.
658        */
659       cfg.allow_merging_workgroups = true;
660    }
661    cs_move32_to(b, cs_reg32(b, 33), wg_size[0]);
662 
663    /* Offset */
664    for (unsigned i = 0; i < 3; ++i)
665       cs_move32_to(b, cs_reg32(b, 34 + i), 0);
666 
667    cs_move32_to(b, cs_reg32(b, 37), count);
668    cs_move32_to(b, cs_reg32(b, 38), info->instance_count);
669    cs_move32_to(b, cs_reg32(b, 39), 1);
670 
671    csf_emit_shader_regs(batch, PIPE_SHADER_VERTEX,
672                         batch->rsd[PIPE_SHADER_VERTEX]);
673    /* force a barrier to avoid read/write sync issues with buffers */
674    cs_wait_slot(b, 2, false);
675 
676    /* XXX: Choose correctly */
677    cs_run_compute(b, 1, MALI_TASK_AXIS_Z, false, cs_shader_res_sel(0, 0, 0, 0));
678 }
679 
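/*
 * Lazily allocate and fill the Valhall TILER_CONTEXT descriptor for this
 * batch; the result is cached in batch->tiler_ctx.valhall.desc so every draw
 * in the batch shares the same tiler context.
 */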
680 static mali_ptr
681 csf_get_tiler_desc(struct panfrost_batch *batch)
682 {
683    struct panfrost_context *ctx = batch->ctx;
684    struct panfrost_device *dev = pan_device(ctx->base.screen);
685 
686    if (batch->tiler_ctx.valhall.desc)
687       return batch->tiler_ctx.valhall.desc;
688 
689    struct panfrost_ptr t =
690       pan_pool_alloc_desc(&batch->pool.base, TILER_CONTEXT);
691    pan_pack(t.cpu, TILER_CONTEXT, tiler) {
692       unsigned max_levels = dev->tiler_features.max_levels;
693       assert(max_levels >= 2);
694 
695       /* TODO: Select hierarchy mask more effectively */
696       tiler.hierarchy_mask = (max_levels >= 8) ? 0xFF : 0x28;
697 
698       /* For large framebuffers, disable the smallest bin size to
699        * avoid pathological tiler memory usage. Required to avoid OOM
700        * on dEQP-GLES31.functional.fbo.no_attachments.maximums.all on
701        * Mali-G57.
702        */
703       if (MAX2(batch->key.width, batch->key.height) >= 4096)
704          tiler.hierarchy_mask &= ~1;
705 
706       tiler.fb_width = batch->key.width;
707       tiler.fb_height = batch->key.height;
708       tiler.heap = batch->ctx->csf.heap.desc_bo->ptr.gpu;
709       tiler.sample_pattern =
710          pan_sample_pattern(util_framebuffer_get_num_samples(&batch->key));
711       tiler.first_provoking_vertex =
712          pan_tristate_get(batch->first_provoking_vertex);
713       tiler.geometry_buffer = ctx->csf.tmp_geom_bo->ptr.gpu;
714       tiler.geometry_buffer_size = ctx->csf.tmp_geom_bo->kmod_bo->size;
715    }
716 
717    batch->tiler_ctx.valhall.desc = t.gpu;
718    return batch->tiler_ctx.valhall.desc;
719 }
720 
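/*
 * Emit the per-draw state: shader pointers and FAUs, TLS, tiler, scissor,
 * depth/blend descriptors and the DCD flag words. Returns a PRIMITIVE_FLAGS
 * override word that the csf_launch_draw*() callers pass to cs_run_idvs().
 */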
721 static uint32_t
722 csf_emit_draw_state(struct panfrost_batch *batch,
723                     const struct pipe_draw_info *info, unsigned drawid_offset)
724 {
725    struct panfrost_context *ctx = batch->ctx;
726    struct panfrost_compiled_shader *vs = ctx->prog[PIPE_SHADER_VERTEX];
727    struct panfrost_compiled_shader *fs = ctx->prog[PIPE_SHADER_FRAGMENT];
728 
729    bool idvs = vs->info.vs.idvs;
730    bool fs_required = panfrost_fs_required(
731       fs, ctx->blend, &ctx->pipe_framebuffer, ctx->depth_stencil);
732    bool secondary_shader = vs->info.vs.secondary_enable && fs_required;
733 
734    assert(idvs && "IDVS required for CSF");
735 
736    struct cs_builder *b = batch->csf.cs.builder;
737 
738    if (batch->draw_count == 0)
739       cs_vt_start(batch->csf.cs.builder, cs_now());
740 
741    csf_emit_shader_regs(batch, PIPE_SHADER_VERTEX,
742                         panfrost_get_position_shader(batch, info));
743 
744    if (fs_required) {
745       csf_emit_shader_regs(batch, PIPE_SHADER_FRAGMENT,
746                            batch->rsd[PIPE_SHADER_FRAGMENT]);
747    } else {
748       cs_move64_to(b, cs_reg64(b, 4), 0);
749       cs_move64_to(b, cs_reg64(b, 12), 0);
750       cs_move64_to(b, cs_reg64(b, 20), 0);
751    }
752 
753    if (secondary_shader) {
754       cs_move64_to(b, cs_reg64(b, 18), panfrost_get_varying_shader(batch));
755    }
756 
757    cs_move64_to(b, cs_reg64(b, 24), batch->tls.gpu);
758    cs_move64_to(b, cs_reg64(b, 30), batch->tls.gpu);
759    cs_move32_to(b, cs_reg32(b, 32), 0);
760    cs_move32_to(b, cs_reg32(b, 37), 0);
761    cs_move32_to(b, cs_reg32(b, 38), 0);
762 
763    cs_move64_to(b, cs_reg64(b, 40), csf_get_tiler_desc(batch));
764 
765    STATIC_ASSERT(sizeof(batch->scissor) == pan_size(SCISSOR));
766    STATIC_ASSERT(sizeof(uint64_t) == pan_size(SCISSOR));
767    uint64_t *sbd = (uint64_t *)&batch->scissor[0];
768    cs_move64_to(b, cs_reg64(b, 42), *sbd);
769 
770    cs_move32_to(b, cs_reg32(b, 44), fui(batch->minimum_z));
771    cs_move32_to(b, cs_reg32(b, 45), fui(batch->maximum_z));
772 
773    if (ctx->occlusion_query && ctx->active_queries) {
774       struct panfrost_resource *rsrc = pan_resource(ctx->occlusion_query->rsrc);
775       cs_move64_to(b, cs_reg64(b, 46), rsrc->image.data.base);
776       panfrost_batch_write_rsrc(ctx->batch, rsrc, PIPE_SHADER_FRAGMENT);
777    }
778 
779    cs_move32_to(b, cs_reg32(b, 48), panfrost_vertex_attribute_stride(vs, fs));
780    cs_move64_to(b, cs_reg64(b, 50),
781                 batch->blend | MAX2(batch->key.nr_cbufs, 1));
782    cs_move64_to(b, cs_reg64(b, 52), batch->depth_stencil);
783 
784    if (info->index_size)
785       cs_move64_to(b, cs_reg64(b, 54), batch->indices);
786 
787    struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
788 
789    uint32_t primitive_flags = 0;
790    pan_pack(&primitive_flags, PRIMITIVE_FLAGS, cfg) {
791       if (panfrost_writes_point_size(ctx))
792          cfg.point_size_array_format = MALI_POINT_SIZE_ARRAY_FORMAT_FP16;
793 
794       cfg.allow_rotating_primitives = allow_rotating_primitives(fs, info);
795 
796       cfg.low_depth_cull = rast->depth_clip_near;
797       cfg.high_depth_cull = rast->depth_clip_far;
798 
799       /* Non-fixed restart indices should have been lowered */
800       assert(!cfg.primitive_restart || panfrost_is_implicit_prim_restart(info));
801       cfg.primitive_restart = info->primitive_restart;
802 
803       cfg.position_fifo_format = panfrost_writes_point_size(ctx)
804                                     ? MALI_FIFO_FORMAT_EXTENDED
805                                     : MALI_FIFO_FORMAT_BASIC;
806    }
807 
808    cs_move32_to(b, cs_reg32(b, 56), primitive_flags);
809 
810    uint32_t dcd_flags0 = 0, dcd_flags1 = 0;
811    pan_pack(&dcd_flags0, DCD_FLAGS_0, cfg) {
812       enum mesa_prim reduced_mode = u_reduced_prim(info->mode);
813       bool polygon = reduced_mode == MESA_PRIM_TRIANGLES;
814       bool lines = reduced_mode == MESA_PRIM_LINES;
815 
816       /*
817        * From the Gallium documentation,
818        * pipe_rasterizer_state::cull_face "indicates which faces of
819        * polygons to cull". Points and lines are not considered
820        * polygons and should be drawn even if all faces are culled.
821        * The hardware does not take primitive type into account when
822        * culling, so we need to do that check ourselves.
823        */
824       cfg.cull_front_face = polygon && (rast->cull_face & PIPE_FACE_FRONT);
825       cfg.cull_back_face = polygon && (rast->cull_face & PIPE_FACE_BACK);
826       cfg.front_face_ccw = rast->front_ccw;
827 
828       cfg.multisample_enable = rast->multisample;
829 
830       /* Use per-sample shading if required by the API. Also use it when a
831        * blend shader is used with multisampling, as this is handled
832        * by a single ST_TILE in the blend shader with the current
833        * sample ID, requiring per-sample shading.
834        */
835       cfg.evaluate_per_sample =
836          (rast->multisample &&
837           ((ctx->min_samples > 1) || ctx->valhall_has_blend_shader));
838 
839       cfg.single_sampled_lines = !rast->multisample;
840 
841       if (lines && rast->line_smooth) {
842          cfg.multisample_enable = true;
843          cfg.single_sampled_lines = false;
844       }
845 
846       bool has_oq = ctx->occlusion_query && ctx->active_queries;
847       if (has_oq) {
848          if (ctx->occlusion_query->type == PIPE_QUERY_OCCLUSION_COUNTER)
849             cfg.occlusion_query = MALI_OCCLUSION_MODE_COUNTER;
850          else
851             cfg.occlusion_query = MALI_OCCLUSION_MODE_PREDICATE;
852       }
853 
854       if (fs_required) {
855          struct pan_earlyzs_state earlyzs = pan_earlyzs_get(
856             fs->earlyzs, ctx->depth_stencil->writes_zs || has_oq,
857             ctx->blend->base.alpha_to_coverage,
858             ctx->depth_stencil->zs_always_passes);
859 
860          cfg.pixel_kill_operation = earlyzs.kill;
861          cfg.zs_update_operation = earlyzs.update;
862 
863          cfg.allow_forward_pixel_to_kill =
864             pan_allow_forward_pixel_to_kill(ctx, fs);
865          cfg.allow_forward_pixel_to_be_killed = !fs->info.writes_global;
866 
867          cfg.overdraw_alpha0 = panfrost_overdraw_alpha(ctx, 0);
868          cfg.overdraw_alpha1 = panfrost_overdraw_alpha(ctx, 1);
869 
870          /* Also use per-sample shading if required by the shader
871           */
872          cfg.evaluate_per_sample |= fs->info.fs.sample_shading;
873 
874          /* Unlike Bifrost, alpha-to-coverage must be included in
875           * this identically-named flag. Confusing, isn't it?
876           */
877          cfg.shader_modifies_coverage = fs->info.fs.writes_coverage ||
878                                         fs->info.fs.can_discard ||
879                                         ctx->blend->base.alpha_to_coverage;
880 
881          cfg.alpha_to_coverage = ctx->blend->base.alpha_to_coverage;
882       } else {
883          /* These operations need to be FORCE to benefit from the
884           * depth-only pass optimizations.
885           */
886          cfg.pixel_kill_operation = MALI_PIXEL_KILL_FORCE_EARLY;
887          cfg.zs_update_operation = MALI_PIXEL_KILL_FORCE_EARLY;
888 
889          /* No shader and no blend => no shader or blend
890           * reasons to disable FPK. The only FPK-related state
891           * not covered is alpha-to-coverage which we don't set
892           * without blend.
893           */
894          cfg.allow_forward_pixel_to_kill = true;
895 
896          /* No shader => no shader side effects */
897          cfg.allow_forward_pixel_to_be_killed = true;
898 
899          /* Alpha isn't written so these are vacuous */
900          cfg.overdraw_alpha0 = true;
901          cfg.overdraw_alpha1 = true;
902       }
903    }
904 
905    pan_pack(&dcd_flags1, DCD_FLAGS_1, cfg) {
906       cfg.sample_mask = rast->multisample ? ctx->sample_mask : 0xFFFF;
907 
908       if (fs_required) {
909          /* See JM Valhall equivalent code */
910          cfg.render_target_mask =
911             (fs->info.outputs_written >> FRAG_RESULT_DATA0) & ctx->fb_rt_mask;
912       }
913    }
914 
915    cs_move32_to(b, cs_reg32(b, 57), dcd_flags0);
916    cs_move32_to(b, cs_reg32(b, 58), dcd_flags1);
917 
918    uint64_t primsize = 0;
919    panfrost_emit_primitive_size(ctx, info->mode == MESA_PRIM_POINTS, 0,
920                                 &primsize);
921    cs_move64_to(b, cs_reg64(b, 60), primsize);
922 
923    uint32_t flags_override;
924    /* Pack with nodefaults so only explicitly set override fields affect the
925     * previously set register values */
926    pan_pack_nodefaults(&flags_override, PRIMITIVE_FLAGS, cfg) {
927       cfg.draw_mode = pan_draw_mode(info->mode);
928       cfg.index_type = panfrost_translate_index_size(info->index_size);
929       cfg.secondary_shader = secondary_shader;
930    };
931 
932    return flags_override;
933 }
934 
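/*
 * Load the draw ID into a register when the vertex shader reads gl_DrawID;
 * return cs_undef() otherwise.
 */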
935 static struct cs_index
936 csf_emit_draw_id_register(struct panfrost_batch *batch, unsigned offset)
937 {
938    struct cs_builder *b = batch->csf.cs.builder;
939    struct panfrost_context *ctx = batch->ctx;
940    struct panfrost_uncompiled_shader *vs = ctx->uncompiled[PIPE_SHADER_VERTEX];
941 
942    if (!BITSET_TEST(vs->nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID))
943       return cs_undef();
944 
945    struct cs_index drawid = cs_reg32(b, 67);
946    cs_move32_to(b, drawid, offset);
947 
948    return drawid;
949 }
950 
951 void
952 GENX(csf_launch_draw)(struct panfrost_batch *batch,
953                       const struct pipe_draw_info *info, unsigned drawid_offset,
954                       const struct pipe_draw_start_count_bias *draw,
955                       unsigned vertex_count)
956 {
957    struct cs_builder *b = batch->csf.cs.builder;
958 
959    uint32_t flags_override = csf_emit_draw_state(batch, info, drawid_offset);
960    struct cs_index drawid = csf_emit_draw_id_register(batch, drawid_offset);
961 
962    cs_move32_to(b, cs_reg32(b, 33), draw->count);
963    cs_move32_to(b, cs_reg32(b, 34), info->instance_count);
964    cs_move32_to(b, cs_reg32(b, 35), 0);
965 
966    /* Base vertex offset on Valhall is used for both indexed and
967     * non-indexed draws, in a simple way for either. Handle both cases.
968     */
969    if (info->index_size) {
970       cs_move32_to(b, cs_reg32(b, 36), draw->index_bias);
971       cs_move32_to(b, cs_reg32(b, 39), info->index_size * draw->count);
972    } else {
973       cs_move32_to(b, cs_reg32(b, 36), draw->start);
974       cs_move32_to(b, cs_reg32(b, 39), 0);
975    }
976 
977    cs_run_idvs(b, flags_override, false, true, cs_shader_res_sel(0, 0, 1, 0),
978                cs_shader_res_sel(2, 2, 2, 0), drawid);
979 }
980 
981 void
982 GENX(csf_launch_draw_indirect)(struct panfrost_batch *batch,
983                                const struct pipe_draw_info *info,
984                                unsigned drawid_offset,
985                                const struct pipe_draw_indirect_info *indirect)
986 {
987    struct cs_builder *b = batch->csf.cs.builder;
988 
989    uint32_t flags_override = csf_emit_draw_state(batch, info, drawid_offset);
990    struct cs_index drawid = csf_emit_draw_id_register(batch, drawid_offset);
991 
992    struct cs_index address = cs_reg64(b, 64);
993    struct cs_index counter = cs_reg32(b, 66);
994    cs_move64_to(
995       b, address,
996       pan_resource(indirect->buffer)->image.data.base + indirect->offset);
997    cs_move32_to(b, counter, indirect->draw_count);
998 
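   /* Loop over the indirect draw records: each iteration loads the next
    * record, issues an IDVS run, advances the address by the stride and
    * decrements the counter (32-bit add of -1). The draw ID register is only
    * bumped when the vertex shader actually reads gl_DrawID.
    */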
999    cs_while(b, MALI_CS_CONDITION_GREATER, counter) {
1000       if (info->index_size) {
1001          /* loads vertex count, instance count, index offset, vertex offset */
1002          cs_load_to(b, cs_reg_tuple(b, 33, 4), address, BITFIELD_MASK(4), 0);
1003          cs_move32_to(b, cs_reg32(b, 39), info->index.resource->width0);
1004       } else {
1005          /* vertex count, instance count */
1006          cs_load_to(b, cs_reg_tuple(b, 33, 2), address, BITFIELD_MASK(2), 0);
1007          cs_move32_to(b, cs_reg32(b, 35), 0);
1008          cs_load_to(b, cs_reg_tuple(b, 36, 1), address, BITFIELD_MASK(1),
1009                     2 * sizeof(uint32_t)); // vertex offset (first vertex)
1010          cs_move32_to(b, cs_reg32(b, 37), 0);
1011          cs_move32_to(b, cs_reg32(b, 39), 0);
1012       }
1013 
1014       cs_wait_slot(b, 0, false);
1015       cs_run_idvs(b, flags_override, false, true, cs_shader_res_sel(0, 0, 1, 0),
1016                   cs_shader_res_sel(2, 2, 2, 0), drawid);
1017 
1018       cs_add64(b, address, address, indirect->stride);
1019       cs_add32(b, counter, counter, (unsigned int)-1);
1020       if (drawid.type != CS_INDEX_UNDEF)
1021          cs_add32(b, drawid, drawid, 1);
1022    }
1023 }
1024 
1025 #define POSITION_FIFO_SIZE (64 * 1024)
1026 
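/*
 * One-time CSF context setup: create the scheduling group with a single
 * queue, create the kernel tiler heap and wrap it in a TILER_HEAP
 * descriptor, allocate the temporary geometry (position FIFO) buffer, and
 * run a small one-off command stream that binds the heap context with
 * cs_heap_set() before any batch is submitted.
 */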
1027 int
1028 GENX(csf_init_context)(struct panfrost_context *ctx)
1029 {
1030    struct panfrost_device *dev = pan_device(ctx->base.screen);
1031    struct drm_panthor_queue_create qc[] = {{
1032       .priority = 1,
1033       .ringbuf_size = 64 * 1024,
1034    }};
1035 
1036    struct drm_panthor_group_create gc = {
1037       .compute_core_mask = dev->kmod.props.shader_present,
1038       .fragment_core_mask = dev->kmod.props.shader_present,
1039       .tiler_core_mask = 1,
1040       .max_compute_cores = util_bitcount64(dev->kmod.props.shader_present),
1041       .max_fragment_cores = util_bitcount64(dev->kmod.props.shader_present),
1042       .max_tiler_cores = 1,
1043       .priority = PANTHOR_GROUP_PRIORITY_MEDIUM,
1044       .queues = DRM_PANTHOR_OBJ_ARRAY(ARRAY_SIZE(qc), qc),
1045       .vm_id = pan_kmod_vm_handle(dev->kmod.vm),
1046    };
1047 
1048    int ret =
1049       drmIoctl(panfrost_device_fd(dev), DRM_IOCTL_PANTHOR_GROUP_CREATE, &gc);
1050 
1051    if (ret)
1052       goto err_group_create;
1053 
1054    ctx->csf.group_handle = gc.group_handle;
1055 
1056    struct drm_panthor_group_destroy gd = {
1057       .group_handle = ctx->csf.group_handle,
1058    };
1059 
1060    /* Get tiler heap */
1061    struct drm_panthor_tiler_heap_create thc = {
1062       .vm_id = pan_kmod_vm_handle(dev->kmod.vm),
1063       .chunk_size = pan_screen(ctx->base.screen)->csf_tiler_heap.chunk_size,
1064       .initial_chunk_count =
1065          pan_screen(ctx->base.screen)->csf_tiler_heap.initial_chunks,
1066       .max_chunks = pan_screen(ctx->base.screen)->csf_tiler_heap.max_chunks,
1067       .target_in_flight = 65535,
1068    };
1069    ret = drmIoctl(panfrost_device_fd(dev), DRM_IOCTL_PANTHOR_TILER_HEAP_CREATE,
1070                   &thc);
1071 
1072    if (ret)
1073       goto err_tiler_heap;
1074 
1075    ctx->csf.heap.handle = thc.handle;
1076 
1077    struct drm_panthor_tiler_heap_destroy thd = {
1078       .handle = ctx->csf.heap.handle,
1079    };
1080 
1081    ctx->csf.heap.desc_bo =
1082       panfrost_bo_create(dev, pan_size(TILER_HEAP), 0, "Tiler Heap");
1083 
1084    if (ctx->csf.heap.desc_bo == NULL)
1085       goto err_tiler_heap_desc_bo;
1086 
1087    pan_pack(ctx->csf.heap.desc_bo->ptr.cpu, TILER_HEAP, heap) {
1088       heap.size = pan_screen(ctx->base.screen)->csf_tiler_heap.chunk_size;
1089       heap.base = thc.first_heap_chunk_gpu_va;
1090       heap.bottom = heap.base + 64;
1091       heap.top = heap.base + heap.size;
1092    }
1093 
1094    ctx->csf.tmp_geom_bo = panfrost_bo_create(
1095       dev, POSITION_FIFO_SIZE, PAN_BO_INVISIBLE, "Temporary Geometry buffer");
1096 
1097    if (ctx->csf.tmp_geom_bo == NULL)
1098       goto err_tiler_heap_tmp_geom_bo;
1099 
1100    /* Setup the tiler heap */
1101    struct panfrost_bo *cs_bo =
1102       panfrost_bo_create(dev, 4096, 0, "Temporary CS buffer");
1103 
1104    if (cs_bo == NULL)
1105       goto err_tiler_heap_cs_bo;
1106 
1107    struct cs_buffer init_buffer = {
1108       .cpu = cs_bo->ptr.cpu,
1109       .gpu = cs_bo->ptr.gpu,
1110       .capacity = panfrost_bo_size(cs_bo) / sizeof(uint64_t),
1111    };
1112    const struct cs_builder_conf bconf = {
1113       .nr_registers = 96,
1114       .nr_kernel_registers = 4,
1115    };
1116    struct cs_builder b;
1117    cs_builder_init(&b, &bconf, init_buffer);
1118    struct cs_index heap = cs_reg64(&b, 72);
1119    cs_move64_to(&b, heap, thc.tiler_heap_ctx_gpu_va);
1120    cs_heap_set(&b, heap);
1121 
1122    struct drm_panthor_queue_submit qsubmit;
1123    struct drm_panthor_group_submit gsubmit;
1124    struct drm_panthor_sync_op sync = {
1125       .flags =
1126          DRM_PANTHOR_SYNC_OP_SIGNAL | DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_SYNCOBJ,
1127       .handle = ctx->syncobj,
1128    };
1129 
1130    assert(cs_is_valid(&b));
1131    cs_finish(&b);
1132 
1133    uint64_t cs_start = cs_root_chunk_gpu_addr(&b);
1134    uint32_t cs_size = cs_root_chunk_size(&b);
1135 
1136    csf_prepare_qsubmit(ctx, &qsubmit, 0, cs_start, cs_size, &sync, 1);
1137    csf_prepare_gsubmit(ctx, &gsubmit, &qsubmit, 1);
1138    ret = csf_submit_gsubmit(ctx, &gsubmit);
1139 
1140    if (ret)
1141       goto err_g_submit;
1142 
1143    /* Wait before freeing the buffer. */
1144    ret = drmSyncobjWait(panfrost_device_fd(dev), &ctx->syncobj, 1, INT64_MAX, 0,
1145                         NULL);
1146    assert(!ret);
1147 
1148    panfrost_bo_unreference(cs_bo);
1149 
1150    ctx->csf.is_init = true;
1151    return 0;
1152 err_g_submit:
1153    panfrost_bo_unreference(cs_bo);
1154 err_tiler_heap_cs_bo:
1155    panfrost_bo_unreference(ctx->csf.tmp_geom_bo);
1156 err_tiler_heap_tmp_geom_bo:
1157    panfrost_bo_unreference(ctx->csf.heap.desc_bo);
1158 err_tiler_heap_desc_bo:
1159    drmIoctl(panfrost_device_fd(dev), DRM_IOCTL_PANTHOR_TILER_HEAP_DESTROY,
1160             &thd);
1161 err_tiler_heap:
1162    drmIoctl(panfrost_device_fd(dev), DRM_IOCTL_PANTHOR_GROUP_DESTROY, &gd);
1163 err_group_create:
1164    return -1;
1165 }
1166 
1167 void
1168 GENX(csf_cleanup_context)(struct panfrost_context *ctx)
1169 {
1170    if (!ctx->csf.is_init)
1171       return;
1172 
1173    struct panfrost_device *dev = pan_device(ctx->base.screen);
1174    struct drm_panthor_tiler_heap_destroy thd = {
1175       .handle = ctx->csf.heap.handle,
1176    };
1177    int ret;
1178 
1179    /* Make sure all jobs are done before destroying the heap. */
1180    ret = drmSyncobjWait(panfrost_device_fd(dev), &ctx->syncobj, 1, INT64_MAX, 0,
1181                         NULL);
1182    assert(!ret);
1183 
1184    ret = drmIoctl(panfrost_device_fd(dev), DRM_IOCTL_PANTHOR_TILER_HEAP_DESTROY,
1185                   &thd);
1186    assert(!ret);
1187 
1188    struct drm_panthor_group_destroy gd = {
1189       .group_handle = ctx->csf.group_handle,
1190    };
1191 
1192    ret =
1193       drmIoctl(panfrost_device_fd(dev), DRM_IOCTL_PANTHOR_GROUP_DESTROY, &gd);
1194    assert(!ret);
1195 
1196    panfrost_bo_unreference(ctx->csf.heap.desc_bo);
1197    ctx->csf.is_init = false;
1198 }
1199 
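/*
 * Store the GPU timestamp to the destination resource at the given byte
 * offset and mark the resource as written by this batch.
 */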
1200 void
1201 GENX(csf_emit_write_timestamp)(struct panfrost_batch *batch,
1202                                struct panfrost_resource *dst, unsigned offset)
1203 {
1204    struct cs_builder *b = batch->csf.cs.builder;
1205 
1206    struct cs_index address = cs_reg64(b, 40);
1207    cs_move64_to(b, address,
1208                 dst->image.data.base + dst->image.data.offset + offset);
1209    cs_store_state(b, address, 0, MALI_CS_STATE_TIMESTAMP, cs_now());
1210 
1211    panfrost_batch_write_rsrc(batch, dst, PIPE_SHADER_VERTEX);
1212 }
1213