/*
 * Copyright (C) 2018 Alyssa Rosenzweig
 * Copyright (C) 2020 Collabora Ltd.
 * Copyright © 2017 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "decode.h"

#include "drm-uapi/panfrost_drm.h"

#include "pan_blitter.h"
#include "pan_cmdstream.h"
#include "pan_context.h"
#include "pan_indirect_dispatch.h"
#include "pan_jm.h"
#include "pan_job.h"

#if PAN_ARCH >= 10
#error "JM helpers are only used for gen < 10"
#endif

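/*
 * Initialize the per-batch descriptors every job chain needs: the framebuffer
 * descriptor (a single FBD on v4, an aggregate of FBD, ZS/CRC extension and
 * render-target descriptors otherwise) and the thread/local storage
 * descriptor, which is a separate descriptor on v6+ but is embedded in the
 * framebuffer descriptor on Midgard.
 */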
void
GENX(jm_init_batch)(struct panfrost_batch *batch)
{
   /* Reserve the framebuffer and local storage descriptors */
   batch->framebuffer =
#if PAN_ARCH == 4
      pan_pool_alloc_desc(&batch->pool.base, FRAMEBUFFER);
#else
      pan_pool_alloc_desc_aggregate(
         &batch->pool.base, PAN_DESC(FRAMEBUFFER), PAN_DESC(ZS_CRC_EXTENSION),
         PAN_DESC_ARRAY(MAX2(batch->key.nr_cbufs, 1), RENDER_TARGET));
#endif

#if PAN_ARCH >= 6
   batch->tls = pan_pool_alloc_desc(&batch->pool.base, LOCAL_STORAGE);
#else
   /* On Midgard, the TLS is embedded in the FB descriptor */
   batch->tls = batch->framebuffer;

#if PAN_ARCH == 5
   struct mali_framebuffer_pointer_packed ptr;

   pan_pack(ptr.opaque, FRAMEBUFFER_POINTER, cfg) {
      cfg.pointer = batch->framebuffer.gpu;
      cfg.render_target_count = 1; /* a necessary lie */
   }

   batch->tls.gpu = ptr.opaque[0];
#endif
#endif
}

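/*
 * Submit a single job chain to the kernel through DRM_IOCTL_PANFROST_SUBMIT:
 * gather the handles of every BO the batch touches (pool BOs, the tiler heap
 * when tiler jobs are present, the sample positions BO), wire up the optional
 * in/out sync objects, and, when PAN_DBG_TRACE/SYNC/DUMP are set, wait for
 * completion and decode the job chain for debugging. Returns 0 on success or
 * an errno value on failure. Call sites (see GENX(jm_submit_batch) below)
 * look roughly like:
 *
 *    ret = jm_submit_jc(batch, batch->jm.jobs.vtc_jc.first_job, reqs, 0);
 *    ret = jm_submit_jc(batch, batch->jm.jobs.frag,
 *                       reqs | PANFROST_JD_REQ_FS, out_sync);
 */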
static int
jm_submit_jc(struct panfrost_batch *batch, mali_ptr first_job_desc,
             uint32_t reqs, uint32_t out_sync)
{
   struct panfrost_context *ctx = batch->ctx;
   struct pipe_context *gallium = (struct pipe_context *)ctx;
   struct panfrost_device *dev = pan_device(gallium->screen);
   struct drm_panfrost_submit submit = {
      0,
   };
   uint32_t in_syncs[1];
   uint32_t *bo_handles;
   int ret;

   /* If we trace, we always need a syncobj, so use one of our own if we
    * weren't given one. Remember that we did so, so we can free it once
    * we're done while avoiding double-frees if we were given a syncobj. */

   if (!out_sync && dev->debug & (PAN_DBG_TRACE | PAN_DBG_SYNC))
      out_sync = ctx->syncobj;

   submit.out_sync = out_sync;
   submit.jc = first_job_desc;
   submit.requirements = reqs;

   if (ctx->in_sync_fd >= 0) {
      ret = drmSyncobjImportSyncFile(panfrost_device_fd(dev), ctx->in_sync_obj,
                                     ctx->in_sync_fd);
      assert(!ret);

      in_syncs[submit.in_sync_count++] = ctx->in_sync_obj;
      close(ctx->in_sync_fd);
      ctx->in_sync_fd = -1;
   }

   if (submit.in_sync_count)
      submit.in_syncs = (uintptr_t)in_syncs;

   bo_handles = calloc(panfrost_pool_num_bos(&batch->pool) +
                          panfrost_pool_num_bos(&batch->invisible_pool) +
                          batch->num_bos + 2,
                       sizeof(*bo_handles));
   assert(bo_handles);

   pan_bo_access *flags = util_dynarray_begin(&batch->bos);
   unsigned end_bo = util_dynarray_num_elements(&batch->bos, pan_bo_access);

   for (int i = 0; i < end_bo; ++i) {
      if (!flags[i])
         continue;

      assert(submit.bo_handle_count < batch->num_bos);
      bo_handles[submit.bo_handle_count++] = i;

      /* Update the BO access flags so that panfrost_bo_wait() knows
       * about all pending accesses.
       * We only keep the READ/WRITE info since this is all the BO
       * wait logic cares about.
       * We also preserve existing flags as this batch might not
       * be the first one to access the BO.
       */
      struct panfrost_bo *bo = pan_lookup_bo(dev, i);

      bo->gpu_access |= flags[i] & (PAN_BO_ACCESS_RW);
   }

   panfrost_pool_get_bo_handles(&batch->pool,
                                bo_handles + submit.bo_handle_count);
   submit.bo_handle_count += panfrost_pool_num_bos(&batch->pool);
   panfrost_pool_get_bo_handles(&batch->invisible_pool,
                                bo_handles + submit.bo_handle_count);
   submit.bo_handle_count += panfrost_pool_num_bos(&batch->invisible_pool);

   /* Add the tiler heap to the list of accessed BOs if the batch has at
    * least one tiler job. The tiler heap is written by tiler jobs and read
    * by fragment jobs (the polygon list comes from this heap).
    */
   if (batch->jm.jobs.vtc_jc.first_tiler)
      bo_handles[submit.bo_handle_count++] =
         panfrost_bo_handle(dev->tiler_heap);

   /* Always used on Bifrost, occasionally used on Midgard */
   bo_handles[submit.bo_handle_count++] =
      panfrost_bo_handle(dev->sample_positions);

   submit.bo_handles = (u64)(uintptr_t)bo_handles;
   if (ctx->is_noop)
      ret = 0;
   else
      ret = drmIoctl(panfrost_device_fd(dev), DRM_IOCTL_PANFROST_SUBMIT, &submit);
   free(bo_handles);

   if (ret)
      return errno;

   /* Trace the job if we're doing that */
   if (dev->debug & (PAN_DBG_TRACE | PAN_DBG_SYNC)) {
      /* Wait so we can get errors reported back */
      ret = drmSyncobjWait(panfrost_device_fd(dev), &out_sync, 1, INT64_MAX,
                           0, NULL);
      if (ret)
         return errno;

      if (dev->debug & PAN_DBG_TRACE)
         pandecode_jc(dev->decode_ctx, submit.jc, panfrost_device_gpu_id(dev));

      if (dev->debug & PAN_DBG_DUMP)
         pandecode_dump_mappings(dev->decode_ctx);

      /* Jobs won't have completed if we're blackhole rendering; that's OK */
      if (!ctx->is_noop && dev->debug & PAN_DBG_SYNC)
         pandecode_abort_on_fault(dev->decode_ctx, submit.jc, panfrost_device_gpu_id(dev));
   }

   return 0;
}

/* Submit both vertex/tiler and fragment jobs for a batch, possibly with an
 * out_sync corresponding to the later of the two (since there will be an
 * implicit dependency between them). */

int
GENX(jm_submit_batch)(struct panfrost_batch *batch)
{
   struct pipe_screen *pscreen = batch->ctx->base.screen;
   struct panfrost_device *dev = pan_device(pscreen);
   bool has_draws = batch->jm.jobs.vtc_jc.first_job;
   bool has_tiler = batch->jm.jobs.vtc_jc.first_tiler;
   bool has_frag = panfrost_has_fragment_job(batch);
   uint32_t out_sync = batch->ctx->syncobj;
   int ret = 0;

   unsigned reqs =
      batch->need_job_req_cycle_count ? PANFROST_JD_REQ_CYCLE_COUNT : 0;

   /* Take the submit lock to make sure no tiler jobs from other contexts
    * are inserted between our tiler and fragment jobs; failing to do so
    * might result in tiler heap corruption.
    */
   if (has_tiler)
      pthread_mutex_lock(&dev->submit_lock);

   if (has_draws) {
      ret = jm_submit_jc(batch, batch->jm.jobs.vtc_jc.first_job, reqs,
                         has_frag ? 0 : out_sync);

      if (ret)
         goto done;
   }

   if (has_frag) {
      ret = jm_submit_jc(batch, batch->jm.jobs.frag, reqs | PANFROST_JD_REQ_FS,
                         out_sync);
      if (ret)
         goto done;
   }

done:
   if (has_tiler)
      pthread_mutex_unlock(&dev->submit_lock);

   return ret;
}

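/*
 * Emit the framebuffer preload (reload of existing render target contents)
 * for a batch. On v6+ the common preload helper returns tiler jobs that we
 * add to the vertex/tiler job chain; on Midgard preloading is handled through
 * the framebuffer descriptor instead, so no jobs are expected there (hence
 * the assert below).
 */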
void
GENX(jm_preload_fb)(struct panfrost_batch *batch, struct pan_fb_info *fb)
{
   struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
   struct panfrost_ptr preload_jobs[2];

   unsigned preload_job_count = GENX(pan_preload_fb)(
      &dev->blitter, &batch->pool.base, fb, 0, batch->tls.gpu, preload_jobs);

   assert(PAN_ARCH < 6 || !preload_job_count);

   for (unsigned j = 0; j < preload_job_count; j++) {
      pan_jc_add_job(&batch->jm.jobs.vtc_jc, MALI_JOB_TYPE_TILER, false, false,
                     0, 0, &preload_jobs[j], true);
   }
}

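/*
 * Allocate and fill the FRAGMENT_JOB descriptor for a batch and record its
 * GPU address so GENX(jm_submit_batch) can submit it as the fragment job
 * chain.
 */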
void
GENX(jm_emit_fragment_job)(struct panfrost_batch *batch,
                           const struct pan_fb_info *pfb)
{
   struct panfrost_ptr transfer =
      pan_pool_alloc_desc(&batch->pool.base, FRAGMENT_JOB);

   GENX(pan_emit_fragment_job_payload)
   (pfb, batch->framebuffer.gpu, transfer.cpu);

   pan_section_pack(transfer.cpu, FRAGMENT_JOB, HEADER, header) {
      header.type = MALI_JOB_TYPE_FRAGMENT;
      header.index = 1;
   }

   batch->jm.jobs.frag = transfer.gpu;
}

#if PAN_ARCH == 9
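/*
 * Fill a Valhall SHADER_ENVIRONMENT: resource tables, thread storage, shader
 * pointer and the FAU (push uniform) buffer for the given stage.
 */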
static void
jm_emit_shader_env(struct panfrost_batch *batch,
                   struct MALI_SHADER_ENVIRONMENT *cfg,
                   enum pipe_shader_type stage, mali_ptr shader_ptr)
{
   cfg->resources = panfrost_emit_resources(batch, stage);
   cfg->thread_storage = batch->tls.gpu;
   cfg->shader = shader_ptr;

   /* Each FAU entry is 64 bits, while push uniforms are counted in 32-bit
    * words, hence the rounded-up division by two. */
   cfg->fau = batch->push_uniforms[stage];
   cfg->fau_count = DIV_ROUND_UP(batch->nr_push_uniforms[stage], 2);
}
#endif

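/*
 * Emit and queue a compute job for a grid launch. On v7 and earlier this
 * packs the legacy INVOCATION/PARAMETERS/DRAW sections; on v9 the PAYLOAD
 * section carries the workgroup size/count and the shader environment. For
 * indirect dispatches the workgroup count is patched on the GPU through the
 * pan_indirect_dispatch helper when the GPU supports it.
 */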
void
GENX(jm_launch_grid)(struct panfrost_batch *batch,
                     const struct pipe_grid_info *info)
{
   struct panfrost_ptr t = pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB);

   /* Invoke according to the grid info */

   unsigned num_wg[3] = {info->grid[0], info->grid[1], info->grid[2]};

   if (info->indirect)
      num_wg[0] = num_wg[1] = num_wg[2] = 1;

#if PAN_ARCH <= 7
   panfrost_pack_work_groups_compute(
      pan_section_ptr(t.cpu, COMPUTE_JOB, INVOCATION), num_wg[0], num_wg[1],
      num_wg[2], info->block[0], info->block[1], info->block[2], false,
      info->indirect != NULL);

   pan_section_pack(t.cpu, COMPUTE_JOB, PARAMETERS, cfg) {
      cfg.job_task_split = util_logbase2_ceil(info->block[0] + 1) +
                           util_logbase2_ceil(info->block[1] + 1) +
                           util_logbase2_ceil(info->block[2] + 1);
   }

   pan_section_pack(t.cpu, COMPUTE_JOB, DRAW, cfg) {
      cfg.state = batch->rsd[PIPE_SHADER_COMPUTE];
      cfg.attributes = batch->attribs[PIPE_SHADER_COMPUTE];
      cfg.attribute_buffers = batch->attrib_bufs[PIPE_SHADER_COMPUTE];
      cfg.thread_storage = batch->tls.gpu;
      cfg.uniform_buffers = batch->uniform_buffers[PIPE_SHADER_COMPUTE];
      cfg.push_uniforms = batch->push_uniforms[PIPE_SHADER_COMPUTE];
      cfg.textures = batch->textures[PIPE_SHADER_COMPUTE];
      cfg.samplers = batch->samplers[PIPE_SHADER_COMPUTE];
   }

#if PAN_ARCH == 4
   pan_section_pack(t.cpu, COMPUTE_JOB, COMPUTE_PADDING, cfg)
      ;
#endif
#else
   struct panfrost_context *ctx = batch->ctx;
   struct panfrost_compiled_shader *cs = ctx->prog[PIPE_SHADER_COMPUTE];

   pan_section_pack(t.cpu, COMPUTE_JOB, PAYLOAD, cfg) {
      cfg.workgroup_size_x = info->block[0];
      cfg.workgroup_size_y = info->block[1];
      cfg.workgroup_size_z = info->block[2];

      cfg.workgroup_count_x = num_wg[0];
      cfg.workgroup_count_y = num_wg[1];
      cfg.workgroup_count_z = num_wg[2];

      jm_emit_shader_env(batch, &cfg.compute, PIPE_SHADER_COMPUTE,
                         batch->rsd[PIPE_SHADER_COMPUTE]);

      /* Workgroups may be merged if the shader does not use barriers
       * or shared memory. This condition is checked against the
       * static shared_size at compile-time. We need to check the
       * variable shared size at launch_grid time, because the
       * compiler doesn't know about that.
       */
      cfg.allow_merging_workgroups = cs->info.cs.allow_merging_workgroups &&
                                     (info->variable_shared_mem == 0);

      cfg.task_increment = 1;
      cfg.task_axis = MALI_TASK_AXIS_Z;
   }
#endif

   unsigned indirect_dep = 0;
#if PAN_GPU_SUPPORTS_DISPATCH_INDIRECT
   if (info->indirect) {
      struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
      struct pan_indirect_dispatch_info indirect = {
         .job = t.gpu,
         .indirect_dim = pan_resource(info->indirect)->image.data.base +
                         info->indirect_offset,
         .num_wg_sysval =
            {
               batch->num_wg_sysval[0],
               batch->num_wg_sysval[1],
               batch->num_wg_sysval[2],
            },
      };

      indirect_dep = GENX(pan_indirect_dispatch_emit)(
         &dev->indirect_dispatch, &batch->pool.base, &batch->jm.jobs.vtc_jc,
         &indirect);
   }
#endif

   pan_jc_add_job(&batch->jm.jobs.vtc_jc, MALI_JOB_TYPE_COMPUTE, true, false,
                  indirect_dep, 0, &t, false);
}

#if PAN_ARCH >= 6
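/*
 * Lazily allocate the TILER_HEAP and TILER_CONTEXT descriptors for a batch
 * and cache the context descriptor in batch->tiler_ctx so that every tiler
 * job of the batch shares the same tiler state.
 */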
static mali_ptr
jm_emit_tiler_desc(struct panfrost_batch *batch)
{
   struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
   mali_ptr tiler_desc = PAN_ARCH >= 9 ? batch->tiler_ctx.valhall.desc
                                       : batch->tiler_ctx.bifrost.desc;

   if (tiler_desc)
      return tiler_desc;

   struct panfrost_ptr t = pan_pool_alloc_desc(&batch->pool.base, TILER_HEAP);

   pan_pack(t.cpu, TILER_HEAP, heap) {
      heap.size = panfrost_bo_size(dev->tiler_heap);
      heap.base = dev->tiler_heap->ptr.gpu;
      heap.bottom = dev->tiler_heap->ptr.gpu;
      heap.top = dev->tiler_heap->ptr.gpu + panfrost_bo_size(dev->tiler_heap);
   }

   mali_ptr heap = t.gpu;
   unsigned max_levels = dev->tiler_features.max_levels;
   assert(max_levels >= 2);

   t = pan_pool_alloc_desc(&batch->pool.base, TILER_CONTEXT);
   pan_pack(t.cpu, TILER_CONTEXT, tiler) {
      /* TODO: Select hierarchy mask more effectively */
      tiler.hierarchy_mask = (max_levels >= 8) ? 0xFF : 0x28;

      /* For large framebuffers, disable the smallest bin size to
       * avoid pathological tiler memory usage. Required to avoid OOM
       * on dEQP-GLES31.functional.fbo.no_attachments.maximums.all on
       * Mali-G57.
       */
      if (MAX2(batch->key.width, batch->key.height) >= 4096)
         tiler.hierarchy_mask &= ~1;

      tiler.fb_width = batch->key.width;
      tiler.fb_height = batch->key.height;
      tiler.heap = heap;
      tiler.sample_pattern =
         pan_sample_pattern(util_framebuffer_get_num_samples(&batch->key));
#if PAN_ARCH >= 9
      tiler.first_provoking_vertex =
         pan_tristate_get(batch->first_provoking_vertex);
#endif
   }

   if (PAN_ARCH >= 9)
      batch->tiler_ctx.valhall.desc = t.gpu;
   else
      batch->tiler_ctx.bifrost.desc = t.gpu;

   return t.gpu;
}
#endif

#if PAN_ARCH <= 7
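/*
 * Fill the DRAW descriptor fields shared between the vertex and fragment
 * stages: instancing parameters plus the per-stage UBO, push-uniform,
 * texture and sampler tables emitted earlier into the batch.
 */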
static inline void
jm_emit_draw_descs(struct panfrost_batch *batch, struct MALI_DRAW *d,
                   enum pipe_shader_type st)
{
   d->offset_start = batch->ctx->offset_start;
   d->instance_size =
      batch->ctx->instance_count > 1 ? batch->ctx->padded_count : 1;

   d->uniform_buffers = batch->uniform_buffers[st];
   d->push_uniforms = batch->push_uniforms[st];
   d->textures = batch->textures[st];
   d->samplers = batch->samplers[st];
}

static void
jm_emit_vertex_draw(struct panfrost_batch *batch, void *section)
{
   pan_pack(section, DRAW, cfg) {
      cfg.state = batch->rsd[PIPE_SHADER_VERTEX];
      cfg.attributes = batch->attribs[PIPE_SHADER_VERTEX];
      cfg.attribute_buffers = batch->attrib_bufs[PIPE_SHADER_VERTEX];
      cfg.varyings = batch->varyings.vs;
      cfg.varying_buffers = cfg.varyings ? batch->varyings.bufs : 0;
      cfg.thread_storage = batch->tls.gpu;
      jm_emit_draw_descs(batch, &cfg, PIPE_SHADER_VERTEX);
   }
}

static void
jm_emit_vertex_job(struct panfrost_batch *batch,
                   const struct pipe_draw_info *info, void *invocation_template,
                   void *job)
{
   void *section = pan_section_ptr(job, COMPUTE_JOB, INVOCATION);
   memcpy(section, invocation_template, pan_size(INVOCATION));

   pan_section_pack(job, COMPUTE_JOB, PARAMETERS, cfg) {
      cfg.job_task_split = 5;
   }

   section = pan_section_ptr(job, COMPUTE_JOB, DRAW);
   jm_emit_vertex_draw(batch, section);

#if PAN_ARCH == 4
   pan_section_pack(job, COMPUTE_JOB, COMPUTE_PADDING, cfg)
      ;
#endif
}
#endif /* PAN_ARCH <= 7 */

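/*
 * Pack the DRAW section of a tiler (or Valhall IDVS) job: face culling and
 * occlusion query state on all gens, plus either the Valhall pixel-kill /
 * early-ZS / blend configuration (v9) or the legacy varying, viewport and
 * renderer state pointers (v7 and earlier).
 */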
static void
jm_emit_tiler_draw(void *out, struct panfrost_batch *batch, bool fs_required,
                   enum mesa_prim prim)
{
   struct panfrost_context *ctx = batch->ctx;
   struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
   bool polygon = (prim == MESA_PRIM_TRIANGLES);

   pan_pack(out, DRAW, cfg) {
      /*
       * From the Gallium documentation,
       * pipe_rasterizer_state::cull_face "indicates which faces of
       * polygons to cull". Points and lines are not considered
       * polygons and should be drawn even if all faces are culled.
       * The hardware does not take primitive type into account when
       * culling, so we need to do that check ourselves.
       */
      cfg.cull_front_face = polygon && (rast->cull_face & PIPE_FACE_FRONT);
      cfg.cull_back_face = polygon && (rast->cull_face & PIPE_FACE_BACK);
      cfg.front_face_ccw = rast->front_ccw;

      if (ctx->occlusion_query && ctx->active_queries) {
         if (ctx->occlusion_query->type == PIPE_QUERY_OCCLUSION_COUNTER)
            cfg.occlusion_query = MALI_OCCLUSION_MODE_COUNTER;
         else
            cfg.occlusion_query = MALI_OCCLUSION_MODE_PREDICATE;

         struct panfrost_resource *rsrc =
            pan_resource(ctx->occlusion_query->rsrc);
         cfg.occlusion = rsrc->image.data.base;
         panfrost_batch_write_rsrc(ctx->batch, rsrc, PIPE_SHADER_FRAGMENT);
      }

#if PAN_ARCH >= 9
      struct panfrost_compiled_shader *fs = ctx->prog[PIPE_SHADER_FRAGMENT];

      cfg.multisample_enable = rast->multisample;
      cfg.sample_mask = rast->multisample ? ctx->sample_mask : 0xFFFF;
      /* Use per-sample shading if required by the API. Also use it when
       * a blend shader is used with multisampling, as this is handled
       * by a single ST_TILE in the blend shader with the current
       * sample ID, requiring per-sample shading.
       */
      cfg.evaluate_per_sample =
         (rast->multisample &&
          ((ctx->min_samples > 1) || ctx->valhall_has_blend_shader));

      cfg.single_sampled_lines = !rast->multisample;

      cfg.vertex_array.packet = true;

      cfg.minimum_z = batch->minimum_z;
      cfg.maximum_z = batch->maximum_z;

      cfg.depth_stencil = batch->depth_stencil;

      if (prim == MESA_PRIM_LINES && rast->line_smooth) {
         cfg.multisample_enable = true;
         cfg.single_sampled_lines = false;
      }

      if (fs_required) {
         bool has_oq = ctx->occlusion_query && ctx->active_queries;

         struct pan_earlyzs_state earlyzs = pan_earlyzs_get(
            fs->earlyzs, ctx->depth_stencil->writes_zs || has_oq,
            ctx->blend->base.alpha_to_coverage,
            ctx->depth_stencil->zs_always_passes);

         cfg.pixel_kill_operation = earlyzs.kill;
         cfg.zs_update_operation = earlyzs.update;

         cfg.allow_forward_pixel_to_kill =
            pan_allow_forward_pixel_to_kill(ctx, fs);
         cfg.allow_forward_pixel_to_be_killed = !fs->info.writes_global;

         /* Mask of render targets that may be written. A render
          * target may be written if the fragment shader writes
          * to it AND it actually exists. If the render target
          * doesn't actually exist, the blend descriptor will be
          * OFF so it may be omitted from the mask.
          *
          * Only set when there is a fragment shader, since
          * otherwise no colour updates are possible.
          */
         cfg.render_target_mask =
            (fs->info.outputs_written >> FRAG_RESULT_DATA0) & ctx->fb_rt_mask;

         /* Also use per-sample shading if required by the shader
          */
         cfg.evaluate_per_sample |= fs->info.fs.sample_shading;

         /* Unlike Bifrost, alpha-to-coverage must be included in
          * this identically-named flag. Confusing, isn't it?
          */
         cfg.shader_modifies_coverage = fs->info.fs.writes_coverage ||
                                        fs->info.fs.can_discard ||
                                        ctx->blend->base.alpha_to_coverage;

         /* Blend descriptors are only accessed by a BLEND
          * instruction on Valhall. It follows that if the
          * fragment shader is omitted, we may also omit the
          * blend descriptors.
          */
         cfg.blend = batch->blend;
         cfg.blend_count = MAX2(batch->key.nr_cbufs, 1);
         cfg.alpha_to_coverage = ctx->blend->base.alpha_to_coverage;

         cfg.overdraw_alpha0 = panfrost_overdraw_alpha(ctx, 0);
         cfg.overdraw_alpha1 = panfrost_overdraw_alpha(ctx, 1);

         jm_emit_shader_env(batch, &cfg.shader, PIPE_SHADER_FRAGMENT,
                            batch->rsd[PIPE_SHADER_FRAGMENT]);
      } else {
         /* These operations need to be FORCE to benefit from the
          * depth-only pass optimizations.
          */
         cfg.pixel_kill_operation = MALI_PIXEL_KILL_FORCE_EARLY;
         cfg.zs_update_operation = MALI_PIXEL_KILL_FORCE_EARLY;

         /* No shader and no blend => no shader or blend
          * reasons to disable FPK. The only FPK-related state
          * not covered is alpha-to-coverage which we don't set
          * without blend.
          */
         cfg.allow_forward_pixel_to_kill = true;

         /* No shader => no shader side effects */
         cfg.allow_forward_pixel_to_be_killed = true;

         /* Alpha isn't written so these are vacuous */
         cfg.overdraw_alpha0 = true;
         cfg.overdraw_alpha1 = true;
      }
#else
      cfg.position = batch->varyings.pos;
      cfg.state = batch->rsd[PIPE_SHADER_FRAGMENT];
      cfg.attributes = batch->attribs[PIPE_SHADER_FRAGMENT];
      cfg.attribute_buffers = batch->attrib_bufs[PIPE_SHADER_FRAGMENT];
      cfg.viewport = batch->viewport;
      cfg.varyings = batch->varyings.fs;
      cfg.varying_buffers = cfg.varyings ? batch->varyings.bufs : 0;
      cfg.thread_storage = batch->tls.gpu;

      /* For all primitives but lines DRAW.flat_shading_vertex must
       * be set to 0 and the provoking vertex is selected with the
       * PRIMITIVE.first_provoking_vertex field.
       */
      if (prim == MESA_PRIM_LINES) {
         /* The logic is inverted across arches. */
         cfg.flat_shading_vertex = rast->flatshade_first ^ (PAN_ARCH <= 5);
      }

      jm_emit_draw_descs(batch, &cfg, PIPE_SHADER_FRAGMENT);
#endif
   }
}

/* Packs a primitive descriptor, mostly common between Midgard/Bifrost tiler
 * jobs and Valhall IDVS jobs
 */
static void
jm_emit_primitive(struct panfrost_batch *batch,
                  const struct pipe_draw_info *info,
                  const struct pipe_draw_start_count_bias *draw,
                  bool secondary_shader, void *out)
{
   struct panfrost_context *ctx = batch->ctx;
   UNUSED struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;

   pan_pack(out, PRIMITIVE, cfg) {
      cfg.draw_mode = pan_draw_mode(info->mode);
      if (panfrost_writes_point_size(ctx))
         cfg.point_size_array_format = MALI_POINT_SIZE_ARRAY_FORMAT_FP16;

#if PAN_ARCH <= 8
      /* For line primitives, PRIMITIVE.first_provoking_vertex must
       * be set to true and the provoking vertex is selected with
       * DRAW.flat_shading_vertex.
       */
      if (u_reduced_prim(info->mode) == MESA_PRIM_LINES)
         cfg.first_provoking_vertex = true;
      else
         cfg.first_provoking_vertex = rast->flatshade_first;

      if (panfrost_is_implicit_prim_restart(info)) {
         cfg.primitive_restart = MALI_PRIMITIVE_RESTART_IMPLICIT;
      } else if (info->primitive_restart) {
         cfg.primitive_restart = MALI_PRIMITIVE_RESTART_EXPLICIT;
         cfg.primitive_restart_index = info->restart_index;
      }

      cfg.job_task_split = 6;
#else
      struct panfrost_compiled_shader *fs = ctx->prog[PIPE_SHADER_FRAGMENT];

      cfg.allow_rotating_primitives = allow_rotating_primitives(fs, info);
      cfg.primitive_restart = info->primitive_restart;

      /* Non-fixed restart indices should have been lowered */
      assert(!cfg.primitive_restart || panfrost_is_implicit_prim_restart(info));
#endif

      cfg.low_depth_cull = rast->depth_clip_near;
      cfg.high_depth_cull = rast->depth_clip_far;

      cfg.index_count = draw->count;
      cfg.index_type = panfrost_translate_index_size(info->index_size);

      if (PAN_ARCH >= 9) {
         /* Base vertex offset on Valhall is used for both
          * indexed and non-indexed draws, in a simple way for
          * either. Handle both cases.
          */
         if (cfg.index_type)
            cfg.base_vertex_offset = draw->index_bias;
         else
            cfg.base_vertex_offset = draw->start;

         /* Indices are moved outside the primitive descriptor
          * on Valhall, so we don't need to set that here
          */
      } else if (cfg.index_type) {
         cfg.base_vertex_offset = draw->index_bias - ctx->offset_start;

#if PAN_ARCH <= 7
         cfg.indices = batch->indices;
#endif
      }

#if PAN_ARCH >= 6
      cfg.secondary_shader = secondary_shader;
#endif
   }
}

#if PAN_ARCH == 9
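/*
 * Pack a Valhall MALLOC_VERTEX job for an IDVS draw: primitive state, varying
 * allocation sizes, tiler context, scissor, indices, the DRAW section and the
 * position/varying shader environments. The varying (secondary) shader is
 * dropped entirely when no fragment shader runs, since its outputs would
 * never be consumed.
 */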
static void
jm_emit_malloc_vertex_job(struct panfrost_batch *batch,
                          const struct pipe_draw_info *info,
                          const struct pipe_draw_start_count_bias *draw,
                          bool secondary_shader, void *job)
{
   struct panfrost_context *ctx = batch->ctx;
   struct panfrost_compiled_shader *vs = ctx->prog[PIPE_SHADER_VERTEX];
   struct panfrost_compiled_shader *fs = ctx->prog[PIPE_SHADER_FRAGMENT];

   bool fs_required = panfrost_fs_required(
      fs, ctx->blend, &ctx->pipe_framebuffer, ctx->depth_stencil);

   /* Varying shaders only feed data to the fragment shader, so if we omit
    * the fragment shader, we should omit the varying shader too.
    */
   secondary_shader &= fs_required;

   jm_emit_primitive(batch, info, draw, secondary_shader,
                     pan_section_ptr(job, MALLOC_VERTEX_JOB, PRIMITIVE));

   pan_section_pack(job, MALLOC_VERTEX_JOB, INSTANCE_COUNT, cfg) {
      cfg.count = info->instance_count;
   }

   pan_section_pack(job, MALLOC_VERTEX_JOB, ALLOCATION, cfg) {
      if (secondary_shader) {
         unsigned sz = panfrost_vertex_attribute_stride(vs, fs);
         cfg.vertex_packet_stride = sz + 16;
         cfg.vertex_attribute_stride = sz;
      } else {
         /* Hardware requirement for "no varyings" */
         cfg.vertex_packet_stride = 16;
         cfg.vertex_attribute_stride = 0;
      }
   }

   pan_section_pack(job, MALLOC_VERTEX_JOB, TILER, cfg) {
      cfg.address = jm_emit_tiler_desc(batch);
   }

   STATIC_ASSERT(sizeof(batch->scissor) == pan_size(SCISSOR));
   memcpy(pan_section_ptr(job, MALLOC_VERTEX_JOB, SCISSOR), &batch->scissor,
          pan_size(SCISSOR));

   panfrost_emit_primitive_size(
      ctx, info->mode == MESA_PRIM_POINTS, 0,
      pan_section_ptr(job, MALLOC_VERTEX_JOB, PRIMITIVE_SIZE));

   pan_section_pack(job, MALLOC_VERTEX_JOB, INDICES, cfg) {
      cfg.address = batch->indices;
   }

   jm_emit_tiler_draw(pan_section_ptr(job, MALLOC_VERTEX_JOB, DRAW), batch,
                      fs_required, u_reduced_prim(info->mode));

   pan_section_pack(job, MALLOC_VERTEX_JOB, POSITION, cfg) {
      jm_emit_shader_env(batch, &cfg, PIPE_SHADER_VERTEX,
                         panfrost_get_position_shader(batch, info));
   }

   pan_section_pack(job, MALLOC_VERTEX_JOB, VARYING, cfg) {
      /* If a varying shader is used, we configure it with the same
       * state as the position shader for backwards compatible
       * behaviour with Bifrost. This could be optimized.
       */
      if (!secondary_shader)
         continue;

      jm_emit_shader_env(batch, &cfg, PIPE_SHADER_VERTEX,
                         panfrost_get_varying_shader(batch));
   }
}
#endif

#if PAN_ARCH <= 7
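/*
 * Pack a legacy (v7 and earlier) TILER_JOB: copy in the precomputed
 * invocation template, pack the primitive state, attach the tiler context on
 * v6+, and fill the draw and primitive-size sections.
 */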
static void
jm_emit_tiler_job(struct panfrost_batch *batch,
                  const struct pipe_draw_info *info,
                  const struct pipe_draw_start_count_bias *draw,
                  void *invocation_template, bool secondary_shader, void *job)
{
   struct panfrost_context *ctx = batch->ctx;

   void *section = pan_section_ptr(job, TILER_JOB, INVOCATION);
   memcpy(section, invocation_template, pan_size(INVOCATION));

   jm_emit_primitive(batch, info, draw, secondary_shader,
                     pan_section_ptr(job, TILER_JOB, PRIMITIVE));

   void *prim_size = pan_section_ptr(job, TILER_JOB, PRIMITIVE_SIZE);
   enum mesa_prim prim = u_reduced_prim(info->mode);

#if PAN_ARCH >= 6
   pan_section_pack(job, TILER_JOB, TILER, cfg) {
      cfg.address = jm_emit_tiler_desc(batch);
   }

   pan_section_pack(job, TILER_JOB, PADDING, cfg)
      ;
#endif

   jm_emit_tiler_draw(pan_section_ptr(job, TILER_JOB, DRAW), batch, true, prim);

   panfrost_emit_primitive_size(ctx, prim == MESA_PRIM_POINTS,
                                batch->varyings.psiz, prim_size);
}
#endif

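/*
 * Launch the transform feedback program for a draw. XFB is run as a
 * compute-style invocation of the vertex shader over (count x instance_count)
 * threads, with varying emission disabled; on v5 and earlier the job must be
 * typed as a VERTEX job rather than a COMPUTE job.
 */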
void
GENX(jm_launch_xfb)(struct panfrost_batch *batch,
                    const struct pipe_draw_info *info, unsigned count)
{
   struct panfrost_ptr t = pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB);

#if PAN_ARCH == 9
   pan_section_pack(t.cpu, COMPUTE_JOB, PAYLOAD, cfg) {
      cfg.workgroup_size_x = 1;
      cfg.workgroup_size_y = 1;
      cfg.workgroup_size_z = 1;

      cfg.workgroup_count_x = count;
      cfg.workgroup_count_y = info->instance_count;
      cfg.workgroup_count_z = 1;

      jm_emit_shader_env(batch, &cfg.compute, PIPE_SHADER_VERTEX,
                         batch->rsd[PIPE_SHADER_VERTEX]);

      /* TODO: Indexing. Also, this is a legacy feature... */
      cfg.compute.attribute_offset = batch->ctx->offset_start;

      /* Transform feedback shaders do not use barriers or shared
       * memory, so we may merge workgroups.
       */
      cfg.allow_merging_workgroups = true;
      cfg.task_increment = 1;
      cfg.task_axis = MALI_TASK_AXIS_Z;
   }
#else
   struct mali_invocation_packed invocation;

   panfrost_pack_work_groups_compute(&invocation, 1, count,
                                     info->instance_count, 1, 1, 1,
                                     PAN_ARCH <= 5, false);

   /* No varyings on XFB compute jobs. */
   mali_ptr saved_vs_varyings = batch->varyings.vs;

   batch->varyings.vs = 0;
   jm_emit_vertex_job(batch, info, &invocation, t.cpu);
   batch->varyings.vs = saved_vs_varyings;

#endif
   enum mali_job_type job_type = MALI_JOB_TYPE_COMPUTE;
#if PAN_ARCH <= 5
   job_type = MALI_JOB_TYPE_VERTEX;
#endif
   pan_jc_add_job(&batch->jm.jobs.vtc_jc, job_type, true, false, 0, 0, &t,
                  false);
}

#if PAN_ARCH < 9
/*
 * Push jobs required for the rasterization pipeline. If there are side effects
 * from the vertex shader, these are handled ahead-of-time with a compute
 * shader. This function should not be called if rasterization is skipped.
 */
static void
jm_push_vertex_tiler_jobs(struct panfrost_batch *batch,
                          const struct panfrost_ptr *vertex_job,
                          const struct panfrost_ptr *tiler_job)
{
   unsigned vertex =
      pan_jc_add_job(&batch->jm.jobs.vtc_jc, MALI_JOB_TYPE_VERTEX, false, false,
                     0, 0, vertex_job, false);

   pan_jc_add_job(&batch->jm.jobs.vtc_jc, MALI_JOB_TYPE_TILER, false, false,
                  vertex, 0, tiler_job, false);
}
#endif

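/*
 * Emit the jobs for a single draw. On Valhall this is one MALLOC_VERTEX
 * (memory-allocated IDVS) job; on Bifrost an INDEXED_VERTEX job is used when
 * the vertex shader supports IDVS, and otherwise (and always on Midgard) a
 * separate vertex job and tiler job are emitted and chained with a
 * dependency.
 */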
void
GENX(jm_launch_draw)(struct panfrost_batch *batch,
                     const struct pipe_draw_info *info, unsigned drawid_offset,
                     const struct pipe_draw_start_count_bias *draw,
                     unsigned vertex_count)
{
   struct panfrost_context *ctx = batch->ctx;
   struct panfrost_compiled_shader *vs = ctx->prog[PIPE_SHADER_VERTEX];
   bool secondary_shader = vs->info.vs.secondary_enable;
   bool idvs = vs->info.vs.idvs;

#if PAN_ARCH <= 7
   struct mali_invocation_packed invocation;
   if (info->instance_count > 1) {
      panfrost_pack_work_groups_compute(&invocation, 1, vertex_count,
                                        info->instance_count, 1, 1, 1, true,
                                        false);
   } else {
      pan_pack(&invocation, INVOCATION, cfg) {
         cfg.invocations = vertex_count - 1;
         cfg.size_y_shift = 0;
         cfg.size_z_shift = 0;
         cfg.workgroups_x_shift = 0;
         cfg.workgroups_y_shift = 0;
         cfg.workgroups_z_shift = 32;
         cfg.thread_group_split = MALI_SPLIT_MIN_EFFICIENT;
      }
   }

   /* Emit all sorts of descriptors. */
#endif

   UNUSED struct panfrost_ptr tiler, vertex;

   if (idvs) {
#if PAN_ARCH == 9
      tiler = pan_pool_alloc_desc(&batch->pool.base, MALLOC_VERTEX_JOB);
#elif PAN_ARCH >= 6
      tiler = pan_pool_alloc_desc(&batch->pool.base, INDEXED_VERTEX_JOB);
#else
      unreachable("IDVS is unsupported on Midgard");
#endif
   } else {
      vertex = pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB);
      tiler = pan_pool_alloc_desc(&batch->pool.base, TILER_JOB);
   }

#if PAN_ARCH == 9
   assert(idvs && "Memory allocated IDVS required on Valhall");

   jm_emit_malloc_vertex_job(batch, info, draw, secondary_shader, tiler.cpu);

   pan_jc_add_job(&batch->jm.jobs.vtc_jc, MALI_JOB_TYPE_MALLOC_VERTEX, false,
                  false, 0, 0, &tiler, false);
#else
   /* Fire off the draw itself */
   jm_emit_tiler_job(batch, info, draw, &invocation, secondary_shader,
                     tiler.cpu);
   if (idvs) {
#if PAN_ARCH >= 6
      jm_emit_vertex_draw(
         batch, pan_section_ptr(tiler.cpu, INDEXED_VERTEX_JOB, VERTEX_DRAW));

      pan_jc_add_job(&batch->jm.jobs.vtc_jc, MALI_JOB_TYPE_INDEXED_VERTEX,
                     false, false, 0, 0, &tiler, false);
#endif
   } else {
      jm_emit_vertex_job(batch, info, &invocation, vertex.cpu);
      jm_push_vertex_tiler_jobs(batch, &vertex, &tiler);
   }
#endif
}

void
GENX(jm_launch_draw_indirect)(struct panfrost_batch *batch,
                              const struct pipe_draw_info *info,
                              unsigned drawid_offset,
                              const struct pipe_draw_indirect_info *indirect)
{
   unreachable("draw indirect not implemented for jm");
}

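/*
 * Queue a WRITE_VALUE job that stores the system timestamp into the given
 * resource at the given byte offset, and mark the resource as written by the
 * batch so the usual dependency tracking applies.
 */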
void
GENX(jm_emit_write_timestamp)(struct panfrost_batch *batch,
                              struct panfrost_resource *dst, unsigned offset)
{
   struct panfrost_ptr job =
      pan_pool_alloc_desc(&batch->pool.base, WRITE_VALUE_JOB);

   pan_section_pack(job.cpu, WRITE_VALUE_JOB, PAYLOAD, cfg) {
      cfg.address = dst->image.data.base + dst->image.data.offset + offset;
      cfg.type = MALI_WRITE_VALUE_TYPE_SYSTEM_TIMESTAMP;
   }

   pan_jc_add_job(&batch->jm.jobs.vtc_jc, MALI_JOB_TYPE_WRITE_VALUE, false,
                  false, 0, 0, &job, false);
   panfrost_batch_write_rsrc(batch, dst, PIPE_SHADER_VERTEX);
}