xref: /aosp_15_r20/external/mesa3d/src/gallium/drivers/panfrost/pan_cmdstream.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright (C) 2023 Amazon.com, Inc. or its affiliates.
3  * Copyright (C) 2018 Alyssa Rosenzweig
4  * Copyright (C) 2020 Collabora Ltd.
5  * Copyright © 2017 Intel Corporation
6  *
7  * Permission is hereby granted, free of charge, to any person obtaining a
8  * copy of this software and associated documentation files (the "Software"),
9  * to deal in the Software without restriction, including without limitation
10  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11  * and/or sell copies of the Software, and to permit persons to whom the
12  * Software is furnished to do so, subject to the following conditions:
13  *
14  * The above copyright notice and this permission notice (including the next
15  * paragraph) shall be included in all copies or substantial portions of the
16  * Software.
17  *
18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
21  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24  * SOFTWARE.
25  */
26 
27 #include "gallium/auxiliary/util/u_blend.h"
28 #include "pipe/p_defines.h"
29 #include "pipe/p_state.h"
30 #include "util/macros.h"
31 #include "util/u_draw.h"
32 #include "util/u_helpers.h"
33 #include "util/u_memory.h"
34 #include "util/u_prim.h"
35 #include "util/u_sample_positions.h"
36 #include "util/u_vbuf.h"
37 #include "util/u_viewport.h"
38 
39 #include "decode.h"
40 
41 #include "genxml/gen_macros.h"
42 
43 #include "pan_afbc_cso.h"
44 #include "pan_blend.h"
45 #include "pan_blitter.h"
46 #include "pan_bo.h"
47 #include "pan_cmdstream.h"
48 #include "pan_context.h"
49 #include "pan_csf.h"
50 #include "pan_format.h"
51 #include "pan_indirect_dispatch.h"
52 #include "pan_jm.h"
53 #include "pan_job.h"
54 #include "pan_pool.h"
55 #include "pan_resource.h"
56 #include "pan_samples.h"
57 #include "pan_shader.h"
58 #include "pan_texture.h"
59 #include "pan_util.h"
60 
61 /* JOBX() is used to select the job backend helpers to call from generic
62  * functions. */
63 #if PAN_ARCH <= 9
64 #define JOBX(__suffix) GENX(jm_##__suffix)
65 #elif PAN_ARCH <= 10
66 #define JOBX(__suffix) GENX(csf_##__suffix)
67 #else
68 #error "Unsupported arch"
69 #endif
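/* For illustration (assuming the usual GENX() arch-version suffixing): a
 * call like JOBX(launch_grid)(...) resolves to jm_launch_grid_v7() on a v7
 * build and to csf_launch_grid_v10() on a v10 build. */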
70 
71 struct panfrost_sampler_state {
72    struct pipe_sampler_state base;
73    struct mali_sampler_packed hw;
74 };
75 
76 /* Misnomer: Sampler view corresponds to textures, not samplers */
77 
78 struct panfrost_sampler_view {
79    struct pipe_sampler_view base;
80    struct panfrost_pool_ref state;
81    struct mali_texture_packed bifrost_descriptor;
82    mali_ptr texture_bo;
83    uint64_t modifier;
84 
85    /* Pool used to allocate the descriptor. If NULL, defaults to the global
86     * descriptor pool. Can be set for short lived descriptors, useful for
87     * shader images on Valhall.
88     */
89    struct panfrost_pool *pool;
90 };
91 
92 /* Statically assert that PIPE_* enums match the hardware enums.
93  * (As long as they match, we don't need to translate them.)
94  */
95 static_assert((int)PIPE_FUNC_NEVER == MALI_FUNC_NEVER, "must match");
96 static_assert((int)PIPE_FUNC_LESS == MALI_FUNC_LESS, "must match");
97 static_assert((int)PIPE_FUNC_EQUAL == MALI_FUNC_EQUAL, "must match");
98 static_assert((int)PIPE_FUNC_LEQUAL == MALI_FUNC_LEQUAL, "must match");
99 static_assert((int)PIPE_FUNC_GREATER == MALI_FUNC_GREATER, "must match");
100 static_assert((int)PIPE_FUNC_NOTEQUAL == MALI_FUNC_NOT_EQUAL, "must match");
101 static_assert((int)PIPE_FUNC_GEQUAL == MALI_FUNC_GEQUAL, "must match");
102 static_assert((int)PIPE_FUNC_ALWAYS == MALI_FUNC_ALWAYS, "must match");
103 
104 static inline enum mali_sample_pattern
105 panfrost_sample_pattern(unsigned samples)
106 {
107    switch (samples) {
108    case 1:
109       return MALI_SAMPLE_PATTERN_SINGLE_SAMPLED;
110    case 4:
111       return MALI_SAMPLE_PATTERN_ROTATED_4X_GRID;
112    case 8:
113       return MALI_SAMPLE_PATTERN_D3D_8X_GRID;
114    case 16:
115       return MALI_SAMPLE_PATTERN_D3D_16X_GRID;
116    default:
117       unreachable("Unsupported sample count");
118    }
119 }
120 
121 static unsigned
122 translate_tex_wrap(enum pipe_tex_wrap w, bool using_nearest)
123 {
124    /* CLAMP is only supported on Midgard, where it is broken for nearest
125     * filtering. Use CLAMP_TO_EDGE in that case.
126     */
127 
128    switch (w) {
129    case PIPE_TEX_WRAP_REPEAT:
130       return MALI_WRAP_MODE_REPEAT;
131    case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
132       return MALI_WRAP_MODE_CLAMP_TO_EDGE;
133    case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
134       return MALI_WRAP_MODE_CLAMP_TO_BORDER;
135    case PIPE_TEX_WRAP_MIRROR_REPEAT:
136       return MALI_WRAP_MODE_MIRRORED_REPEAT;
137    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
138       return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_EDGE;
139    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
140       return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_BORDER;
141 
142 #if PAN_ARCH <= 5
143    case PIPE_TEX_WRAP_CLAMP:
144       return using_nearest ? MALI_WRAP_MODE_CLAMP_TO_EDGE
145                            : MALI_WRAP_MODE_CLAMP;
146    case PIPE_TEX_WRAP_MIRROR_CLAMP:
147       return using_nearest ? MALI_WRAP_MODE_MIRRORED_CLAMP_TO_EDGE
148                            : MALI_WRAP_MODE_MIRRORED_CLAMP;
149 #endif
150 
151    default:
152       unreachable("Invalid wrap");
153    }
154 }
155 
156 /* The hardware compares in the wrong order, so we have to flip before
157  * encoding. Yes, really. */
158 
159 static enum mali_func
160 panfrost_sampler_compare_func(const struct pipe_sampler_state *cso)
161 {
162    return !cso->compare_mode
163              ? MALI_FUNC_NEVER
164              : panfrost_flip_compare_func((enum mali_func)cso->compare_func);
165 }
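/* Concretely: PIPE_FUNC_LESS ends up encoded as MALI_FUNC_GREATER and LEQUAL
 * as GEQUAL, since panfrost_flip_compare_func() swaps the operand order to
 * compensate for the hardware's flipped comparison. */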
166 
167 static enum mali_mipmap_mode
168 pan_pipe_to_mipmode(enum pipe_tex_mipfilter f)
169 {
170    switch (f) {
171    case PIPE_TEX_MIPFILTER_NEAREST:
172       return MALI_MIPMAP_MODE_NEAREST;
173    case PIPE_TEX_MIPFILTER_LINEAR:
174       return MALI_MIPMAP_MODE_TRILINEAR;
175 #if PAN_ARCH >= 6
176    case PIPE_TEX_MIPFILTER_NONE:
177       return MALI_MIPMAP_MODE_NONE;
178 #else
179    case PIPE_TEX_MIPFILTER_NONE:
180       return MALI_MIPMAP_MODE_NEAREST;
181 #endif
182    default:
183       unreachable("Invalid");
184    }
185 }
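/* Midgard has no true "no mipmap" mode, so MIPFILTER_NONE falls back to
 * NEAREST here; panfrost_create_sampler_state() below additionally pins the
 * sampler's LOD range so the fallback behaves like a single level. */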
186 
187 static void *
188 panfrost_create_sampler_state(struct pipe_context *pctx,
189                               const struct pipe_sampler_state *cso)
190 {
191    struct panfrost_sampler_state *so = CALLOC_STRUCT(panfrost_sampler_state);
192    so->base = *cso;
193 
194 #if PAN_ARCH == 7 || PAN_ARCH >= 10
195    /* On v7 and v10+, pan_texture.c composes the API swizzle with a bijective
196     * swizzle derived from the format, to allow more formats than the
197     * hardware otherwise supports. When packing border colours, we need to
198     * undo this bijection, by swizzling with its inverse.
199     * On v10+, watch out for depth+stencil formats, because those have a
200     * swizzle that doesn't really apply to the border color.
201     */
202 #if PAN_ARCH >= 10
203    if (!util_format_is_depth_and_stencil(cso->border_color_format)) {
204 #endif
205    unsigned mali_format =
206       GENX(panfrost_format_from_pipe_format)(cso->border_color_format)->hw;
207    enum mali_rgb_component_order order = mali_format & BITFIELD_MASK(12);
208 
209    unsigned char inverted_swizzle[4];
210    panfrost_invert_swizzle(GENX(pan_decompose_swizzle)(order).post,
211                            inverted_swizzle);
212 
213    util_format_apply_color_swizzle(&so->base.border_color, &cso->border_color,
214                                    inverted_swizzle,
215                                    false /* is_integer (irrelevant) */);
216 #if PAN_ARCH >= 10
217    }
218 #endif
219 
220 #endif
221 
222    bool using_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
223 
224    pan_pack(&so->hw, SAMPLER, cfg) {
225       cfg.magnify_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
226       cfg.minify_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
227 
228       cfg.normalized_coordinates = !cso->unnormalized_coords;
229       cfg.lod_bias = cso->lod_bias;
230       cfg.minimum_lod = cso->min_lod;
231       cfg.maximum_lod = cso->max_lod;
232 
233       cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s, using_nearest);
234       cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t, using_nearest);
235       cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r, using_nearest);
236 
237       cfg.mipmap_mode = pan_pipe_to_mipmode(cso->min_mip_filter);
238       cfg.compare_function = panfrost_sampler_compare_func(cso);
239       cfg.seamless_cube_map = cso->seamless_cube_map;
240 
241       cfg.border_color_r = so->base.border_color.ui[0];
242       cfg.border_color_g = so->base.border_color.ui[1];
243       cfg.border_color_b = so->base.border_color.ui[2];
244       cfg.border_color_a = so->base.border_color.ui[3];
245 
246 #if PAN_ARCH >= 6
247       if (cso->max_anisotropy > 1) {
248          cfg.maximum_anisotropy = cso->max_anisotropy;
249          cfg.lod_algorithm = MALI_LOD_ALGORITHM_ANISOTROPIC;
250       }
251 #else
252       /* Emulate disabled mipmapping by clamping the LOD as tight as
253        * possible (from 0 to epsilon = 1/256) */
254       if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
255          cfg.maximum_lod = cfg.minimum_lod + (1.0 / 256.0);
256 #endif
257    }
258 
259    return so;
260 }
261 
262 /* Get pointers to the blend shaders bound to each active render target. Used
263  * to emit the blend descriptors, as well as the fragment renderer state
264  * descriptor.
265  */
266 static void
267 panfrost_get_blend_shaders(struct panfrost_batch *batch,
268                            mali_ptr *blend_shaders)
269 {
270    unsigned shader_offset = 0;
271    struct panfrost_bo *shader_bo = NULL;
272 
273    for (unsigned c = 0; c < batch->key.nr_cbufs; ++c) {
274       if (batch->key.cbufs[c]) {
275          blend_shaders[c] =
276             panfrost_get_blend(batch, c, &shader_bo, &shader_offset);
277       }
278    }
279 
280    if (shader_bo)
281       perf_debug(batch->ctx, "Blend shader use");
282 }
283 
284 #if PAN_ARCH >= 5
285 UNUSED static uint16_t
286 pack_blend_constant(enum pipe_format format, float cons)
287 {
288    const struct util_format_description *format_desc =
289       util_format_description(format);
290 
291    unsigned chan_size = 0;
292 
293    for (unsigned i = 0; i < format_desc->nr_channels; i++)
294       chan_size = MAX2(format_desc->channel[i].size, chan_size);
295 
296    uint16_t unorm = (cons * ((1 << chan_size) - 1));
297    return unorm << (16 - chan_size);
298 }
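/* Worked example: for a B5G6R5 target the widest channel is 6 bits, so a
 * blend constant of 1.0 packs as 63 << (16 - 6) = 0xfc00, i.e. the constant
 * is quantized against the widest channel and left-aligned in 16 bits. */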
299 
300 static void
301 panfrost_emit_blend(struct panfrost_batch *batch, void *rts,
302                     mali_ptr *blend_shaders)
303 {
304    unsigned rt_count = batch->key.nr_cbufs;
305    struct panfrost_context *ctx = batch->ctx;
306    const struct panfrost_blend_state *so = ctx->blend;
307    bool dithered = so->base.dither;
308 
309    /* Always have at least one render target for depth-only passes */
310    for (unsigned i = 0; i < MAX2(rt_count, 1); ++i) {
311       struct mali_blend_packed *packed = rts + (i * pan_size(BLEND));
312 
313       /* Disable blending for unbacked render targets */
314       if (rt_count == 0 || !batch->key.cbufs[i] || !so->info[i].enabled) {
315          pan_pack(rts + i * pan_size(BLEND), BLEND, cfg) {
316             cfg.enable = false;
317 #if PAN_ARCH >= 6
318             cfg.internal.mode = MALI_BLEND_MODE_OFF;
319 #endif
320          }
321 
322          continue;
323       }
324 
325       struct pan_blend_info info = so->info[i];
326       enum pipe_format format = batch->key.cbufs[i]->format;
327       float cons =
328          pan_blend_get_constant(info.constant_mask, ctx->blend_color.color);
329 
330       /* Word 0: Flags and constant */
331       pan_pack(packed, BLEND, cfg) {
332          cfg.srgb = util_format_is_srgb(format);
333          cfg.load_destination = info.load_dest;
334          cfg.round_to_fb_precision = !dithered;
335          cfg.alpha_to_one = ctx->blend->base.alpha_to_one;
336 #if PAN_ARCH >= 6
337          if (!blend_shaders[i])
338             cfg.constant = pack_blend_constant(format, cons);
339 #else
340          cfg.blend_shader = (blend_shaders[i] != 0);
341 
342          if (blend_shaders[i])
343             cfg.shader_pc = blend_shaders[i];
344          else
345             cfg.constant = cons;
346 #endif
347       }
348 
349       if (!blend_shaders[i]) {
350          /* Word 1: Blend Equation */
351          STATIC_ASSERT(pan_size(BLEND_EQUATION) == 4);
352          packed->opaque[PAN_ARCH >= 6 ? 1 : 2] = so->equation[i];
353       }
354 
355 #if PAN_ARCH >= 6
356       struct panfrost_compiled_shader *fs = ctx->prog[PIPE_SHADER_FRAGMENT];
357 
358       /* Words 2 and 3: Internal blend */
359       if (blend_shaders[i]) {
360          /* The blend shader's address needs to be at
361           * the same top 32 bit as the fragment shader.
362           * TODO: Ensure that's always the case.
363           */
364          assert(!fs->bin.bo || (blend_shaders[i] & (0xffffffffull << 32)) ==
365                                   (fs->bin.gpu & (0xffffffffull << 32)));
366 
367          pan_pack(&packed->opaque[2], INTERNAL_BLEND, cfg) {
368             cfg.mode = MALI_BLEND_MODE_SHADER;
369             cfg.shader.pc = (u32)blend_shaders[i];
370 
371 #if PAN_ARCH <= 7
372             unsigned ret_offset = fs->info.bifrost.blend[i].return_offset;
373             assert(!(ret_offset & 0x7));
374 
375             cfg.shader.return_value = ret_offset ? fs->bin.gpu + ret_offset : 0;
376 #endif
377          }
378       } else {
379          pan_pack(&packed->opaque[2], INTERNAL_BLEND, cfg) {
380             cfg.mode = info.opaque ? MALI_BLEND_MODE_OPAQUE
381                                    : MALI_BLEND_MODE_FIXED_FUNCTION;
382 
383             /* If we want the conversion to work properly,
384              * num_comps must be set to 4
385              */
386             cfg.fixed_function.num_comps = 4;
387             cfg.fixed_function.conversion.memory_format = GENX(
388                panfrost_dithered_format_from_pipe_format)(format, dithered);
389             cfg.fixed_function.rt = i;
390 
391 #if PAN_ARCH >= 7
392             if (cfg.mode == MALI_BLEND_MODE_FIXED_FUNCTION &&
393                 (cfg.fixed_function.conversion.memory_format & 0xff) ==
394                    MALI_RGB_COMPONENT_ORDER_RGB1) {
395                /* fixed function does not like RGB1 as the component order */
396                /* force this field to be the default 0 (RGBA) */
397                cfg.fixed_function.conversion.memory_format &= ~0xff;
398                cfg.fixed_function.conversion.memory_format |=
399                   MALI_RGB_COMPONENT_ORDER_RGBA;
400             }
401 #endif
402 #if PAN_ARCH <= 7
403             if (!info.opaque) {
404                cfg.fixed_function.alpha_zero_nop = info.alpha_zero_nop;
405                cfg.fixed_function.alpha_one_store = info.alpha_one_store;
406             }
407 
408             if (fs->info.fs.untyped_color_outputs) {
409                cfg.fixed_function.conversion.register_format = GENX(
410                   pan_fixup_blend_type)(fs->info.bifrost.blend[i].type, format);
411             } else {
412                cfg.fixed_function.conversion.register_format =
413                   fs->info.bifrost.blend[i].format;
414             }
415 #endif
416          }
417       }
418 #endif
419    }
420 }
421 #endif
422 
423 static mali_ptr
424 panfrost_emit_compute_shader_meta(struct panfrost_batch *batch,
425                                   enum pipe_shader_type stage)
426 {
427    struct panfrost_compiled_shader *ss = batch->ctx->prog[stage];
428 
429    panfrost_batch_add_bo(batch, ss->bin.bo, PIPE_SHADER_VERTEX);
430    panfrost_batch_add_bo(batch, ss->state.bo, PIPE_SHADER_VERTEX);
431 
432    return ss->state.gpu;
433 }
434 
435 static float
436 panfrost_z_depth_offset(struct panfrost_context *ctx, float offset_units)
437 {
438    if (ctx->pipe_framebuffer.zsbuf) {
439       if (util_format_is_float(ctx->pipe_framebuffer.zsbuf->format)) {
440          /* no scaling necessary, hw will do this at run time */
441          return offset_units;
442       }
443    }
444    /* if fixed point, apply the minimum resolvable difference scaling here */
445    return 2.0f * offset_units;
446 }
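/* E.g. a fixed-point Z16/Z24 buffer turns offset_units = 1.0 into 2.0 here,
 * while float depth passes through untouched since the hardware applies the
 * minimum-resolvable-difference scaling itself in that case. */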
447 
448 #if PAN_ARCH <= 7
449 /* Construct a partial RSD corresponding to no executed fragment shader, and
450  * merge with the existing partial RSD. */
451 
452 static void
453 pan_merge_empty_fs(struct mali_renderer_state_packed *rsd)
454 {
455    struct mali_renderer_state_packed empty_rsd;
456 
457    pan_pack(&empty_rsd, RENDERER_STATE, cfg) {
458 #if PAN_ARCH >= 6
459       cfg.properties.shader_modifies_coverage = true;
460       cfg.properties.allow_forward_pixel_to_kill = true;
461       cfg.properties.allow_forward_pixel_to_be_killed = true;
462       cfg.properties.zs_update_operation = MALI_PIXEL_KILL_STRONG_EARLY;
463 
464       /* Alpha isn't written so these are vacuous */
465       cfg.multisample_misc.overdraw_alpha0 = true;
466       cfg.multisample_misc.overdraw_alpha1 = true;
467 #else
468       cfg.shader.shader = 0x1;
469       cfg.properties.work_register_count = 1;
470       cfg.properties.depth_source = MALI_DEPTH_SOURCE_FIXED_FUNCTION;
471       cfg.properties.force_early_z = true;
472 #endif
473    }
474 
475    pan_merge((*rsd), empty_rsd, RENDERER_STATE);
476 }
477 
478 static void
479 panfrost_prepare_fs_state(struct panfrost_context *ctx, mali_ptr *blend_shaders,
480                           struct mali_renderer_state_packed *rsd)
481 {
482    struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
483    const struct panfrost_zsa_state *zsa = ctx->depth_stencil;
484    struct panfrost_compiled_shader *fs = ctx->prog[PIPE_SHADER_FRAGMENT];
485    struct panfrost_blend_state *so = ctx->blend;
486    bool alpha_to_coverage = ctx->blend->base.alpha_to_coverage;
487    bool msaa = rast->multisample;
488 
489    unsigned rt_count = ctx->pipe_framebuffer.nr_cbufs;
490 
491    bool has_blend_shader = false;
492 
493    for (unsigned c = 0; c < rt_count; ++c)
494       has_blend_shader |= (blend_shaders[c] != 0);
495 
496    bool has_oq = ctx->occlusion_query && ctx->active_queries;
497 
498    pan_pack(rsd, RENDERER_STATE, cfg) {
499       if (panfrost_fs_required(fs, so, &ctx->pipe_framebuffer, zsa)) {
500 #if PAN_ARCH >= 6
501          struct pan_earlyzs_state earlyzs = pan_earlyzs_get(
502             fs->earlyzs, ctx->depth_stencil->writes_zs || has_oq,
503             ctx->blend->base.alpha_to_coverage,
504             ctx->depth_stencil->zs_always_passes);
505 
506          cfg.properties.pixel_kill_operation = earlyzs.kill;
507          cfg.properties.zs_update_operation = earlyzs.update;
508 
509          cfg.properties.allow_forward_pixel_to_kill =
510             pan_allow_forward_pixel_to_kill(ctx, fs);
511 #else
512          cfg.properties.force_early_z =
513             fs->info.fs.can_early_z && !alpha_to_coverage &&
514             ((enum mali_func)zsa->base.alpha_func == MALI_FUNC_ALWAYS);
515 
516          /* TODO: Reduce this limit? */
517          if (has_blend_shader)
518             cfg.properties.work_register_count =
519                MAX2(fs->info.work_reg_count, 8);
520          else
521             cfg.properties.work_register_count = fs->info.work_reg_count;
522 
523          /* Hardware quirks around early-zs forcing without a
524           * depth buffer. Note this breaks occlusion queries. */
525          bool force_ez_with_discard = !zsa->enabled && !has_oq;
526 
527          cfg.properties.shader_reads_tilebuffer =
528             force_ez_with_discard && fs->info.fs.can_discard;
529          cfg.properties.shader_contains_discard =
530             !force_ez_with_discard && fs->info.fs.can_discard;
531 #endif
532       }
533 
534 #if PAN_ARCH == 4
535       if (rt_count > 0) {
536          cfg.multisample_misc.load_destination = so->info[0].load_dest;
537          cfg.multisample_misc.blend_shader = (blend_shaders[0] != 0);
538          cfg.stencil_mask_misc.write_enable = so->info[0].enabled;
539          cfg.stencil_mask_misc.srgb =
540             util_format_is_srgb(ctx->pipe_framebuffer.cbufs[0]->format);
541          cfg.stencil_mask_misc.dither_disable = !so->base.dither;
542          cfg.stencil_mask_misc.alpha_to_one = so->base.alpha_to_one;
543 
544          if (blend_shaders[0]) {
545             cfg.blend_shader = blend_shaders[0];
546          } else {
547             cfg.blend_constant = pan_blend_get_constant(
548                so->info[0].constant_mask, ctx->blend_color.color);
549          }
550       } else {
551          /* If there is no colour buffer, leaving fields default is
552           * fine, except for blending which is nonnullable */
553          cfg.blend_equation.color_mask = 0xf;
554          cfg.blend_equation.rgb.a = MALI_BLEND_OPERAND_A_SRC;
555          cfg.blend_equation.rgb.b = MALI_BLEND_OPERAND_B_SRC;
556          cfg.blend_equation.rgb.c = MALI_BLEND_OPERAND_C_ZERO;
557          cfg.blend_equation.alpha.a = MALI_BLEND_OPERAND_A_SRC;
558          cfg.blend_equation.alpha.b = MALI_BLEND_OPERAND_B_SRC;
559          cfg.blend_equation.alpha.c = MALI_BLEND_OPERAND_C_ZERO;
560       }
561 #elif PAN_ARCH == 5
562       /* Workaround */
563       cfg.legacy_blend_shader = panfrost_last_nonnull(blend_shaders, rt_count);
564 #endif
565 
566       cfg.multisample_misc.sample_mask = msaa ? ctx->sample_mask : 0xFFFF;
567 
568       cfg.multisample_misc.evaluate_per_sample = msaa && (ctx->min_samples > 1);
569 
570 #if PAN_ARCH >= 6
571       /* MSAA blend shaders need to pass their sample ID to
572        * LD_TILE/ST_TILE, so we must preload it. Additionally, we
573        * need per-sample shading for the blend shader, accomplished
574        * by forcing per-sample shading for the whole program. */
575 
576       if (msaa && has_blend_shader) {
577          cfg.multisample_misc.evaluate_per_sample = true;
578          cfg.preload.fragment.sample_mask_id = true;
579       }
580 
581       /* Bifrost does not have native point sprites. Point sprites are
582        * lowered in the driver to gl_PointCoord reads. This field
583        * actually controls the orientation of gl_PointCoord. Both
584        * orientations are controlled with sprite_coord_mode in
585        * Gallium.
586        */
587       cfg.properties.point_sprite_coord_origin_max_y =
588          (rast->sprite_coord_mode == PIPE_SPRITE_COORD_LOWER_LEFT);
589 
590       cfg.multisample_misc.overdraw_alpha0 = panfrost_overdraw_alpha(ctx, 0);
591       cfg.multisample_misc.overdraw_alpha1 = panfrost_overdraw_alpha(ctx, 1);
592 #endif
593 
594       cfg.stencil_mask_misc.alpha_to_coverage = alpha_to_coverage;
595       cfg.depth_units = panfrost_z_depth_offset(ctx, rast->offset_units);
596       cfg.depth_factor = rast->offset_scale;
597       cfg.depth_bias_clamp = rast->offset_clamp;
598 
599       bool back_enab = zsa->base.stencil[1].enabled;
600       cfg.stencil_front.reference_value = ctx->stencil_ref.ref_value[0];
601       cfg.stencil_back.reference_value =
602          ctx->stencil_ref.ref_value[back_enab ? 1 : 0];
603 
604 #if PAN_ARCH <= 5
605       /* v6+ fits register preload here, no alpha testing */
606       cfg.alpha_reference = zsa->base.alpha_ref_value;
607 #endif
608    }
609 }
610 
611 static void
612 panfrost_emit_frag_shader(struct panfrost_context *ctx,
613                           struct mali_renderer_state_packed *fragmeta,
614                           mali_ptr *blend_shaders)
615 {
616    const struct panfrost_zsa_state *zsa = ctx->depth_stencil;
617    const struct panfrost_rasterizer *rast = ctx->rasterizer;
618    struct panfrost_compiled_shader *fs = ctx->prog[PIPE_SHADER_FRAGMENT];
619 
620    /* We need to merge several partial renderer state descriptors,
621     * so stage to temporary storage rather than reading back write-combine
622     * memory, which will trash performance. */
623    struct mali_renderer_state_packed rsd;
624    panfrost_prepare_fs_state(ctx, blend_shaders, &rsd);
625 
626 #if PAN_ARCH == 4
627    if (ctx->pipe_framebuffer.nr_cbufs > 0 && !blend_shaders[0]) {
628       /* Word 14: SFBD Blend Equation */
629       STATIC_ASSERT(pan_size(BLEND_EQUATION) == 4);
630       rsd.opaque[14] = ctx->blend->equation[0];
631    }
632 #endif
633 
634    /* Merge with CSO state and upload */
635    if (panfrost_fs_required(fs, ctx->blend, &ctx->pipe_framebuffer, zsa)) {
636       struct mali_renderer_state_packed *partial_rsd =
637          (struct mali_renderer_state_packed *)&fs->partial_rsd;
638       STATIC_ASSERT(sizeof(fs->partial_rsd) == sizeof(*partial_rsd));
639       pan_merge(rsd, *partial_rsd, RENDERER_STATE);
640    } else {
641       pan_merge_empty_fs(&rsd);
642    }
643 
644    /* Word 8, 9 Misc state */
645    rsd.opaque[8] |= zsa->rsd_depth.opaque[0] | rast->multisample.opaque[0];
646 
647    rsd.opaque[9] |= zsa->rsd_stencil.opaque[0] | rast->stencil_misc.opaque[0];
648 
649    /* late patching of the merged RSD in case of line-smoothing */
650    if (u_reduced_prim(ctx->active_prim) == MESA_PRIM_LINES &&
651        rast->base.line_smooth) {
652       rsd.opaque[8] |= (1u << 16); // multisample_enable = 1
653       rsd.opaque[9] &= ~(1u << 30); // single_sampled_lines = 0
654    }
655 
656    /* Word 10, 11 Stencil Front and Back */
657    rsd.opaque[10] |= zsa->stencil_front.opaque[0];
658    rsd.opaque[11] |= zsa->stencil_back.opaque[0];
659 
660    memcpy(fragmeta, &rsd, sizeof(rsd));
661 }
662 
663 static mali_ptr
664 panfrost_emit_frag_shader_meta(struct panfrost_batch *batch)
665 {
666    struct panfrost_context *ctx = batch->ctx;
667    struct panfrost_compiled_shader *ss = ctx->prog[PIPE_SHADER_FRAGMENT];
668 
669    panfrost_batch_add_bo(batch, ss->bin.bo, PIPE_SHADER_FRAGMENT);
670    panfrost_batch_add_bo(batch, ss->state.bo, PIPE_SHADER_FRAGMENT);
671 
672    struct panfrost_ptr xfer;
673 
674 #if PAN_ARCH == 4
675    xfer = pan_pool_alloc_desc(&batch->pool.base, RENDERER_STATE);
676 #else
677    unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
678 
679    xfer =
680       pan_pool_alloc_desc_aggregate(&batch->pool.base, PAN_DESC(RENDERER_STATE),
681                                     PAN_DESC_ARRAY(rt_count, BLEND));
682 #endif
683 
684    mali_ptr blend_shaders[PIPE_MAX_COLOR_BUFS] = {0};
685    panfrost_get_blend_shaders(batch, blend_shaders);
686 
687    panfrost_emit_frag_shader(ctx, (struct mali_renderer_state_packed *)xfer.cpu,
688                              blend_shaders);
689 
690 #if PAN_ARCH >= 5
691    panfrost_emit_blend(batch, xfer.cpu + pan_size(RENDERER_STATE),
692                        blend_shaders);
693 #endif
694 
695    return xfer.gpu;
696 }
697 #endif
698 
699 static mali_ptr
700 panfrost_emit_viewport(struct panfrost_batch *batch)
701 {
702    struct panfrost_context *ctx = batch->ctx;
703    const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
704    const struct pipe_scissor_state *ss = &ctx->scissor;
705    const struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
706 
707    /* Derive min/max from translate/scale. Note since |x| >= 0 by
708     * definition, we have that -|x| <= |x| hence translate - |scale| <=
709     * translate + |scale|, so the ordering is correct here. */
710    float vp_minx = vp->translate[0] - fabsf(vp->scale[0]);
711    float vp_maxx = vp->translate[0] + fabsf(vp->scale[0]);
712    float vp_miny = vp->translate[1] - fabsf(vp->scale[1]);
713    float vp_maxy = vp->translate[1] + fabsf(vp->scale[1]);
714 
715    float minz, maxz;
716    util_viewport_zmin_zmax(vp, rast->clip_halfz, &minz, &maxz);
717 
718    /* Scissor to the intersection of viewport and to the scissor, clamped
719     * to the framebuffer */
720 
721    unsigned minx = MIN2(batch->key.width, MAX2((int)vp_minx, 0));
722    unsigned maxx = MIN2(batch->key.width, MAX2((int)vp_maxx, 0));
723    unsigned miny = MIN2(batch->key.height, MAX2((int)vp_miny, 0));
724    unsigned maxy = MIN2(batch->key.height, MAX2((int)vp_maxy, 0));
725 
726    if (ss && rast->scissor) {
727       minx = MAX2(ss->minx, minx);
728       miny = MAX2(ss->miny, miny);
729       maxx = MIN2(ss->maxx, maxx);
730       maxy = MIN2(ss->maxy, maxy);
731    }
732 
733    /* Set the range to [1, 1) so max values don't wrap round */
734    if (maxx == 0 || maxy == 0)
735       maxx = maxy = minx = miny = 1;
736 
737    panfrost_batch_union_scissor(batch, minx, miny, maxx, maxy);
738    batch->scissor_culls_everything = (minx >= maxx || miny >= maxy);
739 
740    /* Our [minx, maxx) and [miny, maxy) ranges are exclusive, but the
741     * hardware scissor maxima are inclusive, so convert */
741    maxx--;
742    maxy--;
743 
744    batch->minimum_z = minz;
745    batch->maximum_z = maxz;
746 
747 #if PAN_ARCH <= 7
748    struct panfrost_ptr T = pan_pool_alloc_desc(&batch->pool.base, VIEWPORT);
749 
750    pan_pack(T.cpu, VIEWPORT, cfg) {
751       cfg.scissor_minimum_x = minx;
752       cfg.scissor_minimum_y = miny;
753       cfg.scissor_maximum_x = maxx;
754       cfg.scissor_maximum_y = maxy;
755 
756       cfg.minimum_z = batch->minimum_z;
757       cfg.maximum_z = batch->maximum_z;
758    }
759 
760    return T.gpu;
761 #else
762    pan_pack(&batch->scissor, SCISSOR, cfg) {
763       cfg.scissor_minimum_x = minx;
764       cfg.scissor_minimum_y = miny;
765       cfg.scissor_maximum_x = maxx;
766       cfg.scissor_maximum_y = maxy;
767    }
768 
769    return 0;
770 #endif
771 }
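/* On v9+ there is no standalone viewport descriptor: the packed scissor is
 * kept on the batch for the later draw/framebuffer emission path, so callers
 * receive a null descriptor address. */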
772 
773 #if PAN_ARCH >= 9
774 /**
775  * Emit a Valhall depth/stencil descriptor at draw-time. The bulk of the
776  * descriptor corresponds to a pipe_depth_stencil_alpha CSO and is packed at
777  * CSO create time. However, the stencil reference values and shader
778  * interactions are dynamic state. Pack only the dynamic state here and OR
779  * together.
780  */
781 static mali_ptr
782 panfrost_emit_depth_stencil(struct panfrost_batch *batch)
783 {
784    struct panfrost_context *ctx = batch->ctx;
785    const struct panfrost_zsa_state *zsa = ctx->depth_stencil;
786    struct panfrost_rasterizer *rast = ctx->rasterizer;
787    struct panfrost_compiled_shader *fs = ctx->prog[PIPE_SHADER_FRAGMENT];
788    bool back_enab = zsa->base.stencil[1].enabled;
789 
790    struct panfrost_ptr T =
791       pan_pool_alloc_desc(&batch->pool.base, DEPTH_STENCIL);
792    struct mali_depth_stencil_packed dynamic;
793 
794    pan_pack(&dynamic, DEPTH_STENCIL, cfg) {
795       cfg.front_reference_value = ctx->stencil_ref.ref_value[0];
796       cfg.back_reference_value = ctx->stencil_ref.ref_value[back_enab ? 1 : 0];
797 
798       cfg.stencil_from_shader = fs->info.fs.writes_stencil;
799       cfg.depth_source = pan_depth_source(&fs->info);
800 
801       cfg.depth_bias_enable = rast->base.offset_tri;
802       cfg.depth_units = panfrost_z_depth_offset(ctx, rast->base.offset_units);
803       cfg.depth_factor = rast->base.offset_scale;
804       cfg.depth_bias_clamp = rast->base.offset_clamp;
805 
806       assert(rast->base.depth_clip_near == rast->base.depth_clip_far);
807       cfg.depth_cull_enable = rast->base.depth_clip_near;
808       cfg.depth_clamp_mode = rast->base.depth_clamp
809                                 ? MALI_DEPTH_CLAMP_MODE_BOUNDS
810                                 : MALI_DEPTH_CLAMP_MODE_0_1;
811    }
812 
813    pan_merge(dynamic, zsa->desc, DEPTH_STENCIL);
814    memcpy(T.cpu, &dynamic, pan_size(DEPTH_STENCIL));
815 
816    return T.gpu;
817 }
818 
819 /**
820  * Emit Valhall blend descriptor at draw-time. The descriptor itself is shared
821  * with Bifrost, but the container data structure is simplified.
822  */
823 static mali_ptr
824 panfrost_emit_blend_valhall(struct panfrost_batch *batch)
825 {
826    unsigned rt_count = MAX2(batch->key.nr_cbufs, 1);
827 
828    struct panfrost_ptr T =
829       pan_pool_alloc_desc_array(&batch->pool.base, rt_count, BLEND);
830 
831    mali_ptr blend_shaders[PIPE_MAX_COLOR_BUFS] = {0};
832    panfrost_get_blend_shaders(batch, blend_shaders);
833 
834    panfrost_emit_blend(batch, T.cpu, blend_shaders);
835 
836    /* Precalculate for the per-draw path */
837    bool has_blend_shader = false;
838 
839    for (unsigned i = 0; i < rt_count; ++i)
840       has_blend_shader |= !!blend_shaders[i];
841 
842    batch->ctx->valhall_has_blend_shader = has_blend_shader;
843 
844    return T.gpu;
845 }
846 
847 /**
848  * Emit Valhall buffer descriptors for bound vertex buffers at draw-time.
849  */
850 static mali_ptr
851 panfrost_emit_vertex_buffers(struct panfrost_batch *batch)
852 {
853    struct panfrost_context *ctx = batch->ctx;
854    unsigned buffer_count = util_last_bit(ctx->vb_mask);
855    struct panfrost_ptr T =
856       pan_pool_alloc_desc_array(&batch->pool.base, buffer_count, BUFFER);
857    struct mali_buffer_packed *buffers = T.cpu;
858 
859    u_foreach_bit(i, ctx->vb_mask) {
860       struct pipe_vertex_buffer vb = ctx->vertex_buffers[i];
861       struct pipe_resource *prsrc = vb.buffer.resource;
862       struct panfrost_resource *rsrc = pan_resource(prsrc);
863       assert(!vb.is_user_buffer);
864 
865       panfrost_batch_read_rsrc(batch, rsrc, PIPE_SHADER_VERTEX);
866 
867       pan_pack(buffers + i, BUFFER, cfg) {
868          cfg.address = rsrc->image.data.base + vb.buffer_offset;
869 
870          cfg.size = prsrc->width0 - vb.buffer_offset;
871       }
872    }
873 
874    return T.gpu;
875 }
876 
877 static mali_ptr
878 panfrost_emit_vertex_data(struct panfrost_batch *batch)
879 {
880    struct panfrost_context *ctx = batch->ctx;
881    struct panfrost_vertex_state *vtx = ctx->vertex;
882 
883    return pan_pool_upload_aligned(&batch->pool.base, vtx->attributes,
884                                   vtx->num_elements * pan_size(ATTRIBUTE),
885                                   pan_alignment(ATTRIBUTE));
886 }
887 
888 static void panfrost_update_sampler_view(struct panfrost_sampler_view *view,
889                                          struct pipe_context *pctx);
890 
891 static mali_ptr
892 panfrost_emit_images(struct panfrost_batch *batch, enum pipe_shader_type stage)
893 {
894    struct panfrost_context *ctx = batch->ctx;
895    unsigned last_bit = util_last_bit(ctx->image_mask[stage]);
896 
897    struct panfrost_ptr T =
898       pan_pool_alloc_desc_array(&batch->pool.base, last_bit, TEXTURE);
899 
900    struct mali_texture_packed *out = (struct mali_texture_packed *)T.cpu;
901 
902    for (int i = 0; i < last_bit; ++i) {
903       struct pipe_image_view *image = &ctx->images[stage][i];
904 
905       if (!(ctx->image_mask[stage] & BITFIELD_BIT(i))) {
906          memset(&out[i], 0, sizeof(out[i]));
907          continue;
908       }
909 
910       /* Construct a synthetic sampler view so we can use our usual
911        * sampler view code for the actual descriptor packing.
912        *
913        * Use the batch pool for a transient allocation, rather than
914        * allocating a long-lived descriptor.
915        */
916       struct panfrost_sampler_view view = {
917          .base = util_image_to_sampler_view(image),
918          .pool = &batch->pool,
919       };
920 
921       /* If we specify a cube map, the hardware internally treats it as
922        * a 2D array. Since cube maps as images can confuse our common
923        * texturing code, explicitly use a 2D array.
924        *
925        * Similar concerns apply to 3D textures.
926        */
927       if (view.base.target == PIPE_BUFFER)
928          view.base.target = PIPE_BUFFER;
929       else
930          view.base.target = PIPE_TEXTURE_2D_ARRAY;
931 
932       panfrost_update_sampler_view(&view, &ctx->base);
933       out[i] = view.bifrost_descriptor;
934 
935       panfrost_track_image_access(batch, stage, image);
936    }
937 
938    return T.gpu;
939 }
940 #endif
941 
942 static mali_ptr
943 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
944                                  enum pipe_shader_type st,
945                                  struct panfrost_constant_buffer *buf,
946                                  unsigned index)
947 {
948    struct pipe_constant_buffer *cb = &buf->cb[index];
949    struct panfrost_resource *rsrc = pan_resource(cb->buffer);
950 
951    if (rsrc) {
952       panfrost_batch_read_rsrc(batch, rsrc, st);
953 
954       /* Alignment guaranteed by
955        * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
956       return rsrc->image.data.base + cb->buffer_offset;
957    } else if (cb->user_buffer) {
958       return pan_pool_upload_aligned(&batch->pool.base,
959                                      cb->user_buffer + cb->buffer_offset,
960                                      cb->buffer_size, 16);
961    } else {
962       unreachable("No constant buffer");
963    }
964 }
965 
966 struct sysval_uniform {
967    union {
968       float f[4];
969       int32_t i[4];
970       uint32_t u[4];
971       uint64_t du[2];
972    };
973 };
974 
975 static void
976 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
977                                       struct sysval_uniform *uniform)
978 {
979    struct panfrost_context *ctx = batch->ctx;
980    const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
981 
982    uniform->f[0] = vp->scale[0];
983    uniform->f[1] = vp->scale[1];
984    uniform->f[2] = vp->scale[2];
985 }
986 
987 static void
988 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
989                                        struct sysval_uniform *uniform)
990 {
991    struct panfrost_context *ctx = batch->ctx;
992    const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
993 
994    uniform->f[0] = vp->translate[0];
995    uniform->f[1] = vp->translate[1];
996    uniform->f[2] = vp->translate[2];
997 }
998 
999 static void
1000 panfrost_upload_txs_sysval(struct panfrost_batch *batch,
1001                            enum pipe_shader_type st, unsigned int sysvalid,
1002                            struct sysval_uniform *uniform)
1003 {
1004    struct panfrost_context *ctx = batch->ctx;
1005    unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
1006    unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
1007    bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
1008    struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
1009 
1010    assert(dim);
1011 
1012    if (tex->target == PIPE_BUFFER) {
1013       assert(dim == 1);
1014       unsigned buf_size = tex->u.buf.size / util_format_get_blocksize(tex->format);
1015       uniform->i[0] = MIN2(buf_size, PAN_MAX_TEXEL_BUFFER_ELEMENTS);
1016       return;
1017    }
1018 
1019    uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
1020 
1021    if (dim > 1)
1022       uniform->i[1] = u_minify(tex->texture->height0, tex->u.tex.first_level);
1023 
1024    if (dim > 2)
1025       uniform->i[2] = u_minify(tex->texture->depth0, tex->u.tex.first_level);
1026 
1027    if (is_array) {
1028       unsigned size = tex->texture->array_size;
1029 
1030       /* Internally, we store the number of 2D images (faces * array
1031        * size). Externally, we report the array size in terms of
1032        * complete cubes. So divide by the # of faces per cube.
1033        */
1034       if (tex->target == PIPE_TEXTURE_CUBE_ARRAY)
1035          size /= 6;
1036 
1037       uniform->i[dim] = size;
1038    }
1039 }
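/* E.g. a cube map array allocated as 12 layers (two cubes) reports an array
 * size of 2 here, matching the complete-cube semantics GLSL expects from
 * textureSize(). */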
1040 
1041 static void
1042 panfrost_upload_image_size_sysval(struct panfrost_batch *batch,
1043                                   enum pipe_shader_type st,
1044                                   unsigned int sysvalid,
1045                                   struct sysval_uniform *uniform)
1046 {
1047    struct panfrost_context *ctx = batch->ctx;
1048    unsigned idx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
1049    unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
1050    unsigned is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
1051 
1052    assert(dim && dim < 4);
1053 
1054    struct pipe_image_view *image = &ctx->images[st][idx];
1055 
1056    if (image->resource->target == PIPE_BUFFER) {
1057       unsigned blocksize = util_format_get_blocksize(image->format);
1058       uniform->i[0] = image->resource->width0 / blocksize;
1059       return;
1060    }
1061 
1062    uniform->i[0] = u_minify(image->resource->width0, image->u.tex.level);
1063 
1064    if (dim > 1)
1065       uniform->i[1] = u_minify(image->resource->height0, image->u.tex.level);
1066 
1067    if (dim > 2)
1068       uniform->i[2] = u_minify(image->resource->depth0, image->u.tex.level);
1069 
1070    if (is_array)
1071       uniform->i[dim] = image->resource->array_size;
1072 }
1073 
1074 static void
1075 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
1076                             enum pipe_shader_type st, unsigned ssbo_id,
1077                             struct sysval_uniform *uniform)
1078 {
1079    struct panfrost_context *ctx = batch->ctx;
1080 
1081    assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
1082    struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
1083 
1084    /* Compute address */
1085    struct panfrost_resource *rsrc = pan_resource(sb.buffer);
1086    struct panfrost_bo *bo = rsrc->bo;
1087 
1088    panfrost_batch_write_rsrc(batch, rsrc, st);
1089 
1090    util_range_add(&rsrc->base, &rsrc->valid_buffer_range, sb.buffer_offset,
1091                   sb.buffer_size);
1092 
1093    /* Upload address and size as sysval */
1094    uniform->du[0] = bo->ptr.gpu + sb.buffer_offset;
1095    uniform->u[2] = sb.buffer_size;
1096 }
1097 
1098 static void
1099 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
1100                                enum pipe_shader_type st, unsigned samp_idx,
1101                                struct sysval_uniform *uniform)
1102 {
1103    struct panfrost_context *ctx = batch->ctx;
1104    struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
1105 
1106    uniform->f[0] = sampl->min_lod;
1107    uniform->f[1] = sampl->max_lod;
1108    uniform->f[2] = sampl->lod_bias;
1109 
1110    /* Even without any errata, Midgard represents "no mipmapping" as
1111     * fixing the LOD with the clamps; keep behaviour consistent. c.f.
1112     * panfrost_create_sampler_state which also explains our choice of
1113     * epsilon value (again to keep behaviour consistent) */
1114 
1115    if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
1116       uniform->f[1] = uniform->f[0] + (1.0 / 256.0);
1117 }
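/* E.g. min_lod = 2.0 with mipmapping disabled yields an effective LOD range
 * of [2.0, 2.0 + 1/256], pinning sampling to a single mip level just like
 * the sampler descriptor path above. */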
1118 
1119 static void
1120 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
1121                                        struct sysval_uniform *uniform)
1122 {
1123    struct panfrost_context *ctx = batch->ctx;
1124 
1125    uniform->u[0] = ctx->compute_grid->grid[0];
1126    uniform->u[1] = ctx->compute_grid->grid[1];
1127    uniform->u[2] = ctx->compute_grid->grid[2];
1128 }
1129 
1130 static void
1131 panfrost_upload_local_group_size_sysval(struct panfrost_batch *batch,
1132                                         struct sysval_uniform *uniform)
1133 {
1134    struct panfrost_context *ctx = batch->ctx;
1135 
1136    uniform->u[0] = ctx->compute_grid->block[0];
1137    uniform->u[1] = ctx->compute_grid->block[1];
1138    uniform->u[2] = ctx->compute_grid->block[2];
1139 }
1140 
1141 static void
1142 panfrost_upload_work_dim_sysval(struct panfrost_batch *batch,
1143                                 struct sysval_uniform *uniform)
1144 {
1145    struct panfrost_context *ctx = batch->ctx;
1146 
1147    uniform->u[0] = ctx->compute_grid->work_dim;
1148 }
1149 
1150 /* Sample positions are pushed in a Bifrost specific format on Bifrost. On
1151  * Midgard, we emulate the Bifrost path with some extra arithmetic in the
1152  * shader, to keep the code as unified as possible. */
1153 
1154 static void
1155 panfrost_upload_sample_positions_sysval(struct panfrost_batch *batch,
1156                                         struct sysval_uniform *uniform)
1157 {
1158    struct panfrost_context *ctx = batch->ctx;
1159    struct panfrost_device *dev = pan_device(ctx->base.screen);
1160 
1161    unsigned samples = util_framebuffer_get_num_samples(&batch->key);
1162    uniform->du[0] =
1163       dev->sample_positions->ptr.gpu +
1164       panfrost_sample_positions_offset(panfrost_sample_pattern(samples));
1165 }
1166 
1167 static void
1168 panfrost_upload_multisampled_sysval(struct panfrost_batch *batch,
1169                                     struct sysval_uniform *uniform)
1170 {
1171    unsigned samples = util_framebuffer_get_num_samples(&batch->key);
1172    uniform->u[0] = (samples > 1) ? ~0 : 0;
1173 }
1174 
1175 #if PAN_ARCH >= 6
1176 static void
1177 panfrost_upload_rt_conversion_sysval(struct panfrost_batch *batch,
1178                                      unsigned size_and_rt,
1179                                      struct sysval_uniform *uniform)
1180 {
1181    unsigned rt = size_and_rt & 0xF;
1182    unsigned size = size_and_rt >> 4;
1183 
1184    if (rt < batch->key.nr_cbufs && batch->key.cbufs[rt]) {
1185       enum pipe_format format = batch->key.cbufs[rt]->format;
1186       uniform->u[0] =
1187          GENX(pan_blend_get_internal_desc)(format, rt, size, false) >> 32;
1188    } else {
1189       pan_pack(&uniform->u[0], INTERNAL_CONVERSION, cfg)
1190          cfg.memory_format =
1191             GENX(panfrost_format_from_pipe_format)(PIPE_FORMAT_NONE)->hw;
1192    }
1193 }
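/* The sysval ID packs the render target index in the low nibble and the
 * register size in the upper bits, e.g. size_and_rt = 0x41 decodes to
 * rt = 1, size = 4. */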
1194 #endif
1195 
1196 static unsigned
1197 panfrost_xfb_offset(unsigned stride, struct pipe_stream_output_target *target)
1198 {
1199    return target->buffer_offset + (pan_so_target(target)->offset * stride);
1200 }
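/* E.g. a stream-output target with buffer_offset = 0, a stride of 16 bytes
 * and 10 vertices already recorded (pan_so_target()->offset == 10) places
 * new output at byte 160. */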
1201 
1202 static void
1203 panfrost_upload_sysvals(struct panfrost_batch *batch, void *ptr_cpu,
1204                         mali_ptr ptr_gpu, struct panfrost_compiled_shader *ss,
1205                         enum pipe_shader_type st)
1206 {
1207    struct sysval_uniform *uniforms = ptr_cpu;
1208 
1209    for (unsigned i = 0; i < ss->sysvals.sysval_count; ++i) {
1210       int sysval = ss->sysvals.sysvals[i];
1211 
1212       switch (PAN_SYSVAL_TYPE(sysval)) {
1213       case PAN_SYSVAL_VIEWPORT_SCALE:
1214          panfrost_upload_viewport_scale_sysval(batch, &uniforms[i]);
1215          break;
1216       case PAN_SYSVAL_VIEWPORT_OFFSET:
1217          panfrost_upload_viewport_offset_sysval(batch, &uniforms[i]);
1218          break;
1219       case PAN_SYSVAL_TEXTURE_SIZE:
1220          panfrost_upload_txs_sysval(batch, st, PAN_SYSVAL_ID(sysval),
1221                                     &uniforms[i]);
1222          break;
1223       case PAN_SYSVAL_SSBO:
1224          panfrost_upload_ssbo_sysval(batch, st, PAN_SYSVAL_ID(sysval),
1225                                      &uniforms[i]);
1226          break;
1227 
1228       case PAN_SYSVAL_XFB: {
1229          unsigned buf = PAN_SYSVAL_ID(sysval);
1230          struct panfrost_compiled_shader *vs =
1231             batch->ctx->prog[PIPE_SHADER_VERTEX];
1232          struct pipe_stream_output_info *so = &vs->stream_output;
1233          unsigned stride = so->stride[buf] * 4;
1234 
1235          struct pipe_stream_output_target *target = NULL;
1236          if (buf < batch->ctx->streamout.num_targets)
1237             target = batch->ctx->streamout.targets[buf];
1238 
1239          if (!target) {
1240             /* Memory sink */
1241             uniforms[i].du[0] = 0x8ull << 60;
1242             break;
1243          }
1244 
1245          struct panfrost_resource *rsrc = pan_resource(target->buffer);
1246          unsigned offset = panfrost_xfb_offset(stride, target);
1247 
1248          util_range_add(&rsrc->base, &rsrc->valid_buffer_range, offset,
1249                         target->buffer_size - offset);
1250 
1251          panfrost_batch_write_rsrc(batch, rsrc, PIPE_SHADER_VERTEX);
1252 
1253          uniforms[i].du[0] = rsrc->image.data.base + offset;
1254          break;
1255       }
1256 
1257       case PAN_SYSVAL_NUM_VERTICES:
1258          uniforms[i].u[0] = batch->ctx->vertex_count;
1259          break;
1260 
1261       case PAN_SYSVAL_NUM_WORK_GROUPS:
1262          for (unsigned j = 0; j < 3; j++) {
1263             batch->num_wg_sysval[j] =
1264                ptr_gpu + (i * sizeof(*uniforms)) + (j * 4);
1265          }
1266          panfrost_upload_num_work_groups_sysval(batch, &uniforms[i]);
1267          break;
1268       case PAN_SYSVAL_LOCAL_GROUP_SIZE:
1269          panfrost_upload_local_group_size_sysval(batch, &uniforms[i]);
1270          break;
1271       case PAN_SYSVAL_WORK_DIM:
1272          panfrost_upload_work_dim_sysval(batch, &uniforms[i]);
1273          break;
1274       case PAN_SYSVAL_SAMPLER:
1275          panfrost_upload_sampler_sysval(batch, st, PAN_SYSVAL_ID(sysval),
1276                                         &uniforms[i]);
1277          break;
1278       case PAN_SYSVAL_IMAGE_SIZE:
1279          panfrost_upload_image_size_sysval(batch, st, PAN_SYSVAL_ID(sysval),
1280                                            &uniforms[i]);
1281          break;
1282       case PAN_SYSVAL_SAMPLE_POSITIONS:
1283          panfrost_upload_sample_positions_sysval(batch, &uniforms[i]);
1284          break;
1285       case PAN_SYSVAL_MULTISAMPLED:
1286          panfrost_upload_multisampled_sysval(batch, &uniforms[i]);
1287          break;
1288 #if PAN_ARCH >= 6
1289       case PAN_SYSVAL_RT_CONVERSION:
1290          panfrost_upload_rt_conversion_sysval(batch, PAN_SYSVAL_ID(sysval),
1291                                               &uniforms[i]);
1292          break;
1293 #endif
1294       case PAN_SYSVAL_VERTEX_INSTANCE_OFFSETS:
1295          uniforms[i].u[0] = batch->ctx->offset_start;
1296          uniforms[i].u[1] = batch->ctx->base_vertex;
1297          uniforms[i].u[2] = batch->ctx->base_instance;
1298          break;
1299       case PAN_SYSVAL_DRAWID:
1300          uniforms[i].u[0] = batch->ctx->drawid;
1301          break;
1302       default:
1303          assert(0);
1304       }
1305    }
1306 }
1307 
1308 static const void *
1309 panfrost_map_constant_buffer_cpu(struct panfrost_context *ctx,
1310                                  struct panfrost_constant_buffer *buf,
1311                                  unsigned index)
1312 {
1313    struct pipe_constant_buffer *cb = &buf->cb[index];
1314    struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1315 
1316    if (rsrc) {
1317       panfrost_bo_mmap(rsrc->bo);
1318       panfrost_flush_writer(ctx, rsrc, "CPU constant buffer mapping");
1319       panfrost_bo_wait(rsrc->bo, INT64_MAX, false);
1320 
1321       return rsrc->bo->ptr.cpu + cb->buffer_offset;
1322    } else if (cb->user_buffer) {
1323       return cb->user_buffer + cb->buffer_offset;
1324    } else
1325       unreachable("No constant buffer");
1326 }
1327 
1328 /* Emit a single UBO record. On Valhall, UBOs are dumb buffers and are
1329  * implemented with buffer descriptors in the resource table, sized in terms of
1330  * bytes. On Bifrost and older, UBOs have special uniform buffer data
1331  * structure, sized in terms of entries.
1332  */
1333 static void
1334 panfrost_emit_ubo(void *base, unsigned index, mali_ptr address, size_t size)
1335 {
1336 #if PAN_ARCH >= 9
1337    struct mali_buffer_packed *out = base;
1338 
1339    pan_pack(out + index, BUFFER, cfg) {
1340       cfg.size = size;
1341       cfg.address = address;
1342    }
1343 #else
1344    struct mali_uniform_buffer_packed *out = base;
1345 
1346    /* Issue (57) for the ARB_uniform_buffer_object spec says that
1347     * the buffer can be larger than the uniform data inside it,
1348     * so clamp ubo size to what hardware supports. */
1349 
1350    pan_pack(out + index, UNIFORM_BUFFER, cfg) {
1351       cfg.entries = MIN2(DIV_ROUND_UP(size, 16), 1 << 12);
1352       cfg.pointer = address;
1353    }
1354 #endif
1355 }
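/* E.g. a 70000-byte UBO needs DIV_ROUND_UP(70000, 16) = 4375 entries, which
 * is clamped to the 4096-entry (64 KiB) hardware maximum; the Valhall BUFFER
 * descriptor above is byte-sized and needs no such clamp. */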
1356 
1357 #if PAN_ARCH >= 9
1358 static mali_ptr
1359 panfrost_emit_ssbos(struct panfrost_batch *batch, enum pipe_shader_type st)
1360 {
1361    struct panfrost_context *ctx = batch->ctx;
1362    unsigned ssbo_count = util_last_bit(ctx->ssbo_mask[st]);
1363 
1364    if (!ssbo_count)
1365       return 0;
1366 
1367    struct panfrost_ptr ssbos =
1368       pan_pool_alloc_desc_array(&batch->pool.base, ssbo_count, BUFFER);
1369    struct mali_buffer_packed *bufs = ssbos.cpu;
1370 
1371    memset(bufs, 0, sizeof(bufs[0]) * ssbo_count);
1372 
1373    u_foreach_bit(ssbo_id, ctx->ssbo_mask[st]) {
1374       struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
1375       struct panfrost_resource *rsrc = pan_resource(sb.buffer);
1376       struct panfrost_bo *bo = rsrc->bo;
1377 
1378       panfrost_batch_write_rsrc(batch, rsrc, st);
1379 
1380       util_range_add(&rsrc->base, &rsrc->valid_buffer_range, sb.buffer_offset,
1381                      sb.buffer_size);
1382       pan_pack(&bufs[ssbo_id], BUFFER, cfg) {
1383          cfg.size = sb.buffer_size;
1384          cfg.address = bo->ptr.gpu + sb.buffer_offset;
1385       }
1386    }
1387 
1388    return ssbos.gpu;
1389 }
1390 #endif
1391 
1392 static mali_ptr
1393 panfrost_emit_const_buf(struct panfrost_batch *batch,
1394                         enum pipe_shader_type stage, unsigned *buffer_count,
1395                         mali_ptr *push_constants, unsigned *pushed_words)
1396 {
1397    struct panfrost_context *ctx = batch->ctx;
1398    struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1399    struct panfrost_compiled_shader *ss = ctx->prog[stage];
1400 
1401    if (!ss)
1402       return 0;
1403 
1404    /* Allocate room for the sysvals, which are uploaded as uniforms */
1405    size_t sys_size = sizeof(float) * 4 * ss->sysvals.sysval_count;
1406    struct panfrost_ptr transfer =
1407       pan_pool_alloc_aligned(&batch->pool.base, sys_size, 16);
1408 
1409    /* Upload sysvals requested by the shader */
1410    uint8_t *sysvals = alloca(sys_size);
1411    panfrost_upload_sysvals(batch, sysvals, transfer.gpu, ss, stage);
1412    memcpy(transfer.cpu, sysvals, sys_size);
1413 
1414    /* Next up, attach UBOs. UBO count includes gaps but no sysval UBO */
1415    struct panfrost_compiled_shader *shader = ctx->prog[stage];
1416    unsigned ubo_count = shader->info.ubo_count - (sys_size ? 1 : 0);
1417    unsigned sysval_ubo = sys_size ? ubo_count : ~0;
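   /* E.g. (illustrative): with info.ubo_count = 4 and sysvals present,
    * client UBOs occupy indices 0..2 and the sysvals are appended as
    * UBO 3 below. */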
1418    struct panfrost_ptr ubos = {0};
1419 
1420 #if PAN_ARCH >= 9
1421    ubos = pan_pool_alloc_desc_array(&batch->pool.base, ubo_count + 1, BUFFER);
1422 #else
1423    ubos = pan_pool_alloc_desc_array(&batch->pool.base, ubo_count + 1,
1424                                     UNIFORM_BUFFER);
1425 #endif
1426 
1427    if (buffer_count)
1428       *buffer_count = ubo_count + (sys_size ? 1 : 0);
1429 
1430    /* Upload sysval as a final UBO */
1431 
1432    if (sys_size)
1433       panfrost_emit_ubo(ubos.cpu, ubo_count, transfer.gpu, sys_size);
1434 
1435    /* The rest are honest-to-goodness UBOs */
1436 
1437    u_foreach_bit(ubo, ss->info.ubo_mask & buf->enabled_mask) {
1438       size_t usz = buf->cb[ubo].buffer_size;
1439       mali_ptr address = 0;
1440 
1441       if (usz > 0) {
1442          address = panfrost_map_constant_buffer_gpu(batch, stage, buf, ubo);
1443       }
1444 
1445       panfrost_emit_ubo(ubos.cpu, ubo, address, usz);
1446    }
1447 
1448    if (pushed_words)
1449       *pushed_words = ss->info.push.count;
1450 
1451    if (ss->info.push.count == 0)
1452       return ubos.gpu;
1453 
1454    /* Copy push constants required by the shader */
1455    struct panfrost_ptr push_transfer =
1456       pan_pool_alloc_aligned(&batch->pool.base, ss->info.push.count * 4, 16);
1457 
1458    uint32_t *push_cpu = (uint32_t *)push_transfer.cpu;
1459    *push_constants = push_transfer.gpu;
1460 
1461    for (unsigned i = 0; i < ss->info.push.count; ++i) {
1462       struct panfrost_ubo_word src = ss->info.push.words[i];
1463 
1464       if (src.ubo == sysval_ubo) {
1465          unsigned sysval_idx = src.offset / 16;
1466          unsigned sysval_comp = (src.offset % 16) / 4;
1467          unsigned sysval_type =
1468             PAN_SYSVAL_TYPE(ss->sysvals.sysvals[sysval_idx]);
1469          mali_ptr ptr = push_transfer.gpu + (4 * i);
1470 
1471          if (sysval_type == PAN_SYSVAL_NUM_WORK_GROUPS &&
1472              sysval_comp < ARRAY_SIZE(batch->num_wg_sysval))
1473             batch->num_wg_sysval[sysval_comp] = ptr;
1474       }
1475       /* Map the UBO; this should be cheap. For some buffers this may
1476        * read from write-combine memory, which is slow, though :-(
1477        */
1478       const void *mapped_ubo =
1479          (src.ubo == sysval_ubo)
1480             ? sysvals
1481             : panfrost_map_constant_buffer_cpu(ctx, buf, src.ubo);
1482 
1483       /* TODO: Is there any benefit to combining ranges */
1484       memcpy(push_cpu + i, (uint8_t *)mapped_ubo + src.offset, 4);
1485    }
1486 
1487    return ubos.gpu;
1488 }
1489 
1490 /*
1491  * Choose the number of WLS instances to allocate. This must be a power-of-two.
1492  * The number of WLS instances limits the number of concurrent tasks on a given
1493  * shader core, setting to the (rounded) total number of tasks avoids any
1494  * throttling. Smaller values save memory at the expense of possible throttling.
1495  *
1496  * With indirect dispatch, we don't know at launch-time how many tasks will be
1497  * needed, so we use a conservative value that's unlikely to cause slowdown in
1498  * practice without wasting too much memory.
1499  */
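/* Worked example (illustrative): a 3x5x2 grid of workgroups rounds to
 * 4 * 8 * 2 = 64 WLS instances, a power-of-two covering all 30 workgroups
 * without throttling. */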
1500 static unsigned
1501 panfrost_choose_wls_instance_count(const struct pipe_grid_info *grid)
1502 {
1503    if (grid->indirect) {
1504       /* May need tuning in the future, conservative guess */
1505       return 128;
1506    } else {
1507       return util_next_power_of_two(grid->grid[0]) *
1508              util_next_power_of_two(grid->grid[1]) *
1509              util_next_power_of_two(grid->grid[2]);
1510    }
1511 }
1512 
1513 static mali_ptr
1514 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1515                             const struct pipe_grid_info *grid)
1516 {
1517    struct panfrost_context *ctx = batch->ctx;
1518    struct panfrost_device *dev = pan_device(ctx->base.screen);
1519    struct panfrost_compiled_shader *ss = ctx->prog[PIPE_SHADER_COMPUTE];
1520    struct panfrost_ptr t =
1521       pan_pool_alloc_desc(&batch->pool.base, LOCAL_STORAGE);
1522 
1523    struct pan_tls_info info = {
1524       .tls.size = ss->info.tls_size,
1525       .wls.size = ss->info.wls_size + grid->variable_shared_mem,
1526       .wls.instances = panfrost_choose_wls_instance_count(grid),
1527    };
1528 
1529    if (ss->info.tls_size) {
1530       struct panfrost_bo *bo = panfrost_batch_get_scratchpad(
1531          batch, ss->info.tls_size, dev->thread_tls_alloc, dev->core_id_range);
1532       info.tls.ptr = bo->ptr.gpu;
1533    }
1534 
1535    if (info.wls.size) {
1536       unsigned size = pan_wls_adjust_size(info.wls.size) * info.wls.instances *
1537                       dev->core_id_range;
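      /* Sizing sketch (assuming pan_wls_adjust_size leaves a power-of-two
       * size unchanged): 1 KiB of WLS with 64 instances on 8 cores
       * reserves 1024 * 64 * 8 = 512 KiB for the whole GPU. */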
1538 
1539       struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch, size, 1);
1540 
1541       info.wls.ptr = bo->ptr.gpu;
1542    }
1543 
1544    GENX(pan_emit_tls)(&info, t.cpu);
1545    return t.gpu;
1546 }
1547 
1548 #if PAN_ARCH <= 5
1549 static mali_ptr
1550 panfrost_get_tex_desc(struct panfrost_batch *batch, enum pipe_shader_type st,
1551                       struct panfrost_sampler_view *view)
1552 {
1553    if (!view)
1554       return (mali_ptr)0;
1555 
1556    struct pipe_sampler_view *pview = &view->base;
1557    struct panfrost_resource *rsrc = pan_resource(pview->texture);
1558 
1559    panfrost_batch_read_rsrc(batch, rsrc, st);
1560    panfrost_batch_add_bo(batch, view->state.bo, st);
1561 
1562    return view->state.gpu;
1563 }
1564 #endif
1565 
1566 static void
1567 panfrost_create_sampler_view_bo(struct panfrost_sampler_view *so,
1568                                 struct pipe_context *pctx,
1569                                 struct pipe_resource *texture)
1570 {
1571    struct panfrost_device *device = pan_device(pctx->screen);
1572    struct panfrost_context *ctx = pan_context(pctx);
1573    struct panfrost_resource *prsrc = (struct panfrost_resource *)texture;
1574    enum pipe_format format = so->base.format;
1575    assert(prsrc->bo);
1576 
1577    /* Format to access the stencil/depth portion of a Z32_S8 texture */
1578    if (format == PIPE_FORMAT_X32_S8X24_UINT) {
1579       assert(prsrc->separate_stencil);
1580       texture = &prsrc->separate_stencil->base;
1581       prsrc = (struct panfrost_resource *)texture;
1582       format = texture->format;
1583    } else if (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) {
1584       format = PIPE_FORMAT_Z32_FLOAT;
1585    }
1586 
1587    so->texture_bo = prsrc->image.data.base;
1588    so->modifier = prsrc->image.layout.modifier;
1589 
1590    /* MSAA is only supported for 2D and 2D array textures */
1591 
1592    assert(texture->nr_samples <= 1 || so->base.target == PIPE_TEXTURE_2D ||
1593           so->base.target == PIPE_TEXTURE_2D_ARRAY);
1594 
1595    enum mali_texture_dimension type =
1596       panfrost_translate_texture_dimension(so->base.target);
1597 
1598    bool is_buffer = (so->base.target == PIPE_BUFFER);
1599 
1600    unsigned first_level = is_buffer ? 0 : so->base.u.tex.first_level;
1601    unsigned last_level = is_buffer ? 0 : so->base.u.tex.last_level;
1602    unsigned first_layer = is_buffer ? 0 : so->base.u.tex.first_layer;
1603    unsigned last_layer = is_buffer ? 0 : so->base.u.tex.last_layer;
1604    unsigned buf_offset = is_buffer ? so->base.u.buf.offset : 0;
1605    unsigned buf_size =
1606       (is_buffer ? so->base.u.buf.size : 0) / util_format_get_blocksize(format);
1607    buf_size = MIN2(buf_size, PAN_MAX_TEXEL_BUFFER_ELEMENTS);
1608 
1609    if (so->base.target == PIPE_TEXTURE_3D) {
1610       first_layer /= prsrc->image.layout.depth;
1611       last_layer /= prsrc->image.layout.depth;
1612       assert(!first_layer && !last_layer);
1613    }
1614 
1615    struct pan_image_view iview = {
1616       .format = format,
1617       .dim = type,
1618       .first_level = first_level,
1619       .last_level = last_level,
1620       .first_layer = first_layer,
1621       .last_layer = last_layer,
1622       .swizzle =
1623          {
1624             so->base.swizzle_r,
1625             so->base.swizzle_g,
1626             so->base.swizzle_b,
1627             so->base.swizzle_a,
1628          },
1629       .planes = {NULL},
1630       .buf.offset = buf_offset,
1631       .buf.size = buf_size,
1632    };
1633 
1634    panfrost_set_image_view_planes(&iview, texture);
1635 
1636    unsigned size = (PAN_ARCH <= 5 ? pan_size(TEXTURE) : 0) +
1637                    GENX(panfrost_estimate_texture_payload_size)(&iview);
1638 
1639    struct panfrost_pool *pool = so->pool ?: &ctx->descs;
1640    struct panfrost_ptr payload = pan_pool_alloc_aligned(&pool->base, size, 64);
1641    so->state = panfrost_pool_take_ref(&ctx->descs, payload.gpu);
1642 
1643    void *tex = (PAN_ARCH >= 6) ? &so->bifrost_descriptor : payload.cpu;
1644 
1645    if (PAN_ARCH <= 5) {
1646       payload.cpu += pan_size(TEXTURE);
1647       payload.gpu += pan_size(TEXTURE);
1648    }
1649 
1650    const struct util_format_description *desc =
1651       util_format_description(format);
1652 
1653    if ((device->debug & PAN_DBG_YUV) && panfrost_format_is_yuv(format)) {
1654 
1655       if (desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED) {
1656          iview.swizzle[2] = PIPE_SWIZZLE_1;
1657       } else if (desc->layout == UTIL_FORMAT_LAYOUT_PLANAR2) {
1658          iview.swizzle[1] = PIPE_SWIZZLE_0;
1659          iview.swizzle[2] = PIPE_SWIZZLE_0;
1660       }
1661    }
1662 
1663    if (desc->layout == UTIL_FORMAT_LAYOUT_ASTC &&
1664        so->base.astc_decode_format == PIPE_ASTC_DECODE_FORMAT_UNORM8) {
1665       iview.astc.narrow = true;
1666    }
1667 
1668    GENX(panfrost_new_texture)(&iview, tex, &payload);
1669 }
1670 
1671 static void
1672 panfrost_update_sampler_view(struct panfrost_sampler_view *view,
1673                              struct pipe_context *pctx)
1674 {
1675    struct panfrost_resource *rsrc = pan_resource(view->base.texture);
1676    if (view->texture_bo != rsrc->image.data.base ||
1677        view->modifier != rsrc->image.layout.modifier) {
1678       panfrost_bo_unreference(view->state.bo);
1679       panfrost_create_sampler_view_bo(view, pctx, &rsrc->base);
1680    }
1681 }
1682 
1683 #if PAN_ARCH >= 6
1684 static void
1685 panfrost_emit_null_texture(struct mali_texture_packed *out)
1686 
1687 {
1688    /* Annoyingly, an all zero texture descriptor is not valid and will raise
1689     * a DATA_INVALID_FAULT if you try to texture it, instead of returning
1690     * 0000s! Fill in with something that will behave robustly.
1691     */
1692    pan_pack(out, TEXTURE, cfg) {
1693       cfg.dimension = MALI_TEXTURE_DIMENSION_2D;
1694       cfg.width = 1;
1695       cfg.height = 1;
1696       cfg.depth = 1;
1697       cfg.array_size = 1;
1698       cfg.format = MALI_PACK_FMT(CONSTANT, 0000, L);
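      /* A CONSTANT format with an all-zero swizzle should robustly return
       * (0, 0, 0, 0) for any texel fetched from the 1x1 dummy storage. */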
1699 #if PAN_ARCH <= 7
1700       cfg.texel_ordering = MALI_TEXTURE_LAYOUT_LINEAR;
1701 #endif
1702    }
1703 }
1704 #endif
1705 
1706 static mali_ptr
1707 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1708                                   enum pipe_shader_type stage)
1709 {
1710    struct panfrost_context *ctx = batch->ctx;
1711 
1712    unsigned actual_count = ctx->sampler_view_count[stage];
1713    unsigned needed_count = ctx->prog[stage]->info.texture_count;
1714    unsigned alloc_count = MAX2(actual_count, needed_count);
1715 
1716    if (!alloc_count)
1717       return 0;
1718 
1719 #if PAN_ARCH >= 6
1720    struct panfrost_ptr T =
1721       pan_pool_alloc_desc_array(&batch->pool.base, alloc_count, TEXTURE);
1722    struct mali_texture_packed *out = (struct mali_texture_packed *)T.cpu;
1723 
1724    for (int i = 0; i < actual_count; ++i) {
1725       struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1726 
1727       if (!view) {
1728          panfrost_emit_null_texture(&out[i]);
1729          continue;
1730       }
1731 
1732       struct pipe_sampler_view *pview = &view->base;
1733       struct panfrost_resource *rsrc = pan_resource(pview->texture);
1734 
1735       panfrost_update_sampler_view(view, &ctx->base);
1736       out[i] = view->bifrost_descriptor;
1737 
1738       panfrost_batch_read_rsrc(batch, rsrc, stage);
1739       panfrost_batch_add_bo(batch, view->state.bo, stage);
1740    }
1741 
1742    for (int i = actual_count; i < needed_count; ++i)
1743       panfrost_emit_null_texture(&out[i]);
1744 
1745    return T.gpu;
1746 #else
1747    uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1748 
1749    for (int i = 0; i < actual_count; ++i) {
1750       struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1751 
1752       if (!view) {
1753          trampolines[i] = 0;
1754          continue;
1755       }
1756 
1757       panfrost_update_sampler_view(view, &ctx->base);
1758 
1759       trampolines[i] = panfrost_get_tex_desc(batch, stage, view);
1760    }
1761 
1762    for (int i = actual_count; i < needed_count; ++i)
1763       trampolines[i] = 0;
1764 
1765    return pan_pool_upload_aligned(&batch->pool.base, trampolines,
1766                                   sizeof(uint64_t) * alloc_count,
1767                                   sizeof(uint64_t));
1768 #endif
1769 }
1770 
1771 static mali_ptr
1772 panfrost_upload_wa_sampler(struct panfrost_batch *batch)
1773 {
1774    struct panfrost_ptr T = pan_pool_alloc_desc(&batch->pool.base, SAMPLER);
1775    pan_pack(T.cpu, SAMPLER, cfg)
1776       ;
1777    return T.gpu;
1778 }
1779 
1780 static mali_ptr
1781 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1782                                   enum pipe_shader_type stage)
1783 {
1784    struct panfrost_context *ctx = batch->ctx;
1785 
1786    /* We always need at least 1 sampler for txf to work */
1787    if (!ctx->sampler_count[stage])
1788       return panfrost_upload_wa_sampler(batch);
1789 
1790    struct panfrost_ptr T = pan_pool_alloc_desc_array(
1791       &batch->pool.base, ctx->sampler_count[stage], SAMPLER);
1792    struct mali_sampler_packed *out = (struct mali_sampler_packed *)T.cpu;
1793 
1794    for (unsigned i = 0; i < ctx->sampler_count[stage]; ++i) {
1795       struct panfrost_sampler_state *st = ctx->samplers[stage][i];
1796 
1797       out[i] = st ? st->hw : (struct mali_sampler_packed){0};
1798    }
1799 
1800    return T.gpu;
1801 }
1802 
1803 #if PAN_ARCH <= 7
1804 /* Packs all image attribute descs and attribute buffer descs.
1805  * `first_image_buf_index` must be the index of the first image attribute buffer
1806  * descriptor.
1807  */
1808 static void
1809 emit_image_attribs(struct panfrost_context *ctx, enum pipe_shader_type shader,
1810                    struct mali_attribute_packed *attribs, unsigned first_buf)
1811 {
1812    unsigned last_bit = util_last_bit(ctx->image_mask[shader]);
1813 
1814    for (unsigned i = 0; i < last_bit; ++i) {
1815       enum pipe_format format = ctx->images[shader][i].format;
1816 
1817       pan_pack(attribs + i, ATTRIBUTE, cfg) {
1818          /* Continuation record means 2 buffers per image */
1819          cfg.buffer_index = first_buf + (i * 2);
1820          cfg.offset_enable = (PAN_ARCH <= 5);
1821          cfg.format = GENX(panfrost_format_from_pipe_format)(format)->hw;
1822       }
1823    }
1824 }
1825 
1826 static enum mali_attribute_type
1827 pan_modifier_to_attr_type(uint64_t modifier)
1828 {
1829    switch (modifier) {
1830    case DRM_FORMAT_MOD_LINEAR:
1831       return MALI_ATTRIBUTE_TYPE_3D_LINEAR;
1832    case DRM_FORMAT_MOD_ARM_16X16_BLOCK_U_INTERLEAVED:
1833       return MALI_ATTRIBUTE_TYPE_3D_INTERLEAVED;
1834    default:
1835       unreachable("Invalid modifier for attribute record");
1836    }
1837 }
1838 
1839 static void
1840 emit_image_bufs(struct panfrost_batch *batch, enum pipe_shader_type shader,
1841                 struct mali_attribute_buffer_packed *bufs,
1842                 unsigned first_image_buf_index)
1843 {
1844    struct panfrost_context *ctx = batch->ctx;
1845    unsigned last_bit = util_last_bit(ctx->image_mask[shader]);
1846 
1847    for (unsigned i = 0; i < last_bit; ++i) {
1848       struct pipe_image_view *image = &ctx->images[shader][i];
1849 
1850       if (!(ctx->image_mask[shader] & (1 << i)) ||
1851           !(image->shader_access & PIPE_IMAGE_ACCESS_READ_WRITE)) {
1852          /* Unused image bindings */
1853          pan_pack(bufs + (i * 2), ATTRIBUTE_BUFFER, cfg)
1854             ;
1855          pan_pack(bufs + (i * 2) + 1, ATTRIBUTE_BUFFER, cfg)
1856             ;
1857          continue;
1858       }
1859 
1860       struct panfrost_resource *rsrc = pan_resource(image->resource);
1861 
1862       bool is_msaa = image->resource->nr_samples > 1;
1863 
1864       bool is_3d = rsrc->base.target == PIPE_TEXTURE_3D;
1865       bool is_buffer = rsrc->base.target == PIPE_BUFFER;
1866 
1867       unsigned offset = is_buffer ? image->u.buf.offset
1868                                   : panfrost_texture_offset(
1869                                        &rsrc->image.layout, image->u.tex.level,
1870                                        (is_3d || is_msaa) ? 0 : image->u.tex.first_layer,
1871                                        (is_3d || is_msaa) ? image->u.tex.first_layer : 0);
1872 
1873       panfrost_track_image_access(batch, shader, image);
1874 
1875       pan_pack(bufs + (i * 2), ATTRIBUTE_BUFFER, cfg) {
1876          cfg.type = pan_modifier_to_attr_type(rsrc->image.layout.modifier);
1877          cfg.pointer = rsrc->image.data.base + offset;
1878          cfg.stride = util_format_get_blocksize(image->format);
1879          cfg.size = panfrost_bo_size(rsrc->bo) - offset;
1880       }
1881 
1882       if (is_buffer) {
1883          pan_pack(bufs + (i * 2) + 1, ATTRIBUTE_BUFFER_CONTINUATION_3D, cfg) {
1884             cfg.s_dimension =
1885                rsrc->base.width0 / util_format_get_blocksize(image->format);
1886             cfg.t_dimension = cfg.r_dimension = 1;
1887          }
1888 
1889          continue;
1890       }
1891 
1892       pan_pack(bufs + (i * 2) + 1, ATTRIBUTE_BUFFER_CONTINUATION_3D, cfg) {
1893          unsigned level = image->u.tex.level;
1894          unsigned samples = rsrc->image.layout.nr_samples;
1895 
1896          cfg.s_dimension = u_minify(rsrc->base.width0, level);
1897          cfg.t_dimension = u_minify(rsrc->base.height0, level);
1898          cfg.r_dimension = is_3d ? u_minify(rsrc->image.layout.depth, level)
1899             : (image->u.tex.last_layer - image->u.tex.first_layer + 1);
1900 
1901          cfg.row_stride = rsrc->image.layout.slices[level].row_stride;
1902          if (cfg.r_dimension > 1) {
1903             cfg.slice_stride =
1904                panfrost_get_layer_stride(&rsrc->image.layout, level);
1905          }
1906 
1907          if (is_msaa) {
1908             if (cfg.r_dimension == 1) {
1909                /* regular multisampled images get the sample index in
1910                   the R dimension */
1911                cfg.r_dimension = samples;
1912                cfg.slice_stride =
1913                   panfrost_get_layer_stride(&rsrc->image.layout, level) / samples;
1914             } else {
1915                /* multisampled image arrays are emulated by making the
1916                   image "samples" times higher than the original image,
1917                   and fixing up the T coordinate by the sample number
1918                   to address the correct sample (on Bifrost) */
1919                cfg.t_dimension *= samples;
1920             }
1921          }
1922       }
1923    }
1924 }
1925 
1926 static mali_ptr
1927 panfrost_emit_image_attribs(struct panfrost_batch *batch, mali_ptr *buffers,
1928                             enum pipe_shader_type type)
1929 {
1930    struct panfrost_context *ctx = batch->ctx;
1931    struct panfrost_compiled_shader *shader = ctx->prog[type];
1932 
1933    if (!shader->info.attribute_count) {
1934       *buffers = 0;
1935       return 0;
1936    }
1937 
1938    /* Images always need a MALI_ATTRIBUTE_BUFFER_CONTINUATION_3D */
1939    unsigned attr_count = shader->info.attribute_count;
1940    unsigned buf_count = (attr_count * 2) + (PAN_ARCH >= 6 ? 1 : 0);
1941 
1942    struct panfrost_ptr bufs =
1943       pan_pool_alloc_desc_array(&batch->pool.base, buf_count, ATTRIBUTE_BUFFER);
1944 
1945    struct panfrost_ptr attribs =
1946       pan_pool_alloc_desc_array(&batch->pool.base, attr_count, ATTRIBUTE);
1947 
1948    emit_image_attribs(ctx, type, attribs.cpu, 0);
1949    emit_image_bufs(batch, type, bufs.cpu, 0);
1950 
1951    /* We need an empty attrib buf to stop the prefetching on Bifrost */
1952 #if PAN_ARCH >= 6
1953    pan_pack(bufs.cpu + ((buf_count - 1) * pan_size(ATTRIBUTE_BUFFER)),
1954             ATTRIBUTE_BUFFER, cfg)
1955       ;
1956 #endif
1957 
1958    *buffers = bufs.gpu;
1959    return attribs.gpu;
1960 }
1961 
1962 static mali_ptr
1963 panfrost_emit_vertex_data(struct panfrost_batch *batch, mali_ptr *buffers)
1964 {
1965    struct panfrost_context *ctx = batch->ctx;
1966    struct panfrost_vertex_state *so = ctx->vertex;
1967    struct panfrost_compiled_shader *vs = ctx->prog[PIPE_SHADER_VERTEX];
1968    bool instanced = ctx->instance_count > 1;
1969    uint32_t image_mask = ctx->image_mask[PIPE_SHADER_VERTEX];
1970    unsigned nr_images = util_last_bit(image_mask);
1971 
1972    /* Worst case: everything is NPOT, which is only possible if instancing
1973     * is enabled; otherwise a single record is guaranteed.
1974     * Also, we allocate more memory than what's needed here if either instancing
1975     * is enabled or images are present; this could be improved. */
1976    unsigned bufs_per_attrib = (instanced || nr_images > 0) ? 2 : 1;
1977    unsigned nr_bufs =
1978       ((so->nr_bufs + nr_images) * bufs_per_attrib) + (PAN_ARCH >= 6 ? 1 : 0);
1979 
1980    unsigned count = vs->info.attribute_count;
1981 
1982    struct panfrost_compiled_shader *xfb =
1983       ctx->uncompiled[PIPE_SHADER_VERTEX]->xfb;
1984 
1985    if (xfb)
1986       count = MAX2(count, xfb->info.attribute_count);
1987 
1988 #if PAN_ARCH <= 5
1989    /* Midgard needs vertexid/instanceid handled specially */
1990    bool special_vbufs = count >= PAN_VERTEX_ID;
1991 
1992    if (special_vbufs)
1993       nr_bufs += 2;
1994 #endif
1995 
1996    if (!nr_bufs) {
1997       *buffers = 0;
1998       return 0;
1999    }
2000 
2001    struct panfrost_ptr S =
2002       pan_pool_alloc_desc_array(&batch->pool.base, nr_bufs, ATTRIBUTE_BUFFER);
2003    struct panfrost_ptr T =
2004       pan_pool_alloc_desc_array(&batch->pool.base, count, ATTRIBUTE);
2005 
2006    struct mali_attribute_buffer_packed *bufs =
2007       (struct mali_attribute_buffer_packed *)S.cpu;
2008 
2009    struct mali_attribute_packed *out = (struct mali_attribute_packed *)T.cpu;
2010 
2011    unsigned attrib_to_buffer[PIPE_MAX_ATTRIBS] = {0};
2012    unsigned k = 0;
2013 
2014    for (unsigned i = 0; i < so->nr_bufs; ++i) {
2015       unsigned vbi = so->buffers[i].vbi;
2016       unsigned divisor = so->buffers[i].divisor;
2017       attrib_to_buffer[i] = k;
2018 
2019       if (!(ctx->vb_mask & (1 << vbi)))
2020          continue;
2021 
2022       struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
2023       struct panfrost_resource *rsrc;
2024 
2025       rsrc = pan_resource(buf->buffer.resource);
2026       if (!rsrc)
2027          continue;
2028 
2029       panfrost_batch_read_rsrc(batch, rsrc, PIPE_SHADER_VERTEX);
2030 
2031       /* Mask off lower bits, see offset fixup below */
2032       mali_ptr raw_addr = rsrc->image.data.base + buf->buffer_offset;
2033       mali_ptr addr = raw_addr & ~63;
2034 
2035       /* Since we advanced the base pointer, we shrink the buffer
2036        * size, but add the offset we subtracted */
2037       unsigned size =
2038          rsrc->base.width0 + (raw_addr - addr) - buf->buffer_offset;
2039 
2040       /* When there is a divisor, the hardware-level divisor is
2041        * the product of the instance divisor and the padded count */
2042       unsigned stride = so->strides[vbi];
2043       unsigned hw_divisor = ctx->padded_count * divisor;
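      /* E.g. (illustrative): padded_count = 32 with an instance divisor of
       * 3 gives hw_divisor = 96, which is not a power of two and therefore
       * takes the NPOT path below. */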
2044 
2045       if (ctx->instance_count <= 1) {
2046          /* Per-instance would be every attribute equal */
2047          if (divisor)
2048             stride = 0;
2049 
2050          pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
2051             cfg.pointer = addr;
2052             cfg.stride = stride;
2053             cfg.size = size;
2054          }
2055       } else if (!divisor) {
2056          pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
2057             cfg.type = MALI_ATTRIBUTE_TYPE_1D_MODULUS;
2058             cfg.pointer = addr;
2059             cfg.stride = stride;
2060             cfg.size = size;
2061             cfg.divisor = ctx->padded_count;
2062          }
2063       } else if (util_is_power_of_two_or_zero(hw_divisor)) {
2064          pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
2065             cfg.type = MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR;
2066             cfg.pointer = addr;
2067             cfg.stride = stride;
2068             cfg.size = size;
2069             cfg.divisor_r = __builtin_ctz(hw_divisor);
2070          }
2071 
2072       } else {
2073          unsigned shift = 0, extra_flags = 0;
2074 
2075          unsigned magic_divisor =
2076             panfrost_compute_magic_divisor(hw_divisor, &shift, &extra_flags);
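         /* The NPOT path divides by multiplying with a precomputed
          * fixed-point reciprocal (the "magic" numerator) and shifting,
          * the standard division-by-invariant-integer trick; the exact
          * field encoding is hardware-defined. */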
2077 
2078          /* Records with continuations must be aligned */
2079          k = ALIGN_POT(k, 2);
2080          attrib_to_buffer[i] = k;
2081 
2082          pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
2083             cfg.type = MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR;
2084             cfg.pointer = addr;
2085             cfg.stride = stride;
2086             cfg.size = size;
2087 
2088             cfg.divisor_r = shift;
2089             cfg.divisor_e = extra_flags;
2090          }
2091 
2092          pan_pack(bufs + k + 1, ATTRIBUTE_BUFFER_CONTINUATION_NPOT, cfg) {
2093             cfg.divisor_numerator = magic_divisor;
2094             cfg.divisor = divisor;
2095          }
2096 
2097          ++k;
2098       }
2099 
2100       ++k;
2101    }
2102 
2103 #if PAN_ARCH <= 5
2104    /* Add special gl_VertexID/gl_InstanceID buffers */
2105    if (special_vbufs) {
2106       panfrost_vertex_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
2107 
2108       pan_pack(out + PAN_VERTEX_ID, ATTRIBUTE, cfg) {
2109          cfg.buffer_index = k++;
2110          cfg.format = so->formats[PAN_VERTEX_ID];
2111       }
2112 
2113       panfrost_instance_id(ctx->padded_count, &bufs[k],
2114                            ctx->instance_count > 1);
2115 
2116       pan_pack(out + PAN_INSTANCE_ID, ATTRIBUTE, cfg) {
2117          cfg.buffer_index = k++;
2118          cfg.format = so->formats[PAN_INSTANCE_ID];
2119       }
2120    }
2121 #endif
2122 
2123    if (nr_images) {
2124       k = ALIGN_POT(k, 2);
2125       emit_image_attribs(ctx, PIPE_SHADER_VERTEX, out + so->num_elements, k);
2126       emit_image_bufs(batch, PIPE_SHADER_VERTEX, bufs + k, k);
2127       k += (util_last_bit(ctx->image_mask[PIPE_SHADER_VERTEX]) * 2);
2128    }
2129 
2130 #if PAN_ARCH >= 6
2131    /* We need an empty attrib buf to stop the prefetching on Bifrost */
2132    pan_pack(&bufs[k], ATTRIBUTE_BUFFER, cfg)
2133       ;
2134 #endif
2135 
2136    /* Attribute addresses require 64-byte alignment, so let:
2137     *
2138     *      base' = base & ~63 = base - (base & 63)
2139     *      offset' = offset + (base & 63)
2140     *
2141     * Since base' + offset' = base + offset, these are equivalent
2142     * addressing modes and now base is 64 aligned.
2143     */
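   /* Illustrative: base = 0x10027 yields base' = 0x10000 and
    * offset' = offset + 0x27; the sum is unchanged and base' is
    * 64-byte aligned. */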
2144 
2145    /* While these are usually equal, they are not required to be. In some
2146     * cases, u_blitter passes too high a value for num_elements.
2147     */
2148    assert(vs->info.attributes_read_count <= so->num_elements);
2149 
2150    for (unsigned i = 0; i < vs->info.attributes_read_count; ++i) {
2151       unsigned vbi = so->pipe[i].vertex_buffer_index;
2152       struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
2153 
2154       /* BOs are aligned; just fixup for buffer_offset */
2155       signed src_offset = so->pipe[i].src_offset;
2156       src_offset += (buf->buffer_offset & 63);
2157 
2158       /* Base instance offset */
2159       if (ctx->base_instance && so->pipe[i].instance_divisor) {
2160          src_offset += (ctx->base_instance * so->pipe[i].src_stride) /
2161                        so->pipe[i].instance_divisor;
2162       }
2163 
2164       /* Also, somewhat obscurely per-instance data needs to be
2165        * offset in response to a delayed start in an indexed draw */
2166 
2167       if (so->pipe[i].instance_divisor && ctx->instance_count > 1)
2168          src_offset -= so->pipe[i].src_stride * ctx->offset_start;
2169 
2170       pan_pack(out + i, ATTRIBUTE, cfg) {
2171          cfg.buffer_index = attrib_to_buffer[so->element_buffer[i]];
2172          cfg.format = so->formats[i];
2173          cfg.offset = src_offset;
2174       }
2175    }
2176 
2177    *buffers = S.gpu;
2178    return T.gpu;
2179 }
2180 
2181 static mali_ptr
2182 panfrost_emit_varyings(struct panfrost_batch *batch,
2183                        struct mali_attribute_buffer_packed *slot,
2184                        unsigned stride, unsigned count)
2185 {
2186    unsigned size = stride * count;
2187    mali_ptr ptr =
2188       pan_pool_alloc_aligned(&batch->invisible_pool.base, size, 64).gpu;
2189 
2190    pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
2191       cfg.stride = stride;
2192       cfg.size = size;
2193       cfg.pointer = ptr;
2194    }
2195 
2196    return ptr;
2197 }
2198 
2199 /* Given a varying, figure out which index it corresponds to */
2200 
2201 static inline unsigned
2202 pan_varying_index(unsigned present, enum pan_special_varying v)
2203 {
2204    return util_bitcount(present & BITFIELD_MASK(v));
2205 }
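/* E.g. (illustrative): with present = {GENERAL, POSITION, PSIZ}, the index
 * of PAN_VARY_PSIZ is util_bitcount(present & BITFIELD_MASK(PSIZ)) = 2,
 * since two lower-numbered buffers precede it. */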
2206 
2207 /* Determines which varying buffers are required */
2208 
2209 static inline unsigned
2210 pan_varying_present(const struct panfrost_device *dev,
2211                     struct pan_shader_info *producer,
2212                     struct pan_shader_info *consumer, uint16_t point_coord_mask)
2213 {
2214    /* At the moment we always emit general and position buffers. Not
2215     * strictly necessary but usually harmless */
2216 
2217    unsigned present =
2218       BITFIELD_BIT(PAN_VARY_GENERAL) | BITFIELD_BIT(PAN_VARY_POSITION);
2219 
2220    /* Enable special buffers by the shader info */
2221 
2222    if (producer->vs.writes_point_size)
2223       present |= BITFIELD_BIT(PAN_VARY_PSIZ);
2224 
2225 #if PAN_ARCH <= 5
2226    /* On Midgard, these exist as real varyings. Later architectures use
2227     * LD_VAR_SPECIAL reads instead. */
2228 
2229    if (consumer->fs.reads_point_coord)
2230       present |= BITFIELD_BIT(PAN_VARY_PNTCOORD);
2231 
2232    if (consumer->fs.reads_face)
2233       present |= BITFIELD_BIT(PAN_VARY_FACE);
2234 
2235    if (consumer->fs.reads_frag_coord)
2236       present |= BITFIELD_BIT(PAN_VARY_FRAGCOORD);
2237 
2238    /* Also, if we have a point sprite, we need a point coord buffer */
2239 
2240    for (unsigned i = 0; i < consumer->varyings.input_count; i++) {
2241       gl_varying_slot loc = consumer->varyings.input[i].location;
2242 
2243       if (util_varying_is_point_coord(loc, point_coord_mask))
2244          present |= BITFIELD_BIT(PAN_VARY_PNTCOORD);
2245    }
2246 #endif
2247 
2248    return present;
2249 }
2250 
2251 /* Emitters for varying records */
2252 
2253 static void
2254 pan_emit_vary(const struct panfrost_device *dev,
2255               struct mali_attribute_packed *out, unsigned buffer_index,
2256               mali_pixel_format format, unsigned offset)
2257 {
2258    pan_pack(out, ATTRIBUTE, cfg) {
2259       cfg.buffer_index = buffer_index;
2260       cfg.offset_enable = (PAN_ARCH <= 5);
2261       cfg.format = format;
2262       cfg.offset = offset;
2263    }
2264 }
2265 
2266 /* Special records */
2267 
2268 /* clang-format off */
2269 static const struct {
2270    unsigned components;
2271    enum mali_format format;
2272 } pan_varying_formats[PAN_VARY_MAX] = {
2273    [PAN_VARY_POSITION]  = { 4, MALI_SNAP_4   },
2274    [PAN_VARY_PSIZ]      = { 1, MALI_R16F     },
2275    [PAN_VARY_PNTCOORD]  = { 4, MALI_RGBA32F  },
2276    [PAN_VARY_FACE]      = { 1, MALI_R32I     },
2277    [PAN_VARY_FRAGCOORD] = { 4, MALI_RGBA32F  },
2278 };
2279 /* clang-format on */
2280 
2281 static mali_pixel_format
2282 pan_special_format(const struct panfrost_device *dev,
2283                    enum pan_special_varying buf)
2284 {
2285    assert(buf < PAN_VARY_MAX);
2286    mali_pixel_format format = (pan_varying_formats[buf].format << 12);
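   /* The packed pixel format stores the format enum in the upper bits; on
    * PAN_ARCH <= 6 the low 12 bits carry the component swizzle, filled in
    * below. */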
2287 
2288 #if PAN_ARCH <= 6
2289    unsigned nr = pan_varying_formats[buf].components;
2290    format |= panfrost_get_default_swizzle(nr);
2291 #endif
2292 
2293    return format;
2294 }
2295 
2296 static void
2297 pan_emit_vary_special(const struct panfrost_device *dev,
2298                       struct mali_attribute_packed *out, unsigned present,
2299                       enum pan_special_varying buf)
2300 {
2301    pan_emit_vary(dev, out, pan_varying_index(present, buf),
2302                  pan_special_format(dev, buf), 0);
2303 }
2304 
2305 /* Negative indicates a varying is not found */
2306 
2307 static signed
2308 pan_find_vary(const struct pan_shader_varying *vary, unsigned vary_count,
2309               unsigned loc)
2310 {
2311    for (unsigned i = 0; i < vary_count; ++i) {
2312       if (vary[i].location == loc)
2313          return i;
2314    }
2315 
2316    return -1;
2317 }
2318 
2319 /* Assign varying locations for the general buffer. Returns the calculated
2320  * per-vertex stride, and outputs offsets into the passed array. Negative
2321  * offset indicates a varying is not used. */
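/* Worked example (formats illustrative): a producer writing COL0 and TEX0,
 * with the consumer reading COL0 as R32G32B32A32_FLOAT (16 bytes) and TEX0
 * as R16G16_FLOAT (4 bytes), gets offsets {0, 16} and a stride of 20. */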
2322 
2323 static unsigned
2324 pan_assign_varyings(const struct panfrost_device *dev,
2325                     struct pan_shader_info *producer,
2326                     struct pan_shader_info *consumer, signed *offsets)
2327 {
2328    unsigned producer_count = producer->varyings.output_count;
2329    unsigned consumer_count = consumer->varyings.input_count;
2330 
2331    const struct pan_shader_varying *producer_vars = producer->varyings.output;
2332    const struct pan_shader_varying *consumer_vars = consumer->varyings.input;
2333 
2334    unsigned stride = 0;
2335 
2336    for (unsigned i = 0; i < producer_count; ++i) {
2337       signed loc = pan_find_vary(consumer_vars, consumer_count,
2338                                  producer_vars[i].location);
2339       enum pipe_format format =
2340          loc >= 0 ? consumer_vars[loc].format : PIPE_FORMAT_NONE;
2341 
2342       if (format != PIPE_FORMAT_NONE) {
2343          offsets[i] = stride;
2344          stride += util_format_get_blocksize(format);
2345       } else {
2346          offsets[i] = -1;
2347       }
2348    }
2349 
2350    return stride;
2351 }
2352 
2353 /* Emitter for a single varying (attribute) descriptor */
2354 
2355 static void
2356 panfrost_emit_varying(const struct panfrost_device *dev,
2357                       struct mali_attribute_packed *out,
2358                       const struct pan_shader_varying varying,
2359                       enum pipe_format pipe_format, unsigned present,
2360                       uint16_t point_sprite_mask, signed offset,
2361                       enum pan_special_varying pos_varying)
2362 {
2363    /* Note: varying.format != pipe_format in some obscure cases due to a
2364     * limitation of the NIR linker. This should be fixed in the future to
2365     * eliminate the additional lookups. See:
2366     * dEQP-GLES3.functional.shaders.conditionals.if.sequence_statements_vertex
2367     */
2368    gl_varying_slot loc = varying.location;
2369    mali_pixel_format format =
2370       GENX(panfrost_format_from_pipe_format)(pipe_format)->hw;
2371 
2372    if (util_varying_is_point_coord(loc, point_sprite_mask)) {
2373       pan_emit_vary_special(dev, out, present, PAN_VARY_PNTCOORD);
2374    } else if (loc == VARYING_SLOT_POS) {
2375       pan_emit_vary_special(dev, out, present, pos_varying);
2376    } else if (loc == VARYING_SLOT_PSIZ) {
2377       pan_emit_vary_special(dev, out, present, PAN_VARY_PSIZ);
2378    } else if (loc == VARYING_SLOT_FACE) {
2379       pan_emit_vary_special(dev, out, present, PAN_VARY_FACE);
2380    } else if (offset < 0) {
2381       pan_emit_vary(dev, out, 0, (MALI_CONSTANT << 12), 0);
2382    } else {
2383       STATIC_ASSERT(PAN_VARY_GENERAL == 0);
2384       pan_emit_vary(dev, out, 0, format, offset);
2385    }
2386 }
2387 
2388 /* Links varyings and uploads ATTRIBUTE descriptors. Can execute at link time,
2389  * rather than draw time (under good conditions). */
2390 
2391 static void
2392 panfrost_emit_varying_descs(struct panfrost_pool *pool,
2393                             struct panfrost_compiled_shader *producer,
2394                             struct panfrost_compiled_shader *consumer,
2395                             uint16_t point_coord_mask, struct pan_linkage *out)
2396 {
2397    struct panfrost_device *dev = pool->dev;
2398    unsigned producer_count = producer->info.varyings.output_count;
2399    unsigned consumer_count = consumer->info.varyings.input_count;
2400 
2401    /* Offsets within the general varying buffer, indexed by location */
2402    signed offsets[PAN_MAX_VARYINGS];
2403    assert(producer_count <= ARRAY_SIZE(offsets));
2404    assert(consumer_count <= ARRAY_SIZE(offsets));
2405 
2406    /* Allocate enough descriptors for both shader stages */
2407    struct panfrost_ptr T = pan_pool_alloc_desc_array(
2408       &pool->base, producer_count + consumer_count, ATTRIBUTE);
2409 
2410    /* Take a reference if we're being put on the CSO */
2411    if (!pool->owned) {
2412       out->bo = pool->transient_bo;
2413       panfrost_bo_reference(out->bo);
2414    }
2415 
2416    struct mali_attribute_packed *descs = T.cpu;
2417    out->producer = producer_count ? T.gpu : 0;
2418    out->consumer =
2419       consumer_count ? T.gpu + (pan_size(ATTRIBUTE) * producer_count) : 0;
2420 
2421    /* Lay out the varyings. Must use producer to lay out, in order to
2422     * respect transform feedback precisions. */
2423    out->present = pan_varying_present(dev, &producer->info, &consumer->info,
2424                                       point_coord_mask);
2425 
2426    out->stride =
2427       pan_assign_varyings(dev, &producer->info, &consumer->info, offsets);
2428 
2429    for (unsigned i = 0; i < producer_count; ++i) {
2430       signed j = pan_find_vary(consumer->info.varyings.input,
2431                                consumer->info.varyings.input_count,
2432                                producer->info.varyings.output[i].location);
2433 
2434       enum pipe_format format = (j >= 0)
2435                                    ? consumer->info.varyings.input[j].format
2436                                    : producer->info.varyings.output[i].format;
2437 
2438       panfrost_emit_varying(dev, descs + i, producer->info.varyings.output[i],
2439                             format, out->present, 0, offsets[i],
2440                             PAN_VARY_POSITION);
2441    }
2442 
2443    for (unsigned i = 0; i < consumer_count; ++i) {
2444       signed j = pan_find_vary(producer->info.varyings.output,
2445                                producer->info.varyings.output_count,
2446                                consumer->info.varyings.input[i].location);
2447 
2448       signed offset = (j >= 0) ? offsets[j] : -1;
2449 
2450       panfrost_emit_varying(
2451          dev, descs + producer_count + i, consumer->info.varyings.input[i],
2452          consumer->info.varyings.input[i].format, out->present,
2453          point_coord_mask, offset, PAN_VARY_FRAGCOORD);
2454    }
2455 }
2456 
2457 #if PAN_ARCH <= 5
2458 static void
2459 pan_emit_special_input(struct mali_attribute_buffer_packed *out,
2460                        unsigned present, enum pan_special_varying v,
2461                        unsigned special)
2462 {
2463    if (present & BITFIELD_BIT(v)) {
2464       unsigned idx = pan_varying_index(present, v);
2465 
2466       pan_pack(out + idx, ATTRIBUTE_BUFFER, cfg) {
2467          cfg.special = special;
2468          cfg.type = 0;
2469       }
2470    }
2471 }
2472 #endif
2473 
2474 static void
2475 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
2476                                  unsigned vertex_count,
2477                                  bool point_coord_replace)
2478 {
2479    struct panfrost_context *ctx = batch->ctx;
2480    struct panfrost_compiled_shader *vs = ctx->prog[PIPE_SHADER_VERTEX];
2481    struct panfrost_compiled_shader *fs = ctx->prog[PIPE_SHADER_FRAGMENT];
2482 
2483    uint16_t point_coord_mask = 0;
2484 
2485    memset(&batch->varyings, 0, sizeof(batch->varyings));
2486 
2487 #if PAN_ARCH <= 5
2488    struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
2489 
2490    /* Point sprites are lowered on Bifrost and newer */
2491    if (point_coord_replace)
2492       point_coord_mask = ctx->rasterizer->base.sprite_coord_enable;
2493 #endif
2494 
2495    /* In good conditions, we only need to link varyings once */
2496    bool prelink =
2497       (point_coord_mask == 0) && !vs->info.separable && !fs->info.separable;
2498 
2499    /* Try to reduce copies */
2500    struct pan_linkage _linkage;
2501    struct pan_linkage *linkage = prelink ? &vs->linkage : &_linkage;
2502 
2503    /* Emit ATTRIBUTE descriptors if needed */
2504    if (!prelink || vs->linkage.bo == NULL) {
2505       struct panfrost_pool *pool = prelink ? &ctx->descs : &batch->pool;
2506 
2507       panfrost_emit_varying_descs(pool, vs, fs, point_coord_mask, linkage);
2508    }
2509 
2510    unsigned present = linkage->present, stride = linkage->stride;
2511    unsigned count = util_bitcount(present);
2512    struct panfrost_ptr T =
2513       pan_pool_alloc_desc_array(&batch->pool.base, count + 1, ATTRIBUTE_BUFFER);
2514    struct mali_attribute_buffer_packed *varyings =
2515       (struct mali_attribute_buffer_packed *)T.cpu;
2516 
2517    batch->varyings.nr_bufs = count;
2518 
2519 #if PAN_ARCH >= 6
2520    /* Suppress prefetch on Bifrost */
2521    memset(varyings + count, 0, sizeof(*varyings));
2522 #endif
2523 
2524    if (stride) {
2525       panfrost_emit_varyings(
2526          batch, &varyings[pan_varying_index(present, PAN_VARY_GENERAL)], stride,
2527          vertex_count);
2528    } else {
2529       /* The indirect draw code reads the stride field, make sure
2530        * that it is initialised */
2531       memset(varyings + pan_varying_index(present, PAN_VARY_GENERAL), 0,
2532              sizeof(*varyings));
2533    }
2534 
2535    /* fp32 vec4 gl_Position */
2536    batch->varyings.pos = panfrost_emit_varyings(
2537       batch, &varyings[pan_varying_index(present, PAN_VARY_POSITION)],
2538       sizeof(float) * 4, vertex_count);
2539 
2540    if (present & BITFIELD_BIT(PAN_VARY_PSIZ)) {
2541       batch->varyings.psiz = panfrost_emit_varyings(
2542          batch, &varyings[pan_varying_index(present, PAN_VARY_PSIZ)], 2,
2543          vertex_count);
2544    }
2545 
2546 #if PAN_ARCH <= 5
2547    pan_emit_special_input(
2548       varyings, present, PAN_VARY_PNTCOORD,
2549       (rast->sprite_coord_mode == PIPE_SPRITE_COORD_LOWER_LEFT)
2550          ? MALI_ATTRIBUTE_SPECIAL_POINT_COORD_MAX_Y
2551          : MALI_ATTRIBUTE_SPECIAL_POINT_COORD_MIN_Y);
2552    pan_emit_special_input(varyings, present, PAN_VARY_FACE,
2553                           MALI_ATTRIBUTE_SPECIAL_FRONT_FACING);
2554    pan_emit_special_input(varyings, present, PAN_VARY_FRAGCOORD,
2555                           MALI_ATTRIBUTE_SPECIAL_FRAG_COORD);
2556 #endif
2557 
2558    batch->varyings.bufs = T.gpu;
2559    batch->varyings.vs = linkage->producer;
2560    batch->varyings.fs = linkage->consumer;
2561 }
2562 #endif
2563 
2564 static void
2565 emit_tls(struct panfrost_batch *batch)
2566 {
2567    struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
2568 
2569    /* Emitted with the FB descriptor on Midgard. */
2570    if (PAN_ARCH <= 5 && batch->framebuffer.gpu)
2571       return;
2572 
2573    struct panfrost_bo *tls_bo =
2574       batch->stack_size ? panfrost_batch_get_scratchpad(
2575                              batch, batch->stack_size, dev->thread_tls_alloc,
2576                              dev->core_id_range)
2577                         : NULL;
2578    struct pan_tls_info tls = {
2579       .tls =
2580          {
2581             .ptr = tls_bo ? tls_bo->ptr.gpu : 0,
2582             .size = batch->stack_size,
2583          },
2584    };
2585 
2586    assert(batch->tls.cpu);
2587    GENX(pan_emit_tls)(&tls, batch->tls.cpu);
2588 }
2589 
2590 static void
2591 emit_fbd(struct panfrost_batch *batch, struct pan_fb_info *fb)
2592 {
2593    struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
2594    struct panfrost_bo *tls_bo =
2595       batch->stack_size ? panfrost_batch_get_scratchpad(
2596                              batch, batch->stack_size, dev->thread_tls_alloc,
2597                              dev->core_id_range)
2598                         : NULL;
2599    struct pan_tls_info tls = {
2600       .tls =
2601          {
2602             .ptr = tls_bo ? tls_bo->ptr.gpu : 0,
2603             .size = batch->stack_size,
2604          },
2605    };
2606 
2607 #if PAN_ARCH >= 6
2608    fb->sample_positions =
2609       dev->sample_positions->ptr.gpu +
2610       panfrost_sample_positions_offset(pan_sample_pattern(fb->nr_samples));
2611 #endif
2612 
2613    batch->framebuffer.gpu |=
2614       GENX(pan_emit_fbd)(fb, 0, &tls, &batch->tiler_ctx, batch->framebuffer.cpu);
2615 }
2616 
2617 /* Mark a surface as written */
2618 
2619 static void
2620 panfrost_initialize_surface(struct panfrost_batch *batch,
2621                             struct pipe_surface *surf)
2622 {
2623    if (surf) {
2624       struct panfrost_resource *rsrc = pan_resource(surf->texture);
2625       BITSET_SET(rsrc->valid.data, surf->u.tex.level);
2626       if (rsrc->separate_stencil)
2627          BITSET_SET(rsrc->separate_stencil->valid.data, surf->u.tex.level);
2628    }
2629 }
2630 
2631 /* Generate a fragment job. This should be called once per frame. (Usually,
2632  * this corresponds to eglSwapBuffers or one of glFlush, glFinish)
2633  */
2634 static void
2635 emit_fragment_job(struct panfrost_batch *batch, const struct pan_fb_info *pfb)
2636 {
2637    /* Mark the affected buffers as initialized, since we're writing to them.
2638     * Also, add the surfaces we're writing to the batch */
2639 
2640    struct pipe_framebuffer_state *fb = &batch->key;
2641 
2642    for (unsigned i = 0; i < fb->nr_cbufs; ++i)
2643       panfrost_initialize_surface(batch, fb->cbufs[i]);
2644 
2645    panfrost_initialize_surface(batch, fb->zsbuf);
2646 
2647    /* The passed tile coords can be out of range in some cases, so we need
2648     * to clamp them to the framebuffer size to avoid a TILE_RANGE_FAULT.
2649     * Theoretically we also need to clamp the coordinates positive, but we
2650     * avoid that edge case as all four values are unsigned. Also,
2651     * theoretically we could clamp the minima, but if that has to happen
2652     * the asserts would fail anyway (since the maxima would get clamped
2653     * and then be smaller than the minima). An edge case of sorts occurs
2654     * when no scissors are added to draw, so by default min=~0 and max=0.
2655     * But that can't happen if any actual drawing occurs (beyond a
2656     * wallpaper reload), so this is again irrelevant in practice. */
2657 
2658    batch->maxx = MIN2(batch->maxx, fb->width);
2659    batch->maxy = MIN2(batch->maxy, fb->height);
2660 
2661    /* Rendering region must be at least 1x1; otherwise, there is nothing
2662     * to do and the whole job chain should have been discarded. */
2663 
2664    assert(batch->maxx > batch->minx);
2665    assert(batch->maxy > batch->miny);
2666 
2667    JOBX(emit_fragment_job)(batch, pfb);
2668 }
2669 
2670 /* Count generated primitives (when there is no geom/tess shaders) for
2671  * transform feedback */
2672 
2673 static void
2674 panfrost_statistics_record(struct panfrost_context *ctx,
2675                            const struct pipe_draw_info *info,
2676                            const struct pipe_draw_start_count_bias *draw)
2677 {
2678    if (!ctx->active_queries)
2679       return;
2680 
2681    uint32_t prims = u_prims_for_vertices(info->mode, draw->count);
2682    ctx->prims_generated += prims;
2683 
2684    if (!ctx->streamout.num_targets)
2685       return;
2686 
2687    ctx->tf_prims_generated += prims;
2688    ctx->dirty |= PAN_DIRTY_SO;
2689 }
2690 
2691 static void
2692 panfrost_update_streamout_offsets(struct panfrost_context *ctx)
2693 {
2694    unsigned count =
2695       u_stream_outputs_for_vertices(ctx->active_prim, ctx->vertex_count);
2696 
2697    for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
2698       if (!ctx->streamout.targets[i])
2699          continue;
2700 
2701       pan_so_target(ctx->streamout.targets[i])->offset += count;
2702    }
2703 }
2704 
2705 /* On Bifrost and older, the Renderer State Descriptor aggregates many pieces of
2706  * 3D state. In particular, it groups the fragment shader descriptor with
2707  * depth/stencil, blend, polygon offset, and multisampling state. These pieces
2708  * of state are dirty tracked independently for the benefit of newer GPUs that
2709  * separate the descriptors. FRAGMENT_RSD_DIRTY_MASK contains the list of 3D
2710  * dirty flags that trigger re-emits of the fragment RSD.
2711  *
2712  * Obscurely, occlusion queries are included. Occlusion query state is nominally
2713  * specified in the draw call descriptor, but must be considered when determining
2714  * early-Z state, which is part of the RSD.
2715  */
2716 #define FRAGMENT_RSD_DIRTY_MASK                                                \
2717    (PAN_DIRTY_ZS | PAN_DIRTY_BLEND | PAN_DIRTY_MSAA | PAN_DIRTY_RASTERIZER |   \
2718     PAN_DIRTY_OQ)
2719 
2720 static inline void
2721 panfrost_update_shader_state(struct panfrost_batch *batch,
2722                              enum pipe_shader_type st)
2723 {
2724    struct panfrost_context *ctx = batch->ctx;
2725    struct panfrost_compiled_shader *ss = ctx->prog[st];
2726 
2727    bool frag = (st == PIPE_SHADER_FRAGMENT);
2728    unsigned dirty_3d = ctx->dirty;
2729    unsigned dirty = ctx->dirty_shader[st];
2730 
2731    if (dirty & (PAN_DIRTY_STAGE_TEXTURE | PAN_DIRTY_STAGE_SHADER)) {
2732       batch->textures[st] = panfrost_emit_texture_descriptors(batch, st);
2733    }
2734 
2735    if (dirty & PAN_DIRTY_STAGE_SAMPLER) {
2736       batch->samplers[st] = panfrost_emit_sampler_descriptors(batch, st);
2737    }
2738 
2739    /* On Bifrost and older, the fragment shader descriptor is fused
2740     * together with the renderer state; the combined renderer state
2741     * descriptor is emitted below. Otherwise, the shader descriptor is
2742     * standalone and is emitted here.
2743     */
2744    if ((dirty & PAN_DIRTY_STAGE_SHADER) && !((PAN_ARCH <= 7) && frag)) {
2745       batch->rsd[st] = panfrost_emit_compute_shader_meta(batch, st);
2746    }
2747 
2748 #if PAN_ARCH >= 9
2749    if (dirty & PAN_DIRTY_STAGE_IMAGE) {
2750       batch->images[st] =
2751          ctx->image_mask[st] ? panfrost_emit_images(batch, st) : 0;
2752    }
2753 
2754    if (dirty & PAN_DIRTY_STAGE_SSBO)
2755       batch->ssbos[st] = panfrost_emit_ssbos(batch, st);
2756 #endif
2757 
2758    if ((dirty & ss->dirty_shader) || (dirty_3d & ss->dirty_3d)) {
2759       batch->uniform_buffers[st] = panfrost_emit_const_buf(
2760          batch, st, &batch->nr_uniform_buffers[st], &batch->push_uniforms[st],
2761          &batch->nr_push_uniforms[st]);
2762    }
2763 
2764 #if PAN_ARCH <= 7
   /* On Bifrost and older, if the fragment shader changes, or if any
    * renderer state specified together with the fragment shader changes,
    * the whole renderer state descriptor is dirtied and must be
    * re-emitted.
    */
   if (frag && ((dirty & PAN_DIRTY_STAGE_SHADER) ||
                (dirty_3d & FRAGMENT_RSD_DIRTY_MASK))) {

      batch->rsd[st] = panfrost_emit_frag_shader_meta(batch);
   }

   /* Vertex shaders need to mix vertex data and image descriptors in the
    * attribute array. This is taken care of in panfrost_update_state_3d().
    */
   if (st != PIPE_SHADER_VERTEX && (dirty & PAN_DIRTY_STAGE_IMAGE)) {
      batch->attribs[st] =
         panfrost_emit_image_attribs(batch, &batch->attrib_bufs[st], st);
   }
#endif
}

static inline void
panfrost_update_state_3d(struct panfrost_batch *batch)
{
   struct panfrost_context *ctx = batch->ctx;
   unsigned dirty = ctx->dirty;

   if (dirty & PAN_DIRTY_TLS_SIZE)
      panfrost_batch_adjust_stack_size(batch);

   if (dirty & PAN_DIRTY_BLEND)
      panfrost_set_batch_masks_blend(batch);

   if (dirty & PAN_DIRTY_ZS)
      panfrost_set_batch_masks_zs(batch);

#if PAN_ARCH >= 9
   if ((dirty & (PAN_DIRTY_ZS | PAN_DIRTY_RASTERIZER)) ||
       (ctx->dirty_shader[PIPE_SHADER_FRAGMENT] & PAN_DIRTY_STAGE_SHADER))
      batch->depth_stencil = panfrost_emit_depth_stencil(batch);

   if (dirty & PAN_DIRTY_BLEND)
      batch->blend = panfrost_emit_blend_valhall(batch);

   if (dirty & PAN_DIRTY_VERTEX) {
      batch->attribs[PIPE_SHADER_VERTEX] = panfrost_emit_vertex_data(batch);

      batch->attrib_bufs[PIPE_SHADER_VERTEX] =
         panfrost_emit_vertex_buffers(batch);
   }
#else
   unsigned vt_shader_dirty = ctx->dirty_shader[PIPE_SHADER_VERTEX];

   /* Vertex data, the vertex shader, and images accessed by the vertex
    * shader all affect the attributes array, so we need to re-emit it
    * anytime one of these changes. */
   if ((dirty & PAN_DIRTY_VERTEX) ||
       (vt_shader_dirty & (PAN_DIRTY_STAGE_IMAGE | PAN_DIRTY_STAGE_SHADER))) {
      batch->attribs[PIPE_SHADER_VERTEX] = panfrost_emit_vertex_data(
         batch, &batch->attrib_bufs[PIPE_SHADER_VERTEX]);
   }
#endif
}

static void
panfrost_launch_xfb(struct panfrost_batch *batch,
                    const struct pipe_draw_info *info, unsigned count)
{
   struct panfrost_context *ctx = batch->ctx;

   /* Nothing to do */
   if (batch->ctx->streamout.num_targets == 0)
      return;

   /* TODO: XFB with index buffers */
   // assert(info->index_size == 0);

   if (!u_trim_pipe_prim(info->mode, &count))
      return;

   perf_debug(batch->ctx, "Emulating transform feedback");

   struct panfrost_uncompiled_shader *vs_uncompiled =
      ctx->uncompiled[PIPE_SHADER_VERTEX];
   struct panfrost_compiled_shader *vs = ctx->prog[PIPE_SHADER_VERTEX];

   vs_uncompiled->xfb->stream_output = vs->stream_output;

   mali_ptr saved_rsd = batch->rsd[PIPE_SHADER_VERTEX];
   mali_ptr saved_ubo = batch->uniform_buffers[PIPE_SHADER_VERTEX];
   mali_ptr saved_push = batch->push_uniforms[PIPE_SHADER_VERTEX];
   unsigned saved_nr_push_uniforms =
      batch->nr_push_uniforms[PIPE_SHADER_VERTEX];

   ctx->uncompiled[PIPE_SHADER_VERTEX] = NULL; /* should not be read */
   ctx->prog[PIPE_SHADER_VERTEX] = vs_uncompiled->xfb;
   batch->rsd[PIPE_SHADER_VERTEX] =
      panfrost_emit_compute_shader_meta(batch, PIPE_SHADER_VERTEX);

   batch->uniform_buffers[PIPE_SHADER_VERTEX] =
      panfrost_emit_const_buf(batch, PIPE_SHADER_VERTEX, NULL,
                              &batch->push_uniforms[PIPE_SHADER_VERTEX],
                              &batch->nr_push_uniforms[PIPE_SHADER_VERTEX]);

   JOBX(launch_xfb)(batch, info, count);
   batch->compute_count++;

   ctx->uncompiled[PIPE_SHADER_VERTEX] = vs_uncompiled;
   ctx->prog[PIPE_SHADER_VERTEX] = vs;
   batch->rsd[PIPE_SHADER_VERTEX] = saved_rsd;
   batch->uniform_buffers[PIPE_SHADER_VERTEX] = saved_ubo;
   batch->push_uniforms[PIPE_SHADER_VERTEX] = saved_push;
   batch->nr_push_uniforms[PIPE_SHADER_VERTEX] = saved_nr_push_uniforms;
}

/*
 * Increase the vertex count on the batch using a saturating add, and hope the
 * compiler can use the machine instruction here...
 */
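/* Example (illustrative): if batch->vertex_count == UINT32_MAX - 2 and
 * increment == 8, the 32-bit sum wraps around to 5 < vertex_count, so the
 * count clamps to UINT32_MAX instead of wrapping. */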
static inline void
panfrost_increase_vertex_count(struct panfrost_batch *batch, uint32_t increment)
{
   uint32_t sum = batch->vertex_count + increment;

   if (sum >= batch->vertex_count)
      batch->vertex_count = sum;
   else
      batch->vertex_count = UINT32_MAX;

#if PAN_ARCH <= 5
   batch->tiler_ctx.midgard.vertex_count = batch->vertex_count;
#endif
}

/*
 * If we change whether we're drawing points, or whether point sprites are
 * enabled (specified in the rasterizer), we may need to rebind shaders
 * accordingly. This implicitly covers the case of rebinding framebuffers,
 * because all dirty flags are set there.
 */
static void
panfrost_update_active_prim(struct panfrost_context *ctx,
                            const struct pipe_draw_info *info)
{
   const enum mesa_prim prev_prim = u_reduced_prim(ctx->active_prim);
   const enum mesa_prim new_prim = u_reduced_prim(info->mode);

   ctx->active_prim = info->mode;

   if ((ctx->dirty & PAN_DIRTY_RASTERIZER) ||
       (prev_prim != new_prim)) {
      panfrost_update_shader_variant(ctx, PIPE_SHADER_FRAGMENT);
   }
}

static unsigned
panfrost_draw_get_vertex_count(struct panfrost_batch *batch,
                               const struct pipe_draw_info *info,
                               const struct pipe_draw_start_count_bias *draw,
                               bool idvs)
{
   struct panfrost_context *ctx = batch->ctx;
   unsigned vertex_count = ctx->vertex_count;
   unsigned min_index = 0, max_index = 0;

   batch->indices = 0;
   if (info->index_size && PAN_ARCH >= 9) {
      batch->indices = panfrost_get_index_buffer(batch, info, draw);

      /* Use index count to estimate vertex count */
      panfrost_increase_vertex_count(batch, draw->count);
   } else if (info->index_size) {
      batch->indices = panfrost_get_index_buffer_bounded(
         batch, info, draw, &min_index, &max_index);

      /* Use the index bounds computed above */
      vertex_count = max_index - min_index + 1;
      ctx->offset_start = min_index + draw->index_bias;
      panfrost_increase_vertex_count(batch, vertex_count);
   } else {
      ctx->offset_start = draw->start;
      panfrost_increase_vertex_count(batch, vertex_count);
   }

   if (PAN_ARCH <= 9 && info->instance_count > 1) {
      unsigned count = vertex_count;

      /* Index-Driven Vertex Shading requires different instances to
       * have different cache lines for position results. Each vertex
       * position is 16 bytes and the Mali cache line is 64 bytes, so
       * the per-instance vertex count must be aligned to 4 vertices.
       */
      if (idvs)
         count = ALIGN_POT(count, 4);
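      /* Worked example: 64 / 16 = 4 positions per cache line, so a
       * 10-vertex instance pads to ALIGN_POT(10, 4) = 12 vertices here. */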

      ctx->padded_count = panfrost_padded_vertex_count(count);
   } else {
      ctx->padded_count = vertex_count;
   }

   return vertex_count;
}

static void
panfrost_single_draw_direct(struct panfrost_batch *batch,
                            const struct pipe_draw_info *info,
                            unsigned drawid_offset,
                            const struct pipe_draw_start_count_bias *draw)
{
   if (!draw->count || !info->instance_count)
      return;

   struct panfrost_context *ctx = batch->ctx;

   panfrost_update_active_prim(ctx, info);

   /* Take into account a negative bias */
   ctx->vertex_count =
      draw->count + (info->index_size ? abs(draw->index_bias) : 0);
   ctx->instance_count = info->instance_count;
   ctx->base_vertex = info->index_size ? draw->index_bias : 0;
   ctx->base_instance = info->start_instance;
   ctx->drawid = drawid_offset;

   struct panfrost_compiled_shader *vs = ctx->prog[PIPE_SHADER_VERTEX];
   bool idvs = vs->info.vs.idvs;

   UNUSED unsigned vertex_count =
      panfrost_draw_get_vertex_count(batch, info, draw, idvs);

   panfrost_statistics_record(ctx, info, draw);

   panfrost_update_state_3d(batch);
   panfrost_update_shader_state(batch, PIPE_SHADER_VERTEX);
   panfrost_update_shader_state(batch, PIPE_SHADER_FRAGMENT);
   panfrost_clean_state_3d(ctx);

   if (ctx->uncompiled[PIPE_SHADER_VERTEX]->xfb) {
      panfrost_launch_xfb(batch, info, draw->count);
   }

   /* Increment transform feedback offsets */
   panfrost_update_streamout_offsets(ctx);

   /* Any side effects must be handled by the XFB shader, so we only need
    * to run vertex shaders if we need rasterization.
    */
   if (panfrost_batch_skip_rasterization(batch))
      return;

#if PAN_ARCH <= 7
   /* Emit all sorts of descriptors. */
   panfrost_emit_varying_descriptor(batch,
                                    ctx->padded_count * ctx->instance_count,
                                    info->mode == MESA_PRIM_POINTS);
#endif

   JOBX(launch_draw)(batch, info, drawid_offset, draw, vertex_count);
   batch->draw_count++;
}

static bool
panfrost_compatible_batch_state(struct panfrost_batch *batch,
                                enum mesa_prim reduced_prim)
{
   struct panfrost_context *ctx = batch->ctx;
   struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;

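   /* Note on the helper used below: pan_tristate_set() commits the batch
    * to one value of a boolean and fails if the batch has already
    * committed to the opposite value, in which case prepare_draw() falls
    * back to a fresh batch. */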
   if (reduced_prim == MESA_PRIM_LINES &&
       !pan_tristate_set(&batch->line_smoothing, rast->line_smooth))
      return false;

   /* Only applies on Valhall */
   if (PAN_ARCH < 9)
      return true;

   bool coord = (rast->sprite_coord_mode == PIPE_SPRITE_COORD_LOWER_LEFT);
   bool first = rast->flatshade_first;

   /* gl_PointCoord orientation only matters when drawing points, but
    * provoking vertex doesn't matter for points.
    */
   if (reduced_prim == MESA_PRIM_POINTS)
      return pan_tristate_set(&batch->sprite_coord_origin, coord);
   else
      return pan_tristate_set(&batch->first_provoking_vertex, first);
}

static struct panfrost_batch *
prepare_draw(struct pipe_context *pipe, const struct pipe_draw_info *info)
{
   struct panfrost_context *ctx = pan_context(pipe);
   struct panfrost_device *dev = pan_device(pipe->screen);

   /* Do some common setup */
   struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);

   /* Don't add too many jobs to a single batch. The job manager hardware
    * has a hard limit of 65536 jobs per job chain. Since a draw issues at
    * most 3 jobs (a vertex job, a tiler job, and a compute job if XFB is
    * enabled), we could use 65536 / 3 as a limit, but we choose a smaller
    * (arbitrary) soft limit to avoid the risk of timeouts. This might not
    * be a good idea. */
   if (unlikely(batch->draw_count > 10000))
      batch = panfrost_get_fresh_batch_for_fbo(ctx, "Too many draws");

   enum mesa_prim reduced_prim = u_reduced_prim(info->mode);

   if (unlikely(!panfrost_compatible_batch_state(batch, reduced_prim))) {
      batch = panfrost_get_fresh_batch_for_fbo(ctx, "State change");

      ASSERTED bool succ = panfrost_compatible_batch_state(batch, reduced_prim);
      assert(succ && "must be able to set state for a fresh batch");
   }

   /* panfrost_batch_skip_rasterization reads
    * batch->scissor_culls_everything, which is set by
    * panfrost_emit_viewport, so call that first.
    */
   if (ctx->dirty & (PAN_DIRTY_VIEWPORT | PAN_DIRTY_SCISSOR))
      batch->viewport = panfrost_emit_viewport(batch);

   /* Mark everything dirty when debugging */
   if (unlikely(dev->debug & PAN_DBG_DIRTY))
      panfrost_dirty_state_all(ctx);

   /* Conservatively assume draw parameters always change */
   ctx->dirty |= PAN_DIRTY_PARAMS | PAN_DIRTY_DRAWID;

   return batch;
}

static void
panfrost_draw_indirect(struct pipe_context *pipe,
                       const struct pipe_draw_info *info,
                       unsigned drawid_offset,
                       const struct pipe_draw_indirect_info *indirect)
{
   struct panfrost_context *ctx = pan_context(pipe);

   if (!PAN_GPU_SUPPORTS_DRAW_INDIRECT || ctx->active_queries ||
       ctx->streamout.num_targets) {
      util_draw_indirect(pipe, info, drawid_offset, indirect);
      perf_debug(ctx, "Emulating indirect draw on the CPU");
      return;
   }

   struct panfrost_batch *batch = prepare_draw(pipe, info);
   struct pipe_draw_info tmp_info = *info;

   panfrost_batch_read_rsrc(batch, pan_resource(indirect->buffer),
                            PIPE_SHADER_VERTEX);

   panfrost_update_active_prim(ctx, &tmp_info);

   ctx->drawid = drawid_offset;

   batch->indices = 0;
   if (info->index_size) {
      struct panfrost_resource *index_buffer =
         pan_resource(info->index.resource);
      panfrost_batch_read_rsrc(batch, index_buffer, PIPE_SHADER_VERTEX);
      batch->indices = index_buffer->image.data.base;
   }

   panfrost_update_state_3d(batch);
   panfrost_update_shader_state(batch, PIPE_SHADER_VERTEX);
   panfrost_update_shader_state(batch, PIPE_SHADER_FRAGMENT);
   panfrost_clean_state_3d(ctx);

   /* Increment transform feedback offsets */
   panfrost_update_streamout_offsets(ctx);

   /* Any side effects must be handled by the XFB shader, so we only need
    * to run vertex shaders if we need rasterization.
    */
   if (panfrost_batch_skip_rasterization(batch))
      return;

   JOBX(launch_draw_indirect)(batch, &tmp_info, drawid_offset, indirect);
   batch->draw_count++;
}

static void
panfrost_multi_draw_direct(struct pipe_context *pipe,
                           const struct pipe_draw_info *info,
                           unsigned drawid_offset,
                           const struct pipe_draw_start_count_bias *draws,
                           unsigned num_draws)
{
   struct panfrost_context *ctx = pan_context(pipe);
   struct panfrost_batch *batch = prepare_draw(pipe, info);
   struct pipe_draw_info tmp_info = *info;
   unsigned drawid = drawid_offset;

   for (unsigned i = 0; i < num_draws; i++) {
      panfrost_single_draw_direct(batch, &tmp_info, drawid, &draws[i]);

      if (tmp_info.increment_draw_id) {
         ctx->dirty |= PAN_DIRTY_DRAWID;
         drawid++;
      }
   }
}

static void
panfrost_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info,
                  unsigned drawid_offset,
                  const struct pipe_draw_indirect_info *indirect,
                  const struct pipe_draw_start_count_bias *draws,
                  unsigned num_draws)
{
   struct panfrost_context *ctx = pan_context(pipe);

   if (!panfrost_render_condition_check(ctx))
      return;

   ctx->draw_calls++;

   if (indirect && indirect->buffer) {
      assert(num_draws == 1);
      panfrost_draw_indirect(pipe, info, drawid_offset, indirect);
   } else {
      panfrost_multi_draw_direct(pipe, info, drawid_offset, draws, num_draws);
   }
}

/* Launch grid is the compute equivalent of draw_vbo, so in this routine, we
 * construct the COMPUTE job and some of its payload.
 */

static void
panfrost_launch_grid_on_batch(struct pipe_context *pipe,
                              struct panfrost_batch *batch,
                              const struct pipe_grid_info *info)
{
   struct panfrost_context *ctx = pan_context(pipe);

   util_dynarray_foreach(&ctx->global_buffers, struct pipe_resource *, res) {
      if (!*res)
         continue;

      struct panfrost_resource *buffer = pan_resource(*res);
      panfrost_batch_write_rsrc(batch, buffer, PIPE_SHADER_COMPUTE);
   }

   if (info->indirect && !PAN_GPU_SUPPORTS_DISPATCH_INDIRECT) {
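      /* Gallium's dispatch-indirect buffer holds three consecutive
       * uint32_t workgroup counts (x, y, z), so read them back on the CPU
       * and relaunch as a direct grid. */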
      struct pipe_transfer *transfer;
      uint32_t *params =
         pipe_buffer_map_range(pipe, info->indirect, info->indirect_offset,
                               3 * sizeof(uint32_t), PIPE_MAP_READ, &transfer);

      struct pipe_grid_info direct = *info;
      direct.indirect = NULL;
      direct.grid[0] = params[0];
      direct.grid[1] = params[1];
      direct.grid[2] = params[2];
      pipe_buffer_unmap(pipe, transfer);

      if (params[0] && params[1] && params[2])
         panfrost_launch_grid_on_batch(pipe, batch, &direct);

      return;
   }

   ctx->compute_grid = info;

   /* Conservatively assume workgroup size changes every launch */
   ctx->dirty |= PAN_DIRTY_PARAMS;

   panfrost_update_shader_state(batch, PIPE_SHADER_COMPUTE);

   /* We want our compute thread descriptor to be per job.
    * Save the global one, and restore it when we're done emitting
    * the job.
    */
   mali_ptr saved_tls = batch->tls.gpu;
   batch->tls.gpu = panfrost_emit_shared_memory(batch, info);

   /* If indirect, mark the indirect buffer as being read */
   if (info->indirect)
      panfrost_batch_read_rsrc(batch, pan_resource(info->indirect),
                               PIPE_SHADER_COMPUTE);

   /* Launch it */
   JOBX(launch_grid)(batch, info);
   batch->compute_count++;
   batch->tls.gpu = saved_tls;
}

static void
panfrost_launch_grid(struct pipe_context *pipe,
                     const struct pipe_grid_info *info)
{
   struct panfrost_context *ctx = pan_context(pipe);

   /* XXX - shouldn't be necessary with working memory barriers. Affected
    * test: KHR-GLES31.core.compute_shader.pipeline-post-xfb */
   panfrost_flush_all_batches(ctx, "Launch grid pre-barrier");

   struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
   panfrost_launch_grid_on_batch(pipe, batch, info);

   panfrost_flush_all_batches(ctx, "Launch grid post-barrier");
}

#define AFBC_BLOCK_ALIGN 16

static void
panfrost_launch_afbc_shader(struct panfrost_batch *batch, void *cso,
                            struct pipe_constant_buffer *cbuf,
                            unsigned nr_blocks)
{
   struct pipe_context *pctx = &batch->ctx->base;
   void *saved_cso = NULL;
   struct pipe_constant_buffer saved_const = {};
   struct pipe_grid_info grid = {
      .block[0] = 1,
      .block[1] = 1,
      .block[2] = 1,
      .grid[0] = nr_blocks,
      .grid[1] = 1,
      .grid[2] = 1,
   };

   struct panfrost_constant_buffer *pbuf =
      &batch->ctx->constant_buffer[PIPE_SHADER_COMPUTE];
   saved_cso = batch->ctx->uncompiled[PIPE_SHADER_COMPUTE];
   util_copy_constant_buffer(&pbuf->cb[0], &saved_const, true);

   pctx->bind_compute_state(pctx, cso);
   pctx->set_constant_buffer(pctx, PIPE_SHADER_COMPUTE, 0, false, cbuf);

   panfrost_launch_grid_on_batch(pctx, batch, &grid);

   pctx->bind_compute_state(pctx, saved_cso);
   pctx->set_constant_buffer(pctx, PIPE_SHADER_COMPUTE, 0, true, &saved_const);
}

#define LAUNCH_AFBC_SHADER(name, batch, rsrc, consts, nr_blocks)               \
   struct pan_afbc_shader_data *shaders =                                      \
      panfrost_afbc_get_shaders(batch->ctx, rsrc, AFBC_BLOCK_ALIGN);           \
   struct pipe_constant_buffer constant_buffer = {                             \
      .buffer_size = sizeof(consts),                                           \
      .user_buffer = &consts};                                                 \
   panfrost_launch_afbc_shader(batch, shaders->name##_cso, &constant_buffer,   \
                               nr_blocks);
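
/* Note that the macro above declares locals (shaders, constant_buffer) in
 * the enclosing scope, so it can only be expanded once per block, as in the
 * two helpers below. Reading the constants they pass: "size" appears to
 * write per-superblock layout information into the metadata BO, which
 * "pack" then consumes to produce the compacted copy. */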

static void
panfrost_afbc_size(struct panfrost_batch *batch, struct panfrost_resource *src,
                   struct panfrost_bo *metadata, unsigned offset,
                   unsigned level)
{
   struct pan_image_slice_layout *slice = &src->image.layout.slices[level];
   struct panfrost_afbc_size_info consts = {
      .src =
         src->image.data.base + src->image.data.offset + slice->offset,
      .metadata = metadata->ptr.gpu + offset,
   };

   panfrost_batch_read_rsrc(batch, src, PIPE_SHADER_COMPUTE);
   panfrost_batch_write_bo(batch, metadata, PIPE_SHADER_COMPUTE);

   LAUNCH_AFBC_SHADER(size, batch, src, consts, slice->afbc.nr_blocks);
}

static void
panfrost_afbc_pack(struct panfrost_batch *batch, struct panfrost_resource *src,
                   struct panfrost_bo *dst,
                   struct pan_image_slice_layout *dst_slice,
                   struct panfrost_bo *metadata, unsigned metadata_offset,
                   unsigned level)
{
   struct pan_image_slice_layout *src_slice = &src->image.layout.slices[level];
   struct panfrost_afbc_pack_info consts = {
      .src = src->image.data.base + src->image.data.offset +
             src_slice->offset,
      .dst = dst->ptr.gpu + dst_slice->offset,
      .metadata = metadata->ptr.gpu + metadata_offset,
      .header_size = dst_slice->afbc.header_size,
      .src_stride = src_slice->afbc.stride,
      .dst_stride = dst_slice->afbc.stride,
   };

   panfrost_batch_write_rsrc(batch, src, PIPE_SHADER_COMPUTE);
   panfrost_batch_write_bo(batch, dst, PIPE_SHADER_COMPUTE);
   panfrost_batch_add_bo(batch, metadata, PIPE_SHADER_COMPUTE);

   LAUNCH_AFBC_SHADER(pack, batch, src, consts, dst_slice->afbc.nr_blocks);
}

static void *
panfrost_create_rasterizer_state(struct pipe_context *pctx,
                                 const struct pipe_rasterizer_state *cso)
{
   struct panfrost_rasterizer *so = CALLOC_STRUCT(panfrost_rasterizer);

   so->base = *cso;

#if PAN_ARCH <= 7
   pan_pack(&so->multisample, MULTISAMPLE_MISC, cfg) {
      cfg.multisample_enable = cso->multisample;
      cfg.fixed_function_near_discard = cso->depth_clip_near;
      cfg.fixed_function_far_discard = cso->depth_clip_far;
      cfg.fixed_function_depth_range_fixed = !cso->depth_clamp;
      cfg.shader_depth_range_fixed = true;
   }

   pan_pack(&so->stencil_misc, STENCIL_MASK_MISC, cfg) {
      cfg.front_facing_depth_bias = cso->offset_tri;
      cfg.back_facing_depth_bias = cso->offset_tri;
      cfg.single_sampled_lines = !cso->multisample;
   }
#endif

   return so;
}

#if PAN_ARCH >= 9
/*
 * Given a pipe_vertex_element, pack the corresponding Valhall attribute
 * descriptor. This function is called at CSO create time.
 */
static void
panfrost_pack_attribute(struct panfrost_device *dev,
                        const struct pipe_vertex_element el,
                        struct mali_attribute_packed *out)
{
   pan_pack(out, ATTRIBUTE, cfg) {
      cfg.table = PAN_TABLE_ATTRIBUTE_BUFFER;
      cfg.frequency = (el.instance_divisor > 0)
                         ? MALI_ATTRIBUTE_FREQUENCY_INSTANCE
                         : MALI_ATTRIBUTE_FREQUENCY_VERTEX;
      cfg.format = GENX(panfrost_format_from_pipe_format)(el.src_format)->hw;
      cfg.offset = el.src_offset;
      cfg.buffer_index = el.vertex_buffer_index;
      cfg.stride = el.src_stride;

      if (el.instance_divisor == 0) {
         /* Per-vertex */
         cfg.attribute_type = MALI_ATTRIBUTE_TYPE_1D;
         cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_VERTEX;
         cfg.offset_enable = true;
      } else if (util_is_power_of_two_or_zero(el.instance_divisor)) {
         /* Per-instance, POT divisor */
         cfg.attribute_type = MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR;
         cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_INSTANCE;
         cfg.divisor_r = __builtin_ctz(el.instance_divisor);
      } else {
         /* Per-instance, NPOT divisor */
         cfg.attribute_type = MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR;
         cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_INSTANCE;

         cfg.divisor_d = panfrost_compute_magic_divisor(
            el.instance_divisor, &cfg.divisor_r, &cfg.divisor_e);
      }
   }
}
#endif
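
/* Sketch of the three cases above (the index derivation is inferred from
 * the packing logic, not quoted from hardware documentation):
 *
 *   divisor == 0 -> per-vertex fetch, indexed by vertex ID
 *   divisor == 4 -> POT path, index = instance >> 2 (divisor_r = ctz(4))
 *   divisor == 3 -> NPOT path, index = instance / 3 via the magic
 *                   multiply-shift encoded in divisor_d/r/e
 */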

static void *
panfrost_create_vertex_elements_state(struct pipe_context *pctx,
                                      unsigned num_elements,
                                      const struct pipe_vertex_element *elements)
{
   struct panfrost_vertex_state *so = CALLOC_STRUCT(panfrost_vertex_state);
   UNUSED struct panfrost_device *dev = pan_device(pctx->screen);

   so->num_elements = num_elements;
   memcpy(so->pipe, elements, sizeof(*elements) * num_elements);

   for (unsigned i = 0; i < num_elements; ++i)
      so->strides[elements[i].vertex_buffer_index] = elements[i].src_stride;
#if PAN_ARCH >= 9
   for (unsigned i = 0; i < num_elements; ++i)
      panfrost_pack_attribute(dev, elements[i], &so->attributes[i]);
#else
   /* Assign attribute buffers corresponding to the vertex buffers, keyed
    * for a particular divisor since that's how instancing works on Mali */
   for (unsigned i = 0; i < num_elements; ++i) {
      so->element_buffer[i] = pan_assign_vertex_buffer(
         so->buffers, &so->nr_bufs, elements[i].vertex_buffer_index,
         elements[i].instance_divisor);
   }

   for (unsigned i = 0; i < num_elements; ++i) {
      enum pipe_format fmt = elements[i].src_format;
      so->formats[i] = GENX(panfrost_format_from_pipe_format)(fmt)->hw;

      assert(MALI_EXTRACT_INDEX(so->formats[i]) && "format must be supported");
   }

   /* Let's also prepare vertex builtins */
   so->formats[PAN_VERTEX_ID] =
      GENX(panfrost_format_from_pipe_format)(PIPE_FORMAT_R32_UINT)->hw;
   so->formats[PAN_INSTANCE_ID] =
      GENX(panfrost_format_from_pipe_format)(PIPE_FORMAT_R32_UINT)->hw;
#endif

   return so;
}

static inline unsigned
pan_pipe_to_stencil_op(enum pipe_stencil_op in)
{
   switch (in) {
   case PIPE_STENCIL_OP_KEEP:
      return MALI_STENCIL_OP_KEEP;
   case PIPE_STENCIL_OP_ZERO:
      return MALI_STENCIL_OP_ZERO;
   case PIPE_STENCIL_OP_REPLACE:
      return MALI_STENCIL_OP_REPLACE;
   case PIPE_STENCIL_OP_INCR:
      return MALI_STENCIL_OP_INCR_SAT;
   case PIPE_STENCIL_OP_DECR:
      return MALI_STENCIL_OP_DECR_SAT;
   case PIPE_STENCIL_OP_INCR_WRAP:
      return MALI_STENCIL_OP_INCR_WRAP;
   case PIPE_STENCIL_OP_DECR_WRAP:
      return MALI_STENCIL_OP_DECR_WRAP;
   case PIPE_STENCIL_OP_INVERT:
      return MALI_STENCIL_OP_INVERT;
   default:
      unreachable("Invalid stencil op");
   }
}

#if PAN_ARCH <= 7
static inline void
pan_pipe_to_stencil(const struct pipe_stencil_state *in,
                    struct mali_stencil_packed *out)
{
   pan_pack(out, STENCIL, s) {
      s.mask = in->valuemask;
      s.compare_function = (enum mali_func)in->func;
      s.stencil_fail = pan_pipe_to_stencil_op(in->fail_op);
      s.depth_fail = pan_pipe_to_stencil_op(in->zfail_op);
      s.depth_pass = pan_pipe_to_stencil_op(in->zpass_op);
   }
}
#endif

static bool
pipe_zs_always_passes(const struct pipe_depth_stencil_alpha_state *zsa)
{
   if (zsa->depth_enabled && zsa->depth_func != PIPE_FUNC_ALWAYS)
      return false;

   if (zsa->stencil[0].enabled && zsa->stencil[0].func != PIPE_FUNC_ALWAYS)
      return false;

   if (zsa->stencil[1].enabled && zsa->stencil[1].func != PIPE_FUNC_ALWAYS)
      return false;

   return true;
}

static void *
panfrost_create_depth_stencil_state(
   struct pipe_context *pipe, const struct pipe_depth_stencil_alpha_state *zsa)
{
   struct panfrost_zsa_state *so = CALLOC_STRUCT(panfrost_zsa_state);
   so->base = *zsa;

   const struct pipe_stencil_state front = zsa->stencil[0];
   const struct pipe_stencil_state back =
      zsa->stencil[1].enabled ? zsa->stencil[1] : front;

   enum mali_func depth_func =
      zsa->depth_enabled ? (enum mali_func)zsa->depth_func : MALI_FUNC_ALWAYS;

   /* Normalize (there's no separate enable) */
   if (PAN_ARCH <= 5 && !zsa->alpha_enabled)
      so->base.alpha_func = MALI_FUNC_ALWAYS;

#if PAN_ARCH <= 7
   /* Prepack relevant parts of the Renderer State Descriptor. They will
    * be ORed in at draw-time */
   pan_pack(&so->rsd_depth, MULTISAMPLE_MISC, cfg) {
      cfg.depth_function = depth_func;
      cfg.depth_write_mask = zsa->depth_writemask;
   }

   pan_pack(&so->rsd_stencil, STENCIL_MASK_MISC, cfg) {
      cfg.stencil_enable = front.enabled;
      cfg.stencil_mask_front = front.writemask;
      cfg.stencil_mask_back = back.writemask;

#if PAN_ARCH <= 5
      cfg.alpha_test_compare_function = (enum mali_func)so->base.alpha_func;
#endif
   }

   /* Stencil tests have their own words in the RSD */
   pan_pipe_to_stencil(&front, &so->stencil_front);
   pan_pipe_to_stencil(&back, &so->stencil_back);
#else
   /* Pack with nodefaults so that only explicitly-set fields affect
    * pan_merge() when emitting the depth/stencil descriptor */
   pan_pack_nodefaults(&so->desc, DEPTH_STENCIL, cfg) {
      cfg.front_compare_function = (enum mali_func)front.func;
      cfg.front_stencil_fail = pan_pipe_to_stencil_op(front.fail_op);
      cfg.front_depth_fail = pan_pipe_to_stencil_op(front.zfail_op);
      cfg.front_depth_pass = pan_pipe_to_stencil_op(front.zpass_op);

      cfg.back_compare_function = (enum mali_func)back.func;
      cfg.back_stencil_fail = pan_pipe_to_stencil_op(back.fail_op);
      cfg.back_depth_fail = pan_pipe_to_stencil_op(back.zfail_op);
      cfg.back_depth_pass = pan_pipe_to_stencil_op(back.zpass_op);

      cfg.stencil_test_enable = front.enabled;
      cfg.front_write_mask = front.writemask;
      cfg.back_write_mask = back.writemask;
      cfg.front_value_mask = front.valuemask;
      cfg.back_value_mask = back.valuemask;

      cfg.depth_write_enable = zsa->depth_writemask;
      cfg.depth_function = depth_func;
   }
#endif

   so->enabled = zsa->stencil[0].enabled ||
                 (zsa->depth_enabled && zsa->depth_func != PIPE_FUNC_ALWAYS);

   so->zs_always_passes = pipe_zs_always_passes(zsa);
   so->writes_zs = util_writes_depth_stencil(zsa);

   /* TODO: Bounds test should be easy */
   assert(!zsa->depth_bounds_test);

   return so;
}

static struct pipe_sampler_view *
panfrost_create_sampler_view(struct pipe_context *pctx,
                             struct pipe_resource *texture,
                             const struct pipe_sampler_view *template)
{
   struct panfrost_context *ctx = pan_context(pctx);
   struct panfrost_sampler_view *so =
      rzalloc(pctx, struct panfrost_sampler_view);

   pan_legalize_format(ctx, pan_resource(texture), template->format, false,
                       false);

   pipe_reference(NULL, &texture->reference);

   so->base = *template;
   so->base.texture = texture;
   so->base.reference.count = 1;
   so->base.context = pctx;

   panfrost_create_sampler_view_bo(so, pctx, texture);

   return (struct pipe_sampler_view *)so;
}

/* A given Gallium blend state can be encoded to the hardware in numerous,
 * dramatically divergent ways due to the interactions of blending with
 * framebuffer formats. Conceptually, there are two modes:
 *
 * - Fixed-function blending (for suitable framebuffer formats, suitable blend
 *   state, and suitable blend constant)
 *
 * - Blend shaders (for everything else)
 *
 * A given Gallium blend configuration will compile to exactly one
 * fixed-function blend state, if it compiles to any, although the constant
 * will vary across runs as that is tracked outside of the Gallium CSO.
 *
 * However, that same blend configuration will compile to many different blend
 * shaders, depending on the framebuffer formats active. The rationale is that
 * blend shaders override not just fixed-function blending but also
 * fixed-function format conversion, so blend shaders are keyed to a particular
 * framebuffer format. As an example, the tilebuffer format is identical for
 * RG16F and RG16UI -- both are simply 32-bit raw pixels -- so both require
 * blend shaders.
 *
 * All of this state is encapsulated in the panfrost_blend_state struct
 * (our subclass of pipe_blend_state).
 */

/* Create a blend CSO. Essentially, try to compile a fixed-function
 * expression and initialize blend shaders */

static void *
panfrost_create_blend_state(struct pipe_context *pipe,
                            const struct pipe_blend_state *blend)
{
   struct panfrost_blend_state *so = CALLOC_STRUCT(panfrost_blend_state);
   so->base = *blend;

   so->pan.logicop_enable = blend->logicop_enable;
   so->pan.logicop_func = blend->logicop_func;
   so->pan.rt_count = blend->max_rt + 1;

   for (unsigned c = 0; c < so->pan.rt_count; ++c) {
      unsigned g = blend->independent_blend_enable ? c : 0;
      const struct pipe_rt_blend_state pipe = blend->rt[g];
      struct pan_blend_equation equation = {0};

      equation.color_mask = pipe.colormask;
      equation.blend_enable = pipe.blend_enable;

      if (pipe.blend_enable) {
         equation.rgb_func = pipe.rgb_func;
         equation.rgb_src_factor = pipe.rgb_src_factor;
         equation.rgb_dst_factor = pipe.rgb_dst_factor;
         equation.alpha_func = pipe.alpha_func;
         equation.alpha_src_factor = pipe.alpha_src_factor;
         equation.alpha_dst_factor = pipe.alpha_dst_factor;
      }

      /* Determine some common properties */
      unsigned constant_mask = pan_blend_constant_mask(equation);
      const bool supports_2src = pan_blend_supports_2src(PAN_ARCH);
      so->info[c] = (struct pan_blend_info){
         .enabled = (equation.color_mask != 0) &&
                    !(blend->logicop_enable &&
                      blend->logicop_func == PIPE_LOGICOP_NOOP),
         .opaque = !blend->logicop_enable && pan_blend_is_opaque(equation),
         .constant_mask = constant_mask,

         /* TODO: check the dest for the logicop */
         .load_dest = blend->logicop_enable || pan_blend_reads_dest(equation),

         /* Could this possibly be fixed-function? */
         .fixed_function =
            !blend->logicop_enable &&
            pan_blend_can_fixed_function(equation, supports_2src) &&
            (!constant_mask || pan_blend_supports_constant(PAN_ARCH, c)),

         .alpha_zero_nop = pan_blend_alpha_zero_nop(equation),
         .alpha_one_store = pan_blend_alpha_one_store(equation),
      };

      so->pan.rts[c].equation = equation;

      /* Bifrost needs to know if any render target loads its
       * destination in the hot draw path, so precompute this */
      if (so->info[c].load_dest)
         so->load_dest_mask |= BITFIELD_BIT(c);

      /* Likewise, precompute which render targets are enabled at all */
      if (so->info[c].enabled)
         so->enabled_mask |= BITFIELD_BIT(c);

      /* Converting equations to Mali style is expensive, do it at
       * CSO create time instead of draw-time */
      if (so->info[c].fixed_function) {
         so->equation[c] = pan_pack_blend(equation);
      }
   }

   return so;
}
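
/* For intuition (a consequence of the checks above, not an extra rule):
 * enabling any logic op forces load_dest and rules out the fixed-function
 * path, while classic alpha blending without a logic op is fixed-function
 * whenever pan_blend_can_fixed_function() accepts the equation. */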

#if PAN_ARCH >= 9
static enum mali_flush_to_zero_mode
panfrost_ftz_mode(struct pan_shader_info *info)
{
   if (info->ftz_fp32) {
      if (info->ftz_fp16)
         return MALI_FLUSH_TO_ZERO_MODE_ALWAYS;
      else
         return MALI_FLUSH_TO_ZERO_MODE_DX11;
   } else {
      /* We don't have a "flush FP16, preserve FP32" mode, but APIs
       * should not be able to generate that.
       */
      assert(!info->ftz_fp16 && !info->ftz_fp32);
      return MALI_FLUSH_TO_ZERO_MODE_PRESERVE_SUBNORMALS;
   }
}
#endif
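
/* Resulting mapping: ftz_fp32 && ftz_fp16 -> ALWAYS; ftz_fp32 alone ->
 * DX11 (flush FP32, preserve FP16 -- assuming the mode name matches DX11
 * denorm rules); neither -> PRESERVE_SUBNORMALS. */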

static void
prepare_shader(struct panfrost_compiled_shader *state,
               struct panfrost_pool *pool, bool upload)
{
#if PAN_ARCH <= 7
   void *out = &state->partial_rsd;

   if (upload) {
      struct panfrost_ptr ptr =
         pan_pool_alloc_desc(&pool->base, RENDERER_STATE);

      state->state = panfrost_pool_take_ref(pool, ptr.gpu);
      out = ptr.cpu;
   }

   pan_pack(out, RENDERER_STATE, cfg) {
      pan_shader_prepare_rsd(&state->info, state->bin.gpu, &cfg);
   }
#else
   assert(upload);

   /* The address in the shader program descriptor must be non-null, but
    * the entire shader program descriptor may be omitted.
    *
    * See dEQP-GLES31.functional.compute.basic.empty
    */
   if (!state->bin.gpu)
      return;

   bool vs = (state->info.stage == MESA_SHADER_VERTEX);
   bool secondary_enable = (vs && state->info.vs.secondary_enable);

   unsigned nr_variants = secondary_enable ? 3 : vs ? 2 : 1;
   struct panfrost_ptr ptr =
      pan_pool_alloc_desc_array(&pool->base, nr_variants, SHADER_PROGRAM);

   state->state = panfrost_pool_take_ref(pool, ptr.gpu);
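
   /* Layout of the variant array emitted below (slot roles inferred from
    * the binary offsets used): [0] the generic shader, or the IDVS shader
    * used for points; [1] the IDVS shader for triangles, built without the
    * point-size write (no_psiz_offset); [2] the secondary (varying) IDVS
    * shader, when enabled. */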

   /* Generic, or IDVS/points */
   pan_pack(ptr.cpu, SHADER_PROGRAM, cfg) {
      cfg.stage = pan_shader_stage(&state->info);

      if (cfg.stage == MALI_SHADER_STAGE_FRAGMENT)
         cfg.fragment_coverage_bitmask_type = MALI_COVERAGE_BITMASK_TYPE_GL;
      else if (vs)
         cfg.vertex_warp_limit = MALI_WARP_LIMIT_HALF;

      cfg.register_allocation =
         pan_register_allocation(state->info.work_reg_count);
      cfg.binary = state->bin.gpu;
      cfg.preload.r48_r63 = (state->info.preload >> 48);
      cfg.flush_to_zero_mode = panfrost_ftz_mode(&state->info);

      if (cfg.stage == MALI_SHADER_STAGE_FRAGMENT)
         cfg.requires_helper_threads = state->info.contains_barrier;
   }

   if (!vs)
      return;

   /* IDVS/triangles */
   pan_pack(ptr.cpu + pan_size(SHADER_PROGRAM), SHADER_PROGRAM, cfg) {
      cfg.stage = pan_shader_stage(&state->info);
      cfg.vertex_warp_limit = MALI_WARP_LIMIT_HALF;
      cfg.register_allocation =
         pan_register_allocation(state->info.work_reg_count);
      cfg.binary = state->bin.gpu + state->info.vs.no_psiz_offset;
      cfg.preload.r48_r63 = (state->info.preload >> 48);
      cfg.flush_to_zero_mode = panfrost_ftz_mode(&state->info);
   }

   if (!secondary_enable)
      return;

   pan_pack(ptr.cpu + (pan_size(SHADER_PROGRAM) * 2), SHADER_PROGRAM, cfg) {
      unsigned work_count = state->info.vs.secondary_work_reg_count;

      cfg.stage = pan_shader_stage(&state->info);
      cfg.vertex_warp_limit = MALI_WARP_LIMIT_FULL;
      cfg.register_allocation = pan_register_allocation(work_count);
      cfg.binary = state->bin.gpu + state->info.vs.secondary_offset;
      cfg.preload.r48_r63 = (state->info.vs.secondary_preload >> 48);
      cfg.flush_to_zero_mode = panfrost_ftz_mode(&state->info);
   }
#endif
}

static void
screen_destroy(struct pipe_screen *pscreen)
{
   struct panfrost_device *dev = pan_device(pscreen);
   GENX(pan_blitter_cache_cleanup)(&dev->blitter);
}

static void
panfrost_sampler_view_destroy(struct pipe_context *pctx,
                              struct pipe_sampler_view *pview)
{
   struct panfrost_sampler_view *view = (struct panfrost_sampler_view *)pview;

   pipe_resource_reference(&pview->texture, NULL);
   panfrost_bo_unreference(view->state.bo);
   ralloc_free(view);
}

static void
context_populate_vtbl(struct pipe_context *pipe)
{
   pipe->draw_vbo = panfrost_draw_vbo;
   pipe->launch_grid = panfrost_launch_grid;

   pipe->create_vertex_elements_state = panfrost_create_vertex_elements_state;
   pipe->create_rasterizer_state = panfrost_create_rasterizer_state;
   pipe->create_depth_stencil_alpha_state = panfrost_create_depth_stencil_state;
   pipe->create_sampler_view = panfrost_create_sampler_view;
   pipe->sampler_view_destroy = panfrost_sampler_view_destroy;
   pipe->create_sampler_state = panfrost_create_sampler_state;
   pipe->create_blend_state = panfrost_create_blend_state;

   pipe->get_sample_position = u_default_get_sample_position;
}

static void
context_init(struct panfrost_context *ctx)
{
}

static void
context_cleanup(struct panfrost_context *ctx)
{
}

#if PAN_ARCH <= 5

/* Returns the polygon list's GPU address if available, or otherwise allocates
 * the polygon list. It's perfectly fast to allocate/free a BO directly here,
 * since we'll hit the BO cache and this is one-per-batch anyway. */

static mali_ptr
batch_get_polygon_list(struct panfrost_batch *batch)
{
   struct panfrost_device *dev = pan_device(batch->ctx->base.screen);

   if (!batch->tiler_ctx.midgard.polygon_list) {
      bool has_draws = batch->draw_count > 0;
      unsigned size = panfrost_tiler_get_polygon_list_size(
         batch->key.width, batch->key.height, batch->vertex_count,
         !dev->model->quirks.no_hierarchical_tiling);

      /* Create the BO as invisible if we can. If there are no draws,
       * we need to write the polygon list manually because there's
       * no WRITE_VALUE job in the chain
       */
      bool init_polygon_list = !has_draws;
      batch->polygon_list_bo = panfrost_batch_create_bo(
         batch, size, init_polygon_list ? 0 : PAN_BO_INVISIBLE,
         PIPE_SHADER_VERTEX, "Polygon list");
      batch->tiler_ctx.midgard.polygon_list = batch->polygon_list_bo->ptr.gpu;
      panfrost_batch_add_bo(batch, batch->polygon_list_bo,
                            PIPE_SHADER_FRAGMENT);

      if (init_polygon_list && dev->model->quirks.no_hierarchical_tiling) {
         assert(batch->polygon_list_bo->ptr.cpu);
         uint32_t *polygon_list_body =
            batch->polygon_list_bo->ptr.cpu +
            MALI_MIDGARD_TILER_MINIMUM_HEADER_SIZE;

         /* Magic for Mali T720 */
         polygon_list_body[0] = 0xa0000000;
      } else if (init_polygon_list) {
         assert(batch->polygon_list_bo->ptr.cpu);
         uint32_t *header = batch->polygon_list_bo->ptr.cpu;
         memset(header, 0, size);
      }

      batch->tiler_ctx.midgard.disable = !has_draws;
      batch->tiler_ctx.midgard.no_hierarchical_tiling =
         dev->model->quirks.no_hierarchical_tiling;
      batch->tiler_ctx.midgard.heap.start = dev->tiler_heap->ptr.gpu;
      batch->tiler_ctx.midgard.heap.size = panfrost_bo_size(dev->tiler_heap);
   }

   return batch->tiler_ctx.midgard.polygon_list;
}
#endif

static void
init_polygon_list(struct panfrost_batch *batch)
{
#if PAN_ARCH <= 5
   mali_ptr polygon_list = batch_get_polygon_list(batch);
   pan_jc_initialize_tiler(&batch->pool.base, &batch->jm.jobs.vtc_jc,
                           polygon_list);
#endif
}

static int
submit_batch(struct panfrost_batch *batch, struct pan_fb_info *fb)
{
   JOBX(preload_fb)(batch, fb);
   init_polygon_list(batch);

   /* Now that all draws are in, we can finally prepare the
    * FBD for the batch (if there is one). */

   emit_tls(batch);

   if (panfrost_has_fragment_job(batch)) {
      emit_fbd(batch, fb);
      emit_fragment_job(batch, fb);
   }

   return JOBX(submit_batch)(batch);
}

static void
emit_write_timestamp(struct panfrost_batch *batch,
                     struct panfrost_resource *dst, unsigned offset)
{
   batch->need_job_req_cycle_count = true;
   batch->has_time_query = true;

   JOBX(emit_write_timestamp)(batch, dst, offset);
}

void
GENX(panfrost_cmdstream_screen_init)(struct panfrost_screen *screen)
{
   struct panfrost_device *dev = &screen->dev;

   screen->vtbl.prepare_shader = prepare_shader;
   screen->vtbl.screen_destroy = screen_destroy;
   screen->vtbl.context_populate_vtbl = context_populate_vtbl;
   screen->vtbl.context_init = JOBX(init_context);
   screen->vtbl.context_cleanup = JOBX(cleanup_context);
   screen->vtbl.init_batch = JOBX(init_batch);
   screen->vtbl.cleanup_batch = JOBX(cleanup_batch);
   screen->vtbl.submit_batch = submit_batch;
   screen->vtbl.get_blend_shader = GENX(pan_blend_get_shader_locked);
   screen->vtbl.get_compiler_options = GENX(pan_shader_get_compiler_options);
   screen->vtbl.compile_shader = GENX(pan_shader_compile);
   screen->vtbl.afbc_size = panfrost_afbc_size;
   screen->vtbl.afbc_pack = panfrost_afbc_pack;
   screen->vtbl.emit_write_timestamp = emit_write_timestamp;

   GENX(pan_blitter_cache_init)
   (&dev->blitter, panfrost_device_gpu_id(dev), &dev->blend_shaders,
    &screen->blitter.bin_pool.base, &screen->blitter.desc_pool.base);

#if PAN_GPU_SUPPORTS_DISPATCH_INDIRECT
   pan_indirect_dispatch_meta_init(
      &dev->indirect_dispatch, panfrost_device_gpu_id(dev),
      &screen->blitter.bin_pool.base, &screen->blitter.desc_pool.base);
#endif
}
3995