/*
 * Copyright © 2016-2018 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "v3d_compiler.h"

/* We don't do any address packing. */
#define __gen_user_data void
#define __gen_address_type uint32_t
#define __gen_address_offset(reloc) (*reloc)
#define __gen_emit_reloc(cl, reloc)
#include "cle/v3d_packet_v42_pack.h"

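/* Writes "val" to the given magic TMU register by emitting a MOV to it. */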
static inline struct qinst *
vir_TMU_WRITE(struct v3d_compile *c, enum v3d_qpu_waddr waddr, struct qreg val)
{
        /* XXX perf: We should figure out how to merge ALU operations
         * producing the val with this MOV, when possible.
         */
        return vir_MOV_dest(c, vir_reg(QFILE_MAGIC, waddr), val);
}

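/* Like vir_TMU_WRITE, but if "tmu_writes" is non-NULL it only counts the
 * write instead of emitting it.
 */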
static inline struct qinst *
vir_TMU_WRITE_or_count(struct v3d_compile *c,
                       enum v3d_qpu_waddr waddr,
                       struct qreg val,
                       uint32_t *tmu_writes)
{
        if (tmu_writes) {
                (*tmu_writes)++;
                return NULL;
        } else {
                return vir_TMU_WRITE(c, waddr, val);
        }
}

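/* Emits a NOP with the WRTMUC signal set, sourcing the given uniform so the
 * uniform stream provides the next TMU configuration parameter.
 */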
55 
56 static void
vir_WRTMUC(struct v3d_compile * c,enum quniform_contents contents,uint32_t data)57 vir_WRTMUC(struct v3d_compile *c, enum quniform_contents contents, uint32_t data)
58 {
59         struct qinst *inst = vir_NOP(c);
60         inst->qpu.sig.wrtmuc = true;
61         inst->uniform = vir_get_uniform_index(c, contents, data);
62 }
63 
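/* Default values for the P1/P2 TMU configuration parameters, used to decide
 * whether the corresponding WRTMUC writes can be skipped.
 */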
static const struct V3D42_TMU_CONFIG_PARAMETER_1 p1_unpacked_default = {
        .per_pixel_mask_enable = true,
};

static const struct V3D42_TMU_CONFIG_PARAMETER_2 p2_unpacked_default = {
        .op = V3D_TMU_OP_REGULAR,
};

/**
 * If 'tmu_writes' is not NULL, this function only counts the required
 * register writes; otherwise, it emits the actual register writes.
 *
 * It is important to notice that emitting register writes for the current
 * TMU operation may trigger a TMU flush, since any of the inputs required
 * for the register writes may be the result of a pending TMU operation. If
 * that happens, we need to make sure the flush doesn't happen in the middle
 * of the register writes for the current TMU operation, which is why we
 * always call ntq_get_src() even if we are only interested in register
 * write counts.
 */
static void
handle_tex_src(struct v3d_compile *c,
               nir_tex_instr *instr,
               unsigned src_idx,
               unsigned non_array_components,
               struct V3D42_TMU_CONFIG_PARAMETER_2 *p2_unpacked,
               struct qreg *s_out,
               unsigned *tmu_writes)
{
        /* Either we are calling this just to count required TMU writes, or we
         * are calling this to emit the actual TMU writes.
         */
        assert(tmu_writes || (s_out && p2_unpacked));

        struct qreg s;
        switch (instr->src[src_idx].src_type) {
        case nir_tex_src_coord:
                /* S triggers the lookup, so save it for the end. */
                s = ntq_get_src(c, instr->src[src_idx].src, 0);
                if (tmu_writes)
                        (*tmu_writes)++;
                else
                        *s_out = s;

                if (non_array_components > 1) {
                        struct qreg src =
                                ntq_get_src(c, instr->src[src_idx].src, 1);
                        vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUT, src,
                                               tmu_writes);
                }

                if (non_array_components > 2) {
                        struct qreg src =
                                ntq_get_src(c, instr->src[src_idx].src, 2);
                        vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUR, src,
                                               tmu_writes);
                }

                if (instr->is_array) {
                        struct qreg src =
                                ntq_get_src(c, instr->src[src_idx].src,
                                            instr->coord_components - 1);
                        vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUI, src,
                                               tmu_writes);
                }
                break;

        case nir_tex_src_bias: {
                struct qreg src = ntq_get_src(c, instr->src[src_idx].src, 0);
                vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUB, src, tmu_writes);
                break;
        }

        case nir_tex_src_lod: {
                struct qreg src = ntq_get_src(c, instr->src[src_idx].src, 0);
                vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUB, src, tmu_writes);
                if (!tmu_writes) {
                        /* With texel fetch, automatic LOD is already disabled
                         * and disable_autolod must not be enabled. For
                         * non-cubes we can use the TMUSLOD register, which
                         * implicitly sets disable_autolod.
                         */
                        assert(p2_unpacked);
                        if (instr->op != nir_texop_txf &&
                            instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
                                p2_unpacked->disable_autolod = true;
                        }
                }
                break;
        }

        case nir_tex_src_comparator: {
                struct qreg src = ntq_get_src(c, instr->src[src_idx].src, 0);
                vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUDREF, src, tmu_writes);
                break;
        }

        case nir_tex_src_offset: {
                bool is_const_offset = nir_src_is_const(instr->src[src_idx].src);
                if (is_const_offset) {
                        if (!tmu_writes) {
                                p2_unpacked->offset_s =
                                        nir_src_comp_as_int(instr->src[src_idx].src, 0);
                                if (non_array_components >= 2)
                                        p2_unpacked->offset_t =
                                                nir_src_comp_as_int(instr->src[src_idx].src, 1);
                                if (non_array_components >= 3)
                                        p2_unpacked->offset_r =
                                                nir_src_comp_as_int(instr->src[src_idx].src, 2);
                        }
                } else {
                        struct qreg src_0 =
                                ntq_get_src(c, instr->src[src_idx].src, 0);
                        struct qreg src_1 =
                                ntq_get_src(c, instr->src[src_idx].src, 1);
                        if (!tmu_writes) {
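                                /* Pack the s and t offsets as 4-bit fields
                                 * into a single TMUOFF write: s in bits 0..3,
                                 * t in bits 4..7.
                                 */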
                                struct qreg mask = vir_uniform_ui(c, 0xf);
                                struct qreg x, y, offset;

                                x = vir_AND(c, src_0, mask);
                                y = vir_AND(c, src_1, mask);
                                offset = vir_OR(c, x,
                                                vir_SHL(c, y, vir_uniform_ui(c, 4)));

                                vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUOFF, offset);
                        } else {
                                (*tmu_writes)++;
                        }
                }
                break;
        }

        default:
                unreachable("unknown texture source");
        }
}

static void
vir_tex_handle_srcs(struct v3d_compile *c,
                    nir_tex_instr *instr,
                    struct V3D42_TMU_CONFIG_PARAMETER_2 *p2_unpacked,
                    struct qreg *s,
                    unsigned *tmu_writes)
{
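        /* textureQueryLod() takes no array index, so for lod queries all
         * coordinate components are non-array components.
         */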
        unsigned non_array_components = instr->op != nir_texop_lod ?
                instr->coord_components - instr->is_array :
                instr->coord_components;

        for (unsigned i = 0; i < instr->num_srcs; i++) {
                handle_tex_src(c, instr, i, non_array_components,
                               p2_unpacked, s, tmu_writes);
        }
}

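/* Dry-run pass over the texture sources that only counts how many TMU
 * register writes the instruction will need, without emitting anything.
 */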
static unsigned
get_required_tex_tmu_writes(struct v3d_compile *c, nir_tex_instr *instr)
{
        unsigned tmu_writes = 0;
        vir_tex_handle_srcs(c, instr, NULL, NULL, &tmu_writes);
        return tmu_writes;
}

void
v3d_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
{
        unsigned texture_idx = instr->texture_index;

        /* For instructions that don't have a sampler (i.e. txf) we bind
         * default sampler state via the backend_flags to handle precision.
         */
        unsigned sampler_idx = nir_tex_instr_need_sampler(instr) ?
                               instr->sampler_index : instr->backend_flags;

        /* Even if the texture operation doesn't need a sampler by itself,
         * we still need to add the sampler configuration parameter if the
         * output is 32 bit.
         */
        assert(sampler_idx < c->key->num_samplers_used);
        bool output_type_32_bit =
                c->key->sampler[sampler_idx].return_size == 32;

        struct V3D42_TMU_CONFIG_PARAMETER_0 p0_unpacked = {
        };

        /* Limit the number of channels returned to those actually read from
         * the NIR destination, and to how many the output type can produce
         * (2 for f16, 4 for 32-bit).
         */
        uint32_t components_read = nir_def_components_read(&instr->def);
        p0_unpacked.return_words_of_texture_data = output_type_32_bit ?
                (components_read & 0xf) : (components_read & 0x3);
        assert(p0_unpacked.return_words_of_texture_data != 0);

        struct V3D42_TMU_CONFIG_PARAMETER_2 p2_unpacked = {
                .op = V3D_TMU_OP_REGULAR,
                .gather_mode = instr->op == nir_texop_tg4,
                .gather_component = instr->component,
                .coefficient_mode = instr->op == nir_texop_txd,
                .disable_autolod = instr->op == nir_texop_tg4,
                .lod_query = instr->op == nir_texop_lod,
        };

        const unsigned tmu_writes = get_required_tex_tmu_writes(c, instr);

        /* The input FIFO has 16 slots across all threads so if we require
         * more than that we need to lower thread count.
         */
        while (tmu_writes > 16 / c->threads)
                c->threads /= 2;

        /* If pipelining this TMU operation would overflow TMU fifos, we need
         * to flush any outstanding TMU operations.
         */
        const unsigned dest_components =
                util_bitcount(p0_unpacked.return_words_of_texture_data);
        if (ntq_tmu_fifo_overflow(c, dest_components))
                ntq_flush_tmu(c);

        /* Process the tex sources, emitting the corresponding TMU writes. */
        struct qreg s = { };
        vir_tex_handle_srcs(c, instr, &p2_unpacked, &s, NULL);

        uint32_t p0_packed;
        V3D42_TMU_CONFIG_PARAMETER_0_pack(NULL,
                                          (uint8_t *)&p0_packed,
                                          &p0_unpacked);

        uint32_t p2_packed;
        V3D42_TMU_CONFIG_PARAMETER_2_pack(NULL,
                                          (uint8_t *)&p2_packed,
                                          &p2_unpacked);

        /* Load the texture_idx number into the high bits of the texture
         * address field, which will be used by the driver to decide which
         * texture to put in the actual address field.
         */
        p0_packed |= texture_idx << 24;

        vir_WRTMUC(c, QUNIFORM_TMU_CONFIG_P0, p0_packed);

        /* p1 is optional, but we can skip it only if p2 can be skipped too. */
        bool needs_p2_config =
                (instr->op == nir_texop_lod ||
                 memcmp(&p2_unpacked, &p2_unpacked_default,
                        sizeof(p2_unpacked)) != 0);

        /* To handle the cases where we can't just use p1_unpacked_default. */
        bool non_default_p1_config = nir_tex_instr_need_sampler(instr) ||
                output_type_32_bit;

        if (non_default_p1_config) {
                struct V3D42_TMU_CONFIG_PARAMETER_1 p1_unpacked = {
                        .output_type_32_bit = output_type_32_bit,

                        .unnormalized_coordinates = (instr->sampler_dim ==
                                                     GLSL_SAMPLER_DIM_RECT),
                };

                /* Word enables can't ask for more channels than the
                 * output type could provide (2 for f16, 4 for
                 * 32-bit).
                 */
                assert(!p1_unpacked.output_type_32_bit ||
                       p0_unpacked.return_words_of_texture_data < (1 << 4));
                assert(p1_unpacked.output_type_32_bit ||
                       p0_unpacked.return_words_of_texture_data < (1 << 2));

                uint32_t p1_packed;
                V3D42_TMU_CONFIG_PARAMETER_1_pack(NULL,
                                                  (uint8_t *)&p1_packed,
                                                  &p1_unpacked);

                if (nir_tex_instr_need_sampler(instr)) {
                        /* Load the sampler_idx number into the high bits of
                         * the sampler address field, which will be used by
                         * the driver to decide which sampler to put in the
                         * actual address field.
                         */
                        p1_packed |= sampler_idx << 24;

                        vir_WRTMUC(c, QUNIFORM_TMU_CONFIG_P1, p1_packed);
                } else {
                        /* In this case, we don't need to merge in any
                         * sampler state from the API and can just use
                         * our packed bits.
                         */
                        vir_WRTMUC(c, QUNIFORM_CONSTANT, p1_packed);
                }
        } else if (needs_p2_config) {
                /* Configuration parameters need to be set up in order, and
                 * if P2 is needed, you need to set up P1 too even if sampler
                 * info is not needed by the texture operation. But we can
                 * set up default info, and avoid asking the driver for the
                 * sampler state address.
                 */
                uint32_t p1_packed_default;
                V3D42_TMU_CONFIG_PARAMETER_1_pack(NULL,
                                                  (uint8_t *)&p1_packed_default,
                                                  &p1_unpacked_default);
                vir_WRTMUC(c, QUNIFORM_CONSTANT, p1_packed_default);
        }

        if (needs_p2_config)
                vir_WRTMUC(c, QUNIFORM_CONSTANT, p2_packed);

        /* Emit retiring TMU write */
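        /* The write to the S coordinate register is what triggers the
         * lookup; the specific register selects the operation: TMUSF for
         * texel fetch, TMUSCM for cube maps and TMUSLOD for explicit-LOD
         * lookups.
         */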
        struct qinst *retiring;
        if (instr->op == nir_texop_txf) {
                assert(instr->sampler_dim != GLSL_SAMPLER_DIM_CUBE);
                retiring = vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSF, s);
        } else if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
                retiring = vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSCM, s);
        } else if (instr->op == nir_texop_txl) {
                retiring = vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSLOD, s);
        } else {
                retiring = vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUS, s);
        }

        retiring->ldtmu_count = p0_unpacked.return_words_of_texture_data;
        ntq_add_pending_tmu_flush(c, &instr->def,
                                  p0_unpacked.return_words_of_texture_data);
}

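/* Translate a NIR atomic op into the TMU write operation to use. atomic_add
 * is special-cased because it may be implemented as an increment or
 * decrement (see v3d_get_op_for_atomic_add()).
 */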
static uint32_t
v3d_image_atomic_tmu_op(nir_intrinsic_instr *instr)
{
        nir_atomic_op atomic_op = nir_intrinsic_atomic_op(instr);
        switch (atomic_op) {
        case nir_atomic_op_iadd:    return v3d_get_op_for_atomic_add(instr, 3);
        case nir_atomic_op_imin:    return V3D_TMU_OP_WRITE_SMIN;
        case nir_atomic_op_umin:    return V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR;
        case nir_atomic_op_imax:    return V3D_TMU_OP_WRITE_SMAX;
        case nir_atomic_op_umax:    return V3D_TMU_OP_WRITE_UMAX;
        case nir_atomic_op_iand:    return V3D_TMU_OP_WRITE_AND_READ_INC;
        case nir_atomic_op_ior:     return V3D_TMU_OP_WRITE_OR_READ_DEC;
        case nir_atomic_op_ixor:    return V3D_TMU_OP_WRITE_XOR_READ_NOT;
        case nir_atomic_op_xchg:    return V3D_TMU_OP_WRITE_XCHG_READ_FLUSH;
        case nir_atomic_op_cmpxchg: return V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH;
        default:                    unreachable("unknown atomic op");
        }
}

static uint32_t
v3d_image_load_store_tmu_op(nir_intrinsic_instr *instr)
{
        switch (instr->intrinsic) {
        case nir_intrinsic_image_load:
        case nir_intrinsic_image_store:
                return V3D_TMU_OP_REGULAR;

        case nir_intrinsic_image_atomic:
        case nir_intrinsic_image_atomic_swap:
                return v3d_image_atomic_tmu_op(instr);

        default:
                unreachable("unknown image intrinsic");
        }
}

/**
 * If 'tmu_writes' is not NULL, this function only counts the required
 * register writes; otherwise, it emits the actual register writes.
 *
 * It is important to notice that emitting register writes for the current
 * TMU operation may trigger a TMU flush, since any of the inputs required
 * for the register writes may be the result of a pending TMU operation. If
 * that happens, we need to make sure the flush doesn't happen in the middle
 * of the register writes for the current TMU operation, which is why we
 * always call ntq_get_src() even if we are only interested in register
 * write counts.
 */
static struct qinst *
vir_image_emit_register_writes(struct v3d_compile *c,
                               nir_intrinsic_instr *instr,
                               bool atomic_add_replaced,
                               uint32_t *tmu_writes)
{
        if (tmu_writes)
                *tmu_writes = 0;

        bool is_1d = false;
        switch (nir_intrinsic_image_dim(instr)) {
        case GLSL_SAMPLER_DIM_1D:
                is_1d = true;
                break;
        case GLSL_SAMPLER_DIM_BUF:
                break;
        case GLSL_SAMPLER_DIM_2D:
        case GLSL_SAMPLER_DIM_RECT:
        case GLSL_SAMPLER_DIM_CUBE: {
                struct qreg src = ntq_get_src(c, instr->src[1], 1);
                vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUT, src, tmu_writes);
                break;
        }
        case GLSL_SAMPLER_DIM_3D: {
                struct qreg src_1_1 = ntq_get_src(c, instr->src[1], 1);
                struct qreg src_1_2 = ntq_get_src(c, instr->src[1], 2);
                vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUT, src_1_1, tmu_writes);
                vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUR, src_1_2, tmu_writes);
                break;
        }
        default:
                unreachable("bad image sampler dim");
        }

        /* In order to fetch on a cube map, we need to interpret it as a 2D
         * array, where the third coordinate is the face index.
         */
        if (nir_intrinsic_image_dim(instr) == GLSL_SAMPLER_DIM_CUBE ||
            nir_intrinsic_image_array(instr)) {
                struct qreg src = ntq_get_src(c, instr->src[1], is_1d ? 1 : 2);
                vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUI, src, tmu_writes);
        }

        /* Emit the data writes for atomics or image store. */
        if (instr->intrinsic != nir_intrinsic_image_load &&
            !atomic_add_replaced) {
                for (int i = 0; i < nir_intrinsic_src_components(instr, 3); i++) {
                        struct qreg src_3_i = ntq_get_src(c, instr->src[3], i);
                        vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUD, src_3_i,
                                               tmu_writes);
                }

                /* Second atomic argument */
                if (instr->intrinsic == nir_intrinsic_image_atomic_swap &&
                    nir_intrinsic_atomic_op(instr) == nir_atomic_op_cmpxchg) {
                        struct qreg src_4_0 = ntq_get_src(c, instr->src[4], 0);
                        vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUD, src_4_0,
                                               tmu_writes);
                }
        }

        struct qreg src_1_0 = ntq_get_src(c, instr->src[1], 0);
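        /* For stores and atomics inside non-uniform control flow, predicate
         * the retiring TMUSF write on the execute mask so that inactive
         * channels don't write memory.
         */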
        if (!tmu_writes && vir_in_nonuniform_control_flow(c) &&
            instr->intrinsic != nir_intrinsic_image_load) {
                vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
                           V3D_QPU_PF_PUSHZ);
        }

        struct qinst *retiring =
                vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUSF, src_1_0, tmu_writes);

        if (!tmu_writes && vir_in_nonuniform_control_flow(c) &&
            instr->intrinsic != nir_intrinsic_image_load) {
                struct qinst *last_inst =
                        (struct qinst *)c->cur_block->instructions.prev;
                vir_set_cond(last_inst, V3D_QPU_COND_IFA);
        }

        return retiring;
}

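/* Dry run of vir_image_emit_register_writes() that only counts the TMU
 * register writes required by the instruction.
 */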
static unsigned
get_required_image_tmu_writes(struct v3d_compile *c,
                              nir_intrinsic_instr *instr,
                              bool atomic_add_replaced)
{
        unsigned tmu_writes;
        vir_image_emit_register_writes(c, instr, atomic_add_replaced,
                                       &tmu_writes);
        return tmu_writes;
}

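/* Returns how many result words the TMU needs to return for this image
 * intrinsic.
 */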
static uint32_t
return_channels_required(nir_intrinsic_instr *instr, bool is_32bit)
{
        if (nir_intrinsic_dest_components(instr) == 0)
                return 0;

        /* V3D requires that atomic operations always return data even if the
         * shader doesn't use it.
         */
        if (instr->intrinsic == nir_intrinsic_image_atomic ||
            instr->intrinsic == nir_intrinsic_image_atomic_swap) {
                return 1;
        }

        /* Otherwise limit the number of words to read based on the components
         * actually used by the shader, limited to the maximum allowed based
         * on the output size.
         */
        nir_component_mask_t read_mask = nir_def_components_read(&instr->def);
        read_mask &= is_32bit ? 0xf : 0x3;
        assert(read_mask);

        if (read_mask & 0x8)
                return 4;
        if (read_mask & 0x4)
                return 3;
        if (read_mask & 0x2)
                return 2;
        else
                return 1;
}

void
v3d_vir_emit_image_load_store(struct v3d_compile *c,
                              nir_intrinsic_instr *instr)
{
        unsigned format = nir_intrinsic_format(instr);
        unsigned unit = nir_src_as_uint(instr->src[0]);

        struct V3D42_TMU_CONFIG_PARAMETER_0 p0_unpacked = {
        };

        struct V3D42_TMU_CONFIG_PARAMETER_1 p1_unpacked = {
                .per_pixel_mask_enable = true,
                .output_type_32_bit = v3d_gl_format_is_return_32(format),
        };

        struct V3D42_TMU_CONFIG_PARAMETER_2 p2_unpacked = { 0 };

        /* Limit the number of channels to those that are actually used. */
        uint32_t return_channels =
                return_channels_required(instr, p1_unpacked.output_type_32_bit);
        assert(return_channels <= nir_intrinsic_dest_components(instr));
        p0_unpacked.return_words_of_texture_data =
                (1 << return_channels) - 1;

        p2_unpacked.op = v3d_image_load_store_tmu_op(instr);

        /* If we were able to replace an atomic_add with an inc/dec, then we
         * need to do things slightly differently, like not loading the
         * amount to add/sub, as that is implicit.
         */
        bool atomic_add_replaced =
                instr->intrinsic == nir_intrinsic_image_atomic &&
                nir_intrinsic_atomic_op(instr) == nir_atomic_op_iadd &&
                (p2_unpacked.op == V3D_TMU_OP_WRITE_AND_READ_INC ||
                 p2_unpacked.op == V3D_TMU_OP_WRITE_OR_READ_DEC);

        uint32_t p0_packed;
        V3D42_TMU_CONFIG_PARAMETER_0_pack(NULL,
                                          (uint8_t *)&p0_packed,
                                          &p0_unpacked);

        /* Load the unit number into the high bits of the texture or sampler
         * address field, which will be used by the driver to decide which
         * texture to put in the actual address field.
         */
        p0_packed |= unit << 24;

        uint32_t p1_packed;
        V3D42_TMU_CONFIG_PARAMETER_1_pack(NULL,
                                          (uint8_t *)&p1_packed,
                                          &p1_unpacked);

        uint32_t p2_packed;
        V3D42_TMU_CONFIG_PARAMETER_2_pack(NULL,
                                          (uint8_t *)&p2_packed,
                                          &p2_unpacked);

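        /* Image stores and atomics write memory through the TMU, so flag the
         * job as having TMU memory writes; the driver uses this to emit the
         * required cache flushing.
         */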
        if (instr->intrinsic != nir_intrinsic_image_load)
                c->tmu_dirty_rcl = true;

        const uint32_t tmu_writes =
                get_required_image_tmu_writes(c, instr, atomic_add_replaced);

        /* The input FIFO has 16 slots across all threads so if we require
         * more than that we need to lower thread count.
         */
        while (tmu_writes > 16 / c->threads)
                c->threads /= 2;

        /* If pipelining this TMU operation would overflow TMU fifos, we need
         * to flush any outstanding TMU operations.
         */
        if (ntq_tmu_fifo_overflow(c, return_channels))
                ntq_flush_tmu(c);

        vir_WRTMUC(c, QUNIFORM_IMAGE_TMU_CONFIG_P0, p0_packed);
        if (memcmp(&p1_unpacked, &p1_unpacked_default, sizeof(p1_unpacked)))
                vir_WRTMUC(c, QUNIFORM_CONSTANT, p1_packed);
        if (memcmp(&p2_unpacked, &p2_unpacked_default, sizeof(p2_unpacked)))
                vir_WRTMUC(c, QUNIFORM_CONSTANT, p2_packed);

        struct qinst *retiring =
                vir_image_emit_register_writes(c, instr, atomic_add_replaced, NULL);
        retiring->ldtmu_count = p0_unpacked.return_words_of_texture_data;
        ntq_add_pending_tmu_flush(c, &instr->def,
                                  p0_unpacked.return_words_of_texture_data);
}