/*
 * Copyright © 2016-2018 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "v3d_compiler.h"

/* We don't do any address packing. */
#define __gen_user_data void
#define __gen_address_type uint32_t
#define __gen_address_offset(reloc) (*reloc)
#define __gen_emit_reloc(cl, reloc)
#include "cle/v3d_packet_v42_pack.h"

static inline struct qinst *
vir_TMU_WRITE(struct v3d_compile *c, enum v3d_qpu_waddr waddr, struct qreg val)
{
        /* XXX perf: We should figure out how to merge ALU operations
         * producing the val with this MOV, when possible.
         */
        return vir_MOV_dest(c, vir_reg(QFILE_MAGIC, waddr), val);
}

static inline struct qinst *
vir_TMU_WRITE_or_count(struct v3d_compile *c,
                       enum v3d_qpu_waddr waddr,
                       struct qreg val,
                       uint32_t *tmu_writes)
{
        if (tmu_writes) {
                (*tmu_writes)++;
                return NULL;
        } else {
                return vir_TMU_WRITE(c, waddr, val);
        }
}

static void
vir_WRTMUC(struct v3d_compile *c, enum quniform_contents contents, uint32_t data)
{
        struct qinst *inst = vir_NOP(c);
        inst->qpu.sig.wrtmuc = true;
        inst->uniform = vir_get_uniform_index(c, contents, data);
}

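/* Default values for the P1/P2 TMU configuration parameters. They are
 * compared against (and packed) below so that the corresponding WRTMUC
 * writes can be skipped, or emitted as constants without asking the driver
 * for sampler state, whenever the defaults are enough.
 */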
static const struct V3D42_TMU_CONFIG_PARAMETER_1 p1_unpacked_default = {
        .per_pixel_mask_enable = true,
};

static const struct V3D42_TMU_CONFIG_PARAMETER_2 p2_unpacked_default = {
        .op = V3D_TMU_OP_REGULAR,
};

/**
 * If 'tmu_writes' is not NULL, this only counts the required register
 * writes; otherwise it emits the actual register writes.
 *
 * Note that emitting the register writes for the current TMU operation may
 * trigger a TMU flush, since any of the inputs required for the register
 * writes may be the result of a pending TMU operation. If that happens we
 * need to make sure it doesn't happen in the middle of the TMU register
 * writes for the current TMU operation, which is why we always call
 * ntq_get_src() even if we are only interested in register write counts.
 */
static void
handle_tex_src(struct v3d_compile *c,
               nir_tex_instr *instr,
               unsigned src_idx,
               unsigned non_array_components,
               struct V3D42_TMU_CONFIG_PARAMETER_2 *p2_unpacked,
               struct qreg *s_out,
               unsigned *tmu_writes)
{
        /* Either we are calling this just to count required TMU writes, or we
         * are calling this to emit the actual TMU writes.
         */
        assert(tmu_writes || (s_out && p2_unpacked));

        struct qreg s;
        switch (instr->src[src_idx].src_type) {
        case nir_tex_src_coord:
                /* S triggers the lookup, so save it for the end. */
                s = ntq_get_src(c, instr->src[src_idx].src, 0);
                if (tmu_writes)
                        (*tmu_writes)++;
                else
                        *s_out = s;

                if (non_array_components > 1) {
                        struct qreg src =
                                ntq_get_src(c, instr->src[src_idx].src, 1);
                        vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUT, src,
                                               tmu_writes);
                }

                if (non_array_components > 2) {
                        struct qreg src =
                                ntq_get_src(c, instr->src[src_idx].src, 2);
                        vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUR, src,
                                               tmu_writes);
                }

                if (instr->is_array) {
                        struct qreg src =
                                ntq_get_src(c, instr->src[src_idx].src,
                                            instr->coord_components - 1);
                        vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUI, src,
                                               tmu_writes);
                }
                break;

        case nir_tex_src_bias: {
                struct qreg src = ntq_get_src(c, instr->src[src_idx].src, 0);
                vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUB, src, tmu_writes);
                break;
        }

        case nir_tex_src_lod: {
                struct qreg src = ntq_get_src(c, instr->src[src_idx].src, 0);
                vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUB, src, tmu_writes);
                if (!tmu_writes) {
                        /* With texel fetch automatic LOD is already disabled,
                         * and disable_autolod must not be enabled. For
                         * non-cubes we can use the register TMUSLOD, which
                         * implicitly sets disable_autolod.
                         */
                        assert(p2_unpacked);
                        if (instr->op != nir_texop_txf &&
                            instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
                                p2_unpacked->disable_autolod = true;
                        }
                }
                break;
        }

        case nir_tex_src_comparator: {
                struct qreg src = ntq_get_src(c, instr->src[src_idx].src, 0);
                vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUDREF, src, tmu_writes);
                break;
        }

        case nir_tex_src_offset: {
                bool is_const_offset = nir_src_is_const(instr->src[src_idx].src);
                if (is_const_offset) {
                        if (!tmu_writes) {
                                p2_unpacked->offset_s =
                                        nir_src_comp_as_int(instr->src[src_idx].src, 0);
                                if (non_array_components >= 2)
                                        p2_unpacked->offset_t =
                                                nir_src_comp_as_int(instr->src[src_idx].src, 1);
                                if (non_array_components >= 3)
                                        p2_unpacked->offset_r =
                                                nir_src_comp_as_int(instr->src[src_idx].src, 2);
                        }
                } else {
                        struct qreg src_0 =
                                ntq_get_src(c, instr->src[src_idx].src, 0);
                        struct qreg src_1 =
                                ntq_get_src(c, instr->src[src_idx].src, 1);
                        if (!tmu_writes) {
                                struct qreg mask = vir_uniform_ui(c, 0xf);
                                struct qreg x, y, offset;

                                x = vir_AND(c, src_0, mask);
                                y = vir_AND(c, src_1, mask);
                                offset = vir_OR(c, x,
                                                vir_SHL(c, y, vir_uniform_ui(c, 4)));

                                vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUOFF, offset);
                        } else {
                                (*tmu_writes)++;
                        }
                }
                break;
        }

        default:
                unreachable("unknown texture source");
        }
}

static void
vir_tex_handle_srcs(struct v3d_compile *c,
                    nir_tex_instr *instr,
                    struct V3D42_TMU_CONFIG_PARAMETER_2 *p2_unpacked,
                    struct qreg *s,
                    unsigned *tmu_writes)
{
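        /* textureQueryLod doesn't take the array index as part of its
         * coordinate, so for lod queries every coordinate component is a
         * non-array component.
         */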
        unsigned non_array_components = instr->op != nir_texop_lod ?
                instr->coord_components - instr->is_array :
                instr->coord_components;

        for (unsigned i = 0; i < instr->num_srcs; i++) {
                handle_tex_src(c, instr, i, non_array_components,
                               p2_unpacked, s, tmu_writes);
        }
}

static unsigned
get_required_tex_tmu_writes(struct v3d_compile *c, nir_tex_instr *instr)
{
        unsigned tmu_writes = 0;
        vir_tex_handle_srcs(c, instr, NULL, NULL, &tmu_writes);
        return tmu_writes;
}

void
v3d_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
{
        unsigned texture_idx = instr->texture_index;

        /* For instructions that don't have a sampler (e.g. txf) we bind
         * default sampler state via the backend_flags to handle precision.
         */
        unsigned sampler_idx = nir_tex_instr_need_sampler(instr) ?
                instr->sampler_index : instr->backend_flags;

        /* Even if the texture operation doesn't need a sampler by itself,
         * we still need to add the sampler configuration parameter if the
         * output is 32 bit.
         */
        assert(sampler_idx < c->key->num_samplers_used);
        bool output_type_32_bit =
                c->key->sampler[sampler_idx].return_size == 32;

        struct V3D42_TMU_CONFIG_PARAMETER_0 p0_unpacked = {
        };

        /* Limit the number of channels returned to both how many components
         * of the NIR destination are actually read and how many channels the
         * return type can produce (4 for 32-bit, 2 for f16).
         */
        uint32_t components_read = nir_def_components_read(&instr->def);
        p0_unpacked.return_words_of_texture_data = output_type_32_bit ?
                (components_read & 0xf) : (components_read & 0x3);
        assert(p0_unpacked.return_words_of_texture_data != 0);

        struct V3D42_TMU_CONFIG_PARAMETER_2 p2_unpacked = {
                .op = V3D_TMU_OP_REGULAR,
                .gather_mode = instr->op == nir_texop_tg4,
                .gather_component = instr->component,
                .coefficient_mode = instr->op == nir_texop_txd,
                .disable_autolod = instr->op == nir_texop_tg4,
                .lod_query = instr->op == nir_texop_lod,
        };

        const unsigned tmu_writes = get_required_tex_tmu_writes(c, instr);

        /* The input FIFO has 16 slots across all threads, so if we require
         * more than that we need to lower the thread count.
         */
        while (tmu_writes > 16 / c->threads)
                c->threads /= 2;

        /* If pipelining this TMU operation would overflow the TMU FIFOs, we
         * need to flush any outstanding TMU operations first.
         */
        const unsigned dest_components =
                util_bitcount(p0_unpacked.return_words_of_texture_data);
        if (ntq_tmu_fifo_overflow(c, dest_components))
                ntq_flush_tmu(c);

        /* Process tex sources emitting corresponding TMU writes */
        struct qreg s = { };
        vir_tex_handle_srcs(c, instr, &p2_unpacked, &s, NULL);

        uint32_t p0_packed;
        V3D42_TMU_CONFIG_PARAMETER_0_pack(NULL,
                                          (uint8_t *)&p0_packed,
                                          &p0_unpacked);

        uint32_t p2_packed;
        V3D42_TMU_CONFIG_PARAMETER_2_pack(NULL,
                                          (uint8_t *)&p2_packed,
                                          &p2_unpacked);

        /* Load the texture_idx number into the high bits of the texture
         * address field, which will be used by the driver to decide which
         * texture to put in the actual address field.
         */
        p0_packed |= texture_idx << 24;

        vir_WRTMUC(c, QUNIFORM_TMU_CONFIG_P0, p0_packed);

        /* p1 is optional, but we can skip it only if p2 can be skipped too */
        bool needs_p2_config =
                (instr->op == nir_texop_lod ||
                 memcmp(&p2_unpacked, &p2_unpacked_default,
                        sizeof(p2_unpacked)) != 0);

        /* To handle the cases where we can't just use p1_unpacked_default */
        bool non_default_p1_config = nir_tex_instr_need_sampler(instr) ||
                output_type_32_bit;

        if (non_default_p1_config) {
                struct V3D42_TMU_CONFIG_PARAMETER_1 p1_unpacked = {
                        .output_type_32_bit = output_type_32_bit,

                        .unnormalized_coordinates = (instr->sampler_dim ==
                                                     GLSL_SAMPLER_DIM_RECT),
                };

                /* Word enables can't ask for more channels than the
                 * output type could provide (2 for f16, 4 for
                 * 32-bit).
                 */
                assert(!p1_unpacked.output_type_32_bit ||
                       p0_unpacked.return_words_of_texture_data < (1 << 4));
                assert(p1_unpacked.output_type_32_bit ||
                       p0_unpacked.return_words_of_texture_data < (1 << 2));

                uint32_t p1_packed;
                V3D42_TMU_CONFIG_PARAMETER_1_pack(NULL,
                                                  (uint8_t *)&p1_packed,
                                                  &p1_unpacked);

                if (nir_tex_instr_need_sampler(instr)) {
                        /* Load the sampler_idx number into the high bits of
                         * the sampler address field, which will be used by
                         * the driver to decide which sampler to put in the
                         * actual address field.
                         */
                        p1_packed |= sampler_idx << 24;

                        vir_WRTMUC(c, QUNIFORM_TMU_CONFIG_P1, p1_packed);
                } else {
                        /* In this case, we don't need to merge in any
                         * sampler state from the API and can just use
                         * our packed bits.
                         */
                        vir_WRTMUC(c, QUNIFORM_CONSTANT, p1_packed);
                }
        } else if (needs_p2_config) {
                /* Configuration parameters need to be set up in order, and
                 * if P2 is needed, you need to set up P1 too even if sampler
                 * info is not needed by the texture operation. But we can
                 * set up default info, and avoid asking the driver for the
                 * sampler state address.
                 */
                uint32_t p1_packed_default;
                V3D42_TMU_CONFIG_PARAMETER_1_pack(NULL,
                                                  (uint8_t *)&p1_packed_default,
                                                  &p1_unpacked_default);
                vir_WRTMUC(c, QUNIFORM_CONSTANT, p1_packed_default);
        }

        if (needs_p2_config)
                vir_WRTMUC(c, QUNIFORM_CONSTANT, p2_packed);

        /* Emit the retiring TMU write: writing the S coordinate triggers the
         * lookup, and we pick the S register variant that matches the
         * operation (texel fetch, cube map, explicit LOD or regular).
         */
        struct qinst *retiring;
        if (instr->op == nir_texop_txf) {
                assert(instr->sampler_dim != GLSL_SAMPLER_DIM_CUBE);
                retiring = vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSF, s);
        } else if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
                retiring = vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSCM, s);
        } else if (instr->op == nir_texop_txl) {
                retiring = vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSLOD, s);
        } else {
                retiring = vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUS, s);
        }

        retiring->ldtmu_count = p0_unpacked.return_words_of_texture_data;
        ntq_add_pending_tmu_flush(c, &instr->def,
                                  p0_unpacked.return_words_of_texture_data);
}

static uint32_t
v3d_image_atomic_tmu_op(nir_intrinsic_instr *instr)
{
        nir_atomic_op atomic_op = nir_intrinsic_atomic_op(instr);
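        /* The TMU op encodings pair a write-style and a read-style operation
         * (e.g. WRITE_AND_READ_INC): the write-style op applies when data is
         * supplied through TMUD, the read-style one when it is not, which is
         * how the replaced atomic add/sub of +/-1 are handled below.
         */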
        switch (atomic_op) {
        case nir_atomic_op_iadd: return v3d_get_op_for_atomic_add(instr, 3);
        case nir_atomic_op_imin: return V3D_TMU_OP_WRITE_SMIN;
        case nir_atomic_op_umin: return V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR;
        case nir_atomic_op_imax: return V3D_TMU_OP_WRITE_SMAX;
        case nir_atomic_op_umax: return V3D_TMU_OP_WRITE_UMAX;
        case nir_atomic_op_iand: return V3D_TMU_OP_WRITE_AND_READ_INC;
        case nir_atomic_op_ior: return V3D_TMU_OP_WRITE_OR_READ_DEC;
        case nir_atomic_op_ixor: return V3D_TMU_OP_WRITE_XOR_READ_NOT;
        case nir_atomic_op_xchg: return V3D_TMU_OP_WRITE_XCHG_READ_FLUSH;
        case nir_atomic_op_cmpxchg: return V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH;
        default: unreachable("unknown atomic op");
        }
}

static uint32_t
v3d_image_load_store_tmu_op(nir_intrinsic_instr *instr)
{
        switch (instr->intrinsic) {
        case nir_intrinsic_image_load:
        case nir_intrinsic_image_store:
                return V3D_TMU_OP_REGULAR;

        case nir_intrinsic_image_atomic:
        case nir_intrinsic_image_atomic_swap:
                return v3d_image_atomic_tmu_op(instr);

        default:
                unreachable("unknown image intrinsic");
        }
}

/**
 * If 'tmu_writes' is not NULL, this only counts the required register
 * writes; otherwise it emits the actual register writes.
 *
 * Note that emitting the register writes for the current TMU operation may
 * trigger a TMU flush, since any of the inputs required for the register
 * writes may be the result of a pending TMU operation. If that happens we
 * need to make sure it doesn't happen in the middle of the TMU register
 * writes for the current TMU operation, which is why we always call
 * ntq_get_src() even if we are only interested in register write counts.
 */
static struct qinst *
vir_image_emit_register_writes(struct v3d_compile *c,
                               nir_intrinsic_instr *instr,
                               bool atomic_add_replaced,
                               uint32_t *tmu_writes)
{
        if (tmu_writes)
                *tmu_writes = 0;

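        /* The S coordinate (component 0 of the coordinate source) triggers
         * the lookup and is written last, so here we only emit the T/R
         * writes required by each image dimensionality.
         */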
        bool is_1d = false;
        switch (nir_intrinsic_image_dim(instr)) {
        case GLSL_SAMPLER_DIM_1D:
                is_1d = true;
                break;
        case GLSL_SAMPLER_DIM_BUF:
                break;
        case GLSL_SAMPLER_DIM_2D:
        case GLSL_SAMPLER_DIM_RECT:
        case GLSL_SAMPLER_DIM_CUBE: {
                struct qreg src = ntq_get_src(c, instr->src[1], 1);
                vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUT, src, tmu_writes);
                break;
        }
        case GLSL_SAMPLER_DIM_3D: {
                struct qreg src_1_1 = ntq_get_src(c, instr->src[1], 1);
                struct qreg src_1_2 = ntq_get_src(c, instr->src[1], 2);
                vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUT, src_1_1, tmu_writes);
                vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUR, src_1_2, tmu_writes);
                break;
        }
        default:
                unreachable("bad image sampler dim");
        }

        /* In order to fetch on a cube map we need to interpret it as a 2D
         * array, where the third coordinate is the face index.
         */
        if (nir_intrinsic_image_dim(instr) == GLSL_SAMPLER_DIM_CUBE ||
            nir_intrinsic_image_array(instr)) {
                struct qreg src = ntq_get_src(c, instr->src[1], is_1d ? 1 : 2);
                vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUI, src, tmu_writes);
        }

        /* Emit the data writes for atomics or image store. */
        if (instr->intrinsic != nir_intrinsic_image_load &&
            !atomic_add_replaced) {
                for (int i = 0; i < nir_intrinsic_src_components(instr, 3); i++) {
                        struct qreg src_3_i = ntq_get_src(c, instr->src[3], i);
                        vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUD, src_3_i,
                                               tmu_writes);
                }

                /* Second atomic argument */
                if (instr->intrinsic == nir_intrinsic_image_atomic_swap &&
                    nir_intrinsic_atomic_op(instr) == nir_atomic_op_cmpxchg) {
                        struct qreg src_4_0 = ntq_get_src(c, instr->src[4], 0);
                        vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUD, src_4_0,
                                               tmu_writes);
                }
        }

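        /* In non-uniform control flow, TMU sequences with side effects
         * (stores and atomics) must be predicated on the execution mask so
         * that inactive channels don't perform the access; plain image loads
         * don't need this.
         */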
        struct qreg src_1_0 = ntq_get_src(c, instr->src[1], 0);
        if (!tmu_writes && vir_in_nonuniform_control_flow(c) &&
            instr->intrinsic != nir_intrinsic_image_load) {
                vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
                           V3D_QPU_PF_PUSHZ);
        }

        struct qinst *retiring =
                vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUSF, src_1_0, tmu_writes);

        if (!tmu_writes && vir_in_nonuniform_control_flow(c) &&
            instr->intrinsic != nir_intrinsic_image_load) {
                struct qinst *last_inst =
                        (struct qinst *)c->cur_block->instructions.prev;
                vir_set_cond(last_inst, V3D_QPU_COND_IFA);
        }

        return retiring;
}

static unsigned
get_required_image_tmu_writes(struct v3d_compile *c,
                              nir_intrinsic_instr *instr,
                              bool atomic_add_replaced)
{
        unsigned tmu_writes;
        vir_image_emit_register_writes(c, instr, atomic_add_replaced,
                                       &tmu_writes);
        return tmu_writes;
}

static uint32_t
return_channels_required(nir_intrinsic_instr *instr, bool is_32bit)
{
        if (nir_intrinsic_dest_components(instr) == 0)
                return 0;

        /* V3D requires that atomic operations always return data even if the
         * shader doesn't use it.
         */
        if (instr->intrinsic == nir_intrinsic_image_atomic ||
            instr->intrinsic == nir_intrinsic_image_atomic_swap) {
                return 1;
        }

        /* Otherwise limit the number of words to read based on the components
         * actually used by the shader, limited to the maximum allowed based
         * on the output size.
         */
        nir_component_mask_t read_mask = nir_def_components_read(&instr->def);
        read_mask &= is_32bit ? 0xf : 0x3;
        assert(read_mask);

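        /* The TMU returns a contiguous run of words starting at the first
         * one, so we have to fetch up to the highest component that is
         * actually read.
         */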
        if (read_mask & 0x8)
                return 4;
        if (read_mask & 0x4)
                return 3;
        if (read_mask & 0x2)
                return 2;
        else
                return 1;
}

void
v3d_vir_emit_image_load_store(struct v3d_compile *c,
                              nir_intrinsic_instr *instr)
{
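        /* For the image intrinsics handled here, src[0] is the image index
         * and src[1] the coordinate; the data to write, if any, comes in
         * src[3] and src[4].
         */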
        unsigned format = nir_intrinsic_format(instr);
        unsigned unit = nir_src_as_uint(instr->src[0]);

        struct V3D42_TMU_CONFIG_PARAMETER_0 p0_unpacked = {
        };

        struct V3D42_TMU_CONFIG_PARAMETER_1 p1_unpacked = {
                .per_pixel_mask_enable = true,
                .output_type_32_bit = v3d_gl_format_is_return_32(format),
        };

        struct V3D42_TMU_CONFIG_PARAMETER_2 p2_unpacked = { 0 };

        /* Limit the number of channels to those that are actually used */
        uint32_t return_channels =
                return_channels_required(instr, p1_unpacked.output_type_32_bit);
        assert(return_channels <= nir_intrinsic_dest_components(instr));
        p0_unpacked.return_words_of_texture_data =
                (1 << return_channels) - 1;

        p2_unpacked.op = v3d_image_load_store_tmu_op(instr);

        /* If we were able to replace the atomic_add with an inc/dec, then we
         * need to do things slightly differently, like not loading the
         * amount to add/sub, since that is implicit.
         */
        bool atomic_add_replaced =
                instr->intrinsic == nir_intrinsic_image_atomic &&
                nir_intrinsic_atomic_op(instr) == nir_atomic_op_iadd &&
                (p2_unpacked.op == V3D_TMU_OP_WRITE_AND_READ_INC ||
                 p2_unpacked.op == V3D_TMU_OP_WRITE_OR_READ_DEC);

        uint32_t p0_packed;
        V3D42_TMU_CONFIG_PARAMETER_0_pack(NULL,
                                          (uint8_t *)&p0_packed,
                                          &p0_unpacked);

        /* Load the unit number into the high bits of the texture or sampler
         * address field, which will be used by the driver to decide which
         * texture to put in the actual address field.
         */
        p0_packed |= unit << 24;

        uint32_t p1_packed;
        V3D42_TMU_CONFIG_PARAMETER_1_pack(NULL,
                                          (uint8_t *)&p1_packed,
                                          &p1_unpacked);

        uint32_t p2_packed;
        V3D42_TMU_CONFIG_PARAMETER_2_pack(NULL,
                                          (uint8_t *)&p2_packed,
                                          &p2_unpacked);

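        /* Stores and atomics write memory through the TMU, so flag the
         * shader as dirtying the TMU so the driver can take care of any
         * required cache flushing when it builds the RCL.
         */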
        if (instr->intrinsic != nir_intrinsic_image_load)
                c->tmu_dirty_rcl = true;

        const uint32_t tmu_writes =
                get_required_image_tmu_writes(c, instr, atomic_add_replaced);

        /* The input FIFO has 16 slots across all threads, so if we require
         * more than that we need to lower the thread count.
         */
        while (tmu_writes > 16 / c->threads)
                c->threads /= 2;

        /* If pipelining this TMU operation would overflow the TMU FIFOs, we
         * need to flush any outstanding TMU operations first.
         */
        if (ntq_tmu_fifo_overflow(c, return_channels))
                ntq_flush_tmu(c);

        vir_WRTMUC(c, QUNIFORM_IMAGE_TMU_CONFIG_P0, p0_packed);
        if (memcmp(&p1_unpacked, &p1_unpacked_default, sizeof(p1_unpacked)))
                vir_WRTMUC(c, QUNIFORM_CONSTANT, p1_packed);
        if (memcmp(&p2_unpacked, &p2_unpacked_default, sizeof(p2_unpacked)))
                vir_WRTMUC(c, QUNIFORM_CONSTANT, p2_packed);

        struct qinst *retiring =
                vir_image_emit_register_writes(c, instr, atomic_add_replaced, NULL);
        retiring->ldtmu_count = p0_unpacked.return_words_of_texture_data;
        ntq_add_pending_tmu_flush(c, &instr->def,
                                  p0_unpacked.return_words_of_texture_data);
}