/*
 * Copyright (c) 2023-2024 Tomeu Vizoso <[email protected]>
 * SPDX-License-Identifier: MIT
 */

#include <math.h>

#include "util/u_inlines.h"

#include "etnaviv_context.h"
#include "etnaviv_debug.h"
#include "etnaviv_emit.h"
#include "etnaviv_ml_tp.h"

#define FIELD(field, bits) uint32_t field : bits;

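/* Parameter block for a TP (tensor processing) job. Each numbered comment
 * below marks the index of the 32-bit configuration word that the following
 * bitfields pack into; field names and semantics appear to follow what the
 * proprietary blob programs into the hardware.
 */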
struct etna_tp_params {
   /* 0 */
   FIELD(in_image_x_size, 16)
   FIELD(unused0, 16)

   /* 1 */
   FIELD(in_image_y_size, 16)
   FIELD(in_image_z_size, 16)

   /* 2 */
   FIELD(in_image_stride, 16)
   FIELD(unused1, 16)

   /* 3 */
   FIELD(in_image_slice, 32)

   /* 4 */
   FIELD(in_window_x_start, 16)
   FIELD(in_window_y_start, 16)

   /* 5 */
   FIELD(in_window_x_end, 16)
   FIELD(in_window_y_end, 16)

   /* 6 */
   FIELD(in_tile_sequence, 2)
   FIELD(in_tile_global_mem, 1)
   FIELD(in_image_global_mem, 1)
   FIELD(alu_i2f_enable, 1)
   FIELD(alu_square_enable, 1)
   FIELD(alu_horz_processing, 3) /* Watch out, it is split in two in the blob */
   FIELD(alu_horz_proc_count, 6)
   FIELD(alu_horz_proc_stride, 1)
   FIELD(alu_vert_processing, 2)
   FIELD(unused2, 1)
   FIELD(alu_vert_proc_count, 6)
   FIELD(alu_vert_proc_stride, 1)
   FIELD(alu_nms_enable, 1)
   FIELD(alu_pwl_enable, 1)
   FIELD(alu_mult_enable, 1)
   FIELD(alu_f2i_enable, 1)
   FIELD(alu_load_pwl_lut, 1)
   FIELD(alu_load_pwl_lut_global_mem, 1)

   /* 7 */
   FIELD(in_tile_list_address, 32)

   /* 8 */
   FIELD(in_tile_x_size, 16)
   FIELD(in_tile_y_size, 16)

   /* 9 */
   FIELD(in_tile_x_inc, 16)
   FIELD(in_tile_y_inc, 16)

   /* 10 */
   FIELD(in_image_base_address, 32)

   /* 11 */
   FIELD(alu_load_pwl_lut_address, 32)

   /* 12 */
   FIELD(out_tile_skip_at_border, 1)
   FIELD(out_image_global_mem, 1)
   FIELD(out_loop_1_reset, 1)
   FIELD(out_loop_2_reset, 1)
   FIELD(out_loop_3_reset, 1)
   FIELD(out_brick_mode, 1)
   FIELD(alu_z_filter_mode, 1)
   FIELD(unused3, 1)
   FIELD(in_window_z_start_overfetch, 2)
   FIELD(unused4, 1)
   FIELD(in_window_z_end_overfetch, 2)
   FIELD(unused5, 1)
   FIELD(alu_square_preshift, 4)
   FIELD(in_image_data_type, 3)
   FIELD(out_image_data_type, 3)
   FIELD(unused6, 4)
   FIELD(alu_pwl_sign_support, 1)
   FIELD(alu_relu_enable, 1)
   FIELD(no_flush, 1)
   FIELD(last, 1)

   /* 13 */
   FIELD(out_image_base_address, 32)

   /* 14 */
   FIELD(out_loop_0_inc, 32)

   /* 15 */
   FIELD(out_loop_1_inc, 32)

   /* 16 */
   FIELD(out_loop_0_count, 16)
   FIELD(out_loop_1_count, 16)

   /* 17 */
   FIELD(out_loop_2_inc, 32)

   /* 18 */
   FIELD(out_loop_3_inc, 32)

   /* 19 */
   FIELD(out_loop_2_count, 16)
   FIELD(out_loop_3_count, 16)

   /* 20 */
   FIELD(out_loop_4_inc, 32)

   /* 21 */
   FIELD(out_loop_5_inc, 32)

   /* 22 */
   FIELD(out_loop_4_count, 16)
   FIELD(out_loop_5_count, 16)

   /* 23 */
   FIELD(out_loop_6_inc, 32)

   /* 24 */
   FIELD(alu_filter_pwl_swap, 1)
   FIELD(flat_rounding_mode, 2)
   FIELD(integer_rounding_mode, 2)
   FIELD(alu_input_preshift, 5)
   FIELD(alu_output_postshift, 5)
   FIELD(alu_reorder_bits_used, 4)
   FIELD(alu_reorder_loop_2_mode, 1)
   FIELD(unused7, 4)
   FIELD(in_image_border_mode, 2)
   FIELD(alu_output_postshift_5_6, 2)
   FIELD(unused8, 4)

   /* 25 */
   FIELD(in_image_circular_buf_size, 32)  /* >> 6 */

   /* 26 */
   FIELD(in_image_circular_buf_end_address_plus_1, 32)  /* >> 6 */

   /* 27 */
   FIELD(out_image_circular_buf_size, 32)  /* >> 6 */

   /* 28 */
   FIELD(out_image_circular_buf_end_address_plus_1, 32)  /* >> 6 */

   /* 29 */
   FIELD(in_image_border_const, 16)
   FIELD(coef_zp, 8)
   FIELD(in_zp, 8)

   /* 30 */
   FIELD(out_zp, 8)
   FIELD(alu_output_post_multiplier, 15)
   FIELD(unused9, 9)
};

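/* Baseline values shared by every TP job this driver emits. Most fields are
 * simply zeroed; the defaults that matter appear to be reading and writing
 * the images from global memory, i2f/f2i conversion enabled in the ALU, and
 * "last" set so a single descriptor finishes the job unless a caller
 * overrides it.
 */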
static void
set_default_tp_config(struct etna_tp_params *map)
{
   map->unused0 = 0x0;
   map->unused1 = 0x0;
   map->in_window_x_start = 0x0;
   map->in_window_y_start = 0x0;
   map->in_tile_sequence = 0x0;
   map->in_tile_global_mem = 0x0;
   map->in_image_global_mem = 0x1;
   map->alu_i2f_enable = 0x1;
   map->alu_square_enable = 0x0;
   map->alu_horz_processing = 0x0;
   map->alu_horz_proc_count = 0x0;
   map->alu_horz_proc_stride = 0x0;
   map->alu_vert_processing = 0x0;
   map->unused2 = 0x0;
   map->alu_vert_proc_count = 0x0;
   map->alu_vert_proc_stride = 0x0;
   map->alu_nms_enable = 0x0;
   map->alu_pwl_enable = 0x0;
   map->alu_mult_enable = 0x0;
   map->alu_f2i_enable = 0x1;
   map->alu_load_pwl_lut = 0x0;
   map->alu_load_pwl_lut_global_mem = 0x0;
   map->in_tile_list_address = 0x0;
   map->in_tile_x_size = 0x1;
   map->in_tile_x_inc = 0x1;
   map->alu_load_pwl_lut_address = 0x0;
   map->out_tile_skip_at_border = 0x0;
   map->out_image_global_mem = 0x1;
   map->out_loop_1_reset = 0x0;
   map->out_loop_2_reset = 0x0;
   map->out_loop_3_reset = 0x0;
   map->out_brick_mode = 0x0;
   map->alu_z_filter_mode = 0x0;
   map->unused3 = 0x0;
   map->in_window_z_start_overfetch = 0x0;
   map->unused4 = 0x0;
   map->in_window_z_end_overfetch = 0x0;
   map->unused5 = 0x0;
   map->alu_square_preshift = 0x0;
   map->in_image_data_type = 0x0;
   map->out_image_data_type = 0x0;
   map->unused6 = 0x0;
   map->alu_pwl_sign_support = 0x0;
   map->alu_relu_enable = 0x0;
   map->no_flush = 0x0;
   map->last = 0x1;
   map->out_loop_0_inc = 0x1;
   map->out_loop_3_inc = 0x0;
   map->out_loop_3_count = 0x1;
   map->out_loop_4_inc = 0x0;
   map->out_loop_5_inc = 0x0;
   map->out_loop_4_count = 0x1;
   map->out_loop_5_count = 0x1;
   map->out_loop_6_inc = 0x0;
   map->alu_filter_pwl_swap = 0x0;
   map->flat_rounding_mode = 0x1;
   map->integer_rounding_mode = 0x1;
   map->alu_input_preshift = 0x0;
   map->alu_output_postshift = 0x0;
   map->alu_reorder_bits_used = 0x0;
   map->alu_reorder_loop_2_mode = 0x0;
   map->unused7 = 0x0;
   map->in_image_border_mode = 0x0;
   map->alu_output_postshift_5_6 = 0x0;
   map->unused8 = 0x0;
   map->in_image_border_const = 0x0;
   map->coef_zp = 0x0;
   map->alu_output_post_multiplier = 0x0;
   map->unused9 = 0x0;
}

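/* Build the configuration for a transpose job. The input is walked with the
 * channel dimension innermost (x = channels) and the output loops write each
 * channel out as a contiguous width x height plane, which appears to convert
 * the NHWC tensors handed in by the frontend into the channel-planar layout
 * consumed by the rest of the pipeline.
 */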
static struct etna_bo *
create_transpose_config(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation)
{
   struct etna_context *ctx = etna_context(subgraph->base.context);
   struct etna_bo *bo = etna_bo_new(ctx->screen->dev,
                                    sizeof(struct etna_tp_params),
                                    DRM_ETNA_GEM_CACHE_WC);

   etna_bo_cpu_prep(bo, DRM_ETNA_PREP_WRITE);

   struct etna_tp_params *map = etna_bo_map(bo);

   set_default_tp_config(map);

   map->in_image_x_size = operation->input_channels;
   map->in_image_y_size = operation->input_height;
   map->in_image_z_size = operation->input_width;
   map->in_image_stride = operation->input_channels;
   map->in_image_slice = operation->input_width * operation->input_channels;
   map->in_window_x_end = operation->input_channels - 1;
   map->in_window_y_end = operation->input_height - 1;
   map->in_tile_y_size = operation->input_height;
   map->in_tile_y_inc = operation->input_height;

   struct pipe_resource *input = etna_ml_get_tensor(subgraph, operation->input_tensor);
   map->in_image_base_address = etna_bo_gpu_va(etna_resource(input)->bo);

   struct pipe_resource *output = etna_ml_get_tensor(subgraph, operation->output_tensor);
   unsigned offset = etna_ml_get_offset(subgraph, operation->output_tensor);
   map->out_image_base_address = etna_bo_gpu_va(etna_resource(output)->bo) + offset;

   map->out_loop_1_inc = operation->input_width * operation->input_height;
   map->out_loop_0_count = operation->input_height;
   map->out_loop_1_count = operation->input_channels;
   map->out_loop_2_inc = operation->input_height;
   map->out_loop_2_count = operation->input_width;
   map->in_image_circular_buf_size = 0x0;
   map->in_image_circular_buf_end_address_plus_1 = 0xFFFFFFFF >> 6;
   map->out_image_circular_buf_size = 0x0;
   map->out_image_circular_buf_end_address_plus_1 = 0xFFFFFFFF >> 6;
   map->in_zp = operation->input_zero_point;
   map->out_zp = operation->input_zero_point;
   map->no_flush = 0x0;

   etna_bo_cpu_fini(bo);

   return bo;
}

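/* Build the configuration for a detranspose job, the inverse of the
 * transpose above: the channel-planar result of a convolution is folded back
 * so the channel dimension ends up innermost again, which appears to match
 * the layout expected by the consumer of the output tensor.
 */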
static struct etna_bo *
create_detranspose_config(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation)
{
   struct etna_context *ctx = etna_context(subgraph->base.context);
   unsigned input_width = operation->input_width;
   unsigned input_height = operation->input_height;
   unsigned input_channels = operation->input_channels;
   struct etna_bo *bo = etna_bo_new(ctx->screen->dev,
                                    sizeof(struct etna_tp_params),
                                    DRM_ETNA_GEM_CACHE_WC);

   etna_bo_cpu_prep(bo, DRM_ETNA_PREP_WRITE);

   struct etna_tp_params *map = etna_bo_map(bo);

   set_default_tp_config(map);

   map->in_image_x_size = input_width;
   map->in_image_y_size = input_height * input_channels;
   map->in_image_z_size = 0x1;
   map->in_image_stride = input_width;
   map->in_image_slice = input_width * input_height * input_channels;
   map->in_window_x_end = input_width - 1;
   map->in_window_y_end = input_height * input_channels - 1;
   map->in_tile_y_size = 0x1;
   map->in_tile_y_inc = 0x1;

   struct pipe_resource *input = etna_ml_get_tensor(subgraph, operation->input_tensor);
   map->in_image_base_address = etna_bo_gpu_va(etna_resource(input)->bo);

   struct pipe_resource *output = etna_ml_get_tensor(subgraph, operation->output_tensor);
   map->out_image_base_address = etna_bo_gpu_va(etna_resource(output)->bo);

   map->out_loop_0_inc = input_channels;
   map->out_loop_1_inc = 0x0;
   map->out_loop_0_count = input_height;
   map->out_loop_1_count = 0x1;
   map->out_loop_2_inc = input_height * input_channels;
   map->out_loop_2_count = input_width;
   map->out_loop_3_inc = 0x1;
   map->out_loop_3_count = input_channels;
   map->out_loop_4_inc = input_width * input_height * input_channels;
   map->in_image_circular_buf_size = 0x0;
   map->in_image_circular_buf_end_address_plus_1 = 0xFFFFFFFF >> 6;
   map->out_image_circular_buf_size = 0x0;
   map->out_image_circular_buf_end_address_plus_1 = 0xFFFFFFFF >> 6;
   map->in_zp = operation->input_zero_point;
   map->out_zp = operation->input_zero_point;

   etna_bo_cpu_fini(bo);

   return bo;
}

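/* Decide how much of the input image each TP core sees for a reshuffle job.
 * With SAME padding and more than one channel the work is split along the
 * channel dimension; without SAME padding it is split along the height. The
 * single-channel SAME-padding cases are special-cased per input width,
 * apparently matching the sizes chosen by the blob (including extra overlap
 * rows for the larger widths).
 */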
static void
set_input_size(const struct etna_operation *operation, struct etna_tp_params *map, unsigned tp_cores_used)
{
   map->in_image_x_size = operation->input_width;

   if (operation->padding_same && operation->input_channels > 1) {
      map->in_image_y_size = operation->input_height;
      map->in_image_z_size = operation->input_channels / tp_cores_used;
   } else if (operation->padding_same && operation->input_channels == 1) {
      switch(operation->input_width) {
      case 3:
      case 5:
         map->in_image_y_size = operation->input_height;
         break;
      case 8:
         switch(operation->weight_width) {
         case 3:
            map->in_image_y_size = operation->input_height;
            break;
         case 5:
            map->in_image_y_size = 5;
            break;
         }
         break;
      case 80:
      case 112:
         switch(operation->weight_width) {
         case 3:
            map->in_image_y_size = operation->input_height / tp_cores_used + 2;
            break;
         case 5:
            map->in_image_y_size = operation->input_height / tp_cores_used + 1;
            break;
         }
         break;
      default:
         unreachable("Unsupported input width");
      }
      map->in_image_z_size = operation->input_channels;
   } else {
      map->in_image_y_size = operation->input_height / tp_cores_used;
      map->in_image_z_size = operation->input_channels;
   }
}

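/* Build the configuration for one TP core's slice of a reshuffle job. The
 * reshuffle rearranges the input of a strided convolution into
 * stride x stride interleaved sub-images (a space-to-batch style transform)
 * so the convolution itself can then run with stride 1. The output loops
 * hard-code factors of two, so apparently only stride 2 is handled here, and
 * the padding_same/weight_width special cases seem to mirror the parameters
 * chosen by the blob.
 */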
static struct etna_bo *
create_reshuffle_config(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation,
                        unsigned tp_core, unsigned tp_cores_used)
{
   struct etna_context *ctx = etna_context(subgraph->base.context);
   unsigned tp_core_count = ctx->screen->info->npu.tp_core_count;
   struct etna_bo *bo = etna_bo_new(ctx->screen->dev,
                                    sizeof(struct etna_tp_params),
                                    DRM_ETNA_GEM_CACHE_WC);

   etna_bo_cpu_prep(bo, DRM_ETNA_PREP_WRITE);

   struct etna_tp_params *map = etna_bo_map(bo);

   set_default_tp_config(map);

   set_input_size(operation, map, tp_cores_used);

   map->in_image_stride = operation->input_width;
   map->in_image_slice = operation->input_width * operation->input_height;

   if (operation->padding_same && (operation->weight_width == 5 || operation->input_width < 8)) {
      if (operation->weight_width == 5 && operation->input_width < 8) {
         map->in_window_x_start = 0xfffe;
         map->in_window_y_start = 0xfffe;
      } else {
         map->in_window_x_start = 0xffff;
         map->in_window_y_start = 0xffff;
      }
   } else {
      map->in_window_x_start = 0x0;
      map->in_window_y_start = 0x0;
   }

   map->in_window_x_end = operation->input_width - 1;
   map->in_window_y_end = (operation->input_height / tp_cores_used) - 1;
   map->in_tile_x_size = operation->input_width;
   map->in_tile_x_inc = operation->input_width;

   if (operation->input_width <= 8 && operation->input_channels == 1) {
      map->in_tile_y_size = operation->input_height;
      map->in_tile_y_inc = operation->input_height;
   } else {
      map->in_tile_y_size = operation->input_height / tp_cores_used;
      map->in_tile_y_inc = operation->input_height / tp_cores_used;
   }

   if (operation->padding_same) {
      switch(operation->weight_width) {
      case 3:
         map->in_window_x_end += 2;
         if (operation->input_width < 8) {
            map->in_tile_x_size += 3;
            map->in_tile_y_size += 1;
            map->in_tile_y_inc += 1;
         } else {
            map->in_tile_x_size += 2;
         }
         break;
      case 5:
         map->in_window_x_end += 3;
         if (operation->input_width < 8) {
            map->in_tile_x_size += 5;
         } else {
            map->in_tile_x_size += 4;
         }
         break;
      default:
         unreachable("Unsupported weight size");
      }

      if (operation->input_width <= 8 && operation->input_channels == 1 && operation->weight_width >= 5)
         map->in_tile_x_size = operation->input_width / tp_cores_used + 2;

      if (operation->input_width > 8 && operation->input_channels == 1) {
         switch(operation->weight_width) {
         case 3:
            map->in_window_y_end = (operation->input_height / tp_cores_used) + 1;
            break;
         case 5:
            map->in_window_y_end = (operation->input_height / tp_cores_used);
            break;
         default:
            unreachable("Unsupported weight size");
         }
      } else
         map->in_window_y_end = map->in_window_x_end;

      map->in_tile_x_inc = map->in_tile_x_size;

      if (operation->input_channels > 1) {
         map->in_tile_y_size = map->in_tile_x_size;
         map->in_tile_y_inc = map->in_tile_x_size;
      } else {
         map->in_tile_y_size += 2;
         map->in_tile_y_inc += 2;
      }
   } else {
      if (operation->input_width < 8) {
         map->in_window_x_end += 1;
         map->in_window_y_end += 1;
         map->in_tile_x_size += 1;
         map->in_tile_y_size += 1;
         map->in_tile_x_inc += 1;
         map->in_tile_y_inc += 1;
      }
   }

   struct pipe_resource *input = etna_ml_get_tensor(subgraph, operation->input_tensor);
   map->in_image_base_address = etna_bo_gpu_va(etna_resource(input)->bo);

   if (operation->padding_same)
      map->in_image_base_address += ((operation->input_width * operation->input_height * operation->input_channels) / tp_cores_used) * tp_core;
   else
      map->in_image_base_address += (operation->input_width * (operation->input_height / tp_cores_used)) * tp_core;

   struct pipe_resource *output = etna_ml_get_tensor(subgraph, operation->output_tensor);
   map->out_image_base_address = etna_bo_gpu_va(etna_resource(output)->bo);

   if (operation->padding_same)
      map->out_image_base_address += ((map->in_tile_x_size * map->in_tile_y_size * operation->input_channels) / tp_cores_used) * tp_core;
   else
      map->out_image_base_address += ((operation->input_width * operation->input_width) / (operation->stride * operation->stride * tp_cores_used)) * tp_core;

   map->out_loop_1_reset = 0x1;
   map->out_loop_2_reset = 0x0;
   map->out_loop_3_reset = 0x1;
   map->out_loop_0_inc = pow(round(operation->input_width / 2.0), 2);
   map->out_loop_1_inc = 0x1;
   map->out_loop_0_count = 0x2;
   map->out_loop_1_count = round(operation->input_width / 2.0);
   map->out_loop_2_count = 0x2;
   map->out_loop_3_count = DIV_ROUND_UP(round(operation->input_width / 2.0), tp_cores_used);

   if (operation->padding_same) {
      switch(operation->weight_width) {
      case 3:
         map->out_loop_0_inc = pow(round(operation->input_width / 2.0) + 1, 2);
         map->out_loop_1_count += 1;
         break;
      case 5:
         map->out_loop_0_inc = pow(round(operation->input_width / 2.0) + 2, 2);
         map->out_loop_1_count += 2;
         break;
      default:
         unreachable("Unsupported weight size");
      }

      if (operation->input_channels == 1)
         map->out_loop_3_count += 1;
      else
         map->out_loop_3_count = map->out_loop_1_count;
   }

   map->out_loop_2_inc = map->out_loop_0_inc * 2;
   map->out_loop_3_inc = map->out_loop_1_count;
   map->out_loop_6_inc = map->out_loop_0_inc * 4;

   if (operation->padding_same && tp_cores_used > 1 && operation->input_channels == 1) {
      if (tp_core > 0) {
         map->in_image_y_size -= 2;
         map->in_window_y_end -= 2;
         map->in_tile_y_size -= 2;
         map->in_tile_y_inc -= 2;
         map->out_loop_3_count -= 1;
      }

      if (tp_core == tp_core_count - 1) {
         map->in_image_y_size -= 2;
      }

      if (tp_core > 0) {
         map->in_image_base_address += operation->input_width * 2;
         map->out_image_base_address -= (tp_core - 1) * (round(operation->input_width / 2.0) + 1);
      }
   }

   unsigned alu_size = operation->input_width;
   if (operation->padding_same) {
      alu_size += 1;
      if (operation->weight_width == 5)
         alu_size += 1;
      if (operation->input_width == 5)
         alu_size += 1;
   }

   map->alu_reorder_bits_used = sizeof(alu_size) * 8 - __builtin_clz(alu_size);

   map->in_zp = operation->input_zero_point;
   map->out_zp = operation->input_zero_point;

   if (tp_cores_used > 1)
      map->no_flush = tp_core < tp_cores_used - 1;

   map->in_image_circular_buf_size = 0x0;
   map->in_image_circular_buf_end_address_plus_1 = 0xFFFFFFFF >> 6;
   map->out_image_circular_buf_size = 0x0;
   map->out_image_circular_buf_end_address_plus_1 = 0xFFFFFFFF >> 6;

   if (map->in_image_y_size < 2) {
      map->in_image_y_size = operation->input_width;
      map->in_image_z_size = (operation->input_width * operation->input_height * operation->input_channels) / (map->in_image_x_size * map->in_image_y_size) / tp_cores_used;
      map->in_window_y_end = operation->input_width;
      map->in_tile_y_size = operation->input_width + 1;
      map->in_tile_y_inc = operation->input_width + 1;
      map->out_loop_3_count += 1;

      map->in_image_base_address = etna_bo_gpu_va(etna_resource(input)->bo);
      map->in_image_base_address += ((operation->input_width * operation->input_height * operation->input_channels) / tp_cores_used) * tp_core;

      map->out_image_base_address = etna_bo_gpu_va(etna_resource(input)->bo);
      map->out_image_base_address += ((map->in_tile_x_size * map->in_tile_y_size * operation->input_channels) / tp_cores_used) * tp_core;
   }

   etna_bo_cpu_fini(bo);

   return bo;
}

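/* Lower a TP transpose job operating on the graph's input tensor. A new
 * intermediate tensor is allocated to hold the transposed data; dimensions
 * and quantization parameters are carried over unchanged from the original
 * input tensor.
 */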
void
etna_ml_lower_transpose(struct etna_ml_subgraph *subgraph,
                        const struct pipe_ml_operation *first_operation,
                        struct etna_operation *operation,
                        unsigned *output_tensor)
{
   operation->type = ETNA_JOB_TYPE_TP;
   operation->tp_type = ETNA_ML_TP_TRANSPOSE;

   operation->input_tensor = first_operation->input_tensor->index;
   operation->input_width = first_operation->input_tensor->dims[1];
   operation->input_height = first_operation->input_tensor->dims[2];
   operation->input_channels = first_operation->input_tensor->dims[3];
   operation->input_zero_point = first_operation->input_tensor->zero_point;
   operation->input_scale = first_operation->input_tensor->scale;
   operation->input_tensor_size = operation->input_width *
                                  operation->input_height *
                                  operation->input_channels;

   *output_tensor = etna_ml_allocate_tensor(subgraph);
   operation->output_tensor = *output_tensor;
   operation->output_width = first_operation->input_tensor->dims[1];
   operation->output_height = first_operation->input_tensor->dims[2];
   operation->output_channels = first_operation->input_tensor->dims[3];
   operation->output_zero_point = first_operation->input_tensor->zero_point;
   operation->output_scale = first_operation->input_tensor->scale;
}

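/* Lower a TP detranspose job that converts a convolution's result back from
 * the transposed layout. A new intermediate tensor is allocated as its input
 * (presumably what the convolution actually writes into), and the job writes
 * to the convolution's original output tensor.
 */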
void
etna_ml_lower_detranspose(struct etna_ml_subgraph *subgraph,
                          struct etna_operation *convolution,
                          struct etna_operation *operation)
{
   operation->type = ETNA_JOB_TYPE_TP;
   operation->tp_type = ETNA_ML_TP_DETRANSPOSE;

   operation->input_tensor = etna_ml_allocate_tensor(subgraph);
   operation->input_width = convolution->output_width;
   operation->input_height = convolution->output_height;
   operation->input_channels = convolution->output_channels;
   operation->input_zero_point = convolution->output_zero_point;
   operation->input_scale = convolution->output_scale;
   operation->input_tensor_size = operation->input_width *
                                  operation->input_height *
                                  operation->input_channels;

   operation->output_tensor = convolution->output_tensor;
   operation->output_width = convolution->output_width;
   operation->output_height = convolution->output_height;
   operation->output_channels = convolution->output_channels;
   operation->output_zero_point = convolution->output_zero_point;
   operation->output_scale = convolution->output_scale;
}

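/* Lower a strided convolution's input into a TP reshuffle job. The output
 * tensor shrinks by the stride in width and height and grows by stride^2 in
 * channels, so the convolution that follows can run with stride 1. With SAME
 * padding the output is additionally enlarged to leave room for the padding
 * rows and columns.
 */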
void
etna_ml_lower_reshuffle(struct etna_ml_subgraph *subgraph,
                        const struct pipe_ml_operation *convolution,
                        struct etna_operation *operation,
                        unsigned *output_tensor)
{
   operation->type = ETNA_JOB_TYPE_TP;
   operation->tp_type = ETNA_ML_TP_RESHUFFLE;
   operation->stride = convolution->conv.stride_x;
   operation->padding_same = convolution->conv.padding_same;

   operation->input_tensor = convolution->input_tensor->index;
   operation->input_width = convolution->input_tensor->dims[1];
   operation->input_height = convolution->input_tensor->dims[2];
   operation->input_channels = convolution->input_tensor->dims[3];
   operation->input_zero_point = convolution->input_tensor->zero_point;
   operation->input_scale = convolution->input_tensor->scale;
   operation->input_tensor_size = operation->input_width *
                                  operation->input_height *
                                  operation->input_channels;

   *output_tensor = etna_ml_allocate_tensor(subgraph);
   operation->output_tensor = *output_tensor;
   operation->output_width = DIV_ROUND_UP(operation->input_width, operation->stride);
   operation->output_height = DIV_ROUND_UP(operation->input_height, operation->stride);
   operation->output_channels = operation->input_channels * operation->stride * operation->stride;
   operation->output_zero_point = convolution->input_tensor->zero_point;
   operation->output_scale = convolution->input_tensor->scale;

   /* When destriding a convolution, the transformation to be made to the input
    * tensor will depend on the size of the weight tensor.
    */
   operation->weight_width = convolution->conv.weight_tensor->dims[1];
   operation->weight_height = convolution->conv.weight_tensor->dims[2];

   if (operation->padding_same) {
      if (operation->weight_width == 5) {
         operation->output_width += 2;
         operation->output_height += 2;
      } else {
         operation->output_width += 1;
         operation->output_height += 1;
      }
   }
}

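/* Turn a lowered TP operation into a VIP instruction by building its
 * per-core configuration buffers. Transpose and detranspose always use a
 * single TP core; reshuffle is split across all TP cores when the input is
 * large enough, and falls back to a single core for small inputs and for a
 * few shapes that are not handled yet (see the TODO below).
 */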
void
etna_ml_compile_operation_tp(struct etna_ml_subgraph *subgraph,
                             const struct etna_operation *operation,
                             struct etna_vip_instruction *instruction)
{
   struct etna_context *ctx = etna_context(subgraph->base.context);
   struct pipe_resource *input = etna_ml_get_tensor(subgraph, operation->input_tensor);
   assert(input);
   pipe_resource_reference(&instruction->input, input);

   struct pipe_resource *output = etna_ml_get_tensor(subgraph, operation->output_tensor);
   assert(output);
   pipe_resource_reference(&instruction->output, output);

   switch (operation->tp_type) {
   case ETNA_ML_TP_TRANSPOSE:
      instruction->configs[0] = create_transpose_config(subgraph, operation);
      break;
   case ETNA_ML_TP_DETRANSPOSE:
      instruction->configs[0] = create_detranspose_config(subgraph, operation);
      break;
   case ETNA_ML_TP_RESHUFFLE: {
      unsigned tp_core_count = ctx->screen->info->npu.tp_core_count;
      unsigned tp_cores_used;

      tp_cores_used = (operation->input_width > 8 || operation->input_channels > 1) ? tp_core_count : 1;

      /* TODO: Distribute this across the 4 TP cores for better performance */
      if ((operation->input_width == 320 || operation->input_width == 224) &&
          operation->input_channels == 3)
         tp_cores_used = 1;

      ML_DBG("reshuffle: input_width %d tp_cores_used %d\n", operation->input_width, tp_cores_used);
      for (unsigned i = 0; i < tp_cores_used; i++) {
         instruction->configs[i] = create_reshuffle_config(subgraph, operation, i, tp_cores_used);
      }
      break;
   }
   }
   instruction->type = ETNA_JOB_TYPE_TP;
}

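/* Emit the command stream state that kicks off a compiled TP operation, one
 * VIVS_PS_TP_INST_ADDR write per populated per-core config. The value stored
 * in the low bits of the config address appears to act as a sync/flush
 * token: intermediate cores of a multi-core job get a different value than
 * the last one, and NPU_PARALLEL debug mode selects yet another encoding.
 */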
void
etna_ml_emit_operation_tp(struct etna_ml_subgraph *subgraph,
                          struct etna_vip_instruction *operation,
                          unsigned idx)
{
   struct etna_context *ctx = etna_context(subgraph->base.context);
   unsigned tp_core_count = ctx->screen->info->npu.tp_core_count;
   struct etna_cmd_stream *stream = ctx->stream;
   bool more_than_one_tp_job = operation->configs[1] != NULL;
   bool parallel = DBG_ENABLED(ETNA_DBG_NPU_PARALLEL);

   for (unsigned j = 0; j < tp_core_count && operation->configs[j]; j++) {
      unsigned offset = parallel ? idx + 1 : 0;

      if (more_than_one_tp_job && (j < tp_core_count - 1))
         offset = parallel ? 0x1f : 0x1;

      etna_set_state(stream, VIVS_GL_OCB_REMAP_START, 0x0);
      etna_set_state(stream, VIVS_GL_OCB_REMAP_END, 0x0);
      etna_set_state(stream, VIVS_GL_TP_CONFIG, 0x0);
      etna_set_state_reloc(stream, VIVS_PS_TP_INST_ADDR, &(struct etna_reloc) {
         .bo = operation->configs[j],
         .flags = ETNA_RELOC_READ,
         .offset = offset,
      });
   }
   etna_set_state(stream, VIVS_PS_UNK10A4, parallel ? idx + 1 : 0x0);
}
781