1 /*
2 * Copyright (c) 2023-2024 Tomeu Vizoso <[email protected]>
3 * SPDX-License-Identifier: MIT
4 */
5
6 #include "util/u_inlines.h"
7
8 #include "etnaviv_context.h"
9 #include "etnaviv_debug.h"
10 #include "etnaviv_emit.h"
11 #include "etnaviv_ml_tp.h"
12
/* Declares one named bitfield inside the TP parameter struct. */
#define FIELD(field, bits) uint32_t field : bits;

/* In-memory layout of a TP (Tensor Processing) job descriptor as consumed by
 * the hardware. Each "N" comment marks the start of 32-bit word N in the
 * buffer. NOTE(review): layout mirrors what the proprietary blob programs —
 * do not reorder, resize, or repack any field.
 */
struct etna_tp_params {
   /* 0 */
   FIELD(in_image_x_size, 16)
   FIELD(unused0, 16)

   /* 1 */
   FIELD(in_image_y_size, 16)
   FIELD(in_image_z_size, 16)

   /* 2 */
   FIELD(in_image_stride, 16)
   FIELD(unused1, 16)

   /* 3 */
   FIELD(in_image_slice, 32)

   /* 4 */
   FIELD(in_window_x_start, 16)
   FIELD(in_window_y_start, 16)

   /* 5 */
   FIELD(in_window_x_end, 16)
   FIELD(in_window_y_end, 16)

   /* 6 */
   FIELD(in_tile_sequence, 2)
   FIELD(in_tile_global_mem, 1)
   FIELD(in_image_global_mem, 1)
   FIELD(alu_i2f_enable, 1)
   FIELD(alu_square_enable, 1)
   FIELD(alu_horz_processing, 3) /* Watch out, it is split in two in the blob */
   FIELD(alu_horz_proc_count, 6)
   FIELD(alu_horz_proc_stride, 1)
   FIELD(alu_vert_processing, 2)
   FIELD(unused2, 1)
   FIELD(alu_vert_proc_count, 6)
   FIELD(alu_vert_proc_stride, 1)
   FIELD(alu_nms_enable, 1)
   FIELD(alu_pwl_enable, 1)
   FIELD(alu_mult_enable, 1)
   FIELD(alu_f2i_enable, 1)
   FIELD(alu_load_pwl_lut, 1)
   FIELD(alu_load_pwl_lut_global_mem, 1)

   /* 7 */
   FIELD(in_tile_list_address, 32)

   /* 8 */
   FIELD(in_tile_x_size, 16)
   FIELD(in_tile_y_size, 16)

   /* 9 */
   FIELD(in_tile_x_inc, 16)
   FIELD(in_tile_y_inc, 16)

   /* 10 */
   FIELD(in_image_base_address, 32)

   /* 11 */
   FIELD(alu_load_pwl_lut_address, 32)

   /* 12 */
   FIELD(out_tile_skip_at_border, 1)
   FIELD(out_image_global_mem, 1)
   FIELD(out_loop_1_reset, 1)
   FIELD(out_loop_2_reset, 1)
   FIELD(out_loop_3_reset, 1)
   FIELD(out_brick_mode, 1)
   FIELD(alu_z_filter_mode, 1)
   FIELD(unused3, 1)
   FIELD(in_window_z_start_overfetch, 2)
   FIELD(unused4, 1)
   FIELD(in_window_z_end_overfetch, 2)
   FIELD(unused5, 1)
   FIELD(alu_square_preshift, 4)
   FIELD(in_image_data_type, 3)
   FIELD(out_image_data_type, 3)
   FIELD(unused6, 4)
   FIELD(alu_pwl_sign_support, 1)
   FIELD(alu_relu_enable, 1)
   FIELD(no_flush, 1)
   FIELD(last, 1)

   /* 13 */
   FIELD(out_image_base_address, 32)

   /* 14 */
   FIELD(out_loop_0_inc, 32)

   /* 15 */
   FIELD(out_loop_1_inc, 32)

   /* 16 */
   FIELD(out_loop_0_count, 16)
   FIELD(out_loop_1_count, 16)

   /* 17 */
   FIELD(out_loop_2_inc, 32)

   /* 18 */
   FIELD(out_loop_3_inc, 32)

   /* 19 */
   FIELD(out_loop_2_count, 16)
   FIELD(out_loop_3_count, 16)

   /* 20 */
   FIELD(out_loop_4_inc, 32)

   /* 21 */
   FIELD(out_loop_5_inc, 32)

   /* 22 */
   FIELD(out_loop_4_count, 16)
   FIELD(out_loop_5_count, 16)

   /* 23 */
   FIELD(out_loop_6_inc, 32)

   /* 24 */
   FIELD(alu_filter_pwl_swap, 1)
   FIELD(flat_rounding_mode, 2)
   FIELD(integer_rounding_mode, 2)
   FIELD(alu_input_preshift, 5)
   FIELD(alu_output_postshift, 5)
   FIELD(alu_reorder_bits_used, 4)
   FIELD(alu_reorder_loop_2_mode, 1)
   FIELD(unused7, 4)
   FIELD(in_image_border_mode, 2)
   FIELD(alu_output_postshift_5_6, 2)
   FIELD(unused8, 4)

   /* 25 */
   FIELD(in_image_circular_buf_size, 32) /* >> 6 */

   /* 26 */
   FIELD(in_image_circular_buf_end_address_plus_1, 32) /* >> 6 */

   /* 27 */
   FIELD(out_image_circular_buf_size, 32) /* >> 6 */

   /* 28 */
   FIELD(out_image_circular_buf_end_address_plus_1, 32) /* >> 6 */

   /* 29 */
   FIELD(in_image_border_const, 16)
   FIELD(coef_zp, 8)
   FIELD(in_zp, 8)

   /* 30 */
   FIELD(out_zp, 8)
   FIELD(alu_output_post_multiplier, 15)
   FIELD(unused9, 9)
};
169
/* Fill in the baseline TP configuration shared by all job types.
 *
 * Callers overwrite the job-specific fields afterwards. Non-zero defaults:
 * images in global memory, int->float->int conversion around the ALU,
 * rounding modes of 1, a 1x1 input tile, out_loop_0_inc of 1, and "last"
 * set (each job is the final one unless a caller clears no_flush/last
 * semantics). NOTE(review): fields not listed here (e.g. in_image_x_size,
 * out_loop_0_count) are left untouched, i.e. whatever the freshly mapped BO
 * contains — callers are expected to set them.
 */
static void
set_default_tp_config(struct etna_tp_params *map)
{
   map->unused0 = 0x0;
   map->unused1 = 0x0;
   map->in_window_x_start = 0x0;
   map->in_window_y_start = 0x0;
   map->in_tile_sequence = 0x0;
   map->in_tile_global_mem = 0x0;
   map->in_image_global_mem = 0x1;
   map->alu_i2f_enable = 0x1;
   map->alu_square_enable = 0x0;
   map->alu_horz_processing = 0x0;
   map->alu_horz_proc_count = 0x0;
   map->alu_horz_proc_stride = 0x0;
   map->alu_vert_processing = 0x0;
   map->unused2 = 0x0;
   map->alu_vert_proc_count = 0x0;
   map->alu_vert_proc_stride = 0x0;
   map->alu_nms_enable = 0x0;
   map->alu_pwl_enable = 0x0;
   map->alu_mult_enable = 0x0;
   map->alu_f2i_enable = 0x1;
   map->alu_load_pwl_lut = 0x0;
   map->alu_load_pwl_lut_global_mem = 0x0;
   map->in_tile_list_address = 0x0;
   map->in_tile_x_size = 0x1;
   map->in_tile_x_inc = 0x1;
   map->alu_load_pwl_lut_address = 0x0;
   map->out_tile_skip_at_border = 0x0;
   map->out_image_global_mem = 0x1;
   map->out_loop_1_reset = 0x0;
   map->out_loop_2_reset = 0x0;
   map->out_loop_3_reset = 0x0;
   map->out_brick_mode = 0x0;
   map->alu_z_filter_mode = 0x0;
   map->unused3 = 0x0;
   map->in_window_z_start_overfetch = 0x0;
   map->unused4 = 0x0;
   map->in_window_z_end_overfetch = 0x0;
   map->unused5 = 0x0;
   map->alu_square_preshift = 0x0;
   map->in_image_data_type = 0x0;
   map->out_image_data_type = 0x0;
   map->unused6 = 0x0;
   map->alu_pwl_sign_support = 0x0;
   map->alu_relu_enable = 0x0;
   map->no_flush = 0x0;
   map->last = 0x1;
   map->out_loop_0_inc = 0x1;
   map->out_loop_3_inc = 0x0;
   map->out_loop_3_count = 0x1;
   map->out_loop_4_inc = 0x0;
   map->out_loop_5_inc = 0x0;
   map->out_loop_4_count = 0x1;
   map->out_loop_5_count = 0x1;
   map->out_loop_6_inc = 0x0;
   map->alu_filter_pwl_swap = 0x0;
   map->flat_rounding_mode = 0x1;
   map->integer_rounding_mode = 0x1;
   map->alu_input_preshift = 0x0;
   map->alu_output_postshift = 0x0;
   map->alu_reorder_bits_used = 0x0;
   map->alu_reorder_loop_2_mode = 0x0;
   map->unused7 = 0x0;
   map->in_image_border_mode = 0x0;
   map->alu_output_postshift_5_6 = 0x0;
   map->unused8 = 0x0;
   map->in_image_border_const = 0x0;
   map->coef_zp = 0x0;
   map->alu_output_post_multiplier = 0x0;
   map->unused9 = 0x0;
}
243
244 static struct etna_bo *
create_transpose_config(struct etna_ml_subgraph * subgraph,const struct etna_operation * operation)245 create_transpose_config(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation)
246 {
247 struct etna_context *ctx = etna_context(subgraph->base.context);
248 struct etna_bo *bo = etna_bo_new(ctx->screen->dev,
249 sizeof(struct etna_tp_params),
250 DRM_ETNA_GEM_CACHE_WC);
251
252 etna_bo_cpu_prep(bo, DRM_ETNA_PREP_WRITE);
253
254 struct etna_tp_params *map = etna_bo_map(bo);
255
256 set_default_tp_config(map);
257
258 map->in_image_x_size = operation->input_channels;
259 map->in_image_y_size = operation->input_height;
260 map->in_image_z_size = operation->input_width;
261 map->in_image_stride = operation->input_channels;
262 map->in_image_slice = operation->input_width * operation->input_channels;
263 map->in_window_x_end = operation->input_channels - 1;
264 map->in_window_y_end = operation->input_height - 1;
265 map->in_tile_y_size = operation->input_height;
266 map->in_tile_y_inc = operation->input_height;
267
268 struct pipe_resource *input = etna_ml_get_tensor(subgraph, operation->input_tensor);
269 map->in_image_base_address = etna_bo_gpu_va(etna_resource(input)->bo);
270
271 struct pipe_resource *output = etna_ml_get_tensor(subgraph, operation->output_tensor);
272 unsigned offset = etna_ml_get_offset(subgraph, operation->output_tensor);
273 map->out_image_base_address = etna_bo_gpu_va(etna_resource(output)->bo) + offset;
274
275 map->out_loop_1_inc = operation->input_width * operation->input_height;
276 map->out_loop_0_count = operation->input_height;
277 map->out_loop_1_count = operation->input_channels;
278 map->out_loop_2_inc = operation->input_height;
279 map->out_loop_2_count = operation->input_width;
280 map->in_image_circular_buf_size = 0x0;
281 map->in_image_circular_buf_end_address_plus_1 = 0xFFFFFFFF >> 6;
282 map->out_image_circular_buf_size = 0x0;
283 map->out_image_circular_buf_end_address_plus_1 = 0xFFFFFFFF >> 6;
284 map->in_zp = operation->input_zero_point;
285 map->out_zp = operation->input_zero_point;
286 map->no_flush = 0x0;
287
288 etna_bo_cpu_fini(bo);
289
290 return bo;
291 }
292
293 static struct etna_bo *
create_detranspose_config(struct etna_ml_subgraph * subgraph,const struct etna_operation * operation)294 create_detranspose_config(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation)
295 {
296 struct etna_context *ctx = etna_context(subgraph->base.context);
297 unsigned input_width = operation->input_width;
298 unsigned input_height = operation->input_height;
299 unsigned input_channels = operation->input_channels;
300 struct etna_bo *bo = etna_bo_new(ctx->screen->dev,
301 sizeof(struct etna_tp_params),
302 DRM_ETNA_GEM_CACHE_WC);
303
304 etna_bo_cpu_prep(bo, DRM_ETNA_PREP_WRITE);
305
306 struct etna_tp_params *map = etna_bo_map(bo);
307
308 set_default_tp_config(map);
309
310 map->in_image_x_size = input_width;
311 map->in_image_y_size = input_height * input_channels;
312 map->in_image_z_size = 0x1;
313 map->in_image_stride = input_width;
314 map->in_image_slice = input_width * input_height * input_channels;
315 map->in_window_x_end = input_width - 1;
316 map->in_window_y_end = input_height * input_channels - 1;
317 map->in_tile_y_size = 0x1;
318 map->in_tile_y_inc = 0x1;
319
320 struct pipe_resource *input = etna_ml_get_tensor(subgraph, operation->input_tensor);
321 map->in_image_base_address = etna_bo_gpu_va(etna_resource(input)->bo);
322
323 struct pipe_resource *output = etna_ml_get_tensor(subgraph, operation->output_tensor);
324 map->out_image_base_address = etna_bo_gpu_va(etna_resource(output)->bo);
325
326 map->out_loop_0_inc = input_channels;
327 map->out_loop_1_inc = 0x0;
328 map->out_loop_0_count = input_height;
329 map->out_loop_1_count = 0x1;
330 map->out_loop_2_inc = input_height * input_channels;
331 map->out_loop_2_count = input_width;
332 map->out_loop_3_inc = 0x1;
333 map->out_loop_3_count = input_channels;
334 map->out_loop_4_inc = input_width * input_height * input_channels;
335 map->in_image_circular_buf_size = 0x0;
336 map->in_image_circular_buf_end_address_plus_1 = 0xFFFFFFFF >> 6;
337 map->out_image_circular_buf_size = 0x0;
338 map->out_image_circular_buf_end_address_plus_1 = 0xFFFFFFFF >> 6;
339 map->in_zp = operation->input_zero_point;
340 map->out_zp = operation->input_zero_point;
341
342 etna_bo_cpu_fini(bo);
343
344 return bo;
345 }
346
347 static void
set_input_size(const struct etna_operation * operation,struct etna_tp_params * map,unsigned tp_cores_used)348 set_input_size(const struct etna_operation *operation, struct etna_tp_params *map, unsigned tp_cores_used)
349 {
350 map->in_image_x_size = operation->input_width;
351
352 if (operation->padding_same && operation->input_channels > 1) {
353 map->in_image_y_size = operation->input_height;
354 map->in_image_z_size = operation->input_channels / tp_cores_used;
355 } else if (operation->padding_same && operation->input_channels == 1) {
356 switch(operation->input_width) {
357 case 3:
358 case 5:
359 map->in_image_y_size = operation->input_height;
360 break;
361 case 8:
362 switch(operation->weight_width) {
363 case 3:
364 map->in_image_y_size = operation->input_height;
365 break;
366 case 5:
367 map->in_image_y_size = 5;
368 break;
369 }
370 break;
371 case 80:
372 case 112:
373 switch(operation->weight_width) {
374 case 3:
375 map->in_image_y_size = operation->input_height / tp_cores_used + 2;
376 break;
377 case 5:
378 map->in_image_y_size = operation->input_height / tp_cores_used + 1;
379 break;
380 }
381 break;
382 default:
383 unreachable("Unsupported input width");
384 }
385 map->in_image_z_size = operation->input_channels;
386 } else {
387 map->in_image_y_size = operation->input_height / tp_cores_used;
388 map->in_image_z_size = operation->input_channels;
389 }
390 }
391
/* Build the TP parameter buffer for a RESHUFFLE (destride) job on one core.
 *
 * Reshuffling rewrites the input so a strided convolution can run unstrided.
 * The job may be split across tp_cores_used cores; tp_core picks which slice
 * of the input/output this particular config covers. Returns a BO containing
 * the filled-in struct etna_tp_params.
 *
 * The field computations below are order-sensitive: several map fields are
 * adjusted incrementally and then read back (in_window_x_end, in_tile_x_size,
 * out_loop_0_inc, out_loop_1_count, ...).
 */
static struct etna_bo *
create_reshuffle_config(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation,
                        unsigned tp_core, unsigned tp_cores_used)
{
   struct etna_context *ctx = etna_context(subgraph->base.context);
   unsigned tp_core_count = ctx->screen->info->npu.tp_core_count;
   struct etna_bo *bo = etna_bo_new(ctx->screen->dev,
                                    sizeof(struct etna_tp_params),
                                    DRM_ETNA_GEM_CACHE_WC);

   etna_bo_cpu_prep(bo, DRM_ETNA_PREP_WRITE);

   struct etna_tp_params *map = etna_bo_map(bo);

   set_default_tp_config(map);

   set_input_size(operation, map, tp_cores_used);

   map->in_image_stride = operation->input_width;
   map->in_image_slice = operation->input_width * operation->input_height;

   /* Negative 16-bit window starts (0xffff = -1, 0xfffe = -2) provide the
    * left/top padding for padding_same. */
   if (operation->padding_same && (operation->weight_width == 5 || operation->input_width < 8)) {
      if (operation->weight_width == 5 && operation->input_width < 8) {
         map->in_window_x_start = 0xfffe;
         map->in_window_y_start = 0xfffe;
      } else {
         map->in_window_x_start = 0xffff;
         map->in_window_y_start = 0xffff;
      }
   } else {
      map->in_window_x_start = 0x0;
      map->in_window_y_start = 0x0;
   }

   map->in_window_x_end = operation->input_width - 1;
   map->in_window_y_end = (operation->input_height / tp_cores_used) - 1;
   map->in_tile_x_size = operation->input_width;
   map->in_tile_x_inc = operation->input_width;

   /* Small single-channel images are processed whole; otherwise split the
    * tile rows across the cores. */
   if (operation->input_width <= 8 && operation->input_channels == 1) {
      map->in_tile_y_size = operation->input_height;
      map->in_tile_y_inc = operation->input_height;
   } else {
      map->in_tile_y_size = operation->input_height / tp_cores_used;
      map->in_tile_y_inc = operation->input_height / tp_cores_used;
   }

   /* Grow windows/tiles to cover the right/bottom padding overfetch. */
   if (operation->padding_same) {
      switch(operation->weight_width) {
      case 3:
         map->in_window_x_end += 2;
         if (operation->input_width < 8) {
            map->in_tile_x_size += 3;
            map->in_tile_y_size += 1;
            map->in_tile_y_inc += 1;
         } else {
            map->in_tile_x_size += 2;
         }
         break;
      case 5:
         map->in_window_x_end += 3;
         if (operation->input_width < 8) {
            map->in_tile_x_size += 5;
         } else {
            map->in_tile_x_size += 4;
         }
         break;
      default:
         unreachable("Unsupported weight size");
      }

      if (operation->input_width <= 8 && operation->input_channels == 1 && operation->weight_width >= 5)
         map->in_tile_x_size = operation->input_width / tp_cores_used + 2;

      if (operation->input_width > 8 && operation->input_channels == 1) {
         switch(operation->weight_width) {
         case 3:
            map->in_window_y_end = (operation->input_height / tp_cores_used) + 1;
            break;
         case 5:
            map->in_window_y_end = (operation->input_height / tp_cores_used);
            break;
         default:
            unreachable("Unsupported weight size");
         }
      } else
         map->in_window_y_end = map->in_window_x_end;

      map->in_tile_x_inc = map->in_tile_x_size;

      if (operation->input_channels > 1) {
         map->in_tile_y_size = map->in_tile_x_size;
         map->in_tile_y_inc = map->in_tile_x_size;
      } else {
         map->in_tile_y_size += 2;
         map->in_tile_y_inc += 2;
      }
   } else {
      if (operation->input_width < 8) {
         map->in_window_x_end += 1;
         map->in_window_y_end += 1;
         map->in_tile_x_size += 1;
         map->in_tile_y_size += 1;
         map->in_tile_x_inc += 1;
         map->in_tile_y_inc += 1;
      }
   }

   /* Point each core at its slice of the input tensor. */
   struct pipe_resource *input = etna_ml_get_tensor(subgraph, operation->input_tensor);
   map->in_image_base_address = etna_bo_gpu_va(etna_resource(input)->bo);

   if (operation->padding_same)
      map->in_image_base_address += ((operation->input_width * operation->input_height * operation->input_channels) / tp_cores_used) * tp_core;
   else
      map->in_image_base_address += (operation->input_width * (operation->input_height / tp_cores_used)) * tp_core;

   /* And at its slice of the output tensor. */
   struct pipe_resource *output = etna_ml_get_tensor(subgraph, operation->output_tensor);
   map->out_image_base_address = etna_bo_gpu_va(etna_resource(output)->bo);

   if (operation->padding_same)
      map->out_image_base_address += ((map->in_tile_x_size * map->in_tile_y_size * operation->input_channels) / tp_cores_used) * tp_core;
   else
      map->out_image_base_address += ((operation->input_width * operation->input_width) / (operation->stride * operation->stride * tp_cores_used)) * tp_core;

   /* Output loop nest: 2x2 destride pattern. out_loop_0_inc is the area of
    * one destrided plane (half-width rounded, squared). */
   map->out_loop_1_reset = 0x1;
   map->out_loop_2_reset = 0x0;
   map->out_loop_3_reset = 0x1;
   map->out_loop_0_inc = pow(round(operation->input_width / 2.0), 2);
   map->out_loop_1_inc = 0x1;
   map->out_loop_0_count = 0x2;
   map->out_loop_1_count = round(operation->input_width / 2.0);
   map->out_loop_2_count = 0x2;
   map->out_loop_3_count = DIV_ROUND_UP(round(operation->input_width / 2.0), tp_cores_used);

   if (operation->padding_same) {
      switch(operation->weight_width) {
      case 3:
         map->out_loop_0_inc = pow(round(operation->input_width / 2.0) + 1, 2);
         map->out_loop_1_count += 1;
         break;
      case 5:
         map->out_loop_0_inc = pow(round(operation->input_width / 2.0) + 2, 2);
         map->out_loop_1_count += 2;
         break;
      default:
         unreachable("Unsupported weight size");
      }

      if (operation->input_channels == 1)
         map->out_loop_3_count += 1;
      else
         map->out_loop_3_count = map->out_loop_1_count;
   }

   map->out_loop_2_inc = map->out_loop_0_inc * 2;
   map->out_loop_3_inc = map->out_loop_1_count;
   map->out_loop_6_inc = map->out_loop_0_inc * 4;

   /* Multi-core single-channel padding: all cores except the first skip the
    * two rows of top padding already handled by their predecessor. */
   if (operation->padding_same && tp_cores_used > 1 && operation->input_channels == 1) {
      if (tp_core > 0) {
         map->in_image_y_size -= 2;
         map->in_window_y_end -= 2;
         map->in_tile_y_size -= 2;
         map->in_tile_y_inc -= 2;
         map->out_loop_3_count -= 1;
      }

      if (tp_core == tp_core_count - 1) {
         map->in_image_y_size -= 2;
      }

      if (tp_core > 0) {
         map->in_image_base_address += operation->input_width * 2;
         map->out_image_base_address -= (tp_core - 1) * (round(operation->input_width / 2.0) + 1);
      }
   }

   /* alu_reorder_bits_used = number of bits needed to represent the padded
    * row length. */
   unsigned alu_size = operation->input_width;
   if (operation->padding_same) {
      alu_size += 1;
      if (operation->weight_width == 5)
         alu_size += 1;
      if (operation->input_width == 5)
         alu_size += 1;
   }

   map->alu_reorder_bits_used = sizeof(alu_size) * 8 - __builtin_clz(alu_size);

   map->in_zp = operation->input_zero_point;
   map->out_zp = operation->input_zero_point;

   /* Every core except the last defers the flush so the cores chain. */
   if (tp_cores_used > 1)
      map->no_flush = tp_core < tp_cores_used - 1;

   map->in_image_circular_buf_size = 0x0;
   map->in_image_circular_buf_end_address_plus_1 = 0xFFFFFFFF >> 6;
   map->out_image_circular_buf_size = 0x0;
   map->out_image_circular_buf_end_address_plus_1 = 0xFFFFFFFF >> 6;

   /* Degenerate split (fewer than 2 rows per core): refold the tensor into a
    * taller image and recompute the base addresses. */
   if (map->in_image_y_size < 2) {
      map->in_image_y_size = operation->input_width;
      map->in_image_z_size = (operation->input_width * operation->input_height * operation->input_channels) / (map->in_image_x_size * map->in_image_y_size) / tp_cores_used;
      map->in_window_y_end = operation->input_width;
      map->in_tile_y_size = operation->input_width + 1;
      map->in_tile_y_inc = operation->input_width + 1;
      map->out_loop_3_count += 1;

      map->in_image_base_address = etna_bo_gpu_va(etna_resource(input)->bo);
      map->in_image_base_address += ((operation->input_width * operation->input_height * operation->input_channels) / tp_cores_used) * tp_core;

      /* NOTE(review): out_image_base_address is derived from the *input* BO
       * here, unlike the earlier assignment that used the output BO —
       * confirm whether this is intentional or a copy-paste slip. */
      map->out_image_base_address = etna_bo_gpu_va(etna_resource(input)->bo);
      map->out_image_base_address += ((map->in_tile_x_size * map->in_tile_y_size * operation->input_channels) / tp_cores_used) * tp_core;
   }

   etna_bo_cpu_fini(bo);

   return bo;
}
610
611 void
etna_ml_lower_transpose(struct etna_ml_subgraph * subgraph,const struct pipe_ml_operation * first_operation,struct etna_operation * operation,unsigned * output_tensor)612 etna_ml_lower_transpose(struct etna_ml_subgraph *subgraph,
613 const struct pipe_ml_operation *first_operation,
614 struct etna_operation *operation,
615 unsigned *output_tensor)
616 {
617 operation->type = ETNA_JOB_TYPE_TP;
618 operation->tp_type = ETNA_ML_TP_TRANSPOSE;
619
620 operation->input_tensor = first_operation->input_tensor->index;
621 operation->input_width = first_operation->input_tensor->dims[1];
622 operation->input_height = first_operation->input_tensor->dims[2];
623 operation->input_channels = first_operation->input_tensor->dims[3];
624 operation->input_zero_point = first_operation->input_tensor->zero_point;
625 operation->input_scale = first_operation->input_tensor->scale;
626 operation->input_tensor_size = operation->input_width *
627 operation->input_height *
628 operation->input_channels;
629
630 *output_tensor = etna_ml_allocate_tensor(subgraph);
631 operation->output_tensor = *output_tensor;
632 operation->output_width = first_operation->input_tensor->dims[1];
633 operation->output_height = first_operation->input_tensor->dims[2];
634 operation->output_channels = first_operation->input_tensor->dims[3];
635 operation->output_zero_point = first_operation->input_tensor->zero_point;
636 operation->output_scale = first_operation->input_tensor->scale;
637 }
638
639 void
etna_ml_lower_detranspose(struct etna_ml_subgraph * subgraph,struct etna_operation * convolution,struct etna_operation * operation)640 etna_ml_lower_detranspose(struct etna_ml_subgraph *subgraph,
641 struct etna_operation *convolution,
642 struct etna_operation *operation)
643 {
644 operation->type = ETNA_JOB_TYPE_TP;
645 operation->tp_type = ETNA_ML_TP_DETRANSPOSE;
646
647 operation->input_tensor = etna_ml_allocate_tensor(subgraph);
648 operation->input_width = convolution->output_width;
649 operation->input_height = convolution->output_height;
650 operation->input_channels = convolution->output_channels;
651 operation->input_zero_point = convolution->output_zero_point;
652 operation->input_scale = convolution->output_scale;
653 operation->input_tensor_size = operation->input_width *
654 operation->input_height *
655 operation->input_channels;
656
657 operation->output_tensor = convolution->output_tensor;
658 operation->output_width = convolution->output_width;
659 operation->output_height = convolution->output_height;
660 operation->output_channels = convolution->output_channels;
661 operation->output_zero_point = convolution->output_zero_point;
662 operation->output_scale = convolution->output_scale;
663 }
664
665 void
etna_ml_lower_reshuffle(struct etna_ml_subgraph * subgraph,const struct pipe_ml_operation * convolution,struct etna_operation * operation,unsigned * output_tensor)666 etna_ml_lower_reshuffle(struct etna_ml_subgraph *subgraph,
667 const struct pipe_ml_operation *convolution,
668 struct etna_operation *operation,
669 unsigned *output_tensor)
670 {
671 operation->type = ETNA_JOB_TYPE_TP;
672 operation->tp_type = ETNA_ML_TP_RESHUFFLE;
673 operation->stride = convolution->conv.stride_x;
674 operation->padding_same = convolution->conv.padding_same;
675
676 operation->input_tensor = convolution->input_tensor->index;
677 operation->input_width = convolution->input_tensor->dims[1];
678 operation->input_height = convolution->input_tensor->dims[2];
679 operation->input_channels = convolution->input_tensor->dims[3];
680 operation->input_zero_point = convolution->input_tensor->zero_point;
681 operation->input_scale = convolution->input_tensor->scale;
682 operation->input_tensor_size = operation->input_width *
683 operation->input_height *
684 operation->input_channels;
685
686 *output_tensor = etna_ml_allocate_tensor(subgraph);
687 operation->output_tensor = *output_tensor;
688 operation->output_width = DIV_ROUND_UP(operation->input_width, operation->stride);
689 operation->output_height = DIV_ROUND_UP(operation->input_height, operation->stride);
690 operation->output_channels = operation->input_channels * operation->stride * operation->stride;
691 operation->output_zero_point = convolution->input_tensor->zero_point;
692 operation->output_scale = convolution->input_tensor->scale;
693
694 /* When destriding a convolution, the transformation to be made to the input
695 * tensor will depend on the size of the weight tensor.
696 */
697 operation->weight_width = convolution->conv.weight_tensor->dims[1];
698 operation->weight_height = convolution->conv.weight_tensor->dims[2];
699
700 if (operation->padding_same) {
701 if (operation->weight_width == 5) {
702 operation->output_width += 2;
703 operation->output_height += 2;
704 } else {
705 operation->output_width += 1;
706 operation->output_height += 1;
707 }
708 }
709 }
710
711 void
etna_ml_compile_operation_tp(struct etna_ml_subgraph * subgraph,const struct etna_operation * operation,struct etna_vip_instruction * instruction)712 etna_ml_compile_operation_tp(struct etna_ml_subgraph *subgraph,
713 const struct etna_operation *operation,
714 struct etna_vip_instruction *instruction)
715 {
716 struct etna_context *ctx = etna_context(subgraph->base.context);
717 struct pipe_resource *input = etna_ml_get_tensor(subgraph, operation->input_tensor);
718 assert(input);
719 pipe_resource_reference(&instruction->input, input);
720
721 struct pipe_resource *output = etna_ml_get_tensor(subgraph, operation->output_tensor);
722 assert(output);
723 pipe_resource_reference(&instruction->output, output);
724
725 switch (operation->tp_type) {
726 case ETNA_ML_TP_TRANSPOSE:
727 instruction->configs[0] = create_transpose_config(subgraph, operation);
728 break;
729 case ETNA_ML_TP_DETRANSPOSE:
730 instruction->configs[0] = create_detranspose_config(subgraph, operation);
731 break;
732 case ETNA_ML_TP_RESHUFFLE: {
733 unsigned tp_core_count = ctx->screen->info->npu.tp_core_count;
734 unsigned tp_cores_used;
735
736 tp_cores_used = (operation->input_width > 8 || operation->input_channels > 1) ? tp_core_count : 1;
737
738 /* TODO: Run among the 4 cores for faster performance */
739 if ((operation->input_width == 320 || operation->input_width == 224) &&
740 operation->input_channels == 3)
741 tp_cores_used = 1;
742
743 ML_DBG("reshuffle: input_width %d tp_cores_used %d\n", operation->input_width, tp_cores_used);
744 for (unsigned i = 0; i < tp_cores_used; i++) {
745 instruction->configs[i] = create_reshuffle_config(subgraph, operation, i, tp_cores_used);
746 }
747 break;
748 }
749 }
750 instruction->type = ETNA_JOB_TYPE_TP;
751 }
752
/* Emit the command-stream state that launches one TP instruction, writing
 * one VIVS_PS_TP_INST_ADDR relocation per populated per-core config BO.
 */
void
etna_ml_emit_operation_tp(struct etna_ml_subgraph *subgraph,
                          struct etna_vip_instruction *operation,
                          unsigned idx)
{
   struct etna_context *ctx = etna_context(subgraph->base.context);
   unsigned tp_core_count = ctx->screen->info->npu.tp_core_count;
   struct etna_cmd_stream *stream = ctx->stream;
   /* configs[1] being set means the job was split across multiple cores. */
   bool more_than_one_tp_job = operation->configs[1] != NULL;
   bool parallel = DBG_ENABLED(ETNA_DBG_NPU_PARALLEL);

   for (unsigned j = 0; j < tp_core_count && operation->configs[j]; j++) {
      /* NOTE(review): this value is placed in the reloc's offset field but
       * looks like it encodes chaining/flag bits (0x1, 0x1f) rather than a
       * byte offset into the BO — confirm against the register docs. */
      unsigned offset = parallel ? idx + 1 : 0;

      /* Every core of a multi-core job except the last gets the chaining
       * value instead. */
      if (more_than_one_tp_job && (j < tp_core_count - 1))
         offset = parallel ? 0x1f : 0x1;

      etna_set_state(stream, VIVS_GL_OCB_REMAP_START, 0x0);
      etna_set_state(stream, VIVS_GL_OCB_REMAP_END, 0x0);
      etna_set_state(stream, VIVS_GL_TP_CONFIG, 0x0);
      etna_set_state_reloc(stream, VIVS_PS_TP_INST_ADDR, &(struct etna_reloc) {
         .bo = operation->configs[j],
         .flags = ETNA_RELOC_READ,
         .offset = offset,
      });
   }
   etna_set_state(stream, VIVS_PS_UNK10A4, parallel ? idx + 1 : 0x0);
}
781