1 /*
2 * Copyright (c) 2023-2024 Tomeu Vizoso <[email protected]>
3 * SPDX-License-Identifier: MIT
4 */
5
6 #include "util/u_inlines.h"
7
8 #include "etnaviv_context.h"
9 #include "etnaviv_debug.h"
10 #include "etnaviv_emit.h"
11 #include "etnaviv_ml_nn.h"
12
13 #define ETNA_NN_INT8 0
14
15 #define SRAM_CACHE_MODE_NO_CACHE 0x0
16 #define SRAM_CACHE_MODE_FULL_CACHE 0x1
17 #define SRAM_CACHE_MODE_PARTIAL_CACHE 0x2
18
19 enum pooling_type {
20 ETNA_NN_POOLING_NON,
21 ETNA_NN_POOLING_MAX,
22 ETNA_NN_POOLING_AVG,
23 ETNA_NN_POOLING_FIRST_PIXEL
24 };
25
26 #define FIELD(field, bits) uint32_t field : bits;
27
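/*
 * NN layer descriptor written to the BO consumed by the NN cores, mapped as C
 * bitfields. The numbered comments below give the 32-bit word each group of
 * fields lands in (the first, unnumbered group is word 0). Shift/mask notes
 * on a field indicate how a wider value is split across fields, e.g.
 * kernel_z_size and kernel_z_size2 hold bits 0-13 and 14-19 of the kernel
 * depth.
 */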
28 struct etna_nn_params {
29
30 FIELD(layer_type, 1) /* conv: 0 fully_connected: 1 */
31 FIELD(no_z_offset, 1)
32 FIELD(kernel_xy_size, 4)
33 FIELD(kernel_z_size, 14) /* & 0x3FFF */
34 FIELD(kernels_per_core, 7)
35 FIELD(pooling, 2)
36 FIELD(pooling_xy_size, 1)
37 FIELD(prelu, 1)
38 FIELD(nn_layer_flush, 1)
39
40 /* 1 */
41 FIELD(kernel_data_type, 2) /* UINT8 0x2 INT8 0x0 */
42 FIELD(in_image_data_type, 2) /* UINT8 0x2 INT8 0x0 */
43 FIELD(out_image_data_type, 2) /* UINT8 0x2 INT8 0x0 */
44 FIELD(in_image_x_size, 13)
45 FIELD(in_image_y_size, 13)
46
47 /* 2 */
48 FIELD(in_image_x_offset, 3)
49 FIELD(in_image_y_offset, 3)
50 FIELD(unused0, 1)
51 FIELD(brick_mode, 1)
52 FIELD(brick_distance, 16)
53 FIELD(relu, 1)
54 FIELD(unused1, 1)
55 FIELD(post_multiplier, 1)
56 FIELD(post_shift, 5)
57
58 /* 3 */
59 FIELD(unused2, 3)
60 FIELD(no_flush, 1)
61 FIELD(unused3, 2)
62 FIELD(out_image_x_size, 13)
63 FIELD(out_image_y_size, 13)
64
65 /* 4 */
66 /* Changes based on gcFEATURE_VALUE_NN_INIMAGE_OFFSET_BITS == 4 */
67 FIELD(out_image_z_size, 14)
68 FIELD(rounding_mode, 2)
69 FIELD(in_image_x_offset_bit_3, 1) /* >> 3 & 0x1 */
70 FIELD(in_image_y_offset_bit_3, 1) /* >> 3 & 0x1 */
71 FIELD(out_image_tile_x_size, 7)
72 FIELD(out_image_tile_y_size, 7)
73
74 /* 5 */
75 FIELD(kernel_address, 26) /* >> 6 */
76 FIELD(kernel_z_size2, 6) /* >> 14 & 0x3F */
77
78 /* 6 */
79 FIELD(in_image_address, 32)
80
81 /* 7 */
82 FIELD(out_image_address, 32)
83
84 /* 8 */
85 FIELD(image_caching_mode, 2)
86 FIELD(kernel_caching_mode, 2)
87 FIELD(partial_cache_data_unit, 2)
88 FIELD(kernel_pattern_msb, 6)
89 FIELD(kernel_y_size, 4)
90 FIELD(out_image_y_stride, 16)
91
92 /* 9 */
93 FIELD(kernel_pattern_low, 32)
94
95 /* 10 */
96 FIELD(kernel_pattern_high, 32)
97
98 /* 11 */
99 FIELD(kernel_cache_start_address, 32)
100
101 /* 12 */
102 FIELD(kernel_cache_end_address, 32)
103
104 /* 13 */
105 FIELD(image_cache_start_address, 32)
106
107 /* 14 */
108 FIELD(image_cache_end_address, 32)
109
110 /* 15 */
111 FIELD(in_image_border_mode, 2)
112 FIELD(in_image_border_const, 16)
113 FIELD(unused4, 1)
114 FIELD(kernel_data_type_bit_2, 1)
115 FIELD(in_image_data_type_bit_2, 1)
116 FIELD(out_image_data_type_bit_2, 1)
117 FIELD(post_multiplier_1_to_6, 6)
118 FIELD(post_shift_bit_5_6, 2)
119 FIELD(unused5, 2)
120
121 /* 16 */
122 FIELD(in_image_x_stride, 16)
123 FIELD(in_image_y_stride, 16)
124
125 /* 17 */
126 FIELD(out_image_x_stride, 16)
127 FIELD(unused6, 8)
128 FIELD(post_multiplier_7_to_14, 8)
129
130 /* 18 */
131 FIELD(out_image_circular_buf_size, 26) /* >> 6 */
132 FIELD(per_channel_post_mul, 1)
133 FIELD(unused7, 5)
134
135 /* 19 */
136 FIELD(out_image_circular_buf_end_addr_plus_1, 26) /* >> 6 */
137 FIELD(unused8, 6)
138
139 /* 20 */
140 FIELD(in_image_circular_buf_size, 26) /* >> 6 */
141 FIELD(unused9, 6)
142
143 /* 21 */
144 FIELD(in_image_circular_buf_end_addr_plus_1, 26) /* >> 6 */
145 FIELD(unused10, 6)
146
147 /* 22 */
148 FIELD(coef_zero_point, 8)
149 FIELD(out_zero_point, 8)
150 FIELD(kernel_direct_stream_from_VIP_sram, 1)
151 FIELD(depthwise, 1)
152 FIELD(post_multiplier_15_to_22, 8)
153 FIELD(unused11, 6)
154
/* 23, from here on the words aren't set */
156 FIELD(unused12, 32)
157
158 /* 24 */
159 FIELD(unused13, 4)
160 FIELD(unused14, 28) /* 0 >> 4 */
161
162 /* 25 */
163 FIELD(unused15, 4)
164 FIELD(unused16, 28) /* 0 >> 4 */
165
166 /* 26 */
167 FIELD(further1, 32)
168 FIELD(further2, 32)
169 FIELD(further3, 32)
170 FIELD(further4, 32)
171 FIELD(further5, 32)
172 FIELD(further6, 32)
173 FIELD(further7, 32)
174 FIELD(further8, 32)
175 };
176
177 static void *
map_resource(struct pipe_resource *resource)
179 {
180 return etna_bo_map(etna_resource(resource)->bo);
181 }
182
183
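/*
 * Pad 1x1 kernels on a single input channel out to 2x2: only the top-left tap
 * keeps the real weight, the remaining three are the weight zero point and so
 * contribute nothing.
 */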
184 static void
pointwise_to_2x2(struct etna_ml_subgraph *subgraph, struct etna_operation *operation)
186 {
187 /* Fill a Nx2x2xN tensor with zero_points */
188 struct pipe_context *context = subgraph->base.context;
189 uint8_t *input = map_resource(operation->weight_tensor);
190 unsigned new_size = operation->output_channels * 2 * 2 * operation->input_channels;
191 struct pipe_resource *output_res = pipe_buffer_create(context->screen, 0, PIPE_USAGE_DEFAULT,
192 new_size);
193 uint8_t *output = map_resource(output_res);
194
195 for (unsigned channel = 0; channel < operation->output_channels; channel++) {
196 uint8_t *map_in = input + channel * 1 * 1 * operation->input_channels;
197 uint8_t *map_out = output + channel * 2 * 2 * operation->input_channels;
198
199 map_out[0] = map_in[0];
200 map_out[1] = operation->weight_zero_point;
201 map_out[2] = operation->weight_zero_point;
202 map_out[3] = operation->weight_zero_point;
203 }
204
205 pipe_resource_reference(&operation->weight_tensor, NULL);
206 operation->weight_tensor = output_res;
207
208 operation->weight_width = operation->weight_height = 2;
209 operation->pointwise = false;
210 }
211
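/*
 * Expand a depthwise weight tensor into a dense one: each output channel gets
 * a full kernel where only the taps of its own input channel keep their
 * weights and every other position is the weight zero point, so the operation
 * can run as a regular convolution.
 */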
212 static void
expand_depthwise(struct etna_ml_subgraph *subgraph, struct etna_operation *operation)
214 {
215 struct pipe_context *context = subgraph->base.context;
216 uint8_t *input = map_resource(operation->weight_tensor);
217 unsigned new_size = operation->output_channels * operation->weight_width * operation->weight_height * operation->input_channels;
218 struct pipe_resource *output_res = pipe_buffer_create(context->screen, 0, PIPE_USAGE_DEFAULT,
219 new_size);
220 uint8_t *output = map_resource(output_res);
221
/* Lower depthwise convolution to regular convolution, as the hardware doesn't support it natively */
223 for (unsigned channel = 0; channel < operation->output_channels; channel++) {
224 unsigned in_channel = channel / operation->output_channels;
225 unsigned in_depth = channel % operation->output_channels;
226
227 uint8_t *map_in = input + in_channel * operation->weight_width * operation->weight_height * operation->input_channels;
228 uint8_t *map_out = output + channel * operation->weight_width * operation->weight_height * operation->input_channels;
229
230 for (unsigned i = 0; i < operation->weight_width * operation->weight_height * operation->input_channels; i++) {
231 if (i % operation->input_channels == in_depth)
232 map_out[i] = map_in[i];
233 else
234 map_out[i] = operation->weight_zero_point;
235 }
236 }
237
238 pipe_resource_reference(&operation->weight_tensor, NULL);
239 operation->weight_tensor = output_res;
240 }
241
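/*
 * Reorder the weights from channel-last (out, x, y, in) to the (out, in, x, y)
 * layout expected by the coefficient encoders below. For additions the tensor
 * is treated as 1 output channel by 2 input channels.
 */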
242 static void
transpose(struct etna_ml_subgraph *subgraph, struct etna_operation *operation)
244 {
245 struct pipe_context *context = subgraph->base.context;
246 void *map = map_resource(operation->weight_tensor);
247 unsigned new_size = operation->output_channels * operation->weight_width * \
248 operation->weight_height * operation->input_channels;
249 struct pipe_resource *output_res = pipe_buffer_create(context->screen, 0, PIPE_USAGE_DEFAULT,
250 new_size);
251 uint8_t *output = map_resource(output_res);
252 unsigned output_channels = operation->output_channels;
253 unsigned input_channels = operation->input_channels;
254
255 if (operation->addition) {
256 output_channels = 1;
257 input_channels = 2;
258 }
259
260 uint8_t (*input)[operation->weight_width][operation->weight_height][input_channels] = map;
261 unsigned i = 0;
262 for (unsigned d0 = 0; d0 < output_channels; d0++)
263 for (unsigned d3 = 0; d3 < input_channels; d3++)
264 for (unsigned d1 = 0; d1 < operation->weight_width; d1++)
265 for (unsigned d2 = 0; d2 < operation->weight_height; d2++)
266 ((uint8_t*)output)[i++] = input[d0][d1][d2][d3];
267
268 pipe_resource_reference(&operation->weight_tensor, NULL);
269 operation->weight_tensor = output_res;
270 }
271
272 static void
subsample(uint8_t *map_in, unsigned in_width, unsigned in_height, unsigned in_depth, unsigned out_width, unsigned out_height, unsigned in_z, unsigned offset_x, unsigned offset_y, unsigned stride, uint8_t *map_out, int in_zp)
274 {
275 uint8_t (*in)[in_height][in_depth] = (uint8_t(*)[in_height][in_depth])map_in;
276 uint8_t (*out)[out_height] = (uint8_t(*)[out_height])map_out;
277
278 for(unsigned x = 0; x < out_width; x++)
279 for(unsigned y = 0; y < out_height; y++) {
280 unsigned in_x = x * stride + offset_x;
281 unsigned in_y = y * stride + offset_y;
282 if (in_x < in_width && in_y < in_height)
283 out[x][y] = in[in_x][in_y][in_z];
284 else
285 out[x][y] = in_zp;
286 }
287 }
288
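/*
 * The stride-2 lowering below splits every WxH weight plane into four
 * ceil(W/2) x ceil(H/2) planes sampled at the (0,0), (0,1), (1,0) and (1,1)
 * sub-grids, so input channel z becomes output channels 4*z .. 4*z+3;
 * out-of-range taps are filled with the zero point.
 */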
289 /* TODO: Do the reshaping in the TP units, for big enough buffers */
290 static void
reshape(uint8_t *input, uint8_t *output, unsigned stride, int in_zp, unsigned dims_in[4], unsigned dims_out[4])
292 {
293 for (unsigned out_channel = 0; out_channel < dims_in[0]; out_channel++) {
294 void *map_in = input + out_channel * dims_in[1] * dims_in[2] * dims_in[3];
295 void *map_out = output + out_channel * dims_out[1] * dims_out[2] * dims_out[3];
296
297 /* See Figure 3 in https://arxiv.org/abs/1712.02502 */
298 /* This is only valid for stride == 2 */
299 assert(stride == 2);
300 uint8_t (*out)[dims_out[1]][dims_out[2]] = (uint8_t(*)[dims_out[1]][dims_out[2]])map_out;
301 for (unsigned z = 0; z < dims_in[3]; z++) {
302 subsample(map_in, dims_in[1], dims_in[2], dims_in[3], dims_out[1], dims_out[2], z, 0, 0, stride, (uint8_t *)out[0 + z * stride * stride], in_zp);
303 subsample(map_in, dims_in[1], dims_in[2], dims_in[3], dims_out[1], dims_out[2], z, 0, 1, stride, (uint8_t *)out[1 + z * stride * stride], in_zp);
304 subsample(map_in, dims_in[1], dims_in[2], dims_in[3], dims_out[1], dims_out[2], z, 1, 0, stride, (uint8_t *)out[2 + z * stride * stride], in_zp);
305 subsample(map_in, dims_in[1], dims_in[2], dims_in[3], dims_out[1], dims_out[2], z, 1, 1, stride, (uint8_t *)out[3 + z * stride * stride], in_zp);
306 }
307 }
308 }
309
310 static void
strided_to_normal(struct etna_ml_subgraph *subgraph, struct etna_operation *operation)
312 {
313 struct pipe_context *context = subgraph->base.context;
314 uint8_t *input = map_resource(operation->weight_tensor);
315 unsigned new_size;
316 struct pipe_resource *output_res;
317 uint8_t *output;
318
319 /* The hardware doesn't support strides natively, so we "lower" them as
320 * described in this paper:
321 *
322 * "Take it in your stride: Do we need striding in CNNs?" https://arxiv.org/abs/1712.02502
323 */
324
325 /* TODO: Support more strides */
326 assert(operation->stride == 2);
327
328 unsigned wdims_in[4] = {operation->output_channels,
329 operation->weight_width,
330 operation->weight_height,
331 operation->input_channels};
332
333 operation->input_channels = operation->input_channels * operation->stride * operation->stride;
334 operation->input_width = DIV_ROUND_UP(operation->input_width, operation->stride);
335 operation->input_height = DIV_ROUND_UP(operation->input_height, operation->stride);
336
337 if (operation->padding_same) {
338 if (operation->weight_width == 5) {
339 operation->input_width += 2;
340 operation->input_height += 2;
341 } else {
342 operation->input_width += 1;
343 operation->input_height += 1;
344 }
345 }
346
347 operation->weight_width = DIV_ROUND_UP(operation->weight_width, operation->stride);
348 operation->weight_height = DIV_ROUND_UP(operation->weight_height, operation->stride);
349
350 new_size = operation->output_channels * operation->weight_width * operation->weight_height * operation->input_channels;
351 output_res = pipe_buffer_create(context->screen, 0, PIPE_USAGE_DEFAULT, new_size);
352 output = map_resource(output_res);
353
354 unsigned wdims_out[4] = {operation->output_channels, operation->weight_width, operation->weight_height, operation->input_channels};
355 reshape(input, output, operation->stride, operation->weight_zero_point, wdims_in, wdims_out);
356
357 pipe_resource_reference(&operation->weight_tensor, NULL);
358 operation->weight_tensor = output_res;
359 }
360
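/*
 * Lower a gallium convolution into an etna_operation, applying the weight
 * transformations above (1x1-to-2x2 padding, depthwise expansion, stride
 * lowering, transposition) so that later stages only see plain convolutions.
 */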
361 void
etna_ml_lower_convolution(struct etna_ml_subgraph *subgraph,
363 const struct pipe_ml_operation *poperation,
364 struct etna_operation *operation)
365 {
366 /* TODO: Support stride_x != stride_y */
367 assert(poperation->conv.stride_x == poperation->conv.stride_y);
368 assert(poperation->type == PIPE_ML_OPERATION_TYPE_CONVOLUTION);
369
370 operation->type = ETNA_JOB_TYPE_NN;
371 operation->addition = false;
372 operation->depthwise = poperation->conv.depthwise;
373 operation->pointwise = poperation->conv.pointwise;
374 operation->pooling_first_pixel = poperation->conv.stride_x > 1 && \
375 (poperation->conv.depthwise || poperation->conv.pointwise);
376 operation->padding_same = poperation->conv.padding_same;
377 operation->stride = poperation->conv.stride_x;
378
379 operation->input_tensor = poperation->input_tensor->index;
380 operation->input_width = poperation->input_tensor->dims[1];
381 operation->input_height = poperation->input_tensor->dims[2];
382 operation->input_channels = poperation->input_tensor->dims[3];
383 operation->input_zero_point = poperation->input_tensor->zero_point;
384 operation->input_scale = poperation->input_tensor->scale;
385
386 operation->output_tensor = poperation->output_tensor->index;
387 operation->output_width = poperation->output_tensor->dims[1];
388 operation->output_height = poperation->output_tensor->dims[2];
389 operation->output_channels = poperation->output_tensor->dims[3];
390 operation->output_zero_point = poperation->output_tensor->zero_point;
391 operation->output_scale = poperation->output_tensor->scale;
392
393 pipe_resource_reference(&operation->weight_tensor, poperation->conv.weight_tensor->resource);
394 operation->weight_width = poperation->conv.weight_tensor->dims[1];
395 operation->weight_height = poperation->conv.weight_tensor->dims[2];
396 operation->weight_zero_point = poperation->conv.weight_tensor->zero_point;
397 operation->weight_scale = poperation->conv.weight_tensor->scale;
398
399 pipe_resource_reference(&operation->bias_tensor, poperation->conv.bias_tensor->resource);
400
401 if (operation->pointwise && operation->input_channels == 1)
402 pointwise_to_2x2(subgraph, operation);
403
404 if (operation->depthwise && (operation->output_channels > 1 || operation->stride > 1)) {
405
406 if (operation->input_width < 8 && operation->input_width > 2)
407 operation->pooling_first_pixel = false;
408
409 expand_depthwise(subgraph, operation);
410 }
411
412 if (operation->stride > 1 && !operation->pooling_first_pixel)
413 strided_to_normal(subgraph, operation); /* This will already transpose if input_channels > 1 */
414 else if (operation->input_channels > 1)
415 transpose(subgraph, operation);
416
417 operation->input_tensor_size = operation->input_width *
418 operation->input_height *
419 operation->input_channels;
420 ML_DBG("%dx%dx%d\n", operation->input_width, operation->input_height, operation->input_channels);
421 }
422
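/*
 * Additions are run through the convolution path: etna_ml_lower_add() below
 * sets up a 2x2 kernel over a two-channel input covering both operands. The
 * helpers here derive that kernel's quantization parameters; roughly, with
 * s1/zp1 from the add operand and s2/zp2 from the first input:
 *   weight_scale = max(s1 / s2, 1) / 255
 *   weight       = round(1 / weight_scale)
 *   bias         = round((zp2 - zp1) * s1 / (weight_scale * s2))
 *                  - round((s1 / s2) / weight_scale) * zp2
 */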
423 static float
compute_weight_scale_add(float input1_scale, float input2_scale)
425 {
426 double scale_ratio = input1_scale / input2_scale;
427
428 return (float) MAX2(scale_ratio, 1.0) / 255.0;
429 }
430
431 static uint8_t
compute_addition_offset(float input1_scale, float input2_scale, float weight_scale)
433 {
434 double addition_offset = input1_scale / input2_scale;
435 addition_offset /= weight_scale;
436 return round(addition_offset + 0.0) * 1;
437 }
438
439 static uint8_t
compute_weight_add(float input1_scale, float input2_scale, float weight_scale)
441 {
442 double weight = 1.0 / weight_scale;
443 return round(weight + 0.0);
444 }
445
446 static uint32_t
compute_bias_add(float input1_scale, float input2_scale, uint8_t input1_zp, uint8_t input2_zp, float weight_scale)
448 {
449 int zero_point_diff = input2_zp - input1_zp;
450 double bias = zero_point_diff * input1_scale;
451 bias /= weight_scale * input2_scale;
452
453 double addition_offset = input1_scale / input2_scale;
454 addition_offset /= weight_scale;
455 addition_offset = round(addition_offset + 0.0) * 1;
456
457 return (int) (round(bias) - round(addition_offset) * input2_zp);
458 }
459
460 void
etna_ml_lower_add(struct etna_ml_subgraph *subgraph,
462 const struct pipe_ml_operation *poperation,
463 struct etna_operation *operation)
464 {
465 struct pipe_context *context = subgraph->base.context;
466
467 assert(poperation->type == PIPE_ML_OPERATION_TYPE_ADD);
468
469 operation->addition = true;
470 operation->depthwise = false;
471 operation->pointwise = false;
472 operation->pooling_first_pixel = false;
473 operation->padding_same = false;
474 operation->stride = 1;
475
476 operation->input_tensor = poperation->input_tensor->index;
477 operation->add_input_tensor = poperation->add.input_tensor->index;
478 operation->input_width = poperation->input_tensor->dims[1];
479 operation->input_height = poperation->input_tensor->dims[2];
480 operation->input_channels = poperation->input_tensor->dims[3];
481 operation->input_zero_point = poperation->input_tensor->zero_point;
482 operation->input_scale = poperation->input_tensor->scale;
483 operation->input_tensor_size = operation->input_width *
484 operation->input_height *
485 operation->input_channels *
486 2;
487
488 operation->output_tensor = poperation->output_tensor->index;
489 operation->output_width = poperation->output_tensor->dims[1];
490 operation->output_height = poperation->output_tensor->dims[2];
491 operation->output_channels = poperation->output_tensor->dims[3];
492 operation->output_zero_point = poperation->output_tensor->zero_point;
493 operation->output_scale = poperation->output_tensor->scale;
494
495 operation->weight_tensor = pipe_buffer_create(context->screen, 0, PIPE_USAGE_DEFAULT, 8);
496 operation->weight_width = 2;
497 operation->weight_height = 2;
498 operation->weight_zero_point = 0x0;
499 operation->weight_scale = compute_weight_scale_add(poperation->add.input_tensor->scale, poperation->input_tensor->scale);
500 operation->addition_offset = compute_addition_offset(poperation->add.input_tensor->scale, poperation->input_tensor->scale, operation->weight_scale);
501
502 uint8_t *weight_map = map_resource(operation->weight_tensor);
503 memset(weight_map, 0, pipe_buffer_size(operation->weight_tensor));
504 weight_map[0] = compute_weight_add(poperation->add.input_tensor->scale, poperation->input_tensor->scale, operation->weight_scale);
505
506 operation->bias_tensor = pipe_buffer_create(context->screen, 0, PIPE_USAGE_DEFAULT, 4);
507 int32_t *bias_map = map_resource(operation->bias_tensor);
508 bias_map[0] = compute_bias_add(poperation->add.input_tensor->scale, poperation->input_tensor->scale,
509 poperation->add.input_tensor->zero_point, poperation->input_tensor->zero_point,
510 operation->weight_scale);
511 }
512
513 #define MAX_TILE_WIDTH 64
514
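/*
 * Split the kernels assigned to each NN core into superblocks, bounded by the
 * accumulation buffer depth for the chosen tile height and interleave mode,
 * and by the 7-bit kernels-per-core field (max 127).
 */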
515 static unsigned
calc_superblocks(struct etna_context *ctx, const struct etna_operation *operation, unsigned tile_y, unsigned interleave_mode)
517 {
518 unsigned nn_core_count = ctx->screen->info->npu.nn_core_count;
519 unsigned nn_accum_buffer_depth = ctx->screen->info->npu.nn_accum_buffer_depth;
520 unsigned output_channels = operation->addition ? 1 : operation->output_channels;
521 unsigned kernels_per_core = DIV_ROUND_UP(output_channels, nn_core_count);
522 unsigned foo = (nn_accum_buffer_depth * interleave_mode) / tile_y;
523
524 if (operation->weight_width == 1)
525 foo = MIN2(foo, nn_accum_buffer_depth / 3);
526
527 foo = MIN2(foo, kernels_per_core);
528 foo = MIN2(foo, 127);
529
530 kernels_per_core = DIV_ROUND_UP(output_channels, nn_core_count * foo);
531 unsigned num_kernels = DIV_ROUND_UP(output_channels, kernels_per_core * nn_core_count);
532 unsigned superblocks = DIV_ROUND_UP(DIV_ROUND_UP(output_channels, nn_core_count), num_kernels);
533
534 return superblocks;
535 }
536
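/*
 * Heuristic for how many output rows can be interleaved per input-buffer
 * line: narrower tiles allow more interleaving (up to 8 rows), limited by how
 * much of the 64-pixel line the tile plus the kernel overlap occupies.
 */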
537 static unsigned
calc_interleave_mode(unsigned tile_width, unsigned weight_height)
539 {
540 unsigned mode = 8;
541
542 if (weight_height - 1 + tile_width > (MAX_TILE_WIDTH + 8) / 2)
543 return 1;
544
545 if (tile_width > MAX_TILE_WIDTH / 2)
546 mode = 1;
547 else if (tile_width > MAX_TILE_WIDTH / 4)
548 mode = 2;
549 else if (tile_width > MAX_TILE_WIDTH / 8)
550 mode = 4;
551
552 if (weight_height - 1 + tile_width > (MAX_TILE_WIDTH + 8) / 4)
553 return MIN2(mode, 4);
554
555 return MIN2(mode, 2);
556 }
557
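/*
 * For additions, recast the operands as a two-channel image: pick the largest
 * width out of 128/64/32 (or the biggest divisor below 64) that evenly
 * divides the channel size and fold the rest into the height, with one output
 * channel.
 */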
558 static void
calc_addition_sizes(unsigned *input_width, unsigned *input_height, unsigned *input_channels,
560 unsigned *output_width, unsigned *output_height, unsigned *output_channels)
561 {
562 ML_DBG("addition input width %d channels %d\n", *input_width, *input_channels);
563
564 unsigned channel_size = *input_width * *input_height;
565 unsigned width = 0;
566 if (channel_size % 128 == 0)
567 width = 128;
568 else if (channel_size % 64 == 0)
569 width = 64;
570 else if (channel_size % 32 == 0)
571 width = 32;
572 else {
573 for (int i = 63; i > 0; i--) {
574 if (channel_size % i == 0) {
575 width = i;
576 break;
577 }
578 }
579 }
580
581 *input_height = (*input_width * *input_height * *input_channels) / width;
582 *input_width = width;
583 *input_channels = 2;
584
585 *output_height = *output_width * *output_height * *output_channels / width;
586 *output_width = width;
587 *output_channels = 1;
588 }
589
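/*
 * Choose the output tile size for this operation: the tile width is capped at
 * 64 pixels and the tile height by the input and accumulation buffer depths
 * (kept even for strided operations), then the superblock count is derived
 * from it.
 */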
590 static unsigned
calculate_tiling(struct etna_context *ctx, const struct etna_operation *operation, unsigned *tile_width_out, unsigned *tile_height_out)
592 {
593 unsigned nn_input_buffer_depth = ctx->screen->info->npu.nn_input_buffer_depth;
594 unsigned nn_accum_buffer_depth = ctx->screen->info->npu.nn_accum_buffer_depth;
595 unsigned input_width = operation->input_width;
596 unsigned input_height = operation->input_height;
597 unsigned input_channels = operation->input_channels;
598 unsigned output_width = operation->output_width;
599 unsigned output_height = operation->output_height;
600 unsigned output_channels = operation->output_channels;
601 unsigned tile_width;
602 unsigned tile_height;
603 unsigned superblocks;
604 unsigned interleave_mode;
605
606 if (operation->addition)
607 calc_addition_sizes(&input_width, &input_height, &input_channels,
608 &output_width, &output_height, &output_channels);
609
610 if (operation->pooling_first_pixel) {
611 output_width *= 2;
612 output_height *= 2;
613 }
614
615 tile_width = MIN2(output_width, 64);
616 interleave_mode = calc_interleave_mode(tile_width, operation->weight_height);
617
618 tile_height = nn_input_buffer_depth * interleave_mode - operation->weight_height + 1;
619 tile_height = MIN2(tile_height, interleave_mode * nn_accum_buffer_depth);
620 tile_height = MIN2(tile_height, output_height);
621
622 if (operation->stride > 1 && tile_height % 2 > 0)
623 tile_height -= 1;
624
625 tile_height = MAX2(tile_height, 1);
626 superblocks = calc_superblocks(ctx, operation, tile_height, interleave_mode);
627
628 if (tile_width_out)
629 *tile_width_out = tile_width;
630
631 if (tile_height_out)
632 *tile_height_out = tile_height;
633
634 return superblocks;
635 }
636
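/*
 * Fill the etna_nn_params descriptor for one NN job in its own BO: image and
 * kernel addresses, padding offsets, tiling, requantization parameters and
 * the split of the on-chip SRAM between the kernel and image caches.
 */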
637 static struct etna_bo *
create_nn_config(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, struct etna_bo *coefficients, unsigned coef_cache_size)
639 {
640 struct pipe_context *context = subgraph->base.context;
641 struct etna_context *ctx = etna_context(context);
642 unsigned nn_core_count = ctx->screen->info->npu.nn_core_count;
643 unsigned nn_core_version = ctx->screen->specs.nn_core_version;
644 unsigned oc_sram_size = ctx->screen->info->npu.on_chip_sram_size;
645 struct etna_bo *bo = etna_bo_new(ctx->screen->dev,
646 sizeof(struct etna_nn_params),
647 DRM_ETNA_GEM_CACHE_WC);
648 unsigned input_width = operation->input_width;
649 unsigned input_height = operation->input_height;
650 unsigned input_channels = operation->input_channels;
651 unsigned output_width = operation->output_width;
652 unsigned output_height = operation->output_height;
653 unsigned output_channels = operation->output_channels;
654 unsigned weight_width = operation->weight_width;
655 unsigned weight_height = operation->weight_height;
656
657 if (operation->pointwise && input_channels == 1)
658 weight_width = weight_height = 2;
659
660 if (operation->addition)
661 calc_addition_sizes(&input_width, &input_height, &input_channels,
662 &output_width, &output_height, &output_channels);
663
664 etna_bo_cpu_prep(bo, DRM_ETNA_PREP_WRITE);
665
666 struct etna_nn_params *map = etna_bo_map(bo);
667 map->layer_type = 0x0;
668 map->no_z_offset = 0x0;
669 map->prelu = 0x0;
670 map->nn_layer_flush = 0x1;
671 map->brick_mode = 0x0;
672 map->brick_distance = 0x0;
673 map->relu = 0x0;
674 map->no_flush = 0x0;
675 map->rounding_mode = 0x1;
676 map->partial_cache_data_unit = 0x0;
677 map->depthwise = 0x0;
678
679 map->unused0 = 0x0;
680 map->unused1 = 0x0;
681 map->unused2 = 0x0;
682 map->unused3 = 0x0;
683 map->unused4 = 0x0;
684 map->unused5 = 0x0;
685 map->unused6 = 0x0;
686 map->unused7 = 0x0;
687 map->unused8 = 0x0;
688 map->unused9 = 0x0;
689 map->unused10 = 0x0;
690 map->unused11 = 0x0;
691 map->unused12 = 0x0;
692 map->unused13 = 0x0;
693 map->unused14 = 0x0;
694 map->further1 = 0x0;
695 map->further2 = 0x0;
696 map->further3 = 0x3ffffff;
697 map->further4 = 0x7f800000;
698 map->further5 = 0xff800000;
699 map->further6 = 0x0;
700 map->further7 = 0x0;
701 map->further8 = 0x0;
702
703 struct pipe_resource *input = etna_ml_get_tensor(subgraph, operation->input_tensor);
704 unsigned offset = etna_ml_get_offset(subgraph, operation->input_tensor);
705 map->in_image_address = etna_bo_gpu_va(etna_resource(input)->bo) + offset;
706 map->in_image_x_size = input_width;
707 map->in_image_y_size = input_height;
708 map->in_image_x_stride = input_width;
709 map->in_image_y_stride = input_height;
710 map->in_image_data_type = ETNA_NN_INT8;
711 map->in_image_data_type_bit_2 = ETNA_NN_INT8 >> 2;
712 map->in_image_circular_buf_size = 0x0;
713 map->in_image_circular_buf_end_addr_plus_1 = 0xFFFFFFFF >> 6;
714 map->in_image_border_mode = 0x0;
715 map->in_image_border_const = operation->input_zero_point;
716
717 if (operation->padding_same && operation->stride == 1 && weight_width > 2) {
718 if (weight_width < 5) {
719 map->in_image_x_offset = 0x7;
720 map->in_image_y_offset = 0x7;
721 } else {
722 map->in_image_x_offset = 0x6;
723 map->in_image_y_offset = 0x6;
724 }
725 map->in_image_x_offset_bit_3 = 0x1;
726 map->in_image_y_offset_bit_3 = 0x1;
727 } else {
728 map->in_image_x_offset = 0x0;
729 map->in_image_y_offset = 0x0;
730 map->in_image_x_offset_bit_3 = 0x0;
731 map->in_image_y_offset_bit_3 = 0x0;
732 }
733
734 if (operation->padding_same && operation->stride == 2 && weight_width == 5) {
735 map->in_image_x_offset = 0x7;
736 map->in_image_y_offset = 0x7;
737 map->in_image_x_offset_bit_3 = 0x1;
738 map->in_image_y_offset_bit_3 = 0x1;
739 }
740
741 struct pipe_resource *output = etna_ml_get_tensor(subgraph, operation->output_tensor);
742 offset = etna_ml_get_offset(subgraph, operation->output_tensor);
743 map->out_image_address = etna_bo_gpu_va(etna_resource(output)->bo) + offset;
744 map->out_image_x_size = output_width;
745 map->out_image_y_size = output_height;
746 map->out_image_z_size = output_channels;
747
748 map->out_image_x_stride = map->out_image_x_size;
749 map->out_image_y_stride = map->out_image_y_size;
750
751 map->out_image_data_type = ETNA_NN_INT8;
752 map->out_image_data_type_bit_2 = ETNA_NN_INT8 >> 2;
753 map->out_image_circular_buf_size = 0x0;
754 map->out_image_circular_buf_end_addr_plus_1 = 0xFFFFFFFF >> 6;
755 map->out_zero_point = operation->output_zero_point;
756
757 if (operation->pooling_first_pixel) {
758 map->pooling = ETNA_NN_POOLING_FIRST_PIXEL;
759 map->pooling_xy_size = 0x0;
760
761 map->out_image_x_size *= 2;
762 map->out_image_y_size *= 2;
763 } else {
764 map->pooling = ETNA_NN_POOLING_NON;
765 map->pooling_xy_size = 0x1;
766 }
767
768 unsigned tile_x, tile_y;
769 unsigned superblocks = calculate_tiling(ctx, operation, &tile_x, &tile_y);
770 map->out_image_tile_x_size = tile_x;
771 map->out_image_tile_y_size = tile_y;
772
773 map->kernel_address = etna_bo_gpu_va(coefficients) >> 6;
774 map->kernel_xy_size = weight_width;
775 map->kernel_y_size = weight_height;
776 map->kernel_z_size = input_channels;
777 map->kernel_z_size2 = 0x0;
778 map->kernel_data_type = ETNA_NN_INT8;
779 map->kernel_data_type_bit_2 = ETNA_NN_INT8 >> 2;
780 map->kernel_direct_stream_from_VIP_sram = 0x0;
781
782 map->coef_zero_point = operation->weight_zero_point;
783
784 map->kernels_per_core = DIV_ROUND_UP(DIV_ROUND_UP(output_channels, nn_core_count), superblocks);
785
786 unsigned image_cache_size;
787 if (superblocks == 1) {
788 /* No point in caching the input image if there is only one iteration */
789 image_cache_size = 0;
790 } else {
791 unsigned in_image_tile_x_size = map->out_image_tile_x_size + weight_width - 1;
792 unsigned in_image_tile_y_size = map->out_image_tile_y_size + weight_width - 1;
793 image_cache_size = in_image_tile_x_size * in_image_tile_y_size;
794 image_cache_size = ALIGN(image_cache_size, 16);
795 image_cache_size *= input_channels;
796 image_cache_size = ALIGN(image_cache_size, 128);
797 }
798
799 ML_DBG("coefficients_size 0x%x (%d) image_size 0x%x (%d)\n", coef_cache_size, coef_cache_size, image_cache_size, image_cache_size);
800
801 map->kernel_cache_start_address = 0x800;
802
803 /* Get all the image tiles in the cache, then use the rest for the kernels */
804 if (map->kernel_cache_start_address + coef_cache_size + image_cache_size < oc_sram_size) {
805 map->kernel_caching_mode = SRAM_CACHE_MODE_FULL_CACHE;
806 map->kernel_pattern_msb = 0x0;
807 map->kernel_pattern_low = 0x0;
808 map->kernel_pattern_high = 0x0;
809 map->kernel_cache_end_address = MAX2(MIN2(ALIGN(map->kernel_cache_start_address + coef_cache_size, 128), oc_sram_size), 0xa00);
810 } else {
811 /* Doesn't fit in the 512KB we have of on-chip SRAM */
812 map->kernel_caching_mode = SRAM_CACHE_MODE_PARTIAL_CACHE;
813 if (map->out_image_z_size >= 1024) {
814 map->kernel_pattern_msb = 0x13;
815 map->kernel_pattern_low = 0x80000;
816 map->kernel_pattern_high = 0x0;
817 } else if (map->out_image_z_size >= 512) {
818 map->kernel_pattern_msb = 0x3d;
819 map->kernel_pattern_low = 0x0;
820 map->kernel_pattern_high = 0x2aaaaaa0;
821 } else if (map->out_image_z_size >= 256) {
822 map->kernel_pattern_msb = 0x3e;
823 map->kernel_pattern_low = 0xffffaaaa;
824 map->kernel_pattern_high = 0x7fffffff;
825 } else if (map->out_image_z_size >= 160) {
826 map->kernel_pattern_msb = 0x6;
827 map->kernel_pattern_low = 0x7e;
828 map->kernel_pattern_high = 0x0;
829 } else {
830 map->kernel_pattern_msb = 0x3f;
831 map->kernel_pattern_low = 0xfffffffe;
832 map->kernel_pattern_high = 0xffffffff;
833 }
834 if (map->kernel_cache_start_address + coef_cache_size >= oc_sram_size) {
835 map->kernel_cache_end_address = oc_sram_size;
836 image_cache_size = 0;
837 } else if (image_cache_size > oc_sram_size) {
838 image_cache_size = 0;
839 } else
840 map->kernel_cache_end_address = oc_sram_size - image_cache_size;
841 }
842
843 if (image_cache_size == 0) {
844 map->image_caching_mode = SRAM_CACHE_MODE_NO_CACHE;
845 map->image_cache_start_address = 0x0;
846 map->image_cache_end_address = 0x800;
847 } else {
848 map->image_caching_mode = SRAM_CACHE_MODE_FULL_CACHE;
849 if (image_cache_size >= map->kernel_cache_start_address) {
850 map->image_cache_start_address = map->kernel_cache_end_address;
851 map->image_cache_end_address = MIN2(map->image_cache_start_address + image_cache_size, oc_sram_size);
852 ML_DBG("image_cache_end_address %d image_cache_start_address %d image_cache_size %d oc_sram_size %d\n", map->image_cache_end_address, map->image_cache_start_address, image_cache_size, oc_sram_size);
853 } else {
854 map->image_cache_start_address = 0x0;
855 map->image_cache_end_address = 0x800;
856 }
857 }
858
859 float conv_scale = (operation->input_scale * operation->weight_scale) / operation->output_scale;
860 uint32_t scale_bits = fui(conv_scale);
861 /* Taken from https://github.com/pytorch/QNNPACK/blob/master/src/qnnpack/requantization.h#L130 */
862 unsigned shift = 127 + 31 - 32 - (scale_bits >> 23);
863 if (nn_core_version == 8)
864 shift += 1;
865 else
866 shift += 16;
867
868 /* Divides by 2 * (post_shift - 18), rounding to nearest integer. If result doesn't fit in 8 bits, it is clamped to 255. galcore sets to 15 if INT8, to 0 if UINT8. */
869 map->post_shift = shift & 0x1f;
870 map->post_shift_bit_5_6 = (shift >> 5) & 0x3;
871
872 /* Multiplies by (multiplier * 2^15) */
873 if (nn_core_version == 8) {
874 map->post_multiplier = scale_bits & 0x1;
875 map->post_multiplier_1_to_6 = (scale_bits >> 1) & 0x3f;
876 map->post_multiplier_7_to_14 = (scale_bits >> 7) & 0xff;
877 map->post_multiplier_15_to_22 = (scale_bits >> 15) & 0xff;
878 } else {
879 map->post_multiplier = (scale_bits >> 8) & 0x1;
880 map->post_multiplier_1_to_6 = (scale_bits >> 9) & 0x3f;
881 map->post_multiplier_7_to_14 = (scale_bits >> 15) & 0xff;
882 }
883
884 map->per_channel_post_mul = 0x0;
885
886 etna_bo_cpu_fini(bo);
887
888 return bo;
889 }
890
static uint32_t calculate_bias_correction(uint8_t *weights, const struct etna_operation *operation)
892 {
893 int32_t correction = 0;
894
895 for (unsigned i = 0; i < operation->weight_width * operation->weight_height * operation->input_channels; i++) {
896 correction += (weights[i] - operation->weight_zero_point) * operation->input_zero_point;
897 }
898
899 return correction;
900 }
901
902
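/*
 * Bitstream helper for the compressed coefficient format: packs values
 * LSB-first into 32-bit words. When do_write is false nothing is stored but
 * the destination pointer still advances, which is how calculate_zrl_bits()
 * measures the compressed size without writing it.
 */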
903 static void
append_bits(uint32_t value, size_t size, unsigned *bits_in_buffer, uint64_t *buffer, uint32_t **dest, bool do_write)
905 {
906 *buffer |= (uint64_t)value << *bits_in_buffer;
907 *bits_in_buffer += size;
908 if (*bits_in_buffer >= 32) {
909 if (do_write)
910 **dest = *buffer & 0xffffffff;
911 *dest += 1;
912 *buffer >>= 32;
913 *bits_in_buffer -= 32;
914 }
915 }
916
917 struct wb_stream {
918 unsigned zero_point;
919 unsigned zrl_bits;
920 unsigned *bits_in_buffer;
921 uint64_t *buffer;
922 uint32_t **map;
923 bool do_write;
924
925 unsigned accum_zeroes;
926 };
927
928 static void
wb_stream_flush_zeroes(struct wb_stream *wb_stream)
930 {
931 if (wb_stream->accum_zeroes == 0)
932 return;
933
934 append_bits(wb_stream->accum_zeroes - 1, wb_stream->zrl_bits, wb_stream->bits_in_buffer, wb_stream->buffer, wb_stream->map, wb_stream->do_write);
935 wb_stream->accum_zeroes = 0;
936 append_bits(wb_stream->zero_point, 8, wb_stream->bits_in_buffer, wb_stream->buffer, wb_stream->map, wb_stream->do_write);
937 }
938
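/*
 * Zero-run-length encoding of the weight stream: consecutive weight
 * zero-point bytes are accumulated and emitted as a zrl_bits-wide run length
 * before the next literal byte; with zrl_bits == 0 every byte is written
 * verbatim.
 */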
939 static void
wb_stream_write(struct wb_stream *wb_stream, unsigned value)
941 {
942 unsigned max_zeroes = (1 << wb_stream->zrl_bits) - 1;
943
944 if (wb_stream->zrl_bits == 0) {
945 append_bits(value, 8, wb_stream->bits_in_buffer, wb_stream->buffer, wb_stream->map, wb_stream->do_write);
946 return;
947 }
948
949 if (wb_stream->accum_zeroes == max_zeroes) {
950 append_bits(max_zeroes, wb_stream->zrl_bits, wb_stream->bits_in_buffer, wb_stream->buffer, wb_stream->map, wb_stream->do_write);
951 wb_stream->accum_zeroes = 0;
952 append_bits(value, 8, wb_stream->bits_in_buffer, wb_stream->buffer, wb_stream->map, wb_stream->do_write);
953 return;
954 }
955
956 if (value == wb_stream->zero_point) {
957 wb_stream->accum_zeroes++;
958 return;
959 }
960
961 append_bits(wb_stream->accum_zeroes, wb_stream->zrl_bits, wb_stream->bits_in_buffer, wb_stream->buffer, wb_stream->map, wb_stream->do_write);
962 wb_stream->accum_zeroes = 0;
963 append_bits(value, 8, wb_stream->bits_in_buffer, wb_stream->buffer, wb_stream->map, wb_stream->do_write);
964 }
965
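/*
 * Per-core coefficient stream encoders. Each stream starts with the zrl bit
 * width and the kernels-per-core count, then carries the weights together
 * with a 32-bit bias (corrected for the input zero-point) and a 32-bit output
 * offset per kernel. write_core_6() is used for pointwise kernels with more
 * than 8 output channels (weights grouped in runs of up to 6 input channels),
 * write_core_interleaved() when there is more than one input channel, and
 * write_core_sequential() otherwise. Called with map == NULL to only measure
 * the resulting size.
 */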
966 static unsigned
write_core_6(struct etna_ml_subgraph *subgraph, uint32_t *map, unsigned core, const struct etna_operation *operation, unsigned zrl_bits)
968 {
969 struct pipe_context *pctx = subgraph->base.context;
970 unsigned nn_core_count = etna_context(pctx)->screen->info->npu.nn_core_count;
971 unsigned input_channels = operation->addition ? 1 : operation->input_channels;
972 unsigned output_channels = operation->addition ? 1 : operation->output_channels;
973 unsigned cores_used = MIN2(output_channels, nn_core_count);
974 unsigned kernels_per_core = DIV_ROUND_UP(output_channels, cores_used);
975 uint8_t *input = map_resource(operation->weight_tensor);
976 uint32_t *biases = map_resource(operation->bias_tensor);
977 unsigned out_values_per_channel = operation->output_width * operation->output_height;
978 unsigned stride = MIN2(input_channels, 6);
979 unsigned superblocks = calculate_tiling(etna_context(pctx), operation, NULL, NULL);
980 uint8_t *weights_maps[DIV_ROUND_UP(kernels_per_core, superblocks)];
981 uint32_t *initial_ptr = map;
982 bool do_write = initial_ptr != NULL;
983 uint64_t buffer = 0;
984 unsigned bits_in_buffer = 0;
985 struct wb_stream wb_stream = {
986 .zero_point = operation->weight_zero_point,
987 .zrl_bits = zrl_bits,
988 .bits_in_buffer = &bits_in_buffer,
989 .buffer = &buffer,
990 .map = &map,
991 .do_write = do_write,
992 };
993
994 ML_DBG("%s core %d zrl_bits %d\n", __func__, core, zrl_bits);
995
996 append_bits(zrl_bits, 8, &bits_in_buffer, &buffer, &map, do_write);
997 append_bits(kernels_per_core, 16, &bits_in_buffer, &buffer, &map, do_write);
998
999 for (unsigned superblock = 0; superblock < superblocks; superblock++) {
1000
1001 unsigned kernels_in_superblock = DIV_ROUND_UP(kernels_per_core, superblocks);
1002 if (superblock == superblocks - 1)
1003 kernels_in_superblock = kernels_per_core - kernels_in_superblock * (superblocks - 1);
1004
1005 for (unsigned kernel = 0; kernel < kernels_in_superblock; kernel++) {
1006 unsigned out_channel = core * kernels_in_superblock + kernel + superblock * DIV_ROUND_UP(kernels_per_core, superblocks) * cores_used;
1007 weights_maps[kernel] = input + out_channel * operation->weight_width * operation->weight_height * input_channels;
1008 }
1009
1010 for (unsigned block = 0; block < DIV_ROUND_UP(input_channels, stride); block++) {
1011 for (unsigned kernel = 0; kernel < kernels_in_superblock; kernel++) {
1012 unsigned out_channel = core * kernels_in_superblock + kernel + superblock * DIV_ROUND_UP(kernels_per_core, superblocks) * cores_used;
1013
1014 if (block == 0) {
1015 wb_stream_write(&wb_stream, weights_maps[kernel][0]);
1016
1017 uint32_t corr = calculate_bias_correction(weights_maps[kernel], operation);
1018 wb_stream_flush_zeroes(&wb_stream);
1019 append_bits(biases[out_channel] - corr, 32, &bits_in_buffer, &buffer, &map, do_write);
1020
1021 for (int i = 1; i < stride; i++) {
1022 wb_stream_write(&wb_stream, weights_maps[kernel][i]);
1023 }
1024 } else {
1025 for (int i = 0; i < stride; i++) {
1026 if (i + block * stride < input_channels)
1027 wb_stream_write(&wb_stream, weights_maps[kernel][i + block * stride]);
1028 }
1029 }
1030 if (block == DIV_ROUND_UP(input_channels, stride) - 1) {
1031 wb_stream_flush_zeroes(&wb_stream);
1032 append_bits(out_values_per_channel * out_channel, 32, &bits_in_buffer, &buffer, &map, do_write);
1033 }
1034 }
1035 }
1036 }
1037
1038 wb_stream_flush_zeroes(&wb_stream);
1039
1040 if (bits_in_buffer > 0)
1041 append_bits(0, 32 - bits_in_buffer, &bits_in_buffer, &buffer, &map, do_write);
1042
1043 return (uint8_t *)map - (uint8_t *)initial_ptr - 1;
1044 }
1045
1046 static unsigned
write_core_interleaved(struct etna_ml_subgraph *subgraph, uint32_t *map, unsigned core, const struct etna_operation *operation, unsigned zrl_bits)
1048 {
1049 struct pipe_context *pctx = subgraph->base.context;
1050 unsigned nn_core_count = etna_context(pctx)->screen->info->npu.nn_core_count;
1051 unsigned input_channels = operation->addition ? 1 : operation->input_channels;
1052 unsigned output_channels = operation->addition ? 1 : operation->output_channels;
1053 unsigned cores_used = MIN2(output_channels, nn_core_count);
1054 unsigned kernels_per_core = DIV_ROUND_UP(output_channels, cores_used);
1055 uint8_t *input = map_resource(operation->weight_tensor);
1056 uint32_t *biases = map_resource(operation->bias_tensor);
1057 unsigned out_values_per_channel = operation->output_width * operation->output_height;
1058 unsigned superblocks = calculate_tiling(etna_context(pctx), operation, NULL, NULL);
1059 uint8_t (*weights_map)[input_channels][operation->weight_width][operation->weight_height] = (void *)input;
1060 uint32_t *initial_ptr = map;
1061 bool do_write = initial_ptr != NULL;
1062 uint64_t buffer = 0;
1063 unsigned bits_in_buffer = 0;
1064 struct wb_stream wb_stream = {
1065 .zero_point = operation->weight_zero_point,
1066 .zrl_bits = zrl_bits,
1067 .bits_in_buffer = &bits_in_buffer,
1068 .buffer = &buffer,
1069 .map = &map,
1070 .do_write = do_write,
1071 };
1072
1073 ML_DBG("%s core %d zrl_bits %d map %p\n", __func__, core, zrl_bits, map);
1074
1075 append_bits(zrl_bits, 8, &bits_in_buffer, &buffer, &map, do_write);
1076 append_bits(kernels_per_core, 16, &bits_in_buffer, &buffer, &map, do_write);
1077
1078 for (unsigned superblock = 0; superblock < superblocks; superblock++) {
1079
1080 unsigned kernels_in_superblock = DIV_ROUND_UP(kernels_per_core, superblocks);
1081 if (superblock == superblocks - 1)
1082 kernels_in_superblock = kernels_per_core - kernels_in_superblock * (superblocks - 1);
1083
1084 for (unsigned z = 0; z < input_channels; z++) {
1085 for (unsigned kernel = 0; kernel < kernels_in_superblock; kernel++) {
1086 unsigned out_channel = core * kernels_in_superblock + kernel + superblock * DIV_ROUND_UP(kernels_per_core, superblocks) * cores_used;
1087
1088 for (unsigned block = 0; block < DIV_ROUND_UP(operation->weight_width, 2); block++) {
1089 unsigned stride = operation->weight_height;
1090 if (operation->weight_height > 3)
1091 stride = 3;
1092 for (unsigned x = block * 2; x < (block + 1) * 2; x++ ) {
1093 if (x >= operation->weight_width)
1094 break;
1095 for (unsigned y = 0; y < stride; y++) {
1096 wb_stream_write(&wb_stream, weights_map[out_channel][z][x][y]);
1097 if (x == 0 && y == 0 && z == 0) {
1098 uint32_t corr = calculate_bias_correction((uint8_t *)weights_map[out_channel], operation);
1099 wb_stream_flush_zeroes(&wb_stream);
1100 append_bits(biases[out_channel] - corr, 32, &bits_in_buffer, &buffer, &map, do_write);
1101 }
1102 }
1103 }
1104 if (operation->weight_height > 3) {
1105 for (unsigned x = block * 2; x < (block + 1) * 2; x++ ) {
1106 if (x >= operation->weight_width)
1107 break;
1108 for (unsigned y = stride; y < operation->weight_width; y++) {
1109 wb_stream_write(&wb_stream, weights_map[out_channel][z][x][y]);
1110 }
1111 }
1112 }
1113 }
1114
1115 if (z == input_channels - 1) {
1116 wb_stream_flush_zeroes(&wb_stream);
1117 append_bits(out_values_per_channel * out_channel, 32, &bits_in_buffer, &buffer, &map, do_write);
1118 }
1119 }
1120 if (superblock == superblocks - 1)
1121 wb_stream_flush_zeroes(&wb_stream);
1122 }
1123 }
1124
1125 wb_stream_flush_zeroes(&wb_stream);
1126
1127 if (bits_in_buffer > 0)
1128 append_bits(0, 32 - bits_in_buffer, &bits_in_buffer, &buffer, &map, do_write);
1129
1130 return (uint8_t *)map - (uint8_t *)initial_ptr;
1131 }
1132
1133 static unsigned
write_core_sequential(struct etna_ml_subgraph *subgraph, uint32_t *map, unsigned core, const struct etna_operation *operation, unsigned zrl_bits)
1135 {
1136 struct pipe_context *pctx = subgraph->base.context;
1137 unsigned nn_core_count = etna_context(pctx)->screen->info->npu.nn_core_count;
1138 unsigned output_channels = operation->addition ? 1 : operation->output_channels;
1139 unsigned cores_used = MIN2(output_channels, nn_core_count);
1140 unsigned kernels_per_core = DIV_ROUND_UP(output_channels, cores_used);
1141 uint8_t *input = map_resource(operation->weight_tensor);
1142 uint32_t *biases = map_resource(operation->bias_tensor);
1143 unsigned out_values_per_channel = operation->output_width * operation->output_height;
1144 unsigned superblocks = calculate_tiling(etna_context(pctx), operation, NULL, NULL);
1145 uint32_t *initial_ptr = map;
1146 bool do_write = initial_ptr != NULL;
1147 uint64_t buffer = 0;
1148 unsigned bits_in_buffer = 0;
1149 struct wb_stream wb_stream = {
1150 .zero_point = operation->weight_zero_point,
1151 .zrl_bits = zrl_bits,
1152 .bits_in_buffer = &bits_in_buffer,
1153 .buffer = &buffer,
1154 .map = &map,
1155 .do_write = do_write,
1156 };
1157
1158 ML_DBG("%s core %d zrl_bits %d superblocks %d\n", __func__, core, zrl_bits, superblocks);
1159
1160 append_bits(zrl_bits, 8, &bits_in_buffer, &buffer, &map, do_write);
1161 append_bits(kernels_per_core, 16, &bits_in_buffer, &buffer, &map, do_write);
1162
1163 for (unsigned superblock = 0; superblock < superblocks; superblock++) {
1164
1165 unsigned kernels_in_superblock = DIV_ROUND_UP(kernels_per_core, superblocks);
1166 if (superblock == superblocks - 1)
1167 kernels_in_superblock = kernels_per_core - kernels_in_superblock * (superblocks - 1);
1168
1169 for (unsigned kernel = 0; kernel < kernels_in_superblock; kernel++) {
1170 unsigned out_channel = core * kernels_in_superblock + kernel + superblock * DIV_ROUND_UP(kernels_per_core, superblocks) * cores_used;
1171
1172 uint8_t (*weights_map)[operation->weight_height] = (void*) input + out_channel * operation->weight_width * operation->weight_height;
1173
1174 for (unsigned block = 0; block < DIV_ROUND_UP(operation->weight_width, 2); block++) {
1175 unsigned stride = operation->weight_height;
1176 if ((operation->depthwise || operation->input_width > 64) && \
1177 operation->weight_height > 3)
1178 stride = 3;
1179 for (unsigned x = block * 2; x < (block + 1) * 2; x++ ) {
1180 if (x >= operation->weight_width)
1181 break;
1182 for (unsigned y = 0; y < stride; y++) {
1183
1184 wb_stream_write(&wb_stream, weights_map[x][y]);
1185 if (x == 0 && y == 0) {
1186 uint32_t corr = calculate_bias_correction((uint8_t *)weights_map, operation);
1187 wb_stream_flush_zeroes(&wb_stream);
1188 append_bits(biases[out_channel] - corr, 32, &bits_in_buffer, &buffer, &map, do_write);
1189 }
1190 }
1191 }
1192 if ((operation->depthwise || operation->input_width > 64) && \
1193 operation->weight_height > 3) {
1194 for (unsigned x = block * 2; x < (block + 1) * 2; x++ ) {
1195 if (x >= operation->weight_width)
1196 break;
1197 for (unsigned y = stride; y < operation->weight_width; y++) {
1198 wb_stream_write(&wb_stream, weights_map[x][y]);
1199 }
1200 }
1201 }
1202 }
1203 wb_stream_flush_zeroes(&wb_stream);
1204 if (operation->addition)
1205 append_bits(operation->addition_offset, 32, &bits_in_buffer, &buffer, &map, do_write);
1206 else
1207 append_bits(out_values_per_channel * out_channel, 32, &bits_in_buffer, &buffer, &map, do_write);
1208 }
1209 }
1210
1211 wb_stream_flush_zeroes(&wb_stream);
1212
1213 if (bits_in_buffer > 0)
1214 append_bits(0, 32 - bits_in_buffer, &bits_in_buffer, &buffer, &map, do_write);
1215
1216 return (uint8_t *)map - (uint8_t *)initial_ptr - 1;
1217 }
1218
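/*
 * Worst-case (uncompressed) size of the coefficient buffer: per core, one
 * byte of zrl header and a 16-bit kernel count, plus, for every kernel, its
 * weights, a 32-bit bias and a 32-bit output offset, with the header and the
 * per-core streams aligned to 64 bytes.
 */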
1219 static unsigned
calculate_weight_bo_size(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation)
1221 {
1222 struct pipe_context *context = subgraph->base.context;
1223 struct etna_context *ctx = etna_context(context);
1224 unsigned nn_core_count = ctx->screen->info->npu.nn_core_count;
1225 unsigned header_size = ALIGN(nn_core_count * 4, 64);
1226 unsigned input_channels = operation->addition ? 1 : operation->input_channels;
1227 unsigned output_channels = operation->addition ? 1 : operation->output_channels;
1228 unsigned cores_used = MIN2(output_channels, nn_core_count);
1229 unsigned kernels_per_core = DIV_ROUND_UP(output_channels, cores_used);
1230 unsigned weights_size;
1231 unsigned core_size;
1232 unsigned core_size_aligned;
1233 unsigned compressed_size_aligned;
1234
1235 weights_size = operation->weight_width * operation->weight_height * input_channels;
1236 core_size = 1 + 2 + (weights_size + 4 + 4) * kernels_per_core;
1237 core_size_aligned = ALIGN(core_size, 64);
1238 compressed_size_aligned = header_size + core_size_aligned * cores_used;
1239
1240 return compressed_size_aligned;
1241 }
1242
1243 static unsigned
calculate_zrl_bits(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation)
1245 {
1246 struct pipe_context *context = subgraph->base.context;
1247 struct etna_context *ctx = etna_context(context);
1248 unsigned nn_core_count = ctx->screen->info->npu.nn_core_count;
1249 unsigned max_zrl_bits = ctx->screen->info->npu.nn_zrl_bits;
1250 unsigned header_size = ALIGN(nn_core_count * 4, 64);
1251 unsigned input_channels = operation->addition ? 1 : operation->input_channels;
1252 unsigned output_channels = operation->addition ? 1 : operation->output_channels;
1253 unsigned cores_used = MIN2(output_channels, nn_core_count);
1254 unsigned best_compressed_size;
1255 unsigned best_zrl_bits;
1256
1257 /* These are very unlikely to have enough zeroes for compression to be useful. */
1258 if (operation->addition ||
1259 operation->pointwise) {
1260
1261 return 0;
1262 }
1263
1264 /* This calculation can be really slow. Start from max_zrl_bits as big
1265 * buffers will benefit the most from high zero compression.
1266 */
1267 best_compressed_size = UINT_MAX;
1268 best_zrl_bits = 0;
1269 for (int zrl_bits = max_zrl_bits; zrl_bits >= 0; zrl_bits--) {
1270
1271 unsigned compressed_size = header_size;
1272 for (unsigned core = 0; core < cores_used; core++) {
1273
1274 unsigned actual_size;
1275 if (operation->pointwise && output_channels > 8)
1276 actual_size = write_core_6(subgraph, NULL, core, operation, zrl_bits);
1277 else if (input_channels > 1)
1278 actual_size = write_core_interleaved(subgraph, NULL, core, operation, zrl_bits);
1279 else
1280 actual_size = write_core_sequential(subgraph, NULL, core, operation, zrl_bits);
1281
1282 compressed_size += actual_size;
1283 }
1284
1285 /* If more bits don't compress further, then stop */
1286 if (compressed_size <= best_compressed_size) {
1287 best_compressed_size = compressed_size;
1288 best_zrl_bits = zrl_bits;
1289 } else
1290 break;
1291 }
1292
1293 return best_zrl_bits;
1294 }
1295
1296 static struct etna_bo *
create_coefficients_bo(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, unsigned *cache_size)
1298 {
1299 struct pipe_context *context = subgraph->base.context;
1300 struct etna_context *ctx = etna_context(context);
1301 unsigned nn_core_count = ctx->screen->info->npu.nn_core_count;
1302 unsigned header_size = ALIGN(nn_core_count * 4, 64);
1303 unsigned input_channels = operation->addition ? 1 : operation->input_channels;
1304 unsigned output_channels = operation->addition ? 1 : operation->output_channels;
1305 unsigned cores_used = MIN2(output_channels, nn_core_count);
1306 unsigned zrl_bits;
1307 unsigned max_core_size = 0;
1308 unsigned bo_size;
1309
1310 bo_size = calculate_weight_bo_size(subgraph, operation);
1311 zrl_bits = calculate_zrl_bits(subgraph, operation);
1312
1313 struct etna_bo *compressed = etna_bo_new(ctx->screen->dev,
1314 bo_size,
1315 DRM_ETNA_GEM_CACHE_WC);
1316
1317 etna_bo_cpu_prep(compressed, DRM_ETNA_PREP_WRITE);
1318
1319 uint32_t *map = etna_bo_map(compressed);
1320 memset(map, 0, bo_size);
1321
1322 uint32_t *header = map;
1323 map += header_size / 4;
1324
1325 for (unsigned core = 0; core < cores_used; core++) {
1326
1327 unsigned actual_size;
1328 if (operation->pointwise && output_channels > 8)
1329 actual_size = write_core_6(subgraph, map, core, operation, zrl_bits);
1330 else if (input_channels > 1)
1331 actual_size = write_core_interleaved(subgraph, map, core, operation, zrl_bits);
1332 else
1333 actual_size = write_core_sequential(subgraph, map, core, operation, zrl_bits);
1334
1335 actual_size = ALIGN(actual_size, 64);
1336 max_core_size = MAX2(actual_size, max_core_size);
1337
1338 header[core] = actual_size;
1339
1340 map += actual_size / 4;
1341 }
1342
1343 etna_bo_cpu_fini(compressed);
1344
1345 *cache_size = max_core_size * cores_used;
1346
1347 return compressed;
1348 }
1349
1350 void
etna_ml_compile_operation_nn(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation,
1352 struct etna_vip_instruction *instruction)
1353 {
1354 unsigned coef_cache_size;
1355
1356 instruction->type = ETNA_JOB_TYPE_NN;
1357 instruction->coefficients = create_coefficients_bo(subgraph, operation, &coef_cache_size);
1358
1359 struct pipe_resource *input = etna_ml_get_tensor(subgraph, operation->input_tensor);
1360 assert(input);
1361 pipe_resource_reference(&instruction->input, input);
1362
1363 struct pipe_resource *output = etna_ml_get_tensor(subgraph, operation->output_tensor);
1364 assert(output);
1365 pipe_resource_reference(&instruction->output, output);
1366
1367 instruction->configs[0] = create_nn_config(subgraph, operation, instruction->coefficients, coef_cache_size);
1368 }
1369
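/*
 * Emit the command-stream state for one NN job: OCB remapping is disabled,
 * all NN cores are enabled and VIVS_PS_NN_INST_ADDR is pointed at the
 * descriptor BO. Without ETNA_DBG_NPU_PARALLEL, SMALL_BATCH is set and the
 * instruction offset is forced to 0.
 */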
1370 void
etna_ml_emit_operation_nn(struct etna_ml_subgraph *subgraph,
1372 struct etna_vip_instruction *operation,
1373 unsigned idx)
1374 {
1375 struct pipe_context *pctx = subgraph->base.context;
1376 struct etna_context *ctx = etna_context(pctx);
1377 struct etna_cmd_stream *stream = ctx->stream;
1378 unsigned offset = idx + 1;
1379 unsigned nn_config = VIVS_GL_NN_CONFIG_NN_CORE_COUNT(0x0); /* This disables power control of NN cores and enables all of them */
1380
1381 if (!DBG_ENABLED(ETNA_DBG_NPU_PARALLEL)) {
1382 nn_config |= VIVS_GL_NN_CONFIG_SMALL_BATCH;
1383 offset = 0;
1384 }
1385
1386 etna_set_state(stream, VIVS_GL_OCB_REMAP_START, 0x0);
1387 etna_set_state(stream, VIVS_GL_OCB_REMAP_END, 0x0);
1388
1389 etna_set_state(stream, VIVS_GL_NN_CONFIG, nn_config);
1390 etna_set_state_reloc(stream, VIVS_PS_NN_INST_ADDR, &(struct etna_reloc) {
1391 .bo = operation->configs[0],
1392 .flags = ETNA_RELOC_READ,
1393 .offset = offset,
1394 });
1395 etna_set_state(stream, VIVS_PS_UNK10A4, offset);
1396 }
1397