xref: /aosp_15_r20/external/mesa3d/src/gallium/drivers/etnaviv/etnaviv_ml_nn.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright (c) 2023-2024 Tomeu Vizoso <[email protected]>
3  * SPDX-License-Identifier: MIT
4  */
5 
6 #include "util/u_inlines.h"
7 
8 #include "etnaviv_context.h"
9 #include "etnaviv_debug.h"
10 #include "etnaviv_emit.h"
11 #include "etnaviv_ml_nn.h"
12 
13 #define ETNA_NN_INT8 0
14 
15 #define SRAM_CACHE_MODE_NO_CACHE 0x0
16 #define SRAM_CACHE_MODE_FULL_CACHE 0x1
17 #define SRAM_CACHE_MODE_PARTIAL_CACHE 0x2
18 
19 enum pooling_type {
20     ETNA_NN_POOLING_NON,
21     ETNA_NN_POOLING_MAX,
22     ETNA_NN_POOLING_AVG,
23     ETNA_NN_POOLING_FIRST_PIXEL
24 };
25 
26 #define FIELD(field, bits) uint32_t field : bits;
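/* FIELD(name, n) expands to "uint32_t name : n;", so struct etna_nn_params
 * below is a packed sequence of 32-bit words that create_nn_config() fills in
 * and the hardware reads directly; the numbered comments inside the struct
 * mark 32-bit word boundaries. */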
27 
28 struct etna_nn_params {
29 
30    FIELD(layer_type, 1) /* conv: 0 fully_connected: 1 */
31    FIELD(no_z_offset, 1)
32    FIELD(kernel_xy_size, 4)
33    FIELD(kernel_z_size, 14) /* & 0x3FFF */
34    FIELD(kernels_per_core, 7)
35    FIELD(pooling, 2)
36    FIELD(pooling_xy_size, 1)
37    FIELD(prelu, 1)
38    FIELD(nn_layer_flush, 1)
39 
40    /* 1 */
41    FIELD(kernel_data_type, 2) /* UINT8 0x2 INT8 0x0 */
42    FIELD(in_image_data_type, 2) /* UINT8 0x2 INT8 0x0 */
43    FIELD(out_image_data_type, 2) /* UINT8 0x2 INT8 0x0 */
44    FIELD(in_image_x_size, 13)
45    FIELD(in_image_y_size, 13)
46 
47    /* 2 */
48    FIELD(in_image_x_offset, 3)
49    FIELD(in_image_y_offset, 3)
50    FIELD(unused0, 1)
51    FIELD(brick_mode, 1)
52    FIELD(brick_distance, 16)
53    FIELD(relu, 1)
54    FIELD(unused1, 1)
55    FIELD(post_multiplier, 1)
56    FIELD(post_shift, 5)
57 
58    /* 3 */
59    FIELD(unused2, 3)
60    FIELD(no_flush, 1)
61    FIELD(unused3, 2)
62    FIELD(out_image_x_size, 13)
63    FIELD(out_image_y_size, 13)
64 
65    /* 4 */
66    /* Changes based on gcFEATURE_VALUE_NN_INIMAGE_OFFSET_BITS == 4 */
67    FIELD(out_image_z_size, 14)
68    FIELD(rounding_mode, 2)
69    FIELD(in_image_x_offset_bit_3, 1) /*  >> 3 & 0x1 */
70    FIELD(in_image_y_offset_bit_3, 1) /*  >> 3 & 0x1 */
71    FIELD(out_image_tile_x_size, 7)
72    FIELD(out_image_tile_y_size, 7)
73 
74    /* 5 */
75    FIELD(kernel_address, 26) /* >> 6 */
76    FIELD(kernel_z_size2, 6) /* >> 14 & 0x3F */
77 
78    /* 6 */
79    FIELD(in_image_address, 32)
80 
81    /* 7 */
82    FIELD(out_image_address, 32)
83 
84    /* 8 */
85    FIELD(image_caching_mode, 2)
86    FIELD(kernel_caching_mode, 2)
87    FIELD(partial_cache_data_unit, 2)
88    FIELD(kernel_pattern_msb, 6)
89    FIELD(kernel_y_size, 4)
90    FIELD(out_image_y_stride, 16)
91 
92    /* 9 */
93    FIELD(kernel_pattern_low, 32)
94 
95    /* 10 */
96    FIELD(kernel_pattern_high, 32)
97 
98    /* 11 */
99    FIELD(kernel_cache_start_address, 32)
100 
101    /* 12 */
102    FIELD(kernel_cache_end_address, 32)
103 
104    /* 13 */
105    FIELD(image_cache_start_address, 32)
106 
107    /* 14 */
108    FIELD(image_cache_end_address, 32)
109 
110    /* 15 */
111    FIELD(in_image_border_mode, 2)
112    FIELD(in_image_border_const, 16)
113    FIELD(unused4, 1)
114    FIELD(kernel_data_type_bit_2, 1)
115    FIELD(in_image_data_type_bit_2, 1)
116    FIELD(out_image_data_type_bit_2, 1)
117    FIELD(post_multiplier_1_to_6, 6)
118    FIELD(post_shift_bit_5_6, 2)
119    FIELD(unused5, 2)
120 
121    /* 16 */
122    FIELD(in_image_x_stride, 16)
123    FIELD(in_image_y_stride, 16)
124 
125    /* 17 */
126    FIELD(out_image_x_stride, 16)
127    FIELD(unused6, 8)
128    FIELD(post_multiplier_7_to_14, 8)
129 
130    /* 18 */
131    FIELD(out_image_circular_buf_size, 26) /* >> 6 */
132    FIELD(per_channel_post_mul, 1)
133    FIELD(unused7, 5)
134 
135    /* 19 */
136    FIELD(out_image_circular_buf_end_addr_plus_1, 26) /* >> 6 */
137    FIELD(unused8, 6)
138 
139    /* 20 */
140    FIELD(in_image_circular_buf_size, 26) /* >> 6 */
141    FIELD(unused9, 6)
142 
143    /* 21 */
144    FIELD(in_image_circular_buf_end_addr_plus_1, 26) /* >> 6 */
145    FIELD(unused10, 6)
146 
147    /* 22 */
148    FIELD(coef_zero_point, 8)
149    FIELD(out_zero_point, 8)
150    FIELD(kernel_direct_stream_from_VIP_sram, 1)
151    FIELD(depthwise, 1)
152    FIELD(post_multiplier_15_to_22, 8)
153    FIELD(unused11, 6)
154 
155    /* 23, from here on they aren't set */
156    FIELD(unused12, 32)
157 
158    /* 24 */
159    FIELD(unused13, 4)
160    FIELD(unused14, 28)  /* 0 >> 4 */
161 
162    /* 25 */
163    FIELD(unused15, 4)
164    FIELD(unused16, 28)  /* 0 >> 4 */
165 
166    /* 26 */
167    FIELD(further1, 32)
168    FIELD(further2, 32)
169    FIELD(further3, 32)
170    FIELD(further4, 32)
171    FIELD(further5, 32)
172    FIELD(further6, 32)
173    FIELD(further7, 32)
174    FIELD(further8, 32)
175 };
176 
177 static void *
178 map_resource(struct pipe_resource *resource)
179 {
180    return etna_bo_map(etna_resource(resource)->bo);
181 }
182 
183 
184 static void
185 pointwise_to_2x2(struct etna_ml_subgraph *subgraph, struct etna_operation *operation)
186 {
187    /* Fill an Nx2x2xM tensor (N output channels, M input channels) with zero_points */
188    struct pipe_context *context = subgraph->base.context;
189    uint8_t *input = map_resource(operation->weight_tensor);
190    unsigned new_size = operation->output_channels * 2 * 2 * operation->input_channels;
191    struct pipe_resource *output_res = pipe_buffer_create(context->screen, 0, PIPE_USAGE_DEFAULT,
192                                                          new_size);
193    uint8_t *output = map_resource(output_res);
194 
195    for (unsigned channel = 0; channel < operation->output_channels; channel++) {
196       uint8_t *map_in = input + channel * 1 * 1 * operation->input_channels;
197       uint8_t *map_out = output + channel * 2 * 2 * operation->input_channels;
198 
199       map_out[0] = map_in[0];
200       map_out[1] = operation->weight_zero_point;
201       map_out[2] = operation->weight_zero_point;
202       map_out[3] = operation->weight_zero_point;
203    }
204 
205    pipe_resource_reference(&operation->weight_tensor, NULL);
206    operation->weight_tensor = output_res;
207 
208    operation->weight_width = operation->weight_height = 2;
209    operation->pointwise = false;
210 }
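/* A minimal worked example of the padding above (illustrative values): with
 * weight_zero_point == 128, a 1x1 kernel {w} becomes the 2x2 kernel
 * {w, 128, 128, 128}.  In affine quantization a weight equal to the zero
 * point represents the real value 0.0, so the three padded taps contribute
 * nothing and the 2x2 convolution matches the original pointwise one. */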
211 
212 static void
213 expand_depthwise(struct etna_ml_subgraph *subgraph, struct etna_operation *operation)
214 {
215    struct pipe_context *context = subgraph->base.context;
216    uint8_t *input = map_resource(operation->weight_tensor);
217    unsigned new_size = operation->output_channels * operation->weight_width * operation->weight_height * operation->input_channels;
218    struct pipe_resource *output_res = pipe_buffer_create(context->screen, 0, PIPE_USAGE_DEFAULT,
219                                                          new_size);
220    uint8_t *output = map_resource(output_res);
221 
222    /* Lower depthwise convolution to regular convolution, as the hardware doesn't support depthwise natively */
223    for (unsigned channel = 0; channel < operation->output_channels; channel++) {
224       unsigned in_channel = channel / operation->output_channels;
225       unsigned in_depth = channel % operation->output_channels;
226 
227       uint8_t *map_in = input + in_channel * operation->weight_width * operation->weight_height * operation->input_channels;
228       uint8_t *map_out = output + channel * operation->weight_width * operation->weight_height * operation->input_channels;
229 
230       for (unsigned i = 0; i < operation->weight_width * operation->weight_height * operation->input_channels; i++) {
231          if (i % operation->input_channels == in_depth)
232             map_out[i] = map_in[i];
233          else
234             map_out[i] = operation->weight_zero_point;
235       }
236    }
237 
238    pipe_resource_reference(&operation->weight_tensor, NULL);
239    operation->weight_tensor = output_res;
240 }
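/* Sketch of the lowering above, assuming the common depth multiplier of 1
 * (input and output channel counts match): output channel c keeps its
 * original filter values only at input-channel position c; every other
 * position is set to the weight zero point, i.e. 0.0, so the dense
 * convolution reproduces the per-channel depthwise result. */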
241 
242 static void
243 transpose(struct etna_ml_subgraph *subgraph, struct etna_operation *operation)
244 {
245    struct pipe_context *context = subgraph->base.context;
246    void *map = map_resource(operation->weight_tensor);
247    unsigned new_size = operation->output_channels * operation->weight_width * \
248                        operation->weight_height * operation->input_channels;
249    struct pipe_resource *output_res = pipe_buffer_create(context->screen, 0, PIPE_USAGE_DEFAULT,
250                                                          new_size);
251    uint8_t *output = map_resource(output_res);
252    unsigned output_channels = operation->output_channels;
253    unsigned input_channels = operation->input_channels;
254 
255    if (operation->addition) {
256       output_channels = 1;
257       input_channels = 2;
258    }
259 
260    uint8_t (*input)[operation->weight_width][operation->weight_height][input_channels] = map;
261    unsigned i = 0;
262    for (unsigned d0 = 0; d0 < output_channels; d0++)
263       for (unsigned d3 = 0; d3 < input_channels; d3++)
264          for (unsigned d1 = 0; d1 < operation->weight_width; d1++)
265             for (unsigned d2 = 0; d2 < operation->weight_height; d2++)
266                ((uint8_t*)output)[i++] = input[d0][d1][d2][d3];
267 
268    pipe_resource_reference(&operation->weight_tensor, NULL);
269    operation->weight_tensor = output_res;
270 }
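/* The loop above moves the input-channel dimension from innermost to second:
 * element input[o][x][y][c] of the [out][width][height][in] framework layout
 * ends up at output[o][c][x][y], which is the [out][in][width][height] order
 * that the write_core_interleaved() encoder below indexes into. */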
271 
272 static void
273 subsample(uint8_t *map_in, unsigned in_width, unsigned in_height, unsigned in_depth, unsigned out_width, unsigned out_height, unsigned in_z, unsigned offset_x, unsigned offset_y, unsigned stride, uint8_t *map_out, int in_zp)
274 {
275    uint8_t (*in)[in_height][in_depth] = (uint8_t(*)[in_height][in_depth])map_in;
276    uint8_t (*out)[out_height] = (uint8_t(*)[out_height])map_out;
277 
278    for(unsigned x = 0; x < out_width; x++)
279       for(unsigned y = 0; y < out_height; y++) {
280          unsigned in_x = x * stride + offset_x;
281          unsigned in_y = y * stride + offset_y;
282          if (in_x < in_width && in_y < in_height)
283             out[x][y] = in[in_x][in_y][in_z];
284          else
285             out[x][y] = in_zp;
286       }
287 }
288 
289 /* TODO: Do the reshaping in the TP units, for big enough buffers */
290 static void
291 reshape(uint8_t *input, uint8_t *output, unsigned stride, int in_zp, unsigned dims_in[4], unsigned dims_out[4])
292 {
293    for (unsigned out_channel = 0; out_channel < dims_in[0]; out_channel++) {
294       void *map_in = input + out_channel * dims_in[1] * dims_in[2] * dims_in[3];
295       void *map_out = output + out_channel * dims_out[1] * dims_out[2] * dims_out[3];
296 
297       /* See Figure 3 in https://arxiv.org/abs/1712.02502 */
298       /* This is only valid for stride == 2 */
299       assert(stride == 2);
300       uint8_t (*out)[dims_out[1]][dims_out[2]] = (uint8_t(*)[dims_out[1]][dims_out[2]])map_out;
301       for (unsigned z = 0; z < dims_in[3]; z++) {
302          subsample(map_in, dims_in[1], dims_in[2], dims_in[3], dims_out[1], dims_out[2], z, 0, 0, stride, (uint8_t *)out[0 + z * stride * stride], in_zp);
303          subsample(map_in, dims_in[1], dims_in[2], dims_in[3], dims_out[1], dims_out[2], z, 0, 1, stride, (uint8_t *)out[1 + z * stride * stride], in_zp);
304          subsample(map_in, dims_in[1], dims_in[2], dims_in[3], dims_out[1], dims_out[2], z, 1, 0, stride, (uint8_t *)out[2 + z * stride * stride], in_zp);
305          subsample(map_in, dims_in[1], dims_in[2], dims_in[3], dims_out[1], dims_out[2], z, 1, 1, stride, (uint8_t *)out[3 + z * stride * stride], in_zp);
306       }
307    }
308 }
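/* With stride 2, each input channel is split into stride*stride = 4
 * phase-shifted quarter-resolution planes taken at offsets (0,0), (0,1),
 * (1,0) and (1,1), so a WxHxC image becomes roughly (W/2)x(H/2)x(4*C) and
 * the strided convolution can then run with unit stride (Figure 3 of the
 * paper cited above). */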
309 
310 static void
311 strided_to_normal(struct etna_ml_subgraph *subgraph, struct etna_operation *operation)
312 {
313    struct pipe_context *context = subgraph->base.context;
314    uint8_t *input = map_resource(operation->weight_tensor);
315    unsigned new_size;
316    struct pipe_resource *output_res;
317    uint8_t *output;
318 
319    /* The hardware doesn't support strides natively, so we "lower" them as
320     * described in this paper:
321     *
322     * "Take it in your stride: Do we need striding in CNNs?" https://arxiv.org/abs/1712.02502
323     */
324 
325    /* TODO: Support more strides */
326    assert(operation->stride == 2);
327 
328    unsigned wdims_in[4] = {operation->output_channels,
329                            operation->weight_width,
330                            operation->weight_height,
331                            operation->input_channels};
332 
333    operation->input_channels = operation->input_channels * operation->stride * operation->stride;
334    operation->input_width = DIV_ROUND_UP(operation->input_width, operation->stride);
335    operation->input_height = DIV_ROUND_UP(operation->input_height, operation->stride);
336 
337    if (operation->padding_same) {
338       if (operation->weight_width == 5) {
339          operation->input_width += 2;
340          operation->input_height += 2;
341       } else {
342          operation->input_width += 1;
343          operation->input_height += 1;
344       }
345    }
346 
347    operation->weight_width = DIV_ROUND_UP(operation->weight_width, operation->stride);
348    operation->weight_height = DIV_ROUND_UP(operation->weight_height, operation->stride);
349 
350    new_size = operation->output_channels * operation->weight_width * operation->weight_height * operation->input_channels;
351    output_res = pipe_buffer_create(context->screen, 0, PIPE_USAGE_DEFAULT, new_size);
352    output = map_resource(output_res);
353 
354    unsigned wdims_out[4] = {operation->output_channels, operation->weight_width, operation->weight_height, operation->input_channels};
355    reshape(input, output, operation->stride, operation->weight_zero_point, wdims_in, wdims_out);
356 
357    pipe_resource_reference(&operation->weight_tensor, NULL);
358    operation->weight_tensor = output_res;
359 }
360 
361 void
362 etna_ml_lower_convolution(struct etna_ml_subgraph *subgraph,
363                           const struct pipe_ml_operation *poperation,
364                           struct etna_operation *operation)
365 {
366    /* TODO: Support stride_x != stride_y */
367    assert(poperation->conv.stride_x == poperation->conv.stride_y);
368    assert(poperation->type == PIPE_ML_OPERATION_TYPE_CONVOLUTION);
369 
370    operation->type = ETNA_JOB_TYPE_NN;
371    operation->addition = false;
372    operation->depthwise = poperation->conv.depthwise;
373    operation->pointwise = poperation->conv.pointwise;
374    operation->pooling_first_pixel = poperation->conv.stride_x > 1 && \
375       (poperation->conv.depthwise || poperation->conv.pointwise);
376    operation->padding_same = poperation->conv.padding_same;
377    operation->stride = poperation->conv.stride_x;
378 
379    operation->input_tensor = poperation->input_tensor->index;
380    operation->input_width = poperation->input_tensor->dims[1];
381    operation->input_height = poperation->input_tensor->dims[2];
382    operation->input_channels = poperation->input_tensor->dims[3];
383    operation->input_zero_point = poperation->input_tensor->zero_point;
384    operation->input_scale = poperation->input_tensor->scale;
385 
386    operation->output_tensor = poperation->output_tensor->index;
387    operation->output_width = poperation->output_tensor->dims[1];
388    operation->output_height = poperation->output_tensor->dims[2];
389    operation->output_channels = poperation->output_tensor->dims[3];
390    operation->output_zero_point = poperation->output_tensor->zero_point;
391    operation->output_scale = poperation->output_tensor->scale;
392 
393    pipe_resource_reference(&operation->weight_tensor, poperation->conv.weight_tensor->resource);
394    operation->weight_width = poperation->conv.weight_tensor->dims[1];
395    operation->weight_height = poperation->conv.weight_tensor->dims[2];
396    operation->weight_zero_point = poperation->conv.weight_tensor->zero_point;
397    operation->weight_scale = poperation->conv.weight_tensor->scale;
398 
399    pipe_resource_reference(&operation->bias_tensor, poperation->conv.bias_tensor->resource);
400 
401    if (operation->pointwise && operation->input_channels == 1)
402       pointwise_to_2x2(subgraph, operation);
403 
404    if (operation->depthwise && (operation->output_channels > 1 || operation->stride > 1)) {
405 
406       if (operation->input_width < 8 && operation->input_width > 2)
407          operation->pooling_first_pixel = false;
408 
409       expand_depthwise(subgraph, operation);
410    }
411 
412    if (operation->stride > 1 && !operation->pooling_first_pixel)
413       strided_to_normal(subgraph, operation);  /* This will already transpose if input_channels > 1 */
414    else if (operation->input_channels > 1)
415       transpose(subgraph, operation);
416 
417    operation->input_tensor_size = operation->input_width *
418                                   operation->input_height *
419                                   operation->input_channels;
420    ML_DBG("%dx%dx%d\n", operation->input_width, operation->input_height, operation->input_channels);
421 }
422 
423 static float
424 compute_weight_scale_add(float input1_scale, float input2_scale)
425 {
426    double scale_ratio = input1_scale / input2_scale;
427 
428    return (float) MAX2(scale_ratio, 1.0) / 255.0;
429 }
430 
431 static uint8_t
432 compute_addition_offset(float input1_scale, float input2_scale, float weight_scale)
433 {
434   double addition_offset = input1_scale / input2_scale;
435   addition_offset /= weight_scale;
436   return round(addition_offset + 0.0) * 1;
437 }
438 
439 static uint8_t
440 compute_weight_add(float input1_scale, float input2_scale, float weight_scale)
441 {
442    double weight = 1.0 / weight_scale;
443    return round(weight + 0.0);
444 }
445 
446 static uint32_t
447 compute_bias_add(float input1_scale, float input2_scale, uint8_t input1_zp, uint8_t input2_zp, float weight_scale)
448 {
449    int zero_point_diff = input2_zp - input1_zp;
450    double bias = zero_point_diff * input1_scale;
451    bias /= weight_scale * input2_scale;
452 
453    double addition_offset = input1_scale / input2_scale;
454    addition_offset /= weight_scale;
455    addition_offset = round(addition_offset + 0.0) * 1;
456 
457    return (int) (round(bias) - round(addition_offset) * input2_zp);
458 }
459 
460 void
461 etna_ml_lower_add(struct etna_ml_subgraph *subgraph,
462                   const struct pipe_ml_operation *poperation,
463                   struct etna_operation *operation)
464 {
465    struct pipe_context *context = subgraph->base.context;
466 
467    assert(poperation->type == PIPE_ML_OPERATION_TYPE_ADD);
468 
469    operation->addition = true;
470    operation->depthwise = false;
471    operation->pointwise = false;
472    operation->pooling_first_pixel = false;
473    operation->padding_same = false;
474    operation->stride = 1;
475 
476    operation->input_tensor = poperation->input_tensor->index;
477    operation->add_input_tensor = poperation->add.input_tensor->index;
478    operation->input_width = poperation->input_tensor->dims[1];
479    operation->input_height = poperation->input_tensor->dims[2];
480    operation->input_channels = poperation->input_tensor->dims[3];
481    operation->input_zero_point = poperation->input_tensor->zero_point;
482    operation->input_scale = poperation->input_tensor->scale;
483    operation->input_tensor_size = operation->input_width *
484                                   operation->input_height *
485                                   operation->input_channels *
486                                   2;
487 
488    operation->output_tensor = poperation->output_tensor->index;
489    operation->output_width = poperation->output_tensor->dims[1];
490    operation->output_height = poperation->output_tensor->dims[2];
491    operation->output_channels = poperation->output_tensor->dims[3];
492    operation->output_zero_point = poperation->output_tensor->zero_point;
493    operation->output_scale = poperation->output_tensor->scale;
494 
495    operation->weight_tensor = pipe_buffer_create(context->screen, 0, PIPE_USAGE_DEFAULT, 8);
496    operation->weight_width = 2;
497    operation->weight_height = 2;
498    operation->weight_zero_point = 0x0;
499    operation->weight_scale = compute_weight_scale_add(poperation->add.input_tensor->scale, poperation->input_tensor->scale);
500    operation->addition_offset = compute_addition_offset(poperation->add.input_tensor->scale, poperation->input_tensor->scale, operation->weight_scale);
501 
502    uint8_t *weight_map = map_resource(operation->weight_tensor);
503    memset(weight_map, 0, pipe_buffer_size(operation->weight_tensor));
504    weight_map[0] = compute_weight_add(poperation->add.input_tensor->scale, poperation->input_tensor->scale, operation->weight_scale);
505 
506    operation->bias_tensor = pipe_buffer_create(context->screen, 0, PIPE_USAGE_DEFAULT, 4);
507    int32_t *bias_map = map_resource(operation->bias_tensor);
508    bias_map[0] = compute_bias_add(poperation->add.input_tensor->scale, poperation->input_tensor->scale,
509                                   poperation->add.input_tensor->zero_point, poperation->input_tensor->zero_point,
510                                   operation->weight_scale);
511 }
512 
513 #define MAX_TILE_WIDTH 64
514 
515 static unsigned
516 calc_superblocks(struct etna_context *ctx, const struct etna_operation *operation, unsigned tile_y, unsigned interleave_mode)
517 {
518    unsigned nn_core_count = ctx->screen->info->npu.nn_core_count;
519    unsigned nn_accum_buffer_depth = ctx->screen->info->npu.nn_accum_buffer_depth;
520    unsigned output_channels = operation->addition ? 1 : operation->output_channels;
521    unsigned kernels_per_core = DIV_ROUND_UP(output_channels, nn_core_count);
522    unsigned foo = (nn_accum_buffer_depth * interleave_mode) / tile_y;
523 
524    if (operation->weight_width == 1)
525       foo = MIN2(foo, nn_accum_buffer_depth / 3);
526 
527    foo = MIN2(foo, kernels_per_core);
528    foo = MIN2(foo, 127);
529 
530    kernels_per_core = DIV_ROUND_UP(output_channels, nn_core_count * foo);
531    unsigned num_kernels = DIV_ROUND_UP(output_channels, kernels_per_core * nn_core_count);
532    unsigned superblocks = DIV_ROUND_UP(DIV_ROUND_UP(output_channels, nn_core_count), num_kernels);
533 
534    return superblocks;
535 }
536 
537 static unsigned
538 calc_interleave_mode(unsigned tile_width, unsigned weight_height)
539 {
540    unsigned mode = 8;
541 
542    if (weight_height - 1 + tile_width > (MAX_TILE_WIDTH + 8) / 2)
543       return 1;
544 
545    if (tile_width > MAX_TILE_WIDTH / 2)
546       mode = 1;
547    else if (tile_width > MAX_TILE_WIDTH / 4)
548       mode = 2;
549    else if (tile_width > MAX_TILE_WIDTH / 8)
550       mode = 4;
551 
552    if (weight_height - 1 + tile_width > (MAX_TILE_WIDTH + 8) / 4)
553       return MIN2(mode, 4);
554 
555    return MIN2(mode, 2);
556 }
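/* Example: a full 64-pixel-wide output tile with a 3x3 kernel gives
 * 3 - 1 + 64 = 66 > (64 + 8) / 2 = 36, so interleave mode 1 is used;
 * narrower tiles let more rows of the input buffer be interleaved per pass. */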
557 
558 static void
559 calc_addition_sizes(unsigned *input_width, unsigned *input_height, unsigned *input_channels,
560                     unsigned *output_width, unsigned *output_height, unsigned *output_channels)
561 {
562    ML_DBG("addition input width %d channels %d\n", *input_width, *input_channels);
563 
564    unsigned channel_size = *input_width * *input_height;
565    unsigned width = 0;
566    if (channel_size % 128 == 0)
567       width = 128;
568    else if (channel_size % 64 == 0)
569       width = 64;
570    else if (channel_size % 32 == 0)
571       width = 32;
572    else {
573       for (int i = 63; i > 0; i--) {
574          if (channel_size % i == 0) {
575             width = i;
576             break;
577          }
578       }
579    }
580 
581    *input_height = (*input_width * *input_height * *input_channels) / width;
582    *input_width = width;
583    *input_channels = 2;
584 
585    *output_height = *output_width * *output_height * *output_channels / width;
586    *output_width = width;
587    *output_channels = 1;
588 }
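/* Illustration with made-up shapes: when adding two 32x32x4 tensors, the
 * 32 * 32 = 1024-element channel plane is divisible by 128, so the job is
 * reflowed to a 128-wide, 32-high image with 2 input channels (one per
 * addend) and a single output channel, which lets the element-wise addition
 * run as a convolution. */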
589 
590 static unsigned
591 calculate_tiling(struct etna_context *ctx, const struct etna_operation *operation, unsigned *tile_width_out, unsigned *tile_height_out)
592 {
593    unsigned nn_input_buffer_depth = ctx->screen->info->npu.nn_input_buffer_depth;
594    unsigned nn_accum_buffer_depth = ctx->screen->info->npu.nn_accum_buffer_depth;
595    unsigned input_width = operation->input_width;
596    unsigned input_height = operation->input_height;
597    unsigned input_channels = operation->input_channels;
598    unsigned output_width = operation->output_width;
599    unsigned output_height = operation->output_height;
600    unsigned output_channels = operation->output_channels;
601    unsigned tile_width;
602    unsigned tile_height;
603    unsigned superblocks;
604    unsigned interleave_mode;
605 
606    if (operation->addition)
607       calc_addition_sizes(&input_width, &input_height, &input_channels,
608                           &output_width, &output_height, &output_channels);
609 
610    if (operation->pooling_first_pixel) {
611       output_width *= 2;
612       output_height *= 2;
613    }
614 
615    tile_width = MIN2(output_width, 64);
616    interleave_mode = calc_interleave_mode(tile_width, operation->weight_height);
617 
618    tile_height = nn_input_buffer_depth * interleave_mode - operation->weight_height + 1;
619    tile_height = MIN2(tile_height, interleave_mode * nn_accum_buffer_depth);
620    tile_height = MIN2(tile_height, output_height);
621 
622    if (operation->stride > 1 && tile_height % 2 > 0)
623       tile_height -= 1;
624 
625    tile_height = MAX2(tile_height, 1);
626    superblocks = calc_superblocks(ctx, operation, tile_height, interleave_mode);
627 
628    if (tile_width_out)
629       *tile_width_out = tile_width;
630 
631    if (tile_height_out)
632       *tile_height_out = tile_height;
633 
634    return superblocks;
635 }
636 
637 static struct etna_bo *
638 create_nn_config(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, struct etna_bo *coefficients, unsigned coef_cache_size)
639 {
640    struct pipe_context *context = subgraph->base.context;
641    struct etna_context *ctx = etna_context(context);
642    unsigned nn_core_count = ctx->screen->info->npu.nn_core_count;
643    unsigned nn_core_version = ctx->screen->specs.nn_core_version;
644    unsigned oc_sram_size = ctx->screen->info->npu.on_chip_sram_size;
645    struct etna_bo *bo = etna_bo_new(ctx->screen->dev,
646                                     sizeof(struct etna_nn_params),
647                                     DRM_ETNA_GEM_CACHE_WC);
648    unsigned input_width = operation->input_width;
649    unsigned input_height = operation->input_height;
650    unsigned input_channels = operation->input_channels;
651    unsigned output_width = operation->output_width;
652    unsigned output_height = operation->output_height;
653    unsigned output_channels = operation->output_channels;
654    unsigned weight_width = operation->weight_width;
655    unsigned weight_height = operation->weight_height;
656 
657    if (operation->pointwise && input_channels == 1)
658       weight_width = weight_height = 2;
659 
660    if (operation->addition)
661       calc_addition_sizes(&input_width, &input_height, &input_channels,
662                           &output_width, &output_height, &output_channels);
663 
664    etna_bo_cpu_prep(bo, DRM_ETNA_PREP_WRITE);
665 
666    struct etna_nn_params *map = etna_bo_map(bo);
667    map->layer_type = 0x0;
668    map->no_z_offset = 0x0;
669    map->prelu = 0x0;
670    map->nn_layer_flush = 0x1;
671    map->brick_mode = 0x0;
672    map->brick_distance = 0x0;
673    map->relu = 0x0;
674    map->no_flush = 0x0;
675    map->rounding_mode = 0x1;
676    map->partial_cache_data_unit = 0x0;
677    map->depthwise = 0x0;
678 
679    map->unused0 = 0x0;
680    map->unused1 = 0x0;
681    map->unused2 = 0x0;
682    map->unused3 = 0x0;
683    map->unused4 = 0x0;
684    map->unused5 = 0x0;
685    map->unused6 = 0x0;
686    map->unused7 = 0x0;
687    map->unused8 = 0x0;
688    map->unused9 = 0x0;
689    map->unused10 = 0x0;
690    map->unused11 = 0x0;
691    map->unused12 = 0x0;
692    map->unused13 = 0x0;
693    map->unused14 = 0x0;
694    map->further1 = 0x0;
695    map->further2 = 0x0;
696    map->further3 = 0x3ffffff;
697    map->further4 = 0x7f800000;
698    map->further5 = 0xff800000;
699    map->further6 = 0x0;
700    map->further7 = 0x0;
701    map->further8 = 0x0;
702 
703    struct pipe_resource *input = etna_ml_get_tensor(subgraph, operation->input_tensor);
704    unsigned offset = etna_ml_get_offset(subgraph, operation->input_tensor);
705    map->in_image_address = etna_bo_gpu_va(etna_resource(input)->bo) + offset;
706    map->in_image_x_size = input_width;
707    map->in_image_y_size = input_height;
708    map->in_image_x_stride = input_width;
709    map->in_image_y_stride = input_height;
710    map->in_image_data_type = ETNA_NN_INT8;
711    map->in_image_data_type_bit_2 = ETNA_NN_INT8 >> 2;
712    map->in_image_circular_buf_size = 0x0;
713    map->in_image_circular_buf_end_addr_plus_1 = 0xFFFFFFFF >> 6;
714    map->in_image_border_mode = 0x0;
715    map->in_image_border_const = operation->input_zero_point;
716 
717    if (operation->padding_same && operation->stride == 1 && weight_width > 2) {
718       if (weight_width < 5) {
719          map->in_image_x_offset = 0x7;
720          map->in_image_y_offset = 0x7;
721       } else {
722          map->in_image_x_offset = 0x6;
723          map->in_image_y_offset = 0x6;
724       }
725       map->in_image_x_offset_bit_3 = 0x1;
726       map->in_image_y_offset_bit_3 = 0x1;
727    } else {
728       map->in_image_x_offset = 0x0;
729       map->in_image_y_offset = 0x0;
730       map->in_image_x_offset_bit_3 = 0x0;
731       map->in_image_y_offset_bit_3 = 0x0;
732    }
733 
734    if (operation->padding_same && operation->stride == 2 && weight_width == 5) {
735       map->in_image_x_offset = 0x7;
736       map->in_image_y_offset = 0x7;
737       map->in_image_x_offset_bit_3 = 0x1;
738       map->in_image_y_offset_bit_3 = 0x1;
739    }
740 
741    struct pipe_resource *output = etna_ml_get_tensor(subgraph, operation->output_tensor);
742    offset = etna_ml_get_offset(subgraph, operation->output_tensor);
743    map->out_image_address = etna_bo_gpu_va(etna_resource(output)->bo) + offset;
744    map->out_image_x_size = output_width;
745    map->out_image_y_size = output_height;
746    map->out_image_z_size = output_channels;
747 
748    map->out_image_x_stride = map->out_image_x_size;
749    map->out_image_y_stride = map->out_image_y_size;
750 
751    map->out_image_data_type = ETNA_NN_INT8;
752    map->out_image_data_type_bit_2 = ETNA_NN_INT8 >> 2;
753    map->out_image_circular_buf_size = 0x0;
754    map->out_image_circular_buf_end_addr_plus_1 = 0xFFFFFFFF >> 6;
755    map->out_zero_point = operation->output_zero_point;
756 
757    if (operation->pooling_first_pixel) {
758       map->pooling = ETNA_NN_POOLING_FIRST_PIXEL;
759       map->pooling_xy_size = 0x0;
760 
761       map->out_image_x_size *= 2;
762       map->out_image_y_size *= 2;
763    } else {
764       map->pooling = ETNA_NN_POOLING_NON;
765       map->pooling_xy_size = 0x1;
766    }
767 
768    unsigned tile_x, tile_y;
769    unsigned superblocks = calculate_tiling(ctx, operation, &tile_x, &tile_y);
770    map->out_image_tile_x_size = tile_x;
771    map->out_image_tile_y_size = tile_y;
772 
773    map->kernel_address = etna_bo_gpu_va(coefficients) >> 6;
774    map->kernel_xy_size = weight_width;
775    map->kernel_y_size = weight_height;
776    map->kernel_z_size = input_channels;
777    map->kernel_z_size2 = 0x0;
778    map->kernel_data_type = ETNA_NN_INT8;
779    map->kernel_data_type_bit_2 = ETNA_NN_INT8 >> 2;
780    map->kernel_direct_stream_from_VIP_sram = 0x0;
781 
782    map->coef_zero_point = operation->weight_zero_point;
783 
784    map->kernels_per_core = DIV_ROUND_UP(DIV_ROUND_UP(output_channels, nn_core_count), superblocks);
785 
786    unsigned image_cache_size;
787    if (superblocks == 1) {
788       /* No point in caching the input image if there is only one iteration */
789       image_cache_size = 0;
790    } else {
791       unsigned in_image_tile_x_size = map->out_image_tile_x_size + weight_width - 1;
792       unsigned in_image_tile_y_size = map->out_image_tile_y_size + weight_width - 1;
793       image_cache_size = in_image_tile_x_size * in_image_tile_y_size;
794       image_cache_size = ALIGN(image_cache_size, 16);
795       image_cache_size *= input_channels;
796       image_cache_size = ALIGN(image_cache_size, 128);
797    }
798 
799    ML_DBG("coefficients_size 0x%x (%d) image_size 0x%x (%d)\n", coef_cache_size, coef_cache_size, image_cache_size, image_cache_size);
800 
801    map->kernel_cache_start_address = 0x800;
802 
803    /* Get all the image tiles in the cache, then use the rest for the kernels */
804    if (map->kernel_cache_start_address + coef_cache_size + image_cache_size < oc_sram_size) {
805       map->kernel_caching_mode = SRAM_CACHE_MODE_FULL_CACHE;
806       map->kernel_pattern_msb = 0x0;
807       map->kernel_pattern_low = 0x0;
808       map->kernel_pattern_high = 0x0;
809       map->kernel_cache_end_address = MAX2(MIN2(ALIGN(map->kernel_cache_start_address + coef_cache_size, 128), oc_sram_size), 0xa00);
810    } else {
811       /* Doesn't fit in the 512 KB of on-chip SRAM that we have */
812       map->kernel_caching_mode = SRAM_CACHE_MODE_PARTIAL_CACHE;
813       if (map->out_image_z_size >= 1024) {
814          map->kernel_pattern_msb = 0x13;
815          map->kernel_pattern_low = 0x80000;
816          map->kernel_pattern_high = 0x0;
817       } else if (map->out_image_z_size >= 512) {
818          map->kernel_pattern_msb = 0x3d;
819          map->kernel_pattern_low = 0x0;
820          map->kernel_pattern_high = 0x2aaaaaa0;
821       } else if (map->out_image_z_size >= 256) {
822          map->kernel_pattern_msb = 0x3e;
823          map->kernel_pattern_low = 0xffffaaaa;
824          map->kernel_pattern_high = 0x7fffffff;
825       } else if (map->out_image_z_size >= 160) {
826          map->kernel_pattern_msb = 0x6;
827          map->kernel_pattern_low = 0x7e;
828          map->kernel_pattern_high = 0x0;
829       } else {
830          map->kernel_pattern_msb = 0x3f;
831          map->kernel_pattern_low = 0xfffffffe;
832          map->kernel_pattern_high = 0xffffffff;
833       }
834       if (map->kernel_cache_start_address + coef_cache_size >= oc_sram_size) {
835          map->kernel_cache_end_address = oc_sram_size;
836          image_cache_size = 0;
837       } else if (image_cache_size > oc_sram_size) {
838          image_cache_size = 0;
839       } else
840          map->kernel_cache_end_address = oc_sram_size - image_cache_size;
841    }
842 
843    if (image_cache_size == 0) {
844       map->image_caching_mode = SRAM_CACHE_MODE_NO_CACHE;
845       map->image_cache_start_address = 0x0;
846       map->image_cache_end_address = 0x800;
847    } else {
848       map->image_caching_mode = SRAM_CACHE_MODE_FULL_CACHE;
849       if (image_cache_size >= map->kernel_cache_start_address) {
850          map->image_cache_start_address = map->kernel_cache_end_address;
851          map->image_cache_end_address = MIN2(map->image_cache_start_address + image_cache_size, oc_sram_size);
852          ML_DBG("image_cache_end_address %d image_cache_start_address %d image_cache_size %d oc_sram_size %d\n", map->image_cache_end_address, map->image_cache_start_address, image_cache_size, oc_sram_size);
853       } else {
854          map->image_cache_start_address = 0x0;
855          map->image_cache_end_address = 0x800;
856       }
857    }
858 
859    float conv_scale = (operation->input_scale * operation->weight_scale) / operation->output_scale;
860    uint32_t scale_bits = fui(conv_scale);
861    /* Taken from https://github.com/pytorch/QNNPACK/blob/master/src/qnnpack/requantization.h#L130 */
862    unsigned shift = 127 + 31 - 32 - (scale_bits >> 23);
863    if (nn_core_version == 8)
864       shift += 1;
865    else
866       shift += 16;
867 
868    /* Divides by 2^(post_shift - 18), rounding to nearest integer. If the result doesn't fit in 8 bits, it is clamped to 255. galcore sets this to 15 for INT8, to 0 for UINT8. */
869    map->post_shift = shift & 0x1f;
870    map->post_shift_bit_5_6 = (shift >> 5) & 0x3;
871 
872    /* Multiplies by (multiplier * 2^15) */
873    if (nn_core_version == 8) {
874       map->post_multiplier = scale_bits & 0x1;
875       map->post_multiplier_1_to_6 = (scale_bits >> 1) & 0x3f;
876       map->post_multiplier_7_to_14 = (scale_bits >> 7) & 0xff;
877       map->post_multiplier_15_to_22 = (scale_bits >> 15) & 0xff;
878    } else {
879       map->post_multiplier = (scale_bits >> 8) & 0x1;
880       map->post_multiplier_1_to_6 = (scale_bits >> 9) & 0x3f;
881       map->post_multiplier_7_to_14 = (scale_bits >> 15) & 0xff;
882    }
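   /* In effect, conv_scale is decomposed into its IEEE-754 exponent and
    * mantissa: the exponent determines post_shift (a right shift of the
    * accumulator) and the mantissa bits are spread across the
    * post_multiplier* fields, so "multiply then shift" approximates
    * multiplying by the float requantization scale, following the QNNPACK
    * scheme linked above.  The bit offsets and the +1/+16 adjustment differ
    * between NN core versions, hence the two cases. */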
883 
884    map->per_channel_post_mul = 0x0;
885 
886    etna_bo_cpu_fini(bo);
887 
888    return bo;
889 }
890 
891 static uint32_t calculate_bias_correction(uint8_t *weights, const struct etna_operation *operation)
892 {
893    int32_t correction = 0;
894 
895    for (unsigned i = 0; i < operation->weight_width * operation->weight_height * operation->input_channels; i++) {
896       correction += (weights[i] - operation->weight_zero_point) * operation->input_zero_point;
897    }
898 
899    return correction;
900 }
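/* This is the usual zero-point folding for quantized convolution:
 *   sum_i (w_i - zp_w) * (x_i - zp_x)
 *     = sum_i (w_i - zp_w) * x_i  -  zp_x * sum_i (w_i - zp_w)
 * The second term depends only on the kernel, so it is computed once here
 * and subtracted from the bias (see the "biases[out_channel] - corr" uses
 * below), sparing the hardware a per-pixel input zero-point subtraction. */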
901 
902 
903 static void
904 append_bits(uint32_t value, size_t size, unsigned *bits_in_buffer, uint64_t *buffer, uint32_t **dest, bool do_write)
905 {
906    *buffer |= (uint64_t)value << *bits_in_buffer;
907    *bits_in_buffer += size;
908    if (*bits_in_buffer >= 32) {
909       if (do_write)
910          **dest = *buffer & 0xffffffff;
911       *dest += 1;
912       *buffer >>= 32;
913       *bits_in_buffer -= 32;
914    }
915 }
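/* append_bits() packs values LSB-first into a 64-bit staging buffer and
 * flushes complete 32-bit words to *dest.  For example, appending 0x5 with
 * size 3 and then 0xff with size 8 leaves the 11 pending bits 0x7fd in the
 * buffer.  When do_write is false only the destination pointer advances,
 * which is how the zero-run-length search below measures stream sizes
 * without writing anything. */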
916 
917 struct wb_stream {
918    unsigned zero_point;
919    unsigned zrl_bits;
920    unsigned *bits_in_buffer;
921    uint64_t *buffer;
922    uint32_t **map;
923    bool do_write;
924 
925    unsigned accum_zeroes;
926 };
927 
928 static void
929 wb_stream_flush_zeroes(struct wb_stream *wb_stream)
930 {
931    if (wb_stream->accum_zeroes == 0)
932       return;
933 
934    append_bits(wb_stream->accum_zeroes - 1, wb_stream->zrl_bits, wb_stream->bits_in_buffer, wb_stream->buffer, wb_stream->map, wb_stream->do_write);
935    wb_stream->accum_zeroes = 0;
936    append_bits(wb_stream->zero_point, 8, wb_stream->bits_in_buffer, wb_stream->buffer, wb_stream->map, wb_stream->do_write);
937 }
938 
939 static void
940 wb_stream_write(struct wb_stream *wb_stream, unsigned value)
941 {
942    unsigned max_zeroes = (1 << wb_stream->zrl_bits) - 1;
943 
944    if (wb_stream->zrl_bits == 0) {
945       append_bits(value, 8, wb_stream->bits_in_buffer, wb_stream->buffer, wb_stream->map, wb_stream->do_write);
946       return;
947    }
948 
949    if (wb_stream->accum_zeroes == max_zeroes) {
950       append_bits(max_zeroes, wb_stream->zrl_bits, wb_stream->bits_in_buffer, wb_stream->buffer, wb_stream->map, wb_stream->do_write);
951       wb_stream->accum_zeroes = 0;
952       append_bits(value, 8, wb_stream->bits_in_buffer, wb_stream->buffer, wb_stream->map, wb_stream->do_write);
953       return;
954    }
955 
956    if (value == wb_stream->zero_point) {
957       wb_stream->accum_zeroes++;
958       return;
959    }
960 
961    append_bits(wb_stream->accum_zeroes, wb_stream->zrl_bits, wb_stream->bits_in_buffer, wb_stream->buffer, wb_stream->map, wb_stream->do_write);
962    wb_stream->accum_zeroes = 0;
963    append_bits(value, 8, wb_stream->bits_in_buffer, wb_stream->buffer, wb_stream->map, wb_stream->do_write);
964 }
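/* Together these implement a simple zero-run-length code for the weight
 * stream: each symbol is a zrl_bits-wide count of preceding zero-point
 * weights followed by an 8-bit value, and runs longer than
 * (1 << zrl_bits) - 1 are split.  Example with zrl_bits == 2 and
 * zero_point == 0: the weights {0, 0, 7} are emitted as the 2-bit count 2
 * followed by the byte 7. */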
965 
966 static unsigned
967 write_core_6(struct etna_ml_subgraph *subgraph, uint32_t *map, unsigned core, const struct etna_operation *operation, unsigned zrl_bits)
968 {
969    struct pipe_context *pctx = subgraph->base.context;
970    unsigned nn_core_count = etna_context(pctx)->screen->info->npu.nn_core_count;
971    unsigned input_channels = operation->addition ? 1 : operation->input_channels;
972    unsigned output_channels = operation->addition ? 1 : operation->output_channels;
973    unsigned cores_used = MIN2(output_channels, nn_core_count);
974    unsigned kernels_per_core = DIV_ROUND_UP(output_channels, cores_used);
975    uint8_t *input = map_resource(operation->weight_tensor);
976    uint32_t *biases = map_resource(operation->bias_tensor);
977    unsigned out_values_per_channel = operation->output_width * operation->output_height;
978    unsigned stride = MIN2(input_channels, 6);
979    unsigned superblocks = calculate_tiling(etna_context(pctx), operation, NULL, NULL);
980    uint8_t *weights_maps[DIV_ROUND_UP(kernels_per_core, superblocks)];
981    uint32_t *initial_ptr = map;
982    bool do_write = initial_ptr != NULL;
983    uint64_t buffer = 0;
984    unsigned bits_in_buffer = 0;
985    struct wb_stream wb_stream = {
986       .zero_point = operation->weight_zero_point,
987       .zrl_bits = zrl_bits,
988       .bits_in_buffer = &bits_in_buffer,
989       .buffer = &buffer,
990       .map = &map,
991       .do_write = do_write,
992    };
993 
994    ML_DBG("%s core %d zrl_bits %d\n", __func__, core, zrl_bits);
995 
996    append_bits(zrl_bits, 8, &bits_in_buffer, &buffer, &map, do_write);
997    append_bits(kernels_per_core, 16, &bits_in_buffer, &buffer, &map, do_write);
998 
999    for (unsigned superblock = 0; superblock < superblocks; superblock++) {
1000 
1001       unsigned kernels_in_superblock = DIV_ROUND_UP(kernels_per_core, superblocks);
1002       if (superblock == superblocks - 1)
1003          kernels_in_superblock = kernels_per_core - kernels_in_superblock * (superblocks - 1);
1004 
1005       for (unsigned kernel = 0; kernel < kernels_in_superblock; kernel++) {
1006          unsigned out_channel = core * kernels_in_superblock + kernel + superblock * DIV_ROUND_UP(kernels_per_core, superblocks) * cores_used;
1007          weights_maps[kernel] = input + out_channel * operation->weight_width * operation->weight_height * input_channels;
1008       }
1009 
1010       for (unsigned block = 0; block < DIV_ROUND_UP(input_channels, stride); block++) {
1011          for (unsigned kernel = 0; kernel < kernels_in_superblock; kernel++) {
1012             unsigned out_channel = core * kernels_in_superblock + kernel + superblock * DIV_ROUND_UP(kernels_per_core, superblocks) * cores_used;
1013 
1014             if (block == 0) {
1015                wb_stream_write(&wb_stream, weights_maps[kernel][0]);
1016 
1017                uint32_t corr = calculate_bias_correction(weights_maps[kernel], operation);
1018                wb_stream_flush_zeroes(&wb_stream);
1019                append_bits(biases[out_channel] - corr, 32, &bits_in_buffer, &buffer, &map, do_write);
1020 
1021                for (int i = 1; i < stride; i++) {
1022                   wb_stream_write(&wb_stream, weights_maps[kernel][i]);
1023                }
1024             } else {
1025                for (int i = 0; i < stride; i++) {
1026                   if (i + block * stride < input_channels)
1027                      wb_stream_write(&wb_stream, weights_maps[kernel][i + block * stride]);
1028                }
1029             }
1030             if (block == DIV_ROUND_UP(input_channels, stride) - 1) {
1031                wb_stream_flush_zeroes(&wb_stream);
1032                append_bits(out_values_per_channel * out_channel, 32, &bits_in_buffer, &buffer, &map, do_write);
1033             }
1034          }
1035       }
1036    }
1037 
1038    wb_stream_flush_zeroes(&wb_stream);
1039 
1040    if (bits_in_buffer > 0)
1041       append_bits(0, 32 - bits_in_buffer, &bits_in_buffer, &buffer, &map, do_write);
1042 
1043    return (uint8_t *)map - (uint8_t *)initial_ptr - 1;
1044 }
1045 
1046 static unsigned
1047 write_core_interleaved(struct etna_ml_subgraph *subgraph, uint32_t *map, unsigned core, const struct etna_operation *operation, unsigned zrl_bits)
1048 {
1049    struct pipe_context *pctx = subgraph->base.context;
1050    unsigned nn_core_count = etna_context(pctx)->screen->info->npu.nn_core_count;
1051    unsigned input_channels = operation->addition ? 1 : operation->input_channels;
1052    unsigned output_channels = operation->addition ? 1 : operation->output_channels;
1053    unsigned cores_used = MIN2(output_channels, nn_core_count);
1054    unsigned kernels_per_core = DIV_ROUND_UP(output_channels, cores_used);
1055    uint8_t *input = map_resource(operation->weight_tensor);
1056    uint32_t *biases = map_resource(operation->bias_tensor);
1057    unsigned out_values_per_channel = operation->output_width * operation->output_height;
1058    unsigned superblocks = calculate_tiling(etna_context(pctx), operation, NULL, NULL);
1059    uint8_t (*weights_map)[input_channels][operation->weight_width][operation->weight_height] = (void *)input;
1060    uint32_t *initial_ptr = map;
1061    bool do_write = initial_ptr != NULL;
1062    uint64_t buffer = 0;
1063    unsigned bits_in_buffer = 0;
1064    struct wb_stream wb_stream = {
1065       .zero_point = operation->weight_zero_point,
1066       .zrl_bits = zrl_bits,
1067       .bits_in_buffer = &bits_in_buffer,
1068       .buffer = &buffer,
1069       .map = &map,
1070       .do_write = do_write,
1071    };
1072 
1073    ML_DBG("%s core %d zrl_bits %d map %p\n", __func__, core, zrl_bits, map);
1074 
1075    append_bits(zrl_bits, 8, &bits_in_buffer, &buffer, &map, do_write);
1076    append_bits(kernels_per_core, 16, &bits_in_buffer, &buffer, &map, do_write);
1077 
1078    for (unsigned superblock = 0; superblock < superblocks; superblock++) {
1079 
1080       unsigned kernels_in_superblock = DIV_ROUND_UP(kernels_per_core, superblocks);
1081       if (superblock == superblocks - 1)
1082          kernels_in_superblock = kernels_per_core - kernels_in_superblock * (superblocks - 1);
1083 
1084       for (unsigned z = 0; z < input_channels; z++) {
1085          for (unsigned kernel = 0; kernel < kernels_in_superblock; kernel++) {
1086             unsigned out_channel = core * kernels_in_superblock + kernel + superblock * DIV_ROUND_UP(kernels_per_core, superblocks) * cores_used;
1087 
1088             for (unsigned block = 0; block < DIV_ROUND_UP(operation->weight_width, 2); block++) {
1089                unsigned stride = operation->weight_height;
1090                if (operation->weight_height > 3)
1091                   stride = 3;
1092                for (unsigned x = block * 2; x < (block + 1) * 2; x++ ) {
1093                   if (x >= operation->weight_width)
1094                      break;
1095                   for (unsigned y = 0; y < stride; y++) {
1096                      wb_stream_write(&wb_stream, weights_map[out_channel][z][x][y]);
1097                      if (x == 0 && y == 0 && z == 0) {
1098                         uint32_t corr = calculate_bias_correction((uint8_t *)weights_map[out_channel], operation);
1099                         wb_stream_flush_zeroes(&wb_stream);
1100                         append_bits(biases[out_channel] - corr, 32, &bits_in_buffer, &buffer, &map, do_write);
1101                      }
1102                   }
1103                }
1104                if (operation->weight_height > 3) {
1105                   for (unsigned x = block * 2; x < (block + 1) * 2; x++ ) {
1106                      if (x >= operation->weight_width)
1107                         break;
1108                      for (unsigned y = stride; y < operation->weight_width; y++) {
1109                         wb_stream_write(&wb_stream, weights_map[out_channel][z][x][y]);
1110                      }
1111                   }
1112                }
1113             }
1114 
1115             if (z == input_channels - 1) {
1116                wb_stream_flush_zeroes(&wb_stream);
1117                append_bits(out_values_per_channel * out_channel, 32, &bits_in_buffer, &buffer, &map, do_write);
1118             }
1119          }
1120          if (superblock == superblocks - 1)
1121             wb_stream_flush_zeroes(&wb_stream);
1122       }
1123    }
1124 
1125    wb_stream_flush_zeroes(&wb_stream);
1126 
1127    if (bits_in_buffer > 0)
1128       append_bits(0, 32 - bits_in_buffer, &bits_in_buffer, &buffer, &map, do_write);
1129 
1130    return (uint8_t *)map - (uint8_t *)initial_ptr;
1131 }
1132 
1133 static unsigned
1134 write_core_sequential(struct etna_ml_subgraph *subgraph, uint32_t *map, unsigned core, const struct etna_operation *operation, unsigned zrl_bits)
1135 {
1136    struct pipe_context *pctx = subgraph->base.context;
1137    unsigned nn_core_count = etna_context(pctx)->screen->info->npu.nn_core_count;
1138    unsigned output_channels = operation->addition ? 1 : operation->output_channels;
1139    unsigned cores_used = MIN2(output_channels, nn_core_count);
1140    unsigned kernels_per_core = DIV_ROUND_UP(output_channels, cores_used);
1141    uint8_t *input = map_resource(operation->weight_tensor);
1142    uint32_t *biases = map_resource(operation->bias_tensor);
1143    unsigned out_values_per_channel = operation->output_width * operation->output_height;
1144    unsigned superblocks = calculate_tiling(etna_context(pctx), operation, NULL, NULL);
1145    uint32_t *initial_ptr = map;
1146    bool do_write = initial_ptr != NULL;
1147    uint64_t buffer = 0;
1148    unsigned bits_in_buffer = 0;
1149    struct wb_stream wb_stream = {
1150       .zero_point = operation->weight_zero_point,
1151       .zrl_bits = zrl_bits,
1152       .bits_in_buffer = &bits_in_buffer,
1153       .buffer = &buffer,
1154       .map = &map,
1155       .do_write = do_write,
1156    };
1157 
1158    ML_DBG("%s core %d zrl_bits %d superblocks %d\n", __func__, core, zrl_bits, superblocks);
1159 
1160    append_bits(zrl_bits, 8, &bits_in_buffer, &buffer, &map, do_write);
1161    append_bits(kernels_per_core, 16, &bits_in_buffer, &buffer, &map, do_write);
1162 
1163    for (unsigned superblock = 0; superblock < superblocks; superblock++) {
1164 
1165       unsigned kernels_in_superblock = DIV_ROUND_UP(kernels_per_core, superblocks);
1166       if (superblock == superblocks - 1)
1167          kernels_in_superblock = kernels_per_core - kernels_in_superblock * (superblocks - 1);
1168 
1169       for (unsigned kernel = 0; kernel < kernels_in_superblock; kernel++) {
1170          unsigned out_channel = core * kernels_in_superblock + kernel + superblock * DIV_ROUND_UP(kernels_per_core, superblocks) * cores_used;
1171 
1172          uint8_t (*weights_map)[operation->weight_height] = (void*) input + out_channel * operation->weight_width * operation->weight_height;
1173 
1174          for (unsigned block = 0; block < DIV_ROUND_UP(operation->weight_width, 2); block++) {
1175             unsigned stride = operation->weight_height;
1176             if ((operation->depthwise || operation->input_width > 64) && \
1177                operation->weight_height > 3)
1178                stride = 3;
1179             for (unsigned x = block * 2; x < (block + 1) * 2; x++ ) {
1180                if (x >= operation->weight_width)
1181                   break;
1182                for (unsigned y = 0; y < stride; y++) {
1183 
1184                   wb_stream_write(&wb_stream, weights_map[x][y]);
1185                   if (x == 0 && y == 0) {
1186                      uint32_t corr = calculate_bias_correction((uint8_t *)weights_map, operation);
1187                      wb_stream_flush_zeroes(&wb_stream);
1188                      append_bits(biases[out_channel] - corr, 32, &bits_in_buffer, &buffer, &map, do_write);
1189                   }
1190                }
1191             }
1192             if ((operation->depthwise || operation->input_width > 64) && \
1193                operation->weight_height > 3) {
1194                for (unsigned x = block * 2; x < (block + 1) * 2; x++ ) {
1195                   if (x >= operation->weight_width)
1196                      break;
1197                   for (unsigned y = stride; y < operation->weight_width; y++) {
1198                      wb_stream_write(&wb_stream, weights_map[x][y]);
1199                   }
1200                }
1201             }
1202          }
1203          wb_stream_flush_zeroes(&wb_stream);
1204          if (operation->addition)
1205             append_bits(operation->addition_offset, 32, &bits_in_buffer, &buffer, &map, do_write);
1206          else
1207             append_bits(out_values_per_channel * out_channel, 32, &bits_in_buffer, &buffer, &map, do_write);
1208       }
1209    }
1210 
1211    wb_stream_flush_zeroes(&wb_stream);
1212 
1213    if (bits_in_buffer > 0)
1214       append_bits(0, 32 - bits_in_buffer, &bits_in_buffer, &buffer, &map, do_write);
1215 
1216    return (uint8_t *)map - (uint8_t *)initial_ptr - 1;
1217 }
1218 
1219 static unsigned
1220 calculate_weight_bo_size(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation)
1221 {
1222    struct pipe_context *context = subgraph->base.context;
1223    struct etna_context *ctx = etna_context(context);
1224    unsigned nn_core_count = ctx->screen->info->npu.nn_core_count;
1225    unsigned header_size = ALIGN(nn_core_count * 4, 64);
1226    unsigned input_channels = operation->addition ? 1 : operation->input_channels;
1227    unsigned output_channels = operation->addition ? 1 : operation->output_channels;
1228    unsigned cores_used = MIN2(output_channels, nn_core_count);
1229    unsigned kernels_per_core = DIV_ROUND_UP(output_channels, cores_used);
1230    unsigned weights_size;
1231    unsigned core_size;
1232    unsigned core_size_aligned;
1233    unsigned compressed_size_aligned;
1234 
1235    weights_size = operation->weight_width * operation->weight_height * input_channels;
1236    core_size = 1 + 2 + (weights_size + 4 + 4) * kernels_per_core;
1237    core_size_aligned = ALIGN(core_size, 64);
1238    compressed_size_aligned = header_size + core_size_aligned * cores_used;
1239 
1240    return compressed_size_aligned;
1241 }
1242 
1243 static unsigned
1244 calculate_zrl_bits(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation)
1245 {
1246    struct pipe_context *context = subgraph->base.context;
1247    struct etna_context *ctx = etna_context(context);
1248    unsigned nn_core_count = ctx->screen->info->npu.nn_core_count;
1249    unsigned max_zrl_bits = ctx->screen->info->npu.nn_zrl_bits;
1250    unsigned header_size = ALIGN(nn_core_count * 4, 64);
1251    unsigned input_channels = operation->addition ? 1 : operation->input_channels;
1252    unsigned output_channels = operation->addition ? 1 : operation->output_channels;
1253    unsigned cores_used = MIN2(output_channels, nn_core_count);
1254    unsigned best_compressed_size;
1255    unsigned best_zrl_bits;
1256 
1257    /* These are very unlikely to have enough zeroes for compression to be useful. */
1258    if (operation->addition ||
1259        operation->pointwise) {
1260 
1261       return 0;
1262    }
1263 
1264    /* This calculation can be really slow. Start from max_zrl_bits as big
1265     * buffers will benefit the most from high zero compression.
1266     */
1267    best_compressed_size = UINT_MAX;
1268    best_zrl_bits = 0;
1269    for (int zrl_bits = max_zrl_bits; zrl_bits >= 0; zrl_bits--) {
1270 
1271       unsigned compressed_size = header_size;
1272       for (unsigned core = 0; core < cores_used; core++) {
1273 
1274          unsigned actual_size;
1275          if (operation->pointwise && output_channels > 8)
1276             actual_size = write_core_6(subgraph, NULL, core, operation, zrl_bits);
1277          else if (input_channels > 1)
1278             actual_size = write_core_interleaved(subgraph, NULL, core, operation, zrl_bits);
1279          else
1280             actual_size = write_core_sequential(subgraph, NULL, core, operation, zrl_bits);
1281 
1282          compressed_size += actual_size;
1283       }
1284 
1285       /* If more bits don't compress further, then stop */
1286       if (compressed_size <= best_compressed_size) {
1287          best_compressed_size = compressed_size;
1288          best_zrl_bits = zrl_bits;
1289       } else
1290          break;
1291    }
1292 
1293    return best_zrl_bits;
1294 }
1295 
1296 static struct etna_bo *
1297 create_coefficients_bo(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, unsigned *cache_size)
1298 {
1299    struct pipe_context *context = subgraph->base.context;
1300    struct etna_context *ctx = etna_context(context);
1301    unsigned nn_core_count = ctx->screen->info->npu.nn_core_count;
1302    unsigned header_size = ALIGN(nn_core_count * 4, 64);
1303    unsigned input_channels = operation->addition ? 1 : operation->input_channels;
1304    unsigned output_channels = operation->addition ? 1 : operation->output_channels;
1305    unsigned cores_used = MIN2(output_channels, nn_core_count);
1306    unsigned zrl_bits;
1307    unsigned max_core_size = 0;
1308    unsigned bo_size;
1309 
1310    bo_size = calculate_weight_bo_size(subgraph, operation);
1311    zrl_bits = calculate_zrl_bits(subgraph, operation);
1312 
1313    struct etna_bo *compressed = etna_bo_new(ctx->screen->dev,
1314                                             bo_size,
1315                                             DRM_ETNA_GEM_CACHE_WC);
1316 
1317    etna_bo_cpu_prep(compressed, DRM_ETNA_PREP_WRITE);
1318 
1319    uint32_t *map = etna_bo_map(compressed);
1320    memset(map, 0, bo_size);
1321 
1322    uint32_t *header = map;
1323    map += header_size / 4;
1324 
1325    for (unsigned core = 0; core < cores_used; core++) {
1326 
1327       unsigned actual_size;
1328       if (operation->pointwise && output_channels > 8)
1329          actual_size = write_core_6(subgraph, map, core, operation, zrl_bits);
1330       else if (input_channels > 1)
1331          actual_size = write_core_interleaved(subgraph, map, core, operation, zrl_bits);
1332       else
1333          actual_size = write_core_sequential(subgraph, map, core, operation, zrl_bits);
1334 
1335       actual_size = ALIGN(actual_size, 64);
1336       max_core_size = MAX2(actual_size, max_core_size);
1337 
1338       header[core] = actual_size;
1339 
1340       map += actual_size / 4;
1341    }
1342 
1343    etna_bo_cpu_fini(compressed);
1344 
1345    *cache_size = max_core_size * cores_used;
1346 
1347    return compressed;
1348 }
1349 
1350 void
1351 etna_ml_compile_operation_nn(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation,
1352                              struct etna_vip_instruction *instruction)
1353 {
1354    unsigned coef_cache_size;
1355 
1356    instruction->type = ETNA_JOB_TYPE_NN;
1357    instruction->coefficients = create_coefficients_bo(subgraph, operation, &coef_cache_size);
1358 
1359    struct pipe_resource *input = etna_ml_get_tensor(subgraph, operation->input_tensor);
1360    assert(input);
1361    pipe_resource_reference(&instruction->input, input);
1362 
1363    struct pipe_resource *output = etna_ml_get_tensor(subgraph, operation->output_tensor);
1364    assert(output);
1365    pipe_resource_reference(&instruction->output, output);
1366 
1367    instruction->configs[0] = create_nn_config(subgraph, operation, instruction->coefficients, coef_cache_size);
1368 }
1369 
1370 void
1371 etna_ml_emit_operation_nn(struct etna_ml_subgraph *subgraph,
1372                           struct etna_vip_instruction *operation,
1373                           unsigned idx)
1374 {
1375    struct pipe_context *pctx = subgraph->base.context;
1376    struct etna_context *ctx = etna_context(pctx);
1377    struct etna_cmd_stream *stream = ctx->stream;
1378    unsigned offset = idx + 1;
1379    unsigned nn_config = VIVS_GL_NN_CONFIG_NN_CORE_COUNT(0x0); /* This disables power control of NN cores and enables all of them */
1380 
1381    if (!DBG_ENABLED(ETNA_DBG_NPU_PARALLEL)) {
1382       nn_config |= VIVS_GL_NN_CONFIG_SMALL_BATCH;
1383       offset = 0;
1384    }
1385 
1386    etna_set_state(stream, VIVS_GL_OCB_REMAP_START, 0x0);
1387    etna_set_state(stream, VIVS_GL_OCB_REMAP_END, 0x0);
1388 
1389    etna_set_state(stream, VIVS_GL_NN_CONFIG, nn_config);
1390    etna_set_state_reloc(stream, VIVS_PS_NN_INST_ADDR, &(struct etna_reloc) {
1391       .bo = operation->configs[0],
1392       .flags = ETNA_RELOC_READ,
1393       .offset = offset,
1394    });
1395    etna_set_state(stream, VIVS_PS_UNK10A4, offset);
1396 }
1397