1 /*
2 * Copyright (c) 2023-2024 Tomeu Vizoso <[email protected]>
3 * SPDX-License-Identifier: MIT
4 */
5
6 #include <stdio.h>
7 #include <unistd.h>
8 #include <sys/time.h>
9
10 #include "util/u_inlines.h"
11
12 #include "etnaviv_context.h"
13 #include "etnaviv_debug.h"
14 #include "etnaviv_emit.h"
15 #include "etnaviv_ml_nn.h"
16 #include "etnaviv_ml_tp.h"
17 #include "etnaviv_ml.h"
18
19 struct pipe_resource *
etna_ml_get_tensor(struct etna_ml_subgraph * subgraph,unsigned idx)20 etna_ml_get_tensor(struct etna_ml_subgraph *subgraph, unsigned idx)
21 {
22 return *util_dynarray_element(&subgraph->tensors, struct pipe_resource *, idx);
23 }
24
25 unsigned
etna_ml_get_offset(struct etna_ml_subgraph * subgraph,unsigned idx)26 etna_ml_get_offset(struct etna_ml_subgraph *subgraph, unsigned idx)
27 {
28 return *util_dynarray_element(&subgraph->offsets, unsigned, idx);
29 }
30
31 unsigned
etna_ml_allocate_tensor(struct etna_ml_subgraph * subgraph)32 etna_ml_allocate_tensor(struct etna_ml_subgraph *subgraph)
33 {
34 struct pipe_resource **tensors = util_dynarray_grow(&subgraph->tensors, struct pipe_resource *, 1);
35 tensors[0] = NULL;
36
37 unsigned *offsets = util_dynarray_grow(&subgraph->offsets, unsigned, 1);
38 offsets[0] = 0;
39
40 return util_dynarray_num_elements(&subgraph->tensors, struct pipe_resource *) - 1;
41 }
42
43 static void
etna_ml_create_tensor(struct etna_ml_subgraph * subgraph,unsigned idx,unsigned size)44 etna_ml_create_tensor(struct etna_ml_subgraph *subgraph, unsigned idx, unsigned size)
45 {
46 struct pipe_context *context = subgraph->base.context;
47 struct pipe_resource **tensors = util_dynarray_begin(&subgraph->tensors);
48
49 assert(idx < util_dynarray_num_elements(&subgraph->tensors, struct pipe_resource *));
50
51 struct pipe_resource *res = tensors[idx];
52
53 if (res != NULL) {
54 assert(size == pipe_buffer_size(res));
55 return;
56 }
57
58 res = pipe_buffer_create(context->screen, 0, PIPE_USAGE_DEFAULT, size);
59 tensors[idx] = res;
60
61 ML_DBG("created resource %p for tensor %d with size %d\n", res, idx, size);
62 }
63
64 static bool
needs_reshuffle(const struct pipe_ml_operation * poperation)65 needs_reshuffle(const struct pipe_ml_operation *poperation)
66 {
67 bool has_stride = poperation->conv.stride_x > 1 || poperation->conv.stride_y > 1;
68 bool pointwise = poperation->conv.pointwise;
69 unsigned input_width = poperation->input_tensor->dims[1];
70
71 return has_stride && !(poperation->conv.depthwise && (input_width > 5 || input_width < 3)) && !pointwise;
72 }
73
74 static void
reference_tensor_with_offset(struct etna_ml_subgraph * subgraph,unsigned src_tensor,unsigned dst_tensor,unsigned offset)75 reference_tensor_with_offset(struct etna_ml_subgraph *subgraph,
76 unsigned src_tensor,
77 unsigned dst_tensor,
78 unsigned offset)
79 {
80 struct pipe_resource **tensors = util_dynarray_begin(&subgraph->tensors);
81 unsigned *offsets = util_dynarray_begin(&subgraph->offsets);
82 pipe_resource_reference(&tensors[dst_tensor], tensors[src_tensor]);
83 offsets[dst_tensor] = offset;
84 }
85
86 static void
dump_graph(struct list_head * etna_operations)87 dump_graph(struct list_head *etna_operations)
88 {
89 ML_DBG("\n");
90 ML_DBG("dumping intermediate graph: %d operations\n", list_length(etna_operations));
91
92 ML_DBG("\n");
93 ML_DBG("%3s %-4s %3s %3s %s\n", "idx", "type", "in", "out", "operation type-specific");
94 ML_DBG("================================================================================================\n");
95 unsigned i = 0;
96 list_for_each_entry(struct etna_operation, operation, etna_operations, link) {
97 switch(operation->type) {
98 case ETNA_JOB_TYPE_TP:
99 ML_DBG("%3d %-4s %3d %3d",
100 i, "TP", operation->input_tensor, operation->output_tensor);
101 break;
102 case ETNA_JOB_TYPE_NN:
103 ML_DBG("%3d %-4s %3d %3d in2: %3d",
104 i, "NN", operation->input_tensor, operation->output_tensor, operation->add_input_tensor);
105 break;
106 }
107 ML_DBG("\n");
108 i++;
109 }
110 ML_DBG("\n");
111 }
112
/* Lower the gallium-level ML operations in poperations[] into a list of
 * hardware-level etna_operations (TP and NN jobs) on etna_operations,
 * inserting any required reshuffle/transpose/detranspose helper jobs and
 * making sure every tensor the graph touches ends up with memory backing.
 */
static void
lower_operations(struct etna_ml_subgraph *subgraph,
                 const struct pipe_ml_operation *poperations,
                 unsigned count,
                 struct list_head *etna_operations)
{
   for (unsigned i = 0; i < count; i++) {
      const struct pipe_ml_operation *poperation = &poperations[i];

      switch(poperation->type) {
      case PIPE_ML_OPERATION_TYPE_CONVOLUTION: {
         unsigned input_tensor = poperation->input_tensor->index;
         /* Strided convolutions get a reshuffle job in front; it updates
          * input_tensor so the convolution below consumes its output.
          */
         if (needs_reshuffle(poperation)) {
            struct etna_operation *operation = calloc(1, sizeof(*operation));
            etna_ml_lower_reshuffle(subgraph, poperation, operation, &input_tensor);
            list_addtail(&operation->link, etna_operations);
         }

         struct etna_operation *operation = calloc(1, sizeof(*operation));
         etna_ml_lower_convolution(subgraph, poperation, operation);
         operation->input_tensor = input_tensor;
         list_addtail(&operation->link, etna_operations);
         break;
      }
      case PIPE_ML_OPERATION_TYPE_ADD: {
         struct etna_operation *operation = calloc(1, sizeof(*operation));
         etna_ml_lower_add(subgraph, poperation, operation);
         list_addtail(&operation->link, etna_operations);
         break;
      }
      default:
         unreachable("Unsupported ML operation type");
      }
   }

   /* TODO: Support graphs with more than one input */
   /* Multi-channel graph inputs need a transpose job prepended; every
    * operation consuming the original input tensor is retargeted to the
    * transpose's output.
    */
   if (poperations[0].input_tensor->dims[3] > 1) {
      struct etna_operation *operation = calloc(1, sizeof(*operation));
      unsigned input_tensor = poperations[0].input_tensor->index;
      unsigned output_tensor;
      etna_ml_lower_transpose(subgraph, &poperations[0], operation, &output_tensor);
      /* NOTE: the loop declares its own `operation`, shadowing the
       * transpose operation above; the list_add() after the loop runs in
       * the outer scope and adds the transpose job at the list head.
       */
      list_for_each_entry(struct etna_operation, operation, etna_operations, link) {
         if (operation->input_tensor == input_tensor)
            operation->input_tensor = output_tensor;
         if (operation->type == ETNA_JOB_TYPE_NN && operation->addition) {
            if (operation->add_input_tensor == input_tensor)
               operation->add_input_tensor = output_tensor;
         }
      }
      list_add(&operation->link, etna_operations);
   }

   /* Back every operation's input tensor with memory. For NN additions the
    * second operand shares the first operand's buffer, starting at its
    * midpoint.
    */
   list_for_each_entry(struct etna_operation, operation, etna_operations, link) {
      etna_ml_create_tensor(subgraph, operation->input_tensor, operation->input_tensor_size);

      if (operation->type == ETNA_JOB_TYPE_NN && operation->addition)
         reference_tensor_with_offset(subgraph,
                                      operation->input_tensor,
                                      operation->add_input_tensor,
                                      operation->input_tensor_size / 2);
   }

   /* Detranspose any output tensors that aren't inputs to other operations
    * and have output channels, these are the outputs of the graph.
    */
   list_for_each_entry_safe(struct etna_operation, operation, etna_operations, link) {
      struct pipe_resource *res = etna_ml_get_tensor(subgraph, operation->output_tensor);
      if (res != NULL)
         continue;

      if (operation->output_channels > 1) {
         struct etna_operation *transpose_operation = calloc(1, sizeof(*operation));
         etna_ml_lower_detranspose(subgraph, operation, transpose_operation);
         /* The producing operation now feeds the detranspose job instead. */
         operation->output_tensor = transpose_operation->input_tensor;
         list_add(&transpose_operation->link, &operation->link);
      }
   }

   /* Create any output tensors that aren't inputs to other operations, these
    * are the outputs of the graph.
    */
   ML_DBG("Ensuring all output tensors have their memory backing.\n");
   list_for_each_entry(struct etna_operation, operation, etna_operations, link) {
      struct pipe_resource *res = etna_ml_get_tensor(subgraph, operation->output_tensor);
      if (res != NULL)
         continue;

      unsigned size = operation->output_width * operation->output_height * operation->output_channels;
      etna_ml_create_tensor(subgraph, operation->output_tensor, size);
   }

   if (DBG_ENABLED(ETNA_DBG_ML_MSGS))
      dump_graph(etna_operations);
}
207
208 static unsigned
count_tensors(const struct pipe_ml_operation * poperations,unsigned count)209 count_tensors(const struct pipe_ml_operation *poperations,
210 unsigned count)
211 {
212 unsigned tensor_count = 0;
213
214 for (unsigned i = 0; i < count; i++) {
215 const struct pipe_ml_operation *poperation = &poperations[i];
216 tensor_count = MAX2(tensor_count, poperation->input_tensor->index);
217 tensor_count = MAX2(tensor_count, poperation->output_tensor->index);
218 switch (poperation->type) {
219 case PIPE_ML_OPERATION_TYPE_CONVOLUTION:
220 tensor_count = MAX2(tensor_count, poperation->conv.weight_tensor->index);
221 tensor_count = MAX2(tensor_count, poperation->conv.bias_tensor->index);
222 break;
223 case PIPE_ML_OPERATION_TYPE_ADD:
224 tensor_count = MAX2(tensor_count, poperation->add.input_tensor->index);
225 break;
226 default:
227 unreachable("Unsupported ML operation type");
228 }
229 }
230
231 return tensor_count + 1;
232 }
233
234 struct pipe_ml_subgraph *
etna_ml_subgraph_create(struct pipe_context * pcontext,const struct pipe_ml_operation * poperations,unsigned count)235 etna_ml_subgraph_create(struct pipe_context *pcontext,
236 const struct pipe_ml_operation *poperations,
237 unsigned count)
238 {
239 struct etna_context *ctx = etna_context(pcontext);
240 unsigned nn_core_count = ctx->screen->info->npu.nn_core_count;
241 struct etna_ml_subgraph *subgraph;
242 struct list_head operations;
243 unsigned tensor_count;
244
245 if (nn_core_count < 1) {
246 fprintf(stderr, "We need at least 1 NN core to do anything useful.\n");
247 abort();
248 }
249
250 subgraph = calloc(1, sizeof(*subgraph));
251 tensor_count = count_tensors(poperations, count);
252
253 list_inithead(&operations);
254
255 subgraph->base.context = pcontext;
256 util_dynarray_init(&subgraph->operations, NULL);
257
258 util_dynarray_init(&subgraph->tensors, NULL);
259 if (!util_dynarray_resize(&subgraph->tensors, struct pipe_resource *, tensor_count))
260 return NULL;
261 memset(util_dynarray_begin(&subgraph->tensors), 0, subgraph->tensors.size);
262
263 util_dynarray_init(&subgraph->offsets, NULL);
264 if (!util_dynarray_resize(&subgraph->offsets, unsigned, tensor_count))
265 return NULL;
266 memset(util_dynarray_begin(&subgraph->offsets), 0, subgraph->offsets.size);
267
268 lower_operations(subgraph, poperations, count, &operations);
269
270 list_for_each_entry(struct etna_operation, operation, &operations, link) {
271 struct etna_vip_instruction instruction = {0};
272
273 switch(operation->type) {
274 case ETNA_JOB_TYPE_NN:
275 etna_ml_compile_operation_nn(subgraph, operation, &instruction);
276 break;
277 case ETNA_JOB_TYPE_TP:
278 etna_ml_compile_operation_tp(subgraph, operation, &instruction);
279 break;
280 }
281
282 util_dynarray_append(&subgraph->operations, struct etna_vip_instruction, instruction);
283 }
284
285 list_for_each_entry_safe(struct etna_operation, operation, &operations, link) {
286 pipe_resource_reference(&operation->weight_tensor, NULL);
287 pipe_resource_reference(&operation->bias_tensor, NULL);
288 free(operation);
289 }
290
291 return &subgraph->base;
292 }
293
/* Write the full contents of bo to a file named mesa-<name>-<nr>.bin in the
 * current directory. Debug-only helper; failures are logged, not fatal.
 */
static void
dump_buffer(struct etna_bo *bo, char *name, int operation_nr)
{
   char buffer[255];

   uint32_t *map = etna_bo_map(bo);
   snprintf(buffer, sizeof(buffer), "mesa-%s-%08d.bin", name, operation_nr);
   /* %p for the CPU pointer: passing a pointer through %lx is a printf
    * type mismatch (undefined behavior, and wrong on LLP64 targets).
    */
   ML_DBG("Dumping buffer from %p (0x%x) to %s\n", map, etna_bo_gpu_va(bo), buffer);
   FILE *f = fopen(buffer, "wb");
   if (!f) {
      /* The previous assert(f) compiles out in release builds, which would
       * have let fwrite dereference NULL.
       */
      ML_DBG("Error opening %s for writing: %s\n", buffer, strerror(errno));
      return;
   }
   fwrite(map, 1, etna_bo_size(bo), f);
   if (ferror(f)) {
      ML_DBG("Error in writing to file: %s\n", strerror(errno));
   }
   /* fclose flushes; check it so truncated dumps are noticed. */
   if (fclose(f) != 0) {
      ML_DBG("Error closing file: %s\n", strerror(errno));
   }
}
311
312 static void
init_npu(struct pipe_context * pctx)313 init_npu(struct pipe_context *pctx)
314 {
315 struct etna_context *ctx = etna_context(pctx);
316 struct etna_cmd_stream *stream = ctx->stream;
317
318 /* These zeroes match the blob's cmdstream. They are here to make diff'ing easier.*/
319 etna_cmd_stream_emit(stream, 0x0);
320 etna_cmd_stream_emit(stream, 0x0);
321 etna_cmd_stream_emit(stream, 0x0);
322 etna_cmd_stream_emit(stream, 0x0);
323 etna_cmd_stream_emit(stream, 0x0);
324 etna_cmd_stream_emit(stream, 0x0);
325 etna_cmd_stream_emit(stream, 0x0);
326 etna_cmd_stream_emit(stream, 0x0);
327
328 etna_set_state(stream, VIVS_PA_SYSTEM_MODE, VIVS_PA_SYSTEM_MODE_PROVOKING_VERTEX_LAST |
329 VIVS_PA_SYSTEM_MODE_HALF_PIXEL_CENTER);
330 etna_set_state(stream, VIVS_GL_API_MODE, VIVS_GL_API_MODE_OPENCL);
331
332 etna_cmd_stream_emit(stream, 0x0);
333 etna_cmd_stream_emit(stream, 0x0);
334
335 pctx->flush(pctx, NULL, 0);
336 }
337
338 static void
close_batch(struct pipe_context * pctx)339 close_batch(struct pipe_context *pctx)
340 {
341 struct etna_context *ctx = etna_context(pctx);
342 struct etna_cmd_stream *stream = ctx->stream;
343
344 unsigned cache = VIVS_GL_FLUSH_CACHE_DEPTH | VIVS_GL_FLUSH_CACHE_COLOR | VIVS_GL_FLUSH_CACHE_UNK10;
345 if (!DBG_ENABLED(ETNA_DBG_NPU_PARALLEL))
346 cache |= VIVS_GL_FLUSH_CACHE_UNK11 | VIVS_GL_FLUSH_CACHE_SHADER_L1;
347
348 etna_set_state(stream, VIVS_GL_FLUSH_CACHE, cache);
349 etna_set_state(stream, VIVS_GL_FLUSH_CACHE, cache);
350
351 etna_cmd_stream_emit(stream, 0x0);
352 etna_cmd_stream_emit(stream, 0x0);
353
354 ctx->dirty = 0;
355 }
356
/* Run a previously-compiled subgraph on the NPU: copy the input tensor in,
 * then emit each compiled instruction (TP or NN job) into the command
 * stream, either as one batch or flushed per-operation depending on the
 * ETNA_DBG_NPU_NO_BATCHING debug flag.
 */
void
etna_ml_subgraph_invoke(struct pipe_context *pctx, struct pipe_ml_subgraph *psubgraph, struct pipe_tensor *input)
{
   struct etna_context *ctx = etna_context(pctx);
   unsigned tp_core_count = ctx->screen->info->npu.tp_core_count;
   struct etna_ml_subgraph *subgraph = (struct etna_ml_subgraph *)(psubgraph);
   struct etna_cmd_stream *stream = ctx->stream;
   /* NOTE(review): process-wide flag, so the NPU init runs only once even
    * across multiple contexts — confirm that is intentional.
    */
   static bool is_initialized = false;

   if (!is_initialized) {
      init_npu(pctx);
      is_initialized = true;
   }

   if (!DBG_ENABLED(ETNA_DBG_NPU_NO_BATCHING)) {
      /* These zeroes match the blob's cmdstream. They are here to make diff'ing easier.*/
      etna_cmd_stream_emit(stream, 0x0);
      etna_cmd_stream_emit(stream, 0x0);
      etna_cmd_stream_emit(stream, 0x0);
      etna_cmd_stream_emit(stream, 0x0);
      etna_cmd_stream_emit(stream, 0x0);
      etna_cmd_stream_emit(stream, 0x0);
      etna_cmd_stream_emit(stream, 0x0);
      etna_cmd_stream_emit(stream, 0x0);
   }

   unsigned i = 0;
   unsigned dump_id = 0;
   util_dynarray_foreach(&subgraph->operations, struct etna_vip_instruction, operation) {
#if 0
      if (i == util_dynarray_num_elements(&subgraph->operations, struct etna_vip_instruction) - 1) {
         /* TODO: This may be necessary when bypassing all-zero kernels */
         etna_bo_cpu_prep(etna_resource(operation->output)->bo, DRM_ETNA_PREP_WRITE);
         uint8_t *dst_map = etna_bo_map(etna_resource(operation->output)->bo);
         memset(dst_map, 0x77, etna_bo_size(etna_resource(operation->output)->bo));
         etna_bo_cpu_fini(etna_resource(operation->output)->bo);
      }
#endif

      /* The caller's input tensor feeds the first operation only. */
      if (i == 0) {
         unsigned size = input->dims[0] * input->dims[1] * input->dims[2] * input->dims[3];
         pipe_buffer_copy(pctx, operation->input, input->resource, 0, 0, size);
      }

      /* Optionally dump each job's configuration (and NN coefficients) to
       * files for offline inspection.
       */
      if (DBG_ENABLED(ETNA_DBG_DUMP_SHADERS)) {
         switch (operation->type) {
         case ETNA_JOB_TYPE_TP:
            for (unsigned j = 0; j < tp_core_count && operation->configs[j]; j++) {
               dump_buffer(operation->configs[j], "tp", dump_id);
               dump_id++;
            }
            break;
         case ETNA_JOB_TYPE_NN:
            dump_buffer(operation->configs[0], "nn", dump_id);
            dump_buffer(operation->coefficients, "compressed", dump_id);
            dump_id++;
            break;
         default:
            unreachable("Unsupported ML operation type");
         }
      }

      if (DBG_ENABLED(ETNA_DBG_NPU_NO_BATCHING)) {
         /* These zeroes match the blob's cmdstream. They are here to make diff'ing easier.*/
         etna_cmd_stream_emit(stream, 0x0);
         etna_cmd_stream_emit(stream, 0x0);
         etna_cmd_stream_emit(stream, 0x0);
         etna_cmd_stream_emit(stream, 0x0);
         etna_cmd_stream_emit(stream, 0x0);
         etna_cmd_stream_emit(stream, 0x0);
         etna_cmd_stream_emit(stream, 0x0);
         etna_cmd_stream_emit(stream, 0x0);
      }

      /* Declare every BO the job touches to the kernel for this submit. */
      for (unsigned j = 0; j < tp_core_count && operation->configs[j]; j++)
         etna_cmd_stream_ref_bo(stream, operation->configs[j], ETNA_RELOC_READ);
      if (operation->coefficients)
         etna_cmd_stream_ref_bo(stream, operation->coefficients, ETNA_RELOC_READ);
      etna_cmd_stream_ref_bo(stream, etna_resource(operation->input)->bo, ETNA_RELOC_READ);
      etna_cmd_stream_ref_bo(stream, etna_resource(operation->output)->bo, ETNA_RELOC_WRITE);

      switch (operation->type) {
      case ETNA_JOB_TYPE_TP:
         etna_ml_emit_operation_tp(subgraph, operation, i);
         break;
      case ETNA_JOB_TYPE_NN:
         etna_ml_emit_operation_nn(subgraph, operation, i);
         break;
      default:
         unreachable("Unsupported ML operation type");
      }

      if (DBG_ENABLED(ETNA_DBG_NPU_NO_BATCHING)) {
         ML_DBG("Running operation %d - %d\n", i, operation->type);
         close_batch(pctx);
         pctx->flush(pctx, NULL, 0);
         /* The flush may have replaced the context's stream; re-fetch it. */
         stream = ctx->stream;
      }

      i++;
   }

   if (!DBG_ENABLED(ETNA_DBG_NPU_NO_BATCHING))
      close_batch(pctx);

   if (DBG_ENABLED(ETNA_DBG_FLUSH_ALL))
      pctx->flush(pctx, NULL, 0);
}
465
466 void
etna_ml_subgraph_read_outputs(struct pipe_context * context,struct pipe_ml_subgraph * psubgraph,unsigned outputs_count,unsigned output_idxs[],void * outputs[])467 etna_ml_subgraph_read_outputs(struct pipe_context *context, struct pipe_ml_subgraph *psubgraph,
468 unsigned outputs_count, unsigned output_idxs[], void *outputs[])
469 {
470 struct etna_ml_subgraph *subgraph = (struct etna_ml_subgraph *)(psubgraph);
471 unsigned operation_count = util_dynarray_num_elements(&subgraph->operations, struct etna_vip_instruction);
472 struct etna_vip_instruction *last_operation;
473
474 last_operation = util_dynarray_element(&subgraph->operations,
475 struct etna_vip_instruction,
476 operation_count - 1);
477
478 if (DBG_ENABLED(ETNA_DBG_ML_MSGS)) {
479 long start, end;
480 struct timespec time;
481
482 clock_gettime(CLOCK_MONOTONIC, &time);
483 start = (long)time.tv_sec * 1000 + (long)time.tv_nsec / 1000000;
484
485 context->flush(context, NULL, 0);
486
487 struct pipe_transfer *transfer = NULL;
488 pipe_buffer_map(context, last_operation->output, PIPE_MAP_READ, &transfer);
489 pipe_buffer_unmap(context, transfer);
490
491 clock_gettime(CLOCK_MONOTONIC, &time);
492 end = (long)time.tv_sec * 1000 + (long)time.tv_nsec / 1000000;
493 ML_DBG("Running the NN job took %ld ms.\n", (end - start));
494 } else
495 context->flush(context, NULL, 0);
496
497 for (int i = 0; i < outputs_count; i++) {
498 struct pipe_resource *res = etna_ml_get_tensor(subgraph, output_idxs[i]);
499 pipe_buffer_read(context, res, 0, pipe_buffer_size(res), outputs[i]);
500 }
501
502 if (DBG_ENABLED(ETNA_DBG_DUMP_SHADERS)) {
503 unsigned i = 0;
504 util_dynarray_foreach(&subgraph->operations, struct etna_vip_instruction, operation) {
505 struct pipe_transfer *transfer = NULL;
506
507 pipe_buffer_map(context, operation->input, PIPE_MAP_READ, &transfer);
508 dump_buffer(etna_resource(operation->input)->bo, "input", i);
509 pipe_buffer_unmap(context, transfer);
510
511 pipe_buffer_map(context, operation->output, PIPE_MAP_READ, &transfer);
512 dump_buffer(etna_resource(operation->output)->bo, "output", i);
513 pipe_buffer_unmap(context, transfer);
514
515 i++;
516 }
517 }
518 }
519
520 void
etna_ml_subgraph_destroy(struct pipe_context * context,struct pipe_ml_subgraph * psubgraph)521 etna_ml_subgraph_destroy(struct pipe_context *context, struct pipe_ml_subgraph *psubgraph)
522 {
523 struct etna_ml_subgraph *subgraph = (struct etna_ml_subgraph *)(psubgraph);
524
525 util_dynarray_foreach(&subgraph->operations, struct etna_vip_instruction, operation) {
526 for (unsigned j = 0; j < MAX_CONFIG_BOS && operation->configs[j]; j++)
527 etna_bo_del(operation->configs[j]);
528 etna_bo_del(operation->coefficients);
529 pipe_resource_reference(&operation->input, NULL);
530 pipe_resource_reference(&operation->output, NULL);
531 }
532 util_dynarray_fini(&subgraph->operations);
533
534 util_dynarray_foreach(&subgraph->tensors, struct pipe_resource *, tensor) {
535 pipe_resource_reference(tensor, NULL);
536 }
537 util_dynarray_fini(&subgraph->tensors);
538 util_dynarray_fini(&subgraph->offsets);
539
540 free(subgraph);
541 }
542