/*
 * Copyright (c) 2023-2024 Tomeu Vizoso <[email protected]>
 * SPDX-License-Identifier: MIT
 */

#include "pipe-loader/pipe_loader.h"
#include "pipe/p_context.h"
#include "pipe/p_screen.h"
#include "pipe/p_state.h"
#include "util/format/u_format.h"
#include "util/u_inlines.h"

#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/builtin_ops.h"
#include "tensorflow/lite/core/c/builtin_op_data.h"

/* TODO: Move to TfLiteAsyncKernel for zero-copy of buffers */

enum teflon_debug_flags {
   TEFLON_DEBUG_VERBOSE = 1 << 1,
};

static const struct debug_named_value teflon_debug_flags[] = {
    { "verbose", TEFLON_DEBUG_VERBOSE, "Verbose logging." },
    DEBUG_NAMED_VALUE_END
};

DEBUG_GET_ONCE_FLAGS_OPTION(debug_teflon, "TEFLON_DEBUG", teflon_debug_flags, 0)

static inline void
teflon_debug(const char *format, ...)
{
   if (unlikely(debug_get_option_debug_teflon() & TEFLON_DEBUG_VERBOSE)) {
      va_list ap;
      va_start(ap, format);
      _debug_vprintf(format, ap);
      va_end(ap);
   }
}

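/*
 * Per-delegate and per-subgraph state: teflon_delegate wraps the TfLite
 * delegate with the Gallium device and context that back it, while
 * teflon_subgraph pairs a compiled pipe_ml_subgraph with the TfLite tensor
 * indices it reads from and writes to at invoke time.
 */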
struct teflon_delegate
{
   TfLiteDelegate base;
   struct pipe_loader_device *dev;
   struct pipe_context *context;
};

struct teflon_subgraph
{
   struct pipe_ml_subgraph *base;

   unsigned *input_tensors;
   unsigned input_count;

   unsigned *output_tensors;
   unsigned output_count;
};

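/*
 * Computes the tensor's size in bytes (element count times element width)
 * and uploads its contents into a newly created pipe buffer.
 */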
static struct pipe_resource *
create_resource(struct pipe_context *context, TfLiteTensor tensor)
{
   unsigned bytes;
   unsigned size = 1;

   for (int i = 0; i < tensor.dims->size; i++)
      size *= tensor.dims->data[i];

   switch(tensor.type) {
      case kTfLiteInt8:
      case kTfLiteUInt8:
         bytes = 1;
         break;
      case kTfLiteInt16:
      case kTfLiteUInt16:
      case kTfLiteFloat16:
         bytes = 2;
         break;
      case kTfLiteInt32:
      case kTfLiteUInt32:
      case kTfLiteFloat32:
         bytes = 4;
         break;
      case kTfLiteInt64:
      case kTfLiteUInt64:
      case kTfLiteFloat64:
      case kTfLiteComplex64:
         bytes = 8;
         break;
      default:
         unreachable("Unsupported TF type");
   }

   return pipe_buffer_create_with_data(context, 0, PIPE_USAGE_DEFAULT, size * bytes, tensor.data.data);
}

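/*
 * Translates one delegated TfLite node into a pipe_ml_operation. The tensors
 * array is indexed by TfLite tensor index; builtin_data is only dereferenced
 * for convolutions, where it carries the TfLiteConvParams.
 */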
static void
fill_operation(struct teflon_delegate *delegate, TfLiteContext *tf_context, TfLiteNode *node, TfLiteRegistration *node_registration, struct pipe_ml_operation *operation, struct pipe_tensor *tensors)
{
   TfLiteConvParams* params = (TfLiteConvParams*)node->builtin_data;

   operation->input_tensor = &tensors[node->inputs->data[0]];
   operation->output_tensor = &tensors[node->outputs->data[0]];

   switch(node_registration->builtin_code) {
      case kTfLiteBuiltinConv2d:
      case kTfLiteBuiltinDepthwiseConv2d:
         operation->type = PIPE_ML_OPERATION_TYPE_CONVOLUTION;
         operation->conv.weight_tensor = &tensors[node->inputs->data[1]];
         operation->conv.bias_tensor = &tensors[node->inputs->data[2]];
         operation->conv.stride_x = params->stride_width;
         operation->conv.stride_y = params->stride_height;
         operation->conv.padding_same = params->padding == kTfLitePaddingSame;
         operation->conv.depthwise = node_registration->builtin_code == kTfLiteBuiltinDepthwiseConv2d;
         operation->conv.pointwise = operation->conv.weight_tensor->dims[1] == 1 &&
                                     operation->conv.weight_tensor->dims[2] == 1;
         break;
      case kTfLiteBuiltinAveragePool2d:
         operation->type = PIPE_ML_OPERATION_TYPE_POOLING;
         break;
      case kTfLiteBuiltinAdd:
         operation->type = PIPE_ML_OPERATION_TYPE_ADD;
         operation->add.input_tensor = &tensors[node->inputs->data[1]];
         break;
      default:
         unreachable("Unsupported ML operation type");
   }
}

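/*
 * Populates a pipe_tensor from the TfLite tensor at the given index:
 * dimensions, affine-quantization scale and zero point, signedness, and,
 * for tensors with constant data, a pipe buffer holding their contents.
 */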
static void
fill_tensor(struct teflon_delegate *delegate, TfLiteContext *tf_context, struct pipe_tensor *tensor, unsigned index)
{
   struct pipe_context *context = delegate->context;
   TfLiteTensor tf_tensor = tf_context->tensors[index];
   const TfLiteAffineQuantization *quant = (const TfLiteAffineQuantization *)tf_tensor.quantization.params;

   if (tf_tensor.type == kTfLiteNoType)
      return; /* Placeholder tensor */

   if (tf_tensor.data.data)
      tensor->resource = create_resource(context, tf_tensor);

   tensor->index = index;
   memcpy(tensor->dims, tf_tensor.dims->data, tf_tensor.dims->size * sizeof(*tensor->dims));
   tensor->scale = quant->scale->data[0];
   tensor->zero_point = quant->zero_point->data[0];

   switch(tf_tensor.type) {
      case kTfLiteUInt8:
      case kTfLiteUInt16:
      case kTfLiteUInt32:
      case kTfLiteUInt64:
         tensor->is_signed = false;
         break;
      default:
         tensor->is_signed = true;
   }
}

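/*
 * Logs the tensor and operation tables for a graph that is about to be
 * compiled; output is only emitted when TEFLON_DEBUG=verbose is set.
 */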
static void
dump_graph(struct pipe_tensor *tensors, unsigned tensor_count, struct pipe_ml_operation *operations, unsigned operation_count)
{
   teflon_debug("\n");
   teflon_debug("teflon: compiling graph: %d tensors %d operations\n",
                tensor_count, operation_count);

   teflon_debug("%3s %-8s %3s %s %-12s\n", "idx", "scale", "zp", "has_data", "size");
   teflon_debug("=======================================\n");
   for (int i = 0; i < tensor_count; i++) {
      teflon_debug("%3d %6f %3x %-8s %dx%dx%dx%d\n",
                  tensors[i].index,
                  tensors[i].scale,
                  tensors[i].zero_point,
                  tensors[i].resource == NULL ? "no" : "yes",
                  tensors[i].dims[0], tensors[i].dims[1], tensors[i].dims[2], tensors[i].dims[3]);
   }

   teflon_debug("\n");
   teflon_debug("%3s %-6s %3s %3s  %s\n", "idx", "type", "in", "out", "operation type-specific");
   teflon_debug("================================================================================================\n");
   for (int i = 0; i < operation_count; i++) {
      switch(operations[i].type) {
      case PIPE_ML_OPERATION_TYPE_ADD:
         teflon_debug("%3d %-6s %3d %3d  in: %d",
                     i,
                     "ADD",
                     operations[i].input_tensor->index,
                     operations[i].output_tensor->index,
                     operations[i].add.input_tensor->index);
         break;
      case PIPE_ML_OPERATION_TYPE_CONVOLUTION:
         teflon_debug("%3d %-6s %3d %3d  w: %d b: %d stride: %d pad: %s",
                     i,
                     operations[i].conv.depthwise ? "DWCONV" : "CONV",
                     operations[i].input_tensor->index,
                     operations[i].output_tensor->index,
                     operations[i].conv.weight_tensor->index,
                     operations[i].conv.bias_tensor->index,
                     operations[i].conv.stride_x,
                     operations[i].conv.padding_same ? "SAME" : "VALID");
         break;
      case PIPE_ML_OPERATION_TYPE_POOLING:
         teflon_debug("%3d %-6s %3d %3d  filter: %dx%d stride: %d pad: %s",
                     i,
                     "POOL",
                     operations[i].input_tensor->index,
                     operations[i].output_tensor->index,
                     operations[i].pooling.filter_height,
                     operations[i].pooling.filter_width,
                     operations[i].pooling.stride_x,
                     operations[i].pooling.padding_same ? "SAME" : "VALID");
         break;
      }

      teflon_debug("\n");
   }
   teflon_debug("\n");
}

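/*
 * init() callback of the delegate kernel: TfLite hands the partition
 * parameters (TfLiteDelegateParams) in via `buffer`. Gathers all tensors
 * and the nodes to be replaced, compiles them into a pipe_ml_subgraph, and
 * records which tensor indices are the subgraph's runtime inputs and
 * outputs. The returned pointer becomes node->user_data for the other
 * callbacks.
 */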
static void *
partition_init(TfLiteContext *tf_context, const char *buffer, size_t length)
{
   const TfLiteDelegateParams *params = (const TfLiteDelegateParams *)buffer;
   struct teflon_delegate *delegate = (struct teflon_delegate *)params->delegate;
   struct pipe_context *context = delegate->context;
   struct pipe_ml_operation operations[params->nodes_to_replace->size];
   struct pipe_tensor tensors[tf_context->tensors_size];
   long start = 0, end = 0;

   memset(operations, 0, sizeof(operations));
   memset(tensors, 0, sizeof(tensors));

   if (unlikely(debug_get_option_debug_teflon() & TEFLON_DEBUG_VERBOSE)) {
      struct timespec time;
      clock_gettime(CLOCK_MONOTONIC, &time);
      start = (long)time.tv_sec * 1000 + (long)time.tv_nsec / 1000000;
   }

   for (int i = 0; i < tf_context->tensors_size; i++)
      fill_tensor(delegate, tf_context, &tensors[i], i);

   for (int i = 0; i < params->nodes_to_replace->size; i++)
   {
      const int node_index = params->nodes_to_replace->data[i];
      TfLiteNode *delegated_node = NULL;
      TfLiteRegistration *delegated_node_registration = NULL;
      tf_context->GetNodeAndRegistration(tf_context, node_index, &delegated_node,
                                         &delegated_node_registration);

      fill_operation(delegate, tf_context, delegated_node, delegated_node_registration, &operations[i], tensors);
   }

   if (debug_get_option_debug_teflon() & TEFLON_DEBUG_VERBOSE)
      dump_graph(tensors, tf_context->tensors_size, operations, params->nodes_to_replace->size);

   struct pipe_ml_subgraph *subgraph;
   subgraph = context->ml_subgraph_create(context,
                                          operations,
                                          params->nodes_to_replace->size);

   for (int i = 0; i < tf_context->tensors_size; i++)
      pipe_resource_reference(&tensors[i].resource, NULL);

   struct teflon_subgraph *tsubgraph = calloc(1, sizeof(*tsubgraph));
   tsubgraph->base = subgraph;

   tsubgraph->input_tensors = malloc(params->input_tensors->size * sizeof(*tsubgraph->input_tensors));
   for (int i = 0; i < params->input_tensors->size; i++) {
      unsigned tensor_idx = params->input_tensors->data[i];
      TfLiteTensor *tensor = &tf_context->tensors[tensor_idx];
      if (tensor->allocation_type == kTfLiteMmapRo)
         continue; /* Skip constant (read-only) tensors; only runtime inputs count */
      tsubgraph->input_tensors[tsubgraph->input_count] = tensor_idx;
      tsubgraph->input_count++;
   }

   tsubgraph->output_count = params->output_tensors->size;
   tsubgraph->output_tensors = malloc(params->output_tensors->size * sizeof(*tsubgraph->output_tensors));
   memcpy(tsubgraph->output_tensors, params->output_tensors->data,
          params->output_tensors->size * sizeof(*tsubgraph->output_tensors));

   if (unlikely(debug_get_option_debug_teflon() & TEFLON_DEBUG_VERBOSE)) {
      struct timespec time;
      clock_gettime(CLOCK_MONOTONIC, &time);
      end = (long)time.tv_sec * 1000 + (long)time.tv_nsec / 1000000;
      teflon_debug("teflon: compiled graph, took %ld ms\n", (end - start));
   }

   return tsubgraph;
}

static TfLiteStatus
partition_prepare(TfLiteContext *context, TfLiteNode *node)
{
   // TODO: If input size has changed, resize input, intermediate and output buffers

   return kTfLiteOk;
}

// De-allocates the per-node-and-Interpreter custom data.
static void
partition_free(TfLiteContext *tf_context, void *buffer)
{
   struct teflon_subgraph *tsubgraph = (struct teflon_subgraph *)buffer;
   struct pipe_ml_subgraph *subgraph = tsubgraph->base;
   struct pipe_context *context = subgraph->context;

   context->ml_subgraph_destroy(context, subgraph);
   free(tsubgraph->input_tensors);
   free(tsubgraph->output_tensors);
   free(tsubgraph);
}

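/*
 * invoke() callback: uploads the (currently single) input tensor, runs the
 * compiled subgraph on the accelerator, and copies each output back into
 * the corresponding TfLite tensor buffer.
 */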
static TfLiteStatus
partition_invoke(TfLiteContext *tf_context, TfLiteNode *node)
{
   struct teflon_delegate *delegate = (struct teflon_delegate *)node->delegate;
   struct teflon_subgraph *tsubgraph = (struct teflon_subgraph *)node->user_data;
   struct pipe_ml_subgraph *subgraph = tsubgraph->base;
   struct pipe_context *context = delegate->context;
   long start = 0, end = 0;

   if (unlikely(debug_get_option_debug_teflon() & TEFLON_DEBUG_VERBOSE)) {
      struct timespec time;
      clock_gettime(CLOCK_MONOTONIC, &time);
      start = (long)time.tv_sec * 1000 + (long)time.tv_nsec / 1000000;
   }

   struct pipe_tensor input = {0};
   /* FIXME: Support multiple inputs */
   fill_tensor(delegate, tf_context, &input, tsubgraph->input_tensors[0]);
   context->ml_subgraph_invoke(context, subgraph, &input);

   void **buffers = malloc(tsubgraph->output_count * sizeof(*buffers));
   for (unsigned i = 0; i < tsubgraph->output_count; i++)
      buffers[i] = tf_context->tensors[tsubgraph->output_tensors[i]].data.data;
   context->ml_subgraph_read_output(context, subgraph, tsubgraph->output_count, tsubgraph->output_tensors, buffers);
   free(buffers);

   pipe_resource_reference(&input.resource, NULL);

   if (unlikely(debug_get_option_debug_teflon() & TEFLON_DEBUG_VERBOSE)) {
      struct timespec time;
      clock_gettime(CLOCK_MONOTONIC, &time);
      end = (long)time.tv_sec * 1000 + (long)time.tv_nsec / 1000000;
      teflon_debug("teflon: invoked graph, took %ld ms\n", (end - start));
   }

   return kTfLiteOk;
}

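/*
 * Delegate Prepare() callback: walks the execution plan, collects the node
 * indices whose builtin ops this delegate can handle, and asks TfLite to
 * replace those node subsets with a delegate kernel built from the
 * partition_* callbacks above.
 */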
static TfLiteStatus
PrepareDelegate(TfLiteContext *context, TfLiteDelegate *delegate)
{
   TfLiteIntArray *plan;
   TfLiteNode *node;
   TF_LITE_ENSURE_STATUS(context->GetExecutionPlan(context, &plan));

   // Get a list of supported nodes.
   TfLiteIntArray *supported_nodes = malloc(plan->size * sizeof(int) + sizeof(*supported_nodes));
   supported_nodes->size = plan->size;
   unsigned node_count = 0;
   for (int i = 0; i < plan->size; i++) {
      int node_index = plan->data[i];
      bool supported = false;
      TfLiteRegistration *registration;
      TF_LITE_ENSURE_STATUS(context->GetNodeAndRegistration(
          context, node_index, &node, &registration));

      switch(registration->builtin_code) {
         case kTfLiteBuiltinConv2d:
         case kTfLiteBuiltinDepthwiseConv2d:
         case kTfLiteBuiltinAdd:
            supported = true;
            break;
      }

      if (supported)
         supported_nodes->data[node_count++] = node_index;
   }
   supported_nodes->size = node_count;

   TfLiteRegistration registration;

   registration.init = partition_init;
   registration.free = partition_free;
   registration.prepare = partition_prepare;
   registration.invoke = partition_invoke;

   registration.profiling_string = NULL;
   registration.builtin_code = kTfLiteBuiltinDelegate;
   registration.version = 1;
   registration.registration_external = NULL;
   registration.custom_name = "Teflon Delegate";

   // Replace supported subgraphs.
   TfLiteStatus status = context->ReplaceNodeSubsetsWithDelegateKernels(
       context,
       registration,
       supported_nodes,
       delegate);

   free(supported_nodes);

   return status;
}

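/*
 * Buffer-handle hooks are unused for now: tensor data goes through the
 * regular TfLite CPU buffers, so these are no-op stubs (see the
 * TfLiteAsyncKernel TODO at the top of the file).
 */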
static TfLiteStatus
CopyFromBufferHandle(TfLiteContext *context,
                     TfLiteDelegate *delegate,
                     TfLiteBufferHandle buffer_handle,
                     TfLiteTensor *tensor)
{
   return kTfLiteOk;
}

static void
FreeBufferHandle(TfLiteContext *context,
                 TfLiteDelegate *delegate,
                 TfLiteBufferHandle *handle)
{
}

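/*
 * tflite_plugin_create_delegate() and tflite_plugin_destroy_delegate() are
 * the entry points that TfLite's external delegate loader resolves by name
 * from this shared object.
 */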
TfLiteDelegate *tflite_plugin_create_delegate(char **options_keys,
                                              char **options_values,
                                              size_t num_options,
                                              void (*report_error)(const char *));

void tflite_plugin_destroy_delegate(TfLiteDelegate *delegate);

__attribute__((visibility("default"))) TfLiteDelegate *
tflite_plugin_create_delegate(char **options_keys,
                              char **options_values,
                              size_t num_options,
                              void (*report_error)(const char *))
{
   struct teflon_delegate *delegate = (struct teflon_delegate *)calloc(1, sizeof(*delegate));
   struct pipe_screen *screen;
   struct pipe_loader_device **devs;

   delegate->base.flags = kTfLiteDelegateFlagsAllowDynamicTensors | kTfLiteDelegateFlagsRequirePropagatedShapes;
   delegate->base.Prepare = &PrepareDelegate;
   delegate->base.CopyFromBufferHandle = &CopyFromBufferHandle;
   delegate->base.FreeBufferHandle = &FreeBufferHandle;

   int n = pipe_loader_probe(NULL, 0, false);
   devs = (struct pipe_loader_device **)malloc(sizeof(*devs) * n);
   pipe_loader_probe(devs, n, false);

   /* Select the etnaviv device; release all the others */
   for (int i = 0; i < n; i++) {
      if (strstr("etnaviv", devs[i]->driver_name))
         delegate->dev = devs[i];
      else
         pipe_loader_release(&devs[i], 1);
   }
   free(devs);

   if (delegate->dev == NULL) {
      fprintf(stderr, "Couldn't open kernel device\n");
      return NULL;
   }

   teflon_debug("Teflon delegate: loaded %s driver\n", delegate->dev->driver_name);

   screen = pipe_loader_create_screen(delegate->dev, false);
   delegate->context = screen->context_create(screen, NULL, PIPE_CONTEXT_COMPUTE_ONLY);

   return &delegate->base;
}

__attribute__((visibility("default"))) void
tflite_plugin_destroy_delegate(TfLiteDelegate *tflite_delegate)
{
   struct teflon_delegate *delegate = (struct teflon_delegate *)tflite_delegate;
   struct pipe_screen *screen;

   if (tflite_delegate == NULL) {
      fprintf(stderr, "tflite_plugin_destroy_delegate: NULL delegate!\n");
      return;
   }

   screen = delegate->context->screen;
   delegate->context->destroy(delegate->context);
   screen->destroy(screen);
   pipe_loader_release(&delegate->dev, 1);
   free(delegate);
}