/*
 * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_nnfunctions.h
 * Description:  Public header file for CMSIS NN Library
 *
 * $Date:        13. July 2018
 * $Revision:    V.1.0.0
 *
 * Target Processor:  Cortex-M cores
 * -------------------------------------------------------------------- */

/**
   \mainpage CMSIS NN Software Library
 *
 * Introduction
 * ------------
 *
 * This user manual describes the CMSIS NN software library,
 * a collection of efficient neural network kernels developed to maximize the
 * performance and minimize the memory footprint of neural networks on Cortex-M processor cores.
 *
 * The library is divided into a number of functions each covering a specific category:
 * - Neural Network Convolution Functions
 * - Neural Network Activation Functions
 * - Fully-connected Layer Functions
 * - Neural Network Pooling Functions
 * - Softmax Functions
 * - Neural Network Support Functions
 *
 * The library has separate functions for operating on different weight and activation data
 * types including 8-bit integers (q7_t) and 16-bit integers (q15_t). The descriptions of the
 * kernels are included in the function descriptions.
The implementation details are also 51 * described in this paper [1]. 52 * 53 * Block Diagram 54 * -------- 55 * \image html CMSIS-NN-OVERVIEW.PNG 56 * 57 * Examples 58 * -------- 59 * 60 * The library ships with a number of examples which demonstrate how to use the library functions. 61 * 62 * Pre-processor Macros 63 * ------------ 64 * 65 * Each library project have differant pre-processor macros. 66 * 67 * - ARM_MATH_DSP: 68 * 69 * Define macro ARM_MATH_DSP, If the silicon supports DSP instructions. 70 * 71 * - ARM_MATH_BIG_ENDIAN: 72 * 73 * Define macro ARM_MATH_BIG_ENDIAN to build the library for big endian targets. By default library builds for little endian targets. 74 * 75 * - ARM_NN_TRUNCATE: 76 * 77 * Define macro ARM_NN_TRUNCATE to use floor instead of round-to-the-nearest-int for the computation. 78 * 79 * Copyright Notice 80 * ------------ 81 * 82 * Copyright (C) 2010-2018 Arm Limited. All rights reserved. 83 * 84 * [1] CMSIS-NN: Efficient Neural Network Kernels for Arm Cortex-M CPUs https://arxiv.org/abs/1801.06601 85 */ 86 87 /** 88 * @defgroup groupNN Neural Network Functions 89 * These functions perform basic operations for neural network layers. 90 */ 91 92 #ifndef _ARM_NNFUNCTIONS_H 93 #define _ARM_NNFUNCTIONS_H 94 95 #include "arm_nnsupportfunctions.h" 96 #include "arm_nn_tables.h" 97 98 #define USE_INTRINSIC 99 100 //#define ARM_NN_TRUNCATE /* This config the rounding model to floor or round to the nearest int */ 101 102 #ifdef __cplusplus 103 extern "C" 104 { 105 #endif 106 107 /** 108 * @defgroup NNConv Neural Network Convolution Functions 109 * 110 * Perform convolution layer 111 * 112 * The convolution is implemented in 2 steps: im2col and GEMM 113 * 114 * im2col is a process of converting each patch of image data into 115 * a column. After im2col, the convolution is computed as matrix-matrix 116 * multiplication. 117 * 118 * To reduce the memory footprint, the im2col is performed partially. 
119 * Each iteration, only a few column (i.e., patches) are generated and 120 * computed with GEMM kernels similar to CMSIS-DSP arm_mat_mult functions. 121 * 122 */ 123 124 /** 125 * @brief Basic Q7 convolution function 126 * @param[in] Im_in pointer to input tensor 127 * @param[in] dim_im_in input tensor dimention 128 * @param[in] ch_im_in number of input tensor channels 129 * @param[in] wt pointer to kernel weights 130 * @param[in] ch_im_out number of filters, i.e., output tensor channels 131 * @param[in] dim_kernel filter kernel size 132 * @param[in] padding padding sizes 133 * @param[in] stride convolution stride 134 * @param[in] bias pointer to bias 135 * @param[in] bias_shift amount of left-shift for bias 136 * @param[in] out_shift amount of right-shift for output 137 * @param[in,out] Im_out pointer to output tensor 138 * @param[in] dim_im_out output tensor dimension 139 * @param[in,out] bufferA pointer to buffer space for input 140 * @param[in,out] bufferB pointer to buffer space for output 141 * @return The function returns <code>ARM_MATH_SUCCESS</code> 142 * 143 */ 144 145 arm_status arm_convolve_HWC_q7_basic(const q7_t * Im_in, 146 const uint16_t dim_im_in, 147 const uint16_t ch_im_in, 148 const q7_t * wt, 149 const uint16_t ch_im_out, 150 const uint16_t dim_kernel, 151 const uint16_t padding, 152 const uint16_t stride, 153 const q7_t * bias, 154 const uint16_t bias_shift, 155 const uint16_t out_shift, 156 q7_t * Im_out, 157 const uint16_t dim_im_out, 158 q15_t * bufferA, 159 q7_t * bufferB); 160 161 /** 162 * @brief Basic Q7 convolution function (non-sqaure shape) 163 * @param[in] Im_in pointer to input tensor 164 * @param[in] dim_im_in_x input tensor dimention x 165 * @param[in] dim_im_in_y input tensor dimention y 166 * @param[in] ch_im_in number of input tensor channels 167 * @param[in] wt pointer to kernel weights 168 * @param[in] ch_im_out number of filters, i.e., output tensor channels 169 * @param[in] dim_kernel_x filter kernel size x 170 * 
@param[in] dim_kernel_y filter kernel size y 171 * @param[in] padding_x padding size x 172 * @param[in] padding_y padding size y 173 * @param[in] stride_x convolution stride x 174 * @param[in] stride_y convolution stride y 175 * @param[in] bias pointer to bias 176 * @param[in] bias_shift amount of left-shift for bias 177 * @param[in] out_shift amount of right-shift for output 178 * @param[in,out] Im_out pointer to output tensor 179 * @param[in] dim_im_out_x output tensor dimension x 180 * @param[in] dim_im_out_y output tensor dimension y 181 * @param[in,out] bufferA pointer to buffer space for input 182 * @param[in,out] bufferB pointer to buffer space for output 183 * @return The function returns <code>ARM_MATH_SUCCESS</code> 184 */ 185 186 arm_status arm_convolve_HWC_q7_basic_nonsquare(const q7_t * Im_in, 187 const uint16_t dim_im_in_x, 188 const uint16_t dim_im_in_y, 189 const uint16_t ch_im_in, 190 const q7_t * wt, 191 const uint16_t ch_im_out, 192 const uint16_t dim_kernel_x, 193 const uint16_t dim_kernel_y, 194 const uint16_t padding_x, 195 const uint16_t padding_y, 196 const uint16_t stride_x, 197 const uint16_t stride_y, 198 const q7_t * bias, 199 const uint16_t bias_shift, 200 const uint16_t out_shift, 201 q7_t * Im_out, 202 const uint16_t dim_im_out_x, 203 const uint16_t dim_im_out_y, 204 q15_t * bufferA, 205 q7_t * bufferB); 206 207 /** 208 * @brief Basic Q15 convolution function 209 * @param[in] Im_in pointer to input tensor 210 * @param[in] dim_im_in input tensor dimention 211 * @param[in] ch_im_in number of input tensor channels 212 * @param[in] wt pointer to kernel weights 213 * @param[in] ch_im_out number of filters, i.e., output tensor channels 214 * @param[in] dim_kernel filter kernel size 215 * @param[in] padding padding sizes 216 * @param[in] stride convolution stride 217 * @param[in] bias pointer to bias 218 * @param[in] bias_shift amount of left-shift for bias 219 * @param[in] out_shift amount of right-shift for output 220 * @param[in,out] 
Im_out pointer to output tensor 221 * @param[in] dim_im_out output tensor dimension 222 * @param[in,out] bufferA pointer to buffer space for input 223 * @param[in,out] bufferB pointer to buffer space for output 224 * @return The function returns <code>ARM_MATH_SUCCESS</code> 225 * 226 */ 227 228 arm_status arm_convolve_HWC_q15_basic(const q15_t * Im_in, 229 const uint16_t dim_im_in, 230 const uint16_t ch_im_in, 231 const q15_t * wt, 232 const uint16_t ch_im_out, 233 const uint16_t dim_kernel, 234 const uint16_t padding, 235 const uint16_t stride, 236 const q15_t * bias, 237 const uint16_t bias_shift, 238 const uint16_t out_shift, 239 q15_t * Im_out, 240 const uint16_t dim_im_out, 241 q15_t * bufferA, 242 q7_t * bufferB); 243 244 /** 245 * @brief Fast Q7 convolution function 246 * @param[in] Im_in pointer to input tensor 247 * @param[in] dim_im_in input tensor dimention 248 * @param[in] ch_im_in number of input tensor channels 249 * @param[in] wt pointer to kernel weights 250 * @param[in] ch_im_out number of filters, i.e., output tensor channels 251 * @param[in] dim_kernel filter kernel size 252 * @param[in] padding padding sizes 253 * @param[in] stride convolution stride 254 * @param[in] bias pointer to bias 255 * @param[in] bias_shift amount of left-shift for bias 256 * @param[in] out_shift amount of right-shift for output 257 * @param[in,out] Im_out pointer to output tensor 258 * @param[in] dim_im_out output tensor dimension 259 * @param[in,out] bufferA pointer to buffer space for input 260 * @param[in,out] bufferB pointer to buffer space for output 261 * @return The function returns either 262 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking. 
263 * 264 * This function is the version with full list of optimization tricks, but with 265 * some contraints: 266 * ch_im_in is multiple of 4 267 * ch_im_out is multiple of 2 268 */ 269 270 arm_status arm_convolve_HWC_q7_fast(const q7_t * Im_in, 271 const uint16_t dim_im_in, 272 const uint16_t ch_im_in, 273 const q7_t * wt, 274 const uint16_t ch_im_out, 275 const uint16_t dim_kernel, 276 const uint16_t padding, 277 const uint16_t stride, 278 const q7_t * bias, 279 const uint16_t bias_shift, 280 const uint16_t out_shift, 281 q7_t * Im_out, 282 const uint16_t dim_im_out, 283 q15_t * bufferA, 284 q7_t * bufferB); 285 286 /** 287 * @brief Fast Q7 convolution function (non-sqaure shape) 288 * @param[in] Im_in pointer to input tensor 289 * @param[in] dim_im_in_x input tensor dimention x 290 * @param[in] dim_im_in_y input tensor dimention y 291 * @param[in] ch_im_in number of input tensor channels 292 * @param[in] wt pointer to kernel weights 293 * @param[in] ch_im_out number of filters, i.e., output tensor channels 294 * @param[in] dim_kernel_x filter kernel size x 295 * @param[in] dim_kernel_y filter kernel size y 296 * @param[in] padding_x padding size x 297 * @param[in] padding_y padding size y 298 * @param[in] stride_x convolution stride x 299 * @param[in] stride_y convolution stride y 300 * @param[in] bias pointer to bias 301 * @param[in] bias_shift amount of left-shift for bias 302 * @param[in] out_shift amount of right-shift for output 303 * @param[in,out] Im_out pointer to output tensor 304 * @param[in] dim_im_out_x output tensor dimension x 305 * @param[in] dim_im_out_y output tensor dimension y 306 * @param[in,out] bufferA pointer to buffer space for input 307 * @param[in,out] bufferB pointer to buffer space for output 308 * @return The function returns either 309 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking. 
310 * 311 * This function is the version with full list of optimization tricks, but with 312 * some contraints: 313 * ch_im_in is multiple of 4 314 * ch_im_out is multiple of 2 315 */ 316 317 arm_status arm_convolve_HWC_q7_fast_nonsquare(const q7_t * Im_in, 318 const uint16_t dim_im_in_x, 319 const uint16_t dim_im_in_y, 320 const uint16_t ch_im_in, 321 const q7_t * wt, 322 const uint16_t ch_im_out, 323 const uint16_t dim_kernel_x, 324 const uint16_t dim_kernel_y, 325 const uint16_t padding_x, 326 const uint16_t padding_y, 327 const uint16_t stride_x, 328 const uint16_t stride_y, 329 const q7_t * bias, 330 const uint16_t bias_shift, 331 const uint16_t out_shift, 332 q7_t * Im_out, 333 const uint16_t dim_im_out_x, 334 const uint16_t dim_im_out_y, 335 q15_t * bufferA, 336 q7_t * bufferB); 337 338 /** 339 * @brief Fast Q7 version of 1x1 convolution (non-sqaure shape) 340 * @param[in] Im_in pointer to input tensor 341 * @param[in] dim_im_in_x input tensor dimention x 342 * @param[in] dim_im_in_y input tensor dimention y 343 * @param[in] ch_im_in number of input tensor channels 344 * @param[in] wt pointer to kernel weights 345 * @param[in] ch_im_out number of filters, i.e., output tensor channels 346 * @param[in] dim_kernel_x filter kernel size x 347 * @param[in] dim_kernel_y filter kernel size y 348 * @param[in] padding_x padding size x 349 * @param[in] padding_y padding size y 350 * @param[in] stride_x convolution stride x 351 * @param[in] stride_y convolution stride y 352 * @param[in] bias pointer to bias 353 * @param[in] bias_shift amount of left-shift for bias 354 * @param[in] out_shift amount of right-shift for output 355 * @param[in,out] Im_out pointer to output tensor 356 * @param[in] dim_im_out_x output tensor dimension x 357 * @param[in] dim_im_out_y output tensor dimension y 358 * @param[in,out] bufferA pointer to buffer space for input 359 * @param[in,out] bufferB pointer to buffer space for output 360 * @return The function returns either 361 * 
<code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
 *
 * This function implements convolution with 1x1 kernel size (i.e., dim_kernel_x=1
 * and dim_kernel_y=1). It can be used for the
 * second half of MobileNets after depthwise separable convolution.
 *
 * This function is the version with the full list of optimization tricks, but with
 * some constraints:
 *   ch_im_in is multiple of 4
 *   ch_im_out is multiple of 2
 */
arm_status arm_convolve_1x1_HWC_q7_fast_nonsquare(
    const q7_t *Im_in,
    const uint16_t dim_im_in_x,
    const uint16_t dim_im_in_y,
    const uint16_t ch_im_in,
    const q7_t *wt,
    const uint16_t ch_im_out,
    const uint16_t dim_kernel_x,
    const uint16_t dim_kernel_y,
    const uint16_t padding_x,
    const uint16_t padding_y,
    const uint16_t stride_x,
    const uint16_t stride_y,
    const q7_t *bias,
    const uint16_t bias_shift,
    const uint16_t out_shift,
    q7_t *Im_out,
    const uint16_t dim_im_out_x,
    const uint16_t dim_im_out_y,
    q15_t *bufferA,
    q7_t *bufferB);

/**
 * @brief Q7 version of convolution for RGB image
 * @param[in]     Im_in       pointer to input tensor
 * @param[in]     dim_im_in   input tensor dimension
 * @param[in]     ch_im_in    number of input tensor channels
 * @param[in]     wt          pointer to kernel weights
 * @param[in]     ch_im_out   number of filters, i.e., output tensor channels
 * @param[in]     dim_kernel  filter kernel size
 * @param[in]     padding     padding sizes
 * @param[in]     stride      convolution stride
 * @param[in]     bias        pointer to bias
 * @param[in]     bias_shift  amount of left-shift for bias
 * @param[in]     out_shift   amount of right-shift for output
 * @param[in,out] Im_out      pointer to output tensor
 * @param[in]     dim_im_out  output tensor dimension
 * @param[in,out] bufferA     pointer to buffer space for input
 * @param[in,out] bufferB     pointer to buffer space for output
 * @return        The function
returns either 411 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking. 412 * 413 * This kernel is written exclusively for convolution with ch_im_in 414 * equals 3. This applies on the first layer of CNNs which has input 415 * image with RGB format. 416 */ 417 418 arm_status arm_convolve_HWC_q7_RGB(const q7_t * Im_in, 419 const uint16_t dim_im_in, 420 const uint16_t ch_im_in, 421 const q7_t * wt, 422 const uint16_t ch_im_out, 423 const uint16_t dim_kernel, 424 const uint16_t padding, 425 const uint16_t stride, 426 const q7_t * bias, 427 const uint16_t bias_shift, 428 const uint16_t out_shift, 429 q7_t * Im_out, 430 const uint16_t dim_im_out, 431 q15_t * bufferA, 432 q7_t * bufferB); 433 434 /** 435 * @brief Fast Q15 convolution function 436 * @param[in] Im_in pointer to input tensor 437 * @param[in] dim_im_in input tensor dimention 438 * @param[in] ch_im_in number of input tensor channels 439 * @param[in] wt pointer to kernel weights 440 * @param[in] ch_im_out number of filters, i.e., output tensor channels 441 * @param[in] dim_kernel filter kernel size 442 * @param[in] padding padding sizes 443 * @param[in] stride convolution stride 444 * @param[in] bias pointer to bias 445 * @param[in] bias_shift amount of left-shift for bias 446 * @param[in] out_shift amount of right-shift for output 447 * @param[in,out] Im_out pointer to output tensor 448 * @param[in] dim_im_out output tensor dimension 449 * @param[in,out] bufferA pointer to buffer space for input 450 * @param[in,out] bufferB pointer to buffer space for output 451 * @return The function returns either 452 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking. 
453 * 454 * This function is the version with full list of optimization tricks, but with 455 * some contraints: 456 * ch_im_in is multiple of 2 457 * ch_im_out is multiple of 2 458 */ 459 460 arm_status arm_convolve_HWC_q15_fast(const q15_t * Im_in, 461 const uint16_t dim_im_in, 462 const uint16_t ch_im_in, 463 const q15_t * wt, 464 const uint16_t ch_im_out, 465 const uint16_t dim_kernel, 466 const uint16_t padding, 467 const uint16_t stride, 468 const q15_t * bias, 469 const uint16_t bias_shift, 470 const uint16_t out_shift, 471 q15_t * Im_out, 472 const uint16_t dim_im_out, 473 q15_t * bufferA, 474 q7_t * bufferB); 475 476 /** 477 * @brief Fast Q15 convolution function (non-sqaure shape) 478 * @param[in] Im_in pointer to input tensor 479 * @param[in] dim_im_in_x input tensor dimention x 480 * @param[in] dim_im_in_y input tensor dimention y 481 * @param[in] ch_im_in number of input tensor channels 482 * @param[in] wt pointer to kernel weights 483 * @param[in] ch_im_out number of filters, i.e., output tensor channels 484 * @param[in] dim_kernel_x filter kernel size x 485 * @param[in] dim_kernel_y filter kernel size y 486 * @param[in] padding_x padding size x 487 * @param[in] padding_y padding size y 488 * @param[in] stride_x convolution stride x 489 * @param[in] stride_y convolution stride y 490 * @param[in] bias pointer to bias 491 * @param[in] bias_shift amount of left-shift for bias 492 * @param[in] out_shift amount of right-shift for output 493 * @param[in,out] Im_out pointer to output tensor 494 * @param[in] dim_im_out_x output tensor dimension x 495 * @param[in] dim_im_out_y output tensor dimension y 496 * @param[in,out] bufferA pointer to buffer space for input 497 * @param[in,out] bufferB pointer to buffer space for output 498 * @return The function returns either 499 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking. 
500 * 501 * @details 502 * 503 * <b>Buffer size:</b> 504 * 505 * bufferA size: 2*ch_im_in*dim_kernel*dim_kernel 506 * 507 * bufferB size: 0 508 * 509 * <b>Input dimension constraints:</b> 510 * 511 * ch_im_in is multiple of 2 512 * 513 * ch_im_out is multipe of 2 514 * 515 */ 516 517 arm_status 518 arm_convolve_HWC_q15_fast_nonsquare(const q15_t * Im_in, 519 const uint16_t dim_im_in_x, 520 const uint16_t dim_im_in_y, 521 const uint16_t ch_im_in, 522 const q15_t * wt, 523 const uint16_t ch_im_out, 524 const uint16_t dim_kernel_x, 525 const uint16_t dim_kernel_y, 526 const uint16_t padding_x, 527 const uint16_t padding_y, 528 const uint16_t stride_x, 529 const uint16_t stride_y, 530 const q15_t * bias, 531 const uint16_t bias_shift, 532 const uint16_t out_shift, 533 q15_t * Im_out, 534 const uint16_t dim_im_out_x, 535 const uint16_t dim_im_out_y, 536 q15_t * bufferA, 537 q7_t * bufferB); 538 539 /** 540 * @brief Q7 depthwise separable convolution function 541 * @param[in] Im_in pointer to input tensor 542 * @param[in] dim_im_in input tensor dimention 543 * @param[in] ch_im_in number of input tensor channels 544 * @param[in] wt pointer to kernel weights 545 * @param[in] ch_im_out number of filters, i.e., output tensor channels 546 * @param[in] dim_kernel filter kernel size 547 * @param[in] padding padding sizes 548 * @param[in] stride convolution stride 549 * @param[in] bias pointer to bias 550 * @param[in] bias_shift amount of left-shift for bias 551 * @param[in] out_shift amount of right-shift for output 552 * @param[in,out] Im_out pointer to output tensor 553 * @param[in] dim_im_out output tensor dimension 554 * @param[in,out] bufferA pointer to buffer space for input 555 * @param[in,out] bufferB pointer to buffer space for output 556 * @return The function returns either 557 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking. 
558 * 559 * This function is the version with full list of optimization tricks, but with 560 * some contraints: 561 * ch_im_in is multiple of 2 562 * ch_im_out is multiple of 2 563 */ 564 565 arm_status arm_depthwise_separable_conv_HWC_q7(const q7_t * Im_in, 566 const uint16_t dim_im_in, 567 const uint16_t ch_im_in, 568 const q7_t * wt, 569 const uint16_t ch_im_out, 570 const uint16_t dim_kernel, 571 const uint16_t padding, 572 const uint16_t stride, 573 const q7_t * bias, 574 const uint16_t bias_shift, 575 const uint16_t out_shift, 576 q7_t * Im_out, 577 const uint16_t dim_im_out, 578 q15_t * bufferA, 579 q7_t * bufferB); 580 581 /** 582 * @brief Q7 depthwise separable convolution function (non-square shape) 583 * @param[in] Im_in pointer to input tensor 584 * @param[in] dim_im_in_x input tensor dimention x 585 * @param[in] dim_im_in_y input tensor dimention y 586 * @param[in] ch_im_in number of input tensor channels 587 * @param[in] wt pointer to kernel weights 588 * @param[in] ch_im_out number of filters, i.e., output tensor channels 589 * @param[in] dim_kernel_x filter kernel size x 590 * @param[in] dim_kernel_y filter kernel size y 591 * @param[in] padding_x padding sizes x 592 * @param[in] padding_y padding sizes y 593 * @param[in] stride_x convolution stride x 594 * @param[in] stride_y convolution stride y 595 * @param[in] bias pointer to bias 596 * @param[in] bias_shift amount of left-shift for bias 597 * @param[in] out_shift amount of right-shift for output 598 * @param[in,out] Im_out pointer to output tensor 599 * @param[in] dim_im_out_x output tensor dimension x 600 * @param[in] dim_im_out_y output tensor dimension y 601 * @param[in,out] bufferA pointer to buffer space for input 602 * @param[in,out] bufferB pointer to buffer space for output 603 * @return The function returns either 604 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking. 
605 * 606 * This function is the version with full list of optimization tricks, but with 607 * some contraints: 608 * ch_im_in is multiple of 2 609 * ch_im_out is multiple of 2 610 */ 611 arm_status arm_depthwise_separable_conv_HWC_q7_nonsquare(const q7_t * Im_in, 612 const uint16_t dim_im_in_x, 613 const uint16_t dim_im_in_y, 614 const uint16_t ch_im_in, 615 const q7_t * wt, 616 const uint16_t ch_im_out, 617 const uint16_t dim_kernel_x, 618 const uint16_t dim_kernel_y, 619 const uint16_t padding_x, 620 const uint16_t padding_y, 621 const uint16_t stride_x, 622 const uint16_t stride_y, 623 const q7_t * bias, 624 const uint16_t bias_shift, 625 const uint16_t out_shift, 626 q7_t * Im_out, 627 const uint16_t dim_im_out_x, 628 const uint16_t dim_im_out_y, 629 q15_t * bufferA, 630 q7_t * bufferB); 631 632 633 /** 634 * @defgroup FC Fully-connected Layer Functions 635 * 636 * Perform fully-connected layer 637 * 638 * Fully-connected layer is basically a matrix-vector multiplication 639 * with bias. The matrix is the weights and the input/output vectors 640 * are the activation values. Supported {weight, activation} precisions 641 * include {8-bit, 8-bit}, {16-bit, 16-bit}, and {8-bit, 16-bit}. 642 * 643 * Here we have two types of kernel functions. The basic function 644 * implements the function using regular GEMV approach. The opt functions 645 * operates with weights in interleaved formats. 
646 * 647 */ 648 649 /** 650 * @brief Q7 basic fully-connected layer function 651 * @param[in] pV pointer to input vector 652 * @param[in] pM pointer to matrix weights 653 * @param[in] dim_vec length of the vector 654 * @param[in] num_of_rows number of rows in weight matrix 655 * @param[in] bias_shift amount of left-shift for bias 656 * @param[in] out_shift amount of right-shift for output 657 * @param[in] bias pointer to bias 658 * @param[in,out] pOut pointer to output vector 659 * @param[in,out] vec_buffer pointer to buffer space for input 660 * @return The function returns <code>ARM_MATH_SUCCESS</code> 661 * 662 */ 663 664 arm_status arm_fully_connected_q7(const q7_t * pV, 665 const q7_t * pM, 666 const uint16_t dim_vec, 667 const uint16_t num_of_rows, 668 const uint16_t bias_shift, 669 const uint16_t out_shift, 670 const q7_t * bias, 671 q7_t * pOut, 672 q15_t * vec_buffer); 673 674 /** 675 * @brief Q7 opt fully-connected layer function 676 * @param[in] pV pointer to input vector 677 * @param[in] pM pointer to matrix weights 678 * @param[in] dim_vec length of the vector 679 * @param[in] num_of_rows number of rows in weight matrix 680 * @param[in] bias_shift amount of left-shift for bias 681 * @param[in] out_shift amount of right-shift for output 682 * @param[in] bias pointer to bias 683 * @param[in,out] pOut pointer to output vector 684 * @param[in,out] vec_buffer pointer to buffer space for input 685 * @return The function returns <code>ARM_MATH_SUCCESS</code> 686 * 687 */ 688 689 arm_status arm_fully_connected_q7_opt(const q7_t * pV, 690 const q7_t * pM, 691 const uint16_t dim_vec, 692 const uint16_t num_of_rows, 693 const uint16_t bias_shift, 694 const uint16_t out_shift, 695 const q7_t * bias, 696 q7_t * pOut, 697 q15_t * vec_buffer); 698 699 /** 700 * @brief Q15 basic fully-connected layer function 701 * @param[in] pV pointer to input vector 702 * @param[in] pM pointer to matrix weights 703 * @param[in] dim_vec length of the vector 704 * @param[in] 
num_of_rows number of rows in weight matrix 705 * @param[in] bias_shift amount of left-shift for bias 706 * @param[in] out_shift amount of right-shift for output 707 * @param[in] bias pointer to bias 708 * @param[in,out] pOut pointer to output vector 709 * @param[in,out] vec_buffer pointer to buffer space for input 710 * @return The function returns <code>ARM_MATH_SUCCESS</code> 711 * 712 */ 713 714 arm_status arm_fully_connected_q15(const q15_t * pV, 715 const q15_t * pM, 716 const uint16_t dim_vec, 717 const uint16_t num_of_rows, 718 const uint16_t bias_shift, 719 const uint16_t out_shift, 720 const q15_t * bias, 721 q15_t * pOut, 722 q15_t * vec_buffer); 723 724 /** 725 * @brief Q15 opt fully-connected layer function 726 * @param[in] pV pointer to input vector 727 * @param[in] pM pointer to matrix weights 728 * @param[in] dim_vec length of the vector 729 * @param[in] num_of_rows number of rows in weight matrix 730 * @param[in] bias_shift amount of left-shift for bias 731 * @param[in] out_shift amount of right-shift for output 732 * @param[in] bias pointer to bias 733 * @param[in,out] pOut pointer to output vector 734 * @param[in,out] vec_buffer pointer to buffer space for input 735 * @return The function returns <code>ARM_MATH_SUCCESS</code> 736 * 737 */ 738 739 arm_status arm_fully_connected_q15_opt(const q15_t * pV, 740 const q15_t * pM, 741 const uint16_t dim_vec, 742 const uint16_t num_of_rows, 743 const uint16_t bias_shift, 744 const uint16_t out_shift, 745 const q15_t * bias, 746 q15_t * pOut, 747 q15_t * vec_buffer); 748 749 /** 750 * @brief Mixed Q15-Q7 fully-connected layer function 751 * @param[in] pV pointer to input vector 752 * @param[in] pM pointer to matrix weights 753 * @param[in] dim_vec length of the vector 754 * @param[in] num_of_rows number of rows in weight matrix 755 * @param[in] bias_shift amount of left-shift for bias 756 * @param[in] out_shift amount of right-shift for output 757 * @param[in] bias pointer to bias 758 * @param[in,out] 
pOut pointer to output vector 759 * @param[in,out] vec_buffer pointer to buffer space for input 760 * @return The function returns <code>ARM_MATH_SUCCESS</code> 761 * 762 */ 763 764 arm_status arm_fully_connected_mat_q7_vec_q15(const q15_t * pV, 765 const q7_t * pM, 766 const uint16_t dim_vec, 767 const uint16_t num_of_rows, 768 const uint16_t bias_shift, 769 const uint16_t out_shift, 770 const q7_t * bias, 771 q15_t * pOut, 772 q15_t * vec_buffer); 773 774 /** 775 * @brief Mixed Q15-Q7 opt fully-connected layer function 776 * @param[in] pV pointer to input vector 777 * @param[in] pM pointer to matrix weights 778 * @param[in] dim_vec length of the vector 779 * @param[in] num_of_rows number of rows in weight matrix 780 * @param[in] bias_shift amount of left-shift for bias 781 * @param[in] out_shift amount of right-shift for output 782 * @param[in] bias pointer to bias 783 * @param[in,out] pOut pointer to output vector 784 * @param[in,out] vec_buffer pointer to buffer space for input 785 * @return The function returns <code>ARM_MATH_SUCCESS</code> 786 * 787 */ 788 789 arm_status arm_fully_connected_mat_q7_vec_q15_opt(const q15_t * pV, 790 const q7_t * pM, 791 const uint16_t dim_vec, 792 const uint16_t num_of_rows, 793 const uint16_t bias_shift, 794 const uint16_t out_shift, 795 const q7_t * bias, 796 q15_t * pOut, 797 q15_t * vec_buffer); 798 799 /** 800 * @brief Matrix-Multiplication Kernels for Convolution 801 * 802 * These functions are used within convolution layer functions for 803 * matrix multiplication. 804 * 805 * The implementation is similar to CMSIS-DSP arm_mat_mult functions 806 * with one Q7 and one Q15 operands. The Q15 operand is the im2col 807 * output which is always with 2 columns. 
808 * 809 */ 810 811 /** 812 * @brief Matrix-multiplication function for convolution 813 * @param[in] pA pointer to operand A 814 * @param[in] pInBuffer pointer to operand B, always consists of 2 vectors 815 * @param[in] ch_im_out numRow of A 816 * @param[in] numCol_A numCol of A 817 * @param[in] bias_shift amount of left-shift for bias 818 * @param[in] out_shift amount of right-shift for output 819 * @param[in] bias the bias 820 * @param[in,out] pOut pointer to output 821 * @return The function returns the incremented output pointer 822 */ 823 824 q7_t *arm_nn_mat_mult_kernel_q7_q15(const q7_t * pA, 825 const q15_t * pInBuffer, 826 const uint16_t ch_im_out, 827 const uint16_t numCol_A, 828 const uint16_t bias_shift, 829 const uint16_t out_shift, 830 const q7_t * bias, 831 q7_t * pOut); 832 833 /** 834 * @brief Matrix-multiplication function for convolution with reordered columns 835 * @param[in] pA pointer to operand A 836 * @param[in] pInBuffer pointer to operand B, always consists of 2 vectors 837 * @param[in] ch_im_out numRow of A 838 * @param[in] numCol_A numCol of A 839 * @param[in] bias_shift amount of left-shift for bias 840 * @param[in] out_shift amount of right-shift for output 841 * @param[in] bias the bias 842 * @param[in,out] pOut pointer to output 843 * @return The function returns the incremented output pointer 844 */ 845 846 q7_t *arm_nn_mat_mult_kernel_q7_q15_reordered(const q7_t * pA, 847 const q15_t * pInBuffer, 848 const uint16_t ch_im_out, 849 const uint16_t numCol_A, 850 const uint16_t bias_shift, 851 const uint16_t out_shift, 852 const q7_t * bias, 853 q7_t * pOut); 854 855 #ifdef __cplusplus 856 } 857 #endif 858 859 /* 860 * Other functions 861 * These layers are typically not timing critical 862 * Basic implementation is supported here 863 */ 864 865 #ifdef __cplusplus 866 extern "C" 867 { 868 #endif 869 870 /** 871 * @defgroup Acti Neural Network Activation Functions 872 * 873 * Perform activation layers, including ReLU (Rectified Linear
Unit), 874 * sigmoid and tanh 875 * 876 */ 877 878 /** 879 * @brief Q7 RELU function 880 * @param[in,out] data pointer to input 881 * @param[in] size number of elements 882 * @return none. 883 */ 884 885 void arm_relu_q7(q7_t * data, uint16_t size); 886 887 /** 888 * @brief Q15 RELU function 889 * @param[in,out] data pointer to input 890 * @param[in] size number of elements 891 * @return none. 892 */ 893 894 void arm_relu_q15(q15_t * data, uint16_t size); 895 896 /** 897 * @brief Q7 neural network activation function using direct table look-up 898 * @param[in,out] data pointer to input 899 * @param[in] size number of elements 900 * @param[in] int_width bit-width of the integer part, assumed to be smaller than 3 901 * @param[in] type type of activation functions 902 * @return none. 903 */ 904 905 void arm_nn_activations_direct_q7(q7_t * data, uint16_t size, uint16_t int_width, 906 arm_nn_activation_type type); 907 908 /** 909 * @brief Q15 neural network activation function using direct table look-up 910 * @param[in,out] data pointer to input 911 * @param[in] size number of elements 912 * @param[in] int_width bit-width of the integer part, assumed to be smaller than 3 913 * @param[in] type type of activation functions 914 * @return none.
915 */ 916 917 void arm_nn_activations_direct_q15(q15_t * data, uint16_t size, uint16_t int_width, 918 arm_nn_activation_type type); 919 920 /** 921 * @defgroup Pooling Neural Network Pooling Functions 922 * 923 * Perform pooling functions, including max pooling and average pooling 924 * 925 */ 926 927 /** 928 * @brief Q7 max pooling function 929 * @param[in] Im_in pointer to input tensor (declared non-const; NOTE(review): the implementation may overwrite the input - confirm) 930 * @param[in] dim_im_in input tensor dimension 931 * @param[in] ch_im_in number of input tensor channels 932 * @param[in] dim_kernel filter kernel size 933 * @param[in] padding padding sizes 934 * @param[in] stride convolution stride 935 * @param[in] dim_im_out output tensor dimension 936 * @param[in,out] bufferA pointer to buffer space for input 937 * @param[in,out] Im_out pointer to output tensor 938 * @return none. 939 * 940 */ 941 942 void arm_maxpool_q7_HWC(q7_t * Im_in, 943 const uint16_t dim_im_in, 944 const uint16_t ch_im_in, 945 const uint16_t dim_kernel, 946 const uint16_t padding, 947 const uint16_t stride, 948 const uint16_t dim_im_out, 949 q7_t * bufferA, 950 q7_t * Im_out); 951 952 /** 953 * @brief Q7 average pooling function 954 * @param[in] Im_in pointer to input tensor (declared non-const; NOTE(review): the implementation may overwrite the input - confirm) 955 * @param[in] dim_im_in input tensor dimension 956 * @param[in] ch_im_in number of input tensor channels 957 * @param[in] dim_kernel filter kernel size 958 * @param[in] padding padding sizes 959 * @param[in] stride convolution stride 960 * @param[in] dim_im_out output tensor dimension 961 * @param[in,out] bufferA pointer to buffer space for input 962 * @param[in,out] Im_out pointer to output tensor 963 * @return none.
964 * 965 */ 966 967 void arm_avepool_q7_HWC(q7_t * Im_in, 968 const uint16_t dim_im_in, 969 const uint16_t ch_im_in, 970 const uint16_t dim_kernel, 971 const uint16_t padding, 972 const uint16_t stride, 973 const uint16_t dim_im_out, 974 q7_t * bufferA, 975 q7_t * Im_out); 976 977 /** 978 * @defgroup Softmax Softmax Functions 979 * 980 * EXP(2) based softmax function 981 * 982 */ 983 984 /** 985 * @brief Q7 softmax function 986 * @param[in] vec_in pointer to input vector 987 * @param[in] dim_vec input vector dimension 988 * @param[out] p_out pointer to output vector 989 * @return none. 990 * 991 */ 992 993 void arm_softmax_q7(const q7_t * vec_in, const uint16_t dim_vec, q7_t * p_out); 994 995 /** 996 * @brief Q15 softmax function 997 * @param[in] vec_in pointer to input vector 998 * @param[in] dim_vec input vector dimension 999 * @param[out] p_out pointer to output vector 1000 * @return none. 1001 * 1002 */ 1003 1004 void arm_softmax_q15(const q15_t * vec_in, const uint16_t dim_vec, q15_t * p_out); 1005 1006 #ifdef __cplusplus 1007 } 1008 #endif 1009 1010 #endif 1011