1 /* 2 * Copyright (c) Meta Platforms, Inc. and affiliates. 3 * All rights reserved. 4 * 5 * This source code is licensed under the BSD-style license found in the 6 * LICENSE file in the root directory of this source tree. 7 */ 8 9 /******************************************************************************* 10 * Copyright (c) 2018-2023 Cadence Design Systems, Inc. 11 * 12 * Permission is hereby granted, free of charge, to any person obtaining 13 * a copy of this software and associated documentation files (the 14 * "Software"), to use this Software with Cadence processor cores only and 15 * not with any other processors and platforms, subject to 16 * the following conditions: 17 * 18 * The above copyright notice and this permission notice shall be included 19 * in all copies or substantial portions of the Software. 20 * 21 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 22 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 23 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 24 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 25 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 26 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 27 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 28 29 ******************************************************************************/ 30 31 #pragma once 32 33 /* 34 This file modifies the macros defined in 35 nnlib-hifi4/xa_nnlib/algo/common/include/xa_nnlib_common_macros.h 36 to adjust the accumulated value for the per-channel quantized scheme. 37 1. The ADJUST_ACC_BATCH_ASYM8b in nnlib multiplies the accumulated value in 38 matmul with the requantized scale. The requantized scale is an fp32 value 39 (in_scale*weight_scale/out_scale). This fp32 requantized_scale is represented as a 40 fixed-point computation with an int32 out_multiplier, and an int32 out_shift. 
41 The weight_scale can be an array for per-channel quantized weight. So we allow 42 the left_shift/right_shift and out_multiplier to be arrays in 43 ADJUST_ACC_BATCH_ASYM8b. 44 2. The new macros SETUP_SHIFT and UNROLL_ROW_SETUP_SHIFT are not present in 45 nnlib. We add these to allow finding the correct out_shift for each channel 46 (i.e., unrolled row of p_mat1). 47 */ 48 49 #include "xa_nnlib_matmul_unroll_macros.h" 50 51 #define UNROLL_ROW_SETUP_SHIFT(idx_row) \ 52 ae_int32x2 _ae_int32x2_ch_idx_##idx_row = 0; \ 53 AE_MOVT32X2( \ 54 _ae_int32x2_ch_idx_##idx_row, \ 55 AE_MOVDA32X2(m_itr + idx_row, m_itr + idx_row), \ 56 per_channel_quantized); \ 57 int _ch_idx_##idx_row = AE_MOVAD32_L(_ae_int32x2_ch_idx_##idx_row); \ 58 left_shift[idx_row] = AE_MAX32(0, out_shift[_ch_idx_##idx_row]); \ 59 right_shift[idx_row] = AE_MAX32(0, -out_shift[_ch_idx_##idx_row]); 60 61 #define ADJUST_ACC_BATCH_ASYM8b(idx_row, idx_vec) \ 62 /* Multiply accumulator with 'out_multiplier', same as Tensorflow */ \ 63 ae_int32x2 _ae_int32x2_acc_##idx_row##_##idx_vec; \ 64 MPY_BY_QUANT_MULT_X2_OUT32( \ 65 _ae_int32x2_acc_##idx_row##_##idx_vec, \ 66 AE_MOVINT32X2_FROMINT64(_ae_int64_acc_##idx_row##_##idx_vec), \ 67 out_multiplier[_ch_idx_##idx_row], \ 68 left_shift[idx_row], \ 69 right_shift[idx_row]); \ 70 /* Add output zero point */ \ 71 (_ae_int32x2_acc_##idx_row##_##idx_vec) = AE_ADD32S( \ 72 _ae_int32x2_acc_##idx_row##_##idx_vec, AE_MOVDA32(out_zero_bias)); 73 74 #if (ROW_UNROLL == 2) 75 76 #define SETUP_SHIFT \ 77 UNROLL_ROW_SETUP_SHIFT(0) \ 78 UNROLL_ROW_SETUP_SHIFT(1) 79 80 #elif (ROW_UNROLL == 4) 81 82 #define SETUP_SHIFT \ 83 UNROLL_ROW_SETUP_SHIFT(0) \ 84 UNROLL_ROW_SETUP_SHIFT(1) \ 85 UNROLL_ROW_SETUP_SHIFT(2) \ 86 UNROLL_ROW_SETUP_SHIFT(3) 87 88 #endif /* (ROW_UNROLL == 4)*/ 89