xref: /aosp_15_r20/external/ComputeLibrary/src/core/NEON/SVEAsymm.h (revision c217d954acce2dbc11938adb493fc0abd69584f3)
1 /*
2  * Copyright (c) 2020-2022 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 #ifndef ARM_COMPUTE_SVEASYMM_H
25 #define ARM_COMPUTE_SVEASYMM_H
26 
27 #if defined(ARM_COMPUTE_ENABLE_SVE2)
28 #include "src/core/NEON/SVEMath.h"
29 #include <arm_sve.h>
30 
31 namespace arm_compute
32 {
33 /** Perform a multiply-accumulate on all components of a QASYMM8 vector
34  *
35  * Computes vd*vs + vo. Only the prototype is given at this point of the header.
36  * NOTE(review): the body presumably lives in the SVEAsymm.inl included at the end of this header - confirm.
37  * @param[in] pg Predicate value.
38  * @param[in] vd Input vector value in QASYMM8 format
39  * @param[in] vs Vector multiplier in F32 format. The multiplier value must be duplicated across all four lanes.
40  * @param[in] vo Vector addend in F32 format. The addend value must be duplicated across all four lanes.
41  *
42  * @return A vector in QASYMM8 format, saturated to fit
43  */
44 svuint8_t svmla_qasymm8_z(svbool_t pg, svuint8_t vd, svfloat32_t vs, svfloat32_t vo);
45 
46 /** Perform a multiply-accumulate on all components of a QASYMM8_SIGNED vector
47  *
48  * Computes vd*vs + vo. Only the prototype is given at this point of the header.
49  * NOTE(review): the body presumably lives in the SVEAsymm.inl included at the end of this header - confirm.
50  * @param[in] pg Predicate value.
51  * @param[in] vd Input vector value in QASYMM8_SIGNED format
52  * @param[in] vs Vector multiplier in F32 format. The multiplier value must be duplicated across all four lanes.
53  * @param[in] vo Vector addend in F32 format. The addend value must be duplicated across all four lanes.
54  *
55  * @return A vector in QASYMM8_SIGNED format, saturated to fit
56  */
57 svint8_t svmla_qasymm8_signed_z(svbool_t pg, svint8_t vd, svfloat32_t vs, svfloat32_t vo);
58 
/** Dequantize an unsigned 8-bit sve vector following an asymmetric quantization scheme.
 *
 * Each input element is widened to 32 bits, has @p offset subtracted from it, is
 * converted to F32 and is then multiplied by @p scale.
 *
 * @param[in] pg     Predicate value, applied to the subtract, convert and multiply steps.
 * @param[in] qv     Input values to be dequantized.
 * @param[in] scale  Quantization scaling factor.
 * @param[in] offset Zero quantization offset.
 *
 * @return Dequantized values as a tuple of four F32 sve vectors
 */
inline svfloat32x4_t svdequantize_z(svbool_t pg, const svuint8_t &qv, float scale, int32_t offset)
{
    const svint32_t   offset_s32 = svdup_n_s32(offset);
    const svfloat32_t scale_f32  = svdup_n_f32(scale);

    // First widening step: 8-bit -> 16-bit, bottom and top halves.
    const svuint16_t bottom_u16 = svmovlb_u16(qv);
    const svuint16_t top_u16    = svmovlt_u16(qv);

    // Second widening step: 16-bit -> 32-bit, then view the values as signed so the offset can be subtracted.
    const svint32_t widened_0 = svreinterpret_s32_u32(svmovlb_u32(bottom_u16));
    const svint32_t widened_1 = svreinterpret_s32_u32(svmovlt_u32(bottom_u16));
    const svint32_t widened_2 = svreinterpret_s32_u32(svmovlb_u32(top_u16));
    const svint32_t widened_3 = svreinterpret_s32_u32(svmovlt_u32(top_u16));

    // (q - offset) * scale for each of the four widened vectors.
    const svfloat32_t result_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, widened_0, offset_s32)), scale_f32);
    const svfloat32_t result_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, widened_1, offset_s32)), scale_f32);
    const svfloat32_t result_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, widened_2, offset_s32)), scale_f32);
    const svfloat32_t result_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, widened_3, offset_s32)), scale_f32);

    return svcreate4_f32(result_0, result_1, result_2, result_3);
}
79 
/** Dequantize an unsigned 8-bit sve vector using the parameters stored in a quantization info.
 *
 * Forwards to the (scale, offset) overload with the scale and offset read from @p qi.
 *
 * @param[in] pg Predicate value.
 * @param[in] qv Input values to be dequantized.
 * @param[in] qi Quantization information to be used in the computation.
 *
 * @return Dequantized values in an sve vector
 */
inline svfloat32x4_t svdequantize_z(svbool_t pg, const svuint8_t &qv, const UniformQuantizationInfo &qi)
{
    const auto scale  = qi.scale;
    const auto offset = qi.offset;

    return svdequantize_z(pg, qv, scale, offset);
}
92 
/** Dequantize a signed 8-bit sve vector following an asymmetric quantization scheme.
 *
 * Each input element is sign-widened to 32 bits, has @p offset subtracted from it, is
 * converted to F32 and is then multiplied by @p scale.
 *
 * @param[in] pg     Predicate value, applied to the subtract, convert and multiply steps.
 * @param[in] qv     Input values to be dequantized.
 * @param[in] scale  Quantization scaling factor.
 * @param[in] offset Zero quantization offset.
 *
 * @return Dequantized values as a tuple of four F32 sve vectors
 */
inline svfloat32x4_t svdequantize_z(svbool_t pg, const svint8_t &qv, float scale, int32_t offset)
{
    const svint32_t   offset_s32 = svdup_n_s32(offset);
    const svfloat32_t scale_f32  = svdup_n_f32(scale);

    // First widening step: 8-bit -> 16-bit, bottom and top halves.
    const svint16_t bottom_s16 = svmovlb_s16(qv);
    const svint16_t top_s16    = svmovlt_s16(qv);

    // Second widening step: 16-bit -> 32-bit.
    const svint32_t widened_0 = svmovlb_s32(bottom_s16);
    const svint32_t widened_1 = svmovlt_s32(bottom_s16);
    const svint32_t widened_2 = svmovlb_s32(top_s16);
    const svint32_t widened_3 = svmovlt_s32(top_s16);

    // (q - offset) * scale for each of the four widened vectors.
    const svfloat32_t result_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, widened_0, offset_s32)), scale_f32);
    const svfloat32_t result_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, widened_1, offset_s32)), scale_f32);
    const svfloat32_t result_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, widened_2, offset_s32)), scale_f32);
    const svfloat32_t result_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, widened_3, offset_s32)), scale_f32);

    return svcreate4_f32(result_0, result_1, result_2, result_3);
}
114 
/** Dequantize a signed 8-bit sve vector using the parameters stored in a quantization info.
 *
 * Forwards to the (scale, offset) overload with the scale and offset read from @p qi.
 *
 * @param[in] pg Predicate value.
 * @param[in] qv Input values to be dequantized.
 * @param[in] qi Quantization information to be used in the computation.
 *
 * @return Dequantized values in an sve vector
 */
inline svfloat32x4_t svdequantize_z(svbool_t pg, const svint8_t &qv, const UniformQuantizationInfo &qi)
{
    const auto scale  = qi.scale;
    const auto offset = qi.offset;

    return svdequantize_z(pg, qv, scale, offset);
}
127 
/** Dequantize a signed 8-bit sve vector following a symmetric quantization scheme with vector scales.
 *
 * Each input element is sign-widened to 32 bits, converted to F32 and multiplied by the
 * matching lane of the corresponding vector of @p vscale. No offset is applied.
 *
 * @param[in] pg     Predicate value, applied to the convert and multiply steps.
 * @param[in] qv     Input values to be dequantized.
 * @param[in] vscale Vector containing quantization scaling factors; sub-vector i scales result vector i.
 *
 * @return Dequantized values as a tuple of four F32 sve vectors
 */
inline svfloat32x4_t svdequantize_z(svbool_t pg, const svint8_t &qv, const svfloat32x4_t vscale)
{
    // First widening step: 8-bit -> 16-bit, bottom and top halves.
    const svint16_t bottom_s16 = svmovlb_s16(qv);
    const svint16_t top_s16    = svmovlt_s16(qv);

    // Second widening step (16-bit -> 32-bit) followed by the conversion to F32.
    const svfloat32_t values_0 = svcvt_f32_s32_z(pg, svmovlb_s32(bottom_s16));
    const svfloat32_t values_1 = svcvt_f32_s32_z(pg, svmovlt_s32(bottom_s16));
    const svfloat32_t values_2 = svcvt_f32_s32_z(pg, svmovlb_s32(top_s16));
    const svfloat32_t values_3 = svcvt_f32_s32_z(pg, svmovlt_s32(top_s16));

    return svcreate4_f32(svmul_f32_z(pg, values_0, svget4_f32(vscale, 0)),
                         svmul_f32_z(pg, values_1, svget4_f32(vscale, 1)),
                         svmul_f32_z(pg, values_2, svget4_f32(vscale, 2)),
                         svmul_f32_z(pg, values_3, svget4_f32(vscale, 3)));
}
146 
/** Dequantize a signed 8-bit sve vector following a symmetric quantization scheme with a single scale.
 *
 * Each input element is sign-widened to 32 bits, converted to F32 and multiplied by
 * @p scale. No offset is applied.
 *
 * @param[in] pg    Predicate value, applied to the convert and multiply steps.
 * @param[in] qv    Input values to be dequantized.
 * @param[in] scale Quantization scaling factor.
 *
 * @return Dequantized values as a tuple of four F32 sve vectors
 */
inline svfloat32x4_t svdequantize_z(svbool_t pg, const svint8_t &qv, float scale)
{
    const svfloat32_t scale_f32 = svdup_n_f32(scale);

    // First widening step: 8-bit -> 16-bit, bottom and top halves.
    const svint16_t bottom_s16 = svmovlb_s16(qv);
    const svint16_t top_s16    = svmovlt_s16(qv);

    // Second widening step (16-bit -> 32-bit) followed by the conversion to F32.
    const svfloat32_t values_0 = svcvt_f32_s32_z(pg, svmovlb_s32(bottom_s16));
    const svfloat32_t values_1 = svcvt_f32_s32_z(pg, svmovlt_s32(bottom_s16));
    const svfloat32_t values_2 = svcvt_f32_s32_z(pg, svmovlb_s32(top_s16));
    const svfloat32_t values_3 = svcvt_f32_s32_z(pg, svmovlt_s32(top_s16));

    return svcreate4_f32(svmul_f32_z(pg, values_0, scale_f32),
                         svmul_f32_z(pg, values_1, scale_f32),
                         svmul_f32_z(pg, values_2, scale_f32),
                         svmul_f32_z(pg, values_3, scale_f32));
}
164 
/** Quantize four sve vectors of floating point values into one unsigned 8-bit sve vector.
 *
 * For each element computes offset + value * (1 / scale), converts the result to U32 and
 * narrows it to 8 bits in two svqxtnb/svqxtnt steps.
 *
 * NOTE(review): the float-to-integer step is a plain svcvt; confirm that its rounding
 * behaviour is the one expected by callers.
 *
 * @param[in] pg Predicate value, applied to the multiply-add and convert steps.
 * @param[in] qv Input values to be quantized.
 * @param[in] qi Quantization information to be used in the computation.
 *
 * @return An sve vector holding the quantized values
 */
inline svuint8_t svquantize_z(svbool_t pg, const svfloat32x4_t qv, const UniformQuantizationInfo &qi)
{
    const float       scale_value   = qi.scale;
    const int         offset_value  = qi.offset;
    const svfloat32_t offset_f32    = svdup_n_f32(offset_value);
    const svfloat32_t inv_scale_f32 = svdup_n_f32(1.f / scale_value);

    // offset + value / scale, still in F32.
    const svfloat32_t scaled_0 = svmla_f32_z(pg, offset_f32, svget4_f32(qv, 0), inv_scale_f32);
    const svfloat32_t scaled_1 = svmla_f32_z(pg, offset_f32, svget4_f32(qv, 1), inv_scale_f32);
    const svfloat32_t scaled_2 = svmla_f32_z(pg, offset_f32, svget4_f32(qv, 2), inv_scale_f32);
    const svfloat32_t scaled_3 = svmla_f32_z(pg, offset_f32, svget4_f32(qv, 3), inv_scale_f32);

    const svuint32_t quant_0 = svcvt_u32_f32_z(pg, scaled_0);
    const svuint32_t quant_1 = svcvt_u32_f32_z(pg, scaled_1);
    const svuint32_t quant_2 = svcvt_u32_f32_z(pg, scaled_2);
    const svuint32_t quant_3 = svcvt_u32_f32_z(pg, scaled_3);

    // 32-bit -> 16-bit: vectors 0/1 share one result, vectors 2/3 the other.
    svuint16_t narrowed_01 = svqxtnb_u32(quant_0);
    narrowed_01            = svqxtnt_u32(narrowed_01, quant_1);
    svuint16_t narrowed_23 = svqxtnb_u32(quant_2);
    narrowed_23            = svqxtnt_u32(narrowed_23, quant_3);

    // 16-bit -> 8-bit: merge both halves into the final vector.
    const svuint8_t packed_bottom = svqxtnb_u16(narrowed_01);
    return svqxtnt_u16(packed_bottom, narrowed_23);
}
190 
/** Quantize four sve vectors of floating point values into one signed 8-bit sve vector.
 *
 * For each element computes offset + value * (1 / scale), converts the result to S32 and
 * narrows it to 8 bits in two svqxtnb/svqxtnt steps.
 *
 * NOTE(review): the float-to-integer step is a plain svcvt; confirm that its rounding
 * behaviour is the one expected by callers.
 *
 * @param[in] pg Predicate value, applied to the multiply-add and convert steps.
 * @param[in] qv Input values to be quantized.
 * @param[in] qi Quantization information to be used in the computation.
 *
 * @return An sve vector holding the quantized values
 */
inline svint8_t svquantize_signed_z(svbool_t pg, const svfloat32x4_t qv, const UniformQuantizationInfo &qi)
{
    const float       scale_value   = qi.scale;
    const int         offset_value  = qi.offset;
    const svfloat32_t offset_f32    = svdup_n_f32(offset_value);
    const svfloat32_t inv_scale_f32 = svdup_n_f32(1.f / scale_value);

    // offset + value / scale, still in F32.
    const svfloat32_t scaled_0 = svmla_f32_z(pg, offset_f32, svget4_f32(qv, 0), inv_scale_f32);
    const svfloat32_t scaled_1 = svmla_f32_z(pg, offset_f32, svget4_f32(qv, 1), inv_scale_f32);
    const svfloat32_t scaled_2 = svmla_f32_z(pg, offset_f32, svget4_f32(qv, 2), inv_scale_f32);
    const svfloat32_t scaled_3 = svmla_f32_z(pg, offset_f32, svget4_f32(qv, 3), inv_scale_f32);

    const svint32_t quant_0 = svcvt_s32_f32_z(pg, scaled_0);
    const svint32_t quant_1 = svcvt_s32_f32_z(pg, scaled_1);
    const svint32_t quant_2 = svcvt_s32_f32_z(pg, scaled_2);
    const svint32_t quant_3 = svcvt_s32_f32_z(pg, scaled_3);

    // 32-bit -> 16-bit: vectors 0/1 share one result, vectors 2/3 the other.
    svint16_t narrowed_01 = svqxtnb_s32(quant_0);
    narrowed_01           = svqxtnt_s32(narrowed_01, quant_1);
    svint16_t narrowed_23 = svqxtnb_s32(quant_2);
    narrowed_23           = svqxtnt_s32(narrowed_23, quant_3);

    // 16-bit -> 8-bit: merge both halves into the final vector.
    const svint8_t packed_bottom = svqxtnb_s16(narrowed_01);
    return svqxtnt_s16(packed_bottom, narrowed_23);
}
215 
/** Quantize four sve vectors of floating point values to QASYMM16.
 *
 * For each element computes offset + value * (1 / scale), converts the result to U32 and
 * narrows it to 16 bits with svqxtnb/svqxtnt. Input vectors 0 and 1 are merged into the
 * first returned vector, input vectors 2 and 3 into the second one.
 *
 * NOTE(review): the float-to-integer step is a plain svcvt; confirm that its rounding
 * behaviour is the one expected by callers.
 *
 * @param[in] pg Predicate value, applied to the multiply-add and convert steps.
 * @param[in] qv Input values to be quantized.
 * @param[in] qi Quantization information to be used in the computation.
 *
 * @return A tuple of two 16-bit sve vectors holding the quantized values
 */
inline svuint16x2_t svquantize_qasymm16_z(svbool_t pg, const svfloat32x4_t qv, const UniformQuantizationInfo &qi)
{
    const float       scale_value   = qi.scale;
    const int         offset_value  = qi.offset;
    const svfloat32_t offset_f32    = svdup_n_f32(offset_value);
    const svfloat32_t inv_scale_f32 = svdup_n_f32(1.f / scale_value);

    // offset + value / scale, still in F32.
    const svfloat32_t scaled_0 = svmla_f32_z(pg, offset_f32, svget4_f32(qv, 0), inv_scale_f32);
    const svfloat32_t scaled_1 = svmla_f32_z(pg, offset_f32, svget4_f32(qv, 1), inv_scale_f32);
    const svfloat32_t scaled_2 = svmla_f32_z(pg, offset_f32, svget4_f32(qv, 2), inv_scale_f32);
    const svfloat32_t scaled_3 = svmla_f32_z(pg, offset_f32, svget4_f32(qv, 3), inv_scale_f32);

    const svuint32_t quant_0 = svcvt_u32_f32_z(pg, scaled_0);
    const svuint32_t quant_1 = svcvt_u32_f32_z(pg, scaled_1);
    const svuint32_t quant_2 = svcvt_u32_f32_z(pg, scaled_2);
    const svuint32_t quant_3 = svcvt_u32_f32_z(pg, scaled_3);

    // 32-bit -> 16-bit: vectors 0/1 share one result, vectors 2/3 the other.
    svuint16_t narrowed_01 = svqxtnb_u32(quant_0);
    narrowed_01            = svqxtnt_u32(narrowed_01, quant_1);
    svuint16_t narrowed_23 = svqxtnb_u32(quant_2);
    narrowed_23            = svqxtnt_u32(narrowed_23, quant_3);

    return svcreate2_u16(narrowed_01, narrowed_23);
}
241 } // namespace arm_compute
242 #include "src/core/NEON/SVEAsymm.inl"
243 #endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */
244 #endif // ARM_COMPUTE_SVEASYMM_H
245