/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 * Copyright 2024 Arm Limited and/or its affiliates.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <numeric>
#include <ostream>
#include <type_traits>
/**
 * @file
 * This header defines common, low-level operations that can often be
 * vectorized/accelerated on hardware targets.
 *
 * Although they do not yet have hardware-optimized implementations, operators
 * that use this API can benefit from optimizations in the future.
 */

namespace torch {
namespace executor {

/// Returns the minimum element of the array at `x`, which must have `size`
/// elements.
inline float vec_minf(const float* x, size_t size) {
  return *std::min_element(x, x + size);
}

/// Returns the maximum element of the array at `x`, which must have `size`
/// elements.
inline float vec_maxf(const float* x, size_t size) {
  return *std::max_element(x, x + size);
}
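
// Usage sketch (illustrative only; the buffer values are arbitrary):
//   float data[4] = {3.f, -1.f, 4.f, 1.5f};
//   float lo = vec_minf(data, 4); // -1.f
//   float hi = vec_maxf(data, 4); // 4.f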

/// Adds each element of `x` and `y` into the corresponding element of `z`. All
/// arrays must have `size` elements.
inline void vec_addf(
    float* __restrict__ z,
    const float* __restrict__ x,
    const float* __restrict__ y,
    size_t size) {
  for (size_t i = 0; i < size; ++i) {
    z[i] = x[i] + y[i];
  }
}
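
// Usage sketch (illustrative only; the buffer values are arbitrary):
//   float a[3] = {1.f, 2.f, 3.f};
//   float b[3] = {10.f, 20.f, 30.f};
//   float c[3];
//   vec_addf(c, a, b, 3); // c == {11.f, 22.f, 33.f}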

/// Multiplies every element of `x` by `scale`, and writes the result into the
/// corresponding element of `y`. `x` and `y` must have `size` elements.
inline void vec_scalef(
    float* __restrict__ y,
    const float* __restrict__ x,
    float scale,
    size_t size) {
  for (size_t i = 0; i < size; ++i) {
    y[i] = x[i] * scale;
  }
}
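
// Usage sketch (illustrative only). Because both pointers are declared
// __restrict__, the input and output buffers should not alias:
//   const float x[3] = {1.f, 2.f, 3.f};
//   float y[3];
//   vec_scalef(y, x, 0.5f, 3); // y == {0.5f, 1.f, 1.5f}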

/// x: m * n, y: n * p, z: m * p.
/// z[i][j] = sum(x[i][k] * y[k][j])
template <typename T, typename U = T>
inline void vec_matmul(
    T* __restrict__ z,
    const U* __restrict__ x,
    const U* __restrict__ y,
    int64_t m,
    int64_t n,
    int64_t p) {
  for (size_t i = 0; i < m; ++i) {
    for (size_t j = 0; j < p; ++j) {
      T sum = 0;
      for (size_t k = 0; k < n; ++k) {
        sum += x[i * n + k] * y[k * p + j];
      }
      z[i * p + j] = sum;
    }
  }
}
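
// Usage sketch (illustrative only): a 2x3 by 3x2 product with row-major,
// flat arrays; the values are arbitrary.
//   const float x[6] = {1, 2, 3, 4, 5, 6};    // 2 x 3
//   const float y[6] = {7, 8, 9, 10, 11, 12}; // 3 x 2
//   float z[4];                               // 2 x 2
//   vec_matmul(z, x, y, /*m=*/2, /*n=*/3, /*p=*/2);
//   // z == {58, 64, 139, 154}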

/// x: m * n, y: n * p (int8), s: n, z: m * p.
/// z[i][j] = sum(x[i][k] * y[k][j] * s[k])
template <typename T, typename U = T>
inline void vec_quantized_matmul_int8(
    T* __restrict__ z,
    const U* __restrict__ x,
    const int8_t* __restrict__ y,
    const U* __restrict__ s,
    int64_t m,
    int64_t n,
    int64_t p) {
  for (size_t i = 0; i < m; ++i) {
    for (size_t j = 0; j < p; ++j) {
      T sum = 0;
      for (size_t k = 0; k < n; ++k) {
        sum += x[i * n + k] * static_cast<U>(y[k * p + j]) * s[k];
      }
      z[i * p + j] = sum;
    }
  }
}
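
// Usage sketch (illustrative only): float activations times an int8 weight
// matrix that is dequantized on the fly with the per-row scales in `s`.
//   const float x[2] = {1.f, 2.f};    // 1 x 2
//   const int8_t y[2] = {10, -4};     // 2 x 1, quantized
//   const float s[2] = {0.5f, 0.25f}; // one scale per row of y
//   float z[1];                       // 1 x 1
//   vec_quantized_matmul_int8(z, x, y, s, /*m=*/1, /*n=*/2, /*p=*/1);
//   // z[0] == 1*10*0.5f + 2*(-4)*0.25f == 3.f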

/// Returns the smaller of `a` and `b`.
static inline size_t bounds_min(size_t a, size_t b) {
  return (a < b) ? a : b;
}

/// x: m * n, y: p * n, z: m * p, s: p * groups
/// z[i][j] = sum(x[i][k] * y[j][k] * s[j][k/g])
template <typename T, typename U = T, typename V = U>
inline void vec_quantized_matmul_transb_int8(
    T* __restrict__ z,
    const U* __restrict__ x,
    const int8_t* __restrict__ y,
    const V* __restrict__ s,
    int64_t m,
    int64_t n,
    int64_t p,
    int64_t g) {
  int64_t n_over_g = (n + g - 1) / g;

  for (size_t i = 0; i < m; ++i) {
    for (size_t j = 0; j < p; ++j) {
      T sum = 0;
      for (size_t k = 0; k < n; k += g) {
        T psum = 0;
        // the last group may have fewer than g elements
        for (size_t k2 = k; k2 < bounds_min(k + g, n); k2++) {
          psum += x[i * n + k2] * static_cast<U>(y[j * n + k2]);
        }
        sum += psum * s[j * n_over_g + k / g];
      }
      z[i * p + j] = sum;
    }
  }
}
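
// Usage sketch (illustrative only): y is stored transposed (p x n) and uses
// one scale per group of g columns, so s has p * ceil(n / g) entries.
//   const float x[4] = {1.f, 1.f, 1.f, 1.f};      // 1 x 4
//   const int8_t y[8] = {1, 2, 3, 4, 5, 6, 7, 8}; // 2 x 4, quantized
//   const float s[4] = {1.f, 0.5f, 2.f, 0.25f};   // 2 x (4 / 2)
//   float z[2];                                   // 1 x 2
//   vec_quantized_matmul_transb_int8(
//       z, x, y, s, /*m=*/1, /*n=*/4, /*p=*/2, /*g=*/2);
//   // z == {(1+2)*1.f + (3+4)*0.5f, (5+6)*2.f + (7+8)*0.25f} == {6.5f, 25.75f}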

// mat1 (m x n), mat2 (n x p), out (m x p), self (m x p)
// out[i][j] = alpha * sum(mat1[i][k] * mat2[k][j]) + beta * self[i][j],
// for k in range(n)
// T for tensor dtype, U for scalar type
template <typename T, typename U = T>
inline void vec_addmm(
    T* __restrict__ out_data,
    const T* __restrict__ self_data,
    const T* __restrict__ mat1_data,
    const T* __restrict__ mat2_data,
    int64_t m,
    int64_t n,
    int64_t p,
    U beta,
    U alpha) {
  for (size_t i = 0; i < m; ++i) {
    for (size_t j = 0; j < p; ++j) {
      T sum = 0;
      for (size_t k = 0; k < n; ++k) {
        sum += mat1_data[i * n + k] * mat2_data[k * p + j];
      }
      out_data[i * p + j] = sum * alpha + self_data[i * p + j] * beta;
    }
  }
}
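
// Usage sketch (illustrative only): out = alpha * (mat1 @ mat2) + beta * self.
// Note that `beta` precedes `alpha` in the parameter list.
//   const float self[1] = {10.f};
//   const float mat1[2] = {1.f, 2.f}; // 1 x 2
//   const float mat2[2] = {3.f, 4.f}; // 2 x 1
//   float out[1];
//   vec_addmm(out, self, mat1, mat2,
//             /*m=*/1, /*n=*/2, /*p=*/1, /*beta=*/1.f, /*alpha=*/2.f);
//   // out[0] == 2.f * (1*3 + 2*4) + 1.f * 10.f == 32.f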

/// Returns the sum of all elements in `x`, which must have `size` elements.
template <typename T>
inline float reduce_add(const T* x, size_t size) {
  return std::accumulate(x, x + size, 0.f);
}

/// Returns the sum of the squares of all elements in `x`, which must have
/// `size` elements.
template <typename T>
inline float vec_powerf(const T* x, size_t size) {
  float sum = 0;
  for (size_t i = 0; i < size; ++i) {
    sum += x[i] * x[i];
  }
  return sum;
}
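
// Usage sketch (illustrative only): both reductions accumulate in float
// regardless of the element type T.
//   const int32_t v[3] = {1, 2, 3};
//   float total = reduce_add(v, 3);          // 6.f
//   float sum_of_squares = vec_powerf(v, 3); // 1 + 4 + 9 == 14.f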

/// Computes the result of softmax(x, x + n) and writes it into `y`.
/// y = e^(x - max(x)) / sum(e^(x - max(x)))
/// T, U can only be one of double, float
template <
    typename T,
    typename U,
    typename checkT = typename std::enable_if<
        std::is_same<float, typename std::remove_cv<T>::type>::value ||
        std::is_same<double, typename std::remove_cv<T>::type>::value>::type,
    typename checkU = typename std::enable_if<
        std::is_same<float, typename std::remove_cv<U>::type>::value ||
        std::is_same<double, typename std::remove_cv<U>::type>::value>::type>
inline void vec_softmax(T* __restrict__ y, const U* __restrict__ x, int n) {
  U max_x = *std::max_element(x, x + n);
  T sum = 0;

  for (int i = 0; i < n; ++i) {
    y[i] = expf(x[i] - max_x);
    sum += y[i];
  }

  for (int i = 0; i < n; ++i) {
    y[i] /= sum;
  }
}
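
// Usage sketch (illustrative only): softmax over three logits. The maximum is
// subtracted internally, so large logits do not overflow the exponential.
//   const float logits[3] = {1.f, 2.f, 3.f};
//   float probs[3];
//   vec_softmax(probs, logits, 3);
//   // probs sums to 1.f; probs[2] is the largest entry (~0.665f)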

namespace internal {
/// Returns `v` clamped to the inclusive range [`lo`, `hi`].
template <class T>
constexpr const T& clamp(const T& v, const T& lo, const T& hi) {
#ifdef __cpp_lib_clamp
  return std::clamp(v, lo, hi);
#else
  return v < lo ? lo : hi < v ? hi : v;
#endif
}
} // namespace internal

/// Quantizes the elements of `x` into `y`, both of which must have `size`
/// elements. Inverse of `dequantize_i8_f32()`.
inline void quantize_i8_f32(
    int8_t* __restrict__ y,
    const float* __restrict__ x,
    float scale,
    int32_t zero_point,
    size_t size) {
  for (size_t i = 0; i < size; ++i) {
    float tmp = roundf(x[i] * scale + zero_point);
    y[i] = internal::clamp(tmp, -128.f, 127.f);
  }
}

/// Dequantizes the elements of `x` into `y`, both of which must have `size`
/// elements. Inverse of `quantize_i8_f32()`.
inline void dequantize_i8_f32(
    float* __restrict__ y,
    const int8_t* __restrict__ x,
    float scale,
    int32_t zero_point,
    size_t size) {
  for (size_t i = 0; i < size; ++i) {
    y[i] = scale * (x[i] - zero_point);
  }
}
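
// Usage sketch (illustrative only): quantize_i8_f32() multiplies by `scale`
// while dequantize_i8_f32() also multiplies by `scale`, so a lossless round
// trip uses reciprocal scale values.
//   const float x[2] = {0.5f, -1.f};
//   int8_t q[2];
//   float back[2];
//   quantize_i8_f32(q, x, /*scale=*/100.f, /*zero_point=*/0, 2);      // {50, -100}
//   dequantize_i8_f32(back, q, /*scale=*/0.01f, /*zero_point=*/0, 2); // {0.5f, -1.f}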

} // namespace executor
} // namespace torch