1 /*
2 * Copyright (c) Meta Platforms, Inc. and affiliates.
3 * All rights reserved.
4 * Copyright 2024 Arm Limited and/or its affiliates.
5 *
6 * This source code is licensed under the BSD-style license found in the
7 * LICENSE file in the root directory of this source tree.
8 */
9
10 #pragma once
11
12 #include <algorithm>
13 #include <cmath>
14 #include <cstdint>
15 #include <cstring>
16 #include <iostream>
17 #include <numeric>
18 #include <ostream>
19 #include <type_traits>
20 /**
21 * @file
22 * This header defines common, low-level operations that can often be
23 * vectorized/accelerated on hardware targets.
24 *
25 * Although they do not yet have hardware-optimized implementations, operators
26 * that use this API can benefit from optimizations in the future.
27 */
28
29 namespace torch {
30 namespace executor {
31
32 /// Returns the minimum element of the array at `x`, which must have `size`
33 /// elements.
inline float vec_minf(const float* x, size_t size) {
  // Precondition: size >= 1 (an empty array has no minimum).
  float smallest = x[0];
  for (size_t i = 1; i < size; ++i) {
    if (x[i] < smallest) {
      smallest = x[i];
    }
  }
  return smallest;
}
37
38 /// Returns the maximum element of the array at `x`, which must have `size`
39 /// elements.
inline float vec_maxf(const float* x, size_t size) {
  // Precondition: size >= 1 (an empty array has no maximum).
  float largest = x[0];
  for (size_t i = 1; i < size; ++i) {
    if (largest < x[i]) {
      largest = x[i];
    }
  }
  return largest;
}
43
44 /// Add each element of `x` and `y` into the corresponding element of `z`. All
45 /// arrays must have `size` elements.
inline void vec_addf(
    float* __restrict__ z,
    const float* __restrict__ x,
    const float* __restrict__ y,
    size_t size) {
  // Elementwise sum; __restrict__ promises z does not alias x or y, which
  // lets the compiler vectorize the loop.
  for (size_t idx = 0; idx != size; ++idx) {
    z[idx] = x[idx] + y[idx];
  }
}
55
56 /// Multiplies every element of `x` by `scale`, and writes the result into the
57 /// corresponding element of `y`. `x` and `y` must have `size` elements.
inline void vec_scalef(
    float* __restrict__ y,
    const float* __restrict__ x,
    float scale,
    size_t size) {
  // y[i] = x[i] * scale for every element; x and y must not alias.
  for (size_t idx = 0; idx != size; ++idx) {
    y[idx] = x[idx] * scale;
  }
}
67
68 /// x: m * n, y: n * p, z: m * p.
69 /// z[i][j] = sum(x[i][k] * y[k][j])
template <typename T, typename U = T>
inline void vec_matmul(
    T* __restrict__ z,
    const U* __restrict__ x,
    const U* __restrict__ y,
    int64_t m,
    int64_t n,
    int64_t p) {
  // Naive triple loop: z[i][j] = sum_k x[i][k] * y[k][j].
  // Indices are int64_t to match the signed dimension parameters; the
  // previous size_t counters mixed signed/unsigned comparisons, and a
  // negative dimension would have wrapped to a huge unsigned bound.
  for (int64_t i = 0; i < m; ++i) {
    for (int64_t j = 0; j < p; ++j) {
      T sum = 0;
      for (int64_t k = 0; k < n; ++k) {
        sum += x[i * n + k] * y[k * p + j];
      }
      z[i * p + j] = sum;
    }
  }
}
88
/// Matmul with an int8-quantized right operand and per-k scales.
/// x: m * n, y: n * p (int8), s: n scales, z: m * p.
/// z[i][j] = sum_k(x[i][k] * y[k][j] * s[k])
template <typename T, typename U = T>
inline void vec_quantized_matmul_int8(
    T* __restrict__ z,
    const U* __restrict__ x,
    const int8_t* __restrict__ y,
    const U* __restrict__ s,
    int64_t m,
    int64_t n,
    int64_t p) {
  // int64_t indices match the signed dimension parameters; the previous
  // size_t counters mixed signed/unsigned comparisons in the loop bounds.
  for (int64_t i = 0; i < m; ++i) {
    for (int64_t j = 0; j < p; ++j) {
      T sum = 0;
      for (int64_t k = 0; k < n; ++k) {
        // Widen the int8 weight to U before scaling to avoid integer
        // truncation when U is floating point.
        sum += x[i * n + k] * static_cast<U>(y[k * p + j]) * s[k];
      }
      z[i * p + j] = sum;
    }
  }
}
108
/// Returns the smaller of `a` and `b`.
static inline size_t bounds_min(size_t a, size_t b) {
  return std::min(a, b);
}
112
113 /// x: m * n, y: p * n, z: m * p, s: p * groups
114 /// z[i][j] = sum(x[i][k] * y[j][k] * s[j][k/g])
template <typename T, typename U = T, typename V = U>
inline void vec_quantized_matmul_transb_int8(
    T* __restrict__ z,
    const U* __restrict__ x,
    const int8_t* __restrict__ y,
    const V* __restrict__ s,
    int64_t m,
    int64_t n,
    int64_t p,
    int64_t g) {
  // Scales per row of y: ceiling division because the last group may have
  // fewer than g elements.
  const int64_t n_over_g = (n + g - 1) / g;

  // int64_t indices match the signed dimension parameters; the previous
  // size_t counters were stepped by and compared against int64_t values,
  // mixing signed/unsigned arithmetic.
  for (int64_t i = 0; i < m; ++i) {
    for (int64_t j = 0; j < p; ++j) {
      T sum = 0;
      for (int64_t k = 0; k < n; k += g) {
        // Accumulate one quantization group, then apply its shared scale.
        T psum = 0;
        const int64_t k_end = std::min(k + g, n);
        for (int64_t k2 = k; k2 < k_end; ++k2) {
          psum += x[i * n + k2] * static_cast<U>(y[j * n + k2]);
        }
        sum += psum * s[j * n_over_g + k / g];
      }
      z[i * p + j] = sum;
    }
  }
}
142
// mat1 (m x n), mat2 (n x p), out (m x p), self (m x p)
// out[i][j] = alpha * sum(mat1[i][k] * mat2[k][j]) + beta * self[i][j], for k in range(n)
// T for tensor dtype, U for scalar type
template <typename T, typename U = T>
inline void vec_addmm(
    T* __restrict__ out_data,
    const T* __restrict__ self_data,
    const T* __restrict__ mat1_data,
    const T* __restrict__ mat2_data,
    int64_t m,
    int64_t n,
    int64_t p,
    U beta,
    U alpha) {
  // out = alpha * (mat1 @ mat2) + beta * self, elementwise over (m x p).
  // int64_t indices match the signed dimension parameters; the previous
  // size_t counters mixed signed/unsigned comparisons in the loop bounds.
  for (int64_t i = 0; i < m; ++i) {
    for (int64_t j = 0; j < p; ++j) {
      T sum = 0;
      for (int64_t k = 0; k < n; ++k) {
        sum += mat1_data[i * n + k] * mat2_data[k * p + j];
      }
      out_data[i * p + j] = sum * alpha + self_data[i * p + j] * beta;
    }
  }
}
167
168 /// Returns the sum of all elements in `x`, which must have `size` elements.
template <typename T>
inline float reduce_add(const T* x, size_t size) {
  // Accumulate in float to match the declared return type, regardless of T.
  float total = 0.f;
  for (size_t i = 0; i < size; ++i) {
    total += x[i];
  }
  return total;
}
173
174 /// Returns the sum of the squares of all elements in `x`, which must have
175 /// `size` elements.
template <typename T>
inline float vec_powerf(const T* x, size_t size) {
  // Sum of squares, accumulated in float to match the return type.
  float acc = 0;
  for (const T* p = x; p != x + size; ++p) {
    acc += (*p) * (*p);
  }
  return acc;
}
184
/// Computes softmax over the range [x, x+n), writing the result into y.
/// y = e ^ (x - max(x)) / sum(e ^ (x - max(x)))
/// T, U can only be one of double, float
template <
    typename T,
    typename U,
    typename checkT = typename std::enable_if<
        std::is_same<float, typename std::remove_cv<T>::type>::value ||
        std::is_same<double, typename std::remove_cv<T>::type>::value>::type,
    typename checkU = typename std::enable_if<
        std::is_same<float, typename std::remove_cv<U>::type>::value ||
        std::is_same<double, typename std::remove_cv<U>::type>::value>::type>
inline void vec_softmax(T* __restrict__ y, const U* __restrict__ x, int n) {
  // Subtract the max for numerical stability: every exponent is <= 0, so
  // exp() cannot overflow.
  U max_x = *std::max_element(x, x + n);
  T sum = 0;

  for (int i = 0; i < n; ++i) {
    // std::exp selects the overload matching the operand type; the previous
    // C expf() forced float precision even when T/U are double.
    y[i] = std::exp(x[i] - max_x);
    sum += y[i];
  }

  for (int i = 0; i < n; ++i) {
    y[i] /= sum;
  }
}
210
211 namespace internal {
/// Constrains `v` to the closed range [lo, hi].
template <class T>
constexpr const T& clamp(const T& v, const T& lo, const T& hi) {
#ifdef __cpp_lib_clamp
  return std::clamp(v, lo, hi);
#else
  // Pre-C++17 fallback with identical semantics to std::clamp.
  if (v < lo) {
    return lo;
  }
  return hi < v ? hi : v;
#endif
}
220 } // namespace internal
221
222 /// Quantizes the elements of `x` into `y`, both of which must have `size`
223 /// elements. Inverse of `dequantize_i8_f32()`.
inline void quantize_i8_f32(
    int8_t* __restrict__ y,
    const float* __restrict__ x,
    float scale,
    int32_t zero_point,
    size_t size) {
  for (size_t i = 0; i < size; ++i) {
    // Scale, shift, round to nearest, then saturate to the int8 range
    // before the narrowing conversion.
    float q = roundf(x[i] * scale + zero_point);
    if (q < -128.f) {
      q = -128.f;
    }
    if (q > 127.f) {
      q = 127.f;
    }
    y[i] = static_cast<int8_t>(q);
  }
}
235
236 /// Dequantizes the elements of `x` into `y`, both of which must have `size`
237 /// elements. Inverse of `quantize_i8_f32()`.
inline void dequantize_i8_f32(
    float* __restrict__ y,
    const int8_t* __restrict__ x,
    float scale,
    int32_t zero_point,
    size_t size) {
  for (size_t i = 0; i < size; ++i) {
    // Re-center around the zero point, then apply the scale.
    const int32_t centered = static_cast<int32_t>(x[i]) - zero_point;
    y[i] = scale * static_cast<float>(centered);
  }
}
248
249 } // namespace executor
250 } // namespace torch
251