/******************************************************************************
 *
 *  Copyright 2022 Google LLC
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at:
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *
 ******************************************************************************/

/**
 * NEON intrinsics, or a scalar stand-in for them
 *
 * When the compiler reports `__ARM_NEON`, the genuine <arm_neon.h> is used.
 * Otherwise `__ARM_NEON` is defined to 1 and the subset of vector types and
 * intrinsics below is provided as plain C operating on small structs, one
 * array element per lane (lane `i` is `e[i]`).
 *
 * Every function is `static` and tagged `unused`, so including this file
 * without calling all of them does not trigger unused-function warnings.
 */

#if __ARM_NEON

#include <arm_neon.h>

#else
#define __ARM_NEON 1

#include <stdint.h>


/* ----------------------------------------------------------------------------
 *  Integer
 * -------------------------------------------------------------------------- */

typedef struct { int16_t e[4]; } int16x4_t;

typedef struct { int16_t e[8]; } int16x8_t;
typedef struct { int32_t e[4]; } int32x4_t;
typedef struct { int64_t e[2]; } int64x2_t;


/**
 * Load / Store
 */

/* Load 4 consecutive 16-bit values from `p` into lanes 0..3. */
__attribute__((unused))
static int16x4_t vld1_s16(const int16_t *p)
{
    return (int16x4_t){ { p[0], p[1], p[2], p[3] } };
}


/**
 * Arithmetic
 */

/* Widening multiply: lane i = a[i] * b[i], computed on 32 bits.
 * The product of two 16-bit values always fits, so this cannot overflow. */
__attribute__((unused))
static int32x4_t vmull_s16(int16x4_t a, int16x4_t b)
{
    return (int32x4_t){ { (int32_t)a.e[0] * b.e[0], (int32_t)a.e[1] * b.e[1],
                          (int32_t)a.e[2] * b.e[2], (int32_t)a.e[3] * b.e[3] } };
}

/* Widening multiply-accumulate: lane i = r[i] + a[i] * b[i].
 * The accumulation is carried out on unsigned 32 bits so that it wraps
 * modulo 2^32 instead of being a signed overflow (undefined behaviour). */
__attribute__((unused))
static int32x4_t vmlal_s16(int32x4_t r, int16x4_t a, int16x4_t b)
{
    int32x4_t v;

    for (int i = 0; i < 4; i++) {
        uint32_t prod = (uint32_t)((int32_t)a.e[i] * b.e[i]);
        v.e[i] = (int32_t)((uint32_t)r.e[i] + prod);
    }

    return v;
}

/* Pairwise add and accumulate long:
 * lane 0 = a[0] + b[0] + b[1], lane 1 = a[1] + b[2] + b[3], on 64 bits. */
__attribute__((unused))
static int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b)
{
    int64x2_t r;

    r.e[0] = a.e[0] + ((int64_t)b.e[0] + b.e[1]);
    r.e[1] = a.e[1] + ((int64_t)b.e[2] + b.e[3]);

    return r;
}


/**
 * Reduce
 */

/* Sum of the 4 lanes, as a 32-bit value.
 * Added on unsigned 32 bits so the sum wraps modulo 2^32 rather than
 * overflowing a signed integer. */
__attribute__((unused))
static int32_t vaddvq_s32(int32x4_t v)
{
    uint32_t sum = (uint32_t)v.e[0] + (uint32_t)v.e[1]
                 + (uint32_t)v.e[2] + (uint32_t)v.e[3];

    return (int32_t)sum;
}

/* Sum of the 2 lanes, as a 64-bit value. */
__attribute__((unused))
static int64_t vaddvq_s64(int64x2_t v)
{
    return v.e[0] + v.e[1];
}


/**
 * Manipulation
 */

/* Extract 4 consecutive lanes, starting at index `n`, from the 8-lane
 * concatenation `a` then `b`. `n` must be in 0..3: the function reads
 * elements n..n+3 of an 8-element array and does not check the range. */
__attribute__((unused))
static int16x4_t vext_s16(int16x4_t a, int16x4_t b, const int n)
{
    int16_t x[] = { a.e[0], a.e[1], a.e[2], a.e[3],
                    b.e[0], b.e[1], b.e[2], b.e[3] };

    return (int16x4_t){ { x[n], x[n+1], x[n+2], x[n+3] } };
}

/* Broadcast `v` to the 4 lanes.
 * The parameter is signed, like the lanes it fills. */
__attribute__((unused))
static int32x4_t vmovq_n_s32(int32_t v)
{
    return (int32x4_t){ { v, v, v, v } };
}

/* Broadcast `v` to the 2 lanes. */
__attribute__((unused))
static int64x2_t vmovq_n_s64(int64_t v)
{
    return (int64x2_t){ { v, v, } };
}



/* ----------------------------------------------------------------------------
 *  Floating Point
 * -------------------------------------------------------------------------- */

typedef struct { float e[2]; } float32x2_t;
typedef struct { float e[4]; } float32x4_t;

typedef struct { float32x2_t val[2]; } float32x2x2_t;
typedef struct { float32x4_t val[2]; } float32x4x2_t;


/**
 * Load / Store
 */

/* Load 2 consecutive floats from `p`. */
__attribute__((unused))
static float32x2_t vld1_f32(const float *p)
{
    return (float32x2_t){ { p[0], p[1] } };
}

/* Load 4 consecutive floats from `p`. */
__attribute__((unused))
static float32x4_t vld1q_f32(const float *p)
{
    return (float32x4_t){ { p[0], p[1], p[2], p[3] } };
}

/* Load the single float at `p` and replicate it to the 4 lanes. */
__attribute__((unused))
static float32x4_t vld1q_dup_f32(const float *p)
{
    return (float32x4_t){ { p[0], p[0], p[0], p[0] } };
}

/* De-interleaving load of 4 floats:
 * even indices go to `val[0]`, odd indices to `val[1]`. */
__attribute__((unused))
static float32x2x2_t vld2_f32(const float *p)
{
    return (float32x2x2_t){ .val[0] = { { p[0], p[2] } },
                            .val[1] = { { p[1], p[3] } } };
}

/* De-interleaving load of 8 floats:
 * even indices go to `val[0]`, odd indices to `val[1]`. */
__attribute__((unused))
static float32x4x2_t vld2q_f32(const float *p)
{
    return (float32x4x2_t){ .val[0] = { { p[0], p[2], p[4], p[6] } },
                            .val[1] = { { p[1], p[3], p[5], p[7] } } };
}

/* Store the 2 lanes of `v` to `p[0..1]`. */
__attribute__((unused))
static void vst1_f32(float *p, float32x2_t v)
{
    p[0] = v.e[0], p[1] = v.e[1];
}

/* Store the 4 lanes of `v` to `p[0..3]`. */
__attribute__((unused))
static void vst1q_f32(float *p, float32x4_t v)
{
    p[0] = v.e[0], p[1] = v.e[1], p[2] = v.e[2], p[3] = v.e[3];
}

/**
 * Arithmetic
 */

/* Lane-wise negation, 2 lanes. */
__attribute__((unused))
static float32x2_t vneg_f32(float32x2_t a)
{
    return (float32x2_t){ { -a.e[0], -a.e[1] } };
}

/* Lane-wise negation, 4 lanes. */
__attribute__((unused))
static float32x4_t vnegq_f32(float32x4_t a)
{
    return (float32x4_t){ { -a.e[0], -a.e[1], -a.e[2], -a.e[3] } };
}

/* Lane-wise `a + b`. */
__attribute__((unused))
static float32x4_t vaddq_f32(float32x4_t a, float32x4_t b)
{
    return (float32x4_t){ { a.e[0] + b.e[0], a.e[1] + b.e[1],
                            a.e[2] + b.e[2], a.e[3] + b.e[3] } };
}

/* Lane-wise `a - b`. */
__attribute__((unused))
static float32x4_t vsubq_f32(float32x4_t a, float32x4_t b)
{
    return (float32x4_t){ { a.e[0] - b.e[0], a.e[1] - b.e[1],
                            a.e[2] - b.e[2], a.e[3] - b.e[3] } };
}

/* Lane-wise `a + b * c`, 2 lanes.
 * Written as a separate multiply and add; the result is not guaranteed
 * to be rounded only once, as a fused instruction would. */
__attribute__((unused))
static float32x2_t vfma_f32(float32x2_t a, float32x2_t b, float32x2_t c)
{
    return (float32x2_t){ {
        a.e[0] + b.e[0] * c.e[0], a.e[1] + b.e[1] * c.e[1] } };
}

/* Lane-wise `a + b * c`, 4 lanes (separate multiply and add). */
__attribute__((unused))
static float32x4_t vfmaq_f32(float32x4_t a, float32x4_t b, float32x4_t c)
{
    return (float32x4_t){ {
        a.e[0] + b.e[0] * c.e[0], a.e[1] + b.e[1] * c.e[1],
        a.e[2] + b.e[2] * c.e[2], a.e[3] + b.e[3] * c.e[3] } };
}

/* Lane-wise `a - b * c`, 2 lanes (separate multiply and subtract). */
__attribute__((unused))
static float32x2_t vfms_f32(float32x2_t a, float32x2_t b, float32x2_t c)
{
    return (float32x2_t){ {
        a.e[0] - b.e[0] * c.e[0], a.e[1] - b.e[1] * c.e[1] } };
}

/* Lane-wise `a - b * c`, 4 lanes (separate multiply and subtract). */
__attribute__((unused))
static float32x4_t vfmsq_f32(float32x4_t a, float32x4_t b, float32x4_t c)
{
    return (float32x4_t){ {
        a.e[0] - b.e[0] * c.e[0], a.e[1] - b.e[1] * c.e[1],
        a.e[2] - b.e[2] * c.e[2], a.e[3] - b.e[3] * c.e[3] } };
}


/**
 * Manipulation
 */

/* Build a 2-lane vector from a 64-bit pattern: lane 0 takes the low
 * 32 bits, lane 1 the high 32 bits, each reinterpreted as a float.
 * The bits are reinterpreted through a union; reading the integer through
 * a `float *` would break the effective-type (strict aliasing) rules. */
__attribute__((unused))
static float32x2_t vcreate_f32(uint64_t u)
{
    union { uint32_t bits; float f; } lo = { .bits = (uint32_t)u };
    union { uint32_t bits; float f; } hi = { .bits = (uint32_t)(u >> 32) };

    return (float32x2_t){ { lo.f, hi.f } };
}

/* Join two 2-lane vectors: `a` gives lanes 0..1, `b` lanes 2..3. */
__attribute__((unused))
static float32x4_t vcombine_f32(float32x2_t a, float32x2_t b)
{
    return (float32x4_t){ { a.e[0], a.e[1], b.e[0], b.e[1] } };
}

/* Lanes 0..1 of `a`. */
__attribute__((unused))
static float32x2_t vget_low_f32(float32x4_t a)
{
    return (float32x2_t){ { a.e[0], a.e[1] } };
}

/* Lanes 2..3 of `a`. */
__attribute__((unused))
static float32x2_t vget_high_f32(float32x4_t a)
{
    return (float32x2_t){ { a.e[2], a.e[3] } };
}

/* Broadcast `v` to the 4 lanes. */
__attribute__((unused))
static float32x4_t vmovq_n_f32(float v)
{
    return (float32x4_t){ { v, v, v, v } };
}

/* Swap the two lanes. */
__attribute__((unused))
static float32x2_t vrev64_f32(float32x2_t v)
{
    return (float32x2_t){ { v.e[1], v.e[0] } };
}

/* Swap the lanes inside each pair: (0,1) and (2,3). */
__attribute__((unused))
static float32x4_t vrev64q_f32(float32x4_t v)
{
    return (float32x4_t){ { v.e[1], v.e[0], v.e[3], v.e[2] } };
}

/* Lane 0 of `a`, then lane 0 of `b`. */
__attribute__((unused))
static float32x2_t vtrn1_f32(float32x2_t a, float32x2_t b)
{
    return (float32x2_t){ { a.e[0], b.e[0] } };
}

/* Lane 1 of `a`, then lane 1 of `b`. */
__attribute__((unused))
static float32x2_t vtrn2_f32(float32x2_t a, float32x2_t b)
{
    return (float32x2_t){ { a.e[1], b.e[1] } };
}

/* Even lanes of `a` and `b`, interleaved: a0, b0, a2, b2. */
__attribute__((unused))
static float32x4_t vtrn1q_f32(float32x4_t a, float32x4_t b)
{
    return (float32x4_t){ { a.e[0], b.e[0], a.e[2], b.e[2] } };
}

/* Odd lanes of `a` and `b`, interleaved: a1, b1, a3, b3. */
__attribute__((unused))
static float32x4_t vtrn2q_f32(float32x4_t a, float32x4_t b)
{
    return (float32x4_t){ { a.e[1], b.e[1], a.e[3], b.e[3] } };
}

/* Low halves of `a` and `b`, interleaved: a0, b0, a1, b1. */
__attribute__((unused))
static float32x4_t vzip1q_f32(float32x4_t a, float32x4_t b)
{
    return (float32x4_t){ { a.e[0], b.e[0], a.e[1], b.e[1] } };
}

/* High halves of `a` and `b`, interleaved: a2, b2, a3, b3. */
__attribute__((unused))
static float32x4_t vzip2q_f32(float32x4_t a, float32x4_t b)
{
    return (float32x4_t){ { a.e[2], b.e[2], a.e[3], b.e[3] } };
}


#endif /* __ARM_NEON */