/******************************************************************************
 *
 *  Copyright 2022 Google LLC
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at:
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *
 ******************************************************************************/

/*
 * When the target provides AArch64 NEON, the real intrinsics are used.
 * Otherwise, the subset of intrinsics defined below is emulated in plain C,
 * with each vector type represented as a structure wrapping an array `e`
 * of lanes.
 */

#if __ARM_NEON && __ARM_ARCH_ISA_A64

#include <arm_neon.h>

#else

#include <stdint.h>


/* ----------------------------------------------------------------------------
 *  Integer
 * -------------------------------------------------------------------------- */

typedef struct { int16_t e[4]; } int16x4_t;

typedef struct { int16_t e[8]; } int16x8_t;
typedef struct { int32_t e[4]; } int32x4_t;
typedef struct { int64_t e[2]; } int64x2_t;


/**
 * Load / Store
 */

/* Load 4 consecutive 16 bits values from `p` */
__attribute__((unused))
static int16x4_t vld1_s16(const int16_t *p)
{
    return (int16x4_t){ { p[0], p[1], p[2], p[3] } };
}


/**
 * Arithmetic
 */

/* Lane-wise widening multiply, `a[i] * b[i]` */
__attribute__((unused))
static int32x4_t vmull_s16(int16x4_t a, int16x4_t b)
{
    return (int32x4_t){ { a.e[0] * b.e[0], a.e[1] * b.e[1],
                          a.e[2] * b.e[2], a.e[3] * b.e[3] } };
}

/* Lane-wise widening multiply-accumulate, `r[i] + a[i] * b[i]` */
__attribute__((unused))
static int32x4_t vmlal_s16(int32x4_t r, int16x4_t a, int16x4_t b)
{
    return (int32x4_t){ {
        r.e[0] + a.e[0] * b.e[0], r.e[1] + a.e[1] * b.e[1],
        r.e[2] + a.e[2] * b.e[2], r.e[3] + a.e[3] * b.e[3] } };
}

/* Add each adjacent pair of lanes of `b`, as 64 bits, to the lanes of `a` */
__attribute__((unused))
static int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b)
{
    int64x2_t r;

    /* The cast widens the pair sum to 64 bits before adding */
    r.e[0] = a.e[0] + ((int64_t)b.e[0] + b.e[1]);
    r.e[1] = a.e[1] + ((int64_t)b.e[2] + b.e[3]);

    return r;
}


/**
 * Reduce
 */

/* Sum of the 4 lanes of `v` */
__attribute__((unused))
static int32_t vaddvq_s32(int32x4_t v)
{
    return v.e[0] + v.e[1] + v.e[2] + v.e[3];
}

/* Sum of the 2 lanes of `v` */
__attribute__((unused))
static int64_t vaddvq_s64(int64x2_t v)
{
    return v.e[0] + v.e[1];
}


/**
 * Manipulation
 */

/* Extract 4 lanes, starting at lane `n`, from the concatenation `a:b` */
__attribute__((unused))
static int16x4_t vext_s16(int16x4_t a, int16x4_t b, const int n)
{
    int16_t x[] = { a.e[0], a.e[1], a.e[2], a.e[3],
                    b.e[0], b.e[1], b.e[2], b.e[3] };

    return (int16x4_t){ { x[n], x[n+1], x[n+2], x[n+3] } };
}

/* Set all lanes to `v`; the parameter is signed, as in `<arm_neon.h>` */
__attribute__((unused))
static int32x4_t vmovq_n_s32(int32_t v)
{
    return (int32x4_t){ { v, v, v, v } };
}

/* Set all lanes to `v` */
__attribute__((unused))
static int64x2_t vmovq_n_s64(int64_t v)
{
    return (int64x2_t){ { v, v, } };
}



/* ----------------------------------------------------------------------------
 *  Floating Point
 * -------------------------------------------------------------------------- */

typedef struct { float e[2]; } float32x2_t;
typedef struct { float e[4]; } float32x4_t;

typedef struct { float32x2_t val[2]; } float32x2x2_t;
typedef struct { float32x4_t val[2]; } float32x4x2_t;


/**
 * Load / Store
 */

/* Load 2 consecutive values from `p` */
__attribute__((unused))
static float32x2_t vld1_f32(const float *p)
{
    return (float32x2_t){ { p[0], p[1] } };
}

/* Load 4 consecutive values from `p` */
__attribute__((unused))
static float32x4_t vld1q_f32(const float *p)
{
    return (float32x4_t){ { p[0], p[1], p[2], p[3] } };
}

/* Load the single value `p[0]`, and replicate it to all lanes */
__attribute__((unused))
static float32x4_t vld1q_dup_f32(const float *p)
{
    return (float32x4_t){ { p[0], p[0], p[0], p[0] } };
}

/* De-interleaving load, even indices to `val[0]`, odd indices to `val[1]` */
__attribute__((unused))
static float32x2x2_t vld2_f32(const float *p)
{
    return (float32x2x2_t){ .val[0] = { { p[0], p[2] } },
                            .val[1] = { { p[1], p[3] } } };
}

/* De-interleaving load, even indices to `val[0]`, odd indices to `val[1]` */
__attribute__((unused))
static float32x4x2_t vld2q_f32(const float *p)
{
    return (float32x4x2_t){ .val[0] = { { p[0], p[2], p[4], p[6] } },
                            .val[1] = { { p[1], p[3], p[5], p[7] } } };
}

/* Store the 2 lanes of `v` to `p` */
__attribute__((unused))
static void vst1_f32(float *p, float32x2_t v)
{
    p[0] = v.e[0], p[1] = v.e[1];
}

/* Store the 4 lanes of `v` to `p` */
__attribute__((unused))
static void vst1q_f32(float *p, float32x4_t v)
{
    p[0] = v.e[0], p[1] = v.e[1], p[2] = v.e[2], p[3] = v.e[3];
}

/**
 * Arithmetic
 */

/* Lane-wise negation */
__attribute__((unused))
static float32x2_t vneg_f32(float32x2_t a)
{
    return (float32x2_t){ { -a.e[0], -a.e[1] } };
}

/* Lane-wise negation */
__attribute__((unused))
static float32x4_t vnegq_f32(float32x4_t a)
{
    return (float32x4_t){ { -a.e[0], -a.e[1], -a.e[2], -a.e[3] } };
}

/* Lane-wise addition, `a[i] + b[i]` */
__attribute__((unused))
static float32x4_t vaddq_f32(float32x4_t a, float32x4_t b)
{
    return (float32x4_t){ { a.e[0] + b.e[0], a.e[1] + b.e[1],
                            a.e[2] + b.e[2], a.e[3] + b.e[3] } };
}

/* Lane-wise subtraction, `a[i] - b[i]` */
__attribute__((unused))
static float32x4_t vsubq_f32(float32x4_t a, float32x4_t b)
{
    return (float32x4_t){ { a.e[0] - b.e[0], a.e[1] - b.e[1],
                            a.e[2] - b.e[2], a.e[3] - b.e[3] } };
}

/* Lane-wise multiply-add, written as `a[i] + b[i] * c[i]` */
__attribute__((unused))
static float32x2_t vfma_f32(float32x2_t a, float32x2_t b, float32x2_t c)
{
    return (float32x2_t){ {
        a.e[0] + b.e[0] * c.e[0], a.e[1] + b.e[1] * c.e[1] } };
}

/* Lane-wise multiply-add, written as `a[i] + b[i] * c[i]` */
__attribute__((unused))
static float32x4_t vfmaq_f32(float32x4_t a, float32x4_t b, float32x4_t c)
{
    return (float32x4_t){ {
        a.e[0] + b.e[0] * c.e[0], a.e[1] + b.e[1] * c.e[1],
        a.e[2] + b.e[2] * c.e[2], a.e[3] + b.e[3] * c.e[3] } };
}

/* Lane-wise multiply-subtract, written as `a[i] - b[i] * c[i]` */
__attribute__((unused))
static float32x2_t vfms_f32(float32x2_t a, float32x2_t b, float32x2_t c)
{
    return (float32x2_t){ {
        a.e[0] - b.e[0] * c.e[0], a.e[1] - b.e[1] * c.e[1] } };
}

/* Lane-wise multiply-subtract, written as `a[i] - b[i] * c[i]` */
__attribute__((unused))
static float32x4_t vfmsq_f32(float32x4_t a, float32x4_t b, float32x4_t c)
{
    return (float32x4_t){ {
        a.e[0] - b.e[0] * c.e[0], a.e[1] - b.e[1] * c.e[1],
        a.e[2] - b.e[2] * c.e[2], a.e[3] - b.e[3] * c.e[3] } };
}


/**
 * Manipulation
 */

/* Reinterpret the 8 bytes of `u` as 2 floats, taken in memory order */
__attribute__((unused))
static float32x2_t vcreate_f32(uint64_t u)
{
    /* Pun through a union: reading the `uint64_t` object through a
     * `float *` would break the aliasing rules of the language */
    union { uint64_t u; float f[2]; } pun = { .u = u };

    return (float32x2_t){ { pun.f[0], pun.f[1] } };
}

/* Join `a` (low lanes) and `b` (high lanes) into a 4 lanes vector */
__attribute__((unused))
static float32x4_t vcombine_f32(float32x2_t a, float32x2_t b)
{
    return (float32x4_t){ { a.e[0], a.e[1], b.e[0], b.e[1] } };
}

/* Lanes 0 and 1 of `a` */
__attribute__((unused))
static float32x2_t vget_low_f32(float32x4_t a)
{
    return (float32x2_t){ { a.e[0], a.e[1] } };
}

/* Lanes 2 and 3 of `a` */
__attribute__((unused))
static float32x2_t vget_high_f32(float32x4_t a)
{
    return (float32x2_t){ { a.e[2], a.e[3] } };
}

/* Set all lanes to `v` */
__attribute__((unused))
static float32x4_t vmovq_n_f32(float v)
{
    return (float32x4_t){ { v, v, v, v } };
}

/* Swap the 2 lanes of `v` */
__attribute__((unused))
static float32x2_t vrev64_f32(float32x2_t v)
{
    return (float32x2_t){ { v.e[1], v.e[0] } };
}

/* Swap the lanes within each pair (0, 1) and (2, 3) of `v` */
__attribute__((unused))
static float32x4_t vrev64q_f32(float32x4_t v)
{
    return (float32x4_t){ { v.e[1], v.e[0], v.e[3], v.e[2] } };
}

/* Even lanes of `a` and `b`, interleaved */
__attribute__((unused))
static float32x2_t vtrn1_f32(float32x2_t a, float32x2_t b)
{
    return (float32x2_t){ { a.e[0], b.e[0] } };
}

/* Odd lanes of `a` and `b`, interleaved */
__attribute__((unused))
static float32x2_t vtrn2_f32(float32x2_t a, float32x2_t b)
{
    return (float32x2_t){ { a.e[1], b.e[1] } };
}

/* Even lanes of `a` and `b`, interleaved */
__attribute__((unused))
static float32x4_t vtrn1q_f32(float32x4_t a, float32x4_t b)
{
    return (float32x4_t){ { a.e[0], b.e[0], a.e[2], b.e[2] } };
}

/* Odd lanes of `a` and `b`, interleaved */
__attribute__((unused))
static float32x4_t vtrn2q_f32(float32x4_t a, float32x4_t b)
{
    return (float32x4_t){ { a.e[1], b.e[1], a.e[3], b.e[3] } };
}

/* Low halves (lanes 0, 1) of `a` and `b`, interleaved */
__attribute__((unused))
static float32x4_t vzip1q_f32(float32x4_t a, float32x4_t b)
{
    return (float32x4_t){ { a.e[0], b.e[0], a.e[1], b.e[1] } };
}

/* High halves (lanes 2, 3) of `a` and `b`, interleaved */
__attribute__((unused))
static float32x4_t vzip2q_f32(float32x4_t a, float32x4_t b)
{
    return (float32x4_t){ { a.e[2], b.e[2], a.e[3], b.e[3] } };
}


#endif /* __ARM_NEON && __ARM_ARCH_ISA_A64 */