1 /*
2 * Copyright 2024 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #ifndef ULTRAHDR_DSP_ARM_MEM_NEON_H
18 #define ULTRAHDR_DSP_ARM_MEM_NEON_H
19
20 #include <arm_neon.h>
21
22 #include "ultrahdr/ultrahdrcommon.h"
23
24 namespace ultrahdr {
25
// The multi-vector load/store intrinsics are well-supported on AArch64 but
// only supported from GCC 14.1 (and not at all on Clang) for 32-bit platforms.
// Note: identifiers that are not defined evaluate to 0 in #if, so this guard
// is safe on compilers that define none of these macros.
#if __aarch64__ || (!__clang__ && __GNUC__ >= 14)
#define COMPILER_SUPPORTS_LDST_MULTIPLE 1
#endif
31
// Load 32 contiguous bytes starting at `src` as a pair of uint8x16 vectors.
static FORCE_INLINE uint8x16x2_t load_u8x16_x2(const uint8_t *src) {
#ifdef COMPILER_SUPPORTS_LDST_MULTIPLE
  return vld1q_u8_x2(src);
#else
  // Fall back to two single-vector loads on toolchains without the
  // multi-vector intrinsics.
  uint8x16x2_t ret;
  ret.val[0] = vld1q_u8(src);
  ret.val[1] = vld1q_u8(src + 16);
  return ret;
#endif
}
40
// Load 64 contiguous bytes starting at `src` as four uint8x16 vectors.
static FORCE_INLINE uint8x16x4_t load_u8x16_x4(const uint8_t *src) {
#ifdef COMPILER_SUPPORTS_LDST_MULTIPLE
  return vld1q_u8_x4(src);
#else
  // Fall back to four single-vector loads on toolchains without the
  // multi-vector intrinsics.
  uint8x16x4_t ret;
  ret.val[0] = vld1q_u8(src);
  ret.val[1] = vld1q_u8(src + 16);
  ret.val[2] = vld1q_u8(src + 32);
  ret.val[3] = vld1q_u8(src + 48);
  return ret;
#endif
}
50
// Load 16 contiguous uint16 values starting at `src` as two uint16x8 vectors.
static FORCE_INLINE uint16x8x2_t load_u16x8_x2(const uint16_t *src) {
#ifdef COMPILER_SUPPORTS_LDST_MULTIPLE
  return vld1q_u16_x2(src);
#else
  // Fall back to two single-vector loads on toolchains without the
  // multi-vector intrinsics.
  uint16x8x2_t ret;
  ret.val[0] = vld1q_u16(src);
  ret.val[1] = vld1q_u16(src + 8);
  return ret;
#endif
}
59
// Load 32 contiguous uint16 values starting at `src` as four uint16x8 vectors.
static FORCE_INLINE uint16x8x4_t load_u16x8_x4(const uint16_t *src) {
#ifdef COMPILER_SUPPORTS_LDST_MULTIPLE
  return vld1q_u16_x4(src);
#else
  // Fall back to four single-vector loads on toolchains without the
  // multi-vector intrinsics.
  uint16x8x4_t ret;
  ret.val[0] = vld1q_u16(src);
  ret.val[1] = vld1q_u16(src + 8);
  ret.val[2] = vld1q_u16(src + 16);
  ret.val[3] = vld1q_u16(src + 24);
  return ret;
#endif
}
69
// Load 8 contiguous uint32 values starting at `src` as two uint32x4 vectors.
static FORCE_INLINE uint32x4x2_t load_u32x4_x2(const uint32_t *src) {
#ifdef COMPILER_SUPPORTS_LDST_MULTIPLE
  return vld1q_u32_x2(src);
#else
  // Fall back to two single-vector loads on toolchains without the
  // multi-vector intrinsics.
  uint32x4x2_t ret;
  ret.val[0] = vld1q_u32(src);
  ret.val[1] = vld1q_u32(src + 4);
  return ret;
#endif
}
78
// Load 16 contiguous uint32 values starting at `src` as four uint32x4 vectors.
static FORCE_INLINE uint32x4x4_t load_u32x4_x4(const uint32_t *src) {
#ifdef COMPILER_SUPPORTS_LDST_MULTIPLE
  return vld1q_u32_x4(src);
#else
  // Fall back to four single-vector loads on toolchains without the
  // multi-vector intrinsics.
  uint32x4x4_t ret;
  ret.val[0] = vld1q_u32(src);
  ret.val[1] = vld1q_u32(src + 4);
  ret.val[2] = vld1q_u32(src + 8);
  ret.val[3] = vld1q_u32(src + 12);
  return ret;
#endif
}
88
// Store the two uint8x16 vectors in `v` to 32 contiguous bytes at `dst`.
static FORCE_INLINE void store_u8x16_x2(uint8_t *dst, uint8x16x2_t v) {
#ifdef COMPILER_SUPPORTS_LDST_MULTIPLE
  vst1q_u8_x2(dst, v);
#else
  // Fall back to single-vector stores on toolchains without the
  // multi-vector intrinsics; the compiler fully unrolls this loop.
  for (int i = 0; i < 2; ++i) {
    vst1q_u8(dst + i * 16, v.val[i]);
  }
#endif
}
97
// Store the four uint8x16 vectors in `v` to 64 contiguous bytes at `dst`.
static FORCE_INLINE void store_u8x16_x4(uint8_t *dst, uint8x16x4_t v) {
#ifdef COMPILER_SUPPORTS_LDST_MULTIPLE
  vst1q_u8_x4(dst, v);
#else
  // Fall back to single-vector stores on toolchains without the
  // multi-vector intrinsics; the compiler fully unrolls this loop.
  for (int i = 0; i < 4; ++i) {
    vst1q_u8(dst + i * 16, v.val[i]);
  }
#endif
}
108
// Store the two uint16x8 vectors in `v` to 16 contiguous uint16 slots at `dst`.
static FORCE_INLINE void store_u16x8_x2(uint16_t *dst, uint16x8x2_t v) {
#ifdef COMPILER_SUPPORTS_LDST_MULTIPLE
  vst1q_u16_x2(dst, v);
#else
  // Fall back to single-vector stores on toolchains without the
  // multi-vector intrinsics; the compiler fully unrolls this loop.
  for (int i = 0; i < 2; ++i) {
    vst1q_u16(dst + i * 8, v.val[i]);
  }
#endif
}
117
// Store the four uint16x8 vectors in `v` to 32 contiguous uint16 slots at `dst`.
static FORCE_INLINE void store_u16x8_x4(uint16_t *dst, uint16x8x4_t v) {
#ifdef COMPILER_SUPPORTS_LDST_MULTIPLE
  vst1q_u16_x4(dst, v);
#else
  // Fall back to single-vector stores on toolchains without the
  // multi-vector intrinsics; the compiler fully unrolls this loop.
  for (int i = 0; i < 4; ++i) {
    vst1q_u16(dst + i * 8, v.val[i]);
  }
#endif
}
128
// Store the two uint32x4 vectors in `v` to 8 contiguous uint32 slots at `dst`.
static FORCE_INLINE void store_u32x4_x2(uint32_t *dst, uint32x4x2_t v) {
#ifdef COMPILER_SUPPORTS_LDST_MULTIPLE
  vst1q_u32_x2(dst, v);
#else
  // Fall back to single-vector stores on toolchains without the
  // multi-vector intrinsics; the compiler fully unrolls this loop.
  for (int i = 0; i < 2; ++i) {
    vst1q_u32(dst + i * 4, v.val[i]);
  }
#endif
}
137
// Store the four uint32x4 vectors in `v` to 16 contiguous uint32 slots at `dst`.
static FORCE_INLINE void store_u32x4_x4(uint32_t *dst, uint32x4x4_t v) {
#ifdef COMPILER_SUPPORTS_LDST_MULTIPLE
  vst1q_u32_x4(dst, v);
#else
  // Fall back to single-vector stores on toolchains without the
  // multi-vector intrinsics; the compiler fully unrolls this loop.
  for (int i = 0; i < 4; ++i) {
    vst1q_u32(dst + i * 4, v.val[i]);
  }
#endif
}
148
149 } // namespace ultrahdr
150
151 #endif // ULTRAHDR_DSP_ARM_MEM_NEON_H
152