/*
 * Copyright 2024 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef ULTRAHDR_DSP_ARM_MEM_NEON_H
#define ULTRAHDR_DSP_ARM_MEM_NEON_H

#include <arm_neon.h>

#include "ultrahdr/ultrahdrcommon.h"

namespace ultrahdr {

// The multi-vector load/store intrinsics are widely supported on AArch64, but
// on 32-bit platforms they are only available from GCC 14.1 onwards (and not
// at all with Clang). The helpers below use them when available and otherwise
// fall back to an equivalent sequence of single-vector loads/stores.
#if __aarch64__ || (!__clang__ && __GNUC__ >= 14)
#define COMPILER_SUPPORTS_LDST_MULTIPLE 1
#endif

static FORCE_INLINE uint8x16x2_t load_u8x16_x2(const uint8_t *src) {
#ifdef COMPILER_SUPPORTS_LDST_MULTIPLE
  return vld1q_u8_x2(src);
#else
  uint8x16x2_t res = {{vld1q_u8(src + 0), vld1q_u8(src + 16)}};
  return res;
#endif
}

static FORCE_INLINE uint8x16x4_t load_u8x16_x4(const uint8_t *src) {
#ifdef COMPILER_SUPPORTS_LDST_MULTIPLE
  return vld1q_u8_x4(src);
#else
  uint8x16x4_t res = {
      {vld1q_u8(src + 0), vld1q_u8(src + 16), vld1q_u8(src + 32), vld1q_u8(src + 48)}};
  return res;
#endif
}

static FORCE_INLINE uint16x8x2_t load_u16x8_x2(const uint16_t *src) {
#ifdef COMPILER_SUPPORTS_LDST_MULTIPLE
  return vld1q_u16_x2(src);
#else
  uint16x8x2_t res = {{vld1q_u16(src + 0), vld1q_u16(src + 8)}};
  return res;
#endif
}

static FORCE_INLINE uint16x8x4_t load_u16x8_x4(const uint16_t *src) {
#ifdef COMPILER_SUPPORTS_LDST_MULTIPLE
  return vld1q_u16_x4(src);
#else
  uint16x8x4_t res = {
      {vld1q_u16(src + 0), vld1q_u16(src + 8), vld1q_u16(src + 16), vld1q_u16(src + 24)}};
  return res;
#endif
}

static FORCE_INLINE uint32x4x2_t load_u32x4_x2(const uint32_t *src) {
#ifdef COMPILER_SUPPORTS_LDST_MULTIPLE
  return vld1q_u32_x2(src);
#else
  uint32x4x2_t res = {{vld1q_u32(src + 0), vld1q_u32(src + 4)}};
  return res;
#endif
}

static FORCE_INLINE uint32x4x4_t load_u32x4_x4(const uint32_t *src) {
#ifdef COMPILER_SUPPORTS_LDST_MULTIPLE
  return vld1q_u32_x4(src);
#else
  uint32x4x4_t res = {
      {vld1q_u32(src + 0), vld1q_u32(src + 4), vld1q_u32(src + 8), vld1q_u32(src + 12)}};
  return res;
#endif
}

static FORCE_INLINE void store_u8x16_x2(uint8_t *dst, uint8x16x2_t a) {
#ifdef COMPILER_SUPPORTS_LDST_MULTIPLE
  vst1q_u8_x2(dst, a);
#else
  vst1q_u8(dst + 0, a.val[0]);
  vst1q_u8(dst + 16, a.val[1]);
#endif
}

static FORCE_INLINE void store_u8x16_x4(uint8_t *dst, uint8x16x4_t a) {
#ifdef COMPILER_SUPPORTS_LDST_MULTIPLE
  vst1q_u8_x4(dst, a);
#else
  vst1q_u8(dst + 0, a.val[0]);
  vst1q_u8(dst + 16, a.val[1]);
  vst1q_u8(dst + 32, a.val[2]);
  vst1q_u8(dst + 48, a.val[3]);
#endif
}

static FORCE_INLINE void store_u16x8_x2(uint16_t *dst, uint16x8x2_t a) {
#ifdef COMPILER_SUPPORTS_LDST_MULTIPLE
  vst1q_u16_x2(dst, a);
#else
  vst1q_u16(dst + 0, a.val[0]);
  vst1q_u16(dst + 8, a.val[1]);
#endif
}

static FORCE_INLINE void store_u16x8_x4(uint16_t *dst, uint16x8x4_t a) {
#ifdef COMPILER_SUPPORTS_LDST_MULTIPLE
  vst1q_u16_x4(dst, a);
#else
  vst1q_u16(dst + 0, a.val[0]);
  vst1q_u16(dst + 8, a.val[1]);
  vst1q_u16(dst + 16, a.val[2]);
  vst1q_u16(dst + 24, a.val[3]);
#endif
}

static FORCE_INLINE void store_u32x4_x2(uint32_t *dst, uint32x4x2_t a) {
#ifdef COMPILER_SUPPORTS_LDST_MULTIPLE
  vst1q_u32_x2(dst, a);
#else
  vst1q_u32(dst + 0, a.val[0]);
  vst1q_u32(dst + 4, a.val[1]);
#endif
}

static FORCE_INLINE void store_u32x4_x4(uint32_t *dst, uint32x4x4_t a) {
#ifdef COMPILER_SUPPORTS_LDST_MULTIPLE
  vst1q_u32_x4(dst, a);
#else
  vst1q_u32(dst + 0, a.val[0]);
  vst1q_u32(dst + 4, a.val[1]);
  vst1q_u32(dst + 8, a.val[2]);
  vst1q_u32(dst + 12, a.val[3]);
#endif
}
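
// Illustrative usage sketch (hypothetical helper, not part of the upstream
// API): a bulk byte copy built on the x4 helpers above, moving 64 bytes per
// iteration with one multi-vector load and one multi-vector store. Assumes
// non-overlapping buffers whose length is a multiple of 64 bytes; a real
// caller would also handle any unaligned tail with scalar or smaller copies.
static FORCE_INLINE void copy_u8_neon_x4(uint8_t *dst, const uint8_t *src,
                                         const uint8_t *src_end) {
  for (; src < src_end; src += 64, dst += 64) {
    store_u8x16_x4(dst, load_u8x16_x4(src));
  }
}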

}  // namespace ultrahdr

#endif  // ULTRAHDR_DSP_ARM_MEM_NEON_H