//===-- aarch64 implementation of memory function building blocks ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file provides aarch64 specific building blocks to compose memory
// functions.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H

#include "src/__support/macros/config.h"
#include "src/__support/macros/properties/architectures.h"

#if defined(LIBC_TARGET_ARCH_IS_AARCH64)

#include "src/__support/CPP/type_traits.h" // cpp::always_false
#include "src/__support/common.h"
#include "src/string/memory_utils/op_generic.h"

#ifdef __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

namespace LIBC_NAMESPACE_DECL {
namespace aarch64 {

LIBC_INLINE_VAR constexpr bool kNeon = LLVM_LIBC_IS_DEFINED(__ARM_NEON);

namespace neon {

struct BzeroCacheLine {
  static constexpr size_t SIZE = 64;

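  // 'dc zva' (Data Cache Zero by Virtual Address) zeroes the naturally
  // aligned 64-byte block containing the given address, so callers are
  // expected to pass a cache-line aligned 'dst'. The value argument is
  // ignored: this block only ever writes zeros, hence the bzero-only use.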
  LIBC_INLINE static void block(Ptr dst, uint8_t) {
#if __SIZEOF_POINTER__ == 4
    asm("dc zva, %w[dst]" : : [dst] "r"(dst) : "memory");
#else
    asm("dc zva, %[dst]" : : [dst] "r"(dst) : "memory");
#endif
  }

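  // Issues 'dc zva' for every full cache line, then finishes the last
  // (possibly unaligned, overlapping) bytes with regular vector stores.
  // Assumes count >= SIZE; the subtraction below would wrap otherwise.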
  LIBC_INLINE static void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
    size_t offset = 0;
    do {
      block(dst + offset, value);
      offset += SIZE;
    } while (offset < count - SIZE);
    // Unaligned store, we can't use 'dc zva' here.
    generic::Memset<generic_v512>::tail(dst, value, count);
  }
};

LIBC_INLINE bool hasZva() {
  uint64_t zva_val;
  asm("mrs %[zva_val], dczid_el0" : [zva_val] "=r"(zva_val));
  // DC ZVA is permitted if DZP, bit [4], is zero.
  // BS, bits [3:0], is log2 of the block size in words.
  // So the next line checks whether the instruction is permitted and the
  // block size is 16 words (i.e. 64 bytes).
  return (zva_val & 0b11111) == 0b00100;
}
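
// Illustrative sketch (not part of the upstream header): decoding DCZID_EL0
// in full rather than testing only for the 64-byte case. BS (bits [3:0]) is
// log2 of the zeroing block size in 4-byte words, so the size in bytes is
// 4 << BS; a set DZP bit (bit [4]) means DC ZVA is prohibited. The helper
// name is hypothetical.
LIBC_INLINE size_t zvaBlockSizeOrZero() {
  uint64_t dczid;
  asm("mrs %[dczid], dczid_el0" : [dczid] "=r"(dczid));
  if (dczid & (1 << 4)) // DZP set: DC ZVA is prohibited.
    return 0;
  return size_t(4) << (dczid & 0xF); // Words (4 bytes each) -> bytes.
}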

} // namespace neon

///////////////////////////////////////////////////////////////////////////////
// Bcmp
template <size_t Size> struct Bcmp {
  static constexpr size_t SIZE = Size;
  static constexpr size_t BlockSize = 32;

  LIBC_INLINE static const unsigned char *as_u8(CPtr ptr) {
    return reinterpret_cast<const unsigned char *>(ptr);
  }

  LIBC_INLINE static BcmpReturnType block(CPtr p1, CPtr p2) {
    if constexpr (Size == 16) {
      auto _p1 = as_u8(p1);
      auto _p2 = as_u8(p2);
      uint8x16_t a = vld1q_u8(_p1);
      uint8x16_t n = vld1q_u8(_p2);
      uint8x16_t an = veorq_u8(a, n);
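      // Reduce the 128-bit XOR to 32 bits: saturating-narrow the two 64-bit
      // halves to 2x32 bits, then take the max lane. The result is nonzero
      // iff the two blocks differ.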
      uint32x2_t an_reduced = vqmovn_u64(vreinterpretq_u64_u8(an));
      return vmaxv_u32(an_reduced);
    } else if constexpr (Size == 32) {
      auto _p1 = as_u8(p1);
      auto _p2 = as_u8(p2);
      uint8x16_t a = vld1q_u8(_p1);
      uint8x16_t b = vld1q_u8(_p1 + 16);
      uint8x16_t n = vld1q_u8(_p2);
      uint8x16_t o = vld1q_u8(_p2 + 16);
      uint8x16_t an = veorq_u8(a, n);
      uint8x16_t bo = veorq_u8(b, o);
      // anbo = (a ^ n) | (b ^ o).  At least one byte is nonzero if there is
      // a difference between the two buffers.  We reduce this value down to 4
      // bytes in two steps. First, saturating-narrow the two 64-bit halves
      // down to 2x32 bits. Second, take the max of the two 32-bit lanes to
      // get a single nonzero 32-bit value if a mismatch occurred.
      uint8x16_t anbo = vorrq_u8(an, bo);
      uint32x2_t anbo_reduced = vqmovn_u64(vreinterpretq_u64_u8(anbo));
      return vmaxv_u32(anbo_reduced);
    } else if constexpr ((Size % BlockSize) == 0) {
      for (size_t offset = 0; offset < Size; offset += BlockSize)
        if (auto value = Bcmp<BlockSize>::block(p1 + offset, p2 + offset))
          return value;
    } else {
      static_assert(cpp::always_false<decltype(Size)>, "SIZE not implemented");
    }
    return BcmpReturnType::zero();
  }

  LIBC_INLINE static BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
    return block(p1 + count - SIZE, p2 + count - SIZE);
  }

  LIBC_INLINE static BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) {
    if constexpr (Size == 16) {
      auto _p1 = as_u8(p1);
      auto _p2 = as_u8(p2);
      uint8x16_t a = vld1q_u8(_p1);
      uint8x16_t b = vld1q_u8(_p1 + count - 16);
      uint8x16_t n = vld1q_u8(_p2);
      uint8x16_t o = vld1q_u8(_p2 + count - 16);
      uint8x16_t an = veorq_u8(a, n);
      uint8x16_t bo = veorq_u8(b, o);
      // anbo = (a ^ n) | (b ^ o)
      uint8x16_t anbo = vorrq_u8(an, bo);
      uint32x2_t anbo_reduced = vqmovn_u64(vreinterpretq_u64_u8(anbo));
      return vmaxv_u32(anbo_reduced);
    } else if constexpr (Size == 32) {
      auto _p1 = as_u8(p1);
      auto _p2 = as_u8(p2);
      uint8x16_t a = vld1q_u8(_p1);
      uint8x16_t b = vld1q_u8(_p1 + 16);
      uint8x16_t c = vld1q_u8(_p1 + count - 16);
      uint8x16_t d = vld1q_u8(_p1 + count - 32);
      uint8x16_t n = vld1q_u8(_p2);
      uint8x16_t o = vld1q_u8(_p2 + 16);
      uint8x16_t p = vld1q_u8(_p2 + count - 16);
      uint8x16_t q = vld1q_u8(_p2 + count - 32);
      uint8x16_t an = veorq_u8(a, n);
      uint8x16_t bo = veorq_u8(b, o);
      uint8x16_t cp = veorq_u8(c, p);
      uint8x16_t dq = veorq_u8(d, q);
      uint8x16_t anbo = vorrq_u8(an, bo);
      uint8x16_t cpdq = vorrq_u8(cp, dq);
      // abnocpdq = ((a ^ n) | (b ^ o)) | ((c ^ p) | (d ^ q)).  Reduce this to
      // a nonzero 32 bit value if a mismatch occurred.
      uint64x2_t abnocpdq = vreinterpretq_u64_u8(anbo | cpdq);
      uint32x2_t abnocpdq_reduced = vqmovn_u64(abnocpdq);
      return vmaxv_u32(abnocpdq_reduced);
    } else {
      static_assert(cpp::always_false<decltype(Size)>, "SIZE not implemented");
    }
    return BcmpReturnType::zero();
  }

  LIBC_INLINE static BcmpReturnType loop_and_tail(CPtr p1, CPtr p2,
                                                  size_t count) {
    static_assert(Size > 1, "a loop of size 1 does not need tail");
    size_t offset = 0;
    do {
      if (auto value = block(p1 + offset, p2 + offset))
        return value;
      offset += SIZE;
    } while (offset < count - SIZE);
    return tail(p1, p2, count);
  }
};
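
// Illustrative sketch (not part of the upstream header): how the blocks above
// compose into a bcmp. The real dispatch lives in the bcmp implementation
// upstream and also covers small sizes; this hypothetical helper assumes
// count >= 32.
LIBC_INLINE BcmpReturnType example_bcmp_ge32(CPtr p1, CPtr p2, size_t count) {
  if (count <= 64) // Two overlapping 32-byte probes cover [32, 64].
    return Bcmp<32>::head_tail(p1, p2, count);
  // Full 32-byte blocks, then an overlapping tail for the remainder.
  return Bcmp<32>::loop_and_tail(p1, p2, count);
}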

} // namespace aarch64
} // namespace LIBC_NAMESPACE_DECL

namespace LIBC_NAMESPACE_DECL {
namespace generic {

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint16_t
template <> struct cmp_is_expensive<uint16_t> : public cpp::false_type {};
template <> LIBC_INLINE bool eq<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint16_t>(p1, offset) == load<uint16_t>(p2, offset);
}
template <>
LIBC_INLINE uint32_t neq<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint16_t>(p1, offset) ^ load<uint16_t>(p2, offset);
}
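// For 16-bit values the difference of the two big-endian loads fits in
// int32_t, so a single subtraction yields a correctly signed memcmp result
// (e.g. "ab" = 0x6162 vs "ac" = 0x6163 gives -1, matching memcmp ordering).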
template <>
LIBC_INLINE MemcmpReturnType cmp<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
  return static_cast<int32_t>(load_be<uint16_t>(p1, offset)) -
         static_cast<int32_t>(load_be<uint16_t>(p2, offset));
}

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint32_t
template <> struct cmp_is_expensive<uint32_t> : cpp::false_type {};
template <>
LIBC_INLINE uint32_t neq<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint32_t>(p1, offset) ^ load<uint32_t>(p2, offset);
}
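// Unlike the 16-bit case, a 32-bit difference can overflow int32_t, so cmp
// must compare the big-endian values instead of subtracting them.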
template <>
LIBC_INLINE MemcmpReturnType cmp<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load_be<uint32_t>(p1, offset);
  const auto b = load_be<uint32_t>(p2, offset);
  return a > b ? 1 : a < b ? -1 : 0;
}

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint64_t
template <> struct cmp_is_expensive<uint64_t> : cpp::false_type {};
template <>
LIBC_INLINE uint32_t neq<uint64_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint64_t>(p1, offset) != load<uint64_t>(p2, offset);
}
template <>
LIBC_INLINE MemcmpReturnType cmp<uint64_t>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load_be<uint64_t>(p1, offset);
  const auto b = load_be<uint64_t>(p2, offset);
  if (a != b)
    return a > b ? 1 : -1;
  return MemcmpReturnType::zero();
}

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint8x16_t
template <> struct is_vector<uint8x16_t> : cpp::true_type {};
template <> struct cmp_is_expensive<uint8x16_t> : cpp::false_type {};
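// Note: despite the uint8x16_t key, the comparisons below use two scalar
// 64-bit loads; the vector type only selects the 16-byte block size.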
template <>
LIBC_INLINE uint32_t neq<uint8x16_t>(CPtr p1, CPtr p2, size_t offset) {
  for (size_t i = 0; i < 2; ++i) {
    auto a = load<uint64_t>(p1, offset);
    auto b = load<uint64_t>(p2, offset);
    uint32_t cond = a != b;
    if (cond)
      return cond;
    offset += sizeof(uint64_t);
  }
  return 0;
}
template <>
LIBC_INLINE MemcmpReturnType cmp<uint8x16_t>(CPtr p1, CPtr p2, size_t offset) {
  for (size_t i = 0; i < 2; ++i) {
    auto a = load_be<uint64_t>(p1, offset);
    auto b = load_be<uint64_t>(p2, offset);
    if (a != b)
      return cmp_neq_uint64_t(a, b);
    offset += sizeof(uint64_t);
  }
  return MemcmpReturnType::zero();
}

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint8x16x2_t
template <> struct is_vector<uint8x16x2_t> : cpp::true_type {};
template <> struct cmp_is_expensive<uint8x16x2_t> : cpp::false_type {};
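// Same scheme as uint8x16_t, extended to a 32-byte block: four big-endian
// 64-bit loads compared in order, bailing out at the first difference.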
template <>
LIBC_INLINE MemcmpReturnType cmp<uint8x16x2_t>(CPtr p1, CPtr p2,
                                               size_t offset) {
  for (size_t i = 0; i < 4; ++i) {
    auto a = load_be<uint64_t>(p1, offset);
    auto b = load_be<uint64_t>(p2, offset);
    if (a != b)
      return cmp_neq_uint64_t(a, b);
    offset += sizeof(uint64_t);
  }
  return MemcmpReturnType::zero();
}
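// Illustrative note (not part of the upstream header): these specializations
// plug into the generic comparison skeletons in op_generic.h, so a 16-byte
// memcmp step can be written as, e.g., generic::Memcmp<uint8x16_t>::block(p1,
// p2), with eq/neq used on the fast path and cmp for the ordered result.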
} // namespace generic
} // namespace LIBC_NAMESPACE_DECL

#endif // LIBC_TARGET_ARCH_IS_AARCH64

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H