// Copyright 2022 The Abseil Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef ABSL_CRC_INTERNAL_NON_TEMPORAL_MEMCPY_H_
#define ABSL_CRC_INTERNAL_NON_TEMPORAL_MEMCPY_H_

#ifdef _MSC_VER
#include <intrin.h>
#endif

#ifdef __SSE__
#include <xmmintrin.h>
#endif

#ifdef __SSE2__
#include <emmintrin.h>
#endif

#ifdef __SSE3__
#include <pmmintrin.h>
#endif

#ifdef __AVX__
#include <immintrin.h>
#endif

#ifdef __aarch64__
#include "absl/crc/internal/non_temporal_arm_intrinsics.h"
#endif

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstring>

#include "absl/base/config.h"
#include "absl/base/optimization.h"

namespace absl {
ABSL_NAMESPACE_BEGIN
namespace crc_internal {

// This non-temporal memcpy does regular loads and non-temporal stores. It is
// compatible with both 16-byte aligned and unaligned addresses. If the data
// at the destination is not accessed immediately, using a non-temporal memcpy
// can save one DRAM load of the destination cacheline.
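//
// Usage sketch (illustrative; `payload` and `scratch` are hypothetical
// caller-owned buffers, not part of this header):
//
//   std::vector<uint8_t> scratch(payload.size());
//   crc_internal::non_temporal_store_memcpy(scratch.data(), payload.data(),
//                                           payload.size());
//
// The streamed cachelines bypass the cache, so the copy does not evict hot
// data; the tradeoff is that an immediate read of `scratch` goes to DRAM.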
constexpr size_t kCacheLineSize = ABSL_CACHELINE_SIZE;

// If the objects overlap, the behavior is undefined.
inline void *non_temporal_store_memcpy(void *__restrict dst,
                                       const void *__restrict src,
                                       size_t len) {
#if defined(__SSE3__) || defined(__aarch64__) || \
    (defined(_MSC_VER) && defined(__AVX__))
  // This implementation requires SSE3.
  // MSVC cannot target SSE3 directly, but when MSVC targets AVX,
  // SSE3 support is implied.
  uint8_t *d = reinterpret_cast<uint8_t *>(dst);
  const uint8_t *s = reinterpret_cast<const uint8_t *>(src);

  // memcpy() the misaligned header. At the end of this if block, <d> is
  // aligned to a 64-byte cacheline boundary or <len> == 0.
  if (reinterpret_cast<uintptr_t>(d) & (kCacheLineSize - 1)) {
    uintptr_t bytes_before_alignment_boundary =
        kCacheLineSize -
        (reinterpret_cast<uintptr_t>(d) & (kCacheLineSize - 1));
    size_t header_len = (std::min)(bytes_before_alignment_boundary, len);
    assert(bytes_before_alignment_boundary < kCacheLineSize);
    memcpy(d, s, header_len);
    d += header_len;
    s += header_len;
    len -= header_len;
  }

  if (len >= kCacheLineSize) {
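    // The streaming stores below are weakly ordered; fence before and after
    // the loop so the copy stays ordered with respect to surrounding stores.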
    _mm_sfence();
    __m128i *dst_cacheline = reinterpret_cast<__m128i *>(d);
    const __m128i *src_cacheline = reinterpret_cast<const __m128i *>(s);
    constexpr int kOpsPerCacheLine = kCacheLineSize / sizeof(__m128i);
    size_t loops = len / kCacheLineSize;

    while (len >= kCacheLineSize) {
      __m128i temp1, temp2, temp3, temp4;
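      // lddqu tolerates an unaligned source; the streaming (non-temporal)
      // stores rely on the alignment of <d> established by the header copy.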
      temp1 = _mm_lddqu_si128(src_cacheline + 0);
      temp2 = _mm_lddqu_si128(src_cacheline + 1);
      temp3 = _mm_lddqu_si128(src_cacheline + 2);
      temp4 = _mm_lddqu_si128(src_cacheline + 3);
      _mm_stream_si128(dst_cacheline + 0, temp1);
      _mm_stream_si128(dst_cacheline + 1, temp2);
      _mm_stream_si128(dst_cacheline + 2, temp3);
      _mm_stream_si128(dst_cacheline + 3, temp4);
      src_cacheline += kOpsPerCacheLine;
      dst_cacheline += kOpsPerCacheLine;
      len -= kCacheLineSize;
    }
    d += loops * kCacheLineSize;
    s += loops * kCacheLineSize;
    _mm_sfence();
  }

  // memcpy the tail.
  if (len) {
    memcpy(d, s, len);
  }
  return dst;
#else
  // Fallback to regular memcpy.
  return memcpy(dst, src, len);
#endif  // __SSE3__ || __aarch64__ || (_MSC_VER && __AVX__)
}
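
// Byte accounting for the function above (illustrative example): copying 200
// bytes to a destination at offset 24 within its cacheline copies a 40-byte
// header with memcpy (reaching the 64-byte boundary), two full cachelines
// (128 bytes) with streaming stores, and the remaining 32-byte tail with
// memcpy.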

inline void *non_temporal_store_memcpy_avx(void *__restrict dst,
                                           const void *__restrict src,
                                           size_t len) {
#ifdef __AVX__
  uint8_t *d = reinterpret_cast<uint8_t *>(dst);
  const uint8_t *s = reinterpret_cast<const uint8_t *>(src);

  // memcpy() the misaligned header. At the end of this if block, <d> is
  // aligned to a 64-byte cacheline boundary or <len> == 0.
  if (reinterpret_cast<uintptr_t>(d) & (kCacheLineSize - 1)) {
    uintptr_t bytes_before_alignment_boundary =
        kCacheLineSize -
        (reinterpret_cast<uintptr_t>(d) & (kCacheLineSize - 1));
    size_t header_len = (std::min)(bytes_before_alignment_boundary, len);
    assert(bytes_before_alignment_boundary < kCacheLineSize);
    memcpy(d, s, header_len);
    d += header_len;
    s += header_len;
    len -= header_len;
  }

  if (len >= kCacheLineSize) {
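    // As above, fence around the weakly ordered streaming stores.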
    _mm_sfence();
    __m256i *dst_cacheline = reinterpret_cast<__m256i *>(d);
    const __m256i *src_cacheline = reinterpret_cast<const __m256i *>(s);
    constexpr int kOpsPerCacheLine = kCacheLineSize / sizeof(__m256i);
    size_t loops = len / kCacheLineSize;

    while (len >= kCacheLineSize) {
      __m256i temp1, temp2;
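      // Two 32-byte unaligned loads and two 32-byte streaming stores move one
      // 64-byte cacheline per iteration; the stores need 32-byte alignment,
      // which the 64-byte-aligned <d> guarantees.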
      temp1 = _mm256_lddqu_si256(src_cacheline + 0);
      temp2 = _mm256_lddqu_si256(src_cacheline + 1);
      _mm256_stream_si256(dst_cacheline + 0, temp1);
      _mm256_stream_si256(dst_cacheline + 1, temp2);
      src_cacheline += kOpsPerCacheLine;
      dst_cacheline += kOpsPerCacheLine;
      len -= kCacheLineSize;
    }
    d += loops * kCacheLineSize;
    s += loops * kCacheLineSize;
    _mm_sfence();
  }

  // memcpy the tail.
  if (len) {
    memcpy(d, s, len);
  }
  return dst;
#else
  // Fallback to regular memcpy when AVX is not available.
  return memcpy(dst, src, len);
#endif  // __AVX__
}
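
// Selection sketch (illustrative; no such dispatcher exists in this header).
// Since non_temporal_store_memcpy_avx falls back to plain memcpy when __AVX__
// is absent, a caller can pick the widest available copy at compile time:
//
//   #ifdef __AVX__
//     crc_internal::non_temporal_store_memcpy_avx(dst, src, len);
//   #else
//     crc_internal::non_temporal_store_memcpy(dst, src, len);
//   #endif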

}  // namespace crc_internal
ABSL_NAMESPACE_END
}  // namespace absl

#endif  // ABSL_CRC_INTERNAL_NON_TEMPORAL_MEMCPY_H_