// Copyright 2022 The Abseil Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef ABSL_CRC_INTERNAL_NON_TEMPORAL_MEMCPY_H_
#define ABSL_CRC_INTERNAL_NON_TEMPORAL_MEMCPY_H_

#ifdef _MSC_VER
#include <intrin.h>
#endif

#ifdef __SSE__
#include <xmmintrin.h>
#endif

#ifdef __SSE2__
#include <emmintrin.h>
#endif

#ifdef __SSE3__
#include <pmmintrin.h>
#endif

#ifdef __AVX__
#include <immintrin.h>
#endif

#ifdef __aarch64__
#include "absl/crc/internal/non_temporal_arm_intrinsics.h"
#endif

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstring>

#include "absl/base/config.h"
#include "absl/base/optimization.h"

namespace absl {
ABSL_NAMESPACE_BEGIN
namespace crc_internal {

// This non-temporal memcpy uses regular loads and non-temporal stores to copy
// memory. It works with both 16-byte aligned and unaligned addresses. If the
// data at the destination is not accessed immediately, a non-temporal memcpy
// can save one DRAM load of the destination cacheline.
constexpr size_t kCacheLineSize = ABSL_CACHELINE_SIZE;

// If the objects overlap, the behavior is undefined.
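//
// A minimal usage sketch (the buffers and sizes below are illustrative only,
// not part of the API):
//
//   std::vector<uint8_t> src(1 << 20), dst(1 << 20);
//   // Prefer the non-temporal copy when `dst` will not be read again soon,
//   // so its cachelines are not pulled into the cache just to be overwritten.
//   absl::crc_internal::non_temporal_store_memcpy(dst.data(), src.data(),
//                                                 src.size());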
inline void *non_temporal_store_memcpy(void *__restrict dst,
                                        const void *__restrict src,
                                        size_t len) {
#if defined(__SSE3__) || defined(__aarch64__) || \
    (defined(_MSC_VER) && defined(__AVX__))
  // This implementation requires SSE3.
  // MSVC cannot target SSE3 directly, but when MSVC targets AVX,
  // SSE3 support is implied.
  uint8_t *d = reinterpret_cast<uint8_t *>(dst);
  const uint8_t *s = reinterpret_cast<const uint8_t *>(src);

  // memcpy() the misaligned header. At the end of this if block, <d> is
  // aligned to a 64-byte cacheline boundary or <len> == 0.
  if (reinterpret_cast<uintptr_t>(d) & (kCacheLineSize - 1)) {
    uintptr_t bytes_before_alignment_boundary =
        kCacheLineSize -
        (reinterpret_cast<uintptr_t>(d) & (kCacheLineSize - 1));
    size_t header_len = (std::min)(bytes_before_alignment_boundary, len);
    assert(bytes_before_alignment_boundary < kCacheLineSize);
    memcpy(d, s, header_len);
    d += header_len;
    s += header_len;
    len -= header_len;
  }

  if (len >= kCacheLineSize) {
    _mm_sfence();
    __m128i *dst_cacheline = reinterpret_cast<__m128i *>(d);
    const __m128i *src_cacheline = reinterpret_cast<const __m128i *>(s);
    constexpr int kOpsPerCacheLine = kCacheLineSize / sizeof(__m128i);
    size_t loops = len / kCacheLineSize;

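    // Copy one cache line per iteration: four 16-byte unaligned loads from
    // the source, followed by four 16-byte non-temporal stores to the (now
    // cacheline-aligned) destination that bypass the cache.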
    while (len >= kCacheLineSize) {
      __m128i temp1, temp2, temp3, temp4;
      temp1 = _mm_lddqu_si128(src_cacheline + 0);
      temp2 = _mm_lddqu_si128(src_cacheline + 1);
      temp3 = _mm_lddqu_si128(src_cacheline + 2);
      temp4 = _mm_lddqu_si128(src_cacheline + 3);
      _mm_stream_si128(dst_cacheline + 0, temp1);
      _mm_stream_si128(dst_cacheline + 1, temp2);
      _mm_stream_si128(dst_cacheline + 2, temp3);
      _mm_stream_si128(dst_cacheline + 3, temp4);
      src_cacheline += kOpsPerCacheLine;
      dst_cacheline += kOpsPerCacheLine;
      len -= kCacheLineSize;
    }
    d += loops * kCacheLineSize;
    s += loops * kCacheLineSize;
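    // Make the weakly-ordered non-temporal stores above globally visible
    // before any stores that follow.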
    _mm_sfence();
  }

  // memcpy the tail.
  if (len) {
    memcpy(d, s, len);
  }
  return dst;
#else
  // Fallback to regular memcpy.
  return memcpy(dst, src, len);
#endif  // __SSE3__ || __aarch64__ || (_MSC_VER && __AVX__)
}

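// An AVX variant of the routine above: the same header/tail handling, but the
// cacheline-aligned middle is streamed with 256-bit non-temporal stores
// instead of 128-bit ones. If the objects overlap, the behavior is undefined.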
inline void *non_temporal_store_memcpy_avx(void *__restrict dst,
                                           const void *__restrict src,
                                           size_t len) {
#ifdef __AVX__
  uint8_t *d = reinterpret_cast<uint8_t *>(dst);
  const uint8_t *s = reinterpret_cast<const uint8_t *>(src);

  // memcpy() the misaligned header. At the end of this if block, <d> is
  // aligned to a 64-byte cacheline boundary or <len> == 0.
  if (reinterpret_cast<uintptr_t>(d) & (kCacheLineSize - 1)) {
    uintptr_t bytes_before_alignment_boundary =
        kCacheLineSize -
        (reinterpret_cast<uintptr_t>(d) & (kCacheLineSize - 1));
    size_t header_len = (std::min)(bytes_before_alignment_boundary, len);
    assert(bytes_before_alignment_boundary < kCacheLineSize);
    memcpy(d, s, header_len);
    d += header_len;
    s += header_len;
    len -= header_len;
  }

  if (len >= kCacheLineSize) {
    _mm_sfence();
    __m256i *dst_cacheline = reinterpret_cast<__m256i *>(d);
    const __m256i *src_cacheline = reinterpret_cast<const __m256i *>(s);
    constexpr int kOpsPerCacheLine = kCacheLineSize / sizeof(__m256i);
    size_t loops = len / kCacheLineSize;

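    // Copy one cache line per iteration: two 32-byte unaligned loads from the
    // source, followed by two 32-byte non-temporal stores to the
    // cacheline-aligned destination.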
    while (len >= kCacheLineSize) {
      __m256i temp1, temp2;
      temp1 = _mm256_lddqu_si256(src_cacheline + 0);
      temp2 = _mm256_lddqu_si256(src_cacheline + 1);
      _mm256_stream_si256(dst_cacheline + 0, temp1);
      _mm256_stream_si256(dst_cacheline + 1, temp2);
      src_cacheline += kOpsPerCacheLine;
      dst_cacheline += kOpsPerCacheLine;
      len -= kCacheLineSize;
    }
    d += loops * kCacheLineSize;
    s += loops * kCacheLineSize;
    _mm_sfence();
  }

  // memcpy the tail.
  if (len) {
    memcpy(d, s, len);
  }
  return dst;
#else
  // Fallback to regular memcpy when AVX is not available.
  return memcpy(dst, src, len);
#endif  // __AVX__
}

}  // namespace crc_internal
ABSL_NAMESPACE_END
}  // namespace absl

#endif  // ABSL_CRC_INTERNAL_NON_TEMPORAL_MEMCPY_H_