// Copyright 2022 The Abseil Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Simultaneous memcopy and CRC-32C for x86-64.  Uses integer registers because
// XMM registers do not support the CRC instruction (yet).  While copying,
// compute the running CRC of the data being copied.
//
// It is assumed that any CPU running this code has SSE4.2 instructions
// available (for CRC32C).  This file will do nothing if that is not true.
//
// The CRC instruction has a 3-cycle latency, and we are stressing the ALU
// ports here (unlike a traditional memcopy, which has almost no ALU use), so
// we will need to copy in such a way that the CRC unit is used efficiently.
// We have two regimes in this code:
//  1. For operations of size < kCrcSmallSize, do the CRC then the memcpy.
//  2. For operations of size >= kCrcSmallSize:
//      a) compute an initial CRC + copy on a small amount of data to align the
//         destination pointer on a 16-byte boundary.
//      b) Split the data into 3 main regions and a tail (smaller than 48
//         bytes).
//      c) Do the copy and CRC of the 3 main regions, interleaving (start with
//         full cache line copies for each region, then move to single 16 byte
//         pieces per region).
//      d) Combine the CRCs with CRC32C::Concat.
//      e) Copy the tail and extend the CRC with the CRC of the tail.
// This method is not ideal for op sizes between ~1k and ~8k because
// CRC32C::Concat takes a significant amount of time.  A medium-sized approach
// could be added using 3 CRCs over fixed-size blocks where the
// zero-extensions required for CRC32C::Concat can be precomputed.
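//
// Illustrative usage (a sketch only; callers normally go through CrcMemcpy and
// the engine selection at the bottom of this file): the engines defined here
// implement the CrcMemcpyEngine::Compute() interface from crc_memcpy.h, so
// given buffers `src` and `dst` of at least `length` bytes (hypothetical
// names), one could write
//
//   absl::crc_internal::AcceleratedCrcMemcpyEngine<3, 0> engine;
//   absl::crc32c_t crc =
//       engine.Compute(dst, src, length, /*initial_crc=*/absl::crc32c_t{0});
//
// which copies `length` bytes from `src` to `dst` and returns the CRC32C of
// those bytes, extended from the given initial CRC.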

#ifdef __SSE4_2__
#include <immintrin.h>
#endif

#ifdef _MSC_VER
#include <intrin.h>
#endif

#include <array>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <memory>
#include <type_traits>

#include "absl/base/dynamic_annotations.h"
#include "absl/base/internal/prefetch.h"
#include "absl/base/optimization.h"
#include "absl/crc/crc32c.h"
#include "absl/crc/internal/cpu_detect.h"
#include "absl/crc/internal/crc_memcpy.h"
#include "absl/strings/string_view.h"

#ifdef ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE

namespace absl {
ABSL_NAMESPACE_BEGIN
namespace crc_internal {

namespace {

inline crc32c_t ShortCrcCopy(char* dst, const char* src, std::size_t length,
                             crc32c_t crc) {
  // Small copy: just go 1 byte at a time: being nice to the branch predictor
  // is more important here than anything else.
  uint32_t crc_uint32 = static_cast<uint32_t>(crc);
  for (std::size_t i = 0; i < length; i++) {
    uint8_t data = *reinterpret_cast<const uint8_t*>(src);
    crc_uint32 = _mm_crc32_u8(crc_uint32, data);
    *reinterpret_cast<uint8_t*>(dst) = data;
    ++src;
    ++dst;
  }
  return crc32c_t{crc_uint32};
}

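// Number of 64-bit integer loads needed to cover one 16-byte vector block
// (two on x86-64).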
constexpr size_t kIntLoadsPerVec = sizeof(__m128i) / sizeof(uint64_t);

// Common function for copying the tails of multiple large regions.
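// Each call processes `copy_rounds` 16-byte blocks from each region, reading
// from *src + region_size * region and writing to the corresponding offset in
// *dst.  On return, *src and *dst have advanced by copy_rounds * 16 bytes
// (i.e. through the first region only); the caller accounts for the offsets
// of the remaining regions.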
template <size_t vec_regions, size_t int_regions>
inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src,
                          size_t region_size, size_t copy_rounds) {
  std::array<__m128i, vec_regions> data;
  std::array<uint64_t, kIntLoadsPerVec * int_regions> int_data;

  while (copy_rounds > 0) {
    for (size_t i = 0; i < vec_regions; i++) {
      size_t region = i;

      auto* vsrc =
          reinterpret_cast<const __m128i*>(*src + region_size * region);
      auto* vdst = reinterpret_cast<__m128i*>(*dst + region_size * region);

      // Load the blocks, unaligned
      data[i] = _mm_loadu_si128(vsrc);

      // Store the blocks, aligned
      _mm_store_si128(vdst, data[i]);

      // Compute the running CRC
      crcs[region] = crc32c_t{static_cast<uint32_t>(
          _mm_crc32_u64(static_cast<uint32_t>(crcs[region]),
                        static_cast<uint64_t>(_mm_extract_epi64(data[i], 0))))};
      crcs[region] = crc32c_t{static_cast<uint32_t>(
          _mm_crc32_u64(static_cast<uint32_t>(crcs[region]),
                        static_cast<uint64_t>(_mm_extract_epi64(data[i], 1))))};
    }

    for (size_t i = 0; i < int_regions; i++) {
      size_t region = vec_regions + i;

      auto* usrc =
          reinterpret_cast<const uint64_t*>(*src + region_size * region);
      auto* udst = reinterpret_cast<uint64_t*>(*dst + region_size * region);

      for (size_t j = 0; j < kIntLoadsPerVec; j++) {
        size_t data_index = i * kIntLoadsPerVec + j;

        int_data[data_index] = *(usrc + j);
        crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64(
            static_cast<uint32_t>(crcs[region]), int_data[data_index]))};

        *(udst + j) = int_data[data_index];
      }
    }

    // Increment pointers
    *src += sizeof(__m128i);
    *dst += sizeof(__m128i);
    --copy_rounds;
  }
}

}  // namespace

template <size_t vec_regions, size_t int_regions>
class AcceleratedCrcMemcpyEngine : public CrcMemcpyEngine {
 public:
  AcceleratedCrcMemcpyEngine() = default;
  AcceleratedCrcMemcpyEngine(const AcceleratedCrcMemcpyEngine&) = delete;
  AcceleratedCrcMemcpyEngine operator=(const AcceleratedCrcMemcpyEngine&) =
      delete;

  crc32c_t Compute(void* __restrict dst, const void* __restrict src,
                   std::size_t length, crc32c_t initial_crc) const override;
};

template <size_t vec_regions, size_t int_regions>
crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
    void* __restrict dst, const void* __restrict src, std::size_t length,
    crc32c_t initial_crc) const {
  constexpr std::size_t kRegions = vec_regions + int_regions;
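  // The CRC32 instructions used below operate on the raw CRC state, while
  // absl::crc32c_t values include the standard CRC32C final XOR.  XOR-ing by
  // this mask converts between the two forms, and also yields the all-ones
  // initial state used for fresh regions.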
  constexpr uint32_t kCrcDataXor = uint32_t{0xffffffff};
  constexpr std::size_t kBlockSize = sizeof(__m128i);
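  // Bytes copied across all regions in one copy round (one 16-byte block per
  // region).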
  constexpr std::size_t kCopyRoundSize = kRegions * kBlockSize;

  // Number of blocks per cacheline.
  constexpr std::size_t kBlocksPerCacheLine = ABSL_CACHELINE_SIZE / kBlockSize;

  char* dst_bytes = static_cast<char*>(dst);
  const char* src_bytes = static_cast<const char*>(src);

  // Make sure that one prefetch per big block is enough to cover the whole
  // dataset, and we don't prefetch too much.
  static_assert(ABSL_CACHELINE_SIZE % kBlockSize == 0,
                "Cache lines are not divided evenly into blocks, may have "
                "unintended behavior!");

  // Experimentally-determined boundary between a small and large copy.
  // Below this number, spin-up and concatenation of CRCs takes enough time
  // that it kills the throughput gains of using 3 regions and wide vectors.
  constexpr size_t kCrcSmallSize = 256;

  // Experimentally-determined prefetch distance.  Main loop copies will
  // prefetch data 2 cache lines ahead.
  constexpr std::size_t kPrefetchAhead = 2 * ABSL_CACHELINE_SIZE;

  // Small-size CRC-memcpy: just do CRC + memcpy.
  if (length < kCrcSmallSize) {
    crc32c_t crc =
        ExtendCrc32c(initial_crc, absl::string_view(src_bytes, length));
    memcpy(dst, src, length);
    return crc;
  }

  // Start work on the CRC: undo the XOR from the previous calculation or set
  // up the initial value of the CRC.
  // initial_crc ^= kCrcDataXor;
  initial_crc = crc32c_t{static_cast<uint32_t>(initial_crc) ^ kCrcDataXor};

  // Do an initial alignment copy, so we can use aligned store instructions to
  // the destination pointer.  We align the destination pointer because the
  // penalty for an unaligned load is small compared to the penalty of an
  // unaligned store on modern CPUs.
  std::size_t bytes_from_last_aligned =
      reinterpret_cast<uintptr_t>(dst) & (kBlockSize - 1);
  if (bytes_from_last_aligned != 0) {
    std::size_t bytes_for_alignment = kBlockSize - bytes_from_last_aligned;

    // Do the short-sized copy and CRC.
    initial_crc =
        ShortCrcCopy(dst_bytes, src_bytes, bytes_for_alignment, initial_crc);
    src_bytes += bytes_for_alignment;
    dst_bytes += bytes_for_alignment;
    length -= bytes_for_alignment;
  }
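
  // From this point on dst_bytes is 16-byte aligned (and region_size below is
  // a multiple of 16 bytes), so the aligned _mm_store_si128() stores in the
  // main loop are safe.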

  // We are going to do the copy and CRC in kRegions regions to make sure that
  // we can saturate the CRC unit.  The CRCs will be combined at the end of the
  // run.  Copying will use the SSE registers, and we will extract words from
  // the SSE registers to add to the CRC.  Initially, we run the loop one full
  // cache line per region at a time, in order to insert prefetches.
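  //
  // Rough layout of the remaining bytes (the tail is smaller than one copy
  // round, i.e. under kRegions * 16 bytes):
  //
  //   | region 0 | region 1 | ... | region kRegions-1 | tail |
  //
  // where each region is region_size bytes, CRC'd independently, and the
  // per-region CRCs are combined with ConcatCrc32c() at the end.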

  // Initialize CRCs for kRegions regions.
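  // crcs[0] continues the caller's (already un-inverted) CRC; the remaining
  // regions start from the standard CRC32C initial state of all ones, which is
  // XOR-ed away again before the regions are concatenated below.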
  crc32c_t crcs[kRegions];
  crcs[0] = initial_crc;
  for (size_t i = 1; i < kRegions; i++) {
    crcs[i] = crc32c_t{kCrcDataXor};
  }

  // Find the number of rounds to copy and the region size.  Also compute the
  // tail size here.
  size_t copy_rounds = length / kCopyRoundSize;

  // Find the size of each region and the size of the tail.
  const std::size_t region_size = copy_rounds * kBlockSize;
  const std::size_t tail_size = length - (kRegions * region_size);
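  // Illustrative example (assuming 3 regions, so 48-byte copy rounds): with
  // length == 4096 after alignment, copy_rounds == 85, region_size == 1360
  // bytes, and tail_size == 4096 - 3 * 1360 == 16 bytes.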

  // Holding registers for data in each region.
  std::array<__m128i, vec_regions> vec_data;
  std::array<uint64_t, int_regions * kIntLoadsPerVec> int_data;

  // Main loop.
  while (copy_rounds > kBlocksPerCacheLine) {
    // Prefetch kPrefetchAhead bytes ahead of each pointer.
    for (size_t i = 0; i < kRegions; i++) {
      absl::base_internal::PrefetchT0(src_bytes + kPrefetchAhead +
                                      region_size * i);
      absl::base_internal::PrefetchT0(dst_bytes + kPrefetchAhead +
                                      region_size * i);
    }

    // Load and store data, computing CRC on the way.
    for (size_t i = 0; i < kBlocksPerCacheLine; i++) {
      // Copy and CRC the data for the CRC regions.
      for (size_t j = 0; j < vec_regions; j++) {
        // Cycle which regions get vector load/store and integer load/store, to
        // engage prefetching logic around vector load/stores and save issue
        // slots by using the integer registers.
        size_t region = (j + i) % kRegions;

        auto* vsrc =
            reinterpret_cast<const __m128i*>(src_bytes + region_size * region);
        auto* vdst =
            reinterpret_cast<__m128i*>(dst_bytes + region_size * region);

        // Load and CRC data.
        vec_data[j] = _mm_loadu_si128(vsrc + i);
        crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64(
            static_cast<uint32_t>(crcs[region]),
            static_cast<uint64_t>(_mm_extract_epi64(vec_data[j], 0))))};
        crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64(
            static_cast<uint32_t>(crcs[region]),
            static_cast<uint64_t>(_mm_extract_epi64(vec_data[j], 1))))};

        // Store the data.
        _mm_store_si128(vdst + i, vec_data[j]);
      }

      // Copy and CRC the data for the integer (non-vector) regions.
      for (size_t j = 0; j < int_regions; j++) {
        // Cycle which regions get vector load/store and integer load/store, to
        // engage prefetching logic around vector load/stores and save issue
        // slots by using the integer registers.
        size_t region = (j + vec_regions + i) % kRegions;

        auto* usrc =
            reinterpret_cast<const uint64_t*>(src_bytes + region_size * region);
        auto* udst =
            reinterpret_cast<uint64_t*>(dst_bytes + region_size * region);

        for (size_t k = 0; k < kIntLoadsPerVec; k++) {
          size_t data_index = j * kIntLoadsPerVec + k;

          // Load and CRC the data.
          int_data[data_index] = *(usrc + i * kIntLoadsPerVec + k);
          crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64(
              static_cast<uint32_t>(crcs[region]), int_data[data_index]))};

          // Store the data.
          *(udst + i * kIntLoadsPerVec + k) = int_data[data_index];
        }
      }
    }

    // Increment pointers
    src_bytes += kBlockSize * kBlocksPerCacheLine;
    dst_bytes += kBlockSize * kBlocksPerCacheLine;
    copy_rounds -= kBlocksPerCacheLine;
  }
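
  // At most kBlocksPerCacheLine copy rounds remain in each region; finish them
  // in single 16-byte blocks, without prefetching.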
  // Copy and CRC the tails of each region.
  LargeTailCopy<vec_regions, int_regions>(crcs, &dst_bytes, &src_bytes,
                                          region_size, copy_rounds);

  // Move the source and destination pointers to the end of the last region
  // (i.e. to the start of the tail).
  src_bytes += region_size * (kRegions - 1);
  dst_bytes += region_size * (kRegions - 1);

  // Finalize the first CRCs: XOR the internal CRCs by the XOR mask to undo the
  // XOR done before doing block copy + CRCs.
  for (size_t i = 0; i + 1 < kRegions; i++) {
    crcs[i] = crc32c_t{static_cast<uint32_t>(crcs[i]) ^ kCrcDataXor};
  }

  // Build a CRC of the first kRegions - 1 regions.
  crc32c_t full_crc = crcs[0];
  for (size_t i = 1; i + 1 < kRegions; i++) {
    full_crc = ConcatCrc32c(full_crc, crcs[i], region_size);
  }
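  // ConcatCrc32c(full_crc, crcs[i], region_size) returns the CRC of the bytes
  // covered by full_crc followed by the region_size bytes covered by crcs[i],
  // so after this loop full_crc covers regions 0 .. kRegions-2 contiguously.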

  // Copy and CRC the 16-byte blocks of the tail using the integer registers.
  std::size_t tail_blocks = tail_size / kBlockSize;
  LargeTailCopy<0, 1>(&crcs[kRegions - 1], &dst_bytes, &src_bytes, 0,
                      tail_blocks);

  // Final tail copy for under 16 bytes.
  crcs[kRegions - 1] =
      ShortCrcCopy(dst_bytes, src_bytes, tail_size - tail_blocks * kBlockSize,
                   crcs[kRegions - 1]);

  // Finalize and concatenate the final CRC, then return.
  crcs[kRegions - 1] =
      crc32c_t{static_cast<uint32_t>(crcs[kRegions - 1]) ^ kCrcDataXor};
  return ConcatCrc32c(full_crc, crcs[kRegions - 1], region_size + tail_size);
}

CrcMemcpy::ArchSpecificEngines CrcMemcpy::GetArchSpecificEngines() {
#ifdef UNDEFINED_BEHAVIOR_SANITIZER
  // UBSAN does not play nicely with unaligned loads (which we use a lot).
  // Get the underlying architecture.
  CpuType cpu_type = GetCpuType();
  switch (cpu_type) {
    case CpuType::kUnknown:
    case CpuType::kAmdRome:
    case CpuType::kAmdNaples:
    case CpuType::kIntelCascadelakeXeon:
    case CpuType::kIntelSkylakeXeon:
    case CpuType::kIntelSkylake:
    case CpuType::kIntelBroadwell:
    case CpuType::kIntelHaswell:
    case CpuType::kIntelIvybridge:
      return {
          .temporal = new FallbackCrcMemcpyEngine(),
          .non_temporal = new CrcNonTemporalMemcpyAVXEngine(),
      };
    // INTEL_SANDYBRIDGE performs better with SSE than AVX.
    case CpuType::kIntelSandybridge:
      return {
          .temporal = new FallbackCrcMemcpyEngine(),
          .non_temporal = new CrcNonTemporalMemcpyEngine(),
      };
    default:
      return {.temporal = new FallbackCrcMemcpyEngine(),
              .non_temporal = new FallbackCrcMemcpyEngine()};
  }
#else
  // Get the underlying architecture.
  CpuType cpu_type = GetCpuType();
  switch (cpu_type) {
    // On Zen 2, PEXTRQ uses 2 micro-ops, including one on the vector store
    // port, which causes data movement from the vector registers to the
    // integer registers (where CRC32C happens) to crowd the same units as
    // vector stores.  As a result, using that path exclusively causes
    // bottlenecking on this port.  We can avoid this bottleneck by using the
    // integer side of the CPU for most operations rather than the vector side.
    // We keep a vector region to engage some of the prefetching logic in the
    // cache hierarchy which seems to give vector instructions special
    // treatment.  These prefetch units see strided access to each region, and
    // do the right thing.
    case CpuType::kAmdRome:
    case CpuType::kAmdNaples:
      return {
          .temporal = new AcceleratedCrcMemcpyEngine<1, 2>(),
          .non_temporal = new CrcNonTemporalMemcpyAVXEngine(),
      };
    // PCLMULQDQ is slow and we don't have wide enough issue width to take
    // advantage of it.  For an unknown architecture, don't risk using CLMULs.
    case CpuType::kIntelCascadelakeXeon:
    case CpuType::kIntelSkylakeXeon:
    case CpuType::kIntelSkylake:
    case CpuType::kIntelBroadwell:
    case CpuType::kIntelHaswell:
    case CpuType::kIntelIvybridge:
      return {
          .temporal = new AcceleratedCrcMemcpyEngine<3, 0>(),
          .non_temporal = new CrcNonTemporalMemcpyAVXEngine(),
      };
    // INTEL_SANDYBRIDGE performs better with SSE than AVX.
    case CpuType::kIntelSandybridge:
      return {
          .temporal = new AcceleratedCrcMemcpyEngine<3, 0>(),
          .non_temporal = new CrcNonTemporalMemcpyEngine(),
      };
    default:
      return {.temporal = new FallbackCrcMemcpyEngine(),
              .non_temporal = new FallbackCrcMemcpyEngine()};
  }
#endif  // UNDEFINED_BEHAVIOR_SANITIZER
}

// For testing, allow the user to specify which engine they want.
std::unique_ptr<CrcMemcpyEngine> CrcMemcpy::GetTestEngine(int vector,
                                                          int integer) {
  if (vector == 3 && integer == 0) {
    return std::make_unique<AcceleratedCrcMemcpyEngine<3, 0>>();
  } else if (vector == 1 && integer == 2) {
    return std::make_unique<AcceleratedCrcMemcpyEngine<1, 2>>();
  }
  return nullptr;
}

}  // namespace crc_internal
ABSL_NAMESPACE_END
}  // namespace absl

#endif  // ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE