1 #include "fast_copy.h"
2 #include <string.h>
3 #include <stdlib.h>
4 #if (defined(__x86_64__) || defined(__i386__))
5 # include <x86intrin.h>
6 #endif
7
8 #ifdef FHT_HEADER_ONLY
9 # define _STORAGE_ static inline
10 #else
11 # define _STORAGE_
12 #endif
13
14 // These functions all assume that the size of memory being copied is a power of 2.
15
16 #if _FEATURE_AVX512F
17 // If n is less than 64, defaults to memcpy. Otherwise, being a power of 2, we can just use unaligned stores and loads.
fast_copy(void * out,void * in,size_t n)18 _STORAGE_ void *fast_copy(void *out, void *in, size_t n) {
19 if(n >= FAST_COPY_MEMCPY_THRESHOLD) {
20 return memcpy(out, in, n);
21 }
22 n >>= 6;
23 for(__m512 *ov = (__m512 *)out, *iv = (__m512 *)in; n--;) {
24 _mm512_storeu_ps((float *)(ov++), _mm512_loadu_ps((float *)(iv++)));
25 }
26 return out;
27 }
28 #elif __AVX2__
29 // If n is less than 32, defaults to memcpy. Otherwise, being a power of 2, we can just use unaligned stores and loads.
fast_copy(void * out,void * in,size_t n)30 _STORAGE_ void *fast_copy(void *out, void *in, size_t n) {
31 if(n >= FAST_COPY_MEMCPY_THRESHOLD) {
32 return memcpy(out, in, n);
33 }
34 n >>= 5;
35 for(__m256 *ov = (__m256 *)out, *iv = (__m256 *)in; n--;) {
36 _mm256_storeu_ps((float *)(ov++), _mm256_loadu_ps((float *)(iv++)));
37 }
38 return out;
39 }
40 #elif __SSE2__
41 // If n is less than 16, defaults to memcpy. Otherwise, being a power of 2, we can just use unaligned stores and loads.
fast_copy(void * out,void * in,size_t n)42 _STORAGE_ void *fast_copy(void *out, void *in, size_t n) {
43 if(n >= FAST_COPY_MEMCPY_THRESHOLD) {
44 return memcpy(out, in, n);
45 }
46 n >>= 4;
47 for(__m128 *ov = (__m128 *)out, *iv = (__m128 *)in; n--;) {
48 _mm_storeu_ps((float *)(ov++), _mm_loadu_ps((float *)(iv++)));
49 }
50 return out;
51 }
52 #else
fast_copy(void * out,void * in,size_t n)53 _STORAGE_ void *fast_copy(void *out, void *in, size_t n) {
54 return memcpy(out, in, n);
55 }
56 #endif
57
58 #ifdef FHT_HEADER_ONLY
59 # undef _STORAGE_
60 #endif
61