xref: /aosp_15_r20/external/executorch/extension/llm/custom_ops/spinquant/third-party/FFHT/fast_copy.c (revision 523fa7a60841cd1ecfb9cc4201f1ca8b03ed023a)
1 #include "fast_copy.h"
2 #include <string.h>
3 #include <stdlib.h>
4 #if (defined(__x86_64__) || defined(__i386__))
5 #  include <x86intrin.h>
6 #endif
7 
8 #ifdef FHT_HEADER_ONLY
9 #  define _STORAGE_ static inline
10 #else
11 #  define _STORAGE_
12 #endif
13 
14 // These functions all assume that the size of memory being copied is a power of 2.
15 
16 #if _FEATURE_AVX512F
17 // If n is less than 64, defaults to memcpy. Otherwise, being a power of 2, we can just use unaligned stores and loads.
fast_copy(void * out,void * in,size_t n)18 _STORAGE_ void *fast_copy(void *out, void *in, size_t n) {
19     if(n >= FAST_COPY_MEMCPY_THRESHOLD) {
20         return memcpy(out, in, n);
21     }
22     n >>= 6;
23     for(__m512 *ov = (__m512 *)out, *iv = (__m512 *)in; n--;) {
24         _mm512_storeu_ps((float *)(ov++), _mm512_loadu_ps((float *)(iv++)));
25     }
26     return out;
27 }
28 #elif __AVX2__
29 // If n is less than 32, defaults to memcpy. Otherwise, being a power of 2, we can just use unaligned stores and loads.
fast_copy(void * out,void * in,size_t n)30 _STORAGE_ void *fast_copy(void *out, void *in, size_t n) {
31     if(n >= FAST_COPY_MEMCPY_THRESHOLD) {
32         return memcpy(out, in, n);
33     }
34     n >>= 5;
35     for(__m256 *ov = (__m256 *)out, *iv = (__m256 *)in; n--;) {
36         _mm256_storeu_ps((float *)(ov++), _mm256_loadu_ps((float *)(iv++)));
37     }
38     return out;
39 }
40 #elif __SSE2__
41 // If n is less than 16, defaults to memcpy. Otherwise, being a power of 2, we can just use unaligned stores and loads.
fast_copy(void * out,void * in,size_t n)42 _STORAGE_ void *fast_copy(void *out, void *in, size_t n) {
43     if(n >= FAST_COPY_MEMCPY_THRESHOLD) {
44         return memcpy(out, in, n);
45     }
46     n >>= 4;
47     for(__m128 *ov = (__m128 *)out, *iv = (__m128 *)in; n--;) {
48         _mm_storeu_ps((float *)(ov++), _mm_loadu_ps((float *)(iv++)));
49     }
50     return out;
51 }
52 #else
fast_copy(void * out,void * in,size_t n)53 _STORAGE_ void *fast_copy(void *out, void *in, size_t n) {
54     return memcpy(out, in, n);
55 }
56 #endif
57 
58 #ifdef FHT_HEADER_ONLY
59 #  undef _STORAGE_
60 #endif
61