xref: /aosp_15_r20/external/zlib/contrib/optimizations/chunkcopy.h (revision 86ee64e75fa5f8bce2c8c356138035642429cd05)
1*86ee64e7SAndroid Build Coastguard Worker /* chunkcopy.h -- fast chunk copy and set operations
2*86ee64e7SAndroid Build Coastguard Worker  * Copyright (C) 2017 ARM, Inc.
3*86ee64e7SAndroid Build Coastguard Worker  * Copyright 2017 The Chromium Authors
4*86ee64e7SAndroid Build Coastguard Worker  * Use of this source code is governed by a BSD-style license that can be
5*86ee64e7SAndroid Build Coastguard Worker  * found in the Chromium source repository LICENSE file.
6*86ee64e7SAndroid Build Coastguard Worker  */
7*86ee64e7SAndroid Build Coastguard Worker 
8*86ee64e7SAndroid Build Coastguard Worker #ifndef CHUNKCOPY_H
9*86ee64e7SAndroid Build Coastguard Worker #define CHUNKCOPY_H
10*86ee64e7SAndroid Build Coastguard Worker 
#include <stddef.h>
#include <stdint.h>
#include "zutil.h"
13*86ee64e7SAndroid Build Coastguard Worker 
/*
 * Z_STATIC_ASSERT(name, cond): compile-time assertion for compilers without
 * _Static_assert.  Declares `name` as a typedef of a char array whose length
 * is 1 when `cond` is true and -1 (an invalid array size, so compilation
 * fails) when `cond` is false.
 */
#define Z_STATIC_ASSERT(name, cond) typedef char name[(cond) ? 1 : -1]
15*86ee64e7SAndroid Build Coastguard Worker 
/*
 * Z_RESTRICT: expands to the C99 `restrict` qualifier when the compiler
 * reports C99 or later, and to nothing otherwise.  The defined() check keeps
 * the test well-formed (and -Wundef quiet) when __STDC_VERSION__ is absent.
 */
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
#define Z_RESTRICT restrict
#else
#define Z_RESTRICT
#endif
21*86ee64e7SAndroid Build Coastguard Worker 
/*
 * Z_BUILTIN_MEMCPY / Z_BUILTIN_MEMSET: use the compiler builtins on
 * clang, GCC and LLVM-based compilers; otherwise fall back to zmemcpy and
 * zmemset (presumably provided by zutil.h -- confirm).
 */
#if defined(__clang__) || defined(__GNUC__) || defined(__llvm__)
#define Z_BUILTIN_MEMCPY __builtin_memcpy
#define Z_BUILTIN_MEMSET __builtin_memset
#else
#define Z_BUILTIN_MEMCPY zmemcpy
#define Z_BUILTIN_MEMSET zmemset
#endif
29*86ee64e7SAndroid Build Coastguard Worker 
30*86ee64e7SAndroid Build Coastguard Worker #if defined(INFLATE_CHUNK_SIMD_NEON)
31*86ee64e7SAndroid Build Coastguard Worker #include <arm_neon.h>
32*86ee64e7SAndroid Build Coastguard Worker typedef uint8x16_t z_vec128i_t;
33*86ee64e7SAndroid Build Coastguard Worker #elif defined(INFLATE_CHUNK_SIMD_SSE2)
34*86ee64e7SAndroid Build Coastguard Worker #include <emmintrin.h>
35*86ee64e7SAndroid Build Coastguard Worker typedef __m128i z_vec128i_t;
36*86ee64e7SAndroid Build Coastguard Worker #elif defined(INFLATE_CHUNK_GENERIC)
37*86ee64e7SAndroid Build Coastguard Worker typedef struct { uint8_t x[16]; } z_vec128i_t;
38*86ee64e7SAndroid Build Coastguard Worker #else
39*86ee64e7SAndroid Build Coastguard Worker #error chunkcopy.h inflate chunk SIMD is not defined for your build target
40*86ee64e7SAndroid Build Coastguard Worker #endif
41*86ee64e7SAndroid Build Coastguard Worker 
/*
 * Suppress MSan errors about copying uninitialized bytes (crbug.com/1376033).
 *
 * Z_DISABLE_MSAN expands to nothing by default.  When the compiler provides
 * __has_feature and reports the memory_sanitizer feature, it expands to the
 * no_sanitize("memory") function attribute instead.
 */
#define Z_DISABLE_MSAN
#if defined(__has_feature)
  #if __has_feature(memory_sanitizer)
    #undef Z_DISABLE_MSAN
    #define Z_DISABLE_MSAN __attribute__((no_sanitize("memory")))
  #endif
#endif
52*86ee64e7SAndroid Build Coastguard Worker 
53*86ee64e7SAndroid Build Coastguard Worker /*
54*86ee64e7SAndroid Build Coastguard Worker  * chunk copy type: the z_vec128i_t type size should be exactly 128-bits
55*86ee64e7SAndroid Build Coastguard Worker  * and equal to CHUNKCOPY_CHUNK_SIZE.
56*86ee64e7SAndroid Build Coastguard Worker  */
57*86ee64e7SAndroid Build Coastguard Worker #define CHUNKCOPY_CHUNK_SIZE sizeof(z_vec128i_t)
58*86ee64e7SAndroid Build Coastguard Worker 
59*86ee64e7SAndroid Build Coastguard Worker Z_STATIC_ASSERT(vector_128_bits_wide,
60*86ee64e7SAndroid Build Coastguard Worker                 CHUNKCOPY_CHUNK_SIZE == sizeof(int8_t) * 16);
61*86ee64e7SAndroid Build Coastguard Worker 
62*86ee64e7SAndroid Build Coastguard Worker /*
63*86ee64e7SAndroid Build Coastguard Worker  * Ask the compiler to perform a wide, unaligned load with a machine
64*86ee64e7SAndroid Build Coastguard Worker  * instruction appropriate for the z_vec128i_t type.
65*86ee64e7SAndroid Build Coastguard Worker  */
loadchunk(const unsigned char FAR * s)66*86ee64e7SAndroid Build Coastguard Worker static inline z_vec128i_t loadchunk(
67*86ee64e7SAndroid Build Coastguard Worker     const unsigned char FAR* s) Z_DISABLE_MSAN {
68*86ee64e7SAndroid Build Coastguard Worker   z_vec128i_t v;
69*86ee64e7SAndroid Build Coastguard Worker   Z_BUILTIN_MEMCPY(&v, s, sizeof(v));
70*86ee64e7SAndroid Build Coastguard Worker   return v;
71*86ee64e7SAndroid Build Coastguard Worker }
72*86ee64e7SAndroid Build Coastguard Worker 
73*86ee64e7SAndroid Build Coastguard Worker /*
74*86ee64e7SAndroid Build Coastguard Worker  * Ask the compiler to perform a wide, unaligned store with a machine
75*86ee64e7SAndroid Build Coastguard Worker  * instruction appropriate for the z_vec128i_t type.
76*86ee64e7SAndroid Build Coastguard Worker  */
storechunk(unsigned char FAR * d,const z_vec128i_t v)77*86ee64e7SAndroid Build Coastguard Worker static inline void storechunk(
78*86ee64e7SAndroid Build Coastguard Worker     unsigned char FAR* d,
79*86ee64e7SAndroid Build Coastguard Worker     const z_vec128i_t v) {
80*86ee64e7SAndroid Build Coastguard Worker   Z_BUILTIN_MEMCPY(d, &v, sizeof(v));
81*86ee64e7SAndroid Build Coastguard Worker }
82*86ee64e7SAndroid Build Coastguard Worker 
83*86ee64e7SAndroid Build Coastguard Worker /*
84*86ee64e7SAndroid Build Coastguard Worker  * Perform a memcpy-like operation, assuming that length is non-zero and that
85*86ee64e7SAndroid Build Coastguard Worker  * it's OK to overwrite at least CHUNKCOPY_CHUNK_SIZE bytes of output even if
86*86ee64e7SAndroid Build Coastguard Worker  * the length is shorter than this.
87*86ee64e7SAndroid Build Coastguard Worker  *
88*86ee64e7SAndroid Build Coastguard Worker  * It also guarantees that it will properly unroll the data if the distance
89*86ee64e7SAndroid Build Coastguard Worker  * between `out` and `from` is at least CHUNKCOPY_CHUNK_SIZE, which we rely on
90*86ee64e7SAndroid Build Coastguard Worker  * in chunkcopy_relaxed().
91*86ee64e7SAndroid Build Coastguard Worker  *
92*86ee64e7SAndroid Build Coastguard Worker  * Aside from better memory bus utilisation, this means that short copies
93*86ee64e7SAndroid Build Coastguard Worker  * (CHUNKCOPY_CHUNK_SIZE bytes or fewer) will fall straight through the loop
94*86ee64e7SAndroid Build Coastguard Worker  * without iteration, which will hopefully make the branch prediction more
95*86ee64e7SAndroid Build Coastguard Worker  * reliable.
96*86ee64e7SAndroid Build Coastguard Worker  */
chunkcopy_core(unsigned char FAR * out,const unsigned char FAR * from,unsigned len)97*86ee64e7SAndroid Build Coastguard Worker static inline unsigned char FAR* chunkcopy_core(
98*86ee64e7SAndroid Build Coastguard Worker     unsigned char FAR* out,
99*86ee64e7SAndroid Build Coastguard Worker     const unsigned char FAR* from,
100*86ee64e7SAndroid Build Coastguard Worker     unsigned len) Z_DISABLE_MSAN {
101*86ee64e7SAndroid Build Coastguard Worker   const int bump = (--len % CHUNKCOPY_CHUNK_SIZE) + 1;
102*86ee64e7SAndroid Build Coastguard Worker   storechunk(out, loadchunk(from));
103*86ee64e7SAndroid Build Coastguard Worker   out += bump;
104*86ee64e7SAndroid Build Coastguard Worker   from += bump;
105*86ee64e7SAndroid Build Coastguard Worker   len /= CHUNKCOPY_CHUNK_SIZE;
106*86ee64e7SAndroid Build Coastguard Worker   while (len-- > 0) {
107*86ee64e7SAndroid Build Coastguard Worker     storechunk(out, loadchunk(from));
108*86ee64e7SAndroid Build Coastguard Worker     out += CHUNKCOPY_CHUNK_SIZE;
109*86ee64e7SAndroid Build Coastguard Worker     from += CHUNKCOPY_CHUNK_SIZE;
110*86ee64e7SAndroid Build Coastguard Worker   }
111*86ee64e7SAndroid Build Coastguard Worker   return out;
112*86ee64e7SAndroid Build Coastguard Worker }
113*86ee64e7SAndroid Build Coastguard Worker 
114*86ee64e7SAndroid Build Coastguard Worker /*
115*86ee64e7SAndroid Build Coastguard Worker  * Like chunkcopy_core(), but avoid writing beyond of legal output.
116*86ee64e7SAndroid Build Coastguard Worker  *
117*86ee64e7SAndroid Build Coastguard Worker  * Accepts an additional pointer to the end of safe output.  A generic safe
118*86ee64e7SAndroid Build Coastguard Worker  * copy would use (out + len), but it's normally the case that the end of the
119*86ee64e7SAndroid Build Coastguard Worker  * output buffer is beyond the end of the current copy, and this can still be
120*86ee64e7SAndroid Build Coastguard Worker  * exploited.
121*86ee64e7SAndroid Build Coastguard Worker  */
chunkcopy_core_safe(unsigned char FAR * out,const unsigned char FAR * from,unsigned len,unsigned char FAR * limit)122*86ee64e7SAndroid Build Coastguard Worker static inline unsigned char FAR* chunkcopy_core_safe(
123*86ee64e7SAndroid Build Coastguard Worker     unsigned char FAR* out,
124*86ee64e7SAndroid Build Coastguard Worker     const unsigned char FAR* from,
125*86ee64e7SAndroid Build Coastguard Worker     unsigned len,
126*86ee64e7SAndroid Build Coastguard Worker     unsigned char FAR* limit) {
127*86ee64e7SAndroid Build Coastguard Worker   Assert(out + len <= limit, "chunk copy exceeds safety limit");
128*86ee64e7SAndroid Build Coastguard Worker   if ((limit - out) < (ptrdiff_t)CHUNKCOPY_CHUNK_SIZE) {
129*86ee64e7SAndroid Build Coastguard Worker     const unsigned char FAR* Z_RESTRICT rfrom = from;
130*86ee64e7SAndroid Build Coastguard Worker     Assert((uintptr_t)out - (uintptr_t)from >= len,
131*86ee64e7SAndroid Build Coastguard Worker            "invalid restrict in chunkcopy_core_safe");
132*86ee64e7SAndroid Build Coastguard Worker     Assert((uintptr_t)from - (uintptr_t)out >= len,
133*86ee64e7SAndroid Build Coastguard Worker            "invalid restrict in chunkcopy_core_safe");
134*86ee64e7SAndroid Build Coastguard Worker     if (len & 8) {
135*86ee64e7SAndroid Build Coastguard Worker       Z_BUILTIN_MEMCPY(out, rfrom, 8);
136*86ee64e7SAndroid Build Coastguard Worker       out += 8;
137*86ee64e7SAndroid Build Coastguard Worker       rfrom += 8;
138*86ee64e7SAndroid Build Coastguard Worker     }
139*86ee64e7SAndroid Build Coastguard Worker     if (len & 4) {
140*86ee64e7SAndroid Build Coastguard Worker       Z_BUILTIN_MEMCPY(out, rfrom, 4);
141*86ee64e7SAndroid Build Coastguard Worker       out += 4;
142*86ee64e7SAndroid Build Coastguard Worker       rfrom += 4;
143*86ee64e7SAndroid Build Coastguard Worker     }
144*86ee64e7SAndroid Build Coastguard Worker     if (len & 2) {
145*86ee64e7SAndroid Build Coastguard Worker       Z_BUILTIN_MEMCPY(out, rfrom, 2);
146*86ee64e7SAndroid Build Coastguard Worker       out += 2;
147*86ee64e7SAndroid Build Coastguard Worker       rfrom += 2;
148*86ee64e7SAndroid Build Coastguard Worker     }
149*86ee64e7SAndroid Build Coastguard Worker     if (len & 1) {
150*86ee64e7SAndroid Build Coastguard Worker       *out++ = *rfrom++;
151*86ee64e7SAndroid Build Coastguard Worker     }
152*86ee64e7SAndroid Build Coastguard Worker     return out;
153*86ee64e7SAndroid Build Coastguard Worker   }
154*86ee64e7SAndroid Build Coastguard Worker   return chunkcopy_core(out, from, len);
155*86ee64e7SAndroid Build Coastguard Worker }
156*86ee64e7SAndroid Build Coastguard Worker 
157*86ee64e7SAndroid Build Coastguard Worker /*
158*86ee64e7SAndroid Build Coastguard Worker  * Perform short copies until distance can be rewritten as being at least
159*86ee64e7SAndroid Build Coastguard Worker  * CHUNKCOPY_CHUNK_SIZE.
160*86ee64e7SAndroid Build Coastguard Worker  *
161*86ee64e7SAndroid Build Coastguard Worker  * Assumes it's OK to overwrite at least the first 2*CHUNKCOPY_CHUNK_SIZE
162*86ee64e7SAndroid Build Coastguard Worker  * bytes of output even if the copy is shorter than this.  This assumption
163*86ee64e7SAndroid Build Coastguard Worker  * holds within zlib inflate_fast(), which starts every iteration with at
164*86ee64e7SAndroid Build Coastguard Worker  * least 258 bytes of output space available (258 being the maximum length
165*86ee64e7SAndroid Build Coastguard Worker  * output from a single token; see inffast.c).
166*86ee64e7SAndroid Build Coastguard Worker  */
chunkunroll_relaxed(unsigned char FAR * out,unsigned FAR * dist,unsigned FAR * len)167*86ee64e7SAndroid Build Coastguard Worker static inline unsigned char FAR* chunkunroll_relaxed(
168*86ee64e7SAndroid Build Coastguard Worker     unsigned char FAR* out,
169*86ee64e7SAndroid Build Coastguard Worker     unsigned FAR* dist,
170*86ee64e7SAndroid Build Coastguard Worker     unsigned FAR* len) Z_DISABLE_MSAN {
171*86ee64e7SAndroid Build Coastguard Worker   const unsigned char FAR* from = out - *dist;
172*86ee64e7SAndroid Build Coastguard Worker   while (*dist < *len && *dist < CHUNKCOPY_CHUNK_SIZE) {
173*86ee64e7SAndroid Build Coastguard Worker     storechunk(out, loadchunk(from));
174*86ee64e7SAndroid Build Coastguard Worker     out += *dist;
175*86ee64e7SAndroid Build Coastguard Worker     *len -= *dist;
176*86ee64e7SAndroid Build Coastguard Worker     *dist += *dist;
177*86ee64e7SAndroid Build Coastguard Worker   }
178*86ee64e7SAndroid Build Coastguard Worker   return out;
179*86ee64e7SAndroid Build Coastguard Worker }
180*86ee64e7SAndroid Build Coastguard Worker 
181*86ee64e7SAndroid Build Coastguard Worker #if defined(INFLATE_CHUNK_SIMD_NEON)
182*86ee64e7SAndroid Build Coastguard Worker /*
183*86ee64e7SAndroid Build Coastguard Worker  * v_load64_dup(): load *src as an unaligned 64-bit int and duplicate it in
184*86ee64e7SAndroid Build Coastguard Worker  * every 64-bit component of the 128-bit result (64-bit int splat).
185*86ee64e7SAndroid Build Coastguard Worker  */
v_load64_dup(const void * src)186*86ee64e7SAndroid Build Coastguard Worker static inline z_vec128i_t v_load64_dup(const void* src) {
187*86ee64e7SAndroid Build Coastguard Worker   return vcombine_u8(vld1_u8(src), vld1_u8(src));
188*86ee64e7SAndroid Build Coastguard Worker }
189*86ee64e7SAndroid Build Coastguard Worker 
190*86ee64e7SAndroid Build Coastguard Worker /*
191*86ee64e7SAndroid Build Coastguard Worker  * v_load32_dup(): load *src as an unaligned 32-bit int and duplicate it in
192*86ee64e7SAndroid Build Coastguard Worker  * every 32-bit component of the 128-bit result (32-bit int splat).
193*86ee64e7SAndroid Build Coastguard Worker  */
v_load32_dup(const void * src)194*86ee64e7SAndroid Build Coastguard Worker static inline z_vec128i_t v_load32_dup(const void* src) {
195*86ee64e7SAndroid Build Coastguard Worker   int32_t i32;
196*86ee64e7SAndroid Build Coastguard Worker   Z_BUILTIN_MEMCPY(&i32, src, sizeof(i32));
197*86ee64e7SAndroid Build Coastguard Worker   return vreinterpretq_u8_s32(vdupq_n_s32(i32));
198*86ee64e7SAndroid Build Coastguard Worker }
199*86ee64e7SAndroid Build Coastguard Worker 
200*86ee64e7SAndroid Build Coastguard Worker /*
201*86ee64e7SAndroid Build Coastguard Worker  * v_load16_dup(): load *src as an unaligned 16-bit int and duplicate it in
202*86ee64e7SAndroid Build Coastguard Worker  * every 16-bit component of the 128-bit result (16-bit int splat).
203*86ee64e7SAndroid Build Coastguard Worker  */
v_load16_dup(const void * src)204*86ee64e7SAndroid Build Coastguard Worker static inline z_vec128i_t v_load16_dup(const void* src) {
205*86ee64e7SAndroid Build Coastguard Worker   int16_t i16;
206*86ee64e7SAndroid Build Coastguard Worker   Z_BUILTIN_MEMCPY(&i16, src, sizeof(i16));
207*86ee64e7SAndroid Build Coastguard Worker   return vreinterpretq_u8_s16(vdupq_n_s16(i16));
208*86ee64e7SAndroid Build Coastguard Worker }
209*86ee64e7SAndroid Build Coastguard Worker 
210*86ee64e7SAndroid Build Coastguard Worker /*
211*86ee64e7SAndroid Build Coastguard Worker  * v_load8_dup(): load the 8-bit int *src and duplicate it in every 8-bit
212*86ee64e7SAndroid Build Coastguard Worker  * component of the 128-bit result (8-bit int splat).
213*86ee64e7SAndroid Build Coastguard Worker  */
v_load8_dup(const void * src)214*86ee64e7SAndroid Build Coastguard Worker static inline z_vec128i_t v_load8_dup(const void* src) {
215*86ee64e7SAndroid Build Coastguard Worker   return vld1q_dup_u8((const uint8_t*)src);
216*86ee64e7SAndroid Build Coastguard Worker }
217*86ee64e7SAndroid Build Coastguard Worker 
218*86ee64e7SAndroid Build Coastguard Worker /*
219*86ee64e7SAndroid Build Coastguard Worker  * v_store_128(): store the 128-bit vec in a memory destination (that might
220*86ee64e7SAndroid Build Coastguard Worker  * not be 16-byte aligned) void* out.
221*86ee64e7SAndroid Build Coastguard Worker  */
v_store_128(void * out,const z_vec128i_t vec)222*86ee64e7SAndroid Build Coastguard Worker static inline void v_store_128(void* out, const z_vec128i_t vec) {
223*86ee64e7SAndroid Build Coastguard Worker   vst1q_u8(out, vec);
224*86ee64e7SAndroid Build Coastguard Worker }
225*86ee64e7SAndroid Build Coastguard Worker 
226*86ee64e7SAndroid Build Coastguard Worker #elif defined(INFLATE_CHUNK_SIMD_SSE2)
227*86ee64e7SAndroid Build Coastguard Worker /*
228*86ee64e7SAndroid Build Coastguard Worker  * v_load64_dup(): load *src as an unaligned 64-bit int and duplicate it in
229*86ee64e7SAndroid Build Coastguard Worker  * every 64-bit component of the 128-bit result (64-bit int splat).
230*86ee64e7SAndroid Build Coastguard Worker  */
v_load64_dup(const void * src)231*86ee64e7SAndroid Build Coastguard Worker static inline z_vec128i_t v_load64_dup(const void* src) {
232*86ee64e7SAndroid Build Coastguard Worker   int64_t i64;
233*86ee64e7SAndroid Build Coastguard Worker   Z_BUILTIN_MEMCPY(&i64, src, sizeof(i64));
234*86ee64e7SAndroid Build Coastguard Worker   return _mm_set1_epi64x(i64);
235*86ee64e7SAndroid Build Coastguard Worker }
236*86ee64e7SAndroid Build Coastguard Worker 
237*86ee64e7SAndroid Build Coastguard Worker /*
238*86ee64e7SAndroid Build Coastguard Worker  * v_load32_dup(): load *src as an unaligned 32-bit int and duplicate it in
239*86ee64e7SAndroid Build Coastguard Worker  * every 32-bit component of the 128-bit result (32-bit int splat).
240*86ee64e7SAndroid Build Coastguard Worker  */
v_load32_dup(const void * src)241*86ee64e7SAndroid Build Coastguard Worker static inline z_vec128i_t v_load32_dup(const void* src) {
242*86ee64e7SAndroid Build Coastguard Worker   int32_t i32;
243*86ee64e7SAndroid Build Coastguard Worker   Z_BUILTIN_MEMCPY(&i32, src, sizeof(i32));
244*86ee64e7SAndroid Build Coastguard Worker   return _mm_set1_epi32(i32);
245*86ee64e7SAndroid Build Coastguard Worker }
246*86ee64e7SAndroid Build Coastguard Worker 
247*86ee64e7SAndroid Build Coastguard Worker /*
248*86ee64e7SAndroid Build Coastguard Worker  * v_load16_dup(): load *src as an unaligned 16-bit int and duplicate it in
249*86ee64e7SAndroid Build Coastguard Worker  * every 16-bit component of the 128-bit result (16-bit int splat).
250*86ee64e7SAndroid Build Coastguard Worker  */
v_load16_dup(const void * src)251*86ee64e7SAndroid Build Coastguard Worker static inline z_vec128i_t v_load16_dup(const void* src) {
252*86ee64e7SAndroid Build Coastguard Worker   int16_t i16;
253*86ee64e7SAndroid Build Coastguard Worker   Z_BUILTIN_MEMCPY(&i16, src, sizeof(i16));
254*86ee64e7SAndroid Build Coastguard Worker   return _mm_set1_epi16(i16);
255*86ee64e7SAndroid Build Coastguard Worker }
256*86ee64e7SAndroid Build Coastguard Worker 
257*86ee64e7SAndroid Build Coastguard Worker /*
258*86ee64e7SAndroid Build Coastguard Worker  * v_load8_dup(): load the 8-bit int *src and duplicate it in every 8-bit
259*86ee64e7SAndroid Build Coastguard Worker  * component of the 128-bit result (8-bit int splat).
260*86ee64e7SAndroid Build Coastguard Worker  */
v_load8_dup(const void * src)261*86ee64e7SAndroid Build Coastguard Worker static inline z_vec128i_t v_load8_dup(const void* src) {
262*86ee64e7SAndroid Build Coastguard Worker   return _mm_set1_epi8(*(const char*)src);
263*86ee64e7SAndroid Build Coastguard Worker }
264*86ee64e7SAndroid Build Coastguard Worker 
265*86ee64e7SAndroid Build Coastguard Worker /*
266*86ee64e7SAndroid Build Coastguard Worker  * v_store_128(): store the 128-bit vec in a memory destination (that might
267*86ee64e7SAndroid Build Coastguard Worker  * not be 16-byte aligned) void* out.
268*86ee64e7SAndroid Build Coastguard Worker  */
v_store_128(void * out,const z_vec128i_t vec)269*86ee64e7SAndroid Build Coastguard Worker static inline void v_store_128(void* out, const z_vec128i_t vec) {
270*86ee64e7SAndroid Build Coastguard Worker   _mm_storeu_si128((__m128i*)out, vec);
271*86ee64e7SAndroid Build Coastguard Worker }
272*86ee64e7SAndroid Build Coastguard Worker #elif defined(INFLATE_CHUNK_GENERIC)
273*86ee64e7SAndroid Build Coastguard Worker /*
274*86ee64e7SAndroid Build Coastguard Worker  * Default implementations for chunk-copy functions rely on memcpy() being
275*86ee64e7SAndroid Build Coastguard Worker  * inlined by the compiler for best performance.  This is most likely to work
276*86ee64e7SAndroid Build Coastguard Worker  * as expected when the length argument is constant (as is the case here) and
277*86ee64e7SAndroid Build Coastguard Worker  * the target supports unaligned loads and stores.  Since that's not always a
278*86ee64e7SAndroid Build Coastguard Worker  * safe assumption, this may need extra compiler arguments such as
279*86ee64e7SAndroid Build Coastguard Worker  * `-mno-strict-align` or `-munaligned-access`, or the availability of
280*86ee64e7SAndroid Build Coastguard Worker  * extensions like SIMD.
281*86ee64e7SAndroid Build Coastguard Worker  */
282*86ee64e7SAndroid Build Coastguard Worker 
283*86ee64e7SAndroid Build Coastguard Worker /*
284*86ee64e7SAndroid Build Coastguard Worker  * v_load64_dup(): load *src as an unaligned 64-bit int and duplicate it in
285*86ee64e7SAndroid Build Coastguard Worker  * every 64-bit component of the 128-bit result (64-bit int splat).
286*86ee64e7SAndroid Build Coastguard Worker  */
v_load64_dup(const void * src)287*86ee64e7SAndroid Build Coastguard Worker static inline z_vec128i_t v_load64_dup(const void* src) {
288*86ee64e7SAndroid Build Coastguard Worker   int64_t in;
289*86ee64e7SAndroid Build Coastguard Worker   Z_BUILTIN_MEMCPY(&in, src, sizeof(in));
290*86ee64e7SAndroid Build Coastguard Worker   z_vec128i_t out;
291*86ee64e7SAndroid Build Coastguard Worker   for (int i = 0; i < sizeof(out); i += sizeof(in)) {
292*86ee64e7SAndroid Build Coastguard Worker     Z_BUILTIN_MEMCPY((uint8_t*)&out + i, &in, sizeof(in));
293*86ee64e7SAndroid Build Coastguard Worker   }
294*86ee64e7SAndroid Build Coastguard Worker   return out;
295*86ee64e7SAndroid Build Coastguard Worker }
296*86ee64e7SAndroid Build Coastguard Worker 
297*86ee64e7SAndroid Build Coastguard Worker /*
298*86ee64e7SAndroid Build Coastguard Worker  * v_load32_dup(): load *src as an unaligned 32-bit int and duplicate it in
299*86ee64e7SAndroid Build Coastguard Worker  * every 32-bit component of the 128-bit result (32-bit int splat).
300*86ee64e7SAndroid Build Coastguard Worker  */
v_load32_dup(const void * src)301*86ee64e7SAndroid Build Coastguard Worker static inline z_vec128i_t v_load32_dup(const void* src) {
302*86ee64e7SAndroid Build Coastguard Worker   int32_t in;
303*86ee64e7SAndroid Build Coastguard Worker   Z_BUILTIN_MEMCPY(&in, src, sizeof(in));
304*86ee64e7SAndroid Build Coastguard Worker   z_vec128i_t out;
305*86ee64e7SAndroid Build Coastguard Worker   for (int i = 0; i < sizeof(out); i += sizeof(in)) {
306*86ee64e7SAndroid Build Coastguard Worker     Z_BUILTIN_MEMCPY((uint8_t*)&out + i, &in, sizeof(in));
307*86ee64e7SAndroid Build Coastguard Worker   }
308*86ee64e7SAndroid Build Coastguard Worker   return out;
309*86ee64e7SAndroid Build Coastguard Worker }
310*86ee64e7SAndroid Build Coastguard Worker 
311*86ee64e7SAndroid Build Coastguard Worker /*
312*86ee64e7SAndroid Build Coastguard Worker  * v_load16_dup(): load *src as an unaligned 16-bit int and duplicate it in
313*86ee64e7SAndroid Build Coastguard Worker  * every 16-bit component of the 128-bit result (16-bit int splat).
314*86ee64e7SAndroid Build Coastguard Worker  */
v_load16_dup(const void * src)315*86ee64e7SAndroid Build Coastguard Worker static inline z_vec128i_t v_load16_dup(const void* src) {
316*86ee64e7SAndroid Build Coastguard Worker   int16_t in;
317*86ee64e7SAndroid Build Coastguard Worker   Z_BUILTIN_MEMCPY(&in, src, sizeof(in));
318*86ee64e7SAndroid Build Coastguard Worker   z_vec128i_t out;
319*86ee64e7SAndroid Build Coastguard Worker   for (int i = 0; i < sizeof(out); i += sizeof(in)) {
320*86ee64e7SAndroid Build Coastguard Worker     Z_BUILTIN_MEMCPY((uint8_t*)&out + i, &in, sizeof(in));
321*86ee64e7SAndroid Build Coastguard Worker   }
322*86ee64e7SAndroid Build Coastguard Worker   return out;
323*86ee64e7SAndroid Build Coastguard Worker }
324*86ee64e7SAndroid Build Coastguard Worker 
325*86ee64e7SAndroid Build Coastguard Worker /*
326*86ee64e7SAndroid Build Coastguard Worker  * v_load8_dup(): load the 8-bit int *src and duplicate it in every 8-bit
327*86ee64e7SAndroid Build Coastguard Worker  * component of the 128-bit result (8-bit int splat).
328*86ee64e7SAndroid Build Coastguard Worker  */
v_load8_dup(const void * src)329*86ee64e7SAndroid Build Coastguard Worker static inline z_vec128i_t v_load8_dup(const void* src) {
330*86ee64e7SAndroid Build Coastguard Worker   int8_t in = *(const uint8_t*)src;
331*86ee64e7SAndroid Build Coastguard Worker   z_vec128i_t out;
332*86ee64e7SAndroid Build Coastguard Worker   Z_BUILTIN_MEMSET(&out, in, sizeof(out));
333*86ee64e7SAndroid Build Coastguard Worker   return out;
334*86ee64e7SAndroid Build Coastguard Worker }
335*86ee64e7SAndroid Build Coastguard Worker 
336*86ee64e7SAndroid Build Coastguard Worker /*
337*86ee64e7SAndroid Build Coastguard Worker  * v_store_128(): store the 128-bit vec in a memory destination (that might
338*86ee64e7SAndroid Build Coastguard Worker  * not be 16-byte aligned) void* out.
339*86ee64e7SAndroid Build Coastguard Worker  */
v_store_128(void * out,const z_vec128i_t vec)340*86ee64e7SAndroid Build Coastguard Worker static inline void v_store_128(void* out, const z_vec128i_t vec) {
341*86ee64e7SAndroid Build Coastguard Worker   Z_BUILTIN_MEMCPY(out, &vec, sizeof(vec));
342*86ee64e7SAndroid Build Coastguard Worker }
343*86ee64e7SAndroid Build Coastguard Worker #endif
344*86ee64e7SAndroid Build Coastguard Worker 
345*86ee64e7SAndroid Build Coastguard Worker /*
346*86ee64e7SAndroid Build Coastguard Worker  * Perform an overlapping copy which behaves as a memset() operation, but
347*86ee64e7SAndroid Build Coastguard Worker  * supporting periods other than one, and assume that length is non-zero and
348*86ee64e7SAndroid Build Coastguard Worker  * that it's OK to overwrite at least CHUNKCOPY_CHUNK_SIZE*3 bytes of output
349*86ee64e7SAndroid Build Coastguard Worker  * even if the length is shorter than this.
350*86ee64e7SAndroid Build Coastguard Worker  */
chunkset_core(unsigned char FAR * out,unsigned period,unsigned len)351*86ee64e7SAndroid Build Coastguard Worker static inline unsigned char FAR* chunkset_core(
352*86ee64e7SAndroid Build Coastguard Worker     unsigned char FAR* out,
353*86ee64e7SAndroid Build Coastguard Worker     unsigned period,
354*86ee64e7SAndroid Build Coastguard Worker     unsigned len) {
355*86ee64e7SAndroid Build Coastguard Worker   z_vec128i_t v;
356*86ee64e7SAndroid Build Coastguard Worker   const int bump = ((len - 1) % sizeof(v)) + 1;
357*86ee64e7SAndroid Build Coastguard Worker 
358*86ee64e7SAndroid Build Coastguard Worker   switch (period) {
359*86ee64e7SAndroid Build Coastguard Worker     case 1:
360*86ee64e7SAndroid Build Coastguard Worker       v = v_load8_dup(out - 1);
361*86ee64e7SAndroid Build Coastguard Worker       v_store_128(out, v);
362*86ee64e7SAndroid Build Coastguard Worker       out += bump;
363*86ee64e7SAndroid Build Coastguard Worker       len -= bump;
364*86ee64e7SAndroid Build Coastguard Worker       while (len > 0) {
365*86ee64e7SAndroid Build Coastguard Worker         v_store_128(out, v);
366*86ee64e7SAndroid Build Coastguard Worker         out += sizeof(v);
367*86ee64e7SAndroid Build Coastguard Worker         len -= sizeof(v);
368*86ee64e7SAndroid Build Coastguard Worker       }
369*86ee64e7SAndroid Build Coastguard Worker       return out;
370*86ee64e7SAndroid Build Coastguard Worker     case 2:
371*86ee64e7SAndroid Build Coastguard Worker       v = v_load16_dup(out - 2);
372*86ee64e7SAndroid Build Coastguard Worker       v_store_128(out, v);
373*86ee64e7SAndroid Build Coastguard Worker       out += bump;
374*86ee64e7SAndroid Build Coastguard Worker       len -= bump;
375*86ee64e7SAndroid Build Coastguard Worker       if (len > 0) {
376*86ee64e7SAndroid Build Coastguard Worker         v = v_load16_dup(out - 2);
377*86ee64e7SAndroid Build Coastguard Worker         do {
378*86ee64e7SAndroid Build Coastguard Worker           v_store_128(out, v);
379*86ee64e7SAndroid Build Coastguard Worker           out += sizeof(v);
380*86ee64e7SAndroid Build Coastguard Worker           len -= sizeof(v);
381*86ee64e7SAndroid Build Coastguard Worker         } while (len > 0);
382*86ee64e7SAndroid Build Coastguard Worker       }
383*86ee64e7SAndroid Build Coastguard Worker       return out;
384*86ee64e7SAndroid Build Coastguard Worker     case 4:
385*86ee64e7SAndroid Build Coastguard Worker       v = v_load32_dup(out - 4);
386*86ee64e7SAndroid Build Coastguard Worker       v_store_128(out, v);
387*86ee64e7SAndroid Build Coastguard Worker       out += bump;
388*86ee64e7SAndroid Build Coastguard Worker       len -= bump;
389*86ee64e7SAndroid Build Coastguard Worker       if (len > 0) {
390*86ee64e7SAndroid Build Coastguard Worker         v = v_load32_dup(out - 4);
391*86ee64e7SAndroid Build Coastguard Worker         do {
392*86ee64e7SAndroid Build Coastguard Worker           v_store_128(out, v);
393*86ee64e7SAndroid Build Coastguard Worker           out += sizeof(v);
394*86ee64e7SAndroid Build Coastguard Worker           len -= sizeof(v);
395*86ee64e7SAndroid Build Coastguard Worker         } while (len > 0);
396*86ee64e7SAndroid Build Coastguard Worker       }
397*86ee64e7SAndroid Build Coastguard Worker       return out;
398*86ee64e7SAndroid Build Coastguard Worker     case 8:
399*86ee64e7SAndroid Build Coastguard Worker       v = v_load64_dup(out - 8);
400*86ee64e7SAndroid Build Coastguard Worker       v_store_128(out, v);
401*86ee64e7SAndroid Build Coastguard Worker       out += bump;
402*86ee64e7SAndroid Build Coastguard Worker       len -= bump;
403*86ee64e7SAndroid Build Coastguard Worker       if (len > 0) {
404*86ee64e7SAndroid Build Coastguard Worker         v = v_load64_dup(out - 8);
405*86ee64e7SAndroid Build Coastguard Worker         do {
406*86ee64e7SAndroid Build Coastguard Worker           v_store_128(out, v);
407*86ee64e7SAndroid Build Coastguard Worker           out += sizeof(v);
408*86ee64e7SAndroid Build Coastguard Worker           len -= sizeof(v);
409*86ee64e7SAndroid Build Coastguard Worker         } while (len > 0);
410*86ee64e7SAndroid Build Coastguard Worker       }
411*86ee64e7SAndroid Build Coastguard Worker       return out;
412*86ee64e7SAndroid Build Coastguard Worker   }
413*86ee64e7SAndroid Build Coastguard Worker   out = chunkunroll_relaxed(out, &period, &len);
414*86ee64e7SAndroid Build Coastguard Worker   return chunkcopy_core(out, out - period, len);
415*86ee64e7SAndroid Build Coastguard Worker }
416*86ee64e7SAndroid Build Coastguard Worker 
417*86ee64e7SAndroid Build Coastguard Worker /*
418*86ee64e7SAndroid Build Coastguard Worker  * Perform a memcpy-like operation, but assume that length is non-zero and that
419*86ee64e7SAndroid Build Coastguard Worker  * it's OK to overwrite at least CHUNKCOPY_CHUNK_SIZE bytes of output even if
420*86ee64e7SAndroid Build Coastguard Worker  * the length is shorter than this.
421*86ee64e7SAndroid Build Coastguard Worker  *
422*86ee64e7SAndroid Build Coastguard Worker  * Unlike chunkcopy_core() above, no guarantee is made regarding the behaviour
423*86ee64e7SAndroid Build Coastguard Worker  * of overlapping buffers, regardless of the distance between the pointers.
424*86ee64e7SAndroid Build Coastguard Worker  * This is reflected in the `restrict`-qualified pointers, allowing the
425*86ee64e7SAndroid Build Coastguard Worker  * compiler to re-order loads and stores.
426*86ee64e7SAndroid Build Coastguard Worker  */
chunkcopy_relaxed(unsigned char FAR * Z_RESTRICT out,const unsigned char FAR * Z_RESTRICT from,unsigned len)427*86ee64e7SAndroid Build Coastguard Worker static inline unsigned char FAR* chunkcopy_relaxed(
428*86ee64e7SAndroid Build Coastguard Worker     unsigned char FAR* Z_RESTRICT out,
429*86ee64e7SAndroid Build Coastguard Worker     const unsigned char FAR* Z_RESTRICT from,
430*86ee64e7SAndroid Build Coastguard Worker     unsigned len) {
431*86ee64e7SAndroid Build Coastguard Worker   Assert((uintptr_t)out - (uintptr_t)from >= len,
432*86ee64e7SAndroid Build Coastguard Worker          "invalid restrict in chunkcopy_relaxed");
433*86ee64e7SAndroid Build Coastguard Worker   Assert((uintptr_t)from - (uintptr_t)out >= len,
434*86ee64e7SAndroid Build Coastguard Worker          "invalid restrict in chunkcopy_relaxed");
435*86ee64e7SAndroid Build Coastguard Worker   return chunkcopy_core(out, from, len);
436*86ee64e7SAndroid Build Coastguard Worker }
437*86ee64e7SAndroid Build Coastguard Worker 
438*86ee64e7SAndroid Build Coastguard Worker /*
439*86ee64e7SAndroid Build Coastguard Worker  * Like chunkcopy_relaxed(), but avoid writing beyond of legal output.
440*86ee64e7SAndroid Build Coastguard Worker  *
441*86ee64e7SAndroid Build Coastguard Worker  * Unlike chunkcopy_core_safe() above, no guarantee is made regarding the
442*86ee64e7SAndroid Build Coastguard Worker  * behaviour of overlapping buffers, regardless of the distance between the
443*86ee64e7SAndroid Build Coastguard Worker  * pointers.  This is reflected in the `restrict`-qualified pointers, allowing
444*86ee64e7SAndroid Build Coastguard Worker  * the compiler to re-order loads and stores.
445*86ee64e7SAndroid Build Coastguard Worker  *
446*86ee64e7SAndroid Build Coastguard Worker  * Accepts an additional pointer to the end of safe output.  A generic safe
447*86ee64e7SAndroid Build Coastguard Worker  * copy would use (out + len), but it's normally the case that the end of the
448*86ee64e7SAndroid Build Coastguard Worker  * output buffer is beyond the end of the current copy, and this can still be
449*86ee64e7SAndroid Build Coastguard Worker  * exploited.
450*86ee64e7SAndroid Build Coastguard Worker  */
chunkcopy_safe(unsigned char FAR * out,const unsigned char FAR * Z_RESTRICT from,unsigned len,unsigned char FAR * limit)451*86ee64e7SAndroid Build Coastguard Worker static inline unsigned char FAR* chunkcopy_safe(
452*86ee64e7SAndroid Build Coastguard Worker     unsigned char FAR* out,
453*86ee64e7SAndroid Build Coastguard Worker     const unsigned char FAR* Z_RESTRICT from,
454*86ee64e7SAndroid Build Coastguard Worker     unsigned len,
455*86ee64e7SAndroid Build Coastguard Worker     unsigned char FAR* limit) {
456*86ee64e7SAndroid Build Coastguard Worker   Assert(out + len <= limit, "chunk copy exceeds safety limit");
457*86ee64e7SAndroid Build Coastguard Worker   Assert((uintptr_t)out - (uintptr_t)from >= len,
458*86ee64e7SAndroid Build Coastguard Worker          "invalid restrict in chunkcopy_safe");
459*86ee64e7SAndroid Build Coastguard Worker   Assert((uintptr_t)from - (uintptr_t)out >= len,
460*86ee64e7SAndroid Build Coastguard Worker          "invalid restrict in chunkcopy_safe");
461*86ee64e7SAndroid Build Coastguard Worker 
462*86ee64e7SAndroid Build Coastguard Worker   return chunkcopy_core_safe(out, from, len, limit);
463*86ee64e7SAndroid Build Coastguard Worker }
464*86ee64e7SAndroid Build Coastguard Worker 
465*86ee64e7SAndroid Build Coastguard Worker /*
466*86ee64e7SAndroid Build Coastguard Worker  * Perform chunky copy within the same buffer, where the source and destination
467*86ee64e7SAndroid Build Coastguard Worker  * may potentially overlap.
468*86ee64e7SAndroid Build Coastguard Worker  *
469*86ee64e7SAndroid Build Coastguard Worker  * Assumes that len > 0 on entry, and that it's safe to write at least
470*86ee64e7SAndroid Build Coastguard Worker  * CHUNKCOPY_CHUNK_SIZE*3 bytes to the output.
471*86ee64e7SAndroid Build Coastguard Worker  */
chunkcopy_lapped_relaxed(unsigned char FAR * out,unsigned dist,unsigned len)472*86ee64e7SAndroid Build Coastguard Worker static inline unsigned char FAR* chunkcopy_lapped_relaxed(
473*86ee64e7SAndroid Build Coastguard Worker     unsigned char FAR* out,
474*86ee64e7SAndroid Build Coastguard Worker     unsigned dist,
475*86ee64e7SAndroid Build Coastguard Worker     unsigned len) {
476*86ee64e7SAndroid Build Coastguard Worker   if (dist < len && dist < CHUNKCOPY_CHUNK_SIZE) {
477*86ee64e7SAndroid Build Coastguard Worker     return chunkset_core(out, dist, len);
478*86ee64e7SAndroid Build Coastguard Worker   }
479*86ee64e7SAndroid Build Coastguard Worker   return chunkcopy_core(out, out - dist, len);
480*86ee64e7SAndroid Build Coastguard Worker }
481*86ee64e7SAndroid Build Coastguard Worker 
482*86ee64e7SAndroid Build Coastguard Worker /*
483*86ee64e7SAndroid Build Coastguard Worker  * Behave like chunkcopy_lapped_relaxed(), but avoid writing beyond of legal
484*86ee64e7SAndroid Build Coastguard Worker  * output.
485*86ee64e7SAndroid Build Coastguard Worker  *
486*86ee64e7SAndroid Build Coastguard Worker  * Accepts an additional pointer to the end of safe output.  A generic safe
487*86ee64e7SAndroid Build Coastguard Worker  * copy would use (out + len), but it's normally the case that the end of the
488*86ee64e7SAndroid Build Coastguard Worker  * output buffer is beyond the end of the current copy, and this can still be
489*86ee64e7SAndroid Build Coastguard Worker  * exploited.
490*86ee64e7SAndroid Build Coastguard Worker  */
chunkcopy_lapped_safe(unsigned char FAR * out,unsigned dist,unsigned len,unsigned char FAR * limit)491*86ee64e7SAndroid Build Coastguard Worker static inline unsigned char FAR* chunkcopy_lapped_safe(
492*86ee64e7SAndroid Build Coastguard Worker     unsigned char FAR* out,
493*86ee64e7SAndroid Build Coastguard Worker     unsigned dist,
494*86ee64e7SAndroid Build Coastguard Worker     unsigned len,
495*86ee64e7SAndroid Build Coastguard Worker     unsigned char FAR* limit) {
496*86ee64e7SAndroid Build Coastguard Worker   Assert(out + len <= limit, "chunk copy exceeds safety limit");
497*86ee64e7SAndroid Build Coastguard Worker   if ((limit - out) < (ptrdiff_t)(3 * CHUNKCOPY_CHUNK_SIZE)) {
498*86ee64e7SAndroid Build Coastguard Worker     /* TODO(cavalcantii): try harder to optimise this */
499*86ee64e7SAndroid Build Coastguard Worker     while (len-- > 0) {
500*86ee64e7SAndroid Build Coastguard Worker       *out = *(out - dist);
501*86ee64e7SAndroid Build Coastguard Worker       out++;
502*86ee64e7SAndroid Build Coastguard Worker     }
503*86ee64e7SAndroid Build Coastguard Worker     return out;
504*86ee64e7SAndroid Build Coastguard Worker   }
505*86ee64e7SAndroid Build Coastguard Worker   return chunkcopy_lapped_relaxed(out, dist, len);
506*86ee64e7SAndroid Build Coastguard Worker }
507*86ee64e7SAndroid Build Coastguard Worker 
508*86ee64e7SAndroid Build Coastguard Worker /* TODO(cavalcanti): see crbug.com/1110083. */
chunkcopy_safe_ugly(unsigned char FAR * out,unsigned dist,unsigned len,unsigned char FAR * limit)509*86ee64e7SAndroid Build Coastguard Worker static inline unsigned char FAR* chunkcopy_safe_ugly(unsigned char FAR* out,
510*86ee64e7SAndroid Build Coastguard Worker                                                      unsigned dist,
511*86ee64e7SAndroid Build Coastguard Worker                                                      unsigned len,
512*86ee64e7SAndroid Build Coastguard Worker                                                      unsigned char FAR* limit) {
513*86ee64e7SAndroid Build Coastguard Worker #if defined(__GNUC__) && !defined(__clang__)
514*86ee64e7SAndroid Build Coastguard Worker   /* Speed is the same as using chunkcopy_safe
515*86ee64e7SAndroid Build Coastguard Worker      w/ GCC on ARM (tested gcc 6.3 and 7.5) and avoids
516*86ee64e7SAndroid Build Coastguard Worker      undefined behavior.
517*86ee64e7SAndroid Build Coastguard Worker   */
518*86ee64e7SAndroid Build Coastguard Worker   return chunkcopy_core_safe(out, out - dist, len, limit);
519*86ee64e7SAndroid Build Coastguard Worker #elif defined(__clang__) && defined(ARMV8_OS_ANDROID) && !defined(__aarch64__)
520*86ee64e7SAndroid Build Coastguard Worker   /* Seems to perform better on 32bit (i.e. Android). */
521*86ee64e7SAndroid Build Coastguard Worker   return chunkcopy_core_safe(out, out - dist, len, limit);
522*86ee64e7SAndroid Build Coastguard Worker #else
523*86ee64e7SAndroid Build Coastguard Worker   /* Seems to perform better on 64bit. */
524*86ee64e7SAndroid Build Coastguard Worker   return chunkcopy_lapped_safe(out, dist, len, limit);
525*86ee64e7SAndroid Build Coastguard Worker #endif
526*86ee64e7SAndroid Build Coastguard Worker }
527*86ee64e7SAndroid Build Coastguard Worker 
528*86ee64e7SAndroid Build Coastguard Worker /*
529*86ee64e7SAndroid Build Coastguard Worker  * The chunk-copy code above deals with writing the decoded DEFLATE data to
530*86ee64e7SAndroid Build Coastguard Worker  * the output with SIMD methods to increase decode speed. Reading the input
531*86ee64e7SAndroid Build Coastguard Worker  * to the DEFLATE decoder with a wide, SIMD method can also increase decode
532*86ee64e7SAndroid Build Coastguard Worker  * speed. This option is supported on little endian machines, and reads the
533*86ee64e7SAndroid Build Coastguard Worker  * input data in 64-bit (8 byte) chunks.
534*86ee64e7SAndroid Build Coastguard Worker  */
535*86ee64e7SAndroid Build Coastguard Worker 
#ifndef INFLATE_CHUNK_READ_64LE
/*
 * Without wide input reading, buffer the input bits using zlib's default
 * input buffer type.
 */
typedef unsigned long inflate_holder_t;

#else
/*
 * With wide input reading, buffer the input in a uint64_t (8 bytes).
 */
typedef uint64_t inflate_holder_t;

/*
 * Load sizeof(inflate_holder_t) bytes starting at `in` and return them as an
 * inflate_holder_t.  Going through the memcpy builtin asks the compiler for a
 * wide, unaligned load using a machine instruction appropriate for the
 * uint64_t type.
 */
static inline inflate_holder_t read64le(const unsigned char FAR *in) {
    inflate_holder_t word;

    Z_BUILTIN_MEMCPY(&word, in, sizeof word);
    return word;
}
#endif /* INFLATE_CHUNK_READ_64LE */
558*86ee64e7SAndroid Build Coastguard Worker 
/*
 * Remove the helper macros defined by this header so that they do not leak
 * into the files that include it.  Z_BUILTIN_MEMSET is defined together with
 * Z_BUILTIN_MEMCPY at the top of the header and is cleaned up with it.
 */
#undef Z_STATIC_ASSERT
#undef Z_RESTRICT
#undef Z_BUILTIN_MEMCPY
#undef Z_BUILTIN_MEMSET
#undef Z_DISABLE_MSAN
563*86ee64e7SAndroid Build Coastguard Worker 
564*86ee64e7SAndroid Build Coastguard Worker #endif /* CHUNKCOPY_H */
565