/* chunkcopy.h -- fast chunk copy and set operations
 * Copyright (C) 2017 ARM, Inc.
 * Copyright 2017 The Chromium Authors
 * Use of this source code is governed by a BSD-style license that can be
 * found in the Chromium source repository LICENSE file.
 */

#ifndef CHUNKCOPY_H
#define CHUNKCOPY_H

#include <stdint.h>
#include "zutil.h"

#define Z_STATIC_ASSERT(name, assert) typedef char name[(assert) ? 1 : -1]

#if __STDC_VERSION__ >= 199901L
#define Z_RESTRICT restrict
#else
#define Z_RESTRICT
#endif

#if defined(__clang__) || defined(__GNUC__) || defined(__llvm__)
#define Z_BUILTIN_MEMCPY __builtin_memcpy
#define Z_BUILTIN_MEMSET __builtin_memset
#else
#define Z_BUILTIN_MEMCPY zmemcpy
#define Z_BUILTIN_MEMSET zmemset
#endif

#if defined(INFLATE_CHUNK_SIMD_NEON)
#include <arm_neon.h>
typedef uint8x16_t z_vec128i_t;
#elif defined(INFLATE_CHUNK_SIMD_SSE2)
#include <emmintrin.h>
typedef __m128i z_vec128i_t;
#elif defined(INFLATE_CHUNK_GENERIC)
typedef struct { uint8_t x[16]; } z_vec128i_t;
#else
#error chunkcopy.h inflate chunk SIMD is not defined for your build target
#endif

/*
 * Suppress MSan errors about copying uninitialized bytes (crbug.com/1376033).
 */
#define Z_DISABLE_MSAN
#if defined(__has_feature)
  #if __has_feature(memory_sanitizer)
    #undef Z_DISABLE_MSAN
    #define Z_DISABLE_MSAN __attribute__((no_sanitize("memory")))
  #endif
#endif

/*
 * chunk copy type: the z_vec128i_t type size should be exactly 128 bits
 * and equal to CHUNKCOPY_CHUNK_SIZE.
 */
#define CHUNKCOPY_CHUNK_SIZE sizeof(z_vec128i_t)

Z_STATIC_ASSERT(vector_128_bits_wide,
                CHUNKCOPY_CHUNK_SIZE == sizeof(int8_t) * 16);

/*
 * Ask the compiler to perform a wide, unaligned load with a machine
 * instruction appropriate for the z_vec128i_t type.
 */
static inline z_vec128i_t loadchunk(
    const unsigned char FAR* s) Z_DISABLE_MSAN {
  z_vec128i_t v;
  Z_BUILTIN_MEMCPY(&v, s, sizeof(v));
  return v;
}

/*
 * Ask the compiler to perform a wide, unaligned store with a machine
 * instruction appropriate for the z_vec128i_t type.
 */
static inline void storechunk(
    unsigned char FAR* d,
    const z_vec128i_t v) {
  Z_BUILTIN_MEMCPY(d, &v, sizeof(v));
}

/*
 * Perform a memcpy-like operation, assuming that length is non-zero and that
 * it's OK to overwrite at least CHUNKCOPY_CHUNK_SIZE bytes of output even if
 * the length is shorter than this.
 *
 * It also guarantees that it will properly unroll the data if the distance
 * between `out` and `from` is at least CHUNKCOPY_CHUNK_SIZE, which we rely on
 * in chunkcopy_relaxed().
 *
 * Aside from better memory bus utilisation, this means that short copies
 * (CHUNKCOPY_CHUNK_SIZE bytes or fewer) will fall straight through the loop
 * without iteration, which will hopefully make the branch prediction more
 * reliable.
 */
static inline unsigned char FAR* chunkcopy_core(
    unsigned char FAR* out,
    const unsigned char FAR* from,
    unsigned len) Z_DISABLE_MSAN {
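  /*
   * `bump` is the size of the first, possibly partial, chunk: it lies in
   * [1, CHUNKCOPY_CHUNK_SIZE], and the length remaining after it is an exact
   * multiple of CHUNKCOPY_CHUNK_SIZE (possibly zero).
   */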
  const int bump = (--len % CHUNKCOPY_CHUNK_SIZE) + 1;
  storechunk(out, loadchunk(from));
  out += bump;
  from += bump;
  len /= CHUNKCOPY_CHUNK_SIZE;
  while (len-- > 0) {
    storechunk(out, loadchunk(from));
    out += CHUNKCOPY_CHUNK_SIZE;
    from += CHUNKCOPY_CHUNK_SIZE;
  }
  return out;
}

/*
 * Like chunkcopy_core(), but avoid writing beyond the end of legal output.
 *
 * Accepts an additional pointer to the end of safe output.  A generic safe
 * copy would use (out + len), but it's normally the case that the end of the
 * output buffer is beyond the end of the current copy, and this can still be
 * exploited.
 */
static inline unsigned char FAR* chunkcopy_core_safe(
    unsigned char FAR* out,
    const unsigned char FAR* from,
    unsigned len,
    unsigned char FAR* limit) {
  Assert(out + len <= limit, "chunk copy exceeds safety limit");
  if ((limit - out) < (ptrdiff_t)CHUNKCOPY_CHUNK_SIZE) {
    const unsigned char FAR* Z_RESTRICT rfrom = from;
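    /*
     * These asserts rely on unsigned wrap-around: together they check that
     * `out` and `from` are at least `len` bytes apart in either direction,
     * i.e. the ranges touched below cannot overlap, which is the promise the
     * Z_RESTRICT qualifier on `rfrom` makes to the compiler.
     */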
    Assert((uintptr_t)out - (uintptr_t)from >= len,
           "invalid restrict in chunkcopy_core_safe");
    Assert((uintptr_t)from - (uintptr_t)out >= len,
           "invalid restrict in chunkcopy_core_safe");
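    /*
     * Fewer than CHUNKCOPY_CHUNK_SIZE bytes remain before `limit`, so a wide
     * store could run past it.  Here len <= limit - out, so len is at most
     * CHUNKCOPY_CHUNK_SIZE - 1 and its set bits select which of the 8/4/2/1
     * byte copies below run; nothing is written beyond out + len.
     */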
    if (len & 8) {
      Z_BUILTIN_MEMCPY(out, rfrom, 8);
      out += 8;
      rfrom += 8;
    }
    if (len & 4) {
      Z_BUILTIN_MEMCPY(out, rfrom, 4);
      out += 4;
      rfrom += 4;
    }
    if (len & 2) {
      Z_BUILTIN_MEMCPY(out, rfrom, 2);
      out += 2;
      rfrom += 2;
    }
    if (len & 1) {
      *out++ = *rfrom++;
    }
    return out;
  }
  return chunkcopy_core(out, from, len);
}

/*
 * Perform short copies until distance can be rewritten as being at least
 * CHUNKCOPY_CHUNK_SIZE.
 *
 * Assumes it's OK to overwrite at least the first 2*CHUNKCOPY_CHUNK_SIZE
 * bytes of output even if the copy is shorter than this.  This assumption
 * holds within zlib inflate_fast(), which starts every iteration with at
 * least 258 bytes of output space available (258 being the maximum length
 * output from a single token; see inffast.c).
 */
static inline unsigned char FAR* chunkunroll_relaxed(
    unsigned char FAR* out,
    unsigned FAR* dist,
    unsigned FAR* len) Z_DISABLE_MSAN {
  const unsigned char FAR* from = out - *dist;
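  /*
   * Each iteration writes one whole period (*dist bytes, stored as a full
   * chunk, which the overwrite allowance above permits), so the output then
   * ends with two copies of the period and the usable distance doubles.
   */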
  while (*dist < *len && *dist < CHUNKCOPY_CHUNK_SIZE) {
    storechunk(out, loadchunk(from));
    out += *dist;
    *len -= *dist;
    *dist += *dist;
  }
  return out;
}

#if defined(INFLATE_CHUNK_SIMD_NEON)
/*
 * v_load64_dup(): load *src as an unaligned 64-bit int and duplicate it in
 * every 64-bit component of the 128-bit result (64-bit int splat).
 */
static inline z_vec128i_t v_load64_dup(const void* src) {
  return vcombine_u8(vld1_u8(src), vld1_u8(src));
}

/*
 * v_load32_dup(): load *src as an unaligned 32-bit int and duplicate it in
 * every 32-bit component of the 128-bit result (32-bit int splat).
 */
static inline z_vec128i_t v_load32_dup(const void* src) {
  int32_t i32;
  Z_BUILTIN_MEMCPY(&i32, src, sizeof(i32));
  return vreinterpretq_u8_s32(vdupq_n_s32(i32));
}

/*
 * v_load16_dup(): load *src as an unaligned 16-bit int and duplicate it in
 * every 16-bit component of the 128-bit result (16-bit int splat).
 */
static inline z_vec128i_t v_load16_dup(const void* src) {
  int16_t i16;
  Z_BUILTIN_MEMCPY(&i16, src, sizeof(i16));
  return vreinterpretq_u8_s16(vdupq_n_s16(i16));
}

/*
 * v_load8_dup(): load the 8-bit int *src and duplicate it in every 8-bit
 * component of the 128-bit result (8-bit int splat).
 */
static inline z_vec128i_t v_load8_dup(const void* src) {
  return vld1q_dup_u8((const uint8_t*)src);
}

/*
 * v_store_128(): store the 128-bit vec in a memory destination (that might
 * not be 16-byte aligned) void* out.
 */
static inline void v_store_128(void* out, const z_vec128i_t vec) {
  vst1q_u8(out, vec);
}

#elif defined(INFLATE_CHUNK_SIMD_SSE2)
/*
 * v_load64_dup(): load *src as an unaligned 64-bit int and duplicate it in
 * every 64-bit component of the 128-bit result (64-bit int splat).
 */
static inline z_vec128i_t v_load64_dup(const void* src) {
  int64_t i64;
  Z_BUILTIN_MEMCPY(&i64, src, sizeof(i64));
  return _mm_set1_epi64x(i64);
}

/*
 * v_load32_dup(): load *src as an unaligned 32-bit int and duplicate it in
 * every 32-bit component of the 128-bit result (32-bit int splat).
 */
static inline z_vec128i_t v_load32_dup(const void* src) {
  int32_t i32;
  Z_BUILTIN_MEMCPY(&i32, src, sizeof(i32));
  return _mm_set1_epi32(i32);
}

/*
 * v_load16_dup(): load *src as an unaligned 16-bit int and duplicate it in
 * every 16-bit component of the 128-bit result (16-bit int splat).
 */
static inline z_vec128i_t v_load16_dup(const void* src) {
  int16_t i16;
  Z_BUILTIN_MEMCPY(&i16, src, sizeof(i16));
  return _mm_set1_epi16(i16);
}

/*
 * v_load8_dup(): load the 8-bit int *src and duplicate it in every 8-bit
 * component of the 128-bit result (8-bit int splat).
 */
static inline z_vec128i_t v_load8_dup(const void* src) {
  return _mm_set1_epi8(*(const char*)src);
}

/*
 * v_store_128(): store the 128-bit vec in a memory destination (that might
 * not be 16-byte aligned) void* out.
 */
static inline void v_store_128(void* out, const z_vec128i_t vec) {
  _mm_storeu_si128((__m128i*)out, vec);
}
#elif defined(INFLATE_CHUNK_GENERIC)
/*
 * Default implementations for chunk-copy functions rely on memcpy() being
 * inlined by the compiler for best performance.  This is most likely to work
 * as expected when the length argument is constant (as is the case here) and
 * the target supports unaligned loads and stores.  Since that's not always a
 * safe assumption, this may need extra compiler arguments such as
 * `-mno-strict-align` or `-munaligned-access`, or the availability of
 * extensions like SIMD.
 */

/*
 * v_load64_dup(): load *src as an unaligned 64-bit int and duplicate it in
 * every 64-bit component of the 128-bit result (64-bit int splat).
 */
static inline z_vec128i_t v_load64_dup(const void* src) {
  int64_t in;
  Z_BUILTIN_MEMCPY(&in, src, sizeof(in));
  z_vec128i_t out;
  for (int i = 0; i < sizeof(out); i += sizeof(in)) {
    Z_BUILTIN_MEMCPY((uint8_t*)&out + i, &in, sizeof(in));
  }
  return out;
}

/*
 * v_load32_dup(): load *src as an unaligned 32-bit int and duplicate it in
 * every 32-bit component of the 128-bit result (32-bit int splat).
 */
static inline z_vec128i_t v_load32_dup(const void* src) {
  int32_t in;
  Z_BUILTIN_MEMCPY(&in, src, sizeof(in));
  z_vec128i_t out;
  for (int i = 0; i < sizeof(out); i += sizeof(in)) {
    Z_BUILTIN_MEMCPY((uint8_t*)&out + i, &in, sizeof(in));
  }
  return out;
}

/*
 * v_load16_dup(): load *src as an unaligned 16-bit int and duplicate it in
 * every 16-bit component of the 128-bit result (16-bit int splat).
 */
static inline z_vec128i_t v_load16_dup(const void* src) {
  int16_t in;
  Z_BUILTIN_MEMCPY(&in, src, sizeof(in));
  z_vec128i_t out;
  for (int i = 0; i < sizeof(out); i += sizeof(in)) {
    Z_BUILTIN_MEMCPY((uint8_t*)&out + i, &in, sizeof(in));
  }
  return out;
}

/*
 * v_load8_dup(): load the 8-bit int *src and duplicate it in every 8-bit
 * component of the 128-bit result (8-bit int splat).
 */
static inline z_vec128i_t v_load8_dup(const void* src) {
  int8_t in = *(const uint8_t*)src;
  z_vec128i_t out;
  Z_BUILTIN_MEMSET(&out, in, sizeof(out));
  return out;
}

/*
 * v_store_128(): store the 128-bit vec in a memory destination (that might
 * not be 16-byte aligned) void* out.
 */
static inline void v_store_128(void* out, const z_vec128i_t vec) {
  Z_BUILTIN_MEMCPY(out, &vec, sizeof(vec));
}
#endif

/*
 * Perform an overlapping copy that behaves like a memset() operation, but
 * supports periods other than one.  Assumes that length is non-zero and that
 * it's OK to overwrite at least CHUNKCOPY_CHUNK_SIZE*3 bytes of output even
 * if the length is shorter than this.
 */
static inline unsigned char FAR* chunkset_core(
    unsigned char FAR* out,
    unsigned period,
    unsigned len) {
  z_vec128i_t v;
  const int bump = ((len - 1) % sizeof(v)) + 1;

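  /*
   * For each supported period, build a 128-bit splat of the pattern, store
   * one chunk, then advance by `bump` (the first, possibly partial, chunk).
   * Since `bump` need not be a multiple of the period, for periods greater
   * than one the splat is re-read from `out - period` so that it is phased
   * correctly relative to the new `out` before the remaining whole chunks
   * are stored.
   */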
  switch (period) {
    case 1:
      v = v_load8_dup(out - 1);
      v_store_128(out, v);
      out += bump;
      len -= bump;
      while (len > 0) {
        v_store_128(out, v);
        out += sizeof(v);
        len -= sizeof(v);
      }
      return out;
    case 2:
      v = v_load16_dup(out - 2);
      v_store_128(out, v);
      out += bump;
      len -= bump;
      if (len > 0) {
        v = v_load16_dup(out - 2);
        do {
          v_store_128(out, v);
          out += sizeof(v);
          len -= sizeof(v);
        } while (len > 0);
      }
      return out;
    case 4:
      v = v_load32_dup(out - 4);
      v_store_128(out, v);
      out += bump;
      len -= bump;
      if (len > 0) {
        v = v_load32_dup(out - 4);
        do {
          v_store_128(out, v);
          out += sizeof(v);
          len -= sizeof(v);
        } while (len > 0);
      }
      return out;
    case 8:
      v = v_load64_dup(out - 8);
      v_store_128(out, v);
      out += bump;
      len -= bump;
      if (len > 0) {
        v = v_load64_dup(out - 8);
        do {
          v_store_128(out, v);
          out += sizeof(v);
          len -= sizeof(v);
        } while (len > 0);
      }
      return out;
  }
  out = chunkunroll_relaxed(out, &period, &len);
  return chunkcopy_core(out, out - period, len);
}

/*
 * Perform a memcpy-like operation, but assume that length is non-zero and that
 * it's OK to overwrite at least CHUNKCOPY_CHUNK_SIZE bytes of output even if
 * the length is shorter than this.
 *
 * Unlike chunkcopy_core() above, no guarantee is made regarding the behaviour
 * of overlapping buffers, regardless of the distance between the pointers.
 * This is reflected in the `restrict`-qualified pointers, allowing the
 * compiler to re-order loads and stores.
 */
static inline unsigned char FAR* chunkcopy_relaxed(
    unsigned char FAR* Z_RESTRICT out,
    const unsigned char FAR* Z_RESTRICT from,
    unsigned len) {
  Assert((uintptr_t)out - (uintptr_t)from >= len,
         "invalid restrict in chunkcopy_relaxed");
  Assert((uintptr_t)from - (uintptr_t)out >= len,
         "invalid restrict in chunkcopy_relaxed");
  return chunkcopy_core(out, from, len);
}

/*
 * Like chunkcopy_relaxed(), but avoid writing beyond the end of legal output.
 *
 * Unlike chunkcopy_core_safe() above, no guarantee is made regarding the
 * behaviour of overlapping buffers, regardless of the distance between the
 * pointers.  This is reflected in the `restrict`-qualified pointers, allowing
 * the compiler to re-order loads and stores.
 *
 * Accepts an additional pointer to the end of safe output.  A generic safe
 * copy would use (out + len), but it's normally the case that the end of the
 * output buffer is beyond the end of the current copy, and this can still be
 * exploited.
 */
static inline unsigned char FAR* chunkcopy_safe(
    unsigned char FAR* out,
    const unsigned char FAR* Z_RESTRICT from,
    unsigned len,
    unsigned char FAR* limit) {
  Assert(out + len <= limit, "chunk copy exceeds safety limit");
  Assert((uintptr_t)out - (uintptr_t)from >= len,
         "invalid restrict in chunkcopy_safe");
  Assert((uintptr_t)from - (uintptr_t)out >= len,
         "invalid restrict in chunkcopy_safe");

  return chunkcopy_core_safe(out, from, len, limit);
}

/*
 * Perform chunky copy within the same buffer, where the source and destination
 * may potentially overlap.
 *
 * Assumes that len > 0 on entry, and that it's safe to write at least
 * CHUNKCOPY_CHUNK_SIZE*3 bytes to the output.
 */
static inline unsigned char FAR* chunkcopy_lapped_relaxed(
    unsigned char FAR* out,
    unsigned dist,
    unsigned len) {
  if (dist < len && dist < CHUNKCOPY_CHUNK_SIZE) {
    return chunkset_core(out, dist, len);
  }
  return chunkcopy_core(out, out - dist, len);
}

/*
 * Behave like chunkcopy_lapped_relaxed(), but avoid writing beyond the end of
 * legal output.
 *
 * Accepts an additional pointer to the end of safe output.  A generic safe
 * copy would use (out + len), but it's normally the case that the end of the
 * output buffer is beyond the end of the current copy, and this can still be
 * exploited.
 */
static inline unsigned char FAR* chunkcopy_lapped_safe(
    unsigned char FAR* out,
    unsigned dist,
    unsigned len,
    unsigned char FAR* limit) {
  Assert(out + len <= limit, "chunk copy exceeds safety limit");
  if ((limit - out) < (ptrdiff_t)(3 * CHUNKCOPY_CHUNK_SIZE)) {
    /* TODO(cavalcantii): try harder to optimise this */
    while (len-- > 0) {
      *out = *(out - dist);
      out++;
    }
    return out;
  }
  return chunkcopy_lapped_relaxed(out, dist, len);
}

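/*
 * Illustrative only: a hypothetical inflate_fast()-style caller, with `safe`
 * pointing one past the last writable output byte, might use these helpers
 * roughly as follows (the names here are assumptions, not part of this
 * header):
 *
 *   if (from_window)                          // source cannot overlap out
 *     out = chunkcopy_safe(out, from, len, safe);
 *   else                                      // match within the output
 *     out = chunkcopy_lapped_safe(out, dist, len, safe);
 */
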
/* TODO(cavalcanti): see crbug.com/1110083. */
static inline unsigned char FAR* chunkcopy_safe_ugly(unsigned char FAR* out,
                                                     unsigned dist,
                                                     unsigned len,
                                                     unsigned char FAR* limit) {
#if defined(__GNUC__) && !defined(__clang__)
  /* Speed is the same as using chunkcopy_safe
     w/ GCC on ARM (tested gcc 6.3 and 7.5) and avoids
     undefined behavior.
  */
  return chunkcopy_core_safe(out, out - dist, len, limit);
#elif defined(__clang__) && defined(ARMV8_OS_ANDROID) && !defined(__aarch64__)
  /* Seems to perform better on 32-bit (i.e. Android). */
  return chunkcopy_core_safe(out, out - dist, len, limit);
#else
  /* Seems to perform better on 64-bit. */
  return chunkcopy_lapped_safe(out, dist, len, limit);
#endif
}

/*
 * The chunk-copy code above deals with writing the decoded DEFLATE data to
 * the output with SIMD methods to increase decode speed. Reading the input
 * to the DEFLATE decoder with a wide, SIMD method can also increase decode
 * speed. This option is supported on little endian machines, and reads the
 * input data in 64-bit (8 byte) chunks.
 */

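/*
 * The 64-bit read below is a plain byte copy, so the first input byte lands
 * in the low-order bits of the holder only on little-endian targets; hence,
 * per the note above, INFLATE_CHUNK_READ_64LE is expected to be defined only
 * for such targets.
 */
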
#ifdef INFLATE_CHUNK_READ_64LE
/*
 * Buffer the input in a uint64_t (8 bytes) in the wide input reading case.
 */
typedef uint64_t inflate_holder_t;

/*
 * Ask the compiler to perform a wide, unaligned load of a uint64_t using a
 * machine instruction appropriate for the uint64_t type.
 */
static inline inflate_holder_t read64le(const unsigned char FAR *in) {
    inflate_holder_t input;
    Z_BUILTIN_MEMCPY(&input, in, sizeof(input));
    return input;
}
#else
/*
 * Otherwise, buffer the input bits using zlib's default input buffer type.
 */
typedef unsigned long inflate_holder_t;

#endif /* INFLATE_CHUNK_READ_64LE */

#undef Z_STATIC_ASSERT
#undef Z_RESTRICT
#undef Z_BUILTIN_MEMCPY
#undef Z_DISABLE_MSAN

#endif /* CHUNKCOPY_H */