1 /* chunkset_tpl.h -- inline functions to copy small data chunks.
2 * For conditions of distribution and use, see copyright notice in zlib.h
3 */
4
5 #include "zbuild.h"
6 #include <stdlib.h>
7
8 #if CHUNK_SIZE == 32 && defined(X86_SSE41) && defined(X86_SSE2)
9 extern uint8_t* chunkmemset_sse41(uint8_t *out, unsigned dist, unsigned len);
10 #endif
11
12 /* Returns the chunk size */
CHUNKSIZE(void)13 Z_INTERNAL uint32_t CHUNKSIZE(void) {
14 return sizeof(chunk_t);
15 }
16
17 /* Behave like memcpy, but assume that it's OK to overwrite at least
18 chunk_t bytes of output even if the length is shorter than this,
19 that the length is non-zero, and that `from` lags `out` by at least
20 sizeof chunk_t bytes (or that they don't overlap at all or simply that
21 the distance is less than the length of the copy).
22
23 Aside from better memory bus utilisation, this means that short copies
24 (chunk_t bytes or fewer) will fall straight through the loop
25 without iteration, which will hopefully make the branch prediction more
26 reliable. */
27 #ifndef HAVE_CHUNKCOPY
CHUNKCOPY(uint8_t * out,uint8_t const * from,unsigned len)28 Z_INTERNAL uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
29 Assert(len > 0, "chunkcopy should never have a length 0");
30 chunk_t chunk;
31 int32_t align = ((len - 1) % sizeof(chunk_t)) + 1;
32 loadchunk(from, &chunk);
33 storechunk(out, &chunk);
34 out += align;
35 from += align;
36 len -= align;
37 while (len > 0) {
38 loadchunk(from, &chunk);
39 storechunk(out, &chunk);
40 out += sizeof(chunk_t);
41 from += sizeof(chunk_t);
42 len -= sizeof(chunk_t);
43 }
44 return out;
45 }
46 #endif
47
48 /* Perform short copies until distance can be rewritten as being at least
49 sizeof chunk_t.
50
51 This assumes that it's OK to overwrite at least the first
52 2*sizeof(chunk_t) bytes of output even if the copy is shorter than this.
53 This assumption holds because inflate_fast() starts every iteration with at
54 least 258 bytes of output space available (258 being the maximum length
55 output from a single token; see inflate_fast()'s assumptions below). */
56 #ifndef HAVE_CHUNKUNROLL
/* Grow a short repeating pattern in place until its effective distance is at
   least sizeof(chunk_t), so the caller can finish with full-chunk copies.
   `out` points just past the pattern; `*dist` is the pattern length and
   `*len` the bytes still to produce; both are updated in place.
   Relies on the caller tolerating up to 2*sizeof(chunk_t) bytes of
   overwrite past the true output position. Returns the advanced `out`. */
Z_INTERNAL uint8_t* CHUNKUNROLL(uint8_t *out, unsigned *dist, unsigned *len) {
    unsigned char const *from = out - *dist;
    chunk_t chunk;
    while (*dist < *len && *dist < sizeof(chunk_t)) {
        /* Append one full copy of the current pattern after itself. Note that
           `from` is intentionally NOT advanced: storechunk may write more than
           *dist bytes, but only the first *dist bytes past `out` are "committed"
           by the pointer arithmetic below. */
        loadchunk(from, &chunk);
        storechunk(out, &chunk);
        out += *dist;
        *len -= *dist;
        /* The pattern now repeats over twice the span, so the usable
           distance doubles each iteration (1, 2, 4, ... until >= chunk). */
        *dist += *dist;
    }
    return out;
}
69 #endif
70
71 #ifndef HAVE_CHUNK_MAG
72 /* Loads a magazine to feed into memory of the pattern */
GET_CHUNK_MAG(uint8_t * buf,uint32_t * chunk_rem,uint32_t dist)73 static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
74 /* This code takes string of length dist from "from" and repeats
75 * it for as many times as can fit in a chunk_t (vector register) */
76 uint32_t cpy_dist;
77 uint32_t bytes_remaining = sizeof(chunk_t);
78 chunk_t chunk_load;
79 uint8_t *cur_chunk = (uint8_t *)&chunk_load;
80 while (bytes_remaining) {
81 cpy_dist = MIN(dist, bytes_remaining);
82 memcpy(cur_chunk, buf, cpy_dist);
83 bytes_remaining -= cpy_dist;
84 cur_chunk += cpy_dist;
85 /* This allows us to bypass an expensive integer division since we're effectively
86 * counting in this loop, anyway */
87 *chunk_rem = cpy_dist;
88 }
89
90 return chunk_load;
91 }
92 #endif
93
94 /* Copy DIST bytes from OUT - DIST into OUT + DIST * k, for 0 <= k < LEN/DIST.
95 Return OUT + LEN. */
/* Copy DIST bytes from OUT - DIST into OUT + DIST * k, for 0 <= k < LEN/DIST.
   Return OUT + LEN.
   This is the overlapping "pattern fill" used by inflate to expand LZ77
   matches whose distance is shorter than the match length. May overwrite
   past OUT + LEN by up to a chunk_t; the caller guarantees that slack. */
Z_INTERNAL uint8_t* CHUNKMEMSET(uint8_t *out, unsigned dist, unsigned len) {
    /* Debug performance related issues when len < sizeof(uint64_t):
       Assert(len >= sizeof(uint64_t), "chunkmemset should be called on larger chunks"); */
    Assert(dist > 0, "chunkmemset cannot have a distance 0");
    /* Only AVX2 */
#if CHUNK_SIZE == 32 && defined(X86_SSE41) && defined(X86_SSE2)
    /* For short fills a 16-byte SSE4.1 path beats spinning up 32-byte stores. */
    if (len <= 16) {
        return chunkmemset_sse41(out, dist, len);
    }
#endif

    uint8_t *from = out - dist;

    if (dist == 1) {
        /* Distance 1 is a plain byte splat. */
        memset(out, *from, len);
        return out + len;
    } else if (dist > sizeof(chunk_t)) {
        /* Source and destination are at least a chunk apart, so a straight
           chunked copy is safe — no pattern replication needed. */
        return CHUNKCOPY(out, out - dist, len);
    }

    chunk_t chunk_load;
    /* Bytes of the last partial pattern repetition inside chunk_load;
       0 means the pattern tiles the chunk exactly. */
    uint32_t chunk_mod = 0;

    /* Load one chunk_t pre-filled with the repeating pattern. The special
       cases below build it with a single vector shuffle/broadcast when the
       distance permits; note chunk_mod stays 0 on those paths. */
    /* TODO: possibly build up a permutation table for this if not an even modulus */
#ifdef HAVE_CHUNKMEMSET_2
    if (dist == 2) {
        chunkmemset_2(from, &chunk_load);
    } else
#endif
#ifdef HAVE_CHUNKMEMSET_4
    if (dist == 4) {
        chunkmemset_4(from, &chunk_load);
    } else
#endif
#ifdef HAVE_CHUNKMEMSET_8
    if (dist == 8) {
        chunkmemset_8(from, &chunk_load);
    } else if (dist == sizeof(chunk_t)) {
        loadchunk(from, &chunk_load);
    } else
#endif
    {
        /* Generic fallback: tile the pattern byte-wise into the chunk. */
        chunk_load = GET_CHUNK_MAG(from, &chunk_mod, dist);
    }

    /* If we're lucky enough and dist happens to be an even modulus of our vector length,
     * we can do two stores per loop iteration, which for most ISAs, especially x86, is beneficial */
    if (chunk_mod == 0) {
        while (len >= (2 * sizeof(chunk_t))) {
            storechunk(out, &chunk_load);
            storechunk(out + sizeof(chunk_t), &chunk_load);
            out += 2 * sizeof(chunk_t);
            len -= 2 * sizeof(chunk_t);
        }
    }

    /* If we don't have a "dist" length that divides evenly into a vector
     * register, we can write the whole vector register but we need only
     * advance by the amount of the whole string that fits in our chunk_t.
     * If we do divide evenly into the vector length, adv_amount = chunk_t size*/
    uint32_t adv_amount = sizeof(chunk_t) - chunk_mod;
    while (len >= sizeof(chunk_t)) {
        /* Full-chunk store but partial advance: the partial trailing pattern
           is simply rewritten by the next iteration's store. */
        storechunk(out, &chunk_load);
        len -= adv_amount;
        out += adv_amount;
    }

    if (len) {
        /* Tail: fewer than adv_amount bytes remain; the chunk starts
           pattern-aligned, so a prefix memcpy finishes the fill. */
        memcpy(out, &chunk_load, len);
        out += len;
    }

    return out;
}
170
/* Bounds-respecting wrapper around CHUNKMEMSET for use near the end of the
   output window: never writes more than `left` bytes at `out`. Copies
   MIN(len, left) bytes of the pattern at distance `dist` and returns the
   advanced output pointer. */
Z_INTERNAL uint8_t* CHUNKMEMSET_SAFE(uint8_t *out, unsigned dist, unsigned len, unsigned left) {
#if !defined(UNALIGNED64_OK)
#  if !defined(UNALIGNED_OK)
    /* No unaligned access at all: align `out` to 8 bytes before wide stores. */
    static const uint32_t align_mask = 7;
#  else
    /* 32-bit unaligned OK: only need 4-byte alignment. */
    static const uint32_t align_mask = 3;
#  endif
#endif

    len = MIN(len, left);
    uint8_t *from = out - dist;
#if !defined(UNALIGNED64_OK)
    /* Byte-copy until `out` is suitably aligned (or we run out of length).
       `left` must shrink in step with `len` so the slack check below stays honest. */
    while (((uintptr_t)out & align_mask) && (len > 0)) {
        *out++ = *from++;
        --len;
        --left;
    }
#endif
    /* CHUNKMEMSET may overwrite up to ~2 chunk_t past the requested length;
       without at least 3 chunks of remaining room, fall back to exact
       byte-by-byte copying. */
    if (left < (unsigned)(3 * sizeof(chunk_t))) {
        while (len > 0) {
            *out++ = *from++;
            --len;
        }
        return out;
    }
    if (len)
        return CHUNKMEMSET(out, dist, len);

    return out;
}
201