/* functable.c -- Choose relevant optimized functions at runtime
 * Copyright (C) 2017 Hans Kristian Rosbach
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "zbuild.h"
#include "zendian.h"
#include "crc32_braid_p.h"
#include "deflate.h"
#include "deflate_p.h"

#include "functable.h"

#include "cpu_features.h"

Z_INTERNAL Z_TLS struct functable_s functable;
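/* The functable is populated with the *_stub functions defined below (see the
 * initializer at the bottom of this file). On first use each stub runs the CPU
 * feature detection, overwrites its own entry in the table with the best
 * implementation available, and forwards the call, so subsequent calls bypass
 * the stub entirely. */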

/* stub functions */
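/* update_hash, insert_string and quick_insert_string all key off the same
 * X86_SSE42_CRC_HASH / ARM_ACLE_CRC_HASH checks, so on a given CPU they
 * resolve to a matching set of hash routines. */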
Z_INTERNAL uint32_t update_hash_stub(deflate_state *const s, uint32_t h, uint32_t val) {
    // Initialize default

    functable.update_hash = &update_hash_c;
    cpu_check_features();

#ifdef X86_SSE42_CRC_HASH
    if (x86_cpu_has_sse42)
        functable.update_hash = &update_hash_sse4;
#elif defined(ARM_ACLE_CRC_HASH)
    if (arm_cpu_has_crc32)
        functable.update_hash = &update_hash_acle;
#endif

    return functable.update_hash(s, h, val);
}

Z_INTERNAL void insert_string_stub(deflate_state *const s, uint32_t str, uint32_t count) {
    // Initialize default

    functable.insert_string = &insert_string_c;
    cpu_check_features();

#ifdef X86_SSE42_CRC_HASH
    if (x86_cpu_has_sse42)
        functable.insert_string = &insert_string_sse4;
#elif defined(ARM_ACLE_CRC_HASH)
    if (arm_cpu_has_crc32)
        functable.insert_string = &insert_string_acle;
#endif

    functable.insert_string(s, str, count);
}

Z_INTERNAL Pos quick_insert_string_stub(deflate_state *const s, const uint32_t str) {
    functable.quick_insert_string = &quick_insert_string_c;

#ifdef X86_SSE42_CRC_HASH
    if (x86_cpu_has_sse42)
        functable.quick_insert_string = &quick_insert_string_sse4;
#elif defined(ARM_ACLE_CRC_HASH)
    if (arm_cpu_has_crc32)
        functable.quick_insert_string = &quick_insert_string_acle;
#endif

    return functable.quick_insert_string(s, str);
}

Z_INTERNAL void slide_hash_stub(deflate_state *s) {

    functable.slide_hash = &slide_hash_c;
    cpu_check_features();

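    /* Each check below overrides the previous selection, so when several
     * variants are compiled in, the most capable one supported by the running
     * CPU ends up in the table. */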
#ifdef X86_SSE2
# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
# endif
        functable.slide_hash = &slide_hash_sse2;
#elif defined(ARM_NEON_SLIDEHASH)
# ifndef ARM_NOCHECK_NEON
    if (arm_cpu_has_neon)
# endif
        functable.slide_hash = &slide_hash_neon;
#endif
#ifdef X86_AVX2
    if (x86_cpu_has_avx2)
        functable.slide_hash = &slide_hash_avx2;
#endif
#ifdef PPC_VMX_SLIDEHASH
    if (power_cpu_has_altivec)
        functable.slide_hash = &slide_hash_vmx;
#endif
#ifdef POWER8_VSX_SLIDEHASH
    if (power_cpu_has_arch_2_07)
        functable.slide_hash = &slide_hash_power8;
#endif

    functable.slide_hash(s);
}

Z_INTERNAL uint32_t longest_match_stub(deflate_state *const s, Pos cur_match) {

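    /* Pick the portable longest_match based on unaligned-access and
     * builtin-ctz support, then let the SIMD checks below override it when the
     * CPU allows. */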
#ifdef UNALIGNED_OK
# if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
    functable.longest_match = &longest_match_unaligned_64;
# elif defined(HAVE_BUILTIN_CTZ)
    functable.longest_match = &longest_match_unaligned_32;
# else
    functable.longest_match = &longest_match_unaligned_16;
# endif
#else
    functable.longest_match = &longest_match_c;
#endif
#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
    if (x86_cpu_has_sse2)
        functable.longest_match = &longest_match_sse2;
#endif
#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
    if (x86_cpu_has_avx2)
        functable.longest_match = &longest_match_avx2;
#endif
#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
    if (arm_cpu_has_neon)
        functable.longest_match = &longest_match_neon;
#endif
#ifdef POWER9
    if (power_cpu_has_arch_3_00)
        functable.longest_match = &longest_match_power9;
#endif

    return functable.longest_match(s, cur_match);
}

Z_INTERNAL uint32_t longest_match_slow_stub(deflate_state *const s, Pos cur_match) {

#ifdef UNALIGNED_OK
# if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
    functable.longest_match_slow = &longest_match_slow_unaligned_64;
# elif defined(HAVE_BUILTIN_CTZ)
    functable.longest_match_slow = &longest_match_slow_unaligned_32;
# else
    functable.longest_match_slow = &longest_match_slow_unaligned_16;
# endif
#else
    functable.longest_match_slow = &longest_match_slow_c;
#endif
#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
    if (x86_cpu_has_sse2)
        functable.longest_match_slow = &longest_match_slow_sse2;
#endif
#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
    if (x86_cpu_has_avx2)
        functable.longest_match_slow = &longest_match_slow_avx2;
#endif
#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
    if (arm_cpu_has_neon)
        functable.longest_match_slow = &longest_match_slow_neon;
#endif
#ifdef POWER9
    if (power_cpu_has_arch_3_00)
        functable.longest_match_slow = &longest_match_slow_power9;
#endif

    return functable.longest_match_slow(s, cur_match);
}

Z_INTERNAL uint32_t adler32_stub(uint32_t adler, const unsigned char *buf, size_t len) {
    // Initialize default
    functable.adler32 = &adler32_c;
    cpu_check_features();

#ifdef ARM_NEON_ADLER32
# ifndef ARM_NOCHECK_NEON
    if (arm_cpu_has_neon)
# endif
        functable.adler32 = &adler32_neon;
#endif
#ifdef X86_SSSE3_ADLER32
    if (x86_cpu_has_ssse3)
        functable.adler32 = &adler32_ssse3;
#endif
#ifdef X86_AVX2_ADLER32
    if (x86_cpu_has_avx2)
        functable.adler32 = &adler32_avx2;
#endif
#ifdef X86_AVX512_ADLER32
    if (x86_cpu_has_avx512)
        functable.adler32 = &adler32_avx512;
#endif
#ifdef X86_AVX512VNNI_ADLER32
    if (x86_cpu_has_avx512vnni) {
        functable.adler32 = &adler32_avx512_vnni;
    }
#endif
#ifdef PPC_VMX_ADLER32
    if (power_cpu_has_altivec)
        functable.adler32 = &adler32_vmx;
#endif
#ifdef POWER8_VSX_ADLER32
    if (power_cpu_has_arch_2_07)
        functable.adler32 = &adler32_power8;
#endif

    return functable.adler32(adler, buf, len);
}

Z_INTERNAL uint32_t adler32_fold_copy_stub(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
    functable.adler32_fold_copy = &adler32_fold_copy_c;
#if (defined X86_SSE42_ADLER32)
    if (x86_cpu_has_sse42)
        functable.adler32_fold_copy = &adler32_fold_copy_sse42;
#endif
#ifdef X86_AVX2_ADLER32
    if (x86_cpu_has_avx2)
        functable.adler32_fold_copy = &adler32_fold_copy_avx2;
#endif
#ifdef X86_AVX512_ADLER32
    if (x86_cpu_has_avx512)
        functable.adler32_fold_copy = &adler32_fold_copy_avx512;
#endif
#ifdef X86_AVX512VNNI_ADLER32
    if (x86_cpu_has_avx512vnni)
        functable.adler32_fold_copy = &adler32_fold_copy_avx512_vnni;
#endif
    return functable.adler32_fold_copy(adler, dst, src, len);
}

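/* The crc32_fold_* stubs below all key off the same X86_PCLMULQDQ_CRC check,
 * so the reset/copy/fold/final entries resolve to a matching set of
 * implementations. */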
Z_INTERNAL uint32_t crc32_fold_reset_stub(crc32_fold *crc) {
    functable.crc32_fold_reset = &crc32_fold_reset_c;
    cpu_check_features();
#ifdef X86_PCLMULQDQ_CRC
    if (x86_cpu_has_pclmulqdq)
        functable.crc32_fold_reset = &crc32_fold_reset_pclmulqdq;
#endif
    return functable.crc32_fold_reset(crc);
}

Z_INTERNAL void crc32_fold_copy_stub(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) {
    functable.crc32_fold_copy = &crc32_fold_copy_c;
    cpu_check_features();
#ifdef X86_PCLMULQDQ_CRC
    if (x86_cpu_has_pclmulqdq)
        functable.crc32_fold_copy = &crc32_fold_copy_pclmulqdq;
#endif
    functable.crc32_fold_copy(crc, dst, src, len);
}

Z_INTERNAL void crc32_fold_stub(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc) {
    functable.crc32_fold = &crc32_fold_c;
    cpu_check_features();
#ifdef X86_PCLMULQDQ_CRC
    if (x86_cpu_has_pclmulqdq)
        functable.crc32_fold = &crc32_fold_pclmulqdq;
#endif
    functable.crc32_fold(crc, src, len, init_crc);
}

Z_INTERNAL uint32_t crc32_fold_final_stub(crc32_fold *crc) {
    functable.crc32_fold_final = &crc32_fold_final_c;
    cpu_check_features();
#ifdef X86_PCLMULQDQ_CRC
    if (x86_cpu_has_pclmulqdq)
        functable.crc32_fold_final = &crc32_fold_final_pclmulqdq;
#endif
    return functable.crc32_fold_final(crc);
}

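/* The chunk* stubs below follow the same pattern; note that chunkmemset and
 * chunkmemset_safe additionally have SSE4.1 variants that are checked after
 * SSE2. */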
Z_INTERNAL uint32_t chunksize_stub(void) {
    // Initialize default
    functable.chunksize = &chunksize_c;
    cpu_check_features();

#ifdef X86_SSE2_CHUNKSET
# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
# endif
        functable.chunksize = &chunksize_sse2;
#endif
#ifdef X86_AVX_CHUNKSET
    if (x86_cpu_has_avx2)
        functable.chunksize = &chunksize_avx;
#endif
#ifdef ARM_NEON_CHUNKSET
    if (arm_cpu_has_neon)
        functable.chunksize = &chunksize_neon;
#endif
#ifdef POWER8_VSX_CHUNKSET
    if (power_cpu_has_arch_2_07)
        functable.chunksize = &chunksize_power8;
#endif

    return functable.chunksize();
}

Z_INTERNAL uint8_t* chunkcopy_stub(uint8_t *out, uint8_t const *from, unsigned len) {
    // Initialize default
    functable.chunkcopy = &chunkcopy_c;

#ifdef X86_SSE2_CHUNKSET
# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
# endif
        functable.chunkcopy = &chunkcopy_sse2;
#endif
#ifdef X86_AVX_CHUNKSET
    if (x86_cpu_has_avx2)
        functable.chunkcopy = &chunkcopy_avx;
#endif
#ifdef ARM_NEON_CHUNKSET
    if (arm_cpu_has_neon)
        functable.chunkcopy = &chunkcopy_neon;
#endif
#ifdef POWER8_VSX_CHUNKSET
    if (power_cpu_has_arch_2_07)
        functable.chunkcopy = &chunkcopy_power8;
#endif

    return functable.chunkcopy(out, from, len);
}

Z_INTERNAL uint8_t* chunkunroll_stub(uint8_t *out, unsigned *dist, unsigned *len) {
    // Initialize default
    functable.chunkunroll = &chunkunroll_c;

#ifdef X86_SSE2_CHUNKSET
# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
# endif
        functable.chunkunroll = &chunkunroll_sse2;
#endif
#ifdef X86_AVX_CHUNKSET
    if (x86_cpu_has_avx2)
        functable.chunkunroll = &chunkunroll_avx;
#endif
#ifdef ARM_NEON_CHUNKSET
    if (arm_cpu_has_neon)
        functable.chunkunroll = &chunkunroll_neon;
#endif
#ifdef POWER8_VSX_CHUNKSET
    if (power_cpu_has_arch_2_07)
        functable.chunkunroll = &chunkunroll_power8;
#endif

    return functable.chunkunroll(out, dist, len);
}

Z_INTERNAL uint8_t* chunkmemset_stub(uint8_t *out, unsigned dist, unsigned len) {
    // Initialize default
    functable.chunkmemset = &chunkmemset_c;

#ifdef X86_SSE2_CHUNKSET
# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
# endif
        functable.chunkmemset = &chunkmemset_sse2;
#endif
#if defined(X86_SSE41) && defined(X86_SSE2)
    if (x86_cpu_has_sse41)
        functable.chunkmemset = &chunkmemset_sse41;
#endif
#ifdef X86_AVX_CHUNKSET
    if (x86_cpu_has_avx2)
        functable.chunkmemset = &chunkmemset_avx;
#endif
#ifdef ARM_NEON_CHUNKSET
    if (arm_cpu_has_neon)
        functable.chunkmemset = &chunkmemset_neon;
#endif
#ifdef POWER8_VSX_CHUNKSET
    if (power_cpu_has_arch_2_07)
        functable.chunkmemset = &chunkmemset_power8;
#endif

    return functable.chunkmemset(out, dist, len);
}

Z_INTERNAL uint8_t* chunkmemset_safe_stub(uint8_t *out, unsigned dist, unsigned len, unsigned left) {
    // Initialize default
    functable.chunkmemset_safe = &chunkmemset_safe_c;

#ifdef X86_SSE2_CHUNKSET
# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
# endif
        functable.chunkmemset_safe = &chunkmemset_safe_sse2;
#endif
#if defined(X86_SSE41) && defined(X86_SSE2)
    if (x86_cpu_has_sse41)
        functable.chunkmemset_safe = &chunkmemset_safe_sse41;
#endif
#ifdef X86_AVX_CHUNKSET
    if (x86_cpu_has_avx2)
        functable.chunkmemset_safe = &chunkmemset_safe_avx;
#endif
#ifdef ARM_NEON_CHUNKSET
    if (arm_cpu_has_neon)
        functable.chunkmemset_safe = &chunkmemset_safe_neon;
#endif
#ifdef POWER8_VSX_CHUNKSET
    if (power_cpu_has_arch_2_07)
        functable.chunkmemset_safe = &chunkmemset_safe_power8;
#endif

    return functable.chunkmemset_safe(out, dist, len, left);
}

Z_INTERNAL uint32_t crc32_stub(uint32_t crc, const unsigned char *buf, uint64_t len) {
    Assert(sizeof(uint64_t) >= sizeof(size_t),
           "crc32_z takes size_t but internally we have a uint64_t len");

    functable.crc32 = &crc32_braid;
    cpu_check_features();
#ifdef ARM_ACLE_CRC_HASH
    if (arm_cpu_has_crc32)
        functable.crc32 = &crc32_acle;
#elif defined(POWER8_VSX_CRC32)
    if (power_cpu_has_arch_2_07)
        functable.crc32 = &crc32_power8;
#elif defined(S390_CRC32_VX)
    if (PREFIX(s390_cpu_has_vx))
        functable.crc32 = &PREFIX(s390_crc32_vx);
#elif defined(X86_PCLMULQDQ_CRC)
    if (x86_cpu_has_pclmulqdq)
        functable.crc32 = &crc32_pclmulqdq;
#endif

    return functable.crc32(crc, buf, len);
}

Z_INTERNAL uint32_t compare256_stub(const uint8_t *src0, const uint8_t *src1) {

#ifdef UNALIGNED_OK
# if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
    functable.compare256 = &compare256_unaligned_64;
# elif defined(HAVE_BUILTIN_CTZ)
    functable.compare256 = &compare256_unaligned_32;
# else
    functable.compare256 = &compare256_unaligned_16;
# endif
#else
    functable.compare256 = &compare256_c;
#endif
#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
    if (x86_cpu_has_sse2)
        functable.compare256 = &compare256_sse2;
#endif
#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
    if (x86_cpu_has_avx2)
        functable.compare256 = &compare256_avx2;
#endif
#ifdef POWER9
    if (power_cpu_has_arch_3_00)
        functable.compare256 = &compare256_power9;
#endif

    return functable.compare256(src0, src1);
}

/* functable init */
Z_INTERNAL Z_TLS struct functable_s functable = {
    adler32_stub,
    adler32_fold_copy_stub,
    crc32_stub,
    crc32_fold_reset_stub,
    crc32_fold_copy_stub,
    crc32_fold_stub,
    crc32_fold_final_stub,
    compare256_stub,
    chunksize_stub,
    chunkcopy_stub,
    chunkunroll_stub,
    chunkmemset_stub,
    chunkmemset_safe_stub,
    insert_string_stub,
    longest_match_stub,
    longest_match_slow_stub,
    quick_insert_string_stub,
    slide_hash_stub,
    update_hash_stub
};
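
/* The positional initializer above must list the stubs in the same order as
 * the members of struct functable_s in functable.h.
 *
 * Usage sketch: callers are expected to go through the table rather than call
 * the stubs directly, e.g.
 *
 *     uint32_t a = functable.adler32(1, buf, len);
 *
 * The first such call lands in adler32_stub, which installs the best available
 * adler32 implementation and returns its result; later calls dispatch straight
 * to the selected function. */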