/* functable.c -- Choose relevant optimized functions at runtime
 * Copyright (C) 2017 Hans Kristian Rosbach
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "zbuild.h"
#include "zendian.h"
#include "crc32_braid_p.h"
#include "deflate.h"
#include "deflate_p.h"

#include "functable.h"

#include "cpu_features.h"

Z_INTERNAL Z_TLS struct functable_s functable;

/* stub functions */
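/*
 * Dispatch pattern: every functable entry initially points at one of the
 * stubs below (see the initializer at the bottom of this file).  On first
 * use, a stub installs the generic C implementation into its slot, runs
 * CPU feature detection, upgrades the slot to the best implementation the
 * current CPU supports, and finally calls through the updated pointer.
 * Subsequent calls go directly to the selected implementation, so the
 * detection cost is paid only once per slot (per thread, since the table
 * is declared Z_TLS).  Callers never need to know which variant won, e.g.:
 *
 *     uint32_t sum = functable.adler32(1, buf, len);
 */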
Z_INTERNAL uint32_t update_hash_stub(deflate_state *const s, uint32_t h, uint32_t val) {
    // Initialize default
    functable.update_hash = &update_hash_c;
    cpu_check_features();

#ifdef X86_SSE42_CRC_HASH
    if (x86_cpu_has_sse42)
        functable.update_hash = &update_hash_sse4;
#elif defined(ARM_ACLE_CRC_HASH)
    if (arm_cpu_has_crc32)
        functable.update_hash = &update_hash_acle;
#endif

    return functable.update_hash(s, h, val);
}

Z_INTERNAL void insert_string_stub(deflate_state *const s, uint32_t str, uint32_t count) {
    // Initialize default
    functable.insert_string = &insert_string_c;
    cpu_check_features();

#ifdef X86_SSE42_CRC_HASH
    if (x86_cpu_has_sse42)
        functable.insert_string = &insert_string_sse4;
#elif defined(ARM_ACLE_CRC_HASH)
    if (arm_cpu_has_crc32)
        functable.insert_string = &insert_string_acle;
#endif

    functable.insert_string(s, str, count);
}

Z_INTERNAL Pos quick_insert_string_stub(deflate_state *const s, const uint32_t str) {
    functable.quick_insert_string = &quick_insert_string_c;

#ifdef X86_SSE42_CRC_HASH
    if (x86_cpu_has_sse42)
        functable.quick_insert_string = &quick_insert_string_sse4;
#elif defined(ARM_ACLE_CRC_HASH)
    if (arm_cpu_has_crc32)
        functable.quick_insert_string = &quick_insert_string_acle;
#endif

    return functable.quick_insert_string(s, str);
}

Z_INTERNAL void slide_hash_stub(deflate_state *s) {
    functable.slide_hash = &slide_hash_c;
    cpu_check_features();

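    /* SSE2 is part of the x86-64 baseline ISA, so the runtime check is only
       needed for 32-bit builds (unless X86_NOCHECK_SSE2 skips it there too). */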
#ifdef X86_SSE2
#  if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
#  endif
        functable.slide_hash = &slide_hash_sse2;
#elif defined(ARM_NEON_SLIDEHASH)
#  ifndef ARM_NOCHECK_NEON
    if (arm_cpu_has_neon)
#  endif
        functable.slide_hash = &slide_hash_neon;
#endif
#ifdef X86_AVX2
    if (x86_cpu_has_avx2)
        functable.slide_hash = &slide_hash_avx2;
#endif
#ifdef PPC_VMX_SLIDEHASH
    if (power_cpu_has_altivec)
        functable.slide_hash = &slide_hash_vmx;
#endif
#ifdef POWER8_VSX_SLIDEHASH
    if (power_cpu_has_arch_2_07)
        functable.slide_hash = &slide_hash_power8;
#endif

    functable.slide_hash(s);
}

Z_INTERNAL uint32_t longest_match_stub(deflate_state *const s, Pos cur_match) {
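    /* Pick the compile-time default first: the widest unaligned-load variant
       the platform allows (paired with the available ctz builtin), falling
       back to the generic C version.  The runtime checks below may then
       upgrade the slot to a SIMD implementation. */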
#ifdef UNALIGNED_OK
#  if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
    functable.longest_match = &longest_match_unaligned_64;
#  elif defined(HAVE_BUILTIN_CTZ)
    functable.longest_match = &longest_match_unaligned_32;
#  else
    functable.longest_match = &longest_match_unaligned_16;
#  endif
#else
    functable.longest_match = &longest_match_c;
#endif
#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
    if (x86_cpu_has_sse2)
        functable.longest_match = &longest_match_sse2;
#endif
#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
    if (x86_cpu_has_avx2)
        functable.longest_match = &longest_match_avx2;
#endif
#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
    if (arm_cpu_has_neon)
        functable.longest_match = &longest_match_neon;
#endif
#ifdef POWER9
    if (power_cpu_has_arch_3_00)
        functable.longest_match = &longest_match_power9;
#endif

    return functable.longest_match(s, cur_match);
}

Z_INTERNAL uint32_t longest_match_slow_stub(deflate_state *const s, Pos cur_match) {
#ifdef UNALIGNED_OK
#  if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
    functable.longest_match_slow = &longest_match_slow_unaligned_64;
#  elif defined(HAVE_BUILTIN_CTZ)
    functable.longest_match_slow = &longest_match_slow_unaligned_32;
#  else
    functable.longest_match_slow = &longest_match_slow_unaligned_16;
#  endif
#else
    functable.longest_match_slow = &longest_match_slow_c;
#endif
#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
    if (x86_cpu_has_sse2)
        functable.longest_match_slow = &longest_match_slow_sse2;
#endif
#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
    if (x86_cpu_has_avx2)
        functable.longest_match_slow = &longest_match_slow_avx2;
#endif
#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
    if (arm_cpu_has_neon)
        functable.longest_match_slow = &longest_match_slow_neon;
#endif
#ifdef POWER9
    if (power_cpu_has_arch_3_00)
        functable.longest_match_slow = &longest_match_slow_power9;
#endif

    return functable.longest_match_slow(s, cur_match);
}

Z_INTERNAL uint32_t adler32_stub(uint32_t adler, const unsigned char *buf, size_t len) {
    // Initialize default
    functable.adler32 = &adler32_c;
    cpu_check_features();

#ifdef ARM_NEON_ADLER32
#  ifndef ARM_NOCHECK_NEON
    if (arm_cpu_has_neon)
#  endif
        functable.adler32 = &adler32_neon;
#endif
#ifdef X86_SSSE3_ADLER32
    if (x86_cpu_has_ssse3)
        functable.adler32 = &adler32_ssse3;
#endif
#ifdef X86_AVX2_ADLER32
    if (x86_cpu_has_avx2)
        functable.adler32 = &adler32_avx2;
#endif
#ifdef X86_AVX512_ADLER32
    if (x86_cpu_has_avx512)
        functable.adler32 = &adler32_avx512;
#endif
#ifdef X86_AVX512VNNI_ADLER32
    if (x86_cpu_has_avx512vnni)
        functable.adler32 = &adler32_avx512_vnni;
#endif
#ifdef PPC_VMX_ADLER32
    if (power_cpu_has_altivec)
        functable.adler32 = &adler32_vmx;
#endif
#ifdef POWER8_VSX_ADLER32
    if (power_cpu_has_arch_2_07)
        functable.adler32 = &adler32_power8;
#endif

    return functable.adler32(adler, buf, len);
}

Z_INTERNAL uint32_t adler32_fold_copy_stub(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
    functable.adler32_fold_copy = &adler32_fold_copy_c;
#ifdef X86_SSE42_ADLER32
    if (x86_cpu_has_sse42)
        functable.adler32_fold_copy = &adler32_fold_copy_sse42;
#endif
#ifdef X86_AVX2_ADLER32
    if (x86_cpu_has_avx2)
        functable.adler32_fold_copy = &adler32_fold_copy_avx2;
#endif
#ifdef X86_AVX512_ADLER32
    if (x86_cpu_has_avx512)
        functable.adler32_fold_copy = &adler32_fold_copy_avx512;
#endif
#ifdef X86_AVX512VNNI_ADLER32
    if (x86_cpu_has_avx512vnni)
        functable.adler32_fold_copy = &adler32_fold_copy_avx512_vnni;
#endif
    return functable.adler32_fold_copy(adler, dst, src, len);
}

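/* The crc32_fold_* entries form one family: each stub independently checks
   for PCLMULQDQ support and otherwise keeps the generic C fold code. */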
Z_INTERNAL uint32_t crc32_fold_reset_stub(crc32_fold *crc) {
    functable.crc32_fold_reset = &crc32_fold_reset_c;
    cpu_check_features();
#ifdef X86_PCLMULQDQ_CRC
    if (x86_cpu_has_pclmulqdq)
        functable.crc32_fold_reset = &crc32_fold_reset_pclmulqdq;
#endif
    return functable.crc32_fold_reset(crc);
}

Z_INTERNAL void crc32_fold_copy_stub(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) {
    functable.crc32_fold_copy = &crc32_fold_copy_c;
    cpu_check_features();
#ifdef X86_PCLMULQDQ_CRC
    if (x86_cpu_has_pclmulqdq)
        functable.crc32_fold_copy = &crc32_fold_copy_pclmulqdq;
#endif
    functable.crc32_fold_copy(crc, dst, src, len);
}

Z_INTERNAL void crc32_fold_stub(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc) {
    functable.crc32_fold = &crc32_fold_c;
    cpu_check_features();
#ifdef X86_PCLMULQDQ_CRC
    if (x86_cpu_has_pclmulqdq)
        functable.crc32_fold = &crc32_fold_pclmulqdq;
#endif
    functable.crc32_fold(crc, src, len, init_crc);
}

Z_INTERNAL uint32_t crc32_fold_final_stub(crc32_fold *crc) {
    functable.crc32_fold_final = &crc32_fold_final_c;
    cpu_check_features();
#ifdef X86_PCLMULQDQ_CRC
    if (x86_cpu_has_pclmulqdq)
        functable.crc32_fold_final = &crc32_fold_final_pclmulqdq;
#endif
    return functable.crc32_fold_final(crc);
}

Z_INTERNAL uint32_t chunksize_stub(void) {
    // Initialize default
    functable.chunksize = &chunksize_c;
    cpu_check_features();

#ifdef X86_SSE2_CHUNKSET
#  if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
#  endif
        functable.chunksize = &chunksize_sse2;
#endif
#ifdef X86_AVX_CHUNKSET
    if (x86_cpu_has_avx2)
        functable.chunksize = &chunksize_avx;
#endif
#ifdef ARM_NEON_CHUNKSET
    if (arm_cpu_has_neon)
        functable.chunksize = &chunksize_neon;
#endif
#ifdef POWER8_VSX_CHUNKSET
    if (power_cpu_has_arch_2_07)
        functable.chunksize = &chunksize_power8;
#endif

    return functable.chunksize();
}

Z_INTERNAL uint8_t* chunkcopy_stub(uint8_t *out, uint8_t const *from, unsigned len) {
    // Initialize default
    functable.chunkcopy = &chunkcopy_c;

#ifdef X86_SSE2_CHUNKSET
#  if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
#  endif
        functable.chunkcopy = &chunkcopy_sse2;
#endif
#ifdef X86_AVX_CHUNKSET
    if (x86_cpu_has_avx2)
        functable.chunkcopy = &chunkcopy_avx;
#endif
#ifdef ARM_NEON_CHUNKSET
    if (arm_cpu_has_neon)
        functable.chunkcopy = &chunkcopy_neon;
#endif
#ifdef POWER8_VSX_CHUNKSET
    if (power_cpu_has_arch_2_07)
        functable.chunkcopy = &chunkcopy_power8;
#endif

    return functable.chunkcopy(out, from, len);
}

Z_INTERNAL uint8_t* chunkunroll_stub(uint8_t *out, unsigned *dist, unsigned *len) {
    // Initialize default
    functable.chunkunroll = &chunkunroll_c;

#ifdef X86_SSE2_CHUNKSET
#  if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
#  endif
        functable.chunkunroll = &chunkunroll_sse2;
#endif
#ifdef X86_AVX_CHUNKSET
    if (x86_cpu_has_avx2)
        functable.chunkunroll = &chunkunroll_avx;
#endif
#ifdef ARM_NEON_CHUNKSET
    if (arm_cpu_has_neon)
        functable.chunkunroll = &chunkunroll_neon;
#endif
#ifdef POWER8_VSX_CHUNKSET
    if (power_cpu_has_arch_2_07)
        functable.chunkunroll = &chunkunroll_power8;
#endif

    return functable.chunkunroll(out, dist, len);
}

Z_INTERNAL uint8_t* chunkmemset_stub(uint8_t *out, unsigned dist, unsigned len) {
    // Initialize default
    functable.chunkmemset = &chunkmemset_c;

#ifdef X86_SSE2_CHUNKSET
#  if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
#  endif
        functable.chunkmemset = &chunkmemset_sse2;
#endif
#if defined(X86_SSE41) && defined(X86_SSE2)
    if (x86_cpu_has_sse41)
        functable.chunkmemset = &chunkmemset_sse41;
#endif
#ifdef X86_AVX_CHUNKSET
    if (x86_cpu_has_avx2)
        functable.chunkmemset = &chunkmemset_avx;
#endif
#ifdef ARM_NEON_CHUNKSET
    if (arm_cpu_has_neon)
        functable.chunkmemset = &chunkmemset_neon;
#endif
#ifdef POWER8_VSX_CHUNKSET
    if (power_cpu_has_arch_2_07)
        functable.chunkmemset = &chunkmemset_power8;
#endif

    return functable.chunkmemset(out, dist, len);
}

Z_INTERNAL uint8_t* chunkmemset_safe_stub(uint8_t *out, unsigned dist, unsigned len, unsigned left) {
    // Initialize default
    functable.chunkmemset_safe = &chunkmemset_safe_c;

#ifdef X86_SSE2_CHUNKSET
#  if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
#  endif
        functable.chunkmemset_safe = &chunkmemset_safe_sse2;
#endif
#if defined(X86_SSE41) && defined(X86_SSE2)
    if (x86_cpu_has_sse41)
        functable.chunkmemset_safe = &chunkmemset_safe_sse41;
#endif
#ifdef X86_AVX_CHUNKSET
    if (x86_cpu_has_avx2)
        functable.chunkmemset_safe = &chunkmemset_safe_avx;
#endif
#ifdef ARM_NEON_CHUNKSET
    if (arm_cpu_has_neon)
        functable.chunkmemset_safe = &chunkmemset_safe_neon;
#endif
#ifdef POWER8_VSX_CHUNKSET
    if (power_cpu_has_arch_2_07)
        functable.chunkmemset_safe = &chunkmemset_safe_power8;
#endif

    return functable.chunkmemset_safe(out, dist, len, left);
}

Z_INTERNAL uint32_t crc32_stub(uint32_t crc, const unsigned char *buf, uint64_t len) {
    Assert(sizeof(uint64_t) >= sizeof(size_t),
           "crc32_z takes size_t but internally we have a uint64_t len");

    functable.crc32 = &crc32_braid;
    cpu_check_features();
#ifdef ARM_ACLE_CRC_HASH
    if (arm_cpu_has_crc32)
        functable.crc32 = &crc32_acle;
#elif defined(POWER8_VSX_CRC32)
    if (power_cpu_has_arch_2_07)
        functable.crc32 = &crc32_power8;
#elif defined(S390_CRC32_VX)
    if (PREFIX(s390_cpu_has_vx))
        functable.crc32 = &PREFIX(s390_crc32_vx);
#elif defined(X86_PCLMULQDQ_CRC)
    if (x86_cpu_has_pclmulqdq)
        functable.crc32 = &crc32_pclmulqdq;
#endif

    return functable.crc32(crc, buf, len);
}

Z_INTERNAL uint32_t compare256_stub(const uint8_t *src0, const uint8_t *src1) {
#ifdef UNALIGNED_OK
#  if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
    functable.compare256 = &compare256_unaligned_64;
#  elif defined(HAVE_BUILTIN_CTZ)
    functable.compare256 = &compare256_unaligned_32;
#  else
    functable.compare256 = &compare256_unaligned_16;
#  endif
#else
    functable.compare256 = &compare256_c;
#endif
#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
    if (x86_cpu_has_sse2)
        functable.compare256 = &compare256_sse2;
#endif
#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
    if (x86_cpu_has_avx2)
        functable.compare256 = &compare256_avx2;
#endif
#ifdef POWER9
    if (power_cpu_has_arch_3_00)
        functable.compare256 = &compare256_power9;
#endif

    return functable.compare256(src0, src1);
}

/* functable init */
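/* Every entry starts out pointing at its stub, so whichever slot is used
   first performs feature detection for that slot.  The initializer is
   positional, so the field order must match struct functable_s in
   functable.h. */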
Z_INTERNAL Z_TLS struct functable_s functable = {
    adler32_stub,
    adler32_fold_copy_stub,
    crc32_stub,
    crc32_fold_reset_stub,
    crc32_fold_copy_stub,
    crc32_fold_stub,
    crc32_fold_final_stub,
    compare256_stub,
    chunksize_stub,
    chunkcopy_stub,
    chunkunroll_stub,
    chunkmemset_stub,
    chunkmemset_safe_stub,
    insert_string_stub,
    longest_match_stub,
    longest_match_slow_stub,
    quick_insert_string_stub,
    slide_hash_stub,
    update_hash_stub
};