/* compare256_sse2.c -- SSE2 version of compare256
 * Copyright Adam Stylinski <[email protected]>
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "../../zbuild.h"

#include "fallback_builtins.h"

#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)

#include <emmintrin.h>

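/* Return the number of leading bytes (0..256) that match between src0 and
 * src1. The strategy: one unaligned 16-byte compare up front, then advance
 * both pointers so loads from src0 are 16-byte aligned for the main loop,
 * and, if the input started out misaligned, finish with one overlapping
 * unaligned compare of the final 16 bytes. */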
static inline uint32_t compare256_sse2_static(const uint8_t *src0, const uint8_t *src1) {
    uint32_t len = 0;
    int align_offset = ((uintptr_t)src0) & 15;
    const uint8_t *end0 = src0 + 256;
    const uint8_t *end1 = src1 + 256;
    __m128i xmm_src0, xmm_src1, xmm_cmp;

    /* Do the first load unaligned, then for all subsequent ones we have at
     * least one aligned load. Sadly, aligning both loads is probably unrealistic. */
    xmm_src0 = _mm_loadu_si128((__m128i*)src0);
    xmm_src1 = _mm_loadu_si128((__m128i*)src1);
    xmm_cmp = _mm_cmpeq_epi8(xmm_src0, xmm_src1);

    unsigned mask = (unsigned)_mm_movemask_epi8(xmm_cmp);

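    /* Each equal byte yields 0xFF in xmm_cmp, and _mm_movemask_epi8 packs
     * the per-byte sign bits into the low 16 bits of mask. mask == 0xFFFF
     * means all 16 bytes matched; otherwise the lowest set bit of ~mask is
     * the index of the first mismatching byte. */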
    /* Compiler _may_ turn this branch into a ptest + movemask,
     * since a lot of those uops are shared and fused */
    if (mask != 0xFFFF) {
        uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
        return len + match_byte;
    }

    int align_adv = 16 - align_offset;
    len += align_adv;
    src0 += align_adv;
    src1 += align_adv;

    /* Do a flooring division (should just be a shift right) */
    int num_iter = (256 - len) / 16;

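    /* Worked example: if align_offset is 5, align_adv is 11, len is 11 and
     * num_iter is (256 - 11) / 16 == 15. The first loop iteration re-checks
     * bytes 11..15, which the unaligned compare above already covered;
     * re-comparing known-equal bytes is harmless. */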
    for (int i = 0; i < num_iter; ++i) {
        xmm_src0 = _mm_load_si128((__m128i*)src0);
        xmm_src1 = _mm_loadu_si128((__m128i*)src1);
        xmm_cmp = _mm_cmpeq_epi8(xmm_src0, xmm_src1);

        mask = (unsigned)_mm_movemask_epi8(xmm_cmp);

        /* Compiler _may_ turn this branch into a ptest + movemask,
         * since a lot of those uops are shared and fused */
        if (mask != 0xFFFF) {
            uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
            return len + match_byte;
        }

        len += 16, src0 += 16, src1 += 16;
    }

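    /* A misaligned input leaves a tail the loop above did not reach. Rather
     * than falling back to a byte-by-byte loop, redo the last 16 bytes with
     * a single unaligned compare; the overlap with bytes already proven
     * equal is safe. */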
    if (align_offset) {
        src0 = end0 - 16;
        src1 = end1 - 16;
        len = 256 - 16;

        xmm_src0 = _mm_loadu_si128((__m128i*)src0);
        xmm_src1 = _mm_loadu_si128((__m128i*)src1);
        xmm_cmp = _mm_cmpeq_epi8(xmm_src0, xmm_src1);

        mask = (unsigned)_mm_movemask_epi8(xmm_cmp);

        if (mask != 0xFFFF) {
            uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
            return len + match_byte;
        }
    }

    return 256;
}

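/* Thin wrapper so the static inline version above can still be inlined into
 * the longest_match templates below while also providing a callable symbol
 * (e.g., for zlib-ng's runtime dispatch). */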
Z_INTERNAL uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1) {
    return compare256_sse2_static(src0, src1);
}

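/* match_tpl.h acts as a template include: with LONGEST_MATCH and COMPARE256
 * defined, it stamps out a longest_match variant that inlines
 * compare256_sse2_static. It is included a second time below with
 * LONGEST_MATCH_SLOW defined to generate the slow-path variant. */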
#define LONGEST_MATCH       longest_match_sse2
#define COMPARE256          compare256_sse2_static

#include "match_tpl.h"

#define LONGEST_MATCH_SLOW
#define LONGEST_MATCH       longest_match_slow_sse2
#define COMPARE256          compare256_sse2_static

#include "match_tpl.h"

#endif