1 /* adler32_vmx.c -- compute the Adler-32 checksum of a data stream
2  * Copyright (C) 1995-2011 Mark Adler
3  * Copyright (C) 2017-2021 Mika T. Lindqvist <[email protected]>
4  * Copyright (C) 2021 Adam Stylinski <[email protected]>
5  * For conditions of distribution and use, see copyright notice in zlib.h
6  */
7 
8 #ifdef PPC_VMX_ADLER32
9 #include <altivec.h>
10 #include "zbuild.h"
11 #include "adler32_p.h"
12 
13 #define vmx_zero()  (vec_splat_u32(0))
14 
vmx_handle_head_or_tail(uint32_t * pair,const unsigned char * buf,size_t len)15 static inline void vmx_handle_head_or_tail(uint32_t *pair, const unsigned char *buf, size_t len) {
16     unsigned int i;
17     for (i = 0; i < len; ++i) {
18         pair[0] += buf[i];
19         pair[1] += pair[0];
20     }
21 }
22 
vmx_accum32(uint32_t * s,const unsigned char * buf,size_t len)23 static void vmx_accum32(uint32_t *s, const unsigned char *buf, size_t len) {
24     /* Different taps for the separable components of sums */
25     const vector unsigned char t0 = {64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49};
26     const vector unsigned char t1 = {48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33};
27     const vector unsigned char t2 = {32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17};
28     const vector unsigned char t3 = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
29     /* As silly and inefficient as it seems, creating 1 permutation vector to permute
30      * a 2 element vector from a single load + a subsequent shift is just barely faster
31      * than doing 2 indexed insertions into zero initialized vectors from unaligned memory. */
32     const vector unsigned char s0_perm = {0, 1, 2, 3, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
33     const vector unsigned char shift_vec = vec_sl(vec_splat_u8(8), vec_splat_u8(2));
34     vector unsigned int  adacc, s2acc;
35     vector unsigned int pair_vec = vec_ld(0, s);
36     adacc = vec_perm(pair_vec, pair_vec, s0_perm);
37     s2acc = vec_slo(pair_vec, shift_vec);
38 
39     vector unsigned int zero = vmx_zero();
40     vector unsigned int s3acc = zero;
41     vector unsigned int s3acc_0 = zero;
42     vector unsigned int adacc_prev = adacc;
43     vector unsigned int adacc_prev_0 = zero;
44 
45     vector unsigned int s2acc_0 = zero;
46     vector unsigned int s2acc_1 = zero;
47     vector unsigned int s2acc_2 = zero;
48 
49     /* Maintain a running sum of a second half, this might help use break yet another
50      * data dependency bubble in the sum */
51     vector unsigned int adacc_0 = zero;
52 
53     int num_iter = len / 4;
54     int rem = len & 3;
55 
56     for (int i = 0; i < num_iter; ++i) {
57         vector unsigned char d0 = vec_ld(0, buf);
58         vector unsigned char d1 = vec_ld(16, buf);
59         vector unsigned char d2 = vec_ld(32, buf);
60         vector unsigned char d3 = vec_ld(48, buf);
61 
62         /* The core operation of the loop, basically
63          * what is being unrolled below */
64         adacc = vec_sum4s(d0, adacc);
65         s3acc = vec_add(s3acc, adacc_prev);
66         s3acc_0 = vec_add(s3acc_0, adacc_prev_0);
67         s2acc = vec_msum(t0, d0, s2acc);
68 
69         /* interleave dependent sums in here */
70         adacc_0 = vec_sum4s(d1, adacc_0);
71         s2acc_0 = vec_msum(t1, d1, s2acc_0);
72         adacc = vec_sum4s(d2, adacc);
73         s2acc_1 = vec_msum(t2, d2, s2acc_1);
74         s2acc_2 = vec_msum(t3, d3, s2acc_2);
75         adacc_0 = vec_sum4s(d3, adacc_0);
76 
77         adacc_prev = adacc;
78         adacc_prev_0 = adacc_0;
79         buf += 64;
80     }
81 
82     adacc = vec_add(adacc, adacc_0);
83     s3acc = vec_add(s3acc, s3acc_0);
84     s3acc = vec_sl(s3acc, vec_splat_u32(6));
85 
86     if (rem) {
87         adacc_prev = vec_add(adacc_prev_0, adacc_prev);
88         adacc_prev = vec_sl(adacc_prev, vec_splat_u32(4));
89         while (rem--) {
90             vector unsigned char d0 = vec_ld(0, buf);
91             adacc = vec_sum4s(d0, adacc);
92             s3acc = vec_add(s3acc, adacc_prev);
93             s2acc = vec_msum(t3, d0, s2acc);
94             adacc_prev = vec_sl(adacc, vec_splat_u32(4));
95             buf += 16;
96         }
97     }
98 
99 
100     /* Sum up independent second sums */
101     s2acc = vec_add(s2acc, s2acc_0);
102     s2acc_2 = vec_add(s2acc_1, s2acc_2);
103     s2acc = vec_add(s2acc, s2acc_2);
104 
105     s2acc = vec_add(s2acc, s3acc);
106 
107     adacc = vec_add(adacc, vec_sld(adacc, adacc, 8));
108     s2acc = vec_add(s2acc, vec_sld(s2acc, s2acc, 8));
109     adacc = vec_add(adacc, vec_sld(adacc, adacc, 4));
110     s2acc = vec_add(s2acc, vec_sld(s2acc, s2acc, 4));
111 
112     vec_ste(adacc, 0, s);
113     vec_ste(s2acc, 0, s+1);
114 }
115 
adler32_vmx(uint32_t adler,const unsigned char * buf,size_t len)116 uint32_t adler32_vmx(uint32_t adler, const unsigned char *buf, size_t len) {
117     uint32_t sum2;
118     uint32_t pair[16] ALIGNED_(16);
119     memset(&pair[2], 0, 14);
120     int n = NMAX;
121     unsigned int done = 0, i;
122 
123     /* Split Adler-32 into component sums, it can be supplied by
124      * the caller sites (e.g. in a PNG file).
125      */
126     sum2 = (adler >> 16) & 0xffff;
127     adler &= 0xffff;
128     pair[0] = adler;
129     pair[1] = sum2;
130 
131     /* in case user likes doing a byte at a time, keep it fast */
132     if (UNLIKELY(len == 1))
133         return adler32_len_1(adler, buf, sum2);
134 
135     /* initial Adler-32 value (deferred check for len == 1 speed) */
136     if (UNLIKELY(buf == NULL))
137         return 1L;
138 
139     /* in case short lengths are provided, keep it somewhat fast */
140     if (UNLIKELY(len < 16))
141         return adler32_len_16(adler, buf, len, sum2);
142 
143     // Align buffer
144     unsigned int al = 0;
145     if ((uintptr_t)buf & 0xf) {
146         al = 16-((uintptr_t)buf & 0xf);
147         if (al > len) {
148             al=len;
149         }
150         vmx_handle_head_or_tail(pair, buf, al);
151 
152         done += al;
153         /* Rather than rebasing, we can reduce the max sums for the
154          * first round only */
155         n -= al;
156     }
157     for (i = al; i < len; i += n) {
158         int remaining = (int)(len-i);
159         n = MIN(remaining, (i == al) ? n : NMAX);
160 
161         if (n < 16)
162             break;
163 
164         vmx_accum32(pair, buf + i, n / 16);
165         pair[0] %= BASE;
166         pair[1] %= BASE;
167 
168         done += (n / 16) * 16;
169     }
170 
171     /* Handle the tail elements. */
172     if (done < len) {
173         vmx_handle_head_or_tail(pair, (buf + done), len - done);
174         pair[0] %= BASE;
175         pair[1] %= BASE;
176     }
177 
178     /* D = B * 65536 + A, see: https://en.wikipedia.org/wiki/Adler-32. */
179     return (pair[1] << 16) | pair[0];
180 }
181 #endif
182