1*424fb153SAndroid Build Coastguard Worker // Copyright 2008 Google Inc. All Rights Reserved.
2*424fb153SAndroid Build Coastguard Worker
3*424fb153SAndroid Build Coastguard Worker // Licensed under the Apache License, Version 2.0 (the "License");
4*424fb153SAndroid Build Coastguard Worker // you may not use this file except in compliance with the License.
5*424fb153SAndroid Build Coastguard Worker // You may obtain a copy of the License at
6*424fb153SAndroid Build Coastguard Worker
7*424fb153SAndroid Build Coastguard Worker // http://www.apache.org/licenses/LICENSE-2.0
8*424fb153SAndroid Build Coastguard Worker
9*424fb153SAndroid Build Coastguard Worker // Unless required by applicable law or agreed to in writing, software
10*424fb153SAndroid Build Coastguard Worker // distributed under the License is distributed on an "AS IS" BASIS,
11*424fb153SAndroid Build Coastguard Worker // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12*424fb153SAndroid Build Coastguard Worker // See the License for the specific language governing permissions and
13*424fb153SAndroid Build Coastguard Worker // limitations under the License.
14*424fb153SAndroid Build Coastguard Worker
15*424fb153SAndroid Build Coastguard Worker #include "adler32memcpy.h"
16*424fb153SAndroid Build Coastguard Worker
17*424fb153SAndroid Build Coastguard Worker // We are using (a modified form of) adler-32 checksum algorithm instead
18*424fb153SAndroid Build Coastguard Worker // of CRC since adler-32 is faster than CRC.
19*424fb153SAndroid Build Coastguard Worker // (Comparison: http://guru.multimedia.cx/crc32-vs-adler32/)
// This form of Adler is slightly modified: instead of treating the data in
// units of bytes, 32-bit data is taken as a unit and two 64-bit
// checksums are done (we could have one checksum but two checksums
// make the code run faster).
24*424fb153SAndroid Build Coastguard Worker
25*424fb153SAndroid Build Coastguard Worker // Adler-32 implementation:
26*424fb153SAndroid Build Coastguard Worker // Data is treated as 1-byte numbers and,
27*424fb153SAndroid Build Coastguard Worker // there are two 16-bit numbers a and b
28*424fb153SAndroid Build Coastguard Worker // Initialize a with 1 and b with 0.
29*424fb153SAndroid Build Coastguard Worker // for each data unit 'd'
30*424fb153SAndroid Build Coastguard Worker // a += d
31*424fb153SAndroid Build Coastguard Worker // b += a
// checksum = (b << 16) + a
33*424fb153SAndroid Build Coastguard Worker // This sum should never overflow.
34*424fb153SAndroid Build Coastguard Worker //
35*424fb153SAndroid Build Coastguard Worker // Adler-64+64 implementation:
36*424fb153SAndroid Build Coastguard Worker // (applied in this code)
37*424fb153SAndroid Build Coastguard Worker // Data is treated as 32-bit numbers and whole data is separated into two
38*424fb153SAndroid Build Coastguard Worker // streams, and hence the two checksums a1, a2, b1 and b2.
39*424fb153SAndroid Build Coastguard Worker // Initialize a1 and a2 with 1, b1 and b2 with 0
40*424fb153SAndroid Build Coastguard Worker // add first dataunit to a1
41*424fb153SAndroid Build Coastguard Worker // add a1 to b1
42*424fb153SAndroid Build Coastguard Worker // add second dataunit to a1
43*424fb153SAndroid Build Coastguard Worker // add a1 to b1
44*424fb153SAndroid Build Coastguard Worker // add third dataunit to a2
45*424fb153SAndroid Build Coastguard Worker // add a2 to b2
46*424fb153SAndroid Build Coastguard Worker // add fourth dataunit to a2
47*424fb153SAndroid Build Coastguard Worker // add a2 to b2
48*424fb153SAndroid Build Coastguard Worker // ...
49*424fb153SAndroid Build Coastguard Worker // repeat the sequence back for next 4 dataunits
50*424fb153SAndroid Build Coastguard Worker //
51*424fb153SAndroid Build Coastguard Worker // variable A = XMM6 and variable B = XMM7.
52*424fb153SAndroid Build Coastguard Worker // (a1 = lower 8 bytes of XMM6 and b1 = lower 8 bytes of XMM7)
53*424fb153SAndroid Build Coastguard Worker
54*424fb153SAndroid Build Coastguard Worker // Assumptions
55*424fb153SAndroid Build Coastguard Worker // 1. size_in_bytes is a multiple of 16.
56*424fb153SAndroid Build Coastguard Worker // 2. srcmem and dstmem are 16 byte aligned.
57*424fb153SAndroid Build Coastguard Worker // 3. size_in_bytes is less than 2^19 bytes.
58*424fb153SAndroid Build Coastguard Worker
59*424fb153SAndroid Build Coastguard Worker // Assumption 3 ensures that there is no overflow when numbers are being
60*424fb153SAndroid Build Coastguard Worker // added (we can remove this assumption by doing modulus with a prime
61*424fb153SAndroid Build Coastguard Worker // number when it is just about to overflow but that would be a very costly
62*424fb153SAndroid Build Coastguard Worker // exercise)
63*424fb153SAndroid Build Coastguard Worker
64*424fb153SAndroid Build Coastguard Worker // Returns true if the checksums are equal.
Equals(const AdlerChecksum & other) const65*424fb153SAndroid Build Coastguard Worker bool AdlerChecksum::Equals(const AdlerChecksum &other) const {
66*424fb153SAndroid Build Coastguard Worker return ( (a1_ == other.a1_) && (a2_ == other.a2_) &&
67*424fb153SAndroid Build Coastguard Worker (b1_ == other.b1_) && (b2_ == other.b2_) );
68*424fb153SAndroid Build Coastguard Worker }
69*424fb153SAndroid Build Coastguard Worker
70*424fb153SAndroid Build Coastguard Worker // Returns string representation of the Adler checksum.
ToHexString() const71*424fb153SAndroid Build Coastguard Worker string AdlerChecksum::ToHexString() const {
72*424fb153SAndroid Build Coastguard Worker char buffer[128];
73*424fb153SAndroid Build Coastguard Worker snprintf(buffer, sizeof(buffer), "%016llx %016llx %016llx %016llx", a1_, a2_, b1_, b2_);
74*424fb153SAndroid Build Coastguard Worker return string(buffer);
75*424fb153SAndroid Build Coastguard Worker }
76*424fb153SAndroid Build Coastguard Worker
77*424fb153SAndroid Build Coastguard Worker // Sets components of the Adler checksum.
Set(uint64 a1,uint64 a2,uint64 b1,uint64 b2)78*424fb153SAndroid Build Coastguard Worker void AdlerChecksum::Set(uint64 a1, uint64 a2, uint64 b1, uint64 b2) {
79*424fb153SAndroid Build Coastguard Worker a1_ = a1;
80*424fb153SAndroid Build Coastguard Worker a2_ = a2;
81*424fb153SAndroid Build Coastguard Worker b1_ = b1;
82*424fb153SAndroid Build Coastguard Worker b2_ = b2;
83*424fb153SAndroid Build Coastguard Worker }
84*424fb153SAndroid Build Coastguard Worker
85*424fb153SAndroid Build Coastguard Worker // Calculates Adler checksum for supplied data.
CalculateAdlerChecksum(uint64 * data64,unsigned int size_in_bytes,AdlerChecksum * checksum)86*424fb153SAndroid Build Coastguard Worker bool CalculateAdlerChecksum(uint64 *data64, unsigned int size_in_bytes,
87*424fb153SAndroid Build Coastguard Worker AdlerChecksum *checksum) {
88*424fb153SAndroid Build Coastguard Worker // Use this data wrapper to access memory with 64bit read/write.
89*424fb153SAndroid Build Coastguard Worker datacast_t data;
90*424fb153SAndroid Build Coastguard Worker unsigned int count = size_in_bytes / sizeof(data);
91*424fb153SAndroid Build Coastguard Worker
92*424fb153SAndroid Build Coastguard Worker if (count > (1U) << 19) {
93*424fb153SAndroid Build Coastguard Worker // Size is too large, must be strictly less than 512 KB.
94*424fb153SAndroid Build Coastguard Worker return false;
95*424fb153SAndroid Build Coastguard Worker }
96*424fb153SAndroid Build Coastguard Worker
97*424fb153SAndroid Build Coastguard Worker uint64 a1 = 1;
98*424fb153SAndroid Build Coastguard Worker uint64 a2 = 1;
99*424fb153SAndroid Build Coastguard Worker uint64 b1 = 0;
100*424fb153SAndroid Build Coastguard Worker uint64 b2 = 0;
101*424fb153SAndroid Build Coastguard Worker
102*424fb153SAndroid Build Coastguard Worker unsigned int i = 0;
103*424fb153SAndroid Build Coastguard Worker while (i < count) {
104*424fb153SAndroid Build Coastguard Worker // Process 64 bits at a time.
105*424fb153SAndroid Build Coastguard Worker data.l64 = data64[i];
106*424fb153SAndroid Build Coastguard Worker a1 = a1 + data.l32.l;
107*424fb153SAndroid Build Coastguard Worker b1 = b1 + a1;
108*424fb153SAndroid Build Coastguard Worker a1 = a1 + data.l32.h;
109*424fb153SAndroid Build Coastguard Worker b1 = b1 + a1;
110*424fb153SAndroid Build Coastguard Worker i++;
111*424fb153SAndroid Build Coastguard Worker
112*424fb153SAndroid Build Coastguard Worker data.l64 = data64[i];
113*424fb153SAndroid Build Coastguard Worker a2 = a2 + data.l32.l;
114*424fb153SAndroid Build Coastguard Worker b2 = b2 + a2;
115*424fb153SAndroid Build Coastguard Worker a2 = a2 + data.l32.h;
116*424fb153SAndroid Build Coastguard Worker b2 = b2 + a2;
117*424fb153SAndroid Build Coastguard Worker i++;
118*424fb153SAndroid Build Coastguard Worker }
119*424fb153SAndroid Build Coastguard Worker checksum->Set(a1, a2, b1, b2);
120*424fb153SAndroid Build Coastguard Worker return true;
121*424fb153SAndroid Build Coastguard Worker }
122*424fb153SAndroid Build Coastguard Worker
123*424fb153SAndroid Build Coastguard Worker // C implementation of Adler memory copy.
AdlerMemcpyC(uint64 * dstmem64,uint64 * srcmem64,unsigned int size_in_bytes,AdlerChecksum * checksum)124*424fb153SAndroid Build Coastguard Worker bool AdlerMemcpyC(uint64 *dstmem64, uint64 *srcmem64,
125*424fb153SAndroid Build Coastguard Worker unsigned int size_in_bytes, AdlerChecksum *checksum) {
126*424fb153SAndroid Build Coastguard Worker // Use this data wrapper to access memory with 64bit read/write.
127*424fb153SAndroid Build Coastguard Worker datacast_t data;
128*424fb153SAndroid Build Coastguard Worker unsigned int count = size_in_bytes / sizeof(data);
129*424fb153SAndroid Build Coastguard Worker
130*424fb153SAndroid Build Coastguard Worker if (count > ((1U) << 19)) {
131*424fb153SAndroid Build Coastguard Worker // Size is too large, must be strictly less than 512 KB.
132*424fb153SAndroid Build Coastguard Worker return false;
133*424fb153SAndroid Build Coastguard Worker }
134*424fb153SAndroid Build Coastguard Worker
135*424fb153SAndroid Build Coastguard Worker uint64 a1 = 1;
136*424fb153SAndroid Build Coastguard Worker uint64 a2 = 1;
137*424fb153SAndroid Build Coastguard Worker uint64 b1 = 0;
138*424fb153SAndroid Build Coastguard Worker uint64 b2 = 0;
139*424fb153SAndroid Build Coastguard Worker
140*424fb153SAndroid Build Coastguard Worker unsigned int i = 0;
141*424fb153SAndroid Build Coastguard Worker while (i < count) {
142*424fb153SAndroid Build Coastguard Worker // Process 64 bits at a time.
143*424fb153SAndroid Build Coastguard Worker data.l64 = srcmem64[i];
144*424fb153SAndroid Build Coastguard Worker a1 = a1 + data.l32.l;
145*424fb153SAndroid Build Coastguard Worker b1 = b1 + a1;
146*424fb153SAndroid Build Coastguard Worker a1 = a1 + data.l32.h;
147*424fb153SAndroid Build Coastguard Worker b1 = b1 + a1;
148*424fb153SAndroid Build Coastguard Worker dstmem64[i] = data.l64;
149*424fb153SAndroid Build Coastguard Worker i++;
150*424fb153SAndroid Build Coastguard Worker
151*424fb153SAndroid Build Coastguard Worker data.l64 = srcmem64[i];
152*424fb153SAndroid Build Coastguard Worker a2 = a2 + data.l32.l;
153*424fb153SAndroid Build Coastguard Worker b2 = b2 + a2;
154*424fb153SAndroid Build Coastguard Worker a2 = a2 + data.l32.h;
155*424fb153SAndroid Build Coastguard Worker b2 = b2 + a2;
156*424fb153SAndroid Build Coastguard Worker dstmem64[i] = data.l64;
157*424fb153SAndroid Build Coastguard Worker i++;
158*424fb153SAndroid Build Coastguard Worker }
159*424fb153SAndroid Build Coastguard Worker checksum->Set(a1, a2, b1, b2);
160*424fb153SAndroid Build Coastguard Worker return true;
161*424fb153SAndroid Build Coastguard Worker }
162*424fb153SAndroid Build Coastguard Worker
163*424fb153SAndroid Build Coastguard Worker // C implementation of Adler memory copy with some float point ops,
164*424fb153SAndroid Build Coastguard Worker // attempting to warm up the CPU.
AdlerMemcpyWarmC(uint64 * dstmem64,uint64 * srcmem64,unsigned int size_in_bytes,AdlerChecksum * checksum)165*424fb153SAndroid Build Coastguard Worker bool AdlerMemcpyWarmC(uint64 *dstmem64, uint64 *srcmem64,
166*424fb153SAndroid Build Coastguard Worker unsigned int size_in_bytes, AdlerChecksum *checksum) {
167*424fb153SAndroid Build Coastguard Worker // Use this data wrapper to access memory with 64bit read/write.
168*424fb153SAndroid Build Coastguard Worker datacast_t data;
169*424fb153SAndroid Build Coastguard Worker unsigned int count = size_in_bytes / sizeof(data);
170*424fb153SAndroid Build Coastguard Worker
171*424fb153SAndroid Build Coastguard Worker if (count > ((1U) << 19)) {
172*424fb153SAndroid Build Coastguard Worker // Size is too large, must be strictly less than 512 KB.
173*424fb153SAndroid Build Coastguard Worker return false;
174*424fb153SAndroid Build Coastguard Worker }
175*424fb153SAndroid Build Coastguard Worker
176*424fb153SAndroid Build Coastguard Worker uint64 a1 = 1;
177*424fb153SAndroid Build Coastguard Worker uint64 a2 = 1;
178*424fb153SAndroid Build Coastguard Worker uint64 b1 = 0;
179*424fb153SAndroid Build Coastguard Worker uint64 b2 = 0;
180*424fb153SAndroid Build Coastguard Worker
181*424fb153SAndroid Build Coastguard Worker double a = 2.0 * static_cast<double>(srcmem64[0]);
182*424fb153SAndroid Build Coastguard Worker double b = 5.0 * static_cast<double>(srcmem64[0]);
183*424fb153SAndroid Build Coastguard Worker double c = 7.0 * static_cast<double>(srcmem64[0]);
184*424fb153SAndroid Build Coastguard Worker double d = 9.0 * static_cast<double>(srcmem64[0]);
185*424fb153SAndroid Build Coastguard Worker
186*424fb153SAndroid Build Coastguard Worker unsigned int i = 0;
187*424fb153SAndroid Build Coastguard Worker while (i < count) {
188*424fb153SAndroid Build Coastguard Worker // Process 64 bits at a time.
189*424fb153SAndroid Build Coastguard Worker data.l64 = srcmem64[i];
190*424fb153SAndroid Build Coastguard Worker a1 = a1 + data.l32.l;
191*424fb153SAndroid Build Coastguard Worker b1 = b1 + a1;
192*424fb153SAndroid Build Coastguard Worker a1 = a1 + data.l32.h;
193*424fb153SAndroid Build Coastguard Worker b1 = b1 + a1;
194*424fb153SAndroid Build Coastguard Worker dstmem64[i] = data.l64;
195*424fb153SAndroid Build Coastguard Worker i++;
196*424fb153SAndroid Build Coastguard Worker
197*424fb153SAndroid Build Coastguard Worker // Warm cpu up.
198*424fb153SAndroid Build Coastguard Worker a = a * b;
199*424fb153SAndroid Build Coastguard Worker b = b + c;
200*424fb153SAndroid Build Coastguard Worker
201*424fb153SAndroid Build Coastguard Worker data.l64 = srcmem64[i];
202*424fb153SAndroid Build Coastguard Worker a2 = a2 + data.l32.l;
203*424fb153SAndroid Build Coastguard Worker b2 = b2 + a2;
204*424fb153SAndroid Build Coastguard Worker a2 = a2 + data.l32.h;
205*424fb153SAndroid Build Coastguard Worker b2 = b2 + a2;
206*424fb153SAndroid Build Coastguard Worker dstmem64[i] = data.l64;
207*424fb153SAndroid Build Coastguard Worker i++;
208*424fb153SAndroid Build Coastguard Worker
209*424fb153SAndroid Build Coastguard Worker // Warm cpu up.
210*424fb153SAndroid Build Coastguard Worker c = c * d;
211*424fb153SAndroid Build Coastguard Worker d = d + d;
212*424fb153SAndroid Build Coastguard Worker }
213*424fb153SAndroid Build Coastguard Worker
214*424fb153SAndroid Build Coastguard Worker // Warm cpu up.
215*424fb153SAndroid Build Coastguard Worker d = a + b + c + d;
216*424fb153SAndroid Build Coastguard Worker if (d == 1.0) {
217*424fb153SAndroid Build Coastguard Worker // Reference the result so that it can't be discarded by the compiler.
218*424fb153SAndroid Build Coastguard Worker printf("Log: This will probably never happen.\n");
219*424fb153SAndroid Build Coastguard Worker }
220*424fb153SAndroid Build Coastguard Worker
221*424fb153SAndroid Build Coastguard Worker checksum->Set(a1, a2, b1, b2);
222*424fb153SAndroid Build Coastguard Worker return true;
223*424fb153SAndroid Build Coastguard Worker }
224*424fb153SAndroid Build Coastguard Worker
225*424fb153SAndroid Build Coastguard Worker // x86_64 SSE2 assembly implementation of fast and stressful Adler memory copy.
AdlerMemcpyAsm(uint64 * dstmem64,uint64 * srcmem64,unsigned int size_in_bytes,AdlerChecksum * checksum)226*424fb153SAndroid Build Coastguard Worker bool AdlerMemcpyAsm(uint64 *dstmem64, uint64 *srcmem64,
227*424fb153SAndroid Build Coastguard Worker unsigned int size_in_bytes, AdlerChecksum *checksum) {
228*424fb153SAndroid Build Coastguard Worker // Use assembly implementation where supported.
229*424fb153SAndroid Build Coastguard Worker #if defined(STRESSAPPTEST_CPU_X86_64) || defined(STRESSAPPTEST_CPU_I686)
230*424fb153SAndroid Build Coastguard Worker
231*424fb153SAndroid Build Coastguard Worker // Pull a bit of tricky preprocessing to make the inline asm both
232*424fb153SAndroid Build Coastguard Worker // 32 bit and 64 bit.
233*424fb153SAndroid Build Coastguard Worker #ifdef STRESSAPPTEST_CPU_I686 // Instead of coding both, x86...
234*424fb153SAndroid Build Coastguard Worker #define rAX "%%eax"
235*424fb153SAndroid Build Coastguard Worker #define rCX "%%ecx"
236*424fb153SAndroid Build Coastguard Worker #define rDX "%%edx"
237*424fb153SAndroid Build Coastguard Worker #define rBX "%%ebx"
238*424fb153SAndroid Build Coastguard Worker #define rSP "%%esp"
239*424fb153SAndroid Build Coastguard Worker #define rBP "%%ebp"
240*424fb153SAndroid Build Coastguard Worker #define rSI "%%esi"
241*424fb153SAndroid Build Coastguard Worker #define rDI "%%edi"
242*424fb153SAndroid Build Coastguard Worker #endif
243*424fb153SAndroid Build Coastguard Worker
244*424fb153SAndroid Build Coastguard Worker #ifdef STRESSAPPTEST_CPU_X86_64 // ...and x64, we use rXX macros.
245*424fb153SAndroid Build Coastguard Worker #define rAX "%%rax"
246*424fb153SAndroid Build Coastguard Worker #define rCX "%%rcx"
247*424fb153SAndroid Build Coastguard Worker #define rDX "%%rdx"
248*424fb153SAndroid Build Coastguard Worker #define rBX "%%rbx"
249*424fb153SAndroid Build Coastguard Worker #define rSP "%%rsp"
250*424fb153SAndroid Build Coastguard Worker #define rBP "%%rbp"
251*424fb153SAndroid Build Coastguard Worker #define rSI "%%rsi"
252*424fb153SAndroid Build Coastguard Worker #define rDI "%%rdi"
253*424fb153SAndroid Build Coastguard Worker #endif
254*424fb153SAndroid Build Coastguard Worker
255*424fb153SAndroid Build Coastguard Worker // Elements 0 to 3 are used for holding checksum terms a1, a2,
256*424fb153SAndroid Build Coastguard Worker // b1, b2 respectively. These elements are filled by asm code.
257*424fb153SAndroid Build Coastguard Worker // Elements 4 and 5 are used by asm code to for ANDing MMX data and removing
258*424fb153SAndroid Build Coastguard Worker // 2 words from each MMX register (A MMX reg has 4 words, by ANDing we are
259*424fb153SAndroid Build Coastguard Worker // setting word index 0 and word index 2 to zero).
260*424fb153SAndroid Build Coastguard Worker // Element 6 and 7 are used for setting a1 and a2 to 1.
261*424fb153SAndroid Build Coastguard Worker volatile uint64 checksum_arr[] __attribute__ ((aligned(16))) =
262*424fb153SAndroid Build Coastguard Worker {0, 0, 0, 0, 0x00000000ffffffffUL, 0x00000000ffffffffUL, 1, 1};
263*424fb153SAndroid Build Coastguard Worker
264*424fb153SAndroid Build Coastguard Worker if ((size_in_bytes >> 19) > 0) {
265*424fb153SAndroid Build Coastguard Worker // Size is too large. Must be less than 2^19 bytes = 512 KB.
266*424fb153SAndroid Build Coastguard Worker return false;
267*424fb153SAndroid Build Coastguard Worker }
268*424fb153SAndroid Build Coastguard Worker
269*424fb153SAndroid Build Coastguard Worker // Number of 32-bit words which are not added to a1/a2 in the main loop.
270*424fb153SAndroid Build Coastguard Worker uint32 remaining_words = (size_in_bytes % 48) / 4;
271*424fb153SAndroid Build Coastguard Worker
272*424fb153SAndroid Build Coastguard Worker // Since we are moving 48 bytes at a time number of iterations = total size/48
273*424fb153SAndroid Build Coastguard Worker // is value of counter.
274*424fb153SAndroid Build Coastguard Worker uint32 num_of_48_byte_units = size_in_bytes / 48;
275*424fb153SAndroid Build Coastguard Worker
276*424fb153SAndroid Build Coastguard Worker asm volatile (
277*424fb153SAndroid Build Coastguard Worker // Source address is in ESI (extended source index)
278*424fb153SAndroid Build Coastguard Worker // destination is in EDI (extended destination index)
279*424fb153SAndroid Build Coastguard Worker // and counter is already in ECX (extended counter
280*424fb153SAndroid Build Coastguard Worker // index).
281*424fb153SAndroid Build Coastguard Worker "cmp $0, " rCX ";" // Compare counter to zero.
282*424fb153SAndroid Build Coastguard Worker "jz END;"
283*424fb153SAndroid Build Coastguard Worker
284*424fb153SAndroid Build Coastguard Worker // XMM6 is initialized with 1 and XMM7 with 0.
285*424fb153SAndroid Build Coastguard Worker "prefetchnta 0(" rSI ");"
286*424fb153SAndroid Build Coastguard Worker "prefetchnta 64(" rSI ");"
287*424fb153SAndroid Build Coastguard Worker "movdqu 48(" rAX "), %%xmm6;"
288*424fb153SAndroid Build Coastguard Worker "xorps %%xmm7, %%xmm7;"
289*424fb153SAndroid Build Coastguard Worker
290*424fb153SAndroid Build Coastguard Worker // Start of the loop which copies 48 bytes from source to dst each time.
291*424fb153SAndroid Build Coastguard Worker "TOP:\n"
292*424fb153SAndroid Build Coastguard Worker
293*424fb153SAndroid Build Coastguard Worker // Make 6 moves each of 16 bytes from srcmem to XMM registers.
294*424fb153SAndroid Build Coastguard Worker // We are using 2 words out of 4 words in each XMM register,
295*424fb153SAndroid Build Coastguard Worker // word index 0 and word index 2
296*424fb153SAndroid Build Coastguard Worker "movdqa 0(" rSI "), %%xmm0;"
297*424fb153SAndroid Build Coastguard Worker "movdqu 4(" rSI "), %%xmm1;" // Be careful to use unaligned move here.
298*424fb153SAndroid Build Coastguard Worker "movdqa 16(" rSI "), %%xmm2;"
299*424fb153SAndroid Build Coastguard Worker "movdqu 20(" rSI "), %%xmm3;"
300*424fb153SAndroid Build Coastguard Worker "movdqa 32(" rSI "), %%xmm4;"
301*424fb153SAndroid Build Coastguard Worker "movdqu 36(" rSI "), %%xmm5;"
302*424fb153SAndroid Build Coastguard Worker
303*424fb153SAndroid Build Coastguard Worker // Move 3 * 16 bytes from XMM registers to dstmem.
304*424fb153SAndroid Build Coastguard Worker // Note: this copy must be performed before pinsrw instructions since
305*424fb153SAndroid Build Coastguard Worker // they will modify the XMM registers.
306*424fb153SAndroid Build Coastguard Worker "movntdq %%xmm0, 0(" rDI ");"
307*424fb153SAndroid Build Coastguard Worker "movntdq %%xmm2, 16(" rDI ");"
308*424fb153SAndroid Build Coastguard Worker "movntdq %%xmm4, 32(" rDI ");"
309*424fb153SAndroid Build Coastguard Worker
310*424fb153SAndroid Build Coastguard Worker // Sets the word[1] and word[3] of XMM0 to XMM5 to zero.
311*424fb153SAndroid Build Coastguard Worker "andps 32(" rAX "), %%xmm0;"
312*424fb153SAndroid Build Coastguard Worker "andps 32(" rAX "), %%xmm1;"
313*424fb153SAndroid Build Coastguard Worker "andps 32(" rAX "), %%xmm2;"
314*424fb153SAndroid Build Coastguard Worker "andps 32(" rAX "), %%xmm3;"
315*424fb153SAndroid Build Coastguard Worker "andps 32(" rAX "), %%xmm4;"
316*424fb153SAndroid Build Coastguard Worker "andps 32(" rAX "), %%xmm5;"
317*424fb153SAndroid Build Coastguard Worker
318*424fb153SAndroid Build Coastguard Worker // Add XMM0 to XMM6 and then add XMM6 to XMM7.
319*424fb153SAndroid Build Coastguard Worker // Repeat this for XMM1, ..., XMM5.
320*424fb153SAndroid Build Coastguard Worker // Overflow(for XMM7) can occur only if there are more
321*424fb153SAndroid Build Coastguard Worker // than 2^16 additions => more than 2^17 words => more than 2^19 bytes so
322*424fb153SAndroid Build Coastguard Worker // if size_in_bytes > 2^19 than overflow occurs.
323*424fb153SAndroid Build Coastguard Worker "paddq %%xmm0, %%xmm6;"
324*424fb153SAndroid Build Coastguard Worker "paddq %%xmm6, %%xmm7;"
325*424fb153SAndroid Build Coastguard Worker "paddq %%xmm1, %%xmm6;"
326*424fb153SAndroid Build Coastguard Worker "paddq %%xmm6, %%xmm7;"
327*424fb153SAndroid Build Coastguard Worker "paddq %%xmm2, %%xmm6;"
328*424fb153SAndroid Build Coastguard Worker "paddq %%xmm6, %%xmm7;"
329*424fb153SAndroid Build Coastguard Worker "paddq %%xmm3, %%xmm6;"
330*424fb153SAndroid Build Coastguard Worker "paddq %%xmm6, %%xmm7;"
331*424fb153SAndroid Build Coastguard Worker "paddq %%xmm4, %%xmm6;"
332*424fb153SAndroid Build Coastguard Worker "paddq %%xmm6, %%xmm7;"
333*424fb153SAndroid Build Coastguard Worker "paddq %%xmm5, %%xmm6;"
334*424fb153SAndroid Build Coastguard Worker "paddq %%xmm6, %%xmm7;"
335*424fb153SAndroid Build Coastguard Worker
336*424fb153SAndroid Build Coastguard Worker // Increment ESI and EDI by 48 bytes and decrement counter by 1.
337*424fb153SAndroid Build Coastguard Worker "add $48, " rSI ";"
338*424fb153SAndroid Build Coastguard Worker "add $48, " rDI ";"
339*424fb153SAndroid Build Coastguard Worker "prefetchnta 0(" rSI ");"
340*424fb153SAndroid Build Coastguard Worker "prefetchnta 64(" rSI ");"
341*424fb153SAndroid Build Coastguard Worker "dec " rCX ";"
342*424fb153SAndroid Build Coastguard Worker "jnz TOP;"
343*424fb153SAndroid Build Coastguard Worker
344*424fb153SAndroid Build Coastguard Worker // Now only remaining_words 32-bit words are left.
345*424fb153SAndroid Build Coastguard Worker // make a loop, add first two words to a1 and next two to a2 (just like
346*424fb153SAndroid Build Coastguard Worker // above loop, the only extra thing we are doing is rechecking
347*424fb153SAndroid Build Coastguard Worker // rDX (=remaining_words) everytime we add a number to a1/a2.
348*424fb153SAndroid Build Coastguard Worker "REM_IS_STILL_NOT_ZERO:\n"
349*424fb153SAndroid Build Coastguard Worker // Unless remaining_words becomes less than 4 words(16 bytes)
350*424fb153SAndroid Build Coastguard Worker // there is not much issue and remaining_words will always
351*424fb153SAndroid Build Coastguard Worker // be a multiple of four by assumption.
352*424fb153SAndroid Build Coastguard Worker "cmp $4, " rDX ";"
353*424fb153SAndroid Build Coastguard Worker // In case for some weird reasons if remaining_words becomes
354*424fb153SAndroid Build Coastguard Worker // less than 4 but not zero then also break the code and go off to END.
355*424fb153SAndroid Build Coastguard Worker "jl END;"
356*424fb153SAndroid Build Coastguard Worker // Otherwise just go on and copy data in chunks of 4-words at a time till
357*424fb153SAndroid Build Coastguard Worker // whole data (<48 bytes) is copied.
358*424fb153SAndroid Build Coastguard Worker "movdqa 0(" rSI "), %%xmm0;" // Copy next 4-words to XMM0 and to XMM1.
359*424fb153SAndroid Build Coastguard Worker
360*424fb153SAndroid Build Coastguard Worker "movdqa 0(" rSI "), %%xmm5;" // Accomplish movdqu 4(%rSI) without
361*424fb153SAndroid Build Coastguard Worker "pshufd $0x39, %%xmm5, %%xmm1;" // indexing off memory boundary.
362*424fb153SAndroid Build Coastguard Worker
363*424fb153SAndroid Build Coastguard Worker "movntdq %%xmm0, 0(" rDI ");" // Copy 4-words to destination.
364*424fb153SAndroid Build Coastguard Worker "andps 32(" rAX "), %%xmm0;"
365*424fb153SAndroid Build Coastguard Worker "andps 32(" rAX "), %%xmm1;"
366*424fb153SAndroid Build Coastguard Worker "paddq %%xmm0, %%xmm6;"
367*424fb153SAndroid Build Coastguard Worker "paddq %%xmm6, %%xmm7;"
368*424fb153SAndroid Build Coastguard Worker "paddq %%xmm1, %%xmm6;"
369*424fb153SAndroid Build Coastguard Worker "paddq %%xmm6, %%xmm7;"
370*424fb153SAndroid Build Coastguard Worker "add $16, " rSI ";"
371*424fb153SAndroid Build Coastguard Worker "add $16, " rDI ";"
372*424fb153SAndroid Build Coastguard Worker "sub $4, " rDX ";"
373*424fb153SAndroid Build Coastguard Worker // Decrement %rDX by 4 since %rDX is number of 32-bit
374*424fb153SAndroid Build Coastguard Worker // words left after considering all 48-byte units.
375*424fb153SAndroid Build Coastguard Worker "jmp REM_IS_STILL_NOT_ZERO;"
376*424fb153SAndroid Build Coastguard Worker
377*424fb153SAndroid Build Coastguard Worker "END:\n"
378*424fb153SAndroid Build Coastguard Worker // Report checksum values A and B (both right now are two concatenated
379*424fb153SAndroid Build Coastguard Worker // 64 bit numbers and have to be converted to 64 bit numbers)
380*424fb153SAndroid Build Coastguard Worker // seems like Adler128 (since size of each part is 4 byte rather than
381*424fb153SAndroid Build Coastguard Worker // 1 byte).
382*424fb153SAndroid Build Coastguard Worker "movdqa %%xmm6, 0(" rAX ");"
383*424fb153SAndroid Build Coastguard Worker "movdqa %%xmm7, 16(" rAX ");"
384*424fb153SAndroid Build Coastguard Worker "sfence;"
385*424fb153SAndroid Build Coastguard Worker
386*424fb153SAndroid Build Coastguard Worker // No output registers.
387*424fb153SAndroid Build Coastguard Worker :
388*424fb153SAndroid Build Coastguard Worker // Input registers.
389*424fb153SAndroid Build Coastguard Worker : "S" (srcmem64), "D" (dstmem64), "a" (checksum_arr),
390*424fb153SAndroid Build Coastguard Worker "c" (num_of_48_byte_units), "d" (remaining_words)
391*424fb153SAndroid Build Coastguard Worker ); // asm.
392*424fb153SAndroid Build Coastguard Worker
393*424fb153SAndroid Build Coastguard Worker if (checksum != NULL) {
394*424fb153SAndroid Build Coastguard Worker checksum->Set(checksum_arr[0], checksum_arr[1],
395*424fb153SAndroid Build Coastguard Worker checksum_arr[2], checksum_arr[3]);
396*424fb153SAndroid Build Coastguard Worker }
397*424fb153SAndroid Build Coastguard Worker
398*424fb153SAndroid Build Coastguard Worker // Everything went fine, so return true (this does not mean
399*424fb153SAndroid Build Coastguard Worker // that there is no problem with memory this just mean that data was copied
400*424fb153SAndroid Build Coastguard Worker // from src to dst and checksum was calculated successfully).
401*424fb153SAndroid Build Coastguard Worker return true;
402*424fb153SAndroid Build Coastguard Worker #elif defined(STRESSAPPTEST_CPU_ARMV7A) && defined(__ARM_NEON__)
403*424fb153SAndroid Build Coastguard Worker // Elements 0 to 3 are used for holding checksum terms a1, a2,
404*424fb153SAndroid Build Coastguard Worker // b1, b2 respectively. These elements are filled by asm code.
405*424fb153SAndroid Build Coastguard Worker // Checksum is seeded with the null checksum.
406*424fb153SAndroid Build Coastguard Worker volatile uint64 checksum_arr[] __attribute__ ((aligned(16))) =
407*424fb153SAndroid Build Coastguard Worker {1, 1, 0, 0};
408*424fb153SAndroid Build Coastguard Worker
409*424fb153SAndroid Build Coastguard Worker if ((size_in_bytes >> 19) > 0) {
410*424fb153SAndroid Build Coastguard Worker // Size is too large. Must be less than 2^19 bytes = 512 KB.
411*424fb153SAndroid Build Coastguard Worker return false;
412*424fb153SAndroid Build Coastguard Worker }
413*424fb153SAndroid Build Coastguard Worker
414*424fb153SAndroid Build Coastguard Worker // Since we are moving 64 bytes at a time number of iterations = total size/64
415*424fb153SAndroid Build Coastguard Worker uint32 blocks = size_in_bytes / 64;
416*424fb153SAndroid Build Coastguard Worker
417*424fb153SAndroid Build Coastguard Worker uint64 *dst = dstmem64;
418*424fb153SAndroid Build Coastguard Worker uint64 *src = srcmem64;
419*424fb153SAndroid Build Coastguard Worker
420*424fb153SAndroid Build Coastguard Worker #define src_r "r3"
421*424fb153SAndroid Build Coastguard Worker #define dst_r "r4"
422*424fb153SAndroid Build Coastguard Worker #define blocks_r "r5"
423*424fb153SAndroid Build Coastguard Worker #define crc_r "r6"
424*424fb153SAndroid Build Coastguard Worker
425*424fb153SAndroid Build Coastguard Worker asm volatile (
426*424fb153SAndroid Build Coastguard Worker "mov " src_r ", %[src]; \n"
427*424fb153SAndroid Build Coastguard Worker "mov " dst_r ", %[dst]; \n"
428*424fb153SAndroid Build Coastguard Worker "mov " crc_r ", %[crc]; \n"
429*424fb153SAndroid Build Coastguard Worker "mov " blocks_r ", %[blocks]; \n"
430*424fb153SAndroid Build Coastguard Worker
431*424fb153SAndroid Build Coastguard Worker // Loop over block count.
432*424fb153SAndroid Build Coastguard Worker "cmp " blocks_r ", #0; \n" // Compare counter to zero.
433*424fb153SAndroid Build Coastguard Worker "ble END; \n"
434*424fb153SAndroid Build Coastguard Worker
435*424fb153SAndroid Build Coastguard Worker
436*424fb153SAndroid Build Coastguard Worker // Preload upcoming cacheline.
437*424fb153SAndroid Build Coastguard Worker "pld [" src_r ", #0x0]; \n"
438*424fb153SAndroid Build Coastguard Worker "pld [" src_r ", #0x20]; \n"
439*424fb153SAndroid Build Coastguard Worker
440*424fb153SAndroid Build Coastguard Worker // Init checksum
441*424fb153SAndroid Build Coastguard Worker "vldm " crc_r ", {q0}; \n"
442*424fb153SAndroid Build Coastguard Worker "vmov.i32 q1, #0; \n"
443*424fb153SAndroid Build Coastguard Worker
444*424fb153SAndroid Build Coastguard Worker // Start of the loop which copies 64 bytes from source to dst each time.
445*424fb153SAndroid Build Coastguard Worker "TOP: \n"
446*424fb153SAndroid Build Coastguard Worker
447*424fb153SAndroid Build Coastguard Worker // Make 4 moves each of 16 bytes from srcmem to qX registers.
448*424fb153SAndroid Build Coastguard Worker // We are using 2 words out of 4 words in each qX register,
449*424fb153SAndroid Build Coastguard Worker // word index 0 and word index 2. We'll swizzle them in a bit.
450*424fb153SAndroid Build Coastguard Worker // Copy it.
451*424fb153SAndroid Build Coastguard Worker "vldm " src_r "!, {q8, q9, q10, q11}; \n"
452*424fb153SAndroid Build Coastguard Worker "vstm " dst_r "!, {q8, q9, q10, q11}; \n"
453*424fb153SAndroid Build Coastguard Worker
454*424fb153SAndroid Build Coastguard Worker // Arrange it.
455*424fb153SAndroid Build Coastguard Worker "vmov.i64 q12, #0; \n"
456*424fb153SAndroid Build Coastguard Worker "vmov.i64 q13, #0; \n"
457*424fb153SAndroid Build Coastguard Worker "vmov.i64 q14, #0; \n"
458*424fb153SAndroid Build Coastguard Worker "vmov.i64 q15, #0; \n"
459*424fb153SAndroid Build Coastguard Worker // This exchanges words 1,3 in the filled registers with
460*424fb153SAndroid Build Coastguard Worker // words 0,2 in the empty registers.
461*424fb153SAndroid Build Coastguard Worker "vtrn.32 q8, q12; \n"
462*424fb153SAndroid Build Coastguard Worker "vtrn.32 q9, q13; \n"
463*424fb153SAndroid Build Coastguard Worker "vtrn.32 q10, q14; \n"
464*424fb153SAndroid Build Coastguard Worker "vtrn.32 q11, q15; \n"
465*424fb153SAndroid Build Coastguard Worker
466*424fb153SAndroid Build Coastguard Worker // Sum into q0, then into q1.
467*424fb153SAndroid Build Coastguard Worker // Repeat this for q8 - q15.
468*424fb153SAndroid Build Coastguard Worker // Overflow can occur only if there are more
469*424fb153SAndroid Build Coastguard Worker // than 2^16 additions => more than 2^17 words => more than 2^19 bytes so
470*424fb153SAndroid Build Coastguard Worker // if size_in_bytes > 2^19 then overflow occurs.
471*424fb153SAndroid Build Coastguard Worker "vadd.i64 q0, q0, q8; \n"
472*424fb153SAndroid Build Coastguard Worker "vadd.i64 q1, q1, q0; \n"
473*424fb153SAndroid Build Coastguard Worker "vadd.i64 q0, q0, q12; \n"
474*424fb153SAndroid Build Coastguard Worker "vadd.i64 q1, q1, q0; \n"
475*424fb153SAndroid Build Coastguard Worker "vadd.i64 q0, q0, q9; \n"
476*424fb153SAndroid Build Coastguard Worker "vadd.i64 q1, q1, q0; \n"
477*424fb153SAndroid Build Coastguard Worker "vadd.i64 q0, q0, q13; \n"
478*424fb153SAndroid Build Coastguard Worker "vadd.i64 q1, q1, q0; \n"
479*424fb153SAndroid Build Coastguard Worker
480*424fb153SAndroid Build Coastguard Worker "vadd.i64 q0, q0, q10; \n"
481*424fb153SAndroid Build Coastguard Worker "vadd.i64 q1, q1, q0; \n"
482*424fb153SAndroid Build Coastguard Worker "vadd.i64 q0, q0, q14; \n"
483*424fb153SAndroid Build Coastguard Worker "vadd.i64 q1, q1, q0; \n"
484*424fb153SAndroid Build Coastguard Worker "vadd.i64 q0, q0, q11; \n"
485*424fb153SAndroid Build Coastguard Worker "vadd.i64 q1, q1, q0; \n"
486*424fb153SAndroid Build Coastguard Worker "vadd.i64 q0, q0, q15; \n"
487*424fb153SAndroid Build Coastguard Worker "vadd.i64 q1, q1, q0; \n"
488*424fb153SAndroid Build Coastguard Worker
489*424fb153SAndroid Build Coastguard Worker // Increment counter and loop.
490*424fb153SAndroid Build Coastguard Worker "sub " blocks_r ", " blocks_r ", #1; \n"
491*424fb153SAndroid Build Coastguard Worker "cmp " blocks_r ", #0; \n" // Compare counter to zero.
492*424fb153SAndroid Build Coastguard Worker "bgt TOP; \n"
493*424fb153SAndroid Build Coastguard Worker
494*424fb153SAndroid Build Coastguard Worker
495*424fb153SAndroid Build Coastguard Worker "END:\n"
496*424fb153SAndroid Build Coastguard Worker // Report checksum values A and B (both right now are two concatenated
497*424fb153SAndroid Build Coastguard Worker // 64 bit numbers and have to be converted to 64 bit numbers)
498*424fb153SAndroid Build Coastguard Worker // seems like Adler128 (since size of each part is 4 byte rather than
499*424fb153SAndroid Build Coastguard Worker // 1 byte).
500*424fb153SAndroid Build Coastguard Worker "vstm " crc_r ", {q0, q1}; \n"
501*424fb153SAndroid Build Coastguard Worker
502*424fb153SAndroid Build Coastguard Worker // Output registers.
503*424fb153SAndroid Build Coastguard Worker :
504*424fb153SAndroid Build Coastguard Worker // Input registers.
505*424fb153SAndroid Build Coastguard Worker : [src] "r"(src), [dst] "r"(dst), [blocks] "r"(blocks) , [crc] "r"(checksum_arr)
506*424fb153SAndroid Build Coastguard Worker : "memory", "cc", "r3", "r4", "r5", "r6", "q0", "q1", "q8","q9","q10", "q11", "q12","q13","q14","q15"
507*424fb153SAndroid Build Coastguard Worker ); // asm.
508*424fb153SAndroid Build Coastguard Worker
509*424fb153SAndroid Build Coastguard Worker if (checksum != NULL) {
510*424fb153SAndroid Build Coastguard Worker checksum->Set(checksum_arr[0], checksum_arr[1],
511*424fb153SAndroid Build Coastguard Worker checksum_arr[2], checksum_arr[3]);
512*424fb153SAndroid Build Coastguard Worker }
513*424fb153SAndroid Build Coastguard Worker
514*424fb153SAndroid Build Coastguard Worker // Everything went fine, so return true (this does not mean
515*424fb153SAndroid Build Coastguard Worker // that there is no problem with memory; this just means that data was copied
516*424fb153SAndroid Build Coastguard Worker // from src to dst and checksum was calculated successfully).
517*424fb153SAndroid Build Coastguard Worker return true;
518*424fb153SAndroid Build Coastguard Worker #else
519*424fb153SAndroid Build Coastguard Worker #warning "No vector copy defined for this architecture."
520*424fb153SAndroid Build Coastguard Worker // Fall back to C implementation for anything else.
521*424fb153SAndroid Build Coastguard Worker return AdlerMemcpyWarmC(dstmem64, srcmem64, size_in_bytes, checksum);
522*424fb153SAndroid Build Coastguard Worker #endif
523*424fb153SAndroid Build Coastguard Worker }
524