xref: /aosp_15_r20/external/stressapptest/src/adler32memcpy.cc (revision 424fb153c814cbcb3e8904974796228774b3229a)
1*424fb153SAndroid Build Coastguard Worker // Copyright 2008 Google Inc. All Rights Reserved.
2*424fb153SAndroid Build Coastguard Worker 
3*424fb153SAndroid Build Coastguard Worker // Licensed under the Apache License, Version 2.0 (the "License");
4*424fb153SAndroid Build Coastguard Worker // you may not use this file except in compliance with the License.
5*424fb153SAndroid Build Coastguard Worker // You may obtain a copy of the License at
6*424fb153SAndroid Build Coastguard Worker 
7*424fb153SAndroid Build Coastguard Worker //      http://www.apache.org/licenses/LICENSE-2.0
8*424fb153SAndroid Build Coastguard Worker 
9*424fb153SAndroid Build Coastguard Worker // Unless required by applicable law or agreed to in writing, software
10*424fb153SAndroid Build Coastguard Worker // distributed under the License is distributed on an "AS IS" BASIS,
11*424fb153SAndroid Build Coastguard Worker // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12*424fb153SAndroid Build Coastguard Worker // See the License for the specific language governing permissions and
13*424fb153SAndroid Build Coastguard Worker // limitations under the License.
14*424fb153SAndroid Build Coastguard Worker 
15*424fb153SAndroid Build Coastguard Worker #include "adler32memcpy.h"
16*424fb153SAndroid Build Coastguard Worker 
17*424fb153SAndroid Build Coastguard Worker // We are using (a modified form of) adler-32 checksum algorithm instead
18*424fb153SAndroid Build Coastguard Worker // of CRC since adler-32 is faster than CRC.
19*424fb153SAndroid Build Coastguard Worker // (Comparison: http://guru.multimedia.cx/crc32-vs-adler32/)
// This form of adler is a bit modified: instead of treating the data in
// units of bytes, 32-bit data is taken as a unit and two 64-bit
// checksums are done (we could have one checksum but two checksums
// make the code run faster).
24*424fb153SAndroid Build Coastguard Worker 
25*424fb153SAndroid Build Coastguard Worker // Adler-32 implementation:
26*424fb153SAndroid Build Coastguard Worker //   Data is treated as 1-byte numbers and,
27*424fb153SAndroid Build Coastguard Worker //   there are two 16-bit numbers a and b
28*424fb153SAndroid Build Coastguard Worker //   Initialize a with 1 and b with 0.
29*424fb153SAndroid Build Coastguard Worker //   for each data unit 'd'
30*424fb153SAndroid Build Coastguard Worker //      a += d
31*424fb153SAndroid Build Coastguard Worker //      b += a
//   checksum = (b << 16) + a
33*424fb153SAndroid Build Coastguard Worker //   This sum should never overflow.
34*424fb153SAndroid Build Coastguard Worker //
35*424fb153SAndroid Build Coastguard Worker // Adler-64+64 implementation:
36*424fb153SAndroid Build Coastguard Worker //   (applied in this code)
37*424fb153SAndroid Build Coastguard Worker //   Data is treated as 32-bit numbers and whole data is separated into two
38*424fb153SAndroid Build Coastguard Worker //   streams, and hence the two checksums a1, a2, b1 and b2.
39*424fb153SAndroid Build Coastguard Worker //   Initialize a1 and a2 with 1, b1 and b2 with 0
40*424fb153SAndroid Build Coastguard Worker //   add first dataunit to a1
41*424fb153SAndroid Build Coastguard Worker //   add a1 to b1
42*424fb153SAndroid Build Coastguard Worker //   add second dataunit to a1
43*424fb153SAndroid Build Coastguard Worker //   add a1 to b1
44*424fb153SAndroid Build Coastguard Worker //   add third dataunit to a2
45*424fb153SAndroid Build Coastguard Worker //   add a2 to b2
46*424fb153SAndroid Build Coastguard Worker //   add fourth dataunit to a2
47*424fb153SAndroid Build Coastguard Worker //   add a2 to b2
48*424fb153SAndroid Build Coastguard Worker //   ...
49*424fb153SAndroid Build Coastguard Worker //   repeat the sequence back for next 4 dataunits
50*424fb153SAndroid Build Coastguard Worker //
51*424fb153SAndroid Build Coastguard Worker //   variable A = XMM6 and variable B = XMM7.
52*424fb153SAndroid Build Coastguard Worker //   (a1 = lower 8 bytes of XMM6 and b1 = lower 8 bytes of XMM7)
53*424fb153SAndroid Build Coastguard Worker 
54*424fb153SAndroid Build Coastguard Worker // Assumptions
55*424fb153SAndroid Build Coastguard Worker // 1. size_in_bytes is a multiple of 16.
56*424fb153SAndroid Build Coastguard Worker // 2. srcmem and dstmem are 16 byte aligned.
57*424fb153SAndroid Build Coastguard Worker // 3. size_in_bytes is less than 2^19 bytes.
58*424fb153SAndroid Build Coastguard Worker 
59*424fb153SAndroid Build Coastguard Worker // Assumption 3 ensures that there is no overflow when numbers are being
60*424fb153SAndroid Build Coastguard Worker // added (we can remove this assumption by doing modulus with a prime
61*424fb153SAndroid Build Coastguard Worker // number when it is just about to overflow but that would be a very costly
62*424fb153SAndroid Build Coastguard Worker // exercise)
63*424fb153SAndroid Build Coastguard Worker 
64*424fb153SAndroid Build Coastguard Worker // Returns true if the checksums are equal.
Equals(const AdlerChecksum & other) const65*424fb153SAndroid Build Coastguard Worker bool AdlerChecksum::Equals(const AdlerChecksum &other) const {
66*424fb153SAndroid Build Coastguard Worker   return ( (a1_ == other.a1_) && (a2_ == other.a2_) &&
67*424fb153SAndroid Build Coastguard Worker            (b1_ == other.b1_) && (b2_ == other.b2_) );
68*424fb153SAndroid Build Coastguard Worker }
69*424fb153SAndroid Build Coastguard Worker 
70*424fb153SAndroid Build Coastguard Worker // Returns string representation of the Adler checksum.
ToHexString() const71*424fb153SAndroid Build Coastguard Worker string AdlerChecksum::ToHexString() const {
72*424fb153SAndroid Build Coastguard Worker   char buffer[128];
73*424fb153SAndroid Build Coastguard Worker   snprintf(buffer, sizeof(buffer), "%016llx %016llx %016llx %016llx", a1_, a2_, b1_, b2_);
74*424fb153SAndroid Build Coastguard Worker   return string(buffer);
75*424fb153SAndroid Build Coastguard Worker }
76*424fb153SAndroid Build Coastguard Worker 
77*424fb153SAndroid Build Coastguard Worker // Sets components of the Adler checksum.
Set(uint64 a1,uint64 a2,uint64 b1,uint64 b2)78*424fb153SAndroid Build Coastguard Worker void AdlerChecksum::Set(uint64 a1, uint64 a2, uint64 b1, uint64 b2) {
79*424fb153SAndroid Build Coastguard Worker   a1_ = a1;
80*424fb153SAndroid Build Coastguard Worker   a2_ = a2;
81*424fb153SAndroid Build Coastguard Worker   b1_ = b1;
82*424fb153SAndroid Build Coastguard Worker   b2_ = b2;
83*424fb153SAndroid Build Coastguard Worker }
84*424fb153SAndroid Build Coastguard Worker 
85*424fb153SAndroid Build Coastguard Worker // Calculates Adler checksum for supplied data.
CalculateAdlerChecksum(uint64 * data64,unsigned int size_in_bytes,AdlerChecksum * checksum)86*424fb153SAndroid Build Coastguard Worker bool CalculateAdlerChecksum(uint64 *data64, unsigned int size_in_bytes,
87*424fb153SAndroid Build Coastguard Worker                             AdlerChecksum *checksum) {
88*424fb153SAndroid Build Coastguard Worker   // Use this data wrapper to access memory with 64bit read/write.
89*424fb153SAndroid Build Coastguard Worker   datacast_t data;
90*424fb153SAndroid Build Coastguard Worker   unsigned int count = size_in_bytes / sizeof(data);
91*424fb153SAndroid Build Coastguard Worker 
92*424fb153SAndroid Build Coastguard Worker   if (count > (1U) << 19) {
93*424fb153SAndroid Build Coastguard Worker     // Size is too large, must be strictly less than 512 KB.
94*424fb153SAndroid Build Coastguard Worker     return false;
95*424fb153SAndroid Build Coastguard Worker   }
96*424fb153SAndroid Build Coastguard Worker 
97*424fb153SAndroid Build Coastguard Worker   uint64 a1 = 1;
98*424fb153SAndroid Build Coastguard Worker   uint64 a2 = 1;
99*424fb153SAndroid Build Coastguard Worker   uint64 b1 = 0;
100*424fb153SAndroid Build Coastguard Worker   uint64 b2 = 0;
101*424fb153SAndroid Build Coastguard Worker 
102*424fb153SAndroid Build Coastguard Worker   unsigned int i = 0;
103*424fb153SAndroid Build Coastguard Worker   while (i < count) {
104*424fb153SAndroid Build Coastguard Worker     // Process 64 bits at a time.
105*424fb153SAndroid Build Coastguard Worker     data.l64 = data64[i];
106*424fb153SAndroid Build Coastguard Worker     a1 = a1 + data.l32.l;
107*424fb153SAndroid Build Coastguard Worker     b1 = b1 + a1;
108*424fb153SAndroid Build Coastguard Worker     a1 = a1 + data.l32.h;
109*424fb153SAndroid Build Coastguard Worker     b1 = b1 + a1;
110*424fb153SAndroid Build Coastguard Worker     i++;
111*424fb153SAndroid Build Coastguard Worker 
112*424fb153SAndroid Build Coastguard Worker     data.l64 = data64[i];
113*424fb153SAndroid Build Coastguard Worker     a2 = a2 + data.l32.l;
114*424fb153SAndroid Build Coastguard Worker     b2 = b2 + a2;
115*424fb153SAndroid Build Coastguard Worker     a2 = a2 + data.l32.h;
116*424fb153SAndroid Build Coastguard Worker     b2 = b2 + a2;
117*424fb153SAndroid Build Coastguard Worker     i++;
118*424fb153SAndroid Build Coastguard Worker   }
119*424fb153SAndroid Build Coastguard Worker   checksum->Set(a1, a2, b1, b2);
120*424fb153SAndroid Build Coastguard Worker   return true;
121*424fb153SAndroid Build Coastguard Worker }
122*424fb153SAndroid Build Coastguard Worker 
123*424fb153SAndroid Build Coastguard Worker // C implementation of Adler memory copy.
AdlerMemcpyC(uint64 * dstmem64,uint64 * srcmem64,unsigned int size_in_bytes,AdlerChecksum * checksum)124*424fb153SAndroid Build Coastguard Worker bool AdlerMemcpyC(uint64 *dstmem64, uint64 *srcmem64,
125*424fb153SAndroid Build Coastguard Worker                   unsigned int size_in_bytes, AdlerChecksum *checksum) {
126*424fb153SAndroid Build Coastguard Worker   // Use this data wrapper to access memory with 64bit read/write.
127*424fb153SAndroid Build Coastguard Worker   datacast_t data;
128*424fb153SAndroid Build Coastguard Worker   unsigned int count = size_in_bytes / sizeof(data);
129*424fb153SAndroid Build Coastguard Worker 
130*424fb153SAndroid Build Coastguard Worker   if (count > ((1U) << 19)) {
131*424fb153SAndroid Build Coastguard Worker     // Size is too large, must be strictly less than 512 KB.
132*424fb153SAndroid Build Coastguard Worker     return false;
133*424fb153SAndroid Build Coastguard Worker   }
134*424fb153SAndroid Build Coastguard Worker 
135*424fb153SAndroid Build Coastguard Worker   uint64 a1 = 1;
136*424fb153SAndroid Build Coastguard Worker   uint64 a2 = 1;
137*424fb153SAndroid Build Coastguard Worker   uint64 b1 = 0;
138*424fb153SAndroid Build Coastguard Worker   uint64 b2 = 0;
139*424fb153SAndroid Build Coastguard Worker 
140*424fb153SAndroid Build Coastguard Worker   unsigned int i = 0;
141*424fb153SAndroid Build Coastguard Worker   while (i < count) {
142*424fb153SAndroid Build Coastguard Worker     // Process 64 bits at a time.
143*424fb153SAndroid Build Coastguard Worker     data.l64 = srcmem64[i];
144*424fb153SAndroid Build Coastguard Worker     a1 = a1 + data.l32.l;
145*424fb153SAndroid Build Coastguard Worker     b1 = b1 + a1;
146*424fb153SAndroid Build Coastguard Worker     a1 = a1 + data.l32.h;
147*424fb153SAndroid Build Coastguard Worker     b1 = b1 + a1;
148*424fb153SAndroid Build Coastguard Worker     dstmem64[i] = data.l64;
149*424fb153SAndroid Build Coastguard Worker     i++;
150*424fb153SAndroid Build Coastguard Worker 
151*424fb153SAndroid Build Coastguard Worker     data.l64 = srcmem64[i];
152*424fb153SAndroid Build Coastguard Worker     a2 = a2 + data.l32.l;
153*424fb153SAndroid Build Coastguard Worker     b2 = b2 + a2;
154*424fb153SAndroid Build Coastguard Worker     a2 = a2 + data.l32.h;
155*424fb153SAndroid Build Coastguard Worker     b2 = b2 + a2;
156*424fb153SAndroid Build Coastguard Worker     dstmem64[i] = data.l64;
157*424fb153SAndroid Build Coastguard Worker     i++;
158*424fb153SAndroid Build Coastguard Worker   }
159*424fb153SAndroid Build Coastguard Worker   checksum->Set(a1, a2, b1, b2);
160*424fb153SAndroid Build Coastguard Worker   return true;
161*424fb153SAndroid Build Coastguard Worker }
162*424fb153SAndroid Build Coastguard Worker 
163*424fb153SAndroid Build Coastguard Worker // C implementation of Adler memory copy with some float point ops,
164*424fb153SAndroid Build Coastguard Worker // attempting to warm up the CPU.
AdlerMemcpyWarmC(uint64 * dstmem64,uint64 * srcmem64,unsigned int size_in_bytes,AdlerChecksum * checksum)165*424fb153SAndroid Build Coastguard Worker bool AdlerMemcpyWarmC(uint64 *dstmem64, uint64 *srcmem64,
166*424fb153SAndroid Build Coastguard Worker                       unsigned int size_in_bytes, AdlerChecksum *checksum) {
167*424fb153SAndroid Build Coastguard Worker   // Use this data wrapper to access memory with 64bit read/write.
168*424fb153SAndroid Build Coastguard Worker   datacast_t data;
169*424fb153SAndroid Build Coastguard Worker   unsigned int count = size_in_bytes / sizeof(data);
170*424fb153SAndroid Build Coastguard Worker 
171*424fb153SAndroid Build Coastguard Worker   if (count > ((1U) << 19)) {
172*424fb153SAndroid Build Coastguard Worker     // Size is too large, must be strictly less than 512 KB.
173*424fb153SAndroid Build Coastguard Worker     return false;
174*424fb153SAndroid Build Coastguard Worker   }
175*424fb153SAndroid Build Coastguard Worker 
176*424fb153SAndroid Build Coastguard Worker   uint64 a1 = 1;
177*424fb153SAndroid Build Coastguard Worker   uint64 a2 = 1;
178*424fb153SAndroid Build Coastguard Worker   uint64 b1 = 0;
179*424fb153SAndroid Build Coastguard Worker   uint64 b2 = 0;
180*424fb153SAndroid Build Coastguard Worker 
181*424fb153SAndroid Build Coastguard Worker   double a = 2.0 * static_cast<double>(srcmem64[0]);
182*424fb153SAndroid Build Coastguard Worker   double b = 5.0 * static_cast<double>(srcmem64[0]);
183*424fb153SAndroid Build Coastguard Worker   double c = 7.0 * static_cast<double>(srcmem64[0]);
184*424fb153SAndroid Build Coastguard Worker   double d = 9.0 * static_cast<double>(srcmem64[0]);
185*424fb153SAndroid Build Coastguard Worker 
186*424fb153SAndroid Build Coastguard Worker   unsigned int i = 0;
187*424fb153SAndroid Build Coastguard Worker   while (i < count) {
188*424fb153SAndroid Build Coastguard Worker     // Process 64 bits at a time.
189*424fb153SAndroid Build Coastguard Worker     data.l64 = srcmem64[i];
190*424fb153SAndroid Build Coastguard Worker     a1 = a1 + data.l32.l;
191*424fb153SAndroid Build Coastguard Worker     b1 = b1 + a1;
192*424fb153SAndroid Build Coastguard Worker     a1 = a1 + data.l32.h;
193*424fb153SAndroid Build Coastguard Worker     b1 = b1 + a1;
194*424fb153SAndroid Build Coastguard Worker     dstmem64[i] = data.l64;
195*424fb153SAndroid Build Coastguard Worker     i++;
196*424fb153SAndroid Build Coastguard Worker 
197*424fb153SAndroid Build Coastguard Worker     // Warm cpu up.
198*424fb153SAndroid Build Coastguard Worker     a = a * b;
199*424fb153SAndroid Build Coastguard Worker     b = b + c;
200*424fb153SAndroid Build Coastguard Worker 
201*424fb153SAndroid Build Coastguard Worker     data.l64 = srcmem64[i];
202*424fb153SAndroid Build Coastguard Worker     a2 = a2 + data.l32.l;
203*424fb153SAndroid Build Coastguard Worker     b2 = b2 + a2;
204*424fb153SAndroid Build Coastguard Worker     a2 = a2 + data.l32.h;
205*424fb153SAndroid Build Coastguard Worker     b2 = b2 + a2;
206*424fb153SAndroid Build Coastguard Worker     dstmem64[i] = data.l64;
207*424fb153SAndroid Build Coastguard Worker     i++;
208*424fb153SAndroid Build Coastguard Worker 
209*424fb153SAndroid Build Coastguard Worker     // Warm cpu up.
210*424fb153SAndroid Build Coastguard Worker     c = c * d;
211*424fb153SAndroid Build Coastguard Worker     d = d + d;
212*424fb153SAndroid Build Coastguard Worker   }
213*424fb153SAndroid Build Coastguard Worker 
214*424fb153SAndroid Build Coastguard Worker   // Warm cpu up.
215*424fb153SAndroid Build Coastguard Worker   d = a + b + c + d;
216*424fb153SAndroid Build Coastguard Worker   if (d == 1.0) {
217*424fb153SAndroid Build Coastguard Worker     // Reference the result so that it can't be discarded by the compiler.
218*424fb153SAndroid Build Coastguard Worker     printf("Log: This will probably never happen.\n");
219*424fb153SAndroid Build Coastguard Worker   }
220*424fb153SAndroid Build Coastguard Worker 
221*424fb153SAndroid Build Coastguard Worker   checksum->Set(a1, a2, b1, b2);
222*424fb153SAndroid Build Coastguard Worker   return true;
223*424fb153SAndroid Build Coastguard Worker }
224*424fb153SAndroid Build Coastguard Worker 
225*424fb153SAndroid Build Coastguard Worker // x86_64 SSE2 assembly implementation of fast and stressful Adler memory copy.
AdlerMemcpyAsm(uint64 * dstmem64,uint64 * srcmem64,unsigned int size_in_bytes,AdlerChecksum * checksum)226*424fb153SAndroid Build Coastguard Worker bool AdlerMemcpyAsm(uint64 *dstmem64, uint64 *srcmem64,
227*424fb153SAndroid Build Coastguard Worker                     unsigned int size_in_bytes, AdlerChecksum *checksum) {
228*424fb153SAndroid Build Coastguard Worker // Use assembly implementation where supported.
229*424fb153SAndroid Build Coastguard Worker #if defined(STRESSAPPTEST_CPU_X86_64) || defined(STRESSAPPTEST_CPU_I686)
230*424fb153SAndroid Build Coastguard Worker 
231*424fb153SAndroid Build Coastguard Worker // Pull a bit of tricky preprocessing to make the inline asm both
232*424fb153SAndroid Build Coastguard Worker // 32 bit and 64 bit.
233*424fb153SAndroid Build Coastguard Worker #ifdef STRESSAPPTEST_CPU_I686  // Instead of coding both, x86...
234*424fb153SAndroid Build Coastguard Worker #define rAX "%%eax"
235*424fb153SAndroid Build Coastguard Worker #define rCX "%%ecx"
236*424fb153SAndroid Build Coastguard Worker #define rDX "%%edx"
237*424fb153SAndroid Build Coastguard Worker #define rBX "%%ebx"
238*424fb153SAndroid Build Coastguard Worker #define rSP "%%esp"
239*424fb153SAndroid Build Coastguard Worker #define rBP "%%ebp"
240*424fb153SAndroid Build Coastguard Worker #define rSI "%%esi"
241*424fb153SAndroid Build Coastguard Worker #define rDI "%%edi"
242*424fb153SAndroid Build Coastguard Worker #endif
243*424fb153SAndroid Build Coastguard Worker 
244*424fb153SAndroid Build Coastguard Worker #ifdef STRESSAPPTEST_CPU_X86_64  // ...and x64, we use rXX macros.
245*424fb153SAndroid Build Coastguard Worker #define rAX "%%rax"
246*424fb153SAndroid Build Coastguard Worker #define rCX "%%rcx"
247*424fb153SAndroid Build Coastguard Worker #define rDX "%%rdx"
248*424fb153SAndroid Build Coastguard Worker #define rBX "%%rbx"
249*424fb153SAndroid Build Coastguard Worker #define rSP "%%rsp"
250*424fb153SAndroid Build Coastguard Worker #define rBP "%%rbp"
251*424fb153SAndroid Build Coastguard Worker #define rSI "%%rsi"
252*424fb153SAndroid Build Coastguard Worker #define rDI "%%rdi"
253*424fb153SAndroid Build Coastguard Worker #endif
254*424fb153SAndroid Build Coastguard Worker 
  // Elements 0 to 3 are used for holding checksum terms a1, a2,
  // b1, b2 respectively. These elements are filled by asm code.
  // Elements 4 and 5 are used by asm code for ANDing XMM data and removing
  // 2 words from each XMM register (an XMM reg has 4 words, by ANDing we are
  // setting word index 1 and word index 3 to zero).
  // Elements 6 and 7 are used for setting a1 and a2 to 1.
261*424fb153SAndroid Build Coastguard Worker   volatile uint64 checksum_arr[] __attribute__ ((aligned(16))) =
262*424fb153SAndroid Build Coastguard Worker       {0, 0, 0, 0, 0x00000000ffffffffUL, 0x00000000ffffffffUL, 1, 1};
263*424fb153SAndroid Build Coastguard Worker 
264*424fb153SAndroid Build Coastguard Worker   if ((size_in_bytes >> 19) > 0) {
265*424fb153SAndroid Build Coastguard Worker     // Size is too large. Must be less than 2^19 bytes = 512 KB.
266*424fb153SAndroid Build Coastguard Worker     return false;
267*424fb153SAndroid Build Coastguard Worker   }
268*424fb153SAndroid Build Coastguard Worker 
269*424fb153SAndroid Build Coastguard Worker   // Number of 32-bit words which are not added to a1/a2 in the main loop.
270*424fb153SAndroid Build Coastguard Worker   uint32 remaining_words = (size_in_bytes % 48) / 4;
271*424fb153SAndroid Build Coastguard Worker 
272*424fb153SAndroid Build Coastguard Worker   // Since we are moving 48 bytes at a time number of iterations = total size/48
273*424fb153SAndroid Build Coastguard Worker   // is value of counter.
274*424fb153SAndroid Build Coastguard Worker   uint32 num_of_48_byte_units = size_in_bytes / 48;
275*424fb153SAndroid Build Coastguard Worker 
276*424fb153SAndroid Build Coastguard Worker   asm volatile (
277*424fb153SAndroid Build Coastguard Worker       // Source address is in ESI (extended source index)
278*424fb153SAndroid Build Coastguard Worker       // destination is in EDI (extended destination index)
279*424fb153SAndroid Build Coastguard Worker       // and counter is already in ECX (extended counter
280*424fb153SAndroid Build Coastguard Worker       // index).
281*424fb153SAndroid Build Coastguard Worker       "cmp  $0, " rCX ";"   // Compare counter to zero.
282*424fb153SAndroid Build Coastguard Worker       "jz END;"
283*424fb153SAndroid Build Coastguard Worker 
284*424fb153SAndroid Build Coastguard Worker       // XMM6 is initialized with 1 and XMM7 with 0.
285*424fb153SAndroid Build Coastguard Worker       "prefetchnta  0(" rSI ");"
286*424fb153SAndroid Build Coastguard Worker       "prefetchnta 64(" rSI ");"
287*424fb153SAndroid Build Coastguard Worker       "movdqu   48(" rAX "), %%xmm6;"
288*424fb153SAndroid Build Coastguard Worker       "xorps      %%xmm7, %%xmm7;"
289*424fb153SAndroid Build Coastguard Worker 
290*424fb153SAndroid Build Coastguard Worker       // Start of the loop which copies 48 bytes from source to dst each time.
291*424fb153SAndroid Build Coastguard Worker       "TOP:\n"
292*424fb153SAndroid Build Coastguard Worker 
293*424fb153SAndroid Build Coastguard Worker       // Make 6 moves each of 16 bytes from srcmem to XMM registers.
294*424fb153SAndroid Build Coastguard Worker       // We are using 2 words out of 4 words in each XMM register,
295*424fb153SAndroid Build Coastguard Worker       // word index 0 and word index 2
296*424fb153SAndroid Build Coastguard Worker       "movdqa   0(" rSI "), %%xmm0;"
297*424fb153SAndroid Build Coastguard Worker       "movdqu   4(" rSI "), %%xmm1;"  // Be careful to use unaligned move here.
298*424fb153SAndroid Build Coastguard Worker       "movdqa  16(" rSI "), %%xmm2;"
299*424fb153SAndroid Build Coastguard Worker       "movdqu  20(" rSI "), %%xmm3;"
300*424fb153SAndroid Build Coastguard Worker       "movdqa  32(" rSI "), %%xmm4;"
301*424fb153SAndroid Build Coastguard Worker       "movdqu  36(" rSI "), %%xmm5;"
302*424fb153SAndroid Build Coastguard Worker 
303*424fb153SAndroid Build Coastguard Worker       // Move 3 * 16 bytes from XMM registers to dstmem.
304*424fb153SAndroid Build Coastguard Worker       // Note: this copy must be performed before pinsrw instructions since
305*424fb153SAndroid Build Coastguard Worker       // they will modify the XMM registers.
306*424fb153SAndroid Build Coastguard Worker       "movntdq %%xmm0,  0(" rDI ");"
307*424fb153SAndroid Build Coastguard Worker       "movntdq %%xmm2, 16(" rDI ");"
308*424fb153SAndroid Build Coastguard Worker       "movntdq %%xmm4, 32(" rDI ");"
309*424fb153SAndroid Build Coastguard Worker 
310*424fb153SAndroid Build Coastguard Worker       // Sets the word[1] and word[3] of XMM0 to XMM5 to zero.
311*424fb153SAndroid Build Coastguard Worker       "andps 32(" rAX "), %%xmm0;"
312*424fb153SAndroid Build Coastguard Worker       "andps 32(" rAX "), %%xmm1;"
313*424fb153SAndroid Build Coastguard Worker       "andps 32(" rAX "), %%xmm2;"
314*424fb153SAndroid Build Coastguard Worker       "andps 32(" rAX "), %%xmm3;"
315*424fb153SAndroid Build Coastguard Worker       "andps 32(" rAX "), %%xmm4;"
316*424fb153SAndroid Build Coastguard Worker       "andps 32(" rAX "), %%xmm5;"
317*424fb153SAndroid Build Coastguard Worker 
318*424fb153SAndroid Build Coastguard Worker       // Add XMM0 to XMM6 and then add XMM6 to XMM7.
319*424fb153SAndroid Build Coastguard Worker       // Repeat this for XMM1, ..., XMM5.
320*424fb153SAndroid Build Coastguard Worker       // Overflow(for XMM7) can occur only if there are more
321*424fb153SAndroid Build Coastguard Worker       // than 2^16 additions => more than 2^17 words => more than 2^19 bytes so
322*424fb153SAndroid Build Coastguard Worker       // if size_in_bytes > 2^19 than overflow occurs.
323*424fb153SAndroid Build Coastguard Worker       "paddq %%xmm0, %%xmm6;"
324*424fb153SAndroid Build Coastguard Worker       "paddq %%xmm6, %%xmm7;"
325*424fb153SAndroid Build Coastguard Worker       "paddq %%xmm1, %%xmm6;"
326*424fb153SAndroid Build Coastguard Worker       "paddq %%xmm6, %%xmm7;"
327*424fb153SAndroid Build Coastguard Worker       "paddq %%xmm2, %%xmm6;"
328*424fb153SAndroid Build Coastguard Worker       "paddq %%xmm6, %%xmm7;"
329*424fb153SAndroid Build Coastguard Worker       "paddq %%xmm3, %%xmm6;"
330*424fb153SAndroid Build Coastguard Worker       "paddq %%xmm6, %%xmm7;"
331*424fb153SAndroid Build Coastguard Worker       "paddq %%xmm4, %%xmm6;"
332*424fb153SAndroid Build Coastguard Worker       "paddq %%xmm6, %%xmm7;"
333*424fb153SAndroid Build Coastguard Worker       "paddq %%xmm5, %%xmm6;"
334*424fb153SAndroid Build Coastguard Worker       "paddq %%xmm6, %%xmm7;"
335*424fb153SAndroid Build Coastguard Worker 
336*424fb153SAndroid Build Coastguard Worker       // Increment ESI and EDI by 48 bytes and decrement counter by 1.
337*424fb153SAndroid Build Coastguard Worker       "add $48, " rSI ";"
338*424fb153SAndroid Build Coastguard Worker       "add $48, " rDI ";"
339*424fb153SAndroid Build Coastguard Worker       "prefetchnta  0(" rSI ");"
340*424fb153SAndroid Build Coastguard Worker       "prefetchnta 64(" rSI ");"
341*424fb153SAndroid Build Coastguard Worker       "dec " rCX ";"
342*424fb153SAndroid Build Coastguard Worker       "jnz TOP;"
343*424fb153SAndroid Build Coastguard Worker 
344*424fb153SAndroid Build Coastguard Worker       // Now only remaining_words 32-bit words are left.
345*424fb153SAndroid Build Coastguard Worker       // make a loop, add first two words to a1 and next two to a2 (just like
346*424fb153SAndroid Build Coastguard Worker       // above loop, the only extra thing we are doing is rechecking
347*424fb153SAndroid Build Coastguard Worker       // rDX (=remaining_words) everytime we add a number to a1/a2.
348*424fb153SAndroid Build Coastguard Worker       "REM_IS_STILL_NOT_ZERO:\n"
349*424fb153SAndroid Build Coastguard Worker       // Unless remaining_words becomes less than 4 words(16 bytes)
350*424fb153SAndroid Build Coastguard Worker       // there is not much issue and remaining_words will always
351*424fb153SAndroid Build Coastguard Worker       // be a multiple of four by assumption.
352*424fb153SAndroid Build Coastguard Worker       "cmp $4, " rDX ";"
353*424fb153SAndroid Build Coastguard Worker       // In case for some weird reasons if remaining_words becomes
354*424fb153SAndroid Build Coastguard Worker       // less than 4 but not zero then also break the code and go off to END.
355*424fb153SAndroid Build Coastguard Worker       "jl END;"
356*424fb153SAndroid Build Coastguard Worker       // Otherwise just go on and copy data in chunks of 4-words at a time till
357*424fb153SAndroid Build Coastguard Worker       // whole data (<48 bytes) is copied.
358*424fb153SAndroid Build Coastguard Worker       "movdqa  0(" rSI "), %%xmm0;"    // Copy next 4-words to XMM0 and to XMM1.
359*424fb153SAndroid Build Coastguard Worker 
360*424fb153SAndroid Build Coastguard Worker       "movdqa  0(" rSI "), %%xmm5;"    // Accomplish movdqu 4(%rSI) without
361*424fb153SAndroid Build Coastguard Worker       "pshufd $0x39, %%xmm5, %%xmm1;"  // indexing off memory boundary.
362*424fb153SAndroid Build Coastguard Worker 
363*424fb153SAndroid Build Coastguard Worker       "movntdq %%xmm0,  0(" rDI ");"   // Copy 4-words to destination.
364*424fb153SAndroid Build Coastguard Worker       "andps  32(" rAX "), %%xmm0;"
365*424fb153SAndroid Build Coastguard Worker       "andps  32(" rAX "), %%xmm1;"
366*424fb153SAndroid Build Coastguard Worker       "paddq     %%xmm0, %%xmm6;"
367*424fb153SAndroid Build Coastguard Worker       "paddq     %%xmm6, %%xmm7;"
368*424fb153SAndroid Build Coastguard Worker       "paddq     %%xmm1, %%xmm6;"
369*424fb153SAndroid Build Coastguard Worker       "paddq     %%xmm6, %%xmm7;"
370*424fb153SAndroid Build Coastguard Worker       "add $16, " rSI ";"
371*424fb153SAndroid Build Coastguard Worker       "add $16, " rDI ";"
372*424fb153SAndroid Build Coastguard Worker       "sub $4, " rDX ";"
373*424fb153SAndroid Build Coastguard Worker       // Decrement %rDX by 4 since %rDX is number of 32-bit
374*424fb153SAndroid Build Coastguard Worker       // words left after considering all 48-byte units.
375*424fb153SAndroid Build Coastguard Worker       "jmp REM_IS_STILL_NOT_ZERO;"
376*424fb153SAndroid Build Coastguard Worker 
377*424fb153SAndroid Build Coastguard Worker       "END:\n"
378*424fb153SAndroid Build Coastguard Worker       // Report checksum values A and B (both right now are two concatenated
379*424fb153SAndroid Build Coastguard Worker       // 64 bit numbers and have to be converted to 64 bit numbers)
380*424fb153SAndroid Build Coastguard Worker       // seems like Adler128 (since size of each part is 4 byte rather than
381*424fb153SAndroid Build Coastguard Worker       // 1 byte).
382*424fb153SAndroid Build Coastguard Worker       "movdqa %%xmm6,   0(" rAX ");"
383*424fb153SAndroid Build Coastguard Worker       "movdqa %%xmm7,  16(" rAX ");"
384*424fb153SAndroid Build Coastguard Worker       "sfence;"
385*424fb153SAndroid Build Coastguard Worker 
386*424fb153SAndroid Build Coastguard Worker       // No output registers.
387*424fb153SAndroid Build Coastguard Worker       :
388*424fb153SAndroid Build Coastguard Worker       // Input registers.
389*424fb153SAndroid Build Coastguard Worker       : "S" (srcmem64), "D" (dstmem64), "a" (checksum_arr),
390*424fb153SAndroid Build Coastguard Worker         "c" (num_of_48_byte_units), "d" (remaining_words)
391*424fb153SAndroid Build Coastguard Worker   );  // asm.
392*424fb153SAndroid Build Coastguard Worker 
393*424fb153SAndroid Build Coastguard Worker   if (checksum != NULL) {
394*424fb153SAndroid Build Coastguard Worker     checksum->Set(checksum_arr[0], checksum_arr[1],
395*424fb153SAndroid Build Coastguard Worker                   checksum_arr[2], checksum_arr[3]);
396*424fb153SAndroid Build Coastguard Worker   }
397*424fb153SAndroid Build Coastguard Worker 
398*424fb153SAndroid Build Coastguard Worker   // Everything went fine, so return true (this does not mean
399*424fb153SAndroid Build Coastguard Worker   // that there is no problem with memory this just mean that data was copied
400*424fb153SAndroid Build Coastguard Worker   // from src to dst and checksum was calculated successfully).
401*424fb153SAndroid Build Coastguard Worker   return true;
402*424fb153SAndroid Build Coastguard Worker #elif defined(STRESSAPPTEST_CPU_ARMV7A) && defined(__ARM_NEON__)
403*424fb153SAndroid Build Coastguard Worker   // Elements 0 to 3 are used for holding checksum terms a1, a2,
404*424fb153SAndroid Build Coastguard Worker   // b1, b2 respectively. These elements are filled by asm code.
405*424fb153SAndroid Build Coastguard Worker   // Checksum is seeded with the null checksum.
406*424fb153SAndroid Build Coastguard Worker   volatile uint64 checksum_arr[] __attribute__ ((aligned(16))) =
407*424fb153SAndroid Build Coastguard Worker       {1, 1, 0, 0};
408*424fb153SAndroid Build Coastguard Worker 
409*424fb153SAndroid Build Coastguard Worker   if ((size_in_bytes >> 19) > 0) {
410*424fb153SAndroid Build Coastguard Worker     // Size is too large. Must be less than 2^19 bytes = 512 KB.
411*424fb153SAndroid Build Coastguard Worker     return false;
412*424fb153SAndroid Build Coastguard Worker   }
413*424fb153SAndroid Build Coastguard Worker 
414*424fb153SAndroid Build Coastguard Worker   // Since we are moving 64 bytes at a time number of iterations = total size/64
415*424fb153SAndroid Build Coastguard Worker   uint32 blocks = size_in_bytes / 64;
416*424fb153SAndroid Build Coastguard Worker 
417*424fb153SAndroid Build Coastguard Worker   uint64 *dst = dstmem64;
418*424fb153SAndroid Build Coastguard Worker   uint64 *src = srcmem64;
419*424fb153SAndroid Build Coastguard Worker 
420*424fb153SAndroid Build Coastguard Worker   #define src_r "r3"
421*424fb153SAndroid Build Coastguard Worker   #define dst_r "r4"
422*424fb153SAndroid Build Coastguard Worker   #define blocks_r "r5"
423*424fb153SAndroid Build Coastguard Worker   #define crc_r "r6"
424*424fb153SAndroid Build Coastguard Worker 
425*424fb153SAndroid Build Coastguard Worker   asm volatile (
426*424fb153SAndroid Build Coastguard Worker       "mov " src_r ", %[src];	 	\n"
427*424fb153SAndroid Build Coastguard Worker       "mov " dst_r ", %[dst]; 		\n"
428*424fb153SAndroid Build Coastguard Worker       "mov " crc_r ", %[crc]; 		\n"
429*424fb153SAndroid Build Coastguard Worker       "mov " blocks_r ", %[blocks]; 	\n"
430*424fb153SAndroid Build Coastguard Worker 
431*424fb153SAndroid Build Coastguard Worker       // Loop over block count.
432*424fb153SAndroid Build Coastguard Worker       "cmp " blocks_r ", #0; 	\n"   // Compare counter to zero.
433*424fb153SAndroid Build Coastguard Worker       "ble END;			\n"
434*424fb153SAndroid Build Coastguard Worker 
435*424fb153SAndroid Build Coastguard Worker 
436*424fb153SAndroid Build Coastguard Worker       // Preload upcoming cacheline.
437*424fb153SAndroid Build Coastguard Worker       "pld [" src_r ", #0x0];	\n"
438*424fb153SAndroid Build Coastguard Worker       "pld [" src_r ", #0x20];	\n"
439*424fb153SAndroid Build Coastguard Worker 
440*424fb153SAndroid Build Coastguard Worker       // Init checksum
441*424fb153SAndroid Build Coastguard Worker       "vldm " crc_r ", {q0};		\n"
442*424fb153SAndroid Build Coastguard Worker       "vmov.i32 q1, #0;			\n"
443*424fb153SAndroid Build Coastguard Worker 
444*424fb153SAndroid Build Coastguard Worker       // Start of the loop which copies 64 bytes from source to dst each time.
445*424fb153SAndroid Build Coastguard Worker       "TOP:			\n"
446*424fb153SAndroid Build Coastguard Worker 
447*424fb153SAndroid Build Coastguard Worker       // Load four 16-byte qX registers (64 bytes in total) from srcmem.
448*424fb153SAndroid Build Coastguard Worker       // We are using 2 words out of 4 words in each qX register,
449*424fb153SAndroid Build Coastguard Worker       // word index 0 and word index 2. We'll swizzle them in a bit.
450*424fb153SAndroid Build Coastguard Worker       // Copy it.
451*424fb153SAndroid Build Coastguard Worker       "vldm " src_r "!, {q8, q9, q10, q11};	\n"
452*424fb153SAndroid Build Coastguard Worker       "vstm " dst_r "!, {q8, q9, q10, q11};	\n"
453*424fb153SAndroid Build Coastguard Worker 
454*424fb153SAndroid Build Coastguard Worker       // Arrange it.
455*424fb153SAndroid Build Coastguard Worker       "vmov.i64 q12, #0;	\n"
456*424fb153SAndroid Build Coastguard Worker       "vmov.i64 q13, #0;	\n"
457*424fb153SAndroid Build Coastguard Worker       "vmov.i64 q14, #0;	\n"
458*424fb153SAndroid Build Coastguard Worker       "vmov.i64 q15, #0;	\n"
459*424fb153SAndroid Build Coastguard Worker       // This exchanges words 1,3 in the filled registers with
460*424fb153SAndroid Build Coastguard Worker       // words 0,2 in the empty registers.
461*424fb153SAndroid Build Coastguard Worker       "vtrn.32 q8, q12;		\n"
462*424fb153SAndroid Build Coastguard Worker       "vtrn.32 q9, q13;		\n"
463*424fb153SAndroid Build Coastguard Worker       "vtrn.32 q10, q14;	\n"
464*424fb153SAndroid Build Coastguard Worker       "vtrn.32 q11, q15;	\n"
465*424fb153SAndroid Build Coastguard Worker 
466*424fb153SAndroid Build Coastguard Worker       // Sum into q0, then into q1.
467*424fb153SAndroid Build Coastguard Worker       // Repeat this for q8 - q15.
468*424fb153SAndroid Build Coastguard Worker       // Overflow can occur only if there are more
469*424fb153SAndroid Build Coastguard Worker       // than 2^16 additions => more than 2^17 words => more than 2^19 bytes so
470*424fb153SAndroid Build Coastguard Worker       // if size_in_bytes > 2^19 then overflow occurs.
471*424fb153SAndroid Build Coastguard Worker       "vadd.i64 q0, q0, q8;	\n"
472*424fb153SAndroid Build Coastguard Worker       "vadd.i64 q1, q1, q0;	\n"
473*424fb153SAndroid Build Coastguard Worker       "vadd.i64 q0, q0, q12;	\n"
474*424fb153SAndroid Build Coastguard Worker       "vadd.i64 q1, q1, q0;	\n"
475*424fb153SAndroid Build Coastguard Worker       "vadd.i64 q0, q0, q9;	\n"
476*424fb153SAndroid Build Coastguard Worker       "vadd.i64 q1, q1, q0;	\n"
477*424fb153SAndroid Build Coastguard Worker       "vadd.i64 q0, q0, q13;	\n"
478*424fb153SAndroid Build Coastguard Worker       "vadd.i64 q1, q1, q0;	\n"
479*424fb153SAndroid Build Coastguard Worker 
480*424fb153SAndroid Build Coastguard Worker       "vadd.i64 q0, q0, q10;	\n"
481*424fb153SAndroid Build Coastguard Worker       "vadd.i64 q1, q1, q0;	\n"
482*424fb153SAndroid Build Coastguard Worker       "vadd.i64 q0, q0, q14;	\n"
483*424fb153SAndroid Build Coastguard Worker       "vadd.i64 q1, q1, q0;	\n"
484*424fb153SAndroid Build Coastguard Worker       "vadd.i64 q0, q0, q11;	\n"
485*424fb153SAndroid Build Coastguard Worker       "vadd.i64 q1, q1, q0;	\n"
486*424fb153SAndroid Build Coastguard Worker       "vadd.i64 q0, q0, q15;	\n"
487*424fb153SAndroid Build Coastguard Worker       "vadd.i64 q1, q1, q0;	\n"
488*424fb153SAndroid Build Coastguard Worker 
489*424fb153SAndroid Build Coastguard Worker       // Decrement counter and loop.
490*424fb153SAndroid Build Coastguard Worker       "sub " blocks_r ", " blocks_r ", #1;	\n"
491*424fb153SAndroid Build Coastguard Worker       "cmp " blocks_r ", #0;	\n"   // Compare counter to zero.
492*424fb153SAndroid Build Coastguard Worker       "bgt TOP;	\n"
493*424fb153SAndroid Build Coastguard Worker 
494*424fb153SAndroid Build Coastguard Worker 
495*424fb153SAndroid Build Coastguard Worker       "END:\n"
496*424fb153SAndroid Build Coastguard Worker       // Report checksum values A and B (both right now are two concatenated
497*424fb153SAndroid Build Coastguard Worker       // 64 bit numbers and have to be converted to 64 bit numbers)
498*424fb153SAndroid Build Coastguard Worker       // seems like Adler128 (since size of each part is 4 byte rather than
499*424fb153SAndroid Build Coastguard Worker       // 1 byte).
500*424fb153SAndroid Build Coastguard Worker       "vstm " crc_r ", {q0, q1};	\n"
501*424fb153SAndroid Build Coastguard Worker 
502*424fb153SAndroid Build Coastguard Worker       // Output registers.
503*424fb153SAndroid Build Coastguard Worker       :
504*424fb153SAndroid Build Coastguard Worker       // Input registers.
505*424fb153SAndroid Build Coastguard Worker       : [src] "r"(src), [dst] "r"(dst), [blocks] "r"(blocks) , [crc] "r"(checksum_arr)
506*424fb153SAndroid Build Coastguard Worker       : "memory", "cc", "r3", "r4", "r5", "r6", "q0", "q1", "q8","q9","q10", "q11", "q12","q13","q14","q15"
507*424fb153SAndroid Build Coastguard Worker   );  // asm.
508*424fb153SAndroid Build Coastguard Worker 
509*424fb153SAndroid Build Coastguard Worker   if (checksum != NULL) {
510*424fb153SAndroid Build Coastguard Worker     checksum->Set(checksum_arr[0], checksum_arr[1],
511*424fb153SAndroid Build Coastguard Worker                   checksum_arr[2], checksum_arr[3]);
512*424fb153SAndroid Build Coastguard Worker   }
513*424fb153SAndroid Build Coastguard Worker 
514*424fb153SAndroid Build Coastguard Worker   // Everything went fine, so return true (this does not mean
515*424fb153SAndroid Build Coastguard Worker   // that there is no problem with memory; it just means that data was copied
516*424fb153SAndroid Build Coastguard Worker   // from src to dst and the checksum was calculated successfully).
517*424fb153SAndroid Build Coastguard Worker   return true;
518*424fb153SAndroid Build Coastguard Worker #else
519*424fb153SAndroid Build Coastguard Worker   #warning "No vector copy defined for this architecture."
520*424fb153SAndroid Build Coastguard Worker   // Fall back to C implementation for anything else.
521*424fb153SAndroid Build Coastguard Worker   return AdlerMemcpyWarmC(dstmem64, srcmem64, size_in_bytes, checksum);
522*424fb153SAndroid Build Coastguard Worker #endif
523*424fb153SAndroid Build Coastguard Worker }
524