/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Syntax: GNU as, AT&T operand order (op src, dst), run through the C
   preprocessor.  Target: x86-64, SSE2.  */

#ifndef MEMMOVE
# define MEMMOVE		memmove
#endif

#ifndef L
# define L(label)	.L##label
#endif

#ifndef cfi_startproc
# define cfi_startproc	.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc	.cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg)	.cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
#endif

#ifndef ENTRY
# define ENTRY(name)		\
	.type name,  @function;		\
	.globl name;		\
	.p2align 4;		\
name:		\
	cfi_startproc
#endif

#ifndef ALIAS_SYMBOL
# define ALIAS_SYMBOL(alias, original) \
	.globl alias; \
	.equ alias, original
#endif

#ifndef END
# define END(name)		\
	cfi_endproc;		\
	.size name, .-name
#endif

/* Unwind annotations for a 64-bit push/pop: the stack slot is 8 bytes
   wide, so the CFA moves by 8 (not 4 as in the 32-bit variant of this
   code).  */
#define CFI_PUSH(REG)		\
	cfi_adjust_cfa_offset (8);		\
	cfi_rel_offset (REG, 0)

#define CFI_POP(REG)		\
	cfi_adjust_cfa_offset (-8);		\
	cfi_restore (REG)

/* Every push/pop carries its CFI so the unwinder sees the saved register
   and the correct CFA at each instruction.  */
#define PUSH(REG)	push REG; CFI_PUSH (REG)
#define POP(REG)	pop REG; CFI_POP (REG)

#define ENTRANCE	PUSH (%rbx);
#define RETURN_END	POP (%rbx); ret
/* RETURN is used in the middle of the function body: code that is placed
   after it still runs with %rbx pushed, so re-establish that CFI state
   after the ret.  */
#define RETURN		RETURN_END; CFI_PUSH (%rbx)

/*-----------------------------------------------------------------------
   MEMMOVE (dst, src, len)   -- also exported as memcpy (see the alias
   at the end of the file).

   In:    %rdi = dst, %rsi = src, %rdx = len (bytes)
   Out:   %rax = dst
   Saved: %rbx (pushed on entry, popped in RETURN)
   Uses:  %rcx, %rdx, %rsi, %r8, %r9, %r11, %xmm0-%xmm7, flags

   Strategy: if dst == src return at once; if dst is above src copy
   backward (high addresses first), otherwise copy forward.  Lengths up
   to 128 bytes are handled by loading everything into registers before
   any store.  Longer copies store the unaligned head/tail with movdqu
   and run a 64-byte-per-iteration loop on a 64-byte aligned destination.
   When len is above __x86_shared_cache_size_half and the distance
   between the buffers plus len is above __x86_shared_cache_size, the
   loop uses non-temporal stores (movntdq) instead.  Both symbols are
   defined outside this file.
  -----------------------------------------------------------------------*/
	.section .text.sse2,"ax",@progbits
ENTRY (MEMMOVE)
	ENTRANCE
	mov	%rdi, %rax		/* return value = dst */

/* Check whether we should copy backward or forward.  Addresses are
   unsigned quantities, so use the unsigned condition (ja).  */
	cmp	%rsi, %rdi
	je	L(mm_return)
	ja	L(mm_len_0_or_more_backward)

/* Forward copy (dst below src).  Dispatch on length: [0..16], (16..32],
   (32..64], (64..128] and above 128 are handled separately.  */
	cmp	$16, %rdx
	jbe	L(mm_len_0_16_bytes_forward)

	cmp	$32, %rdx
	ja	L(mm_len_32_or_more_forward)

/* Copy (16..32] bytes and return: first 16 and last 16 bytes, possibly
   overlapping each other; both loads happen before any store.  */
	movdqu	(%rsi), %xmm0
	movdqu	-16(%rsi, %rdx), %xmm1
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, -16(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_32_or_more_forward):
	cmp	$64, %rdx
	ja	L(mm_len_64_or_more_forward)

/* Copy (32..64] bytes and return: first 32 and last 32 bytes.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	-16(%rsi, %rdx), %xmm2
	movdqu	-32(%rsi, %rdx), %xmm3
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, -16(%rdi, %rdx)
	movdqu	%xmm3, -32(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_64_or_more_forward):
	cmp	$128, %rdx
	ja	L(mm_len_128_or_more_forward)

/* Copy (64..128] bytes and return: first 64 and last 64 bytes.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	32(%rsi), %xmm2
	movdqu	48(%rsi), %xmm3
	movdqu	-64(%rsi, %rdx), %xmm4
	movdqu	-48(%rsi, %rdx), %xmm5
	movdqu	-32(%rsi, %rdx), %xmm6
	movdqu	-16(%rsi, %rdx), %xmm7
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, 32(%rdi)
	movdqu	%xmm3, 48(%rdi)
	movdqu	%xmm4, -64(%rdi, %rdx)
	movdqu	%xmm5, -48(%rdi, %rdx)
	movdqu	%xmm6, -32(%rdi, %rdx)
	movdqu	%xmm7, -16(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_128_or_more_forward):
/* More than 128 bytes, forward.  Align the destination to 64 bytes.
   Register roles from here on:
     %r8  = 64-byte aligned destination cursor
     %rsi = src - dst, so (%r8, %rsi) addresses the matching source byte
     %rbx = (dst + len) rounded down to 64: end of the aligned region  */
/* Save the first (unaligned) 64 bytes of the source.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	32(%rsi), %xmm2
	movdqu	48(%rsi), %xmm3

	lea	64(%rdi), %r8
	and	$-64, %r8		/* r8 = next 64-byte boundary above dst */
	sub	%rdi, %rsi		/* rsi = src - dst = diff */

/* Load the first aligned 64-byte block before anything is stored, so the
   head stores below cannot clobber source bytes that are still needed.  */
	movdqu	(%r8, %rsi), %xmm4
	movdqu	16(%r8, %rsi), %xmm5
	movdqu	32(%r8, %rsi), %xmm6
	movdqu	48(%r8, %rsi), %xmm7

	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, 32(%rdi)
	movdqu	%xmm3, 48(%rdi)
	movdqa	%xmm4, (%r8)
	movaps	%xmm5, 16(%r8)
	movaps	%xmm6, 32(%r8)
	movaps	%xmm7, 48(%r8)
	add	$64, %r8

	lea	(%rdi, %rdx), %rbx
	and	$-64, %rbx		/* rbx = end of dst rounded down to 64 */
	cmp	%r8, %rbx
	jbe	L(mm_copy_remaining_forward)	/* no full aligned block left */

	cmp	__x86_shared_cache_size_half(%rip), %rdx

	ja	L(mm_overlapping_check_forward)

	.p2align 4
L(mm_main_loop_forward):
/* Copy 64 bytes per iteration: unaligned loads, aligned stores.  */

	prefetcht0 128(%r8, %rsi)

	movdqu	(%r8, %rsi), %xmm0
	movdqu	16(%r8, %rsi), %xmm1
	movdqu	32(%r8, %rsi), %xmm2
	movdqu	48(%r8, %rsi), %xmm3
	movdqa	%xmm0, (%r8)
	movaps	%xmm1, 16(%r8)
	movaps	%xmm2, 32(%r8)
	movaps	%xmm3, 48(%r8)
	lea	64(%r8), %r8
	cmp	%r8, %rbx
	ja	L(mm_main_loop_forward)

L(mm_copy_remaining_forward):
	add	%rdi, %rdx
	sub	%r8, %rdx
/* Everything below %r8 in the destination has been copied.
   %rdx now holds the number of bytes left: (dst + len) - %r8.
   Compute the matching source position in %r9.  */
	lea	(%r8, %rsi), %r9

L(mm_remaining_0_64_bytes_forward):
/* Tail dispatch on the remaining length in %rdx; %r9 = source cursor,
   %r8 = destination cursor.  */
	cmp	$32, %rdx
	ja	L(mm_remaining_33_64_bytes_forward)
	cmp	$16, %rdx
	ja	L(mm_remaining_17_32_bytes_forward)
	test	%rdx, %rdx
	.p2align 4,,2
	je	L(mm_return)

	cmpb	$8, %dl
	ja	L(mm_remaining_9_16_bytes_forward)
	cmpb	$4, %dl
	.p2align 4,,5
	ja	L(mm_remaining_5_8_bytes_forward)
	cmpb	$2, %dl
	.p2align 4,,1
	ja	L(mm_remaining_3_4_bytes_forward)
/* 1 or 2 bytes left: copy the last and the first byte.  */
	movzbl	-1(%r9,%rdx), %esi
	movzbl	(%r9), %ebx
	movb	%sil, -1(%r8,%rdx)
	movb	%bl, (%r8)
	jmp	L(mm_return)

L(mm_remaining_33_64_bytes_forward):
	movdqu	(%r9), %xmm0
	movdqu	16(%r9), %xmm1
	movdqu	-32(%r9, %rdx), %xmm2
	movdqu	-16(%r9, %rdx), %xmm3
	movdqu	%xmm0, (%r8)
	movdqu	%xmm1, 16(%r8)
	movdqu	%xmm2, -32(%r8, %rdx)
	movdqu	%xmm3, -16(%r8, %rdx)
	jmp	L(mm_return)

L(mm_remaining_17_32_bytes_forward):
	movdqu	(%r9), %xmm0
	movdqu	-16(%r9, %rdx), %xmm1
	movdqu	%xmm0, (%r8)
	movdqu	%xmm1, -16(%r8, %rdx)
	jmp	L(mm_return)

L(mm_remaining_5_8_bytes_forward):
	movl	(%r9), %esi
	movl	-4(%r9,%rdx), %ebx
	movl	%esi, (%r8)
	movl	%ebx, -4(%r8,%rdx)
	jmp	L(mm_return)

L(mm_remaining_9_16_bytes_forward):
	mov	(%r9), %rsi
	mov	-8(%r9, %rdx), %rbx
	mov	%rsi, (%r8)
	mov	%rbx, -8(%r8, %rdx)
	jmp	L(mm_return)

L(mm_remaining_3_4_bytes_forward):
	movzwl	-2(%r9,%rdx), %esi
	movzwl	(%r9), %ebx
	movw	%si, -2(%r8,%rdx)
	movw	%bx, (%r8)
	jmp	L(mm_return)

L(mm_len_0_16_bytes_forward):
/* len <= 16, forward.  Dispatch on the bits of the length:
   bit 3 or bit 4 set -> 8..16 bytes, bit 2 set -> 4..7 bytes,
   bit 1 set -> 2..3 bytes, otherwise 0 or 1 byte.  */
	testb	$24, %dl
	jne	L(mm_len_9_16_bytes_forward)
	testb	$4, %dl
	.p2align 4,,5
	jne	L(mm_len_5_8_bytes_forward)
	test	%rdx, %rdx
	.p2align 4,,2
	je	L(mm_return)
	testb	$2, %dl
	.p2align 4,,1
	jne	L(mm_len_2_4_bytes_forward)
/* Exactly one byte.  */
	movzbl	-1(%rsi,%rdx), %ebx
	movzbl	(%rsi), %esi
	movb	%bl, -1(%rdi,%rdx)
	movb	%sil, (%rdi)
	jmp	L(mm_return)

L(mm_len_2_4_bytes_forward):
	movzwl	-2(%rsi,%rdx), %ebx
	movzwl	(%rsi), %esi
	movw	%bx, -2(%rdi,%rdx)
	movw	%si, (%rdi)
	jmp	L(mm_return)

L(mm_len_5_8_bytes_forward):
	movl	(%rsi), %ebx
	movl	-4(%rsi,%rdx), %esi
	movl	%ebx, (%rdi)
	movl	%esi, -4(%rdi,%rdx)
	jmp	L(mm_return)

L(mm_len_9_16_bytes_forward):
	mov	(%rsi), %rbx
	mov	-8(%rsi, %rdx), %rsi
	mov	%rbx, (%rdi)
	mov	%rsi, -8(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_recalc_len):
/* Reached after the backward main loop.  Compute in %rdx how many bytes
   at the start of the buffer are still left to copy (%rbx - dst), then
   fall through into the backward length dispatch to copy them.  */
	mov	%rbx, %rdx
	sub	%rdi, %rdx
/* The code for copying backwards (dst above src).  */
L(mm_len_0_or_more_backward):

/* Dispatch on length: [0..16], (16..32], (32..64], (64..128] and above
   128 are handled separately.  */
	cmp	$16, %rdx
	jbe	L(mm_len_0_16_bytes_backward)

	cmp	$32, %rdx
	ja	L(mm_len_32_or_more_backward)

/* Copy (16..32] bytes and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	-16(%rsi, %rdx), %xmm1
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, -16(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_32_or_more_backward):
	cmp	$64, %rdx
	ja	L(mm_len_64_or_more_backward)

/* Copy (32..64] bytes and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	-16(%rsi, %rdx), %xmm2
	movdqu	-32(%rsi, %rdx), %xmm3
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, -16(%rdi, %rdx)
	movdqu	%xmm3, -32(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_64_or_more_backward):
	cmp	$128, %rdx
	ja	L(mm_len_128_or_more_backward)

/* Copy (64..128] bytes and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	32(%rsi), %xmm2
	movdqu	48(%rsi), %xmm3
	movdqu	-64(%rsi, %rdx), %xmm4
	movdqu	-48(%rsi, %rdx), %xmm5
	movdqu	-32(%rsi, %rdx), %xmm6
	movdqu	-16(%rsi, %rdx), %xmm7
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, 32(%rdi)
	movdqu	%xmm3, 48(%rdi)
	movdqu	%xmm4, -64(%rdi, %rdx)
	movdqu	%xmm5, -48(%rdi, %rdx)
	movdqu	%xmm6, -32(%rdi, %rdx)
	movdqu	%xmm7, -16(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_128_or_more_backward):
/* More than 128 bytes, backward.  Align the end of the destination to
   64 bytes.  The last 64 bytes of the source are saved in registers
   first so that they are not overwritten.  Register roles from here on:
     %r9  = 64-byte aligned destination cursor, moving down
     %r8  = src - dst, so (%r9, %r8) addresses the matching source byte
     %rbx = (dst + 64) rounded down to 64: lower bound of the aligned
            region  */
	movdqu	-16(%rsi, %rdx), %xmm0
	movdqu	-32(%rsi, %rdx), %xmm1
	movdqu	-48(%rsi, %rdx), %xmm2
	movdqu	-64(%rsi, %rdx), %xmm3

	lea	(%rdi, %rdx), %r9
	and	$-64, %r9		/* r9 = end of dst rounded down to 64 */

	mov	%rsi, %r8
	sub	%rdi, %r8		/* r8 = src - dst, diff */

/* Load the topmost aligned 64-byte block before anything is stored.  */
	movdqu	-16(%r9, %r8), %xmm4
	movdqu	-32(%r9, %r8), %xmm5
	movdqu	-48(%r9, %r8), %xmm6
	movdqu	-64(%r9, %r8), %xmm7

	movdqu	%xmm0, -16(%rdi, %rdx)
	movdqu	%xmm1, -32(%rdi, %rdx)
	movdqu	%xmm2, -48(%rdi, %rdx)
	movdqu	%xmm3, -64(%rdi, %rdx)
	movdqa	%xmm4, -16(%r9)
	movaps	%xmm5, -32(%r9)
	movaps	%xmm6, -48(%r9)
	movaps	%xmm7, -64(%r9)
	lea	-64(%r9), %r9

	lea	64(%rdi), %rbx
	and	$-64, %rbx

	cmp	%r9, %rbx
	jae	L(mm_recalc_len)	/* no full aligned block left */

	cmp	__x86_shared_cache_size_half(%rip), %rdx

	ja	L(mm_overlapping_check_backward)

	.p2align 4
L(mm_main_loop_backward):
/* Copy 64 bytes per iteration, moving down: unaligned loads, aligned
   stores.  */

	prefetcht0 -128(%r9, %r8)

	movdqu	-64(%r9, %r8), %xmm0
	movdqu	-48(%r9, %r8), %xmm1
	movdqu	-32(%r9, %r8), %xmm2
	movdqu	-16(%r9, %r8), %xmm3
	movdqa	%xmm0, -64(%r9)
	movaps	%xmm1, -48(%r9)
	movaps	%xmm2, -32(%r9)
	movaps	%xmm3, -16(%r9)
	lea	-64(%r9), %r9
	cmp	%r9, %rbx
	jb	L(mm_main_loop_backward)
	jmp	L(mm_recalc_len)

/* Copy [0..16] bytes and return.  Dispatch on the bits of the length in
   the same way as the forward variant.  */
L(mm_len_0_16_bytes_backward):
	testb	$24, %dl
	jnz	L(mm_len_9_16_bytes_backward)
	testb	$4, %dl
	.p2align 4,,5
	jnz	L(mm_len_5_8_bytes_backward)
	test	%rdx, %rdx
	.p2align 4,,2
	je	L(mm_return)
	testb	$2, %dl
	.p2align 4,,1
	jne	L(mm_len_3_4_bytes_backward)
/* Exactly one byte.  */
	movzbl	-1(%rsi,%rdx), %ebx
	movzbl	(%rsi), %ecx
	movb	%bl, -1(%rdi,%rdx)
	movb	%cl, (%rdi)
	jmp	L(mm_return)

/* Despite the label name, this is reached with 2 or 3 bytes left (bit 1
   of the length set, bits 2..4 clear).  */
L(mm_len_3_4_bytes_backward):
	movzwl	-2(%rsi,%rdx), %ebx
	movzwl	(%rsi), %ecx
	movw	%bx, -2(%rdi,%rdx)
	movw	%cx, (%rdi)
	jmp	L(mm_return)

/* 8..16 bytes: copy the last 8 bytes as two 4-byte words, shrink the
   length by 8 and dispatch again on what is left.  */
L(mm_len_9_16_bytes_backward):
	movl	-4(%rsi,%rdx), %ebx
	movl	-8(%rsi,%rdx), %ecx
	movl	%ebx, -4(%rdi,%rdx)
	movl	%ecx, -8(%rdi,%rdx)
	sub	$8, %rdx
	jmp	L(mm_len_0_16_bytes_backward)

L(mm_len_5_8_bytes_backward):
	movl	(%rsi), %ebx
	movl	-4(%rsi,%rdx), %ecx
	movl	%ebx, (%rdi)
	movl	%ecx, -4(%rdi,%rdx)
	/* Falls through to the common return.  */

L(mm_return):
	RETURN

/* Big length copy forward part.  */

	.p2align 4

L(mm_overlapping_check_forward):
/* Here %rsi = src - dst.  If (src - dst) + len does not exceed
   __x86_shared_cache_size, use the regular loop; otherwise use
   non-temporal stores.  */
	mov	%rsi, %r9
	add	%rdx, %r9
	cmp	__x86_shared_cache_size(%rip), %r9
	jbe	L(mm_main_loop_forward)

L(mm_large_page_loop_forward):
	movdqu	(%r8, %rsi), %xmm0
	movdqu	16(%r8, %rsi), %xmm1
	movdqu	32(%r8, %rsi), %xmm2
	movdqu	48(%r8, %rsi), %xmm3
	movntdq	%xmm0, (%r8)
	movntdq	%xmm1, 16(%r8)
	movntdq	%xmm2, 32(%r8)
	movntdq	%xmm3, 48(%r8)
	lea	64(%r8), %r8
	cmp	%r8, %rbx
	ja	L(mm_large_page_loop_forward)
	sfence				/* order the non-temporal stores */
	jmp	L(mm_copy_remaining_forward)

/* Big length copy backward part.  */
	.p2align 4

L(mm_overlapping_check_backward):
/* If (dst - src) + len does not exceed __x86_shared_cache_size, use the
   regular loop; otherwise use non-temporal stores.  */
	mov	%rdi, %r11
	sub	%rsi, %r11		/* r11 = dst - src, diff */
	add	%rdx, %r11
	cmp	__x86_shared_cache_size(%rip), %r11
	jbe	L(mm_main_loop_backward)

L(mm_large_page_loop_backward):
	movdqu	-64(%r9, %r8), %xmm0
	movdqu	-48(%r9, %r8), %xmm1
	movdqu	-32(%r9, %r8), %xmm2
	movdqu	-16(%r9, %r8), %xmm3
	movntdq	%xmm0, -64(%r9)
	movntdq	%xmm1, -48(%r9)
	movntdq	%xmm2, -32(%r9)
	movntdq	%xmm3, -16(%r9)
	lea	-64(%r9), %r9
	cmp	%r9, %rbx
	jb	L(mm_large_page_loop_backward)
	sfence				/* order the non-temporal stores */
	jmp	L(mm_recalc_len)

END (MEMMOVE)

ALIAS_SYMBOL(memcpy, MEMMOVE)