/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/


#ifndef MEMMOVE
# define MEMMOVE memmove
#endif

#ifndef L
# define L(label) .L##label
#endif

#ifndef cfi_startproc
# define cfi_startproc .cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc .cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg) .cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
#endif

#ifndef ENTRY
# define ENTRY(name) \
    .type name, @function; \
    .globl name; \
    .p2align 4; \
name: \
    cfi_startproc
#endif

#ifndef ALIAS_SYMBOL
# define ALIAS_SYMBOL(alias, original) \
    .globl alias; \
    .equ alias, original
#endif

#ifndef END
# define END(name) \
    cfi_endproc; \
    .size name, .-name
#endif

#define CFI_PUSH(REG) \
    cfi_adjust_cfa_offset (4); \
    cfi_rel_offset (REG, 0)

#define CFI_POP(REG) \
    cfi_adjust_cfa_offset (-4); \
    cfi_restore (REG)

#define PUSH(REG) push REG;
#define POP(REG) pop REG;

#define ENTRANCE PUSH (%rbx);
#define RETURN_END POP (%rbx); ret
#define RETURN RETURN_END;

    .section .text.sse2,"ax",@progbits
ENTRY (MEMMOVE)
    ENTRANCE
    mov %rdi, %rax

/* Check whether we should copy backward or forward.  */
    cmp %rsi, %rdi
    je L(mm_return)
    jg L(mm_len_0_or_more_backward)

/* Now do checks for lengths. We do [0..16], [16..32], [32..64], [64..128]
   separately.  */
    cmp $16, %rdx
    jbe L(mm_len_0_16_bytes_forward)

    cmp $32, %rdx
    ja L(mm_len_32_or_more_forward)

/* Copy [0..32] and return.  */
    movdqu (%rsi), %xmm0
    movdqu -16(%rsi, %rdx), %xmm1
    movdqu %xmm0, (%rdi)
    movdqu %xmm1, -16(%rdi, %rdx)
    jmp L(mm_return)
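/* The fixed-size cases above and below (up to 128 bytes, in both the forward
   and the backward code) read the head and the tail of the range into
   registers before issuing any store, so they remain correct even when the
   source and destination buffers overlap.  */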

L(mm_len_32_or_more_forward):
    cmp $64, %rdx
    ja L(mm_len_64_or_more_forward)

/* Copy [0..64] and return.  */
    movdqu (%rsi), %xmm0
    movdqu 16(%rsi), %xmm1
    movdqu -16(%rsi, %rdx), %xmm2
    movdqu -32(%rsi, %rdx), %xmm3
    movdqu %xmm0, (%rdi)
    movdqu %xmm1, 16(%rdi)
    movdqu %xmm2, -16(%rdi, %rdx)
    movdqu %xmm3, -32(%rdi, %rdx)
    jmp L(mm_return)

L(mm_len_64_or_more_forward):
    cmp $128, %rdx
    ja L(mm_len_128_or_more_forward)

/* Copy [0..128] and return.  */
    movdqu (%rsi), %xmm0
    movdqu 16(%rsi), %xmm1
    movdqu 32(%rsi), %xmm2
    movdqu 48(%rsi), %xmm3
    movdqu -64(%rsi, %rdx), %xmm4
    movdqu -48(%rsi, %rdx), %xmm5
    movdqu -32(%rsi, %rdx), %xmm6
    movdqu -16(%rsi, %rdx), %xmm7
    movdqu %xmm0, (%rdi)
    movdqu %xmm1, 16(%rdi)
    movdqu %xmm2, 32(%rdi)
    movdqu %xmm3, 48(%rdi)
    movdqu %xmm4, -64(%rdi, %rdx)
    movdqu %xmm5, -48(%rdi, %rdx)
    movdqu %xmm6, -32(%rdi, %rdx)
    movdqu %xmm7, -16(%rdi, %rdx)
    jmp L(mm_return)

L(mm_len_128_or_more_forward):
/* Align the destination address.  */
/* Save the first unaligned 64 bytes.  */
    movdqu (%rsi), %xmm0
    movdqu 16(%rsi), %xmm1
    movdqu 32(%rsi), %xmm2
    movdqu 48(%rsi), %xmm3

    lea 64(%rdi), %r8
    and $-64, %r8  /* r8 now aligned to next 64 byte boundary */
    sub %rdi, %rsi /* rsi = src - dst = diff */

    movdqu (%r8, %rsi), %xmm4
    movdqu 16(%r8, %rsi), %xmm5
    movdqu 32(%r8, %rsi), %xmm6
    movdqu 48(%r8, %rsi), %xmm7

    movdqu %xmm0, (%rdi)
    movdqu %xmm1, 16(%rdi)
    movdqu %xmm2, 32(%rdi)
    movdqu %xmm3, 48(%rdi)
    movdqa %xmm4, (%r8)
    movaps %xmm5, 16(%r8)
    movaps %xmm6, 32(%r8)
    movaps %xmm7, 48(%r8)
    add $64, %r8

    lea (%rdi, %rdx), %rbx
    and $-64, %rbx
    cmp %r8, %rbx
    jbe L(mm_copy_remaining_forward)

    cmp __x86_shared_cache_size_half(%rip), %rdx

    ja L(mm_overlapping_check_forward)

    .p2align 4
L(mm_main_loop_forward):

    prefetcht0 128(%r8, %rsi)

    movdqu (%r8, %rsi), %xmm0
    movdqu 16(%r8, %rsi), %xmm1
    movdqu 32(%r8, %rsi), %xmm2
    movdqu 48(%r8, %rsi), %xmm3
    movdqa %xmm0, (%r8)
    movaps %xmm1, 16(%r8)
    movaps %xmm2, 32(%r8)
    movaps %xmm3, 48(%r8)
    lea 64(%r8), %r8
    cmp %r8, %rbx
    ja L(mm_main_loop_forward)

L(mm_copy_remaining_forward):
    add %rdi, %rdx
    sub %r8, %rdx
/* Everything up to %r8 in the destination has been copied.
   %rdx now holds how many bytes are left to copy.  */
    lea (%r8, %rsi), %r9
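/* %rsi still holds src - dst, so %r9 points at the source byte that
   corresponds to %r8.  The cases below finish the at most 64 remaining
   bytes, reading both ends of the leftover range before writing either
   of them.  */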

L(mm_remaining_0_64_bytes_forward):
    cmp $32, %rdx
    ja L(mm_remaining_33_64_bytes_forward)
    cmp $16, %rdx
    ja L(mm_remaining_17_32_bytes_forward)
    test %rdx, %rdx
    .p2align 4,,2
    je L(mm_return)

    cmpb $8, %dl
    ja L(mm_remaining_9_16_bytes_forward)
    cmpb $4, %dl
    .p2align 4,,5
    ja L(mm_remaining_5_8_bytes_forward)
    cmpb $2, %dl
    .p2align 4,,1
    ja L(mm_remaining_3_4_bytes_forward)
    movzbl -1(%r9,%rdx), %esi
    movzbl (%r9), %ebx
    movb %sil, -1(%r8,%rdx)
    movb %bl, (%r8)
    jmp L(mm_return)

L(mm_remaining_33_64_bytes_forward):
    movdqu (%r9), %xmm0
    movdqu 16(%r9), %xmm1
    movdqu -32(%r9, %rdx), %xmm2
    movdqu -16(%r9, %rdx), %xmm3
    movdqu %xmm0, (%r8)
    movdqu %xmm1, 16(%r8)
    movdqu %xmm2, -32(%r8, %rdx)
    movdqu %xmm3, -16(%r8, %rdx)
    jmp L(mm_return)

L(mm_remaining_17_32_bytes_forward):
    movdqu (%r9), %xmm0
    movdqu -16(%r9, %rdx), %xmm1
    movdqu %xmm0, (%r8)
    movdqu %xmm1, -16(%r8, %rdx)
    jmp L(mm_return)

L(mm_remaining_5_8_bytes_forward):
    movl (%r9), %esi
    movl -4(%r9,%rdx), %ebx
    movl %esi, (%r8)
    movl %ebx, -4(%r8,%rdx)
    jmp L(mm_return)

L(mm_remaining_9_16_bytes_forward):
    mov (%r9), %rsi
    mov -8(%r9, %rdx), %rbx
    mov %rsi, (%r8)
    mov %rbx, -8(%r8, %rdx)
    jmp L(mm_return)

L(mm_remaining_3_4_bytes_forward):
    movzwl -2(%r9,%rdx), %esi
    movzwl (%r9), %ebx
    movw %si, -2(%r8,%rdx)
    movw %bx, (%r8)
    jmp L(mm_return)

L(mm_len_0_16_bytes_forward):
    testb $24, %dl
    jne L(mm_len_9_16_bytes_forward)
    testb $4, %dl
    .p2align 4,,5
    jne L(mm_len_5_8_bytes_forward)
    test %rdx, %rdx
    .p2align 4,,2
    je L(mm_return)
    testb $2, %dl
    .p2align 4,,1
    jne L(mm_len_2_4_bytes_forward)
    movzbl -1(%rsi,%rdx), %ebx
    movzbl (%rsi), %esi
    movb %bl, -1(%rdi,%rdx)
    movb %sil, (%rdi)
    jmp L(mm_return)

L(mm_len_2_4_bytes_forward):
    movzwl -2(%rsi,%rdx), %ebx
    movzwl (%rsi), %esi
    movw %bx, -2(%rdi,%rdx)
    movw %si, (%rdi)
    jmp L(mm_return)

L(mm_len_5_8_bytes_forward):
    movl (%rsi), %ebx
    movl -4(%rsi,%rdx), %esi
    movl %ebx, (%rdi)
    movl %esi, -4(%rdi,%rdx)
    jmp L(mm_return)

L(mm_len_9_16_bytes_forward):
    mov (%rsi), %rbx
    mov -8(%rsi, %rdx), %rsi
    mov %rbx, (%rdi)
    mov %rsi, -8(%rdi, %rdx)
    jmp L(mm_return)

L(mm_recalc_len):
/* Compute in %rdx how many bytes are left to copy after
   the main loop stops.  */
    mov %rbx, %rdx
    sub %rdi, %rdx
/* The code for copying backwards.  */
L(mm_len_0_or_more_backward):

/* Now do checks for lengths. We do [0..16], [16..32], [32..64], [64..128]
   separately.  */
    cmp $16, %rdx
    jbe L(mm_len_0_16_bytes_backward)

    cmp $32, %rdx
    ja L(mm_len_32_or_more_backward)

/* Copy [0..32] and return.  */
    movdqu (%rsi), %xmm0
    movdqu -16(%rsi, %rdx), %xmm1
    movdqu %xmm0, (%rdi)
    movdqu %xmm1, -16(%rdi, %rdx)
    jmp L(mm_return)
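/* The backward cases for [32..64] and [64..128] below are identical to the
   forward ones: all source data is loaded before anything is stored, so the
   copy direction only matters for lengths above 128 bytes.  */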

L(mm_len_32_or_more_backward):
    cmp $64, %rdx
    ja L(mm_len_64_or_more_backward)

/* Copy [0..64] and return.  */
    movdqu (%rsi), %xmm0
    movdqu 16(%rsi), %xmm1
    movdqu -16(%rsi, %rdx), %xmm2
    movdqu -32(%rsi, %rdx), %xmm3
    movdqu %xmm0, (%rdi)
    movdqu %xmm1, 16(%rdi)
    movdqu %xmm2, -16(%rdi, %rdx)
    movdqu %xmm3, -32(%rdi, %rdx)
    jmp L(mm_return)

L(mm_len_64_or_more_backward):
    cmp $128, %rdx
    ja L(mm_len_128_or_more_backward)

/* Copy [0..128] and return.  */
    movdqu (%rsi), %xmm0
    movdqu 16(%rsi), %xmm1
    movdqu 32(%rsi), %xmm2
    movdqu 48(%rsi), %xmm3
    movdqu -64(%rsi, %rdx), %xmm4
    movdqu -48(%rsi, %rdx), %xmm5
    movdqu -32(%rsi, %rdx), %xmm6
    movdqu -16(%rsi, %rdx), %xmm7
    movdqu %xmm0, (%rdi)
    movdqu %xmm1, 16(%rdi)
    movdqu %xmm2, 32(%rdi)
    movdqu %xmm3, 48(%rdi)
    movdqu %xmm4, -64(%rdi, %rdx)
    movdqu %xmm5, -48(%rdi, %rdx)
    movdqu %xmm6, -32(%rdi, %rdx)
    movdqu %xmm7, -16(%rdi, %rdx)
    jmp L(mm_return)

L(mm_len_128_or_more_backward):
/* Align the destination address. We must save the last 64 bytes of
   the source before they can be overwritten.  */
    movdqu -16(%rsi, %rdx), %xmm0
    movdqu -32(%rsi, %rdx), %xmm1
    movdqu -48(%rsi, %rdx), %xmm2
    movdqu -64(%rsi, %rdx), %xmm3

    lea (%rdi, %rdx), %r9
    and $-64, %r9 /* r9 = aligned dst */

    mov %rsi, %r8
    sub %rdi, %r8 /* r8 = src - dst, diff */

    movdqu -16(%r9, %r8), %xmm4
    movdqu -32(%r9, %r8), %xmm5
    movdqu -48(%r9, %r8), %xmm6
    movdqu -64(%r9, %r8), %xmm7

    movdqu %xmm0, -16(%rdi, %rdx)
    movdqu %xmm1, -32(%rdi, %rdx)
    movdqu %xmm2, -48(%rdi, %rdx)
    movdqu %xmm3, -64(%rdi, %rdx)
    movdqa %xmm4, -16(%r9)
    movaps %xmm5, -32(%r9)
    movaps %xmm6, -48(%r9)
    movaps %xmm7, -64(%r9)
    lea -64(%r9), %r9

    lea 64(%rdi), %rbx
    and $-64, %rbx

    cmp %r9, %rbx
    jae L(mm_recalc_len)

    cmp __x86_shared_cache_size_half(%rip), %rdx

    ja L(mm_overlapping_check_backward)

    .p2align 4
L(mm_main_loop_backward):

    prefetcht0 -128(%r9, %r8)

    movdqu -64(%r9, %r8), %xmm0
    movdqu -48(%r9, %r8), %xmm1
    movdqu -32(%r9, %r8), %xmm2
    movdqu -16(%r9, %r8), %xmm3
    movdqa %xmm0, -64(%r9)
    movaps %xmm1, -48(%r9)
    movaps %xmm2, -32(%r9)
    movaps %xmm3, -16(%r9)
    lea -64(%r9), %r9
    cmp %r9, %rbx
    jb L(mm_main_loop_backward)
    jmp L(mm_recalc_len)

/* Copy [0..16] and return.  */
L(mm_len_0_16_bytes_backward):
    testb $24, %dl
    jnz L(mm_len_9_16_bytes_backward)
    testb $4, %dl
    .p2align 4,,5
    jnz L(mm_len_5_8_bytes_backward)
    test %rdx, %rdx
    .p2align 4,,2
    je L(mm_return)
    testb $2, %dl
    .p2align 4,,1
    jne L(mm_len_3_4_bytes_backward)
    movzbl -1(%rsi,%rdx), %ebx
    movzbl (%rsi), %ecx
    movb %bl, -1(%rdi,%rdx)
    movb %cl, (%rdi)
    jmp L(mm_return)

L(mm_len_3_4_bytes_backward):
    movzwl -2(%rsi,%rdx), %ebx
    movzwl (%rsi), %ecx
    movw %bx, -2(%rdi,%rdx)
    movw %cx, (%rdi)
    jmp L(mm_return)

L(mm_len_9_16_bytes_backward):
    movl -4(%rsi,%rdx), %ebx
    movl -8(%rsi,%rdx), %ecx
    movl %ebx, -4(%rdi,%rdx)
    movl %ecx, -8(%rdi,%rdx)
    sub $8, %rdx
    jmp L(mm_len_0_16_bytes_backward)

L(mm_len_5_8_bytes_backward):
    movl (%rsi), %ebx
    movl -4(%rsi,%rdx), %ecx
    movl %ebx, (%rdi)
    movl %ecx, -4(%rdi,%rdx)

L(mm_return):
    RETURN

/* Big length copy forward part.  */
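/* Reached when the length exceeds __x86_shared_cache_size_half.  If the
   distance between source and destination plus the remaining length still
   fits within __x86_shared_cache_size, fall back to the regular cached
   loop; otherwise stream the data with non-temporal movntdq stores and
   order them with sfence before copying the leftover bytes.  The backward
   variant below applies the same check using dst - src.  */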

    .p2align 4

L(mm_overlapping_check_forward):
    mov %rsi, %r9
    add %rdx, %r9
    cmp __x86_shared_cache_size(%rip), %r9
    jbe L(mm_main_loop_forward)

L(mm_large_page_loop_forward):
    movdqu (%r8, %rsi), %xmm0
    movdqu 16(%r8, %rsi), %xmm1
    movdqu 32(%r8, %rsi), %xmm2
    movdqu 48(%r8, %rsi), %xmm3
    movntdq %xmm0, (%r8)
    movntdq %xmm1, 16(%r8)
    movntdq %xmm2, 32(%r8)
    movntdq %xmm3, 48(%r8)
    lea 64(%r8), %r8
    cmp %r8, %rbx
    ja L(mm_large_page_loop_forward)
    sfence
    jmp L(mm_copy_remaining_forward)

/* Big length copy backward part.  */
    .p2align 4

L(mm_overlapping_check_backward):
    mov %rdi, %r11
    sub %rsi, %r11 /* r11 = dst - src, diff */
    add %rdx, %r11
    cmp __x86_shared_cache_size(%rip), %r11
    jbe L(mm_main_loop_backward)

L(mm_large_page_loop_backward):
    movdqu -64(%r9, %r8), %xmm0
    movdqu -48(%r9, %r8), %xmm1
    movdqu -32(%r9, %r8), %xmm2
    movdqu -16(%r9, %r8), %xmm3
    movntdq %xmm0, -64(%r9)
    movntdq %xmm1, -48(%r9)
    movntdq %xmm2, -32(%r9)
    movntdq %xmm3, -16(%r9)
    lea -64(%r9), %r9
    cmp %r9, %rbx
    jb L(mm_large_page_loop_backward)
    sfence
    jmp L(mm_recalc_len)

END (MEMMOVE)

ALIAS_SYMBOL(memcpy, MEMMOVE)
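/* memcpy is exported as an alias of MEMMOVE above, so the memcpy entry point
   shares this overlap-safe implementation.  */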