/*
 * memcpy - copy memory area
 *
 * Copyright (c) 2019-2023, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
 *
 */

#include "asmdefs.h"

#define dstin   x0
#define src     x1
#define count   x2
#define dst     x3
#define srcend  x4
#define dstend  x5
#define A_l     x6
#define A_lw    w6
#define A_h     x7
#define B_l     x8
#define B_lw    w8
#define B_h     x9
#define C_lw    w10
#define tmp1    x14

#define A_q     q0
#define B_q     q1
#define C_q     q2
#define D_q     q3
#define E_q     q4
#define F_q     q5
#define G_q     q6
#define H_q     q7

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point.  It uses unaligned accesses and branchless
   sequences to keep the code small and simple and to improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per
   iteration.  The source pointer is 16-byte aligned to minimize unaligned
   accesses.  The loop tail is handled by always copying 64 bytes from the
   end.
*/
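/* A rough C model of the dispatch described above.  This is an illustration
   only and is not assembled; copy_small, copy_medium, copy_large_forwards
   and copy_large_backwards are hypothetical helpers, and size_t/uintptr_t
   come from <stddef.h>/<stdint.h>:

     void *memcpy_model (void *dst, const void *src, size_t count)
     {
       if (count <= 32)
         return copy_small (dst, src, count);
       if (count <= 128)
         return copy_medium (dst, src, count);
       // A single unsigned compare detects a harmful overlap: when the
       // unsigned distance dst - src is below count, a forward copy would
       // overwrite source bytes before reading them.
       if ((uintptr_t) dst - (uintptr_t) src < count)
         return copy_large_backwards (dst, src, count);
       return copy_large_forwards (dst, src, count);
     }
*/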
ENTRY_ALIAS (__memmove_aarch64_simd)
ENTRY (__memcpy_aarch64_simd)
        PTR_ARG (0)
        PTR_ARG (1)
        SIZE_ARG (2)
        add     srcend, src, count
        cmp     count, 128
        b.hi    L(copy_long)
        add     dstend, dstin, count
        cmp     count, 32
        b.hi    L(copy32_128)
        nop

        /* Small copies: 0..32 bytes.  */
        cmp     count, 16
        b.lo    L(copy16)
        ldr     A_q, [src]
        ldr     B_q, [srcend, -16]
        str     A_q, [dstin]
        str     B_q, [dstend, -16]
        ret

        .p2align 4
        /* Medium copies: 33..128 bytes.  */
L(copy32_128):
        ldp     A_q, B_q, [src]
        ldp     C_q, D_q, [srcend, -32]
        cmp     count, 64
        b.hi    L(copy128)
        stp     A_q, B_q, [dstin]
        stp     C_q, D_q, [dstend, -32]
        ret

        .p2align 4
        /* Copy 8-15 bytes.  */
L(copy16):
        tbz     count, 3, L(copy8)
        ldr     A_l, [src]
        ldr     A_h, [srcend, -8]
        str     A_l, [dstin]
        str     A_h, [dstend, -8]
        ret

        /* Copy 4-7 bytes.  */
L(copy8):
        tbz     count, 2, L(copy4)
        ldr     A_lw, [src]
        ldr     B_lw, [srcend, -4]
        str     A_lw, [dstin]
        str     B_lw, [dstend, -4]
        ret

        /* Copy 65..128 bytes.  */
L(copy128):
        ldp     E_q, F_q, [src, 32]
        cmp     count, 96
        b.ls    L(copy96)
        ldp     G_q, H_q, [srcend, -64]
        stp     G_q, H_q, [dstend, -64]
L(copy96):
        stp     A_q, B_q, [dstin]
        stp     E_q, F_q, [dstin, 32]
        stp     C_q, D_q, [dstend, -32]
        ret

        /* Copy 0..3 bytes using a branchless sequence.  */
L(copy4):
        cbz     count, L(copy0)
        lsr     tmp1, count, 1
        ldrb    A_lw, [src]
        ldrb    C_lw, [srcend, -1]
        ldrb    B_lw, [src, tmp1]
        strb    A_lw, [dstin]
        strb    B_lw, [dstin, tmp1]
        strb    C_lw, [dstend, -1]
L(copy0):
        ret

        .p2align 3
        /* Copy more than 128 bytes.  */
L(copy_long):
        add     dstend, dstin, count

        /* Use backwards copy if there is an overlap.  */
        sub     tmp1, dstin, src
        cmp     tmp1, count
        b.lo    L(copy_long_backwards)

        /* Copy 16 bytes and then align src to 16-byte alignment.  */
        ldr     D_q, [src]
        and     tmp1, src, 15
        bic     src, src, 15
        sub     dst, dstin, tmp1
        add     count, count, tmp1      /* Count is now 16 too large.  */
        ldp     A_q, B_q, [src, 16]
        str     D_q, [dstin]
        ldp     C_q, D_q, [src, 48]
        subs    count, count, 128 + 16  /* Test and readjust count.  */
        b.ls    L(copy64_from_end)
L(loop64):
        stp     A_q, B_q, [dst, 16]
        ldp     A_q, B_q, [src, 80]
        stp     C_q, D_q, [dst, 48]
        ldp     C_q, D_q, [src, 112]
        add     src, src, 64
        add     dst, dst, 64
        subs    count, count, 64
        b.hi    L(loop64)

        /* Write the last iteration and copy 64 bytes from the end.  */
L(copy64_from_end):
        ldp     E_q, F_q, [srcend, -64]
        stp     A_q, B_q, [dst, 16]
        ldp     A_q, B_q, [srcend, -32]
        stp     C_q, D_q, [dst, 48]
        stp     E_q, F_q, [dstend, -64]
        stp     A_q, B_q, [dstend, -32]
        ret
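/* Note on the loop tail above: rather than branching on count % 64, the
   final 64 bytes are always copied from srcend/dstend, overlapping whatever
   the last loop iteration already stored.  A minimal C sketch of the idea,
   illustrative only; it assumes count > 128 (as on this path) and
   non-overlapping buffers, uses library memcpy for the 64-byte blocks, and
   omits the source-alignment and software-pipelining done by the real loop:

     #include <string.h>

     static void copy_forwards_model (unsigned char *dst,
                                      const unsigned char *src,
                                      size_t count)
     {
       size_t i = 0;
       while (count - i > 64)             // main 64-byte blocks
         {
           memcpy (dst + i, src + i, 64);
           i += 64;
         }
       // Branchless tail: re-copying up to 63 bytes is cheaper than
       // branching on the residue.
       memcpy (dst + count - 64, src + count - 64, 64);
     }
*/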
        .p2align 4
        nop

        /* Large backwards copy for overlapping copies.
           Copy 16 bytes and then align srcend to 16-byte alignment.  */
L(copy_long_backwards):
        cbz     tmp1, L(copy0)
        ldr     D_q, [srcend, -16]
        and     tmp1, srcend, 15
        bic     srcend, srcend, 15
        sub     count, count, tmp1
        ldp     A_q, B_q, [srcend, -32]
        str     D_q, [dstend, -16]
        ldp     C_q, D_q, [srcend, -64]
        sub     dstend, dstend, tmp1
        subs    count, count, 128
        b.ls    L(copy64_from_start)

L(loop64_backwards):
        str     B_q, [dstend, -16]
        str     A_q, [dstend, -32]
        ldp     A_q, B_q, [srcend, -96]
        str     D_q, [dstend, -48]
        str     C_q, [dstend, -64]!
        ldp     C_q, D_q, [srcend, -128]
        sub     srcend, srcend, 64
        subs    count, count, 64
        b.hi    L(loop64_backwards)

        /* Write the last iteration and copy 64 bytes from the start.  */
L(copy64_from_start):
        ldp     E_q, F_q, [src, 32]
        stp     A_q, B_q, [dstend, -32]
        ldp     A_q, B_q, [src]
        stp     C_q, D_q, [dstend, -64]
        stp     E_q, F_q, [dstin, 32]
        stp     A_q, B_q, [dstin]
        ret

END (__memcpy_aarch64_simd)
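/* Usage note (an assumption, not part of this file): both entry points
   follow the standard memcpy/memmove contract, returning the original
   destination pointer, and would typically be declared along the lines of:

     void *__memcpy_aarch64_simd (void *__restrict dst,
                                  const void *__restrict src, size_t count);
     void *__memmove_aarch64_simd (void *dst, const void *src, size_t count);

   The memmove alias is safe for overlapping buffers because of the
   backwards-copy path above.  */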