/*
 * memcpy - copy memory area
 *
 * Copyright (c) 2012-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#include "asmdefs.h"

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_l	x10
#define C_lw	w10
#define C_h	x11
#define D_l	x12
#define D_h	x13
#define E_l	x14
#define E_h	x15
#define F_l	x16
#define F_h	x17
#define G_l	count
#define G_h	dst
#define H_l	src
#define H_h	srcend
#define tmp1	x14

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point.  It uses unaligned accesses and branchless
   sequences to keep the code small and simple, and to improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per
   iteration.  The destination pointer is 16-byte aligned to minimize
   unaligned accesses.  The loop tail is handled by always copying 64 bytes
   from the end.  */

ENTRY_ALIAS (__memmove_aarch64)
ENTRY (__memcpy_aarch64)
	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)
	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 128
	b.hi	L(copy_long)
	cmp	count, 32
	b.hi	L(copy32_128)

	/* Small copies: 0..32 bytes.  */
	cmp	count, 16
	b.lo	L(copy16)
	ldp	A_l, A_h, [src]
	ldp	D_l, D_h, [srcend, -16]
	stp	A_l, A_h, [dstin]
	stp	D_l, D_h, [dstend, -16]
	ret

	/* Copy 8-15 bytes.  */
L(copy16):
	tbz	count, 3, L(copy8)
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret
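	/* Note: the fixed-size paths in this file all load one block from
	   the start of the buffer and one block from the end, then store
	   both.  When count is less than twice the block size the two
	   stores simply overlap, so every length in the range is handled
	   without extra branches.  E.g. for a 20-byte copy the path above
	   loads bytes 0-15 and 4-19; the middle bytes are written twice.  */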
	.p2align 3
	/* Copy 4-7 bytes.  */
L(copy8):
	tbz	count, 2, L(copy4)
	ldr	A_lw, [src]
	ldr	B_lw, [srcend, -4]
	str	A_lw, [dstin]
	str	B_lw, [dstend, -4]
	ret

	/* Copy 0..3 bytes using a branchless sequence.  */
L(copy4):
	cbz	count, L(copy0)
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	C_lw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	C_lw, [dstend, -1]
L(copy0):
	ret

	.p2align 4
	/* Medium copies: 33..128 bytes.  */
L(copy32_128):
	ldp	A_l, A_h, [src]
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [srcend, -32]
	ldp	D_l, D_h, [srcend, -16]
	cmp	count, 64
	b.hi	L(copy128)
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstend, -32]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Copy 65..128 bytes.  */
L(copy128):
	ldp	E_l, E_h, [src, 32]
	ldp	F_l, F_h, [src, 48]
	cmp	count, 96
	b.ls	L(copy96)
	ldp	G_l, G_h, [srcend, -64]
	ldp	H_l, H_h, [srcend, -48]
	stp	G_l, G_h, [dstend, -64]
	stp	H_l, H_h, [dstend, -48]
L(copy96):
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	E_l, E_h, [dstin, 32]
	stp	F_l, F_h, [dstin, 48]
	stp	C_l, C_h, [dstend, -32]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Copy more than 128 bytes.  */
L(copy_long):
	/* Use a backwards copy if there is an overlap: dstin - src is
	   computed with unsigned wraparound, so the compare below is true
	   exactly when dstin lies in [src, src + count), i.e. when a
	   forwards copy would overwrite source bytes not yet read.  */
	sub	tmp1, dstin, src
	cbz	tmp1, L(copy0)
	cmp	tmp1, count
	b.lo	L(copy_long_backwards)

	/* Copy the first 16 bytes and then align dst to 16-byte alignment.  */

	ldp	D_l, D_h, [src]
	and	tmp1, dstin, 15
	bic	dst, dstin, 15
	sub	src, src, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_l, A_h, [src, 16]
	stp	D_l, D_h, [dstin]
	ldp	B_l, B_h, [src, 32]
	ldp	C_l, C_h, [src, 48]
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	L(copy64_from_end)
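	/* The 64-byte loop below is software pipelined: A-D already hold
	   the next 64 source bytes, so each iteration stores the data
	   loaded on the previous iteration while issuing the next four
	   loads.  After the loop exits, L(copy64_from_end) drains the
	   in-flight registers and unconditionally copies the final 64
	   bytes from srcend, which covers the remaining tail without a
	   separate cleanup loop.  */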
L(loop64):
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [src, 16]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [src, 32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [src, 48]
	stp	D_l, D_h, [dst, 64]!
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last iteration and copy 64 bytes from the end.  */
L(copy64_from_end):
	ldp	E_l, E_h, [srcend, -64]
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [srcend, -48]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [srcend, -16]
	stp	D_l, D_h, [dst, 64]
	stp	E_l, E_h, [dstend, -64]
	stp	A_l, A_h, [dstend, -48]
	stp	B_l, B_h, [dstend, -32]
	stp	C_l, C_h, [dstend, -16]
	ret

	.p2align 4

	/* Large backwards copy for overlapping copies.
	   Copy the last 16 bytes and then align dstend to 16-byte
	   alignment.  */
L(copy_long_backwards):
	ldp	D_l, D_h, [srcend, -16]
	and	tmp1, dstend, 15
	sub	srcend, srcend, tmp1
	sub	count, count, tmp1
	ldp	A_l, A_h, [srcend, -16]
	stp	D_l, D_h, [dstend, -16]
	ldp	B_l, B_h, [srcend, -32]
	ldp	C_l, C_h, [srcend, -48]
	ldp	D_l, D_h, [srcend, -64]!
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	L(copy64_from_start)

L(loop64_backwards):
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [srcend, -16]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [srcend, -48]
	stp	D_l, D_h, [dstend, -64]!
	ldp	D_l, D_h, [srcend, -64]!
	subs	count, count, 64
	b.hi	L(loop64_backwards)

	/* Write the last iteration and copy 64 bytes from the start.  */
L(copy64_from_start):
	ldp	G_l, G_h, [src, 48]
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [src, 32]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [src, 16]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [src]
	stp	D_l, D_h, [dstend, -64]
	stp	G_l, G_h, [dstin, 48]
	stp	A_l, A_h, [dstin, 32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin]
	ret

END (__memcpy_aarch64)
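
/* For reference, the dispatch above can be summarised in C.  This is an
   illustrative sketch only (not part of the build; the enum and function
   names are hypothetical).  It shows how a single unsigned compare doubles
   as the memmove overlap test: dstin - src wraps around when dstin < src,
   so the compare is true exactly when dstin lies in [src, src + count).

       #include <stddef.h>
       #include <stdint.h>

       enum copy_path { SMALL, MEDIUM, LARGE_FORWARDS, LARGE_BACKWARDS };

       static enum copy_path
       classify (const void *dstin, const void *src, size_t count)
       {
         if (count <= 32)
           return SMALL;              // 0..32 bytes
         if (count <= 128)
           return MEDIUM;             // 33..128 bytes
         // The real code also returns immediately when dstin == src.
         if ((uintptr_t) dstin - (uintptr_t) src < count)
           return LARGE_BACKWARDS;    // a forwards copy would clobber src
         return LARGE_FORWARDS;
       }
*/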