/*
 * memcpy - copy memory area
 *
 * Copyright (c) 2019-2023, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, SVE, unaligned accesses.
 *
 */

#include "asmdefs.h"

#ifdef HAVE_SVE

.arch armv8-a+sve

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define tmp1	x6
#define vlen	x6

#define A_q	q0
#define B_q	q1
#define C_q	q2
#define D_q	q3
#define E_q	q4
#define F_q	q5
#define G_q	q6
#define H_q	q7

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point.  It uses unaligned accesses and branchless
   sequences to keep the code small and simple, and to improve performance.
   SVE vectors are used to speed up small copies.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per iteration.
   The source pointer is 16-byte aligned to minimize unaligned accesses.
   The loop tail is handled by always copying 64 bytes from the end.
*/

ENTRY_ALIAS (__memmove_aarch64_sve)
ENTRY (__memcpy_aarch64_sve)
	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

	cmp	count, 128
	b.hi	L(copy_long)
	cntb	vlen
	cmp	count, vlen, lsl 1
	b.hi	L(copy32_128)

	/* Small copies: at most 2*VL bytes, using two predicated SVE loads
	   and stores.  */
	whilelo	p0.b, xzr, count
	whilelo	p1.b, vlen, count
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p1/z, [src, 1, mul vl]
	st1b	z0.b, p0, [dstin, 0, mul vl]
	st1b	z1.b, p1, [dstin, 1, mul vl]
	ret

	/* Medium copies: 33..128 bytes.  */
L(copy32_128):
	add	srcend, src, count
	add	dstend, dstin, count
	ldp	A_q, B_q, [src]
	ldp	C_q, D_q, [srcend, -32]
	cmp	count, 64
	b.hi	L(copy128)
	stp	A_q, B_q, [dstin]
	stp	C_q, D_q, [dstend, -32]
	ret

	/* Copy 65..128 bytes.  */
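	/* Every load is issued before any store, so overlapping buffers are
	   handled correctly without an explicit overlap check.  */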
L(copy128):
	ldp	E_q, F_q, [src, 32]
	cmp	count, 96
	b.ls	L(copy96)
	ldp	G_q, H_q, [srcend, -64]
	stp	G_q, H_q, [dstend, -64]
L(copy96):
	stp	A_q, B_q, [dstin]
	stp	E_q, F_q, [dstin, 32]
	stp	C_q, D_q, [dstend, -32]
	ret

	/* Copy more than 128 bytes.  */
L(copy_long):
	add	srcend, src, count
	add	dstend, dstin, count

	/* Use backwards copy if there is an overlap.  */
	sub	tmp1, dstin, src
	cmp	tmp1, count
	b.lo	L(copy_long_backwards)

	/* Copy 16 bytes and then align src to 16-byte alignment.  */
	ldr	D_q, [src]
	and	tmp1, src, 15
	bic	src, src, 15
	sub	dst, dstin, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_q, B_q, [src, 16]
	str	D_q, [dstin]
	ldp	C_q, D_q, [src, 48]
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	L(copy64_from_end)
L(loop64):
	stp	A_q, B_q, [dst, 16]
	ldp	A_q, B_q, [src, 80]
	stp	C_q, D_q, [dst, 48]
	ldp	C_q, D_q, [src, 112]
	add	src, src, 64
	add	dst, dst, 64
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last iteration and copy 64 bytes from the end.  */
L(copy64_from_end):
	ldp	E_q, F_q, [srcend, -64]
	stp	A_q, B_q, [dst, 16]
	ldp	A_q, B_q, [srcend, -32]
	stp	C_q, D_q, [dst, 48]
	stp	E_q, F_q, [dstend, -64]
	stp	A_q, B_q, [dstend, -32]
	ret

	/* Large backwards copy for overlapping copies.
	   Copy 16 bytes and then align srcend to 16-byte alignment.  */
L(copy_long_backwards):
	cbz	tmp1, L(return)
	ldr	D_q, [srcend, -16]
	and	tmp1, srcend, 15
	bic	srcend, srcend, 15
	sub	count, count, tmp1
	ldp	A_q, B_q, [srcend, -32]
	str	D_q, [dstend, -16]
	ldp	C_q, D_q, [srcend, -64]
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	L(copy64_from_start)

L(loop64_backwards):
	str	B_q, [dstend, -16]
	str	A_q, [dstend, -32]
	ldp	A_q, B_q, [srcend, -96]
	str	D_q, [dstend, -48]
	str	C_q, [dstend, -64]!
	ldp	C_q, D_q, [srcend, -128]
	sub	srcend, srcend, 64
	subs	count, count, 64
	b.hi	L(loop64_backwards)

	/* Write the last iteration and copy 64 bytes from the start.  */
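	/* The first 64 bytes are copied directly from the original src to
	   dstin, mirroring copy64_from_end in the forward path.  */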
L(copy64_from_start):
	ldp	E_q, F_q, [src, 32]
	stp	A_q, B_q, [dstend, -32]
	ldp	A_q, B_q, [src]
	stp	C_q, D_q, [dstend, -64]
	stp	E_q, F_q, [dstin, 32]
	stp	A_q, B_q, [dstin]
L(return):
	ret

END (__memcpy_aarch64_sve)

#endif