/*
 * strcpy/stpcpy - copy a string returning pointer to start/end.
 *
 * Copyright (c) 2020-2023, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD.
 * MTE compatible.
 */

#include "asmdefs.h"

/* Register aliases.  Note the deliberate overlaps: len/synd share x4 and
   tmp/shift share x5 — the pairs are never live at the same time.  */
#define dstin		x0
#define srcin		x1
#define result		x0

#define src		x2
#define dst		x3
#define len		x4
#define synd		x4
#define tmp		x5
#define shift		x5
#define data1		x6
#define dataw1		w6
#define data2		x7
#define dataw2		w7

#define dataq		q0
#define vdata		v0
#define vhas_nul	v1
#define vend		v2
#define dend		d2
#define dataq2		q1

/* One source file builds both entry points: stpcpy additionally returns a
   pointer to the copied NUL, which IFSTPCPY() computes just before ret.  */
#ifdef BUILD_STPCPY
# define STRCPY __stpcpy_aarch64
# define IFSTPCPY(X,...) X,__VA_ARGS__
#else
# define STRCPY __strcpy_aarch64
# define IFSTPCPY(X,...)
#endif

/*
   Core algorithm:
   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
   per byte. We take 4 bits of every comparison byte with shift right and narrow
   by 4 instruction. Since the bits in the nibble mask reflect the order in
   which things occur in the original string, counting leading zeros identifies
   exactly which byte matched.  */

ENTRY (STRCPY)
	PTR_ARG (0)
	PTR_ARG (1)
	/* Align the first load down to 16 bytes so it cannot cross a page
	   (or MTE granule) boundary; unwanted leading bytes are masked off
	   by shifting the nibble syndrome below.  */
	bic	src, srcin, 15
	ld1	{vdata.16b}, [src]
	cmeq	vhas_nul.16b, vdata.16b, 0
	lsl	shift, srcin, 2		/* 4 syndrome bits per byte, so the
					   bit offset is (srcin % 16) * 4;
					   lsr only uses the low 6 bits.  */
	shrn	vend.8b, vhas_nul.8h, 4	/* 128-bit match mask -> 64-bit.  */
	fmov	synd, dend
	lsr	synd, synd, shift	/* Discard bytes before srcin.  */
	cbnz	synd, L(tail)

	ldr	dataq, [src, 16]!
	cmeq	vhas_nul.16b, vdata.16b, 0
	shrn	vend.8b, vhas_nul.8h, 4
	fmov	synd, dend
	cbz	synd, L(start_loop)

	/* NUL found in the second chunk: total length is 16..31 bytes.  */
#ifndef __AARCH64EB__
	rbit	synd, synd
#endif
	sub	tmp, src, srcin
	clz	len, synd
	add	len, tmp, len, lsr 2	/* len = strlen (srcin).  */
	tbz	len, 4, L(less16)
	/* Copy 16..31 bytes using two possibly-overlapping 16-byte moves.  */
	sub	tmp, len, 15
	ldr	dataq, [srcin]
	ldr	dataq2, [srcin, tmp]
	str	dataq, [dstin]
	str	dataq2, [dstin, tmp]
	IFSTPCPY (add result, dstin, len)
	ret

L(tail):
	/* NUL is within the first 16 bytes starting at srcin.  */
	rbit	synd, synd
	clz	len, synd
	lsr	len, len, 2		/* Nibble index -> byte index.  */
L(less16):
	tbz	len, 3, L(less8)
	/* 8..15 bytes: two overlapping 8-byte moves.  */
	sub	tmp, len, 7
	ldr	data1, [srcin]
	ldr	data2, [srcin, tmp]
	str	data1, [dstin]
	str	data2, [dstin, tmp]
	IFSTPCPY (add result, dstin, len)
	ret

	.p2align 4
L(less8):
	subs	tmp, len, 3
	b.lo	L(less4)
	/* 4..7 bytes: two overlapping 4-byte moves.  */
	ldr	dataw1, [srcin]
	ldr	dataw2, [srcin, tmp]
	str	dataw1, [dstin]
	str	dataw2, [dstin, tmp]
	IFSTPCPY (add result, dstin, len)
	ret

L(less4):
	cbz	len, L(zerobyte)
	/* 1..3 bytes: a 2-byte move may copy one byte past the NUL, which
	   is then overwritten by the explicit NUL store below.  */
	ldrh	dataw1, [srcin]
	strh	dataw1, [dstin]
L(zerobyte):
	strb	wzr, [dstin, len]
	IFSTPCPY (add result, dstin, len)
	ret

	.p2align 4
L(start_loop):
	/* String is at least 32 bytes.  Copy the unaligned head, then run
	   the aligned loop; tmp = srcin - dstin lets dst track src.  */
	sub	tmp, srcin, dstin
	ldr	dataq2, [srcin]
	sub	dst, src, tmp
	str	dataq2, [dstin]
L(loop):
	/* Unrolled by two: vdata holds the chunk checked last iteration.  */
	str	dataq, [dst], 32
	ldr	dataq, [src, 16]
	cmeq	vhas_nul.16b, vdata.16b, 0
	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b	/* Any NUL?  */
	fmov	synd, dend
	cbnz	synd, L(loopend)
	str	dataq, [dst, -16]
	ldr	dataq, [src, 32]!
	cmeq	vhas_nul.16b, vdata.16b, 0
	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
	fmov	synd, dend
	cbz	synd, L(loop)
	add	dst, dst, 16
L(loopend):
	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
	fmov	synd, dend
	sub	dst, dst, 31
#ifndef __AARCH64EB__
	rbit	synd, synd
#endif
	clz	len, synd
	lsr	len, len, 2
	add	dst, dst, len
	/* Copy the final (overlapping) 16 bytes ending exactly at the NUL.  */
	ldr	dataq, [dst, tmp]
	str	dataq, [dst]
	IFSTPCPY (add result, dst, 15)
	ret

END (STRCPY)