/* strlen - calculate the length of a string.
 *
 * Copyright (c) 2020-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD.
 * MTE compatible.
 */

#include "asmdefs.h"

/* Register aliases.  Note srcin/result share x0: the original pointer is
   consumed once the final length is computed.  */
#define srcin	x0
#define result	x0

#define src	x1
#define synd	x2
#define tmp	x3
#define shift	x4

#define data	q0
#define vdata	v0
#define vhas_nul v1
#define vend	v2
#define dend	d2

/* Core algorithm:
   Process the string in 16-byte aligned chunks.  Compute a 64-bit mask with
   four bits per byte using the shrn instruction.  A count trailing zeros then
   identifies the first zero byte.

   All loads are 16-byte aligned, so they never cross a 16-byte tag granule
   that the string does not at least partially occupy — this is what makes the
   routine MTE compatible.  */

ENTRY (__strlen_aarch64_mte)
	PTR_ARG (0)
	bic	src, srcin, 15		/* Align src down to 16 bytes.  */
	ld1	{vdata.16b}, [src]
	cmeq	vhas_nul.16b, vdata.16b, 0
	lsl	shift, srcin, 2		/* Bit offset of srcin in the 4-bit-
					   per-byte syndrome.  */
	shrn	vend.8b, vhas_nul.8h, 4	/* 128->64: 4 syndrome bits/byte.  */
	fmov	synd, dend
	lsr	synd, synd, shift	/* Drop bytes before srcin.  */
	cbz	synd, L(loop)

	/* NUL found in the first (partial) 16-byte chunk.  */
	rbit	synd, synd
	clz	result, synd
	lsr	result, result, 2	/* 4 bits per byte -> byte index.  */
	ret

	.p2align 5
L(loop):
	/* Unrolled x2: test chunks at src+16 and src+32, advancing src by 32
	   per iteration.  umaxp collapses the compare result into 64 bits so
	   a single scalar cbz/cbnz can test for any NUL byte.  */
	ldr	data, [src, 16]
	cmeq	vhas_nul.16b, vdata.16b, 0
	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
	fmov	synd, dend
	cbnz	synd, L(loop_end)
	ldr	data, [src, 32]!
	cmeq	vhas_nul.16b, vdata.16b, 0
	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
	fmov	synd, dend
	cbz	synd, L(loop)
	sub	src, src, 16		/* NUL was in the second chunk; undo
					   half of the 32-byte advance.  */
L(loop_end):
	/* vhas_nul holds the compare result for the chunk at src+16.
	   Recompute the precise 4-bit-per-byte syndrome to locate the NUL.  */
	shrn	vend.8b, vhas_nul.8h, 4	/* 128->64 */
	sub	result, src, srcin
	fmov	synd, dend
#ifndef __AARCH64EB__
	rbit	synd, synd		/* Little-endian: first byte maps to
					   the low syndrome bits.  */
#endif
	add	result, result, 16	/* Chunk was loaded from src+16.  */
	clz	tmp, synd
	add	result, result, tmp, lsr 2
	ret

END (__strlen_aarch64_mte)