/* strlen - calculate the length of a string.
 *
 * Copyright (c) 2020-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
 * Not MTE compatible.
 */

#include "asmdefs.h"

/* Scalar register aliases.  srcin and len share x0: the input pointer is
   consumed while the result is computed, and the length is returned in x0.  */
#define srcin		x0
#define len		x0

#define src		x1
#define data1		x2
#define data2		x3
#define has_nul1	x4
#define has_nul2	x5
/* tmp1/tmp2 deliberately alias has_nul1/has_nul2: the temporaries are dead
   by the time the NUL syndromes are written into x4/x5.  */
#define tmp1		x4
#define tmp2		x5
#define tmp3		x6
#define tmp4		x7
#define zeroones	x8

/* Vector register aliases for the SIMD loop and page-cross path.  */
#define maskv		v0
#define maskd		d0
#define dataq1		q1
#define dataq2		q2
#define datav1		v1
#define datav2		v2
/* tmp/tmpw and synd/syndw reuse x2/x3; data1/data2 are dead in those paths.  */
#define tmp		x2
#define tmpw		w2
#define synd		x3
#define syndw		w3
#define shift		x4

/* For the first 32 bytes, NUL detection works on the principle that
   (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero if a
   byte is zero, and can be done in parallel across the entire word.  */

#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f

/* To test the page crossing code path more thoroughly, compile with
   -DTEST_PAGE_CROSS - this will force all calls through the slower
   entry path.  This option is not intended for production use.  */

#ifdef TEST_PAGE_CROSS
# define MIN_PAGE_SIZE 32
#else
# define MIN_PAGE_SIZE 4096
#endif

/* Core algorithm:

   Since strings are short on average, we check the first 32 bytes of the
   string for a NUL character without aligning the string.  In order to use
   unaligned loads safely we must do a page cross check first.

   If there is a NUL byte we calculate the length from the 2 8-byte words
   using conditional select to reduce branch mispredictions (it is unlikely
   strlen will be repeatedly called on strings with the same length).

   If the string is longer than 32 bytes, align src so we don't need further
   page cross checks, and process 32 bytes per iteration using a fast SIMD
   loop.

   If the page cross check fails, we read 32 bytes from an aligned address,
   and ignore any characters before the string.  If it contains a NUL
   character, return the length, if not, continue in the main loop.
   */

ENTRY (__strlen_aarch64)
	PTR_ARG (0)
	/* If srcin is within 32 bytes of the end of a page, the unaligned
	   32-byte read below could fault on the next page: take the slow
	   page-cross path instead.  */
	and	tmp1, srcin, MIN_PAGE_SIZE - 1
	cmp	tmp1, MIN_PAGE_SIZE - 32
	b.hi	L(page_cross)

	/* Look for a NUL byte in the first 16 bytes.  */
	ldp	data1, data2, [srcin]
	mov	zeroones, REP8_01

#ifdef __AARCH64EB__
	/* For big-endian, carry propagation (if the final byte in the
	   string is 0x01) means we cannot use has_nul1/2 directly.
	   Since we expect strings to be small and early-exit,
	   byte-swap the data now so has_nul1/2 will be correct.  */
	rev	data1, data1
	rev	data2, data2
#endif
	/* Word-parallel NUL test: (X - 1) & ~(X | 0x7f) sets bit 7 of each
	   byte of X that is zero (see the file-header comment).  */
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, REP8_7f
	bics	has_nul1, tmp1, tmp2	/* Sets flags: Z iff has_nul1 == 0.  */
	bic	has_nul2, tmp3, tmp4
	ccmp	has_nul2, 0, 0, eq	/* Z set iff no NUL in either word.  */
	b.eq	L(bytes16_31)

	/* Find the exact offset of the first NUL byte in the first 16 bytes
	   from the string start.  Enter with C = has_nul1 == 0.  */
	csel	has_nul1, has_nul1, has_nul2, cc	/* Syndrome of the word holding the NUL.  */
	mov	len, 8
	rev	has_nul1, has_nul1	/* Reverse bytes so CLZ counts from the
					   first (lowest-addressed) byte.  */
	csel	len, xzr, len, cc	/* Base offset: 0 (first word) or 8.  */
	clz	tmp1, has_nul1
	add	len, len, tmp1, lsr 3	/* 8 syndrome bits per byte.  */
	ret

	/* Look for a NUL byte at offset 16..31 in the string.  */
L(bytes16_31):
	ldp	data1, data2, [srcin, 16]
#ifdef __AARCH64EB__
	rev	data1, data1
	rev	data2, data2
#endif
	/* Same word-parallel NUL test as above.  */
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, REP8_7f
	bics	has_nul1, tmp1, tmp2
	bic	has_nul2, tmp3, tmp4
	ccmp	has_nul2, 0, 0, eq	/* Z set iff no NUL in bytes 16..31.  */
	b.eq	L(loop_entry)

	/* Find the exact offset of the first NUL byte at offset 16..31 from
	   the string start.  Enter with C = has_nul1 == 0.  */
	csel	has_nul1, has_nul1, has_nul2, cc
	mov	len, 24
	rev	has_nul1, has_nul1
	mov	tmp3, 16
	clz	tmp1, has_nul1
	csel	len, tmp3, len, cc	/* Base offset: 16 (third word) or 24.  */
	add	len, len, tmp1, lsr 3
	ret

	nop			/* Padding before the loop entry — presumably
				   for branch-target alignment; confirm before
				   removing.  */
L(loop_entry):
	bic	src, srcin, 31	/* Align down to 32 bytes.  All bytes up to
				   srcin + 32 >= src + 32 are already checked,
				   so the loop can start at src + 32.  */

	.p2align 5
L(loop):
	ldp	dataq1, dataq2, [src, 32]!	/* Next aligned 32-byte block.  */
	/* Reduce 32 bytes to an 8-byte syndrome: a pairwise unsigned minimum
	   is zero iff one of the source bytes is zero.  */
	uminp	maskv.16b, datav1.16b, datav2.16b
	uminp	maskv.16b, maskv.16b, maskv.16b
	cmeq	maskv.8b, maskv.8b, 0
	fmov	synd, maskd
	cbz	synd, L(loop)	/* No NUL anywhere in this block.  */

	/* Low 32 bits of synd are non-zero if a NUL was found in datav1.  */
	cmeq	maskv.16b, datav1.16b, 0
	sub	len, src, srcin		/* Offset of this block from the start.  */
	cbnz	syndw, 1f
	cmeq	maskv.16b, datav2.16b, 0
	add	len, len, 16		/* NUL is in the second 16 bytes.  */
1:
	/* Generate a bitmask and compute correct byte offset.  SHRN by 4
	   narrows the per-byte 0xff/0x00 compare mask to a 64-bit syndrome
	   with 4 bits per input byte, in string order.  */
	shrn	maskv.8b, maskv.8h, 4
	fmov	synd, maskd
#ifndef __AARCH64EB__
	rbit	synd, synd	/* Make CLZ count from the first byte.  */
#endif
	clz	tmp, synd
	add	len, len, tmp, lsr 2	/* 4 syndrome bits per byte.  */
	ret

L(page_cross):
	/* Near a page end: read the whole aligned 32-byte block containing
	   srcin (never crosses the page) and ignore bytes before srcin.  */
	bic	src, srcin, 31
	mov	tmpw, 0x0c03
	movk	tmpw, 0xc030, lsl 16	/* Repeating mask 0xc0300c03.  */
	ld1	{datav1.16b, datav2.16b}, [src]
	dup	maskv.4s, tmpw
	cmeq	datav1.16b, datav1.16b, 0
	cmeq	datav2.16b, datav2.16b, 0
	/* The mask gives each byte position a distinct 2-bit field so the
	   two pairwise additions below fold 32 compare bytes into a 64-bit
	   syndrome with 2 bits per byte, in string order.  */
	and	datav1.16b, datav1.16b, maskv.16b
	and	datav2.16b, datav2.16b, maskv.16b
	addp	maskv.16b, datav1.16b, datav2.16b
	addp	maskv.16b, maskv.16b, maskv.16b
	fmov	synd, maskd
	/* Discard syndrome bits for the bytes before srcin.  LSR by register
	   uses the shift amount modulo 64, so only srcin % 32 matters.  */
	lsl	shift, srcin, 1
	lsr	synd, synd, shift
	cbz	synd, L(loop)	/* No NUL in the rest of this block: src is the
				   aligned base, so the loop resumes cleanly.  */

	rbit	synd, synd
	clz	len, synd
	lsr	len, len, 1	/* 2 syndrome bits per byte.  */
	ret

END (__strlen_aarch64)