/*
 * strchr - find a character in a string
 *
 * Copyright (c) 2014-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Neon Available.
 */

#include "asmdefs.h"

/* Arguments and results.  */
#define srcin		x0
#define chrin		w1

#define result		x0

#define src		x2
#define tmp1		x3
#define wtmp2		w4
#define tmp3		x5

#define vrepchr		v0
#define vdata1		v1
#define vdata2		v2
#define vhas_nul1	v3
#define vhas_nul2	v4
#define vhas_chr1	v5
#define vhas_chr2	v6
#define vrepmask_0	v7
#define vrepmask_c	v16
#define vend1		v17
#define vend2		v18

/* Core algorithm.

   For each 32-byte hunk we calculate a 64-bit syndrome value, with
   two bits per byte (LSB is always in bits 0 and 1, for both big
   and little-endian systems).  For each tuple, bit 0 is set iff
   the relevant byte matched the requested character; bit 1 is set
   iff the relevant byte matched the NUL end of string (we trigger
   off bit 0 for the special case of looking for NUL).  Since the
   bits in the syndrome reflect exactly the order in which things
   occur in the original string, a count_trailing_zeros() operation
   will identify exactly which byte is causing the termination, and
   why.  */

/* Locals and temporaries.  */

ENTRY (__strchr_aarch64)
	PTR_ARG (0)
	/* Magic constant 0xc0300c03 to allow us to identify which lane
	   matches the requested byte.  Even bits are set if the character
	   matches, odd bits if either the char is NUL or matches.  */
	mov	wtmp2, 0x0c03
	movk	wtmp2, 0xc030, lsl 16
	dup	vrepchr.16b, chrin
	bic	src, srcin, #31		/* Work with aligned 32-byte hunks.  */
	dup	vrepmask_c.4s, wtmp2
	ands	tmp1, srcin, #31
	add	vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s	/* equiv: lsl #1 */
	b.eq	L(loop)

	/* Input string is not 32-byte aligned.  Rather than forcing
	   the padding bytes to a safe value, we calculate the syndrome
	   for all the bytes, but then mask off those bits of the
	   syndrome that are related to the padding.  */
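	/* Illustratively, in C (a sketch only; `offset' stands for
	   srcin & 31, known to be non-zero on this path, and `syndrome'
	   for the 64-bit value computed below):

	     uint64_t pad = ~0ULL >> (64 - 2 * offset); // padding tuples
	     syndrome &= ~pad;                          // the BIC below

	   Each byte contributes a 2-bit tuple, so the `offset' padding
	   bytes occupy the low 2 * offset bits of the syndrome.  */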
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	neg	tmp1, tmp1
	cmeq	vhas_nul1.16b, vdata1.16b, #0
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_nul2.16b, vdata2.16b, #0
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	bif	vhas_nul1.16b, vhas_chr1.16b, vrepmask_0.16b
	bif	vhas_nul2.16b, vhas_chr2.16b, vrepmask_0.16b
	and	vend1.16b, vhas_nul1.16b, vrepmask_c.16b
	and	vend2.16b, vhas_nul2.16b, vrepmask_c.16b
	lsl	tmp1, tmp1, #1
	addp	vend1.16b, vend1.16b, vend2.16b		// 256->128
	mov	tmp3, #~0
	addp	vend1.16b, vend1.16b, vend2.16b		// 128->64
	lsr	tmp1, tmp3, tmp1

	mov	tmp3, vend1.d[0]
	bic	tmp1, tmp3, tmp1	// Mask padding bits.
	cbnz	tmp1, L(tail)

	.p2align 4
L(loop):
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	/* vhas_chr lanes are 0x00 or 0xff, so an unsigned >= against the
	   data merges the NUL test in: the result is set iff the byte
	   matched the character or was zero.  */
	cmhs	vhas_nul1.16b, vhas_chr1.16b, vdata1.16b
	cmhs	vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
	orr	vend1.16b, vhas_nul1.16b, vhas_nul2.16b
	umaxp	vend1.16b, vend1.16b, vend1.16b	// 128->64 fold for the test.
	mov	tmp1, vend1.d[0]
	cbz	tmp1, L(loop)

	/* Termination condition found.  Now need to establish exactly why
	   we terminated.  */
	bif	vhas_nul1.16b, vhas_chr1.16b, vrepmask_0.16b
	bif	vhas_nul2.16b, vhas_chr2.16b, vrepmask_0.16b
	and	vend1.16b, vhas_nul1.16b, vrepmask_c.16b
	and	vend2.16b, vhas_nul2.16b, vrepmask_c.16b
	addp	vend1.16b, vend1.16b, vend2.16b		// 256->128
	addp	vend1.16b, vend1.16b, vend2.16b		// 128->64
	mov	tmp1, vend1.d[0]
L(tail):
	/* Count the trailing zeros, by bit reversing...  */
	rbit	tmp1, tmp1
	/* Re-bias source.  */
	sub	src, src, #32
	clz	tmp1, tmp1	/* And counting the leading zeros.  */
	/* Tmp1 is even if the target character was found first.  Otherwise
	   we've found the end of string and we weren't looking for NUL.  */
	tst	tmp1, #1
	add	result, src, tmp1, lsr #1
	csel	result, result, xzr, eq
	ret

END (__strchr_aarch64)
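/* For reference only: a minimal scalar C model of the syndrome scheme
   used above (a sketch, not part of the build; the function names are
   hypothetical):

     #include <stdint.h>
     #include <stddef.h>

     // Two bits per byte: bit 0 = character match,
     // bit 1 = character-or-NUL match.
     static uint64_t
     syndrome_of (const unsigned char *hunk, unsigned char c)
     {
       uint64_t syn = 0;
       for (int i = 0; i < 32; i++)
	 {
	   uint64_t match = hunk[i] == c;
	   uint64_t nul = hunk[i] == 0;
	   syn |= (match | ((match | nul) << 1)) << (2 * i);
	 }
       return syn;
     }

     // Assumes syn != 0, i.e. this hunk terminated the search.
     static char *
     byte_from_syndrome (const char *hunk, uint64_t syn)
     {
       unsigned tz = __builtin_ctzll (syn); // first event in string order
       // Even tz: the character was found.  Odd tz: NUL came first and
       // the character was not found.
       return (tz & 1) ? NULL : (char *) hunk + tz / 2;
     }
*/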