/*
 * strrchr - find last position of a character in a string.
 *
 * Copyright (c) 2014-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Neon Available.
 */

#include "asmdefs.h"

/* Arguments and results.  */
#define srcin		x0
#define chrin		w1

#define result		x0

#define src		x2
#define tmp1		x3
#define wtmp2		w4
#define tmp3		x5
#define src_match	x6
#define src_offset	x7
#define const_m1	x8
#define tmp4		x9
#define nul_match	x10
#define chr_match	x11

#define vrepchr		v0
#define vdata1		v1
#define vdata2		v2
#define vhas_nul1	v3
#define vhas_nul2	v4
#define vhas_chr1	v5
#define vhas_chr2	v6
#define vrepmask_0	v7
#define vrepmask_c	v16
#define vend1		v17
#define vend2		v18

/* Core algorithm.

   For each 32-byte hunk we calculate a 64-bit syndrome value, with
   two bits per byte (LSB is always in bits 0 and 1, for both big
   and little-endian systems).  For each tuple, bit 0 is set iff
   the relevant byte matched the requested character; bit 1 is set
   iff the relevant byte matched the NUL end of string (we trigger
   off bit 0 for the special case of looking for NUL).  Since the bits
   in the syndrome reflect exactly the order in which things occur
   in the original string, a count_trailing_zeros() operation will
   identify exactly which byte is causing the termination, and why.  */

ENTRY (__strrchr_aarch64)
	PTR_ARG (0)
	/* Magic constant 0x40100401 allows us to identify which lane
	   matches the requested byte.  Magic constant 0x80200802 is used
	   similarly for NUL termination.  */
	mov	wtmp2, #0x0401
	movk	wtmp2, #0x4010, lsl #16
	dup	vrepchr.16b, chrin
	bic	src, srcin, #31		/* Work with aligned 32-byte hunks.  */
	dup	vrepmask_c.4s, wtmp2
	mov	src_offset, #0
	ands	tmp1, srcin, #31
	add	vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s	/* equiv: lsl #1 */
	b.eq	L(aligned)

	/* The input string is not 32-byte aligned.  Rather than forcing
	   the padding bytes to a safe value, we calculate the syndrome
	   for all the bytes, but then mask off those bits of the
	   syndrome that relate to the padding.  */
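	/* For example, if srcin & 31 == 3, the neg/lsl/lsr sequence below
	   computes tmp3 = ~0 >> (64 - 2*3), i.e. the low six syndrome bits
	   (two per padding byte), which the bic instructions then clear.  */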
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	neg	tmp1, tmp1
	cmeq	vhas_nul1.16b, vdata1.16b, #0
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_nul2.16b, vdata2.16b, #0
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	and	vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
	and	vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
	addp	vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b	// 256->128
	addp	vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b	// 256->128
	addp	vend1.16b, vhas_nul1.16b, vhas_chr1.16b		// 128->64
	mov	nul_match, vend1.d[0]
	lsl	tmp1, tmp1, #1
	mov	const_m1, #~0
	lsr	tmp3, const_m1, tmp1
	mov	chr_match, vend1.d[1]

	bic	nul_match, nul_match, tmp3	// Mask padding bits.
	bic	chr_match, chr_match, tmp3	// Mask padding bits.
	cbnz	nul_match, L(tail)

	.p2align 4
L(loop):
	cmp	chr_match, #0
	csel	src_match, src, src_match, ne
	csel	src_offset, chr_match, src_offset, ne
L(aligned):
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	uminp	vend1.16b, vdata1.16b, vdata2.16b
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
	cmeq	vend1.16b, vend1.16b, 0
	addp	vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b	// 256->128
	addp	vend1.16b, vend1.16b, vhas_chr1.16b		// 128->64
	mov	nul_match, vend1.d[0]
	mov	chr_match, vend1.d[1]
	cbz	nul_match, L(loop)

	cmeq	vhas_nul1.16b, vdata1.16b, #0
	cmeq	vhas_nul2.16b, vdata2.16b, #0
	and	vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
	and	vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
	addp	vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b
	addp	vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b
	mov	nul_match, vhas_nul1.d[0]

L(tail):
	/* Work out exactly where the string ends.  */
	sub	tmp4, nul_match, #1
	eor	tmp4, tmp4, nul_match
	ands	chr_match, chr_match, tmp4
	/* And pick the values corresponding to the last match.  */
	csel	src_match, src, src_match, ne
	csel	src_offset, chr_match, src_offset, ne

	/* Count down from the top of the syndrome to find the last match.  */
	clz	tmp3, src_offset
	/* src_match points just past the 32-byte hunk containing the match,
	   so we can simply subtract half the bit-offset into the syndrome.
	   Because we are counting down, we need to go back one more
	   character.  */
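	/* Concretely: a match at byte i of the hunk sets syndrome bit 2*i,
	   so clz == 63 - 2*i and (clz + 2) >> 1 == 32 - i; subtracting that
	   from src_match (hunk base + 32) gives hunk base + i.  */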
	add	tmp3, tmp3, #2
	sub	result, src_match, tmp3, lsr #1
	/* But if the syndrome shows no match was found, then return NULL.  */
	cmp	src_offset, #0
	csel	result, result, xzr, ne

	ret

END (__strrchr_aarch64)
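
/* Illustrative model (not part of the build): a plain C sketch of the
   two-bits-per-byte syndrome scheme described above.  The names
   syndrome and last_match are invented here for illustration only.

	#include <stdint.h>

	// Two syndrome bits per byte of a 32-byte hunk: bit 2*i is set on
	// a character match at byte i, bit 2*i + 1 on a NUL at byte i.
	static uint64_t
	syndrome (const unsigned char *hunk, unsigned char c)
	{
	  uint64_t syn = 0;
	  for (int i = 0; i < 32; i++)
	    {
	      if (hunk[i] == c)
		syn |= 1ull << (2 * i);
	      if (hunk[i] == 0)
		syn |= 2ull << (2 * i);
	    }
	  return syn;
	}

	// Mirror of L(tail): keep character matches at or before the first
	// NUL, then locate the last one from the top of the syndrome down.
	// hunk_end points 32 bytes past the hunk base, like src_match.
	static const unsigned char *
	last_match (const unsigned char *hunk_end, uint64_t nul, uint64_t chr)
	{
	  if (nul)
	    chr &= nul ^ (nul - 1);	// bits up to and including first NUL
	  if (chr == 0)
	    return 0;			// no match: strrchr returns NULL
	  int clz = __builtin_clzll (chr);
	  return hunk_end - ((clz + 2) >> 1);
	}
*/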