/*
 * memchr - find a character in a memory zone
 *
 * Copyright (c) 2014-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Neon available.
 */

#include "asmdefs.h"

/* Arguments and results. */
#define srcin		x0
#define chrin		w1
#define cntin		x2

#define result		x0

#define src		x3
#define tmp		x4
#define wtmp2		w5
#define synd		x6
#define soff		x9
#define cntrem		x10

#define vrepchr		v0
#define vdata1		v1
#define vdata2		v2
#define vhas_chr1	v3
#define vhas_chr2	v4
#define vrepmask	v5
#define vend		v6

/*
 * Core algorithm:
 *
 * For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
 * per byte. For each tuple, bit 0 is set if the relevant byte matched the
 * requested character and bit 1 is not used (this is faster than using a
 * 32-bit syndrome). Since the bits in the syndrome reflect exactly the order
 * in which things occur in the original string, counting trailing zeros
 * identifies exactly which byte has matched.
 */

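/*
 * Illustrative only: a rough scalar C sketch of the same idea, not part of
 * the build (the buffer and character names here are hypothetical). Each
 * byte of a 32-byte block contributes bit 0 of its two-bit tuple to the
 * syndrome; the index of the first match is then the number of trailing
 * zero bits divided by two, which the code below obtains with rbit + clz.
 *
 *   uint64_t syndrome = 0;
 *   for (int i = 0; i < 32; i++)
 *       if (block[i] == c)
 *           syndrome |= 1ull << (2 * i);	// bit 0 of tuple i
 *   if (syndrome != 0)
 *       return (void *) (block + __builtin_ctzll (syndrome) / 2);
 */
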
ENTRY (__memchr_aarch64)
	PTR_ARG (0)
	SIZE_ARG (2)
	/* Do not dereference srcin if no bytes to compare. */
	cbz	cntin, L(zero_length)
	/*
	 * Magic constant 0x40100401 allows us to identify which lane matches
	 * the requested byte.
	 */
	mov	wtmp2, #0x0401
	movk	wtmp2, #0x4010, lsl #16
	dup	vrepchr.16b, chrin
	/* Work with aligned 32-byte chunks */
	bic	src, srcin, #31
	dup	vrepmask.4s, wtmp2
	ands	soff, srcin, #31
	and	cntrem, cntin, #31
	b.eq	L(loop)

	/*
	 * Input string is not 32-byte aligned. We calculate the syndrome
	 * value for the aligned 32-byte block containing the first bytes
	 * and mask the irrelevant part.
	 */

	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	sub	tmp, soff, #32
	adds	cntin, cntin, tmp
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
	addp	vend.16b, vhas_chr1.16b, vhas_chr2.16b		/* 256->128 */
	addp	vend.16b, vend.16b, vend.16b			/* 128->64 */
	mov	synd, vend.d[0]
	/* Clear the soff*2 lower bits */
	lsl	tmp, soff, #1
	lsr	synd, synd, tmp
	lsl	synd, synd, tmp
	/* The first block can also be the last */
	b.ls	L(masklast)
	/* Have we found something already? */
	cbnz	synd, L(tail)

L(loop):
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	subs	cntin, cntin, #32
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	/* If we're out of data we finish regardless of the result */
	b.ls	L(end)
	/* Use a fast check for the termination condition */
	orr	vend.16b, vhas_chr1.16b, vhas_chr2.16b
	addp	vend.2d, vend.2d, vend.2d
	mov	synd, vend.d[0]
	/* We're not out of data, loop if we haven't found the character */
	cbz	synd, L(loop)

L(end):
	/* Termination condition found, let's calculate the syndrome value */
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
	addp	vend.16b, vhas_chr1.16b, vhas_chr2.16b		/* 256->128 */
	addp	vend.16b, vend.16b, vend.16b			/* 128->64 */
	mov	synd, vend.d[0]
	/* Only do the clear for the last possible block */
	b.hs	L(tail)

L(masklast):
	/* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */
	add	tmp, cntrem, soff
	and	tmp, tmp, #31
	sub	tmp, tmp, #32
	neg	tmp, tmp, lsl #1
	lsl	synd, synd, tmp
	lsr	synd, synd, tmp

L(tail):
	/* Count the trailing zeros using bit reversing */
	rbit	synd, synd
	/* Compensate for the last post-increment */
	sub	src, src, #32
	/* Check that we have found a character */
	cmp	synd, #0
	/* And count the leading zeros */
	clz	synd, synd
	/* Compute the potential result */
	add	result, src, synd, lsr #1
	/* Select result or NULL */
	csel	result, xzr, result, eq
	ret

L(zero_length):
	mov	result, #0
	ret

END (__memchr_aarch64)