/*
 * strncmp - compare two strings
 *
 * Copyright (c) 2013-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64.
 * MTE compatible.
 */

#include "asmdefs.h"

#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f

/* Parameters and result.  */
#define src1		x0
#define src2		x1
#define limit		x2
#define result		x0

/* Internal variables.  */
#define data1		x3
#define data1w		w3
#define data2		x4
#define data2w		w4
#define has_nul		x5
#define diff		x6
#define syndrome	x7
#define tmp1		x8
#define tmp2		x9
#define tmp3		x10
#define zeroones	x11
#define pos		x12
#define mask		x13
#define endloop		x14
#define count		mask
#define offset		pos
#define neg_offset	x15

/* Define endian dependent shift operations.
   On big-endian early bytes are at MSB and on little-endian LSB.
   LS_FW means shifting towards early bytes.
   LS_BK means shifting towards later bytes.
 */
#ifdef __AARCH64EB__
#define LS_FW lsl
#define LS_BK lsr
#else
#define LS_FW lsr
#define LS_BK lsl
#endif

ENTRY (__strncmp_aarch64)
	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)
	cbz	limit, L(ret0)
	eor	tmp1, src1, src2
	mov	zeroones, #REP8_01
	tst	tmp1, #7
	and	count, src1, #7
	b.ne	L(misaligned8)
	cbnz	count, L(mutual_align)

	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
	   can be done in parallel across the entire word.  */
	.p2align 4
L(loop_aligned):
	ldr	data1, [src1], #8
	ldr	data2, [src2], #8
L(start_realigned):
	subs	limit, limit, #8
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	eor	diff, data1, data2	/* Non-zero if differences found.  */
	csinv	endloop, diff, xzr, hi	/* Last Dword or differences.  */
	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
	ccmp	endloop, #0, #0, eq
	b.eq	L(loop_aligned)
	/* End of main loop */
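
/* For reference, a minimal C sketch (illustrative only, not part of this
   file) of the zero-byte test the loop above applies to all eight bytes
   of a dword at once:

	#include <stdint.h>

	// Non-zero iff some byte of x is zero.  The least significant set
	// 0x80 marker corresponds to the first zero byte; markers above it
	// can be set spuriously by borrow propagation, which is why the
	// big-endian path below recomputes this on byte-reversed data.
	static inline uint64_t
	has_nul_byte (uint64_t x)
	{
	  return (x - 0x0101010101010101ULL)	// borrow out of zero bytes
		 & ~x				// drop bytes >= 0x80
		 & 0x8080808080808080ULL;	// keep only the marker bits
	}

   The assembly folds ~x & 0x80... into tmp2 = x | REP8_7f, so the final
   bics computes (x - REP8_01) & ~tmp2 while also setting the flags.  */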
L(full_check):
#ifndef __AARCH64EB__
	orr	syndrome, diff, has_nul
	add	limit, limit, 8	/* Rewind limit to before last subs.  */
L(syndrome_check):
	/* Limit was reached.  Check if the NUL byte or the difference
	   is before the limit.  */
	rev	syndrome, syndrome
	rev	data1, data1
	clz	pos, syndrome
	rev	data2, data2
	lsl	data1, data1, pos
	cmp	limit, pos, lsr #3
	lsl	data2, data2, pos
	/* But we need to zero-extend (char is unsigned) the value and then
	   perform a signed 32-bit subtraction.  */
	lsr	data1, data1, #56
	sub	result, data1, data2, lsr #56
	csel	result, result, xzr, hi
	ret
#else
	/* Not reached the limit, must have found the end or a diff.  */
	tbz	limit, #63, L(not_limit)
	add	tmp1, limit, 8
	cbz	tmp1, L(not_limit)

	lsl	limit, tmp1, #3	/* Bytes -> bits.  */
	mov	mask, #~0
	lsr	mask, mask, limit
	bic	data1, data1, mask
	bic	data2, data2, mask

	/* Make sure that the NUL byte is marked in the syndrome.  */
	orr	has_nul, has_nul, mask

L(not_limit):
	/* For big-endian we cannot use the trick with the syndrome value
	   as carry-propagation can corrupt the upper bits if the trailing
	   bytes in the string contain 0x01.  */
	/* However, if there is no NUL byte in the dword, we can generate
	   the result directly.  We can't just subtract the bytes as the
	   MSB might be significant.  */
	cbnz	has_nul, 1f
	cmp	data1, data2
	cset	result, ne
	cneg	result, result, lo
	ret
1:
	/* Re-compute the NUL-byte detection, using a byte-reversed value.  */
	rev	tmp3, data1
	sub	tmp1, tmp3, zeroones
	orr	tmp2, tmp3, #REP8_7f
	bic	has_nul, tmp1, tmp2
	rev	has_nul, has_nul
	orr	syndrome, diff, has_nul
	clz	pos, syndrome
	/* The most-significant-non-zero bit of the syndrome marks either the
	   first bit that is different, or the top bit of the first zero byte.
	   Shifting left now will bring the critical information into the
	   top bits.  */
L(end_quick):
	lsl	data1, data1, pos
	lsl	data2, data2, pos
	/* But we need to zero-extend (char is unsigned) the value and then
	   perform a signed 32-bit subtraction.  */
	lsr	data1, data1, #56
	sub	result, data1, data2, lsr #56
	ret
#endif
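
/* Illustrative C sketch (not part of this file) of the little-endian
   L(syndrome_check) sequence above, assuming a non-zero syndrome and
   GCC/Clang builtins (rev and clz become bswap64 and clzll):

	#include <stdint.h>

	// Hypothetical helper: turn a non-zero syndrome into the result.
	static int
	syndrome_result (uint64_t data1, uint64_t data2, uint64_t syndrome)
	{
	  int pos = __builtin_clzll (__builtin_bswap64 (syndrome));
	  // Shift the first interesting bit to the top, then extract the
	  // top byte; char is unsigned, so zero-extend before subtracting.
	  unsigned b1 = (unsigned) ((__builtin_bswap64 (data1) << pos) >> 56);
	  unsigned b2 = (unsigned) ((__builtin_bswap64 (data2) << pos) >> 56);
	  return (int) b1 - (int) b2;
	}

   The assembly additionally compares limit against pos, lsr #3 so that a
   difference or NUL found beyond the limit yields 0 via the csel.  */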
L(mutual_align):
	/* Sources are mutually aligned, but are not currently at an
	   alignment boundary.  Round down the addresses and then mask off
	   the bytes that precede the start point.
	   We also need to adjust the limit calculations, but without
	   overflowing if the limit is near ULONG_MAX.  */
	bic	src1, src1, #7
	bic	src2, src2, #7
	ldr	data1, [src1], #8
	neg	tmp3, count, lsl #3	/* 64 - bits(bytes beyond align).  */
	ldr	data2, [src2], #8
	mov	tmp2, #~0
	LS_FW	tmp2, tmp2, tmp3	/* Shift (count & 63).  */
	/* Adjust the limit and ensure it doesn't overflow.  */
	adds	limit, limit, count
	csinv	limit, limit, xzr, lo
	orr	data1, data1, tmp2
	orr	data2, data2, tmp2
	b	L(start_realigned)
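
/* Illustrative C sketch (not part of this file) of the masking above for
   the little-endian case, with count = src1 & 7, which is non-zero on
   this path.  Forcing the bytes before the string start to 0xff in both
   data words means they can neither trip the NUL detection nor introduce
   a spurious difference:

	#include <stdint.h>

	// Hypothetical helper: load the first dword of a string whose
	// address has been rounded down to 8-byte alignment.
	static uint64_t
	load_first_dword (const uint64_t *aligned, unsigned count)
	{
	  // Ones in the low 'count' bytes, i.e. the bytes that precede
	  // the string start on little-endian.  count is 1..7, so the
	  // shift amount 64 - 8*count stays in the defined range 8..56.
	  uint64_t head = ~0ULL >> (64 - 8 * count);
	  return *aligned | head;
	}

   On big-endian the same mask is built with lsl instead (the LS_FW
   macro), since the early bytes sit at the most significant end.  */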
	.p2align 4
	/* Don't bother with dwords for up to 16 bytes.  */
L(misaligned8):
	cmp	limit, #16
	b.hs	L(try_misaligned_words)

L(byte_loop):
	/* Perhaps we can do better than this.  */
	ldrb	data1w, [src1], #1
	ldrb	data2w, [src2], #1
	subs	limit, limit, #1
	ccmp	data1w, #1, #0, hi	/* NZCV = 0b0000.  */
	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
	b.eq	L(byte_loop)
L(done):
	sub	result, data1, data2
	ret

	/* Align SRC1 to a dword by doing a bytewise compare and then do
	   the dword loop.  */
L(try_misaligned_words):
	cbz	count, L(src1_aligned)

	neg	count, count
	and	count, count, #7
	sub	limit, limit, count

L(page_end_loop):
	ldrb	data1w, [src1], #1
	ldrb	data2w, [src2], #1
	cmp	data1w, #1
	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
	b.ne	L(done)
	subs	count, count, #1
	b.hi	L(page_end_loop)

	/* The following diagram explains the comparison of misaligned strings.
	   The bytes are shown in natural order.  For little-endian, it is
	   reversed in the registers.  The "x" bytes are before the string.
	   The "|" separates data that is loaded at one time.
	   src1 | a a a a a a a a | b b b c c c c c | . . .
	   src2 | x x x x x a a a a a a a a b b b | c c c c c . . .

	   After shifting in each step, the data looks like this:
		 STEP_A			STEP_B			STEP_C
	   data1 a a a a a a a a	b b b c c c c c		b b b c c c c c
	   data2 a a a a a a a a	b b b 0 0 0 0 0		0 0 0 c c c c c

	   The bytes with "0" are eliminated from the syndrome via mask.

	   Align SRC2 down to 16 bytes.  This way we can read 16 bytes at a
	   time from SRC2.  The comparison happens in 3 steps.  After each step
	   the loop can exit, or read from SRC1 or SRC2.  */
L(src1_aligned):
	/* Calculate offset from 8 byte alignment to string start in bits.  No
	   need to mask offset since shifts are ignoring upper bits.  */
	lsl	offset, src2, #3
	bic	src2, src2, #0xf
	mov	mask, -1
	neg	neg_offset, offset
	ldr	data1, [src1], #8
	ldp	tmp1, tmp2, [src2], #16
	LS_BK	mask, mask, neg_offset
	and	neg_offset, neg_offset, #63	/* Need actual value for cmp later.  */
	/* Skip the first compare if data in tmp1 is irrelevant.  */
	tbnz	offset, 6, L(misaligned_mid_loop)

L(loop_misaligned):
	/* STEP_A: Compare full 8 bytes when there is enough data from SRC2.  */
	LS_FW	data2, tmp1, offset
	LS_BK	tmp1, tmp2, neg_offset
	subs	limit, limit, #8
	orr	data2, data2, tmp1	/* 8 bytes from SRC2 combined from two regs.  */
	sub	has_nul, data1, zeroones
	eor	diff, data1, data2	/* Non-zero if differences found.  */
	orr	tmp3, data1, #REP8_7f
	csinv	endloop, diff, xzr, hi	/* If limit, set to all ones.  */
	bic	has_nul, has_nul, tmp3	/* Non-zero if NUL byte found in SRC1.  */
	orr	tmp3, endloop, has_nul
	cbnz	tmp3, L(full_check)

	ldr	data1, [src1], #8
L(misaligned_mid_loop):
	/* STEP_B: Compare first part of data1 to second part of tmp2.  */
	LS_FW	data2, tmp2, offset
#ifdef __AARCH64EB__
	/* For big-endian we do a byte reverse to avoid carry-propagation
	   problem described above.  This way we can reuse the has_nul in the
	   next step and also use syndrome value trick at the end.  */
	rev	tmp3, data1
	#define data1_fixed tmp3
#else
	#define data1_fixed data1
#endif
	sub	has_nul, data1_fixed, zeroones
	orr	tmp3, data1_fixed, #REP8_7f
	eor	diff, data2, data1	/* Non-zero if differences found.  */
	bic	has_nul, has_nul, tmp3	/* Non-zero if NUL terminator.  */
#ifdef __AARCH64EB__
	rev	has_nul, has_nul
#endif
	cmp	limit, neg_offset, lsr #3
	orr	syndrome, diff, has_nul
	bic	syndrome, syndrome, mask	/* Ignore later bytes.  */
	csinv	tmp3, syndrome, xzr, hi	/* If limit, set to all ones.  */
	cbnz	tmp3, L(syndrome_check)

	/* STEP_C: Compare second part of data1 to first part of tmp1.  */
	ldp	tmp1, tmp2, [src2], #16
	cmp	limit, #8
	LS_BK	data2, tmp1, neg_offset
	eor	diff, data2, data1	/* Non-zero if differences found.  */
	orr	syndrome, diff, has_nul
	and	syndrome, syndrome, mask	/* Ignore earlier bytes.  */
	csinv	tmp3, syndrome, xzr, hi	/* If limit, set to all ones.  */
	cbnz	tmp3, L(syndrome_check)

	ldr	data1, [src1], #8
	sub	limit, limit, #8
	b	L(loop_misaligned)

#ifdef __AARCH64EB__
L(syndrome_check):
	clz	pos, syndrome
	cmp	pos, limit, lsl #3
	b.lo	L(end_quick)
#endif
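
/* Illustrative C sketch (not part of this file) of the STEP_A combine for
   little-endian, where offset is the bit offset of SRC2 within its dword.
   On this path offset & 63 is a non-zero multiple of 8, because SRC1 is
   8-byte aligned and SRC1 and SRC2 are mutually misaligned:

	#include <stdint.h>

	// Hypothetical helper: rebuild the 8 bytes at SRC2 from two
	// aligned dwords, lo (earlier address) and hi (later address).
	static uint64_t
	combine_dwords (uint64_t lo, uint64_t hi, unsigned offset)
	{
	  unsigned fw = offset & 63;	// LS_FW shift (lsr on LE), 8..56
	  unsigned bk = 64 - fw;	// LS_BK shift (lsl on LE), 8..56
	  return (lo >> fw) | (hi << bk);
	}

   The assembly gets the second shift amount for free: neg_offset is the
   two's-complement negation of offset, and AArch64 register shifts use
   only the low 6 bits, so shifting by neg_offset shifts by 64 - fw.  */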
L(ret0):
	mov	result, #0
	ret
END(__strncmp_aarch64)