/*
 * strcmp - compare two strings
 *
 * Copyright (c) 2012-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */


/* Assumptions:
 *
 * ARMv8-a, AArch64.
 * MTE compatible.
 */

#include "asmdefs.h"

#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f

#define src1		x0
#define src2		x1
#define result		x0

#define data1		x2
#define data1w		w2
#define data2		x3
#define data2w		w3
#define has_nul		x4
#define diff		x5
#define off1		x5
#define syndrome	x6
#define tmp		x6
#define data3		x7
#define zeroones	x8
#define shift		x9
#define off2		x10

/* On big-endian early bytes are at MSB and on little-endian LSB.
   LS_FW means shifting towards early bytes.  */
#ifdef __AARCH64EB__
# define LS_FW lsl
#else
# define LS_FW lsr
#endif

/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
   can be done in parallel across the entire word.
   Since carry propagation makes 0x1 bytes before a NUL byte appear
   NUL too in big-endian, byte-reverse the data before the NUL check.  */


ENTRY (__strcmp_aarch64)
	PTR_ARG (0)
	PTR_ARG (1)
	sub	off2, src2, src1
	mov	zeroones, REP8_01
	and	tmp, src1, 7
	tst	off2, 7
	b.ne	L(misaligned8)
	cbnz	tmp, L(mutual_align)

	.p2align 4

L(loop_aligned):
	ldr	data2, [src1, off2]
	ldr	data1, [src1], 8
L(start_realigned):
#ifdef __AARCH64EB__
	rev	tmp, data1
	sub	has_nul, tmp, zeroones
	orr	tmp, tmp, REP8_7f
#else
	sub	has_nul, data1, zeroones
	orr	tmp, data1, REP8_7f
#endif
	bics	has_nul, has_nul, tmp	/* Non-zero if NUL terminator.  */
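	/* BICS leaves Z set iff no NUL byte was found; the CCMP below then
	   compares the two words only in that case, and otherwise forces
	   the flags to "not equal" so the loop exits.  */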
	ccmp	data1, data2, 0, eq
	b.eq	L(loop_aligned)
#ifdef __AARCH64EB__
	rev	has_nul, has_nul
#endif
	eor	diff, data1, data2
	orr	syndrome, diff, has_nul
L(end):
#ifndef __AARCH64EB__
	rev	syndrome, syndrome
	rev	data1, data1
	rev	data2, data2
#endif
	clz	shift, syndrome
	/* The most-significant-non-zero bit of the syndrome marks either the
	   first bit that is different, or the top bit of the first zero byte.
	   Shifting left now will bring the critical information into the
	   top bits.  */
	lsl	data1, data1, shift
	lsl	data2, data2, shift
	/* But we need to zero-extend (char is unsigned) the value and then
	   perform a signed 32-bit subtraction.  */
	lsr	data1, data1, 56
	sub	result, data1, data2, lsr 56
	ret

	.p2align 4

L(mutual_align):
	/* Sources are mutually aligned, but are not currently at an
	   alignment boundary.  Round down the addresses and then mask off
	   the bytes that precede the start point.  */
	bic	src1, src1, 7
	ldr	data2, [src1, off2]
	ldr	data1, [src1], 8
	neg	shift, src2, lsl 3	/* Bits to alignment -64.  */
	mov	tmp, -1
	LS_FW	tmp, tmp, shift
	orr	data1, data1, tmp
	orr	data2, data2, tmp
	b	L(start_realigned)

L(misaligned8):
	/* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always
	   checking to make sure that we don't access beyond the end of SRC2.  */
	cbz	tmp, L(src1_aligned)
L(do_misaligned):
	ldrb	data1w, [src1], 1
	ldrb	data2w, [src2], 1
	cmp	data1w, 0
	ccmp	data1w, data2w, 0, ne	/* NZCV = 0b0000.  */
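	/* Exit to L(done) if the bytes differ or data1w is the NUL
	   terminator; otherwise keep going byte by byte until SRC1 is
	   8-byte aligned.  */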
	b.ne	L(done)
	tst	src1, 7
	b.ne	L(do_misaligned)

L(src1_aligned):
	neg	shift, src2, lsl 3
	bic	src2, src2, 7
	ldr	data3, [src2], 8
#ifdef __AARCH64EB__
	rev	data3, data3
#endif
	lsr	tmp, zeroones, shift
	orr	data3, data3, tmp
	sub	has_nul, data3, zeroones
	orr	tmp, data3, REP8_7f
	bics	has_nul, has_nul, tmp
	b.ne	L(tail)

	sub	off1, src2, src1

	.p2align 4

L(loop_unaligned):
	ldr	data3, [src1, off1]
	ldr	data2, [src1, off2]
#ifdef __AARCH64EB__
	rev	data3, data3
#endif
	sub	has_nul, data3, zeroones
	orr	tmp, data3, REP8_7f
	ldr	data1, [src1], 8
	bics	has_nul, has_nul, tmp
	ccmp	data1, data2, 0, eq
	b.eq	L(loop_unaligned)

	lsl	tmp, has_nul, shift
#ifdef __AARCH64EB__
	rev	tmp, tmp
#endif
	eor	diff, data1, data2
	orr	syndrome, diff, tmp
	cbnz	syndrome, L(end)
L(tail):
	ldr	data1, [src1]
	neg	shift, shift
	lsr	data2, data3, shift
	lsr	has_nul, has_nul, shift
#ifdef __AARCH64EB__
	rev	data2, data2
	rev	has_nul, has_nul
#endif
	eor	diff, data1, data2
	orr	syndrome, diff, has_nul
	b	L(end)

L(done):
	sub	result, data1, data2
	ret

END (__strcmp_aarch64)
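
/* For reference, a minimal C sketch of the zero-byte check used above
   (hypothetical helper, not part of the build):

     #include <stdint.h>

     static inline int has_zero_byte (uint64_t x)
     {
       return ((x - 0x0101010101010101ULL)
	       & ~(x | 0x7f7f7f7f7f7f7f7fULL)) != 0;
     }

   The expression is non-zero exactly when X contains a zero byte, which
   is what the SUB/ORR/BICS sequences above compute using REP8_01 and
   REP8_7f.  */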