/* memcmp - compare memory
 *
 * Copyright (c) 2013-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
 */

#include "asmdefs.h"

#define src1	x0
#define src2	x1
#define limit	x2
#define result	w0

#define data1	x3
#define data1w	w3
#define data2	x4
#define data2w	w4
#define data3	x5
#define data3w	w5
#define data4	x6
#define data4w	w6
#define tmp	x6
#define src1end	x7
#define src2end	x8


ENTRY (__memcmp_aarch64)
	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

	cmp	limit, 16
	b.lo	L(less16)
	ldp	data1, data3, [src1]
	ldp	data2, data4, [src2]
	ccmp	data1, data2, 0, ne
	ccmp	data3, data4, 0, eq
	b.ne	L(return2)

	add	src1end, src1, limit
	add	src2end, src2, limit
	cmp	limit, 32
	b.ls	L(last_bytes)
	cmp	limit, 160
	b.hs	L(loop_align)
	sub	limit, limit, 32

	.p2align 4
L(loop32):
	ldp	data1, data3, [src1, 16]
	ldp	data2, data4, [src2, 16]
	cmp	data1, data2
	ccmp	data3, data4, 0, eq
	b.ne	L(return2)
	cmp	limit, 16
	b.ls	L(last_bytes)

	ldp	data1, data3, [src1, 32]
	ldp	data2, data4, [src2, 32]
	cmp	data1, data2
	ccmp	data3, data4, 0, eq
	b.ne	L(return2)
	add	src1, src1, 32
	add	src2, src2, 32
L(last64):
	subs	limit, limit, 32
	b.hi	L(loop32)

	/* Compare last 1-16 bytes using unaligned access. */
L(last_bytes):
	ldp	data1, data3, [src1end, -16]
	ldp	data2, data4, [src2end, -16]
L(return2):
	cmp	data1, data2
	csel	data1, data1, data3, ne
	csel	data2, data2, data4, ne

	/* Compare data bytes and set return value to 0, -1 or 1. */
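	/* On little-endian targets the words are byte-reversed first, so
	   that the byte loaded from the lowest address ends up in the most
	   significant byte of each register.  A single unsigned compare of
	   the reversed words then gives the byte-wise ordering that memcmp
	   must report.  */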
L(return):
#ifndef __AARCH64EB__
	rev	data1, data1
	rev	data2, data2
#endif
	cmp	data1, data2
	cset	result, ne
	cneg	result, result, lo
	ret

	.p2align 4
L(less16):
	add	src1end, src1, limit
	add	src2end, src2, limit
	tbz	limit, 3, L(less8)
	ldr	data1, [src1]
	ldr	data2, [src2]
	ldr	data3, [src1end, -8]
	ldr	data4, [src2end, -8]
	b	L(return2)

	.p2align 4
L(less8):
	tbz	limit, 2, L(less4)
	ldr	data1w, [src1]
	ldr	data2w, [src2]
	ldr	data3w, [src1end, -4]
	ldr	data4w, [src2end, -4]
	b	L(return2)

L(less4):
	tbz	limit, 1, L(less2)
	ldrh	data1w, [src1]
	ldrh	data2w, [src2]
	cmp	data1w, data2w
	b.ne	L(return)
L(less2):
	mov	result, 0
	tbz	limit, 0, L(return_zero)
	ldrb	data1w, [src1end, -1]
	ldrb	data2w, [src2end, -1]
	sub	result, data1w, data2w
L(return_zero):
	ret

L(loop_align):
	ldp	data1, data3, [src1, 16]
	ldp	data2, data4, [src2, 16]
	cmp	data1, data2
	ccmp	data3, data4, 0, eq
	b.ne	L(return2)

	/* Align src2 and adjust src1, src2 and limit. */
	and	tmp, src2, 15
	sub	tmp, tmp, 16
	sub	src2, src2, tmp
	add	limit, limit, tmp
	sub	src1, src1, tmp
	sub	limit, limit, 64 + 16

	.p2align 4
L(loop64):
	ldr	q0, [src1, 16]
	ldr	q1, [src2, 16]
	subs	limit, limit, 64
	ldr	q2, [src1, 32]
	ldr	q3, [src2, 32]
	eor	v0.16b, v0.16b, v1.16b
	eor	v1.16b, v2.16b, v3.16b
	ldr	q2, [src1, 48]
	ldr	q3, [src2, 48]
	umaxp	v0.16b, v0.16b, v1.16b
	ldr	q4, [src1, 64]!
	ldr	q5, [src2, 64]!
	eor	v1.16b, v2.16b, v3.16b
	eor	v2.16b, v4.16b, v5.16b
	umaxp	v1.16b, v1.16b, v2.16b
	umaxp	v0.16b, v0.16b, v1.16b
	umaxp	v0.16b, v0.16b, v0.16b
	fmov	tmp, d0
	ccmp	tmp, 0, 0, hi
	b.eq	L(loop64)

	/* If equal, process last 1-64 bytes using scalar loop. */
	add	limit, limit, 64 + 16
	cbz	tmp, L(last64)

	/* Determine the 8-byte aligned offset of the first difference. */
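	/* After the umaxp reductions each byte of tmp is nonzero iff the
	   corresponding 8-byte chunk of the 64-byte window differed.
	   Byte-reversing tmp puts the lowest-addressed chunk in the most
	   significant byte, so clz followed by bic gives the byte offset
	   of the first differing chunk.  The window starts 48 bytes below
	   the updated src1/src2, hence the -48 bias before reloading.  */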
#ifdef __AARCH64EB__
	rev16	tmp, tmp
#endif
	rev	tmp, tmp
	clz	tmp, tmp
	bic	tmp, tmp, 7
	sub	tmp, tmp, 48
	ldr	data1, [src1, tmp]
	ldr	data2, [src2, tmp]
#ifndef __AARCH64EB__
	rev	data1, data1
	rev	data2, data2
#endif
	mov	result, 1
	cmp	data1, data2
	cneg	result, result, lo
	ret

END (__memcmp_aarch64)
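/* Illustrative reference model, not part of the build: memcmp only
   requires the sign of the result to be meaningful, which is why the
   routine above returns -1/0/1 on the word paths but a raw byte
   difference at L(less2).  A plain C statement of the contract
   (ref_memcmp is a hypothetical name used only for this sketch):

	int ref_memcmp (const void *s1, const void *s2, size_t n)
	{
	  const unsigned char *p1 = s1, *p2 = s2;
	  for (; n != 0; n--, p1++, p2++)
	    if (*p1 != *p2)
	      return *p1 - *p2;
	  return 0;
	}
*/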