/*
 * memset - fill memory with a constant byte
 *
 * Copyright (c) 2012-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
 *
 */

#include "asmdefs.h"

/* Register roles:
     dstin    x0  start of the buffer; never written by this routine.
     val      x1  fill value (64-bit view).
     valw     w1  fill value (32-bit view); only its low byte is used.
     count    x2  byte count on entry, loop counter on the long paths.
     dst      x3  aligned store cursor on the long paths.
     dstend   x4  one past the last byte to set (dstin + count).
     zva_val  x5  scratch for the DCZID_EL0 query.  */
#define dstin	x0
#define val	x1
#define valw	w1
#define count	x2
#define dst	x3
#define dstend	x4
#define zva_val	x5

/* __memset_aarch64

   In:    x0 = destination, w1 = fill value, x2 = number of bytes.
   Out:   x0 still holds the destination (no instruction writes it).
   Uses:  x1-x5, v0 and the NZCV flags.

   Strategy by size:
     0..15   overlapping scalar stores taken from both ends,
     16..96  overlapping 16/32-byte SIMD stores taken from both ends,
     > 96    64 bytes per iteration from a 16-byte aligned cursor, or
	     DC ZVA when zeroing at least 160 bytes and the ZVA block
	     size is 64 bytes.
   Stores anchored at dstend deliberately overlap earlier stores so that
   no per-byte tail loop is needed.  */
ENTRY (__memset_aarch64)
	/* PTR_ARG and SIZE_ARG come from asmdefs.h; presumably they
	   normalise the pointer in x0 and the size in x2 for the target
	   ABI - confirm against that header.  */
	PTR_ARG (0)
	SIZE_ARG (2)

	dup	v0.16B, valw		/* Fill byte in all 16 lanes of v0.  */
	add	dstend, dstin, count

	cmp	count, 96
	b.hi	L(set_long)
	cmp	count, 16
	b.hs	L(set_medium)
	/* val/valw now hold the fill byte replicated 8/4 times.  */
	mov	val, v0.D[0]

	/* Set 0..15 bytes.  */
	tbz	count, 3, 1f
	/* 8..15 bytes: two 8-byte stores, one from each end.  */
	str	val, [dstin]
	str	val, [dstend, -8]
	ret
	.p2align 4
1:	tbz	count, 2, 2f
	/* 4..7 bytes: two 4-byte stores, one from each end.  */
	str	valw, [dstin]
	str	valw, [dstend, -4]
	ret
2:	cbz	count, 3f
	/* 1..3 bytes: first byte, then the last two bytes if count >= 2.  */
	strb	valw, [dstin]
	tbz	count, 1, 3f
	strh	valw, [dstend, -2]
3:	ret

	/* Set 16..96 bytes.  */
L(set_medium):
	str	q0, [dstin]
	tbnz	count, 6, L(set96)
	/* 16..63 bytes: 16 from each end, plus 16 more from each end
	   when count >= 32.  */
	str	q0, [dstend, -16]
	tbz	count, 5, 1f
	str	q0, [dstin, 16]
	str	q0, [dstend, -32]
1:	ret

	.p2align 4
	/* Set 64..96 bytes.  Write 64 bytes from the start and
	   32 bytes from the end.  */
L(set96):
	str	q0, [dstin, 16]
	stp	q0, q0, [dstin, 32]
	stp	q0, q0, [dstend, -32]
	ret

	.p2align 4
	/* Set more than 96 bytes.  */
L(set_long):
	and	valw, valw, 255		/* Isolate the fill byte for the test below.  */
	bic	dst, dstin, 15		/* dst = dstin rounded down to 16 bytes.  */
	str	q0, [dstin]
	/* Z ends up set only when count >= 160 and the fill byte is zero;
	   in every other case use the plain store loop.  */
	cmp	count, 160
	ccmp	valw, 0, 0, hs
	b.ne	L(no_zva)

	/* Unless SKIP_ZVA_CHECK is defined, query DCZID_EL0.  Per the Arm
	   architecture, bits [3:0] are log2 of the block size in words and
	   bit 4 (DZP) is set when DC ZVA is prohibited, so the masked value
	   is 4 only when DC ZVA is permitted with a 64-byte block.  */
#ifndef SKIP_ZVA_CHECK
	mrs	zva_val, dczid_el0
	and	zva_val, zva_val, 31
	cmp	zva_val, 4		/* ZVA size is 64 bytes.  */
	b.ne	L(no_zva)
#endif
	/* Together with the store at dstin above, these cover every byte
	   from dstin up to dst + 64, which is at or beyond the first
	   64-byte block the loop zeroes.  */
	str	q0, [dst, 16]
	stp	q0, q0, [dst, 32]
	bic	dst, dst, 63		/* dst = dst rounded down to 64 bytes.  */
	sub	count, dstend, dst	/* Count is now 64 too large.  */
	sub	count, count, 128	/* Adjust count and bias for loop.  */

	.p2align 4
L(zva_loop):
	add	dst, dst, 64
	dc	zva, dst
	subs	count, count, 64
	b.hi	L(zva_loop)
	/* Finish with the last 64 bytes, addressed from the end; they may
	   overlap bytes that were already zeroed.  */
	stp	q0, q0, [dstend, -64]
	stp	q0, q0, [dstend, -32]
	ret

L(no_zva):
	sub	count, dstend, dst	/* Count is 16 too large.  */
	sub	dst, dst, 16		/* Dst is biased by -32.  */
	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */
L(no_zva_loop):
	/* 64 bytes per iteration; the second store pre-increments dst
	   by 64.  */
	stp	q0, q0, [dst, 32]
	stp	q0, q0, [dst, 64]!
	subs	count, count, 64
	b.hi	L(no_zva_loop)
	/* Finish with the last 64 bytes, addressed from the end; they may
	   overlap bytes that were already stored.  */
	stp	q0, q0, [dstend, -64]
	stp	q0, q0, [dstend, -32]
	ret

END (__memset_aarch64)