/*
 * memset - fill memory with a constant byte
 *
 * Copyright (c) 2024-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, SVE, unaligned accesses.
 *
 */

#include "asmdefs.h"

.arch armv8-a+sve

/* Register roles.  Several names alias the same register, so their live
   ranges must never overlap:
     x3 - dst (long paths)      / off (16..63 byte path)
     x5 - zva_val (ZVA check)   / dstend2 (16..63 byte path) / vlen
   val and vlen are not referenced by the code below.  */
#define dstin	x0
#define val	x1
#define valw	w1
#define count	x2
#define dst	x3
#define dstend	x4
#define zva_val	x5
#define vlen	x5
#define off	x3
#define dstend2	x5

/* __memset_aarch64_sve
 *
 * In:    x0 (dstin) = destination address
 *        w1 (valw)  = fill value; only the low byte is used
 *        x2 (count) = number of bytes to set
 * Out:   x0         = destination address (x0 is never written)
 * Clobb: x2, x3, x4, x5, x8, v0/z0, p0, condition flags
 *
 * Dispatch by size:
 *        count <  16   one predicated SVE store            (set_16)
 *   16 <= count <  64  four possibly overlapping 16-byte stores
 *   64 <= count <= 128 four possibly overlapping 32-byte store pairs
 *        count >  128  64-byte store loop (no_zva), or DC ZVA when
 *                      count >= 256 and the fill byte is zero
 *
 * The overlapping stores at both ends avoid any branching on the exact
 * length within a size class; this relies on unaligned accesses being
 * permitted (see Assumptions above).
 */
ENTRY (__memset_aarch64_sve)
	dup	v0.16B, valw		/* Replicate the fill byte into all 16 lanes.  */
	cmp	count, 16
	b.lo	L(set_16)

	add	dstend, dstin, count	/* One past the last byte to write.  */
	cmp	count, 64
	b.hs	L(set_128)

	/* Set 16..63 bytes.  */
	/* off = 16 when bit 5 of count is set (count >= 32), otherwise 0.
	   With off == 0 the second and third stores just repeat the first
	   and last ones.  */
	mov	off, 16
	and	off, off, count, lsr 1
	sub	dstend2, dstend, off
	str	q0, [dstin]
	str	q0, [dstin, off]
	str	q0, [dstend2, -16]
	str	q0, [dstend, -16]
	ret

	.p2align 4
L(set_16):
	/* count < 16: activate the first `count` byte lanes and store only
	   those.  v0 is the low 128 bits of z0, so z0 holds the fill pattern
	   in every lane that can be active here.  With count == 0 no lane is
	   active and nothing is written.  */
	whilelo p0.b, xzr, count
	st1b	z0.b, p0, [dstin]
	ret

	.p2align 4
L(set_128):
	bic	dst, dstin, 15		/* dst = dstin rounded down to 16 bytes.  */
	cmp	count, 128
	b.hi	L(set_long)
	/* 64..128 bytes: 64 bytes from the start and 64 bytes back from
	   the end, overlapping in the middle as needed.  */
	stp	q0, q0, [dstin]
	stp	q0, q0, [dstin, 32]
	stp	q0, q0, [dstend, -64]
	stp	q0, q0, [dstend, -32]
	ret

	.p2align 4
L(set_long):
	/* DC ZVA can only write zeros, and is only attempted for
	   count >= 256.  */
	cmp	count, 256
	b.lo	L(no_zva)
	tst	valw, 255
	b.ne	L(no_zva)

#ifndef SKIP_ZVA_CHECK
	/* The low 5 bits of DCZID_EL0 hold the block size field (bits 3:0)
	   and the "DC ZVA prohibited" bit (bit 4); the value 4 therefore
	   means a permitted 64-byte block.  When SKIP_ZVA_CHECK is defined
	   this check is compiled out and a 64-byte block is assumed.  */
	mrs	zva_val, dczid_el0
	and	zva_val, zva_val, 31
	cmp	zva_val, 4		/* ZVA size is 64 bytes.  */
	b.ne	L(no_zva)
#endif
	/* Head: cover everything from dstin up to the first 64-byte
	   aligned block that the loop will zero.  dst still holds dstin
	   rounded down to 16 for the second store.  */
	str	q0, [dstin]
	str	q0, [dst, 16]
	bic	dst, dstin, 31
	stp	q0, q0, [dst, 32]
	bic	dst, dstin, 63
	sub	count, dstend, dst	/* Count is now 64 too large.  */
	sub	count, count, 128	/* Adjust count and bias for loop.  */

	sub	x8, dstend, 1		/* Write last bytes before ZVA loop.  */
	bic	x8, x8, 15
	stp	q0, q0, [x8, -48]
	str	q0, [x8, -16]
	str	q0, [dstend, -16]

	.p2align 4
L(zva64_loop):
	add	dst, dst, 64
	dc	zva, dst		/* Zero the 64-byte block at dst.  */
	subs	count, count, 64
	b.hi	L(zva64_loop)
	ret

L(no_zva):
	/* dst is dstin rounded down to 16 (set in set_128).  One unaligned
	   store covers the head, then the loop writes 64 bytes per
	   iteration starting at dst + 16.  */
	str	q0, [dstin]
	sub	count, dstend, dst	/* Count is 16 too large.  */
	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */
L(no_zva_loop):
	stp	q0, q0, [dst, 16]
	stp	q0, q0, [dst, 48]
	add	dst, dst, 64
	subs	count, count, 64
	b.hi	L(no_zva_loop)
	/* Tail: the final 64 bytes, written relative to the end so no
	   remainder handling is needed.  */
	stp	q0, q0, [dstend, -64]
	stp	q0, q0, [dstend, -32]
	ret

END (__memset_aarch64_sve)