/*
 * memset - fill memory with a constant byte
 *
 * Copyright (c) 2024-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, SVE, unaligned accesses.
 *
 */

#include "asmdefs.h"

.arch armv8-a+sve

#define dstin	x0
#define val	x1
#define valw	w1
#define count	x2
#define dst	x3
#define dstend	x4
#define zva_val	x5
#define vlen	x5
#define off	x3
#define dstend2	x5

ENTRY (__memset_aarch64_sve)
	dup	v0.16B, valw
	cmp	count, 16
	b.lo	L(set_16)

	add	dstend, dstin, count
	cmp	count, 64
	b.hs	L(set_128)

	/* Set 16..63 bytes. */
	mov	off, 16
	and	off, off, count, lsr 1
	sub	dstend2, dstend, off
	str	q0, [dstin]
	str	q0, [dstin, off]
	str	q0, [dstend2, -16]
	str	q0, [dstend, -16]
	ret

	.p2align 4
L(set_16):
	whilelo	p0.b, xzr, count
	st1b	z0.b, p0, [dstin]
	ret

	.p2align 4
L(set_128):
	bic	dst, dstin, 15
	cmp	count, 128
	b.hi	L(set_long)
	stp	q0, q0, [dstin]
	stp	q0, q0, [dstin, 32]
	stp	q0, q0, [dstend, -64]
	stp	q0, q0, [dstend, -32]
	ret

	.p2align 4
L(set_long):
	cmp	count, 256
	b.lo	L(no_zva)
	tst	valw, 255
	b.ne	L(no_zva)

#ifndef SKIP_ZVA_CHECK
	mrs	zva_val, dczid_el0
	and	zva_val, zva_val, 31
	cmp	zva_val, 4		/* ZVA size is 64 bytes. */
	b.ne	L(no_zva)
#endif
	str	q0, [dstin]
	str	q0, [dst, 16]
	bic	dst, dstin, 31
	stp	q0, q0, [dst, 32]
	bic	dst, dstin, 63
	sub	count, dstend, dst	/* Count is now 64 too large. */
	sub	count, count, 128	/* Adjust count and bias for loop. */

	sub	x8, dstend, 1		/* Write last bytes before ZVA loop. */
	bic	x8, x8, 15
	stp	q0, q0, [x8, -48]
	str	q0, [x8, -16]
	str	q0, [dstend, -16]

	.p2align 4
L(zva64_loop):
	add	dst, dst, 64
	dc	zva, dst
	subs	count, count, 64
	b.hi	L(zva64_loop)
	ret

L(no_zva):
	str	q0, [dstin]
	sub	count, dstend, dst	/* Count is 16 too large. */
	sub	count, count, 64 + 16	/* Adjust count and bias for loop. */
L(no_zva_loop):
	stp	q0, q0, [dst, 16]
	stp	q0, q0, [dst, 48]
	add	dst, dst, 64
	subs	count, count, 64
	b.hi	L(no_zva_loop)
	stp	q0, q0, [dstend, -64]
	stp	q0, q0, [dstend, -32]
	ret

END (__memset_aarch64_sve)