xref: /freebsd/contrib/arm-optimized-routines/string/aarch64/memset-sve.S (revision dd21556857e8d40f66bf5ad54754d9d52669ebf7)
/*
 * memset - fill memory with a constant byte
 *
 * Copyright (c) 2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, SVE, unaligned accesses.
 *
 */
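
/* For reference, the routine below implements standard memset semantics
   (the destination pointer is returned unchanged in x0).  A minimal C
   model, illustrative only:

	void *memset (void *s, int c, size_t n)
	{
	  unsigned char *p = s;
	  while (n--)
	    *p++ = (unsigned char) c;
	  return s;
	}
 */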

#include "asmdefs.h"

.arch armv8-a+sve

#define dstin	x0
#define val	x1
#define valw	w1
#define count	x2
#define dst	x3
#define dstend	x4
#define zva_val	x5
#define vlen	x5
#define off	x3
#define dstend2 x5
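
/* Note: several of the names above alias: off shares x3 with dst, and
   zva_val, vlen and dstend2 all share x5.  Each alias is live on a
   different path, so the registers are never needed simultaneously.  */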

ENTRY (__memset_aarch64_sve)
	dup	v0.16B, valw	/* Broadcast the fill byte to v0 (the bottom 128 bits of z0).  */
	cmp	count, 16
	b.lo	L(set_16)

	add	dstend, dstin, count
	cmp	count, 64
	b.hs	L(set_128)
	/* Set 16..63 bytes.  off is 16 when count >= 32 and 0 otherwise
	   (bit 5 of count selects it), so the four overlapping 16-byte
	   stores cover the whole buffer from both ends.  */
	mov	off, 16
	and	off, off, count, lsr 1
	sub	dstend2, dstend, off
	str	q0, [dstin]
	str	q0, [dstin, off]
	str	q0, [dstend2, -16]
	str	q0, [dstend, -16]
	ret
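
/* A minimal C sketch of the branchless trick above (illustrative only;
   store16 is a hypothetical helper standing in for one 'str q0'):

	size_t off = 16 & (count >> 1);	// 16 if count >= 32, else 0
	store16 (s);			// bytes [0, 16)
	store16 (s + off);		// bytes [16, 32) when count >= 32
	store16 (s + count - off - 16);	// mirrored store from the end
	store16 (s + count - 16);	// bytes [count - 16, count)

   Together these cover [0, count) for any count in [16, 63].  */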

	.p2align 4
	/* Set 0..15 bytes.  whilelo makes the first count byte lanes of
	   p0 active (count < 16 <= the SVE vector length), so a single
	   predicated store writes exactly count bytes.  */
L(set_16):
	whilelo p0.b, xzr, count
	st1b	z0.b, p0, [dstin]
	ret
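
/* A rough C equivalent of L(set_16) above, using the Arm C Language
   Extensions from <arm_sve.h> (illustrative only):

	svbool_t p0 = svwhilelt_b8 ((uint64_t) 0, (uint64_t) count);
	svst1_u8 (p0, (uint8_t *) s, svdup_u8 ((uint8_t) c));
 */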

	.p2align 4
	/* Set 64..128 bytes.  Two 32-byte store pairs from the start and
	   two from the end overlap in the middle; dst is pre-aligned here
	   for use by the L(set_long) path.  */
L(set_128):
	bic	dst, dstin, 15
	cmp	count, 128
	b.hi	L(set_long)
	stp	q0, q0, [dstin]
	stp	q0, q0, [dstin, 32]
	stp	q0, q0, [dstend, -64]
	stp	q0, q0, [dstend, -32]
	ret
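
/* 'bic dst, dstin, 15' above is an align-down; in C terms (illustrative):

	dst = (unsigned char *) ((uintptr_t) dstin & ~(uintptr_t) 15);
 */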

	.p2align 4
L(set_long):
	cmp	count, 256
	b.lo	L(no_zva)
	tst	valw, 255	/* DC ZVA only zeroes; require a zero fill byte.  */
	b.ne	L(no_zva)

#ifndef SKIP_ZVA_CHECK
	mrs	zva_val, dczid_el0
	and	zva_val, zva_val, 31	/* Keep the DZP bit and the 4-bit BS field.  */
	cmp	zva_val, 4		/* ZVA size is 64 bytes and ZVA is enabled.  */
	b.ne	L(no_zva)
#endif
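
/* A hedged C sketch of the DCZID_EL0 check above (names illustrative):

	uint64_t dczid;
	__asm__ ("mrs %0, dczid_el0" : "=r" (dczid));
	size_t zva_size = 4 << (dczid & 15);	// BS field: log2 of block size in words
	int zva_prohibited = dczid & 16;	// DZP bit

   Masking with 31 and comparing against 4 accepts only the common case
   of zva_size == 64 with DZP clear.  */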
	/* Write at least the first 64 bytes with plain stores and round
	   dst down to a 64-byte boundary; the loop then zeroes whole
	   aligned blocks.  */
	str	q0, [dstin]
	str	q0, [dst, 16]
	bic	dst, dstin, 31
	stp	q0, q0, [dst, 32]
	bic	dst, dstin, 63
	sub	count, dstend, dst	/* Count is now 64 too large.  */
	sub	count, count, 128	/* Adjust count and bias for loop.  */

	sub	x8, dstend, 1		/* Write last bytes before ZVA loop.  */
	bic	x8, x8, 15
	stp	q0, q0, [x8, -48]
	str	q0, [x8, -16]
	str	q0, [dstend, -16]

	.p2align 4
L(zva64_loop):
	add	dst, dst, 64
	dc	zva, dst
	subs	count, count, 64
	b.hi	L(zva64_loop)
	ret
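
/* Loop bound sketch: count was biased to (dstend - dst) - 128, so the
   loop above stops before the region already written by the tail stores.
   In rough C (dc_zva is a hypothetical 64-byte zeroing helper):

	do
	  {
	    dst += 64;
	    dc_zva (dst);		// zero one aligned 64-byte block
	  }
	while ((int64_t) (count -= 64) > 0);
 */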

L(no_zva):
	str	q0, [dstin]		/* Unaligned head; dst is dstin rounded down to 16.  */
	sub	count, dstend, dst	/* Count is 16 too large.  */
	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */
L(no_zva_loop):
	stp	q0, q0, [dst, 16]
	stp	q0, q0, [dst, 48]
	add	dst, dst, 64
	subs	count, count, 64
	b.hi	L(no_zva_loop)
	stp	q0, q0, [dstend, -64]	/* Overlapping tail stores.  */
	stp	q0, q0, [dstend, -32]
	ret
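
/* Hedged C outline of the no-ZVA path above (store16, store32 and
   align_down are hypothetical helpers):

	store16 (dstin);			// possibly unaligned head
	dst = align_down (dstin, 16);
	count = (dstend - dst) - 80;		// bias for the 64-byte loop
	do
	  {
	    store32 (dst + 16);
	    store32 (dst + 48);
	    dst += 64;
	  }
	while ((int64_t) (count -= 64) > 0);
	store32 (dstend - 64);			// overlapping tail
	store32 (dstend - 32);
 */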

END (__memset_aarch64_sve)