/*
 * memset - fill memory with a constant byte
 *
 * Copyright (c) 2012-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
 *
 */

#include "asmdefs.h"

#define dstin	x0
#define val	x1
#define valw	w1
#define count	x2
#define dst	x3
#define dstend	x4
#define zva_val	x5
#define off	x3
#define dstend2	x5

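/* Sizes below 64 bytes are set with a small number of overlapping stores
   chosen without branching on the exact length.  Larger sizes use pairs
   of 16-byte stores; when the fill value is zero and the CPU advertises a
   64-byte ZVA block, DC ZVA zeroes whole 64-byte blocks.  */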
ENTRY (__memset_aarch64)
	dup	v0.16B, valw
	cmp	count, 16
	b.lo	L(set_small)

	add	dstend, dstin, count
	cmp	count, 64
	b.hs	L(set_128)

	/* Set 16..63 bytes.  */
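	/* off is 16 when count >= 32 and 0 otherwise, so the four
	   overlapping 16-byte stores below cover every length in the
	   range without further branches.  */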
	mov	off, 16
	and	off, off, count, lsr 1
	sub	dstend2, dstend, off
	str	q0, [dstin]
	str	q0, [dstin, off]
	str	q0, [dstend2, -16]
	str	q0, [dstend, -16]
	ret

	.p2align 4
	/* Set 0..15 bytes.  */
L(set_small):
	add	dstend, dstin, count
	cmp	count, 4
	b.lo	2f
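	/* Set 4..15 bytes with four overlapping 4-byte stores; the store
	   offset is 4 only when count >= 8.  */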
	lsr	off, count, 3
	sub	dstend2, dstend, off, lsl 2
	str	s0, [dstin]
	str	s0, [dstin, off, lsl 2]
	str	s0, [dstend2, -4]
	str	s0, [dstend, -4]
	ret

	/* Set 0..3 bytes.  */
2:	cbz	count, 3f
	lsr	off, count, 1
	strb	valw, [dstin]
	strb	valw, [dstin, off]
	strb	valw, [dstend, -1]
3:	ret

	.p2align 4
L(set_128):
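	/* Set 64..128 bytes.  The first and last 64 bytes overlap when
	   count is below 128; larger counts take L(set_long).  */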
	bic	dst, dstin, 15		/* 16-byte aligned base for L(set_long).  */
	cmp	count, 128
	b.hi	L(set_long)
	stp	q0, q0, [dstin]
	stp	q0, q0, [dstin, 32]
	stp	q0, q0, [dstend, -64]
	stp	q0, q0, [dstend, -32]
	ret

	.p2align 4
L(set_long):
	str	q0, [dstin]
	str	q0, [dst, 16]
	tst	valw, 255		/* DC ZVA can only store zero bytes.  */
	b.ne	L(no_zva)
#ifndef SKIP_ZVA_CHECK
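	/* DCZID_EL0 bits [3:0] hold log2 of the ZVA block size in words and
	   bit 4 is the prohibit flag, so a value of 4 means ZVA is enabled
	   with a 64-byte block.  */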
	mrs	zva_val, dczid_el0
	and	zva_val, zva_val, 31
	cmp	zva_val, 4		/* ZVA size is 64 bytes.  */
	b.ne	L(no_zva)
#endif
	stp	q0, q0, [dst, 32]
	bic	dst, dstin, 63
	sub	count, dstend, dst	/* Count is now 64 too large.  */
	sub	count, count, 64 + 64	/* Adjust count and bias for loop.  */

	/* Write last bytes before ZVA loop.  */
	stp	q0, q0, [dstend, -64]
	stp	q0, q0, [dstend, -32]

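	/* Zero one aligned 64-byte block per iteration; the final 64 bytes
	   were already written above.  */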
	.p2align 4
L(zva64_loop):
	add	dst, dst, 64
	dc	zva, dst
	subs	count, count, 64
	b.hi	L(zva64_loop)
	ret

	.p2align 3
L(no_zva):
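	/* Fallback for a non-zero fill value or when DC ZVA cannot be
	   used: store 64 bytes per iteration with two paired stores.  */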
	sub	count, dstend, dst	/* Count is 32 too large.  */
	sub	count, count, 64 + 32	/* Adjust count and bias for loop.  */
L(no_zva_loop):
	stp	q0, q0, [dst, 32]
	stp	q0, q0, [dst, 64]
	add	dst, dst, 64
	subs	count, count, 64
	b.hi	L(no_zva_loop)
	stp	q0, q0, [dstend, -64]
	stp	q0, q0, [dstend, -32]
	ret

END (__memset_aarch64)