xref: /freebsd/contrib/arm-optimized-routines/string/aarch64/memset.S (revision 9729f076e4d93c5a37e78d427bfe0f1ab99bbcc6)
1/*
2 * memset - fill memory with a constant byte
3 *
4 * Copyright (c) 2012-2021, Arm Limited.
5 * SPDX-License-Identifier: MIT
6 */
7
8/* Assumptions:
9 *
10 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
11 *
12 */
13
14#include "../asmdefs.h"
15
16#define dstin	x0	/* Destination pointer; only read by the code below.  */
17#define val	x1	/* Fill value, 64-bit view (reloaded from v0 on the 0..15 byte path).  */
18#define valw	w1	/* Fill value, 32-bit view; only its low byte is used.  */
19#define count	x2	/* Byte count on entry; reused as the loop counter for long fills.  */
20#define dst	x3	/* Aligned working pointer for the long-fill loops.  */
21#define dstend	x4	/* dstin + count, one past the last byte to set.  */
22#define zva_val	x5	/* Scratch for the value read from DCZID_EL0.  */
23
/* __memset_aarch64: set count bytes starting at dstin to the low byte
   of valw.

   Inputs (see the register aliases defined above):
     dstin (x0)  destination pointer
     valw  (w1)  fill value, only the low 8 bits are used
     count (x2)  number of bytes to set

   dstin (x0) is never written by the instructions in this function.
   Registers written by the visible instructions: x1-x5, v0 and the
   condition flags.

   Dispatch by size: 0..15 bytes are handled inline, 16..96 bytes at
   L(set_medium), and more than 96 bytes at L(set_long).  Every path
   uses stores anchored at both dstin and dstend that are allowed to
   overlap, instead of a byte-granular tail loop.  */
24ENTRY (__memset_aarch64)
25	PTR_ARG (0)	/* Macro from asmdefs.h (not visible here); presumably prepares pointer argument 0 - confirm.  */
26	SIZE_ARG (2)	/* Macro from asmdefs.h (not visible here); presumably prepares size argument 2 - confirm.  */
27
28	dup	v0.16B, valw	/* v0 = fill byte replicated into all 16 lanes.  */
29	add	dstend, dstin, count	/* dstend = one past the last byte.  */
30
31	cmp	count, 96
32	b.hi	L(set_long)	/* count > 96.  */
33	cmp	count, 16
34	b.hs	L(set_medium)	/* 16 <= count <= 96.  */
35	mov	val, v0.D[0]	/* val = fill byte replicated 8 times.  */
36
37	/* Set 0..15 bytes.  */
38	tbz	count, 3, 1f	/* Bit 3 clear: count < 8.  */
39	str	val, [dstin]	/* 8..15 bytes: first 8 and last 8, may overlap.  */
40	str	val, [dstend, -8]
41	ret
42	.p2align 4
431:	tbz	count, 2, 2f	/* Bit 2 clear: count < 4.  */
44	str	valw, [dstin]	/* 4..7 bytes: first 4 and last 4, may overlap.  */
45	str	valw, [dstend, -4]
46	ret
472:	cbz	count, 3f	/* Nothing to store for count == 0.  */
48	strb	valw, [dstin]	/* 1..3 bytes: always write the first byte.  */
49	tbz	count, 1, 3f	/* Bit 1 clear: count == 1, done.  */
50	strh	valw, [dstend, -2]	/* count is 2 or 3: write the last two bytes.  */
513:	ret
52
53	/* Set 16..96 bytes (count == 16 also takes this path, since the
	   branch above is b.hs).  */
54L(set_medium):
55	str	q0, [dstin]	/* First 16 bytes.  */
56	tbnz	count, 6, L(set96)	/* Bit 6 set: count is 64..96.  */
57	str	q0, [dstend, -16]	/* Last 16 bytes; enough for 16..31.  */
58	tbz	count, 5, 1f	/* Bit 5 clear: count < 32, done.  */
59	str	q0, [dstin, 16]	/* 32..63 bytes: second 16 from the start ...  */
60	str	q0, [dstend, -32]	/* ... and second 16 from the end.  */
611:	ret
62
63	.p2align 4
64	/* Set 64..96 bytes.  Write 64 bytes from the start and
65	   32 bytes from the end.  */
66L(set96):
67	str	q0, [dstin, 16]	/* Bytes 0..15 were already stored at L(set_medium).  */
68	stp	q0, q0, [dstin, 32]
69	stp	q0, q0, [dstend, -32]
70	ret
71
72	.p2align 4
	/* Set more than 96 bytes.  */
73L(set_long):
74	and	valw, valw, 255	/* Keep only the fill byte so it can be tested for zero.  */
75	bic	dst, dstin, 15	/* dst = dstin rounded down to a 16-byte boundary.  */
76	str	q0, [dstin]	/* First 16 bytes, at the original (possibly unaligned) address.  */
77	cmp	count, 160
78	ccmp	valw, 0, 0, hs	/* count >= 160: compare fill byte with 0; otherwise force NZCV = 0 (reads as ne).  */
79	b.ne	L(no_zva)	/* DC ZVA is only used for a zero fill of at least 160 bytes.  */
80
81#ifndef SKIP_ZVA_CHECK
82	mrs	zva_val, dczid_el0
83	and	zva_val, zva_val, 31	/* Keep bits [4:0]: block size field (BS) plus the DZP (prohibited) bit.  */
84	cmp	zva_val, 4		/* ZVA size is 64 bytes.  */
85	b.ne	L(no_zva)
86#endif
	/* Together with the store at dstin above, these fill everything
	   below dst + 64, which covers the bytes in front of the first
	   64-byte block zeroed by the loop.  */
87	str	q0, [dst, 16]
88	stp	q0, q0, [dst, 32]
89	bic	dst, dst, 63	/* Round dst down to a 64-byte boundary.  */
90	sub	count, dstend, dst	/* Count is now 64 too large.  */
91	sub	count, count, 128	/* Adjust count and bias for loop.  */
92
93	.p2align 4
94L(zva_loop):
95	add	dst, dst, 64	/* Advance to the next 64-byte block.  */
96	dc	zva, dst	/* Zero that block (64 bytes per the check above, or assumed under SKIP_ZVA_CHECK).  */
97	subs	count, count, 64
98	b.hi	L(zva_loop)	/* Loop while count was above 64 before the subtract (unsigned).  */
99	stp	q0, q0, [dstend, -64]	/* Last 64 bytes, anchored at dstend.  */
100	stp	q0, q0, [dstend, -32]
101	ret
102
	/* Generic long fill using 64 bytes of vector stores per iteration.  */
103L(no_zva):
104	sub	count, dstend, dst	/* Count is 16 too large.  */
105	sub	dst, dst, 16		/* Dst is biased by -32.  */
106	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */
107L(no_zva_loop):
108	stp	q0, q0, [dst, 32]	/* First 32 bytes of this iteration.  */
109	stp	q0, q0, [dst, 64]!	/* Next 32 bytes; pre-index writeback advances dst by 64.  */
110	subs	count, count, 64
111	b.hi	L(no_zva_loop)	/* Loop while count was above 64 before the subtract (unsigned).  */
112	stp	q0, q0, [dstend, -64]	/* Last 64 bytes, anchored at dstend.  */
113	stp	q0, q0, [dstend, -32]
114	ret
115
116END (__memset_aarch64)
117
118