xref: /freebsd/contrib/arm-optimized-routines/string/aarch64/memset-sve.S (revision f3087bef11543b42e0d69b708f367097a4118d24)
1*f3087befSAndrew Turner/*
2*f3087befSAndrew Turner * memset - fill memory with a constant byte
3*f3087befSAndrew Turner *
4*f3087befSAndrew Turner * Copyright (c) 2024-2024, Arm Limited.
5*f3087befSAndrew Turner * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
6*f3087befSAndrew Turner */
7*f3087befSAndrew Turner
8*f3087befSAndrew Turner/* Assumptions:
9*f3087befSAndrew Turner *
10*f3087befSAndrew Turner * ARMv8-a, AArch64, Advanced SIMD, SVE, unaligned accesses.
11*f3087befSAndrew Turner *
12*f3087befSAndrew Turner */
13*f3087befSAndrew Turner
14*f3087befSAndrew Turner#include "asmdefs.h"
15*f3087befSAndrew Turner
16*f3087befSAndrew Turner.arch armv8-a+sve
17*f3087befSAndrew Turner
/* Register aliases used by __memset_aarch64_sve.
   dstin, val/valw and count name the incoming x0, x1/w1 and x2.
   The remaining names are scratch registers, and several share one
   physical register: dst and off are both x3, while zva_val, vlen and
   dstend2 are all x5.  In the code below off and dstend2 are used only on
   the 16..63 byte path, and dst and zva_val only on the paths for 64 bytes
   and more, so the shared names are never live together.  vlen is not
   referenced by any instruction in this file as visible here.  */
18*f3087befSAndrew Turner#define dstin	x0
19*f3087befSAndrew Turner#define val	x1
20*f3087befSAndrew Turner#define valw	w1
21*f3087befSAndrew Turner#define count	x2
22*f3087befSAndrew Turner#define dst	x3
23*f3087befSAndrew Turner#define dstend	x4
24*f3087befSAndrew Turner#define zva_val	x5
25*f3087befSAndrew Turner#define vlen	x5
26*f3087befSAndrew Turner#define off	x3
27*f3087befSAndrew Turner#define dstend2 x5
28*f3087befSAndrew Turner
/* __memset_aarch64_sve
   Fill count (x2) bytes starting at dstin (x0) with the low byte of
   valw (w1).

   x0 is never written, so it still holds the destination pointer on every
   return path.  Registers written: x2, x3, x4, x5, x8, v0, p0 and the
   condition flags.  No stack is used.

   Dispatch on count:
     count < 16          one predicated SVE byte store (set_16)
     16 <= count < 64    four 16-byte stores that may overlap
     64 <= count <= 128  four 32-byte store pairs, 64 bytes from each end
     count > 128         set_long: a DC ZVA loop when count >= 256, the fill
                         byte is zero and the ZVA block size is 64 bytes;
                         otherwise a 64-byte-per-iteration store loop.  */
29*f3087befSAndrew TurnerENTRY (__memset_aarch64_sve)
/* Replicate the fill byte into all 16 byte lanes of v0 (q0).  */
30*f3087befSAndrew Turner	dup	v0.16B, valw
31*f3087befSAndrew Turner	cmp	count, 16
32*f3087befSAndrew Turner	b.lo	L(set_16)
33*f3087befSAndrew Turner
/* dstend = one past the last byte to write.  */
34*f3087befSAndrew Turner	add	dstend, dstin, count
35*f3087befSAndrew Turner	cmp	count, 64
36*f3087befSAndrew Turner	b.hs	L(set_128)
37*f3087befSAndrew Turner
38*f3087befSAndrew Turner	/* Set 16..63 bytes.  */
/* off = (count >> 1) & 16, i.e. 16 when count >= 32 and 0 otherwise.
   The stores hit dstin, dstin + off, dstend - off - 16 and dstend - 16.
   With off == 0 the two middle stores simply repeat the outer ones, so
   the same branch-free sequence covers every length in the range.  */
39*f3087befSAndrew Turner	mov	off, 16
40*f3087befSAndrew Turner	and	off, off, count, lsr 1
41*f3087befSAndrew Turner	sub	dstend2, dstend, off
42*f3087befSAndrew Turner	str	q0, [dstin]
43*f3087befSAndrew Turner	str	q0, [dstin, off]
44*f3087befSAndrew Turner	str	q0, [dstend2, -16]
45*f3087befSAndrew Turner	str	q0, [dstend, -16]
46*f3087befSAndrew Turner	ret
47*f3087befSAndrew Turner
48*f3087befSAndrew Turner	.p2align 4
/* count < 16: whilelo activates byte lanes 0 .. count - 1 of p0, and the
   predicated st1b writes only those lanes of z0, so exactly count bytes are
   stored (none when count is 0).  */
49*f3087befSAndrew TurnerL(set_16):
50*f3087befSAndrew Turner	whilelo p0.b, xzr, count
51*f3087befSAndrew Turner	st1b	z0.b, p0, [dstin]
52*f3087befSAndrew Turner	ret
53*f3087befSAndrew Turner
54*f3087befSAndrew Turner	.p2align 4
/* count >= 64.  dst = dstin rounded down to a 16-byte boundary; it is
   consumed only by the set_long paths.  For 64..128 bytes, write 64 bytes
   forward from dstin and 64 bytes back from dstend; the two halves overlap
   when count < 128.  */
55*f3087befSAndrew TurnerL(set_128):
56*f3087befSAndrew Turner	bic	dst, dstin, 15
57*f3087befSAndrew Turner	cmp	count, 128
58*f3087befSAndrew Turner	b.hi	L(set_long)
59*f3087befSAndrew Turner	stp	q0, q0, [dstin]
60*f3087befSAndrew Turner	stp	q0, q0, [dstin, 32]
61*f3087befSAndrew Turner	stp	q0, q0, [dstend, -64]
62*f3087befSAndrew Turner	stp	q0, q0, [dstend, -32]
63*f3087befSAndrew Turner	ret
64*f3087befSAndrew Turner
65*f3087befSAndrew Turner	.p2align 4
/* count > 128.  Take the DC ZVA path only when count >= 256 and the low
   byte of the fill value is zero; everything else goes to no_zva.  */
66*f3087befSAndrew TurnerL(set_long):
67*f3087befSAndrew Turner	cmp	count, 256
68*f3087befSAndrew Turner	b.lo	L(no_zva)
69*f3087befSAndrew Turner	tst	valw, 255
70*f3087befSAndrew Turner	b.ne	L(no_zva)
71*f3087befSAndrew Turner
/* Unless SKIP_ZVA_CHECK is defined, require the low five bits of DCZID_EL0
   to equal 4.  Per the Arm architecture description of that register,
   bits [3:0] give log2 of the block size in words and bit 4 is set when
   DC ZVA is prohibited, so 4 means 64-byte blocks with DC ZVA allowed.  */
72*f3087befSAndrew Turner#ifndef SKIP_ZVA_CHECK
73*f3087befSAndrew Turner	mrs	zva_val, dczid_el0
74*f3087befSAndrew Turner	and	zva_val, zva_val, 31
75*f3087befSAndrew Turner	cmp	zva_val, 4		/* ZVA size is 64 bytes.  */
76*f3087befSAndrew Turner	b.ne	L(no_zva)
77*f3087befSAndrew Turner#endif
/* Head: ordinary stores at dstin, (dstin & ~15) + 16 and (dstin & ~31) + 32
   cover every byte from dstin up to at least (dstin & ~63) + 96, which
   reaches past the first block the loop zeroes at (dstin & ~63) + 64.
   dst is then rounded down to a 64-byte boundary for the loop.  */
78*f3087befSAndrew Turner	str	q0, [dstin]
79*f3087befSAndrew Turner	str	q0, [dst, 16]
80*f3087befSAndrew Turner	bic	dst, dstin, 31
81*f3087befSAndrew Turner	stp	q0, q0, [dst, 32]
82*f3087befSAndrew Turner	bic	dst, dstin, 63
83*f3087befSAndrew Turner	sub	count, dstend, dst	/* Count is now 64 too large.  */
84*f3087befSAndrew Turner	sub	count, count, 128	/* Adjust count and bias for loop.  */
85*f3087befSAndrew Turner
/* Tail: x8 = (dstend - 1) rounded down to 16 bytes.  The stores below cover
   [x8 - 48, dstend), i.e. everything from the last 64-byte block boundary
   the loop can stop at through the final byte.  Doing this before the loop
   lets the loop end with a plain ret.  */
86*f3087befSAndrew Turner	sub	x8, dstend, 1		/* Write last bytes before ZVA loop.  */
87*f3087befSAndrew Turner	bic	x8, x8, 15
88*f3087befSAndrew Turner	stp	q0, q0, [x8, -48]
89*f3087befSAndrew Turner	str	q0, [x8, -16]
90*f3087befSAndrew Turner	str	q0, [dstend, -16]
91*f3087befSAndrew Turner
92*f3087befSAndrew Turner	.p2align 4
/* Each iteration advances dst by 64 and zeroes the 64-byte block at dst.
   subs sets the flags like cmp count, 64, so b.hi repeats while the biased
   count was above 64 before the subtraction; the last zeroed block ends
   before dstend.  */
93*f3087befSAndrew TurnerL(zva64_loop):
94*f3087befSAndrew Turner	add	dst, dst, 64
95*f3087befSAndrew Turner	dc	zva, dst
96*f3087befSAndrew Turner	subs	count, count, 64
97*f3087befSAndrew Turner	b.hi	L(zva64_loop)
98*f3087befSAndrew Turner	ret
99*f3087befSAndrew Turner
/* Store loop for count > 128 when DC ZVA is not used.  dst still holds
   dstin rounded down to 16 bytes (set at set_128).  One unaligned store
   covers the head, then each iteration writes the 64 bytes at
   dst + 16 .. dst + 79 with 16-byte aligned stores.  */
100*f3087befSAndrew TurnerL(no_zva):
101*f3087befSAndrew Turner	str	q0, [dstin]
102*f3087befSAndrew Turner	sub	count, dstend, dst	/* Count is 16 too large.  */
103*f3087befSAndrew Turner	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */
104*f3087befSAndrew TurnerL(no_zva_loop):
105*f3087befSAndrew Turner	stp	q0, q0, [dst, 16]
106*f3087befSAndrew Turner	stp	q0, q0, [dst, 48]
107*f3087befSAndrew Turner	add	dst, dst, 64
108*f3087befSAndrew Turner	subs	count, count, 64
109*f3087befSAndrew Turner	b.hi	L(no_zva_loop)
/* The loop stops with at most 64 bytes left; write the final 64 bytes
   ending at dstend, overlapping what the loop already stored.  */
110*f3087befSAndrew Turner	stp	q0, q0, [dstend, -64]
111*f3087befSAndrew Turner	stp	q0, q0, [dstend, -32]
112*f3087befSAndrew Turner	ret
113*f3087befSAndrew Turner
114*f3087befSAndrew TurnerEND (__memset_aarch64_sve)
115