xref: /freebsd/contrib/arm-optimized-routines/string/aarch64/strcpy.S (revision 5b5b7e2ca2fa9a2418dd51749f4ef6f881ae7179)
1/*
2 * strcpy/stpcpy - copy a string returning pointer to start/end.
3 *
4 * Copyright (c) 2020-2022, Arm Limited.
5 * SPDX-License-Identifier: MIT
6 */
7
8/* Assumptions:
9 *
10 * ARMv8-a, AArch64, Advanced SIMD.
11 * MTE compatible.
12 */
13
14#include "../asmdefs.h"
15
/* Argument registers.  result shares x0 with dstin.  */
16#define dstin		x0
17#define srcin		x1
18#define result		x0
19
/* General-purpose scratch registers.  Note the deliberate sharing:
   len and synd are both x4; tmp, wtmp (32-bit view) and shift are all x5;
   dataw1/dataw2 are the 32-bit views of data1/data2.  Names that share a
   register cannot hold independent live values at the same time.  */
20#define src		x2
21#define dst		x3
22#define len		x4
23#define synd		x4
24#define	tmp		x5
25#define wtmp		w5
26#define shift		x5
27#define data1		x6
28#define dataw1		w6
29#define data2		x7
30#define dataw2		w7
31
/* Vector registers.  dataq and vdata are both register 0, dataq2 and
   vhas_nul are both register 1, and vend and dend are both register 3,
   so loading dataq2 overwrites vhas_nul.  */
32#define dataq		q0
33#define vdata		v0
34#define vhas_nul	v1
35#define vrepmask	v2
36#define vend		v3
37#define dend		d3
38#define dataq2		q1
39
/* BUILD_STPCPY selects both the symbol name and the return-value behaviour.
   IFSTPCPY takes an instruction whose operands are split at the commas by
   the preprocessor; when BUILD_STPCPY is defined it re-joins them (X followed
   by the remaining arguments), otherwise it expands to nothing.  */
40#ifdef BUILD_STPCPY
41# define STRCPY __stpcpy_aarch64
42# define IFSTPCPY(X,...) X,__VA_ARGS__
43#else
44# define STRCPY __strcpy_aarch64
45# define IFSTPCPY(X,...)
46#endif
47
48/* Core algorithm:
49
50   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
51   per byte. For even bytes, bits 0-3 are set if the relevant byte is NUL
52   (strcpy has no search character). Bits 4-7 must be zero. Bits 4-7 are
53   set likewise for odd bytes so that adjacent bytes can be merged. Since the
54   bits in the syndrome reflect the order in which things occur in the original
55   string, counting trailing zeros identifies exactly which byte matched.  */
56
/* STRCPY: copy the NUL-terminated string at srcin (x1), including the
   terminator, to dstin (x0).

   In:   x0 = dstin, x1 = srcin.
   Out:  x0 is left unchanged (dstin) unless BUILD_STPCPY is defined, in
         which case the IFSTPCPY lines set x0 to the address of the NUL
         that was written.
   Uses: x2-x7 and v0-v3 as scratch; the body below makes no stack access.

   ENTRY, END, PTR_ARG and L() come from asmdefs.h and are not defined in
   this file.  */
57ENTRY (STRCPY)
58	PTR_ARG (0)
59	PTR_ARG (1)
	/* Check the 16-byte aligned chunk that contains srcin.  vrepmask holds
	   0x0f in even byte lanes and 0xf0 in odd byte lanes, so after the and
	   and the pairwise add the low 64 bits of vend contain one nibble per
	   source byte.  */
60	bic	src, srcin, 15
61	mov	wtmp, 0xf00f
62	ld1	{vdata.16b}, [src]
63	dup	vrepmask.8h, wtmp
64	cmeq	vhas_nul.16b, vdata.16b, 0
65	lsl	shift, srcin, 2
66	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
67	addp	vend.16b, vhas_nul.16b, vhas_nul.16b
68	fmov	synd, dend
	/* The register shift amount is taken modulo 64, so this shifts by
	   4 * (srcin & 15) and drops the nibbles of bytes that precede srcin.  */
69	lsr	synd, synd, shift
70	cbnz	synd, L(tail)
71
	/* No NUL from srcin to the end of the first chunk: advance src by 16
	   (pre-index writeback) and check the next aligned chunk.  */
72	ldr	dataq, [src, 16]!
73	cmeq	vhas_nul.16b, vdata.16b, 0
74	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
75	addp	vend.16b, vhas_nul.16b, vhas_nul.16b
76	fmov	synd, dend
77	cbz	synd, L(start_loop)
78
	/* NUL is in the second chunk.  len = (src - srcin) + index of the NUL
	   within this chunk, i.e. the string length; 1 <= len <= 31 here.  */
79#ifndef __AARCH64EB__
80	rbit	synd, synd
81#endif
82	sub	tmp, src, srcin
83	clz	len, synd
84	add	len, tmp, len, lsr 2
	/* Bit 4 clear means len < 16: use the short-copy paths.  */
85	tbz	len, 4, L(less16)
	/* 16 <= len <= 31: copy len + 1 bytes as two 16-byte blocks that may
	   overlap; the second block ends exactly on the NUL.  */
86	sub	tmp, len, 15
87	ldr	dataq, [srcin]
88	ldr	dataq2, [srcin, tmp]
89	str	dataq, [dstin]
90	str	dataq2, [dstin, tmp]
91	IFSTPCPY (add result, dstin, len)
92	ret
93
94	.p2align 4,,8
95L(tail):
	/* NUL found in the first chunk.  After the shift above, nibble 0 of synd
	   corresponds to the byte at srcin, so rbit + clz (count trailing zeros)
	   divided by 4 gives len = index of the NUL from srcin (0..15).
	   NOTE(review): this rbit is unconditional, unlike the two
	   __AARCH64EB__-guarded ones; presumably because this chunk was loaded
	   with ld1 rather than ldr q - confirm.  */
96	rbit	synd, synd
97	clz	len, synd
98	lsr	len, len, 2
99
	/* Falls through from L(tail) into L(less16).  */
100	.p2align 4
101L(less16):
	/* len <= 15.  Bit 3 clear means len <= 7.  */
102	tbz	len, 3, L(less8)
	/* 8 <= len <= 15: two possibly overlapping 8-byte copies, the second
	   ending on the NUL.  */
103	sub	tmp, len, 7
104	ldr	data1, [srcin]
105	ldr	data2, [srcin, tmp]
106	str	data1, [dstin]
107	str	data2, [dstin, tmp]
108	IFSTPCPY (add result, dstin, len)
109	ret
110
111	.p2align 4
112L(less8):
	/* len <= 7.  subs sets tmp = len - 3 and b.lo is taken when len < 3.  */
113	subs	tmp, len, 3
114	b.lo	L(less4)
	/* 3 <= len <= 7: two possibly overlapping 4-byte copies, the second
	   ending on the NUL.  */
115	ldr	dataw1, [srcin]
116	ldr	dataw2, [srcin, tmp]
117	str	dataw1, [dstin]
118	str	dataw2, [dstin, tmp]
119	IFSTPCPY (add result, dstin, len)
120	ret
121
122L(less4):
	/* len is 0, 1 or 2.  For a non-empty string copy the first two bytes,
	   then (in all cases) store the terminator at dstin[len].  */
123	cbz	len, L(zerobyte)
124	ldrh	dataw1, [srcin]
125	strh	dataw1, [dstin]
126L(zerobyte):
127	strb	wzr, [dstin, len]
128	IFSTPCPY (add result, dstin, len)
129	ret
130
131	.p2align 4
132L(start_loop):
	/* No NUL in the first two chunks.  Copy the first 16 bytes from srcin
	   (unaligned) through q1, because q0 still holds the chunk at src that
	   the first loop iteration stores.  dst is set so that
	   dst - dstin == src - srcin.  */
133	sub	len, src, srcin
134	ldr	dataq2, [srcin]
135	add	dst, dstin, len
136	str	dataq2, [dstin]
137
138	.p2align 5
139L(loop):
	/* Store the chunk already known to be NUL-free (post-incrementing dst),
	   then load and test the next aligned chunk.  umaxp folds the 16 compare
	   bytes into 64 bits that are non-zero iff some byte compared equal to
	   zero.  */
140	str	dataq, [dst], 16
141	ldr	dataq, [src, 16]!
142	cmeq	vhas_nul.16b, vdata.16b, 0
143	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
144	fmov	synd, dend
145	cbz	synd, L(loop)
146
	/* The chunk at src contains the NUL.  Build the nibble-per-byte syndrome
	   to locate it: len = index of the NUL within this chunk (0..15).  */
147	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
148	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
149	fmov	synd, dend
150#ifndef __AARCH64EB__
151	rbit	synd, synd
152#endif
153	clz	len, synd
154	lsr	len, len, 2
	/* tmp = len - 15 is <= 0, so this final 16-byte copy ends exactly on the
	   NUL and may re-copy bytes that were already stored.  */
155	sub	tmp, len, 15
156	ldr	dataq, [src, tmp]
157	str	dataq, [dst, tmp]
158	IFSTPCPY (add result, dst, len)
159	ret
160
161END (STRCPY)
162