xref: /freebsd/contrib/arm-optimized-routines/string/aarch64/strcpy.S (revision dd21556857e8d40f66bf5ad54754d9d52669ebf7)
1/*
2 * strcpy/stpcpy - copy a string returning pointer to start/end.
3 *
4 * Copyright (c) 2020-2023, Arm Limited.
5 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
6 */
7
8/* Assumptions:
9 *
10 * ARMv8-a, AArch64, Advanced SIMD.
11 * MTE compatible.
12 */
13
14#include "asmdefs.h"
15
/* Register allocation.  Several names alias the same physical register
   and are used in disjoint phases: len/synd are both x4, tmp/shift are
   both x5; dataq/vdata name q0/v0, dataq2/vhas_nul name q1/v1, and
   dend is the low 64 bits of vend (v2).  */
16#define dstin		x0
17#define srcin		x1
18#define result		x0
19
20#define src		x2
21#define dst		x3
22#define len		x4
23#define synd		x4
24#define	tmp		x5
25#define shift		x5
26#define data1		x6
27#define dataw1		w6
28#define data2		x7
29#define dataw2		w7
30
31#define dataq		q0
32#define vdata		v0
33#define vhas_nul	v1
34#define vend		v2
35#define dend		d2
36#define dataq2		q1
37
/* When built as stpcpy, IFSTPCPY expands to its arguments so the extra
   result-pointer arithmetic is assembled; for strcpy it expands to
   nothing and x0 (dstin == result) is returned unchanged.  */
38#ifdef BUILD_STPCPY
39# define STRCPY __stpcpy_aarch64
40# define IFSTPCPY(X,...) X,__VA_ARGS__
41#else
42# define STRCPY __strcpy_aarch64
43# define IFSTPCPY(X,...)
44#endif
45
46/*
47   Core algorithm:
48   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
49   per byte. We take 4 bits of every comparison byte with shift right and narrow
50   by 4 instruction. Since the bits in the nibble mask reflect the order in
51   which things occur in the original string, counting leading zeros identifies
52   exactly which byte matched.  */
53
/* __strcpy_aarch64 / __stpcpy_aarch64:
   in:  dstin (x0) = destination buffer, srcin (x1) = NUL-terminated source.
   out: result (x0) = dstin for strcpy; with BUILD_STPCPY, a pointer to
	the terminating NUL written into the destination.  */
54ENTRY (STRCPY)
	/* Align the source down to 16 bytes so vector loads never cross a
	   16-byte granule (this is what keeps the routine MTE compatible).
	   Match bits for bytes before srcin are shifted out below.  */
55	bic	src, srcin, 15
56	ld1	{vdata.16b}, [src]
57	cmeq	vhas_nul.16b, vdata.16b, 0
	/* shift = 4 * (srcin % 16): LSR by register uses the shift amount
	   modulo 64, and the nibble mask has 4 bits per byte.  */
58	lsl	shift, srcin, 2
59	shrn	vend.8b, vhas_nul.8h, 4	/* 128->64: 4-bit nibble mask.  */
60	fmov	synd, dend
	/* Discard mask bits belonging to bytes before the string start.  */
61	lsr	synd, synd, shift
62	cbnz	synd, L(tail)
63
	/* No NUL in the first 16 bytes from srcin: check the next aligned
	   chunk.  Note dataq aliases vdata, so the cmeq below compares the
	   freshly loaded data.  */
64	ldr	dataq, [src, 16]!
65	cmeq	vhas_nul.16b, vdata.16b, 0
66	shrn	vend.8b, vhas_nul.8h, 4	/* 128->64 */
67	fmov	synd, dend
68	cbz	synd, L(start_loop)
69
	/* NUL in the second aligned chunk, so the string length is at most
	   31.  Compute len = NUL index relative to srcin, then copy with
	   two possibly-overlapping moves.  */
70#ifndef __AARCH64EB__
71	rbit	synd, synd
72#endif
73	sub	tmp, src, srcin
74	clz	len, synd
75	add	len, tmp, len, lsr 2	/* len = strlen (srcin).  */
76	tbz	len, 4, L(less16)
	/* 16 <= len <= 31: the first and last 16 bytes overlap and together
	   cover the len+1 bytes including the NUL.  */
77	sub	tmp, len, 15
78	ldr	dataq, [srcin]
79	ldr	dataq2, [srcin, tmp]
80	str	dataq, [dstin]
81	str	dataq2, [dstin, tmp]
82	IFSTPCPY (add result, dstin, len)
83	ret
84
85L(tail):
	/* NUL within the first 16 bytes; synd was already shifted so its
	   mask starts at srcin.  Recover the NUL's byte index (4 mask bits
	   per byte), i.e. the string length.  */
86	rbit	synd, synd
87	clz	len, synd
88	lsr	len, len, 2
89L(less16):
90	tbz	len, 3, L(less8)
	/* 8 <= len <= 15: two overlapping 8-byte copies cover len+1 bytes.  */
91	sub	tmp, len, 7
92	ldr	data1, [srcin]
93	ldr	data2, [srcin, tmp]
94	str	data1, [dstin]
95	str	data2, [dstin, tmp]
96	IFSTPCPY (add result, dstin, len)
97	ret
98
99	.p2align 4
100L(less8):
101	subs	tmp, len, 3
102	b.lo	L(less4)
	/* 3 <= len <= 7: two overlapping 4-byte copies cover len+1 bytes.  */
103	ldr	dataw1, [srcin]
104	ldr	dataw2, [srcin, tmp]
105	str	dataw1, [dstin]
106	str	dataw2, [dstin, tmp]
107	IFSTPCPY (add result, dstin, len)
108	ret
109
110L(less4):
111	cbz	len, L(zerobyte)
	/* 1 <= len <= 2: a 2-byte copy plus the explicit NUL store below
	   covers both cases.  */
112	ldrh	dataw1, [srcin]
113	strh	dataw1, [dstin]
114L(zerobyte):
	/* Always terminate the destination (len == 0 stores only this).  */
115	strb	wzr, [dstin, len]
116	IFSTPCPY (add result, dstin, len)
117	ret
118
119	.p2align 4
120L(start_loop):
	/* No NUL in the first two chunks.  tmp = srcin - dstin converts a
	   destination pointer back to its source address.  Copy the first
	   16 unaligned bytes, then continue with aligned chunks whose
	   stores overlap this head copy.  dst mirrors the aligned src.  */
121	sub	tmp, srcin, dstin
122	ldr	dataq2, [srcin]
123	sub	dst, src, tmp
124	str	dataq2, [dstin]
125L(loop):
	/* Main loop, 32 bytes per iteration.  On entry dataq holds the
	   chunk at [src], already known or about to be checked for NUL;
	   each store trails its NUL check so a chunk containing the
	   terminator is never stored here.  umaxp folds the comparison to
	   64 bits purely to test for any match; the exact position is
	   recomputed with shrn at L(loopend).  */
126	str	dataq, [dst], 32
127	ldr	dataq, [src, 16]
128	cmeq	vhas_nul.16b, vdata.16b, 0
129	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
130	fmov	synd, dend
131	cbnz	synd, L(loopend)
132	str	dataq, [dst, -16]
133	ldr	dataq, [src, 32]!
134	cmeq	vhas_nul.16b, vdata.16b, 0
135	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
136	fmov	synd, dend
137	cbz	synd, L(loop)
	/* Keep dst in step so that, at L(loopend), the NUL-containing
	   chunk's destination is dst - 16 on both loop exits.  */
138	add	dst, dst, 16
139L(loopend):
	/* Locate the NUL inside the final chunk, then finish with one
	   overlapping 16-byte copy whose last byte is the NUL itself.  */
140	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
141	fmov	synd, dend
142	sub	dst, dst, 31
143#ifndef __AARCH64EB__
144	rbit	synd, synd
145#endif
146	clz	len, synd
147	lsr	len, len, 2
148	add	dst, dst, len
	/* [dst, tmp] is the matching source address (tmp = srcin - dstin).  */
149	ldr	dataq, [dst, tmp]
150	str	dataq, [dst]
	/* The copied NUL is the last byte stored, at dst + 15.  */
151	IFSTPCPY (add result, dst, 15)
152	ret
153
154END (STRCPY)
155