xref: /freebsd/lib/libc/aarch64/string/strlcpy.S (revision 756b7fc80837567d114a3c93e9bb987e219a1b23)
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2024 Getz Mikalsen <getz@FreeBSD.org>
 */
6
#include <machine/asm.h>

	/*
	 * The implementation is defined as __strlcpy; strlcpy is made a
	 * weak symbol with the same value.
	 */
	.weak strlcpy
	.set strlcpy, __strlcpy
	.text
12
/*
 * size_t __strlcpy(char *dst, const char *src, size_t size)
 *
 * In:	x0 = dst, x1 = src, x2 = size of the dst buffer
 * Out:	x0 = length of src, not counting the terminating NUL
 *
 * Copies at most size - 1 bytes from src to dst and NUL terminates dst.
 * If size is 0 nothing is written and the call is forwarded to strlen(src).
 *
 * The source is examined in 16-byte chunks starting at src rounded down
 * to a 16-byte boundary, so bytes outside the string but inside those
 * chunks are read.
 *
 * NUL detection: cmeq against #0 followed by shrn #4 yields a 64-bit mask
 * with four bits per byte; rbit + clz + lsr #2 turns it into the index of
 * the first NUL byte in the chunk (16 if there is none).
 *
 * Register roles:
 *	x9	dst as passed in
 *	x10	src rounded down to 16 bytes, then the current chunk pointer
 *	x11	src & 0xf (offset of src inside the first chunk)
 *	x2	limit (size - 1), reduced as chunks are consumed
 *	x0	running dst pointer, finally the return value
 *
 * Clobbers: x1, x2, x4-x12, x16, x17, v1-v3, flags.
 */
ENTRY(__strlcpy)
	subs	x2, x2, #1		// x2 = limit = size - 1
	b.lo	.L0			// size == 0: must not write anything

	mov	x9, x0			// stash copy of dst pointer
	bic	x10, x1, #0xf		// src aligned
	and	x11, x1, #0xf		// src offset

	ldr	q1, [x10]
	cmeq	v1.16b, v1.16b, #0	// NUL found in head?

	mov	x8, #-1			// fill register with 0xfff..fff
	lsl	x12, x11, #2		// four mask bits per byte
	lsl	x8, x8, x12		// mask of bytes in the string

	shrn	v1.8b, v1.8h, #4
	fmov	x5, d1

	ands	x5, x5, x8		// drop matches located before src
	b.ne	.Lhead_nul

	ldr	q3, [x10, #16]		// load second string chunk
	ldr	q2, [x1]		// load true head
	mov	x8, #32
	sub	x8, x8, x11		// string bytes in head + second chunk

	cmeq	v1.16b, v3.16b, #0	// NUL found in second chunk?

	subs	x2, x2, x8
	b.ls	.Lhead_buf_end		// limit lies within the first two chunks

	/* process second chunk */
	shrn	v1.8b, v1.8h, #4
	fmov	x5, d1
	cbnz	x5, .Lsecond_nul

	/* string didn't end in second chunk and neither did buffer */
	ldr	q1, [x10, #32]		// load next string chunk
	str	q2, [x0]		// deposit head into buffer
	sub	x0, x0, x11		// make x0 correspond to the aligned src
	str	q3, [x0, #16]		// deposit second chunk
	add	x10, x10, #32		// advance src
	add	x0, x0, #32		// advance dst
	subs	x2, x2, #16		// enough left for another round?
	b.ls	.Lbuf_end_undo

	/*
	 * Main loop, unrolled twice.  On entry q1 holds the chunk at x10,
	 * x0 is the matching dst position and x2 is the room left after
	 * that chunk.
	 */
	.p2align 4
.Lloop:
	cmeq	v2.16b, v1.16b, #0	// NUL found in this chunk?
	shrn	v2.8b, v2.8h, #4
	fmov	x5, d2

	cbnz	x5, .Lstr_end

	str	q1, [x0]
	ldr	q1, [x10, #16]		// load next chunk

	cmp	x2, #16			// more than a full chunk left?
	b.ls	.Lbuf_end

	add	x10, x10, #32		// advance pointers
	add	x0, x0, #32

	cmeq	v2.16b, v1.16b, #0	// NUL found in this chunk?
	shrn	v2.8b, v2.8h, #4
	fmov	x5, d2
	cbnz	x5, .Lstr_end_undo	// process chunk if match

	str	q1, [x0, #-16]
	ldr	q1, [x10]		// load next chunk

	subs	x2, x2, #32
	b.hi	.Lloop

.Lbuf_end_undo:
	sub	x10, x10, #16		// undo second advancement
	add	x2, x2, #16
	sub	x0, x0, #16

	/*
	 * 1--16 bytes left in the buffer but string has not ended yet.
	 * q1 holds the chunk at x10 + 16 and x2 is the room left counted
	 * from the start of that chunk.
	 */
.Lbuf_end:
	cmeq	v2.16b, v1.16b, #0	// NUL found in this chunk?
	shrn	v2.8b, v2.8h, #4
	fmov	x4, d2

	mov	x6, #0xf
	mov	x7, x4			// keep the real NUL mask for the length scan

	lsl	x5, x2, #2		// shift 0xf to the limits position
	lsl	x5, x6, x5
	cmp	x2, #16			// dont induce match if limit >=16:
	csel	x5, x5, xzr, lo		// a shift by 64 would wrap around to 0
	orr	x8, x4, x5		// treat limit as if terminator present

	rbit	x8, x8			// simulate x86 tzcnt
	clz	x8, x8			// index of mismatch
	lsr	x8, x8, #2

	add	x0, x0, x8

	ldr	q1, [x10, x8]		// load the 16 bytes ending at the cut-off
	str	q1, [x0]		// store tail
	strb	wzr, [x0, #16]		// terminate dst

	/* continue to find the end of the string */
	cbnz	x7, .Llen_prev		// NUL is already in the chunk just examined

	/* we opt for a simpler strlen than the one in libc as the
	 * cmeq, shrn approach is faster for shorter strings.
	 */
	.p2align 4
.Llen_loop:
	ldr	q1, [x10, #32]
	cmeq	v1.16b, v1.16b, #0	// bytewise compare against NUL
	shrn	v1.8b, v1.8h, #4
	fmov	x7, d1
	cbnz	x7, .Llen_found

	ldr	q1, [x10, #48]
	cmeq	v1.16b, v1.16b, #0	// bytewise compare against NUL
	shrn	v1.8b, v1.8h, #4
	fmov	x7, d1
	add	x10, x10, #32
	cbz	x7, .Llen_loop

.Llen_prev:
	sub	x10, x10, #16		// the matching chunk is at x10 + 16
.Llen_found:
	rbit	x8, x7
	clz	x8, x8			// index of mismatch
	lsr	x8, x8, #2

	sub	x10, x10, x1
	add	x0, x10, #32
	add	x0, x0, x8		// x0 = address of NUL - src

	ret

.Lstr_end_undo:
	sub	x10, x10, #16		// undo second advancement
	sub	x0, x0, #16		// undo second advancement

	/* string has ended but buffer has not */
.Lstr_end:
	rbit	x8, x5
	clz	x8, x8			// index of mismatch
	lsr	x8, x8, #2

	add	x0, x0, x8		// dst position of the NUL
	add	x10, x10, x8		// src position of the NUL

	ldr	q1, [x10, #-15]		// last 16 bytes, NUL included
	str	q1, [x0, #-15]
	sub	x0, x10, x1		// return the string length

	ret

	/* limit reached within head or second chunk, no NUL in the head */
.Lhead_buf_end:
	shrn	v1.8b, v1.8h, #4
	fmov	x8, d1			// NUL mask of the second chunk

	add	x2, x2, #32		// x2 = limit + src offset

	mov	x7, x8			// keep the NUL mask for the length scan

	cmp	x2, #16			// does the limit fall into the head?
	b.lo	.Lhead_limit

	rbit	x8, x8
	clz	x8, x8			// index of mismatch
	lsr	x8, x8, #2
	add	x8, x8, #16		// make it relative to the aligned head

	cmp	x8, x2
	csel	x8, x8, x2, lo		// copy min(buflen, srclen) bytes
	b	.Lhead_cnt

.Lhead_limit:
	mov	x8, x2			// the head holds no NUL: limit decides
.Lhead_cnt:
	sub	x8, x8, x11		// x8 = number of bytes to copy
	strb	wzr, [x9, x8]		// terminate dst

	/* continue to find the end of the string */
	cbnz	x7, .Lhead_len_prev	// NUL is in the second chunk

	/* we opt for a simpler strlen than the one in libc as the
	 * cmeq, shrn approach is faster for shorter strings.
	 */
	.p2align 4
.Lhead_len_loop:
	ldr	q1, [x10, #32]
	cmeq	v1.16b, v1.16b, #0	// bytewise compare against NUL
	shrn	v1.8b, v1.8h, #4
	fmov	x7, d1
	cbnz	x7, .Lhead_len_found

	ldr	q1, [x10, #48]
	cmeq	v1.16b, v1.16b, #0	// bytewise compare against NUL
	shrn	v1.8b, v1.8h, #4
	fmov	x7, d1
	add	x10, x10, #32
	cbz	x7, .Lhead_len_loop

.Lhead_len_prev:
	sub	x10, x10, #16		// the matching chunk is at x10 + 16
.Lhead_len_found:
	rbit	x6, x7
	clz	x6, x6			// index of mismatch
	lsr	x6, x6, #2

	sub	x10, x10, x1
	add	x0, x10, #32
	add	x0, x0, x6		// return the string length

	add	x4, x9, x8		// dst + cnt
	add	x5, x1, x8		// src + cnt

	b	.L1732

.Lsecond_nul:
	add	x2, x2, x8		// restore limit

	rbit	x8, x5
	clz	x8, x8			// index of mismatch
	lsr	x5, x8, #2

	sub	x8, x11, #16
	sub	x0, x5, x8		// string length

	cmp	x0, x2			// did we match or hit limit first?
	csel	x8, x2, x0, hi

	add	x4, x9, x8		// dst + cnt
	add	x5, x1, x8		// src + cnt

	strb	wzr, [x4]

	/*
	 * Copy x8 bytes from x1 to x9 with two possibly overlapping
	 * accesses: one at the start and one ending at src + cnt (x5)
	 * resp. dst + cnt (x4).
	 */

	/* copy 16-32 bytes */
.L1732:
	cmp	x8, #16
	b.lo	.L0816
	ldp	x16, x17, [x1]
	ldp	x12, x1, [x5, #-16]	// src is not needed any more
	stp	x16, x17, [x9]
	stp	x12, x1, [x4, #-16]
	ret

.Lhead_nul:
	rbit	x8, x5
	clz	x8, x8			// index of mismatch
	lsr	x8, x8, #2

	sub	x0, x8, x11		// string length
	cmp	x0, x2
	csel	x8, x2, x0, hi		// cnt = min(length, limit)

	add	x4, x9, x8		// dst + cnt
	add	x5, x1, x8		// src + cnt
	strb	wzr, [x4]

	/* Copy 8-15 bytes */
.L0816:
	tbz	x8, #3, .L0407
	ldr	x16, [x1]
	ldr	x17, [x5, #-8]
	str	x16, [x9]
	str	x17, [x4, #-8]
	ret

	/* Copy 4-7 bytes */
	.p2align 4
.L0407:
	cmp	x8, #3
	b.ls	.L0203
	ldr	w16, [x1]
	ldr	w17, [x5, #-4]		// x18 is the platform register: keep off it
	str	w16, [x9]
	str	w17, [x4, #-4]
	ret

	/* Copy 2-3 bytes */
.L0203:
	tbz	x8, 1, .L0001
	ldrh	w16, [x1]
	ldrh	w17, [x5, #-2]
	strh	w16, [x9]
	strh	w17, [x4, #-2]
	ret

	/* Copy 0-1 bytes; the NUL store comes last so that it wins for cnt == 0 */
.L0001:
	ldrb	w16, [x1]
	strb	w16, [x9]
	strb	wzr, [x4]
	ret

	/* size == 0: only the length of src is wanted */
.L0:
	mov	x0, x1
	b	strlen			// tail call, strlen returns to our caller
END(__strlcpy)
317