/* xref: /freebsd/lib/libc/aarch64/string/strlcpy.S (revision 756b7fc80837567d114a3c93e9bb987e219a1b23) */
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2024 Getz Mikalsen <getz@FreeBSD.org>
 */

#include <machine/asm.h>

/*
 * size_t strlcpy(char *dst, const char *src, size_t dstsize)
 *
 * Copies at most dstsize - 1 bytes of the NUL-terminated string src to
 * dst, stores a terminating NUL whenever dstsize is non-zero, and
 * returns the length of src.
 *
 * In:		x0 = dst, x1 = src, x2 = dstsize
 * Out:		x0 = length of src
 * Clobbers:	x1, x2, x4-x12, x16, x17, v1-v3, condition flags
 *		(plus whatever strlen clobbers when dstsize == 0)
 *
 * Register roles:
 *	x9	original dst
 *	x10	src rounded down to a 16-byte boundary; chunk cursor
 *	x11	src & 0xf, the misalignment of src
 *	x2	payload bytes that still fit; the terminator is
 *		budgeted by the initial "dstsize - 1"
 *
 * NUL detection: cmeq turns every NUL byte of a 16-byte chunk into
 * 0xff, shrn #4 narrows that into a 64-bit mask holding four bits per
 * source byte, and rbit + clz + lsr #2 gives the index of the first
 * NUL byte (16 when the mask is zero).
 *
 * Chunks are loaded from the aligned cursor, so the first chunk may
 * contain bytes in front of src; the mask derived from x11 discards
 * matches found there.
 */
	.weak strlcpy
	.set strlcpy, __strlcpy
	.text

ENTRY(__strlcpy)
	subs	x2, x2, #1		// x2 = dstsize - 1 payload bytes
	b.lo	.L0			// dstsize == 0: only return the length

	mov	x9, x0			// stash copy of dst pointer
	bic	x10, x1, #0xf		// src aligned
	and	x11, x1, #0xf		// src offset

	ldr	q1, [x10]
	cmeq	v1.16b, v1.16b, #0	// NUL found in head?

	mov	x8, #-1			// fill register with 0xfff..fff
	lsl	x12, x11, #2		// four mask bits per byte
	lsl	x8, x8, x12		// mask of bytes in the string

	shrn	v1.8b, v1.8h, #4
	fmov	x5, d1

	ands	x5, x5, x8		// drop matches in front of src
	b.ne	.Lhead_nul

	ldr	q3, [x10, #16]		// load second string chunk
	ldr	q2, [x1]		// load true head
	mov	x8, #32
	sub	x8, x8, x11		// string bytes in the first two chunks

	cmeq	v1.16b, v3.16b, #0	// NUL found in second chunk?

	subs	x2, x2, x8
	b.ls	.Lhead_buf_end		// buffer ends within these chunks

	/* process second chunk */
	shrn	v1.8b, v1.8h, #4
	fmov	x5, d1
	cbnz	x5, .Lsecond_nul

	/* string didn't end in second chunk and neither did buffer */
	ldr	q1, [x10, #32]		// load next string chunk
	str	q2, [x0]		// deposit head into buffer
	sub	x0, x0, x11		// from here on x0 tracks x10
	str	q3, [x0, #16]		// deposit second chunk
	add	x10, x10, #32		// advance src
	add	x0, x0, #32		// advance dst
	subs	x2, x2, #16		// enough left for another round?
	b.ls	.Lbuf_end_rewind

	/*
	 * Main loop, unrolled twice.  At the top q1 holds the chunk at
	 * x10, x0 is the matching dst position and x2 is the remaining
	 * room minus 16 (always > 0 here).
	 */
	.p2align 4
.Lloop:
	cmeq	v2.16b, v1.16b, #0	// NUL found in this chunk?
	shrn	v2.8b, v2.8h, #4
	fmov	x5, d2

	cbnz	x5, .Lstr_end

	str	q1, [x0]
	ldr	q1, [x10, #16]		// load next chunk

	cmp	x2, #16			// more than a full chunk left?
	b.ls	.Lbuf_end

	add	x10, x10, #32		// advance pointers
	add	x0, x0, #32

	cmeq	v2.16b, v1.16b, #0	// NUL found in second chunk?
	shrn	v2.8b, v2.8h, #4
	fmov	x5, d2
	cbnz	x5, .Lstr_end_rewind	// process chunk if match

	str	q1, [x0, #-16]
	ldr	q1, [x10]		// load next chunk

	subs	x2, x2, #32
	b.hi	.Lloop

.Lbuf_end_rewind:
	sub	x10, x10, #16		// undo second advancement
	add	x2, x2, #16
	sub	x0, x0, #16

	/*
	 * 1--16 bytes left in the buffer but string has not ended yet.
	 * Here q1 is the chunk at x10 + 16, its dst position is x0 + 16
	 * and x2 is the remaining room.
	 */
.Lbuf_end:
	cmeq	v2.16b, v1.16b, #0	// NUL found in this chunk?
	shrn	v2.8b, v2.8h, #4
	fmov	x4, d2

	mov	x6, #0xf
	mov	x7, x4			// keep the real NUL mask for the length

	lsl	x5, x2, #2		// shift 0xf to the limits position
	lsl	x5, x6, x5
	cmp	x2, #16			// dont induce match if limit >=16
	csel	x5, x5, xzr, lo
	orr	x8, x4, x5		// treat limit as if terminator present

	rbit	x8, x8			// rbit + clz = count trailing zeros
	clz	x8, x8			// index of mismatch
	lsr	x8, x8, #2

	add	x0, x0, x8

	ldr	q1, [x10, x8]		// load tail: 16 bytes ending at the cut
	str	q1, [x0]		// store tail
	strb	wzr, [x0, #16]		// terminate dst

	/* continue to find the end of the string */
	cbnz	x7, .Llen_rewind	// NUL already seen in this chunk

	/* we opt for a simpler strlen than the one in libc as the
	 * cmeq, shrn approach is faster for shorter strings.
	 */
	.p2align 4
.Llen_loop:
	ldr	q1, [x10, #32]
	cmeq	v1.16b, v1.16b, #0	// bytewise compare against NUL
	shrn	v1.8b, v1.8h, #4
	fmov	x7, d1
	cbnz	x7, .Llen_found

	ldr	q1, [x10, #48]
	cmeq	v1.16b, v1.16b, #0	// bytewise compare against NUL
	shrn	v1.8b, v1.8h, #4
	fmov	x7, d1
	add	x10, x10, #32
	cbz	x7, .Llen_loop

.Llen_rewind:
	sub	x10, x10, #16		// make the NUL chunk sit at x10 + 32
.Llen_found:
	rbit	x8, x7
	clz	x8, x8			// index of mismatch
	lsr	x8, x8, #2

	sub	x10, x10, x1
	add	x0, x10, #32
	add	x0, x0, x8		// length = chunk offset + NUL index

	ret

.Lstr_end_rewind:
	sub	x10, x10, #16		// undo second advancement
	sub	x0, x0, #16		// undo second advancement

	/* string has ended but buffer has not */
.Lstr_end:
	rbit	x8, x5
	clz	x8, x8			// index of mismatch
	lsr	x8, x8, #2

	add	x0, x0, x8		// dst position of the NUL
	add	x10, x10, x8		// src position of the NUL

	ldr	q1, [x10, #-15]		// 16 bytes ending with the NUL
	str	q1, [x0, #-15]
	sub	x0, x10, x1		// length = address of NUL - src

	ret

	/*
	 * Buffer ends within the first two chunks and the first chunk
	 * holds no NUL.  v1 is the cmeq result for the second chunk.
	 */
.Lhead_buf_end:
	shrn	v1.8b, v1.8h, #4
	fmov	x8, d1

	add	x2, x2, #32		// restore limit, relative to x10

	mov	x7, x8			// keep the NUL mask for the length

	cmp	x2, #16			// should we induce a match or not
	b.lo	.Lhead_limit_short

	rbit	x8, x8
	clz	x8, x8			// index of mismatch
	lsr	x8, x8, #2
	add	x8, x8, #16		// NUL position relative to x10

	cmp	x8, x2
	csel	x8, x8, x2, lo		// copy min(buflen, srclen) bytes
	b	.Lhead_terminate

.Lhead_limit_short:
	mov	x8, x2			// limit lies in the NUL-free first chunk

.Lhead_terminate:
	sub	x8, x8, x11		// make the count relative to src
	strb	wzr, [x9, x8]		// terminate dst

	/* continue to find the end of the string */
	cbnz	x7, .Lhead_len_rewind	// NUL already seen in second chunk

	/* we opt for a simpler strlen than the one in libc as the
	 * cmeq, shrn approach is faster for shorter strings.
	 */
	.p2align 4
.Lhead_len_loop:
	ldr	q1, [x10, #32]
	cmeq	v1.16b, v1.16b, #0	// bytewise compare against NUL
	shrn	v1.8b, v1.8h, #4
	fmov	x7, d1
	cbnz	x7, .Lhead_len_found

	ldr	q1, [x10, #48]
	cmeq	v1.16b, v1.16b, #0	// bytewise compare against NUL
	shrn	v1.8b, v1.8h, #4
	fmov	x7, d1
	add	x10, x10, #32
	cbz	x7, .Lhead_len_loop

.Lhead_len_rewind:
	sub	x10, x10, #16		// make the NUL chunk sit at x10 + 32
.Lhead_len_found:
	rbit	x6, x7
	clz	x6, x6			// index of mismatch
	lsr	x6, x6, #2

	sub	x10, x10, x1
	add	x0, x10, #32
	add	x0, x0, x6		// return value: length of src

	add	x4, x9, x8		// dst + cnt
	add	x5, x1, x8		// src + cnt

	b	.L1732

	/* NUL in the second chunk and the buffer extends past both chunks */
.Lsecond_nul:
	add	x2, x2, x8		// restore limit (dstsize - 1)

	rbit	x8, x5
	clz	x8, x8			// index of mismatch
	lsr	x5, x8, #2

	sub	x8, x11, #16
	sub	x0, x5, x8		// string length

	cmp	x0, x2			// did we match or hit limit first?
	csel	x8, x2, x0, hi

	add	x4, x9, x8		// dst + cnt
	add	x5, x1, x8		// src + cnt

	strb	wzr, [x4]

	/*
	 * Short copies: x8 = cnt, x1/x9 = start of src/dst, x5/x4 = one
	 * past the last byte to copy; x0 already holds the return value.
	 * Each case copies a block from the start and a possibly
	 * overlapping block that ends at cnt.
	 */

	/* Copy 16-32 bytes */
.L1732:
	cmp	x8, #16
	b.lo	.L0816
	ldp	x16, x17, [x1]
	ldp	x12, x1, [x5, #-16]	// x1 is dead from here on
	stp	x16, x17, [x9]
	stp	x12, x1, [x4, #-16]
	ret

	/* NUL in the first chunk */
.Lhead_nul:
	rbit	x8, x5
	clz	x8, x8			// index of mismatch
	lsr	x8, x8, #2

	sub	x0, x8, x11		// string length
	cmp	x0, x2
	csel	x8, x2, x0, hi		// cnt = min(length, limit)

	add	x4, x9, x8		// dst + cnt
	add	x5, x1, x8		// src + cnt
	strb	wzr, [x4]

	/* Copy 8-15 bytes */
.L0816:
	tbz	x8, #3, .L0407
	ldr	x16, [x1]
	ldr	x17, [x5, #-8]
	str	x16, [x9]
	str	x17, [x4, #-8]
	ret

	/* Copy 4-7 bytes */
	.p2align 4
.L0407:
	cmp	x8, #3
	b.ls	.L0203
	/*
	 * x18 is the platform register under AAPCS64, so only the
	 * x16/x17 scratch pair is used here.
	 */
	ldr	w16, [x1]
	ldr	w17, [x5, #-4]
	str	w16, [x9]
	str	w17, [x4, #-4]
	ret

	/* Copy 2-3 bytes */
.L0203:
	tbz	x8, 1, .L0001
	ldrh	w16, [x1]
	ldrh	w17, [x5, #-2]
	strh	w16, [x9]
	strh	w17, [x4, #-2]
	ret

	/* Copy 0-1 bytes; the final store re-terminates dst at cnt */
.L0001:
	ldrb	w16, [x1]
	strb	w16, [x9]
	strb	wzr, [x4]
	ret

	/* dstsize == 0: nothing may be written, return strlen(src) */
.L0:
	mov	x0, x1
	b	strlen			// tail call
END(__strlcpy)