xref: /freebsd/lib/libc/aarch64/string/memccpy.S (revision bad17991c06d684e9053938d00a07b962e2fd31c)
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2024 Getz Mikalsen <getz@FreeBSD.org>
 */
6*bad17991SGetz Mikalsen
#include <machine/asm.h>

	/*
	 * Export memccpy as a weak alias of __memccpy, the implementation
	 * defined below, so that another definition of memccpy can take
	 * precedence at link time.
	 */
	.weak	memccpy
	.set	memccpy, __memccpy
	.text
12*bad17991SGetz Mikalsen
/*
 * void *__memccpy(void *dst, const void *src, int c, size_t len)
 *
 * Copy bytes from src to dst, stopping after the first byte equal to c
 * has been copied or after len bytes, whichever comes first.
 *
 * In:	x0 = dst, x1 = src, w2 = c, x3 = len
 * Out:	x0 = pointer to the byte in dst following the copy of c,
 *	     or NULL if c was not found within the first len bytes
 *
 * Register roles set up in the prologue:
 *	x3  = len - 1 (index of the last byte that may be copied)
 *	x6  = 0xf, a one-nibble mask shifted around to mark positions
 *	x9  = original dst
 *	x10 = src rounded down to a 16 byte boundary
 *	x11 = src & 0xf (offset of src within its aligned chunk)
 *	v0  = c replicated into all 16 byte lanes
 *
 * Match masks: cmeq yields 0xff in every lane equal to c; shrn #4 then
 * narrows each 16 bit pair to 8 bits, giving a 64 bit mask with one
 * nibble per source byte.  rbit + clz counts trailing zero bits of that
 * mask, and shifting the count right by 2 converts it to a byte index.
 *
 * Only the scratch registers x4-x17 and v0-v4 are used; x18 (the AAPCS64
 * platform register) is deliberately left untouched.
 */
ENTRY(__memccpy)
	subs	x3, x3, #1		// x3 = len - 1
	b.lo	.L0			// len == 0: return NULL

	dup	v0.16b,	w2		// broadcast c into every lane

	mov	x9, x0			// stash copy of dst pointer
	bic	x10, x1, #0xf		// src aligned
	and	x11, x1, #0xf		// src offset

	ldr	q1, [x10]
	cmeq	v1.16b, v1.16b, v0.16b	// bytewise compare against c

	mov	x8, #-1			// prepare a 0xfff..fff register
	mov	x6, #0xf

	lsl	x12, x11, #2
	lsl	x8, x8, x12		// mask of bytes in the string

	shrn	v1.8b, v1.8h, #4
	fmov	x5, d1

	sub	x12, x11, #32
	adds	x12, x12, x3		// distance from alignment boundary - 32
	b.cc	.Lrunt			// buffer ends within the first two chunks

	ands	x8, x8, x5		// drop matches located before src
	b.eq	0f

	/* match in first chunk */
	rbit	x8, x8
	clz	x8, x8			// bit index of first match
	lsr	x8, x8, #2		// ... as a byte index in the chunk

	sub	x8, x8, x11		// ... from beginning of the string

	add	x0, x0, x8
	add	x4, x9, x8		// dst + cnt
	add	x5, x1, x8		// src + cnt
	add	x0, x0, #1		// return value: one past the copied c

	b	.L0816

0:
	ldr	q3,	[x10, #16]	// load second string chunk
	ldr	q2,	[x1]		// load true head
	cmeq	v1.16b, v3.16b, v0.16b	// char found in second chunk?

	/* process second chunk */
	shrn	v1.8b, v1.8h, #4
	fmov	x5, d1

	cbz	x5, 0f

	/* match in second chunk */
	rbit	x8, x5
	clz	x8, x8			// bit index of first match
	lsr	x8, x8, #2		// ... as a byte index in the chunk

	sub	x11, x11, #16
	sub	x8, x8, x11		// adjust for alignment offset
	add	x0, x0, x8		// return value
	add	x0, x0, #1

	add	x4, x9, x8
	add	x5, x1, x8
	b	.L1732

0:
	/* string didn't end in second chunk and neither did buffer */
	ldr	q1,	[x10, #32]	// load next string chunk
	str	q2,	[x0]		// deposit head into buffer
	sub	x0, x0, x11		// adjust x0 to track the aligned src
	mov	x3, x12
	str	q3,	[x0, #16]	// deposit second chunk

	add	x10, x10, #32		// advance src
	add	x0, x0, #32		// advance dst
	subs	x3, x3, #16		// enough left for another round?
	b.lo	1f

	/* main loop unrolled twice */
	.p2align 4
0:
	cmeq	v2.16b, v1.16b, v0.16b	// char found in this chunk?
	shrn	v2.8b, v2.8h, #4
	fmov	x5, d2

	cbnz	x5, 3f

	str	q1, [x0]
	ldr	q1, [x10, #16]		// load next chunk

	cmp	x3, #16			// more than a full chunk left?
	b.lo	2f

	add	x10, x10, #32		// advance pointers
	add	x0, x0, #32

	cmeq	v2.16b, v1.16b, v0.16b	// char found in second chunk?
	shrn	v2.8b, v2.8h, #4
	fmov	x5, d2
	cbnz	x5, 4f			// process chunk if match

	str	q1, [x0, #-16]
	ldr	q1, [x10]		// load next chunk

	subs	x3, x3, #32
	b.hs	0b

1:
	sub	x10, x10, #16		// undo second advancement
	add	x3, x3, #16
	sub	x0, x0, #16

	/* 1--16 bytes left in the buffer but string has not ended yet */
2:
	cmeq	v2.16b, v1.16b, v0.16b	// char found in final chunk?
	shrn	v2.8b, v2.8h, #4
	fmov	x4, d2			// x4 = genuine matches only

	lsl	x5, x3, #2		// shift 0xf to the limits position
	lsl	x5, x6, x5
	orr	x8, x4, x5		// insert match in mask at limit

	rbit	x8, x8			// simulate x86 tzcnt
	clz	x7, x8			// bit index of first match or limit
	lsr	x8, x7, #2		// ... as a byte index in the chunk

	lsl	x5, x6, x7		// simulate x86 bt with shifted 0xf

	add	x8, x8, #1
	add	x0, x0, x8

	ldr	q1, [x10, x8]		// load tail
	str	q1, [x0]		// store tail

	add	x0, x0, #16

	tst	x4, x5			// terminator encountered inside buffer?
	csel	x0, x0, xzr, ne		// if yes, return pointer, else NULL
	ret

4:
	sub	x10, x10, #16		// undo second advancement
	sub	x0, x0, #16		// undo second advancement

3:
	rbit	x8, x5
	clz	x8, x8			// bit index of first match
	lsr	x3, x8, #2		// ... as a byte index in the chunk

	add	x0, x0, x3		// restore dst pointer
	add	x10, x10, x3
	ldr	q1, [x10, #-15]		// 16 bytes ending at the matched byte
	str	q1, [x0, #-15]
	add	x0, x0, #1		// return value: one past the copied c
	ret

.Lrunt:
	add	x13, x11, x3		// offset of the limit from aligned src

	mov	x7, x5			// keep a copy of original match mask

	lsl	x4, x12, #2		// shift 0xf to the limits position
	lsl	x4, x6, x4

	cmp	x13, #16		// dont induce match if limit >=16
	csel	x4, x4, xzr, lo
	orr	x5, x5, x4		// insert match in mask at limit

	ands	x8, x8, x5		// if match always fall through
	b.ne	0f

	ldr	q4,	[x10, #16]	// load second string chunk
	cmeq	v1.16b, v4.16b, v0.16b	// char found in second chunk?

	/* process second chunk */
	shrn	v1.8b, v1.8h, #4
	fmov	x8, d1
	mov	x7, x8			// keep a copy of the genuine matches

	lsl	x4, x12, #2
	lsl	x4, x6, x4
	orr	x8, x8, x4		// induce match in upper bytes of mask

	rbit	x8, x8
	clz	x4, x8			// bit index of first match or limit
	lsr	x8, x4, #2		// ... as a byte index in the chunk
	add	x8, x8, #16		// no match in first chunk
	b	1f

0:
	rbit	x8, x8
	clz	x4, x8			// bit index of first match or limit
	lsr	x8, x4, #2		// ... as a byte index in the chunk
1:
	add	x0, x0, x8		// return value if terminator was found
	sub	x0, x0, x11
	add	x0, x0, #1

	/* check if we encountered a match or the limit first */
	lsl	x5, x6, x4
	ands	x7, x7, x5		// was the terminator present?
	csel	x0, xzr, x0, eq		// return value based on what we matched

	sub	x8, x8, x11		// x8 = index of last byte to copy
	add	x4, x9, x8		// dst + cnt
	add	x5, x1, x8		// src + cnt

	/*
	 * Tail copies.  On entry to each of the labels below:
	 *	x8 = index of the last byte to copy (byte count - 1)
	 *	x4 = dst + x8, x5 = src + x8
	 * Each case copies an overlapping head and tail piece.
	 */

	/* copy 17-32 bytes */
.L1732:
	cmp	x8, #16
	b.lo	.L0816
	add	x5, x5, #1		// ldp/stp immediates must be multiples
	add	x4, x4, #1		// of 8: bias by 1 so #-16 can be used
	ldp	x16, x17, [x1]
	ldp	x12, x13, [x5, #-16]
	stp	x16, x17, [x9]
	stp	x12, x13, [x4, #-16]
	ret

	/* Copy 9-16 bytes (x8 = 8..15) */
.L0816:
	tbz	x8, #3, .L0407
	ldr	x16, [x1]
	ldr	x17, [x5, #-7]
	str	x16, [x9]
	str	x17, [x4, #-7]
	ret

	/* Copy 4-8 bytes (x8 = 3..7) */
	.p2align 4
.L0407:
	cmp	x8, #3
	b.lo	.L0103
	ldr	w16, [x1]
	ldr	w17, [x5, #-3]
	str	w16, [x9]
	str	w17, [x4, #-3]
	ret

	/* Copy 1-3 bytes (x8 = 0..2) */
	.p2align 4
.L0103:
	lsr	x14, x8, #1		// index of the middle byte
	ldrb	w16, [x1]
	ldrb	w15, [x5]
	ldrb	w17, [x1, x14]
	strb	w16, [x9]
	strb	w17, [x9, x14]
	strb	w15, [x4]
	ret

	/* len == 0: nothing is copied */
.L0:
	mov	x0, xzr			// return NULL
	ret

END(__memccpy)
272