/* xref: /freebsd/lib/libc/aarch64/string/memccpy.S (revision bad17991c06d684e9053938d00a07b962e2fd31c) */
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2024 Getz Mikalsen <getz@FreeBSD.org>
 */
6
#include <machine/asm.h>

/*
 * void *memccpy(void *dst, const void *src, int c, size_t len)
 *
 * Copy bytes from src to dst, stopping after the first occurrence of
 * the byte c has been copied or after len bytes, whichever comes first.
 *
 * ABI:	AAPCS64, GNU as syntax
 * In:	x0 = dst, x1 = src, w2 = c, x3 = len
 * Out:	x0 = pointer to the byte following the copy of c in dst,
 *	     or NULL if c was not found within the first len bytes
 *	     (also NULL when len == 0)
 * Clobbers: x3-x17, v0-v4, flags
 *
 * Register roles:
 *	x9  = original dst		x10 = src rounded down to 16 bytes
 *	x11 = src & 0xf (misalignment)	x6  = constant 0xf (one mask nibble)
 *	v0  = c replicated into all 16 byte lanes
 *
 * Match masks are built with cmeq + shrn #4, which yields a 64-bit
 * value holding one nibble per source byte; a bit index into such a
 * mask is turned into a byte index with a right shift by 2.
 */
	.weak	memccpy			// public name is a weak alias
	.set	memccpy, __memccpy
	.text

ENTRY(__memccpy)
	subs	x3, x3, #1		// x3 = len - 1
	b.lo	.L0			// len was 0: return NULL

	dup	v0.16b,	w2		// replicate c into every lane

	mov	x9, x0			// stash copy of dst pointer
	bic	x10, x1, #0xf		// src aligned
	and	x11, x1, #0xf		// src offset

	ldr	q1, [x10]		// aligned chunk containing the head
	cmeq	v1.16b, v1.16b, v0.16b	// bytewise compare against c

	mov	x8, #-1			// prepare a 0xfff..fff register
	mov	x6, #0xf

	lsl	x12, x11, #2		// 4 mask bits per byte
	lsl	x8, x8, x12		// mask of bytes in the string

	shrn	v1.8b, v1.8h, #4	// narrow to one nibble per byte
	fmov	x5, d1

	sub	x12, x11, #32
	adds	x12, x12, x3		// distance from alignment boundary - 32
	b.cc	.Lrunt			// buffer ends within the first two chunks

	ands	x8, x8, x5		// drop matches in front of the string
	b.eq	0f

	/* match in first chunk */
	rbit	x8, x8
	clz	x8, x8			// bit index of first match
	lsr	x8, x8, #2		// ... as a byte index

	sub	x8, x8, x11		// ... from beginning of the string

	add	x0, x0, x8
	add	x4, x9, x8		// dst + cnt
	add	x5, x1, x8		// src + cnt
	add	x0, x0, #1		// return value: one past the match

	b	.L0816

0:
	ldr	q3,	[x10, #16]	// load second string chunk
	ldr	q2,	[x1]		// load true head
	cmeq	v1.16b, v3.16b, v0.16b	// char found in second chunk?

	/* process second chunk */
	shrn	v1.8b, v1.8h, #4
	fmov	x5, d1

	cbz	x5, 0f

	/* match in second chunk */
	rbit	x8, x5
	clz	x8, x8			// bit index of first match
	lsr	x8, x8, #2		// ... as a byte index

	sub	x11, x11, #16
	sub	x8, x8, x11		// adjust for alignment offset
	add	x0, x0, x8		// return value
	add	x0, x0, #1

	add	x4, x9, x8		// dst + cnt
	add	x5, x1, x8		// src + cnt
	b	.L1732

0:
	/* string didn't end in second chunk and neither did buffer */
	ldr	q1,	[x10, #32]	// load next string chunk
	str	q2,	[x0]		// deposit head into buffer
	sub	x0, x0, x11		// adjust x0 so it tracks aligned src
	mov	x3, x12			// bytes left beyond the first two chunks
	str	q3,	[x0, #16]	// deposit second chunk

	add	x10, x10, #32		// advance src
	add	x0, x0, #32		// advance dst
	subs	x3, x3, #16		// enough left for another round?
	b.lo	1f

	/* main loop unrolled twice */
	.p2align 4
0:
	cmeq	v2.16b, v1.16b, v0.16b	// char found in current chunk?
	shrn	v2.8b, v2.8h, #4
	fmov	x5, d2

	cbnz	x5, 3f

	str	q1, [x0]
	ldr	q1, [x10, #16]		// load next chunk

	cmp	x3, #16			// more than a full chunk left?
	b.lo	2f

	add	x10, x10, #32		// advance pointers
	add	x0, x0, #32

	cmeq	v2.16b, v1.16b, v0.16b	// char found in current chunk?
	shrn	v2.8b, v2.8h, #4
	fmov	x5, d2
	cbnz	x5, 4f			// process chunk if match

	str	q1, [x0, #-16]
	ldr	q1, [x10]		// load next chunk

	subs	x3, x3, #32
	b.hs	0b

1:
	sub	x10, x10, #16		// undo second advancement
	add	x3, x3, #16
	sub	x0, x0, #16

	/* 1--16 bytes left in the buffer but string has not ended yet */
2:
	cmeq	v2.16b, v1.16b, v0.16b	// char found in last chunk?
	shrn	v2.8b, v2.8h, #4
	fmov	x4, d2

	lsl	x5, x3, #2		// shift 0xf to the limits position
	lsl	x5, x6, x5
	orr	x8, x4, x5		// insert match in mask at limit

	rbit	x8, x8			// simulate x86 tzcnt
	clz	x7, x8			// bit index of first match or limit
	lsr	x8, x7, #2		// ... as a byte index

	lsl	x5, x6, x7		// simulate x86 bt with shifted 0xf

	add	x8, x8, #1
	add	x0, x0, x8

	ldr	q1, [x10, x8]		// load tail (16 bytes ending at the stop byte)
	str	q1, [x0]		// store tail

	add	x0, x0, #16		// one past the last byte written

	tst	x4, x5			// terminator encountered inside buffer?
	csel	x0, x0, xzr, ne		// if yes, return pointer, else NULL
	ret

4:
	sub	x10, x10, #16		// undo second advancement
	sub	x0, x0, #16		// undo second advancement

3:
	rbit	x8, x5
	clz	x8, x8			// bit index of first match
	lsr	x3, x8, #2		// ... as a byte index

	add	x0, x0, x3		// dst of the matching byte
	add	x10, x10, x3		// src of the matching byte
	ldr	q1, [x10, #-15]		// 16 bytes ending at the match
	str	q1, [x0, #-15]
	add	x0, x0, #1		// return value: one past the match
	ret

	/* buffer ends within the first two aligned chunks */
.Lrunt:
	add	x13, x11, x3		// offset of the last buffer byte

	mov	x7, x5			// keep a copy of original match mask

	lsl	x4, x12, #2		// shift 0xf to the limits position
	lsl	x4, x6, x4

	cmp	x13, #16		// dont induce match if limit >=16
	csel	x4, x4, xzr, lo
	orr	x5, x5, x4		// insert match in mask at limit

	ands	x8, x8, x5		// if match always fall through
	b.ne	0f

	ldr	q4,	[x10, #16]	// load second string chunk
	cmeq	v1.16b, v4.16b, v0.16b	// char found in second chunk?

	/* process second chunk */
	shrn	v1.8b, v1.8h, #4
	fmov	x8, d1
	mov	x7, x8			// keep the real matches for the check below

	lsl	x4, x12, #2
	lsl	x4, x6, x4
	orr	x8, x8, x4		// induce match in upper bytes of mask

	rbit	x8, x8
	clz	x4, x8			// bit index of first match or limit
	lsr	x8, x4, #2		// ... as a byte index
	add	x8, x8, #16		// no match in first chunk
	b	1f

0:
	rbit	x8, x8
	clz	x4, x8			// bit index of first match or limit
	lsr	x8, x4, #2		// ... as a byte index
1:
	add	x0, x0, x8		// return value if terminator was found
	sub	x0, x0, x11
	add	x0, x0, #1

	/* check if we encountered a match or the limit first */
	lsl	x5, x6, x4
	ands	x7, x7, x5		// was the terminator present?
	csel	x0, xzr, x0, eq		// NULL if we only hit the limit

	sub	x8, x8, x11		// index of the last byte to copy
	add	x4, x9, x8		// dst + cnt
	add	x5, x1, x8		// src + cnt

	/*
	 * Small copies.  On entry: x8 = index of the last byte to copy,
	 * x1/x9 = src/dst start, x5/x4 = src/dst address of the last byte.
	 * Each case copies the head and an overlapping tail.
	 */

	/* copy 17-32 bytes */
.L1732:
	cmp	x8, #16
	b.lo	.L0816
	add	x5, x5, #1		// ldp offsets must be multiples of 8,
	add	x4, x4, #1		// so bias the end pointers by one
	ldp	x16, x17, [x1]
	ldp	x12, x13, [x5, #-16]
	stp	x16, x17, [x9]
	stp	x12, x13, [x4, #-16]
	ret

	/* Copy 9-16 bytes (last index 8-15) */
.L0816:
	tbz	x8, #3, .L0407
	ldr	x16, [x1]
	ldr	x17, [x5, #-7]
	str	x16, [x9]
	str	x17, [x4, #-7]
	ret

	/* Copy 4-8 bytes (last index 3-7) */
	.p2align 4
.L0407:
	cmp	x8, #3
	b.lo	.L0103
	ldr	w16, [x1]
	ldr	w17, [x5, #-3]		// w17, not w18: x18 is the platform register
	str	w16, [x9]
	str	w17, [x4, #-3]
	ret

	/* Copy 1-3 bytes (last index 0-2) */
	.p2align 4
.L0103:
	lsr	x14, x8, #1		// index of the middle byte
	ldrb	w16, [x1]
	ldrb	w15, [x5]
	ldrb	w17, [x1, x14]		// w17, not w18: x18 is the platform register
	strb	w16, [x9]
	strb	w17, [x9, x14]
	strb	w15, [x4]
	ret

	/* len == 0: nothing is copied */
.L0:
	mov	x0, xzr			// return NULL
	ret

END(__memccpy)

272