xref: /freebsd/lib/libc/aarch64/string/strcmp.S (revision 5e7d93a604400ca3c9db3be1df82ce963527740c)
1/*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2024 Getz Mikalsen <getz@FreeBSD.org>
5*/
6
7#include <machine/asm.h>
8#include <machine/param.h>
9
10	.weak	strcmp
11	.set	strcmp, __strcmp
12	.text
13
14ENTRY(__strcmp)
15
16	bic	x8, x0, #0xf			// x0 aligned to the boundary
17	and	x9, x0, #0xf			// x9 is the offset
18	bic	x10, x1, #0xf			// x1 aligned to the boundary
19	and	x11, x1, #0xf			// x11 is the offset
20
21	mov	x13, #-1
22
23	/*
24	 * Check if either string is located at end of page to avoid crossing
25	 * into unmapped page. If so, we load 16 bytes from the nearest
26	 * alignment boundary and shift based on the offset.
27	 */
28
29	add	x3, x0, #16			// end of head
30	add	x4, x1, #16
31	eor	x3, x3, x0
32	eor	x4, x4, x1			// bits that changed
33	orr	x3, x3, x4			// in either str1 or str2
34	tbz	w3, #PAGE_SHIFT, .Lbegin
35
36	ldr	q0, [x8]			// load aligned head
37	ldr	q2, [x10]
38
39	lsl	x14, x9, #2
40	lsl	x15, x11, #2
41	lsl	x3, x13, x14			// string head
42	lsl	x4, x13, x15
43
44	cmeq	v5.16b, v0.16b, #0
45	cmeq	v6.16b, v2.16b, #0
46
47	shrn	v5.8b, v5.8h, #4
48	shrn	v6.8b, v6.8h, #4
49	fmov	x5, d5
50	fmov	x6, d6
51
52	adrp	x2, shift_data
53	add	x2, x2, :lo12:shift_data
54
55	/* heads may cross page boundary, avoid unmapped loads */
56	tst	x5, x3
57	b.eq	0f
58
59	ldr	q4, [x2, x9]			// load permutation table
60	tbl	v0.16b, {v0.16b}, v4.16b
61
62	b		1f
63	.p2align 4
640:
65	ldr	q0, [x0]			// load true head
661:
67	tst	x6, x4
68	b.eq	0f
69
70	ldr	q4, [x2, x11]
71	tbl	v4.16b, {v2.16b}, v4.16b
72
73	b 1f
74
75	.p2align 4
76.Lbegin:
77	ldr	q0, [x0]			// load true heads
780:
79	ldr	q4, [x1]
801:
81
82	cmeq	v2.16b, v0.16b, #0		// NUL byte present?
83	cmeq	v4.16b, v0.16b, v4.16b		// which bytes match?
84
85	orn	v2.16b, v2.16b, v4.16b		// mismatch or NUL byte?
86
87	shrn	v2.8b, v2.8h, #4
88	fmov	x5, d2
89
90	cbnz	x5, .Lhead_mismatch
91
92	ldr	q2, [x8, #16]			// load second chunk
93	ldr	q3, [x10, #16]
94	subs	x9, x9, x11			// is a&0xf >= b&0xf
95	b.lo	.Lswapped			// if not swap operands
96	sub	x12, x10, x9
97	ldr	q0, [x12, #16]!
98	sub	x10, x10, x8
99	sub	x11, x10, x9
100
101	cmeq	v1.16b, v3.16b, #0
102	cmeq	v0.16b, v0.16b, v2.16b
103	add	x8, x8, #16
104	shrn	v1.8b, v1.8h, #4
105	fmov	x6, d1
106	shrn	v0.8b, v0.8h, #4
107	fmov	x5, d0
108	cbnz	x6, .Lnulfound
109	mvn	x5, x5
110	cbnz	x5, .Lmismatch
111	add	x8, x8, #16			// advance aligned pointers
112
113	/*
114	 * During the main loop, the layout of the two strings is something like:
115	 *
116	 *          v ------1------ v ------2------ v
117	 *      X0:    AAAAAAAAAAAAABBBBBBBBBBBBBBBB...
118	 *      X1: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC...
119	 *
120	 * where v indicates the alignment boundaries and corresponding chunks
121	 * of the strings have the same letters.  Chunk A has been checked in
122	 * the previous iteration.  This iteration, we first check that string
123	 * X1 doesn't end within region 2, then we compare chunk B between the
124	 * two strings.  As X1 is known not to hold a NUL byte in regions 1
125	 * and 2 at this point, this also ensures that x0 has not ended yet.
126	 */
127	.p2align 4
1280:
129	ldr	q0, [x8, x11]
130	ldr	q1, [x8, x10]
131	ldr	q2, [x8]
132
133	cmeq	v1.16b, v1.16b, #0		// end of string?
134	cmeq	v0.16b, v0.16b, v2.16b		// do the chunks match?
135
136	shrn	v1.8b, v1.8h, #4
137	fmov	x6, d1
138	shrn	v0.8b, v0.8h, #4
139	fmov	x5, d0
140	cbnz	x6, .Lnulfound
141	mvn	x5, x5				// any mismatches?
142	cbnz	x5, .Lmismatch
143
144	add	x8, x8, #16
145
146	ldr	q0, [x8, x11]
147	ldr	q1, [x8, x10]
148	ldr	q2, [x8]
149
150	add	x8, x8, #16
151	cmeq	v1.16b, v1.16b, #0
152	cmeq	v0.16b, v0.16b, v2.16b
153
154	shrn	v1.8b, v1.8h, #4
155	fmov	x6, d1
156	shrn	v0.8b, v0.8h, #4
157	fmov	x5, d0
158	cbnz	x6, .Lnulfound2
159	mvn	x5, x5
160	cbz	x5, 0b
161
162	sub	x8, x8, #16			// roll back second increment
163.Lmismatch:
164	rbit	x2, x5
165	clz	x2, x2				// index of mismatch
166	lsr	x2, x2, #2
167	add	x11, x8, x11
168
169	ldrb	w4, [x8, x2]
170	ldrb	w5, [x11, x2]
171	sub	w0, w4, w5			// byte difference
172	ret
173
174	.p2align 4
175.Lnulfound2:
176	sub	x8, x8, #16
177
178.Lnulfound:
179	mov	x7, x9
180	mov	x4, x6
181
182	ubfiz	x7, x7, #2, #4			// x7 = (x7 & 0xf) << 2
183	lsl	x6, x6, x7			// adjust NUL mask to indices
184	orn	x5, x6, x5
185	cbnz	x5, .Lmismatch
186
187	/*
188	 * (x0) == (x1) and NUL is past the string.
189	 * Compare (x1) with the corresponding part
190	 * of the other string until the NUL byte.
191	 */
192	ldr	q0, [x8, x9]
193	ldr	q1, [x8, x10]
194
195	cmeq	v1.16b, v0.16b, v1.16b
196	shrn	v1.8b, v1.8h, #4
197	fmov	x5, d1
198
199	orn	x5, x4, x5
200
201	rbit	x2, x5
202	clz	x2, x2
203	lsr	x5, x2, #2
204
205	add	x10, x10, x8			// restore x10 pointer
206	add	x8, x8, x9			// point to corresponding chunk
207
208	ldrb	w4, [x8, x5]
209	ldrb	w5, [x10, x5]
210	sub	w0, w4, w5
211	ret
212
213	.p2align 4
214.Lhead_mismatch:
215	rbit	x2, x5
216	clz	x2, x2				// index of mismatch
217	lsr	x2, x2, #2
218	ldrb	w4, [x0, x2]
219	ldrb	w5, [x1, x2]
220	sub	w0, w4, w5
221	ret
222
223	/*
224	 * If (a&0xf) < (b&0xf), we do the same thing but with swapped
225	 * operands.  I found that this performs slightly better than
226	 * using conditional moves to do the swap branchless.
227	 */
228	.p2align 4
229.Lswapped:
230	add	x12, x8, x9
231	ldr	q0, [x12, #16]!
232	sub	x8, x8, x10
233	add	x11, x8, x9
234	neg	x9, x9
235
236	cmeq	v1.16b, v2.16b, #0
237	cmeq	v0.16b, v0.16b, v3.16b
238	add	x10, x10, #16
239	shrn	v1.8b, v1.8h, #4
240	fmov	x6, d1
241	shrn	v0.8b, v0.8h, #4
242	fmov	x5, d0
243	cbnz	x6, .Lnulfounds
244	mvn	x5, x5
245	cbnz	x5, .Lmismatchs
246	add	x10, x10, #16
247
248	/*
249	 * During the main loop, the layout of the two strings is something like:
250	 *
251	 *          v ------1------ v ------2------ v
252	 *      X1:    AAAAAAAAAAAAABBBBBBBBBBBBBBBB...
253	 *      X0: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC...
254	 *
255	 * where v indicates the alignment boundaries and corresponding chunks
256	 * of the strings have the same letters.  Chunk A has been checked in
257	 * the previous iteration.  This iteration, we first check that string
258	 * X0 doesn't end within region 2, then we compare chunk B between the
259	 * two strings.  As X0 is known not to hold a NUL byte in regions 1
260	 * and 2 at this point, this also ensures that X1 has not ended yet.
261	 */
262	.p2align 4
2630:
264	ldr	q0, [x10, x11]
265	ldr	q1, [x10, x8]
266	ldr	q2, [x10]
267
268	cmeq	v1.16b, v1.16b, #0
269	cmeq	v0.16b, v0.16b, v2.16b
270
271	shrn	v1.8b, v1.8h, #4
272	fmov	x6, d1
273	shrn	v0.8b, v0.8h, #4
274	fmov	x5, d0
275	cbnz	x6, .Lnulfounds
276	mvn	x5, x5
277	cbnz	x5, .Lmismatchs
278
279	add	x10, x10, #16
280
281	ldr	q0, [x10, x11]
282	ldr	q1, [x10, x8]
283	ldr	q2, [x10]
284
285	add	x10, x10, #16
286	cmeq	v1.16b, v1.16b, #0
287	cmeq	v0.16b, v0.16b, v2.16b
288
289	shrn	v1.8b, v1.8h, #4
290	fmov	x6, d1
291	shrn	v0.8b, v0.8h, #4
292	fmov	x5, d0
293	cbnz	x6, .Lnulfound2s
294	mvn	x5, x5
295	cbz	x5, 0b
296
297	sub	x10, x10, #16
298
299.Lmismatchs:
300	rbit	x2, x5
301	clz	x2, x2
302	lsr	x2, x2, #2
303	add	x11, x10, x11
304
305	ldrb	w4, [x10, x2]
306	ldrb	w5, [x11, x2]
307	sub	w0, w5, w4
308	ret
309
310	.p2align 4
311.Lnulfound2s:
312	sub	x10, x10, #16
313.Lnulfounds:
314	mov	x7, x9
315	mov	x4, x6
316
317	ubfiz	x7, x7, #2, #4
318	lsl	x6, x6, x7
319	orn	x5, x6, x5
320	cbnz	x5, .Lmismatchs
321
322	ldr	q0, [x10, x9]
323	ldr	q1, [x10, x8]
324
325	cmeq	v1.16b, v0.16b, v1.16b
326	shrn	v1.8b, v1.8h, #4
327	fmov	x5, d1
328
329	orn	x5, x4, x5
330
331	rbit	x2, x5
332	clz	x2, x2
333	lsr	x5, x2, #2
334
335	add	x11, x10, x8
336	add	x10, x10, x9
337
338	ldrb	w4, [x10, x5]
339	ldrb	w5, [x11, x5]
340	sub	w0, w5, w4
341	ret
342
343END(__strcmp)
344
345	.section .rodata
346	.p2align 4
347shift_data:
348	.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
349	.fill 16, 1, -1
350	.size shift_data, .-shift_data
351