xref: /freebsd/lib/libc/aarch64/string/strncmp.S (revision 4b15965daa99044daf184221b7c283bf7f2d7e66)
1/*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2024 Getz Mikalsen <getz@FreeBSD.org>
5*/
6
7#include <machine/asm.h>
8#include <machine/param.h>
9
10	.weak	strncmp
11	.set	strncmp, __strncmp
12	.text
13
14ENTRY(__strncmp)
15
16	bic	x8, x0, #0xf			// x0 aligned to the boundary
17	and	x9, x0, #0xf			// x9 is the offset
18	bic	x10, x1, #0xf			// x1 aligned to the boundary
19	and	x11, x1, #0xf			// x11 is the offset
20
21	subs	x2, x2, #1
22	b.lo	.Lempty
23
24	mov	x13, #-1			// save constants for later
25	mov	x16, #0xf
26
27	/*
28	 * Check if either string is located at end of page to avoid crossing
29	 * into unmapped page. If so, we load 16 bytes from the nearest
30	 * alignment boundary and shift based on the offset.
31	 */
32
33	add	x3, x0, #16			// end of head
34	add	x4, x1, #16
35	eor	x3, x3, x0
36	eor	x4, x4, x1			// bits that changed
37	orr	x3, x3, x4			// in either str1 or str2
38	cmp	x2,#16
39	b.lo	.Llt16
40	tbz	w3, #PAGE_SHIFT, .Lbegin
41
42	ldr	q0, [x8]			// load aligned head
43	ldr	q1, [x10]
44
45	lsl	x14, x9, #2
46	lsl	x15, x11, #2
47	lsl	x3, x13, x14			// string head
48	lsl	x4, x13, x15
49
50	cmeq	v5.16b, v0.16b, #0
51	cmeq	v6.16b, v1.16b, #0
52
53	shrn	v5.8b, v5.8h, #4
54	shrn	v6.8b, v6.8h, #4
55	fmov	x5, d5
56	fmov	x6, d6
57
58	adrp	x14, shift_data
59	add	x14, x14, :lo12:shift_data
60
61	/* heads may cross page boundary, avoid unmapped loads */
62	tst	x5, x3
63	b.eq	0f
64
65	ldr	q4, [x14, x9]			// load permutation table
66	tbl	v0.16b, {v0.16b}, v4.16b
67
68	b	1f
69	.p2align 4
700:
71	ldr	q0, [x0]			// load true head
721:
73	tst	x6, x4
74	b.eq	0f
75
76	ldr	q4, [x14, x11]
77	tbl	v4.16b, {v1.16b}, v4.16b
78
79	b 1f
80
81	.p2align 4
82.Lbegin:
83	ldr	q0, [x0]			// load true heads
840:
85	ldr	q4, [x1]
861:
87	cmeq	v2.16b, v0.16b, #0		// NUL byte present?
88	cmeq	v4.16b, v0.16b, v4.16b		// which bytes match?
89
90	orn	v2.16b, v2.16b, v4.16b		// mismatch or NUL byte?
91
92	shrn	v2.8b, v2.8h, #4
93	fmov	x5, d2
94
95	cbnz	x5, .Lhead_mismatch
96	/* load head and second chunk */
97	ldr	q2, [x8, #16]			// load second chunk
98	ldr	q3, [x10, #16]
99
100	add	x2, x2, x11
101	sub	x2, x2, #16
102
103	subs	x9, x9, x11			// is a&0xf >= b&0xf
104	b.lo	.Lswapped			// if not swap operands
105	b	.Lnormal
106
107	.p2align 4
108.Llt16:
109	/*
110	 * Check if either string is located at end of page to avoid crossing
111	 * into unmapped page. If so, we load 16 bytes from the nearest
112	 * alignment boundary and shift based on the offset.
113	 */
114	tbz	w3, #PAGE_SHIFT, 2f
115
116	ldr	q0, [x8]			// load aligned head
117	ldr	q1, [x10]
118
119	lsl	x14, x9, #2
120	lsl	x15, x11, #2
121	lsl	x3, x13, x14			// string head
122	lsl	x4, x13, x15
123
124	/* Introduce a null byte match if the limit is within the aligned chunk */
125	add	x14, x2, x9
126	add	x15, x2, x11
127	lsl	x14, x14, #2
128	lsl	x15, x15, #2
129	lsl	x14, x16, x14
130	lsl	x15, x16, x15
131
132	cmeq	v5.16b, v0.16b, #0
133	cmeq	v6.16b, v1.16b, #0
134
135	shrn	v5.8b, v5.8h, #4
136	shrn	v6.8b, v6.8h, #4
137	fmov	x5, d5
138	fmov	x6, d6
139
140	orr	x5, x5, x14			// insert match at limit
141	orr	x6, x6, x15
142
143	adrp	x14, shift_data
144	add	x14, x14, :lo12:shift_data
145
146	/* heads may cross page boundary, avoid unmapped loads */
147	tst	x5, x3
148	b.eq	0f
149
150	ldr	q4, [x14, x9]			// load permutation table
151	tbl	v0.16b, {v0.16b}, v4.16b
152
153	b	1f
154	.p2align 4
1550:
156	ldr	q0, [x0]			// load true head
1571:
158	tst	x6, x4
159	b.eq	0f
160
161	ldr	q4, [x14, x11]
162	tbl	v4.16b, {v1.16b}, v4.16b
163
164	b 1f
165
166	.p2align 4
1672:
168	ldr	q0, [x0]			// load true heads
1690:
170	ldr	q4, [x1]
1711:
172
173	cmeq	v2.16b, v0.16b, #0		// NUL byte present?
174	cmeq	v4.16b, v0.16b, v4.16b		// which bytes match?
175
176	bic	v2.16b, v4.16b, v2.16b		// match and not NUL byte
177
178	shrn	v2.8b, v2.8h, #4
179	fmov	x5, d2
180	lsl	x4, x2, #2
181	lsl	x4, x13, x4
182	orn	x5, x4, x5			// mismatch or NUL byte?
183
184.Lhead_mismatch:
185	rbit	x3, x5
186	clz	x3, x3				// index of mismatch
187	lsr	x3, x3, #2
188	ldrb	w4, [x0, x3]
189	ldrb	w5, [x1, x3]
190	sub	w0, w4, w5
191	ret
192
193	.p2align 4
194.Lnormal:
195	sub	x12, x10, x9
196	ldr	q0, [x12, #16]!
197	sub	x10, x10, x8
198	sub	x11, x10, x9
199
200	cmeq	v1.16b, v3.16b, #0		// NUL present?
201	cmeq	v0.16b, v0.16b, v2.16b		// Mismatch between chunks?
202	shrn	v1.8b, v1.8h, #4
203	shrn	v0.8b, v0.8h, #4
204	fmov	x6, d1
205	fmov	x5, d0
206
207	add	x8, x8, #32			// advance to next iteration
208
209	lsl	x4, x2, #2
210	lsl	x4, x13, x4
211	orr	x3, x6, x4			// introduce a null byte match
212	cmp	x2, #16				// does the buffer end within x2
213	csel	x6, x3, x6, lo
214	cbnz	x6, .Lnulfound2			// NUL or end of buffer found?
215	mvn	x5, x5
216	cbnz	x5, .Lmismatch2
217	sub	x2, x2, #16
218	cmp	x2, #32				// end of buffer?
219	b.lo	.Ltail
220	/*
221	 * During the main loop, the layout of the two strings is something like:
222	 *
223	 *          v ------1------ v ------2------ v
224	 *      X0:    AAAAAAAAAAAAABBBBBBBBBBBBBBBB...
225	 *      X1: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC...
226	 *
227	 * where v indicates the alignment boundaries and corresponding chunks
228	 * of the strings have the same letters.  Chunk A has been checked in
229	 * the previous iteration.  This iteration, we first check that string
230	 * X1 doesn't end within region 2, then we compare chunk B between the
231	 * two strings.  As X1 is known not to hold a NUL byte in regions 1
232	 * and 2 at this point, this also ensures that x0 has not ended yet.
233	 */
234	.p2align 4
2350:
236	ldr	q0, [x8, x11]
237	ldr	q1, [x8, x10]
238	ldr	q2, [x8]
239
240	cmeq	v1.16b, v1.16b, #0		// end of string?
241	cmeq	v0.16b, v0.16b, v2.16b		// do the chunks match?
242
243	shrn	v1.8b, v1.8h, #4
244	shrn	v0.8b, v0.8h, #4
245	fmov	x6, d1
246	fmov	x5, d0
247	cbnz	x6, .Lnulfound
248	mvn	x5, x5				// any mismatches?
249	cbnz	x5, .Lmismatch
250
251	add	x8, x8, #16
252
253	/* main loop unrolled twice */
254	ldr	q0, [x8, x11]
255	ldr	q1, [x8, x10]
256	ldr	q2, [x8]
257
258	add	x8, x8, #16
259	cmeq	v1.16b, v1.16b, #0
260	cmeq	v0.16b, v0.16b, v2.16b
261
262	shrn	v1.8b, v1.8h, #4
263	shrn	v0.8b, v0.8h, #4
264	fmov	x6, d1
265	fmov	x5, d0
266	cbnz	x6, .Lnulfound2
267	mvn	x5, x5
268	cbnz	x5, .Lmismatch2
269	sub	x2, x2, #32
270	cmp	x2, #32				// end of buffer?
271	b.hs	0b				// if yes, process tail
272
273	/* end of buffer will occur in next 32 bytes */
274.Ltail:
275	ldr	q0, [x8, x11]
276	ldr	q1, [x8, x10]
277	ldr	q2, [x8]
278
279	cmeq	v1.16b, v1.16b, #0		// end of string?
280	cmeq	v0.16b, v0.16b, v2.16b		// do the chunks match?
281
282	shrn	v1.8b, v1.8h, #4
283	shrn	v0.8b, v0.8h, #4
284	fmov	x6, d1
285	fmov	x5, d0
286
287	/*
288	 * If x2 <= 16 then we introduce a NUL byte in the
289	 * result from CMEQ to avoid comparing further!
290	 */
291
292	lsl	x4, x2, #2
293	lsl	x4, x13, x4
294	orr	x3, x6, x4			// introduce a null byte match
295	cmp	x2, #16				// does the buffer end within x2
296	csel	x6, x3, x6, lo
297
298	cbnz	x6, .Lnulfound			// NUL or end of string found
299	mvn	x5, x5
300	cbnz	x5, .Lmismatch
301
302	add	x8, x8, #16
303
304	/* main loop unrolled twice */
305	ldr	q0, [x8, x11]
306	ldr	q1, [x8, x10]
307	ldr	q2, [x8]
308
309	add	x8, x8, #16
310	cmeq	v1.16b, v1.16b, #0
311	cmeq	v0.16b, v0.16b, v2.16b
312
313	shrn	v1.8b, v1.8h, #4
314	shrn	v0.8b, v0.8h, #4
315	fmov	x6, d1
316	fmov	x5, d0
317
318	ubfiz	x4, x2, #2, #4	// (x2 - 16) << 2
319	lsl	x4, x13, x4			// take first half into account
320	orr	x6, x6, x4			// introduce a null byte match
321
322.Lnulfound2:
323	sub	x8, x8, #16
324
325.Lnulfound:
326	mov	x4, x6
327
328	ubfiz	x7, x9, #2, #4
329	lsl	x6, x6, x7			// adjust NUL mask to indices
330
331	orn	x5, x6, x5
332	cbnz	x5, .Lmismatch
333
334	/*
335	 * (x0) == (x1) and NUL is past the string.
336	 * Compare (x1) with the corresponding part
337	 * of the other string until the NUL byte.
338	 */
339	ldr	q0, [x8, x9]
340	ldr	q1, [x8, x10]
341
342	cmeq	v1.16b, v0.16b, v1.16b
343	shrn	v1.8b, v1.8h, #4
344	fmov	x5, d1
345
346	orn	x5, x4, x5
347
348	rbit	x3, x5
349	clz	x3, x3
350	lsr	x5, x3, #2
351
352	add	x10, x10, x8			// restore x10 pointer
353	add	x8, x8, x9			// point to corresponding chunk
354
355	ldrb	w4, [x8, x5]
356	ldrb	w5, [x10, x5]
357	sub	w0, w4, w5
358	ret
359
360	.p2align 4
361.Lmismatch2:
362	sub	x8, x8, #16			// roll back second increment
363.Lmismatch:
364	rbit	x3, x5
365	clz	x3, x3				// index of mismatch
366	lsr	x3, x3, #2
367	add	x11, x8, x11
368
369	ldrb	w4, [x8, x3]
370	ldrb	w5, [x11, x3]
371	sub	w0, w4, w5			// byte difference
372	ret
373
374	/*
375	 * If (a&0xf) < (b&0xf), we do the same thing but with swapped
376	 * operands.  I found that this performs slightly better than
377	 * using conditional moves to do the swap branchless.
378	 */
379	.p2align 4
380.Lswapped:
381	add	x12, x8, x9
382	ldr	q0, [x12, #16]!
383	sub	x8, x8, x10
384	add	x11, x8, x9
385	add	x2,x2,x9
386	neg	x9, x9
387
388	cmeq	v1.16b, v2.16b, #0
389	cmeq	v0.16b, v0.16b, v3.16b
390	shrn	v1.8b, v1.8h, #4
391	shrn	v0.8b, v0.8h, #4
392	fmov	x6, d1
393	fmov	x5, d0
394
395	add	x10, x10, #32
396
397	lsl	x4, x2, #2
398	lsl	x4, x13, x4
399	orr	x3,x6,x4			// introduce a null byte match
400	cmp	x2,#16
401	csel	x6, x3, x6, lo
402	cbnz	x6, .Lnulfound2s
403	mvn	x5, x5
404	cbnz	x5, .Lmismatch2s
405
406	sub	x2, x2, #16
407	cmp	x2, #32
408	b.lo	.Ltails
409
410	/*
411	 * During the main loop, the layout of the two strings is something like:
412	 *
413	 *          v ------1------ v ------2------ v
414	 *      X1:    AAAAAAAAAAAAABBBBBBBBBBBBBBBB...
415	 *      X0: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC...
416	 *
417	 * where v indicates the alignment boundaries and corresponding chunks
418	 * of the strings have the same letters.  Chunk A has been checked in
419	 * the previous iteration.  This iteration, we first check that string
420	 * X0 doesn't end within region 2, then we compare chunk B between the
421	 * two strings.  As X0 is known not to hold a NUL byte in regions 1
422	 * and 2 at this point, this also ensures that X1 has not ended yet.
423	 */
424	.p2align 4
4250:
426	ldr	q0, [x10, x11]
427	ldr	q1, [x10, x8]
428	ldr	q2, [x10]
429
430	cmeq	v1.16b, v1.16b, #0
431	cmeq	v0.16b, v0.16b, v2.16b
432
433	shrn	v1.8b, v1.8h, #4
434	shrn	v0.8b, v0.8h, #4
435	fmov	x6, d1
436	fmov	x5, d0
437	cbnz	x6, .Lnulfounds
438	mvn	x5, x5
439	cbnz	x5, .Lmismatchs
440
441	add	x10, x10, #16
442
443	/* main loop unrolled twice */
444	ldr	q0, [x10, x11]
445	ldr	q1, [x10, x8]
446	ldr	q2, [x10]
447
448	add	x10, x10, #16
449	cmeq	v1.16b, v1.16b, #0
450	cmeq	v0.16b, v0.16b, v2.16b
451
452	shrn	v1.8b, v1.8h, #4
453	shrn	v0.8b, v0.8h, #4
454	fmov	x6, d1
455	fmov	x5, d0
456	cbnz	x6, .Lnulfound2s
457	mvn	x5, x5
458	cbnz	x5, .Lmismatch2s
459	sub	x2, x2, #32
460	cmp	x2, #32
461	b.hs	0b
462
463.Ltails:
464	ldr	q0, [x10, x11]
465	ldr	q1, [x10, x8]
466	ldr	q2, [x10]
467
468	cmeq	v1.16b, v1.16b, #0
469	cmeq	v0.16b, v0.16b, v2.16b
470
471	shrn	v1.8b, v1.8h, #4
472	shrn	v0.8b, v0.8h, #4
473	fmov	x6, d1
474	fmov	x5, d0
475
476	/*
477	 * If x2 <= 16 then we introduce a NUL byte in the
478	 * result from CMEQ to avoid comparing further!
479	 */
480
481	lsl	x4, x2, #2
482	lsl	x4, x13, x4
483	orr	x3, x6, x4			// introduce a null byte match
484	cmp	x2, #16
485	csel	x6, x3, x6, lo
486
487	cbnz	x6, .Lnulfounds
488	mvn	x5, x5
489	cbnz	x5, .Lmismatchs
490
491	add	x10, x10, #16
492
493	ldr	q0, [x10, x11]
494	ldr	q1, [x10, x8]
495	ldr	q2, [x10]
496
497	add	x10, x10, #16
498	cmeq	v1.16b, v1.16b, #0
499	cmeq	v0.16b, v0.16b, v2.16b
500
501	shrn	v1.8b, v1.8h, #4
502	shrn	v0.8b, v0.8h, #4
503	fmov	x6, d1
504	fmov	x5, d0
505
506	ubfiz	x4, x2, #2, #4
507	lsl	x4, x13, x4
508	orr	x6, x6, x4			// introduce a null byte match
509
510.Lnulfound2s:
511	sub	x10, x10, #16
512.Lnulfounds:
513	mov	x4, x6
514
515	ubfiz	x7, x9, #2, #4
516	lsl	x6, x6, x7
517
518	orn	x5, x6, x5
519
520	cbnz	x5, .Lmismatchs
521
522	ldr	q0, [x10, x9]
523	ldr	q1, [x10, x8]
524
525	cmeq	v1.16b, v0.16b, v1.16b
526	shrn	v1.8b, v1.8h, #4
527	fmov	x5, d1
528
529	orn	x5, x4, x5
530
531	rbit	x3, x5
532	clz	x3, x3
533	lsr	x5, x3, #2
534
535	add	x11, x10, x8
536	add	x10, x10, x9
537
538	ldrb	w4, [x10, x5]
539	ldrb	w5, [x11, x5]
540	sub	w0, w5, w4
541	ret
542
543	.p2align 4
544.Lmismatch2s:
545	sub	x10, x10, #16
546.Lmismatchs:
547	rbit	x3, x5
548	clz	x3, x3
549	lsr	x3, x3, #2
550	add	x11, x10, x11
551
552	ldrb	w4, [x10, x3]
553	ldrb	w5, [x11, x3]
554	sub	w0, w5, w4
555	ret
556
557	.p2align 4
558.Lempty:
559	eor	x0, x0, x0
560	ret
561
562END(__strncmp)
563
564	.section .rodata
565	.p2align 4
566shift_data:
567	.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
568	.fill 16, 1, -1
569	.size shift_data, .-shift_data
570