/* xref: /freebsd/lib/libc/aarch64/string/strncmp.S (revision 25c485e147691f3929b0b5029bab58bf56d3606b) */
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2024 Getz Mikalsen <getz@FreeBSD.org>
*/
6*25c485e1SGetz Mikalsen
#include <machine/asm.h>
#include <machine/param.h>

	/*
	 * Export strncmp as a weak alias of __strncmp so that the public
	 * name can be overridden while __strncmp stays available.
	 */
	.weak	strncmp
	.set	strncmp, __strncmp
	.text
13*25c485e1SGetz Mikalsen
/*
 * int strncmp(const char *a, const char *b, size_t n)
 *
 * In:	x0 = a, x1 = b, x2 = n
 * Out:	w0 = 0 if n == 0.  Otherwise w0 = a[i] - b[i] (bytes zero-extended)
 *	where i is the first index at which the strings differ, a NUL
 *	byte is found, or the limit i == n - 1 is reached.
 * Clobbers: x2-x16, v0-v6 and the condition flags.  The stack is not used.
 *
 * Mask encoding: a 16 byte CMEQ result is narrowed with SHRN #4 into a
 * 64 bit register holding 4 bits per byte, so byte i of a chunk maps to
 * bits 4*i .. 4*i+3.  RBIT + CLZ followed by a right shift by 2 turns
 * the lowest set bit back into a byte index.
 *
 * Register roles after the prologue:
 *	x8  = a rounded down to a 16 byte boundary
 *	x9  = a & 0xf (later: difference of the two offsets)
 *	x10 = b rounded down to a 16 byte boundary
 *	x11 = b & 0xf
 *	x2  = n - 1 (later rebased to the current aligned chunk)
 *	x13 = all-ones constant, x16 = 0xf (one nibble)
 */
ENTRY(__strncmp)

	bic	x8, x0, #0xf			// x0 aligned to the boundary
	and	x9, x0, #0xf			// x9 is the offset
	bic	x10, x1, #0xf			// x1 aligned to the boundary
	and	x11, x1, #0xf			// x11 is the offset

	subs	x2, x2, #1			// x2 = index of the last byte to compare
	b.lo	.Lempty				// n == 0: nothing to compare

	mov	x13, #-1			// save constants for later
	mov	x16, #0xf

	/*
	 * Check if either string is located at end of page to avoid crossing
	 * into unmapped page. If so, we load 16 bytes from the nearest
	 * alignment boundary and shift based on the offset.
	 */

	add	x3, x0, #16			// end of head
	add	x4, x1, #16
	eor	x3, x3, x0
	eor	x4, x4, x1			// bits that changed
	orr	x3, x3, x4			// in either str1 or str2
	cmp	x2, #16
	b.lo	.Llt16				// limit falls within the first 16 bytes
	tbz	w3, #PAGE_SHIFT, .Lbegin	// no page crossed: plain unaligned loads

	ldr	q0, [x8]			// load aligned head
	ldr	q1, [x10]

	lsl	x14, x9, #2			// offsets scaled to 4 bits per byte
	lsl	x15, x11, #2
	lsl	x3, x13, x14			// string head
	lsl	x4, x13, x15

	cmeq	v5.16b, v0.16b, #0		// NUL bytes in the aligned chunks
	cmeq	v6.16b, v1.16b, #0

	shrn	v5.8b, v5.8h, #4
	shrn	v6.8b, v6.8h, #4
	fmov	x5, d5
	fmov	x6, d6

	adrp	x14, shift_data
	add	x14, x14, :lo12:shift_data

	/* heads may cross page boundary, avoid unmapped loads */
	tst	x5, x3				// NUL at or after the start of a?
	b.eq	0f

	/* a ends inside its aligned chunk: shift it down, zero filled */
	ldr	q4, [x14, x9]			// load permutation table
	tbl	v0.16b, {v0.16b}, v4.16b

	b	1f
	.p2align 4
0:
	ldr	q0, [x0]			// load true head
1:
	tst	x6, x4				// NUL at or after the start of b?
	b.eq	0f				// no: load b directly (0: below .Lbegin)

	ldr	q4, [x14, x11]
	tbl	v4.16b, {v1.16b}, v4.16b

	b	1f

	.p2align 4
.Lbegin:
	ldr	q0, [x0]			// load true heads
0:
	ldr	q4, [x1]
1:
	cmeq	v2.16b, v0.16b, #0		// NUL byte present?
	cmeq	v4.16b, v0.16b, v4.16b		// which bytes match?

	orn	v2.16b, v2.16b, v4.16b		// mismatch or NUL byte?

	shrn	v2.8b, v2.8h, #4
	fmov	x5, d2

	cbnz	x5, .Lhead_mismatch
	/* load head and second chunk */
	ldr	q2, [x8, #16]			// load second chunk
	ldr	q3, [x10, #16]

	/* rebase the limit: index of the last byte within chunk [x10, #16] */
	add	x2, x2, x11
	sub	x2, x2, #16

	subs	x9, x9, x11			// is a&0xf >= b&0xf
	b.lo	.Lswapped			// if not swap operands
	b	.Lnormal

	.p2align 4
.Llt16:
	/*
	 * Check if either string is located at end of page to avoid crossing
	 * into unmapped page. If so, we load 16 bytes from the nearest
	 * alignment boundary and shift based on the offset.
	 */
	tbz	w3, #PAGE_SHIFT, 2f

	ldr	q0, [x8]			// load aligned head
	ldr	q1, [x10]

	lsl	x14, x9, #2
	lsl	x15, x11, #2
	lsl	x3, x13, x14			// string head
	lsl	x4, x13, x15

	/*
	 * Introduce a null byte match if the limit is within the aligned
	 * chunk.  The register shift uses the amount modulo 64, so a limit
	 * beyond the chunk wraps to a nibble below the string start, which
	 * the head masks in x3 and x4 ignore.
	 */
	add	x14, x2, x9
	add	x15, x2, x11
	lsl	x14, x14, #2
	lsl	x15, x15, #2
	lsl	x14, x16, x14
	lsl	x15, x16, x15

	cmeq	v5.16b, v0.16b, #0
	cmeq	v6.16b, v1.16b, #0

	shrn	v5.8b, v5.8h, #4
	shrn	v6.8b, v6.8h, #4
	fmov	x5, d5
	fmov	x6, d6

	orr	x5, x5, x14			// insert match at limit
	orr	x6, x6, x15

	adrp	x14, shift_data
	add	x14, x14, :lo12:shift_data

	/* heads may cross page boundary, avoid unmapped loads */
	tst	x5, x3				// a ends (NUL or limit) in its chunk?
	b.eq	0f

	ldr	q4, [x14, x9]			// load permutation table
	tbl	v0.16b, {v0.16b}, v4.16b

	b	1f
	.p2align 4
0:
	ldr	q0, [x0]			// load true head
1:
	tst	x6, x4				// b ends (NUL or limit) in its chunk?
	b.eq	0f				// no: load b directly (0: below 2:)

	ldr	q4, [x14, x11]
	tbl	v4.16b, {v1.16b}, v4.16b

	b	1f

	.p2align 4
2:
	ldr	q0, [x0]			// load true heads
0:
	ldr	q4, [x1]
1:

	cmeq	v2.16b, v0.16b, #0		// NUL byte present?
	cmeq	v4.16b, v0.16b, v4.16b		// which bytes match?

	bic	v2.16b, v4.16b, v2.16b		// match and not NUL byte

	shrn	v2.8b, v2.8h, #4
	fmov	x5, d2
	lsl	x4, x2, #2
	lsl	x4, x13, x4			// every position from the limit on
	orn	x5, x4, x5			// mismatch or NUL byte?

.Lhead_mismatch:
	rbit	x3, x5
	clz	x3, x3				// index of mismatch
	lsr	x3, x3, #2			// 4 mask bits per byte
	ldrb	w4, [x0, x3]
	ldrb	w5, [x1, x3]
	sub	w0, w4, w5
	ret

	/*
	 * a&0xf >= b&0xf.  x9 now holds the difference of the offsets,
	 * q2 and q3 the second aligned chunks of a and b.
	 */
	.p2align 4
.Lnormal:
	sub	x12, x10, x9
	ldr	q0, [x12, #16]!			// part of b matching chunk q2 of a
	sub	x10, x10, x8			// x10 = distance between aligned chunks
	sub	x11, x10, x9			// x11 = distance to the matching part of b

	cmeq	v1.16b, v3.16b, #0		// NUL present?
	cmeq	v0.16b, v0.16b, v2.16b		// Mismatch between chunks?
	shrn	v1.8b, v1.8h, #4
	shrn	v0.8b, v0.8h, #4
	fmov	x6, d1
	fmov	x5, d0

	add	x8, x8, #32			// advance to next iteration

	lsl	x4, x2, #2
	lsl	x4, x13, x4
	orr	x3, x6, x4			// introduce a null byte match
	cmp	x2, #16				// does the buffer end within x2
	csel	x6, x3, x6, lo
	cbnz	x6, .Lnulfound2			// NUL or end of buffer found?
	mvn	x5, x5
	cbnz	x5, .Lmismatch2
	sub	x2, x2, #16
	cmp	x2, #32				// end of buffer?
	b.lo	.Ltail
	/*
	 * During the main loop, the layout of the two strings is something like:
	 *
	 *          v ------1------ v ------2------ v
	 *      X0:    AAAAAAAAAAAAABBBBBBBBBBBBBBBB...
	 *      X1: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC...
	 *
	 * where v indicates the alignment boundaries and corresponding chunks
	 * of the strings have the same letters.  Chunk A has been checked in
	 * the previous iteration.  This iteration, we first check that string
	 * X1 doesn't end within region 2, then we compare chunk B between the
	 * two strings.  As X1 is known not to hold a NUL byte in regions 1
	 * and 2 at this point, this also ensures that x0 has not ended yet.
	 */
	.p2align 4
0:
	ldr	q0, [x8, x11]
	ldr	q1, [x8, x10]
	ldr	q2, [x8]

	cmeq	v1.16b, v1.16b, #0		// end of string?
	cmeq	v0.16b, v0.16b, v2.16b		// do the chunks match?

	shrn	v1.8b, v1.8h, #4
	shrn	v0.8b, v0.8h, #4
	fmov	x6, d1
	fmov	x5, d0
	cbnz	x6, .Lnulfound
	mvn	x5, x5				// any mismatches?
	cbnz	x5, .Lmismatch

	add	x8, x8, #16

	/* main loop unrolled twice */
	ldr	q0, [x8, x11]
	ldr	q1, [x8, x10]
	ldr	q2, [x8]

	add	x8, x8, #16
	cmeq	v1.16b, v1.16b, #0
	cmeq	v0.16b, v0.16b, v2.16b

	shrn	v1.8b, v1.8h, #4
	shrn	v0.8b, v0.8h, #4
	fmov	x6, d1
	fmov	x5, d0
	cbnz	x6, .Lnulfound2
	mvn	x5, x5
	cbnz	x5, .Lmismatch2
	sub	x2, x2, #32
	cmp	x2, #32				// end of buffer?
	b.hs	0b				// if not, keep looping

	/* end of buffer will occur in next 32 bytes */
.Ltail:
	ldr	q0, [x8, x11]
	ldr	q1, [x8, x10]
	ldr	q2, [x8]

	cmeq	v1.16b, v1.16b, #0		// end of string?
	cmeq	v0.16b, v0.16b, v2.16b		// do the chunks match?

	shrn	v1.8b, v1.8h, #4
	shrn	v0.8b, v0.8h, #4
	fmov	x6, d1
	fmov	x5, d0

	/*
	 * If x2 < 16 then we introduce a NUL byte in the
	 * result from CMEQ to avoid comparing further!
	 */

	lsl	x4, x2, #2
	lsl	x4, x13, x4
	orr	x3, x6, x4			// introduce a null byte match
	cmp	x2, #16				// does the buffer end within x2
	csel	x6, x3, x6, lo

	cbnz	x6, .Lnulfound			// NUL or end of string found
	mvn	x5, x5
	cbnz	x5, .Lmismatch

	add	x8, x8, #16

	/* main loop unrolled twice */
	ldr	q0, [x8, x11]
	ldr	q1, [x8, x10]
	ldr	q2, [x8]

	add	x8, x8, #16
	cmeq	v1.16b, v1.16b, #0
	cmeq	v0.16b, v0.16b, v2.16b

	shrn	v1.8b, v1.8h, #4
	shrn	v0.8b, v0.8h, #4
	fmov	x6, d1
	fmov	x5, d0

	/* x2 is in [16, 32) here, so the limit is always inserted */
	ubfiz	x4, x2, #2, #4			// (x2 - 16) << 2
	lsl	x4, x13, x4			// take first half into account
	orr	x6, x6, x4			// introduce a null byte match

.Lnulfound2:
	sub	x8, x8, #16			// roll back second increment

.Lnulfound:
	mov	x4, x6				// keep the mask as seen from chunk of x1

	ubfiz	x7, x9, #2, #4
	lsl	x6, x6, x7			// adjust NUL mask to indices

	orn	x5, x6, x5			// mismatch, NUL or limit within chunk?
	cbnz	x5, .Lmismatch

	/*
	 * (x0) == (x1) and NUL is past the string.
	 * Compare (x1) with the corresponding part
	 * of the other string until the NUL byte.
	 */
	ldr	q0, [x8, x9]
	ldr	q1, [x8, x10]

	cmeq	v1.16b, v0.16b, v1.16b
	shrn	v1.8b, v1.8h, #4
	fmov	x5, d1

	orn	x5, x4, x5			// mismatch, NUL or limit

	rbit	x3, x5
	clz	x3, x3
	lsr	x5, x3, #2			// byte index of the first event

	add	x10, x10, x8			// restore x10 pointer
	add	x8, x8, x9			// point to corresponding chunk

	ldrb	w4, [x8, x5]
	ldrb	w5, [x10, x5]
	sub	w0, w4, w5
	ret

	.p2align 4
.Lmismatch2:
	sub	x8, x8, #16			// roll back second increment
.Lmismatch:
	rbit	x3, x5
	clz	x3, x3				// index of mismatch
	lsr	x3, x3, #2
	add	x11, x8, x11			// matching position in the other string

	ldrb	w4, [x8, x3]
	ldrb	w5, [x11, x3]
	sub	w0, w4, w5			// byte difference
	ret

	/*
	 * If (a&0xf) < (b&0xf), we do the same thing but with swapped
	 * operands.  I found that this performs slightly better than
	 * using conditional moves to do the swap branchless.
	 *
	 * Here x9 holds the (negative) difference of the offsets until it
	 * is negated below, and x10 takes over the role x8 has in .Lnormal.
	 */
	.p2align 4
.Lswapped:
	add	x12, x8, x9
	ldr	q0, [x12, #16]!			// part of a matching chunk q3 of b
	sub	x8, x8, x10			// x8 = distance between aligned chunks
	add	x11, x8, x9			// x11 = distance to the matching part of a
	add	x2, x2, x9			// rebase the limit onto the chunks of a
	neg	x9, x9				// x9 = positive offset difference

	cmeq	v1.16b, v2.16b, #0		// NUL present?
	cmeq	v0.16b, v0.16b, v3.16b		// which bytes match?
	shrn	v1.8b, v1.8h, #4
	shrn	v0.8b, v0.8h, #4
	fmov	x6, d1
	fmov	x5, d0

	add	x10, x10, #32			// advance to next iteration

	lsl	x4, x2, #2
	lsl	x4, x13, x4
	orr	x3, x6, x4			// introduce a null byte match
	cmp	x2, #16
	csel	x6, x3, x6, lo
	cbnz	x6, .Lnulfound2s
	mvn	x5, x5
	cbnz	x5, .Lmismatch2s

	sub	x2, x2, #16
	cmp	x2, #32
	b.lo	.Ltails

	/*
	 * During the main loop, the layout of the two strings is something like:
	 *
	 *          v ------1------ v ------2------ v
	 *      X1:    AAAAAAAAAAAAABBBBBBBBBBBBBBBB...
	 *      X0: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC...
	 *
	 * where v indicates the alignment boundaries and corresponding chunks
	 * of the strings have the same letters.  Chunk A has been checked in
	 * the previous iteration.  This iteration, we first check that string
	 * X0 doesn't end within region 2, then we compare chunk B between the
	 * two strings.  As X0 is known not to hold a NUL byte in regions 1
	 * and 2 at this point, this also ensures that X1 has not ended yet.
	 */
	.p2align 4
0:
	ldr	q0, [x10, x11]
	ldr	q1, [x10, x8]
	ldr	q2, [x10]

	cmeq	v1.16b, v1.16b, #0		// end of string?
	cmeq	v0.16b, v0.16b, v2.16b		// do the chunks match?

	shrn	v1.8b, v1.8h, #4
	shrn	v0.8b, v0.8h, #4
	fmov	x6, d1
	fmov	x5, d0
	cbnz	x6, .Lnulfounds
	mvn	x5, x5				// any mismatches?
	cbnz	x5, .Lmismatchs

	add	x10, x10, #16

	/* main loop unrolled twice */
	ldr	q0, [x10, x11]
	ldr	q1, [x10, x8]
	ldr	q2, [x10]

	add	x10, x10, #16
	cmeq	v1.16b, v1.16b, #0
	cmeq	v0.16b, v0.16b, v2.16b

	shrn	v1.8b, v1.8h, #4
	shrn	v0.8b, v0.8h, #4
	fmov	x6, d1
	fmov	x5, d0
	cbnz	x6, .Lnulfound2s
	mvn	x5, x5
	cbnz	x5, .Lmismatch2s
	sub	x2, x2, #32
	cmp	x2, #32				// end of buffer?
	b.hs	0b				// if not, keep looping

	/* end of buffer will occur in next 32 bytes */
.Ltails:
	ldr	q0, [x10, x11]
	ldr	q1, [x10, x8]
	ldr	q2, [x10]

	cmeq	v1.16b, v1.16b, #0
	cmeq	v0.16b, v0.16b, v2.16b

	shrn	v1.8b, v1.8h, #4
	shrn	v0.8b, v0.8h, #4
	fmov	x6, d1
	fmov	x5, d0

	/*
	 * If x2 < 16 then we introduce a NUL byte in the
	 * result from CMEQ to avoid comparing further!
	 */

	lsl	x4, x2, #2
	lsl	x4, x13, x4
	orr	x3, x6, x4			// introduce a null byte match
	cmp	x2, #16
	csel	x6, x3, x6, lo

	cbnz	x6, .Lnulfounds
	mvn	x5, x5
	cbnz	x5, .Lmismatchs

	add	x10, x10, #16

	ldr	q0, [x10, x11]
	ldr	q1, [x10, x8]
	ldr	q2, [x10]

	add	x10, x10, #16
	cmeq	v1.16b, v1.16b, #0
	cmeq	v0.16b, v0.16b, v2.16b

	shrn	v1.8b, v1.8h, #4
	shrn	v0.8b, v0.8h, #4
	fmov	x6, d1
	fmov	x5, d0

	/* x2 is in [16, 32) here, so the limit is always inserted */
	ubfiz	x4, x2, #2, #4			// (x2 - 16) << 2
	lsl	x4, x13, x4
	orr	x6, x6, x4			// introduce a null byte match

.Lnulfound2s:
	sub	x10, x10, #16			// roll back second increment
.Lnulfounds:
	mov	x4, x6				// keep the mask as seen from chunk of x0

	ubfiz	x7, x9, #2, #4
	lsl	x6, x6, x7			// adjust NUL mask to indices

	orn	x5, x6, x5			// mismatch, NUL or limit within chunk?

	cbnz	x5, .Lmismatchs

	ldr	q0, [x10, x9]
	ldr	q1, [x10, x8]

	cmeq	v1.16b, v0.16b, v1.16b
	shrn	v1.8b, v1.8h, #4
	fmov	x5, d1

	orn	x5, x4, x5			// mismatch, NUL or limit

	rbit	x3, x5
	clz	x3, x3
	lsr	x5, x3, #2			// byte index of the first event

	add	x11, x10, x8			// pointer into a
	add	x10, x10, x9			// pointer into b

	ldrb	w4, [x10, x5]			// byte of b
	ldrb	w5, [x11, x5]			// byte of a
	sub	w0, w5, w4			// operands are swapped: a - b
	ret

	.p2align 4
.Lmismatch2s:
	sub	x10, x10, #16			// roll back second increment
.Lmismatchs:
	rbit	x3, x5
	clz	x3, x3				// index of mismatch
	lsr	x3, x3, #2
	add	x11, x10, x11			// matching position in a

	ldrb	w4, [x10, x3]			// byte of b
	ldrb	w5, [x11, x3]			// byte of a
	sub	w0, w5, w4			// operands are swapped: a - b
	ret

	.p2align 4
.Lempty:
	mov	w0, #0				// n == 0: strings compare equal
	ret

END(__strncmp)
563*25c485e1SGetz Mikalsen
564*25c485e1SGetz Mikalsen	.section .rodata
565*25c485e1SGetz Mikalsen	.p2align 4
566*25c485e1SGetz Mikalsenshift_data:
567*25c485e1SGetz Mikalsen	.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
568*25c485e1SGetz Mikalsen	.fill 16, 1, -1
569*25c485e1SGetz Mikalsen	.size shift_data, .-shift_data
570