xref: /freebsd/contrib/arm-optimized-routines/string/aarch64/strncmp.S (revision 716fd348e01c5f2ba125f878a634a753436c2994)
1/*
2 * strncmp - compare two strings
3 *
4 * Copyright (c) 2013-2021, Arm Limited.
5 * SPDX-License-Identifier: MIT
6 */
7
8/* Assumptions:
9 *
10 * ARMv8-a, AArch64
11 */
12
13#include "../asmdefs.h"
14
/* 64-bit constants holding the same byte value in each of the eight
   byte lanes.  REP8_01 and REP8_7f feed the parallel NUL-byte test;
   REP8_80 is not referenced by the routine in this file.  */
15#define REP8_01 0x0101010101010101
16#define REP8_7f 0x7f7f7f7f7f7f7f7f
17#define REP8_80 0x8080808080808080
18
19/* Parameters and result.  */
/* src1 and result both name x0: the first argument register is reused
   for the return value.  */
20#define src1		x0
21#define src2		x1
22#define limit		x2
23#define result		x0
24
25/* Internal variables.  */
/* data1w/data2w are the 32-bit views (w3/w4) of data1/data2 (x3/x4),
   used by the byte-at-a-time loads.  */
26#define data1		x3
27#define data1w		w3
28#define data2		x4
29#define data2w		w4
30#define has_nul		x5
31#define diff		x6
32#define syndrome	x7
33#define tmp1		x8
34#define tmp2		x9
35#define tmp3		x10
36#define zeroones	x11
37#define pos		x12
38#define limit_wd	x13
39#define mask		x14
40#define endloop		x15
/* count is an alias of mask, so both names refer to x14 and must not
   be live at the same time.  */
41#define count		mask
42
/* __strncmp_aarch64 (src1, src2, limit)

   Compare at most LIMIT bytes of the strings at SRC1 and SRC2, stopping
   at the first difference or at the first NUL in SRC1's data.
   (Presumably the standard C strncmp contract; the prototype itself is
   not visible in this file.)

   In:   x0 = src1, x1 = src2, x2 = limit.
   Out:  x0 = 0 if limit is 0 or no difference was found; otherwise the
         SRC1 byte minus the SRC2 byte at the first mismatch, each byte
         zero-extended before the subtraction.
   Uses: x3-x15 (see the register #defines) and the condition flags.
         No stack accesses.

   Three strategies are used:
     - SRC1 and SRC2 share the same offset within a dword: compare a
       dword at a time from the enclosing aligned addresses.
     - Offsets differ and limit < 16: compare a byte at a time.
     - Offsets differ and limit >= 16: align SRC1 bytewise, compare a
       dword at a time (falling back to bytes wherever a SRC2 dword
       would straddle a 4 KiB boundary), then finish the last
       limit % 8 bytes a byte at a time.  */
ENTRY (__strncmp_aarch64)
	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)
	/* Nothing to compare: return 0.  */
	cbz	limit, L(ret0)
	eor	tmp1, src1, src2
	mov	zeroones, #REP8_01
	tst	tmp1, #7		/* Do the low 3 address bits agree?  */
	and	count, src1, #7		/* Offset of SRC1 within its dword.  */
	b.ne	L(misaligned8)
	cbnz	count, L(mutual_align)
	/* Calculate the number of full and partial words -1.  */
	sub	limit_wd, limit, #1	/* limit != 0, so no underflow.  */
	lsr	limit_wd, limit_wd, #3	/* Convert to Dwords.  */

	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
	   can be done in parallel across the entire word.  */
	.p2align 4
L(loop_aligned):
	ldr	data1, [src1], #8
	ldr	data2, [src2], #8
L(start_realigned):
	subs	limit_wd, limit_wd, #1	/* Goes negative on the last Dword.  */
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	eor	diff, data1, data2	/* Non-zero if differences found.  */
	csinv	endloop, diff, xzr, pl	/* Last Dword or differences.  */
	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
	/* Loop again only if no NUL was seen (eq) and endloop is zero;
	   when a NUL was seen the flags are forced to "ne".  */
	ccmp	endloop, #0, #0, eq
	b.eq	L(loop_aligned)
	/* End of main loop */

	/* Not reached the limit, must have found the end or a diff.  */
	tbz	limit_wd, #63, L(not_limit)

	/* Limit % 8 == 0 => all bytes significant.  */
	ands	limit, limit, #7
	b.eq	L(not_limit)

	/* Build a mask covering the bytes of this Dword that lie beyond
	   the limit.  */
	lsl	limit, limit, #3	/* Bytes -> bits.  */
	mov	mask, #~0
#ifdef __AARCH64EB__
	lsr	mask, mask, limit
#else
	lsl	mask, mask, limit
#endif
	/* Clear the out-of-limit bytes in both words so they compare
	   equal.  */
	bic	data1, data1, mask
	bic	data2, data2, mask

	/* Make sure that the NUL byte is marked in the syndrome.  */
	orr	has_nul, has_nul, mask

L(not_limit):
	/* Expects data1, data2, diff and has_nul for the current Dword.  */
	orr	syndrome, diff, has_nul

#ifndef	__AARCH64EB__
	rev	syndrome, syndrome
	rev	data1, data1
	/* The MS-non-zero bit of the syndrome marks either the first bit
	   that is different, or the top bit of the first zero byte.
	   Shifting left now will bring the critical information into the
	   top bits.  */
	clz	pos, syndrome
	rev	data2, data2
	lsl	data1, data1, pos
	lsl	data2, data2, pos
	/* But we need to zero-extend (char is unsigned) the value and then
	   perform a signed 32-bit subtraction.  */
	lsr	data1, data1, #56
	sub	result, data1, data2, lsr #56
	ret
#else
	/* For big-endian we cannot use the trick with the syndrome value
	   as carry-propagation can corrupt the upper bits if the trailing
	   bytes in the string contain 0x01.  */
	/* However, if there is no NUL byte in the dword, we can generate
	   the result directly.  We can't just subtract the bytes as the
	   MSB might be significant.  */
	cbnz	has_nul, 1f
	cmp	data1, data2
	cset	result, ne		/* 0 if equal, else 1...  */
	cneg	result, result, lo	/* ...negated if data1 < data2.  */
	ret
1:
	/* Re-compute the NUL-byte detection, using a byte-reversed value.  */
	rev	tmp3, data1
	sub	tmp1, tmp3, zeroones
	orr	tmp2, tmp3, #REP8_7f
	bic	has_nul, tmp1, tmp2
	rev	has_nul, has_nul
	orr	syndrome, diff, has_nul
	clz	pos, syndrome
	/* The MS-non-zero bit of the syndrome marks either the first bit
	   that is different, or the top bit of the first zero byte.
	   Shifting left now will bring the critical information into the
	   top bits.  */
	lsl	data1, data1, pos
	lsl	data2, data2, pos
	/* But we need to zero-extend (char is unsigned) the value and then
	   perform a signed 32-bit subtraction.  */
	lsr	data1, data1, #56
	sub	result, data1, data2, lsr #56
	ret
#endif

L(mutual_align):
	/* Sources are mutually aligned, but are not currently at an
	   alignment boundary.  Round down the addresses and then mask off
	   the bytes that precede the start point.
	   We also need to adjust the limit calculations, but without
	   overflowing if the limit is near ULONG_MAX.  */
	bic	src1, src1, #7
	bic	src2, src2, #7
	ldr	data1, [src1], #8
	neg	tmp3, count, lsl #3	/* 64 - bits(bytes beyond align). */
	ldr	data2, [src2], #8
	mov	tmp2, #~0
	sub	limit_wd, limit, #1	/* limit != 0, so no underflow.  */
#ifdef __AARCH64EB__
	/* Big-endian.  Early bytes are at MSB.  */
	lsl	tmp2, tmp2, tmp3	/* Shift (count & 63).  */
#else
	/* Little-endian.  Early bytes are at LSB.  */
	lsr	tmp2, tmp2, tmp3	/* Shift (count & 63).  */
#endif
	and	tmp3, limit_wd, #7
	lsr	limit_wd, limit_wd, #3
	/* Adjust the limit. Only low 3 bits used, so overflow irrelevant.  */
	add	limit, limit, count
	add	tmp3, tmp3, count
	/* Force the bytes before the start point to all-ones in both
	   words, so they show neither a NUL nor a difference.  */
	orr	data1, data1, tmp2
	orr	data2, data2, tmp2
	add	limit_wd, limit_wd, tmp3, lsr #3
	b	L(start_realigned)

	.p2align 4
	/* Don't bother with dwords for fewer than 16 bytes.  */
L(misaligned8):
	cmp	limit, #16
	b.hs	L(try_misaligned_words)

L(byte_loop):
	/* Compare up to LIMIT (> 0 on entry) bytes one at a time.  The
	   loop continues only while bytes remain, the SRC1 byte is not
	   NUL and the two bytes are equal.  */
	ldrb	data1w, [src1], #1
	ldrb	data2w, [src2], #1
	subs	limit, limit, #1
	ccmp	data1w, #1, #0, hi	/* NZCV = 0b0000.  */
	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
	b.eq	L(byte_loop)
L(done):
	/* data1/data2 hold the last pair of zero-extended bytes.  */
	sub	result, data1, data2
	ret
	/* Align the SRC1 to a dword by doing a bytewise compare and then do
	   the dword loop.  */
L(try_misaligned_words):
	lsr	limit_wd, limit, #3
	cbz	count, L(do_misaligned)

	/* count = bytes needed to bring SRC1 up to a dword boundary;
	   take them off the limit before converting it to dwords.  */
	neg	count, count
	and	count, count, #7
	sub	limit, limit, count
	lsr	limit_wd, limit, #3

L(page_end_loop):
	/* Compare COUNT bytes one at a time, leaving early through
	   L(done) on a NUL or a difference.  */
	ldrb	data1w, [src1], #1
	ldrb	data2w, [src2], #1
	cmp	data1w, #1
	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
	b.ne	L(done)
	subs	count, count, #1
	b.hi	L(page_end_loop)

L(do_misaligned):
	/* Prepare ourselves for the next page crossing.  Unlike the aligned
	   loop, we fetch 1 less dword because we risk crossing bounds on
	   SRC2.  */
	mov	count, #8
	subs	limit_wd, limit_wd, #1
	b.lo	L(done_loop)
L(loop_misaligned):
	/* If bits [11:3] of SRC2 are all set, an 8-byte load from SRC2
	   would run past a 4 KiB boundary: compare these 8 bytes
	   bytewise instead (count == 8).  */
	and	tmp2, src2, #0xff8
	eor	tmp2, tmp2, #0xff8
	cbz	tmp2, L(page_end_loop)

	ldr	data1, [src1], #8
	ldr	data2, [src2], #8
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	eor	diff, data1, data2	/* Non-zero if differences found.  */
	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
	ccmp	diff, #0, #0, eq
	b.ne	L(not_limit)
	subs	limit_wd, limit_wd, #1
	b.pl	L(loop_misaligned)

L(done_loop):
	/* Every whole dword matched and contained no NUL; 0-7 bytes of
	   the limit remain.  Finish them bytewise: those bytes have not
	   been checked for a NUL yet, so a dword load ending at the limit
	   could read beyond SRC2's terminator and into the following
	   page.  */
	ands	limit, limit, #7
	b.ne	L(byte_loop)

L(ret0):
	mov	result, #0
	ret

END ( __strncmp_aarch64)
260
261