/*
 * strncmp - compare two strings
 *
 * Copyright (c) 2013-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64.
 * MTE compatible.
 */

#include "asmdefs.h"

#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f

/* Parameters and result.  */
#define src1		x0
#define src2		x1
#define limit		x2
#define result		x0

/* Internal variables.  */
#define data1		x3
#define data1w		w3
#define data2		x4
#define data2w		w4
#define has_nul		x5
#define diff		x6
#define syndrome	x7
#define tmp1		x8
#define tmp2		x9
#define tmp3		x10
#define zeroones	x11
#define pos		x12
#define mask		x13
#define endloop		x14
#define count		mask
#define offset		pos
#define neg_offset	x15

/* Define endian dependent shift operations.
   On big-endian early bytes are at MSB and on little-endian LSB.
   LS_FW means shifting towards early bytes.
   LS_BK means shifting towards later bytes.
   */
#ifdef __AARCH64EB__
#define LS_FW lsl
#define LS_BK lsr
#else
#define LS_FW lsr
#define LS_BK lsl
#endif
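
/* A rough C model of LS_FW (the helper name ls_fw is illustrative):

	uint64_t ls_fw (uint64_t v, unsigned bits)
	{
	#ifdef __AARCH64EB__
		return v << (bits & 63);	// early bytes at the MSB end
	#else
		return v >> (bits & 63);	// early bytes at the LSB end
	#endif
	}

   The "& 63" mirrors the AArch64 variable-shift behaviour relied on
   below; LS_BK is the mirror image.  */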

ENTRY (__strncmp_aarch64)
	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)
	cbz	limit, L(ret0)
	eor	tmp1, src1, src2
	mov	zeroones, #REP8_01
	tst	tmp1, #7
	and	count, src1, #7
	b.ne	L(misaligned8)
	cbnz	count, L(mutual_align)

	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
	   can be done in parallel across the entire word.  */
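	/* Single-byte worked example: X = 0x00 gives
	   (X - 1) & ~(X | 0x7f) = 0xff & 0x80 = 0x80, while X = 0x01 gives
	   0x00 & 0x80 = 0 and X = 0x80 gives 0x7f & 0x00 = 0.  A rough C
	   equivalent of the sub/orr/bics sequence in the loop below:

		uint64_t has_zero (uint64_t x)	// illustrative name
		{
			return (x - REP8_01) & ~(x | REP8_7f);
		}

	   Borrow propagation may also flag bytes above the first zero;
	   that is harmless on little-endian and handled explicitly for
	   big-endian further down.  */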
	.p2align 4
L(loop_aligned):
	ldr	data1, [src1], #8
	ldr	data2, [src2], #8
L(start_realigned):
	subs	limit, limit, #8
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	eor	diff, data1, data2	/* Non-zero if differences found.  */
	csinv	endloop, diff, xzr, hi	/* Last Dword or differences.  */
	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
	ccmp	endloop, #0, #0, eq
	b.eq	L(loop_aligned)
	/* End of main loop */

L(full_check):
#ifndef __AARCH64EB__
	orr	syndrome, diff, has_nul
	add	limit, limit, 8	/* Rewind limit to before last subs. */
L(syndrome_check):
	/* Limit was reached. Check if the NUL byte or the difference
	   is before the limit. */
	rev	syndrome, syndrome
	rev	data1, data1
	clz	pos, syndrome
	rev	data2, data2
	lsl	data1, data1, pos
	cmp	limit, pos, lsr #3
	lsl	data2, data2, pos
	/* But we need to zero-extend (char is unsigned) the value and then
	   perform a signed subtraction.  */
	lsr	data1, data1, #56
	sub	result, data1, data2, lsr #56
	csel	result, result, xzr, hi
	ret
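
	/* Worked example (little-endian): comparing "ab..." with "aB..."
	   with at least two bytes of limit left for this dword: diff holds
	   0x20 in byte 1, the revs move the earliest event into the top
	   bits, clz gives pos = 10, and pos >> 3 = 1 is the byte index of
	   the first difference.  Since limit > 1, the csel keeps the
	   computed result; a NUL or difference at or beyond the limit
	   forces the result to 0 instead.  */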
#else
	/* Not reached the limit, must have found the end or a diff.  */
	tbz	limit, #63, L(not_limit)
	add	tmp1, limit, 8
	cbz	tmp1, L(not_limit)

	lsl	limit, tmp1, #3	/* Bytes -> bits.  */
	mov	mask, #~0
	lsr	mask, mask, limit
	bic	data1, data1, mask
	bic	data2, data2, mask

	/* Make sure that the NUL byte is marked in the syndrome.  */
	orr	has_nul, has_nul, mask

L(not_limit):
	/* For big-endian we cannot use the trick with the syndrome value
	   as carry-propagation can corrupt the upper bits if the trailing
	   bytes in the string contain 0x01.  */
	/* However, if there is no NUL byte in the dword, we can generate
	   the result directly.  We can't just subtract the bytes as the
	   MSB might be significant.  */
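	/* Example of the corruption: the bytes "\x01\x00" load big-endian
	   as 0x0100...  Subtracting REP8_01 borrows out of the NUL byte
	   into the 0x01 byte above it, so (X - 1) & ~(X | 0x7f) flags the
	   0x01 byte as well as the NUL, and that false flag precedes the
	   real NUL in string order.  */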
	cbnz	has_nul, 1f
	cmp	data1, data2
	cset	result, ne
	cneg	result, result, lo
	ret
1:
	/* Re-compute the NUL-byte detection, using a byte-reversed value.  */
	rev	tmp3, data1
	sub	tmp1, tmp3, zeroones
	orr	tmp2, tmp3, #REP8_7f
	bic	has_nul, tmp1, tmp2
	rev	has_nul, has_nul
	orr	syndrome, diff, has_nul
	clz	pos, syndrome
	/* The most-significant-non-zero bit of the syndrome marks either the
	   first bit that is different, or the top bit of the first zero byte.
	   Shifting left now will bring the critical information into the
	   top bits.  */
L(end_quick):
	lsl	data1, data1, pos
	lsl	data2, data2, pos
	/* But we need to zero-extend (char is unsigned) the value and then
	   perform a signed subtraction.  */
	lsr	data1, data1, #56
	sub	result, data1, data2, lsr #56
	ret
#endif

L(mutual_align):
	/* Sources are mutually aligned, but are not currently at an
	   alignment boundary.  Round down the addresses and then mask off
	   the bytes that precede the start point.
	   We also need to adjust the limit calculations, but without
	   overflowing if the limit is near ULONG_MAX.  */
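	/* Example: for src1 = 8n+3, count = 3 and tmp3 = -24; the shift
	   uses tmp3 & 63 = 40, so on little-endian tmp2 = ~0 >> 40 =
	   0x0000000000ffffff, covering the three bytes that precede the
	   string.  ORing both data words with tmp2 turns those bytes into
	   0xff: equal and non-NUL, so they can produce neither a
	   difference nor a NUL hit.  The csinv saturates limit to
	   ULONG_MAX if adding count overflows.  */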
	bic	src1, src1, #7
	bic	src2, src2, #7
	ldr	data1, [src1], #8
	neg	tmp3, count, lsl #3	/* 64 - bits(bytes beyond align). */
	ldr	data2, [src2], #8
	mov	tmp2, #~0
	LS_FW	tmp2, tmp2, tmp3	/* Shift (count & 63).  */
	/* Adjust the limit and ensure it doesn't overflow.  */
	adds	limit, limit, count
	csinv	limit, limit, xzr, lo
	orr	data1, data1, tmp2
	orr	data2, data2, tmp2
	b	L(start_realigned)

	.p2align 4
	/* Don't bother with dwords for up to 16 bytes.  */
L(misaligned8):
	cmp	limit, #16
	b.hs	L(try_misaligned_words)

L(byte_loop):
	/* Perhaps we can do better than this.  */
	ldrb	data1w, [src1], #1
	ldrb	data2w, [src2], #1
	subs	limit, limit, #1
	ccmp	data1w, #1, #0, hi	/* NZCV = 0b0000.  */
	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
	b.eq	L(byte_loop)
L(done):
	sub	result, data1, data2
	ret
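
	/* The flag-chaining above is roughly this C, with unsigned char
	   reads (s1, s2 and n name the incoming arguments):

		do {
			c1 = *s1++;
			c2 = *s2++;
		} while (--n && c1 != 0 && c1 == c2);
		return c1 - c2;

	   Each ccmp forces "not equal" when its condition fails, so one
	   b.eq tests all three loop conditions at once.  */
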
	/* Align the SRC1 to a dword by doing a bytewise compare and then do
	   the dword loop.  */
L(try_misaligned_words):
	cbz	count, L(src1_aligned)

	neg	count, count
	and	count, count, #7
	sub	limit, limit, count

L(page_end_loop):
	ldrb	data1w, [src1], #1
	ldrb	data2w, [src2], #1
	cmp	data1w, #1
	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
	b.ne	L(done)
	subs	count, count, #1
	b.hi	L(page_end_loop)
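
	/* Roughly: step byte-wise until src1 reaches an 8-byte boundary,
	   bailing out to L(done) on a NUL or a mismatch:

		while (count--) {
			c1 = *s1++;
			c2 = *s2++;
			if (c1 == 0 || c1 != c2)
				goto done;
		}
	 */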

	/* The following diagram explains the comparison of misaligned strings.
	   The bytes are shown in natural order. For little-endian, it is
	   reversed in the registers. The "x" bytes are before the string.
	   The "|" separates data that is loaded at one time.
	   src1     | a a a a a a a a | b b b c c c c c | . . .
	   src2     | x x x x x a a a   a a a a a b b b | c c c c c . . .

	   After shifting in each step, the data looks like this:
	                STEP_A              STEP_B              STEP_C
	   data1    a a a a a a a a     b b b c c c c c     b b b c c c c c
	   data2    a a a a a a a a     b b b 0 0 0 0 0     0 0 0 c c c c c

	   The bytes with "0" are eliminated from the syndrome via mask.

	   Align SRC2 down to 16 bytes. This way we can read 16 bytes at a
	   time from SRC2. The comparison happens in 3 steps. After each step
	   the loop can exit, or read from SRC1 or SRC2. */
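
	/* STEP_A's recombination, in rough little-endian C with
	   offset = (src2 & 7) * 8 bits:

		data2 = (tmp1 >> offset) | (tmp2 << (64 - offset));

	   LS_FW extracts the string's first bytes from tmp1 and LS_BK
	   positions the following bytes from tmp2 after them; offset is
	   never 0 mod 64 here because SRC2 is known to be misaligned.  */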
L(src1_aligned):
	/* Calculate offset from 8 byte alignment to string start in bits. No
	   need to mask offset since shifts are ignoring upper bits. */
	lsl	offset, src2, #3
	bic	src2, src2, #0xf
	mov	mask, -1
	neg	neg_offset, offset
	ldr	data1, [src1], #8
	ldp	tmp1, tmp2, [src2], #16
	LS_BK	mask, mask, neg_offset
	and	neg_offset, neg_offset, #63	/* Need actual value for cmp later. */
	/* Skip the first compare if data in tmp1 is irrelevant. */
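	/* (Bit 6 of offset is set exactly when (src2 & 15) >= 8, i.e. when
	   the string starts in tmp2 rather than tmp1.)  */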
	tbnz	offset, 6, L(misaligned_mid_loop)

L(loop_misaligned):
	/* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/
	LS_FW	data2, tmp1, offset
	LS_BK	tmp1, tmp2, neg_offset
	subs	limit, limit, #8
	orr	data2, data2, tmp1	/* 8 bytes from SRC2 combined from two regs.*/
	sub	has_nul, data1, zeroones
	eor	diff, data1, data2	/* Non-zero if differences found.  */
	orr	tmp3, data1, #REP8_7f
	csinv	endloop, diff, xzr, hi	/* If limit, set to all ones. */
	bic	has_nul, has_nul, tmp3	/* Non-zero if NUL byte found in SRC1. */
	orr	tmp3, endloop, has_nul
	cbnz	tmp3, L(full_check)

	ldr	data1, [src1], #8
L(misaligned_mid_loop):
	/* STEP_B: Compare first part of data1 to second part of tmp2. */
	LS_FW	data2, tmp2, offset
#ifdef __AARCH64EB__
	/* For big-endian we do a byte reverse to avoid the carry-propagation
	   problem described above.  This way we can reuse has_nul in the
	   next step and also use the syndrome-value trick at the end. */
	rev	tmp3, data1
	#define data1_fixed tmp3
#else
	#define data1_fixed data1
#endif
	sub	has_nul, data1_fixed, zeroones
	orr	tmp3, data1_fixed, #REP8_7f
	eor	diff, data2, data1	/* Non-zero if differences found.  */
	bic	has_nul, has_nul, tmp3	/* Non-zero if NUL terminator.  */
#ifdef __AARCH64EB__
	rev	has_nul, has_nul
#endif
	cmp	limit, neg_offset, lsr #3
	orr	syndrome, diff, has_nul
	bic	syndrome, syndrome, mask	/* Ignore later bytes. */
	csinv	tmp3, syndrome, xzr, hi	/* If limit, set to all ones. */
	cbnz	tmp3, L(syndrome_check)

	/* STEP_C: Compare second part of data1 to first part of tmp1. */
	ldp	tmp1, tmp2, [src2], #16
	cmp	limit, #8
	LS_BK	data2, tmp1, neg_offset
	eor	diff, data2, data1	/* Non-zero if differences found.  */
	orr	syndrome, diff, has_nul
	and	syndrome, syndrome, mask	/* Ignore earlier bytes. */
	csinv	tmp3, syndrome, xzr, hi	/* If limit, set to all ones. */
	cbnz	tmp3, L(syndrome_check)

	ldr	data1, [src1], #8
	sub	limit, limit, #8
	b	L(loop_misaligned)

#ifdef	__AARCH64EB__
L(syndrome_check):
	clz	pos, syndrome
	cmp	pos, limit, lsl #3
	b.lo	L(end_quick)
#endif

L(ret0):
	mov	result, #0
	ret
END(__strncmp_aarch64)