/*
 * strncmp - compare two strings
 *
 * Copyright (c) 2013-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64.
 * MTE compatible.
 */

#include "asmdefs.h"

#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f

/* Parameters and result.  */
#define src1		x0
#define src2		x1
#define limit		x2
#define result		x0

/* Internal variables.  */
#define data1		x3
#define data1w		w3
#define data2		x4
#define data2w		w4
#define has_nul		x5
#define diff		x6
#define syndrome	x7
#define tmp1		x8
#define tmp2		x9
#define tmp3		x10
#define zeroones	x11
#define pos		x12
#define mask		x13
#define endloop		x14
#define count		mask
#define offset		pos
#define neg_offset	x15

/* Define endian-dependent shift operations.
   On big-endian the early bytes are at the MSB end; on little-endian,
   at the LSB end.
   LS_FW means shifting towards early bytes.
   LS_BK means shifting towards later bytes.
   */
#ifdef __AARCH64EB__
#define LS_FW lsl
#define LS_BK lsr
#else
#define LS_FW lsr
#define LS_BK lsl
#endif
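/* For example, on little-endian a load places the first string byte in the
   least-significant bits of a register, so LS_FW is lsr and LS_BK is lsl;
   big-endian is the reverse.  */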

ENTRY (__strncmp_aarch64)
	cbz	limit, L(ret0)
	eor	tmp1, src1, src2
	mov	zeroones, #REP8_01
	tst	tmp1, #7
	and	count, src1, #7
	b.ne	L(misaligned8)
	cbnz	count, L(mutual_align)

	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
	   can be done in parallel across the entire word.  */
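	/* As an illustration, a C sketch of the same per-word test (not
	   part of the build):
		uint64_t has_nul (uint64_t x)
		{
		  return (x - 0x0101010101010101ull)
			 & ~(x | 0x7f7f7f7f7f7f7f7full);
		}
	   The sub/orr/bics sequence in the loop below computes exactly
	   this.  */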
	.p2align 4
L(loop_aligned):
	ldr	data1, [src1], #8
	ldr	data2, [src2], #8
L(start_realigned):
	subs	limit, limit, #8
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	eor	diff, data1, data2	/* Non-zero if differences found.  */
	csinv	endloop, diff, xzr, hi	/* Last Dword or differences.  */
	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
	ccmp	endloop, #0, #0, eq
	b.eq	L(loop_aligned)
	/* End of main loop.  */
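	/* The loop exits when the limit is reached (csinv forced endloop to
	   all ones), a difference is found (diff != 0), or a NUL byte is
	   seen (has_nul != 0); L(full_check) works out which case applies. */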

L(full_check):
#ifndef __AARCH64EB__
	orr	syndrome, diff, has_nul
	add	limit, limit, 8	/* Rewind limit to before last subs. */
L(syndrome_check):
	/* Limit was reached. Check if the NUL byte or the difference
	   is before the limit. */
	rev	syndrome, syndrome
	rev	data1, data1
	clz	pos, syndrome
	rev	data2, data2
	lsl	data1, data1, pos
	cmp	limit, pos, lsr #3
	lsl	data2, data2, pos
	/* But we need to zero-extend (char is unsigned) the value and then
	   perform a signed 32-bit subtraction.  */
	lsr	data1, data1, #56
	sub	result, data1, data2, lsr #56
	csel	result, result, xzr, hi
	ret
#else
	/* Not reached the limit, must have found the end or a diff.  */
	tbz	limit, #63, L(not_limit)
	add	tmp1, limit, 8
	cbz	tmp1, L(not_limit)

	lsl	limit, tmp1, #3	/* Bytes -> bits.  */
	mov	mask, #~0
	lsr	mask, mask, limit
	bic	data1, data1, mask
	bic	data2, data2, mask

	/* Make sure that the NUL byte is marked in the syndrome.  */
	orr	has_nul, has_nul, mask

L(not_limit):
	/* For big-endian we cannot use the trick with the syndrome value
	   as carry-propagation can corrupt the upper bits if the trailing
	   bytes in the string contain 0x01.  */
	/* However, if there is no NUL byte in the dword, we can generate
	   the result directly.  We can't just subtract the bytes as the
	   MSB might be significant.  */
	cbnz	has_nul, 1f
	cmp	data1, data2
	cset	result, ne
	cneg	result, result, lo
	ret
1:
	/* Re-compute the NUL-byte detection, using a byte-reversed value.  */
	rev	tmp3, data1
	sub	tmp1, tmp3, zeroones
	orr	tmp2, tmp3, #REP8_7f
	bic	has_nul, tmp1, tmp2
	rev	has_nul, has_nul
	orr	syndrome, diff, has_nul
	clz	pos, syndrome
	/* The most-significant-non-zero bit of the syndrome marks either the
	   first bit that is different, or the top bit of the first zero byte.
	   Shifting left now will bring the critical information into the
	   top bits.  */
L(end_quick):
	lsl	data1, data1, pos
	lsl	data2, data2, pos
	/* But we need to zero-extend (char is unsigned) the value and then
	   perform a signed 32-bit subtraction.  */
	lsr	data1, data1, #56
	sub	result, data1, data2, lsr #56
	ret
#endif

L(mutual_align):
	/* Sources are mutually aligned, but are not currently at an
	   alignment boundary.  Round down the addresses and then mask off
	   the bytes that precede the start point.
	   We also need to adjust the limit calculations, but without
	   overflowing if the limit is near ULONG_MAX.  */
	bic	src1, src1, #7
	bic	src2, src2, #7
	ldr	data1, [src1], #8
	neg	tmp3, count, lsl #3	/* 64 - bits(bytes beyond align). */
	ldr	data2, [src2], #8
	mov	tmp2, #~0
	LS_FW	tmp2, tmp2, tmp3	/* Shift (count & 63).  */
	/* Adjust the limit and ensure it doesn't overflow.  */
	adds	limit, limit, count
	csinv	limit, limit, xzr, lo
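	/* If the add carried, saturate limit to ULONG_MAX (~xzr) instead of
	   letting it wrap.  */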
	orr	data1, data1, tmp2
	orr	data2, data2, tmp2
	b	L(start_realigned)

	.p2align 4
	/* Don't bother with dwords for up to 16 bytes.  */
L(misaligned8):
	cmp	limit, #16
	b.hs	L(try_misaligned_words)

L(byte_loop):
	/* Perhaps we can do better than this.  */
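	/* The ccmp chain continues the loop only while bytes remain (hi
	   after the subs), data1w is not NUL (cs: data1w >= 1) and the two
	   bytes match (eq); a failed condition forces NZCV to 0, which
	   reads as not-equal.  */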
	ldrb	data1w, [src1], #1
	ldrb	data2w, [src2], #1
	subs	limit, limit, #1
	ccmp	data1w, #1, #0, hi	/* NZCV = 0b0000.  */
	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
	b.eq	L(byte_loop)
L(done):
	sub	result, data1, data2
	ret
	/* Align SRC1 to a dword boundary with a bytewise compare, then do
	   the dword loop.  */
L(try_misaligned_words):
	cbz	count, L(src1_aligned)

	neg	count, count
	and	count, count, #7
	sub	limit, limit, count
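	/* count now holds the 1..7 bytes needed to reach the next dword
	   boundary of src1; they are compared bytewise below.  */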

L(page_end_loop):
	ldrb	data1w, [src1], #1
	ldrb	data2w, [src2], #1
	cmp	data1w, #1
	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
	b.ne	L(done)
	subs	count, count, #1
	b.hi	L(page_end_loop)

	/* The following diagram explains the comparison of misaligned strings.
	   The bytes are shown in natural order. For little-endian, it is
	   reversed in the registers. The "x" bytes are before the string.
	   The "|" separates data that is loaded at one time.
	   src1     | a a a a a a a a | b b b c c c c c | . . .
	   src2     | x x x x x a a a   a a a a a b b b | c c c c c . . .

	   After shifting in each step, the data looks like this:
	                STEP_A              STEP_B              STEP_C
	   data1    a a a a a a a a     b b b c c c c c     b b b c c c c c
	   data2    a a a a a a a a     b b b 0 0 0 0 0     0 0 0 c c c c c

	   The bytes with "0" are eliminated from the syndrome via mask.

	   Align SRC2 down to 16 bytes. This way we can read 16 bytes at a
	   time from SRC2. The comparison happens in 3 steps. After each step
	   the loop can exit, or read from SRC1 or SRC2. */
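	/* A rough guide to the registers below: offset is the bit offset of
	   the string start within src2's 16-byte window, neg_offset its
	   negation modulo 64, and mask selects the syndrome bytes belonging
	   to the current step.  */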
L(src1_aligned):
	/* Calculate the offset from 8-byte alignment to the string start in
	   bits.  No need to mask offset since shifts ignore the upper bits. */
	lsl	offset, src2, #3
	bic	src2, src2, #0xf
	mov	mask, -1
	neg	neg_offset, offset
	ldr	data1, [src1], #8
	ldp	tmp1, tmp2, [src2], #16
	LS_BK	mask, mask, neg_offset
	and	neg_offset, neg_offset, #63	/* Need actual value for cmp later. */
	/* Skip the first compare if data in tmp1 is irrelevant. */
	tbnz	offset, 6, L(misaligned_mid_loop)
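	/* Bit 6 of offset is bit 3 of src2: when set, the string starts in
	   the second dword (tmp2), so the compare against tmp1 is skipped. */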

L(loop_misaligned):
	/* STEP_A: Compare full 8 bytes when there is enough data from SRC2. */
	LS_FW	data2, tmp1, offset
	LS_BK	tmp1, tmp2, neg_offset
	subs	limit, limit, #8
	orr	data2, data2, tmp1	/* 8 bytes from SRC2 combined from two regs. */
	sub	has_nul, data1, zeroones
	eor	diff, data1, data2	/* Non-zero if differences found.  */
	orr	tmp3, data1, #REP8_7f
	csinv	endloop, diff, xzr, hi	/* If limit, set to all ones. */
	bic	has_nul, has_nul, tmp3	/* Non-zero if NUL byte found in SRC1. */
	orr	tmp3, endloop, has_nul
	cbnz	tmp3, L(full_check)

	ldr	data1, [src1], #8
L(misaligned_mid_loop):
	/* STEP_B: Compare first part of data1 to second part of tmp2. */
	LS_FW	data2, tmp2, offset
#ifdef __AARCH64EB__
	/* For big-endian we do a byte reverse to avoid the carry-propagation
	   problem described above.  This way we can reuse has_nul in the
	   next step and also use the syndrome-value trick at the end. */
	rev	tmp3, data1
	#define data1_fixed tmp3
#else
	#define data1_fixed data1
#endif
	sub	has_nul, data1_fixed, zeroones
	orr	tmp3, data1_fixed, #REP8_7f
	eor	diff, data2, data1	/* Non-zero if differences found.  */
	bic	has_nul, has_nul, tmp3	/* Non-zero if NUL terminator.  */
#ifdef __AARCH64EB__
	rev	has_nul, has_nul
#endif
	cmp	limit, neg_offset, lsr #3
	orr	syndrome, diff, has_nul
	bic	syndrome, syndrome, mask	/* Ignore later bytes. */
	csinv	tmp3, syndrome, xzr, hi	/* If limit, set to all ones. */
	cbnz	tmp3, L(syndrome_check)

	/* STEP_C: Compare second part of data1 to first part of tmp1. */
	ldp	tmp1, tmp2, [src2], #16
	cmp	limit, #8
	LS_BK	data2, tmp1, neg_offset
	eor	diff, data2, data1	/* Non-zero if differences found.  */
	orr	syndrome, diff, has_nul
	and	syndrome, syndrome, mask	/* Ignore earlier bytes. */
	csinv	tmp3, syndrome, xzr, hi	/* If limit, set to all ones. */
	cbnz	tmp3, L(syndrome_check)

	ldr	data1, [src1], #8
	sub	limit, limit, #8
	b	L(loop_misaligned)

#ifdef	__AARCH64EB__
L(syndrome_check):
	clz	pos, syndrome
	cmp	pos, limit, lsl #3
	b.lo	L(end_quick)
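	/* Otherwise the first difference or NUL byte lies beyond the limit:
	   fall through to return 0.  */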
#endif

L(ret0):
	mov	result, #0
	ret
END(__strncmp_aarch64)