xref: /freebsd/contrib/arm-optimized-routines/string/arm/strcmp.S (revision 258a0d760aa8b42899a000e30f610f900a402556)
1/*
2 * strcmp for ARMv7
3 *
4 * Copyright (c) 2012-2022, Arm Limited.
5 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
6 */
7
8#if __ARM_ARCH >= 7 && __ARM_ARCH_ISA_ARM >= 1
9
10/* Implementation of strcmp for ARMv7 when DSP instructions are
11   available.  Use ldrd to support wider loads, provided the data
12   is sufficiently aligned.  Use saturating arithmetic to optimize
13   the compares.  */
14
15#include "asmdefs.h"
16
17/* Build Options:
18   STRCMP_NO_PRECHECK: Don't run a quick pre-check of the first
19   byte in the string.  If comparing completely random strings
20   the pre-check will save time, since there is a very high
21   probability of a mismatch in the first character: we save
22   significant overhead if this is the common case.  However,
23   if strings are likely to be identical (eg because we're
24   verifying a hit in a hash table), then this check is largely
25   redundant.  */
26
27#define STRCMP_NO_PRECHECK	0
28
29/* Ensure the .cantunwind directive is prepended to .fnend.
30   Leaf functions cannot throw exceptions - EHABI only supports
31   synchronous exceptions.  */
32#define IS_LEAF
33
34	/* This version uses Thumb-2 code.  */
35	.thumb
36	.syntax unified
37
38#ifdef __ARM_BIG_ENDIAN
39#define S2LO lsl
40#define S2LOEQ lsleq
41#define S2HI lsr
42#define MSB 0x000000ff
43#define LSB 0xff000000
44#define BYTE0_OFFSET 24
45#define BYTE1_OFFSET 16
46#define BYTE2_OFFSET 8
47#define BYTE3_OFFSET 0
48#else /* not  __ARM_BIG_ENDIAN */
49#define S2LO lsr
50#define S2LOEQ lsreq
51#define S2HI lsl
52#define BYTE0_OFFSET 0
53#define BYTE1_OFFSET 8
54#define BYTE2_OFFSET 16
55#define BYTE3_OFFSET 24
56#define MSB 0xff000000
57#define LSB 0x000000ff
58#endif /* not  __ARM_BIG_ENDIAN */
59
60/* Parameters and result.  */
61#define src1		r0
62#define src2		r1
63#define result		r0	/* Overlaps src1.  */
64
65/* Internal variables.  */
66#define tmp1		r4
67#define tmp2		r5
68#define const_m1	r12
69
70/* Additional internal variables for 64-bit aligned data.  */
71#define data1a		r2
72#define data1b		r3
73#define data2a		r6
74#define data2b		r7
75#define syndrome_a	tmp1
76#define syndrome_b	tmp2
77
78/* Additional internal variables for 32-bit aligned data.  */
79#define data1		r2
80#define data2		r3
81#define syndrome	tmp2
82
83
84	/* Macro to compute and return the result value for word-aligned
85	   cases.  */
86	.macro strcmp_epilogue_aligned synd d1 d2 restore_r6
87#ifdef __ARM_BIG_ENDIAN
88	/* If data1 contains a zero byte, then syndrome will contain a 1 in
89	   bit 7 of that byte.  Otherwise, the highest set bit in the
90	   syndrome will highlight the first different bit.  It is therefore
91	   sufficient to extract the eight bits starting with the syndrome
92	   bit.  */
93	clz	tmp1, \synd
94	lsl	r1, \d2, tmp1
95	.if \restore_r6
96	ldrd	r6, r7, [sp, #8]
97	.endif
98	.cfi_restore 6
99	.cfi_restore 7
100	lsl	\d1, \d1, tmp1
101	.cfi_remember_state
102	lsr	result, \d1, #24
103	ldrd	r4, r5, [sp], #16
104	.cfi_restore 4
105	.cfi_restore 5
106	.cfi_adjust_cfa_offset -16
107	sub	result, result, r1, lsr #24
108	epilogue push_ip=HAVE_PAC_LEAF
109#else
110	/* To use the big-endian trick we'd have to reverse all three words.
111	   that's slower than this approach.  */
112	rev	\synd, \synd
113	clz	tmp1, \synd
114	bic	tmp1, tmp1, #7
115	lsr	r1, \d2, tmp1
116	.cfi_remember_state
117	.if \restore_r6
118	ldrd	r6, r7, [sp, #8]
119	.endif
120	.cfi_restore 6
121	.cfi_restore 7
122	lsr	\d1, \d1, tmp1
123	and	result, \d1, #255
124	and	r1, r1, #255
125	ldrd	r4, r5, [sp], #16
126	.cfi_restore 4
127	.cfi_restore 5
128	.cfi_adjust_cfa_offset -16
129	sub	result, result, r1
130
131	epilogue push_ip=HAVE_PAC_LEAF
132#endif
133	.endm
134
135ENTRY(__strcmp_arm)
136	prologue push_ip=HAVE_PAC_LEAF
137#if STRCMP_NO_PRECHECK == 0
138	ldrb	r2, [src1]
139	ldrb	r3, [src2]
140	cmp	r2, #1
141	it	cs
142	cmpcs	r2, r3
143	bne	L(fastpath_exit)
144#endif
145	strd	r4, r5, [sp, #-16]!
146	.cfi_adjust_cfa_offset 16
147	.cfi_rel_offset 4, 0
148	.cfi_rel_offset 5, 4
149	orr	tmp1, src1, src2
150	strd	r6, r7, [sp, #8]
151	.cfi_rel_offset 6, 8
152	.cfi_rel_offset 7, 12
153	mvn	const_m1, #0
154	lsl	r2, tmp1, #29
155	cbz	r2, L(loop_aligned8)
156
157L(not_aligned):
158	eor	tmp1, src1, src2
159	tst	tmp1, #7
160	bne	L(misaligned8)
161
162	/* Deal with mutual misalignment by aligning downwards and then
163	   masking off the unwanted loaded data to prevent a difference.  */
164	and	tmp1, src1, #7
165	bic	src1, src1, #7
166	and	tmp2, tmp1, #3
167	bic	src2, src2, #7
168	lsl	tmp2, tmp2, #3	/* Bytes -> bits.  */
169	ldrd	data1a, data1b, [src1], #16
170	tst	tmp1, #4
171	ldrd	data2a, data2b, [src2], #16
172	/* In thumb code we can't use MVN with a register shift, but
173	   we do have ORN.  */
174	S2HI	tmp1, const_m1, tmp2
175	orn	data1a, data1a, tmp1
176	orn	data2a, data2a, tmp1
177	beq	L(start_realigned8)
178	orn	data1b, data1b, tmp1
179	mov	data1a, const_m1
180	orn	data2b, data2b, tmp1
181	mov	data2a, const_m1
182	b	L(start_realigned8)
183
184	/* Unwind the inner loop by a factor of 2, giving 16 bytes per
185	   pass.  */
186	.p2align 5,,12  /* Don't start in the tail bytes of a cache line.  */
187	.p2align 2	/* Always word aligned.  */
188L(loop_aligned8):
189	ldrd	data1a, data1b, [src1], #16
190	ldrd	data2a, data2b, [src2], #16
191L(start_realigned8):
192	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits,  */
193	eor	syndrome_a, data1a, data2a
194	sel	syndrome_a, syndrome_a, const_m1
195	cbnz	syndrome_a, L(diff_in_a)
196	uadd8	syndrome_b, data1b, const_m1	/* Only want GE bits.  */
197	eor	syndrome_b, data1b, data2b
198	sel	syndrome_b, syndrome_b, const_m1
199	cbnz	syndrome_b, L(diff_in_b)
200
201	ldrd	data1a, data1b, [src1, #-8]
202	ldrd	data2a, data2b, [src2, #-8]
203	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits,  */
204	eor	syndrome_a, data1a, data2a
205	sel	syndrome_a, syndrome_a, const_m1
206	uadd8	syndrome_b, data1b, const_m1	/* Only want GE bits.  */
207	eor	syndrome_b, data1b, data2b
208	sel	syndrome_b, syndrome_b, const_m1
209	/* Can't use CBZ for backwards branch.  */
210	orrs	syndrome_b, syndrome_b, syndrome_a /* Only need if s_a == 0 */
211	beq	L(loop_aligned8)
212
213L(diff_found):
214	cbnz	syndrome_a, L(diff_in_a)
215
216L(diff_in_b):
217	strcmp_epilogue_aligned syndrome_b, data1b, data2b 1
218
219L(diff_in_a):
220	.cfi_restore_state
221	strcmp_epilogue_aligned syndrome_a, data1a, data2a 1
222
223	.cfi_restore_state
224L(misaligned8):
225	tst	tmp1, #3
226	bne	L(misaligned4)
227	ands	tmp1, src1, #3
228	bne	L(mutual_align4)
229
230	/* Unrolled by a factor of 2, to reduce the number of post-increment
231	   operations.  */
232L(loop_aligned4):
233	ldr	data1, [src1], #8
234	ldr	data2, [src2], #8
235L(start_realigned4):
236	uadd8	syndrome, data1, const_m1	/* Only need GE bits.  */
237	eor	syndrome, data1, data2
238	sel	syndrome, syndrome, const_m1
239	cbnz	syndrome, L(aligned4_done)
240	ldr	data1, [src1, #-4]
241	ldr	data2, [src2, #-4]
242	uadd8	syndrome, data1, const_m1
243	eor	syndrome, data1, data2
244	sel	syndrome, syndrome, const_m1
245	cmp	syndrome, #0
246	beq	L(loop_aligned4)
247
248L(aligned4_done):
249	strcmp_epilogue_aligned syndrome, data1, data2, 0
250
251L(mutual_align4):
252	.cfi_restore_state
253	/* Deal with mutual misalignment by aligning downwards and then
254	   masking off the unwanted loaded data to prevent a difference.  */
255	lsl	tmp1, tmp1, #3	/* Bytes -> bits.  */
256	bic	src1, src1, #3
257	ldr	data1, [src1], #8
258	bic	src2, src2, #3
259	ldr	data2, [src2], #8
260
261	/* In thumb code we can't use MVN with a register shift, but
262	   we do have ORN.  */
263	S2HI	tmp1, const_m1, tmp1
264	orn	data1, data1, tmp1
265	orn	data2, data2, tmp1
266	b	L(start_realigned4)
267
268L(misaligned4):
269	ands	tmp1, src1, #3
270	beq	L(src1_aligned)
271	sub	src2, src2, tmp1
272	bic	src1, src1, #3
273	lsls	tmp1, tmp1, #31
274	ldr	data1, [src1], #4
275	beq	L(aligned_m2)
276	bcs	L(aligned_m1)
277
278#if STRCMP_NO_PRECHECK == 1
279	ldrb	data2, [src2, #1]
280	uxtb	tmp1, data1, ror #BYTE1_OFFSET
281	subs	tmp1, tmp1, data2
282	bne	L(misaligned_exit)
283	cbz	data2, L(misaligned_exit)
284
285L(aligned_m2):
286	ldrb	data2, [src2, #2]
287	uxtb	tmp1, data1, ror #BYTE2_OFFSET
288	subs	tmp1, tmp1, data2
289	bne	L(misaligned_exit)
290	cbz	data2, L(misaligned_exit)
291
292L(aligned_m1):
293	ldrb	data2, [src2, #3]
294	uxtb	tmp1, data1, ror #BYTE3_OFFSET
295	subs	tmp1, tmp1, data2
296	bne	L(misaligned_exit)
297	add	src2, src2, #4
298	cbnz	data2, L(src1_aligned)
299#else  /* STRCMP_NO_PRECHECK */
300	/* If we've done the pre-check, then we don't need to check the
301	   first byte again here.  */
302	ldrb	data2, [src2, #2]
303	uxtb	tmp1, data1, ror #BYTE2_OFFSET
304	subs	tmp1, tmp1, data2
305	bne	L(misaligned_exit)
306	cbz	data2, L(misaligned_exit)
307
308L(aligned_m2):
309	ldrb	data2, [src2, #3]
310	uxtb	tmp1, data1, ror #BYTE3_OFFSET
311	subs	tmp1, tmp1, data2
312	bne	L(misaligned_exit)
313	cbnz	data2, L(aligned_m1)
314#endif
315
316L(misaligned_exit):
317	.cfi_remember_state
318	mov	result, tmp1
319	ldr	r4, [sp], #16
320	.cfi_restore 4
321	.cfi_adjust_cfa_offset -16
322	epilogue push_ip=HAVE_PAC_LEAF
323
324#if STRCMP_NO_PRECHECK == 0
325L(fastpath_exit):
326	.cfi_restore_state
327	.cfi_remember_state
328	sub	r0, r2, r3
329	epilogue push_ip=HAVE_PAC_LEAF
330
331L(aligned_m1):
332	.cfi_restore_state
333	.cfi_remember_state
334	add	src2, src2, #4
335#endif
336L(src1_aligned):
337	.cfi_restore_state
338	/* src1 is word aligned, but src2 has no common alignment
339	   with it.  */
340	ldr	data1, [src1], #4
341	lsls	tmp1, src2, #31		/* C=src2[1], Z=src2[0].  */
342
343	bic	src2, src2, #3
344	ldr	data2, [src2], #4
345	bhi	L(overlap1)		/* C=1, Z=0 => src2[1:0] = 0b11.  */
346	bcs	L(overlap2)		/* C=1, Z=1 => src2[1:0] = 0b10.  */
347
348	/* (overlap3) C=0, Z=0 => src2[1:0] = 0b01.  */
349L(overlap3):
350	bic	tmp1, data1, #MSB
351	uadd8	syndrome, data1, const_m1
352	eors	syndrome, tmp1, data2, S2LO #8
353	sel	syndrome, syndrome, const_m1
354	bne	4f
355	cbnz	syndrome, 5f
356	ldr	data2, [src2], #4
357	eor	tmp1, tmp1, data1
358	cmp	tmp1, data2, S2HI #24
359	bne	6f
360	ldr	data1, [src1], #4
361	b	L(overlap3)
3624:
363	S2LO	data2, data2, #8
364	b	L(strcmp_tail)
365
3665:
367	bics	syndrome, syndrome, #MSB
368	bne	L(strcmp_done_equal)
369
370	/* We can only get here if the MSB of data1 contains 0, so
371	   fast-path the exit.  */
372	ldrb	result, [src2]
373	.cfi_remember_state
374	ldrd	r4, r5, [sp], #16
375	.cfi_restore 4
376	.cfi_restore 5
377	/* R6/7 Not used in this sequence.  */
378	.cfi_restore 6
379	.cfi_restore 7
380	.cfi_adjust_cfa_offset -16
381	neg	result, result
382	epilogue push_ip=HAVE_PAC_LEAF
3836:
384	.cfi_restore_state
385	S2LO	data1, data1, #24
386	and	data2, data2, #LSB
387	b	L(strcmp_tail)
388
389	.p2align 5,,12	/* Ensure at least 3 instructions in cache line.  */
390L(overlap2):
391	and	tmp1, data1, const_m1, S2LO #16
392	uadd8	syndrome, data1, const_m1
393	eors	syndrome, tmp1, data2, S2LO #16
394	sel	syndrome, syndrome, const_m1
395	bne	4f
396	cbnz	syndrome, 5f
397	ldr	data2, [src2], #4
398	eor	tmp1, tmp1, data1
399	cmp	tmp1, data2, S2HI #16
400	bne	6f
401	ldr	data1, [src1], #4
402	b	L(overlap2)
4034:
404	S2LO	data2, data2, #16
405	b	L(strcmp_tail)
4065:
407	ands	syndrome, syndrome, const_m1, S2LO #16
408	bne	L(strcmp_done_equal)
409
410	ldrh	data2, [src2]
411	S2LO	data1, data1, #16
412#ifdef __ARM_BIG_ENDIAN
413	lsl	data2, data2, #16
414#endif
415	b	L(strcmp_tail)
416
4176:
418	S2LO	data1, data1, #16
419	and	data2, data2, const_m1, S2LO #16
420	b	L(strcmp_tail)
421
422	.p2align 5,,12	/* Ensure at least 3 instructions in cache line.  */
423L(overlap1):
424	and	tmp1, data1, #LSB
425	uadd8	syndrome, data1, const_m1
426	eors	syndrome, tmp1, data2, S2LO #24
427	sel	syndrome, syndrome, const_m1
428	bne	4f
429	cbnz	syndrome, 5f
430	ldr	data2, [src2], #4
431	eor	tmp1, tmp1, data1
432	cmp	tmp1, data2, S2HI #8
433	bne	6f
434	ldr	data1, [src1], #4
435	b	L(overlap1)
4364:
437	S2LO	data2, data2, #24
438	b	L(strcmp_tail)
4395:
440	tst	syndrome, #LSB
441	bne	L(strcmp_done_equal)
442	ldr	data2, [src2]
4436:
444	S2LO	data1, data1, #8
445	bic	data2, data2, #MSB
446	b	L(strcmp_tail)
447
448L(strcmp_done_equal):
449	mov	result, #0
450	.cfi_remember_state
451	ldrd	r4, r5, [sp], #16
452	.cfi_restore 4
453	.cfi_restore 5
454	/* R6/7 not used in this sequence.  */
455	.cfi_restore 6
456	.cfi_restore 7
457	.cfi_adjust_cfa_offset -16
458	epilogue push_ip=HAVE_PAC_LEAF
459
460L(strcmp_tail):
461	.cfi_restore_state
462#ifndef __ARM_BIG_ENDIAN
463	rev	data1, data1
464	rev	data2, data2
465	/* Now everything looks big-endian...  */
466#endif
467	uadd8	tmp1, data1, const_m1
468	eor	tmp1, data1, data2
469	sel	syndrome, tmp1, const_m1
470	clz	tmp1, syndrome
471	lsl	data1, data1, tmp1
472	lsl	data2, data2, tmp1
473	lsr	result, data1, #24
474	ldrd	r4, r5, [sp], #16
475	.cfi_restore 4
476	.cfi_restore 5
477	/* R6/7 not used in this sequence.  */
478	.cfi_restore 6
479	.cfi_restore 7
480	.cfi_adjust_cfa_offset -16
481	sub	result, result, data2, lsr #24
482	epilogue push_ip=HAVE_PAC_LEAF
483
484END (__strcmp_arm)
485
486#endif /* __ARM_ARCH >= 7 && __ARM_ARCH_ISA_ARM >= 1  */
487