xref: /freebsd/contrib/arm-optimized-routines/string/arm/strcmp.S (revision c1d255d3ffdbe447de3ab875bf4e7d7accc5bfc5)
1/*
2 * strcmp for ARMv7
3 *
4 * Copyright (c) 2012-2021, Arm Limited.
5 * SPDX-License-Identifier: MIT
6 */
7
8#if __ARM_ARCH >= 7 && __ARM_ARCH_ISA_ARM >= 1
9
10/* Implementation of strcmp for ARMv7 when DSP instructions are
11   available.  Use ldrd to support wider loads, provided the data
12   is sufficiently aligned.  Use saturating arithmetic to optimize
13   the compares.  */
14
15#include "../asmdefs.h"
16
17/* Build Options:
18   STRCMP_NO_PRECHECK: Don't run a quick pre-check of the first
19   byte in the string.  If comparing completely random strings
20   the pre-check will save time, since there is a very high
21   probability of a mismatch in the first character: we save
22   significant overhead if this is the common case.  However,
23   if strings are likely to be identical (eg because we're
24   verifying a hit in a hash table), then this check is largely
25   redundant.  */
26
27#define STRCMP_NO_PRECHECK	0
28
29	/* This version uses Thumb-2 code.  */
30	.thumb
31	.syntax unified
32
33#ifdef __ARM_BIG_ENDIAN
34#define S2LO lsl
35#define S2LOEQ lsleq
36#define S2HI lsr
37#define MSB 0x000000ff
38#define LSB 0xff000000
39#define BYTE0_OFFSET 24
40#define BYTE1_OFFSET 16
41#define BYTE2_OFFSET 8
42#define BYTE3_OFFSET 0
43#else /* not  __ARM_BIG_ENDIAN */
44#define S2LO lsr
45#define S2LOEQ lsreq
46#define S2HI lsl
47#define BYTE0_OFFSET 0
48#define BYTE1_OFFSET 8
49#define BYTE2_OFFSET 16
50#define BYTE3_OFFSET 24
51#define MSB 0xff000000
52#define LSB 0x000000ff
53#endif /* not  __ARM_BIG_ENDIAN */
54
55/* Parameters and result.  */
56#define src1		r0
57#define src2		r1
58#define result		r0	/* Overlaps src1.  */
59
60/* Internal variables.  */
61#define tmp1		r4
62#define tmp2		r5
63#define const_m1	r12
64
65/* Additional internal variables for 64-bit aligned data.  */
66#define data1a		r2
67#define data1b		r3
68#define data2a		r6
69#define data2b		r7
70#define syndrome_a	tmp1
71#define syndrome_b	tmp2
72
73/* Additional internal variables for 32-bit aligned data.  */
74#define data1		r2
75#define data2		r3
76#define syndrome	tmp2
77
78
79	/* Macro to compute and return the result value for word-aligned
80	   cases.  */
81	.macro strcmp_epilogue_aligned synd d1 d2 restore_r6
82#ifdef __ARM_BIG_ENDIAN
83	/* If data1 contains a zero byte, then syndrome will contain a 1 in
84	   bit 7 of that byte.  Otherwise, the highest set bit in the
85	   syndrome will highlight the first different bit.  It is therefore
86	   sufficient to extract the eight bits starting with the syndrome
87	   bit.  */
88	clz	tmp1, \synd
89	lsl	r1, \d2, tmp1
90	.if \restore_r6
91	ldrd	r6, r7, [sp, #8]
92	.endif
93	.cfi_restore 6
94	.cfi_restore 7
95	lsl	\d1, \d1, tmp1
96	.cfi_remember_state
97	lsr	result, \d1, #24
98	ldrd	r4, r5, [sp], #16
99	.cfi_restore 4
100	.cfi_restore 5
101	sub	result, result, r1, lsr #24
102	bx	lr
103#else
104	/* To use the big-endian trick we'd have to reverse all three words.
105	   that's slower than this approach.  */
106	rev	\synd, \synd
107	clz	tmp1, \synd
108	bic	tmp1, tmp1, #7
109	lsr	r1, \d2, tmp1
110	.cfi_remember_state
111	.if \restore_r6
112	ldrd	r6, r7, [sp, #8]
113	.endif
114	.cfi_restore 6
115	.cfi_restore 7
116	lsr	\d1, \d1, tmp1
117	and	result, \d1, #255
118	and	r1, r1, #255
119	ldrd	r4, r5, [sp], #16
120	.cfi_restore 4
121	.cfi_restore 5
122	sub	result, result, r1
123
124	bx	lr
125#endif
126	.endm
127
128	.p2align	5
129L(strcmp_start_addr):
130#if STRCMP_NO_PRECHECK == 0
131L(fastpath_exit):
132	sub	r0, r2, r3
133	bx	lr
134	nop
135#endif
136ENTRY_ALIGN (__strcmp_arm, 0)
137#if STRCMP_NO_PRECHECK == 0
138	ldrb	r2, [src1]
139	ldrb	r3, [src2]
140	cmp	r2, #1
141	it	cs
142	cmpcs	r2, r3
143	bne	L(fastpath_exit)
144#endif
145	strd	r4, r5, [sp, #-16]!
146	.cfi_def_cfa_offset 16
147	.cfi_offset 4, -16
148	.cfi_offset 5, -12
149	orr	tmp1, src1, src2
150	strd	r6, r7, [sp, #8]
151	.cfi_offset 6, -8
152	.cfi_offset 7, -4
153	mvn	const_m1, #0
154	lsl	r2, tmp1, #29
155	cbz	r2, L(loop_aligned8)
156
157L(not_aligned):
158	eor	tmp1, src1, src2
159	tst	tmp1, #7
160	bne	L(misaligned8)
161
162	/* Deal with mutual misalignment by aligning downwards and then
163	   masking off the unwanted loaded data to prevent a difference.  */
164	and	tmp1, src1, #7
165	bic	src1, src1, #7
166	and	tmp2, tmp1, #3
167	bic	src2, src2, #7
168	lsl	tmp2, tmp2, #3	/* Bytes -> bits.  */
169	ldrd	data1a, data1b, [src1], #16
170	tst	tmp1, #4
171	ldrd	data2a, data2b, [src2], #16
172	/* In thumb code we can't use MVN with a register shift, but
173	   we do have ORN.  */
174	S2HI	tmp1, const_m1, tmp2
175	orn	data1a, data1a, tmp1
176	orn	data2a, data2a, tmp1
177	beq	L(start_realigned8)
178	orn	data1b, data1b, tmp1
179	mov	data1a, const_m1
180	orn	data2b, data2b, tmp1
181	mov	data2a, const_m1
182	b	L(start_realigned8)
183
184	/* Unwind the inner loop by a factor of 2, giving 16 bytes per
185	   pass.  */
186	.p2align 5,,12  /* Don't start in the tail bytes of a cache line.  */
187	.p2align 2	/* Always word aligned.  */
188L(loop_aligned8):
189	ldrd	data1a, data1b, [src1], #16
190	ldrd	data2a, data2b, [src2], #16
191L(start_realigned8):
192	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits,  */
193	eor	syndrome_a, data1a, data2a
194	sel	syndrome_a, syndrome_a, const_m1
195	cbnz	syndrome_a, L(diff_in_a)
196	uadd8	syndrome_b, data1b, const_m1	/* Only want GE bits.  */
197	eor	syndrome_b, data1b, data2b
198	sel	syndrome_b, syndrome_b, const_m1
199	cbnz	syndrome_b, L(diff_in_b)
200
201	ldrd	data1a, data1b, [src1, #-8]
202	ldrd	data2a, data2b, [src2, #-8]
203	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits,  */
204	eor	syndrome_a, data1a, data2a
205	sel	syndrome_a, syndrome_a, const_m1
206	uadd8	syndrome_b, data1b, const_m1	/* Only want GE bits.  */
207	eor	syndrome_b, data1b, data2b
208	sel	syndrome_b, syndrome_b, const_m1
209	/* Can't use CBZ for backwards branch.  */
210	orrs	syndrome_b, syndrome_b, syndrome_a /* Only need if s_a == 0 */
211	beq	L(loop_aligned8)
212
213L(diff_found):
214	cbnz	syndrome_a, L(diff_in_a)
215
216L(diff_in_b):
217	strcmp_epilogue_aligned syndrome_b, data1b, data2b 1
218
219L(diff_in_a):
220	.cfi_restore_state
221	strcmp_epilogue_aligned syndrome_a, data1a, data2a 1
222
223	.cfi_restore_state
224L(misaligned8):
225	tst	tmp1, #3
226	bne	L(misaligned4)
227	ands	tmp1, src1, #3
228	bne	L(mutual_align4)
229
230	/* Unrolled by a factor of 2, to reduce the number of post-increment
231	   operations.  */
232L(loop_aligned4):
233	ldr	data1, [src1], #8
234	ldr	data2, [src2], #8
235L(start_realigned4):
236	uadd8	syndrome, data1, const_m1	/* Only need GE bits.  */
237	eor	syndrome, data1, data2
238	sel	syndrome, syndrome, const_m1
239	cbnz	syndrome, L(aligned4_done)
240	ldr	data1, [src1, #-4]
241	ldr	data2, [src2, #-4]
242	uadd8	syndrome, data1, const_m1
243	eor	syndrome, data1, data2
244	sel	syndrome, syndrome, const_m1
245	cmp	syndrome, #0
246	beq	L(loop_aligned4)
247
248L(aligned4_done):
249	strcmp_epilogue_aligned syndrome, data1, data2, 0
250
251L(mutual_align4):
252	.cfi_restore_state
253	/* Deal with mutual misalignment by aligning downwards and then
254	   masking off the unwanted loaded data to prevent a difference.  */
255	lsl	tmp1, tmp1, #3	/* Bytes -> bits.  */
256	bic	src1, src1, #3
257	ldr	data1, [src1], #8
258	bic	src2, src2, #3
259	ldr	data2, [src2], #8
260
261	/* In thumb code we can't use MVN with a register shift, but
262	   we do have ORN.  */
263	S2HI	tmp1, const_m1, tmp1
264	orn	data1, data1, tmp1
265	orn	data2, data2, tmp1
266	b	L(start_realigned4)
267
268L(misaligned4):
269	ands	tmp1, src1, #3
270	beq	L(src1_aligned)
271	sub	src2, src2, tmp1
272	bic	src1, src1, #3
273	lsls	tmp1, tmp1, #31
274	ldr	data1, [src1], #4
275	beq	L(aligned_m2)
276	bcs	L(aligned_m1)
277
278#if STRCMP_NO_PRECHECK == 1
279	ldrb	data2, [src2, #1]
280	uxtb	tmp1, data1, ror #BYTE1_OFFSET
281	subs	tmp1, tmp1, data2
282	bne	L(misaligned_exit)
283	cbz	data2, L(misaligned_exit)
284
285L(aligned_m2):
286	ldrb	data2, [src2, #2]
287	uxtb	tmp1, data1, ror #BYTE2_OFFSET
288	subs	tmp1, tmp1, data2
289	bne	L(misaligned_exit)
290	cbz	data2, L(misaligned_exit)
291
292L(aligned_m1):
293	ldrb	data2, [src2, #3]
294	uxtb	tmp1, data1, ror #BYTE3_OFFSET
295	subs	tmp1, tmp1, data2
296	bne	L(misaligned_exit)
297	add	src2, src2, #4
298	cbnz	data2, L(src1_aligned)
299#else  /* STRCMP_NO_PRECHECK */
300	/* If we've done the pre-check, then we don't need to check the
301	   first byte again here.  */
302	ldrb	data2, [src2, #2]
303	uxtb	tmp1, data1, ror #BYTE2_OFFSET
304	subs	tmp1, tmp1, data2
305	bne	L(misaligned_exit)
306	cbz	data2, L(misaligned_exit)
307
308L(aligned_m2):
309	ldrb	data2, [src2, #3]
310	uxtb	tmp1, data1, ror #BYTE3_OFFSET
311	subs	tmp1, tmp1, data2
312	bne	L(misaligned_exit)
313	cbnz	data2, L(aligned_m1)
314#endif
315
316L(misaligned_exit):
317	.cfi_remember_state
318	mov	result, tmp1
319	ldr	r4, [sp], #16
320	.cfi_restore 4
321	bx	lr
322
323#if STRCMP_NO_PRECHECK == 0
324L(aligned_m1):
325	add	src2, src2, #4
326#endif
327L(src1_aligned):
328	.cfi_restore_state
329	/* src1 is word aligned, but src2 has no common alignment
330	   with it.  */
331	ldr	data1, [src1], #4
332	lsls	tmp1, src2, #31		/* C=src2[1], Z=src2[0].  */
333
334	bic	src2, src2, #3
335	ldr	data2, [src2], #4
336	bhi	L(overlap1)		/* C=1, Z=0 => src2[1:0] = 0b11.  */
337	bcs	L(overlap2)		/* C=1, Z=1 => src2[1:0] = 0b10.  */
338
339	/* (overlap3) C=0, Z=0 => src2[1:0] = 0b01.  */
340L(overlap3):
341	bic	tmp1, data1, #MSB
342	uadd8	syndrome, data1, const_m1
343	eors	syndrome, tmp1, data2, S2LO #8
344	sel	syndrome, syndrome, const_m1
345	bne	4f
346	cbnz	syndrome, 5f
347	ldr	data2, [src2], #4
348	eor	tmp1, tmp1, data1
349	cmp	tmp1, data2, S2HI #24
350	bne	6f
351	ldr	data1, [src1], #4
352	b	L(overlap3)
3534:
354	S2LO	data2, data2, #8
355	b	L(strcmp_tail)
356
3575:
358	bics	syndrome, syndrome, #MSB
359	bne	L(strcmp_done_equal)
360
361	/* We can only get here if the MSB of data1 contains 0, so
362	   fast-path the exit.  */
363	ldrb	result, [src2]
364	.cfi_remember_state
365	ldrd	r4, r5, [sp], #16
366	.cfi_restore 4
367	.cfi_restore 5
368	/* R6/7 Not used in this sequence.  */
369	.cfi_restore 6
370	.cfi_restore 7
371	neg	result, result
372	bx	lr
373
3746:
375	.cfi_restore_state
376	S2LO	data1, data1, #24
377	and	data2, data2, #LSB
378	b	L(strcmp_tail)
379
380	.p2align 5,,12	/* Ensure at least 3 instructions in cache line.  */
381L(overlap2):
382	and	tmp1, data1, const_m1, S2LO #16
383	uadd8	syndrome, data1, const_m1
384	eors	syndrome, tmp1, data2, S2LO #16
385	sel	syndrome, syndrome, const_m1
386	bne	4f
387	cbnz	syndrome, 5f
388	ldr	data2, [src2], #4
389	eor	tmp1, tmp1, data1
390	cmp	tmp1, data2, S2HI #16
391	bne	6f
392	ldr	data1, [src1], #4
393	b	L(overlap2)
3944:
395	S2LO	data2, data2, #16
396	b	L(strcmp_tail)
3975:
398	ands	syndrome, syndrome, const_m1, S2LO #16
399	bne	L(strcmp_done_equal)
400
401	ldrh	data2, [src2]
402	S2LO	data1, data1, #16
403#ifdef __ARM_BIG_ENDIAN
404	lsl	data2, data2, #16
405#endif
406	b	L(strcmp_tail)
407
4086:
409	S2LO	data1, data1, #16
410	and	data2, data2, const_m1, S2LO #16
411	b	L(strcmp_tail)
412
413	.p2align 5,,12	/* Ensure at least 3 instructions in cache line.  */
414L(overlap1):
415	and	tmp1, data1, #LSB
416	uadd8	syndrome, data1, const_m1
417	eors	syndrome, tmp1, data2, S2LO #24
418	sel	syndrome, syndrome, const_m1
419	bne	4f
420	cbnz	syndrome, 5f
421	ldr	data2, [src2], #4
422	eor	tmp1, tmp1, data1
423	cmp	tmp1, data2, S2HI #8
424	bne	6f
425	ldr	data1, [src1], #4
426	b	L(overlap1)
4274:
428	S2LO	data2, data2, #24
429	b	L(strcmp_tail)
4305:
431	tst	syndrome, #LSB
432	bne	L(strcmp_done_equal)
433	ldr	data2, [src2]
4346:
435	S2LO	data1, data1, #8
436	bic	data2, data2, #MSB
437	b	L(strcmp_tail)
438
439L(strcmp_done_equal):
440	mov	result, #0
441	.cfi_remember_state
442	ldrd	r4, r5, [sp], #16
443	.cfi_restore 4
444	.cfi_restore 5
445	/* R6/7 not used in this sequence.  */
446	.cfi_restore 6
447	.cfi_restore 7
448	bx	lr
449
450L(strcmp_tail):
451	.cfi_restore_state
452#ifndef __ARM_BIG_ENDIAN
453	rev	data1, data1
454	rev	data2, data2
455	/* Now everything looks big-endian...  */
456#endif
457	uadd8	tmp1, data1, const_m1
458	eor	tmp1, data1, data2
459	sel	syndrome, tmp1, const_m1
460	clz	tmp1, syndrome
461	lsl	data1, data1, tmp1
462	lsl	data2, data2, tmp1
463	lsr	result, data1, #24
464	ldrd	r4, r5, [sp], #16
465	.cfi_restore 4
466	.cfi_restore 5
467	/* R6/7 not used in this sequence.  */
468	.cfi_restore 6
469	.cfi_restore 7
470	sub	result, result, data2, lsr #24
471	bx	lr
472
473END (__strcmp_arm)
474
475#endif /* __ARM_ARCH >= 7 && __ARM_ARCH_ISA_ARM >= 1  */
476