xref: /freebsd/contrib/cortex-strings/src/arm/strcmp.S (revision 193d9e768ba63fcfb187cfd17f461f7d41345048)
1/*
2 * Copyright (c) 2012-2014 ARM Ltd
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 3. The name of the company may not be used to endorse or promote
14 *    products derived from this software without specific prior written
15 *    permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
18 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
19 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
22 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
24 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
25 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29/* Implementation of strcmp for ARMv7 when DSP instructions are
30   available.  Use ldrd to support wider loads, provided the data
31   is sufficiently aligned.  Use saturating arithmetic to optimize
32   the compares.  */
33
34/* Build Options:
35   STRCMP_NO_PRECHECK: Don't run a quick pre-check of the first
36   byte in the string.  If comparing completely random strings
37   the pre-check will save time, since there is a very high
38   probability of a mismatch in the first character: we save
39   significant overhead if this is the common case.  However,
40   if strings are likely to be identical (eg because we're
41   verifying a hit in a hash table), then this check is largely
42   redundant.  */
43
44#define STRCMP_NO_PRECHECK	0
45
46	/* This version uses Thumb-2 code.  */
47	.thumb
48	.syntax unified
49
50#ifdef __ARM_BIG_ENDIAN
51#define S2LO lsl
52#define S2LOEQ lsleq
53#define S2HI lsr
54#define MSB 0x000000ff
55#define LSB 0xff000000
56#define BYTE0_OFFSET 24
57#define BYTE1_OFFSET 16
58#define BYTE2_OFFSET 8
59#define BYTE3_OFFSET 0
60#else /* not  __ARM_BIG_ENDIAN */
61#define S2LO lsr
62#define S2LOEQ lsreq
63#define S2HI lsl
64#define BYTE0_OFFSET 0
65#define BYTE1_OFFSET 8
66#define BYTE2_OFFSET 16
67#define BYTE3_OFFSET 24
68#define MSB 0xff000000
69#define LSB 0x000000ff
70#endif /* not  __ARM_BIG_ENDIAN */
71
72	.macro def_fn f p2align=0
73	.text
74	.p2align \p2align
75	.global \f
76	.type \f, %function
77\f:
78	.endm
79
80/* Parameters and result.  */
81#define src1		r0
82#define src2		r1
83#define result		r0	/* Overlaps src1.  */
84
85/* Internal variables.  */
86#define tmp1		r4
87#define tmp2		r5
88#define const_m1	r12
89
90/* Additional internal variables for 64-bit aligned data.  */
91#define data1a		r2
92#define data1b		r3
93#define data2a		r6
94#define data2b		r7
95#define syndrome_a	tmp1
96#define syndrome_b	tmp2
97
98/* Additional internal variables for 32-bit aligned data.  */
99#define data1		r2
100#define data2		r3
101#define syndrome	tmp2
102
103
104	/* Macro to compute and return the result value for word-aligned
105	   cases.  */
106	.macro strcmp_epilogue_aligned synd d1 d2 restore_r6
107#ifdef __ARM_BIG_ENDIAN
108	/* If data1 contains a zero byte, then syndrome will contain a 1 in
109	   bit 7 of that byte.  Otherwise, the highest set bit in the
110	   syndrome will highlight the first different bit.  It is therefore
111	   sufficient to extract the eight bits starting with the syndrome
112	   bit.  */
113	clz	tmp1, \synd
114	lsl	r1, \d2, tmp1
115	.if \restore_r6
116	ldrd	r6, r7, [sp, #8]
117	.endif
118	.cfi_restore 6
119	.cfi_restore 7
120	lsl	\d1, \d1, tmp1
121	.cfi_remember_state
122	lsr	result, \d1, #24
123	ldrd	r4, r5, [sp], #16
124	.cfi_restore 4
125	.cfi_restore 5
126	sub	result, result, r1, lsr #24
127	bx	lr
128#else
129	/* To use the big-endian trick we'd have to reverse all three words.
130	   that's slower than this approach.  */
131	rev	\synd, \synd
132	clz	tmp1, \synd
133	bic	tmp1, tmp1, #7
134	lsr	r1, \d2, tmp1
135	.cfi_remember_state
136	.if \restore_r6
137	ldrd	r6, r7, [sp, #8]
138	.endif
139	.cfi_restore 6
140	.cfi_restore 7
141	lsr	\d1, \d1, tmp1
142	and	result, \d1, #255
143	and	r1, r1, #255
144	ldrd	r4, r5, [sp], #16
145	.cfi_restore 4
146	.cfi_restore 5
147	sub	result, result, r1
148
149	bx	lr
150#endif
151	.endm
152
153	.text
154	.p2align	5
155.Lstrcmp_start_addr:
156#if STRCMP_NO_PRECHECK == 0
157.Lfastpath_exit:
158	sub	r0, r2, r3
159	bx	lr
160	nop
161#endif
162def_fn	strcmp
163#if STRCMP_NO_PRECHECK == 0
164	ldrb	r2, [src1]
165	ldrb	r3, [src2]
166	cmp	r2, #1
167	it	cs
168	cmpcs	r2, r3
169	bne	.Lfastpath_exit
170#endif
171	.cfi_startproc
172	strd	r4, r5, [sp, #-16]!
173	.cfi_def_cfa_offset 16
174	.cfi_offset 4, -16
175	.cfi_offset 5, -12
176	orr	tmp1, src1, src2
177	strd	r6, r7, [sp, #8]
178	.cfi_offset 6, -8
179	.cfi_offset 7, -4
180	mvn	const_m1, #0
181	lsl	r2, tmp1, #29
182	cbz	r2, .Lloop_aligned8
183
184.Lnot_aligned:
185	eor	tmp1, src1, src2
186	tst	tmp1, #7
187	bne	.Lmisaligned8
188
189	/* Deal with mutual misalignment by aligning downwards and then
190	   masking off the unwanted loaded data to prevent a difference.  */
191	and	tmp1, src1, #7
192	bic	src1, src1, #7
193	and	tmp2, tmp1, #3
194	bic	src2, src2, #7
195	lsl	tmp2, tmp2, #3	/* Bytes -> bits.  */
196	ldrd	data1a, data1b, [src1], #16
197	tst	tmp1, #4
198	ldrd	data2a, data2b, [src2], #16
199	/* In thumb code we can't use MVN with a register shift, but
200	   we do have ORN.  */
201	S2HI	tmp1, const_m1, tmp2
202	orn	data1a, data1a, tmp1
203	orn	data2a, data2a, tmp1
204	beq	.Lstart_realigned8
205	orn	data1b, data1b, tmp1
206	mov	data1a, const_m1
207	orn	data2b, data2b, tmp1
208	mov	data2a, const_m1
209	b	.Lstart_realigned8
210
211	/* Unwind the inner loop by a factor of 2, giving 16 bytes per
212	   pass.  */
213	.p2align 5,,12  /* Don't start in the tail bytes of a cache line.  */
214	.p2align 2	/* Always word aligned.  */
215.Lloop_aligned8:
216	ldrd	data1a, data1b, [src1], #16
217	ldrd	data2a, data2b, [src2], #16
218.Lstart_realigned8:
219	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits,  */
220	eor	syndrome_a, data1a, data2a
221	sel	syndrome_a, syndrome_a, const_m1
222	cbnz	syndrome_a, .Ldiff_in_a
223	uadd8	syndrome_b, data1b, const_m1	/* Only want GE bits.  */
224	eor	syndrome_b, data1b, data2b
225	sel	syndrome_b, syndrome_b, const_m1
226	cbnz	syndrome_b, .Ldiff_in_b
227
228	ldrd	data1a, data1b, [src1, #-8]
229	ldrd	data2a, data2b, [src2, #-8]
230	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits,  */
231	eor	syndrome_a, data1a, data2a
232	sel	syndrome_a, syndrome_a, const_m1
233	uadd8	syndrome_b, data1b, const_m1	/* Only want GE bits.  */
234	eor	syndrome_b, data1b, data2b
235	sel	syndrome_b, syndrome_b, const_m1
236	/* Can't use CBZ for backwards branch.  */
237	orrs	syndrome_b, syndrome_b, syndrome_a /* Only need if s_a == 0 */
238	beq	.Lloop_aligned8
239
240.Ldiff_found:
241	cbnz	syndrome_a, .Ldiff_in_a
242
243.Ldiff_in_b:
244	strcmp_epilogue_aligned syndrome_b, data1b, data2b 1
245
246.Ldiff_in_a:
247	.cfi_restore_state
248	strcmp_epilogue_aligned syndrome_a, data1a, data2a 1
249
250	.cfi_restore_state
251.Lmisaligned8:
252	tst	tmp1, #3
253	bne	.Lmisaligned4
254	ands	tmp1, src1, #3
255	bne	.Lmutual_align4
256
257	/* Unrolled by a factor of 2, to reduce the number of post-increment
258	   operations.  */
259.Lloop_aligned4:
260	ldr	data1, [src1], #8
261	ldr	data2, [src2], #8
262.Lstart_realigned4:
263	uadd8	syndrome, data1, const_m1	/* Only need GE bits.  */
264	eor	syndrome, data1, data2
265	sel	syndrome, syndrome, const_m1
266	cbnz	syndrome, .Laligned4_done
267	ldr	data1, [src1, #-4]
268	ldr	data2, [src2, #-4]
269	uadd8	syndrome, data1, const_m1
270	eor	syndrome, data1, data2
271	sel	syndrome, syndrome, const_m1
272	cmp	syndrome, #0
273	beq	.Lloop_aligned4
274
275.Laligned4_done:
276	strcmp_epilogue_aligned syndrome, data1, data2, 0
277
278.Lmutual_align4:
279	.cfi_restore_state
280	/* Deal with mutual misalignment by aligning downwards and then
281	   masking off the unwanted loaded data to prevent a difference.  */
282	lsl	tmp1, tmp1, #3	/* Bytes -> bits.  */
283	bic	src1, src1, #3
284	ldr	data1, [src1], #8
285	bic	src2, src2, #3
286	ldr	data2, [src2], #8
287
288	/* In thumb code we can't use MVN with a register shift, but
289	   we do have ORN.  */
290	S2HI	tmp1, const_m1, tmp1
291	orn	data1, data1, tmp1
292	orn	data2, data2, tmp1
293	b	.Lstart_realigned4
294
295.Lmisaligned4:
296	ands	tmp1, src1, #3
297	beq	.Lsrc1_aligned
298	sub	src2, src2, tmp1
299	bic	src1, src1, #3
300	lsls	tmp1, tmp1, #31
301	ldr	data1, [src1], #4
302	beq	.Laligned_m2
303	bcs	.Laligned_m1
304
305#if STRCMP_NO_PRECHECK == 1
306	ldrb	data2, [src2, #1]
307	uxtb	tmp1, data1, ror #BYTE1_OFFSET
308	subs	tmp1, tmp1, data2
309	bne	.Lmisaligned_exit
310	cbz	data2, .Lmisaligned_exit
311
312.Laligned_m2:
313	ldrb	data2, [src2, #2]
314	uxtb	tmp1, data1, ror #BYTE2_OFFSET
315	subs	tmp1, tmp1, data2
316	bne	.Lmisaligned_exit
317	cbz	data2, .Lmisaligned_exit
318
319.Laligned_m1:
320	ldrb	data2, [src2, #3]
321	uxtb	tmp1, data1, ror #BYTE3_OFFSET
322	subs	tmp1, tmp1, data2
323	bne	.Lmisaligned_exit
324	add	src2, src2, #4
325	cbnz	data2, .Lsrc1_aligned
326#else  /* STRCMP_NO_PRECHECK */
327	/* If we've done the pre-check, then we don't need to check the
328	   first byte again here.  */
329	ldrb	data2, [src2, #2]
330	uxtb	tmp1, data1, ror #BYTE2_OFFSET
331	subs	tmp1, tmp1, data2
332	bne	.Lmisaligned_exit
333	cbz	data2, .Lmisaligned_exit
334
335.Laligned_m2:
336	ldrb	data2, [src2, #3]
337	uxtb	tmp1, data1, ror #BYTE3_OFFSET
338	subs	tmp1, tmp1, data2
339	bne	.Lmisaligned_exit
340	cbnz	data2, .Laligned_m1
341#endif
342
343.Lmisaligned_exit:
344	.cfi_remember_state
345	mov	result, tmp1
346	ldr	r4, [sp], #16
347	.cfi_restore 4
348	bx	lr
349
350#if STRCMP_NO_PRECHECK == 0
351.Laligned_m1:
352	add	src2, src2, #4
353#endif
354.Lsrc1_aligned:
355	.cfi_restore_state
356	/* src1 is word aligned, but src2 has no common alignment
357	   with it.  */
358	ldr	data1, [src1], #4
359	lsls	tmp1, src2, #31		/* C=src2[1], Z=src2[0].  */
360
361	bic	src2, src2, #3
362	ldr	data2, [src2], #4
363	bhi	.Loverlap1		/* C=1, Z=0 => src2[1:0] = 0b11.  */
364	bcs	.Loverlap2		/* C=1, Z=1 => src2[1:0] = 0b10.  */
365
366	/* (overlap3) C=0, Z=0 => src2[1:0] = 0b01.  */
367.Loverlap3:
368	bic	tmp1, data1, #MSB
369	uadd8	syndrome, data1, const_m1
370	eors	syndrome, tmp1, data2, S2LO #8
371	sel	syndrome, syndrome, const_m1
372	bne	4f
373	cbnz	syndrome, 5f
374	ldr	data2, [src2], #4
375	eor	tmp1, tmp1, data1
376	cmp	tmp1, data2, S2HI #24
377	bne	6f
378	ldr	data1, [src1], #4
379	b	.Loverlap3
3804:
381	S2LO	data2, data2, #8
382	b	.Lstrcmp_tail
383
3845:
385	bics	syndrome, syndrome, #MSB
386	bne	.Lstrcmp_done_equal
387
388	/* We can only get here if the MSB of data1 contains 0, so
389	   fast-path the exit.  */
390	ldrb	result, [src2]
391	.cfi_remember_state
392	ldrd	r4, r5, [sp], #16
393	.cfi_restore 4
394	.cfi_restore 5
395	/* R6/7 Not used in this sequence.  */
396	.cfi_restore 6
397	.cfi_restore 7
398	neg	result, result
399	bx	lr
400
4016:
402	.cfi_restore_state
403	S2LO	data1, data1, #24
404	and	data2, data2, #LSB
405	b	.Lstrcmp_tail
406
407	.p2align 5,,12	/* Ensure at least 3 instructions in cache line.  */
408.Loverlap2:
409	and	tmp1, data1, const_m1, S2LO #16
410	uadd8	syndrome, data1, const_m1
411	eors	syndrome, tmp1, data2, S2LO #16
412	sel	syndrome, syndrome, const_m1
413	bne	4f
414	cbnz	syndrome, 5f
415	ldr	data2, [src2], #4
416	eor	tmp1, tmp1, data1
417	cmp	tmp1, data2, S2HI #16
418	bne	6f
419	ldr	data1, [src1], #4
420	b	.Loverlap2
4214:
422	S2LO	data2, data2, #16
423	b	.Lstrcmp_tail
4245:
425	ands	syndrome, syndrome, const_m1, S2LO #16
426	bne	.Lstrcmp_done_equal
427
428	ldrh	data2, [src2]
429	S2LO	data1, data1, #16
430#ifdef __ARM_BIG_ENDIAN
431	lsl	data2, data2, #16
432#endif
433	b	.Lstrcmp_tail
434
4356:
436	S2LO	data1, data1, #16
437	and	data2, data2, const_m1, S2LO #16
438	b	.Lstrcmp_tail
439
440	.p2align 5,,12	/* Ensure at least 3 instructions in cache line.  */
441.Loverlap1:
442	and	tmp1, data1, #LSB
443	uadd8	syndrome, data1, const_m1
444	eors	syndrome, tmp1, data2, S2LO #24
445	sel	syndrome, syndrome, const_m1
446	bne	4f
447	cbnz	syndrome, 5f
448	ldr	data2, [src2], #4
449	eor	tmp1, tmp1, data1
450	cmp	tmp1, data2, S2HI #8
451	bne	6f
452	ldr	data1, [src1], #4
453	b	.Loverlap1
4544:
455	S2LO	data2, data2, #24
456	b	.Lstrcmp_tail
4575:
458	tst	syndrome, #LSB
459	bne	.Lstrcmp_done_equal
460	ldr	data2, [src2]
4616:
462	S2LO	data1, data1, #8
463	bic	data2, data2, #MSB
464	b	.Lstrcmp_tail
465
466.Lstrcmp_done_equal:
467	mov	result, #0
468	.cfi_remember_state
469	ldrd	r4, r5, [sp], #16
470	.cfi_restore 4
471	.cfi_restore 5
472	/* R6/7 not used in this sequence.  */
473	.cfi_restore 6
474	.cfi_restore 7
475	bx	lr
476
477.Lstrcmp_tail:
478	.cfi_restore_state
479#ifndef __ARM_BIG_ENDIAN
480	rev	data1, data1
481	rev	data2, data2
482	/* Now everything looks big-endian...  */
483#endif
484	uadd8	tmp1, data1, const_m1
485	eor	tmp1, data1, data2
486	sel	syndrome, tmp1, const_m1
487	clz	tmp1, syndrome
488	lsl	data1, data1, tmp1
489	lsl	data2, data2, tmp1
490	lsr	result, data1, #24
491	ldrd	r4, r5, [sp], #16
492	.cfi_restore 4
493	.cfi_restore 5
494	/* R6/7 not used in this sequence.  */
495	.cfi_restore 6
496	.cfi_restore 7
497	sub	result, result, data2, lsr #24
498	bx	lr
499	.cfi_endproc
500	.size strcmp, . - .Lstrcmp_start_addr
501