xref: /freebsd/contrib/arm-optimized-routines/string/arm/memchr.S (revision 035dd78d30ba28a3dc15c05ec85ad10127165677)
1/*
2 * memchr - scan memory for a character
3 *
4 * Copyright (c) 2010-2022, Arm Limited.
5 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
6 */
7
8/*
9   Written by Dave Gilbert <david.gilbert@linaro.org>
10
11   This __memchr_arm routine is optimised on a Cortex-A9 and should work on
12   all ARMv7 processors.   It has a fast past for short sizes, and has
13   an optimised path for large data sets; the worst case is finding the
14   match early in a large data set.
15
16 */
17
18@ 2011-02-07 david.gilbert@linaro.org
19@    Extracted from local git a5b438d861
20@ 2011-07-14 david.gilbert@linaro.org
21@    Import endianness fix from local git ea786f1b
22@ 2011-12-07 david.gilbert@linaro.org
23@    Removed unneeded cbz from align loop
24
25	.syntax unified
26#if __ARM_ARCH >= 8 && __ARM_ARCH_PROFILE == 'M'
27    /* keep config inherited from -march= */
28#else
29	.arch armv7-a
30#endif
31
32@ this lets us check a flag in a 00/ff byte easily in either endianness
33#ifdef __ARMEB__
34#define CHARTSTMASK(c) 1<<(31-(c*8))
35#else
36#define CHARTSTMASK(c) 1<<(c*8)
37#endif
38	.thumb
39#include "asmdefs.h"
40
41
42@ ---------------------------------------------------------------------------
43	.thumb_func
44	.align 2
45	.p2align 4,,15
46	.global __memchr_arm
47	.type __memchr_arm,%function
48	.fnstart
49	.cfi_startproc
50__memchr_arm:
51	@ r0 = start of memory to scan
52	@ r1 = character to look for
53	@ r2 = length
54	@ returns r0 = pointer to character or NULL if not found
55	prologue
56	and	r1,r1,#0xff	@ Don't think we can trust the caller to actually pass a char
57
58	cmp	r2,#16		@ If it's short don't bother with anything clever
59	blt	20f
60
61	tst	r0, #7		@ If it's already aligned skip the next bit
62	beq	10f
63
64	@ Work up to an aligned point
655:
66	ldrb	r3, [r0],#1
67	subs	r2, r2, #1
68	cmp	r3, r1
69	beq	50f		@ If it matches exit found
70	tst	r0, #7
71	bne	5b		@ If not aligned yet then do next byte
72
7310:
74	@ At this point, we are aligned, we know we have at least 8 bytes to work with
75	push	{r4,r5,r6,r7}
76	.cfi_adjust_cfa_offset 16
77	.cfi_rel_offset 4, 0
78	.cfi_rel_offset 5, 4
79	.cfi_rel_offset 6, 8
80	.cfi_rel_offset 7, 12
81	orr	r1, r1, r1, lsl #8	@ expand the match word across to all bytes
82	orr	r1, r1, r1, lsl #16
83	bic	r4, r2, #7	@ Number of double words to work with
84	mvns	r7, #0		@ all F's
85	movs	r3, #0
86
8715:
88	ldmia	r0!,{r5,r6}
89	subs	r4, r4, #8
90	eor	r5,r5, r1	@ Get it so that r5,r6 have 00's where the bytes match the target
91	eor	r6,r6, r1
92	uadd8	r5, r5, r7	@ Parallel add 0xff - sets the GE bits for anything that wasn't 0
93	sel	r5, r3, r7	@ bytes are 00 for none-00 bytes, or ff for 00 bytes - NOTE INVERSION
94	uadd8	r6, r6, r7	@ Parallel add 0xff - sets the GE bits for anything that wasn't 0
95	sel	r6, r5, r7	@ chained....bytes are 00 for none-00 bytes, or ff for 00 bytes - NOTE INVERSION
96	cbnz	r6, 60f
97	bne	15b		@ (Flags from the subs above) If not run out of bytes then go around again
98
99	pop	{r4,r5,r6,r7}
100	.cfi_restore 7
101	.cfi_restore 6
102	.cfi_restore 5
103	.cfi_restore 4
104	.cfi_adjust_cfa_offset -16
105	and	r1,r1,#0xff	@ Get r1 back to a single character from the expansion above
106	and	r2,r2,#7	@ Leave the count remaining as the number after the double words have been done
107
10820:
109	cbz	r2, 40f		@ 0 length or hit the end already then not found
110
11121:  @ Post aligned section, or just a short call
112	ldrb	r3,[r0],#1
113	subs	r2,r2,#1
114	eor	r3,r3,r1	@ r3 = 0 if match - doesn't break flags from sub
115	cbz	r3, 50f
116	bne	21b		@ on r2 flags
117
11840:
119	.cfi_remember_state
120	movs	r0,#0		@ not found
121	epilogue
122
12350:
124	.cfi_restore_state
125	.cfi_remember_state
126	subs	r0,r0,#1	@ found
127	epilogue
128
12960:  @ We're here because the fast path found a hit - now we have to track down exactly which word it was
130	@ r0 points to the start of the double word after the one that was tested
131	@ r5 has the 00/ff pattern for the first word, r6 has the chained value
132	.cfi_restore_state	@ Standard post-prologue state
133	.cfi_adjust_cfa_offset 16
134	.cfi_rel_offset	4, 0
135	.cfi_rel_offset 5, 4
136	.cfi_rel_offset 6, 8
137	.cfi_rel_offset 7, 12
138	cmp	r5, #0
139	itte	eq
140	moveq	r5, r6		@ the end is in the 2nd word
141	subeq	r0,r0,#3	@ Points to 2nd byte of 2nd word
142	subne	r0,r0,#7	@ or 2nd byte of 1st word
143
144	@ r0 currently points to the 3rd byte of the word containing the hit
145	tst	r5, # CHARTSTMASK(0)	@ 1st character
146	bne	61f
147	adds	r0,r0,#1
148	tst	r5, # CHARTSTMASK(1)	@ 2nd character
149	ittt	eq
150	addeq	r0,r0,#1
151	tsteq	r5, # (3<<15)		@ 2nd & 3rd character
152	@ If not the 3rd must be the last one
153	addeq	r0,r0,#1
154
15561:
156	pop	{r4,r5,r6,r7}
157	.cfi_restore 7
158	.cfi_restore 6
159	.cfi_restore 5
160	.cfi_restore 4
161	.cfi_adjust_cfa_offset -16
162	subs	r0,r0,#1
163	epilogue
164	.cfi_endproc
165	.cantunwind
166	.fnend
167
168	.size	__memchr_arm, . - __memchr_arm
169