/* xref: /freebsd/contrib/bionic-x86_64-string/sse2-strlen-slm.S (revision f81cdf24ba5436367377f7c8e8f51f6df2a75ca7) */
/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/*
 * SSE2 string-length routine (GNU as, AT&T syntax, preprocessed by cpp).
 *
 * In:       %rdi = s, pointer to a NUL-terminated byte string
 * Out:      %rax = number of bytes preceding the first NUL
 * Clobbers: %rcx, %rdx, %r10, %xmm0-%xmm5, flags
 * %rdi is never written.
 *
 * Build modes:
 *   - Default: the helper macros below are defined (unless the includer
 *     already provided them) and the body is wrapped in ENTRY/END as a
 *     global function named STRLEN (default: strlen).
 *   - USE_AS_STRCAT defined: only the bare body is emitted.  No ENTRY,
 *     END, L() or RETURN is defined here, so the including file must
 *     supply L() and RETURN itself.  The final exit path (exit64) then
 *     ends without RETURN and falls through into whatever follows the
 *     inclusion point.
 *
 * Invariant used throughout: pcmpeqb of a zeroed register against a
 * block that contains no NUL byte leaves that register all-zero again,
 * so %xmm0-%xmm3 can be reused as the zero comparand after every probe
 * that did not match.
 */
#ifndef USE_AS_STRCAT

#ifndef STRLEN
# define STRLEN		strlen
#endif

#ifndef L
# define L(label)	.L##label
#endif

#ifndef cfi_startproc
# define cfi_startproc			.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc			.cfi_endproc
#endif

#ifndef ENTRY
# define ENTRY(name)			\
	.type name,  @function;		\
	.globl name;			\
	.p2align 4;			\
name:					\
	cfi_startproc
#endif

#ifndef END
# define END(name)			\
	cfi_endproc;			\
	.size name, .-name
#endif
#define RETURN ret
	.section .text.sse2,"ax",@progbits
ENTRY (STRLEN)
/* end ifndef USE_AS_STRCAT */
#endif
	xor	%eax, %eax		/* rax = 0; exit_less16 adds the bit index to it */
	mov	%edi, %ecx
	and	$0x3f, %ecx		/* ecx = s mod 64 */
	pxor	%xmm0, %xmm0
	cmp	$0x30, %ecx
	ja	L(next)			/* s mod 64 > 48: 16 bytes at s would cross a 64-byte line */

	/* Unaligned 16-byte probe starting exactly at s. */
	movdqu	(%rdi), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	L(exit_less16)
	mov	%rdi, %rax
	and	$-16, %rax		/* rax = s rounded down to 16 */
	jmp	L(align16_start)

L(next):
	/*
	 * Aligned probe of the 16-byte block that contains s; match bits
	 * for bytes located before s are masked off.
	 */
	mov	%rdi, %rax
	and	$-16, %rax		/* rax = s rounded down to 16 */
	pcmpeqb	(%rax), %xmm0
	mov	$-1, %r10d
	sub	%rax, %rcx		/* low 5 bits of rcx == s mod 16 (rax has bits 0-3 clear) */
	shl	%cl, %r10d		/* 32-bit shift uses cl mod 32: r10d = -1 << (s mod 16) */
	pmovmskb %xmm0, %edx
	and	%r10d, %edx
	jnz	L(exit)

L(align16_start):
	/*
	 * rax = 16-byte aligned address of the block already examined.
	 * Four unrolled rounds follow; each probes the aligned blocks at
	 * rax+16, rax+32, rax+48 and rax+64.  Rounds 2-4 advance rax by 64
	 * while probing their first block, so the exitNN stubs always see
	 * the matching block at rax+NN.
	 */
	pxor	%xmm0, %xmm0
	pxor	%xmm1, %xmm1
	pxor	%xmm2, %xmm2
	pxor	%xmm3, %xmm3

	/* Round 1. */
	pcmpeqb	16(%rax), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	L(exit16)

	pcmpeqb	32(%rax), %xmm1
	pmovmskb %xmm1, %edx
	test	%edx, %edx
	jnz	L(exit32)

	pcmpeqb	48(%rax), %xmm2
	pmovmskb %xmm2, %edx
	test	%edx, %edx
	jnz	L(exit48)

	pcmpeqb	64(%rax), %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	jnz	L(exit64)

	/* Round 2. */
	pcmpeqb	80(%rax), %xmm0
	add	$64, %rax		/* probed block is now at rax+16 */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	L(exit16)

	pcmpeqb	32(%rax), %xmm1
	pmovmskb %xmm1, %edx
	test	%edx, %edx
	jnz	L(exit32)

	pcmpeqb	48(%rax), %xmm2
	pmovmskb %xmm2, %edx
	test	%edx, %edx
	jnz	L(exit48)

	pcmpeqb	64(%rax), %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	jnz	L(exit64)

	/* Round 3. */
	pcmpeqb	80(%rax), %xmm0
	add	$64, %rax
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	L(exit16)

	pcmpeqb	32(%rax), %xmm1
	pmovmskb %xmm1, %edx
	test	%edx, %edx
	jnz	L(exit32)

	pcmpeqb	48(%rax), %xmm2
	pmovmskb %xmm2, %edx
	test	%edx, %edx
	jnz	L(exit48)

	pcmpeqb	64(%rax), %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	jnz	L(exit64)

	/* Round 4. */
	pcmpeqb	80(%rax), %xmm0
	add	$64, %rax
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	L(exit16)

	pcmpeqb	32(%rax), %xmm1
	pmovmskb %xmm1, %edx
	test	%edx, %edx
	jnz	L(exit32)

	pcmpeqb	48(%rax), %xmm2
	pmovmskb %xmm2, %edx
	test	%edx, %edx
	jnz	L(exit48)

	pcmpeqb	64(%rax), %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	jnz	L(exit64)

	/*
	 * Bring rax to a 64-byte boundary for the main loop.  All bytes
	 * below rax+80 have been examined.  rax is 16-byte aligned, so
	 * rax mod 64 is 0, 16, 32 or 48.  The first probe advances rax by
	 * 80 (== 16 mod 64) and the next two by 16 each, so a 64-byte
	 * boundary is reached after at most three probes:
	 *     48 -> 0          (one probe)
	 *     32 -> 48 -> 0    (two probes)
	 *     16 -> 32 -> 48 -> 0  (three probes)
	 * Hence no alignment test is needed after the third probe.  After
	 * each probe rax points at the block just examined, which is what
	 * L(exit) expects.
	 */
	test	$0x3f, %rax
	jz	L(align64_loop)

	pcmpeqb	80(%rax), %xmm0
	add	$80, %rax
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	L(exit)

	test	$0x3f, %rax
	jz	L(align64_loop)

	pcmpeqb	16(%rax), %xmm1
	add	$16, %rax
	pmovmskb %xmm1, %edx
	test	%edx, %edx
	jnz	L(exit)

	test	$0x3f, %rax
	jz	L(align64_loop)

	pcmpeqb	16(%rax), %xmm2
	add	$16, %rax
	pmovmskb %xmm2, %edx
	test	%edx, %edx
	jnz	L(exit)

	/* rax is 64-byte aligned here; fall through into the loop. */
	.p2align 4
L(align64_loop):
	/*
	 * Examine 64 aligned bytes per iteration.  The unsigned byte-wise
	 * minimum of the four 16-byte blocks has a zero byte iff at least
	 * one of the blocks has one.  xmm0 is all-zero here.
	 */
	movaps	(%rax), %xmm4
	pminub	16(%rax), %xmm4
	movaps	32(%rax), %xmm5
	pminub	48(%rax), %xmm5
	add	$64, %rax
	pminub	%xmm4, %xmm5
	pcmpeqb	%xmm0, %xmm5
	pmovmskb %xmm5, %edx
	test	%edx, %edx
	jz	L(align64_loop)

	/*
	 * A NUL lies somewhere in [rax-64, rax).  Re-probe the four blocks
	 * one by one.  After the sub below rax = (start of that 64-byte
	 * group) - 16, so the group's blocks sit at rax+16 .. rax+64 and
	 * the regular exit stubs apply.
	 */
	pcmpeqb	-64(%rax), %xmm0
	sub	$80, %rax
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	L(exit16)

	pcmpeqb	32(%rax), %xmm1
	pmovmskb %xmm1, %edx
	test	%edx, %edx
	jnz	L(exit32)

	pcmpeqb	48(%rax), %xmm2
	pmovmskb %xmm2, %edx
	test	%edx, %edx
	jnz	L(exit48)

	/* Only the last block is left, so it must hold the NUL: no test. */
	pcmpeqb	64(%rax), %xmm3
	pmovmskb %xmm3, %edx
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$64, %rax
	RETURN

	/*
	 * Exit stubs.  edx holds a non-zero pmovmskb mask for one 16-byte
	 * block; bsf yields the index of the first NUL inside that block.
	 *   exit        : block starts at rax
	 *   exit_less16 : block starts at s and rax is already 0
	 *   exitNN      : block starts at rax + NN
	 * Result: (block start - s) + index.
	 */
	.p2align 4
L(exit):
	sub	%rdi, %rax
L(exit_less16):
	bsf	%rdx, %rdx
	add	%rdx, %rax
	RETURN
	.p2align 4
L(exit16):
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$16, %rax
	RETURN
	.p2align 4
L(exit32):
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$32, %rax
	RETURN
	.p2align 4
L(exit48):
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$48, %rax
	RETURN
	.p2align 4
L(exit64):
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$64, %rax
	/* With USE_AS_STRCAT this path falls through past the end of the file. */
#ifndef USE_AS_STRCAT
	RETURN

END (STRLEN)
#endif
