/* xref: /freebsd/contrib/bionic-x86_64-string/sse2-strlen-slm.S (revision 8ddb146abcdf061be9f2c0db7e391697dafad85c) */
/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/*
 * String length for x86-64 using SSE2.
 * Syntax: GNU as, AT&T operand order, run through the C preprocessor.
 *
 * C view:  size_t STRLEN(const char *s);    STRLEN defaults to strlen
 * In:      %rdi = s
 * Out:     %rax = number of bytes before the first NUL byte
 * Written: %rax, %rcx, %rdx, %r10, %xmm0-%xmm5, flags
 * Stack:   not used
 *
 * (%rdi in / %rax out is the System V AMD64 argument/return convention.)
 *
 * Memory access pattern: the first probe is either one unaligned 16-byte
 * load that stays inside the 64-byte aligned block holding s[0], or one
 * aligned 16-byte read of the block holding s[0].  Every later read uses a
 * 16-byte aligned address.
 *
 * USE_AS_STRCAT: when this macro is defined, the macro preamble, the
 * section directive, ENTRY, the final RETURN and END are all left out, and
 * RETURN is not defined here.  Only the scan code remains; its last exit
 * path (exit64) then ends without a return and runs into whatever follows.
 * Presumably the file is #included into another routine that supplies
 * RETURN -- confirm against the including file.
 */
#ifndef USE_AS_STRCAT

#ifndef STRLEN
# define STRLEN		strlen
#endif

#ifndef L
# define L(label)	.L##label
#endif

#ifndef cfi_startproc
# define cfi_startproc			.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc			.cfi_endproc
#endif

#ifndef ENTRY
# define ENTRY(name)			\
	.type name,  @function;		\
	.globl name;			\
	.p2align 4;			\
name:					\
	cfi_startproc
#endif

#ifndef END
# define END(name)			\
	cfi_endproc;			\
	.size name, .-name
#endif
#define RETURN ret
	.section .text.sse2,"ax",@progbits
ENTRY (STRLEN)
/* end ifndef USE_AS_STRCAT */
#endif
	/*
	 * rax = 0 so that exit_less16 can return the bare byte index.
	 * The 32-bit xor clears all of %rax (a 32-bit register write
	 * zeroes the upper half) and has the shorter encoding.
	 */
	xor	%eax, %eax
	mov	%edi, %ecx
	and	$0x3f, %ecx		/* ecx = s mod 64 */
	pxor	%xmm0, %xmm0
	/*
	 * s mod 64 <= 48: the 16 bytes starting at s end at or before the
	 * end of the 64-byte aligned block containing s[0], so they can be
	 * loaded unaligned without touching the next block.
	 */
	cmp	$0x30, %ecx
	ja	L(next)
	movdqu	(%rdi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* 0xff in each lane where the byte is 0 */
	pmovmskb %xmm0, %edx		/* one mask bit per byte lane */
	test	%edx, %edx
	jnz	L(exit_less16)
	mov	%rdi, %rax
	and	$-16, %rax		/* rax = s rounded down to 16 */
	jmp	L(align16_start)
L(next):
	/*
	 * s mod 64 > 48: read the aligned 16-byte block that contains s[0]
	 * and discard matches in the bytes that precede s.
	 */
	mov	%rdi, %rax
	and	$-16, %rax
	pcmpeqb	(%rax), %xmm0
	mov	$-1, %r10d
	/*
	 * rcx = (s mod 64) - (s rounded down to 16).  Only cl mod 32 is used
	 * by the 32-bit shift below, and that value equals s mod 16.
	 */
	sub	%rax, %rcx
	shl	%cl, %r10d		/* r10d = mask with the low (s mod 16) bits clear */
	pmovmskb %xmm0, %edx
	and	%r10d, %edx
	jnz	L(exit)
L(align16_start):
	/*
	 * From here on rax is 16-byte aligned and everything below
	 * 16(%rax) that belongs to the string is known to be non-NUL.
	 *
	 * Invariant: a pcmpeqb that finds no NUL leaves its destination
	 * register all-zero, so xmm0-xmm3 keep serving as the zero operand
	 * for the following compares without being cleared again.
	 *
	 * Four unrolled groups of four 16-byte blocks follow.  On a hit the
	 * exitNN stub computes (rax - s) + NN + index of the first set bit.
	 */
	pxor	%xmm0, %xmm0
	pxor	%xmm1, %xmm1
	pxor	%xmm2, %xmm2
	pxor	%xmm3, %xmm3
	pcmpeqb	16(%rax), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	L(exit16)

	pcmpeqb	32(%rax), %xmm1
	pmovmskb %xmm1, %edx
	test	%edx, %edx
	jnz	L(exit32)

	pcmpeqb	48(%rax), %xmm2
	pmovmskb %xmm2, %edx
	test	%edx, %edx
	jnz	L(exit48)

	pcmpeqb	64(%rax), %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	jnz	L(exit64)

	/* Second group: the block at old rax+80 becomes 16(%rax) after the add. */
	pcmpeqb	80(%rax), %xmm0
	add	$64, %rax
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	L(exit16)

	pcmpeqb	32(%rax), %xmm1
	pmovmskb %xmm1, %edx
	test	%edx, %edx
	jnz	L(exit32)

	pcmpeqb	48(%rax), %xmm2
	pmovmskb %xmm2, %edx
	test	%edx, %edx
	jnz	L(exit48)

	pcmpeqb	64(%rax), %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	jnz	L(exit64)

	/* Third group. */
	pcmpeqb	80(%rax), %xmm0
	add	$64, %rax
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	L(exit16)

	pcmpeqb	32(%rax), %xmm1
	pmovmskb %xmm1, %edx
	test	%edx, %edx
	jnz	L(exit32)

	pcmpeqb	48(%rax), %xmm2
	pmovmskb %xmm2, %edx
	test	%edx, %edx
	jnz	L(exit48)

	pcmpeqb	64(%rax), %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	jnz	L(exit64)

	/* Fourth group. */
	pcmpeqb	80(%rax), %xmm0
	add	$64, %rax
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	L(exit16)

	pcmpeqb	32(%rax), %xmm1
	pmovmskb %xmm1, %edx
	test	%edx, %edx
	jnz	L(exit32)

	pcmpeqb	48(%rax), %xmm2
	pmovmskb %xmm2, %edx
	test	%edx, %edx
	jnz	L(exit48)

	pcmpeqb	64(%rax), %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	jnz	L(exit64)


	/*
	 * Walk single 16-byte blocks until rax is 64-byte aligned, then
	 * enter the 64-byte loop.  The loop starts reading at (%rax), which
	 * at each of these jumps is a block that has already been checked,
	 * so some bytes are read twice; no byte is skipped.
	 */
	test	$0x3f, %rax
	jz	L(align64_loop)

	/* After the add, rax points at the block just compared, so a hit
	   goes to the plain exit stub (no extra offset). */
	pcmpeqb	80(%rax), %xmm0
	add	$80, %rax
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	L(exit)

	test	$0x3f, %rax
	jz	L(align64_loop)

	pcmpeqb	16(%rax), %xmm1
	add	$16, %rax
	pmovmskb %xmm1, %edx
	test	%edx, %edx
	jnz	L(exit)

	test	$0x3f, %rax
	jz	L(align64_loop)

	pcmpeqb	16(%rax), %xmm2
	add	$16, %rax
	pmovmskb %xmm2, %edx
	test	%edx, %edx
	jnz	L(exit)

	test	$0x3f, %rax
	jz	L(align64_loop)

	/*
	 * NOTE(review): rax is 16-byte aligned at the first test above, so
	 * one of the four tests appears to always find it 64-byte aligned,
	 * which would make this last step unreachable.  Kept as in the
	 * original.
	 */
	pcmpeqb	16(%rax), %xmm3
	add	$16, %rax
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	jnz	L(exit)

	add	$16, %rax
	.p2align 4
L(align64_loop):
	/*
	 * rax is 64-byte aligned (movaps requires 16-byte alignment).
	 * Fold four 16-byte blocks with unsigned byte minimum: the result
	 * has a zero byte exactly when one of the 64 input bytes is zero.
	 * xmm0 is all-zero here (see the invariant at align16_start).
	 */
	movaps	(%rax), %xmm4
	pminub	16(%rax), %xmm4
	movaps	32(%rax), %xmm5
	pminub	48(%rax), %xmm5
	add	$64, %rax
	pminub	%xmm4, %xmm5
	pcmpeqb	%xmm0, %xmm5
	pmovmskb %xmm5, %edx
	test	%edx, %edx
	jz	L(align64_loop)


	/*
	 * The 64 bytes that end at rax contain a NUL.  Re-test the four
	 * blocks one by one; rax is moved to 16 bytes before the group so
	 * the shared exit16/32/48 stubs add the right offsets.
	 */
	pcmpeqb	-64(%rax), %xmm0
	sub	$80, %rax
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	L(exit16)

	pcmpeqb	32(%rax), %xmm1
	pmovmskb %xmm1, %edx
	test	%edx, %edx
	jnz	L(exit32)

	pcmpeqb	48(%rax), %xmm2
	pmovmskb %xmm2, %edx
	test	%edx, %edx
	jnz	L(exit48)

	/* The first three blocks were clean, so the NUL is in the fourth
	   and the mask is non-zero: no test needed before bsf. */
	pcmpeqb	64(%rax), %xmm3
	pmovmskb %xmm3, %edx
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$64, %rax
	RETURN

	/*
	 * Exit stubs.  edx holds a non-zero byte mask; bsf yields the index
	 * of the first NUL inside the block.  exit_less16 is entered with
	 * rax == 0; the others first turn rax into a byte offset from s.
	 */
	.p2align 4
L(exit):
	sub	%rdi, %rax
L(exit_less16):
	bsf	%rdx, %rdx
	add	%rdx, %rax
	RETURN
	.p2align 4
L(exit16):
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$16, %rax
	RETURN
	.p2align 4
L(exit32):
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$32, %rax
	RETURN
	.p2align 4
L(exit48):
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$48, %rax
	RETURN
	.p2align 4
L(exit64):
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$64, %rax
	/* With USE_AS_STRCAT defined there is no return here: control
	   continues into the code that follows this file's contents. */
#ifndef USE_AS_STRCAT
	RETURN

END (STRLEN)
#endif