/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2009, Intel Corporation
 * All rights reserved.
 */

/*
 *	str[n]cmp - compare chars between two strings
 */
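
/*
 * Overview: the strings are compared 16 bytes at a time with SSE2.  When
 * the two pointers do not share the same offset within a 16-byte block,
 * control is dispatched through unaligned_table to one of the ashr_N
 * cases, which rebuild the unaligned source data with psrldq/pslldq/por
 * so that the main loops only ever issue 16-byte aligned loads.  The
 * nibble_ashr_N paths keep those loads from touching a 4K page the string
 * has not yet been shown to reach into.
 */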

#include "SYS.h"
#include "proc64_id.h"

#define LABEL(s) .strcmp##s

#ifdef USE_AS_STRNCMP
	/*
	 * Since the counter, %r11, is unsigned, we branch to strcmp_exitz
	 * if the new counter > the old one (the subtraction wrapped) or is 0.
	 */
#define UPDATE_STRNCMP_COUNTER				\
	/* calculate the number of bytes left to compare */	\
	lea	-16(%rcx, %r11), %r9;			\
	cmp	%r9, %r11;				\
	jb	LABEL(strcmp_exitz);			\
	test	%r9, %r9;				\
	je	LABEL(strcmp_exitz);			\
	mov	%r9, %r11
#else
#define UPDATE_STRNCMP_COUNTER
#endif
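
/*
 * Note on the arithmetic above: at the point the macro is used, %rcx holds
 * the string offset within the current 16-byte block, so (16 - %rcx) bytes
 * have already been compared.  The lea therefore computes
 * %r11 - (16 - %rcx), i.e. the count still left to compare; an unsigned
 * wrap (new value > old) or a zero result means the limit fell within the
 * bytes already handled.
 */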

	/*
	 * This implementation uses SSE to compare up to 16 bytes at a time.
	 */
#ifdef USE_AS_STRNCMP
	ENTRY(strncmp)
	test	%rdx, %rdx
	je	LABEL(strcmp_exitz)
	mov	%rdx, %r11
#else
	ENTRY(strcmp)			/* (const char *, const char *) */
#endif
	mov	%esi, %ecx
	mov	%edi, %eax
	and	$0x3f, %rcx		/* rsi alignment in cache line */
	and	$0x3f, %rax		/* rdi alignment in cache line */
	cmp	$0x30, %ecx
	ja	LABEL(crosscache)	/* rsi: 16-byte load will cross cache line */
	cmp	$0x30, %eax
	ja	LABEL(crosscache)	/* rdi: 16-byte load will cross cache line */
	movlpd	(%rdi), %xmm1
	movlpd	(%rsi), %xmm2
	movhpd	8(%rdi), %xmm1
	movhpd	8(%rsi), %xmm2
	pxor	%xmm0, %xmm0		/* clear %xmm0 for null char checks */
	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
	pcmpeqb	%xmm2, %xmm1		/* compare first 16 bytes for equality */
	psubb	%xmm0, %xmm1		/* packed sub of comparison results */
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* if first 16 bytes are same, edx == 0xffff */
	jnz	LABEL(less16bytes)	/* if not, found mismatch or null char */
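	/*
	 * How the mask works: the first pcmpeqb sets %xmm0 to 0xff in every
	 * byte position where the data in %xmm1 is null, the second sets
	 * %xmm1 to 0xff wherever the two chunks match.  psubb leaves the
	 * sign bit set only for matching, non-null bytes, so pmovmskb yields
	 * 0xffff exactly when all 16 bytes match and none is a terminator.
	 */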
#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)	/* finish comparison */
#endif
	add	$16, %rsi		/* prepare to search next 16 bytes */
	add	$16, %rdi		/* prepare to search next 16 bytes */

	/*
	 * Determine the rdi and rsi string offsets from 16-byte alignment.
	 * Use the relative offset difference between the two to determine
	 * which case below to use.
	 */
	.p2align 4
LABEL(crosscache):
	and	$0xfffffffffffffff0, %rsi	/* force %rsi to be 16 byte aligned */
	and	$0xfffffffffffffff0, %rdi	/* force %rdi to be 16 byte aligned */
	mov	$0xffff, %edx			/* mask of all 16 byte positions */
	xor	%r8d, %r8d
	and	$0xf, %ecx			/* offset of rsi */
	and	$0xf, %eax			/* offset of rdi */
	cmp	%eax, %ecx
	je	LABEL(ashr_0)			/* both strings have the same alignment */
	ja	LABEL(bigger)
	mov	%edx, %r8d			/* r8d is offset flag for exit tail */
	xchg	%ecx, %eax
	xchg	%rsi, %rdi
LABEL(bigger):
	mov	%rcx, %r9
	sub	%rax, %r9
	lea	LABEL(unaligned_table)(%rip), %r10
	movslq	(%r10, %r9, 4), %r9
	lea	(%r10, %r9), %r10
	jmp	*%r10				/* jump to corresponding case */
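
	/*
	 * unaligned_table (defined later in this file) holds 4-byte offsets,
	 * relative to the table itself, indexed by the difference between the
	 * two 16-byte offsets; the movslq/lea pair above turns an entry into
	 * the address of the matching ashr_N handler (per the case headers
	 * below, ashr_N handles a difference of 16 - N).  %r8d records whether
	 * the strings were swapped so that the exit tail can account for it.
	 */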

/*
 * ashr_0 handles the following cases:
 *	str1 offset = str2 offset
 */
	.p2align 4
LABEL(ashr_0):
	movdqa	(%rsi), %xmm1
	pxor	%xmm0, %xmm0			/* clear %xmm0 for null char check */
	pcmpeqb	%xmm1, %xmm0			/* Any null chars? */
	pcmpeqb	(%rdi), %xmm1			/* compare 16 bytes for equality */
	psubb	%xmm0, %xmm1			/* packed sub of comparison results */
	pmovmskb %xmm1, %r9d
	shr	%cl, %edx			/* adjust 0xffff for offset */
	shr	%cl, %r9d			/* adjust for 16-byte offset */
	sub	%r9d, %edx
	/*
	 * %edx equals %r9d (and the subtraction gives zero) only if the
	 * (16 - %rcx) bytes up to the end of the aligned block all match
	 * and no null char was seen.
	 */
	jne	LABEL(less32bytes)		/* mismatch or null char */
	UPDATE_STRNCMP_COUNTER
	mov	$16, %rcx
	mov	$16, %r9
	pxor	%xmm0, %xmm0			/* clear xmm0, may have changed above */

	/*
	 * Now both strings are aligned at a 16-byte boundary. Loop over the
	 * strings, checking 32 bytes per iteration.
	 */
	.p2align 4
LABEL(loop_ashr_0):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)		/* mismatch or null char seen */

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif
	add	$16, %rcx
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)
#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif
	add	$16, %rcx
	jmp	LABEL(loop_ashr_0)

/*
 * ashr_1 handles the following cases:
 *	abs(str1 offset - str2 offset) = 15
 */
	.p2align 4
LABEL(ashr_1):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
	pslldq	$15, %xmm2		/* shift first string to align with second */
	pcmpeqb	%xmm1, %xmm2		/* compare 16 bytes for equality */
	psubb	%xmm0, %xmm2		/* packed sub of comparison results */
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx		/* adjust 0xffff for offset */
	shr	%cl, %r9d		/* adjust for 16-byte offset */
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)	/* mismatch or null char seen */
	movdqa	(%rdi), %xmm3
	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx		/* index for loads */
	mov	$1, %r9d		/* rdi bytes already examined. Used in exit code */
	/*
	 * Setting up %r10 this way lets us detect crossing a page boundary.
	 * When %r10 goes positive we are about to cross a page boundary and
	 * need to do a nibble.
	 */
	lea	1(%rdi), %r10
	and	$0xfff, %r10		/* offset into 4K page */
	sub	$0x1000, %r10		/* subtract 4K pagesize */
	movdqa	%xmm3, %xmm4

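	/*
	 * In the loop below %xmm3 carries the previous aligned 16-byte chunk
	 * of %rdi.  Each iteration shifts that chunk right by this case's
	 * byte count and ORs in the freshly loaded chunk shifted left by the
	 * complement, reconstructing %rdi's data at %rsi's alignment without
	 * ever issuing an unaligned load.  The later ashr_N cases do the
	 * same with their own shift counts.
	 */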
	.p2align 4
LABEL(loop_ashr_1):
	add	$16, %r10
	jg	LABEL(nibble_ashr_1)	/* cross page boundary */

LABEL(gobble_ashr_1):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* store for next cycle */

	psrldq	$1, %xmm3
	pslldq	$15, %xmm2
	por	%xmm3, %xmm2		/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif
	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	LABEL(nibble_ashr_1)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* store for next cycle */

	psrldq	$1, %xmm3
	pslldq	$15, %xmm2
	por	%xmm3, %xmm2		/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif
	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	LABEL(loop_ashr_1)

	/*
	 * The nibble path avoids loading from the next page before we know
	 * the string extends into it, so we never touch memory that might
	 * be unmapped.
	 */
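	/*
	 * The nibble compares %rsi's next chunk against the bytes of %rdi
	 * still available in %xmm4 from the current page.  If a mismatch or
	 * a null terminator shows up there, we exit without touching the
	 * next page; otherwise the string provably continues past the
	 * boundary and it is safe to resume the gobble loop there.  Only
	 * 16 - N bytes of the shifted chunk are valid, which is why the
	 * expected mask in nibble_ashr_1 below is 0x7fff rather than 0xffff
	 * (and shrinks further for the later cases).
	 */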
	.p2align 4
LABEL(nibble_ashr_1):
	psrldq	$1, %xmm4
	movdqa	(%rsi, %rcx), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm4, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0x7fff, %edx
	jnz	LABEL(exit)
#ifdef USE_AS_STRNCMP
	cmp	$15, %r11
	jbe	LABEL(strcmp_exitz)
#endif
	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10		/* subtract 4K from %r10 */
	jmp	LABEL(gobble_ashr_1)

/*
 * ashr_2 handles the following cases:
 *	abs(str1 offset - str2 offset) = 14
 */
	.p2align 4
LABEL(ashr_2):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pslldq	$14, %xmm2
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)
	movdqa	(%rdi), %xmm3
	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$2, %r9d	/* rdi bytes already examined. Used in exit code */
	/*
	 * Setting up %r10 this way lets us detect crossing a page boundary.
	 * When %r10 goes positive we are about to cross a page boundary and
	 * need to do a nibble.
	 */
	lea	2(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */
	movdqa	%xmm3, %xmm4

	.p2align 4
LABEL(loop_ashr_2):
	add	$16, %r10
	jg	LABEL(nibble_ashr_2)

LABEL(gobble_ashr_2):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	psrldq	$2, %xmm3
	pslldq	$14, %xmm2
	por	%xmm3, %xmm2

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	LABEL(nibble_ashr_2)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	psrldq	$2, %xmm3
	pslldq	$14, %xmm2
	por	%xmm3, %xmm2

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	LABEL(loop_ashr_2)

	.p2align 4
LABEL(nibble_ashr_2):
	psrldq	$2, %xmm4
	movdqa	(%rsi, %rcx), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm4, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0x3fff, %edx
	jnz	LABEL(exit)
#ifdef USE_AS_STRNCMP
	cmp	$14, %r11
	jbe	LABEL(strcmp_exitz)
#endif
	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10		/* subtract 4K from %r10 */
	jmp	LABEL(gobble_ashr_2)

/*
 * ashr_3 handles the following cases:
 *	abs(str1 offset - str2 offset) = 13
 */
	.p2align 4
LABEL(ashr_3):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pslldq	$13, %xmm2
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$3, %r9d	/* rdi bytes already examined. Used in exit code */
	/*
	 * Setting up %r10 this way lets us detect crossing a page boundary.
	 * When %r10 goes positive we are about to cross a page boundary and
	 * need to do a nibble.
	 */
	lea	3(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */
	movdqa	%xmm3, %xmm4

	.p2align 4
LABEL(loop_ashr_3):
	add	$16, %r10
	jg	LABEL(nibble_ashr_3)

LABEL(gobble_ashr_3):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	psrldq	$3, %xmm3
	pslldq	$13, %xmm2
	por	%xmm3, %xmm2

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	LABEL(nibble_ashr_3)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	psrldq	$3, %xmm3
	pslldq	$13, %xmm2
	por	%xmm3, %xmm2

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	LABEL(loop_ashr_3)

	.p2align 4
LABEL(nibble_ashr_3):
	psrldq	$3, %xmm4
	movdqa	(%rsi, %rcx), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm4, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0x1fff, %edx
	jnz	LABEL(exit)
#ifdef USE_AS_STRNCMP
	cmp	$13, %r11
	jbe	LABEL(strcmp_exitz)
#endif
	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10		/* subtract 4K from %r10 */
	jmp	LABEL(gobble_ashr_3)

/*
 * ashr_4 handles the following cases:
 *	abs(str1 offset - str2 offset) = 12
 */
	.p2align 4
LABEL(ashr_4):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pslldq	$12, %xmm2
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$4, %r9d	/* rdi bytes already examined. Used in exit code */
	/*
	 * Setting up %r10 this way lets us detect crossing a page boundary.
	 * When %r10 goes positive we are about to cross a page boundary and
	 * need to do a nibble.
	 */
	lea	4(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */
	movdqa	%xmm3, %xmm4

	.p2align 4
LABEL(loop_ashr_4):
	add	$16, %r10
	jg	LABEL(nibble_ashr_4)

LABEL(gobble_ashr_4):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	psrldq	$4, %xmm3
	pslldq	$12, %xmm2
	por	%xmm3, %xmm2

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	LABEL(nibble_ashr_4)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	psrldq	$4, %xmm3
	pslldq	$12, %xmm2
	por	%xmm3, %xmm2

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	LABEL(loop_ashr_4)

	.p2align 4
LABEL(nibble_ashr_4):
	psrldq	$4, %xmm4
	movdqa	(%rsi, %rcx), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm4, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0x0fff, %edx
	jnz	LABEL(exit)
#ifdef USE_AS_STRNCMP
	cmp	$12, %r11
	jbe	LABEL(strcmp_exitz)
#endif
	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10		/* subtract 4K from %r10 */
	jmp	LABEL(gobble_ashr_4)

/*
 * ashr_5 handles the following cases:
 *	abs(str1 offset - str2 offset) = 11
 */
	.p2align 4
LABEL(ashr_5):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pslldq	$11, %xmm2
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$5, %r9d	/* rdi bytes already examined. Used in exit code */
	/*
	 * Setting up %r10 this way lets us detect crossing a page boundary.
	 * When %r10 goes positive we are about to cross a page boundary and
	 * need to do a nibble.
	 */
	lea	5(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */
	movdqa	%xmm3, %xmm4

	.p2align 4
LABEL(loop_ashr_5):
	add	$16, %r10
	jg	LABEL(nibble_ashr_5)

LABEL(gobble_ashr_5):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	psrldq	$5, %xmm3
	pslldq	$11, %xmm2
	por	%xmm3, %xmm2

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	LABEL(nibble_ashr_5)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	psrldq	$5, %xmm3
	pslldq	$11, %xmm2
	por	%xmm3, %xmm2

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	LABEL(loop_ashr_5)

	.p2align 4
LABEL(nibble_ashr_5):
	psrldq	$5, %xmm4
	movdqa	(%rsi, %rcx), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm4, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0x07ff, %edx
	jnz	LABEL(exit)
#ifdef USE_AS_STRNCMP
	cmp	$11, %r11
	jbe	LABEL(strcmp_exitz)
#endif
	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10		/* subtract 4K from %r10 */
	jmp	LABEL(gobble_ashr_5)

/*
 * ashr_6 handles the following cases:
 *	abs(str1 offset - str2 offset) = 10
 */
	.p2align 4
LABEL(ashr_6):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pslldq	$10, %xmm2
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$6, %r9d	/* rdi bytes already examined. Used in exit code */
	/*
	 * Setting up %r10 this way lets us detect crossing a page boundary.
	 * When %r10 goes positive we are about to cross a page boundary and
	 * need to do a nibble.
	 */
	lea	6(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */
	movdqa	%xmm3, %xmm4

	.p2align 4
LABEL(loop_ashr_6):
	add	$16, %r10
	jg	LABEL(nibble_ashr_6)

LABEL(gobble_ashr_6):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	psrldq	$6, %xmm3
	pslldq	$10, %xmm2
	por	%xmm3, %xmm2

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	LABEL(nibble_ashr_6)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	psrldq	$6, %xmm3
	pslldq	$10, %xmm2
	por	%xmm3, %xmm2

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	LABEL(loop_ashr_6)

	.p2align 4
LABEL(nibble_ashr_6):
	psrldq	$6, %xmm4
	movdqa	(%rsi, %rcx), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm4, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0x03ff, %edx
	jnz	LABEL(exit)
#ifdef USE_AS_STRNCMP
	cmp	$10, %r11
	jbe	LABEL(strcmp_exitz)
#endif
	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10		/* subtract 4K from %r10 */
	jmp	LABEL(gobble_ashr_6)

/*
 * ashr_7 handles the following cases:
 *	abs(str1 offset - str2 offset) = 9
 */
	.p2align 4
LABEL(ashr_7):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pslldq	$9, %xmm2
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$7, %r9d	/* rdi bytes already examined. Used in exit code */
	/*
	 * Setting up %r10 this way lets us detect crossing a page boundary.
	 * When %r10 goes positive we are about to cross a page boundary and
	 * need to do a nibble.
	 */
	lea	7(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */
	movdqa	%xmm3, %xmm4

	.p2align 4
LABEL(loop_ashr_7):
	add	$16, %r10
	jg	LABEL(nibble_ashr_7)

LABEL(gobble_ashr_7):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	psrldq	$7, %xmm3
	pslldq	$9, %xmm2
	por	%xmm3, %xmm2

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	LABEL(nibble_ashr_7)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	psrldq	$7, %xmm3
	pslldq	$9, %xmm2
	por	%xmm3, %xmm2

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	LABEL(loop_ashr_7)

	.p2align 4
LABEL(nibble_ashr_7):
	psrldq	$7, %xmm4
	movdqa	(%rsi, %rcx), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm4, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0x01ff, %edx
	jnz	LABEL(exit)
#ifdef USE_AS_STRNCMP
	cmp	$9, %r11
	jbe	LABEL(strcmp_exitz)
#endif
	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10		/* subtract 4K from %r10 */
	jmp	LABEL(gobble_ashr_7)

/*
 * ashr_8 handles the following cases:
 *	abs(str1 offset - str2 offset) = 8
 */
	.p2align 4
LABEL(ashr_8):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pslldq	$8, %xmm2
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$8, %r9d	/* rdi bytes already examined. Used in exit code */
	/*
	 * Setting up %r10 this way lets us detect crossing a page boundary.
	 * When %r10 goes positive we are about to cross a page boundary and
	 * need to do a nibble.
	 */
	lea	8(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */
	movdqa	%xmm3, %xmm4

	.p2align 4
LABEL(loop_ashr_8):
	add	$16, %r10
	jg	LABEL(nibble_ashr_8)

LABEL(gobble_ashr_8):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	psrldq	$8, %xmm3
	pslldq	$8, %xmm2
	por	%xmm3, %xmm2

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	LABEL(nibble_ashr_8)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	psrldq	$8, %xmm3
	pslldq	$8, %xmm2
	por	%xmm3, %xmm2

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	LABEL(loop_ashr_8)

	.p2align 4
LABEL(nibble_ashr_8):
	psrldq	$8, %xmm4
	movdqa	(%rsi, %rcx), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm4, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0x00ff, %edx
	jnz	LABEL(exit)
#ifdef USE_AS_STRNCMP
	cmp	$8, %r11
	jbe	LABEL(strcmp_exitz)
#endif
	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10		/* subtract 4K from %r10 */
	jmp	LABEL(gobble_ashr_8)

/*
 * ashr_9 handles the following cases:
 *	abs(str1 offset - str2 offset) = 7
 */
	.p2align 4
LABEL(ashr_9):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pslldq	$7, %xmm2
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$9, %r9d	/* rdi bytes already examined. Used in exit code */
	/*
	 * Setting up %r10 this way lets us detect crossing a page boundary.
	 * When %r10 goes positive we are about to cross a page boundary and
	 * need to do a nibble.
	 */
	lea	9(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */
	movdqa	%xmm3, %xmm4

	.p2align 4
LABEL(loop_ashr_9):
	add	$16, %r10
	jg	LABEL(nibble_ashr_9)

LABEL(gobble_ashr_9):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	psrldq	$9, %xmm3
	pslldq	$7, %xmm2
	por	%xmm3, %xmm2

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	LABEL(nibble_ashr_9)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	psrldq	$9, %xmm3
	pslldq	$7, %xmm2
	por	%xmm3, %xmm2

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3		/* store for next cycle */
	jmp	LABEL(loop_ashr_9)

	.p2align 4
LABEL(nibble_ashr_9):
	psrldq	$9, %xmm4
	movdqa	(%rsi, %rcx), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm4, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0x007f, %edx
	jnz	LABEL(exit)
#ifdef USE_AS_STRNCMP
	cmp	$7, %r11
	jbe	LABEL(strcmp_exitz)
#endif
	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10		/* subtract 4K from %r10 */
	jmp	LABEL(gobble_ashr_9)

/*
 * ashr_10 handles the following cases:
 *	abs(str1 offset - str2 offset) = 6
 */
	.p2align 4
LABEL(ashr_10):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pslldq	$6, %xmm2
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$10, %r9d	/* rdi bytes already examined. Used in exit code */
	/*
	 * Setting up %r10 this way lets us detect crossing a page boundary.
	 * When %r10 goes positive we are about to cross a page boundary and
	 * need to do a nibble.
	 */
	lea	10(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */
	movdqa	%xmm3, %xmm4
1194*5d9d9091SRichard Lowe
1195*5d9d9091SRichard Lowe	.p2align 4
1196*5d9d9091SRichard LoweLABEL(loop_ashr_10):
1197*5d9d9091SRichard Lowe	add	$16, %r10
1198*5d9d9091SRichard Lowe	jg	LABEL(nibble_ashr_10)
1199*5d9d9091SRichard Lowe
1200*5d9d9091SRichard LoweLABEL(gobble_ashr_10):
1201*5d9d9091SRichard Lowe	movdqa	(%rsi, %rcx), %xmm1
1202*5d9d9091SRichard Lowe	movdqa	(%rdi, %rcx), %xmm2
1203*5d9d9091SRichard Lowe	movdqa	%xmm2, %xmm4
1204*5d9d9091SRichard Lowe
1205*5d9d9091SRichard Lowe	psrldq	$10, %xmm3
1206*5d9d9091SRichard Lowe	pslldq	$6, %xmm2
1207*5d9d9091SRichard Lowe	por	%xmm3, %xmm2
1208*5d9d9091SRichard Lowe
1209*5d9d9091SRichard Lowe	pcmpeqb	%xmm1, %xmm0
1210*5d9d9091SRichard Lowe	pcmpeqb	%xmm2, %xmm1
1211*5d9d9091SRichard Lowe	psubb	%xmm0, %xmm1
1212*5d9d9091SRichard Lowe	pmovmskb %xmm1, %edx
1213*5d9d9091SRichard Lowe	sub	$0xffff, %edx
1214*5d9d9091SRichard Lowe	jnz	LABEL(exit)
1215*5d9d9091SRichard Lowe
1216*5d9d9091SRichard Lowe#ifdef USE_AS_STRNCMP
1217*5d9d9091SRichard Lowe	sub	$16, %r11
1218*5d9d9091SRichard Lowe	jbe	LABEL(strcmp_exitz)
1219*5d9d9091SRichard Lowe#endif
1220*5d9d9091SRichard Lowe
1221*5d9d9091SRichard Lowe	add	$16, %rcx
1222*5d9d9091SRichard Lowe	movdqa	%xmm4, %xmm3
1223*5d9d9091SRichard Lowe
1224*5d9d9091SRichard Lowe	add	$16, %r10
1225*5d9d9091SRichard Lowe	jg	LABEL(nibble_ashr_10)	/* cross page boundary */
1226*5d9d9091SRichard Lowe
1227*5d9d9091SRichard Lowe	movdqa	(%rsi, %rcx), %xmm1
1228*5d9d9091SRichard Lowe	movdqa	(%rdi, %rcx), %xmm2
1229*5d9d9091SRichard Lowe	movdqa	%xmm2, %xmm4
1230*5d9d9091SRichard Lowe
1231*5d9d9091SRichard Lowe	psrldq	$10, %xmm3
1232*5d9d9091SRichard Lowe	pslldq 	$6, %xmm2
1233*5d9d9091SRichard Lowe	por	%xmm3, %xmm2
1234*5d9d9091SRichard Lowe
1235*5d9d9091SRichard Lowe	pcmpeqb	%xmm1, %xmm0
1236*5d9d9091SRichard Lowe	pcmpeqb	%xmm2, %xmm1
1237*5d9d9091SRichard Lowe	psubb	%xmm0, %xmm1
1238*5d9d9091SRichard Lowe	pmovmskb %xmm1, %edx
1239*5d9d9091SRichard Lowe	sub	$0xffff, %edx
1240*5d9d9091SRichard Lowe	jnz	LABEL(exit)
1241*5d9d9091SRichard Lowe
1242*5d9d9091SRichard Lowe#ifdef USE_AS_STRNCMP
1243*5d9d9091SRichard Lowe	sub	$16, %r11
1244*5d9d9091SRichard Lowe	jbe	LABEL(strcmp_exitz)
1245*5d9d9091SRichard Lowe#endif
1246*5d9d9091SRichard Lowe
1247*5d9d9091SRichard Lowe	add	$16, %rcx
1248*5d9d9091SRichard Lowe	movdqa	%xmm4, %xmm3
1249*5d9d9091SRichard Lowe	jmp	LABEL(loop_ashr_10)
1250*5d9d9091SRichard Lowe
1251*5d9d9091SRichard Lowe	.p2align 4
1252*5d9d9091SRichard LoweLABEL(nibble_ashr_10):
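	/*
	 * Page-boundary case: only the top 6 bytes of the previously loaded
	 * (%rdi) chunk (moved to the low lanes by the psrldq $10 below) are
	 * known to be readable, so only the low 6 mask bits are checked
	 * (0x003f).  If those byte pairs all match and contain no NUL, the
	 * comparison must continue into the next page, so it is safe to
	 * resume the gobble loop; %r10 is rebiased by 4K for the new page.
	 */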
1253*5d9d9091SRichard Lowe	psrldq	$10, %xmm4
1254*5d9d9091SRichard Lowe	movdqa	(%rsi, %rcx), %xmm1
1255*5d9d9091SRichard Lowe	pcmpeqb	%xmm1, %xmm0
1256*5d9d9091SRichard Lowe	pcmpeqb	%xmm4, %xmm1
1257*5d9d9091SRichard Lowe	psubb	%xmm0, %xmm1
1258*5d9d9091SRichard Lowe	pmovmskb %xmm1, %edx
1259*5d9d9091SRichard Lowe	sub	$0x003f, %edx
1260*5d9d9091SRichard Lowe	jnz	LABEL(exit)
1261*5d9d9091SRichard Lowe#ifdef USE_AS_STRNCMP
1262*5d9d9091SRichard Lowe	cmp	$6, %r11
1263*5d9d9091SRichard Lowe	jbe	LABEL(strcmp_exitz)
1264*5d9d9091SRichard Lowe#endif
1265*5d9d9091SRichard Lowe	pxor	%xmm0, %xmm0
1266*5d9d9091SRichard Lowe	sub	$0x1000, %r10		/* subtract 4K from %r10 */
1267*5d9d9091SRichard Lowe	jmp	LABEL(gobble_ashr_10)
1268*5d9d9091SRichard Lowe
1269*5d9d9091SRichard Lowe/*
1270*5d9d9091SRichard Lowe * ashr_11 handles the following cases:
1271*5d9d9091SRichard Lowe * 	abs(str1 offset - str2 offset) = 5
1272*5d9d9091SRichard Lowe */
1273*5d9d9091SRichard Lowe	.p2align 4
1274*5d9d9091SRichard LoweLABEL(ashr_11):
1275*5d9d9091SRichard Lowe	pxor	%xmm0, %xmm0
1276*5d9d9091SRichard Lowe	movdqa	(%rdi), %xmm2
1277*5d9d9091SRichard Lowe	movdqa	(%rsi), %xmm1
1278*5d9d9091SRichard Lowe	pcmpeqb	%xmm1, %xmm0
1279*5d9d9091SRichard Lowe	pslldq	$5, %xmm2
1280*5d9d9091SRichard Lowe	pcmpeqb	%xmm1, %xmm2
1281*5d9d9091SRichard Lowe	psubb	%xmm0, %xmm2
1282*5d9d9091SRichard Lowe	pmovmskb %xmm2, %r9d
1283*5d9d9091SRichard Lowe	shr	%cl, %edx
1284*5d9d9091SRichard Lowe	shr	%cl, %r9d
1285*5d9d9091SRichard Lowe	sub	%r9d, %edx
1286*5d9d9091SRichard Lowe	jnz	LABEL(less32bytes)
1287*5d9d9091SRichard Lowe	movdqa	(%rdi), %xmm3
1288*5d9d9091SRichard Lowe
1289*5d9d9091SRichard Lowe	UPDATE_STRNCMP_COUNTER
1290*5d9d9091SRichard Lowe
1291*5d9d9091SRichard Lowe	pxor	%xmm0, %xmm0
1292*5d9d9091SRichard Lowe	mov	$16, %rcx	/* index for loads */
1293*5d9d9091SRichard Lowe	mov	$11, %r9d	/* rdi bytes already examined. Used in exit code */
1294*5d9d9091SRichard Lowe	/*
1295*5d9d9091SRichard Lowe	 * Set up %r10 so we can detect when a load from the (%rdi) string
1296*5d9d9091SRichard Lowe	 * would cross into a new page.  When %r10 goes positive we are about
1297*5d9d9091SRichard Lowe	 * to cross a page boundary and must take the nibble_ashr path.
1298*5d9d9091SRichard Lowe	 */
1299*5d9d9091SRichard Lowe	lea	11(%rdi), %r10
1300*5d9d9091SRichard Lowe	and	$0xfff, %r10	/* offset into 4K page */
1301*5d9d9091SRichard Lowe	sub	$0x1000, %r10	/* subtract 4K pagesize */
1302*5d9d9091SRichard Lowe	movdqa	%xmm3, %xmm4
1303*5d9d9091SRichard Lowe
1304*5d9d9091SRichard Lowe	.p2align 4
1305*5d9d9091SRichard LoweLABEL(loop_ashr_11):
1306*5d9d9091SRichard Lowe	add	$16, %r10
1307*5d9d9091SRichard Lowe	jg	LABEL(nibble_ashr_11)
1308*5d9d9091SRichard Lowe
1309*5d9d9091SRichard LoweLABEL(gobble_ashr_11):
1310*5d9d9091SRichard Lowe	movdqa	(%rsi, %rcx), %xmm1
1311*5d9d9091SRichard Lowe	movdqa	(%rdi, %rcx), %xmm2
1312*5d9d9091SRichard Lowe	movdqa	%xmm2, %xmm4
1313*5d9d9091SRichard Lowe
1314*5d9d9091SRichard Lowe	psrldq	$11, %xmm3
1315*5d9d9091SRichard Lowe	pslldq	$5, %xmm2
1316*5d9d9091SRichard Lowe	por	%xmm3, %xmm2
1317*5d9d9091SRichard Lowe
1318*5d9d9091SRichard Lowe	pcmpeqb	%xmm1, %xmm0
1319*5d9d9091SRichard Lowe	pcmpeqb	%xmm2, %xmm1
1320*5d9d9091SRichard Lowe	psubb	%xmm0, %xmm1
1321*5d9d9091SRichard Lowe	pmovmskb %xmm1, %edx
1322*5d9d9091SRichard Lowe	sub	$0xffff, %edx
1323*5d9d9091SRichard Lowe	jnz	LABEL(exit)
1324*5d9d9091SRichard Lowe
1325*5d9d9091SRichard Lowe#ifdef USE_AS_STRNCMP
1326*5d9d9091SRichard Lowe	sub	$16, %r11
1327*5d9d9091SRichard Lowe	jbe	LABEL(strcmp_exitz)
1328*5d9d9091SRichard Lowe#endif
1329*5d9d9091SRichard Lowe
1330*5d9d9091SRichard Lowe	add	$16, %rcx
1331*5d9d9091SRichard Lowe	movdqa	%xmm4, %xmm3
1332*5d9d9091SRichard Lowe
1333*5d9d9091SRichard Lowe	add	$16, %r10
1334*5d9d9091SRichard Lowe	jg	LABEL(nibble_ashr_11)	/* cross page boundary */
1335*5d9d9091SRichard Lowe
1336*5d9d9091SRichard Lowe	movdqa	(%rsi, %rcx), %xmm1
1337*5d9d9091SRichard Lowe	movdqa	(%rdi, %rcx), %xmm2
1338*5d9d9091SRichard Lowe	movdqa	%xmm2, %xmm4
1339*5d9d9091SRichard Lowe
1340*5d9d9091SRichard Lowe	psrldq	$11, %xmm3
1341*5d9d9091SRichard Lowe	pslldq 	$5, %xmm2
1342*5d9d9091SRichard Lowe	por	%xmm3, %xmm2
1343*5d9d9091SRichard Lowe
1344*5d9d9091SRichard Lowe	pcmpeqb	%xmm1, %xmm0
1345*5d9d9091SRichard Lowe	pcmpeqb	%xmm2, %xmm1
1346*5d9d9091SRichard Lowe	psubb	%xmm0, %xmm1
1347*5d9d9091SRichard Lowe	pmovmskb %xmm1, %edx
1348*5d9d9091SRichard Lowe	sub	$0xffff, %edx
1349*5d9d9091SRichard Lowe	jnz	LABEL(exit)
1350*5d9d9091SRichard Lowe
1351*5d9d9091SRichard Lowe#ifdef USE_AS_STRNCMP
1352*5d9d9091SRichard Lowe	sub	$16, %r11
1353*5d9d9091SRichard Lowe	jbe	LABEL(strcmp_exitz)
1354*5d9d9091SRichard Lowe#endif
1355*5d9d9091SRichard Lowe
1356*5d9d9091SRichard Lowe	add	$16, %rcx
1357*5d9d9091SRichard Lowe	movdqa	%xmm4, %xmm3
1358*5d9d9091SRichard Lowe	jmp	LABEL(loop_ashr_11)
1359*5d9d9091SRichard Lowe
1360*5d9d9091SRichard Lowe	.p2align 4
1361*5d9d9091SRichard LoweLABEL(nibble_ashr_11):
1362*5d9d9091SRichard Lowe	psrldq	$11, %xmm4
1363*5d9d9091SRichard Lowe	movdqa	(%rsi, %rcx), %xmm1
1364*5d9d9091SRichard Lowe	pcmpeqb	%xmm1, %xmm0
1365*5d9d9091SRichard Lowe	pcmpeqb	%xmm4, %xmm1
1366*5d9d9091SRichard Lowe	psubb	%xmm0, %xmm1
1367*5d9d9091SRichard Lowe	pmovmskb %xmm1, %edx
1368*5d9d9091SRichard Lowe	sub	$0x001f, %edx
1369*5d9d9091SRichard Lowe	jnz	LABEL(exit)
1370*5d9d9091SRichard Lowe#ifdef USE_AS_STRNCMP
1371*5d9d9091SRichard Lowe	cmp	$5, %r11
1372*5d9d9091SRichard Lowe	jbe	LABEL(strcmp_exitz)
1373*5d9d9091SRichard Lowe#endif
1374*5d9d9091SRichard Lowe	pxor	%xmm0, %xmm0
1375*5d9d9091SRichard Lowe	sub	$0x1000, %r10		/* subtract 4K from %r10 */
1376*5d9d9091SRichard Lowe	jmp	LABEL(gobble_ashr_11)
1377*5d9d9091SRichard Lowe
1378*5d9d9091SRichard Lowe/*
1379*5d9d9091SRichard Lowe * ashr_12 handles the following cases:
1380*5d9d9091SRichard Lowe * 	abs(str1 offset - str2 offset) = 4
1381*5d9d9091SRichard Lowe */
1382*5d9d9091SRichard Lowe	.p2align 4
1383*5d9d9091SRichard LoweLABEL(ashr_12):
1384*5d9d9091SRichard Lowe	pxor	%xmm0, %xmm0
1385*5d9d9091SRichard Lowe	movdqa	(%rdi), %xmm2
1386*5d9d9091SRichard Lowe	movdqa	(%rsi), %xmm1
1387*5d9d9091SRichard Lowe	pcmpeqb	%xmm1, %xmm0
1388*5d9d9091SRichard Lowe	pslldq	$4, %xmm2
1389*5d9d9091SRichard Lowe	pcmpeqb	%xmm1, %xmm2
1390*5d9d9091SRichard Lowe	psubb	%xmm0, %xmm2
1391*5d9d9091SRichard Lowe	pmovmskb %xmm2, %r9d
1392*5d9d9091SRichard Lowe	shr	%cl, %edx
1393*5d9d9091SRichard Lowe	shr	%cl, %r9d
1394*5d9d9091SRichard Lowe	sub	%r9d, %edx
1395*5d9d9091SRichard Lowe	jnz	LABEL(less32bytes)
1396*5d9d9091SRichard Lowe	movdqa	(%rdi), %xmm3
1397*5d9d9091SRichard Lowe
1398*5d9d9091SRichard Lowe	UPDATE_STRNCMP_COUNTER
1399*5d9d9091SRichard Lowe
1400*5d9d9091SRichard Lowe	pxor	%xmm0, %xmm0
1401*5d9d9091SRichard Lowe	mov	$16, %rcx	/* index for loads */
1402*5d9d9091SRichard Lowe	mov	$12, %r9d	/* rdi bytes already examined. Used in exit code */
1403*5d9d9091SRichard Lowe	/*
1404*5d9d9091SRichard Lowe	 * Set up %r10 so we can detect when a load from the (%rdi) string
1405*5d9d9091SRichard Lowe	 * would cross into a new page.  When %r10 goes positive we are about
1406*5d9d9091SRichard Lowe	 * to cross a page boundary and must take the nibble_ashr path.
1407*5d9d9091SRichard Lowe	 */
1408*5d9d9091SRichard Lowe	lea	12(%rdi), %r10
1409*5d9d9091SRichard Lowe	and	$0xfff, %r10	/* offset into 4K page */
1410*5d9d9091SRichard Lowe	sub	$0x1000, %r10	/* subtract 4K pagesize */
1411*5d9d9091SRichard Lowe	movdqa	%xmm3, %xmm4
1412*5d9d9091SRichard Lowe
1413*5d9d9091SRichard Lowe	.p2align 4
1414*5d9d9091SRichard LoweLABEL(loop_ashr_12):
1415*5d9d9091SRichard Lowe	add	$16, %r10
1416*5d9d9091SRichard Lowe	jg	LABEL(nibble_ashr_12)
1417*5d9d9091SRichard Lowe
1418*5d9d9091SRichard LoweLABEL(gobble_ashr_12):
1419*5d9d9091SRichard Lowe	movdqa	(%rsi, %rcx), %xmm1
1420*5d9d9091SRichard Lowe	movdqa	(%rdi, %rcx), %xmm2
1421*5d9d9091SRichard Lowe	movdqa	%xmm2, %xmm4
1422*5d9d9091SRichard Lowe
1423*5d9d9091SRichard Lowe	psrldq	$12, %xmm3
1424*5d9d9091SRichard Lowe	pslldq	$4, %xmm2
1425*5d9d9091SRichard Lowe	por	%xmm3, %xmm2
1426*5d9d9091SRichard Lowe
1427*5d9d9091SRichard Lowe	pcmpeqb	%xmm1, %xmm0
1428*5d9d9091SRichard Lowe	pcmpeqb	%xmm2, %xmm1
1429*5d9d9091SRichard Lowe	psubb	%xmm0, %xmm1
1430*5d9d9091SRichard Lowe	pmovmskb %xmm1, %edx
1431*5d9d9091SRichard Lowe	sub	$0xffff, %edx
1432*5d9d9091SRichard Lowe	jnz	LABEL(exit)
1433*5d9d9091SRichard Lowe
1434*5d9d9091SRichard Lowe#ifdef USE_AS_STRNCMP
1435*5d9d9091SRichard Lowe	sub	$16, %r11
1436*5d9d9091SRichard Lowe	jbe	LABEL(strcmp_exitz)
1437*5d9d9091SRichard Lowe#endif
1438*5d9d9091SRichard Lowe
1439*5d9d9091SRichard Lowe	add	$16, %rcx
1440*5d9d9091SRichard Lowe	movdqa	%xmm4, %xmm3
1441*5d9d9091SRichard Lowe
1442*5d9d9091SRichard Lowe	add	$16, %r10
1443*5d9d9091SRichard Lowe	jg	LABEL(nibble_ashr_12)	/* cross page boundary */
1444*5d9d9091SRichard Lowe
1445*5d9d9091SRichard Lowe	movdqa	(%rsi, %rcx), %xmm1
1446*5d9d9091SRichard Lowe	movdqa	(%rdi, %rcx), %xmm2
1447*5d9d9091SRichard Lowe	movdqa	%xmm2, %xmm4
1448*5d9d9091SRichard Lowe
1449*5d9d9091SRichard Lowe	psrldq	$12, %xmm3
1450*5d9d9091SRichard Lowe	pslldq 	$4, %xmm2
1451*5d9d9091SRichard Lowe	por	%xmm3, %xmm2
1452*5d9d9091SRichard Lowe
1453*5d9d9091SRichard Lowe	pcmpeqb	%xmm1, %xmm0
1454*5d9d9091SRichard Lowe	pcmpeqb	%xmm2, %xmm1
1455*5d9d9091SRichard Lowe	psubb	%xmm0, %xmm1
1456*5d9d9091SRichard Lowe	pmovmskb %xmm1, %edx
1457*5d9d9091SRichard Lowe	sub	$0xffff, %edx
1458*5d9d9091SRichard Lowe	jnz	LABEL(exit)
1459*5d9d9091SRichard Lowe
1460*5d9d9091SRichard Lowe#ifdef USE_AS_STRNCMP
1461*5d9d9091SRichard Lowe	sub	$16, %r11
1462*5d9d9091SRichard Lowe	jbe	LABEL(strcmp_exitz)
1463*5d9d9091SRichard Lowe#endif
1464*5d9d9091SRichard Lowe
1465*5d9d9091SRichard Lowe	add	$16, %rcx
1466*5d9d9091SRichard Lowe	movdqa	%xmm4, %xmm3
1467*5d9d9091SRichard Lowe	jmp	LABEL(loop_ashr_12)
1468*5d9d9091SRichard Lowe
1469*5d9d9091SRichard Lowe	.p2align 4
1470*5d9d9091SRichard LoweLABEL(nibble_ashr_12):
1471*5d9d9091SRichard Lowe	psrldq	$12, %xmm4
1472*5d9d9091SRichard Lowe	movdqa	(%rsi, %rcx), %xmm1
1473*5d9d9091SRichard Lowe	pcmpeqb	%xmm1, %xmm0
1474*5d9d9091SRichard Lowe	pcmpeqb	%xmm4, %xmm1
1475*5d9d9091SRichard Lowe	psubb	%xmm0, %xmm1
1476*5d9d9091SRichard Lowe	pmovmskb %xmm1, %edx
1477*5d9d9091SRichard Lowe	sub	$0x000f, %edx
1478*5d9d9091SRichard Lowe	jnz	LABEL(exit)
1479*5d9d9091SRichard Lowe#ifdef USE_AS_STRNCMP
1480*5d9d9091SRichard Lowe	cmp	$4, %r11
1481*5d9d9091SRichard Lowe	jbe	LABEL(strcmp_exitz)
1482*5d9d9091SRichard Lowe#endif
1483*5d9d9091SRichard Lowe	pxor	%xmm0, %xmm0
1484*5d9d9091SRichard Lowe	sub	$0x1000, %r10		/* subtract 4K from %r10 */
1485*5d9d9091SRichard Lowe	jmp	LABEL(gobble_ashr_12)
1486*5d9d9091SRichard Lowe
1487*5d9d9091SRichard Lowe/*
1488*5d9d9091SRichard Lowe * ashr_13 handles the following cases:
1489*5d9d9091SRichard Lowe * 	abs(str1 offset - str2 offset) = 3
1490*5d9d9091SRichard Lowe */
1491*5d9d9091SRichard Lowe	.p2align 4
1492*5d9d9091SRichard LoweLABEL(ashr_13):
1493*5d9d9091SRichard Lowe	pxor	%xmm0, %xmm0
1494*5d9d9091SRichard Lowe	movdqa	(%rdi), %xmm2
1495*5d9d9091SRichard Lowe	movdqa	(%rsi), %xmm1
1496*5d9d9091SRichard Lowe	pcmpeqb	%xmm1, %xmm0
1497*5d9d9091SRichard Lowe	pslldq	$3, %xmm2
1498*5d9d9091SRichard Lowe	pcmpeqb	%xmm1, %xmm2
1499*5d9d9091SRichard Lowe	psubb	%xmm0, %xmm2
1500*5d9d9091SRichard Lowe	pmovmskb %xmm2, %r9d
1501*5d9d9091SRichard Lowe	shr	%cl, %edx
1502*5d9d9091SRichard Lowe	shr	%cl, %r9d
1503*5d9d9091SRichard Lowe	sub	%r9d, %edx
1504*5d9d9091SRichard Lowe	jnz	LABEL(less32bytes)
1505*5d9d9091SRichard Lowe	movdqa	(%rdi), %xmm3
1506*5d9d9091SRichard Lowe
1507*5d9d9091SRichard Lowe	UPDATE_STRNCMP_COUNTER
1508*5d9d9091SRichard Lowe
1509*5d9d9091SRichard Lowe	pxor	%xmm0, %xmm0
1510*5d9d9091SRichard Lowe	mov	$16, %rcx	/* index for loads */
1511*5d9d9091SRichard Lowe	mov	$13, %r9d	/* rdi bytes already examined. Used in exit code */
1512*5d9d9091SRichard Lowe	/*
1513*5d9d9091SRichard Lowe	 * Set up %r10 so we can detect when a load from the (%rdi) string
1514*5d9d9091SRichard Lowe	 * would cross into a new page.  When %r10 goes positive we are about
1515*5d9d9091SRichard Lowe	 * to cross a page boundary and must take the nibble_ashr path.
1516*5d9d9091SRichard Lowe	 */
1517*5d9d9091SRichard Lowe	lea	13(%rdi), %r10
1518*5d9d9091SRichard Lowe	and	$0xfff, %r10	/* offset into 4K page */
1519*5d9d9091SRichard Lowe	sub	$0x1000, %r10	/* subtract 4K pagesize */
1520*5d9d9091SRichard Lowe	movdqa	%xmm3, %xmm4
1521*5d9d9091SRichard Lowe
1522*5d9d9091SRichard Lowe	.p2align 4
1523*5d9d9091SRichard LoweLABEL(loop_ashr_13):
1524*5d9d9091SRichard Lowe	add	$16, %r10
1525*5d9d9091SRichard Lowe	jg	LABEL(nibble_ashr_13)
1526*5d9d9091SRichard Lowe
1527*5d9d9091SRichard LoweLABEL(gobble_ashr_13):
1528*5d9d9091SRichard Lowe	movdqa	(%rsi, %rcx), %xmm1
1529*5d9d9091SRichard Lowe	movdqa	(%rdi, %rcx), %xmm2
1530*5d9d9091SRichard Lowe	movdqa	%xmm2, %xmm4
1531*5d9d9091SRichard Lowe
1532*5d9d9091SRichard Lowe	psrldq	$13, %xmm3
1533*5d9d9091SRichard Lowe	pslldq	$3, %xmm2
1534*5d9d9091SRichard Lowe	por	%xmm3, %xmm2
1535*5d9d9091SRichard Lowe
1536*5d9d9091SRichard Lowe	pcmpeqb	%xmm1, %xmm0
1537*5d9d9091SRichard Lowe	pcmpeqb	%xmm2, %xmm1
1538*5d9d9091SRichard Lowe	psubb	%xmm0, %xmm1
1539*5d9d9091SRichard Lowe	pmovmskb %xmm1, %edx
1540*5d9d9091SRichard Lowe	sub	$0xffff, %edx
1541*5d9d9091SRichard Lowe	jnz	LABEL(exit)
1542*5d9d9091SRichard Lowe
1543*5d9d9091SRichard Lowe#ifdef USE_AS_STRNCMP
1544*5d9d9091SRichard Lowe	sub	$16, %r11
1545*5d9d9091SRichard Lowe	jbe	LABEL(strcmp_exitz)
1546*5d9d9091SRichard Lowe#endif
1547*5d9d9091SRichard Lowe
1548*5d9d9091SRichard Lowe	add	$16, %rcx
1549*5d9d9091SRichard Lowe	movdqa	%xmm4, %xmm3
1550*5d9d9091SRichard Lowe
1551*5d9d9091SRichard Lowe	add	$16, %r10
1552*5d9d9091SRichard Lowe	jg	LABEL(nibble_ashr_13)	/* cross page boundary */
1553*5d9d9091SRichard Lowe
1554*5d9d9091SRichard Lowe	movdqa	(%rsi, %rcx), %xmm1
1555*5d9d9091SRichard Lowe	movdqa	(%rdi, %rcx), %xmm2
1556*5d9d9091SRichard Lowe	movdqa	%xmm2, %xmm4
1557*5d9d9091SRichard Lowe
1558*5d9d9091SRichard Lowe	psrldq	$13, %xmm3
1559*5d9d9091SRichard Lowe	pslldq 	$3, %xmm2
1560*5d9d9091SRichard Lowe	por	%xmm3, %xmm2
1561*5d9d9091SRichard Lowe
1562*5d9d9091SRichard Lowe	pcmpeqb	%xmm1, %xmm0
1563*5d9d9091SRichard Lowe	pcmpeqb	%xmm2, %xmm1
1564*5d9d9091SRichard Lowe	psubb	%xmm0, %xmm1
1565*5d9d9091SRichard Lowe	pmovmskb %xmm1, %edx
1566*5d9d9091SRichard Lowe	sub	$0xffff, %edx
1567*5d9d9091SRichard Lowe	jnz	LABEL(exit)
1568*5d9d9091SRichard Lowe
1569*5d9d9091SRichard Lowe#ifdef USE_AS_STRNCMP
1570*5d9d9091SRichard Lowe	sub	$16, %r11
1571*5d9d9091SRichard Lowe	jbe	LABEL(strcmp_exitz)
1572*5d9d9091SRichard Lowe#endif
1573*5d9d9091SRichard Lowe
1574*5d9d9091SRichard Lowe	add	$16, %rcx
1575*5d9d9091SRichard Lowe	movdqa	%xmm4, %xmm3
1576*5d9d9091SRichard Lowe	jmp	LABEL(loop_ashr_13)
1577*5d9d9091SRichard Lowe
1578*5d9d9091SRichard Lowe	.p2align 4
1579*5d9d9091SRichard LoweLABEL(nibble_ashr_13):
1580*5d9d9091SRichard Lowe	psrldq	$13, %xmm4
1581*5d9d9091SRichard Lowe	movdqa	(%rsi, %rcx), %xmm1
1582*5d9d9091SRichard Lowe	pcmpeqb	%xmm1, %xmm0
1583*5d9d9091SRichard Lowe	pcmpeqb	%xmm4, %xmm1
1584*5d9d9091SRichard Lowe	psubb	%xmm0, %xmm1
1585*5d9d9091SRichard Lowe	pmovmskb %xmm1, %edx
1586*5d9d9091SRichard Lowe	sub	$0x0007, %edx
1587*5d9d9091SRichard Lowe	jnz	LABEL(exit)
1588*5d9d9091SRichard Lowe#ifdef USE_AS_STRNCMP
1589*5d9d9091SRichard Lowe	cmp	$3, %r11
1590*5d9d9091SRichard Lowe	jbe	LABEL(strcmp_exitz)
1591*5d9d9091SRichard Lowe#endif
1592*5d9d9091SRichard Lowe	pxor	%xmm0, %xmm0
1593*5d9d9091SRichard Lowe	sub	$0x1000, %r10		/* subtract 4K from %r10 */
1594*5d9d9091SRichard Lowe	jmp	LABEL(gobble_ashr_13)
1595*5d9d9091SRichard Lowe
1596*5d9d9091SRichard Lowe/*
1597*5d9d9091SRichard Lowe * ashr_14 handles the following cases:
1598*5d9d9091SRichard Lowe * 	abs(str1 offset - str2 offset) = 2
1599*5d9d9091SRichard Lowe */
1600*5d9d9091SRichard Lowe	.p2align 4
1601*5d9d9091SRichard LoweLABEL(ashr_14):
1602*5d9d9091SRichard Lowe	pxor	%xmm0, %xmm0
1603*5d9d9091SRichard Lowe	movdqa	(%rdi), %xmm2
1604*5d9d9091SRichard Lowe	movdqa	(%rsi), %xmm1
1605*5d9d9091SRichard Lowe	pcmpeqb	%xmm1, %xmm0
1606*5d9d9091SRichard Lowe	pslldq  $2, %xmm2
1607*5d9d9091SRichard Lowe	pcmpeqb	%xmm1, %xmm2
1608*5d9d9091SRichard Lowe	psubb	%xmm0, %xmm2
1609*5d9d9091SRichard Lowe	pmovmskb %xmm2, %r9d
1610*5d9d9091SRichard Lowe	shr	%cl, %edx
1611*5d9d9091SRichard Lowe	shr	%cl, %r9d
1612*5d9d9091SRichard Lowe	sub	%r9d, %edx
1613*5d9d9091SRichard Lowe	jnz	LABEL(less32bytes)
1614*5d9d9091SRichard Lowe	movdqa	(%rdi), %xmm3
1615*5d9d9091SRichard Lowe
1616*5d9d9091SRichard Lowe	UPDATE_STRNCMP_COUNTER
1617*5d9d9091SRichard Lowe
1618*5d9d9091SRichard Lowe	pxor	%xmm0, %xmm0
1619*5d9d9091SRichard Lowe	mov	$16, %rcx	/* index for loads */
1620*5d9d9091SRichard Lowe	mov	$14, %r9d	/* rdi bytes already examined. Used in exit code */
1621*5d9d9091SRichard Lowe	/*
1622*5d9d9091SRichard Lowe	 * Set up %r10 so we can detect when a load from the (%rdi) string
1623*5d9d9091SRichard Lowe	 * would cross into a new page.  When %r10 goes positive we are about
1624*5d9d9091SRichard Lowe	 * to cross a page boundary and must take the nibble_ashr path.
1625*5d9d9091SRichard Lowe	 */
1626*5d9d9091SRichard Lowe	lea	14(%rdi), %r10
1627*5d9d9091SRichard Lowe	and	$0xfff, %r10	/* offset into 4K page */
1628*5d9d9091SRichard Lowe	sub	$0x1000, %r10	/* subtract 4K pagesize */
1629*5d9d9091SRichard Lowe	movdqa	%xmm3, %xmm4
1630*5d9d9091SRichard Lowe
1631*5d9d9091SRichard Lowe	.p2align 4
1632*5d9d9091SRichard LoweLABEL(loop_ashr_14):
1633*5d9d9091SRichard Lowe	add	$16, %r10
1634*5d9d9091SRichard Lowe	jg	LABEL(nibble_ashr_14)
1635*5d9d9091SRichard Lowe
1636*5d9d9091SRichard LoweLABEL(gobble_ashr_14):
1637*5d9d9091SRichard Lowe	movdqa	(%rsi, %rcx), %xmm1
1638*5d9d9091SRichard Lowe	movdqa	(%rdi, %rcx), %xmm2
1639*5d9d9091SRichard Lowe	movdqa	%xmm2, %xmm4
1640*5d9d9091SRichard Lowe
1641*5d9d9091SRichard Lowe	psrldq	$14, %xmm3
1642*5d9d9091SRichard Lowe	pslldq	$2, %xmm2
1643*5d9d9091SRichard Lowe	por	%xmm3, %xmm2
1644*5d9d9091SRichard Lowe
1645*5d9d9091SRichard Lowe	pcmpeqb	%xmm1, %xmm0
1646*5d9d9091SRichard Lowe	pcmpeqb	%xmm2, %xmm1
1647*5d9d9091SRichard Lowe	psubb	%xmm0, %xmm1
1648*5d9d9091SRichard Lowe	pmovmskb %xmm1, %edx
1649*5d9d9091SRichard Lowe	sub	$0xffff, %edx
1650*5d9d9091SRichard Lowe	jnz	LABEL(exit)
1651*5d9d9091SRichard Lowe
1652*5d9d9091SRichard Lowe#ifdef USE_AS_STRNCMP
1653*5d9d9091SRichard Lowe	sub	$16, %r11
1654*5d9d9091SRichard Lowe	jbe	LABEL(strcmp_exitz)
1655*5d9d9091SRichard Lowe#endif
1656*5d9d9091SRichard Lowe
1657*5d9d9091SRichard Lowe	add	$16, %rcx
1658*5d9d9091SRichard Lowe	movdqa	%xmm4, %xmm3
1659*5d9d9091SRichard Lowe
1660*5d9d9091SRichard Lowe	add	$16, %r10
1661*5d9d9091SRichard Lowe	jg	LABEL(nibble_ashr_14)	/* cross page boundary */
1662*5d9d9091SRichard Lowe
1663*5d9d9091SRichard Lowe	movdqa	(%rsi, %rcx), %xmm1
1664*5d9d9091SRichard Lowe	movdqa	(%rdi, %rcx), %xmm2
1665*5d9d9091SRichard Lowe	movdqa	%xmm2, %xmm4
1666*5d9d9091SRichard Lowe
1667*5d9d9091SRichard Lowe	psrldq	$14, %xmm3
1668*5d9d9091SRichard Lowe	pslldq 	$2, %xmm2
1669*5d9d9091SRichard Lowe	por	%xmm3, %xmm2
1670*5d9d9091SRichard Lowe
1671*5d9d9091SRichard Lowe	pcmpeqb	%xmm1, %xmm0
1672*5d9d9091SRichard Lowe	pcmpeqb	%xmm2, %xmm1
1673*5d9d9091SRichard Lowe	psubb	%xmm0, %xmm1
1674*5d9d9091SRichard Lowe	pmovmskb %xmm1, %edx
1675*5d9d9091SRichard Lowe	sub	$0xffff, %edx
1676*5d9d9091SRichard Lowe	jnz	LABEL(exit)
1677*5d9d9091SRichard Lowe
1678*5d9d9091SRichard Lowe#ifdef USE_AS_STRNCMP
1679*5d9d9091SRichard Lowe	sub	$16, %r11
1680*5d9d9091SRichard Lowe	jbe	LABEL(strcmp_exitz)
1681*5d9d9091SRichard Lowe#endif
1682*5d9d9091SRichard Lowe
1683*5d9d9091SRichard Lowe	add	$16, %rcx
1684*5d9d9091SRichard Lowe	movdqa	%xmm4, %xmm3
1685*5d9d9091SRichard Lowe	jmp	LABEL(loop_ashr_14)
1686*5d9d9091SRichard Lowe
1687*5d9d9091SRichard Lowe	.p2align 4
1688*5d9d9091SRichard LoweLABEL(nibble_ashr_14):
1689*5d9d9091SRichard Lowe	psrldq	$14, %xmm4
1690*5d9d9091SRichard Lowe	movdqa	(%rsi, %rcx), %xmm1
1691*5d9d9091SRichard Lowe	pcmpeqb	%xmm1, %xmm0
1692*5d9d9091SRichard Lowe	pcmpeqb	%xmm4, %xmm1
1693*5d9d9091SRichard Lowe	psubb	%xmm0, %xmm1
1694*5d9d9091SRichard Lowe	pmovmskb %xmm1, %edx
1695*5d9d9091SRichard Lowe	sub	$0x0003, %edx
1696*5d9d9091SRichard Lowe	jnz	LABEL(exit)
1697*5d9d9091SRichard Lowe#ifdef USE_AS_STRNCMP
1698*5d9d9091SRichard Lowe	cmp	$2, %r11
1699*5d9d9091SRichard Lowe	jbe	LABEL(strcmp_exitz)
1700*5d9d9091SRichard Lowe#endif
1701*5d9d9091SRichard Lowe	pxor	%xmm0, %xmm0
1702*5d9d9091SRichard Lowe	sub	$0x1000, %r10		/* subtract 4K from %r10 */
1703*5d9d9091SRichard Lowe	jmp	LABEL(gobble_ashr_14)
1704*5d9d9091SRichard Lowe
1705*5d9d9091SRichard Lowe/*
1706*5d9d9091SRichard Lowe * ashr_15 handles the following cases:
1707*5d9d9091SRichard Lowe * 	abs(str1 offset - str2 offset) = 1
1708*5d9d9091SRichard Lowe */
1709*5d9d9091SRichard Lowe	.p2align 4
1710*5d9d9091SRichard LoweLABEL(ashr_15):
1711*5d9d9091SRichard Lowe	pxor	%xmm0, %xmm0
1712*5d9d9091SRichard Lowe	movdqa	(%rdi), %xmm2
1713*5d9d9091SRichard Lowe	movdqa	(%rsi), %xmm1
1714*5d9d9091SRichard Lowe	pcmpeqb	%xmm1, %xmm0
1715*5d9d9091SRichard Lowe	pslldq	$1, %xmm2
1716*5d9d9091SRichard Lowe	pcmpeqb	%xmm1, %xmm2
1717*5d9d9091SRichard Lowe	psubb	%xmm0, %xmm2
1718*5d9d9091SRichard Lowe	pmovmskb %xmm2, %r9d
1719*5d9d9091SRichard Lowe	shr	%cl, %edx
1720*5d9d9091SRichard Lowe	shr	%cl, %r9d
1721*5d9d9091SRichard Lowe	sub	%r9d, %edx
1722*5d9d9091SRichard Lowe	jnz	LABEL(less32bytes)
1723*5d9d9091SRichard Lowe
1724*5d9d9091SRichard Lowe	movdqa	(%rdi), %xmm3
1725*5d9d9091SRichard Lowe
1726*5d9d9091SRichard Lowe	UPDATE_STRNCMP_COUNTER
1727*5d9d9091SRichard Lowe
1728*5d9d9091SRichard Lowe	pxor	%xmm0, %xmm0
1729*5d9d9091SRichard Lowe	mov	$16, %rcx	/* index for loads */
1730*5d9d9091SRichard Lowe	mov	$15, %r9d	/* rdi bytes already examined. Used in exit code */
1731*5d9d9091SRichard Lowe	/*
1732*5d9d9091SRichard Lowe	 * Set up %r10 so we can detect when a load from the (%rdi) string
1733*5d9d9091SRichard Lowe	 * would cross into a new page.  When %r10 goes positive we are about
1734*5d9d9091SRichard Lowe	 * to cross a page boundary and must take the nibble_ashr path.
1735*5d9d9091SRichard Lowe	 */
1736*5d9d9091SRichard Lowe	lea	15(%rdi), %r10
1737*5d9d9091SRichard Lowe	and	$0xfff, %r10	/* offset into 4K page */
1738*5d9d9091SRichard Lowe	sub	$0x1000, %r10	/* subtract 4K pagesize */
1739*5d9d9091SRichard Lowe	movdqa	%xmm3, %xmm4
1740*5d9d9091SRichard Lowe
1741*5d9d9091SRichard Lowe	.p2align 4
1742*5d9d9091SRichard LoweLABEL(loop_ashr_15):
1743*5d9d9091SRichard Lowe	add	$16, %r10
1744*5d9d9091SRichard Lowe	jg	LABEL(nibble_ashr_15)
1745*5d9d9091SRichard Lowe
1746*5d9d9091SRichard LoweLABEL(gobble_ashr_15):
1747*5d9d9091SRichard Lowe	movdqa	(%rsi, %rcx), %xmm1
1748*5d9d9091SRichard Lowe	movdqa	(%rdi, %rcx), %xmm2
1749*5d9d9091SRichard Lowe	movdqa	%xmm2, %xmm4
1750*5d9d9091SRichard Lowe
1751*5d9d9091SRichard Lowe	psrldq	$15, %xmm3
1752*5d9d9091SRichard Lowe	pslldq	$1, %xmm2
1753*5d9d9091SRichard Lowe	por	%xmm3, %xmm2
1754*5d9d9091SRichard Lowe
1755*5d9d9091SRichard Lowe	pcmpeqb	%xmm1, %xmm0
1756*5d9d9091SRichard Lowe	pcmpeqb	%xmm2, %xmm1
1757*5d9d9091SRichard Lowe	psubb	%xmm0, %xmm1
1758*5d9d9091SRichard Lowe	pmovmskb %xmm1, %edx
1759*5d9d9091SRichard Lowe	sub	$0xffff, %edx
1760*5d9d9091SRichard Lowe	jnz	LABEL(exit)
1761*5d9d9091SRichard Lowe
1762*5d9d9091SRichard Lowe#ifdef USE_AS_STRNCMP
1763*5d9d9091SRichard Lowe	sub	$16, %r11
1764*5d9d9091SRichard Lowe	jbe	LABEL(strcmp_exitz)
1765*5d9d9091SRichard Lowe#endif
1766*5d9d9091SRichard Lowe
1767*5d9d9091SRichard Lowe	add	$16, %rcx
1768*5d9d9091SRichard Lowe	movdqa	%xmm4, %xmm3
1769*5d9d9091SRichard Lowe
1770*5d9d9091SRichard Lowe	add	$16, %r10
1771*5d9d9091SRichard Lowe	jg	LABEL(nibble_ashr_15)	/* cross page boundary */
1772*5d9d9091SRichard Lowe
1773*5d9d9091SRichard Lowe	movdqa	(%rsi, %rcx), %xmm1
1774*5d9d9091SRichard Lowe	movdqa	(%rdi, %rcx), %xmm2
1775*5d9d9091SRichard Lowe	movdqa	%xmm2, %xmm4
1776*5d9d9091SRichard Lowe
1777*5d9d9091SRichard Lowe	psrldq	$15, %xmm3
1778*5d9d9091SRichard Lowe	pslldq 	$1, %xmm2
1779*5d9d9091SRichard Lowe	por	%xmm3, %xmm2
1780*5d9d9091SRichard Lowe
1781*5d9d9091SRichard Lowe	pcmpeqb	%xmm1, %xmm0
1782*5d9d9091SRichard Lowe	pcmpeqb	%xmm2, %xmm1
1783*5d9d9091SRichard Lowe	psubb	%xmm0, %xmm1
1784*5d9d9091SRichard Lowe	pmovmskb %xmm1, %edx
1785*5d9d9091SRichard Lowe	sub	$0xffff, %edx
1786*5d9d9091SRichard Lowe	jnz	LABEL(exit)
1787*5d9d9091SRichard Lowe
1788*5d9d9091SRichard Lowe#ifdef USE_AS_STRNCMP
1789*5d9d9091SRichard Lowe	sub	$16, %r11
1790*5d9d9091SRichard Lowe	jbe	LABEL(strcmp_exitz)
1791*5d9d9091SRichard Lowe#endif
1792*5d9d9091SRichard Lowe
1793*5d9d9091SRichard Lowe	add	$16, %rcx
1794*5d9d9091SRichard Lowe	movdqa	%xmm4, %xmm3
1795*5d9d9091SRichard Lowe	jmp	LABEL(loop_ashr_15)
1796*5d9d9091SRichard Lowe
1797*5d9d9091SRichard Lowe	.p2align 4
1798*5d9d9091SRichard LoweLABEL(nibble_ashr_15):
1799*5d9d9091SRichard Lowe	psrldq	$15, %xmm4
1800*5d9d9091SRichard Lowe	movdqa	(%rsi, %rcx), %xmm1
1801*5d9d9091SRichard Lowe	pcmpeqb	%xmm1, %xmm0
1802*5d9d9091SRichard Lowe	pcmpeqb	%xmm4, %xmm1
1803*5d9d9091SRichard Lowe	psubb	%xmm0, %xmm1
1804*5d9d9091SRichard Lowe	pmovmskb %xmm1, %edx
1805*5d9d9091SRichard Lowe	sub	$0x0001, %edx
1806*5d9d9091SRichard Lowe	jnz	LABEL(exit)
1807*5d9d9091SRichard Lowe#ifdef USE_AS_STRNCMP
1808*5d9d9091SRichard Lowe	cmp	$1, %r11
1809*5d9d9091SRichard Lowe	jbe	LABEL(strcmp_exitz)
1810*5d9d9091SRichard Lowe#endif
1811*5d9d9091SRichard Lowe	pxor	%xmm0, %xmm0
1812*5d9d9091SRichard Lowe	sub	$0x1000, %r10		/* subtract 4K from %r10 */
1813*5d9d9091SRichard Lowe	jmp	LABEL(gobble_ashr_15)
1814*5d9d9091SRichard Lowe
1815*5d9d9091SRichard Lowe	.p2align 4
1816*5d9d9091SRichard LoweLABEL(exit):
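	/*
	 * %rcx still indexes the 16-byte window in which the difference was
	 * found (it started at 16), and %r9 holds the number of (%rdi) bytes
	 * handled before the gobble loop, so %r9 + %rcx - 16 is the
	 * %rdi-relative offset of that window.
	 */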
1817*5d9d9091SRichard Lowe	lea	-16(%r9, %rcx), %rax	/* locate the exact offset for rdi */
1818*5d9d9091SRichard LoweLABEL(less32bytes):
1819*5d9d9091SRichard Lowe	lea	(%rdi, %rax), %rdi	/* locate the exact address for first operand(rdi) */
1820*5d9d9091SRichard Lowe	lea	(%rsi, %rcx), %rsi	/* locate the exact address for second operand(rsi) */
1821*5d9d9091SRichard Lowe	test	%r8d, %r8d
1822*5d9d9091SRichard Lowe	jz	LABEL(ret)
1823*5d9d9091SRichard Lowe	xchg	%rsi, %rdi		/* recover original order according to flag(%r8d) */
1824*5d9d9091SRichard Lowe
1825*5d9d9091SRichard Lowe	.p2align 4
1826*5d9d9091SRichard LoweLABEL(ret):
1827*5d9d9091SRichard LoweLABEL(less16bytes):
1828*5d9d9091SRichard Lowe	/*
1829*5d9d9091SRichard Lowe	 * Check to see if BSF is fast on this processor. If not, use a different
1830*5d9d9091SRichard Lowe	 * exit tail.
1831*5d9d9091SRichard Lowe	 */
1832*5d9d9091SRichard Lowe	testl	$USE_BSF,.memops_method(%rip)
1833*5d9d9091SRichard Lowe	jz	LABEL(AMD_exit)
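	/*
	 * At this point the lowest set bit of %rdx gives the index, within
	 * the current 16-byte window, of the first byte pair that differed
	 * or hit the terminating NUL; bsf extracts that index directly.
	 */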
1834*5d9d9091SRichard Lowe	bsf	%rdx, %rdx		/* find and store bit index in %rdx */
1835*5d9d9091SRichard Lowe
1836*5d9d9091SRichard Lowe#ifdef USE_AS_STRNCMP
1837*5d9d9091SRichard Lowe	sub	%rdx, %r11
1838*5d9d9091SRichard Lowe	jbe	LABEL(strcmp_exitz)
1839*5d9d9091SRichard Lowe#endif
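	/*
	 * Return the difference of the two bytes read as unsigned values
	 * (the zeroed %ecx/%eax plus the byte loads give zero-extension),
	 * which has the sign strcmp is specified to return.
	 */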
1840*5d9d9091SRichard Lowe	xor	%ecx, %ecx		/* clear %ecx */
1841*5d9d9091SRichard Lowe	xor	%eax, %eax		/* clear %eax */
1842*5d9d9091SRichard Lowe
1843*5d9d9091SRichard Lowe	movb	(%rsi, %rdx), %cl
1844*5d9d9091SRichard Lowe	movb	(%rdi, %rdx), %al
1845*5d9d9091SRichard Lowe
1846*5d9d9091SRichard Lowe	sub	%ecx, %eax
1847*5d9d9091SRichard Lowe	ret
1848*5d9d9091SRichard Lowe
1849*5d9d9091SRichard Lowe#ifdef USE_AS_STRNCMP
1850*5d9d9091SRichard LoweLABEL(strcmp_exitz):
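	/*
	 * The strncmp length limit was exhausted before a difference was
	 * found within it, so the strings compare equal.
	 */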
1851*5d9d9091SRichard Lowe	xor	%eax, %eax
1852*5d9d9091SRichard Lowe	ret
1853*5d9d9091SRichard Lowe#endif
1854*5d9d9091SRichard Lowe
1855*5d9d9091SRichard Lowe	/*
1856*5d9d9091SRichard Lowe	 * This exit tail does not use the bsf instruction.
1857*5d9d9091SRichard Lowe	 */
1858*5d9d9091SRichard Lowe	.p2align 4
1859*5d9d9091SRichard LoweLABEL(AMD_exit):
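	/*
	 * Scan the mismatch mask one bit at a time instead of using bsf.
	 * %dl covers bytes 0-7 of the window; if it is clear, the first
	 * difference is in bytes 8-15 and next_8_bytes runs the same ladder
	 * on %dh after advancing both pointers by 8.
	 */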
1860*5d9d9091SRichard Lowe	test	%dl, %dl
1861*5d9d9091SRichard Lowe	jz	LABEL(next_8_bytes)
1862*5d9d9091SRichard Lowe
1863*5d9d9091SRichard Lowe	test	$0x01, %dl
1864*5d9d9091SRichard Lowe	jnz	LABEL(Byte0)
1865*5d9d9091SRichard Lowe
1866*5d9d9091SRichard Lowe	test	$0x02, %dl
1867*5d9d9091SRichard Lowe	jnz	LABEL(Byte1)
1868*5d9d9091SRichard Lowe
1869*5d9d9091SRichard Lowe	test	$0x04, %dl
1870*5d9d9091SRichard Lowe	jnz	LABEL(Byte2)
1871*5d9d9091SRichard Lowe
1872*5d9d9091SRichard Lowe	test	$0x08, %dl
1873*5d9d9091SRichard Lowe	jnz	LABEL(Byte3)
1874*5d9d9091SRichard Lowe
1875*5d9d9091SRichard Lowe	test	$0x10, %dl
1876*5d9d9091SRichard Lowe	jnz	LABEL(Byte4)
1877*5d9d9091SRichard Lowe
1878*5d9d9091SRichard Lowe	test	$0x20, %dl
1879*5d9d9091SRichard Lowe	jnz	LABEL(Byte5)
1880*5d9d9091SRichard Lowe
1881*5d9d9091SRichard Lowe	test	$0x40, %dl
1882*5d9d9091SRichard Lowe	jnz	LABEL(Byte6)
1883*5d9d9091SRichard Lowe
1884*5d9d9091SRichard Lowe#ifdef USE_AS_STRNCMP
1885*5d9d9091SRichard Lowe	sub	$7, %r11
1886*5d9d9091SRichard Lowe	jbe	LABEL(strcmp_exitz)
1887*5d9d9091SRichard Lowe#endif
1888*5d9d9091SRichard Lowe	movzx	7(%rsi), %ecx
1889*5d9d9091SRichard Lowe	movzx	7(%rdi), %eax
1890*5d9d9091SRichard Lowe
1891*5d9d9091SRichard Lowe	sub	%ecx, %eax
1892*5d9d9091SRichard Lowe	ret
1893*5d9d9091SRichard Lowe
1894*5d9d9091SRichard Lowe	.p2align 4
1895*5d9d9091SRichard LoweLABEL(Byte0):
1896*5d9d9091SRichard Lowe	/*
1897*5d9d9091SRichard Lowe	 * never need to handle byte 0 for strncmp
1898*5d9d9091SRichard Lowe#ifdef USE_AS_STRNCMP
1899*5d9d9091SRichard Lowe	sub	$0, %r11
1900*5d9d9091SRichard Lowe	jbe	LABEL(strcmp_exitz)
1901*5d9d9091SRichard Lowe#endif
1902*5d9d9091SRichard Lowe	*/
1903*5d9d9091SRichard Lowe	movzx	(%rsi), %ecx
1904*5d9d9091SRichard Lowe	movzx	(%rdi), %eax
1905*5d9d9091SRichard Lowe
1906*5d9d9091SRichard Lowe	sub	%ecx, %eax
1907*5d9d9091SRichard Lowe	ret
1908*5d9d9091SRichard Lowe
1909*5d9d9091SRichard Lowe	.p2align 4
1910*5d9d9091SRichard LoweLABEL(Byte1):
1911*5d9d9091SRichard Lowe
1912*5d9d9091SRichard Lowe#ifdef USE_AS_STRNCMP
1913*5d9d9091SRichard Lowe	sub	$1, %r11
1914*5d9d9091SRichard Lowe	jbe	LABEL(strcmp_exitz)
1915*5d9d9091SRichard Lowe#endif
1916*5d9d9091SRichard Lowe	movzx	1(%rsi), %ecx
1917*5d9d9091SRichard Lowe	movzx	1(%rdi), %eax
1918*5d9d9091SRichard Lowe
1919*5d9d9091SRichard Lowe	sub	%ecx, %eax
1920*5d9d9091SRichard Lowe	ret
1921*5d9d9091SRichard Lowe
1922*5d9d9091SRichard Lowe	.p2align 4
1923*5d9d9091SRichard LoweLABEL(Byte2):
1924*5d9d9091SRichard Lowe
1925*5d9d9091SRichard Lowe#ifdef USE_AS_STRNCMP
1926*5d9d9091SRichard Lowe	sub	$2, %r11
1927*5d9d9091SRichard Lowe	jbe	LABEL(strcmp_exitz)
1928*5d9d9091SRichard Lowe#endif
1929*5d9d9091SRichard Lowe	movzx	2(%rsi), %ecx
1930*5d9d9091SRichard Lowe	movzx	2(%rdi), %eax
1931*5d9d9091SRichard Lowe
1932*5d9d9091SRichard Lowe	sub	%ecx, %eax
1933*5d9d9091SRichard Lowe	ret
1934*5d9d9091SRichard Lowe
1935*5d9d9091SRichard Lowe	.p2align 4
1936*5d9d9091SRichard LoweLABEL(Byte3):
1937*5d9d9091SRichard Lowe
1938*5d9d9091SRichard Lowe#ifdef USE_AS_STRNCMP
1939*5d9d9091SRichard Lowe	sub	$3, %r11
1940*5d9d9091SRichard Lowe	jbe	LABEL(strcmp_exitz)
1941*5d9d9091SRichard Lowe#endif
1942*5d9d9091SRichard Lowe	movzx	3(%rsi), %ecx
1943*5d9d9091SRichard Lowe	movzx	3(%rdi), %eax
1944*5d9d9091SRichard Lowe
1945*5d9d9091SRichard Lowe	sub	%ecx, %eax
1946*5d9d9091SRichard Lowe	ret
1947*5d9d9091SRichard Lowe
1948*5d9d9091SRichard Lowe	.p2align 4
1949*5d9d9091SRichard LoweLABEL(Byte4):
1950*5d9d9091SRichard Lowe
1951*5d9d9091SRichard Lowe#ifdef USE_AS_STRNCMP
1952*5d9d9091SRichard Lowe	sub	$4, %r11
1953*5d9d9091SRichard Lowe	jbe	LABEL(strcmp_exitz)
1954*5d9d9091SRichard Lowe#endif
1955*5d9d9091SRichard Lowe	movzx	4(%rsi), %ecx
1956*5d9d9091SRichard Lowe	movzx	4(%rdi), %eax
1957*5d9d9091SRichard Lowe
1958*5d9d9091SRichard Lowe	sub	%ecx, %eax
1959*5d9d9091SRichard Lowe	ret
1960*5d9d9091SRichard Lowe
1961*5d9d9091SRichard Lowe	.p2align 4
1962*5d9d9091SRichard LoweLABEL(Byte5):
1963*5d9d9091SRichard Lowe
1964*5d9d9091SRichard Lowe#ifdef USE_AS_STRNCMP
1965*5d9d9091SRichard Lowe	sub	$5, %r11
1966*5d9d9091SRichard Lowe	jbe	LABEL(strcmp_exitz)
1967*5d9d9091SRichard Lowe#endif
1968*5d9d9091SRichard Lowe	movzx	5(%rsi), %ecx
1969*5d9d9091SRichard Lowe	movzx	5(%rdi), %eax
1970*5d9d9091SRichard Lowe
1971*5d9d9091SRichard Lowe	sub	%ecx, %eax
1972*5d9d9091SRichard Lowe	ret
1973*5d9d9091SRichard Lowe
1974*5d9d9091SRichard Lowe	.p2align 4
1975*5d9d9091SRichard LoweLABEL(Byte6):
1976*5d9d9091SRichard Lowe
1977*5d9d9091SRichard Lowe#ifdef USE_AS_STRNCMP
1978*5d9d9091SRichard Lowe	sub	$6, %r11
1979*5d9d9091SRichard Lowe	jbe	LABEL(strcmp_exitz)
1980*5d9d9091SRichard Lowe#endif
1981*5d9d9091SRichard Lowe	movzx	6(%rsi), %ecx
1982*5d9d9091SRichard Lowe	movzx	6(%rdi), %eax
1983*5d9d9091SRichard Lowe
1984*5d9d9091SRichard Lowe	sub	%ecx, %eax
1985*5d9d9091SRichard Lowe	ret
1986*5d9d9091SRichard Lowe
1987*5d9d9091SRichard Lowe	.p2align 4
1988*5d9d9091SRichard LoweLABEL(next_8_bytes):
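	/*
	 * The first difference lies in bytes 8-15 of the window: step both
	 * pointers past the first 8 bytes and dispatch on %dh, reusing the
	 * Byte0-Byte6 tails above (byte 7 of the new window is handled by
	 * the fall-through code below).
	 */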
1989*5d9d9091SRichard Lowe	add	$8, %rdi
1990*5d9d9091SRichard Lowe	add	$8, %rsi
1991*5d9d9091SRichard Lowe#ifdef USE_AS_STRNCMP
1992*5d9d9091SRichard Lowe	sub	$8, %r11
1993*5d9d9091SRichard Lowe	jbe	LABEL(strcmp_exitz)
1994*5d9d9091SRichard Lowe#endif
1995*5d9d9091SRichard Lowe	test	$0x01, %dh
1996*5d9d9091SRichard Lowe	jnz	LABEL(Byte0)
1997*5d9d9091SRichard Lowe
1998*5d9d9091SRichard Lowe	test	$0x02, %dh
1999*5d9d9091SRichard Lowe	jnz	LABEL(Byte1)
2000*5d9d9091SRichard Lowe
2001*5d9d9091SRichard Lowe	test	$0x04, %dh
2002*5d9d9091SRichard Lowe	jnz	LABEL(Byte2)
2003*5d9d9091SRichard Lowe
2004*5d9d9091SRichard Lowe	test	$0x08, %dh
2005*5d9d9091SRichard Lowe	jnz	LABEL(Byte3)
2006*5d9d9091SRichard Lowe
2007*5d9d9091SRichard Lowe	test	$0x10, %dh
2008*5d9d9091SRichard Lowe	jnz	LABEL(Byte4)
2009*5d9d9091SRichard Lowe
2010*5d9d9091SRichard Lowe	test	$0x20, %dh
2011*5d9d9091SRichard Lowe	jnz	LABEL(Byte5)
2012*5d9d9091SRichard Lowe
2013*5d9d9091SRichard Lowe	test	$0x40, %dh
2014*5d9d9091SRichard Lowe	jnz	LABEL(Byte6)
2015*5d9d9091SRichard Lowe
2016*5d9d9091SRichard Lowe#ifdef USE_AS_STRNCMP
2017*5d9d9091SRichard Lowe	sub	$7, %r11
2018*5d9d9091SRichard Lowe	jbe	LABEL(strcmp_exitz)
2019*5d9d9091SRichard Lowe#endif
2020*5d9d9091SRichard Lowe	movzx	7(%rsi), %ecx
2021*5d9d9091SRichard Lowe	movzx	7(%rdi), %eax
2022*5d9d9091SRichard Lowe
2023*5d9d9091SRichard Lowe	sub	%ecx, %eax
2024*5d9d9091SRichard Lowe	ret
2025*5d9d9091SRichard Lowe
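/*
 * Jump table for the misaligned cases: each entry is a 32-bit offset of
 * an ashr_N handler relative to the start of the table.  Entry 0 is
 * ashr_0 and entry i (1-15) is ashr_(16-i), matching the per-handler
 * comments above describing which relative string offset each handles.
 */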
2026*5d9d9091SRichard Lowe	.pushsection .rodata
2027*5d9d9091SRichard Lowe	.p2align 4
2028*5d9d9091SRichard LoweLABEL(unaligned_table):
2029*5d9d9091SRichard Lowe	.int	LABEL(ashr_0) - LABEL(unaligned_table)
2030*5d9d9091SRichard Lowe	.int	LABEL(ashr_15) - LABEL(unaligned_table)
2031*5d9d9091SRichard Lowe	.int	LABEL(ashr_14) - LABEL(unaligned_table)
2032*5d9d9091SRichard Lowe	.int	LABEL(ashr_13) - LABEL(unaligned_table)
2033*5d9d9091SRichard Lowe	.int	LABEL(ashr_12) - LABEL(unaligned_table)
2034*5d9d9091SRichard Lowe	.int	LABEL(ashr_11) - LABEL(unaligned_table)
2035*5d9d9091SRichard Lowe	.int	LABEL(ashr_10) - LABEL(unaligned_table)
2036*5d9d9091SRichard Lowe	.int	LABEL(ashr_9) - LABEL(unaligned_table)
2037*5d9d9091SRichard Lowe	.int	LABEL(ashr_8) - LABEL(unaligned_table)
2038*5d9d9091SRichard Lowe	.int	LABEL(ashr_7) - LABEL(unaligned_table)
2039*5d9d9091SRichard Lowe	.int	LABEL(ashr_6) - LABEL(unaligned_table)
2040*5d9d9091SRichard Lowe	.int	LABEL(ashr_5) - LABEL(unaligned_table)
2041*5d9d9091SRichard Lowe	.int	LABEL(ashr_4) - LABEL(unaligned_table)
2042*5d9d9091SRichard Lowe	.int	LABEL(ashr_3) - LABEL(unaligned_table)
2043*5d9d9091SRichard Lowe	.int	LABEL(ashr_2) - LABEL(unaligned_table)
2044*5d9d9091SRichard Lowe	.int	LABEL(ashr_1) - LABEL(unaligned_table)
2045*5d9d9091SRichard Lowe	.popsection
2046*5d9d9091SRichard Lowe#ifdef USE_AS_STRNCMP
2047*5d9d9091SRichard Lowe	SET_SIZE(strncmp)
2048*5d9d9091SRichard Lowe#else
2049*5d9d9091SRichard Lowe	SET_SIZE(strcmp)		/* (const char *, const char *) */
2050*5d9d9091SRichard Lowe#endif
2051