xref: /freebsd/contrib/bionic-x86_64-string/ssse3-strcmp-slm.S (revision 3dd5524264095ed8612c28908e13f80668eff2f9)
1/*
2Copyright (c) 2014, Intel Corporation
3All rights reserved.
4
5Redistribution and use in source and binary forms, with or without
6modification, are permitted provided that the following conditions are met:
7
8    * Redistributions of source code must retain the above copyright notice,
9    * this list of conditions and the following disclaimer.
10
11    * Redistributions in binary form must reproduce the above copyright notice,
12    * this list of conditions and the following disclaimer in the documentation
13    * and/or other materials provided with the distribution.
14
15    * Neither the name of Intel Corporation nor the names of its contributors
16    * may be used to endorse or promote products derived from this software
17    * without specific prior written permission.
18
19THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
20ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
23ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
24(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
26ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29*/
30
31#ifdef USE_AS_STRNCMP
/*
 * UPDATE_STRNCMP_COUNTER (strncmp build): charge the bytes examined in the
 * first, partial 16-byte step against the byte budget kept in %r11.
 * The candidate new budget, %r11 - 16 + %rcx, is built in %r9 and only
 * copied to %r11 once it is known to be usable.  At the places where the
 * macro is expanded %rcx holds the start offset that was used to shift
 * the comparison mask.  Clobbers %r9 and the flags.
 */
32/* Since the counter, %r11, is unsigned, we branch to strcmp_exitz
33   if the new counter > the old one or is 0.  */
34#define UPDATE_STRNCMP_COUNTER				\
35	/* calculate left number to compare */		\
36	lea	-16(%rcx, %r11), %r9;			\
37	cmp	%r9, %r11;				\
38	jb	L(strcmp_exitz);			\
39	test	%r9, %r9;				\
40	je	L(strcmp_exitz);			\
41	mov	%r9, %r11
42
43#else
/*
 * Plain strcmp build: there is no byte budget, so the macro expands to
 * nothing, and the entry symbol defaults to strcmp unless STRCMP was
 * defined beforehand.
 */
44#define UPDATE_STRNCMP_COUNTER
45#ifndef STRCMP
46#define STRCMP		strcmp
47#endif
48#endif
49
/* L(label): spell a label with the .L prefix. */
50#ifndef L
51# define L(label)	.L##label
52#endif
53
/* CFI wrappers, overridable by whoever includes this file. */
54#ifndef cfi_startproc
55# define cfi_startproc			.cfi_startproc
56#endif
57
58#ifndef cfi_endproc
59# define cfi_endproc			.cfi_endproc
60#endif
61
/*
 * ENTRY(name): declare `name` as a global function symbol, align it to
 * 16 bytes (.p2align 4), emit the label and open the CFI region.
 */
62#ifndef ENTRY
63# define ENTRY(name)			\
64	.type name,  @function; 	\
65	.globl name;			\
66	.p2align 4;			\
67name:					\
68	cfi_startproc
69#endif
70
/* END(name): close the CFI region and record the size of `name`. */
71#ifndef END
72# define END(name)			\
73	cfi_endproc;			\
74	.size name, .-name
75#endif
/* RETURN: return instruction; no use is visible in this part of the file. */
76#define RETURN ret
/* The routine is placed in its own section, .text.ssse3. */
77	.section .text.ssse3,"ax",@progbits
78ENTRY (STRCMP)
79/*
80 * This implementation uses SSE to compare up to 16 bytes at a time.
81 */
/*
 * Inputs as used below: %rdi and %rsi point to the two strings; in the
 * strncmp build %rdx is the maximum number of bytes to compare.
 * The return-value code (L(strcmp_exitz), L(Byte0), L(less16bytes)) is
 * not part of this section of the file.
 */
82#ifdef USE_AS_STRNCMP
	/* n == 0 and n == 1 are handled by dedicated exits */
83	test	%rdx, %rdx
84	je	L(strcmp_exitz)
85	cmp	$1, %rdx
86	je	L(Byte0)
87	mov	%rdx, %r11		/* %r11 = bytes still allowed to be compared */
88#endif
89	mov	%esi, %ecx
90	mov	%edi, %eax
91/* Use 64bit AND here to avoid long NOP padding.  */
92	and	$0x3f, %rcx		/* rsi alignment in cache line */
93	and	$0x3f, %rax		/* rdi alignment in cache line */
	/*
	 * Fast path: only taken when both offsets within the 64-byte line are
	 * at most 0x30, i.e. a 16-byte load from either pointer stays inside
	 * its cache line.
	 */
94	cmp	$0x30, %ecx
95	ja	L(crosscache)	/* rsi: 16-byte load will cross cache line */
96	cmp	$0x30, %eax
97	ja	L(crosscache)	/* rdi: 16-byte load will cross cache line */
	/* 16 bytes of each string, loaded as two 8-byte halves */
98	movlpd	(%rdi), %xmm1
99	movlpd	(%rsi), %xmm2
100	movhpd	8(%rdi), %xmm1
101	movhpd	8(%rsi), %xmm2
102	pxor	%xmm0, %xmm0		/* clear %xmm0 for null char checks */
103	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
104	pcmpeqb	%xmm2, %xmm1		/* compare first 16 bytes for equality */
	/*
	 * After the psubb a byte has its top bit set only if the two strings
	 * agree there and the (%rdi) byte is not null, so the mask is 0xffff
	 * exactly when all 16 bytes match and none of them ends the string.
	 */
105	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
106	pmovmskb %xmm1, %edx
107	sub	$0xffff, %edx		/* if first 16 bytes are same, edx == 0xffff */
108	jnz	L(less16bytes)	/* If not, find different value or null char */
109#ifdef USE_AS_STRNCMP
110	sub	$16, %r11
111	jbe	L(strcmp_exitz)	/* finish comparison */
112#endif
113	add	$16, %rsi		/* prepare to search next 16 bytes */
114	add	$16, %rdi		/* prepare to search next 16 bytes */
115
116	/*
117	 * Determine source and destination string offsets from 16-byte alignment.
118	 * Use relative offset difference between the two to determine which case
119	 * below to use.
120	 */
121	.p2align 4
122L(crosscache):
	/*
	 * Round both pointers down to their 16-byte block and keep the two
	 * in-block offsets in %ecx (rsi) and %eax (rdi).  %edx starts as the
	 * "all 16 bytes match" mask; the cases shift it right by %cl.
	 */
123	and	$0xfffffffffffffff0, %rsi	/* force %rsi is 16 byte aligned */
124	and	$0xfffffffffffffff0, %rdi	/* force %rdi is 16 byte aligned */
125	mov	$0xffff, %edx			/* for equivalent offset */
126	xor	%r8d, %r8d
127	and	$0xf, %ecx			/* offset of rsi */
128	and	$0xf, %eax			/* offset of rdi */
129	cmp	%eax, %ecx
130	je	L(ashr_0)			/* rsi and rdi relative offset same */
131	ja	L(bigger)
	/*
	 * Offset of rsi is the smaller one: swap pointers and offsets so that
	 * from here on %rcx > %rax, and leave a non-zero %r8d behind to record
	 * that the operands were exchanged.
	 */
132	mov	%edx, %r8d			/* r8d is offset flag for exit tail */
133	xchg	%ecx, %eax
134	xchg	%rsi, %rdi
135L(bigger):
	/* %r9 = 15 + %rax - %rcx, the "relative offset" (0..14) named in the case headers */
136	lea	15(%rax), %r9
137	sub	%rcx, %r9
	/*
	 * L(unaligned_table) is read as 32-bit signed entries that are added
	 * to the table address to form the jump target.  The table itself is
	 * defined outside this section of the file.
	 */
138	lea	L(unaligned_table)(%rip), %r10
139	movslq	(%r10, %r9,4), %r9
140	lea	(%r10, %r9), %r10
141	jmp	*%r10				/* jump to corresponding case */
142
143/*
144 * The following cases will be handled by ashr_0
145 *  rcx(offset of rsi)  rax(offset of rdi)  relative offset  corresponding case
146 *        n(0~15)            n(0~15)           15(15+ n-n)         ashr_0
147 */
148	.p2align 4
149L(ashr_0):
150
	/*
	 * Both strings start at the same offset inside their aligned blocks,
	 * so the blocks can be compared directly.  shr %cl removes the mask
	 * bits of the bytes that precede the start of the strings.
	 */
151	movdqa	(%rsi), %xmm1
152	pxor	%xmm0, %xmm0			/* clear %xmm0 for null char check */
153	pcmpeqb	%xmm1, %xmm0			/* Any null chars? */
154	pcmpeqb	(%rdi), %xmm1			/* compare 16 bytes for equality */
155	psubb	%xmm0, %xmm1			/* packed sub of comparison results*/
156	pmovmskb %xmm1, %r9d
157	shr	%cl, %edx			/* adjust 0xffff for offset */
158	shr	%cl, %r9d			/* adjust for 16-byte offset */
159	sub	%r9d, %edx
160	/*
161	 * edx must be the same with r9d if in left byte (16-rcx) is equal to
162	 * the start from (16-rax) and no null char was seen.
163	 */
164	jne	L(less32bytes)		/* mismatch or null char */
165	UPDATE_STRNCMP_COUNTER
166	mov	$16, %rcx
167	mov	$16, %r9
168	pxor	%xmm0, %xmm0			/* clear xmm0, may have changed above */
169
170	/*
171	 * Now both strings are aligned at 16-byte boundary. Loop over strings
172	 * checking 32-bytes per iteration.
173	 */
174	.p2align 4
175L(loop_ashr_0):
	/*
	 * %rcx is the running byte index into both blocks.  %xmm0 is all zero
	 * at the start of each half: a half that sees a null byte leaves
	 * through L(exit), so no pxor is needed inside the loop.
	 */
176	movdqa	(%rsi, %rcx), %xmm1
177	movdqa	(%rdi, %rcx), %xmm2
178
179	pcmpeqb	%xmm1, %xmm0
180	pcmpeqb	%xmm2, %xmm1
181	psubb	%xmm0, %xmm1
182	pmovmskb %xmm1, %edx
183	sub	$0xffff, %edx
184	jnz	L(exit)		/* mismatch or null char seen */
185
186#ifdef USE_AS_STRNCMP
	/* 16 more bytes matched: charge them to the strncmp budget */
187	sub	$16, %r11
188	jbe	L(strcmp_exitz)
189#endif
190	add	$16, %rcx
	/* second half of the unrolled iteration */
191	movdqa	(%rsi, %rcx), %xmm1
192	movdqa	(%rdi, %rcx), %xmm2
193
194	pcmpeqb	%xmm1, %xmm0
195	pcmpeqb	%xmm2, %xmm1
196	psubb	%xmm0, %xmm1
197	pmovmskb %xmm1, %edx
198	sub	$0xffff, %edx
199	jnz	L(exit)
200#ifdef USE_AS_STRNCMP
201	sub	$16, %r11
202	jbe	L(strcmp_exitz)
203#endif
204	add	$16, %rcx
205	jmp	L(loop_ashr_0)
206
207/*
208 * The following cases will be handled by ashr_1
209 * rcx(offset of rsi)  rax(offset of rdi)   relative offset   	corresponding case
210 *        n(15)            n -15            0(15 +(n-15) - n)         ashr_1
211 */
212	.p2align 4
213L(ashr_1):
	/*
	 * First step: pslldq $15 moves the first byte of the (%rdi) block to
	 * the top of %xmm2 so that it faces the tail of the (%rsi) block;
	 * shr %cl discards the mask bits below the start offset held in %cl.
	 */
214	pxor	%xmm0, %xmm0
215	movdqa	(%rdi), %xmm2
216	movdqa	(%rsi), %xmm1
217	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
218	pslldq	$15, %xmm2		/* shift first string to align with second */
219	pcmpeqb	%xmm1, %xmm2		/* compare 16 bytes for equality */
220	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
221	pmovmskb %xmm2, %r9d
222	shr	%cl, %edx		/* adjust 0xffff for offset */
223	shr	%cl, %r9d		/* adjust for 16-byte offset */
224	sub	%r9d, %edx
225	jnz	L(less32bytes)	/* mismatch or null char seen */
	/* keep the (%rdi) block: its upper 15 bytes are spliced in by palignr */
226	movdqa	(%rdi), %xmm3
227	UPDATE_STRNCMP_COUNTER
228
229	pxor	%xmm0, %xmm0
230	mov	$16, %rcx		/* index for loads*/
231	mov	$1, %r9d		/* byte position left over from less32bytes case */
232	/*
233	 * Setup %r10 value allows us to detect crossing a page boundary.
234	 * When %r10 goes positive we have crossed a page boundary and
235	 * need to do a nibble.
236	 */
237	lea	1(%rdi), %r10
238	and	$0xfff, %r10		/* offset into 4K page */
239	sub	$0x1000, %r10		/* subtract 4K pagesize */
240
241	.p2align 4
242L(loop_ashr_1):
	/*
	 * Unrolled twice.  %xmm3 carries the previous (%rdi) block and %xmm4
	 * the one just loaded.  %xmm0 is all zero at the start of every step
	 * (a step that sees a null byte leaves through L(exit)), so the null
	 * test needs no pxor inside the loop.
	 */
243	add	$16, %r10
244	jg	L(nibble_ashr_1)	/* cross page boundary */
245
246L(gobble_ashr_1):
247	movdqa	(%rsi, %rcx), %xmm1
248	movdqa	(%rdi, %rcx), %xmm2
249	movdqa	%xmm2, %xmm4		 /* store for next cycle */
250
251	palignr $1, %xmm3, %xmm2        /* merge into one 16byte value */
252
253	pcmpeqb	%xmm1, %xmm0
254	pcmpeqb	%xmm2, %xmm1
255	psubb	%xmm0, %xmm1
256	pmovmskb %xmm1, %edx
257	sub	$0xffff, %edx
258	jnz	L(exit)
259
260#ifdef USE_AS_STRNCMP
	/* 16 more bytes matched: charge them to the strncmp budget */
261	sub	$16, %r11
262	jbe	L(strcmp_exitz)
263#endif
264	add	$16, %rcx
265	movdqa	%xmm4, %xmm3
266
267	add	$16, %r10
268	jg	L(nibble_ashr_1)	/* cross page boundary */
269
270	movdqa	(%rsi, %rcx), %xmm1
271	movdqa	(%rdi, %rcx), %xmm2
272	movdqa	%xmm2, %xmm4		/* store for next cycle */
273
274	palignr $1, %xmm3, %xmm2        /* merge into one 16byte value */
275
276	pcmpeqb	%xmm1, %xmm0
277	pcmpeqb	%xmm2, %xmm1
278	psubb	%xmm0, %xmm1
279	pmovmskb %xmm1, %edx
280	sub	$0xffff, %edx
281	jnz	L(exit)
282
283#ifdef USE_AS_STRNCMP
284	sub	$16, %r11
285	jbe	L(strcmp_exitz)
286#endif
287	add	$16, %rcx
288	movdqa	%xmm4, %xmm3
289	jmp	L(loop_ashr_1)
290
291	/*
292	 * Nibble avoids loads across page boundary. This is to avoid a potential
293	 * access into unmapped memory.
294	 */
295	.p2align 4
296L(nibble_ashr_1):
	/*
	 * Loading the next (%rdi) block would enter a new 4K page.  First check
	 * whether the 15 bytes of the previous block still pending in %xmm3
	 * (bytes 1..15) already end the comparison.
	 */
297	pcmpeqb	%xmm3, %xmm0		 /* check nibble for null char*/
298	pmovmskb %xmm0, %edx
299	test	$0xfffe, %edx
300	jnz	L(ashr_1_exittail)	/* find null char*/
301
302#ifdef USE_AS_STRNCMP
	/*
	 * 15 bytes are pending, so a remaining count of up to 15 is covered by
	 * %xmm3 alone and must not trigger the load from the next page, which
	 * may be unmapped.
	 */
303	cmp	$15, %r11
304	jbe	L(ashr_1_exittail)
305#endif
306
	/* the strings continue: clear the null mask and restart the page counter */
307	pxor	%xmm0, %xmm0
308	sub	$0x1000, %r10		/* subtract 4K from %r10 */
309	jmp	L(gobble_ashr_1)
310
311	/*
312	 * Once find null char, determine if there is a string mismatch
313	 * before the null char.
314	 */
315	.p2align 4
316L(ashr_1_exittail):
	/*
	 * Shift the null mask and the pending bytes down by 1 so that they
	 * start at byte 0, facing the (%rsi) block; the final comparison is
	 * left to L(aftertail).
	 */
317	movdqa	(%rsi, %rcx), %xmm1
318	psrldq	$1, %xmm0
319	psrldq	$1, %xmm3
320	jmp	L(aftertail)
321
322/*
323 * The following cases will be handled by ashr_2
324 * rcx(offset of rsi)  rax(offset of rdi)   relative offset   	corresponding case
325 *        n(14~15)            n -14         1(15 +(n-14) - n)         ashr_2
326 */
327	.p2align 4
328L(ashr_2):
	/*
	 * First step: pslldq $14 moves the leading bytes of the (%rdi) block
	 * (at most 2) to the top of %xmm2 so that they face the tail of the
	 * (%rsi) block; shr %cl discards the mask bits below the start offset
	 * held in %cl.
	 */
329	pxor	%xmm0, %xmm0
330	movdqa	(%rdi), %xmm2
331	movdqa	(%rsi), %xmm1
332	pcmpeqb	%xmm1, %xmm0
333	pslldq	$14, %xmm2
334	pcmpeqb	%xmm1, %xmm2
335	psubb	%xmm0, %xmm2
336	pmovmskb %xmm2, %r9d
337	shr	%cl, %edx
338	shr	%cl, %r9d
339	sub	%r9d, %edx
340	jnz	L(less32bytes)
	/* keep the (%rdi) block: its upper 14 bytes are spliced in by palignr */
341	movdqa	(%rdi), %xmm3
342	UPDATE_STRNCMP_COUNTER
343
344	pxor	%xmm0, %xmm0
345	mov	$16, %rcx	/* index for loads */
346	mov	$2, %r9d	/* byte position left over from less32bytes case */
347	/*
348	 * Setup %r10 value allows us to detect crossing a page boundary.
349	 * When %r10 goes positive we have crossed a page boundary and
350	 * need to do a nibble.
351	 */
352	lea	2(%rdi), %r10
353	and	$0xfff, %r10	/* offset into 4K page */
354	sub	$0x1000, %r10	/* subtract 4K pagesize */
355
356	.p2align 4
357L(loop_ashr_2):
	/*
	 * Unrolled twice.  %xmm3 carries the previous (%rdi) block and %xmm4
	 * the one just loaded.  %xmm0 is all zero at the start of every step
	 * (a step that sees a null byte leaves through L(exit)), so the null
	 * test needs no pxor inside the loop.
	 */
358	add	$16, %r10
359	jg	L(nibble_ashr_2)
360
361L(gobble_ashr_2):
362	movdqa	(%rsi, %rcx), %xmm1
363	movdqa	(%rdi, %rcx), %xmm2
364	movdqa	%xmm2, %xmm4
365
366	palignr $2, %xmm3, %xmm2        /* merge into one 16byte value */
367
368	pcmpeqb	%xmm1, %xmm0
369	pcmpeqb	%xmm2, %xmm1
370	psubb	%xmm0, %xmm1
371	pmovmskb %xmm1, %edx
372	sub	$0xffff, %edx
373	jnz	L(exit)
374
375#ifdef USE_AS_STRNCMP
	/* 16 more bytes matched: charge them to the strncmp budget */
376	sub	$16, %r11
377	jbe	L(strcmp_exitz)
378#endif
379
380	add	$16, %rcx
381	movdqa	%xmm4, %xmm3
382
383	add	$16, %r10
384	jg	L(nibble_ashr_2)	/* cross page boundary */
385
386	movdqa	(%rsi, %rcx), %xmm1
387	movdqa	(%rdi, %rcx), %xmm2
388	movdqa	%xmm2, %xmm4
389
390	palignr $2, %xmm3, %xmm2        /* merge into one 16byte value */
391
392	pcmpeqb	%xmm1, %xmm0
393	pcmpeqb	%xmm2, %xmm1
394	psubb	%xmm0, %xmm1
395	pmovmskb %xmm1, %edx
396	sub	$0xffff, %edx
397	jnz	L(exit)
398
399#ifdef USE_AS_STRNCMP
400	sub	$16, %r11
401	jbe	L(strcmp_exitz)
402#endif
403
404	add	$16, %rcx
405	movdqa	%xmm4, %xmm3
406	jmp	L(loop_ashr_2)
407
408	.p2align 4
409L(nibble_ashr_2):
	/*
	 * Loading the next (%rdi) block would enter a new 4K page.  First check
	 * whether the 14 bytes of the previous block still pending in %xmm3
	 * (bytes 2..15) already end the comparison.
	 */
410	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
411	pmovmskb %xmm0, %edx
412	test	$0xfffc, %edx
413	jnz	L(ashr_2_exittail)
414
415#ifdef USE_AS_STRNCMP
	/*
	 * 14 bytes are pending, so a remaining count of up to 14 is covered by
	 * %xmm3 alone and must not trigger the load from the next page, which
	 * may be unmapped.
	 */
416	cmp	$14, %r11
417	jbe	L(ashr_2_exittail)
418#endif
419
	/* the strings continue: clear the null mask and restart the page counter */
420	pxor	%xmm0, %xmm0
421	sub	$0x1000, %r10
422	jmp	L(gobble_ashr_2)
423
424	.p2align 4
425L(ashr_2_exittail):
	/*
	 * Shift the null mask and the pending bytes down by 2 so that they
	 * start at byte 0, facing the (%rsi) block; the final comparison is
	 * left to L(aftertail).
	 */
426	movdqa	(%rsi, %rcx), %xmm1
427	psrldq	$2, %xmm0
428	psrldq	$2, %xmm3
429	jmp	L(aftertail)
430
431/*
432 * The following cases will be handled by ashr_3
433 *  rcx(offset of rsi)  rax(offset of rdi)  relative offset	 corresponding case
434 *        n(13~15)            n -13         2(15 +(n-13) - n)         ashr_3
435 */
436	.p2align 4
437L(ashr_3):
	/*
	 * First step: pslldq $13 moves the leading bytes of the (%rdi) block
	 * (at most 3) to the top of %xmm2 so that they face the tail of the
	 * (%rsi) block; shr %cl discards the mask bits below the start offset
	 * held in %cl.
	 */
438	pxor	%xmm0, %xmm0
439	movdqa	(%rdi), %xmm2
440	movdqa	(%rsi), %xmm1
441	pcmpeqb	%xmm1, %xmm0
442	pslldq	$13, %xmm2
443	pcmpeqb	%xmm1, %xmm2
444	psubb	%xmm0, %xmm2
445	pmovmskb %xmm2, %r9d
446	shr	%cl, %edx
447	shr	%cl, %r9d
448	sub	%r9d, %edx
449	jnz	L(less32bytes)
	/* keep the (%rdi) block: its upper 13 bytes are spliced in by palignr */
450	movdqa	(%rdi), %xmm3
451
452	UPDATE_STRNCMP_COUNTER
453
454	pxor	%xmm0, %xmm0
455	mov	$16, %rcx	/* index for loads */
456	mov	$3, %r9d	/* byte position left over from less32bytes case */
457	/*
458	 * Setup %r10 value allows us to detect crossing a page boundary.
459	 * When %r10 goes positive we have crossed a page boundary and
460	 * need to do a nibble.
461	 */
462	lea	3(%rdi), %r10
463	and	$0xfff, %r10	/* offset into 4K page */
464	sub	$0x1000, %r10	/* subtract 4K pagesize */
465
466	.p2align 4
467L(loop_ashr_3):
	/*
	 * Unrolled twice.  %xmm3 carries the previous (%rdi) block and %xmm4
	 * the one just loaded.  %xmm0 is all zero at the start of every step
	 * (a step that sees a null byte leaves through L(exit)), so the null
	 * test needs no pxor inside the loop.
	 */
468	add	$16, %r10
469	jg	L(nibble_ashr_3)
470
471L(gobble_ashr_3):
472	movdqa	(%rsi, %rcx), %xmm1
473	movdqa	(%rdi, %rcx), %xmm2
474	movdqa	%xmm2, %xmm4
475
476	palignr $3, %xmm3, %xmm2        /* merge into one 16byte value */
477
478	pcmpeqb	%xmm1, %xmm0
479	pcmpeqb	%xmm2, %xmm1
480	psubb	%xmm0, %xmm1
481	pmovmskb %xmm1, %edx
482	sub	$0xffff, %edx
483	jnz	L(exit)
484
485#ifdef USE_AS_STRNCMP
	/* 16 more bytes matched: charge them to the strncmp budget */
486	sub	$16, %r11
487	jbe	L(strcmp_exitz)
488#endif
489
490	add	$16, %rcx
491	movdqa	%xmm4, %xmm3
492
493	add	$16, %r10
494	jg	L(nibble_ashr_3)	/* cross page boundary */
495
496	movdqa	(%rsi, %rcx), %xmm1
497	movdqa	(%rdi, %rcx), %xmm2
498	movdqa	%xmm2, %xmm4
499
500	palignr $3, %xmm3, %xmm2        /* merge into one 16byte value */
501
502	pcmpeqb	%xmm1, %xmm0
503	pcmpeqb	%xmm2, %xmm1
504	psubb	%xmm0, %xmm1
505	pmovmskb %xmm1, %edx
506	sub	$0xffff, %edx
507	jnz	L(exit)
508
509#ifdef USE_AS_STRNCMP
510	sub	$16, %r11
511	jbe	L(strcmp_exitz)
512#endif
513
514	add	$16, %rcx
515	movdqa	%xmm4, %xmm3
516	jmp	L(loop_ashr_3)
517
518	.p2align 4
519L(nibble_ashr_3):
	/*
	 * Loading the next (%rdi) block would enter a new 4K page.  First check
	 * whether the 13 bytes of the previous block still pending in %xmm3
	 * (bytes 3..15) already end the comparison.
	 */
520	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
521	pmovmskb %xmm0, %edx
522	test	$0xfff8, %edx
523	jnz	L(ashr_3_exittail)
524
525#ifdef USE_AS_STRNCMP
	/*
	 * 13 bytes are pending, so a remaining count of up to 13 is covered by
	 * %xmm3 alone and must not trigger the load from the next page, which
	 * may be unmapped.
	 */
526	cmp	$13, %r11
527	jbe	L(ashr_3_exittail)
528#endif
529
	/* the strings continue: clear the null mask and restart the page counter */
530	pxor	%xmm0, %xmm0
531	sub	$0x1000, %r10
532	jmp	L(gobble_ashr_3)
533
534	.p2align 4
535L(ashr_3_exittail):
	/*
	 * Shift the null mask and the pending bytes down by 3 so that they
	 * start at byte 0, facing the (%rsi) block; the final comparison is
	 * left to L(aftertail).
	 */
536	movdqa	(%rsi, %rcx), %xmm1
537	psrldq	$3, %xmm0
538	psrldq	$3, %xmm3
539	jmp	L(aftertail)
540
541/*
542 * The following cases will be handled by ashr_4
543 *  rcx(offset of rsi)  rax(offset of rdi)  relative offset	 corresponding case
544 *        n(12~15)            n -12         3(15 +(n-12) - n)         ashr_4
545 */
546	.p2align 4
547L(ashr_4):
	/*
	 * First step: pslldq $12 moves the leading bytes of the (%rdi) block
	 * (at most 4) to the top of %xmm2 so that they face the tail of the
	 * (%rsi) block; shr %cl discards the mask bits below the start offset
	 * held in %cl.
	 */
548	pxor	%xmm0, %xmm0
549	movdqa	(%rdi), %xmm2
550	movdqa	(%rsi), %xmm1
551	pcmpeqb	%xmm1, %xmm0
552	pslldq	$12, %xmm2
553	pcmpeqb	%xmm1, %xmm2
554	psubb	%xmm0, %xmm2
555	pmovmskb %xmm2, %r9d
556	shr	%cl, %edx
557	shr	%cl, %r9d
558	sub	%r9d, %edx
559	jnz	L(less32bytes)
	/* keep the (%rdi) block: its upper 12 bytes are spliced in by palignr */
560	movdqa	(%rdi), %xmm3
561
562	UPDATE_STRNCMP_COUNTER
563
564	pxor	%xmm0, %xmm0
565	mov	$16, %rcx	/* index for loads */
566	mov	$4, %r9d	/* byte position left over from less32bytes case */
567	/*
568	 * Setup %r10 value allows us to detect crossing a page boundary.
569	 * When %r10 goes positive we have crossed a page boundary and
570	 * need to do a nibble.
571	 */
572	lea	4(%rdi), %r10
573	and	$0xfff, %r10	/* offset into 4K page */
574	sub	$0x1000, %r10	/* subtract 4K pagesize */
575
576	.p2align 4
577L(loop_ashr_4):
	/*
	 * Unrolled twice.  %xmm3 carries the previous (%rdi) block and %xmm4
	 * the one just loaded.  %xmm0 is all zero at the start of every step
	 * (a step that sees a null byte leaves through L(exit)), so the null
	 * test needs no pxor inside the loop.
	 */
578	add	$16, %r10
579	jg	L(nibble_ashr_4)
580
581L(gobble_ashr_4):
582	movdqa	(%rsi, %rcx), %xmm1
583	movdqa	(%rdi, %rcx), %xmm2
584	movdqa	%xmm2, %xmm4
585
586	palignr $4, %xmm3, %xmm2        /* merge into one 16byte value */
587
588	pcmpeqb	%xmm1, %xmm0
589	pcmpeqb	%xmm2, %xmm1
590	psubb	%xmm0, %xmm1
591	pmovmskb %xmm1, %edx
592	sub	$0xffff, %edx
593	jnz	L(exit)
594
595#ifdef USE_AS_STRNCMP
	/* 16 more bytes matched: charge them to the strncmp budget */
596	sub	$16, %r11
597	jbe	L(strcmp_exitz)
598#endif
599
600	add	$16, %rcx
601	movdqa	%xmm4, %xmm3
602
603	add	$16, %r10
604	jg	L(nibble_ashr_4)	/* cross page boundary */
605
606	movdqa	(%rsi, %rcx), %xmm1
607	movdqa	(%rdi, %rcx), %xmm2
608	movdqa	%xmm2, %xmm4
609
610	palignr $4, %xmm3, %xmm2        /* merge into one 16byte value */
611
612	pcmpeqb	%xmm1, %xmm0
613	pcmpeqb	%xmm2, %xmm1
614	psubb	%xmm0, %xmm1
615	pmovmskb %xmm1, %edx
616	sub	$0xffff, %edx
617	jnz	L(exit)
618
619#ifdef USE_AS_STRNCMP
620	sub	$16, %r11
621	jbe	L(strcmp_exitz)
622#endif
623
624	add	$16, %rcx
625	movdqa	%xmm4, %xmm3
626	jmp	L(loop_ashr_4)
627
628	.p2align 4
629L(nibble_ashr_4):
	/*
	 * Loading the next (%rdi) block would enter a new 4K page.  First check
	 * whether the 12 bytes of the previous block still pending in %xmm3
	 * (bytes 4..15) already end the comparison.
	 */
630	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
631	pmovmskb %xmm0, %edx
632	test	$0xfff0, %edx
633	jnz	L(ashr_4_exittail)
634
635#ifdef USE_AS_STRNCMP
	/*
	 * 12 bytes are pending, so a remaining count of up to 12 is covered by
	 * %xmm3 alone and must not trigger the load from the next page, which
	 * may be unmapped.
	 */
636	cmp	$12, %r11
637	jbe	L(ashr_4_exittail)
638#endif
639
	/* the strings continue: clear the null mask and restart the page counter */
640	pxor	%xmm0, %xmm0
641	sub	$0x1000, %r10
642	jmp	L(gobble_ashr_4)
643
644	.p2align 4
645L(ashr_4_exittail):
	/*
	 * Shift the null mask and the pending bytes down by 4 so that they
	 * start at byte 0, facing the (%rsi) block; the final comparison is
	 * left to L(aftertail).
	 */
646	movdqa	(%rsi, %rcx), %xmm1
647	psrldq	$4, %xmm0
648	psrldq	$4, %xmm3
649	jmp	L(aftertail)
650
651/*
652 * The following cases will be handled by ashr_5
653 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset      corresponding case
654 *        n(11~15)          n - 11      	  4(15 +(n-11) - n)         ashr_5
655 */
656	.p2align 4
657L(ashr_5):
	/*
	 * First step: pslldq $11 moves the leading bytes of the (%rdi) block
	 * (at most 5) to the top of %xmm2 so that they face the tail of the
	 * (%rsi) block; shr %cl discards the mask bits below the start offset
	 * held in %cl.
	 */
658	pxor	%xmm0, %xmm0
659	movdqa	(%rdi), %xmm2
660	movdqa	(%rsi), %xmm1
661	pcmpeqb	%xmm1, %xmm0
662	pslldq	$11, %xmm2
663	pcmpeqb	%xmm1, %xmm2
664	psubb	%xmm0, %xmm2
665	pmovmskb %xmm2, %r9d
666	shr	%cl, %edx
667	shr	%cl, %r9d
668	sub	%r9d, %edx
669	jnz	L(less32bytes)
	/* keep the (%rdi) block: its upper 11 bytes are spliced in by palignr */
670	movdqa	(%rdi), %xmm3
671
672	UPDATE_STRNCMP_COUNTER
673
674	pxor	%xmm0, %xmm0
675	mov	$16, %rcx	/* index for loads */
676	mov	$5, %r9d	/* byte position left over from less32bytes case */
677	/*
678	 * Setup %r10 value allows us to detect crossing a page boundary.
679	 * When %r10 goes positive we have crossed a page boundary and
680	 * need to do a nibble.
681	 */
682	lea	5(%rdi), %r10
683	and	$0xfff, %r10	/* offset into 4K page */
684	sub	$0x1000, %r10	/* subtract 4K pagesize */
685
686	.p2align 4
687L(loop_ashr_5):
	/*
	 * Unrolled twice.  %xmm3 carries the previous (%rdi) block and %xmm4
	 * the one just loaded.  %xmm0 is all zero at the start of every step
	 * (a step that sees a null byte leaves through L(exit)), so the null
	 * test needs no pxor inside the loop.
	 */
688	add	$16, %r10
689	jg	L(nibble_ashr_5)
690
691L(gobble_ashr_5):
692	movdqa	(%rsi, %rcx), %xmm1
693	movdqa	(%rdi, %rcx), %xmm2
694	movdqa	%xmm2, %xmm4
695
696	palignr $5, %xmm3, %xmm2        /* merge into one 16byte value */
697
698	pcmpeqb	%xmm1, %xmm0
699	pcmpeqb	%xmm2, %xmm1
700	psubb	%xmm0, %xmm1
701	pmovmskb %xmm1, %edx
702	sub	$0xffff, %edx
703	jnz	L(exit)
704
705#ifdef USE_AS_STRNCMP
	/* 16 more bytes matched: charge them to the strncmp budget */
706	sub	$16, %r11
707	jbe	L(strcmp_exitz)
708#endif
709
710	add	$16, %rcx
711	movdqa	%xmm4, %xmm3
712
713	add	$16, %r10
714	jg	L(nibble_ashr_5)	/* cross page boundary */
715
716	movdqa	(%rsi, %rcx), %xmm1
717	movdqa	(%rdi, %rcx), %xmm2
718	movdqa	%xmm2, %xmm4
719
720	palignr $5, %xmm3, %xmm2        /* merge into one 16byte value */
721
722	pcmpeqb	%xmm1, %xmm0
723	pcmpeqb	%xmm2, %xmm1
724	psubb	%xmm0, %xmm1
725	pmovmskb %xmm1, %edx
726	sub	$0xffff, %edx
727	jnz	L(exit)
728
729#ifdef USE_AS_STRNCMP
730	sub	$16, %r11
731	jbe	L(strcmp_exitz)
732#endif
733
734	add	$16, %rcx
735	movdqa	%xmm4, %xmm3
736	jmp	L(loop_ashr_5)
737
738	.p2align 4
739L(nibble_ashr_5):
	/*
	 * Loading the next (%rdi) block would enter a new 4K page.  First check
	 * whether the 11 bytes of the previous block still pending in %xmm3
	 * (bytes 5..15) already end the comparison.
	 */
740	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
741	pmovmskb %xmm0, %edx
742	test	$0xffe0, %edx
743	jnz	L(ashr_5_exittail)
744
745#ifdef USE_AS_STRNCMP
	/*
	 * 11 bytes are pending, so a remaining count of up to 11 is covered by
	 * %xmm3 alone and must not trigger the load from the next page, which
	 * may be unmapped.
	 */
746	cmp	$11, %r11
747	jbe	L(ashr_5_exittail)
748#endif
749
	/* the strings continue: clear the null mask and restart the page counter */
750	pxor	%xmm0, %xmm0
751	sub	$0x1000, %r10
752	jmp	L(gobble_ashr_5)
753
754	.p2align 4
755L(ashr_5_exittail):
	/*
	 * Shift the null mask and the pending bytes down by 5 so that they
	 * start at byte 0, facing the (%rsi) block; the final comparison is
	 * left to L(aftertail).
	 */
756	movdqa	(%rsi, %rcx), %xmm1
757	psrldq	$5, %xmm0
758	psrldq	$5, %xmm3
759	jmp	L(aftertail)
760
761/*
762 * The following cases will be handled by ashr_6
763 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset      corresponding case
764 *        n(10~15)          n - 10      	  5(15 +(n-10) - n)         ashr_6
765 */
766	.p2align 4
767L(ashr_6):
	/*
	 * First step: pslldq $10 moves the leading bytes of the (%rdi) block
	 * (at most 6) to the top of %xmm2 so that they face the tail of the
	 * (%rsi) block; shr %cl discards the mask bits below the start offset
	 * held in %cl.
	 */
768	pxor	%xmm0, %xmm0
769	movdqa	(%rdi), %xmm2
770	movdqa	(%rsi), %xmm1
771	pcmpeqb	%xmm1, %xmm0
772	pslldq	$10, %xmm2
773	pcmpeqb	%xmm1, %xmm2
774	psubb	%xmm0, %xmm2
775	pmovmskb %xmm2, %r9d
776	shr	%cl, %edx
777	shr	%cl, %r9d
778	sub	%r9d, %edx
779	jnz	L(less32bytes)
	/* keep the (%rdi) block: its upper 10 bytes are spliced in by palignr */
780	movdqa	(%rdi), %xmm3
781
782	UPDATE_STRNCMP_COUNTER
783
784	pxor	%xmm0, %xmm0
785	mov	$16, %rcx	/* index for loads */
786	mov	$6, %r9d	/* byte position left over from less32bytes case */
787	/*
788	 * Setup %r10 value allows us to detect crossing a page boundary.
789	 * When %r10 goes positive we have crossed a page boundary and
790	 * need to do a nibble.
791	 */
792	lea	6(%rdi), %r10
793	and	$0xfff, %r10	/* offset into 4K page */
794	sub	$0x1000, %r10	/* subtract 4K pagesize */
795
796	.p2align 4
797L(loop_ashr_6):
	/*
	 * Unrolled twice.  %xmm3 carries the previous (%rdi) block and %xmm4
	 * the one just loaded.  %xmm0 is all zero at the start of every step
	 * (a step that sees a null byte leaves through L(exit)), so the null
	 * test needs no pxor inside the loop.
	 */
798	add	$16, %r10
799	jg	L(nibble_ashr_6)
800
801L(gobble_ashr_6):
802	movdqa	(%rsi, %rcx), %xmm1
803	movdqa	(%rdi, %rcx), %xmm2
804	movdqa	%xmm2, %xmm4
805
806	palignr $6, %xmm3, %xmm2        /* merge into one 16byte value */
807
808	pcmpeqb	%xmm1, %xmm0
809	pcmpeqb	%xmm2, %xmm1
810	psubb	%xmm0, %xmm1
811	pmovmskb %xmm1, %edx
812	sub	$0xffff, %edx
813	jnz	L(exit)
814
815#ifdef USE_AS_STRNCMP
	/* 16 more bytes matched: charge them to the strncmp budget */
816	sub	$16, %r11
817	jbe	L(strcmp_exitz)
818#endif
819
820	add	$16, %rcx
821	movdqa	%xmm4, %xmm3
822
823	add	$16, %r10
824	jg	L(nibble_ashr_6)	/* cross page boundary */
825
826	movdqa	(%rsi, %rcx), %xmm1
827	movdqa	(%rdi, %rcx), %xmm2
828	movdqa	%xmm2, %xmm4
829
830	palignr $6, %xmm3, %xmm2        /* merge into one 16byte value */
831
832	pcmpeqb	%xmm1, %xmm0
833	pcmpeqb	%xmm2, %xmm1
834	psubb	%xmm0, %xmm1
835	pmovmskb %xmm1, %edx
836	sub	$0xffff, %edx
837	jnz	L(exit)
838
839#ifdef USE_AS_STRNCMP
840	sub	$16, %r11
841	jbe	L(strcmp_exitz)
842#endif
843
844	add	$16, %rcx
845	movdqa	%xmm4, %xmm3
846	jmp	L(loop_ashr_6)
847
848	.p2align 4
849L(nibble_ashr_6):
	/*
	 * Loading the next (%rdi) block would enter a new 4K page.  First check
	 * whether the 10 bytes of the previous block still pending in %xmm3
	 * (bytes 6..15) already end the comparison.
	 */
850	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
851	pmovmskb %xmm0, %edx
852	test	$0xffc0, %edx
853	jnz	L(ashr_6_exittail)
854
855#ifdef USE_AS_STRNCMP
	/*
	 * 10 bytes are pending, so a remaining count of up to 10 is covered by
	 * %xmm3 alone and must not trigger the load from the next page, which
	 * may be unmapped.
	 */
856	cmp	$10, %r11
857	jbe	L(ashr_6_exittail)
858#endif
859
	/* the strings continue: clear the null mask and restart the page counter */
860	pxor	%xmm0, %xmm0
861	sub	$0x1000, %r10
862	jmp	L(gobble_ashr_6)
863
864	.p2align 4
865L(ashr_6_exittail):
	/*
	 * Shift the null mask and the pending bytes down by 6 so that they
	 * start at byte 0, facing the (%rsi) block; the final comparison is
	 * left to L(aftertail).
	 */
866	movdqa	(%rsi, %rcx), %xmm1
867	psrldq	$6, %xmm0
868	psrldq	$6, %xmm3
869	jmp	L(aftertail)
870
871/*
872 * The following cases will be handled by ashr_7
873 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset      corresponding case
874 *        n(9~15)          n - 9      	        6(15 +(n - 9) - n)         ashr_7
875 */
876	.p2align 4
877L(ashr_7):
	/*
	 * First step: pslldq $9 moves the leading bytes of the (%rdi) block
	 * (at most 7) to the top of %xmm2 so that they face the tail of the
	 * (%rsi) block; shr %cl discards the mask bits below the start offset
	 * held in %cl.
	 */
878	pxor	%xmm0, %xmm0
879	movdqa	(%rdi), %xmm2
880	movdqa	(%rsi), %xmm1
881	pcmpeqb	%xmm1, %xmm0
882	pslldq	$9, %xmm2
883	pcmpeqb	%xmm1, %xmm2
884	psubb	%xmm0, %xmm2
885	pmovmskb %xmm2, %r9d
886	shr	%cl, %edx
887	shr	%cl, %r9d
888	sub	%r9d, %edx
889	jnz	L(less32bytes)
	/* keep the (%rdi) block: its upper 9 bytes are spliced in by palignr */
890	movdqa	(%rdi), %xmm3
891
892	UPDATE_STRNCMP_COUNTER
893
894	pxor	%xmm0, %xmm0
895	mov	$16, %rcx	/* index for loads */
896	mov	$7, %r9d	/* byte position left over from less32bytes case */
897	/*
898	 * Setup %r10 value allows us to detect crossing a page boundary.
899	 * When %r10 goes positive we have crossed a page boundary and
900	 * need to do a nibble.
901	 */
902	lea	7(%rdi), %r10
903	and	$0xfff, %r10	/* offset into 4K page */
904	sub	$0x1000, %r10	/* subtract 4K pagesize */
905
906	.p2align 4
907L(loop_ashr_7):
	/*
	 * Unrolled twice.  %xmm3 carries the previous (%rdi) block and %xmm4
	 * the one just loaded.  %xmm0 is all zero at the start of every step
	 * (a step that sees a null byte leaves through L(exit)), so the null
	 * test needs no pxor inside the loop.
	 */
908	add	$16, %r10
909	jg	L(nibble_ashr_7)
910
911L(gobble_ashr_7):
912	movdqa	(%rsi, %rcx), %xmm1
913	movdqa	(%rdi, %rcx), %xmm2
914	movdqa	%xmm2, %xmm4
915
916	palignr $7, %xmm3, %xmm2        /* merge into one 16byte value */
917
918	pcmpeqb	%xmm1, %xmm0
919	pcmpeqb	%xmm2, %xmm1
920	psubb	%xmm0, %xmm1
921	pmovmskb %xmm1, %edx
922	sub	$0xffff, %edx
923	jnz	L(exit)
924
925#ifdef USE_AS_STRNCMP
	/* 16 more bytes matched: charge them to the strncmp budget */
926	sub	$16, %r11
927	jbe	L(strcmp_exitz)
928#endif
929
930	add	$16, %rcx
931	movdqa	%xmm4, %xmm3
932
933	add	$16, %r10
934	jg	L(nibble_ashr_7)	/* cross page boundary */
935
936	movdqa	(%rsi, %rcx), %xmm1
937	movdqa	(%rdi, %rcx), %xmm2
938	movdqa	%xmm2, %xmm4
939
940	palignr $7, %xmm3, %xmm2        /* merge into one 16byte value */
941
942	pcmpeqb	%xmm1, %xmm0
943	pcmpeqb	%xmm2, %xmm1
944	psubb	%xmm0, %xmm1
945	pmovmskb %xmm1, %edx
946	sub	$0xffff, %edx
947	jnz	L(exit)
948
949#ifdef USE_AS_STRNCMP
950	sub	$16, %r11
951	jbe	L(strcmp_exitz)
952#endif
953
954	add	$16, %rcx
955	movdqa	%xmm4, %xmm3
956	jmp	L(loop_ashr_7)
957
958	.p2align 4
959L(nibble_ashr_7):
	/*
	 * Loading the next (%rdi) block would enter a new 4K page.  First check
	 * whether the 9 bytes of the previous block still pending in %xmm3
	 * (bytes 7..15) already end the comparison.
	 */
960	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
961	pmovmskb %xmm0, %edx
962	test	$0xff80, %edx
963	jnz	L(ashr_7_exittail)
964
965#ifdef USE_AS_STRNCMP
	/*
	 * 9 bytes are pending, so a remaining count of up to 9 is covered by
	 * %xmm3 alone and must not trigger the load from the next page, which
	 * may be unmapped.
	 */
966	cmp	$9, %r11
967	jbe	L(ashr_7_exittail)
968#endif
969
	/* the strings continue: clear the null mask and restart the page counter */
970	pxor	%xmm0, %xmm0
971	sub	$0x1000, %r10
972	jmp	L(gobble_ashr_7)
973
974	.p2align 4
975L(ashr_7_exittail):
	/*
	 * Shift the null mask and the pending bytes down by 7 so that they
	 * start at byte 0, facing the (%rsi) block; the final comparison is
	 * left to L(aftertail).
	 */
976	movdqa	(%rsi, %rcx), %xmm1
977	psrldq	$7, %xmm0
978	psrldq	$7, %xmm3
979	jmp	L(aftertail)
980
981/*
982 *  The following cases will be handled by ashr_8
983 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
984 *        n(8~15)          n - 8      	        7(15 +(n - 8) - n)         ashr_8
985 */
986	.p2align 4
987L(ashr_8):
	/*
	 * First step: pslldq $8 moves the leading bytes of the (%rdi) block
	 * (at most 8) to the top of %xmm2 so that they face the tail of the
	 * (%rsi) block; shr %cl discards the mask bits below the start offset
	 * held in %cl.
	 */
988	pxor	%xmm0, %xmm0
989	movdqa	(%rdi), %xmm2
990	movdqa	(%rsi), %xmm1
991	pcmpeqb	%xmm1, %xmm0
992	pslldq	$8, %xmm2
993	pcmpeqb	%xmm1, %xmm2
994	psubb	%xmm0, %xmm2
995	pmovmskb %xmm2, %r9d
996	shr	%cl, %edx
997	shr	%cl, %r9d
998	sub	%r9d, %edx
999	jnz	L(less32bytes)
	/* keep the (%rdi) block: its upper 8 bytes are spliced in by palignr */
1000	movdqa	(%rdi), %xmm3
1001
1002	UPDATE_STRNCMP_COUNTER
1003
1004	pxor	%xmm0, %xmm0
1005	mov	$16, %rcx	/* index for loads */
1006	mov	$8, %r9d	/* byte position left over from less32bytes case */
1007	/*
1008	 * Setup %r10 value allows us to detect crossing a page boundary.
1009	 * When %r10 goes positive we have crossed a page boundary and
1010	 * need to do a nibble.
1011	 */
1012	lea	8(%rdi), %r10
1013	and	$0xfff, %r10	/* offset into 4K page */
1014	sub	$0x1000, %r10	/* subtract 4K pagesize */
1015
1016	.p2align 4
1017L(loop_ashr_8):
	/*
	 * Unrolled twice.  %xmm3 carries the previous (%rdi) block and %xmm4
	 * the one just loaded.  %xmm0 is all zero at the start of every step
	 * (a step that sees a null byte leaves through L(exit)), so the null
	 * test needs no pxor inside the loop.
	 */
1018	add	$16, %r10
1019	jg	L(nibble_ashr_8)
1020
1021L(gobble_ashr_8):
1022	movdqa	(%rsi, %rcx), %xmm1
1023	movdqa	(%rdi, %rcx), %xmm2
1024	movdqa	%xmm2, %xmm4
1025
1026	palignr $8, %xmm3, %xmm2        /* merge into one 16byte value */
1027
1028	pcmpeqb	%xmm1, %xmm0
1029	pcmpeqb	%xmm2, %xmm1
1030	psubb	%xmm0, %xmm1
1031	pmovmskb %xmm1, %edx
1032	sub	$0xffff, %edx
1033	jnz	L(exit)
1034
1035#ifdef USE_AS_STRNCMP
	/* 16 more bytes matched: charge them to the strncmp budget */
1036	sub	$16, %r11
1037	jbe	L(strcmp_exitz)
1038#endif
1039
1040	add	$16, %rcx
1041	movdqa	%xmm4, %xmm3
1042
1043	add	$16, %r10
1044	jg	L(nibble_ashr_8)	/* cross page boundary */
1045
1046	movdqa	(%rsi, %rcx), %xmm1
1047	movdqa	(%rdi, %rcx), %xmm2
1048	movdqa	%xmm2, %xmm4
1049
1050	palignr $8, %xmm3, %xmm2        /* merge into one 16byte value */
1051
1052	pcmpeqb	%xmm1, %xmm0
1053	pcmpeqb	%xmm2, %xmm1
1054	psubb	%xmm0, %xmm1
1055	pmovmskb %xmm1, %edx
1056	sub	$0xffff, %edx
1057	jnz	L(exit)
1058
1059#ifdef USE_AS_STRNCMP
1060	sub	$16, %r11
1061	jbe	L(strcmp_exitz)
1062#endif
1063
1064	add	$16, %rcx
1065	movdqa	%xmm4, %xmm3
1066	jmp	L(loop_ashr_8)
1067
1068	.p2align 4
1069L(nibble_ashr_8):
	/*
	 * Loading the next (%rdi) block would enter a new 4K page.  First check
	 * whether the 8 bytes of the previous block still pending in %xmm3
	 * (bytes 8..15) already end the comparison.
	 */
1070	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
1071	pmovmskb %xmm0, %edx
1072	test	$0xff00, %edx
1073	jnz	L(ashr_8_exittail)
1074
1075#ifdef USE_AS_STRNCMP
	/*
	 * 8 bytes are pending, so a remaining count of up to 8 is covered by
	 * %xmm3 alone and must not trigger the load from the next page, which
	 * may be unmapped.
	 */
1076	cmp	$8, %r11
1077	jbe	L(ashr_8_exittail)
1078#endif
1079
	/* the strings continue: clear the null mask and restart the page counter */
1080	pxor	%xmm0, %xmm0
1081	sub	$0x1000, %r10
1082	jmp	L(gobble_ashr_8)
1083
1084	.p2align 4
1085L(ashr_8_exittail):
	/*
	 * Shift the null mask and the pending bytes down by 8 so that they
	 * start at byte 0, facing the (%rsi) block; the final comparison is
	 * left to L(aftertail).
	 */
1086	movdqa	(%rsi, %rcx), %xmm1
1087	psrldq	$8, %xmm0
1088	psrldq	$8, %xmm3
1089	jmp	L(aftertail)
1090
1091/*
1092 *  The following cases will be handled by ashr_9
1093 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
1094 *        n(7~15)          n - 7      	        8(15 +(n - 7) - n)         ashr_9
1095 */
1096	.p2align 4
1097L(ashr_9):
	/*
	 * First step: pslldq $7 moves the leading bytes of the (%rdi) block
	 * (at most 9) to the top of %xmm2 so that they face the tail of the
	 * (%rsi) block; shr %cl discards the mask bits below the start offset
	 * held in %cl.
	 */
1098	pxor	%xmm0, %xmm0
1099	movdqa	(%rdi), %xmm2
1100	movdqa	(%rsi), %xmm1
1101	pcmpeqb	%xmm1, %xmm0
1102	pslldq	$7, %xmm2
1103	pcmpeqb	%xmm1, %xmm2
1104	psubb	%xmm0, %xmm2
1105	pmovmskb %xmm2, %r9d
1106	shr	%cl, %edx
1107	shr	%cl, %r9d
1108	sub	%r9d, %edx
1109	jnz	L(less32bytes)
	/* keep the (%rdi) block: its upper 7 bytes are spliced in by palignr */
1110	movdqa	(%rdi), %xmm3
1111
1112	UPDATE_STRNCMP_COUNTER
1113
1114	pxor	%xmm0, %xmm0
1115	mov	$16, %rcx	/* index for loads */
1116	mov	$9, %r9d	/* byte position left over from less32bytes case */
1117	/*
1118	 * Setup %r10 value allows us to detect crossing a page boundary.
1119	 * When %r10 goes positive we have crossed a page boundary and
1120	 * need to do a nibble.
1121	 */
1122	lea	9(%rdi), %r10
1123	and	$0xfff, %r10	/* offset into 4K page */
1124	sub	$0x1000, %r10	/* subtract 4K pagesize */
1125
1126	.p2align 4
1127L(loop_ashr_9):
	/*
	 * Unrolled twice.  %xmm3 carries the previous (%rdi) block and %xmm4
	 * the one just loaded.  %xmm0 is all zero at the start of every step
	 * (a step that sees a null byte leaves through L(exit)), so the null
	 * test needs no pxor inside the loop.
	 */
1128	add	$16, %r10
1129	jg	L(nibble_ashr_9)
1130
1131L(gobble_ashr_9):
1132	movdqa	(%rsi, %rcx), %xmm1
1133	movdqa	(%rdi, %rcx), %xmm2
1134	movdqa	%xmm2, %xmm4
1135
1136	palignr $9, %xmm3, %xmm2        /* merge into one 16byte value */
1137
1138	pcmpeqb	%xmm1, %xmm0
1139	pcmpeqb	%xmm2, %xmm1
1140	psubb	%xmm0, %xmm1
1141	pmovmskb %xmm1, %edx
1142	sub	$0xffff, %edx
1143	jnz	L(exit)
1144
1145#ifdef USE_AS_STRNCMP
	/* 16 more bytes matched: charge them to the strncmp budget */
1146	sub	$16, %r11
1147	jbe	L(strcmp_exitz)
1148#endif
1149
1150	add	$16, %rcx
1151	movdqa	%xmm4, %xmm3
1152
1153	add	$16, %r10
1154	jg	L(nibble_ashr_9)	/* cross page boundary */
1155
1156	movdqa	(%rsi, %rcx), %xmm1
1157	movdqa	(%rdi, %rcx), %xmm2
1158	movdqa	%xmm2, %xmm4
1159
1160	palignr $9, %xmm3, %xmm2        /* merge into one 16byte value */
1161
1162	pcmpeqb	%xmm1, %xmm0
1163	pcmpeqb	%xmm2, %xmm1
1164	psubb	%xmm0, %xmm1
1165	pmovmskb %xmm1, %edx
1166	sub	$0xffff, %edx
1167	jnz	L(exit)
1168
1169#ifdef USE_AS_STRNCMP
1170	sub	$16, %r11
1171	jbe	L(strcmp_exitz)
1172#endif
1173
1174	add	$16, %rcx
1175	movdqa	%xmm4, %xmm3		/* store for next cycle */
1176	jmp	L(loop_ashr_9)
1177
1178	.p2align 4
1179L(nibble_ashr_9):
	/*
	 * Loading the next (%rdi) block would enter a new 4K page.  First check
	 * whether the 7 bytes of the previous block still pending in %xmm3
	 * (bytes 9..15) already end the comparison.
	 */
1180	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
1181	pmovmskb %xmm0, %edx
1182	test	$0xfe00, %edx
1183	jnz	L(ashr_9_exittail)
1184
1185#ifdef USE_AS_STRNCMP
	/*
	 * 7 bytes are pending, so a remaining count of up to 7 is covered by
	 * %xmm3 alone and must not trigger the load from the next page, which
	 * may be unmapped.
	 */
1186	cmp	$7, %r11
1187	jbe	L(ashr_9_exittail)
1188#endif
1189
	/* the strings continue: clear the null mask and restart the page counter */
1190	pxor	%xmm0, %xmm0
1191	sub	$0x1000, %r10
1192	jmp	L(gobble_ashr_9)
1193
1194	.p2align 4
1195L(ashr_9_exittail):
	/*
	 * Shift the null mask and the pending bytes down by 9 so that they
	 * start at byte 0, facing the (%rsi) block; the final comparison is
	 * left to L(aftertail).
	 */
1196	movdqa	(%rsi, %rcx), %xmm1
1197	psrldq	$9, %xmm0
1198	psrldq	$9, %xmm3
1199	jmp	L(aftertail)
1200
1201/*
1202 *  The following cases will be handled by ashr_10
1203 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
1204 *        n(6~15)          n - 6      	        9(15 +(n - 6) - n)         ashr_10
1205 */
1206	.p2align 4
1207L(ashr_10):
	/*
	 * First step: pslldq $6 moves the leading bytes of the (%rdi) block
	 * (at most 10) to the top of %xmm2 so that they face the tail of the
	 * (%rsi) block; shr %cl discards the mask bits below the start offset
	 * held in %cl.
	 */
1208	pxor	%xmm0, %xmm0
1209	movdqa	(%rdi), %xmm2
1210	movdqa	(%rsi), %xmm1
1211	pcmpeqb	%xmm1, %xmm0
1212	pslldq	$6, %xmm2
1213	pcmpeqb	%xmm1, %xmm2
1214	psubb	%xmm0, %xmm2
1215	pmovmskb %xmm2, %r9d
1216	shr	%cl, %edx
1217	shr	%cl, %r9d
1218	sub	%r9d, %edx
1219	jnz	L(less32bytes)
	/* keep the (%rdi) block: its upper 6 bytes are spliced in by palignr */
1220	movdqa	(%rdi), %xmm3
1221
1222	UPDATE_STRNCMP_COUNTER
1223
1224	pxor	%xmm0, %xmm0
1225	mov	$16, %rcx	/* index for loads */
1226	mov	$10, %r9d	/* byte position left over from less32bytes case */
1227	/*
1228	 * Setup %r10 value allows us to detect crossing a page boundary.
1229	 * When %r10 goes positive we have crossed a page boundary and
1230	 * need to do a nibble.
1231	 */
1232	lea	10(%rdi), %r10
1233	and	$0xfff, %r10	/* offset into 4K page */
1234	sub	$0x1000, %r10	/* subtract 4K pagesize */
1235
1236	.p2align 4
1237L(loop_ashr_10):
	/*
	 * Unrolled twice.  %xmm3 carries the previous (%rdi) block and %xmm4
	 * the one just loaded.  %xmm0 is all zero at the start of every step
	 * (a step that sees a null byte leaves through L(exit)), so the null
	 * test needs no pxor inside the loop.
	 */
1238	add	$16, %r10
1239	jg	L(nibble_ashr_10)
1240
1241L(gobble_ashr_10):
1242	movdqa	(%rsi, %rcx), %xmm1
1243	movdqa	(%rdi, %rcx), %xmm2
1244	movdqa	%xmm2, %xmm4
1245
1246	palignr $10, %xmm3, %xmm2        /* merge into one 16byte value */
1247
1248	pcmpeqb	%xmm1, %xmm0
1249	pcmpeqb	%xmm2, %xmm1
1250	psubb	%xmm0, %xmm1
1251	pmovmskb %xmm1, %edx
1252	sub	$0xffff, %edx
1253	jnz	L(exit)
1254
1255#ifdef USE_AS_STRNCMP
	/* 16 more bytes matched: charge them to the strncmp budget */
1256	sub	$16, %r11
1257	jbe	L(strcmp_exitz)
1258#endif
1259
1260	add	$16, %rcx
1261	movdqa	%xmm4, %xmm3
1262
1263	add	$16, %r10
1264	jg	L(nibble_ashr_10)	/* cross page boundary */
1265
1266	movdqa	(%rsi, %rcx), %xmm1
1267	movdqa	(%rdi, %rcx), %xmm2
1268	movdqa	%xmm2, %xmm4
1269
1270	palignr $10, %xmm3, %xmm2        /* merge into one 16byte value */
1271
1272	pcmpeqb	%xmm1, %xmm0
1273	pcmpeqb	%xmm2, %xmm1
1274	psubb	%xmm0, %xmm1
1275	pmovmskb %xmm1, %edx
1276	sub	$0xffff, %edx
1277	jnz	L(exit)
1278
1279#ifdef USE_AS_STRNCMP
1280	sub	$16, %r11
1281	jbe	L(strcmp_exitz)
1282#endif
1283
1284	add	$16, %rcx
1285	movdqa	%xmm4, %xmm3
1286	jmp	L(loop_ashr_10)
1287
1288	.p2align 4
1289L(nibble_ashr_10):
1290	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
1291	pmovmskb %xmm0, %edx
1292	test	$0xfc00, %edx
1293	jnz	L(ashr_10_exittail)
1294
1295#ifdef USE_AS_STRNCMP
1296	cmp	$5, %r11
1297	jbe	L(ashr_10_exittail)
1298#endif
1299
1300	pxor	%xmm0, %xmm0
1301	sub	$0x1000, %r10
1302	jmp	L(gobble_ashr_10)
1303
1304	.p2align 4
1305L(ashr_10_exittail):
1306	movdqa	(%rsi, %rcx), %xmm1
1307	psrldq	$10, %xmm0
1308	psrldq	$10, %xmm3
1309	jmp	L(aftertail)
1310
1311/*
1312 *  The following cases will be handled by ashr_11
1313 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
1314 *        n(5~15)          n - 5      	        10(15 +(n - 5) - n)         ashr_11
1315 */
/*
 * L(ashr_11): handler reached through L(unaligned_table).  It compares
 * the strings 16 bytes at a time; palignr $11 rebuilds each %rdi-side
 * chunk from two consecutive aligned loads so that it lines up with the
 * aligned %rsi-side chunk.
 *
 * Registers as used in this block:
 *   %xmm0  null-byte mask; all zero at the start of every loop pass
 *   %xmm3  previous 16 bytes loaded from the %rdi side
 *   %rcx   byte index for the next loads (starts at 16)
 *   %r9d   11, combined with %rcx at L(exit) to form the %rdi offset
 *   %r10   page-crossing counter (explained in the comment further down)
 *   %r11   unsigned count, touched only by USE_AS_STRNCMP code
 *   %edx, %cl  are set by code before this block (not visible here);
 *          presumably %edx = 0xffff and %cl = %rsi misalignment -
 *          TODO confirm against the dispatch code.
 */
1316	.p2align 4
1317L(ashr_11):
1318	pxor	%xmm0, %xmm0
1319	movdqa	(%rdi), %xmm2
1320	movdqa	(%rsi), %xmm1
	/* %xmm0 = 0xff in every byte where %xmm1 holds a null */
1321	pcmpeqb	%xmm1, %xmm0
	/* move the %rdi data up 5 bytes so it lines up with %xmm1 */
1322	pslldq	$5, %xmm2
1323	pcmpeqb	%xmm1, %xmm2
	/* cancel matches on null bytes: mask bit set = equal and non-null */
1324	psubb	%xmm0, %xmm2
1325	pmovmskb %xmm2, %r9d
	/* drop the low %cl bits of both masks, then compare them */
1326	shr	%cl, %edx
1327	shr	%cl, %r9d
1328	sub	%r9d, %edx
	/* any difference: finish in the common tail */
1329	jnz	L(less32bytes)
	/* keep the first %rdi chunk as the "previous" input for palignr */
1330	movdqa	(%rdi), %xmm3
1331
	/*
	 * Under USE_AS_STRNCMP this macro derives the remaining count from
	 * %rcx and %r11, uses %r9 as scratch and may branch to
	 * L(strcmp_exitz).  Its definition for plain strcmp is not visible
	 * in this part of the file.
	 */
1332	UPDATE_STRNCMP_COUNTER
1333
1334	pxor	%xmm0, %xmm0
1335	mov	$16, %rcx	/* index for loads */
1336	mov	$11, %r9d	/* byte position left over from less32bytes case */
1337	/*
1338	 * Setup %r10 value allows us to detect crossing a page boundary.
1339	 * When %r10 goes positive we have crossed a page boundary and
1340	 * need to do a nibble.
1341	 */
1342	lea	11(%rdi), %r10
1343	and	$0xfff, %r10	/* offset into 4K page */
1344	sub	$0x1000, %r10	/* subtract 4K pagesize */
1345
1346	.p2align 4
1347L(loop_ashr_11):
	/* the loop body below is unrolled twice; each copy handles 16 bytes */
1348	add	$16, %r10
1349	jg	L(nibble_ashr_11)
1350
1351L(gobble_ashr_11):
1352	movdqa	(%rsi, %rcx), %xmm1
1353	movdqa	(%rdi, %rcx), %xmm2
	/* unshifted copy, becomes %xmm3 for the next pass */
1354	movdqa	%xmm2, %xmm4
1355
	/* %xmm2 = bytes 11..15 of %xmm3 followed by bytes 0..10 of %xmm2 */
1356	palignr $11, %xmm3, %xmm2        /* merge into one 16byte value */
1357
	/* same mask construction as above: bit set = equal and non-null */
1358	pcmpeqb	%xmm1, %xmm0
1359	pcmpeqb	%xmm2, %xmm1
1360	psubb	%xmm0, %xmm1
1361	pmovmskb %xmm1, %edx
	/*
	 * %edx becomes 0 only if all 16 bytes matched and none was null;
	 * in that case %xmm0 is all zero again for the next pass.
	 */
1362	sub	$0xffff, %edx
1363	jnz	L(exit)
1364
1365#ifdef USE_AS_STRNCMP
	/* 16 more bytes were equal; return 0 once the count is used up */
1366	sub	$16, %r11
1367	jbe	L(strcmp_exitz)
1368#endif
1369
1370	add	$16, %rcx
1371	movdqa	%xmm4, %xmm3
1372
	/* second copy of the loop body */
1373	add	$16, %r10
1374	jg	L(nibble_ashr_11)	/* cross page boundary */
1375
1376	movdqa	(%rsi, %rcx), %xmm1
1377	movdqa	(%rdi, %rcx), %xmm2
1378	movdqa	%xmm2, %xmm4
1379
1380	palignr $11, %xmm3, %xmm2        /* merge into one 16byte value */
1381
1382	pcmpeqb	%xmm1, %xmm0
1383	pcmpeqb	%xmm2, %xmm1
1384	psubb	%xmm0, %xmm1
1385	pmovmskb %xmm1, %edx
1386	sub	$0xffff, %edx
1387	jnz	L(exit)
1388
1389#ifdef USE_AS_STRNCMP
1390	sub	$16, %r11
1391	jbe	L(strcmp_exitz)
1392#endif
1393
1394	add	$16, %rcx
1395	movdqa	%xmm4, %xmm3
1396	jmp	L(loop_ashr_11)
1397
	/*
	 * Reached when %r10 went positive.  Before the gobble path loads
	 * (%rdi, %rcx), look for a null in bytes 11..15 of %xmm3 (mask bits
	 * 0xf800), i.e. the %rdi bytes already loaded but not yet compared.
	 * If one is there, the exittail path finishes without that load.
	 */
1398	.p2align 4
1399L(nibble_ashr_11):
1400	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
1401	pmovmskb %xmm0, %edx
1402	test	$0xf800, %edx
1403	jnz	L(ashr_11_exittail)
1404
1405#ifdef USE_AS_STRNCMP
	/*
	 * With at most 4 left in %r11 the exittail path is taken as well;
	 * presumably the leftover bytes then cover the rest - TODO confirm.
	 */
1406	cmp	$4, %r11
1407	jbe	L(ashr_11_exittail)
1408#endif
1409
	/* no null found: clear the mask, re-arm %r10 for the next 4K, resume */
1410	pxor	%xmm0, %xmm0
1411	sub	$0x1000, %r10
1412	jmp	L(gobble_ashr_11)
1413
	/*
	 * Final compare using only the leftover %rdi bytes: bytes 11..15 of
	 * the null mask and of %xmm3 are shifted down to byte 0 so they line
	 * up with the %rsi chunk in %xmm1, then L(aftertail) builds %edx.
	 */
1414	.p2align 4
1415L(ashr_11_exittail):
1416	movdqa	(%rsi, %rcx), %xmm1
1417	psrldq	$11, %xmm0
1418	psrldq	$11, %xmm3
1419	jmp	L(aftertail)
1420
1421/*
1422 *  The following cases will be handled by ashr_12
1423 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
1424 *        n(4~15)          n - 4      	        11(15 +(n - 4) - n)         ashr_12
1425 */
/*
 * L(ashr_12): handler reached through L(unaligned_table).  It compares
 * the strings 16 bytes at a time; palignr $12 rebuilds each %rdi-side
 * chunk from two consecutive aligned loads so that it lines up with the
 * aligned %rsi-side chunk.
 *
 * Registers as used in this block:
 *   %xmm0  null-byte mask; all zero at the start of every loop pass
 *   %xmm3  previous 16 bytes loaded from the %rdi side
 *   %rcx   byte index for the next loads (starts at 16)
 *   %r9d   12, combined with %rcx at L(exit) to form the %rdi offset
 *   %r10   page-crossing counter (explained in the comment further down)
 *   %r11   unsigned count, touched only by USE_AS_STRNCMP code
 *   %edx, %cl  are set by code before this block (not visible here);
 *          presumably %edx = 0xffff and %cl = %rsi misalignment -
 *          TODO confirm against the dispatch code.
 */
1426	.p2align 4
1427L(ashr_12):
1428	pxor	%xmm0, %xmm0
1429	movdqa	(%rdi), %xmm2
1430	movdqa	(%rsi), %xmm1
	/* %xmm0 = 0xff in every byte where %xmm1 holds a null */
1431	pcmpeqb	%xmm1, %xmm0
	/* move the %rdi data up 4 bytes so it lines up with %xmm1 */
1432	pslldq	$4, %xmm2
1433	pcmpeqb	%xmm1, %xmm2
	/* cancel matches on null bytes: mask bit set = equal and non-null */
1434	psubb	%xmm0, %xmm2
1435	pmovmskb %xmm2, %r9d
	/* drop the low %cl bits of both masks, then compare them */
1436	shr	%cl, %edx
1437	shr	%cl, %r9d
1438	sub	%r9d, %edx
	/* any difference: finish in the common tail */
1439	jnz	L(less32bytes)
	/* keep the first %rdi chunk as the "previous" input for palignr */
1440	movdqa	(%rdi), %xmm3
1441
	/*
	 * Under USE_AS_STRNCMP this macro derives the remaining count from
	 * %rcx and %r11, uses %r9 as scratch and may branch to
	 * L(strcmp_exitz).  Its definition for plain strcmp is not visible
	 * in this part of the file.
	 */
1442	UPDATE_STRNCMP_COUNTER
1443
1444	pxor	%xmm0, %xmm0
1445	mov	$16, %rcx	/* index for loads */
1446	mov	$12, %r9d	/* byte position left over from less32bytes case */
1447	/*
1448	 * Setup %r10 value allows us to detect crossing a page boundary.
1449	 * When %r10 goes positive we have crossed a page boundary and
1450	 * need to do a nibble.
1451	 */
1452	lea	12(%rdi), %r10
1453	and	$0xfff, %r10	/* offset into 4K page */
1454	sub	$0x1000, %r10	/* subtract 4K pagesize */
1455
1456	.p2align 4
1457L(loop_ashr_12):
	/* the loop body below is unrolled twice; each copy handles 16 bytes */
1458	add	$16, %r10
1459	jg	L(nibble_ashr_12)
1460
1461L(gobble_ashr_12):
1462	movdqa	(%rsi, %rcx), %xmm1
1463	movdqa	(%rdi, %rcx), %xmm2
	/* unshifted copy, becomes %xmm3 for the next pass */
1464	movdqa	%xmm2, %xmm4
1465
	/* %xmm2 = bytes 12..15 of %xmm3 followed by bytes 0..11 of %xmm2 */
1466	palignr $12, %xmm3, %xmm2        /* merge into one 16byte value */
1467
	/* same mask construction as above: bit set = equal and non-null */
1468	pcmpeqb	%xmm1, %xmm0
1469	pcmpeqb	%xmm2, %xmm1
1470	psubb	%xmm0, %xmm1
1471	pmovmskb %xmm1, %edx
	/*
	 * %edx becomes 0 only if all 16 bytes matched and none was null;
	 * in that case %xmm0 is all zero again for the next pass.
	 */
1472	sub	$0xffff, %edx
1473	jnz	L(exit)
1474
1475#ifdef USE_AS_STRNCMP
	/* 16 more bytes were equal; return 0 once the count is used up */
1476	sub	$16, %r11
1477	jbe	L(strcmp_exitz)
1478#endif
1479
1480	add	$16, %rcx
1481	movdqa	%xmm4, %xmm3
1482
	/* second copy of the loop body */
1483	add	$16, %r10
1484	jg	L(nibble_ashr_12)	/* cross page boundary */
1485
1486	movdqa	(%rsi, %rcx), %xmm1
1487	movdqa	(%rdi, %rcx), %xmm2
1488	movdqa	%xmm2, %xmm4
1489
1490	palignr $12, %xmm3, %xmm2        /* merge into one 16byte value */
1491
1492	pcmpeqb	%xmm1, %xmm0
1493	pcmpeqb	%xmm2, %xmm1
1494	psubb	%xmm0, %xmm1
1495	pmovmskb %xmm1, %edx
1496	sub	$0xffff, %edx
1497	jnz	L(exit)
1498
1499#ifdef USE_AS_STRNCMP
1500	sub	$16, %r11
1501	jbe	L(strcmp_exitz)
1502#endif
1503
1504	add	$16, %rcx
1505	movdqa	%xmm4, %xmm3
1506	jmp	L(loop_ashr_12)
1507
	/*
	 * Reached when %r10 went positive.  Before the gobble path loads
	 * (%rdi, %rcx), look for a null in bytes 12..15 of %xmm3 (mask bits
	 * 0xf000), i.e. the %rdi bytes already loaded but not yet compared.
	 * If one is there, the exittail path finishes without that load.
	 */
1508	.p2align 4
1509L(nibble_ashr_12):
1510	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
1511	pmovmskb %xmm0, %edx
1512	test	$0xf000, %edx
1513	jnz	L(ashr_12_exittail)
1514
1515#ifdef USE_AS_STRNCMP
	/*
	 * With at most 3 left in %r11 the exittail path is taken as well;
	 * presumably the leftover bytes then cover the rest - TODO confirm.
	 */
1516	cmp	$3, %r11
1517	jbe	L(ashr_12_exittail)
1518#endif
1519
	/* no null found: clear the mask, re-arm %r10 for the next 4K, resume */
1520	pxor	%xmm0, %xmm0
1521	sub	$0x1000, %r10
1522	jmp	L(gobble_ashr_12)
1523
	/*
	 * Final compare using only the leftover %rdi bytes: bytes 12..15 of
	 * the null mask and of %xmm3 are shifted down to byte 0 so they line
	 * up with the %rsi chunk in %xmm1, then L(aftertail) builds %edx.
	 */
1524	.p2align 4
1525L(ashr_12_exittail):
1526	movdqa	(%rsi, %rcx), %xmm1
1527	psrldq	$12, %xmm0
1528	psrldq	$12, %xmm3
1529	jmp	L(aftertail)
1530
1531/*
1532 *  The following cases will be handled by ashr_13
1533 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
1534 *        n(3~15)          n - 3      	        12(15 +(n - 3) - n)         ashr_13
1535 */
/*
 * L(ashr_13): handler reached through L(unaligned_table).  It compares
 * the strings 16 bytes at a time; palignr $13 rebuilds each %rdi-side
 * chunk from two consecutive aligned loads so that it lines up with the
 * aligned %rsi-side chunk.
 *
 * Registers as used in this block:
 *   %xmm0  null-byte mask; all zero at the start of every loop pass
 *   %xmm3  previous 16 bytes loaded from the %rdi side
 *   %rcx   byte index for the next loads (starts at 16)
 *   %r9d   13, combined with %rcx at L(exit) to form the %rdi offset
 *   %r10   page-crossing counter (explained in the comment further down)
 *   %r11   unsigned count, touched only by USE_AS_STRNCMP code
 *   %edx, %cl  are set by code before this block (not visible here);
 *          presumably %edx = 0xffff and %cl = %rsi misalignment -
 *          TODO confirm against the dispatch code.
 */
1536	.p2align 4
1537L(ashr_13):
1538	pxor	%xmm0, %xmm0
1539	movdqa	(%rdi), %xmm2
1540	movdqa	(%rsi), %xmm1
	/* %xmm0 = 0xff in every byte where %xmm1 holds a null */
1541	pcmpeqb	%xmm1, %xmm0
	/* move the %rdi data up 3 bytes so it lines up with %xmm1 */
1542	pslldq	$3, %xmm2
1543	pcmpeqb	%xmm1, %xmm2
	/* cancel matches on null bytes: mask bit set = equal and non-null */
1544	psubb	%xmm0, %xmm2
1545	pmovmskb %xmm2, %r9d
	/* drop the low %cl bits of both masks, then compare them */
1546	shr	%cl, %edx
1547	shr	%cl, %r9d
1548	sub	%r9d, %edx
	/* any difference: finish in the common tail */
1549	jnz	L(less32bytes)
	/* keep the first %rdi chunk as the "previous" input for palignr */
1550	movdqa	(%rdi), %xmm3
1551
	/*
	 * Under USE_AS_STRNCMP this macro derives the remaining count from
	 * %rcx and %r11, uses %r9 as scratch and may branch to
	 * L(strcmp_exitz).  Its definition for plain strcmp is not visible
	 * in this part of the file.
	 */
1552	UPDATE_STRNCMP_COUNTER
1553
1554	pxor	%xmm0, %xmm0
1555	mov	$16, %rcx	/* index for loads */
1556	mov	$13, %r9d	/* byte position left over from less32bytes case */
1557	/*
1558	 * Setup %r10 value allows us to detect crossing a page boundary.
1559	 * When %r10 goes positive we have crossed a page boundary and
1560	 * need to do a nibble.
1561	 */
1562	lea	13(%rdi), %r10
1563	and	$0xfff, %r10	/* offset into 4K page */
1564	sub	$0x1000, %r10	/* subtract 4K pagesize */
1565
1566	.p2align 4
1567L(loop_ashr_13):
	/* the loop body below is unrolled twice; each copy handles 16 bytes */
1568	add	$16, %r10
1569	jg	L(nibble_ashr_13)
1570
1571L(gobble_ashr_13):
1572	movdqa	(%rsi, %rcx), %xmm1
1573	movdqa	(%rdi, %rcx), %xmm2
	/* unshifted copy, becomes %xmm3 for the next pass */
1574	movdqa	%xmm2, %xmm4
1575
	/* %xmm2 = bytes 13..15 of %xmm3 followed by bytes 0..12 of %xmm2 */
1576	palignr $13, %xmm3, %xmm2        /* merge into one 16byte value */
1577
	/* same mask construction as above: bit set = equal and non-null */
1578	pcmpeqb	%xmm1, %xmm0
1579	pcmpeqb	%xmm2, %xmm1
1580	psubb	%xmm0, %xmm1
1581	pmovmskb %xmm1, %edx
	/*
	 * %edx becomes 0 only if all 16 bytes matched and none was null;
	 * in that case %xmm0 is all zero again for the next pass.
	 */
1582	sub	$0xffff, %edx
1583	jnz	L(exit)
1584
1585#ifdef USE_AS_STRNCMP
	/* 16 more bytes were equal; return 0 once the count is used up */
1586	sub	$16, %r11
1587	jbe	L(strcmp_exitz)
1588#endif
1589
1590	add	$16, %rcx
1591	movdqa	%xmm4, %xmm3
1592
	/* second copy of the loop body */
1593	add	$16, %r10
1594	jg	L(nibble_ashr_13)	/* cross page boundary */
1595
1596	movdqa	(%rsi, %rcx), %xmm1
1597	movdqa	(%rdi, %rcx), %xmm2
1598	movdqa	%xmm2, %xmm4
1599
1600	palignr $13, %xmm3, %xmm2        /* merge into one 16byte value */
1601
1602	pcmpeqb	%xmm1, %xmm0
1603	pcmpeqb	%xmm2, %xmm1
1604	psubb	%xmm0, %xmm1
1605	pmovmskb %xmm1, %edx
1606	sub	$0xffff, %edx
1607	jnz	L(exit)
1608
1609#ifdef USE_AS_STRNCMP
1610	sub	$16, %r11
1611	jbe	L(strcmp_exitz)
1612#endif
1613
1614	add	$16, %rcx
1615	movdqa	%xmm4, %xmm3
1616	jmp	L(loop_ashr_13)
1617
	/*
	 * Reached when %r10 went positive.  Before the gobble path loads
	 * (%rdi, %rcx), look for a null in bytes 13..15 of %xmm3 (mask bits
	 * 0xe000), i.e. the %rdi bytes already loaded but not yet compared.
	 * If one is there, the exittail path finishes without that load.
	 */
1618	.p2align 4
1619L(nibble_ashr_13):
1620	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
1621	pmovmskb %xmm0, %edx
1622	test	$0xe000, %edx
1623	jnz	L(ashr_13_exittail)
1624
1625#ifdef USE_AS_STRNCMP
	/*
	 * With at most 2 left in %r11 the exittail path is taken as well;
	 * presumably the leftover bytes then cover the rest - TODO confirm.
	 */
1626	cmp	$2, %r11
1627	jbe	L(ashr_13_exittail)
1628#endif
1629
	/* no null found: clear the mask, re-arm %r10 for the next 4K, resume */
1630	pxor	%xmm0, %xmm0
1631	sub	$0x1000, %r10
1632	jmp	L(gobble_ashr_13)
1633
	/*
	 * Final compare using only the leftover %rdi bytes: bytes 13..15 of
	 * the null mask and of %xmm3 are shifted down to byte 0 so they line
	 * up with the %rsi chunk in %xmm1, then L(aftertail) builds %edx.
	 */
1634	.p2align 4
1635L(ashr_13_exittail):
1636	movdqa	(%rsi, %rcx), %xmm1
1637	psrldq  $13, %xmm0
1638	psrldq  $13, %xmm3
1639	jmp	L(aftertail)
1640
1641/*
1642 *  The following cases will be handled by ashr_14
1643 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
1644 *        n(2~15)          n - 2      	        13(15 +(n - 2) - n)         ashr_14
1645 */
/*
 * L(ashr_14): handler reached through L(unaligned_table).  It compares
 * the strings 16 bytes at a time; palignr $14 rebuilds each %rdi-side
 * chunk from two consecutive aligned loads so that it lines up with the
 * aligned %rsi-side chunk.
 *
 * Registers as used in this block:
 *   %xmm0  null-byte mask; all zero at the start of every loop pass
 *   %xmm3  previous 16 bytes loaded from the %rdi side
 *   %rcx   byte index for the next loads (starts at 16)
 *   %r9d   14, combined with %rcx at L(exit) to form the %rdi offset
 *   %r10   page-crossing counter (explained in the comment further down)
 *   %r11   unsigned count, touched only by USE_AS_STRNCMP code
 *   %edx, %cl  are set by code before this block (not visible here);
 *          presumably %edx = 0xffff and %cl = %rsi misalignment -
 *          TODO confirm against the dispatch code.
 */
1646	.p2align 4
1647L(ashr_14):
1648	pxor	%xmm0, %xmm0
1649	movdqa	(%rdi), %xmm2
1650	movdqa	(%rsi), %xmm1
	/* %xmm0 = 0xff in every byte where %xmm1 holds a null */
1651	pcmpeqb	%xmm1, %xmm0
	/* move the %rdi data up 2 bytes so it lines up with %xmm1 */
1652	pslldq  $2, %xmm2
1653	pcmpeqb	%xmm1, %xmm2
	/* cancel matches on null bytes: mask bit set = equal and non-null */
1654	psubb	%xmm0, %xmm2
1655	pmovmskb %xmm2, %r9d
	/* drop the low %cl bits of both masks, then compare them */
1656	shr	%cl, %edx
1657	shr	%cl, %r9d
1658	sub	%r9d, %edx
	/* any difference: finish in the common tail */
1659	jnz	L(less32bytes)
	/* keep the first %rdi chunk as the "previous" input for palignr */
1660	movdqa	(%rdi), %xmm3
1661
	/*
	 * Under USE_AS_STRNCMP this macro derives the remaining count from
	 * %rcx and %r11, uses %r9 as scratch and may branch to
	 * L(strcmp_exitz).  Its definition for plain strcmp is not visible
	 * in this part of the file.
	 */
1662	UPDATE_STRNCMP_COUNTER
1663
1664	pxor	%xmm0, %xmm0
1665	mov	$16, %rcx	/* index for loads */
1666	mov	$14, %r9d	/* byte position left over from less32bytes case */
1667	/*
1668	 * Setup %r10 value allows us to detect crossing a page boundary.
1669	 * When %r10 goes positive we have crossed a page boundary and
1670	 * need to do a nibble.
1671	 */
1672	lea	14(%rdi), %r10
1673	and	$0xfff, %r10	/* offset into 4K page */
1674	sub	$0x1000, %r10	/* subtract 4K pagesize */
1675
1676	.p2align 4
1677L(loop_ashr_14):
	/* the loop body below is unrolled twice; each copy handles 16 bytes */
1678	add	$16, %r10
1679	jg	L(nibble_ashr_14)
1680
1681L(gobble_ashr_14):
1682	movdqa	(%rsi, %rcx), %xmm1
1683	movdqa	(%rdi, %rcx), %xmm2
	/* unshifted copy, becomes %xmm3 for the next pass */
1684	movdqa	%xmm2, %xmm4
1685
	/* %xmm2 = bytes 14..15 of %xmm3 followed by bytes 0..13 of %xmm2 */
1686	palignr $14, %xmm3, %xmm2        /* merge into one 16byte value */
1687
	/* same mask construction as above: bit set = equal and non-null */
1688	pcmpeqb	%xmm1, %xmm0
1689	pcmpeqb	%xmm2, %xmm1
1690	psubb	%xmm0, %xmm1
1691	pmovmskb %xmm1, %edx
	/*
	 * %edx becomes 0 only if all 16 bytes matched and none was null;
	 * in that case %xmm0 is all zero again for the next pass.
	 */
1692	sub	$0xffff, %edx
1693	jnz	L(exit)
1694
1695#ifdef USE_AS_STRNCMP
	/* 16 more bytes were equal; return 0 once the count is used up */
1696	sub	$16, %r11
1697	jbe	L(strcmp_exitz)
1698#endif
1699
1700	add	$16, %rcx
1701	movdqa	%xmm4, %xmm3
1702
	/* second copy of the loop body */
1703	add	$16, %r10
1704	jg	L(nibble_ashr_14)	/* cross page boundary */
1705
1706	movdqa	(%rsi, %rcx), %xmm1
1707	movdqa	(%rdi, %rcx), %xmm2
1708	movdqa	%xmm2, %xmm4
1709
1710	palignr $14, %xmm3, %xmm2        /* merge into one 16byte value */
1711
1712	pcmpeqb	%xmm1, %xmm0
1713	pcmpeqb	%xmm2, %xmm1
1714	psubb	%xmm0, %xmm1
1715	pmovmskb %xmm1, %edx
1716	sub	$0xffff, %edx
1717	jnz	L(exit)
1718
1719#ifdef USE_AS_STRNCMP
1720	sub	$16, %r11
1721	jbe	L(strcmp_exitz)
1722#endif
1723
1724	add	$16, %rcx
1725	movdqa	%xmm4, %xmm3
1726	jmp	L(loop_ashr_14)
1727
	/*
	 * Reached when %r10 went positive.  Before the gobble path loads
	 * (%rdi, %rcx), look for a null in bytes 14..15 of %xmm3 (mask bits
	 * 0xc000), i.e. the %rdi bytes already loaded but not yet compared.
	 * If one is there, the exittail path finishes without that load.
	 */
1728	.p2align 4
1729L(nibble_ashr_14):
1730	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
1731	pmovmskb %xmm0, %edx
1732	test	$0xc000, %edx
1733	jnz	L(ashr_14_exittail)
1734
1735#ifdef USE_AS_STRNCMP
	/*
	 * With at most 1 left in %r11 the exittail path is taken as well;
	 * presumably the leftover bytes then cover the rest - TODO confirm.
	 */
1736	cmp	$1, %r11
1737	jbe	L(ashr_14_exittail)
1738#endif
1739
	/* no null found: clear the mask, re-arm %r10 for the next 4K, resume */
1740	pxor	%xmm0, %xmm0
1741	sub	$0x1000, %r10
1742	jmp	L(gobble_ashr_14)
1743
	/*
	 * Final compare using only the leftover %rdi bytes: bytes 14..15 of
	 * the null mask and of %xmm3 are shifted down to byte 0 so they line
	 * up with the %rsi chunk in %xmm1, then L(aftertail) builds %edx.
	 */
1744	.p2align 4
1745L(ashr_14_exittail):
1746	movdqa	(%rsi, %rcx), %xmm1
1747	psrldq	$14, %xmm0
1748	psrldq	$14, %xmm3
1749	jmp	L(aftertail)
1750
1751/*
1752 *  The following cases will be handled by ashr_15
1753 *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
1754 *        n(1~15)          n - 1      	        14(15 +(n - 1) - n)         ashr_15
1755 */
/*
 * L(ashr_15): handler reached through L(unaligned_table).  It compares
 * the strings 16 bytes at a time; palignr $15 rebuilds each %rdi-side
 * chunk from two consecutive aligned loads so that it lines up with the
 * aligned %rsi-side chunk.
 *
 * Registers as used in this block:
 *   %xmm0  null-byte mask; all zero at the start of every loop pass
 *   %xmm3  previous 16 bytes loaded from the %rdi side
 *   %rcx   byte index for the next loads (starts at 16)
 *   %r9d   15, combined with %rcx at L(exit) to form the %rdi offset
 *   %r10   page-crossing counter (explained in the comment further down)
 *   %r11   unsigned count, touched only by USE_AS_STRNCMP code
 *   %edx, %cl  are set by code before this block (not visible here);
 *          presumably %edx = 0xffff and %cl = %rsi misalignment -
 *          TODO confirm against the dispatch code.
 */
1756	.p2align 4
1757L(ashr_15):
1758	pxor	%xmm0, %xmm0
1759	movdqa	(%rdi), %xmm2
1760	movdqa	(%rsi), %xmm1
	/* %xmm0 = 0xff in every byte where %xmm1 holds a null */
1761	pcmpeqb	%xmm1, %xmm0
	/* move the %rdi data up 1 byte so it lines up with %xmm1 */
1762	pslldq	$1, %xmm2
1763	pcmpeqb	%xmm1, %xmm2
	/* cancel matches on null bytes: mask bit set = equal and non-null */
1764	psubb	%xmm0, %xmm2
1765	pmovmskb %xmm2, %r9d
	/* drop the low %cl bits of both masks, then compare them */
1766	shr	%cl, %edx
1767	shr	%cl, %r9d
1768	sub	%r9d, %edx
	/* any difference: finish in the common tail */
1769	jnz	L(less32bytes)
1770
	/* keep the first %rdi chunk as the "previous" input for palignr */
1771	movdqa	(%rdi), %xmm3
1772
	/*
	 * Under USE_AS_STRNCMP this macro derives the remaining count from
	 * %rcx and %r11, uses %r9 as scratch and may branch to
	 * L(strcmp_exitz).  Its definition for plain strcmp is not visible
	 * in this part of the file.
	 */
1773	UPDATE_STRNCMP_COUNTER
1774
1775	pxor	%xmm0, %xmm0
1776	mov	$16, %rcx	/* index for loads */
1777	mov	$15, %r9d	/* byte position left over from less32bytes case */
1778	/*
1779	 * Setup %r10 value allows us to detect crossing a page boundary.
1780	 * When %r10 goes positive we have crossed a page boundary and
1781	 * need to do a nibble.
1782	 */
1783	lea	15(%rdi), %r10
1784	and	$0xfff, %r10	/* offset into 4K page */
1785
1786	sub	$0x1000, %r10	/* subtract 4K pagesize */
1787
1788	.p2align 4
1789L(loop_ashr_15):
	/* the loop body below is unrolled twice; each copy handles 16 bytes */
1790	add	$16, %r10
1791	jg	L(nibble_ashr_15)
1792
1793L(gobble_ashr_15):
1794	movdqa	(%rsi, %rcx), %xmm1
1795	movdqa	(%rdi, %rcx), %xmm2
	/* unshifted copy, becomes %xmm3 for the next pass */
1796	movdqa	%xmm2, %xmm4
1797
	/* %xmm2 = byte 15 of %xmm3 followed by bytes 0..14 of %xmm2 */
1798	palignr $15, %xmm3, %xmm2        /* merge into one 16byte value */
1799
	/* same mask construction as above: bit set = equal and non-null */
1800	pcmpeqb	%xmm1, %xmm0
1801	pcmpeqb	%xmm2, %xmm1
1802	psubb	%xmm0, %xmm1
1803	pmovmskb %xmm1, %edx
	/*
	 * %edx becomes 0 only if all 16 bytes matched and none was null;
	 * in that case %xmm0 is all zero again for the next pass.
	 */
1804	sub	$0xffff, %edx
1805	jnz	L(exit)
1806
1807#ifdef USE_AS_STRNCMP
	/* 16 more bytes were equal; return 0 once the count is used up */
1808	sub	$16, %r11
1809	jbe	L(strcmp_exitz)
1810#endif
1811
1812	add	$16, %rcx
1813	movdqa	%xmm4, %xmm3
1814
	/* second copy of the loop body */
1815	add	$16, %r10
1816	jg	L(nibble_ashr_15)	/* cross page boundary */
1817
1818	movdqa	(%rsi, %rcx), %xmm1
1819	movdqa	(%rdi, %rcx), %xmm2
1820	movdqa	%xmm2, %xmm4
1821
1822	palignr $15, %xmm3, %xmm2        /* merge into one 16byte value */
1823
1824	pcmpeqb	%xmm1, %xmm0
1825	pcmpeqb	%xmm2, %xmm1
1826	psubb	%xmm0, %xmm1
1827	pmovmskb %xmm1, %edx
1828	sub	$0xffff, %edx
1829	jnz	L(exit)
1830
1831#ifdef USE_AS_STRNCMP
1832	sub	$16, %r11
1833	jbe	L(strcmp_exitz)
1834#endif
1835
1836	add	$16, %rcx
1837	movdqa	%xmm4, %xmm3
1838	jmp	L(loop_ashr_15)
1839
	/*
	 * Reached when %r10 went positive.  Before the gobble path loads
	 * (%rdi, %rcx), look for a null in byte 15 of %xmm3 (mask bit
	 * 0x8000), i.e. the %rdi byte already loaded but not yet compared.
	 * If it is null, the exittail path finishes without that load.
	 */
1840	.p2align 4
1841L(nibble_ashr_15):
1842	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
1843	pmovmskb %xmm0, %edx
1844	test	$0x8000, %edx
1845	jnz	L(ashr_15_exittail)
1846
1847#ifdef USE_AS_STRNCMP
	/* a zero count in %r11 also takes the exittail path */
1848	test	%r11, %r11
1849	je	L(ashr_15_exittail)
1850#endif
1851
	/* no null found: clear the mask, re-arm %r10 for the next 4K, resume */
1852	pxor	%xmm0, %xmm0
1853	sub	$0x1000, %r10
1854	jmp	L(gobble_ashr_15)
1855
	/*
	 * Final compare using only the leftover %rdi byte: byte 15 of the
	 * null mask and of %xmm3 is shifted down to byte 0 so it lines up
	 * with the %rsi chunk in %xmm1.  There is no jump here: execution
	 * falls through into L(aftertail), which follows directly.
	 */
1856	.p2align 4
1857L(ashr_15_exittail):
1858	movdqa	(%rsi, %rcx), %xmm1
1859	psrldq	$15, %xmm3
1860	psrldq	$15, %xmm0
1861
/*
 * Common exit chain.  The labels below fall through into one another:
 *   L(aftertail)   builds %edx from %xmm1/%xmm3/%xmm0 (exittail paths)
 *   L(exit)        computes the %rdi-side offset from %r9 and %rcx
 *   L(less32bytes) turns the offsets in %rax/%rcx into addresses
 *   L(ret) / L(less16bytes)  locate the deciding byte and return
 * On arrival at L(ret), the lowest set bit of %rdx gives the index of
 * the byte pair that decides the result.
 */
1862	.p2align 4
1863L(aftertail):
	/* mask bit set = bytes equal and not null (null mask is in %xmm0) */
1864	pcmpeqb	%xmm3, %xmm1
1865	psubb	%xmm0, %xmm1
1866	pmovmskb %xmm1, %edx
	/* invert so the lowest SET bit marks the first mismatch or null */
1867	not	%edx
1868
	/*
	 * The loops jump here with %edx = mask - 0xffff.  That value has the
	 * same lowest set bit as the inverted 16-bit mask, so the bsf below
	 * finds the same position for both ways of arriving.
	 */
1869	.p2align 4
1870L(exit):
1871	lea	-16(%r9, %rcx), %rax	/* locate the exact offset for rdi */
	/* entered directly with %rax and %rcx already holding the offsets */
1872L(less32bytes):
1873	lea	(%rdi, %rax), %rdi	/* locate the exact address for first operand(rdi) */
1874	lea	(%rsi, %rcx), %rsi	/* locate the exact address for second operand(rsi) */
	/* %r8d is set before this view; nonzero means the operands were swapped */
1875	test	%r8d, %r8d
1876	jz	L(ret)
1877	xchg	%rsi, %rdi		/* recover original order according to flag(%r8d) */
1878
1879	.p2align 4
1880L(ret):
1881L(less16bytes):
1882	bsf	%rdx, %rdx		/* find and store bit index in %rdx */
1883
1884#ifdef USE_AS_STRNCMP
	/* deciding byte lies at or past the remaining count: strings compare equal */
1885	sub	%rdx, %r11
1886	jbe	L(strcmp_exitz)
1887#endif
	/* load both bytes zero-extended so the difference is of unsigned chars */
1888	movzbl	(%rsi, %rdx), %ecx
1889	movzbl	(%rdi, %rdx), %eax
1890
	/* return value: byte at (%rdi) side minus byte at (%rsi) side */
1891	sub	%ecx, %eax
1892	ret
1893
/* Return 0 in %eax: the branch target used when the strings compare equal. */
1894L(strcmp_exitz):
1895	xor	%eax, %eax
1896	ret
1897
/*
 * L(Byte0): return the difference of the first bytes, (%rdi)[0] minus
 * (%rsi)[0], both zero-extended.  The code that branches here is not
 * visible in this part of the file.
 */
1898	.p2align 4
1899L(Byte0):
1900	movzbl	(%rsi), %ecx
1901	movzbl	(%rdi), %eax
1902
1903	sub	%ecx, %eax
1904	ret
/* END is a macro defined elsewhere; presumably it closes the STRCMP symbol - TODO confirm */
1905END (STRCMP)
1906
/*
 * L(unaligned_table): 16 read-only 32-bit entries, each holding the
 * distance from the table itself to one L(ashr_N) handler.  Storing
 * table-relative differences instead of absolute addresses means the
 * entries need no load-time relocation.  Entries 0..14 are L(ashr_1)
 * to L(ashr_15) and entry 15 is L(ashr_0).  The code that indexes this
 * table is not visible in this part of the file.
 */
1907	.section .rodata,"a",@progbits
	/* 8-byte alignment for the table */
1908	.p2align 3
1909L(unaligned_table):
1910	.int	L(ashr_1) - L(unaligned_table)
1911	.int	L(ashr_2) - L(unaligned_table)
1912	.int	L(ashr_3) - L(unaligned_table)
1913	.int	L(ashr_4) - L(unaligned_table)
1914	.int	L(ashr_5) - L(unaligned_table)
1915	.int	L(ashr_6) - L(unaligned_table)
1916	.int	L(ashr_7) - L(unaligned_table)
1917	.int	L(ashr_8) - L(unaligned_table)
1918	.int	L(ashr_9) - L(unaligned_table)
1919	.int	L(ashr_10) - L(unaligned_table)
1920	.int	L(ashr_11) - L(unaligned_table)
1921	.int	L(ashr_12) - L(unaligned_table)
1922	.int	L(ashr_13) - L(unaligned_table)
1923	.int	L(ashr_14) - L(unaligned_table)
1924	.int	L(ashr_15) - L(unaligned_table)
1925	.int	L(ashr_0) - L(unaligned_table)
1926