xref: /titanic_52/usr/src/lib/libc/amd64/gen/strcpy.s (revision 533d3a4910febc9985154b885dbe971e3c21ca04)
17c478bd9Sstevel@tonic-gate/*
2*533d3a49SEdward Gillett * CDDL HEADER START
3*533d3a49SEdward Gillett *
4*533d3a49SEdward Gillett * The contents of this file are subject to the terms of the
5*533d3a49SEdward Gillett * Common Development and Distribution License (the "License").
6*533d3a49SEdward Gillett * You may not use this file except in compliance with the License.
7*533d3a49SEdward Gillett *
8*533d3a49SEdward Gillett * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9*533d3a49SEdward Gillett * or http://www.opensolaris.org/os/licensing.
10*533d3a49SEdward Gillett * See the License for the specific language governing permissions
11*533d3a49SEdward Gillett * and limitations under the License.
12*533d3a49SEdward Gillett *
13*533d3a49SEdward Gillett * When distributing Covered Code, include this CDDL HEADER in each
14*533d3a49SEdward Gillett * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15*533d3a49SEdward Gillett * If applicable, add the following below this CDDL HEADER, with the
16*533d3a49SEdward Gillett * fields enclosed by brackets "[]" replaced with your own identifying
17*533d3a49SEdward Gillett * information: Portions Copyright [yyyy] [name of copyright owner]
18*533d3a49SEdward Gillett *
19*533d3a49SEdward Gillett * CDDL HEADER END
207c478bd9Sstevel@tonic-gate */
217c478bd9Sstevel@tonic-gate
227c478bd9Sstevel@tonic-gate/*
23*533d3a49SEdward Gillett * Copyright (c) 2009, Intel Corporation
247c478bd9Sstevel@tonic-gate * All rights reserved.
257c478bd9Sstevel@tonic-gate */
267c478bd9Sstevel@tonic-gate
27*533d3a49SEdward Gillett/*
28*533d3a49SEdward Gillett *	str[n]cpy - copy [n] chars from second operand into first operand
29*533d3a49SEdward Gillett */
307c478bd9Sstevel@tonic-gate#include "SYS.h"
31*533d3a49SEdward Gillett#include "proc64_id.h"
327c478bd9Sstevel@tonic-gate
337c478bd9Sstevel@tonic-gate#define LABEL(s) .strcpy/**/s
347c478bd9Sstevel@tonic-gate
/*
 * char *strcpy(char *dst, const char *src)
 * char *strncpy(char *dst, const char *src, size_t n)	(USE_AS_STRNCPY)
 *
 * Entry and dispatch code.
 *	In:	%rdi = dst, %rsi = src, %rdx = n (strncpy only)
 *	Out:	%rax = dst (saved from %rdi before %rdi is modified)
 *
 * Register roles set up here for the copy code that follows:
 *	%rsi	src, rounded down to a 16 byte boundary
 *	%rdi	dst, rounded to a 16 byte boundary once the first 16 bytes
 *		have been stored
 *	%r8	strncpy only: count still to be satisfied
 *	%r9	offset of the src data within its aligned 16 byte block
 *	%r10	16 - %r9 on the unaligned paths
 *	%xmm0	zero, compared against src with pcmpeqb to find a null
 *	%edx	pmovmskb result, one bit per null byte found
 */
357c478bd9Sstevel@tonic-gate#ifdef USE_AS_STRNCPY
367c478bd9Sstevel@tonic-gate	ENTRY(strncpy)
37*533d3a49SEdward Gillett	test	%rdx, %rdx			/* n == 0? n is 64 bits wide, so test all of it */
38*533d3a49SEdward Gillett	jz	LABEL(strncpy_exitz)
39*533d3a49SEdward Gillett	mov	%rdx, %r8			/* keep the count in r8, rdx is scratch below */
407c478bd9Sstevel@tonic-gate#else
417c478bd9Sstevel@tonic-gate	ENTRY(strcpy)				/* (char *, const char *) */
42*533d3a49SEdward Gillett	xor	%rdx, %rdx
437c478bd9Sstevel@tonic-gate#endif
44*533d3a49SEdward Gillett	mov	%esi, %ecx
45*533d3a49SEdward Gillett	and	$0xfffffffffffffff0, %rsi	/* force rsi 16 byte align */
46*533d3a49SEdward Gillett	and	$0xf, %rcx			/* rcx = src offset within its 16 byte block */
47*533d3a49SEdward Gillett	mov	%rdi, %rax			/* save destination address for return value */
48*533d3a49SEdward Gillett
49*533d3a49SEdward Gillett
50*533d3a49SEdward Gillett	pxor	%xmm0, %xmm0			/* clear %xmm0 for null char checks */
51*533d3a49SEdward Gillett	pcmpeqb	(%rsi), %xmm0			/* check 16 bytes in src for null */
52*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx
53*533d3a49SEdward Gillett	shr	%cl, %edx			/* drop mask bits for bytes that precede src */
54*533d3a49SEdward Gillett	test	%edx, %edx			/* edx will be 0 if chars are non-null */
55*533d3a49SEdward Gillett	jnz	LABEL(less16bytes)		/* null char found in first 16 bytes examined */
56*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
57*533d3a49SEdward Gillett	/*
58*533d3a49SEdward Gillett	 * Check if the count is satisfied in first 16 bytes examined.
	 * NOTE(review): this is a signed comparison, so a count of 2^63 or
	 * more also takes the less16bytes path; that code is not visible
	 * from here, confirm it copes with such a count.
59*533d3a49SEdward Gillett	 */
60*533d3a49SEdward Gillett	lea	-16(%r8, %rcx), %r11
61*533d3a49SEdward Gillett	cmp	$0, %r11
62*533d3a49SEdward Gillett	jle	LABEL(less16bytes)
63*533d3a49SEdward Gillett#endif
64*533d3a49SEdward Gillett	mov	%rcx, %r9			/* rsi alignment offset */
65*533d3a49SEdward Gillett	or	%edi, %ecx
66*533d3a49SEdward Gillett	and	$0xf, %ecx			/* zero only if src and dest offsets are both 0 */
67*533d3a49SEdward Gillett	lea	-16(%r9), %r10			/* lea leaves the flags from the and untouched */
68*533d3a49SEdward Gillett	jz	LABEL(ashr_0)			/* src and dest are both 16 byte aligned */
69*533d3a49SEdward Gillett
70*533d3a49SEdward Gillett	neg	%r10				/* max src bytes remaining in current dqword */
71*533d3a49SEdward Gillett
72*533d3a49SEdward Gillett	pxor	%xmm0, %xmm0			/* clear %xmm0, may be polluted by unaligned operation */
73*533d3a49SEdward Gillett	pcmpeqb	16(%rsi), %xmm0			/* check next 16 bytes in src for a null */
74*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx
75*533d3a49SEdward Gillett	test	%edx, %edx
76*533d3a49SEdward Gillett	jnz	LABEL(less32bytes)		/* null char found in first 32 bytes examined */
777c478bd9Sstevel@tonic-gate
787c478bd9Sstevel@tonic-gate#ifdef USE_AS_STRNCPY
79*533d3a49SEdward Gillett	/*
80*533d3a49SEdward Gillett	 * If strncpy count <= 16 go to exit case
81*533d3a49SEdward Gillett	 */
82*533d3a49SEdward Gillett	sub	$16, %r8
83*533d3a49SEdward Gillett	jbe	LABEL(less32bytes_strncpy_truncation)
84*533d3a49SEdward Gillett#endif
85*533d3a49SEdward Gillett	/*
86*533d3a49SEdward Gillett	 * At least 16 bytes to copy to destination string. Move them now.
87*533d3a49SEdward Gillett	 * Don't worry about alignment.
88*533d3a49SEdward Gillett	 */
89*533d3a49SEdward Gillett	mov	(%rsi, %r9), %rdx
90*533d3a49SEdward Gillett	mov	%rdx, (%rdi)
91*533d3a49SEdward Gillett	mov	8(%rsi, %r9), %rdx
92*533d3a49SEdward Gillett	mov	%rdx, 8(%rdi)
93*533d3a49SEdward Gillett
94*533d3a49SEdward Gillett	/*
95*533d3a49SEdward Gillett	 * The destination rdi may not be 16 byte aligned. Align it, then
96*533d3a49SEdward Gillett	 * re-calculate rsi and jump to the matching src/dest relative offset case.
97*533d3a49SEdward Gillett	 * 	r9 is offset of rsi
98*533d3a49SEdward Gillett	 * 	rdx becomes offset of rdi
99*533d3a49SEdward Gillett	 */
100*533d3a49SEdward Gillett	and	$0xfffffffffffffff0, %rdi	/* force rdi 16 byte align */
101*533d3a49SEdward Gillett	mov	%rax, %rdx			/* rax contains original rdi */
102*533d3a49SEdward Gillett	xor	%rdi, %rdx			/* same effect as "and $0xf, %rdx" */
103*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
104*533d3a49SEdward Gillett	/*
105*533d3a49SEdward Gillett	 * Will now do 16 byte aligned stores. Stores may overlap some bytes
106*533d3a49SEdward Gillett	 * (ie store twice) if destination was unaligned. Compensate here.
107*533d3a49SEdward Gillett	 */
108*533d3a49SEdward Gillett	add	%rdx, %r8			/* compensate for overlap */
1097c478bd9Sstevel@tonic-gate#endif
1107c478bd9Sstevel@tonic-gate
111*533d3a49SEdward Gillett	add	$16, %rdi			/* next 16 bytes for dest */
1127c478bd9Sstevel@tonic-gate
113*533d3a49SEdward Gillett	/*
114*533d3a49SEdward Gillett	 * align src to 16-byte boundary. Could be up or down depending on
115*533d3a49SEdward Gillett	 * whether src offset - dest offset > 0 (up) or
116*533d3a49SEdward Gillett	 *  src offset - dest offset < 0 (down).
117*533d3a49SEdward Gillett	 */
118*533d3a49SEdward Gillett	sub	%rdx, %r9			/* src offset - dest offset */
1197c478bd9Sstevel@tonic-gate
120*533d3a49SEdward Gillett	lea	16(%r9, %rsi), %rsi		/* src address that pairs with the new rdi */
121*533d3a49SEdward Gillett	mov	%esi, %ecx			/* for new src offset */
122*533d3a49SEdward Gillett	and	$0xfffffffffffffff0, %rsi	/* force rsi 16 byte align */
123*533d3a49SEdward Gillett
124*533d3a49SEdward Gillett	and	$0xf, %ecx			/* new src offset is 0 if rsi/rdi have same alignment */
125*533d3a49SEdward Gillett	jz	LABEL(ashr_0)
126*533d3a49SEdward Gillett
127*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
128*533d3a49SEdward Gillett	xor	%edx, %edx			/* In case unaligned_exit is taken */
129*533d3a49SEdward Gillett#endif
130*533d3a49SEdward Gillett	/*
131*533d3a49SEdward Gillett	 * Jump to case corresponding to source/dest string relative offsets
132*533d3a49SEdward Gillett	 * Index = (16 + (src offset - dest offset)) % 16
	 * The table holds 32-bit offsets relative to its own address.
133*533d3a49SEdward Gillett	 */
134*533d3a49SEdward Gillett	lea	-16(%rcx), %r10
135*533d3a49SEdward Gillett	mov	%rcx, %r9			/* r9 = new src offset */
136*533d3a49SEdward Gillett	neg	%r10				/* max src bytes remaining in current dqword */
137*533d3a49SEdward Gillett	lea	LABEL(unaligned_table)(%rip), %r11
138*533d3a49SEdward Gillett	movslq	(%r11, %rcx, 4), %rcx		/* sign-extended table entry for this index */
139*533d3a49SEdward Gillett	lea	(%r11, %rcx), %rcx		/* table base + entry = handler address */
140*533d3a49SEdward Gillett	jmp	*%rcx
141*533d3a49SEdward Gillett
142*533d3a49SEdward Gillett/*
143*533d3a49SEdward Gillett * ashr_0 handles the following cases:
144*533d3a49SEdward Gillett * 	src alignment offset = dest alignment offset
 *
 * Both rsi and rdi are 16 byte aligned here, so the data is moved with
 * aligned loads and stores and needs no shifting. Both branches into this
 * code are taken with %ecx == 0 and with %xmm0 all zero (the preceding
 * null check found nothing), which the pcmpeqb checks below rely on.
 * After the first block, %rcx is the running byte index of a loop that is
 * unrolled four times. For strncpy, r8 is reduced by 16 before each
 * 16 byte store.
145*533d3a49SEdward Gillett */
146*533d3a49SEdward Gillett	.p2align 5
147*533d3a49SEdward GillettLABEL(ashr_0):
148*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
149*533d3a49SEdward Gillett	sub	$16, %r8
150*533d3a49SEdward Gillett 	jbe	LABEL(strncpy_truncation_aligned)	/* count is used up within this block */
151*533d3a49SEdward Gillett#endif
152*533d3a49SEdward Gillett	movdqa	(%rsi), %xmm1		/* fetch 16 bytes from src string */
153*533d3a49SEdward Gillett	movdqa	%xmm1, (%rdi)		/* store 16 bytes into dest string */
154*533d3a49SEdward Gillett	add	$16, %rsi
155*533d3a49SEdward Gillett	add	$16, %rdi
156*533d3a49SEdward Gillett	pcmpeqb	(%rsi), %xmm0		/* check 16 bytes in src for a null */
157*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx
158*533d3a49SEdward Gillett
159*533d3a49SEdward Gillett	test	%edx, %edx		/* edx will be 0 if chars are non-null */
160*533d3a49SEdward Gillett	jnz	LABEL(aligned_16bytes)	/* exit tail */
161*533d3a49SEdward Gillett
162*533d3a49SEdward GillettLABEL(ashr_0_loop):
	/* unrolled step 1 of 4: store the block already checked, check the next */
163*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
164*533d3a49SEdward Gillett	sub	$16, %r8
165*533d3a49SEdward Gillett	jbe	LABEL(strncpy_truncation_aligned)
166*533d3a49SEdward Gillett#endif
167*533d3a49SEdward Gillett	movdqa	(%rsi, %rcx), %xmm1
168*533d3a49SEdward Gillett	movdqa	%xmm1, (%rdi, %rcx)
169*533d3a49SEdward Gillett	add	$16, %rcx
170*533d3a49SEdward Gillett	pcmpeqb	(%rsi, %rcx), %xmm0
171*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx
172*533d3a49SEdward Gillett	test	%edx, %edx
173*533d3a49SEdward Gillett	jnz	LABEL(aligned_exit)
174*533d3a49SEdward Gillett
	/* unrolled step 2 of 4 */
175*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
176*533d3a49SEdward Gillett	sub	$16, %r8
177*533d3a49SEdward Gillett	jbe	LABEL(strncpy_truncation_aligned)
178*533d3a49SEdward Gillett#endif
179*533d3a49SEdward Gillett	movdqa  (%rsi, %rcx), %xmm1
180*533d3a49SEdward Gillett	movdqa  %xmm1, (%rdi, %rcx)
181*533d3a49SEdward Gillett	add	$16, %rcx
182*533d3a49SEdward Gillett	pcmpeqb  (%rsi, %rcx), %xmm0
183*533d3a49SEdward Gillett	pmovmskb  %xmm0, %edx
184*533d3a49SEdward Gillett	test	%edx, %edx
185*533d3a49SEdward Gillett	jnz	LABEL(aligned_exit)
186*533d3a49SEdward Gillett
	/* unrolled step 3 of 4 */
187*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
188*533d3a49SEdward Gillett	sub	$16, %r8
189*533d3a49SEdward Gillett	jbe	LABEL(strncpy_truncation_aligned)
190*533d3a49SEdward Gillett#endif
191*533d3a49SEdward Gillett	movdqa  (%rsi, %rcx), %xmm1
192*533d3a49SEdward Gillett	movdqa  %xmm1, (%rdi, %rcx)
193*533d3a49SEdward Gillett
194*533d3a49SEdward Gillett	add	$16, %rcx
195*533d3a49SEdward Gillett	pcmpeqb  (%rsi, %rcx), %xmm0
196*533d3a49SEdward Gillett	pmovmskb  %xmm0, %edx
197*533d3a49SEdward Gillett	test	%edx, %edx
198*533d3a49SEdward Gillett	jnz	LABEL(aligned_exit)
199*533d3a49SEdward Gillett
	/* unrolled step 4 of 4: loop again while no null has been seen */
200*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
201*533d3a49SEdward Gillett	sub	$16, %r8
202*533d3a49SEdward Gillett	jbe	LABEL(strncpy_truncation_aligned)
203*533d3a49SEdward Gillett#endif
204*533d3a49SEdward Gillett	movdqa  (%rsi, %rcx), %xmm1
205*533d3a49SEdward Gillett	movdqa  %xmm1, (%rdi, %rcx)
206*533d3a49SEdward Gillett	add	$16, %rcx
207*533d3a49SEdward Gillett	pcmpeqb  (%rsi, %rcx), %xmm0
208*533d3a49SEdward Gillett	pmovmskb  %xmm0, %edx
209*533d3a49SEdward Gillett	test	%edx, %edx
210*533d3a49SEdward Gillett	jz	LABEL(ashr_0_loop)
211*533d3a49SEdward Gillett	jmp	LABEL(aligned_exit)
212*533d3a49SEdward Gillett
213*533d3a49SEdward Gillett
214*533d3a49SEdward Gillett/*
215*533d3a49SEdward Gillett * ashr_15 handles the following cases:
216*533d3a49SEdward Gillett * 	(16 + (src offset - dest offset)) % 16 = 15
217*533d3a49SEdward Gillett *
218*533d3a49SEdward Gillett * The checks made before dispatch already showed that there is no null
219*533d3a49SEdward Gillett * byte from (%r9 + rsi) to the end of the aligned 16 byte block at rsi.
 *
 * Each step checks the next aligned source block, 16(%rsi, %rcx), for a
 * null and then builds one aligned 16 byte store for (%rdi, %rcx) from the
 * last byte of (%rsi, %rcx) followed by the first 15 bytes of
 * 16(%rsi, %rcx). The SSSE3 loop does this with palignr, the SSE2 loop
 * with psrldq/pslldq/por. Both loops are unrolled twice.
 * For strncpy, r8 is the count still to be satisfied and r10 the number
 * of src bytes left in the current block.
 * .memops_method and USE_SSSE3 are not defined in this part of the file
 * (presumably they come from proc64_id.h).
220*533d3a49SEdward Gillett */
221*533d3a49SEdward Gillett	.p2align 4
222*533d3a49SEdward GillettLABEL(ashr_15):
223*533d3a49SEdward Gillett	xor	%ecx, %ecx				/* clear index */
224*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
225*533d3a49SEdward Gillett	cmp	%r10, %r8				/* count <= bytes left in this src block? */
226*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
227*533d3a49SEdward Gillett#endif
228*533d3a49SEdward Gillett	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
229*533d3a49SEdward Gillett	jz	LABEL(ashr_15_use_sse2)
230*533d3a49SEdward Gillett
231*533d3a49SEdward Gillett	.p2align 4
232*533d3a49SEdward GillettLABEL(ashr_15_use_ssse3):
233*533d3a49SEdward Gillett	movdqa	16(%rsi, %rcx), %xmm3			/* next aligned src block */
234*533d3a49SEdward Gillett	pcmpeqb	%xmm3, %xmm0				/* any null in it? */
235*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx
236*533d3a49SEdward Gillett	test	%edx, %edx
237*533d3a49SEdward Gillett	jnz	LABEL(unaligned_exit)
238*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
239*533d3a49SEdward Gillett	sub	$16, %r8
240*533d3a49SEdward Gillett 	jbe	LABEL(strncpy_truncation_unaligned)
241*533d3a49SEdward Gillett#endif
242*533d3a49SEdward Gillett
	/* the two .byte lines are the hand-assembled form of this palignr */
243*533d3a49SEdward Gillett	#palignr $15, (%rsi, %rcx), %xmm3
244*533d3a49SEdward Gillett	.byte	0x66, 0x0F, 0x3A ,0x0F
245*533d3a49SEdward Gillett	.byte	0x1c, 0x0e, 0x0f
246*533d3a49SEdward Gillett
247*533d3a49SEdward Gillett	movdqa	%xmm3, (%rdi, %rcx)
248*533d3a49SEdward Gillett	add	$16, %rcx
249*533d3a49SEdward Gillett
250*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
251*533d3a49SEdward Gillett	cmp	%r10, %r8
252*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
253*533d3a49SEdward Gillett#endif
	/* second copy of the unrolled step */
254*533d3a49SEdward Gillett	movdqa	16(%rsi, %rcx), %xmm3
255*533d3a49SEdward Gillett	pcmpeqb %xmm3, %xmm0
256*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx
257*533d3a49SEdward Gillett	test	%edx, %edx
258*533d3a49SEdward Gillett	jnz	LABEL(unaligned_exit)
259*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
260*533d3a49SEdward Gillett	sub	$16, %r8
261*533d3a49SEdward Gillett 	jbe	LABEL(strncpy_truncation_unaligned)
262*533d3a49SEdward Gillett#endif
263*533d3a49SEdward Gillett
264*533d3a49SEdward Gillett	#palignr $15, (%rsi, %rcx), %xmm3
265*533d3a49SEdward Gillett	.byte	0x66, 0x0F, 0x3A ,0x0F
266*533d3a49SEdward Gillett	.byte	0x1c, 0x0e, 0x0f
267*533d3a49SEdward Gillett
268*533d3a49SEdward Gillett	movdqa	%xmm3, (%rdi, %rcx)
269*533d3a49SEdward Gillett	add	$16, %rcx
270*533d3a49SEdward Gillett
271*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
272*533d3a49SEdward Gillett	cmp	%r10, %r8
273*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
274*533d3a49SEdward Gillett#endif
275*533d3a49SEdward Gillett	jmp	LABEL(ashr_15_use_ssse3)
276*533d3a49SEdward Gillett
277*533d3a49SEdward Gillett	.p2align 4
278*533d3a49SEdward GillettLABEL(ashr_15_use_sse2):
279*533d3a49SEdward Gillett	pcmpeqb 16(%rsi, %rcx), %xmm0			/* any null in the next src block? */
280*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx
281*533d3a49SEdward Gillett	test	%edx, %edx
282*533d3a49SEdward Gillett	jnz	LABEL(unaligned_exit)
283*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
284*533d3a49SEdward Gillett	sub	$16, %r8
285*533d3a49SEdward Gillett 	jbe	LABEL(strncpy_truncation_unaligned)
286*533d3a49SEdward Gillett#endif
287*533d3a49SEdward Gillett
288*533d3a49SEdward Gillett	movdqa	16(%rsi, %rcx), %xmm3
289*533d3a49SEdward Gillett	movdqa	(%rsi, %rcx), %xmm2
290*533d3a49SEdward Gillett
291*533d3a49SEdward Gillett	psrldq	$15, %xmm2				/* last byte of the current block moves to byte 0 */
292*533d3a49SEdward Gillett	pslldq	$1, %xmm3				/* first 15 bytes of the next block move to bytes 1-15 */
293*533d3a49SEdward Gillett	por	%xmm2, %xmm3
294*533d3a49SEdward Gillett
295*533d3a49SEdward Gillett	movdqa	%xmm3, (%rdi, %rcx)
296*533d3a49SEdward Gillett	add	$16, %rcx
297*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
298*533d3a49SEdward Gillett	cmp	%r10, %r8
299*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
300*533d3a49SEdward Gillett#endif
	/* second copy of the unrolled step */
301*533d3a49SEdward Gillett	pcmpeqb 16(%rsi, %rcx), %xmm0
302*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx
303*533d3a49SEdward Gillett	test	%edx, %edx
304*533d3a49SEdward Gillett	jnz	LABEL(unaligned_exit)
305*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
306*533d3a49SEdward Gillett	sub	$16, %r8
307*533d3a49SEdward Gillett 	jbe	LABEL(strncpy_truncation_unaligned)
308*533d3a49SEdward Gillett#endif
309*533d3a49SEdward Gillett
310*533d3a49SEdward Gillett	movdqa	16(%rsi, %rcx), %xmm3
311*533d3a49SEdward Gillett	movdqa	(%rsi, %rcx), %xmm2
312*533d3a49SEdward Gillett
313*533d3a49SEdward Gillett	psrldq	$15, %xmm2
314*533d3a49SEdward Gillett	pslldq	$1, %xmm3
315*533d3a49SEdward Gillett	por	%xmm2, %xmm3
316*533d3a49SEdward Gillett
317*533d3a49SEdward Gillett	movdqa	%xmm3, (%rdi, %rcx)
318*533d3a49SEdward Gillett	add	$16, %rcx
319*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
320*533d3a49SEdward Gillett	cmp	%r10, %r8
321*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
322*533d3a49SEdward Gillett#endif
323*533d3a49SEdward Gillett	jmp	LABEL(ashr_15_use_sse2)
324*533d3a49SEdward Gillett
325*533d3a49SEdward Gillett
326*533d3a49SEdward Gillett/*
327*533d3a49SEdward Gillett * ashr_14 handles the following cases:
328*533d3a49SEdward Gillett * 	(16 + (src offset - dest offset)) % 16 = 14
329*533d3a49SEdward Gillett *
330*533d3a49SEdward Gillett * The checks made before dispatch already showed that there is no null
331*533d3a49SEdward Gillett * byte from (%r9 + rsi) to the end of the aligned 16 byte block at rsi.
 *
 * Each step checks the next aligned source block, 16(%rsi, %rcx), for a
 * null and then builds one aligned 16 byte store for (%rdi, %rcx) from the
 * last 2 bytes of (%rsi, %rcx) followed by the first 14 bytes of
 * 16(%rsi, %rcx). The SSSE3 loop does this with palignr, the SSE2 loop
 * with psrldq/pslldq/por. Both loops are unrolled twice.
 * For strncpy, r8 is the count still to be satisfied and r10 the number
 * of src bytes left in the current block.
332*533d3a49SEdward Gillett */
333*533d3a49SEdward Gillett	.p2align 4
334*533d3a49SEdward GillettLABEL(ashr_14):
335*533d3a49SEdward Gillett	xor	%ecx, %ecx				/* clear index */
336*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
337*533d3a49SEdward Gillett	cmp	%r10, %r8				/* count <= bytes left in this src block? */
338*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
339*533d3a49SEdward Gillett#endif
340*533d3a49SEdward Gillett	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
341*533d3a49SEdward Gillett	jz	LABEL(ashr_14_use_sse2)
342*533d3a49SEdward Gillett
343*533d3a49SEdward Gillett	.p2align 4
344*533d3a49SEdward GillettLABEL(ashr_14_use_ssse3):
345*533d3a49SEdward Gillett	movdqa	16(%rsi, %rcx), %xmm3			/* next aligned src block */
346*533d3a49SEdward Gillett	pcmpeqb	%xmm3, %xmm0				/* any null in it? */
347*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx
348*533d3a49SEdward Gillett	test	%edx, %edx
349*533d3a49SEdward Gillett	jnz	LABEL(unaligned_exit)
350*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
351*533d3a49SEdward Gillett	sub	$16, %r8
352*533d3a49SEdward Gillett 	jbe	LABEL(strncpy_truncation_unaligned)
353*533d3a49SEdward Gillett#endif
354*533d3a49SEdward Gillett
	/* the two .byte lines are the hand-assembled form of this palignr */
355*533d3a49SEdward Gillett	#palignr $14, (%rsi, %rcx), %xmm3
356*533d3a49SEdward Gillett	.byte	0x66, 0x0F, 0x3A ,0x0F
357*533d3a49SEdward Gillett	.byte	0x1c, 0x0e, 0x0e
358*533d3a49SEdward Gillett
359*533d3a49SEdward Gillett	movdqa	%xmm3, (%rdi, %rcx)
360*533d3a49SEdward Gillett	add	$16, %rcx
361*533d3a49SEdward Gillett
362*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
363*533d3a49SEdward Gillett	cmp	%r10, %r8
364*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
365*533d3a49SEdward Gillett#endif
	/* second copy of the unrolled step */
366*533d3a49SEdward Gillett	movdqa	16(%rsi, %rcx), %xmm3
367*533d3a49SEdward Gillett	pcmpeqb %xmm3, %xmm0
368*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx
369*533d3a49SEdward Gillett	test	%edx, %edx
370*533d3a49SEdward Gillett	jnz	LABEL(unaligned_exit)
371*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
372*533d3a49SEdward Gillett	sub	$16, %r8
373*533d3a49SEdward Gillett 	jbe	LABEL(strncpy_truncation_unaligned)
374*533d3a49SEdward Gillett#endif
375*533d3a49SEdward Gillett
376*533d3a49SEdward Gillett	#palignr $14, (%rsi, %rcx), %xmm3
377*533d3a49SEdward Gillett	.byte	0x66, 0x0F, 0x3A ,0x0F
378*533d3a49SEdward Gillett	.byte	0x1c, 0x0e, 0x0e
379*533d3a49SEdward Gillett
380*533d3a49SEdward Gillett	movdqa	%xmm3, (%rdi, %rcx)
381*533d3a49SEdward Gillett	add	$16, %rcx
382*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
383*533d3a49SEdward Gillett	cmp	%r10, %r8
384*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
385*533d3a49SEdward Gillett#endif
386*533d3a49SEdward Gillett	jmp	LABEL(ashr_14_use_ssse3)
387*533d3a49SEdward Gillett
388*533d3a49SEdward Gillett	.p2align 4
389*533d3a49SEdward GillettLABEL(ashr_14_use_sse2):
390*533d3a49SEdward Gillett	pcmpeqb 16(%rsi, %rcx), %xmm0			/* any null in the next src block? */
391*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx
392*533d3a49SEdward Gillett	test	%edx, %edx
393*533d3a49SEdward Gillett	jnz	LABEL(unaligned_exit)
394*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
395*533d3a49SEdward Gillett	sub	$16, %r8
396*533d3a49SEdward Gillett 	jbe	LABEL(strncpy_truncation_unaligned)
397*533d3a49SEdward Gillett#endif
398*533d3a49SEdward Gillett
399*533d3a49SEdward Gillett	movdqa	16(%rsi, %rcx), %xmm3
400*533d3a49SEdward Gillett	movdqa	(%rsi, %rcx), %xmm2
401*533d3a49SEdward Gillett
402*533d3a49SEdward Gillett	psrldq	$14, %xmm2				/* last 2 bytes of the current block move to bytes 0-1 */
403*533d3a49SEdward Gillett	pslldq	$2, %xmm3				/* first 14 bytes of the next block move to bytes 2-15 */
404*533d3a49SEdward Gillett	por	%xmm2, %xmm3
405*533d3a49SEdward Gillett
406*533d3a49SEdward Gillett	movdqa	%xmm3, (%rdi, %rcx)
407*533d3a49SEdward Gillett	add	$16, %rcx
408*533d3a49SEdward Gillett
409*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
410*533d3a49SEdward Gillett	cmp	%r10, %r8
411*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
412*533d3a49SEdward Gillett#endif
	/* second copy of the unrolled step */
413*533d3a49SEdward Gillett	pcmpeqb 16(%rsi, %rcx), %xmm0
414*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx
415*533d3a49SEdward Gillett	test	%edx, %edx
416*533d3a49SEdward Gillett	jnz	LABEL(unaligned_exit)
417*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
418*533d3a49SEdward Gillett	sub	$16, %r8
419*533d3a49SEdward Gillett 	jbe	LABEL(strncpy_truncation_unaligned)
420*533d3a49SEdward Gillett#endif
421*533d3a49SEdward Gillett
422*533d3a49SEdward Gillett	movdqa	16(%rsi, %rcx), %xmm3
423*533d3a49SEdward Gillett	movdqa	(%rsi, %rcx), %xmm2
424*533d3a49SEdward Gillett
425*533d3a49SEdward Gillett	psrldq	$14, %xmm2
426*533d3a49SEdward Gillett	pslldq	$2, %xmm3
427*533d3a49SEdward Gillett	por	%xmm2, %xmm3
428*533d3a49SEdward Gillett
429*533d3a49SEdward Gillett	movdqa	%xmm3, (%rdi, %rcx)
430*533d3a49SEdward Gillett	add	$16, %rcx
431*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
432*533d3a49SEdward Gillett	cmp	%r10, %r8
433*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
434*533d3a49SEdward Gillett#endif
435*533d3a49SEdward Gillett	jmp	LABEL(ashr_14_use_sse2)
436*533d3a49SEdward Gillett
437*533d3a49SEdward Gillett
438*533d3a49SEdward Gillett/*
439*533d3a49SEdward Gillett * ashr_13 handles the following cases:
440*533d3a49SEdward Gillett * 	(16 + (src offset - dest offset)) % 16 = 13
441*533d3a49SEdward Gillett *
442*533d3a49SEdward Gillett * The checks made before dispatch already showed that there is no null
443*533d3a49SEdward Gillett * byte from (%r9 + rsi) to the end of the aligned 16 byte block at rsi.
 *
 * Each step checks the next aligned source block, 16(%rsi, %rcx), for a
 * null and then builds one aligned 16 byte store for (%rdi, %rcx) from the
 * last 3 bytes of (%rsi, %rcx) followed by the first 13 bytes of
 * 16(%rsi, %rcx). The SSSE3 loop does this with palignr, the SSE2 loop
 * with psrldq/pslldq/por. Both loops are unrolled twice.
 * For strncpy, r8 is the count still to be satisfied and r10 the number
 * of src bytes left in the current block.
444*533d3a49SEdward Gillett */
445*533d3a49SEdward Gillett	.p2align 4
446*533d3a49SEdward GillettLABEL(ashr_13):
447*533d3a49SEdward Gillett	xor	%ecx, %ecx				/* clear index */
448*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
449*533d3a49SEdward Gillett	cmp	%r10, %r8				/* count <= bytes left in this src block? */
450*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
451*533d3a49SEdward Gillett#endif
452*533d3a49SEdward Gillett	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
453*533d3a49SEdward Gillett	jz	LABEL(ashr_13_use_sse2)
454*533d3a49SEdward Gillett
455*533d3a49SEdward Gillett	.p2align 4
456*533d3a49SEdward GillettLABEL(ashr_13_use_ssse3):
457*533d3a49SEdward Gillett	movdqa	16(%rsi, %rcx), %xmm3			/* next aligned src block */
458*533d3a49SEdward Gillett	pcmpeqb	%xmm3, %xmm0				/* any null in it? */
459*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx
460*533d3a49SEdward Gillett	test	%edx, %edx
461*533d3a49SEdward Gillett	jnz	LABEL(unaligned_exit)
462*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
463*533d3a49SEdward Gillett	sub	$16, %r8
464*533d3a49SEdward Gillett 	jbe	LABEL(strncpy_truncation_unaligned)
465*533d3a49SEdward Gillett#endif
466*533d3a49SEdward Gillett
	/* the two .byte lines are the hand-assembled form of this palignr */
467*533d3a49SEdward Gillett	#palignr $13, (%rsi, %rcx), %xmm3
468*533d3a49SEdward Gillett	.byte	0x66, 0x0F, 0x3A ,0x0F
469*533d3a49SEdward Gillett	.byte	0x1c, 0x0e, 0x0d
470*533d3a49SEdward Gillett
471*533d3a49SEdward Gillett	movdqa	%xmm3, (%rdi, %rcx)
472*533d3a49SEdward Gillett	add	$16, %rcx
473*533d3a49SEdward Gillett
474*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
475*533d3a49SEdward Gillett	cmp	%r10, %r8
476*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
477*533d3a49SEdward Gillett#endif
	/* second copy of the unrolled step */
478*533d3a49SEdward Gillett	movdqa	16(%rsi, %rcx), %xmm3
479*533d3a49SEdward Gillett	pcmpeqb %xmm3, %xmm0
480*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx
481*533d3a49SEdward Gillett	test	%edx, %edx
482*533d3a49SEdward Gillett	jnz	LABEL(unaligned_exit)
483*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
484*533d3a49SEdward Gillett	sub	$16, %r8
485*533d3a49SEdward Gillett 	jbe	LABEL(strncpy_truncation_unaligned)
486*533d3a49SEdward Gillett#endif
487*533d3a49SEdward Gillett
488*533d3a49SEdward Gillett	#palignr $13, (%rsi, %rcx), %xmm3
489*533d3a49SEdward Gillett	.byte	0x66, 0x0F, 0x3A ,0x0F
490*533d3a49SEdward Gillett	.byte	0x1c, 0x0e, 0x0d
491*533d3a49SEdward Gillett
492*533d3a49SEdward Gillett	movdqa	%xmm3, (%rdi, %rcx)
493*533d3a49SEdward Gillett	add	$16, %rcx
494*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
495*533d3a49SEdward Gillett	cmp	%r10, %r8
496*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
497*533d3a49SEdward Gillett#endif
498*533d3a49SEdward Gillett	jmp	LABEL(ashr_13_use_ssse3)
499*533d3a49SEdward Gillett
500*533d3a49SEdward Gillett	.p2align 4
501*533d3a49SEdward GillettLABEL(ashr_13_use_sse2):
502*533d3a49SEdward Gillett	pcmpeqb 16(%rsi, %rcx), %xmm0			/* any null in the next src block? */
503*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx
504*533d3a49SEdward Gillett	test	%edx, %edx
505*533d3a49SEdward Gillett	jnz	LABEL(unaligned_exit)
506*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
507*533d3a49SEdward Gillett	sub	$16, %r8
508*533d3a49SEdward Gillett 	jbe	LABEL(strncpy_truncation_unaligned)
509*533d3a49SEdward Gillett#endif
510*533d3a49SEdward Gillett
511*533d3a49SEdward Gillett	movdqa	16(%rsi, %rcx), %xmm3
512*533d3a49SEdward Gillett	movdqa	(%rsi, %rcx), %xmm2
513*533d3a49SEdward Gillett
514*533d3a49SEdward Gillett	psrldq	$13, %xmm2				/* last 3 bytes of the current block move to bytes 0-2 */
515*533d3a49SEdward Gillett	pslldq	$3, %xmm3				/* first 13 bytes of the next block move to bytes 3-15 */
516*533d3a49SEdward Gillett	por	%xmm2, %xmm3
517*533d3a49SEdward Gillett
518*533d3a49SEdward Gillett	movdqa	%xmm3, (%rdi, %rcx)
519*533d3a49SEdward Gillett	add	$16, %rcx
520*533d3a49SEdward Gillett
521*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
522*533d3a49SEdward Gillett	cmp	%r10, %r8
523*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
524*533d3a49SEdward Gillett#endif
	/* second copy of the unrolled step */
525*533d3a49SEdward Gillett	pcmpeqb 16(%rsi, %rcx), %xmm0
526*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx
527*533d3a49SEdward Gillett	test	%edx, %edx
528*533d3a49SEdward Gillett	jnz	LABEL(unaligned_exit)
529*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
530*533d3a49SEdward Gillett	sub	$16, %r8
531*533d3a49SEdward Gillett 	jbe	LABEL(strncpy_truncation_unaligned)
532*533d3a49SEdward Gillett#endif
533*533d3a49SEdward Gillett
534*533d3a49SEdward Gillett	movdqa	16(%rsi, %rcx), %xmm3
535*533d3a49SEdward Gillett	movdqa	(%rsi, %rcx), %xmm2
536*533d3a49SEdward Gillett
537*533d3a49SEdward Gillett	psrldq	$13, %xmm2
538*533d3a49SEdward Gillett	pslldq	$3, %xmm3
539*533d3a49SEdward Gillett	por	%xmm2, %xmm3
540*533d3a49SEdward Gillett
541*533d3a49SEdward Gillett	movdqa	%xmm3, (%rdi, %rcx)
542*533d3a49SEdward Gillett	add	$16, %rcx
543*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
544*533d3a49SEdward Gillett	cmp	%r10, %r8
545*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
546*533d3a49SEdward Gillett#endif
547*533d3a49SEdward Gillett	jmp	LABEL(ashr_13_use_sse2)
548*533d3a49SEdward Gillett
549*533d3a49SEdward Gillett
550*533d3a49SEdward Gillett/*
551*533d3a49SEdward Gillett * ashr_12 handles the following cases:
552*533d3a49SEdward Gillett * 	(16 + (src offset - dest offset)) % 16 = 12
553*533d3a49SEdward Gillett *
554*533d3a49SEdward Gillett * The checks made before dispatch already showed that there is no null
555*533d3a49SEdward Gillett * byte from (%r9 + rsi) to the end of the aligned 16 byte block at rsi.
 *
 * Each step checks the next aligned source block, 16(%rsi, %rcx), for a
 * null and then builds one aligned 16 byte store for (%rdi, %rcx) from the
 * last 4 bytes of (%rsi, %rcx) followed by the first 12 bytes of
 * 16(%rsi, %rcx). The SSSE3 loop does this with palignr, the SSE2 loop
 * with psrldq/pslldq/por. Both loops are unrolled twice.
 * For strncpy, r8 is the count still to be satisfied and r10 the number
 * of src bytes left in the current block.
556*533d3a49SEdward Gillett */
557*533d3a49SEdward Gillett	.p2align 4
558*533d3a49SEdward GillettLABEL(ashr_12):
559*533d3a49SEdward Gillett	xor	%ecx, %ecx				/* clear index */
560*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
561*533d3a49SEdward Gillett	cmp	%r10, %r8				/* count <= bytes left in this src block? */
562*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
563*533d3a49SEdward Gillett#endif
564*533d3a49SEdward Gillett	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
565*533d3a49SEdward Gillett	jz	LABEL(ashr_12_use_sse2)
566*533d3a49SEdward Gillett
567*533d3a49SEdward Gillett	.p2align 4
568*533d3a49SEdward GillettLABEL(ashr_12_use_ssse3):
569*533d3a49SEdward Gillett	movdqa	16(%rsi, %rcx), %xmm3			/* next aligned src block */
570*533d3a49SEdward Gillett	pcmpeqb	%xmm3, %xmm0				/* any null in it? */
571*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx
572*533d3a49SEdward Gillett	test	%edx, %edx
573*533d3a49SEdward Gillett	jnz	LABEL(unaligned_exit)
574*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
575*533d3a49SEdward Gillett	sub	$16, %r8
576*533d3a49SEdward Gillett 	jbe	LABEL(strncpy_truncation_unaligned)
577*533d3a49SEdward Gillett#endif
578*533d3a49SEdward Gillett
	/* the two .byte lines are the hand-assembled form of this palignr */
579*533d3a49SEdward Gillett	#palignr $12, (%rsi, %rcx), %xmm3
580*533d3a49SEdward Gillett	.byte	0x66, 0x0F, 0x3A ,0x0F
581*533d3a49SEdward Gillett	.byte	0x1c, 0x0e, 0x0c
582*533d3a49SEdward Gillett
583*533d3a49SEdward Gillett	movdqa	%xmm3, (%rdi, %rcx)
584*533d3a49SEdward Gillett	add	$16, %rcx
585*533d3a49SEdward Gillett
586*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
587*533d3a49SEdward Gillett	cmp	%r10, %r8
588*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
589*533d3a49SEdward Gillett#endif
	/* second copy of the unrolled step */
590*533d3a49SEdward Gillett	movdqa	16(%rsi, %rcx), %xmm3
591*533d3a49SEdward Gillett	pcmpeqb %xmm3, %xmm0
592*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx
593*533d3a49SEdward Gillett	test	%edx, %edx
594*533d3a49SEdward Gillett	jnz	LABEL(unaligned_exit)
595*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
596*533d3a49SEdward Gillett	sub	$16, %r8
597*533d3a49SEdward Gillett 	jbe	LABEL(strncpy_truncation_unaligned)
598*533d3a49SEdward Gillett#endif
599*533d3a49SEdward Gillett
600*533d3a49SEdward Gillett	#palignr $12, (%rsi, %rcx), %xmm3
601*533d3a49SEdward Gillett	.byte	0x66, 0x0F, 0x3A ,0x0F
602*533d3a49SEdward Gillett	.byte	0x1c, 0x0e, 0x0c
603*533d3a49SEdward Gillett
604*533d3a49SEdward Gillett	movdqa	%xmm3, (%rdi, %rcx)
605*533d3a49SEdward Gillett	add	$16, %rcx
606*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
607*533d3a49SEdward Gillett	cmp	%r10, %r8
608*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
609*533d3a49SEdward Gillett#endif
610*533d3a49SEdward Gillett	jmp	LABEL(ashr_12_use_ssse3)
611*533d3a49SEdward Gillett
612*533d3a49SEdward Gillett	.p2align 4
613*533d3a49SEdward GillettLABEL(ashr_12_use_sse2):
614*533d3a49SEdward Gillett	pcmpeqb 16(%rsi, %rcx), %xmm0			/* any null in the next src block? */
615*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx
616*533d3a49SEdward Gillett	test	%edx, %edx
617*533d3a49SEdward Gillett	jnz	LABEL(unaligned_exit)
618*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
619*533d3a49SEdward Gillett	sub	$16, %r8
620*533d3a49SEdward Gillett 	jbe	LABEL(strncpy_truncation_unaligned)
621*533d3a49SEdward Gillett#endif
622*533d3a49SEdward Gillett
623*533d3a49SEdward Gillett	movdqa	16(%rsi, %rcx), %xmm3
624*533d3a49SEdward Gillett	movdqa	(%rsi, %rcx), %xmm2
625*533d3a49SEdward Gillett
626*533d3a49SEdward Gillett	psrldq	$12, %xmm2				/* last 4 bytes of the current block move to bytes 0-3 */
627*533d3a49SEdward Gillett	pslldq	$4, %xmm3				/* first 12 bytes of the next block move to bytes 4-15 */
628*533d3a49SEdward Gillett	por	%xmm2, %xmm3
629*533d3a49SEdward Gillett
630*533d3a49SEdward Gillett	movdqa	%xmm3, (%rdi, %rcx)
631*533d3a49SEdward Gillett	add	$16, %rcx
632*533d3a49SEdward Gillett
633*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
634*533d3a49SEdward Gillett	cmp	%r10, %r8
635*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
636*533d3a49SEdward Gillett#endif
	/* second copy of the unrolled step */
637*533d3a49SEdward Gillett	pcmpeqb 16(%rsi, %rcx), %xmm0
638*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx
639*533d3a49SEdward Gillett	test	%edx, %edx
640*533d3a49SEdward Gillett	jnz	LABEL(unaligned_exit)
641*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
642*533d3a49SEdward Gillett	sub	$16, %r8
643*533d3a49SEdward Gillett 	jbe	LABEL(strncpy_truncation_unaligned)
644*533d3a49SEdward Gillett#endif
645*533d3a49SEdward Gillett
646*533d3a49SEdward Gillett	movdqa	16(%rsi, %rcx), %xmm3
647*533d3a49SEdward Gillett	movdqa	(%rsi, %rcx), %xmm2
648*533d3a49SEdward Gillett
649*533d3a49SEdward Gillett	psrldq	$12, %xmm2
650*533d3a49SEdward Gillett	pslldq	$4, %xmm3
651*533d3a49SEdward Gillett	por	%xmm2, %xmm3
652*533d3a49SEdward Gillett
653*533d3a49SEdward Gillett	movdqa	%xmm3, (%rdi, %rcx)
654*533d3a49SEdward Gillett	add	$16, %rcx
655*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
656*533d3a49SEdward Gillett	cmp	%r10, %r8
657*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
658*533d3a49SEdward Gillett#endif
659*533d3a49SEdward Gillett	jmp	LABEL(ashr_12_use_sse2)
660*533d3a49SEdward Gillett
661*533d3a49SEdward Gillett
662*533d3a49SEdward Gillett/*
663*533d3a49SEdward Gillett * ashr_11 handles the following cases:
664*533d3a49SEdward Gillett * 	(16 + (src offset - dest offset)) % 16 = 11
665*533d3a49SEdward Gillett *
666*533d3a49SEdward Gillett * Based on above operation, start from (%r9 + rsi) to the left of this cache
667*533d3a49SEdward Gillett * bank, there is no null byte.
 *
 * Flow: clear the index in %rcx, then pick the SSSE3 loop (palignr) or the
 * SSE2 loop (psrldq/pslldq/por) from the USE_SSSE3 bit of .memops_method.
 * Both loops are unrolled twice and produce the same bytes.
 *
 * Each step compares the next source block at 16(%rsi, %rcx) with %xmm0 and
 * leaves through unaligned_exit on any matching byte.  Otherwise it stores
 * to (%rdi, %rcx) the 16 bytes made of bytes 11..15 of the block at
 * (%rsi, %rcx) followed by bytes 0..10 of the block at 16(%rsi, %rcx), and
 * advances %rcx by 16.  All memory operands use aligned forms (movdqa,
 * memory-operand pcmpeqb/palignr), which require 16-byte-aligned addresses.
 *
 * With USE_AS_STRNCPY, %r8 is reduced by 16 per step and the loops also
 * leave when %r8 <= %r10 or when %r8 was <= 16 before the subtraction.
 *
 * NOTE(review): %xmm0 is presumably all zeroes on entry (making the compare
 * a NUL-byte search) and %r8/%r10 presumably hold the remaining count and
 * an exit threshold; both are set up outside this section -- confirm.
668*533d3a49SEdward Gillett */
669*533d3a49SEdward Gillett	.p2align 4
670*533d3a49SEdward GillettLABEL(ashr_11):
671*533d3a49SEdward Gillett	xor	%ecx, %ecx				/* clear index */
672*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
673*533d3a49SEdward Gillett	cmp	%r10, %r8
674*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)	/* r8 <= r10 (unsigned): skip the loops */
675*533d3a49SEdward Gillett#endif
676*533d3a49SEdward Gillett	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
677*533d3a49SEdward Gillett	jz	LABEL(ashr_11_use_sse2)	/* USE_SSSE3 bit clear: SSE2 loop */
678*533d3a49SEdward Gillett
679*533d3a49SEdward Gillett	.p2align 4
680*533d3a49SEdward GillettLABEL(ashr_11_use_ssse3):
681*533d3a49SEdward Gillett	movdqa	16(%rsi, %rcx), %xmm3	/* xmm3 = next source block */
682*533d3a49SEdward Gillett	pcmpeqb	%xmm3, %xmm0	/* byte-compare it with %xmm0 */
683*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx	/* edx = one bit per matching byte */
684*533d3a49SEdward Gillett	test	%edx, %edx
685*533d3a49SEdward Gillett	jnz	LABEL(unaligned_exit)	/* some byte matched: leave the loop */
686*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
687*533d3a49SEdward Gillett	sub	$16, %r8	/* r8 -= 16 */
688*533d3a49SEdward Gillett 	jbe	LABEL(strncpy_truncation_unaligned)	/* r8 was <= 16 (unsigned) */
689*533d3a49SEdward Gillett#endif
690*533d3a49SEdward Gillett
691*533d3a49SEdward Gillett	#palignr $11, (%rsi, %rcx), %xmm3
692*533d3a49SEdward Gillett	.byte	0x66, 0x0F, 0x3A ,0x0F	/* hand-encoded palignr opcode */
693*533d3a49SEdward Gillett	.byte	0x1c, 0x0e, 0x0b	/* ModRM, SIB = (%rsi,%rcx),%xmm3; imm8 = 11 */
694*533d3a49SEdward Gillett
695*533d3a49SEdward Gillett	movdqa	%xmm3, (%rdi, %rcx)	/* aligned 16-byte store to destination */
696*533d3a49SEdward Gillett	add	$16, %rcx	/* advance index by one block */
697*533d3a49SEdward Gillett
	/* second unrolled step: same sequence as above */
698*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
699*533d3a49SEdward Gillett	cmp	%r10, %r8
700*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
701*533d3a49SEdward Gillett#endif
702*533d3a49SEdward Gillett	movdqa	16(%rsi, %rcx), %xmm3
703*533d3a49SEdward Gillett	pcmpeqb %xmm3, %xmm0
704*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx
705*533d3a49SEdward Gillett	test	%edx, %edx
706*533d3a49SEdward Gillett	jnz	LABEL(unaligned_exit)
707*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
708*533d3a49SEdward Gillett	sub	$16, %r8
709*533d3a49SEdward Gillett 	jbe	LABEL(strncpy_truncation_unaligned)
710*533d3a49SEdward Gillett#endif
711*533d3a49SEdward Gillett
712*533d3a49SEdward Gillett	#palignr $11, (%rsi, %rcx), %xmm3
713*533d3a49SEdward Gillett	.byte	0x66, 0x0F, 0x3A ,0x0F
714*533d3a49SEdward Gillett	.byte	0x1c, 0x0e, 0x0b
715*533d3a49SEdward Gillett
716*533d3a49SEdward Gillett	movdqa	%xmm3, (%rdi, %rcx)
717*533d3a49SEdward Gillett	add	$16, %rcx
718*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
719*533d3a49SEdward Gillett	cmp	%r10, %r8
720*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
721*533d3a49SEdward Gillett#endif
722*533d3a49SEdward Gillett	jmp	LABEL(ashr_11_use_ssse3)	/* back to the top of the SSSE3 loop */
723*533d3a49SEdward Gillett
724*533d3a49SEdward Gillett	.p2align 4
725*533d3a49SEdward GillettLABEL(ashr_11_use_sse2):
726*533d3a49SEdward Gillett	pcmpeqb 16(%rsi, %rcx), %xmm0	/* byte-compare next block with %xmm0 */
727*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx	/* edx = one bit per matching byte */
728*533d3a49SEdward Gillett	test	%edx, %edx
729*533d3a49SEdward Gillett	jnz	LABEL(unaligned_exit)	/* some byte matched: leave the loop */
730*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
731*533d3a49SEdward Gillett	sub	$16, %r8	/* r8 -= 16 */
732*533d3a49SEdward Gillett 	jbe	LABEL(strncpy_truncation_unaligned)	/* r8 was <= 16 (unsigned) */
733*533d3a49SEdward Gillett#endif
734*533d3a49SEdward Gillett
735*533d3a49SEdward Gillett	movdqa	16(%rsi, %rcx), %xmm3	/* xmm3 = next source block */
736*533d3a49SEdward Gillett	movdqa	(%rsi, %rcx), %xmm2	/* xmm2 = current source block */
737*533d3a49SEdward Gillett
738*533d3a49SEdward Gillett	psrldq	$11, %xmm2	/* current bytes 11..15 -> result bytes 0..4 */
739*533d3a49SEdward Gillett	pslldq	$5, %xmm3	/* next bytes 0..10 -> result bytes 5..15 */
740*533d3a49SEdward Gillett	por	%xmm2, %xmm3	/* merge the two pieces */
741*533d3a49SEdward Gillett
742*533d3a49SEdward Gillett	movdqa	%xmm3, (%rdi, %rcx)	/* aligned 16-byte store to destination */
743*533d3a49SEdward Gillett	add	$16, %rcx	/* advance index by one block */
744*533d3a49SEdward Gillett
	/* second unrolled step: same sequence as above */
745*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
746*533d3a49SEdward Gillett	cmp	%r10, %r8
747*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
748*533d3a49SEdward Gillett#endif
749*533d3a49SEdward Gillett	pcmpeqb 16(%rsi, %rcx), %xmm0
750*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx
751*533d3a49SEdward Gillett	test	%edx, %edx
752*533d3a49SEdward Gillett	jnz	LABEL(unaligned_exit)
753*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
754*533d3a49SEdward Gillett	sub	$16, %r8
755*533d3a49SEdward Gillett 	jbe	LABEL(strncpy_truncation_unaligned)
756*533d3a49SEdward Gillett#endif
757*533d3a49SEdward Gillett
758*533d3a49SEdward Gillett	movdqa	16(%rsi, %rcx), %xmm3
759*533d3a49SEdward Gillett	movdqa	(%rsi, %rcx), %xmm2
760*533d3a49SEdward Gillett
761*533d3a49SEdward Gillett	psrldq	$11, %xmm2
762*533d3a49SEdward Gillett	pslldq	$5, %xmm3
763*533d3a49SEdward Gillett	por	%xmm2, %xmm3
764*533d3a49SEdward Gillett
765*533d3a49SEdward Gillett	movdqa	%xmm3, (%rdi, %rcx)
766*533d3a49SEdward Gillett	add	$16, %rcx
767*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
768*533d3a49SEdward Gillett	cmp	%r10, %r8
769*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
770*533d3a49SEdward Gillett#endif
771*533d3a49SEdward Gillett	jmp	LABEL(ashr_11_use_sse2)	/* back to the top of the SSE2 loop */
772*533d3a49SEdward Gillett
773*533d3a49SEdward Gillett
774*533d3a49SEdward Gillett/*
775*533d3a49SEdward Gillett * ashr_10 handles the following cases:
776*533d3a49SEdward Gillett * 	(16 + (src offset - dest offset)) % 16 = 10
777*533d3a49SEdward Gillett *
778*533d3a49SEdward Gillett * Based on above operation, start from (%r9 + rsi) to the left of this cache
779*533d3a49SEdward Gillett * bank, there is no null byte.
 *
 * Flow: clear the index in %rcx, then pick the SSSE3 loop (palignr) or the
 * SSE2 loop (psrldq/pslldq/por) from the USE_SSSE3 bit of .memops_method.
 * Both loops are unrolled twice and produce the same bytes.
 *
 * Each step compares the next source block at 16(%rsi, %rcx) with %xmm0 and
 * leaves through unaligned_exit on any matching byte.  Otherwise it stores
 * to (%rdi, %rcx) the 16 bytes made of bytes 10..15 of the block at
 * (%rsi, %rcx) followed by bytes 0..9 of the block at 16(%rsi, %rcx), and
 * advances %rcx by 16.  All memory operands use aligned forms (movdqa,
 * memory-operand pcmpeqb/palignr), which require 16-byte-aligned addresses.
 *
 * With USE_AS_STRNCPY, %r8 is reduced by 16 per step and the loops also
 * leave when %r8 <= %r10 or when %r8 was <= 16 before the subtraction.
 *
 * NOTE(review): %xmm0 is presumably all zeroes on entry (making the compare
 * a NUL-byte search) and %r8/%r10 presumably hold the remaining count and
 * an exit threshold; both are set up outside this section -- confirm.
780*533d3a49SEdward Gillett */
781*533d3a49SEdward Gillett	.p2align 4
782*533d3a49SEdward GillettLABEL(ashr_10):
783*533d3a49SEdward Gillett	xor	%ecx, %ecx				/* clear index */
784*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
785*533d3a49SEdward Gillett	cmp	%r10, %r8
786*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)	/* r8 <= r10 (unsigned): skip the loops */
787*533d3a49SEdward Gillett#endif
788*533d3a49SEdward Gillett	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
789*533d3a49SEdward Gillett	jz	LABEL(ashr_10_use_sse2)	/* USE_SSSE3 bit clear: SSE2 loop */
790*533d3a49SEdward Gillett
791*533d3a49SEdward Gillett	.p2align 4
792*533d3a49SEdward GillettLABEL(ashr_10_use_ssse3):
793*533d3a49SEdward Gillett	movdqa	16(%rsi, %rcx), %xmm3	/* xmm3 = next source block */
794*533d3a49SEdward Gillett	pcmpeqb	%xmm3, %xmm0	/* byte-compare it with %xmm0 */
795*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx	/* edx = one bit per matching byte */
796*533d3a49SEdward Gillett	test	%edx, %edx
797*533d3a49SEdward Gillett	jnz	LABEL(unaligned_exit)	/* some byte matched: leave the loop */
798*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
799*533d3a49SEdward Gillett	sub	$16, %r8	/* r8 -= 16 */
800*533d3a49SEdward Gillett 	jbe	LABEL(strncpy_truncation_unaligned)	/* r8 was <= 16 (unsigned) */
801*533d3a49SEdward Gillett#endif
802*533d3a49SEdward Gillett
803*533d3a49SEdward Gillett	#palignr $10, (%rsi, %rcx), %xmm3
804*533d3a49SEdward Gillett	.byte	0x66, 0x0F, 0x3A ,0x0F	/* hand-encoded palignr opcode */
805*533d3a49SEdward Gillett	.byte	0x1c, 0x0e, 0x0a	/* ModRM, SIB = (%rsi,%rcx),%xmm3; imm8 = 10 */
806*533d3a49SEdward Gillett
807*533d3a49SEdward Gillett	movdqa	%xmm3, (%rdi, %rcx)	/* aligned 16-byte store to destination */
808*533d3a49SEdward Gillett	add	$16, %rcx	/* advance index by one block */
809*533d3a49SEdward Gillett
	/* second unrolled step: same sequence as above */
810*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
811*533d3a49SEdward Gillett	cmp	%r10, %r8
812*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
813*533d3a49SEdward Gillett#endif
814*533d3a49SEdward Gillett	movdqa	16(%rsi, %rcx), %xmm3
815*533d3a49SEdward Gillett	pcmpeqb %xmm3, %xmm0
816*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx
817*533d3a49SEdward Gillett	test	%edx, %edx
818*533d3a49SEdward Gillett	jnz	LABEL(unaligned_exit)
819*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
820*533d3a49SEdward Gillett	sub	$16, %r8
821*533d3a49SEdward Gillett 	jbe	LABEL(strncpy_truncation_unaligned)
822*533d3a49SEdward Gillett#endif
823*533d3a49SEdward Gillett
824*533d3a49SEdward Gillett	#palignr $10, (%rsi, %rcx), %xmm3
825*533d3a49SEdward Gillett	.byte	0x66, 0x0F, 0x3A ,0x0F
826*533d3a49SEdward Gillett	.byte	0x1c, 0x0e, 0x0a
827*533d3a49SEdward Gillett
828*533d3a49SEdward Gillett	movdqa	%xmm3, (%rdi, %rcx)
829*533d3a49SEdward Gillett	add	$16, %rcx
830*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
831*533d3a49SEdward Gillett	cmp	%r10, %r8
832*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
833*533d3a49SEdward Gillett#endif
834*533d3a49SEdward Gillett	jmp	LABEL(ashr_10_use_ssse3)	/* back to the top of the SSSE3 loop */
835*533d3a49SEdward Gillett
836*533d3a49SEdward Gillett	.p2align 4
837*533d3a49SEdward GillettLABEL(ashr_10_use_sse2):
838*533d3a49SEdward Gillett	pcmpeqb 16(%rsi, %rcx), %xmm0	/* byte-compare next block with %xmm0 */
839*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx	/* edx = one bit per matching byte */
840*533d3a49SEdward Gillett	test	%edx, %edx
841*533d3a49SEdward Gillett	jnz	LABEL(unaligned_exit)	/* some byte matched: leave the loop */
842*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
843*533d3a49SEdward Gillett	sub	$16, %r8	/* r8 -= 16 */
844*533d3a49SEdward Gillett 	jbe	LABEL(strncpy_truncation_unaligned)	/* r8 was <= 16 (unsigned) */
845*533d3a49SEdward Gillett#endif
846*533d3a49SEdward Gillett
847*533d3a49SEdward Gillett	movdqa	16(%rsi, %rcx), %xmm3	/* xmm3 = next source block */
848*533d3a49SEdward Gillett	movdqa	(%rsi, %rcx), %xmm2	/* xmm2 = current source block */
849*533d3a49SEdward Gillett
850*533d3a49SEdward Gillett	psrldq	$10, %xmm2	/* current bytes 10..15 -> result bytes 0..5 */
851*533d3a49SEdward Gillett	pslldq	$6, %xmm3	/* next bytes 0..9 -> result bytes 6..15 */
852*533d3a49SEdward Gillett	por	%xmm2, %xmm3	/* merge the two pieces */
853*533d3a49SEdward Gillett
854*533d3a49SEdward Gillett	movdqa	%xmm3, (%rdi, %rcx)	/* aligned 16-byte store to destination */
855*533d3a49SEdward Gillett	add	$16, %rcx	/* advance index by one block */
856*533d3a49SEdward Gillett
	/* second unrolled step: same sequence as above */
857*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
858*533d3a49SEdward Gillett	cmp	%r10, %r8
859*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
860*533d3a49SEdward Gillett#endif
861*533d3a49SEdward Gillett	pcmpeqb 16(%rsi, %rcx), %xmm0
862*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx
863*533d3a49SEdward Gillett	test	%edx, %edx
864*533d3a49SEdward Gillett	jnz	LABEL(unaligned_exit)
865*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
866*533d3a49SEdward Gillett	sub	$16, %r8
867*533d3a49SEdward Gillett 	jbe	LABEL(strncpy_truncation_unaligned)
868*533d3a49SEdward Gillett#endif
869*533d3a49SEdward Gillett
870*533d3a49SEdward Gillett	movdqa	16(%rsi, %rcx), %xmm3
871*533d3a49SEdward Gillett	movdqa	(%rsi, %rcx), %xmm2
872*533d3a49SEdward Gillett
873*533d3a49SEdward Gillett	psrldq	$10, %xmm2
874*533d3a49SEdward Gillett	pslldq	$6, %xmm3
875*533d3a49SEdward Gillett	por	%xmm2, %xmm3
876*533d3a49SEdward Gillett
877*533d3a49SEdward Gillett	movdqa	%xmm3, (%rdi, %rcx)
878*533d3a49SEdward Gillett	add	$16, %rcx
879*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
880*533d3a49SEdward Gillett	cmp	%r10, %r8
881*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
882*533d3a49SEdward Gillett#endif
883*533d3a49SEdward Gillett	jmp	LABEL(ashr_10_use_sse2)	/* back to the top of the SSE2 loop */
884*533d3a49SEdward Gillett
885*533d3a49SEdward Gillett
886*533d3a49SEdward Gillett/*
887*533d3a49SEdward Gillett * ashr_9 handles the following cases:
888*533d3a49SEdward Gillett * 	(16 + (src offset - dest offset)) % 16 = 9
889*533d3a49SEdward Gillett *
890*533d3a49SEdward Gillett * Based on above operation, start from (%r9 + rsi) to the left of this cache
891*533d3a49SEdward Gillett * bank, there is no null byte.
 *
 * Flow: clear the index in %rcx, then pick the SSSE3 loop (palignr) or the
 * SSE2 loop (psrldq/pslldq/por) from the USE_SSSE3 bit of .memops_method.
 * Both loops are unrolled twice and produce the same bytes.
 *
 * Each step compares the next source block at 16(%rsi, %rcx) with %xmm0 and
 * leaves through unaligned_exit on any matching byte.  Otherwise it stores
 * to (%rdi, %rcx) the 16 bytes made of bytes 9..15 of the block at
 * (%rsi, %rcx) followed by bytes 0..8 of the block at 16(%rsi, %rcx), and
 * advances %rcx by 16.  All memory operands use aligned forms (movdqa,
 * memory-operand pcmpeqb/palignr), which require 16-byte-aligned addresses.
 *
 * With USE_AS_STRNCPY, %r8 is reduced by 16 per step and the loops also
 * leave when %r8 <= %r10 or when %r8 was <= 16 before the subtraction.
 *
 * NOTE(review): %xmm0 is presumably all zeroes on entry (making the compare
 * a NUL-byte search) and %r8/%r10 presumably hold the remaining count and
 * an exit threshold; both are set up outside this section -- confirm.
892*533d3a49SEdward Gillett */
893*533d3a49SEdward Gillett	.p2align 4
894*533d3a49SEdward GillettLABEL(ashr_9):
895*533d3a49SEdward Gillett	xor	%ecx, %ecx				/* clear index */
896*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
897*533d3a49SEdward Gillett	cmp	%r10, %r8
898*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)	/* r8 <= r10 (unsigned): skip the loops */
899*533d3a49SEdward Gillett#endif
900*533d3a49SEdward Gillett	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
901*533d3a49SEdward Gillett	jz	LABEL(ashr_9_use_sse2)	/* USE_SSSE3 bit clear: SSE2 loop */
902*533d3a49SEdward Gillett
903*533d3a49SEdward Gillett	.p2align 4
904*533d3a49SEdward GillettLABEL(ashr_9_use_ssse3):
905*533d3a49SEdward Gillett	movdqa	16(%rsi, %rcx), %xmm3	/* xmm3 = next source block */
906*533d3a49SEdward Gillett	pcmpeqb	%xmm3, %xmm0	/* byte-compare it with %xmm0 */
907*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx	/* edx = one bit per matching byte */
908*533d3a49SEdward Gillett	test	%edx, %edx
909*533d3a49SEdward Gillett	jnz	LABEL(unaligned_exit)	/* some byte matched: leave the loop */
910*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
911*533d3a49SEdward Gillett	sub	$16, %r8	/* r8 -= 16 */
912*533d3a49SEdward Gillett 	jbe	LABEL(strncpy_truncation_unaligned)	/* r8 was <= 16 (unsigned) */
913*533d3a49SEdward Gillett#endif
914*533d3a49SEdward Gillett
915*533d3a49SEdward Gillett	#palignr $9, (%rsi, %rcx), %xmm3
916*533d3a49SEdward Gillett	.byte	0x66, 0x0F, 0x3A ,0x0F	/* hand-encoded palignr opcode */
917*533d3a49SEdward Gillett	.byte	0x1c, 0x0e, 0x09	/* ModRM, SIB = (%rsi,%rcx),%xmm3; imm8 = 9 */
918*533d3a49SEdward Gillett
919*533d3a49SEdward Gillett	movdqa	%xmm3, (%rdi, %rcx)	/* aligned 16-byte store to destination */
920*533d3a49SEdward Gillett	add	$16, %rcx	/* advance index by one block */
921*533d3a49SEdward Gillett
	/* second unrolled step: same sequence as above */
922*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
923*533d3a49SEdward Gillett	cmp	%r10, %r8
924*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
925*533d3a49SEdward Gillett#endif
926*533d3a49SEdward Gillett	movdqa	16(%rsi, %rcx), %xmm3
927*533d3a49SEdward Gillett	pcmpeqb %xmm3, %xmm0
928*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx
929*533d3a49SEdward Gillett	test	%edx, %edx
930*533d3a49SEdward Gillett	jnz	LABEL(unaligned_exit)
931*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
932*533d3a49SEdward Gillett	sub	$16, %r8
933*533d3a49SEdward Gillett 	jbe	LABEL(strncpy_truncation_unaligned)
934*533d3a49SEdward Gillett#endif
935*533d3a49SEdward Gillett
936*533d3a49SEdward Gillett	#palignr $9, (%rsi, %rcx), %xmm3
937*533d3a49SEdward Gillett	.byte	0x66, 0x0F, 0x3A ,0x0F
938*533d3a49SEdward Gillett	.byte	0x1c, 0x0e, 0x09
939*533d3a49SEdward Gillett
940*533d3a49SEdward Gillett	movdqa	%xmm3, (%rdi, %rcx)
941*533d3a49SEdward Gillett	add	$16, %rcx
942*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
943*533d3a49SEdward Gillett	cmp	%r10, %r8
944*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
945*533d3a49SEdward Gillett#endif
946*533d3a49SEdward Gillett	jmp	LABEL(ashr_9_use_ssse3)	/* back to the top of the SSSE3 loop */
947*533d3a49SEdward Gillett
948*533d3a49SEdward Gillett	.p2align 4
949*533d3a49SEdward GillettLABEL(ashr_9_use_sse2):
950*533d3a49SEdward Gillett	pcmpeqb 16(%rsi, %rcx), %xmm0	/* byte-compare next block with %xmm0 */
951*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx	/* edx = one bit per matching byte */
952*533d3a49SEdward Gillett	test	%edx, %edx
953*533d3a49SEdward Gillett	jnz	LABEL(unaligned_exit)	/* some byte matched: leave the loop */
954*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
955*533d3a49SEdward Gillett	sub	$16, %r8	/* r8 -= 16 */
956*533d3a49SEdward Gillett 	jbe	LABEL(strncpy_truncation_unaligned)	/* r8 was <= 16 (unsigned) */
957*533d3a49SEdward Gillett#endif
958*533d3a49SEdward Gillett
959*533d3a49SEdward Gillett	movdqa	16(%rsi, %rcx), %xmm3	/* xmm3 = next source block */
960*533d3a49SEdward Gillett	movdqa	(%rsi, %rcx), %xmm2	/* xmm2 = current source block */
961*533d3a49SEdward Gillett
962*533d3a49SEdward Gillett	psrldq	$9, %xmm2	/* current bytes 9..15 -> result bytes 0..6 */
963*533d3a49SEdward Gillett	pslldq	$7, %xmm3	/* next bytes 0..8 -> result bytes 7..15 */
964*533d3a49SEdward Gillett	por	%xmm2, %xmm3	/* merge the two pieces */
965*533d3a49SEdward Gillett
966*533d3a49SEdward Gillett	movdqa	%xmm3, (%rdi, %rcx)	/* aligned 16-byte store to destination */
967*533d3a49SEdward Gillett	add	$16, %rcx	/* advance index by one block */
968*533d3a49SEdward Gillett
	/* second unrolled step: same sequence as above */
969*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
970*533d3a49SEdward Gillett	cmp	%r10, %r8
971*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
972*533d3a49SEdward Gillett#endif
973*533d3a49SEdward Gillett	pcmpeqb 16(%rsi, %rcx), %xmm0
974*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx
975*533d3a49SEdward Gillett	test	%edx, %edx
976*533d3a49SEdward Gillett	jnz	LABEL(unaligned_exit)
977*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
978*533d3a49SEdward Gillett	sub	$16, %r8
979*533d3a49SEdward Gillett 	jbe	LABEL(strncpy_truncation_unaligned)
980*533d3a49SEdward Gillett#endif
981*533d3a49SEdward Gillett
982*533d3a49SEdward Gillett	movdqa	16(%rsi, %rcx), %xmm3
983*533d3a49SEdward Gillett	movdqa	(%rsi, %rcx), %xmm2
984*533d3a49SEdward Gillett
985*533d3a49SEdward Gillett	psrldq	$9, %xmm2
986*533d3a49SEdward Gillett	pslldq	$7, %xmm3
987*533d3a49SEdward Gillett	por	%xmm2, %xmm3
988*533d3a49SEdward Gillett
989*533d3a49SEdward Gillett	movdqa	%xmm3, (%rdi, %rcx)
990*533d3a49SEdward Gillett	add	$16, %rcx
991*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
992*533d3a49SEdward Gillett	cmp	%r10, %r8
993*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
994*533d3a49SEdward Gillett#endif
995*533d3a49SEdward Gillett	jmp	LABEL(ashr_9_use_sse2)	/* back to the top of the SSE2 loop */
996*533d3a49SEdward Gillett
997*533d3a49SEdward Gillett
998*533d3a49SEdward Gillett/*
999*533d3a49SEdward Gillett * ashr_8 handles the following cases:
1000*533d3a49SEdward Gillett * 	(16 + (src offset - dest offset)) % 16 = 8
1001*533d3a49SEdward Gillett *
1002*533d3a49SEdward Gillett * Based on above operation, start from (%r9 + rsi) to the left of this cache
1003*533d3a49SEdward Gillett * bank, there is no null byte.
 *
 * Flow: clear the index in %rcx, then pick the SSSE3 loop (palignr) or the
 * SSE2 loop (psrldq/pslldq/por) from the USE_SSSE3 bit of .memops_method.
 * Both loops are unrolled twice and produce the same bytes.
 *
 * Each step compares the next source block at 16(%rsi, %rcx) with %xmm0 and
 * leaves through unaligned_exit on any matching byte.  Otherwise it stores
 * to (%rdi, %rcx) the 16 bytes made of bytes 8..15 of the block at
 * (%rsi, %rcx) followed by bytes 0..7 of the block at 16(%rsi, %rcx), and
 * advances %rcx by 16.  All memory operands use aligned forms (movdqa,
 * memory-operand pcmpeqb/palignr), which require 16-byte-aligned addresses.
 *
 * With USE_AS_STRNCPY, %r8 is reduced by 16 per step and the loops also
 * leave when %r8 <= %r10 or when %r8 was <= 16 before the subtraction.
 *
 * NOTE(review): %xmm0 is presumably all zeroes on entry (making the compare
 * a NUL-byte search) and %r8/%r10 presumably hold the remaining count and
 * an exit threshold; both are set up outside this section -- confirm.
1004*533d3a49SEdward Gillett */
1005*533d3a49SEdward Gillett	.p2align 4
1006*533d3a49SEdward GillettLABEL(ashr_8):
1007*533d3a49SEdward Gillett	xor	%ecx, %ecx				/* clear index */
1008*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
1009*533d3a49SEdward Gillett	cmp	%r10, %r8
1010*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)	/* r8 <= r10 (unsigned): skip the loops */
1011*533d3a49SEdward Gillett#endif
1012*533d3a49SEdward Gillett	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
1013*533d3a49SEdward Gillett	jz	LABEL(ashr_8_use_sse2)	/* USE_SSSE3 bit clear: SSE2 loop */
1014*533d3a49SEdward Gillett
1015*533d3a49SEdward Gillett	.p2align 4
1016*533d3a49SEdward GillettLABEL(ashr_8_use_ssse3):
1017*533d3a49SEdward Gillett	movdqa	16(%rsi, %rcx), %xmm3	/* xmm3 = next source block */
1018*533d3a49SEdward Gillett	pcmpeqb	%xmm3, %xmm0	/* byte-compare it with %xmm0 */
1019*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx	/* edx = one bit per matching byte */
1020*533d3a49SEdward Gillett	test	%edx, %edx
1021*533d3a49SEdward Gillett	jnz	LABEL(unaligned_exit)	/* some byte matched: leave the loop */
1022*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
1023*533d3a49SEdward Gillett	sub	$16, %r8	/* r8 -= 16 */
1024*533d3a49SEdward Gillett 	jbe	LABEL(strncpy_truncation_unaligned)	/* r8 was <= 16 (unsigned) */
1025*533d3a49SEdward Gillett#endif
1026*533d3a49SEdward Gillett
1027*533d3a49SEdward Gillett	#palignr $8, (%rsi, %rcx), %xmm3
1028*533d3a49SEdward Gillett	.byte	0x66, 0x0F, 0x3A ,0x0F	/* hand-encoded palignr opcode */
1029*533d3a49SEdward Gillett	.byte	0x1c, 0x0e, 0x08	/* ModRM, SIB = (%rsi,%rcx),%xmm3; imm8 = 8 */
1030*533d3a49SEdward Gillett
1031*533d3a49SEdward Gillett	movdqa	%xmm3, (%rdi, %rcx)	/* aligned 16-byte store to destination */
1032*533d3a49SEdward Gillett	add	$16, %rcx	/* advance index by one block */
1033*533d3a49SEdward Gillett
	/* second unrolled step: same sequence as above */
1034*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
1035*533d3a49SEdward Gillett	cmp	%r10, %r8
1036*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
1037*533d3a49SEdward Gillett#endif
1038*533d3a49SEdward Gillett	movdqa	16(%rsi, %rcx), %xmm3
1039*533d3a49SEdward Gillett	pcmpeqb %xmm3, %xmm0
1040*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx
1041*533d3a49SEdward Gillett	test	%edx, %edx
1042*533d3a49SEdward Gillett	jnz	LABEL(unaligned_exit)
1043*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
1044*533d3a49SEdward Gillett	sub	$16, %r8
1045*533d3a49SEdward Gillett 	jbe	LABEL(strncpy_truncation_unaligned)
1046*533d3a49SEdward Gillett#endif
1047*533d3a49SEdward Gillett
1048*533d3a49SEdward Gillett	#palignr $8, (%rsi, %rcx), %xmm3
1049*533d3a49SEdward Gillett	.byte	0x66, 0x0F, 0x3A ,0x0F
1050*533d3a49SEdward Gillett	.byte	0x1c, 0x0e, 0x08
1051*533d3a49SEdward Gillett
1052*533d3a49SEdward Gillett	movdqa	%xmm3, (%rdi, %rcx)
1053*533d3a49SEdward Gillett	add	$16, %rcx
1054*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
1055*533d3a49SEdward Gillett	cmp	%r10, %r8
1056*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
1057*533d3a49SEdward Gillett#endif
1058*533d3a49SEdward Gillett	jmp	LABEL(ashr_8_use_ssse3)	/* back to the top of the SSSE3 loop */
1059*533d3a49SEdward Gillett
1060*533d3a49SEdward Gillett	.p2align 4
1061*533d3a49SEdward GillettLABEL(ashr_8_use_sse2):
1062*533d3a49SEdward Gillett	pcmpeqb 16(%rsi, %rcx), %xmm0	/* byte-compare next block with %xmm0 */
1063*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx	/* edx = one bit per matching byte */
1064*533d3a49SEdward Gillett	test	%edx, %edx
1065*533d3a49SEdward Gillett	jnz	LABEL(unaligned_exit)	/* some byte matched: leave the loop */
1066*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
1067*533d3a49SEdward Gillett	sub	$16, %r8	/* r8 -= 16 */
1068*533d3a49SEdward Gillett 	jbe	LABEL(strncpy_truncation_unaligned)	/* r8 was <= 16 (unsigned) */
1069*533d3a49SEdward Gillett#endif
1070*533d3a49SEdward Gillett
1071*533d3a49SEdward Gillett	movdqa	16(%rsi, %rcx), %xmm3	/* xmm3 = next source block */
1072*533d3a49SEdward Gillett	movdqa	(%rsi, %rcx), %xmm2	/* xmm2 = current source block */
1073*533d3a49SEdward Gillett
1074*533d3a49SEdward Gillett	psrldq	$8, %xmm2	/* current bytes 8..15 -> result bytes 0..7 */
1075*533d3a49SEdward Gillett	pslldq	$8, %xmm3	/* next bytes 0..7 -> result bytes 8..15 */
1076*533d3a49SEdward Gillett	por	%xmm2, %xmm3	/* merge the two pieces */
1077*533d3a49SEdward Gillett
1078*533d3a49SEdward Gillett	movdqa	%xmm3, (%rdi, %rcx)	/* aligned 16-byte store to destination */
1079*533d3a49SEdward Gillett	add	$16, %rcx	/* advance index by one block */
1080*533d3a49SEdward Gillett
	/* second unrolled step: same sequence as above */
1081*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
1082*533d3a49SEdward Gillett	cmp	%r10, %r8
1083*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
1084*533d3a49SEdward Gillett#endif
1085*533d3a49SEdward Gillett	pcmpeqb 16(%rsi, %rcx), %xmm0
1086*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx
1087*533d3a49SEdward Gillett	test	%edx, %edx
1088*533d3a49SEdward Gillett	jnz	LABEL(unaligned_exit)
1089*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
1090*533d3a49SEdward Gillett	sub	$16, %r8
1091*533d3a49SEdward Gillett 	jbe	LABEL(strncpy_truncation_unaligned)
1092*533d3a49SEdward Gillett#endif
1093*533d3a49SEdward Gillett
1094*533d3a49SEdward Gillett	movdqa	16(%rsi, %rcx), %xmm3
1095*533d3a49SEdward Gillett	movdqa	(%rsi, %rcx), %xmm2
1096*533d3a49SEdward Gillett
1097*533d3a49SEdward Gillett	psrldq	$8, %xmm2
1098*533d3a49SEdward Gillett	pslldq	$8, %xmm3
1099*533d3a49SEdward Gillett	por	%xmm2, %xmm3
1100*533d3a49SEdward Gillett
1101*533d3a49SEdward Gillett	movdqa	%xmm3, (%rdi, %rcx)
1102*533d3a49SEdward Gillett	add	$16, %rcx
1103*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
1104*533d3a49SEdward Gillett	cmp	%r10, %r8
1105*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
1106*533d3a49SEdward Gillett#endif
1107*533d3a49SEdward Gillett	jmp	LABEL(ashr_8_use_sse2)	/* back to the top of the SSE2 loop */
1108*533d3a49SEdward Gillett
1109*533d3a49SEdward Gillett
1110*533d3a49SEdward Gillett/*
1111*533d3a49SEdward Gillett * ashr_7 handles the following cases:
1112*533d3a49SEdward Gillett * 	(16 + (src offset - dest offset)) % 16 = 7
1113*533d3a49SEdward Gillett *
1114*533d3a49SEdward Gillett * Based on above operation, start from (%r9 + rsi) to the left of this cache
1115*533d3a49SEdward Gillett * bank, there is no null byte.
 *
 * Flow: clear the index in %rcx, then pick the SSSE3 loop (palignr) or the
 * SSE2 loop (psrldq/pslldq/por) from the USE_SSSE3 bit of .memops_method.
 * Both loops are unrolled twice and produce the same bytes.
 *
 * Each step compares the next source block at 16(%rsi, %rcx) with %xmm0 and
 * leaves through unaligned_exit on any matching byte.  Otherwise it stores
 * to (%rdi, %rcx) the 16 bytes made of bytes 7..15 of the block at
 * (%rsi, %rcx) followed by bytes 0..6 of the block at 16(%rsi, %rcx), and
 * advances %rcx by 16.  All memory operands use aligned forms (movdqa,
 * memory-operand pcmpeqb/palignr), which require 16-byte-aligned addresses.
 *
 * With USE_AS_STRNCPY, %r8 is reduced by 16 per step and the loops also
 * leave when %r8 <= %r10 or when %r8 was <= 16 before the subtraction.
 *
 * NOTE(review): %xmm0 is presumably all zeroes on entry (making the compare
 * a NUL-byte search) and %r8/%r10 presumably hold the remaining count and
 * an exit threshold; both are set up outside this section -- confirm.
1116*533d3a49SEdward Gillett */
1117*533d3a49SEdward Gillett	.p2align 4
1118*533d3a49SEdward GillettLABEL(ashr_7):
1119*533d3a49SEdward Gillett	xor	%ecx, %ecx				/* clear index */
1120*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
1121*533d3a49SEdward Gillett	cmp	%r10, %r8
1122*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)	/* r8 <= r10 (unsigned): skip the loops */
1123*533d3a49SEdward Gillett#endif
1124*533d3a49SEdward Gillett	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
1125*533d3a49SEdward Gillett	jz	LABEL(ashr_7_use_sse2)	/* USE_SSSE3 bit clear: SSE2 loop */
1126*533d3a49SEdward Gillett
1127*533d3a49SEdward Gillett	.p2align 4
1128*533d3a49SEdward GillettLABEL(ashr_7_use_ssse3):
1129*533d3a49SEdward Gillett	movdqa	16(%rsi, %rcx), %xmm3	/* xmm3 = next source block */
1130*533d3a49SEdward Gillett	pcmpeqb	%xmm3, %xmm0	/* byte-compare it with %xmm0 */
1131*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx	/* edx = one bit per matching byte */
1132*533d3a49SEdward Gillett	test	%edx, %edx
1133*533d3a49SEdward Gillett	jnz	LABEL(unaligned_exit)	/* some byte matched: leave the loop */
1134*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
1135*533d3a49SEdward Gillett	sub	$16, %r8	/* r8 -= 16 */
1136*533d3a49SEdward Gillett 	jbe	LABEL(strncpy_truncation_unaligned)	/* r8 was <= 16 (unsigned) */
1137*533d3a49SEdward Gillett#endif
1138*533d3a49SEdward Gillett
1139*533d3a49SEdward Gillett	#palignr $7, (%rsi, %rcx), %xmm3
1140*533d3a49SEdward Gillett	.byte	0x66, 0x0F, 0x3A ,0x0F	/* hand-encoded palignr opcode */
1141*533d3a49SEdward Gillett	.byte	0x1c, 0x0e, 0x07	/* ModRM, SIB = (%rsi,%rcx),%xmm3; imm8 = 7 */
1142*533d3a49SEdward Gillett
1143*533d3a49SEdward Gillett	movdqa	%xmm3, (%rdi, %rcx)	/* aligned 16-byte store to destination */
1144*533d3a49SEdward Gillett	add	$16, %rcx	/* advance index by one block */
1145*533d3a49SEdward Gillett
	/* second unrolled step: same sequence as above */
1146*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
1147*533d3a49SEdward Gillett	cmp	%r10, %r8
1148*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
1149*533d3a49SEdward Gillett#endif
1150*533d3a49SEdward Gillett	movdqa	16(%rsi, %rcx), %xmm3
1151*533d3a49SEdward Gillett	pcmpeqb %xmm3, %xmm0
1152*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx
1153*533d3a49SEdward Gillett	test	%edx, %edx
1154*533d3a49SEdward Gillett	jnz	LABEL(unaligned_exit)
1155*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
1156*533d3a49SEdward Gillett	sub	$16, %r8
1157*533d3a49SEdward Gillett 	jbe	LABEL(strncpy_truncation_unaligned)
1158*533d3a49SEdward Gillett#endif
1159*533d3a49SEdward Gillett
1160*533d3a49SEdward Gillett	#palignr $7, (%rsi, %rcx), %xmm3
1161*533d3a49SEdward Gillett	.byte	0x66, 0x0F, 0x3A ,0x0F
1162*533d3a49SEdward Gillett	.byte	0x1c, 0x0e, 0x07
1163*533d3a49SEdward Gillett
1164*533d3a49SEdward Gillett	movdqa	%xmm3, (%rdi, %rcx)
1165*533d3a49SEdward Gillett	add	$16, %rcx
1166*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
1167*533d3a49SEdward Gillett	cmp	%r10, %r8
1168*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
1169*533d3a49SEdward Gillett#endif
1170*533d3a49SEdward Gillett	jmp	LABEL(ashr_7_use_ssse3)	/* back to the top of the SSSE3 loop */
1171*533d3a49SEdward Gillett
1172*533d3a49SEdward Gillett	.p2align 4
1173*533d3a49SEdward GillettLABEL(ashr_7_use_sse2):
1174*533d3a49SEdward Gillett	pcmpeqb 16(%rsi, %rcx), %xmm0	/* byte-compare next block with %xmm0 */
1175*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx	/* edx = one bit per matching byte */
1176*533d3a49SEdward Gillett	test	%edx, %edx
1177*533d3a49SEdward Gillett	jnz	LABEL(unaligned_exit)	/* some byte matched: leave the loop */
1178*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
1179*533d3a49SEdward Gillett	sub	$16, %r8	/* r8 -= 16 */
1180*533d3a49SEdward Gillett 	jbe	LABEL(strncpy_truncation_unaligned)	/* r8 was <= 16 (unsigned) */
1181*533d3a49SEdward Gillett#endif
1182*533d3a49SEdward Gillett
1183*533d3a49SEdward Gillett	movdqa	16(%rsi, %rcx), %xmm3	/* xmm3 = next source block */
1184*533d3a49SEdward Gillett	movdqa	(%rsi, %rcx), %xmm2	/* xmm2 = current source block */
1185*533d3a49SEdward Gillett
1186*533d3a49SEdward Gillett	psrldq	$7, %xmm2	/* current bytes 7..15 -> result bytes 0..8 */
1187*533d3a49SEdward Gillett	pslldq	$9, %xmm3	/* next bytes 0..6 -> result bytes 9..15 */
1188*533d3a49SEdward Gillett	por	%xmm2, %xmm3	/* merge the two pieces */
1189*533d3a49SEdward Gillett
1190*533d3a49SEdward Gillett	movdqa	%xmm3, (%rdi, %rcx)	/* aligned 16-byte store to destination */
1191*533d3a49SEdward Gillett	add	$16, %rcx	/* advance index by one block */
1192*533d3a49SEdward Gillett
	/* second unrolled step: same sequence as above */
1193*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
1194*533d3a49SEdward Gillett	cmp	%r10, %r8
1195*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
1196*533d3a49SEdward Gillett#endif
1197*533d3a49SEdward Gillett	pcmpeqb 16(%rsi, %rcx), %xmm0
1198*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx
1199*533d3a49SEdward Gillett	test	%edx, %edx
1200*533d3a49SEdward Gillett	jnz	LABEL(unaligned_exit)
1201*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
1202*533d3a49SEdward Gillett	sub	$16, %r8
1203*533d3a49SEdward Gillett 	jbe	LABEL(strncpy_truncation_unaligned)
1204*533d3a49SEdward Gillett#endif
1205*533d3a49SEdward Gillett
1206*533d3a49SEdward Gillett	movdqa	16(%rsi, %rcx), %xmm3
1207*533d3a49SEdward Gillett	movdqa	(%rsi, %rcx), %xmm2
1208*533d3a49SEdward Gillett
1209*533d3a49SEdward Gillett	psrldq	$7, %xmm2
1210*533d3a49SEdward Gillett	pslldq	$9, %xmm3
1211*533d3a49SEdward Gillett	por	%xmm2, %xmm3
1212*533d3a49SEdward Gillett
1213*533d3a49SEdward Gillett	movdqa	%xmm3, (%rdi, %rcx)
1214*533d3a49SEdward Gillett	add	$16, %rcx
1215*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
1216*533d3a49SEdward Gillett	cmp	%r10, %r8
1217*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
1218*533d3a49SEdward Gillett#endif
1219*533d3a49SEdward Gillett	jmp	LABEL(ashr_7_use_sse2)	/* back to the top of the SSE2 loop */
1220*533d3a49SEdward Gillett
1221*533d3a49SEdward Gillett
1222*533d3a49SEdward Gillett/*
1223*533d3a49SEdward Gillett * ashr_6 handles the following cases:
1224*533d3a49SEdward Gillett * 	(16 + (src offset - dest offset)) % 16 = 6
1225*533d3a49SEdward Gillett *
1226*533d3a49SEdward Gillett * Based on above operation, start from (%r9 + rsi) to the left of this cache
1227*533d3a49SEdward Gillett * bank, there is no null byte.
 *
 * Flow: clear the index in %rcx, then pick the SSSE3 loop (palignr) or the
 * SSE2 loop (psrldq/pslldq/por) from the USE_SSSE3 bit of .memops_method.
 * Both loops are unrolled twice and produce the same bytes.
 *
 * Each step compares the next source block at 16(%rsi, %rcx) with %xmm0 and
 * leaves through unaligned_exit on any matching byte.  Otherwise it stores
 * to (%rdi, %rcx) the 16 bytes made of bytes 6..15 of the block at
 * (%rsi, %rcx) followed by bytes 0..5 of the block at 16(%rsi, %rcx), and
 * advances %rcx by 16.  All memory operands use aligned forms (movdqa,
 * memory-operand pcmpeqb/palignr), which require 16-byte-aligned addresses.
 *
 * With USE_AS_STRNCPY, %r8 is reduced by 16 per step and the loops also
 * leave when %r8 <= %r10 or when %r8 was <= 16 before the subtraction.
 *
 * NOTE(review): %xmm0 is presumably all zeroes on entry (making the compare
 * a NUL-byte search) and %r8/%r10 presumably hold the remaining count and
 * an exit threshold; both are set up outside this section -- confirm.
1228*533d3a49SEdward Gillett */
1229*533d3a49SEdward Gillett	.p2align 4
1230*533d3a49SEdward GillettLABEL(ashr_6):
1231*533d3a49SEdward Gillett	xor	%ecx, %ecx				/* clear index */
1232*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
1233*533d3a49SEdward Gillett	cmp	%r10, %r8
1234*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)	/* r8 <= r10 (unsigned): skip the loops */
1235*533d3a49SEdward Gillett#endif
1236*533d3a49SEdward Gillett	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
1237*533d3a49SEdward Gillett	jz	LABEL(ashr_6_use_sse2)	/* USE_SSSE3 bit clear: SSE2 loop */
1238*533d3a49SEdward Gillett
1239*533d3a49SEdward Gillett	.p2align 4
1240*533d3a49SEdward GillettLABEL(ashr_6_use_ssse3):
1241*533d3a49SEdward Gillett	movdqa	16(%rsi, %rcx), %xmm3	/* xmm3 = next source block */
1242*533d3a49SEdward Gillett	pcmpeqb	%xmm3, %xmm0	/* byte-compare it with %xmm0 */
1243*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx	/* edx = one bit per matching byte */
1244*533d3a49SEdward Gillett	test	%edx, %edx
1245*533d3a49SEdward Gillett	jnz	LABEL(unaligned_exit)	/* some byte matched: leave the loop */
1246*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
1247*533d3a49SEdward Gillett	sub	$16, %r8	/* r8 -= 16 */
1248*533d3a49SEdward Gillett 	jbe	LABEL(strncpy_truncation_unaligned)	/* r8 was <= 16 (unsigned) */
1249*533d3a49SEdward Gillett#endif
1250*533d3a49SEdward Gillett
1251*533d3a49SEdward Gillett	#palignr $6, (%rsi, %rcx), %xmm3
1252*533d3a49SEdward Gillett	.byte	0x66, 0x0F, 0x3A ,0x0F	/* hand-encoded palignr opcode */
1253*533d3a49SEdward Gillett	.byte	0x1c, 0x0e, 0x06	/* ModRM, SIB = (%rsi,%rcx),%xmm3; imm8 = 6 */
1254*533d3a49SEdward Gillett
1255*533d3a49SEdward Gillett	movdqa	%xmm3, (%rdi, %rcx)	/* aligned 16-byte store to destination */
1256*533d3a49SEdward Gillett	add	$16, %rcx	/* advance index by one block */
1257*533d3a49SEdward Gillett
	/* second unrolled step: same sequence as above */
1258*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
1259*533d3a49SEdward Gillett	cmp	%r10, %r8
1260*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
1261*533d3a49SEdward Gillett#endif
1262*533d3a49SEdward Gillett	movdqa	16(%rsi, %rcx), %xmm3
1263*533d3a49SEdward Gillett	pcmpeqb %xmm3, %xmm0
1264*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx
1265*533d3a49SEdward Gillett	test	%edx, %edx
1266*533d3a49SEdward Gillett	jnz	LABEL(unaligned_exit)
1267*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
1268*533d3a49SEdward Gillett	sub	$16, %r8
1269*533d3a49SEdward Gillett 	jbe	LABEL(strncpy_truncation_unaligned)
1270*533d3a49SEdward Gillett#endif
1271*533d3a49SEdward Gillett
1272*533d3a49SEdward Gillett	#palignr $6, (%rsi, %rcx), %xmm3
1273*533d3a49SEdward Gillett	.byte	0x66, 0x0F, 0x3A ,0x0F
1274*533d3a49SEdward Gillett	.byte	0x1c, 0x0e, 0x06
1275*533d3a49SEdward Gillett
1276*533d3a49SEdward Gillett	movdqa	%xmm3, (%rdi, %rcx)
1277*533d3a49SEdward Gillett	add	$16, %rcx
1278*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
1279*533d3a49SEdward Gillett	cmp	%r10, %r8
1280*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
1281*533d3a49SEdward Gillett#endif
1282*533d3a49SEdward Gillett	jmp	LABEL(ashr_6_use_ssse3)	/* back to the top of the SSSE3 loop */
1283*533d3a49SEdward Gillett
1284*533d3a49SEdward Gillett	.p2align 4
1285*533d3a49SEdward GillettLABEL(ashr_6_use_sse2):
1286*533d3a49SEdward Gillett	pcmpeqb 16(%rsi, %rcx), %xmm0	/* byte-compare next block with %xmm0 */
1287*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx	/* edx = one bit per matching byte */
1288*533d3a49SEdward Gillett	test	%edx, %edx
1289*533d3a49SEdward Gillett	jnz	LABEL(unaligned_exit)	/* some byte matched: leave the loop */
1290*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
1291*533d3a49SEdward Gillett	sub	$16, %r8	/* r8 -= 16 */
1292*533d3a49SEdward Gillett 	jbe	LABEL(strncpy_truncation_unaligned)	/* r8 was <= 16 (unsigned) */
1293*533d3a49SEdward Gillett#endif
1294*533d3a49SEdward Gillett
1295*533d3a49SEdward Gillett	movdqa	16(%rsi, %rcx), %xmm3	/* xmm3 = next source block */
1296*533d3a49SEdward Gillett	movdqa	(%rsi, %rcx), %xmm2	/* xmm2 = current source block */
1297*533d3a49SEdward Gillett
1298*533d3a49SEdward Gillett	psrldq	$6, %xmm2	/* current bytes 6..15 -> result bytes 0..9 */
1299*533d3a49SEdward Gillett	pslldq	$10, %xmm3	/* next bytes 0..5 -> result bytes 10..15 */
1300*533d3a49SEdward Gillett	por	%xmm2, %xmm3	/* merge the two pieces */
1301*533d3a49SEdward Gillett
1302*533d3a49SEdward Gillett	movdqa	%xmm3, (%rdi, %rcx)	/* aligned 16-byte store to destination */
1303*533d3a49SEdward Gillett	add	$16, %rcx	/* advance index by one block */
1304*533d3a49SEdward Gillett
	/* second unrolled step: same sequence as above */
1305*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
1306*533d3a49SEdward Gillett	cmp	%r10, %r8
1307*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
1308*533d3a49SEdward Gillett#endif
1309*533d3a49SEdward Gillett	pcmpeqb 16(%rsi, %rcx), %xmm0
1310*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx
1311*533d3a49SEdward Gillett	test	%edx, %edx
1312*533d3a49SEdward Gillett	jnz	LABEL(unaligned_exit)
1313*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
1314*533d3a49SEdward Gillett	sub	$16, %r8
1315*533d3a49SEdward Gillett 	jbe	LABEL(strncpy_truncation_unaligned)
1316*533d3a49SEdward Gillett#endif
1317*533d3a49SEdward Gillett
1318*533d3a49SEdward Gillett	movdqa	16(%rsi, %rcx), %xmm3
1319*533d3a49SEdward Gillett	movdqa	(%rsi, %rcx), %xmm2
1320*533d3a49SEdward Gillett
1321*533d3a49SEdward Gillett	psrldq	$6, %xmm2
1322*533d3a49SEdward Gillett	pslldq	$10, %xmm3
1323*533d3a49SEdward Gillett	por	%xmm2, %xmm3
1324*533d3a49SEdward Gillett
1325*533d3a49SEdward Gillett	movdqa	%xmm3, (%rdi, %rcx)
1326*533d3a49SEdward Gillett	add	$16, %rcx
1327*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
1328*533d3a49SEdward Gillett	cmp	%r10, %r8
1329*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
1330*533d3a49SEdward Gillett#endif
1331*533d3a49SEdward Gillett	jmp	LABEL(ashr_6_use_sse2)	/* back to the top of the SSE2 loop */
1332*533d3a49SEdward Gillett
1333*533d3a49SEdward Gillett
1334*533d3a49SEdward Gillett/*
1335*533d3a49SEdward Gillett * ashr_5 handles the following cases:
1336*533d3a49SEdward Gillett * 	(16 + (src offset - dest offset)) % 16 = 5
1337*533d3a49SEdward Gillett *
1338*533d3a49SEdward Gillett * Based on the above operation, there is no null byte from (%r9 + %rsi) to
1339*533d3a49SEdward Gillett * the left of this cache bank.
 *
 * Every pass of the loops below first checks the 16 source bytes at
 * 16(%rsi, %rcx) for a null, then stores to (%rdi, %rcx) the 16 bytes formed
 * from bytes 5..15 of (%rsi, %rcx) followed by bytes 0..4 of 16(%rsi, %rcx).
 * %rcx is the running byte index; each loop body is unrolled twice.  The
 * merge is done with palignr when the USE_SSSE3 bit of .memops_method is
 * set, and with psrldq/pslldq/por otherwise.
 *
 * %xmm0 is compared against the source to find nulls, so it is presumably
 * all-zero on entry (set outside this section -- confirm); it is all-zero
 * again after every pass that finds no null.
 *
 * With USE_AS_STRNCPY, %r8 is the count still to be copied.  NOTE(review):
 * %r10 is set outside this section; unaligned_exit uses it as the shift
 * amount applied to the null mask in %edx.
1340*533d3a49SEdward Gillett */
1341*533d3a49SEdward Gillett	.p2align 4
1342*533d3a49SEdward GillettLABEL(ashr_5):
1343*533d3a49SEdward Gillett	xor	%ecx, %ecx				/* clear index */
1344*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
	/* count left (%r8) already <= %r10: skip the loops, go to the exit tail */
1345*533d3a49SEdward Gillett	cmp	%r10, %r8
1346*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
1347*533d3a49SEdward Gillett#endif
1348*533d3a49SEdward Gillett	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
1349*533d3a49SEdward Gillett	jz	LABEL(ashr_5_use_sse2)
1350*533d3a49SEdward Gillett
1351*533d3a49SEdward Gillett	.p2align 4
1352*533d3a49SEdward GillettLABEL(ashr_5_use_ssse3):
	/* pass 1: load the next aligned source block and test it for a null */
1353*533d3a49SEdward Gillett	movdqa	16(%rsi, %rcx), %xmm3
1354*533d3a49SEdward Gillett	pcmpeqb	%xmm3, %xmm0
1355*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx
1356*533d3a49SEdward Gillett	test	%edx, %edx
1357*533d3a49SEdward Gillett	jnz	LABEL(unaligned_exit)
1358*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
	/* take 16 off the count; jbe: it was <= 16, finish via the truncation tail */
1359*533d3a49SEdward Gillett	sub	$16, %r8
1360*533d3a49SEdward Gillett 	jbe	LABEL(strncpy_truncation_unaligned)
1361*533d3a49SEdward Gillett#endif
1362*533d3a49SEdward Gillett
	/*
	 * The .byte lines hand-encode the palignr named in the '#' comment:
	 * 66 0F 3A 0F opcode, ModRM 0x1c (%xmm3, SIB follows), SIB 0x0e
	 * (base %rsi, index %rcx), then the immediate byte shift.
	 */
1363*533d3a49SEdward Gillett	#palignr $5, (%rsi, %rcx), %xmm3
1364*533d3a49SEdward Gillett	.byte	0x66, 0x0F, 0x3A ,0x0F
1365*533d3a49SEdward Gillett	.byte	0x1c, 0x0e, 0x05
1366*533d3a49SEdward Gillett
1367*533d3a49SEdward Gillett	movdqa	%xmm3, (%rdi, %rcx)
1368*533d3a49SEdward Gillett	add	$16, %rcx
1369*533d3a49SEdward Gillett
1370*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
	/* leave the loop once the count left (%r8) is <= %r10 */
1371*533d3a49SEdward Gillett	cmp	%r10, %r8
1372*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
1373*533d3a49SEdward Gillett#endif
	/* pass 2: same steps on the following 16 bytes */
1374*533d3a49SEdward Gillett	movdqa	16(%rsi, %rcx), %xmm3
1375*533d3a49SEdward Gillett	pcmpeqb %xmm3, %xmm0
1376*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx
1377*533d3a49SEdward Gillett	test	%edx, %edx
1378*533d3a49SEdward Gillett	jnz	LABEL(unaligned_exit)
1379*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
1380*533d3a49SEdward Gillett	sub	$16, %r8
1381*533d3a49SEdward Gillett 	jbe	LABEL(strncpy_truncation_unaligned)
1382*533d3a49SEdward Gillett#endif
1383*533d3a49SEdward Gillett
1384*533d3a49SEdward Gillett	#palignr $5, (%rsi, %rcx), %xmm3
1385*533d3a49SEdward Gillett	.byte	0x66, 0x0F, 0x3A ,0x0F
1386*533d3a49SEdward Gillett	.byte	0x1c, 0x0e, 0x05
1387*533d3a49SEdward Gillett
1388*533d3a49SEdward Gillett	movdqa	%xmm3, (%rdi, %rcx)
1389*533d3a49SEdward Gillett	add	$16, %rcx
1390*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
1391*533d3a49SEdward Gillett	cmp	%r10, %r8
1392*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
1393*533d3a49SEdward Gillett#endif
1394*533d3a49SEdward Gillett	jmp	LABEL(ashr_5_use_ssse3)
1395*533d3a49SEdward Gillett
1396*533d3a49SEdward Gillett	.p2align 4
1397*533d3a49SEdward GillettLABEL(ashr_5_use_sse2):
	/* pass 1: test the next aligned source block for a null byte */
1398*533d3a49SEdward Gillett	pcmpeqb 16(%rsi, %rcx), %xmm0
1399*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx
1400*533d3a49SEdward Gillett	test	%edx, %edx
1401*533d3a49SEdward Gillett	jnz	LABEL(unaligned_exit)
1402*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
1403*533d3a49SEdward Gillett	sub	$16, %r8
1404*533d3a49SEdward Gillett 	jbe	LABEL(strncpy_truncation_unaligned)
1405*533d3a49SEdward Gillett#endif
1406*533d3a49SEdward Gillett
1407*533d3a49SEdward Gillett	movdqa	16(%rsi, %rcx), %xmm3
1408*533d3a49SEdward Gillett	movdqa	(%rsi, %rcx), %xmm2
1409*533d3a49SEdward Gillett
	/* low block >> 5 bytes, high block << 11 bytes, then merge the two */
1410*533d3a49SEdward Gillett	psrldq	$5, %xmm2
1411*533d3a49SEdward Gillett	pslldq	$11, %xmm3
1412*533d3a49SEdward Gillett	por	%xmm2, %xmm3
1413*533d3a49SEdward Gillett
1414*533d3a49SEdward Gillett	movdqa	%xmm3, (%rdi, %rcx)
1415*533d3a49SEdward Gillett	add	$16, %rcx
1416*533d3a49SEdward Gillett
1417*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
1418*533d3a49SEdward Gillett	cmp	%r10, %r8
1419*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
1420*533d3a49SEdward Gillett#endif
	/* pass 2: same steps on the following 16 bytes */
1421*533d3a49SEdward Gillett	pcmpeqb 16(%rsi, %rcx), %xmm0
1422*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx
1423*533d3a49SEdward Gillett	test	%edx, %edx
1424*533d3a49SEdward Gillett	jnz	LABEL(unaligned_exit)
1425*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
1426*533d3a49SEdward Gillett	sub	$16, %r8
1427*533d3a49SEdward Gillett 	jbe	LABEL(strncpy_truncation_unaligned)
1428*533d3a49SEdward Gillett#endif
1429*533d3a49SEdward Gillett
1430*533d3a49SEdward Gillett	movdqa	16(%rsi, %rcx), %xmm3
1431*533d3a49SEdward Gillett	movdqa	(%rsi, %rcx), %xmm2
1432*533d3a49SEdward Gillett
1433*533d3a49SEdward Gillett	psrldq	$5, %xmm2
1434*533d3a49SEdward Gillett	pslldq	$11, %xmm3
1435*533d3a49SEdward Gillett	por	%xmm2, %xmm3
1436*533d3a49SEdward Gillett
1437*533d3a49SEdward Gillett	movdqa	%xmm3, (%rdi, %rcx)
1438*533d3a49SEdward Gillett	add	$16, %rcx
1439*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
1440*533d3a49SEdward Gillett	cmp	%r10, %r8
1441*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
1442*533d3a49SEdward Gillett#endif
1443*533d3a49SEdward Gillett	jmp	LABEL(ashr_5_use_sse2)
1444*533d3a49SEdward Gillett
1445*533d3a49SEdward Gillett
1446*533d3a49SEdward Gillett/*
1447*533d3a49SEdward Gillett * ashr_4 handles the following cases:
1448*533d3a49SEdward Gillett * 	(16 + (src offset - dest offset)) % 16 = 4
1449*533d3a49SEdward Gillett *
1450*533d3a49SEdward Gillett * Based on the above operation, there is no null byte from (%r9 + %rsi) to
1451*533d3a49SEdward Gillett * the left of this cache bank.
 *
 * Every pass of the loops below first checks the 16 source bytes at
 * 16(%rsi, %rcx) for a null, then stores to (%rdi, %rcx) the 16 bytes formed
 * from bytes 4..15 of (%rsi, %rcx) followed by bytes 0..3 of 16(%rsi, %rcx).
 * %rcx is the running byte index; each loop body is unrolled twice.  The
 * merge is done with palignr when the USE_SSSE3 bit of .memops_method is
 * set, and with psrldq/pslldq/por otherwise.
 *
 * %xmm0 is compared against the source to find nulls, so it is presumably
 * all-zero on entry (set outside this section -- confirm); it is all-zero
 * again after every pass that finds no null.
 *
 * With USE_AS_STRNCPY, %r8 is the count still to be copied.  NOTE(review):
 * %r10 is set outside this section; unaligned_exit uses it as the shift
 * amount applied to the null mask in %edx.
1452*533d3a49SEdward Gillett */
1453*533d3a49SEdward Gillett	.p2align 4
1454*533d3a49SEdward GillettLABEL(ashr_4):
1455*533d3a49SEdward Gillett	xor	%ecx, %ecx				/* clear index */
1456*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
	/* count left (%r8) already <= %r10: skip the loops, go to the exit tail */
1457*533d3a49SEdward Gillett	cmp	%r10, %r8
1458*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
1459*533d3a49SEdward Gillett#endif
1460*533d3a49SEdward Gillett	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
1461*533d3a49SEdward Gillett	jz	LABEL(ashr_4_use_sse2)
1462*533d3a49SEdward Gillett
1463*533d3a49SEdward Gillett	.p2align 4
1464*533d3a49SEdward GillettLABEL(ashr_4_use_ssse3):
	/* pass 1: load the next aligned source block and test it for a null */
1465*533d3a49SEdward Gillett	movdqa	16(%rsi, %rcx), %xmm3
1466*533d3a49SEdward Gillett	pcmpeqb	%xmm3, %xmm0
1467*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx
1468*533d3a49SEdward Gillett	test	%edx, %edx
1469*533d3a49SEdward Gillett	jnz	LABEL(unaligned_exit)
1470*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
	/* take 16 off the count; jbe: it was <= 16, finish via the truncation tail */
1471*533d3a49SEdward Gillett	sub	$16, %r8
1472*533d3a49SEdward Gillett 	jbe	LABEL(strncpy_truncation_unaligned)
1473*533d3a49SEdward Gillett#endif
1474*533d3a49SEdward Gillett
	/*
	 * The .byte lines hand-encode the palignr named in the '#' comment:
	 * 66 0F 3A 0F opcode, ModRM 0x1c (%xmm3, SIB follows), SIB 0x0e
	 * (base %rsi, index %rcx), then the immediate byte shift.
	 */
1475*533d3a49SEdward Gillett	#palignr $4, (%rsi, %rcx), %xmm3
1476*533d3a49SEdward Gillett	.byte	0x66, 0x0F, 0x3A ,0x0F
1477*533d3a49SEdward Gillett	.byte	0x1c, 0x0e, 0x04
1478*533d3a49SEdward Gillett
1479*533d3a49SEdward Gillett	movdqa	%xmm3, (%rdi, %rcx)
1480*533d3a49SEdward Gillett	add	$16, %rcx
1481*533d3a49SEdward Gillett
1482*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
	/* leave the loop once the count left (%r8) is <= %r10 */
1483*533d3a49SEdward Gillett	cmp	%r10, %r8
1484*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
1485*533d3a49SEdward Gillett#endif
	/* pass 2: same steps on the following 16 bytes */
1486*533d3a49SEdward Gillett	movdqa	16(%rsi, %rcx), %xmm3
1487*533d3a49SEdward Gillett	pcmpeqb %xmm3, %xmm0
1488*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx
1489*533d3a49SEdward Gillett	test	%edx, %edx
1490*533d3a49SEdward Gillett	jnz	LABEL(unaligned_exit)
1491*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
1492*533d3a49SEdward Gillett	sub	$16, %r8
1493*533d3a49SEdward Gillett 	jbe	LABEL(strncpy_truncation_unaligned)
1494*533d3a49SEdward Gillett#endif
1495*533d3a49SEdward Gillett
1496*533d3a49SEdward Gillett	#palignr $4, (%rsi, %rcx), %xmm3
1497*533d3a49SEdward Gillett	.byte	0x66, 0x0F, 0x3A ,0x0F
1498*533d3a49SEdward Gillett	.byte	0x1c, 0x0e, 0x04
1499*533d3a49SEdward Gillett
1500*533d3a49SEdward Gillett	movdqa	%xmm3, (%rdi, %rcx)
1501*533d3a49SEdward Gillett	add	$16, %rcx
1502*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
1503*533d3a49SEdward Gillett	cmp	%r10, %r8
1504*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
1505*533d3a49SEdward Gillett#endif
1506*533d3a49SEdward Gillett	jmp	LABEL(ashr_4_use_ssse3)
1507*533d3a49SEdward Gillett
1508*533d3a49SEdward Gillett	.p2align 4
1509*533d3a49SEdward GillettLABEL(ashr_4_use_sse2):
	/* pass 1: test the next aligned source block for a null byte */
1510*533d3a49SEdward Gillett	pcmpeqb 16(%rsi, %rcx), %xmm0
1511*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx
1512*533d3a49SEdward Gillett	test	%edx, %edx
1513*533d3a49SEdward Gillett	jnz	LABEL(unaligned_exit)
1514*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
1515*533d3a49SEdward Gillett	sub	$16, %r8
1516*533d3a49SEdward Gillett 	jbe	LABEL(strncpy_truncation_unaligned)
1517*533d3a49SEdward Gillett#endif
1518*533d3a49SEdward Gillett
1519*533d3a49SEdward Gillett	movdqa	16(%rsi, %rcx), %xmm3
1520*533d3a49SEdward Gillett	movdqa	(%rsi, %rcx), %xmm2
1521*533d3a49SEdward Gillett
	/* low block >> 4 bytes, high block << 12 bytes, then merge the two */
1522*533d3a49SEdward Gillett	psrldq	$4, %xmm2
1523*533d3a49SEdward Gillett	pslldq	$12, %xmm3
1524*533d3a49SEdward Gillett	por	%xmm2, %xmm3
1525*533d3a49SEdward Gillett
1526*533d3a49SEdward Gillett	movdqa	%xmm3, (%rdi, %rcx)
1527*533d3a49SEdward Gillett	add	$16, %rcx
1528*533d3a49SEdward Gillett
1529*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
1530*533d3a49SEdward Gillett	cmp	%r10, %r8
1531*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
1532*533d3a49SEdward Gillett#endif
	/* pass 2: same steps on the following 16 bytes */
1533*533d3a49SEdward Gillett	pcmpeqb 16(%rsi, %rcx), %xmm0
1534*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx
1535*533d3a49SEdward Gillett	test	%edx, %edx
1536*533d3a49SEdward Gillett	jnz	LABEL(unaligned_exit)
1537*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
1538*533d3a49SEdward Gillett	sub	$16, %r8
1539*533d3a49SEdward Gillett 	jbe	LABEL(strncpy_truncation_unaligned)
1540*533d3a49SEdward Gillett#endif
1541*533d3a49SEdward Gillett
1542*533d3a49SEdward Gillett	movdqa	16(%rsi, %rcx), %xmm3
1543*533d3a49SEdward Gillett	movdqa	(%rsi, %rcx), %xmm2
1544*533d3a49SEdward Gillett
1545*533d3a49SEdward Gillett	psrldq	$4, %xmm2
1546*533d3a49SEdward Gillett	pslldq	$12, %xmm3
1547*533d3a49SEdward Gillett	por	%xmm2, %xmm3
1548*533d3a49SEdward Gillett
1549*533d3a49SEdward Gillett	movdqa	%xmm3, (%rdi, %rcx)
1550*533d3a49SEdward Gillett	add	$16, %rcx
1551*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
1552*533d3a49SEdward Gillett	cmp	%r10, %r8
1553*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
1554*533d3a49SEdward Gillett#endif
1555*533d3a49SEdward Gillett	jmp	LABEL(ashr_4_use_sse2)
1556*533d3a49SEdward Gillett
1557*533d3a49SEdward Gillett
1558*533d3a49SEdward Gillett/*
1559*533d3a49SEdward Gillett * ashr_3 handles the following cases:
1560*533d3a49SEdward Gillett * 	(16 + (src offset - dest offset)) % 16 = 3
1561*533d3a49SEdward Gillett *
1562*533d3a49SEdward Gillett * Based on the above operation, there is no null byte from (%r9 + %rsi) to
1563*533d3a49SEdward Gillett * the left of this cache bank.
 *
 * Every pass of the loops below first checks the 16 source bytes at
 * 16(%rsi, %rcx) for a null, then stores to (%rdi, %rcx) the 16 bytes formed
 * from bytes 3..15 of (%rsi, %rcx) followed by bytes 0..2 of 16(%rsi, %rcx).
 * %rcx is the running byte index; each loop body is unrolled twice.  The
 * merge is done with palignr when the USE_SSSE3 bit of .memops_method is
 * set, and with psrldq/pslldq/por otherwise.
 *
 * %xmm0 is compared against the source to find nulls, so it is presumably
 * all-zero on entry (set outside this section -- confirm); it is all-zero
 * again after every pass that finds no null.
 *
 * With USE_AS_STRNCPY, %r8 is the count still to be copied.  NOTE(review):
 * %r10 is set outside this section; unaligned_exit uses it as the shift
 * amount applied to the null mask in %edx.
1564*533d3a49SEdward Gillett */
1565*533d3a49SEdward Gillett	.p2align 4
1566*533d3a49SEdward GillettLABEL(ashr_3):
1567*533d3a49SEdward Gillett	xor	%ecx, %ecx				/* clear index */
1568*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
	/* count left (%r8) already <= %r10: skip the loops, go to the exit tail */
1569*533d3a49SEdward Gillett	cmp	%r10, %r8
1570*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
1571*533d3a49SEdward Gillett#endif
1572*533d3a49SEdward Gillett	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
1573*533d3a49SEdward Gillett	jz	LABEL(ashr_3_use_sse2)
1574*533d3a49SEdward Gillett
1575*533d3a49SEdward Gillett	.p2align 4
1576*533d3a49SEdward GillettLABEL(ashr_3_use_ssse3):
	/* pass 1: load the next aligned source block and test it for a null */
1577*533d3a49SEdward Gillett	movdqa	16(%rsi, %rcx), %xmm3
1578*533d3a49SEdward Gillett	pcmpeqb	%xmm3, %xmm0
1579*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx
1580*533d3a49SEdward Gillett	test	%edx, %edx
1581*533d3a49SEdward Gillett	jnz	LABEL(unaligned_exit)
1582*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
	/* take 16 off the count; jbe: it was <= 16, finish via the truncation tail */
1583*533d3a49SEdward Gillett	sub	$16, %r8
1584*533d3a49SEdward Gillett 	jbe	LABEL(strncpy_truncation_unaligned)
1585*533d3a49SEdward Gillett#endif
1586*533d3a49SEdward Gillett
	/*
	 * The .byte lines hand-encode the palignr named in the '#' comment:
	 * 66 0F 3A 0F opcode, ModRM 0x1c (%xmm3, SIB follows), SIB 0x0e
	 * (base %rsi, index %rcx), then the immediate byte shift.
	 */
1587*533d3a49SEdward Gillett	#palignr $3, (%rsi, %rcx), %xmm3
1588*533d3a49SEdward Gillett	.byte	0x66, 0x0F, 0x3A ,0x0F
1589*533d3a49SEdward Gillett	.byte	0x1c, 0x0e, 0x03
1590*533d3a49SEdward Gillett
1591*533d3a49SEdward Gillett	movdqa	%xmm3, (%rdi, %rcx)
1592*533d3a49SEdward Gillett	add	$16, %rcx
1593*533d3a49SEdward Gillett
1594*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
	/* leave the loop once the count left (%r8) is <= %r10 */
1595*533d3a49SEdward Gillett	cmp	%r10, %r8
1596*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
1597*533d3a49SEdward Gillett#endif
	/* pass 2: same steps on the following 16 bytes */
1598*533d3a49SEdward Gillett	movdqa	16(%rsi, %rcx), %xmm3
1599*533d3a49SEdward Gillett	pcmpeqb %xmm3, %xmm0
1600*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx
1601*533d3a49SEdward Gillett	test	%edx, %edx
1602*533d3a49SEdward Gillett	jnz	LABEL(unaligned_exit)
1603*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
1604*533d3a49SEdward Gillett	sub	$16, %r8
1605*533d3a49SEdward Gillett 	jbe	LABEL(strncpy_truncation_unaligned)
1606*533d3a49SEdward Gillett#endif
1607*533d3a49SEdward Gillett
1608*533d3a49SEdward Gillett	#palignr $3, (%rsi, %rcx), %xmm3
1609*533d3a49SEdward Gillett	.byte	0x66, 0x0F, 0x3A ,0x0F
1610*533d3a49SEdward Gillett	.byte	0x1c, 0x0e, 0x03
1611*533d3a49SEdward Gillett
1612*533d3a49SEdward Gillett	movdqa	%xmm3, (%rdi, %rcx)
1613*533d3a49SEdward Gillett	add	$16, %rcx
1614*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
1615*533d3a49SEdward Gillett	cmp	%r10, %r8
1616*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
1617*533d3a49SEdward Gillett#endif
1618*533d3a49SEdward Gillett	jmp	LABEL(ashr_3_use_ssse3)
1619*533d3a49SEdward Gillett
1620*533d3a49SEdward Gillett	.p2align 4
1621*533d3a49SEdward GillettLABEL(ashr_3_use_sse2):
	/* pass 1: test the next aligned source block for a null byte */
1622*533d3a49SEdward Gillett	pcmpeqb 16(%rsi, %rcx), %xmm0
1623*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx
1624*533d3a49SEdward Gillett	test	%edx, %edx
1625*533d3a49SEdward Gillett	jnz	LABEL(unaligned_exit)
1626*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
1627*533d3a49SEdward Gillett	sub	$16, %r8
1628*533d3a49SEdward Gillett 	jbe	LABEL(strncpy_truncation_unaligned)
1629*533d3a49SEdward Gillett#endif
1630*533d3a49SEdward Gillett
1631*533d3a49SEdward Gillett	movdqa	16(%rsi, %rcx), %xmm3
1632*533d3a49SEdward Gillett	movdqa	(%rsi, %rcx), %xmm2
1633*533d3a49SEdward Gillett
	/* low block >> 3 bytes, high block << 13 bytes, then merge the two */
1634*533d3a49SEdward Gillett	psrldq	$3, %xmm2
1635*533d3a49SEdward Gillett	pslldq	$13, %xmm3
1636*533d3a49SEdward Gillett	por	%xmm2, %xmm3
1637*533d3a49SEdward Gillett
1638*533d3a49SEdward Gillett	movdqa	%xmm3, (%rdi, %rcx)
1639*533d3a49SEdward Gillett	add	$16, %rcx
1640*533d3a49SEdward Gillett
1641*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
1642*533d3a49SEdward Gillett	cmp	%r10, %r8
1643*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
1644*533d3a49SEdward Gillett#endif
	/* pass 2: same steps on the following 16 bytes */
1645*533d3a49SEdward Gillett	pcmpeqb 16(%rsi, %rcx), %xmm0
1646*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx
1647*533d3a49SEdward Gillett	test	%edx, %edx
1648*533d3a49SEdward Gillett	jnz	LABEL(unaligned_exit)
1649*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
1650*533d3a49SEdward Gillett	sub	$16, %r8
1651*533d3a49SEdward Gillett 	jbe	LABEL(strncpy_truncation_unaligned)
1652*533d3a49SEdward Gillett#endif
1653*533d3a49SEdward Gillett
1654*533d3a49SEdward Gillett	movdqa	16(%rsi, %rcx), %xmm3
1655*533d3a49SEdward Gillett	movdqa	(%rsi, %rcx), %xmm2
1656*533d3a49SEdward Gillett
1657*533d3a49SEdward Gillett	psrldq	$3, %xmm2
1658*533d3a49SEdward Gillett	pslldq	$13, %xmm3
1659*533d3a49SEdward Gillett	por	%xmm2, %xmm3
1660*533d3a49SEdward Gillett
1661*533d3a49SEdward Gillett	movdqa	%xmm3, (%rdi, %rcx)
1662*533d3a49SEdward Gillett	add	$16, %rcx
1663*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
1664*533d3a49SEdward Gillett	cmp	%r10, %r8
1665*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
1666*533d3a49SEdward Gillett#endif
1667*533d3a49SEdward Gillett	jmp	LABEL(ashr_3_use_sse2)
1668*533d3a49SEdward Gillett
1669*533d3a49SEdward Gillett
1670*533d3a49SEdward Gillett/*
1671*533d3a49SEdward Gillett * ashr_2 handles the following cases:
1672*533d3a49SEdward Gillett * 	(16 + (src offset - dest offset)) % 16 = 2
1673*533d3a49SEdward Gillett *
1674*533d3a49SEdward Gillett * Based on the above operation, there is no null byte from (%r9 + %rsi) to
1675*533d3a49SEdward Gillett * the left of this cache bank.
 *
 * Every pass of the loops below first checks the 16 source bytes at
 * 16(%rsi, %rcx) for a null, then stores to (%rdi, %rcx) the 16 bytes formed
 * from bytes 2..15 of (%rsi, %rcx) followed by bytes 0..1 of 16(%rsi, %rcx).
 * %rcx is the running byte index; each loop body is unrolled twice.  The
 * merge is done with palignr when the USE_SSSE3 bit of .memops_method is
 * set, and with psrldq/pslldq/por otherwise.
 *
 * %xmm0 is compared against the source to find nulls, so it is presumably
 * all-zero on entry (set outside this section -- confirm); it is all-zero
 * again after every pass that finds no null.
 *
 * With USE_AS_STRNCPY, %r8 is the count still to be copied.  NOTE(review):
 * %r10 is set outside this section; unaligned_exit uses it as the shift
 * amount applied to the null mask in %edx.
1676*533d3a49SEdward Gillett */
1677*533d3a49SEdward Gillett	.p2align 4
1678*533d3a49SEdward GillettLABEL(ashr_2):
1679*533d3a49SEdward Gillett	xor	%ecx, %ecx				/* clear index */
1680*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
	/* count left (%r8) already <= %r10: skip the loops, go to the exit tail */
1681*533d3a49SEdward Gillett	cmp	%r10, %r8
1682*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
1683*533d3a49SEdward Gillett#endif
1684*533d3a49SEdward Gillett	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
1685*533d3a49SEdward Gillett	jz	LABEL(ashr_2_use_sse2)
1686*533d3a49SEdward Gillett
1687*533d3a49SEdward Gillett	.p2align 4
1688*533d3a49SEdward GillettLABEL(ashr_2_use_ssse3):
	/* pass 1: load the next aligned source block and test it for a null */
1689*533d3a49SEdward Gillett	movdqa	16(%rsi, %rcx), %xmm3
1690*533d3a49SEdward Gillett	pcmpeqb	%xmm3, %xmm0
1691*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx
1692*533d3a49SEdward Gillett	test	%edx, %edx
1693*533d3a49SEdward Gillett	jnz	LABEL(unaligned_exit)
1694*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
	/* take 16 off the count; jbe: it was <= 16, finish via the truncation tail */
1695*533d3a49SEdward Gillett	sub	$16, %r8
1696*533d3a49SEdward Gillett 	jbe	LABEL(strncpy_truncation_unaligned)
1697*533d3a49SEdward Gillett#endif
1698*533d3a49SEdward Gillett
	/*
	 * The .byte lines hand-encode the palignr named in the '#' comment:
	 * 66 0F 3A 0F opcode, ModRM 0x1c (%xmm3, SIB follows), SIB 0x0e
	 * (base %rsi, index %rcx), then the immediate byte shift.
	 */
1699*533d3a49SEdward Gillett	#palignr $2, (%rsi, %rcx), %xmm3
1700*533d3a49SEdward Gillett	.byte	0x66, 0x0F, 0x3A ,0x0F
1701*533d3a49SEdward Gillett	.byte	0x1c, 0x0e, 0x02
1702*533d3a49SEdward Gillett
1703*533d3a49SEdward Gillett	movdqa	%xmm3, (%rdi, %rcx)
1704*533d3a49SEdward Gillett	add	$16, %rcx
1705*533d3a49SEdward Gillett
1706*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
	/* leave the loop once the count left (%r8) is <= %r10 */
1707*533d3a49SEdward Gillett	cmp	%r10, %r8
1708*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
1709*533d3a49SEdward Gillett#endif
	/* pass 2: same steps on the following 16 bytes */
1710*533d3a49SEdward Gillett	movdqa	16(%rsi, %rcx), %xmm3
1711*533d3a49SEdward Gillett	pcmpeqb %xmm3, %xmm0
1712*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx
1713*533d3a49SEdward Gillett	test	%edx, %edx
1714*533d3a49SEdward Gillett	jnz	LABEL(unaligned_exit)
1715*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
1716*533d3a49SEdward Gillett	sub	$16, %r8
1717*533d3a49SEdward Gillett 	jbe	LABEL(strncpy_truncation_unaligned)
1718*533d3a49SEdward Gillett#endif
1719*533d3a49SEdward Gillett
1720*533d3a49SEdward Gillett	#palignr $2, (%rsi, %rcx), %xmm3
1721*533d3a49SEdward Gillett	.byte	0x66, 0x0F, 0x3A ,0x0F
1722*533d3a49SEdward Gillett	.byte	0x1c, 0x0e, 0x02
1723*533d3a49SEdward Gillett
1724*533d3a49SEdward Gillett	movdqa	%xmm3, (%rdi, %rcx)
1725*533d3a49SEdward Gillett	add	$16, %rcx
1726*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
1727*533d3a49SEdward Gillett	cmp	%r10, %r8
1728*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
1729*533d3a49SEdward Gillett#endif
1730*533d3a49SEdward Gillett	jmp	LABEL(ashr_2_use_ssse3)
1731*533d3a49SEdward Gillett
1732*533d3a49SEdward Gillett	.p2align 4
1733*533d3a49SEdward GillettLABEL(ashr_2_use_sse2):
	/* pass 1: test the next aligned source block for a null byte */
1734*533d3a49SEdward Gillett	pcmpeqb 16(%rsi, %rcx), %xmm0
1735*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx
1736*533d3a49SEdward Gillett	test	%edx, %edx
1737*533d3a49SEdward Gillett	jnz	LABEL(unaligned_exit)
1738*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
1739*533d3a49SEdward Gillett	sub	$16, %r8
1740*533d3a49SEdward Gillett 	jbe	LABEL(strncpy_truncation_unaligned)
1741*533d3a49SEdward Gillett#endif
1742*533d3a49SEdward Gillett
1743*533d3a49SEdward Gillett	movdqa	16(%rsi, %rcx), %xmm3
1744*533d3a49SEdward Gillett	movdqa	(%rsi, %rcx), %xmm2
1745*533d3a49SEdward Gillett
	/* low block >> 2 bytes, high block << 14 bytes, then merge the two */
1746*533d3a49SEdward Gillett	psrldq	$2, %xmm2
1747*533d3a49SEdward Gillett	pslldq	$14, %xmm3
1748*533d3a49SEdward Gillett	por	%xmm2, %xmm3
1749*533d3a49SEdward Gillett
1750*533d3a49SEdward Gillett	movdqa	%xmm3, (%rdi, %rcx)
1751*533d3a49SEdward Gillett	add	$16, %rcx
1752*533d3a49SEdward Gillett
1753*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
1754*533d3a49SEdward Gillett	cmp	%r10, %r8
1755*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
1756*533d3a49SEdward Gillett#endif
	/* pass 2: same steps on the following 16 bytes */
1757*533d3a49SEdward Gillett	pcmpeqb 16(%rsi, %rcx), %xmm0
1758*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx
1759*533d3a49SEdward Gillett	test	%edx, %edx
1760*533d3a49SEdward Gillett	jnz	LABEL(unaligned_exit)
1761*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
1762*533d3a49SEdward Gillett	sub	$16, %r8
1763*533d3a49SEdward Gillett 	jbe	LABEL(strncpy_truncation_unaligned)
1764*533d3a49SEdward Gillett#endif
1765*533d3a49SEdward Gillett
1766*533d3a49SEdward Gillett	movdqa	16(%rsi, %rcx), %xmm3
1767*533d3a49SEdward Gillett	movdqa	(%rsi, %rcx), %xmm2
1768*533d3a49SEdward Gillett
1769*533d3a49SEdward Gillett	psrldq	$2, %xmm2
1770*533d3a49SEdward Gillett	pslldq	$14, %xmm3
1771*533d3a49SEdward Gillett	por	%xmm2, %xmm3
1772*533d3a49SEdward Gillett
1773*533d3a49SEdward Gillett	movdqa	%xmm3, (%rdi, %rcx)
1774*533d3a49SEdward Gillett	add	$16, %rcx
1775*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
1776*533d3a49SEdward Gillett	cmp	%r10, %r8
1777*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
1778*533d3a49SEdward Gillett#endif
1779*533d3a49SEdward Gillett	jmp	LABEL(ashr_2_use_sse2)
1780*533d3a49SEdward Gillett
1781*533d3a49SEdward Gillett
1782*533d3a49SEdward Gillett/*
1783*533d3a49SEdward Gillett * ashr_1 handles the following cases:
1784*533d3a49SEdward Gillett * 	(16 + (src offset - dest offset)) % 16 = 1
1785*533d3a49SEdward Gillett *
1786*533d3a49SEdward Gillett * Based on the above operation, there is no null byte from (%r9 + %rsi) to
1787*533d3a49SEdward Gillett * the left of this cache bank.
 *
 * Every pass of the loops below first checks the 16 source bytes at
 * 16(%rsi, %rcx) for a null, then stores to (%rdi, %rcx) the 16 bytes formed
 * from bytes 1..15 of (%rsi, %rcx) followed by byte 0 of 16(%rsi, %rcx).
 * %rcx is the running byte index; each loop body is unrolled twice.  The
 * merge is done with palignr when the USE_SSSE3 bit of .memops_method is
 * set, and with psrldq/pslldq/por otherwise.
 *
 * %xmm0 is compared against the source to find nulls, so it is presumably
 * all-zero on entry (set outside this section -- confirm); it is all-zero
 * again after every pass that finds no null.
 *
 * With USE_AS_STRNCPY, %r8 is the count still to be copied.  NOTE(review):
 * %r10 is set outside this section; unaligned_exit uses it as the shift
 * amount applied to the null mask in %edx.
1788*533d3a49SEdward Gillett */
1789*533d3a49SEdward Gillett	.p2align 4
1790*533d3a49SEdward GillettLABEL(ashr_1):
1791*533d3a49SEdward Gillett	xor	%ecx, %ecx				/* clear index */
1792*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
	/* count left (%r8) already <= %r10: skip the loops, go to the exit tail */
1793*533d3a49SEdward Gillett	cmp	%r10, %r8
1794*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
1795*533d3a49SEdward Gillett#endif
1796*533d3a49SEdward Gillett	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
1797*533d3a49SEdward Gillett	jz	LABEL(ashr_1_use_sse2)
1798*533d3a49SEdward Gillett
1799*533d3a49SEdward Gillett	.p2align 4
1800*533d3a49SEdward GillettLABEL(ashr_1_use_ssse3):
	/* pass 1: load the next aligned source block and test it for a null */
1801*533d3a49SEdward Gillett	movdqa	16(%rsi, %rcx), %xmm3
1802*533d3a49SEdward Gillett	pcmpeqb	%xmm3, %xmm0
1803*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx
1804*533d3a49SEdward Gillett	test	%edx, %edx
1805*533d3a49SEdward Gillett	jnz	LABEL(unaligned_exit)
1806*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
	/* take 16 off the count; jbe: it was <= 16, finish via the truncation tail */
1807*533d3a49SEdward Gillett	sub	$16, %r8
1808*533d3a49SEdward Gillett 	jbe	LABEL(strncpy_truncation_unaligned)
1809*533d3a49SEdward Gillett#endif
1810*533d3a49SEdward Gillett
	/*
	 * The .byte lines hand-encode the palignr named in the '#' comment:
	 * 66 0F 3A 0F opcode, ModRM 0x1c (%xmm3, SIB follows), SIB 0x0e
	 * (base %rsi, index %rcx), then the immediate byte shift.
	 */
1811*533d3a49SEdward Gillett	#palignr $1, (%rsi, %rcx), %xmm3
1812*533d3a49SEdward Gillett	.byte	0x66, 0x0F, 0x3A ,0x0F
1813*533d3a49SEdward Gillett	.byte	0x1c, 0x0e, 0x01
1814*533d3a49SEdward Gillett
1815*533d3a49SEdward Gillett	movdqa	%xmm3, (%rdi, %rcx)
1816*533d3a49SEdward Gillett	add	$16, %rcx
1817*533d3a49SEdward Gillett
1818*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
	/* leave the loop once the count left (%r8) is <= %r10 */
1819*533d3a49SEdward Gillett	cmp	%r10, %r8
1820*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
1821*533d3a49SEdward Gillett#endif
	/* pass 2: same steps on the following 16 bytes */
1822*533d3a49SEdward Gillett	movdqa	16(%rsi, %rcx), %xmm3
1823*533d3a49SEdward Gillett	pcmpeqb %xmm3, %xmm0
1824*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx
1825*533d3a49SEdward Gillett	test	%edx, %edx
1826*533d3a49SEdward Gillett	jnz	LABEL(unaligned_exit)
1827*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
1828*533d3a49SEdward Gillett	sub	$16, %r8
1829*533d3a49SEdward Gillett 	jbe	LABEL(strncpy_truncation_unaligned)
1830*533d3a49SEdward Gillett#endif
1831*533d3a49SEdward Gillett	#palignr $1, (%rsi, %rcx), %xmm3
1832*533d3a49SEdward Gillett	.byte	0x66, 0x0F, 0x3A ,0x0F
1833*533d3a49SEdward Gillett	.byte	0x1c, 0x0e, 0x01
1834*533d3a49SEdward Gillett
1835*533d3a49SEdward Gillett	movdqa	%xmm3, (%rdi, %rcx)
1836*533d3a49SEdward Gillett	add	$16, %rcx
1837*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
1838*533d3a49SEdward Gillett	cmp	%r10, %r8
1839*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
1840*533d3a49SEdward Gillett#endif
1841*533d3a49SEdward Gillett	jmp	LABEL(ashr_1_use_ssse3)
1842*533d3a49SEdward Gillett
1843*533d3a49SEdward Gillett	.p2align 4
1844*533d3a49SEdward GillettLABEL(ashr_1_use_sse2):
	/* pass 1: test the next aligned source block for a null byte */
1845*533d3a49SEdward Gillett	pcmpeqb 16(%rsi, %rcx), %xmm0
1846*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx
1847*533d3a49SEdward Gillett	test	%edx, %edx
1848*533d3a49SEdward Gillett	jnz	LABEL(unaligned_exit)
1849*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
1850*533d3a49SEdward Gillett	sub	$16, %r8
1851*533d3a49SEdward Gillett 	jbe	LABEL(strncpy_truncation_unaligned)
1852*533d3a49SEdward Gillett#endif
1853*533d3a49SEdward Gillett	movdqa	16(%rsi, %rcx), %xmm3
1854*533d3a49SEdward Gillett	movdqa	(%rsi, %rcx), %xmm2
1855*533d3a49SEdward Gillett
	/* low block >> 1 byte, high block << 15 bytes, then merge the two */
1856*533d3a49SEdward Gillett	psrldq	$1, %xmm2
1857*533d3a49SEdward Gillett	pslldq	$15, %xmm3
1858*533d3a49SEdward Gillett	por	%xmm2, %xmm3
1859*533d3a49SEdward Gillett
1860*533d3a49SEdward Gillett	movdqa	%xmm3, (%rdi, %rcx)
1861*533d3a49SEdward Gillett	add	$16, %rcx
1862*533d3a49SEdward Gillett
1863*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
1864*533d3a49SEdward Gillett	cmp	%r10, %r8
1865*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
1866*533d3a49SEdward Gillett#endif
	/* pass 2: same steps on the following 16 bytes */
1867*533d3a49SEdward Gillett	pcmpeqb 16(%rsi, %rcx), %xmm0
1868*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx
1869*533d3a49SEdward Gillett	test	%edx, %edx
1870*533d3a49SEdward Gillett	jnz	LABEL(unaligned_exit)
1871*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
1872*533d3a49SEdward Gillett	sub	$16, %r8
1873*533d3a49SEdward Gillett 	jbe	LABEL(strncpy_truncation_unaligned)
1874*533d3a49SEdward Gillett#endif
1875*533d3a49SEdward Gillett
1876*533d3a49SEdward Gillett	movdqa	16(%rsi, %rcx), %xmm3
1877*533d3a49SEdward Gillett	movdqa	(%rsi, %rcx), %xmm2
1878*533d3a49SEdward Gillett
1879*533d3a49SEdward Gillett	psrldq	$1, %xmm2
1880*533d3a49SEdward Gillett	pslldq	$15, %xmm3
1881*533d3a49SEdward Gillett	por	%xmm2, %xmm3
1882*533d3a49SEdward Gillett
1883*533d3a49SEdward Gillett	movdqa	%xmm3, (%rdi, %rcx)
1884*533d3a49SEdward Gillett	add	$16, %rcx
1885*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
1886*533d3a49SEdward Gillett	cmp	%r10, %r8
1887*533d3a49SEdward Gillett	jbe	LABEL(unaligned_exit)
1888*533d3a49SEdward Gillett#endif
1889*533d3a49SEdward Gillett	jmp	LABEL(ashr_1_use_sse2)
1890*533d3a49SEdward Gillett
1891*533d3a49SEdward Gillett
1892*533d3a49SEdward Gillett	/*
1893*533d3a49SEdward Gillett	 * Exit tail code:
1894*533d3a49SEdward Gillett	 * Up to 32 bytes are copied in the case of strcpy.
	 *
	 * The labels below are successive entry points into one fall-through
	 * sequence.  When reached from the copy loops in this file, %edx holds
	 * the pmovmskb mask of the last compare and %rcx the byte index the
	 * loop had reached (less32bytes zeroes the index itself).  The
	 * sequence advances %rdi/%rsi to the first byte still to be copied and
	 * then dispatches to a tail_N routine selected by the lowest set bit
	 * of %rdx.
1895*533d3a49SEdward Gillett	 */
1896*533d3a49SEdward Gillett	.p2align 4
1897*533d3a49SEdward GillettLABEL(less32bytes):
1898*533d3a49SEdward Gillett	xor	%ecx, %ecx
1899*533d3a49SEdward GillettLABEL(unaligned_exit):
1900*533d3a49SEdward Gillett	add	%r9, %rsi		/* r9 holds offset of rsi */
	/* %cl is needed as the shift count, so park the index in %r9 meanwhile */
1901*533d3a49SEdward Gillett	mov	%rcx, %r9
1902*533d3a49SEdward Gillett	mov	%r10, %rcx
1903*533d3a49SEdward Gillett	shl	%cl, %edx		/* after shl, calculate the exact number to be filled */
1904*533d3a49SEdward Gillett	mov	%r9, %rcx
1905*533d3a49SEdward Gillett	.p2align 4
1906*533d3a49SEdward GillettLABEL(aligned_exit):
1907*533d3a49SEdward Gillett	add	%rcx, %rdi		/* locate exact address for rdi */
1908*533d3a49SEdward GillettLABEL(less16bytes):
1909*533d3a49SEdward Gillett	add	%rcx, %rsi		/* locate exact address for rsi */
1910*533d3a49SEdward GillettLABEL(aligned_16bytes):
1911*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
1912*533d3a49SEdward Gillett	/*
1913*533d3a49SEdward Gillett	 * Null found in 16bytes checked. Set bit in bitmask corresponding to
1914*533d3a49SEdward Gillett	 * the strncpy count argument. We will copy to the null (inclusive)
1915*533d3a49SEdward Gillett	 * or count whichever comes first.
	 *
	 * %r9d = 1 << (%r8 - 1).  A 32-bit shl only honours the low 5 bits
	 * of %cl, so that bit is meaningful only while %r8 <= 32; for larger
	 * counts it is not merged into the mask.
1916*533d3a49SEdward Gillett	 */
1917*533d3a49SEdward Gillett	mov	$1, %r9d
1918*533d3a49SEdward Gillett	lea	-1(%r8), %rcx
1919*533d3a49SEdward Gillett	shl	%cl, %r9d
1920*533d3a49SEdward Gillett	cmp	$32, %r8
1921*533d3a49SEdward Gillett	ja	LABEL(strncpy_tail)
1922*533d3a49SEdward Gillett	or	%r9d, %edx
1923*533d3a49SEdward GillettLABEL(strncpy_tail):
1924*533d3a49SEdward Gillett#endif
1925*533d3a49SEdward Gillett	/*
1926*533d3a49SEdward Gillett	 * Check to see if BSF is fast on this processor. If not, use a
1927*533d3a49SEdward Gillett	 * different exit tail.
1928*533d3a49SEdward Gillett	 */
1929*533d3a49SEdward Gillett	testb	$USE_BSF, .memops_method(%rip)
1930*533d3a49SEdward Gillett	jz	LABEL(AMD_exit)
1931*533d3a49SEdward Gillett	bsf	%rdx, %rcx		/* Find byte with null char */
	/*
	 * tail_table holds 32-bit offsets relative to its own address:
	 * fetch entry [%rcx], sign-extend it, add the table base and jump.
	 */
1932*533d3a49SEdward Gillett	lea	LABEL(tail_table)(%rip), %r11
1933*533d3a49SEdward Gillett	movslq	(%r11, %rcx, 4), %rcx
1934*533d3a49SEdward Gillett	lea	(%r11, %rcx), %rcx
1935*533d3a49SEdward Gillett	jmp	*%rcx
1936*533d3a49SEdward Gillett
1937*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
1938*533d3a49SEdward Gillett	/*
1939*533d3a49SEdward Gillett	 * Count reached before null found.
	 *
	 * Three entry points into one fall-through sequence: the first zeroes
	 * the byte index in %rcx, the second adds the source offset in %r9 to
	 * %rsi, and the third advances both pointers by the index.  The copy
	 * loops in this file arrive here right after "sub $16, %r8", so 16 is
	 * added back to %r8 before entry (%r8 - 1) of tail_table is used to
	 * dispatch to the tail routine.
1940*533d3a49SEdward Gillett	 */
1941*533d3a49SEdward Gillett	.p2align 4
1942*533d3a49SEdward GillettLABEL(less32bytes_strncpy_truncation):
1943*533d3a49SEdward Gillett	xor	%ecx, %ecx
1944*533d3a49SEdward GillettLABEL(strncpy_truncation_unaligned):
1945*533d3a49SEdward Gillett	add	%r9, %rsi		/* next src char to copy */
1946*533d3a49SEdward GillettLABEL(strncpy_truncation_aligned):
1947*533d3a49SEdward Gillett	add	%rcx, %rdi
1948*533d3a49SEdward Gillett	add	%rcx, %rsi
1949*533d3a49SEdward Gillett	add	$16, %r8		/* compensation */
1950*533d3a49SEdward Gillett	lea	-1(%r8), %rcx
	/* table entries are 32-bit offsets relative to tail_table itself */
1951*533d3a49SEdward Gillett	lea	LABEL(tail_table)(%rip), %r11
1952*533d3a49SEdward Gillett	movslq	(%r11, %rcx, 4), %rcx
1953*533d3a49SEdward Gillett	lea	(%r11, %rcx), %rcx
1954*533d3a49SEdward Gillett	jmp	*%rcx
1955*533d3a49SEdward Gillett
	/*
	 * Return %rdi in %rax without copying anything.
	 * NOTE(review): presumably the early exit for a zero count; the
	 * jump that reaches this label is outside this section -- confirm.
	 */
1956*533d3a49SEdward Gillett	.p2align 4
1957*533d3a49SEdward GillettLABEL(strncpy_exitz):
1958*533d3a49SEdward Gillett	mov	%rdi, %rax
1959*533d3a49SEdward Gillett	ret
1960*533d3a49SEdward Gillett#endif
1961*533d3a49SEdward Gillett
	/*
	 * Exit dispatch used when the USE_BSF bit of .memops_method is clear.
	 * If the low byte of the mask in %edx is zero, the first set bit is at
	 * position 8 or above and AMD_exit_more_8 takes over.  Otherwise bits
	 * 0..6 are tested in ascending order and the first one set selects
	 * tail_0..tail_6; if none of them is set, bit 7 must be, and control
	 * falls through into tail_7.
	 */
1962*533d3a49SEdward Gillett	.p2align 4
1963*533d3a49SEdward GillettLABEL(AMD_exit):
1964*533d3a49SEdward Gillett	test	%dl, %dl
1965*533d3a49SEdward Gillett	jz	LABEL(AMD_exit_more_8)
1966*533d3a49SEdward Gillett	test	$0x01, %dl
1967*533d3a49SEdward Gillett	jnz	LABEL(tail_0)
1968*533d3a49SEdward Gillett	test	$0x02, %dl
1969*533d3a49SEdward Gillett	jnz	LABEL(tail_1)
1970*533d3a49SEdward Gillett	test	$0x04, %dl
1971*533d3a49SEdward Gillett	jnz	LABEL(tail_2)
1972*533d3a49SEdward Gillett	test	$0x08, %dl
1973*533d3a49SEdward Gillett	jnz	LABEL(tail_3)
1974*533d3a49SEdward Gillett	test	$0x10, %dl
1975*533d3a49SEdward Gillett	jnz	LABEL(tail_4)
1976*533d3a49SEdward Gillett	test	$0x20, %dl
1977*533d3a49SEdward Gillett	jnz	LABEL(tail_5)
1978*533d3a49SEdward Gillett	test	$0x40, %dl
1979*533d3a49SEdward Gillett	jnz	LABEL(tail_6)
1980*533d3a49SEdward Gillett
	/*
	 * Copy the final 8 bytes from (%rsi) to (%rdi) with one 64-bit
	 * load/store.  With USE_AS_STRNCPY, %cl is set to the 8 bytes just
	 * stored and %r8 is reduced by 8; if any count remains, the rest of
	 * the destination is zero-filled by strncpy_fill_tail.
	 * NOTE(review): %rax is not written here, so the return value is
	 * presumably loaded before this point -- confirm.
	 */
1981*533d3a49SEdward Gillett	.p2align 4
1982*533d3a49SEdward GillettLABEL(tail_7):				/* 8 bytes */
1983*533d3a49SEdward Gillett	mov	(%rsi), %rcx
1984*533d3a49SEdward Gillett	mov	%rcx, (%rdi)
1985*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
1986*533d3a49SEdward Gillett	mov	$8, %cl
19877c478bd9Sstevel@tonic-gate	sub	$8, %r8
1988*533d3a49SEdward Gillett	jnz	LABEL(strncpy_fill_tail)
1989*533d3a49SEdward Gillett#endif
1990*533d3a49SEdward Gillett	ret
19917c478bd9Sstevel@tonic-gate
1992*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
1993*533d3a49SEdward Gillett	/*
1994*533d3a49SEdward Gillett	 * Null terminated src string shorter than count. Fill the rest of the
1995*533d3a49SEdward Gillett	 * destination with null chars.
	 *
	 * In:	%cl  = number of bytes the tail routine just stored at (%rdi)
	 *	%r8  = number of bytes still to be zero-filled (non-zero at
	 *	       the jnz call sites visible in this file)
	 *	%rax = value to return; parked in %rdx while %rax supplies
	 *	       the zero fill value, and restored before ret
	 *
	 * The fill is done as %r8 / 8 qwords with rep stosq followed by
	 * %r8 % 8 single bytes.  The qword count is derived with a full
	 * 64-bit shift so that a remaining count of 4 GiB or more is not
	 * truncated to its low 32 bits.
1996*533d3a49SEdward Gillett	 */
19977c478bd9Sstevel@tonic-gate	.p2align 4
1998*533d3a49SEdward GillettLABEL(strncpy_fill_tail):
1999*533d3a49SEdward Gillett	mov	%rax, %rdx		/* save return value */
2000*533d3a49SEdward Gillett	movzx	%cl, %rax
2001*533d3a49SEdward Gillett	mov	%r8, %rcx
2002*533d3a49SEdward Gillett	add	%rax, %rdi		/* step past the bytes already stored */
2003*533d3a49SEdward Gillett	xor	%eax, %eax		/* fill value = 0 */
2004*533d3a49SEdward Gillett	shr	$3, %rcx		/* 64-bit qword count for rep stosq */
2005*533d3a49SEdward Gillett	jz	LABEL(strncpy_fill_less_8)
20067c478bd9Sstevel@tonic-gate
20077c478bd9Sstevel@tonic-gate	rep	stosq
2008*533d3a49SEdward GillettLABEL(strncpy_fill_less_8):
2009*533d3a49SEdward Gillett	mov	%r8, %rcx
2010*533d3a49SEdward Gillett	and	$7, %rcx		/* 0..7 bytes left over */
2011*533d3a49SEdward Gillett	jz	LABEL(strncpy_fill_return)
2012*533d3a49SEdward GillettLABEL(strncpy_fill_less_7):
	/* store zero bytes from the highest index down to 0; mov keeps the flags */
2013*533d3a49SEdward Gillett	sub	$1, %ecx
20147c478bd9Sstevel@tonic-gate	mov	%al, (%rdi, %rcx)
2015*533d3a49SEdward Gillett	jnz	LABEL(strncpy_fill_less_7)
2016*533d3a49SEdward GillettLABEL(strncpy_fill_return):
2017*533d3a49SEdward Gillett	mov	%rdx, %rax		/* restore return value */
20187c478bd9Sstevel@tonic-gate	ret
20197c478bd9Sstevel@tonic-gate#endif
20207c478bd9Sstevel@tonic-gate
20217c478bd9Sstevel@tonic-gate	.p2align 4
	/*
	 * tail_0: copy 1 byte from (%rsi) to (%rdi).
	 * strncpy build: sets %cl = 1 (bytes copied), subtracts 1 from %r8
	 * and, if %r8 is still nonzero, jumps to strncpy_fill_tail.
	 */
2022*533d3a49SEdward GillettLABEL(tail_0):				/* 1 byte */
2023*533d3a49SEdward Gillett	mov	(%rsi), %cl
2024*533d3a49SEdward Gillett	mov	%cl, (%rdi)
2025*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
2026*533d3a49SEdward Gillett	mov	$1, %cl
2027*533d3a49SEdward Gillett	sub	$1, %r8
2028*533d3a49SEdward Gillett	jnz	LABEL(strncpy_fill_tail)
20297c478bd9Sstevel@tonic-gate#endif
20307c478bd9Sstevel@tonic-gate	ret
20317c478bd9Sstevel@tonic-gate
2032*533d3a49SEdward Gillett	.p2align 4
	/*
	 * tail_1: copy 2 bytes from (%rsi) to (%rdi) with one 16-bit move.
	 * strncpy build: sets %cl = 2 (bytes copied), subtracts 2 from %r8
	 * and, if %r8 is still nonzero, jumps to strncpy_fill_tail.
	 */
2033*533d3a49SEdward GillettLABEL(tail_1):				/* 2 bytes */
2034*533d3a49SEdward Gillett	mov	(%rsi), %cx
2035*533d3a49SEdward Gillett	mov	%cx, (%rdi)
2036*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
2037*533d3a49SEdward Gillett	mov	$2, %cl
2038*533d3a49SEdward Gillett	sub	$2, %r8
2039*533d3a49SEdward Gillett	jnz	LABEL(strncpy_fill_tail)
2040*533d3a49SEdward Gillett#endif
2041*533d3a49SEdward Gillett	ret
2042*533d3a49SEdward Gillett
2043*533d3a49SEdward Gillett	.p2align 4
	/*
	 * tail_2: copy 3 bytes using two overlapping 16-bit moves at
	 * offsets 0 and 1 (the middle byte is written twice).
	 * strncpy build: sets %cl = 3 (bytes copied), subtracts 3 from %r8
	 * and, if %r8 is still nonzero, jumps to strncpy_fill_tail.
	 */
2044*533d3a49SEdward GillettLABEL(tail_2):				/* 3 bytes */
2045*533d3a49SEdward Gillett	mov	(%rsi), %cx
2046*533d3a49SEdward Gillett	mov	%cx, (%rdi)
2047*533d3a49SEdward Gillett	mov	1(%rsi), %cx
2048*533d3a49SEdward Gillett	mov	%cx, 1(%rdi)
2049*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
2050*533d3a49SEdward Gillett	mov	$3, %cl
2051*533d3a49SEdward Gillett	sub	$3, %r8
2052*533d3a49SEdward Gillett	jnz	LABEL(strncpy_fill_tail)
2053*533d3a49SEdward Gillett#endif
2054*533d3a49SEdward Gillett	ret
2055*533d3a49SEdward Gillett
2056*533d3a49SEdward Gillett	.p2align 4
	/*
	 * tail_3: copy 4 bytes from (%rsi) to (%rdi) with one 32-bit move.
	 * strncpy build: sets %cl = 4 (bytes copied), subtracts 4 from %r8
	 * and, if %r8 is still nonzero, jumps to strncpy_fill_tail.
	 */
2057*533d3a49SEdward GillettLABEL(tail_3):				/* 4 bytes */
2058*533d3a49SEdward Gillett	mov	(%rsi), %ecx
2059*533d3a49SEdward Gillett	mov	%ecx, (%rdi)
2060*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
2061*533d3a49SEdward Gillett	mov	$4, %cl
2062*533d3a49SEdward Gillett	sub	$4, %r8
2063*533d3a49SEdward Gillett	jnz	LABEL(strncpy_fill_tail)
2064*533d3a49SEdward Gillett#endif
2065*533d3a49SEdward Gillett	ret
2066*533d3a49SEdward Gillett
2067*533d3a49SEdward Gillett	.p2align 4
	/*
	 * tail_4: copy 5 bytes using two overlapping 32-bit moves at
	 * offsets 0 and 1.
	 * strncpy build: sets %cl = 5 (bytes copied), subtracts 5 from %r8
	 * and, if %r8 is still nonzero, jumps to strncpy_fill_tail.
	 */
2068*533d3a49SEdward GillettLABEL(tail_4):				/* 5 bytes */
2069*533d3a49SEdward Gillett	mov	(%rsi), %ecx
2070*533d3a49SEdward Gillett	mov	%ecx, (%rdi)
2071*533d3a49SEdward Gillett	mov	1(%rsi), %edx
2072*533d3a49SEdward Gillett	mov	%edx, 1(%rdi)
2073*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
2074*533d3a49SEdward Gillett	mov	$5, %cl
2075*533d3a49SEdward Gillett	sub	$5, %r8
2076*533d3a49SEdward Gillett	jnz	LABEL(strncpy_fill_tail)
2077*533d3a49SEdward Gillett#endif
2078*533d3a49SEdward Gillett	ret
2079*533d3a49SEdward Gillett
2080*533d3a49SEdward Gillett	.p2align 4
	/*
	 * tail_5: copy 6 bytes using two overlapping 32-bit moves at
	 * offsets 0 and 2.
	 * strncpy build: sets %cl = 6 (bytes copied), subtracts 6 from %r8
	 * and, if %r8 is still nonzero, jumps to strncpy_fill_tail.
	 */
2081*533d3a49SEdward GillettLABEL(tail_5):				/* 6 bytes */
2082*533d3a49SEdward Gillett	mov	(%rsi), %ecx
2083*533d3a49SEdward Gillett	mov	%ecx, (%rdi)
2084*533d3a49SEdward Gillett	mov	2(%rsi), %edx
2085*533d3a49SEdward Gillett	mov	%edx, 2(%rdi)
2086*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
2087*533d3a49SEdward Gillett	mov	$6, %cl
2088*533d3a49SEdward Gillett	sub	$6, %r8
2089*533d3a49SEdward Gillett	jnz	LABEL(strncpy_fill_tail)
2090*533d3a49SEdward Gillett#endif
2091*533d3a49SEdward Gillett	ret
2092*533d3a49SEdward Gillett
2093*533d3a49SEdward Gillett	.p2align 4
	/*
	 * tail_6: copy 7 bytes using two overlapping 32-bit moves at
	 * offsets 0 and 3.
	 * strncpy build: sets %cl = 7 (bytes copied), subtracts 7 from %r8
	 * and, if %r8 is still nonzero, jumps to strncpy_fill_tail.
	 */
2094*533d3a49SEdward GillettLABEL(tail_6):				/* 7 bytes */
2095*533d3a49SEdward Gillett	mov	(%rsi), %ecx
2096*533d3a49SEdward Gillett	mov	%ecx, (%rdi)
2097*533d3a49SEdward Gillett	mov	3(%rsi), %edx
2098*533d3a49SEdward Gillett	mov	%edx,3(%rdi)
2099*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
2100*533d3a49SEdward Gillett	mov	$7, %cl
2101*533d3a49SEdward Gillett	sub	$7, %r8
2102*533d3a49SEdward Gillett	jnz	LABEL(strncpy_fill_tail)
2103*533d3a49SEdward Gillett#endif
2104*533d3a49SEdward Gillett	ret
2105*533d3a49SEdward Gillett
2106*533d3a49SEdward Gillett	.p2align 4
	/*
	 * tail_8: copy 9 bytes using a 64-bit move at offset 0 and an
	 * overlapping 32-bit move at offset 5.
	 * strncpy build: sets %cl = 9 (bytes copied), subtracts 9 from %r8
	 * and, if %r8 is still nonzero, jumps to strncpy_fill_tail.
	 */
2107*533d3a49SEdward GillettLABEL(tail_8):				/* 9 bytes */
2108*533d3a49SEdward Gillett	mov	(%rsi), %rcx
2109*533d3a49SEdward Gillett	mov	%rcx, (%rdi)
2110*533d3a49SEdward Gillett	mov	5(%rsi), %edx
2111*533d3a49SEdward Gillett	mov	%edx, 5(%rdi)
2112*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
2113*533d3a49SEdward Gillett	mov	$9, %cl
2114*533d3a49SEdward Gillett	sub	$9, %r8
2115*533d3a49SEdward Gillett	jnz	LABEL(strncpy_fill_tail)
2116*533d3a49SEdward Gillett#endif
2117*533d3a49SEdward Gillett	ret
2118*533d3a49SEdward Gillett
2119*533d3a49SEdward Gillett	.p2align 4
	/*
	 * AMD_exit_more_8: dispatch on %dh, i.e. bits 8..15 of the mask in
	 * %edx. If %dh is zero, continue in AMD_exit_more_16. Otherwise the
	 * first set bit k in 0..6 of %dh jumps to tail_(8 + k), which copies
	 * 9 + k bytes; if none of those is set, bit 7 must be the set one and
	 * execution falls through to tail_15 (16 bytes) below.
	 */
2120*533d3a49SEdward GillettLABEL(AMD_exit_more_8):
2121*533d3a49SEdward Gillett	test	%dh, %dh
2122*533d3a49SEdward Gillett	jz	LABEL(AMD_exit_more_16)
2123*533d3a49SEdward Gillett	test	$0x01, %dh
2124*533d3a49SEdward Gillett	jnz	LABEL(tail_8)
2125*533d3a49SEdward Gillett	test	$0x02, %dh
2126*533d3a49SEdward Gillett	jnz	LABEL(tail_9)
2127*533d3a49SEdward Gillett	test	$0x04, %dh
2128*533d3a49SEdward Gillett	jnz	LABEL(tail_10)
2129*533d3a49SEdward Gillett	test	$0x08, %dh
2130*533d3a49SEdward Gillett	jnz	LABEL(tail_11)
2131*533d3a49SEdward Gillett	test	$0x10, %dh
2132*533d3a49SEdward Gillett	jnz	LABEL(tail_12)
2133*533d3a49SEdward Gillett	test	$0x20, %dh
2134*533d3a49SEdward Gillett	jnz	LABEL(tail_13)
2135*533d3a49SEdward Gillett	test	$0x40, %dh
2136*533d3a49SEdward Gillett	jnz	LABEL(tail_14)
2137*533d3a49SEdward Gillett
2138*533d3a49SEdward Gillett	.p2align 4
	/*
	 * tail_15: copy 16 bytes with two 64-bit moves at offsets 0 and 8.
	 * Reached through tail_table and by fall-through from
	 * AMD_exit_more_8.
	 * strncpy build: sets %cl = 16 (bytes copied), subtracts 16 from %r8
	 * and, if %r8 is still nonzero, jumps to strncpy_fill_tail.
	 */
2139*533d3a49SEdward GillettLABEL(tail_15):				/* 16 bytes */
2140*533d3a49SEdward Gillett	mov	(%rsi), %rcx
2141*533d3a49SEdward Gillett	mov	%rcx, (%rdi)
2142*533d3a49SEdward Gillett	mov	8(%rsi), %rdx
2143*533d3a49SEdward Gillett	mov	%rdx, 8(%rdi)
2144*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
2145*533d3a49SEdward Gillett	mov	$16, %cl
2146*533d3a49SEdward Gillett	sub	$16, %r8
2147*533d3a49SEdward Gillett	jnz	LABEL(strncpy_fill_tail)
2148*533d3a49SEdward Gillett#endif
2149*533d3a49SEdward Gillett	ret
2150*533d3a49SEdward Gillett
2151*533d3a49SEdward Gillett	.p2align 4
	/*
	 * tail_9: copy 10 bytes using a 64-bit move at offset 0 and an
	 * overlapping 32-bit move at offset 6.
	 * strncpy build: sets %cl = 10 (bytes copied), subtracts 10 from %r8
	 * and, if %r8 is still nonzero, jumps to strncpy_fill_tail.
	 */
2152*533d3a49SEdward GillettLABEL(tail_9):				/* 10 bytes */
2153*533d3a49SEdward Gillett	mov	(%rsi), %rcx
2154*533d3a49SEdward Gillett	mov	%rcx, (%rdi)
2155*533d3a49SEdward Gillett	mov	6(%rsi), %edx
2156*533d3a49SEdward Gillett	mov	%edx, 6(%rdi)
2157*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
2158*533d3a49SEdward Gillett	mov	$10, %cl
2159*533d3a49SEdward Gillett	sub	$10, %r8
2160*533d3a49SEdward Gillett	jnz	LABEL(strncpy_fill_tail)
2161*533d3a49SEdward Gillett#endif
2162*533d3a49SEdward Gillett	ret
2163*533d3a49SEdward Gillett
2164*533d3a49SEdward Gillett	.p2align 4
	/*
	 * tail_10: copy 11 bytes using a 64-bit move at offset 0 and an
	 * overlapping 32-bit move at offset 7.
	 * strncpy build: sets %cl = 11 (bytes copied), subtracts 11 from %r8
	 * and, if %r8 is still nonzero, jumps to strncpy_fill_tail.
	 */
2165*533d3a49SEdward GillettLABEL(tail_10):				/* 11 bytes */
2166*533d3a49SEdward Gillett	mov	(%rsi), %rcx
2167*533d3a49SEdward Gillett	mov	%rcx, (%rdi)
2168*533d3a49SEdward Gillett	mov	7(%rsi), %edx
2169*533d3a49SEdward Gillett	mov	%edx, 7(%rdi)
2170*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
2171*533d3a49SEdward Gillett	mov	$11, %cl
2172*533d3a49SEdward Gillett	sub	$11, %r8
2173*533d3a49SEdward Gillett	jnz	LABEL(strncpy_fill_tail)
2174*533d3a49SEdward Gillett#endif
2175*533d3a49SEdward Gillett	ret
2176*533d3a49SEdward Gillett
2177*533d3a49SEdward Gillett	.p2align 4
	/*
	 * tail_11: copy 12 bytes using a 64-bit move at offset 0 followed by
	 * a 32-bit move at offset 8.
	 * strncpy build: sets %cl = 12 (bytes copied), subtracts 12 from %r8
	 * and, if %r8 is still nonzero, jumps to strncpy_fill_tail.
	 */
2178*533d3a49SEdward GillettLABEL(tail_11):				/* 12 bytes */
2179*533d3a49SEdward Gillett	mov	(%rsi), %rcx
2180*533d3a49SEdward Gillett	mov	%rcx, (%rdi)
2181*533d3a49SEdward Gillett	mov	8(%rsi), %edx
2182*533d3a49SEdward Gillett	mov	%edx, 8(%rdi)
2183*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
2184*533d3a49SEdward Gillett	mov	$12, %cl
2185*533d3a49SEdward Gillett	sub	$12, %r8
2186*533d3a49SEdward Gillett	jnz	LABEL(strncpy_fill_tail)
2187*533d3a49SEdward Gillett#endif
2188*533d3a49SEdward Gillett	ret
2189*533d3a49SEdward Gillett
2190*533d3a49SEdward Gillett	.p2align 4
	/*
	 * tail_12: copy 13 bytes using two overlapping 64-bit moves at
	 * offsets 0 and 5 (%rcx is reused for both).
	 * strncpy build: sets %cl = 13 (bytes copied), subtracts 13 from %r8
	 * and, if %r8 is still nonzero, jumps to strncpy_fill_tail.
	 */
2191*533d3a49SEdward GillettLABEL(tail_12):				/* 13 bytes */
2192*533d3a49SEdward Gillett	mov	(%rsi), %rcx
2193*533d3a49SEdward Gillett	mov	%rcx, (%rdi)
2194*533d3a49SEdward Gillett	mov	5(%rsi), %rcx
2195*533d3a49SEdward Gillett	mov	%rcx, 5(%rdi)
2196*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
2197*533d3a49SEdward Gillett	mov	$13, %cl
2198*533d3a49SEdward Gillett	sub	$13, %r8
2199*533d3a49SEdward Gillett	jnz	LABEL(strncpy_fill_tail)
2200*533d3a49SEdward Gillett#endif
2201*533d3a49SEdward Gillett	ret
2202*533d3a49SEdward Gillett
2203*533d3a49SEdward Gillett	.p2align 4
	/*
	 * tail_13: copy 14 bytes using two overlapping 64-bit moves at
	 * offsets 0 and 6 (%rcx is reused for both).
	 * strncpy build: sets %cl = 14 (bytes copied), subtracts 14 from %r8
	 * and, if %r8 is still nonzero, jumps to strncpy_fill_tail.
	 */
2204*533d3a49SEdward GillettLABEL(tail_13):				/* 14 bytes */
2205*533d3a49SEdward Gillett	mov	(%rsi), %rcx
2206*533d3a49SEdward Gillett	mov	%rcx, (%rdi)
2207*533d3a49SEdward Gillett	mov	6(%rsi), %rcx
2208*533d3a49SEdward Gillett	mov	%rcx, 6(%rdi)
2209*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
2210*533d3a49SEdward Gillett	mov	$14, %cl
2211*533d3a49SEdward Gillett	sub	$14, %r8
2212*533d3a49SEdward Gillett	jnz	LABEL(strncpy_fill_tail)
2213*533d3a49SEdward Gillett#endif
2214*533d3a49SEdward Gillett	ret
2215*533d3a49SEdward Gillett
2216*533d3a49SEdward Gillett	.p2align 4
	/*
	 * tail_14: copy 15 bytes using two overlapping 64-bit moves at
	 * offsets 0 and 7 (%rcx is reused for both).
	 * strncpy build: sets %cl = 15 (bytes copied), subtracts 15 from %r8
	 * and, if %r8 is still nonzero, jumps to strncpy_fill_tail.
	 */
2217*533d3a49SEdward GillettLABEL(tail_14):				/* 15 bytes */
2218*533d3a49SEdward Gillett	mov	(%rsi), %rcx
2219*533d3a49SEdward Gillett	mov	%rcx, (%rdi)
2220*533d3a49SEdward Gillett	mov	7(%rsi), %rcx
2221*533d3a49SEdward Gillett	mov	%rcx, 7(%rdi)
2222*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
2223*533d3a49SEdward Gillett	mov	$15, %cl
2224*533d3a49SEdward Gillett	sub	$15, %r8
2225*533d3a49SEdward Gillett	jnz	LABEL(strncpy_fill_tail)
2226*533d3a49SEdward Gillett#endif
2227*533d3a49SEdward Gillett	ret
2228*533d3a49SEdward Gillett
2229*533d3a49SEdward Gillett	.p2align 4
	/*
	 * AMD_exit_more_16: shift the mask in %edx right by 16 so that its
	 * former bits 16..23 are in %dl and bits 24..31 are in %dh, then
	 * dispatch on %dl. If %dl is zero, continue in AMD_exit_more_24.
	 * Otherwise the first set bit k in 0..6 of %dl jumps to tail_(16 + k),
	 * which copies 17 + k bytes; if none of those is set, bit 7 must be
	 * the set one and execution falls through to tail_23 (24 bytes) below.
	 */
2230*533d3a49SEdward GillettLABEL(AMD_exit_more_16):
2231*533d3a49SEdward Gillett	shr	$16, %edx
2232*533d3a49SEdward Gillett	test	%dl, %dl
2233*533d3a49SEdward Gillett	jz	LABEL(AMD_exit_more_24)
2234*533d3a49SEdward Gillett	test	$0x01, %dl
2235*533d3a49SEdward Gillett	jnz	LABEL(tail_16)
2236*533d3a49SEdward Gillett	test	$0x02, %dl
2237*533d3a49SEdward Gillett	jnz	LABEL(tail_17)
2238*533d3a49SEdward Gillett	test	$0x04, %dl
2239*533d3a49SEdward Gillett	jnz	LABEL(tail_18)
2240*533d3a49SEdward Gillett	test	$0x08, %dl
2241*533d3a49SEdward Gillett	jnz	LABEL(tail_19)
2242*533d3a49SEdward Gillett	test	$0x10, %dl
2243*533d3a49SEdward Gillett	jnz	LABEL(tail_20)
2244*533d3a49SEdward Gillett	test	$0x20, %dl
2245*533d3a49SEdward Gillett	jnz	LABEL(tail_21)
2246*533d3a49SEdward Gillett	test	$0x40, %dl
2247*533d3a49SEdward Gillett	jnz	LABEL(tail_22)
2248*533d3a49SEdward Gillett
2249*533d3a49SEdward Gillett	.p2align 4
	/*
	 * tail_23: copy 24 bytes with three 64-bit moves at offsets 0, 8
	 * and 16. Reached through tail_table and by fall-through from
	 * AMD_exit_more_16.
	 * strncpy build: sets %cl = 24 (bytes copied), subtracts 24 from %r8
	 * and, if %r8 is still nonzero, jumps to strncpy_fill_tail.
	 */
2250*533d3a49SEdward GillettLABEL(tail_23):				/* 24 bytes */
2251*533d3a49SEdward Gillett	mov	(%rsi), %rcx
2252*533d3a49SEdward Gillett	mov	%rcx, (%rdi)
2253*533d3a49SEdward Gillett	mov	8(%rsi), %rdx
2254*533d3a49SEdward Gillett	mov	%rdx, 8(%rdi)
2255*533d3a49SEdward Gillett	mov	16(%rsi), %rcx
2256*533d3a49SEdward Gillett	mov	%rcx, 16(%rdi)
2257*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
2258*533d3a49SEdward Gillett	mov	$24, %cl
2259*533d3a49SEdward Gillett	sub	$24, %r8
2260*533d3a49SEdward Gillett	jnz	LABEL(strncpy_fill_tail)
2261*533d3a49SEdward Gillett#endif
2262*533d3a49SEdward Gillett	ret
2263*533d3a49SEdward Gillett
2264*533d3a49SEdward Gillett	.p2align 4
	/*
	 * tail_16: copy 17 bytes using 64-bit moves at offsets 0 and 8 and a
	 * single-byte move at offset 16.
	 * strncpy build: sets %cl = 17 (bytes copied), subtracts 17 from %r8
	 * and, if %r8 is still nonzero, jumps to strncpy_fill_tail.
	 */
2265*533d3a49SEdward GillettLABEL(tail_16):				/* 17 bytes */
2266*533d3a49SEdward Gillett	mov	(%rsi), %rcx
2267*533d3a49SEdward Gillett	mov	%rcx, (%rdi)
2268*533d3a49SEdward Gillett	mov	8(%rsi), %rdx
2269*533d3a49SEdward Gillett	mov	%rdx, 8(%rdi)
2270*533d3a49SEdward Gillett	mov	16(%rsi), %cl
2271*533d3a49SEdward Gillett	mov	%cl, 16(%rdi)
2272*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
2273*533d3a49SEdward Gillett	mov	$17, %cl
2274*533d3a49SEdward Gillett	sub	$17, %r8
2275*533d3a49SEdward Gillett	jnz	LABEL(strncpy_fill_tail)
2276*533d3a49SEdward Gillett#endif
2277*533d3a49SEdward Gillett	ret
2278*533d3a49SEdward Gillett
2279*533d3a49SEdward Gillett	.p2align 4
	/*
	 * tail_17: copy 18 bytes using 64-bit moves at offsets 0 and 8 and a
	 * 16-bit move at offset 16.
	 * strncpy build: sets %cl = 18 (bytes copied), subtracts 18 from %r8
	 * and, if %r8 is still nonzero, jumps to strncpy_fill_tail.
	 */
2280*533d3a49SEdward GillettLABEL(tail_17):				/* 18 bytes */
2281*533d3a49SEdward Gillett	mov	(%rsi), %rcx
2282*533d3a49SEdward Gillett	mov	%rcx, (%rdi)
2283*533d3a49SEdward Gillett	mov	8(%rsi), %rdx
2284*533d3a49SEdward Gillett	mov	%rdx, 8(%rdi)
2285*533d3a49SEdward Gillett	mov	16(%rsi), %cx
2286*533d3a49SEdward Gillett	mov	%cx, 16(%rdi)
2287*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
2288*533d3a49SEdward Gillett	mov	$18, %cl
2289*533d3a49SEdward Gillett	sub	$18, %r8
2290*533d3a49SEdward Gillett	jnz	LABEL(strncpy_fill_tail)
2291*533d3a49SEdward Gillett#endif
2292*533d3a49SEdward Gillett	ret
2293*533d3a49SEdward Gillett
2294*533d3a49SEdward Gillett	.p2align 4
	/*
	 * tail_18: copy 19 bytes using 64-bit moves at offsets 0 and 8 and an
	 * overlapping 32-bit move at offset 15.
	 * strncpy build: sets %cl = 19 (bytes copied), subtracts 19 from %r8
	 * and, if %r8 is still nonzero, jumps to strncpy_fill_tail.
	 */
2295*533d3a49SEdward GillettLABEL(tail_18):				/* 19 bytes */
2296*533d3a49SEdward Gillett	mov	(%rsi), %rcx
2297*533d3a49SEdward Gillett	mov	%rcx, (%rdi)
2298*533d3a49SEdward Gillett	mov	8(%rsi), %rdx
2299*533d3a49SEdward Gillett	mov	%rdx, 8(%rdi)
2300*533d3a49SEdward Gillett	mov	15(%rsi), %ecx
2301*533d3a49SEdward Gillett	mov	%ecx,15(%rdi)
2302*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
2303*533d3a49SEdward Gillett	mov	$19, %cl
2304*533d3a49SEdward Gillett	sub	$19, %r8
2305*533d3a49SEdward Gillett	jnz	LABEL(strncpy_fill_tail)
2306*533d3a49SEdward Gillett#endif
2307*533d3a49SEdward Gillett	ret
2308*533d3a49SEdward Gillett
2309*533d3a49SEdward Gillett	.p2align 4
	/*
	 * tail_19: copy 20 bytes using 64-bit moves at offsets 0 and 8 and a
	 * 32-bit move at offset 16.
	 * strncpy build: sets %cl = 20 (bytes copied), subtracts 20 from %r8
	 * and, if %r8 is still nonzero, jumps to strncpy_fill_tail.
	 */
2310*533d3a49SEdward GillettLABEL(tail_19):				/* 20 bytes */
2311*533d3a49SEdward Gillett	mov	(%rsi), %rcx
2312*533d3a49SEdward Gillett	mov	%rcx, (%rdi)
2313*533d3a49SEdward Gillett	mov	8(%rsi), %rdx
2314*533d3a49SEdward Gillett	mov	%rdx, 8(%rdi)
2315*533d3a49SEdward Gillett	mov	16(%rsi), %ecx
2316*533d3a49SEdward Gillett	mov	%ecx, 16(%rdi)
2317*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
2318*533d3a49SEdward Gillett	mov	$20, %cl
2319*533d3a49SEdward Gillett	sub	$20, %r8
2320*533d3a49SEdward Gillett	jnz	LABEL(strncpy_fill_tail)
2321*533d3a49SEdward Gillett#endif
2322*533d3a49SEdward Gillett	ret
2323*533d3a49SEdward Gillett
2324*533d3a49SEdward Gillett	.p2align 4
	/*
	 * tail_20: copy 21 bytes using 64-bit moves at offsets 0 and 8 and an
	 * overlapping 64-bit move at offset 13.
	 * strncpy build: sets %cl = 21 (bytes copied), subtracts 21 from %r8
	 * and, if %r8 is still nonzero, jumps to strncpy_fill_tail.
	 */
2325*533d3a49SEdward GillettLABEL(tail_20):				/* 21 bytes */
2326*533d3a49SEdward Gillett	mov	(%rsi), %rcx
2327*533d3a49SEdward Gillett	mov	%rcx, (%rdi)
2328*533d3a49SEdward Gillett	mov	8(%rsi), %rdx
2329*533d3a49SEdward Gillett	mov	%rdx, 8(%rdi)
2330*533d3a49SEdward Gillett	mov	13(%rsi), %rcx
2331*533d3a49SEdward Gillett	mov	%rcx, 13(%rdi)
2332*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
2333*533d3a49SEdward Gillett	mov	$21, %cl
2334*533d3a49SEdward Gillett	sub	$21, %r8
2335*533d3a49SEdward Gillett	jnz	LABEL(strncpy_fill_tail)
2336*533d3a49SEdward Gillett#endif
2337*533d3a49SEdward Gillett	ret
2338*533d3a49SEdward Gillett
2339*533d3a49SEdward Gillett	.p2align 4
	/*
	 * tail_21: copy 22 bytes using 64-bit moves at offsets 0 and 8 and an
	 * overlapping 64-bit move at offset 14.
	 * strncpy build: sets %cl = 22 (bytes copied), subtracts 22 from %r8
	 * and, if %r8 is still nonzero, jumps to strncpy_fill_tail.
	 */
2340*533d3a49SEdward GillettLABEL(tail_21):				/* 22 bytes */
2341*533d3a49SEdward Gillett	mov	(%rsi), %rcx
2342*533d3a49SEdward Gillett	mov	%rcx, (%rdi)
2343*533d3a49SEdward Gillett	mov	8(%rsi), %rdx
2344*533d3a49SEdward Gillett	mov	%rdx, 8(%rdi)
2345*533d3a49SEdward Gillett	mov	14(%rsi), %rcx
2346*533d3a49SEdward Gillett	mov	%rcx, 14(%rdi)
2347*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
2348*533d3a49SEdward Gillett	mov	$22, %cl
2349*533d3a49SEdward Gillett	sub	$22, %r8
2350*533d3a49SEdward Gillett	jnz	LABEL(strncpy_fill_tail)
2351*533d3a49SEdward Gillett#endif
2352*533d3a49SEdward Gillett	ret
2353*533d3a49SEdward Gillett
2354*533d3a49SEdward Gillett	.p2align 4
	/*
	 * tail_22: copy 23 bytes using 64-bit moves at offsets 0 and 8 and an
	 * overlapping 64-bit move at offset 15.
	 * strncpy build: sets %cl = 23 (bytes copied), subtracts 23 from %r8
	 * and, if %r8 is still nonzero, jumps to strncpy_fill_tail.
	 */
2355*533d3a49SEdward GillettLABEL(tail_22):				/* 23 bytes */
2356*533d3a49SEdward Gillett	mov	(%rsi), %rcx
2357*533d3a49SEdward Gillett	mov	%rcx, (%rdi)
2358*533d3a49SEdward Gillett	mov	8(%rsi), %rdx
2359*533d3a49SEdward Gillett	mov	%rdx, 8(%rdi)
2360*533d3a49SEdward Gillett	mov	15(%rsi), %rcx
2361*533d3a49SEdward Gillett	mov	%rcx, 15(%rdi)
2362*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
2363*533d3a49SEdward Gillett	mov	$23, %cl
2364*533d3a49SEdward Gillett	sub	$23, %r8
2365*533d3a49SEdward Gillett	jnz	LABEL(strncpy_fill_tail)
2366*533d3a49SEdward Gillett#endif
2367*533d3a49SEdward Gillett	ret
2368*533d3a49SEdward Gillett
2369*533d3a49SEdward Gillett	.p2align 4
	/*
	 * AMD_exit_more_24: dispatch on %dh. When entered from
	 * AMD_exit_more_16 (the only entry visible in this part of the file),
	 * %edx has already been shifted right by 16, so %dh holds bits 24..31
	 * of the original mask. The first set bit k in 0..6 of %dh jumps to
	 * tail_(24 + k), which copies 25 + k bytes. There is no zero check
	 * here: whenever bits 0..6 are all clear, execution falls through to
	 * tail_31 (32 bytes) below.
	 */
2370*533d3a49SEdward GillettLABEL(AMD_exit_more_24):
2371*533d3a49SEdward Gillett	test	$0x01, %dh
2372*533d3a49SEdward Gillett	jnz	LABEL(tail_24)
2373*533d3a49SEdward Gillett	test	$0x02, %dh
2374*533d3a49SEdward Gillett	jnz	LABEL(tail_25)
2375*533d3a49SEdward Gillett	test	$0x04, %dh
2376*533d3a49SEdward Gillett	jnz	LABEL(tail_26)
2377*533d3a49SEdward Gillett	test	$0x08, %dh
2378*533d3a49SEdward Gillett	jnz	LABEL(tail_27)
2379*533d3a49SEdward Gillett	test	$0x10, %dh
2380*533d3a49SEdward Gillett	jnz	LABEL(tail_28)
2381*533d3a49SEdward Gillett	test	$0x20, %dh
2382*533d3a49SEdward Gillett	jnz	LABEL(tail_29)
2383*533d3a49SEdward Gillett	test	$0x40, %dh
2384*533d3a49SEdward Gillett	jnz	LABEL(tail_30)
2385*533d3a49SEdward Gillett
2386*533d3a49SEdward Gillett	.p2align 4
	/*
	 * tail_31: copy 32 bytes with four 64-bit moves at offsets 0, 8, 16
	 * and 24. Reached through tail_table and by fall-through from
	 * AMD_exit_more_24.
	 * strncpy build: sets %cl = 32 (bytes copied), subtracts 32 from %r8
	 * and, if %r8 is still nonzero, jumps to strncpy_fill_tail.
	 */
2387*533d3a49SEdward GillettLABEL(tail_31):				/* 32 bytes */
2388*533d3a49SEdward Gillett	mov	(%rsi), %rcx
2389*533d3a49SEdward Gillett	mov	%rcx, (%rdi)
2390*533d3a49SEdward Gillett	mov	8(%rsi), %rdx
2391*533d3a49SEdward Gillett	mov	%rdx, 8(%rdi)
2392*533d3a49SEdward Gillett	mov	16(%rsi), %rcx
2393*533d3a49SEdward Gillett	mov	%rcx, 16(%rdi)
2394*533d3a49SEdward Gillett	mov	24(%rsi), %rdx
2395*533d3a49SEdward Gillett	mov	%rdx, 24(%rdi)
2396*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
2397*533d3a49SEdward Gillett	mov	$32, %cl
2398*533d3a49SEdward Gillett	sub	$32, %r8
2399*533d3a49SEdward Gillett	jnz	LABEL(strncpy_fill_tail)
2400*533d3a49SEdward Gillett#endif
2401*533d3a49SEdward Gillett	ret
2402*533d3a49SEdward Gillett
2403*533d3a49SEdward Gillett	.p2align 4
	/*
	 * tail_24: copy 25 bytes using 64-bit moves at offsets 0, 8 and 16
	 * and an overlapping 32-bit move at offset 21.
	 * strncpy build: sets %cl = 25 (bytes copied), subtracts 25 from %r8
	 * and, if %r8 is still nonzero, jumps to strncpy_fill_tail.
	 */
2404*533d3a49SEdward GillettLABEL(tail_24):				/* 25 bytes */
2405*533d3a49SEdward Gillett	mov	(%rsi), %rcx
2406*533d3a49SEdward Gillett	mov	%rcx, (%rdi)
2407*533d3a49SEdward Gillett	mov	8(%rsi), %rdx
2408*533d3a49SEdward Gillett	mov	%rdx, 8(%rdi)
2409*533d3a49SEdward Gillett	mov	16(%rsi), %rcx
2410*533d3a49SEdward Gillett	mov	%rcx, 16(%rdi)
2411*533d3a49SEdward Gillett	mov	21(%rsi), %edx
2412*533d3a49SEdward Gillett	mov	%edx, 21(%rdi)
2413*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
2414*533d3a49SEdward Gillett	mov	$25, %cl
2415*533d3a49SEdward Gillett	sub	$25, %r8
2416*533d3a49SEdward Gillett	jnz	LABEL(strncpy_fill_tail)
2417*533d3a49SEdward Gillett#endif
2418*533d3a49SEdward Gillett	ret
2419*533d3a49SEdward Gillett
2420*533d3a49SEdward Gillett	.p2align 4
	/*
	 * tail_25: copy 26 bytes using 64-bit moves at offsets 0, 8 and 16
	 * and an overlapping 32-bit move at offset 22.
	 * strncpy build: sets %cl = 26 (bytes copied), subtracts 26 from %r8
	 * and, if %r8 is still nonzero, jumps to strncpy_fill_tail.
	 */
2421*533d3a49SEdward GillettLABEL(tail_25):				/* 26 bytes */
2422*533d3a49SEdward Gillett	mov	(%rsi), %rcx
2423*533d3a49SEdward Gillett	mov	%rcx, (%rdi)
2424*533d3a49SEdward Gillett	mov	8(%rsi), %rdx
2425*533d3a49SEdward Gillett	mov	%rdx, 8(%rdi)
2426*533d3a49SEdward Gillett	mov	16(%rsi), %rcx
2427*533d3a49SEdward Gillett	mov	%rcx, 16(%rdi)
2428*533d3a49SEdward Gillett	mov	22(%rsi), %edx
2429*533d3a49SEdward Gillett	mov	%edx, 22(%rdi)
2430*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
2431*533d3a49SEdward Gillett	mov	$26, %cl
2432*533d3a49SEdward Gillett	sub	$26, %r8
2433*533d3a49SEdward Gillett	jnz	LABEL(strncpy_fill_tail)
2434*533d3a49SEdward Gillett#endif
2435*533d3a49SEdward Gillett	ret
2436*533d3a49SEdward Gillett
2437*533d3a49SEdward Gillett	.p2align 4
	/*
	 * tail_26: copy 27 bytes using 64-bit moves at offsets 0, 8 and 16
	 * and an overlapping 32-bit move at offset 23.
	 * strncpy build: sets %cl = 27 (bytes copied), subtracts 27 from %r8
	 * and, if %r8 is still nonzero, jumps to strncpy_fill_tail.
	 */
2438*533d3a49SEdward GillettLABEL(tail_26):				/* 27 bytes */
2439*533d3a49SEdward Gillett	mov	(%rsi), %rcx
2440*533d3a49SEdward Gillett	mov	%rcx, (%rdi)
2441*533d3a49SEdward Gillett	mov	8(%rsi), %rdx
2442*533d3a49SEdward Gillett	mov	%rdx, 8(%rdi)
2443*533d3a49SEdward Gillett	mov	16(%rsi), %rcx
2444*533d3a49SEdward Gillett	mov	%rcx, 16(%rdi)
2445*533d3a49SEdward Gillett	mov	23(%rsi), %edx
2446*533d3a49SEdward Gillett	mov	%edx, 23(%rdi)
2447*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
2448*533d3a49SEdward Gillett	mov	$27, %cl
2449*533d3a49SEdward Gillett	sub	$27, %r8
2450*533d3a49SEdward Gillett	jnz	LABEL(strncpy_fill_tail)
2451*533d3a49SEdward Gillett#endif
2452*533d3a49SEdward Gillett	ret
2453*533d3a49SEdward Gillett
2454*533d3a49SEdward Gillett	.p2align 4
	/*
	 * tail_27: copy 28 bytes using 64-bit moves at offsets 0, 8 and 16
	 * and a 32-bit move at offset 24.
	 * strncpy build: sets %cl = 28 (bytes copied), subtracts 28 from %r8
	 * and, if %r8 is still nonzero, jumps to strncpy_fill_tail.
	 */
2455*533d3a49SEdward GillettLABEL(tail_27):				/* 28 bytes */
2456*533d3a49SEdward Gillett	mov	(%rsi), %rcx
2457*533d3a49SEdward Gillett	mov	%rcx, (%rdi)
2458*533d3a49SEdward Gillett	mov	8(%rsi), %rdx
2459*533d3a49SEdward Gillett	mov	%rdx, 8(%rdi)
2460*533d3a49SEdward Gillett	mov	16(%rsi), %rcx
2461*533d3a49SEdward Gillett	mov	%rcx, 16(%rdi)
2462*533d3a49SEdward Gillett	mov	24(%rsi), %edx
2463*533d3a49SEdward Gillett	mov	%edx, 24(%rdi)
2464*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
2465*533d3a49SEdward Gillett	mov	$28, %cl
2466*533d3a49SEdward Gillett	sub	$28, %r8
2467*533d3a49SEdward Gillett	jnz	LABEL(strncpy_fill_tail)
2468*533d3a49SEdward Gillett#endif
2469*533d3a49SEdward Gillett	ret
2470*533d3a49SEdward Gillett
2471*533d3a49SEdward Gillett	.p2align 4
	/*
	 * tail_28: copy 29 bytes using 64-bit moves at offsets 0, 8 and 16
	 * and an overlapping 64-bit move at offset 21.
	 * strncpy build: sets %cl = 29 (bytes copied), subtracts 29 from %r8
	 * and, if %r8 is still nonzero, jumps to strncpy_fill_tail.
	 */
2472*533d3a49SEdward GillettLABEL(tail_28):				/* 29 bytes */
2473*533d3a49SEdward Gillett	mov	(%rsi), %rcx
2474*533d3a49SEdward Gillett	mov	%rcx, (%rdi)
2475*533d3a49SEdward Gillett	mov	8(%rsi), %rdx
2476*533d3a49SEdward Gillett	mov	%rdx, 8(%rdi)
2477*533d3a49SEdward Gillett	mov	16(%rsi), %rcx
2478*533d3a49SEdward Gillett	mov	%rcx, 16(%rdi)
2479*533d3a49SEdward Gillett	mov	21(%rsi), %rdx
2480*533d3a49SEdward Gillett	mov	%rdx, 21(%rdi)
2481*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
2482*533d3a49SEdward Gillett	mov	$29, %cl
2483*533d3a49SEdward Gillett	sub	$29, %r8
2484*533d3a49SEdward Gillett	jnz	LABEL(strncpy_fill_tail)
2485*533d3a49SEdward Gillett#endif
2486*533d3a49SEdward Gillett	ret
2487*533d3a49SEdward Gillett
2488*533d3a49SEdward Gillett	.p2align 4
	/*
	 * tail_29: copy 30 bytes using 64-bit moves at offsets 0, 8 and 16
	 * and an overlapping 64-bit move at offset 22.
	 * strncpy build: sets %cl = 30 (bytes copied), subtracts 30 from %r8
	 * and, if %r8 is still nonzero, jumps to strncpy_fill_tail.
	 */
2489*533d3a49SEdward GillettLABEL(tail_29):				/* 30 bytes */
2490*533d3a49SEdward Gillett	mov	(%rsi), %rcx
2491*533d3a49SEdward Gillett	mov	%rcx, (%rdi)
2492*533d3a49SEdward Gillett	mov	8(%rsi), %rdx
2493*533d3a49SEdward Gillett	mov	%rdx, 8(%rdi)
2494*533d3a49SEdward Gillett	mov	16(%rsi), %rcx
2495*533d3a49SEdward Gillett	mov	%rcx, 16(%rdi)
2496*533d3a49SEdward Gillett	mov	22(%rsi), %rdx
2497*533d3a49SEdward Gillett	mov	%rdx, 22(%rdi)
2498*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
2499*533d3a49SEdward Gillett	mov	$30, %cl
2500*533d3a49SEdward Gillett	sub	$30, %r8
2501*533d3a49SEdward Gillett	jnz	LABEL(strncpy_fill_tail)
2502*533d3a49SEdward Gillett#endif
2503*533d3a49SEdward Gillett	ret
2504*533d3a49SEdward Gillett
2505*533d3a49SEdward Gillett	.p2align 4
	/*
	 * tail_30: copy 31 bytes using 64-bit moves at offsets 0, 8 and 16
	 * and an overlapping 64-bit move at offset 23.
	 * strncpy build: sets %cl = 31 (bytes copied), subtracts 31 from %r8
	 * and, if %r8 is still nonzero, jumps to strncpy_fill_tail.
	 */
2506*533d3a49SEdward GillettLABEL(tail_30):				/* 31 bytes */
2507*533d3a49SEdward Gillett	mov	(%rsi), %rcx
2508*533d3a49SEdward Gillett	mov	%rcx, (%rdi)
2509*533d3a49SEdward Gillett	mov	8(%rsi), %rdx
2510*533d3a49SEdward Gillett	mov	%rdx, 8(%rdi)
2511*533d3a49SEdward Gillett	mov	16(%rsi), %rcx
2512*533d3a49SEdward Gillett	mov	%rcx, 16(%rdi)
2513*533d3a49SEdward Gillett	mov	23(%rsi), %rdx
2514*533d3a49SEdward Gillett	mov	%rdx, 23(%rdi)
2515*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY
2516*533d3a49SEdward Gillett	mov	$31, %cl
2517*533d3a49SEdward Gillett	sub	$31, %r8
2518*533d3a49SEdward Gillett	jnz	LABEL(strncpy_fill_tail)
2519*533d3a49SEdward Gillett#endif
2520*533d3a49SEdward Gillett	ret
2521*533d3a49SEdward Gillett
2522*533d3a49SEdward Gillett	.pushsection .rodata
2523*533d3a49SEdward Gillett	.p2align 4
	/*
	 * tail_table: 32 entries, one per tail_0 .. tail_31 routine. Entry k
	 * is the 32-bit offset of tail_k (which copies k + 1 bytes) relative
	 * to the start of the table, so the table itself is position
	 * independent. The dispatch code loads the table address
	 * RIP-relative, sign-extends entry [index * 4], adds the table
	 * address back and jumps to the result.
	 */
2524*533d3a49SEdward GillettLABEL(tail_table):
2525*533d3a49SEdward Gillett	.int	LABEL(tail_0) - LABEL(tail_table)	/* 1 byte */
2526*533d3a49SEdward Gillett	.int	LABEL(tail_1) - LABEL(tail_table)
2527*533d3a49SEdward Gillett	.int	LABEL(tail_2) - LABEL(tail_table)
2528*533d3a49SEdward Gillett	.int	LABEL(tail_3) - LABEL(tail_table)
2529*533d3a49SEdward Gillett	.int	LABEL(tail_4) - LABEL(tail_table)
2530*533d3a49SEdward Gillett	.int	LABEL(tail_5) - LABEL(tail_table)
2531*533d3a49SEdward Gillett	.int	LABEL(tail_6) - LABEL(tail_table)
2532*533d3a49SEdward Gillett	.int	LABEL(tail_7) - LABEL(tail_table)
2533*533d3a49SEdward Gillett	.int	LABEL(tail_8) - LABEL(tail_table)
2534*533d3a49SEdward Gillett	.int	LABEL(tail_9) - LABEL(tail_table)
2535*533d3a49SEdward Gillett	.int	LABEL(tail_10) - LABEL(tail_table)
2536*533d3a49SEdward Gillett	.int	LABEL(tail_11) - LABEL(tail_table)
2537*533d3a49SEdward Gillett	.int	LABEL(tail_12) - LABEL(tail_table)
2538*533d3a49SEdward Gillett	.int	LABEL(tail_13) - LABEL(tail_table)
2539*533d3a49SEdward Gillett	.int	LABEL(tail_14) - LABEL(tail_table)
2540*533d3a49SEdward Gillett	.int	LABEL(tail_15) - LABEL(tail_table)
2541*533d3a49SEdward Gillett	.int	LABEL(tail_16) - LABEL(tail_table)
2542*533d3a49SEdward Gillett	.int	LABEL(tail_17) - LABEL(tail_table)
2543*533d3a49SEdward Gillett	.int	LABEL(tail_18) - LABEL(tail_table)
2544*533d3a49SEdward Gillett	.int	LABEL(tail_19) - LABEL(tail_table)
2545*533d3a49SEdward Gillett	.int	LABEL(tail_20) - LABEL(tail_table)
2546*533d3a49SEdward Gillett	.int	LABEL(tail_21) - LABEL(tail_table)
2547*533d3a49SEdward Gillett	.int	LABEL(tail_22) - LABEL(tail_table)
2548*533d3a49SEdward Gillett	.int	LABEL(tail_23) - LABEL(tail_table)
2549*533d3a49SEdward Gillett	.int	LABEL(tail_24) - LABEL(tail_table)
2550*533d3a49SEdward Gillett	.int	LABEL(tail_25) - LABEL(tail_table)
2551*533d3a49SEdward Gillett	.int	LABEL(tail_26) - LABEL(tail_table)
2552*533d3a49SEdward Gillett	.int	LABEL(tail_27) - LABEL(tail_table)
2553*533d3a49SEdward Gillett	.int	LABEL(tail_28) - LABEL(tail_table)
2554*533d3a49SEdward Gillett	.int	LABEL(tail_29) - LABEL(tail_table)
2555*533d3a49SEdward Gillett	.int	LABEL(tail_30) - LABEL(tail_table)
2556*533d3a49SEdward Gillett	.int	LABEL(tail_31) - LABEL(tail_table)	/* 32 bytes */
2557*533d3a49SEdward Gillett
2558*533d3a49SEdward Gillett	.p2align 4
	/*
	 * unaligned_table: 16 entries, one per ashr_0 .. ashr_15 routine.
	 * Entry k is the 32-bit offset of ashr_k relative to the start of this
	 * table. The ashr_* routines and the code that indexes this table are
	 * defined outside this part of the file.
	 * NOTE(review): presumably indexed by a 0..15 misalignment between
	 * source and destination -- confirm against the dispatch site.
	 */
2559*533d3a49SEdward GillettLABEL(unaligned_table):
2560*533d3a49SEdward Gillett	.int	LABEL(ashr_0) - LABEL(unaligned_table)
2561*533d3a49SEdward Gillett	.int	LABEL(ashr_1) - LABEL(unaligned_table)
2562*533d3a49SEdward Gillett	.int	LABEL(ashr_2) - LABEL(unaligned_table)
2563*533d3a49SEdward Gillett	.int	LABEL(ashr_3) - LABEL(unaligned_table)
2564*533d3a49SEdward Gillett	.int	LABEL(ashr_4) - LABEL(unaligned_table)
2565*533d3a49SEdward Gillett	.int	LABEL(ashr_5) - LABEL(unaligned_table)
2566*533d3a49SEdward Gillett	.int	LABEL(ashr_6) - LABEL(unaligned_table)
2567*533d3a49SEdward Gillett	.int	LABEL(ashr_7) - LABEL(unaligned_table)
2568*533d3a49SEdward Gillett	.int	LABEL(ashr_8) - LABEL(unaligned_table)
2569*533d3a49SEdward Gillett	.int	LABEL(ashr_9) - LABEL(unaligned_table)
2570*533d3a49SEdward Gillett	.int	LABEL(ashr_10) - LABEL(unaligned_table)
2571*533d3a49SEdward Gillett	.int	LABEL(ashr_11) - LABEL(unaligned_table)
2572*533d3a49SEdward Gillett	.int	LABEL(ashr_12) - LABEL(unaligned_table)
2573*533d3a49SEdward Gillett	.int	LABEL(ashr_13) - LABEL(unaligned_table)
2574*533d3a49SEdward Gillett	.int	LABEL(ashr_14) - LABEL(unaligned_table)
2575*533d3a49SEdward Gillett	.int	LABEL(ashr_15) - LABEL(unaligned_table)
2576*533d3a49SEdward Gillett	.popsection
2577*533d3a49SEdward Gillett
25787c478bd9Sstevel@tonic-gate#ifdef USE_AS_STRNCPY
25797c478bd9Sstevel@tonic-gate	SET_SIZE(strncpy)
25807c478bd9Sstevel@tonic-gate#else
25817c478bd9Sstevel@tonic-gate	SET_SIZE(strcpy)			/* (char *, const char *) */
25827c478bd9Sstevel@tonic-gate#endif
2583