xref: /illumos-gate/usr/src/lib/libc/amd64/gen/strcpy.S (revision b210e77709da8e42dfe621e10ccf4be504206058)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2009, Intel Corporation
24 * All rights reserved.
25 */
26
27/*
28 *	str[n]cpy - copy [n] chars from second operand into first operand
29 */
30#include "SYS.h"
31#include "proc64_id.h"
32
33#define LABEL(s) .strcpy##s
34
/*
 * Common entry and alignment dispatch for strcpy and strncpy (the latter is
 * selected with USE_AS_STRNCPY).
 *
 * In:	%rdi = dest, %rsi = src, %rdx = count (strncpy only)
 * Out:	%rax = original dest
 *
 * Working registers set up here for the copy loops that follow:
 *	%r8	remaining count (strncpy only)
 *	%r9	source offset within its 16-byte block
 *	%r10	16 - %r9, source bytes left in the current dqword
 *	%rcx	source offset, later the jump table index
 *	%xmm0	null-check scratch, all zero whenever no null was found
 */
35#ifdef USE_AS_STRNCPY
36	ENTRY(strncpy)
37	test	%rdx, %rdx			/* count is a 64-bit size_t; test every bit so a */
						/* non-zero multiple of 4GiB is not taken as zero */
38	jz	LABEL(strncpy_exitz)
39	mov	%rdx, %r8			/* r8 = remaining count */
40#else
41	ENTRY(strcpy)				/* (char *, const char *) */
42	xor	%rdx, %rdx
43#endif
44	mov	%esi, %ecx
45	and	$0xfffffffffffffff0, %rsi	/* force rsi 16 byte align */
46	and	$0xf, %rcx			/* rcx = src offset within its 16-byte block */
47	mov	%rdi, %rax			/* save destination address for return value */
48
49
50	pxor	%xmm0, %xmm0			/* clear %xmm0 for null char checks */
51	pcmpeqb	(%rsi), %xmm0			/* check 16 bytes in src for null */
52	pmovmskb %xmm0, %edx
53	shr	%cl, %edx			/* adjust for offset from 16byte boundary */
54	test	%edx, %edx			/* edx will be 0 if chars are non-null */
55	jnz	LABEL(less16bytes)		/* null char found in first 16 bytes examined */
56#ifdef USE_AS_STRNCPY
57	/*
58	 * Check if the count is satisfied in first 16 bytes examined.
59	 */
60	lea	-16(%r8, %rcx), %r11
61	cmp	$0, %r11
62	jle	LABEL(less16bytes)
63#endif
64	mov	%rcx, %r9			/* rsi alignment offset */
65	or	%edi, %ecx
66	and	$0xf, %ecx			/* zero only if src and dest are both aligned */
67	lea	-16(%r9), %r10			/* lea leaves the flags from the and intact */
68	jz	LABEL(ashr_0)			/* src and dest are both 16 byte aligned */
69
70	neg	%r10				/* max src bytes remaining in current dqword */
71
72	pxor	%xmm0, %xmm0			/* clear %xmm0, may be polluted by unaligned operation */
73	pcmpeqb	16(%rsi), %xmm0			/* check next 16 bytes in src for a null */
74	pmovmskb %xmm0, %edx
75	test	%edx, %edx
76	jnz	LABEL(less32bytes)		/* null char found in first 32 bytes examined */
77
78#ifdef USE_AS_STRNCPY
79	/*
80	 * If strncpy count <= 16 go to exit case
81	 */
82	sub	$16, %r8
83	jbe	LABEL(less32bytes_strncpy_truncation)
84#endif
85	/*
86	 * At least 16 bytes to copy to destination string. Move them now.
87	 * Don't worry about alignment.
88	 */
89	mov	(%rsi, %r9), %rdx
90	mov	%rdx, (%rdi)
91	mov	8(%rsi, %r9), %rdx
92	mov	%rdx, 8(%rdi)
93
94	/*
95	 * so far destination rdi may be aligned by 16, re-calculate rsi and
96	 * jump to corresponding src/dest relative offset case.
97	 * 	r9 is offset of rsi
98	 * 	rdx is offset of rdi (computed just below)
99	 */
100	and	$0xfffffffffffffff0, %rdi	/* force rdi 16 byte align */
101	mov	%rax, %rdx			/* rax contains original rdi */
102	xor	%rdi, %rdx			/* same effect as "and $0xf, %rdx" */
103#ifdef USE_AS_STRNCPY
104	/*
105	 * Will now do 16 byte aligned stores. Stores may overlap some bytes
106	 * (ie store twice) if destination was unaligned. Compensate here.
107	 */
108	add	%rdx, %r8			/* compensate for overlap */
109#endif
110
111	add	$16, %rdi			/* next 16 bytes for dest */
112
113	/*
114	 * align src to 16-byte boundary. Could be up or down depending on
115	 * whether src offset - dest offset > 0 (up) or
116	 *  src offset - dest offset < 0 (down).
117	 */
118	sub	%rdx, %r9			/* src offset - dest offset */
119
120	lea	16(%r9, %rsi), %rsi		/* src byte that pairs with the new rdi */
121	mov	%esi, %ecx			/* for new src offset */
122	and	$0xfffffffffffffff0, %rsi	/* force rsi 16 byte align */
123
124	and	$0xf, %ecx			/* new src offset is 0 if rsi/rdi have same alignment */
125	jz	LABEL(ashr_0)
126
127#ifdef USE_AS_STRNCPY
128	xor	%edx, %edx			/* In case unaligned_exit is taken */
129#endif
130	/*
131	 * Jump to case corresponding to source/dest string relative offsets
132	 * Index = (16 + (src offset - dest offset)) % 16
133	 */
134	lea	-16(%rcx), %r10
135	mov	%rcx, %r9
136	neg	%r10				/* max src bytes remaining in current dqword */
137	lea	LABEL(unaligned_table)(%rip), %r11
138	movslq	(%r11, %rcx, 4), %rcx		/* table holds 32-bit offsets relative to itself */
139	lea	(%r11, %rcx), %rcx
140	jmp	*%rcx
141
142/*
143 * ashr_0 handles the following cases:
144 * 	src alignment offset = dest alignment offset
145 */
/*
 * Here %rsi and %rdi are both 16-byte aligned, so whole dqwords are moved
 * with movdqa. Both branches into this label follow an "and $0xf, %ecx" that
 * produced zero, so %rcx is 0 on entry; the loop uses it as the running byte
 * index. %xmm0 is expected to be all zero on entry, and every null check
 * that finds nothing leaves it zero again.
 *
 * The dqword being copied has always been null-checked already; after each
 * copy the following dqword is checked. The loop body is unrolled four times.
 * For strncpy, %r8 is reduced by 16 ahead of each copy and the code leaves
 * for LABEL(strncpy_truncation_aligned) once 16 or fewer bytes of count
 * remain. The exit labels are not defined in this part of the file.
 */
146	.p2align 5
147LABEL(ashr_0):
148#ifdef USE_AS_STRNCPY
149	sub	$16, %r8
150 	jbe	LABEL(strncpy_truncation_aligned)
151#endif
152	movdqa	(%rsi), %xmm1		/* fetch 16 bytes from src string */
153	movdqa	%xmm1, (%rdi)		/* store 16 bytes into dest string */
154	add	$16, %rsi
155	add	$16, %rdi
156	pcmpeqb	(%rsi), %xmm0		/* check 16 bytes in src for a null */
157	pmovmskb %xmm0, %edx
158
159	test	%edx, %edx		/* edx will be 0 if chars are non-null */
160	jnz	LABEL(aligned_16bytes)	/* exit tail */
161
162LABEL(ashr_0_loop):
163#ifdef USE_AS_STRNCPY
164	sub	$16, %r8
165	jbe	LABEL(strncpy_truncation_aligned)
166#endif
167	movdqa	(%rsi, %rcx), %xmm1	/* copy the dqword already known null-free */
168	movdqa	%xmm1, (%rdi, %rcx)
169	add	$16, %rcx
170	pcmpeqb	(%rsi, %rcx), %xmm0	/* null check of the following dqword */
171	pmovmskb %xmm0, %edx
172	test	%edx, %edx
173	jnz	LABEL(aligned_exit)
174
	/* second unrolled pass */
175#ifdef USE_AS_STRNCPY
176	sub	$16, %r8
177	jbe	LABEL(strncpy_truncation_aligned)
178#endif
179	movdqa  (%rsi, %rcx), %xmm1
180	movdqa  %xmm1, (%rdi, %rcx)
181	add	$16, %rcx
182	pcmpeqb  (%rsi, %rcx), %xmm0
183	pmovmskb  %xmm0, %edx
184	test	%edx, %edx
185	jnz	LABEL(aligned_exit)
186
	/* third unrolled pass */
187#ifdef USE_AS_STRNCPY
188	sub	$16, %r8
189	jbe	LABEL(strncpy_truncation_aligned)
190#endif
191	movdqa  (%rsi, %rcx), %xmm1
192	movdqa  %xmm1, (%rdi, %rcx)
193
194	add	$16, %rcx
195	pcmpeqb  (%rsi, %rcx), %xmm0
196	pmovmskb  %xmm0, %edx
197	test	%edx, %edx
198	jnz	LABEL(aligned_exit)
199
	/* fourth unrolled pass */
200#ifdef USE_AS_STRNCPY
201	sub	$16, %r8
202	jbe	LABEL(strncpy_truncation_aligned)
203#endif
204	movdqa  (%rsi, %rcx), %xmm1
205	movdqa  %xmm1, (%rdi, %rcx)
206	add	$16, %rcx
207	pcmpeqb  (%rsi, %rcx), %xmm0
208	pmovmskb  %xmm0, %edx
209	test	%edx, %edx
210	jz	LABEL(ashr_0_loop)	/* no null yet: go round again */
211	jmp	LABEL(aligned_exit)
212
213
214/*
215 * ashr_15 handles the following cases:
216 * 	(16 + (src offset - dest offset)) % 16 = 15
217 *
218 * Based on above operation, start from (%r9 + rsi) to the left of this cache
219 * bank, there is no null byte.
220 */
/*
 * Register use, as set up by the dispatch code ahead of the table jump:
 *	%rsi	16-byte aligned address of the current source dqword
 *	%rdi	16-byte aligned destination address
 *	%xmm0	all zero; a null check that finds nothing leaves it zero
 *	%r8	(strncpy only) bytes of count still to be satisfied
 *	%r10	(strncpy only) source bytes left in the current dqword
 * %rcx is cleared here and then used as the byte index for both pointers.
 *
 * Each pass null-checks the next source dqword at 16(%rsi, %rcx) and then
 * builds one aligned 16-byte store from the top 1 byte of the current
 * dqword followed by the low 15 bytes of the next. The SSSE3 path does this
 * with a hand-encoded palignr, the SSE2 path with psrldq/pslldq/por. Both
 * loops are unrolled twice. The exit labels are not defined in this part of
 * the file.
 */
221	.p2align 4
222LABEL(ashr_15):
223	xor	%ecx, %ecx				/* clear index */
224#ifdef USE_AS_STRNCPY
225	cmp	%r10, %r8				/* count ends inside current dqword? */
226	jbe	LABEL(unaligned_exit)
227#endif
228	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
229	jz	LABEL(ashr_15_use_sse2)
230
231	.p2align 4
232LABEL(ashr_15_use_ssse3):
233	movdqa	16(%rsi, %rcx), %xmm3		/* next source dqword */
234	pcmpeqb	%xmm3, %xmm0			/* 0xff in each null byte position */
235	pmovmskb %xmm0, %edx
236	test	%edx, %edx
237	jnz	LABEL(unaligned_exit)		/* null found in next dqword */
238#ifdef USE_AS_STRNCPY
239	sub	$16, %r8			/* account for the 16 bytes about to be stored */
240 	jbe	LABEL(strncpy_truncation_unaligned)	/* count was <= 16 */
241#endif
242
243	#palignr $15, (%rsi, %rcx), %xmm3
244	.byte	0x66, 0x0F, 0x3A ,0x0F		/* palignr opcode bytes */
245	.byte	0x1c, 0x0e, 0x0f		/* modrm (xmm3, sib), sib (%rsi + %rcx), imm8 */
246
247	movdqa	%xmm3, (%rdi, %rcx)		/* aligned 16-byte store */
248	add	$16, %rcx
249
	/* second unrolled pass */
250#ifdef USE_AS_STRNCPY
251	cmp	%r10, %r8
252	jbe	LABEL(unaligned_exit)
253#endif
254	movdqa	16(%rsi, %rcx), %xmm3
255	pcmpeqb %xmm3, %xmm0
256	pmovmskb %xmm0, %edx
257	test	%edx, %edx
258	jnz	LABEL(unaligned_exit)
259#ifdef USE_AS_STRNCPY
260	sub	$16, %r8
261 	jbe	LABEL(strncpy_truncation_unaligned)
262#endif
263
264	#palignr $15, (%rsi, %rcx), %xmm3
265	.byte	0x66, 0x0F, 0x3A ,0x0F
266	.byte	0x1c, 0x0e, 0x0f
267
268	movdqa	%xmm3, (%rdi, %rcx)
269	add	$16, %rcx
270
271#ifdef USE_AS_STRNCPY
272	cmp	%r10, %r8
273	jbe	LABEL(unaligned_exit)
274#endif
275	jmp	LABEL(ashr_15_use_ssse3)	/* no null yet: keep copying */
276
277	.p2align 4
278LABEL(ashr_15_use_sse2):
279	pcmpeqb 16(%rsi, %rcx), %xmm0		/* null check of next source dqword */
280	pmovmskb %xmm0, %edx
281	test	%edx, %edx
282	jnz	LABEL(unaligned_exit)
283#ifdef USE_AS_STRNCPY
284	sub	$16, %r8
285 	jbe	LABEL(strncpy_truncation_unaligned)
286#endif
287
288	movdqa	16(%rsi, %rcx), %xmm3		/* next source dqword */
289	movdqa	(%rsi, %rcx), %xmm2		/* current source dqword */
290
291	psrldq	$15, %xmm2			/* top 1 byte of current -> low end */
292	pslldq	$1, %xmm3			/* low 15 bytes of next -> high end */
293	por	%xmm2, %xmm3			/* same result as palignr $15 */
294
295	movdqa	%xmm3, (%rdi, %rcx)		/* aligned 16-byte store */
296	add	$16, %rcx
	/* second unrolled pass */
297#ifdef USE_AS_STRNCPY
298	cmp	%r10, %r8
299	jbe	LABEL(unaligned_exit)
300#endif
301	pcmpeqb 16(%rsi, %rcx), %xmm0
302	pmovmskb %xmm0, %edx
303	test	%edx, %edx
304	jnz	LABEL(unaligned_exit)
305#ifdef USE_AS_STRNCPY
306	sub	$16, %r8
307 	jbe	LABEL(strncpy_truncation_unaligned)
308#endif
309
310	movdqa	16(%rsi, %rcx), %xmm3
311	movdqa	(%rsi, %rcx), %xmm2
312
313	psrldq	$15, %xmm2
314	pslldq	$1, %xmm3
315	por	%xmm2, %xmm3
316
317	movdqa	%xmm3, (%rdi, %rcx)
318	add	$16, %rcx
319#ifdef USE_AS_STRNCPY
320	cmp	%r10, %r8
321	jbe	LABEL(unaligned_exit)
322#endif
323	jmp	LABEL(ashr_15_use_sse2)		/* no null yet: keep copying */
324
325
326/*
327 * ashr_14 handles the following cases:
328 * 	(16 + (src offset - dest offset)) % 16 = 14
329 *
330 * Based on above operation, start from (%r9 + rsi) to the left of this cache
331 * bank, there is no null byte.
332 */
/*
 * Register use, as set up by the dispatch code ahead of the table jump:
 *	%rsi	16-byte aligned address of the current source dqword
 *	%rdi	16-byte aligned destination address
 *	%xmm0	all zero; a null check that finds nothing leaves it zero
 *	%r8	(strncpy only) bytes of count still to be satisfied
 *	%r10	(strncpy only) source bytes left in the current dqword
 * %rcx is cleared here and then used as the byte index for both pointers.
 *
 * Each pass null-checks the next source dqword at 16(%rsi, %rcx) and then
 * builds one aligned 16-byte store from the top 2 bytes of the current
 * dqword followed by the low 14 bytes of the next. The SSSE3 path does this
 * with a hand-encoded palignr, the SSE2 path with psrldq/pslldq/por. Both
 * loops are unrolled twice. The exit labels are not defined in this part of
 * the file.
 */
333	.p2align 4
334LABEL(ashr_14):
335	xor	%ecx, %ecx				/* clear index */
336#ifdef USE_AS_STRNCPY
337	cmp	%r10, %r8				/* count ends inside current dqword? */
338	jbe	LABEL(unaligned_exit)
339#endif
340	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
341	jz	LABEL(ashr_14_use_sse2)
342
343	.p2align 4
344LABEL(ashr_14_use_ssse3):
345	movdqa	16(%rsi, %rcx), %xmm3		/* next source dqword */
346	pcmpeqb	%xmm3, %xmm0			/* 0xff in each null byte position */
347	pmovmskb %xmm0, %edx
348	test	%edx, %edx
349	jnz	LABEL(unaligned_exit)		/* null found in next dqword */
350#ifdef USE_AS_STRNCPY
351	sub	$16, %r8			/* account for the 16 bytes about to be stored */
352 	jbe	LABEL(strncpy_truncation_unaligned)	/* count was <= 16 */
353#endif
354
355	#palignr $14, (%rsi, %rcx), %xmm3
356	.byte	0x66, 0x0F, 0x3A ,0x0F		/* palignr opcode bytes */
357	.byte	0x1c, 0x0e, 0x0e		/* modrm (xmm3, sib), sib (%rsi + %rcx), imm8 */
358
359	movdqa	%xmm3, (%rdi, %rcx)		/* aligned 16-byte store */
360	add	$16, %rcx
361
	/* second unrolled pass */
362#ifdef USE_AS_STRNCPY
363	cmp	%r10, %r8
364	jbe	LABEL(unaligned_exit)
365#endif
366	movdqa	16(%rsi, %rcx), %xmm3
367	pcmpeqb %xmm3, %xmm0
368	pmovmskb %xmm0, %edx
369	test	%edx, %edx
370	jnz	LABEL(unaligned_exit)
371#ifdef USE_AS_STRNCPY
372	sub	$16, %r8
373 	jbe	LABEL(strncpy_truncation_unaligned)
374#endif
375
376	#palignr $14, (%rsi, %rcx), %xmm3
377	.byte	0x66, 0x0F, 0x3A ,0x0F
378	.byte	0x1c, 0x0e, 0x0e
379
380	movdqa	%xmm3, (%rdi, %rcx)
381	add	$16, %rcx
382#ifdef USE_AS_STRNCPY
383	cmp	%r10, %r8
384	jbe	LABEL(unaligned_exit)
385#endif
386	jmp	LABEL(ashr_14_use_ssse3)	/* no null yet: keep copying */
387
388	.p2align 4
389LABEL(ashr_14_use_sse2):
390	pcmpeqb 16(%rsi, %rcx), %xmm0		/* null check of next source dqword */
391	pmovmskb %xmm0, %edx
392	test	%edx, %edx
393	jnz	LABEL(unaligned_exit)
394#ifdef USE_AS_STRNCPY
395	sub	$16, %r8
396 	jbe	LABEL(strncpy_truncation_unaligned)
397#endif
398
399	movdqa	16(%rsi, %rcx), %xmm3		/* next source dqword */
400	movdqa	(%rsi, %rcx), %xmm2		/* current source dqword */
401
402	psrldq	$14, %xmm2			/* top 2 bytes of current -> low end */
403	pslldq	$2, %xmm3			/* low 14 bytes of next -> high end */
404	por	%xmm2, %xmm3			/* same result as palignr $14 */
405
406	movdqa	%xmm3, (%rdi, %rcx)		/* aligned 16-byte store */
407	add	$16, %rcx
408
	/* second unrolled pass */
409#ifdef USE_AS_STRNCPY
410	cmp	%r10, %r8
411	jbe	LABEL(unaligned_exit)
412#endif
413	pcmpeqb 16(%rsi, %rcx), %xmm0
414	pmovmskb %xmm0, %edx
415	test	%edx, %edx
416	jnz	LABEL(unaligned_exit)
417#ifdef USE_AS_STRNCPY
418	sub	$16, %r8
419 	jbe	LABEL(strncpy_truncation_unaligned)
420#endif
421
422	movdqa	16(%rsi, %rcx), %xmm3
423	movdqa	(%rsi, %rcx), %xmm2
424
425	psrldq	$14, %xmm2
426	pslldq	$2, %xmm3
427	por	%xmm2, %xmm3
428
429	movdqa	%xmm3, (%rdi, %rcx)
430	add	$16, %rcx
431#ifdef USE_AS_STRNCPY
432	cmp	%r10, %r8
433	jbe	LABEL(unaligned_exit)
434#endif
435	jmp	LABEL(ashr_14_use_sse2)		/* no null yet: keep copying */
436
437
438/*
439 * ashr_13 handles the following cases:
440 * 	(16 + (src offset - dest offset)) % 16 = 13
441 *
442 * Based on above operation, start from (%r9 + rsi) to the left of this cache
443 * bank, there is no null byte.
444 */
/*
 * Register use, as set up by the dispatch code ahead of the table jump:
 *	%rsi	16-byte aligned address of the current source dqword
 *	%rdi	16-byte aligned destination address
 *	%xmm0	all zero; a null check that finds nothing leaves it zero
 *	%r8	(strncpy only) bytes of count still to be satisfied
 *	%r10	(strncpy only) source bytes left in the current dqword
 * %rcx is cleared here and then used as the byte index for both pointers.
 *
 * Each pass null-checks the next source dqword at 16(%rsi, %rcx) and then
 * builds one aligned 16-byte store from the top 3 bytes of the current
 * dqword followed by the low 13 bytes of the next. The SSSE3 path does this
 * with a hand-encoded palignr, the SSE2 path with psrldq/pslldq/por. Both
 * loops are unrolled twice. The exit labels are not defined in this part of
 * the file.
 */
445	.p2align 4
446LABEL(ashr_13):
447	xor	%ecx, %ecx				/* clear index */
448#ifdef USE_AS_STRNCPY
449	cmp	%r10, %r8				/* count ends inside current dqword? */
450	jbe	LABEL(unaligned_exit)
451#endif
452	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
453	jz	LABEL(ashr_13_use_sse2)
454
455	.p2align 4
456LABEL(ashr_13_use_ssse3):
457	movdqa	16(%rsi, %rcx), %xmm3		/* next source dqword */
458	pcmpeqb	%xmm3, %xmm0			/* 0xff in each null byte position */
459	pmovmskb %xmm0, %edx
460	test	%edx, %edx
461	jnz	LABEL(unaligned_exit)		/* null found in next dqword */
462#ifdef USE_AS_STRNCPY
463	sub	$16, %r8			/* account for the 16 bytes about to be stored */
464 	jbe	LABEL(strncpy_truncation_unaligned)	/* count was <= 16 */
465#endif
466
467	#palignr $13, (%rsi, %rcx), %xmm3
468	.byte	0x66, 0x0F, 0x3A ,0x0F		/* palignr opcode bytes */
469	.byte	0x1c, 0x0e, 0x0d		/* modrm (xmm3, sib), sib (%rsi + %rcx), imm8 */
470
471	movdqa	%xmm3, (%rdi, %rcx)		/* aligned 16-byte store */
472	add	$16, %rcx
473
	/* second unrolled pass */
474#ifdef USE_AS_STRNCPY
475	cmp	%r10, %r8
476	jbe	LABEL(unaligned_exit)
477#endif
478	movdqa	16(%rsi, %rcx), %xmm3
479	pcmpeqb %xmm3, %xmm0
480	pmovmskb %xmm0, %edx
481	test	%edx, %edx
482	jnz	LABEL(unaligned_exit)
483#ifdef USE_AS_STRNCPY
484	sub	$16, %r8
485 	jbe	LABEL(strncpy_truncation_unaligned)
486#endif
487
488	#palignr $13, (%rsi, %rcx), %xmm3
489	.byte	0x66, 0x0F, 0x3A ,0x0F
490	.byte	0x1c, 0x0e, 0x0d
491
492	movdqa	%xmm3, (%rdi, %rcx)
493	add	$16, %rcx
494#ifdef USE_AS_STRNCPY
495	cmp	%r10, %r8
496	jbe	LABEL(unaligned_exit)
497#endif
498	jmp	LABEL(ashr_13_use_ssse3)	/* no null yet: keep copying */
499
500	.p2align 4
501LABEL(ashr_13_use_sse2):
502	pcmpeqb 16(%rsi, %rcx), %xmm0		/* null check of next source dqword */
503	pmovmskb %xmm0, %edx
504	test	%edx, %edx
505	jnz	LABEL(unaligned_exit)
506#ifdef USE_AS_STRNCPY
507	sub	$16, %r8
508 	jbe	LABEL(strncpy_truncation_unaligned)
509#endif
510
511	movdqa	16(%rsi, %rcx), %xmm3		/* next source dqword */
512	movdqa	(%rsi, %rcx), %xmm2		/* current source dqword */
513
514	psrldq	$13, %xmm2			/* top 3 bytes of current -> low end */
515	pslldq	$3, %xmm3			/* low 13 bytes of next -> high end */
516	por	%xmm2, %xmm3			/* same result as palignr $13 */
517
518	movdqa	%xmm3, (%rdi, %rcx)		/* aligned 16-byte store */
519	add	$16, %rcx
520
	/* second unrolled pass */
521#ifdef USE_AS_STRNCPY
522	cmp	%r10, %r8
523	jbe	LABEL(unaligned_exit)
524#endif
525	pcmpeqb 16(%rsi, %rcx), %xmm0
526	pmovmskb %xmm0, %edx
527	test	%edx, %edx
528	jnz	LABEL(unaligned_exit)
529#ifdef USE_AS_STRNCPY
530	sub	$16, %r8
531 	jbe	LABEL(strncpy_truncation_unaligned)
532#endif
533
534	movdqa	16(%rsi, %rcx), %xmm3
535	movdqa	(%rsi, %rcx), %xmm2
536
537	psrldq	$13, %xmm2
538	pslldq	$3, %xmm3
539	por	%xmm2, %xmm3
540
541	movdqa	%xmm3, (%rdi, %rcx)
542	add	$16, %rcx
543#ifdef USE_AS_STRNCPY
544	cmp	%r10, %r8
545	jbe	LABEL(unaligned_exit)
546#endif
547	jmp	LABEL(ashr_13_use_sse2)		/* no null yet: keep copying */
548
549
550/*
551 * ashr_12 handles the following cases:
552 * 	(16 + (src offset - dest offset)) % 16 = 12
553 *
554 * Based on above operation, start from (%r9 + rsi) to the left of this cache
555 * bank, there is no null byte.
556 */
/*
 * Register use, as set up by the dispatch code ahead of the table jump:
 *	%rsi	16-byte aligned address of the current source dqword
 *	%rdi	16-byte aligned destination address
 *	%xmm0	all zero; a null check that finds nothing leaves it zero
 *	%r8	(strncpy only) bytes of count still to be satisfied
 *	%r10	(strncpy only) source bytes left in the current dqword
 * %rcx is cleared here and then used as the byte index for both pointers.
 *
 * Each pass null-checks the next source dqword at 16(%rsi, %rcx) and then
 * builds one aligned 16-byte store from the top 4 bytes of the current
 * dqword followed by the low 12 bytes of the next. The SSSE3 path does this
 * with a hand-encoded palignr, the SSE2 path with psrldq/pslldq/por. Both
 * loops are unrolled twice. The exit labels are not defined in this part of
 * the file.
 */
557	.p2align 4
558LABEL(ashr_12):
559	xor	%ecx, %ecx				/* clear index */
560#ifdef USE_AS_STRNCPY
561	cmp	%r10, %r8				/* count ends inside current dqword? */
562	jbe	LABEL(unaligned_exit)
563#endif
564	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
565	jz	LABEL(ashr_12_use_sse2)
566
567	.p2align 4
568LABEL(ashr_12_use_ssse3):
569	movdqa	16(%rsi, %rcx), %xmm3		/* next source dqword */
570	pcmpeqb	%xmm3, %xmm0			/* 0xff in each null byte position */
571	pmovmskb %xmm0, %edx
572	test	%edx, %edx
573	jnz	LABEL(unaligned_exit)		/* null found in next dqword */
574#ifdef USE_AS_STRNCPY
575	sub	$16, %r8			/* account for the 16 bytes about to be stored */
576 	jbe	LABEL(strncpy_truncation_unaligned)	/* count was <= 16 */
577#endif
578
579	#palignr $12, (%rsi, %rcx), %xmm3
580	.byte	0x66, 0x0F, 0x3A ,0x0F		/* palignr opcode bytes */
581	.byte	0x1c, 0x0e, 0x0c		/* modrm (xmm3, sib), sib (%rsi + %rcx), imm8 */
582
583	movdqa	%xmm3, (%rdi, %rcx)		/* aligned 16-byte store */
584	add	$16, %rcx
585
	/* second unrolled pass */
586#ifdef USE_AS_STRNCPY
587	cmp	%r10, %r8
588	jbe	LABEL(unaligned_exit)
589#endif
590	movdqa	16(%rsi, %rcx), %xmm3
591	pcmpeqb %xmm3, %xmm0
592	pmovmskb %xmm0, %edx
593	test	%edx, %edx
594	jnz	LABEL(unaligned_exit)
595#ifdef USE_AS_STRNCPY
596	sub	$16, %r8
597 	jbe	LABEL(strncpy_truncation_unaligned)
598#endif
599
600	#palignr $12, (%rsi, %rcx), %xmm3
601	.byte	0x66, 0x0F, 0x3A ,0x0F
602	.byte	0x1c, 0x0e, 0x0c
603
604	movdqa	%xmm3, (%rdi, %rcx)
605	add	$16, %rcx
606#ifdef USE_AS_STRNCPY
607	cmp	%r10, %r8
608	jbe	LABEL(unaligned_exit)
609#endif
610	jmp	LABEL(ashr_12_use_ssse3)	/* no null yet: keep copying */
611
612	.p2align 4
613LABEL(ashr_12_use_sse2):
614	pcmpeqb 16(%rsi, %rcx), %xmm0		/* null check of next source dqword */
615	pmovmskb %xmm0, %edx
616	test	%edx, %edx
617	jnz	LABEL(unaligned_exit)
618#ifdef USE_AS_STRNCPY
619	sub	$16, %r8
620 	jbe	LABEL(strncpy_truncation_unaligned)
621#endif
622
623	movdqa	16(%rsi, %rcx), %xmm3		/* next source dqword */
624	movdqa	(%rsi, %rcx), %xmm2		/* current source dqword */
625
626	psrldq	$12, %xmm2			/* top 4 bytes of current -> low end */
627	pslldq	$4, %xmm3			/* low 12 bytes of next -> high end */
628	por	%xmm2, %xmm3			/* same result as palignr $12 */
629
630	movdqa	%xmm3, (%rdi, %rcx)		/* aligned 16-byte store */
631	add	$16, %rcx
632
	/* second unrolled pass */
633#ifdef USE_AS_STRNCPY
634	cmp	%r10, %r8
635	jbe	LABEL(unaligned_exit)
636#endif
637	pcmpeqb 16(%rsi, %rcx), %xmm0
638	pmovmskb %xmm0, %edx
639	test	%edx, %edx
640	jnz	LABEL(unaligned_exit)
641#ifdef USE_AS_STRNCPY
642	sub	$16, %r8
643 	jbe	LABEL(strncpy_truncation_unaligned)
644#endif
645
646	movdqa	16(%rsi, %rcx), %xmm3
647	movdqa	(%rsi, %rcx), %xmm2
648
649	psrldq	$12, %xmm2
650	pslldq	$4, %xmm3
651	por	%xmm2, %xmm3
652
653	movdqa	%xmm3, (%rdi, %rcx)
654	add	$16, %rcx
655#ifdef USE_AS_STRNCPY
656	cmp	%r10, %r8
657	jbe	LABEL(unaligned_exit)
658#endif
659	jmp	LABEL(ashr_12_use_sse2)		/* no null yet: keep copying */
660
661
662/*
663 * ashr_11 handles the following cases:
664 * 	(16 + (src offset - dest offset)) % 16 = 11
665 *
666 * Based on above operation, start from (%r9 + rsi) to the left of this cache
667 * bank, there is no null byte.
668 */
/*
 * Register use, as set up by the dispatch code ahead of the table jump:
 *	%rsi	16-byte aligned address of the current source dqword
 *	%rdi	16-byte aligned destination address
 *	%xmm0	all zero; a null check that finds nothing leaves it zero
 *	%r8	(strncpy only) bytes of count still to be satisfied
 *	%r10	(strncpy only) source bytes left in the current dqword
 * %rcx is cleared here and then used as the byte index for both pointers.
 *
 * Each pass null-checks the next source dqword at 16(%rsi, %rcx) and then
 * builds one aligned 16-byte store from the top 5 bytes of the current
 * dqword followed by the low 11 bytes of the next. The SSSE3 path does this
 * with a hand-encoded palignr, the SSE2 path with psrldq/pslldq/por. Both
 * loops are unrolled twice. The exit labels are not defined in this part of
 * the file.
 */
669	.p2align 4
670LABEL(ashr_11):
671	xor	%ecx, %ecx				/* clear index */
672#ifdef USE_AS_STRNCPY
673	cmp	%r10, %r8				/* count ends inside current dqword? */
674	jbe	LABEL(unaligned_exit)
675#endif
676	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
677	jz	LABEL(ashr_11_use_sse2)
678
679	.p2align 4
680LABEL(ashr_11_use_ssse3):
681	movdqa	16(%rsi, %rcx), %xmm3		/* next source dqword */
682	pcmpeqb	%xmm3, %xmm0			/* 0xff in each null byte position */
683	pmovmskb %xmm0, %edx
684	test	%edx, %edx
685	jnz	LABEL(unaligned_exit)		/* null found in next dqword */
686#ifdef USE_AS_STRNCPY
687	sub	$16, %r8			/* account for the 16 bytes about to be stored */
688 	jbe	LABEL(strncpy_truncation_unaligned)	/* count was <= 16 */
689#endif
690
691	#palignr $11, (%rsi, %rcx), %xmm3
692	.byte	0x66, 0x0F, 0x3A ,0x0F		/* palignr opcode bytes */
693	.byte	0x1c, 0x0e, 0x0b		/* modrm (xmm3, sib), sib (%rsi + %rcx), imm8 */
694
695	movdqa	%xmm3, (%rdi, %rcx)		/* aligned 16-byte store */
696	add	$16, %rcx
697
	/* second unrolled pass */
698#ifdef USE_AS_STRNCPY
699	cmp	%r10, %r8
700	jbe	LABEL(unaligned_exit)
701#endif
702	movdqa	16(%rsi, %rcx), %xmm3
703	pcmpeqb %xmm3, %xmm0
704	pmovmskb %xmm0, %edx
705	test	%edx, %edx
706	jnz	LABEL(unaligned_exit)
707#ifdef USE_AS_STRNCPY
708	sub	$16, %r8
709 	jbe	LABEL(strncpy_truncation_unaligned)
710#endif
711
712	#palignr $11, (%rsi, %rcx), %xmm3
713	.byte	0x66, 0x0F, 0x3A ,0x0F
714	.byte	0x1c, 0x0e, 0x0b
715
716	movdqa	%xmm3, (%rdi, %rcx)
717	add	$16, %rcx
718#ifdef USE_AS_STRNCPY
719	cmp	%r10, %r8
720	jbe	LABEL(unaligned_exit)
721#endif
722	jmp	LABEL(ashr_11_use_ssse3)	/* no null yet: keep copying */
723
724	.p2align 4
725LABEL(ashr_11_use_sse2):
726	pcmpeqb 16(%rsi, %rcx), %xmm0		/* null check of next source dqword */
727	pmovmskb %xmm0, %edx
728	test	%edx, %edx
729	jnz	LABEL(unaligned_exit)
730#ifdef USE_AS_STRNCPY
731	sub	$16, %r8
732 	jbe	LABEL(strncpy_truncation_unaligned)
733#endif
734
735	movdqa	16(%rsi, %rcx), %xmm3		/* next source dqword */
736	movdqa	(%rsi, %rcx), %xmm2		/* current source dqword */
737
738	psrldq	$11, %xmm2			/* top 5 bytes of current -> low end */
739	pslldq	$5, %xmm3			/* low 11 bytes of next -> high end */
740	por	%xmm2, %xmm3			/* same result as palignr $11 */
741
742	movdqa	%xmm3, (%rdi, %rcx)		/* aligned 16-byte store */
743	add	$16, %rcx
744
	/* second unrolled pass */
745#ifdef USE_AS_STRNCPY
746	cmp	%r10, %r8
747	jbe	LABEL(unaligned_exit)
748#endif
749	pcmpeqb 16(%rsi, %rcx), %xmm0
750	pmovmskb %xmm0, %edx
751	test	%edx, %edx
752	jnz	LABEL(unaligned_exit)
753#ifdef USE_AS_STRNCPY
754	sub	$16, %r8
755 	jbe	LABEL(strncpy_truncation_unaligned)
756#endif
757
758	movdqa	16(%rsi, %rcx), %xmm3
759	movdqa	(%rsi, %rcx), %xmm2
760
761	psrldq	$11, %xmm2
762	pslldq	$5, %xmm3
763	por	%xmm2, %xmm3
764
765	movdqa	%xmm3, (%rdi, %rcx)
766	add	$16, %rcx
767#ifdef USE_AS_STRNCPY
768	cmp	%r10, %r8
769	jbe	LABEL(unaligned_exit)
770#endif
771	jmp	LABEL(ashr_11_use_sse2)		/* no null yet: keep copying */
772
773
774/*
775 * ashr_10 handles the following cases:
776 * 	(16 + (src offset - dest offset)) % 16 = 10
777 *
778 * Based on above operation, start from (%r9 + rsi) to the left of this cache
779 * bank, there is no null byte.
780 */
/*
 * Register use, as set up by the dispatch code ahead of the table jump:
 *	%rsi	16-byte aligned address of the current source dqword
 *	%rdi	16-byte aligned destination address
 *	%xmm0	all zero; a null check that finds nothing leaves it zero
 *	%r8	(strncpy only) bytes of count still to be satisfied
 *	%r10	(strncpy only) source bytes left in the current dqword
 * %rcx is cleared here and then used as the byte index for both pointers.
 *
 * Each pass null-checks the next source dqword at 16(%rsi, %rcx) and then
 * builds one aligned 16-byte store from the top 6 bytes of the current
 * dqword followed by the low 10 bytes of the next. The SSSE3 path does this
 * with a hand-encoded palignr, the SSE2 path with psrldq/pslldq/por. Both
 * loops are unrolled twice. The exit labels are not defined in this part of
 * the file.
 */
781	.p2align 4
782LABEL(ashr_10):
783	xor	%ecx, %ecx				/* clear index */
784#ifdef USE_AS_STRNCPY
785	cmp	%r10, %r8				/* count ends inside current dqword? */
786	jbe	LABEL(unaligned_exit)
787#endif
788	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
789	jz	LABEL(ashr_10_use_sse2)
790
791	.p2align 4
792LABEL(ashr_10_use_ssse3):
793	movdqa	16(%rsi, %rcx), %xmm3		/* next source dqword */
794	pcmpeqb	%xmm3, %xmm0			/* 0xff in each null byte position */
795	pmovmskb %xmm0, %edx
796	test	%edx, %edx
797	jnz	LABEL(unaligned_exit)		/* null found in next dqword */
798#ifdef USE_AS_STRNCPY
799	sub	$16, %r8			/* account for the 16 bytes about to be stored */
800 	jbe	LABEL(strncpy_truncation_unaligned)	/* count was <= 16 */
801#endif
802
803	#palignr $10, (%rsi, %rcx), %xmm3
804	.byte	0x66, 0x0F, 0x3A ,0x0F		/* palignr opcode bytes */
805	.byte	0x1c, 0x0e, 0x0a		/* modrm (xmm3, sib), sib (%rsi + %rcx), imm8 */
806
807	movdqa	%xmm3, (%rdi, %rcx)		/* aligned 16-byte store */
808	add	$16, %rcx
809
	/* second unrolled pass */
810#ifdef USE_AS_STRNCPY
811	cmp	%r10, %r8
812	jbe	LABEL(unaligned_exit)
813#endif
814	movdqa	16(%rsi, %rcx), %xmm3
815	pcmpeqb %xmm3, %xmm0
816	pmovmskb %xmm0, %edx
817	test	%edx, %edx
818	jnz	LABEL(unaligned_exit)
819#ifdef USE_AS_STRNCPY
820	sub	$16, %r8
821 	jbe	LABEL(strncpy_truncation_unaligned)
822#endif
823
824	#palignr $10, (%rsi, %rcx), %xmm3
825	.byte	0x66, 0x0F, 0x3A ,0x0F
826	.byte	0x1c, 0x0e, 0x0a
827
828	movdqa	%xmm3, (%rdi, %rcx)
829	add	$16, %rcx
830#ifdef USE_AS_STRNCPY
831	cmp	%r10, %r8
832	jbe	LABEL(unaligned_exit)
833#endif
834	jmp	LABEL(ashr_10_use_ssse3)	/* no null yet: keep copying */
835
836	.p2align 4
837LABEL(ashr_10_use_sse2):
838	pcmpeqb 16(%rsi, %rcx), %xmm0		/* null check of next source dqword */
839	pmovmskb %xmm0, %edx
840	test	%edx, %edx
841	jnz	LABEL(unaligned_exit)
842#ifdef USE_AS_STRNCPY
843	sub	$16, %r8
844 	jbe	LABEL(strncpy_truncation_unaligned)
845#endif
846
847	movdqa	16(%rsi, %rcx), %xmm3		/* next source dqword */
848	movdqa	(%rsi, %rcx), %xmm2		/* current source dqword */
849
850	psrldq	$10, %xmm2			/* top 6 bytes of current -> low end */
851	pslldq	$6, %xmm3			/* low 10 bytes of next -> high end */
852	por	%xmm2, %xmm3			/* same result as palignr $10 */
853
854	movdqa	%xmm3, (%rdi, %rcx)		/* aligned 16-byte store */
855	add	$16, %rcx
856
	/* second unrolled pass */
857#ifdef USE_AS_STRNCPY
858	cmp	%r10, %r8
859	jbe	LABEL(unaligned_exit)
860#endif
861	pcmpeqb 16(%rsi, %rcx), %xmm0
862	pmovmskb %xmm0, %edx
863	test	%edx, %edx
864	jnz	LABEL(unaligned_exit)
865#ifdef USE_AS_STRNCPY
866	sub	$16, %r8
867 	jbe	LABEL(strncpy_truncation_unaligned)
868#endif
869
870	movdqa	16(%rsi, %rcx), %xmm3
871	movdqa	(%rsi, %rcx), %xmm2
872
873	psrldq	$10, %xmm2
874	pslldq	$6, %xmm3
875	por	%xmm2, %xmm3
876
877	movdqa	%xmm3, (%rdi, %rcx)
878	add	$16, %rcx
879#ifdef USE_AS_STRNCPY
880	cmp	%r10, %r8
881	jbe	LABEL(unaligned_exit)
882#endif
883	jmp	LABEL(ashr_10_use_sse2)		/* no null yet: keep copying */
884
885
886/*
887 * ashr_9 handles the following cases:
888 * 	(16 + (src offset - dest offset)) % 16 = 9
889 *
890 * Based on above operation, start from (%r9 + rsi) to the left of this cache
891 * bank, there is no null byte.
892 */
/*
 * Register use, as set up by the dispatch code ahead of the table jump:
 *	%rsi	16-byte aligned address of the current source dqword
 *	%rdi	16-byte aligned destination address
 *	%xmm0	all zero; a null check that finds nothing leaves it zero
 *	%r8	(strncpy only) bytes of count still to be satisfied
 *	%r10	(strncpy only) source bytes left in the current dqword
 * %rcx is cleared here and then used as the byte index for both pointers.
 *
 * Each pass null-checks the next source dqword at 16(%rsi, %rcx) and then
 * builds one aligned 16-byte store from the top 7 bytes of the current
 * dqword followed by the low 9 bytes of the next. The SSSE3 path does this
 * with a hand-encoded palignr, the SSE2 path with psrldq/pslldq/por. Both
 * loops are unrolled twice. The exit labels are not defined in this part of
 * the file.
 */
893	.p2align 4
894LABEL(ashr_9):
895	xor	%ecx, %ecx				/* clear index */
896#ifdef USE_AS_STRNCPY
897	cmp	%r10, %r8				/* count ends inside current dqword? */
898	jbe	LABEL(unaligned_exit)
899#endif
900	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
901	jz	LABEL(ashr_9_use_sse2)
902
903	.p2align 4
904LABEL(ashr_9_use_ssse3):
905	movdqa	16(%rsi, %rcx), %xmm3		/* next source dqword */
906	pcmpeqb	%xmm3, %xmm0			/* 0xff in each null byte position */
907	pmovmskb %xmm0, %edx
908	test	%edx, %edx
909	jnz	LABEL(unaligned_exit)		/* null found in next dqword */
910#ifdef USE_AS_STRNCPY
911	sub	$16, %r8			/* account for the 16 bytes about to be stored */
912 	jbe	LABEL(strncpy_truncation_unaligned)	/* count was <= 16 */
913#endif
914
915	#palignr $9, (%rsi, %rcx), %xmm3
916	.byte	0x66, 0x0F, 0x3A ,0x0F		/* palignr opcode bytes */
917	.byte	0x1c, 0x0e, 0x09		/* modrm (xmm3, sib), sib (%rsi + %rcx), imm8 */
918
919	movdqa	%xmm3, (%rdi, %rcx)		/* aligned 16-byte store */
920	add	$16, %rcx
921
	/* second unrolled pass */
922#ifdef USE_AS_STRNCPY
923	cmp	%r10, %r8
924	jbe	LABEL(unaligned_exit)
925#endif
926	movdqa	16(%rsi, %rcx), %xmm3
927	pcmpeqb %xmm3, %xmm0
928	pmovmskb %xmm0, %edx
929	test	%edx, %edx
930	jnz	LABEL(unaligned_exit)
931#ifdef USE_AS_STRNCPY
932	sub	$16, %r8
933 	jbe	LABEL(strncpy_truncation_unaligned)
934#endif
935
936	#palignr $9, (%rsi, %rcx), %xmm3
937	.byte	0x66, 0x0F, 0x3A ,0x0F
938	.byte	0x1c, 0x0e, 0x09
939
940	movdqa	%xmm3, (%rdi, %rcx)
941	add	$16, %rcx
942#ifdef USE_AS_STRNCPY
943	cmp	%r10, %r8
944	jbe	LABEL(unaligned_exit)
945#endif
946	jmp	LABEL(ashr_9_use_ssse3)		/* no null yet: keep copying */
947
948	.p2align 4
949LABEL(ashr_9_use_sse2):
950	pcmpeqb 16(%rsi, %rcx), %xmm0		/* null check of next source dqword */
951	pmovmskb %xmm0, %edx
952	test	%edx, %edx
953	jnz	LABEL(unaligned_exit)
954#ifdef USE_AS_STRNCPY
955	sub	$16, %r8
956 	jbe	LABEL(strncpy_truncation_unaligned)
957#endif
958
959	movdqa	16(%rsi, %rcx), %xmm3		/* next source dqword */
960	movdqa	(%rsi, %rcx), %xmm2		/* current source dqword */
961
962	psrldq	$9, %xmm2			/* top 7 bytes of current -> low end */
963	pslldq	$7, %xmm3			/* low 9 bytes of next -> high end */
964	por	%xmm2, %xmm3			/* same result as palignr $9 */
965
966	movdqa	%xmm3, (%rdi, %rcx)		/* aligned 16-byte store */
967	add	$16, %rcx
968
	/* second unrolled pass */
969#ifdef USE_AS_STRNCPY
970	cmp	%r10, %r8
971	jbe	LABEL(unaligned_exit)
972#endif
973	pcmpeqb 16(%rsi, %rcx), %xmm0
974	pmovmskb %xmm0, %edx
975	test	%edx, %edx
976	jnz	LABEL(unaligned_exit)
977#ifdef USE_AS_STRNCPY
978	sub	$16, %r8
979 	jbe	LABEL(strncpy_truncation_unaligned)
980#endif
981
982	movdqa	16(%rsi, %rcx), %xmm3
983	movdqa	(%rsi, %rcx), %xmm2
984
985	psrldq	$9, %xmm2
986	pslldq	$7, %xmm3
987	por	%xmm2, %xmm3
988
989	movdqa	%xmm3, (%rdi, %rcx)
990	add	$16, %rcx
991#ifdef USE_AS_STRNCPY
992	cmp	%r10, %r8
993	jbe	LABEL(unaligned_exit)
994#endif
995	jmp	LABEL(ashr_9_use_sse2)		/* no null yet: keep copying */
996
997
998/*
999 * ashr_8 handles the following cases:
1000 * 	(16 + (src offset - dest offset)) % 16 = 8
1001 *
1002 * Based on above operation, start from (%r9 + rsi) to the left of this cache
1003 * bank, there is no null byte.
1004 */
/*
 * Register use, as set up by the dispatch code ahead of the table jump:
 *	%rsi	16-byte aligned address of the current source dqword
 *	%rdi	16-byte aligned destination address
 *	%xmm0	all zero; a null check that finds nothing leaves it zero
 *	%r8	(strncpy only) bytes of count still to be satisfied
 *	%r10	(strncpy only) source bytes left in the current dqword
 * %rcx is cleared here and then used as the byte index for both pointers.
 *
 * Each pass null-checks the next source dqword at 16(%rsi, %rcx) and then
 * builds one aligned 16-byte store from the top 8 bytes of the current
 * dqword followed by the low 8 bytes of the next. The SSSE3 path does this
 * with a hand-encoded palignr, the SSE2 path with psrldq/pslldq/por. Both
 * loops are unrolled twice. The exit labels are not defined in this part of
 * the file.
 */
1005	.p2align 4
1006LABEL(ashr_8):
1007	xor	%ecx, %ecx				/* clear index */
1008#ifdef USE_AS_STRNCPY
1009	cmp	%r10, %r8				/* count ends inside current dqword? */
1010	jbe	LABEL(unaligned_exit)
1011#endif
1012	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
1013	jz	LABEL(ashr_8_use_sse2)
1014
1015	.p2align 4
1016LABEL(ashr_8_use_ssse3):
1017	movdqa	16(%rsi, %rcx), %xmm3		/* next source dqword */
1018	pcmpeqb	%xmm3, %xmm0			/* 0xff in each null byte position */
1019	pmovmskb %xmm0, %edx
1020	test	%edx, %edx
1021	jnz	LABEL(unaligned_exit)		/* null found in next dqword */
1022#ifdef USE_AS_STRNCPY
1023	sub	$16, %r8			/* account for the 16 bytes about to be stored */
1024 	jbe	LABEL(strncpy_truncation_unaligned)	/* count was <= 16 */
1025#endif
1026
1027	#palignr $8, (%rsi, %rcx), %xmm3
1028	.byte	0x66, 0x0F, 0x3A ,0x0F		/* palignr opcode bytes */
1029	.byte	0x1c, 0x0e, 0x08		/* modrm (xmm3, sib), sib (%rsi + %rcx), imm8 */
1030
1031	movdqa	%xmm3, (%rdi, %rcx)		/* aligned 16-byte store */
1032	add	$16, %rcx
1033
	/* second unrolled pass */
1034#ifdef USE_AS_STRNCPY
1035	cmp	%r10, %r8
1036	jbe	LABEL(unaligned_exit)
1037#endif
1038	movdqa	16(%rsi, %rcx), %xmm3
1039	pcmpeqb %xmm3, %xmm0
1040	pmovmskb %xmm0, %edx
1041	test	%edx, %edx
1042	jnz	LABEL(unaligned_exit)
1043#ifdef USE_AS_STRNCPY
1044	sub	$16, %r8
1045 	jbe	LABEL(strncpy_truncation_unaligned)
1046#endif
1047
1048	#palignr $8, (%rsi, %rcx), %xmm3
1049	.byte	0x66, 0x0F, 0x3A ,0x0F
1050	.byte	0x1c, 0x0e, 0x08
1051
1052	movdqa	%xmm3, (%rdi, %rcx)
1053	add	$16, %rcx
1054#ifdef USE_AS_STRNCPY
1055	cmp	%r10, %r8
1056	jbe	LABEL(unaligned_exit)
1057#endif
1058	jmp	LABEL(ashr_8_use_ssse3)		/* no null yet: keep copying */
1059
1060	.p2align 4
1061LABEL(ashr_8_use_sse2):
1062	pcmpeqb 16(%rsi, %rcx), %xmm0		/* null check of next source dqword */
1063	pmovmskb %xmm0, %edx
1064	test	%edx, %edx
1065	jnz	LABEL(unaligned_exit)
1066#ifdef USE_AS_STRNCPY
1067	sub	$16, %r8
1068 	jbe	LABEL(strncpy_truncation_unaligned)
1069#endif
1070
1071	movdqa	16(%rsi, %rcx), %xmm3		/* next source dqword */
1072	movdqa	(%rsi, %rcx), %xmm2		/* current source dqword */
1073
1074	psrldq	$8, %xmm2			/* top 8 bytes of current -> low end */
1075	pslldq	$8, %xmm3			/* low 8 bytes of next -> high end */
1076	por	%xmm2, %xmm3			/* same result as palignr $8 */
1077
1078	movdqa	%xmm3, (%rdi, %rcx)		/* aligned 16-byte store */
1079	add	$16, %rcx
1080
	/* second unrolled pass */
1081#ifdef USE_AS_STRNCPY
1082	cmp	%r10, %r8
1083	jbe	LABEL(unaligned_exit)
1084#endif
1085	pcmpeqb 16(%rsi, %rcx), %xmm0
1086	pmovmskb %xmm0, %edx
1087	test	%edx, %edx
1088	jnz	LABEL(unaligned_exit)
1089#ifdef USE_AS_STRNCPY
1090	sub	$16, %r8
1091 	jbe	LABEL(strncpy_truncation_unaligned)
1092#endif
1093
1094	movdqa	16(%rsi, %rcx), %xmm3
1095	movdqa	(%rsi, %rcx), %xmm2
1096
1097	psrldq	$8, %xmm2
1098	pslldq	$8, %xmm3
1099	por	%xmm2, %xmm3
1100
1101	movdqa	%xmm3, (%rdi, %rcx)
1102	add	$16, %rcx
1103#ifdef USE_AS_STRNCPY
1104	cmp	%r10, %r8
1105	jbe	LABEL(unaligned_exit)
1106#endif
1107	jmp	LABEL(ashr_8_use_sse2)		/* no null yet: keep copying */
1108
1109
1110/*
1111 * ashr_7 handles the following cases:
1112 * 	(16 + (src offset - dest offset)) % 16 = 7
1113 *
1114 * Based on above operation, start from (%r9 + rsi) to the left of this cache
1115 * bank, there is no null byte.
1116 */
/*
 * Register use, as set up by the dispatch code ahead of the table jump:
 *	%rsi	16-byte aligned address of the current source dqword
 *	%rdi	16-byte aligned destination address
 *	%xmm0	all zero; a null check that finds nothing leaves it zero
 *	%r8	(strncpy only) bytes of count still to be satisfied
 *	%r10	(strncpy only) source bytes left in the current dqword
 * %rcx is cleared here and then used as the byte index for both pointers.
 *
 * Each pass null-checks the next source dqword at 16(%rsi, %rcx) and then
 * builds one aligned 16-byte store from the top 9 bytes of the current
 * dqword followed by the low 7 bytes of the next. The SSSE3 path does this
 * with a hand-encoded palignr, the SSE2 path with psrldq/pslldq/por. Both
 * loops are unrolled twice. The exit labels are not defined in this part of
 * the file.
 */
1117	.p2align 4
1118LABEL(ashr_7):
1119	xor	%ecx, %ecx				/* clear index */
1120#ifdef USE_AS_STRNCPY
1121	cmp	%r10, %r8				/* count ends inside current dqword? */
1122	jbe	LABEL(unaligned_exit)
1123#endif
1124	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
1125	jz	LABEL(ashr_7_use_sse2)
1126
1127	.p2align 4
1128LABEL(ashr_7_use_ssse3):
1129	movdqa	16(%rsi, %rcx), %xmm3		/* next source dqword */
1130	pcmpeqb	%xmm3, %xmm0			/* 0xff in each null byte position */
1131	pmovmskb %xmm0, %edx
1132	test	%edx, %edx
1133	jnz	LABEL(unaligned_exit)		/* null found in next dqword */
1134#ifdef USE_AS_STRNCPY
1135	sub	$16, %r8			/* account for the 16 bytes about to be stored */
1136 	jbe	LABEL(strncpy_truncation_unaligned)	/* count was <= 16 */
1137#endif
1138
1139	#palignr $7, (%rsi, %rcx), %xmm3
1140	.byte	0x66, 0x0F, 0x3A ,0x0F		/* palignr opcode bytes */
1141	.byte	0x1c, 0x0e, 0x07		/* modrm (xmm3, sib), sib (%rsi + %rcx), imm8 */
1142
1143	movdqa	%xmm3, (%rdi, %rcx)		/* aligned 16-byte store */
1144	add	$16, %rcx
1145
	/* second unrolled pass */
1146#ifdef USE_AS_STRNCPY
1147	cmp	%r10, %r8
1148	jbe	LABEL(unaligned_exit)
1149#endif
1150	movdqa	16(%rsi, %rcx), %xmm3
1151	pcmpeqb %xmm3, %xmm0
1152	pmovmskb %xmm0, %edx
1153	test	%edx, %edx
1154	jnz	LABEL(unaligned_exit)
1155#ifdef USE_AS_STRNCPY
1156	sub	$16, %r8
1157 	jbe	LABEL(strncpy_truncation_unaligned)
1158#endif
1159
1160	#palignr $7, (%rsi, %rcx), %xmm3
1161	.byte	0x66, 0x0F, 0x3A ,0x0F
1162	.byte	0x1c, 0x0e, 0x07
1163
1164	movdqa	%xmm3, (%rdi, %rcx)
1165	add	$16, %rcx
1166#ifdef USE_AS_STRNCPY
1167	cmp	%r10, %r8
1168	jbe	LABEL(unaligned_exit)
1169#endif
1170	jmp	LABEL(ashr_7_use_ssse3)		/* no null yet: keep copying */
1171
1172	.p2align 4
1173LABEL(ashr_7_use_sse2):
1174	pcmpeqb 16(%rsi, %rcx), %xmm0		/* null check of next source dqword */
1175	pmovmskb %xmm0, %edx
1176	test	%edx, %edx
1177	jnz	LABEL(unaligned_exit)
1178#ifdef USE_AS_STRNCPY
1179	sub	$16, %r8
1180 	jbe	LABEL(strncpy_truncation_unaligned)
1181#endif
1182
1183	movdqa	16(%rsi, %rcx), %xmm3		/* next source dqword */
1184	movdqa	(%rsi, %rcx), %xmm2		/* current source dqword */
1185
1186	psrldq	$7, %xmm2			/* top 9 bytes of current -> low end */
1187	pslldq	$9, %xmm3			/* low 7 bytes of next -> high end */
1188	por	%xmm2, %xmm3			/* same result as palignr $7 */
1189
1190	movdqa	%xmm3, (%rdi, %rcx)		/* aligned 16-byte store */
1191	add	$16, %rcx
1192
	/* second unrolled pass */
1193#ifdef USE_AS_STRNCPY
1194	cmp	%r10, %r8
1195	jbe	LABEL(unaligned_exit)
1196#endif
1197	pcmpeqb 16(%rsi, %rcx), %xmm0
1198	pmovmskb %xmm0, %edx
1199	test	%edx, %edx
1200	jnz	LABEL(unaligned_exit)
1201#ifdef USE_AS_STRNCPY
1202	sub	$16, %r8
1203 	jbe	LABEL(strncpy_truncation_unaligned)
1204#endif
1205
1206	movdqa	16(%rsi, %rcx), %xmm3
1207	movdqa	(%rsi, %rcx), %xmm2
1208
1209	psrldq	$7, %xmm2
1210	pslldq	$9, %xmm3
1211	por	%xmm2, %xmm3
1212
1213	movdqa	%xmm3, (%rdi, %rcx)
1214	add	$16, %rcx
1215#ifdef USE_AS_STRNCPY
1216	cmp	%r10, %r8
1217	jbe	LABEL(unaligned_exit)
1218#endif
1219	jmp	LABEL(ashr_7_use_sse2)		/* no null yet: keep copying */
1220
1221
1222/*
1223 * ashr_6 handles the following cases:
1224 * 	(16 + (src offset - dest offset)) % 16 = 6
1225 *
1226 * Based on above operation, start from (%r9 + rsi) to the left of this cache
1227 * bank, there is no null byte.
1228 */
/*
 * Register use, as set up by the dispatch code ahead of the table jump:
 *	%rsi	16-byte aligned address of the current source dqword
 *	%rdi	16-byte aligned destination address
 *	%xmm0	all zero; a null check that finds nothing leaves it zero
 *	%r8	(strncpy only) bytes of count still to be satisfied
 *	%r10	(strncpy only) source bytes left in the current dqword
 * %rcx is cleared here and then used as the byte index for both pointers.
 *
 * Each pass null-checks the next source dqword at 16(%rsi, %rcx) and then
 * builds one aligned 16-byte store from the top 10 bytes of the current
 * dqword followed by the low 6 bytes of the next. The SSSE3 path does this
 * with a hand-encoded palignr, the SSE2 path with psrldq/pslldq/por. Both
 * loops are unrolled twice. The exit labels are not defined in this part of
 * the file.
 */
1229	.p2align 4
1230LABEL(ashr_6):
1231	xor	%ecx, %ecx				/* clear index */
1232#ifdef USE_AS_STRNCPY
1233	cmp	%r10, %r8				/* count ends inside current dqword? */
1234	jbe	LABEL(unaligned_exit)
1235#endif
1236	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
1237	jz	LABEL(ashr_6_use_sse2)
1238
1239	.p2align 4
1240LABEL(ashr_6_use_ssse3):
1241	movdqa	16(%rsi, %rcx), %xmm3		/* next source dqword */
1242	pcmpeqb	%xmm3, %xmm0			/* 0xff in each null byte position */
1243	pmovmskb %xmm0, %edx
1244	test	%edx, %edx
1245	jnz	LABEL(unaligned_exit)		/* null found in next dqword */
1246#ifdef USE_AS_STRNCPY
1247	sub	$16, %r8			/* account for the 16 bytes about to be stored */
1248 	jbe	LABEL(strncpy_truncation_unaligned)	/* count was <= 16 */
1249#endif
1250
1251	#palignr $6, (%rsi, %rcx), %xmm3
1252	.byte	0x66, 0x0F, 0x3A ,0x0F		/* palignr opcode bytes */
1253	.byte	0x1c, 0x0e, 0x06		/* modrm (xmm3, sib), sib (%rsi + %rcx), imm8 */
1254
1255	movdqa	%xmm3, (%rdi, %rcx)		/* aligned 16-byte store */
1256	add	$16, %rcx
1257
	/* second unrolled pass */
1258#ifdef USE_AS_STRNCPY
1259	cmp	%r10, %r8
1260	jbe	LABEL(unaligned_exit)
1261#endif
1262	movdqa	16(%rsi, %rcx), %xmm3
1263	pcmpeqb %xmm3, %xmm0
1264	pmovmskb %xmm0, %edx
1265	test	%edx, %edx
1266	jnz	LABEL(unaligned_exit)
1267#ifdef USE_AS_STRNCPY
1268	sub	$16, %r8
1269 	jbe	LABEL(strncpy_truncation_unaligned)
1270#endif
1271
1272	#palignr $6, (%rsi, %rcx), %xmm3
1273	.byte	0x66, 0x0F, 0x3A ,0x0F
1274	.byte	0x1c, 0x0e, 0x06
1275
1276	movdqa	%xmm3, (%rdi, %rcx)
1277	add	$16, %rcx
1278#ifdef USE_AS_STRNCPY
1279	cmp	%r10, %r8
1280	jbe	LABEL(unaligned_exit)
1281#endif
1282	jmp	LABEL(ashr_6_use_ssse3)		/* no null yet: keep copying */
1283
1284	.p2align 4
1285LABEL(ashr_6_use_sse2):
1286	pcmpeqb 16(%rsi, %rcx), %xmm0		/* null check of next source dqword */
1287	pmovmskb %xmm0, %edx
1288	test	%edx, %edx
1289	jnz	LABEL(unaligned_exit)
1290#ifdef USE_AS_STRNCPY
1291	sub	$16, %r8
1292 	jbe	LABEL(strncpy_truncation_unaligned)
1293#endif
1294
1295	movdqa	16(%rsi, %rcx), %xmm3		/* next source dqword */
1296	movdqa	(%rsi, %rcx), %xmm2		/* current source dqword */
1297
1298	psrldq	$6, %xmm2			/* top 10 bytes of current -> low end */
1299	pslldq	$10, %xmm3			/* low 6 bytes of next -> high end */
1300	por	%xmm2, %xmm3			/* same result as palignr $6 */
1301
1302	movdqa	%xmm3, (%rdi, %rcx)		/* aligned 16-byte store */
1303	add	$16, %rcx
1304
	/* second unrolled pass */
1305#ifdef USE_AS_STRNCPY
1306	cmp	%r10, %r8
1307	jbe	LABEL(unaligned_exit)
1308#endif
1309	pcmpeqb 16(%rsi, %rcx), %xmm0
1310	pmovmskb %xmm0, %edx
1311	test	%edx, %edx
1312	jnz	LABEL(unaligned_exit)
1313#ifdef USE_AS_STRNCPY
1314	sub	$16, %r8
1315 	jbe	LABEL(strncpy_truncation_unaligned)
1316#endif
1317
1318	movdqa	16(%rsi, %rcx), %xmm3
1319	movdqa	(%rsi, %rcx), %xmm2
1320
1321	psrldq	$6, %xmm2
1322	pslldq	$10, %xmm3
1323	por	%xmm2, %xmm3
1324
1325	movdqa	%xmm3, (%rdi, %rcx)
1326	add	$16, %rcx
1327#ifdef USE_AS_STRNCPY
1328	cmp	%r10, %r8
1329	jbe	LABEL(unaligned_exit)
1330#endif
1331	jmp	LABEL(ashr_6_use_sse2)		/* no null yet: keep copying */
1332
1333
1334/*
1335 * ashr_5 handles the following cases:
1336 * 	(16 + (src offset - dest offset)) % 16 = 5
1337 *
1338 * Based on above operation, start from (%r9 + rsi) to the left of this cache
1339 * bank, there is no null byte.
1340 */
/*
 * Structure: the entry zeroes the running index %rcx and selects one of two
 * equivalent copy loops according to the USE_SSSE3 bit of .memops_method.
 * Both loops are unrolled twice.  Each half inspects the aligned 16 source
 * bytes at 16(%rsi, %rcx) for a null, builds one destination chunk from the
 * upper 11 bytes at (%rsi, %rcx) followed by the lower 5 bytes at
 * 16(%rsi, %rcx), stores it at (%rdi, %rcx) and advances %rcx by 16.
 * movdqa requires both addresses to be 16-byte aligned.
 *
 * The null test compares against %xmm0, so %xmm0 is assumed to be all zero
 * on entry (TODO confirm against the dispatch code); pcmpeqb leaves it all
 * zero again whenever no null byte is found.
 *
 * For strncpy, %r8 tracks the remaining count: a half leaves through
 * strncpy_truncation_unaligned when %r8 <= 16 and through unaligned_exit
 * when %r8 <= %r10.
 */
1341	.p2align 4
1342LABEL(ashr_5):
1343	xor	%ecx, %ecx				/* clear index */
1344#ifdef USE_AS_STRNCPY
1345	cmp	%r10, %r8
1346	jbe	LABEL(unaligned_exit)
1347#endif
1348	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
1349	jz	LABEL(ashr_5_use_sse2)
1350
1351	.p2align 4
1352LABEL(ashr_5_use_ssse3):
1353	movdqa	16(%rsi, %rcx), %xmm3	/* next aligned 16 source bytes */
1354	pcmpeqb	%xmm3, %xmm0	/* 0xff in every byte lane that holds a null */
1355	pmovmskb %xmm0, %edx	/* one mask bit per byte lane */
1356	test	%edx, %edx
1357	jnz	LABEL(unaligned_exit)	/* null found: finish in the exit tail */
1358#ifdef USE_AS_STRNCPY
1359	sub	$16, %r8
1360 	jbe	LABEL(strncpy_truncation_unaligned)
1361#endif
1362
	/* hand-encoded SSSE3 instruction; the final byte is the shift count */
1363	#palignr $5, (%rsi, %rcx), %xmm3
1364	.byte	0x66, 0x0F, 0x3A ,0x0F
1365	.byte	0x1c, 0x0e, 0x05
1366
1367	movdqa	%xmm3, (%rdi, %rcx)	/* aligned 16-byte store to the destination */
1368	add	$16, %rcx
1369
1370#ifdef USE_AS_STRNCPY
1371	cmp	%r10, %r8
1372	jbe	LABEL(unaligned_exit)
1373#endif
	/* second unrolled half: same sequence on the following chunk */
1374	movdqa	16(%rsi, %rcx), %xmm3
1375	pcmpeqb %xmm3, %xmm0
1376	pmovmskb %xmm0, %edx
1377	test	%edx, %edx
1378	jnz	LABEL(unaligned_exit)
1379#ifdef USE_AS_STRNCPY
1380	sub	$16, %r8
1381 	jbe	LABEL(strncpy_truncation_unaligned)
1382#endif
1383
1384	#palignr $5, (%rsi, %rcx), %xmm3
1385	.byte	0x66, 0x0F, 0x3A ,0x0F
1386	.byte	0x1c, 0x0e, 0x05
1387
1388	movdqa	%xmm3, (%rdi, %rcx)
1389	add	$16, %rcx
1390#ifdef USE_AS_STRNCPY
1391	cmp	%r10, %r8
1392	jbe	LABEL(unaligned_exit)
1393#endif
1394	jmp	LABEL(ashr_5_use_ssse3)
1395
1396	.p2align 4
1397LABEL(ashr_5_use_sse2):
1398	pcmpeqb 16(%rsi, %rcx), %xmm0	/* look for a null in the next chunk */
1399	pmovmskb %xmm0, %edx
1400	test	%edx, %edx
1401	jnz	LABEL(unaligned_exit)
1402#ifdef USE_AS_STRNCPY
1403	sub	$16, %r8
1404 	jbe	LABEL(strncpy_truncation_unaligned)
1405#endif
1406
1407	movdqa	16(%rsi, %rcx), %xmm3	/* next chunk */
1408	movdqa	(%rsi, %rcx), %xmm2	/* current chunk */
1409
1410	psrldq	$5, %xmm2	/* upper 11 bytes of current chunk -> lanes 0-10 */
1411	pslldq	$11, %xmm3	/* lower 5 bytes of next chunk -> lanes 11-15 */
1412	por	%xmm2, %xmm3
1413
1414	movdqa	%xmm3, (%rdi, %rcx)
1415	add	$16, %rcx
1416
1417#ifdef USE_AS_STRNCPY
1418	cmp	%r10, %r8
1419	jbe	LABEL(unaligned_exit)
1420#endif
	/* second unrolled half: same sequence on the following chunk */
1421	pcmpeqb 16(%rsi, %rcx), %xmm0
1422	pmovmskb %xmm0, %edx
1423	test	%edx, %edx
1424	jnz	LABEL(unaligned_exit)
1425#ifdef USE_AS_STRNCPY
1426	sub	$16, %r8
1427 	jbe	LABEL(strncpy_truncation_unaligned)
1428#endif
1429
1430	movdqa	16(%rsi, %rcx), %xmm3
1431	movdqa	(%rsi, %rcx), %xmm2
1432
1433	psrldq	$5, %xmm2
1434	pslldq	$11, %xmm3
1435	por	%xmm2, %xmm3
1436
1437	movdqa	%xmm3, (%rdi, %rcx)
1438	add	$16, %rcx
1439#ifdef USE_AS_STRNCPY
1440	cmp	%r10, %r8
1441	jbe	LABEL(unaligned_exit)
1442#endif
1443	jmp	LABEL(ashr_5_use_sse2)
1444
1445
1446/*
1447 * ashr_4 handles the following cases:
1448 * 	(16 + (src offset - dest offset)) % 16 = 4
1449 *
1450 * Based on above operation, start from (%r9 + rsi) to the left of this cache
1451 * bank, there is no null byte.
1452 */
/*
 * Structure: the entry zeroes the running index %rcx and selects one of two
 * equivalent copy loops according to the USE_SSSE3 bit of .memops_method.
 * Both loops are unrolled twice.  Each half inspects the aligned 16 source
 * bytes at 16(%rsi, %rcx) for a null, builds one destination chunk from the
 * upper 12 bytes at (%rsi, %rcx) followed by the lower 4 bytes at
 * 16(%rsi, %rcx), stores it at (%rdi, %rcx) and advances %rcx by 16.
 * movdqa requires both addresses to be 16-byte aligned.
 *
 * The null test compares against %xmm0, so %xmm0 is assumed to be all zero
 * on entry (TODO confirm against the dispatch code); pcmpeqb leaves it all
 * zero again whenever no null byte is found.
 *
 * For strncpy, %r8 tracks the remaining count: a half leaves through
 * strncpy_truncation_unaligned when %r8 <= 16 and through unaligned_exit
 * when %r8 <= %r10.
 */
1453	.p2align 4
1454LABEL(ashr_4):
1455	xor	%ecx, %ecx				/* clear index */
1456#ifdef USE_AS_STRNCPY
1457	cmp	%r10, %r8
1458	jbe	LABEL(unaligned_exit)
1459#endif
1460	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
1461	jz	LABEL(ashr_4_use_sse2)
1462
1463	.p2align 4
1464LABEL(ashr_4_use_ssse3):
1465	movdqa	16(%rsi, %rcx), %xmm3	/* next aligned 16 source bytes */
1466	pcmpeqb	%xmm3, %xmm0	/* 0xff in every byte lane that holds a null */
1467	pmovmskb %xmm0, %edx	/* one mask bit per byte lane */
1468	test	%edx, %edx
1469	jnz	LABEL(unaligned_exit)	/* null found: finish in the exit tail */
1470#ifdef USE_AS_STRNCPY
1471	sub	$16, %r8
1472 	jbe	LABEL(strncpy_truncation_unaligned)
1473#endif
1474
	/* hand-encoded SSSE3 instruction; the final byte is the shift count */
1475	#palignr $4, (%rsi, %rcx), %xmm3
1476	.byte	0x66, 0x0F, 0x3A ,0x0F
1477	.byte	0x1c, 0x0e, 0x04
1478
1479	movdqa	%xmm3, (%rdi, %rcx)	/* aligned 16-byte store to the destination */
1480	add	$16, %rcx
1481
1482#ifdef USE_AS_STRNCPY
1483	cmp	%r10, %r8
1484	jbe	LABEL(unaligned_exit)
1485#endif
	/* second unrolled half: same sequence on the following chunk */
1486	movdqa	16(%rsi, %rcx), %xmm3
1487	pcmpeqb %xmm3, %xmm0
1488	pmovmskb %xmm0, %edx
1489	test	%edx, %edx
1490	jnz	LABEL(unaligned_exit)
1491#ifdef USE_AS_STRNCPY
1492	sub	$16, %r8
1493 	jbe	LABEL(strncpy_truncation_unaligned)
1494#endif
1495
1496	#palignr $4, (%rsi, %rcx), %xmm3
1497	.byte	0x66, 0x0F, 0x3A ,0x0F
1498	.byte	0x1c, 0x0e, 0x04
1499
1500	movdqa	%xmm3, (%rdi, %rcx)
1501	add	$16, %rcx
1502#ifdef USE_AS_STRNCPY
1503	cmp	%r10, %r8
1504	jbe	LABEL(unaligned_exit)
1505#endif
1506	jmp	LABEL(ashr_4_use_ssse3)
1507
1508	.p2align 4
1509LABEL(ashr_4_use_sse2):
1510	pcmpeqb 16(%rsi, %rcx), %xmm0	/* look for a null in the next chunk */
1511	pmovmskb %xmm0, %edx
1512	test	%edx, %edx
1513	jnz	LABEL(unaligned_exit)
1514#ifdef USE_AS_STRNCPY
1515	sub	$16, %r8
1516 	jbe	LABEL(strncpy_truncation_unaligned)
1517#endif
1518
1519	movdqa	16(%rsi, %rcx), %xmm3	/* next chunk */
1520	movdqa	(%rsi, %rcx), %xmm2	/* current chunk */
1521
1522	psrldq	$4, %xmm2	/* upper 12 bytes of current chunk -> lanes 0-11 */
1523	pslldq	$12, %xmm3	/* lower 4 bytes of next chunk -> lanes 12-15 */
1524	por	%xmm2, %xmm3
1525
1526	movdqa	%xmm3, (%rdi, %rcx)
1527	add	$16, %rcx
1528
1529#ifdef USE_AS_STRNCPY
1530	cmp	%r10, %r8
1531	jbe	LABEL(unaligned_exit)
1532#endif
	/* second unrolled half: same sequence on the following chunk */
1533	pcmpeqb 16(%rsi, %rcx), %xmm0
1534	pmovmskb %xmm0, %edx
1535	test	%edx, %edx
1536	jnz	LABEL(unaligned_exit)
1537#ifdef USE_AS_STRNCPY
1538	sub	$16, %r8
1539 	jbe	LABEL(strncpy_truncation_unaligned)
1540#endif
1541
1542	movdqa	16(%rsi, %rcx), %xmm3
1543	movdqa	(%rsi, %rcx), %xmm2
1544
1545	psrldq	$4, %xmm2
1546	pslldq	$12, %xmm3
1547	por	%xmm2, %xmm3
1548
1549	movdqa	%xmm3, (%rdi, %rcx)
1550	add	$16, %rcx
1551#ifdef USE_AS_STRNCPY
1552	cmp	%r10, %r8
1553	jbe	LABEL(unaligned_exit)
1554#endif
1555	jmp	LABEL(ashr_4_use_sse2)
1556
1557
1558/*
1559 * ashr_3 handles the following cases:
1560 * 	(16 + (src offset - dest offset)) % 16 = 3
1561 *
1562 * Based on above operation, start from (%r9 + rsi) to the left of this cache
1563 * bank, there is no null byte.
1564 */
/*
 * Structure: the entry zeroes the running index %rcx and selects one of two
 * equivalent copy loops according to the USE_SSSE3 bit of .memops_method.
 * Both loops are unrolled twice.  Each half inspects the aligned 16 source
 * bytes at 16(%rsi, %rcx) for a null, builds one destination chunk from the
 * upper 13 bytes at (%rsi, %rcx) followed by the lower 3 bytes at
 * 16(%rsi, %rcx), stores it at (%rdi, %rcx) and advances %rcx by 16.
 * movdqa requires both addresses to be 16-byte aligned.
 *
 * The null test compares against %xmm0, so %xmm0 is assumed to be all zero
 * on entry (TODO confirm against the dispatch code); pcmpeqb leaves it all
 * zero again whenever no null byte is found.
 *
 * For strncpy, %r8 tracks the remaining count: a half leaves through
 * strncpy_truncation_unaligned when %r8 <= 16 and through unaligned_exit
 * when %r8 <= %r10.
 */
1565	.p2align 4
1566LABEL(ashr_3):
1567	xor	%ecx, %ecx				/* clear index */
1568#ifdef USE_AS_STRNCPY
1569	cmp	%r10, %r8
1570	jbe	LABEL(unaligned_exit)
1571#endif
1572	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
1573	jz	LABEL(ashr_3_use_sse2)
1574
1575	.p2align 4
1576LABEL(ashr_3_use_ssse3):
1577	movdqa	16(%rsi, %rcx), %xmm3	/* next aligned 16 source bytes */
1578	pcmpeqb	%xmm3, %xmm0	/* 0xff in every byte lane that holds a null */
1579	pmovmskb %xmm0, %edx	/* one mask bit per byte lane */
1580	test	%edx, %edx
1581	jnz	LABEL(unaligned_exit)	/* null found: finish in the exit tail */
1582#ifdef USE_AS_STRNCPY
1583	sub	$16, %r8
1584 	jbe	LABEL(strncpy_truncation_unaligned)
1585#endif
1586
	/* hand-encoded SSSE3 instruction; the final byte is the shift count */
1587	#palignr $3, (%rsi, %rcx), %xmm3
1588	.byte	0x66, 0x0F, 0x3A ,0x0F
1589	.byte	0x1c, 0x0e, 0x03
1590
1591	movdqa	%xmm3, (%rdi, %rcx)	/* aligned 16-byte store to the destination */
1592	add	$16, %rcx
1593
1594#ifdef USE_AS_STRNCPY
1595	cmp	%r10, %r8
1596	jbe	LABEL(unaligned_exit)
1597#endif
	/* second unrolled half: same sequence on the following chunk */
1598	movdqa	16(%rsi, %rcx), %xmm3
1599	pcmpeqb %xmm3, %xmm0
1600	pmovmskb %xmm0, %edx
1601	test	%edx, %edx
1602	jnz	LABEL(unaligned_exit)
1603#ifdef USE_AS_STRNCPY
1604	sub	$16, %r8
1605 	jbe	LABEL(strncpy_truncation_unaligned)
1606#endif
1607
1608	#palignr $3, (%rsi, %rcx), %xmm3
1609	.byte	0x66, 0x0F, 0x3A ,0x0F
1610	.byte	0x1c, 0x0e, 0x03
1611
1612	movdqa	%xmm3, (%rdi, %rcx)
1613	add	$16, %rcx
1614#ifdef USE_AS_STRNCPY
1615	cmp	%r10, %r8
1616	jbe	LABEL(unaligned_exit)
1617#endif
1618	jmp	LABEL(ashr_3_use_ssse3)
1619
1620	.p2align 4
1621LABEL(ashr_3_use_sse2):
1622	pcmpeqb 16(%rsi, %rcx), %xmm0	/* look for a null in the next chunk */
1623	pmovmskb %xmm0, %edx
1624	test	%edx, %edx
1625	jnz	LABEL(unaligned_exit)
1626#ifdef USE_AS_STRNCPY
1627	sub	$16, %r8
1628 	jbe	LABEL(strncpy_truncation_unaligned)
1629#endif
1630
1631	movdqa	16(%rsi, %rcx), %xmm3	/* next chunk */
1632	movdqa	(%rsi, %rcx), %xmm2	/* current chunk */
1633
1634	psrldq	$3, %xmm2	/* upper 13 bytes of current chunk -> lanes 0-12 */
1635	pslldq	$13, %xmm3	/* lower 3 bytes of next chunk -> lanes 13-15 */
1636	por	%xmm2, %xmm3
1637
1638	movdqa	%xmm3, (%rdi, %rcx)
1639	add	$16, %rcx
1640
1641#ifdef USE_AS_STRNCPY
1642	cmp	%r10, %r8
1643	jbe	LABEL(unaligned_exit)
1644#endif
	/* second unrolled half: same sequence on the following chunk */
1645	pcmpeqb 16(%rsi, %rcx), %xmm0
1646	pmovmskb %xmm0, %edx
1647	test	%edx, %edx
1648	jnz	LABEL(unaligned_exit)
1649#ifdef USE_AS_STRNCPY
1650	sub	$16, %r8
1651 	jbe	LABEL(strncpy_truncation_unaligned)
1652#endif
1653
1654	movdqa	16(%rsi, %rcx), %xmm3
1655	movdqa	(%rsi, %rcx), %xmm2
1656
1657	psrldq	$3, %xmm2
1658	pslldq	$13, %xmm3
1659	por	%xmm2, %xmm3
1660
1661	movdqa	%xmm3, (%rdi, %rcx)
1662	add	$16, %rcx
1663#ifdef USE_AS_STRNCPY
1664	cmp	%r10, %r8
1665	jbe	LABEL(unaligned_exit)
1666#endif
1667	jmp	LABEL(ashr_3_use_sse2)
1668
1669
1670/*
1671 * ashr_2 handles the following cases:
1672 * 	(16 + (src offset - dest offset)) % 16 = 2
1673 *
1674 * Based on above operation, start from (%r9 + rsi) to the left of this cache
1675 * bank, there is no null byte.
1676 */
/*
 * Structure: the entry zeroes the running index %rcx and selects one of two
 * equivalent copy loops according to the USE_SSSE3 bit of .memops_method.
 * Both loops are unrolled twice.  Each half inspects the aligned 16 source
 * bytes at 16(%rsi, %rcx) for a null, builds one destination chunk from the
 * upper 14 bytes at (%rsi, %rcx) followed by the lower 2 bytes at
 * 16(%rsi, %rcx), stores it at (%rdi, %rcx) and advances %rcx by 16.
 * movdqa requires both addresses to be 16-byte aligned.
 *
 * The null test compares against %xmm0, so %xmm0 is assumed to be all zero
 * on entry (TODO confirm against the dispatch code); pcmpeqb leaves it all
 * zero again whenever no null byte is found.
 *
 * For strncpy, %r8 tracks the remaining count: a half leaves through
 * strncpy_truncation_unaligned when %r8 <= 16 and through unaligned_exit
 * when %r8 <= %r10.
 */
1677	.p2align 4
1678LABEL(ashr_2):
1679	xor	%ecx, %ecx				/* clear index */
1680#ifdef USE_AS_STRNCPY
1681	cmp	%r10, %r8
1682	jbe	LABEL(unaligned_exit)
1683#endif
1684	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
1685	jz	LABEL(ashr_2_use_sse2)
1686
1687	.p2align 4
1688LABEL(ashr_2_use_ssse3):
1689	movdqa	16(%rsi, %rcx), %xmm3	/* next aligned 16 source bytes */
1690	pcmpeqb	%xmm3, %xmm0	/* 0xff in every byte lane that holds a null */
1691	pmovmskb %xmm0, %edx	/* one mask bit per byte lane */
1692	test	%edx, %edx
1693	jnz	LABEL(unaligned_exit)	/* null found: finish in the exit tail */
1694#ifdef USE_AS_STRNCPY
1695	sub	$16, %r8
1696 	jbe	LABEL(strncpy_truncation_unaligned)
1697#endif
1698
	/* hand-encoded SSSE3 instruction; the final byte is the shift count */
1699	#palignr $2, (%rsi, %rcx), %xmm3
1700	.byte	0x66, 0x0F, 0x3A ,0x0F
1701	.byte	0x1c, 0x0e, 0x02
1702
1703	movdqa	%xmm3, (%rdi, %rcx)	/* aligned 16-byte store to the destination */
1704	add	$16, %rcx
1705
1706#ifdef USE_AS_STRNCPY
1707	cmp	%r10, %r8
1708	jbe	LABEL(unaligned_exit)
1709#endif
	/* second unrolled half: same sequence on the following chunk */
1710	movdqa	16(%rsi, %rcx), %xmm3
1711	pcmpeqb %xmm3, %xmm0
1712	pmovmskb %xmm0, %edx
1713	test	%edx, %edx
1714	jnz	LABEL(unaligned_exit)
1715#ifdef USE_AS_STRNCPY
1716	sub	$16, %r8
1717 	jbe	LABEL(strncpy_truncation_unaligned)
1718#endif
1719
1720	#palignr $2, (%rsi, %rcx), %xmm3
1721	.byte	0x66, 0x0F, 0x3A ,0x0F
1722	.byte	0x1c, 0x0e, 0x02
1723
1724	movdqa	%xmm3, (%rdi, %rcx)
1725	add	$16, %rcx
1726#ifdef USE_AS_STRNCPY
1727	cmp	%r10, %r8
1728	jbe	LABEL(unaligned_exit)
1729#endif
1730	jmp	LABEL(ashr_2_use_ssse3)
1731
1732	.p2align 4
1733LABEL(ashr_2_use_sse2):
1734	pcmpeqb 16(%rsi, %rcx), %xmm0	/* look for a null in the next chunk */
1735	pmovmskb %xmm0, %edx
1736	test	%edx, %edx
1737	jnz	LABEL(unaligned_exit)
1738#ifdef USE_AS_STRNCPY
1739	sub	$16, %r8
1740 	jbe	LABEL(strncpy_truncation_unaligned)
1741#endif
1742
1743	movdqa	16(%rsi, %rcx), %xmm3	/* next chunk */
1744	movdqa	(%rsi, %rcx), %xmm2	/* current chunk */
1745
1746	psrldq	$2, %xmm2	/* upper 14 bytes of current chunk -> lanes 0-13 */
1747	pslldq	$14, %xmm3	/* lower 2 bytes of next chunk -> lanes 14-15 */
1748	por	%xmm2, %xmm3
1749
1750	movdqa	%xmm3, (%rdi, %rcx)
1751	add	$16, %rcx
1752
1753#ifdef USE_AS_STRNCPY
1754	cmp	%r10, %r8
1755	jbe	LABEL(unaligned_exit)
1756#endif
	/* second unrolled half: same sequence on the following chunk */
1757	pcmpeqb 16(%rsi, %rcx), %xmm0
1758	pmovmskb %xmm0, %edx
1759	test	%edx, %edx
1760	jnz	LABEL(unaligned_exit)
1761#ifdef USE_AS_STRNCPY
1762	sub	$16, %r8
1763 	jbe	LABEL(strncpy_truncation_unaligned)
1764#endif
1765
1766	movdqa	16(%rsi, %rcx), %xmm3
1767	movdqa	(%rsi, %rcx), %xmm2
1768
1769	psrldq	$2, %xmm2
1770	pslldq	$14, %xmm3
1771	por	%xmm2, %xmm3
1772
1773	movdqa	%xmm3, (%rdi, %rcx)
1774	add	$16, %rcx
1775#ifdef USE_AS_STRNCPY
1776	cmp	%r10, %r8
1777	jbe	LABEL(unaligned_exit)
1778#endif
1779	jmp	LABEL(ashr_2_use_sse2)
1780
1781
1782/*
1783 * ashr_1 handles the following cases:
1784 * 	(16 + (src offset - dest offset)) % 16 = 1
1785 *
1786 * Based on above operation, start from (%r9 + rsi) to the left of this cache
1787 * bank, there is no null byte.
1788 */
/*
 * Structure: the entry zeroes the running index %rcx and selects one of two
 * equivalent copy loops according to the USE_SSSE3 bit of .memops_method.
 * Both loops are unrolled twice.  Each half inspects the aligned 16 source
 * bytes at 16(%rsi, %rcx) for a null, builds one destination chunk from the
 * upper 15 bytes at (%rsi, %rcx) followed by the lowest byte at
 * 16(%rsi, %rcx), stores it at (%rdi, %rcx) and advances %rcx by 16.
 * movdqa requires both addresses to be 16-byte aligned.
 *
 * The null test compares against %xmm0, so %xmm0 is assumed to be all zero
 * on entry (TODO confirm against the dispatch code); pcmpeqb leaves it all
 * zero again whenever no null byte is found.
 *
 * For strncpy, %r8 tracks the remaining count: a half leaves through
 * strncpy_truncation_unaligned when %r8 <= 16 and through unaligned_exit
 * when %r8 <= %r10.
 */
1789	.p2align 4
1790LABEL(ashr_1):
1791	xor	%ecx, %ecx				/* clear index */
1792#ifdef USE_AS_STRNCPY
1793	cmp	%r10, %r8
1794	jbe	LABEL(unaligned_exit)
1795#endif
1796	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
1797	jz	LABEL(ashr_1_use_sse2)
1798
1799	.p2align 4
1800LABEL(ashr_1_use_ssse3):
1801	movdqa	16(%rsi, %rcx), %xmm3	/* next aligned 16 source bytes */
1802	pcmpeqb	%xmm3, %xmm0	/* 0xff in every byte lane that holds a null */
1803	pmovmskb %xmm0, %edx	/* one mask bit per byte lane */
1804	test	%edx, %edx
1805	jnz	LABEL(unaligned_exit)	/* null found: finish in the exit tail */
1806#ifdef USE_AS_STRNCPY
1807	sub	$16, %r8
1808 	jbe	LABEL(strncpy_truncation_unaligned)
1809#endif
1810
	/* hand-encoded SSSE3 instruction; the final byte is the shift count */
1811	#palignr $1, (%rsi, %rcx), %xmm3
1812	.byte	0x66, 0x0F, 0x3A ,0x0F
1813	.byte	0x1c, 0x0e, 0x01
1814
1815	movdqa	%xmm3, (%rdi, %rcx)	/* aligned 16-byte store to the destination */
1816	add	$16, %rcx
1817
1818#ifdef USE_AS_STRNCPY
1819	cmp	%r10, %r8
1820	jbe	LABEL(unaligned_exit)
1821#endif
	/* second unrolled half: same sequence on the following chunk */
1822	movdqa	16(%rsi, %rcx), %xmm3
1823	pcmpeqb %xmm3, %xmm0
1824	pmovmskb %xmm0, %edx
1825	test	%edx, %edx
1826	jnz	LABEL(unaligned_exit)
1827#ifdef USE_AS_STRNCPY
1828	sub	$16, %r8
1829 	jbe	LABEL(strncpy_truncation_unaligned)
1830#endif
1831	#palignr $1, (%rsi, %rcx), %xmm3
1832	.byte	0x66, 0x0F, 0x3A ,0x0F
1833	.byte	0x1c, 0x0e, 0x01
1834
1835	movdqa	%xmm3, (%rdi, %rcx)
1836	add	$16, %rcx
1837#ifdef USE_AS_STRNCPY
1838	cmp	%r10, %r8
1839	jbe	LABEL(unaligned_exit)
1840#endif
1841	jmp	LABEL(ashr_1_use_ssse3)
1842
1843	.p2align 4
1844LABEL(ashr_1_use_sse2):
1845	pcmpeqb 16(%rsi, %rcx), %xmm0	/* look for a null in the next chunk */
1846	pmovmskb %xmm0, %edx
1847	test	%edx, %edx
1848	jnz	LABEL(unaligned_exit)
1849#ifdef USE_AS_STRNCPY
1850	sub	$16, %r8
1851 	jbe	LABEL(strncpy_truncation_unaligned)
1852#endif
1853	movdqa	16(%rsi, %rcx), %xmm3	/* next chunk */
1854	movdqa	(%rsi, %rcx), %xmm2	/* current chunk */
1855
1856	psrldq	$1, %xmm2	/* upper 15 bytes of current chunk -> lanes 0-14 */
1857	pslldq	$15, %xmm3	/* lowest byte of next chunk -> lane 15 */
1858	por	%xmm2, %xmm3
1859
1860	movdqa	%xmm3, (%rdi, %rcx)
1861	add	$16, %rcx
1862
1863#ifdef USE_AS_STRNCPY
1864	cmp	%r10, %r8
1865	jbe	LABEL(unaligned_exit)
1866#endif
	/* second unrolled half: same sequence on the following chunk */
1867	pcmpeqb 16(%rsi, %rcx), %xmm0
1868	pmovmskb %xmm0, %edx
1869	test	%edx, %edx
1870	jnz	LABEL(unaligned_exit)
1871#ifdef USE_AS_STRNCPY
1872	sub	$16, %r8
1873 	jbe	LABEL(strncpy_truncation_unaligned)
1874#endif
1875
1876	movdqa	16(%rsi, %rcx), %xmm3
1877	movdqa	(%rsi, %rcx), %xmm2
1878
1879	psrldq	$1, %xmm2
1880	pslldq	$15, %xmm3
1881	por	%xmm2, %xmm3
1882
1883	movdqa	%xmm3, (%rdi, %rcx)
1884	add	$16, %rcx
1885#ifdef USE_AS_STRNCPY
1886	cmp	%r10, %r8
1887	jbe	LABEL(unaligned_exit)
1888#endif
1889	jmp	LABEL(ashr_1_use_sse2)
1890
1891
1892	/*
1893	 * Exit tail code:
1894	 * Up to 32 bytes are copied in the case of strcpy.
1895	 */
1896	.p2align 4
/*
 * Chain of exit entry points; each label falls through into the next one,
 * so an earlier entry performs every fix-up of the later ones as well:
 *
 *   less32bytes      running index %rcx = 0, then as unaligned_exit
 *   unaligned_exit   %rsi += %r9, null mask %edx shifted left by %r10
 *   aligned_exit     %rdi += %rcx
 *   less16bytes      %rsi += %rcx
 *   aligned_16bytes  no pointer fix-up
 *
 * Afterwards the lowest set bit of %edx selects the tail routine that copies
 * the remaining bytes from (%rsi) to (%rdi).
 */
1897LABEL(less32bytes):
1898	xor	%ecx, %ecx
1899LABEL(unaligned_exit):
1900	add	%r9, %rsi		/* r9 holds offset of rsi */
1901	mov	%rcx, %r9	/* park the index; %cl is needed for the shift */
1902	mov	%r10, %rcx
1903	shl	%cl, %edx		/* after shl, calculate the exact number to be filled */
1904	mov	%r9, %rcx	/* restore the index */
1905	.p2align 4
1906LABEL(aligned_exit):
1907	add	%rcx, %rdi		/* locate exact address for rdi */
1908LABEL(less16bytes):
1909	add	%rcx, %rsi		/* locate exact address for rsi */
1910LABEL(aligned_16bytes):
1911#ifdef USE_AS_STRNCPY
1912	/*
1913	 * Null found in 16bytes checked. Set bit in bitmask corresponding to
1914	 * the strncpy count argument. We will copy to the null (inclusive)
1915	 * or count whichever comes first.
1916	 */
1917	mov	$1, %r9d
1918	lea	-1(%r8), %rcx	/* bit number = remaining count - 1 */
1919	shl	%cl, %r9d
1920	cmp	$32, %r8
1921	ja	LABEL(strncpy_tail)	/* count > 32: leave the mask unchanged */
1922	or	%r9d, %edx
1923LABEL(strncpy_tail):
1924#endif
1925	/*
1926	 * Check to see if BSF is fast on this processor. If not, use a
1927	 * different exit tail.
1928	 */
1929	testb	$USE_BSF, .memops_method(%rip)
1930	jz	LABEL(AMD_exit)
1931	bsf	%rdx, %rcx		/* Find byte with null char */
1932	lea	LABEL(tail_table)(%rip), %r11
1933	movslq	(%r11, %rcx, 4), %rcx	/* 32-bit offset relative to tail_table */
1934	lea	(%r11, %rcx), %rcx	/* absolute address of the tail routine */
1935	jmp	*%rcx
1936
1937#ifdef USE_AS_STRNCPY
1938	/*
1939	 * Count reached before null found.
1940	 */
	/*
	 * The three labels fall through into one another:
	 * less32bytes_strncpy_truncation zeroes the index %rcx,
	 * strncpy_truncation_unaligned adds %r9 to %rsi, and
	 * strncpy_truncation_aligned advances both pointers by %rcx.
	 * %r8 then gets 16 added back (the copy loops in this file subtract
	 * 16 from it right before branching here) and %r8 - 1 indexes
	 * tail_table, i.e. the routine that copies exactly %r8 bytes.  That
	 * routine's own count subtraction then yields zero, so it returns
	 * without null-filling.
	 */
1941	.p2align 4
1942LABEL(less32bytes_strncpy_truncation):
1943	xor	%ecx, %ecx
1944LABEL(strncpy_truncation_unaligned):
1945	add	%r9, %rsi		/* next src char to copy */
1946LABEL(strncpy_truncation_aligned):
1947	add	%rcx, %rdi
1948	add	%rcx, %rsi
1949	add	$16, %r8		/* compensation */
1950	lea	-1(%r8), %rcx	/* table index = bytes to copy - 1 */
1951	lea	LABEL(tail_table)(%rip), %r11
1952	movslq	(%r11, %rcx, 4), %rcx	/* 32-bit offset relative to tail_table */
1953	lea	(%r11, %rcx), %rcx
1954	jmp	*%rcx
1955
	/* strncpy called with a zero count: return the destination untouched */
1956	.p2align 4
1957LABEL(strncpy_exitz):
1958	mov	%rdi, %rax
1959	ret
1960#endif
1961
1962	.p2align 4
/*
 * Exit dispatch used when the USE_BSF bit of .memops_method is clear: find
 * the lowest set bit of the mask in %edx with a chain of bit tests instead
 * of bsf.  This first stage handles bits 0-7 (%dl); if they are all clear
 * it hands over to AMD_exit_more_8.  Bit 7 needs no test of its own: when
 * %dl is non-zero and bits 0-6 are clear, execution falls through to tail_7.
 */
1963LABEL(AMD_exit):
1964	test	%dl, %dl
1965	jz	LABEL(AMD_exit_more_8)
1966	test	$0x01, %dl
1967	jnz	LABEL(tail_0)
1968	test	$0x02, %dl
1969	jnz	LABEL(tail_1)
1970	test	$0x04, %dl
1971	jnz	LABEL(tail_2)
1972	test	$0x08, %dl
1973	jnz	LABEL(tail_3)
1974	test	$0x10, %dl
1975	jnz	LABEL(tail_4)
1976	test	$0x20, %dl
1977	jnz	LABEL(tail_5)
1978	test	$0x40, %dl
1979	jnz	LABEL(tail_6)
1980
1981	.p2align 4
1982LABEL(tail_7):				/* 8 bytes */
1983	mov	(%rsi), %rcx
1984	mov	%rcx, (%rdi)
1985#ifdef USE_AS_STRNCPY
1986	mov	$8, %cl	/* bytes just written, consumed by strncpy_fill_tail */
1987	sub	$8, %r8	/* count left after this copy */
1988	jnz	LABEL(strncpy_fill_tail)	/* non-zero: null-fill the rest */
1989#endif
1990	ret
1991
1992#ifdef USE_AS_STRNCPY
1993	/*
1994	 * Null terminated src string shorter than count. Fill the rest of the
1995	 * destination with null chars.
1996	 */
	/*
	 * In:	%rdi = start of the bytes just copied by a tail routine
	 *	%cl  = number of bytes that routine wrote
	 *	%r8  = number of bytes still to be zeroed (non-zero)
	 *	%rax = return value (saved in %rdx, restored before ret)
	 *
	 * The fill is done as %r8 / 8 quadword stores with rep stosq, which
	 * assumes the direction flag is clear as the ABI requires, followed
	 * by %r8 % 8 single-byte stores.
	 *
	 * The quadword count is computed on the full 64-bit register: %r8 is
	 * a 64-bit count, and shifting only %ecx would drop its upper half
	 * and leave the buffer under-filled for fill lengths of 4 GiB or
	 * more.
	 */
1997	.p2align 4
1998LABEL(strncpy_fill_tail):
1999	mov	%rax, %rdx	/* keep the return value safe */
2000	movzx	%cl, %rax
2001	mov	%r8, %rcx
2002	add	%rax, %rdi	/* first byte to be zeroed */
2003	xor	%eax, %eax	/* fill value */
2004	shr	$3, %rcx	/* number of whole quadwords (64-bit count) */
2005	jz	LABEL(strncpy_fill_less_8)
2006
2007	rep	stosq
2008LABEL(strncpy_fill_less_8):
2009	mov	%r8, %rcx
2010	and	$7, %rcx	/* 0-7 trailing bytes */
2011	jz	LABEL(strncpy_fill_return)
2012LABEL(strncpy_fill_less_7):
2013	sub	$1, %ecx
2014	mov	%al, (%rdi, %rcx)	/* mov leaves the flags from sub intact */
2015	jnz	LABEL(strncpy_fill_less_7)
2016LABEL(strncpy_fill_return):
2017	mov	%rdx, %rax
2018	ret
2019#endif
2020
2021	.p2align 4
/*
 * tail_N copies N + 1 bytes from (%rsi) to (%rdi) with at most two moves;
 * where the length is not a single operand size the second move overlaps
 * the first.  %rcx and %rdx are scratch.  For strncpy each routine then
 * loads the number of bytes written into %cl, subtracts it from the count
 * in %r8 and, if anything is left, jumps to strncpy_fill_tail to null-fill
 * the remainder.
 */
2022LABEL(tail_0):				/* 1 byte */
2023	mov	(%rsi), %cl
2024	mov	%cl, (%rdi)
2025#ifdef USE_AS_STRNCPY
2026	mov	$1, %cl
2027	sub	$1, %r8
2028	jnz	LABEL(strncpy_fill_tail)
2029#endif
2030	ret
2031
2032	.p2align 4
2033LABEL(tail_1):				/* 2 bytes */
2034	mov	(%rsi), %cx
2035	mov	%cx, (%rdi)
2036#ifdef USE_AS_STRNCPY
2037	mov	$2, %cl
2038	sub	$2, %r8
2039	jnz	LABEL(strncpy_fill_tail)
2040#endif
2041	ret
2042
2043	.p2align 4
2044LABEL(tail_2):				/* 3 bytes */
2045	mov	(%rsi), %cx
2046	mov	%cx, (%rdi)
2047	mov	1(%rsi), %cx	/* bytes 1-2, overlapping byte 1 */
2048	mov	%cx, 1(%rdi)
2049#ifdef USE_AS_STRNCPY
2050	mov	$3, %cl
2051	sub	$3, %r8
2052	jnz	LABEL(strncpy_fill_tail)
2053#endif
2054	ret
2055
2056	.p2align 4
2057LABEL(tail_3):				/* 4 bytes */
2058	mov	(%rsi), %ecx
2059	mov	%ecx, (%rdi)
2060#ifdef USE_AS_STRNCPY
2061	mov	$4, %cl
2062	sub	$4, %r8
2063	jnz	LABEL(strncpy_fill_tail)
2064#endif
2065	ret
2066
2067	.p2align 4
2068LABEL(tail_4):				/* 5 bytes */
2069	mov	(%rsi), %ecx
2070	mov	%ecx, (%rdi)
2071	mov	1(%rsi), %edx	/* bytes 1-4 */
2072	mov	%edx, 1(%rdi)
2073#ifdef USE_AS_STRNCPY
2074	mov	$5, %cl
2075	sub	$5, %r8
2076	jnz	LABEL(strncpy_fill_tail)
2077#endif
2078	ret
2079
2080	.p2align 4
2081LABEL(tail_5):				/* 6 bytes */
2082	mov	(%rsi), %ecx
2083	mov	%ecx, (%rdi)
2084	mov	2(%rsi), %edx	/* bytes 2-5 */
2085	mov	%edx, 2(%rdi)
2086#ifdef USE_AS_STRNCPY
2087	mov	$6, %cl
2088	sub	$6, %r8
2089	jnz	LABEL(strncpy_fill_tail)
2090#endif
2091	ret
2092
2093	.p2align 4
2094LABEL(tail_6):				/* 7 bytes */
2095	mov	(%rsi), %ecx
2096	mov	%ecx, (%rdi)
2097	mov	3(%rsi), %edx	/* bytes 3-6 */
2098	mov	%edx,3(%rdi)
2099#ifdef USE_AS_STRNCPY
2100	mov	$7, %cl
2101	sub	$7, %r8
2102	jnz	LABEL(strncpy_fill_tail)
2103#endif
2104	ret
2105
2106	.p2align 4
2107LABEL(tail_8):				/* 9 bytes */
2108	mov	(%rsi), %rcx
2109	mov	%rcx, (%rdi)
2110	mov	5(%rsi), %edx	/* bytes 5-8 */
2111	mov	%edx, 5(%rdi)
2112#ifdef USE_AS_STRNCPY
2113	mov	$9, %cl
2114	sub	$9, %r8
2115	jnz	LABEL(strncpy_fill_tail)
2116#endif
2117	ret
2118
2119	.p2align 4
/*
 * Second stage of the bit-test dispatch: bits 8-15 of the mask (%dh).  If
 * they are all clear it hands over to AMD_exit_more_16.  Bit 15 needs no
 * test of its own: when %dh is non-zero and bits 8-14 are clear, execution
 * falls through to tail_15.
 */
2120LABEL(AMD_exit_more_8):
2121	test	%dh, %dh
2122	jz	LABEL(AMD_exit_more_16)
2123	test	$0x01, %dh
2124	jnz	LABEL(tail_8)
2125	test	$0x02, %dh
2126	jnz	LABEL(tail_9)
2127	test	$0x04, %dh
2128	jnz	LABEL(tail_10)
2129	test	$0x08, %dh
2130	jnz	LABEL(tail_11)
2131	test	$0x10, %dh
2132	jnz	LABEL(tail_12)
2133	test	$0x20, %dh
2134	jnz	LABEL(tail_13)
2135	test	$0x40, %dh
2136	jnz	LABEL(tail_14)
2137
2138	.p2align 4
2139LABEL(tail_15):				/* 16 bytes */
2140	mov	(%rsi), %rcx
2141	mov	%rcx, (%rdi)
2142	mov	8(%rsi), %rdx
2143	mov	%rdx, 8(%rdi)
2144#ifdef USE_AS_STRNCPY
2145	mov	$16, %cl	/* bytes just written, consumed by strncpy_fill_tail */
2146	sub	$16, %r8	/* count left after this copy */
2147	jnz	LABEL(strncpy_fill_tail)	/* non-zero: null-fill the rest */
2148#endif
2149	ret
2150
2151	.p2align 4
/*
 * tail_9 .. tail_14 copy 10 .. 15 bytes from (%rsi) to (%rdi): one 8-byte
 * move for bytes 0-7 plus a 4- or 8-byte move that ends on the last byte and
 * overlaps the first where necessary.  %rcx and %rdx are scratch.  For
 * strncpy each routine then loads the number of bytes written into %cl,
 * subtracts it from the count in %r8 and, if anything is left, jumps to
 * strncpy_fill_tail to null-fill the remainder.
 */
2152LABEL(tail_9):				/* 10 bytes */
2153	mov	(%rsi), %rcx
2154	mov	%rcx, (%rdi)
2155	mov	6(%rsi), %edx	/* bytes 6-9 */
2156	mov	%edx, 6(%rdi)
2157#ifdef USE_AS_STRNCPY
2158	mov	$10, %cl
2159	sub	$10, %r8
2160	jnz	LABEL(strncpy_fill_tail)
2161#endif
2162	ret
2163
2164	.p2align 4
2165LABEL(tail_10):				/* 11 bytes */
2166	mov	(%rsi), %rcx
2167	mov	%rcx, (%rdi)
2168	mov	7(%rsi), %edx	/* bytes 7-10 */
2169	mov	%edx, 7(%rdi)
2170#ifdef USE_AS_STRNCPY
2171	mov	$11, %cl
2172	sub	$11, %r8
2173	jnz	LABEL(strncpy_fill_tail)
2174#endif
2175	ret
2176
2177	.p2align 4
2178LABEL(tail_11):				/* 12 bytes */
2179	mov	(%rsi), %rcx
2180	mov	%rcx, (%rdi)
2181	mov	8(%rsi), %edx	/* bytes 8-11 */
2182	mov	%edx, 8(%rdi)
2183#ifdef USE_AS_STRNCPY
2184	mov	$12, %cl
2185	sub	$12, %r8
2186	jnz	LABEL(strncpy_fill_tail)
2187#endif
2188	ret
2189
2190	.p2align 4
2191LABEL(tail_12):				/* 13 bytes */
2192	mov	(%rsi), %rcx
2193	mov	%rcx, (%rdi)
2194	mov	5(%rsi), %rcx	/* bytes 5-12 */
2195	mov	%rcx, 5(%rdi)
2196#ifdef USE_AS_STRNCPY
2197	mov	$13, %cl
2198	sub	$13, %r8
2199	jnz	LABEL(strncpy_fill_tail)
2200#endif
2201	ret
2202
2203	.p2align 4
2204LABEL(tail_13):				/* 14 bytes */
2205	mov	(%rsi), %rcx
2206	mov	%rcx, (%rdi)
2207	mov	6(%rsi), %rcx	/* bytes 6-13 */
2208	mov	%rcx, 6(%rdi)
2209#ifdef USE_AS_STRNCPY
2210	mov	$14, %cl
2211	sub	$14, %r8
2212	jnz	LABEL(strncpy_fill_tail)
2213#endif
2214	ret
2215
2216	.p2align 4
2217LABEL(tail_14):				/* 15 bytes */
2218	mov	(%rsi), %rcx
2219	mov	%rcx, (%rdi)
2220	mov	7(%rsi), %rcx	/* bytes 7-14 */
2221	mov	%rcx, 7(%rdi)
2222#ifdef USE_AS_STRNCPY
2223	mov	$15, %cl
2224	sub	$15, %r8
2225	jnz	LABEL(strncpy_fill_tail)
2226#endif
2227	ret
2228
2229	.p2align 4
/*
 * Third stage of the bit-test dispatch: the mask is shifted right by 16 so
 * that bits 16-23 can be tested in %dl (and bits 24-31 later in %dh).  If
 * %dl is zero it hands over to AMD_exit_more_24.  Bit 23 needs no test of
 * its own: when %dl is non-zero and its bits 0-6 are clear, execution falls
 * through to tail_23.
 */
2230LABEL(AMD_exit_more_16):
2231	shr	$16, %edx
2232	test	%dl, %dl
2233	jz	LABEL(AMD_exit_more_24)
2234	test	$0x01, %dl
2235	jnz	LABEL(tail_16)
2236	test	$0x02, %dl
2237	jnz	LABEL(tail_17)
2238	test	$0x04, %dl
2239	jnz	LABEL(tail_18)
2240	test	$0x08, %dl
2241	jnz	LABEL(tail_19)
2242	test	$0x10, %dl
2243	jnz	LABEL(tail_20)
2244	test	$0x20, %dl
2245	jnz	LABEL(tail_21)
2246	test	$0x40, %dl
2247	jnz	LABEL(tail_22)
2248
2249	.p2align 4
2250LABEL(tail_23):				/* 24 bytes */
2251	mov	(%rsi), %rcx
2252	mov	%rcx, (%rdi)
2253	mov	8(%rsi), %rdx
2254	mov	%rdx, 8(%rdi)
2255	mov	16(%rsi), %rcx
2256	mov	%rcx, 16(%rdi)
2257#ifdef USE_AS_STRNCPY
2258	mov	$24, %cl	/* bytes just written, consumed by strncpy_fill_tail */
2259	sub	$24, %r8	/* count left after this copy */
2260	jnz	LABEL(strncpy_fill_tail)	/* non-zero: null-fill the rest */
2261#endif
2262	ret
2263
2264	.p2align 4
/*
 * tail_16 .. tail_22 copy 17 .. 23 bytes from (%rsi) to (%rdi): two 8-byte
 * moves for bytes 0-15 plus one more move that ends on the last byte and
 * overlaps the earlier ones where necessary.  %rcx and %rdx are scratch.
 * For strncpy each routine then loads the number of bytes written into %cl,
 * subtracts it from the count in %r8 and, if anything is left, jumps to
 * strncpy_fill_tail to null-fill the remainder.
 */
2265LABEL(tail_16):				/* 17 bytes */
2266	mov	(%rsi), %rcx
2267	mov	%rcx, (%rdi)
2268	mov	8(%rsi), %rdx
2269	mov	%rdx, 8(%rdi)
2270	mov	16(%rsi), %cl	/* byte 16 */
2271	mov	%cl, 16(%rdi)
2272#ifdef USE_AS_STRNCPY
2273	mov	$17, %cl
2274	sub	$17, %r8
2275	jnz	LABEL(strncpy_fill_tail)
2276#endif
2277	ret
2278
2279	.p2align 4
2280LABEL(tail_17):				/* 18 bytes */
2281	mov	(%rsi), %rcx
2282	mov	%rcx, (%rdi)
2283	mov	8(%rsi), %rdx
2284	mov	%rdx, 8(%rdi)
2285	mov	16(%rsi), %cx	/* bytes 16-17 */
2286	mov	%cx, 16(%rdi)
2287#ifdef USE_AS_STRNCPY
2288	mov	$18, %cl
2289	sub	$18, %r8
2290	jnz	LABEL(strncpy_fill_tail)
2291#endif
2292	ret
2293
2294	.p2align 4
2295LABEL(tail_18):				/* 19 bytes */
2296	mov	(%rsi), %rcx
2297	mov	%rcx, (%rdi)
2298	mov	8(%rsi), %rdx
2299	mov	%rdx, 8(%rdi)
2300	mov	15(%rsi), %ecx	/* bytes 15-18 */
2301	mov	%ecx,15(%rdi)
2302#ifdef USE_AS_STRNCPY
2303	mov	$19, %cl
2304	sub	$19, %r8
2305	jnz	LABEL(strncpy_fill_tail)
2306#endif
2307	ret
2308
2309	.p2align 4
2310LABEL(tail_19):				/* 20 bytes */
2311	mov	(%rsi), %rcx
2312	mov	%rcx, (%rdi)
2313	mov	8(%rsi), %rdx
2314	mov	%rdx, 8(%rdi)
2315	mov	16(%rsi), %ecx	/* bytes 16-19 */
2316	mov	%ecx, 16(%rdi)
2317#ifdef USE_AS_STRNCPY
2318	mov	$20, %cl
2319	sub	$20, %r8
2320	jnz	LABEL(strncpy_fill_tail)
2321#endif
2322	ret
2323
2324	.p2align 4
2325LABEL(tail_20):				/* 21 bytes */
2326	mov	(%rsi), %rcx
2327	mov	%rcx, (%rdi)
2328	mov	8(%rsi), %rdx
2329	mov	%rdx, 8(%rdi)
2330	mov	13(%rsi), %rcx	/* bytes 13-20 */
2331	mov	%rcx, 13(%rdi)
2332#ifdef USE_AS_STRNCPY
2333	mov	$21, %cl
2334	sub	$21, %r8
2335	jnz	LABEL(strncpy_fill_tail)
2336#endif
2337	ret
2338
2339	.p2align 4
2340LABEL(tail_21):				/* 22 bytes */
2341	mov	(%rsi), %rcx
2342	mov	%rcx, (%rdi)
2343	mov	8(%rsi), %rdx
2344	mov	%rdx, 8(%rdi)
2345	mov	14(%rsi), %rcx	/* bytes 14-21 */
2346	mov	%rcx, 14(%rdi)
2347#ifdef USE_AS_STRNCPY
2348	mov	$22, %cl
2349	sub	$22, %r8
2350	jnz	LABEL(strncpy_fill_tail)
2351#endif
2352	ret
2353
2354	.p2align 4
2355LABEL(tail_22):				/* 23 bytes */
2356	mov	(%rsi), %rcx
2357	mov	%rcx, (%rdi)
2358	mov	8(%rsi), %rdx
2359	mov	%rdx, 8(%rdi)
2360	mov	15(%rsi), %rcx	/* bytes 15-22 */
2361	mov	%rcx, 15(%rdi)
2362#ifdef USE_AS_STRNCPY
2363	mov	$23, %cl
2364	sub	$23, %r8
2365	jnz	LABEL(strncpy_fill_tail)
2366#endif
2367	ret
2368
2369	.p2align 4
/*
 * Last stage of the bit-test dispatch.  It is reached from AMD_exit_more_16
 * after the mask was shifted right by 16, so %dh now holds original bits
 * 24-31.  Bits 24-30 are tested one by one; if none of them is set,
 * execution falls through to tail_31 without a further test.
 */
2370LABEL(AMD_exit_more_24):
2371	test	$0x01, %dh
2372	jnz	LABEL(tail_24)
2373	test	$0x02, %dh
2374	jnz	LABEL(tail_25)
2375	test	$0x04, %dh
2376	jnz	LABEL(tail_26)
2377	test	$0x08, %dh
2378	jnz	LABEL(tail_27)
2379	test	$0x10, %dh
2380	jnz	LABEL(tail_28)
2381	test	$0x20, %dh
2382	jnz	LABEL(tail_29)
2383	test	$0x40, %dh
2384	jnz	LABEL(tail_30)
2385
2386	.p2align 4
2387LABEL(tail_31):				/* 32 bytes */
2388	mov	(%rsi), %rcx
2389	mov	%rcx, (%rdi)
2390	mov	8(%rsi), %rdx
2391	mov	%rdx, 8(%rdi)
2392	mov	16(%rsi), %rcx
2393	mov	%rcx, 16(%rdi)
2394	mov	24(%rsi), %rdx
2395	mov	%rdx, 24(%rdi)
2396#ifdef USE_AS_STRNCPY
2397	mov	$32, %cl	/* bytes just written, consumed by strncpy_fill_tail */
2398	sub	$32, %r8	/* count left after this copy */
2399	jnz	LABEL(strncpy_fill_tail)	/* non-zero: null-fill the rest */
2400#endif
2401	ret
2402
2403	.p2align 4
/*
 * tail_24 .. tail_30 copy 25 .. 31 bytes from (%rsi) to (%rdi): three 8-byte
 * moves for bytes 0-23 plus a 4- or 8-byte move that ends on the last byte
 * and overlaps the earlier ones where necessary.  %rcx and %rdx are scratch.
 * For strncpy each routine then loads the number of bytes written into %cl,
 * subtracts it from the count in %r8 and, if anything is left, jumps to
 * strncpy_fill_tail to null-fill the remainder.
 */
2404LABEL(tail_24):				/* 25 bytes */
2405	mov	(%rsi), %rcx
2406	mov	%rcx, (%rdi)
2407	mov	8(%rsi), %rdx
2408	mov	%rdx, 8(%rdi)
2409	mov	16(%rsi), %rcx
2410	mov	%rcx, 16(%rdi)
2411	mov	21(%rsi), %edx	/* bytes 21-24 */
2412	mov	%edx, 21(%rdi)
2413#ifdef USE_AS_STRNCPY
2414	mov	$25, %cl
2415	sub	$25, %r8
2416	jnz	LABEL(strncpy_fill_tail)
2417#endif
2418	ret
2419
2420	.p2align 4
2421LABEL(tail_25):				/* 26 bytes */
2422	mov	(%rsi), %rcx
2423	mov	%rcx, (%rdi)
2424	mov	8(%rsi), %rdx
2425	mov	%rdx, 8(%rdi)
2426	mov	16(%rsi), %rcx
2427	mov	%rcx, 16(%rdi)
2428	mov	22(%rsi), %edx	/* bytes 22-25 */
2429	mov	%edx, 22(%rdi)
2430#ifdef USE_AS_STRNCPY
2431	mov	$26, %cl
2432	sub	$26, %r8
2433	jnz	LABEL(strncpy_fill_tail)
2434#endif
2435	ret
2436
2437	.p2align 4
2438LABEL(tail_26):				/* 27 bytes */
2439	mov	(%rsi), %rcx
2440	mov	%rcx, (%rdi)
2441	mov	8(%rsi), %rdx
2442	mov	%rdx, 8(%rdi)
2443	mov	16(%rsi), %rcx
2444	mov	%rcx, 16(%rdi)
2445	mov	23(%rsi), %edx	/* bytes 23-26 */
2446	mov	%edx, 23(%rdi)
2447#ifdef USE_AS_STRNCPY
2448	mov	$27, %cl
2449	sub	$27, %r8
2450	jnz	LABEL(strncpy_fill_tail)
2451#endif
2452	ret
2453
2454	.p2align 4
2455LABEL(tail_27):				/* 28 bytes */
2456	mov	(%rsi), %rcx
2457	mov	%rcx, (%rdi)
2458	mov	8(%rsi), %rdx
2459	mov	%rdx, 8(%rdi)
2460	mov	16(%rsi), %rcx
2461	mov	%rcx, 16(%rdi)
2462	mov	24(%rsi), %edx	/* bytes 24-27 */
2463	mov	%edx, 24(%rdi)
2464#ifdef USE_AS_STRNCPY
2465	mov	$28, %cl
2466	sub	$28, %r8
2467	jnz	LABEL(strncpy_fill_tail)
2468#endif
2469	ret
2470
2471	.p2align 4
2472LABEL(tail_28):				/* 29 bytes */
2473	mov	(%rsi), %rcx
2474	mov	%rcx, (%rdi)
2475	mov	8(%rsi), %rdx
2476	mov	%rdx, 8(%rdi)
2477	mov	16(%rsi), %rcx
2478	mov	%rcx, 16(%rdi)
2479	mov	21(%rsi), %rdx	/* bytes 21-28 */
2480	mov	%rdx, 21(%rdi)
2481#ifdef USE_AS_STRNCPY
2482	mov	$29, %cl
2483	sub	$29, %r8
2484	jnz	LABEL(strncpy_fill_tail)
2485#endif
2486	ret
2487
2488	.p2align 4
2489LABEL(tail_29):				/* 30 bytes */
2490	mov	(%rsi), %rcx
2491	mov	%rcx, (%rdi)
2492	mov	8(%rsi), %rdx
2493	mov	%rdx, 8(%rdi)
2494	mov	16(%rsi), %rcx
2495	mov	%rcx, 16(%rdi)
2496	mov	22(%rsi), %rdx	/* bytes 22-29 */
2497	mov	%rdx, 22(%rdi)
2498#ifdef USE_AS_STRNCPY
2499	mov	$30, %cl
2500	sub	$30, %r8
2501	jnz	LABEL(strncpy_fill_tail)
2502#endif
2503	ret
2504
2505	.p2align 4
2506LABEL(tail_30):				/* 31 bytes */
2507	mov	(%rsi), %rcx
2508	mov	%rcx, (%rdi)
2509	mov	8(%rsi), %rdx
2510	mov	%rdx, 8(%rdi)
2511	mov	16(%rsi), %rcx
2512	mov	%rcx, 16(%rdi)
2513	mov	23(%rsi), %rdx	/* bytes 23-30 */
2514	mov	%rdx, 23(%rdi)
2515#ifdef USE_AS_STRNCPY
2516	mov	$31, %cl
2517	sub	$31, %r8
2518	jnz	LABEL(strncpy_fill_tail)
2519#endif
2520	ret
2521
2522	.pushsection .rodata
2523	.p2align 4
/*
 * Jump table for the exit tail.  Entry N is the 32-bit offset of tail_N
 * relative to the start of the table; the dispatch code sign-extends the
 * entry and adds the table address back.  N is the zero-based position of
 * the last byte to copy, so entry N leads to the routine that copies N + 1
 * bytes.
 */
2524LABEL(tail_table):
2525	.int	LABEL(tail_0) - LABEL(tail_table)	/* 1 byte */
2526	.int	LABEL(tail_1) - LABEL(tail_table)
2527	.int	LABEL(tail_2) - LABEL(tail_table)
2528	.int	LABEL(tail_3) - LABEL(tail_table)
2529	.int	LABEL(tail_4) - LABEL(tail_table)
2530	.int	LABEL(tail_5) - LABEL(tail_table)
2531	.int	LABEL(tail_6) - LABEL(tail_table)
2532	.int	LABEL(tail_7) - LABEL(tail_table)
2533	.int	LABEL(tail_8) - LABEL(tail_table)
2534	.int	LABEL(tail_9) - LABEL(tail_table)
2535	.int	LABEL(tail_10) - LABEL(tail_table)
2536	.int	LABEL(tail_11) - LABEL(tail_table)
2537	.int	LABEL(tail_12) - LABEL(tail_table)
2538	.int	LABEL(tail_13) - LABEL(tail_table)
2539	.int	LABEL(tail_14) - LABEL(tail_table)
2540	.int	LABEL(tail_15) - LABEL(tail_table)
2541	.int	LABEL(tail_16) - LABEL(tail_table)
2542	.int	LABEL(tail_17) - LABEL(tail_table)
2543	.int	LABEL(tail_18) - LABEL(tail_table)
2544	.int	LABEL(tail_19) - LABEL(tail_table)
2545	.int	LABEL(tail_20) - LABEL(tail_table)
2546	.int	LABEL(tail_21) - LABEL(tail_table)
2547	.int	LABEL(tail_22) - LABEL(tail_table)
2548	.int	LABEL(tail_23) - LABEL(tail_table)
2549	.int	LABEL(tail_24) - LABEL(tail_table)
2550	.int	LABEL(tail_25) - LABEL(tail_table)
2551	.int	LABEL(tail_26) - LABEL(tail_table)
2552	.int	LABEL(tail_27) - LABEL(tail_table)
2553	.int	LABEL(tail_28) - LABEL(tail_table)
2554	.int	LABEL(tail_29) - LABEL(tail_table)
2555	.int	LABEL(tail_30) - LABEL(tail_table)
2556	.int	LABEL(tail_31) - LABEL(tail_table)	/* 32 bytes */
2557
/*
 * Jump table for the sixteen ashr_N copy cases, again stored as 32-bit
 * offsets relative to the start of the table.  The code that indexes it is
 * not part of this section; presumably the index is
 * (16 + (src offset - dest offset)) % 16, as the ashr_N header comments
 * describe -- TODO confirm against the dispatch code.
 */
2558	.p2align 4
2559LABEL(unaligned_table):
2560	.int	LABEL(ashr_0) - LABEL(unaligned_table)
2561	.int	LABEL(ashr_1) - LABEL(unaligned_table)
2562	.int	LABEL(ashr_2) - LABEL(unaligned_table)
2563	.int	LABEL(ashr_3) - LABEL(unaligned_table)
2564	.int	LABEL(ashr_4) - LABEL(unaligned_table)
2565	.int	LABEL(ashr_5) - LABEL(unaligned_table)
2566	.int	LABEL(ashr_6) - LABEL(unaligned_table)
2567	.int	LABEL(ashr_7) - LABEL(unaligned_table)
2568	.int	LABEL(ashr_8) - LABEL(unaligned_table)
2569	.int	LABEL(ashr_9) - LABEL(unaligned_table)
2570	.int	LABEL(ashr_10) - LABEL(unaligned_table)
2571	.int	LABEL(ashr_11) - LABEL(unaligned_table)
2572	.int	LABEL(ashr_12) - LABEL(unaligned_table)
2573	.int	LABEL(ashr_13) - LABEL(unaligned_table)
2574	.int	LABEL(ashr_14) - LABEL(unaligned_table)
2575	.int	LABEL(ashr_15) - LABEL(unaligned_table)
2576	.popsection
2577
2578#ifdef USE_AS_STRNCPY
2579	SET_SIZE(strncpy)
2580#else
2581	SET_SIZE(strcpy)			/* (char *, const char *) */
2582#endif
2583