xref: /freebsd/contrib/bionic-x86_64-string/sse2-strcpy-slm.S (revision e64fe029e9d3ce476e77a478318e0c3cd201ff08)
1/*
2Copyright (c) 2014, Intel Corporation
3All rights reserved.
4
5Redistribution and use in source and binary forms, with or without
6modification, are permitted provided that the following conditions are met:
7
8    * Redistributions of source code must retain the above copyright notice,
9    * this list of conditions and the following disclaimer.
10
11    * Redistributions in binary form must reproduce the above copyright notice,
12    * this list of conditions and the following disclaimer in the documentation
13    * and/or other materials provided with the distribution.
14
15    * Neither the name of Intel Corporation nor the names of its contributors
16    * may be used to endorse or promote products derived from this software
17    * without specific prior written permission.
18
19THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
20ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
23ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
24(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
26ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29*/
30
/* Fallback helper macros.  The whole group is skipped when USE_AS_STRCAT
   is defined, and each macro is only defined here if nothing earlier has
   already defined it.  */
31#ifndef USE_AS_STRCAT
32
/* STRCPY: symbol name given to the routine's entry point (strcpy unless
   overridden).  */
33# ifndef STRCPY
34#  define STRCPY	strcpy
35# endif
36
/* L(label): pastes ".L" in front of the label name.  */
37# ifndef L
38#  define L(label)	.L##label
39# endif
40
/* cfi_startproc / cfi_endproc: map to the assembler's .cfi_* directives.  */
41# ifndef cfi_startproc
42#  define cfi_startproc	.cfi_startproc
43# endif
44
45# ifndef cfi_endproc
46#  define cfi_endproc	.cfi_endproc
47# endif
48
/* ENTRY(name): marks name as a global function symbol, aligns it with
   .p2align 4, emits the label and opens the CFI region.  */
49# ifndef ENTRY
50#  define ENTRY(name)	\
51	.type name, @function;	\
52	.globl name;	\
53	.p2align 4;	\
54name:	\
55	cfi_startproc
56# endif
57
/* END(name): closes the CFI region and sets the symbol size to the
   distance from name to the current location.  */
58# ifndef END
59#  define END(name)	\
60	cfi_endproc;	\
61	.size name, .-name
62# endif
63
64#endif
65
/* JMPTBL(I, B): expands to I - B, i.e. the offset of label I relative to
   base B.  */
66#define JMPTBL(I, B)	I - B
/* BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE): indirect jump through a
   table of 32-bit signed offsets.  Loads the table address RIP-relatively
   into %r11, sign-extends the entry at TABLE + INDEX * SCALE into %rcx,
   adds the table address back and jumps there.
   Clobbers %r11 and %rcx.  */
67#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)	\
68	lea	TABLE(%rip), %r11;	\
69	movslq	(%r11, INDEX, SCALE), %rcx;	\
70	lea	(%r11, %rcx), %rcx;	\
71	jmp	*%rcx
72
/* Function entry and probe of the first 32 source bytes.
   The prologue between #ifndef USE_AS_STRCAT and its #endif is only
   assembled when USE_AS_STRCAT is not defined; the code after that #endif
   is assembled in every variant.
   Registers used here: %rdi = destination, %rsi = source, %rdx = length
   argument (read only under USE_AS_STRNCPY, copied to %r8).
   Unless USE_AS_STPCPY is defined, %rax is set to the original %rdi.  */
73#ifndef USE_AS_STRCAT
74
75# define RETURN ret
76
77.text
78ENTRY (STRCPY)
79# ifdef USE_AS_STRNCPY
	/* %r8 = length argument; a zero length goes straight to L(ExitZero).  */
80	mov	%rdx, %r8
81	test	%r8, %r8
82	jz	L(ExitZero)
83# endif
84	mov	%rsi, %rcx
85# ifndef USE_AS_STPCPY
86	mov	%rdi, %rax      /* save result */
87# endif
88
89#endif
	/* %rcx = source address modulo 64.  An offset of at most 32 is handled
	   by L(SourceStringAlignmentLess32).  */
90	and	$63, %rcx
91	cmp	$32, %rcx
92	jbe	L(SourceStringAlignmentLess32)
93
	/* Round the source down to a 16-byte boundary; %rcx keeps the number
	   of bytes that rounding skipped (0..15).  */
94	and	$-16, %rsi
95	and	$15, %rcx
96	pxor	%xmm0, %xmm0
97	pxor	%xmm1, %xmm1
98
	/* Compare the aligned 16 bytes against zero and build a byte mask in
	   %rdx; the shift drops the mask bits of bytes located before the real
	   start of the string.  */
99	pcmpeqb	(%rsi), %xmm1
100	pmovmskb %xmm1, %rdx
101	shr	%cl, %rdx
102#ifdef USE_AS_STRNCPY
	/* %r10 = 16 - %rcx (17 - %rcx for plain strncpy); if %r8 does not
	   exceed it, the length limit falls inside this first chunk.  */
103# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
104	mov	$16, %r10
105	sub	%rcx, %r10
106	cmp	%r10, %r8
107# else
108	mov	$17, %r10
109	sub	%rcx, %r10
110	cmp	%r10, %r8
111# endif
112	jbe	L(CopyFrom1To16BytesTailCase2OrCase3)
113#endif
	/* A set mask bit means a NUL byte was found in the first chunk.  */
114	test	%rdx, %rdx
115	jnz	L(CopyFrom1To16BytesTail)
116
	/* Same probe for the next aligned 16 bytes.  %xmm0 is reused as the
	   compare destination; it is all-zero again whenever the resulting
	   mask is zero.  */
117	pcmpeqb	16(%rsi), %xmm0
118	pmovmskb %xmm0, %rdx
119#ifdef USE_AS_STRNCPY
120	add	$16, %r10
121	cmp	%r10, %r8
122	jbe	L(CopyFrom1To32BytesCase2OrCase3)
123#endif
124	test	%rdx, %rdx
125	jnz	L(CopyFrom1To32Bytes)
126
	/* No NUL so far: copy the first 16 bytes starting at the original
	   (unrounded) source address, then fall through to the code that
	   follows.  */
127	movdqu	(%rsi, %rcx), %xmm1   /* copy 16 bytes */
128	movdqu	%xmm1, (%rdi)
129
130/* If source address alignment != destination address alignment */
131	.p2align 4
/* Unalign16Both: unrolled copy of 16-byte chunks using aligned loads from
   the source and unaligned stores to the destination.
   Both paths that reach this label in this file have rounded %rsi down to
   a 16-byte boundary and left the skipped byte count in %rcx.
   %xmm0 is all-zero on arrival and serves as the compare destination: it
   is all-zero again after every pcmpeqb whose mask in %rdx tested as zero.
   Each step loads the next chunk, stores the previously checked chunk,
   tests the new chunk for a NUL byte and advances the index %rcx by 16.
   Under USE_AS_STRNCPY, %r8 is decremented per step and jbe leaves the
   sequence once it has dropped to zero or below (unsigned borrow).  */
132L(Unalign16Both):
	/* Bias %rdi by the same amount %rsi was rounded down, so one index
	   register addresses matching bytes of both buffers.  */
133	sub	%rcx, %rdi
134#ifdef USE_AS_STRNCPY
135	add	%rcx, %r8
136#endif
137	mov	$16, %rcx
138	movdqa	(%rsi, %rcx), %xmm1
139	movaps	16(%rsi, %rcx), %xmm2
140	movdqu	%xmm1, (%rdi, %rcx)
141	pcmpeqb	%xmm2, %xmm0
142	pmovmskb %xmm0, %rdx
143	add	$16, %rcx
144#ifdef USE_AS_STRNCPY
145	sub	$48, %r8
146	jbe	L(CopyFrom1To16BytesCase2OrCase3)
147#endif
148	test	%rdx, %rdx
149#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
150	jnz	L(CopyFrom1To16BytesUnalignedXmm2)
151#else
152	jnz	L(CopyFrom1To16Bytes)
153#endif
154
155	movaps	16(%rsi, %rcx), %xmm3
156	movdqu	%xmm2, (%rdi, %rcx)
157	pcmpeqb	%xmm3, %xmm0
158	pmovmskb %xmm0, %rdx
159	add	$16, %rcx
160#ifdef USE_AS_STRNCPY
161	sub	$16, %r8
162	jbe	L(CopyFrom1To16BytesCase2OrCase3)
163#endif
164	test	%rdx, %rdx
165#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
166	jnz	L(CopyFrom1To16BytesUnalignedXmm3)
167#else
168	jnz	L(CopyFrom1To16Bytes)
169#endif
170
171	movaps	16(%rsi, %rcx), %xmm4
172	movdqu	%xmm3, (%rdi, %rcx)
173	pcmpeqb	%xmm4, %xmm0
174	pmovmskb %xmm0, %rdx
175	add	$16, %rcx
176#ifdef USE_AS_STRNCPY
177	sub	$16, %r8
178	jbe	L(CopyFrom1To16BytesCase2OrCase3)
179#endif
180	test	%rdx, %rdx
181#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
182	jnz	L(CopyFrom1To16BytesUnalignedXmm4)
183#else
184	jnz	L(CopyFrom1To16Bytes)
185#endif
186
187	movaps	16(%rsi, %rcx), %xmm1
188	movdqu	%xmm4, (%rdi, %rcx)
189	pcmpeqb	%xmm1, %xmm0
190	pmovmskb %xmm0, %rdx
191	add	$16, %rcx
192#ifdef USE_AS_STRNCPY
193	sub	$16, %r8
194	jbe	L(CopyFrom1To16BytesCase2OrCase3)
195#endif
196	test	%rdx, %rdx
197#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
198	jnz	L(CopyFrom1To16BytesUnalignedXmm1)
199#else
200	jnz	L(CopyFrom1To16Bytes)
201#endif
202
203	movaps	16(%rsi, %rcx), %xmm2
204	movdqu	%xmm1, (%rdi, %rcx)
205	pcmpeqb	%xmm2, %xmm0
206	pmovmskb %xmm0, %rdx
207	add	$16, %rcx
208#ifdef USE_AS_STRNCPY
209	sub	$16, %r8
210	jbe	L(CopyFrom1To16BytesCase2OrCase3)
211#endif
212	test	%rdx, %rdx
213#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
214	jnz	L(CopyFrom1To16BytesUnalignedXmm2)
215#else
216	jnz	L(CopyFrom1To16Bytes)
217#endif
218
219	movaps	16(%rsi, %rcx), %xmm3
220	movdqu	%xmm2, (%rdi, %rcx)
221	pcmpeqb	%xmm3, %xmm0
222	pmovmskb %xmm0, %rdx
223	add	$16, %rcx
224#ifdef USE_AS_STRNCPY
225	sub	$16, %r8
226	jbe	L(CopyFrom1To16BytesCase2OrCase3)
227#endif
228	test	%rdx, %rdx
229#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
230	jnz	L(CopyFrom1To16BytesUnalignedXmm3)
231#else
232	jnz	L(CopyFrom1To16Bytes)
233#endif
234
	/* Store the last checked chunk, then set up the 64-byte loop below:
	   %rsi becomes (%rsi + %rcx + 16) rounded down to a multiple of 64,
	   %rdx = old %rsi - new %rsi, and %rdi is moved by the same distance
	   as %rsi.  Under USE_AS_STRNCPY %r8 becomes %r8 + %rdx + 128.  */
235	movdqu	%xmm3, (%rdi, %rcx)
236	mov	%rsi, %rdx
237	lea	16(%rsi, %rcx), %rsi
238	and	$-0x40, %rsi
239	sub	%rsi, %rdx
240	sub	%rdx, %rdi
241#ifdef USE_AS_STRNCPY
242	lea	128(%r8, %rdx), %r8
243#endif
/* Unaligned64Loop: first probe of a 64-byte source block.
   Loads four aligned 16-byte chunks (kept in %xmm4, %xmm5, %xmm6, %xmm7)
   and reduces them with pminub into %xmm3: a zero byte in any chunk gives
   a zero byte in the minimum.  The minimum is compared with %xmm0 (used
   as the all-zero operand) and the byte mask lands in %rdx.
   Under USE_AS_STRNCPY, %r8 is reduced by 64 and jbe leaves the loop when
   it drops to zero or below.  A nonzero mask goes to
   L(Unaligned64Leave); otherwise execution falls through.  */
244L(Unaligned64Loop):
245	movaps	(%rsi), %xmm2
246	movaps	%xmm2, %xmm4
247	movaps	16(%rsi), %xmm5
248	movaps	32(%rsi), %xmm3
249	movaps	%xmm3, %xmm6
250	movaps	48(%rsi), %xmm7
251	pminub	%xmm5, %xmm2
252	pminub	%xmm7, %xmm3
253	pminub	%xmm2, %xmm3
254	pcmpeqb	%xmm0, %xmm3
255	pmovmskb %xmm3, %rdx
256#ifdef USE_AS_STRNCPY
257	sub	$64, %r8
258	jbe	L(UnalignedLeaveCase2OrCase3)
259#endif
260	test	%rdx, %rdx
261	jnz	L(Unaligned64Leave)
262
/* Unaligned64Loop_start: steady-state 64-byte loop.
   Advances both pointers by 64, stores the four chunks checked in the
   previous pass (%xmm4, %xmm5, %xmm6, %xmm7) with unaligned stores at
   -64..-16(%rdi), and interleaves those stores with the aligned loads of
   the next four chunks into the same registers.  The pminub / pcmpeqb /
   pmovmskb sequence then leaves in %rdx a mask that is nonzero when the
   new 64 bytes contain a zero byte.
   Under USE_AS_STRNCPY, %r8 is reduced by 64 and jbe leaves the loop when
   it drops to zero or below.  Loops while the mask is zero, otherwise
   falls through.  */
263L(Unaligned64Loop_start):
264	add	$64, %rdi
265	add	$64, %rsi
266	movdqu	%xmm4, -64(%rdi)
267	movaps	(%rsi), %xmm2
268	movdqa	%xmm2, %xmm4
269	movdqu	%xmm5, -48(%rdi)
270	movaps	16(%rsi), %xmm5
271	pminub	%xmm5, %xmm2
272	movaps	32(%rsi), %xmm3
273	movdqu	%xmm6, -32(%rdi)
274	movaps	%xmm3, %xmm6
275	movdqu	%xmm7, -16(%rdi)
276	movaps	48(%rsi), %xmm7
277	pminub	%xmm7, %xmm3
278	pminub	%xmm2, %xmm3
279	pcmpeqb	%xmm0, %xmm3
280	pmovmskb %xmm3, %rdx
281#ifdef USE_AS_STRNCPY
282	sub	$64, %r8
283	jbe	L(UnalignedLeaveCase2OrCase3)
284#endif
285	test	%rdx, %rdx
286	jz	L(Unaligned64Loop_start)
287
/* Unaligned64Leave: the 64 bytes held in %xmm4..%xmm7 contain a zero
   byte; find which 16-byte chunk holds the first one.
   %xmm0 and %xmm1 are used as compare destinations; each is all-zero
   again whenever its mask tested as zero, so it can be reused for the
   next chunk.  Chunks 0, 1 and 2 branch to dedicated labels; the
   fall-through case handles a zero byte in chunk 3 (%xmm7).  */
288L(Unaligned64Leave):
289	pxor	%xmm1, %xmm1
290
291	pcmpeqb	%xmm4, %xmm0
292	pcmpeqb	%xmm5, %xmm1
293	pmovmskb %xmm0, %rdx
294	pmovmskb %xmm1, %rcx
295	test	%rdx, %rdx
296	jnz	L(CopyFrom1To16BytesUnaligned_0)
297	test	%rcx, %rcx
298	jnz	L(CopyFrom1To16BytesUnaligned_16)
299
300	pcmpeqb	%xmm6, %xmm0
301	pcmpeqb	%xmm7, %xmm1
302	pmovmskb %xmm0, %rdx
303	pmovmskb %xmm1, %rcx
304	test	%rdx, %rdx
305	jnz	L(CopyFrom1To16BytesUnaligned_32)
306
	/* Zero byte is in the fourth chunk: %rdx = its index within that
	   chunk.  The first three chunks are stored unconditionally.  */
307	bsf	%rcx, %rdx
308	movdqu	%xmm4, (%rdi)
309	movdqu	%xmm5, 16(%rdi)
310	movdqu	%xmm6, 32(%rdi)
311#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
312# ifdef USE_AS_STPCPY
	/* Result = address of the byte at offset 48 + %rdx.  */
313	lea	48(%rdi, %rdx), %rax
314# endif
	/* Store the whole fourth chunk, then %r8 = %r8 + 15 - %rdx and %rdi
	   points one byte past offset 48 + %rdx before jumping to
	   L(StrncpyFillTailWithZero).  */
315	movdqu	%xmm7, 48(%rdi)
316	add	$15, %r8
317	sub	%rdx, %r8
318	lea	49(%rdi, %rdx), %rdi
319	jmp	L(StrncpyFillTailWithZero)
320#else
	/* Point both buffers at the fourth chunk and dispatch on the index of
	   the zero byte.  */
321	add	$48, %rsi
322	add	$48, %rdi
323	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
324#endif
325
326/* If source address alignment == destination address alignment */
327
/* SourceStringAlignmentLess32: taken from the function entry when the
   source address modulo 64 is at most 32, so the 32 bytes read here with
   two unaligned loads stay inside one 64-byte-aligned block.
   Checks bytes 0-15 and then 16-31 for a NUL byte (and, under
   USE_AS_STRNCPY, against the length in %r8).  If neither ends the copy,
   the first 16 bytes have been stored and control continues at
   L(Unalign16Both) with %rsi rounded down to 16 and %rcx masked to its
   low four bits.  */
328L(SourceStringAlignmentLess32):
329	pxor	%xmm0, %xmm0
330	movdqu	(%rsi), %xmm1
331	movdqu	16(%rsi), %xmm2
332	pcmpeqb	%xmm1, %xmm0
333	pmovmskb %xmm0, %rdx
334
335#ifdef USE_AS_STRNCPY
	/* Length limit within the first chunk: 16, or 17 for plain strncpy.  */
336# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
337	cmp	$16, %r8
338# else
339	cmp	$17, %r8
340# endif
341	jbe	L(CopyFrom1To16BytesTail1Case2OrCase3)
342#endif
343	test	%rdx, %rdx
344	jnz	L(CopyFrom1To16BytesTail1)
345
	/* First chunk holds no NUL (so %xmm0 is all-zero again): store it and
	   probe the second chunk.  */
346	pcmpeqb	%xmm2, %xmm0
347	movdqu	%xmm1, (%rdi)
348	pmovmskb %xmm0, %rdx
349
350#ifdef USE_AS_STRNCPY
	/* Length limit within the first two chunks: 32, or 33 for plain
	   strncpy.  */
351# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
352	cmp	$32, %r8
353# else
354	cmp	$33, %r8
355# endif
356	jbe	L(CopyFrom1To32Bytes1Case2OrCase3)
357#endif
358	test	%rdx, %rdx
359	jnz	L(CopyFrom1To32Bytes1)
360
361	and	$15, %rcx
362	and	$-16, %rsi
363
364	jmp	L(Unalign16Both)
365
366/*------End of main part with loops---------------------*/
367
368/* Case1 */
369
/* CopyFrom1To16Bytes: only assembled when USE_AS_STRNCPY is not defined
   or USE_AS_STRCAT is defined.  Adds the chunk index %rcx to both
   pointers, converts the byte mask in %rdx to the index of its lowest set
   bit and dispatches through L(ExitTable) on that index.  */
370#if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
371	.p2align 4
372L(CopyFrom1To16Bytes):
373	add	%rcx, %rdi
374	add	%rcx, %rsi
375	bsf	%rdx, %rdx
376	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
377#endif
378	.p2align 4
/* CopyFrom1To16BytesTail: reached from the function entry when the first
   aligned chunk contains a NUL byte.  Adds %rcx back to %rsi (undoing the
   round-down done at entry), takes the index of the lowest set mask bit
   and dispatches through L(ExitTable).  %rdi is left unchanged.  */
379L(CopyFrom1To16BytesTail):
380	add	%rcx, %rsi
381	bsf	%rdx, %rdx
382	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
383
384	.p2align 4
/* CopyFrom1To32Bytes1: NUL byte found in the second 16 bytes on the
   L(SourceStringAlignmentLess32) path.  Steps both pointers past the
   first 16 bytes (and, for strncpy without strcat, reduces %r8 by 16),
   then falls through.
   CopyFrom1To16BytesTail1: converts the mask in %rdx to the index of its
   lowest set bit and dispatches through L(ExitTable).  */
385L(CopyFrom1To32Bytes1):
386	add	$16, %rsi
387	add	$16, %rdi
388#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
389	sub	$16, %r8
390#endif
391L(CopyFrom1To16BytesTail1):
392	bsf	%rdx, %rdx
393	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
394
395	.p2align 4
/* CopyFrom1To32Bytes: reached from the function entry when the second
   aligned chunk contains a NUL byte.  %rsi gets %rcx added back (undoing
   the round-down done at entry) and the dispatch index becomes
   bsf(mask) + 16 - %rcx, i.e. the NUL position counted from the restored
   %rsi.  Dispatches through L(ExitTable).  */
396L(CopyFrom1To32Bytes):
397	bsf	%rdx, %rdx
398	add	%rcx, %rsi
399	add	$16, %rdx
400	sub	%rcx, %rdx
401	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
402
403	.p2align 4
/* CopyFrom1To16BytesUnaligned_0: the first zero byte of the current
   64-byte block is in chunk 0 (%xmm4).  %rdx becomes its index.
   strncpy without strcat: stores the whole chunk, sets
   %r8 = %r8 + 63 - %rdx, points %rdi one byte past the zero byte
   (STPCPY: %rax = address of the zero byte) and jumps to
   L(StrncpyFillTailWithZero).
   Otherwise: dispatches through L(ExitTable) on the index.  */
404L(CopyFrom1To16BytesUnaligned_0):
405	bsf	%rdx, %rdx
406#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
407# ifdef USE_AS_STPCPY
408	lea	(%rdi, %rdx), %rax
409# endif
410	movdqu	%xmm4, (%rdi)
411	add	$63, %r8
412	sub	%rdx, %r8
413	lea	1(%rdi, %rdx), %rdi
414	jmp	L(StrncpyFillTailWithZero)
415#else
416	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
417#endif
418
419	.p2align 4
/* CopyFrom1To16BytesUnaligned_16: the first zero byte of the current
   64-byte block is in chunk 1 (%xmm5).  %rdx = its index within that
   chunk (taken from the mask in %rcx); chunk 0 is stored first.
   strncpy without strcat: stores chunk 1 in full, sets
   %r8 = %r8 + 47 - %rdx, points %rdi one byte past the zero byte
   (STPCPY: %rax = address of the zero byte) and jumps to
   L(StrncpyFillTailWithZero).
   Otherwise: advances both pointers by 16 and dispatches through
   L(ExitTable) on the index.  */
420L(CopyFrom1To16BytesUnaligned_16):
421	bsf	%rcx, %rdx
422	movdqu	%xmm4, (%rdi)
423#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
424# ifdef USE_AS_STPCPY
425	lea	16(%rdi, %rdx), %rax
426# endif
427	movdqu	%xmm5, 16(%rdi)
428	add	$47, %r8
429	sub	%rdx, %r8
430	lea	17(%rdi, %rdx), %rdi
431	jmp	L(StrncpyFillTailWithZero)
432#else
433	add	$16, %rsi
434	add	$16, %rdi
435	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
436#endif
437
438	.p2align 4
/* CopyFrom1To16BytesUnaligned_32: the first zero byte of the current
   64-byte block is in chunk 2 (%xmm6).  %rdx becomes its index within
   that chunk; chunks 0 and 1 are stored first.
   strncpy without strcat: stores chunk 2 in full, sets
   %r8 = %r8 + 31 - %rdx, points %rdi one byte past the zero byte
   (STPCPY: %rax = address of the zero byte) and jumps to
   L(StrncpyFillTailWithZero).
   Otherwise: advances both pointers by 32 and dispatches through
   L(ExitTable) on the index.  */
439L(CopyFrom1To16BytesUnaligned_32):
440	bsf	%rdx, %rdx
441	movdqu	%xmm4, (%rdi)
442	movdqu	%xmm5, 16(%rdi)
443#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
444# ifdef USE_AS_STPCPY
445	lea	32(%rdi, %rdx), %rax
446# endif
447	movdqu	%xmm6, 32(%rdi)
448	add	$31, %r8
449	sub	%rdx, %r8
450	lea	33(%rdi, %rdx), %rdi
451	jmp	L(StrncpyFillTailWithZero)
452#else
453	add	$32, %rsi
454	add	$32, %rdi
455	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
456#endif
457
458#ifdef USE_AS_STRNCPY
459# ifndef USE_AS_STRCAT
460	.p2align 4
/* CopyFrom1To16BytesUnalignedXmmN (N = 6, 5, 4, 3, 1): small thunks,
   assembled for strncpy without strcat only.  Each stores the named
   16-byte register to (%rdi, %rcx) with an unaligned store and jumps to
   L(CopyFrom1To16BytesXmmExit), which is defined outside this part of
   the file.  */
461L(CopyFrom1To16BytesUnalignedXmm6):
462	movdqu	%xmm6, (%rdi, %rcx)
463	jmp	L(CopyFrom1To16BytesXmmExit)
464
465	.p2align 4
466L(CopyFrom1To16BytesUnalignedXmm5):
467	movdqu	%xmm5, (%rdi, %rcx)
468	jmp	L(CopyFrom1To16BytesXmmExit)
469
470	.p2align 4
471L(CopyFrom1To16BytesUnalignedXmm4):
472	movdqu	%xmm4, (%rdi, %rcx)
473	jmp	L(CopyFrom1To16BytesXmmExit)
474
475	.p2align 4
476L(CopyFrom1To16BytesUnalignedXmm3):
477	movdqu	%xmm3, (%rdi, %rcx)
478	jmp	L(CopyFrom1To16BytesXmmExit)
479
480	.p2align 4
481L(CopyFrom1To16BytesUnalignedXmm1):
482	movdqu	%xmm1, (%rdi, %rcx)
483	jmp	L(CopyFrom1To16BytesXmmExit)
484# endif
485
486	.p2align 4
/* CopyFrom1To16BytesExit: shared tail of the Case2 handlers; dispatches
   through L(ExitTable) using the byte index already held in %rdx.  */
487L(CopyFrom1To16BytesExit):
488	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
489
490/* Case2 */
491
492	.p2align 4
/* CopyFrom1To16BytesCase2: the length limit was reached in a chunk that
   also contains a zero byte.  Restores %r8 by 16, adds the chunk index
   %rcx to both pointers and takes the index of the zero byte.  If that
   index is below %r8 (unsigned) the copy ends at the zero byte via
   L(ExitTable); otherwise it ends at the limit via L(ExitStrncpyTable)
   indexed by %r8.  */
493L(CopyFrom1To16BytesCase2):
494	add	$16, %r8
495	add	%rcx, %rdi
496	add	%rcx, %rsi
497	bsf	%rdx, %rdx
498	cmp	%r8, %rdx
499	jb	L(CopyFrom1To16BytesExit)
500	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
501
502	.p2align 4
/* CopyFrom1To32BytesCase2: length limit reached within the first two
   aligned chunks and the second chunk contains a zero byte.  %rsi gets
   %rcx added back and %rdx becomes bsf(mask) + 16 - %rcx.  If that index
   is below %r8 (unsigned) the copy ends at the zero byte via
   L(ExitTable); otherwise at the limit via L(ExitStrncpyTable) indexed by
   %r8.  */
503L(CopyFrom1To32BytesCase2):
504	add	%rcx, %rsi
505	bsf	%rdx, %rdx
506	add	$16, %rdx
507	sub	%rcx, %rdx
508	cmp	%r8, %rdx
509	jb	L(CopyFrom1To16BytesExit)
510	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
511
/* CopyFrom1To16BytesTailCase2: length limit reached within the first
   aligned chunk, which also contains a zero byte.  %rsi gets %rcx added
   back and %rdx becomes the index of the zero byte.  If that index is
   below %r8 (unsigned) the copy ends at the zero byte via L(ExitTable);
   otherwise at the limit via L(ExitStrncpyTable) indexed by %r8.  */
512L(CopyFrom1To16BytesTailCase2):
513	add	%rcx, %rsi
514	bsf	%rdx, %rdx
515	cmp	%r8, %rdx
516	jb	L(CopyFrom1To16BytesExit)
517	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
518
/* CopyFrom1To16BytesTail1Case2: same decision as the other Case2
   handlers, without any pointer adjustment: %rdx becomes the index of
   the zero byte; below %r8 (unsigned) the copy ends at the zero byte via
   L(ExitTable), otherwise at the limit via L(ExitStrncpyTable) indexed by
   %r8.  */
519L(CopyFrom1To16BytesTail1Case2):
520	bsf	%rdx, %rdx
521	cmp	%r8, %rdx
522	jb	L(CopyFrom1To16BytesExit)
523	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
524
525/* Case2 or Case3,  Case3 */
526
527	.p2align 4
/* CopyFrom1To16BytesCase2OrCase3: reached from the unrolled copy when
   the length limit is hit.  A nonzero mask in %rdx (zero byte present)
   goes to Case2.
   CopyFrom1To16BytesCase3: no zero byte in this chunk; restores %r8 by
   16, adds the chunk index %rcx to both pointers and dispatches through
   L(ExitStrncpyTable) on %r8.  */
528L(CopyFrom1To16BytesCase2OrCase3):
529	test	%rdx, %rdx
530	jnz	L(CopyFrom1To16BytesCase2)
531L(CopyFrom1To16BytesCase3):
532	add	$16, %r8
533	add	%rcx, %rdi
534	add	%rcx, %rsi
535	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
536
537	.p2align 4
/* CopyFrom1To32BytesCase2OrCase3: length limit hit while probing the
   second aligned chunk at function entry.  A nonzero mask in %rdx goes to
   L(CopyFrom1To32BytesCase2); otherwise %rsi gets %rcx added back and the
   copy ends at the limit via L(ExitStrncpyTable) indexed by %r8.  */
538L(CopyFrom1To32BytesCase2OrCase3):
539	test	%rdx, %rdx
540	jnz	L(CopyFrom1To32BytesCase2)
541	add	%rcx, %rsi
542	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
543
544	.p2align 4
/* CopyFrom1To16BytesTailCase2OrCase3: length limit hit while probing the
   first aligned chunk at function entry.  A nonzero mask in %rdx goes to
   L(CopyFrom1To16BytesTailCase2); otherwise %rsi gets %rcx added back and
   the copy ends at the limit via L(ExitStrncpyTable) indexed by %r8.  */
545L(CopyFrom1To16BytesTailCase2OrCase3):
546	test	%rdx, %rdx
547	jnz	L(CopyFrom1To16BytesTailCase2)
548	add	%rcx, %rsi
549	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
550
551	.p2align 4
/* CopyFrom1To32Bytes1Case2OrCase3: length limit hit in the second 16
   bytes on the L(SourceStringAlignmentLess32) path.  Steps both pointers
   past the first 16 bytes, reduces %r8 by 16 and falls through.
   CopyFrom1To16BytesTail1Case2OrCase3: a nonzero mask in %rdx goes to
   L(CopyFrom1To16BytesTail1Case2); otherwise the copy ends at the limit
   via L(ExitStrncpyTable) indexed by %r8.  */
552L(CopyFrom1To32Bytes1Case2OrCase3):
553	add	$16, %rdi
554	add	$16, %rsi
555	sub	$16, %r8
556L(CopyFrom1To16BytesTail1Case2OrCase3):
557	test	%rdx, %rdx
558	jnz	L(CopyFrom1To16BytesTail1Case2)
559	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
560
561#endif
562
563/*------------End of labels for copying 1-16 bytes and 1-32 bytes----*/
564
565	.p2align 4
/* Exit1: 1-byte tail.  Writes %dh to byte 0 of the destination;
   presumably %dh is zero here (%rdx holding a small byte index), making
   this the terminating NUL -- TODO confirm at the dispatch sites.
   USE_AS_STPCPY: %rax = %rdi.
   USE_AS_STRNCPY && !USE_AS_STRCAT: %r8 -= 1, %rdi += 1, then jump to
   L(StrncpyFillTailWithZero) if %r8 is still nonzero (jnz uses the flags
   of the sub; lea leaves them alone).  */
566L(Exit1):
567	mov	%dh, (%rdi)
568#ifdef USE_AS_STPCPY
569	lea	(%rdi), %rax
570#endif
571#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
572	sub	$1, %r8
573	lea	1(%rdi), %rdi
574	jnz	L(StrncpyFillTailWithZero)
575#endif
576	RETURN
577
578	.p2align 4
/* Exit2: 2-byte tail copied with one 16-bit move.
   USE_AS_STPCPY: %rax = %rdi + 1.
   USE_AS_STRNCPY && !USE_AS_STRCAT: %r8 -= 2, %rdi += 2, then jump to
   L(StrncpyFillTailWithZero) if %r8 is still nonzero (jnz uses the flags
   of the sub; lea leaves them alone).  */
579L(Exit2):
580	mov	(%rsi), %dx
581	mov	%dx, (%rdi)
582#ifdef USE_AS_STPCPY
583	lea	1(%rdi), %rax
584#endif
585#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
586	sub	$2, %r8
587	lea	2(%rdi), %rdi
588	jnz	L(StrncpyFillTailWithZero)
589#endif
590	RETURN
591
592	.p2align 4
/* Exit3: 3-byte tail -- 16-bit move for bytes 0-1, then %dh to byte 2
   (presumably zero, i.e. the terminating NUL; TODO confirm at the
   dispatch sites).
   USE_AS_STPCPY: %rax = %rdi + 2.
   USE_AS_STRNCPY && !USE_AS_STRCAT: %r8 -= 3, %rdi += 3, then jump to
   L(StrncpyFillTailWithZero) if %r8 is still nonzero (jnz uses the flags
   of the sub; lea leaves them alone).  */
593L(Exit3):
594	mov	(%rsi), %cx
595	mov	%cx, (%rdi)
596	mov	%dh, 2(%rdi)
597#ifdef USE_AS_STPCPY
598	lea	2(%rdi), %rax
599#endif
600#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
601	sub	$3, %r8
602	lea	3(%rdi), %rdi
603	jnz	L(StrncpyFillTailWithZero)
604#endif
605	RETURN
606
607	.p2align 4
/* Exit4: 4-byte tail copied with one 32-bit move.
   USE_AS_STPCPY: %rax = %rdi + 3.
   USE_AS_STRNCPY && !USE_AS_STRCAT: %r8 -= 4, %rdi += 4, then jump to
   L(StrncpyFillTailWithZero) if %r8 is still nonzero (jnz uses the flags
   of the sub; lea leaves them alone).  */
608L(Exit4):
609	mov	(%rsi), %edx
610	mov	%edx, (%rdi)
611#ifdef USE_AS_STPCPY
612	lea	3(%rdi), %rax
613#endif
614#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
615	sub	$4, %r8
616	lea	4(%rdi), %rdi
617	jnz	L(StrncpyFillTailWithZero)
618#endif
619	RETURN
620
621	.p2align 4
/* Exit5: 5-byte tail -- 32-bit move for bytes 0-3 and %dh to byte 4
   (presumably zero, i.e. the terminating NUL; TODO confirm at the
   dispatch sites).
   USE_AS_STPCPY: %rax = %rdi + 4.
   USE_AS_STRNCPY && !USE_AS_STRCAT: %r8 -= 5, %rdi += 5, then jump to
   L(StrncpyFillTailWithZero) if %r8 is still nonzero (jnz uses the flags
   of the sub; lea leaves them alone).  */
622L(Exit5):
623	mov	(%rsi), %ecx
624	mov	%dh, 4(%rdi)
625	mov	%ecx, (%rdi)
626#ifdef USE_AS_STPCPY
627	lea	4(%rdi), %rax
628#endif
629#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
630	sub	$5, %r8
631	lea	5(%rdi), %rdi
632	jnz	L(StrncpyFillTailWithZero)
633#endif
634	RETURN
635
636	.p2align 4
/* Exit6: 6-byte tail -- 32-bit move for bytes 0-3 and 16-bit move for
   bytes 4-5.
   USE_AS_STPCPY: %rax = %rdi + 5.
   USE_AS_STRNCPY && !USE_AS_STRCAT: %r8 -= 6, %rdi += 6, then jump to
   L(StrncpyFillTailWithZero) if %r8 is still nonzero (jnz uses the flags
   of the sub; lea leaves them alone).  */
637L(Exit6):
638	mov	(%rsi), %ecx
639	mov	4(%rsi), %dx
640	mov	%ecx, (%rdi)
641	mov	%dx, 4(%rdi)
642#ifdef USE_AS_STPCPY
643	lea	5(%rdi), %rax
644#endif
645#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
646	sub	$6, %r8
647	lea	6(%rdi), %rdi
648	jnz	L(StrncpyFillTailWithZero)
649#endif
650	RETURN
651
652	.p2align 4
/* Exit7: 7-byte tail -- two overlapping 32-bit moves at offsets 0 and 3.
   USE_AS_STPCPY: %rax = %rdi + 6.
   USE_AS_STRNCPY && !USE_AS_STRCAT: %r8 -= 7, %rdi += 7, then jump to
   L(StrncpyFillTailWithZero) if %r8 is still nonzero (jnz uses the flags
   of the sub; lea leaves them alone).  */
653L(Exit7):
654	mov	(%rsi), %ecx
655	mov	3(%rsi), %edx
656	mov	%ecx, (%rdi)
657	mov	%edx, 3(%rdi)
658#ifdef USE_AS_STPCPY
659	lea	6(%rdi), %rax
660#endif
661#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
662	sub	$7, %r8
663	lea	7(%rdi), %rdi
664	jnz	L(StrncpyFillTailWithZero)
665#endif
666	RETURN
667
668	.p2align 4
/* Exit8: 8-byte tail copied with one 64-bit move.
   USE_AS_STPCPY: %rax = %rdi + 7.
   USE_AS_STRNCPY && !USE_AS_STRCAT: %r8 -= 8, %rdi += 8, then jump to
   L(StrncpyFillTailWithZero) if %r8 is still nonzero (jnz uses the flags
   of the sub; lea leaves them alone).  */
669L(Exit8):
670	mov	(%rsi), %rdx
671	mov	%rdx, (%rdi)
672#ifdef USE_AS_STPCPY
673	lea	7(%rdi), %rax
674#endif
675#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
676	sub	$8, %r8
677	lea	8(%rdi), %rdi
678	jnz	L(StrncpyFillTailWithZero)
679#endif
680	RETURN
681
682	.p2align 4
/* Exit9: 9-byte tail -- 64-bit move for bytes 0-7 and %dh to byte 8
   (presumably zero, i.e. the terminating NUL; TODO confirm at the
   dispatch sites).
   USE_AS_STPCPY: %rax = %rdi + 8.
   USE_AS_STRNCPY && !USE_AS_STRCAT: %r8 -= 9, %rdi += 9, then jump to
   L(StrncpyFillTailWithZero) if %r8 is still nonzero (jnz uses the flags
   of the sub; lea leaves them alone).  */
683L(Exit9):
684	mov	(%rsi), %rcx
685	mov	%dh, 8(%rdi)
686	mov	%rcx, (%rdi)
687#ifdef USE_AS_STPCPY
688	lea	8(%rdi), %rax
689#endif
690#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
691	sub	$9, %r8
692	lea	9(%rdi), %rdi
693	jnz	L(StrncpyFillTailWithZero)
694#endif
695	RETURN
696
697	.p2align 4
/* Exit10: 10-byte tail -- 64-bit move for bytes 0-7 and 16-bit move for
   bytes 8-9.
   USE_AS_STPCPY: %rax = %rdi + 9.
   USE_AS_STRNCPY && !USE_AS_STRCAT: %r8 -= 10, %rdi += 10, then jump to
   L(StrncpyFillTailWithZero) if %r8 is still nonzero (jnz uses the flags
   of the sub; lea leaves them alone).  */
698L(Exit10):
699	mov	(%rsi), %rcx
700	mov	8(%rsi), %dx
701	mov	%rcx, (%rdi)
702	mov	%dx, 8(%rdi)
703#ifdef USE_AS_STPCPY
704	lea	9(%rdi), %rax
705#endif
706#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
707	sub	$10, %r8
708	lea	10(%rdi), %rdi
709	jnz	L(StrncpyFillTailWithZero)
710#endif
711	RETURN
712
713	.p2align 4
/* Exit11: 11-byte tail -- 64-bit move at offset 0 plus an overlapping
   32-bit move at offset 7.
   USE_AS_STPCPY: %rax = %rdi + 10.
   USE_AS_STRNCPY && !USE_AS_STRCAT: %r8 -= 11, %rdi += 11, then jump to
   L(StrncpyFillTailWithZero) if %r8 is still nonzero (jnz uses the flags
   of the sub; lea leaves them alone).  */
714L(Exit11):
715	mov	(%rsi), %rcx
716	mov	7(%rsi), %edx
717	mov	%rcx, (%rdi)
718	mov	%edx, 7(%rdi)
719#ifdef USE_AS_STPCPY
720	lea	10(%rdi), %rax
721#endif
722#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
723	sub	$11, %r8
724	lea	11(%rdi), %rdi
725	jnz	L(StrncpyFillTailWithZero)
726#endif
727	RETURN
728
729	.p2align 4
/* Exit12: 12-byte tail -- 64-bit move for bytes 0-7 and 32-bit move for
   bytes 8-11.
   USE_AS_STPCPY: %rax = %rdi + 11.
   USE_AS_STRNCPY && !USE_AS_STRCAT: %r8 -= 12, %rdi += 12, then jump to
   L(StrncpyFillTailWithZero) if %r8 is still nonzero (jnz uses the flags
   of the sub; lea leaves them alone).  */
730L(Exit12):
731	mov	(%rsi), %rcx
732	mov	8(%rsi), %edx
733	mov	%rcx, (%rdi)
734	mov	%edx, 8(%rdi)
735#ifdef USE_AS_STPCPY
736	lea	11(%rdi), %rax
737#endif
738#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
739	sub	$12, %r8
740	lea	12(%rdi), %rdi
741	jnz	L(StrncpyFillTailWithZero)
742#endif
743	RETURN
744
745	.p2align 4
/* Exit13: 13-byte tail -- two overlapping 64-bit moves at offsets 0
   and 5.
   USE_AS_STPCPY: %rax = %rdi + 12.
   USE_AS_STRNCPY && !USE_AS_STRCAT: %r8 -= 13, %rdi += 13, then jump to
   L(StrncpyFillTailWithZero) if %r8 is still nonzero (jnz uses the flags
   of the sub; lea leaves them alone).  */
746L(Exit13):
747	mov	(%rsi), %rcx
748	mov	5(%rsi), %rdx
749	mov	%rcx, (%rdi)
750	mov	%rdx, 5(%rdi)
751#ifdef USE_AS_STPCPY
752	lea	12(%rdi), %rax
753#endif
754#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
755	sub	$13, %r8
756	lea	13(%rdi), %rdi
757	jnz	L(StrncpyFillTailWithZero)
758#endif
759	RETURN
760
761	.p2align 4
/* Exit14: 14-byte tail -- two overlapping 64-bit moves at offsets 0
   and 6.
   USE_AS_STPCPY: %rax = %rdi + 13.
   USE_AS_STRNCPY && !USE_AS_STRCAT: %r8 -= 14, %rdi += 14, then jump to
   L(StrncpyFillTailWithZero) if %r8 is still nonzero (jnz uses the flags
   of the sub; lea leaves them alone).  */
762L(Exit14):
763	mov	(%rsi), %rcx
764	mov	6(%rsi), %rdx
765	mov	%rcx, (%rdi)
766	mov	%rdx, 6(%rdi)
767#ifdef USE_AS_STPCPY
768	lea	13(%rdi), %rax
769#endif
770#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
771	sub	$14, %r8
772	lea	14(%rdi), %rdi
773	jnz	L(StrncpyFillTailWithZero)
774#endif
775	RETURN
776
777	.p2align 4
/* Exit15: 15-byte tail -- two overlapping 64-bit moves at offsets 0
   and 7.
   USE_AS_STPCPY: %rax = %rdi + 14.
   USE_AS_STRNCPY && !USE_AS_STRCAT: %r8 -= 15, %rdi += 15, then jump to
   L(StrncpyFillTailWithZero) if %r8 is still nonzero (jnz uses the flags
   of the sub; lea leaves them alone).  */
778L(Exit15):
779	mov	(%rsi), %rcx
780	mov	7(%rsi), %rdx
781	mov	%rcx, (%rdi)
782	mov	%rdx, 7(%rdi)
783#ifdef USE_AS_STPCPY
784	lea	14(%rdi), %rax
785#endif
786#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
787	sub	$15, %r8
788	lea	15(%rdi), %rdi
789	jnz	L(StrncpyFillTailWithZero)
790#endif
791	RETURN
792
793	.p2align 4
/* Exit16: 16-byte tail copied with one unaligned 16-byte SSE move.
   USE_AS_STPCPY: %rax = %rdi + 15.
   USE_AS_STRNCPY && !USE_AS_STRCAT: %r8 -= 16, %rdi += 16, then jump to
   L(StrncpyFillTailWithZero) if %r8 is still nonzero (jnz uses the flags
   of the sub; lea leaves them alone).  */
794L(Exit16):
795	movdqu	(%rsi), %xmm0
796	movdqu	%xmm0, (%rdi)
797#ifdef USE_AS_STPCPY
798	lea	15(%rdi), %rax
799#endif
800#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
801	sub	$16, %r8
802	lea	16(%rdi), %rdi
803	jnz	L(StrncpyFillTailWithZero)
804#endif
805	RETURN
806
807	.p2align 4
/* Exit17: 17-byte tail -- 16-byte SSE move for bytes 0-15, then %dh to
   byte 16 (presumably zero, i.e. the terminating NUL; TODO confirm at the
   dispatch sites).
   USE_AS_STPCPY: %rax = %rdi + 16.
   USE_AS_STRNCPY && !USE_AS_STRCAT: %r8 -= 17, %rdi += 17, then jump to
   L(StrncpyFillTailWithZero) if %r8 is still nonzero (jnz uses the flags
   of the sub; lea leaves them alone).  */
808L(Exit17):
809	movdqu	(%rsi), %xmm0
810	movdqu	%xmm0, (%rdi)
811	mov	%dh, 16(%rdi)
812#ifdef USE_AS_STPCPY
813	lea	16(%rdi), %rax
814#endif
815#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
816	sub	$17, %r8
817	lea	17(%rdi), %rdi
818	jnz	L(StrncpyFillTailWithZero)
819#endif
820	RETURN
821
822	.p2align 4
/* Exit18: 18-byte tail -- 16-byte SSE move for bytes 0-15 and 16-bit
   move for bytes 16-17.
   USE_AS_STPCPY: %rax = %rdi + 17.
   USE_AS_STRNCPY && !USE_AS_STRCAT: %r8 -= 18, %rdi += 18, then jump to
   L(StrncpyFillTailWithZero) if %r8 is still nonzero (jnz uses the flags
   of the sub; lea leaves them alone).  */
823L(Exit18):
824	movdqu	(%rsi), %xmm0
825	mov	16(%rsi), %cx
826	movdqu	%xmm0, (%rdi)
827	mov	%cx, 16(%rdi)
828#ifdef USE_AS_STPCPY
829	lea	17(%rdi), %rax
830#endif
831#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
832	sub	$18, %r8
833	lea	18(%rdi), %rdi
834	jnz	L(StrncpyFillTailWithZero)
835#endif
836	RETURN
837
838	.p2align 4
/* Exit19: 19-byte tail -- 16-byte SSE move at offset 0 plus an
   overlapping 32-bit move at offset 15.
   USE_AS_STPCPY: %rax = %rdi + 18.
   USE_AS_STRNCPY && !USE_AS_STRCAT: %r8 -= 19, %rdi += 19, then jump to
   L(StrncpyFillTailWithZero) if %r8 is still nonzero (jnz uses the flags
   of the sub; lea leaves them alone).  */
839L(Exit19):
840	movdqu	(%rsi), %xmm0
841	mov	15(%rsi), %ecx
842	movdqu	%xmm0, (%rdi)
843	mov	%ecx, 15(%rdi)
844#ifdef USE_AS_STPCPY
845	lea	18(%rdi), %rax
846#endif
847#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
848	sub	$19, %r8
849	lea	19(%rdi), %rdi
850	jnz	L(StrncpyFillTailWithZero)
851#endif
852	RETURN
853
854	.p2align 4
/* Exit20: 20-byte tail -- 16-byte SSE move for bytes 0-15 and 32-bit
   move for bytes 16-19.
   USE_AS_STPCPY: %rax = %rdi + 19.
   USE_AS_STRNCPY && !USE_AS_STRCAT: %r8 -= 20, %rdi += 20, then jump to
   L(StrncpyFillTailWithZero) if %r8 is still nonzero (jnz uses the flags
   of the sub; lea leaves them alone).  */
855L(Exit20):
856	movdqu	(%rsi), %xmm0
857	mov	16(%rsi), %ecx
858	movdqu	%xmm0, (%rdi)
859	mov	%ecx, 16(%rdi)
860#ifdef USE_AS_STPCPY
861	lea	19(%rdi), %rax
862#endif
863#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
864	sub	$20, %r8
865	lea	20(%rdi), %rdi
866	jnz	L(StrncpyFillTailWithZero)
867#endif
868	RETURN
869
870	.p2align 4
/* Exit21: 21-byte tail -- 16-byte SSE move for bytes 0-15, 32-bit move
   for bytes 16-19, then %dh to byte 20 (presumably zero, i.e. the
   terminating NUL; TODO confirm at the dispatch sites).
   USE_AS_STPCPY: %rax = %rdi + 20.
   USE_AS_STRNCPY && !USE_AS_STRCAT: %r8 -= 21, %rdi += 21, then jump to
   L(StrncpyFillTailWithZero) if %r8 is still nonzero (jnz uses the flags
   of the sub; lea leaves them alone).  */
871L(Exit21):
872	movdqu	(%rsi), %xmm0
873	mov	16(%rsi), %ecx
874	movdqu	%xmm0, (%rdi)
875	mov	%ecx, 16(%rdi)
876	mov	%dh, 20(%rdi)
877#ifdef USE_AS_STPCPY
878	lea	20(%rdi), %rax
879#endif
880#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
881	sub	$21, %r8
882	lea	21(%rdi), %rdi
883	jnz	L(StrncpyFillTailWithZero)
884#endif
885	RETURN
886
887	.p2align 4
/* Exit22: 22-byte tail -- 16-byte SSE move at offset 0 plus an
   overlapping 64-bit move at offset 14.
   USE_AS_STPCPY: %rax = %rdi + 21.
   USE_AS_STRNCPY && !USE_AS_STRCAT: %r8 -= 22, %rdi += 22, then jump to
   L(StrncpyFillTailWithZero) if %r8 is still nonzero (jnz uses the flags
   of the sub; lea leaves them alone).  */
888L(Exit22):
889	movdqu	(%rsi), %xmm0
890	mov	14(%rsi), %rcx
891	movdqu	%xmm0, (%rdi)
892	mov	%rcx, 14(%rdi)
893#ifdef USE_AS_STPCPY
894	lea	21(%rdi), %rax
895#endif
896#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
897	sub	$22, %r8
898	lea	22(%rdi), %rdi
899	jnz	L(StrncpyFillTailWithZero)
900#endif
901	RETURN
902
903	.p2align 4
/* Exit23: 23-byte tail -- 16-byte SSE move at offset 0 plus an
   overlapping 64-bit move at offset 15.
   USE_AS_STPCPY: %rax = %rdi + 22.
   USE_AS_STRNCPY && !USE_AS_STRCAT: %r8 -= 23, %rdi += 23, then jump to
   L(StrncpyFillTailWithZero) if %r8 is still nonzero (jnz uses the flags
   of the sub; lea leaves them alone).  */
904L(Exit23):
905	movdqu	(%rsi), %xmm0
906	mov	15(%rsi), %rcx
907	movdqu	%xmm0, (%rdi)
908	mov	%rcx, 15(%rdi)
909#ifdef USE_AS_STPCPY
910	lea	22(%rdi), %rax
911#endif
912#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
913	sub	$23, %r8
914	lea	23(%rdi), %rdi
915	jnz	L(StrncpyFillTailWithZero)
916#endif
917	RETURN
918
919	.p2align 4
/* Exit24: 24-byte tail -- 16-byte SSE move for bytes 0-15 and 64-bit
   move for bytes 16-23.
   USE_AS_STPCPY: %rax = %rdi + 23.
   USE_AS_STRNCPY && !USE_AS_STRCAT: %r8 -= 24, %rdi += 24, then jump to
   L(StrncpyFillTailWithZero) if %r8 is still nonzero (jnz uses the flags
   of the sub; lea leaves them alone).  */
920L(Exit24):
921	movdqu	(%rsi), %xmm0
922	mov	16(%rsi), %rcx
923	movdqu	%xmm0, (%rdi)
924	mov	%rcx, 16(%rdi)
925#ifdef USE_AS_STPCPY
926	lea	23(%rdi), %rax
927#endif
928#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
929	sub	$24, %r8
930	lea	24(%rdi), %rdi
931	jnz	L(StrncpyFillTailWithZero)
932#endif
933	RETURN
934
935	.p2align 4
/* Exit25: 25-byte tail -- 16-byte SSE move for bytes 0-15, 64-bit move
   for bytes 16-23, then %dh to byte 24 (presumably zero, i.e. the
   terminating NUL; TODO confirm at the dispatch sites).
   USE_AS_STPCPY: %rax = %rdi + 24.
   USE_AS_STRNCPY && !USE_AS_STRCAT: %r8 -= 25, %rdi += 25, then jump to
   L(StrncpyFillTailWithZero) if %r8 is still nonzero (jnz uses the flags
   of the sub; lea leaves them alone).  */
936L(Exit25):
937	movdqu	(%rsi), %xmm0
938	mov	16(%rsi), %rcx
939	movdqu	%xmm0, (%rdi)
940	mov	%rcx, 16(%rdi)
941	mov	%dh, 24(%rdi)
942#ifdef USE_AS_STPCPY
943	lea	24(%rdi), %rax
944#endif
945#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
946	sub	$25, %r8
947	lea	25(%rdi), %rdi
948	jnz	L(StrncpyFillTailWithZero)
949#endif
950	RETURN
951
952	.p2align 4
/* Exit26: 26-byte tail -- 16-byte SSE move for bytes 0-15, 64-bit move
   for bytes 16-23 and 16-bit move for bytes 24-25.
   USE_AS_STPCPY: %rax = %rdi + 25.
   USE_AS_STRNCPY && !USE_AS_STRCAT: %r8 -= 26, %rdi += 26, then jump to
   L(StrncpyFillTailWithZero) if %r8 is still nonzero (jnz uses the flags
   of the sub; lea leaves them alone).  */
953L(Exit26):
954	movdqu	(%rsi), %xmm0
955	mov	16(%rsi), %rdx
956	mov	24(%rsi), %cx
957	movdqu	%xmm0, (%rdi)
958	mov	%rdx, 16(%rdi)
959	mov	%cx, 24(%rdi)
960#ifdef USE_AS_STPCPY
961	lea	25(%rdi), %rax
962#endif
963#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
964	sub	$26, %r8
965	lea	26(%rdi), %rdi
966	jnz	L(StrncpyFillTailWithZero)
967#endif
968	RETURN
969
970	.p2align 4
/* Exit27: 27-byte tail -- 16-byte SSE move for bytes 0-15, 64-bit move
   for bytes 16-23 and an overlapping 32-bit move at offset 23.
   USE_AS_STPCPY: %rax = %rdi + 26.
   USE_AS_STRNCPY && !USE_AS_STRCAT: %r8 -= 27, %rdi += 27, then jump to
   L(StrncpyFillTailWithZero) if %r8 is still nonzero (jnz uses the flags
   of the sub; lea leaves them alone).  */
971L(Exit27):
972	movdqu	(%rsi), %xmm0
973	mov	16(%rsi), %rdx
974	mov	23(%rsi), %ecx
975	movdqu	%xmm0, (%rdi)
976	mov	%rdx, 16(%rdi)
977	mov	%ecx, 23(%rdi)
978#ifdef USE_AS_STPCPY
979	lea	26(%rdi), %rax
980#endif
981#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
982	sub	$27, %r8
983	lea	27(%rdi), %rdi
984	jnz	L(StrncpyFillTailWithZero)
985#endif
986	RETURN
987
988	.p2align 4
/* Exit28: 28-byte tail -- 16-byte SSE move for bytes 0-15, 64-bit move
   for bytes 16-23 and 32-bit move for bytes 24-27.
   USE_AS_STPCPY: %rax = %rdi + 27.
   USE_AS_STRNCPY && !USE_AS_STRCAT: %r8 -= 28, %rdi += 28, then jump to
   L(StrncpyFillTailWithZero) if %r8 is still nonzero (jnz uses the flags
   of the sub; lea leaves them alone).  */
989L(Exit28):
990	movdqu	(%rsi), %xmm0
991	mov	16(%rsi), %rdx
992	mov	24(%rsi), %ecx
993	movdqu	%xmm0, (%rdi)
994	mov	%rdx, 16(%rdi)
995	mov	%ecx, 24(%rdi)
996#ifdef USE_AS_STPCPY
997	lea	27(%rdi), %rax
998#endif
999#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
1000	sub	$28, %r8
1001	lea	28(%rdi), %rdi
1002	jnz	L(StrncpyFillTailWithZero)
1003#endif
1004	RETURN
1005
1006	.p2align 4
/* Exit29: 29-byte tail -- two overlapping 16-byte SSE moves at offsets 0
   and 13.
   USE_AS_STPCPY: %rax = %rdi + 28.
   USE_AS_STRNCPY && !USE_AS_STRCAT: %r8 -= 29, %rdi += 29, then jump to
   L(StrncpyFillTailWithZero) if %r8 is still nonzero (jnz uses the flags
   of the sub; lea leaves them alone).  */
1007L(Exit29):
1008	movdqu	(%rsi), %xmm0
1009	movdqu	13(%rsi), %xmm2
1010	movdqu	%xmm0, (%rdi)
1011	movdqu	%xmm2, 13(%rdi)
1012#ifdef USE_AS_STPCPY
1013	lea	28(%rdi), %rax
1014#endif
1015#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
1016	sub	$29, %r8
1017	lea	29(%rdi), %rdi
1018	jnz	L(StrncpyFillTailWithZero)
1019#endif
1020	RETURN
1021
1022	.p2align 4
/* Exit30: 30-byte tail -- two overlapping 16-byte SSE moves at offsets 0
   and 14.
   USE_AS_STPCPY: %rax = %rdi + 29.
   USE_AS_STRNCPY && !USE_AS_STRCAT: %r8 -= 30, %rdi += 30, then jump to
   L(StrncpyFillTailWithZero) if %r8 is still nonzero (jnz uses the flags
   of the sub; lea leaves them alone).  */
1023L(Exit30):
1024	movdqu	(%rsi), %xmm0
1025	movdqu	14(%rsi), %xmm2
1026	movdqu	%xmm0, (%rdi)
1027	movdqu	%xmm2, 14(%rdi)
1028#ifdef USE_AS_STPCPY
1029	lea	29(%rdi), %rax
1030#endif
1031#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
1032	sub	$30, %r8
1033	lea	30(%rdi), %rdi
1034	jnz	L(StrncpyFillTailWithZero)
1035#endif
1036	RETURN
1037
1038	.p2align 4
/* Exit31: 31-byte tail -- two overlapping 16-byte SSE moves at offsets 0
   and 15.
   USE_AS_STPCPY: %rax = %rdi + 30.
   USE_AS_STRNCPY && !USE_AS_STRCAT: %r8 -= 31, %rdi += 31, then jump to
   L(StrncpyFillTailWithZero) if %r8 is still nonzero (jnz uses the flags
   of the sub; lea leaves them alone).  */
1039L(Exit31):
1040	movdqu	(%rsi), %xmm0
1041	movdqu	15(%rsi), %xmm2
1042	movdqu	%xmm0, (%rdi)
1043	movdqu	%xmm2, 15(%rdi)
1044#ifdef USE_AS_STPCPY
1045	lea	30(%rdi), %rax
1046#endif
1047#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
1048	sub	$31, %r8
1049	lea	31(%rdi), %rdi
1050	jnz	L(StrncpyFillTailWithZero)
1051#endif
1052	RETURN
1053
1054	.p2align 4
/* Exit32: 32-byte tail copied with two 16-byte SSE moves at offsets 0
   and 16.
   USE_AS_STPCPY: %rax = %rdi + 31.
   USE_AS_STRNCPY && !USE_AS_STRCAT: %r8 -= 32, %rdi += 32, then jump to
   L(StrncpyFillTailWithZero) if %r8 is still nonzero (jnz uses the flags
   of the sub; lea leaves them alone).  */
1055L(Exit32):
1056	movdqu	(%rsi), %xmm0
1057	movdqu	16(%rsi), %xmm2
1058	movdqu	%xmm0, (%rdi)
1059	movdqu	%xmm2, 16(%rdi)
1060#ifdef USE_AS_STPCPY
1061	lea	31(%rdi), %rax
1062#endif
1063#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
1064	sub	$32, %r8
1065	lea	32(%rdi), %rdi
1066	jnz	L(StrncpyFillTailWithZero)
1067#endif
1068	RETURN
1069
1070#ifdef USE_AS_STRNCPY
1071
1072	.p2align 4
/* StrncpyExit0: nothing left to copy.
   USE_AS_STPCPY: %rax = %rdi.
   USE_AS_STRCAT: clears %ch and stores that zero byte at (%rdi).  */
1073L(StrncpyExit0):
1074#ifdef USE_AS_STPCPY
1075	mov	%rdi, %rax
1076#endif
1077#ifdef USE_AS_STRCAT
1078	xor	%ch, %ch
1079	movb	%ch, (%rdi)
1080#endif
1081	RETURN
1082
1083	.p2align 4
/* StrncpyExit1: copies exactly 1 byte with one 8-bit move.
   USE_AS_STPCPY: %rax = %rdi + 1.
   USE_AS_STRCAT: clears %ch and stores that zero byte at 1(%rdi).  */
1084L(StrncpyExit1):
1085	mov	(%rsi), %dl
1086	mov	%dl, (%rdi)
1087#ifdef USE_AS_STPCPY
1088	lea	1(%rdi), %rax
1089#endif
1090#ifdef USE_AS_STRCAT
1091	xor	%ch, %ch
1092	movb	%ch, 1(%rdi)
1093#endif
1094	RETURN
1095
1096	.p2align 4
/* StrncpyExit2: copies exactly 2 bytes with one 16-bit move.
   USE_AS_STPCPY: %rax = %rdi + 2.
   USE_AS_STRCAT: clears %ch and stores that zero byte at 2(%rdi).  */
1097L(StrncpyExit2):
1098	mov	(%rsi), %dx
1099	mov	%dx, (%rdi)
1100#ifdef USE_AS_STPCPY
1101	lea	2(%rdi), %rax
1102#endif
1103#ifdef USE_AS_STRCAT
1104	xor	%ch, %ch
1105	movb	%ch, 2(%rdi)
1106#endif
1107	RETURN
1108
1109	.p2align 4
/* StrncpyExit3: copies exactly 3 bytes -- 16-bit move for bytes 0-1 and
   8-bit move for byte 2.
   USE_AS_STPCPY: %rax = %rdi + 3.
   USE_AS_STRCAT: clears %ch (after the copy has used %cx) and stores that
   zero byte at 3(%rdi).  */
1110L(StrncpyExit3):
1111	mov	(%rsi), %cx
1112	mov	2(%rsi), %dl
1113	mov	%cx, (%rdi)
1114	mov	%dl, 2(%rdi)
1115#ifdef USE_AS_STPCPY
1116	lea	3(%rdi), %rax
1117#endif
1118#ifdef USE_AS_STRCAT
1119	xor	%ch, %ch
1120	movb	%ch, 3(%rdi)
1121#endif
1122	RETURN
1123
1124	.p2align 4
/* StrncpyExit4: copies exactly 4 bytes with one 32-bit move.
   USE_AS_STPCPY: %rax = %rdi + 4.
   USE_AS_STRCAT: clears %ch and stores that zero byte at 4(%rdi).  */
1125L(StrncpyExit4):
1126	mov	(%rsi), %edx
1127	mov	%edx, (%rdi)
1128#ifdef USE_AS_STPCPY
1129	lea	4(%rdi), %rax
1130#endif
1131#ifdef USE_AS_STRCAT
1132	xor	%ch, %ch
1133	movb	%ch, 4(%rdi)
1134#endif
1135	RETURN
1136
1137	.p2align 4
/* StrncpyExit5: copies exactly 5 bytes -- 32-bit move for bytes 0-3 and
   8-bit move for byte 4.
   USE_AS_STPCPY: %rax = %rdi + 5.
   USE_AS_STRCAT: clears %ch (after the copy has used %ecx) and stores
   that zero byte at 5(%rdi).  */
1138L(StrncpyExit5):
1139	mov	(%rsi), %ecx
1140	mov	4(%rsi), %dl
1141	mov	%ecx, (%rdi)
1142	mov	%dl, 4(%rdi)
1143#ifdef USE_AS_STPCPY
1144	lea	5(%rdi), %rax
1145#endif
1146#ifdef USE_AS_STRCAT
1147	xor	%ch, %ch
1148	movb	%ch, 5(%rdi)
1149#endif
1150	RETURN
1151
1152	.p2align 4
/* StrncpyExit6: copies exactly 6 bytes -- 32-bit move for bytes 0-3 and
   16-bit move for bytes 4-5.
   USE_AS_STPCPY: %rax = %rdi + 6.
   USE_AS_STRCAT: clears %ch (after the copy has used %ecx) and stores
   that zero byte at 6(%rdi).  */
1153L(StrncpyExit6):
1154	mov	(%rsi), %ecx
1155	mov	4(%rsi), %dx
1156	mov	%ecx, (%rdi)
1157	mov	%dx, 4(%rdi)
1158#ifdef USE_AS_STPCPY
1159	lea	6(%rdi), %rax
1160#endif
1161#ifdef USE_AS_STRCAT
1162	xor	%ch, %ch
1163	movb	%ch, 6(%rdi)
1164#endif
1165	RETURN
1166
1167	.p2align 4
/* StrncpyExit7: copies exactly 7 bytes with two overlapping 32-bit moves
   at offsets 0 and 3.
   USE_AS_STPCPY: %rax = %rdi + 7.
   USE_AS_STRCAT: clears %ch (after the copy has used %ecx) and stores
   that zero byte at 7(%rdi).  */
1168L(StrncpyExit7):
1169	mov	(%rsi), %ecx
1170	mov	3(%rsi), %edx
1171	mov	%ecx, (%rdi)
1172	mov	%edx, 3(%rdi)
1173#ifdef USE_AS_STPCPY
1174	lea	7(%rdi), %rax
1175#endif
1176#ifdef USE_AS_STRCAT
1177	xor	%ch, %ch
1178	movb	%ch, 7(%rdi)
1179#endif
1180	RETURN
1181
1182	.p2align 4
/* StrncpyExit8: copies exactly 8 bytes with one 64-bit move.
   USE_AS_STPCPY: %rax = %rdi + 8.
   USE_AS_STRCAT: clears %ch and stores that zero byte at 8(%rdi).  */
1183L(StrncpyExit8):
1184	mov	(%rsi), %rdx
1185	mov	%rdx, (%rdi)
1186#ifdef USE_AS_STPCPY
1187	lea	8(%rdi), %rax
1188#endif
1189#ifdef USE_AS_STRCAT
1190	xor	%ch, %ch
1191	movb	%ch, 8(%rdi)
1192#endif
1193	RETURN
1194
1195	.p2align 4
/* StrncpyExit9: copies exactly 9 bytes -- 64-bit move for bytes 0-7 and
   8-bit move for byte 8.
   USE_AS_STPCPY: %rax = %rdi + 9.
   USE_AS_STRCAT: clears %ch (after the copy has used %rcx) and stores
   that zero byte at 9(%rdi).  */
1196L(StrncpyExit9):
1197	mov	(%rsi), %rcx
1198	mov	8(%rsi), %dl
1199	mov	%rcx, (%rdi)
1200	mov	%dl, 8(%rdi)
1201#ifdef USE_AS_STPCPY
1202	lea	9(%rdi), %rax
1203#endif
1204#ifdef USE_AS_STRCAT
1205	xor	%ch, %ch
1206	movb	%ch, 9(%rdi)
1207#endif
1208	RETURN
1209
1210	.p2align 4
/* StrncpyExit10: copies exactly 10 bytes -- 64-bit move for bytes 0-7
   and 16-bit move for bytes 8-9.
   USE_AS_STPCPY: %rax = %rdi + 10.
   USE_AS_STRCAT: clears %ch (after the copy has used %rcx) and stores
   that zero byte at 10(%rdi).  */
1211L(StrncpyExit10):
1212	mov	(%rsi), %rcx
1213	mov	8(%rsi), %dx
1214	mov	%rcx, (%rdi)
1215	mov	%dx, 8(%rdi)
1216#ifdef USE_AS_STPCPY
1217	lea	10(%rdi), %rax
1218#endif
1219#ifdef USE_AS_STRCAT
1220	xor	%ch, %ch
1221	movb	%ch, 10(%rdi)
1222#endif
1223	RETURN
1224
1225	.p2align 4
/* StrncpyExit11: copies exactly 11 bytes -- 64-bit move at offset 0 plus
   an overlapping 32-bit move at offset 7.
   USE_AS_STPCPY: %rax = %rdi + 11.
   USE_AS_STRCAT: clears %ch (after the copy has used %rcx) and stores
   that zero byte at 11(%rdi).  */
1226L(StrncpyExit11):
1227	mov	(%rsi), %rcx
1228	mov	7(%rsi), %edx
1229	mov	%rcx, (%rdi)
1230	mov	%edx, 7(%rdi)
1231#ifdef USE_AS_STPCPY
1232	lea	11(%rdi), %rax
1233#endif
1234#ifdef USE_AS_STRCAT
1235	xor	%ch, %ch
1236	movb	%ch, 11(%rdi)
1237#endif
1238	RETURN
1239
1240	.p2align 4
/* StrncpyExit12: copies exactly 12 bytes -- 64-bit move for bytes 0-7
   and 32-bit move for bytes 8-11.
   USE_AS_STPCPY: %rax = %rdi + 12.
   USE_AS_STRCAT: clears %ch (after the copy has used %rcx) and stores
   that zero byte at 12(%rdi).  */
1241L(StrncpyExit12):
1242	mov	(%rsi), %rcx
1243	mov	8(%rsi), %edx
1244	mov	%rcx, (%rdi)
1245	mov	%edx, 8(%rdi)
1246#ifdef USE_AS_STPCPY
1247	lea	12(%rdi), %rax
1248#endif
1249#ifdef USE_AS_STRCAT
1250	xor	%ch, %ch
1251	movb	%ch, 12(%rdi)
1252#endif
1253	RETURN
1254
1255	.p2align 4
/* StrncpyExit13: copies exactly 13 bytes with two overlapping 64-bit
   moves at offsets 0 and 5.
   USE_AS_STPCPY: %rax = %rdi + 13.
   USE_AS_STRCAT: clears %ch (after the copy has used %rcx) and stores
   that zero byte at 13(%rdi).  */
1256L(StrncpyExit13):
1257	mov	(%rsi), %rcx
1258	mov	5(%rsi), %rdx
1259	mov	%rcx, (%rdi)
1260	mov	%rdx, 5(%rdi)
1261#ifdef USE_AS_STPCPY
1262	lea	13(%rdi), %rax
1263#endif
1264#ifdef USE_AS_STRCAT
1265	xor	%ch, %ch
1266	movb	%ch, 13(%rdi)
1267#endif
1268	RETURN
1269
1270	.p2align 4
/* StrncpyExit14: copies exactly 14 bytes with two overlapping 64-bit
   moves at offsets 0 and 6.
   USE_AS_STPCPY: %rax = %rdi + 14.
   USE_AS_STRCAT: clears %ch (after the copy has used %rcx) and stores
   that zero byte at 14(%rdi).  */
1271L(StrncpyExit14):
1272	mov	(%rsi), %rcx
1273	mov	6(%rsi), %rdx
1274	mov	%rcx, (%rdi)
1275	mov	%rdx, 6(%rdi)
1276#ifdef USE_AS_STPCPY
1277	lea	14(%rdi), %rax
1278#endif
1279#ifdef USE_AS_STRCAT
1280	xor	%ch, %ch
1281	movb	%ch, 14(%rdi)
1282#endif
1283	RETURN
1284
1285	.p2align 4
/* StrncpyExit15: copies exactly 15 bytes with two overlapping 64-bit
   moves at offsets 0 and 7.
   USE_AS_STPCPY: %rax = %rdi + 15.
   USE_AS_STRCAT: clears %ch (after the copy has used %rcx) and stores
   that zero byte at 15(%rdi).  */
1286L(StrncpyExit15):
1287	mov	(%rsi), %rcx
1288	mov	7(%rsi), %rdx
1289	mov	%rcx, (%rdi)
1290	mov	%rdx, 7(%rdi)
1291#ifdef USE_AS_STPCPY
1292	lea	15(%rdi), %rax
1293#endif
1294#ifdef USE_AS_STRCAT
1295	xor	%ch, %ch
1296	movb	%ch, 15(%rdi)
1297#endif
1298	RETURN
1299
1300	.p2align 4
/* StrncpyExit16: copies exactly 16 bytes with one unaligned 16-byte SSE
   move.
   USE_AS_STPCPY: %rax = %rdi + 16.
   USE_AS_STRCAT: clears %ch and stores that zero byte at 16(%rdi).  */
1301L(StrncpyExit16):
1302	movdqu	(%rsi), %xmm0
1303	movdqu	%xmm0, (%rdi)
1304#ifdef USE_AS_STPCPY
1305	lea	16(%rdi), %rax
1306#endif
1307#ifdef USE_AS_STRCAT
1308	xor	%ch, %ch
1309	movb	%ch, 16(%rdi)
1310#endif
1311	RETURN
1312
1313	.p2align 4
/* StrncpyExit17: copies exactly 17 bytes -- 16-byte SSE move for bytes
   0-15 and 8-bit move for byte 16.
   USE_AS_STPCPY: %rax = %rdi + 17.
   USE_AS_STRCAT: clears %ch (after the copy has used %cl) and stores that
   zero byte at 17(%rdi).  */
1314L(StrncpyExit17):
1315	movdqu	(%rsi), %xmm0
1316	mov	16(%rsi), %cl
1317	movdqu	%xmm0, (%rdi)
1318	mov	%cl, 16(%rdi)
1319#ifdef USE_AS_STPCPY
1320	lea	17(%rdi), %rax
1321#endif
1322#ifdef USE_AS_STRCAT
1323	xor	%ch, %ch
1324	movb	%ch, 17(%rdi)
1325#endif
1326	RETURN
1327
1328	.p2align 4
/* StrncpyExit18: copies exactly 18 bytes -- 16-byte SSE move for bytes
   0-15 and 16-bit move for bytes 16-17.
   USE_AS_STPCPY: %rax = %rdi + 18.
   USE_AS_STRCAT: clears %ch (after the copy has used %cx) and stores that
   zero byte at 18(%rdi).  */
1329L(StrncpyExit18):
1330	movdqu	(%rsi), %xmm0
1331	mov	16(%rsi), %cx
1332	movdqu	%xmm0, (%rdi)
1333	mov	%cx, 16(%rdi)
1334#ifdef USE_AS_STPCPY
1335	lea	18(%rdi), %rax
1336#endif
1337#ifdef USE_AS_STRCAT
1338	xor	%ch, %ch
1339	movb	%ch, 18(%rdi)
1340#endif
1341	RETURN
1342
1343	.p2align 4
/* StrncpyExit19: copies exactly 19 bytes -- 16-byte SSE move at offset 0
   plus an overlapping 32-bit move at offset 15.
   USE_AS_STPCPY: %rax = %rdi + 19.
   USE_AS_STRCAT: clears %ch (after the copy has used %ecx) and stores
   that zero byte at 19(%rdi).  */
1344L(StrncpyExit19):
1345	movdqu	(%rsi), %xmm0
1346	mov	15(%rsi), %ecx
1347	movdqu	%xmm0, (%rdi)
1348	mov	%ecx, 15(%rdi)
1349#ifdef USE_AS_STPCPY
1350	lea	19(%rdi), %rax
1351#endif
1352#ifdef USE_AS_STRCAT
1353	xor	%ch, %ch
1354	movb	%ch, 19(%rdi)
1355#endif
1356	RETURN
1357
1358	.p2align 4
/* StrncpyExit20: copies exactly 20 bytes -- 16-byte SSE move for bytes
   0-15 and 32-bit move for bytes 16-19.
   USE_AS_STPCPY: %rax = %rdi + 20.
   USE_AS_STRCAT: clears %ch (after the copy has used %ecx) and stores
   that zero byte at 20(%rdi).  */
1359L(StrncpyExit20):
1360	movdqu	(%rsi), %xmm0
1361	mov	16(%rsi), %ecx
1362	movdqu	%xmm0, (%rdi)
1363	mov	%ecx, 16(%rdi)
1364#ifdef USE_AS_STPCPY
1365	lea	20(%rdi), %rax
1366#endif
1367#ifdef USE_AS_STRCAT
1368	xor	%ch, %ch
1369	movb	%ch, 20(%rdi)
1370#endif
1371	RETURN
1372
1373	.p2align 4
/* StrncpyExit21: copies exactly 21 bytes -- 16-byte SSE move for bytes
   0-15, 32-bit move for bytes 16-19 and 8-bit move for byte 20.
   USE_AS_STPCPY: %rax = %rdi + 21.
   USE_AS_STRCAT: clears %ch (after the copy has used %ecx) and stores
   that zero byte at 21(%rdi).  */
1374L(StrncpyExit21):
1375	movdqu	(%rsi), %xmm0
1376	mov	16(%rsi), %ecx
1377	mov	20(%rsi), %dl
1378	movdqu	%xmm0, (%rdi)
1379	mov	%ecx, 16(%rdi)
1380	mov	%dl, 20(%rdi)
1381#ifdef USE_AS_STPCPY
1382	lea	21(%rdi), %rax
1383#endif
1384#ifdef USE_AS_STRCAT
1385	xor	%ch, %ch
1386	movb	%ch, 21(%rdi)
1387#endif
1388	RETURN
1389
1390	.p2align 4
/* StrncpyExit22: copies exactly 22 bytes -- 16-byte SSE move at offset 0
   plus an overlapping 64-bit move at offset 14.
   USE_AS_STPCPY: %rax = %rdi + 22.
   USE_AS_STRCAT: clears %ch (after the copy has used %rcx) and stores
   that zero byte at 22(%rdi).  */
1391L(StrncpyExit22):
1392	movdqu	(%rsi), %xmm0
1393	mov	14(%rsi), %rcx
1394	movdqu	%xmm0, (%rdi)
1395	mov	%rcx, 14(%rdi)
1396#ifdef USE_AS_STPCPY
1397	lea	22(%rdi), %rax
1398#endif
1399#ifdef USE_AS_STRCAT
1400	xor	%ch, %ch
1401	movb	%ch, 22(%rdi)
1402#endif
1403	RETURN
1404
1405	.p2align 4
/* StrncpyExit23: copies exactly 23 bytes -- 16-byte SSE move at offset 0
   plus an overlapping 64-bit move at offset 15.
   USE_AS_STPCPY: %rax = %rdi + 23.
   USE_AS_STRCAT: clears %ch (after the copy has used %rcx) and stores
   that zero byte at 23(%rdi).  */
1406L(StrncpyExit23):
1407	movdqu	(%rsi), %xmm0
1408	mov	15(%rsi), %rcx
1409	movdqu	%xmm0, (%rdi)
1410	mov	%rcx, 15(%rdi)
1411#ifdef USE_AS_STPCPY
1412	lea	23(%rdi), %rax
1413#endif
1414#ifdef USE_AS_STRCAT
1415	xor	%ch, %ch
1416	movb	%ch, 23(%rdi)
1417#endif
1418	RETURN
1419
1420	.p2align 4
/* StrncpyExit24: copies exactly 24 bytes -- 16-byte SSE move for bytes
   0-15 and 64-bit move for bytes 16-23.
   USE_AS_STPCPY: %rax = %rdi + 24.
   USE_AS_STRCAT: clears %ch (after the copy has used %rcx) and stores
   that zero byte at 24(%rdi).  */
1421L(StrncpyExit24):
1422	movdqu	(%rsi), %xmm0
1423	mov	16(%rsi), %rcx
1424	movdqu	%xmm0, (%rdi)
1425	mov	%rcx, 16(%rdi)
1426#ifdef USE_AS_STPCPY
1427	lea	24(%rdi), %rax
1428#endif
1429#ifdef USE_AS_STRCAT
1430	xor	%ch, %ch
1431	movb	%ch, 24(%rdi)
1432#endif
1433	RETURN
1434
1435	.p2align 4
/* StrncpyExit25: copies exactly 25 bytes -- 16-byte SSE move for bytes
   0-15, 64-bit move for bytes 16-23 and 8-bit move for byte 24.
   USE_AS_STPCPY: %rax = %rdi + 25.
   USE_AS_STRCAT: clears %ch (after the copy has used %cl) and stores that
   zero byte at 25(%rdi).  */
1436L(StrncpyExit25):
1437	movdqu	(%rsi), %xmm0
1438	mov	16(%rsi), %rdx
1439	mov	24(%rsi), %cl
1440	movdqu	%xmm0, (%rdi)
1441	mov	%rdx, 16(%rdi)
1442	mov	%cl, 24(%rdi)
1443#ifdef USE_AS_STPCPY
1444	lea	25(%rdi), %rax
1445#endif
1446#ifdef USE_AS_STRCAT
1447	xor	%ch, %ch
1448	movb	%ch, 25(%rdi)
1449#endif
1450	RETURN
1451
1452	.p2align 4
/* StrncpyExit26: copies exactly 26 bytes -- 16-byte SSE move for bytes
   0-15, 64-bit move for bytes 16-23 and 16-bit move for bytes 24-25.
   USE_AS_STPCPY: %rax = %rdi + 26.
   USE_AS_STRCAT: clears %ch (after the copy has used %cx) and stores that
   zero byte at 26(%rdi).  */
1453L(StrncpyExit26):
1454	movdqu	(%rsi), %xmm0
1455	mov	16(%rsi), %rdx
1456	mov	24(%rsi), %cx
1457	movdqu	%xmm0, (%rdi)
1458	mov	%rdx, 16(%rdi)
1459	mov	%cx, 24(%rdi)
1460#ifdef USE_AS_STPCPY
1461	lea	26(%rdi), %rax
1462#endif
1463#ifdef USE_AS_STRCAT
1464	xor	%ch, %ch
1465	movb	%ch, 26(%rdi)
1466#endif
1467	RETURN
1468
1469	.p2align 4
1470L(StrncpyExit27):
1471	movdqu	(%rsi), %xmm0
1472	mov	16(%rsi), %rdx
1473	mov	23(%rsi), %ecx
1474	movdqu	%xmm0, (%rdi)
1475	mov	%rdx, 16(%rdi)
1476	mov	%ecx, 23(%rdi)
1477#ifdef USE_AS_STPCPY
1478	lea	27(%rdi), %rax
1479#endif
1480#ifdef USE_AS_STRCAT
1481	xor	%ch, %ch
1482	movb	%ch, 27(%rdi)
1483#endif
1484	RETURN
1485
1486	.p2align 4
1487L(StrncpyExit28):
1488	movdqu	(%rsi), %xmm0
1489	mov	16(%rsi), %rdx
1490	mov	24(%rsi), %ecx
1491	movdqu	%xmm0, (%rdi)
1492	mov	%rdx, 16(%rdi)
1493	mov	%ecx, 24(%rdi)
1494#ifdef USE_AS_STPCPY
1495	lea	28(%rdi), %rax
1496#endif
1497#ifdef USE_AS_STRCAT
1498	xor	%ch, %ch
1499	movb	%ch, 28(%rdi)
1500#endif
1501	RETURN
1502
1503	.p2align 4
1504L(StrncpyExit29):
1505	movdqu	(%rsi), %xmm0
1506	movdqu	13(%rsi), %xmm2
1507	movdqu	%xmm0, (%rdi)
1508	movdqu	%xmm2, 13(%rdi)
1509#ifdef USE_AS_STPCPY
1510	lea	29(%rdi), %rax
1511#endif
1512#ifdef USE_AS_STRCAT
1513	xor	%ch, %ch
1514	movb	%ch, 29(%rdi)
1515#endif
1516	RETURN
1517
1518	.p2align 4
1519L(StrncpyExit30):
1520	movdqu	(%rsi), %xmm0
1521	movdqu	14(%rsi), %xmm2
1522	movdqu	%xmm0, (%rdi)
1523	movdqu	%xmm2, 14(%rdi)
1524#ifdef USE_AS_STPCPY
1525	lea	30(%rdi), %rax
1526#endif
1527#ifdef USE_AS_STRCAT
1528	xor	%ch, %ch
1529	movb	%ch, 30(%rdi)
1530#endif
1531	RETURN
1532
1533	.p2align 4
1534L(StrncpyExit31):
1535	movdqu	(%rsi), %xmm0
1536	movdqu	15(%rsi), %xmm2
1537	movdqu	%xmm0, (%rdi)
1538	movdqu	%xmm2, 15(%rdi)
1539#ifdef USE_AS_STPCPY
1540	lea	31(%rdi), %rax
1541#endif
1542#ifdef USE_AS_STRCAT
1543	xor	%ch, %ch
1544	movb	%ch, 31(%rdi)
1545#endif
1546	RETURN
1547
1548	.p2align 4
1549L(StrncpyExit32):
1550	movdqu	(%rsi), %xmm0
1551	movdqu	16(%rsi), %xmm2
1552	movdqu	%xmm0, (%rdi)
1553	movdqu	%xmm2, 16(%rdi)
1554#ifdef USE_AS_STPCPY
1555	lea	32(%rdi), %rax
1556#endif
1557#ifdef USE_AS_STRCAT
1558	xor	%ch, %ch
1559	movb	%ch, 32(%rdi)
1560#endif
1561	RETURN
1562
1563	.p2align 4
1564L(StrncpyExit33):
1565	movdqu	(%rsi), %xmm0
1566	movdqu	16(%rsi), %xmm2
1567	mov	32(%rsi), %cl
1568	movdqu	%xmm0, (%rdi)
1569	movdqu	%xmm2, 16(%rdi)
1570	mov	%cl, 32(%rdi)
1571#ifdef USE_AS_STRCAT
1572	xor	%ch, %ch
1573	movb	%ch, 33(%rdi)
1574#endif
1575	RETURN
1576
1577#ifndef USE_AS_STRCAT
1578
1579	.p2align 4
1580L(Fill0):
1581	RETURN
1582
1583	.p2align 4
1584L(Fill1):
1585	mov	%dl, (%rdi)
1586	RETURN
1587
1588	.p2align 4
1589L(Fill2):
1590	mov	%dx, (%rdi)
1591	RETURN
1592
1593	.p2align 4
1594L(Fill3):
1595	mov	%edx, -1(%rdi)
1596	RETURN
1597
1598	.p2align 4
1599L(Fill4):
1600	mov	%edx, (%rdi)
1601	RETURN
1602
1603	.p2align 4
1604L(Fill5):
1605	mov	%edx, (%rdi)
1606	mov	%dl, 4(%rdi)
1607	RETURN
1608
1609	.p2align 4
1610L(Fill6):
1611	mov	%edx, (%rdi)
1612	mov	%dx, 4(%rdi)
1613	RETURN
1614
1615	.p2align 4
1616L(Fill7):
1617	mov	%rdx, -1(%rdi)
1618	RETURN
1619
1620	.p2align 4
1621L(Fill8):
1622	mov	%rdx, (%rdi)
1623	RETURN
1624
1625	.p2align 4
1626L(Fill9):
1627	mov	%rdx, (%rdi)
1628	mov	%dl, 8(%rdi)
1629	RETURN
1630
1631	.p2align 4
1632L(Fill10):
1633	mov	%rdx, (%rdi)
1634	mov	%dx, 8(%rdi)
1635	RETURN
1636
1637	.p2align 4
1638L(Fill11):
1639	mov	%rdx, (%rdi)
1640	mov	%edx, 7(%rdi)
1641	RETURN
1642
1643	.p2align 4
1644L(Fill12):
1645	mov	%rdx, (%rdi)
1646	mov	%edx, 8(%rdi)
1647	RETURN
1648
1649	.p2align 4
1650L(Fill13):
1651	mov	%rdx, (%rdi)
1652	mov	%rdx, 5(%rdi)
1653	RETURN
1654
1655	.p2align 4
1656L(Fill14):
1657	mov	%rdx, (%rdi)
1658	mov	%rdx, 6(%rdi)
1659	RETURN
1660
1661	.p2align 4
1662L(Fill15):
1663	movdqu	%xmm0, -1(%rdi)
1664	RETURN
1665
1666	.p2align 4
1667L(Fill16):
1668	movdqu	%xmm0, (%rdi)
1669	RETURN
1670
1671	.p2align 4
1672L(CopyFrom1To16BytesUnalignedXmm2):
1673	movdqu	%xmm2, (%rdi, %rcx)
1674
1675	.p2align 4
1676L(CopyFrom1To16BytesXmmExit):
1677	bsf	%rdx, %rdx
1678	add	$15, %r8
1679	add	%rcx, %rdi
1680#ifdef USE_AS_STPCPY
1681	lea	(%rdi, %rdx), %rax
1682#endif
1683	sub	%rdx, %r8
1684	lea	1(%rdi, %rdx), %rdi
1685
1686	.p2align 4
1687L(StrncpyFillTailWithZero):
1688	pxor	%xmm0, %xmm0
1689	xor	%rdx, %rdx
1690	sub	$16, %r8
1691	jbe	L(StrncpyFillExit)
1692
1693	movdqu	%xmm0, (%rdi)
1694	add	$16, %rdi
1695
1696	mov	%rdi, %rsi
1697	and	$0xf, %rsi
1698	sub	%rsi, %rdi
1699	add	%rsi, %r8
1700	sub	$64, %r8
1701	jb	L(StrncpyFillLess64)
1702
1703L(StrncpyFillLoopMovdqa):
1704	movdqa	%xmm0, (%rdi)
1705	movdqa	%xmm0, 16(%rdi)
1706	movdqa	%xmm0, 32(%rdi)
1707	movdqa	%xmm0, 48(%rdi)
1708	add	$64, %rdi
1709	sub	$64, %r8
1710	jae	L(StrncpyFillLoopMovdqa)
1711
1712L(StrncpyFillLess64):
1713	add	$32, %r8
1714	jl	L(StrncpyFillLess32)
1715	movdqa	%xmm0, (%rdi)
1716	movdqa	%xmm0, 16(%rdi)
1717	add	$32, %rdi
1718	sub	$16, %r8
1719	jl	L(StrncpyFillExit)
1720	movdqa	%xmm0, (%rdi)
1721	add	$16, %rdi
1722	BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4)
1723
1724L(StrncpyFillLess32):
1725	add	$16, %r8
1726	jl	L(StrncpyFillExit)
1727	movdqa	%xmm0, (%rdi)
1728	add	$16, %rdi
1729	BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4)
1730
1731L(StrncpyFillExit):
1732	add	$16, %r8
1733	BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4)
1734
1735/* end of ifndef USE_AS_STRCAT */
1736#endif
1737
1738	.p2align 4
1739L(UnalignedLeaveCase2OrCase3):
1740	test	%rdx, %rdx
1741	jnz	L(Unaligned64LeaveCase2)
1742L(Unaligned64LeaveCase3):
1743	lea	64(%r8), %rcx
1744	and	$-16, %rcx
1745	add	$48, %r8
1746	jl	L(CopyFrom1To16BytesCase3)
1747	movdqu	%xmm4, (%rdi)
1748	sub	$16, %r8
1749	jb	L(CopyFrom1To16BytesCase3)
1750	movdqu	%xmm5, 16(%rdi)
1751	sub	$16, %r8
1752	jb	L(CopyFrom1To16BytesCase3)
1753	movdqu	%xmm6, 32(%rdi)
1754	sub	$16, %r8
1755	jb	L(CopyFrom1To16BytesCase3)
1756	movdqu	%xmm7, 48(%rdi)
1757#ifdef USE_AS_STPCPY
1758	lea	64(%rdi), %rax
1759#endif
1760#ifdef USE_AS_STRCAT
1761	xor	%ch, %ch
1762	movb	%ch, 64(%rdi)
1763#endif
1764	RETURN
1765
1766	.p2align 4
1767L(Unaligned64LeaveCase2):
1768	xor	%rcx, %rcx
1769	pcmpeqb	%xmm4, %xmm0
1770	pmovmskb %xmm0, %rdx
1771	add	$48, %r8
1772	jle	L(CopyFrom1To16BytesCase2OrCase3)
1773	test	%rdx, %rdx
1774#ifndef USE_AS_STRCAT
1775	jnz	L(CopyFrom1To16BytesUnalignedXmm4)
1776#else
1777	jnz	L(CopyFrom1To16Bytes)
1778#endif
1779	pcmpeqb	%xmm5, %xmm0
1780	pmovmskb %xmm0, %rdx
1781	movdqu	%xmm4, (%rdi)
1782	add	$16, %rcx
1783	sub	$16, %r8
1784	jbe	L(CopyFrom1To16BytesCase2OrCase3)
1785	test	%rdx, %rdx
1786#ifndef USE_AS_STRCAT
1787	jnz	L(CopyFrom1To16BytesUnalignedXmm5)
1788#else
1789	jnz	L(CopyFrom1To16Bytes)
1790#endif
1791
1792	pcmpeqb	%xmm6, %xmm0
1793	pmovmskb %xmm0, %rdx
1794	movdqu	%xmm5, 16(%rdi)
1795	add	$16, %rcx
1796	sub	$16, %r8
1797	jbe	L(CopyFrom1To16BytesCase2OrCase3)
1798	test	%rdx, %rdx
1799#ifndef USE_AS_STRCAT
1800	jnz	L(CopyFrom1To16BytesUnalignedXmm6)
1801#else
1802	jnz	L(CopyFrom1To16Bytes)
1803#endif
1804
1805	pcmpeqb	%xmm7, %xmm0
1806	pmovmskb %xmm0, %rdx
1807	movdqu	%xmm6, 32(%rdi)
1808	lea	16(%rdi, %rcx), %rdi
1809	lea	16(%rsi, %rcx), %rsi
1810	bsf	%rdx, %rdx
1811	cmp	%r8, %rdx
1812	jb	L(CopyFrom1To16BytesExit)
1813	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
1814
1815	.p2align 4
1816L(ExitZero):
1817#ifndef USE_AS_STRCAT
1818	mov	%rdi, %rax
1819#endif
1820	RETURN
1821
1822#endif
1823
/* Close the function under the name this build was assembled for.  */
#ifdef USE_AS_STRCAT
END (STRCAT)
#else
END (STRCPY)
#endif
1829	.p2align 4
1830	.section .rodata
1831L(ExitTable):
1832	.int	JMPTBL(L(Exit1), L(ExitTable))
1833	.int	JMPTBL(L(Exit2), L(ExitTable))
1834	.int	JMPTBL(L(Exit3), L(ExitTable))
1835	.int	JMPTBL(L(Exit4), L(ExitTable))
1836	.int	JMPTBL(L(Exit5), L(ExitTable))
1837	.int	JMPTBL(L(Exit6), L(ExitTable))
1838	.int	JMPTBL(L(Exit7), L(ExitTable))
1839	.int	JMPTBL(L(Exit8), L(ExitTable))
1840	.int	JMPTBL(L(Exit9), L(ExitTable))
1841	.int	JMPTBL(L(Exit10), L(ExitTable))
1842	.int	JMPTBL(L(Exit11), L(ExitTable))
1843	.int	JMPTBL(L(Exit12), L(ExitTable))
1844	.int	JMPTBL(L(Exit13), L(ExitTable))
1845	.int	JMPTBL(L(Exit14), L(ExitTable))
1846	.int	JMPTBL(L(Exit15), L(ExitTable))
1847	.int	JMPTBL(L(Exit16), L(ExitTable))
1848	.int	JMPTBL(L(Exit17), L(ExitTable))
1849	.int	JMPTBL(L(Exit18), L(ExitTable))
1850	.int	JMPTBL(L(Exit19), L(ExitTable))
1851	.int	JMPTBL(L(Exit20), L(ExitTable))
1852	.int	JMPTBL(L(Exit21), L(ExitTable))
1853	.int	JMPTBL(L(Exit22), L(ExitTable))
1854	.int	JMPTBL(L(Exit23), L(ExitTable))
1855	.int	JMPTBL(L(Exit24), L(ExitTable))
1856	.int	JMPTBL(L(Exit25), L(ExitTable))
1857	.int	JMPTBL(L(Exit26), L(ExitTable))
1858	.int	JMPTBL(L(Exit27), L(ExitTable))
1859	.int	JMPTBL(L(Exit28), L(ExitTable))
1860	.int	JMPTBL(L(Exit29), L(ExitTable))
1861	.int	JMPTBL(L(Exit30), L(ExitTable))
1862	.int	JMPTBL(L(Exit31), L(ExitTable))
1863	.int	JMPTBL(L(Exit32), L(ExitTable))
#ifdef USE_AS_STRNCPY
/* Bounded-copy dispatch table: slot N holds the JMPTBL() value for
   L(StrncpyExitN), N = 0..33, relative to L(ExitStrncpyTable).  Entries are
   written two per directive; the emitted sequence of 32-bit values is
   slot 0, slot 1, ... in order.  */
L(ExitStrncpyTable):
	.int	JMPTBL(L(StrncpyExit0), L(ExitStrncpyTable)), JMPTBL(L(StrncpyExit1), L(ExitStrncpyTable))
	.int	JMPTBL(L(StrncpyExit2), L(ExitStrncpyTable)), JMPTBL(L(StrncpyExit3), L(ExitStrncpyTable))
	.int	JMPTBL(L(StrncpyExit4), L(ExitStrncpyTable)), JMPTBL(L(StrncpyExit5), L(ExitStrncpyTable))
	.int	JMPTBL(L(StrncpyExit6), L(ExitStrncpyTable)), JMPTBL(L(StrncpyExit7), L(ExitStrncpyTable))
	.int	JMPTBL(L(StrncpyExit8), L(ExitStrncpyTable)), JMPTBL(L(StrncpyExit9), L(ExitStrncpyTable))
	.int	JMPTBL(L(StrncpyExit10), L(ExitStrncpyTable)), JMPTBL(L(StrncpyExit11), L(ExitStrncpyTable))
	.int	JMPTBL(L(StrncpyExit12), L(ExitStrncpyTable)), JMPTBL(L(StrncpyExit13), L(ExitStrncpyTable))
	.int	JMPTBL(L(StrncpyExit14), L(ExitStrncpyTable)), JMPTBL(L(StrncpyExit15), L(ExitStrncpyTable))
	.int	JMPTBL(L(StrncpyExit16), L(ExitStrncpyTable)), JMPTBL(L(StrncpyExit17), L(ExitStrncpyTable))
	.int	JMPTBL(L(StrncpyExit18), L(ExitStrncpyTable)), JMPTBL(L(StrncpyExit19), L(ExitStrncpyTable))
	.int	JMPTBL(L(StrncpyExit20), L(ExitStrncpyTable)), JMPTBL(L(StrncpyExit21), L(ExitStrncpyTable))
	.int	JMPTBL(L(StrncpyExit22), L(ExitStrncpyTable)), JMPTBL(L(StrncpyExit23), L(ExitStrncpyTable))
	.int	JMPTBL(L(StrncpyExit24), L(ExitStrncpyTable)), JMPTBL(L(StrncpyExit25), L(ExitStrncpyTable))
	.int	JMPTBL(L(StrncpyExit26), L(ExitStrncpyTable)), JMPTBL(L(StrncpyExit27), L(ExitStrncpyTable))
	.int	JMPTBL(L(StrncpyExit28), L(ExitStrncpyTable)), JMPTBL(L(StrncpyExit29), L(ExitStrncpyTable))
	.int	JMPTBL(L(StrncpyExit30), L(ExitStrncpyTable)), JMPTBL(L(StrncpyExit31), L(ExitStrncpyTable))
	.int	JMPTBL(L(StrncpyExit32), L(ExitStrncpyTable)), JMPTBL(L(StrncpyExit33), L(ExitStrncpyTable))
# ifndef USE_AS_STRCAT
	.p2align 4
/* Zero-fill dispatch table: slot N holds the JMPTBL() value for L(FillN),
   N = 0..16, relative to L(FillTable).  */
L(FillTable):
	.int	JMPTBL(L(Fill0), L(FillTable)), JMPTBL(L(Fill1), L(FillTable))
	.int	JMPTBL(L(Fill2), L(FillTable)), JMPTBL(L(Fill3), L(FillTable))
	.int	JMPTBL(L(Fill4), L(FillTable)), JMPTBL(L(Fill5), L(FillTable))
	.int	JMPTBL(L(Fill6), L(FillTable)), JMPTBL(L(Fill7), L(FillTable))
	.int	JMPTBL(L(Fill8), L(FillTable)), JMPTBL(L(Fill9), L(FillTable))
	.int	JMPTBL(L(Fill10), L(FillTable)), JMPTBL(L(Fill11), L(FillTable))
	.int	JMPTBL(L(Fill12), L(FillTable)), JMPTBL(L(Fill13), L(FillTable))
	.int	JMPTBL(L(Fill14), L(FillTable)), JMPTBL(L(Fill15), L(FillTable))
	.int	JMPTBL(L(Fill16), L(FillTable))
# endif
#endif
1922