xref: /titanic_51/usr/src/lib/libc/amd64/gen/memcpy.s (revision 4496171313bed39e96f21bc2f9faf2868e267ae3)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License").  You may not use this file except in compliance
7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22/*
23 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27/*
28 * Copyright (c) 2002 Advanced Micro Devices, Inc.
29 *
30 * All rights reserved.
31 *
32 * Redistribution and  use in source and binary  forms, with or
33 * without  modification,  are   permitted  provided  that  the
34 * following conditions are met:
35 *
36 * + Redistributions  of source  code  must  retain  the  above
37 *   copyright  notice,   this  list  of   conditions  and  the
38 *   following disclaimer.
39 *
40 * + Redistributions  in binary  form must reproduce  the above
41 *   copyright  notice,   this  list  of   conditions  and  the
42 *   following  disclaimer in  the  documentation and/or  other
43 *   materials provided with the distribution.
44 *
45 * + Neither the  name of Advanced Micro Devices,  Inc. nor the
46 *   names  of  its contributors  may  be  used  to endorse  or
47 *   promote  products  derived   from  this  software  without
48 *   specific prior written permission.
49 *
50 * THIS  SOFTWARE  IS PROVIDED  BY  THE  COPYRIGHT HOLDERS  AND
51 * CONTRIBUTORS AS IS AND  ANY EXPRESS OR IMPLIED WARRANTIES,
52 * INCLUDING,  BUT NOT  LIMITED TO,  THE IMPLIED  WARRANTIES OF
53 * MERCHANTABILITY  AND FITNESS  FOR A  PARTICULAR  PURPOSE ARE
54 * DISCLAIMED.  IN  NO  EVENT  SHALL  ADVANCED  MICRO  DEVICES,
55 * INC.  OR CONTRIBUTORS  BE LIABLE  FOR ANY  DIRECT, INDIRECT,
56 * INCIDENTAL,  SPECIAL,  EXEMPLARY,  OR CONSEQUENTIAL  DAMAGES
57 * (INCLUDING,  BUT NOT LIMITED  TO, PROCUREMENT  OF SUBSTITUTE
58 * GOODS  OR  SERVICES;  LOSS  OF  USE, DATA,  OR  PROFITS;  OR
59 * BUSINESS INTERRUPTION)  HOWEVER CAUSED AND ON  ANY THEORY OF
60 * LIABILITY,  WHETHER IN CONTRACT,  STRICT LIABILITY,  OR TORT
61 * (INCLUDING NEGLIGENCE  OR OTHERWISE) ARISING IN  ANY WAY OUT
62 * OF THE  USE  OF  THIS  SOFTWARE, EVEN  IF  ADVISED  OF  THE
63 * POSSIBILITY OF SUCH DAMAGE.
64 *
65 * It is  licensee's responsibility  to comply with  any export
66 * regulations applicable in licensee's jurisdiction.
67 */
68
69	.ident	"%Z%%M%	%I%	%E% SMI"
70
71	.file	"%M%"
72
73#include <sys/asm_linkage.h>
74
75	ANSI_PRAGMA_WEAK(memmove,function)
76	ANSI_PRAGMA_WEAK(memcpy,function)
77
78#include "SYS.h"
79#include "cache.h"
80
81	ANSI_PRAGMA_WEAK2(_private_memcpy,memcpy,function)
82
/*
 * LABEL(s) builds the local label ".memcpy<s>" used by every branch
 * target inside memcpy below.  The empty comment between ".memcpy" and
 * s is the pre-ANSI token-pasting idiom: the two pieces are only joined
 * when the preprocessor removes comments without leaving a space behind.
 * NOTE(review): presumably this file is run through cpp in traditional
 * mode -- confirm against the build rules.
 */
83#define LABEL(s) .memcpy/**/s
84
/*
 * void *memmove(void *s1, const void *s2, size_t n)
 *
 * In:  %rdi = s1 (destination), %rsi = s2 (source), %rdx = n
 *
 * Chooses the copy direction.  A forward copy (.CopyRight, the body
 * shared with memcpy) is taken when the destination starts at or below
 * the source, or above the last source byte.  Otherwise the destination
 * begins inside the source region and .CopyLeft copies from the end
 * backwards.
 *
 * Addresses are unsigned quantities, so both decisions use the unsigned
 * condition jbe; a signed jle would pick the wrong direction for buffers
 * at or across the sign boundary of the address space.
 *
 * %r9 is left holding the address of the last source byte
 * (s2 + n - 1); .CopyLeft relies on it.
 */
85	ENTRY(memmove)		/* (void *s1, void *s2, size_t n) */
86	cmpq	%rsi,%rdi	/ if (dest addr <= source addr)
87	leaq	-1(%rsi,%rdx),%r9	/* r9 = s2 + n - 1; lea leaves the cmpq flags intact */
88	jbe	.CopyRight	/     forward copy is safe
89	cmpq	%r9,%rdi	/* dest <= last source byte: regions overlap */
90	jbe	.CopyLeft
91	jmp	.CopyRight
92
/*
 * void *memcpy(void *dst, const void *src, size_t n)
 *
 * In:  %rdi = dst, %rsi = src, %rdx = n
 * Out: %rax = dst as passed in
 *
 * Entry point, plus the tail copier for fewer than 16 bytes.  memmove
 * also arrives here through .CopyRight when a forward copy is safe.
 *
 * LABEL(1) is shared: LABEL(8skip) and LABEL(fastskip) jump back to it
 * with a residual count (already masked to 0..7) in %dl.  It examines
 * bits 0, 1, 2 and 3 of %dl in turn and copies 1, 2, 4 and 8 bytes for
 * each bit that is set.
 */
93	ENTRY(memcpy)                        /* (void *, const void*, size_t) */
94
95.CopyRight:
96LABEL(1try):
97        cmp     $16, %rdx
98        mov     %rdi, %rax		/* return value = dst; mov keeps the cmp flags */
99        jae     LABEL(1after)		/* n >= 16: take the wider copy loops */
100
101        .p2align 4
102
103LABEL(1):				/* 1-byte */
104        test    $1, %dl
105        jz      LABEL(1a)
106
107        mov     (%rsi), %cl
108        mov     %cl, (%rdi)
109
110	dec	%dl			/* ZF set if that was the only byte left */
111	lea	1 (%rsi), %rsi		/* lea advances the pointers without touching ZF */
112	lea	1 (%rdi), %rdi
113	jz	LABEL(exit)
114
115        .p2align 4,, 4
116
117LABEL(1a):				/* 2-byte */
118        test    $2, %dl
119        jz      LABEL(1b)
120
121        mov     (%rsi), %cx
122        mov     %cx, (%rdi)
123
124	sub	$2, %dl			/* ZF set if nothing remains */
125	lea	2 (%rsi), %rsi
126	lea	2 (%rdi), %rdi
127	jz	LABEL(exit)
128
129        .p2align 4,, 4
130
131LABEL(1b):				/* 4-byte */
132        test    $4, %dl
133        jz      LABEL(1c)
134
135        mov     (%rsi), %ecx
136        mov     %ecx, (%rdi)
137
138/*	sub	$4, %dl */		/* count not updated: only bit 3 of %dl is tested from here on */
139	lea	4 (%rsi), %rsi
140	lea	4 (%rdi), %rdi
141/*	jz	LABEL(exit) */
142
143        .p2align 4,, 4
144
145LABEL(1c):				/* 8-byte */
146        test    $8, %dl
147        jz      LABEL(1d)
148
149        mov     (%rsi), %rcx
150        mov     %rcx, (%rdi)
151
152/*	sub	$8, %dl */		/* last step: neither count nor pointers are used again */
153/*	lea	8 (%rsi), %rsi */
154/*	lea	8 (%rdi), %rdi */
155/*	jz	LABEL(exit) */
156
157        .p2align 4
158
159LABEL(1d):
160
161LABEL(exit):
162        rep				/* NOTE(review): rep prefix on ret looks like the two-byte */
163        ret				/* return AMD recommends for K8 branch prediction -- confirm */
164
165        .p2align 4
166
/*
 * Copy in 8-byte steps.  Reached from the entry code when n >= 16, and
 * from LABEL(32skip) with a residual of fewer than 32 bytes in %edx.
 *
 * LABEL(1after) pushes the return value because %rax is used as a data
 * temporary in this and the wider loops; every exit from these loops
 * pops it back before ret or before jumping to LABEL(1).
 */
167LABEL(1after):
168        push    %rax			/* save return value (dst) */
169
170LABEL(8try):
171        cmp     $32, %rdx
172        jae     LABEL(8after)		/* n >= 32: consider the 32-byte loop */
173
174LABEL(8):                        /* 8-byte */
175        mov     %edx, %ecx
176        shr     $3, %ecx		/* ecx = number of whole quadwords */
177        jz      LABEL(8skip)
178
179        .p2align 4
180
181LABEL(8loop):
182        dec     %ecx			/* sets ZF for the jnz below; mov and lea leave it alone */
183
184        mov     (%rsi), %rax
185        mov     %rax, (%rdi)
186
187        lea     8 (%rsi), %rsi
188        lea     8 (%rdi), %rdi
189
190        jnz     LABEL(8loop)
191
192LABEL(8skip):
193        and     $7, %edx		/* residual bytes; ZF set when none */
194        pop     %rax			/* restore return value; pop does not change flags */
195        jnz     LABEL(1)		/* finish the last 1..7 bytes */
196
197        rep
198        ret
199
200        .p2align 4
201
/*
 * Copy in 32-byte steps.
 *
 * LABEL(32try) picks the size limit for this loop: 512 bytes, raised to
 * 4096 when the low three bits of either the source or the destination
 * address are zero.  Larger copies go on to LABEL(32after).
 *
 * LABEL(32) is also the landing point for the residuals left by
 * LABEL(64skip), LABEL(preskip) and LABEL(NTskip); it only looks at
 * %edx, which those callers have masked to a small value.
 *
 * The loop body is unrolled twice.  %rax, %r8, %r9 and %r10 are data
 * temporaries; the return value saved by LABEL(1after) is still on the
 * stack and is popped on exit.
 */
202LABEL(8after):
203
204LABEL(32try):
205	mov	$512, %r8d		/* size for unaligned data */
206	mov	$4096, %r9d		/* size for aligned data */
207	test	$7, %esi		/* check if either source.. */
208	cmovz	%r9, %r8
209	test	$7, %edi		/* .. or destination is aligned */
210	cmovz	%r9, %r8
211
212        cmp     %r8, %rdx
213        ja	LABEL(32after)		/* n above the limit: use the bulk paths */
214
215LABEL(32):				/* 32-byte */
216        mov     %edx, %ecx
217        shr     $5, %ecx		/* ecx = number of 32-byte blocks */
218        jz      LABEL(32skip)
219
220        .p2align 4
221
222LABEL(32loop):
223        dec     %ecx			/* ZF consumed by the jz after the lea pair */
224
225        mov        (%rsi), %rax
226        mov      8 (%rsi), %r8
227        mov     16 (%rsi), %r9
228        mov     24 (%rsi), %r10
229
230        mov     %rax,    (%rdi)
231        mov      %r8,  8 (%rdi)
232        mov      %r9, 16 (%rdi)
233        mov     %r10, 24 (%rdi)
234
235        lea     32 (%rsi), %rsi
236        lea     32 (%rdi), %rdi
237
238        jz      LABEL(32skip)
239
240        dec     %ecx			/* second unrolled copy of the same 32-byte step */
241
242        mov        (%rsi), %rax
243        mov      8 (%rsi), %r8
244        mov     16 (%rsi), %r9
245        mov     24 (%rsi), %r10
246
247        mov     %rax,    (%rdi)
248        mov      %r8,  8 (%rdi)
249        mov      %r9, 16 (%rdi)
250        mov     %r10, 24 (%rdi)
251
252        lea     32 (%rsi), %rsi
253        lea     32 (%rdi), %rdi
254
255        jnz     LABEL(32loop)
256
257        .p2align 4
258
259LABEL(32skip):
260        and     $31, %edx		/* residual bytes below one block */
261        jnz     LABEL(8)		/* hand them to the 8-byte loop */
262
263        pop     %rax			/* restore return value */
264        ret
265
266        .p2align 4
267
/*
 * Bulk path, stage 1: align the destination to 8 bytes, then move up to
 * the value loaded from .amd64cache1half with rep movsq.
 *
 * _sref_ and the .amd64cache* symbols are defined outside this file.
 * NOTE(review): presumably .amd64cache1half is half the L1 data cache
 * size, set up by the cache.h machinery -- confirm.
 *
 * rep movsq copies upwards only while the direction flag is clear; this
 * code does not clear it itself and so assumes DF = 0 on entry.
 */
268LABEL(32after):
269
270	/* 3DNow: use prefetch */
271	prefetchnta _sref_(.amd64cache1) /* improves test further ahead on B0 */
272
273LABEL(aligntry):
274        mov     %edi, %r8d      	/* align by destination */
275
276        and	$7, %r8d		/* r8d = dst misalignment within a quadword */
277        jz      LABEL(alignafter)  	/* not unaligned */
278
279LABEL(align):                      	/* align */
280        lea     -8 (%r8, %rdx), %rdx	/* n -= (8 - misalignment), the bytes copied below */
281        sub     $8, %r8d		/* negative counter: -(8 - misalignment) */
282
283        .p2align 4
284
285LABEL(alignloop):
286        inc     %r8d			/* counts up to zero; ZF consumed by the jnz below */
287
288        mov     (%rsi), %al
289        mov     %al, (%rdi)
290
291        lea     1 (%rsi), %rsi
292        lea     1 (%rdi), %rdi
293
294        jnz     LABEL(alignloop)
295
296        .p2align 4
297
298LABEL(alignafter):
299        mov     _sref_(.amd64cache1half), %r11
300        cmp     %rdx, %r11
301        cmova   %rdx, %r11		/* r11 = min(limit, n), unsigned */
302
303LABEL(fast):
304	mov	%r11, %rcx
305	and	$-8, %r11		/* r11 = bytes rep movsq will move (multiple of 8) */
306	shr	$3, %rcx		/* rcx = quadword count */
307/*	jz	LABEL(fastskip) */
308
309	rep				/* good ol' MOVS */
310	movsq
311
312LABEL(fastskip):
313	sub	%r11, %rdx		/* n -= bytes just moved */
314	test	$-8, %rdx		/* 8 or more bytes still to go? */
315	jnz	LABEL(fastafterlater)
316
317	and	$7, %edx		/* residual bytes; ZF set when none */
318	pop	%rax			/* restore return value; pop does not change flags */
319	jnz	LABEL(1)
320
321	rep
322	ret
323
324        .p2align 4
325
/*
 * 64-byte-per-step copy loop, limited to the value loaded from
 * .amd64cache1half.
 *
 * NOTE(review): nothing in this file as shown branches to LABEL(64try)
 * or LABEL(64), and the code just before it ends in ret, so this block
 * looks unreachable -- presumably superseded by the rep movsq path in
 * LABEL(fast).  Confirm before relying on or removing it.
 *
 * NOTE(review): the block count is computed in %rcx but decremented
 * through %ecx.
 */
326LABEL(64try):
327        mov     _sref_(.amd64cache1half), %r11
328        cmp     %rdx, %r11
329        cmova   %rdx, %r11		/* r11 = min(limit, n), unsigned */
330
331LABEL(64):                               /* 64-byte */
332        mov     %r11, %rcx
333        and     $-64, %r11		/* r11 = bytes this loop will move (multiple of 64) */
334        shr     $6, %rcx		/* rcx = number of 64-byte blocks */
335        jz      LABEL(64skip)
336
337        .p2align 4
338
339LABEL(64loop):
340        dec     %ecx			/* ZF consumed by the jz after the lea pair */
341
342        mov        (%rsi), %rax
343        mov      8 (%rsi), %r8
344        mov     16 (%rsi), %r9
345        mov     24 (%rsi), %r10
346
347        mov     %rax,    (%rdi)
348        mov      %r8,  8 (%rdi)
349        mov      %r9, 16 (%rdi)
350        mov     %r10, 24 (%rdi)
351
352        mov     32 (%rsi), %rax
353        mov     40 (%rsi), %r8
354        mov     48 (%rsi), %r9
355        mov     56 (%rsi), %r10
356
357        mov     %rax, 32 (%rdi)
358        mov      %r8, 40 (%rdi)
359        mov      %r9, 48 (%rdi)
360        mov     %r10, 56 (%rdi)
361
362        lea     64 (%rsi), %rsi
363        lea     64 (%rdi), %rdi
364
365        jz      LABEL(64skip)
366
367        dec     %ecx			/* second unrolled copy of the same 64-byte step */
368
369        mov        (%rsi), %rax
370        mov      8 (%rsi), %r8
371        mov     16 (%rsi), %r9
372        mov     24 (%rsi), %r10
373
374        mov     %rax,    (%rdi)
375        mov      %r8,  8 (%rdi)
376        mov      %r9, 16 (%rdi)
377        mov     %r10, 24 (%rdi)
378
379        mov     32 (%rsi), %rax
380        mov     40 (%rsi), %r8
381        mov     48 (%rsi), %r9
382        mov     56 (%rsi), %r10
383
384        mov     %rax, 32 (%rdi)
385        mov      %r8, 40 (%rdi)
386        mov      %r9, 48 (%rdi)
387        mov     %r10, 56 (%rdi)
388
389        lea     64 (%rsi), %rsi
390        lea     64 (%rdi), %rdi
391
392        jnz     LABEL(64loop)
393
394        .p2align 4
395
396LABEL(64skip):
397        sub     %r11, %rdx		/* n -= bytes just moved */
398        test    $-64, %rdx		/* 64 or more bytes still to go? */
399        jnz     LABEL(64after)
400
401        and     $63, %edx		/* residual bytes below one block */
402        jnz     LABEL(32)
403
404        pop     %rax			/* restore return value */
405        ret
406
407        .p2align 4
408
/*
 * Bulk path, stage 2: 64-byte blocks with software prefetch, for up to
 * the value loaded from .amd64cache2half.  Entered from LABEL(fastskip)
 * through LABEL(fastafterlater) when at least 8 bytes remain.
 *
 * NOTE(review): presumably .amd64cache2half is half the L2 cache size;
 * the symbol is defined outside this file -- confirm.
 *
 * The loop uses %rbx, %r12, %r13 and %r14 as extra data temporaries and
 * saves/restores them around the loop.  The body is unrolled twice: the
 * first half prefetches ahead in the source, the second half ahead in
 * the destination.
 */
409LABEL(64after):
410
411LABEL(fastafterlater):
412
413LABEL(pretry):
414        mov     _sref_(.amd64cache2half), %r8
415        cmp     %rdx, %r8
416        cmova   %rdx, %r8		/* r8 = min(limit, n), unsigned */
417
418LABEL(pre):                              /* 64-byte prefetching */
419        mov     %r8, %rcx
420        and     $-64, %r8		/* r8 = bytes this loop will move (multiple of 64) */
421        shr     $6, %rcx		/* rcx = number of 64-byte blocks */
422        jz      LABEL(preskip)		/* none: skip the register saves too */
423
424        push    %r14
425        push    %r13
426        push    %r12
427        push    %rbx
428
429        .p2align 4
430
431LABEL(preloop):
432        dec     %rcx			/* ZF consumed by the jz after the lea pair */
433
434        mov        (%rsi), %rax
435        mov      8 (%rsi), %rbx
436        mov     16 (%rsi), %r9
437        mov     24 (%rsi), %r10
438        mov     32 (%rsi), %r11
439        mov     40 (%rsi), %r12
440        mov     48 (%rsi), %r13
441        mov     56 (%rsi), %r14
442
443        prefetchnta  0 + 896 (%rsi)	/* 3DNow: use prefetch */
444        prefetchnta 64 + 896 (%rsi)	/* 3DNow: use prefetch */
445
446        mov     %rax,    (%rdi)
447        mov     %rbx,  8 (%rdi)
448        mov      %r9, 16 (%rdi)
449        mov     %r10, 24 (%rdi)
450        mov     %r11, 32 (%rdi)
451        mov     %r12, 40 (%rdi)
452        mov     %r13, 48 (%rdi)
453        mov     %r14, 56 (%rdi)
454
455        lea     64 (%rsi), %rsi
456        lea     64 (%rdi), %rdi
457
458        jz      LABEL(preskipa)
459
460        dec     %rcx			/* second unrolled 64-byte step */
461
462        mov        (%rsi), %rax
463        mov      8 (%rsi), %rbx
464        mov     16 (%rsi), %r9
465        mov     24 (%rsi), %r10
466        mov     32 (%rsi), %r11
467        mov     40 (%rsi), %r12
468        mov     48 (%rsi), %r13
469        mov     56 (%rsi), %r14
470
471        mov     %rax,    (%rdi)
472        mov     %rbx,  8 (%rdi)
473        mov      %r9, 16 (%rdi)
474        mov     %r10, 24 (%rdi)
475        mov     %r11, 32 (%rdi)
476        mov     %r12, 40 (%rdi)
477        mov     %r13, 48 (%rdi)
478        mov     %r14, 56 (%rdi)
479
480        prefetchnta -64 + 896 (%rdi)	/* 3DNow: use prefetchw */
481        prefetchnta   0 + 896 (%rdi)	/* 3DNow: use prefetchw */
482
483        lea     64 (%rsi), %rsi
484        lea     64 (%rdi), %rdi
485
486        jnz     LABEL(preloop)
487
488LABEL(preskipa):
489        pop     %rbx			/* restore in reverse order of the pushes */
490        pop     %r12
491        pop     %r13
492        pop     %r14
493
494
495LABEL(preskip):
496        sub     %r8, %rdx		/* n -= bytes just moved */
497        test    $-64, %rdx		/* 64 or more bytes still to go? */
498        jnz     LABEL(preafter)		/* yes: continue with non-temporal stores */
499
500        and     $63, %edx		/* residual bytes below one block */
501        jnz     LABEL(32)
502
503        pop     %rax			/* restore return value */
504        ret
505
506        .p2align 4
507
/*
 * Bulk path, stage 3: whatever is left after LABEL(pre) is copied in
 * 128-byte blocks using movnti (non-temporal) stores, with the source
 * prefetched 768 and 832 bytes ahead.  An mfence follows the loop.
 *
 * %r12, %r13 and %r14 are used as extra data temporaries and are
 * saved/restored around the loop.  The residual (n & 127) is handed to
 * LABEL(32); otherwise the saved return value is popped and returned.
 */
508LABEL(preafter):
509
510LABEL(NTtry):
511
512LABEL(NT):                               /* NT 64-byte */
513        mov     %rdx, %rcx
514        shr     $7, %rcx		/* rcx = number of 128-byte blocks */
515        jz      LABEL(NTskip)
516
517        push    %r14
518        push    %r13
519        push    %r12
520
521       .p2align 4
522
523LABEL(NTloop):
524        prefetchnta 768 (%rsi)		/* prefetching NT here is not so good on B0 and C0 MP systems */
525        prefetchnta 832 (%rsi)
526
527        dec     %rcx			/* ZF consumed by the jnz at the loop end */
528
529        mov        (%rsi), %rax
530        mov      8 (%rsi), %r8
531        mov     16 (%rsi), %r9
532        mov     24 (%rsi), %r10
533        mov     32 (%rsi), %r11
534        mov     40 (%rsi), %r12
535        mov     48 (%rsi), %r13
536        mov     56 (%rsi), %r14
537
538        movnti  %rax,    (%rdi)
539        movnti   %r8,  8 (%rdi)
540        movnti   %r9, 16 (%rdi)
541        movnti  %r10, 24 (%rdi)
542        movnti  %r11, 32 (%rdi)
543        movnti  %r12, 40 (%rdi)
544        movnti  %r13, 48 (%rdi)
545        movnti  %r14, 56 (%rdi)
546
547        mov      64 (%rsi), %rax
548        mov      72 (%rsi), %r8
549        mov      80 (%rsi), %r9
550        mov      88 (%rsi), %r10
551        mov      96 (%rsi), %r11
552        mov     104 (%rsi), %r12
553        mov     112 (%rsi), %r13
554        mov     120 (%rsi), %r14
555
556        movnti  %rax,  64 (%rdi)
557        movnti   %r8,  72 (%rdi)
558        movnti   %r9,  80 (%rdi)
559        movnti  %r10,  88 (%rdi)
560        movnti  %r11,  96 (%rdi)
561        movnti  %r12, 104 (%rdi)
562        movnti  %r13, 112 (%rdi)
563        movnti  %r14, 120 (%rdi)
564
565        lea     128 (%rsi), %rsi
566        lea     128 (%rdi), %rdi
567
568        jnz     LABEL(NTloop)
569
570        mfence				/* order the non-temporal stores before returning */
571
572        pop     %r12			/* restore in reverse order of the pushes */
573        pop     %r13
574        pop     %r14
575
576LABEL(NTskip):
577        and     $127, %edx		/* residual bytes below one block */
578        jnz     LABEL(32)
579
580        pop     %rax			/* restore return value */
581        ret
582
583	SET_SIZE(memcpy)                   /* (void *, const void*, size_t) */
584
/*
 * .CopyLeft -- backward (high-to-low address) copy used by memmove when
 * the destination starts inside the source region.
 *
 * In:  %rdi = dst, %rsi = src, %rdx = size,
 *      %r9  = src + size - 1 (set up by the memmove entry code)
 * Out: %rax = dst; direction flag cleared again before every return
 *
 * Up to 24 bytes are moved with a single byte-wise rep; smovb.  Larger
 * copies first byte-copy the (src + size) & 7 bytes that lie above the
 * last 8-byte boundary of the source, then move quadwords with
 * rep; smovq, then byte-copy whatever is left at the low end.
 *
 * The alignment count is taken from src + size rather than from the
 * address of the last byte, so the quadword reads really are 8-byte
 * aligned for every source address and no bytes are byte-copied when
 * the source already ends on a boundary.
 */
585.CopyLeft:
586	movq	%rdi,%rax		/ set up return value
587	movq	$7,%r8			/ heavily used constant
588	movq	%rdx,%rcx		/ put len into %rcx for rep
589	std				/ reverse direction bit (RtoL)
590	cmpq	$24,%rcx		/ if (size <= 24)
591	ja	.BigCopyLeft		/ {
592	movq	%r9,%rsi		/     src = src + size - 1
593	leaq	-1(%rcx,%rdi),%rdi	/     dst = dst + size - 1
594	rep;	smovb			/    do the byte copy
595	cld				/    reset direction flag to LtoR
596	ret				/  return(dba);
597.BigCopyLeft:				/ } else {
598	xchgq	%r9,%rcx		/ r9 = size, rcx = src + size - 1
599	movq	%rcx,%rsi		/ src = src + size - 1
600	leaq	-1(%r9,%rdi),%rdi	/ dst = dst + size - 1
601	addq	$1, %rcx		/ rcx = src + size (one past the end)
602	andq	%r8,%rcx		/ bytes above the last 8-byte boundary
603	jz	.SkipAlignLeft		/ source already ends on a boundary
604	subq	%rcx,%r9		/ byte-copy them so that the quadword
605	rep;	smovb			/ copy reads on aligned boundaries
606.SkipAlignLeft:
607	movq	%r9,%rcx
608	subq	%r8,%rsi		/ point at the start of the top quadword
609	shrq	$3,%rcx			/ do 8 byte copy RtoL
610	subq	%r8,%rdi
611	rep;	smovq
612	andq	%r8,%r9		/ do 1 byte copy whats left
613	jz	.CleanupReturnLeft
614	movq	%r9,%rcx
615	addq	%r8,%rsi		/ each rep; smovq step moved %rsi and
616	addq	%r8,%rdi		/ %rdi down by eight, leaving them 7
617					/ below the byte just under the last
618					/ quadword copied; adding 7 puts them
619					/ back on that byte, which is where
620					/ the single byte copy has to start
621	rep;	smovb
622.CleanupReturnLeft:
623	cld				/ reset direction flag to LtoR
624	ret				/ return(dba);
625	SET_SIZE(memmove)
626