/* xref: /titanic_44/usr/src/lib/libc/i386_hwcap1/gen/memcpy.s (revision ae115bc77f6fcde83175c75b4206dc2e50747966) */
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

27	.ident	"%Z%%M%	%I%	%E% SMI"
28
29	.file	"%M%"
30
31#include <sys/asm_linkage.h>
32
33	ANSI_PRAGMA_WEAK(memmove,function)
34	ANSI_PRAGMA_WEAK(memcpy,function)
35
36#include "SYS.h"
37
38	ANSI_PRAGMA_WEAK2(_private_memcpy,memcpy,function)
39
40	ENTRY(memmove)
41	movl	0+12(%esp),%ecx	/ get number of bytes to move
42	pushl	%esi		/ save off %edi, %esi and move destination
43	pushl	%edi
44	movl	8+ 4(%esp),%edi	/ destination buffer address
45	movl	8+ 8(%esp),%esi	/ source buffer address
46	movl	%edi, %eax
47	testl	%ecx,%ecx
48	jz	.Return
49
50	cmpl	%esi,%edi	/ if (source addr > dest addr)
51	leal	-1(%esi,%ecx),%edx	/ %edx = src + size - 1
52	jle	.memcpy_post	/ jump if dst < src
53	cmpl	%edx,%edi
54	jle	.CopyLeft	/ jump if dst <= src + size - 1
55	jmp	.memcpy_post
56
57	ENTRY(memcpy)
58	pushl	%esi
59	pushl	%edi
60
61	movl	8+4(%esp),%edi	/ %edi = dest address
62	movl	%edi, %eax	/ save this
63	movl	8+8(%esp),%esi	/ %esi = source address
64	movl	8+12(%esp),%ecx/ %ecx = length of string
65				/ %edx scratch register
66				/ %eax scratch register
67.memcpy_post:
68	nop			/ this really helps, don't know why
69				/ note:	cld is perf death on P4
70	cmpl	$63,%ecx
71	ja	.move_sse	/ not worth doing sse for less
72
73.movew:
74	movl	%ecx,%edx	/ save byte cnt
75	shrl	$2,%ecx		/ %ecx = number of words to move
76	rep ; smovl		/ move the words
77
78
79	andl	$0x3,%edx	/ %edx = number of bytes left to move
80	jz	.Return		/ %edx <= 3, so just unroll the loop
81
82	movb	(%esi), %cl
83	movb	%cl, (%edi)
84	decl	%edx
85	jz	.Return
86	movb	1(%esi), %cl
87	movb	%cl, 1(%edi)
88	decl	%edx
89	jz	.Return
90	movb	2(%esi), %cl
91	movb	%cl, 2(%edi)
92
93.Return:
94	popl	%edi		/ restore register variables
95	popl	%esi
96	ret
97
98.move_sse:
99	/
100	/ time to 16 byte align destination
101	/
102	andl	$15, %eax
103	jnz	.sse_unaligned	/ jmp if dest is unaligned
104.sse:				/ dest is aligned, check source
105	movl	%ecx, %edx	/ get byte count
106	shrl	$6, %edx	/ number of 64 byte blocks to move
107	testl	$15, %esi
108	jnz	.sse_da		/ go to slow loop if source is unaligned
109	cmpl	$65535, %ecx
110	ja	.sse_sa_nt_loop
111
112	/
113	/ use aligned load since we're lucky
114	/
115.sse_sa_loop:
116	prefetcht0 568(%esi)	/ prefetch source & copy 64 byte at a time
117	prefetcht0 568(%edi)	/ prefetch source & copy 64 byte at a time
118	movaps	0(%esi), %xmm0
119	movaps	%xmm0, 0(%edi)
120	movaps	16(%esi), %xmm1
121	movaps	%xmm1, 16(%edi)
122	movaps	32(%esi), %xmm2
123	movaps	%xmm2, 32(%edi)
124	movaps	48(%esi), %xmm3
125	movaps	%xmm3, 48(%edi)
126	addl	$64, %esi
127	addl	$64, %edi
128	decl	%edx
129	jnz	.sse_sa_loop
130
131.sse_cleanup:
132	andl	$63, %ecx	/ compute remaining bytes
133	movl	8+4(%esp), %eax	/ setup return value
134	jz	.Return
135	jmp	.movew
136
137	/
138	/ use aligned load since we're lucky
139	/
140	.align 16
141.sse_sa_nt_loop:
142	prefetchnta 16384(%esi)	/ prefetch source & copy 64 byte at a time
143	movaps	(%esi), %xmm0
144	movntps	%xmm0, 0(%edi)
145	movaps	16(%esi), %xmm1
146	movntps	%xmm1, 16(%edi)
147	movaps	32(%esi), %xmm2
148	movntps	%xmm2, 32(%edi)
149	movaps	48(%esi), %xmm3
150	movntps	%xmm3, 48(%edi)
151	addl	$64, %esi
152	addl	$64, %edi
153	decl	%edx
154	jnz	.sse_sa_nt_loop
155#if defined(_SSE2_INSN)
156	mfence
157#elif defined(_SSE_INSN)
158	sfence
159#else
160#error "Must have either SSE or SSE2"
161#endif
162	jmp	.sse_cleanup
163
164	/
165	/ Make certain that destination buffer becomes aligned
166	/
167.sse_unaligned:
168	neg	%eax		/ subtract from 16 and get destination
169	andl	$15, %eax	/ aligned on a 16 byte boundary
170	movl	%ecx, %edx	/ saved count
171	subl	%eax, %ecx	/ subtract from byte count
172	cmpl	$64, %ecx	/ after aligning, will we still have 64 bytes?
173	cmovb	%edx, %ecx	/ if not, restore original byte count,
174	cmovb	8+4(%esp), %eax	/ and restore return value,
175	jb	.movew		/ and do a non-SSE move.
176	xchg	%ecx, %eax	/ flip for copy
177	rep ; smovb		/ move the bytes
178	xchg	%ecx, %eax	/ flip back
179	jmp	.sse
180
181	.align 16
182.sse_da:
183	cmpl	$65535, %ecx
184	jbe	.sse_da_loop
185
186	/
187	/ use unaligned load since source doesn't line up
188	/
189.sse_da_nt_loop:
190	prefetchnta 16384(%esi)	/ prefetch source & copy 64 byte at a time
191	movups	0(%esi), %xmm0
192	movntps	%xmm0, 0(%edi)
193	movups	16(%esi), %xmm1
194	movntps	%xmm1, 16(%edi)
195	movups	32(%esi), %xmm2
196	movntps	%xmm2, 32(%edi)
197	movups	48(%esi), %xmm3
198	movntps	%xmm3, 48(%edi)
199	addl	$64, %esi
200	addl	$64, %edi
201	decl	%edx
202	jnz	.sse_da_nt_loop
203#if defined(_SSE2_INSN)
204	mfence
205#elif defined(_SSE_INSN)
206	sfence
207#else
208#error "Must have either SSE or SSE2"
209#endif
210	jmp	.sse_cleanup
211	/
212	/ use unaligned load since source doesn't line up
213	/
214	.align	16
215.sse_da_loop:
216	prefetcht0 568(%esi)	/ prefetch source & copy 64 byte at a time
217	prefetcht0 568(%edi)
218	movups	0(%esi), %xmm0
219	movaps	%xmm0, 0(%edi)
220	movups	16(%esi), %xmm1
221	movaps	%xmm1, 16(%edi)
222	movups	32(%esi), %xmm2
223	movaps	%xmm2, 32(%edi)
224	movups	48(%esi), %xmm3
225	movaps	%xmm3, 48(%edi)
226	addl	$64, %esi
227	addl	$64, %edi
228	decl	%edx
229	jnz	.sse_da_loop
230	jmp	.sse_cleanup
231
232	SET_SIZE(memcpy)
233
234
235/ .CopyLeft handles the memmove case where we must perform the copy backwards,
236/ because of overlap between src and dst. This is not particularly optimized.
237
238.CopyLeft:
239	movl	$3,%eax			/ heavily used constant
240	std				/ reverse direction bit (RtoL)
241	cmpl	$12,%ecx		/ if (size < 12)
242	ja	.BigCopyLeft		/ {
243	movl	%edx,%esi		/     src = src + size - 1
244	leal	-1(%ecx,%edi),%edi	/     dst = dst + size - 1
245	rep;	smovb			/    do the byte copy
246	cld				/    reset direction flag to LtoR
247	popl	%edi			/  }
248	popl	%esi			/  restore registers
249	movl	4(%esp),%eax		/  set up return value
250	ret				/  return(dba);
251.BigCopyLeft:				/ } else {
252	xchgl	%edx,%ecx
253	movl	%ecx,%esi		/ align source w/byte copy
254	leal	-1(%edx,%edi),%edi
255	andl	%eax,%ecx
256	jz	.SkipAlignLeft
257	addl	$1, %ecx		/ we need to insure that future
258	subl	%ecx,%edx		/ copy is done on aligned boundary
259	rep;	smovb
260.SkipAlignLeft:
261	movl	%edx,%ecx
262	subl	%eax,%esi
263	shrl	$2,%ecx			/ do 4 byte copy RtoL
264	subl	%eax,%edi
265	rep;	smovl
266	andl	%eax,%edx		/ do 1 byte copy whats left
267	jz	.CleanupReturnLeft
268	movl	%edx,%ecx
269	addl	%eax,%esi		/ rep; smovl instruction will decrement
270	addl	%eax,%edi		/ %edi, %esi by four after each copy
271					/ adding 3 will restore pointers to byte
272					/ before last double word copied
273					/ which is where they are expected to
274					/ be for the single byte copy code
275	rep;	smovb
276.CleanupReturnLeft:
277	cld				/ reset direction flag to LtoR
278	popl	%edi
279	popl	%esi			/ restore registers
280	movl	4(%esp),%eax		/ set up return value
281	ret				/ return(dba);
282	SET_SIZE(memmove)
283