xref: /titanic_44/usr/src/lib/libc/i386_hwcap1/gen/memcpy.s (revision 32b87932f3ef0887d873b7f6d2d1943799b2afc0)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27	.ident	"%Z%%M%	%I%	%E% SMI"
28
29	.file	"%M%"
30
31#include <sys/asm_linkage.h>
32
33	ANSI_PRAGMA_WEAK(memmove,function)
34	ANSI_PRAGMA_WEAK(memcpy,function)
35
36#include "SYS.h"
37
/*
 * memmove(dst, src, n): copy n bytes from src to dst, tolerating overlap.
 *
 * Stack arguments at entry: 4(%esp) = dst, 8(%esp) = src, 12(%esp) = n.
 * %esi and %edi are pushed here and popped again on every exit path.
 * %eax is loaded with dst, which is the return value.
 *
 * Dispatch (all address comparisons are unsigned):
 *   n == 0                       -> .Return       (nothing to copy)
 *   dst <= src                   -> .memcpy_post  (forward copy is safe)
 *   src < dst <= src + n - 1     -> .CopyLeft     (must copy backwards)
 *   dst > src + n - 1            -> .memcpy_post  (buffers do not overlap)
 *
 * The branches must be jbe, not jle: a signed compare orders an address at
 * or above 0x80000000 below one under it, so an overlapping move that
 * straddles that boundary would be copied in the wrong direction.
 */
38	ENTRY(memmove)
39	movl	0+12(%esp),%ecx	/ %ecx = n (read before the pushes move %esp)
40	pushl	%esi		/ save %esi and %edi; both are restored
41	pushl	%edi		/ before returning
42	movl	8+ 4(%esp),%edi	/ %edi = dst (8 = the two pushes above)
43	movl	8+ 8(%esp),%esi	/ %esi = src
44	movl	%edi, %eax	/ %eax = dst, the return value
45	testl	%ecx,%ecx
46	jz	.Return		/ n == 0: nothing to do
47
48	cmpl	%esi,%edi	/ compare dst with src
49	leal	-1(%esi,%ecx),%edx	/ %edx = src + n - 1 (leal leaves flags alone)
50	jbe	.memcpy_post	/ dst <= src (unsigned): forward copy is safe
51	cmpl	%edx,%edi
52	jbe	.CopyLeft	/ src < dst <= src + n - 1: overlap, copy RtoL
53	jmp	.memcpy_post	/ dst is past the end of src: no overlap
54
/*
 * memcpy(dst, src, n): copy n bytes from src to dst in ascending address
 * order and return dst in %eax.
 *
 * Stack arguments at entry: 4(%esp) = dst, 8(%esp) = src, 12(%esp) = n.
 * %esi and %edi are pushed on entry and popped at .Return.
 *
 * memmove joins this code at .memcpy_post with %edi = dst, %esi = src,
 * %ecx = n, %eax = dst, and %esi/%edi already pushed.
 *
 * Strategy:
 *   n <= 63  -> .movew: rep smovl for n/4 dwords, then an unrolled copy of
 *               the remaining 0-3 bytes.
 *   n >= 64  -> .move_sse: byte-copy until dst is 16-byte aligned, then
 *               move 64 bytes per iteration through %xmm0-%xmm3; whatever
 *               is left (n & 63) is finished by .movew.
 *
 * No cld is issued on this path, so the rep string copies rely on the
 * direction flag already being clear (assumed to be guaranteed by the
 * calling convention -- TODO confirm).
 */
55	ENTRY(memcpy)
56	pushl	%esi
57	pushl	%edi
58
59	movl	8+4(%esp),%edi	/ %edi = dest address
60	movl	%edi, %eax	/ save this
61	movl	8+8(%esp),%esi	/ %esi = source address
62	movl	8+12(%esp),%ecx/ %ecx = length of string
63				/ %edx scratch register
64				/ %eax scratch register
65.memcpy_post:
66	nop			/ this really helps, don't know why
67				/ note:	cld is perf death on P4
68	cmpl	$63,%ecx
69	ja	.move_sse	/ not worth doing sse for less
70
/*
 * .movew: generic tail/small copy. Expects %ecx = bytes to copy and
 * %eax = return value; also reached from the SSE paths for the remainder.
 */
71.movew:
72	movl	%ecx,%edx	/ save byte cnt
73	shrl	$2,%ecx		/ %ecx = number of words to move
74	rep ; smovl		/ move the words
75
76
77	andl	$0x3,%edx	/ %edx = number of bytes left to move
78	jz	.Return		/ %edx <= 3, so just unroll the loop
79
/* %esi/%edi now point just past the dwords copied above. */
80	movb	(%esi), %cl
81	movb	%cl, (%edi)
82	decl	%edx
83	jz	.Return
84	movb	1(%esi), %cl
85	movb	%cl, 1(%edi)
86	decl	%edx
87	jz	.Return
88	movb	2(%esi), %cl
89	movb	%cl, 2(%edi)
90
/* Common exit: %eax must already hold dst. Also used by memmove. */
91.Return:
92	popl	%edi		/ restore register variables
93	popl	%esi
94	ret
95
/*
 * .move_sse: n >= 64. %eax holds dst on entry and is used as scratch from
 * here on (dst & 15); the return value is reloaded from the stack in
 * .sse_cleanup, or by the cmovb in .sse_unaligned.
 */
96.move_sse:
97	/
98	/ time to 16 byte align destination
99	/
100	andl	$15, %eax
101	jnz	.sse_unaligned	/ jmp if dest is unaligned
/*
 * .sse: dst is 16-byte aligned and %ecx >= 64, so %edx (the 64-byte block
 * count) is at least 1. One of four loops is chosen:
 *   source aligned,   %ecx <= 65535 -> .sse_sa_loop     (movaps / movaps)
 *   source aligned,   %ecx >  65535 -> .sse_sa_nt_loop  (movaps / movntps)
 *   source unaligned, %ecx <= 65535 -> .sse_da_loop     (movups / movaps)
 *   source unaligned, %ecx >  65535 -> .sse_da_nt_loop  (movups / movntps)
 */
102.sse:				/ dest is aligned, check source
103	movl	%ecx, %edx	/ get byte count
104	shrl	$6, %edx	/ number of 64 byte blocks to move
105	testl	$15, %esi
106	jnz	.sse_da		/ go to slow loop if source is unaligned
107	cmpl	$65535, %ecx
108	ja	.sse_sa_nt_loop
109
110	/
111	/ use aligned load since we're lucky
112	/
/*
 * Note: the second prefetcht0 below targets the destination (%edi); its
 * inline comment repeats the source wording.
 */
113.sse_sa_loop:
114	prefetcht0 568(%esi)	/ prefetch source & copy 64 byte at a time
115	prefetcht0 568(%edi)	/ prefetch source & copy 64 byte at a time
116	movaps	0(%esi), %xmm0
117	movaps	%xmm0, 0(%edi)
118	movaps	16(%esi), %xmm1
119	movaps	%xmm1, 16(%edi)
120	movaps	32(%esi), %xmm2
121	movaps	%xmm2, 32(%edi)
122	movaps	48(%esi), %xmm3
123	movaps	%xmm3, 48(%edi)
124	addl	$64, %esi
125	addl	$64, %edi
126	decl	%edx
127	jnz	.sse_sa_loop
128
/*
 * .sse_cleanup: shared exit of all four SSE loops. %ecx still holds the
 * byte count seen at .sse; keep only the part not covered by 64-byte
 * blocks. The movl does not modify flags, so the jz tests the andl result.
 */
129.sse_cleanup:
130	andl	$63, %ecx	/ compute remaining bytes
131	movl	8+4(%esp), %eax	/ setup return value
132	jz	.Return
133	jmp	.movew
134
135	/
136	/ use aligned load since we're lucky
137	/
/*
 * Aligned source, more than 65535 bytes: same as .sse_sa_loop but the
 * stores are non-temporal (movntps), followed by a fence before rejoining
 * the common cleanup path.
 */
138	.align 16
139.sse_sa_nt_loop:
140	prefetchnta 16384(%esi)	/ prefetch source & copy 64 byte at a time
141	movaps	(%esi), %xmm0
142	movntps	%xmm0, 0(%edi)
143	movaps	16(%esi), %xmm1
144	movntps	%xmm1, 16(%edi)
145	movaps	32(%esi), %xmm2
146	movntps	%xmm2, 32(%edi)
147	movaps	48(%esi), %xmm3
148	movntps	%xmm3, 48(%edi)
149	addl	$64, %esi
150	addl	$64, %edi
151	decl	%edx
152	jnz	.sse_sa_nt_loop
/* Fence after the non-temporal stores: mfence with SSE2, else sfence. */
153#if defined(_SSE2_INSN)
154	mfence
155#elif defined(_SSE_INSN)
156	sfence
157#else
158#error "Must have either SSE or SSE2"
159#endif
160	jmp	.sse_cleanup
161
162	/
163	/ Make certain that destination buffer becomes aligned
164	/
/*
 * .sse_unaligned: on entry %eax = dst & 15 (non-zero), %ecx = n.
 * Computes %eax = 16 - (dst & 15), the number of head bytes needed to
 * align dst, and %ecx = n minus that head. If fewer than 64 bytes would
 * remain, the original count and return value are restored and the whole
 * copy is done by .movew; cmovb leaves the flags alone, so the jb still
 * sees the result of the cmpl. Otherwise the head is byte-copied and
 * control continues at .sse with %ecx = remaining count.
 */
165.sse_unaligned:
166	neg	%eax		/ subtract from 16 and get destination
167	andl	$15, %eax	/ aligned on a 16 byte boundary
168	movl	%ecx, %edx	/ saved count
169	subl	%eax, %ecx	/ subtract from byte count
170	cmpl	$64, %ecx	/ after aligning, will we still have 64 bytes?
171	cmovb	%edx, %ecx	/ if not, restore original byte count,
172	cmovb	8+4(%esp), %eax	/ and restore return value,
173	jb	.movew		/ and do a non-SSE move.
174	xchg	%ecx, %eax	/ flip for copy
175	rep ; smovb		/ move the bytes
176	xchg	%ecx, %eax	/ flip back
177	jmp	.sse
178
/*
 * .sse_da: dst aligned, source not 16-byte aligned. Counts above 65535
 * fall through to the non-temporal loop; others go to .sse_da_loop.
 */
179	.align 16
180.sse_da:
181	cmpl	$65535, %ecx
182	jbe	.sse_da_loop
183
184	/
185	/ use unaligned load since source doesn't line up
186	/
187.sse_da_nt_loop:
188	prefetchnta 16384(%esi)	/ prefetch source & copy 64 byte at a time
189	movups	0(%esi), %xmm0
190	movntps	%xmm0, 0(%edi)
191	movups	16(%esi), %xmm1
192	movntps	%xmm1, 16(%edi)
193	movups	32(%esi), %xmm2
194	movntps	%xmm2, 32(%edi)
195	movups	48(%esi), %xmm3
196	movntps	%xmm3, 48(%edi)
197	addl	$64, %esi
198	addl	$64, %edi
199	decl	%edx
200	jnz	.sse_da_nt_loop
/* Fence after the non-temporal stores: mfence with SSE2, else sfence. */
201#if defined(_SSE2_INSN)
202	mfence
203#elif defined(_SSE_INSN)
204	sfence
205#else
206#error "Must have either SSE or SSE2"
207#endif
208	jmp	.sse_cleanup
209	/
210	/ use unaligned load since source doesn't line up
211	/
/*
 * Unaligned source, at most 65535 bytes: unaligned loads (movups) paired
 * with ordinary aligned stores (movaps); both source and destination are
 * prefetched.
 */
212	.align	16
213.sse_da_loop:
214	prefetcht0 568(%esi)	/ prefetch source & copy 64 byte at a time
215	prefetcht0 568(%edi)
216	movups	0(%esi), %xmm0
217	movaps	%xmm0, 0(%edi)
218	movups	16(%esi), %xmm1
219	movaps	%xmm1, 16(%edi)
220	movups	32(%esi), %xmm2
221	movaps	%xmm2, 32(%edi)
222	movups	48(%esi), %xmm3
223	movaps	%xmm3, 48(%edi)
224	addl	$64, %esi
225	addl	$64, %edi
226	decl	%edx
227	jnz	.sse_da_loop
228	jmp	.sse_cleanup
229
230	SET_SIZE(memcpy)
231
232
/*
 * Register state on entry (set up by memmove before it jumps here):
 *   %edi = dst, %esi = src, %ecx = n (non-zero), %edx = src + n - 1,
 *   caller's %esi/%edi pushed on the stack.
 * The direction flag is set (std) for the copy and cleared again (cld) on
 * both return paths. %eax is used as the constant 3, so the return value
 * (dst) is reloaded from 4(%esp) after the two pops.
 */
233/ .CopyLeft handles the memmove case where we must perform the copy backwards,
234/ because of overlap between src and dst. This is not particularly optimized.
235
236.CopyLeft:
237	movl	$3,%eax			/ heavily used constant
238	std				/ reverse direction bit (RtoL)
/*
 * Note: ja branches only when n > 12, so the byte-only path below handles
 * n <= 12 (the inline "size < 12" is off by one).
 */
239	cmpl	$12,%ecx		/ if (size < 12)
240	ja	.BigCopyLeft		/ {
241	movl	%edx,%esi		/     src = src + size - 1
242	leal	-1(%ecx,%edi),%edi	/     dst = dst + size - 1
243	rep;	smovb			/    do the byte copy
244	cld				/    reset direction flag to LtoR
245	popl	%edi			/  }
246	popl	%esi			/  restore registers
247	movl	4(%esp),%eax		/  set up return value
248	ret				/  return(dba);
/*
 * .BigCopyLeft (n > 12):
 *   1. After the xchgl, %ecx = src + n - 1 and %edx = n; %esi and %edi are
 *      pointed at the last byte of source and destination.
 *   2. k = (src + n - 1) & 3. If k != 0, copy k + 1 bytes one at a time so
 *      that, once 3 is subtracted from %esi below, the dword copy reads
 *      from 4-byte-aligned source addresses. %edx is reduced by the bytes
 *      copied.
 *      Note: when k == 0 this step is skipped, and the dword copy then
 *      starts at a source address of 1 (mod 4) -- still correct, but not
 *      aligned.
 *   3. Step both pointers back by 3 so they address the lowest byte of the
 *      highest remaining dword, and copy %edx / 4 dwords right to left.
 *   4. Copy the remaining %edx & 3 bytes, after adding 3 back so the
 *      pointers address the byte just below the last dword copied.
 */
249.BigCopyLeft:				/ } else {
250	xchgl	%edx,%ecx
251	movl	%ecx,%esi		/ align source w/byte copy
252	leal	-1(%edx,%edi),%edi
253	andl	%eax,%ecx
254	jz	.SkipAlignLeft
255	addl	$1, %ecx		/ we need to insure that future
256	subl	%ecx,%edx		/ copy is done on aligned boundary
257	rep;	smovb
258.SkipAlignLeft:
259	movl	%edx,%ecx
260	subl	%eax,%esi
261	shrl	$2,%ecx			/ do 4 byte copy RtoL
262	subl	%eax,%edi
263	rep;	smovl
264	andl	%eax,%edx		/ do 1 byte copy whats left
265	jz	.CleanupReturnLeft
266	movl	%edx,%ecx
267	addl	%eax,%esi		/ rep; smovl instruction will decrement
268	addl	%eax,%edi		/ %edi, %esi by four after each copy
269					/ adding 3 will restore pointers to byte
270					/ before last double word copied
271					/ which is where they are expected to
272					/ be for the single byte copy code
273	rep;	smovb
274.CleanupReturnLeft:
275	cld				/ reset direction flag to LtoR
276	popl	%edi
277	popl	%esi			/ restore registers
278	movl	4(%esp),%eax		/ set up return value
279	ret				/ return(dba);
280	SET_SIZE(memmove)
281