/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

	.file	"%M%"

#include <sys/asm_linkage.h>

	ANSI_PRAGMA_WEAK(memmove,function)
	ANSI_PRAGMA_WEAK(memcpy,function)

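	/
	/ memmove: when the destination starts at or below the source,
	/ or does not overlap it at all, the copy is done forward through
	/ the memcpy code (.memcpy_post).  Otherwise the regions overlap
	/ with dst above src, and the copy is done backwards (.CopyLeft)
	/ so that source bytes are read before they are overwritten.
	/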
	ENTRY(memmove)
	movl	0+12(%esp),%ecx	/ get number of bytes to move
	pushl	%esi		/ save off %edi, %esi and move destination
	pushl	%edi
	movl	8+ 4(%esp),%edi	/ destination buffer address
	movl	8+ 8(%esp),%esi	/ source buffer address
	movl	%edi, %eax
	testl	%ecx,%ecx
	jz	.Return

	cmpl	%esi,%edi	/ compare dst with src
	leal	-1(%esi,%ecx),%edx	/ %edx = src + size - 1
	jle	.memcpy_post	/ jump if dst <= src (forward copy is safe)
	cmpl	%edx,%edi
	jle	.CopyLeft	/ jump if dst <= src + size - 1 (overlap)
	jmp	.memcpy_post

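	/
	/ memcpy: copies are done forward, low address to high.  Sizes
	/ below 64 bytes go through rep smovl plus an unrolled byte tail
	/ (.movew); anything larger takes the SSE path (.move_sse).
	/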
	ENTRY(memcpy)
	pushl	%esi
	pushl	%edi

	movl	8+4(%esp),%edi	/ %edi = dest address
	movl	%edi, %eax	/ save this
	movl	8+8(%esp),%esi	/ %esi = source address
	movl	8+12(%esp),%ecx	/ %ecx = length of string
				/ %edx scratch register
				/ %eax scratch register
.memcpy_post:
	nop			/ this really helps, don't know why
				/ note:	cld is perf death on P4
	cmpl	$63,%ecx
	ja	.move_sse	/ not worth doing sse for less

.movew:
	movl	%ecx,%edx	/ save byte cnt
	shrl	$2,%ecx		/ %ecx = number of words to move
	rep ; smovl		/ move the words

	andl	$0x3,%edx	/ %edx = number of bytes left to move
	jz	.Return		/ %edx <= 3, so just unroll the loop

	movb	(%esi), %cl
	movb	%cl, (%edi)
	decl	%edx
	jz	.Return
	movb	1(%esi), %cl
	movb	%cl, 1(%edi)
	decl	%edx
	jz	.Return
	movb	2(%esi), %cl
	movb	%cl, 2(%edi)

.Return:
	popl	%edi		/ restore register variables
	popl	%esi
	ret

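	/
	/ SSE path for copies of 64 bytes or more.  The destination is
	/ first brought to a 16 byte boundary; source alignment and total
	/ size then select one of four loops that each move 64 bytes per
	/ iteration (movaps vs. movups loads, plain vs. non-temporal
	/ movntps stores).
	/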
.move_sse:
	/
	/ time to 16 byte align destination
	/
	andl	$15, %eax
	jnz	.sse_unaligned	/ jmp if dest is unaligned
.sse:				/ dest is aligned, check source
	movl	%ecx, %edx	/ get byte count
	shrl	$6, %edx	/ number of 64 byte blocks to move
	testl	$15, %esi
	jnz	.sse_da		/ go to slow loop if source is unaligned
	cmpl	$65535, %ecx
	ja	.sse_sa_nt_loop
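	/
	/ Copies larger than 64K take the non-temporal store loop below;
	/ movntps writes around the caches, presumably so that a very
	/ large copy does not evict data that is still in use.
	/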

	/
	/ use aligned load since we're lucky
	/
.sse_sa_loop:
	prefetcht0 568(%esi)	/ prefetch source & copy 64 byte at a time
	prefetcht0 568(%edi)	/ prefetch destination as well
	movaps	0(%esi), %xmm0
	movaps	%xmm0, 0(%edi)
	movaps	16(%esi), %xmm1
	movaps	%xmm1, 16(%edi)
	movaps	32(%esi), %xmm2
	movaps	%xmm2, 32(%edi)
	movaps	48(%esi), %xmm3
	movaps	%xmm3, 48(%edi)
	addl	$64, %esi
	addl	$64, %edi
	decl	%edx
	jnz	.sse_sa_loop

.sse_cleanup:
	andl	$63, %ecx	/ compute remaining bytes
	movl	8+4(%esp), %eax	/ setup return value
	jz	.Return
	jmp	.movew

	/
	/ use aligned loads and non-temporal stores for large copies
	/
	.align 16
.sse_sa_nt_loop:
	prefetchnta 16384(%esi)	/ prefetch source & copy 64 byte at a time
	movaps	(%esi), %xmm0
	movntps	%xmm0, 0(%edi)
	movaps	16(%esi), %xmm1
	movntps	%xmm1, 16(%edi)
	movaps	32(%esi), %xmm2
	movntps	%xmm2, 32(%edi)
	movaps	48(%esi), %xmm3
	movntps	%xmm3, 48(%edi)
	addl	$64, %esi
	addl	$64, %edi
	decl	%edx
	jnz	.sse_sa_nt_loop
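	/
	/ Non-temporal stores are weakly ordered, so fence before leaving
	/ to make sure the copied data is visible in the normal order.
	/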
#if defined(_SSE2_INSN)
	mfence
#elif defined(_SSE_INSN)
	sfence
#else
#error "Must have either SSE or SSE2"
#endif
	jmp	.sse_cleanup

	/
	/ Make certain that destination buffer becomes aligned
	/
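	/ On entry to .sse_unaligned, %eax holds (dest & 15); the neg and
	/ andl below turn it into 16 - (dest & 15), the number of leading
	/ bytes to copy before %edi reaches a 16 byte boundary.
	/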
.sse_unaligned:
	neg	%eax		/ subtract from 16 and get destination
	andl	$15, %eax	/ aligned on a 16 byte boundary
	movl	%ecx, %edx	/ saved count
	subl	%eax, %ecx	/ subtract from byte count
	cmpl	$64, %ecx	/ after aligning, will we still have 64 bytes?
	cmovb	%edx, %ecx	/ if not, restore original byte count,
	cmovb	8+4(%esp), %eax	/ and restore return value,
	jb	.movew		/ and do a non-SSE move.
	xchg	%ecx, %eax	/ flip for copy
	rep ; smovb		/ move the bytes
	xchg	%ecx, %eax	/ flip back
	jmp	.sse

	.align 16
.sse_da:
	cmpl	$65535, %ecx
	jbe	.sse_da_loop

	/
	/ use unaligned loads and non-temporal stores for large copies,
	/ since the source doesn't line up
	/
.sse_da_nt_loop:
	prefetchnta 16384(%esi)	/ prefetch source & copy 64 byte at a time
	movups	0(%esi), %xmm0
	movntps	%xmm0, 0(%edi)
	movups	16(%esi), %xmm1
	movntps	%xmm1, 16(%edi)
	movups	32(%esi), %xmm2
	movntps	%xmm2, 32(%edi)
	movups	48(%esi), %xmm3
	movntps	%xmm3, 48(%edi)
	addl	$64, %esi
	addl	$64, %edi
	decl	%edx
	jnz	.sse_da_nt_loop
#if defined(_SSE2_INSN)
	mfence
#elif defined(_SSE_INSN)
	sfence
#else
#error "Must have either SSE or SSE2"
#endif
	jmp	.sse_cleanup
	/
	/ use unaligned load since source doesn't line up
	/
	.align	16
.sse_da_loop:
	prefetcht0 568(%esi)	/ prefetch source & copy 64 byte at a time
	prefetcht0 568(%edi)
	movups	0(%esi), %xmm0
	movaps	%xmm0, 0(%edi)
	movups	16(%esi), %xmm1
	movaps	%xmm1, 16(%edi)
	movups	32(%esi), %xmm2
	movaps	%xmm2, 32(%edi)
	movups	48(%esi), %xmm3
	movaps	%xmm3, 48(%edi)
	addl	$64, %esi
	addl	$64, %edi
	decl	%edx
	jnz	.sse_da_loop
	jmp	.sse_cleanup

	SET_SIZE(memcpy)


/ .CopyLeft handles the memmove case where we must perform the copy backwards,
/ because of overlap between src and dst. This is not particularly optimized.
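/
/ Roughly equivalent C, shown only to illustrate the backward copy
/ (an illustrative sketch, not part of the build):
/
/	char *d = (char *)dst + size - 1;
/	const char *s = (const char *)src + size - 1;
/	while (size-- != 0)
/		*d-- = *s--;
/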

.CopyLeft:
	movl	$3,%eax			/ heavily used constant
	std				/ reverse direction bit (RtoL)
	cmpl	$12,%ecx		/ if (size <= 12)
	ja	.BigCopyLeft		/ {
	movl	%edx,%esi		/     src = src + size - 1
	leal	-1(%ecx,%edi),%edi	/     dst = dst + size - 1
	rep;	smovb			/    do the byte copy
	cld				/    reset direction flag to LtoR
	popl	%edi			/  }
	popl	%esi			/  restore registers
	movl	4(%esp),%eax		/  set up return value
	ret				/  return(dba);
.BigCopyLeft:				/ } else {
	xchgl	%edx,%ecx
	movl	%ecx,%esi		/ align source w/byte copy
	leal	-1(%edx,%edi),%edi
	andl	%eax,%ecx
	jz	.SkipAlignLeft
	addl	$1, %ecx		/ we need to ensure that future
	subl	%ecx,%edx		/ copy is done on aligned boundary
	rep;	smovb
.SkipAlignLeft:
	movl	%edx,%ecx
	subl	%eax,%esi
	shrl	$2,%ecx			/ do 4 byte copy RtoL
	subl	%eax,%edi
	rep;	smovl
	andl	%eax,%edx		/ do 1 byte copy of what's left
	jz	.CleanupReturnLeft
	movl	%edx,%ecx
	addl	%eax,%esi		/ rep; smovl instruction will decrement
	addl	%eax,%edi		/ %edi, %esi by four after each copy
					/ adding 3 will restore pointers to byte
					/ before last double word copied
					/ which is where they are expected to
					/ be for the single byte copy code
	rep;	smovb
.CleanupReturnLeft:
	cld				/ reset direction flag to LtoR
	popl	%edi
	popl	%esi			/ restore registers
	movl	4(%esp),%eax		/ set up return value
	ret				/ return(dba);
	SET_SIZE(memmove)