xref: /titanic_41/usr/src/lib/libc/i386_hwcap1/gen/memcpy.s (revision d5ace9454616652a717c9831d949dffa319381f9)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27	.file	"memcpy.s"
28
29#include <sys/asm_linkage.h>
30
31	ANSI_PRAGMA_WEAK(memmove,function)
32	ANSI_PRAGMA_WEAK(memcpy,function)
33
/*
 * void *memmove(void *dst, const void *src, size_t len)
 *
 * Loads the three stack arguments, returns dst immediately when len is
 * zero, and otherwise selects a copy direction:
 *
 *   dst <= src, or dst > src + len - 1   forward copy via .memcpy_post
 *   src < dst <= src + len - 1           backward copy via .CopyLeft
 *
 * The address comparisons use the unsigned condition (jbe).  Addresses
 * are unsigned quantities; a signed branch misorders two buffers that
 * lie on opposite sides of 0x80000000 and can send an overlapping
 * dst > src copy down the forward path.
 *
 * State handed to either copy path:
 *   %edi = dst, %esi = src, %ecx = len (non-zero), %eax = dst,
 *   %edx = src + len - 1, caller's %esi and %edi saved on the stack.
 */
34	ENTRY(memmove)
35	movl	0+12(%esp),%ecx	/ get number of bytes to move
36	pushl	%esi		/ save off %edi, %esi and move destination
37	pushl	%edi
38	movl	8+ 4(%esp),%edi	/ destination buffer address
39	movl	8+ 8(%esp),%esi	/ source buffer address
40	movl	%edi, %eax	/ return value = dst
41	testl	%ecx,%ecx
42	jz	.Return		/ nothing to copy
43
44	cmpl	%esi,%edi	/ flags = dst - src
45	leal	-1(%esi,%ecx),%edx	/ %edx = src + size - 1 (leal keeps flags)
46	jbe	.memcpy_post	/ jump if dst <= src (unsigned): forward is safe
47	cmpl	%edx,%edi
48	jbe	.CopyLeft	/ jump if dst <= src + size - 1 (unsigned): overlap
49	jmp	.memcpy_post	/ dst is past the end of src: forward is safe
50
/*
 * void *memcpy(void *dst, const void *src, size_t len)
 *
 * Forward copy.  Arguments are read from the stack; after the two pushes
 * below dst is at 8+4(%esp), src at 8+8(%esp) and len at 8+12(%esp).
 * Returns dst in %eax.  %esi and %edi are saved and restored; %ecx, %edx
 * and %xmm0-%xmm3 are used as scratch.
 *
 * No cld is issued anywhere on this path, so the string instructions
 * rely on the direction flag already being clear on entry.
 *
 * Strategy:
 *   len <= 63  .movew: rep smovl for len / 4 dwords, then up to three
 *              trailing bytes copied one at a time.
 *   len >  63  .move_sse: bring dst to a 16-byte boundary with rep smovb,
 *              copy 64-byte blocks through %xmm0-%xmm3, then hand the
 *              remaining (count & 63) bytes to .movew.
 */
51	ENTRY(memcpy)
52	pushl	%esi
53	pushl	%edi
54
55	movl	8+4(%esp),%edi	/ %edi = dest address
56	movl	%edi, %eax	/ save this
57	movl	8+8(%esp),%esi	/ %esi = source address
58	movl	8+12(%esp),%ecx/ %ecx = length of string
59				/ %edx scratch register
60				/ %eax scratch register
/*
 * memmove also jumps here for its forward-copy cases, with %edi, %esi,
 * %ecx and %eax loaded the same way and %esi/%edi already pushed.
 */
61.memcpy_post:
62	nop			/ this really helps, don't know why
63				/ note:	cld is perf death on P4
64	cmpl	$63,%ecx
65	ja	.move_sse	/ not worth doing sse for less
66
/*
 * Dword copy of %ecx bytes.  Also used for the tail of the SSE paths,
 * which arrive here with %ecx holding the bytes still to be copied.
 */
67.movew:
68	movl	%ecx,%edx	/ save byte cnt
69	shrl	$2,%ecx		/ %ecx = number of words to move
70	rep ; smovl		/ move the words
71
72
73	andl	$0x3,%edx	/ %edx = number of bytes left to move
74	jz	.Return		/ %edx <= 3, so just unroll the loop
75
/* One to three trailing bytes; %cl is free because rep left %ecx at 0. */
76	movb	(%esi), %cl
77	movb	%cl, (%edi)
78	decl	%edx
79	jz	.Return
80	movb	1(%esi), %cl
81	movb	%cl, 1(%edi)
82	decl	%edx
83	jz	.Return
84	movb	2(%esi), %cl
85	movb	%cl, 2(%edi)
86
/* Common exit; %eax must already hold the return value (dst). */
87.Return:
88	popl	%edi		/ restore register variables
89	popl	%esi
90	ret
91
92.move_sse:
93	/
94	/ time to 16 byte align destination
95	/
/*
 * The andl overwrites the return value in %eax with (dst & 15).  It is
 * reloaded from the stack in .sse_cleanup, or by the cmovb in
 * .sse_unaligned when that path falls back to .movew.
 */
96	andl	$15, %eax
97	jnz	.sse_unaligned	/ jmp if dest is unaligned
/*
 * dst is 16-byte aligned here and %ecx >= 64, so %edx (the 64-byte block
 * count) is at least 1 and the decl/jnz loops below run at least once.
 * The loop is chosen by source alignment and by whether the remaining
 * count exceeds 65535:
 *
 *                     count <= 65535      count > 65535
 *   src aligned       .sse_sa_loop        .sse_sa_nt_loop
 *   src unaligned     .sse_da_loop        .sse_da_nt_loop
 *
 * The _nt_ loops store with movntps and end with a fence.
 */
98.sse:				/ dest is aligned, check source
99	movl	%ecx, %edx	/ get byte count
100	shrl	$6, %edx	/ number of 64 byte blocks to move
101	testl	$15, %esi
102	jnz	.sse_da		/ go to slow loop if source is unaligned
103	cmpl	$65535, %ecx
104	ja	.sse_sa_nt_loop
105
106	/
107	/ use aligned load since we're lucky
108	/
109.sse_sa_loop:
110	prefetcht0 568(%esi)	/ prefetch source & copy 64 byte at a time
/*
 * NOTE: the trailing comment on the next line is a copy of the one above;
 * the instruction itself prefetches from the destination (%edi).
 */
111	prefetcht0 568(%edi)	/ prefetch source & copy 64 byte at a time
112	movaps	0(%esi), %xmm0
113	movaps	%xmm0, 0(%edi)
114	movaps	16(%esi), %xmm1
115	movaps	%xmm1, 16(%edi)
116	movaps	32(%esi), %xmm2
117	movaps	%xmm2, 32(%edi)
118	movaps	48(%esi), %xmm3
119	movaps	%xmm3, 48(%edi)
120	addl	$64, %esi
121	addl	$64, %edi
122	decl	%edx
123	jnz	.sse_sa_loop
124
/*
 * All four block loops finish here.  %ecx still holds the byte count the
 * block loop started with, so (count & 63) is what remains.  The movl
 * between the andl and the jz does not modify flags, so the jz tests the
 * result of the andl.
 */
125.sse_cleanup:
126	andl	$63, %ecx	/ compute remaining bytes
127	movl	8+4(%esp), %eax	/ setup return value
128	jz	.Return
129	jmp	.movew
130
131	/
132	/ use aligned load since we're lucky
133	/
/* Large-count variant: aligned loads, non-temporal (movntps) stores. */
134	.align 16
135.sse_sa_nt_loop:
136	prefetchnta 16384(%esi)	/ prefetch source & copy 64 byte at a time
137	movaps	(%esi), %xmm0
138	movntps	%xmm0, 0(%edi)
139	movaps	16(%esi), %xmm1
140	movntps	%xmm1, 16(%edi)
141	movaps	32(%esi), %xmm2
142	movntps	%xmm2, 32(%edi)
143	movaps	48(%esi), %xmm3
144	movntps	%xmm3, 48(%edi)
145	addl	$64, %esi
146	addl	$64, %edi
147	decl	%edx
148	jnz	.sse_sa_nt_loop
/*
 * Fence after the non-temporal stores.  Which instruction is emitted is
 * decided at preprocessing time by _SSE2_INSN / _SSE_INSN; building with
 * neither defined is a hard error.
 */
149#if defined(_SSE2_INSN)
150	mfence
151#elif defined(_SSE_INSN)
152	sfence
153#else
154#error "Must have either SSE or SSE2"
155#endif
156	jmp	.sse_cleanup
157
158	/
159	/ Make certain that destination buffer becomes aligned
160	/
/*
 * Entered with %eax = dst & 15 (non-zero).  neg followed by and-15 gives
 * the number of bytes needed to reach the next 16-byte boundary.  If
 * fewer than 64 bytes would remain after aligning, the original count and
 * the return value are restored and the whole copy is done by .movew.
 * The two cmovb instructions and the jb all consume the flags of the
 * single cmpl; cmov does not modify flags.
 */
161.sse_unaligned:
162	neg	%eax		/ subtract from 16 and get destination
163	andl	$15, %eax	/ aligned on a 16 byte boundary
164	movl	%ecx, %edx	/ saved count
165	subl	%eax, %ecx	/ subtract from byte count
166	cmpl	$64, %ecx	/ after aligning, will we still have 64 bytes?
167	cmovb	%edx, %ecx	/ if not, restore original byte count,
168	cmovb	8+4(%esp), %eax	/ and restore return value,
169	jb	.movew		/ and do a non-SSE move.
/*
 * rep smovb counts in %ecx, so the alignment byte count and the remaining
 * count trade places around it.  rep leaves %ecx at 0, so after the
 * second xchg %eax is 0 and %ecx is the remaining count again.
 */
170	xchg	%ecx, %eax	/ flip for copy
171	rep ; smovb		/ move the bytes
172	xchg	%ecx, %eax	/ flip back
173	jmp	.sse
174
/* Source not 16-byte aligned: pick the unaligned-load loop by size. */
175	.align 16
176.sse_da:
177	cmpl	$65535, %ecx
178	jbe	.sse_da_loop
179
180	/
181	/ use unaligned load since source doesn't line up
182	/
/* Large-count variant: unaligned loads (movups), non-temporal stores. */
183.sse_da_nt_loop:
184	prefetchnta 16384(%esi)	/ prefetch source & copy 64 byte at a time
185	movups	0(%esi), %xmm0
186	movntps	%xmm0, 0(%edi)
187	movups	16(%esi), %xmm1
188	movntps	%xmm1, 16(%edi)
189	movups	32(%esi), %xmm2
190	movntps	%xmm2, 32(%edi)
191	movups	48(%esi), %xmm3
192	movntps	%xmm3, 48(%edi)
193	addl	$64, %esi
194	addl	$64, %edi
195	decl	%edx
196	jnz	.sse_da_nt_loop
/* Same fence selection as after .sse_sa_nt_loop. */
197#if defined(_SSE2_INSN)
198	mfence
199#elif defined(_SSE_INSN)
200	sfence
201#else
202#error "Must have either SSE or SSE2"
203#endif
204	jmp	.sse_cleanup
205	/
206	/ use unaligned load since source doesn't line up
207	/
/* Smaller-count variant: unaligned loads, ordinary aligned stores. */
208	.align	16
209.sse_da_loop:
210	prefetcht0 568(%esi)	/ prefetch source & copy 64 byte at a time
211	prefetcht0 568(%edi)
212	movups	0(%esi), %xmm0
213	movaps	%xmm0, 0(%edi)
214	movups	16(%esi), %xmm1
215	movaps	%xmm1, 16(%edi)
216	movups	32(%esi), %xmm2
217	movaps	%xmm2, 32(%edi)
218	movups	48(%esi), %xmm3
219	movaps	%xmm3, 48(%edi)
220	addl	$64, %esi
221	addl	$64, %edi
222	decl	%edx
223	jnz	.sse_da_loop
224	jmp	.sse_cleanup
225
226	SET_SIZE(memcpy)
227
228
229/ .CopyLeft handles the memmove case where we must perform the copy backwards,
230/ because of overlap between src and dst. This is not particularly optimized.
231
/*
 * Backward-copy path of memmove.  Entered when memmove's overlap check
 * finds src < dst <= src + size - 1, with:
 *   %ecx = size (non-zero)      %edx = src + size - 1
 *   %esi = src                  %edi = dst
 *   caller's %esi and %edi pushed on the stack
 *
 * The copy runs from the highest address downward with the direction
 * flag set (std).  Both exits clear the flag again (cld), pop the saved
 * registers and return dst, reloaded from 4(%esp), in %eax.
 *
 * %eax holds the constant 3 throughout; it is used both as the low-bits
 * mask and as the byte/dword pointer adjustment.
 *
 * NOTE: cmpl $12 followed by ja sends sizes above 12 to .BigCopyLeft, so
 * the byte loop handles sizes of 12 or less; the "size < 12" in the
 * trailing comment below is off by one.
 */
232.CopyLeft:
233	movl	$3,%eax			/ heavily used constant
234	std				/ reverse direction bit (RtoL)
235	cmpl	$12,%ecx		/ if (size < 12)
236	ja	.BigCopyLeft		/ {
237	movl	%edx,%esi		/     src = src + size - 1
238	leal	-1(%ecx,%edi),%edi	/     dst = dst + size - 1
239	rep;	smovb			/    do the byte copy
240	cld				/    reset direction flag to LtoR
241	popl	%edi			/  }
242	popl	%esi			/  restore registers
243	movl	4(%esp),%eax		/  set up return value
244	ret				/  return(dba);
/*
 * After the xchgl: %ecx = src + size - 1, %edx = size.
 * %esi and %edi are then pointed at the last byte of src and dst.
 *
 * Alignment prologue: %ecx = (src end address) & 3.  When that is
 * non-zero, %ecx + 1 bytes are copied one at a time and removed from the
 * count in %edx.
 *
 * NOTE(review): when the low two bits are 0 no bytes are copied, and the
 * dword loop then starts at (src end - 3), which is not 4-byte aligned in
 * that case.  This affects access alignment only, not the bytes copied.
 * Confirm whether that is intended.
 */
245.BigCopyLeft:				/ } else {
246	xchgl	%edx,%ecx
247	movl	%ecx,%esi		/ align source w/byte copy
248	leal	-1(%edx,%edi),%edi
249	andl	%eax,%ecx
250	jz	.SkipAlignLeft
251	addl	$1, %ecx		/ we need to insure that future
252	subl	%ecx,%edx		/ copy is done on aligned boundary
253	rep;	smovb
/*
 * Dword phase.  %esi/%edi point at the next byte to copy; backing them up
 * by 3 makes each smovl cover that byte and the three below it.
 * %edx / 4 dwords are copied, then %edx & 3 bytes remain.
 */
254.SkipAlignLeft:
255	movl	%edx,%ecx
256	subl	%eax,%esi
257	shrl	$2,%ecx			/ do 4 byte copy RtoL
258	subl	%eax,%edi
259	rep;	smovl
260	andl	%eax,%edx		/ do 1 byte copy whats left
261	jz	.CleanupReturnLeft
262	movl	%edx,%ecx
263	addl	%eax,%esi		/ rep; smovl instruction will decrement
264	addl	%eax,%edi		/ %edi, %esi by four after each copy
265					/ adding 3 will restore pointers to byte
266					/ before last double word copied
267					/ which is where they are expected to
268					/ be for the single byte copy code
269	rep;	smovb
/* DF must be cleared before returning; the forward paths never issue cld. */
270.CleanupReturnLeft:
271	cld				/ reset direction flag to LtoR
272	popl	%edi
273	popl	%esi			/ restore registers
274	movl	4(%esp),%eax		/ set up return value
275	ret				/ return(dba);
276	SET_SIZE(memmove)
277