xref: /titanic_44/usr/src/lib/libc/i386_hwcap1/gen/memset.s (revision 54925bf60766fbb4f1f2d7c843721406a7b7a3fb)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License").  You may not use this file except in compliance
7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22/*
23 * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27	.ident	"%Z%%M%	%I%	%E% SMI"
28
29	.file	"%M%"
30
31#include <sys/asm_linkage.h>
32
33	ANSI_PRAGMA_WEAK(memset,function)
34
35#include "SYS.h"
36
37	ANSI_PRAGMA_WEAK2(_private_memset,memset,function)
38
/*
 * void *memset(void *s, int c, size_t n)
 *
 * Store the low byte of c into each of the n bytes starting at s and
 * return s.
 *
 * Arguments are read from the stack once %edi has been pushed:
 *	 8(%esp)	s	address to fill
 *	12(%esp)	c	fill value (only the low byte is used)
 *	16(%esp)	n	number of bytes to fill
 *
 * Strategy, chosen by n (all comparisons are unsigned):
 *	n <= 20		byte stores only
 *	n <= 256	dword stores followed by 0-3 byte stores
 *	n <= 511	byte stores up to an 8 byte boundary, then as above
 *	n >= 512	dword/byte stores up to a 64 byte boundary, then
 *			64 byte SSE blocks: movaps when the length left
 *			after alignment is <= 262143, movntps followed by
 *			a fence otherwise.  The tail (< 64 bytes) is
 *			finished by the dword/byte paths.
 *
 * %eax holds the return value.  %ecx, %edx, %xmm0 and the flags may be
 * modified; the direction flag is cleared.  %edi, %ebx and %esi are
 * saved and restored whenever they are used.
 */
#define	MEMSET_BYTESET_MAX	20	/* at or below: byte stores only */
#define	MEMSET_WORDSET_MAX	256	/* at or below: skip alignment */
#define	MEMSET_ALIGN8_MAX	511	/* at or below: 8 byte align, no SSE */
#define	MEMSET_TEMPORAL_MAX	262143	/* at or below: cached SSE stores */

	ENTRY(memset)
	pushl	%edi		/ save register variable
	movl	8(%esp),%edi	/ %edi = string address
	movl	12(%esp),%eax	/ %al = byte to duplicate
	movl	16(%esp),%ecx	/ %ecx = number of copies

	/ For all basic blocks in this routine, maintain the following
	/ entry conditions:	%eax each byte is set to desired byte.
	/			NOTE: .byteset does not require this
	/			%ecx contains # bytes to set
	/			%edi contains address to set

	cld			/ make sure we go the right way...
	cmpl	$MEMSET_BYTESET_MAX,%ecx	/ short strings are byte set
	jbe	.byteset

	andl	$0xff, %eax	/ trim anything above low byte
	imul	$0x01010101, %eax	/ extend low byte to each byte

	cmpl	$MEMSET_WORDSET_MAX, %ecx	/ small areas gain nothing from alignment
	jbe	.wordset

	cmpl	$MEMSET_ALIGN8_MAX, %ecx	/ too small for the sse paths
	jbe	.check_wordset

	/
	/ prep work for sse temporal and non-temporal
	/

	pushl	%ebx		/ more registers are needed
	pushl	%esi		/ for alignment work

	/
	/ align address to 64 byte boundaries.
	/

	movl	%ecx, %ebx	/ %ebx = byte count
	movl	%edi, %esi	/ %esi is scratch register
	neg	%esi		/ (-address) mod 64 is the number of
	andl	$63, %esi	/ bytes needed to reach a 64 byte boundary
	jz	.sse_aligned	/ skip alignment if not needed
	subl	%esi, %ebx	/ %ebx = bytes left after alignment
	movl	%esi, %ecx	/ alignment bytes
	shrl	$2,%ecx		/ %ecx = number of words to set
	rep; sstol
	movl	%esi,%ecx
	andl	$3,%ecx		/ %ecx = number of bytes left
	rep; sstob
	movl	%ebx, %ecx	/ remainder to be set

.sse_aligned:

	shr	$6, %ecx	/ number of 64 byte blocks to set

	/
	/ load xmm0 with bytes to be set
	/
	subl	$16,%esp	/ give ourselves some working room on the stack
	movl	%eax,(%esp)	/ replicate the fill pattern into 16 bytes
	movl	%eax,4(%esp)	/ avoid pushl since it causes more interlocking
	movl	%eax,8(%esp)	/
	movl	%eax,12(%esp)	/
	movups	(%esp), %xmm0	/ unaligned load from stack into xmm0
	addl	$16,%esp	/ restore stack position

	cmpl	$MEMSET_TEMPORAL_MAX, %ebx	/ smaller blocks allocate in the cache
	jbe	.sse_loop
	jmp	.sse_nt_loop	/ branch across alignment nops

	.align 16

.sse_nt_loop:
	movntps %xmm0, (%edi)	/ block non-temporal store
	movntps %xmm0, 16(%edi)	/ use sse rather than sse2
	movntps %xmm0, 32(%edi)	/ so we work more places
	movntps %xmm0, 48(%edi)	/

	addl	$64, %edi	/ increment dest address
	dec	%ecx		/ dec count of blocks
	jnz	.sse_nt_loop	/ jump if not done

	andl	$63, %ebx	/ bytes left over after the 64 byte blocks
	movl	%ebx, %ecx	/ %ecx contains remainder of bytes to set
	popl	%esi		/ restore stack config
	popl	%ebx		/
#if defined(_SSE2_INSN)
	mfence
#elif defined(_SSE_INSN)
	sfence
#else
#error "Must have either SSE or SSE2"
#endif
	cmpl	$MEMSET_BYTESET_MAX, %ecx	/ finish the tail accordingly
	jbe	.byteset
	jmp	.wordset

	.align 16
.sse_loop:
	movaps %xmm0, (%edi)	/ block store w/ SSE
	movaps %xmm0, 16(%edi)
	movaps %xmm0, 32(%edi)
	movaps %xmm0, 48(%edi)

	addl	$64, %edi	/ increment addr
	dec	%ecx		/ dec count of blocks
	jnz	.sse_loop	/ jump if not done

	andl	$63, %ebx	/ bytes left over after the 64 byte blocks
	movl	%ebx, %ecx	/ in %ecx as normal
	popl	%esi		/ restore stack config
	popl	%ebx		/
	cmpl	$MEMSET_BYTESET_MAX, %ecx
	jbe	.byteset
	jmp	.wordset

.check_wordset:
	testl	$7, %edi	/ already on an 8 byte boundary?
	jz	.wordset	/ all ok


.align_wordset:
	/ Only reached with a misaligned address, so 1 to 7 bytes are
	/ stored here and %ecx (at least 257) cannot underflow.
	/ %edx is free scratch: .wordset overwrites it anyway.
	movl	%edi, %edx
	neg	%edx		/ (-address) mod 8 is the number of
	andl	$7, %edx	/ bytes needed to reach an 8 byte boundary
	subl	%edx, %ecx	/ %ecx = bytes left after alignment
	xchgl	%edx, %ecx	/ %ecx = alignment bytes, %edx = bytes left
	rep; sstob
	movl	%edx, %ecx	/ back to the normal entry conditions

.wordset:
	movl	%ecx, %edx	/ save count
	shrl	$2,%ecx		/ %ecx = number of words to set
	rep; sstol
	movl	%edx,%ecx
	andl	$3,%ecx		/ %ecx = number of bytes left

.byteset:
	rep; sstob
	movl	8(%esp),%eax	/ return string address
	popl	%edi		/ restore register variable
	ret
	SET_SIZE(memset)
193