xref: /titanic_52/usr/src/lib/libc/i386_hwcap1/gen/memset.s (revision d4660949aa62dd6a963f4913b7120b383cf473c4)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27	.ident	"%Z%%M%	%I%	%E% SMI"
28
29	.file	"%M%"
30
31#include <sys/asm_linkage.h>
32
33	ANSI_PRAGMA_WEAK(memset,function)
34
35#include "SYS.h"
36
/*
 * memset(s, c, n)
 *
 * Stack arguments, as read after %edi has been pushed:
 *	8(%esp)  = s, address to set
 *	12(%esp) = c, byte to duplicate (taken from %al)
 *	16(%esp) = n, number of bytes to set
 * Returns s in %eax.
 *
 * Strategy, chosen by unsigned comparisons on n:
 *	n <= 20		byte stores only (.byteset)
 *	21..256		4-byte stores, then 0-3 trailing bytes (.wordset)
 *	257..511	byte stores up to an 8 byte boundary when the address
 *			is not already 8 byte aligned, then .wordset
 *	n >= 512	stores up to a 64 byte boundary, then 64 byte blocks
 *			from %xmm0: movntps followed by a fence when more
 *			than 262143 bytes remain after alignment, movaps
 *			otherwise; the remaining (fewer than 64) bytes go
 *			through .byteset or .wordset
 *
 * %edi, %ebx and %esi are saved and restored on the paths that use them;
 * %ecx, %edx and %xmm0 are overwritten without being saved.
 *
 * NOTE(review): sstob/sstol are presumably the Solaris assembler spellings
 * of the stosb/stosl string-store instructions -- confirm against the
 * assembler reference.
 */
37	ENTRY(memset)
38	pushl	%edi		/ save register variable
39	movl	8(%esp),%edi	/ %edi = string address
40	movl	12(%esp),%eax	/ %al = byte to duplicate
41	movl	16(%esp),%ecx	/ %ecx = number of copies
42
43	/ For all basic blocks in this routine, maintain the following
44	/ entry conditions:	each byte of %eax is set to the desired byte.
45	/			NOTE: .byteset doesn't require this
46	/			%ecx contains # bytes to set
47	/			%edi contains address to set
48
49	cld			/ make sure we go the right way...
50	cmpl	$20,%ecx	/ strings of 20 or fewer chars should be byte set
51	jbe	.byteset
52
53	andl	$0xff, %eax	/ trim anything above low byte
54	imul	$0x01010101, %eax	/ extend low byte to each byte
55
56	cmpl	$256, %ecx	/ areas of 256 bytes or fewer don't benefit from alignment
57	jbe	.wordset
58
59	cmpl	$511, %ecx	/ areas of 511 bytes or fewer are 8 byte aligned, then wordset
60	jbe	.check_wordset
61
62	/
63	/ prep work for sse temporal and non-temporal
64	/ (only reached when %ecx >= 512)
65
66	pushl	%ebx		/ more registers are needed
67	pushl	%esi		/ for alignment work
68
69	/
70	/ align address to 64 byte boundaries.
71	/
72
73	movl	%ecx, %ebx	/ save byte count
74	movl	%edi, %esi	/ esi is scratch register
75	andl	$63, %esi	/ offset of address within its 64 byte block
76	neg	%esi		/ compute count of bytes
77	addl	$64, %esi	/ needed to align
78	andl	$63, %esi	/ to 64 byte align addr (0 if already aligned)
79	jz	.sse_aligned	/ skip alignment if not needed
80	subl	%esi, %ebx	/ ebx contains remainder of bytes to set
81	movl	%esi, %ecx	/ alignment bytes
82	shrl	$2,%ecx		/ %ecx = number of 4 byte words to set
83	rep; sstol
84	movl	%esi,%ecx
85	andl	$3,%ecx		/ %ecx = number of alignment bytes left
86	rep; sstob
87	movl	%ebx, %ecx	/ remainder to be set
88
89.sse_aligned:
	/ on both ways in, %ecx == %ebx == number of bytes still to set
90
91	shr	$6, %ecx	/ number of 64 byte blocks to set
92
93	/
94	/ load xmm0 with bytes to be set
95	/
96	subl	$16,%esp	/ give ourselves some working room on the stack
97	movl	%eax,(%esp)	/ copy eax into each of the 4 dwords of the buffer
98	movl	%eax,4(%esp)	/ avoid pushl since it causes more interlocking
99	movl	%eax,8(%esp)	/
100	movl	%eax,12(%esp)	/
101	movups	(%esp), %xmm0	/ unaligned load from stack into xmm0
102	addl	$16,%esp	/ restore stack position
103
104	cmpl	$262143, %ebx	/ blocks of this size or smaller allocate in the cache
105	jbe	.sse_loop
106	jmp	.sse_nt_loop	/ branch across alignment nops
107
108	.align 16
109
110.sse_nt_loop:
111	movntps %xmm0, (%edi)	/ block non-temporal store
112	movntps %xmm0, 16(%edi)	/ use sse rather than sse2
113	movntps %xmm0, 32(%edi)	/ so we work more places
114	movntps %xmm0, 48(%edi)	/
115
116	addl	$64, %edi	/ increment dest address
117	dec	%ecx		/ dec count of blocks
118	jnz	.sse_nt_loop	/ jump if not done
119
120	andl	$63, %ebx	/ remainder of bytes to set
121	movl	%ebx, %ecx	/ ecx contains remainder of bytes to set
122	popl	%esi		/ restore stack config
123	popl	%ebx		/
	/ fence after the non-temporal stores, before the tail is set
124#if defined(_SSE2_INSN)
125	mfence
126#elif defined(_SSE_INSN)
127	sfence
128#else
129#error "Must have either SSE or SSE2"
130#endif
131	cmpl	$20, %ecx	/ compare and jump accordingly
132	jbe	.byteset
133	jmp	.wordset
134
135	.align 16
136.sse_loop:
137 	movaps %xmm0, (%edi)	/ block store w/ SSE (aligned, temporal)
138	movaps %xmm0, 16(%edi)
139	movaps %xmm0, 32(%edi)
140	movaps %xmm0, 48(%edi)
141
142	addl	$64, %edi	/ increment addr
143	dec	%ecx		/ dec count of blocks
144	jnz	.sse_loop	/ jump if not done
145
146	andl	$63, %ebx	/ remainder of bytes to set
147	movl	%ebx, %ecx	/ in %ecx as normal
148	popl	%esi		/ restore stack config
149	popl	%ebx		/
150	cmpl	$20, %ecx	/ same tail dispatch as the non-temporal path
151	jbe	.byteset
152	jmp	.wordset
153
154.check_wordset:
155	movl	%edi, %edx	/ save current store ptr
156	andl	$7, %edi	/ check alignment (sets ZF if 8 byte aligned)
157	movl	%edx,%edi	/ %edi = string address; movl leaves ZF intact
158	jz	.wordset	/ all ok
159
160
161.align_wordset:
162	pushl	%ebx		/ more registers are needed
163	pushl	%esi
164
165	movl	%ecx, %ebx	/ save byte count
166	movl	%edi, %esi
167	andl	$7, %esi	/ offset of address within its 8 byte block
168	neg	%esi
169	addl	$8, %esi
170	andl	$7, %esi	/ %esi = bytes needed to reach 8 byte alignment
171	subl	%esi, %ebx	/ ebx contains remainder of bytes to set
172	movl	%esi, %ecx
173	rep; sstob
174	movl	%ebx, %ecx	/ remainder to be set
175	popl	%esi		/ restore stack config
176	popl	%ebx		/
177
178.wordset:
179	movl	%ecx, %edx	/ save count
180	shrl	$2,%ecx		/ %ecx = number of 4 byte words to set
181	rep; sstol
182	movl	%edx,%ecx
183	andl	$3,%ecx		/ %ecx = number of bytes left
184
185.byteset:
186	rep; sstob
187	movl	8(%esp),%eax	/ return string address
188	popl	%edi		/ restore register variable
189	ret
190	SET_SIZE(memset)
191