xref: /titanic_44/usr/src/lib/libc/amd64/gen/memset.s (revision 99ebb4ca412cb0a19d77a3899a87c055b9c30fa8)
1/*
2 * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
3 * Use is subject to license terms.
4 */
5
6/*
7 * Copyright (c) 2002 Advanced Micro Devices, Inc.
8 *
9 * All rights reserved.
10 *
11 * Redistribution and  use in source and binary  forms, with or
12 * without  modification,  are   permitted  provided  that  the
13 * following conditions are met:
14 *
15 * + Redistributions  of source  code  must  retain  the  above
16 *   copyright  notice,   this  list  of   conditions  and  the
17 *   following disclaimer.
18 *
19 * + Redistributions  in binary  form must reproduce  the above
20 *   copyright  notice,   this  list  of   conditions  and  the
21 *   following  disclaimer in  the  documentation and/or  other
22 *   materials provided with the distribution.
23 *
24 * + Neither the  name of Advanced Micro Devices,  Inc. nor the
25 *   names  of  its contributors  may  be  used  to endorse  or
26 *   promote  products  derived   from  this  software  without
27 *   specific prior written permission.
28 *
29 * THIS  SOFTWARE  IS PROVIDED  BY  THE  COPYRIGHT HOLDERS  AND
30 * CONTRIBUTORS AS IS AND  ANY EXPRESS OR IMPLIED WARRANTIES,
31 * INCLUDING,  BUT NOT  LIMITED TO,  THE IMPLIED  WARRANTIES OF
32 * MERCHANTABILITY  AND FITNESS  FOR A  PARTICULAR  PURPOSE ARE
33 * DISCLAIMED.  IN  NO  EVENT  SHALL  ADVANCED  MICRO  DEVICES,
34 * INC.  OR CONTRIBUTORS  BE LIABLE  FOR ANY  DIRECT, INDIRECT,
35 * INCIDENTAL,  SPECIAL,  EXEMPLARY,  OR CONSEQUENTIAL  DAMAGES
36 * (INCLUDING,  BUT NOT LIMITED  TO, PROCUREMENT  OF SUBSTITUTE
37 * GOODS  OR  SERVICES;  LOSS  OF  USE, DATA,  OR  PROFITS;  OR
38 * BUSINESS INTERRUPTION)  HOWEVER CAUSED AND ON  ANY THEORY OF
39 * LIABILITY,  WHETHER IN CONTRACT,  STRICT LIABILITY,  OR TORT
40 * (INCLUDING NEGLIGENCE  OR OTHERWISE) ARISING IN  ANY WAY OUT
41 * OF THE  USE  OF  THIS  SOFTWARE, EVEN  IF  ADVISED  OF  THE
42 * POSSIBILITY OF SUCH DAMAGE.
43 *
44 * It is  licensee's responsibility  to comply with  any export
45 * regulations applicable in licensee's jurisdiction.
46 */
47
48	.ident	"%Z%%M%	%I%	%E% SMI"
49
50	.file	"%M%"
51
52#include <sys/asm_linkage.h>
53
54	ANSI_PRAGMA_WEAK(memset,function)
55
56#include "SYS.h"
57#include "cache.h"
58
59	ANSI_PRAGMA_WEAK2(_private_memset,memset,function)
60
/*
 * LABEL(s) names a label private to this routine by gluing s onto the
 * prefix ".memset"; LABEL(1a) is meant to yield ".memset1a".
 * NOTE(review): the empty comment only acts as token pasting under a
 * traditional (pre-ANSI) preprocessor, which drops it without leaving
 * white space -- confirm that .s files are preprocessed in that mode.
 */
61#define LABEL(s) .memset/**/s
62
/*
 * memset(dst, c, n)
 *
 * Stores the low byte of c into the n bytes starting at dst and returns dst.
 *
 * Registers as used below:
 *	%rdi	in: dst; then the running store address
 *	%rsi	in: c; then the fill byte replicated into all eight bytes
 *	%rdx	in: n; then the number of bytes still to be written
 *	%rax	out: the original dst
 *	%rcx	scratch (multiplier, loop counters)
 *	%r8	scratch (size of the chunk written with ordinary stores)
 *
 * Paths, selected by the byte count:
 *	n < 64		LABEL(1): one conditional store per set bit of n
 *	64 <= n <= 256	LABEL(32): 32-byte loop, then LABEL(1) for the rest
 *	n > 256		align dst to 8 bytes, write min(remaining, .amd64cache2)
 *			bytes with rep stosq (remaining >= 2048) or the
 *			64-byte loop (remaining < 2048), and push anything
 *			left through the movnti loop at LABEL(NT)
 *
 * .amd64cache1, .amd64cache2 and _sref_() are defined outside this file
 * (presumably via cache.h -- confirm there what the values represent).
 */
63	ENTRY(memset)                	/* (void *, int, size_t) */
64
65	mov	$0x0101010101010101, %rcx /* multiplier with 0x01 in each of its 8 bytes */
66        movzx   %sil, %rsi
67        imul    %rcx, %rsi		/* replicate 8 times */
68
69LABEL(try1):
70        cmp     $64, %rdx
71        mov     %rdi, %rax		/* return memory block address (even for bzero ()) */
        /* mov leaves the flags alone, so jae still tests the cmp above: n >= 64 */
72        jae	LABEL(1after)
73
        /*
         * Tail code: every path arrives here with fewer than 64 bytes left.
         * Bits 0..5 of the count are tested in turn and each set bit
         * triggers a store of 1, 2, 4, 8, 16 or 32 bytes respectively.
         * No alignment of %rdi is established for these stores.
         */
74LABEL(1):                                /* 1-byte */
75        test    $1, %dl
76        jz      LABEL(1a)
77
78        mov     %sil, (%rdi)
79        inc	%rdi
80
81LABEL(1a):                               /* 2-byte */
82        test    $2, %dl
83        jz      LABEL(1b)
84
85        mov     %si, (%rdi)
86        add	$2, %rdi
87
88LABEL(1b):                               /* 4-byte */
89        test    $4, %dl
90        jz      LABEL(1c)
91
92        mov     %esi, (%rdi)
93	add	$4, %rdi
94
95LABEL(1c):                               /* 8-byte */
96        test    $8, %dl
97        jz      LABEL(1d)
98
99        mov     %rsi, (%rdi)
100	add	$8, %rdi
101
102LABEL(1d):                               /* 16-byte */
103        test    $16, %dl
104        jz      LABEL(1e)
105
106        mov     %rsi,   (%rdi)
107        mov     %rsi, 8 (%rdi)
108	add	$16, %rdi
109
110LABEL(1e):                               /* 32-byte */
111
112        test    $32, %dl
113        jz      LABEL(1f)
114
115        mov     %rsi,    (%rdi)
116        mov     %rsi,  8 (%rdi)
117        mov     %rsi, 16 (%rdi)
118        mov     %rsi, 24 (%rdi)
        /* pointer advance left disabled: nothing is stored after this point */
119/*	add	$32, %rdi */
120
121LABEL(1f):
122
        /*
         * NOTE(review): every return in this routine is "rep; ret" --
         * presumably the two-byte return form AMD recommends for a ret that
         * is itself a branch target; confirm against the AMD optimization
         * guide before changing it.
         */
123LABEL(exit):
124        rep
125        ret
126
127        .p2align 4
128
129LABEL(1after):
130
131LABEL(32try):
132        cmp     $256, %rdx
133        ja     LABEL(32after)		/* more than 256 bytes: aligned bulk paths */
134
        /*
         * 32-byte loop.  Only the low 32 bits of the count are used; the
         * count is at most 256 when entered from LABEL(32try) and has been
         * masked to below 128 when entered from LABEL(NTskip).
         */
135LABEL(32):                               /* 32-byte */
136        mov     %edx, %ecx
137        shr     $5, %ecx			/* %ecx = number of 32-byte groups */
138        jz      LABEL(32skip)
139
140        .p2align 4
141
        /*
         * Unrolled twice.  The jz/jnz at the end of each half test the
         * result of the dec at its start: the mov and lea instructions in
         * between do not modify the flags.
         */
142LABEL(32loop):
143        dec     %ecx
144
145        mov     %rsi,    (%rdi)
146        mov     %rsi,  8 (%rdi)
147        mov     %rsi, 16 (%rdi)
148        mov     %rsi, 24 (%rdi)
149
150        lea     32 (%rdi), %rdi
151
152        jz      LABEL(32skip)
153
154        dec     %ecx
155
156        mov     %rsi,    (%rdi)
157        mov     %rsi,  8 (%rdi)
158        mov     %rsi, 16 (%rdi)
159        mov     %rsi, 24 (%rdi)
160
161        lea     32 (%rdi), %rdi
162
163        jnz     LABEL(32loop)
164
165        .p2align 4
166
167LABEL(32skip):
168        and     $31, %edx		/* bytes left after the 32-byte groups */
169        jnz     LABEL(1)
170
171        rep
172        ret
173
174        .p2align 4
175
176LABEL(32after):
177
178	/* 3DNow: use prefetch */
179	prefetchnta _sref_(.amd64cache1) /* improves test further ahead on B0 */
180
        /*
         * Byte-fill until %rdi is 8-byte aligned.  With a = dst & 7 (non-zero
         * here), 8 - a bytes are written: %rdx is reduced by 8 - a up front
         * and %ecx counts up from a - 8 to zero.
         */
181LABEL(aligntry):
182        mov     %edi, %ecx              /* align by destination */
183
184        and     $7, %ecx                /* skip if already aligned */
185        jz      LABEL(alignafter)
186
187LABEL(align):                            /* align */
188        lea     -8 (%rcx, %rdx), %rdx	/* %rdx -= 8 - a */
189        sub     $8, %ecx			/* %ecx = a - 8 (negative loop counter) */
190
191        .p2align 4
192
193LABEL(alignloop):
194        inc     %ecx
195
196        mov     %sil, (%rdi)
197        lea     1 (%rdi), %rdi		/* lea keeps the flags from inc for the jnz */
198
199        jnz     LABEL(alignloop)
200
201        .p2align 4
202
        /*
         * %rdi is now 8-byte aligned.  %r8 = min(%rdx, .amd64cache2)
         * (unsigned) is the number of bytes to write with ordinary stores;
         * anything beyond that is left for the movnti loop.
         */
203LABEL(alignafter):
204        mov	_sref_(.amd64cache2), %r8
205        cmp     %rdx, %r8
206        cmova   %rdx, %r8
207
208	cmp	$2048, %rdx		/* this is slow for some block sizes */
209	jb	LABEL(64)
210
        /*
         * rep stosq path.  %rcx = %r8 / 8 quadwords are stored and %r8 is
         * rounded down to the matching byte count.  stosq stores %rax, so
         * the fill pattern and the return value trade places around it.
         */
211LABEL(fast):				/* microcode */
212	mov	%r8, %rcx
213	and	$-8, %r8
214	shr	$3, %rcx
215/*	jz	LABEL(fastskip) */
216
217	xchg	%rax, %rsi
218
219	rep
220	stosq
221
222	xchg	%rax, %rsi
223
        /*
         * NOTE(review): %r8 <= %rdx here, so the subtraction cannot borrow;
         * falling through the ja means %rdx is zero, and the "and / jnz"
         * pair below then never branches.
         */
224LABEL(fastskip):
225	sub	%r8, %rdx		/* bytes still unwritten */
226	ja	LABEL(64after)
227
228	and	$7, %edx
229	jnz	LABEL(1)
230
231	rep
232	ret
233
234	.p2align 4
235
236LABEL(64try):
237
        /*
         * 64-byte loop, used when fewer than 2048 bytes remain.  %rcx =
         * %r8 / 64 groups are written and %r8 is rounded down to the
         * matching byte count.  The first group is peeled off ahead of the
         * loop, and the loop decrements before it stores.
         * NOTE(review): this needs %r8 / 64 >= 2 or the counter wraps.  At
         * least 250 bytes remain on this path (more than 256 on entry, at
         * most 7 used for alignment), so it holds provided .amd64cache2 is
         * at least 128 -- confirm the minimum value of that variable.
         */
238LABEL(64):                               /* 64-byte */
239        mov     %r8, %rcx
240        and     $-64, %r8
241        shr     $6, %rcx
242
243        dec     %rcx                    /* this iteration starts the prefetcher sooner */
244
245        mov     %rsi,    (%rdi)
246        mov     %rsi,  8 (%rdi)
247        mov     %rsi, 16 (%rdi)
248        mov     %rsi, 24 (%rdi)
249        mov     %rsi, 32 (%rdi)
250        mov     %rsi, 40 (%rdi)
251        mov     %rsi, 48 (%rdi)
252        mov     %rsi, 56 (%rdi)
253
254        lea     64 (%rdi), %rdi
255
256        .p2align 4
257
258LABEL(64loop):
259        dec     %rcx
260
261        mov     %rsi,    (%rdi)
262        mov     %rsi,  8 (%rdi)
263        mov     %rsi, 16 (%rdi)
264        mov     %rsi, 24 (%rdi)
265        mov     %rsi, 32 (%rdi)
266        mov     %rsi, 40 (%rdi)
267        mov     %rsi, 48 (%rdi)
268        mov     %rsi, 56 (%rdi)
269
270        lea     64 (%rdi), %rdi
271
272        jnz     LABEL(64loop)		/* flags still come from the dec above */
273
        /*
         * NOTE(review): as at LABEL(fastskip), %r8 <= %rdx, so not taking
         * the ja means %rdx is zero and the jnz below never branches; any
         * non-zero remainder goes through LABEL(64after) instead.
         */
274LABEL(64skip):
275        sub     %r8, %rdx		/* bytes still unwritten */
276        ja      LABEL(64after)
277
278	and     $63, %edx
279	jnz     LABEL(32)
280
281        rep
282        ret
283
284        .p2align 4
285
286LABEL(64after):
287
288LABEL(NTtry):
289
        /*
         * Non-temporal path for whatever the ordinary-store paths left
         * behind: %rcx = %rdx / 128 groups of sixteen movnti quadword stores.
         * With fewer than 128 bytes left the loop is skipped entirely.
         */
290LABEL(NT):                               /* 128-byte */
291        mov     %rdx, %rcx
292        shr     $7, %rcx
293        jz      LABEL(NTskip)
294
295        .p2align 4
296
297LABEL(NTloop):                  /* on an MP system it would be better to prefetchnta 320 (%rdi) and 384 (%rdi) here, but not so on an 1P system */
298        dec     %rcx
299
300        movnti  %rsi,     (%rdi)
301        movnti  %rsi,   8 (%rdi)
302        movnti  %rsi,  16 (%rdi)
303        movnti  %rsi,  24 (%rdi)
304        movnti  %rsi,  32 (%rdi)
305        movnti  %rsi,  40 (%rdi)
306        movnti  %rsi,  48 (%rdi)
307        movnti  %rsi,  56 (%rdi)
308        movnti  %rsi,  64 (%rdi)
309        movnti  %rsi,  72 (%rdi)
310        movnti  %rsi,  80 (%rdi)
311        movnti  %rsi,  88 (%rdi)
312        movnti  %rsi,  96 (%rdi)
313        movnti  %rsi, 104 (%rdi)
314        movnti  %rsi, 112 (%rdi)
315        movnti  %rsi, 120 (%rdi)
316
317        lea     128 (%rdi), %rdi
318
319        jnz     LABEL(NTloop)		/* flags still come from the dec above */
320
        /* fence issued once, after the whole movnti stream */
321        mfence
322
323LABEL(NTskip):
324        and     $127, %edx		/* bytes left after the 128-byte groups */
325        jnz     LABEL(32)
326
327        rep
328        ret
329
330	SET_SIZE(memset)
331