xref: /titanic_52/usr/src/lib/libc/amd64/gen/memset.s (revision bdfc6d18da790deeec2e0eb09c625902defe2498)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License").  You may not use this file except in compliance
7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22/*
23 * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27/*
28 * Copyright (c) 2002 Advanced Micro Devices, Inc.
29 *
30 * All rights reserved.
31 *
32 * Redistribution and  use in source and binary  forms, with or
33 * without  modification,  are   permitted  provided  that  the
34 * following conditions are met:
35 *
36 * + Redistributions  of source  code  must  retain  the  above
37 *   copyright  notice,   this  list  of   conditions  and  the
38 *   following disclaimer.
39 *
40 * + Redistributions  in binary  form must reproduce  the above
41 *   copyright  notice,   this  list  of   conditions  and  the
42 *   following  disclaimer in  the  documentation and/or  other
43 *   materials provided with the distribution.
44 *
45 * + Neither the  name of Advanced Micro Devices,  Inc. nor the
46 *   names  of  its contributors  may  be  used  to endorse  or
47 *   promote  products  derived   from  this  software  without
48 *   specific prior written permission.
49 *
50 * THIS  SOFTWARE  IS PROVIDED  BY  THE  COPYRIGHT HOLDERS  AND
51 * CONTRIBUTORS AS IS AND  ANY EXPRESS OR IMPLIED WARRANTIES,
52 * INCLUDING,  BUT NOT  LIMITED TO,  THE IMPLIED  WARRANTIES OF
53 * MERCHANTABILITY  AND FITNESS  FOR A  PARTICULAR  PURPOSE ARE
54 * DISCLAIMED.  IN  NO  EVENT  SHALL  ADVANCED  MICRO  DEVICES,
55 * INC.  OR CONTRIBUTORS  BE LIABLE  FOR ANY  DIRECT, INDIRECT,
56 * INCIDENTAL,  SPECIAL,  EXEMPLARY,  OR CONSEQUENTIAL  DAMAGES
57 * (INCLUDING,  BUT NOT LIMITED  TO, PROCUREMENT  OF SUBSTITUTE
58 * GOODS  OR  SERVICES;  LOSS  OF  USE, DATA,  OR  PROFITS;  OR
59 * BUSINESS INTERRUPTION)  HOWEVER CAUSED AND ON  ANY THEORY OF
60 * LIABILITY,  WHETHER IN CONTRACT,  STRICT LIABILITY,  OR TORT
61 * (INCLUDING NEGLIGENCE  OR OTHERWISE) ARISING IN  ANY WAY OUT
62 * OF THE  USE  OF  THIS  SOFTWARE, EVEN  IF  ADVISED  OF  THE
63 * POSSIBILITY OF SUCH DAMAGE.
64 *
65 * It is  licensee's responsibility  to comply with  any export
66 * regulations applicable in licensee's jurisdiction.
67 */
68
69	.ident	"%Z%%M%	%I%	%E% SMI"
70
71	.file	"%M%"
72
73#include <sys/asm_linkage.h>
74
75	ANSI_PRAGMA_WEAK(memset,function)
76
77#include "SYS.h"
78#include "cache.h"
79
80	ANSI_PRAGMA_WEAK2(_private_memset,memset,function)
81
/*
 * LABEL(s) forms a label name by gluing the argument s directly onto the
 * ".memset" prefix, so every label used by the routine below shares that
 * prefix.  The empty comment between the prefix and the argument is the
 * pre-ANSI token-pasting idiom: it only works when the preprocessor drops
 * the comment without leaving whitespace behind.
 * NOTE(review): an ISO C preprocessor replaces a comment with one space,
 * which would split the name -- confirm this file is always preprocessed
 * in traditional mode.
 */
82#define LABEL(s) .memset/**/s
83
/*
 * void *memset(void *s, int c, size_t n)
 *
 * Store the low byte of c into n consecutive bytes starting at s and
 * return s.
 *
 * Register roles inside the routine:
 *   %rdi  write cursor (s on entry, advanced as data is stored)
 *   %rsi  fill pattern: the low byte of c replicated into all 8 bytes
 *   %rdx  number of bytes still to be written
 *   %rax  return value (the original s)
 *   %rcx  scratch / loop counter
 *   %r8   number of bytes handled with ordinary (cached) stores on the
 *         large-size path
 * No stack is used and no callee-saved register is touched.
 *
 * Size dispatch:
 *   n < 64          LABEL(1): one store per set bit of the count
 *   64 <= n <= 256  LABEL(32): 32-byte blocks, then LABEL(1) for the tail
 *   n > 256         align %rdi to 8 bytes, then write
 *                   min(remaining, .amd64cache2) bytes with rep stosq
 *                   (remaining >= 2048) or with the 64-byte loop
 *                   (remaining < 2048); whatever is left after that is
 *                   written with non-temporal stores at LABEL(NT), and
 *                   the sub-block tail goes back through LABEL(32) and
 *                   LABEL(1).
 *
 * The loops depend on the flags produced by dec/inc surviving the mov and
 * lea instructions placed between them and the conditional jump; keep
 * that ordering when editing.
 *
 * Every return is written as "rep; ret", i.e. a two-byte return.
 * NOTE(review): presumably the AMD recommendation against a one-byte ret
 * that is a branch target or directly follows a conditional jump --
 * confirm.
 */
84	ENTRY(memset)                	/* (void *s, int c, size_t n) */
85
86	mov	$0x0101010101010101, %rcx /* 0x01 in each of the eight byte lanes */
87        movzx   %sil, %rsi		/* rsi = low byte of c, zero-extended */
88        imul    %rcx, %rsi		/* replicate 8 times */
89
90LABEL(try1):
91        cmp     $64, %rdx
92        mov     %rdi, %rax		/* return memory block address (even for bzero ()); mov leaves the cmp flags intact */
93        jae	LABEL(1after)		/* n >= 64: use the block loops */
94
/*
 * Tail writer for fewer than 64 bytes.  Bits 0..5 of %dl are tested in
 * turn and each set bit triggers a store of 1, 2, 4, 8, 16 or 32 bytes,
 * so the stores add up to exactly (count & 63).  Code jumping here from
 * the block loops has already masked the count down to the tail size.
 */
95LABEL(1):                                /* 1-byte */
96        test    $1, %dl
97        jz      LABEL(1a)
98
99        mov     %sil, (%rdi)
100        inc	%rdi
101
102LABEL(1a):				/* 2-byte */
103        test    $2, %dl
104        jz      LABEL(1b)
105
106        mov     %si, (%rdi)
107        add	$2, %rdi
108
109LABEL(1b):				/* 4-byte */
110        test    $4, %dl
111        jz      LABEL(1c)
112
113        mov     %esi, (%rdi)
114	add	$4, %rdi
115
116LABEL(1c):				/* 8-byte */
117        test    $8, %dl
118        jz      LABEL(1d)
119
120        mov     %rsi, (%rdi)
121	add	$8, %rdi
122
123LABEL(1d):				/* 16-byte */
124        test    $16, %dl
125        jz      LABEL(1e)
126
127        mov     %rsi,   (%rdi)
128        mov     %rsi, 8 (%rdi)
129	add	$16, %rdi
130
131LABEL(1e):				/* 32-byte */
132
133        test    $32, %dl
134        jz      LABEL(1f)
135
136        mov     %rsi,    (%rdi)
137        mov     %rsi,  8 (%rdi)
138        mov     %rsi, 16 (%rdi)
139        mov     %rsi, 24 (%rdi)
/* The cursor update is left disabled: nothing is stored after this point. */
140/*	add	$32, %rdi */
141
142LABEL(1f):
143
144LABEL(exit):
145        rep
146        ret
147
148        .p2align 4
149
150LABEL(1after):
151
152LABEL(32try):
153        cmp     $256, %rdx
154        ja     LABEL(32after)		/* more than 256 bytes: aligned bulk paths */
155
/*
 * 32-byte block loop.  The block count is taken from the 32-bit %edx, so
 * the count must fit in 32 bits here: entry from LABEL(32try) has
 * n <= 256, and the other jumps to this label come right after the count
 * has been masked.  A count below 32 skips straight to the tail.
 */
156LABEL(32):                               /* 32-byte */
157        mov     %edx, %ecx
158        shr     $5, %ecx		/* ecx = number of whole 32-byte blocks */
159        jz      LABEL(32skip)
160
161        .p2align 4
162
/*
 * Unrolled twice.  Each dec sets ZF for the jz/jnz that follows the
 * stores; the mov and lea in between do not modify flags.
 */
163LABEL(32loop):
164        dec     %ecx
165
166        mov     %rsi,    (%rdi)
167        mov     %rsi,  8 (%rdi)
168        mov     %rsi, 16 (%rdi)
169        mov     %rsi, 24 (%rdi)
170
171        lea     32 (%rdi), %rdi		/* advance cursor without touching flags */
172
173        jz      LABEL(32skip)
174
175        dec     %ecx
176
177        mov     %rsi,    (%rdi)
178        mov     %rsi,  8 (%rdi)
179        mov     %rsi, 16 (%rdi)
180        mov     %rsi, 24 (%rdi)
181
182        lea     32 (%rdi), %rdi
183
184        jnz     LABEL(32loop)
185
186        .p2align 4
187
188LABEL(32skip):
189        and     $31, %edx		/* bytes left after the 32-byte blocks */
190        jnz     LABEL(1)
191
192        rep
193        ret
194
195        .p2align 4
196
197LABEL(32after):
198
/*
 * Prefetch the cache line holding the variable .amd64cache1.  _sref_ is
 * defined outside this file (presumably it forms the address of the
 * symbol).
 * NOTE(review): the value actually loaded at LABEL(alignafter) is
 * .amd64cache2, not .amd64cache1 -- this only helps if both variables
 * share a cache line; confirm against their definition.
 */
199	/* 3DNow: use prefetch */
200	prefetchnta _sref_(.amd64cache1) /* improves test further ahead on B0 */
201
202LABEL(aligntry):
203        mov     %edi, %ecx              /* align by destination */
204
205        and     $7, %ecx                /* skip if already aligned */
206        jz      LABEL(alignafter)
207
/*
 * Here ecx = (s & 7) is nonzero, so 8 - ecx bytes are needed to reach the
 * next 8-byte boundary.  The remaining count is reduced by that amount up
 * front, and ecx becomes the negated byte count so the loop can count up
 * to zero with inc.
 */
208LABEL(align):                            /* align */
209        lea     -8 (%rcx, %rdx), %rdx	/* rdx -= 8 - (s & 7) */
210        sub     $8, %ecx		/* ecx = -(bytes to the boundary) */
211
212        .p2align 4
213
214LABEL(alignloop):
215        inc     %ecx			/* ZF set on the last byte; tested by jnz below */
216
217        mov     %sil, (%rdi)
218        lea     1 (%rdi), %rdi
219
220        jnz     LABEL(alignloop)
221
222        .p2align 4
223
/*
 * r8 = min(.amd64cache2, rdx), compared unsigned: the part of the request
 * that is written with ordinary stores.
 * NOTE(review): .amd64cache2 is defined outside this file; presumably a
 * cache-size derived threshold -- confirm.
 */
224LABEL(alignafter):
225        mov	_sref_(.amd64cache2), %r8
226        cmp     %rdx, %r8
227        cmova   %rdx, %r8		/* if r8 > rdx then r8 = rdx */
228
229	cmp	$2048, %rdx		/* this is slow for some block sizes */
230	jb	LABEL(64)		/* fewer than 2048 bytes left: 64-byte loop */
231
/*
 * rep stosq path: rcx = r8 / 8 quadwords, r8 rounded down to the number
 * of bytes those quadwords cover.  stosq stores %rax, so the pattern and
 * the return value trade places around the string instruction.  This
 * assumes the direction flag is clear, as the ABI specifies on function
 * entry.
 */
232LABEL(fast):				/* microcode */
233	mov	%r8, %rcx
234	and	$-8, %r8
235	shr	$3, %rcx
/* Disabled early-out: rep stosq with rcx == 0 stores nothing anyway. */
236/*	jz	LABEL(fastskip) */
237
238	xchg	%rax, %rsi		/* rax = pattern, rsi = saved return value */
239
240	rep
241	stosq
242
243	xchg	%rax, %rsi		/* restore: rax = return value, rsi = pattern */
244
/*
 * rdx -= bytes just written.  r8 never exceeds rdx, so the subtraction
 * cannot borrow and ja is taken whenever any byte remains.
 * NOTE(review): falling through therefore implies rdx == 0, which makes
 * the and/jnz pair below a no-op; the 1..7 byte tail is actually handled
 * via LABEL(64after).
 */
245LABEL(fastskip):
246	sub	%r8, %rdx
247	ja	LABEL(64after)
248
249	and	$7, %edx
250	jnz	LABEL(1)
251
252	rep
253	ret
254
255	.p2align 4
256
257LABEL(64try):
258
/*
 * 64-byte block loop: rcx = r8 / 64 blocks, r8 rounded down to the bytes
 * those blocks cover.  The first block is peeled off ahead of the loop,
 * and the loop decrements before storing, so rcx blocks are written in
 * total.
 * NOTE(review): this requires rcx >= 2 on entry (r8 >= 128), otherwise the
 * counter wraps.  At least 250 bytes remain when this path is reached
 * (n > 256 minus at most 7 alignment bytes), so the requirement holds as
 * long as .amd64cache2 is at least 128 -- confirm.
 */
259LABEL(64):                               /* 64-byte */
260        mov     %r8, %rcx
261        and     $-64, %r8
262        shr     $6, %rcx
263
264        dec     %rcx                    /* this iteration starts the prefetcher sooner */
265
266        mov     %rsi,    (%rdi)
267        mov     %rsi,  8 (%rdi)
268        mov     %rsi, 16 (%rdi)
269        mov     %rsi, 24 (%rdi)
270        mov     %rsi, 32 (%rdi)
271        mov     %rsi, 40 (%rdi)
272        mov     %rsi, 48 (%rdi)
273        mov     %rsi, 56 (%rdi)
274
275        lea     64 (%rdi), %rdi
276
277        .p2align 4
278
279LABEL(64loop):
280        dec     %rcx			/* ZF consumed by jnz after the stores */
281
282        mov     %rsi,    (%rdi)
283        mov     %rsi,  8 (%rdi)
284        mov     %rsi, 16 (%rdi)
285        mov     %rsi, 24 (%rdi)
286        mov     %rsi, 32 (%rdi)
287        mov     %rsi, 40 (%rdi)
288        mov     %rsi, 48 (%rdi)
289        mov     %rsi, 56 (%rdi)
290
291        lea     64 (%rdi), %rdi
292
293        jnz     LABEL(64loop)
294
/*
 * rdx -= bytes just written; any remainder continues at LABEL(64after).
 * NOTE(review): as at LABEL(fastskip), falling through implies rdx == 0,
 * so the and/jnz pair below has no effect.
 */
295LABEL(64skip):
296        sub     %r8, %rdx
297        ja      LABEL(64after)
298
299	and     $63, %edx
300	jnz     LABEL(32)
301
302        rep
303        ret
304
305        .p2align 4
306
307LABEL(64after):
308
309LABEL(NTtry):
310
/*
 * Remainder path: whatever was not covered by the cached stores above is
 * written in 128-byte blocks with non-temporal stores (movnti).  When
 * fewer than 128 bytes remain the loop is skipped and only the tail
 * dispatch at LABEL(NTskip) runs.
 */
311LABEL(NT):                               /* 128-byte */
312        mov     %rdx, %rcx
313        shr     $7, %rcx		/* rcx = number of whole 128-byte blocks */
314        jz      LABEL(NTskip)
315
316        .p2align 4
317
318LABEL(NTloop):                  /* on an MP system it would be better to prefetchnta 320 (%rdi) and 384 (%rdi) here, but not so on an 1P system */
319        dec     %rcx			/* ZF consumed by jnz after the stores */
320
321        movnti  %rsi,     (%rdi)
322        movnti  %rsi,   8 (%rdi)
323        movnti  %rsi,  16 (%rdi)
324        movnti  %rsi,  24 (%rdi)
325        movnti  %rsi,  32 (%rdi)
326        movnti  %rsi,  40 (%rdi)
327        movnti  %rsi,  48 (%rdi)
328        movnti  %rsi,  56 (%rdi)
329        movnti  %rsi,  64 (%rdi)
330        movnti  %rsi,  72 (%rdi)
331        movnti  %rsi,  80 (%rdi)
332        movnti  %rsi,  88 (%rdi)
333        movnti  %rsi,  96 (%rdi)
334        movnti  %rsi, 104 (%rdi)
335        movnti  %rsi, 112 (%rdi)
336        movnti  %rsi, 120 (%rdi)
337
338        lea     128 (%rdi), %rdi
339
340        jnz     LABEL(NTloop)
341
342        mfence				/* fence after the non-temporal stores, before any further stores or the return */
343
344LABEL(NTskip):
345        and     $127, %edx		/* bytes left after the 128-byte blocks; also clears the upper half of rdx */
346        jnz     LABEL(32)
347
348        rep
349        ret
350
351	SET_SIZE(memset)
352
352