xref: /titanic_50/usr/src/lib/libc/capabilities/sun4u-us3/common/memset.s (revision 1e49577a7fcde812700ded04431b49d67cc57d6d)
1*1e49577aSRod Evans/*
2*1e49577aSRod Evans * CDDL HEADER START
3*1e49577aSRod Evans *
4*1e49577aSRod Evans * The contents of this file are subject to the terms of the
5*1e49577aSRod Evans * Common Development and Distribution License (the "License").
6*1e49577aSRod Evans * You may not use this file except in compliance with the License.
7*1e49577aSRod Evans *
8*1e49577aSRod Evans * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9*1e49577aSRod Evans * or http://www.opensolaris.org/os/licensing.
10*1e49577aSRod Evans * See the License for the specific language governing permissions
11*1e49577aSRod Evans * and limitations under the License.
12*1e49577aSRod Evans *
13*1e49577aSRod Evans * When distributing Covered Code, include this CDDL HEADER in each
14*1e49577aSRod Evans * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15*1e49577aSRod Evans * If applicable, add the following below this CDDL HEADER, with the
16*1e49577aSRod Evans * fields enclosed by brackets "[]" replaced with your own identifying
17*1e49577aSRod Evans * information: Portions Copyright [yyyy] [name of copyright owner]
18*1e49577aSRod Evans *
19*1e49577aSRod Evans * CDDL HEADER END
20*1e49577aSRod Evans */
21*1e49577aSRod Evans
22*1e49577aSRod Evans/*
23*1e49577aSRod Evans * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
24*1e49577aSRod Evans */
25*1e49577aSRod Evans
26*1e49577aSRod Evans	.file	"memset.s"
27*1e49577aSRod Evans
28*1e49577aSRod Evans/*
29*1e49577aSRod Evans * char *memset(sp, c, n)
30*1e49577aSRod Evans *
31*1e49577aSRod Evans * Set an array of n chars starting at sp to the character c.
32*1e49577aSRod Evans * Return sp.
33*1e49577aSRod Evans *
34*1e49577aSRod Evans * Fast assembler language version of the following C-program for memset
35*1e49577aSRod Evans * which represents the `standard' for the C-library.
36*1e49577aSRod Evans *
37*1e49577aSRod Evans *	void *
38*1e49577aSRod Evans *	memset(void *sp1, int c, size_t n)
39*1e49577aSRod Evans *	{
40*1e49577aSRod Evans *	    if (n != 0) {
41*1e49577aSRod Evans *		char *sp = sp1;
42*1e49577aSRod Evans *		do {
43*1e49577aSRod Evans *		    *sp++ = (char)c;
44*1e49577aSRod Evans *		} while (--n != 0);
45*1e49577aSRod Evans *	    }
46*1e49577aSRod Evans *	    return (sp1);
47*1e49577aSRod Evans *	}
48*1e49577aSRod Evans */
49*1e49577aSRod Evans
50*1e49577aSRod Evans#include <sys/asm_linkage.h>
51*1e49577aSRod Evans#include <sys/sun4asi.h>
52*1e49577aSRod Evans
53*1e49577aSRod Evans	ANSI_PRAGMA_WEAK(memset,function)
54*1e49577aSRod Evans
	! ALIGN8(X): round X up to the next multiple of 8.  Used below to
	! compute an 8-byte-aligned scratch-slot offset above MINFRAME when
	! bouncing the pattern register through the stack into %d0.
55*1e49577aSRod Evans#define	ALIGN8(X)	(((X) + 7) & ~7)
	! BLOCK_SIZE: the 64-byte unit used by the block-store path (and by
	! the Panther prefetch distances).
56*1e49577aSRod Evans#define BLOCK_SIZE      64
57*1e49577aSRod Evans
58*1e49577aSRod Evans	.section        ".text"
	! Align the entry point to a 32-byte boundary (the body also relies
	! on 32-byte instruction alignment for its main store loop).
59*1e49577aSRod Evans	.align 32
60*1e49577aSRod Evans
/*
 * void *memset(void *sp1, int c, size_t n)
 *
 * Structure (three regimes selected by the count in %o2):
 *   n < 12      -> simple byte loop (.wrchar)
 *   12..4095    -> align dst to 8, then 8-byte stx stores (.blkchk path)
 *   n > 4095    -> align dst to 64, then 64-byte FP block stores (.blkwr)
 *
 * Register roles:
 *   %o0 = sp1, never modified after entry (it is the return value)
 *   %o1 = fill byte, progressively replicated to 2/4/8 bytes of c
 *   %o2 = remaining byte count
 *   %o3, %o4 = scratch / loop counters
 *   %o5 = working destination pointer
 *   %g1 = saved %fprs.fef bit, restored at .exit
 *
 * Note on the loops below: branches with the ",a" (annul) bit execute
 * their delay-slot instruction only when the branch is taken, which is
 * what lets each loop keep its store in the delay slot and fall out of
 * the loop without performing one store too many.
 */
61*1e49577aSRod Evans	ENTRY(memset)
62*1e49577aSRod Evans	cmp	%o2, 12			! if small counts, just write bytes
63*1e49577aSRod Evans	bgeu,pn	%ncc, .wrbig
64*1e49577aSRod Evans	mov	%o0, %o5		! copy sp1 before using it
65*1e49577aSRod Evans
	! Small-count path: one byte per iteration.  deccc sets the carry
	! when %o2 underflows past zero, so bgeu (unsigned) falls through
	! exactly after the last byte; the annulled stb in the delay slot
	! only runs on taken (i.e. still-more-to-do) iterations.
66*1e49577aSRod Evans.wrchar:
67*1e49577aSRod Evans	deccc   %o2			! byte clearing loop
68*1e49577aSRod Evans        inc     %o5
69*1e49577aSRod Evans	bgeu,a,pt %ncc, .wrchar
70*1e49577aSRod Evans        stb     %o1, [%o5 + -1]         ! we've already incremented the address
71*1e49577aSRod Evans
72*1e49577aSRod Evans        retl
73*1e49577aSRod Evans	.empty	! next instruction is safe, %o0 still good
74*1e49577aSRod Evans
75*1e49577aSRod Evans.wrbig:
76*1e49577aSRod Evans        andcc	%o5, 7, %o3		! is sp1 aligned on a 8 byte bound
77*1e49577aSRod Evans        bz,pt	%ncc, .blkchk		! already double aligned
78*1e49577aSRod Evans	and	%o1, 0xff, %o1		! o1 is (char)c
79*1e49577aSRod Evans        sub	%o3, 8, %o3		! -(bytes till double aligned)
80*1e49577aSRod Evans        add	%o2, %o3, %o2		! update o2 with new count
81*1e49577aSRod Evans
82*1e49577aSRod Evans	! Set -(%o3) bytes till sp1 double aligned
	! (%o3 is negative here; inccc counts it up toward zero and the
	! loop exits once it becomes non-negative)
83*1e49577aSRod Evans1:	stb	%o1, [%o5]		! there is at least 1 byte to set
84*1e49577aSRod Evans	inccc	%o3			! byte clearing loop
85*1e49577aSRod Evans        bl,pt	%ncc, 1b
86*1e49577aSRod Evans        inc	%o5
87*1e49577aSRod Evans
88*1e49577aSRod Evans
89*1e49577aSRod Evans	! Now sp1 is double aligned (sp1 is found in %o5)
	! Replicate the fill byte across %o1: 1 -> 2 -> 4 -> 8 copies of c.
	! The cmp against 4095 is interleaved here so its result is ready
	! for the bgu below without a stall.
90*1e49577aSRod Evans.blkchk:
91*1e49577aSRod Evans	sll     %o1, 8, %o3
92*1e49577aSRod Evans        or      %o1, %o3, %o1		! now o1 has 2 bytes of c
93*1e49577aSRod Evans
94*1e49577aSRod Evans        sll     %o1, 16, %o3
95*1e49577aSRod Evans        or      %o1, %o3, %o1		! now o1 has 4 bytes of c
96*1e49577aSRod Evans
97*1e49577aSRod Evans	cmp     %o2, 4095		! if large count use Block ld/st
98*1e49577aSRod Evans
99*1e49577aSRod Evans	sllx	%o1, 32, %o3
100*1e49577aSRod Evans	or	%o1, %o3, %o1		! now o1 has 8 bytes of c
101*1e49577aSRod Evans
	! The annulled andcc below runs only when the branch to .blkwr is
	! taken; the condition codes it sets (dst mod 64 == 0?) are consumed
	! by the bz at .blkwr, not here.
102*1e49577aSRod Evans        bgu,a,pn %ncc, .blkwr		! Do block write for large count
103*1e49577aSRod Evans        andcc   %o5, 63, %o3            ! is sp1 block aligned?
104*1e49577aSRod Evans
	! Medium path (12..4095 bytes).  First store (count mod 32) rounded
	! down to doubles, then 32 bytes per iteration, then trailing bytes.
105*1e49577aSRod Evans	and	%o2, 24, %o3		! o3 is {0, 8, 16, 24}
106*1e49577aSRod Evans
	! After this loop %o5 has been advanced 8 bytes PAST the next byte
	! still to be written (the add executes on the final, not-taken
	! iteration while the stx is annulled) -- the offsets in the loops
	! below account for that.
107*1e49577aSRod Evans1:	subcc	%o3, 8, %o3		! double-word loop
108*1e49577aSRod Evans	add	%o5, 8, %o5
109*1e49577aSRod Evans	bgeu,a,pt %ncc, 1b
110*1e49577aSRod Evans	stx	%o1, [%o5 - 8]		! already incremented the address
111*1e49577aSRod Evans
112*1e49577aSRod Evans	andncc	%o2, 31, %o4		! o4 has 32 byte aligned count
113*1e49577aSRod Evans	bz,pn	%ncc, 3f		! First instruction of icache line
	! 32-byte main loop; stores span [%o5-8, %o5+24) because %o5 runs
	! 8 ahead of the write cursor (see note above).
114*1e49577aSRod Evans2:
115*1e49577aSRod Evans	subcc	%o4, 32, %o4		! main loop, 32 bytes per iteration
116*1e49577aSRod Evans	stx	%o1, [%o5 - 8]
117*1e49577aSRod Evans	stx	%o1, [%o5]
118*1e49577aSRod Evans	stx	%o1, [%o5 + 8]
119*1e49577aSRod Evans	stx	%o1, [%o5 + 16]
120*1e49577aSRod Evans	bnz,pt	%ncc, 2b
121*1e49577aSRod Evans	add	%o5, 32, %o5
122*1e49577aSRod Evans
123*1e49577aSRod Evans3:
124*1e49577aSRod Evans	and	%o2, 7, %o2		! o2 has the remaining bytes (<8)
125*1e49577aSRod Evans
	! Trailing-byte loop: %o5 is still 8 ahead, and inc pushes it one
	! further before the store, hence the -9 displacement.
126*1e49577aSRod Evans4:
127*1e49577aSRod Evans	deccc   %o2                     ! byte clearing loop
128*1e49577aSRod Evans        inc     %o5
129*1e49577aSRod Evans        bgeu,a,pt %ncc, 4b
130*1e49577aSRod Evans        stb     %o1, [%o5 - 9]		! already incremented the address
131*1e49577aSRod Evans
132*1e49577aSRod Evans	retl
133*1e49577aSRod Evans	nop				! %o0 still preserved
134*1e49577aSRod Evans
	! Large-count path.  Condition codes here were set by the annulled
	! "andcc %o5, 63, %o3" in the delay slot of the branch that got us
	! here: Z set means dst is already 64-byte aligned.
135*1e49577aSRod Evans.blkwr:
136*1e49577aSRod Evans        bz,pn   %ncc, .blalign		! now block aligned
137*1e49577aSRod Evans        sub	%o3, 64, %o3		! o3 is -(bytes till block aligned)
138*1e49577aSRod Evans	add	%o2, %o3, %o2		! o2 is the remainder
139*1e49577aSRod Evans
140*1e49577aSRod Evans        ! Store -(%o3) bytes till dst is block (64 byte) aligned.
141*1e49577aSRod Evans        ! Use double word stores.
142*1e49577aSRod Evans	! Recall that dst is already double word aligned
143*1e49577aSRod Evans1:
144*1e49577aSRod Evans        stx     %o1, [%o5]
145*1e49577aSRod Evans	addcc   %o3, 8, %o3
146*1e49577aSRod Evans	bl,pt	%ncc, 1b
147*1e49577aSRod Evans	add     %o5, 8, %o5
148*1e49577aSRod Evans
149*1e49577aSRod Evans	! sp1 is block aligned
	! Save the caller's fprs.fef bit in %g1 and enable the FPU if it
	! was disabled (the annulled wr runs only when fef was 0); %g1 is
	! written back to %fprs at .exit, leaving du/dl clear either way.
150*1e49577aSRod Evans.blalign:
151*1e49577aSRod Evans        rd      %fprs, %g1              ! g1 = fprs
152*1e49577aSRod Evans
153*1e49577aSRod Evans	and	%o2, 63, %o3		! calc bytes left after blk store.
154*1e49577aSRod Evans
155*1e49577aSRod Evans	andcc	%g1, 0x4, %g1		! fprs.du = fprs.dl = 0
156*1e49577aSRod Evans	bz,a	%ncc, 2f		! Is fprs.fef == 0
157*1e49577aSRod Evans        wr      %g0, 0x4, %fprs         ! fprs.fef = 1
158*1e49577aSRod Evans2:
	! Fast case c == 0: build the 64-byte zero pattern in %d0-%d14
	! entirely in FP registers.  fmuld of the just-zeroed %d0 also
	! produces zero; NOTE(review): presumably the fzero/fmuld mix is
	! scheduled to spread work across different FP units -- confirm
	! against the UltraSPARC III pipeline docs.
159*1e49577aSRod Evans	brnz,pn	%o1, 3f			! %o1 is safe to check all 64-bits
160*1e49577aSRod Evans	andn	%o2, 63, %o4		! calc size of blocks in bytes
161*1e49577aSRod Evans	fzero   %d0
162*1e49577aSRod Evans	fzero   %d2
163*1e49577aSRod Evans	fzero   %d4
164*1e49577aSRod Evans	fzero   %d6
165*1e49577aSRod Evans	fmuld   %d0, %d0, %d8
166*1e49577aSRod Evans	fzero   %d10
167*1e49577aSRod Evans	ba	4f
168*1e49577aSRod Evans	fmuld   %d0, %d0, %d12
169*1e49577aSRod Evans
	! c != 0: bounce the 8-byte pattern through a stack scratch slot to
	! get it from %o1 into %d0 (NOTE(review): this target has no direct
	! integer-to-FP register move, hence the store/load round trip),
	! then fan it out to %d2-%d12.
170*1e49577aSRod Evans3:
171*1e49577aSRod Evans	! allocate 8 bytes of scratch space on the stack
172*1e49577aSRod Evans	add	%sp, -SA(16), %sp
173*1e49577aSRod Evans	stx	%o1, [%sp + STACK_BIAS + ALIGN8(MINFRAME)]  ! move %o1 to %d0
174*1e49577aSRod Evans	ldd	[%sp + STACK_BIAS + ALIGN8(MINFRAME)], %d0
175*1e49577aSRod Evans
176*1e49577aSRod Evans	fmovd	%d0, %d2
177*1e49577aSRod Evans	add	%sp, SA(16), %sp	! deallocate the scratch space
178*1e49577aSRod Evans	fmovd	%d0, %d4
179*1e49577aSRod Evans	fmovd	%d0, %d6
180*1e49577aSRod Evans	fmovd	%d0, %d8
181*1e49577aSRod Evans	fmovd	%d0, %d10
182*1e49577aSRod Evans	fmovd	%d0, %d12
183*1e49577aSRod Evans4:
184*1e49577aSRod Evans	fmovd	%d0, %d14
185*1e49577aSRod Evans
	! Main block loop: 64 bytes of c per iteration from %d0-%d14.
186*1e49577aSRod Evans	! 1st quadrant has 64 bytes of c
187*1e49577aSRod Evans	! instructions 32-byte aligned here
188*1e49577aSRod Evans#ifdef PANTHER_ONLY
189*1e49577aSRod Evans	! Panther only code
190*1e49577aSRod Evans	prefetch	[%o5 + (3 * BLOCK_SIZE)], 22
191*1e49577aSRod Evans	prefetch	[%o5 + (6 * BLOCK_SIZE)], 22
192*1e49577aSRod Evans	std	%d0, [%o5]
193*1e49577aSRod Evans	std	%d0, [%o5 + 8]
194*1e49577aSRod Evans	std	%d0, [%o5 + 16]
195*1e49577aSRod Evans	std	%d0, [%o5 + 24]
196*1e49577aSRod Evans	std	%d0, [%o5 + 32]
197*1e49577aSRod Evans	std	%d0, [%o5 + 40]
198*1e49577aSRod Evans	std	%d0, [%o5 + 48]
199*1e49577aSRod Evans	std	%d0, [%o5 + 56]
200*1e49577aSRod Evans#else
201*1e49577aSRod Evans	! Cheetah/Jaguar code
202*1e49577aSRod Evans        stda    %d0, [%o5]ASI_BLK_P
203*1e49577aSRod Evans#endif
204*1e49577aSRod Evans        subcc   %o4, 64, %o4
205*1e49577aSRod Evans        bgu,pt	%ncc, 4b
206*1e49577aSRod Evans        add     %o5, 64, %o5
207*1e49577aSRod Evans
208*1e49577aSRod Evans	! Set the remaining doubles
	! (%o3 = bytes left after the block loop, computed at .blalign)
209*1e49577aSRod Evans	subcc   %o3, 8, %o3		! Can we store any doubles?
210*1e49577aSRod Evans	blu,pn  %ncc, 6f
211*1e49577aSRod Evans	and	%o2, 7, %o2		! calc bytes left after doubles
212*1e49577aSRod Evans
213*1e49577aSRod Evans5:
214*1e49577aSRod Evans	std     %d0, [%o5]		! store the doubles
215*1e49577aSRod Evans	subcc   %o3, 8, %o3
216*1e49577aSRod Evans	bgeu,pt	%ncc, 5b
217*1e49577aSRod Evans        add     %o5, 8, %o5
218*1e49577aSRod Evans6:
219*1e49577aSRod Evans	! Set the remaining bytes
220*1e49577aSRod Evans	brz	%o2, .exit		! safe to check all 64-bits
221*1e49577aSRod Evans
	! Disabled alternative: finish with a single VIS partial store
	! (edge mask + ASI_PST8_P) instead of the byte loop below.
222*1e49577aSRod Evans#if 0
223*1e49577aSRod Evans	! Terminate the copy with a partial store. (bug 1200071 does not apply)
224*1e49577aSRod Evans	! The data should be at d0
225*1e49577aSRod Evans        dec     %o2                     ! needed to get the mask right
226*1e49577aSRod Evans	edge8n	%g0, %o2, %o4
227*1e49577aSRod Evans	stda	%d0, [%o5]%o4, ASI_PST8_P
228*1e49577aSRod Evans#else
229*1e49577aSRod Evans7:
230*1e49577aSRod Evans	deccc	%o2
231*1e49577aSRod Evans	stb	%o1, [%o5]
232*1e49577aSRod Evans	bgu,pt	%ncc, 7b
233*1e49577aSRod Evans	inc	%o5
234*1e49577aSRod Evans#endif
235*1e49577aSRod Evans
	! NOTE(review): the membar orders the preceding stores (including
	! the block-store ASI stores, which are weakly ordered) before any
	! subsequent memory access by the caller -- see UltraSPARC
	! block-store ordering rules.  The delay-slot wr restores the
	! caller's fprs.fef from %g1.
236*1e49577aSRod Evans.exit:
237*1e49577aSRod Evans        membar  #StoreLoad|#StoreStore
238*1e49577aSRod Evans        retl				! %o0 was preserved
239*1e49577aSRod Evans        wr	%g1, %g0, %fprs         ! fprs = g1  restore fprs
240*1e49577aSRod Evans
241*1e49577aSRod Evans	SET_SIZE(memset)
242