xref: /linux/arch/sparc/lib/U1memcpy.S (revision 6f1d827f299085a48cb3a987e1487e16f1a980c9)
1478b8fecSSam Ravnborg/* U1memcpy.S: UltraSPARC-I/II/IIi/IIe optimized memcpy.
2478b8fecSSam Ravnborg *
3478b8fecSSam Ravnborg * Copyright (C) 1997, 2004 David S. Miller (davem@redhat.com)
4478b8fecSSam Ravnborg * Copyright (C) 1996, 1997, 1998, 1999 Jakub Jelinek (jj@ultra.linux.cz)
5478b8fecSSam Ravnborg */
6478b8fecSSam Ravnborg
/*
 * Build configuration.
 *
 * Kernel builds take VISEntry/VISExit and the ASI constants from the
 * included headers and use %g7 as the spare global register.  Other
 * builds use %g5 and define the pieces locally:
 *   VISEntry - saves %fprs in %o5, then writes FPRS_FEF to %fprs.  The
 *              MEMCPY_DEBUG variant additionally clears %g1-%g3 and
 *              sets the condition codes with a subcc (presumably to
 *              imitate the clobbers of the kernel version -- TODO
 *              confirm against asm/visasm.h).
 *   VISExit  - masks the saved value with FPRS_FEF and writes it back
 *              to %fprs.
 */
7478b8fecSSam Ravnborg#ifdef __KERNEL__
8478b8fecSSam Ravnborg#include <asm/visasm.h>
9478b8fecSSam Ravnborg#include <asm/asi.h>
10478b8fecSSam Ravnborg#define GLOBAL_SPARE	g7
11478b8fecSSam Ravnborg#else
12478b8fecSSam Ravnborg#define GLOBAL_SPARE	g5
13478b8fecSSam Ravnborg#define ASI_BLK_P 0xf0
14478b8fecSSam Ravnborg#define FPRS_FEF  0x04
15478b8fecSSam Ravnborg#ifdef MEMCPY_DEBUG
16478b8fecSSam Ravnborg#define VISEntry rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs; \
17478b8fecSSam Ravnborg		 clr %g1; clr %g2; clr %g3; subcc %g0, %g0, %g0;
18478b8fecSSam Ravnborg#define VISExit and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
19478b8fecSSam Ravnborg#else
20478b8fecSSam Ravnborg#define VISEntry rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs
21478b8fecSSam Ravnborg#define VISExit and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
22478b8fecSSam Ravnborg#endif
23478b8fecSSam Ravnborg#endif
24478b8fecSSam Ravnborg
/*
 * Overridable hooks.  Each macro below is defined only if it is not
 * already defined, so a file that includes this one can supply its own
 * version first.  The defaults produce a plain memcpy.
 */

/* EX_LD / EX_ST wrap every load / store; by default they pass the
 * instruction through unchanged.  (Presumably the hook used to attach
 * fault handling in user-copy variants -- TODO confirm with includers.)
 */
25478b8fecSSam Ravnborg#ifndef EX_LD
26478b8fecSSam Ravnborg#define EX_LD(x)	x
27478b8fecSSam Ravnborg#endif
28478b8fecSSam Ravnborg
29478b8fecSSam Ravnborg#ifndef EX_ST
30478b8fecSSam Ravnborg#define EX_ST(x)	x
31478b8fecSSam Ravnborg#endif
32478b8fecSSam Ravnborg
/* EX_RETVAL wraps the register moved into %o0 on return. */
33478b8fecSSam Ravnborg#ifndef EX_RETVAL
34478b8fecSSam Ravnborg#define EX_RETVAL(x)	x
35478b8fecSSam Ravnborg#endif
36478b8fecSSam Ravnborg
/* LOAD: "type [addr], dest" -- an ordinary load of the given mnemonic. */
37478b8fecSSam Ravnborg#ifndef LOAD
38478b8fecSSam Ravnborg#define LOAD(type,addr,dest)	type [addr], dest
39478b8fecSSam Ravnborg#endif
40478b8fecSSam Ravnborg
/* LOAD_BLK: ldda through ASI_BLK_P into the FP registers starting at dest. */
41478b8fecSSam Ravnborg#ifndef LOAD_BLK
42478b8fecSSam Ravnborg#define LOAD_BLK(addr,dest)	ldda [addr] ASI_BLK_P, dest
43478b8fecSSam Ravnborg#endif
44478b8fecSSam Ravnborg
/* STORE: "type src, [addr]" -- an ordinary store of the given mnemonic. */
45478b8fecSSam Ravnborg#ifndef STORE
46478b8fecSSam Ravnborg#define STORE(type,src,addr)	type src, [addr]
47478b8fecSSam Ravnborg#endif
48478b8fecSSam Ravnborg
/* STORE_BLK: stda through ASI_BLK_P from the FP registers starting at src. */
49478b8fecSSam Ravnborg#ifndef STORE_BLK
50478b8fecSSam Ravnborg#define STORE_BLK(src,addr)	stda src, [addr] ASI_BLK_P
51478b8fecSSam Ravnborg#endif
52478b8fecSSam Ravnborg
/* FUNC_NAME: symbol under which the routine is emitted. */
53478b8fecSSam Ravnborg#ifndef FUNC_NAME
54478b8fecSSam Ravnborg#define FUNC_NAME	memcpy
55478b8fecSSam Ravnborg#endif
56478b8fecSSam Ravnborg
/* PREAMBLE: expanded right after the length check at function entry;
 * empty by default.
 */
57478b8fecSSam Ravnborg#ifndef PREAMBLE
58478b8fecSSam Ravnborg#define PREAMBLE
59478b8fecSSam Ravnborg#endif
60478b8fecSSam Ravnborg
/* XCC: condition-code register named by the branches written as %XCC. */
61478b8fecSSam Ravnborg#ifndef XCC
62478b8fecSSam Ravnborg#define XCC xcc
63478b8fecSSam Ravnborg#endif
64478b8fecSSam Ravnborg
/*
 * FREG_FROB: eight faligndata operations over nine consecutive source
 * doubleword registers f1..f9.  Result i combines argument i with
 * argument i+1; the results land in %f48, %f50, ..., %f62.  The last
 * argument (f9) is therefore only ever used as a second operand.
 */
65478b8fecSSam Ravnborg#define FREG_FROB(f1, f2, f3, f4, f5, f6, f7, f8, f9)		\
66478b8fecSSam Ravnborg	faligndata		%f1, %f2, %f48;			\
67478b8fecSSam Ravnborg	faligndata		%f2, %f3, %f50;			\
68478b8fecSSam Ravnborg	faligndata		%f3, %f4, %f52;			\
69478b8fecSSam Ravnborg	faligndata		%f4, %f5, %f54;			\
70478b8fecSSam Ravnborg	faligndata		%f5, %f6, %f56;			\
71478b8fecSSam Ravnborg	faligndata		%f6, %f7, %f58;			\
72478b8fecSSam Ravnborg	faligndata		%f7, %f8, %f60;			\
73478b8fecSSam Ravnborg	faligndata		%f8, %f9, %f62;
74478b8fecSSam Ravnborg
/*
 * MAIN_LOOP_CHUNK: one 0x40-byte step of the block-copy loop.
 *   src, dest - address registers; each is advanced by 0x40
 *   fdest     - first FP register receiving the block loaded from src
 *   fsrc      - first FP register of the block stored to dest
 *   len       - byte counter, decremented by 0x40
 *   jmptgt    - label branched to when len reaches zero
 * The dest increment is in the delay slot of a non-annulled branch, so
 * it is performed whether or not the branch is taken.  The final line
 * of the macro ends in a continuation, so the following blank line is
 * part of the definition.
 */
75478b8fecSSam Ravnborg#define MAIN_LOOP_CHUNK(src, dest, fdest, fsrc, len, jmptgt)	\
76478b8fecSSam Ravnborg	EX_LD(LOAD_BLK(%src, %fdest));				\
77478b8fecSSam Ravnborg	EX_ST(STORE_BLK(%fsrc, %dest));				\
78478b8fecSSam Ravnborg	add			%src, 0x40, %src;		\
79478b8fecSSam Ravnborg	subcc			%len, 0x40, %len;		\
80478b8fecSSam Ravnborg	be,pn			%xcc, jmptgt;			\
81478b8fecSSam Ravnborg	 add			%dest, 0x40, %dest;		\
82478b8fecSSam Ravnborg
/*
 * LOOP_CHUNK1/2/3: MAIN_LOOP_CHUNK with the store source fixed at %f48
 * and the load destination fixed at %f0, %f16 and %f32 respectively.
 */
83478b8fecSSam Ravnborg#define LOOP_CHUNK1(src, dest, len, branch_dest)		\
84478b8fecSSam Ravnborg	MAIN_LOOP_CHUNK(src, dest, f0,  f48, len, branch_dest)
85478b8fecSSam Ravnborg#define LOOP_CHUNK2(src, dest, len, branch_dest)		\
86478b8fecSSam Ravnborg	MAIN_LOOP_CHUNK(src, dest, f16, f48, len, branch_dest)
87478b8fecSSam Ravnborg#define LOOP_CHUNK3(src, dest, len, branch_dest)		\
88478b8fecSSam Ravnborg	MAIN_LOOP_CHUNK(src, dest, f32, f48, len, branch_dest)
89478b8fecSSam Ravnborg
/*
 * DO_SYNC:    a single "membar #Sync".
 * STORE_SYNC: block-store the FP registers starting at fsrc to dest,
 *             advance dest by 0x40, then issue DO_SYNC.
 */
90478b8fecSSam Ravnborg#define DO_SYNC			membar	#Sync;
91478b8fecSSam Ravnborg#define STORE_SYNC(dest, fsrc)				\
92478b8fecSSam Ravnborg	EX_ST(STORE_BLK(%fsrc, %dest));			\
93478b8fecSSam Ravnborg	add			%dest, 0x40, %dest;	\
94478b8fecSSam Ravnborg	DO_SYNC
95478b8fecSSam Ravnborg
/*
 * STORE_JUMP: block-store the FP registers starting at fsrc to dest,
 * advance dest by 0x40, then branch unconditionally to target (the
 * delay slot is a nop).
 */
96478b8fecSSam Ravnborg#define STORE_JUMP(dest, fsrc, target)			\
97478b8fecSSam Ravnborg	EX_ST(STORE_BLK(%fsrc, %dest));			\
98478b8fecSSam Ravnborg	add			%dest, 0x40, %dest;	\
99478b8fecSSam Ravnborg	ba,pt			%xcc, target;		\
100478b8fecSSam Ravnborg	 nop;
101478b8fecSSam Ravnborg
/*
 * FINISH_VISCHUNK: emit one more aligned doubleword if at least 8 bytes
 * remain in 'left'.
 *   dest   - destination address register, advanced by 8 after the store
 *   f0, f1 - macro parameters (not the fixed registers of the same
 *            name): the two source doublewords given to faligndata
 *   left   - remaining-byte counter, decremented by 8
 * If the subtraction goes negative the macro branches to local label
 * 95, which must exist after the expansion point.  The faligndata is in
 * the delay slot of a non-annulled branch, so %f48 is written on both
 * paths.
 */
102478b8fecSSam Ravnborg#define FINISH_VISCHUNK(dest, f0, f1, left)	\
103478b8fecSSam Ravnborg	subcc			%left, 8, %left;\
104478b8fecSSam Ravnborg	bl,pn			%xcc, 95f;	\
105478b8fecSSam Ravnborg	 faligndata		%f0, %f1, %f48;	\
106478b8fecSSam Ravnborg	EX_ST(STORE(std, %f48, %dest));		\
107478b8fecSSam Ravnborg	add			%dest, 8, %dest;
108478b8fecSSam Ravnborg
/*
 * UNEVEN_VISCHUNK_LAST: subtract 8 from 'left' and branch to local
 * label 95 if the result is negative.  The delay slot (executed on both
 * paths) is a register move from the first FP argument to the second.
 * No store is issued and the 'dest' argument is not used by the body.
 * Execution falls through to whatever follows the expansion.
 *
 * UNEVEN_VISCHUNK: the same, followed by an annulled unconditional
 * branch to local label 93 instead of falling through.
 */
109478b8fecSSam Ravnborg#define UNEVEN_VISCHUNK_LAST(dest, f0, f1, left)	\
110478b8fecSSam Ravnborg	subcc			%left, 8, %left;	\
111478b8fecSSam Ravnborg	bl,pn			%xcc, 95f;		\
112*6f1d827fSDavid S. Miller	 fsrc2			%f0, %f1;
113478b8fecSSam Ravnborg
114478b8fecSSam Ravnborg#define UNEVEN_VISCHUNK(dest, f0, f1, left)		\
115478b8fecSSam Ravnborg	UNEVEN_VISCHUNK_LAST(dest, f0, f1, left)	\
116478b8fecSSam Ravnborg	ba,a,pt			%xcc, 93f;
117478b8fecSSam Ravnborg
118478b8fecSSam Ravnborg	.register	%g2,#scratch
119478b8fecSSam Ravnborg	.register	%g3,#scratch
120478b8fecSSam Ravnborg
121478b8fecSSam Ravnborg	.text
122478b8fecSSam Ravnborg	.align		64
123478b8fecSSam Ravnborg
124478b8fecSSam Ravnborg	.globl		FUNC_NAME
125478b8fecSSam Ravnborg	.type		FUNC_NAME,#function
/*
 * FUNC_NAME(dst, src, len): copy len bytes from src to dst.
 *
 * In:   %o0 = dst, %o1 = src, %o2 = len
 * Out:  %o0 = EX_RETVAL(original dst), kept in %o4 for the whole routine
 *
 * Paths chosen by length:
 *   len == 0            -> return at label 85
 *   len <  16           -> label 80 (word or byte loop)
 *   16 <= len < 5 * 64  -> label 70 (integer-register copy)
 *   otherwise           -> VIS block copy below: align dst to 64 bytes,
 *                          then one of eight unrolled loops chosen by
 *                          the source's doubleword offset within its
 *                          64-byte block.
 */
126478b8fecSSam RavnborgFUNC_NAME:		/* %o0=dst, %o1=src, %o2=len */
	/* Software trap 5 if len has any bit at position 31 or above set. */
127478b8fecSSam Ravnborg	srlx		%o2, 31, %g2
128478b8fecSSam Ravnborg	cmp		%g2, 0
129478b8fecSSam Ravnborg	tne		%xcc, 5
130478b8fecSSam Ravnborg	PREAMBLE
	/* Remember dst for the return value. */
131478b8fecSSam Ravnborg	mov		%o0, %o4
	/* len == 0: return.  Delay slot: %o3 = dst | src. */
132478b8fecSSam Ravnborg	cmp		%o2, 0
133478b8fecSSam Ravnborg	be,pn		%XCC, 85f
134478b8fecSSam Ravnborg	 or		%o0, %o1, %o3
	/* len < 16: small path.  The delay slot is annulled unless the
	 * branch is taken, so len is folded into %o3 only for label 80.
	 */
135478b8fecSSam Ravnborg	cmp		%o2, 16
136478b8fecSSam Ravnborg	blu,a,pn	%XCC, 80f
137478b8fecSSam Ravnborg	 or		%o3, %o2, %o3
138478b8fecSSam Ravnborg
	/* len < 5 * 64: medium path.  The delay-slot andcc leaves the
	 * condition codes that label 70 tests: non-zero means dst | src
	 * is not 8-byte aligned.
	 */
139478b8fecSSam Ravnborg	cmp		%o2, (5 * 64)
140478b8fecSSam Ravnborg	blu,pt		%XCC, 70f
141478b8fecSSam Ravnborg	 andcc		%o3, 0x7, %g0
142478b8fecSSam Ravnborg
143478b8fecSSam Ravnborg	/* Clobbers o5/g1/g2/g3/g7/icc/xcc.  */
144478b8fecSSam Ravnborg	VISEntry
145478b8fecSSam Ravnborg
146478b8fecSSam Ravnborg	/* Is 'dst' already aligned on a 64-byte boundary? */
147478b8fecSSam Ravnborg	andcc		%o0, 0x3f, %g2
148478b8fecSSam Ravnborg	be,pt		%XCC, 2f
149478b8fecSSam Ravnborg
150478b8fecSSam Ravnborg	/* Compute abs((dst & 0x3f) - 0x40) into %g2.  This is the number
151478b8fecSSam Ravnborg	 * of bytes to copy to make 'dst' 64-byte aligned.  We pre-
152478b8fecSSam Ravnborg	 * subtract this from 'len'.
153478b8fecSSam Ravnborg	 */
	/* The next instruction is the delay slot of the branch above and
	 * runs on both paths: GLOBAL_SPARE = dst - src.
	 */
154478b8fecSSam Ravnborg	 sub		%o0, %o1, %GLOBAL_SPARE
155478b8fecSSam Ravnborg	sub		%g2, 0x40, %g2
156478b8fecSSam Ravnborg	sub		%g0, %g2, %g2
157478b8fecSSam Ravnborg	sub		%o2, %g2, %o2
	/* %g1 = alignment bytes to copy one at a time (count mod 8);
	 * %g2 keeps the multiple-of-8 remainder for the doubleword loop.
	 */
158478b8fecSSam Ravnborg	andcc		%g2, 0x7, %g1
159478b8fecSSam Ravnborg	be,pt		%icc, 2f
160478b8fecSSam Ravnborg	 and		%g2, 0x38, %g2
161478b8fecSSam Ravnborg
	/* Byte loop: the store address src + (dst - src) follows dst
	 * without a separate dst increment.
	 */
162478b8fecSSam Ravnborg1:	subcc		%g1, 0x1, %g1
163478b8fecSSam Ravnborg	EX_LD(LOAD(ldub, %o1 + 0x00, %o3))
164478b8fecSSam Ravnborg	EX_ST(STORE(stb, %o3, %o1 + %GLOBAL_SPARE))
165478b8fecSSam Ravnborg	bgu,pt		%XCC, 1b
166478b8fecSSam Ravnborg	 add		%o1, 0x1, %o1
167478b8fecSSam Ravnborg
	/* Rebuild the advanced dst pointer from the advanced src. */
168478b8fecSSam Ravnborg	add		%o1, %GLOBAL_SPARE, %o0
169478b8fecSSam Ravnborg
	/* Doubleword alignment copy of the remaining %g2 bytes (skipped
	 * when %g2 is zero).  %g1 keeps src & 7.  alignaddr, as defined
	 * by VIS, rounds %o1 down to 8 bytes and records the byte offset
	 * for the faligndata instructions that follow.
	 */
170478b8fecSSam Ravnborg2:	cmp		%g2, 0x0
171478b8fecSSam Ravnborg	and		%o1, 0x7, %g1
172478b8fecSSam Ravnborg	be,pt		%icc, 3f
173478b8fecSSam Ravnborg	 alignaddr	%o1, %g0, %o1
174478b8fecSSam Ravnborg
	/* The loop body is unrolled twice so that %f4 and %f6 alternate
	 * as "previous" and "next" source doubleword, one load per store.
	 */
175478b8fecSSam Ravnborg	EX_LD(LOAD(ldd, %o1, %f4))
176478b8fecSSam Ravnborg1:	EX_LD(LOAD(ldd, %o1 + 0x8, %f6))
177478b8fecSSam Ravnborg	add		%o1, 0x8, %o1
178478b8fecSSam Ravnborg	subcc		%g2, 0x8, %g2
179478b8fecSSam Ravnborg	faligndata	%f4, %f6, %f0
180478b8fecSSam Ravnborg	EX_ST(STORE(std, %f0, %o0))
181478b8fecSSam Ravnborg	be,pn		%icc, 3f
182478b8fecSSam Ravnborg	 add		%o0, 0x8, %o0
183478b8fecSSam Ravnborg
184478b8fecSSam Ravnborg	EX_LD(LOAD(ldd, %o1 + 0x8, %f4))
185478b8fecSSam Ravnborg	add		%o1, 0x8, %o1
186478b8fecSSam Ravnborg	subcc		%g2, 0x8, %g2
187478b8fecSSam Ravnborg	faligndata	%f6, %f4, %f0
188478b8fecSSam Ravnborg	EX_ST(STORE(std, %f0, %o0))
189478b8fecSSam Ravnborg	bne,pt		%icc, 1b
190478b8fecSSam Ravnborg	 add		%o0, 0x8, %o0
191478b8fecSSam Ravnborg
192478b8fecSSam Ravnborg	/* Destination is 64-byte aligned.  */
193478b8fecSSam Ravnborg3:
194478b8fecSSam Ravnborg	membar		  #LoadStore | #StoreStore | #StoreLoad
195478b8fecSSam Ravnborg
	/* Split the remaining length three ways:
	 *   GLOBAL_SPARE = (len - 0x40) rounded down to a multiple of
	 *                  0x40; counted down by 0x40 per block step
	 *   %g3          = ((len - GLOBAL_SPARE) & ~7) - 8; counted down
	 *                  by 8 in the doubleword tail
	 *   %o2          = len - GLOBAL_SPARE - %g3; bytes for the final
	 *                  byte loop at label 95
	 * Also:
	 *   %g1 = unaligned src + GLOBAL_SPARE + %g3, the source address
	 *         loaded into %o1 at label 95
	 *   %g2 = (src >> 3) & 7, the doubleword index of src within its
	 *         64-byte block, which selects the loop instance
	 *   %o1 = src rounded down to 64 bytes
	 */
196478b8fecSSam Ravnborg	subcc		%o2, 0x40, %GLOBAL_SPARE
197478b8fecSSam Ravnborg	add		%o1, %g1, %g1
198478b8fecSSam Ravnborg	andncc		%GLOBAL_SPARE, (0x40 - 1), %GLOBAL_SPARE
199478b8fecSSam Ravnborg	srl		%g1, 3, %g2
200478b8fecSSam Ravnborg	sub		%o2, %GLOBAL_SPARE, %g3
201478b8fecSSam Ravnborg	andn		%o1, (0x40 - 1), %o1
202478b8fecSSam Ravnborg	and		%g2, 7, %g2
203478b8fecSSam Ravnborg	andncc		%g3, 0x7, %g3
	/* NOTE(review): FP register move whose destination is overwritten
	 * by the block load into %f0 below; presumably kept for a side
	 * effect -- TODO confirm.
	 */
204*6f1d827fSDavid S. Miller	fsrc2		%f0, %f2
205478b8fecSSam Ravnborg	sub		%g3, 0x8, %g3
206478b8fecSSam Ravnborg	sub		%o2, %GLOBAL_SPARE, %o2
207478b8fecSSam Ravnborg
208478b8fecSSam Ravnborg	add		%g1, %GLOBAL_SPARE, %g1
209478b8fecSSam Ravnborg	subcc		%o2, %g3, %o2
210478b8fecSSam Ravnborg
	/* Prime the pipeline: load three consecutive 64-byte source blocks
	 * into the register groups starting at %f0, %f16 and %f32.
	 * GLOBAL_SPARE drops by 0x80 here; each unrolled-loop instance
	 * ends with two block stores that perform no further load.
	 */
211478b8fecSSam Ravnborg	EX_LD(LOAD_BLK(%o1, %f0))
212478b8fecSSam Ravnborg	add		%o1, 0x40, %o1
213478b8fecSSam Ravnborg	add		%g1, %g3, %g1
214478b8fecSSam Ravnborg	EX_LD(LOAD_BLK(%o1, %f16))
215478b8fecSSam Ravnborg	add		%o1, 0x40, %o1
216478b8fecSSam Ravnborg	sub		%GLOBAL_SPARE, 0x80, %GLOBAL_SPARE
217478b8fecSSam Ravnborg	EX_LD(LOAD_BLK(%o1, %f32))
218478b8fecSSam Ravnborg	add		%o1, 0x40, %o1
219478b8fecSSam Ravnborg
220478b8fecSSam Ravnborg	/* There are 8 instances of the unrolled loop,
221478b8fecSSam Ravnborg	 * one for each possible alignment of the
222478b8fecSSam Ravnborg	 * source buffer.  Each loop instance is 452
223478b8fecSSam Ravnborg	 * bytes.
224478b8fecSSam Ravnborg	 */
	/* %g2 = index * 452, built as (((index * 8 - index) * 16) + index) * 4. */
225478b8fecSSam Ravnborg	sll		%g2, 3, %o3
226478b8fecSSam Ravnborg	sub		%o3, %g2, %o3
227478b8fecSSam Ravnborg	sllx		%o3, 4, %o3
228478b8fecSSam Ravnborg	add		%o3, %g2, %o3
229478b8fecSSam Ravnborg	sllx		%o3, 2, %g2
	/* %o3 = address of the rd instruction itself; adding the distance
	 * to the next label 1 gives the first loop instance, and %g2
	 * selects the instance to jump to.
	 */
230478b8fecSSam Ravnborg1:	rd		%pc, %o3
231478b8fecSSam Ravnborg	add		%o3, %lo(1f - 1b), %o3
232478b8fecSSam Ravnborg	jmpl		%o3 + %g2, %g0
233478b8fecSSam Ravnborg	 nop
234478b8fecSSam Ravnborg
/*
 * Eight unrolled VIS copy loops, entered through the computed jump that
 * precedes this point.  Instance k (k = 0..7) starts its register
 * rotation at %f(2k).  The jump arithmetic depends on every instance
 * having exactly the same size, so no instruction may be added or
 * removed here.
 *
 * Each instance consists of:
 *   - the loop body: three FREG_FROB / LOOP_CHUNKn pairs.  FREG_FROB
 *     builds 0x40 output bytes in %f48-%f62; LOOP_CHUNKn loads the next
 *     source block into the %f0, %f16 or %f32 group, stores %f48-%f62,
 *     and leaves through 1f / 2f / 3f once GLOBAL_SPARE reaches zero;
 *   - "ba 1b+4", which re-enters the loop one instruction past its
 *     start; the delay slot performs the faligndata that was skipped;
 *   - three drain sequences (local labels 1, 2, 3): two more
 *     FREG_FROB + block-store steps without loads, then a jump to one
 *     of the tail entries 40..63.
 */
235478b8fecSSam Ravnborg	.align		64
	/* Instance 0: rotation starts at %f0. */
236478b8fecSSam Ravnborg1:	FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16)
237478b8fecSSam Ravnborg	LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
238478b8fecSSam Ravnborg	FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32)
239478b8fecSSam Ravnborg	LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f)
240478b8fecSSam Ravnborg	FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0)
241478b8fecSSam Ravnborg	LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f)
242478b8fecSSam Ravnborg	ba,pt		%xcc, 1b+4
243478b8fecSSam Ravnborg	 faligndata	%f0, %f2, %f48
244478b8fecSSam Ravnborg1:	FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32)
245478b8fecSSam Ravnborg	STORE_SYNC(o0, f48)
246478b8fecSSam Ravnborg	FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0)
247478b8fecSSam Ravnborg	STORE_JUMP(o0, f48, 40f)
248478b8fecSSam Ravnborg2:	FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0)
249478b8fecSSam Ravnborg	STORE_SYNC(o0, f48)
250478b8fecSSam Ravnborg	FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16)
251478b8fecSSam Ravnborg	STORE_JUMP(o0, f48, 48f)
252478b8fecSSam Ravnborg3:	FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16)
253478b8fecSSam Ravnborg	STORE_SYNC(o0, f48)
254478b8fecSSam Ravnborg	FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32)
255478b8fecSSam Ravnborg	STORE_JUMP(o0, f48, 56f)
256478b8fecSSam Ravnborg
	/* Instance 1: rotation starts at %f2. */
257478b8fecSSam Ravnborg1:	FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18)
258478b8fecSSam Ravnborg	LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
259478b8fecSSam Ravnborg	FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34)
260478b8fecSSam Ravnborg	LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f)
261478b8fecSSam Ravnborg	FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2)
262478b8fecSSam Ravnborg	LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f)
263478b8fecSSam Ravnborg	ba,pt		%xcc, 1b+4
264478b8fecSSam Ravnborg	 faligndata	%f2, %f4, %f48
265478b8fecSSam Ravnborg1:	FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34)
266478b8fecSSam Ravnborg	STORE_SYNC(o0, f48)
267478b8fecSSam Ravnborg	FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2)
268478b8fecSSam Ravnborg	STORE_JUMP(o0, f48, 41f)
269478b8fecSSam Ravnborg2:	FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2)
270478b8fecSSam Ravnborg	STORE_SYNC(o0, f48)
271478b8fecSSam Ravnborg	FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18)
272478b8fecSSam Ravnborg	STORE_JUMP(o0, f48, 49f)
273478b8fecSSam Ravnborg3:	FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18)
274478b8fecSSam Ravnborg	STORE_SYNC(o0, f48)
275478b8fecSSam Ravnborg	FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34)
276478b8fecSSam Ravnborg	STORE_JUMP(o0, f48, 57f)
277478b8fecSSam Ravnborg
	/* Instance 2: rotation starts at %f4. */
278478b8fecSSam Ravnborg1:	FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20)
279478b8fecSSam Ravnborg	LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
280478b8fecSSam Ravnborg	FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36)
281478b8fecSSam Ravnborg	LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f)
282478b8fecSSam Ravnborg	FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4)
283478b8fecSSam Ravnborg	LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f)
284478b8fecSSam Ravnborg	ba,pt		%xcc, 1b+4
285478b8fecSSam Ravnborg	 faligndata	%f4, %f6, %f48
286478b8fecSSam Ravnborg1:	FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36)
287478b8fecSSam Ravnborg	STORE_SYNC(o0, f48)
288478b8fecSSam Ravnborg	FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4)
289478b8fecSSam Ravnborg	STORE_JUMP(o0, f48, 42f)
290478b8fecSSam Ravnborg2:	FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4)
291478b8fecSSam Ravnborg	STORE_SYNC(o0, f48)
292478b8fecSSam Ravnborg	FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20)
293478b8fecSSam Ravnborg	STORE_JUMP(o0, f48, 50f)
294478b8fecSSam Ravnborg3:	FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20)
295478b8fecSSam Ravnborg	STORE_SYNC(o0, f48)
296478b8fecSSam Ravnborg	FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36)
297478b8fecSSam Ravnborg	STORE_JUMP(o0, f48, 58f)
298478b8fecSSam Ravnborg
	/* Instance 3: rotation starts at %f6. */
299478b8fecSSam Ravnborg1:	FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22)
300478b8fecSSam Ravnborg	LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
301478b8fecSSam Ravnborg	FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38)
302478b8fecSSam Ravnborg	LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f)
303478b8fecSSam Ravnborg	FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6)
304478b8fecSSam Ravnborg	LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f)
305478b8fecSSam Ravnborg	ba,pt		%xcc, 1b+4
306478b8fecSSam Ravnborg	 faligndata	%f6, %f8, %f48
307478b8fecSSam Ravnborg1:	FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38)
308478b8fecSSam Ravnborg	STORE_SYNC(o0, f48)
309478b8fecSSam Ravnborg	FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6)
310478b8fecSSam Ravnborg	STORE_JUMP(o0, f48, 43f)
311478b8fecSSam Ravnborg2:	FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6)
312478b8fecSSam Ravnborg	STORE_SYNC(o0, f48)
313478b8fecSSam Ravnborg	FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22)
314478b8fecSSam Ravnborg	STORE_JUMP(o0, f48, 51f)
315478b8fecSSam Ravnborg3:	FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22)
316478b8fecSSam Ravnborg	STORE_SYNC(o0, f48)
317478b8fecSSam Ravnborg	FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38)
318478b8fecSSam Ravnborg	STORE_JUMP(o0, f48, 59f)
319478b8fecSSam Ravnborg
	/* Instance 4: rotation starts at %f8. */
320478b8fecSSam Ravnborg1:	FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24)
321478b8fecSSam Ravnborg	LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
322478b8fecSSam Ravnborg	FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40)
323478b8fecSSam Ravnborg	LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f)
324478b8fecSSam Ravnborg	FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8)
325478b8fecSSam Ravnborg	LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f)
326478b8fecSSam Ravnborg	ba,pt		%xcc, 1b+4
327478b8fecSSam Ravnborg	 faligndata	%f8, %f10, %f48
328478b8fecSSam Ravnborg1:	FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40)
329478b8fecSSam Ravnborg	STORE_SYNC(o0, f48)
330478b8fecSSam Ravnborg	FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8)
331478b8fecSSam Ravnborg	STORE_JUMP(o0, f48, 44f)
332478b8fecSSam Ravnborg2:	FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8)
333478b8fecSSam Ravnborg	STORE_SYNC(o0, f48)
334478b8fecSSam Ravnborg	FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24)
335478b8fecSSam Ravnborg	STORE_JUMP(o0, f48, 52f)
336478b8fecSSam Ravnborg3:	FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24)
337478b8fecSSam Ravnborg	STORE_SYNC(o0, f48)
338478b8fecSSam Ravnborg	FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40)
339478b8fecSSam Ravnborg	STORE_JUMP(o0, f48, 60f)
340478b8fecSSam Ravnborg
	/* Instance 5: rotation starts at %f10. */
341478b8fecSSam Ravnborg1:	FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26)
342478b8fecSSam Ravnborg	LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
343478b8fecSSam Ravnborg	FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42)
344478b8fecSSam Ravnborg	LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f)
345478b8fecSSam Ravnborg	FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10)
346478b8fecSSam Ravnborg	LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f)
347478b8fecSSam Ravnborg	ba,pt		%xcc, 1b+4
348478b8fecSSam Ravnborg	 faligndata	%f10, %f12, %f48
349478b8fecSSam Ravnborg1:	FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42)
350478b8fecSSam Ravnborg	STORE_SYNC(o0, f48)
351478b8fecSSam Ravnborg	FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10)
352478b8fecSSam Ravnborg	STORE_JUMP(o0, f48, 45f)
353478b8fecSSam Ravnborg2:	FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10)
354478b8fecSSam Ravnborg	STORE_SYNC(o0, f48)
355478b8fecSSam Ravnborg	FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26)
356478b8fecSSam Ravnborg	STORE_JUMP(o0, f48, 53f)
357478b8fecSSam Ravnborg3:	FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26)
358478b8fecSSam Ravnborg	STORE_SYNC(o0, f48)
359478b8fecSSam Ravnborg	FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42)
360478b8fecSSam Ravnborg	STORE_JUMP(o0, f48, 61f)
361478b8fecSSam Ravnborg
	/* Instance 6: rotation starts at %f12. */
362478b8fecSSam Ravnborg1:	FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28)
363478b8fecSSam Ravnborg	LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
364478b8fecSSam Ravnborg	FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44)
365478b8fecSSam Ravnborg	LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f)
366478b8fecSSam Ravnborg	FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12)
367478b8fecSSam Ravnborg	LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f)
368478b8fecSSam Ravnborg	ba,pt		%xcc, 1b+4
369478b8fecSSam Ravnborg	 faligndata	%f12, %f14, %f48
370478b8fecSSam Ravnborg1:	FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44)
371478b8fecSSam Ravnborg	STORE_SYNC(o0, f48)
372478b8fecSSam Ravnborg	FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12)
373478b8fecSSam Ravnborg	STORE_JUMP(o0, f48, 46f)
374478b8fecSSam Ravnborg2:	FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12)
375478b8fecSSam Ravnborg	STORE_SYNC(o0, f48)
376478b8fecSSam Ravnborg	FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28)
377478b8fecSSam Ravnborg	STORE_JUMP(o0, f48, 54f)
378478b8fecSSam Ravnborg3:	FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28)
379478b8fecSSam Ravnborg	STORE_SYNC(o0, f48)
380478b8fecSSam Ravnborg	FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44)
381478b8fecSSam Ravnborg	STORE_JUMP(o0, f48, 62f)
382478b8fecSSam Ravnborg
	/* Instance 7: rotation starts at %f14. */
383478b8fecSSam Ravnborg1:	FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30)
384478b8fecSSam Ravnborg	LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
385478b8fecSSam Ravnborg	FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46)
386478b8fecSSam Ravnborg	LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f)
387478b8fecSSam Ravnborg	FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14)
388478b8fecSSam Ravnborg	LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f)
389478b8fecSSam Ravnborg	ba,pt		%xcc, 1b+4
390478b8fecSSam Ravnborg	 faligndata	%f14, %f16, %f48
391478b8fecSSam Ravnborg1:	FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46)
392478b8fecSSam Ravnborg	STORE_SYNC(o0, f48)
393478b8fecSSam Ravnborg	FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14)
394478b8fecSSam Ravnborg	STORE_JUMP(o0, f48, 47f)
395478b8fecSSam Ravnborg2:	FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14)
396478b8fecSSam Ravnborg	STORE_SYNC(o0, f48)
397478b8fecSSam Ravnborg	FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30)
398478b8fecSSam Ravnborg	STORE_JUMP(o0, f48, 55f)
399478b8fecSSam Ravnborg3:	FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30)
400478b8fecSSam Ravnborg	STORE_SYNC(o0, f48)
401478b8fecSSam Ravnborg	FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46)
402478b8fecSSam Ravnborg	STORE_JUMP(o0, f48, 63f)
403478b8fecSSam Ravnborg
/*
 * Tail of the VIS path.  The drain sequences of the unrolled loops jump
 * to one of the entries 40..63, named after the first FP register that
 * still holds unconsumed source data.  Execution falls through
 * successive FINISH_VISCHUNK entries, each emitting 8 bytes while %g3
 * allows.  The last entry of each register group (47, 55, 63) has no
 * successor register already loaded: it moves its doubleword into %f0
 * and continues in the load-and-align loop at 93.  Any entry branches
 * to 95 once %g3 goes negative.
 */
404478b8fecSSam Ravnborg40:	FINISH_VISCHUNK(o0, f0,  f2,  g3)
405478b8fecSSam Ravnborg41:	FINISH_VISCHUNK(o0, f2,  f4,  g3)
406478b8fecSSam Ravnborg42:	FINISH_VISCHUNK(o0, f4,  f6,  g3)
407478b8fecSSam Ravnborg43:	FINISH_VISCHUNK(o0, f6,  f8,  g3)
408478b8fecSSam Ravnborg44:	FINISH_VISCHUNK(o0, f8,  f10, g3)
409478b8fecSSam Ravnborg45:	FINISH_VISCHUNK(o0, f10, f12, g3)
410478b8fecSSam Ravnborg46:	FINISH_VISCHUNK(o0, f12, f14, g3)
411478b8fecSSam Ravnborg47:	UNEVEN_VISCHUNK(o0, f14, f0,  g3)
412478b8fecSSam Ravnborg48:	FINISH_VISCHUNK(o0, f16, f18, g3)
413478b8fecSSam Ravnborg49:	FINISH_VISCHUNK(o0, f18, f20, g3)
414478b8fecSSam Ravnborg50:	FINISH_VISCHUNK(o0, f20, f22, g3)
415478b8fecSSam Ravnborg51:	FINISH_VISCHUNK(o0, f22, f24, g3)
416478b8fecSSam Ravnborg52:	FINISH_VISCHUNK(o0, f24, f26, g3)
417478b8fecSSam Ravnborg53:	FINISH_VISCHUNK(o0, f26, f28, g3)
418478b8fecSSam Ravnborg54:	FINISH_VISCHUNK(o0, f28, f30, g3)
419478b8fecSSam Ravnborg55:	UNEVEN_VISCHUNK(o0, f30, f0,  g3)
420478b8fecSSam Ravnborg56:	FINISH_VISCHUNK(o0, f32, f34, g3)
421478b8fecSSam Ravnborg57:	FINISH_VISCHUNK(o0, f34, f36, g3)
422478b8fecSSam Ravnborg58:	FINISH_VISCHUNK(o0, f36, f38, g3)
423478b8fecSSam Ravnborg59:	FINISH_VISCHUNK(o0, f38, f40, g3)
424478b8fecSSam Ravnborg60:	FINISH_VISCHUNK(o0, f40, f42, g3)
425478b8fecSSam Ravnborg61:	FINISH_VISCHUNK(o0, f42, f44, g3)
426478b8fecSSam Ravnborg62:	FINISH_VISCHUNK(o0, f44, f46, g3)
427478b8fecSSam Ravnborg63:	UNEVEN_VISCHUNK_LAST(o0, f46, f0,  g3)
428478b8fecSSam Ravnborg
	/* Load-and-align loop, unrolled twice: %f0 and %f2 alternate as
	 * the previous / newly loaded source doubleword.  Each half loads
	 * 8 bytes from %o1, stores 8 aligned bytes to %o0 and subtracts 8
	 * from %g3; the loop ends when %g3 goes negative.
	 */
429478b8fecSSam Ravnborg93:	EX_LD(LOAD(ldd, %o1, %f2))
430478b8fecSSam Ravnborg	add		%o1, 8, %o1
431478b8fecSSam Ravnborg	subcc		%g3, 8, %g3
432478b8fecSSam Ravnborg	faligndata	%f0, %f2, %f8
433478b8fecSSam Ravnborg	EX_ST(STORE(std, %f8, %o0))
434478b8fecSSam Ravnborg	bl,pn		%xcc, 95f
435478b8fecSSam Ravnborg	 add		%o0, 8, %o0
436478b8fecSSam Ravnborg	EX_LD(LOAD(ldd, %o1, %f0))
437478b8fecSSam Ravnborg	add		%o1, 8, %o1
438478b8fecSSam Ravnborg	subcc		%g3, 8, %g3
439478b8fecSSam Ravnborg	faligndata	%f2, %f0, %f8
440478b8fecSSam Ravnborg	EX_ST(STORE(std, %f8, %o0))
441478b8fecSSam Ravnborg	bge,pt		%xcc, 93b
442478b8fecSSam Ravnborg	 add		%o0, 8, %o0
443478b8fecSSam Ravnborg
	/* Trailing bytes.  The delay slot (executed on both paths)
	 * replaces the block-aligned source pointer with the byte-exact
	 * address kept in %g1.  Skip the loop when %o2 is zero.
	 */
444478b8fecSSam Ravnborg95:	brz,pt		%o2, 2f
445478b8fecSSam Ravnborg	 mov		%g1, %o1
446478b8fecSSam Ravnborg
	/* Copy %o2 bytes one at a time. */
447478b8fecSSam Ravnborg1:	EX_LD(LOAD(ldub, %o1, %o3))
448478b8fecSSam Ravnborg	add		%o1, 1, %o1
449478b8fecSSam Ravnborg	subcc		%o2, 1, %o2
450478b8fecSSam Ravnborg	EX_ST(STORE(stb, %o3, %o0))
451478b8fecSSam Ravnborg	bne,pt		%xcc, 1b
452478b8fecSSam Ravnborg	 add		%o0, 1, %o0
453478b8fecSSam Ravnborg
	/* Order the stores, leave VIS mode, and return the original dst
	 * (saved in %o4 at entry) from the retl delay slot.
	 */
454478b8fecSSam Ravnborg2:	membar		#StoreLoad | #StoreStore
455478b8fecSSam Ravnborg	VISExit
456478b8fecSSam Ravnborg	retl
457478b8fecSSam Ravnborg	 mov		EX_RETVAL(%o4), %o0
458478b8fecSSam Ravnborg
/*
 * Medium-length path (integer registers only, no VIS).
 *
 * On entry the condition codes still come from "andcc %o3, 0x7" in the
 * delay slot of the branch that led here, with %o3 = dst | src:
 *   zero     -> both pointers 8-byte aligned: 16-byte loop at 72
 *   non-zero -> label 75: byte-align dst first, then either rejoin
 *               72 / 73 (src aligned too) or use the shift-and-merge
 *               loop at 8 (src not aligned).
 * Except inside the loop at 8, %o3 holds dst - src, so [%o1 + %o3]
 * addresses the destination byte matching source pointer %o1.
 */
459478b8fecSSam Ravnborg	.align		64
460478b8fecSSam Ravnborg70:	/* 16 <= len < (5 * 64) */
461478b8fecSSam Ravnborg	bne,pn		%XCC, 75f
462478b8fecSSam Ravnborg	 sub		%o0, %o1, %o3
463478b8fecSSam Ravnborg
	/* GLOBAL_SPARE = len rounded down to 16 (bytes for the loop);
	 * %o2 = len & 0xf (left for the tail at 73).
	 */
464478b8fecSSam Ravnborg72:	andn		%o2, 0xf, %GLOBAL_SPARE
465478b8fecSSam Ravnborg	and		%o2, 0xf, %o2
	/* 16 bytes per iteration as two 8-byte load/store pairs. */
466478b8fecSSam Ravnborg1:	EX_LD(LOAD(ldx, %o1 + 0x00, %o5))
467478b8fecSSam Ravnborg	EX_LD(LOAD(ldx, %o1 + 0x08, %g1))
468478b8fecSSam Ravnborg	subcc		%GLOBAL_SPARE, 0x10, %GLOBAL_SPARE
469478b8fecSSam Ravnborg	EX_ST(STORE(stx, %o5, %o1 + %o3))
470478b8fecSSam Ravnborg	add		%o1, 0x8, %o1
471478b8fecSSam Ravnborg	EX_ST(STORE(stx, %g1, %o1 + %o3))
472478b8fecSSam Ravnborg	bgu,pt		%XCC, 1b
473478b8fecSSam Ravnborg	 add		%o1, 0x8, %o1
	/* Tail: one 8-byte copy if bit 3 of %o2 is set ... */
474478b8fecSSam Ravnborg73:	andcc		%o2, 0x8, %g0
475478b8fecSSam Ravnborg	be,pt		%XCC, 1f
476478b8fecSSam Ravnborg	 nop
477478b8fecSSam Ravnborg	EX_LD(LOAD(ldx, %o1, %o5))
478478b8fecSSam Ravnborg	sub		%o2, 0x8, %o2
479478b8fecSSam Ravnborg	EX_ST(STORE(stx, %o5, %o1 + %o3))
480478b8fecSSam Ravnborg	add		%o1, 0x8, %o1
	/* ... one 4-byte copy if bit 2 is set ... */
481478b8fecSSam Ravnborg1:	andcc		%o2, 0x4, %g0
482478b8fecSSam Ravnborg	be,pt		%XCC, 1f
483478b8fecSSam Ravnborg	 nop
484478b8fecSSam Ravnborg	EX_LD(LOAD(lduw, %o1, %o5))
485478b8fecSSam Ravnborg	sub		%o2, 0x4, %o2
486478b8fecSSam Ravnborg	EX_ST(STORE(stw, %o5, %o1 + %o3))
487478b8fecSSam Ravnborg	add		%o1, 0x4, %o1
	/* ... and anything still left goes to the byte loop at 90. */
488478b8fecSSam Ravnborg1:	cmp		%o2, 0
489478b8fecSSam Ravnborg	be,pt		%XCC, 85f
490478b8fecSSam Ravnborg	 nop
491478b8fecSSam Ravnborg	ba,pt		%xcc, 90f
492478b8fecSSam Ravnborg	 nop
493478b8fecSSam Ravnborg
	/* dst | src is not 8-byte aligned.  The branch tests the flags of
	 * the andcc (the plain sub in between does not set them): if dst
	 * itself is already aligned, skip the byte loop.  Otherwise
	 * %g1 = 8 - (dst & 7) bytes are copied singly and taken off len.
	 */
494478b8fecSSam Ravnborg75:	andcc		%o0, 0x7, %g1
495478b8fecSSam Ravnborg	sub		%g1, 0x8, %g1
496478b8fecSSam Ravnborg	be,pn		%icc, 2f
497478b8fecSSam Ravnborg	 sub		%g0, %g1, %g1
498478b8fecSSam Ravnborg	sub		%o2, %g1, %o2
499478b8fecSSam Ravnborg
500478b8fecSSam Ravnborg1:	EX_LD(LOAD(ldub, %o1, %o5))
501478b8fecSSam Ravnborg	subcc		%g1, 1, %g1
502478b8fecSSam Ravnborg	EX_ST(STORE(stb, %o5, %o1 + %o3))
503478b8fecSSam Ravnborg	bgu,pt		%icc, 1b
504478b8fecSSam Ravnborg	 add		%o1, 1, %o1
505478b8fecSSam Ravnborg
	/* dst is now 8-byte aligned; rebuild it in %o0.  %g1 = src & 7;
	 * if non-zero, go to 8 with %g1 converted to a bit count in the
	 * delay slot.
	 */
506478b8fecSSam Ravnborg2:	add		%o1, %o3, %o0
507478b8fecSSam Ravnborg	andcc		%o1, 0x7, %g1
508478b8fecSSam Ravnborg	bne,pt		%icc, 8f
509478b8fecSSam Ravnborg	 sll		%g1, 3, %g1
510478b8fecSSam Ravnborg
	/* src is aligned as well: reuse the aligned code above, entering
	 * the 16-byte loop only if at least 16 bytes remain.
	 */
511478b8fecSSam Ravnborg	cmp		%o2, 16
512478b8fecSSam Ravnborg	bgeu,pt		%icc, 72b
513478b8fecSSam Ravnborg	 nop
514478b8fecSSam Ravnborg	ba,a,pt		%xcc, 73b
515478b8fecSSam Ravnborg
	/* Shift-and-merge copy for a misaligned source.
	 *   %g1 = left shift in bits, (src & 7) * 8
	 *   %o3 = right shift in bits, 64 - %g1
	 *   %g2 = previous source doubleword, already shifted left
	 *   GLOBAL_SPARE = remaining length rounded down to 8
	 * Each iteration loads the next aligned doubleword, merges its
	 * high part with %g2, and stores 8 bytes to the aligned dst.
	 */
516478b8fecSSam Ravnborg8:	mov		64, %o3
517478b8fecSSam Ravnborg	andn		%o1, 0x7, %o1
518478b8fecSSam Ravnborg	EX_LD(LOAD(ldx, %o1, %g2))
519478b8fecSSam Ravnborg	sub		%o3, %g1, %o3
520478b8fecSSam Ravnborg	andn		%o2, 0x7, %GLOBAL_SPARE
521478b8fecSSam Ravnborg	sllx		%g2, %g1, %g2
522478b8fecSSam Ravnborg1:	EX_LD(LOAD(ldx, %o1 + 0x8, %g3))
523478b8fecSSam Ravnborg	subcc		%GLOBAL_SPARE, 0x8, %GLOBAL_SPARE
524478b8fecSSam Ravnborg	add		%o1, 0x8, %o1
525478b8fecSSam Ravnborg	srlx		%g3, %o3, %o5
526478b8fecSSam Ravnborg	or		%o5, %g2, %o5
527478b8fecSSam Ravnborg	EX_ST(STORE(stx, %o5, %o0))
528478b8fecSSam Ravnborg	add		%o0, 0x8, %o0
529478b8fecSSam Ravnborg	bgu,pt		%icc, 1b
530478b8fecSSam Ravnborg	 sllx		%g3, %g1, %g2
531478b8fecSSam Ravnborg
	/* Turn %g1 back into a byte offset and restore the byte-exact src
	 * pointer.  Return if nothing is left; otherwise re-establish
	 * %o3 = dst - src (in the delay slot) for the byte loop at 90.
	 */
532478b8fecSSam Ravnborg	srl		%g1, 3, %g1
533478b8fecSSam Ravnborg	andcc		%o2, 0x7, %o2
534478b8fecSSam Ravnborg	be,pn		%icc, 85f
535478b8fecSSam Ravnborg	 add		%o1, %g1, %o1
536478b8fecSSam Ravnborg	ba,pt		%xcc, 90f
537478b8fecSSam Ravnborg	 sub		%o0, %o1, %o3
538478b8fecSSam Ravnborg
/*
 * Short-copy path and common exits.
 *
 * Label 80 is reached with %o3 = dst | src | len.  If its low two bits
 * are clear, everything is a multiple of 4 and the copy runs a word at
 * a time; otherwise it runs a byte at a time at label 90.  In both
 * loops %o3 is first replaced by dst - src so that [%o1 + %o3]
 * addresses the destination.
 */
539478b8fecSSam Ravnborg	.align		64
540478b8fecSSam Ravnborg80:	/* 0 < len < 16 */
541478b8fecSSam Ravnborg	andcc		%o3, 0x3, %g0
542478b8fecSSam Ravnborg	bne,pn		%XCC, 90f
543478b8fecSSam Ravnborg	 sub		%o0, %o1, %o3
544478b8fecSSam Ravnborg
	/* Word loop: 4 bytes per iteration until %o2 reaches zero. */
545478b8fecSSam Ravnborg1:	EX_LD(LOAD(lduw, %o1, %g1))
546478b8fecSSam Ravnborg	subcc		%o2, 4, %o2
547478b8fecSSam Ravnborg	EX_ST(STORE(stw, %g1, %o1 + %o3))
548478b8fecSSam Ravnborg	bgu,pt		%XCC, 1b
549478b8fecSSam Ravnborg	 add		%o1, 4, %o1
550478b8fecSSam Ravnborg
	/* Common return for the non-VIS paths: %o0 = original dst. */
551478b8fecSSam Ravnborg85:	retl
552478b8fecSSam Ravnborg	 mov		EX_RETVAL(%o4), %o0
553478b8fecSSam Ravnborg
554478b8fecSSam Ravnborg	.align		32
	/* Byte loop, also used as the tail of the medium path.  Expects
	 * %o2 > 0 and %o3 = dst - src on entry.
	 */
555478b8fecSSam Ravnborg90:	EX_LD(LOAD(ldub, %o1, %g1))
556478b8fecSSam Ravnborg	subcc		%o2, 1, %o2
557478b8fecSSam Ravnborg	EX_ST(STORE(stb, %g1, %o1 + %o3))
558478b8fecSSam Ravnborg	bgu,pt		%XCC, 90b
559478b8fecSSam Ravnborg	 add		%o1, 1, %o1
560478b8fecSSam Ravnborg	retl
561478b8fecSSam Ravnborg	 mov		EX_RETVAL(%o4), %o0
562478b8fecSSam Ravnborg
563478b8fecSSam Ravnborg	.size		FUNC_NAME, .-FUNC_NAME
564