xref: /illumos-gate/usr/src/lib/libmvec/common/vis/__vcos.S (revision cffcfaee1e6b29ef9ceb7d80e4e053ffd029906b)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
23 */
24/*
25 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
26 * Use is subject to license terms.
27 */
28
29	.file	"__vcos.S"
30
31#include "libm.h"
32
/*
 * Read-only table of 64-bit constants.  Each entry is written as two
 * 32-bit words, most significant word first, and is fetched with a
 * single ldd.  Entries are addressed by the byte offsets #defined right
 * after the table:
 *
 *   0x00 p4      0x08 q4       0x10 p3       0x18 q3
 *   0x20 p2      0x28 q2       0x30 p1       0x38 q1
 *   0x40 one     0x48 pp1      0x50 pp2      0x58 qq1
 *   0x60 qq2     0x68 invpio2  0x70 round    0x78 pio2_1
 *   0x80 pio2_2  0x88 pio2_3   0x90 pio2_3t  0x98 f30val
 *   0xa0 mask    0xa8 thresh   (the entry at 0xb0 has no name of its own)
 *
 * f30val is not a double: its two words (0x80000000 and 0x00004000) are
 * loaded as a pair into %f30/%f31 and used as separate 32-bit masks.
 */
33	RO_DATA
34	.align	64
35constants:
36	.word	0x3ec718e3,0xa6972785
37	.word	0x3ef9fd39,0x94293940
38	.word	0xbf2a019f,0x75ee4be1
39	.word	0xbf56c16b,0xba552569
40	.word	0x3f811111,0x1108c703
41	.word	0x3fa55555,0x554f5b35
42	.word	0xbfc55555,0x555554d0
43	.word	0xbfdfffff,0xffffff85
44	.word	0x3ff00000,0x00000000
45	.word	0xbfc55555,0x5551fc28
46	.word	0x3f811107,0x62eacc9d
47	.word	0xbfdfffff,0xffff6328
48	.word	0x3fa55551,0x5f7acf0c
49	.word	0x3fe45f30,0x6dc9c883
50	.word	0x43380000,0x00000000
51	.word	0x3ff921fb,0x54400000
52	.word	0x3dd0b461,0x1a600000
53	.word	0x3ba3198a,0x2e000000
54	.word	0x397b839a,0x252049c1
55	.word	0x80000000,0x00004000
56	.word	0xffff8000,0x00000000	! N.B.: low-order words used
57	.word	0x3fc90000,0x80000000	! for sign bit hacking; see
58	.word	0x3fc40000,0x00000000	! references to "thresh" below
59
/*
 * Byte offsets of the named entries in the `constants` table; every
 * entry is 8 bytes wide.  They are used as displacements in loads such
 * as "ldd [%g1+q1],C_q1".  Keep these in step with the order of the
 * .word pairs in the table.
 */
60#define p4		0x0
61#define q4		0x08
62#define p3		0x10
63#define q3		0x18
64#define p2		0x20
65#define q2		0x28
66#define p1		0x30
67#define q1		0x38
68#define one		0x40
69#define pp1		0x48
70#define pp2		0x50
71#define qq1		0x58
72#define qq2		0x60
73#define invpio2		0x68
74#define round		0x70
75#define pio2_1		0x78
76#define pio2_2		0x80
77#define pio2_3		0x88
78#define pio2_3t		0x90
79#define f30val		0x98
80#define mask		0xa0
81#define thresh		0xa8
82
/*
 * Frame-local scratch slots, written as offsets to be added to %fp
 * (for example [%fp+xsave]).  tmps is the number of bytes reserved for
 * them: it is subtracted together with SA(MINFRAME) by the `save` at
 * function entry, and it equals the distance of the lowest slot (y0_0)
 * below STACK_BIAS, so adding a slot below y0_0 requires growing tmps.
 */
83! local storage indices
84
85#define xsave		STACK_BIAS-0x8
86#define ysave		STACK_BIAS-0x10
87#define nsave		STACK_BIAS-0x14
88#define sxsave		STACK_BIAS-0x18
89#define sysave		STACK_BIAS-0x1c
90#define biguns		STACK_BIAS-0x20
91#define n2		STACK_BIAS-0x24
92#define n1		STACK_BIAS-0x28
93#define n0		STACK_BIAS-0x2c
94#define x2_1		STACK_BIAS-0x40
95#define x1_1		STACK_BIAS-0x50
96#define x0_1		STACK_BIAS-0x60
97#define y2_0		STACK_BIAS-0x70
98#define y1_0		STACK_BIAS-0x80
99#define y0_0		STACK_BIAS-0x90
100! sizeof temp storage - must be a multiple of 16 for V9
101#define tmps		0x90
102
/*
 * Names for %f0-%f29, split into three groups of ten (P0, P1, P2).
 * The register-use notes in the function body assign x0 to %f0, x1 to
 * %f10 and x2 to %f20, so each group carries one of the three arguments
 * handled per pass of the main loop.  Each alias encodes both its group
 * and its register number, so P1_f16 is simply %f16.
 */
103!--------------------------------------------------------------------
104! define pipes for easier reading
105
106#define P0_f0		%f0
107#define P0_f1		%f1
108#define P0_f2		%f2
109#define P0_f3		%f3
110#define P0_f4		%f4
111#define P0_f5		%f5
112#define P0_f6		%f6
113#define P0_f7		%f7
114#define P0_f8		%f8
115#define P0_f9		%f9
116
117#define P1_f10		%f10
118#define P1_f11		%f11
119#define P1_f12		%f12
120#define P1_f13		%f13
121#define P1_f14		%f14
122#define P1_f15		%f15
123#define P1_f16		%f16
124#define P1_f17		%f17
125#define P1_f18		%f18
126#define P1_f19		%f19
127
128#define P2_f20		%f20
129#define P2_f21		%f21
130#define P2_f22		%f22
131#define P2_f23		%f23
132#define P2_f24		%f24
133#define P2_f25		%f25
134#define P2_f26		%f26
135#define P2_f27		%f27
136#define P2_f28		%f28
137#define P2_f29		%f29
138
/*
 * Symbolic names for registers, with the values given to them by the
 * set-up code at function entry:
 *
 *   SC_HI, SC_LO    %l3/%l4; PIC_SET points them at
 *                   __vlibm_TBL_sincos_hi and __vlibm_TBL_sincos_lo
 *   C_q1..C_q4      loaded from the q1..q4 entries of `constants`
 *   C_ONE           loaded from `one`; C_ONE_LO is %f55, the odd
 *                   register of the %f54 pair
 *   MSK_SIGN        %i5, set to 0x80000000 with sethi
 *   MSK_BIT31/13    %f30/%f31, both filled by one ldd of `f30val`
 *   MSK_BITSHI17    loaded from `mask`
 *   C_pp1..C_qq2    loaded from pp1, pp2, qq1 and qq2
 *   C_signM         a second name for %i5 (same register as MSK_SIGN)
 *   LIM_l5..LIM_l7  %l5..%l7, set to 0x3fc40000, 0x3e400000 and
 *                   0x3fe921fb respectively
 *
 * Comments must stay on their own lines here: text following the value
 * on a #define line becomes part of the macro expansion.
 */
139! define __vlibm_TBL_sincos_hi & lo for easy reading
140
141#define SC_HI		%l3
142#define SC_LO		%l4
143
144! define constants for easy reading
145
146#define C_q1 %f46
147#define C_q2 %f48
148#define C_q3 %f50
149#define C_q4 %f52
150
151! one ( 1 ) uno eins echi un
152#define C_ONE		%f54
153#define C_ONE_LO	%f55
154
155! masks
156#define MSK_SIGN	%i5
157#define MSK_BIT31	%f30
158#define MSK_BIT13	%f31
159#define MSK_BITSHI17	%f44
160
161
162! constants for pp and qq
163#define C_pp1 %f56
164#define C_pp2 %f58
165#define C_qq1 %f60
166#define C_qq2 %f62
167
168! sign mask
169#define C_signM		%i5
170
171#define LIM_l5		%l5
172#define LIM_l6		%l6
173! when in pri range, using value as transition from poly to table.
174! for Medium range,change use of %l6 and use to keep track of biguns.
175#define LIM_l7		%l7
176
177!--------------------------------------------------------------------
178
179
180	ENTRY(__vcos)
181	save	%sp,-SA(MINFRAME)-tmps,%sp
182	PIC_SETUP(g5)
183	PIC_SET(g5,__vlibm_TBL_sincos_hi,l3)
184	PIC_SET(g5,__vlibm_TBL_sincos_lo,l4)
185	PIC_SET(g5,constants,o0)
186	mov	%o0,%g1
187	wr	%g0,0x82,%asi		! set %asi for non-faulting loads
188
189! ========== primary range ==========
190
191! register use
192
193! i0  n
194! i1  x
195! i2  stridex
196! i3  y
197! i4  stridey
198! i5  0x80000000
199
200! l0  hx0
201! l1  hx1
202! l2  hx2
203! l3  __vlibm_TBL_sincos_hi
204! l4  __vlibm_TBL_sincos_lo
205! l5  0x3fc40000
206! l6  0x3e400000
207! l7  0x3fe921fb
208
209! the following are 64-bit registers in both V8+ and V9
210
211! g1  scratch
212! g5
213
214! o0  py0
215! o1  py1
216! o2  py2
217! o3  oy0
218! o4  oy1
219! o5  oy2
220! o7  scratch
221
222! f0  x0
223! f2
224! f4
225! f6
226! f8  scratch for table base
227! f9  signbit0
228! f10 x1
229! f12
230! f14
231! f16
232! f18 scratch for table base
233! f19 signbit1
234! f20 x2
235! f22
236! f24
237! f26
238! f28 scratch for table base
239! f29 signbit2
240! f30 0x80000000
241! f31 0x4000
242! f32
243! f34
244! f36
245! f38
246! f40
247! f42
248! f44 0xffff800000000000
249! f46 q1
250! f48 q2
251! f50 q3
252! f52 q4
253! f54 one
254! f56 pp1
255! f58 pp2
256! f60 qq1
257! f62 qq2
258
259#ifdef __sparcv9
260	stx	%i1,[%fp+xsave]		! save arguments
261	stx	%i3,[%fp+ysave]
262#else
263	st	%i1,[%fp+xsave]		! save arguments
264	st	%i3,[%fp+ysave]
265#endif
266
267	st	%i0,[%fp+nsave]
268	st	%i2,[%fp+sxsave]
269	st	%i4,[%fp+sysave]
270	sethi	%hi(0x80000000),MSK_SIGN	! load/set up constants
271	sethi	%hi(0x3fc40000),LIM_l5
272	sethi	%hi(0x3e400000),LIM_l6
273	sethi	%hi(0x3fe921fb),LIM_l7
274	or	LIM_l7,%lo(0x3fe921fb),LIM_l7
275	ldd	[%g1+f30val],MSK_BIT31
276	ldd	[%g1+mask],MSK_BITSHI17
277	ldd	[%g1+q1],C_q1
278	ldd	[%g1+q2],C_q2
279	ldd	[%g1+q3],C_q3
280	ldd	[%g1+q4],C_q4
281	ldd	[%g1+one],C_ONE
282	ldd	[%g1+pp1],C_pp1
283	ldd	[%g1+pp2],C_pp2
284	ldd	[%g1+qq1],C_qq1
285	ldd	[%g1+qq2],C_qq2
286	sll	%i2,3,%i2		! scale strides
287	sll	%i4,3,%i4
288	add	%fp,x0_1,%o3		! precondition loop
289	add	%fp,x0_1,%o4
290	add	%fp,x0_1,%o5
291	ld	[%i1],%l0		! hx = *x
292	ld	[%i1],P0_f0
293	ld	[%i1+4],P0_f1
294	andn	%l0,MSK_SIGN,%l0		! hx &= ~0x80000000
295	add	%i1,%i2,%i1		! x += stridex
296
297	ba,pt	%icc,.loop0
298!delay slot
299	nop
300
301	.align 32
302.loop0:
303	lda	[%i1]%asi,%l1		! preload next argument
304	sub	%l0,LIM_l6,%g1
305	sub	LIM_l7,%l0,%o7
306	fands	P0_f0,MSK_BIT31,P0_f9		! save signbit
307
308	lda	[%i1]%asi,P1_f10
309	orcc	%o7,%g1,%g0
310	mov	%i3,%o0			! py0 = y
311	bl,pn	%icc,.range0		! if hx < 0x3e400000 or > 0x3fe921fb
312
313! delay slot
314	lda	[%i1+4]%asi,P1_f11
315	addcc	%i0,-1,%i0
316	add	%i3,%i4,%i3		! y += stridey
317	ble,pn	%icc,.endloop1
318
319! delay slot
320	andn	%l1,MSK_SIGN,%l1
321	add	%i1,%i2,%i1		! x += stridex
322	fabsd	P0_f0,P0_f0
323	fmuld	C_ONE,C_ONE,C_ONE		! one*one; a nop for alignment only
324
325.loop1:
326	lda	[%i1]%asi,%l2		! preload next argument
327	sub	%l1,LIM_l6,%g1
328	sub	LIM_l7,%l1,%o7
329	fands	P1_f10,MSK_BIT31,P1_f19		! save signbit
330
331	lda	[%i1]%asi,P2_f20
332	orcc	%o7,%g1,%g0
333	mov	%i3,%o1			! py1 = y
334	bl,pn	%icc,.range1		! if hx < 0x3e400000 or > 0x3fe921fb
335
336! delay slot
337	lda	[%i1+4]%asi,P2_f21
338	addcc	%i0,-1,%i0
339	add	%i3,%i4,%i3		! y += stridey
340	ble,pn	%icc,.endloop2
341
342! delay slot
343	andn	%l2,MSK_SIGN,%l2
344	add	%i1,%i2,%i1		! x += stridex
345	fabsd	P1_f10,P1_f10
346	fmuld	C_ONE,C_ONE,C_ONE		! one*one; a nop for alignment only
347
348.loop2:
349	st	P0_f6,[%o3]
350	sub	%l2,LIM_l6,%g1
351	sub	LIM_l7,%l2,%o7
352	fands	P2_f20,MSK_BIT31,P2_f29		! save signbit
353
354	st	P0_f7,[%o3+4]
355	orcc	%g1,%o7,%g0
356	mov	%i3,%o2			! py2 = y
357	bl,pn	%icc,.range2		! if hx < 0x3e400000 or > 0x3fe921fb
358
359! delay slot
360	add	%i3,%i4,%i3		! y += stridey
361	cmp	%l0,LIM_l5
362	fabsd	P2_f20,P2_f20
363	bl,pn	%icc,.case4
364
365! delay slot
366	st	P1_f16,[%o4]
367	cmp	%l1,LIM_l5
368	fpadd32s P0_f0,MSK_BIT13,P0_f8
369	bl,pn	%icc,.case2
370
371! delay slot
372	st	P1_f17,[%o4+4]
373	cmp	%l2,LIM_l5
374	fpadd32s P1_f10,MSK_BIT13,P1_f18
375	bl,pn	%icc,.case1
376
377! delay slot
378	st	P2_f26,[%o5]
379	mov	%o0,%o3
380	sethi	%hi(0x3fc3c000),%o7
381	fpadd32s P2_f20,MSK_BIT13,P2_f28
382
383	st	P2_f27,[%o5+4]
384	fand	P0_f8,MSK_BITSHI17,P0_f2
385	mov	%o1,%o4
386
387	fand	P1_f18,MSK_BITSHI17,P1_f12
388	mov	%o2,%o5
389	sub	%l0,%o7,%l0
390
391	fand	P2_f28,MSK_BITSHI17,P2_f22
392	sub	%l1,%o7,%l1
393	sub	%l2,%o7,%l2
394
395	fsubd	P0_f0,P0_f2,P0_f0
396	srl	%l0,10,%l0
397	add	SC_HI,8,%g1;add	SC_LO,8,%o7
398
399	fsubd	P1_f10,P1_f12,P1_f10
400	srl	%l1,10,%l1
401
402	fsubd	P2_f20,P2_f22,P2_f20
403	srl	%l2,10,%l2
404
405	fmuld	P0_f0,P0_f0,P0_f2
406	andn	%l0,0x1f,%l0
407
408	fmuld	P1_f10,P1_f10,P1_f12
409	andn	%l1,0x1f,%l1
410
411	fmuld	P2_f20,P2_f20,P2_f22
412	andn	%l2,0x1f,%l2
413
414	fmuld	P0_f2,C_pp2,P0_f6
415	ldd	[%g1+%l0],%f32
416
417	fmuld	P1_f12,C_pp2,P1_f16
418	ldd	[%g1+%l1],%f36
419
420	fmuld	P2_f22,C_pp2,P2_f26
421	ldd	[%g1+%l2],%f40
422
423	faddd	P0_f6,C_pp1,P0_f6
424	fmuld	P0_f2,C_qq2,P0_f4
425	ldd	[SC_HI+%l0],%f34
426
427	faddd	P1_f16,C_pp1,P1_f16
428	fmuld	P1_f12,C_qq2,P1_f14
429	ldd	[SC_HI+%l1],%f38
430
431	faddd	P2_f26,C_pp1,P2_f26
432	fmuld	P2_f22,C_qq2,P2_f24
433	ldd	[SC_HI+%l2],%f42
434
435	fmuld	P0_f2,P0_f6,P0_f6
436	faddd	P0_f4,C_qq1,P0_f4
437
438	fmuld	P1_f12,P1_f16,P1_f16
439	faddd	P1_f14,C_qq1,P1_f14
440
441	fmuld	P2_f22,P2_f26,P2_f26
442	faddd	P2_f24,C_qq1,P2_f24
443
444	faddd	P0_f6,C_ONE,P0_f6
445	fmuld	P0_f2,P0_f4,P0_f4
446
447	faddd	P1_f16,C_ONE,P1_f16
448	fmuld	P1_f12,P1_f14,P1_f14
449
450	faddd	P2_f26,C_ONE,P2_f26
451	fmuld	P2_f22,P2_f24,P2_f24
452
453	fmuld	P0_f0,P0_f6,P0_f6
454	ldd	[%o7+%l0],P0_f2
455
456	fmuld	P1_f10,P1_f16,P1_f16
457	ldd	[%o7+%l1],P1_f12
458
459	fmuld	P2_f20,P2_f26,P2_f26
460	ldd	[%o7+%l2],P2_f22
461
462	fmuld	P0_f4,%f32,P0_f4
463	lda	[%i1]%asi,%l0		! preload next argument
464
465	fmuld	P1_f14,%f36,P1_f14
466	lda	[%i1]%asi,P0_f0
467
468	fmuld	P2_f24,%f40,P2_f24
469	lda	[%i1+4]%asi,P0_f1
470
471	fmuld	P0_f6,%f34,P0_f6
472	add	%i1,%i2,%i1		! x += stridex
473
474	fmuld	P1_f16,%f38,P1_f16
475
476	fmuld	P2_f26,%f42,P2_f26
477
478	fsubd	P0_f6,P0_f4,P0_f6
479
480	fsubd	P1_f16,P1_f14,P1_f16
481
482	fsubd	P2_f26,P2_f24,P2_f26
483
484	fsubd	P0_f2,P0_f6,P0_f6
485
486	fsubd	P1_f12,P1_f16,P1_f16
487
488	fsubd	P2_f22,P2_f26,P2_f26
489
490	faddd	P0_f6,%f32,P0_f6
491
492	faddd	P1_f16,%f36,P1_f16
493
494	faddd	P2_f26,%f40,P2_f26
495	andn	%l0,MSK_SIGN,%l0		! hx &= ~0x80000000
496
497	nop	!!(vsin) 	fors	P0_f6,P0_f9,P0_f6
498	addcc	%i0,-1,%i0
499
500	nop	!!(vsin) 	fors	P1_f16,P1_f19,P1_f16
501	bg,pt	%icc,.loop0
502
503! delay slot
504	nop	!!(vsin) 	fors	P2_f26,P2_f29,P2_f26
505
506	ba,pt	%icc,.endloop0
507! delay slot
508	nop
509
510	.align	32
511.case1:
512	st	P2_f27,[%o5+4]
513	sethi	%hi(0x3fc3c000),%o7
514	fand	P0_f8,MSK_BITSHI17,P0_f2
515
516	sub	%l0,%o7,%l0
517	sub	%l1,%o7,%l1
518	add	SC_HI,8,%g1;add	SC_LO,8,%o7
519	fand	P1_f18,MSK_BITSHI17,P1_f12
520	fmuld	P2_f20,P2_f20,P2_f22
521
522	fsubd	P0_f0,P0_f2,P0_f0
523	srl	%l0,10,%l0
524	mov	%o0,%o3
525
526	fsubd	P1_f10,P1_f12,P1_f10
527	srl	%l1,10,%l1
528	mov	%o1,%o4
529
530	fmuld	P2_f22,C_q4,P2_f24
531	mov	%o2,%o5
532
533	fmuld	P0_f0,P0_f0,P0_f2
534	andn	%l0,0x1f,%l0
535
536	fmuld	P1_f10,P1_f10,P1_f12
537	andn	%l1,0x1f,%l1
538
539	faddd	P2_f24,C_q3,P2_f24
540
541	fmuld	P0_f2,C_pp2,P0_f6
542	ldd	[%g1+%l0],%f32
543
544	fmuld	P1_f12,C_pp2,P1_f16
545	ldd	[%g1+%l1],%f36
546
547	fmuld	P2_f22,P2_f24,P2_f24
548
549	faddd	P0_f6,C_pp1,P0_f6
550	fmuld	P0_f2,C_qq2,P0_f4
551	ldd	[SC_HI+%l0],%f34
552
553	faddd	P1_f16,C_pp1,P1_f16
554	fmuld	P1_f12,C_qq2,P1_f14
555	ldd	[SC_HI+%l1],%f38
556
557	faddd	P2_f24,C_q2,P2_f24
558
559	fmuld	P0_f2,P0_f6,P0_f6
560	faddd	P0_f4,C_qq1,P0_f4
561
562	fmuld	P1_f12,P1_f16,P1_f16
563	faddd	P1_f14,C_qq1,P1_f14
564
565	fmuld	P2_f22,P2_f24,P2_f24
566
567	faddd	P0_f6,C_ONE,P0_f6
568	fmuld	P0_f2,P0_f4,P0_f4
569
570	faddd	P1_f16,C_ONE,P1_f16
571	fmuld	P1_f12,P1_f14,P1_f14
572
573	faddd	P2_f24,C_q1,P2_f24
574
575	fmuld	P0_f0,P0_f6,P0_f6
576	ldd	[%o7+%l0],P0_f2
577
578	fmuld	P1_f10,P1_f16,P1_f16
579	ldd	[%o7+%l1],P1_f12
580
581	fmuld	P0_f4,%f32,P0_f4
582	lda	[%i1]%asi,%l0		! preload next argument
583
584	fmuld	P1_f14,%f36,P1_f14
585	lda	[%i1]%asi,P0_f0
586
587	fmuld	P0_f6,%f34,P0_f6
588	lda	[%i1+4]%asi,P0_f1
589
590	fmuld	P1_f16,%f38,P1_f16
591	add	%i1,%i2,%i1		! x += stridex
592
593	fmuld	P2_f22,P2_f24,P2_f24
594
595	fsubd	P0_f6,P0_f4,P0_f6
596
597	fsubd	P1_f16,P1_f14,P1_f16
598
599	!!(vsin)fmuld	P2_f20,P2_f24,P2_f24
600
601	fsubd	P0_f2,P0_f6,P0_f6
602
603	fsubd	P1_f12,P1_f16,P1_f16
604
605	faddd	C_ONE,P2_f24,P2_f26 !!(vsin)faddd	P2_f20,P2_f24,P2_f26
606
607	faddd	P0_f6,%f32,P0_f6
608
609	faddd	P1_f16,%f36,P1_f16
610	andn	%l0,MSK_SIGN,%l0		! hx &= ~0x80000000
611
612	nop	!!(vsin) 	fors	P2_f26,P2_f29,P2_f26
613	addcc	%i0,-1,%i0
614
615	nop	!!(vsin) 	fors	P0_f6,P0_f9,P0_f6
616	bg,pt	%icc,.loop0
617
618! delay slot
619	nop	!!(vsin) 	fors	P1_f16,P1_f19,P1_f16
620
621	ba,pt	%icc,.endloop0
622! delay slot
623	nop
624
625	.align	32
626.case2:
627	st	P2_f26,[%o5]
628	cmp	%l2,LIM_l5
629	fpadd32s P2_f20,MSK_BIT13,P2_f28
630	bl,pn	%icc,.case3
631
632! delay slot
633	st	P2_f27,[%o5+4]
634	sethi	%hi(0x3fc3c000),%o7
635	fand	P0_f8,MSK_BITSHI17,P0_f2
636
637	sub	%l0,%o7,%l0
638	sub	%l2,%o7,%l2
639	add	SC_HI,8,%g1;add	SC_LO,8,%o7
640	fand	P2_f28,MSK_BITSHI17,P2_f22
641	fmuld	P1_f10,P1_f10,P1_f12
642
643	fsubd	P0_f0,P0_f2,P0_f0
644	srl	%l0,10,%l0
645	mov	%o0,%o3
646
647	fsubd	P2_f20,P2_f22,P2_f20
648	srl	%l2,10,%l2
649	mov	%o2,%o5
650
651	fmuld	P1_f12,C_q4,P1_f14
652	mov	%o1,%o4
653
654	fmuld	P0_f0,P0_f0,P0_f2
655	andn	%l0,0x1f,%l0
656
657	fmuld	P2_f20,P2_f20,P2_f22
658	andn	%l2,0x1f,%l2
659
660	faddd	P1_f14,C_q3,P1_f14
661
662	fmuld	P0_f2,C_pp2,P0_f6
663	ldd	[%g1+%l0],%f32
664
665	fmuld	P2_f22,C_pp2,P2_f26
666	ldd	[%g1+%l2],%f40
667
668	fmuld	P1_f12,P1_f14,P1_f14
669
670	faddd	P0_f6,C_pp1,P0_f6
671	fmuld	P0_f2,C_qq2,P0_f4
672	ldd	[SC_HI+%l0],%f34
673
674	faddd	P2_f26,C_pp1,P2_f26
675	fmuld	P2_f22,C_qq2,P2_f24
676	ldd	[SC_HI+%l2],%f42
677
678	faddd	P1_f14,C_q2,P1_f14
679
680	fmuld	P0_f2,P0_f6,P0_f6
681	faddd	P0_f4,C_qq1,P0_f4
682
683	fmuld	P2_f22,P2_f26,P2_f26
684	faddd	P2_f24,C_qq1,P2_f24
685
686	fmuld	P1_f12,P1_f14,P1_f14
687
688	faddd	P0_f6,C_ONE,P0_f6
689	fmuld	P0_f2,P0_f4,P0_f4
690
691	faddd	P2_f26,C_ONE,P2_f26
692	fmuld	P2_f22,P2_f24,P2_f24
693
694	faddd	P1_f14,C_q1,P1_f14
695
696	fmuld	P0_f0,P0_f6,P0_f6
697	ldd	[%o7+%l0],P0_f2
698
699	fmuld	P2_f20,P2_f26,P2_f26
700	ldd	[%o7+%l2],P2_f22
701
702	fmuld	P0_f4,%f32,P0_f4
703	lda	[%i1]%asi,%l0		! preload next argument
704
705	fmuld	P2_f24,%f40,P2_f24
706	lda	[%i1]%asi,P0_f0
707
708	fmuld	P0_f6,%f34,P0_f6
709	lda	[%i1+4]%asi,P0_f1
710
711	fmuld	P2_f26,%f42,P2_f26
712	add	%i1,%i2,%i1		! x += stridex
713
714	fmuld	P1_f12,P1_f14,P1_f14
715
716	fsubd	P0_f6,P0_f4,P0_f6
717
718	fsubd	P2_f26,P2_f24,P2_f26
719
720	!!(vsin)fmuld	P1_f10,P1_f14,P1_f14
721
722	fsubd	P0_f2,P0_f6,P0_f6
723
724	fsubd	P2_f22,P2_f26,P2_f26
725
726	faddd	C_ONE,P1_f14,P1_f16 !!(vsin)faddd	P1_f10,P1_f14,P1_f16
727
728	faddd	P0_f6,%f32,P0_f6
729
730	faddd	P2_f26,%f40,P2_f26
731	andn	%l0,MSK_SIGN,%l0		! hx &= ~0x80000000
732
733	nop	!!(vsin) 	fors	P1_f16,P1_f19,P1_f16
734	addcc	%i0,-1,%i0
735
736	nop	!!(vsin) 	fors	P0_f6,P0_f9,P0_f6
737	bg,pt	%icc,.loop0
738
739! delay slot
740	nop	!!(vsin) 	fors	P2_f26,P2_f29,P2_f26
741
742	ba,pt	%icc,.endloop0
743! delay slot
744	nop
745
746	.align	32
747.case3:
748	sethi	%hi(0x3fc3c000),%o7
749	fand	P0_f8,MSK_BITSHI17,P0_f2
750	fmuld	P1_f10,P1_f10,P1_f12
751
752	sub	%l0,%o7,%l0
753	add	SC_HI,8,%g1;add	SC_LO,8,%o7
754	fmuld	P2_f20,P2_f20,P2_f22
755
756	fsubd	P0_f0,P0_f2,P0_f0
757	srl	%l0,10,%l0
758	mov	%o0,%o3
759
760	fmuld	P1_f12,C_q4,P1_f14
761	mov	%o1,%o4
762
763	fmuld	P2_f22,C_q4,P2_f24
764	mov	%o2,%o5
765
766	fmuld	P0_f0,P0_f0,P0_f2
767	andn	%l0,0x1f,%l0
768
769	faddd	P1_f14,C_q3,P1_f14
770
771	faddd	P2_f24,C_q3,P2_f24
772
773	fmuld	P0_f2,C_pp2,P0_f6
774	ldd	[%g1+%l0],%f32
775
776	fmuld	P1_f12,P1_f14,P1_f14
777
778	fmuld	P2_f22,P2_f24,P2_f24
779
780	faddd	P0_f6,C_pp1,P0_f6
781	fmuld	P0_f2,C_qq2,P0_f4
782	ldd	[SC_HI+%l0],%f34
783
784	faddd	P1_f14,C_q2,P1_f14
785
786	faddd	P2_f24,C_q2,P2_f24
787
788	fmuld	P0_f2,P0_f6,P0_f6
789	faddd	P0_f4,C_qq1,P0_f4
790
791	fmuld	P1_f12,P1_f14,P1_f14
792
793	fmuld	P2_f22,P2_f24,P2_f24
794
795	faddd	P0_f6,C_ONE,P0_f6
796	fmuld	P0_f2,P0_f4,P0_f4
797
798	faddd	P1_f14,C_q1,P1_f14
799
800	faddd	P2_f24,C_q1,P2_f24
801
802	fmuld	P0_f0,P0_f6,P0_f6
803	ldd	[%o7+%l0],P0_f2
804
805	fmuld	P0_f4,%f32,P0_f4
806	lda	[%i1]%asi,%l0		! preload next argument
807
808	fmuld	P1_f12,P1_f14,P1_f14
809	lda	[%i1]%asi,P0_f0
810
811	fmuld	P0_f6,%f34,P0_f6
812	lda	[%i1+4]%asi,P0_f1
813
814	fmuld	P2_f22,P2_f24,P2_f24
815	add	%i1,%i2,%i1		! x += stridex
816
817	!!(vsin)fmuld	P1_f10,P1_f14,P1_f14
818
819	fsubd	P0_f6,P0_f4,P0_f6
820
821	!!(vsin)fmuld	P2_f20,P2_f24,P2_f24
822
823	faddd	C_ONE,P1_f14,P1_f16 !!(vsin)faddd	P1_f10,P1_f14,P1_f16
824
825	fsubd	P0_f2,P0_f6,P0_f6
826
827	faddd	C_ONE,P2_f24,P2_f26 !!(vsin)faddd	P2_f20,P2_f24,P2_f26
828
829	nop	!!(vsin) 	fors	P1_f16,P1_f19,P1_f16
830	andn	%l0,MSK_SIGN,%l0		! hx &= ~0x80000000
831
832	faddd	P0_f6,%f32,P0_f6
833	addcc	%i0,-1,%i0
834
835	nop	!!(vsin) 	fors	P2_f26,P2_f29,P2_f26
836	bg,pt	%icc,.loop0
837
838! delay slot
839	nop	!!(vsin) 	fors	P0_f6,P0_f9,P0_f6
840
841	ba,pt	%icc,.endloop0
842! delay slot
843	nop
844
845	.align	32
846.case4:
847	st	P1_f17,[%o4+4]
848	cmp	%l1,LIM_l5
849	fpadd32s P1_f10,MSK_BIT13,P1_f18
850	bl,pn	%icc,.case6
851
852! delay slot
853	st	P2_f26,[%o5]
854	cmp	%l2,LIM_l5
855	fpadd32s P2_f20,MSK_BIT13,P2_f28
856	bl,pn	%icc,.case5
857
858! delay slot
859	st	P2_f27,[%o5+4]
860	sethi	%hi(0x3fc3c000),%o7
861	fand	P1_f18,MSK_BITSHI17,P1_f12
862
863	sub	%l1,%o7,%l1
864	sub	%l2,%o7,%l2
865	add	SC_HI,8,%g1;add	SC_LO,8,%o7
866	fand	P2_f28,MSK_BITSHI17,P2_f22
867	fmuld	P0_f0,P0_f0,P0_f2
868
869	fsubd	P1_f10,P1_f12,P1_f10
870	srl	%l1,10,%l1
871	mov	%o1,%o4
872
873	fsubd	P2_f20,P2_f22,P2_f20
874	srl	%l2,10,%l2
875	mov	%o2,%o5
876
877	fmovd	P0_f0,P0_f6		!ID for processing
878	fmuld	P0_f2,C_q4,P0_f4
879	mov	%o0,%o3
880
881	fmuld	P1_f10,P1_f10,P1_f12
882	andn	%l1,0x1f,%l1
883
884	fmuld	P2_f20,P2_f20,P2_f22
885	andn	%l2,0x1f,%l2
886
887	faddd	P0_f4,C_q3,P0_f4
888
889	fmuld	P1_f12,C_pp2,P1_f16
890	ldd	[%g1+%l1],%f36
891
892	fmuld	P2_f22,C_pp2,P2_f26
893	ldd	[%g1+%l2],%f40
894
895	fmuld	P0_f2,P0_f4,P0_f4
896
897	faddd	P1_f16,C_pp1,P1_f16
898	fmuld	P1_f12,C_qq2,P1_f14
899	ldd	[SC_HI+%l1],%f38
900
901	faddd	P2_f26,C_pp1,P2_f26
902	fmuld	P2_f22,C_qq2,P2_f24
903	ldd	[SC_HI+%l2],%f42
904
905	faddd	P0_f4,C_q2,P0_f4
906
907	fmuld	P1_f12,P1_f16,P1_f16
908	faddd	P1_f14,C_qq1,P1_f14
909
910	fmuld	P2_f22,P2_f26,P2_f26
911	faddd	P2_f24,C_qq1,P2_f24
912
913	fmuld	P0_f2,P0_f4,P0_f4
914
915	faddd	P1_f16,C_ONE,P1_f16
916	fmuld	P1_f12,P1_f14,P1_f14
917
918	faddd	P2_f26,C_ONE,P2_f26
919	fmuld	P2_f22,P2_f24,P2_f24
920
921	faddd	P0_f4,C_q1,P0_f4
922
923	fmuld	P1_f10,P1_f16,P1_f16
924	ldd	[%o7+%l1],P1_f12
925
926	fmuld	P2_f20,P2_f26,P2_f26
927	ldd	[%o7+%l2],P2_f22
928
929	fmuld	P1_f14,%f36,P1_f14
930	lda	[%i1]%asi,%l0		! preload next argument
931
932	fmuld	P2_f24,%f40,P2_f24
933	lda	[%i1]%asi,P0_f0
934
935	fmuld	P1_f16,%f38,P1_f16
936	lda	[%i1+4]%asi,P0_f1
937
938	fmuld	P2_f26,%f42,P2_f26
939	add	%i1,%i2,%i1		! x += stridex
940
941	fmuld	P0_f2,P0_f4,P0_f4
942
943	fsubd	P1_f16,P1_f14,P1_f16
944
945	fsubd	P2_f26,P2_f24,P2_f26
946
947	!!(vsin)fmuld	P0_f6,P0_f4,P0_f4
948
949	fsubd	P1_f12,P1_f16,P1_f16
950
951	fsubd	P2_f22,P2_f26,P2_f26
952
953	faddd	C_ONE,P0_f4,P0_f6 !!(vsin)faddd   P0_f6,P0_f4,P0_f6	! faddd then spaces for processing
954
955	faddd	P1_f16,%f36,P1_f16
956
957	faddd	P2_f26,%f40,P2_f26
958	andn	%l0,MSK_SIGN,%l0		! hx &= ~0x80000000
959
960	nop	!!(vsin) 	fors	P0_f6,P0_f9,P0_f6
961	addcc	%i0,-1,%i0
962
963	nop	!!(vsin) 	fors	P1_f16,P1_f19,P1_f16
964	bg,pt	%icc,.loop0
965
966! delay slot
967	nop	!!(vsin) 	fors	P2_f26,P2_f29,P2_f26
968
969	ba,pt	%icc,.endloop0
970! delay slot
971	nop
972
973	.align	32
974.case5:
975	sethi	%hi(0x3fc3c000),%o7
976	fand	P1_f18,MSK_BITSHI17,P1_f12
977	fmuld	P0_f0,P0_f0,P0_f2
978
979	sub	%l1,%o7,%l1
980	add	SC_HI,8,%g1;add	SC_LO,8,%o7
981	fmuld	P2_f20,P2_f20,P2_f22
982
983	fsubd	P1_f10,P1_f12,P1_f10
984	srl	%l1,10,%l1
985	mov	%o1,%o4
986
987	fmovd	P0_f0,P0_f6		!ID for processing
988	fmuld	P0_f2,C_q4,P0_f4
989	mov	%o0,%o3
990
991	fmuld	P2_f22,C_q4,P2_f24
992	mov	%o2,%o5
993
994	fmuld	P1_f10,P1_f10,P1_f12
995	andn	%l1,0x1f,%l1
996
997	faddd	P0_f4,C_q3,P0_f4
998
999	faddd	P2_f24,C_q3,P2_f24
1000
1001	fmuld	P1_f12,C_pp2,P1_f16
1002	ldd	[%g1+%l1],%f36
1003
1004	fmuld	P0_f2,P0_f4,P0_f4
1005
1006	fmuld	P2_f22,P2_f24,P2_f24
1007
1008	faddd	P1_f16,C_pp1,P1_f16
1009	fmuld	P1_f12,C_qq2,P1_f14
1010	ldd	[SC_HI+%l1],%f38
1011
1012	faddd	P0_f4,C_q2,P0_f4
1013
1014	faddd	P2_f24,C_q2,P2_f24
1015
1016	fmuld	P1_f12,P1_f16,P1_f16
1017	faddd	P1_f14,C_qq1,P1_f14
1018
1019	fmuld	P0_f2,P0_f4,P0_f4
1020
1021	fmuld	P2_f22,P2_f24,P2_f24
1022
1023	faddd	P1_f16,C_ONE,P1_f16
1024	fmuld	P1_f12,P1_f14,P1_f14
1025
1026	faddd	P0_f4,C_q1,P0_f4
1027
1028	faddd	P2_f24,C_q1,P2_f24
1029
1030	fmuld	P1_f10,P1_f16,P1_f16
1031	ldd	[%o7+%l1],P1_f12
1032
1033	fmuld	P1_f14,%f36,P1_f14
1034	lda	[%i1]%asi,%l0		! preload next argument
1035
1036	fmuld	P0_f2,P0_f4,P0_f4
1037	lda	[%i1]%asi,P0_f0
1038
1039	fmuld	P1_f16,%f38,P1_f16
1040	lda	[%i1+4]%asi,P0_f1
1041
1042	fmuld	P2_f22,P2_f24,P2_f24
1043	add	%i1,%i2,%i1		! x += stridex
1044
1045	!!(vsin)fmuld	P0_f6,P0_f4,P0_f4
1046
1047	fsubd	P1_f16,P1_f14,P1_f16
1048
1049	!!(vsin)fmuld	P2_f20,P2_f24,P2_f24
1050
1051	faddd	C_ONE,P0_f4,P0_f6 !!(vsin)faddd   P0_f6,P0_f4,P0_f6	! faddd then spaces for processing
1052
1053	fsubd	P1_f12,P1_f16,P1_f16
1054
1055	faddd	C_ONE,P2_f24,P2_f26 !!(vsin)faddd	P2_f20,P2_f24,P2_f26
1056
1057	nop	!!(vsin) 	fors	P0_f6,P0_f9,P0_f6
1058	andn	%l0,MSK_SIGN,%l0		! hx &= ~0x80000000
1059
1060	faddd	P1_f16,%f36,P1_f16
1061	addcc	%i0,-1,%i0
1062
1063	nop	!!(vsin) 	fors	P2_f26,P2_f29,P2_f26
1064	bg,pt	%icc,.loop0
1065
1066! delay slot
1067	nop	!!(vsin) 	fors	P1_f16,P1_f19,P1_f16
1068
1069	ba,pt	%icc,.endloop0
1070! delay slot
1071	nop
1072
1073	.align	32
1074.case6:
1075	st	P2_f27,[%o5+4]
1076	cmp	%l2,LIM_l5
1077	fpadd32s P2_f20,MSK_BIT13,P2_f28
1078	bl,pn	%icc,.case7
1079
1080! delay slot
1081	sethi	%hi(0x3fc3c000),%o7
1082	fand	P2_f28,MSK_BITSHI17,P2_f22
1083	fmuld	P0_f0,P0_f0,P0_f2
1084
1085	sub	%l2,%o7,%l2
1086	add	SC_HI,8,%g1;add	SC_LO,8,%o7
1087	fmuld	P1_f10,P1_f10,P1_f12
1088
1089	fsubd	P2_f20,P2_f22,P2_f20
1090	srl	%l2,10,%l2
1091	mov	%o2,%o5
1092
1093	fmovd	P0_f0,P0_f6		!ID for processing
1094	fmuld	P0_f2,C_q4,P0_f4
1095	mov	%o0,%o3
1096
1097	fmuld	P1_f12,C_q4,P1_f14
1098	mov	%o1,%o4
1099
1100	fmuld	P2_f20,P2_f20,P2_f22
1101	andn	%l2,0x1f,%l2
1102
1103	faddd	P0_f4,C_q3,P0_f4
1104
1105	faddd	P1_f14,C_q3,P1_f14
1106
1107	fmuld	P2_f22,C_pp2,P2_f26
1108	ldd	[%g1+%l2],%f40
1109
1110	fmuld	P0_f2,P0_f4,P0_f4
1111
1112	fmuld	P1_f12,P1_f14,P1_f14
1113
1114	faddd	P2_f26,C_pp1,P2_f26
1115	fmuld	P2_f22,C_qq2,P2_f24
1116	ldd	[SC_HI+%l2],%f42
1117
1118	faddd	P0_f4,C_q2,P0_f4
1119
1120	faddd	P1_f14,C_q2,P1_f14
1121
1122	fmuld	P2_f22,P2_f26,P2_f26
1123	faddd	P2_f24,C_qq1,P2_f24
1124
1125	fmuld	P0_f2,P0_f4,P0_f4
1126
1127	fmuld	P1_f12,P1_f14,P1_f14
1128
1129	faddd	P2_f26,C_ONE,P2_f26
1130	fmuld	P2_f22,P2_f24,P2_f24
1131
1132	faddd	P0_f4,C_q1,P0_f4
1133
1134	faddd	P1_f14,C_q1,P1_f14
1135
1136	fmuld	P2_f20,P2_f26,P2_f26
1137	ldd	[%o7+%l2],P2_f22
1138
1139	fmuld	P2_f24,%f40,P2_f24
1140	lda	[%i1]%asi,%l0		! preload next argument
1141
1142	fmuld	P0_f2,P0_f4,P0_f4
1143	lda	[%i1]%asi,P0_f0
1144
1145	fmuld	P2_f26,%f42,P2_f26
1146	lda	[%i1+4]%asi,P0_f1
1147
1148	fmuld	P1_f12,P1_f14,P1_f14
1149	add	%i1,%i2,%i1		! x += stridex
1150
1151	!!(vsin)fmuld	P0_f6,P0_f4,P0_f4
1152
1153	fsubd	P2_f26,P2_f24,P2_f26
1154
1155	!!(vsin)fmuld	P1_f10,P1_f14,P1_f14
1156
1157	faddd	C_ONE,P0_f4,P0_f6 !!(vsin)faddd   P0_f6,P0_f4,P0_f6	! faddd then spaces for processing
1158
1159	fsubd	P2_f22,P2_f26,P2_f26
1160
1161	faddd	C_ONE,P1_f14,P1_f16 !!(vsin)faddd	P1_f10,P1_f14,P1_f16
1162
1163	nop	!!(vsin) 	fors	P0_f6,P0_f9,P0_f6
1164	andn	%l0,MSK_SIGN,%l0		! hx &= ~0x80000000
1165
1166	faddd	P2_f26,%f40,P2_f26
1167	addcc	%i0,-1,%i0
1168
1169	nop	!!(vsin) 	fors	P1_f16,P1_f19,P1_f16
1170	bg,pt	%icc,.loop0
1171
1172! delay slot
1173	nop	!!(vsin) 	fors	P2_f26,P2_f29,P2_f26
1174
1175	ba,pt	%icc,.endloop0
1176! delay slot
1177	nop
1178
1179	.align	32
1180.case7:
1181	fmuld	P0_f0,P0_f0,P0_f2
1182	fmovd	P0_f0,P0_f6		!ID for processing
1183	mov	%o0,%o3
1184
1185	fmuld	P1_f10,P1_f10,P1_f12
1186	mov	%o1,%o4
1187
1188	fmuld	P2_f20,P2_f20,P2_f22
1189	mov	%o2,%o5
1190
1191	fmuld	P0_f2,C_q4,P0_f4
1192	lda	[%i1]%asi,%l0		! preload next argument
1193
1194	fmuld	P1_f12,C_q4,P1_f14
1195	lda	[%i1]%asi,P0_f0
1196
1197	fmuld	P2_f22,C_q4,P2_f24
1198	lda	[%i1+4]%asi,P0_f1
1199
1200	faddd	P0_f4,C_q3,P0_f4
1201	add	%i1,%i2,%i1		! x += stridex
1202
1203	faddd	P1_f14,C_q3,P1_f14
1204
1205	faddd	P2_f24,C_q3,P2_f24
1206
1207	fmuld	P0_f2,P0_f4,P0_f4
1208
1209	fmuld	P1_f12,P1_f14,P1_f14
1210
1211	fmuld	P2_f22,P2_f24,P2_f24
1212
1213	faddd	P0_f4,C_q2,P0_f4
1214
1215	faddd	P1_f14,C_q2,P1_f14
1216
1217	faddd	P2_f24,C_q2,P2_f24
1218
1219	fmuld	P0_f2,P0_f4,P0_f4
1220
1221	fmuld	P1_f12,P1_f14,P1_f14
1222
1223	fmuld	P2_f22,P2_f24,P2_f24
1224
1225	faddd	P0_f4,C_q1,P0_f4
1226
1227	faddd	P1_f14,C_q1,P1_f14
1228
1229	faddd	P2_f24,C_q1,P2_f24
1230
1231	fmuld	P0_f2,P0_f4,P0_f4
1232
1233	fmuld	P1_f12,P1_f14,P1_f14
1234
1235	fmuld	P2_f22,P2_f24,P2_f24
1236
1237	!!(vsin)fmuld	P0_f6,P0_f4,P0_f4
1238
1239	!!(vsin)fmuld	P1_f10,P1_f14,P1_f14
1240
1241	!!(vsin)fmuld	P2_f20,P2_f24,P2_f24
1242
1243	faddd	C_ONE,P0_f4,P0_f6 !!(vsin)faddd   P0_f6,P0_f4,P0_f6	! faddd then spaces for processing
1244
1245	faddd	C_ONE,P1_f14,P1_f16 !!(vsin)faddd	P1_f10,P1_f14,P1_f16
1246
1247	faddd	C_ONE,P2_f24,P2_f26 !!(vsin)faddd	P2_f20,P2_f24,P2_f26
1248	andn	%l0,MSK_SIGN,%l0		! hx &= ~0x80000000
1249
1250	nop	!!(vsin) 	fors	P0_f6,P0_f9,P0_f6
1251	addcc	%i0,-1,%i0
1252
1253	nop	!!(vsin) 	fors	P1_f16,P1_f19,P1_f16
1254	bg,pt	%icc,.loop0
1255
1256! delay slot
1257	nop	!!(vsin) 	fors	P2_f26,P2_f29,P2_f26
1258
1259	ba,pt	%icc,.endloop0
1260! delay slot
1261	nop
1262
1263
1264	.align	32
1265.endloop2:
1266	cmp	%l1,LIM_l5
1267	bl,pn	%icc,1f
1268! delay slot
1269	fabsd	P1_f10,P1_f10
1270	sethi	%hi(0x3fc3c000),%o7
1271	fpadd32s P1_f10,MSK_BIT13,P1_f18
1272	fand	P1_f18,MSK_BITSHI17,P1_f12
1273	sub	%l1,%o7,%l1
1274	add	SC_HI,8,%g1;add	SC_LO,8,%o7
1275	fsubd	P1_f10,P1_f12,P1_f10
1276	srl	%l1,10,%l1
1277	fmuld	P1_f10,P1_f10,P1_f12
1278	andn	%l1,0x1f,%l1
1279	fmuld	P1_f12,C_pp2,P2_f20
1280	ldd	[%g1+%l1],%f36
1281	faddd	P2_f20,C_pp1,P2_f20
1282	fmuld	P1_f12,C_qq2,P1_f14
1283	ldd	[SC_HI+%l1],%f38
1284	fmuld	P1_f12,P2_f20,P2_f20
1285	faddd	P1_f14,C_qq1,P1_f14
1286	faddd	P2_f20,C_ONE,P2_f20
1287	fmuld	P1_f12,P1_f14,P1_f14
1288	fmuld	P1_f10,P2_f20,P2_f20
1289	ldd	[%o7+%l1],P1_f12
1290	fmuld	P1_f14,%f36,P1_f14
1291	fmuld	P2_f20,%f38,P2_f20
1292	fsubd	P2_f20,P1_f14,P2_f20
1293	fsubd	P1_f12,P2_f20,P2_f20
1294	ba,pt	%icc,2f
1295! delay slot
1296	faddd	P2_f20,%f36,P2_f20
12971:
1298	fmuld	P1_f10,P1_f10,P1_f12
1299	fmuld	P1_f12,C_q4,P1_f14
1300	faddd	P1_f14,C_q3,P1_f14
1301	fmuld	P1_f12,P1_f14,P1_f14
1302	faddd	P1_f14,C_q2,P1_f14
1303	fmuld	P1_f12,P1_f14,P1_f14
1304	faddd	P1_f14,C_q1,P1_f14
1305	fmuld	P1_f12,P1_f14,P1_f14
1306	!!(vsin)fmuld	P1_f10,P1_f14,P1_f14
1307	faddd	C_ONE,P1_f14,P2_f20 !!(vsin)faddd	P1_f10,P1_f14,P2_f20
13082:
1309	nop	!!(vsin) 	fors	P2_f20,P1_f19,P2_f20
1310	st	P2_f20,[%o1]
1311	st	P2_f21,[%o1+4]
1312
1313.endloop1:
1314	cmp	%l0,LIM_l5
1315	bl,pn	%icc,1f
1316! delay slot
1317	fabsd	P0_f0,P0_f0
1318	sethi	%hi(0x3fc3c000),%o7
1319	fpadd32s P0_f0,MSK_BIT13,P0_f8
1320	fand	P0_f8,MSK_BITSHI17,P0_f2
1321	sub	%l0,%o7,%l0
1322	add	SC_HI,8,%g1;add	SC_LO,8,%o7
1323	fsubd	P0_f0,P0_f2,P0_f0
1324	srl	%l0,10,%l0
1325	fmuld	P0_f0,P0_f0,P0_f2
1326	andn	%l0,0x1f,%l0
1327	fmuld	P0_f2,C_pp2,P2_f20
1328	ldd	[%g1+%l0],%f32
1329	faddd	P2_f20,C_pp1,P2_f20
1330	fmuld	P0_f2,C_qq2,P0_f4
1331	ldd	[SC_HI+%l0],%f34
1332	fmuld	P0_f2,P2_f20,P2_f20
1333	faddd	P0_f4,C_qq1,P0_f4
1334	faddd	P2_f20,C_ONE,P2_f20
1335	fmuld	P0_f2,P0_f4,P0_f4
1336	fmuld	P0_f0,P2_f20,P2_f20
1337	ldd	[%o7+%l0],P0_f2
1338	fmuld	P0_f4,%f32,P0_f4
1339	fmuld	P2_f20,%f34,P2_f20
1340	fsubd	P2_f20,P0_f4,P2_f20
1341	fsubd	P0_f2,P2_f20,P2_f20
1342	ba,pt	%icc,2f
1343! delay slot
1344	faddd	P2_f20,%f32,P2_f20
13451:
1346	fmuld	P0_f0,P0_f0,P0_f2
1347	fmuld	P0_f2,C_q4,P0_f4
1348	faddd	P0_f4,C_q3,P0_f4
1349	fmuld	P0_f2,P0_f4,P0_f4
1350	faddd	P0_f4,C_q2,P0_f4
1351	fmuld	P0_f2,P0_f4,P0_f4
1352	faddd	P0_f4,C_q1,P0_f4
1353	fmuld	P0_f2,P0_f4,P0_f4
1354	!!(vsin)fmuld	P0_f0,P0_f4,P0_f4
1355	faddd	C_ONE,P0_f4,P2_f20 !!(vsin)faddd	P0_f0,P0_f4,P2_f20
13562:
1357	nop	!!(vsin) 	fors	P2_f20,P0_f9,P2_f20
1358	st	P2_f20,[%o0]
1359	st	P2_f21,[%o0+4]
1360
1361.endloop0:
1362	st	P0_f6,[%o3]
1363	st	P0_f7,[%o3+4]
1364	st	P1_f16,[%o4]
1365	st	P1_f17,[%o4+4]
1366	st	P2_f26,[%o5]
1367	st	P2_f27,[%o5+4]
1368
1369! return.  finished off with only primary range arguments
1370
1371	ret
1372	restore
1373
1374
1375	.align	32
1376.range0:
1377	cmp	%l0,LIM_l6
1378	bg,a,pt	%icc,.MEDIUM		! branch to Medium range on big arg.
1379! delay slot, annulled if branch not taken
1380	mov	0x1,LIM_l6		! set biguns flag or
1381	fdtoi	P0_f0,P0_f2; fmovd	C_ONE,P0_f0 ; st	P0_f0,[%o0]		! *y = 1.0 with inexact if x nonzero
1382	st	P0_f1,[%o0+4]
1383	!nop		! (vsin) fdtoi	P0_f0,P0_f2
1384	addcc	%i0,-1,%i0
1385	ble,pn	%icc,.endloop0
1386! delay slot, harmless if branch taken
1387	add	%i3,%i4,%i3		! y += stridey
1388	andn	%l1,MSK_SIGN,%l0		! hx &= ~0x80000000
1389	fmovd	P1_f10,P0_f0
1390	ba,pt	%icc,.loop0
1391! delay slot
1392	add	%i1,%i2,%i1		! x += stridex
1393
1394
1395	.align	32
1396.range1:
1397	cmp	%l1,LIM_l6
1398	bg,a,pt	%icc,.MEDIUM		! branch to Medium range on big arg.
1399! delay slot, annulled if branch not taken
1400	mov	0x2,LIM_l6		! set biguns flag or
1401	fdtoi	P1_f10,P1_f12; fmovd	C_ONE,P1_f10 ; st	P1_f10,[%o1]		! *y = 1.0 with inexact if x nonzero
1402	st	P1_f11,[%o1+4]
1403	!nop		! (vsin) fdtoi	P1_f10,P1_f12
1404	addcc	%i0,-1,%i0
1405	ble,pn	%icc,.endloop1
1406! delay slot, harmless if branch taken
1407	add	%i3,%i4,%i3		! y += stridey
1408	andn	%l2,MSK_SIGN,%l1		! hx &= ~0x80000000
1409	fmovd	P2_f20,P1_f10
1410	ba,pt	%icc,.loop1
1411! delay slot
1412	add	%i1,%i2,%i1		! x += stridex
1413
1414
1415	.align	32
1416.range2:
1417	cmp	%l2,LIM_l6
1418	bg,a,pt	%icc,.MEDIUM		! branch to Medium range on big arg.
1419! delay slot, annulled if branch not taken
1420	mov	0x3,LIM_l6		! set biguns flag or
1421	fdtoi	P2_f20,P2_f22; fmovd	C_ONE,P2_f20 ; st	P2_f20,[%o2]		! *y = 1.0 with inexact if x nonzero
1422	st	P2_f21,[%o2+4]
1423	nop		! (vsin) fdtoi	P2_f20,P2_f22
14241:
1425	addcc	%i0,-1,%i0
1426	ble,pn	%icc,.endloop2
1427! delay slot
1428	nop
1429	ld	[%i1],%l2
1430	ld	[%i1],P2_f20
1431	ld	[%i1+4],P2_f21
1432	andn	%l2,MSK_SIGN,%l2		! hx &= ~0x80000000
1433	ba,pt	%icc,.loop2
1434! delay slot
1435	add	%i1,%i2,%i1		! x += stridex
1436
1437
1438	.align	32
! .MEDIUM: entry to the medium range loop.  Reached from .range0/1/2 with
! LIM_l6 = 1, 2 or 3 naming the lane (0, 1 or 2) whose argument was too
! large.  This block reloads %l5 with the address of constants, sets
! %l7 = 0x413921fb (upper bound on hx for this loop), loads the argument
! reduction constants into %f40-%f52, primes the stack scratch pairs used
! by the polynomial paths, and then joins .LOOP0, .LOOP1 or .LOOP2 after
! redoing the floating point steps those entry points expect to have
! been done already for the earlier lanes.
1439.MEDIUM:
1440
1441! ========== medium range ==========
1442
1443! register use
1444
1445! i0  n
1446! i1  x
1447! i2  stridex
1448! i3  y
1449! i4  stridey
1450! i5  0x80000000
1451
1452! l0  hx0
1453! l1  hx1
1454! l2  hx2
1455! l3  __vlibm_TBL_sincos_hi
1456! l4  __vlibm_TBL_sincos_lo
1457! l5  constants
1458! l6  biguns stored here : still called LIM_l6
1459! l7  0x413921fb
1460
1461! the following are 64-bit registers in both V8+ and V9
1462
1463! g1  scratch
1464! g5
1465
1466! o0  py0
1467! o1  py1
1468! o2  py2
1469! o3  n0
1470! o4  n1
1471! o5  n2
1472! o7  scratch
1473
1474! f0  x0
1475! f2  n0,y0
1476! f4
1477! f6
1478! f8  scratch for table base
1479! f9  signbit0
1480! f10 x1
1481! f12 n1,y1
1482! f14
1483! f16
1484! f18 scratch for table base
1485! f19 signbit1
1486! f20 x2
1487! f22 n2,y2
1488! f24
1489! f26
1490! f28 scratch for table base
1491! f29 signbit2
1492! f30 0x80000000
1493! f31 0x4000
1494! f32
1495! f34
1496! f36
1497! f38
1498! f40 invpio2
1499! f42 round
1500! f44 0xffff800000000000
1501! f46 pio2_1
1502! f48 pio2_2
1503! f50 pio2_3
1504! f52 pio2_3t
1505! f54 one
1506! f56 pp1
1507! f58 pp2
1508! f60 qq1
1509! f62 qq2
1510
1511
1512	PIC_SET(g5,constants,l5)
1513
1514	! %o3,%o4,%o5 need to be stored
! NOTE(review): presumably %f6, %f16 and %f26 are results still pending
! from the primary loop and %o3-%o5 are their destinations at this point
! (they become n0-n2 below); confirm against the code before .range0.
1515	st      P0_f6,[%o3]
1516	sethi	%hi(0x413921fb),%l7
1517	st      P0_f7,[%o3+4]
1518	or	%l7,%lo(0x413921fb),%l7
1519	st      P1_f16,[%o4]
1520	st      P1_f17,[%o4+4]
1521	st      P2_f26,[%o5]
1522	st      P2_f27,[%o5+4]
1523	ldd	[%l5+invpio2],%f40
1524	ldd	[%l5+round],%f42
1525	ldd	[%l5+pio2_1],%f46
1526	ldd	[%l5+pio2_2],%f48
1527	ldd	[%l5+pio2_3],%f50
1528	ldd	[%l5+pio2_3t],%f52
! Each x?_1 slot is a pair whose second element is %f54 (one), and each
! y?_0 slot is a pair whose second element is zero.  The .CASE* and
! .ENDLOOP* polynomial paths index these pairs with an offset of 0 or 8.
1529	std	%f54,[%fp+x0_1+8]	! set up stack data
1530	std	%f54,[%fp+x1_1+8]
1531	std	%f54,[%fp+x2_1+8]
1532	stx	%g0,[%fp+y0_0+8]
1533	stx	%g0,[%fp+y1_0+8]
1534	stx	%g0,[%fp+y2_0+8]
1535
1536!	branched here in the middle of the array.  Need to adjust
1537!	for the members of the triple that were selected in the primary
1538!	loop.
1539
1540!	no adjustment since all three selected here
1541	subcc	LIM_l6,0x1,%g0		! continue in LOOP0?
1542	bz,a	%icc,.LOOP0
1543	mov	0x0,LIM_l6		! delay slot set biguns=0
1544
1545!	adjust 1st triple since 2d and 3d done here
! (the fmuld below is the one normally issued at the end of .LOOP0)
1546	subcc	LIM_l6,0x2,%g0		! continue in LOOP1?
1547	fmuld	%f0,%f40,%f2		! adj LOOP0
1548	bz,a	%icc,.LOOP1
1549	mov	0x0,LIM_l6		! delay slot set biguns=0
1550
1551!	adjust 1st and 2d triple since 3d done here
! (the fmuld/faddd below are the ones normally issued at the end of .LOOP1)
1552	subcc	LIM_l6,0x3,%g0		! continue in LOOP2?
1553	!done fmuld	%f0,%f40,%f2		! adj LOOP0
1554	sub	%i3,%i4,%i3		! adjust to not double increment
1555	fmuld	%f10,%f40,%f12		! adj LOOP1
1556	faddd	%f2,%f42,%f2		! adj LOOP1
1557	bz,a	%icc,.LOOP2
1558	mov	0x0,LIM_l6		! delay slot set biguns=0
1559
! LIM_l6 was none of 1, 2, 3: continue at .LOOP0 with LIM_l6 unchanged
1560	ba	.LOOP0
1561	nop
1562
1563! -- 16 byte aligned
1564
1565	.align	32
! .LOOP0/.LOOP1/.LOOP2: medium range main loop, three elements (lanes)
! per trip.  Each entry point records the output pointer for its lane,
! checks hx against %l7 (larger values go to .BIGn), and preloads the
! argument for the following lane.  After .LOOP2 the three lanes are
! processed together:
!   1. n = x*invpio2 rounded to an integer by adding and subtracting
!      round; the low word holding n is saved in n0/n1/n2 on the stack.
!   2. x is reduced by n*pio2_1, n*pio2_2, n*pio2_3 and n*pio2_3t with
!      the rounding error of each step carried along, giving a head
!      (marked x below) and a tail (marked y below).
!   3. (n+1)&1, scaled to 0 or 8, is kept in %o3/%o4/%o5 and selects the
!      thresh entry, the sign mask and later the table/coefficient set.
!   4. If the thresh word exceeds the high word of |x| (fcmpgt32 result
!      bit 1, presumably the upper-word compare) the lane takes a
!      polynomial path in one of .CASE1-.CASE7; otherwise it takes the
!      table path that follows.  All paths end at .FIXSIGN.
1566.LOOP0:
1567	lda	[%i1]%asi,%l1		! preload next argument
1568	mov	%i3,%o0			! py0 = y
1569
1570	lda	[%i1]%asi,%f10
1571	cmp	%l0,%l7
1572	add	%i3,%i4,%i3		! y += stridey
1573	bg,pn	%icc,.BIG0		! if hx > 0x413921fb
1574
1575! delay slot
1576	lda	[%i1+4]%asi,%f11
1577	addcc	%i0,-1,%i0
1578	add	%i1,%i2,%i1		! x += stridex
1579	ble,pn	%icc,.ENDLOOP1
1580
1581! delay slot
1582	andn	%l1,%i5,%l1
1583	nop
1584	fmuld	%f0,%f40,%f2
1585	fabsd	%f54,%f54		! a nop for alignment only
1586
1587.LOOP1:
1588	lda	[%i1]%asi,%l2		! preload next argument
1589	mov	%i3,%o1			! py1 = y
1590
1591	lda	[%i1]%asi,%f20
1592	cmp	%l1,%l7
1593	add	%i3,%i4,%i3		! y += stridey
1594	bg,pn	%icc,.BIG1		! if hx > 0x413921fb
1595
1596! delay slot
1597	lda	[%i1+4]%asi,%f21
1598	addcc	%i0,-1,%i0
1599	add	%i1,%i2,%i1		! x += stridex
1600	ble,pn	%icc,.ENDLOOP2
1601
1602! delay slot
1603	andn	%l2,%i5,%l2
1604	nop
1605	fmuld	%f10,%f40,%f12
1606	faddd	%f2,%f42,%f2
1607
1608.LOOP2:
1609	st	%f3,[%fp+n0]
1610	mov	%i3,%o2			! py2 = y
1611
1612	cmp	%l2,%l7
1613	add	%i3,%i4,%i3		! y += stridey
1614	fmuld	%f20,%f40,%f22
1615	bg,pn	%icc,.BIG2		! if hx > 0x413921fb
1616
1617! delay slot
1618	add	%l5,thresh+4,%o7
1619	faddd	%f12,%f42,%f12
1620	st	%f13,[%fp+n1]
1621
1622! -
1623
1624	add	%l5,thresh,%g1
1625	faddd	%f22,%f42,%f22
1626	st	%f23,[%fp+n2]
1627
! subtracting round again leaves n as a double in %f2/%f12/%f22
1628	fsubd	%f2,%f42,%f2		! n
1629
1630	fsubd	%f12,%f42,%f12		! n
1631
1632	fsubd	%f22,%f42,%f22		! n
1633
! reduction: subtract n*pio2_1, then n*pio2_2
1634	fmuld	%f2,%f46,%f4
1635
1636	fmuld	%f12,%f46,%f14
1637
1638	fmuld	%f22,%f46,%f24
1639
1640	fsubd	%f0,%f4,%f4
1641	fmuld	%f2,%f48,%f6
1642
1643	fsubd	%f10,%f14,%f14
1644	fmuld	%f12,%f48,%f16
1645
1646	fsubd	%f20,%f24,%f24
1647	fmuld	%f22,%f48,%f26
1648
! reload n and form n+1 (this is vcos, so the quadrant is offset by one)
1649	fsubd	%f4,%f6,%f0
1650	ld	[%fp+n0],%o3 ; add	%o3,1,%o3
1651
1652	fsubd	%f14,%f16,%f10
1653	ld	[%fp+n1],%o4 ; add	%o4,1,%o4
1654
1655	fsubd	%f24,%f26,%f20
1656	ld	[%fp+n2],%o5 ; add	%o5,1,%o5
1657
1658	fsubd	%f4,%f0,%f32
1659	and	%o3,1,%o3
1660
1661	fsubd	%f14,%f10,%f34
1662	and	%o4,1,%o4
1663
1664	fsubd	%f24,%f20,%f36
1665	and	%o5,1,%o5
1666
! ((n+1)&1)<<3 gives a byte offset of 0 or 8; n*pio2_3 is folded in next
1667	fsubd	%f32,%f6,%f32
1668	fmuld	%f2,%f50,%f8
1669	sll	%o3,3,%o3
1670
1671	fsubd	%f34,%f16,%f34
1672	fmuld	%f12,%f50,%f18
1673	sll	%o4,3,%o4
1674
1675	fsubd	%f36,%f26,%f36
1676	fmuld	%f22,%f50,%f28
1677	sll	%o5,3,%o5
1678
! %g1 = constants+thresh: fetch the threshold word selected by the offset
1679	fsubd	%f8,%f32,%f8
1680	ld	[%g1+%o3],%f6
1681
1682	fsubd	%f18,%f34,%f18
1683	ld	[%g1+%o4],%f16
1684
1685	fsubd	%f28,%f36,%f28
1686	ld	[%g1+%o5],%f26
1687
1688	fsubd	%f0,%f8,%f4
1689
1690	fsubd	%f10,%f18,%f14
1691
1692	fsubd	%f20,%f28,%f24
1693
1694	fsubd	%f0,%f4,%f32
1695
1696	fsubd	%f10,%f14,%f34
1697
1698	fsubd	%f20,%f24,%f36
1699
! last reduction term: n*pio2_3t
1700	fsubd	%f32,%f8,%f32
1701	fmuld	%f2,%f52,%f2
1702
1703	fsubd	%f34,%f18,%f34
1704	fmuld	%f12,%f52,%f12
1705
1706	fsubd	%f36,%f28,%f36
1707	fmuld	%f22,%f52,%f22
1708
! %o7 = constants+thresh+4: fetch the sign mask word selected by the offset
1709	fsubd	%f2,%f32,%f2
1710	ld	[%o7+%o3],%f8
1711
1712	fsubd	%f12,%f34,%f12
1713	ld	[%o7+%o4],%f18
1714
1715	fsubd	%f22,%f36,%f22
1716	ld	[%o7+%o5],%f28
1717
1718	fsubd	%f4,%f2,%f0		! x
1719
1720	fsubd	%f14,%f12,%f10		! x
1721
1722	fsubd	%f24,%f22,%f20		! x
1723
1724	fsubd	%f4,%f0,%f4
1725
1726	fsubd	%f14,%f10,%f14
1727
1728	fsubd	%f24,%f20,%f24
1729
1730	fands	%f0,%f30,%f9		! save signbit
1731
1732	fands	%f10,%f30,%f19		! save signbit
1733
1734	fands	%f20,%f30,%f29		! save signbit
1735
! |x| is also written to the stack so its high word can be read back
! into an integer register below
1736	fabsd	%f0,%f0
1737	std	%f0,[%fp+x0_1]
1738
1739	fabsd	%f10,%f10
1740	std	%f10,[%fp+x1_1]
1741
1742	fabsd	%f20,%f20
1743	std	%f20,[%fp+x2_1]
1744
1745	fsubd	%f4,%f2,%f2		! y
1746
1747	fsubd	%f14,%f12,%f12		! y
1748
1749	fsubd	%f24,%f22,%f22		! y
1750
1751	fcmpgt32 %f6,%f0,%l0
1752
1753	fcmpgt32 %f16,%f10,%l1
1754
1755	fcmpgt32 %f26,%f20,%l2
1756
1757! -- 16 byte aligned
! flip the sign of the tail y when x was negative, so it matches |x|
1758	fxors	%f2,%f9,%f2
1759
1760	fxors	%f12,%f19,%f12
1761
1762	fxors	%f22,%f29,%f22
1763
! Dispatch on which lanes are below thresh.  The fands in each delay slot
! runs whether or not the branch is taken.
1764	fands	%f9,%f8,%f9		! if (n & 1) clear sign bit
1765	andcc	%l0,2,%g0
1766	bne,pn	%icc,.CASE4
1767
1768! delay slot
1769	fands	%f19,%f18,%f19		! if (n & 1) clear sign bit
1770	andcc	%l1,2,%g0
1771	bne,pn	%icc,.CASE2
1772
1773! delay slot
1774	fands	%f29,%f28,%f29		! if (n & 1) clear sign bit
1775	andcc	%l2,2,%g0
1776	bne,pn	%icc,.CASE1
1777
! Table path for all three lanes.  The high word of |x| plus 0x4000,
! masked with %f44, gives the value subtracted from |x|; the table byte
! offset is ((hx - 0x3fc3c000) >> 10) & ~0x1f plus the 0/8 offset.
1778! delay slot
1779	fpadd32s %f0,%f31,%f8
1780	sethi	%hi(0x3fc3c000),%o7
1781	ld	[%fp+x0_1],%l0
1782
1783	fpadd32s %f10,%f31,%f18
1784	add	%l3,8,%g1
1785	ld	[%fp+x1_1],%l1
1786
1787	fpadd32s %f20,%f31,%f28
1788	ld	[%fp+x2_1],%l2
1789
1790	fand	%f8,%f44,%f4
1791	sub	%l0,%o7,%l0
1792
1793	fand	%f18,%f44,%f14
1794	sub	%l1,%o7,%l1
1795
1796	fand	%f28,%f44,%f24
1797	sub	%l2,%o7,%l2
1798
1799	fsubd	%f0,%f4,%f0
1800	srl	%l0,10,%l0
1801
1802	fsubd	%f10,%f14,%f10
1803	srl	%l1,10,%l1
1804
1805	fsubd	%f20,%f24,%f20
1806	srl	%l2,10,%l2
1807
1808	faddd	%f0,%f2,%f0
1809	andn	%l0,0x1f,%l0
1810
1811	faddd	%f10,%f12,%f10
1812	andn	%l1,0x1f,%l1
1813
1814	faddd	%f20,%f22,%f20
1815	andn	%l2,0x1f,%l2
1816
! z = (residual)^2 in %f2/%f12/%f22; two short polynomials in z follow,
! one using pp1/pp2 (%f56/%f58) and one using qq1/qq2 (%f60/%f62)
1817	fmuld	%f0,%f0,%f2
1818	add	%l0,%o3,%l0
1819
1820	fmuld	%f10,%f10,%f12
1821	add	%l1,%o4,%l1
1822
1823	fmuld	%f20,%f20,%f22
1824	add	%l2,%o5,%l2
1825
1826	fmuld	%f2,%f58,%f6
1827	ldd	[%l3+%l0],%f32
1828
1829	fmuld	%f12,%f58,%f16
1830	ldd	[%l3+%l1],%f34
1831
1832	fmuld	%f22,%f58,%f26
1833	ldd	[%l3+%l2],%f36
1834
1835	faddd	%f6,%f56,%f6
1836	fmuld	%f2,%f62,%f4
1837
1838	faddd	%f16,%f56,%f16
1839	fmuld	%f12,%f62,%f14
1840
1841	faddd	%f26,%f56,%f26
1842	fmuld	%f22,%f62,%f24
1843
1844	fmuld	%f2,%f6,%f6
1845	faddd	%f4,%f60,%f4
1846
1847	fmuld	%f12,%f16,%f16
1848	faddd	%f14,%f60,%f14
1849
1850	fmuld	%f22,%f26,%f26
1851	faddd	%f24,%f60,%f24
1852
1853	faddd	%f6,%f54,%f6
1854	fmuld	%f2,%f4,%f4
1855
1856	faddd	%f16,%f54,%f16
1857	fmuld	%f12,%f14,%f14
1858
1859	faddd	%f26,%f54,%f26
1860	fmuld	%f22,%f24,%f24
1861
! combine with the table entries: hi table at %l3 and %l3+8, lo table at %l4
1862	fmuld	%f0,%f6,%f6
1863	ldd	[%g1+%l0],%f2
1864
1865	fmuld	%f10,%f16,%f16
1866	ldd	[%g1+%l1],%f12
1867
1868	fmuld	%f20,%f26,%f26
1869	ldd	[%g1+%l2],%f22
1870
1871	fmuld	%f4,%f32,%f4
1872	ldd	[%l4+%l0],%f0
1873
1874	fmuld	%f14,%f34,%f14
1875	ldd	[%l4+%l1],%f10
1876
1877	fmuld	%f24,%f36,%f24
1878	ldd	[%l4+%l2],%f20
1879
1880	fmuld	%f6,%f2,%f6
1881
1882	fmuld	%f16,%f12,%f16
1883
1884	fmuld	%f26,%f22,%f26
1885
1886	faddd	%f6,%f4,%f6
1887
1888	faddd	%f16,%f14,%f16
1889
1890	faddd	%f26,%f24,%f26
1891
1892	faddd	%f6,%f0,%f6
1893
1894	faddd	%f16,%f10,%f16
1895
1896	faddd	%f26,%f20,%f26
1897
! unsigned results are left in %f6, %f16, %f26; fall through to .FIXSIGN
1898	faddd	%f6,%f32,%f6
1899
1900	faddd	%f16,%f34,%f16
1901
1902	faddd	%f26,%f36,%f26
1903
1904.FIXSIGN:
! Common tail of the three-lane loop.  On entry the unsigned results are
! in %f6, %f16, %f26 and the masked sign words in %f9, %f19, %f29.
! For each lane ((n+1)&2)<<2 is 0 or 8 and indexes the words at
! constants+thresh-4; the word fetched is xored into the saved sign and
! the sign is then ored into the result.  The results are stored through
! %o0, %o1, %o2.  The block also preloads the next lane 0 argument,
! advances x, counts the element off, and either loops to .LOOP0 or
! leaves through .ENDLOOP0.
1905	ld	[%fp+n0],%o3 ; add	%o3,1,%o3
1906	add	%l5,thresh-4,%g1
1907
1908	ld	[%fp+n1],%o4 ; add	%o4,1,%o4
1909
1910	ld	[%fp+n2],%o5 ; add	%o5,1,%o5
1911	and	%o3,2,%o3
1912
1913	sll	%o3,2,%o3
1914	and	%o4,2,%o4
1915	lda	[%i1]%asi,%l0		! preload next argument
1916
1917	sll	%o4,2,%o4
1918	and	%o5,2,%o5
1919	ld	[%g1+%o3],%f8
1920
1921	sll	%o5,2,%o5
1922	ld	[%g1+%o4],%f18
1923
1924	ld	[%g1+%o5],%f28
1925	fxors	%f9,%f8,%f9
1926
1927	lda	[%i1]%asi,%f0
1928	fxors	%f29,%f28,%f29
1929
1930	lda	[%i1+4]%asi,%f1
1931	fxors	%f19,%f18,%f19
1932
1933	fors	%f6,%f9,%f6		! tack on sign
1934	add	%i1,%i2,%i1		! x += stridex
1935	st	%f6,[%o0]
1936
1937	fors	%f26,%f29,%f26		! tack on sign
1938	st	%f7,[%o0+4]
1939
1940	fors	%f16,%f19,%f16		! tack on sign
1941	st	%f26,[%o2]
1942
1943	st	%f27,[%o2+4]
1944	addcc	%i0,-1,%i0
1945
1946	st	%f16,[%o1]
1947	andn	%l0,%i5,%l0		! hx &= ~0x80000000
1948	bg,pt	%icc,.LOOP0
1949
! the low word of the lane 1 result is stored on both paths
1950! delay slot
1951	st	%f17,[%o1+4]
1952
1953	ba,pt	%icc,.ENDLOOP0
1954! delay slot
1955	nop
1956
1957	.align	32
! .CASE1: lane 2 is below thresh; lanes 0 and 1 take the table path.
! %f8 for lane 0 was already formed in the delay slot of the branch here.
! Lane 2 evaluates a four-coefficient polynomial in x2*x2 whose
! coefficients are read from constants+%o5 at a stride of 0x10, then
! combines it with the x2_1 and y2_0 stack pairs indexed by the same
! offset (0 selects the stored |x| and y, 8 selects one and zero).
! Results end up in %f6, %f16, %f26 as .FIXSIGN expects.
1958.CASE1:
1959	fpadd32s %f10,%f31,%f18
1960	sethi	%hi(0x3fc3c000),%o7
1961	ld	[%fp+x0_1],%l0
1962
1963	fand	%f8,%f44,%f4
1964	add	%l3,8,%g1
1965	ld	[%fp+x1_1],%l1
1966
1967	fand	%f18,%f44,%f14
1968	sub	%l0,%o7,%l0
1969
1970	fsubd	%f0,%f4,%f0
1971	srl	%l0,10,%l0
1972	sub	%l1,%o7,%l1
1973
1974	fsubd	%f10,%f14,%f10
1975	srl	%l1,10,%l1
1976
1977	fmuld	%f20,%f20,%f20
1978	ldd	[%l5+%o5],%f36
1979	add	%l5,%o5,%l2
1980
1981	faddd	%f0,%f2,%f0
1982	andn	%l0,0x1f,%l0
1983
1984	faddd	%f10,%f12,%f10
1985	andn	%l1,0x1f,%l1
1986
! %o5 becomes a stack address here; .FIXSIGN reloads it from n2
1987	fmuld	%f20,%f36,%f24
1988	ldd	[%l2+0x10],%f26
1989	add	%fp,%o5,%o5
1990
1991	fmuld	%f0,%f0,%f2
1992	add	%l0,%o3,%l0
1993
1994	fmuld	%f10,%f10,%f12
1995	add	%l1,%o4,%l1
1996
1997	faddd	%f24,%f26,%f24
1998	ldd	[%l2+0x20],%f36
1999
2000	fmuld	%f2,%f58,%f6
2001	ldd	[%l3+%l0],%f32
2002
2003	fmuld	%f12,%f58,%f16
2004	ldd	[%l3+%l1],%f34
2005
2006	fmuld	%f20,%f24,%f24
2007	ldd	[%l2+0x30],%f26
2008
2009	faddd	%f6,%f56,%f6
2010	fmuld	%f2,%f62,%f4
2011
2012	faddd	%f16,%f56,%f16
2013	fmuld	%f12,%f62,%f14
2014
2015	faddd	%f24,%f36,%f24
2016	ldd	[%o5+x2_1],%f36
2017
2018	fmuld	%f2,%f6,%f6
2019	faddd	%f4,%f60,%f4
2020
2021	fmuld	%f12,%f16,%f16
2022	faddd	%f14,%f60,%f14
2023
2024	fmuld	%f20,%f24,%f24
2025
2026	faddd	%f6,%f54,%f6
2027	fmuld	%f2,%f4,%f4
2028	ldd	[%g1+%l0],%f2
2029
2030	faddd	%f16,%f54,%f16
2031	fmuld	%f12,%f14,%f14
2032	ldd	[%g1+%l1],%f12
2033
2034	faddd	%f24,%f26,%f24
2035
2036	fmuld	%f0,%f6,%f6
2037	ldd	[%l4+%l0],%f0
2038
2039	fmuld	%f10,%f16,%f16
2040	ldd	[%l4+%l1],%f10
2041
! the tail y2 goes to the stack so it can be read back through the pair
2042	fmuld	%f4,%f32,%f4
2043	std	%f22,[%fp+y2_0]
2044
2045	fmuld	%f14,%f34,%f14
2046
2047	fmuld	%f6,%f2,%f6
2048
2049	fmuld	%f16,%f12,%f16
2050
2051	fmuld	%f20,%f24,%f24
2052
2053	faddd	%f6,%f4,%f6
2054
2055	faddd	%f16,%f14,%f16
2056
2057	fmuld	%f36,%f24,%f24
2058	ldd	[%o5+y2_0],%f22
2059
2060	faddd	%f6,%f0,%f6
2061
2062	faddd	%f16,%f10,%f16
2063
2064	faddd	%f24,%f22,%f24
2065
2066	faddd	%f6,%f32,%f6
2067
2068	faddd	%f16,%f34,%f16
2069	ba,pt	%icc,.FIXSIGN
2070
2071! delay slot
2072	faddd	%f36,%f24,%f26
2073
2074	.align	32
! .CASE2: lane 1 is below thresh and lane 0 is not.  Lane 2 is tested
! first; if it is also below thresh control moves to .CASE3, with the
! sethi in the delay slot executed either way.  Otherwise lanes 0 and 2
! take the table path and lane 1 evaluates the polynomial in x1*x1 with
! coefficients at constants+%o4 (stride 0x10), combined with the x1_1
! and y1_0 stack pairs.  Results end up in %f6, %f16, %f26.
2075.CASE2:
2076	fpadd32s %f0,%f31,%f8
2077	ld	[%fp+x0_1],%l0
2078	andcc	%l2,2,%g0
2079	bne,pn	%icc,.CASE3
2080
2081! delay slot
2082	sethi	%hi(0x3fc3c000),%o7
2083	fpadd32s %f20,%f31,%f28
2084	ld	[%fp+x2_1],%l2
2085
2086	fand	%f8,%f44,%f4
2087	sub	%l0,%o7,%l0
2088	add	%l3,8,%g1
2089
2090	fand	%f28,%f44,%f24
2091	sub	%l2,%o7,%l2
2092
2093	fsubd	%f0,%f4,%f0
2094	srl	%l0,10,%l0
2095
2096	fsubd	%f20,%f24,%f20
2097	srl	%l2,10,%l2
2098
2099	fmuld	%f10,%f10,%f10
2100	ldd	[%l5+%o4],%f34
2101	add	%l5,%o4,%l1
2102
2103	faddd	%f0,%f2,%f0
2104	andn	%l0,0x1f,%l0
2105
2106	faddd	%f20,%f22,%f20
2107	andn	%l2,0x1f,%l2
2108
! %o4 becomes a stack address here; .FIXSIGN reloads it from n1
2109	fmuld	%f10,%f34,%f14
2110	ldd	[%l1+0x10],%f16
2111	add	%fp,%o4,%o4
2112
2113	fmuld	%f0,%f0,%f2
2114	add	%l0,%o3,%l0
2115
2116	fmuld	%f20,%f20,%f22
2117	add	%l2,%o5,%l2
2118
2119	faddd	%f14,%f16,%f14
2120	ldd	[%l1+0x20],%f34
2121
2122	fmuld	%f2,%f58,%f6
2123	ldd	[%l3+%l0],%f32
2124
2125	fmuld	%f22,%f58,%f26
2126	ldd	[%l3+%l2],%f36
2127
2128	fmuld	%f10,%f14,%f14
2129	ldd	[%l1+0x30],%f16
2130
2131	faddd	%f6,%f56,%f6
2132	fmuld	%f2,%f62,%f4
2133
2134	faddd	%f26,%f56,%f26
2135	fmuld	%f22,%f62,%f24
2136
2137	faddd	%f14,%f34,%f14
2138	ldd	[%o4+x1_1],%f34
2139
2140	fmuld	%f2,%f6,%f6
2141	faddd	%f4,%f60,%f4
2142
2143	fmuld	%f22,%f26,%f26
2144	faddd	%f24,%f60,%f24
2145
2146	fmuld	%f10,%f14,%f14
2147
2148	faddd	%f6,%f54,%f6
2149	fmuld	%f2,%f4,%f4
2150	ldd	[%g1+%l0],%f2
2151
2152	faddd	%f26,%f54,%f26
2153	fmuld	%f22,%f24,%f24
2154	ldd	[%g1+%l2],%f22
2155
2156	faddd	%f14,%f16,%f14
2157
2158	fmuld	%f0,%f6,%f6
2159	ldd	[%l4+%l0],%f0
2160
2161	fmuld	%f20,%f26,%f26
2162	ldd	[%l4+%l2],%f20
2163
! the tail y1 goes to the stack so it can be read back through the pair
2164	fmuld	%f4,%f32,%f4
2165	std	%f12,[%fp+y1_0]
2166
2167	fmuld	%f24,%f36,%f24
2168
2169	fmuld	%f6,%f2,%f6
2170
2171	fmuld	%f26,%f22,%f26
2172
2173	fmuld	%f10,%f14,%f14
2174
2175	faddd	%f6,%f4,%f6
2176
2177	faddd	%f26,%f24,%f26
2178
2179	fmuld	%f34,%f14,%f14
2180	ldd	[%o4+y1_0],%f12
2181
2182	faddd	%f6,%f0,%f6
2183
2184	faddd	%f26,%f20,%f26
2185
2186	faddd	%f14,%f12,%f14
2187
2188	faddd	%f6,%f32,%f6
2189
2190	faddd	%f26,%f36,%f26
2191	ba,pt	%icc,.FIXSIGN
2192
2193! delay slot
2194	faddd	%f34,%f14,%f16
2195
2196	.align	32
! .CASE3: lanes 1 and 2 are below thresh; lane 0 takes the table path.
! Entered from .CASE2, which has already formed %f8, loaded %l0 from
! x0_1 and set %o7 = 0x3fc3c000.  Lanes 1 and 2 evaluate polynomials in
! x*x with coefficients at constants+%o4 and constants+%o5 (stride 0x10)
! and combine them with their x?_1 and y?_0 stack pairs.
! Results end up in %f6, %f16, %f26.
2197.CASE3:
2198	fand	%f8,%f44,%f4
2199	add	%l3,8,%g1
2200	sub	%l0,%o7,%l0
2201
2202	fmuld	%f10,%f10,%f10
2203	ldd	[%l5+%o4],%f34
2204	add	%l5,%o4,%l1
2205
2206	fsubd	%f0,%f4,%f0
2207	srl	%l0,10,%l0
2208
2209	fmuld	%f20,%f20,%f20
2210	ldd	[%l5+%o5],%f36
2211	add	%l5,%o5,%l2
2212
2213	fmuld	%f10,%f34,%f14
2214	ldd	[%l1+0x10],%f16
2215	add	%fp,%o4,%o4
2216
2217	faddd	%f0,%f2,%f0
2218	andn	%l0,0x1f,%l0
2219
2220	fmuld	%f20,%f36,%f24
2221	ldd	[%l2+0x10],%f26
2222	add	%fp,%o5,%o5
2223
2224	faddd	%f14,%f16,%f14
2225	ldd	[%l1+0x20],%f34
2226
2227	fmuld	%f0,%f0,%f2
2228	add	%l0,%o3,%l0
2229
2230	faddd	%f24,%f26,%f24
2231	ldd	[%l2+0x20],%f36
2232
2233	fmuld	%f10,%f14,%f14
2234	ldd	[%l1+0x30],%f16
2235
2236	fmuld	%f2,%f58,%f6
2237	ldd	[%l3+%l0],%f32
2238
2239	fmuld	%f20,%f24,%f24
2240	ldd	[%l2+0x30],%f26
2241
2242	faddd	%f14,%f34,%f14
2243	ldd	[%o4+x1_1],%f34
2244
2245	faddd	%f6,%f56,%f6
2246	fmuld	%f2,%f62,%f4
2247
2248	faddd	%f24,%f36,%f24
2249	ldd	[%o5+x2_1],%f36
2250
! the tails y1 and y2 go to the stack to be read back through the pairs
2251	fmuld	%f10,%f14,%f14
2252	std	%f12,[%fp+y1_0]
2253
2254	fmuld	%f2,%f6,%f6
2255	faddd	%f4,%f60,%f4
2256
2257	fmuld	%f20,%f24,%f24
2258	std	%f22,[%fp+y2_0]
2259
2260	faddd	%f14,%f16,%f14
2261
2262	faddd	%f6,%f54,%f6
2263	fmuld	%f2,%f4,%f4
2264	ldd	[%g1+%l0],%f2
2265
2266	faddd	%f24,%f26,%f24
2267
2268	fmuld	%f10,%f14,%f14
2269
2270	fmuld	%f0,%f6,%f6
2271	ldd	[%l4+%l0],%f0
2272
2273	fmuld	%f4,%f32,%f4
2274
2275	fmuld	%f20,%f24,%f24
2276
2277	fmuld	%f6,%f2,%f6
2278
2279	fmuld	%f34,%f14,%f14
2280	ldd	[%o4+y1_0],%f12
2281
2282	fmuld	%f36,%f24,%f24
2283	ldd	[%o5+y2_0],%f22
2284
2285	faddd	%f6,%f4,%f6
2286
2287	faddd	%f14,%f12,%f14
2288
2289	faddd	%f24,%f22,%f24
2290
2291	faddd	%f6,%f0,%f6
2292
2293	faddd	%f34,%f14,%f16
2294
2295	faddd	%f36,%f24,%f26
2296	ba,pt	%icc,.FIXSIGN
2297
2298! delay slot
2299	faddd	%f6,%f32,%f6
2300
2301	.align	32
! .CASE4: lane 0 is below thresh.  The sign mask for lane 2 is applied
! here because the dispatch code did not reach it.  Lane 1 is tested
! next (.CASE6 if below thresh) and then lane 2 (.CASE5 if below thresh);
! the andcc for lane 2 sits in the delay slot of the first branch, so
! .CASE6 sees its condition codes.  If only lane 0 is below thresh,
! lanes 1 and 2 take the table path and lane 0 evaluates the polynomial
! in x0*x0 with coefficients at constants+%o3 (stride 0x10), combined
! with the x0_1 and y0_0 stack pairs.  Results end up in %f6, %f16, %f26.
2302.CASE4:
2303	fands	%f29,%f28,%f29		! if (n & 1) clear sign bit
2304	sethi	%hi(0x3fc3c000),%o7
2305	andcc	%l1,2,%g0
2306	bne,pn	%icc,.CASE6
2307
2308! delay slot
2309	andcc	%l2,2,%g0
2310	fpadd32s %f10,%f31,%f18
2311	ld	[%fp+x1_1],%l1
2312	bne,pn	%icc,.CASE5
2313
2314! delay slot
2315	add	%l3,8,%g1
2316	ld	[%fp+x2_1],%l2
2317	fpadd32s %f20,%f31,%f28
2318
2319	fand	%f18,%f44,%f14
2320	sub	%l1,%o7,%l1
2321
2322	fand	%f28,%f44,%f24
2323	sub	%l2,%o7,%l2
2324
2325	fsubd	%f10,%f14,%f10
2326	srl	%l1,10,%l1
2327
2328	fsubd	%f20,%f24,%f20
2329	srl	%l2,10,%l2
2330
2331	fmuld	%f0,%f0,%f0
2332	ldd	[%l5+%o3],%f32
2333	add	%l5,%o3,%l0
2334
2335	faddd	%f10,%f12,%f10
2336	andn	%l1,0x1f,%l1
2337
2338	faddd	%f20,%f22,%f20
2339	andn	%l2,0x1f,%l2
2340
! %o3 becomes a stack address here; .FIXSIGN reloads it from n0
2341	fmuld	%f0,%f32,%f4
2342	ldd	[%l0+0x10],%f6
2343	add	%fp,%o3,%o3
2344
2345	fmuld	%f10,%f10,%f12
2346	add	%l1,%o4,%l1
2347
2348	fmuld	%f20,%f20,%f22
2349	add	%l2,%o5,%l2
2350
2351	faddd	%f4,%f6,%f4
2352	ldd	[%l0+0x20],%f32
2353
2354	fmuld	%f12,%f58,%f16
2355	ldd	[%l3+%l1],%f34
2356
2357	fmuld	%f22,%f58,%f26
2358	ldd	[%l3+%l2],%f36
2359
2360	fmuld	%f0,%f4,%f4
2361	ldd	[%l0+0x30],%f6
2362
2363	faddd	%f16,%f56,%f16
2364	fmuld	%f12,%f62,%f14
2365
2366	faddd	%f26,%f56,%f26
2367	fmuld	%f22,%f62,%f24
2368
2369	faddd	%f4,%f32,%f4
2370	ldd	[%o3+x0_1],%f32
2371
2372	fmuld	%f12,%f16,%f16
2373	faddd	%f14,%f60,%f14
2374
2375	fmuld	%f22,%f26,%f26
2376	faddd	%f24,%f60,%f24
2377
2378	fmuld	%f0,%f4,%f4
2379
2380	faddd	%f16,%f54,%f16
2381	fmuld	%f12,%f14,%f14
2382	ldd	[%g1+%l1],%f12
2383
2384	faddd	%f26,%f54,%f26
2385	fmuld	%f22,%f24,%f24
2386	ldd	[%g1+%l2],%f22
2387
2388	faddd	%f4,%f6,%f4
2389
2390	fmuld	%f10,%f16,%f16
2391	ldd	[%l4+%l1],%f10
2392
2393	fmuld	%f20,%f26,%f26
2394	ldd	[%l4+%l2],%f20
2395
! the tail y0 goes to the stack so it can be read back through the pair
2396	fmuld	%f14,%f34,%f14
2397	std	%f2,[%fp+y0_0]
2398
2399	fmuld	%f24,%f36,%f24
2400
2401	fmuld	%f0,%f4,%f4
2402
2403	fmuld	%f16,%f12,%f16
2404
2405	fmuld	%f26,%f22,%f26
2406
2407	fmuld	%f32,%f4,%f4
2408	ldd	[%o3+y0_0],%f2
2409
2410	faddd	%f16,%f14,%f16
2411
2412	faddd	%f26,%f24,%f26
2413
2414	faddd	%f4,%f2,%f4
2415
2416	faddd	%f16,%f10,%f16
2417
2418	faddd	%f26,%f20,%f26
2419
2420	faddd	%f32,%f4,%f6
2421
2422	faddd	%f16,%f34,%f16
2423	ba,pt	%icc,.FIXSIGN
2424
2425! delay slot
2426	faddd	%f26,%f36,%f26
2427
2428	.align	32
! .CASE5: lanes 0 and 2 are below thresh; lane 1 takes the table path.
! Entered from .CASE4, which has already formed %f18, loaded %l1 from
! x1_1, set %o7 = 0x3fc3c000 and (in the branch delay slot) %g1 = %l3+8.
! Lanes 0 and 2 evaluate polynomials in x*x with coefficients at
! constants+%o3 and constants+%o5 (stride 0x10) and combine them with
! their x?_1 and y?_0 stack pairs.  Results end up in %f6, %f16, %f26.
2429.CASE5:
2430	fand	%f18,%f44,%f14
2431	sub	%l1,%o7,%l1
2432
2433	fmuld	%f0,%f0,%f0
2434	ldd	[%l5+%o3],%f32
2435	add	%l5,%o3,%l0
2436
2437	fsubd	%f10,%f14,%f10
2438	srl	%l1,10,%l1
2439
2440	fmuld	%f20,%f20,%f20
2441	ldd	[%l5+%o5],%f36
2442	add	%l5,%o5,%l2
2443
2444	fmuld	%f0,%f32,%f4
2445	ldd	[%l0+0x10],%f6
2446	add	%fp,%o3,%o3
2447
2448	faddd	%f10,%f12,%f10
2449	andn	%l1,0x1f,%l1
2450
2451	fmuld	%f20,%f36,%f24
2452	ldd	[%l2+0x10],%f26
2453	add	%fp,%o5,%o5
2454
2455	faddd	%f4,%f6,%f4
2456	ldd	[%l0+0x20],%f32
2457
2458	fmuld	%f10,%f10,%f12
2459	add	%l1,%o4,%l1
2460
2461	faddd	%f24,%f26,%f24
2462	ldd	[%l2+0x20],%f36
2463
2464	fmuld	%f0,%f4,%f4
2465	ldd	[%l0+0x30],%f6
2466
2467	fmuld	%f12,%f58,%f16
2468	ldd	[%l3+%l1],%f34
2469
2470	fmuld	%f20,%f24,%f24
2471	ldd	[%l2+0x30],%f26
2472
2473	faddd	%f4,%f32,%f4
2474	ldd	[%o3+x0_1],%f32
2475
2476	faddd	%f16,%f56,%f16
2477	fmuld	%f12,%f62,%f14
2478
2479	faddd	%f24,%f36,%f24
2480	ldd	[%o5+x2_1],%f36
2481
! the tails y0 and y2 go to the stack to be read back through the pairs
2482	fmuld	%f0,%f4,%f4
2483	std	%f2,[%fp+y0_0]
2484
2485	fmuld	%f12,%f16,%f16
2486	faddd	%f14,%f60,%f14
2487
2488	fmuld	%f20,%f24,%f24
2489	std	%f22,[%fp+y2_0]
2490
2491	faddd	%f4,%f6,%f4
2492
2493	faddd	%f16,%f54,%f16
2494	fmuld	%f12,%f14,%f14
2495	ldd	[%g1+%l1],%f12
2496
2497	faddd	%f24,%f26,%f24
2498
2499	fmuld	%f0,%f4,%f4
2500
2501	fmuld	%f10,%f16,%f16
2502	ldd	[%l4+%l1],%f10
2503
2504	fmuld	%f14,%f34,%f14
2505
2506	fmuld	%f20,%f24,%f24
2507
2508	fmuld	%f16,%f12,%f16
2509
2510	fmuld	%f32,%f4,%f4
2511	ldd	[%o3+y0_0],%f2
2512
2513	fmuld	%f36,%f24,%f24
2514	ldd	[%o5+y2_0],%f22
2515
2516	faddd	%f16,%f14,%f16
2517
2518	faddd	%f4,%f2,%f4
2519
2520	faddd	%f24,%f22,%f24
2521
2522	faddd	%f16,%f10,%f16
2523
2524	faddd	%f32,%f4,%f6
2525
2526	faddd	%f36,%f24,%f26
2527	ba,pt	%icc,.FIXSIGN
2528
2529! delay slot
2530	faddd	%f16,%f34,%f16
2531
2532	.align	32
! .CASE6: lanes 0 and 1 are below thresh.  Entered from .CASE4 with the
! condition codes of andcc %l2,2 (set in the delay slot there) and with
! %o7 = 0x3fc3c000.  If lane 2 is also below thresh control moves to
! .CASE7; the fpadd32s in the delay slot runs either way.  Otherwise
! lane 2 takes the table path and lanes 0 and 1 evaluate polynomials in
! x*x with coefficients at constants+%o3 and constants+%o4 (stride 0x10),
! combined with their x?_1 and y?_0 stack pairs.
! Results end up in %f6, %f16, %f26.
2533.CASE6:
2534	ld	[%fp+x2_1],%l2
2535	add	%l3,8,%g1
2536	bne,pn	%icc,.CASE7
2537! delay slot
2538	fpadd32s %f20,%f31,%f28
2539
2540	fand	%f28,%f44,%f24
2541	ldd	[%l5+%o3],%f32
2542	add	%l5,%o3,%l0
2543
2544	fmuld	%f0,%f0,%f0
2545	sub	%l2,%o7,%l2
2546
2547	fsubd	%f20,%f24,%f20
2548	srl	%l2,10,%l2
2549
2550	fmuld	%f10,%f10,%f10
2551	ldd	[%l5+%o4],%f34
2552	add	%l5,%o4,%l1
2553
2554	fmuld	%f0,%f32,%f4
2555	ldd	[%l0+0x10],%f6
2556	add	%fp,%o3,%o3
2557
2558	faddd	%f20,%f22,%f20
2559	andn	%l2,0x1f,%l2
2560
2561	fmuld	%f10,%f34,%f14
2562	ldd	[%l1+0x10],%f16
2563	add	%fp,%o4,%o4
2564
2565	faddd	%f4,%f6,%f4
2566	ldd	[%l0+0x20],%f32
2567
2568	fmuld	%f20,%f20,%f22
2569	add	%l2,%o5,%l2
2570
2571	faddd	%f14,%f16,%f14
2572	ldd	[%l1+0x20],%f34
2573
2574	fmuld	%f0,%f4,%f4
2575	ldd	[%l0+0x30],%f6
2576
2577	fmuld	%f22,%f58,%f26
2578	ldd	[%l3+%l2],%f36
2579
2580	fmuld	%f10,%f14,%f14
2581	ldd	[%l1+0x30],%f16
2582
2583	faddd	%f4,%f32,%f4
2584	ldd	[%o3+x0_1],%f32
2585
2586	faddd	%f26,%f56,%f26
2587	fmuld	%f22,%f62,%f24
2588
2589	faddd	%f14,%f34,%f14
2590	ldd	[%o4+x1_1],%f34
2591
! the tails y0 and y1 go to the stack to be read back through the pairs
2592	fmuld	%f0,%f4,%f4
2593	std	%f2,[%fp+y0_0]
2594
2595	fmuld	%f22,%f26,%f26
2596	faddd	%f24,%f60,%f24
2597
2598	fmuld	%f10,%f14,%f14
2599	std	%f12,[%fp+y1_0]
2600
2601	faddd	%f4,%f6,%f4
2602
2603	faddd	%f26,%f54,%f26
2604	fmuld	%f22,%f24,%f24
2605	ldd	[%g1+%l2],%f22
2606
2607	faddd	%f14,%f16,%f14
2608
2609	fmuld	%f0,%f4,%f4
2610
2611	fmuld	%f20,%f26,%f26
2612	ldd	[%l4+%l2],%f20
2613
2614	fmuld	%f24,%f36,%f24
2615
2616	fmuld	%f10,%f14,%f14
2617
2618	fmuld	%f26,%f22,%f26
2619
2620	fmuld	%f32,%f4,%f4
2621	ldd	[%o3+y0_0],%f2
2622
2623	fmuld	%f34,%f14,%f14
2624	ldd	[%o4+y1_0],%f12
2625
2626	faddd	%f26,%f24,%f26
2627
2628	faddd	%f4,%f2,%f4
2629
2630	faddd	%f14,%f12,%f14
2631
2632	faddd	%f26,%f20,%f26
2633
2634	faddd	%f32,%f4,%f6
2635
2636	faddd	%f34,%f14,%f16
2637	ba,pt	%icc,.FIXSIGN
2638
2639! delay slot
2640	faddd	%f26,%f36,%f26
2641
2642	.align	32
! .CASE7: all three lanes are below thresh, so no table lookup is done.
! Each lane evaluates a four-coefficient polynomial in x*x with
! coefficients at constants+%o3/%o4/%o5 (stride 0x10) and combines it
! with its x?_1 and y?_0 stack pairs indexed by the same 0/8 offset.
! %o3-%o5 become stack addresses here; .FIXSIGN reloads them from n0-n2.
! Results end up in %f6, %f16, %f26.
2643.CASE7:
2644	fmuld	%f0,%f0,%f0
2645	ldd	[%l5+%o3],%f32
2646	add	%l5,%o3,%l0
2647
2648	fmuld	%f10,%f10,%f10
2649	ldd	[%l5+%o4],%f34
2650	add	%l5,%o4,%l1
2651
2652	fmuld	%f20,%f20,%f20
2653	ldd	[%l5+%o5],%f36
2654	add	%l5,%o5,%l2
2655
2656	fmuld	%f0,%f32,%f4
2657	ldd	[%l0+0x10],%f6
2658	add	%fp,%o3,%o3
2659
2660	fmuld	%f10,%f34,%f14
2661	ldd	[%l1+0x10],%f16
2662	add	%fp,%o4,%o4
2663
2664	fmuld	%f20,%f36,%f24
2665	ldd	[%l2+0x10],%f26
2666	add	%fp,%o5,%o5
2667
2668	faddd	%f4,%f6,%f4
2669	ldd	[%l0+0x20],%f32
2670
2671	faddd	%f14,%f16,%f14
2672	ldd	[%l1+0x20],%f34
2673
2674	faddd	%f24,%f26,%f24
2675	ldd	[%l2+0x20],%f36
2676
2677	fmuld	%f0,%f4,%f4
2678	ldd	[%l0+0x30],%f6
2679
2680	fmuld	%f10,%f14,%f14
2681	ldd	[%l1+0x30],%f16
2682
2683	fmuld	%f20,%f24,%f24
2684	ldd	[%l2+0x30],%f26
2685
2686	faddd	%f4,%f32,%f4
2687	ldd	[%o3+x0_1],%f32
2688
2689	faddd	%f14,%f34,%f14
2690	ldd	[%o4+x1_1],%f34
2691
2692	faddd	%f24,%f36,%f24
2693	ldd	[%o5+x2_1],%f36
2694
! the tails go to the stack so they can be read back through the pairs
2695	fmuld	%f0,%f4,%f4
2696	std	%f2,[%fp+y0_0]
2697
2698	fmuld	%f10,%f14,%f14
2699	std	%f12,[%fp+y1_0]
2700
2701	fmuld	%f20,%f24,%f24
2702	std	%f22,[%fp+y2_0]
2703
2704	faddd	%f4,%f6,%f4
2705
2706	faddd	%f14,%f16,%f14
2707
2708	faddd	%f24,%f26,%f24
2709
2710	fmuld	%f0,%f4,%f4
2711
2712	fmuld	%f10,%f14,%f14
2713
2714	fmuld	%f20,%f24,%f24
2715
2716	fmuld	%f32,%f4,%f4
2717	ldd	[%o3+y0_0],%f2
2718
2719	fmuld	%f34,%f14,%f14
2720	ldd	[%o4+y1_0],%f12
2721
2722	fmuld	%f36,%f24,%f24
2723	ldd	[%o5+y2_0],%f22
2724
2725	faddd	%f4,%f2,%f4
2726
2727	faddd	%f14,%f12,%f14
2728
2729	faddd	%f24,%f22,%f24
2730
2731	faddd	%f32,%f4,%f6
2732
2733	faddd	%f34,%f14,%f16
2734	ba,pt	%icc,.FIXSIGN
2735
2736! delay slot
2737	faddd	%f36,%f24,%f26
2738
2739
2740	.align	32
! .ENDLOOP2: the element count ran out with lanes 0 and 1 still pending.
! This block finishes lane 1 on its own (argument in %f10, output pointer
! %o1): reduction by multiples of pio2, then either the table path or,
! at local label 1, the polynomial path, then the sign fixup at local
! label 2 and the store.  It falls through into .ENDLOOP1 to finish
! lane 0.  Only lane 1 registers (%f10-%f19, %f34, %l1, %o4) and the
! scratch registers %g1, %o7 are written here.
2741.ENDLOOP2:
2742	fmuld	%f10,%f40,%f12
2743	add	%l5,thresh,%g1
2744	faddd	%f12,%f42,%f12
2745	st	%f13,[%fp+n1]
2746	fsubd	%f12,%f42,%f12		! n
2747	fmuld	%f12,%f46,%f14
2748	fsubd	%f10,%f14,%f14
2749	fmuld	%f12,%f48,%f16
2750	fsubd	%f14,%f16,%f10
2751	ld	[%fp+n1],%o4 ; add	%o4,1,%o4
2752	fsubd	%f14,%f10,%f34
2753	and	%o4,1,%o4
2754	fsubd	%f34,%f16,%f34
2755	fmuld	%f12,%f50,%f18
2756	sll	%o4,3,%o4
2757	fsubd	%f18,%f34,%f18
2758	ld	[%g1+%o4],%f16
2759	fsubd	%f10,%f18,%f14
2760	fsubd	%f10,%f14,%f34
2761	add	%l5,thresh+4,%o7
2762	fsubd	%f34,%f18,%f34
2763	fmuld	%f12,%f52,%f12
2764	fsubd	%f12,%f34,%f12
2765	ld	[%o7+%o4],%f18
2766	fsubd	%f14,%f12,%f10		! x
2767	fsubd	%f14,%f10,%f14
2768	fands	%f10,%f30,%f19		! save signbit
2769	fabsd	%f10,%f10
2770	std	%f10,[%fp+x1_1]
2771	fsubd	%f14,%f12,%f12		! y
2772	fcmpgt32 %f16,%f10,%l1
2773	fxors	%f12,%f19,%f12
2774	fands	%f19,%f18,%f19		! if (n & 1) clear sign bit
2775	andcc	%l1,2,%g0
2776	bne,pn	%icc,1f
2777! delay slot
2778	nop
! table path for lane 1
2779	fpadd32s %f10,%f31,%f18
2780	ld	[%fp+x1_1],%l1
2781	fand	%f18,%f44,%f14
2782	sethi	%hi(0x3fc3c000),%o7
2783	add	%l3,8,%g1
2784	fsubd	%f10,%f14,%f10
2785	sub	%l1,%o7,%l1
2786	srl	%l1,10,%l1
2787	faddd	%f10,%f12,%f10
2788	andn	%l1,0x1f,%l1
2789	fmuld	%f10,%f10,%f12
2790	add	%l1,%o4,%l1
2791	fmuld	%f12,%f58,%f16
2792	ldd	[%l3+%l1],%f34
2793	faddd	%f16,%f56,%f16
2794	fmuld	%f12,%f62,%f14
2795	fmuld	%f12,%f16,%f16
2796	faddd	%f14,%f60,%f14
2797	faddd	%f16,%f54,%f16
2798	fmuld	%f12,%f14,%f14
2799	ldd	[%g1+%l1],%f12
2800	fmuld	%f10,%f16,%f16
2801	ldd	[%l4+%l1],%f10
2802	fmuld	%f14,%f34,%f14
2803	fmuld	%f16,%f12,%f16
2804	faddd	%f16,%f14,%f16
2805	faddd	%f16,%f10,%f16
2806	ba,pt	%icc,2f
2807	faddd	%f16,%f34,%f16
! polynomial path for lane 1 (reduced |x| below thresh)
28081:
2809	fmuld	%f10,%f10,%f10
2810	ldd	[%l5+%o4],%f34
2811	add	%l5,%o4,%l1
2812	fmuld	%f10,%f34,%f14
2813	ldd	[%l1+0x10],%f16
2814	add	%fp,%o4,%o4
2815	faddd	%f14,%f16,%f14
2816	ldd	[%l1+0x20],%f34
2817	fmuld	%f10,%f14,%f14
2818	ldd	[%l1+0x30],%f16
2819	faddd	%f14,%f34,%f14
2820	ldd	[%o4+x1_1],%f34
2821	fmuld	%f10,%f14,%f14
2822	std	%f12,[%fp+y1_0]
2823	faddd	%f14,%f16,%f14
2824	fmuld	%f10,%f14,%f14
2825	fmuld	%f34,%f14,%f14
2826	ldd	[%o4+y1_0],%f12
2827	faddd	%f14,%f12,%f14
2828	faddd	%f34,%f14,%f16
! sign fixup and store for lane 1, same scheme as .FIXSIGN
28292:
2830	add	%l5,thresh-4,%g1
2831	ld	[%fp+n1],%o4 ; add	%o4,1,%o4
2832	and	%o4,2,%o4
2833	sll	%o4,2,%o4
2834	ld	[%g1+%o4],%f18
2835	fxors	%f19,%f18,%f19
2836	fors	%f16,%f19,%f16		! tack on sign
2837	st	%f16,[%o1]
2838	st	%f17,[%o1+4]
2839
2840.ENDLOOP1:
! The element count ran out with only lane 0 pending (or .ENDLOOP2 has
! just finished lane 1 and fallen through).  This block finishes lane 0
! on its own (argument in %f0, output pointer %o0): reduction by
! multiples of pio2, then either the table path or, at local label 1,
! the polynomial path, then the sign fixup at local label 2 and the
! store.  It falls through into .ENDLOOP0.
2841	fmuld	%f0,%f40,%f2
2842	add	%l5,thresh,%g1
2843	faddd	%f2,%f42,%f2
2844	st	%f3,[%fp+n0]
2845	fsubd	%f2,%f42,%f2		! n
2846	fmuld	%f2,%f46,%f4
2847	fsubd	%f0,%f4,%f4
2848	fmuld	%f2,%f48,%f6
2849	fsubd	%f4,%f6,%f0
2850	ld	[%fp+n0],%o3 ; add	%o3,1,%o3
2851	fsubd	%f4,%f0,%f32
2852	and	%o3,1,%o3
2853	fsubd	%f32,%f6,%f32
2854	fmuld	%f2,%f50,%f8
2855	sll	%o3,3,%o3
2856	fsubd	%f8,%f32,%f8
2857	ld	[%g1+%o3],%f6
2858	fsubd	%f0,%f8,%f4
2859	fsubd	%f0,%f4,%f32
2860	add	%l5,thresh+4,%o7
2861	fsubd	%f32,%f8,%f32
2862	fmuld	%f2,%f52,%f2
2863	fsubd	%f2,%f32,%f2
2864	ld	[%o7+%o3],%f8
2865	fsubd	%f4,%f2,%f0		! x
2866	fsubd	%f4,%f0,%f4
2867	fands	%f0,%f30,%f9		! save signbit
2868	fabsd	%f0,%f0
2869	std	%f0,[%fp+x0_1]
2870	fsubd	%f4,%f2,%f2		! y
2871	fcmpgt32 %f6,%f0,%l0
2872	fxors	%f2,%f9,%f2
2873	fands	%f9,%f8,%f9		! if (n & 1) clear sign bit
2874	andcc	%l0,2,%g0
2875	bne,pn	%icc,1f
2876! delay slot
2877	nop
! table path for lane 0
2878	fpadd32s %f0,%f31,%f8
2879	ld	[%fp+x0_1],%l0
2880	fand	%f8,%f44,%f4
2881	sethi	%hi(0x3fc3c000),%o7
2882	add	%l3,8,%g1
2883	fsubd	%f0,%f4,%f0
2884	sub	%l0,%o7,%l0
2885	srl	%l0,10,%l0
2886	faddd	%f0,%f2,%f0
2887	andn	%l0,0x1f,%l0
2888	fmuld	%f0,%f0,%f2
2889	add	%l0,%o3,%l0
2890	fmuld	%f2,%f58,%f6
2891	ldd	[%l3+%l0],%f32
2892	faddd	%f6,%f56,%f6
2893	fmuld	%f2,%f62,%f4
2894	fmuld	%f2,%f6,%f6
2895	faddd	%f4,%f60,%f4
2896	faddd	%f6,%f54,%f6
2897	fmuld	%f2,%f4,%f4
2898	ldd	[%g1+%l0],%f2
2899	fmuld	%f0,%f6,%f6
2900	ldd	[%l4+%l0],%f0
2901	fmuld	%f4,%f32,%f4
2902	fmuld	%f6,%f2,%f6
2903	faddd	%f6,%f4,%f6
2904	faddd	%f6,%f0,%f6
2905	ba,pt	%icc,2f
2906	faddd	%f6,%f32,%f6
! polynomial path for lane 0 (reduced |x| below thresh)
29071:
2908	fmuld	%f0,%f0,%f0
2909	ldd	[%l5+%o3],%f32
2910	add	%l5,%o3,%l0
2911	fmuld	%f0,%f32,%f4
2912	ldd	[%l0+0x10],%f6
2913	add	%fp,%o3,%o3
2914	faddd	%f4,%f6,%f4
2915	ldd	[%l0+0x20],%f32
2916	fmuld	%f0,%f4,%f4
2917	ldd	[%l0+0x30],%f6
2918	faddd	%f4,%f32,%f4
2919	ldd	[%o3+x0_1],%f32
2920	fmuld	%f0,%f4,%f4
2921	std	%f2,[%fp+y0_0]
2922	faddd	%f4,%f6,%f4
2923	fmuld	%f0,%f4,%f4
2924	fmuld	%f32,%f4,%f4
2925	ldd	[%o3+y0_0],%f2
2926	faddd	%f4,%f2,%f4
2927	faddd	%f32,%f4,%f6
! sign fixup and store for lane 0, same scheme as .FIXSIGN
29282:
2929	add	%l5,thresh-4,%g1
2930	ld	[%fp+n0],%o3 ; add	%o3,1,%o3
2931	and	%o3,2,%o3
2932	sll	%o3,2,%o3
2933	ld	[%g1+%o3],%f8
2934	fxors	%f9,%f8,%f9
2935	fors	%f6,%f9,%f6		! tack on sign
2936	st	%f6,[%o0]
2937	st	%f7,[%o0+4]
2938
2939.ENDLOOP0:
! All elements have been visited.  LIM_l6 is nonzero only if a .BIGn
! handler saw a finite argument above 0x413921fb and left it for later.
! In that case the saved original arguments are reloaded from the stack
! and the whole vector is handed to __vlibm_vcos_big, with %l7
! (0x413921fb) passed as the sixth argument.
2940
2941! check for huge arguments remaining
2942
2943	tst	LIM_l6
2944	be,pt	%icc,.exit
2945! delay slot
2946	nop
2947
2948! ========== huge range (use C code) ==========
2949
2950#ifdef __sparcv9
2951	ldx	[%fp+xsave],%o1
2952	ldx	[%fp+ysave],%o3
2953#else
2954	ld	[%fp+xsave],%o1
2955	ld	[%fp+ysave],%o3
2956#endif
2957	ld	[%fp+nsave],%o0
2958	ld	[%fp+sxsave],%o2
2959	ld	[%fp+sysave],%o4
2960	sra	%o2,0,%o2		! sign-extend for V9
2961	sra	%o4,0,%o4
2962	call	__vlibm_vcos_big
2963	mov	%l7,%o5			! delay slot
2964
! the call above returns here and falls through to the common exit
2965.exit:
2966	ret
2967	restore
2968
2969
2970	.align	32
! .SKIP0: skip the lane 0 element without producing a result for it.
! NOTE(review): nothing in this part of the file branches here; confirm
! where it is used.  Counts the element off, advances y, promotes the
! lane 1 argument to lane 0 (high word copied from %f10, low word read
! from [%i1+4]) and resumes at .LOOP0.
2971.SKIP0:
2972	addcc	%i0,-1,%i0
2973	ble,pn	%icc,.ENDLOOP0
2974! delay slot, harmless if branch taken
2975	add	%i3,%i4,%i3		! y += stridey
2976	andn	%l1,%i5,%l0		! hx &= ~0x80000000
2977	fmovs	%f10,%f0
2978	ld	[%i1+4],%f1
2979	ba,pt	%icc,.LOOP0
2980! delay slot
2981	add	%i1,%i2,%i1		! x += stridex
2982
2983
2984	.align	32
! .SKIP1: skip the lane 1 element without producing a result for it.
! NOTE(review): nothing in this part of the file branches here; confirm
! where it is used.  Counts the element off, advances y, promotes the
! lane 2 argument to lane 1 (high word copied from %f20, low word read
! from [%i1+4]) and resumes at .LOOP1.
2985.SKIP1:
2986	addcc	%i0,-1,%i0
2987	ble,pn	%icc,.ENDLOOP1
2988! delay slot, harmless if branch taken
2989	add	%i3,%i4,%i3		! y += stridey
2990	andn	%l2,%i5,%l1		! hx &= ~0x80000000
2991	fmovs	%f20,%f10
2992	ld	[%i1+4],%f11
2993	ba,pt	%icc,.LOOP1
2994! delay slot
2995	add	%i1,%i2,%i1		! x += stridex
2996
2997
2998	.align	32
! .SKIP2: skip the lane 2 element without producing a result for it.
! NOTE(review): nothing in this part of the file branches here; confirm
! where it is used.  Counts the element off, advances y, reads a fresh
! lane 2 argument from [%i1] and resumes at .LOOP2.
2999.SKIP2:
3000	addcc	%i0,-1,%i0
3001	ble,pn	%icc,.ENDLOOP2
3002! delay slot, harmless if branch taken
3003	add	%i3,%i4,%i3		! y += stridey
3004	ld	[%i1],%l2
3005	ld	[%i1],%f20
3006	ld	[%i1+4],%f21
3007	andn	%l2,%i5,%l2		! hx &= ~0x80000000
3008	ba,pt	%icc,.LOOP2
3009! delay slot
3010	add	%i1,%i2,%i1		! x += stridex
3011
3012
3013	.align	32
! .BIG0: lane 0 has hx0 > 0x413921fb.  Reached from .LOOP0 after y has
! already been advanced and the lane 1 argument fully preloaded.
! If hx0 < 0x7ff00000 nothing is stored now: LIM_l6 is set to the
! nonzero value in %l7 so that .ENDLOOP0 later hands the vector to
! __vlibm_vcos_big.  Otherwise x - x is stored as the result.
! Either way the element is counted off, lane 1 is promoted to lane 0
! and the loop resumes at .LOOP0.
3014.BIG0:
3015	sethi	%hi(0x7ff00000),%o7
3016	cmp	%l0,%o7
3017	bl,a,pt	%icc,1f			! if hx < 0x7ff00000
3018! delay slot, annulled if branch not taken
3019	mov	%l7,LIM_l6	! set biguns flag or
3020	fsubd	%f0,%f0,%f0		! y = x - x
3021	st	%f0,[%o0]
3022	st	%f1,[%o0+4]
30231:
3024	addcc	%i0,-1,%i0
3025	ble,pn	%icc,.ENDLOOP0
3026! delay slot, harmless if branch taken
3027	andn	%l1,%i5,%l0		! hx &= ~0x80000000
3028	fmovd	%f10,%f0
3029	ba,pt	%icc,.LOOP0
3030! delay slot
3031	add	%i1,%i2,%i1		! x += stridex
3032
3033
3034	.align	32
! .BIG1: lane 1 has hx1 > 0x413921fb.  Reached from .LOOP1 after y has
! already been advanced and the lane 2 argument fully preloaded.
! If hx1 < 0x7ff00000 nothing is stored now: LIM_l6 is set to the
! nonzero value in %l7 so that .ENDLOOP0 later hands the vector to
! __vlibm_vcos_big.  Otherwise x - x is stored as the result.
! Either way the element is counted off, lane 2 is promoted to lane 1
! and the loop resumes at .LOOP1.
3035.BIG1:
3036	sethi	%hi(0x7ff00000),%o7
3037	cmp	%l1,%o7
3038	bl,a,pt	%icc,1f			! if hx < 0x7ff00000
3039! delay slot, annulled if branch not taken
3040	mov	%l7,LIM_l6		! set biguns flag or
3041	fsubd	%f10,%f10,%f10		! y = x - x
3042	st	%f10,[%o1]
3043	st	%f11,[%o1+4]
30441:
3045	addcc	%i0,-1,%i0
3046	ble,pn	%icc,.ENDLOOP1
3047! delay slot, harmless if branch taken
3048	andn	%l2,%i5,%l1		! hx &= ~0x80000000
3049	fmovd	%f20,%f10
3050	ba,pt	%icc,.LOOP1
3051! delay slot
3052	add	%i1,%i2,%i1		! x += stridex
3053
3054
3055	.align	32
! .BIG2: lane 2 has hx2 > 0x413921fb.  Reached from .LOOP2 after y has
! already been advanced.  If hx2 < 0x7ff00000 nothing is stored now:
! LIM_l6 is set to the nonzero value in %l7 so that .ENDLOOP0 later
! hands the vector to __vlibm_vcos_big.  Otherwise x - x is stored as
! the result.  Either way the element is counted off; there is no
! preloaded successor for lane 2, so a fresh argument is read from
! [%i1] once the count shows one remains, and the loop resumes at .LOOP2.
3056.BIG2:
3057	sethi	%hi(0x7ff00000),%o7
3058	cmp	%l2,%o7
3059	bl,a,pt	%icc,1f			! if hx < 0x7ff00000
3060! delay slot, annulled if branch not taken
3061	mov	%l7,LIM_l6		! set biguns flag or
3062	fsubd	%f20,%f20,%f20		! y = x - x
3063	st	%f20,[%o2]
3064	st	%f21,[%o2+4]
30651:
3066	addcc	%i0,-1,%i0
3067	ble,pn	%icc,.ENDLOOP2
3068! delay slot
3069	nop
3070	ld	[%i1],%l2
3071	ld	[%i1],%f20
3072	ld	[%i1+4],%f21
3073	andn	%l2,%i5,%l2		! hx &= ~0x80000000
3074	ba,pt	%icc,.LOOP2
3075! delay slot
3076	add	%i1,%i2,%i1		! x += stridex
3077
3078	SET_SIZE(__vcos)
3079
3080