/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

	.file	"__vsin_ultra3.S"

#include "libm.h"
	.weak	__vsin
	.type	__vsin,#function
	__vsin = __vsin_ultra3

	RO_DATA
	.align	64
! Table of double-precision constants, each stored as two 32-bit words
! with the high word first.  The function prologue loads them with ldd
! from byte offsets 0x00 through 0x58, in the order listed here, into
! the FP registers named c3two44 through qq3.
constants:
	.word	0x42c80000,0x00000000	! 3 * 2^44  (added to |x| to form a table index)
	.word	0x43380000,0x00000000	! 3 * 2^51  (added and subtracted to round to an integer)
	.word	0x3fe45f30,0x6dc9c883	! invpio2   (approximately 2/pi)
	.word	0x3ff921fb,0x54442c00	! pio2_1    (leading part of pi/2, low bits zero)
	.word	0x3d318469,0x898cc400	! pio2_2    (second part of pi/2)
	.word	0x3a71701b,0x839a2520	! pio2_3    (third part of pi/2; also a tiny positive threshold)
	.word	0xbfc55555,0x55555533	! pp1       (sin polynomial, close to -1/6)
	.word	0x3f811111,0x10e7d53b	! pp2       (sin polynomial, close to 1/120)
	.word	0xbf2a0167,0xe6b3cf9b	! pp3       (sin polynomial, close to -1/5040)
	.word	0xbfdfffff,0xffffff65	! qq1       (cos polynomial, close to -1/2)
	.word	0x3fa55555,0x54f88ed0	! qq2       (cos polynomial, close to 1/24)
	.word	0xbf56c12c,0xdd185f60	! qq3       (cos polynomial, close to -1/720)

! local storage indices
!
! Offsets from %fp of the scratch slots inside the temporary area that
! the save instruction at function entry reserves:
!   xsave, ysave     copies of the x and y pointers (stored with stx
!                    when __sparcv9 is defined, st otherwise)
!   nsave            copy of the element count n
!   sxsave, sysave   copies of stridex and stridey, saved before the
!                    strides are scaled
!   biguns           word that is cleared to 0 at entry
!   nk0 .. nk3       one word per lane, used to move the low half of an
!                    FP register into an integer register (st, then ld)
!   junk             dummy target for result stores that are issued
!                    before a real destination pointer is available

#define xsave		STACK_BIAS-0x8
#define ysave		STACK_BIAS-0x10
#define nsave		STACK_BIAS-0x14
#define sxsave		STACK_BIAS-0x18
#define sysave		STACK_BIAS-0x1c
#define biguns		STACK_BIAS-0x20
#define nk3		STACK_BIAS-0x24
#define nk2		STACK_BIAS-0x28
#define nk1		STACK_BIAS-0x2c
#define nk0		STACK_BIAS-0x30
#define junk		STACK_BIAS-0x38
! sizeof temp storage - must be a multiple of 16 for V9
#define tmps		0x40

! register use

! i0  n
! i1  x
! i2  stridex
! i3  y
! i4  stridey
! i5  0x80000000

! l0  hx0
! l1  hx1
! l2  hx2
! l3  hx3
! l4  k0
! l5  k1
! l6  k2
! l7  k3

! the following are 64-bit registers in both V8+ and V9

! g1  __vlibm_TBL_sincos2
! g5  scratch

! o0  py0
! o1  py1
! o2  py2
! o3  py3
! o4  0x3e400000
! o5  0x3fe921fb,0x4099251e
! o7  scratch

! f0  hx0
! f2  
! f4  
! f6  
! f8  hx1
! f10 
! f12 
! f14 
! f16 hx2
! f18 
! f20 
! f22 
! f24 hx3
! f26 
! f28 
! f30 
! f32 
! f34 
! f36
! f38

! FP registers that hold the constants.  The prologue loads them once,
! in this order, from consecutive doubles of the constants table.
#define c3two44	%f40
#define c3two51	%f42
#define invpio2	%f44
#define pio2_1	%f46
#define pio2_2	%f48
#define pio2_3	%f50
#define pp1	%f52
#define pp2	%f54
#define pp3	%f56
#define qq1	%f58
#define qq2	%f60
#define qq3	%f62

	ENTRY(__vsin_ultra3)
! Vector sine entry point; the weak symbol __vsin is aliased to it.
! Arguments (see the register-use table): %i0 = n, %i1 = x,
! %i2 = stridex, %i3 = y, %i4 = stridey.
! The prologue saves the arguments, loads the constants, fetches the
! first argument and enters the loop at .loop0.
	save	%sp,-SA(MINFRAME)-tmps,%sp
! The PIC macros are defined outside this file.  The registers they
! name are used below as the base address of the constants table (%o0)
! and of __vlibm_TBL_sincos2 (%o1, copied to %g1).
	PIC_SETUP(l7)
	PIC_SET(l7,constants,o0)
	PIC_SET(l7,__vlibm_TBL_sincos2,o1)
	mov	%o1,%g1
	wr	%g0,0x82,%asi		! set %asi for non-faulting loads
#ifdef __sparcv9
	stx	%i1,[%fp+xsave]		! save arguments
	stx	%i3,[%fp+ysave]
#else
	st	%i1,[%fp+xsave]		! save arguments
	st	%i3,[%fp+ysave]
#endif
	st	%i0,[%fp+nsave]
	st	%i2,[%fp+sxsave]
	st	%i4,[%fp+sysave]
	st	%g0,[%fp+biguns]	! biguns = 0
	ldd	[%o0+0x00],c3two44	! load/set up constants
	ldd	[%o0+0x08],c3two51
	ldd	[%o0+0x10],invpio2
	ldd	[%o0+0x18],pio2_1
	ldd	[%o0+0x20],pio2_2
	ldd	[%o0+0x28],pio2_3
	ldd	[%o0+0x30],pp1
	ldd	[%o0+0x38],pp2
	ldd	[%o0+0x40],pp3
	ldd	[%o0+0x48],qq1
	ldd	[%o0+0x50],qq2
	ldd	[%o0+0x58],qq3
	sethi	%hi(0x80000000),%i5	! %i5 = sign-bit mask
	sethi	%hi(0x3e400000),%o4	! %o4 = smallest hx handled by the main loop
! Build %o5 = 0x3fe921fb in the upper 32 bits and 0x4099251e in the
! lower 32 bits; %o7 is only a scratch register here.
	sethi	%hi(0x3fe921fb),%o5
	or	%o5,%lo(0x3fe921fb),%o5
	sllx	%o5,32,%o5
	sethi	%hi(0x4099251e),%o7
	or	%o7,%lo(0x4099251e),%o7
	or	%o5,%o7,%o5
! Elements are 8-byte doubles, so strides given in elements are
! multiplied by 8 to obtain byte strides.
	sll	%i2,3,%i2		! scale strides
	sll	%i4,3,%i4
! py1 .. py3 start out pointing at the junk slot: the loop stores to
! [py+4] before it has assigned a real destination to these pointers.
	add	%fp,junk,%o1		! loop prologue
	add	%fp,junk,%o2
	add	%fp,junk,%o3
! First argument: high word into both %l0 and %f0, low word into %f3.
	ld	[%i1],%l0		! *x
	ld	[%i1],%f0
	ld	[%i1+4],%f3
	andn	%l0,%i5,%l0		! mask off sign
	ba	.loop0
! delay slot (executed before control reaches .loop0)
	add	%i1,%i2,%i1		! x += stridex

! 16-byte aligned
	.align	16
! .loop0 .. .loop3 each take one argument into lane 0 .. 3.  Per lane:
!   - the next argument is preloaded with non-faulting loads (high word
!     into an integer register and an even FP register, low word into
!     an odd FP register); lane 3 does no preload;
!   - hx, the high word of |x|, is range checked: hx - 0x3e400000 and
!     0x4099251e - hx are ORed together, so the 32-bit negative flag is
!     set when hx lies outside [0x3e400000, 0x4099251e], and the code
!     then branches to .rangeN;
!   - y is recorded as the output pointer of the lane and then advanced
!     (for lane 3 the advance happens at .cont);
!   - in lanes 0 .. 2, n is decremented and the code leaves through
!     .lastN when no argument remains for the next lane;
!   - |x| + 3*2^44 is started; the low word of that sum is later read
!     back through nk0 .. nk3 as the table index.
! The stores to [py+4] write the low word of a result whose high word
! was stored at the end of the previous pass; on the first pass these
! pointers still address the junk slot.
.loop0:
	lda	[%i1]%asi,%l1		! preload next argument
	sub	%l0,%o4,%g5		! negative if hx < 0x3e400000
	sub	%o5,%l0,%o7		! low 32 bits: 0x4099251e - hx
	fabss	%f0,%f2			! high word of |x0|; low word is in %f3

	lda	[%i1]%asi,%f8
	orcc	%o7,%g5,%g0
	mov	%i3,%o0			! py0 = y
	bl,pn	%icc,.range0		! hx < 0x3e400000 or hx > 0x4099251e

! delay slot
	lda	[%i1+4]%asi,%f11
	addcc	%i0,-1,%i0
	add	%i3,%i4,%i3		! y += stridey
	ble,pn	%icc,.last1

! delay slot
	andn	%l1,%i5,%l1
	add	%i1,%i2,%i1		! x += stridex
	faddd	%f2,c3two44,%f4
	st	%f15,[%o1+4]

.loop1:
	lda	[%i1]%asi,%l2		! preload next argument
	sub	%l1,%o4,%g5
	sub	%o5,%l1,%o7
	fabss	%f8,%f10

	lda	[%i1]%asi,%f16
	orcc	%o7,%g5,%g0
	mov	%i3,%o1			! py1 = y
	bl,pn	%icc,.range1		! hx < 0x3e400000 or hx > 0x4099251e

! delay slot
	lda	[%i1+4]%asi,%f19
	addcc	%i0,-1,%i0
	add	%i3,%i4,%i3		! y += stridey
	ble,pn	%icc,.last2

! delay slot
	andn	%l2,%i5,%l2
	add	%i1,%i2,%i1		! x += stridex
	faddd	%f10,c3two44,%f12
	st	%f23,[%o2+4]

.loop2:
	lda	[%i1]%asi,%l3		! preload next argument
	sub	%l2,%o4,%g5
	sub	%o5,%l2,%o7
	fabss	%f16,%f18

	lda	[%i1]%asi,%f24
	orcc	%o7,%g5,%g0
	mov	%i3,%o2			! py2 = y
	bl,pn	%icc,.range2		! hx < 0x3e400000 or hx > 0x4099251e

! delay slot
	lda	[%i1+4]%asi,%f27
	addcc	%i0,-1,%i0
	add	%i3,%i4,%i3		! y += stridey
	ble,pn	%icc,.last3

! delay slot
	andn	%l3,%i5,%l3
	add	%i1,%i2,%i1		! x += stridex
	faddd	%f18,c3two44,%f20
	st	%f31,[%o3+4]

.loop3:
	sub	%l3,%o4,%g5
	sub	%o5,%l3,%o7
	fabss	%f24,%f26
	st	%f5,[%fp+nk0]		! low word of |x0| + 3*2^44

	orcc	%o7,%g5,%g0
	mov	%i3,%o3			! py3 = y
	bl,pn	%icc,.range3		! hx < 0x3e400000 or hx > 0x4099251e
! delay slot
	st	%f13,[%fp+nk1]		! low word of |x1| + 3*2^44

!!! DONE?
.cont:
! All four lanes hold an argument that passed the range check.  Finish
! assembling the doubles, choose between the primary path (every hx is
! at most 0x3fe921fb) and .medium, and fetch the table indices.
	srlx	%o5,32,%o7		! %o7 = 0x3fe921fb
	add	%i3,%i4,%i3		! y += stridey
	fmovs	%f3,%f1			! low word completes x0 in %f0
	st	%f21,[%fp+nk2]

	sub	%o7,%l0,%l0		! negative if hx0 > 0x3fe921fb
	sub	%o7,%l1,%l1
	faddd	%f26,c3two44,%f28
	st	%f29,[%fp+nk3]

	sub	%o7,%l2,%l2
	sub	%o7,%l3,%l3
	fmovs	%f11,%f9

	or	%l0,%l1,%l0
	or	%l2,%l3,%l2
	fmovs	%f19,%f17

	fmovs	%f27,%f25
	fmuld	%f0,invpio2,%f6		! x * invpio2, for medium range

	fmuld	%f8,invpio2,%f14
	ld	[%fp+nk0],%l4

	fmuld	%f16,invpio2,%f22
	ld	[%fp+nk1],%l5

	orcc	%l0,%l2,%g0
	bl,pn	%icc,.medium		! taken if any lane has hx > 0x3fe921fb
! delay slot
	fmuld	%f24,invpio2,%f30
	ld	[%fp+nk2],%l6

! Primary path.  Each table entry is 32 bytes (index shifted left by 5).
! With T0, T8 and T16 the doubles at offsets 0, 8 and 16 of the entry,
! d = |x| - T0 and z = d*d, the value computed per lane is
!   T8 + T8*z*(qq1 + z*qq2) + T16*(d + d*z*(pp1 + z*(pp2 + z*pp3)))
! T8 and T16 are presumably the sine and cosine of the table point T0;
! the table itself is defined outside this file.
	ld	[%fp+nk3],%l7
	sll	%l4,5,%l4		! k
	fcmpd	%fcc0,%f0,pio2_3	! x < pio2_3 iff x < 0

	sll	%l5,5,%l5
	ldd	[%l4+%g1],%f4
	fcmpd	%fcc1,%f8,pio2_3

	sll	%l6,5,%l6
	ldd	[%l5+%g1],%f12
	fcmpd	%fcc2,%f16,pio2_3

	sll	%l7,5,%l7
	ldd	[%l6+%g1],%f20
	fcmpd	%fcc3,%f24,pio2_3

	ldd	[%l7+%g1],%f28
	fsubd	%f2,%f4,%f2		! x -= __vlibm_TBL_sincos2[k]

	fsubd	%f10,%f12,%f10

	fsubd	%f18,%f20,%f18

	fsubd	%f26,%f28,%f26

	fmuld	%f2,%f2,%f0		! z = x * x

	fmuld	%f10,%f10,%f8

	fmuld	%f18,%f18,%f16

	fmuld	%f26,%f26,%f24

! sin polynomial in %f6/%f14/%f22/%f30, cos polynomial in
! %f4/%f12/%f20/%f28, both evaluated by Horner steps in z
	fmuld	%f0,pp3,%f6

	fmuld	%f8,pp3,%f14

	fmuld	%f16,pp3,%f22

	fmuld	%f24,pp3,%f30

	faddd	%f6,pp2,%f6
	fmuld	%f0,qq2,%f4

	faddd	%f14,pp2,%f14
	fmuld	%f8,qq2,%f12

	faddd	%f22,pp2,%f22
	fmuld	%f16,qq2,%f20

	faddd	%f30,pp2,%f30
	fmuld	%f24,qq2,%f28

	fmuld	%f0,%f6,%f6
	faddd	%f4,qq1,%f4

	fmuld	%f8,%f14,%f14
	faddd	%f12,qq1,%f12

	fmuld	%f16,%f22,%f22
	faddd	%f20,qq1,%f20

	fmuld	%f24,%f30,%f30
	faddd	%f28,qq1,%f28

	faddd	%f6,pp1,%f6
	fmuld	%f0,%f4,%f4
	add	%l4,%g1,%l4		! address of table entry k0

	faddd	%f14,pp1,%f14
	fmuld	%f8,%f12,%f12
	add	%l5,%g1,%l5

	faddd	%f22,pp1,%f22
	fmuld	%f16,%f20,%f20
	add	%l6,%g1,%l6

	faddd	%f30,pp1,%f30
	fmuld	%f24,%f28,%f28
	add	%l7,%g1,%l7

	fmuld	%f0,%f6,%f6
	ldd	[%l4+8],%f0		! T8 (z is no longer needed)

	fmuld	%f8,%f14,%f14
	ldd	[%l5+8],%f8

	fmuld	%f16,%f22,%f22
	ldd	[%l6+8],%f16

	fmuld	%f24,%f30,%f30
	ldd	[%l7+8],%f24

	fmuld	%f2,%f6,%f6

	fmuld	%f10,%f14,%f14

	fmuld	%f18,%f22,%f22

	fmuld	%f26,%f30,%f30

	faddd	%f6,%f2,%f6
	fmuld	%f0,%f4,%f4
	ldd	[%l4+16],%f2		! T16 (d is no longer needed)

	faddd	%f14,%f10,%f14
	fmuld	%f8,%f12,%f12
	ldd	[%l5+16],%f10

	faddd	%f22,%f18,%f22
	fmuld	%f16,%f20,%f20
	ldd	[%l6+16],%f18

	faddd	%f30,%f26,%f30
	fmuld	%f24,%f28,%f28
	ldd	[%l7+16],%f26

	fmuld	%f2,%f6,%f6

	fmuld	%f10,%f14,%f14

	fmuld	%f18,%f22,%f22

	fmuld	%f26,%f30,%f30

	faddd	%f6,%f4,%f6

	faddd	%f14,%f12,%f14

	faddd	%f22,%f20,%f22

	faddd	%f30,%f28,%f30

	faddd	%f6,%f0,%f6

	faddd	%f14,%f8,%f14

	faddd	%f22,%f16,%f22

	faddd	%f30,%f24,%f30

! Prepare the negated results and, in parallel, preload the first
! argument of the next pass into lane 0.
	fnegd	%f6,%f4
	lda	[%i1]%asi,%l0		! preload next argument

	fnegd	%f14,%f12
	lda	[%i1]%asi,%f0

	fnegd	%f22,%f20
	lda	[%i1+4]%asi,%f3

	fnegd	%f30,%f28
	andn	%l0,%i5,%l0
	add	%i1,%i2,%i1

! Only the high words are stored here.  The low word of lane 0 is
! stored in the branch delay slot below; the low words of lanes 1 .. 3
! are stored through [py+4] early in the next pass.
	fmovdl	%fcc0,%f4,%f6		! (hx < -0)? -s : s
	st	%f6,[%o0]

	fmovdl	%fcc1,%f12,%f14
	st	%f14,[%o1]

	fmovdl	%fcc2,%f20,%f22
	st	%f22,[%o2]

	fmovdl	%fcc3,%f28,%f30
	st	%f30,[%o3]
	addcc	%i0,-1,%i0

	bg,pt	%icc,.loop0
! delay slot
	st	%f7,[%o0+4]

	ba,pt	%icc,.end
! delay slot
	nop


	.align	16
! Medium range: at least one lane has hx > 0x3fe921fb.  On entry
! %f0/%f8/%f16/%f24 hold x and %f6/%f14/%f22/%f30 hold x * invpio2.
! All four lanes are reduced by a multiple n of pi/2, a new table index
! is formed from the reduced argument, and bit 0 of n then selects the
! sin or cos polynomial for each lane.
.medium:
! Adding and then subtracting 3*2^51 rounds x * invpio2 to an integer
! valued double; the low word of the intermediate sum is read back
! through nk0 .. nk3 as the integer n.
	faddd	%f6,c3two51,%f4
	st	%f5,[%fp+nk0]

	faddd	%f14,c3two51,%f12
	st	%f13,[%fp+nk1]

	faddd	%f22,c3two51,%f20
	st	%f21,[%fp+nk2]

	faddd	%f30,c3two51,%f28
	st	%f29,[%fp+nk3]

	fsubd	%f4,c3two51,%f6

	fsubd	%f12,c3two51,%f14

	fsubd	%f20,c3two51,%f22

	fsubd	%f28,c3two51,%f30

	fmuld	%f6,pio2_1,%f2
	ld	[%fp+nk0],%l0		! n

	fmuld	%f14,pio2_1,%f10
	ld	[%fp+nk1],%l1

	fmuld	%f22,pio2_1,%f18
	ld	[%fp+nk2],%l2

	fmuld	%f30,pio2_1,%f26
	ld	[%fp+nk3],%l3

	fsubd	%f0,%f2,%f0		! x - n*pio2_1
	fmuld	%f6,pio2_2,%f4

	fsubd	%f8,%f10,%f8
	fmuld	%f14,pio2_2,%f12

	fsubd	%f16,%f18,%f16
	fmuld	%f22,pio2_2,%f20

	fsubd	%f24,%f26,%f24
	fmuld	%f30,pio2_2,%f28

	fsubd	%f0,%f4,%f32		! t = x - n*pio2_1 - n*pio2_2

	fsubd	%f8,%f12,%f34

	fsubd	%f16,%f20,%f36

	fsubd	%f24,%f28,%f38

! The next two fsubd groups recover what was lost to rounding when t
! was formed.  fcmple32 compares 32-bit halves; bit 1 of its result
! (high words) is turned into an all-ones or all-zeros mask by the
! sll 30 / sra 31 pair further down.
	fsubd	%f0,%f32,%f0
	fcmple32 %f32,pio2_3,%l4	! x <= pio2_3 iff x < 0

	fsubd	%f8,%f34,%f8
	fcmple32 %f34,pio2_3,%l5

	fsubd	%f16,%f36,%f16
	fcmple32 %f36,pio2_3,%l6

	fsubd	%f24,%f38,%f24
	fcmple32 %f38,pio2_3,%l7

	fsubd	%f0,%f4,%f0
	fmuld	%f6,pio2_3,%f6
	sll	%l4,30,%l4		! if (x < 0) n = -n ^ 2

	fsubd	%f8,%f12,%f8
	fmuld	%f14,pio2_3,%f14
	sll	%l5,30,%l5

	fsubd	%f16,%f20,%f16
	fmuld	%f22,pio2_3,%f22
	sll	%l6,30,%l6

	fsubd	%f24,%f28,%f24
	fmuld	%f30,pio2_3,%f30
	sll	%l7,30,%l7

	fsubd	%f6,%f0,%f6		! n*pio2_3 combined with the recovered rounding term
	sra	%l4,31,%l4

	fsubd	%f14,%f8,%f14
	sra	%l5,31,%l5

	fsubd	%f22,%f16,%f22
	sra	%l6,31,%l6

	fsubd	%f30,%f24,%f30
	sra	%l7,31,%l7

	fsubd	%f32,%f6,%f0		! reduced x
	xor	%l0,%l4,%l0		! with the sub below: n = -n when the mask is set

	fsubd	%f34,%f14,%f8
	xor	%l1,%l5,%l1

	fsubd	%f36,%f22,%f16
	xor	%l2,%l6,%l2

	fsubd	%f38,%f30,%f24
	xor	%l3,%l7,%l3

	fabsd	%f0,%f2
	sub	%l0,%l4,%l0

	fabsd	%f8,%f10
	sub	%l1,%l5,%l1

	fabsd	%f16,%f18
	sub	%l2,%l6,%l2

	fabsd	%f24,%f26
	sub	%l3,%l7,%l3

! Table index for the reduced argument, formed as on the primary path:
! low word of |reduced x| + 3*2^44, passed through nk0 .. nk3.
	faddd	%f2,c3two44,%f4
	st	%f5,[%fp+nk0]
	and	%l4,2,%l4		! mask becomes 2 or 0

	faddd	%f10,c3two44,%f12
	st	%f13,[%fp+nk1]
	and	%l5,2,%l5

	faddd	%f18,c3two44,%f20
	st	%f21,[%fp+nk2]
	and	%l6,2,%l6

	faddd	%f26,c3two44,%f28
	st	%f29,[%fp+nk3]
	and	%l7,2,%l7

	fsubd	%f32,%f0,%f4
	xor	%l0,%l4,%l0		! n ^= 2 when the mask was set

	fsubd	%f34,%f8,%f12
	xor	%l1,%l5,%l1

	fsubd	%f36,%f16,%f20
	xor	%l2,%l6,%l2

	fsubd	%f38,%f24,%f28
	xor	%l3,%l7,%l3

! fzero followed by fnegd leaves only the sign bit set in %f38, which
! then serves as a sign-bit mask for the fand instructions.
	fzero	%f38
	ld	[%fp+nk0],%l4

	fsubd	%f4,%f6,%f6		! w
	ld	[%fp+nk1],%l5

	fsubd	%f12,%f14,%f14
	ld	[%fp+nk2],%l6

	fnegd	%f38,%f38
	ld	[%fp+nk3],%l7
	sll	%l4,5,%l4		! k

	fsubd	%f20,%f22,%f22
	sll	%l5,5,%l5

	fsubd	%f28,%f30,%f30
	sll	%l6,5,%l6

	fand	%f0,%f38,%f32		! sign bit of x
	ldd	[%l4+%g1],%f4
	sll	%l7,5,%l7

	fand	%f8,%f38,%f34
	ldd	[%l5+%g1],%f12

	fand	%f16,%f38,%f36
	ldd	[%l6+%g1],%f20

	fand	%f24,%f38,%f38		! last use of the mask; %f38 now holds the lane 3 sign
	ldd	[%l7+%g1],%f28

	fsubd	%f2,%f4,%f2		! x -= __vlibm_TBL_sincos2[k]

	fsubd	%f10,%f12,%f10

	fsubd	%f18,%f20,%f18
	nop

	fsubd	%f26,%f28,%f26
	nop

! Dispatch on bit 0 of n in each lane.  A taken bz (bit clear) leads to
! code that uses the sin polynomial for that lane; falling through all
! four tests reaches the code where every lane uses the cos polynomial.
! Each delay slot gives w the sign of the reduced x for its lane, which
! matches the use of |reduced x| above; the result replaces the sign
! word in %f32/%f34/%f36/%f38.
! 16-byte aligned
	fmuld	%f2,%f2,%f0		! z = x * x
	andcc	%l0,1,%g0
	bz,pn	%icc,.case8
! delay slot
	fxor	%f6,%f32,%f32

	fmuld	%f10,%f10,%f8
	andcc	%l1,1,%g0
	bz,pn	%icc,.case4
! delay slot
	fxor	%f14,%f34,%f34

	fmuld	%f18,%f18,%f16
	andcc	%l2,1,%g0
	bz,pn	%icc,.case2
! delay slot
	fxor	%f22,%f36,%f36

	fmuld	%f26,%f26,%f24
	andcc	%l3,1,%g0
	bz,pn	%icc,.case1
! delay slot
	fxor	%f30,%f38,%f38

!.case0:
	fmuld	%f0,qq3,%f6		! cos(x0)

	fmuld	%f8,qq3,%f14		! cos(x1)

	fmuld	%f16,qq3,%f22		! cos(x2)

	fmuld	%f24,qq3,%f30		! cos(x3)

	faddd	%f6,qq2,%f6
	fmuld	%f0,pp2,%f4

	faddd	%f14,qq2,%f14
	fmuld	%f8,pp2,%f12

	faddd	%f22,qq2,%f22
	fmuld	%f16,pp2,%f20

	faddd	%f30,qq2,%f30
	fmuld	%f24,pp2,%f28

	fmuld	%f0,%f6,%f6
	faddd	%f4,pp1,%f4

	fmuld	%f8,%f14,%f14
	faddd	%f12,pp1,%f12

	fmuld	%f16,%f22,%f22
	faddd	%f20,pp1,%f20

	fmuld	%f24,%f30,%f30
	faddd	%f28,pp1,%f28

	faddd	%f6,qq1,%f6
	fmuld	%f0,%f4,%f4
	add	%l4,%g1,%l4

	faddd	%f14,qq1,%f14
	fmuld	%f8,%f12,%f12
	add	%l5,%g1,%l5

	faddd	%f22,qq1,%f22
	fmuld	%f16,%f20,%f20
	add	%l6,%g1,%l6

	faddd	%f30,qq1,%f30
	fmuld	%f24,%f28,%f28
	add	%l7,%g1,%l7

	fmuld	%f2,%f4,%f4

	fmuld	%f10,%f12,%f12

	fmuld	%f18,%f20,%f20

	fmuld	%f26,%f28,%f28

	fmuld	%f0,%f6,%f6
	faddd	%f4,%f32,%f4
	ldd	[%l4+16],%f0

	fmuld	%f8,%f14,%f14
	faddd	%f12,%f34,%f12
	ldd	[%l5+16],%f8

	fmuld	%f16,%f22,%f22
	faddd	%f20,%f36,%f20
	ldd	[%l6+16],%f16

	fmuld	%f24,%f30,%f30
	faddd	%f28,%f38,%f28
	ldd	[%l7+16],%f24

	fmuld	%f0,%f6,%f6
	faddd	%f4,%f2,%f4
	ldd	[%l4+8],%f32

	fmuld	%f8,%f14,%f14
	faddd	%f12,%f10,%f12
	ldd	[%l5+8],%f34

	fmuld	%f16,%f22,%f22
	faddd	%f20,%f18,%f20
	ldd	[%l6+8],%f36

	fmuld	%f24,%f30,%f30
	faddd	%f28,%f26,%f28
	ldd	[%l7+8],%f38

	fmuld	%f32,%f4,%f4

	fmuld	%f34,%f12,%f12

	fmuld	%f36,%f20,%f20

	fmuld	%f38,%f28,%f28

	fsubd	%f6,%f4,%f6

	fsubd	%f14,%f12,%f14

	fsubd	%f22,%f20,%f22

	fsubd	%f30,%f28,%f30

	faddd	%f6,%f0,%f6

	faddd	%f14,%f8,%f14

	faddd	%f22,%f16,%f22

	faddd	%f30,%f24,%f30
	mov	%l0,%l4

	fnegd	%f6,%f4
	lda	[%i1]%asi,%l0		! preload next argument

	fnegd	%f14,%f12
	lda	[%i1]%asi,%f0

	fnegd	%f22,%f20
	lda	[%i1+4]%asi,%f3

	fnegd	%f30,%f28
	andn	%l0,%i5,%l0
	add	%i1,%i2,%i1

	andcc	%l4,2,%g0
	fmovdnz	%icc,%f4,%f6
	st	%f6,[%o0]

	andcc	%l1,2,%g0
	fmovdnz	%icc,%f12,%f14
	st	%f14,[%o1]

	andcc	%l2,2,%g0
	fmovdnz	%icc,%f20,%f22
	st	%f22,[%o2]

	andcc	%l3,2,%g0
	fmovdnz	%icc,%f28,%f30
	st	%f30,[%o3]

	addcc	%i0,-1,%i0
	bg,pt	%icc,.loop0
! delay slot
	st	%f7,[%o0+4]

	ba,pt	%icc,.end
! delay slot
	nop

	.align	16
.case1:
	fmuld	%f24,pp3,%f30		! sin(x3)

	fmuld	%f0,qq3,%f6		! cos(x0)

	fmuld	%f8,qq3,%f14		! cos(x1)

	fmuld	%f16,qq3,%f22		! cos(x2)

	faddd	%f30,pp2,%f30
	fmuld	%f24,qq2,%f28

	faddd	%f6,qq2,%f6
	fmuld	%f0,pp2,%f4

	faddd	%f14,qq2,%f14
	fmuld	%f8,pp2,%f12

	faddd	%f22,qq2,%f22
	fmuld	%f16,pp2,%f20

	fmuld	%f24,%f30,%f30
	faddd	%f28,qq1,%f28

	fmuld	%f0,%f6,%f6
	faddd	%f4,pp1,%f4

	fmuld	%f8,%f14,%f14
	faddd	%f12,pp1,%f12

	fmuld	%f16,%f22,%f22
	faddd	%f20,pp1,%f20

	faddd	%f30,pp1,%f30
	fmuld	%f24,%f28,%f28
	add	%l7,%g1,%l7

	faddd	%f6,qq1,%f6
	fmuld	%f0,%f4,%f4
	add	%l4,%g1,%l4

	faddd	%f14,qq1,%f14
	fmuld	%f8,%f12,%f12
	add	%l5,%g1,%l5

	faddd	%f22,qq1,%f22
	fmuld	%f16,%f20,%f20
	add	%l6,%g1,%l6

	fmuld	%f24,%f30,%f30

	fmuld	%f2,%f4,%f4

	fmuld	%f10,%f12,%f12

	fmuld	%f18,%f20,%f20

	fmuld	%f26,%f30,%f30
	ldd	[%l7+8],%f24

	fmuld	%f0,%f6,%f6
	faddd	%f4,%f32,%f4
	ldd	[%l4+16],%f0

	fmuld	%f8,%f14,%f14
	faddd	%f12,%f34,%f12
	ldd	[%l5+16],%f8

	fmuld	%f16,%f22,%f22
	faddd	%f20,%f36,%f20
	ldd	[%l6+16],%f16

	fmuld	%f24,%f28,%f28
	faddd	%f38,%f30,%f30

	fmuld	%f0,%f6,%f6
	faddd	%f4,%f2,%f4
	ldd	[%l4+8],%f32

	fmuld	%f8,%f14,%f14
	faddd	%f12,%f10,%f12
	ldd	[%l5+8],%f34

	fmuld	%f16,%f22,%f22
	faddd	%f20,%f18,%f20
	ldd	[%l6+8],%f36

	faddd	%f26,%f30,%f30
	ldd	[%l7+16],%f38

	fmuld	%f32,%f4,%f4

	fmuld	%f34,%f12,%f12

	fmuld	%f36,%f20,%f20

	fmuld	%f38,%f30,%f30

	fsubd	%f6,%f4,%f6

	fsubd	%f14,%f12,%f14

	fsubd	%f22,%f20,%f22

	faddd	%f30,%f28,%f30

	faddd	%f6,%f0,%f6

	faddd	%f14,%f8,%f14

	faddd	%f22,%f16,%f22

	faddd	%f30,%f24,%f30
	mov	%l0,%l4

	fnegd	%f6,%f4
	lda	[%i1]%asi,%l0		! preload next argument

	fnegd	%f14,%f12
	lda	[%i1]%asi,%f0

	fnegd	%f22,%f20
	lda	[%i1+4]%asi,%f3

	fnegd	%f30,%f28
	andn	%l0,%i5,%l0
	add	%i1,%i2,%i1

	andcc	%l4,2,%g0
	fmovdnz	%icc,%f4,%f6
	st	%f6,[%o0]

	andcc	%l1,2,%g0
	fmovdnz	%icc,%f12,%f14
	st	%f14,[%o1]

	andcc	%l2,2,%g0
	fmovdnz	%icc,%f20,%f22
	st	%f22,[%o2]

	andcc	%l3,2,%g0
	fmovdnz	%icc,%f28,%f30
	st	%f30,[%o3]

	addcc	%i0,-1,%i0
	bg,pt	%icc,.loop0
! delay slot
	st	%f7,[%o0+4]

	ba,pt	%icc,.end
! delay slot
	nop

	.align	16
.case2:
	fmuld	%f26,%f26,%f24
	andcc	%l3,1,%g0
	bz,pn	%icc,.case3
! delay slot
	fxor	%f30,%f38,%f38

	fmuld	%f16,pp3,%f22		! sin(x2)

	fmuld	%f0,qq3,%f6		! cos(x0)

	fmuld	%f8,qq3,%f14		! cos(x1)

	faddd	%f22,pp2,%f22
	fmuld	%f16,qq2,%f20

	fmuld	%f24,qq3,%f30		! cos(x3)

	faddd	%f6,qq2,%f6
	fmuld	%f0,pp2,%f4

	faddd	%f14,qq2,%f14
	fmuld	%f8,pp2,%f12

	fmuld	%f16,%f22,%f22
	faddd	%f20,qq1,%f20

	faddd	%f30,qq2,%f30
	fmuld	%f24,pp2,%f28

	fmuld	%f0,%f6,%f6
	faddd	%f4,pp1,%f4

	fmuld	%f8,%f14,%f14
	faddd	%f12,pp1,%f12

	faddd	%f22,pp1,%f22
	fmuld	%f16,%f20,%f20
	add	%l6,%g1,%l6

	fmuld	%f24,%f30,%f30
	faddd	%f28,pp1,%f28

	faddd	%f6,qq1,%f6
	fmuld	%f0,%f4,%f4
	add	%l4,%g1,%l4

	faddd	%f14,qq1,%f14
	fmuld	%f8,%f12,%f12
	add	%l5,%g1,%l5

	fmuld	%f16,%f22,%f22

	faddd	%f30,qq1,%f30
	fmuld	%f24,%f28,%f28
	add	%l7,%g1,%l7

	fmuld	%f2,%f4,%f4

	fmuld	%f10,%f12,%f12

	fmuld	%f18,%f22,%f22
	ldd	[%l6+8],%f16

	fmuld	%f26,%f28,%f28

	fmuld	%f0,%f6,%f6
	faddd	%f4,%f32,%f4
	ldd	[%l4+16],%f0

	fmuld	%f8,%f14,%f14
	faddd	%f12,%f34,%f12
	ldd	[%l5+16],%f8

	fmuld	%f16,%f20,%f20
	faddd	%f36,%f22,%f22

	fmuld	%f24,%f30,%f30
	faddd	%f28,%f38,%f28
	ldd	[%l7+16],%f24

	fmuld	%f0,%f6,%f6
	faddd	%f4,%f2,%f4
	ldd	[%l4+8],%f32

	fmuld	%f8,%f14,%f14
	faddd	%f12,%f10,%f12
	ldd	[%l5+8],%f34

	faddd	%f18,%f22,%f22
	ldd	[%l6+16],%f36

	fmuld	%f24,%f30,%f30
	faddd	%f28,%f26,%f28
	ldd	[%l7+8],%f38

	fmuld	%f32,%f4,%f4

	fmuld	%f34,%f12,%f12

	fmuld	%f36,%f22,%f22

	fmuld	%f38,%f28,%f28

	fsubd	%f6,%f4,%f6

	fsubd	%f14,%f12,%f14

	faddd	%f22,%f20,%f22

	fsubd	%f30,%f28,%f30

	faddd	%f6,%f0,%f6

	faddd	%f14,%f8,%f14

	faddd	%f22,%f16,%f22

	faddd	%f30,%f24,%f30
	mov	%l0,%l4

	fnegd	%f6,%f4
	lda	[%i1]%asi,%l0		! preload next argument

	fnegd	%f14,%f12
	lda	[%i1]%asi,%f0

	fnegd	%f22,%f20
	lda	[%i1+4]%asi,%f3

	fnegd	%f30,%f28
	andn	%l0,%i5,%l0
	add	%i1,%i2,%i1

	andcc	%l4,2,%g0
	fmovdnz	%icc,%f4,%f6
	st	%f6,[%o0]

	andcc	%l1,2,%g0
	fmovdnz	%icc,%f12,%f14
	st	%f14,[%o1]

	andcc	%l2,2,%g0
	fmovdnz	%icc,%f20,%f22
	st	%f22,[%o2]

	andcc	%l3,2,%g0
	fmovdnz	%icc,%f28,%f30
	st	%f30,[%o3]

	addcc	%i0,-1,%i0
	bg,pt	%icc,.loop0
! delay slot
	st	%f7,[%o0+4]

	ba,pt	%icc,.end
! delay slot
	nop

	.align	16
.case3:
	fmuld	%f16,pp3,%f22		! sin(x2)

	fmuld	%f24,pp3,%f30		! sin(x3)

	fmuld	%f0,qq3,%f6		! cos(x0)

	fmuld	%f8,qq3,%f14		! cos(x1)

	faddd	%f22,pp2,%f22
	fmuld	%f16,qq2,%f20

	faddd	%f30,pp2,%f30
	fmuld	%f24,qq2,%f28

	faddd	%f6,qq2,%f6
	fmuld	%f0,pp2,%f4

	faddd	%f14,qq2,%f14
	fmuld	%f8,pp2,%f12

	fmuld	%f16,%f22,%f22
	faddd	%f20,qq1,%f20

	fmuld	%f24,%f30,%f30
	faddd	%f28,qq1,%f28

	fmuld	%f0,%f6,%f6
	faddd	%f4,pp1,%f4

	fmuld	%f8,%f14,%f14
	faddd	%f12,pp1,%f12

	faddd	%f22,pp1,%f22
	fmuld	%f16,%f20,%f20
	add	%l6,%g1,%l6

	faddd	%f30,pp1,%f30
	fmuld	%f24,%f28,%f28
	add	%l7,%g1,%l7

	faddd	%f6,qq1,%f6
	fmuld	%f0,%f4,%f4
	add	%l4,%g1,%l4

	faddd	%f14,qq1,%f14
	fmuld	%f8,%f12,%f12
	add	%l5,%g1,%l5

	fmuld	%f16,%f22,%f22

	fmuld	%f24,%f30,%f30

	fmuld	%f2,%f4,%f4

	fmuld	%f10,%f12,%f12

	fmuld	%f18,%f22,%f22
	ldd	[%l6+8],%f16

	fmuld	%f26,%f30,%f30
	ldd	[%l7+8],%f24

	fmuld	%f0,%f6,%f6
	faddd	%f4,%f32,%f4
	ldd	[%l4+16],%f0

	fmuld	%f8,%f14,%f14
	faddd	%f12,%f34,%f12
	ldd	[%l5+16],%f8

	fmuld	%f16,%f20,%f20
	faddd	%f36,%f22,%f22

	fmuld	%f24,%f28,%f28
	faddd	%f38,%f30,%f30

	fmuld	%f0,%f6,%f6
	faddd	%f4,%f2,%f4
	ldd	[%l4+8],%f32

	fmuld	%f8,%f14,%f14
	faddd	%f12,%f10,%f12
	ldd	[%l5+8],%f34

	faddd	%f18,%f22,%f22
	ldd	[%l6+16],%f36

	faddd	%f26,%f30,%f30
	ldd	[%l7+16],%f38

	fmuld	%f32,%f4,%f4

	fmuld	%f34,%f12,%f12

	fmuld	%f36,%f22,%f22

	fmuld	%f38,%f30,%f30

	fsubd	%f6,%f4,%f6

	fsubd	%f14,%f12,%f14

	faddd	%f22,%f20,%f22

	faddd	%f30,%f28,%f30

	faddd	%f6,%f0,%f6

	faddd	%f14,%f8,%f14

	faddd	%f22,%f16,%f22

	faddd	%f30,%f24,%f30
	mov	%l0,%l4

	fnegd	%f6,%f4
	lda	[%i1]%asi,%l0		! preload next argument

	fnegd	%f14,%f12
	lda	[%i1]%asi,%f0

	fnegd	%f22,%f20
	lda	[%i1+4]%asi,%f3

	fnegd	%f30,%f28
	andn	%l0,%i5,%l0
	add	%i1,%i2,%i1

	andcc	%l4,2,%g0
	fmovdnz	%icc,%f4,%f6
	st	%f6,[%o0]

	andcc	%l1,2,%g0
	fmovdnz	%icc,%f12,%f14
	st	%f14,[%o1]

	andcc	%l2,2,%g0
	fmovdnz	%icc,%f20,%f22
	st	%f22,[%o2]

	andcc	%l3,2,%g0
	fmovdnz	%icc,%f28,%f30
	st	%f30,[%o3]

	addcc	%i0,-1,%i0
	bg,pt	%icc,.loop0
! delay slot
	st	%f7,[%o0+4]

	ba,pt	%icc,.end
! delay slot
	nop

	.align	16
.case4:
	fmuld	%f18,%f18,%f16
	andcc	%l2,1,%g0
	bz,pn	%icc,.case6
! delay slot
	fxor	%f22,%f36,%f36

	fmuld	%f26,%f26,%f24
	andcc	%l3,1,%g0
	bz,pn	%icc,.case5
! delay slot
	fxor	%f30,%f38,%f38

	fmuld	%f8,pp3,%f14		! sin(x1)

	fmuld	%f0,qq3,%f6		! cos(x0)

	faddd	%f14,pp2,%f14
	fmuld	%f8,qq2,%f12

	fmuld	%f16,qq3,%f22		! cos(x2)

	fmuld	%f24,qq3,%f30		! cos(x3)

	faddd	%f6,qq2,%f6
	fmuld	%f0,pp2,%f4

	fmuld	%f8,%f14,%f14
	faddd	%f12,qq1,%f12

	faddd	%f22,qq2,%f22
	fmuld	%f16,pp2,%f20

	faddd	%f30,qq2,%f30
	fmuld	%f24,pp2,%f28

	fmuld	%f0,%f6,%f6
	faddd	%f4,pp1,%f4

	faddd	%f14,pp1,%f14
	fmuld	%f8,%f12,%f12
	add	%l5,%g1,%l5

	fmuld	%f16,%f22,%f22
	faddd	%f20,pp1,%f20

	fmuld	%f24,%f30,%f30
	faddd	%f28,pp1,%f28

	faddd	%f6,qq1,%f6
	fmuld	%f0,%f4,%f4
	add	%l4,%g1,%l4

	fmuld	%f8,%f14,%f14

	faddd	%f22,qq1,%f22
	fmuld	%f16,%f20,%f20
	add	%l6,%g1,%l6

	faddd	%f30,qq1,%f30
	fmuld	%f24,%f28,%f28
	add	%l7,%g1,%l7

	fmuld	%f2,%f4,%f4

	fmuld	%f10,%f14,%f14
	ldd	[%l5+8],%f8

	fmuld	%f18,%f20,%f20

	fmuld	%f26,%f28,%f28

	fmuld	%f0,%f6,%f6
	faddd	%f4,%f32,%f4
	ldd	[%l4+16],%f0

	fmuld	%f8,%f12,%f12
	faddd	%f34,%f14,%f14

	fmuld	%f16,%f22,%f22
	faddd	%f20,%f36,%f20
	ldd	[%l6+16],%f16

	fmuld	%f24,%f30,%f30
	faddd	%f28,%f38,%f28
	ldd	[%l7+16],%f24

	fmuld	%f0,%f6,%f6
	faddd	%f4,%f2,%f4
	ldd	[%l4+8],%f32

	faddd	%f10,%f14,%f14
	ldd	[%l5+16],%f34

	fmuld	%f16,%f22,%f22
	faddd	%f20,%f18,%f20
	ldd	[%l6+8],%f36

	fmuld	%f24,%f30,%f30
	faddd	%f28,%f26,%f28
	ldd	[%l7+8],%f38

	fmuld	%f32,%f4,%f4

	fmuld	%f34,%f14,%f14

	fmuld	%f36,%f20,%f20

	fmuld	%f38,%f28,%f28

	fsubd	%f6,%f4,%f6

	faddd	%f14,%f12,%f14

	fsubd	%f22,%f20,%f22

	fsubd	%f30,%f28,%f30

	faddd	%f6,%f0,%f6

	faddd	%f14,%f8,%f14

	faddd	%f22,%f16,%f22

	faddd	%f30,%f24,%f30
	mov	%l0,%l4

	fnegd	%f6,%f4
	lda	[%i1]%asi,%l0		! preload next argument

	fnegd	%f14,%f12
	lda	[%i1]%asi,%f0

	fnegd	%f22,%f20
	lda	[%i1+4]%asi,%f3

	fnegd	%f30,%f28
	andn	%l0,%i5,%l0
	add	%i1,%i2,%i1

	andcc	%l4,2,%g0
	fmovdnz	%icc,%f4,%f6
	st	%f6,[%o0]

	andcc	%l1,2,%g0
	fmovdnz	%icc,%f12,%f14
	st	%f14,[%o1]

	andcc	%l2,2,%g0
	fmovdnz	%icc,%f20,%f22
	st	%f22,[%o2]

	andcc	%l3,2,%g0
	fmovdnz	%icc,%f28,%f30
	st	%f30,[%o3]

	addcc	%i0,-1,%i0
	bg,pt	%icc,.loop0
! delay slot
	st	%f7,[%o0+4]

	ba,pt	%icc,.end
! delay slot
	nop

	.align	16
.case5:
	fmuld	%f8,pp3,%f14		! sin(x1)

	fmuld	%f24,pp3,%f30		! sin(x3)

	fmuld	%f0,qq3,%f6		! cos(x0)

	faddd	%f14,pp2,%f14
	fmuld	%f8,qq2,%f12

	fmuld	%f16,qq3,%f22		! cos(x2)

	faddd	%f30,pp2,%f30
	fmuld	%f24,qq2,%f28

	faddd	%f6,qq2,%f6
	fmuld	%f0,pp2,%f4

	fmuld	%f8,%f14,%f14
	faddd	%f12,qq1,%f12

	faddd	%f22,qq2,%f22
	fmuld	%f16,pp2,%f20

	fmuld	%f24,%f30,%f30
	faddd	%f28,qq1,%f28

	fmuld	%f0,%f6,%f6
	faddd	%f4,pp1,%f4

	faddd	%f14,pp1,%f14
	fmuld	%f8,%f12,%f12
	add	%l5,%g1,%l5

	fmuld	%f16,%f22,%f22
	faddd	%f20,pp1,%f20

	faddd	%f30,pp1,%f30
	fmuld	%f24,%f28,%f28
	add	%l7,%g1,%l7

	faddd	%f6,qq1,%f6
	fmuld	%f0,%f4,%f4
	add	%l4,%g1,%l4

	fmuld	%f8,%f14,%f14

	faddd	%f22,qq1,%f22
	fmuld	%f16,%f20,%f20
	add	%l6,%g1,%l6

	fmuld	%f24,%f30,%f30

	fmuld	%f2,%f4,%f4

	fmuld	%f10,%f14,%f14
	ldd	[%l5+8],%f8

	fmuld	%f18,%f20,%f20

	fmuld	%f26,%f30,%f30
	ldd	[%l7+8],%f24

	fmuld	%f0,%f6,%f6
	faddd	%f4,%f32,%f4
	ldd	[%l4+16],%f0

	fmuld	%f8,%f12,%f12
	faddd	%f34,%f14,%f14

	fmuld	%f16,%f22,%f22
	faddd	%f20,%f36,%f20
	ldd	[%l6+16],%f16

	fmuld	%f24,%f28,%f28
	faddd	%f38,%f30,%f30

	fmuld	%f0,%f6,%f6
	faddd	%f4,%f2,%f4
	ldd	[%l4+8],%f32

	faddd	%f10,%f14,%f14
	ldd	[%l5+16],%f34

	fmuld	%f16,%f22,%f22
	faddd	%f20,%f18,%f20
	ldd	[%l6+8],%f36

	faddd	%f26,%f30,%f30
	ldd	[%l7+16],%f38

	fmuld	%f32,%f4,%f4

	fmuld	%f34,%f14,%f14

	fmuld	%f36,%f20,%f20

	fmuld	%f38,%f30,%f30

	fsubd	%f6,%f4,%f6

	faddd	%f14,%f12,%f14

	fsubd	%f22,%f20,%f22

	faddd	%f30,%f28,%f30

	faddd	%f6,%f0,%f6

	faddd	%f14,%f8,%f14

	faddd	%f22,%f16,%f22

	faddd	%f30,%f24,%f30
	mov	%l0,%l4

	fnegd	%f6,%f4
	lda	[%i1]%asi,%l0		! preload next argument

	fnegd	%f14,%f12
	lda	[%i1]%asi,%f0

	fnegd	%f22,%f20
	lda	[%i1+4]%asi,%f3

	fnegd	%f30,%f28
	andn	%l0,%i5,%l0
	add	%i1,%i2,%i1

	andcc	%l4,2,%g0
	fmovdnz	%icc,%f4,%f6
	st	%f6,[%o0]

	andcc	%l1,2,%g0
	fmovdnz	%icc,%f12,%f14
	st	%f14,[%o1]

	andcc	%l2,2,%g0
	fmovdnz	%icc,%f20,%f22
	st	%f22,[%o2]

	andcc	%l3,2,%g0
	fmovdnz	%icc,%f28,%f30
	st	%f30,[%o3]

	addcc	%i0,-1,%i0
	bg,pt	%icc,.loop0
! delay slot
	st	%f7,[%o0+4]

	ba,pt	%icc,.end
! delay slot
	nop

	.align	16
.case6:
	fmuld	%f26,%f26,%f24
	andcc	%l3,1,%g0
	bz,pn	%icc,.case7
! delay slot
	fxor	%f30,%f38,%f38

	fmuld	%f8,pp3,%f14		! sin(x1)

	fmuld	%f16,pp3,%f22		! sin(x2)

	fmuld	%f0,qq3,%f6		! cos(x0)

	faddd	%f14,pp2,%f14
	fmuld	%f8,qq2,%f12

	faddd	%f22,pp2,%f22
	fmuld	%f16,qq2,%f20

	fmuld	%f24,qq3,%f30		! cos(x3)

	faddd	%f6,qq2,%f6
	fmuld	%f0,pp2,%f4

	fmuld	%f8,%f14,%f14
	faddd	%f12,qq1,%f12

	fmuld	%f16,%f22,%f22
	faddd	%f20,qq1,%f20

	faddd	%f30,qq2,%f30
	fmuld	%f24,pp2,%f28

	fmuld	%f0,%f6,%f6
	faddd	%f4,pp1,%f4

	faddd	%f14,pp1,%f14
	fmuld	%f8,%f12,%f12
	add	%l5,%g1,%l5

	faddd	%f22,pp1,%f22
	fmuld	%f16,%f20,%f20
	add	%l6,%g1,%l6

	fmuld	%f24,%f30,%f30
	faddd	%f28,pp1,%f28

	faddd	%f6,qq1,%f6
	fmuld	%f0,%f4,%f4
	add	%l4,%g1,%l4

	fmuld	%f8,%f14,%f14

	fmuld	%f16,%f22,%f22

	faddd	%f30,qq1,%f30
	fmuld	%f24,%f28,%f28
	add	%l7,%g1,%l7

	fmuld	%f2,%f4,%f4

	fmuld	%f10,%f14,%f14
	ldd	[%l5+8],%f8

	fmuld	%f18,%f22,%f22
	ldd	[%l6+8],%f16

	fmuld	%f26,%f28,%f28

	fmuld	%f0,%f6,%f6
	faddd	%f4,%f32,%f4
	ldd	[%l4+16],%f0

	fmuld	%f8,%f12,%f12
	faddd	%f34,%f14,%f14

	fmuld	%f16,%f20,%f20
	faddd	%f36,%f22,%f22

	fmuld	%f24,%f30,%f30
	faddd	%f28,%f38,%f28
	ldd	[%l7+16],%f24

	fmuld	%f0,%f6,%f6
	faddd	%f4,%f2,%f4
	ldd	[%l4+8],%f32

	faddd	%f10,%f14,%f14
	ldd	[%l5+16],%f34

	faddd	%f18,%f22,%f22
	ldd	[%l6+16],%f36

	fmuld	%f24,%f30,%f30
	faddd	%f28,%f26,%f28
	ldd	[%l7+8],%f38

	fmuld	%f32,%f4,%f4

	fmuld	%f34,%f14,%f14

	fmuld	%f36,%f22,%f22

	fmuld	%f38,%f28,%f28

	fsubd	%f6,%f4,%f6

	faddd	%f14,%f12,%f14

	faddd	%f22,%f20,%f22

	fsubd	%f30,%f28,%f30

	faddd	%f6,%f0,%f6

	faddd	%f14,%f8,%f14

	faddd	%f22,%f16,%f22

	faddd	%f30,%f24,%f30
	mov	%l0,%l4

	fnegd	%f6,%f4
	lda	[%i1]%asi,%l0		! preload next argument

	fnegd	%f14,%f12
	lda	[%i1]%asi,%f0

	fnegd	%f22,%f20
	lda	[%i1+4]%asi,%f3

	fnegd	%f30,%f28
	andn	%l0,%i5,%l0
	add	%i1,%i2,%i1

	andcc	%l4,2,%g0
	fmovdnz	%icc,%f4,%f6
	st	%f6,[%o0]

	andcc	%l1,2,%g0
	fmovdnz	%icc,%f12,%f14
	st	%f14,[%o1]

	andcc	%l2,2,%g0
	fmovdnz	%icc,%f20,%f22
	st	%f22,[%o2]

	andcc	%l3,2,%g0
	fmovdnz	%icc,%f28,%f30
	st	%f30,[%o3]

	addcc	%i0,-1,%i0
	bg,pt	%icc,.loop0
! delay slot
	st	%f7,[%o0+4]

	ba,pt	%icc,.end
! delay slot
	nop

	.align	16
.case7:
	fmuld	%f8,pp3,%f14		! sin(x1)

	fmuld	%f16,pp3,%f22		! sin(x2)

	fmuld	%f24,pp3,%f30		! sin(x3)

	fmuld	%f0,qq3,%f6		! cos(x0)

	faddd	%f14,pp2,%f14
	fmuld	%f8,qq2,%f12

	faddd	%f22,pp2,%f22
	fmuld	%f16,qq2,%f20

	faddd	%f30,pp2,%f30
	fmuld	%f24,qq2,%f28

	faddd	%f6,qq2,%f6
	fmuld	%f0,pp2,%f4

	fmuld	%f8,%f14,%f14
	faddd	%f12,qq1,%f12

	fmuld	%f16,%f22,%f22
	faddd	%f20,qq1,%f20

	fmuld	%f24,%f30,%f30
	faddd	%f28,qq1,%f28

	fmuld	%f0,%f6,%f6
	faddd	%f4,pp1,%f4

	faddd	%f14,pp1,%f14
	fmuld	%f8,%f12,%f12
	add	%l5,%g1,%l5

	faddd	%f22,pp1,%f22
	fmuld	%f16,%f20,%f20
	add	%l6,%g1,%l6

	faddd	%f30,pp1,%f30
	fmuld	%f24,%f28,%f28
	add	%l7,%g1,%l7

	faddd	%f6,qq1,%f6
	fmuld	%f0,%f4,%f4
	add	%l4,%g1,%l4

	fmuld	%f8,%f14,%f14

	fmuld	%f16,%f22,%f22

	fmuld	%f24,%f30,%f30

	fmuld	%f2,%f4,%f4

	fmuld	%f10,%f14,%f14
	ldd	[%l5+8],%f8

	fmuld	%f18,%f22,%f22
	ldd	[%l6+8],%f16

	fmuld	%f26,%f30,%f30
	ldd	[%l7+8],%f24

	fmuld	%f0,%f6,%f6
	faddd	%f4,%f32,%f4
	ldd	[%l4+16],%f0

	fmuld	%f8,%f12,%f12
	faddd	%f34,%f14,%f14

	fmuld	%f16,%f20,%f20
	faddd	%f36,%f22,%f22

	fmuld	%f24,%f28,%f28
	faddd	%f38,%f30,%f30

	fmuld	%f0,%f6,%f6
	faddd	%f4,%f2,%f4
	ldd	[%l4+8],%f32

	faddd	%f10,%f14,%f14
	ldd	[%l5+16],%f34

	faddd	%f18,%f22,%f22
	ldd	[%l6+16],%f36

	faddd	%f26,%f30,%f30
	ldd	[%l7+16],%f38

	fmuld	%f32,%f4,%f4

	fmuld	%f34,%f14,%f14

	fmuld	%f36,%f22,%f22

	fmuld	%f38,%f30,%f30

	fsubd	%f6,%f4,%f6

	faddd	%f14,%f12,%f14

	faddd	%f22,%f20,%f22

	faddd	%f30,%f28,%f30

	faddd	%f6,%f0,%f6

	faddd	%f14,%f8,%f14

	faddd	%f22,%f16,%f22

	faddd	%f30,%f24,%f30
	mov	%l0,%l4

	fnegd	%f6,%f4
	lda	[%i1]%asi,%l0		! preload next argument

	fnegd	%f14,%f12
	lda	[%i1]%asi,%f0

	fnegd	%f22,%f20
	lda	[%i1+4]%asi,%f3

	fnegd	%f30,%f28
	andn	%l0,%i5,%l0
	add	%i1,%i2,%i1

	andcc	%l4,2,%g0
	fmovdnz	%icc,%f4,%f6
	st	%f6,[%o0]

	andcc	%l1,2,%g0
	fmovdnz	%icc,%f12,%f14
	st	%f14,[%o1]

	andcc	%l2,2,%g0
	fmovdnz	%icc,%f20,%f22
	st	%f22,[%o2]

	andcc	%l3,2,%g0
	fmovdnz	%icc,%f28,%f30
	st	%f30,[%o3]

	addcc	%i0,-1,%i0
	bg,pt	%icc,.loop0
! delay slot
	st	%f7,[%o0+4]

	ba,pt	%icc,.end
! delay slot
	nop

	.align	16
.case8:
	fmuld	%f10,%f10,%f8
	andcc	%l1,1,%g0
	bz,pn	%icc,.case12
! delay slot
	fxor	%f14,%f34,%f34

	fmuld	%f18,%f18,%f16
	andcc	%l2,1,%g0
	bz,pn	%icc,.case10
! delay slot
	fxor	%f22,%f36,%f36

	fmuld	%f26,%f26,%f24
	andcc	%l3,1,%g0
	bz,pn	%icc,.case9
! delay slot
	fxor	%f30,%f38,%f38

	fmuld	%f0,pp3,%f6		! sin(x0)

	faddd	%f6,pp2,%f6
	fmuld	%f0,qq2,%f4

	fmuld	%f8,qq3,%f14		! cos(x1)

	fmuld	%f16,qq3,%f22		! cos(x2)

	fmuld	%f24,qq3,%f30		! cos(x3)

	fmuld	%f0,%f6,%f6
	faddd	%f4,qq1,%f4

	faddd	%f14,qq2,%f14
	fmuld	%f8,pp2,%f12

	faddd	%f22,qq2,%f22
	fmuld	%f16,pp2,%f20

	faddd	%f30,qq2,%f30
	fmuld	%f24,pp2,%f28

	faddd	%f6,pp1,%f6
	fmuld	%f0,%f4,%f4
	add	%l4,%g1,%l4

	fmuld	%f8,%f14,%f14
	faddd	%f12,pp1,%f12

	fmuld	%f16,%f22,%f22
	faddd	%f20,pp1,%f20

	fmuld	%f24,%f30,%f30
	faddd	%f28,pp1,%f28

	fmuld	%f0,%f6,%f6

	faddd	%f14,qq1,%f14
	fmuld	%f8,%f12,%f12
	add	%l5,%g1,%l5

	faddd	%f22,qq1,%f22
	fmuld	%f16,%f20,%f20
	add	%l6,%g1,%l6

	faddd	%f30,qq1,%f30
	fmuld	%f24,%f28,%f28
	add	%l7,%g1,%l7

	fmuld	%f2,%f6,%f6
	ldd	[%l4+8],%f0

	fmuld	%f10,%f12,%f12

	fmuld	%f18,%f20,%f20

	fmuld	%f26,%f28,%f28

	fmuld	%f0,%f4,%f4
	faddd	%f32,%f6,%f6

	fmuld	%f8,%f14,%f14
	faddd	%f12,%f34,%f12
	ldd	[%l5+16],%f8

	fmuld	%f16,%f22,%f22
	faddd	%f20,%f36,%f20
	ldd	[%l6+16],%f16

	fmuld	%f24,%f30,%f30
	faddd	%f28,%f38,%f28
	ldd	[%l7+16],%f24

	faddd	%f2,%f6,%f6
	ldd	[%l4+16],%f32

	fmuld	%f8,%f14,%f14
	faddd	%f12,%f10,%f12
	ldd	[%l5+8],%f34

	fmuld	%f16,%f22,%f22
	faddd	%f20,%f18,%f20
	ldd	[%l6+8],%f36

	fmuld	%f24,%f30,%f30
	faddd	%f28,%f26,%f28
	ldd	[%l7+8],%f38

	fmuld	%f32,%f6,%f6

	fmuld	%f34,%f12,%f12

	fmuld	%f36,%f20,%f20

	fmuld	%f38,%f28,%f28

	faddd	%f6,%f4,%f6

	fsubd	%f14,%f12,%f14

	fsubd	%f22,%f20,%f22

	fsubd	%f30,%f28,%f30

	faddd	%f6,%f0,%f6

	faddd	%f14,%f8,%f14

	faddd	%f22,%f16,%f22

	faddd	%f30,%f24,%f30
	mov	%l0,%l4

	fnegd	%f6,%f4
	lda	[%i1]%asi,%l0		! preload next argument

	fnegd	%f14,%f12
	lda	[%i1]%asi,%f0

	fnegd	%f22,%f20
	lda	[%i1+4]%asi,%f3

	fnegd	%f30,%f28
	andn	%l0,%i5,%l0
	add	%i1,%i2,%i1

	andcc	%l4,2,%g0
	fmovdnz	%icc,%f4,%f6
	st	%f6,[%o0]

	andcc	%l1,2,%g0
	fmovdnz	%icc,%f12,%f14
	st	%f14,[%o1]

	andcc	%l2,2,%g0
	fmovdnz	%icc,%f20,%f22
	st	%f22,[%o2]

	andcc	%l3,2,%g0
	fmovdnz	%icc,%f28,%f30
	st	%f30,[%o3]

	addcc	%i0,-1,%i0
	bg,pt	%icc,.loop0
! delay slot
	st	%f7,[%o0+4]

	ba,pt	%icc,.end
! delay slot
	nop

	.align	16
! .case9: evaluate one group of four elements in which x0 and x3 take
! the sin path and x1 and x2 take the cos path (see the annotations on
! the first multiply of each element).  The four evaluations are
! interleaved instruction by instruction; the order is significant.
! The setup of %f0-%f38, %l0-%l7 and %g1 happens before this chunk.
.case9:
! Polynomial stage: sin-path elements start with pp3 and also build a
! qq2/qq1 term; cos-path elements start with qq3 and also build a
! pp2/pp1 term.
	fmuld	%f0,pp3,%f6		! sin(x0)

	fmuld	%f24,pp3,%f30		! sin(x3)

	faddd	%f6,pp2,%f6
	fmuld	%f0,qq2,%f4

	fmuld	%f8,qq3,%f14		! cos(x1)

	fmuld	%f16,qq3,%f22		! cos(x2)

	faddd	%f30,pp2,%f30
	fmuld	%f24,qq2,%f28

	fmuld	%f0,%f6,%f6
	faddd	%f4,qq1,%f4

	faddd	%f14,qq2,%f14
	fmuld	%f8,pp2,%f12

	faddd	%f22,qq2,%f22
	fmuld	%f16,pp2,%f20

	fmuld	%f24,%f30,%f30
	faddd	%f28,qq1,%f28

! The integer adds below turn %l4-%l7 into load addresses by adding
! %g1 (presumably a table base plus per-element offsets; TODO confirm
! against the code that sets them up).
	faddd	%f6,pp1,%f6
	fmuld	%f0,%f4,%f4
	add	%l4,%g1,%l4

	fmuld	%f8,%f14,%f14
	faddd	%f12,pp1,%f12

	fmuld	%f16,%f22,%f22
	faddd	%f20,pp1,%f20

	faddd	%f30,pp1,%f30
	fmuld	%f24,%f28,%f28
	add	%l7,%g1,%l7

	fmuld	%f0,%f6,%f6

	faddd	%f14,qq1,%f14
	fmuld	%f8,%f12,%f12
	add	%l5,%g1,%l5

	faddd	%f22,qq1,%f22
	fmuld	%f16,%f20,%f20
	add	%l6,%g1,%l6

	fmuld	%f24,%f30,%f30

! Combination stage: each element loads the doublewords at offsets 8
! and 16 from its address.  Sin-path elements read offset 8 first and
! finish with an add; cos-path elements read offset 16 first and
! finish with a subtract.
	fmuld	%f2,%f6,%f6
	ldd	[%l4+8],%f0

	fmuld	%f10,%f12,%f12

	fmuld	%f18,%f20,%f20

	fmuld	%f26,%f30,%f30
	ldd	[%l7+8],%f24

	fmuld	%f0,%f4,%f4
	faddd	%f32,%f6,%f6

	fmuld	%f8,%f14,%f14
	faddd	%f12,%f34,%f12
	ldd	[%l5+16],%f8

	fmuld	%f16,%f22,%f22
	faddd	%f20,%f36,%f20
	ldd	[%l6+16],%f16

	fmuld	%f24,%f28,%f28
	faddd	%f38,%f30,%f30

	faddd	%f2,%f6,%f6
	ldd	[%l4+16],%f32

	fmuld	%f8,%f14,%f14
	faddd	%f12,%f10,%f12
	ldd	[%l5+8],%f34

	fmuld	%f16,%f22,%f22
	faddd	%f20,%f18,%f20
	ldd	[%l6+8],%f36

	faddd	%f26,%f30,%f30
	ldd	[%l7+16],%f38

	fmuld	%f32,%f6,%f6

	fmuld	%f34,%f12,%f12

	fmuld	%f36,%f20,%f20

	fmuld	%f38,%f30,%f30

	faddd	%f6,%f4,%f6

	fsubd	%f14,%f12,%f14

	fsubd	%f22,%f20,%f22

	faddd	%f30,%f28,%f30

! Final add of the value loaded above; the four results are now in
! %f6, %f14, %f22 and %f30.
	faddd	%f6,%f0,%f6

	faddd	%f14,%f8,%f14

	faddd	%f22,%f16,%f22

	faddd	%f30,%f24,%f30
! keep the old %l0 in %l4; the preload below overwrites %l0
	mov	%l0,%l4

! Form negated copies of the results while fetching the next x: its
! high word goes to %l0 and %f0, its low word to %f3.
	fnegd	%f6,%f4
	lda	[%i1]%asi,%l0		! preload next argument

	fnegd	%f14,%f12
	lda	[%i1]%asi,%f0

	fnegd	%f22,%f20
	lda	[%i1+4]%asi,%f3

	fnegd	%f30,%f28
! clear the %i5 bits from the new high word and step the x pointer by
! %i2 (the range handlers annotate these as the sign mask and stridex)
	andn	%l0,%i5,%l0
	add	%i1,%i2,%i1

! For each element: if bit 1 of its integer register is set, replace
! the result with its negated copy, then store the high word.
	andcc	%l4,2,%g0
	fmovdnz	%icc,%f4,%f6
	st	%f6,[%o0]

	andcc	%l1,2,%g0
	fmovdnz	%icc,%f12,%f14
	st	%f14,[%o1]

	andcc	%l2,2,%g0
	fmovdnz	%icc,%f20,%f22
	st	%f22,[%o2]

	andcc	%l3,2,%g0
	fmovdnz	%icc,%f28,%f30
	st	%f30,[%o3]

! Decrement the element count in %i0 and loop while it is positive.
! The delay slot runs on both paths and stores the low word of
! result 0; on the exit path .end stores the low words of results 1-3.
	addcc	%i0,-1,%i0
	bg,pt	%icc,.loop0
! delay slot
	st	%f7,[%o0+4]

	ba,pt	%icc,.end
! delay slot
	nop

	.align	16
! .case10: x0 on the sin path, x1 on the cos path, x2 on the sin path;
! element 3 is decided here.  %f26 is squared into %f24, then bit 0 of
! %l3 is tested: if clear, control goes to .case11 (sin path for x3),
! otherwise it falls through with x3 on the cos path.  The four
! evaluations are interleaved; the instruction order is significant.
.case10:
	fmuld	%f26,%f26,%f24
	andcc	%l3,1,%g0
	bz,pn	%icc,.case11
! delay slot (runs whether or not the branch is taken): XOR %f30 into
! %f38.  NOTE(review): %f30 presumably holds a sign mask prepared
! before this chunk - confirm.
	fxor	%f30,%f38,%f38

! Polynomial stage: sin-path elements start with pp3 and also build a
! qq2/qq1 term; cos-path elements start with qq3 and also build a
! pp2/pp1 term.
	fmuld	%f0,pp3,%f6		! sin(x0)

	fmuld	%f16,pp3,%f22		! sin(x2)

	faddd	%f6,pp2,%f6
	fmuld	%f0,qq2,%f4

	fmuld	%f8,qq3,%f14		! cos(x1)

	faddd	%f22,pp2,%f22
	fmuld	%f16,qq2,%f20

	fmuld	%f24,qq3,%f30		! cos(x3)

	fmuld	%f0,%f6,%f6
	faddd	%f4,qq1,%f4

	faddd	%f14,qq2,%f14
	fmuld	%f8,pp2,%f12

	fmuld	%f16,%f22,%f22
	faddd	%f20,qq1,%f20

	faddd	%f30,qq2,%f30
	fmuld	%f24,pp2,%f28

! The integer adds below turn %l4-%l7 into load addresses by adding
! %g1 (presumably a table base plus per-element offsets; TODO confirm
! against the code that sets them up).
	faddd	%f6,pp1,%f6
	fmuld	%f0,%f4,%f4
	add	%l4,%g1,%l4

	fmuld	%f8,%f14,%f14
	faddd	%f12,pp1,%f12

	faddd	%f22,pp1,%f22
	fmuld	%f16,%f20,%f20
	add	%l6,%g1,%l6

	fmuld	%f24,%f30,%f30
	faddd	%f28,pp1,%f28

	fmuld	%f0,%f6,%f6

	faddd	%f14,qq1,%f14
	fmuld	%f8,%f12,%f12
	add	%l5,%g1,%l5

	fmuld	%f16,%f22,%f22

	faddd	%f30,qq1,%f30
	fmuld	%f24,%f28,%f28
	add	%l7,%g1,%l7

! Combination stage: each element loads the doublewords at offsets 8
! and 16 from its address.  Sin-path elements read offset 8 first and
! finish with an add; cos-path elements read offset 16 first and
! finish with a subtract.
	fmuld	%f2,%f6,%f6
	ldd	[%l4+8],%f0

	fmuld	%f10,%f12,%f12

	fmuld	%f18,%f22,%f22
	ldd	[%l6+8],%f16

	fmuld	%f26,%f28,%f28

	fmuld	%f0,%f4,%f4
	faddd	%f32,%f6,%f6

	fmuld	%f8,%f14,%f14
	faddd	%f12,%f34,%f12
	ldd	[%l5+16],%f8

	fmuld	%f16,%f20,%f20
	faddd	%f36,%f22,%f22

	fmuld	%f24,%f30,%f30
	faddd	%f28,%f38,%f28
	ldd	[%l7+16],%f24

	faddd	%f2,%f6,%f6
	ldd	[%l4+16],%f32

	fmuld	%f8,%f14,%f14
	faddd	%f12,%f10,%f12
	ldd	[%l5+8],%f34

	faddd	%f18,%f22,%f22
	ldd	[%l6+16],%f36

	fmuld	%f24,%f30,%f30
	faddd	%f28,%f26,%f28
	ldd	[%l7+8],%f38

	fmuld	%f32,%f6,%f6

	fmuld	%f34,%f12,%f12

	fmuld	%f36,%f22,%f22

	fmuld	%f38,%f28,%f28

	faddd	%f6,%f4,%f6

	fsubd	%f14,%f12,%f14

	faddd	%f22,%f20,%f22

	fsubd	%f30,%f28,%f30

! Final add of the value loaded above; the four results are now in
! %f6, %f14, %f22 and %f30.
	faddd	%f6,%f0,%f6

	faddd	%f14,%f8,%f14

	faddd	%f22,%f16,%f22

	faddd	%f30,%f24,%f30
! keep the old %l0 in %l4; the preload below overwrites %l0
	mov	%l0,%l4

! Form negated copies of the results while fetching the next x: its
! high word goes to %l0 and %f0, its low word to %f3.
	fnegd	%f6,%f4
	lda	[%i1]%asi,%l0		! preload next argument

	fnegd	%f14,%f12
	lda	[%i1]%asi,%f0

	fnegd	%f22,%f20
	lda	[%i1+4]%asi,%f3

	fnegd	%f30,%f28
! clear the %i5 bits from the new high word and step the x pointer by
! %i2 (the range handlers annotate these as the sign mask and stridex)
	andn	%l0,%i5,%l0
	add	%i1,%i2,%i1

! For each element: if bit 1 of its integer register is set, replace
! the result with its negated copy, then store the high word.
	andcc	%l4,2,%g0
	fmovdnz	%icc,%f4,%f6
	st	%f6,[%o0]

	andcc	%l1,2,%g0
	fmovdnz	%icc,%f12,%f14
	st	%f14,[%o1]

	andcc	%l2,2,%g0
	fmovdnz	%icc,%f20,%f22
	st	%f22,[%o2]

	andcc	%l3,2,%g0
	fmovdnz	%icc,%f28,%f30
	st	%f30,[%o3]

! Decrement the element count in %i0 and loop while it is positive.
! The delay slot runs on both paths and stores the low word of
! result 0; on the exit path .end stores the low words of results 1-3.
	addcc	%i0,-1,%i0
	bg,pt	%icc,.loop0
! delay slot
	st	%f7,[%o0+4]

	ba,pt	%icc,.end
! delay slot
	nop

	.align	16
! .case11: reached from .case10 when bit 0 of %l3 is clear.  x0, x2
! and x3 take the sin path and x1 takes the cos path (see the
! annotations on the first multiply of each element).  The four
! evaluations are interleaved; the instruction order is significant.
.case11:
! Polynomial stage: sin-path elements start with pp3 and also build a
! qq2/qq1 term; the cos-path element starts with qq3 and also builds a
! pp2/pp1 term.
	fmuld	%f0,pp3,%f6		! sin(x0)

	fmuld	%f16,pp3,%f22		! sin(x2)

	fmuld	%f24,pp3,%f30		! sin(x3)

	faddd	%f6,pp2,%f6
	fmuld	%f0,qq2,%f4

	fmuld	%f8,qq3,%f14		! cos(x1)

	faddd	%f22,pp2,%f22
	fmuld	%f16,qq2,%f20

	faddd	%f30,pp2,%f30
	fmuld	%f24,qq2,%f28

	fmuld	%f0,%f6,%f6
	faddd	%f4,qq1,%f4

	faddd	%f14,qq2,%f14
	fmuld	%f8,pp2,%f12

	fmuld	%f16,%f22,%f22
	faddd	%f20,qq1,%f20

	fmuld	%f24,%f30,%f30
	faddd	%f28,qq1,%f28

! The integer adds below turn %l4-%l7 into load addresses by adding
! %g1 (presumably a table base plus per-element offsets; TODO confirm
! against the code that sets them up).
	faddd	%f6,pp1,%f6
	fmuld	%f0,%f4,%f4
	add	%l4,%g1,%l4

	fmuld	%f8,%f14,%f14
	faddd	%f12,pp1,%f12

	faddd	%f22,pp1,%f22
	fmuld	%f16,%f20,%f20
	add	%l6,%g1,%l6

	faddd	%f30,pp1,%f30
	fmuld	%f24,%f28,%f28
	add	%l7,%g1,%l7

	fmuld	%f0,%f6,%f6

	faddd	%f14,qq1,%f14
	fmuld	%f8,%f12,%f12
	add	%l5,%g1,%l5

	fmuld	%f16,%f22,%f22

	fmuld	%f24,%f30,%f30

! Combination stage: each element loads the doublewords at offsets 8
! and 16 from its address.  Sin-path elements read offset 8 first and
! finish with an add; the cos-path element reads offset 16 first and
! finishes with a subtract.
	fmuld	%f2,%f6,%f6
	ldd	[%l4+8],%f0

	fmuld	%f10,%f12,%f12

	fmuld	%f18,%f22,%f22
	ldd	[%l6+8],%f16

	fmuld	%f26,%f30,%f30
	ldd	[%l7+8],%f24

	fmuld	%f0,%f4,%f4
	faddd	%f32,%f6,%f6

	fmuld	%f8,%f14,%f14
	faddd	%f12,%f34,%f12
	ldd	[%l5+16],%f8

	fmuld	%f16,%f20,%f20
	faddd	%f36,%f22,%f22

	fmuld	%f24,%f28,%f28
	faddd	%f38,%f30,%f30

	faddd	%f2,%f6,%f6
	ldd	[%l4+16],%f32

	fmuld	%f8,%f14,%f14
	faddd	%f12,%f10,%f12
	ldd	[%l5+8],%f34

	faddd	%f18,%f22,%f22
	ldd	[%l6+16],%f36

	faddd	%f26,%f30,%f30
	ldd	[%l7+16],%f38

	fmuld	%f32,%f6,%f6

	fmuld	%f34,%f12,%f12

	fmuld	%f36,%f22,%f22

	fmuld	%f38,%f30,%f30

	faddd	%f6,%f4,%f6

	fsubd	%f14,%f12,%f14

	faddd	%f22,%f20,%f22

	faddd	%f30,%f28,%f30

! Final add of the value loaded above; the four results are now in
! %f6, %f14, %f22 and %f30.
	faddd	%f6,%f0,%f6

	faddd	%f14,%f8,%f14

	faddd	%f22,%f16,%f22

	faddd	%f30,%f24,%f30
! keep the old %l0 in %l4; the preload below overwrites %l0
	mov	%l0,%l4

! Form negated copies of the results while fetching the next x: its
! high word goes to %l0 and %f0, its low word to %f3.
	fnegd	%f6,%f4
	lda	[%i1]%asi,%l0		! preload next argument

	fnegd	%f14,%f12
	lda	[%i1]%asi,%f0

	fnegd	%f22,%f20
	lda	[%i1+4]%asi,%f3

	fnegd	%f30,%f28
! clear the %i5 bits from the new high word and step the x pointer by
! %i2 (the range handlers annotate these as the sign mask and stridex)
	andn	%l0,%i5,%l0
	add	%i1,%i2,%i1

! For each element: if bit 1 of its integer register is set, replace
! the result with its negated copy, then store the high word.
	andcc	%l4,2,%g0
	fmovdnz	%icc,%f4,%f6
	st	%f6,[%o0]

	andcc	%l1,2,%g0
	fmovdnz	%icc,%f12,%f14
	st	%f14,[%o1]

	andcc	%l2,2,%g0
	fmovdnz	%icc,%f20,%f22
	st	%f22,[%o2]

	andcc	%l3,2,%g0
	fmovdnz	%icc,%f28,%f30
	st	%f30,[%o3]

! Decrement the element count in %i0 and loop while it is positive.
! The delay slot runs on both paths and stores the low word of
! result 0; on the exit path .end stores the low words of results 1-3.
	addcc	%i0,-1,%i0
	bg,pt	%icc,.loop0
! delay slot
	st	%f7,[%o0+4]

	ba,pt	%icc,.end
! delay slot
	nop

	.align	16
! .case12: x0 and x1 on the sin path; elements 2 and 3 are decided
! here.  %f18 is squared into %f16 and bit 0 of %l2 is tested: if
! clear, control goes to .case14 (sin path for x2).  Otherwise %f26 is
! squared into %f24 and bit 0 of %l3 is tested: if clear, control goes
! to .case13 (sin path for x3).  Falling through leaves x2 and x3 on
! the cos path.  The four evaluations are interleaved; the instruction
! order is significant.
.case12:
	fmuld	%f18,%f18,%f16
	andcc	%l2,1,%g0
	bz,pn	%icc,.case14
! delay slot (runs whether or not the branch is taken): XOR %f22 into
! %f36.  NOTE(review): %f22 presumably holds a sign mask prepared
! before this chunk - confirm.
	fxor	%f22,%f36,%f36

	fmuld	%f26,%f26,%f24
	andcc	%l3,1,%g0
	bz,pn	%icc,.case13
! delay slot (runs whether or not the branch is taken): XOR %f30 into
! %f38, the element-3 counterpart of the XOR above.
	fxor	%f30,%f38,%f38

! Polynomial stage: sin-path elements start with pp3 and also build a
! qq2/qq1 term; cos-path elements start with qq3 and also build a
! pp2/pp1 term.
	fmuld	%f0,pp3,%f6		! sin(x0)

	fmuld	%f8,pp3,%f14		! sin(x1)

	faddd	%f6,pp2,%f6
	fmuld	%f0,qq2,%f4

	faddd	%f14,pp2,%f14
	fmuld	%f8,qq2,%f12

	fmuld	%f16,qq3,%f22		! cos(x2)

	fmuld	%f24,qq3,%f30		! cos(x3)

	fmuld	%f0,%f6,%f6
	faddd	%f4,qq1,%f4

	fmuld	%f8,%f14,%f14
	faddd	%f12,qq1,%f12

	faddd	%f22,qq2,%f22
	fmuld	%f16,pp2,%f20

	faddd	%f30,qq2,%f30
	fmuld	%f24,pp2,%f28

! The integer adds below turn %l4-%l7 into load addresses by adding
! %g1 (presumably a table base plus per-element offsets; TODO confirm
! against the code that sets them up).
	faddd	%f6,pp1,%f6
	fmuld	%f0,%f4,%f4
	add	%l4,%g1,%l4

	faddd	%f14,pp1,%f14
	fmuld	%f8,%f12,%f12
	add	%l5,%g1,%l5

	fmuld	%f16,%f22,%f22
	faddd	%f20,pp1,%f20

	fmuld	%f24,%f30,%f30
	faddd	%f28,pp1,%f28

	fmuld	%f0,%f6,%f6

	fmuld	%f8,%f14,%f14

	faddd	%f22,qq1,%f22
	fmuld	%f16,%f20,%f20
	add	%l6,%g1,%l6

	faddd	%f30,qq1,%f30
	fmuld	%f24,%f28,%f28
	add	%l7,%g1,%l7

! Combination stage: each element loads the doublewords at offsets 8
! and 16 from its address.  Sin-path elements read offset 8 first and
! finish with an add; cos-path elements read offset 16 first and
! finish with a subtract.
	fmuld	%f2,%f6,%f6
	ldd	[%l4+8],%f0

	fmuld	%f10,%f14,%f14
	ldd	[%l5+8],%f8

	fmuld	%f18,%f20,%f20

	fmuld	%f26,%f28,%f28

	fmuld	%f0,%f4,%f4
	faddd	%f32,%f6,%f6

	fmuld	%f8,%f12,%f12
	faddd	%f34,%f14,%f14

	fmuld	%f16,%f22,%f22
	faddd	%f20,%f36,%f20
	ldd	[%l6+16],%f16

	fmuld	%f24,%f30,%f30
	faddd	%f28,%f38,%f28
	ldd	[%l7+16],%f24

	faddd	%f2,%f6,%f6
	ldd	[%l4+16],%f32

	faddd	%f10,%f14,%f14
	ldd	[%l5+16],%f34

	fmuld	%f16,%f22,%f22
	faddd	%f20,%f18,%f20
	ldd	[%l6+8],%f36

	fmuld	%f24,%f30,%f30
	faddd	%f28,%f26,%f28
	ldd	[%l7+8],%f38

	fmuld	%f32,%f6,%f6

	fmuld	%f34,%f14,%f14

	fmuld	%f36,%f20,%f20

	fmuld	%f38,%f28,%f28

	faddd	%f6,%f4,%f6

	faddd	%f14,%f12,%f14

	fsubd	%f22,%f20,%f22

	fsubd	%f30,%f28,%f30

! Final add of the value loaded above; the four results are now in
! %f6, %f14, %f22 and %f30.
	faddd	%f6,%f0,%f6

	faddd	%f14,%f8,%f14

	faddd	%f22,%f16,%f22

	faddd	%f30,%f24,%f30
! keep the old %l0 in %l4; the preload below overwrites %l0
	mov	%l0,%l4

! Form negated copies of the results while fetching the next x: its
! high word goes to %l0 and %f0, its low word to %f3.
	fnegd	%f6,%f4
	lda	[%i1]%asi,%l0		! preload next argument

	fnegd	%f14,%f12
	lda	[%i1]%asi,%f0

	fnegd	%f22,%f20
	lda	[%i1+4]%asi,%f3

	fnegd	%f30,%f28
! clear the %i5 bits from the new high word and step the x pointer by
! %i2 (the range handlers annotate these as the sign mask and stridex)
	andn	%l0,%i5,%l0
	add	%i1,%i2,%i1

! For each element: if bit 1 of its integer register is set, replace
! the result with its negated copy, then store the high word.
	andcc	%l4,2,%g0
	fmovdnz	%icc,%f4,%f6
	st	%f6,[%o0]

	andcc	%l1,2,%g0
	fmovdnz	%icc,%f12,%f14
	st	%f14,[%o1]

	andcc	%l2,2,%g0
	fmovdnz	%icc,%f20,%f22
	st	%f22,[%o2]

	andcc	%l3,2,%g0
	fmovdnz	%icc,%f28,%f30
	st	%f30,[%o3]

! Decrement the element count in %i0 and loop while it is positive.
! The delay slot runs on both paths and stores the low word of
! result 0; on the exit path .end stores the low words of results 1-3.
	addcc	%i0,-1,%i0
	bg,pt	%icc,.loop0
! delay slot
	st	%f7,[%o0+4]

	ba,pt	%icc,.end
! delay slot
	nop

	.align	16
! .case13: reached from .case12 when bit 0 of %l3 is clear.  x0, x1
! and x3 take the sin path and x2 takes the cos path (see the
! annotations on the first multiply of each element).  The four
! evaluations are interleaved; the instruction order is significant.
.case13:
! Polynomial stage: sin-path elements start with pp3 and also build a
! qq2/qq1 term; the cos-path element starts with qq3 and also builds a
! pp2/pp1 term.
	fmuld	%f0,pp3,%f6		! sin(x0)

	fmuld	%f8,pp3,%f14		! sin(x1)

	fmuld	%f24,pp3,%f30		! sin(x3)

	faddd	%f6,pp2,%f6
	fmuld	%f0,qq2,%f4

	faddd	%f14,pp2,%f14
	fmuld	%f8,qq2,%f12

	fmuld	%f16,qq3,%f22		! cos(x2)

	faddd	%f30,pp2,%f30
	fmuld	%f24,qq2,%f28

	fmuld	%f0,%f6,%f6
	faddd	%f4,qq1,%f4

	fmuld	%f8,%f14,%f14
	faddd	%f12,qq1,%f12

	faddd	%f22,qq2,%f22
	fmuld	%f16,pp2,%f20

	fmuld	%f24,%f30,%f30
	faddd	%f28,qq1,%f28

! The integer adds below turn %l4-%l7 into load addresses by adding
! %g1 (presumably a table base plus per-element offsets; TODO confirm
! against the code that sets them up).
	faddd	%f6,pp1,%f6
	fmuld	%f0,%f4,%f4
	add	%l4,%g1,%l4

	faddd	%f14,pp1,%f14
	fmuld	%f8,%f12,%f12
	add	%l5,%g1,%l5

	fmuld	%f16,%f22,%f22
	faddd	%f20,pp1,%f20

	faddd	%f30,pp1,%f30
	fmuld	%f24,%f28,%f28
	add	%l7,%g1,%l7

	fmuld	%f0,%f6,%f6

	fmuld	%f8,%f14,%f14

	faddd	%f22,qq1,%f22
	fmuld	%f16,%f20,%f20
	add	%l6,%g1,%l6

	fmuld	%f24,%f30,%f30

! Combination stage: each element loads the doublewords at offsets 8
! and 16 from its address.  Sin-path elements read offset 8 first and
! finish with an add; the cos-path element reads offset 16 first and
! finishes with a subtract.
	fmuld	%f2,%f6,%f6
	ldd	[%l4+8],%f0

	fmuld	%f10,%f14,%f14
	ldd	[%l5+8],%f8

	fmuld	%f18,%f20,%f20

	fmuld	%f26,%f30,%f30
	ldd	[%l7+8],%f24

	fmuld	%f0,%f4,%f4
	faddd	%f32,%f6,%f6

	fmuld	%f8,%f12,%f12
	faddd	%f34,%f14,%f14

	fmuld	%f16,%f22,%f22
	faddd	%f20,%f36,%f20
	ldd	[%l6+16],%f16

	fmuld	%f24,%f28,%f28
	faddd	%f38,%f30,%f30

	faddd	%f2,%f6,%f6
	ldd	[%l4+16],%f32

	faddd	%f10,%f14,%f14
	ldd	[%l5+16],%f34

	fmuld	%f16,%f22,%f22
	faddd	%f20,%f18,%f20
	ldd	[%l6+8],%f36

	faddd	%f26,%f30,%f30
	ldd	[%l7+16],%f38

	fmuld	%f32,%f6,%f6

	fmuld	%f34,%f14,%f14

	fmuld	%f36,%f20,%f20

	fmuld	%f38,%f30,%f30

	faddd	%f6,%f4,%f6

	faddd	%f14,%f12,%f14

	fsubd	%f22,%f20,%f22

	faddd	%f30,%f28,%f30

! Final add of the value loaded above; the four results are now in
! %f6, %f14, %f22 and %f30.
	faddd	%f6,%f0,%f6

	faddd	%f14,%f8,%f14

	faddd	%f22,%f16,%f22

	faddd	%f30,%f24,%f30
! keep the old %l0 in %l4; the preload below overwrites %l0
	mov	%l0,%l4

! Form negated copies of the results while fetching the next x: its
! high word goes to %l0 and %f0, its low word to %f3.
	fnegd	%f6,%f4
	lda	[%i1]%asi,%l0		! preload next argument

	fnegd	%f14,%f12
	lda	[%i1]%asi,%f0

	fnegd	%f22,%f20
	lda	[%i1+4]%asi,%f3

	fnegd	%f30,%f28
! clear the %i5 bits from the new high word and step the x pointer by
! %i2 (the range handlers annotate these as the sign mask and stridex)
	andn	%l0,%i5,%l0
	add	%i1,%i2,%i1

! For each element: if bit 1 of its integer register is set, replace
! the result with its negated copy, then store the high word.
	andcc	%l4,2,%g0
	fmovdnz	%icc,%f4,%f6
	st	%f6,[%o0]

	andcc	%l1,2,%g0
	fmovdnz	%icc,%f12,%f14
	st	%f14,[%o1]

	andcc	%l2,2,%g0
	fmovdnz	%icc,%f20,%f22
	st	%f22,[%o2]

	andcc	%l3,2,%g0
	fmovdnz	%icc,%f28,%f30
	st	%f30,[%o3]

! Decrement the element count in %i0 and loop while it is positive.
! The delay slot runs on both paths and stores the low word of
! result 0; on the exit path .end stores the low words of results 1-3.
	addcc	%i0,-1,%i0
	bg,pt	%icc,.loop0
! delay slot
	st	%f7,[%o0+4]

	ba,pt	%icc,.end
! delay slot
	nop

	.align	16
! .case14: reached from .case12 when bit 0 of %l2 is clear.  x0, x1
! and x2 take the sin path; element 3 is decided here.  %f26 is
! squared into %f24, then bit 0 of %l3 is tested: if clear, control
! goes to .case15 (sin path for x3), otherwise it falls through with
! x3 on the cos path.  The four evaluations are interleaved; the
! instruction order is significant.
.case14:
	fmuld	%f26,%f26,%f24
	andcc	%l3,1,%g0
	bz,pn	%icc,.case15
! delay slot (runs whether or not the branch is taken): XOR %f30 into
! %f38.  NOTE(review): %f30 presumably holds a sign mask prepared
! before this chunk - confirm.
	fxor	%f30,%f38,%f38

! Polynomial stage: sin-path elements start with pp3 and also build a
! qq2/qq1 term; the cos-path element starts with qq3 and also builds a
! pp2/pp1 term.
	fmuld	%f0,pp3,%f6		! sin(x0)

	fmuld	%f8,pp3,%f14		! sin(x1)

	fmuld	%f16,pp3,%f22		! sin(x2)

	faddd	%f6,pp2,%f6
	fmuld	%f0,qq2,%f4

	faddd	%f14,pp2,%f14
	fmuld	%f8,qq2,%f12

	faddd	%f22,pp2,%f22
	fmuld	%f16,qq2,%f20

	fmuld	%f24,qq3,%f30		! cos(x3)

	fmuld	%f0,%f6,%f6
	faddd	%f4,qq1,%f4

	fmuld	%f8,%f14,%f14
	faddd	%f12,qq1,%f12

	fmuld	%f16,%f22,%f22
	faddd	%f20,qq1,%f20

	faddd	%f30,qq2,%f30
	fmuld	%f24,pp2,%f28

! The integer adds below turn %l4-%l7 into load addresses by adding
! %g1 (presumably a table base plus per-element offsets; TODO confirm
! against the code that sets them up).
	faddd	%f6,pp1,%f6
	fmuld	%f0,%f4,%f4
	add	%l4,%g1,%l4

	faddd	%f14,pp1,%f14
	fmuld	%f8,%f12,%f12
	add	%l5,%g1,%l5

	faddd	%f22,pp1,%f22
	fmuld	%f16,%f20,%f20
	add	%l6,%g1,%l6

	fmuld	%f24,%f30,%f30
	faddd	%f28,pp1,%f28

	fmuld	%f0,%f6,%f6

	fmuld	%f8,%f14,%f14

	fmuld	%f16,%f22,%f22

	faddd	%f30,qq1,%f30
	fmuld	%f24,%f28,%f28
	add	%l7,%g1,%l7

! Combination stage: each element loads the doublewords at offsets 8
! and 16 from its address.  Sin-path elements read offset 8 first and
! finish with an add; the cos-path element reads offset 16 first and
! finishes with a subtract.
	fmuld	%f2,%f6,%f6
	ldd	[%l4+8],%f0

	fmuld	%f10,%f14,%f14
	ldd	[%l5+8],%f8

	fmuld	%f18,%f22,%f22
	ldd	[%l6+8],%f16

	fmuld	%f26,%f28,%f28

	fmuld	%f0,%f4,%f4
	faddd	%f32,%f6,%f6

	fmuld	%f8,%f12,%f12
	faddd	%f34,%f14,%f14

	fmuld	%f16,%f20,%f20
	faddd	%f36,%f22,%f22

	fmuld	%f24,%f30,%f30
	faddd	%f28,%f38,%f28
	ldd	[%l7+16],%f24

	faddd	%f2,%f6,%f6
	ldd	[%l4+16],%f32

	faddd	%f10,%f14,%f14
	ldd	[%l5+16],%f34

	faddd	%f18,%f22,%f22
	ldd	[%l6+16],%f36

	fmuld	%f24,%f30,%f30
	faddd	%f28,%f26,%f28
	ldd	[%l7+8],%f38

	fmuld	%f32,%f6,%f6

	fmuld	%f34,%f14,%f14

	fmuld	%f36,%f22,%f22

	fmuld	%f38,%f28,%f28

	faddd	%f6,%f4,%f6

	faddd	%f14,%f12,%f14

	faddd	%f22,%f20,%f22

	fsubd	%f30,%f28,%f30

! Final add of the value loaded above; the four results are now in
! %f6, %f14, %f22 and %f30.
	faddd	%f6,%f0,%f6

	faddd	%f14,%f8,%f14

	faddd	%f22,%f16,%f22

	faddd	%f30,%f24,%f30
! keep the old %l0 in %l4; the preload below overwrites %l0
	mov	%l0,%l4

! Form negated copies of the results while fetching the next x: its
! high word goes to %l0 and %f0, its low word to %f3.
	fnegd	%f6,%f4
	lda	[%i1]%asi,%l0		! preload next argument

	fnegd	%f14,%f12
	lda	[%i1]%asi,%f0

	fnegd	%f22,%f20
	lda	[%i1+4]%asi,%f3

	fnegd	%f30,%f28
! clear the %i5 bits from the new high word and step the x pointer by
! %i2 (the range handlers annotate these as the sign mask and stridex)
	andn	%l0,%i5,%l0
	add	%i1,%i2,%i1

! For each element: if bit 1 of its integer register is set, replace
! the result with its negated copy, then store the high word.
	andcc	%l4,2,%g0
	fmovdnz	%icc,%f4,%f6
	st	%f6,[%o0]

	andcc	%l1,2,%g0
	fmovdnz	%icc,%f12,%f14
	st	%f14,[%o1]

	andcc	%l2,2,%g0
	fmovdnz	%icc,%f20,%f22
	st	%f22,[%o2]

	andcc	%l3,2,%g0
	fmovdnz	%icc,%f28,%f30
	st	%f30,[%o3]

! Decrement the element count in %i0 and loop while it is positive.
! The delay slot runs on both paths and stores the low word of
! result 0; on the exit path .end stores the low words of results 1-3.
	addcc	%i0,-1,%i0
	bg,pt	%icc,.loop0
! delay slot
	st	%f7,[%o0+4]

	ba,pt	%icc,.end
! delay slot
	nop

	.align	16
! .case15: reached from .case14 when bit 0 of %l3 is clear.  All four
! elements x0-x3 take the sin path (see the annotations on the first
! multiply of each element).  The four evaluations are interleaved;
! the instruction order is significant.
.case15:
! Polynomial stage: every element builds a pp3/pp2/pp1 term and a
! qq2/qq1 term in its squared argument.
	fmuld	%f0,pp3,%f6		! sin(x0)

	fmuld	%f8,pp3,%f14		! sin(x1)

	fmuld	%f16,pp3,%f22		! sin(x2)

	fmuld	%f24,pp3,%f30		! sin(x3)

	faddd	%f6,pp2,%f6
	fmuld	%f0,qq2,%f4

	faddd	%f14,pp2,%f14
	fmuld	%f8,qq2,%f12

	faddd	%f22,pp2,%f22
	fmuld	%f16,qq2,%f20

	faddd	%f30,pp2,%f30
	fmuld	%f24,qq2,%f28

	fmuld	%f0,%f6,%f6
	faddd	%f4,qq1,%f4

	fmuld	%f8,%f14,%f14
	faddd	%f12,qq1,%f12

	fmuld	%f16,%f22,%f22
	faddd	%f20,qq1,%f20

	fmuld	%f24,%f30,%f30
	faddd	%f28,qq1,%f28

! The integer adds below turn %l4-%l7 into load addresses by adding
! %g1 (presumably a table base plus per-element offsets; TODO confirm
! against the code that sets them up).
	faddd	%f6,pp1,%f6
	fmuld	%f0,%f4,%f4
	add	%l4,%g1,%l4

	faddd	%f14,pp1,%f14
	fmuld	%f8,%f12,%f12
	add	%l5,%g1,%l5

	faddd	%f22,pp1,%f22
	fmuld	%f16,%f20,%f20
	add	%l6,%g1,%l6

	faddd	%f30,pp1,%f30
	fmuld	%f24,%f28,%f28
	add	%l7,%g1,%l7

	fmuld	%f0,%f6,%f6

	fmuld	%f8,%f14,%f14

	fmuld	%f16,%f22,%f22

	fmuld	%f24,%f30,%f30

! Combination stage: each element loads the doubleword at offset 8,
! then the one at offset 16, from its address and finishes with adds.
	fmuld	%f2,%f6,%f6
	ldd	[%l4+8],%f0

	fmuld	%f10,%f14,%f14
	ldd	[%l5+8],%f8

	fmuld	%f18,%f22,%f22
	ldd	[%l6+8],%f16

	fmuld	%f26,%f30,%f30
	ldd	[%l7+8],%f24

	fmuld	%f0,%f4,%f4
	faddd	%f32,%f6,%f6

	fmuld	%f8,%f12,%f12
	faddd	%f34,%f14,%f14

	fmuld	%f16,%f20,%f20
	faddd	%f36,%f22,%f22

	fmuld	%f24,%f28,%f28
	faddd	%f38,%f30,%f30

	faddd	%f2,%f6,%f6
	ldd	[%l4+16],%f32

	faddd	%f10,%f14,%f14
	ldd	[%l5+16],%f34

	faddd	%f18,%f22,%f22
	ldd	[%l6+16],%f36

	faddd	%f26,%f30,%f30
	ldd	[%l7+16],%f38

	fmuld	%f32,%f6,%f6

	fmuld	%f34,%f14,%f14

	fmuld	%f36,%f22,%f22

	fmuld	%f38,%f30,%f30

	faddd	%f6,%f4,%f6

	faddd	%f14,%f12,%f14

	faddd	%f22,%f20,%f22

	faddd	%f30,%f28,%f30

! Final add of the value loaded above; the four results are now in
! %f6, %f14, %f22 and %f30.
	faddd	%f6,%f0,%f6

	faddd	%f14,%f8,%f14

	faddd	%f22,%f16,%f22

	faddd	%f30,%f24,%f30
! keep the old %l0 in %l4; the preload below overwrites %l0
	mov	%l0,%l4

! Form negated copies of the results while fetching the next x: its
! high word goes to %l0 and %f0, its low word to %f3.
	fnegd	%f6,%f4
	lda	[%i1]%asi,%l0		! preload next argument

	fnegd	%f14,%f12
	lda	[%i1]%asi,%f0

	fnegd	%f22,%f20
	lda	[%i1+4]%asi,%f3

	fnegd	%f30,%f28
! clear the %i5 bits from the new high word and step the x pointer by
! %i2 (the range handlers annotate these as the sign mask and stridex)
	andn	%l0,%i5,%l0
	add	%i1,%i2,%i1

! For each element: if bit 1 of its integer register is set, replace
! the result with its negated copy, then store the high word.
	andcc	%l4,2,%g0
	fmovdnz	%icc,%f4,%f6
	st	%f6,[%o0]

	andcc	%l1,2,%g0
	fmovdnz	%icc,%f12,%f14
	st	%f14,[%o1]

	andcc	%l2,2,%g0
	fmovdnz	%icc,%f20,%f22
	st	%f22,[%o2]

	andcc	%l3,2,%g0
	fmovdnz	%icc,%f28,%f30
	st	%f30,[%o3]

! Decrement the element count in %i0 and loop while it is positive.
! The delay slot runs on both paths and stores the low word of
! result 0; on the exit path .end stores the low words of results 1-3.
	addcc	%i0,-1,%i0
	bg,pt	%icc,.loop0
! delay slot
	st	%f7,[%o0+4]

	ba,pt	%icc,.end
! delay slot
	nop


	.align	16
! .end: common exit path.  Stores the pending low words of results
! 1-3, then reads the biguns flag from the stack frame.  If it is
! zero the routine returns; otherwise the saved arguments are reloaded
! and __vlibm_vsin_big_ultra3 is called before returning.
.end:
	st	%f15,[%o1+4]
	st	%f23,[%o2+4]
	st	%f31,[%o3+4]
	ld	[%fp+biguns],%i5
	tst	%i5			! check for huge arguments remaining
	be,pt	%icc,.exit
! delay slot
	nop
! Reload the values saved in the frame into the outgoing argument
! registers: pointers from xsave/ysave (64-bit loads on V9), 32-bit
! values from nsave/sxsave/sysave.
#ifdef __sparcv9
	ldx	[%fp+xsave],%o1
	ldx	[%fp+ysave],%o3
#else
	ld	[%fp+xsave],%o1
	ld	[%fp+ysave],%o3
#endif
	ld	[%fp+nsave],%o0
	ld	[%fp+sxsave],%o2
	ld	[%fp+sysave],%o4
	sra	%o2,0,%o2		! sign-extend for V9
	sra	%o4,0,%o4
! NOTE(review): %o5 is sign-extended in the call delay slot but is not
! loaded in this block; it must already hold the sixth argument from
! earlier code - confirm.
	call	__vlibm_vsin_big_ultra3
	sra	%o5,0,%o5		! delay slot

! Return to the caller, restoring its register window.
.exit:
	ret
	restore


	.align	16
! .last1 / .last2 / .last3: fall-through chain that fills the unused
! element slots of a final, partial group.  For each slot from the
! entry point onward it zeroes the integer register and the argument
! registers of that slot and points its output register at the junk
! scratch area in the stack frame, then continues at .cont.
! The _from_rangeN entry points skip the leading faddd and stores of
! the corresponding .lastN entry.
! NOTE(review): c3two44 is presumably the 3 * 2^44 constant from the
! table at the top of the file - confirm against its definition.
.last1:
	faddd	%f2,c3two44,%f4
	st	%f15,[%o1+4]		! pending low word of result 1
.last1_from_range1:
	mov	0,%l1
	fzeros	%f8
	fzero	%f10
	add	%fp,junk,%o1		! slot 1 output goes to scratch
.last2:
	faddd	%f10,c3two44,%f12
	st	%f23,[%o2+4]		! pending low word of result 2
.last2_from_range2:
	mov	0,%l2
	fzeros	%f16
	fzero	%f18
	add	%fp,junk,%o2		! slot 2 output goes to scratch
.last3:
	faddd	%f18,c3two44,%f20
	st	%f31,[%o3+4]		! pending low word of result 3
! save the low words of the first two sums in the nk0 and nk1 slots
	st	%f5,[%fp+nk0]
	st	%f13,[%fp+nk1]
.last3_from_range3:
	mov	0,%l3
	fzeros	%f24
	fzero	%f26
	ba,pt	%icc,.cont
! delay slot
	add	%fp,junk,%o3		! slot 3 output goes to scratch


	.align	16
! .range0: special handling for the element in slot 0 (high word with
! the sign cleared in %l0, output pointer in %o0).  Three outcomes:
!   hx < %o4:             store x itself as the result; the fdtoi
!                         raises inexact when x is nonzero
!   %o4 <= hx < 0x7ff00000: store nothing here; set the biguns flag so
!                         that .end calls the large-argument routine
!   otherwise (inf/NaN):  store x * 0
! Afterwards the count is decremented; if elements remain, y is
! advanced, the preloaded slot-1 argument is moved into slot 0 and
! control returns to .loop0.
.range0:
	cmp	%l0,%o4
	bl,pt	%icc,1f			! hx < 0x3e400000
! delay slot, harmless if branch taken
	sethi	%hi(0x7ff00000),%o7
	cmp	%l0,%o7
	bl,a,pt	%icc,2f			! branch if finite
! delay slot, squashed if branch not taken
	st	%o4,[%fp+biguns]	! set biguns
! inf or NaN: multiply by zero and store both words of the product
	fzero	%f0
	fmuld	%f2,%f0,%f2
	st	%f2,[%o0]
	ba,pt	%icc,2f
! delay slot
	st	%f3,[%o0+4]
1:
	fdtoi	%f2,%f4			! raise inexact if not zero
	st	%f0,[%o0]
	st	%f3,[%o0+4]
2:
	addcc	%i0,-1,%i0
	ble,pn	%icc,.end
! delay slot, harmless if branch taken
	add	%i3,%i4,%i3		! y += stridey
! shift the slot-1 argument (%l1, %f8, %f11) into slot 0
	andn	%l1,%i5,%l0		! hx &= ~0x80000000
	fmovs	%f8,%f0
	fmovs	%f11,%f3
	ba,pt	%icc,.loop0
! delay slot
	add	%i1,%i2,%i1		! x += stridex


	.align	16
! .range1: special handling for the element in slot 1 (high word with
! the sign cleared in %l1, output pointer in %o1).  Three outcomes:
!   hx < %o4:             store x itself as the result; the fdtoi
!                         raises inexact when x is nonzero
!   %o4 <= hx < 0x7ff00000: store nothing here; set the biguns flag so
!                         that .end calls the large-argument routine
!   otherwise (inf/NaN):  store x * 0
! Afterwards the count is decremented.  If no elements remain, control
! goes to .last1_from_range1; otherwise y is advanced, the preloaded
! slot-2 argument is moved into slot 1 and control returns to .loop1.
.range1:
	cmp	%l1,%o4
	bl,pt	%icc,1f			! hx < 0x3e400000
! delay slot, harmless if branch taken
	sethi	%hi(0x7ff00000),%o7
	cmp	%l1,%o7
	bl,a,pt	%icc,2f			! branch if finite
! delay slot, squashed if branch not taken
	st	%o4,[%fp+biguns]	! set biguns
! inf or NaN: multiply by zero and store both words of the product
	fzero	%f8
	fmuld	%f10,%f8,%f10
	st	%f10,[%o1]
	ba,pt	%icc,2f
! delay slot
	st	%f11,[%o1+4]
1:
	fdtoi	%f10,%f12		! raise inexact if not zero
	st	%f8,[%o1]
	st	%f11,[%o1+4]
2:
	addcc	%i0,-1,%i0
	ble,pn	%icc,.last1_from_range1
! delay slot, harmless if branch taken
	add	%i3,%i4,%i3		! y += stridey
! shift the slot-2 argument (%l2, %f16, %f19) into slot 1
	andn	%l2,%i5,%l1		! hx &= ~0x80000000
	fmovs	%f16,%f8
	fmovs	%f19,%f11
	ba,pt	%icc,.loop1
! delay slot
	add	%i1,%i2,%i1		! x += stridex


	.align	16
! .range2: special handling for the element in slot 2 (high word with
! the sign cleared in %l2, output pointer in %o2).  Three outcomes:
!   hx < %o4:             store x itself as the result; the fdtoi
!                         raises inexact when x is nonzero
!   %o4 <= hx < 0x7ff00000: store nothing here; set the biguns flag so
!                         that .end calls the large-argument routine
!   otherwise (inf/NaN):  store x * 0
! Afterwards the count is decremented.  If no elements remain, control
! goes to .last2_from_range2; otherwise y is advanced, the preloaded
! slot-3 argument is moved into slot 2 and control returns to .loop2.
.range2:
	cmp	%l2,%o4
	bl,pt	%icc,1f			! hx < 0x3e400000
! delay slot, harmless if branch taken
	sethi	%hi(0x7ff00000),%o7
	cmp	%l2,%o7
	bl,a,pt	%icc,2f			! branch if finite
! delay slot, squashed if branch not taken
	st	%o4,[%fp+biguns]	! set biguns
! inf or NaN: multiply by zero and store both words of the product
	fzero	%f16
	fmuld	%f18,%f16,%f18
	st	%f18,[%o2]
	ba,pt	%icc,2f
! delay slot
	st	%f19,[%o2+4]
1:
	fdtoi	%f18,%f20		! raise inexact if not zero
	st	%f16,[%o2]
	st	%f19,[%o2+4]
2:
	addcc	%i0,-1,%i0
	ble,pn	%icc,.last2_from_range2
! delay slot, harmless if branch taken
	add	%i3,%i4,%i3		! y += stridey
! shift the slot-3 argument (%l3, %f24, %f27) into slot 2
	andn	%l3,%i5,%l2		! hx &= ~0x80000000
	fmovs	%f24,%f16
	fmovs	%f27,%f19
	ba,pt	%icc,.loop2
! delay slot
	add	%i1,%i2,%i1		! x += stridex


	.align	16
! .range3: special handling for the element in slot 3 (high word with
! the sign cleared in %l3, output pointer in %o3).  Three outcomes:
!   hx < %o4:             store x itself as the result; the fdtoi
!                         raises inexact when x is nonzero
!   %o4 <= hx < 0x7ff00000: store nothing here; set the biguns flag so
!                         that .end calls the large-argument routine
!   otherwise (inf/NaN):  store x * 0
! Afterwards the count is decremented.  If no elements remain, control
! goes to .last3_from_range3; otherwise y is advanced, the next
! argument is loaded from memory into slot 3 (there is no later slot
! to shift from) and control returns to .loop3.
.range3:
	cmp	%l3,%o4
	bl,pt	%icc,1f			! hx < 0x3e400000
! delay slot, harmless if branch taken
	sethi	%hi(0x7ff00000),%o7
	cmp	%l3,%o7
	bl,a,pt	%icc,2f			! branch if finite
! delay slot, squashed if branch not taken
	st	%o4,[%fp+biguns]	! set biguns
! inf or NaN: multiply by zero and store both words of the product
	fzero	%f24
	fmuld	%f26,%f24,%f26
	st	%f26,[%o3]
	ba,pt	%icc,2f
! delay slot
	st	%f27,[%o3+4]
1:
	fdtoi	%f26,%f28		! raise inexact if not zero
	st	%f24,[%o3]
	st	%f27,[%o3+4]
2:
	addcc	%i0,-1,%i0
	ble,pn	%icc,.last3_from_range3
! delay slot, harmless if branch taken
	add	%i3,%i4,%i3		! y += stridey
! fetch the next x: high word into %l3 and %f24, low word into %f27
	ld	[%i1],%l3
	ld	[%i1],%f24
	ld	[%i1+4],%f27
	andn	%l3,%i5,%l3		! hx &= ~0x80000000
	ba,pt	%icc,.loop3
! delay slot
	add	%i1,%i2,%i1		! x += stridex

	SET_SIZE(__vsin_ultra3)