/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

	.file	"__vsincos.S"

#include "libm.h"

	RO_DATA
	.align	64
constants:
	.word	0x42c80000,0x00000000	! 3 * 2^44
	.word	0x43380000,0x00000000	! 3 * 2^51
	.word	0x3fe45f30,0x6dc9c883	! invpio2
	.word	0x3ff921fb,0x54442c00	! pio2_1
	.word	0x3d318469,0x898cc400	! pio2_2
	.word	0x3a71701b,0x839a2520	! pio2_3
	.word	0xbfc55555,0x55555533	! pp1
	.word	0x3f811111,0x10e7d53b	! pp2
	.word	0xbf2a0167,0xe6b3cf9b	! pp3
	.word	0xbfdfffff,0xffffff65	! qq1
	.word	0x3fa55555,0x54f88ed0	! qq2
	.word	0xbf56c12c,0xdd185f60	! qq3
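
! Constant usage (a sketch inferred from the code below, not from any
! original commentary):
!
!  - 3*2^44 and 3*2^51 exploit double-precision rounding: adding 3*2^44
!    to |x| leaves a table index in the low word of the sum, and adding
!    3*2^51 to x*invpio2 rounds that product to the nearest integer n.
!  - invpio2 ~ 2/pi; pio2_1 + pio2_2 + pio2_3 ~ pi/2, split so that the
!    products n*pio2_1 and n*pio2_2 incur no rounding error for the n
!    reachable here (the 0x4099251e cutoff keeps n at or below 2^10).
!  - pp1..pp3 and qq1..qq3 are minimax-adjusted polynomial coefficients,
!    roughly -1/3!, 1/5!, -1/7! and -1/2!, 1/4!, -1/6!, evaluated as
!	sin(u) ~ u + u*z*(pp1 + z*(pp2 + z*pp3)),	z = u*u
!	cos(u) ~ 1 + z*(qq1 + z*(qq2 + z*qq3))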

! local storage indices

#define xsave		STACK_BIAS-0x8
#define ssave		STACK_BIAS-0x10
#define csave		STACK_BIAS-0x18
#define nsave		STACK_BIAS-0x1c
#define sxsave		STACK_BIAS-0x20
#define sssave		STACK_BIAS-0x24
#define biguns		STACK_BIAS-0x28
#define junk		STACK_BIAS-0x30
#define nk2		STACK_BIAS-0x38
#define nk1		STACK_BIAS-0x3c
#define nk0		STACK_BIAS-0x40
! sizeof temp storage - must be a multiple of 16 for V9
#define tmps		0x40
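
! The slots above are negative offsets from the (biased) frame pointer.
! xsave/ssave/csave/nsave/sxsave/sssave preserve the incoming arguments
! for the __vlibm_vsincos_big call at .end, junk is a scratch double
! that prologue and pipeline-tail stores can harmlessly target, and
! nk0..nk2 bounce the low words of FP sums over to the integer unit
! (st %f5,[%fp+nk0] ... ld [%fp+nk0],%l0), which is how the table
! index and quadrant bits are extracted.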

! register use

! i0  n
! i1  x
! i2  stridex
! i3  s
! i4  strides
! i5  0x80000000,n0

! l0  hx0,k0
! l1  hx1,k1
! l2  hx2,k2
! l3  c
! l4  pc0
! l5  pc1
! l6  pc2
! l7  stridec

! the following are 64-bit registers in both V8+ and V9

! g1  __vlibm_TBL_sincos2
! g5  scratch,n1

! o0  ps0
! o1  ps1
! o2  ps2
! o3  0x3fe921fb
! o4  0x3e400000
! o5  0x4099251e
! o7  scratch,n2

! f0  x0,z0
! f2  abs(x0)
! f4  
! f6  
! f8  
! f10 x1,z1
! f12 abs(x1)
! f14 
! f16 
! f18 
! f20 x2,z2
! f22 abs(x2)
! f24 
! f26 
! f28 
! f30 
! f32 
! f34 
! f36
! f38

#define c3two44	%f40
#define c3two51	%f42
#define invpio2	%f44
#define pio2_1	%f46
#define pio2_2	%f48
#define pio2_3	%f50
#define pp1	%f52
#define pp2	%f54
#define pp3	%f56
#define qq1	%f58
#define qq2	%f60
#define qq3	%f62
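
! Overview: a rough scalar sketch of what each element sees (the code
! below is software-pipelined three elements at a time; TBL and the
! field names are illustrative, not the actual table layout names):
!
!	hx = HI(x) & ~0x80000000;
!	if (hx < 0x3e400000 || hx > 0x4099251e)
!		goto range;			/* tiny, huge, inf, nan */
!	if (hx < 0x3fe921fb) {			/* |x| < ~pi/4, primary */
!		k = LO(|x| + 3*2^44);		/* table index */
!		u = |x| - TBL[k].x;  z = u * u;
!		sp = u + u * z * (pp1 + z * (pp2 + z * pp3));
!		cp = z * (qq1 + z * (qq2 + z * qq3));
!		*c = TBL[k].c + (TBL[k].c * cp - TBL[k].s * sp);
!		*s = TBL[k].s + (TBL[k].s * cp + TBL[k].c * sp);
!		if (x < 0) *s = -*s;
!	} else {
!		/* reduce by pi/2 and apply quadrant fixups; see .medium */
!	}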

	ENTRY(__vsincos)
	save	%sp,-SA(MINFRAME)-tmps,%sp
	PIC_SETUP(l7)
	PIC_SET(l7,constants,o0)
	PIC_SET(l7,__vlibm_TBL_sincos2,o1)
	mov	%o1,%g1
	wr	%g0,0x82,%asi		! set %asi for non-faulting loads
#ifdef __sparcv9
	stx	%i1,[%fp+xsave]		! save arguments
	stx	%i3,[%fp+ssave]
	stx	%i5,[%fp+csave]
	ldx	[%fp+STACK_BIAS+0xb0],%l7
#else
	st	%i1,[%fp+xsave]		! save arguments
	st	%i3,[%fp+ssave]
	st	%i5,[%fp+csave]
	ld	[%fp+0x5c],%l7
#endif
	st	%i0,[%fp+nsave]
	st	%i2,[%fp+sxsave]
	st	%i4,[%fp+sssave]
	mov	%i5,%l3
	st	%g0,[%fp+biguns]	! biguns = 0
	ldd	[%o0+0x00],c3two44	! load/set up constants
	ldd	[%o0+0x08],c3two51
	ldd	[%o0+0x10],invpio2
	ldd	[%o0+0x18],pio2_1
	ldd	[%o0+0x20],pio2_2
	ldd	[%o0+0x28],pio2_3
	ldd	[%o0+0x30],pp1
	ldd	[%o0+0x38],pp2
	ldd	[%o0+0x40],pp3
	ldd	[%o0+0x48],qq1
	ldd	[%o0+0x50],qq2
	ldd	[%o0+0x58],qq3
	sethi	%hi(0x80000000),%i5
	sethi	%hi(0x3e400000),%o4
	sethi	%hi(0x3fe921fb),%o3
	or	%o3,%lo(0x3fe921fb),%o3
	sethi	%hi(0x4099251e),%o5
	or	%o5,%lo(0x4099251e),%o5
	sll	%i2,3,%i2		! scale strides
	sll	%i4,3,%i4
	sll	%l7,3,%l7
	add	%fp,junk,%o0		! loop prologue
	add	%fp,junk,%o1
	add	%fp,junk,%o2
	ld	[%i1],%l0		! *x
	ld	[%i1],%f0
	ld	[%i1+4],%f3
	andn	%l0,%i5,%l0		! mask off sign
	ba	.loop0
	add	%i1,%i2,%i1		! x += stridex

! 16-byte aligned
	.align	16
.loop0:
	lda	[%i1]%asi,%l1		! preload next argument
	sub	%l0,%o4,%g5
	sub	%o5,%l0,%o7
	fabss	%f0,%f2

	lda	[%i1]%asi,%f10
	orcc	%o7,%g5,%g0
	mov	%i3,%o0			! ps0 = s
	bl,pn	%icc,.range0		! hx < 0x3e400000 or hx > 0x4099251e

! delay slot
	lda	[%i1+4]%asi,%f13
	addcc	%i0,-1,%i0
	add	%i3,%i4,%i3		! s += strides

	mov	%l3,%l4			! pc0 = c
	add	%l3,%l7,%l3		! c += stridec
	ble,pn	%icc,.last1

! delay slot
	andn	%l1,%i5,%l1
	add	%i1,%i2,%i1		! x += stridex
	faddd	%f2,c3two44,%f4
	st	%f17,[%o1+4]

.loop1:
	lda	[%i1]%asi,%l2		! preload next argument
	sub	%l1,%o4,%g5
	sub	%o5,%l1,%o7
	fabss	%f10,%f12

	lda	[%i1]%asi,%f20
	orcc	%o7,%g5,%g0
	mov	%i3,%o1			! ps1 = s
	bl,pn	%icc,.range1		! hx < 0x3e400000 or hx > 0x4099251e

! delay slot
	lda	[%i1+4]%asi,%f23
	addcc	%i0,-1,%i0
	add	%i3,%i4,%i3		! s += strides

	mov	%l3,%l5			! pc1 = c
	add	%l3,%l7,%l3		! c += stridec
	ble,pn	%icc,.last2

! delay slot
	andn	%l2,%i5,%l2
	add	%i1,%i2,%i1		! x += stridex
	faddd	%f12,c3two44,%f14
	st	%f27,[%o2+4]

.loop2:
	sub	%l2,%o4,%g5
	sub	%o5,%l2,%o7
	fabss	%f20,%f22
	st	%f5,[%fp+nk0]

	orcc	%o7,%g5,%g0
	mov	%i3,%o2			! ps2 = s
	bl,pn	%icc,.range2		! hx < 0x3e400000 or hx > 0x4099251e
! delay slot
	st	%f15,[%fp+nk1]

	mov	%l3,%l6			! pc2 = c

.cont:
	add	%i3,%i4,%i3		! s += strides
	add	%l3,%l7,%l3		! c += stridec
	faddd	%f22,c3two44,%f24
	st	%f25,[%fp+nk2]

	sub	%o3,%l0,%l0
	sub	%o3,%l1,%l1
	fmovs	%f3,%f1

	sub	%o3,%l2,%l2
	fmovs	%f13,%f11

	or	%l0,%l1,%l0
	orcc	%l0,%l2,%g0
	fmovs	%f23,%f21

	fmuld	%f0,invpio2,%f6		! x * invpio2, for medium range

	fmuld	%f10,invpio2,%f16
	ld	[%fp+nk0],%l0

	fmuld	%f20,invpio2,%f26
	ld	[%fp+nk1],%l1

	bl,pn	%icc,.medium
! delay slot
	ld	[%fp+nk2],%l2

	sll	%l0,5,%l0		! k
	fcmpd	%fcc0,%f0,pio2_3	! x < pio2_3 iff x < 0

	sll	%l1,5,%l1
	ldd	[%l0+%g1],%f4
	fcmpd	%fcc1,%f10,pio2_3

	sll	%l2,5,%l2
	ldd	[%l1+%g1],%f14
	fcmpd	%fcc2,%f20,pio2_3

	ldd	[%l2+%g1],%f24

	fsubd	%f2,%f4,%f2		! x -= __vlibm_TBL_sincos2[k]

	fsubd	%f12,%f14,%f12

	fsubd	%f22,%f24,%f22

	fmuld	%f2,%f2,%f0		! z = x * x

	fmuld	%f12,%f12,%f10

	fmuld	%f22,%f22,%f20

	fmuld	%f0,pp3,%f6

	fmuld	%f10,pp3,%f16

	fmuld	%f20,pp3,%f26

	faddd	%f6,pp2,%f6
	fmuld	%f0,qq3,%f4

	faddd	%f16,pp2,%f16
	fmuld	%f10,qq3,%f14

	faddd	%f26,pp2,%f26
	fmuld	%f20,qq3,%f24

	fmuld	%f0,%f6,%f6
	faddd	%f4,qq2,%f4

	fmuld	%f10,%f16,%f16
	faddd	%f14,qq2,%f14

	fmuld	%f20,%f26,%f26
	faddd	%f24,qq2,%f24

	faddd	%f6,pp1,%f6
	fmuld	%f0,%f4,%f4
	add	%l0,%g1,%l0

	faddd	%f16,pp1,%f16
	fmuld	%f10,%f14,%f14
	add	%l1,%g1,%l1

	faddd	%f26,pp1,%f26
	fmuld	%f20,%f24,%f24
	add	%l2,%g1,%l2

	fmuld	%f0,%f6,%f6
	faddd	%f4,qq1,%f4

	fmuld	%f10,%f16,%f16
	faddd	%f14,qq1,%f14

	fmuld	%f20,%f26,%f26
	faddd	%f24,qq1,%f24

	fmuld	%f2,%f6,%f6
	ldd	[%l0+8],%f8

	fmuld	%f12,%f16,%f16
	ldd	[%l1+8],%f18

	fmuld	%f22,%f26,%f26
	ldd	[%l2+8],%f28

	faddd	%f6,%f2,%f6
	fmuld	%f0,%f4,%f4
	ldd	[%l0+16],%f30

	faddd	%f16,%f12,%f16
	fmuld	%f10,%f14,%f14
	ldd	[%l1+16],%f32

	faddd	%f26,%f22,%f26
	fmuld	%f20,%f24,%f24
	ldd	[%l2+16],%f34

	fmuld	%f8,%f6,%f0		! s * spoly

	fmuld	%f18,%f16,%f10

	fmuld	%f28,%f26,%f20

	fmuld	%f30,%f4,%f2		! c * cpoly

	fmuld	%f32,%f14,%f12

	fmuld	%f34,%f24,%f22

	fmuld	%f30,%f6,%f6		! c * spoly
	fsubd	%f2,%f0,%f2

	fmuld	%f32,%f16,%f16
	fsubd	%f12,%f10,%f12

	fmuld	%f34,%f26,%f26
	fsubd	%f22,%f20,%f22

	fmuld	%f8,%f4,%f4		! s * cpoly
	faddd	%f2,%f30,%f2
	st	%f2,[%l4]

	fmuld	%f18,%f14,%f14
	faddd	%f12,%f32,%f12
	st	%f3,[%l4+4]

	fmuld	%f28,%f24,%f24
	faddd	%f22,%f34,%f22
	st	%f12,[%l5]

	faddd	%f6,%f4,%f6
	st	%f13,[%l5+4]

	faddd	%f16,%f14,%f16
	st	%f22,[%l6]

	faddd	%f26,%f24,%f26
	st	%f23,[%l6+4]

	faddd	%f6,%f8,%f6

	faddd	%f16,%f18,%f16

	faddd	%f26,%f28,%f26

	fnegd	%f6,%f4
	lda	[%i1]%asi,%l0		! preload next argument

	fnegd	%f16,%f14
	lda	[%i1]%asi,%f0

	fnegd	%f26,%f24
	lda	[%i1+4]%asi,%f3
	andn	%l0,%i5,%l0
	add	%i1,%i2,%i1

	fmovdl	%fcc0,%f4,%f6		! (x < 0)? -s : s
	st	%f6,[%o0]

	fmovdl	%fcc1,%f14,%f16
	st	%f16,[%o1]

	fmovdl	%fcc2,%f24,%f26
	st	%f26,[%o2]
	addcc	%i0,-1,%i0

	bg,pt	%icc,.loop0
! delay slot
	st	%f7,[%o0+4]

	ba,pt	%icc,.end
! delay slot
	nop


	.align	16
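! The medium-range path: taken whenever any of the three current
! elements has |x| above ~pi/4 (it also handles the smaller ones in
! the same group; for those n simply comes out 0).  Roughly, per
! element (a sketch; the code folds the sign of r and the sin/cos
! swap into the quadrant bits and pointer swaps rather than a switch):
!
!	n = rint(x * invpio2);		/* via the 3*2^51 trick */
!	r = ((x - n*pio2_1) - n*pio2_2) - n*pio2_3;	/* + tail w */
!	switch (n & 3) {
!	case 0: *s =  sin(r); *c =  cos(r); break;
!	case 1: *s =  cos(r); *c = -sin(r); break;
!	case 2: *s = -sin(r); *c = -cos(r); break;
!	case 3: *s = -cos(r); *c =  sin(r); break;
!	}
!
! sin(r) and cos(r) themselves are evaluated as in the primary path,
! from |r|, the table, and the pp/qq polynomials.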
.medium:
	faddd	%f6,c3two51,%f4
	st	%f5,[%fp+nk0]

	faddd	%f16,c3two51,%f14
	st	%f15,[%fp+nk1]

	faddd	%f26,c3two51,%f24
	st	%f25,[%fp+nk2]

	fsubd	%f4,c3two51,%f6

	fsubd	%f14,c3two51,%f16

	fsubd	%f24,c3two51,%f26

	fmuld	%f6,pio2_1,%f2
	ld	[%fp+nk0],%i5		! n

	fmuld	%f16,pio2_1,%f12
	ld	[%fp+nk1],%g5

	fmuld	%f26,pio2_1,%f22
	ld	[%fp+nk2],%o7

	fsubd	%f0,%f2,%f0
	fmuld	%f6,pio2_2,%f4
	mov	%o0,%o4			! if (n & 1) swap ps, pc
	andcc	%i5,1,%g0

	fsubd	%f10,%f12,%f10
	fmuld	%f16,pio2_2,%f14
	movnz	%icc,%l4,%o0
	and	%i5,3,%i5

	fsubd	%f20,%f22,%f20
	fmuld	%f26,pio2_2,%f24
	movnz	%icc,%o4,%l4

	fsubd	%f0,%f4,%f30
	mov	%o1,%o4
	andcc	%g5,1,%g0

	fsubd	%f10,%f14,%f32
	movnz	%icc,%l5,%o1
	and	%g5,3,%g5

	fsubd	%f20,%f24,%f34
	movnz	%icc,%o4,%l5

	fsubd	%f0,%f30,%f0
	fcmple32 %f30,pio2_3,%l0	! x <= pio2_3 iff x < 0
	mov	%o2,%o4
	andcc	%o7,1,%g0

	fsubd	%f10,%f32,%f10
	fcmple32 %f32,pio2_3,%l1
	movnz	%icc,%l6,%o2
	and	%o7,3,%o7

	fsubd	%f20,%f34,%f20
	fcmple32 %f34,pio2_3,%l2
	movnz	%icc,%o4,%l6

	fsubd	%f0,%f4,%f0
	fmuld	%f6,pio2_3,%f6
	add	%i5,1,%o4		! n = (n >> 1) | (((n + 1) ^ l) & 2)
	srl	%i5,1,%i5

	fsubd	%f10,%f14,%f10
	fmuld	%f16,pio2_3,%f16
	xor	%o4,%l0,%o4

	fsubd	%f20,%f24,%f20
	fmuld	%f26,pio2_3,%f26
	and	%o4,2,%o4

	fsubd	%f6,%f0,%f6
	or	%i5,%o4,%i5

	fsubd	%f16,%f10,%f16
	add	%g5,1,%o4
	srl	%g5,1,%g5

	fsubd	%f26,%f20,%f26
	xor	%o4,%l1,%o4

	fsubd	%f30,%f6,%f0		! reduced x
	and	%o4,2,%o4

	fsubd	%f32,%f16,%f10
	or	%g5,%o4,%g5

	fsubd	%f34,%f26,%f20
	add	%o7,1,%o4
	srl	%o7,1,%o7

	fzero	%f38
	xor	%o4,%l2,%o4

	fabsd	%f0,%f2
	and	%o4,2,%o4

	fabsd	%f10,%f12
	or	%o7,%o4,%o7

	fabsd	%f20,%f22
	sethi	%hi(0x3e400000),%o4

	fnegd	%f38,%f38

	faddd	%f2,c3two44,%f4
	st	%f5,[%fp+nk0]

	faddd	%f12,c3two44,%f14
	st	%f15,[%fp+nk1]

	faddd	%f22,c3two44,%f24
	st	%f25,[%fp+nk2]

	fsubd	%f30,%f0,%f4

	fsubd	%f32,%f10,%f14

	fsubd	%f34,%f20,%f24

	fsubd	%f4,%f6,%f6		! w
	ld	[%fp+nk0],%l0

	fsubd	%f14,%f16,%f16
	ld	[%fp+nk1],%l1

	fsubd	%f24,%f26,%f26
	ld	[%fp+nk2],%l2
	sll	%l0,5,%l0		! k

	fand	%f0,%f38,%f30		! sign bit of x
	ldd	[%l0+%g1],%f4
	sll	%l1,5,%l1

	fand	%f10,%f38,%f32
	ldd	[%l1+%g1],%f14
	sll	%l2,5,%l2

	fand	%f20,%f38,%f34
	ldd	[%l2+%g1],%f24

	fsubd	%f2,%f4,%f2		! x -= __vlibm_TBL_sincos2[k]

	fsubd	%f12,%f14,%f12

	fsubd	%f22,%f24,%f22

	fmuld	%f2,%f2,%f0		! z = x * x
	fxor	%f6,%f30,%f30

	fmuld	%f12,%f12,%f10
	fxor	%f16,%f32,%f32

	fmuld	%f22,%f22,%f20
	fxor	%f26,%f34,%f34

	fmuld	%f0,pp3,%f6

	fmuld	%f10,pp3,%f16

	fmuld	%f20,pp3,%f26

	faddd	%f6,pp2,%f6
	fmuld	%f0,qq3,%f4

	faddd	%f16,pp2,%f16
	fmuld	%f10,qq3,%f14

	faddd	%f26,pp2,%f26
	fmuld	%f20,qq3,%f24

	fmuld	%f0,%f6,%f6
	faddd	%f4,qq2,%f4

	fmuld	%f10,%f16,%f16
	faddd	%f14,qq2,%f14

	fmuld	%f20,%f26,%f26
	faddd	%f24,qq2,%f24

	faddd	%f6,pp1,%f6
	fmuld	%f0,%f4,%f4
	add	%l0,%g1,%l0

	faddd	%f16,pp1,%f16
	fmuld	%f10,%f14,%f14
	add	%l1,%g1,%l1

	faddd	%f26,pp1,%f26
	fmuld	%f20,%f24,%f24
	add	%l2,%g1,%l2

	fmuld	%f0,%f6,%f6
	faddd	%f4,qq1,%f4

	fmuld	%f10,%f16,%f16
	faddd	%f14,qq1,%f14

	fmuld	%f20,%f26,%f26
	faddd	%f24,qq1,%f24

	fmuld	%f2,%f6,%f6
	ldd	[%l0+16],%f8

	fmuld	%f12,%f16,%f16
	ldd	[%l1+16],%f18

	fmuld	%f22,%f26,%f26
	ldd	[%l2+16],%f28

	faddd	%f6,%f30,%f6
	fmuld	%f0,%f4,%f4
	ldd	[%l0+8],%f30

	faddd	%f16,%f32,%f16
	fmuld	%f10,%f14,%f14
	ldd	[%l1+8],%f32

	faddd	%f26,%f34,%f26
	fmuld	%f20,%f24,%f24
	ldd	[%l2+8],%f34

	fmuld	%f8,%f4,%f0		! c * cpoly
	faddd	%f6,%f2,%f6

	fmuld	%f18,%f14,%f10
	faddd	%f16,%f12,%f16

	fmuld	%f28,%f24,%f20
	faddd	%f26,%f22,%f26

	fmuld	%f30,%f6,%f2		! s * spoly

	fmuld	%f32,%f16,%f12

	fmuld	%f34,%f26,%f22

	fmuld	%f8,%f6,%f6		! c * spoly
	fsubd	%f0,%f2,%f2

	fmuld	%f18,%f16,%f16
	fsubd	%f10,%f12,%f12

	fmuld	%f28,%f26,%f26
	fsubd	%f20,%f22,%f22

	fmuld	%f30,%f4,%f4		! s * cpoly
	faddd	%f8,%f2,%f8

	fmuld	%f32,%f14,%f14
	faddd	%f18,%f12,%f18

	fmuld	%f34,%f24,%f24
	faddd	%f28,%f22,%f28

	faddd	%f4,%f6,%f6

	faddd	%f14,%f16,%f16

	faddd	%f24,%f26,%f26

	faddd	%f30,%f6,%f6		! now %f6 = sin |x|, %f8 = cos |x|

	faddd	%f32,%f16,%f16

	faddd	%f34,%f26,%f26

	fnegd	%f8,%f4			! if (n & 1) c = -c
	lda	[%i1]%asi,%l0		! preload next argument
	mov	%i5,%l1

	fnegd	%f18,%f14
	lda	[%i1]%asi,%f0
	sethi	%hi(0x80000000),%i5

	fnegd	%f28,%f24
	lda	[%i1+4]%asi,%f3

	andcc	%l1,1,%g0
	fmovdnz	%icc,%f4,%f8
	st	%f8,[%l4]

	andcc	%g5,1,%g0
	fmovdnz	%icc,%f14,%f18
	st	%f9,[%l4+4]

	andcc	%o7,1,%g0
	fmovdnz	%icc,%f24,%f28
	st	%f18,[%l5]

	fnegd	%f6,%f4			! if (n & 2) s = -s
	st	%f19,[%l5+4]
	andn	%l0,%i5,%l0

	fnegd	%f16,%f14
	st	%f28,[%l6]
	add	%i1,%i2,%i1

	fnegd	%f26,%f24
	st	%f29,[%l6+4]

	andcc	%l1,2,%g0
	fmovdnz	%icc,%f4,%f6
	st	%f6,[%o0]

	andcc	%g5,2,%g0
	fmovdnz	%icc,%f14,%f16
	st	%f16,[%o1]

	andcc	%o7,2,%g0
	fmovdnz	%icc,%f24,%f26
	st	%f26,[%o2]

	addcc	%i0,-1,%i0
	bg,pt	%icc,.loop0
! delay slot
	st	%f7,[%o0+4]

	ba,pt	%icc,.end
! delay slot
	nop


	.align	16
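! .end stores the last two pipelined results, then, if any lane was
! flagged as huge by a .range handler (biguns != 0), re-walks the
! whole vector with __vlibm_vsincos_big, passing the saved argument
! pointers and strides plus the 0x4099251e cutoff, presumably so that
! routine recomputes only the elements skipped here.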
.end:
	st	%f17,[%o1+4]
	st	%f27,[%o2+4]
	ld	[%fp+biguns],%i5
	tst	%i5			! check for huge arguments remaining
	be,pt	%icc,.exit
! delay slot
	nop
#ifdef __sparcv9
	stx	%o5,[%sp+STACK_BIAS+0xb8]
	ldx	[%fp+xsave],%o1
	ldx	[%fp+ssave],%o3
	ldx	[%fp+csave],%o5
	ldx	[%fp+STACK_BIAS+0xb0],%i5
	stx	%i5,[%sp+STACK_BIAS+0xb0]
#else
	st	%o5,[%sp+0x60]
	ld	[%fp+xsave],%o1
	ld	[%fp+ssave],%o3
	ld	[%fp+csave],%o5
	ld	[%fp+0x5c],%i5
	st	%i5,[%sp+0x5c]
#endif
	ld	[%fp+nsave],%o0
	ld	[%fp+sxsave],%o2
	ld	[%fp+sssave],%o4
	sra	%o2,0,%o2		! sign-extend for V9
	call	__vlibm_vsincos_big
	sra	%o4,0,%o4		! delay slot

.exit:
	ret
	restore


	.align	16
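! .last1/.last2 drain the software pipeline when the element count runs
! out mid-group: the missing lanes are replaced with zeros and their
! store pointers aimed at the junk slot, so the shared code at .cont
! can finish the remaining live elements without special cases.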
.last1:
	faddd	%f2,c3two44,%f4
	st	%f17,[%o1+4]
.last1_from_range1:
	mov	0,%l1
	fzeros	%f10
	fzero	%f12
	add	%fp,junk,%o1
	add	%fp,junk,%l5
.last2:
	faddd	%f12,c3two44,%f14
	st	%f27,[%o2+4]
	st	%f5,[%fp+nk0]
	st	%f15,[%fp+nk1]
.last2_from_range2:
	mov	0,%l2
	fzeros	%f20
	fzero	%f22
	add	%fp,junk,%o2
	ba,pt	%icc,.cont
! delay slot
	add	%fp,junk,%l6


	.align	16
.range0:
	cmp	%l0,%o4
	bl,pt	%icc,1f			! hx < 0x3e400000
! delay slot, harmless if branch taken
	sethi	%hi(0x7ff00000),%o7
	cmp	%l0,%o7
	bl,a,pt	%icc,2f			! branch if finite
! delay slot, squashed if branch not taken
	st	%o4,[%fp+biguns]	! set biguns
	fzero	%f0
	fmuld	%f2,%f0,%f2
	st	%f2,[%o0]
	st	%f3,[%o0+4]
	st	%f2,[%l3]
	ba,pt	%icc,2f
! delay slot
	st	%f3,[%l3+4]
1:
	fdtoi	%f2,%f4			! raise inexact if not zero
	st	%f0,[%o0]
	st	%f3,[%o0+4]
	sethi	%hi(0x3ff00000),%g5
	st	%g5,[%l3]
	st	%g0,[%l3+4]
2:
	addcc	%i0,-1,%i0
	ble,pn	%icc,.end
! delay slot, harmless if branch taken
	add	%i3,%i4,%i3		! s += strides
	add	%l3,%l7,%l3		! c += stridec
	andn	%l1,%i5,%l0		! hx &= ~0x80000000
	fmovs	%f10,%f0
	fmovs	%f13,%f3
	ba,pt	%icc,.loop0
! delay slot
	add	%i1,%i2,%i1		! x += stridex


	.align	16
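! .range0/1/2 handle an out-of-range argument in the corresponding lane:
!
!  - hx < 0x3e400000 (|x| < ~2^-27): to double precision sin(x) is x
!    and cos(x) is 1; the fdtoi is there only to raise inexact when x
!    is nonzero.
!  - hx > 0x4099251e, finite: set biguns and store nothing, leaving the
!    element for __vlibm_vsincos_big at .end.
!  - inf or NaN: store x * 0 (NaN; raises invalid for infinities) to
!    both *s and *c.
!
! Each handler then advances the pointers, sets up the next argument,
! and rejoins the loop (or falls into the tail code when the count is
! exhausted).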
.range1:
	cmp	%l1,%o4
	bl,pt	%icc,1f			! hx < 0x3e400000
! delay slot, harmless if branch taken
	sethi	%hi(0x7ff00000),%o7
	cmp	%l1,%o7
	bl,a,pt	%icc,2f			! branch if finite
! delay slot, squashed if branch not taken
	st	%o4,[%fp+biguns]	! set biguns
	fzero	%f10
	fmuld	%f12,%f10,%f12
	st	%f12,[%o1]
	st	%f13,[%o1+4]
	st	%f12,[%l3]
	ba,pt	%icc,2f
! delay slot
	st	%f13,[%l3+4]
1:
	fdtoi	%f12,%f14		! raise inexact if not zero
	st	%f10,[%o1]
	st	%f13,[%o1+4]
	sethi	%hi(0x3ff00000),%g5
	st	%g5,[%l3]
	st	%g0,[%l3+4]
2:
	addcc	%i0,-1,%i0
	ble,pn	%icc,.last1_from_range1
! delay slot, harmless if branch taken
	add	%i3,%i4,%i3		! s += strides
	add	%l3,%l7,%l3		! c += stridec
	andn	%l2,%i5,%l1		! hx &= ~0x80000000
	fmovs	%f20,%f10
	fmovs	%f23,%f13
	ba,pt	%icc,.loop1
! delay slot
	add	%i1,%i2,%i1		! x += stridex


	.align	16
.range2:
	cmp	%l2,%o4
	bl,pt	%icc,1f			! hx < 0x3e400000
! delay slot, harmless if branch taken
	sethi	%hi(0x7ff00000),%o7
	cmp	%l2,%o7
	bl,a,pt	%icc,2f			! branch if finite
! delay slot, squashed if branch not taken
	st	%o4,[%fp+biguns]	! set biguns
	fzero	%f20
	fmuld	%f22,%f20,%f22
	st	%f22,[%o2]
	st	%f23,[%o2+4]
	st	%f22,[%l3]
	ba,pt	%icc,2f
! delay slot
	st	%f23,[%l3+4]
1:
	fdtoi	%f22,%f24		! raise inexact if not zero
	st	%f20,[%o2]
	st	%f23,[%o2+4]
	sethi	%hi(0x3ff00000),%g5
	st	%g5,[%l3]
	st	%g0,[%l3+4]
2:
	addcc	%i0,-1,%i0
	ble,pn	%icc,.last2_from_range2
! delay slot, harmless if branch taken
	add	%i3,%i4,%i3		! s += strides
	add	%l3,%l7,%l3		! c += stridec
	ld	[%i1],%l2
	ld	[%i1],%f20
	ld	[%i1+4],%f23
	andn	%l2,%i5,%l2		! hx &= ~0x80000000
	ba,pt	%icc,.loop2
! delay slot
	add	%i1,%i2,%i1		! x += stridex

	SET_SIZE(__vsincos)