xref: /illumos-gate/usr/src/lib/libmvec/common/vis/__vsincos.S (revision 67fa3f2c31312dc0caac188f568ab1fdc6b27295)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
23 */
24/*
25 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
26 * Use is subject to license terms.
27 */
28
29	.file	"__vsincos.S"
30
31#include "libm.h"
32
/*
 * Read-only table of double-precision constants, each written as a
 * (high word, low word) pair.  The entry code of __vsincos loads the
 * twelve entries, in this order, from byte offsets 0x00..0x58.
 *
 *   3 * 2^44   added to |x| so that the low word of the sum can be stored
 *              and read back as the integer table index k
 *   3 * 2^51   added to x * invpio2 so that the low word of the sum can be
 *              stored and read back as the integer n (.medium path)
 *   invpio2    approximately 2/pi
 *   pio2_1..3  multiplied by n and subtracted in turn from x in the
 *              .medium path (three pieces of pi/2, largest first)
 *   pp1..pp3   approximately -1/6, 1/120, -1/5040; coefficients of the
 *              polynomial labelled "spoly" in the code
 *   qq1..qq3   approximately -1/2, 1/24, -1/720; coefficients of the
 *              polynomial labelled "cpoly" in the code
 */
33	RO_DATA
34	.align	64
35constants:
36	.word	0x42c80000,0x00000000	! 3 * 2^44
37	.word	0x43380000,0x00000000	! 3 * 2^51
38	.word	0x3fe45f30,0x6dc9c883	! invpio2
39	.word	0x3ff921fb,0x54442c00	! pio2_1
40	.word	0x3d318469,0x898cc400	! pio2_2
41	.word	0x3a71701b,0x839a2520	! pio2_3
42	.word	0xbfc55555,0x55555533	! pp1
43	.word	0x3f811111,0x10e7d53b	! pp2
44	.word	0xbf2a0167,0xe6b3cf9b	! pp3
45	.word	0xbfdfffff,0xffffff65	! qq1
46	.word	0x3fa55555,0x54f88ed0	! qq2
47	.word	0xbf56c12c,0xdd185f60	! qq3
48
49! local storage indices
50
/*
 * Offsets from %fp of the temporaries used by __vsincos.
 *
 *   xsave, ssave, csave    copies of the x, s and c pointer arguments
 *   nsave, sxsave, sssave  copies of n, stridex and strides (32-bit words)
 *   biguns                 flag word: cleared at entry, made non-zero by
 *                          the .range* code when it meets a finite
 *                          argument above the main loop's upper bound;
 *                          tested at .end
 *   junk                   8-byte dummy target; the output pointers of
 *                          slots that hold no real element point here
 *   nk0, nk1, nk2          words through which the low word of an FP
 *                          register is passed to an integer register
 *                          (store with st, reload with ld), one per slot
 */
51#define xsave		STACK_BIAS-0x8
52#define ssave		STACK_BIAS-0x10
53#define csave		STACK_BIAS-0x18
54#define nsave		STACK_BIAS-0x1c
55#define sxsave		STACK_BIAS-0x20
56#define sssave		STACK_BIAS-0x24
57#define biguns		STACK_BIAS-0x28
58#define junk		STACK_BIAS-0x30
59#define nk2		STACK_BIAS-0x38
60#define nk1		STACK_BIAS-0x3c
61#define nk0		STACK_BIAS-0x40
62! sizeof temp storage - must be a multiple of 16 for V9
/* tmps is the amount added to the frame size in the save at entry. */
63#define tmps		0x40
64
65! register use
66
67! i0  n
68! i1  x
69! i2  stridex
70! i3  s
71! i4  strides
72! i5  0x80000000,n0
73
74! l0  hx0,k0
75! l1  hx1,k1
76! l2  hx2,k2
77! l3  c
78! l4  pc0
79! l5  pc1
80! l6  pc2
81! l7  stridec
82
83! the following are 64-bit registers in both V8+ and V9
84
85! g1  __vlibm_TBL_sincos2
86! g5  scratch,n1
87
88! o0  ps0
89! o1  ps1
90! o2  ps2
91! o3  0x3fe921fb
92! o4  0x3e400000
93! o5  0x4099251e
94! o7  scratch,n2
95
96! f0  x0,z0
97! f2  abs(x0)
98! f4
99! f6
100! f8
101! f10 x1,z1
102! f12 abs(x1)
103! f14
104! f16
105! f18
106! f20 x2,z2
107! f22 abs(x2)
108! f24
109! f26
110! f28
111! f30
112! f32
113! f34
114! f36
115! f38
116
/*
 * Names for the double-precision registers %f40..%f62.  The entry code
 * of __vsincos loads them from the `constants` table in this same order
 * and the rest of the routine only reads them.
 */
117#define c3two44	%f40
118#define c3two51	%f42
119#define invpio2	%f44
120#define pio2_1	%f46
121#define pio2_2	%f48
122#define pio2_3	%f50
123#define pp1	%f52
124#define pp2	%f54
125#define pp3	%f56
126#define qq1	%f58
127#define qq2	%f60
128#define qq3	%f62
129
/*
 * __vsincos entry and prologue.
 *
 * Incoming registers, per the register-use notes earlier in this file:
 * %i0 = n, %i1 = x, %i2 = stridex, %i3 = s, %i4 = strides.  %i5 arrives
 * holding the c pointer (it is saved as csave and copied to %l3) and
 * stridec is fetched from the caller's stack argument area into %l7.
 *
 * The prologue
 *   - opens a frame with tmps bytes of temporaries,
 *   - puts the address of `constants` in %o0 and the address of
 *     __vlibm_TBL_sincos2 in %g1,
 *   - saves the arguments so that .end can pass them on to
 *     __vlibm_vsincos_big,
 *   - loads the twelve constants and the three integer thresholds,
 *   - converts the strides from elements to bytes (shift left by 3),
 *   - points all three sine output pointers at `junk`, so the deferred
 *     low-word stores made on the first trip through the loop hit
 *     scratch memory,
 *   - loads the first argument and enters .loop0.
 */
130	ENTRY(__vsincos)
131	save	%sp,-SA(MINFRAME)-tmps,%sp
132	PIC_SETUP(l7)
133	PIC_SET(l7,constants,o0)
134	PIC_SET(l7,__vlibm_TBL_sincos2,o1)
135	mov	%o1,%g1
136	wr	%g0,0x82,%asi		! set %asi for non-faulting loads
137#ifdef __sparcv9
138	stx	%i1,[%fp+xsave]		! save arguments
139	stx	%i3,[%fp+ssave]
140	stx	%i5,[%fp+csave]
141	ldx	[%fp+STACK_BIAS+0xb0],%l7
142#else
143	st	%i1,[%fp+xsave]		! save arguments
144	st	%i3,[%fp+ssave]
145	st	%i5,[%fp+csave]
146	ld	[%fp+0x5c],%l7
147#endif
148	st	%i0,[%fp+nsave]
149	st	%i2,[%fp+sxsave]
150	st	%i4,[%fp+sssave]
151	mov	%i5,%l3
152	st	%g0,[%fp+biguns]	! biguns = 0
153	ldd	[%o0+0x00],c3two44	! load/set up constants
154	ldd	[%o0+0x08],c3two51
155	ldd	[%o0+0x10],invpio2
156	ldd	[%o0+0x18],pio2_1
157	ldd	[%o0+0x20],pio2_2
158	ldd	[%o0+0x28],pio2_3
159	ldd	[%o0+0x30],pp1
160	ldd	[%o0+0x38],pp2
161	ldd	[%o0+0x40],pp3
162	ldd	[%o0+0x48],qq1
163	ldd	[%o0+0x50],qq2
164	ldd	[%o0+0x58],qq3
/*
 * From here on %i5 is the sign-bit mask.  %o4 and %o5 are the lower and
 * upper bounds on the sign-cleared high word of x for the main loop
 * (0x3e400000 is the high word of 2^-27); %o3 is the bound above which
 * .cont switches to the .medium path (0x3fe921fb is the high word of
 * pi/4).
 */
165	sethi	%hi(0x80000000),%i5
166	sethi	%hi(0x3e400000),%o4
167	sethi	%hi(0x3fe921fb),%o3
168	or	%o3,%lo(0x3fe921fb),%o3
169	sethi	%hi(0x4099251e),%o5
170	or	%o5,%lo(0x4099251e),%o5
171	sll	%i2,3,%i2		! scale strides
172	sll	%i4,3,%i4
173	sll	%l7,3,%l7
174	add	%fp,junk,%o0		! loop prologue
175	add	%fp,junk,%o1
176	add	%fp,junk,%o2
/*
 * First argument: %l0 = high word (integer copy), %f0 = high word,
 * %f3 = low word.  The low word is kept in the odd half of the next
 * register pair so that fabss %f0,%f2 later yields |x| in %f2:%f3.
 */
177	ld	[%i1],%l0		! *x
178	ld	[%i1],%f0
179	ld	[%i1+4],%f3
180	andn	%l0,%i5,%l0		! mask off sign
181	ba	.loop0
182	add	%i1,%i2,%i1		! x += stridex
183
184! 16-byte aligned
185	.align	16
/*
 * Main loop.  Each trip handles up to three elements, held in slots
 * 0, 1 and 2 (registers %f0.., %f10.., %f20.. and %l0, %l1, %l2).
 *
 * .loop0, .loop1 and .loop2 each admit one element: range-check its
 * sign-cleared high word hx, record its output pointers (ps = sine,
 * pc = cosine), advance s and c, and start the index computation
 * |x| + 3*2^44.  .loop0 and .loop1 also preload the following argument
 * through %asi (set at entry for non-faulting loads), which is why the
 * element after the last one may be read.
 *
 * An element that fails the range check goes to .range0/1/2.  When n
 * runs out after slot 0 or slot 1, .last1/.last2 pad the remaining
 * slots.
 *
 * Stores of the low result words for slots 1 and 2 are deferred: %f17
 * and %f27 from the previous trip are written by the st instructions at
 * the end of .loop0 and .loop1 here (or at .end).  On the first trip
 * %o1 and %o2 still point at junk.
 */
186.loop0:
187	lda	[%i1]%asi,%l1		! preload next argument
188	sub	%l0,%o4,%g5
189	sub	%o5,%l0,%o7
190	fabss	%f0,%f2
191
192	lda	[%i1]%asi,%f10
/* either difference negative makes the OR negative: out of range */
193	orcc	%o7,%g5,%g0
194	mov	%i3,%o0			! ps0 = s
195	bl,pn	%icc,.range0		! hx < 0x3e400000 or hx > 0x4099251e
196
197! delay slot
198	lda	[%i1+4]%asi,%f13
199	addcc	%i0,-1,%i0
200	add	%i3,%i4,%i3		! s += strides
201
202	mov	%l3,%l4			! pc0 = c
203	add	%l3,%l7,%l3		! c += stridec
/* condition codes are still those of the n decrement above */
204	ble,pn	%icc,.last1
205
206! delay slot
207	andn	%l1,%i5,%l1
208	add	%i1,%i2,%i1		! x += stridex
209	faddd	%f2,c3two44,%f4
210	st	%f17,[%o1+4]
211
/* Slot 1: same steps as slot 0, using %l1, %f10..%f14, ps1 and pc1. */
212.loop1:
213	lda	[%i1]%asi,%l2		! preload next argument
214	sub	%l1,%o4,%g5
215	sub	%o5,%l1,%o7
216	fabss	%f10,%f12
217
218	lda	[%i1]%asi,%f20
219	orcc	%o7,%g5,%g0
220	mov	%i3,%o1			! ps1 = s
221	bl,pn	%icc,.range1		! hx < 0x3e400000 or hx > 0x4099251e
222
223! delay slot
224	lda	[%i1+4]%asi,%f23
225	addcc	%i0,-1,%i0
226	add	%i3,%i4,%i3		! s += strides
227
228	mov	%l3,%l5			! pc1 = c
229	add	%l3,%l7,%l3		! c += stridec
230	ble,pn	%icc,.last2
231
232! delay slot
233	andn	%l2,%i5,%l2
234	add	%i1,%i2,%i1		! x += stridex
235	faddd	%f12,c3two44,%f14
236	st	%f27,[%o2+4]
237
/*
 * Slot 2: range check only; there is no preload here.  The low words of
 * the slot 0 and slot 1 index sums are written to nk0 and nk1 on the
 * way.
 */
238.loop2:
239	sub	%l2,%o4,%g5
240	sub	%o5,%l2,%o7
241	fabss	%f20,%f22
242	st	%f5,[%fp+nk0]
243
244	orcc	%o7,%g5,%g0
245	mov	%i3,%o2			! ps2 = s
246	bl,pn	%icc,.range2		! hx < 0x3e400000 or hx > 0x4099251e
247! delay slot
248	st	%f15,[%fp+nk1]
249
250	mov	%l3,%l6			! pc2 = c
251
/*
 * .cont: all three slots are loaded (real elements, or zero padding
 * from .last1/.last2).  Finish the slot 2 bookkeeping, then decide
 * between the direct path and .medium: %o3 - hx is negative for any
 * slot whose hx exceeds 0x3fe921fb, and the OR of the three differences
 * carries that sign.  The fmovs instructions copy each low word next to
 * its high word so %f0, %f10 and %f20 hold the signed arguments as
 * doubles; x * invpio2 is computed here for use by .medium.
 */
252.cont:
253	add	%i3,%i4,%i3		! s += strides
254	add	%l3,%l7,%l3		! c += stridec
255	faddd	%f22,c3two44,%f24
256	st	%f25,[%fp+nk2]
257
258	sub	%o3,%l0,%l0
259	sub	%o3,%l1,%l1
260	fmovs	%f3,%f1
261
262	sub	%o3,%l2,%l2
263	fmovs	%f13,%f11
264
265	or	%l0,%l1,%l0
266	orcc	%l0,%l2,%g0
267	fmovs	%f23,%f21
268
269	fmuld	%f0,invpio2,%f6		! x * invpio2, for medium range
270
271	fmuld	%f10,invpio2,%f16
272	ld	[%fp+nk0],%l0
273
274	fmuld	%f20,invpio2,%f26
275	ld	[%fp+nk1],%l1
276
277	bl,pn	%icc,.medium
278! delay slot
279	ld	[%fp+nk2],%l2
280
/*
 * Direct path.  k is scaled by 32, the size of one table entry; this
 * code reads three doubles from each entry: offset 0 is subtracted from
 * |x|, offset 8 is used as "s" and offset 16 as "c" below.  The fcmpd
 * results record the sign of each argument for the final fmovdl.
 */
281	sll	%l0,5,%l0		! k
282	fcmpd	%fcc0,%f0,pio2_3	! x < pio2_3 iff x < 0
283
284	sll	%l1,5,%l1
285	ldd	[%l0+%g1],%f4
286	fcmpd	%fcc1,%f10,pio2_3
287
288	sll	%l2,5,%l2
289	ldd	[%l1+%g1],%f14
290	fcmpd	%fcc2,%f20,pio2_3
291
292	ldd	[%l2+%g1],%f24
293
294	fsubd	%f2,%f4,%f2		! x -= __vlibm_TBL_sincos2[k]
295
296	fsubd	%f12,%f14,%f12
297
298	fsubd	%f22,%f24,%f22
299
300	fmuld	%f2,%f2,%f0		! z = x * x
301
302	fmuld	%f12,%f12,%f10
303
304	fmuld	%f22,%f22,%f20
305
/*
 * Polynomials, evaluated for the three slots in interleaved fashion:
 *   spoly = x + x * z * (pp1 + z * (pp2 + z * pp3))   in %f6, %f16, %f26
 *   cpoly =         z * (qq1 + z * (qq2 + z * qq3))   in %f4, %f14, %f24
 * %l0..%l2 are turned into table entry addresses along the way.
 */
306	fmuld	%f0,pp3,%f6
307
308	fmuld	%f10,pp3,%f16
309
310	fmuld	%f20,pp3,%f26
311
312	faddd	%f6,pp2,%f6
313	fmuld	%f0,qq3,%f4
314
315	faddd	%f16,pp2,%f16
316	fmuld	%f10,qq3,%f14
317
318	faddd	%f26,pp2,%f26
319	fmuld	%f20,qq3,%f24
320
321	fmuld	%f0,%f6,%f6
322	faddd	%f4,qq2,%f4
323
324	fmuld	%f10,%f16,%f16
325	faddd	%f14,qq2,%f14
326
327	fmuld	%f20,%f26,%f26
328	faddd	%f24,qq2,%f24
329
330	faddd	%f6,pp1,%f6
331	fmuld	%f0,%f4,%f4
332	add	%l0,%g1,%l0
333
334	faddd	%f16,pp1,%f16
335	fmuld	%f10,%f14,%f14
336	add	%l1,%g1,%l1
337
338	faddd	%f26,pp1,%f26
339	fmuld	%f20,%f24,%f24
340	add	%l2,%g1,%l2
341
342	fmuld	%f0,%f6,%f6
343	faddd	%f4,qq1,%f4
344
345	fmuld	%f10,%f16,%f16
346	faddd	%f14,qq1,%f14
347
348	fmuld	%f20,%f26,%f26
349	faddd	%f24,qq1,%f24
350
351	fmuld	%f2,%f6,%f6
352	ldd	[%l0+8],%f8
353
354	fmuld	%f12,%f16,%f16
355	ldd	[%l1+8],%f18
356
357	fmuld	%f22,%f26,%f26
358	ldd	[%l2+8],%f28
359
360	faddd	%f6,%f2,%f6
361	fmuld	%f0,%f4,%f4
362	ldd	[%l0+16],%f30
363
364	faddd	%f16,%f12,%f16
365	fmuld	%f10,%f14,%f14
366	ldd	[%l1+16],%f32
367
368	faddd	%f26,%f22,%f26
369	fmuld	%f20,%f24,%f24
370	ldd	[%l2+16],%f34
371
/*
 * Combine with the table values s (%f8, %f18, %f28) and c (%f30, %f32,
 * %f34):
 *   cosine = c + (c * cpoly - s * spoly)   -> %f2, %f12, %f22, stored to pc
 *   sine   = s + (c * spoly + s * cpoly)   -> %f6, %f16, %f26
 */
372	fmuld	%f8,%f6,%f0		! s * spoly
373
374	fmuld	%f18,%f16,%f10
375
376	fmuld	%f28,%f26,%f20
377
378	fmuld	%f30,%f4,%f2		! c * cpoly
379
380	fmuld	%f32,%f14,%f12
381
382	fmuld	%f34,%f24,%f22
383
384	fmuld	%f30,%f6,%f6		! c * spoly
385	fsubd	%f2,%f0,%f2
386
387	fmuld	%f32,%f16,%f16
388	fsubd	%f12,%f10,%f12
389
390	fmuld	%f34,%f26,%f26
391	fsubd	%f22,%f20,%f22
392
393	fmuld	%f8,%f4,%f4		! s * cpoly
394	faddd	%f2,%f30,%f2
395	st	%f2,[%l4]
396
397	fmuld	%f18,%f14,%f14
398	faddd	%f12,%f32,%f12
399	st	%f3,[%l4+4]
400
401	fmuld	%f28,%f24,%f24
402	faddd	%f22,%f34,%f22
403	st	%f12,[%l5]
404
405	faddd	%f6,%f4,%f6
406	st	%f13,[%l5+4]
407
408	faddd	%f16,%f14,%f16
409	st	%f22,[%l6]
410
411	faddd	%f26,%f24,%f26
412	st	%f23,[%l6+4]
413
414	faddd	%f6,%f8,%f6
415
416	faddd	%f16,%f18,%f16
417
418	faddd	%f26,%f28,%f26
419
/*
 * Form the negated sines and, in the shadow of that, load the argument
 * for slot 0 of the next trip in the same layout the prologue used.
 */
420	fnegd	%f6,%f4
421	lda	[%i1]%asi,%l0		! preload next argument
422
423	fnegd	%f16,%f14
424	lda	[%i1]%asi,%f0
425
426	fnegd	%f26,%f24
427	lda	[%i1+4]%asi,%f3
428	andn	%l0,%i5,%l0
429	add	%i1,%i2,%i1
430
431	fmovdl	%fcc0,%f4,%f6		! (hx < -0)? -s : s
432	st	%f6,[%o0]
433
434	fmovdl	%fcc1,%f14,%f16
435	st	%f16,[%o1]
436
437	fmovdl	%fcc2,%f24,%f26
438	st	%f26,[%o2]
/* third decrement of n for this trip (slot 2) */
439	addcc	%i0,-1,%i0
440
441	bg,pt	%icc,.loop0
442! delay slot
443	st	%f7,[%o0+4]
444
445	ba,pt	%icc,.end
446! delay slot
447	nop
448
449
450	.align	16
/*
 * .medium: taken from .cont when at least one of the three slots has
 * hx > 0x3fe921fb.  All three slots are processed here.
 *
 * On entry %f0, %f10, %f20 hold the arguments and %f6, %f16, %f26 hold
 * x * invpio2.  Outline:
 *   1. n = nearest integer to x * invpio2, obtained by adding and
 *      subtracting 3*2^51; the integer copy goes through nk0..nk2.
 *   2. reduced x = x - n*pio2_1 - n*pio2_2 - n*pio2_3, with the rounding
 *      error of the second subtraction folded into the third term.
 *   3. if n is odd, the sine and cosine output pointers of that slot are
 *      exchanged.
 *   4. |reduced x| is evaluated by the same table-plus-polynomial scheme
 *      as the direct path, with a correction term w added to the sine
 *      polynomial.
 *   5. sign fix-ups from the transformed n, then the stores.
 *
 * %i5, %g5 and %o7 hold n for slots 0, 1 and 2 and %o4 is scratch in
 * this section; %o4 and %i5 are reloaded with 0x3e400000 and 0x80000000
 * before control returns to the loop.
 */
451.medium:
452	faddd	%f6,c3two51,%f4
453	st	%f5,[%fp+nk0]
454
455	faddd	%f16,c3two51,%f14
456	st	%f15,[%fp+nk1]
457
458	faddd	%f26,c3two51,%f24
459	st	%f25,[%fp+nk2]
460
461	fsubd	%f4,c3two51,%f6
462
463	fsubd	%f14,c3two51,%f16
464
465	fsubd	%f24,c3two51,%f26
466
467	fmuld	%f6,pio2_1,%f2
468	ld	[%fp+nk0],%i5		! n
469
470	fmuld	%f16,pio2_1,%f12
471	ld	[%fp+nk1],%g5
472
473	fmuld	%f26,pio2_1,%f22
474	ld	[%fp+nk2],%o7
475
476	fsubd	%f0,%f2,%f0
477	fmuld	%f6,pio2_2,%f4
478	mov	%o0,%o4			! if (n & 1) swap ps, pc
479	andcc	%i5,1,%g0
480
481	fsubd	%f10,%f12,%f10
482	fmuld	%f16,pio2_2,%f14
483	movnz	%icc,%l4,%o0
484	and	%i5,3,%i5
485
486	fsubd	%f20,%f22,%f20
487	fmuld	%f26,pio2_2,%f24
488	movnz	%icc,%o4,%l4
489
490	fsubd	%f0,%f4,%f30
491	mov	%o1,%o4
492	andcc	%g5,1,%g0
493
494	fsubd	%f10,%f14,%f32
495	movnz	%icc,%l5,%o1
496	and	%g5,3,%g5
497
498	fsubd	%f20,%f24,%f34
499	movnz	%icc,%o4,%l5
500
/*
 * %f30, %f32, %f34 = x - n*pio2_1 - n*pio2_2.  The next two subtractions
 * per slot recover the rounding error of that last step in %f0, %f10,
 * %f20.  fcmple32 leaves its result mask in %l0..%l2 for the
 * transformation of n below.
 */
501	fsubd	%f0,%f30,%f0
502	fcmple32 %f30,pio2_3,%l0	! x <= pio2_3 iff x < 0
503	mov	%o2,%o4
504	andcc	%o7,1,%g0
505
506	fsubd	%f10,%f32,%f10
507	fcmple32 %f32,pio2_3,%l1
508	movnz	%icc,%l6,%o2
509	and	%o7,3,%o7
510
511	fsubd	%f20,%f34,%f20
512	fcmple32 %f34,pio2_3,%l2
513	movnz	%icc,%o4,%l6
514
515	fsubd	%f0,%f4,%f0
516	fmuld	%f6,pio2_3,%f6
517	add	%i5,1,%o4		! n = (n >> 1) | (((n + 1) ^ l) & 2)
518	srl	%i5,1,%i5
519
520	fsubd	%f10,%f14,%f10
521	fmuld	%f16,pio2_3,%f16
522	xor	%o4,%l0,%o4
523
524	fsubd	%f20,%f24,%f20
525	fmuld	%f26,pio2_3,%f26
526	and	%o4,2,%o4
527
528	fsubd	%f6,%f0,%f6
529	or	%i5,%o4,%i5
530
531	fsubd	%f16,%f10,%f16
532	add	%g5,1,%o4
533	srl	%g5,1,%g5
534
535	fsubd	%f26,%f20,%f26
536	xor	%o4,%l1,%o4
537
538	fsubd	%f30,%f6,%f0		! reduced x
539	and	%o4,2,%o4
540
541	fsubd	%f32,%f16,%f10
542	or	%g5,%o4,%g5
543
544	fsubd	%f34,%f26,%f20
545	add	%o7,1,%o4
546	srl	%o7,1,%o7
547
/* %f38 becomes the sign-bit mask: zero here, negated a few lines on */
548	fzero	%f38
549	xor	%o4,%l2,%o4
550
551	fabsd	%f0,%f2
552	and	%o4,2,%o4
553
554	fabsd	%f10,%f12
555	or	%o7,%o4,%o7
556
557	fabsd	%f20,%f22
/* %o4 is no longer scratch: put the lower range bound back */
558	sethi	%hi(0x3e400000),%o4
559
560	fnegd	%f38,%f38
561
/* table index k for |reduced x|, passed through nk0..nk2 as before */
562	faddd	%f2,c3two44,%f4
563	st	%f5,[%fp+nk0]
564
565	faddd	%f12,c3two44,%f14
566	st	%f15,[%fp+nk1]
567
568	faddd	%f22,c3two44,%f24
569	st	%f25,[%fp+nk2]
570
571	fsubd	%f30,%f0,%f4
572
573	fsubd	%f32,%f10,%f14
574
575	fsubd	%f34,%f20,%f24
576
577	fsubd	%f4,%f6,%f6		! w
578	ld	[%fp+nk0],%l0
579
580	fsubd	%f14,%f16,%f16
581	ld	[%fp+nk1],%l1
582
583	fsubd	%f24,%f26,%f26
584	ld	[%fp+nk2],%l2
585	sll	%l0,5,%l0		! k
586
587	fand	%f0,%f38,%f30		! sign bit of x
588	ldd	[%l0+%g1],%f4
589	sll	%l1,5,%l1
590
591	fand	%f10,%f38,%f32
592	ldd	[%l1+%g1],%f14
593	sll	%l2,5,%l2
594
595	fand	%f20,%f38,%f34
596	ldd	[%l2+%g1],%f24
597
598	fsubd	%f2,%f4,%f2		! x -= __vlibm_TBL_sincos2[k]
599
600	fsubd	%f12,%f14,%f12
601
602	fsubd	%f22,%f24,%f22
603
/*
 * The fxor gives w the sign bit of the reduced x XORed in, so that the
 * correction matches the use of |reduced x|; the result replaces the
 * sign mask in %f30, %f32, %f34.
 */
604	fmuld	%f2,%f2,%f0		! z = x * x
605	fxor	%f6,%f30,%f30
606
607	fmuld	%f12,%f12,%f10
608	fxor	%f16,%f32,%f32
609
610	fmuld	%f22,%f22,%f20
611	fxor	%f26,%f34,%f34
612
/*
 * Polynomials as in the direct path, except that the adjusted w is
 * added to x * z * (pp1 + z * (pp2 + z * pp3)) before x itself is added.
 */
613	fmuld	%f0,pp3,%f6
614
615	fmuld	%f10,pp3,%f16
616
617	fmuld	%f20,pp3,%f26
618
619	faddd	%f6,pp2,%f6
620	fmuld	%f0,qq3,%f4
621
622	faddd	%f16,pp2,%f16
623	fmuld	%f10,qq3,%f14
624
625	faddd	%f26,pp2,%f26
626	fmuld	%f20,qq3,%f24
627
628	fmuld	%f0,%f6,%f6
629	faddd	%f4,qq2,%f4
630
631	fmuld	%f10,%f16,%f16
632	faddd	%f14,qq2,%f14
633
634	fmuld	%f20,%f26,%f26
635	faddd	%f24,qq2,%f24
636
637	faddd	%f6,pp1,%f6
638	fmuld	%f0,%f4,%f4
639	add	%l0,%g1,%l0
640
641	faddd	%f16,pp1,%f16
642	fmuld	%f10,%f14,%f14
643	add	%l1,%g1,%l1
644
645	faddd	%f26,pp1,%f26
646	fmuld	%f20,%f24,%f24
647	add	%l2,%g1,%l2
648
649	fmuld	%f0,%f6,%f6
650	faddd	%f4,qq1,%f4
651
652	fmuld	%f10,%f16,%f16
653	faddd	%f14,qq1,%f14
654
655	fmuld	%f20,%f26,%f26
656	faddd	%f24,qq1,%f24
657
/*
 * Here the table value at offset 16 ("c") goes to %f8, %f18, %f28 and
 * the value at offset 8 ("s") to %f30, %f32, %f34.
 */
658	fmuld	%f2,%f6,%f6
659	ldd	[%l0+16],%f8
660
661	fmuld	%f12,%f16,%f16
662	ldd	[%l1+16],%f18
663
664	fmuld	%f22,%f26,%f26
665	ldd	[%l2+16],%f28
666
667	faddd	%f6,%f30,%f6
668	fmuld	%f0,%f4,%f4
669	ldd	[%l0+8],%f30
670
671	faddd	%f16,%f32,%f16
672	fmuld	%f10,%f14,%f14
673	ldd	[%l1+8],%f32
674
675	faddd	%f26,%f34,%f26
676	fmuld	%f20,%f24,%f24
677	ldd	[%l2+8],%f34
678
679	fmuld	%f8,%f4,%f0		! c * cpoly
680	faddd	%f6,%f2,%f6
681
682	fmuld	%f18,%f14,%f10
683	faddd	%f16,%f12,%f16
684
685	fmuld	%f28,%f24,%f20
686	faddd	%f26,%f22,%f26
687
688	fmuld	%f30,%f6,%f2		! s * spoly
689
690	fmuld	%f32,%f16,%f12
691
692	fmuld	%f34,%f26,%f22
693
694	fmuld	%f8,%f6,%f6		! c * spoly
695	fsubd	%f0,%f2,%f2
696
697	fmuld	%f18,%f16,%f16
698	fsubd	%f10,%f12,%f12
699
700	fmuld	%f28,%f26,%f26
701	fsubd	%f20,%f22,%f22
702
703	fmuld	%f30,%f4,%f4		! s * cpoly
704	faddd	%f8,%f2,%f8
705
706	fmuld	%f32,%f14,%f14
707	faddd	%f18,%f12,%f18
708
709	fmuld	%f34,%f24,%f24
710	faddd	%f28,%f22,%f28
711
712	faddd	%f4,%f6,%f6
713
714	faddd	%f14,%f16,%f16
715
716	faddd	%f24,%f26,%f26
717
718	faddd	%f30,%f6,%f6		! now %f6 = sin |x|, %f8 = cos |x|
719
720	faddd	%f32,%f16,%f16
721
722	faddd	%f34,%f26,%f26
723
/*
 * Sign fix-ups and stores, interleaved with the preload of the next
 * slot 0 argument.  Slot 0's n is moved to %l1 so that %i5 can get the
 * sign mask back.  Cosine-side results go through %l4..%l6 and
 * sine-side results through %o0..%o2, which were exchanged above for
 * odd n.
 */
724	fnegd	%f8,%f4			! if (n & 1) c = -c
725	lda	[%i1]%asi,%l0		! preload next argument
726	mov	%i5,%l1
727
728	fnegd	%f18,%f14
729	lda	[%i1]%asi,%f0
730	sethi	%hi(0x80000000),%i5
731
732	fnegd	%f28,%f24
733	lda	[%i1+4]%asi,%f3
734
735	andcc	%l1,1,%g0
736	fmovdnz	%icc,%f4,%f8
737	st	%f8,[%l4]
738
739	andcc	%g5,1,%g0
740	fmovdnz	%icc,%f14,%f18
741	st	%f9,[%l4+4]
742
743	andcc	%o7,1,%g0
744	fmovdnz	%icc,%f24,%f28
745	st	%f18,[%l5]
746
747	fnegd	%f6,%f4			! if (n & 2) s = -s
748	st	%f19,[%l5+4]
749	andn	%l0,%i5,%l0
750
751	fnegd	%f16,%f14
752	st	%f28,[%l6]
753	add	%i1,%i2,%i1
754
755	fnegd	%f26,%f24
756	st	%f29,[%l6+4]
757
758	andcc	%l1,2,%g0
759	fmovdnz	%icc,%f4,%f6
760	st	%f6,[%o0]
761
762	andcc	%g5,2,%g0
763	fmovdnz	%icc,%f14,%f16
764	st	%f16,[%o1]
765
766	andcc	%o7,2,%g0
767	fmovdnz	%icc,%f24,%f26
768	st	%f26,[%o2]
769
/*
 * As in the direct path, the low words %f17 and %f27 are left for the
 * deferred stores in .loop0/.loop1 or .end.
 */
770	addcc	%i0,-1,%i0
771	bg,pt	%icc,.loop0
772! delay slot
773	st	%f7,[%o0+4]
774
775	ba,pt	%icc,.end
776! delay slot
777	nop
778
779
780	.align	16
/*
 * .end: common exit.  First perform the two deferred low-word stores
 * for slots 1 and 2 (harmless writes to junk when those slots were
 * padding).  Then, if biguns was set by one of the .range* blocks,
 * reload the arguments saved at entry and call __vlibm_vsincos_big with
 * them: n, x, stridex, s, strides, c in %o0..%o5 and stridec copied
 * from the caller's stack slot into the matching outgoing slot.
 *
 * Before %o5 is reloaded, its current value (the 0x4099251e bound set
 * at entry) is stored in the outgoing stack slot after stridec;
 * presumably that is an eighth argument of __vlibm_vsincos_big -
 * confirm against its definition.
 *
 * The call returns to .exit, which pops the register window.
 */
781.end:
782	st	%f17,[%o1+4]
783	st	%f27,[%o2+4]
784	ld	[%fp+biguns],%i5
785	tst	%i5			! check for huge arguments remaining
786	be,pt	%icc,.exit
787! delay slot
788	nop
789#ifdef __sparcv9
790	stx	%o5,[%sp+STACK_BIAS+0xb8]
791	ldx	[%fp+xsave],%o1
792	ldx	[%fp+ssave],%o3
793	ldx	[%fp+csave],%o5
794	ldx	[%fp+STACK_BIAS+0xb0],%i5
795	stx	%i5,[%sp+STACK_BIAS+0xb0]
796#else
797	st	%o5,[%sp+0x60]
798	ld	[%fp+xsave],%o1
799	ld	[%fp+ssave],%o3
800	ld	[%fp+csave],%o5
801	ld	[%fp+0x5c],%i5
802	st	%i5,[%sp+0x5c]
803#endif
804	ld	[%fp+nsave],%o0
805	ld	[%fp+sxsave],%o2
806	ld	[%fp+sssave],%o4
807	sra	%o2,0,%o2		! sign-extend for V9
808	call	__vlibm_vsincos_big
809	sra	%o4,0,%o4		! delay slot
810
811.exit:
812	ret
813	restore
814
815
816	.align	16
/*
 * Tail handling: n ran out before all three slots were filled.  The
 * empty slots are given a zero argument and output pointers aimed at
 * junk, and control joins the common code at .cont.
 *
 *   .last1             from .loop0 (slot 0 was the last element); first
 *                      redoes the two instructions of .loop0 that the
 *                      branch skipped
 *   .last1_from_range1 from .range1; pads slot 1
 *   .last2             from .loop1 (slot 1 was the last element), or by
 *                      fall-through; redoes the instructions of .loop1
 *                      and .loop2 that were skipped (index sum for
 *                      slot 1, deferred store, nk0/nk1 stores)
 *   .last2_from_range2 from .range2; pads slot 2
 */
817.last1:
818	faddd	%f2,c3two44,%f4
819	st	%f17,[%o1+4]
820.last1_from_range1:
821	mov	0,%l1
822	fzeros	%f10
823	fzero	%f12
824	add	%fp,junk,%o1
825	add	%fp,junk,%l5
826.last2:
827	faddd	%f12,c3two44,%f14
828	st	%f27,[%o2+4]
829	st	%f5,[%fp+nk0]
830	st	%f15,[%fp+nk1]
831.last2_from_range2:
832	mov	0,%l2
833	fzeros	%f20
834	fzero	%f22
835	add	%fp,junk,%o2
836	ba,pt	%icc,.cont
837! delay slot
838	add	%fp,junk,%l6
839
840
841	.align	16
/*
 * .range0: the slot 0 argument failed the range check in .loop0.
 * %l0 = hx (sign-cleared high word), %f0/%f3 = high/low words of x,
 * %f2:%f3 = |x|, %o0 = sine pointer and %l3 = cosine pointer for this
 * element.  n, s and c have not been advanced yet.
 *
 *   hx < 0x3e400000 (label 1): store x as the sine and 1.0 as the
 *       cosine.
 *   hx < 0x7ff00000: finite; nothing is stored here, biguns is set
 *       (the annulled delay slot runs only when the branch is taken) so
 *       that .end calls __vlibm_vsincos_big.
 *   otherwise (infinity or NaN): store |x| * 0 to both outputs.
 *
 * Label 2 then counts the element, advances s and c, and moves the
 * argument preloaded for slot 1 into slot 0 before retrying .loop0.
 */
842.range0:
843	cmp	%l0,%o4
844	bl,pt	%icc,1f			! hx < 0x3e400000
845! delay slot, harmless if branch taken
846	sethi	%hi(0x7ff00000),%o7
847	cmp	%l0,%o7
848	bl,a,pt	%icc,2f			! branch if finite
849! delay slot, squashed if branch not taken
850	st	%o4,[%fp+biguns]	! set biguns
851	fzero	%f0
852	fmuld	%f2,%f0,%f2
853	st	%f2,[%o0]
854	st	%f3,[%o0+4]
855	st	%f2,[%l3]
856	ba,pt	%icc,2f
857! delay slot
858	st	%f3,[%l3+4]
8591:
860	fdtoi	%f2,%f4			! raise inexact if not zero
861	st	%f0,[%o0]
862	st	%f3,[%o0+4]
/* cosine = 1.0: high word 0x3ff00000, low word 0 */
863	sethi	%hi(0x3ff00000),%g5
864	st	%g5,[%l3]
865	st	%g0,[%l3+4]
8662:
867	addcc	%i0,-1,%i0
868	ble,pn	%icc,.end
869! delay slot, harmless if branch taken
870	add	%i3,%i4,%i3		! s += strides
871	add	%l3,%l7,%l3		! c += stridec
872	andn	%l1,%i5,%l0		! hx &= ~0x80000000
873	fmovs	%f10,%f0
874	fmovs	%f13,%f3
875	ba,pt	%icc,.loop0
876! delay slot
877	add	%i1,%i2,%i1		! x += stridex
878
879
880	.align	16
/*
 * .range1: the slot 1 argument failed the range check in .loop1.
 * %l1 = hx, %f10/%f13 = high/low words of x, %f12:%f13 = |x|,
 * %o1 = sine pointer and %l3 = cosine pointer for this element.
 *
 *   hx < 0x3e400000 (label 1): sine = x, cosine = 1.0.
 *   hx < 0x7ff00000: finite; only biguns is set, for .end to act on.
 *   otherwise (infinity or NaN): store |x| * 0 to both outputs.
 *
 * Label 2 counts the element and advances s and c.  If elements remain,
 * the argument preloaded for slot 2 becomes the new slot 1 candidate
 * and .loop1 is retried; if not, .last1_from_range1 pads slots 1 and 2.
 */
881.range1:
882	cmp	%l1,%o4
883	bl,pt	%icc,1f			! hx < 0x3e400000
884! delay slot, harmless if branch taken
885	sethi	%hi(0x7ff00000),%o7
886	cmp	%l1,%o7
887	bl,a,pt	%icc,2f			! branch if finite
888! delay slot, squashed if branch not taken
889	st	%o4,[%fp+biguns]	! set biguns
890	fzero	%f10
891	fmuld	%f12,%f10,%f12
892	st	%f12,[%o1]
893	st	%f13,[%o1+4]
894	st	%f12,[%l3]
895	ba,pt	%icc,2f
896! delay slot
897	st	%f13,[%l3+4]
8981:
899	fdtoi	%f12,%f14		! raise inexact if not zero
900	st	%f10,[%o1]
901	st	%f13,[%o1+4]
/* cosine = 1.0: high word 0x3ff00000, low word 0 */
902	sethi	%hi(0x3ff00000),%g5
903	st	%g5,[%l3]
904	st	%g0,[%l3+4]
9052:
906	addcc	%i0,-1,%i0
907	ble,pn	%icc,.last1_from_range1
908! delay slot, harmless if branch taken
909	add	%i3,%i4,%i3		! s += strides
910	add	%l3,%l7,%l3		! c += stridec
911	andn	%l2,%i5,%l1		! hx &= ~0x80000000
912	fmovs	%f20,%f10
913	fmovs	%f23,%f13
914	ba,pt	%icc,.loop1
915! delay slot
916	add	%i1,%i2,%i1		! x += stridex
917
918
919	.align	16
/*
 * .range2: the slot 2 argument failed the range check in .loop2.
 * %l2 = hx, %f20/%f23 = high/low words of x, %f22:%f23 = |x|,
 * %o2 = sine pointer and %l3 = cosine pointer for this element.
 *
 *   hx < 0x3e400000 (label 1): sine = x, cosine = 1.0.
 *   hx < 0x7ff00000: finite; only biguns is set, for .end to act on.
 *   otherwise (infinity or NaN): store |x| * 0 to both outputs.
 *
 * Label 2 counts the element and advances s and c.  .loop2 does no
 * preloading, so when elements remain the next argument is loaded here
 * with ordinary loads (n > 0 at this point) and .loop2 is retried; when
 * none remain, .last2_from_range2 pads slot 2.
 */
920.range2:
921	cmp	%l2,%o4
922	bl,pt	%icc,1f			! hx < 0x3e400000
923! delay slot, harmless if branch taken
924	sethi	%hi(0x7ff00000),%o7
925	cmp	%l2,%o7
926	bl,a,pt	%icc,2f			! branch if finite
927! delay slot, squashed if branch not taken
928	st	%o4,[%fp+biguns]	! set biguns
929	fzero	%f20
930	fmuld	%f22,%f20,%f22
931	st	%f22,[%o2]
932	st	%f23,[%o2+4]
933	st	%f22,[%l3]
934	ba,pt	%icc,2f
935! delay slot
936	st	%f23,[%l3+4]
9371:
938	fdtoi	%f22,%f24		! raise inexact if not zero
939	st	%f20,[%o2]
940	st	%f23,[%o2+4]
/* cosine = 1.0: high word 0x3ff00000, low word 0 */
941	sethi	%hi(0x3ff00000),%g5
942	st	%g5,[%l3]
943	st	%g0,[%l3+4]
9442:
945	addcc	%i0,-1,%i0
946	ble,pn	%icc,.last2_from_range2
947! delay slot, harmless if branch taken
948	add	%i3,%i4,%i3		! s += strides
949	add	%l3,%l7,%l3		! c += stridec
950	ld	[%i1],%l2
951	ld	[%i1],%f20
952	ld	[%i1+4],%f23
953	andn	%l2,%i5,%l2		! hx &= ~0x80000000
954	ba,pt	%icc,.loop2
955! delay slot
956	add	%i1,%i2,%i1		! x += stridex
957
958	SET_SIZE(__vsincos)
959
960