xref: /illumos-gate/usr/src/lib/libmvec/common/vis/__vsqrtf_ultra3.S (revision b1e2e3fb17324e9ddf43db264a0c64da7756d9e6)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
23 */
24/*
25 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
26 * Use is subject to license terms.
27 */
28
29	.file	"__vsqrtf_ultra3.S"
30
31#include "libm.h"
32	.weak	__vsqrtf
33	.type	__vsqrtf,#function
34	__vsqrtf = __vsqrtf_ultra3
35
36	RO_DATA
37	.align	64
38
/*
 * Read-only constants for __vsqrtf_ultra3.  Each entry is a pair of
 * 32-bit words (most significant word first) forming one 8-byte value.
 * The entry code loads them with ldd from offsets 0, 8, 16, 24 and 32,
 * so the order of the entries must not change.
 */
39.CONST_TBL:
40	.word	0x3fe00001, 0x80007e00	! K1  =  5.00000715259318464227e-01
41	.word	0xbfc00003, 0xc0017a01	! K2  = -1.25000447037521686593e-01
42	.word	0x000fffff, 0xffffffff	! DC0 = 0x000fffffffffffff
43	.word	0x3ff00000, 0x00000000	! DC1 = 0x3ff0000000000000
44	.word	0x7ffff000, 0x00000000	! DC2 = 0x7ffff00000000000
45
/*
 * Floating-point registers that hold the .CONST_TBL constants for the
 * whole function (loaded once by the entry code with ldd).
 */
46#define DC0		%f6
47#define DC1		%f4
48#define DC2		%f2
49#define K2		%f38
50#define K1		%f36
/*
 * Integer registers with a fixed role.
 *   TBL          base of __vlibm_TBL_sqrtf
 *   stridex/y    caller strides shifted left by 2 by the entry code,
 *                i.e. scaled for the 4-byte loads/stores used on px/py
 *   _0x1ff0      table-index mask, built as 0xff8 << 1
 *   counter      number of elements left in the current batch
 *   _0x00800000, _0x7f800000
 *                range-check bounds, set with sethi by the entry code
 */
51#define TBL		%l2
52#define stridex		%l3
53#define stridey		%l4
54#define _0x1ff0		%l5
55#define counter		%l6
56#define _0x00800000	%l7
57#define _0x7f800000	%o0
58
/*
 * Scratch slots in the stack frame, addressed as [%fp+name].
 *   tmp_px       source pointer to resume from at .begin (stx/ldx)
 *   tmp_counter  element count to resume with at .begin (st/ld, 32 bit)
 *   tmp0..tmp4   one slot per pipeline lane: lexp0 is stored with stx
 *                and read back as a double (dtmp1) with ldd
 */
59#define tmp_px		STACK_BIAS-0x40
60#define tmp_counter	STACK_BIAS-0x38
61#define tmp0		STACK_BIAS-0x30
62#define tmp1		STACK_BIAS-0x28
63#define tmp2		STACK_BIAS-0x20
64#define tmp3		STACK_BIAS-0x18
65#define tmp4		STACK_BIAS-0x10
66
67! sizeof temp storage - must be a multiple of 16 for V9
/* 0x40 covers the seven 8-byte slots above (lowest offset is -0x40). */
68#define tmps		0x40
69
70!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
71!      !!!!!   algorithm   !!!!!
72!
73!  x0 = *px;
74!  ax = *(int*)px;
75!  px += stridex;
76!
77!  if( ax >= 0x7f800000 )
78!  {
79!    *py = sqrtf(x0);
80!    py += stridey;
81!    continue;
82!  }
83!  if( ax < 0x00800000 )
84!  {
85!    *py = sqrtf(x0);
86!    py += stridey;
87!    continue;
88!  }
89!
90!  db0 = (double)x0;
91!  iexp0 = ax >> 24;
92!  iexp0 += 0x3c0;
93!  lexp0 = (long long)iexp0 << 52;
94!
95!  db0 = vis_fand(db0,DC0);
96!  db0 = vis_for(db0,DC1);
97!  hi0 = vis_fand(db0,DC2);
98!
99!  ax >>= 11;
100!  si0 = ax & 0x1ff0;
101!  dtmp0 = ((double*)((char*)TBL + si0))[0];
102!  xx0 = (db0 - hi0);
103!  xx0 *= dtmp0;
104!  dtmp0 = ((double*)((char*)TBL + si0))[1];
105!  res0 = K2 * xx0;
106!  res0 += K1;
107!  res0 *= xx0;
108!  res0 += DC1;
109!  res0 = dtmp0 * res0;
110!  dtmp1 = *((double*)&lexp0);
111!  res0 *= dtmp1;
112!  fres0 = (float)res0;
113!  *py = fres0;
114!  py += stridey;
115!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
116
/*
 * __vsqrtf_ultra3 entry.
 *
 * Incoming registers as used below:
 *   %i0  element count      -> saved to tmp_counter
 *   %i1  source pointer px  -> saved to tmp_px
 *   %i2  source stride      -> stridex = %i2 << 2
 *   %i3  destination py     -> copied to %g5
 *   %i4  destination stride -> stridey = %i4 << 2
 *
 * The integer set-up is interleaved with the ldd's that pull K1, K2,
 * DC0, DC1 and DC2 out of .CONST_TBL.  Control then falls through to
 * .begin.
 */
117	ENTRY(__vsqrtf_ultra3)
118	save	%sp,-SA(MINFRAME)-tmps,%sp
/*
 * PIC_SETUP/PIC_SET are macros defined outside this file (presumably
 * in libm.h - confirm).  After them %o2 is used as the base of
 * .CONST_TBL and %l2 (TBL) as the base of __vlibm_TBL_sqrtf.  %l7 is
 * only a temporary here; it is overwritten with 0x00800000 below.
 */
119	PIC_SETUP(l7)
120	PIC_SET(l7,.CONST_TBL,o2)
121	PIC_SET(l7,__vlibm_TBL_sqrtf,l2)
122
123	st	%i0,[%fp+tmp_counter]
124	sll	%i2,2,stridex
/* %l5 = 0xff8 now, shifted to 0x1ff0 two instructions later. */
125	or	%g0,0xff8,%l5
126
127	stx	%i1,[%fp+tmp_px]
128	sll	%l5,1,_0x1ff0
129
130	ldd	[%o2],K1
131	sll	%i4,2,stridey
132
133	ldd	[%o2+8],K2
/* %g5 carries the current destination pointer py. */
134	or	%g0,%i3,%g5
135
136	ldd	[%o2+16],DC0
137	sethi	%hi(0x7f800000),%o0
138
139	ldd	[%o2+24],DC1
140	sethi	%hi(0x00800000),%l7
141
142	ldd	[%o2+32],DC2
143
/*
 * .begin: start (or restart) a batch.
 * Reloads the element count and source pointer from the stack and
 * clears the saved count.  The .updateN stubs store a non-zero count
 * and a new px there when they cut a batch short, so that the next
 * pass through here resumes at the element that stopped the pipeline.
 * On entry %g5 holds the destination pointer py.
 */
144.begin:
145	ld	[%fp+tmp_counter],counter
146	ldx	[%fp+tmp_px],%i1
147	st	%g0,[%fp+tmp_counter]
/*
 * .begin1: test the first element of the batch.
 * Leaves through .exit when counter <= 0.  The lda that follows the
 * branch is in its delay slot and therefore also runs on the way out.
 * All element loads use ASI 0x82 (presumably ASI_PRIMARY_NOFAULT, a
 * non-faulting load, so look-ahead reads beyond the input do not trap -
 * confirm against the SPARC V9 ASI table).
 */
148.begin1:
149	cmp	counter,0
150	ble,pn	%icc,.exit
151
152	lda	[%i1]0x82,%o2		! (2_0) ax = *(int*)px;
153
154	or	%g0,%i1,%o7
155	lda	[%i1]0x82,%f25		! (2_0) x0 = *px;
156
/*
 * An out-of-range first element (signed compare against both bounds)
 * is handled one at a time in .spec, which comes back to .begin1.
 */
157	cmp	%o2,_0x7f800000		! (2_0) ax ? 0x7f800000
158	bge,pn	%icc,.spec		! (2_0) if( ax >= 0x7f800000 )
159	nop
160
161	cmp	%o2,_0x00800000		! (2_0) ax ? 0x00800000
162	bl,pn	%icc,.spec		! (2_0) if( ax < 0x00800000 )
163	nop
164
/*
 * Pipeline fill.  From here to the branch to .tail/.main_loop the
 * code starts the computation for the first elements while loading
 * and range-checking the following ones.  Each look-ahead check
 * branches to an .updateN stub, which returns to the matching .contN
 * label.  No result is stored in this section.
 */
165	fstod	%f25,%f56		! (2_0) db0 = (double)x0;
166
167	lda	[stridex+%o7]0x82,%o1	! (3_0) ax = *(int*)px;
168
169	sra	%o2,24,%l1		! (2_0) iexp0 = ax >> 24;
170
171	add	%o7,stridex,%i1		! px += stridex
172	add	%l1,960,%l0		! (2_0) iexp0 += 0x3c0;
173	lda	[stridex+%o7]0x82,%f0	! (3_0) x0 = *px;
174	fand	%f56,DC0,%f60		! (2_0) db0 = vis_fand(db0,DC0);
175
176	cmp	%o1,_0x7f800000		! (3_0) ax ? 0x7f800000
177	bge,pn	%icc,.update0		! (3_0) if( ax >= 0x7f800000 )
178	nop
179.cont0:
180	sllx	%l0,52,%o3		! (2_0) lexp0 = (long long)iexp0 << 52;
181
182	sra	%o2,11,%i2		! (2_0) ax >>= 11;
183	stx	%o3,[%fp+tmp0]		! (2_0) dtmp1 = *((double*)&lexp0);
184	for	%f60,DC1,%f40		! (2_0) db0 = vis_for(db0,DC1);
185
186	cmp	%o1,_0x00800000		! (3_0) ax ? 0x00800000
187	bl,pn	%icc,.update1		! (3_0) if( ax < 0x00800000 )
188	nop
189.cont1:
190	fstod	%f0,%f48		! (3_0) db0 = (double)x0;
191
192	and	%i2,_0x1ff0,%o3		! (2_0) si0 = ax & 0x1ff0;
193	lda	[%i1+stridex]0x82,%o2	! (4_0) ax = *(int*)px;
194
195	add	%i1,stridex,%i1		! px += stridex
196	add	%o3,TBL,%i2		! (2_0) (char*)TBL + si0
197	fand	%f40,DC2,%f46		! (2_0) hi0 = vis_fand(db0,DC2);
198
199	sra	%o1,24,%o4		! (3_0) iexp0 = ax >> 24;
200
201	lda	[%i1]0x82,%f13		! (4_0) x0 = *px;
202	fand	%f48,DC0,%f58		! (3_0) db0 = vis_fand(db0,DC0);
203
204	add	%o4,960,%i0		! (3_0) iexp0 += 0x3c0;
205
206	cmp	%o2,_0x7f800000		! (4_1) ax ? 0x7f800000
207	bge,pn	%icc,.update2		! (4_1) if( ax >= 0x7f800000 )
208	nop
209.cont2:
210	fsubd	%f40,%f46,%f44		! (2_1) xx0 = (db0 - hi0);
211	sllx	%i0,52,%g1		! (3_1) lexp0 = (long long)iexp0 << 52;
212	ldd	[%i2],%f40		! (2_1) dtmp0 = ((double*)((char*)TBL + si0))[0];
213
214	sra	%o1,11,%l0		! (3_1) ax >>= 11;
215	stx	%g1,[%fp+tmp1]		! (3_1) dtmp1 = *((double*)&lexp0);
216	for	%f58,DC1,%f48		! (3_1) db0 = vis_for(db0,DC1);
217
218	cmp	%o2,_0x00800000		! (4_1) ax ? 0x00800000
219	bl,pn	%icc,.update3		! (4_1) if( ax < 0x00800000 )
220	nop
221.cont3:
222	fstod	%f13,%f50		! (4_1) db0 = (double)x0;
223
224	fmuld	%f44,%f40,%f46		! (2_1) xx0 *= dtmp0;
225	and	%l0,_0x1ff0,%i0		! (3_1) si0 = ax & 0x1ff0;
226	lda	[%i1+stridex]0x82,%l1	! (0_0) ax = *(int*)px;
227
228	add	%i0,TBL,%l0		! (3_1) (char*)TBL + si0
229	fand	%f48,DC2,%f62		! (3_1) hi0 = vis_fand(db0,DC2);
230
231	sra	%o2,24,%o7		! (4_1) iexp0 = ax >> 24;
232
233	add	%i1,stridex,%o4		! px += stridex
234	add	%o7,960,%o7		! (4_1) iexp0 += 0x3c0;
235	lda	[%i1+stridex]0x82,%f17	! (0_0) x0 = *px;
236	fand	%f50,DC0,%f54		! (4_1) db0 = vis_fand(db0,DC0);
237
/*
 * From here on the branch delay slots carry useful work (fsubd, ldd)
 * instead of nop; that instruction runs whether or not the branch to
 * the stub is taken.
 */
238	fmuld	K2,%f46,%f52		! (2_1) res0 = K2 * xx0;
239	cmp	%l1,_0x7f800000		! (0_0) ax ? 0x7f800000
240	bge,pn	%icc,.update4		! (0_0) if( ax >= 0x7f800000 )
241	fsubd	%f48,%f62,%f42		! (3_1) xx0 = (db0 - hi0);
242.cont4:
243	sllx	%o7,52,%o1		! (4_1) lexp0 = (long long)iexp0 << 52;
244	ldd	[%i0+TBL],%f40		! (3_1) dtmp0 = ((double*)((char*)TBL + si0))[0];
245
246	sra	%o2,11,%i5		! (4_1) ax >>= 11;
247	stx	%o1,[%fp+tmp2]		! (4_1) dtmp1 = *((double*)&lexp0);
248	for	%f54,DC1,%f34		! (4_1) db0 = vis_for(db0,DC1);
249
250	cmp	%l1,_0x00800000		! (0_0) ax ? 0x00800000
251	bl,pn	%icc,.update5		! (0_0) if( ax < 0x00800000 )
252	nop
253.cont5:
254	fstod	%f17,%f56		! (0_0) db0 = (double)x0;
255
256	fmuld	%f42,%f40,%f42		! (3_1) xx0 *= dtmp0;
257	lda	[stridex+%o4]0x82,%i0	! (1_0) ax = *(int*)px;
258	faddd	%f52,K1,%f52		! (2_1) res0 += K1;
259
260	sra	%l1,24,%g1		! (0_0) iexp0 = ax >> 24;
261	and	%i5,_0x1ff0,%i5		! (4_1) si0 = ax & 0x1ff0;
262	fand	%f34,DC2,%f62		! (4_1) hi0 = vis_fand(db0,DC2);
263
264	add	%o4,stridex,%i1		! px += stridex
265
266	add	%g1,960,%o5		! (0_0) iexp0 += 0x3c0;
267	add	%i5,TBL,%i3		! (4_1) (char*)TBL + si0
268	lda	[stridex+%o4]0x82,%f21	! (1_0) x0 = *px;
269	fand	%f56,DC0,%f32		! (0_0) db0 = vis_fand(db0,DC0);
270
271	fmuld	K2,%f42,%f50		! (3_1) res0 = K2 * xx0;
272	cmp	%i0,_0x7f800000		! (1_0) ax ? 0x7f800000
273	bge,pn	%icc,.update6		! (1_0) if( ax >= 0x7f800000 )
274	fsubd	%f34,%f62,%f54		! (4_1) xx0 = (db0 - hi0);
275.cont6:
276	fmuld	%f52,%f46,%f52		! (2_1) res0 *= xx0;
277	sllx	%o5,52,%o7		! (0_0) lexp0 = (long long)iexp0 << 52;
278	ldd	[TBL+%i5],%f62		! (4_1) dtmp0 = ((double*)((char*)TBL + si0))[0];
279
280	sra	%l1,11,%i4		! (0_0) ax >>= 11;
281	stx	%o7,[%fp+tmp3]		! (0_0) dtmp1 = *((double*)&lexp0);
282	for	%f32,DC1,%f48		! (0_0) db0 = vis_for(db0,DC1);
283
284	cmp	%i0,_0x00800000		! (1_0) ax ? 0x00800000
285	bl,pn	%icc,.update7		! (1_0) if( ax < 0x00800000 )
286	nop
287.cont7:
288	fstod	%f21,%f56		! (1_0) db0 = (double)x0;
289
290	fmuld	%f54,%f62,%f46		! (4_1) xx0 *= dtmp0;
291	and	%i4,_0x1ff0,%g1		! (0_0) si0 = ax & 0x1ff0;
292	lda	[%i1+stridex]0x82,%o2	! (2_0) ax = *(int*)px;
293	faddd	%f50,K1,%f62		! (3_1) res0 += K1;
294
295	add	%g1,TBL,%i5		! (0_0) (double*)((char*)TBL + si0
296	fand	%f48,DC2,%f32		! (0_0) hi0 = vis_fand(db0,DC2);
297
298	sra	%i0,24,%o4		! (1_0) iexp0 = ax >> 24;
299	ldd	[%i2+8],%f60		! (2_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
300	faddd	%f52,DC1,%f58		! (2_1) res0 += DC1;
301
302	add	%i1,stridex,%o7		! px += stridex
303	add	%o4,960,%i2		! (1_0) iexp0 += 0x3c0;
304	lda	[%i1+stridex]0x82,%f25	! (2_0) x0 = *px;
305	fand	%f56,DC0,%f34		! (1_0) db0 = vis_fand(db0,DC0);
306
307	fmuld	K2,%f46,%f50		! (4_1) res0 = K2 * xx0;
308	cmp	%o2,_0x7f800000		! (2_0) ax ? 0x7f800000
309	bge,pn	%icc,.update8		! (2_0) if( ax >= 0x7f800000 )
310	fsubd	%f48,%f32,%f52		! (0_0) xx0 = (db0 - hi0);
311.cont8:
312	fmuld	%f62,%f42,%f54		! (3_1) res0 *= xx0;
313	sllx	%i2,52,%o4		! (1_0) lexp0 = (long long)iexp0 << 52;
314	ldd	[TBL+%g1],%f32		! (0_0) dtmp0 = ((double*)((char*)TBL + si0))[0];
315
316	fmuld	%f60,%f58,%f60		! (2_1) res0 = dtmp0 * res0;
317	sra	%i0,11,%g1		! (1_0) ax >>= 11;
318	stx	%o4,[%fp+tmp4]		! (1_0) dtmp1 = *((double*)&lexp0);
319	for	%f34,DC1,%f48		! (1_0) db0 = vis_for(db0,DC1);
320
/*
 * Note the label placement: the fstod sits between the branch's delay
 * slot and .cont9, so it is skipped when control comes back from
 * .update9.  The same layout is used at .cont11.
 */
321	cmp	%o2,_0x00800000		! (2_0) ax ? 0x00800000
322	bl,pn	%icc,.update9		! (2_0) if( ax < 0x00800000 )
323	ldd	[%fp+tmp0],%f40		! (2_1) dtmp1 = *((double*)&lexp0);
324	fstod	%f25,%f56		! (2_0) db0 = (double)x0;
325.cont9:
326	fmuld	%f52,%f32,%f42		! (0_0) xx0 *= dtmp0;
327	and	%g1,_0x1ff0,%o5		! (1_0) si0 = ax & 0x1ff0;
328	lda	[stridex+%o7]0x82,%o1	! (3_0) ax = *(int*)px;
329	faddd	%f50,K1,%f34		! (4_1) res0 += K1;
330
331	add	%o5,TBL,%i4		! (1_0) (char*)TBL + si0
332	fand	%f48,DC2,%f62		! (1_0) hi0 = vis_fand(db0,DC2);
333
334	fmuld	%f60,%f40,%f32		! (2_1) res0 *= dtmp1;
335	sra	%o2,24,%l1		! (2_0) iexp0 = ax >> 24;
336	ldd	[%l0+8],%f40		! (3_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
337	faddd	%f54,DC1,%f58		! (3_1) res0 += DC1;
338
339	add	%o7,stridex,%i1		! px += stridex
340	add	%l1,960,%l0		! (2_0) iexp0 += 0x3c0;
341	lda	[stridex+%o7]0x82,%f0	! (3_0) x0 = *px;
342	fand	%f56,DC0,%f60		! (2_0) db0 = vis_fand(db0,DC0);
343
344	fmuld	K2,%f42,%f50		! (0_0) res0 = K2 * xx0;
345	cmp	%o1,_0x7f800000		! (3_0) ax ? 0x7f800000
346	bge,pn	%icc,.update10		! (3_0) if( ax >= 0x7f800000 )
347	fsubd	%f48,%f62,%f54		! (1_0) xx0 = (db0 - hi0);
348.cont10:
349	fmuld	%f34,%f46,%f52		! (4_1) res0 *= xx0;
350	sllx	%l0,52,%o3		! (2_0) lexp0 = (long long)iexp0 << 52;
351	ldd	[TBL+%o5],%f56		! (1_0) dtmp0 = ((double*)((char*)TBL + si0))[0];
352
353	fmuld	%f40,%f58,%f34		! (3_1) res0 = dtmp0 * res0;
354	sra	%o2,11,%i2		! (2_0) ax >>= 11;
355	stx	%o3,[%fp+tmp0]		! (2_0) dtmp1 = *((double*)&lexp0);
356	for	%f60,DC1,%f40		! (2_0) db0 = vis_for(db0,DC1);
357
358	cmp	%o1,_0x00800000		! (3_0) ax ? 0x00800000
359	bl,pn	%icc,.update11		! (3_0) if( ax < 0x00800000 )
360	ldd	[%fp+tmp1],%f62		! (3_1) dtmp1 = *((double*)&lexp0);
361	fstod	%f0,%f48		! (3_0) db0 = (double)x0;
362.cont11:
363	fmuld	%f54,%f56,%f30		! (1_0) xx0 *= dtmp0;
364	and	%i2,_0x1ff0,%o3		! (2_0) si0 = ax & 0x1ff0;
365	lda	[%i1+stridex]0x82,%o2	! (4_0) ax = *(int*)px;
366	faddd	%f50,K1,%f56		! (0_0) res0 += K1;
367
368	add	%i1,stridex,%i1		! px += stridex
369	add	%o3,TBL,%i2		! (2_0) (char*)TBL + si0
370	fand	%f40,DC2,%f46		! (2_0) hi0 = vis_fand(db0,DC2);
371
372	fmuld	%f34,%f62,%f28		! (3_1) res0 *= dtmp1;
373	sra	%o1,24,%o4		! (3_0) iexp0 = ax >> 24;
374	ldd	[%i3+8],%f50		! (4_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
375	faddd	%f52,DC1,%f54		! (4_1) res0 += DC1;
376
377	lda	[%i1]0x82,%f13		! (4_0) x0 = *px;
378	fand	%f48,DC0,%f58		! (3_0) db0 = vis_fand(db0,DC0);
379
/*
 * Hand-over.  py moves from %g5 to %i3 because %g5 is reused for
 * iexp0 in the delay slot below.  With fewer than 5 elements in the
 * batch the pipeline is drained by .tail; otherwise .main_loop is
 * entered with 5 already subtracted from counter (delay slot of ba).
 */
380	or	%g0,%g5,%i3
381	cmp	counter,5
382	bl,pn	%icc,.tail
383	add	%o4,960,%g5		! (3_0) iexp0 += 0x3c0;
384
385	ba	.main_loop
386	sub	counter,5,counter	! counter
387
388	.align	16
/*
 * .main_loop: steady state of the software pipeline.
 *
 * One pass stores five results (from %f15, %f19, %f23, %f27 and %f8)
 * and advances py by five strides, while loading and range-checking
 * five further inputs.  counter arrives here already reduced by 5 and
 * is reduced by another 5 at the bottom; the loop repeats while the
 * result is not negative (bpos).
 *
 * On entry %i3 holds py and %g5 holds iexp0 for lane 3; both are set
 * up again at the bottom of the loop for the next pass.
 *
 * Range-check failures branch to .update12 .. .update21, which return
 * to .cont12 .. .cont21.  For the "< 0x00800000" checks the fstod of
 * the new input sits just before the .contN label and is therefore
 * skipped on the way back from the stub.
 */
389.main_loop:
390	fmuld	K2,%f30,%f60		! (1_1) res0 = K2 * xx0;
391	cmp	%o2,_0x7f800000		! (4_1) ax ? 0x7f800000
392	bge,pn	%icc,.update12		! (4_1) if( ax >= 0x7f800000 )
393	fsubd	%f40,%f46,%f44		! (2_1) xx0 = (db0 - hi0);
394.cont12:
395	fmuld	%f56,%f42,%f52		! (0_1) res0 *= xx0;
396	sllx	%g5,52,%g5		! (3_1) lexp0 = (long long)iexp0 << 52;
397	ldd	[%i2],%f40		! (2_1) dtmp0 = ((double*)((char*)TBL + si0))[0];
398	fdtos	%f32,%f15		! (2_2) fres0 = (float)res0;
399
400	fmuld	%f50,%f54,%f42		! (4_2) res0 = dtmp0 * res0;
401	sra	%o1,11,%l0		! (3_1) ax >>= 11;
402	stx	%g5,[%fp+tmp1]		! (3_1) dtmp1 = *((double*)&lexp0);
403	for	%f58,DC1,%f48		! (3_1) db0 = vis_for(db0,DC1);
404
405	cmp	%o2,_0x00800000		! (4_1) ax ? 0x00800000
406	bl,pn	%icc,.update13		! (4_1) if( ax < 0x00800000 )
407	ldd	[%fp+tmp2],%f56		! (4_2) dtmp1 = *((double*)&lexp0);
408	fstod	%f13,%f50		! (4_1) db0 = (double)x0;
409.cont13:
410	fmuld	%f44,%f40,%f46		! (2_1) xx0 *= dtmp0;
411	and	%l0,_0x1ff0,%i0		! (3_1) si0 = ax & 0x1ff0;
412	lda	[%i1+stridex]0x82,%l1	! (0_0) ax = *(int*)px;
413	faddd	%f60,K1,%f32		! (1_1) res0 += K1;
414
/* Store 1 of 5: py is in %i3, next py goes to %o3. */
415	add	%i0,TBL,%l0		! (3_1) (char*)TBL + si0
416	add	%i3,stridey,%o3		! py += stridey
417	st	%f15,[%i3]		! (2_2) *py = fres0;
418	fand	%f48,DC2,%f62		! (3_1) hi0 = vis_fand(db0,DC2);
419
420	fmuld	%f42,%f56,%f44		! (4_2) res0 *= dtmp1;
421	sra	%o2,24,%o7		! (4_1) iexp0 = ax >> 24;
422	ldd	[%i5+8],%f58		! (0_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
423	faddd	%f52,DC1,%f34		! (0_1) res0 += DC1;
424
425	add	%i1,stridex,%o4		! px += stridex
426	add	%o7,960,%o7		! (4_1) iexp0 += 0x3c0;
427	lda	[%i1+stridex]0x82,%f17	! (0_0) x0 = *px;
428	fand	%f50,DC0,%f54		! (4_1) db0 = vis_fand(db0,DC0);
429
430	fmuld	K2,%f46,%f52		! (2_1) res0 = K2 * xx0;
431	cmp	%l1,_0x7f800000		! (0_0) ax ? 0x7f800000
432	bge,pn	%icc,.update14		! (0_0) if( ax >= 0x7f800000 )
433	fsubd	%f48,%f62,%f42		! (3_1) xx0 = (db0 - hi0);
434.cont14:
435	fmuld	%f32,%f30,%f48		! (1_1) res0 *= xx0;
436	sllx	%o7,52,%o1		! (4_1) lexp0 = (long long)iexp0 << 52;
437	ldd	[%i0+TBL],%f40		! (3_1) dtmp0 = ((double*)((char*)TBL + si0))[0];
438	fdtos	%f28,%f19		! (3_2) fres0 = (float)res0;
439
440	fmuld	%f58,%f34,%f32		! (0_1) res0 = dtmp0 * res0;
441	sra	%o2,11,%i5		! (4_1) ax >>= 11;
442	stx	%o1,[%fp+tmp2]		! (4_1) dtmp1 = *((double*)&lexp0);
443	for	%f54,DC1,%f34		! (4_1) db0 = vis_for(db0,DC1);
444
445	cmp	%l1,_0x00800000		! (0_0) ax ? 0x00800000
446	bl,pn	%icc,.update15		! (0_0) if( ax < 0x00800000 )
447	ldd	[%fp+tmp3],%f60		! (0_1) dtmp1 = *((double*)&lexp0);
448	fstod	%f17,%f56		! (0_0) db0 = (double)x0;
449.cont15:
450	fmuld	%f42,%f40,%f42		! (3_1) xx0 *= dtmp0;
451	add	%o3,stridey,%g5		! py += stridey
452	lda	[stridex+%o4]0x82,%i0	! (1_0) ax = *(int*)px;
453	faddd	%f52,K1,%f52		! (2_1) res0 += K1;
454
/* Store 2 of 5: py is in %o3, next py was just placed in %g5. */
455	sra	%l1,24,%g1		! (0_0) iexp0 = ax >> 24;
456	and	%i5,_0x1ff0,%i5		! (4_1) si0 = ax & 0x1ff0;
457	st	%f19,[%o3]		! (3_2) *py = fres0;
458	fand	%f34,DC2,%f62		! (4_1) hi0 = vis_fand(db0,DC2);
459
460	fmuld	%f32,%f60,%f40		! (0_1) res0 *= dtmp1;
461	add	%o4,stridex,%i1		! px += stridex
462	ldd	[%i4+8],%f60		! (1_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
463	faddd	%f48,DC1,%f58		! (1_1) res0 += DC1;
464
465	add	%g1,960,%o5		! (0_0) iexp0 += 0x3c0;
466	add	%i5,TBL,%i3		! (4_1) (char*)TBL + si0
467	lda	[stridex+%o4]0x82,%f21	! (1_0) x0 = *px;
468	fand	%f56,DC0,%f32		! (0_0) db0 = vis_fand(db0,DC0);
469
470	fmuld	K2,%f42,%f50		! (3_1) res0 = K2 * xx0;
471	cmp	%i0,_0x7f800000		! (1_0) ax ? 0x7f800000
472	bge,pn	%icc,.update16		! (1_0) if( ax >= 0x7f800000 )
473	fsubd	%f34,%f62,%f54		! (4_1) xx0 = (db0 - hi0);
474.cont16:
475	fmuld	%f52,%f46,%f52		! (2_1) res0 *= xx0;
476	sllx	%o5,52,%o7		! (0_0) lexp0 = (long long)iexp0 << 52;
477	ldd	[TBL+%i5],%f62		! (4_1) dtmp0 = ((double*)((char*)TBL + si0))[0];
478	fdtos	%f44,%f23		! (4_2) fres0 = (float)res0;
479
480	fmuld	%f60,%f58,%f44		! (1_1) res0 = dtmp0 * res0;
481	sra	%l1,11,%i4		! (0_0) ax >>= 11;
482	stx	%o7,[%fp+tmp3]		! (0_0) dtmp1 = *((double*)&lexp0);
483	for	%f32,DC1,%f48		! (0_0) db0 = vis_for(db0,DC1);
484
485	cmp	%i0,_0x00800000		! (1_0) ax ? 0x00800000
486	bl,pn	%icc,.update17		! (1_0) if( ax < 0x00800000 )
487	ldd	[%fp+tmp4],%f34		! (1_1) dtmp1 = *((double*)&lexp0);
488	fstod	%f21,%f56		! (1_0) db0 = (double)x0;
489.cont17:
490	fmuld	%f54,%f62,%f46		! (4_1) xx0 *= dtmp0;
491	and	%i4,_0x1ff0,%g1		! (0_0) si0 = ax & 0x1ff0;
492	lda	[%i1+stridex]0x82,%o2	! (2_0) ax = *(int*)px;
493	faddd	%f50,K1,%f62		! (3_1) res0 += K1;
494
/*
 * Store 3 of 5: written to %o3 + stridey (the address %g5 held before
 * the add on the preceding line); %g5 then points at the 4th slot.
 */
495	add	%g1,TBL,%i5		! (0_0) (double*)((char*)TBL + si0
496	add	%g5,stridey,%g5		! py += stridey
497	st	%f23,[stridey+%o3]	! (4_2) *py = fres0;
498	fand	%f48,DC2,%f32		! (0_0) hi0 = vis_fand(db0,DC2);
499
500	fmuld	%f44,%f34,%f44		! (1_1) res0 *= dtmp1;
501	sra	%i0,24,%o4		! (1_0) iexp0 = ax >> 24;
502	ldd	[%i2+8],%f60		! (2_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
503	faddd	%f52,DC1,%f58		! (2_1) res0 += DC1;
504
505	add	%i1,stridex,%o7		! px += stridex
506	add	%o4,960,%i2		! (1_0) iexp0 += 0x3c0;
507	lda	[%i1+stridex]0x82,%f25	! (2_0) x0 = *px;
508	fand	%f56,DC0,%f34		! (1_0) db0 = vis_fand(db0,DC0);
509
510	fmuld	K2,%f46,%f50		! (4_1) res0 = K2 * xx0;
511	cmp	%o2,_0x7f800000		! (2_0) ax ? 0x7f800000
512	bge,pn	%icc,.update18		! (2_0) if( ax >= 0x7f800000 )
513	fsubd	%f48,%f32,%f52		! (0_0) xx0 = (db0 - hi0);
514.cont18:
515	fmuld	%f62,%f42,%f54		! (3_1) res0 *= xx0;
516	sllx	%i2,52,%o4		! (1_0) lexp0 = (long long)iexp0 << 52;
517	ldd	[TBL+%g1],%f32		! (0_0) dtmp0 = ((double*)((char*)TBL + si0))[0];
518	fdtos	%f40,%f27		! (0_1) fres0 = (float)res0;
519
520	fmuld	%f60,%f58,%f60		! (2_1) res0 = dtmp0 * res0;
521	sra	%i0,11,%g1		! (1_0) ax >>= 11;
522	stx	%o4,[%fp+tmp4]		! (1_0) dtmp1 = *((double*)&lexp0);
523	for	%f34,DC1,%f48		! (1_0) db0 = vis_for(db0,DC1);
524
525	cmp	%o2,_0x00800000		! (2_0) ax ? 0x00800000
526	bl,pn	%icc,.update19		! (2_0) if( ax < 0x00800000 )
527	ldd	[%fp+tmp0],%f40		! (2_1) dtmp1 = *((double*)&lexp0);
528	fstod	%f25,%f56		! (2_0) db0 = (double)x0;
529.cont19:
530	fmuld	%f52,%f32,%f42		! (0_0) xx0 *= dtmp0;
531	and	%g1,_0x1ff0,%o5		! (1_0) si0 = ax & 0x1ff0;
532	lda	[stridex+%o7]0x82,%o1	! (3_0) ax = *(int*)px;
533	faddd	%f50,K1,%f34		! (4_1) res0 += K1;
534
/* Store 4 of 5: py is in %g5, next py goes to %g1. */
535	add	%o5,TBL,%i4		! (1_0) (char*)TBL + si0
536	add	%g5,stridey,%g1		! py += stridey
537	st	%f27,[%g5]		! (0_1) *py = fres0;
538	fand	%f48,DC2,%f62		! (1_0) hi0 = vis_fand(db0,DC2);
539
540	fmuld	%f60,%f40,%f32		! (2_1) res0 *= dtmp1;
541	sra	%o2,24,%l1		! (2_0) iexp0 = ax >> 24;
542	ldd	[%l0+8],%f40		! (3_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
543	faddd	%f54,DC1,%f58		! (3_1) res0 += DC1;
544
545	add	%o7,stridex,%i1		! px += stridex
546	add	%l1,960,%l0		! (2_0) iexp0 += 0x3c0;
547	lda	[stridex+%o7]0x82,%f0	! (3_0) x0 = *px;
548	fand	%f56,DC0,%f60		! (2_0) db0 = vis_fand(db0,DC0);
549
550	fmuld	K2,%f42,%f50		! (0_0) res0 = K2 * xx0;
551	cmp	%o1,_0x7f800000		! (3_0) ax ? 0x7f800000
552	bge,pn	%icc,.update20		! (3_0) if( ax >= 0x7f800000 )
553	fsubd	%f48,%f62,%f54		! (1_0) xx0 = (db0 - hi0);
554.cont20:
555	fmuld	%f34,%f46,%f52		! (4_1) res0 *= xx0;
556	sllx	%l0,52,%o3		! (2_0) lexp0 = (long long)iexp0 << 52;
557	ldd	[TBL+%o5],%f56		! (1_0) dtmp0 = ((double*)((char*)TBL + si0))[0];
558	fdtos	%f44,%f8		! (1_1) fres0 = (float)res0;
559
560	fmuld	%f40,%f58,%f34		! (3_1) res0 = dtmp0 * res0;
561	sra	%o2,11,%i2		! (2_0) ax >>= 11;
562	stx	%o3,[%fp+tmp0]		! (2_0) dtmp1 = *((double*)&lexp0);
563	for	%f60,DC1,%f40		! (2_0) db0 = vis_for(db0,DC1);
564
565	cmp	%o1,_0x00800000		! (3_0) ax ? 0x00800000
566	bl,pn	%icc,.update21		! (3_0) if( ax < 0x00800000 )
567	ldd	[%fp+tmp1],%f62		! (3_1) dtmp1 = *((double*)&lexp0);
568	fstod	%f0,%f48		! (3_0) db0 = (double)x0;
569.cont21:
570	fmuld	%f54,%f56,%f30		! (1_0) xx0 *= dtmp0;
571	and	%i2,_0x1ff0,%o3		! (2_0) si0 = ax & 0x1ff0;
572	lda	[%i1+stridex]0x82,%o2	! (4_0) ax = *(int*)px;
573	faddd	%f50,K1,%f56		! (0_0) res0 += K1;
574
/* Store 5 of 5: written to %g5 + stridey, the address also held in %g1. */
575	add	%i1,stridex,%i1		! px += stridex
576	add	%o3,TBL,%i2		! (2_0) (char*)TBL + si0
577	st	%f8,[stridey+%g5]	! (1_1) *py = fres0;
578	fand	%f40,DC2,%f46		! (2_0) hi0 = vis_fand(db0,DC2);
579
580	fmuld	%f34,%f62,%f28		! (3_1) res0 *= dtmp1;
581	sra	%o1,24,%o4		! (3_0) iexp0 = ax >> 24;
582	ldd	[%i3+8],%f50		! (4_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
583	faddd	%f52,DC1,%f54		! (4_1) res0 += DC1;
584
/*
 * Loop control.  %i3 becomes py for the next pass; subcc sets the
 * condition codes tested by bpos three instructions later (the lda
 * and fand in between do not write the integer condition codes).
 * %g5 is reloaded with iexp0 in the delay slot.
 */
585	add	%g1,stridey,%i3		! py += stridey
586	subcc	counter,5,counter	! counter
587	lda	[%i1]0x82,%f13		! (4_0) x0 = *px;
588	fand	%f48,DC0,%f58		! (3_0) db0 = vis_fand(db0,DC0);
589
590	bpos,pt	%icc,.main_loop
591	add	%o4,960,%g5		! (3_0) iexp0 += 0x3c0;
592
/*
 * Pipeline drain.
 *
 * Reached by falling out of .main_loop (the add below undoes the last
 * subtraction of 5) or directly at .tail from the fill code when the
 * batch has fewer than 5 elements.  On entry %i3 holds py.
 *
 * Up to four results still in flight are finished and stored, in the
 * order %f15, %f19, %f23, %f27.  Before each one counter is decremented;
 * as soon as it goes negative control returns to .begin with the
 * current py in %g5.  The branches are annulling (bneg,a), so their
 * delay-slot instruction runs only when the branch is taken.
 */
593	add	counter,5,counter
594.tail:
595	subcc	counter,1,counter
596	bneg,a	.begin
597	or	%g0,%i3,%g5
598
599	fmuld	%f56,%f42,%f52		! (0_1) res0 *= xx0;
600	fdtos	%f32,%f15		! (2_2) fres0 = (float)res0;
601
602	fmuld	%f50,%f54,%f42		! (4_2) res0 = dtmp0 * res0;
603
604	ldd	[%fp+tmp2],%f56		! (4_2) dtmp1 = *((double*)&lexp0);
605
606	add	%i3,stridey,%o3		! py += stridey
607	st	%f15,[%i3]		! (2_2) *py = fres0;
608
609	subcc	counter,1,counter
610	bneg,a	.begin
611	or	%g0,%o3,%g5
612
613	fmuld	%f42,%f56,%f44		! (4_2) res0 *= dtmp1;
614	ldd	[%i5+8],%f58		! (0_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
615	faddd	%f52,DC1,%f34		! (0_1) res0 += DC1;
616
617	fdtos	%f28,%f19		! (3_2) fres0 = (float)res0;
618
619	fmuld	%f58,%f34,%f32		! (0_1) res0 = dtmp0 * res0;
620
621	ldd	[%fp+tmp3],%f60		! (0_1) dtmp1 = *((double*)&lexp0);
622
623	add	%o3,stridey,%g5		! py += stridey
624
625	st	%f19,[%o3]		! (3_2) *py = fres0;
626
/* %g5 already holds the next py here, hence the nop in the delay slot. */
627	subcc	counter,1,counter
628	bneg,a	.begin
629	nop
630
631	fmuld	%f32,%f60,%f40		! (0_1) res0 *= dtmp1;
632
633	fdtos	%f44,%f23		! (4_2) fres0 = (float)res0;
634
/* The store below uses %o3 + stridey, i.e. %g5 before this add. */
635	add	%g5,stridey,%g5		! py += stridey
636	st	%f23,[stridey+%o3]	! (4_2) *py = fres0;
637
638	subcc	counter,1,counter
639	bneg,a	.begin
640	nop
641
642	fdtos	%f40,%f27		! (0_1) fres0 = (float)res0;
643
644	st	%f27,[%g5]		! (0_1) *py = fres0;
645
/* Fourth result stored: always restart at .begin, py advanced in the delay slot. */
646	ba	.begin
647	add	%g5,stridey,%g5
648
649	.align	16
/*
 * .spec: first element of the batch failed a range check in .begin1
 * (ax >= 0x7f800000 or ax < 0x00800000, signed compare).
 * The value is still in %f25; it is handed to the fsqrts instruction,
 * the result is stored at py (%g5), px and py are advanced by one
 * stride, counter is decremented and control returns to .begin1.
 * The py update sits in the delay slot of the branch.
 */
650.spec:
651	fsqrts	%f25,%f25
652	sub	counter,1,counter
653	add	%i1,stridex,%i1
654	st	%f25,[%g5]
655	ba	.begin1
656	add	%g5,stridey,%g5
657
658	.align	16
/*
 * .update0: look-ahead element (3_0) in the fill code has
 * ax >= 0x7f800000.
 * counter <= 1: go straight back; only fzeros (delay slot of ble,
 * always executed) has taken effect.
 * Otherwise the batch is cut to 1 element: %i1 (address of the
 * offending element) and the remaining count are saved for the next
 * pass through .begin, and ax (%o1) is overwritten with 0x7f800000
 * before resuming at .cont0.
 */
659.update0:
660	cmp	counter,1
661	ble	.cont0
662	fzeros	%f0
663
664	stx	%i1,[%fp+tmp_px]
665	sethi	%hi(0x7f800000),%o1
666
667	sub	counter,1,counter
668	st	counter,[%fp+tmp_counter]
669
670	ba	.cont0
671	or	%g0,1,counter
672
673	.align	16
/*
 * .update1: look-ahead element (3_0) in the fill code has
 * ax < 0x00800000.
 * counter <= 1: go straight back; only fzeros (delay slot of ble,
 * always executed) has taken effect.
 * Otherwise the batch is cut to 1 element: %i1 (address of the
 * offending element) and the remaining count are saved for the next
 * pass through .begin, and ax (%o1) is cleared before resuming at
 * .cont1.
 */
674.update1:
675	cmp	counter,1
676	ble	.cont1
677	fzeros	%f0
678
679	stx	%i1,[%fp+tmp_px]
680	clr	%o1
681
682	sub	counter,1,counter
683	st	counter,[%fp+tmp_counter]
684
685	ba	.cont1
686	or	%g0,1,counter
687
688	.align	16
/*
 * .update2: look-ahead element (4_1) in the fill code has
 * ax >= 0x7f800000.
 * counter <= 2: go straight back; only fzeros (delay slot of ble,
 * always executed) has taken effect.
 * Otherwise the batch is cut to 2 elements: %i1 (address of the
 * offending element) and the remaining count are saved for the next
 * pass through .begin, and ax (%o2) is overwritten with 0x7f800000
 * before resuming at .cont2.
 */
689.update2:
690	cmp	counter,2
691	ble	.cont2
692	fzeros	%f13
693
694	stx	%i1,[%fp+tmp_px]
695	sethi	%hi(0x7f800000),%o2
696
697	sub	counter,2,counter
698	st	counter,[%fp+tmp_counter]
699
700	ba	.cont2
701	or	%g0,2,counter
702
703	.align	16
/*
 * .update3: look-ahead element (4_1) in the fill code has
 * ax < 0x00800000.
 * counter <= 2: go straight back; only fzeros (delay slot of ble,
 * always executed) has taken effect.
 * Otherwise the batch is cut to 2 elements: %i1 (address of the
 * offending element) and the remaining count are saved for the next
 * pass through .begin, and ax (%o2) is cleared before resuming at
 * .cont3.
 */
704.update3:
705	cmp	counter,2
706	ble	.cont3
707	fzeros	%f13
708
709	stx	%i1,[%fp+tmp_px]
710	clr	%o2
711
712	sub	counter,2,counter
713	st	counter,[%fp+tmp_counter]
714
715	ba	.cont3
716	or	%g0,2,counter
717
718	.align	16
/*
 * .update4: look-ahead element (0_0) in the fill code has
 * ax >= 0x7f800000.
 * counter <= 3: go straight back; only fzeros (delay slot of ble,
 * always executed) has taken effect.
 * Otherwise the batch is cut to 3 elements: %o4 (address of the
 * offending element) and the remaining count are saved for the next
 * pass through .begin, and ax (%l1) is overwritten with 0x7f800000
 * before resuming at .cont4.
 */
719.update4:
720	cmp	counter,3
721	ble	.cont4
722	fzeros	%f17
723
724	stx	%o4,[%fp+tmp_px]
725	sethi	%hi(0x7f800000),%l1
726
727	sub	counter,3,counter
728	st	counter,[%fp+tmp_counter]
729
730	ba	.cont4
731	or	%g0,3,counter
732
733	.align	16
/*
 * .update5: look-ahead element (0_0) in the fill code has
 * ax < 0x00800000.
 * counter <= 3: go straight back; only fzeros (delay slot of ble,
 * always executed) has taken effect.
 * Otherwise the batch is cut to 3 elements: %o4 (address of the
 * offending element) and the remaining count are saved for the next
 * pass through .begin, and ax (%l1) is cleared before resuming at
 * .cont5.
 */
734.update5:
735	cmp	counter,3
736	ble	.cont5
737	fzeros	%f17
738
739	stx	%o4,[%fp+tmp_px]
740	clr	%l1
741
742	sub	counter,3,counter
743	st	counter,[%fp+tmp_counter]
744
745	ba	.cont5
746	or	%g0,3,counter
747
748	.align	16
/*
 * .update6: look-ahead element (1_0) in the fill code has
 * ax >= 0x7f800000.
 * counter <= 4: go straight back; only fzeros (delay slot of ble,
 * always executed) has taken effect.
 * Otherwise the batch is cut to 4 elements: %i1 (address of the
 * offending element) and the remaining count are saved for the next
 * pass through .begin, and ax (%i0) is overwritten with 0x7f800000
 * before resuming at .cont6.
 */
749.update6:
750	cmp	counter,4
751	ble	.cont6
752	fzeros	%f21
753
754	stx	%i1,[%fp+tmp_px]
755	sethi	%hi(0x7f800000),%i0
756
757	sub	counter,4,counter
758	st	counter,[%fp+tmp_counter]
759
760	ba	.cont6
761	or	%g0,4,counter
762
763	.align	16
/*
 * .update7: look-ahead element (1_0) in the fill code has
 * ax < 0x00800000.
 * counter <= 4: go straight back; only fzeros (delay slot of ble,
 * always executed) has taken effect.
 * Otherwise the batch is cut to 4 elements: %i1 (address of the
 * offending element) and the remaining count are saved for the next
 * pass through .begin, and ax (%i0) is cleared before resuming at
 * .cont7.
 */
764.update7:
765	cmp	counter,4
766	ble	.cont7
767	fzeros	%f21
768
769	stx	%i1,[%fp+tmp_px]
770	clr	%i0
771
772	sub	counter,4,counter
773	st	counter,[%fp+tmp_counter]
774
775	ba	.cont7
776	or	%g0,4,counter
777
778	.align	16
/*
 * .update8: look-ahead element (2_0) in the fill code has
 * ax >= 0x7f800000.
 * counter <= 5: go straight back; only fzeros (delay slot of ble,
 * always executed) has taken effect.
 * Otherwise the batch is cut to 5 elements: %o7 (address of the
 * offending element) and the remaining count are saved for the next
 * pass through .begin, and ax (%o2) is overwritten with 0x7f800000
 * before resuming at .cont8.
 */
779.update8:
780	cmp	counter,5
781	ble	.cont8
782	fzeros	%f25
783
784	stx	%o7,[%fp+tmp_px]
785	sethi	%hi(0x7f800000),%o2
786
787	sub	counter,5,counter
788	st	counter,[%fp+tmp_counter]
789
790	ba	.cont8
791	or	%g0,5,counter
792
793	.align	16
/*
 * .update9: look-ahead element (2_0) in the fill code has
 * ax < 0x00800000.
 * counter <= 5: go straight back; only fzeros (delay slot of ble,
 * always executed) has taken effect.
 * Otherwise the batch is cut to 5 elements: %o7 (address of the
 * offending element) and the remaining count are saved for the next
 * pass through .begin, and ax (%o2) is cleared before resuming at
 * .cont9.  Returning to .cont9 skips the fstod of %f25 that sits
 * just before that label.
 */
794.update9:
795	cmp	counter,5
796	ble	.cont9
797	fzeros	%f25
798
799	stx	%o7,[%fp+tmp_px]
800	clr	%o2
801
802	sub	counter,5,counter
803	st	counter,[%fp+tmp_counter]
804
805	ba	.cont9
806	or	%g0,5,counter
807
808	.align	16
/*
 * .update10: look-ahead element (3_0), checked just before .cont10 in
 * the fill code, has ax >= 0x7f800000.
 * counter <= 6: go straight back; only fzeros (delay slot of ble,
 * always executed) has taken effect.
 * Otherwise the batch is cut to 6 elements: %i1 (address of the
 * offending element) and the remaining count are saved for the next
 * pass through .begin, and ax (%o1) is overwritten with 0x7f800000
 * before resuming at .cont10.
 */
809.update10:
810	cmp	counter,6
811	ble	.cont10
812	fzeros	%f0
813
814	stx	%i1,[%fp+tmp_px]
815	sethi	%hi(0x7f800000),%o1
816
817	sub	counter,6,counter
818	st	counter,[%fp+tmp_counter]
819
820	ba	.cont10
821	or	%g0,6,counter
822
823	.align	16
/*
 * .update11: look-ahead element (3_0), checked just before .cont11 in
 * the fill code, has ax < 0x00800000.
 * counter <= 6: go straight back; only fzeros (delay slot of ble,
 * always executed) has taken effect.
 * Otherwise the batch is cut to 6 elements: %i1 (address of the
 * offending element) and the remaining count are saved for the next
 * pass through .begin, and ax (%o1) is cleared before resuming at
 * .cont11.  Returning to .cont11 skips the fstod of %f0 that sits
 * just before that label.
 */
824.update11:
825	cmp	counter,6
826	ble	.cont11
827	fzeros	%f0
828
829	stx	%i1,[%fp+tmp_px]
830	clr	%o1
831
832	sub	counter,6,counter
833	st	counter,[%fp+tmp_counter]
834
835	ba	.cont11
836	or	%g0,6,counter
837
838	.align	16
/*
 * .update12: reached from .main_loop when look-ahead element (4_1)
 * has ax >= 0x7f800000.  counter is the main-loop count, from which 5
 * was subtracted before the pass began.
 * counter <= 2: go straight back; only fzeros (delay slot of ble,
 * always executed) has taken effect.
 * Otherwise %i1 (address of the offending element) and counter - 2
 * are saved for the next pass through .begin, counter is set to 2 and
 * ax (%o2) is overwritten with 0x7f800000 before resuming at .cont12.
 */
839.update12:
840	cmp	counter,2
841	ble	.cont12
842	fzeros	%f13
843
844	stx	%i1,[%fp+tmp_px]
845	sethi	%hi(0x7f800000),%o2
846
847	sub	counter,2,counter
848	st	counter,[%fp+tmp_counter]
849
850	ba	.cont12
851	or	%g0,2,counter
852
853	.align	16
/*
 * .update13: reached from .main_loop when look-ahead element (4_1)
 * has ax < 0x00800000.  counter is the main-loop count, from which 5
 * was subtracted before the pass began.
 * counter <= 2: go straight back; only fzeros (delay slot of ble,
 * always executed) has taken effect.
 * Otherwise %i1 (address of the offending element) and counter - 2
 * are saved for the next pass through .begin, counter is set to 2 and
 * ax (%o2) is cleared before resuming at .cont13.  Returning to
 * .cont13 skips the fstod of %f13 that sits just before that label.
 */
854.update13:
855	cmp	counter,2
856	ble	.cont13
857	fzeros	%f13
858
859	stx	%i1,[%fp+tmp_px]
860	clr	%o2
861
862	sub	counter,2,counter
863	st	counter,[%fp+tmp_counter]
864
865	ba	.cont13
866	or	%g0,2,counter
867
868	.align	16
/*
 * .update14: reached from .main_loop when look-ahead element (0_0)
 * has ax >= 0x7f800000.  counter is the main-loop count, from which 5
 * was subtracted before the pass began.
 * counter <= 3: go straight back; only fzeros (delay slot of ble,
 * always executed) has taken effect.
 * Otherwise %o4 (address of the offending element) and counter - 3
 * are saved for the next pass through .begin, counter is set to 3 and
 * ax (%l1) is overwritten with 0x7f800000 before resuming at .cont14.
 */
869.update14:
870	cmp	counter,3
871	ble	.cont14
872	fzeros	%f17
873
874	stx	%o4,[%fp+tmp_px]
875	sethi	%hi(0x7f800000),%l1
876
877	sub	counter,3,counter
878	st	counter,[%fp+tmp_counter]
879
880	ba	.cont14
881	or	%g0,3,counter
882
883	.align	16
/*
 * .update15: reached from .main_loop when look-ahead element (0_0)
 * has ax < 0x00800000.  counter is the main-loop count, from which 5
 * was subtracted before the pass began.
 * counter <= 3: go straight back; only fzeros (delay slot of ble,
 * always executed) has taken effect.
 * Otherwise %o4 (address of the offending element) and counter - 3
 * are saved for the next pass through .begin, counter is set to 3 and
 * ax (%l1) is cleared before resuming at .cont15.  Returning to
 * .cont15 skips the fstod of %f17 that sits just before that label.
 */
884.update15:
885	cmp	counter,3
886	ble	.cont15
887	fzeros	%f17
888
889	stx	%o4,[%fp+tmp_px]
890	clr	%l1
891
892	sub	counter,3,counter
893	st	counter,[%fp+tmp_counter]
894
895	ba	.cont15
896	or	%g0,3,counter
897
898	.align	16
/*
 * .update16: reached from .main_loop when look-ahead element (1_0)
 * has ax >= 0x7f800000.  counter is the main-loop count, from which 5
 * was subtracted before the pass began.
 * counter <= 4: go straight back; only fzeros (delay slot of ble,
 * always executed) has taken effect.
 * Otherwise %i1 (address of the offending element) and counter - 4
 * are saved for the next pass through .begin, counter is set to 4 and
 * ax (%i0) is overwritten with 0x7f800000 before resuming at .cont16.
 */
899.update16:
900	cmp	counter,4
901	ble	.cont16
902	fzeros	%f21
903
904	stx	%i1,[%fp+tmp_px]
905	sethi	%hi(0x7f800000),%i0
906
907	sub	counter,4,counter
908	st	counter,[%fp+tmp_counter]
909
910	ba	.cont16
911	or	%g0,4,counter
912
913	.align	16
/*
 * .update17: reached from .main_loop when look-ahead element (1_0)
 * has ax < 0x00800000.  counter is the main-loop count, from which 5
 * was subtracted before the pass began.
 * counter <= 4: go straight back; only fzeros (delay slot of ble,
 * always executed) has taken effect.
 * Otherwise %i1 (address of the offending element) and counter - 4
 * are saved for the next pass through .begin, counter is set to 4 and
 * ax (%i0) is cleared before resuming at .cont17.  Returning to
 * .cont17 skips the fstod of %f21 that sits just before that label.
 */
914.update17:
915	cmp	counter,4
916	ble	.cont17
917	fzeros	%f21
918
919	stx	%i1,[%fp+tmp_px]
920	clr	%i0
921
922	sub	counter,4,counter
923	st	counter,[%fp+tmp_counter]
924
925	ba	.cont17
926	or	%g0,4,counter
927
928	.align	16
/*
 * .update18: reached from .main_loop when look-ahead element (2_0)
 * has ax >= 0x7f800000.  counter is the main-loop count, from which 5
 * was subtracted before the pass began.
 * counter <= 5: go straight back; only fzeros (delay slot of ble,
 * always executed) has taken effect.
 * Otherwise %o7 (address of the offending element) and counter - 5
 * are saved for the next pass through .begin, counter is set to 5 and
 * ax (%o2) is overwritten with 0x7f800000 before resuming at .cont18.
 */
929.update18:
930	cmp	counter,5
931	ble	.cont18
932	fzeros	%f25
933
934	stx	%o7,[%fp+tmp_px]
935	sethi	%hi(0x7f800000),%o2
936
937	sub	counter,5,counter
938	st	counter,[%fp+tmp_counter]
939
940	ba	.cont18
941	or	%g0,5,counter
942
943	.align	16
/*
 * .update19: reached from .main_loop when look-ahead element (2_0)
 * has ax < 0x00800000.  counter is the main-loop count, from which 5
 * was subtracted before the pass began.
 * counter <= 5: go straight back; only fzeros (delay slot of ble,
 * always executed) has taken effect.
 * Otherwise %o7 (address of the offending element) and counter - 5
 * are saved for the next pass through .begin, counter is set to 5 and
 * ax (%o2) is cleared before resuming at .cont19.  Returning to
 * .cont19 skips the fstod of %f25 that sits just before that label.
 */
944.update19:
945	cmp	counter,5
946	ble	.cont19
947	fzeros	%f25
948
949	stx	%o7,[%fp+tmp_px]
950	clr	%o2
951
952	sub	counter,5,counter
953	st	counter,[%fp+tmp_counter]
954
955	ba	.cont19
956	or	%g0,5,counter
957
958	.align	16
/*
 * .update20: reached from .main_loop when look-ahead element (3_0)
 * has ax >= 0x7f800000.  counter is the main-loop count, from which 5
 * was subtracted before the pass began.
 * counter <= 6: go straight back; only fzeros (delay slot of ble,
 * always executed) has taken effect.
 * Otherwise %i1 (address of the offending element) and counter - 6
 * are saved for the next pass through .begin, counter is set to 6 and
 * ax (%o1) is overwritten with 0x7f800000 before resuming at .cont20.
 */
959.update20:
960	cmp	counter,6
961	ble	.cont20
962	fzeros	%f0
963
964	stx	%i1,[%fp+tmp_px]
965	sethi	%hi(0x7f800000),%o1
966
967	sub	counter,6,counter
968	st	counter,[%fp+tmp_counter]
969
970	ba	.cont20
971	or	%g0,6,counter
972
973	.align	16
/*
 * .update21: reached from .main_loop when look-ahead element (3_0)
 * has ax < 0x00800000.  counter is the main-loop count, from which 5
 * was subtracted before the pass began.
 * counter <= 6: go straight back; only fzeros (delay slot of ble,
 * always executed) has taken effect.
 * Otherwise %i1 (address of the offending element) and counter - 6
 * are saved for the next pass through .begin, counter is set to 6 and
 * ax (%o1) is cleared before resuming at .cont21.  Returning to
 * .cont21 skips the fstod of %f0 that sits just before that label.
 */
974.update21:
975	cmp	counter,6
976	ble	.cont21
977	fzeros	%f0
978
979	stx	%i1,[%fp+tmp_px]
980	clr	%o1
981
982	sub	counter,6,counter
983	st	counter,[%fp+tmp_counter]
984
985	ba	.cont21
986	or	%g0,6,counter
987
/*
 * .exit: reached from .begin1 once counter <= 0.
 * Returns to the caller; restore sits in the delay slot of ret and
 * undoes the save done at function entry.  The code sets no return
 * value.  SET_SIZE is a macro defined outside this file (presumably
 * it emits the symbol's .size - confirm).
 */
988.exit:
989	ret
990	restore
991	SET_SIZE(__vsqrtf_ultra3)
992
993