/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

	.file	"__vsqrtf_ultra3.S"

#include "libm.h"
#if defined(LIBMVEC_SO_BUILD)
	.weak	__vsqrtf
	.type	__vsqrtf,#function
	__vsqrtf = __vsqrtf_ultra3
#endif

	RO_DATA
	.align	64

.CONST_TBL:
	.word	0x3fe00001, 0x80007e00	! K1  =  5.00000715259318464227e-01
	.word	0xbfc00003, 0xc0017a01	! K2  = -1.25000447037521686593e-01
	.word	0x000fffff, 0xffffffff	! DC0 = 0x000fffffffffffff
	.word	0x3ff00000, 0x00000000	! DC1 = 0x3ff0000000000000
	.word	0x7ffff000, 0x00000000	! DC2 = 0x7ffff00000000000

#define DC0		%f6
#define DC1		%f4
#define DC2		%f2
#define K2		%f38
#define K1		%f36
#define TBL		%l2
#define stridex		%l3
#define stridey		%l4
#define _0x1ff0		%l5
#define counter		%l6
#define _0x00800000	%l7
#define _0x7f800000	%o0

#define tmp_px		STACK_BIAS-0x40
#define tmp_counter	STACK_BIAS-0x38
#define tmp0		STACK_BIAS-0x30
#define tmp1		STACK_BIAS-0x28
#define tmp2		STACK_BIAS-0x20
#define tmp3		STACK_BIAS-0x18
#define tmp4		STACK_BIAS-0x10

! sizeof temp storage - must be a multiple of 16 for V9
#define tmps		0x40

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!      !!!!!   algorithm   !!!!!
!
!  x0 = *px;
!  ax = *(int*)px;
!  px += stridex;
!
!  if( ax >= 0x7f800000 )
!  {
!    *py = sqrtf(x0);
!    py += stridey;
!    continue;
!  }
!  if( ax < 0x00800000 )
!  {
!    *py = sqrtf(x0);
!    py += stridey;
!    continue;
!  }
!
!  db0 = (double)x0;
!  iexp0 = ax >> 24;
!  iexp0 += 0x3c0;
!  lexp0 = (long long)iexp0 << 52;
!
!  db0 = vis_fand(db0,DC0);
!  db0 = vis_for(db0,DC1);
!  hi0 = vis_fand(db0,DC2);
!
!  ax >>= 11;
!  si0 = ax & 0x1ff0;
!  dtmp0 = ((double*)((char*)TBL + si0))[0];
!  xx0 = (db0 - hi0);
!  xx0 *= dtmp0;
!  dtmp0 = ((double*)((char*)TBL + si0))[1];
!  res0 = K2 * xx0;
!  res0 += K1;
!  res0 *= xx0;
!  res0 += DC1;
!  res0 = dtmp0 * res0;
!  dtmp1 = *((double*)&lexp0);
!  res0 *= dtmp1;
!  fres0 = (float)res0;
!  *py = fres0;
!  py += stridey;
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

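! ---------------------------------------------------------------------
! The C sketch below is a readability aid only, not the shipped
! reference code: it restates the per-element computation above in
! compilable form.  The function name vsqrtf_elem and the local
! variable names are hypothetical; TBL stands for __vlibm_TBL_sqrtf,
! which, per this routine's usage, stores a pair of doubles per index:
! the first scales (db0 - hi0) into the polynomial argument and the
! second scales the polynomial result back.
!
!   #include <math.h>
!   #include <stdint.h>
!   #include <string.h>
!
!   static const double K1 =  5.00000715259318464227e-01;
!   static const double K2 = -1.25000447037521686593e-01;
!
!   static float
!   vsqrtf_elem(float x0, const double *TBL)
!   {
!   	int32_t  ax;
!   	uint64_t bits, lexp0;
!   	double   db0, hi0, xx0, res0, dtmp0, dtmp1, scale;
!
!   	memcpy(&ax, &x0, sizeof (ax));		/* ax = *(int*)px; */
!
!   	/* Inf, NaN, negative, zero, subnormal: scalar fallback */
!   	if (ax >= 0x7f800000 || ax < 0x00800000)
!   		return (sqrtf(x0));
!
!   	/* power-of-two scale factor built from the float exponent */
!   	lexp0 = (uint64_t)((ax >> 24) + 0x3c0) << 52;
!   	memcpy(&scale, &lexp0, sizeof (scale));
!
!   	/* reduce db0: keep the mantissa, force exponent 0x3ff (DC0, DC1) */
!   	db0 = (double)x0;
!   	memcpy(&bits, &db0, sizeof (bits));
!   	bits = (bits & 0x000fffffffffffffULL) | 0x3ff0000000000000ULL;
!   	memcpy(&db0, &bits, sizeof (db0));
!
!   	/* hi0 = db0 truncated to its leading mantissa bits (DC2) */
!   	bits &= 0x7ffff00000000000ULL;
!   	memcpy(&hi0, &bits, sizeof (hi0));
!
!   	/* table lookup keyed by the leading bits of the original float;
!   	   si0 is a byte offset into the pair table */
!   	int si0 = (ax >> 11) & 0x1ff0;
!   	dtmp0 = *(const double *)((const char *)TBL + si0);
!   	dtmp1 = *(const double *)((const char *)TBL + si0 + 8);
!
!   	/* sqrt(hi0 + d) ~ sqrt(hi0) * (1 + u/2 - u*u/8), u = d/hi0 */
!   	xx0  = (db0 - hi0) * dtmp0;
!   	res0 = ((K2 * xx0 + K1) * xx0 + 1.0) * dtmp1 * scale;
!   	return ((float)res0);
!   }
! ---------------------------------------------------------------------
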
	ENTRY(__vsqrtf_ultra3)
	save	%sp,-SA(MINFRAME)-tmps,%sp
	PIC_SETUP(l7)
	PIC_SET(l7,.CONST_TBL,o2)
	PIC_SET(l7,__vlibm_TBL_sqrtf,l2)

	st	%i0,[%fp+tmp_counter]
	sll	%i2,2,stridex
	or	%g0,0xff8,%l5

	stx	%i1,[%fp+tmp_px]
	sll	%l5,1,_0x1ff0

	ldd	[%o2],K1
	sll	%i4,2,stridey

	ldd	[%o2+8],K2
	or	%g0,%i3,%g5

	ldd	[%o2+16],DC0
	sethi	%hi(0x7f800000),%o0

	ldd	[%o2+24],DC1
	sethi	%hi(0x00800000),%l7

	ldd	[%o2+32],DC2

.begin:
	ld	[%fp+tmp_counter],counter
	ldx	[%fp+tmp_px],%i1
	st	%g0,[%fp+tmp_counter]
.begin1:
	cmp	counter,0
	ble,pn	%icc,.exit

	lda	[%i1]0x82,%o2		! (2_0) ax = *(int*)px;

	or	%g0,%i1,%o7
	lda	[%i1]0x82,%f25		! (2_0) x0 = *px;

	cmp	%o2,_0x7f800000		! (2_0) ax ? 0x7f800000
	bge,pn	%icc,.spec		! (2_0) if( ax >= 0x7f800000 )
	nop

	cmp	%o2,_0x00800000		! (2_0) ax ? 0x00800000
	bl,pn	%icc,.spec		! (2_0) if( ax < 0x00800000 )
	nop

	fstod	%f25,%f56		! (2_0) db0 = (double)x0;

	lda	[stridex+%o7]0x82,%o1	! (3_0) ax = *(int*)px;

	sra	%o2,24,%l1		! (2_0) iexp0 = ax >> 24;

	add	%o7,stridex,%i1		! px += stridex
	add	%l1,960,%l0		! (2_0) iexp0 += 0x3c0;
	lda	[stridex+%o7]0x82,%f0	! (3_0) x0 = *px;
	fand	%f56,DC0,%f60		! (2_0) db0 = vis_fand(db0,DC0);

	cmp	%o1,_0x7f800000		! (3_0) ax ? 0x7f800000
	bge,pn	%icc,.update0		! (3_0) if( ax >= 0x7f800000 )
	nop
.cont0:
	sllx	%l0,52,%o3		! (2_0) lexp0 = (long long)iexp0 << 52;

	sra	%o2,11,%i2		! (2_0) ax >>= 11;
	stx	%o3,[%fp+tmp0]		! (2_0) dtmp1 = *((double*)&lexp0);
	for	%f60,DC1,%f40		! (2_0) db0 = vis_for(db0,DC1);

	cmp	%o1,_0x00800000		! (3_0) ax ? 0x00800000
	bl,pn	%icc,.update1		! (3_0) if( ax < 0x00800000 )
	nop
.cont1:
	fstod	%f0,%f48		! (3_0) db0 = (double)x0;

	and	%i2,_0x1ff0,%o3		! (2_0) si0 = ax & 0x1ff0;
	lda	[%i1+stridex]0x82,%o2	! (4_0) ax = *(int*)px;

	add	%i1,stridex,%i1		! px += stridex
	add	%o3,TBL,%i2		! (2_0) (char*)TBL + si0
	fand	%f40,DC2,%f46		! (2_0) hi0 = vis_fand(db0,DC2);

	sra	%o1,24,%o4		! (3_0) iexp0 = ax >> 24;

	lda	[%i1]0x82,%f13		! (4_0) x0 = *px;
	fand	%f48,DC0,%f58		! (3_0) db0 = vis_fand(db0,DC0);

	add	%o4,960,%i0		! (3_0) iexp0 += 0x3c0;

	cmp	%o2,_0x7f800000		! (4_1) ax ? 0x7f800000
	bge,pn	%icc,.update2		! (4_1) if( ax >= 0x7f800000 )
	nop
.cont2:
	fsubd	%f40,%f46,%f44		! (2_1) xx0 = (db0 - hi0);
	sllx	%i0,52,%g1		! (3_1) lexp0 = (long long)iexp0 << 52;
	ldd	[%i2],%f40		! (2_1) dtmp0 = ((double*)((char*)TBL + si0))[0];

	sra	%o1,11,%l0		! (3_1) ax >>= 11;
	stx	%g1,[%fp+tmp1]		! (3_1) dtmp1 = *((double*)&lexp0);
	for	%f58,DC1,%f48		! (3_1) db0 = vis_for(db0,DC1);

	cmp	%o2,_0x00800000		! (4_1) ax ? 0x00800000
	bl,pn	%icc,.update3		! (4_1) if( ax < 0x00800000 )
	nop
.cont3:
	fstod	%f13,%f50		! (4_1) db0 = (double)x0;

	fmuld	%f44,%f40,%f46		! (2_1) xx0 *= dtmp0;
	and	%l0,_0x1ff0,%i0		! (3_1) si0 = ax & 0x1ff0;
	lda	[%i1+stridex]0x82,%l1	! (0_0) ax = *(int*)px;

	add	%i0,TBL,%l0		! (3_1) (char*)TBL + si0
	fand	%f48,DC2,%f62		! (3_1) hi0 = vis_fand(db0,DC2);

	sra	%o2,24,%o7		! (4_1) iexp0 = ax >> 24;

	add	%i1,stridex,%o4		! px += stridex
	add	%o7,960,%o7		! (4_1) iexp0 += 0x3c0;
	lda	[%i1+stridex]0x82,%f17	! (0_0) x0 = *px;
	fand	%f50,DC0,%f54		! (4_1) db0 = vis_fand(db0,DC0);

	fmuld	K2,%f46,%f52		! (2_1) res0 = K2 * xx0;
	cmp	%l1,_0x7f800000		! (0_0) ax ? 0x7f800000
	bge,pn	%icc,.update4		! (0_0) if( ax >= 0x7f800000 )
	fsubd	%f48,%f62,%f42		! (3_1) xx0 = (db0 - hi0);
.cont4:
	sllx	%o7,52,%o1		! (4_1) lexp0 = (long long)iexp0 << 52;
	ldd	[%i0+TBL],%f40		! (3_1) dtmp0 = ((double*)((char*)TBL + si0))[0];

	sra	%o2,11,%i5		! (4_1) ax >>= 11;
	stx	%o1,[%fp+tmp2]		! (4_1) dtmp1 = *((double*)&lexp0);
	for	%f54,DC1,%f34		! (4_1) db0 = vis_for(db0,DC1);

	cmp	%l1,_0x00800000		! (0_0) ax ? 0x00800000
	bl,pn	%icc,.update5		! (0_0) if( ax < 0x00800000 )
	nop
.cont5:
	fstod	%f17,%f56		! (0_0) db0 = (double)x0;

	fmuld	%f42,%f40,%f42		! (3_1) xx0 *= dtmp0;
	lda	[stridex+%o4]0x82,%i0	! (1_0) ax = *(int*)px;
	faddd	%f52,K1,%f52		! (2_1) res0 += K1;

	sra	%l1,24,%g1		! (0_0) iexp0 = ax >> 24;
	and	%i5,_0x1ff0,%i5		! (4_1) si0 = ax & 0x1ff0;
	fand	%f34,DC2,%f62		! (4_1) hi0 = vis_fand(db0,DC2);

	add	%o4,stridex,%i1		! px += stridex

	add	%g1,960,%o5		! (0_0) iexp0 += 0x3c0;
	add	%i5,TBL,%i3		! (4_1) (char*)TBL + si0
	lda	[stridex+%o4]0x82,%f21	! (1_0) x0 = *px;
	fand	%f56,DC0,%f32		! (0_0) db0 = vis_fand(db0,DC0);

	fmuld	K2,%f42,%f50		! (3_1) res0 = K2 * xx0;
	cmp	%i0,_0x7f800000		! (1_0) ax ? 0x7f800000
	bge,pn	%icc,.update6		! (1_0) if( ax >= 0x7f800000 )
	fsubd	%f34,%f62,%f54		! (4_1) xx0 = (db0 - hi0);
.cont6:
	fmuld	%f52,%f46,%f52		! (2_1) res0 *= xx0;
	sllx	%o5,52,%o7		! (0_0) lexp0 = (long long)iexp0 << 52;
	ldd	[TBL+%i5],%f62		! (4_1) dtmp0 = ((double*)((char*)TBL + si0))[0];

	sra	%l1,11,%i4		! (0_0) ax >>= 11;
	stx	%o7,[%fp+tmp3]		! (0_0) dtmp1 = *((double*)&lexp0);
	for	%f32,DC1,%f48		! (0_0) db0 = vis_for(db0,DC1);

	cmp	%i0,_0x00800000		! (1_0) ax ? 0x00800000
	bl,pn	%icc,.update7		! (1_0) if( ax < 0x00800000 )
	nop
.cont7:
	fstod	%f21,%f56		! (1_0) db0 = (double)x0;

	fmuld	%f54,%f62,%f46		! (4_1) xx0 *= dtmp0;
	and	%i4,_0x1ff0,%g1		! (0_0) si0 = ax & 0x1ff0;
	lda	[%i1+stridex]0x82,%o2	! (2_0) ax = *(int*)px;
	faddd	%f50,K1,%f62		! (3_1) res0 += K1;

	add	%g1,TBL,%i5		! (0_0) (double*)((char*)TBL + si0
	fand	%f48,DC2,%f32		! (0_0) hi0 = vis_fand(db0,DC2);

	sra	%i0,24,%o4		! (1_0) iexp0 = ax >> 24;
	ldd	[%i2+8],%f60		! (2_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
	faddd	%f52,DC1,%f58		! (2_1) res0 += DC1;

	add	%i1,stridex,%o7		! px += stridex
	add	%o4,960,%i2		! (1_0) iexp0 += 0x3c0;
	lda	[%i1+stridex]0x82,%f25	! (2_0) x0 = *px;
	fand	%f56,DC0,%f34		! (1_0) db0 = vis_fand(db0,DC0);

	fmuld	K2,%f46,%f50		! (4_1) res0 = K2 * xx0;
	cmp	%o2,_0x7f800000		! (2_0) ax ? 0x7f800000
	bge,pn	%icc,.update8		! (2_0) if( ax >= 0x7f800000 )
	fsubd	%f48,%f32,%f52		! (0_0) xx0 = (db0 - hi0);
.cont8:
	fmuld	%f62,%f42,%f54		! (3_1) res0 *= xx0;
	sllx	%i2,52,%o4		! (1_0) lexp0 = (long long)iexp0 << 52;
	ldd	[TBL+%g1],%f32		! (0_0) dtmp0 = ((double*)((char*)TBL + si0))[0];

	fmuld	%f60,%f58,%f60		! (2_1) res0 = dtmp0 * res0;
	sra	%i0,11,%g1		! (1_0) ax >>= 11;
	stx	%o4,[%fp+tmp4]		! (1_0) dtmp1 = *((double*)&lexp0);
	for	%f34,DC1,%f48		! (1_0) db0 = vis_for(db0,DC1);

	cmp	%o2,_0x00800000		! (2_0) ax ? 0x00800000
	bl,pn	%icc,.update9		! (2_0) if( ax < 0x00800000 )
	ldd	[%fp+tmp0],%f40		! (2_1) dtmp1 = *((double*)&lexp0);
	fstod	%f25,%f56		! (2_0) db0 = (double)x0;
.cont9:
	fmuld	%f52,%f32,%f42		! (0_0) xx0 *= dtmp0;
	and	%g1,_0x1ff0,%o5		! (1_0) si0 = ax & 0x1ff0;
	lda	[stridex+%o7]0x82,%o1	! (3_0) ax = *(int*)px;
	faddd	%f50,K1,%f34		! (4_1) res0 += K1;

	add	%o5,TBL,%i4		! (1_0) (char*)TBL + si0
	fand	%f48,DC2,%f62		! (1_0) hi0 = vis_fand(db0,DC2);

	fmuld	%f60,%f40,%f32		! (2_1) res0 *= dtmp1;
	sra	%o2,24,%l1		! (2_0) iexp0 = ax >> 24;
	ldd	[%l0+8],%f40		! (3_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
	faddd	%f54,DC1,%f58		! (3_1) res0 += DC1;

	add	%o7,stridex,%i1		! px += stridex
	add	%l1,960,%l0		! (2_0) iexp0 += 0x3c0;
	lda	[stridex+%o7]0x82,%f0	! (3_0) x0 = *px;
	fand	%f56,DC0,%f60		! (2_0) db0 = vis_fand(db0,DC0);

	fmuld	K2,%f42,%f50		! (0_0) res0 = K2 * xx0;
	cmp	%o1,_0x7f800000		! (3_0) ax ? 0x7f800000
	bge,pn	%icc,.update10		! (3_0) if( ax >= 0x7f800000 )
	fsubd	%f48,%f62,%f54		! (1_0) xx0 = (db0 - hi0);
.cont10:
	fmuld	%f34,%f46,%f52		! (4_1) res0 *= xx0;
	sllx	%l0,52,%o3		! (2_0) lexp0 = (long long)iexp0 << 52;
	ldd	[TBL+%o5],%f56		! (1_0) dtmp0 = ((double*)((char*)TBL + si0))[0];

	fmuld	%f40,%f58,%f34		! (3_1) res0 = dtmp0 * res0;
	sra	%o2,11,%i2		! (2_0) ax >>= 11;
	stx	%o3,[%fp+tmp0]		! (2_0) dtmp1 = *((double*)&lexp0);
	for	%f60,DC1,%f40		! (2_0) db0 = vis_for(db0,DC1);

	cmp	%o1,_0x00800000		! (3_0) ax ? 0x00800000
	bl,pn	%icc,.update11		! (3_0) if( ax < 0x00800000 )
	ldd	[%fp+tmp1],%f62		! (3_1) dtmp1 = *((double*)&lexp0);
	fstod	%f0,%f48		! (3_0) db0 = (double)x0;
.cont11:
	fmuld	%f54,%f56,%f30		! (1_0) xx0 *= dtmp0;
	and	%i2,_0x1ff0,%o3		! (2_0) si0 = ax & 0x1ff0;
	lda	[%i1+stridex]0x82,%o2	! (4_0) ax = *(int*)px;
	faddd	%f50,K1,%f56		! (0_0) res0 += K1;

	add	%i1,stridex,%i1		! px += stridex
	add	%o3,TBL,%i2		! (2_0) (char*)TBL + si0
	fand	%f40,DC2,%f46		! (2_0) hi0 = vis_fand(db0,DC2);

	fmuld	%f34,%f62,%f28		! (3_1) res0 *= dtmp1;
	sra	%o1,24,%o4		! (3_0) iexp0 = ax >> 24;
	ldd	[%i3+8],%f50		! (4_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
	faddd	%f52,DC1,%f54		! (4_1) res0 += DC1;

	lda	[%i1]0x82,%f13		! (4_0) x0 = *px;
	fand	%f48,DC0,%f58		! (3_0) db0 = vis_fand(db0,DC0);

	or	%g0,%g5,%i3
	cmp	counter,5
	bl,pn	%icc,.tail
	add	%o4,960,%g5		! (3_0) iexp0 += 0x3c0;

	ba	.main_loop
	sub	counter,5,counter	! counter

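! The main loop below is software pipelined: five elements are in flight
! at once, and the (i_j) tags in the instruction comments name the element
! slot (i = 0..4) and the pipeline stage (j) an instruction belongs to.
! The code ahead of the loop primes the pipeline and .tail drains whatever
! is left once fewer than five elements remain.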
	.align	16
.main_loop:
	fmuld	K2,%f30,%f60		! (1_1) res0 = K2 * xx0;
	cmp	%o2,_0x7f800000		! (4_1) ax ? 0x7f800000
	bge,pn	%icc,.update12		! (4_1) if( ax >= 0x7f800000 )
	fsubd	%f40,%f46,%f44		! (2_1) xx0 = (db0 - hi0);
.cont12:
	fmuld	%f56,%f42,%f52		! (0_1) res0 *= xx0;
	sllx	%g5,52,%g5		! (3_1) lexp0 = (long long)iexp0 << 52;
	ldd	[%i2],%f40		! (2_1) dtmp0 = ((double*)((char*)TBL + si0))[0];
	fdtos	%f32,%f15		! (2_2) fres0 = (float)res0;

	fmuld	%f50,%f54,%f42		! (4_2) res0 = dtmp0 * res0;
	sra	%o1,11,%l0		! (3_1) ax >>= 11;
	stx	%g5,[%fp+tmp1]		! (3_1) dtmp1 = *((double*)&lexp0);
	for	%f58,DC1,%f48		! (3_1) db0 = vis_for(db0,DC1);

	cmp	%o2,_0x00800000		! (4_1) ax ? 0x00800000
	bl,pn	%icc,.update13		! (4_1) if( ax < 0x00800000 )
	ldd	[%fp+tmp2],%f56		! (4_2) dtmp1 = *((double*)&lexp0);
	fstod	%f13,%f50		! (4_1) db0 = (double)x0;
.cont13:
	fmuld	%f44,%f40,%f46		! (2_1) xx0 *= dtmp0;
	and	%l0,_0x1ff0,%i0		! (3_1) si0 = ax & 0x1ff0;
	lda	[%i1+stridex]0x82,%l1	! (0_0) ax = *(int*)px;
	faddd	%f60,K1,%f32		! (1_1) res0 += K1;

	add	%i0,TBL,%l0		! (3_1) (char*)TBL + si0
	add	%i3,stridey,%o3		! py += stridey
	st	%f15,[%i3]		! (2_2) *py = fres0;
	fand	%f48,DC2,%f62		! (3_1) hi0 = vis_fand(db0,DC2);

	fmuld	%f42,%f56,%f44		! (4_2) res0 *= dtmp1;
	sra	%o2,24,%o7		! (4_1) iexp0 = ax >> 24;
	ldd	[%i5+8],%f58		! (0_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
	faddd	%f52,DC1,%f34		! (0_1) res0 += DC1;

	add	%i1,stridex,%o4		! px += stridex
	add	%o7,960,%o7		! (4_1) iexp0 += 0x3c0;
	lda	[%i1+stridex]0x82,%f17	! (0_0) x0 = *px;
	fand	%f50,DC0,%f54		! (4_1) db0 = vis_fand(db0,DC0);

	fmuld	K2,%f46,%f52		! (2_1) res0 = K2 * xx0;
	cmp	%l1,_0x7f800000		! (0_0) ax ? 0x7f800000
	bge,pn	%icc,.update14		! (0_0) if( ax >= 0x7f800000 )
	fsubd	%f48,%f62,%f42		! (3_1) xx0 = (db0 - hi0);
.cont14:
	fmuld	%f32,%f30,%f48		! (1_1) res0 *= xx0;
	sllx	%o7,52,%o1		! (4_1) lexp0 = (long long)iexp0 << 52;
	ldd	[%i0+TBL],%f40		! (3_1) dtmp0 = ((double*)((char*)TBL + si0))[0];
	fdtos	%f28,%f19		! (3_2) fres0 = (float)res0;

	fmuld	%f58,%f34,%f32		! (0_1) res0 = dtmp0 * res0;
	sra	%o2,11,%i5		! (4_1) ax >>= 11;
	stx	%o1,[%fp+tmp2]		! (4_1) dtmp1 = *((double*)&lexp0);
	for	%f54,DC1,%f34		! (4_1) db0 = vis_for(db0,DC1);

	cmp	%l1,_0x00800000		! (0_0) ax ? 0x00800000
	bl,pn	%icc,.update15		! (0_0) if( ax < 0x00800000 )
	ldd	[%fp+tmp3],%f60		! (0_1) dtmp1 = *((double*)&lexp0);
	fstod	%f17,%f56		! (0_0) db0 = (double)x0;
.cont15:
	fmuld	%f42,%f40,%f42		! (3_1) xx0 *= dtmp0;
	add	%o3,stridey,%g5		! py += stridey
	lda	[stridex+%o4]0x82,%i0	! (1_0) ax = *(int*)px;
	faddd	%f52,K1,%f52		! (2_1) res0 += K1;

	sra	%l1,24,%g1		! (0_0) iexp0 = ax >> 24;
	and	%i5,_0x1ff0,%i5		! (4_1) si0 = ax & 0x1ff0;
	st	%f19,[%o3]		! (3_2) *py = fres0;
	fand	%f34,DC2,%f62		! (4_1) hi0 = vis_fand(db0,DC2);

	fmuld	%f32,%f60,%f40		! (0_1) res0 *= dtmp1;
	add	%o4,stridex,%i1		! px += stridex
	ldd	[%i4+8],%f60		! (1_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
	faddd	%f48,DC1,%f58		! (1_1) res0 += DC1;

	add	%g1,960,%o5		! (0_0) iexp0 += 0x3c0;
	add	%i5,TBL,%i3		! (4_1) (char*)TBL + si0
	lda	[stridex+%o4]0x82,%f21	! (1_0) x0 = *px;
	fand	%f56,DC0,%f32		! (0_0) db0 = vis_fand(db0,DC0);

	fmuld	K2,%f42,%f50		! (3_1) res0 = K2 * xx0;
	cmp	%i0,_0x7f800000		! (1_0) ax ? 0x7f800000
	bge,pn	%icc,.update16		! (1_0) if( ax >= 0x7f800000 )
	fsubd	%f34,%f62,%f54		! (4_1) xx0 = (db0 - hi0);
.cont16:
	fmuld	%f52,%f46,%f52		! (2_1) res0 *= xx0;
	sllx	%o5,52,%o7		! (0_0) lexp0 = (long long)iexp0 << 52;
	ldd	[TBL+%i5],%f62		! (4_1) dtmp0 = ((double*)((char*)TBL + si0))[0];
	fdtos	%f44,%f23		! (4_2) fres0 = (float)res0;

	fmuld	%f60,%f58,%f44		! (1_1) res0 = dtmp0 * res0;
	sra	%l1,11,%i4		! (0_0) ax >>= 11;
	stx	%o7,[%fp+tmp3]		! (0_0) dtmp1 = *((double*)&lexp0);
	for	%f32,DC1,%f48		! (0_0) db0 = vis_for(db0,DC1);

	cmp	%i0,_0x00800000		! (1_0) ax ? 0x00800000
	bl,pn	%icc,.update17		! (1_0) if( ax < 0x00800000 )
	ldd	[%fp+tmp4],%f34		! (1_1) dtmp1 = *((double*)&lexp0);
	fstod	%f21,%f56		! (1_0) db0 = (double)x0;
.cont17:
	fmuld	%f54,%f62,%f46		! (4_1) xx0 *= dtmp0;
	and	%i4,_0x1ff0,%g1		! (0_0) si0 = ax & 0x1ff0;
	lda	[%i1+stridex]0x82,%o2	! (2_0) ax = *(int*)px;
	faddd	%f50,K1,%f62		! (3_1) res0 += K1;

	add	%g1,TBL,%i5		! (0_0) (double*)((char*)TBL + si0
	add	%g5,stridey,%g5		! py += stridey
	st	%f23,[stridey+%o3]	! (4_2) *py = fres0;
	fand	%f48,DC2,%f32		! (0_0) hi0 = vis_fand(db0,DC2);

	fmuld	%f44,%f34,%f44		! (1_1) res0 *= dtmp1;
	sra	%i0,24,%o4		! (1_0) iexp0 = ax >> 24;
	ldd	[%i2+8],%f60		! (2_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
	faddd	%f52,DC1,%f58		! (2_1) res0 += DC1;

	add	%i1,stridex,%o7		! px += stridex
	add	%o4,960,%i2		! (1_0) iexp0 += 0x3c0;
	lda	[%i1+stridex]0x82,%f25	! (2_0) x0 = *px;
	fand	%f56,DC0,%f34		! (1_0) db0 = vis_fand(db0,DC0);

	fmuld	K2,%f46,%f50		! (4_1) res0 = K2 * xx0;
	cmp	%o2,_0x7f800000		! (2_0) ax ? 0x7f800000
	bge,pn	%icc,.update18		! (2_0) if( ax >= 0x7f800000 )
	fsubd	%f48,%f32,%f52		! (0_0) xx0 = (db0 - hi0);
.cont18:
	fmuld	%f62,%f42,%f54		! (3_1) res0 *= xx0;
	sllx	%i2,52,%o4		! (1_0) lexp0 = (long long)iexp0 << 52;
	ldd	[TBL+%g1],%f32		! (0_0) dtmp0 = ((double*)((char*)TBL + si0))[0];
	fdtos	%f40,%f27		! (0_1) fres0 = (float)res0;

	fmuld	%f60,%f58,%f60		! (2_1) res0 = dtmp0 * res0;
	sra	%i0,11,%g1		! (1_0) ax >>= 11;
	stx	%o4,[%fp+tmp4]		! (1_0) dtmp1 = *((double*)&lexp0);
	for	%f34,DC1,%f48		! (1_0) db0 = vis_for(db0,DC1);

	cmp	%o2,_0x00800000		! (2_0) ax ? 0x00800000
	bl,pn	%icc,.update19		! (2_0) if( ax < 0x00800000 )
	ldd	[%fp+tmp0],%f40		! (2_1) dtmp1 = *((double*)&lexp0);
	fstod	%f25,%f56		! (2_0) db0 = (double)x0;
.cont19:
	fmuld	%f52,%f32,%f42		! (0_0) xx0 *= dtmp0;
	and	%g1,_0x1ff0,%o5		! (1_0) si0 = ax & 0x1ff0;
	lda	[stridex+%o7]0x82,%o1	! (3_0) ax = *(int*)px;
	faddd	%f50,K1,%f34		! (4_1) res0 += K1;

	add	%o5,TBL,%i4		! (1_0) (char*)TBL + si0
	add	%g5,stridey,%g1		! py += stridey
	st	%f27,[%g5]		! (0_1) *py = fres0;
	fand	%f48,DC2,%f62		! (1_0) hi0 = vis_fand(db0,DC2);

	fmuld	%f60,%f40,%f32		! (2_1) res0 *= dtmp1;
	sra	%o2,24,%l1		! (2_0) iexp0 = ax >> 24;
	ldd	[%l0+8],%f40		! (3_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
	faddd	%f54,DC1,%f58		! (3_1) res0 += DC1;

	add	%o7,stridex,%i1		! px += stridex
	add	%l1,960,%l0		! (2_0) iexp0 += 0x3c0;
	lda	[stridex+%o7]0x82,%f0	! (3_0) x0 = *px;
	fand	%f56,DC0,%f60		! (2_0) db0 = vis_fand(db0,DC0);

	fmuld	K2,%f42,%f50		! (0_0) res0 = K2 * xx0;
	cmp	%o1,_0x7f800000		! (3_0) ax ? 0x7f800000
	bge,pn	%icc,.update20		! (3_0) if( ax >= 0x7f800000 )
	fsubd	%f48,%f62,%f54		! (1_0) xx0 = (db0 - hi0);
.cont20:
	fmuld	%f34,%f46,%f52		! (4_1) res0 *= xx0;
	sllx	%l0,52,%o3		! (2_0) lexp0 = (long long)iexp0 << 52;
	ldd	[TBL+%o5],%f56		! (1_0) dtmp0 = ((double*)((char*)TBL + si0))[0];
	fdtos	%f44,%f8		! (1_1) fres0 = (float)res0;

	fmuld	%f40,%f58,%f34		! (3_1) res0 = dtmp0 * res0;
	sra	%o2,11,%i2		! (2_0) ax >>= 11;
	stx	%o3,[%fp+tmp0]		! (2_0) dtmp1 = *((double*)&lexp0);
	for	%f60,DC1,%f40		! (2_0) db0 = vis_for(db0,DC1);

	cmp	%o1,_0x00800000		! (3_0) ax ? 0x00800000
	bl,pn	%icc,.update21		! (3_0) if( ax < 0x00800000 )
	ldd	[%fp+tmp1],%f62		! (3_1) dtmp1 = *((double*)&lexp0);
	fstod	%f0,%f48		! (3_0) db0 = (double)x0;
.cont21:
	fmuld	%f54,%f56,%f30		! (1_0) xx0 *= dtmp0;
	and	%i2,_0x1ff0,%o3		! (2_0) si0 = ax & 0x1ff0;
	lda	[%i1+stridex]0x82,%o2	! (4_0) ax = *(int*)px;
	faddd	%f50,K1,%f56		! (0_0) res0 += K1;

	add	%i1,stridex,%i1		! px += stridex
	add	%o3,TBL,%i2		! (2_0) (char*)TBL + si0
	st	%f8,[stridey+%g5]	! (1_1) *py = fres0;
	fand	%f40,DC2,%f46		! (2_0) hi0 = vis_fand(db0,DC2);

	fmuld	%f34,%f62,%f28		! (3_1) res0 *= dtmp1;
	sra	%o1,24,%o4		! (3_0) iexp0 = ax >> 24;
	ldd	[%i3+8],%f50		! (4_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
	faddd	%f52,DC1,%f54		! (4_1) res0 += DC1;

	add	%g1,stridey,%i3		! py += stridey
	subcc	counter,5,counter	! counter
	lda	[%i1]0x82,%f13		! (4_0) x0 = *px;
	fand	%f48,DC0,%f58		! (3_0) db0 = vis_fand(db0,DC0);

	bpos,pt	%icc,.main_loop
	add	%o4,960,%g5		! (3_0) iexp0 += 0x3c0;

	add	counter,5,counter
.tail:
	subcc	counter,1,counter
	bneg,a	.begin
	or	%g0,%i3,%g5

	fmuld	%f56,%f42,%f52		! (0_1) res0 *= xx0;
	fdtos	%f32,%f15		! (2_2) fres0 = (float)res0;

	fmuld	%f50,%f54,%f42		! (4_2) res0 = dtmp0 * res0;

	ldd	[%fp+tmp2],%f56		! (4_2) dtmp1 = *((double*)&lexp0);

	add	%i3,stridey,%o3		! py += stridey
	st	%f15,[%i3]		! (2_2) *py = fres0;

	subcc	counter,1,counter
	bneg,a	.begin
	or	%g0,%o3,%g5

	fmuld	%f42,%f56,%f44		! (4_2) res0 *= dtmp1;
	ldd	[%i5+8],%f58		! (0_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
	faddd	%f52,DC1,%f34		! (0_1) res0 += DC1;

	fdtos	%f28,%f19		! (3_2) fres0 = (float)res0;

	fmuld	%f58,%f34,%f32		! (0_1) res0 = dtmp0 * res0;

	ldd	[%fp+tmp3],%f60		! (0_1) dtmp1 = *((double*)&lexp0);

	add	%o3,stridey,%g5		! py += stridey

	st	%f19,[%o3]		! (3_2) *py = fres0;

	subcc	counter,1,counter
	bneg,a	.begin
	nop

	fmuld	%f32,%f60,%f40		! (0_1) res0 *= dtmp1;

	fdtos	%f44,%f23		! (4_2) fres0 = (float)res0;

	add	%g5,stridey,%g5		! py += stridey
	st	%f23,[stridey+%o3]	! (4_2) *py = fres0;

	subcc	counter,1,counter
	bneg,a	.begin
	nop

	fdtos	%f40,%f27		! (0_1) fres0 = (float)res0;

	st	%f27,[%g5]		! (0_1) *py = fres0;

	ba	.begin
	add	%g5,stridey,%g5

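! .spec: scalar fallback for an argument the vector path cannot handle
! (Inf, NaN, negative, zero or subnormal).  The element is computed with
! the hardware fsqrts instruction and processing resumes at .begin1.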
	.align	16
.spec:
	fsqrts	%f25,%f25
	sub	counter,1,counter
	add	%i1,stridex,%i1
	st	%f25,[%g5]
	ba	.begin1
	add	%g5,stridey,%g5

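! .update0 .. .update21: a special argument was spotted while being fed
! into the pipeline.  Its register copy is zeroed (fzeros) so the
! speculative arithmetic stays harmless; the address of the offending
! element and the work still outstanding are stashed in tmp_px and
! tmp_counter, and the current pass is cut short so that .begin later
! restarts at that element, which then takes the scalar .spec path.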
	.align	16
.update0:
	cmp	counter,1
	ble	.cont0
	fzeros	%f0

	stx	%i1,[%fp+tmp_px]
	sethi	%hi(0x7f800000),%o1

	sub	counter,1,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont0
	or	%g0,1,counter

	.align	16
.update1:
	cmp	counter,1
	ble	.cont1
	fzeros	%f0

	stx	%i1,[%fp+tmp_px]
	clr	%o1

	sub	counter,1,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont1
	or	%g0,1,counter

	.align	16
.update2:
	cmp	counter,2
	ble	.cont2
	fzeros	%f13

	stx	%i1,[%fp+tmp_px]
	sethi	%hi(0x7f800000),%o2

	sub	counter,2,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont2
	or	%g0,2,counter

	.align	16
.update3:
	cmp	counter,2
	ble	.cont3
	fzeros	%f13

	stx	%i1,[%fp+tmp_px]
	clr	%o2

	sub	counter,2,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont3
	or	%g0,2,counter

	.align	16
.update4:
	cmp	counter,3
	ble	.cont4
	fzeros	%f17

	stx	%o4,[%fp+tmp_px]
	sethi	%hi(0x7f800000),%l1

	sub	counter,3,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont4
	or	%g0,3,counter

	.align	16
.update5:
	cmp	counter,3
	ble	.cont5
	fzeros	%f17

	stx	%o4,[%fp+tmp_px]
	clr	%l1

	sub	counter,3,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont5
	or	%g0,3,counter

	.align	16
.update6:
	cmp	counter,4
	ble	.cont6
	fzeros	%f21

	stx	%i1,[%fp+tmp_px]
	sethi	%hi(0x7f800000),%i0

	sub	counter,4,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont6
	or	%g0,4,counter

	.align	16
.update7:
	cmp	counter,4
	ble	.cont7
	fzeros	%f21

	stx	%i1,[%fp+tmp_px]
	clr	%i0

	sub	counter,4,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont7
	or	%g0,4,counter

	.align	16
.update8:
	cmp	counter,5
	ble	.cont8
	fzeros	%f25

	stx	%o7,[%fp+tmp_px]
	sethi	%hi(0x7f800000),%o2

	sub	counter,5,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont8
	or	%g0,5,counter

	.align	16
.update9:
	cmp	counter,5
	ble	.cont9
	fzeros	%f25

	stx	%o7,[%fp+tmp_px]
	clr	%o2

	sub	counter,5,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont9
	or	%g0,5,counter

	.align	16
.update10:
	cmp	counter,6
	ble	.cont10
	fzeros	%f0

	stx	%i1,[%fp+tmp_px]
	sethi	%hi(0x7f800000),%o1

	sub	counter,6,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont10
	or	%g0,6,counter

	.align	16
.update11:
	cmp	counter,6
	ble	.cont11
	fzeros	%f0

	stx	%i1,[%fp+tmp_px]
	clr	%o1

	sub	counter,6,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont11
	or	%g0,6,counter

	.align	16
.update12:
	cmp	counter,2
	ble	.cont12
	fzeros	%f13

	stx	%i1,[%fp+tmp_px]
	sethi	%hi(0x7f800000),%o2

	sub	counter,2,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont12
	or	%g0,2,counter

	.align	16
.update13:
	cmp	counter,2
	ble	.cont13
	fzeros	%f13

	stx	%i1,[%fp+tmp_px]
	clr	%o2

	sub	counter,2,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont13
	or	%g0,2,counter

	.align	16
.update14:
	cmp	counter,3
	ble	.cont14
	fzeros	%f17

	stx	%o4,[%fp+tmp_px]
	sethi	%hi(0x7f800000),%l1

	sub	counter,3,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont14
	or	%g0,3,counter

	.align	16
.update15:
	cmp	counter,3
	ble	.cont15
	fzeros	%f17

	stx	%o4,[%fp+tmp_px]
	clr	%l1

	sub	counter,3,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont15
	or	%g0,3,counter

	.align	16
.update16:
	cmp	counter,4
	ble	.cont16
	fzeros	%f21

	stx	%i1,[%fp+tmp_px]
	sethi	%hi(0x7f800000),%i0

	sub	counter,4,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont16
	or	%g0,4,counter

	.align	16
.update17:
	cmp	counter,4
	ble	.cont17
	fzeros	%f21

	stx	%i1,[%fp+tmp_px]
	clr	%i0

	sub	counter,4,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont17
	or	%g0,4,counter

	.align	16
.update18:
	cmp	counter,5
	ble	.cont18
	fzeros	%f25

	stx	%o7,[%fp+tmp_px]
	sethi	%hi(0x7f800000),%o2

	sub	counter,5,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont18
	or	%g0,5,counter

	.align	16
.update19:
	cmp	counter,5
	ble	.cont19
	fzeros	%f25

	stx	%o7,[%fp+tmp_px]
	clr	%o2

	sub	counter,5,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont19
	or	%g0,5,counter

	.align	16
.update20:
	cmp	counter,6
	ble	.cont20
	fzeros	%f0

	stx	%i1,[%fp+tmp_px]
	sethi	%hi(0x7f800000),%o1

	sub	counter,6,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont20
	or	%g0,6,counter

	.align	16
.update21:
	cmp	counter,6
	ble	.cont21
	fzeros	%f0

	stx	%i1,[%fp+tmp_px]
	clr	%o1

	sub	counter,6,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont21
	or	%g0,6,counter

.exit:
	ret
	restore
	SET_SIZE(__vsqrtf_ultra3)
