xref: /illumos-gate/usr/src/lib/libmvec/common/vis/__vhypotf.S (revision b31ca922c7346747131aed07c0c171ec2f573aac)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
23 */
24/*
25 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
26 * Use is subject to license terms.
27 */
28
29	.file	"__vhypotf.S"
30
31#include "libm.h"
32
33	RO_DATA
34	.align	64
35
/*
 * Read-only constant pool.  The setup code at function entry loads each
 * entry into a fixed FP register using a hard-coded byte offset from
 * .CONST_TBL:
 *	+0 K1, +8 K2, +16 DC0, +24 DC1, +32 DC2, +40 DA0, +48 DFMAX,
 *	+56 FMAX (single word), +60 SCALE (single word), +64 DA1.
 * Reordering or inserting entries requires updating those offsets.
 */
36.CONST_TBL:
37	.word	0x3fe00001, 0x80007e00	! K1  =  5.00000715259318464227e-01
38	.word	0xbfc00003, 0xc0017a01	! K2  = -1.25000447037521686593e-01
39	.word	0x000fffff, 0xffffffff	! DC0 = 0x000fffffffffffff
40	.word	0x3ff00000, 0x00000000	! DC1 = 0x3ff0000000000000
41	.word	0x7ffff000, 0x00000000	! DC2 = 0x7ffff00000000000
42	.word	0x7fe00000, 0x00000000	! DA0 = 0x7fe0000000000000
43	.word	0x47efffff, 0xe0000000	! DFMAX = 3.402823e+38
44	.word	0x7f7fffff, 0x80808080	! FMAX = 3.402823e+38 , SCALE = 0x80808080
45	.word	0x20000000, 0x00000000	! DA1 = 0x2000000000000000
46
/*
 * FP registers that hold the constants loaded from .CONST_TBL for the
 * whole call.  FMAX and SCALE are single-precision registers (%f2/%f3),
 * each filled by its own single-word load; the rest are doubles.
 */
47#define DC0		%f12
48#define DC1		%f10
49#define DC2		%f42
50#define DA0		%f6
51#define DA1		%f4
52#define K2		%f26
53#define K1		%f28
54#define SCALE		%f3
55#define FMAX		%f2
56#define DFMAX		%f50
57
/*
 * Integer registers that stay fixed after setup.  The strides are the
 * incoming element strides shifted left by 2 (i.e. in bytes); stridey
 * is scaled in place in its incoming argument register %i4.  TBL is the
 * address of __vlibm_TBL_sqrtf.
 */
58#define stridex		%l6
59#define stridey		%i4
60#define stridez		%l5
61#define _0x7fffffff	%o1
62#define _0x7f3504f3	%o2
63#define _0x1ff0		%l2
64#define TBL		%l1
65
/* Number of results still to be produced in the current run. */
66#define counter		%l0
67
/*
 * Scratch slots addressed relative to %fp.  tmp_px, tmp_py and
 * tmp_counter carry the resume position and remaining count from one
 * run to the next (written at entry and by the .updateN handlers, read
 * at .begin).  tmp0..tmp4 receive the high word of a double sum (db0)
 * so that it can be reloaded into an integer register.
 */
68#define tmp_px		STACK_BIAS-0x30
69#define tmp_py		STACK_BIAS-0x28
70#define tmp_counter	STACK_BIAS-0x20
71#define tmp0		STACK_BIAS-0x18
72#define tmp1		STACK_BIAS-0x10
73#define tmp2		STACK_BIAS-0x0c
74#define tmp3		STACK_BIAS-0x08
75#define tmp4		STACK_BIAS-0x04
76
77! sizeof temp storage - must be a multiple of 16 for V9
78#define tmps		0x30
79
80!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
81!      !!!!!   algorithm   !!!!!
82!  hx0 = *(int*)px;
83!  x0 = *px;
84!  px += stridex;
85!
86!  hy0 = *(int*)py;
87!  y0 = *py;
88!  py += stridey;
89!
90!  hx0 &= 0x7fffffff;
91!  hy0 &= 0x7fffffff;
92!
93!  if ( hx >= 0x7f3504f3 || hy >= 0x7f3504f3 )
94!  {
95!    if ( hx >= 0x7f800000 || hy >= 0x7f800000 )
96!    {
97!      if ( hx == 0x7f800000 || hy == 0x7f800000 )
98!        *(int*)pz = 0x7f800000;
99!      else *pz = x * y;
100!    }
101!    else
102!    {
103!      hyp = sqrt(x * (double)x + y * (double)y);
104!      if ( hyp <= DFMAX ) ftmp0 = (float)hyp;
105!      else ftmp0 = FMAX * FMAX;
106!      *pz = ftmp0;
107!    }
108!    pz += stridez;
109!    continue;
110!  }
111!  if ( (hx | hy) == 0 )
112!  {
113!    *pz = 0;
114!    pz += stridez;
115!    continue;
116!  }
117!  dx0 = x0 * (double)x0;
118!  dy0 = y0 * (double)y0;
119!  db0 = dx0 + dy0;
120!
121!  iexp0 = ((int*)&db0)[0];
122!
123!  h0 = vis_fand(db0,DC0);
124!  h0 = vis_for(h0,DC1);
125!  h_hi0 = vis_fand(h0,DC2);
126!
127!  db0 = vis_fand(db0,DA0);
128!  db0 = vis_fmul8x16(SCALE, db0);
129!  db0 = vis_fpadd32(db0,DA1);
130!
131!  iexp0 >>= 8;
132!  di0 = iexp0 & 0x1ff0;
133!  si0 = (char*)sqrt_arr + di0;
134!
135!  dtmp0 = ((double*)((char*)div_arr + di0))[0];
136!  xx0 = h0 - h_hi0;
137!  xx0 *= dtmp0;
138!
139!  dtmp0 = ((double*)si0)[1];
140!  res0 = K2 * xx0;
141!  res0 += K1;
142!  res0 *= xx0;
143!  res0 += DC1;
144!  res0 = dtmp0 * res0;
145!  res0 *= db0;
146!  ftmp0 = (float)res0;
147!  *pz = ftmp0;
148!  pz += stridez;
149!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
150
/*
 * __vhypotf entry and one-time setup.
 *
 * Incoming registers as consumed below: %i0 = element count,
 * %i1 = x pointer, %i2 = x stride, %i3 = y pointer, %i4 = y stride,
 * %i5 = result pointer; the result stride is read from the caller's
 * frame.  NOTE(review): the C prototype is not visible in this file;
 * confirm the argument order against the libmvec header.
 *
 * The frame reserves tmps extra bytes for the tmp_* scratch slots.
 */
151	ENTRY(__vhypotf)
152	save	%sp,-SA(MINFRAME)-tmps,%sp
153	PIC_SETUP(l7)
154	PIC_SET(l7,.CONST_TBL,o3)
155	PIC_SET(l7,__vlibm_TBL_sqrtf,l1)
156
/* Result stride is not in an %i register; fetch it from the caller's frame. */
157#ifdef __sparcv9
158	ldx	[%fp+STACK_BIAS+176],stridez
159#else
160	ld	[%fp+STACK_BIAS+92],stridez
161#endif
/*
 * Stash the count and both input pointers; .begin reloads them at the
 * start of every run.
 */
162	st	%i0,[%fp+tmp_counter]
163
164	stx	%i1,[%fp+tmp_px]
165
166	stx	%i3,[%fp+tmp_py]
167
/*
 * Load the FP constants from .CONST_TBL (%o3), interleaved with building
 * the integer constants 0x7fffffff and 0x7f3504f3 (sethi + low bits) and
 * scaling the three strides from elements to bytes (<< 2).
 */
168	ldd	[%o3],K1
169	sethi	%hi(0x7ffffc00),%o1
170
171	ldd	[%o3+8],K2
172	sethi	%hi(0x7f350400),%o2
173
174	ldd	[%o3+16],DC0
175	add	%o1,1023,_0x7fffffff
176	add	%o2,0xf3,_0x7f3504f3
177
178	ldd	[%o3+24],DC1
179	sll	%i2,2,stridex
180
181	ld	[%o3+56],FMAX
182
183	ldd	[%o3+32],DC2
184	sll	%i4,2,stridey
185
186	ldd	[%o3+40],DA0
187	sll	stridez,2,stridez
188
189	ldd	[%o3+48],DFMAX
190
/* 0xff8 << 1 = 0x1ff0, the mask used to form the table byte offset. */
191	ld	[%o3+60],SCALE
192	or	%g0,0xff8,%l2
193
194	ldd	[%o3+64],DA1
195	sll	%l2,1,_0x1ff0
/* %l7 = pz (output pointer); this overwrites the PIC scratch register. */
196	or	%g0,%i5,%l7
197
/*
 * .begin: start (or resume) a run.  Reloads the remaining count and the
 * x/y pointers from their stack slots and clears tmp_counter, so the
 * next pass through .begin exits unless an .updateN handler has stored
 * a new remainder in the meantime.
 *
 * .begin1: the first element of a run is tested directly: a magnitude
 * bit pattern >= 0x7f3504f3 in x or y goes to .spec, x and y both zero
 * goes to .spec1, and both of those come back to .begin1.  Otherwise
 * execution falls into the software-pipeline fill below.  Elements
 * loaded later in the fill are given the same tests, but a failure
 * branches to an .updateN handler, which shortens the current run so
 * that it stops just before that element.
 *
 * Inputs are read with lda ... 0x82.  NOTE(review): presumably this is
 * the no-fault primary ASI, used so that read-ahead past the end of the
 * arrays cannot trap - confirm against the SPARC V9 ASI assignments.
 *
 * The (a_b) tags in the trailing comments appear to identify the
 * pipeline slot and stage an instruction belongs to.  A few trailing
 * comments name the wrong variable (for example "dy0 = x0 * (double)x0"
 * on the x square, or "hy0 = *py" on a float load of y); the instruction
 * operands are authoritative.
 */
198.begin:
199	ld	[%fp+tmp_counter],counter
200	ldx	[%fp+tmp_px],%i1
201	ldx	[%fp+tmp_py],%i2
202	st	%g0,[%fp+tmp_counter]
203.begin1:
204	cmp	counter,0
205	ble,pn	%icc,.exit
206	lda	[%i1]0x82,%l3		! (3_0) hx0 = *(int*)px;
207
208	lda	[%i2]0x82,%l4		! (3_0) hy0 = *(int*)py;
209
210	lda	[%i1]0x82,%f17		! (3_0) x0 = *px;
211	and	%l3,_0x7fffffff,%l3	! (3_0) hx0 &= 0x7fffffff;
212
213	cmp	%l3,_0x7f3504f3		! (3_0) hx ? 0x7f3504f3
214	bge,pn	%icc,.spec		! (3_0) if ( hx >= 0x7f3504f3 )
215	and	%l4,_0x7fffffff,%l4	! (3_0) hy0 &= 0x7fffffff;
216
217	cmp	%l4,_0x7f3504f3		! (3_0) hy ? 0x7f3504f3
218	bge,pn	%icc,.spec		! (3_0) if ( hy >= 0x7f3504f3 )
219	or	%g0,%i2,%o7
220
/*
 * The px increment below is the delay slot of the bz, so it is also
 * performed on the way to .spec1 (which therefore does not advance px).
 */
221	orcc	%l3,%l4,%g0
222	bz,pn	%icc,.spec1
223
224	add	%i1,stridex,%i1		! px += stridex
225	fsmuld	%f17,%f17,%f44		! (3_0) dx0 = x0 * (double)x0;
226	lda	[%i2]0x82,%f17		! (3_0) y0 = *py;
227
228	lda	[%i1]0x82,%l3		! (4_0) hx0 = *(int*)px;
229
230	lda	[stridey+%o7]0x82,%l4	! (4_0) hy0 = *(int*)py;
231
232	and	%l3,_0x7fffffff,%l3	! (4_0) hx0 &= 0x7fffffff;
233
234	fsmuld	%f17,%f17,%f24		! (3_0) dy0 = y0 * (double)y0;
235	cmp	%l3,_0x7f3504f3		! (4_0) hx ? 0x7f3504f3
236	bge,pn	%icc,.update0		! (4_0) if ( hx >= 0x7f3504f3 )
237	and	%l4,_0x7fffffff,%l4	! (4_0) hy0 &= 0x7fffffff;
238
239	orcc	%l3,%l4,%g0
240	bz,pn	%icc,.update0
241	lda	[%i1]0x82,%f17		! (4_0) x0 = *px;
242.cont0:
243	faddd	%f44,%f24,%f24		! (3_0) db0 = dx0 + dy0;
244
245	fsmuld	%f17,%f17,%f40		! (4_1) dy0 = x0 * (double)x0;
246	cmp	%l4,_0x7f3504f3		! (4_1) hy ? 0x7f3504f3
247	lda	[stridey+%o7]0x82,%f17	! (4_1) hy0 = *py;
248
249	add	%o7,stridey,%i5		! py += stridey
250	lda	[%i1+stridex]0x82,%l3	! (0_0) hx0 = *(int*)px;
251
252	bge,pn	%icc,.update1		! (4_1) if ( hy >= 0x7f3504f3 )
253	st	%f24,[%fp+tmp0]		! (3_1) iexp0 = ((int*)&db0)[0];
254.cont1:
255	and	%l3,_0x7fffffff,%l3	! (0_0) hx0 &= 0x7fffffff;
256
257	fsmuld	%f17,%f17,%f48		! (4_1) dy0 = y0 * (double)y0;
258	lda	[%i1+stridex]0x82,%f8	! (0_0) x0 = *px;
259
260	add	%i1,stridex,%i1		! px += stridex
261
262	lda	[%i5+stridey]0x82,%l4	! (0_0) hy0 = *(int*)py;
263	cmp	%l3,_0x7f3504f3		! (0_0) hx ? 0x7f3504f3
264	bge,pn	%icc,.update2		! (0_0) if ( hx >= 0x7f3504f3 )
265	add	%i5,stridey,%o4		! py += stridey
266.cont2:
267	faddd	%f40,%f48,%f20		! (4_1) db0 = dx0 + dy0;
268
269	fsmuld	%f8,%f8,%f40		! (0_0) dx0 = x0 * (double)x0;
270	and	%l4,_0x7fffffff,%l4	! (0_0) hy0 &= 0x7fffffff;
271	lda	[%i5+stridey]0x82,%f17	! (0_0) hy0 = *py;
272
273	cmp	%l4,_0x7f3504f3		! (0_0) hy ? 0x7f3504f3
274	bge,pn	%icc,.update3		! (0_0) if ( hy >= 0x7f3504f3 )
275	st	%f20,[%fp+tmp1]		! (4_1) iexp0 = ((int*)&db0)[0];
276
/*
 * The bz below has no dedicated delay-slot instruction: the lda at
 * .cont3 fills the slot, so on the taken path it runs once in the delay
 * slot and once more when .update3 rejoins at .cont3.
 */
277	orcc	%l3,%l4,%g0
278	bz,pn	%icc,.update3
279.cont3:
280	lda	[%i1+stridex]0x82,%l3	! (1_0) hx0 = *(int*)px;
281
282	fand	%f24,DC0,%f60		! (3_1) h0 = vis_fand(db0,DC0);
283
284	and	%l3,_0x7fffffff,%l3	! (1_0) hx0 &= 0x7fffffff;
285
286	fsmuld	%f17,%f17,%f34		! (0_0) dy0 = y0 * (double)y0;
287	cmp	%l3,_0x7f3504f3		! (1_0) hx ? 0x7f3504f3
288	lda	[%o4+stridey]0x82,%l4	! (1_0) hy0 = *(int*)py;
289
290	add	%i1,stridex,%i1		! px += stridex
291
292	lda	[%i1]0x82,%f17		! (1_0) x0 = *px;
293	bge,pn	%icc,.update4		! (1_0) if ( hx >= 0x7f3504f3 )
294	add	%o4,stridey,%i5		! py += stridey
295.cont4:
296	and	%l4,_0x7fffffff,%l4	! (1_0) hy0 &= 0x7fffffff;
297	for	%f60,DC1,%f46		! (3_1) h0 = vis_for(h0,DC1);
298
299	cmp	%l4,_0x7f3504f3		! (1_0) hy ? 0x7f3504f3
300	ld	[%fp+tmp0],%o0		! (3_1) iexp0 = ((int*)&db0)[0];
301	faddd	%f40,%f34,%f0		! (0_0) db0 = dx0 + dy0;
302
303	fsmuld	%f17,%f17,%f40		! (1_0) dx0 = x0 * (double)x0;
304	add	%i1,stridex,%i1		! px += stridex
305	lda	[%o4+stridey]0x82,%f17	! (1_0) y0 = *py;
306
307	srax	%o0,8,%o0		! (3_1) iexp0 >>= 8;
308	bge,pn	%icc,.update5		! (1_0) if ( hy >= 0x7f3504f3 )
309	fand	%f46,DC2,%f38		! (3_1) h_hi0 = vis_fand(h0,DC2);
310
/* As at .cont3: the lda at .cont5 doubles as the delay slot of this bz. */
311	orcc	%l3,%l4,%g0
312	bz,pn	%icc,.update5
313.cont5:
314	lda	[%i1]0x82,%l3		! (2_0) hx0 = *(int*)px;
315
316	and	%o0,_0x1ff0,%o0		! (3_1) di0 = iexp0 & 0x1ff0;
317	st	%f0,[%fp+tmp2]		! (0_0) iexp0 = ((int*)&db0)[0];
318	fand	%f20,DC0,%f60		! (4_1) h0 = vis_fand(db0,DC0);
319
320	ldd	[TBL+%o0],%f22		! (3_1) dtmp0 = ((double*)((char*)div_arr + di0))[0];
321	fsubd	%f46,%f38,%f38		! (3_1) xx0 = h0 - h_hi0;
322
323	fsmuld	%f17,%f17,%f32		! (1_0) dy0 = y0 * (double)y0;
324	add	%i5,stridey,%i2		! py += stridey
325	lda	[stridey+%i5]0x82,%l4	! (2_0) hy0 = *(int*)py;
326
327	and	%l3,_0x7fffffff,%l3	! (2_0) hx0 &= 0x7fffffff;
328
329	lda	[%i1]0x82,%f17		! (2_0) x0 = *px;
330	cmp	%l3,_0x7f3504f3		! (2_0) hx ? 0x7f3504f3
331
332	fmuld	%f38,%f22,%f38		! (3_1) xx0 *= dmp0;
333	and	%l4,_0x7fffffff,%l4	! (2_0) hy0 &= 0x7fffffff;
334	for	%f60,DC1,%f46		! (4_1) h0 = vis_for(h0,DC1);
335
336	bge,pn	%icc,.update6		! (2_0) if ( hx >= 0x7f3504f3 )
337	ld	[%fp+tmp1],%o3		! (4_1) iexp0 = ((int*)&db0)[0];
338.cont6:
339	faddd	%f40,%f32,%f18		! (1_0) db0 = dx0 + dy0;
340
341	fsmuld	%f17,%f17,%f44		! (2_0) dx0 = x0 * (double)x0;
342	cmp	%l4,_0x7f3504f3		! (2_0) hy ? 0x7f3504f3
343	lda	[stridey+%i5]0x82,%f17	! (2_0) y0 = *py;
344
345	add	%i1,stridex,%i1		! px += stridex
346	bge,pn	%icc,.update7		! (2_0) if ( hy >= 0x7f3504f3 )
347	fand	%f46,DC2,%f58		! (4_1) h_hi0 = vis_fand(h0,DC2);
348
349	orcc	%l3,%l4,%g0
350	bz,pn	%icc,.update7
351	nop
352.cont7:
353	fmuld	K2,%f38,%f56		! (3_1) res0 = K2 * xx0;
354	srax	%o3,8,%o3		! (4_1) iexp0 >>= 8;
355	lda	[%i1]0x82,%l3		! (3_0) hx0 = *(int*)px;
356
357	and	%o3,_0x1ff0,%o3		! (4_1) di0 = iexp0 & 0x1ff0;
358	st	%f18,[%fp+tmp3]		! (1_0) iexp0 = ((int*)&db0)[0];
359	fand	%f0,DC0,%f60		! (0_0) h0 = vis_fand(db0,DC0);
360
361	ldd	[TBL+%o3],%f22		! (4_1) dtmp0 = ((double*)((char*)div_arr + di0))[0];
362	add	%i2,stridey,%o7		! py += stridey
363	fsubd	%f46,%f58,%f58		! (4_1) xx0 = h0 - h_hi0;
364
365	fsmuld	%f17,%f17,%f30		! (2_0) dy0 = y0 * (double)y0;
366	lda	[stridey+%i2]0x82,%l4	! (3_0) hy0 = *(int*)py;
367	and	%l3,_0x7fffffff,%l3	! (3_0) hx0 &= 0x7fffffff;
368
369	faddd	%f56,K1,%f54		! (3_1) res0 += K1;
370	cmp	%l3,_0x7f3504f3		! (3_0) hx ? 0x7f3504f3
371
/*
 * The fmuld after the bge is its delay slot and sits before .cont8, so
 * it executes exactly once whether or not .update8 is taken.
 */
372	lda	[%i1]0x82,%f17		! (3_0) x0 = *px;
373	add	%i1,stridex,%i1		! px += stridex
374	bge,pn	%icc,.update8		! (3_0) if ( hx >= 0x7f3504f3 )
375
376	fmuld	%f58,%f22,%f58		! (4_1) xx0 *= dmp0;
377.cont8:
378	and	%l4,_0x7fffffff,%l4	! (3_0) hy0 &= 0x7fffffff;
379	for	%f60,DC1,%f46		! (0_0) h0 = vis_for(h0,DC1);
380
381	cmp	%l4,_0x7f3504f3		! (3_0) hy ? 0x7f3504f3
382	ld	[%fp+tmp2],%g1		! (0_0) iexp0 = ((int*)&db0)[0];
383	faddd	%f44,%f30,%f30		! (2_0) db0 = dx0 + dy0;
384
385	fsmuld	%f17,%f17,%f44		! (3_0) dx0 = x0 * (double)x0;
386	bge,pn	%icc,.update9		! (3_0) if ( hy >= 0x7f3504f3 )
387	lda	[stridey+%i2]0x82,%f17	! (3_0) y0 = *py;
388
389	orcc	%l3,%l4,%g0
390	bz,pn	%icc,.update9
391	nop
392.cont9:
393	fmuld	%f54,%f38,%f40		! (3_1) res0 *= xx0;
394	lda	[%i1]0x82,%l3		! (4_0) hx0 = *(int*)px;
395	fand	%f46,DC2,%f38		! (0_0) h_hi0 = vis_fand(h0,DC2);
396
397	fmuld	K2,%f58,%f54		! (4_1) res0 = K2 * xx0;
398	srax	%g1,8,%o5		! (0_0) iexp0 >>= 8;
399	lda	[stridey+%o7]0x82,%l4	! (4_0) hy0 = *(int*)py;
400	fand	%f24,DA0,%f56		! (3_1) db0 = vis_fand(db0,DA0);
401
402	and	%o5,_0x1ff0,%o5		! (0_0) di0 = iexp0 & 0x1ff0;
403	st	%f30,[%fp+tmp4]		! (2_0) iexp0 = ((int*)&db0)[0];
404	fand	%f18,DC0,%f60		! (1_0) h0 = vis_fand(db0,DC0);
405
406	ldd	[TBL+%o5],%f22		! (0_0) dtmp0 = ((double*)((char*)div_arr + di0))[0];
407	add	%o0,TBL,%g1		! (3_1) si0 = (char*)sqrt_arr + di0;
408	and	%l3,_0x7fffffff,%l3	! (4_0) hx0 &= 0x7fffffff;
409	fsubd	%f46,%f38,%f38		! (0_0) xx0 = h0 - h_hi0;
410
/*
 * When the bge to .update10 is taken, the five instructions between its
 * delay slot and .cont10 are skipped; .update10 re-issues the first four
 * and substitutes a zero for the x load.
 */
411	fsmuld	%f17,%f17,%f24		! (3_0) dy0 = y0 * (double)y0;
412	cmp	%l3,_0x7f3504f3		! (4_0) hx ? 0x7f3504f3
413	bge,pn	%icc,.update10		! (4_0) if ( hx >= 0x7f3504f3 )
414	faddd	%f40,DC1,%f40		! (3_1) res0 += DC1;
415
416	fmul8x16	SCALE,%f56,%f36	! (3_1) db0 = vis_fmul8x16(SCALE, db0);
417	and	%l4,_0x7fffffff,%l4	! (4_0) hy0 &= 0x7fffffff;
418	ldd	[%g1+8],%f56		! (3_1) dtmp0 = ((double*)si0)[1];
419	faddd	%f54,K1,%f54		! (4_1) res0 += K1;
420
421	lda	[%i1]0x82,%f17		! (4_0) x0 = *px;
422.cont10:
423	fmuld	%f38,%f22,%f38		! (0_0) xx0 *= dmp0;
424	cmp	counter,5
425	for	%f60,DC1,%f46		! (1_0) h0 = vis_for(h0,DC1);
426
427	ld	[%fp+tmp3],%g1		! (1_0) iexp0 = ((int*)&db0)[0];
428	fmuld	%f56,%f40,%f62		! (3_1) res0 = dtmp0 * res0;
429	faddd	%f44,%f24,%f24		! (3_0) db0 = dx0 + dy0;
430
/*
 * Fewer than 5 results owed in this run: drain the pipeline in .tail.
 * Otherwise pre-subtract 5 (in the delay slot of the ba) and enter the
 * steady-state loop.
 */
431	bl,pn	%icc,.tail
432	nop
433
434	ba	.main_loop
435	sub	counter,5,counter
436
437	.align	16
/*
 * .main_loop: steady-state software pipeline.  Each pass stores five
 * results (through %l7, %o7, %i0, %i3 and %o4 in turn, each one stridez
 * beyond the previous) and starts work on five new elements.  counter
 * was pre-decremented by 5 before entry; the subcc at .cont20 takes off
 * another 5 per pass and the loop repeats while the result is
 * non-negative.
 *
 * Every newly loaded element gets the range test on |x| and |y| and the
 * both-zero test; a failure branches to the matching .updateN handler,
 * which shortens the run and rejoins at .contN.  Where an instruction
 * needed on both paths sits between a branch's delay slot and the .contN
 * label (the faddd before .cont16, the fpadd32 before .cont19, the faddd
 * and x load before .cont20), the handler re-issues it before rejoining.
 *
 * %l7 is the output pointer at the top of the pass, is reused as a
 * temporary x pointer from the "px += stridex" before .cont17, and is
 * reassigned as the output pointer at .cont20.
 *
 * The fzero %f52 instructions only fill branch delay slots; nothing in
 * this file reads %f52.
 */
438.main_loop:
439	fsmuld	%f17,%f17,%f40		! (4_1) dy0 = x0 * (double)x0;
440	cmp	%l4,_0x7f3504f3		! (4_1) hy ? 0x7f3504f3
441	lda	[stridey+%o7]0x82,%f17	! (4_1) hy0 = *py;
442	fpadd32	%f36,DA1,%f36		! (3_2) db0 = vis_fpadd32(db0,DA1);
443
444	fmuld	%f54,%f58,%f58		! (4_2) res0 *= xx0;
445	add	%o7,stridey,%i5		! py += stridey
446	st	%f24,[%fp+tmp0]		! (3_1) iexp0 = ((int*)&db0)[0];
447	fand	%f46,DC2,%f44		! (1_1) h_hi0 = vis_fand(h0,DC2);
448
449	fmuld	K2,%f38,%f56		! (0_1) res0 = K2 * xx0;
450	srax	%g1,8,%g5		! (1_1) iexp0 >>= 8;
451	bge,pn	%icc,.update11		! (4_1) if ( hy >= 0x7f3504f3 )
452	fand	%f20,DA0,%f54		! (4_2) db0 = vis_fand(db0,DA0);
453
454	orcc	%l3,%l4,%g0
455	nop
456	bz,pn	%icc,.update11
457	fzero	%f52
458.cont11:
459	fmuld	%f62,%f36,%f62		! (3_2) res0 *= db0;
460	and	%g5,_0x1ff0,%g5		! (1_1) di0 = iexp0 & 0x1ff0;
461	lda	[%i1+stridex]0x82,%l3	! (0_0) hx0 = *(int*)px;
462	fand	%f30,DC0,%f60		! (2_1) h0 = vis_fand(db0,DC0);
463
464	ldd	[%g5+TBL],%f22		! (1_1) dtmp0 = ((double*)((char*)div_arr + di0))[0];
465	add	%o3,TBL,%g1		! (4_2) si0 = (char*)sqrt_arr + di0;
466	add	%i1,stridex,%i0		! px += stridex
467	fsubd	%f46,%f44,%f44		! (1_1) xx0 = h0 - h_hi0;
468
469	fsmuld	%f17,%f17,%f48		! (4_1) dy0 = y0 * (double)y0;
470	nop
471	lda	[%i1+stridex]0x82,%f8	! (0_0) x0 = *px;
472	faddd	%f58,DC1,%f36		! (4_2) res0 += DC1;
473
474	faddd	%f56,K1,%f58		! (0_1) res0 += K1;
475	and	%l3,_0x7fffffff,%l3	! (0_0) hx0 &= 0x7fffffff;
476	ldd	[%g1+8],%f56		! (4_2) dtmp0 = ((double*)si0)[1];
477	fmul8x16	SCALE,%f54,%f54	! (4_2) db0 = vis_fmul8x16(SCALE, db0);
478
479	lda	[%i5+stridey]0x82,%l4	! (0_0) hy0 = *(int*)py;
480	cmp	%l3,_0x7f3504f3		! (0_0) hx ? 0x7f3504f3
481	bge,pn	%icc,.update12		! (0_0) if ( hx >= 0x7f3504f3 )
482	fdtos	%f62,%f14		! (3_2) ftmp0 = (float)res0;
483.cont12:
484	fmuld	%f44,%f22,%f44		! (1_1) xx0 *= dmp0;
485	add	%l7,stridez,%o7		! pz += stridez
486	st	%f14,[%l7]		! (3_2) *pz = ftmp0;
487	for	%f60,DC1,%f46		! (2_1) h0 = vis_for(h0,DC1);
488
489	fmuld	%f56,%f36,%f36		! (4_2) res0 = dtmp0 * res0;
490	add	%i5,stridey,%o4		! py += stridey
491	ld	[%fp+tmp4],%g1		! (2_1) iexp0 = ((int*)&db0)[0];
492	faddd	%f40,%f48,%f20		! (4_1) db0 = dx0 + dy0;
493
494	fsmuld	%f8,%f8,%f40		! (0_0) dx0 = x0 * (double)x0;
495	and	%l4,_0x7fffffff,%l4	! (0_0) hy0 &= 0x7fffffff;
496	lda	[%i5+stridey]0x82,%f17	! (0_0) hy0 = *py;
497	fpadd32	%f54,DA1,%f62		! (4_2) db0 = vis_fpadd32(db0,DA1);
498
499	fmuld	%f58,%f38,%f38		! (0_1) res0 *= xx0;
500	cmp	%l4,_0x7f3504f3		! (0_0) hy ? 0x7f3504f3
501	st	%f20,[%fp+tmp1]		! (4_1) iexp0 = ((int*)&db0)[0];
502	fand	%f46,DC2,%f58		! (2_1) h_hi0 = vis_fand(h0,DC2);
503
504	fmuld	K2,%f44,%f56		! (1_1) res0 = K2 * xx0;
505	srax	%g1,8,%g1		! (2_1) iexp0 >>= 8;
506	bge,pn	%icc,.update13		! (0_0) if ( hy >= 0x7f3504f3 )
507	fand	%f0,DA0,%f54		! (0_1) db0 = vis_fand(db0,DA0);
508
509	orcc	%l3,%l4,%g0
510	nop
511	bz,pn	%icc,.update13
512	fzero	%f52
513.cont13:
514	fmuld	%f36,%f62,%f62		! (4_2) res0 *= db0;
515	and	%g1,_0x1ff0,%g1		! (2_1) di0 = iexp0 & 0x1ff0;
516	lda	[%i0+stridex]0x82,%l3	! (1_0) hx0 = *(int*)px;
517	fand	%f24,DC0,%f60		! (3_1) h0 = vis_fand(db0,DC0);
518
519	ldd	[TBL+%g1],%f22		! (2_1) dtmp0 = ((double*)((char*)div_arr + di0))[0];
520	add	%o5,TBL,%o0		! (0_1) si0 = (char*)sqrt_arr + di0;
521	add	%i0,stridex,%i1		! px += stridex
522	fsubd	%f46,%f58,%f58		! (2_1) xx0 = h0 - h_hi0;
523
524	fsmuld	%f17,%f17,%f34		! (0_0) dy0 = y0 * (double)y0;
525	add	%o7,stridez,%i0		! pz += stridez
526	lda	[%o4+stridey]0x82,%l4	! (1_0) hy0 = *(int*)py;
527	faddd	%f38,DC1,%f36		! (0_1) res0 += DC1;
528
529	faddd	%f56,K1,%f38		! (1_1) res0 += K1;
530	and	%l3,_0x7fffffff,%l3	! (1_0) hx0 &= 0x7fffffff;
531	ldd	[%o0+8],%f56		! (0_1) dtmp0 = ((double*)si0)[1];
532	fmul8x16	SCALE,%f54,%f54	! (0_1) db0 = vis_fmul8x16(SCALE, db0);
533
534	lda	[%i1]0x82,%f17		! (1_0) x0 = *px;
535	cmp	%l3,_0x7f3504f3		! (1_0) hx ? 0x7f3504f3
536	bge,pn	%icc,.update14		! (1_0) if ( hx >= 0x7f3504f3 )
537	fdtos	%f62,%f14		! (4_2) ftmp0 = (float)res0;
538.cont14:
539	fmuld	%f58,%f22,%f58		! (2_1) xx0 *= dmp0;
540	and	%l4,_0x7fffffff,%l4	! (1_0) hy0 &= 0x7fffffff;
541	add	%o4,stridey,%i5		! py += stridey
542	for	%f60,DC1,%f46		! (3_1) h0 = vis_for(h0,DC1);
543
544	fmuld	%f56,%f36,%f36		! (0_1) res0 = dtmp0 * res0;
545	cmp	%l4,_0x7f3504f3		! (1_0) hy ? 0x7f3504f3
546	ld	[%fp+tmp0],%o0		! (3_1) iexp0 = ((int*)&db0)[0];
547	faddd	%f40,%f34,%f0		! (0_0) db0 = dx0 + dy0;
548
549	fsmuld	%f17,%f17,%f40		! (1_0) dx0 = x0 * (double)x0;
550	add	%i1,stridex,%i1		! px += stridex
551	lda	[%o4+stridey]0x82,%f17	! (1_0) y0 = *py;
552	fpadd32	%f54,DA1,%f62		! (0_1) db0 = vis_fpadd32(db0,DA1);
553
554	fmuld	%f38,%f44,%f44		! (1_1) res0 *= xx0;
555	st	%f14,[%o7]		! (4_2) *pz = ftmp0;
556	bge,pn	%icc,.update15		! (1_0) if ( hy >= 0x7f3504f3 )
557	fand	%f46,DC2,%f38		! (3_1) h_hi0 = vis_fand(h0,DC2);
558
559	orcc	%l3,%l4,%g0
560	bz,pn	%icc,.update15
561	nop
562.cont15:
563	fmuld	K2,%f58,%f54		! (2_1) res0 = K2 * xx0;
564	srax	%o0,8,%o0		! (3_1) iexp0 >>= 8;
565	st	%f0,[%fp+tmp2]		! (0_0) iexp0 = ((int*)&db0)[0];
566	fand	%f18,DA0,%f56		! (1_1) db0 = vis_fand(db0,DA0);
567
568	fmuld	%f36,%f62,%f62		! (0_1) res0 *= db0;
569	and	%o0,_0x1ff0,%o0		! (3_1) di0 = iexp0 & 0x1ff0;
570	lda	[%i1]0x82,%l3		! (2_0) hx0 = *(int*)px;
571	fand	%f20,DC0,%f60		! (4_1) h0 = vis_fand(db0,DC0);
572
573	ldd	[TBL+%o0],%f22		! (3_1) dtmp0 = ((double*)((char*)div_arr + di0))[0];
574	add	%g5,TBL,%o3		! (1_1) si0 = (char*)sqrt_arr + di0;
575	add	%i0,stridez,%i3		! pz += stridez
576	fsubd	%f46,%f38,%f38		! (3_1) xx0 = h0 - h_hi0;
577
578	fsmuld	%f17,%f17,%f32		! (1_0) dy0 = y0 * (double)y0;
579	add	%i5,stridey,%i2		! py += stridey
580	lda	[stridey+%i5]0x82,%l4	! (2_0) hy0 = *(int*)py;
581	faddd	%f44,DC1,%f44		! (1_1) res0 += DC1;
582
583	fmul8x16	SCALE,%f56,%f36	! (1_1) db0 = vis_fmul8x16(SCALE, db0);
584	and	%l3,_0x7fffffff,%l3	! (2_0) hx0 &= 0x7fffffff;
585	ldd	[%o3+8],%f56		! (1_1) dtmp0 = ((double*)si0)[1];
586	faddd	%f54,K1,%f54		! (2_1) res0 += K1;
587
588	lda	[%i1]0x82,%f17		! (2_0) x0 = *px;
589	cmp	%l3,_0x7f3504f3		! (2_0) hx ? 0x7f3504f3
590	add	%i3,stridez,%o4		! pz += stridez
591	fdtos	%f62,%f14		! (0_1) ftmp0 = (float)res0;
592
593	fmuld	%f38,%f22,%f38		! (3_1) xx0 *= dmp0;
594	and	%l4,_0x7fffffff,%l4	! (2_0) hy0 &= 0x7fffffff;
595	st	%f14,[%i0]		! (0_1) *pz = ftmp0;
596	for	%f60,DC1,%f46		! (4_1) h0 = vis_for(h0,DC1);
597
/*
 * The bge below tests the condition codes set by the (2_0) hx compare
 * several instructions earlier; nothing in between modifies %icc.  Its
 * delay slot is the ld; the faddd that follows is skipped when the
 * branch is taken and is re-issued by .update16.
 */
598	fmuld	%f56,%f44,%f62		! (1_1) res0 = dtmp0 * res0;
599	bge,pn	%icc,.update16		! (2_0) if ( hx >= 0x7f3504f3 )
600	ld	[%fp+tmp1],%o3		! (4_1) iexp0 = ((int*)&db0)[0];
601	faddd	%f40,%f32,%f18		! (1_0) db0 = dx0 + dy0;
602.cont16:
603	fsmuld	%f17,%f17,%f44		! (2_0) dx0 = x0 * (double)x0;
604	cmp	%l4,_0x7f3504f3		! (2_0) hy ? 0x7f3504f3
605	lda	[stridey+%i5]0x82,%f17	! (2_0) y0 = *py;
606	fpadd32	%f36,DA1,%f36		! (1_1) db0 = vis_fpadd32(db0,DA1);
607
/* From here until .cont20, %l7 holds the advanced x pointer, not pz. */
608	fmuld	%f54,%f58,%f54		! (2_1) res0 *= xx0;
609	add	%i1,stridex,%l7		! px += stridex
610	bge,pn	%icc,.update17		! (2_0) if ( hy >= 0x7f3504f3 )
611	fand	%f46,DC2,%f58		! (4_1) h_hi0 = vis_fand(h0,DC2);
612
613	orcc	%l3,%l4,%g0
614	nop
615	bz,pn	%icc,.update17
616	fzero	%f52
617.cont17:
618	fmuld	K2,%f38,%f56		! (3_1) res0 = K2 * xx0;
619	srax	%o3,8,%o3		! (4_1) iexp0 >>= 8;
620	st	%f18,[%fp+tmp3]		! (1_0) iexp0 = ((int*)&db0)[0];
621	fand	%f30,DA0,%f40		! (2_1) db0 = vis_fand(db0,DA0);
622
623	fmuld	%f62,%f36,%f62		! (1_1) res0 *= db0;
624	and	%o3,_0x1ff0,%o3		! (4_1) di0 = iexp0 & 0x1ff0;
625	lda	[%l7]0x82,%l3		! (3_0) hx0 = *(int*)px;
626	fand	%f0,DC0,%f60		! (0_0) h0 = vis_fand(db0,DC0);
627
628	ldd	[TBL+%o3],%f22		! (4_1) dtmp0 = ((double*)((char*)div_arr + di0))[0];
629	add	%g1,TBL,%g1		! (2_1) si0 = (char*)sqrt_arr + di0;
630	add	%i2,stridey,%o7		! py += stridey
631	fsubd	%f46,%f58,%f58		! (4_1) xx0 = h0 - h_hi0;
632
633	fsmuld	%f17,%f17,%f30		! (2_0) dy0 = y0 * (double)y0;
634	lda	[stridey+%i2]0x82,%l4	! (3_0) hy0 = *(int*)py;
635	add	%l7,stridex,%i1		! px += stridex
636	faddd	%f54,DC1,%f36		! (2_1) res0 += DC1;
637
638	faddd	%f56,K1,%f54		! (3_1) res0 += K1;
639	and	%l3,_0x7fffffff,%l3	! (3_0) hx0 &= 0x7fffffff;
640	ldd	[%g1+8],%f56		! (2_1) dtmp0 = ((double*)si0)[1];
641	fmul8x16	SCALE,%f40,%f40	! (2_1) db0 = vis_fmul8x16(SCALE, db0);
642
643	lda	[%l7]0x82,%f17		! (3_0) x0 = *px;
644	cmp	%l3,_0x7f3504f3		! (3_0) hx ? 0x7f3504f3
645	bge,pn	%icc,.update18		! (3_0) if ( hx >= 0x7f3504f3 )
646	fdtos	%f62,%f14		! (1_1) ftmp0 = (float)res0;
647.cont18:
648	fmuld	%f58,%f22,%f58		! (4_1) xx0 *= dmp0;
649	and	%l4,_0x7fffffff,%l4	! (3_0) hy0 &= 0x7fffffff;
650	st	%f14,[%i3]		! (1_1) *pz = ftmp0;
651	for	%f60,DC1,%f46		! (0_0) h0 = vis_for(h0,DC1);
652
653	fmuld	%f56,%f36,%f36		! (2_1) res0 = dtmp0 * res0;
654	cmp	%l4,_0x7f3504f3		! (3_0) hy ? 0x7f3504f3
655	ld	[%fp+tmp2],%g1		! (0_0) iexp0 = ((int*)&db0)[0];
656	faddd	%f44,%f30,%f30		! (2_0) db0 = dx0 + dy0;
657
/*
 * The lda is the delay slot of the bge; the fpadd32 after it is skipped
 * when the branch is taken and is re-issued by .update19.
 */
658	fsmuld	%f17,%f17,%f44		! (3_0) dx0 = x0 * (double)x0;
659	bge,pn	%icc,.update19		! (3_0) if ( hy >= 0x7f3504f3 )
660	lda	[stridey+%i2]0x82,%f17	! (3_0) y0 = *py;
661	fpadd32	%f40,DA1,%f62		! (2_1) db0 = vis_fpadd32(db0,DA1);
662
/*
 * The orcc below sets the flags for the both-zero test of slot (3_0);
 * the bz that consumes them (to .update19a) comes several instructions
 * later, after %l3 has already been reloaded for slot (4_0).
 */
663.cont19:
664	fmuld	%f54,%f38,%f40		! (3_1) res0 *= xx0;
665	orcc	%l3,%l4,%g0
666	st	%f30,[%fp+tmp4]		! (2_0) iexp0 = ((int*)&db0)[0];
667	fand	%f46,DC2,%f38		! (0_0) h_hi0 = vis_fand(h0,DC2);
668
669	fmuld	K2,%f58,%f54		! (4_1) res0 = K2 * xx0;
670	srax	%g1,8,%o5		! (0_0) iexp0 >>= 8;
671	lda	[%i1]0x82,%l3		! (4_0) hx0 = *(int*)px;
672	fand	%f24,DA0,%f56		! (3_1) db0 = vis_fand(db0,DA0);
673
674	fmuld	%f36,%f62,%f62		! (2_1) res0 *= db0;
675	and	%o5,_0x1ff0,%o5		! (0_0) di0 = iexp0 & 0x1ff0;
676	bz,pn	%icc,.update19a
677	fand	%f18,DC0,%f60		! (1_0) h0 = vis_fand(db0,DC0);
678.cont19a:
679	ldd	[TBL+%o5],%f22		! (0_0) dtmp0 = ((double*)((char*)div_arr + di0))[0];
680	add	%o0,TBL,%g1		! (3_1) si0 = (char*)sqrt_arr + di0;
681	and	%l3,_0x7fffffff,%l3	! (4_0) hx0 &= 0x7fffffff;
682	fsubd	%f46,%f38,%f38		! (0_0) xx0 = h0 - h_hi0;
683
684	fsmuld	%f17,%f17,%f24		! (3_0) dy0 = y0 * (double)y0;
685	cmp	%l3,_0x7f3504f3		! (4_0) hx ? 0x7f3504f3
686	lda	[stridey+%o7]0x82,%l4	! (4_0) hy0 = *(int*)py;
687	faddd	%f40,DC1,%f40		! (3_1) res0 += DC1;
688
/*
 * The ldd is the delay slot of the bge; the faddd and the x load after
 * it are skipped when the branch is taken (.update20 re-issues the
 * faddd and zeroes %f17 instead of loading x).
 */
689	fmul8x16	SCALE,%f56,%f36	! (3_1) db0 = vis_fmul8x16(SCALE, db0);
690	bge,pn	%icc,.update20		! (4_0) if ( hx >= 0x7f3504f3 )
691	ldd	[%g1+8],%f56		! (3_1) dtmp0 = ((double*)si0)[1];
692	faddd	%f54,K1,%f54		! (4_1) res0 += K1;
693
694	lda	[%i1]0x82,%f17		! (4_0) x0 = *px;
695.cont20:
696	subcc	counter,5,counter	! counter -= 5
697	add	%o4,stridez,%l7		! pz += stridez
698	fdtos	%f62,%f14		! (2_1) ftmp0 = (float)res0;
699
700	fmuld	%f38,%f22,%f38		! (0_0) xx0 *= dmp0;
701	and	%l4,_0x7fffffff,%l4	! (4_0) hy0 &= 0x7fffffff;
702	st	%f14,[%o4]		! (2_1) *pz = ftmp0;
703	for	%f60,DC1,%f46		! (1_0) h0 = vis_for(h0,DC1);
704
705	ld	[%fp+tmp3],%g1		! (1_0) iexp0 = ((int*)&db0)[0];
706	fmuld	%f56,%f40,%f62		! (3_1) res0 = dtmp0 * res0;
707	bpos,pt	%icc,.main_loop
708	faddd	%f44,%f24,%f24		! (3_0) db0 = dx0 + dy0;
709
/* Loop exit: undo the last subtraction before draining in .tail. */
710	add	counter,5,counter
711
/*
 * .tail: pipeline drain.  Finishes up to four results that are already
 * in flight, without loading any new input.  Before each result,
 * counter is decremented and a negative result returns to .begin; the
 * instruction in the delay slot of each bneg (after the first) leaves
 * %l7 pointing at the next output element, so .begin resumes with a
 * correct pz.  After the fourth store control returns to .begin
 * unconditionally.
 */
712.tail:
713	subcc	counter,1,counter
714	bneg	.begin
715	nop
716
717	fpadd32	%f36,DA1,%f36		! (3_2) db0 = vis_fpadd32(db0,DA1);
718
719	fmuld	%f54,%f58,%f58		! (4_2) res0 *= xx0;
720	fand	%f46,DC2,%f44		! (1_1) h_hi0 = vis_fand(h0,DC2);
721
722	fmuld	K2,%f38,%f56		! (0_1) res0 = K2 * xx0;
723	srax	%g1,8,%g5		! (1_1) iexp0 >>= 8;
724	fand	%f20,DA0,%f54		! (4_2) db0 = vis_fand(db0,DA0);
725
726	fmuld	%f62,%f36,%f62		! (3_2) res0 *= db0;
727	and	%g5,_0x1ff0,%g5		! (1_1) di0 = iexp0 & 0x1ff0;
728
729	ldd	[%g5+TBL],%f22		! (1_1) dtmp0 = ((double*)((char*)div_arr + di0))[0];
730	add	%o3,TBL,%g1		! (4_2) si0 = (char*)sqrt_arr + di0;
731	fsubd	%f46,%f44,%f44		! (1_1) xx0 = h0 - h_hi0;
732
733	faddd	%f58,DC1,%f36		! (4_2) res0 += DC1;
734
735	faddd	%f56,K1,%f58		! (0_1) res0 += K1;
736	ldd	[%g1+8],%f56		! (4_2) dtmp0 = ((double*)si0)[1];
737	fmul8x16	SCALE,%f54,%f54	! (4_2) db0 = vis_fmul8x16(SCALE, db0);
738
739	fdtos	%f62,%f14		! (3_2) ftmp0 = (float)res0;
740
741	fmuld	%f44,%f22,%f44		! (1_1) xx0 *= dmp0;
742	add	%l7,stridez,%o7		! pz += stridez
743	st	%f14,[%l7]		! (3_2) *pz = ftmp0;
744
/* First result stored; %l7 = next pz is set in the delay slot. */
745	subcc	counter,1,counter
746	bneg	.begin
747	or	%g0,%o7,%l7
748
749	fmuld	%f56,%f36,%f36		! (4_2) res0 = dtmp0 * res0;
750
751	fpadd32	%f54,DA1,%f62		! (4_2) db0 = vis_fpadd32(db0,DA1);
752
753	fmuld	%f58,%f38,%f38		! (0_1) res0 *= xx0;
754
755	fmuld	K2,%f44,%f56		! (1_1) res0 = K2 * xx0;
756	fand	%f0,DA0,%f54		! (0_1) db0 = vis_fand(db0,DA0);
757
758	fmuld	%f36,%f62,%f62		! (4_2) res0 *= db0;
759
760	add	%o5,TBL,%o0		! (0_1) si0 = (char*)sqrt_arr + di0;
761
762	faddd	%f38,DC1,%f36		! (0_1) res0 += DC1;
763
764	faddd	%f56,K1,%f38		! (1_1) res0 += K1;
765	ldd	[%o0+8],%f56		! (0_1) dtmp0 = ((double*)si0)[1];
766	fmul8x16	SCALE,%f54,%f54	! (0_1) db0 = vis_fmul8x16(SCALE, db0);
767
768	add	%o7,stridez,%i0		! pz += stridez
769	fdtos	%f62,%f14		! (4_2) ftmp0 = (float)res0;
770
771	fmuld	%f56,%f36,%f36		! (0_1) res0 = dtmp0 * res0;
772
773	fpadd32	%f54,DA1,%f62		! (0_1) db0 = vis_fpadd32(db0,DA1);
774
775	fmuld	%f38,%f44,%f44		! (1_1) res0 *= xx0;
776	add	%i0,stridez,%i3		! pz += stridez
777	st	%f14,[%o7]		! (4_2) *pz = ftmp0;
778
/* Second result stored. */
779	subcc	counter,1,counter
780	bneg	.begin
781	or	%g0,%i0,%l7
782
783	fand	%f18,DA0,%f56		! (1_1) db0 = vis_fand(db0,DA0);
784
785	fmuld	%f36,%f62,%f62		! (0_1) res0 *= db0;
786
787	add	%g5,TBL,%o3		! (1_1) si0 = (char*)sqrt_arr + di0;
788
789	faddd	%f44,DC1,%f44		! (1_1) res0 += DC1;
790
791	fmul8x16	SCALE,%f56,%f36	! (1_1) db0 = vis_fmul8x16(SCALE, db0);
792	ldd	[%o3+8],%f56		! (1_1) dtmp0 = ((double*)si0)[1];
793
794	add	%i3,stridez,%o4		! pz += stridez
795	fdtos	%f62,%f14		! (0_1) ftmp0 = (float)res0;
796
797	st	%f14,[%i0]		! (0_1) *pz = ftmp0;
798
/* Third result stored. */
799	subcc	counter,1,counter
800	bneg	.begin
801	or	%g0,%i3,%l7
802
803	fmuld	%f56,%f44,%f62		! (1_1) res0 = dtmp0 * res0;
804
805	fpadd32	%f36,DA1,%f36		! (1_1) db0 = vis_fpadd32(db0,DA1);
806
807	fmuld	%f62,%f36,%f62		! (1_1) res0 *= db0;
808
809	fdtos	%f62,%f14		! (1_1) ftmp0 = (float)res0;
810
811	st	%f14,[%i3]		! (1_1) *pz = ftmp0;
812
/* Fourth and last result of the drain; resume at .begin. */
813	ba	.begin
814	or	%g0,%o4,%l7
815
816	.align	16
/*
 * .spec1: first element of a run has x and y both zero (sign bits are
 * masked off before the test).  Stores the integer zero word as the
 * result, advances pz and py, decrements counter (in the delay slot of
 * the ba) and returns to .begin1.  px is not advanced here because the
 * delay slot of the branch that reaches this label already did it.
 */
817.spec1:
818	st	%g0,[%l7]		! *pz = 0;
819	add	%l7,stridez,%l7		! pz += stridez
820
821	add	%i2,stridey,%i2		! py += stridey
822	ba	.begin1
823	sub	counter,1,counter	! counter--
824
825	.align	16
/*
 * .spec: first element of a run where the magnitude bit pattern of x or
 * y is >= 0x7f3504f3.  On entry %l3/%l4 hold the masked bit patterns of
 * x/y and %f17 holds x; y is loaded into %f8 here.  %i0 is reused as a
 * scratch register holding 0x7f800000.
 *
 *  - Both patterns below 0x7f800000: compute sqrt(x*x + y*y) in double
 *    with fsqrtd.  If the result compares greater than (or unordered
 *    with) DFMAX, the annulled delay slot of fbug,a produces FMAX*FMAX;
 *    otherwise the result is converted to single.
 *  - Otherwise (numeric label 2): fcmps is issued for its exception side
 *    effect (per the original comment); the result is the bit pattern
 *    0x7f800000 if either magnitude equals it exactly, else x * y.
 *
 * Every path stores one result, advances px, py and pz, decrements
 * counter (in the delay slot of the ba) and returns to .begin1.
 */
826.spec:
827	sethi	%hi(0x7f800000),%i0
828	cmp	%l3,%i0			! hx ? 0x7f800000
829	bge,pt	%icc,2f			! if ( hx >= 0x7f800000 )
830	ld	[%i2],%f8
831
832	cmp	%l4,%i0			! hy ? 0x7f800000
833	bge,pt	%icc,2f			! if ( hy >= 0x7f800000 )
834	nop
835
836	fsmuld	%f17,%f17,%f44		! x * (double)x
837	fsmuld	%f8,%f8,%f24		! y * (double)y
838	faddd	%f44,%f24,%f24		! x * (double)x + y * (double)y
839	fsqrtd	%f24,%f24		! hyp = sqrt(x * (double)x + y * (double)y);
840	fcmped	%f24,DFMAX		! hyp ? DMAX
841	fbug,a	1f			! if ( hyp > DMAX )
842	fmuls	FMAX,FMAX,%f20		! ftmp0 = FMAX * FMAX;
843
844	fdtos	%f24,%f20		! ftmp0 = (float)hyp;
8451:
846	st	%f20,[%l7]		! *pz = ftmp0;
847	add	%l7,stridez,%l7		! pz += stridez
848	add	%i1,stridex,%i1		! px += stridex
849
850	add	%i2,stridey,%i2		! py += stridey
851	ba	.begin1
852	sub	counter,1,counter	! counter--
/*
 * The two be,a branches below carry the store of 0x7f800000 in their
 * annulled delay slots, so that store happens only when the branch is
 * taken.
 */
8532:
854	fcmps	%f17,%f8		! exceptions
855	cmp	%l3,%i0			! hx ? 0x7f800000
856	be,a	%icc,1f			! if ( hx == 0x7f800000 )
857	st	%i0,[%l7]		! *(int*)pz = 0x7f800000;
858
859	cmp	%l4,%i0			! hy ? 0x7f800000
860	be,a	%icc,1f			! if ( hy == 0x7f800000
861	st	%i0,[%l7]		! *(int*)pz = 0x7f800000;
862
863	fmuls	%f17,%f8,%f8		! x * y
864	st	%f8,[%l7]		! *pz = x * y;
865
8661:
867	add	%l7,stridez,%l7		! pz += stridez
868	add	%i1,stridex,%i1		! px += stridex
869
870	add	%i2,stridey,%i2		! py += stridey
871	ba	.begin1
872	sub	counter,1,counter	! counter--
873
874	.align	16
/*
 * .update0: pipeline fill, slot (4_0): |x| failed the range test or x
 * and y are both zero.  %f17 (x of that element) is zeroed in the delay
 * slot of the ble on either outcome, so the pipeline squares 0.0
 * instead of the rejected input.  If counter <= 1 the element lies
 * beyond the current run and nothing else is needed.  Otherwise save
 * its px (%i1) and py (%o7 + stridey) and the remaining count
 * (counter - 1) for the next pass through .begin, and cut this run to
 * 1 element.  Rejoins at .cont0.
 */
875.update0:
876	cmp	counter,1
877	ble	.cont0
878	fzeros	%f17
879
880	stx	%i1,[%fp+tmp_px]
881
882	add	%o7,stridey,%i5
883	stx	%i5,[%fp+tmp_py]
884
885	sub	counter,1,counter
886	st	counter,[%fp+tmp_counter]
887
888	ba	.cont0
889	or	%g0,1,counter
890
891	.align	16
/*
 * .update1: pipeline fill, slot (4_1): |y| failed the range test.
 * %f17 (y of that element) is zeroed in the delay slot of the ble on
 * either outcome.  If counter <= 1 the element lies beyond the current
 * run.  Otherwise save its px (%i1) and py (%i5) and the remaining
 * count (counter - 1) for the next pass through .begin, and cut this
 * run to 1 element.  Rejoins at .cont1.
 */
892.update1:
893	cmp	counter,1
894	ble	.cont1
895	fzeros	%f17
896
897	stx	%i1,[%fp+tmp_px]
898	stx	%i5,[%fp+tmp_py]
899
900	sub	counter,1,counter
901	st	counter,[%fp+tmp_counter]
902
903	ba	.cont1
904	or	%g0,1,counter
905
906	.align	16
/*
 * .update2: pipeline fill, slot (0_0): |x| failed the range test.
 * %f8 (x of that element) is zeroed in the delay slot of the ble on
 * either outcome.  If counter <= 2 the element lies beyond the current
 * run.  Otherwise save its px (%i1) and py (%o4) and the remaining
 * count (counter - 2) for the next pass through .begin, and cut this
 * run to 2 elements.  Rejoins at .cont2.
 */
907.update2:
908	cmp	counter,2
909	ble	.cont2
910	fzeros	%f8
911
912	stx	%i1,[%fp+tmp_px]
913	stx	%o4,[%fp+tmp_py]
914
915	sub	counter,2,counter
916	st	counter,[%fp+tmp_counter]
917
918	ba	.cont2
919	or	%g0,2,counter
920
921	.align	16
/*
 * .update3: pipeline fill, slot (0_0): |y| failed the range test or x
 * and y are both zero.  %f17 (y of that element) is zeroed in the delay
 * slot of the ble on either outcome.  If counter <= 2 the element lies
 * beyond the current run.  Otherwise save its px (%i1) and py (%o4) and
 * the remaining count (counter - 2) for the next pass through .begin,
 * and cut this run to 2 elements.  Rejoins at .cont3.
 */
922.update3:
923	cmp	counter,2
924	ble	.cont3
925	fzeros	%f17
926
927	stx	%i1,[%fp+tmp_px]
928	stx	%o4,[%fp+tmp_py]
929
930	sub	counter,2,counter
931	st	counter,[%fp+tmp_counter]
932
933	ba	.cont3
934	or	%g0,2,counter
935
936	.align	16
/*
 * .update4: pipeline fill, slot (1_0): |x| failed the range test.
 * %f17 (x of that element) is zeroed in the delay slot of the ble on
 * either outcome.  If counter <= 3 the element lies beyond the current
 * run.  Otherwise save its px (%i1) and py (%i5) and the remaining
 * count (counter - 3) for the next pass through .begin, and cut this
 * run to 3 elements.  Rejoins at .cont4.
 */
937.update4:
938	cmp	counter,3
939	ble	.cont4
940	fzeros	%f17
941
942	stx	%i1,[%fp+tmp_px]
943	stx	%i5,[%fp+tmp_py]
944
945	sub	counter,3,counter
946	st	counter,[%fp+tmp_counter]
947
948	ba	.cont4
949	or	%g0,3,counter
950
951	.align	16
/*
 * .update5: pipeline fill, slot (1_0): |y| failed the range test or x
 * and y are both zero.  %f17 (y of that element) is zeroed in the delay
 * slot of the ble on either outcome.  If counter <= 3 the element lies
 * beyond the current run.  Otherwise save its position and the
 * remaining count (counter - 3) for the next pass through .begin, and
 * cut this run to 3 elements.  %i1 has already been advanced past the
 * element, so px is saved as %i1 - stridex (computed in %i2, which is
 * reassigned after .cont5); py is %i5.  Rejoins at .cont5.
 */
952.update5:
953	cmp	counter,3
954	ble	.cont5
955	fzeros	%f17
956
957	sub	%i1,stridex,%i2
958	stx	%i2,[%fp+tmp_px]
959	stx	%i5,[%fp+tmp_py]
960
961	sub	counter,3,counter
962	st	counter,[%fp+tmp_counter]
963
964	ba	.cont5
965	or	%g0,3,counter
966
967	.align	16
/*
 * .update6: pipeline fill, slot (2_0): |x| failed the range test.
 * %f17 (x of that element) is zeroed in the delay slot of the ble on
 * either outcome.  If counter <= 4 the element lies beyond the current
 * run.  Otherwise save its px (%i1) and py (%i2) and the remaining
 * count (counter - 4) for the next pass through .begin, and cut this
 * run to 4 elements.  Rejoins at .cont6.
 */
968.update6:
969	cmp	counter,4
970	ble	.cont6
971	fzeros	%f17
972
973	stx	%i1,[%fp+tmp_px]
974	stx	%i2,[%fp+tmp_py]
975
976	sub	counter,4,counter
977	st	counter,[%fp+tmp_counter]
978
979	ba	.cont6
980	or	%g0,4,counter
981
982	.align	16
/*
 * .update7: pipeline fill, slot (2_0): |y| failed the range test or x
 * and y are both zero.  %f17 (y of that element) is zeroed in the delay
 * slot of the ble on either outcome.  If counter <= 4 the element lies
 * beyond the current run.  Otherwise save its position and the
 * remaining count (counter - 4) for the next pass through .begin, and
 * cut this run to 4 elements.  %i1 has already been advanced past the
 * element, so px is saved as %i1 - stridex (computed in %o7, which is
 * reassigned after .cont7); py is %i2.  Rejoins at .cont7.
 */
983.update7:
984	cmp	counter,4
985	ble	.cont7
986	fzeros	%f17
987
988	sub	%i1,stridex,%o7
989	stx	%o7,[%fp+tmp_px]
990	stx	%i2,[%fp+tmp_py]
991
992	sub	counter,4,counter
993	st	counter,[%fp+tmp_counter]
994
995	ba	.cont7
996	or	%g0,4,counter
997
998	.align	16
/*
 * .update8: pipeline fill, second (3_0) slot: |x| failed the range
 * test.  %f17 (x of that element) is zeroed in the delay slot of the
 * ble on either outcome.  If counter <= 5 the element lies beyond the
 * current run.  Otherwise save its position and the remaining count
 * (counter - 5) for the next pass through .begin, and cut this run to
 * 5 elements.  %i1 has already been advanced past the element, so px is
 * saved as %i1 - stridex (computed in %o5, which is reassigned after
 * .cont9); py is %o7.  Rejoins at .cont8.
 */
999.update8:
1000	cmp	counter,5
1001	ble	.cont8
1002	fzeros	%f17
1003
1004	sub	%i1,stridex,%o5
1005	stx	%o5,[%fp+tmp_px]
1006	stx	%o7,[%fp+tmp_py]
1007
1008	sub	counter,5,counter
1009	st	counter,[%fp+tmp_counter]
1010
1011	ba	.cont8
1012	or	%g0,5,counter
1013
1014	.align	16
/*
 * .update9: pipeline fill, second (3_0) slot: |y| failed the range test
 * or x and y are both zero.  %f17 (y of that element) is zeroed in the
 * delay slot of the ble on either outcome.  If counter <= 5 the element
 * lies beyond the current run.  Otherwise save its position and the
 * remaining count (counter - 5) for the next pass through .begin, and
 * cut this run to 5 elements.  %i1 has already been advanced past the
 * element, so px is saved as %i1 - stridex (computed in %o5, which is
 * reassigned after .cont9); py is %o7.  Rejoins at .cont9.
 */
1015.update9:
1016	cmp	counter,5
1017	ble	.cont9
1018	fzeros	%f17
1019
1020	sub	%i1,stridex,%o5
1021	stx	%o5,[%fp+tmp_px]
1022	stx	%o7,[%fp+tmp_py]
1023
1024	sub	counter,5,counter
1025	st	counter,[%fp+tmp_counter]
1026
1027	ba	.cont9
1028	or	%g0,5,counter
1029
1030	.align	16
/*
 * .update10: pipeline fill, second (4_0) slot: |x| failed the range
 * test.  The first four instructions repeat the ones that the taken
 * branch skipped between its delay slot and .cont10; the skipped x load
 * is replaced by zeroing %f17 in the delay slot of the ble.  If
 * counter <= 6 the element lies beyond the current run.  Otherwise save
 * its px (%i1) and py (%o7 + stridey, computed in %i5) and the
 * remaining count (counter - 6) for the next pass through .begin, and
 * cut this run to 6 elements.  Rejoins at .cont10.
 */
1031.update10:
1032	fmul8x16	SCALE,%f56,%f36	! (3_1) db0 = vis_fmul8x16(SCALE, db0);
1033	and	%l4,_0x7fffffff,%l4	! (4_0) hy0 &= 0x7fffffff;
1034	ldd	[%g1+8],%f56		! (3_1) dtmp0 = ((double*)si0)[1];
1035	faddd	%f54,K1,%f54		! (4_1) res0 += K1;
1036
1037	cmp	counter,6
1038	ble	.cont10
1039	fzeros	%f17
1040
1041	stx	%i1,[%fp+tmp_px]
1042	add	%o7,stridey,%i5
1043	stx	%i5,[%fp+tmp_py]
1044
1045	sub	counter,6,counter
1046	st	counter,[%fp+tmp_counter]
1047
1048	ba	.cont10
1049	or	%g0,6,counter
1050
1051	.align	16
/*
 * .update11: main loop, slot (4_1): |y| failed the range test or x and
 * y are both zero.  %f17 (y of that element) is zeroed in the delay
 * slot of the ble on either outcome.  counter here is the loop counter
 * (already reduced by 5 for the results in flight); if it is <= 1 the
 * element lies beyond the current run.  Otherwise save its px (%i1) and
 * py (%i5) and the remaining count (counter - 1) for the next pass
 * through .begin, and set counter to 1 so the run ends just before this
 * element.  Rejoins at .cont11.
 */
1052.update11:
1053	cmp	counter,1
1054	ble	.cont11
1055	fzeros	%f17
1056
1057	stx	%i1,[%fp+tmp_px]
1058	stx	%i5,[%fp+tmp_py]
1059
1060	sub	counter,1,counter
1061	st	counter,[%fp+tmp_counter]
1062
1063	ba	.cont11
1064	or	%g0,1,counter
1065
1066	.align	16
/*
 * .update12: main loop, slot (0_0): |x| failed the range test.  %f8
 * (x of that element) is zeroed in the delay slot of the ble on either
 * outcome.  If the loop counter is <= 2 the element lies beyond the
 * current run.  Otherwise save its px (%i0) and py (%i5 + stridey,
 * computed in %o4) and the remaining count (counter - 2) for the next
 * pass through .begin, and set counter to 2 so the run ends just before
 * this element.  Rejoins at .cont12.
 */
1067.update12:
1068	cmp	counter,2
1069	ble	.cont12
1070	fzeros	%f8
1071
1072	stx	%i0,[%fp+tmp_px]
1073	add	%i5,stridey,%o4
1074	stx	%o4,[%fp+tmp_py]
1075
1076	sub	counter,2,counter
1077	st	counter,[%fp+tmp_counter]
1078
1079	ba	.cont12
1080	or	%g0,2,counter
1081
1082	.align	16
/*
 * .update13: main loop, slot (0_0): |y| failed the range test or x and
 * y are both zero.  %f17 (y of that element) is zeroed in the delay
 * slot of the ble on either outcome.  If the loop counter is <= 2 the
 * element lies beyond the current run.  Otherwise save its px (%i0) and
 * py (%o4) and the remaining count (counter - 2) for the next pass
 * through .begin, and set counter to 2 so the run ends just before this
 * element.  Rejoins at .cont13.
 */
1083.update13:
1084	cmp	counter,2
1085	ble	.cont13
1086	fzeros	%f17
1087
1088	stx	%i0,[%fp+tmp_px]
1089	stx	%o4,[%fp+tmp_py]
1090
1091	sub	counter,2,counter
1092	st	counter,[%fp+tmp_counter]
1093
1094	ba	.cont13
1095	or	%g0,2,counter
1096
1097	.align	16
/*
 * .update14: main loop, slot (1_0): |x| failed the range test.  %f17
 * (x of that element) is zeroed in the delay slot of the ble on either
 * outcome.  If the loop counter is <= 3 the element lies beyond the
 * current run.  Otherwise save its px (%i1) and py (%o4 + stridey,
 * computed in %i5) and the remaining count (counter - 3) for the next
 * pass through .begin, and set counter to 3 so the run ends just before
 * this element.  Rejoins at .cont14.
 */
1098.update14:
1099	cmp	counter,3
1100	ble	.cont14
1101	fzeros	%f17
1102
1103	stx	%i1,[%fp+tmp_px]
1104	add	%o4,stridey,%i5
1105	stx	%i5,[%fp+tmp_py]
1106
1107	sub	counter,3,counter
1108	st	counter,[%fp+tmp_counter]
1109
1110	ba	.cont14
1111	or	%g0,3,counter
1112
1113	.align	16
/*
 * .update15: main loop, slot (1_0): |y| failed the range test or x and
 * y are both zero.  %f17 (y of that element) is zeroed in the delay
 * slot of the ble on either outcome.  If the loop counter is <= 3 the
 * element lies beyond the current run.  Otherwise save its position and
 * the remaining count (counter - 3) for the next pass through .begin,
 * and set counter to 3 so the run ends just before this element.  %i1
 * has already been advanced past the element, so px is saved as
 * %i1 - stridex (computed in %i2, which is reassigned after .cont15);
 * py is %i5.  Rejoins at .cont15.
 */
1114.update15:
1115	cmp	counter,3
1116	ble	.cont15
1117	fzeros	%f17
1118
1119	sub	%i1,stridex,%i2
1120	stx	%i2,[%fp+tmp_px]
1121	stx	%i5,[%fp+tmp_py]
1122
1123	sub	counter,3,counter
1124	st	counter,[%fp+tmp_counter]
1125
1126	ba	.cont15
1127	or	%g0,3,counter
1128
1129	.align	16
/*
 * .update16: main loop, slot (2_0): |x| failed the range test.  The
 * leading faddd repeats the instruction that the taken branch skipped
 * between its delay slot and .cont16.  %f17 (x of that element) is
 * zeroed in the delay slot of the ble on either outcome.  If the loop
 * counter is <= 4 the element lies beyond the current run.  Otherwise
 * save its px (%i1) and py (%i2) and the remaining count (counter - 4)
 * for the next pass through .begin, and set counter to 4 so the run
 * ends just before this element.  Rejoins at .cont16.
 */
1130.update16:
1131	faddd	%f40,%f32,%f18		! (1_0) db0 = dx0 + dy0;
1132	cmp	counter,4
1133	ble	.cont16
1134	fzeros	%f17
1135
1136	stx	%i1,[%fp+tmp_px]
1137	stx	%i2,[%fp+tmp_py]
1138
1139	sub	counter,4,counter
1140	st	counter,[%fp+tmp_counter]
1141
1142	ba	.cont16
1143	or	%g0,4,counter
1144
1145	.align	16
/*
 * .update17: main loop, slot (2_0): |y| failed the range test or x and
 * y are both zero.  %f17 (y of that element) is zeroed in the delay
 * slot of the ble on either outcome.  If the loop counter is <= 4 the
 * element lies beyond the current run.  Otherwise save its px and py
 * and the remaining count (counter - 4) for the next pass through
 * .begin, and set counter to 4 so the run ends just before this
 * element.  The advanced x pointer was written to %l7, so %i1 still
 * addresses this element; py is %i2.  Rejoins at .cont17.
 */
1146.update17:
1147	cmp	counter,4
1148	ble	.cont17
1149	fzeros	%f17
1150
1151	stx	%i1,[%fp+tmp_px]
1152	stx	%i2,[%fp+tmp_py]
1153
1154	sub	counter,4,counter
1155	st	counter,[%fp+tmp_counter]
1156
1157	ba	.cont17
1158	or	%g0,4,counter
1159
1160	.align	16
/*
 * .update18: main loop, slot (3_0): |x| failed the range test.  %f17
 * (x of that element) is zeroed in the delay slot of the ble on either
 * outcome.  If the loop counter is <= 5 the element lies beyond the
 * current run.  Otherwise save its px (%l7, which holds the x pointer
 * in this part of the loop) and py (%o7) and the remaining count
 * (counter - 5) for the next pass through .begin, and set counter to 5
 * so the run ends just before this element.  Rejoins at .cont18.
 */
1161.update18:
1162	cmp	counter,5
1163	ble	.cont18
1164	fzeros	%f17
1165
1166	stx	%l7,[%fp+tmp_px]
1167	stx	%o7,[%fp+tmp_py]
1168
1169	sub	counter,5,counter
1170	st	counter,[%fp+tmp_counter]
1171
1172	ba	.cont18
1173	or	%g0,5,counter
1174
1175	.align	16
/*
 * .update19: main loop, slot (3_0): |y| failed the range test.  The
 * leading fpadd32 repeats the instruction that the taken branch skipped
 * between its delay slot and .cont19.  %f17 (y of that element) is
 * zeroed in the delay slot of the ble on either outcome.  If the loop
 * counter is <= 5 the element lies beyond the current run.  Otherwise
 * save its px (%l7, which holds the x pointer in this part of the loop)
 * and py (%o7) and the remaining count (counter - 5) for the next pass
 * through .begin, and set counter to 5 so the run ends just before this
 * element.  Rejoins at .cont19.
 */
1176.update19:
1177	fpadd32	%f40,DA1,%f62		! (2_1) db0 = vis_fpadd32(db0,DA1);
1178	cmp	counter,5
1179	ble	.cont19
1180	fzeros	%f17
1181
1182	stx	%l7,[%fp+tmp_px]
1183	stx	%o7,[%fp+tmp_py]
1184
1185	sub	counter,5,counter
1186	st	counter,[%fp+tmp_counter]
1187
1188	ba	.cont19
1189	or	%g0,5,counter
1190
1191	.align	16
/*
 * .update19a: main loop, slot (3_0): x and y are both zero (the flags
 * were set by the orcc at .cont19).  %f17 (y of that element) is zeroed
 * in the delay slot of the ble on either outcome.  If the loop counter
 * is <= 5 the element lies beyond the current run.  Otherwise save its
 * px (%l7, which holds the x pointer in this part of the loop) and py
 * (%o7) and the remaining count (counter - 5) for the next pass through
 * .begin, and set counter to 5 so the run ends just before this
 * element.  Rejoins at .cont19a.
 */
1192.update19a:
1193	cmp	counter,5
1194	ble	.cont19a
1195	fzeros	%f17
1196
1197	stx	%l7,[%fp+tmp_px]
1198	stx	%o7,[%fp+tmp_py]
1199
1200	sub	counter,5,counter
1201	st	counter,[%fp+tmp_counter]
1202
1203	ba	.cont19a
1204	or	%g0,5,counter
1205
1206	.align	16
/*
 * .update20: main loop, slot (4_0): |x| failed the range test.  The
 * leading faddd repeats the instruction that the taken branch skipped
 * between its delay slot and .cont20; the skipped x load is replaced by
 * zeroing %f17 in the delay slot of the ble.  If the loop counter is
 * <= 6 the element lies beyond the current run.  Otherwise save its px
 * (%i1) and py (%o7 + stridey, computed in %g1, which is reloaded after
 * .cont20) and the remaining count (counter - 6) for the next pass
 * through .begin, and set counter to 6 so the run ends just before this
 * element.  Rejoins at .cont20.
 */
1207.update20:
1208	faddd	%f54,K1,%f54		! (4_1) res0 += K1;
1209	cmp	counter,6
1210	ble	.cont20
1211	fzeros	%f17
1212
1213	stx	%i1,[%fp+tmp_px]
1214	add	%o7,stridey,%g1
1215	stx	%g1,[%fp+tmp_py]
1216
1217	sub	counter,6,counter
1218	st	counter,[%fp+tmp_counter]
1219
1220	ba	.cont20
1221	or	%g0,6,counter
1222
/*
 * .exit: reached from .begin1 when counter <= 0.  Returns to the caller
 * with restore in the delay slot of ret; all results have already been
 * stored through the output pointer.
 */
1223.exit:
1224	ret
1225	restore
1226	SET_SIZE(__vhypotf)
1227
1228