xref: /titanic_41/usr/src/lib/libmvec/common/vis/__vhypot.S (revision fa4825fa53a7f93d9b56c4c309623155890c9059)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
23 */
24/*
25 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
26 * Use is subject to license terms.
27 */
28
29	.file	"__vhypot.S"
30
31#include "libm.h"
32
/*
 * Read-only constant table.  Each entry is one IEEE-754 double written as
 * two 32-bit words (high word first); the entry code of __vhypot loads the
 * entries with ldd from offsets 0, 8, 16, 24 and 32.
 *
 *   DC0     0x7ff00000:00000000  exponent field all ones (the bit pattern
 *                                of +Inf); used as an exponent mask (fand)
 *                                and stored as the +Inf result in .spec0
 *   DC1     0x7fe00000:00000000  2^1023
 *   DC2     0x00100000:00000000  2^-1022 (smallest normal double)
 *   D2ON28  0x41b00000:00000000  2^28
 *   DC3     0x7fd00000:00000000  2^1022
 */
33	RO_DATA
34	.align	64
35
36.CONST_TBL:
37	.word	0x7ff00000, 0	! DC0
38	.word	0x7fe00000, 0	! DC1
39	.word	0x00100000, 0	! DC2
40	.word	0x41b00000, 0	! D2ON28 = 268435456.0
41	.word	0x7fd00000, 0	! DC3
42
/*
 * Integer register roles.  counter, stridex and stridey alias the incoming
 * argument registers %i0, %i2 and %i4.  tmp_counter, tmp_px and tmp_py hold
 * the restart state that .begin reloads.  stridez lives in %l0.
 * Note that tmp_py is %o7.
 */
43#define counter		%i0
44#define tmp_counter	%l3
45#define tmp_px		%l5
46#define tmp_py		%o7
47#define stridex		%i2
48#define stridey		%i4
49#define stridez		%l0
50
/*
 * Floating-point registers that receive the table constants in the entry
 * code.  DC0_HI / DC0_LO name the two single-precision halves of DC0 so the
 * halves can be stored individually with st.
 */
51#define DC0		%f8
52#define DC0_HI		%f8
53#define DC0_LO		%f9
54#define DC1		%f46
55#define DC2		%f48
56#define DC3		%f0
57#define D2ON28		%f62
58
59!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
60!      !!!!!   algorithm   !!!!!
61!  ((float*)&x)[0] = ((float*)px)[0];
62!  ((float*)&x)[1] = ((float*)px)[1];
63!
64!  ((float*)&y)[0] = ((float*)py)[0];
65!  ((float*)&y)[1] = ((float*)py)[1];
66!
67!  x = fabs(x);
68!  y = fabs(y);
69!
70!  c0 = vis_fcmple32(DC1,x);
71!  c2 = vis_fcmple32(DC1,y);
72!  c1 = vis_fcmpgt32(DC2,x);
73!  c3 = vis_fcmpgt32(DC2,y);
74!
75!  c0 |= c2;
76!  c1 &= c3;
77!  if ( (c0 & 2) != 0 )
78!  {
79!    lx = ((int*)px)[1];
80!    ly = ((int*)py)[1];
81!    hx = *(int*)px;
82!    hy = *(int*)py;
83!
84!    hx &= 0x7fffffff;
85!    hy &= 0x7fffffff;
86!
87!    j0 = hx;
88!    if ( j0 < hy ) j0 = hy;
89!    j0 &= 0x7ff00000;
90!    if ( j0 >= 0x7ff00000 )
91!    {
92!      if ( hx == 0x7ff00000 && lx == 0 ) res = x == y ? y : x;
93!      else if ( hy == 0x7ff00000 && ly == 0 ) res = x == y ? x : y;
94!      else res = x * y;
95!
96!      ((float*)pz)[0] = ((float*)&res)[0];
97!      ((float*)pz)[1] = ((float*)&res)[1];
98!    }
99!    else
100!    {
101!      diff = hy - hx;
102!      j0 = diff >> 31;
103!      if ( ((diff ^ j0) - j0) < 0x03600000 )
104!      {
105!        x *= D2ONM1022;
106!        y *= D2ONM1022;
107!
108!        x_hi = ( x + two28 ) - two28;
109!        x_lo = x - x_hi;
110!        y_hi = ( y + two28 ) - two28;
111!        y_lo = y - y_hi;
112!        res = (x_hi * x_hi + y_hi * y_hi);
113!        res += ((x + x_hi) * x_lo + (y + y_hi) * y_lo);
114!
115!        res = sqrt(res);
116!
117!        res = D2ONP1022 * res;
118!        ((float*)pz)[0] = ((float*)&res)[0];
119!        ((float*)pz)[1] = ((float*)&res)[1];
120!      }
121!      else
122!      {
123!        res = x + y;
124!        ((float*)pz)[0] = ((float*)&res)[0];
125!        ((float*)pz)[1] = ((float*)&res)[1];
126!      }
127!    }
128!    px += stridex;
129!    py += stridey;
130!    pz += stridez;
131!    continue;
132!  }
133!  if ( (c1 & 2) != 0 )
134!  {
135!    x *= D2ONP1022;
136!    y *= D2ONP1022;
137!
138!    x_hi = ( x + two28 ) - two28;
139!    x_lo = x - x_hi;
140!    y_hi = ( y + two28 ) - two28;
141!    y_lo = y - y_hi;
142!    res = (x_hi * x_hi + y_hi * y_hi);
143!    res += ((x + x_hi) * x_lo + (y + y_hi) * y_lo);
144!
145!    res = sqrt(res);
146!
147!    res = D2ONM1022 * res;
148!    ((float*)pz)[0] = ((float*)&res)[0];
149!    ((float*)pz)[1] = ((float*)&res)[1];
150!    px += stridex;
151!    py += stridey;
152!    pz += stridez;
153!    continue;
154!  }
155!
156!  dmax = x;
157!  if ( dmax < y ) dmax = y;
158!
159!  dmax = vis_fand(dmax,DC0);
160!  dnorm = vis_fpsub32(DC1,dmax);
161!
162!  x *= dnorm;
163!  y *= dnorm;
164!
165!  x_hi = x + D2ON28;
166!  x_hi -= D2ON28;
167!  x_lo = x - x_hi;
168!
169!  y_hi = y + D2ON28;
170!  y_hi -= D2ON28;
171!  y_lo = y - y_hi;
172!
173!  res = x_hi * x_hi;
174!  dtmp1 = x + x_hi;
175!  dtmp0 = y_hi * y_hi;
176!  dtmp2 = y + y_hi;
177!
178!  res += dtmp0;
179!  dtmp1 *= x_lo;
180!  dtmp2 *= y_lo;
181!  dtmp1 += dtmp2;
182!  res += dtmp1;
183!
184!  res = sqrt(res);
185!
186!  res = dmax * res;
187!  ((float*)pz)[0] = ((float*)&res)[0];
188!  ((float*)pz)[1] = ((float*)&res)[1];
189!
190!  px += stridex;
191!  py += stridey;
192!  pz += stridez;
193!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
194
/*
 * __vhypot entry code.
 *
 * Incoming registers as used by this file:
 *   %i0  element count (counter)
 *   %i1  px, pointer to the x values
 *   %i2  stride of x, in elements
 *   %i3  py, pointer to the y values
 *   %i4  stride of y, in elements
 *   %i5  pz, pointer to the results (used as the store base later on)
 *   7th argument (stride of z, in elements) is fetched from the caller
 *   frame: [%fp+STACK_BIAS+176] for __sparcv9, [%fp+STACK_BIAS+92] otherwise.
 * NOTE(review): the C prototype is not visible here; presumably
 *   void __vhypot(int n, double *x, int stridex, double *y, int stridey,
 *                 double *z, int stridez) -- confirm against the header.
 *
 * Work done here:
 *   - open a register window and obtain the address of .CONST_TBL in %o3
 *     through the PIC_SETUP / PIC_SET macros (defined outside this file);
 *   - set %asi to 0x82, the ASI used by every lda of input data below
 *     (0x82 is ASI_PRIMARY_NOFAULT in the SPARC V9 ASI map);
 *   - convert the three strides from elements to bytes (shift left by 3);
 *   - seed the restart state tmp_counter / tmp_px / tmp_py that .begin
 *     consumes;
 *   - load DC0, DC1, DC2, D2ON28 and DC3 from the table.
 * Execution then falls through into .begin.
 */
195	ENTRY(__vhypot)
196	save	%sp,-SA(MINFRAME),%sp
197	PIC_SETUP(l7)
198	PIC_SET(l7,.CONST_TBL,o3)
199	wr	%g0,0x82,%asi
200
201#ifdef __sparcv9
202	ldx	[%fp+STACK_BIAS+176],%l0
203#else
204	ld	[%fp+STACK_BIAS+92],%l0
205#endif
206	ldd	[%o3],DC0
207	sll	%i2,3,stridex
208	mov	%i0,tmp_counter
209
210	ldd	[%o3+8],DC1
211	sll	%i4,3,stridey
212	mov	%i1,tmp_px
213
214	ldd	[%o3+16],DC2
215	sll	%l0,3,stridez
216	mov	%i3,tmp_py
217
218	ldd	[%o3+24],D2ON28
219
220	ldd	[%o3+32],DC3
221
/*
 * .begin   (re)starts a batch: counter, %i1 (px) and %i3 (py) are reloaded
 *          from tmp_counter / tmp_px / tmp_py, and tmp_counter is cleared so
 *          that the next visit exits unless an .updateN handler set it again.
 * .begin1  leaves through .exit when counter <= 0.  Otherwise it classifies
 *          the first element of the batch using the high words of x and y:
 *            %o0 = hx & 0x7fffffff      (%o5 = 0x7ffffc00 + 1023 = 0x7fffffff)
 *            %l4 = hy & 0x7fffffff
 *            %o2 = max(hx, hy)          (cmp + movg)
 *            %o2 >= 0x7fe00000  ->  .spec0  (huge, Inf or NaN operand)
 *            %o2 <  0x00100000  ->  .spec1  (both operands below 2^-1022)
 *          %o4 is preset to 0x7ff00000 for use in .spec0.
 *
 * On the fall-through path element 1 is live as |x| in %f36 and |y| in
 * %f54 (the raw values stay in %f26 / %f24 for .spec0 and .spec1), %o1 holds
 * px + stridex, %o5 holds py + stridey, and the y value of the next element
 * has already been fetched into %f28/%f29.
 */
222.begin:
223	mov	tmp_counter,counter
224	mov	tmp_px,%i1
225	mov	tmp_py,%i3
226	clr	tmp_counter
227.begin1:
228	cmp	counter,0
229	ble,pn	%icc,.exit
230	nop
231
232	lda	[%i1]%asi,%o0
233	sethi	%hi(0x7ffffc00),%o5
234
235	lda	[%i3]%asi,%o2
236	add	%o5,1023,%o5
237
238	lda	[%i1]%asi,%f26		! (1_0) ((float*)&x)[0] = ((float*)px)[0];
239
240	lda	[%i1+4]%asi,%f27	! (1_0) ((float*)&x)[1] = ((float*)px)[1];
241	add	%i1,stridex,%o1		! px += stridex
242
243	lda	[%i3]%asi,%f24		! (1_0) ((float*)&y)[0] = ((float*)py)[0];
244	sethi	%hi(0x00100000),%l7
245	and	%o0,%o5,%o0
246
247	lda	[%i3+4]%asi,%f25	! (1_0) ((float*)&y)[1] = ((float*)py)[1];
248	and	%o2,%o5,%o2
249	sethi	%hi(0x7fe00000),%l6
250
251	fabsd	%f26,%f36		! (1_0) x = fabs(x);
252	cmp	%o0,%o2
253	mov	%o2,%l4
254
255	fabsd	%f24,%f54		! (1_0) y = fabs(y);
256	add	%i3,stridey,%o5		! py += stridey
257	movg	%icc,%o0,%o2
258	lda	[%o5]%asi,%f28		! (2_0) ((float*)&y)[0] = ((float*)py)[0];
259
260	cmp	%o2,%l6
261	sethi	%hi(0x7ff00000),%o4
262	bge,pn	%icc,.spec0
263	lda	[%o5+4]%asi,%f29	! (2_0) ((float*)&y)[1] = ((float*)py)[1];
264
265	cmp	%o2,%l7
266	bl,pn	%icc,.spec1
267	nop
/*
 * Software-pipeline fill.
 *
 * From here on several elements are in flight at once.  The "(k_s)" tags in
 * the inline comments appear to name the pipeline slot k (0..3) and the
 * stage s of the element an instruction belongs to; the pseudo-code after
 * each tag refers to the algorithm banner at the top of the file.
 *
 * For every newly loaded element the VIS compares produce four masks:
 *   c0 = fcmple32(DC1, |x|)   c2 = fcmple32(DC1, |y|)
 *   c1 = fcmpgt32(DC2, |x|)   c3 = fcmpgt32(DC2, |y|)
 * Bit 1 (value 2) of c0|c2 or of c1&c3 diverts to an .updateN handler.
 * (Bit 1 is presumably the lane of the high 32-bit word -- see the VIS
 * manual.)  The handler zeroes the registers of that element, shortens the
 * current batch so that it stops just before the element, records where to
 * restart in tmp_px / tmp_py / tmp_counter, and returns to the matching
 * .contN label.
 *
 * For the regular elements the sequence per element is:
 *   dmax  = max(|x|, |y|) & DC0            (exponent of the larger operand)
 *   dnorm = fpsub32(DC1, dmax)
 *   x *= dnorm, y *= dnorm
 *   split x and y into hi/lo parts by adding and subtracting D2ON28
 *   res = x_hi^2 + y_hi^2 + ((x + x_hi) * x_lo + (y + y_hi) * y_lo)
 *   res = sqrt(res)                        (multiplied by dmax when stored)
 *
 * The block ends by choosing the continuation: fewer than four elements in
 * the batch go to .tail, otherwise counter is reduced by 4 (in the delay
 * slot of the ba) and .main_loop is entered.
 */
268	lda	[%o1]%asi,%f26		! (2_0) ((float*)&x)[0] = ((float*)px)[0];
269
270	lda	[%o1+4]%asi,%f27	! (2_0) ((float*)&x)[1] = ((float*)px)[1];
271	add	%i3,stridey,%i3		! py += stridey
272
273	fabsd	%f28,%f34		! (2_0) y = fabs(y);
274
275	fabsd	%f26,%f50		! (2_0) x = fabs(x);
276
277	fcmple32	DC1,%f50,%o3	! (2_0) c0 = vis_fcmple32(DC1,x);
278
279	fcmple32	DC1,%f34,%o0	! (2_0) c2 = vis_fcmple32(DC1,y);
280
281	fcmpgt32	DC2,%f50,%o4	! (2_0) c1 = vis_fcmpgt32(DC2,x);
282
283	fcmpgt32	DC2,%f34,%o5	! (2_0) c3 = vis_fcmpgt32(DC2,y);
284
285	or	%o3,%o0,%o3		! (2_0) c0 |= c2;
286
287	andcc	%o3,2,%g0		! (2_0) c0 & 2
288	bnz,pn	%icc,.update0		! (2_0) if ( (c0 & 2) != 0 )
289	and	%o4,%o5,%o4		! (2_0) c1 &= c3;
290.cont0:
291	add	%i3,stridey,%l4		! py += stridey
292	andcc	%o4,2,%g0		! (2_0) c1 & 2
293	bnz,pn	%icc,.update1		! (2_0) if ( (c1 & 2) != 0 )
294	fmovd	%f36,%f56		! (1_0) dmax = x;
295.cont1:
296	lda	[%l4]%asi,%f30		! (3_0) ((float*)&y)[0] = ((float*)py)[0];
297	add	%o1,stridex,%l2		! px += stridex
298
299	lda	[%l4+4]%asi,%f31	! (3_0) ((float*)&y)[1] = ((float*)py)[1];
300
301	lda	[%l2]%asi,%f18		! (3_1) ((float*)&x)[0] = ((float*)px)[0];
302
303	lda	[%l2+4]%asi,%f19	! (3_1) ((float*)&x)[1] = ((float*)px)[1];
304
305	fabsd	%f30,%f30		! (3_1) y = fabs(y);
306
307	fabsd	%f18,%f18		! (3_1) x = fabs(x);
308
309	fcmped	%fcc2,%f54,%f56		! (1_1) dmax ? y
310
311	fmovdg	%fcc2,%f54,%f56		! (1_1) if ( dmax < y ) dmax = y;
312
313	fcmple32	DC1,%f18,%o3	! (3_1) c0 = vis_fcmple32(DC1,x);
314
315	fcmple32	DC1,%f30,%o0	! (3_1) c2 = vis_fcmple32(DC1,y);
316
317	fcmpgt32	DC2,%f18,%o4	! (3_1) c1 = vis_fcmpgt32(DC2,x);
318
319	fcmpgt32	DC2,%f30,%o1	! (3_1) c3 = vis_fcmpgt32(DC2,y);
320
321	fand	%f56,DC0,%f38		! (1_1) dmax = vis_fand(dmax,DC0);
322
323	or	%o3,%o0,%o3		! (3_1) c0 |= c2;
324
325	andcc	%o3,2,%g0		! (3_1) c0 & 2
326	bnz,pn	%icc,.update2		! (3_1) if ( (c0 & 2) != 0 )
327	and	%o4,%o1,%o4		! (3_1) c1 &= c3;
328.cont2:
329	add	%l4,stridey,%i3		! py += stridey
330	andcc	%o4,2,%g0		! (3_1) c1 & 2
331	bnz,pn	%icc,.update3		! (3_1) if ( (c1 & 2) != 0 )
332	fmovd	%f50,%f32		! (2_1) dmax = x;
333.cont3:
334	fpsub32	DC1,%f38,%f10		! (1_1) dnorm = vis_fpsub32(DC1,dmax);
335	lda	[%i3]%asi,%f20		! (0_0) ((float*)&y)[0] = ((float*)py)[0];
336
337	lda	[%i3+4]%asi,%f21	! (0_0) ((float*)&y)[1] = ((float*)py)[1];
338
339	add	%l2,stridex,%l1		! px += stridex
340
341	fmuld	%f36,%f10,%f36		! (1_1) x *= dnorm;
342	lda	[%l1]%asi,%f22		! (0_0) ((float*)&x)[0] = ((float*)px)[0]
343
344	lda	[%l1+4]%asi,%f23	! (0_0) ((float*)&x)[1] = ((float*)px)[1];
345
346	fmuld	%f54,%f10,%f56		! (1_1) y *= dnorm;
347	fabsd	%f20,%f40		! (0_0) y = fabs(y);
348
349	fabsd	%f22,%f20		! (0_0) x = fabs(x);
350
351	fcmped	%fcc3,%f34,%f32		! (2_1) dmax ? y
352
353
354	fmovdg	%fcc3,%f34,%f32		! (2_1) if ( dmax < y ) dmax = y;
355
356	faddd	%f36,D2ON28,%f58	! (1_1) x_hi = x + D2ON28;
357	fcmple32	DC1,%f20,%g5	! (0_0) c0 = vis_fcmple32(DC1,x);
358
359	faddd	%f56,D2ON28,%f22	! (1_1) y_hi = y + D2ON28;
360	fcmple32	DC1,%f40,%o2	! (0_0) c2 = vis_fcmple32(DC1,y);
361
362	fcmpgt32	DC2,%f20,%g1	! (0_0) c1 = vis_fcmpgt32(DC2,x);
363
364	fcmpgt32	DC2,%f40,%o4	! (0_0) c3 = vis_fcmpgt32(DC2,y);
365
366	fand	%f32,DC0,%f52		! (2_1) dmax = vis_fand(dmax,DC0);
367
368	or	%g5,%o2,%g5		! (0_0) c0 |= c2;
369	fsubd	%f58,D2ON28,%f58	! (1_1) x_hi -= D2ON28;
370
371	andcc	%g5,2,%g0		! (0_0) c0 & 2
372	bnz,pn	%icc,.update4		! (0_0) if ( (c0 & 2) != 0 )
373	fsubd	%f22,D2ON28,%f22	! (1_1) y_hi -= D2ON28;
374.cont4:
375	and	%g1,%o4,%g1		! (0_0) c1 &= c3;
376
377	add	%i3,stridey,%l2		! py += stridey
378	andcc	%g1,2,%g0		! (0_0) c1 & 2
379	bnz,pn	%icc,.update5		! (0_0) if ( (c1 & 2) != 0 )
380	fmovd	%f18,%f44		! (3_1) dmax = x;
381.cont5:
382	fpsub32	DC1,%f52,%f10		! (2_1) dnorm = vis_fpsub32(DC1,dmax);
383	lda	[%l2]%asi,%f24		! (1_0) ((float*)&y)[0] = ((float*)py)[0];
384
385	fmuld	%f58,%f58,%f60		! (1_1) res = x_hi * x_hi;
386	lda	[%l2+4]%asi,%f25	! (1_0) ((float*)&y)[1] = ((float*)py)[1];
387	add	%l1,stridex,%l7		! px += stridex
388	faddd	%f56,%f22,%f28		! (1_1) dtmp2 = y + y_hi;
389
390	faddd	%f36,%f58,%f6		! (1_1) dtmp1 = x + x_hi;
391	lda	[%l7]%asi,%f26		! (1_0) ((float*)&x)[0] = ((float*)px)[0];
392
393	fmuld	%f50,%f10,%f50		! (2_1) x *= dnorm;
394	fsubd	%f36,%f58,%f58		! (1_1) x_lo = x - x_hi;
395	lda	[%l7+4]%asi,%f27	! (1_0) ((float*)&x)[1] = ((float*)px)[1];
396
397	fmuld	%f22,%f22,%f2		! (1_1) dtmp0 = y_hi * y_hi;
398	fsubd	%f56,%f22,%f56		! (1_1) y_lo = y - y_hi;
399
400	fmuld	%f34,%f10,%f34		! (2_1) y *= dnorm;
401	fabsd	%f24,%f54		! (1_0) y = fabs(y);
402
403	fabsd	%f26,%f36		! (1_0) x = fabs(x);
404
405	fmuld	%f6,%f58,%f10		! (1_1) dtmp1 *= x_lo;
406	fcmped	%fcc0,%f30,%f44		! (3_1) dmax ? y
407
408	fmuld	%f28,%f56,%f26		! (1_1) dtmp2 *= y_lo;
409
410	fmovdg	%fcc0,%f30,%f44		! (3_1) if ( dmax < y ) dmax = y;
411
412	faddd	%f50,D2ON28,%f58	! (2_1) x_hi = x + D2ON28;
413	fcmple32	DC1,%f36,%g1	! (1_0) c0 = vis_fcmple32(DC1,x);
414
415	faddd	%f34,D2ON28,%f22	! (2_1) y_hi = y + D2ON28;
416	fcmple32	DC1,%f54,%g5	! (1_0) c2 = vis_fcmple32(DC1,y);
417
418	faddd	%f60,%f2,%f24		! (1_1) res += dtmp0;
419	fcmpgt32	DC2,%f36,%o5	! (1_0) c1 = vis_fcmpgt32(DC2,x);
420
421	faddd	%f10,%f26,%f28		! (1_1) dtmp1 += dtmp2;
422	fcmpgt32	DC2,%f54,%o1	! (1_0) c3 = vis_fcmpgt32(DC2,y);
423
424	fand	%f44,DC0,%f14		! (3_1) dmax = vis_fand(dmax,DC0);
425
426	or	%g1,%g5,%g1		! (1_0) c0 |= c2;
427	fsubd	%f58,D2ON28,%f44	! (2_1) x_hi -= D2ON28;
428
429	andcc	%g1,2,%g0		! (1_0) c0 & 2
430	bnz,pn	%icc,.update6		! (1_0) if ( (c0 & 2) != 0 )
431	fsubd	%f22,D2ON28,%f58	! (2_1) y_hi -= D2ON28;
432.cont6:
433	and	%o5,%o1,%o5		! (1_0) c1 &= c3;
434	faddd	%f24,%f28,%f26		! (1_1) res += dtmp1;
435
436	add	%l2,stridey,%i3		! py += stridey
437	andcc	%o5,2,%g0		! (1_0) c1 & 2
438	bnz,pn	%icc,.update7		! (1_0) if ( (c1 & 2) != 0 )
439	fmovd	%f20,%f4		! (0_0) dmax = x;
440.cont7:
441	fpsub32	DC1,%f14,%f10		! (3_1) dnorm = vis_fpsub32(DC1,dmax);
442	lda	[%i3]%asi,%f28		! (2_0) ((float*)&y)[0] = ((float*)py)[0];
443
444	fmuld	%f44,%f44,%f2		! (2_1) res = x_hi * x_hi;
445	lda	[%i3+4]%asi,%f29	! (2_0) ((float*)&y)[1] = ((float*)py)[1];
446	add	%l7,stridex,%o1		! px += stridex
447	faddd	%f34,%f58,%f60		! (2_1) dtmp2 = y + y_hi;
448
449	fsqrtd	%f26,%f24		! (1_1) res = sqrt(res);
450	lda	[%o1]%asi,%f26		! (2_0) ((float*)&x)[0] = ((float*)px)[0];
451	faddd	%f50,%f44,%f56		! (2_1) dtmp1 = x + x_hi;
452
453	fmuld	%f18,%f10,%f6		! (3_1) x *= dnorm;
454	fsubd	%f50,%f44,%f18		! (2_1) x_lo = x - x_hi;
455	lda	[%o1+4]%asi,%f27	! (2_0) ((float*)&x)[1] = ((float*)px)[1];
456
457	fmuld	%f58,%f58,%f44		! (2_1) dtmp0 = y_hi * y_hi;
458	fsubd	%f34,%f58,%f22		! (2_1) y_lo = y - y_hi;
459
460	fmuld	%f30,%f10,%f58		! (3_1) y *= dnorm;
461	fabsd	%f28,%f34		! (2_0) y = fabs(y);
462
463	fabsd	%f26,%f50		! (2_0) x = fabs(x);
464
465	fmuld	%f56,%f18,%f10		! (2_1) dtmp1 *= x_lo;
466	fcmped	%fcc1,%f40,%f4		! (0_0) dmax ? y
467
468	fmuld	%f60,%f22,%f12		! (2_1) dtmp2 *= y_lo;
469
470	fmovdg	%fcc1,%f40,%f4		! (0_0) if ( dmax < y ) dmax = y;
471
472	faddd	%f6,D2ON28,%f56		! (3_1) x_hi = x + D2ON28;
473	fcmple32	DC1,%f50,%o3	! (2_0) c0 = vis_fcmple32(DC1,x);
474
475	faddd	%f58,D2ON28,%f28	! (3_1) y_hi = y + D2ON28;
476	fcmple32	DC1,%f34,%o0	! (2_0) c2 = vis_fcmple32(DC1,y);
477
478	faddd	%f2,%f44,%f30		! (2_1) res += dtmp0;
479	fcmpgt32	DC2,%f50,%o4	! (2_0) c1 = vis_fcmpgt32(DC2,x);
480
481	faddd	%f10,%f12,%f26		! (2_1) dtmp1 += dtmp2;
482	fcmpgt32	DC2,%f34,%o5	! (2_0) c3 = vis_fcmpgt32(DC2,y);
483
484	fand	%f4,DC0,%f16		! (0_0) dmax = vis_fand(dmax,DC0);
485
486	or	%o3,%o0,%o3		! (2_0) c0 |= c2;
487	fsubd	%f56,D2ON28,%f18	! (3_1) x_hi -= D2ON28;
488
489	andcc	%o3,2,%g0		! (2_0) c0 & 2
490	bnz,pn	%icc,.update8		! (2_0) if ( (c0 & 2) != 0 )
491	fsubd	%f28,D2ON28,%f4		! (3_1) y_hi -= D2ON28;
492.cont8:
493	and	%o4,%o5,%o4		! (2_0) c1 &= c3;
494	faddd	%f30,%f26,%f12		! (2_1) res += dtmp1;
495
496	add	%i3,stridey,%l4		! py += stridey
497	andcc	%o4,2,%g0		! (2_0) c1 & 2
498	bnz,pn	%icc,.update9		! (2_0) if ( (c1 & 2) != 0 )
499	fmovd	%f36,%f56		! (1_0) dmax = x;
500.cont9:
501	lda	[%l4]%asi,%f30		! (3_0) ((float*)&y)[0] = ((float*)py)[0];
502	add	%o1,stridex,%l2		! px += stridex
503	fpsub32	DC1,%f16,%f44		! (0_0) dnorm = vis_fpsub32(DC1,dmax);
504
505	fmuld	%f18,%f18,%f60		! (3_1) res = x_hi * x_hi;
506	lda	[%l4+4]%asi,%f31	! (3_0) ((float*)&y)[1] = ((float*)py)[1];
507	faddd	%f58,%f4,%f32		! (3_1) dtmp2 = y + y_hi;
508
509	fsqrtd	%f12,%f12		! (2_1) res = sqrt(res);
510	faddd	%f6,%f18,%f28		! (3_1) dtmp1 = x + x_hi;
511
/* Fewer than four elements in this batch: drain through .tail. */
512	cmp	counter,4
513	bl,pn	%icc,.tail
514	nop
515
/* Otherwise account for the four results of the first loop pass. */
516	ba	.main_loop
517	sub	counter,4,counter
518
519	.align	16
/*
 * Steady-state loop of the software pipeline.
 *
 * Each pass
 *   - finishes four elements (res = dmax * sqrt(...)) and stores them through
 *     %i5, %l6, %i5 and %g5 in turn, every pointer being the previous one
 *     plus stridez;
 *   - loads four new x/y pairs through %asi, advancing px by stridex and py
 *     by stridey, and classifies each with the fcmple32 / fcmpgt32 masks;
 *     a flagged element leaves through .update10 ... .update17 and comes
 *     back at the matching .cont10 ... .cont17 label;
 *   - issues one fsqrtd per element (four per pass).
 *
 * Loop control: counter was already reduced by 4 before entry.  At the
 * bottom, subcc counter,4 repeats the loop while the result is not negative;
 * on exit 4 is added back so that counter is the number of results still
 * pending, and execution falls through into .tail.
 *
 * The instruction order is hand scheduled and registers are reused as soon
 * as their previous value is dead, so statements must not be moved.
 */
520.main_loop:
521	fmuld	%f20,%f44,%f2		! (0_1) x *= dnorm;
522	fsubd	%f6,%f18,%f20		! (3_2) x_lo = x - x_hi;
523	lda	[%l2]%asi,%f18		! (3_1) ((float*)&x)[0] = ((float*)px)[0];
524
525	fmuld	%f4,%f4,%f22		! (3_2) dtmp0 = y_hi * y_hi;
526	lda	[%l2+4]%asi,%f19	! (3_1) ((float*)&x)[1] = ((float*)px)[1];
527	fsubd	%f58,%f4,%f58		! (3_2) y_lo = y - y_hi;
528
529	fmuld	%f40,%f44,%f44		! (0_1) y *= dnorm;
530	fabsd	%f30,%f30		! (3_1) y = fabs(y);
531
532	fmuld	%f38,%f24,%f10		! (1_2) res = dmax * res;
533	fabsd	%f18,%f18		! (3_1) x = fabs(x);
534	st	%f10,[%i5]		! (1_2) ((float*)pz)[0] = ((float*)&res)[0];
535
536	fmuld	%f28,%f20,%f28		! (3_2) dtmp1 *= x_lo;
537	st	%f11,[%i5+4]		! (1_2) ((float*)pz)[1] = ((float*)&res)[1];
538	fcmped	%fcc2,%f54,%f56		! (1_1) dmax ? y
539
540	fmuld	%f32,%f58,%f24		! (3_2) dtmp2 *= y_lo;
541
542	fmovdg	%fcc2,%f54,%f56		! (1_1) if ( dmax < y ) dmax = y;
543
544	faddd	%f2,D2ON28,%f10		! (0_1) x_hi = x + D2ON28;
545	fcmple32	DC1,%f18,%o3	! (3_1) c0 = vis_fcmple32(DC1,x);
546
547	faddd	%f44,D2ON28,%f20	! (0_1) y_hi = y + D2ON28;
548	fcmple32	DC1,%f30,%o0	! (3_1) c2 = vis_fcmple32(DC1,y);
549
550	faddd	%f60,%f22,%f22		! (3_2) res += dtmp0;
551	fcmpgt32	DC2,%f18,%o4	! (3_1) c1 = vis_fcmpgt32(DC2,x);
552
553	faddd	%f28,%f24,%f26		! (3_2) dtmp1 += dtmp2;
554	fcmpgt32	DC2,%f30,%o1	! (3_1) c3 = vis_fcmpgt32(DC2,y);
555
556	fand	%f56,DC0,%f38		! (1_1) dmax = vis_fand(dmax,DC0);
557
558	or	%o3,%o0,%o3		! (3_1) c0 |= c2;
559	fsubd	%f10,D2ON28,%f58	! (0_1) x_hi -= D2ON28;
560
561	andcc	%o3,2,%g0		! (3_1) c0 & 2
562	bnz,pn	%icc,.update10		! (3_1) if ( (c0 & 2) != 0 )
563	fsubd	%f20,D2ON28,%f56	! (0_1) y_hi -= D2ON28;
564.cont10:
565	faddd	%f22,%f26,%f28		! (3_2) res += dtmp1;
566	and	%o4,%o1,%o4		! (3_1) c1 &= c3;
567
568	add	%l4,stridey,%i3		! py += stridey
569	andcc	%o4,2,%g0		! (3_1) c1 & 2
570	bnz,pn	%icc,.update11		! (3_1) if ( (c1 & 2) != 0 )
571	fmovd	%f50,%f32		! (2_1) dmax = x;
572.cont11:
573	fpsub32	DC1,%f38,%f10		! (1_1) dnorm = vis_fpsub32(DC1,dmax);
574	add	%l2,stridex,%l1		! px += stridex
575	lda	[%i3]%asi,%f20		! (0_0) ((float*)&y)[0] = ((float*)py)[0];
576
577	fmuld	%f58,%f58,%f6		! (0_1) res = x_hi * x_hi;
578	lda	[%i3+4]%asi,%f21	! (0_0) ((float*)&y)[1] = ((float*)py)[1];
579	add	%i5,stridez,%l6		! pz += stridez
580	faddd	%f44,%f56,%f60		! (0_1) dtmp2 = y + y_hi;
581
582	fsqrtd	%f28,%f4		! (3_2) res = sqrt(res);
583	lda	[%l1]%asi,%f22		! (0_0) ((float*)&x)[0] = ((float*)px)[0];
584	faddd	%f2,%f58,%f24		! (0_1) dtmp1 = x + x_hi;
585
586	fmuld	%f36,%f10,%f36		! (1_1) x *= dnorm;
587	fsubd	%f2,%f58,%f26		! (0_1) x_lo = x - x_hi;
588	lda	[%l1+4]%asi,%f23	! (0_0) ((float*)&x)[1] = ((float*)px)[1];
589
590	fmuld	%f56,%f56,%f28		! (0_1) dtmp0 = y_hi * y_hi;
591	fsubd	%f44,%f56,%f44		! (0_1) y_lo = y - y_hi;
592
593	fmuld	%f54,%f10,%f56		! (1_1) y *= dnorm;
594	fabsd	%f20,%f40		! (0_0) y = fabs(y);
595
596	fmuld	%f52,%f12,%f12		! (2_2) res = dmax * res;
597	fabsd	%f22,%f20		! (0_0) x = fabs(x);
598	st	%f12,[%l6]		! (2_2) ((float*)pz)[0] = ((float*)&res)[0];
599
600	fmuld	%f24,%f26,%f10		! (0_1) dtmp1 *= x_lo;
601	st	%f13,[%l6+4]		! (2_2) ((float*)pz)[1] = ((float*)&res)[1];
602	fcmped	%fcc3,%f34,%f32		! (2_1) dmax ? y
603
604	fmuld	%f60,%f44,%f12		! (0_1) dtmp2 *= y_lo;
605
606	fmovdg	%fcc3,%f34,%f32		! (2_1) if ( dmax < y ) dmax = y;
607
608	faddd	%f36,D2ON28,%f58	! (1_1) x_hi = x + D2ON28;
609	fcmple32	DC1,%f20,%g5	! (0_0) c0 = vis_fcmple32(DC1,x);
610
611	faddd	%f56,D2ON28,%f22	! (1_1) y_hi = y + D2ON28;
612	fcmple32	DC1,%f40,%o2	! (0_0) c2 = vis_fcmple32(DC1,y);
613
614	faddd	%f6,%f28,%f24		! (0_1) res += dtmp0;
615	fcmpgt32	DC2,%f20,%g1	! (0_0) c1 = vis_fcmpgt32(DC2,x);
616
617	faddd	%f10,%f12,%f26		! (0_1) dtmp1 += dtmp2;
618	fcmpgt32	DC2,%f40,%o4	! (0_0) c3 = vis_fcmpgt32(DC2,y);
619
620	fand	%f32,DC0,%f52		! (2_1) dmax = vis_fand(dmax,DC0);
621
622	or	%g5,%o2,%g5		! (0_0) c0 |= c2;
623	fsubd	%f58,D2ON28,%f58	! (1_1) x_hi -= D2ON28;
624
625	andcc	%g5,2,%g0		! (0_0) c0 & 2
626	bnz,pn	%icc,.update12		! (0_0) if ( (c0 & 2) != 0 )
627	fsubd	%f22,D2ON28,%f22	! (1_1) y_hi -= D2ON28;
628.cont12:
629	and	%g1,%o4,%g1		! (0_0) c1 &= c3;
630	faddd	%f24,%f26,%f12		! (0_1) res += dtmp1;
631
632	add	%i3,stridey,%l2		! py += stridey
633	andcc	%g1,2,%g0		! (0_0) c1 & 2
634	bnz,pn	%icc,.update13		! (0_0) if ( (c1 & 2) != 0 )
635	fmovd	%f18,%f44		! (3_1) dmax = x;
636.cont13:
637	fpsub32	DC1,%f52,%f10		! (2_1) dnorm = vis_fpsub32(DC1,dmax);
638	add	%l1,stridex,%l7		! px += stridex
639	lda	[%l2]%asi,%f24		! (1_0) ((float*)&y)[0] = ((float*)py)[0];
640
641	fmuld	%f58,%f58,%f60		! (1_1) res = x_hi * x_hi;
642	add	%l6,stridez,%i5		! pz += stridez
643	lda	[%l2+4]%asi,%f25	! (1_0) ((float*)&y)[1] = ((float*)py)[1];
644	faddd	%f56,%f22,%f28		! (1_1) dtmp2 = y + y_hi;
645
646	fsqrtd	%f12,%f12		! (0_1) res = sqrt(res);
647	lda	[%l7]%asi,%f26		! (1_0) ((float*)&x)[0] = ((float*)px)[0];
648	faddd	%f36,%f58,%f6		! (1_1) dtmp1 = x + x_hi;
649
650	fmuld	%f50,%f10,%f50		! (2_1) x *= dnorm;
651	fsubd	%f36,%f58,%f58		! (1_1) x_lo = x - x_hi;
652	lda	[%l7+4]%asi,%f27	! (1_0) ((float*)&x)[1] = ((float*)px)[1];
653
654	fmuld	%f22,%f22,%f2		! (1_1) dtmp0 = y_hi * y_hi;
655	fsubd	%f56,%f22,%f56		! (1_1) y_lo = y - y_hi;
656
657	fmuld	%f34,%f10,%f34		! (2_1) y *= dnorm;
658	fabsd	%f24,%f54		! (1_0) y = fabs(y);
659
660	fmuld	%f14,%f4,%f14		! (3_2) res = dmax * res;
661	fabsd	%f26,%f36		! (1_0) x = fabs(x);
662	st	%f14,[%i5]		! (3_2) ((float*)pz)[0] = ((float*)&res)[0];
663
664	fmuld	%f6,%f58,%f10		! (1_1) dtmp1 *= x_lo;
665	st	%f15,[%i5+4]		! (3_2) ((float*)pz)[1] = ((float*)&res)[1];
666	fcmped	%fcc0,%f30,%f44		! (3_1) dmax ? y
667
668	fmuld	%f28,%f56,%f26		! (1_1) dtmp2 *= y_lo;
669
670	fmovdg	%fcc0,%f30,%f44		! (3_1) if ( dmax < y ) dmax = y;
671
672	faddd	%f50,D2ON28,%f58	! (2_1) x_hi = x + D2ON28;
673	fcmple32	DC1,%f36,%g1	! (1_0) c0 = vis_fcmple32(DC1,x);
674
675	faddd	%f34,D2ON28,%f22	! (2_1) y_hi = y + D2ON28;
676	fcmple32	DC1,%f54,%g5	! (1_0) c2 = vis_fcmple32(DC1,y);
677
678	faddd	%f60,%f2,%f24		! (1_1) res += dtmp0;
679	fcmpgt32	DC2,%f36,%o5	! (1_0) c1 = vis_fcmpgt32(DC2,x);
680
681	faddd	%f10,%f26,%f28		! (1_1) dtmp1 += dtmp2;
682	fcmpgt32	DC2,%f54,%o1	! (1_0) c3 = vis_fcmpgt32(DC2,y);
683
684	fand	%f44,DC0,%f14		! (3_1) dmax = vis_fand(dmax,DC0);
685
686	or	%g1,%g5,%g1		! (1_0) c0 |= c2;
687	fsubd	%f58,D2ON28,%f44	! (2_1) x_hi -= D2ON28;
688
689	andcc	%g1,2,%g0		! (1_0) c0 & 2
690	bnz,pn	%icc,.update14		! (1_0) if ( (c0 & 2) != 0 )
691	fsubd	%f22,D2ON28,%f58	! (2_1) y_hi -= D2ON28;
692.cont14:
693	and	%o5,%o1,%o5		! (1_0) c1 &= c3;
694	faddd	%f24,%f28,%f26		! (1_1) res += dtmp1;
695
696	add	%l2,stridey,%i3		! py += stridey
697	andcc	%o5,2,%g0		! (1_0) c1 & 2
698	bnz,pn	%icc,.update15		! (1_0) if ( (c1 & 2) != 0 )
699	fmovd	%f20,%f4		! (0_0) dmax = x;
700.cont15:
701	fpsub32	DC1,%f14,%f10		! (3_1) dnorm = vis_fpsub32(DC1,dmax);
702	add	%l7,stridex,%o1		! px += stridex
703	lda	[%i3]%asi,%f28		! (2_0) ((float*)&y)[0] = ((float*)py)[0];
704
705	fmuld	%f44,%f44,%f2		! (2_1) res = x_hi * x_hi;
706	add	%i5,stridez,%g5		! pz += stridez
707	lda	[%i3+4]%asi,%f29	! (2_0) ((float*)&y)[1] = ((float*)py)[1];
708	faddd	%f34,%f58,%f60		! (2_1) dtmp2 = y + y_hi;
709
710	fsqrtd	%f26,%f24		! (1_1) res = sqrt(res);
711	lda	[%o1]%asi,%f26		! (2_0) ((float*)&x)[0] = ((float*)px)[0];
712	faddd	%f50,%f44,%f56		! (2_1) dtmp1 = x + x_hi;
713
714	fmuld	%f18,%f10,%f6		! (3_1) x *= dnorm;
715	fsubd	%f50,%f44,%f18		! (2_1) x_lo = x - x_hi;
716	lda	[%o1+4]%asi,%f27	! (2_0) ((float*)&x)[1] = ((float*)px)[1];
717
718	fmuld	%f58,%f58,%f44		! (2_1) dtmp0 = y_hi * y_hi;
719	fsubd	%f34,%f58,%f22		! (2_1) y_lo = y - y_hi;
720
721	fmuld	%f30,%f10,%f58		! (3_1) y *= dnorm;
722	fabsd	%f28,%f34		! (2_0) y = fabs(y);
723
724	fmuld	%f16,%f12,%f16		! (0_1) res = dmax * res;
725	fabsd	%f26,%f50		! (2_0) x = fabs(x);
726	st	%f16,[%g5]		! (0_1) ((float*)pz)[0] = ((float*)&res)[0];
727
728	fmuld	%f56,%f18,%f10		! (2_1) dtmp1 *= x_lo;
729	st	%f17,[%g5+4]		! (0_1) ((float*)pz)[1] = ((float*)&res)[1];
730	fcmped	%fcc1,%f40,%f4		! (0_0) dmax ? y
731
732	fmuld	%f60,%f22,%f12		! (2_1) dtmp2 *= y_lo;
733
734	fmovdg	%fcc1,%f40,%f4		! (0_0) if ( dmax < y ) dmax = y;
735
736	faddd	%f6,D2ON28,%f56		! (3_1) x_hi = x + D2ON28;
737	fcmple32	DC1,%f50,%o3	! (2_0) c0 = vis_fcmple32(DC1,x);
738
739	faddd	%f58,D2ON28,%f28	! (3_1) y_hi = y + D2ON28;
740	fcmple32	DC1,%f34,%o0	! (2_0) c2 = vis_fcmple32(DC1,y);
741
742	faddd	%f2,%f44,%f30		! (2_1) res += dtmp0;
743	fcmpgt32	DC2,%f50,%o4	! (2_0) c1 = vis_fcmpgt32(DC2,x);
744
745	faddd	%f10,%f12,%f26		! (2_1) dtmp1 += dtmp2;
746	fcmpgt32	DC2,%f34,%o5	! (2_0) c3 = vis_fcmpgt32(DC2,y);
747
748	fand	%f4,DC0,%f16		! (0_0) dmax = vis_fand(dmax,DC0);
749
750	or	%o3,%o0,%o3		! (2_0) c0 |= c2;
751	fsubd	%f56,D2ON28,%f18	! (3_1) x_hi -= D2ON28;
752
753	andcc	%o3,2,%g0		! (2_0) c0 & 2
754	bnz,pn	%icc,.update16		! (2_0) if ( (c0 & 2) != 0 )
755	fsubd	%f28,D2ON28,%f4		! (3_1) y_hi -= D2ON28;
756.cont16:
757	and	%o4,%o5,%o4		! (2_0) c1 &= c3;
758	faddd	%f30,%f26,%f12		! (2_1) res += dtmp1;
759
760	add	%i3,stridey,%l4		! py += stridey
761	andcc	%o4,2,%g0		! (2_0) c1 & 2
762	bnz,pn	%icc,.update17		! (2_0) if ( (c1 & 2) != 0 )
763	fmovd	%f36,%f56		! (1_0) dmax = x;
764.cont17:
765	lda	[%l4]%asi,%f30		! (3_0) ((float*)&y)[0] = ((float*)py)[0];
766	add	%o1,stridex,%l2		! px += stridex
767	fpsub32	DC1,%f16,%f44		! (0_0) dnorm = vis_fpsub32(DC1,dmax);
768
769	fmuld	%f18,%f18,%f60		! (3_1) res = x_hi * x_hi;
770	add	%g5,stridez,%i5		! pz += stridez
771	lda	[%l4+4]%asi,%f31	! (3_0) ((float*)&y)[1] = ((float*)py)[1];
772	faddd	%f58,%f4,%f32		! (3_1) dtmp2 = y + y_hi;
773
774	fsqrtd	%f12,%f12		! (2_1) res = sqrt(res);
775	subcc	counter,4,counter	! counter -= 4;
776	bpos,pt	%icc,.main_loop
777	faddd	%f6,%f18,%f28		! (3_1) dtmp1 = x + x_hi;
778
/* Loop left with counter in -4..-1: restore the number of pending results. */
779	add	counter,4,counter
780
/*
 * Pipeline drain: stores up to three results that are still in flight and
 * then returns to .begin, which either restarts at the position recorded by
 * an .updateN handler or (tmp_counter == 0) ends the call.
 *
 * counter is decremented before every store; as soon as it goes negative
 * the remaining in-flight values are dropped:
 *   1st test  bneg,a with a nop        -> nothing to store, %i5 unchanged
 *   store (1_2) through %i5
 *   2nd test  bneg,a, annulled slot    -> %i5 += stridez only when leaving
 *   store (2_2) through %l6 = %i5 + stridez
 *   3rd test  bneg (not annulled)      -> %i5 = %l6 + stridez on both paths
 *   store (3_2) through %i5, then %i5 += stridez in the slot of the ba
 * In every case %i5 points at the next free result slot when .begin is
 * reached.
 */
781.tail:
782	subcc	counter,1,counter
783	bneg,a	.begin
784	nop
785
786	fsubd	%f6,%f18,%f20		! (3_2) x_lo = x - x_hi;
787
788	fmuld	%f4,%f4,%f22		! (3_2) dtmp0 = y_hi * y_hi;
789	fsubd	%f58,%f4,%f58		! (3_2) y_lo = y - y_hi;
790
791	fmuld	%f38,%f24,%f10		! (1_2) res = dmax * res;
792	st	%f10,[%i5]		! (1_2) ((float*)pz)[0] = ((float*)&res)[0];
793
794	st	%f11,[%i5+4]		! (1_2) ((float*)pz)[1] = ((float*)&res)[1];
795
796	subcc	counter,1,counter
797	bneg,a	.begin
798	add	%i5,stridez,%i5
799
800	fmuld	%f28,%f20,%f28		! (3_2) dtmp1 *= x_lo;
801
802	fmuld	%f32,%f58,%f24		! (3_2) dtmp2 *= y_lo;
803
804	faddd	%f60,%f22,%f22		! (3_2) res += dtmp0;
805
806	faddd	%f28,%f24,%f26		! (3_2) dtmp1 += dtmp2;
807
808	faddd	%f22,%f26,%f28		! (3_2) res += dtmp1;
809
810	add	%i5,stridez,%l6		! pz += stridez
811
812	fsqrtd	%f28,%f4		! (3_2) res = sqrt(res);
813	add	%l2,stridex,%l1		! px += stridex
814
815	fmuld	%f52,%f12,%f12		! (2_2) res = dmax * res;
816	st	%f12,[%l6]		! (2_2) ((float*)pz)[0] = ((float*)&res)[0];
817
818	st	%f13,[%l6+4]		! (2_2) ((float*)pz)[1] = ((float*)&res)[1];
819
820	subcc	counter,1,counter
821	bneg	.begin
822	add	%l6,stridez,%i5
823
824	fmuld	%f14,%f4,%f14		! (3_2) res = dmax * res;
825	st	%f14,[%i5]		! (3_2) ((float*)pz)[0] = ((float*)&res)[0];
826
827	st	%f15,[%i5+4]		! (3_2) ((float*)pz)[1] = ((float*)&res)[1];
828
829	ba	.begin
830	add	%i5,stridez,%i5
831
832	.align	16
/*
 * .spec0 -- scalar path for one element whose larger high word is
 * >= 0x7fe00000 (huge finite value, Inf or NaN).
 *
 * On entry (set up by .begin1):
 *   %o0 = hx & 0x7fffffff, %l4 = hy & 0x7fffffff, %o2 = max of the two,
 *   %o4 = 0x7ff00000, %f26 = raw x, %f24 = raw y,
 *   %i1 / %i3 = px / py of this element, %i5 = pz.
 *
 * Cases:
 *   a) max >= 0x7ff00000 (second half of the block, first label "1"):
 *        - x is exactly +-Inf (hx == 0x7ff00000 and lx == 0), or
 *          y is exactly +-Inf                     -> store +Inf (DC0);
 *        - otherwise at least one operand is a NaN -> store |x| * |y|.
 *   b) finite, exponents far apart (|hx - hy| >= 0x03600000)
 *                                                 -> store |x| + |y|.
 *   c) finite, comparable magnitudes: scale both operands by DC2 (2^-1022),
 *      run the same hi/lo split sum of squares and sqrt as the main path,
 *      and scale the result back by DC3 (2^1022).
 *
 * Every case advances px, py and pz by one element, decrements counter (in
 * the delay slot of the ba) and continues at .begin1.
 *
 * NOTE: several inline comments below were inherited from the main path and
 * do not describe the instruction literally:
 *   - "diff = hy - hx" : the sub computes %o0 - %l4 = hx - hy; only the
 *     magnitude is used, so the sign does not matter;
 *   - "(1_1) x *= dnorm" / "y *= dnorm" : the factor is the constant DC2;
 *   - "(1_2) res = dmax * res" : the factor is the constant DC3.
 * The numeric local labels 1 and 2 are reused; each branch targets the next
 * definition further down (1f / 2f).
 */
833.spec0:
834	ld	[%i1+4],%l1		! lx = ((int*)px)[1];
835	cmp	%o2,%o4			! j0 ? 0x7ff00000
836	bge,pn	%icc,1f			! if ( j0 >= 0x7ff00000 )
837	fabsd	%f26,%f26		! x = fabs(x);
838
839	sub	%o0,%l4,%o0		! diff = hy - hx;
840	fabsd	%f24,%f24		! y = fabs(y);
841
842	sra	%o0,31,%l4		! j0 = diff >> 31;
843
844	xor	%o0,%l4,%o0		! diff ^ j0
845
846	sethi	%hi(0x03600000),%l1
847	sub	%o0,%l4,%o0		! (diff ^ j0) - j0
848
849	cmp	%o0,%l1			! ((diff ^ j0) - j0) ? 0x03600000
850	bge,a,pn	%icc,2f		! if ( ((diff ^ j0) - j0) >= 0x03600000 )
851	faddd	%f26,%f24,%f24		! *pz = x + y
852
853	fmuld	%f26,DC2,%f36		! (1_1) x *= dnorm;
854
855	fmuld	%f24,DC2,%f56		! (1_1) y *= dnorm;
856
857	faddd	%f36,D2ON28,%f58	! (1_1) x_hi = x + D2ON28;
858
859	faddd	%f56,D2ON28,%f22	! (1_1) y_hi = y + D2ON28;
860
861	fsubd	%f58,D2ON28,%f58	! (1_1) x_hi -= D2ON28;
862
863	fsubd	%f22,D2ON28,%f22	! (1_1) y_hi -= D2ON28;
864
865	fmuld	%f58,%f58,%f60		! (1_1) res = x_hi * x_hi;
866	faddd	%f56,%f22,%f28		! (1_1) dtmp2 = y + y_hi;
867
868	faddd	%f36,%f58,%f6		! (1_1) dtmp1 = x + x_hi;
869
870	fsubd	%f36,%f58,%f58		! (1_1) x_lo = x - x_hi;
871
872	fmuld	%f22,%f22,%f2		! (1_1) dtmp0 = y_hi * y_hi;
873	fsubd	%f56,%f22,%f56		! (1_1) y_lo = y - y_hi;
874
875	fmuld	%f6,%f58,%f10		! (1_1) dtmp1 *= x_lo;
876
877	fmuld	%f28,%f56,%f26		! (1_1) dtmp2 *= y_lo;
878
879	faddd	%f60,%f2,%f24		! (1_1) res += dtmp0;
880
881	faddd	%f10,%f26,%f28		! (1_1) dtmp1 += dtmp2;
882
883	faddd	%f24,%f28,%f26		! (1_1) res += dtmp1;
884
885	fsqrtd	%f26,%f24		! (1_1) res = sqrt(res);
886
887	fmuld	DC3,%f24,%f24		! (1_2) res = dmax * res;
/* Common store for cases b) and c): the result is in %f24/%f25. */
8882:
889	add	%i3,stridey,%i3
890	add	%i1,stridex,%i1
891	st	%f24,[%i5]		! ((float*)pz)[0] = ((float*)&res)[0];
892	st	%f25,[%i5+4]		! ((float*)pz)[1] = ((float*)&res)[1];
893
894	add	%i5,stridez,%i5
895	ba	.begin1
896	sub	counter,1,counter
897
/* Case a): is x exactly an infinity? */
8981:
899	ld	[%i3+4],%l2		! ly = ((int*)py)[1];
900	cmp	%o0,%o4			! hx ? 0x7ff00000
901	bne,pn	%icc,1f			! if ( hx != 0x7ff00000 )
902	fabsd	%f24,%f24		! y = fabs(y);
903
904	cmp	%l1,0			! lx ? 0
905	be,pn	%icc,2f			! if ( lx == 0 )
906	nop
/* x is not an infinity: is y exactly an infinity? */
9071:
908	cmp	%l4,%o4			! hy ? 0x7ff00000
909	bne,pn	%icc,1f			! if ( hy != 0x7ff00000 )
910	nop
911
912	cmp	%l2,0			! ly ? 0
913	be,pn	%icc,2f			! if ( ly == 0 )
914	nop
/* Neither operand is an infinity: the product of |x| and |y| is stored. */
9151:
916	add	%i3,stridey,%i3
917	add	%i1,stridex,%i1
918	fmuld	%f26,%f24,%f24		! res = x * y;
919	st	%f24,[%i5]		! ((float*)pz)[0] = ((float*)&res)[0];
920
921	st	%f25,[%i5+4]		! ((float*)pz)[1] = ((float*)&res)[1];
922
923	add	%i5,stridez,%i5
924	ba	.begin1
925	sub	counter,1,counter
926
/*
 * One operand is an infinity: the result is +Inf, written from the two
 * halves of DC0.  The outcome of the fcmpd is not consumed by any visible
 * instruction; it is presumably kept for its floating-point exception side
 * effect -- TODO confirm.
 */
9272:
928	add	%i1,stridex,%i1
929	add	%i3,stridey,%i3
930	st	DC0_HI,[%i5]		! ((int*)pz)[0] = 0x7ff00000;
931	st	DC0_LO,[%i5+4]		! ((int*)pz)[1] = 0;
932	fcmpd	%f26,%f24		! x ? y
933
934	add	%i5,stridez,%i5
935	ba	.begin1
936	sub	counter,1,counter
937
938	.align	16
/*
 * .spec1 -- scalar path for one element whose x and y both have a high word
 * below 0x00100000 (zero or subnormal magnitude).
 *
 * The raw operands are still in %f26 (x) and %f24 (y).  They are scaled up
 * by DC3 (2^1022), the usual hi/lo split sum of squares and sqrt is
 * computed, and the result is scaled back by DC2 (2^-1022).  The signs are
 * not cleared first; every use of x and y below is inside a square or a
 * product of two terms carrying the same sign.
 *
 * Afterwards px, py and pz advance by one element, counter is decremented
 * (in the delay slot of the ba) and execution continues at .begin1.
 *
 * NOTE: the inline comments were inherited from the main path; the factor
 * called "dnorm" is the constant DC3 here and the factor called "dmax" is
 * the constant DC2.
 */
939.spec1:
940	fmuld	%f26,DC3,%f36		! (1_1) x *= dnorm;
941
942	fmuld	%f24,DC3,%f56		! (1_1) y *= dnorm;
943
944	faddd	%f36,D2ON28,%f58	! (1_1) x_hi = x + D2ON28;
945
946	faddd	%f56,D2ON28,%f22	! (1_1) y_hi = y + D2ON28;
947
948	fsubd	%f58,D2ON28,%f58	! (1_1) x_hi -= D2ON28;
949
950	fsubd	%f22,D2ON28,%f22	! (1_1) y_hi -= D2ON28;
951
952	fmuld	%f58,%f58,%f60		! (1_1) res = x_hi * x_hi;
953	faddd	%f56,%f22,%f28		! (1_1) dtmp2 = y + y_hi;
954
955	faddd	%f36,%f58,%f6		! (1_1) dtmp1 = x + x_hi;
956
957	fsubd	%f36,%f58,%f58		! (1_1) x_lo = x - x_hi;
958
959	fmuld	%f22,%f22,%f2		! (1_1) dtmp0 = y_hi * y_hi;
960	fsubd	%f56,%f22,%f56		! (1_1) y_lo = y - y_hi;
961
962	fmuld	%f6,%f58,%f10		! (1_1) dtmp1 *= x_lo;
963
964	fmuld	%f28,%f56,%f26		! (1_1) dtmp2 *= y_lo;
965
966	faddd	%f60,%f2,%f24		! (1_1) res += dtmp0;
967
968	faddd	%f10,%f26,%f28		! (1_1) dtmp1 += dtmp2;
969
970	faddd	%f24,%f28,%f26		! (1_1) res += dtmp1;
971
972	fsqrtd	%f26,%f24		! (1_1) res = sqrt(res);
973
974	fmuld	DC2,%f24,%f24		! (1_2) res = dmax * res;
975
976	add	%i3,stridey,%i3
977	add	%i1,stridex,%i1
978	st	%f24,[%i5]		! ((float*)pz)[0] = ((float*)&res)[0];
979
980	st	%f25,[%i5+4]		! ((float*)pz)[1] = ((float*)&res)[1];
981	add	%i5,stridez,%i5
982	ba	.begin1
983	sub	counter,1,counter
984
985	.align	16
/*
 * .update0 -- pipeline-fill handler for the (c0|c2) test on the element
 * held in %f50 (|x|) and %f34 (|y|); resumes at .cont0.
 * The element is zeroed in the pipeline.  If more than one element remains
 * in the batch, the batch is cut to 1 element and the flagged element
 * (px in %o1, py in %i3) becomes the start of the next pass of .begin.
 */
986.update0:
987	fzero	%f34			! y of the flagged element := 0
988	cmp	counter,1		! batch already ends before it?
989	ble	.cont0			! yes: nothing to split off
990	fzero	%f50			! (delay slot, always runs) x := 0
991
992	mov	%i3,tmp_py		! next pass restarts at this y ...
993	mov	%o1,tmp_px		! ... and at this x
994
995	sub	counter,1,tmp_counter	! elements left for the next pass
996	ba	.cont0
997	mov	1,counter		! (delay slot) this pass: 1 element
998
999	.align	16
/*
 * .update1 -- pipeline-fill handler for the (c1&c3) test on the element
 * held in %f50 (|x|) and %f34 (|y|); resumes at .cont1.
 * The element is zeroed in the pipeline.  If more than one element remains
 * in the batch, the batch is cut to 1 element and the flagged element
 * (px in %o1, py in %i3) becomes the start of the next pass of .begin.
 */
1000.update1:
1001	fzero	%f34			! y of the flagged element := 0
1002	cmp	counter,1		! batch already ends before it?
1003	ble	.cont1			! yes: nothing to split off
1004	fzero	%f50			! (delay slot, always runs) x := 0
1005
1006	mov	%i3,tmp_py		! next pass restarts at this y ...
1007	mov	%o1,tmp_px		! ... and at this x
1008
1009	sub	counter,1,tmp_counter	! elements left for the next pass
1010	ba	.cont1
1011	mov	1,counter		! (delay slot) this pass: 1 element
1012
1013	.align	16
/*
 * .update2 -- pipeline-fill handler for the (c0|c2) test on the element
 * held in %f18 (|x|) and %f30 (|y|); resumes at .cont2.
 * The element is zeroed in the pipeline.  If more than two elements remain
 * in the batch, the batch is cut to 2 elements and the flagged element
 * (px in %l2, py in %l4) becomes the start of the next pass of .begin.
 *
 * Fix: the final branch used to target .cont1 instead of .cont2.  Going
 * back to .cont1 re-executes "add %o1,stridex,%l2" after %o1 has been
 * overwritten with the fcmpgt32 mask, which corrupts the x pointer in %l2
 * (the following lda can then use a misaligned address) and repeats the
 * loads and compares of this element.  Every other handler returns to the
 * label that follows its own branch, and so does this one now.
 */
1014.update2:
1015	fzero	%f18			! x of the flagged element := 0
1016	cmp	counter,2		! batch already ends before it?
1017	ble	.cont2			! yes: nothing to split off
1018	fzero	%f30			! (delay slot, always runs) y := 0
1019
1020	mov	%l2,tmp_px		! next pass restarts at this x ...
1021	mov	%l4,tmp_py		! ... and at this y
1022
1023	sub	counter,2,tmp_counter	! elements left for the next pass
1024	ba	.cont2			! resume right after the c0 test
1025	mov	2,counter		! (delay slot) this pass: 2 elements
1026
1027	.align	16
/*
 * .update3 -- pipeline-fill handler for the (c1&c3) test on the element
 * held in %f18 (|x|) and %f30 (|y|); resumes at .cont3.
 * The element is zeroed in the pipeline.  If more than two elements remain
 * in the batch, the batch is cut to 2 elements and the flagged element
 * (px in %l2, py in %l4) becomes the start of the next pass of .begin.
 */
1028.update3:
1029	fzero	%f30			! y of the flagged element := 0
1030	cmp	counter,2		! batch already ends before it?
1031	ble	.cont3			! yes: nothing to split off
1032	fzero	%f18			! (delay slot, always runs) x := 0
1033
1034	mov	%l4,tmp_py		! next pass restarts at this y ...
1035	mov	%l2,tmp_px		! ... and at this x
1036
1037	sub	counter,2,tmp_counter	! elements left for the next pass
1038	ba	.cont3
1039	mov	2,counter		! (delay slot) this pass: 2 elements
1040
1041	.align	16
/*
 * .update4 -- pipeline-fill handler for the (c0|c2) test on the element
 * held in %f20 (|x|) and %f40 (|y|); resumes at .cont4.
 * The element is zeroed in the pipeline.  If more than three elements
 * remain in the batch, the batch is cut to 3 elements and the flagged
 * element (px in %l1, py in %i3) becomes the start of the next pass.
 */
1042.update4:
1043	fzero	%f40			! y of the flagged element := 0
1044	cmp	counter,3		! batch already ends before it?
1045	ble	.cont4			! yes: nothing to split off
1046	fzero	%f20			! (delay slot, always runs) x := 0
1047
1048	mov	%i3,tmp_py		! next pass restarts at this y ...
1049	mov	%l1,tmp_px		! ... and at this x
1050
1051	sub	counter,3,tmp_counter	! elements left for the next pass
1052	ba	.cont4
1053	mov	3,counter		! (delay slot) this pass: 3 elements
1054
1055	.align	16
/*
 * .update5 -- pipeline-fill handler for the (c1&c3) test on the element
 * held in %f20 (|x|) and %f40 (|y|); resumes at .cont5.
 * The element is zeroed in the pipeline.  If more than three elements
 * remain in the batch, the batch is cut to 3 elements and the flagged
 * element (px in %l1, py in %i3) becomes the start of the next pass.
 */
1056.update5:
1057	fzero	%f40			! y of the flagged element := 0
1058	cmp	counter,3		! batch already ends before it?
1059	ble	.cont5			! yes: nothing to split off
1060	fzero	%f20			! (delay slot, always runs) x := 0
1061
1062	mov	%i3,tmp_py		! next pass restarts at this y ...
1063	mov	%l1,tmp_px		! ... and at this x
1064
1065	sub	counter,3,tmp_counter	! elements left for the next pass
1066	ba	.cont5
1067	mov	3,counter		! (delay slot) this pass: 3 elements
1068
1069	.align	16
/*
 * .update6 -- pipeline-fill handler for the (c0|c2) test on the element
 * held in %f36 (|x|) and %f54 (|y|); resumes at .cont6.
 * The element is zeroed in the pipeline.  If more than four elements remain
 * in the batch, the batch is cut to 4 elements and the flagged element
 * (px in %l7, py in %l2) becomes the start of the next pass of .begin.
 */
1070.update6:
1071	fzero	%f54			! y of the flagged element := 0
1072	cmp	counter,4		! batch already ends before it?
1073	ble	.cont6			! yes: nothing to split off
1074	fzero	%f36			! (delay slot, always runs) x := 0
1075
1076	mov	%l2,tmp_py		! next pass restarts at this y ...
1077	mov	%l7,tmp_px		! ... and at this x
1078
1079	sub	counter,4,tmp_counter	! elements left for the next pass
1080	ba	.cont6
1081	mov	4,counter		! (delay slot) this pass: 4 elements
1082
1083	.align	16
/*
 * .update7 -- pipeline-fill handler for the (c1&c3) test on the element
 * held in %f36 (|x|) and %f54 (|y|); resumes at .cont7.
 * The element is zeroed in the pipeline.  If more than four elements remain
 * in the batch, the batch is cut to 4 elements and the flagged element
 * (px in %l7, py in %l2) becomes the start of the next pass of .begin.
 */
1084.update7:
1085	fzero	%f54			! y of the flagged element := 0
1086	cmp	counter,4		! batch already ends before it?
1087	ble	.cont7			! yes: nothing to split off
1088	fzero	%f36			! (delay slot, always runs) x := 0
1089
1090	mov	%l2,tmp_py		! next pass restarts at this y ...
1091	mov	%l7,tmp_px		! ... and at this x
1092
1093	sub	counter,4,tmp_counter	! elements left for the next pass
1094	ba	.cont7
1095	mov	4,counter		! (delay slot) this pass: 4 elements
1096
1097	.align	16
/*
 * .update8 -- pipeline-fill handler for the (c0|c2) test on the element
 * held in %f50 (|x|) and %f34 (|y|); resumes at .cont8.
 * The element is zeroed in the pipeline.  If more than five elements remain
 * in the batch, the batch is cut to 5 elements and the flagged element
 * (px in %o1, py in %i3) becomes the start of the next pass of .begin.
 */
1098.update8:
1099	fzero	%f34			! y of the flagged element := 0
1100	cmp	counter,5		! batch already ends before it?
1101	ble	.cont8			! yes: nothing to split off
1102	fzero	%f50			! (delay slot, always runs) x := 0
1103
1104	mov	%i3,tmp_py		! next pass restarts at this y ...
1105	mov	%o1,tmp_px		! ... and at this x
1106
1107	sub	counter,5,tmp_counter	! elements left for the next pass
1108	ba	.cont8
1109	mov	5,counter		! (delay slot) this pass: 5 elements
1110
1111	.align	16
/*
 * .update9 -- pipeline-fill handler for the (c1&c3) test on the element
 * held in %f50 (|x|) and %f34 (|y|); resumes at .cont9.
 * The element is zeroed in the pipeline.  If more than five elements remain
 * in the batch, the batch is cut to 5 elements and the flagged element
 * (px in %o1, py in %i3) becomes the start of the next pass of .begin.
 */
1112.update9:
1113	fzero	%f34			! y of the flagged element := 0
1114	cmp	counter,5		! batch already ends before it?
1115	ble	.cont9			! yes: nothing to split off
1116	fzero	%f50			! (delay slot, always runs) x := 0
1117
1118	mov	%i3,tmp_py		! next pass restarts at this y ...
1119	mov	%o1,tmp_px		! ... and at this x
1120
1121	sub	counter,5,tmp_counter	! elements left for the next pass
1122	ba	.cont9
1123	mov	5,counter		! (delay slot) this pass: 5 elements
1124
1125
1126	.align	16
/*
 * .update10 -- main-loop handler for the (c0|c2) test on the element held
 * in %f18 (|x|) and %f30 (|y|); resumes at .cont10.
 * The element is zeroed in the pipeline.  If counter (already reduced by
 * the loop) is greater than 2, it is cut to 2 and the flagged element
 * (px in %l2, py in %l4) becomes the start of the next pass of .begin.
 */
1127.update10:
1128	fzero	%f30			! y of the flagged element := 0
1129	cmp	counter,2		! batch already ends before it?
1130	ble	.cont10			! yes: nothing to split off
1131	fzero	%f18			! (delay slot, always runs) x := 0
1132
1133	mov	%l4,tmp_py		! next pass restarts at this y ...
1134	mov	%l2,tmp_px		! ... and at this x
1135
1136	sub	counter,2,tmp_counter	! elements left for the next pass
1137	ba	.cont10
1138	mov	2,counter		! (delay slot) remaining count := 2
1139
1140	.align	16
/*
 * .update11 -- main-loop handler for the (c1&c3) test on the element held
 * in %f18 (|x|) and %f30 (|y|); resumes at .cont11.
 * The element is zeroed in the pipeline.  If counter (already reduced by
 * the loop) is greater than 2, it is cut to 2 and the flagged element
 * (px in %l2, py in %l4) becomes the start of the next pass of .begin.
 */
1141.update11:
1142	fzero	%f30			! y of the flagged element := 0
1143	cmp	counter,2		! batch already ends before it?
1144	ble	.cont11			! yes: nothing to split off
1145	fzero	%f18			! (delay slot, always runs) x := 0
1146
1147	mov	%l4,tmp_py		! next pass restarts at this y ...
1148	mov	%l2,tmp_px		! ... and at this x
1149
1150	sub	counter,2,tmp_counter	! elements left for the next pass
1151	ba	.cont11
1152	mov	2,counter		! (delay slot) remaining count := 2
1153
1154	.align	16
/*
 * .update12 -- main-loop handler for the (c0|c2) test on the element held
 * in %f20 (|x|) and %f40 (|y|); resumes at .cont12.
 * The element is zeroed in the pipeline.  If counter (already reduced by
 * the loop) is greater than 3, it is cut to 3 and the flagged element
 * (px in %l1, py in %i3) becomes the start of the next pass of .begin.
 */
1155.update12:
1156	fzero	%f40			! y of the flagged element := 0
1157	cmp	counter,3		! batch already ends before it?
1158	ble	.cont12			! yes: nothing to split off
1159	fzero	%f20			! (delay slot, always runs) x := 0
1160
1161	mov	%i3,tmp_py		! next pass restarts at this y ...
1162	mov	%l1,tmp_px		! ... and at this x
1163
1164	sub	counter,3,tmp_counter	! elements left for the next pass
1165	ba	.cont12
1166	mov	3,counter		! (delay slot) remaining count := 3
1167
1168	.align	16
/*
 * .update13 -- main-loop handler for the (c1&c3) test on the element held
 * in %f20 (|x|) and %f40 (|y|); resumes at .cont13.
 * The element is zeroed in the pipeline.  If counter (already reduced by
 * the loop) is greater than 3, it is cut to 3 and the flagged element
 * (px in %l1, py in %i3) becomes the start of the next pass of .begin.
 */
1169.update13:
1170	fzero	%f40			! y of the flagged element := 0
1171	cmp	counter,3		! batch already ends before it?
1172	ble	.cont13			! yes: nothing to split off
1173	fzero	%f20			! (delay slot, always runs) x := 0
1174
1175	mov	%i3,tmp_py		! next pass restarts at this y ...
1176	mov	%l1,tmp_px		! ... and at this x
1177
1178	sub	counter,3,tmp_counter	! elements left for the next pass
1179	ba	.cont13
1180	mov	3,counter		! (delay slot) remaining count := 3
1181
1182	.align	16
/*
 * .update14 -- main-loop handler for the (c0|c2) test on the element held
 * in %f36 (|x|) and %f54 (|y|); resumes at .cont14.
 * The element is zeroed in the pipeline.  If counter (already reduced by
 * the loop) is greater than 4, it is cut to 4 and the flagged element
 * (px in %l7, py in %l2) becomes the start of the next pass of .begin.
 */
1183.update14:
1184	fzero	%f36			! x of the flagged element := 0
1185	cmp	counter,4		! batch already ends before it?
1186	ble	.cont14			! yes: nothing to split off
1187	fzero	%f54			! (delay slot, always runs) y := 0
1188
1189	mov	%l2,tmp_py		! next pass restarts at this y ...
1190	mov	%l7,tmp_px		! ... and at this x
1191
1192	sub	counter,4,tmp_counter	! elements left for the next pass
1193	ba	.cont14
1194	mov	4,counter		! (delay slot) remaining count := 4
1195
1196	.align	16
/*
 * .update15 -- main-loop handler for the (c1&c3) test on the element held
 * in %f36 (|x|) and %f54 (|y|); resumes at .cont15.
 * The element is zeroed in the pipeline.  If counter (already reduced by
 * the loop) is greater than 4, it is cut to 4 and the flagged element
 * (px in %l7, py in %l2) becomes the start of the next pass of .begin.
 */
1197.update15:
1198	fzero	%f36			! x of the flagged element := 0
1199	cmp	counter,4		! batch already ends before it?
1200	ble	.cont15			! yes: nothing to split off
1201	fzero	%f54			! (delay slot, always runs) y := 0
1202
1203	mov	%l2,tmp_py		! next pass restarts at this y ...
1204	mov	%l7,tmp_px		! ... and at this x
1205
1206	sub	counter,4,tmp_counter	! elements left for the next pass
1207	ba	.cont15
1208	mov	4,counter		! (delay slot) remaining count := 4
1209
1210	.align	16
/*
 * .update16 -- main-loop handler for the (c0|c2) test on the element held
 * in %f50 (|x|) and %f34 (|y|); resumes at .cont16.
 * The element is zeroed in the pipeline.  If counter (already reduced by
 * the loop) is greater than 5, it is cut to 5 and the flagged element
 * (px in %o1, py in %i3) becomes the start of the next pass of .begin.
 */
1211.update16:
1212	fzero	%f34			! y of the flagged element := 0
1213	cmp	counter,5		! batch already ends before it?
1214	ble	.cont16			! yes: nothing to split off
1215	fzero	%f50			! (delay slot, always runs) x := 0
1216
1217	mov	%i3,tmp_py		! next pass restarts at this y ...
1218	mov	%o1,tmp_px		! ... and at this x
1219
1220	sub	counter,5,tmp_counter	! elements left for the next pass
1221	ba	.cont16
1222	mov	5,counter		! (delay slot) remaining count := 5
1223
1224	.align	16
/*
 * .update17 -- main-loop handler for the (c1&c3) test on the element held
 * in %f50 (|x|) and %f34 (|y|); resumes at .cont17.
 * The element is zeroed in the pipeline.  If counter (already reduced by
 * the loop) is greater than 5, it is cut to 5 and the flagged element
 * (px in %o1, py in %i3) becomes the start of the next pass of .begin.
 */
1225.update17:
1226	fzero	%f34			! y of the flagged element := 0
1227	cmp	counter,5		! batch already ends before it?
1228	ble	.cont17			! yes: nothing to split off
1229	fzero	%f50			! (delay slot, always runs) x := 0
1230
1231	mov	%i3,tmp_py		! next pass restarts at this y ...
1232	mov	%o1,tmp_px		! ... and at this x
1233
1234	sub	counter,5,tmp_counter	! elements left for the next pass
1235	ba	.cont17
1236	mov	5,counter		! (delay slot) remaining count := 5
1237
1238	.align	16
/*
 * .exit -- common return, reached from .begin1 once counter <= 0.
 * The restore in the delay slot of ret releases the register window opened
 * by the save at ENTRY.  No value is placed in a return register.
 * SET_SIZE closes the symbol opened by ENTRY(__vhypot).
 */
1239.exit:
1240	ret
1241	restore
1242	SET_SIZE(__vhypot)
1243
1244