/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

	.file	"__vhypotf.S"

#include "libm.h"

	RO_DATA
	.align	64

/*
 * Constant pool for __vhypotf.  The routine loads each entry by its byte
 * offset from .CONST_TBL (K1 +0, K2 +8, DC0 +16, DC1 +24, DC2 +32, DA0 +40,
 * DFMAX +48, FMAX +56, SCALE +60, DA1 +64), so the order and size of the
 * entries must stay in step with those loads.  FMAX and SCALE are the two
 * 32-bit halves of a single 8-byte entry.
 */
.CONST_TBL:
	.word	0x3fe00001, 0x80007e00	! K1  =  5.00000715259318464227e-01
	.word	0xbfc00003, 0xc0017a01	! K2  = -1.25000447037521686593e-01
	.word	0x000fffff, 0xffffffff	! DC0 = 0x000fffffffffffff
	.word	0x3ff00000, 0x00000000	! DC1 = 0x3ff0000000000000
	.word	0x7ffff000, 0x00000000	! DC2 = 0x7ffff00000000000
	.word	0x7fe00000, 0x00000000	! DA0 = 0x7fe0000000000000
	.word	0x47efffff, 0xe0000000	! DFMAX = 3.402823e+38
	.word	0x7f7fffff, 0x80808080	! FMAX = 3.402823e+38 , SCALE = 0x80808080
	.word	0x20000000, 0x00000000	! DA1 = 0x2000000000000000

/*
 * Floating-point registers that hold the .CONST_TBL constants for the
 * whole call.  SCALE and FMAX are the odd/even halves of one register pair.
 */
#define DC0		%f12
#define DC1		%f10
#define DC2		%f42
#define DA0		%f6
#define DA1		%f4
#define K2		%f26
#define K1		%f28
#define SCALE		%f3
#define FMAX		%f2
#define DFMAX		%f50

/*
 * Integer registers: byte strides, the two comparison constants, the
 * 0x1ff0 table-index mask and the base of __vlibm_TBL_sqrtf.
 */
#define stridex		%l6
#define stridey		%i4
#define stridez		%l5
#define _0x7fffffff	%o1
#define _0x7f3504f3	%o2
#define _0x1ff0		%l2
#define TBL		%l1

/* Elements still to be produced in the current batch. */
#define counter		%l0

/*
 * Frame-relative scratch slots.  tmp_px / tmp_py / tmp_counter carry the
 * restart cursors and leftover count from one batch to the next.
 * tmp0..tmp4 each receive the high word of a sum of squares so that it can
 * be reloaded into an integer register to form the table index.
 */
#define tmp_px		STACK_BIAS-0x30
#define tmp_py		STACK_BIAS-0x28
#define tmp_counter	STACK_BIAS-0x20
#define tmp0		STACK_BIAS-0x18
#define tmp1		STACK_BIAS-0x10
#define tmp2		STACK_BIAS-0x0c
#define tmp3		STACK_BIAS-0x08
#define tmp4		STACK_BIAS-0x04

! sizeof temp storage - must be a multiple of 16 for V9
#define tmps		0x30

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!      !!!!!   algorithm   !!!!!
!  hx0 = *(int*)px;
!  x0 = *px;
!  px += stridex;
!
!  hy0 = *(int*)py;
!  y0 = *py;
!  py += stridey;
!
!  hx0 &= 0x7fffffff;
!  hy0 &= 0x7fffffff;
!
!  if ( hx0 >= 0x7f3504f3 || hy0 >= 0x7f3504f3 )
!  {
!    if ( hx0 >= 0x7f800000 || hy0 >= 0x7f800000 )
!    {
!      if ( hx0 == 0x7f800000 || hy0 == 0x7f800000 )
!        *(int*)pz = 0x7f800000;
!      else *pz = x0 * y0;
!    }
!    else
!    {
!      hyp = sqrt(x0 * (double)x0 + y0 * (double)y0);
!      if ( hyp <= DFMAX ) ftmp0 = (float)hyp;
!      else ftmp0 = FMAX * FMAX;
!      *pz = ftmp0;
!    }
!    pz += stridez;
!    continue;
!  }
!  if ( (hx0 | hy0) == 0 )
!  {
!    *pz = 0;
!    pz += stridez;
!    continue;
!  }
!  dx0 = x0 * (double)x0;
!  dy0 = y0 * (double)y0;
!  db0 = dx0 + dy0;
!
!  iexp0 = ((int*)&db0)[0];
!
!  h0 = vis_fand(db0,DC0);
!  h0 = vis_for(h0,DC1);
!  h_hi0 = vis_fand(h0,DC2);
!
!  db0 = vis_fand(db0,DA0);
!  db0 = vis_fmul8x16(SCALE, db0);
!  db0 = vis_fpadd32(db0,DA1);
!
!  iexp0 >>= 8;
!  di0 = iexp0 & 0x1ff0;
!  si0 = (char*)sqrt_arr + di0;
!
!  dtmp0 = ((double*)((char*)div_arr + di0))[0];
!  xx0 = h0 - h_hi0;
!  xx0 *= dtmp0;
!
!  dtmp0 = ((double*)si0)[1];
!  res0 = K2 * xx0;
!  res0 += K1;
!  res0 *= xx0;
!  res0 += DC1;
!  res0 = dtmp0 * res0;
!  res0 *= db0;
!  ftmp0 = (float)res0;
!  *pz = ftmp0;
!  pz += stridez;
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

	ENTRY(__vhypotf)
	/*
	 * Vector hypotf: for each element, *pz = sqrt(x*x + y*y) in single
	 * precision (see the algorithm comment above).
	 *
	 * Incoming registers as consumed below:
	 *   %i0 = element count      %i1 = px       %i2 = stridex
	 *   %i3 = py                 %i4 = stridey  %i5 = pz
	 * stridez is read from the caller's frame (presumably the seventh C
	 * argument; the offset differs between the V9 and 32-bit ABIs).
	 * Each stride is shifted left by 2, i.e. scaled by the 4-byte element
	 * size into a byte offset.
	 */
	save	%sp,-SA(MINFRAME)-tmps,%sp
	PIC_SETUP(l7)
	PIC_SET(l7,.CONST_TBL,o3)
	PIC_SET(l7,__vlibm_TBL_sqrtf,l1)

#ifdef __sparcv9
	ldx	[%fp+STACK_BIAS+176],stridez
#else
	ld	[%fp+STACK_BIAS+92],stridez
#endif
	/* Park the count and input cursors in the frame; .begin reloads them. */
	st	%i0,[%fp+tmp_counter]

	stx	%i1,[%fp+tmp_px]

	stx	%i3,[%fp+tmp_py]

	/* Constant loads are interleaved with integer setup. */
	ldd	[%o3],K1
	sethi	%hi(0x7ffffc00),%o1

	ldd	[%o3+8],K2
	sethi	%hi(0x7f350400),%o2

	ldd	[%o3+16],DC0
	add	%o1,1023,_0x7fffffff
	add	%o2,0xf3,_0x7f3504f3

	ldd	[%o3+24],DC1
	sll	%i2,2,stridex

	ld	[%o3+56],FMAX

	ldd	[%o3+32],DC2
	sll	%i4,2,stridey

	ldd	[%o3+40],DA0
	sll	stridez,2,stridez

	ldd	[%o3+48],DFMAX

	ld	[%o3+60],SCALE
	/*
	 * 0xff8 << 1 = 0x1ff0.  Built in two steps, presumably because
	 * 0x1ff0 does not fit a 13-bit signed immediate.
	 */
	or	%g0,0xff8,%l2

	ldd	[%o3+64],DA1
	sll	%l2,1,_0x1ff0
	/* %l7 was the PIC scratch register above; from here on it holds pz. */
	or	%g0,%i5,%l7
/*
 * Batch (re)start.  Reload the remaining count and the px/py cursors from
 * the frame, then clear the saved count.  When an .updateN handler cuts a
 * batch short it leaves the leftover count and the cursors of the offending
 * element in these slots, so the next pass through .begin resumes there.
 */
.begin:
	ld	[%fp+tmp_counter],counter
	ldx	[%fp+tmp_px],%i1
	ldx	[%fp+tmp_py],%i2
	st	%g0,[%fp+tmp_counter]
/*
 * Scalar screening of the first element of the batch.  If either |x| or |y|
 * has a bit pattern >= 0x7f3504f3 the element goes to .spec; if both are
 * zero it goes to .spec1.  Both handlers return here with counter reduced.
 *
 * Everything from here to the jump into .main_loop fills the software
 * pipeline.  The "(s_k)" tags in the comments appear to name the pipeline
 * stream (0..4) an instruction belongs to.  Elements after the first are
 * range/zero checked on the fly; a failure branches to the matching
 * .updateN handler, which returns to the .contN label that follows.
 *
 * All input loads use ASI 0x82 (primary, no-fault), which is what makes it
 * safe for the pipeline to read ahead of the elements actually requested.
 */
.begin1:
	cmp	counter,0
	ble,pn	%icc,.exit
	lda	[%i1]0x82,%l3		! (3_0) hx0 = *(int*)px;

	lda	[%i2]0x82,%l4		! (3_0) hy0 = *(int*)py;

	lda	[%i1]0x82,%f17		! (3_0) x0 = *px;
	and	%l3,_0x7fffffff,%l3	! (3_0) hx0 &= 0x7fffffff;

	cmp	%l3,_0x7f3504f3		! (3_0) hx ? 0x7f3504f3
	bge,pn	%icc,.spec		! (3_0) if ( hx >= 0x7f3504f3 )
	and	%l4,_0x7fffffff,%l4	! (3_0) hy0 &= 0x7fffffff;

	cmp	%l4,_0x7f3504f3		! (3_0) hy ? 0x7f3504f3
	bge,pn	%icc,.spec		! (3_0) if ( hy >= 0x7f3504f3 )
	/* delay slot: %o7 becomes the py cursor for the look-ahead loads */
	or	%g0,%i2,%o7

	orcc	%l3,%l4,%g0
	bz,pn	%icc,.spec1

	/* delay slot of the bz above: px is advanced even when .spec1 is taken */
	add	%i1,stridex,%i1		! px += stridex
	fsmuld	%f17,%f17,%f44		! (3_0) dx0 = x0 * (double)x0;
	lda	[%i2]0x82,%f17		! (3_0) y0 = *py;

	lda	[%i1]0x82,%l3		! (4_0) hx0 = *(int*)px;

	lda	[stridey+%o7]0x82,%l4	! (4_0) hy0 = *(int*)py;

	and	%l3,_0x7fffffff,%l3	! (4_0) hx0 &= 0x7fffffff;

	fsmuld	%f17,%f17,%f24		! (3_0) dy0 = y0 * (double)y0;
	cmp	%l3,_0x7f3504f3		! (4_0) hx ? 0x7f3504f3
	bge,pn	%icc,.update0		! (4_0) if ( hx >= 0x7f3504f3 )
	and	%l4,_0x7fffffff,%l4	! (4_0) hy0 &= 0x7fffffff;

	orcc	%l3,%l4,%g0
	bz,pn	%icc,.update0
	lda	[%i1]0x82,%f17		! (4_0) x0 = *px;
.cont0:
	faddd	%f44,%f24,%f24		! (3_0) db0 = dx0 + dy0;

	/*
	 * Note on the two (4_1) comments that follow: the fsmuld squares x0,
	 * so its result is dx0, and the lda fetches the float y0.
	 */
	fsmuld	%f17,%f17,%f40		! (4_1) dy0 = x0 * (double)x0;
	cmp	%l4,_0x7f3504f3		! (4_1) hy ? 0x7f3504f3
	lda	[stridey+%o7]0x82,%f17	! (4_1) hy0 = *py;

	add	%o7,stridey,%i5		! py += stridey
	lda	[%i1+stridex]0x82,%l3	! (0_0) hx0 = *(int*)px;

	bge,pn	%icc,.update1		! (4_1) if ( hy >= 0x7f3504f3 )
	st	%f24,[%fp+tmp0]		! (3_1) iexp0 = ((int*)&db0)[0];
.cont1:
	and	%l3,_0x7fffffff,%l3	! (0_0) hx0 &= 0x7fffffff;

	fsmuld	%f17,%f17,%f48		! (4_1) dy0 = y0 * (double)y0;
	lda	[%i1+stridex]0x82,%f8	! (0_0) x0 = *px;

	add	%i1,stridex,%i1		! px += stridex

	lda	[%i5+stridey]0x82,%l4	! (0_0) hy0 = *(int*)py;
	cmp	%l3,_0x7f3504f3		! (0_0) hx ? 0x7f3504f3
	bge,pn	%icc,.update2		! (0_0) if ( hx >= 0x7f3504f3 )
	add	%i5,stridey,%o4		! py += stridey
.cont2:
	faddd	%f40,%f48,%f20		! (4_1) db0 = dx0 + dy0;

	fsmuld	%f8,%f8,%f40		! (0_0) dx0 = x0 * (double)x0;
	and	%l4,_0x7fffffff,%l4	! (0_0) hy0 &= 0x7fffffff;
	lda	[%i5+stridey]0x82,%f17	! (0_0) hy0 = *py;

	cmp	%l4,_0x7f3504f3		! (0_0) hy ? 0x7f3504f3
	bge,pn	%icc,.update3		! (0_0) if ( hy >= 0x7f3504f3 )
	st	%f20,[%fp+tmp1]		! (4_1) iexp0 = ((int*)&db0)[0];

	orcc	%l3,%l4,%g0
	bz,pn	%icc,.update3
	/*
	 * The load at .cont3 doubles as the delay slot of the bz above, so it
	 * runs twice when .update3 is entered from that branch.
	 */
.cont3:
	lda	[%i1+stridex]0x82,%l3	! (1_0) hx0 = *(int*)px;

	fand	%f24,DC0,%f60		! (3_1) h0 = vis_fand(db0,DC0);

	and	%l3,_0x7fffffff,%l3	! (1_0) hx0 &= 0x7fffffff;

	fsmuld	%f17,%f17,%f34		! (0_0) dy0 = y0 * (double)y0;
	cmp	%l3,_0x7f3504f3		! (1_0) hx ? 0x7f3504f3
	lda	[%o4+stridey]0x82,%l4	! (1_0) hy0 = *(int*)py;

	add	%i1,stridex,%i1		! px += stridex

	lda	[%i1]0x82,%f17		! (1_0) x0 = *px;
	bge,pn	%icc,.update4		! (1_0) if ( hx >= 0x7f3504f3 )
	add	%o4,stridey,%i5		! py += stridey
.cont4:
	and	%l4,_0x7fffffff,%l4	! (1_0) hy0 &= 0x7fffffff;
	for	%f60,DC1,%f46		! (3_1) h0 = vis_for(h0,DC1);

	cmp	%l4,_0x7f3504f3		! (1_0) hy ? 0x7f3504f3
	ld	[%fp+tmp0],%o0		! (3_1) iexp0 = ((int*)&db0)[0];
	faddd	%f40,%f34,%f0		! (0_0) db0 = dx0 + dy0;

	fsmuld	%f17,%f17,%f40		! (1_0) dx0 = x0 * (double)x0;
	add	%i1,stridex,%i1		! px += stridex
	lda	[%o4+stridey]0x82,%f17	! (1_0) y0 = *py;

	srax	%o0,8,%o0		! (3_1) iexp0 >>= 8;
	bge,pn	%icc,.update5		! (1_0) if ( hy >= 0x7f3504f3 )
	fand	%f46,DC2,%f38		! (3_1) h_hi0 = vis_fand(h0,DC2);

	orcc	%l3,%l4,%g0
	bz,pn	%icc,.update5
	/* as at .cont3, the first load below is also the delay slot of the bz */
.cont5:
	lda	[%i1]0x82,%l3		! (2_0) hx0 = *(int*)px;

	and	%o0,_0x1ff0,%o0		! (3_1) di0 = iexp0 & 0x1ff0;
	st	%f0,[%fp+tmp2]		! (0_0) iexp0 = ((int*)&db0)[0];
	fand	%f20,DC0,%f60		! (4_1) h0 = vis_fand(db0,DC0);

	ldd	[TBL+%o0],%f22		! (3_1) dtmp0 = ((double*)((char*)div_arr + di0))[0];
	fsubd	%f46,%f38,%f38		! (3_1) xx0 = h0 - h_hi0;

	fsmuld	%f17,%f17,%f32		! (1_0) dy0 = y0 * (double)y0;
	add	%i5,stridey,%i2		! py += stridey
	lda	[stridey+%i5]0x82,%l4	! (2_0) hy0 = *(int*)py;

	and	%l3,_0x7fffffff,%l3	! (2_0) hx0 &= 0x7fffffff;

	lda	[%i1]0x82,%f17		! (2_0) x0 = *px;
	cmp	%l3,_0x7f3504f3		! (2_0) hx ? 0x7f3504f3

	fmuld	%f38,%f22,%f38		! (3_1) xx0 *= dmp0;
	and	%l4,_0x7fffffff,%l4	! (2_0) hy0 &= 0x7fffffff;
	for	%f60,DC1,%f46		! (4_1) h0 = vis_for(h0,DC1);

	bge,pn	%icc,.update6		! (2_0) if ( hx >= 0x7f3504f3 )
	ld	[%fp+tmp1],%o3		! (4_1) iexp0 = ((int*)&db0)[0];
.cont6:
	faddd	%f40,%f32,%f18		! (1_0) db0 = dx0 + dy0;

	fsmuld	%f17,%f17,%f44		! (2_0) dx0 = x0 * (double)x0;
	cmp	%l4,_0x7f3504f3		! (2_0) hy ? 0x7f3504f3
	lda	[stridey+%i5]0x82,%f17	! (2_0) y0 = *py;

	add	%i1,stridex,%i1		! px += stridex
	bge,pn	%icc,.update7		! (2_0) if ( hy >= 0x7f3504f3 )
	fand	%f46,DC2,%f58		! (4_1) h_hi0 = vis_fand(h0,DC2);

	orcc	%l3,%l4,%g0
	bz,pn	%icc,.update7
	nop
.cont7:
	fmuld	K2,%f38,%f56		! (3_1) res0 = K2 * xx0;
	srax	%o3,8,%o3		! (4_1) iexp0 >>= 8;
	lda	[%i1]0x82,%l3		! (3_0) hx0 = *(int*)px;

	and	%o3,_0x1ff0,%o3		! (4_1) di0 = iexp0 & 0x1ff0;
	st	%f18,[%fp+tmp3]		! (1_0) iexp0 = ((int*)&db0)[0];
	fand	%f0,DC0,%f60		! (0_0) h0 = vis_fand(db0,DC0);

	ldd	[TBL+%o3],%f22		! (4_1) dtmp0 = ((double*)((char*)div_arr + di0))[0];
	add	%i2,stridey,%o7		! py += stridey
	fsubd	%f46,%f58,%f58		! (4_1) xx0 = h0 - h_hi0;

	fsmuld	%f17,%f17,%f30		! (2_0) dy0 = y0 * (double)y0;
	lda	[stridey+%i2]0x82,%l4	! (3_0) hy0 = *(int*)py;
	and	%l3,_0x7fffffff,%l3	! (3_0) hx0 &= 0x7fffffff;

	faddd	%f56,K1,%f54		! (3_1) res0 += K1;
	cmp	%l3,_0x7f3504f3		! (3_0) hx ? 0x7f3504f3

	lda	[%i1]0x82,%f17		! (3_0) x0 = *px;
	add	%i1,stridex,%i1		! px += stridex
	bge,pn	%icc,.update8		! (3_0) if ( hx >= 0x7f3504f3 )

	/* delay slot of the branch to .update8 */
	fmuld	%f58,%f22,%f58		! (4_1) xx0 *= dmp0;
.cont8:
	and	%l4,_0x7fffffff,%l4	! (3_0) hy0 &= 0x7fffffff;
	for	%f60,DC1,%f46		! (0_0) h0 = vis_for(h0,DC1);

	cmp	%l4,_0x7f3504f3		! (3_0) hy ? 0x7f3504f3
	ld	[%fp+tmp2],%g1		! (0_0) iexp0 = ((int*)&db0)[0];
	faddd	%f44,%f30,%f30		! (2_0) db0 = dx0 + dy0;

	fsmuld	%f17,%f17,%f44		! (3_0) dx0 = x0 * (double)x0;
	bge,pn	%icc,.update9		! (3_0) if ( hy >= 0x7f3504f3 )
	lda	[stridey+%i2]0x82,%f17	! (3_0) y0 = *py;

	orcc	%l3,%l4,%g0
	bz,pn	%icc,.update9
	nop
.cont9:
	fmuld	%f54,%f38,%f40		! (3_1) res0 *= xx0;
	lda	[%i1]0x82,%l3		! (4_0) hx0 = *(int*)px;
	fand	%f46,DC2,%f38		! (0_0) h_hi0 = vis_fand(h0,DC2);

	fmuld	K2,%f58,%f54		! (4_1) res0 = K2 * xx0;
	srax	%g1,8,%o5		! (0_0) iexp0 >>= 8;
	lda	[stridey+%o7]0x82,%l4	! (4_0) hy0 = *(int*)py;
	fand	%f24,DA0,%f56		! (3_1) db0 = vis_fand(db0,DA0);

	and	%o5,_0x1ff0,%o5		! (0_0) di0 = iexp0 & 0x1ff0;
	st	%f30,[%fp+tmp4]		! (2_0) iexp0 = ((int*)&db0)[0];
	fand	%f18,DC0,%f60		! (1_0) h0 = vis_fand(db0,DC0);

	ldd	[TBL+%o5],%f22		! (0_0) dtmp0 = ((double*)((char*)div_arr + di0))[0];
	add	%o0,TBL,%g1		! (3_1) si0 = (char*)sqrt_arr + di0;
	and	%l3,_0x7fffffff,%l3	! (4_0) hx0 &= 0x7fffffff;
	fsubd	%f46,%f38,%f38		! (0_0) xx0 = h0 - h_hi0;

	fsmuld	%f17,%f17,%f24		! (3_0) dy0 = y0 * (double)y0;
	cmp	%l3,_0x7f3504f3		! (4_0) hx ? 0x7f3504f3
	bge,pn	%icc,.update10		! (4_0) if ( hx >= 0x7f3504f3 )
	faddd	%f40,DC1,%f40		! (3_1) res0 += DC1;

	/*
	 * .update10 re-executes the next four instructions itself and
	 * replaces the x0 load by a zero before rejoining at .cont10.
	 */
	fmul8x16	SCALE,%f56,%f36	! (3_1) db0 = vis_fmul8x16(SCALE, db0);
	and	%l4,_0x7fffffff,%l4	! (4_0) hy0 &= 0x7fffffff;
	ldd	[%g1+8],%f56		! (3_1) dtmp0 = ((double*)si0)[1];
	faddd	%f54,K1,%f54		! (4_1) res0 += K1;

	lda	[%i1]0x82,%f17		! (4_0) x0 = *px;
.cont10:
	fmuld	%f38,%f22,%f38		! (0_0) xx0 *= dmp0;
	cmp	counter,5
	for	%f60,DC1,%f46		! (1_0) h0 = vis_for(h0,DC1);

	ld	[%fp+tmp3],%g1		! (1_0) iexp0 = ((int*)&db0)[0];
	fmuld	%f56,%f40,%f62		! (3_1) res0 = dtmp0 * res0;
	faddd	%f44,%f24,%f24		! (3_0) db0 = dx0 + dy0;

	/* Fewer than five elements left: drain the pipeline in .tail. */
	bl,pn	%icc,.tail
	nop

	/* Otherwise enter the steady state with counter pre-decremented by 5. */
	ba	.main_loop
	sub	counter,5,counter

	.align	16
/*
 * Steady state of the software pipeline.  Each pass stores five results
 * (through %l7, %o7, %i0, %i3 and %o4, each stridez apart) and starts five
 * new elements.  counter was already reduced by 5 before entry; the loop
 * repeats while "subcc counter,5" stays non-negative and the 5 is added
 * back on exit before falling into .tail.
 *
 * Integer registers are recycled between roles inside one pass: %o7, %i0
 * and %o4 switch from input cursors to output cursors, and %l7 switches from
 * pz to px.  The .update11 - .update20 handlers rely on which register holds
 * which cursor at their branch point, so instruction order matters here.
 */
.main_loop:
	/*
	 * Note on the two (4_1) comments below: the fsmuld squares x0, so its
	 * result is dx0, and the lda fetches the float y0.
	 */
	fsmuld	%f17,%f17,%f40		! (4_1) dy0 = x0 * (double)x0;
	cmp	%l4,_0x7f3504f3		! (4_1) hy ? 0x7f3504f3
	lda	[stridey+%o7]0x82,%f17	! (4_1) hy0 = *py;
	fpadd32	%f36,DA1,%f36		! (3_2) db0 = vis_fpadd32(db0,DA1);

	fmuld	%f54,%f58,%f58		! (4_2) res0 *= xx0;
	add	%o7,stridey,%i5		! py += stridey
	st	%f24,[%fp+tmp0]		! (3_1) iexp0 = ((int*)&db0)[0];
	fand	%f46,DC2,%f44		! (1_1) h_hi0 = vis_fand(h0,DC2);

	fmuld	K2,%f38,%f56		! (0_1) res0 = K2 * xx0;
	srax	%g1,8,%g5		! (1_1) iexp0 >>= 8;
	bge,pn	%icc,.update11		! (4_1) if ( hy >= 0x7f3504f3 )
	fand	%f20,DA0,%f54		! (4_2) db0 = vis_fand(db0,DA0);

	orcc	%l3,%l4,%g0
	nop
	bz,pn	%icc,.update11
	/* %f52 is not read anywhere in this routine; this only fills the delay slot */
	fzero	%f52
.cont11:
	fmuld	%f62,%f36,%f62		! (3_2) res0 *= db0;
	and	%g5,_0x1ff0,%g5		! (1_1) di0 = iexp0 & 0x1ff0;
	lda	[%i1+stridex]0x82,%l3	! (0_0) hx0 = *(int*)px;
	fand	%f30,DC0,%f60		! (2_1) h0 = vis_fand(db0,DC0);

	ldd	[%g5+TBL],%f22		! (1_1) dtmp0 = ((double*)((char*)div_arr + di0))[0];
	add	%o3,TBL,%g1		! (4_2) si0 = (char*)sqrt_arr + di0;
	add	%i1,stridex,%i0		! px += stridex
	fsubd	%f46,%f44,%f44		! (1_1) xx0 = h0 - h_hi0;

	fsmuld	%f17,%f17,%f48		! (4_1) dy0 = y0 * (double)y0;
	nop
	lda	[%i1+stridex]0x82,%f8	! (0_0) x0 = *px;
	faddd	%f58,DC1,%f36		! (4_2) res0 += DC1;

	faddd	%f56,K1,%f58		! (0_1) res0 += K1;
	and	%l3,_0x7fffffff,%l3	! (0_0) hx0 &= 0x7fffffff;
	ldd	[%g1+8],%f56		! (4_2) dtmp0 = ((double*)si0)[1];
	fmul8x16	SCALE,%f54,%f54	! (4_2) db0 = vis_fmul8x16(SCALE, db0);

	lda	[%i5+stridey]0x82,%l4	! (0_0) hy0 = *(int*)py;
	cmp	%l3,_0x7f3504f3		! (0_0) hx ? 0x7f3504f3
	bge,pn	%icc,.update12		! (0_0) if ( hx >= 0x7f3504f3 )
	fdtos	%f62,%f14		! (3_2) ftmp0 = (float)res0;
.cont12:
	fmuld	%f44,%f22,%f44		! (1_1) xx0 *= dmp0;
	/* %o7 is free as an input cursor now and becomes the next pz */
	add	%l7,stridez,%o7		! pz += stridez
	st	%f14,[%l7]		! (3_2) *pz = ftmp0;
	for	%f60,DC1,%f46		! (2_1) h0 = vis_for(h0,DC1);

	fmuld	%f56,%f36,%f36		! (4_2) res0 = dtmp0 * res0;
	add	%i5,stridey,%o4		! py += stridey
	ld	[%fp+tmp4],%g1		! (2_1) iexp0 = ((int*)&db0)[0];
	faddd	%f40,%f48,%f20		! (4_1) db0 = dx0 + dy0;

	fsmuld	%f8,%f8,%f40		! (0_0) dx0 = x0 * (double)x0;
	and	%l4,_0x7fffffff,%l4	! (0_0) hy0 &= 0x7fffffff;
	lda	[%i5+stridey]0x82,%f17	! (0_0) hy0 = *py;
	fpadd32	%f54,DA1,%f62		! (4_2) db0 = vis_fpadd32(db0,DA1);

	fmuld	%f58,%f38,%f38		! (0_1) res0 *= xx0;
	cmp	%l4,_0x7f3504f3		! (0_0) hy ? 0x7f3504f3
	st	%f20,[%fp+tmp1]		! (4_1) iexp0 = ((int*)&db0)[0];
	fand	%f46,DC2,%f58		! (2_1) h_hi0 = vis_fand(h0,DC2);

	fmuld	K2,%f44,%f56		! (1_1) res0 = K2 * xx0;
	srax	%g1,8,%g1		! (2_1) iexp0 >>= 8;
	bge,pn	%icc,.update13		! (0_0) if ( hy >= 0x7f3504f3 )
	fand	%f0,DA0,%f54		! (0_1) db0 = vis_fand(db0,DA0);

	orcc	%l3,%l4,%g0
	nop
	bz,pn	%icc,.update13
	fzero	%f52
.cont13:
	fmuld	%f36,%f62,%f62		! (4_2) res0 *= db0;
	and	%g1,_0x1ff0,%g1		! (2_1) di0 = iexp0 & 0x1ff0;
	lda	[%i0+stridex]0x82,%l3	! (1_0) hx0 = *(int*)px;
	fand	%f24,DC0,%f60		! (3_1) h0 = vis_fand(db0,DC0);

	ldd	[TBL+%g1],%f22		! (2_1) dtmp0 = ((double*)((char*)div_arr + di0))[0];
	add	%o5,TBL,%o0		! (0_1) si0 = (char*)sqrt_arr + di0;
	add	%i0,stridex,%i1		! px += stridex
	fsubd	%f46,%f58,%f58		! (2_1) xx0 = h0 - h_hi0;

	fsmuld	%f17,%f17,%f34		! (0_0) dy0 = y0 * (double)y0;
	/* %i0 has handed px on to %i1 and is reused as an output cursor */
	add	%o7,stridez,%i0		! pz += stridez
	lda	[%o4+stridey]0x82,%l4	! (1_0) hy0 = *(int*)py;
	faddd	%f38,DC1,%f36		! (0_1) res0 += DC1;

	faddd	%f56,K1,%f38		! (1_1) res0 += K1;
	and	%l3,_0x7fffffff,%l3	! (1_0) hx0 &= 0x7fffffff;
	ldd	[%o0+8],%f56		! (0_1) dtmp0 = ((double*)si0)[1];
	fmul8x16	SCALE,%f54,%f54	! (0_1) db0 = vis_fmul8x16(SCALE, db0);

	lda	[%i1]0x82,%f17		! (1_0) x0 = *px;
	cmp	%l3,_0x7f3504f3		! (1_0) hx ? 0x7f3504f3
	bge,pn	%icc,.update14		! (1_0) if ( hx >= 0x7f3504f3 )
	fdtos	%f62,%f14		! (4_2) ftmp0 = (float)res0;
.cont14:
	fmuld	%f58,%f22,%f58		! (2_1) xx0 *= dmp0;
	and	%l4,_0x7fffffff,%l4	! (1_0) hy0 &= 0x7fffffff;
	add	%o4,stridey,%i5		! py += stridey
	for	%f60,DC1,%f46		! (3_1) h0 = vis_for(h0,DC1);

	fmuld	%f56,%f36,%f36		! (0_1) res0 = dtmp0 * res0;
	cmp	%l4,_0x7f3504f3		! (1_0) hy ? 0x7f3504f3
	ld	[%fp+tmp0],%o0		! (3_1) iexp0 = ((int*)&db0)[0];
	faddd	%f40,%f34,%f0		! (0_0) db0 = dx0 + dy0;

	fsmuld	%f17,%f17,%f40		! (1_0) dx0 = x0 * (double)x0;
	add	%i1,stridex,%i1		! px += stridex
	lda	[%o4+stridey]0x82,%f17	! (1_0) y0 = *py;
	fpadd32	%f54,DA1,%f62		! (0_1) db0 = vis_fpadd32(db0,DA1);

	fmuld	%f38,%f44,%f44		! (1_1) res0 *= xx0;
	st	%f14,[%o7]		! (4_2) *pz = ftmp0;
	bge,pn	%icc,.update15		! (1_0) if ( hy >= 0x7f3504f3 )
	fand	%f46,DC2,%f38		! (3_1) h_hi0 = vis_fand(h0,DC2);

	orcc	%l3,%l4,%g0
	bz,pn	%icc,.update15
	nop
.cont15:
	fmuld	K2,%f58,%f54		! (2_1) res0 = K2 * xx0;
	srax	%o0,8,%o0		! (3_1) iexp0 >>= 8;
	st	%f0,[%fp+tmp2]		! (0_0) iexp0 = ((int*)&db0)[0];
	fand	%f18,DA0,%f56		! (1_1) db0 = vis_fand(db0,DA0);

	fmuld	%f36,%f62,%f62		! (0_1) res0 *= db0;
	and	%o0,_0x1ff0,%o0		! (3_1) di0 = iexp0 & 0x1ff0;
	lda	[%i1]0x82,%l3		! (2_0) hx0 = *(int*)px;
	fand	%f20,DC0,%f60		! (4_1) h0 = vis_fand(db0,DC0);

	ldd	[TBL+%o0],%f22		! (3_1) dtmp0 = ((double*)((char*)div_arr + di0))[0];
	add	%g5,TBL,%o3		! (1_1) si0 = (char*)sqrt_arr + di0;
	add	%i0,stridez,%i3		! pz += stridez
	fsubd	%f46,%f38,%f38		! (3_1) xx0 = h0 - h_hi0;

	fsmuld	%f17,%f17,%f32		! (1_0) dy0 = y0 * (double)y0;
	add	%i5,stridey,%i2		! py += stridey
	lda	[stridey+%i5]0x82,%l4	! (2_0) hy0 = *(int*)py;
	faddd	%f44,DC1,%f44		! (1_1) res0 += DC1;

	fmul8x16	SCALE,%f56,%f36	! (1_1) db0 = vis_fmul8x16(SCALE, db0);
	and	%l3,_0x7fffffff,%l3	! (2_0) hx0 &= 0x7fffffff;
	ldd	[%o3+8],%f56		! (1_1) dtmp0 = ((double*)si0)[1];
	faddd	%f54,K1,%f54		! (2_1) res0 += K1;

	lda	[%i1]0x82,%f17		! (2_0) x0 = *px;
	cmp	%l3,_0x7f3504f3		! (2_0) hx ? 0x7f3504f3
	/* %o4 stops being a py cursor here and becomes an output cursor */
	add	%i3,stridez,%o4		! pz += stridez
	fdtos	%f62,%f14		! (0_1) ftmp0 = (float)res0;

	fmuld	%f38,%f22,%f38		! (3_1) xx0 *= dmp0;
	and	%l4,_0x7fffffff,%l4	! (2_0) hy0 &= 0x7fffffff;
	st	%f14,[%i0]		! (0_1) *pz = ftmp0;
	for	%f60,DC1,%f46		! (4_1) h0 = vis_for(h0,DC1);

	fmuld	%f56,%f44,%f62		! (1_1) res0 = dtmp0 * res0;
	bge,pn	%icc,.update16		! (2_0) if ( hx >= 0x7f3504f3 )
	ld	[%fp+tmp1],%o3		! (4_1) iexp0 = ((int*)&db0)[0];
	/* skipped when the branch is taken; .update16 repeats this faddd */
	faddd	%f40,%f32,%f18		! (1_0) db0 = dx0 + dy0;
.cont16:
	fsmuld	%f17,%f17,%f44		! (2_0) dx0 = x0 * (double)x0;
	cmp	%l4,_0x7f3504f3		! (2_0) hy ? 0x7f3504f3
	lda	[stridey+%i5]0x82,%f17	! (2_0) y0 = *py;
	fpadd32	%f36,DA1,%f36		! (1_1) db0 = vis_fpadd32(db0,DA1);

	fmuld	%f54,%f58,%f54		! (2_1) res0 *= xx0;
	/* %l7 (pz at loop entry) now carries px until it is reset at .cont20 */
	add	%i1,stridex,%l7		! px += stridex
	bge,pn	%icc,.update17		! (2_0) if ( hy >= 0x7f3504f3 )
	fand	%f46,DC2,%f58		! (4_1) h_hi0 = vis_fand(h0,DC2);

	orcc	%l3,%l4,%g0
	nop
	bz,pn	%icc,.update17
	fzero	%f52
.cont17:
	fmuld	K2,%f38,%f56		! (3_1) res0 = K2 * xx0;
	srax	%o3,8,%o3		! (4_1) iexp0 >>= 8;
	st	%f18,[%fp+tmp3]		! (1_0) iexp0 = ((int*)&db0)[0];
	fand	%f30,DA0,%f40		! (2_1) db0 = vis_fand(db0,DA0);

	fmuld	%f62,%f36,%f62		! (1_1) res0 *= db0;
	and	%o3,_0x1ff0,%o3		! (4_1) di0 = iexp0 & 0x1ff0;
	lda	[%l7]0x82,%l3		! (3_0) hx0 = *(int*)px;
	fand	%f0,DC0,%f60		! (0_0) h0 = vis_fand(db0,DC0);

	ldd	[TBL+%o3],%f22		! (4_1) dtmp0 = ((double*)((char*)div_arr + di0))[0];
	add	%g1,TBL,%g1		! (2_1) si0 = (char*)sqrt_arr + di0;
	add	%i2,stridey,%o7		! py += stridey
	fsubd	%f46,%f58,%f58		! (4_1) xx0 = h0 - h_hi0;

	fsmuld	%f17,%f17,%f30		! (2_0) dy0 = y0 * (double)y0;
	lda	[stridey+%i2]0x82,%l4	! (3_0) hy0 = *(int*)py;
	add	%l7,stridex,%i1		! px += stridex
	faddd	%f54,DC1,%f36		! (2_1) res0 += DC1;

	faddd	%f56,K1,%f54		! (3_1) res0 += K1;
	and	%l3,_0x7fffffff,%l3	! (3_0) hx0 &= 0x7fffffff;
	ldd	[%g1+8],%f56		! (2_1) dtmp0 = ((double*)si0)[1];
	fmul8x16	SCALE,%f40,%f40	! (2_1) db0 = vis_fmul8x16(SCALE, db0);

	lda	[%l7]0x82,%f17		! (3_0) x0 = *px;
	cmp	%l3,_0x7f3504f3		! (3_0) hx ? 0x7f3504f3
	bge,pn	%icc,.update18		! (3_0) if ( hx >= 0x7f3504f3 )
	fdtos	%f62,%f14		! (1_1) ftmp0 = (float)res0;
.cont18:
	fmuld	%f58,%f22,%f58		! (4_1) xx0 *= dmp0;
	and	%l4,_0x7fffffff,%l4	! (3_0) hy0 &= 0x7fffffff;
	st	%f14,[%i3]		! (1_1) *pz = ftmp0;
	for	%f60,DC1,%f46		! (0_0) h0 = vis_for(h0,DC1);

	fmuld	%f56,%f36,%f36		! (2_1) res0 = dtmp0 * res0;
	cmp	%l4,_0x7f3504f3		! (3_0) hy ? 0x7f3504f3
	ld	[%fp+tmp2],%g1		! (0_0) iexp0 = ((int*)&db0)[0];
	faddd	%f44,%f30,%f30		! (2_0) db0 = dx0 + dy0;

	fsmuld	%f17,%f17,%f44		! (3_0) dx0 = x0 * (double)x0;
	bge,pn	%icc,.update19		! (3_0) if ( hy >= 0x7f3504f3 )
	lda	[stridey+%i2]0x82,%f17	! (3_0) y0 = *py;
	/* skipped when the branch is taken; .update19 repeats this fpadd32 */
	fpadd32	%f40,DA1,%f62		! (2_1) db0 = vis_fpadd32(db0,DA1);

.cont19:
	fmuld	%f54,%f38,%f40		! (3_1) res0 *= xx0;
	/*
	 * Zero test for slot (3_0).  The condition codes set here are
	 * consumed by the bz to .update19a further down; no instruction in
	 * between writes them.
	 */
	orcc	%l3,%l4,%g0
	st	%f30,[%fp+tmp4]		! (2_0) iexp0 = ((int*)&db0)[0];
	fand	%f46,DC2,%f38		! (0_0) h_hi0 = vis_fand(h0,DC2);

	fmuld	K2,%f58,%f54		! (4_1) res0 = K2 * xx0;
	srax	%g1,8,%o5		! (0_0) iexp0 >>= 8;
	lda	[%i1]0x82,%l3		! (4_0) hx0 = *(int*)px;
	fand	%f24,DA0,%f56		! (3_1) db0 = vis_fand(db0,DA0);

	fmuld	%f36,%f62,%f62		! (2_1) res0 *= db0;
	and	%o5,_0x1ff0,%o5		! (0_0) di0 = iexp0 & 0x1ff0;
	bz,pn	%icc,.update19a
	fand	%f18,DC0,%f60		! (1_0) h0 = vis_fand(db0,DC0);
.cont19a:
	ldd	[TBL+%o5],%f22		! (0_0) dtmp0 = ((double*)((char*)div_arr + di0))[0];
	add	%o0,TBL,%g1		! (3_1) si0 = (char*)sqrt_arr + di0;
	and	%l3,_0x7fffffff,%l3	! (4_0) hx0 &= 0x7fffffff;
	fsubd	%f46,%f38,%f38		! (0_0) xx0 = h0 - h_hi0;

	fsmuld	%f17,%f17,%f24		! (3_0) dy0 = y0 * (double)y0;
	cmp	%l3,_0x7f3504f3		! (4_0) hx ? 0x7f3504f3
	lda	[stridey+%o7]0x82,%l4	! (4_0) hy0 = *(int*)py;
	faddd	%f40,DC1,%f40		! (3_1) res0 += DC1;

	fmul8x16	SCALE,%f56,%f36	! (3_1) db0 = vis_fmul8x16(SCALE, db0);
	bge,pn	%icc,.update20		! (4_0) if ( hx >= 0x7f3504f3 )
	ldd	[%g1+8],%f56		! (3_1) dtmp0 = ((double*)si0)[1];
	/* skipped when the branch is taken; .update20 repeats this faddd */
	faddd	%f54,K1,%f54		! (4_1) res0 += K1;

	lda	[%i1]0x82,%f17		! (4_0) x0 = *px;
.cont20:
	subcc	counter,5,counter	! counter -= 5
	/* %l7 returns to its pz role for the next pass (or for .tail) */
	add	%o4,stridez,%l7		! pz += stridez
	fdtos	%f62,%f14		! (2_1) ftmp0 = (float)res0;

	fmuld	%f38,%f22,%f38		! (0_0) xx0 *= dmp0;
	and	%l4,_0x7fffffff,%l4	! (4_0) hy0 &= 0x7fffffff;
	st	%f14,[%o4]		! (2_1) *pz = ftmp0;
	for	%f60,DC1,%f46		! (1_0) h0 = vis_for(h0,DC1);

	ld	[%fp+tmp3],%g1		! (1_0) iexp0 = ((int*)&db0)[0];
	fmuld	%f56,%f40,%f62		! (3_1) res0 = dtmp0 * res0;
	bpos,pt	%icc,.main_loop
	faddd	%f44,%f24,%f24		! (3_0) db0 = dx0 + dy0;

	/* undo the last subtraction: counter = elements still owed (0..4) */
	add	counter,5,counter

/*
 * Pipeline drain.  Up to four elements are still in flight when control
 * arrives here (streams 3_2, 4_2, 0_1 and 1_1); they are completed in that
 * order with no further input loads.  counter is decremented before each
 * one, and as soon as it goes negative control returns to .begin.  Each exit
 * leaves the next output address in %l7, which is where .begin1, .spec and
 * .spec1 expect pz.
 */
.tail:
	subcc	counter,1,counter
	bneg	.begin
	nop

	fpadd32	%f36,DA1,%f36		! (3_2) db0 = vis_fpadd32(db0,DA1);

	fmuld	%f54,%f58,%f58		! (4_2) res0 *= xx0;
	fand	%f46,DC2,%f44		! (1_1) h_hi0 = vis_fand(h0,DC2);

	fmuld	K2,%f38,%f56		! (0_1) res0 = K2 * xx0;
	srax	%g1,8,%g5		! (1_1) iexp0 >>= 8;
	fand	%f20,DA0,%f54		! (4_2) db0 = vis_fand(db0,DA0);

	fmuld	%f62,%f36,%f62		! (3_2) res0 *= db0;
	and	%g5,_0x1ff0,%g5		! (1_1) di0 = iexp0 & 0x1ff0;

	ldd	[%g5+TBL],%f22		! (1_1) dtmp0 = ((double*)((char*)div_arr + di0))[0];
	add	%o3,TBL,%g1		! (4_2) si0 = (char*)sqrt_arr + di0;
	fsubd	%f46,%f44,%f44		! (1_1) xx0 = h0 - h_hi0;

	faddd	%f58,DC1,%f36		! (4_2) res0 += DC1;

	faddd	%f56,K1,%f58		! (0_1) res0 += K1;
	ldd	[%g1+8],%f56		! (4_2) dtmp0 = ((double*)si0)[1];
	fmul8x16	SCALE,%f54,%f54	! (4_2) db0 = vis_fmul8x16(SCALE, db0);

	fdtos	%f62,%f14		! (3_2) ftmp0 = (float)res0;

	fmuld	%f44,%f22,%f44		! (1_1) xx0 *= dmp0;
	add	%l7,stridez,%o7		! pz += stridez
	st	%f14,[%l7]		! (3_2) *pz = ftmp0;

	subcc	counter,1,counter
	bneg	.begin
	/* delay slot: executed on both paths, keeps %l7 = next pz */
	or	%g0,%o7,%l7

	fmuld	%f56,%f36,%f36		! (4_2) res0 = dtmp0 * res0;

	fpadd32	%f54,DA1,%f62		! (4_2) db0 = vis_fpadd32(db0,DA1);

	fmuld	%f58,%f38,%f38		! (0_1) res0 *= xx0;

	fmuld	K2,%f44,%f56		! (1_1) res0 = K2 * xx0;
	fand	%f0,DA0,%f54		! (0_1) db0 = vis_fand(db0,DA0);

	fmuld	%f36,%f62,%f62		! (4_2) res0 *= db0;

	add	%o5,TBL,%o0		! (0_1) si0 = (char*)sqrt_arr + di0;

	faddd	%f38,DC1,%f36		! (0_1) res0 += DC1;

	faddd	%f56,K1,%f38		! (1_1) res0 += K1;
	ldd	[%o0+8],%f56		! (0_1) dtmp0 = ((double*)si0)[1];
	fmul8x16	SCALE,%f54,%f54	! (0_1) db0 = vis_fmul8x16(SCALE, db0);

	add	%o7,stridez,%i0		! pz += stridez
	fdtos	%f62,%f14		! (4_2) ftmp0 = (float)res0;

	fmuld	%f56,%f36,%f36		! (0_1) res0 = dtmp0 * res0;

	fpadd32	%f54,DA1,%f62		! (0_1) db0 = vis_fpadd32(db0,DA1);

	fmuld	%f38,%f44,%f44		! (1_1) res0 *= xx0;
	add	%i0,stridez,%i3		! pz += stridez
	st	%f14,[%o7]		! (4_2) *pz = ftmp0;

	subcc	counter,1,counter
	bneg	.begin
	or	%g0,%i0,%l7

	fand	%f18,DA0,%f56		! (1_1) db0 = vis_fand(db0,DA0);

	fmuld	%f36,%f62,%f62		! (0_1) res0 *= db0;

	add	%g5,TBL,%o3		! (1_1) si0 = (char*)sqrt_arr + di0;

	faddd	%f44,DC1,%f44		! (1_1) res0 += DC1;

	fmul8x16	SCALE,%f56,%f36	! (1_1) db0 = vis_fmul8x16(SCALE, db0);
	ldd	[%o3+8],%f56		! (1_1) dtmp0 = ((double*)si0)[1];

	add	%i3,stridez,%o4		! pz += stridez
	fdtos	%f62,%f14		! (0_1) ftmp0 = (float)res0;

	st	%f14,[%i0]		! (0_1) *pz = ftmp0;

	subcc	counter,1,counter
	bneg	.begin
	or	%g0,%i3,%l7

	fmuld	%f56,%f44,%f62		! (1_1) res0 = dtmp0 * res0;

	fpadd32	%f36,DA1,%f36		! (1_1) db0 = vis_fpadd32(db0,DA1);

	fmuld	%f62,%f36,%f62		! (1_1) res0 *= db0;

	fdtos	%f62,%f14		! (1_1) ftmp0 = (float)res0;

	st	%f14,[%i3]		! (1_1) *pz = ftmp0;

	/* fourth and last in-flight element done: always restart */
	ba	.begin
	or	%g0,%o4,%l7

	.align	16
/*
 * First element of a batch with hx0 | hy0 == 0 (both inputs are +0 or -0):
 * store +0.0f.  px was already advanced in the delay slot of the branch
 * that leads here, so only pz and py are stepped before returning to
 * .begin1 with one element fewer.
 */
.spec1:
	st	%g0,[%l7]		! *pz = 0;
	add	%l7,stridez,%l7		! pz += stridez

	add	%i2,stridey,%i2		! py += stridey
	ba	.begin1
	sub	counter,1,counter	! counter--

	.align	16
/*
 * Slow path for the first element of a batch when hx0 or hy0 is at or
 * above 0x7f3504f3.  On entry %l3/%l4 hold the masked bit patterns, %f17
 * holds x, and neither px (%i1) nor py (%i2) has been advanced; y is loaded
 * into %f8 here.  The pipeline is empty at this point, so %f20, %f24 and
 * %f44 are free scratch registers.
 *
 *  - Both finite: compute sqrt(x*x + y*y) in double.  If the result
 *    exceeds DFMAX (or compares unordered) store FMAX * FMAX instead,
 *    presumably to deliver the overflowed single-precision result.
 *  - Either is Inf or NaN (label 2): an infinite operand gives +Inf,
 *    otherwise x * y is stored.
 *
 * Every exit steps pz, px and py and returns to .begin1 with counter - 1.
 */
.spec:
	sethi	%hi(0x7f800000),%i0
	cmp	%l3,%i0			! hx ? 0x7f800000
	bge,pt	%icc,2f			! if ( hx >= 0x7f800000 )
	/* delay slot: y is needed on both paths */
	ld	[%i2],%f8

	cmp	%l4,%i0			! hy ? 0x7f800000
	bge,pt	%icc,2f			! if ( hy >= 0x7f800000 )
	nop

	fsmuld	%f17,%f17,%f44		! x * (double)x
	fsmuld	%f8,%f8,%f24		! y * (double)y
	faddd	%f44,%f24,%f24		! x * (double)x + y * (double)y
	fsqrtd	%f24,%f24		! hyp = sqrt(x * (double)x + y * (double)y);
	fcmped	%f24,DFMAX		! hyp ? DMAX
	/* annulled branch: the fmuls in the delay slot runs only if taken */
	fbug,a	1f			! if ( hyp > DMAX )
	fmuls	FMAX,FMAX,%f20		! ftmp0 = FMAX * FMAX;

	fdtos	%f24,%f20		! ftmp0 = (float)hyp;
1:
	st	%f20,[%l7]		! *pz = ftmp0;
	add	%l7,stridez,%l7		! pz += stridez
	add	%i1,stridex,%i1		! px += stridex

	add	%i2,stridey,%i2		! py += stridey
	ba	.begin1
	sub	counter,1,counter	! counter--
2:
	/* the comparison result is unused; per the original note it is here for exceptions */
	fcmps	%f17,%f8		! exceptions
	cmp	%l3,%i0			! hx ? 0x7f800000
	/* annulled branch: the store of 0x7f800000 happens only when taken */
	be,a	%icc,1f			! if ( hx == 0x7f800000 )
	st	%i0,[%l7]		! *(int*)pz = 0x7f800000;

	cmp	%l4,%i0			! hy ? 0x7f800000
	be,a	%icc,1f			! if ( hy == 0x7f800000
	st	%i0,[%l7]		! *(int*)pz = 0x7f800000;

	fmuls	%f17,%f8,%f8		! x * y
	st	%f8,[%l7]		! *pz = x * y;

1:
	add	%l7,stridez,%l7		! pz += stridez
	add	%i1,stridex,%i1		! px += stridex

	add	%i2,stridey,%i2		! py += stridey
	ba	.begin1
	sub	counter,1,counter	! counter--

	.align	16
/*
 * Pipeline-fill handler for the second element of a batch (index 1),
 * reached when its hx test or its zero test fails.
 * The fzeros sits in the delay slot of the ble, so the pending x operand in
 * %f17 is zeroed on both paths.  If counter <= 1 the element lies beyond the
 * batch and nothing else is needed.  Otherwise the cursors of this element
 * (px = %i1, py = %o7 + stridey) and the leftover count are saved for
 * .begin, and the current batch is truncated to 1 element.
 */
.update0:
	cmp	counter,1
	ble	.cont0
	fzeros	%f17

	stx	%i1,[%fp+tmp_px]

	add	%o7,stridey,%i5
	stx	%i5,[%fp+tmp_py]

	sub	counter,1,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont0
	or	%g0,1,counter

	.align	16
/*
 * Pipeline-fill handler for element index 1, reached when its hy test
 * fails.  Zeroes the pending y operand in %f17 (delay slot, both paths).
 * If more than 1 element remains, saves px = %i1 and py = %i5 plus the
 * leftover count for .begin and truncates the batch to 1 element.
 */
.update1:
	cmp	counter,1
	ble	.cont1
	fzeros	%f17

	stx	%i1,[%fp+tmp_px]
	stx	%i5,[%fp+tmp_py]

	sub	counter,1,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont1
	or	%g0,1,counter

	.align	16
/*
 * Pipeline-fill handler for element index 2, reached when its hx test
 * fails.  This stream keeps x in %f8, so that is the register zeroed.
 * If more than 2 elements remain, saves px = %i1 and py = %o4 plus the
 * leftover count for .begin and truncates the batch to 2 elements.
 */
.update2:
	cmp	counter,2
	ble	.cont2
	fzeros	%f8

	stx	%i1,[%fp+tmp_px]
	stx	%o4,[%fp+tmp_py]

	sub	counter,2,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont2
	or	%g0,2,counter

	.align	16
/*
 * Pipeline-fill handler for element index 2, reached when its hy test or
 * its zero test fails.  Zeroes the pending y operand in %f17.
 * If more than 2 elements remain, saves px = %i1 and py = %o4 plus the
 * leftover count for .begin and truncates the batch to 2 elements.
 */
.update3:
	cmp	counter,2
	ble	.cont3
	fzeros	%f17

	stx	%i1,[%fp+tmp_px]
	stx	%o4,[%fp+tmp_py]

	sub	counter,2,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont3
	or	%g0,2,counter

	.align	16
/*
 * Pipeline-fill handler for element index 3, reached when its hx test
 * fails.  Zeroes the pending x operand in %f17.
 * If more than 3 elements remain, saves px = %i1 and py = %i5 plus the
 * leftover count for .begin and truncates the batch to 3 elements.
 */
.update4:
	cmp	counter,3
	ble	.cont4
	fzeros	%f17

	stx	%i1,[%fp+tmp_px]
	stx	%i5,[%fp+tmp_py]

	sub	counter,3,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont4
	or	%g0,3,counter

	.align	16
/*
 * Pipeline-fill handler for element index 3, reached when its hy test or
 * its zero test fails.  Zeroes the pending y operand in %f17.
 * %i1 has already been stepped past this element, so the saved px is
 * %i1 - stridex (formed in %i2, which is not live as a cursor here);
 * py = %i5.  If more than 3 elements remain the batch is truncated to 3.
 */
.update5:
	cmp	counter,3
	ble	.cont5
	fzeros	%f17

	sub	%i1,stridex,%i2
	stx	%i2,[%fp+tmp_px]
	stx	%i5,[%fp+tmp_py]

	sub	counter,3,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont5
	or	%g0,3,counter

	.align	16
/*
 * Pipeline-fill handler for element index 4, reached when its hx test
 * fails.  Zeroes the pending x operand in %f17.
 * If more than 4 elements remain, saves px = %i1 and py = %i2 plus the
 * leftover count for .begin and truncates the batch to 4 elements.
 */
.update6:
	cmp	counter,4
	ble	.cont6
	fzeros	%f17

	stx	%i1,[%fp+tmp_px]
	stx	%i2,[%fp+tmp_py]

	sub	counter,4,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont6
	or	%g0,4,counter

	.align	16
/*
 * Pipeline-fill handler for element index 4, reached when its hy test or
 * its zero test fails.  Zeroes the pending y operand in %f17.
 * %i1 has already been stepped past this element, so the saved px is
 * %i1 - stridex (formed in %o7, which is rewritten after .cont7);
 * py = %i2.  If more than 4 elements remain the batch is truncated to 4.
 */
.update7:
	cmp	counter,4
	ble	.cont7
	fzeros	%f17

	sub	%i1,stridex,%o7
	stx	%o7,[%fp+tmp_px]
	stx	%i2,[%fp+tmp_py]

	sub	counter,4,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont7
	or	%g0,4,counter

	.align	16
/*
 * Pipeline-fill handler for element index 5, reached when its hx test
 * fails.  Zeroes the pending x operand in %f17.
 * %i1 has already been stepped past this element, so the saved px is
 * %i1 - stridex (formed in %o5, which is written later at .cont9);
 * py = %o7.  If more than 5 elements remain the batch is truncated to 5.
 */
.update8:
	cmp	counter,5
	ble	.cont8
	fzeros	%f17

	sub	%i1,stridex,%o5
	stx	%o5,[%fp+tmp_px]
	stx	%o7,[%fp+tmp_py]

	sub	counter,5,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont8
	or	%g0,5,counter

	.align	16
/*
 * Pipeline-fill handler for element index 5, reached when its hy test or
 * its zero test fails.  Zeroes the pending y operand in %f17.
 * Saved cursors: px = %i1 - stridex (via %o5), py = %o7.  If more than 5
 * elements remain the batch is truncated to 5.
 */
.update9:
	cmp	counter,5
	ble	.cont9
	fzeros	%f17

	sub	%i1,stridex,%o5
	stx	%o5,[%fp+tmp_px]
	stx	%o7,[%fp+tmp_py]

	sub	counter,5,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont9
	or	%g0,5,counter

	.align	16
/*
 * Pipeline-fill handler for element index 6, reached when its hx test
 * fails.  The branch skips four pipeline instructions that sit between it
 * and .cont10, so they are repeated here first; the skipped x load is
 * replaced by zeroing %f17.
 * Saved cursors: px = %i1, py = %o7 + stridey (formed in %i5).  If more
 * than 6 elements remain the batch is truncated to 6.
 */
.update10:
	fmul8x16	SCALE,%f56,%f36	! (3_1) db0 = vis_fmul8x16(SCALE, db0);
	and	%l4,_0x7fffffff,%l4	! (4_0) hy0 &= 0x7fffffff;
	ldd	[%g1+8],%f56		! (3_1) dtmp0 = ((double*)si0)[1];
	faddd	%f54,K1,%f54		! (4_1) res0 += K1;

	cmp	counter,6
	ble	.cont10
	fzeros	%f17

	stx	%i1,[%fp+tmp_px]
	add	%o7,stridey,%i5
	stx	%i5,[%fp+tmp_py]

	sub	counter,6,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont10
	or	%g0,6,counter

	.align	16
/*
 * Main-loop handler for stream (4_1), reached when its hy test or zero
 * test fails.  Inside the loop counter has already been reduced by 5, so
 * the element is index 1 relative to it.  Zeroes the pending y operand in
 * %f17.  If more than 1 element remains, saves px = %i1 and py = %i5 plus
 * the leftover count for .begin and truncates the batch to 1.
 */
.update11:
	cmp	counter,1
	ble	.cont11
	fzeros	%f17

	stx	%i1,[%fp+tmp_px]
	stx	%i5,[%fp+tmp_py]

	sub	counter,1,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont11
	or	%g0,1,counter

	.align	16
/*
 * Main-loop handler for stream (0_0), relative index 2, reached when its
 * hx test fails.  This stream keeps x in %f8, so that is the register
 * zeroed.  Saved cursors: px = %i0, py = %i5 + stridey (formed in %o4,
 * which the loop recomputes to the same value after .cont12).  If more than
 * 2 elements remain the batch is truncated to 2.
 */
.update12:
	cmp	counter,2
	ble	.cont12
	fzeros	%f8

	stx	%i0,[%fp+tmp_px]
	add	%i5,stridey,%o4
	stx	%o4,[%fp+tmp_py]

	sub	counter,2,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont12
	or	%g0,2,counter

	.align	16
/*
 * Main-loop handler for stream (0_0), relative index 2, reached when its
 * hy test or zero test fails.  Zeroes the pending y operand in %f17.
 * Saved cursors: px = %i0, py = %o4.  If more than 2 elements remain the
 * batch is truncated to 2.
 */
.update13:
	cmp	counter,2
	ble	.cont13
	fzeros	%f17

	stx	%i0,[%fp+tmp_px]
	stx	%o4,[%fp+tmp_py]

	sub	counter,2,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont13
	or	%g0,2,counter

	.align	16
/*
 * Main-loop handler for stream (1_0), relative index 3, reached when its
 * hx test fails.  Zeroes the pending x operand in %f17.
 * Saved cursors: px = %i1, py = %o4 + stridey (formed in %i5, which the
 * loop recomputes to the same value after .cont14).  If more than 3
 * elements remain the batch is truncated to 3.
 */
.update14:
	cmp	counter,3
	ble	.cont14
	fzeros	%f17

	stx	%i1,[%fp+tmp_px]
	add	%o4,stridey,%i5
	stx	%i5,[%fp+tmp_py]

	sub	counter,3,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont14
	or	%g0,3,counter

	.align	16
/*
 * Main-loop handler for stream (1_0), relative index 3, reached when its
 * hy test or zero test fails.  Zeroes the pending y operand in %f17.
 * %i1 has already been stepped past this element, so the saved px is
 * %i1 - stridex (formed in %i2, which is rewritten after .cont15);
 * py = %i5.  If more than 3 elements remain the batch is truncated to 3.
 */
.update15:
	cmp	counter,3
	ble	.cont15
	fzeros	%f17

	sub	%i1,stridex,%i2
	stx	%i2,[%fp+tmp_px]
	stx	%i5,[%fp+tmp_py]

	sub	counter,3,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont15
	or	%g0,3,counter

	.align	16
/*
 * Main-loop handler for stream (2_0), relative index 4, reached when its
 * hx test fails.  The branch skips the faddd that sits between it and
 * .cont16, so that instruction is repeated here first.  Zeroes the pending
 * x operand in %f17.  Saved cursors: px = %i1, py = %i2.  If more than 4
 * elements remain the batch is truncated to 4.
 */
.update16:
	faddd	%f40,%f32,%f18		! (1_0) db0 = dx0 + dy0;
	cmp	counter,4
	ble	.cont16
	fzeros	%f17

	stx	%i1,[%fp+tmp_px]
	stx	%i2,[%fp+tmp_py]

	sub	counter,4,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont16
	or	%g0,4,counter

	.align	16
/*
 * Main-loop handler for stream (2_0), relative index 4, reached when its
 * hy test or zero test fails.  Zeroes the pending y operand in %f17.
 * In the loop the next px goes to %l7 rather than back into %i1, so %i1
 * still addresses this element: px = %i1, py = %i2.  If more than 4
 * elements remain the batch is truncated to 4.
 */
.update17:
	cmp	counter,4
	ble	.cont17
	fzeros	%f17

	stx	%i1,[%fp+tmp_px]
	stx	%i2,[%fp+tmp_py]

	sub	counter,4,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont17
	or	%g0,4,counter

	.align	16
/*
 * Main-loop handler for stream (3_0), relative index 5, reached when its
 * hx test fails.  Zeroes the pending x operand in %f17.
 * At this point %l7 is the px cursor of the element (not pz) and %o7 its
 * py cursor.  If more than 5 elements remain the batch is truncated to 5.
 */
.update18:
	cmp	counter,5
	ble	.cont18
	fzeros	%f17

	stx	%l7,[%fp+tmp_px]
	stx	%o7,[%fp+tmp_py]

	sub	counter,5,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont18
	or	%g0,5,counter

	.align	16
/*
 * Main-loop handler for stream (3_0), relative index 5, reached when its
 * hy test fails.  The branch skips the fpadd32 that sits between it and
 * .cont19, so that instruction is repeated here first.  Zeroes the pending
 * y operand in %f17.  Saved cursors: px = %l7, py = %o7.  If more than 5
 * elements remain the batch is truncated to 5.
 */
.update19:
	fpadd32	%f40,DA1,%f62		! (2_1) db0 = vis_fpadd32(db0,DA1);
	cmp	counter,5
	ble	.cont19
	fzeros	%f17

	stx	%l7,[%fp+tmp_px]
	stx	%o7,[%fp+tmp_py]

	sub	counter,5,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont19
	or	%g0,5,counter

	.align	16
/*
 * Main-loop handler for stream (3_0), relative index 5, reached when the
 * deferred zero test (orcc at .cont19, bz just before .cont19a) fires.
 * Zeroes %f17.  Saved cursors: px = %l7, py = %o7.  If more than 5
 * elements remain the batch is truncated to 5.
 */
.update19a:
	cmp	counter,5
	ble	.cont19a
	fzeros	%f17

	stx	%l7,[%fp+tmp_px]
	stx	%o7,[%fp+tmp_py]

	sub	counter,5,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont19a
	or	%g0,5,counter

	.align	16
/*
 * Main-loop handler for stream (4_0), relative index 6, reached when its
 * hx test fails.  The branch skips the faddd and the x load that sit
 * between it and .cont20; the faddd is repeated here and the load is
 * replaced by zeroing %f17.
 * Saved cursors: px = %i1, py = %o7 + stridey (formed in %g1, which is
 * reloaded from tmp3 after .cont20).  If more than 6 elements remain the
 * batch is truncated to 6.
 */
.update20:
	faddd	%f54,K1,%f54		! (4_1) res0 += K1;
	cmp	counter,6
	ble	.cont20
	fzeros	%f17

	stx	%i1,[%fp+tmp_px]
	add	%o7,stridey,%g1
	stx	%g1,[%fp+tmp_py]

	sub	counter,6,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont20
	or	%g0,6,counter

/*
 * Common return, reached from .begin1 once counter <= 0.  No value is
 * returned in a register; the restore in the delay slot of ret pops the
 * register window opened by the save at entry.
 */
.exit:
	ret
	restore
	SET_SIZE(__vhypotf)