/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

	.file	"__vrsqrt.S"

#include "libm.h"

	RO_DATA
	.align	64

! Constant pool for __vrsqrt.  Each entry is one 64-bit value written as
! two 32-bit words, most significant word first.  The entry code fetches
! the entries with ldd at fixed byte offsets from .CONST_TBL (0x00 for K1
! through 0x60 for DONE), so entries must not be reordered, inserted or
! removed without updating those offsets.
.CONST_TBL:
! Offsets 0x00-0x28: coefficients of the polynomial in xx that the
! algorithm evaluates by the Horner scheme, starting with K6 and ending
! with K1 (there is no constant term; the last step multiplies by xx).
	.word	0xbfe00000, 0x0000002f	! K1 =-5.00000000000005209867e-01;
	.word	0x3fd80000, 0x00000058	! K2 = 3.75000000000004884257e-01;
	.word	0xbfd3ffff, 0xff444bc8	! K3 =-3.12499999317136886551e-01;
	.word	0x3fd17fff, 0xff5006fe	! K4 = 2.73437499359815081532e-01;
	.word	0xbfcf80bb, 0xb33ef574	! K5 =-2.46116125605037803130e-01;
	.word	0x3fcce0af, 0xf8156949	! K6 = 2.25606914648617522896e-01;

! Offsets 0x30-0x50: bit patterns used with the VIS logical/partitioned ops.
!   DC0  fand mask: drops the sign and all exponent bits except the lowest
!        one, keeping that bit plus the 52 fraction bits.
!   DC1  or-ed in after DC0: forces the exponent field to 0x3fe, or 0x3ff
!        when the kept exponent bit is set.
!   DC2  added with fpadd32 to form res_c: adds 0x2000 to the high word,
!        which is half of the lowest bit kept by DC3 (round before mask).
!   DC3  fand mask applied after DC2: keeps the high word down to bit 14
!        and clears everything below, including the whole low word.
!   DC4  fand mask used in the algorithm only for inputs below 0x00100000
!        with hx >= 0x00080000: clears bit 51 and everything above it.
!        NOTE(review): the load of DC4 (offset 0x50) is not in the part of
!        the routine visible here - confirm against the special-case code.
	.word	0x001fffff, 0xffffffff	! DC0
	.word	0x3fe00000, 0x00000000	! DC1
	.word	0x00002000, 0x00000000	! DC2
	.word	0x7fffc000, 0x00000000	! DC3
	.word	0x0007ffff, 0xffffffff	! DC4

! Offsets 0x58-0x60: ordinary double constants.  D2ON51 is added back after
! DC4 has been applied; DONE is the numerator of DONE / res in the
! special-value paths of the algorithm.
	.word	0x43200000, 0x00000000	! D2ON51  = pow(2,51)
	.word	0x3ff00000, 0x00000000	! DONE   = 1.0

! Symbolic names for the integer registers that stay live across the loop.
!   stridex, stridey  third and fifth incoming arguments shifted left by 3
!                     (element strides turned into byte strides); added to
!                     the input and output pointers respectively.
!   counter           loop count for the current pass; tested against 0 at
!                     .begin1 and reduced by 7 per main-loop iteration.
!   TBL               address of __vlibm_TBL_rsqrt, the lookup table that
!                     the algorithm indexes as (char*)arr + hx.
!   _0x7ff00000,
!   _0x00100000       the two thresholds that hx is compared against.
#define stridex		%l5
#define stridey		%l7
#define counter		%l0
#define TBL		%l3
#define _0x7ff00000	%o0
#define _0x00100000	%o1

! Floating-point registers that hold the entries of .CONST_TBL, loaded once
! in the entry code.  DONE is the exception: it is loaded at .begin1, and
! %f4 is also written as a working register inside the main loop.
#define DC0		%f56
#define DC1		%f54
#define DC2		%f48
#define DC3		%f46
#define K6		%f42
#define K5		%f20
#define K4		%f52
#define K3		%f50
#define K2		%f14
#define K1		%f12
#define DONE		%f4

! Saved element count and input pointer.  They are initialised from the
! first two incoming arguments and copied into counter and %i1 at .begin,
! after which tmp_counter is cleared.
! NOTE(review): presumably refilled by the .update* paths, which are not
! visible in this part of the file - confirm there.
#define tmp_counter	%g5
#define tmp_px		%o5

! Eight 8-byte scratch slots addressed as [%fp+tmpN], located directly
! below %fp+STACK_BIAS.  The pipeline stores each 64-bit lexp pattern with
! stx and reads it back with ldd as the double dlexp, which moves the value
! from an integer register to a floating-point register through memory.
#define tmp0		STACK_BIAS-0x40
#define tmp1		STACK_BIAS-0x38
#define tmp2		STACK_BIAS-0x30
#define tmp3		STACK_BIAS-0x28
#define tmp4		STACK_BIAS-0x20
#define tmp5		STACK_BIAS-0x18
#define tmp6		STACK_BIAS-0x10
#define tmp7		STACK_BIAS-0x08

! sizeof temp storage - must be a multiple of 16 for V9
! (0x40 = the eight 8-byte slots above; subtracted from %sp by the save
! instruction at function entry in addition to SA(MINFRAME).)
#define tmps		0x40

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!      !!!!!   algorithm   !!!!!
!  ((float*)&res)[0] = ((float*)px)[0];
!  ((float*)&res)[1] = ((float*)px)[1];
!  hx = *(int*)px;
!  if ( hx >= 0x7ff00000 )
!  {
!    res = DONE / res;
!    ((float*)py)[0] = ((float*)&res)[0];
!    ((float*)py)[1] = ((float*)&res)[1];
!    px += stridex;
!    py += stridey;
!    continue;
!  }
!  if ( hx < 0x00100000 )
!  {
!    ax = hx & 0x7fffffff;
!    lx = ((int*)px)[1];
!
!    if ( (ax | lx) == 0 )
!    {
!      res = DONE / res;
!      ((float*)py)[0] = ((float*)&res)[0];
!      ((float*)py)[1] = ((float*)&res)[1];
!      px += stridex;
!      py += stridey;
!      continue;
!    }
!    else if ( hx >= 0 )
!    {
!      if ( hx < 0x00080000 )
!      {
!        res = *(long long*)&res;
!        hx = *(int*)&res - (537 << 21);
!      }
!      else
!      {
!        res = vis_fand(res,DC4);
!        res = *(long long*)&res;
!        res += D2ON51;
!        hx = *(int*)&res - (537 << 21);
!      }
!    }
!    else
!    {
!      res = sqrt(res);
!      ((float*)py)[0] = ((float*)&res)[0];
!      ((float*)py)[1] = ((float*)&res)[1];
!      px += stridex;
!      py += stridey;
!      continue;
!    }
!  }
!
!  iexp = hx >> 21;
!  iexp = -iexp;
!  iexp += 0x5fe;
!  lexp = iexp << 52;
!  dlexp = *(double*)&lexp;
!  hx >>= 10;
!  hx &= 0x7f8;
!  hx += 8;
!  hx &= -16;
!
!  res = vis_fand(res,DC0);
!  res = vis_for(res,DC1);
!  res_c = vis_fpadd32(res,DC2);
!  res_c = vis_fand(res_c,DC3);
!
!  addr = (char*)arr + hx;
!  dexp_hi = ((double*)addr)[0];
!  dexp_lo = ((double*)addr)[1];
!  dtmp0 = dexp_hi * dexp_hi;
!  xx = res - res_c;
!  xx *= dtmp0;
!  res = K6 * xx;
!  res += K5;
!  res *= xx;
!  res += K4;
!  res *= xx;
!  res += K3;
!  res *= xx;
!  res += K2;
!  res *= xx;
!  res += K1;
!  res *= xx;
!  res = dexp_hi * res;
!  res += dexp_lo;
!  res += dexp_hi;
!
!  res *= dlexp;
!
!  ((float*)py)[0] = ((float*)&res)[0];
!  ((float*)py)[1] = ((float*)&res)[1];
!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

	ENTRY(__vrsqrt)
	save	%sp,-SA(MINFRAME)-tmps,%sp
	PIC_SETUP(l7)
	PIC_SET(l7,.CONST_TBL,o3)
	PIC_SET(l7,__vlibm_TBL_rsqrt,l3)
	wr	%g0,0x82,%asi

	ldd	[%o3],K1
	sethi	%hi(0x7ff00000),%o0
	mov	%i3,%o4

	ldd	[%o3+0x08],K2
	sethi	%hi(0x00100000),%o1
	mov	%i1,tmp_px

	ldd	[%o3+0x10],K3
	sll	%i2,3,stridex
	mov	%i0,tmp_counter

	ldd	[%o3+0x18],K4
	sll	%i4,3,stridey

	ldd	[%o3+0x20],K5
	ldd	[%o3+0x28],K6
	ldd	[%o3+0x30],DC0
	ldd	[%o3+0x38],DC1
	ldd	[%o3+0x40],DC2
	ldd	[%o3+0x48],DC3

.begin:
	mov	tmp_counter,counter
	mov	tmp_px,%i1
	clr	tmp_counter
.begin1:
	cmp	counter,0
	ble,pn	%icc,.exit
	ldd	[%o3+0x60],DONE

	lda	[%i1]%asi,%f0		! (6_0) ((float*)res)[0] = ((float*)px)[0];
	sethi	%hi(0x7ffffc00),%i0

	lda	[%i1+4]%asi,%f1		! (6_0) ((float*)res)[1] = ((float*)px)[1];
	add	%i0,1023,%i0

	fand	%f0,DC0,%f16		! (6_0) res = vis_fand(res,DC0);

	lda	[%i1]%asi,%g1		! (6_1) hx = *(int*)px;
	sethi	%hi(0x00080000),%i4

	lda	[%i1+4]%asi,%l4
	add	%i1,stridex,%l6		! px += stridex

	sra	%g1,21,%o7		! (6_1) iexp = hx >> 21;
	lda	[%l6]%asi,%f8		! (0_0) ((float*)res)[0] = ((float*)px)[0];
	for	%f16,DC1,%f44		! (6_1) res = vis_for(res,DC1);

	lda	[%l6+4]%asi,%f9		! (0_0) ((float*)res)[1] = ((float*)px)[1];
	sra	%g1,10,%o2		! (6_1) hx >>= 10;
	and	%g1,%i0,%i2

	cmp	%g1,_0x7ff00000		! (6_1) hx ? 0x7ff00000
	bge,pn	%icc,.spec0		! (6_1) if ( hx >= 0x7ff00000 )
	and	%o2,2040,%o2		! (6_1) hx &= 0x7f8;

	cmp	%g1,_0x00100000		! (6_1) hx ? 0x00100000
	bl,pn	%icc,.spec1		! (6_1) if ( hx < 0x00100000 )
	sub	%g0,%o7,%o7		! (6_1) iexp = -iexp;
.cont_spec:
	fand	%f8,DC0,%f16		! (0_0) res = vis_fand(res,DC0);

	fpadd32	%f44,DC2,%f18		! (6_1) res_c = vis_fpadd32(res,DC2);

	add	%o2,8,%l4		! (6_1) hx += 8;

	add	%o7,1534,%o7		! (6_1) iexp += 0x5fe;

	lda	[%l6]%asi,%g1		! (0_0) hx = *(int*)px;
	sllx	%o7,52,%o7		! (6_1) iexp << 52;
	and	%l4,-16,%l4		! (6_1) hx = -16;

	add	%l4,TBL,%l4		! (6_1) addr = (char*)arr + hx;
	stx	%o7,[%fp+tmp1]		! (6_1) dlexp = *(double*)lexp;

	add	%l6,stridex,%l6		! px += stridex
	ldd	[%l4],%f30		! (6_1) dtmp0 = ((double*)addr)[0];

	sra	%g1,21,%o7		! (0_0) iexp = hx >> 21;
	lda	[%l6]%asi,%f0		! (1_0) ((float*)res)[0] = ((float*)px)[0];
	for	%f16,DC1,%f28		! (0_0) res = vis_for(res,DC1);

	sra	%g1,10,%o2		! (0_0) hx >>= 10;
	sub	%g0,%o7,%o7		! (0_0) iexp = -iexp;
	lda	[%l6+4]%asi,%f1		! (1_0) ((float*)res)[1] = ((float*)px)[1];

	cmp	%g1,_0x7ff00000		! (0_0) hx ? 0x7ff00000
	bge,pn	%icc,.update0		! (0_0) if ( hx >= 0x7ff00000 )
	fand	%f18,DC3,%f6		! (6_1) res_c = vis_fand(res_c,DC3);
.cont0:
	and	%o2,2040,%o2		! (0_0) hx &= 0x7f8;
	fmuld	%f30,%f30,%f10		! (6_1) dtmp0 = dexp_hi * dexp_hi;

	cmp	%g1,_0x00100000		! (0_0) hx ? 0x00100000
	bl,pn	%icc,.update1		! (0_0) if ( hx < 0x00100000 )
	add	%o7,1534,%o7		! (0_0) iexp += 0x5fe;
.cont1:
	fand	%f0,DC0,%f16		! (1_0) res = vis_fand(res,DC0);

	fpadd32	%f28,DC2,%f18		! (0_0) res_c = vis_fpadd32(res,DC2);

	add	%o2,8,%l2		! (0_0) hx += 8;
	fsubd	%f44,%f6,%f6		! (6_1) xx = res - res_c;

	lda	[%l6]%asi,%g1		! (1_0) hx = *(int*)px;
	sllx	%o7,52,%o7		! (0_0) iexp << 52;
	and	%l2,-16,%l2		! (0_0) hx = -16;

	add	%l2,TBL,%l2		! (0_0) addr = (char*)arr + hx;
	add	%l6,stridex,%l6		! px += stridex
	stx	%o7,[%fp+tmp2]		! (0_0) dlexp = *(double*)lexp;

	fmuld	%f6,%f10,%f26		! (6_1) xx *= dtmp0;
	ldd	[%l2],%f10		! (0_0) dtmp0 = ((double*)addr)[0];

	sra	%g1,21,%o7		! (1_0) iexp = hx >> 21;
	lda	[%l6]%asi,%f6		! (2_0) ((float*)res)[0] = ((float*)px)[0];
	for	%f16,DC1,%f44		! (1_0) res = vis_for(res,DC1);

	sra	%g1,10,%o2		! (1_0) hx >>= 10;
	cmp	%g1,_0x7ff00000		! (1_0) hx ? 0x7ff00000
	bge,pn	%icc,.update2		! (1_0) if ( hx >= 0x7ff00000 )
	lda	[%l6+4]%asi,%f7		! (2_0) ((float*)res)[1] = ((float*)px)[1];
.cont2:
	fand	%f18,DC3,%f8		! (0_0) res_c = vis_fand(res_c,DC3);

	fmuld	%f10,%f10,%f10		! (0_0) dtmp0 = dexp_hi * dexp_hi;
	cmp	%g1,_0x00100000		! (1_0) hx ? 0x00100000
	bl,pn	%icc,.update3		! (1_0) if ( hx < 0x00100000 )
	and	%o2,2040,%o2		! (1_0) hx &= 0x7f8;
.cont3:
	sub	%g0,%o7,%o7		! (1_0) iexp = -iexp;
	fand	%f6,DC0,%f16		! (2_0) res = vis_fand(res,DC0);

	add	%o7,1534,%o7		! (1_0) iexp += 0x5fe;
	fpadd32	%f44,DC2,%f18		! (1_0) res_c = vis_fpadd32(res,DC2);

	fmuld	K6,%f26,%f62		! (6_1) res = K6 * xx;
	add	%o2,8,%i2		! (1_0) hx += 8;
	fsubd	%f28,%f8,%f32		! (0_0) xx = res - res_c;

	lda	[%l6]%asi,%g1		! (2_0) hx = *(int*)px;
	sllx	%o7,52,%o7		! (1_0) iexp << 52;
	and	%i2,-16,%i2		! (1_0) hx = -16;

	add	%i2,TBL,%i2		! (1_0) addr = (char*)arr + hx;
	stx	%o7,[%fp+tmp3]		! (1_0) dlexp = *(double*)lexp;

	fmuld	%f32,%f10,%f32		! (0_0) xx *= dtmp0;
	add	%l6,stridex,%l6		! px += stridex
	ldd	[%i2],%f10		! (1_0) dtmp0 = ((double*)addr)[0];
	faddd	%f62,K5,%f62		! (6_1) res += K5;

	sra	%g1,21,%o7		! (2_0) iexp = hx >> 21;
	lda	[%l6]%asi,%f0		! (3_0) ((float*)res)[0] = ((float*)px)[0];
	for	%f16,DC1,%f28		! (2_0) res = vis_for(res,DC1);

	sra	%g1,10,%o2		! (2_0) hx >>= 10;
	cmp	%g1,_0x7ff00000		! (2_0) hx ? 0x7ff00000
	bge,pn	%icc,.update4		! (2_0) if ( hx >= 0x7ff00000 )
	lda	[%l6+4]%asi,%f1		! (3_0) ((float*)res)[1] = ((float*)px)[1];
.cont4:
	fmuld	%f62,%f26,%f40		! (6_1) res *= xx;
	fand	%f18,DC3,%f8		! (1_0) res_c = vis_fand(res_c,DC3);

	fmuld	%f10,%f10,%f10		! (1_0) dtmp0 = dexp_hi * dexp_hi;
	cmp	%g1,_0x00100000		! (2_0) hx ? 0x00100000
	bl,pn	%icc,.update5		! (2_0) if ( hx < 0x00100000 )
	and	%o2,2040,%o2		! (2_0) hx &= 0x7f8;
.cont5:
	sub	%g0,%o7,%o7		! (2_0) iexp = -iexp;
	fand	%f0,DC0,%f16		! (3_0) res = vis_fand(res,DC0);

	add	%o7,1534,%o7		! (2_0) iexp += 0x5fe;
	fpadd32	%f28,DC2,%f18		! (2_0) res_c = vis_fpadd32(res,DC2);

	fmuld	K6,%f32,%f62		! (0_0) res = K6 * xx;
	add	%o2,8,%i4		! (2_0) hx += 8;
	fsubd	%f44,%f8,%f6		! (1_0) xx = res - res_c;

	faddd	%f40,K4,%f40		! (6_1) res += K4;

	lda	[%l6]%asi,%g1		! (3_0) hx = *(int*)px;
	sllx	%o7,52,%o7		! (2_0) iexp << 52;
	and	%i4,-16,%i4		! (2_0) hx = -16;

	add	%i4,TBL,%i4		! (2_0) addr = (char*)arr + hx;
	stx	%o7,[%fp+tmp4]		! (2_0) dlexp = *(double*)lexp;

	fmuld	%f6,%f10,%f38		! (1_0) xx *= dtmp0;
	ldd	[%i4],%f24		! (2_0) dtmp0 = ((double*)addr)[0];
	faddd	%f62,K5,%f62		! (0_0) res += K5;

	fmuld	%f40,%f26,%f34		! (6_1) res *= xx;
	add	%l6,stridex,%l6		! px += stridex

	sra	%g1,21,%o7		! (3_0) iexp = hx >> 21;
	lda	[%l6]%asi,%f8		! (4_0) ((float*)res)[0] = ((float*)px)[0];
	for	%f16,DC1,%f44		! (3_0) res = vis_for(res,DC1);

	sra	%g1,10,%o2		! (3_0) hx >>= 10;
	cmp	%g1,_0x7ff00000		! (3_0) hx ? 0x7ff00000
	bge,pn	%icc,.update6		! (3_0) if ( hx >= 0x7ff00000 )
	lda	[%l6+4]%asi,%f9		! (4_0) ((float*)res)[1] = ((float*)px)[1];
.cont6:
	fmuld	%f62,%f32,%f60		! (0_0) res *= xx;
	cmp	%g1,_0x00100000		! (3_0) hx ? 0x00100000
	fand	%f18,DC3,%f22		! (2_0) res_c = vis_fand(res_c,DC3);

	fmuld	%f24,%f24,%f24		! (2_0) dtmp0 = dexp_hi * dexp_hi;
	bl,pn	%icc,.update7		! (3_0) if ( hx < 0x00100000 )
	and	%o2,2040,%o2		! (3_0) hx &= 0x7f8;
	faddd	%f34,K3,%f6		! (6_1) res += K3;
.cont7:
	sub	%g0,%o7,%o7		! (3_0) iexp = -iexp;
	fand	%f8,DC0,%f16		! (4_0) res = vis_fand(res,DC0);

	add	%o7,1534,%o7		! (3_0) iexp += 0x5fe;
	fpadd32	%f44,DC2,%f18		! (3_0) res_c = vis_fpadd32(res,DC2);

	fmuld	K6,%f38,%f62		! (1_0) res = K6 * xx;
	add	%o2,8,%i5		! (3_0) hx += 8;
	fsubd	%f28,%f22,%f28		! (2_0) xx = res - res_c;

	fmuld	%f6,%f26,%f22		! (6_1) res *= xx;
	faddd	%f60,K4,%f60		! (0_0) res += K4;

	lda	[%l6]%asi,%g1		! (4_0) hx = *(int*)px;
	sllx	%o7,52,%o7		! (3_0) iexp << 52;
	and	%i5,-16,%i5		! (3_0) hx = -16;

	add	%i5,TBL,%i5		! (3_0) addr = (char*)arr + hx;
	stx	%o7,[%fp+tmp5]		! (3_0) dlexp = *(double*)lexp;

	fmuld	%f28,%f24,%f36		! (2_0) xx *= dtmp0;
	add	%l6,stridex,%i0		! px += stridex
	ldd	[%i5],%f28		! (3_0) dtmp0 = ((double*)addr)[0];
	faddd	%f62,K5,%f62		! (1_0) res += K5;

	faddd	%f22,K2,%f10		! (6_1) res += K2;
	fmuld	%f60,%f32,%f34		! (0_0) res *= xx;

	sra	%g1,21,%o7		! (4_0) iexp = hx >> 21;
	lda	[%i0]%asi,%f0		! (5_0) ((float*)res)[0] = ((float*)px)[0];
	for	%f16,DC1,%f24		! (4_0) res = vis_for(res,DC1);

	sra	%g1,10,%o2		! (4_0) hx >>= 10;
	cmp	%g1,_0x7ff00000		! (4_0) hx ? 0x7ff00000
	bge,pn	%icc,.update8		! (4_0) if ( hx >= 0x7ff00000 )
	lda	[%i0+4]%asi,%f1		! (5_0) ((float*)res)[1] = ((float*)px)[1];
.cont8:
	fand	%f18,DC3,%f40		! (3_0) res_c = vis_fand(res_c,DC3);
	fmuld	%f62,%f38,%f62		! (1_0) res *= xx;

	fmuld	%f10,%f26,%f58		! (6_1) res *= xx;
	cmp	%g1,_0x00100000		! (4_0) hx ? 0x00100000
	and	%o2,2040,%o2		! (4_0) hx &= 0x7f8;
	faddd	%f34,K3,%f60		! (0_0) res += K3;

	fmuld	%f28,%f28,%f28		! (3_0) dtmp0 = dexp_hi * dexp_hi;
	bl,pn	%icc,.update9		! (4_0) if ( hx < 0x00100000 )
	sub	%g0,%o7,%o7		! (4_0) iexp = -iexp;
	fand	%f0,DC0,%f16		! (5_0) res = vis_fand(res,DC0);
.cont9:
	add	%o7,1534,%o7		! (4_0) iexp += 0x5fe;
	fpadd32	%f24,DC2,%f18		! (4_0) res_c = vis_fpadd32(res,DC2);

	fmuld	K6,%f36,%f10		! (2_0) res = K6 * xx;
	add	%o2,8,%l1		! (4_0) hx += 8;
	fsubd	%f44,%f40,%f44		! (3_0) xx = res - res_c;

	fmuld	%f60,%f32,%f60		! (0_0) res *= xx;
	faddd	%f62,K4,%f6		! (1_0) res += K4;

	lda	[%i0]%asi,%g1		! (5_0) hx = *(int*)px;
	sllx	%o7,52,%o7		! (4_0) iexp << 52;
	and	%l1,-16,%l1		! (4_0) hx = -16;
	faddd	%f58,K1,%f58		! (6_1) res += K1;

	add	%i0,stridex,%i1		! px += stridex
	add	%l1,TBL,%l1		! (4_0) addr = (char*)arr + hx;
	stx	%o7,[%fp+tmp6]		! (4_0) dlexp = *(double*)lexp;

	fmuld	%f44,%f28,%f40		! (3_0) xx *= dtmp0;
	ldd	[%l1],%f44		! (4_0) dtmp0 = ((double*)addr)[0];
	faddd	%f10,K5,%f62		! (2_0) res += K5;

	fmuld	%f6,%f38,%f34		! (1_0) res *= xx;
	sra	%g1,21,%o7		! (5_0) iexp = hx >> 21;
	nop
	faddd	%f60,K2,%f60		! (0_0) res += K2;

	for	%f16,DC1,%f28		! (5_0) res = vis_for(res,DC1);
	sub	%g0,%o7,%o7		! (5_0) iexp = -iexp;
	lda	[%i1]%asi,%f6		! (6_0) ((float*)res)[0] = ((float*)px)[0];
	fmuld	%f58,%f26,%f26		! (6_1) res *= xx;

	sra	%g1,10,%o2		! (5_0) hx >>= 10;
	cmp	%g1,_0x7ff00000		! (5_0) hx ? 0x7ff00000
	bge,pn	%icc,.update10		! (5_0) if ( hx >= 0x7ff00000 )
	lda	[%i1+4]%asi,%f7		! (6_0) ((float*)res)[1] = ((float*)px)[1];
.cont10:
	fand	%f18,DC3,%f8		! (4_0) res_c = vis_fand(res_c,DC3);
	fmuld	%f62,%f36,%f62		! (2_0) res *= xx;

	fmuld	%f60,%f32,%f58		! (0_0) res *= xx;
	cmp	%g1,_0x00100000		! (5_0) hx ? 0x00100000
	and	%o2,2040,%o2		! (5_0) hx &= 0x7f8;
	faddd	%f34,K3,%f34		! (1_0) res += K3;

	fmuld	%f30,%f26,%f26		! (6_1) res = dexp_hi * res;
	bl,pn	%icc,.update11		! (5_0) if ( hx < 0x00100000 )
	nop
	fand	%f6,DC0,%f16		! (6_0) res = vis_fand(res,DC0);
.cont11:
	ldd	[%l4+8],%f60		! (6_1) dexp_lo = ((double*)addr)[1];
	fmuld	%f44,%f44,%f44		! (4_0) dtmp0 = dexp_hi * dexp_hi;
	fpadd32	%f28,DC2,%f18		! (5_0) res_c = vis_fpadd32(res,DC2);

	fmuld	K6,%f40,%f22		! (3_0) res = K6 * xx;
	add	%o2,8,%i3		! (5_0) hx += 8;
	fsubd	%f24,%f8,%f10		! (4_0) xx = res - res_c;

	fmuld	%f34,%f38,%f24		! (1_0) res *= xx;
	or	%g0,%o4,%i0

	cmp	counter,7
	bl,pn	%icc,.tail
	faddd	%f62,K4,%f34		! (2_0) res += K4;

	ba	.main_loop
	sub	counter,7,counter	! counter

	.align	16
.main_loop:
	add	%o7,1534,%o7		! (5_0) iexp += 0x5fe;
	and	%i3,-16,%i3		! (5_1) hx = -16;
	lda	[%i1]%asi,%g1		! (6_1) hx = *(int*)px;
	faddd	%f58,K1,%f58		! (0_1) res += K1;

	add	%i3,TBL,%i3		! (5_1) addr = (char*)arr + hx;
	sllx	%o7,52,%o7		! (5_1) iexp << 52;
	stx	%o7,[%fp+tmp0]		! (5_1) dlexp = *(double*)lexp;
	faddd	%f26,%f60,%f8		! (6_2) res += dexp_lo;

	faddd	%f22,K5,%f62		! (3_1) res += K5;
	add	%i1,stridex,%l6		! px += stridex
	ldd	[%i3],%f22		! (5_1) dtmp0 = ((double*)addr)[0];
	fmuld	%f10,%f44,%f60		! (4_1) xx *= dtmp0;

	faddd	%f24,K2,%f26		! (1_1) res += K2;
	add	%i0,stridey,%i1		! px += stridey
	ldd	[%l2],%f24		! (0_1) dexp_hi = ((double*)addr)[0];
	fmuld	%f34,%f36,%f34		! (2_1) res *= xx;

	fmuld	%f58,%f32,%f58		! (0_1) res *= xx;
	sra	%g1,21,%o7		! (6_1) iexp = hx >> 21;
	lda	[%l6]%asi,%f0		! (0_0) ((float*)res)[0] = ((float*)px)[0];
	for	%f16,DC1,%f44		! (6_1) res = vis_for(res,DC1);

	lda	[%l6+4]%asi,%f1		! (0_0) ((float*)res)[1] = ((float*)px)[1];
	sra	%g1,10,%o2		! (6_1) hx >>= 10;
	fmuld	%f22,%f22,%f10		! (5_1) dtmp0 = dexp_hi * dexp_hi;
	faddd	%f8,%f30,%f30		! (6_2) res += dexp_hi;

	fmuld	%f62,%f40,%f32		! (3_1) res *= xx;
	cmp	%g1,_0x7ff00000		! (6_1) hx ? 0x7ff00000
	ldd	[%fp+tmp1],%f62		! (6_2) dlexp = *(double*)lexp;
	fand	%f18,DC3,%f8		! (5_1) res_c = vis_fand(res_c,DC3);

	fmuld	%f26,%f38,%f26		! (1_1) res *= xx;
	bge,pn	%icc,.update12		! (6_1) if ( hx >= 0x7ff00000 )
	and	%o2,2040,%o2		! (6_1) hx &= 0x7f8;
	faddd	%f34,K3,%f34		! (2_1) res += K3;
.cont12:
	fmuld	%f24,%f58,%f58		! (0_1) res = dexp_hi * res;
	cmp	%g1,_0x00100000		! (6_1) hx ? 0x00100000
	sub	%g0,%o7,%o7		! (6_1) iexp = -iexp;
	fand	%f0,DC0,%f16		! (0_0) res = vis_fand(res,DC0);

	fmuld	%f30,%f62,%f2		! (6_2) res *= dlexp;
	bl,pn	%icc,.update13		! (6_1) if ( hx < 0x00100000 )
	ldd	[%l2+8],%f30		! (0_1) dexp_lo = ((double*)addr)[1];
	fpadd32	%f44,DC2,%f18		! (6_1) res_c = vis_fpadd32(res,DC2);
.cont13:
	fmuld	K6,%f60,%f62		! (4_1) res = K6 * xx;
	add	%o2,8,%l4		! (6_1) hx += 8;
	st	%f2,[%i0]		! (6_2) ((float*)py)[0] = ((float*)res)[0];
	fsubd	%f28,%f8,%f6		! (5_1) xx = res - res_c;

	fmuld	%f34,%f36,%f28		! (2_1) res *= xx;
	add	%o7,1534,%o7		! (6_1) iexp += 0x5fe;
	st	%f3,[%i0+4]		! (6_2) ((float*)py)[1] = ((float*)res)[1];
	faddd	%f32,K4,%f32		! (3_1) res += K4;

	lda	[%l6]%asi,%g1		! (0_0) hx = *(int*)px;
	sllx	%o7,52,%o7		! (6_1) iexp << 52;
	and	%l4,-16,%l4		! (6_1) hx = -16;
	faddd	%f26,K1,%f26		! (1_1) res += K1;

	add	%i1,stridey,%i0		! px += stridey
	add	%l4,TBL,%l4		! (6_1) addr = (char*)arr + hx;
	stx	%o7,[%fp+tmp1]		! (6_1) dlexp = *(double*)lexp;
	faddd	%f58,%f30,%f8		! (0_1) res += dexp_lo;

	fmuld	%f6,%f10,%f58		! (5_1) xx *= dtmp0;
	add	%l6,stridex,%l6		! px += stridex
	ldd	[%l4],%f30		! (6_1) dtmp0 = ((double*)addr)[0];
	faddd	%f62,K5,%f62		! (4_1) res += K5;

	fmuld	%f32,%f40,%f34		! (3_1) res *= xx;
	sra	%g1,10,%o2		! (0_0) hx >>= 10;
	ldd	[%i2],%f4		! (1_1) dexp_hi = ((double*)addr)[0];
	faddd	%f28,K2,%f32		! (2_1) res += K2;

	fmuld	%f26,%f38,%f26		! (1_1) res *= xx;
	sra	%g1,21,%o7		! (0_0) iexp = hx >> 21;
	lda	[%l6]%asi,%f6		! (1_0) ((float*)res)[0] = ((float*)px)[0];
	for	%f16,DC1,%f28		! (0_0) res = vis_for(res,DC1);

	fmuld	%f30,%f30,%f30		! (6_1) dtmp0 = dexp_hi * dexp_hi;
	sub	%g0,%o7,%o7		! (0_0) iexp = -iexp;
	lda	[%l6+4]%asi,%f7		! (1_0) ((float*)res)[1] = ((float*)px)[1];
	faddd	%f8,%f24,%f24		! (0_1) res += dexp_hi;

	fmuld	%f62,%f60,%f38		! (4_1) res *= xx;
	cmp	%g1,_0x7ff00000		! (0_0) hx ? 0x7ff00000
	ldd	[%fp+tmp2],%f62		! (0_1) dlexp = *(double*)lexp;
	fand	%f18,DC3,%f8		! (6_1) res_c = vis_fand(res_c,DC3);

	fmuld	%f32,%f36,%f32		! (2_1) res *= xx;
	bge,pn	%icc,.update14		! (0_0) if ( hx >= 0x7ff00000 )
	and	%o2,2040,%o2		! (0_0) hx &= 0x7f8;
	faddd	%f34,K3,%f34		! (3_1) res += K3;
.cont14:
	fmuld	%f4,%f26,%f26		! (1_1) res = dexp_hi * res;
	cmp	%g1,_0x00100000		! (0_0) hx ? 0x00100000
	add	%o7,1534,%o7		! (0_0) iexp += 0x5fe;
	fand	%f6,DC0,%f16		! (1_0) res = vis_fand(res,DC0);

	fmuld	%f24,%f62,%f2		! (0_1) res *= dlexp;
	bl,pn	%icc,.update15		! (0_0) if ( hx < 0x00100000 )
	ldd	[%i2+8],%f24		! (1_1) dexp_lo = ((double*)addr)[1];
	fpadd32	%f28,DC2,%f18		! (0_0) res_c = vis_fpadd32(res,DC2);
.cont15:
	fmuld	K6,%f58,%f62		! (5_1) res = K6 * xx;
	add	%o2,8,%l2		! (0_0) hx += 8;
	st	%f2,[%i1]		! (0_1) ((float*)py)[0] = ((float*)res)[0];
	fsubd	%f44,%f8,%f10		! (6_1) xx = res - res_c;

	fmuld	%f34,%f40,%f44		! (3_1) res *= xx;
	nop
	st	%f3,[%i1+4]		! (0_1) ((float*)py)[1] = ((float*)res)[1];
	faddd	%f38,K4,%f38		! (4_1) res += K4;

	lda	[%l6]%asi,%g1		! (1_0) hx = *(int*)px;
	sllx	%o7,52,%o7		! (0_0) iexp << 52;
	and	%l2,-16,%l2		! (0_0) hx = -16;
	faddd	%f32,K1,%f32		! (2_1) res += K1;

	add	%l2,TBL,%l2		! (0_0) addr = (char*)arr + hx;
	add	%l6,stridex,%l6		! px += stridex
	stx	%o7,[%fp+tmp2]		! (0_0) dlexp = *(double*)lexp;
	faddd	%f26,%f24,%f8		! (1_1) res += dexp_lo;

	fmuld	%f10,%f30,%f26		! (6_1) xx *= dtmp0;
	add	%i0,stridey,%i1		! px += stridey
	ldd	[%l2],%f30		! (0_0) dtmp0 = ((double*)addr)[0];
	faddd	%f62,K5,%f62		! (5_1) res += K5;

	fmuld	%f38,%f60,%f34		! (4_1) res *= xx;
	sra	%g1,10,%o2		! (1_0) hx >>= 10;
	ldd	[%i4],%f24		! (2_1) dexp_hi = ((double*)addr)[0];
	faddd	%f44,K2,%f38		! (3_1) res += K2;

	fmuld	%f32,%f36,%f32		! (2_1) res *= xx;
	sra	%g1,21,%o7		! (1_0) iexp = hx >> 21;
	lda	[%l6]%asi,%f0		! (2_0) ((float*)res)[0] = ((float*)px)[0];
	for	%f16,DC1,%f44		! (1_0) res = vis_for(res,DC1);

	fmuld	%f30,%f30,%f30		! (0_0) dtmp0 = dexp_hi * dexp_hi;
	cmp	%g1,_0x7ff00000		! (1_0) hx ? 0x7ff00000
	lda	[%l6+4]%asi,%f1		! (2_0) ((float*)res)[1] = ((float*)px)[1];
	faddd	%f8,%f4,%f4		! (1_1) res += dexp_hi;

	fmuld	%f62,%f58,%f36		! (5_1) res *= xx;
	bge,pn	%icc,.update16		! (1_0) if ( hx >= 0x7ff00000 )
	ldd	[%fp+tmp3],%f62		! (1_1) dlexp = *(double*)lexp;
	fand	%f18,DC3,%f8		! (0_0) res_c = vis_fand(res_c,DC3);
.cont16:
	fmuld	%f38,%f40,%f38		! (3_1) res *= xx;
	cmp	%g1,_0x00100000		! (1_0) hx ? 0x00100000
	and	%o2,2040,%o2		! (1_0) hx &= 0x7f8;
	faddd	%f34,K3,%f34		! (4_1) res += K3;

	fmuld	%f24,%f32,%f32		! (2_1) res = dexp_hi * res;
	bl,pn	%icc,.update17		! (1_0) if ( hx < 0x00100000 )
	sub	%g0,%o7,%o7		! (1_0) iexp = -iexp;
	fand	%f0,DC0,%f16		! (2_0) res = vis_fand(res,DC0);
.cont17:
	fmuld	%f4,%f62,%f2		! (1_1) res *= dlexp;
	add	%o7,1534,%o7		! (1_0) iexp += 0x5fe;
	ldd	[%i4+8],%f4		! (2_1) dexp_lo = ((double*)addr)[1];
	fpadd32	%f44,DC2,%f18		! (1_0) res_c = vis_fpadd32(res,DC2);

	fmuld	K6,%f26,%f62		! (6_1) res = K6 * xx;
	add	%o2,8,%i2		! (1_0) hx += 8;
	st	%f2,[%i0]		! (1_1) ((float*)py)[0] = ((float*)res)[0];
	fsubd	%f28,%f8,%f6		! (0_0) xx = res - res_c;

	fmuld	%f34,%f60,%f28		! (4_1) res *= xx;
	nop
	st	%f3,[%i0+4]		! (1_1) ((float*)py)[1] = ((float*)res)[1];
	faddd	%f36,K4,%f36		! (5_1) res += K4;

	lda	[%l6]%asi,%g1		! (2_0) hx = *(int*)px;
	sllx	%o7,52,%o7		! (1_0) iexp << 52;
	and	%i2,-16,%i2		! (1_0) hx = -16;
	faddd	%f38,K1,%f38		! (3_1) res += K1;

	add	%i1,stridey,%i0		! px += stridey
	add	%i2,TBL,%i2		! (1_0) addr = (char*)arr + hx;
	stx	%o7,[%fp+tmp3]		! (1_0) dlexp = *(double*)lexp;
	faddd	%f32,%f4,%f8		! (2_1) res += dexp_lo;

	fmuld	%f6,%f30,%f32		! (0_0) xx *= dtmp0;
	add	%l6,stridex,%l6		! px += stridex
	ldd	[%i2],%f30		! (1_0) dtmp0 = ((double*)addr)[0];
	faddd	%f62,K5,%f62		! (6_1) res += K5;

	fmuld	%f36,%f58,%f34		! (5_1) res *= xx;
	sra	%g1,10,%o2		! (2_0) hx >>= 10;
	ldd	[%i5],%f4		! (3_1) dexp_hi = ((double*)addr)[0];
	faddd	%f28,K2,%f36		! (4_1) res += K2;

	fmuld	%f38,%f40,%f38		! (3_1) res *= xx;
	sra	%g1,21,%o7		! (2_0) iexp = hx >> 21;
	lda	[%l6]%asi,%f6		! (3_0) ((float*)res)[0] = ((float*)px)[0];
	for	%f16,DC1,%f28		! (2_0) res = vis_for(res,DC1);

	fmuld	%f30,%f30,%f30		! (1_0) dtmp0 = dexp_hi * dexp_hi;
	cmp	%g1,_0x7ff00000		! (2_0) hx ? 0x7ff00000
	lda	[%l6+4]%asi,%f7		! (3_0) ((float*)res)[1] = ((float*)px)[1];
	faddd	%f8,%f24,%f24		! (2_1) res += dexp_hi;

	fmuld	%f62,%f26,%f40		! (6_1) res *= xx;
	bge,pn	%icc,.update18		! (2_0) if ( hx >= 0x7ff00000 )
	ldd	[%fp+tmp4],%f62		! (2_1) dlexp = *(double*)lexp;
	fand	%f18,DC3,%f8		! (1_0) res_c = vis_fand(res_c,DC3);
.cont18:
	fmuld	%f36,%f60,%f36		! (4_1) res *= xx;
	cmp	%g1,_0x00100000		! (2_0) hx ? 0x00100000
	and	%o2,2040,%o2		! (2_0) hx &= 0x7f8;
	faddd	%f34,K3,%f34		! (5_1) res += K3;

	fmuld	%f4,%f38,%f38		! (3_1) res = dexp_hi * res;
	bl,pn	%icc,.update19		! (2_0) if ( hx < 0x00100000 )
	sub	%g0,%o7,%o7		! (2_0) iexp = -iexp;
	fand	%f6,DC0,%f16		! (3_0) res = vis_fand(res,DC0);
.cont19:
	fmuld	%f24,%f62,%f2		! (2_1) res *= dlexp;
	add	%o7,1534,%o7		! (2_0) iexp += 0x5fe;
	ldd	[%i5+8],%f24		! (3_1) dexp_lo = ((double*)addr)[1];
	fpadd32	%f28,DC2,%f18		! (2_0) res_c = vis_fpadd32(res,DC2);

	fmuld	K6,%f32,%f62		! (0_0) res = K6 * xx;
	add	%o2,8,%i4		! (2_0) hx += 8;
	st	%f2,[%i1]		! (2_1) ((float*)py)[0] = ((float*)res)[0];
	fsubd	%f44,%f8,%f10		! (1_0) xx = res - res_c;

	fmuld	%f34,%f58,%f44		! (5_1) res *= xx;
	nop
	st	%f3,[%i1+4]		! (2_1) ((float*)py)[1] = ((float*)res)[1];
	faddd	%f40,K4,%f40		! (6_1) res += K4;

	lda	[%l6]%asi,%g1		! (3_0) hx = *(int*)px;
	sllx	%o7,52,%o7		! (2_0) iexp << 52;
	and	%i4,-16,%i4		! (2_0) hx = -16;
	faddd	%f36,K1,%f36		! (4_1) res += K1;

	add	%l6,stridex,%l6		! px += stridex
	add	%i4,TBL,%i4		! (2_0) addr = (char*)arr + hx;
	stx	%o7,[%fp+tmp4]		! (2_0) dlexp = *(double*)lexp;
	faddd	%f38,%f24,%f8		! (3_1) res += dexp_lo;

	fmuld	%f10,%f30,%f38		! (1_0) xx *= dtmp0;
	add	%i0,stridey,%i1		! px += stridey
	ldd	[%i4],%f24		! (2_0) dtmp0 = ((double*)addr)[0];
	faddd	%f62,K5,%f62		! (0_0) res += K5;

	fmuld	%f40,%f26,%f34		! (6_1) res *= xx;
	sra	%g1,10,%o2		! (3_0) hx >>= 10;
	ldd	[%l1],%f30		! (4_1) dexp_hi = ((double*)addr)[0];
	faddd	%f44,K2,%f40		! (5_1) res += K2;

	fmuld	%f36,%f60,%f36		! (4_1) res *= xx;
	sra	%g1,21,%o7		! (3_0) iexp = hx >> 21;
	lda	[%l6]%asi,%f0		! (4_0) ((float*)res)[0] = ((float*)px)[0];
	for	%f16,DC1,%f44		! (3_0) res = vis_for(res,DC1);

	fmuld	%f24,%f24,%f24		! (2_0) dtmp0 = dexp_hi * dexp_hi;
	cmp	%g1,_0x7ff00000		! (3_0) hx ? 0x7ff00000
	lda	[%l6+4]%asi,%f1		! (4_0) ((float*)res)[1] = ((float*)px)[1];
	faddd	%f8,%f4,%f8		! (3_1) res += dexp_hi;

	fmuld	%f62,%f32,%f60		! (0_0) res *= xx;
	bge,pn	%icc,.update20		! (3_0) if ( hx >= 0x7ff00000 )
	ldd	[%fp+tmp5],%f62		! (3_1) dlexp = *(double*)lexp;
	fand	%f18,DC3,%f4		! (2_0) res_c = vis_fand(res_c,DC3);
.cont20:
	fmuld	%f40,%f58,%f40		! (5_1) res *= xx;
	cmp	%g1,_0x00100000		! (3_0) hx ? 0x00100000
	and	%o2,2040,%o2		! (3_0) hx &= 0x7f8;
	faddd	%f34,K3,%f10		! (6_1) res += K3;

	fmuld	%f30,%f36,%f36		! (4_1) res = dexp_hi * res;
	bl,pn	%icc,.update21		! (3_0) if ( hx < 0x00100000 )
	sub	%g0,%o7,%o7		! (3_0) iexp = -iexp;
	fand	%f0,DC0,%f16		! (4_0) res = vis_fand(res,DC0);
.cont21:
	fmuld	%f8,%f62,%f8		! (3_1) res *= dlexp;
	add	%o7,1534,%o7		! (3_0) iexp += 0x5fe;
	ldd	[%l1+8],%f34		! (4_1) dexp_lo = ((double*)addr)[1];
	fpadd32	%f44,DC2,%f18		! (3_0) res_c = vis_fpadd32(res,DC2);

	fmuld	K6,%f38,%f62		! (1_0) res = K6 * xx;
	add	%o2,8,%i5		! (3_0) hx += 8;
	st	%f8,[%i0]		! (3_1) ((float*)py)[0] = ((float*)res)[0];
	fsubd	%f28,%f4,%f28		! (2_0) xx = res - res_c;

	fmuld	%f10,%f26,%f4		! (6_1) res *= xx;
	nop
	st	%f9,[%i0+4]		! (3_1) ((float*)py)[1] = ((float*)res)[1];
	faddd	%f60,K4,%f60		! (0_0) res += K4;

	lda	[%l6]%asi,%g1		! (4_0) hx = *(int*)px;
	sllx	%o7,52,%o7		! (3_0) iexp << 52;
	and	%i5,-16,%i5		! (3_0) hx = -16;
	faddd	%f40,K1,%f40		! (5_1) res += K1;

	add	%l6,stridex,%i0		! px += stridex
	add	%i5,TBL,%i5		! (3_0) addr = (char*)arr + hx;
	stx	%o7,[%fp+tmp5]		! (3_0) dlexp = *(double*)lexp;
	faddd	%f36,%f34,%f8		! (4_1) res += dexp_lo;

	fmuld	%f28,%f24,%f36		! (2_0) xx *= dtmp0;
	add	%i1,stridey,%l6		! px += stridey
	ldd	[%i5],%f28		! (3_0) dtmp0 = ((double*)addr)[0];
	faddd	%f62,K5,%f62		! (1_0) res += K5;

	faddd	%f4,K2,%f10		! (6_1) res += K2;
	sra	%g1,10,%o2		! (4_0) hx >>= 10;
	nop
	fmuld	%f60,%f32,%f34		! (0_0) res *= xx;

	fmuld	%f40,%f58,%f40		! (5_1) res *= xx;
	sra	%g1,21,%o7		! (4_0) iexp = hx >> 21;
	lda	[%i0]%asi,%f6		! (5_0) ((float*)res)[0] = ((float*)px)[0];
	for	%f16,DC1,%f24		! (4_0) res = vis_for(res,DC1);

	fmuld	%f28,%f28,%f28		! (3_0) dtmp0 = dexp_hi * dexp_hi;
	cmp	%g1,_0x7ff00000		! (4_0) hx ? 0x7ff00000
	lda	[%i0+4]%asi,%f7		! (5_0) ((float*)res)[1] = ((float*)px)[1];
	faddd	%f8,%f30,%f30		! (4_1) res += dexp_hi;

	fand	%f18,DC3,%f8		! (3_0) res_c = vis_fand(res_c,DC3);
	bge,pn	%icc,.update22		! (4_0) if ( hx >= 0x7ff00000 )
	ldd	[%fp+tmp6],%f18		! (4_1) dlexp = *(double*)lexp;
	fmuld	%f62,%f38,%f62		! (1_0) res *= xx;
.cont22:
	fmuld	%f10,%f26,%f58		! (6_1) res *= xx;
	cmp	%g1,_0x00100000		! (4_0) hx ? 0x00100000
	and	%o2,2040,%o2		! (4_0) hx &= 0x7f8;
	faddd	%f34,K3,%f60		! (0_0) res += K3;

	fmuld	%f22,%f40,%f40		! (5_1) res = dexp_hi * res;
	bl,pn	%icc,.update23		! (4_0) if ( hx < 0x00100000 )
	sub	%g0,%o7,%o7		! (4_0) iexp = -iexp;
	fand	%f6,DC0,%f16		! (5_0) res = vis_fand(res,DC0);
.cont23:
	fmuld	%f30,%f18,%f6		! (4_1) res *= dlexp;
	add	%o7,1534,%o7		! (4_0) iexp += 0x5fe;
	ldd	[%i3+8],%f34		! (5_1) dexp_lo = ((double*)addr)[1];
	fpadd32	%f24,DC2,%f18		! (4_0) res_c = vis_fpadd32(res,DC2);

	fmuld	K6,%f36,%f30		! (2_0) res = K6 * xx;
	add	%o2,8,%l1		! (4_0) hx += 8;
	st	%f6,[%i1]		! (4_1) ((float*)py)[0] = ((float*)res)[0];
	fsubd	%f44,%f8,%f44		! (3_0) xx = res - res_c;

	fmuld	%f60,%f32,%f60		! (0_0) res *= xx;
	sllx	%o7,52,%o7		! (4_0) iexp << 52;
	st	%f7,[%i1+4]		! (4_1) ((float*)py)[1] = ((float*)res)[1];
	faddd	%f62,K4,%f6		! (1_0) res += K4;

	lda	[%i0]%asi,%g1		! (5_0) hx = *(int*)px;
	add	%i0,stridex,%i1		! px += stridex
	and	%l1,-16,%l1		! (4_0) hx = -16;
	faddd	%f58,K1,%f58		! (6_1) res += K1;

	add	%l1,TBL,%l1		! (4_0) addr = (char*)arr + hx;
	add	%l6,stridey,%i0		! px += stridey
	stx	%o7,[%fp+tmp6]		! (4_0) dlexp = *(double*)lexp;
	faddd	%f40,%f34,%f8		! (5_1) res += dexp_lo;

	fmuld	%f44,%f28,%f40		! (3_0) xx *= dtmp0;
	nop
	ldd	[%l1],%f44		! (4_0) dtmp0 = ((double*)addr)[0];
	faddd	%f30,K5,%f62		! (2_0) res += K5;

	fmuld	%f6,%f38,%f34		! (1_0) res *= xx;
	sra	%g1,21,%o7		! (5_0) iexp = hx >> 21;
	ldd	[%l4],%f30		! (6_1) dexp_hi = ((double*)addr)[0];
	faddd	%f60,K2,%f60		! (0_0) res += K2;

	for	%f16,DC1,%f28		! (5_0) res = vis_for(res,DC1);
	sub	%g0,%o7,%o7		! (5_0) iexp = -iexp;
	lda	[%i1]%asi,%f6		! (6_0) ((float*)res)[0] = ((float*)px)[0];
	fmuld	%f58,%f26,%f26		! (6_1) res *= xx;

	fmuld	%f44,%f44,%f44		! (4_0) dtmp0 = dexp_hi * dexp_hi;
	cmp	%g1,_0x7ff00000		! (5_0) hx ? 0x7ff00000
	lda	[%i1+4]%asi,%f7		! (6_0) ((float*)res)[1] = ((float*)px)[1];
	faddd	%f8,%f22,%f22		! (5_1) res += dexp_hi;

	fand	%f18,DC3,%f8		! (4_0) res_c = vis_fand(res_c,DC3);
	bge,pn	%icc,.update24		! (5_0) if ( hx >= 0x7ff00000 )
	ldd	[%fp+tmp0],%f18		! (5_1) dlexp = *(double*)lexp;
	fmuld	%f62,%f36,%f62		! (2_0) res *= xx;
.cont24:
	fmuld	%f60,%f32,%f58		! (0_0) res *= xx;
	sra	%g1,10,%o2		! (5_0) hx >>= 10;
	cmp	%g1,_0x00100000		! (5_0) hx ? 0x00100000
	faddd	%f34,K3,%f34		! (1_0) res += K3;

	fmuld	%f30,%f26,%f26		! (6_1) res = dexp_hi * res;
	bl,pn	%icc,.update25		! (5_0) if ( hx < 0x00100000 )
	and	%o2,2040,%o2		! (5_0) hx &= 0x7f8;
	fand	%f6,DC0,%f16		! (6_0) res = vis_fand(res,DC0);
.cont25:
	fmuld	%f22,%f18,%f2		! (5_1) res *= dlexp;
	subcc	counter,7,counter	! counter -= 7;
	ldd	[%l4+8],%f60		! (6_1) dexp_lo = ((double*)addr)[1];
	fpadd32	%f28,DC2,%f18		! (5_0) res_c = vis_fpadd32(res,DC2);

	fmuld	K6,%f40,%f22		! (3_0) res = K6 * xx;
	add	%o2,8,%i3		! (5_0) hx += 8;
	st	%f2,[%l6]		! (5_1) ((float*)py)[0] = ((float*)res)[0];
	fsubd	%f24,%f8,%f10		! (4_0) xx = res - res_c;

	fmuld	%f34,%f38,%f24		! (1_0) res *= xx;
	st	%f3,[%l6+4]		! (5_1) ((float*)py)[1] = ((float*)res)[1];
	bpos,pt	%icc,.main_loop
	faddd	%f62,K4,%f34		! (2_0) res += K4;

	add	counter,7,counter
! .tail: drains the software pipeline once the main loop has stopped.
! Reached by fall-through from the main loop, right after the instruction
! that undoes the loop-side counter -= 7.  Each step below decrements
! counter; when it goes negative the step leaves through .begin with %o4
! set to the next output address (the mov sits in an annulled delay slot,
! so it only executes when the branch is taken).  Otherwise the step
! finishes one more in-flight element and stores its two result words.
! The integer registers holding the input and output pointers rotate
! from step to step (%i0, %i1 and %l6 swap roles).
.tail:
	add	%o7,1534,%o7		! (5_0) iexp += 0x5fe;
	subcc	counter,1,counter
	bneg,a	.begin
	mov	%i0,%o4

	faddd	%f58,K1,%f58		! (0_1) res += K1;

	faddd	%f26,%f60,%f8		! (6_2) res += dexp_lo;

	faddd	%f22,K5,%f62		! (3_1) res += K5;
	fmuld	%f10,%f44,%f60		! (4_1) xx *= dtmp0;

	faddd	%f24,K2,%f26		! (1_1) res += K2;
	add	%i1,stridex,%l6		! px += stridex
	ldd	[%l2],%f24		! (0_1) dexp_hi = ((double*)addr)[0];
	fmuld	%f34,%f36,%f34		! (2_1) res *= xx;

	fmuld	%f58,%f32,%f58		! (0_1) res *= xx;

	add	%i0,stridey,%i1		! py += stridey
	faddd	%f8,%f30,%f30		! (6_2) res += dexp_hi;

	fmuld	%f62,%f40,%f32		! (3_1) res *= xx;
	ldd	[%fp+tmp1],%f62		! (6_2) dlexp = *(double*)lexp;

	fmuld	%f26,%f38,%f26		! (1_1) res *= xx;
	faddd	%f34,K3,%f34		! (2_1) res += K3;

	fmuld	%f24,%f58,%f58		! (0_1) res = dexp_hi * res;

	fmuld	%f30,%f62,%f2		! (6_2) res *= dlexp;
	ldd	[%l2+8],%f30		! (0_1) dexp_lo = ((double*)addr)[1];

	fmuld	K6,%f60,%f62		! (4_1) res = K6 * xx;
	st	%f2,[%i0]		! (6_2) ((float*)py)[0] = ((float*)res)[0];

	fmuld	%f34,%f36,%f28		! (2_1) res *= xx;
	st	%f3,[%i0+4]		! (6_2) ((float*)py)[1] = ((float*)res)[1];
	faddd	%f32,K4,%f32		! (3_1) res += K4;

	! next drain step: leave through .begin if nothing remains
	subcc	counter,1,counter
	bneg,a	.begin
	mov	%i1,%o4

	faddd	%f26,K1,%f26		! (1_1) res += K1;

	faddd	%f58,%f30,%f8		! (0_1) res += dexp_lo;

	add	%l6,stridex,%l6		! px += stridex
	faddd	%f62,K5,%f62		! (4_1) res += K5;

	fmuld	%f32,%f40,%f34		! (3_1) res *= xx;
	add	%i1,stridey,%i0		! py += stridey
	ldd	[%i2],%f22		! (1_1) dexp_hi = ((double*)addr)[0];
	faddd	%f28,K2,%f32		! (2_1) res += K2;

	fmuld	%f26,%f38,%f26		! (1_1) res *= xx;

	faddd	%f8,%f24,%f24		! (0_1) res += dexp_hi;

	fmuld	%f62,%f60,%f38		! (4_1) res *= xx;
	ldd	[%fp+tmp2],%f62		! (0_1) dlexp = *(double*)lexp;

	fmuld	%f32,%f36,%f32		! (2_1) res *= xx;
	faddd	%f34,K3,%f34		! (3_1) res += K3;

	fmuld	%f22,%f26,%f26		! (1_1) res = dexp_hi * res;

	fmuld	%f24,%f62,%f2		! (0_1) res *= dlexp;
	ldd	[%i2+8],%f24		! (1_1) dexp_lo = ((double*)addr)[1];

	st	%f2,[%i1]		! (0_1) ((float*)py)[0] = ((float*)res)[0];

	fmuld	%f34,%f40,%f44		! (3_1) res *= xx;
	st	%f3,[%i1+4]		! (0_1) ((float*)py)[1] = ((float*)res)[1];
	faddd	%f38,K4,%f38		! (4_1) res += K4;

	! next drain step: leave through .begin if nothing remains
	subcc	counter,1,counter
	bneg,a	.begin
	mov	%i0,%o4

	faddd	%f32,K1,%f32		! (2_1) res += K1;

	add	%l6,stridex,%l6		! px += stridex
	faddd	%f26,%f24,%f8		! (1_1) res += dexp_lo;

	add	%i0,stridey,%i1		! py += stridey

	fmuld	%f38,%f60,%f34		! (4_1) res *= xx;
	ldd	[%i4],%f24		! (2_1) dexp_hi = ((double*)addr)[0];
	faddd	%f44,K2,%f38		! (3_1) res += K2;

	fmuld	%f32,%f36,%f32		! (2_1) res *= xx;

	faddd	%f8,%f22,%f22		! (1_1) res += dexp_hi;

	ldd	[%fp+tmp3],%f62		! (1_1) dlexp = *(double*)lexp;

	fmuld	%f38,%f40,%f38		! (3_1) res *= xx;
	faddd	%f34,K3,%f34		! (4_1) res += K3;

	fmuld	%f24,%f32,%f32		! (2_1) res = dexp_hi * res;

	fmuld	%f22,%f62,%f2		! (1_1) res *= dlexp;
	ldd	[%i4+8],%f22		! (2_1) dexp_lo = ((double*)addr)[1];

	st	%f2,[%i0]		! (1_1) ((float*)py)[0] = ((float*)res)[0];

	fmuld	%f34,%f60,%f28		! (4_1) res *= xx;
	st	%f3,[%i0+4]		! (1_1) ((float*)py)[1] = ((float*)res)[1];

	! next drain step: leave through .begin if nothing remains
	subcc	counter,1,counter
	bneg,a	.begin
	mov	%i1,%o4

	faddd	%f38,K1,%f38		! (3_1) res += K1;

	faddd	%f32,%f22,%f8		! (2_1) res += dexp_lo;

	add	%l6,stridex,%l6		! px += stridex

	add	%i1,stridey,%i0		! py += stridey
	ldd	[%i5],%f22		! (3_1) dexp_hi = ((double*)addr)[0];
	faddd	%f28,K2,%f36		! (4_1) res += K2;

	fmuld	%f38,%f40,%f38		! (3_1) res *= xx;

	faddd	%f8,%f24,%f24		! (2_1) res += dexp_hi;

	ldd	[%fp+tmp4],%f62		! (2_1) dlexp = *(double*)lexp;

	fmuld	%f36,%f60,%f36		! (4_1) res *= xx;

	fmuld	%f22,%f38,%f38		! (3_1) res = dexp_hi * res;

	fmuld	%f24,%f62,%f2		! (2_1) res *= dlexp;
	ldd	[%i5+8],%f24		! (3_1) dexp_lo = ((double*)addr)[1];

	st	%f2,[%i1]		! (2_1) ((float*)py)[0] = ((float*)res)[0];

	st	%f3,[%i1+4]		! (2_1) ((float*)py)[1] = ((float*)res)[1];

	! next drain step: leave through .begin if nothing remains
	subcc	counter,1,counter
	bneg,a	.begin
	mov	%i0,%o4

	faddd	%f36,K1,%f36		! (4_1) res += K1;

	faddd	%f38,%f24,%f8		! (3_1) res += dexp_lo;

	add	%i0,stridey,%i1		! py += stridey

	add	%l6,stridex,%l6		! px += stridex
	ldd	[%l1],%f30		! (4_1) dexp_hi = ((double*)addr)[0];

	fmuld	%f36,%f60,%f36		! (4_1) res *= xx;

	faddd	%f8,%f22,%f8		! (3_1) res += dexp_hi;

	ldd	[%fp+tmp5],%f62		! (3_1) dlexp = *(double*)lexp;

	fmuld	%f30,%f36,%f36		! (4_1) res = dexp_hi * res;

	fmuld	%f8,%f62,%f8		! (3_1) res *= dlexp;
	ldd	[%l1+8],%f34		! (4_1) dexp_lo = ((double*)addr)[1];

	st	%f8,[%i0]		! (3_1) ((float*)py)[0] = ((float*)res)[0];

	st	%f9,[%i0+4]		! (3_1) ((float*)py)[1] = ((float*)res)[1];

	! last drain step: leave through .begin if nothing remains
	subcc	counter,1,counter
	bneg,a	.begin
	mov	%i1,%o4

	faddd	%f36,%f34,%f8		! (4_1) res += dexp_lo;

	add	%l6,stridex,%i0		! px += stridex

	add	%i1,stridey,%l6		! py += stridey

	faddd	%f8,%f30,%f30		! (4_1) res += dexp_hi;

	ldd	[%fp+tmp6],%f18		! (4_1) dlexp = *(double*)lexp;

	fmuld	%f30,%f18,%f6		! (4_1) res *= dlexp;

	st	%f6,[%i1]		! (4_1) ((float*)py)[0] = ((float*)res)[0];

	st	%f7,[%i1+4]		! (4_1) ((float*)py)[1] = ((float*)res)[1];

	! pipeline is empty: restart at .begin; the delay slot (always executed
	! for ba) sets %o4 to the output address after the one just written
	ba	.begin
	add	%i1,stridey,%o4

	.align	16
! .spec0: single-element special case.  Stores DONE / res (both words)
! at the output address in %o4, advances the input pointer %i1 by stridex
! and %o4 by stridey, then resumes at .begin1 with counter decremented in
! the branch delay slot.
! NOTE(review): the branch that enters this block is outside this excerpt;
! presumably it handles Inf/NaN inputs - confirm against the entry code.
.spec0:
	fdivd	DONE,%f0,%f0		! res = DONE / res;
	add	%i1,stridex,%i1		! px += stridex
	st	%f0,[%o4]		! ((float*)py)[0] = ((float*)&res)[0];
	st	%f1,[%o4+4]		! ((float*)py)[1] = ((float*)&res)[1];
	add	%o4,stridey,%o4		! py += stridey
	ba	.begin1
	sub	counter,1,counter

	.align	16
! .spec1: single-element special case with three outcomes.
!   - (%i2 | %l4) == 0 : result is DONE / res, stored at label 2.
!   - %g1 < 0          : result is sqrt(res), stored at label 2.
!   - otherwise        : res is rescaled through an integer-to-double
!                        conversion, hx/iexp are recomputed from the new
!                        high word, and control continues at .cont_spec.
! The fdivd/fsqrtd/ldd following the ",a" branches sit in annulled delay
! slots, so each executes only when its branch is taken.
! NOTE(review): the entry branch and the setup of %i2, %l4, %i4 and %o3 are
! outside this excerpt.  Presumably %i2/%l4 are the two words of |x|, %i4
! is 0x00080000 and %o3 is the .CONST_TBL base (which makes offset 0x50
! DC4 and offset 0x58 D2ON51, matching the comments below) - confirm.
.spec1:
	orcc	%i2,%l4,%g0
	bz,a	2f
	fdivd	DONE,%f0,%f0		! res = DONE / res;

	cmp	%g1,0
	bl,a	2f
	fsqrtd	%f0,%f0			! res = sqrt(res);

	cmp	%g1,%i4
	bge,a	1f
	ldd	[%o3+0x50],%f18

	! %g1 < %i4: convert the raw 64-bit pattern directly
	fxtod	%f0,%f0			! res = *(long long*)&res;
	st	%f0,[%fp+tmp0]

	fand	%f0,DC0,%f16		! (6_0) res = vis_fand(res,DC0);
	ld	[%fp+tmp0],%g1

	sra	%g1,21,%o7		! (6_1) iexp = hx >> 21;
	for	%f16,DC1,%f44		! (6_1) res = vis_for(res,DC1);

	sra	%g1,10,%o2		! (6_1) hx >>= 10;
	sub	%o7,537,%o7

	and	%o2,2040,%o2		! (6_1) hx &= 0x7f8;
	ba	.cont_spec
	sub	%g0,%o7,%o7		! (6_1) iexp = -iexp;

1:
	! %g1 >= %i4: mask first, convert, then add the constant at 0x58
	fand	%f0,%f18,%f0		! res = vis_fand(res,DC4);

	ldd	[%o3+0x58],%f28
	fxtod	%f0,%f0			! res = *(long long*)&res;

	faddd	%f0,%f28,%f0		! res += D2ON51;
	st	%f0,[%fp+tmp0]

	fand	%f0,DC0,%f16		! (6_0) res = vis_fand(res,DC0);
	ld	[%fp+tmp0],%g1

	sra	%g1,21,%o7		! (6_1) iexp = hx >> 21;
	for	%f16,DC1,%f44		! (6_1) res = vis_for(res,DC1);

	sra	%g1,10,%o2		! (6_1) hx >>= 10;
	sub	%o7,537,%o7

	and	%o2,2040,%o2		! (6_1) hx &= 0x7f8;
	ba	.cont_spec
	sub	%g0,%o7,%o7		! (6_1) iexp = -iexp;

2:
	! store the result computed in the annulled delay slot and move on
	add	%i1,stridex,%i1		! px += stridex
	st	%f0,[%o4]		! ((float*)py)[0] = ((float*)&res)[0];
	st	%f1,[%o4+4]		! ((float*)py)[1] = ((float*)&res)[1];
	add	%o4,stridey,%o4		! py += stridey
	ba	.begin1
	sub	counter,1,counter

	.align	16
! .update0: out-of-line fix-up paired with .cont0.
! If counter <= 1 it returns to .cont0 without changing anything.
! Otherwise it records the element address (%l6 - stridex) in tmp_px and
! the remaining count (counter - 1) in tmp_counter, then truncates counter
! to 1 (in the branch delay slot) before returning to .cont0.
! NOTE(review): the entry branch and the consumer of tmp_px/tmp_counter are
! outside this excerpt; by analogy with the visible branch to .update24
! this is presumably the hx >= 0x7ff00000 case - confirm.
.update0:
	cmp	counter,1
	ble	.cont0
	nop

	sub	%l6,stridex,tmp_px
	sub	counter,1,tmp_counter

	ba	.cont0
	mov	1,counter

	.align	16
! .update1: out-of-line fix-up paired with .cont1, working on res in %f8
! and its high word hx in %g1.
!   - counter <= 1         : return to .cont1 (only %i1 is written).
!   - hx < 0 or value == 0 : label 1, defer the element (save tmp_px and
!                            tmp_counter, truncate counter to 1).
!   - hx < 0x00080000      : convert the raw bit pattern with fxtod.
!   - hx >= 0x00080000     : label 2, mask, convert, add the 0x58 constant.
! Both conversion paths then rebuild %f16, %f28, %o2 and %o7 from the new
! high word and return to .cont1.
! NOTE(review): the entry branch is outside this excerpt; by analogy with
! the visible branch to .update25 it is presumably hx < 0x00100000.  %o3 is
! presumably the .CONST_TBL base (0x50 = DC4, 0x58 = D2ON51) - confirm.
.update1:
	cmp	counter,1
	ble	.cont1
	sub	%l6,stridex,%i1		! delay slot, runs on both paths

	ld	[%i1+4],%i2		! second word of the element at %i1
	cmp	%g1,0
	bl	1f			! hx < 0: defer

	orcc	%g1,%i2,%g0		! delay slot of the bl above
	bz	1f			! both words zero: defer
	sethi	%hi(0x00080000),%i3

	cmp	%g1,%i3
	bge,a	2f
	ldd	[%o3+0x50],%f18		! annulled slot: loaded only when taken

	fxtod	%f8,%f8			! res = *(long long*)&res;
	st	%f8,[%fp+tmp7]

	fand	%f8,DC0,%f16		! (0_0) res = vis_fand(res,DC0);
	ld	[%fp+tmp7],%g1

	sra	%g1,21,%o7		! (0_0) iexp = hx >> 21;
	sra	%g1,10,%o2		! (0_0) hx >>= 10;
	for	%f16,DC1,%f28		! (0_0) res = vis_for(res,DC1);

	sub	%o7,537,%o7

	sub	%g0,%o7,%o7		! (0_0) iexp = -iexp;

	and	%o2,2040,%o2		! (0_0) hx &= 0x7f8;
	ba	.cont1
	add	%o7,1534,%o7		! (0_0) iexp += 0x5fe;
2:
	fand	%f8,%f18,%f8
	fxtod	%f8,%f8			! res = *(long long*)&res;
	ldd	[%o3+0x58],%f18
	faddd	%f8,%f18,%f8
	st	%f8,[%fp+tmp7]

	fand	%f8,DC0,%f16		! (0_0) res = vis_fand(res,DC0);
	ld	[%fp+tmp7],%g1

	sra	%g1,21,%o7		! (0_0) iexp = hx >> 21;
	sra	%g1,10,%o2		! (0_0) hx >>= 10;
	for	%f16,DC1,%f28		! (0_0) res = vis_for(res,DC1);

	sub	%o7,537,%o7

	sub	%g0,%o7,%o7		! (0_0) iexp = -iexp;

	and	%o2,2040,%o2		! (0_0) hx &= 0x7f8;
	ba	.cont1
	add	%o7,1534,%o7		! (0_0) iexp += 0x5fe;
1:
	! defer: remember where this element is and how many remain
	sub	%l6,stridex,tmp_px
	sub	counter,1,tmp_counter

	ba	.cont1
	mov	1,counter

	.align	16
! .update2: out-of-line fix-up paired with .cont2.
! If counter <= 2 it returns to .cont2 without changing anything.
! Otherwise it records the element address (%l6 - stridex) in tmp_px and
! the remaining count (counter - 2) in tmp_counter, then truncates counter
! to 2 (in the branch delay slot) before returning to .cont2.
! NOTE(review): the entry branch and the consumer of tmp_px/tmp_counter are
! outside this excerpt; by analogy with the visible branch to .update24
! this is presumably the hx >= 0x7ff00000 case - confirm.
.update2:
	cmp	counter,2
	ble	.cont2
	nop

	sub	%l6,stridex,tmp_px
	sub	counter,2,tmp_counter

	ba	.cont2
	mov	2,counter

	.align	16
! .update3: out-of-line fix-up paired with .cont3, working on res in %f0
! and its high word hx in %g1.
!   - counter <= 2         : return to .cont3 (only %i1 is written).
!   - hx < 0 or value == 0 : label 1, defer the element (save tmp_px and
!                            tmp_counter, truncate counter to 2).
!   - hx < 0x00080000      : convert the raw bit pattern with fxtod.
!   - hx >= 0x00080000     : label 2, mask, convert, add the 0x58 constant.
! Both conversion paths then rebuild %f16, %f44, %o2 and %o7 from the new
! high word and return to .cont3.  Unlike some sibling blocks, iexp is not
! negated here.
! NOTE(review): the entry branch is outside this excerpt; by analogy with
! the visible branch to .update25 it is presumably hx < 0x00100000.  %o3 is
! presumably the .CONST_TBL base (0x50 = DC4, 0x58 = D2ON51), and the
! negation presumably happens after .cont3 - confirm.
.update3:
	cmp	counter,2
	ble	.cont3
	sub	%l6,stridex,%i1		! delay slot, runs on both paths

	ld	[%i1+4],%i2		! second word of the element at %i1
	cmp	%g1,0
	bl	1f			! hx < 0: defer

	orcc	%g1,%i2,%g0		! delay slot of the bl above
	bz	1f			! both words zero: defer
	sethi	%hi(0x00080000),%i3

	cmp	%g1,%i3
	bge,a	2f
	ldd	[%o3+0x50],%f18		! annulled slot: loaded only when taken

	fxtod	%f0,%f0			! res = *(long long*)&res;
	st	%f0,[%fp+tmp7]

	fand	%f0,DC0,%f16		! (1_0) res = vis_fand(res,DC0);
	ld	[%fp+tmp7],%g1

	sra	%g1,21,%o7		! (1_0) iexp = hx >> 21;
	for	%f16,DC1,%f44		! (1_0) res = vis_for(res,DC1);

	sra	%g1,10,%o2		! (1_0) hx >>= 10;
	sub	%o7,537,%o7
	ba	.cont3
	and	%o2,2040,%o2		! (1_0) hx &= 0x7f8;
2:
	fand	%f0,%f18,%f0
	fxtod	%f0,%f0			! res = *(long long*)&res;
	ldd	[%o3+0x58],%f18
	faddd	%f0,%f18,%f0
	st	%f0,[%fp+tmp7]

	fand	%f0,DC0,%f16		! (1_0) res = vis_fand(res,DC0);
	ld	[%fp+tmp7],%g1

	sra	%g1,21,%o7		! (1_0) iexp = hx >> 21;
	for	%f16,DC1,%f44		! (1_0) res = vis_for(res,DC1);

	sra	%g1,10,%o2		! (1_0) hx >>= 10;
	sub	%o7,537,%o7
	ba	.cont3
	and	%o2,2040,%o2		! (1_0) hx &= 0x7f8;
1:
	! defer: remember where this element is and how many remain
	sub	%l6,stridex,tmp_px
	sub	counter,2,tmp_counter

	ba	.cont3
	mov	2,counter

	.align	16
! .update4: out-of-line fix-up paired with .cont4.
! If counter <= 3 it returns to .cont4 without changing anything.
! Otherwise it records the element address (%l6 - stridex) in tmp_px and
! the remaining count (counter - 3) in tmp_counter, then truncates counter
! to 3 (in the branch delay slot) before returning to .cont4.
! NOTE(review): the entry branch and the consumer of tmp_px/tmp_counter are
! outside this excerpt; by analogy with the visible branch to .update24
! this is presumably the hx >= 0x7ff00000 case - confirm.
.update4:
	cmp	counter,3
	ble	.cont4
	nop

	sub	%l6,stridex,tmp_px
	sub	counter,3,tmp_counter

	ba	.cont4
	mov	3,counter

	.align	16
! .update5: out-of-line fix-up paired with .cont5, working on res in %f6
! and its high word hx in %g1.
!   - counter <= 3         : return to .cont5 (only %i1 is written).
!   - hx < 0 or value == 0 : label 1, defer the element (save tmp_px and
!                            tmp_counter, truncate counter to 3).
!   - hx < 0x00080000      : convert the raw bit pattern with fxtod.
!   - hx >= 0x00080000     : label 2, mask, convert, add the 0x58 constant.
! Both conversion paths then rebuild %f16, %f28, %o2 and %o7 from the new
! high word and return to .cont5.  iexp is not negated here.
! NOTE(review): the entry branch is outside this excerpt; by analogy with
! the visible branch to .update25 it is presumably hx < 0x00100000.  %o3 is
! presumably the .CONST_TBL base (0x50 = DC4, 0x58 = D2ON51) - confirm.
.update5:
	cmp	counter,3
	ble	.cont5
	sub	%l6,stridex,%i1		! delay slot, runs on both paths

	ld	[%i1+4],%i3		! second word of the element at %i1
	cmp	%g1,0
	bl	1f			! hx < 0: defer

	orcc	%g1,%i3,%g0		! delay slot of the bl above
	bz	1f			! both words zero: defer
	sethi	%hi(0x00080000),%i4

	cmp	%g1,%i4
	bge,a	2f
	ldd	[%o3+0x50],%f18		! annulled slot: loaded only when taken

	fxtod	%f6,%f6			! res = *(long long*)&res;
	st	%f6,[%fp+tmp7]

	fand	%f6,DC0,%f16		! (2_0) res = vis_fand(res,DC0);
	ld	[%fp+tmp7],%g1

	sra	%g1,21,%o7		! (2_0) iexp = hx >> 21;
	sra	%g1,10,%o2		! (2_0) hx >>= 10;

	sub	%o7,537,%o7
	and	%o2,2040,%o2		! (2_0) hx &= 0x7f8;
	ba	.cont5
	for	%f16,DC1,%f28		! (2_0) res = vis_for(res,DC1);
2:
	fand	%f6,%f18,%f6
	fxtod	%f6,%f6			! res = *(long long*)&res;
	ldd	[%o3+0x58],%f18
	faddd	%f6,%f18,%f6
	st	%f6,[%fp+tmp7]

	fand	%f6,DC0,%f16		! (2_0) res = vis_fand(res,DC0);
	ld	[%fp+tmp7],%g1

	sra	%g1,21,%o7		! (2_0) iexp = hx >> 21;
	sra	%g1,10,%o2		! (2_0) hx >>= 10;

	sub	%o7,537,%o7
	and	%o2,2040,%o2		! (2_0) hx &= 0x7f8;
	ba	.cont5
	for	%f16,DC1,%f28		! (2_0) res = vis_for(res,DC1);
1:
	! defer: remember where this element is and how many remain
	sub	%l6,stridex,tmp_px
	sub	counter,3,tmp_counter

	ba	.cont5
	mov	3,counter

	.align	16
! .update6: out-of-line fix-up paired with .cont6.
! If counter <= 4 it returns to .cont6 without changing anything.
! Otherwise it records the element address (%l6 - stridex) in tmp_px and
! the remaining count (counter - 4) in tmp_counter, then truncates counter
! to 4 (in the branch delay slot) before returning to .cont6.
! NOTE(review): the entry branch and the consumer of tmp_px/tmp_counter are
! outside this excerpt; by analogy with the visible branch to .update24
! this is presumably the hx >= 0x7ff00000 case - confirm.
.update6:
	cmp	counter,4
	ble	.cont6
	nop

	sub	%l6,stridex,tmp_px
	sub	counter,4,tmp_counter

	ba	.cont6
	mov	4,counter

	.align	16
! .update7: out-of-line fix-up paired with .cont7, working on res in %f0
! and its high word hx in %g1.
!   - counter <= 4         : return to .cont7; %i1 and the faddd in the
!                            delay slot are still executed.
!   - hx < 0 or value == 0 : label 1, defer the element (save tmp_px and
!                            tmp_counter, truncate counter to 4).
!   - hx < 0x00080000      : convert the raw bit pattern with fxtod.
!   - hx >= 0x00080000     : label 2, mask, convert, add the 0x58 constant.
! Both conversion paths then rebuild %f16, %f44, %o2 and %o7 from the new
! high word and return to .cont7.  iexp is not negated here.
! NOTE(review): the entry branch is outside this excerpt; by analogy with
! the visible branch to .update25 it is presumably hx < 0x00100000.  The
! faddd in the delay slot presumably replays an instruction displaced from
! the branch site.  %o3 is presumably the .CONST_TBL base (0x50 = DC4,
! 0x58 = D2ON51) - confirm.
.update7:
	sub	%l6,stridex,%i1
	cmp	counter,4
	ble	.cont7
	faddd	%f34,K3,%f6		! (6_1) res += K3;

	ld	[%i1+4],%i3		! second word of the element at %i1
	cmp	%g1,0
	bl	1f			! hx < 0: defer

	orcc	%g1,%i3,%g0		! delay slot of the bl above
	bz	1f			! both words zero: defer
	sethi	%hi(0x00080000),%i5

	cmp	%g1,%i5
	bge,a	2f
	ldd	[%o3+0x50],%f18		! annulled slot: loaded only when taken

	fxtod	%f0,%f0			! res = *(long long*)&res;
	st	%f0,[%fp+tmp7]

	fand	%f0,DC0,%f16		! (3_0) res = vis_fand(res,DC0);
	ld	[%fp+tmp7],%g1

	sra	%g1,21,%o7		! (3_0) iexp = hx >> 21;
	sra	%g1,10,%o2		! (3_0) hx >>= 10;

	sub	%o7,537,%o7
	and	%o2,2040,%o2		! (3_0) hx &= 0x7f8;
	ba	.cont7
	for	%f16,DC1,%f44		! (3_0) res = vis_for(res,DC1);
2:
	fand	%f0,%f18,%f0
	fxtod	%f0,%f0			! res = *(long long*)&res;
	ldd	[%o3+0x58],%f18
	faddd	%f0,%f18,%f0
	st	%f0,[%fp+tmp7]

	fand	%f0,DC0,%f16		! (3_0) res = vis_fand(res,DC0);
	ld	[%fp+tmp7],%g1

	sra	%g1,21,%o7		! (3_0) iexp = hx >> 21;
	sra	%g1,10,%o2		! (3_0) hx >>= 10;

	sub	%o7,537,%o7
	and	%o2,2040,%o2		! (3_0) hx &= 0x7f8;
	ba	.cont7
	for	%f16,DC1,%f44		! (3_0) res = vis_for(res,DC1);
1:
	! defer: remember where this element is and how many remain
	sub	%l6,stridex,tmp_px
	sub	counter,4,tmp_counter

	ba	.cont7
	mov	4,counter

	.align	16
! .update8: out-of-line fix-up paired with .cont8.
! If counter <= 5 it returns to .cont8 without changing anything.
! Otherwise it records the element address (%l6 itself, with no stride
! subtracted) in tmp_px and the remaining count (counter - 5) in
! tmp_counter, then truncates counter to 5 (in the branch delay slot)
! before returning to .cont8.
! NOTE(review): the entry branch and the consumer of tmp_px/tmp_counter are
! outside this excerpt; by analogy with the visible branch to .update24
! this is presumably the hx >= 0x7ff00000 case - confirm.
.update8:
	cmp	counter,5
	ble	.cont8
	nop

	mov	%l6,tmp_px
	sub	counter,5,tmp_counter

	ba	.cont8
	mov	5,counter

	.align	16
! .update9: out-of-line fix-up paired with .cont9, working on res in %f8
! and its high word hx in %g1.  The element is addressed through %l6
! directly (no stride subtracted).
!   - counter <= 5         : return to .cont9; the load into %i3 and the
!                            fand in the delay slot are still executed.
!   - hx < 0 or value == 0 : label 1, defer the element (save tmp_px and
!                            tmp_counter, truncate counter to 5).
!   - hx < 0x00080000      : convert the raw bit pattern with fxtod.
!   - hx >= 0x00080000     : label 2, mask, convert, add the 0x58 constant.
! Both conversion paths then rebuild %f24, %o2 and %o7 (iexp negated) from
! the new high word and return to .cont9.  Note that %i1 is overwritten
! with 0x00080000 on these paths.
! NOTE(review): the entry branch is outside this excerpt; by analogy with
! the visible branch to .update25 it is presumably hx < 0x00100000.  %o3 is
! presumably the .CONST_TBL base (0x50 = DC4, 0x58 = D2ON51).  Whether %i1
! is live at .cont9 cannot be told from here - confirm.
.update9:
	ld	[%l6+4],%i3		! second word of the element at %l6
	cmp	counter,5
	ble	.cont9
	fand	%f0,DC0,%f16		! (5_0) res = vis_fand(res,DC0);

	cmp	%g1,0
	bl	1f			! hx < 0: defer

	orcc	%g1,%i3,%g0		! delay slot of the bl above
	bz	1f			! both words zero: defer
	sethi	%hi(0x00080000),%i1

	cmp	%g1,%i1
	bge,a	2f
	ldd	[%o3+0x50],%f18		! annulled slot: loaded only when taken

	fxtod	%f8,%f8			! res = *(long long*)&res;
	st	%f8,[%fp+tmp7]

	fand	%f8,DC0,%f24		! (4_0) res = vis_fand(res,DC0);
	ld	[%fp+tmp7],%g1

	sra	%g1,21,%o7		! (4_0) iexp = hx >> 21;
	sra	%g1,10,%o2		! (4_0) hx >>= 10;

	sub	%o7,537,%o7

	and	%o2,2040,%o2		! (4_0) hx &= 0x7f8;
	sub	%g0,%o7,%o7		! (4_0) iexp = -iexp;
	ba	.cont9
	for	%f24,DC1,%f24		! (4_0) res = vis_for(res,DC1);
2:
	fand	%f8,%f18,%f8
	fxtod	%f8,%f8			! res = *(long long*)&res;
	ldd	[%o3+0x58],%f18
	faddd	%f8,%f18,%f8
	st	%f8,[%fp+tmp7]

	fand	%f8,DC0,%f24		! (4_0) res = vis_fand(res,DC0);
	ld	[%fp+tmp7],%g1

	sra	%g1,21,%o7		! (4_0) iexp = hx >> 21;
	sra	%g1,10,%o2		! (4_0) hx >>= 10;

	sub	%o7,537,%o7

	and	%o2,2040,%o2		! (4_0) hx &= 0x7f8;
	sub	%g0,%o7,%o7		! (4_0) iexp = -iexp;
	ba	.cont9
	for	%f24,DC1,%f24		! (4_0) res = vis_for(res,DC1);
1:
	! defer: remember where this element is and how many remain
	mov	%l6,tmp_px
	sub	counter,5,tmp_counter

	ba	.cont9
	mov	5,counter

	.align	16
! .update10: out-of-line fix-up paired with .cont10.
! If counter <= 6 it returns to .cont10 without changing anything.
! Otherwise it records the element address (%i0 itself, with no stride
! subtracted) in tmp_px and the remaining count (counter - 6) in
! tmp_counter, then truncates counter to 6 (in the branch delay slot)
! before returning to .cont10.
! NOTE(review): the entry branch and the consumer of tmp_px/tmp_counter are
! outside this excerpt; by analogy with the visible branch to .update24
! this is presumably the hx >= 0x7ff00000 case - confirm.
.update10:
	cmp	counter,6
	ble	.cont10
	nop

	mov	%i0,tmp_px
	sub	counter,6,tmp_counter

	ba	.cont10
	mov	6,counter

	.align	16
! .update11: out-of-line fix-up paired with .cont11, working on res in
! %f0 and its high word hx in %g1.  The element is addressed through %i0
! directly (no stride subtracted).
!   - counter <= 6         : return to .cont11; the load into %i3 and the
!                            fand in the delay slot are still executed.
!   - hx < 0 or value == 0 : label 1, defer the element (save tmp_px and
!                            tmp_counter, truncate counter to 6).
!   - hx < 0x00080000      : convert the raw bit pattern with fxtod.
!   - hx >= 0x00080000     : label 2, mask, convert, add the 0x58 constant.
! Both conversion paths then rebuild %f28, %o2 and %o7 (iexp negated) from
! the new high word and return to .cont11.  %i3 is reused: first the
! second input word, then the 0x00080000 threshold.
! NOTE(review): the entry branch is outside this excerpt; by analogy with
! the visible branch to .update25 it is presumably hx < 0x00100000.  %o3 is
! presumably the .CONST_TBL base (0x50 = DC4, 0x58 = D2ON51) - confirm.
.update11:
	ld	[%i0+4],%i3		! second word of the element at %i0
	cmp	counter,6
	ble	.cont11
	fand	%f6,DC0,%f16		! (6_0) res = vis_fand(res,DC0);

	cmp	%g1,0
	bl	1f			! hx < 0: defer

	orcc	%g1,%i3,%g0		! delay slot of the bl above
	bz	1f			! both words zero: defer
	sethi	%hi(0x00080000),%i3

	cmp	%g1,%i3
	bge,a	2f
	ldd	[%o3+0x50],%f18		! annulled slot: loaded only when taken

	fxtod	%f0,%f0			! res = *(long long*)&res;
	st	%f0,[%fp+tmp7]

	fand	%f0,DC0,%f28		! (5_0) res = vis_fand(res,DC0);
	ld	[%fp+tmp7],%g1

	sra	%g1,21,%o7		! (5_0) iexp = hx >> 21;
	sra	%g1,10,%o2		! (5_0) hx >>= 10;

	sub	%o7,537,%o7

	sub	%g0,%o7,%o7		! (5_0) iexp = -iexp;

	and	%o2,2040,%o2		! (5_0) hx &= 0x7f8;
	ba	.cont11
	for	%f28,DC1,%f28		! (5_0) res = vis_for(res,DC1);
2:
	fand	%f0,%f18,%f0
	fxtod	%f0,%f0			! res = *(long long*)&res;
	ldd	[%o3+0x58],%f18
	faddd	%f0,%f18,%f0
	st	%f0,[%fp+tmp7]

	fand	%f0,DC0,%f28		! (5_0) res = vis_fand(res,DC0);
	ld	[%fp+tmp7],%g1

	sra	%g1,21,%o7		! (5_0) iexp = hx >> 21;
	sra	%g1,10,%o2		! (5_0) hx >>= 10;

	sub	%o7,537,%o7

	sub	%g0,%o7,%o7		! (5_0) iexp = -iexp;

	and	%o2,2040,%o2		! (5_0) hx &= 0x7f8;
	ba	.cont11
	for	%f28,DC1,%f28		! (5_0) res = vis_for(res,DC1);
1:
	! defer: remember where this element is and how many remain
	mov	%i0,tmp_px
	sub	counter,6,tmp_counter

	ba	.cont11
	mov	6,counter

	.align	16
! .update12: out-of-line fix-up paired with .cont12.
! The faddd in the first delay slot executes whether or not the branch is
! taken.  If counter <= 0 the block returns to .cont12.  Otherwise it
! records the element address (%l6 - stridex) in tmp_px and the full
! remaining count (counter - 0) in tmp_counter, then sets counter to 0
! (in the branch delay slot) before returning to .cont12.
! NOTE(review): the entry branch and the consumer of tmp_px/tmp_counter are
! outside this excerpt; by analogy with the visible branch to .update24
! this is presumably the hx >= 0x7ff00000 case, with the faddd replaying
! an instruction displaced from the branch site - confirm.
.update12:
	cmp	counter,0
	ble	.cont12
	faddd	%f34,K3,%f34		! (2_1) res += K3;

	sub	%l6,stridex,tmp_px
	sub	counter,0,tmp_counter

	ba	.cont12
	mov	0,counter

	.align	16
! .update13: out-of-line fix-up paired with .cont13, working on res in
! %f6 and its high word hx in %g1.  %l4 is used as scratch throughout
! (element address, then second input word, then the threshold).
!   - counter <= 0         : return to .cont13; %l4 and the fpadd32 in the
!                            delay slot are still executed.
!   - hx < 0 or value == 0 : label 1, defer the element (save tmp_px and
!                            tmp_counter, set counter to 0).
!   - hx < 0x00080000      : convert the raw bit pattern with fxtod.
!   - hx >= 0x00080000     : label 2, mask, convert, add the 0x58 constant.
! Both conversion paths then rebuild %f44, %o2 and %o7 (iexp negated) from
! the new high word, recompute res_c into %f18 from the new %f44 and return
! to .cont13.
! NOTE(review): the entry branch is outside this excerpt; by analogy with
! the visible branch to .update25 it is presumably hx < 0x00100000.  %o3 is
! presumably the .CONST_TBL base (0x50 = DC4, 0x58 = D2ON51) - confirm.
.update13:
	sub	%l6,stridex,%l4
	cmp	counter,0
	ble	.cont13
	fpadd32	%f44,DC2,%f18		! (6_1) res_c = vis_fpadd32(res,DC2);

	ld	[%l4+4],%l4		! second word of the element
	cmp	%g1,0
	bl	1f			! hx < 0: defer

	orcc	%g1,%l4,%g0		! delay slot of the bl above
	bz	1f			! both words zero: defer
	sethi	%hi(0x00080000),%l4

	cmp	%g1,%l4
	bge,a	2f
	ldd	[%o3+0x50],%f62		! annulled slot: loaded only when taken

	fxtod	%f6,%f6			! res = *(long long*)&res;
	st	%f6,[%fp+tmp7]

	fand	%f6,DC0,%f44		! (6_0) res = vis_fand(res,DC0);
	ld	[%fp+tmp7],%g1

	sra	%g1,21,%o7		! (6_1) iexp = hx >> 21;
	sra	%g1,10,%o2		! (6_1) hx >>= 10;

	sub	%o7,537,%o7
	and	%o2,2040,%o2		! (6_1) hx &= 0x7f8;
	for	%f44,DC1,%f44		! (6_1) res = vis_for(res,DC1);

	sub	%g0,%o7,%o7		! (6_1) iexp = -iexp;
	ba	.cont13
	fpadd32	%f44,DC2,%f18		! (6_1) res_c = vis_fpadd32(res,DC2);
2:
	fand	%f6,%f62,%f6
	fxtod	%f6,%f6			! res = *(long long*)&res;
	ldd	[%o3+0x58],%f62
	faddd	%f6,%f62,%f6
	st	%f6,[%fp+tmp7]

	fand	%f6,DC0,%f44		! (6_0) res = vis_fand(res,DC0);
	ld	[%fp+tmp7],%g1

	sra	%g1,21,%o7		! (6_1) iexp = hx >> 21;
	sra	%g1,10,%o2		! (6_1) hx >>= 10;
	for	%f44,DC1,%f44		! (6_1) res = vis_for(res,DC1);

	sub	%o7,537,%o7

	and	%o2,2040,%o2		! (6_1) hx &= 0x7f8;
	sub	%g0,%o7,%o7		! (6_1) iexp = -iexp;
	ba	.cont13
	fpadd32	%f44,DC2,%f18		! (6_1) res_c = vis_fpadd32(res,DC2);
1:
	! defer: remember where this element is and how many remain
	sub	%l6,stridex,tmp_px
	sub	counter,0,tmp_counter

	ba	.cont13
	mov	0,counter

	.align	16
! .update14: out-of-line fix-up paired with .cont14.
! The faddd in the first delay slot executes whether or not the branch is
! taken.  If counter <= 1 the block returns to .cont14.  Otherwise it
! records the element address (%l6 - stridex) in tmp_px and the remaining
! count (counter - 1) in tmp_counter, then truncates counter to 1 (in the
! branch delay slot) before returning to .cont14.
! NOTE(review): the entry branch and the consumer of tmp_px/tmp_counter are
! outside this excerpt; by analogy with the visible branch to .update24
! this is presumably the hx >= 0x7ff00000 case, with the faddd replaying
! an instruction displaced from the branch site - confirm.
.update14:
	cmp	counter,1
	ble	.cont14
	faddd	%f34,K3,%f34		! (3_1) res += K3;

	sub	%l6,stridex,tmp_px
	sub	counter,1,tmp_counter

	ba	.cont14
	mov	1,counter

	.align	16
! .update15: out-of-line fix-up paired with .cont15, working on res in
! %f0 and its high word hx in %g1.  %l2 is used as scratch throughout
! (element address, then second input word, then the threshold).
!   - counter <= 1         : return to .cont15; %l2 and the fpadd32 in the
!                            delay slot are still executed.
!   - hx < 0 or value == 0 : label 1, defer the element (save tmp_px and
!                            tmp_counter, truncate counter to 1).
!   - hx < 0x00080000      : convert the raw bit pattern with fxtod.
!   - hx >= 0x00080000     : label 2, mask, convert, add the 0x58 constant.
! Both conversion paths then rebuild %f28, %o2 and %o7 (iexp negated and
! biased by 0x5fe) from the new high word, recompute res_c into %f18 and
! return to .cont15.
! NOTE(review): the entry branch is outside this excerpt; by analogy with
! the visible branch to .update25 it is presumably hx < 0x00100000.  %o3 is
! presumably the .CONST_TBL base (0x50 = DC4, 0x58 = D2ON51) - confirm.
.update15:
	sub	%l6,stridex,%l2
	cmp	counter,1
	ble	.cont15
	fpadd32	%f28,DC2,%f18		! (0_0) res_c = vis_fpadd32(res,DC2);

	ld	[%l2+4],%l2		! second word of the element
	cmp	%g1,0
	bl	1f			! hx < 0: defer

	orcc	%g1,%l2,%g0		! delay slot of the bl above
	bz	1f			! both words zero: defer
	sethi	%hi(0x00080000),%l2

	cmp	%g1,%l2
	bge,a	2f
	ldd	[%o3+0x50],%f62		! annulled slot: loaded only when taken

	fxtod	%f0,%f0			! res = *(long long*)&res;
	st	%f0,[%fp+tmp7]

	fand	%f0,DC0,%f18		! (0_0) res = vis_fand(res,DC0);
	ld	[%fp+tmp7],%g1

	sra	%g1,21,%o7		! (0_0) iexp = hx >> 21;
	sra	%g1,10,%o2		! (0_0) hx >>= 10;

	sub	%o7,537,%o7
	for	%f18,DC1,%f28		! (0_0) res = vis_for(res,DC1);

	sub	%g0,%o7,%o7		! (0_0) iexp = -iexp;

	and	%o2,2040,%o2		! (0_0) hx &= 0x7f8;
	add	%o7,1534,%o7		! (0_0) iexp += 0x5fe;
	ba	.cont15
	fpadd32	%f28,DC2,%f18		! (0_0) res_c = vis_fpadd32(res,DC2);
2:
	fand	%f0,%f62,%f0
	fxtod	%f0,%f0			! res = *(long long*)&res;
	ldd	[%o3+0x58],%f62
	faddd	%f0,%f62,%f0
	st	%f0,[%fp+tmp7]

	fand	%f0,DC0,%f18		! (0_0) res = vis_fand(res,DC0);
	ld	[%fp+tmp7],%g1

	sra	%g1,21,%o7		! (0_0) iexp = hx >> 21;
	sra	%g1,10,%o2		! (0_0) hx >>= 10;
	for	%f18,DC1,%f28		! (0_0) res = vis_for(res,DC1);

	sub	%o7,537,%o7

	sub	%g0,%o7,%o7		! (0_0) iexp = -iexp;

	and	%o2,2040,%o2		! (0_0) hx &= 0x7f8;
	add	%o7,1534,%o7		! (0_0) iexp += 0x5fe;
	ba	.cont15
	fpadd32	%f28,DC2,%f18		! (0_0) res_c = vis_fpadd32(res,DC2);
1:
	! defer: remember where this element is and how many remain
	sub	%l6,stridex,tmp_px
	sub	counter,1,tmp_counter

	ba	.cont15
	mov	1,counter

	.align	16
! .update16: out-of-line fix-up paired with .cont16.
! The fand in the first delay slot executes whether or not the branch is
! taken.  If counter <= 2 the block returns to .cont16.  Otherwise it
! records the element address (%l6 - stridex) in tmp_px and the remaining
! count (counter - 2) in tmp_counter, then truncates counter to 2 (in the
! branch delay slot) before returning to .cont16.
! NOTE(review): the entry branch and the consumer of tmp_px/tmp_counter are
! outside this excerpt; by analogy with the visible branch to .update24
! this is presumably the hx >= 0x7ff00000 case, with the fand replaying
! an instruction displaced from the branch site - confirm.
.update16:
	cmp	counter,2
	ble	.cont16
	fand	%f18,DC3,%f8		! (0_0) res_c = vis_fand(res_c,DC3);

	sub	%l6,stridex,tmp_px
	sub	counter,2,tmp_counter

	ba	.cont16
	mov	2,counter

	.align	16
! .update17: out-of-line fix-up paired with .cont17, working on res in
! %f6 and its high word hx in %g1.  %i2 is used as scratch throughout
! (element address, then second input word, then the threshold), and %f2
! holds the constants loaded from the table.
!   - counter <= 2         : return to .cont17; %i2 and the fand in the
!                            delay slot are still executed.
!   - hx < 0 or value == 0 : label 1, defer the element (save tmp_px and
!                            tmp_counter, truncate counter to 2).
!   - hx < 0x00080000      : convert the raw bit pattern with fxtod.
!   - hx >= 0x00080000     : label 2, mask, convert, add the 0x58 constant.
! Both conversion paths then rebuild %f44, %o2 and %o7 (iexp negated) from
! the new high word and return to .cont17.
! NOTE(review): the entry branch is outside this excerpt; by analogy with
! the visible branch to .update25 it is presumably hx < 0x00100000.  %o3 is
! presumably the .CONST_TBL base (0x50 = DC4, 0x58 = D2ON51) - confirm.
.update17:
	sub	%l6,stridex,%i2
	cmp	counter,2
	ble	.cont17
	fand	%f0,DC0,%f16		! (2_0) res = vis_fand(res,DC0);

	ld	[%i2+4],%i2		! second word of the element
	cmp	%g1,0
	bl	1f			! hx < 0: defer

	orcc	%g1,%i2,%g0		! delay slot of the bl above
	bz	1f			! both words zero: defer
	sethi	%hi(0x00080000),%i2

	cmp	%g1,%i2
	bge,a	2f
	ldd	[%o3+0x50],%f2		! annulled slot: loaded only when taken

	fxtod	%f6,%f6			! res = *(long long*)&res;
	st	%f6,[%fp+tmp7]

	fand	%f6,DC0,%f44		! (1_0) res = vis_fand(res,DC0);
	ld	[%fp+tmp7],%g1

	sra	%g1,21,%o7		! (1_0) iexp = hx >> 21;
	sra	%g1,10,%o2		! (1_0) hx >>= 10;

	sub	%o7,537,%o7

	and	%o2,2040,%o2		! (1_0) hx &= 0x7f8;
	sub	%g0,%o7,%o7		! (1_0) iexp = -iexp;
	ba	.cont17
	for	%f44,DC1,%f44		! (1_0) res = vis_for(res,DC1);
2:
	fand	%f6,%f2,%f6
	fxtod	%f6,%f6			! res = *(long long*)&res;
	ldd	[%o3+0x58],%f2
	faddd	%f6,%f2,%f6
	st	%f6,[%fp+tmp7]

	fand	%f6,DC0,%f44		! (1_0) res = vis_fand(res,DC0);
	ld	[%fp+tmp7],%g1

	sra	%g1,21,%o7		! (1_0) iexp = hx >> 21;
	sra	%g1,10,%o2		! (1_0) hx >>= 10;

	sub	%o7,537,%o7

	and	%o2,2040,%o2		! (1_0) hx &= 0x7f8;
	sub	%g0,%o7,%o7		! (1_0) iexp = -iexp;
	ba	.cont17
	for	%f44,DC1,%f44		! (1_0) res = vis_for(res,DC1);
1:
	! defer: remember where this element is and how many remain
	sub	%l6,stridex,tmp_px
	sub	counter,2,tmp_counter

	ba	.cont17
	mov	2,counter

	.align	16
! .update18: out-of-line fix-up paired with .cont18.
! The fand in the first delay slot executes whether or not the branch is
! taken.  If counter <= 3 the block returns to .cont18.  Otherwise it
! records the element address (%l6 - stridex) in tmp_px and the remaining
! count (counter - 3) in tmp_counter, then truncates counter to 3 (in the
! branch delay slot) before returning to .cont18.
! NOTE(review): the entry branch and the consumer of tmp_px/tmp_counter are
! outside this excerpt; by analogy with the visible branch to .update24
! this is presumably the hx >= 0x7ff00000 case, with the fand replaying
! an instruction displaced from the branch site - confirm.
.update18:
	cmp	counter,3
	ble	.cont18
	fand	%f18,DC3,%f8		! (1_0) res_c = vis_fand(res_c,DC3);

	sub	%l6,stridex,tmp_px
	sub	counter,3,tmp_counter

	ba	.cont18
	mov	3,counter

	.align	16
! .update19: out-of-line fix-up paired with .cont19, working on res in
! %f0 and its high word hx in %g1.  %i4 is used as scratch throughout
! (element address, then second input word, then the threshold), and %f2
! holds the constants loaded from the table.
!   - counter <= 3         : return to .cont19; %i4 and the fand in the
!                            delay slot are still executed.
!   - hx < 0 or value == 0 : label 1, defer the element (save tmp_px and
!                            tmp_counter, truncate counter to 3).
!   - hx < 0x00080000      : convert the raw bit pattern with fxtod.
!   - hx >= 0x00080000     : label 2, mask, convert, add the 0x58 constant.
! Both conversion paths then rebuild %f28, %o2 and %o7 (iexp negated) from
! the new high word and return to .cont19.
! NOTE(review): the entry branch is outside this excerpt; by analogy with
! the visible branch to .update25 it is presumably hx < 0x00100000.  %o3 is
! presumably the .CONST_TBL base (0x50 = DC4, 0x58 = D2ON51) - confirm.
.update19:
	sub	%l6,stridex,%i4
	cmp	counter,3
	ble	.cont19
	fand	%f6,DC0,%f16		! (3_0) res = vis_fand(res,DC0);

	ld	[%i4+4],%i4		! second word of the element
	cmp	%g1,0
	bl	1f			! hx < 0: defer

	orcc	%g1,%i4,%g0		! delay slot of the bl above
	bz	1f			! both words zero: defer
	sethi	%hi(0x00080000),%i4

	cmp	%g1,%i4
	bge,a	2f
	ldd	[%o3+0x50],%f2		! annulled slot: loaded only when taken

	fxtod	%f0,%f0			! res = *(long long*)&res;
	st	%f0,[%fp+tmp7]

	fand	%f0,DC0,%f28		! (2_0) res = vis_fand(res,DC0);
	ld	[%fp+tmp7],%g1

	sra	%g1,21,%o7		! (2_0) iexp = hx >> 21;

	sra	%g1,10,%o2		! (2_0) hx >>= 10;
	sub	%o7,537,%o7

	and	%o2,2040,%o2		! (2_0) hx &= 0x7f8;
	sub	%g0,%o7,%o7		! (2_0) iexp = -iexp;
	ba	.cont19
	for	%f28,DC1,%f28		! (2_0) res = vis_for(res,DC1);
2:
	fand	%f0,%f2,%f0
	fxtod	%f0,%f0			! res = *(long long*)&res;
	ldd	[%o3+0x58],%f2
	faddd	%f0,%f2,%f0
	st	%f0,[%fp+tmp7]

	fand	%f0,DC0,%f28		! (2_0) res = vis_fand(res,DC0);
	ld	[%fp+tmp7],%g1

	sra	%g1,21,%o7		! (2_0) iexp = hx >> 21;

	sra	%g1,10,%o2		! (2_0) hx >>= 10;
	sub	%o7,537,%o7

	and	%o2,2040,%o2		! (2_0) hx &= 0x7f8;
	sub	%g0,%o7,%o7		! (2_0) iexp = -iexp;
	ba	.cont19
	for	%f28,DC1,%f28		! (2_0) res = vis_for(res,DC1);
1:
	! defer: remember where this element is and how many remain
	sub	%l6,stridex,tmp_px
	sub	counter,3,tmp_counter

	ba	.cont19
	mov	3,counter

	.align	16
! .update20: out-of-line fix-up paired with .cont20.
! The fand in the first delay slot executes whether or not the branch is
! taken.  If counter <= 4 the block returns to .cont20.  Otherwise it
! records the element address (%l6 - stridex) in tmp_px and the remaining
! count (counter - 4) in tmp_counter, then truncates counter to 4 (in the
! branch delay slot) before returning to .cont20.
! NOTE(review): the entry branch and the consumer of tmp_px/tmp_counter are
! outside this excerpt; by analogy with the visible branch to .update24
! this is presumably the hx >= 0x7ff00000 case, with the fand replaying
! an instruction displaced from the branch site - confirm.
.update20:
	cmp	counter,4
	ble	.cont20
	fand	%f18,DC3,%f4		! (2_0) res_c = vis_fand(res_c,DC3);

	sub	%l6,stridex,tmp_px
	sub	counter,4,tmp_counter

	ba	.cont20
	mov	4,counter

	.align	16
! .update21: out-of-line fix-up paired with .cont21, working on res in
! %f6 and its high word hx in %g1.  %i5 is used as scratch throughout
! (element address, then second input word, then the threshold), and %f34
! holds the constants loaded from the table.
!   - counter <= 4         : return to .cont21; %i5 and the fand in the
!                            delay slot are still executed.
!   - hx < 0 or value == 0 : label 1, defer the element (save tmp_px and
!                            tmp_counter, truncate counter to 4).
!   - hx < 0x00080000      : convert the raw bit pattern with fxtod.
!   - hx >= 0x00080000     : label 2, mask, convert, add the 0x58 constant.
! Both conversion paths then rebuild %f44, %o2 and %o7 (iexp negated) from
! the new high word and return to .cont21.
! NOTE(review): the entry branch is outside this excerpt; by analogy with
! the visible branch to .update25 it is presumably hx < 0x00100000.  %o3 is
! presumably the .CONST_TBL base (0x50 = DC4, 0x58 = D2ON51) - confirm.
.update21:
	sub	%l6,stridex,%i5
	cmp	counter,4
	ble	.cont21
	fand	%f0,DC0,%f16		! (4_0) res = vis_fand(res,DC0);

	ld	[%i5+4],%i5		! second word of the element
	cmp	%g1,0
	bl	1f			! hx < 0: defer

	orcc	%g1,%i5,%g0		! delay slot of the bl above
	bz	1f			! both words zero: defer
	sethi	%hi(0x00080000),%i5

	cmp	%g1,%i5
	bge,a	2f
	ldd	[%o3+0x50],%f34		! annulled slot: loaded only when taken

	fxtod	%f6,%f6			! res = *(long long*)&res;
	st	%f6,[%fp+tmp7]

	fand	%f6,DC0,%f44		! (3_0) res = vis_fand(res,DC0);
	ld	[%fp+tmp7],%g1

	sra	%g1,21,%o7		! (3_0) iexp = hx >> 21;
	sra	%g1,10,%o2		! (3_0) hx >>= 10;

	sub	%o7,537,%o7
	and	%o2,2040,%o2		! (3_0) hx &= 0x7f8;

	sub	%g0,%o7,%o7		! (3_0) iexp = -iexp;
	ba	.cont21
	for	%f44,DC1,%f44		! (3_0) res = vis_for(res,DC1);
2:
	fand	%f6,%f34,%f6
	fxtod	%f6,%f6			! res = *(long long*)&res;
	ldd	[%o3+0x58],%f34
	faddd	%f6,%f34,%f6
	st	%f6,[%fp+tmp7]

	fand	%f6,DC0,%f44		! (3_0) res = vis_fand(res,DC0);
	ld	[%fp+tmp7],%g1

	sra	%g1,21,%o7		! (3_0) iexp = hx >> 21;
	sra	%g1,10,%o2		! (3_0) hx >>= 10;

	sub	%o7,537,%o7
	and	%o2,2040,%o2		! (3_0) hx &= 0x7f8;

	sub	%g0,%o7,%o7		! (3_0) iexp = -iexp;
	ba	.cont21
	for	%f44,DC1,%f44		! (3_0) res = vis_for(res,DC1);
1:
	! defer: remember where this element is and how many remain
	sub	%l6,stridex,tmp_px
	sub	counter,4,tmp_counter

	ba	.cont21
	mov	4,counter

	.align	16
! .update22: out-of-line fix-up paired with .cont22.
! The fmuld in the first delay slot executes whether or not the branch is
! taken.  If counter <= 5 the block returns to .cont22.  Otherwise it
! records the element address (%i0 - stridex) in tmp_px and the remaining
! count (counter - 5) in tmp_counter, then truncates counter to 5 (in the
! branch delay slot) before returning to .cont22.
! NOTE(review): the entry branch and the consumer of tmp_px/tmp_counter are
! outside this excerpt; by analogy with the visible branch to .update24
! this is presumably the hx >= 0x7ff00000 case, with the fmuld replaying
! an instruction displaced from the branch site - confirm.
.update22:
	cmp	counter,5
	ble	.cont22
	fmuld	%f62,%f38,%f62		! (1_0) res *= xx;

	sub	%i0,stridex,tmp_px
	sub	counter,5,tmp_counter

	ba	.cont22
	mov	5,counter

	.align	16
! .update23: out-of-line fix-up paired with .cont23, working on res in
! %f0 and its high word hx in %g1.  %l1 is used as scratch throughout
! (element address %i0 - stridex, then second input word, then the
! threshold), and %f34 holds the constants loaded from the table.
!   - counter <= 5         : return to .cont23; %l1 and the fand in the
!                            delay slot are still executed.
!   - hx < 0 or value == 0 : label 1, defer the element (save tmp_px and
!                            tmp_counter, truncate counter to 5).
!   - hx < 0x00080000      : convert the raw bit pattern with fxtod.
!   - hx >= 0x00080000     : label 2, mask, convert, add the 0x58 constant.
! Both conversion paths then rebuild %f24, %o2 and %o7 (iexp negated) from
! the new high word and return to .cont23.
! NOTE(review): the entry branch is outside this excerpt; by analogy with
! the visible branch to .update25 it is presumably hx < 0x00100000.  %o3 is
! presumably the .CONST_TBL base (0x50 = DC4, 0x58 = D2ON51) - confirm.
.update23:
	sub	%i0,stridex,%l1
	cmp	counter,5
	ble	.cont23
	fand	%f6,DC0,%f16		! (5_0) res = vis_fand(res,DC0);

	ld	[%l1+4],%l1		! second word of the element
	cmp	%g1,0
	bl	1f			! hx < 0: defer

	orcc	%g1,%l1,%g0		! delay slot of the bl above
	bz	1f			! both words zero: defer
	sethi	%hi(0x00080000),%l1

	cmp	%g1,%l1
	bge,a	2f
	ldd	[%o3+0x50],%f34		! annulled slot: loaded only when taken

	fxtod	%f0,%f0			! res = *(long long*)&res;
	st	%f0,[%fp+tmp7]

	fand	%f0,DC0,%f24		! (4_0) res = vis_fand(res,DC0);
	ld	[%fp+tmp7],%g1

	sra	%g1,21,%o7		! (4_0) iexp = hx >> 21;

	sra	%g1,10,%o2		! (4_0) hx >>= 10;
	sub	%o7,537,%o7

	and	%o2,2040,%o2		! (4_0) hx &= 0x7f8;
	sub	%g0,%o7,%o7		! (4_0) iexp = -iexp;
	ba	.cont23
	for	%f24,DC1,%f24		! (4_0) res = vis_for(res,DC1);
2:
	fand	%f0,%f34,%f0
	fxtod	%f0,%f0			! res = *(long long*)&res;
	ldd	[%o3+0x58],%f34
	faddd	%f0,%f34,%f0
	st	%f0,[%fp+tmp7]

	fand	%f0,DC0,%f24		! (4_0) res = vis_fand(res,DC0);
	ld	[%fp+tmp7],%g1

	sra	%g1,21,%o7		! (4_0) iexp = hx >> 21;

	sra	%g1,10,%o2		! (4_0) hx >>= 10;
	sub	%o7,537,%o7

	and	%o2,2040,%o2		! (4_0) hx &= 0x7f8;
	sub	%g0,%o7,%o7		! (4_0) iexp = -iexp;
	ba	.cont23
	for	%f24,DC1,%f24		! (4_0) res = vis_for(res,DC1);
1:
	! defer: remember where this element is and how many remain
	sub	%i0,stridex,tmp_px
	sub	counter,5,tmp_counter

	ba	.cont23
	mov	5,counter

	.align	16
! .update24: out-of-line fix-up paired with .cont24, entered from the main
! loop when hx >= 0x7ff00000 for the (5_0) element.
! The fmuld in the first delay slot repeats the instruction that sits
! between the main-loop branch and .cont24, and it executes whether or not
! the ble is taken.  If counter <= 6 the block returns to .cont24.
! Otherwise it records the element address (%i1 - stridex) in tmp_px and
! the remaining count (counter - 6) in tmp_counter, then truncates counter
! to 6 (in the branch delay slot) before returning to .cont24.
! NOTE(review): the code that later consumes tmp_px/tmp_counter is outside
! this excerpt - presumably .begin restarts from them; confirm.
.update24:
	cmp	counter,6
	ble	.cont24
	fmuld	%f62,%f36,%f62		! (2_0) res *= xx;

	sub	%i1,stridex,tmp_px
	sub	counter,6,tmp_counter

	ba	.cont24
	mov	6,counter

	.align	16
! .update25: out-of-line fix-up paired with .cont25, entered from the main
! loop when hx < 0x00100000 for the (5_0) element (hx is in %g1).
! Unlike its sibling blocks this one reloads the element from memory into
! %f10/%f11 instead of using a value already held in a register.  %i3 is
! scratch throughout.
!   - counter <= 6         : return to .cont25; %i3 and the fand in the
!                            delay slot are still executed (the fand
!                            repeats the instruction just before .cont25).
!   - hx < 0 or value == 0 : label 1, defer the element (save tmp_px and
!                            tmp_counter, truncate counter to 6).
!   - hx < 0x00080000      : convert the raw bit pattern with fxtod.
!   - hx >= 0x00080000     : label 2, mask, convert, add the 0x58 constant.
! Both conversion paths then rebuild %f28, %o2 and %o7 (iexp negated) from
! the new high word and return to .cont25.
! NOTE(review): %o3 is presumably the .CONST_TBL base (0x50 = DC4,
! 0x58 = D2ON51); the consumer of tmp_px/tmp_counter is outside this
! excerpt - confirm.
.update25:
	sub	%i1,stridex,%i3
	cmp	counter,6
	ble	.cont25
	fand	%f6,DC0,%f16		! (6_0) res = vis_fand(res,DC0);

	ld	[%i3+4],%i3		! second word of the element
	cmp	%g1,0
	bl	1f			! hx < 0: defer

	orcc	%g1,%i3,%g0		! delay slot of the bl above
	bz	1f			! both words zero: defer
	nop

	! reload both words of the element into %f10/%f11
	sub	%i1,stridex,%i3
	ld	[%i3],%f10
	ld	[%i3+4],%f11

	sethi	%hi(0x00080000),%i3

	cmp	%g1,%i3
	bge,a	2f
	ldd	[%o3+0x50],%f60		! annulled slot: loaded only when taken

	fxtod	%f10,%f10		! res = *(long long*)&res;
	st	%f10,[%fp+tmp7]

	fand	%f10,DC0,%f28		! (5_0) res = vis_fand(res,DC0);
	ld	[%fp+tmp7],%g1

	sra	%g1,21,%o7		! (5_0) iexp = hx >> 21;

	sra	%g1,10,%o2		! (5_0) hx >>= 10;
	sub	%o7,537,%o7

	and	%o2,2040,%o2		! (5_0) hx &= 0x7f8;
	sub	%g0,%o7,%o7		! (5_0) iexp = -iexp;

	ba	.cont25
	for	%f28,DC1,%f28		! (5_0) res = vis_for(res,DC1);
2:
	fand	%f10,%f60,%f10
	fxtod	%f10,%f10		! res = *(long long*)&res;
	ldd	[%o3+0x58],%f60
	faddd	%f10,%f60,%f10
	st	%f10,[%fp+tmp7]

	fand	%f10,DC0,%f28		! (5_0) res = vis_fand(res,DC0);
	ld	[%fp+tmp7],%g1

	sra	%g1,21,%o7		! (5_0) iexp = hx >> 21;

	sra	%g1,10,%o2		! (5_0) hx >>= 10;
	sub	%o7,537,%o7

	and	%o2,2040,%o2		! (5_0) hx &= 0x7f8;
	sub	%g0,%o7,%o7		! (5_0) iexp = -iexp;

	ba	.cont25
	for	%f28,DC1,%f28		! (5_0) res = vis_for(res,DC1);
1:
	! defer: remember where this element is and how many remain
	sub	%i1,stridex,tmp_px
	sub	counter,6,tmp_counter

	ba	.cont25
	mov	6,counter

! .exit: function epilogue.  Returns to the caller; the restore in the
! delay slot of ret pops the register window.  SET_SIZE closes the
! __vrsqrt symbol.
.exit:
	ret
	restore
	SET_SIZE(__vrsqrt)