xref: /illumos-gate/usr/src/lib/libmvec/common/vis/__vrsqrtf.S (revision 66582b606a8194f7f3ba5b3a3a6dca5b0d346361)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
23 */
24/*
25 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
26 * Use is subject to license terms.
27 */
28
29	.file	"__vrsqrtf.S"
30
31#include "libm.h"
32
33	RO_DATA
34	.align	64
35
36! i = [0,63]
37! TBL[2*i  ] = 1 / (*(double*)&(0x3fe0000000000000ULL + (i << 46))) * 2**-24;
38! TBL[2*i+1] = 1 / sqrtl(*(double*)&(0x3fe0000000000000ULL + (i << 46)));
39! i = [64,127]
40! TBL[2*i  ] = 1 / (*(double*)&(0x3fe0000000000000ULL + (i << 46))) * 2**-23;
41! TBL[2*i+1] = 1 / sqrtl(*(double*)&(0x3fe0000000000000ULL + (i << 46)));
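!
! A host-side sketch (for illustration only, not part of the original build)
! that would regenerate the 128 table entries below from the two formulas
! above; the sqrt() entries may differ from the sqrtl()-based values in the
! last bit:
!
!	#include <stdio.h>
!	#include <string.h>
!	#include <stdint.h>
!	#include <math.h>
!
!	int main(void)
!	{
!		for (int i = 0; i < 128; i++) {
!			uint64_t bits = 0x3fe0000000000000ULL + ((uint64_t)i << 46);
!			uint64_t d, s;
!			double x, div, rsq;
!
!			memcpy(&x, &bits, sizeof (x));	/* table breakpoint */
!			div = 1.0 / x * ((i < 64) ? 0x1p-24 : 0x1p-23);
!			rsq = 1.0 / sqrt(x);
!			memcpy(&d, &div, sizeof (d));
!			memcpy(&s, &rsq, sizeof (s));
!			printf("\t.word\t0x%08x, 0x%08x, 0x%08x, 0x%08x,\n",
!			    (unsigned)(d >> 32), (unsigned)d,
!			    (unsigned)(s >> 32), (unsigned)s);
!		}
!		return (0);
!	}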
42
43.CONST_TBL:
44	.word	0x3e800000, 0x00000000, 0x3ff6a09e, 0x667f3bcd,
45	.word	0x3e7f81f8, 0x1f81f820, 0x3ff673e3, 0x2ef63a03,
46	.word	0x3e7f07c1, 0xf07c1f08, 0x3ff6482d, 0x37a5a3d2,
47	.word	0x3e7e9131, 0xabf0b767, 0x3ff61d72, 0xb7978671,
48	.word	0x3e7e1e1e, 0x1e1e1e1e, 0x3ff5f3aa, 0x673fa911,
49	.word	0x3e7dae60, 0x76b981db, 0x3ff5cacb, 0x7802f342,
50	.word	0x3e7d41d4, 0x1d41d41d, 0x3ff5a2cd, 0x8c69d61a,
51	.word	0x3e7cd856, 0x89039b0b, 0x3ff57ba8, 0xb0ee01b9,
52	.word	0x3e7c71c7, 0x1c71c71c, 0x3ff55555, 0x55555555,
53	.word	0x3e7c0e07, 0x0381c0e0, 0x3ff52fcc, 0x468d6b54,
54	.word	0x3e7bacf9, 0x14c1bad0, 0x3ff50b06, 0xa8fc6b70,
55	.word	0x3e7b4e81, 0xb4e81b4f, 0x3ff4e6fd, 0xf33cf032,
56	.word	0x3e7af286, 0xbca1af28, 0x3ff4c3ab, 0xe93bcf74,
57	.word	0x3e7a98ef, 0x606a63be, 0x3ff4a10a, 0x97af7b92,
58	.word	0x3e7a41a4, 0x1a41a41a, 0x3ff47f14, 0x4fe17f9f,
59	.word	0x3e79ec8e, 0x951033d9, 0x3ff45dc3, 0xa3c34fa3,
60	.word	0x3e799999, 0x9999999a, 0x3ff43d13, 0x6248490f,
61	.word	0x3e7948b0, 0xfcd6e9e0, 0x3ff41cfe, 0x93ff5199,
62	.word	0x3e78f9c1, 0x8f9c18fa, 0x3ff3fd80, 0x77e70577,
63	.word	0x3e78acb9, 0x0f6bf3aa, 0x3ff3de94, 0x8077db58,
64	.word	0x3e786186, 0x18618618, 0x3ff3c036, 0x50e00e03,
65	.word	0x3e781818, 0x18181818, 0x3ff3a261, 0xba6d7a37,
66	.word	0x3e77d05f, 0x417d05f4, 0x3ff38512, 0xba21f51e,
67	.word	0x3e778a4c, 0x8178a4c8, 0x3ff36845, 0x766eec92,
68	.word	0x3e7745d1, 0x745d1746, 0x3ff34bf6, 0x3d156826,
69	.word	0x3e7702e0, 0x5c0b8170, 0x3ff33021, 0x8127c0e0,
70	.word	0x3e76c16c, 0x16c16c17, 0x3ff314c3, 0xd92a9e91,
71	.word	0x3e768168, 0x16816817, 0x3ff2f9d9, 0xfd52fd50,
72	.word	0x3e7642c8, 0x590b2164, 0x3ff2df60, 0xc5df2c9e,
73	.word	0x3e760581, 0x60581606, 0x3ff2c555, 0x2988e428,
74	.word	0x3e75c988, 0x2b931057, 0x3ff2abb4, 0x3c0eb0f4,
75	.word	0x3e758ed2, 0x308158ed, 0x3ff2927b, 0x2cd320f5,
76	.word	0x3e755555, 0x55555555, 0x3ff279a7, 0x4590331c,
77	.word	0x3e751d07, 0xeae2f815, 0x3ff26135, 0xe91daf55,
78	.word	0x3e74e5e0, 0xa72f0539, 0x3ff24924, 0x92492492,
79	.word	0x3e74afd6, 0xa052bf5b, 0x3ff23170, 0xd2be638a,
80	.word	0x3e747ae1, 0x47ae147b, 0x3ff21a18, 0x51ff630a,
81	.word	0x3e7446f8, 0x6562d9fb, 0x3ff20318, 0xcc6a8f5d,
82	.word	0x3e741414, 0x14141414, 0x3ff1ec70, 0x124e98f9,
83	.word	0x3e73e22c, 0xbce4a902, 0x3ff1d61c, 0x070ae7d3,
84	.word	0x3e73b13b, 0x13b13b14, 0x3ff1c01a, 0xa03be896,
85	.word	0x3e738138, 0x13813814, 0x3ff1aa69, 0xe4f2777f,
86	.word	0x3e73521c, 0xfb2b78c1, 0x3ff19507, 0xecf5b9e9,
87	.word	0x3e7323e3, 0x4a2b10bf, 0x3ff17ff2, 0xe00ec3ee,
88	.word	0x3e72f684, 0xbda12f68, 0x3ff16b28, 0xf55d72d4,
89	.word	0x3e72c9fb, 0x4d812ca0, 0x3ff156a8, 0x72b5ef62,
90	.word	0x3e729e41, 0x29e4129e, 0x3ff1426f, 0xac0654db,
91	.word	0x3e727350, 0xb8812735, 0x3ff12e7d, 0x02c40253,
92	.word	0x3e724924, 0x92492492, 0x3ff11ace, 0xe560242a,
93	.word	0x3e721fb7, 0x8121fb78, 0x3ff10763, 0xcec30b26,
94	.word	0x3e71f704, 0x7dc11f70, 0x3ff0f43a, 0x45cdedad,
95	.word	0x3e71cf06, 0xada2811d, 0x3ff0e150, 0xdce2b60c,
96	.word	0x3e71a7b9, 0x611a7b96, 0x3ff0cea6, 0x317186dc,
97	.word	0x3e718118, 0x11811812, 0x3ff0bc38, 0xeb8ba412,
98	.word	0x3e715b1e, 0x5f75270d, 0x3ff0aa07, 0xbd7b7488,
99	.word	0x3e7135c8, 0x1135c811, 0x3ff09811, 0x63615499,
100	.word	0x3e711111, 0x11111111, 0x3ff08654, 0xa2d4f6db,
101	.word	0x3e70ecf5, 0x6be69c90, 0x3ff074d0, 0x4a8b1438,
102	.word	0x3e70c971, 0x4fbcda3b, 0x3ff06383, 0x31ff307a,
103	.word	0x3e70a681, 0x0a6810a7, 0x3ff0526c, 0x39213bfa,
104	.word	0x3e708421, 0x08421084, 0x3ff0418a, 0x4806de7d,
105	.word	0x3e70624d, 0xd2f1a9fc, 0x3ff030dc, 0x4ea03a72,
106	.word	0x3e704104, 0x10410410, 0x3ff02061, 0x446ffa9a,
107	.word	0x3e702040, 0x81020408, 0x3ff01018, 0x28467ee9,
108	.word	0x3e800000, 0x00000000, 0x3ff00000, 0x00000000,
109	.word	0x3e7f81f8, 0x1f81f820, 0x3fefc0bd, 0x88a0f1d9,
110	.word	0x3e7f07c1, 0xf07c1f08, 0x3fef82ec, 0x882c0f9b,
111	.word	0x3e7e9131, 0xabf0b767, 0x3fef467f, 0x2814b0cc,
112	.word	0x3e7e1e1e, 0x1e1e1e1e, 0x3fef0b68, 0x48d2af1c,
113	.word	0x3e7dae60, 0x76b981db, 0x3feed19b, 0x75e78957,
114	.word	0x3e7d41d4, 0x1d41d41d, 0x3fee990c, 0xdad55ed2,
115	.word	0x3e7cd856, 0x89039b0b, 0x3fee61b1, 0x38f18adc,
116	.word	0x3e7c71c7, 0x1c71c71c, 0x3fee2b7d, 0xddfefa66,
117	.word	0x3e7c0e07, 0x0381c0e0, 0x3fedf668, 0x9b7e6350,
118	.word	0x3e7bacf9, 0x14c1bad0, 0x3fedc267, 0xbea45549,
119	.word	0x3e7b4e81, 0xb4e81b4f, 0x3fed8f72, 0x08e6b82d,
120	.word	0x3e7af286, 0xbca1af28, 0x3fed5d7e, 0xa914b937,
121	.word	0x3e7a98ef, 0x606a63be, 0x3fed2c85, 0x34ed6d86,
122	.word	0x3e7a41a4, 0x1a41a41a, 0x3fecfc7d, 0xa32a9213,
123	.word	0x3e79ec8e, 0x951033d9, 0x3feccd60, 0x45f5d358,
124	.word	0x3e799999, 0x9999999a, 0x3fec9f25, 0xc5bfedd9,
125	.word	0x3e7948b0, 0xfcd6e9e0, 0x3fec71c7, 0x1c71c71c,
126	.word	0x3e78f9c1, 0x8f9c18fa, 0x3fec453d, 0x90f057a2,
127	.word	0x3e78acb9, 0x0f6bf3aa, 0x3fec1982, 0xb2ece47b,
128	.word	0x3e786186, 0x18618618, 0x3febee90, 0x56fb9c39,
129	.word	0x3e781818, 0x18181818, 0x3febc460, 0x92eb3118,
130	.word	0x3e77d05f, 0x417d05f4, 0x3feb9aed, 0xba588347,
131	.word	0x3e778a4c, 0x8178a4c8, 0x3feb7232, 0x5b79db11,
132	.word	0x3e7745d1, 0x745d1746, 0x3feb4a29, 0x3c1d9550,
133	.word	0x3e7702e0, 0x5c0b8170, 0x3feb22cd, 0x56d87d7e,
134	.word	0x3e76c16c, 0x16c16c17, 0x3feafc19, 0xd8606169,
135	.word	0x3e768168, 0x16816817, 0x3fead60a, 0x1d0fb394,
136	.word	0x3e7642c8, 0x590b2164, 0x3feab099, 0xae8f539a,
137	.word	0x3e760581, 0x60581606, 0x3fea8bc4, 0x41a3d02c,
138	.word	0x3e75c988, 0x2b931057, 0x3fea6785, 0xb41bacf7,
139	.word	0x3e758ed2, 0x308158ed, 0x3fea43da, 0x0adc6899,
140	.word	0x3e755555, 0x55555555, 0x3fea20bd, 0x700c2c3e,
141	.word	0x3e751d07, 0xeae2f815, 0x3fe9fe2c, 0x315637ee,
142	.word	0x3e74e5e0, 0xa72f0539, 0x3fe9dc22, 0xbe484458,
143	.word	0x3e74afd6, 0xa052bf5b, 0x3fe9ba9d, 0xa6c73588,
144	.word	0x3e747ae1, 0x47ae147b, 0x3fe99999, 0x9999999a,
145	.word	0x3e7446f8, 0x6562d9fb, 0x3fe97913, 0x63068b54,
146	.word	0x3e741414, 0x14141414, 0x3fe95907, 0xeb87ab44,
147	.word	0x3e73e22c, 0xbce4a902, 0x3fe93974, 0x368cfa31,
148	.word	0x3e73b13b, 0x13b13b14, 0x3fe91a55, 0x6151761c,
149	.word	0x3e738138, 0x13813814, 0x3fe8fba8, 0xa1bf6f96,
150	.word	0x3e73521c, 0xfb2b78c1, 0x3fe8dd6b, 0x4563a009,
151	.word	0x3e7323e3, 0x4a2b10bf, 0x3fe8bf9a, 0xb06e1af3,
152	.word	0x3e72f684, 0xbda12f68, 0x3fe8a234, 0x5cc04426,
153	.word	0x3e72c9fb, 0x4d812ca0, 0x3fe88535, 0xd90703c6,
154	.word	0x3e729e41, 0x29e4129e, 0x3fe8689c, 0xc7e07e7d,
155	.word	0x3e727350, 0xb8812735, 0x3fe84c66, 0xdf0ca4c2,
156	.word	0x3e724924, 0x92492492, 0x3fe83091, 0xe6a7f7e7,
157	.word	0x3e721fb7, 0x8121fb78, 0x3fe8151b, 0xb86fee1d,
158	.word	0x3e71f704, 0x7dc11f70, 0x3fe7fa02, 0x3f1068d1,
159	.word	0x3e71cf06, 0xada2811d, 0x3fe7df43, 0x7579b9b5,
160	.word	0x3e71a7b9, 0x611a7b96, 0x3fe7c4dd, 0x663ebb88,
161	.word	0x3e718118, 0x11811812, 0x3fe7aace, 0x2afa8b72,
162	.word	0x3e715b1e, 0x5f75270d, 0x3fe79113, 0xebbd7729,
163	.word	0x3e7135c8, 0x1135c811, 0x3fe777ac, 0xde80baea,
164	.word	0x3e711111, 0x11111111, 0x3fe75e97, 0x46a0b098,
165	.word	0x3e70ecf5, 0x6be69c90, 0x3fe745d1, 0x745d1746,
166	.word	0x3e70c971, 0x4fbcda3b, 0x3fe72d59, 0xc45f1fc5,
167	.word	0x3e70a681, 0x0a6810a7, 0x3fe7152e, 0x9f44f01f,
168	.word	0x3e708421, 0x08421084, 0x3fe6fd4e, 0x79325467,
169	.word	0x3e70624d, 0xd2f1a9fc, 0x3fe6e5b7, 0xd16657e1,
170	.word	0x3e704104, 0x10410410, 0x3fe6ce69, 0x31d5858d,
171	.word	0x3e702040, 0x81020408, 0x3fe6b761, 0x2ec892f6,
172
173	.word	0x3fefffff, 0xfee7f18f	! K0 =  9.99999997962321453275e-01
174	.word	0xbfdfffff, 0xfe07e52f	! K1 = -4.99999998166077580600e-01
175	.word	0x3fd80118, 0x0ca296d9	! K2 =  3.75066768969515586277e-01
176	.word	0xbfd400fc, 0x0bbb8e78	! K3 = -3.12560092408808548438e-01
177	.word	0x7ffe0000, 0x7ffe0000	! DC0
178	.word	0x3f800000, 0x40000000	! FONE, FTWO
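
! K0..K3 track the Taylor expansion 1/sqrt(1+w) = 1 - w/2 + 3/8*w^2 - 5/16*w^3 + ...
! (apparently minimax-adjusted for w in roughly [0, 2^-6)), so that
!	tbl_sqrt * (((K3*xx + K2)*xx + K1)*xx + K0) ~= 1/sqrt(value_i * (1 + xx))
! where value_i is the table breakpoint and xx the relative offset from it.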
179
180#define stridex		%l4
181#define stridex2	%l1
182#define stridey		%l3
183#define stridey2	%i2
184#define TBL		%l2
185#define counter		%i5
186
187#define K3		%f38
188#define K2		%f36
189#define K1		%f34
190#define K0		%f32
191#define DC0		%f4
192#define FONE		%f2
193#define FTWO		%f3
194
195#define _0x00800000	%o2
196#define _0x7f800000	%o4
197
198#define tmp0		STACK_BIAS-0x30
199#define tmp1		STACK_BIAS-0x28
200#define tmp2		STACK_BIAS-0x20
201#define tmp3		STACK_BIAS-0x18
202#define tmp_counter	STACK_BIAS-0x10
203#define tmp_px		STACK_BIAS-0x08
204
205! sizeof temp storage - must be a multiple of 16 for V9
206#define tmps		0x30
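
! tmp0..tmp2 appear to hold the packed exponent fix-up doubles (fdx0) for the
! element pairs kept in flight by the pipelined loop, tmp3 is scratch for the
! denormal fix-up paths, and tmp_counter/tmp_px stash the remaining element
! count and input pointer whenever a special-case operand forces the loop to
! be re-entered.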
207
208!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
209!      !!!!!   algorithm   !!!!!
210!  ((float*)&ddx0)[0] = *px;
211!  ax0 = *(int*)px;
212!
213!  ((float*)&ddx0)[1] = *(px + stridex);
214!  ax1 = *(int*)(px + stridex);
215!
216!  px += stridex2;
217!
218!  if ( ax0 >= 0x7f800000 )
219!  {
220!    RETURN ( FONE / ((float*)&ddx0)[0] );
221!  }
222!  if ( ax0 < 0x00800000 )
223!  {
224!    float res = ((float*)&ddx0)[0];
225!
226!    if ( (ax0 & 0x7fffffff) == 0 )  /* |X| = zero  */
227!    {
228!      RETURN ( FONE / res )
229!    }
230!    else if ( ax0 >= 0 )  /* X = denormal  */
231!    {
232!      double    res0, xx0, tbl_div0, tbl_sqrt0;
233!      float    fres0;
234!      int    iax0, si0, iexp0;
235!
236!      res = *(int*)&res;
237!      res *= FTWO;
238!      ax0 = *(int*)&res;
239!      iexp0 = ax0 >> 24;
240!      iexp0 = 0x3f + 0x4b - iexp0;
241!      iexp0 = iexp0 << 23;
242!
243!      si0 = (ax0 >> 13) & 0x7f0;
244!
245!      tbl_div0 = ((double*)((char*)__TBL_rsqrtf + si0))[0];
246!      tbl_sqrt0 = ((double*)((char*)__TBL_rsqrtf + si0))[1];
247!      iax0 = ax0 & 0x7ffe0000;
248!      iax0 = ax0 - iax0;
249!      xx0 = iax0 * tbl_div0;
250!      res0 = tbl_sqrt0 * (((A3 * xx0 + A2) * xx0 + A1) * xx0 + A0);
251!
252!      fres0 = res0;
253!      iexp0 += *(int*)&fres0;
254!      RETURN(*(float*)&iexp0)
255!    }
256!    else  /* X = negative  */
257!    {
258!      RETURN ( sqrtf(res) )
259!    }
260!  }
261!  if ( ax1 >= 0x7f800000 )
262!  {
263!    RETURN ( FONE / ((float*)&ddx0)[1] )
264!  }
265!  if ( ax1 < 0x00800000 )
266!  {
267!    float res = ((float*)&ddx0)[1];
268!    if ( (ax1 & 0x7fffffff) == 0 )  /* |X| = zero  */
269!    {
270!      RETURN ( FONE / res )
271!    }
272!    else if ( ax1 >= 0 )  /* X = denormal  */
273!    {
274!      double    res0, xx0, tbl_div0, tbl_sqrt0;
275!      float    fres0;
276!      int    iax1, si0, iexp0;
277!
278!      res = *(int*)&res;
279!      res *= FTWO;
280!      ax1 = *(int*)&res;
281!      iexp0 = ax1 >> 24;
282!      iexp0 = 0x3f + 0x4b - iexp0;
283!      iexp0 = iexp0 << 23;
284!
285!      si0 = (ax1 >> 13) & 0x7f0;
286!
287!      tbl_div0 = ((double*)((char*)__TBL_rsqrtf + si0))[0];
288!      tbl_sqrt0 = ((double*)((char*)__TBL_rsqrtf + si0))[1];
289!      iax1 = ax1 & 0x7ffe0000;
290!      iax1 = ax1 - iax1;
291!      xx0 = iax1 * tbl_div0;
292!      res0 = tbl_sqrt0 * (((A3 * xx0 + A2) * xx0 + A1) * xx0 + A0);
293!
294!      fres0 = res0;
295!      iexp0 += *(int*)&fres0;
296!      RETURN(*(float*)&iexp0)
297!    }
298!    else  /* X = negative  */
299!    {
300!      RETURN ( sqrtf(res) )
301!    }
302!  }
303!
304!  iexp0 = ax0 >> 24;
305!  iexp1 = ax1 >> 24;
306!  iexp0 = 0x3f - iexp0;
307!  iexp1 = 0x3f - iexp1;
308!  iexp1 &= 0x1ff;
309!  lexp0 = iexp0 << 55;
310!  lexp1 = iexp1 << 23;
311!
312!  lexp0 |= lexp1;
313!
314!  fdx0 = *((double*)&lexp0);
315!
316!  si0 = ax0 >> 13;
317!  si1 = ax1 >> 13;
318!  si0 &= 0x7f0;
319!  si1 &= 0x7f0;
320!
321!  addr0 = (char*)TBL + si0;
322!  addr1 = (char*)TBL + si1;
323!  tbl_div0 = ((double*)((char*)TBL + si0))[0];
324!  tbl_div1 = ((double*)((char*)TBL + si1))[0];
325!  tbl_sqrt0 = ((double*)addr0)[1];
326!  tbl_sqrt1 = ((double*)addr1)[1];
327!  dfx0 = vis_fand(ddx0,DC0);
328!  dfx0 = vis_fpsub32(ddx0,dfx0);
329!  dtmp0 = (double)(((int*)&dfx0)[0]);
330!  dtmp1 = (double)(((int*)&dfx0)[1]);
331!  xx0 = dtmp0 * tbl_div0;
332!  xx1 = dtmp1 * tbl_div1;
333!  res0 = K3 * xx0;
334!  res1 = K3 * xx1;
335!  res0 += K2;
336!  res1 += K2;
337!  res0 *= xx0;
338!  res1 *= xx1;
339!  res0 += K1;
340!  res1 += K1;
341!  res0 *= xx0;
342!  res1 *= xx1;
343!  res0 += K0;
344!  res1 += K0;
345!  res0 = tbl_sqrt0 * res0;
346!  res1 = tbl_sqrt1 * res1;
347!  ((float*)&dres0)[0] = (float)res0;
348!  ((float*)&dres0)[1] = (float)res1;
349!  dres0 = vis_fpadd32(dres0,fdx0);
350!  *py = ((float*)&dres0)[0];
351!  *(py + stridey) = ((float*)&dres0)[1];
352!  py += stridey2;
353!
354!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
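!
! For reference, a scalar C sketch of the main path above (an illustration
! only; the hypothetical helper rsqrtf_core handles one in-range element and
! none of the special cases or the two-at-a-time pipelining; TBL is the
! 128-entry table from .CONST_TBL):
!
!	#include <string.h>
!	#include <stdint.h>
!
!	static float rsqrtf_core(float x, const double *TBL)
!	{
!		const double K0 =  9.99999997962321453275e-01;
!		const double K1 = -4.99999998166077580600e-01;
!		const double K2 =  3.75066768969515586277e-01;
!		const double K3 = -3.12560092408808548438e-01;
!		uint32_t ax, lexp, si, ir;
!		double tbl_div, tbl_sqrt, xx, res;
!		float f;
!
!		memcpy(&ax, &x, sizeof (ax));
!		lexp = (0x3fu - (ax >> 24)) << 23;	/* exponent fix-up word */
!		si = (ax >> 13) & 0x7f0;		/* byte offset, 16 bytes per entry */
!		tbl_div = *(const double *)((const char *)TBL + si);
!		tbl_sqrt = *(const double *)((const char *)TBL + si + 8);
!		xx = (double)(int)(ax & 0x1ffff) * tbl_div;	/* ax - (ax & 0x7ffe0000) */
!		res = tbl_sqrt * (((K3 * xx + K2) * xx + K1) * xx + K0);
!		f = (float)res;
!		memcpy(&ir, &f, sizeof (ir));
!		ir += lexp;				/* what vis_fpadd32 does per lane */
!		memcpy(&f, &ir, sizeof (f));
!		return (f);
!	}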
355
356	ENTRY(__vrsqrtf)
357	save	%sp,-SA(MINFRAME)-tmps,%sp
358	PIC_SETUP(l7)
359	PIC_SET(l7,.CONST_TBL,l2)
360
361	st	%i0,[%fp+tmp_counter]
362	stx	%i1,[%fp+tmp_px]
363
364	ldd	[TBL+2048],K0
365	sll	%i2,2,stridex
366
367	ldd	[TBL+2048+8],K1
368	sll	%i4,2,stridey
369	mov	%i3,%i2
370
371	ldd	[TBL+2048+16],K2
372	sethi	%hi(0x7f800000),_0x7f800000
373	sll	stridex,1,stridex2
374
375	ldd	[TBL+2048+24],K3
376	sethi	%hi(0x00800000),_0x00800000
377
378	ldd	[TBL+2048+32],DC0
379	add	%g0,0x3f,%l0
380
381	ldd	[TBL+2048+40],FONE
382!	ld	[TBL+2048+44],FTWO
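!
! From the prologue above, the incoming arguments appear to be the usual
! libmvec stride form: %i0 = element count, %i1 = px, %i2 = stridex (in
! elements, scaled here to bytes), %i3 = py, %i4 = stridey (likewise
! scaled by 4).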
383.begin:
384	ld	[%fp+tmp_counter],counter
385	ldx	[%fp+tmp_px],%l7
386	st	%g0,[%fp+tmp_counter]
387.begin1:
388	cmp	counter,0
389	ble,pn	%icc,.exit
390
391	lda	[%l7]0x82,%f14		! (4_0) ((float*)&ddx0)[0] = *px;
392
393	lda	[stridex+%l7]0x82,%f15	! (5_0) ((float*)&ddx0)[1] = *(px + stridex);
394	sethi	%hi(0x7ffffc00),%o0
395
396	lda	[%l7]0x82,%g1		! (4_0) ax0 = *(int*)px;
397	add	%l7,stridex2,%i1	! px += stridex2
398	add	%o0,0x3ff,%o0
399
400	lda	[stridex+%l7]0x82,%g5	! (5_0) ax1 = *(int*)(px + stridex);
401	fand	%f14,DC0,%f16		! (4_0) dfx0 = vis_fand(ddx0,DC0);
402
403	sra	%g1,13,%l5		! (4_0) si0 = ax0 >> 13;
404	add	%i1,stridex2,%o5	! px += stridex2
405
406	cmp	%g1,_0x7f800000		! (4_1) ax0 ? 0x7f800000
407	bge,pn	%icc,.spec0		! (4_1) if ( ax0 >= 0x7f800000 )
408	nop
409
410	cmp	%g1,_0x00800000		! (4_1) ax0 ? 0x00800000
411	bl,pn	%icc,.spec1		! (4_1) if ( ax0 < 0x00800000 )
412	sra	%g5,13,%l6		! (5_0) si1 = ax1 >> 13;
413.cont_spec:
414	and	%l5,2032,%l5		! (4_0) si0 &= 0x7f0;
415
416	ldd	[%l5+TBL],%f54		! (4_0) tbl_div0 = ((double*)((char*)TBL + si0))[0];
417	sra	%g5,24,%l7		! (5_0) iexp1 = ax1 >> 24;
418	and	%l6,2032,%l6		! (5_0) si1 &= 0x7f0;
419	fpsub32	%f14,%f16,%f16		! (4_0) dfx0 = vis_fpsub32(ddx0,dfx0);
420
421	ldd	[%l6+TBL],%f46		! (5_0) tbl_div1 = ((double*)((char*)TBL + si1))[0];
422	sra	%g1,24,%i3		! (4_0) iexp0 = ax0 >> 24;
423	sub	%l0,%l7,%l7		! (5_0) iexp1 = 0x3f - iexp1;
424
425	and	%l7,511,%l1		! (5_0) iexp1 &= 0x1ff;
426	add	%l6,TBL,%l6		! (5_0) addr1 = (char*)TBL + si1;
427
428	sllx	%l1,23,%l1		! (5_0) lexp1 = iexp1 << 23;
429	sub	%l0,%i3,%o0		! (4_0) iexp0 = 0x3f - iexp0;
430	fitod	%f16,%f56		! (4_0) dtmp0 = (double)(((int*)dfx0)[0]);
431
432	sllx	%o0,55,%o0		! (4_0) lexp0 = iexp0 << 55;
433	fitod	%f17,%f44		! (5_0) dtmp1 = (double)(((int*)dfx0)[1]);
434
435	or	%o0,%l1,%o0		! (4_0) lexp0 |= lexp1;
436
437	stx	%o0,[%fp+tmp0]		! (4_0) fdx0 = *((double*)lexp0);
438
439	fmuld	%f56,%f54,%f40		! (4_0) xx0 = dtmp0 * tbl_div0;
440
441	lda	[%i1]0x82,%f18		! (0_0) ((float*)&ddx0)[0] = *px;
442	fmuld	%f44,%f46,%f46		! (5_1) xx1 = dtmp1 * tbl_div1;
443
444	lda	[stridex+%i1]0x82,%f19	! (1_0) ((float*)&ddx0)[1] = *(px + stridex);
445
446	lda	[%i1]0x82,%g1		! (0_0) ax0 = *(int*)px;
447
448	lda	[stridex+%i1]0x82,%i4	! (1_0) ax1 = *(int*)(px + stridex);
449	cmp	%g5,_0x7f800000		! (5_1) ax1 ? 0x7f800000
450	bge,pn	%icc,.update0		! (5_1) if ( ax1 >= 0x7f800000 )
451	fmuld	K3,%f40,%f52		! (4_1) res0 = K3 * xx0;
452.cont0:
453	fmuld	K3,%f46,%f50		! (5_1) res1 = K3 * xx1;
454	cmp	%g5,_0x00800000		! (5_1) ax1 ? 0x00800000
455	bl,pn	%icc,.update1		! (5_1) if ( ax1 < 0x00800000 )
456	fand	%f18,DC0,%f56		! (0_0) dfx0 = vis_fand(ddx0,DC0);
457.cont1:
458	sra	%g1,13,%o0		! (0_0) si0 = ax0 >> 13;
459	cmp	%g1,_0x7f800000		! (0_0) ax0 ? 0x7f800000
460
461	sra	%i4,13,%g5		! (1_0) si1 = ax1 >> 13;
462	and	%o0,2032,%o0		! (0_0) si0 &= 0x7f0;
463
464	ldd	[%o0+TBL],%f54		! (0_0) tbl_div0 = ((double*)((char*)TBL + si0))[0];
465	sra	%i4,24,%i1		! (1_0) iexp1 = ax1 >> 24;
466	and	%g5,2032,%o7		! (1_0) si1 &= 0x7f0;
467	fpsub32	%f18,%f56,%f30		! (0_0) dfx0 = vis_fpsub32(ddx0,dfx0);
468
469	ldd	[%o7+TBL],%f44		! (1_0) tbl_div1 = ((double*)((char*)TBL + si1))[0];
470	sra	%g1,24,%i3		! (0_0) iexp0 = ax0 >> 24;
471	sub	%l0,%i1,%i1		! (1_0) iexp1 = 0x3f - iexp1;
472	faddd	%f52,K2,%f62		! (4_1) res0 += K2;
473
474	sub	%l0,%i3,%g5		! (0_0) iexp0 = 0x3f - iexp0;
475	bge,pn	%icc,.update2		! (0_0) if ( ax0 >= 0x7f800000 )
476	faddd	%f50,K2,%f60		! (5_1) res1 += K2;
477.cont2:
478	cmp	%g1,_0x00800000		! (0_0) ax0 ? 0x00800000
479	and	%i1,511,%i0		! (1_0) iexp1 &= 0x1ff;
480	fitod	%f30,%f56		! (0_0) dtmp0 = (double)(((int*)dfx0)[0]);
481
482	sllx	%i0,23,%i0		! (1_0) lexp1 = iexp1 << 23;
483	bl,pn	%icc,.update3		! (0_0) if ( ax0 < 0x00800000 )
484	fitod	%f31,%f50		! (1_0) dtmp1 = (double)(((int*)dfx0)[1]);
485.cont3:
486	fmuld	%f62,%f40,%f30		! (4_1) res0 *= xx0;
487	sllx	%g5,55,%g5		! (0_0) lexp0 = iexp0 << 55;
488
489	fmuld	%f60,%f46,%f48		! (5_1) res1 *= xx1;
490	or	%g5,%i0,%g5		! (0_0) lexp0 |= lexp1;
491	stx	%g5,[%fp+tmp1]		! (0_0) fdx0 = *((double*)lexp0);
492
493	fmuld	%f56,%f54,%f26		! (0_0) xx0 = dtmp0 * tbl_div0;
494	sll	stridex,1,stridex2	! stridex2 = stridex * 2;
495
496	lda	[%o5]0x82,%f24		! (2_0) ((float*)&ddx0)[0] = *px;
497	add	%o7,TBL,%o7		! (1_0) addr1 = (char*)TBL + si1;
498	fmuld	%f50,%f44,%f44		! (1_0) xx1 = dtmp1 * tbl_div1;
499
500	lda	[stridex+%o5]0x82,%f25	! (3_0) ((float*)&ddx0)[1] = *(px + stridex);
501	add	%l5,TBL,%l5		! (4_1) addr0 = (char*)TBL + si0;
502	faddd	%f30,K1,%f62		! (4_1) res0 += K1;
503
504	lda	[%o5]0x82,%g1		! (2_0) ax0 = *(int*)px;
505	add	%o5,stridex2,%l7	! px += stridex2
506	faddd	%f48,K1,%f42		! (5_1) res1 += K1;
507
508	lda	[stridex+%o5]0x82,%o5	! (3_0) ax1 = *(int*)(px + stridex);
509	cmp	%i4,_0x7f800000		! (1_0) ax1 ? 0x7f800000
510	bge,pn	%icc,.update4		! (1_0) if ( ax1 >= 0x7f800000 )
511	fmuld	K3,%f26,%f52		! (0_0) res0 = K3 * xx0;
512.cont4:
513	fmuld	K3,%f44,%f50		! (1_0) res1 = K3 * xx1;
514	cmp	%i4,_0x00800000		! (1_0) ax1 ? 0x00800000
515	bl,pn	%icc,.update5		! (1_0) if ( ax1 < 0x00800000 )
516	fand	%f24,DC0,%f54		! (2_0) dfx0 = vis_fand(ddx0,DC0);
517.cont5:
518	fmuld	%f62,%f40,%f48		! (4_1) res0 *= xx0;
519	sra	%g1,13,%i0		! (2_0) si0 = ax0 >> 13;
520	cmp	%g1,_0x7f800000		! (2_0) ax0 ? 0x7f800000
521
522	fmuld	%f42,%f46,%f58		! (5_1) res1 *= xx1;
523	sra	%o5,13,%o1		! (3_0) si1 = ax1 >> 13;
524	and	%i0,2032,%i0		! (2_0) si0 &= 0x7f0;
525
526	ldd	[%i0+TBL],%f30		! (2_0) tbl_div0 = ((double*)((char*)TBL + si0))[0];
527	sra	%o5,24,%o3		! (3_0) iexp1 = ax1 >> 24;
528	and	%o1,2032,%o1		! (3_0) si1 &= 0x7f0;
529	fpsub32	%f24,%f54,%f12		! (2_0) dfx0 = vis_fpsub32(ddx0,dfx0);
530
531	ldd	[%o1+TBL],%f46		! (3_0) tbl_div1 = ((double*)((char*)TBL + si1))[0];
532	sra	%g1,24,%i3		! (2_0) iexp0 = ax0 >> 24;
533	sub	%l0,%o3,%o3		! (3_0) iexp1 = 0x3f - iexp1;
534	faddd	%f52,K2,%f40		! (0_0) res0 += K2;
535
536	ldd	[%l5+8],%f42		! (4_1) tbl_sqrt0 = ((double*)addr0)[1];
537	sub	%l0,%i3,%g5		! (2_0) iexp0 = 0x3f - iexp0;
538	and	%o3,511,%i3		! (3_0) iexp1 &= 0x1ff;
539	faddd	%f50,K2,%f60		! (1_0) res1 += K2;
540
541	ldd	[%l6+8],%f28		! (5_1) tbl_sqrt1 = ((double*)addr1)[1];
542	sllx	%g5,55,%g5		! (2_0) lexp0 = iexp0 << 55;
543	add	%i0,TBL,%i0		! (2_0) addr0 = (char*)TBL + si0;
544	fitod	%f12,%f56		! (2_0) dtmp0 = (double)(((int*)dfx0)[0]);
545
546	sllx	%i3,23,%i3		! (3_0) lexp1 = iexp1 << 23;
547	fitod	%f13,%f50		! (3_0) dtmp1 = (double)(((int*)dfx0)[1]);
548
549	fmuld	%f40,%f26,%f40		! (0_0) res0 *= xx0;
550	or	%g5,%i3,%g5		! (2_0) lexp0 |= lexp1;
551	faddd	%f48,K0,%f62		! (4_1) res0 += K0;
552
553	fmuld	%f60,%f44,%f48		! (1_0) res1 *= xx1;
554	add	%o1,TBL,%o1		! (3_0) addr1 = (char*)TBL + si1;
555	stx	%g5,[%fp+tmp2]		! (2_0) fdx0 = *((double*)lexp0);
556	faddd	%f58,K0,%f60		! (5_1) res1 += K0;
557
558	fmuld	%f56,%f30,%f30		! (2_0) xx0 = dtmp0 * tbl_div0;
559	bge,pn	%icc,.update6		! (2_0) if ( ax0 >= 0x7f800000 )
560	lda	[%l7]0x82,%f14		! (4_0) ((float*)&ddx0)[0] = *px;
561.cont6:
562	cmp	%g1,_0x00800000		! (2_0) ax0 ? 0x00800000
563	bl,pn	%icc,.update7		! (2_0) if ( ax0 < 0x00800000 )
564	nop
565.cont7:
566	fmuld	%f50,%f46,%f24		! (3_0) xx1 = dtmp1 * tbl_div1;
567
568	lda	[stridex+%l7]0x82,%f15	! (5_0) ((float*)&ddx0)[1] = *(px + stridex);
569	cmp	%o5,_0x7f800000		! (3_0) ax1 ? 0x7f800000
570	fmuld	%f42,%f62,%f58		! (4_1) res0 = tbl_sqrt0 * res0;
571	faddd	%f40,K1,%f46		! (0_0) res0 += K1;
572
573	lda	[%l7]0x82,%g1		! (4_0) ax0 = *(int*)px;
574	add	%l7,stridex2,%i1	! px += stridex2
575	fmuld	%f28,%f60,%f56		! (5_1) res1 = tbl_sqrt1 * res1;
576	faddd	%f48,K1,%f62		! (1_0) res1 += K1;
577
578	lda	[stridex+%l7]0x82,%g5	! (5_0) ax1 = *(int*)(px + stridex);
579	add	%o0,TBL,%o0		! (0_0) addr0 = (char*)TBL + si0;
580	bge,pn	%icc,.update8		! (3_0) if ( ax1 >= 0x7f800000 )
581	fmuld	K3,%f30,%f52		! (2_0) res0 = K3 * xx0;
582.cont8:
583	fmuld	K3,%f24,%f50		! (3_0) res1 = K3 * xx1;
584	cmp	%o5,_0x00800000		! (3_0) ax1 ? 0x00800000
585	bl,pn	%icc,.update9		! (3_0) if ( ax1 < 0x00800000 )
586	fand	%f14,DC0,%f16		! (4_0) dfx0 = vis_fand(ddx0,DC0);
587.cont9:
588	fmuld	%f46,%f26,%f48		! (0_0) res0 *= xx0;
589	sra	%g1,13,%l5		! (4_0) si0 = ax0 >> 13;
590	add	%i1,stridex2,%o5	! px += stridex2
591	fdtos	%f58,%f6		! (4_1) ((float*)&dres0)[0] = (float)res0;
592
593	fmuld	%f62,%f44,%f40		! (1_0) res1 *= xx1;
594	sra	%g5,13,%l6		! (5_0) si1 = ax1 >> 13;
595	and	%l5,2032,%l5		! (4_0) si0 &= 0x7f0;
596	fdtos	%f56,%f7		! (5_1) ((float*)&dres0)[1] = (float)res1;
597
598	ldd	[%l5+TBL],%f54		! (4_0) tbl_div0 = ((double*)((char*)TBL + si0))[0];
599	sra	%g5,24,%l7		! (5_0) iexp1 = ax1 >> 24;
600	and	%l6,2032,%l6		! (5_0) si1 &= 0x7f0;
601	fpsub32	%f14,%f16,%f16		! (4_0) dfx0 = vis_fpsub32(ddx0,dfx0);
602
603	ldd	[%l6+TBL],%f46		! (5_0) tbl_div1 = ((double*)((char*)TBL + si1))[0];
604	sra	%g1,24,%i3		! (4_0) iexp0 = ax0 >> 24;
605	sub	%l0,%l7,%l7		! (5_0) iexp1 = 0x3f - iexp1;
606	faddd	%f52,K2,%f58		! (2_0) res0 += K2;
607
608	ldd	[%o0+8],%f42		! (0_0) tbl_sqrt0 = ((double*)addr0)[1];
609	and	%l7,511,%l1		! (5_0) iexp1 &= 0x1ff;
610	add	%l6,TBL,%l6		! (5_0) addr1 = (char*)TBL + si1;
611	faddd	%f50,K2,%f60		! (3_0) res1 += K2;
612
613	ldd	[%o7+8],%f28		! (1_0) tbl_sqrt1 = ((double*)addr1)[1];
614	sllx	%l1,23,%l1		! (5_0) lexp1 = iexp1 << 23;
615	sub	%l0,%i3,%o0		! (4_0) iexp0 = 0x3f - iexp0;
616	fitod	%f16,%f56		! (4_0) dtmp0 = (double)(((int*)dfx0)[0]);
617
618	ldd	[%fp+tmp0],%f52		! (4_1) fdx0 = *((double*)lexp0);
619	sllx	%o0,55,%o0		! (4_0) lexp0 = iexp0 << 55;
620	fitod	%f17,%f44		! (5_0) dtmp1 = (double)(((int*)dfx0)[1]);
621
622	fmuld	%f58,%f30,%f62		! (2_0) res0 *= xx0;
623	or	%o0,%l1,%o0		! (4_0) lexp0 |= lexp1;
624	faddd	%f48,K0,%f22		! (0_0) res0 += K0;
625
626	fmuld	%f60,%f24,%f58		! (3_0) res1 *= xx1;
627	stx	%o0,[%fp+tmp0]		! (4_0) fdx0 = *((double*)lexp0);
628	faddd	%f40,K0,%f26		! (1_0) res1 += K0;
629
630	fmuld	%f56,%f54,%f40		! (4_0) xx0 = dtmp0 * tbl_div0;
631	fpadd32	%f6,%f52,%f10		! (4_1) dres0 = vis_fpadd32(dres0,fdx0);
632
633	or	%g0,%i2,%l7
634	add	stridey,stridey,stridey2
635
636	cmp	counter,6
637	bl,pn	%icc,.tail
638	nop
639
640	ba	.main_loop
641	sub	counter,6,counter	! counter -= 6;
642
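! The loop below is software-pipelined over groups of six elements
! (counter -= 6 per iteration); the (N_M) tags in the instruction comments
! appear to mean "element N of the current group, started M iterations ago",
! so loads, table lookups, polynomial steps and stores for different
! elements are interleaved to hide latency.
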
643	.align	16
644.main_loop:
645	lda	[%i1]0x82,%f18		! (0_0) ((float*)&ddx0)[0] = *px;
646	cmp	%g1,_0x7f800000		! (4_1) ax0 ? 0x7f800000
647	bge,pn	%icc,.update10		! (4_1) if ( ax0 >= 0x7f800000 )
648	fmuld	%f44,%f46,%f46		! (5_1) xx1 = dtmp1 * tbl_div1;
649.cont10:
650	lda	[stridex+%i1]0x82,%f19	! (1_0) ((float*)&ddx0)[1] = *(px + stridex);
651	cmp	%g1,_0x00800000		! (4_1) ax0 ? 0x00800000
652	fmuld	%f42,%f22,%f44		! (0_1) res0 = tbl_sqrt0 * res0;
653	faddd	%f62,K1,%f42		! (2_1) res0 += K1;
654
655	lda	[%i1]0x82,%g1		! (0_0) ax0 = *(int*)px;
656	fmuld	%f28,%f26,%f60		! (1_1) res1 = tbl_sqrt1 * res1;
657	bl,pn	%icc,.update11		! (4_1) if ( ax0 < 0x00800000 )
658	faddd	%f58,K1,%f62		! (3_1) res1 += K1;
659.cont11:
660	lda	[stridex+%i1]0x82,%i4	! (1_0) ax1 = *(int*)(px + stridex);
661	cmp	%g5,_0x7f800000		! (5_1) ax1 ? 0x7f800000
662	bge,pn	%icc,.update12		! (5_1) if ( ax1 >= 0x7f800000 )
663	fmuld	K3,%f40,%f52		! (4_1) res0 = K3 * xx0;
664.cont12:
665	fmuld	K3,%f46,%f50		! (5_1) res1 = K3 * xx1;
666	cmp	%g5,_0x00800000		! (5_1) ax1 ? 0x00800000
667	bl,pn	%icc,.update13		! (5_1) if ( ax1 < 0x00800000 )
668	fand	%f18,DC0,%f56		! (0_0) dfx0 = vis_fand(ddx0,DC0);
669.cont13:
670	fmuld	%f42,%f30,%f48		! (2_1) res0 *= xx0;
671	sra	%g1,13,%o0		! (0_0) si0 = ax0 >> 13;
672	cmp	%g1,_0x7f800000		! (0_0) ax0 ? 0x7f800000
673	fdtos	%f44,%f8		! (0_1) ((float*)&dres0)[0] = (float)res0;
674
675	fmuld	%f62,%f24,%f58		! (3_1) res1 *= xx1;
676	sra	%i4,13,%g5		! (1_0) si1 = ax1 >> 13;
677	and	%o0,2032,%o0		! (0_0) si0 &= 0x7f0;
678	fdtos	%f60,%f9		! (1_1) ((float*)&dres0)[1] = (float)res1;
679
680	ldd	[%o0+TBL],%f54		! (0_0) tbl_div0 = ((double*)((char*)TBL + si0))[0];
681	sra	%i4,24,%i1		! (1_0) iexp1 = ax1 >> 24;
682	and	%g5,2032,%o7		! (1_0) si1 &= 0x7f0;
683	fpsub32	%f18,%f56,%f30		! (0_0) dfx0 = vis_fpsub32(ddx0,dfx0);
684
685	ldd	[%o7+TBL],%f44		! (1_0) tbl_div1 = ((double*)((char*)TBL + si1))[0];
686	sra	%g1,24,%i3		! (0_0) iexp0 = ax0 >> 24;
687	sub	%l0,%i1,%i1		! (1_0) iexp1 = 0x3f - iexp1;
688	faddd	%f52,K2,%f62		! (4_1) res0 += K2;
689
690	ldd	[%i0+8],%f42		! (2_1) tbl_sqrt0 = ((double*)addr0)[1];
691	sub	%l0,%i3,%g5		! (0_0) iexp0 = 0x3f - iexp0;
692	bge,pn	%icc,.update14		! (0_0) if ( ax0 >= 0x7f800000 )
693	faddd	%f50,K2,%f60		! (5_1) res1 += K2;
694.cont14:
695	ldd	[%o1+8],%f28		! (3_1) tbl_sqrt1 = ((double*)addr1)[1];
696	cmp	%g1,_0x00800000		! (0_0) ax0 ? 0x00800000
697	and	%i1,511,%i0		! (1_0) iexp1 &= 0x1ff;
698	fitod	%f30,%f56		! (0_0) dtmp0 = (double)(((int*)dfx0)[0]);
699
700	ldd	[%fp+tmp1],%f52		! (0_1) fdx0 = *((double*)lexp0);
701	sllx	%i0,23,%i0		! (1_0) lexp1 = iexp1 << 23;
702	bl,pn	%icc,.update15		! (0_0) if ( ax0 < 0x00800000 )
703	fitod	%f31,%f50		! (1_0) dtmp1 = (double)(((int*)dfx0)[1]);
704.cont15:
705	fmuld	%f62,%f40,%f30		! (4_1) res0 *= xx0;
706	sllx	%g5,55,%g5		! (0_0) lexp0 = iexp0 << 55;
707	st	%f10,[%l7]		! (4_2) *py = ((float*)&dres0)[0];
708	faddd	%f48,K0,%f62		! (2_1) res0 += K0;
709
710	fmuld	%f60,%f46,%f48		! (5_1) res1 *= xx1;
711	or	%g5,%i0,%g5		! (0_0) lexp0 |= lexp1;
712	stx	%g5,[%fp+tmp1]		! (0_0) fdx0 = *((double*)lexp0);
713	faddd	%f58,K0,%f60		! (3_1) res1 += K0;
714
715	fmuld	%f56,%f54,%f26		! (0_0) xx0 = dtmp0 * tbl_div0;
716	sll	stridex,1,stridex2	! stridex2 = stridex * 2;
717	st	%f11,[stridey+%l7]	! (5_2) *(py + stridey) = ((float*)&dres0)[1];
718	fpadd32	%f8,%f52,%f10		! (0_1) dres0 = vis_fpadd32(dres0,fdx0);
719
720	lda	[%o5]0x82,%f24		! (2_0) ((float*)&ddx0)[0] = *px;
721	add	%l7,stridey2,%i1	! py += stridey2
722	add	%o7,TBL,%o7		! (1_0) addr1 = (char*)TBL + si1;
723	fmuld	%f50,%f44,%f44		! (1_0) xx1 = dtmp1 * tbl_div1;
724
725	lda	[stridex+%o5]0x82,%f25	! (3_0) ((float*)&ddx0)[1] = *(px + stridex);
726	add	%l5,TBL,%l5		! (4_1) addr0 = (char*)TBL + si0;
727	fmuld	%f42,%f62,%f58		! (2_1) res0 = tbl_sqrt0 * res0;
728	faddd	%f30,K1,%f62		! (4_1) res0 += K1;
729
730	lda	[%o5]0x82,%g1		! (2_0) ax0 = *(int*)px;
731	add	%o5,stridex2,%l7	! px += stridex2
732	fmuld	%f28,%f60,%f56		! (3_1) res1 = tbl_sqrt1 * res1;
733	faddd	%f48,K1,%f42		! (5_1) res1 += K1;
734
735	lda	[stridex+%o5]0x82,%o5	! (3_0) ax1 = *(int*)(px + stridex);
736	cmp	%i4,_0x7f800000		! (1_0) ax1 ? 0x7f800000
737	bge,pn	%icc,.update16		! (1_0) if ( ax1 >= 0x7f800000 )
738	fmuld	K3,%f26,%f52		! (0_0) res0 = K3 * xx0;
739.cont16:
740	fmuld	K3,%f44,%f50		! (1_0) res1 = K3 * xx1;
741	cmp	%i4,_0x00800000		! (1_0) ax1 ? 0x00800000
742	bl,pn	%icc,.update17		! (1_0) if ( ax1 < 0x00800000 )
743	fand	%f24,DC0,%f54		! (2_0) dfx0 = vis_fand(ddx0,DC0);
744.cont17:
745	fmuld	%f62,%f40,%f48		! (4_1) res0 *= xx0;
746	sra	%g1,13,%i0		! (2_0) si0 = ax0 >> 13;
747	cmp	%g1,_0x7f800000		! (2_0) ax0 ? 0x7f800000
748	fdtos	%f58,%f20		! (2_1) ((float*)&dres0)[0] = (float)res0;
749
750	fmuld	%f42,%f46,%f58		! (5_1) res1 *= xx1;
751	sra	%o5,13,%o1		! (3_0) si1 = ax1 >> 13;
752	and	%i0,2032,%i0		! (2_0) si0 &= 0x7f0;
753	fdtos	%f56,%f21		! (3_1) ((float*)&dres0)[1] = (float)res1;
754
755	ldd	[%i0+TBL],%f30		! (2_0) tbl_div0 = ((double*)((char*)TBL + si0))[0];
756	sra	%o5,24,%o3		! (3_0) iexp1 = ax1 >> 24;
757	and	%o1,2032,%o1		! (3_0) si1 &= 0x7f0;
758	fpsub32	%f24,%f54,%f12		! (2_0) dfx0 = vis_fpsub32(ddx0,dfx0);
759
760	ldd	[%o1+TBL],%f46		! (3_0) tbl_div1 = ((double*)((char*)TBL + si1))[0];
761	sra	%g1,24,%i3		! (2_0) iexp0 = ax0 >> 24;
762	sub	%l0,%o3,%o3		! (3_0) iexp1 = 0x3f - iexp1;
763	faddd	%f52,K2,%f40		! (0_0) res0 += K2;
764
765	ldd	[%l5+8],%f42		! (4_1) tbl_sqrt0 = ((double*)addr0)[1];
766	sub	%l0,%i3,%g5		! (2_0) iexp0 = 0x3f - iexp0;
767	and	%o3,511,%i3		! (3_0) iexp1 &= 0x1ff;
768	faddd	%f50,K2,%f60		! (1_0) res1 += K2;
769
770	ldd	[%l6+8],%f28		! (5_1) tbl_sqrt1 = ((double*)addr1)[1];
771	sllx	%g5,55,%g5		! (2_0) lexp0 = iexp0 << 55;
772	add	%i0,TBL,%i0		! (2_0) addr0 = (char*)TBL + si0;
773	fitod	%f12,%f56		! (2_0) dtmp0 = (double)(((int*)dfx0)[0]);
774
775	ldd	[%fp+tmp2],%f52		! (2_1) fdx0 = *((double*)lexp0);
776	sllx	%i3,23,%i3		! (3_0) lexp1 = iexp1 << 23;
777	add	%i1,stridey2,%o3	! py += stridey2
778	fitod	%f13,%f50		! (3_0) dtmp1 = (double)(((int*)dfx0)[1]);
779
780	fmuld	%f40,%f26,%f40		! (0_0) res0 *= xx0;
781	or	%g5,%i3,%g5		! (2_0) lexp0 |= lexp1;
782	st	%f10,[%i1]		! (0_1) *py = ((float*)&dres0)[0];
783	faddd	%f48,K0,%f62		! (4_1) res0 += K0;
784
785	fmuld	%f60,%f44,%f48		! (1_0) res1 *= xx1;
786	add	%o1,TBL,%o1		! (3_0) addr1 = (char*)TBL + si1;
787	stx	%g5,[%fp+tmp2]		! (2_0) fdx0 = *((double*)lexp0);
788	faddd	%f58,K0,%f60		! (5_1) res1 += K0;
789
790	fmuld	%f56,%f30,%f30		! (2_0) xx0 = dtmp0 * tbl_div0;
791	bge,pn	%icc,.update18		! (2_0) if ( ax0 >= 0x7f800000 )
792	st	%f11,[stridey+%i1]	! (1_1) *(py + stridey) = ((float*)&dres0)[1];
793	fpadd32	%f20,%f52,%f0		! (2_1) dres0 = vis_fpadd32(dres0,fdx0);
794.cont18:
795	cmp	%g1,_0x00800000		! (2_0) ax0 ? 0x00800000
796	bl,pn	%icc,.update19		! (2_0) if ( ax0 < 0x00800000 )
797	lda	[%l7]0x82,%f14		! (4_0) ((float*)&ddx0)[0] = *px;
798	fmuld	%f50,%f46,%f24		! (3_0) xx1 = dtmp1 * tbl_div1;
799.cont19:
800	lda	[stridex+%l7]0x82,%f15	! (5_0) ((float*)&ddx0)[1] = *(px + stridex);
801	cmp	%o5,_0x7f800000		! (3_0) ax1 ? 0x7f800000
802	fmuld	%f42,%f62,%f58		! (4_1) res0 = tbl_sqrt0 * res0;
803	faddd	%f40,K1,%f46		! (0_0) res0 += K1;
804
805	lda	[%l7]0x82,%g1		! (4_0) ax0 = *(int*)px;
806	add	%l7,stridex2,%i1	! px += stridex2
807	fmuld	%f28,%f60,%f56		! (5_1) res1 = tbl_sqrt1 * res1;
808	faddd	%f48,K1,%f62		! (1_0) res1 += K1;
809
810	lda	[stridex+%l7]0x82,%g5	! (5_0) ax1 = *(int*)(px + stridex);
811	add	%o0,TBL,%o0		! (0_0) addr0 = (char*)TBL + si0;
812	bge,pn	%icc,.update20		! (3_0) if ( ax1 >= 0x7f800000 )
813	fmuld	K3,%f30,%f52		! (2_0) res0 = K3 * xx0;
814.cont20:
815	fmuld	K3,%f24,%f50		! (3_0) res1 = K3 * xx1;
816	cmp	%o5,_0x00800000		! (3_0) ax1 ? 0x00800000
817	bl,pn	%icc,.update21		! (3_0) if ( ax1 < 0x00800000 )
818	fand	%f14,DC0,%f16		! (4_0) dfx0 = vis_fand(ddx0,DC0);
819.cont21:
820	fmuld	%f46,%f26,%f48		! (0_0) res0 *= xx0;
821	sra	%g1,13,%l5		! (4_0) si0 = ax0 >> 13;
822	add	%i1,stridex2,%o5	! px += stridex2
823	fdtos	%f58,%f6		! (4_1) ((float*)&dres0)[0] = (float)res0;
824
825	fmuld	%f62,%f44,%f40		! (1_0) res1 *= xx1;
826	sra	%g5,13,%l6		! (5_0) si1 = ax1 >> 13;
827	and	%l5,2032,%l5		! (4_0) si0 &= 0x7f0;
828	fdtos	%f56,%f7		! (5_1) ((float*)&dres0)[1] = (float)res1;
829
830	ldd	[%l5+TBL],%f54		! (4_0) tbl_div0 = ((double*)((char*)TBL + si0))[0];
831	sra	%g5,24,%l7		! (5_0) iexp1 = ax1 >> 24;
832	and	%l6,2032,%l6		! (5_0) si1 &= 0x7f0;
833	fpsub32	%f14,%f16,%f16		! (4_0) dfx0 = vis_fpsub32(ddx0,dfx0);
834
835	ldd	[%l6+TBL],%f46		! (5_0) tbl_div1 = ((double*)((char*)TBL + si1))[0];
836	sra	%g1,24,%i3		! (4_0) iexp0 = ax0 >> 24;
837	sub	%l0,%l7,%l7		! (5_0) iexp1 = 0x3f - iexp1;
838	faddd	%f52,K2,%f58		! (2_0) res0 += K2;
839
840	ldd	[%o0+8],%f42		! (0_0) tbl_sqrt0 = ((double*)addr0)[1];
841	and	%l7,511,%l1		! (5_0) iexp1 &= 0x1ff;
842	add	%l6,TBL,%l6		! (5_0) addr1 = (char*)TBL + si1;
843	faddd	%f50,K2,%f60		! (3_0) res1 += K2;
844
845	ldd	[%o7+8],%f28		! (1_0) tbl_sqrt1 = ((double*)addr1)[1];
846	sllx	%l1,23,%l1		! (5_0) lexp1 = iexp1 << 23;
847	sub	%l0,%i3,%o0		! (4_0) iexp0 = 0x3f - iexp0;
848	fitod	%f16,%f56		! (4_0) dtmp0 = (double)(((int*)dfx0)[0]);
849
850	ldd	[%fp+tmp0],%f52		! (4_1) fdx0 = *((double*)lexp0);
851	sllx	%o0,55,%o0		! (4_0) lexp0 = iexp0 << 55;
852	add	%o3,stridey2,%l7	! py += stridey2
853	fitod	%f17,%f44		! (5_0) dtmp1 = (double)(((int*)dfx0)[1]);
854
855	fmuld	%f58,%f30,%f62		! (2_0) res0 *= xx0;
856	or	%o0,%l1,%o0		! (4_0) lexp0 |= lexp1;
857	st	%f0,[%o3]		! (2_1) *py = ((float*)&dres0)[0];
858	faddd	%f48,K0,%f22		! (0_0) res0 += K0;
859
860	fmuld	%f60,%f24,%f58		! (3_0) res1 *= xx1;
861	subcc	counter,6,counter	! counter -= 6;
862	stx	%o0,[%fp+tmp0]		! (4_0) fdx0 = *((double*)lexp0);
863	faddd	%f40,K0,%f26		! (1_0) res1 += K0;
864
865	fmuld	%f56,%f54,%f40		! (4_0) xx0 = dtmp0 * tbl_div0;
866	st	%f1,[stridey+%o3]	! (3_1) *(py + stridey) = ((float*)&dres0)[1];
867	bpos,pt	%icc,.main_loop
868	fpadd32	%f6,%f52,%f10		! (4_1) dres0 = vis_fpadd32(dres0,fdx0);
869
870	add	counter,6,counter
871.tail:
872	sll	stridex,1,stridex2
873	subcc	counter,1,counter
874	bneg,a	.begin
875	mov	%l7,%i2
876
877	fmuld	%f42,%f22,%f44		! (0_1) res0 = tbl_sqrt0 * res0;
878	faddd	%f62,K1,%f42		! (2_1) res0 += K1;
879
880	fmuld	%f28,%f26,%f60		! (1_1) res1 = tbl_sqrt1 * res1;
881
882	fmuld	%f42,%f30,%f48		! (2_1) res0 *= xx0;
883	fdtos	%f44,%f8		! (0_1) ((float*)&dres0)[0] = (float)res0;
884
885	fdtos	%f60,%f9		! (1_1) ((float*)&dres0)[1] = (float)res1;
886
887	ldd	[%i0+8],%f42		! (2_1) tbl_sqrt0 = ((double*)addr0)[1];
888
889	ldd	[%fp+tmp1],%f52		! (0_1) fdx0 = *((double*)lexp0);
890
891	st	%f10,[%l7]		! (4_2) *py = ((float*)&dres0)[0];
892	subcc	counter,1,counter
893	bneg,a	.begin
894	add	%l7,stridey,%i2
895
896	faddd	%f48,K0,%f62		! (2_1) res0 += K0;
897	st	%f11,[stridey+%l7]	! (5_2) *(py + stridey) = ((float*)&dres0)[1];
898	subcc	counter,1,counter
899	bneg,a	.begin
900	add	%l7,stridey2,%i2
901	fpadd32	%f8,%f52,%f10		! (0_1) dres0 = vis_fpadd32(dres0,fdx0);
902
903	add	%l7,stridey2,%i1	! py += stridey2
904
905	fmuld	%f42,%f62,%f58		! (2_1) res0 = tbl_sqrt0 * res0;
906
907	fdtos	%f58,%f20		! (2_1) ((float*)&dres0)[0] = (float)res0;
908
909	ldd	[%fp+tmp2],%f52		! (2_1) fdx0 = *((double*)lexp0);
910	add	%i1,stridey2,%o3	! py += stridey2
911
912	st	%f10,[%i1]		! (0_1) *py = ((float*)&dres0)[0];
913	subcc	counter,1,counter
914	bneg,a	.begin
915	add	%i1,stridey,%i2
916
917	st	%f11,[stridey+%i1]	! (1_1) *(py + stridey) = ((float*)&dres0)[1];
918	subcc	counter,1,counter
919	bneg,a	.begin
920	mov	%o3,%i2
921	fpadd32	%f20,%f52,%f0		! (2_1) dres0 = vis_fpadd32(dres0,fdx0);
922
923	st	%f0,[%o3]		! (2_1) *py = ((float*)&dres0)[0];
924	ba	.begin
925	add	%o3,stridey,%i2
926
927	.align	16
928.spec0:
929	fdivs	FONE,%f14,%f14		! x0 = FONE / x0;
930	add	%l7,stridex,%l7		! px += stridex
931	st	%f14,[%i2]		! *py = x0;
932	sub	counter,1,counter
933	ba	.begin1
934	add	%i2,stridey,%i2		! py += stridey
935
936	.align	16
937.spec1:
938	andcc	%g1,%o0,%g0
939	bz,a	1f
940	fdivs	FONE,%f14,%f14		! x0 = FONE / x0;
941
942	cmp	%g1,0
943	bl,a	1f
944	fsqrts	%f14,%f14		! x0 = sqrtf(x0);
945
946	fitod	%f14,%f0
947	fdtos	%f0,%f14
948	fmuls	%f14,FTWO,%f14
949	st	%f14,[%fp+tmp3]
950	ld	[%fp+tmp3],%g1
951	sethi	%hi(0x4b000000),%o0
952	sra	%g1,13,%l5		! (4_0) si0 = ax0 >> 13;
953	fands	%f14,DC0,%f16		! (4_0) dfx0 = vis_fand(ddx0,DC0);
954	ba	.cont_spec
955	sub	%g1,%o0,%g1
9561:
957	add	%l7,stridex,%l7		! px += stridex
958	sub	counter,1,counter
959	st	%f14,[%i2]		! *py = x0;
960	ba	.begin1
961	add	%i2,stridey,%i2		! py += stridey
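
! Denormal rebias in .spec1 above: fitod/fdtos turns the raw denormal bits b
! into the float value b, and the multiply by FTWO gives 2*b, so the original
! argument is x = (2*b) * 2^-150.  Subtracting 0x4b000000 then lowers the
! exponent field of that float's bit pattern by 150 (0x4b000000 >> 23 == 150),
! after which the shared .cont_spec path yields 1/sqrt(x) = 2^75 / sqrt(2*b)
! with no further adjustment (0x4b == 75).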
962
963	.align	16
964.update0:
965	cmp	counter,1
966	ble	.cont0
967	nop
968
969	sub	%i1,stridex,%o1
970	stx	%o1,[%fp+tmp_px]
971
972	sub	counter,1,counter
973	st	counter,[%fp+tmp_counter]
974
975	ba	.cont0
976	mov	1,counter
977
978	.align	16
979.update1:
980	sethi	%hi(0x7ffffc00),%o0
981	cmp	counter,1
982	ble	.cont1
983
984	add	%o0,0x3ff,%o0
985
986	andcc	%g5,%o0,%g0
987	bz,a	1f
988	nop
989
990	cmp	%g5,0
991	bl,a	1f
992	nop
993
994	fitod	%f15,%f0
995	fdtos	%f0,%f15
996	fmuls	%f15,FTWO,%f15
997	st	%f15,[%fp+tmp3]
998	ld	[%fp+tmp3],%g5
999	sethi	%hi(0x4b000000),%o0
1000	sub	%g5,%o0,%g5
1001
1002	fands	%f15,DC0,%f17		! (4_0) dfx0 = vis_fand(ddx0,DC0);
1003
1004	sra	%g5,13,%l6		! (5_0) si1 = ax1 >> 13;
1005
1006	sra	%g5,24,%l7		! (5_0) iexp1 = ax1 >> 24;
1007	and	%l6,2032,%l6		! (5_0) si1 &= 0x7f0;
1008
1009	fpsub32s	%f15,%f17,%f17	! (4_0) dfx0 = vis_fpsub32(ddx0,dfx0);
1010
1011	ldd	[%l6+TBL],%f46		! (5_0) tbl_div1 = ((double*)((char*)TBL + si1))[0];
1012	sub	%l0,%l7,%l1		! (5_0) iexp1 = 0x3f - iexp1;
1013
1014	sll	%l1,23,%l1		! (5_0) lexp1 = iexp1 << 23;
1015	add	%l6,TBL,%l6		! (5_0) addr1 = (char*)TBL + si1;
1016	st	%l1,[%fp+tmp0+4]	! (4_0) fdx0 = *((double*)lexp0);
1017	fitod	%f17,%f44		! (5_0) dtmp1 = (double)(((int*)dfx0)[1]);
1018
1019	fmuld	%f44,%f46,%f46		! (5_1) xx1 = dtmp1 * tbl_div1;
1020
1021	ba	.cont1
1022	fmuld	K3,%f46,%f50		! (5_1) res1 = K3 * xx1;
10231:
1024	sub	%i1,stridex,%o1
1025	stx	%o1,[%fp+tmp_px]
1026
1027	sub	counter,1,counter
1028	st	counter,[%fp+tmp_counter]
1029
1030	ba	.cont1
1031	mov	1,counter
1032
1033	.align	16
1034.update2:
1035	cmp	counter,2
1036	ble	.cont2
1037	sub	%o5,stridex,%o1
1038
1039	sub	%o1,stridex,%o1
1040	stx	%o1,[%fp+tmp_px]
1041
1042	sub	counter,2,counter
1043	st	counter,[%fp+tmp_counter]
1044
1045	ba	.cont2
1046	mov	2,counter
1047
1048	.align	16
1049.update3:
1050	sethi	%hi(0x7ffffc00),%o1
1051	cmp	counter,2
1052	ble	.cont3
1053
1054	add	%o1,0x3ff,%o1
1055
1056	andcc	%g1,%o1,%g0
1057	bz,a	1f
1058	sub	%o5,stridex,%o1
1059
1060	cmp	%g1,0
1061	bl,a	1f
1062	sub	%o5,stridex,%o1
1063
1064	fitod	%f18,%f0
1065	fdtos	%f0,%f18
1066	fmuls	%f18,FTWO,%f18
1067	st	%f18,[%fp+tmp3]
1068	ld	[%fp+tmp3],%g1
1069	sethi	%hi(0x4b000000),%o1
1070	sub	%g1,%o1,%g1
1071
1072	fand	%f18,DC0,%f56		! (0_0) dfx0 = vis_fand(ddx0,DC0);
1073	sra	%g1,13,%o0		! (0_0) si0 = ax0 >> 13;
1074
1075	and	%o0,2032,%o0		! (0_0) si0 &= 0x7f0;
1076
1077	ldd	[%o0+TBL],%f54		! (0_0) tbl_div0 = ((double*)((char*)TBL + si0))[0];
1078	fpsub32	%f18,%f56,%f30		! (0_0) dfx0 = vis_fpsub32(ddx0,dfx0);
1079
1080	sra	%g1,24,%i3		! (0_0) iexp0 = ax0 >> 24;
1081	sub	%l0,%i3,%g5		! (0_0) iexp0 = 0x3f - iexp0;
1082	ba	.cont3
1083	fitod	%f30,%f56		! (0_0) dtmp0 = (double)(((int*)dfx0)[0]);
10841:
1085	sub	%o1,stridex,%o1
1086	stx	%o1,[%fp+tmp_px]
1087
1088	sub	counter,2,counter
1089	st	counter,[%fp+tmp_counter]
1090
1091	ba	.cont3
1092	mov	2,counter
1093
1094	.align	16
1095.update4:
1096	cmp	counter,3
1097	ble	.cont4
1098	sub	%l7,stridex2,%o1
1099
1100	sub	%o1,stridex,%o1
1101	stx	%o1,[%fp+tmp_px]
1102
1103	sub	counter,3,counter
1104	st	counter,[%fp+tmp_counter]
1105
1106	ba	.cont4
1107	mov	3,counter
1108
1109	.align	16
1110.update5:
1111	sethi	%hi(0x7ffffc00),%o1
1112	cmp	counter,3
1113	ble	.cont5
1114
1115	add	%o1,0x3ff,%o1
1116
1117	andcc	%i4,%o1,%g0
1118	bz,a	1f
1119	sub	%l7,stridex2,%o1
1120
1121	cmp	%i4,0
1122	bl,a	1f
1123	sub	%l7,stridex2,%o1
1124
1125	fitod	%f19,%f0
1126	fdtos	%f0,%f19
1127	fmuls	%f19,FTWO,%f19
1128	st	%f19,[%fp+tmp3]
1129	ld	[%fp+tmp3],%i4
1130	sethi	%hi(0x4b000000),%o1
1131	sub	%i4,%o1,%i4
1132
1133	fands	%f19,DC0,%f0		! (0_0) dfx0 = vis_fand(ddx0,DC0);
1134
1135	sra	%i4,13,%g5		! (1_0) si1 = ax1 >> 13;
1136
1137	sra	%i4,24,%i1		! (1_0) iexp1 = ax1 >> 24;
1138	and	%g5,2032,%o7		! (1_0) si1 &= 0x7f0;
1139	fpsub32s	%f19,%f0,%f31	! (0_0) dfx0 = vis_fpsub32(ddx0,dfx0);
1140
1141	ldd	[%o7+TBL],%f44		! (1_0) tbl_div1 = ((double*)((char*)TBL + si1))[0];
1142	sub	%l0,%i1,%i0		! (1_0) iexp1 = 0x3f - iexp1;
1143
1144	sll	%i0,23,%i0		! (1_0) lexp1 = iexp1 << 23;
1145	fitod	%f31,%f50		! (1_0) dtmp1 = (double)(((int*)dfx0)[1]);
1146
1147	st	%i0,[%fp+tmp1+4]	! (0_0) fdx0 = *((double*)lexp0);
1148
1149	add	%o7,TBL,%o7		! (1_0) addr1 = (char*)TBL + si1;
1150	fmuld	%f50,%f44,%f44		! (1_0) xx1 = dtmp1 * tbl_div1;
1151
1152	ba	.cont5
1153	fmuld	K3,%f44,%f50		! (1_0) res1 = K3 * xx1;
11541:
1155	sub	%o1,stridex,%o1
1156	stx	%o1,[%fp+tmp_px]
1157
1158	sub	counter,3,counter
1159	st	counter,[%fp+tmp_counter]
1160
1161	ba	.cont5
1162	mov	3,counter
1163
1164	.align	16
1165.update6:
1166	cmp	counter,4
1167	ble	.cont6
1168	sub	%l7,stridex,%o3
1169
1170	sub	%o3,stridex,%o3
1171	stx	%o3,[%fp+tmp_px]
1172
1173	sub	counter,4,counter
1174	st	counter,[%fp+tmp_counter]
1175
1176	ba	.cont6
1177	mov	4,counter
1178
1179	.align	16
1180.update7:
1181	sethi	%hi(0x7ffffc00),%o3
1182	cmp	counter,4
1183	ble	.cont7
1184
1185	add	%o3,0x3ff,%o3
1186
1187	andcc	%g1,%o3,%g0
1188	bz,a	1f
1189	sub	%l7,stridex,%o3
1190
1191	cmp	%g1,0
1192	bl,a	1f
1193	sub	%l7,stridex,%o3
1194
1195	fitod	%f24,%f0
1196	fdtos	%f0,%f24
1197	fmuls	%f24,FTWO,%f24
1198	st	%f24,[%fp+tmp3]
1199	ld	[%fp+tmp3],%g1
1200	sethi	%hi(0x4b000000),%o3
1201	sub	%g1,%o3,%g1
1202
1203	fands	%f24,DC0,%f0		! (2_0) dfx0 = vis_fand(ddx0,DC0);
1204	sra	%g1,13,%i0		! (2_0) si0 = ax0 >> 13;
1205
1206	and	%i0,2032,%i0		! (2_0) si0 &= 0x7f0;
1207
1208	ldd	[%i0+TBL],%f30		! (2_0) tbl_div0 = ((double*)((char*)TBL + si0))[0];
1209	fpsub32s	%f24,%f0,%f12	! (2_0) dfx0 = vis_fpsub32(ddx0,dfx0);
1210
1211	sra	%g1,24,%i3		! (2_0) iexp0 = ax0 >> 24;
1212
1213	sub	%l0,%i3,%g5		! (2_0) iexp0 = 0x3f - iexp0;
1214
1215	sll	%g5,23,%g5		! (2_0) lexp0 = iexp0 << 55;
1216	add	%i0,TBL,%i0		! (2_0) addr0 = (char*)TBL + si0;
1217	fitod	%f12,%f56		! (2_0) dtmp0 = (double)(((int*)dfx0)[0]);
1218
1219	st	%g5,[%fp+tmp2]		! (2_0) fdx0 = *((double*)lexp0);
1220	ba	.cont7
1221	fmuld	%f56,%f30,%f30		! (2_0) xx0 = dtmp0 * tbl_div0;
12221:
1223	sub	%o3,stridex,%o3
1224	stx	%o3,[%fp+tmp_px]
1225
1226	sub	counter,4,counter
1227	st	counter,[%fp+tmp_counter]
1228
1229	ba	.cont7
1230	mov	4,counter
1231
1232	.align	16
1233.update8:
1234	cmp	counter,5
1235	ble	.cont8
1236	nop
1237
1238	sub	%l7,stridex,%o3
1239	stx	%o3,[%fp+tmp_px]
1240
1241	sub	counter,5,counter
1242	st	counter,[%fp+tmp_counter]
1243
1244	ba	.cont8
1245	mov	5,counter
1246
1247	.align	16
1248.update9:
1249	sethi	%hi(0x7ffffc00),%o3
1250	cmp	counter,5
1251	ble	.cont9
1252	sub	%l7,stridex,%i3
1253
1254	add	%o3,0x3ff,%o3
1255
1256	andcc	%o5,%o3,%g0
1257	bz	1f
1258	ld	[%i3],%f0
1259
1260	cmp	%o5,0
1261	bl,a	1f
1262	nop
1263
1264	fitod	%f0,%f0
1265	fdtos	%f0,%f0
1266	fmuls	%f0,FTWO,%f0
1267	st	%f0,[%fp+tmp3]
1268	ld	[%fp+tmp3],%o5
1269	sethi	%hi(0x4b000000),%o3
1270	sub	%o5,%o3,%o5
1271
1272	fands	%f0,DC0,%f8		! (2_0) dfx0 = vis_fand(ddx0,DC0);
1273
1274	sra	%o5,13,%o1		! (3_0) si1 = ax1 >> 13;
1275
1276	sra	%o5,24,%o3		! (3_0) iexp1 = ax1 >> 24;
1277	and	%o1,2032,%o1		! (3_0) si1 &= 0x7f0;
1278	fpsub32s	%f0,%f8,%f0	! (2_0) dfx0 = vis_fpsub32(ddx0,dfx0);
1279
1280	ldd	[%o1+TBL],%f8		! (3_0) tbl_div1 = ((double*)((char*)TBL + si1))[0];
1281	sub	%l0,%o3,%i3		! (3_0) iexp1 = 0x3f - iexp1;
1282
1283	sllx	%i3,23,%i3		! (3_0) lexp1 = iexp1 << 23;
1284	fitod	%f0,%f50		! (3_0) dtmp1 = (double)(((int*)dfx0)[1]);
1285
1286	add	%o1,TBL,%o1		! (3_0) addr1 = (char*)TBL + si1;
1287	st	%i3,[%fp+tmp2+4]	! (2_0) fdx0 = *((double*)lexp0);
1288
1289	fmuld	%f50,%f8,%f24		! (3_0) xx1 = dtmp1 * tbl_div1;
1290
1291	ba	.cont9
1292	fmuld	K3,%f24,%f50		! (3_0) res1 = K3 * xx1;
12931:
1294	stx	%i3,[%fp+tmp_px]
1295
1296	sub	counter,5,counter
1297	st	counter,[%fp+tmp_counter]
1298
1299	ba	.cont9
1300	mov	5,counter
1301
1302	.align	16
1303.update10:
1304	cmp	counter,0
1305	ble	.cont10
1306	sub	%i1,stridex,%o3
1307
1308	sub	%o3,stridex,%o3
1309	stx	%o3,[%fp+tmp_px]
1310
1311	st	counter,[%fp+tmp_counter]
1312
1313	ba	.cont10
1314	mov	0,counter
1315
1316	.align	16
1317.update11:
1318	sethi	%hi(0x7ffffc00),%i4
1319	cmp	counter,0
1320	ble	.cont11
1321	sub	%i1,stridex,%o3
1322
1323	sub	%o3,stridex,%o3
1324	add	%i4,0x3ff,%i4
1325	ld	[%o3],%i3
1326
1327	andcc	%i3,%i4,%g0
1328	bz	1f
1329
1330	cmp	%i3,0
1331	bl,a	1f
1332	nop
1333
1334	fitod	%f14,%f0
1335	fdtos	%f0,%f14
1336	fmuls	%f14,FTWO,%f14
1337	st	%f14,[%fp+tmp3]
1338	ld	[%fp+tmp3],%i3
1339	sethi	%hi(0x4b000000),%o3
1340	sub	%i3,%o3,%i3
1341
1342	fands	%f14,DC0,%f16		! (4_0) dfx0 = vis_fand(ddx0,DC0);
1343	sra	%i3,13,%l5		! (4_0) si0 = ax0 >> 13;
1344
1345	and	%l5,2032,%l5		! (4_0) si0 &= 0x7f0;
1346
1347	ldd	[%l5+TBL],%f54		! (4_0) tbl_div0 = ((double*)((char*)TBL + si0))[0];
1348	fpsub32s	%f14,%f16,%f16	! (4_0) dfx0 = vis_fpsub32(ddx0,dfx0);
1349
1350	sra	%i3,24,%i3		! (4_0) iexp0 = ax0 >> 24;
1351
1352	sub	%l0,%i3,%o0		! (4_0) iexp0 = 0x3f - iexp0;
1353	fitod	%f16,%f56		! (4_0) dtmp0 = (double)(((int*)dfx0)[0]);
1354
1355	sllx	%o0,23,%o0		! (4_0) lexp0 = iexp0 << 55;
1356
1357	st	%o0,[%fp+tmp0]		! (4_0) fdx0 = *((double*)lexp0);
1358
1359	ba	.cont11
1360	fmuld	%f56,%f54,%f40		! (4_0) xx0 = dtmp0 * tbl_div0;
13611:
1362	stx	%o3,[%fp+tmp_px]
1363
1364	st	counter,[%fp+tmp_counter]
1365
1366	ba	.cont11
1367	mov	0,counter
1368
1369	.align	16
1370.update12:
1371	cmp	counter,1
1372	ble	.cont12
1373	nop
1374
1375	sub	%i1,stridex,%i1
1376	stx	%i1,[%fp+tmp_px]
1377
1378	sub	counter,1,counter
1379	st	counter,[%fp+tmp_counter]
1380
1381	ba	.cont12
1382	mov	1,counter
1383
1384	.align	16
1385.update13:
1386	sethi	%hi(0x7ffffc00),%o3
1387	cmp	counter,1
1388	ble	.cont13
1389
1390	add	%o3,0x3ff,%o3
1391
1392	andcc	%g5,%o3,%g0
1393	bz	1f
1394
1395	cmp	%g5,0
1396	bl,a	1f
1397	nop
1398
1399	fitod	%f15,%f0
1400	fdtos	%f0,%f15
1401	fmuls	%f15,FTWO,%f15
1402	st	%f15,[%fp+tmp3]
1403	ld	[%fp+tmp3],%g5
1404	sethi	%hi(0x4b000000),%o3
1405	sub	%g5,%o3,%g5
1406
1407	fands	%f15,DC0,%f17		! (4_0) dfx0 = vis_fand(ddx0,DC0);
1408
1409	sra	%g5,13,%l6		! (5_0) si1 = ax1 >> 13;
1410	sra	%g5,24,%o3		! (5_0) iexp1 = ax1 >> 24;
1411	and	%l6,2032,%l6		! (5_0) si1 &= 0x7f0;
1412	fpsub32s	%f15,%f17,%f17	! (4_0) dfx0 = vis_fpsub32(ddx0,dfx0);
1413
1414	ldd	[%l6+TBL],%f46		! (5_0) tbl_div1 = ((double*)((char*)TBL + si1))[0];
1415	sub	%l0,%o3,%l1		! (5_0) iexp1 = 0x3f - iexp1;
1416
1417	add	%l6,TBL,%l6		! (5_0) addr1 = (char*)TBL + si1;
1418
1419	sllx	%l1,23,%l1		! (5_0) lexp1 = iexp1 << 23;
1420	st	%l1,[%fp+tmp0+4]	! (4_0) fdx0 = *((double*)lexp0);
1421
1422	fitod	%f17,%f0		! (5_0) dtmp1 = (double)(((int*)dfx0)[1]);
1423
1424	fmuld	%f0,%f46,%f46		! (5_1) xx1 = dtmp1 * tbl_div1;
1425	ba	.cont13
1426	fmuld	K3,%f46,%f50		! (5_1) res1 = K3 * xx1;
14271:
1428	sub	%i1,stridex,%i1
1429	stx	%i1,[%fp+tmp_px]
1430
1431	sub	counter,1,counter
1432	st	counter,[%fp+tmp_counter]
1433
1434	ba	.cont13
1435	mov	1,counter
1436
1437	.align	16
1438.update14:
1439	cmp	counter,2
1440	ble	.cont14
1441	sub	%o5,stridex,%o3
1442
1443	sub	%o3,stridex,%o3
1444	stx	%o3,[%fp+tmp_px]
1445
1446	sub	counter,2,counter
1447	st	counter,[%fp+tmp_counter]
1448
1449	ba	.cont14
1450	mov	2,counter
1451
1452	.align	16
1453.update15:
1454	sethi	%hi(0x7ffffc00),%i3
1455	cmp	counter,2
1456	ble	.cont15
1457	sub	%o5,stridex,%o3
1458
1459	add	%i3,0x3ff,%i3
1460
1461	andcc	%g1,%i3,%g0
1462	bz	1f
1463	sub	%o3,stridex,%o3
1464
1465	cmp	%g1,0
1466	bl,a	1f
1467	nop
1468
1469	fitod	%f18,%f0
1470	fdtos	%f0,%f18
1471	fmuls	%f18,FTWO,%f18
1472	st	%f18,[%fp+tmp3]
1473	ld	[%fp+tmp3],%g1
1474	sethi	%hi(0x4b000000),%o3
1475	sub	%g1,%o3,%g1
1476
1477	fands	%f18,DC0,%f0		! (0_0) dfx0 = vis_fand(ddx0,DC0);
1478	sra	%g1,13,%o0		! (0_0) si0 = ax0 >> 13;
1479	and	%o0,2032,%o0		! (0_0) si0 &= 0x7f0;
1480
1481	ldd	[%o0+TBL],%f54		! (0_0) tbl_div0 = ((double*)((char*)TBL + si0))[0];
1482	fpsub32s	%f18,%f0,%f30	! (0_0) dfx0 = vis_fpsub32(ddx0,dfx0);
1483
1484	sra	%g1,24,%i3		! (0_0) iexp0 = ax0 >> 24;
1485
1486	sub	%l0,%i3,%g5		! (0_0) iexp0 = 0x3f - iexp0;
1487
1488	ba	.cont15
1489	fitod	%f30,%f56		! (0_0) dtmp0 = (double)(((int*)dfx0)[0]);
14901:
1491	stx	%o3,[%fp+tmp_px]
1492
1493	sub	counter,2,counter
1494	st	counter,[%fp+tmp_counter]
1495
1496	ba	.cont15
1497	mov	2,counter
1498
1499	.align	16
1500.update16:
1501	cmp	counter,3
1502	ble	.cont16
1503	sub	%l7,stridex2,%o3
1504
1505	sub	%o3,stridex,%o3
1506	stx	%o3,[%fp+tmp_px]
1507
1508	sub	counter,3,counter
1509	st	counter,[%fp+tmp_counter]
1510
1511	ba	.cont16
1512	mov	3,counter
1513
1514	.align	16
1515.update17:
1516	sethi	%hi(0x7ffffc00),%i3
1517	cmp	counter,3
1518	ble	.cont17
1519	sub	%l7,stridex2,%o3
1520
1521	add	%i3,0x3ff,%i3
1522
1523	andcc	%i4,%i3,%g0
1524	bz	1f
1525	sub	%o3,stridex,%o3
1526
1527	cmp	%i4,0
1528	bl,a	1f
1529	nop
1530
1531	fitod	%f19,%f0
1532	fdtos	%f0,%f19
1533	fmuls	%f19,FTWO,%f19
1534	st	%f19,[%fp+tmp3]
1535	ld	[%fp+tmp3],%i4
1536	sethi	%hi(0x4b000000),%o3
1537	sub	%i4,%o3,%i4
1538
1539	fands	%f19,DC0,%f0		! (0_0) dfx0 = vis_fand(ddx0,DC0);
1540
1541	sra	%i4,13,%g5		! (1_0) si1 = ax1 >> 13;
1542
1543	sra	%i4,24,%i0		! (1_0) iexp1 = ax1 >> 24;
1544	and	%g5,2032,%o7		! (1_0) si1 &= 0x7f0;
1545	fpsub32s	%f19,%f0,%f31	! (0_0) dfx0 = vis_fpsub32(ddx0,dfx0);
1546
1547	ldd	[%o7+TBL],%f44		! (1_0) tbl_div1 = ((double*)((char*)TBL + si1))[0];
1548	sub	%l0,%i0,%i0		! (1_0) iexp1 = 0x3f - iexp1;
1549
1550	sllx	%i0,23,%i0		! (1_0) lexp1 = iexp1 << 23;
1551	fitod	%f31,%f50		! (1_0) dtmp1 = (double)(((int*)dfx0)[1]);
1552
1553	st	%i0,[%fp+tmp1+4]	! (0_0) fdx0 = *((double*)lexp0);
1554
1555	add	%o7,TBL,%o7		! (1_0) addr1 = (char*)TBL + si1;
1556	fmuld	%f50,%f44,%f44		! (1_0) xx1 = dtmp1 * tbl_div1;
1557
1558	ba	.cont17
1559	fmuld	K3,%f44,%f50		! (1_0) res1 = K3 * xx1;
15601:
1561	stx	%o3,[%fp+tmp_px]
1562
1563	sub	counter,3,counter
1564	st	counter,[%fp+tmp_counter]
1565
1566	ba	.cont17
1567	mov	3,counter
1568
1569	.align	16
1570.update18:
1571	cmp	counter,4
1572	ble	.cont18
1573	fpadd32	%f20,%f52,%f0		! (2_1) dres0 = vis_fpadd32(dres0,fdx0);
1574
1575	sub	%l7,stridex2,%i3
1576	stx	%i3,[%fp+tmp_px]
1577
1578	sub	counter,4,counter
1579	st	counter,[%fp+tmp_counter]
1580
1581	ba	.cont18
1582	mov	4,counter
1583
1584	.align	16
1585.update19:
1586	sethi	%hi(0x7ffffc00),%i3
1587	cmp	counter,4
1588	ble,a	.cont19
1589	fmuld	%f50,%f46,%f24		! (3_0) xx1 = dtmp1 * tbl_div1;
1590
1591	add	%i3,0x3ff,%i3
1592
1593	andcc	%g1,%i3,%g0
1594	bz	1f
1595	nop
1596
1597	cmp	%g1,0
1598	bl,a	1f
1599	nop
1600
1601	fitod	%f24,%f24
1602	fdtos	%f24,%f24
1603	fmuls	%f24,FTWO,%f24
1604	st	%f24,[%fp+tmp3]
1605	ld	[%fp+tmp3],%g1
1606	sethi	%hi(0x4b000000),%i3
1607	sub	%g1,%i3,%g1
1608
1609	fands	%f24,DC0,%f8		! (2_0) dfx0 = vis_fand(ddx0,DC0);
1610	sra	%g1,13,%i0		! (2_0) si0 = ax0 >> 13;
1611
1612	and	%i0,2032,%i0		! (2_0) si0 &= 0x7f0;
1613
1614	ldd	[%i0+TBL],%f30		! (2_0) tbl_div0 = ((double*)((char*)TBL + si0))[0];
1615	fpsub32s	%f24,%f8,%f12	! (2_0) dfx0 = vis_fpsub32(ddx0,dfx0);
1616
1617	sra	%g1,24,%i3		! (2_0) iexp0 = ax0 >> 24;
1618
1619	sub	%l0,%i3,%g5		! (2_0) iexp0 = 0x3f - iexp0;
1620
1621	sllx	%g5,23,%g5		! (2_0) lexp0 = iexp0 << 55;
1622	add	%i0,TBL,%i0		! (2_0) addr0 = (char*)TBL + si0;
1623	fitod	%f12,%f56		! (2_0) dtmp0 = (double)(((int*)dfx0)[0]);
1624
1625	st	%g5,[%fp+tmp2]		! (2_0) fdx0 = *((double*)lexp0);
1626	fmuld	%f56,%f30,%f30		! (2_0) xx0 = dtmp0 * tbl_div0;
1627
1628	ba	.cont19
1629	fmuld	%f50,%f46,%f24		! (3_0) xx1 = dtmp1 * tbl_div1;
16301:
1631	sub	%l7,stridex2,%i3
1632	stx	%i3,[%fp+tmp_px]
1633
1634	sub	counter,4,counter
1635	st	counter,[%fp+tmp_counter]
1636
1637	mov	4,counter
1638	ba	.cont19
1639	fmuld	%f50,%f46,%f24		! (3_0) xx1 = dtmp1 * tbl_div1;
1640
1641	.align	16
1642.update20:
1643	cmp	counter,5
1644	ble	.cont20
1645	nop
1646
1647	sub	%l7,stridex,%i3
1648	stx	%i3,[%fp+tmp_px]
1649
1650	sub	counter,5,counter
1651	st	counter,[%fp+tmp_counter]
1652
1653	ba	.cont20
1654	mov	5,counter
1655
1656	.align	16
1657.update21:
1658	sethi	%hi(0x7ffffc00),%i3
1659	cmp	counter,5
1660	ble,a	.cont21
1661	nop
1662
1663	sub	%l7,stridex,%i4
1664	add	%i3,0x3ff,%i3
1665
1666	andcc	%o5,%i3,%g0
1667	bz	1f
1668	ld	[%i4],%f8
1669
1670	cmp	%o5,0
1671	bl,a	1f
1672	nop
1673
1674	fitod	%f8,%f8
1675	fdtos	%f8,%f8
1676	fmuls	%f8,FTWO,%f8
1677	st	%f8,[%fp+tmp3]
1678	ld	[%fp+tmp3],%o5
1679	sethi	%hi(0x4b000000),%i3
1680	sub	%o5,%i3,%o5
1681
1682	fands	%f8,DC0,%f24		! (2_0) dfx0 = vis_fand(ddx0,DC0);
1683
1684	sra	%o5,13,%o1		! (3_0) si1 = ax1 >> 13;
1685
1686	sra	%o5,24,%i3		! (3_0) iexp1 = ax1 >> 24;
1687	and	%o1,2032,%o1		! (3_0) si1 &= 0x7f0;
1688	fpsub32s	%f8,%f24,%f24	! (2_0) dfx0 = vis_fpsub32(ddx0,dfx0);
1689
1690	ldd	[%o1+TBL],%f8		! (3_0) tbl_div1 = ((double*)((char*)TBL + si1))[0];
1691	sub	%l0,%i3,%i3		! (3_0) iexp1 = 0x3f - iexp1;
1692
1693	sllx	%i3,23,%i3		! (3_0) lexp1 = iexp1 << 23;
1694	fitod	%f24,%f50		! (3_0) dtmp1 = (double)(((int*)dfx0)[1]);
1695
1696	add	%o1,TBL,%o1		! (3_0) addr1 = (char*)TBL + si1;
1697	st	%i3,[%fp+tmp2+4]	! (2_0) fdx0 = *((double*)lexp0);
1698
1699	fmuld	%f50,%f8,%f24		! (3_0) xx1 = dtmp1 * tbl_div1;
1700
1701	ba	.cont21
1702	fmuld	K3,%f24,%f50		! (3_0) res1 = K3 * xx1;
17031:
1704	sub	%l7,stridex,%i3
1705	stx	%i3,[%fp+tmp_px]
1706
1707	sub	counter,5,counter
1708	st	counter,[%fp+tmp_counter]
1709
1710	ba	.cont21
1711	mov	5,counter
1712
1713	.align	16
1714.exit:
1715	ret
1716	restore
1717
1718	SET_SIZE(__vrsqrtf)
1719
1720