/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 */
/*
 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

	.file	"__vcos.S"

#include "libm.h"

! Read-only table of 8-byte entries, each written as a pair of 32-bit
! words (high word first).  __vcos addresses an entry as the address of
! this label plus one of the byte offsets defined after the table, for
! example ldd [%g1+q1],C_q1.  The offset and macro name of each entry
! is noted at the end of its line.  Entries must stay in this order and
! at this size, because the offsets are hard-coded in the macros.
	RO_DATA
	.align	64
constants:
	.word	0x3ec718e3,0xa6972785	! 0x00 p4
	.word	0x3ef9fd39,0x94293940	! 0x08 q4
	.word	0xbf2a019f,0x75ee4be1	! 0x10 p3
	.word	0xbf56c16b,0xba552569	! 0x18 q3
	.word	0x3f811111,0x1108c703	! 0x20 p2
	.word	0x3fa55555,0x554f5b35	! 0x28 q2
	.word	0xbfc55555,0x555554d0	! 0x30 p1
	.word	0xbfdfffff,0xffffff85	! 0x38 q1
	.word	0x3ff00000,0x00000000	! 0x40 one (bit pattern of the double 1.0)
	.word	0xbfc55555,0x5551fc28	! 0x48 pp1
	.word	0x3f811107,0x62eacc9d	! 0x50 pp2
	.word	0xbfdfffff,0xffff6328	! 0x58 qq1
	.word	0x3fa55551,0x5f7acf0c	! 0x60 qq2
	.word	0x3fe45f30,0x6dc9c883	! 0x68 invpio2
	.word	0x43380000,0x00000000	! 0x70 round
	.word	0x3ff921fb,0x54400000	! 0x78 pio2_1
	.word	0x3dd0b461,0x1a600000	! 0x80 pio2_2
	.word	0x3ba3198a,0x2e000000	! 0x88 pio2_3
	.word	0x397b839a,0x252049c1	! 0x90 pio2_3t
	.word	0x80000000,0x00004000	! 0x98 f30val: ldd puts 0x80000000 in %f30, 0x4000 in %f31
! The next three entries sit at 0xa0 (mask), 0xa8 (thresh) and 0xb0.
! The entry at 0xb0 has no offset macro of its own.
	.word	0xffff8000,0x00000000	! N.B.: low-order words used
	.word	0x3fc90000,0x80000000	! for sign bit hacking; see
	.word	0x3fc40000,0x00000000	! references to "thresh" below

! Byte offsets of the entries above, relative to the constants label.
!   q1..q4       loaded into C_q1..C_q4; the small-argument path (.case7)
!                computes 1 + z*(q1 + z*(q2 + z*(q3 + z*q4))) with z = x*x
!   pp1,pp2,qq1,qq2
!                loaded into C_pp1, C_pp2, C_qq1, C_qq2 at entry
!   invpio2, round, pio2_1, pio2_2, pio2_3, pio2_3t
!                loaded into %f40, %f42, %f46, %f48, %f50, %f52 in .MEDIUM
!   f30val, mask loaded into MSK_BIT31 (with MSK_BIT13) and MSK_BITSHI17
!   thresh       used by address (thresh and thresh+4) in .LOOP2
! NOTE(review): no load of p1..p4 appears in the visible part of __vcos;
! presumably they are kept so the layout matches the sine routine - confirm.
#define	p4		0x0
#define	q4		0x08
#define	p3		0x10
#define	q3		0x18
#define	p2		0x20
#define	q2		0x28
#define	p1		0x30
#define	q1		0x38
#define	one		0x40
#define	pp1		0x48
#define	pp2		0x50
#define	qq1		0x58
#define	qq2		0x60
#define	invpio2		0x68
#define	round		0x70
#define	pio2_1		0x78
#define	pio2_2		0x80
#define	pio2_3		0x88
#define	pio2_3t		0x90
#define	f30val		0x98
#define	mask		0xa0
#define	thresh		0xa8

! local storage indices

! These are offsets added to %fp.  They fall inside the tmps bytes that
! the save instruction at the top of __vcos reserves below the frame.
!   xsave, ysave           the x and y pointer arguments, stored at entry
!   nsave, sxsave, sysave  n, stridex and stridey, stored at entry
!   n0, n1, n2             one word each; .LOOP2 stores %f3, %f13, %f23 here
!   x?_1, y?_0             16-byte slots; .MEDIUM initializes the upper
!                          8 bytes of each (std %f54 / stx %g0)
!   x0_1                   also the initial target of %o3, %o4 and %o5
!                          before the first pass of the primary loop
#define xsave		STACK_BIAS-0x8
#define ysave		STACK_BIAS-0x10
#define nsave		STACK_BIAS-0x14
#define sxsave		STACK_BIAS-0x18
#define sysave		STACK_BIAS-0x1c
#define biguns		STACK_BIAS-0x20
#define n2		STACK_BIAS-0x24
#define n1		STACK_BIAS-0x28
#define n0		STACK_BIAS-0x2c
#define x2_1		STACK_BIAS-0x40
#define x1_1		STACK_BIAS-0x50
#define x0_1		STACK_BIAS-0x60
#define y2_0		STACK_BIAS-0x70
#define y1_0		STACK_BIAS-0x80
#define y0_0		STACK_BIAS-0x90
! sizeof temp storage - must be a multiple of 16 for V9
#define tmps		0x90

!--------------------------------------------------------------------
! define pipes for easier reading

! Each name is only an alias for the floating-point register in its
! suffix.  The P0_, P1_ and P2_ prefixes mark which of the three
! arguments processed per loop pass the register belongs to:
! %f0-%f9, %f10-%f19 and %f20-%f29 respectively.
#define P0_f0		%f0
#define P0_f1		%f1
#define P0_f2		%f2
#define P0_f3		%f3
#define P0_f4		%f4
#define P0_f5		%f5
#define P0_f6		%f6
#define P0_f7		%f7
#define P0_f8		%f8
#define P0_f9		%f9

#define P1_f10		%f10
#define P1_f11		%f11
#define P1_f12		%f12
#define P1_f13		%f13
#define P1_f14		%f14
#define P1_f15		%f15
#define P1_f16		%f16
#define P1_f17		%f17
#define P1_f18		%f18
#define P1_f19		%f19

#define P2_f20		%f20
#define P2_f21		%f21
#define P2_f22		%f22
#define P2_f23		%f23
#define P2_f24		%f24
#define P2_f25		%f25
#define P2_f26		%f26
#define P2_f27		%f27
#define P2_f28		%f28
#define P2_f29		%f29

! define __vlibm_TBL_sincos_hi & lo for easy reading

! %l3 and %l4 receive the addresses of these two tables from the
! PIC_SET macros at the top of __vcos.
#define SC_HI		%l3
#define SC_LO		%l4

! 
define constants for easy reading 145 146#define C_q1 %f46 147#define C_q2 %f48 148#define C_q3 %f50 149#define C_q4 %f52 150 151! one ( 1 ) uno eins echi un 152#define C_ONE %f54 153#define C_ONE_LO %f55 154 155! masks 156#define MSK_SIGN %i5 157#define MSK_BIT31 %f30 158#define MSK_BIT13 %f31 159#define MSK_BITSHI17 %f44 160 161 162! constants for pp and qq 163#define C_pp1 %f56 164#define C_pp2 %f58 165#define C_qq1 %f60 166#define C_qq2 %f62 167 168! sign mask 169#define C_signM %i5 170 171#define LIM_l5 %l5 172#define LIM_l6 %l6 173! when in pri range, using value as transition from poly to table. 174! for Medium range,change use of %l6 and use to keep track of biguns. 175#define LIM_l7 %l7 176 177!-------------------------------------------------------------------- 178 179 180 ENTRY(__vcos) 181 save %sp,-SA(MINFRAME)-tmps,%sp 182 PIC_SETUP(g5) 183 PIC_SET(g5,__vlibm_TBL_sincos_hi,l3) 184 PIC_SET(g5,__vlibm_TBL_sincos_lo,l4) 185 PIC_SET(g5,constants,o0) 186 mov %o0,%g1 187 wr %g0,0x82,%asi ! set %asi for non-faulting loads 188 189! ========== primary range ========== 190 191! register use 192 193! i0 n 194! i1 x 195! i2 stridex 196! i3 y 197! i4 stridey 198! i5 0x80000000 199 200! l0 hx0 201! l1 hx1 202! l2 hx2 203! l3 __vlibm_TBL_sincos_hi 204! l4 __vlibm_TBL_sincos_lo 205! l5 0x3fc40000 206! l6 0x3e400000 207! l7 0x3fe921fb 208 209! the following are 64-bit registers in both V8+ and V9 210 211! g1 scratch 212! g5 213 214! o0 py0 215! o1 py1 216! o2 py2 217! o3 oy0 218! o4 oy1 219! o5 oy2 220! o7 scratch 221 222! f0 x0 223! f2 224! f4 225! f6 226! f8 scratch for table base 227! f9 signbit0 228! f10 x1 229! f12 230! f14 231! f16 232! f18 scratch for table base 233! f19 signbit1 234! f20 x2 235! f22 236! f24 237! f26 238! f28 scratch for table base 239! f29 signbit2 240! f30 0x80000000 241! f31 0x4000 242! f32 243! f34 244! f36 245! f38 246! f40 247! f42 248! f44 0xffff800000000000 249! f46 p1 250! f48 p2 251! f50 p3 252! f52 p4 253! f54 one 254! f56 pp1 255! 
f58 pp2 256! f60 qq1 257! f62 qq2 258 259#ifdef __sparcv9 260 stx %i1,[%fp+xsave] ! save arguments 261 stx %i3,[%fp+ysave] 262#else 263 st %i1,[%fp+xsave] ! save arguments 264 st %i3,[%fp+ysave] 265#endif 266 267 st %i0,[%fp+nsave] 268 st %i2,[%fp+sxsave] 269 st %i4,[%fp+sysave] 270 sethi %hi(0x80000000),MSK_SIGN ! load/set up constants 271 sethi %hi(0x3fc40000),LIM_l5 272 sethi %hi(0x3e400000),LIM_l6 273 sethi %hi(0x3fe921fb),LIM_l7 274 or LIM_l7,%lo(0x3fe921fb),LIM_l7 275 ldd [%g1+f30val],MSK_BIT31 276 ldd [%g1+mask],MSK_BITSHI17 277 ldd [%g1+q1],C_q1 278 ldd [%g1+q2],C_q2 279 ldd [%g1+q3],C_q3 280 ldd [%g1+q4],C_q4 281 ldd [%g1+one],C_ONE 282 ldd [%g1+pp1],C_pp1 283 ldd [%g1+pp2],C_pp2 284 ldd [%g1+qq1],C_qq1 285 ldd [%g1+qq2],C_qq2 286 sll %i2,3,%i2 ! scale strides 287 sll %i4,3,%i4 288 add %fp,x0_1,%o3 ! precondition loop 289 add %fp,x0_1,%o4 290 add %fp,x0_1,%o5 291 ld [%i1],%l0 ! hx = *x 292 ld [%i1],P0_f0 293 ld [%i1+4],P0_f1 294 andn %l0,MSK_SIGN,%l0 ! hx &= ~0x80000000 295 add %i1,%i2,%i1 ! x += stridex 296 297 ba,pt %icc,.loop0 298!delay slot 299 nop 300 301 .align 32 302.loop0: 303 lda [%i1]%asi,%l1 ! preload next argument 304 sub %l0,LIM_l6,%g1 305 sub LIM_l7,%l0,%o7 306 fands P0_f0,MSK_BIT31,P0_f9 ! save signbit 307 308 lda [%i1]%asi,P1_f10 309 orcc %o7,%g1,%g0 310 mov %i3,%o0 ! py0 = y 311 bl,pn %icc,.range0 ! if hx < 0x3e400000 or > 0x3fe921fb 312 313! delay slot 314 lda [%i1+4]%asi,P1_f11 315 addcc %i0,-1,%i0 316 add %i3,%i4,%i3 ! y += stridey 317 ble,pn %icc,.endloop1 318 319! delay slot 320 andn %l1,MSK_SIGN,%l1 321 add %i1,%i2,%i1 ! x += stridex 322 fabsd P0_f0,P0_f0 323 fmuld C_ONE,C_ONE,C_ONE ! one*one; a nop for alignment only 324 325.loop1: 326 lda [%i1]%asi,%l2 ! preload next argument 327 sub %l1,LIM_l6,%g1 328 sub LIM_l7,%l1,%o7 329 fands P1_f10,MSK_BIT31,P1_f19 ! save signbit 330 331 lda [%i1]%asi,P2_f20 332 orcc %o7,%g1,%g0 333 mov %i3,%o1 ! py1 = y 334 bl,pn %icc,.range1 ! if hx < 0x3e400000 or > 0x3fe921fb 335 336! 
delay slot 337 lda [%i1+4]%asi,P2_f21 338 addcc %i0,-1,%i0 339 add %i3,%i4,%i3 ! y += stridey 340 ble,pn %icc,.endloop2 341 342! delay slot 343 andn %l2,MSK_SIGN,%l2 344 add %i1,%i2,%i1 ! x += stridex 345 fabsd P1_f10,P1_f10 346 fmuld C_ONE,C_ONE,C_ONE ! one*one; a nop for alignment only 347 348.loop2: 349 st P0_f6,[%o3] 350 sub %l2,LIM_l6,%g1 351 sub LIM_l7,%l2,%o7 352 fands P2_f20,MSK_BIT31,P2_f29 ! save signbit 353 354 st P0_f7,[%o3+4] 355 orcc %g1,%o7,%g0 356 mov %i3,%o2 ! py2 = y 357 bl,pn %icc,.range2 ! if hx < 0x3e400000 or > 0x3fe921fb 358 359! delay slot 360 add %i3,%i4,%i3 ! y += stridey 361 cmp %l0,LIM_l5 362 fabsd P2_f20,P2_f20 363 bl,pn %icc,.case4 364 365! delay slot 366 st P1_f16,[%o4] 367 cmp %l1,LIM_l5 368 fpadd32s P0_f0,MSK_BIT13,P0_f8 369 bl,pn %icc,.case2 370 371! delay slot 372 st P1_f17,[%o4+4] 373 cmp %l2,LIM_l5 374 fpadd32s P1_f10,MSK_BIT13,P1_f18 375 bl,pn %icc,.case1 376 377! delay slot 378 st P2_f26,[%o5] 379 mov %o0,%o3 380 sethi %hi(0x3fc3c000),%o7 381 fpadd32s P2_f20,MSK_BIT13,P2_f28 382 383 st P2_f27,[%o5+4] 384 fand P0_f8,MSK_BITSHI17,P0_f2 385 mov %o1,%o4 386 387 fand P1_f18,MSK_BITSHI17,P1_f12 388 mov %o2,%o5 389 sub %l0,%o7,%l0 390 391 fand P2_f28,MSK_BITSHI17,P2_f22 392 sub %l1,%o7,%l1 393 sub %l2,%o7,%l2 394 395 fsubd P0_f0,P0_f2,P0_f0 396 srl %l0,10,%l0 397 add SC_HI,8,%g1;add SC_LO,8,%o7 398 399 fsubd P1_f10,P1_f12,P1_f10 400 srl %l1,10,%l1 401 402 fsubd P2_f20,P2_f22,P2_f20 403 srl %l2,10,%l2 404 405 fmuld P0_f0,P0_f0,P0_f2 406 andn %l0,0x1f,%l0 407 408 fmuld P1_f10,P1_f10,P1_f12 409 andn %l1,0x1f,%l1 410 411 fmuld P2_f20,P2_f20,P2_f22 412 andn %l2,0x1f,%l2 413 414 fmuld P0_f2,C_pp2,P0_f6 415 ldd [%g1+%l0],%f32 416 417 fmuld P1_f12,C_pp2,P1_f16 418 ldd [%g1+%l1],%f36 419 420 fmuld P2_f22,C_pp2,P2_f26 421 ldd [%g1+%l2],%f40 422 423 faddd P0_f6,C_pp1,P0_f6 424 fmuld P0_f2,C_qq2,P0_f4 425 ldd [SC_HI+%l0],%f34 426 427 faddd P1_f16,C_pp1,P1_f16 428 fmuld P1_f12,C_qq2,P1_f14 429 ldd [SC_HI+%l1],%f38 430 431 faddd 
P2_f26,C_pp1,P2_f26 432 fmuld P2_f22,C_qq2,P2_f24 433 ldd [SC_HI+%l2],%f42 434 435 fmuld P0_f2,P0_f6,P0_f6 436 faddd P0_f4,C_qq1,P0_f4 437 438 fmuld P1_f12,P1_f16,P1_f16 439 faddd P1_f14,C_qq1,P1_f14 440 441 fmuld P2_f22,P2_f26,P2_f26 442 faddd P2_f24,C_qq1,P2_f24 443 444 faddd P0_f6,C_ONE,P0_f6 445 fmuld P0_f2,P0_f4,P0_f4 446 447 faddd P1_f16,C_ONE,P1_f16 448 fmuld P1_f12,P1_f14,P1_f14 449 450 faddd P2_f26,C_ONE,P2_f26 451 fmuld P2_f22,P2_f24,P2_f24 452 453 fmuld P0_f0,P0_f6,P0_f6 454 ldd [%o7+%l0],P0_f2 455 456 fmuld P1_f10,P1_f16,P1_f16 457 ldd [%o7+%l1],P1_f12 458 459 fmuld P2_f20,P2_f26,P2_f26 460 ldd [%o7+%l2],P2_f22 461 462 fmuld P0_f4,%f32,P0_f4 463 lda [%i1]%asi,%l0 ! preload next argument 464 465 fmuld P1_f14,%f36,P1_f14 466 lda [%i1]%asi,P0_f0 467 468 fmuld P2_f24,%f40,P2_f24 469 lda [%i1+4]%asi,P0_f1 470 471 fmuld P0_f6,%f34,P0_f6 472 add %i1,%i2,%i1 ! x += stridex 473 474 fmuld P1_f16,%f38,P1_f16 475 476 fmuld P2_f26,%f42,P2_f26 477 478 fsubd P0_f6,P0_f4,P0_f6 479 480 fsubd P1_f16,P1_f14,P1_f16 481 482 fsubd P2_f26,P2_f24,P2_f26 483 484 fsubd P0_f2,P0_f6,P0_f6 485 486 fsubd P1_f12,P1_f16,P1_f16 487 488 fsubd P2_f22,P2_f26,P2_f26 489 490 faddd P0_f6,%f32,P0_f6 491 492 faddd P1_f16,%f36,P1_f16 493 494 faddd P2_f26,%f40,P2_f26 495 andn %l0,MSK_SIGN,%l0 ! hx &= ~0x80000000 496 497 nop !!(vsin) fors P0_f6,P0_f9,P0_f6 498 addcc %i0,-1,%i0 499 500 nop !!(vsin) fors P1_f16,P1_f19,P1_f16 501 bg,pt %icc,.loop0 502 503! delay slot 504 nop !!(vsin) fors P2_f26,P2_f29,P2_f26 505 506 ba,pt %icc,.endloop0 507! 
delay slot 508 nop 509 510 .align 32 511.case1: 512 st P2_f27,[%o5+4] 513 sethi %hi(0x3fc3c000),%o7 514 fand P0_f8,MSK_BITSHI17,P0_f2 515 516 sub %l0,%o7,%l0 517 sub %l1,%o7,%l1 518 add SC_HI,8,%g1;add SC_LO,8,%o7 519 fand P1_f18,MSK_BITSHI17,P1_f12 520 fmuld P2_f20,P2_f20,P2_f22 521 522 fsubd P0_f0,P0_f2,P0_f0 523 srl %l0,10,%l0 524 mov %o0,%o3 525 526 fsubd P1_f10,P1_f12,P1_f10 527 srl %l1,10,%l1 528 mov %o1,%o4 529 530 fmuld P2_f22,C_q4,P2_f24 531 mov %o2,%o5 532 533 fmuld P0_f0,P0_f0,P0_f2 534 andn %l0,0x1f,%l0 535 536 fmuld P1_f10,P1_f10,P1_f12 537 andn %l1,0x1f,%l1 538 539 faddd P2_f24,C_q3,P2_f24 540 541 fmuld P0_f2,C_pp2,P0_f6 542 ldd [%g1+%l0],%f32 543 544 fmuld P1_f12,C_pp2,P1_f16 545 ldd [%g1+%l1],%f36 546 547 fmuld P2_f22,P2_f24,P2_f24 548 549 faddd P0_f6,C_pp1,P0_f6 550 fmuld P0_f2,C_qq2,P0_f4 551 ldd [SC_HI+%l0],%f34 552 553 faddd P1_f16,C_pp1,P1_f16 554 fmuld P1_f12,C_qq2,P1_f14 555 ldd [SC_HI+%l1],%f38 556 557 faddd P2_f24,C_q2,P2_f24 558 559 fmuld P0_f2,P0_f6,P0_f6 560 faddd P0_f4,C_qq1,P0_f4 561 562 fmuld P1_f12,P1_f16,P1_f16 563 faddd P1_f14,C_qq1,P1_f14 564 565 fmuld P2_f22,P2_f24,P2_f24 566 567 faddd P0_f6,C_ONE,P0_f6 568 fmuld P0_f2,P0_f4,P0_f4 569 570 faddd P1_f16,C_ONE,P1_f16 571 fmuld P1_f12,P1_f14,P1_f14 572 573 faddd P2_f24,C_q1,P2_f24 574 575 fmuld P0_f0,P0_f6,P0_f6 576 ldd [%o7+%l0],P0_f2 577 578 fmuld P1_f10,P1_f16,P1_f16 579 ldd [%o7+%l1],P1_f12 580 581 fmuld P0_f4,%f32,P0_f4 582 lda [%i1]%asi,%l0 ! preload next argument 583 584 fmuld P1_f14,%f36,P1_f14 585 lda [%i1]%asi,P0_f0 586 587 fmuld P0_f6,%f34,P0_f6 588 lda [%i1+4]%asi,P0_f1 589 590 fmuld P1_f16,%f38,P1_f16 591 add %i1,%i2,%i1 ! 
x += stridex 592 593 fmuld P2_f22,P2_f24,P2_f24 594 595 fsubd P0_f6,P0_f4,P0_f6 596 597 fsubd P1_f16,P1_f14,P1_f16 598 599 !!(vsin)fmuld P2_f20,P2_f24,P2_f24 600 601 fsubd P0_f2,P0_f6,P0_f6 602 603 fsubd P1_f12,P1_f16,P1_f16 604 605 faddd C_ONE,P2_f24,P2_f26 !!(vsin)faddd P2_f20,P2_f24,P2_f26 606 607 faddd P0_f6,%f32,P0_f6 608 609 faddd P1_f16,%f36,P1_f16 610 andn %l0,MSK_SIGN,%l0 ! hx &= ~0x80000000 611 612 nop !!(vsin) fors P2_f26,P2_f29,P2_f26 613 addcc %i0,-1,%i0 614 615 nop !!(vsin) fors P0_f6,P0_f9,P0_f6 616 bg,pt %icc,.loop0 617 618! delay slot 619 nop !!(vsin) fors P1_f16,P1_f19,P1_f16 620 621 ba,pt %icc,.endloop0 622! delay slot 623 nop 624 625 .align 32 626.case2: 627 st P2_f26,[%o5] 628 cmp %l2,LIM_l5 629 fpadd32s P2_f20,MSK_BIT13,P2_f28 630 bl,pn %icc,.case3 631 632! delay slot 633 st P2_f27,[%o5+4] 634 sethi %hi(0x3fc3c000),%o7 635 fand P0_f8,MSK_BITSHI17,P0_f2 636 637 sub %l0,%o7,%l0 638 sub %l2,%o7,%l2 639 add SC_HI,8,%g1;add SC_LO,8,%o7 640 fand P2_f28,MSK_BITSHI17,P2_f22 641 fmuld P1_f10,P1_f10,P1_f12 642 643 fsubd P0_f0,P0_f2,P0_f0 644 srl %l0,10,%l0 645 mov %o0,%o3 646 647 fsubd P2_f20,P2_f22,P2_f20 648 srl %l2,10,%l2 649 mov %o2,%o5 650 651 fmuld P1_f12,C_q4,P1_f14 652 mov %o1,%o4 653 654 fmuld P0_f0,P0_f0,P0_f2 655 andn %l0,0x1f,%l0 656 657 fmuld P2_f20,P2_f20,P2_f22 658 andn %l2,0x1f,%l2 659 660 faddd P1_f14,C_q3,P1_f14 661 662 fmuld P0_f2,C_pp2,P0_f6 663 ldd [%g1+%l0],%f32 664 665 fmuld P2_f22,C_pp2,P2_f26 666 ldd [%g1+%l2],%f40 667 668 fmuld P1_f12,P1_f14,P1_f14 669 670 faddd P0_f6,C_pp1,P0_f6 671 fmuld P0_f2,C_qq2,P0_f4 672 ldd [SC_HI+%l0],%f34 673 674 faddd P2_f26,C_pp1,P2_f26 675 fmuld P2_f22,C_qq2,P2_f24 676 ldd [SC_HI+%l2],%f42 677 678 faddd P1_f14,C_q2,P1_f14 679 680 fmuld P0_f2,P0_f6,P0_f6 681 faddd P0_f4,C_qq1,P0_f4 682 683 fmuld P2_f22,P2_f26,P2_f26 684 faddd P2_f24,C_qq1,P2_f24 685 686 fmuld P1_f12,P1_f14,P1_f14 687 688 faddd P0_f6,C_ONE,P0_f6 689 fmuld P0_f2,P0_f4,P0_f4 690 691 faddd P2_f26,C_ONE,P2_f26 692 fmuld 
P2_f22,P2_f24,P2_f24 693 694 faddd P1_f14,C_q1,P1_f14 695 696 fmuld P0_f0,P0_f6,P0_f6 697 ldd [%o7+%l0],P0_f2 698 699 fmuld P2_f20,P2_f26,P2_f26 700 ldd [%o7+%l2],P2_f22 701 702 fmuld P0_f4,%f32,P0_f4 703 lda [%i1]%asi,%l0 ! preload next argument 704 705 fmuld P2_f24,%f40,P2_f24 706 lda [%i1]%asi,P0_f0 707 708 fmuld P0_f6,%f34,P0_f6 709 lda [%i1+4]%asi,P0_f1 710 711 fmuld P2_f26,%f42,P2_f26 712 add %i1,%i2,%i1 ! x += stridex 713 714 fmuld P1_f12,P1_f14,P1_f14 715 716 fsubd P0_f6,P0_f4,P0_f6 717 718 fsubd P2_f26,P2_f24,P2_f26 719 720 !!(vsin)fmuld P1_f10,P1_f14,P1_f14 721 722 fsubd P0_f2,P0_f6,P0_f6 723 724 fsubd P2_f22,P2_f26,P2_f26 725 726 faddd C_ONE,P1_f14,P1_f16 !!(vsin)faddd P1_f10,P1_f14,P1_f16 727 728 faddd P0_f6,%f32,P0_f6 729 730 faddd P2_f26,%f40,P2_f26 731 andn %l0,MSK_SIGN,%l0 ! hx &= ~0x80000000 732 733 nop !!(vsin) fors P1_f16,P1_f19,P1_f16 734 addcc %i0,-1,%i0 735 736 nop !!(vsin) fors P0_f6,P0_f9,P0_f6 737 bg,pt %icc,.loop0 738 739! delay slot 740 nop !!(vsin) fors P2_f26,P2_f29,P2_f26 741 742 ba,pt %icc,.endloop0 743! 
delay slot 744 nop 745 746 .align 32 747.case3: 748 sethi %hi(0x3fc3c000),%o7 749 fand P0_f8,MSK_BITSHI17,P0_f2 750 fmuld P1_f10,P1_f10,P1_f12 751 752 sub %l0,%o7,%l0 753 add SC_HI,8,%g1;add SC_LO,8,%o7 754 fmuld P2_f20,P2_f20,P2_f22 755 756 fsubd P0_f0,P0_f2,P0_f0 757 srl %l0,10,%l0 758 mov %o0,%o3 759 760 fmuld P1_f12,C_q4,P1_f14 761 mov %o1,%o4 762 763 fmuld P2_f22,C_q4,P2_f24 764 mov %o2,%o5 765 766 fmuld P0_f0,P0_f0,P0_f2 767 andn %l0,0x1f,%l0 768 769 faddd P1_f14,C_q3,P1_f14 770 771 faddd P2_f24,C_q3,P2_f24 772 773 fmuld P0_f2,C_pp2,P0_f6 774 ldd [%g1+%l0],%f32 775 776 fmuld P1_f12,P1_f14,P1_f14 777 778 fmuld P2_f22,P2_f24,P2_f24 779 780 faddd P0_f6,C_pp1,P0_f6 781 fmuld P0_f2,C_qq2,P0_f4 782 ldd [SC_HI+%l0],%f34 783 784 faddd P1_f14,C_q2,P1_f14 785 786 faddd P2_f24,C_q2,P2_f24 787 788 fmuld P0_f2,P0_f6,P0_f6 789 faddd P0_f4,C_qq1,P0_f4 790 791 fmuld P1_f12,P1_f14,P1_f14 792 793 fmuld P2_f22,P2_f24,P2_f24 794 795 faddd P0_f6,C_ONE,P0_f6 796 fmuld P0_f2,P0_f4,P0_f4 797 798 faddd P1_f14,C_q1,P1_f14 799 800 faddd P2_f24,C_q1,P2_f24 801 802 fmuld P0_f0,P0_f6,P0_f6 803 ldd [%o7+%l0],P0_f2 804 805 fmuld P0_f4,%f32,P0_f4 806 lda [%i1]%asi,%l0 ! preload next argument 807 808 fmuld P1_f12,P1_f14,P1_f14 809 lda [%i1]%asi,P0_f0 810 811 fmuld P0_f6,%f34,P0_f6 812 lda [%i1+4]%asi,P0_f1 813 814 fmuld P2_f22,P2_f24,P2_f24 815 add %i1,%i2,%i1 ! x += stridex 816 817 !!(vsin)fmuld P1_f10,P1_f14,P1_f14 818 819 fsubd P0_f6,P0_f4,P0_f6 820 821 !!(vsin)fmuld P2_f20,P2_f24,P2_f24 822 823 faddd C_ONE,P1_f14,P1_f16 !!(vsin)faddd P1_f10,P1_f14,P1_f16 824 825 fsubd P0_f2,P0_f6,P0_f6 826 827 faddd C_ONE,P2_f24,P2_f26 !!(vsin)faddd P2_f20,P2_f24,P2_f26 828 829 nop !!(vsin) fors P1_f16,P1_f19,P1_f16 830 andn %l0,MSK_SIGN,%l0 ! hx &= ~0x80000000 831 832 faddd P0_f6,%f32,P0_f6 833 addcc %i0,-1,%i0 834 835 nop !!(vsin) fors P2_f26,P2_f29,P2_f26 836 bg,pt %icc,.loop0 837 838! delay slot 839 nop !!(vsin) fors P0_f6,P0_f9,P0_f6 840 841 ba,pt %icc,.endloop0 842! 
delay slot 843 nop 844 845 .align 32 846.case4: 847 st P1_f17,[%o4+4] 848 cmp %l1,LIM_l5 849 fpadd32s P1_f10,MSK_BIT13,P1_f18 850 bl,pn %icc,.case6 851 852! delay slot 853 st P2_f26,[%o5] 854 cmp %l2,LIM_l5 855 fpadd32s P2_f20,MSK_BIT13,P2_f28 856 bl,pn %icc,.case5 857 858! delay slot 859 st P2_f27,[%o5+4] 860 sethi %hi(0x3fc3c000),%o7 861 fand P1_f18,MSK_BITSHI17,P1_f12 862 863 sub %l1,%o7,%l1 864 sub %l2,%o7,%l2 865 add SC_HI,8,%g1;add SC_LO,8,%o7 866 fand P2_f28,MSK_BITSHI17,P2_f22 867 fmuld P0_f0,P0_f0,P0_f2 868 869 fsubd P1_f10,P1_f12,P1_f10 870 srl %l1,10,%l1 871 mov %o1,%o4 872 873 fsubd P2_f20,P2_f22,P2_f20 874 srl %l2,10,%l2 875 mov %o2,%o5 876 877 fmovd P0_f0,P0_f6 !ID for processing 878 fmuld P0_f2,C_q4,P0_f4 879 mov %o0,%o3 880 881 fmuld P1_f10,P1_f10,P1_f12 882 andn %l1,0x1f,%l1 883 884 fmuld P2_f20,P2_f20,P2_f22 885 andn %l2,0x1f,%l2 886 887 faddd P0_f4,C_q3,P0_f4 888 889 fmuld P1_f12,C_pp2,P1_f16 890 ldd [%g1+%l1],%f36 891 892 fmuld P2_f22,C_pp2,P2_f26 893 ldd [%g1+%l2],%f40 894 895 fmuld P0_f2,P0_f4,P0_f4 896 897 faddd P1_f16,C_pp1,P1_f16 898 fmuld P1_f12,C_qq2,P1_f14 899 ldd [SC_HI+%l1],%f38 900 901 faddd P2_f26,C_pp1,P2_f26 902 fmuld P2_f22,C_qq2,P2_f24 903 ldd [SC_HI+%l2],%f42 904 905 faddd P0_f4,C_q2,P0_f4 906 907 fmuld P1_f12,P1_f16,P1_f16 908 faddd P1_f14,C_qq1,P1_f14 909 910 fmuld P2_f22,P2_f26,P2_f26 911 faddd P2_f24,C_qq1,P2_f24 912 913 fmuld P0_f2,P0_f4,P0_f4 914 915 faddd P1_f16,C_ONE,P1_f16 916 fmuld P1_f12,P1_f14,P1_f14 917 918 faddd P2_f26,C_ONE,P2_f26 919 fmuld P2_f22,P2_f24,P2_f24 920 921 faddd P0_f4,C_q1,P0_f4 922 923 fmuld P1_f10,P1_f16,P1_f16 924 ldd [%o7+%l1],P1_f12 925 926 fmuld P2_f20,P2_f26,P2_f26 927 ldd [%o7+%l2],P2_f22 928 929 fmuld P1_f14,%f36,P1_f14 930 lda [%i1]%asi,%l0 ! preload next argument 931 932 fmuld P2_f24,%f40,P2_f24 933 lda [%i1]%asi,P0_f0 934 935 fmuld P1_f16,%f38,P1_f16 936 lda [%i1+4]%asi,P0_f1 937 938 fmuld P2_f26,%f42,P2_f26 939 add %i1,%i2,%i1 ! 
x += stridex 940 941 fmuld P0_f2,P0_f4,P0_f4 942 943 fsubd P1_f16,P1_f14,P1_f16 944 945 fsubd P2_f26,P2_f24,P2_f26 946 947 !!(vsin)fmuld P0_f6,P0_f4,P0_f4 948 949 fsubd P1_f12,P1_f16,P1_f16 950 951 fsubd P2_f22,P2_f26,P2_f26 952 953 faddd C_ONE,P0_f4,P0_f6 !!(vsin)faddd P0_f6,P0_f4,P0_f6 ! faddd then spaces for processing 954 955 faddd P1_f16,%f36,P1_f16 956 957 faddd P2_f26,%f40,P2_f26 958 andn %l0,MSK_SIGN,%l0 ! hx &= ~0x80000000 959 960 nop !!(vsin) fors P0_f6,P0_f9,P0_f6 961 addcc %i0,-1,%i0 962 963 nop !!(vsin) fors P1_f16,P1_f19,P1_f16 964 bg,pt %icc,.loop0 965 966! delay slot 967 nop !!(vsin) fors P2_f26,P2_f29,P2_f26 968 969 ba,pt %icc,.endloop0 970! delay slot 971 nop 972 973 .align 32 974.case5: 975 sethi %hi(0x3fc3c000),%o7 976 fand P1_f18,MSK_BITSHI17,P1_f12 977 fmuld P0_f0,P0_f0,P0_f2 978 979 sub %l1,%o7,%l1 980 add SC_HI,8,%g1;add SC_LO,8,%o7 981 fmuld P2_f20,P2_f20,P2_f22 982 983 fsubd P1_f10,P1_f12,P1_f10 984 srl %l1,10,%l1 985 mov %o1,%o4 986 987 fmovd P0_f0,P0_f6 !ID for processing 988 fmuld P0_f2,C_q4,P0_f4 989 mov %o0,%o3 990 991 fmuld P2_f22,C_q4,P2_f24 992 mov %o2,%o5 993 994 fmuld P1_f10,P1_f10,P1_f12 995 andn %l1,0x1f,%l1 996 997 faddd P0_f4,C_q3,P0_f4 998 999 faddd P2_f24,C_q3,P2_f24 1000 1001 fmuld P1_f12,C_pp2,P1_f16 1002 ldd [%g1+%l1],%f36 1003 1004 fmuld P0_f2,P0_f4,P0_f4 1005 1006 fmuld P2_f22,P2_f24,P2_f24 1007 1008 faddd P1_f16,C_pp1,P1_f16 1009 fmuld P1_f12,C_qq2,P1_f14 1010 ldd [SC_HI+%l1],%f38 1011 1012 faddd P0_f4,C_q2,P0_f4 1013 1014 faddd P2_f24,C_q2,P2_f24 1015 1016 fmuld P1_f12,P1_f16,P1_f16 1017 faddd P1_f14,C_qq1,P1_f14 1018 1019 fmuld P0_f2,P0_f4,P0_f4 1020 1021 fmuld P2_f22,P2_f24,P2_f24 1022 1023 faddd P1_f16,C_ONE,P1_f16 1024 fmuld P1_f12,P1_f14,P1_f14 1025 1026 faddd P0_f4,C_q1,P0_f4 1027 1028 faddd P2_f24,C_q1,P2_f24 1029 1030 fmuld P1_f10,P1_f16,P1_f16 1031 ldd [%o7+%l1],P1_f12 1032 1033 fmuld P1_f14,%f36,P1_f14 1034 lda [%i1]%asi,%l0 ! 
preload next argument 1035 1036 fmuld P0_f2,P0_f4,P0_f4 1037 lda [%i1]%asi,P0_f0 1038 1039 fmuld P1_f16,%f38,P1_f16 1040 lda [%i1+4]%asi,P0_f1 1041 1042 fmuld P2_f22,P2_f24,P2_f24 1043 add %i1,%i2,%i1 ! x += stridex 1044 1045 !!(vsin)fmuld P0_f6,P0_f4,P0_f4 1046 1047 fsubd P1_f16,P1_f14,P1_f16 1048 1049 !!(vsin)fmuld P2_f20,P2_f24,P2_f24 1050 1051 faddd C_ONE,P0_f4,P0_f6 !!(vsin)faddd P0_f6,P0_f4,P0_f6 ! faddd then spaces for processing 1052 1053 fsubd P1_f12,P1_f16,P1_f16 1054 1055 faddd C_ONE,P2_f24,P2_f26 !!(vsin)faddd P2_f20,P2_f24,P2_f26 1056 1057 nop !!(vsin) fors P0_f6,P0_f9,P0_f6 1058 andn %l0,MSK_SIGN,%l0 ! hx &= ~0x80000000 1059 1060 faddd P1_f16,%f36,P1_f16 1061 addcc %i0,-1,%i0 1062 1063 nop !!(vsin) fors P2_f26,P2_f29,P2_f26 1064 bg,pt %icc,.loop0 1065 1066! delay slot 1067 nop !!(vsin) fors P1_f16,P1_f19,P1_f16 1068 1069 ba,pt %icc,.endloop0 1070! delay slot 1071 nop 1072 1073 .align 32 1074.case6: 1075 st P2_f27,[%o5+4] 1076 cmp %l2,LIM_l5 1077 fpadd32s P2_f20,MSK_BIT13,P2_f28 1078 bl,pn %icc,.case7 1079 1080! 
delay slot 1081 sethi %hi(0x3fc3c000),%o7 1082 fand P2_f28,MSK_BITSHI17,P2_f22 1083 fmuld P0_f0,P0_f0,P0_f2 1084 1085 sub %l2,%o7,%l2 1086 add SC_HI,8,%g1;add SC_LO,8,%o7 1087 fmuld P1_f10,P1_f10,P1_f12 1088 1089 fsubd P2_f20,P2_f22,P2_f20 1090 srl %l2,10,%l2 1091 mov %o2,%o5 1092 1093 fmovd P0_f0,P0_f6 !ID for processing 1094 fmuld P0_f2,C_q4,P0_f4 1095 mov %o0,%o3 1096 1097 fmuld P1_f12,C_q4,P1_f14 1098 mov %o1,%o4 1099 1100 fmuld P2_f20,P2_f20,P2_f22 1101 andn %l2,0x1f,%l2 1102 1103 faddd P0_f4,C_q3,P0_f4 1104 1105 faddd P1_f14,C_q3,P1_f14 1106 1107 fmuld P2_f22,C_pp2,P2_f26 1108 ldd [%g1+%l2],%f40 1109 1110 fmuld P0_f2,P0_f4,P0_f4 1111 1112 fmuld P1_f12,P1_f14,P1_f14 1113 1114 faddd P2_f26,C_pp1,P2_f26 1115 fmuld P2_f22,C_qq2,P2_f24 1116 ldd [SC_HI+%l2],%f42 1117 1118 faddd P0_f4,C_q2,P0_f4 1119 1120 faddd P1_f14,C_q2,P1_f14 1121 1122 fmuld P2_f22,P2_f26,P2_f26 1123 faddd P2_f24,C_qq1,P2_f24 1124 1125 fmuld P0_f2,P0_f4,P0_f4 1126 1127 fmuld P1_f12,P1_f14,P1_f14 1128 1129 faddd P2_f26,C_ONE,P2_f26 1130 fmuld P2_f22,P2_f24,P2_f24 1131 1132 faddd P0_f4,C_q1,P0_f4 1133 1134 faddd P1_f14,C_q1,P1_f14 1135 1136 fmuld P2_f20,P2_f26,P2_f26 1137 ldd [%o7+%l2],P2_f22 1138 1139 fmuld P2_f24,%f40,P2_f24 1140 lda [%i1]%asi,%l0 ! preload next argument 1141 1142 fmuld P0_f2,P0_f4,P0_f4 1143 lda [%i1]%asi,P0_f0 1144 1145 fmuld P2_f26,%f42,P2_f26 1146 lda [%i1+4]%asi,P0_f1 1147 1148 fmuld P1_f12,P1_f14,P1_f14 1149 add %i1,%i2,%i1 ! x += stridex 1150 1151 !!(vsin)fmuld P0_f6,P0_f4,P0_f4 1152 1153 fsubd P2_f26,P2_f24,P2_f26 1154 1155 !!(vsin)fmuld P1_f10,P1_f14,P1_f14 1156 1157 faddd C_ONE,P0_f4,P0_f6 !!(vsin)faddd P0_f6,P0_f4,P0_f6 ! faddd then spaces for processing 1158 1159 fsubd P2_f22,P2_f26,P2_f26 1160 1161 faddd C_ONE,P1_f14,P1_f16 !!(vsin)faddd P1_f10,P1_f14,P1_f16 1162 1163 nop !!(vsin) fors P0_f6,P0_f9,P0_f6 1164 andn %l0,MSK_SIGN,%l0 ! 
hx &= ~0x80000000 1165 1166 faddd P2_f26,%f40,P2_f26 1167 addcc %i0,-1,%i0 1168 1169 nop !!(vsin) fors P1_f16,P1_f19,P1_f16 1170 bg,pt %icc,.loop0 1171 1172! delay slot 1173 nop !!(vsin) fors P2_f26,P2_f29,P2_f26 1174 1175 ba,pt %icc,.endloop0 1176! delay slot 1177 nop 1178 1179 .align 32 1180.case7: 1181 fmuld P0_f0,P0_f0,P0_f2 1182 fmovd P0_f0,P0_f6 !ID for processing 1183 mov %o0,%o3 1184 1185 fmuld P1_f10,P1_f10,P1_f12 1186 mov %o1,%o4 1187 1188 fmuld P2_f20,P2_f20,P2_f22 1189 mov %o2,%o5 1190 1191 fmuld P0_f2,C_q4,P0_f4 1192 lda [%i1]%asi,%l0 ! preload next argument 1193 1194 fmuld P1_f12,C_q4,P1_f14 1195 lda [%i1]%asi,P0_f0 1196 1197 fmuld P2_f22,C_q4,P2_f24 1198 lda [%i1+4]%asi,P0_f1 1199 1200 faddd P0_f4,C_q3,P0_f4 1201 add %i1,%i2,%i1 ! x += stridex 1202 1203 faddd P1_f14,C_q3,P1_f14 1204 1205 faddd P2_f24,C_q3,P2_f24 1206 1207 fmuld P0_f2,P0_f4,P0_f4 1208 1209 fmuld P1_f12,P1_f14,P1_f14 1210 1211 fmuld P2_f22,P2_f24,P2_f24 1212 1213 faddd P0_f4,C_q2,P0_f4 1214 1215 faddd P1_f14,C_q2,P1_f14 1216 1217 faddd P2_f24,C_q2,P2_f24 1218 1219 fmuld P0_f2,P0_f4,P0_f4 1220 1221 fmuld P1_f12,P1_f14,P1_f14 1222 1223 fmuld P2_f22,P2_f24,P2_f24 1224 1225 faddd P0_f4,C_q1,P0_f4 1226 1227 faddd P1_f14,C_q1,P1_f14 1228 1229 faddd P2_f24,C_q1,P2_f24 1230 1231 fmuld P0_f2,P0_f4,P0_f4 1232 1233 fmuld P1_f12,P1_f14,P1_f14 1234 1235 fmuld P2_f22,P2_f24,P2_f24 1236 1237 !!(vsin)fmuld P0_f6,P0_f4,P0_f4 1238 1239 !!(vsin)fmuld P1_f10,P1_f14,P1_f14 1240 1241 !!(vsin)fmuld P2_f20,P2_f24,P2_f24 1242 1243 faddd C_ONE,P0_f4,P0_f6 !!(vsin)faddd P0_f6,P0_f4,P0_f6 ! faddd then spaces for processing 1244 1245 faddd C_ONE,P1_f14,P1_f16 !!(vsin)faddd P1_f10,P1_f14,P1_f16 1246 1247 faddd C_ONE,P2_f24,P2_f26 !!(vsin)faddd P2_f20,P2_f24,P2_f26 1248 andn %l0,MSK_SIGN,%l0 ! hx &= ~0x80000000 1249 1250 nop !!(vsin) fors P0_f6,P0_f9,P0_f6 1251 addcc %i0,-1,%i0 1252 1253 nop !!(vsin) fors P1_f16,P1_f19,P1_f16 1254 bg,pt %icc,.loop0 1255 1256! 
delay slot 1257 nop !!(vsin) fors P2_f26,P2_f29,P2_f26 1258 1259 ba,pt %icc,.endloop0 1260! delay slot 1261 nop 1262 1263 1264 .align 32 1265.endloop2: 1266 cmp %l1,LIM_l5 1267 bl,pn %icc,1f 1268! delay slot 1269 fabsd P1_f10,P1_f10 1270 sethi %hi(0x3fc3c000),%o7 1271 fpadd32s P1_f10,MSK_BIT13,P1_f18 1272 fand P1_f18,MSK_BITSHI17,P1_f12 1273 sub %l1,%o7,%l1 1274 add SC_HI,8,%g1;add SC_LO,8,%o7 1275 fsubd P1_f10,P1_f12,P1_f10 1276 srl %l1,10,%l1 1277 fmuld P1_f10,P1_f10,P1_f12 1278 andn %l1,0x1f,%l1 1279 fmuld P1_f12,C_pp2,P2_f20 1280 ldd [%g1+%l1],%f36 1281 faddd P2_f20,C_pp1,P2_f20 1282 fmuld P1_f12,C_qq2,P1_f14 1283 ldd [SC_HI+%l1],%f38 1284 fmuld P1_f12,P2_f20,P2_f20 1285 faddd P1_f14,C_qq1,P1_f14 1286 faddd P2_f20,C_ONE,P2_f20 1287 fmuld P1_f12,P1_f14,P1_f14 1288 fmuld P1_f10,P2_f20,P2_f20 1289 ldd [%o7+%l1],P1_f12 1290 fmuld P1_f14,%f36,P1_f14 1291 fmuld P2_f20,%f38,P2_f20 1292 fsubd P2_f20,P1_f14,P2_f20 1293 fsubd P1_f12,P2_f20,P2_f20 1294 ba,pt %icc,2f 1295! delay slot 1296 faddd P2_f20,%f36,P2_f20 12971: 1298 fmuld P1_f10,P1_f10,P1_f12 1299 fmuld P1_f12,C_q4,P1_f14 1300 faddd P1_f14,C_q3,P1_f14 1301 fmuld P1_f12,P1_f14,P1_f14 1302 faddd P1_f14,C_q2,P1_f14 1303 fmuld P1_f12,P1_f14,P1_f14 1304 faddd P1_f14,C_q1,P1_f14 1305 fmuld P1_f12,P1_f14,P1_f14 1306 !!(vsin)fmuld P1_f10,P1_f14,P1_f14 1307 faddd C_ONE,P1_f14,P2_f20 !!(vsin)faddd P1_f10,P1_f14,P2_f20 13082: 1309 nop !!(vsin) fors P2_f20,P1_f19,P2_f20 1310 st P2_f20,[%o1] 1311 st P2_f21,[%o1+4] 1312 1313.endloop1: 1314 cmp %l0,LIM_l5 1315 bl,pn %icc,1f 1316! 
delay slot 1317 fabsd P0_f0,P0_f0 1318 sethi %hi(0x3fc3c000),%o7 1319 fpadd32s P0_f0,MSK_BIT13,P0_f8 1320 fand P0_f8,MSK_BITSHI17,P0_f2 1321 sub %l0,%o7,%l0 1322 add SC_HI,8,%g1;add SC_LO,8,%o7 1323 fsubd P0_f0,P0_f2,P0_f0 1324 srl %l0,10,%l0 1325 fmuld P0_f0,P0_f0,P0_f2 1326 andn %l0,0x1f,%l0 1327 fmuld P0_f2,C_pp2,P2_f20 1328 ldd [%g1+%l0],%f32 1329 faddd P2_f20,C_pp1,P2_f20 1330 fmuld P0_f2,C_qq2,P0_f4 1331 ldd [SC_HI+%l0],%f34 1332 fmuld P0_f2,P2_f20,P2_f20 1333 faddd P0_f4,C_qq1,P0_f4 1334 faddd P2_f20,C_ONE,P2_f20 1335 fmuld P0_f2,P0_f4,P0_f4 1336 fmuld P0_f0,P2_f20,P2_f20 1337 ldd [%o7+%l0],P0_f2 1338 fmuld P0_f4,%f32,P0_f4 1339 fmuld P2_f20,%f34,P2_f20 1340 fsubd P2_f20,P0_f4,P2_f20 1341 fsubd P0_f2,P2_f20,P2_f20 1342 ba,pt %icc,2f 1343! delay slot 1344 faddd P2_f20,%f32,P2_f20 13451: 1346 fmuld P0_f0,P0_f0,P0_f2 1347 fmuld P0_f2,C_q4,P0_f4 1348 faddd P0_f4,C_q3,P0_f4 1349 fmuld P0_f2,P0_f4,P0_f4 1350 faddd P0_f4,C_q2,P0_f4 1351 fmuld P0_f2,P0_f4,P0_f4 1352 faddd P0_f4,C_q1,P0_f4 1353 fmuld P0_f2,P0_f4,P0_f4 1354 !!(vsin)fmuld P0_f0,P0_f4,P0_f4 1355 faddd C_ONE,P0_f4,P2_f20 !!(vsin)faddd P0_f0,P0_f4,P2_f20 13562: 1357 nop !!(vsin) fors P2_f20,P0_f9,P2_f20 1358 st P2_f20,[%o0] 1359 st P2_f21,[%o0+4] 1360 1361.endloop0: 1362 st P0_f6,[%o3] 1363 st P0_f7,[%o3+4] 1364 st P1_f16,[%o4] 1365 st P1_f17,[%o4+4] 1366 st P2_f26,[%o5] 1367 st P2_f27,[%o5+4] 1368 1369! return. finished off with only primary range arguments 1370 1371 ret 1372 restore 1373 1374 1375 .align 32 1376.range0: 1377 cmp %l0,LIM_l6 1378 bg,a,pt %icc,.MEDIUM ! branch to Medium range on big arg. 1379! delay slot, annulled if branch not taken 1380 mov 0x1,LIM_l6 ! set biguns flag or 1381 fdtoi P0_f0,P0_f2; fmovd C_ONE,P0_f0 ; st P0_f0,[%o0] ! *y = *x with inexact if x nonzero 1382 st P0_f1,[%o0+4] 1383 !nop ! (vsin) fdtoi P0_f0,P0_f2 1384 addcc %i0,-1,%i0 1385 ble,pn %icc,.endloop0 1386! delay slot, harmless if branch taken 1387 add %i3,%i4,%i3 ! y += stridey 1388 andn %l1,MSK_SIGN,%l0 ! 
hx &= ~0x80000000 1389 fmovd P1_f10,P0_f0 1390 ba,pt %icc,.loop0 1391! delay slot 1392 add %i1,%i2,%i1 ! x += stridex 1393 1394 1395 .align 32 1396.range1: 1397 cmp %l1,LIM_l6 1398 bg,a,pt %icc,.MEDIUM ! branch to Medium range on big arg. 1399! delay slot, annulled if branch not taken 1400 mov 0x2,LIM_l6 ! set biguns flag or 1401 fdtoi P1_f10,P1_f12; fmovd C_ONE,P1_f10 ; st P1_f10,[%o1] ! *y = *x with inexact if x nonzero 1402 st P1_f11,[%o1+4] 1403 !nop ! (vsin) fdtoi P1_f10,P1_f12 1404 addcc %i0,-1,%i0 1405 ble,pn %icc,.endloop1 1406! delay slot, harmless if branch taken 1407 add %i3,%i4,%i3 ! y += stridey 1408 andn %l2,MSK_SIGN,%l1 ! hx &= ~0x80000000 1409 fmovd P2_f20,P1_f10 1410 ba,pt %icc,.loop1 1411! delay slot 1412 add %i1,%i2,%i1 ! x += stridex 1413 1414 1415 .align 32 1416.range2: 1417 cmp %l2,LIM_l6 1418 bg,a,pt %icc,.MEDIUM ! brance to Medium range on big arg. 1419! delay slot, annulled if branch not taken 1420 mov 0x3,LIM_l6 ! set biguns flag or 1421 fdtoi P2_f20,P2_f22; fmovd C_ONE,P2_f20 ; st P2_f20,[%o2] ! *y = *x with inexact if x nonzero 1422 st P2_f21,[%o2+4] 1423 nop ! (vsin) fdtoi P2_f20,P2_f22 14241: 1425 addcc %i0,-1,%i0 1426 ble,pn %icc,.endloop2 1427! delay slot 1428 nop 1429 ld [%i1],%l2 1430 ld [%i1],P2_f20 1431 ld [%i1+4],P2_f21 1432 andn %l2,MSK_SIGN,%l2 ! hx &= ~0x80000000 1433 ba,pt %icc,.loop2 1434! delay slot 1435 add %i1,%i2,%i1 ! x += stridex 1436 1437 1438 .align 32 1439.MEDIUM: 1440 1441! ========== medium range ========== 1442 1443! register use 1444 1445! i0 n 1446! i1 x 1447! i2 stridex 1448! i3 y 1449! i4 stridey 1450! i5 0x80000000 1451 1452! l0 hx0 1453! l1 hx1 1454! l2 hx2 1455! l3 __vlibm_TBL_sincos_hi 1456! l4 __vlibm_TBL_sincos_lo 1457! l5 constants 1458! l6 biguns stored here : still called LIM_l6 1459! l7 0x413921fb 1460 1461! the following are 64-bit registers in both V8+ and V9 1462 1463! g1 scratch 1464! g5 1465 1466! o0 py0 1467! o1 py1 1468! o2 py2 1469! o3 n0 1470! o4 n1 1471! o5 n2 1472! o7 scratch 1473 1474! 
f0 x0 1475! f2 n0,y0 1476! f4 1477! f6 1478! f8 scratch for table base 1479! f9 signbit0 1480! f10 x1 1481! f12 n1,y1 1482! f14 1483! f16 1484! f18 scratch for table base 1485! f19 signbit1 1486! f20 x2 1487! f22 n2,y2 1488! f24 1489! f26 1490! f28 scratch for table base 1491! f29 signbit2 1492! f30 0x80000000 1493! f31 0x4000 1494! f32 1495! f34 1496! f36 1497! f38 1498! f40 invpio2 1499! f42 round 1500! f44 0xffff800000000000 1501! f46 pio2_1 1502! f48 pio2_2 1503! f50 pio2_3 1504! f52 pio2_3t 1505! f54 one 1506! f56 pp1 1507! f58 pp2 1508! f60 qq1 1509! f62 qq2 1510 1511 1512 PIC_SET(g5,constants,l5) 1513 1514 ! %o3,%o4,%o5 need to be stored 1515 st P0_f6,[%o3] 1516 sethi %hi(0x413921fb),%l7 1517 st P0_f7,[%o3+4] 1518 or %l7,%lo(0x413921fb),%l7 1519 st P1_f16,[%o4] 1520 st P1_f17,[%o4+4] 1521 st P2_f26,[%o5] 1522 st P2_f27,[%o5+4] 1523 ldd [%l5+invpio2],%f40 1524 ldd [%l5+round],%f42 1525 ldd [%l5+pio2_1],%f46 1526 ldd [%l5+pio2_2],%f48 1527 ldd [%l5+pio2_3],%f50 1528 ldd [%l5+pio2_3t],%f52 1529 std %f54,[%fp+x0_1+8] ! set up stack data 1530 std %f54,[%fp+x1_1+8] 1531 std %f54,[%fp+x2_1+8] 1532 stx %g0,[%fp+y0_0+8] 1533 stx %g0,[%fp+y1_0+8] 1534 stx %g0,[%fp+y2_0+8] 1535 1536! branched here in the middle of the array. Need to adjust 1537! for the members of the triple that were selected in the primary 1538! loop. 1539 1540! no adjustment since all three selected here 1541 subcc LIM_l6,0x1,%g0 ! continue in LOOP0? 1542 bz,a %icc,.LOOP0 1543 mov 0x0,LIM_l6 ! delay slot set biguns=0 1544 1545! adjust 1st triple since 2nd and 3rd done here 1546 subcc LIM_l6,0x2,%g0 ! continue in LOOP1? 1547 fmuld %f0,%f40,%f2 ! adj LOOP0 1548 bz,a %icc,.LOOP1 1549 mov 0x0,LIM_l6 ! delay slot set biguns=0 1550 1551! adjust 1st and 2nd triple since 3rd done here 1552 subcc LIM_l6,0x3,%g0 ! continue in LOOP2? 1553 !done fmuld %f0,%f40,%f2 ! adj LOOP0 1554 sub %i3,%i4,%i3 ! adjust to not double increment 1555 fmuld %f10,%f40,%f12 ! adj LOOP1 1556 faddd %f2,%f42,%f2 ! 
adj LOOP1 1557 bz,a %icc,.LOOP2 1558 mov 0x0,LIM_l6 ! delay slot set biguns=0 1559 1560 ba .LOOP0 1561 nop 1562 1563! -- 16 byte aligned 1564 1565 .align 32 1566.LOOP0: 1567 lda [%i1]%asi,%l1 ! preload next argument 1568 mov %i3,%o0 ! py0 = y 1569 1570 lda [%i1]%asi,%f10 1571 cmp %l0,%l7 1572 add %i3,%i4,%i3 ! y += stridey 1573 bg,pn %icc,.BIG0 ! if hx > 0x413921fb 1574 1575! delay slot 1576 lda [%i1+4]%asi,%f11 1577 addcc %i0,-1,%i0 1578 add %i1,%i2,%i1 ! x += stridex 1579 ble,pn %icc,.ENDLOOP1 1580 1581! delay slot 1582 andn %l1,%i5,%l1 1583 nop 1584 fmuld %f0,%f40,%f2 1585 fabsd %f54,%f54 ! a nop for alignment only 1586 1587.LOOP1: 1588 lda [%i1]%asi,%l2 ! preload next argument 1589 mov %i3,%o1 ! py1 = y 1590 1591 lda [%i1]%asi,%f20 1592 cmp %l1,%l7 1593 add %i3,%i4,%i3 ! y += stridey 1594 bg,pn %icc,.BIG1 ! if hx > 0x413921fb 1595 1596! delay slot 1597 lda [%i1+4]%asi,%f21 1598 addcc %i0,-1,%i0 1599 add %i1,%i2,%i1 ! x += stridex 1600 ble,pn %icc,.ENDLOOP2 1601 1602! delay slot 1603 andn %l2,%i5,%l2 1604 nop 1605 fmuld %f10,%f40,%f12 1606 faddd %f2,%f42,%f2 1607 1608.LOOP2: 1609 st %f3,[%fp+n0] 1610 mov %i3,%o2 ! py2 = y 1611 1612 cmp %l2,%l7 1613 add %i3,%i4,%i3 ! y += stridey 1614 fmuld %f20,%f40,%f22 1615 bg,pn %icc,.BIG2 ! if hx > 0x413921fb 1616 1617! delay slot 1618 add %l5,thresh+4,%o7 1619 faddd %f12,%f42,%f12 1620 st %f13,[%fp+n1] 1621 1622! - 1623 1624 add %l5,thresh,%g1 1625 faddd %f22,%f42,%f22 1626 st %f23,[%fp+n2] 1627 1628 fsubd %f2,%f42,%f2 ! n 1629 1630 fsubd %f12,%f42,%f12 ! n 1631 1632 fsubd %f22,%f42,%f22 ! 
n 1633 1634 fmuld %f2,%f46,%f4 1635 1636 fmuld %f12,%f46,%f14 1637 1638 fmuld %f22,%f46,%f24 1639 1640 fsubd %f0,%f4,%f4 1641 fmuld %f2,%f48,%f6 1642 1643 fsubd %f10,%f14,%f14 1644 fmuld %f12,%f48,%f16 1645 1646 fsubd %f20,%f24,%f24 1647 fmuld %f22,%f48,%f26 1648 1649 fsubd %f4,%f6,%f0 1650 ld [%fp+n0],%o3 ; add %o3,1,%o3 1651 1652 fsubd %f14,%f16,%f10 1653 ld [%fp+n1],%o4 ; add %o4,1,%o4 1654 1655 fsubd %f24,%f26,%f20 1656 ld [%fp+n2],%o5 ; add %o5,1,%o5 1657 1658 fsubd %f4,%f0,%f32 1659 and %o3,1,%o3 1660 1661 fsubd %f14,%f10,%f34 1662 and %o4,1,%o4 1663 1664 fsubd %f24,%f20,%f36 1665 and %o5,1,%o5 1666 1667 fsubd %f32,%f6,%f32 1668 fmuld %f2,%f50,%f8 1669 sll %o3,3,%o3 1670 1671 fsubd %f34,%f16,%f34 1672 fmuld %f12,%f50,%f18 1673 sll %o4,3,%o4 1674 1675 fsubd %f36,%f26,%f36 1676 fmuld %f22,%f50,%f28 1677 sll %o5,3,%o5 1678 1679 fsubd %f8,%f32,%f8 1680 ld [%g1+%o3],%f6 1681 1682 fsubd %f18,%f34,%f18 1683 ld [%g1+%o4],%f16 1684 1685 fsubd %f28,%f36,%f28 1686 ld [%g1+%o5],%f26 1687 1688 fsubd %f0,%f8,%f4 1689 1690 fsubd %f10,%f18,%f14 1691 1692 fsubd %f20,%f28,%f24 1693 1694 fsubd %f0,%f4,%f32 1695 1696 fsubd %f10,%f14,%f34 1697 1698 fsubd %f20,%f24,%f36 1699 1700 fsubd %f32,%f8,%f32 1701 fmuld %f2,%f52,%f2 1702 1703 fsubd %f34,%f18,%f34 1704 fmuld %f12,%f52,%f12 1705 1706 fsubd %f36,%f28,%f36 1707 fmuld %f22,%f52,%f22 1708 1709 fsubd %f2,%f32,%f2 1710 ld [%o7+%o3],%f8 1711 1712 fsubd %f12,%f34,%f12 1713 ld [%o7+%o4],%f18 1714 1715 fsubd %f22,%f36,%f22 1716 ld [%o7+%o5],%f28 1717 1718 fsubd %f4,%f2,%f0 ! x 1719 1720 fsubd %f14,%f12,%f10 ! x 1721 1722 fsubd %f24,%f22,%f20 ! x 1723 1724 fsubd %f4,%f0,%f4 1725 1726 fsubd %f14,%f10,%f14 1727 1728 fsubd %f24,%f20,%f24 1729 1730 fands %f0,%f30,%f9 ! save signbit 1731 1732 fands %f10,%f30,%f19 ! save signbit 1733 1734 fands %f20,%f30,%f29 ! 
save signbit 1735 1736 fabsd %f0,%f0 1737 std %f0,[%fp+x0_1] 1738 1739 fabsd %f10,%f10 1740 std %f10,[%fp+x1_1] 1741 1742 fabsd %f20,%f20 1743 std %f20,[%fp+x2_1] 1744 1745 fsubd %f4,%f2,%f2 ! y 1746 1747 fsubd %f14,%f12,%f12 ! y 1748 1749 fsubd %f24,%f22,%f22 ! y 1750 1751 fcmpgt32 %f6,%f0,%l0 1752 1753 fcmpgt32 %f16,%f10,%l1 1754 1755 fcmpgt32 %f26,%f20,%l2 1756 1757! -- 16 byte aligned 1758 fxors %f2,%f9,%f2 1759 1760 fxors %f12,%f19,%f12 1761 1762 fxors %f22,%f29,%f22 1763 1764 fands %f9,%f8,%f9 ! if (n & 1) clear sign bit 1765 andcc %l0,2,%g0 1766 bne,pn %icc,.CASE4 1767 1768! delay slot 1769 fands %f19,%f18,%f19 ! if (n & 1) clear sign bit 1770 andcc %l1,2,%g0 1771 bne,pn %icc,.CASE2 1772 1773! delay slot 1774 fands %f29,%f28,%f29 ! if (n & 1) clear sign bit 1775 andcc %l2,2,%g0 1776 bne,pn %icc,.CASE1 1777 1778! delay slot 1779 fpadd32s %f0,%f31,%f8 1780 sethi %hi(0x3fc3c000),%o7 1781 ld [%fp+x0_1],%l0 1782 1783 fpadd32s %f10,%f31,%f18 1784 add %l3,8,%g1 1785 ld [%fp+x1_1],%l1 1786 1787 fpadd32s %f20,%f31,%f28 1788 ld [%fp+x2_1],%l2 1789 1790 fand %f8,%f44,%f4 1791 sub %l0,%o7,%l0 1792 1793 fand %f18,%f44,%f14 1794 sub %l1,%o7,%l1 1795 1796 fand %f28,%f44,%f24 1797 sub %l2,%o7,%l2 1798 1799 fsubd %f0,%f4,%f0 1800 srl %l0,10,%l0 1801 1802 fsubd %f10,%f14,%f10 1803 srl %l1,10,%l1 1804 1805 fsubd %f20,%f24,%f20 1806 srl %l2,10,%l2 1807 1808 faddd %f0,%f2,%f0 1809 andn %l0,0x1f,%l0 1810 1811 faddd %f10,%f12,%f10 1812 andn %l1,0x1f,%l1 1813 1814 faddd %f20,%f22,%f20 1815 andn %l2,0x1f,%l2 1816 1817 fmuld %f0,%f0,%f2 1818 add %l0,%o3,%l0 1819 1820 fmuld %f10,%f10,%f12 1821 add %l1,%o4,%l1 1822 1823 fmuld %f20,%f20,%f22 1824 add %l2,%o5,%l2 1825 1826 fmuld %f2,%f58,%f6 1827 ldd [%l3+%l0],%f32 1828 1829 fmuld %f12,%f58,%f16 1830 ldd [%l3+%l1],%f34 1831 1832 fmuld %f22,%f58,%f26 1833 ldd [%l3+%l2],%f36 1834 1835 faddd %f6,%f56,%f6 1836 fmuld %f2,%f62,%f4 1837 1838 faddd %f16,%f56,%f16 1839 fmuld %f12,%f62,%f14 1840 1841 faddd %f26,%f56,%f26 1842 fmuld %f22,%f62,%f24 
1843 1844 fmuld %f2,%f6,%f6 1845 faddd %f4,%f60,%f4 1846 1847 fmuld %f12,%f16,%f16 1848 faddd %f14,%f60,%f14 1849 1850 fmuld %f22,%f26,%f26 1851 faddd %f24,%f60,%f24 1852 1853 faddd %f6,%f54,%f6 1854 fmuld %f2,%f4,%f4 1855 1856 faddd %f16,%f54,%f16 1857 fmuld %f12,%f14,%f14 1858 1859 faddd %f26,%f54,%f26 1860 fmuld %f22,%f24,%f24 1861 1862 fmuld %f0,%f6,%f6 1863 ldd [%g1+%l0],%f2 1864 1865 fmuld %f10,%f16,%f16 1866 ldd [%g1+%l1],%f12 1867 1868 fmuld %f20,%f26,%f26 1869 ldd [%g1+%l2],%f22 1870 1871 fmuld %f4,%f32,%f4 1872 ldd [%l4+%l0],%f0 1873 1874 fmuld %f14,%f34,%f14 1875 ldd [%l4+%l1],%f10 1876 1877 fmuld %f24,%f36,%f24 1878 ldd [%l4+%l2],%f20 1879 1880 fmuld %f6,%f2,%f6 1881 1882 fmuld %f16,%f12,%f16 1883 1884 fmuld %f26,%f22,%f26 1885 1886 faddd %f6,%f4,%f6 1887 1888 faddd %f16,%f14,%f16 1889 1890 faddd %f26,%f24,%f26 1891 1892 faddd %f6,%f0,%f6 1893 1894 faddd %f16,%f10,%f16 1895 1896 faddd %f26,%f20,%f26 1897 1898 faddd %f6,%f32,%f6 1899 1900 faddd %f16,%f34,%f16 1901 1902 faddd %f26,%f36,%f26 1903 1904.FIXSIGN: 1905 ld [%fp+n0],%o3 ; add %o3,1,%o3 1906 add %l5,thresh-4,%g1 1907 1908 ld [%fp+n1],%o4 ; add %o4,1,%o4 1909 1910 ld [%fp+n2],%o5 ; add %o5,1,%o5 1911 and %o3,2,%o3 1912 1913 sll %o3,2,%o3 1914 and %o4,2,%o4 1915 lda [%i1]%asi,%l0 ! preload next argument 1916 1917 sll %o4,2,%o4 1918 and %o5,2,%o5 1919 ld [%g1+%o3],%f8 1920 1921 sll %o5,2,%o5 1922 ld [%g1+%o4],%f18 1923 1924 ld [%g1+%o5],%f28 1925 fxors %f9,%f8,%f9 1926 1927 lda [%i1]%asi,%f0 1928 fxors %f29,%f28,%f29 1929 1930 lda [%i1+4]%asi,%f1 1931 fxors %f19,%f18,%f19 1932 1933 fors %f6,%f9,%f6 ! tack on sign 1934 add %i1,%i2,%i1 ! x += stridex 1935 st %f6,[%o0] 1936 1937 fors %f26,%f29,%f26 ! tack on sign 1938 st %f7,[%o0+4] 1939 1940 fors %f16,%f19,%f16 ! tack on sign 1941 st %f26,[%o2] 1942 1943 st %f27,[%o2+4] 1944 addcc %i0,-1,%i0 1945 1946 st %f16,[%o1] 1947 andn %l0,%i5,%l0 ! hx &= ~0x80000000 1948 bg,pt %icc,.LOOP0 1949 1950! 
delay slot 1951 st %f17,[%o1+4] 1952 1953 ba,pt %icc,.ENDLOOP0 1954! delay slot 1955 nop 1956 1957 .align 32 1958.CASE1: 1959 fpadd32s %f10,%f31,%f18 1960 sethi %hi(0x3fc3c000),%o7 1961 ld [%fp+x0_1],%l0 1962 1963 fand %f8,%f44,%f4 1964 add %l3,8,%g1 1965 ld [%fp+x1_1],%l1 1966 1967 fand %f18,%f44,%f14 1968 sub %l0,%o7,%l0 1969 1970 fsubd %f0,%f4,%f0 1971 srl %l0,10,%l0 1972 sub %l1,%o7,%l1 1973 1974 fsubd %f10,%f14,%f10 1975 srl %l1,10,%l1 1976 1977 fmuld %f20,%f20,%f20 1978 ldd [%l5+%o5],%f36 1979 add %l5,%o5,%l2 1980 1981 faddd %f0,%f2,%f0 1982 andn %l0,0x1f,%l0 1983 1984 faddd %f10,%f12,%f10 1985 andn %l1,0x1f,%l1 1986 1987 fmuld %f20,%f36,%f24 1988 ldd [%l2+0x10],%f26 1989 add %fp,%o5,%o5 1990 1991 fmuld %f0,%f0,%f2 1992 add %l0,%o3,%l0 1993 1994 fmuld %f10,%f10,%f12 1995 add %l1,%o4,%l1 1996 1997 faddd %f24,%f26,%f24 1998 ldd [%l2+0x20],%f36 1999 2000 fmuld %f2,%f58,%f6 2001 ldd [%l3+%l0],%f32 2002 2003 fmuld %f12,%f58,%f16 2004 ldd [%l3+%l1],%f34 2005 2006 fmuld %f20,%f24,%f24 2007 ldd [%l2+0x30],%f26 2008 2009 faddd %f6,%f56,%f6 2010 fmuld %f2,%f62,%f4 2011 2012 faddd %f16,%f56,%f16 2013 fmuld %f12,%f62,%f14 2014 2015 faddd %f24,%f36,%f24 2016 ldd [%o5+x2_1],%f36 2017 2018 fmuld %f2,%f6,%f6 2019 faddd %f4,%f60,%f4 2020 2021 fmuld %f12,%f16,%f16 2022 faddd %f14,%f60,%f14 2023 2024 fmuld %f20,%f24,%f24 2025 2026 faddd %f6,%f54,%f6 2027 fmuld %f2,%f4,%f4 2028 ldd [%g1+%l0],%f2 2029 2030 faddd %f16,%f54,%f16 2031 fmuld %f12,%f14,%f14 2032 ldd [%g1+%l1],%f12 2033 2034 faddd %f24,%f26,%f24 2035 2036 fmuld %f0,%f6,%f6 2037 ldd [%l4+%l0],%f0 2038 2039 fmuld %f10,%f16,%f16 2040 ldd [%l4+%l1],%f10 2041 2042 fmuld %f4,%f32,%f4 2043 std %f22,[%fp+y2_0] 2044 2045 fmuld %f14,%f34,%f14 2046 2047 fmuld %f6,%f2,%f6 2048 2049 fmuld %f16,%f12,%f16 2050 2051 fmuld %f20,%f24,%f24 2052 2053 faddd %f6,%f4,%f6 2054 2055 faddd %f16,%f14,%f16 2056 2057 fmuld %f36,%f24,%f24 2058 ldd [%o5+y2_0],%f22 2059 2060 faddd %f6,%f0,%f6 2061 2062 faddd %f16,%f10,%f16 2063 2064 faddd 
%f24,%f22,%f24 2065 2066 faddd %f6,%f32,%f6 2067 2068 faddd %f16,%f34,%f16 2069 ba,pt %icc,.FIXSIGN 2070 2071! delay slot 2072 faddd %f36,%f24,%f26 2073 2074 .align 32 2075.CASE2: 2076 fpadd32s %f0,%f31,%f8 2077 ld [%fp+x0_1],%l0 2078 andcc %l2,2,%g0 2079 bne,pn %icc,.CASE3 2080 2081! delay slot 2082 sethi %hi(0x3fc3c000),%o7 2083 fpadd32s %f20,%f31,%f28 2084 ld [%fp+x2_1],%l2 2085 2086 fand %f8,%f44,%f4 2087 sub %l0,%o7,%l0 2088 add %l3,8,%g1 2089 2090 fand %f28,%f44,%f24 2091 sub %l2,%o7,%l2 2092 2093 fsubd %f0,%f4,%f0 2094 srl %l0,10,%l0 2095 2096 fsubd %f20,%f24,%f20 2097 srl %l2,10,%l2 2098 2099 fmuld %f10,%f10,%f10 2100 ldd [%l5+%o4],%f34 2101 add %l5,%o4,%l1 2102 2103 faddd %f0,%f2,%f0 2104 andn %l0,0x1f,%l0 2105 2106 faddd %f20,%f22,%f20 2107 andn %l2,0x1f,%l2 2108 2109 fmuld %f10,%f34,%f14 2110 ldd [%l1+0x10],%f16 2111 add %fp,%o4,%o4 2112 2113 fmuld %f0,%f0,%f2 2114 add %l0,%o3,%l0 2115 2116 fmuld %f20,%f20,%f22 2117 add %l2,%o5,%l2 2118 2119 faddd %f14,%f16,%f14 2120 ldd [%l1+0x20],%f34 2121 2122 fmuld %f2,%f58,%f6 2123 ldd [%l3+%l0],%f32 2124 2125 fmuld %f22,%f58,%f26 2126 ldd [%l3+%l2],%f36 2127 2128 fmuld %f10,%f14,%f14 2129 ldd [%l1+0x30],%f16 2130 2131 faddd %f6,%f56,%f6 2132 fmuld %f2,%f62,%f4 2133 2134 faddd %f26,%f56,%f26 2135 fmuld %f22,%f62,%f24 2136 2137 faddd %f14,%f34,%f14 2138 ldd [%o4+x1_1],%f34 2139 2140 fmuld %f2,%f6,%f6 2141 faddd %f4,%f60,%f4 2142 2143 fmuld %f22,%f26,%f26 2144 faddd %f24,%f60,%f24 2145 2146 fmuld %f10,%f14,%f14 2147 2148 faddd %f6,%f54,%f6 2149 fmuld %f2,%f4,%f4 2150 ldd [%g1+%l0],%f2 2151 2152 faddd %f26,%f54,%f26 2153 fmuld %f22,%f24,%f24 2154 ldd [%g1+%l2],%f22 2155 2156 faddd %f14,%f16,%f14 2157 2158 fmuld %f0,%f6,%f6 2159 ldd [%l4+%l0],%f0 2160 2161 fmuld %f20,%f26,%f26 2162 ldd [%l4+%l2],%f20 2163 2164 fmuld %f4,%f32,%f4 2165 std %f12,[%fp+y1_0] 2166 2167 fmuld %f24,%f36,%f24 2168 2169 fmuld %f6,%f2,%f6 2170 2171 fmuld %f26,%f22,%f26 2172 2173 fmuld %f10,%f14,%f14 2174 2175 faddd %f6,%f4,%f6 2176 2177 faddd 
%f26,%f24,%f26 2178 2179 fmuld %f34,%f14,%f14 2180 ldd [%o4+y1_0],%f12 2181 2182 faddd %f6,%f0,%f6 2183 2184 faddd %f26,%f20,%f26 2185 2186 faddd %f14,%f12,%f14 2187 2188 faddd %f6,%f32,%f6 2189 2190 faddd %f26,%f36,%f26 2191 ba,pt %icc,.FIXSIGN 2192 2193! delay slot 2194 faddd %f34,%f14,%f16 2195 2196 .align 32 2197.CASE3: 2198 fand %f8,%f44,%f4 2199 add %l3,8,%g1 2200 sub %l0,%o7,%l0 2201 2202 fmuld %f10,%f10,%f10 2203 ldd [%l5+%o4],%f34 2204 add %l5,%o4,%l1 2205 2206 fsubd %f0,%f4,%f0 2207 srl %l0,10,%l0 2208 2209 fmuld %f20,%f20,%f20 2210 ldd [%l5+%o5],%f36 2211 add %l5,%o5,%l2 2212 2213 fmuld %f10,%f34,%f14 2214 ldd [%l1+0x10],%f16 2215 add %fp,%o4,%o4 2216 2217 faddd %f0,%f2,%f0 2218 andn %l0,0x1f,%l0 2219 2220 fmuld %f20,%f36,%f24 2221 ldd [%l2+0x10],%f26 2222 add %fp,%o5,%o5 2223 2224 faddd %f14,%f16,%f14 2225 ldd [%l1+0x20],%f34 2226 2227 fmuld %f0,%f0,%f2 2228 add %l0,%o3,%l0 2229 2230 faddd %f24,%f26,%f24 2231 ldd [%l2+0x20],%f36 2232 2233 fmuld %f10,%f14,%f14 2234 ldd [%l1+0x30],%f16 2235 2236 fmuld %f2,%f58,%f6 2237 ldd [%l3+%l0],%f32 2238 2239 fmuld %f20,%f24,%f24 2240 ldd [%l2+0x30],%f26 2241 2242 faddd %f14,%f34,%f14 2243 ldd [%o4+x1_1],%f34 2244 2245 faddd %f6,%f56,%f6 2246 fmuld %f2,%f62,%f4 2247 2248 faddd %f24,%f36,%f24 2249 ldd [%o5+x2_1],%f36 2250 2251 fmuld %f10,%f14,%f14 2252 std %f12,[%fp+y1_0] 2253 2254 fmuld %f2,%f6,%f6 2255 faddd %f4,%f60,%f4 2256 2257 fmuld %f20,%f24,%f24 2258 std %f22,[%fp+y2_0] 2259 2260 faddd %f14,%f16,%f14 2261 2262 faddd %f6,%f54,%f6 2263 fmuld %f2,%f4,%f4 2264 ldd [%g1+%l0],%f2 2265 2266 faddd %f24,%f26,%f24 2267 2268 fmuld %f10,%f14,%f14 2269 2270 fmuld %f0,%f6,%f6 2271 ldd [%l4+%l0],%f0 2272 2273 fmuld %f4,%f32,%f4 2274 2275 fmuld %f20,%f24,%f24 2276 2277 fmuld %f6,%f2,%f6 2278 2279 fmuld %f34,%f14,%f14 2280 ldd [%o4+y1_0],%f12 2281 2282 fmuld %f36,%f24,%f24 2283 ldd [%o5+y2_0],%f22 2284 2285 faddd %f6,%f4,%f6 2286 2287 faddd %f14,%f12,%f14 2288 2289 faddd %f24,%f22,%f24 2290 2291 faddd %f6,%f0,%f6 2292 2293 
faddd %f34,%f14,%f16 2294 2295 faddd %f36,%f24,%f26 2296 ba,pt %icc,.FIXSIGN 2297 2298! delay slot 2299 faddd %f6,%f32,%f6 2300 2301 .align 32 2302.CASE4: 2303 fands %f29,%f28,%f29 ! if (n & 1) clear sign bit 2304 sethi %hi(0x3fc3c000),%o7 2305 andcc %l1,2,%g0 2306 bne,pn %icc,.CASE6 2307 2308! delay slot 2309 andcc %l2,2,%g0 2310 fpadd32s %f10,%f31,%f18 2311 ld [%fp+x1_1],%l1 2312 bne,pn %icc,.CASE5 2313 2314! delay slot 2315 add %l3,8,%g1 2316 ld [%fp+x2_1],%l2 2317 fpadd32s %f20,%f31,%f28 2318 2319 fand %f18,%f44,%f14 2320 sub %l1,%o7,%l1 2321 2322 fand %f28,%f44,%f24 2323 sub %l2,%o7,%l2 2324 2325 fsubd %f10,%f14,%f10 2326 srl %l1,10,%l1 2327 2328 fsubd %f20,%f24,%f20 2329 srl %l2,10,%l2 2330 2331 fmuld %f0,%f0,%f0 2332 ldd [%l5+%o3],%f32 2333 add %l5,%o3,%l0 2334 2335 faddd %f10,%f12,%f10 2336 andn %l1,0x1f,%l1 2337 2338 faddd %f20,%f22,%f20 2339 andn %l2,0x1f,%l2 2340 2341 fmuld %f0,%f32,%f4 2342 ldd [%l0+0x10],%f6 2343 add %fp,%o3,%o3 2344 2345 fmuld %f10,%f10,%f12 2346 add %l1,%o4,%l1 2347 2348 fmuld %f20,%f20,%f22 2349 add %l2,%o5,%l2 2350 2351 faddd %f4,%f6,%f4 2352 ldd [%l0+0x20],%f32 2353 2354 fmuld %f12,%f58,%f16 2355 ldd [%l3+%l1],%f34 2356 2357 fmuld %f22,%f58,%f26 2358 ldd [%l3+%l2],%f36 2359 2360 fmuld %f0,%f4,%f4 2361 ldd [%l0+0x30],%f6 2362 2363 faddd %f16,%f56,%f16 2364 fmuld %f12,%f62,%f14 2365 2366 faddd %f26,%f56,%f26 2367 fmuld %f22,%f62,%f24 2368 2369 faddd %f4,%f32,%f4 2370 ldd [%o3+x0_1],%f32 2371 2372 fmuld %f12,%f16,%f16 2373 faddd %f14,%f60,%f14 2374 2375 fmuld %f22,%f26,%f26 2376 faddd %f24,%f60,%f24 2377 2378 fmuld %f0,%f4,%f4 2379 2380 faddd %f16,%f54,%f16 2381 fmuld %f12,%f14,%f14 2382 ldd [%g1+%l1],%f12 2383 2384 faddd %f26,%f54,%f26 2385 fmuld %f22,%f24,%f24 2386 ldd [%g1+%l2],%f22 2387 2388 faddd %f4,%f6,%f4 2389 2390 fmuld %f10,%f16,%f16 2391 ldd [%l4+%l1],%f10 2392 2393 fmuld %f20,%f26,%f26 2394 ldd [%l4+%l2],%f20 2395 2396 fmuld %f14,%f34,%f14 2397 std %f2,[%fp+y0_0] 2398 2399 fmuld %f24,%f36,%f24 2400 2401 fmuld %f0,%f4,%f4 
2402 2403 fmuld %f16,%f12,%f16 2404 2405 fmuld %f26,%f22,%f26 2406 2407 fmuld %f32,%f4,%f4 2408 ldd [%o3+y0_0],%f2 2409 2410 faddd %f16,%f14,%f16 2411 2412 faddd %f26,%f24,%f26 2413 2414 faddd %f4,%f2,%f4 2415 2416 faddd %f16,%f10,%f16 2417 2418 faddd %f26,%f20,%f26 2419 2420 faddd %f32,%f4,%f6 2421 2422 faddd %f16,%f34,%f16 2423 ba,pt %icc,.FIXSIGN 2424 2425! delay slot 2426 faddd %f26,%f36,%f26 2427 2428 .align 32 2429.CASE5: 2430 fand %f18,%f44,%f14 2431 sub %l1,%o7,%l1 2432 2433 fmuld %f0,%f0,%f0 2434 ldd [%l5+%o3],%f32 2435 add %l5,%o3,%l0 2436 2437 fsubd %f10,%f14,%f10 2438 srl %l1,10,%l1 2439 2440 fmuld %f20,%f20,%f20 2441 ldd [%l5+%o5],%f36 2442 add %l5,%o5,%l2 2443 2444 fmuld %f0,%f32,%f4 2445 ldd [%l0+0x10],%f6 2446 add %fp,%o3,%o3 2447 2448 faddd %f10,%f12,%f10 2449 andn %l1,0x1f,%l1 2450 2451 fmuld %f20,%f36,%f24 2452 ldd [%l2+0x10],%f26 2453 add %fp,%o5,%o5 2454 2455 faddd %f4,%f6,%f4 2456 ldd [%l0+0x20],%f32 2457 2458 fmuld %f10,%f10,%f12 2459 add %l1,%o4,%l1 2460 2461 faddd %f24,%f26,%f24 2462 ldd [%l2+0x20],%f36 2463 2464 fmuld %f0,%f4,%f4 2465 ldd [%l0+0x30],%f6 2466 2467 fmuld %f12,%f58,%f16 2468 ldd [%l3+%l1],%f34 2469 2470 fmuld %f20,%f24,%f24 2471 ldd [%l2+0x30],%f26 2472 2473 faddd %f4,%f32,%f4 2474 ldd [%o3+x0_1],%f32 2475 2476 faddd %f16,%f56,%f16 2477 fmuld %f12,%f62,%f14 2478 2479 faddd %f24,%f36,%f24 2480 ldd [%o5+x2_1],%f36 2481 2482 fmuld %f0,%f4,%f4 2483 std %f2,[%fp+y0_0] 2484 2485 fmuld %f12,%f16,%f16 2486 faddd %f14,%f60,%f14 2487 2488 fmuld %f20,%f24,%f24 2489 std %f22,[%fp+y2_0] 2490 2491 faddd %f4,%f6,%f4 2492 2493 faddd %f16,%f54,%f16 2494 fmuld %f12,%f14,%f14 2495 ldd [%g1+%l1],%f12 2496 2497 faddd %f24,%f26,%f24 2498 2499 fmuld %f0,%f4,%f4 2500 2501 fmuld %f10,%f16,%f16 2502 ldd [%l4+%l1],%f10 2503 2504 fmuld %f14,%f34,%f14 2505 2506 fmuld %f20,%f24,%f24 2507 2508 fmuld %f16,%f12,%f16 2509 2510 fmuld %f32,%f4,%f4 2511 ldd [%o3+y0_0],%f2 2512 2513 fmuld %f36,%f24,%f24 2514 ldd [%o5+y2_0],%f22 2515 2516 faddd %f16,%f14,%f16 2517 
2518 faddd %f4,%f2,%f4 2519 2520 faddd %f24,%f22,%f24 2521 2522 faddd %f16,%f10,%f16 2523 2524 faddd %f32,%f4,%f6 2525 2526 faddd %f36,%f24,%f26 2527 ba,pt %icc,.FIXSIGN 2528 2529! delay slot 2530 faddd %f16,%f34,%f16 2531 2532 .align 32 2533.CASE6: 2534 ld [%fp+x2_1],%l2 2535 add %l3,8,%g1 2536 bne,pn %icc,.CASE7 2537! delay slot 2538 fpadd32s %f20,%f31,%f28 2539 2540 fand %f28,%f44,%f24 2541 ldd [%l5+%o3],%f32 2542 add %l5,%o3,%l0 2543 2544 fmuld %f0,%f0,%f0 2545 sub %l2,%o7,%l2 2546 2547 fsubd %f20,%f24,%f20 2548 srl %l2,10,%l2 2549 2550 fmuld %f10,%f10,%f10 2551 ldd [%l5+%o4],%f34 2552 add %l5,%o4,%l1 2553 2554 fmuld %f0,%f32,%f4 2555 ldd [%l0+0x10],%f6 2556 add %fp,%o3,%o3 2557 2558 faddd %f20,%f22,%f20 2559 andn %l2,0x1f,%l2 2560 2561 fmuld %f10,%f34,%f14 2562 ldd [%l1+0x10],%f16 2563 add %fp,%o4,%o4 2564 2565 faddd %f4,%f6,%f4 2566 ldd [%l0+0x20],%f32 2567 2568 fmuld %f20,%f20,%f22 2569 add %l2,%o5,%l2 2570 2571 faddd %f14,%f16,%f14 2572 ldd [%l1+0x20],%f34 2573 2574 fmuld %f0,%f4,%f4 2575 ldd [%l0+0x30],%f6 2576 2577 fmuld %f22,%f58,%f26 2578 ldd [%l3+%l2],%f36 2579 2580 fmuld %f10,%f14,%f14 2581 ldd [%l1+0x30],%f16 2582 2583 faddd %f4,%f32,%f4 2584 ldd [%o3+x0_1],%f32 2585 2586 faddd %f26,%f56,%f26 2587 fmuld %f22,%f62,%f24 2588 2589 faddd %f14,%f34,%f14 2590 ldd [%o4+x1_1],%f34 2591 2592 fmuld %f0,%f4,%f4 2593 std %f2,[%fp+y0_0] 2594 2595 fmuld %f22,%f26,%f26 2596 faddd %f24,%f60,%f24 2597 2598 fmuld %f10,%f14,%f14 2599 std %f12,[%fp+y1_0] 2600 2601 faddd %f4,%f6,%f4 2602 2603 faddd %f26,%f54,%f26 2604 fmuld %f22,%f24,%f24 2605 ldd [%g1+%l2],%f22 2606 2607 faddd %f14,%f16,%f14 2608 2609 fmuld %f0,%f4,%f4 2610 2611 fmuld %f20,%f26,%f26 2612 ldd [%l4+%l2],%f20 2613 2614 fmuld %f24,%f36,%f24 2615 2616 fmuld %f10,%f14,%f14 2617 2618 fmuld %f26,%f22,%f26 2619 2620 fmuld %f32,%f4,%f4 2621 ldd [%o3+y0_0],%f2 2622 2623 fmuld %f34,%f14,%f14 2624 ldd [%o4+y1_0],%f12 2625 2626 faddd %f26,%f24,%f26 2627 2628 faddd %f4,%f2,%f4 2629 2630 faddd %f14,%f12,%f14 2631 2632 
faddd %f26,%f20,%f26 2633 2634 faddd %f32,%f4,%f6 2635 2636 faddd %f34,%f14,%f16 2637 ba,pt %icc,.FIXSIGN 2638 2639! delay slot 2640 faddd %f26,%f36,%f26 2641 2642 .align 32 2643.CASE7: 2644 fmuld %f0,%f0,%f0 2645 ldd [%l5+%o3],%f32 2646 add %l5,%o3,%l0 2647 2648 fmuld %f10,%f10,%f10 2649 ldd [%l5+%o4],%f34 2650 add %l5,%o4,%l1 2651 2652 fmuld %f20,%f20,%f20 2653 ldd [%l5+%o5],%f36 2654 add %l5,%o5,%l2 2655 2656 fmuld %f0,%f32,%f4 2657 ldd [%l0+0x10],%f6 2658 add %fp,%o3,%o3 2659 2660 fmuld %f10,%f34,%f14 2661 ldd [%l1+0x10],%f16 2662 add %fp,%o4,%o4 2663 2664 fmuld %f20,%f36,%f24 2665 ldd [%l2+0x10],%f26 2666 add %fp,%o5,%o5 2667 2668 faddd %f4,%f6,%f4 2669 ldd [%l0+0x20],%f32 2670 2671 faddd %f14,%f16,%f14 2672 ldd [%l1+0x20],%f34 2673 2674 faddd %f24,%f26,%f24 2675 ldd [%l2+0x20],%f36 2676 2677 fmuld %f0,%f4,%f4 2678 ldd [%l0+0x30],%f6 2679 2680 fmuld %f10,%f14,%f14 2681 ldd [%l1+0x30],%f16 2682 2683 fmuld %f20,%f24,%f24 2684 ldd [%l2+0x30],%f26 2685 2686 faddd %f4,%f32,%f4 2687 ldd [%o3+x0_1],%f32 2688 2689 faddd %f14,%f34,%f14 2690 ldd [%o4+x1_1],%f34 2691 2692 faddd %f24,%f36,%f24 2693 ldd [%o5+x2_1],%f36 2694 2695 fmuld %f0,%f4,%f4 2696 std %f2,[%fp+y0_0] 2697 2698 fmuld %f10,%f14,%f14 2699 std %f12,[%fp+y1_0] 2700 2701 fmuld %f20,%f24,%f24 2702 std %f22,[%fp+y2_0] 2703 2704 faddd %f4,%f6,%f4 2705 2706 faddd %f14,%f16,%f14 2707 2708 faddd %f24,%f26,%f24 2709 2710 fmuld %f0,%f4,%f4 2711 2712 fmuld %f10,%f14,%f14 2713 2714 fmuld %f20,%f24,%f24 2715 2716 fmuld %f32,%f4,%f4 2717 ldd [%o3+y0_0],%f2 2718 2719 fmuld %f34,%f14,%f14 2720 ldd [%o4+y1_0],%f12 2721 2722 fmuld %f36,%f24,%f24 2723 ldd [%o5+y2_0],%f22 2724 2725 faddd %f4,%f2,%f4 2726 2727 faddd %f14,%f12,%f14 2728 2729 faddd %f24,%f22,%f24 2730 2731 faddd %f32,%f4,%f6 2732 2733 faddd %f34,%f14,%f16 2734 ba,pt %icc,.FIXSIGN 2735 2736! 
delay slot 2737 faddd %f36,%f24,%f26 2738 2739 2740 .align 32 2741.ENDLOOP2: 2742 fmuld %f10,%f40,%f12 2743 add %l5,thresh,%g1 2744 faddd %f12,%f42,%f12 2745 st %f13,[%fp+n1] 2746 fsubd %f12,%f42,%f12 ! n 2747 fmuld %f12,%f46,%f14 2748 fsubd %f10,%f14,%f14 2749 fmuld %f12,%f48,%f16 2750 fsubd %f14,%f16,%f10 2751 ld [%fp+n1],%o4 ; add %o4,1,%o4 2752 fsubd %f14,%f10,%f34 2753 and %o4,1,%o4 2754 fsubd %f34,%f16,%f34 2755 fmuld %f12,%f50,%f18 2756 sll %o4,3,%o4 2757 fsubd %f18,%f34,%f18 2758 ld [%g1+%o4],%f16 2759 fsubd %f10,%f18,%f14 2760 fsubd %f10,%f14,%f34 2761 add %l5,thresh+4,%o7 2762 fsubd %f34,%f18,%f34 2763 fmuld %f12,%f52,%f12 2764 fsubd %f12,%f34,%f12 2765 ld [%o7+%o4],%f18 2766 fsubd %f14,%f12,%f10 ! x 2767 fsubd %f14,%f10,%f14 2768 fands %f10,%f30,%f19 ! save signbit 2769 fabsd %f10,%f10 2770 std %f10,[%fp+x1_1] 2771 fsubd %f14,%f12,%f12 ! y 2772 fcmpgt32 %f16,%f10,%l1 2773 fxors %f12,%f19,%f12 2774 fands %f19,%f18,%f19 ! if (n & 1) clear sign bit 2775 andcc %l1,2,%g0 2776 bne,pn %icc,1f 2777! 
delay slot 2778 nop 2779 fpadd32s %f10,%f31,%f18 2780 ld [%fp+x1_1],%l1 2781 fand %f18,%f44,%f14 2782 sethi %hi(0x3fc3c000),%o7 2783 add %l3,8,%g1 2784 fsubd %f10,%f14,%f10 2785 sub %l1,%o7,%l1 2786 srl %l1,10,%l1 2787 faddd %f10,%f12,%f10 2788 andn %l1,0x1f,%l1 2789 fmuld %f10,%f10,%f12 2790 add %l1,%o4,%l1 2791 fmuld %f12,%f58,%f16 2792 ldd [%l3+%l1],%f34 2793 faddd %f16,%f56,%f16 2794 fmuld %f12,%f62,%f14 2795 fmuld %f12,%f16,%f16 2796 faddd %f14,%f60,%f14 2797 faddd %f16,%f54,%f16 2798 fmuld %f12,%f14,%f14 2799 ldd [%g1+%l1],%f12 2800 fmuld %f10,%f16,%f16 2801 ldd [%l4+%l1],%f10 2802 fmuld %f14,%f34,%f14 2803 fmuld %f16,%f12,%f16 2804 faddd %f16,%f14,%f16 2805 faddd %f16,%f10,%f16 2806 ba,pt %icc,2f 2807 faddd %f16,%f34,%f16 28081: 2809 fmuld %f10,%f10,%f10 2810 ldd [%l5+%o4],%f34 2811 add %l5,%o4,%l1 2812 fmuld %f10,%f34,%f14 2813 ldd [%l1+0x10],%f16 2814 add %fp,%o4,%o4 2815 faddd %f14,%f16,%f14 2816 ldd [%l1+0x20],%f34 2817 fmuld %f10,%f14,%f14 2818 ldd [%l1+0x30],%f16 2819 faddd %f14,%f34,%f14 2820 ldd [%o4+x1_1],%f34 2821 fmuld %f10,%f14,%f14 2822 std %f12,[%fp+y1_0] 2823 faddd %f14,%f16,%f14 2824 fmuld %f10,%f14,%f14 2825 fmuld %f34,%f14,%f14 2826 ldd [%o4+y1_0],%f12 2827 faddd %f14,%f12,%f14 2828 faddd %f34,%f14,%f16 28292: 2830 add %l5,thresh-4,%g1 2831 ld [%fp+n1],%o4 ; add %o4,1,%o4 2832 and %o4,2,%o4 2833 sll %o4,2,%o4 2834 ld [%g1+%o4],%f18 2835 fxors %f19,%f18,%f19 2836 fors %f16,%f19,%f16 ! tack on sign 2837 st %f16,[%o1] 2838 st %f17,[%o1+4] 2839 2840.ENDLOOP1: 2841 fmuld %f0,%f40,%f2 2842 add %l5,thresh,%g1 2843 faddd %f2,%f42,%f2 2844 st %f3,[%fp+n0] 2845 fsubd %f2,%f42,%f2 ! 
n 2846 fmuld %f2,%f46,%f4 2847 fsubd %f0,%f4,%f4 2848 fmuld %f2,%f48,%f6 2849 fsubd %f4,%f6,%f0 2850 ld [%fp+n0],%o3 ; add %o3,1,%o3 2851 fsubd %f4,%f0,%f32 2852 and %o3,1,%o3 2853 fsubd %f32,%f6,%f32 2854 fmuld %f2,%f50,%f8 2855 sll %o3,3,%o3 2856 fsubd %f8,%f32,%f8 2857 ld [%g1+%o3],%f6 2858 fsubd %f0,%f8,%f4 2859 fsubd %f0,%f4,%f32 2860 add %l5,thresh+4,%o7 2861 fsubd %f32,%f8,%f32 2862 fmuld %f2,%f52,%f2 2863 fsubd %f2,%f32,%f2 2864 ld [%o7+%o3],%f8 2865 fsubd %f4,%f2,%f0 ! x 2866 fsubd %f4,%f0,%f4 2867 fands %f0,%f30,%f9 ! save signbit 2868 fabsd %f0,%f0 2869 std %f0,[%fp+x0_1] 2870 fsubd %f4,%f2,%f2 ! y 2871 fcmpgt32 %f6,%f0,%l0 2872 fxors %f2,%f9,%f2 2873 fands %f9,%f8,%f9 ! if (n & 1) clear sign bit 2874 andcc %l0,2,%g0 2875 bne,pn %icc,1f 2876! delay slot 2877 nop 2878 fpadd32s %f0,%f31,%f8 2879 ld [%fp+x0_1],%l0 2880 fand %f8,%f44,%f4 2881 sethi %hi(0x3fc3c000),%o7 2882 add %l3,8,%g1 2883 fsubd %f0,%f4,%f0 2884 sub %l0,%o7,%l0 2885 srl %l0,10,%l0 2886 faddd %f0,%f2,%f0 2887 andn %l0,0x1f,%l0 2888 fmuld %f0,%f0,%f2 2889 add %l0,%o3,%l0 2890 fmuld %f2,%f58,%f6 2891 ldd [%l3+%l0],%f32 2892 faddd %f6,%f56,%f6 2893 fmuld %f2,%f62,%f4 2894 fmuld %f2,%f6,%f6 2895 faddd %f4,%f60,%f4 2896 faddd %f6,%f54,%f6 2897 fmuld %f2,%f4,%f4 2898 ldd [%g1+%l0],%f2 2899 fmuld %f0,%f6,%f6 2900 ldd [%l4+%l0],%f0 2901 fmuld %f4,%f32,%f4 2902 fmuld %f6,%f2,%f6 2903 faddd %f6,%f4,%f6 2904 faddd %f6,%f0,%f6 2905 ba,pt %icc,2f 2906 faddd %f6,%f32,%f6 29071: 2908 fmuld %f0,%f0,%f0 2909 ldd [%l5+%o3],%f32 2910 add %l5,%o3,%l0 2911 fmuld %f0,%f32,%f4 2912 ldd [%l0+0x10],%f6 2913 add %fp,%o3,%o3 2914 faddd %f4,%f6,%f4 2915 ldd [%l0+0x20],%f32 2916 fmuld %f0,%f4,%f4 2917 ldd [%l0+0x30],%f6 2918 faddd %f4,%f32,%f4 2919 ldd [%o3+x0_1],%f32 2920 fmuld %f0,%f4,%f4 2921 std %f2,[%fp+y0_0] 2922 faddd %f4,%f6,%f4 2923 fmuld %f0,%f4,%f4 2924 fmuld %f32,%f4,%f4 2925 ldd [%o3+y0_0],%f2 2926 faddd %f4,%f2,%f4 2927 faddd %f32,%f4,%f6 29282: 2929 add %l5,thresh-4,%g1 2930 ld [%fp+n0],%o3 ; add 
%o3,1,%o3 2931 and %o3,2,%o3 2932 sll %o3,2,%o3 2933 ld [%g1+%o3],%f8 2934 fxors %f9,%f8,%f9 2935 fors %f6,%f9,%f6 ! tack on sign 2936 st %f6,[%o0] 2937 st %f7,[%o0+4] 2938 2939.ENDLOOP0: 2940 2941! check for huge arguments remaining 2942 2943 tst LIM_l6 2944 be,pt %icc,.exit 2945! delay slot 2946 nop 2947 2948! ========== huge range (use C code) ========== 2949 2950#ifdef __sparcv9 2951 ldx [%fp+xsave],%o1 2952 ldx [%fp+ysave],%o3 2953#else 2954 ld [%fp+xsave],%o1 2955 ld [%fp+ysave],%o3 2956#endif 2957 ld [%fp+nsave],%o0 2958 ld [%fp+sxsave],%o2 2959 ld [%fp+sysave],%o4 2960 sra %o2,0,%o2 ! sign-extend for V9 2961 sra %o4,0,%o4 2962 call __vlibm_vcos_big 2963 mov %l7,%o5 ! delay slot 2964 2965.exit: 2966 ret 2967 restore 2968 2969 2970 .align 32 2971.SKIP0: 2972 addcc %i0,-1,%i0 2973 ble,pn %icc,.ENDLOOP0 2974! delay slot, harmless if branch taken 2975 add %i3,%i4,%i3 ! y += stridey 2976 andn %l1,%i5,%l0 ! hx &= ~0x80000000 2977 fmovs %f10,%f0 2978 ld [%i1+4],%f1 2979 ba,pt %icc,.LOOP0 2980! delay slot 2981 add %i1,%i2,%i1 ! x += stridex 2982 2983 2984 .align 32 2985.SKIP1: 2986 addcc %i0,-1,%i0 2987 ble,pn %icc,.ENDLOOP1 2988! delay slot, harmless if branch taken 2989 add %i3,%i4,%i3 ! y += stridey 2990 andn %l2,%i5,%l1 ! hx &= ~0x80000000 2991 fmovs %f20,%f10 2992 ld [%i1+4],%f11 2993 ba,pt %icc,.LOOP1 2994! delay slot 2995 add %i1,%i2,%i1 ! x += stridex 2996 2997 2998 .align 32 2999.SKIP2: 3000 addcc %i0,-1,%i0 3001 ble,pn %icc,.ENDLOOP2 3002! delay slot, harmless if branch taken 3003 add %i3,%i4,%i3 ! y += stridey 3004 ld [%i1],%l2 3005 ld [%i1],%f20 3006 ld [%i1+4],%f21 3007 andn %l2,%i5,%l2 ! hx &= ~0x80000000 3008 ba,pt %icc,.LOOP2 3009! delay slot 3010 add %i1,%i2,%i1 ! x += stridex 3011 3012 3013 .align 32 3014.BIG0: 3015 sethi %hi(0x7ff00000),%o7 3016 cmp %l0,%o7 3017 bl,a,pt %icc,1f ! if hx < 0x7ff00000 3018! delay slot, annulled if branch not taken 3019 mov %l7,LIM_l6 ! set biguns flag or 3020 fsubd %f0,%f0,%f0 ! 
y = x - x 3021 st %f0,[%o0] 3022 st %f1,[%o0+4] 30231: 3024 addcc %i0,-1,%i0 3025 ble,pn %icc,.ENDLOOP0 3026! delay slot, harmless if branch taken 3027 andn %l1,%i5,%l0 ! hx &= ~0x80000000 3028 fmovd %f10,%f0 3029 ba,pt %icc,.LOOP0 3030! delay slot 3031 add %i1,%i2,%i1 ! x += stridex 3032 3033 3034 .align 32 3035.BIG1: 3036 sethi %hi(0x7ff00000),%o7 3037 cmp %l1,%o7 3038 bl,a,pt %icc,1f ! if hx < 0x7ff00000 3039! delay slot, annulled if branch not taken 3040 mov %l7,LIM_l6 ! set biguns flag or 3041 fsubd %f10,%f10,%f10 ! y = x - x 3042 st %f10,[%o1] 3043 st %f11,[%o1+4] 30441: 3045 addcc %i0,-1,%i0 3046 ble,pn %icc,.ENDLOOP1 3047! delay slot, harmless if branch taken 3048 andn %l2,%i5,%l1 ! hx &= ~0x80000000 3049 fmovd %f20,%f10 3050 ba,pt %icc,.LOOP1 3051! delay slot 3052 add %i1,%i2,%i1 ! x += stridex 3053 3054 3055 .align 32 3056.BIG2: 3057 sethi %hi(0x7ff00000),%o7 3058 cmp %l2,%o7 3059 bl,a,pt %icc,1f ! if hx < 0x7ff00000 3060! delay slot, annulled if branch not taken 3061 mov %l7,LIM_l6 ! set biguns flag or 3062 fsubd %f20,%f20,%f20 ! y = x - x 3063 st %f20,[%o2] 3064 st %f21,[%o2+4] 30651: 3066 addcc %i0,-1,%i0 3067 ble,pn %icc,.ENDLOOP2 3068! delay slot 3069 nop 3070 ld [%i1],%l2 3071 ld [%i1],%f20 3072 ld [%i1+4],%f21 3073 andn %l2,%i5,%l2 ! hx &= ~0x80000000 3074 ba,pt %icc,.LOOP2 3075! delay slot 3076 add %i1,%i2,%i1 ! x += stridex 3077 3078 SET_SIZE(__vcos) 3079 3080