//===----------------------Hexagon builtin routine ------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG
#define END(TAG) .size TAG,.-TAG

// Double Precision Fused Multiply-Add

#define A r1:0
#define AH r1
#define AL r0
#define B r3:2
#define BH r3
#define BL r2
#define C r5:4
#define CH r5
#define CL r4

#define BTMP r15:14
#define BTMPH r15
#define BTMPL r14

#define ATMP r13:12
#define ATMPH r13
#define ATMPL r12

#define CTMP r11:10
#define CTMPH r11
#define CTMPL r10

#define PP_LL r9:8
#define PP_LL_H r9
#define PP_LL_L r8

#define PP_ODD r7:6
#define PP_ODD_H r7
#define PP_ODD_L r6

#define PP_HH r17:16
#define PP_HH_H r17
#define PP_HH_L r16

#define EXPA r18
#define EXPB r19
#define EXPBA r19:18

#define TMP r28

#define P_TMP p0
#define PROD_NEG p3
#define EXACT p2
#define SWAP p1

#define MANTBITS 52
#define HI_MANTBITS 20
#define EXPBITS 11
#define BIAS 1023
#define STACKSPACE 32

#define ADJUST 4

#define FUDGE 7
#define FUDGE2 3

#ifndef SR_ROUND_OFF
#define SR_ROUND_OFF 22
#endif

    // First, classify for normal values, and abort if abnormal.
    //
    // Next, unpack each mantissa into 0x1000_0000_0000_0000 + mant<<8.
    //
    // Since we know that the 2 MSBs of the H registers are zero, the partial
    // products that involve the H registers can never generate a carry.
    //
    // Try to buy X slots, at the expense of latency if needed.
    //
    // We will have PP_HH with the upper bits of the product, PP_LL with the lower.
    // PP_HH can have a maximum of 0x03FF_FFFF_FFFF_FFFF or thereabouts.
    // PP_HH has a minimum of 0x0100_0000_0000_0000.
    //
    // 0x0100_0000_0000_0000 has an exponent of EXPA+EXPB-BIAS.
    //
    // We need to align CTMP.
    // If CTMP >> PP, convert PP to 64 bits with sticky, align CTMP, and follow
    // the normal add.
    // If CTMP << PP, align CTMP and add 128 bits, then compute sticky.
    // If CTMP ~= PP, align CTMP and add 128 bits. May have massive cancellation.
    //
    // Convert the partial product and CTMP to 2's complement prior to addition.
    //
    // After we add, we need to normalize into the upper 64 bits, then compute
    // sticky.
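
    // As an illustration only: a rough C model of the mpyu partial-product
    // scheme used below (not part of the build; helper names are ours):
    //
    //   typedef unsigned long long u64;
    //   // a and b are the unpacked mantissas, 0x1000_0000_0000_0000 | (mant << 8);
    //   // their top bits are clear, so the middle accumulation cannot overflow.
    //   static void mant_mul(u64 a, u64 b, u64 *hi, u64 *lo) {
    //       u64 al = (u64)(unsigned)a, ah = a >> 32;
    //       u64 bl = (u64)(unsigned)b, bh = b >> 32;
    //       u64 pp_ll  = al * bl;                           // PP_LL = mpyu(ATMPL,BTMPL)
    //       u64 pp_odd = (pp_ll >> 32) + bl * ah + al * bh; // PP_ODD accumulation
    //       *hi = (pp_odd >> 32) + ah * bh;                 // PP_HH
    //       *lo = (pp_odd << 32) | (u64)(unsigned)pp_ll;    // combine(PP_ODD_L,PP_LL_L)
    //   }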
    .text
    .global __hexagon_fmadf4
    .type __hexagon_fmadf4,@function
    .global __hexagon_fmadf5
    .type __hexagon_fmadf5,@function
    .global fma
    .type fma,@function
    Q6_ALIAS(fmadf5)
    .p2align 5
__hexagon_fmadf4:
__hexagon_fmadf5:
fma:
    {
        P_TMP = dfclass(A,#2)
        P_TMP = dfclass(B,#2)
        ATMP = #0
        BTMP = #0
    }
    {
        ATMP = insert(A,#MANTBITS,#EXPBITS-3)
        BTMP = insert(B,#MANTBITS,#EXPBITS-3)
        PP_ODD_H = ##0x10000000
        allocframe(#STACKSPACE)
    }
    {
        PP_LL = mpyu(ATMPL,BTMPL)
        if (!P_TMP) jump .Lfma_abnormal_ab
        ATMPH = or(ATMPH,PP_ODD_H)
        BTMPH = or(BTMPH,PP_ODD_H)
    }
    {
        P_TMP = dfclass(C,#2)
        if (!P_TMP.new) jump:nt .Lfma_abnormal_c
        CTMP = combine(PP_ODD_H,#0)
        PP_ODD = combine(#0,PP_LL_H)
    }
.Lfma_abnormal_c_restart:
    {
        PP_ODD += mpyu(BTMPL,ATMPH)
        CTMP = insert(C,#MANTBITS,#EXPBITS-3)
        memd(r29+#0) = PP_HH
        memd(r29+#8) = EXPBA
    }
    {
        PP_ODD += mpyu(ATMPL,BTMPH)
        EXPBA = neg(CTMP)
        P_TMP = cmp.gt(CH,#-1)
        TMP = xor(AH,BH)
    }
    {
        EXPA = extractu(AH,#EXPBITS,#HI_MANTBITS)
        EXPB = extractu(BH,#EXPBITS,#HI_MANTBITS)
        PP_HH = combine(#0,PP_ODD_H)
        if (!P_TMP) CTMP = EXPBA
    }
    {
        PP_HH += mpyu(ATMPH,BTMPH)
        PP_LL = combine(PP_ODD_L,PP_LL_L)
#undef PP_ODD
#undef PP_ODD_H
#undef PP_ODD_L
#undef ATMP
#undef ATMPL
#undef ATMPH
#undef BTMP
#undef BTMPL
#undef BTMPH
#define RIGHTLEFTSHIFT r13:12
#define RIGHTSHIFT r13
#define LEFTSHIFT r12

        EXPA = add(EXPA,EXPB)
#undef EXPB
#undef EXPBA
#define EXPC r19
#define EXPCA r19:18
        EXPC = extractu(CH,#EXPBITS,#HI_MANTBITS)
    }
    // PP_HH:PP_LL now holds the product.
    // CTMP is in 2's complement (negated when C was negative).
    // EXPA, EXPB, EXPC are extracted.
    // We may need to negate PP.
    // Since we will be adding with carry later, if we need to negate,
    // just invert all bits now, which we can do conditionally and in parallel.
#define PP_HH_TMP r15:14
#define PP_LL_TMP r7:6
    {
        EXPA = add(EXPA,#-BIAS+(ADJUST))
        PROD_NEG = !cmp.gt(TMP,#-1)
        PP_LL_TMP = #0
        PP_HH_TMP = #0
    }
    {
        PP_LL_TMP = sub(PP_LL_TMP,PP_LL,PROD_NEG):carry
        P_TMP = !cmp.gt(TMP,#-1)
        SWAP = cmp.gt(EXPC,EXPA)                // If C >> PP
        if (SWAP.new) EXPCA = combine(EXPA,EXPC)
    }
    {
        PP_HH_TMP = sub(PP_HH_TMP,PP_HH,PROD_NEG):carry
        if (P_TMP) PP_LL = PP_LL_TMP
#undef PP_LL_TMP
#define CTMP2 r7:6
#define CTMP2H r7
#define CTMP2L r6
        CTMP2 = #0
        EXPC = sub(EXPA,EXPC)
    }
    {
        if (P_TMP) PP_HH = PP_HH_TMP
        P_TMP = cmp.gt(EXPC,#63)
        if (SWAP) PP_LL = CTMP2
        if (SWAP) CTMP2 = PP_LL
    }
#undef PP_HH_TMP
//#define ONE r15:14
//#define S_ONE r14
#define ZERO r15:14
#define S_ZERO r15
#undef PROD_NEG
#define P_CARRY p3
    {
        if (SWAP) PP_HH = CTMP                  // Swap C and PP
        if (SWAP) CTMP = PP_HH
        if (P_TMP) EXPC = add(EXPC,#-64)
        TMP = #63
    }
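
    // As an illustration only: the alignment of C performed below, in rough C
    // (our own sketch; assumes 0 < rshift <= 63, since shift-by-zero and the
    // pre-shift-by-64 cases are handled separately):
    //
    //   typedef unsigned long long u64;
    //   // Arithmetic 128-bit right shift of the 2's-complement c_hi:c_lo,
    //   // returning the bits shifted out (the sticky bits).
    //   static u64 align_c(u64 *c_hi, u64 *c_lo, unsigned rshift) {
    //       u64 sticky = *c_lo & ((1ULL << rshift) - 1);   // STICKIES = extract(...)
    //       *c_lo = (*c_lo >> rshift) | (*c_hi << (64 - rshift));
    //       *c_hi = (u64)((long long)*c_hi >> rshift);     // asr keeps the sign
    //       // If anything was shifted out, clear bit 0 so that adding the carry
    //       // in the 128-bit add behaves exactly like OR-ing in a 1 (no ripple):
    //       if (sticky) *c_lo &= ~1ULL;                    // and(CTMP2L,#-2)
    //       return sticky;                                 // drives P_CARRY
    //   }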
    {
        // If diff > 63, pre-shift-right by 64...
        if (P_TMP) CTMP2 = CTMP
        TMP = asr(CTMPH,#31)                    // sign word of C, used below
        RIGHTSHIFT = min(EXPC,TMP)              // packet: reads the pre-packet TMP (#63)
        LEFTSHIFT = #0
    }
#undef C
#undef CH
#undef CL
#define STICKIES r5:4
#define STICKIESH r5
#define STICKIESL r4
    {
        if (P_TMP) CTMP = combine(TMP,TMP)      // sign extension of pre-shift-right-64
        STICKIES = extract(CTMP2,RIGHTLEFTSHIFT)
        CTMP2 = lsr(CTMP2,RIGHTSHIFT)
        LEFTSHIFT = sub(#64,RIGHTSHIFT)
    }
    {
        ZERO = #0
        TMP = #-2
        CTMP2 |= lsl(CTMP,LEFTSHIFT)
        CTMP = asr(CTMP,RIGHTSHIFT)
    }
    {
        P_CARRY = cmp.gtu(STICKIES,ZERO)        // If we have sticky bits from the C shift
        if (P_CARRY.new) CTMP2L = and(CTMP2L,TMP) // make sure adding 1 == OR
#undef ZERO
#define ONE r15:14
#define S_ONE r14
        ONE = #1
        STICKIES = #0
    }
    {
        PP_LL = add(CTMP2,PP_LL,P_CARRY):carry  // use the carry to add the sticky
    }
    {
        PP_HH = add(CTMP,PP_HH,P_CARRY):carry
        TMP = #62
    }
    // PP_HH:PP_LL now holds the sum.
    // We may need to normalize left, up to ??? bits.
    //
    // I think that if we have massive cancellation, the range we normalize by
    // is still limited.
    {
        LEFTSHIFT = add(clb(PP_HH),#-2)
        if (!cmp.eq(LEFTSHIFT.new,TMP)) jump:t 1f // all sign bits?
    }
    // We had all sign bits, shift left by 62.
    {
        CTMP = extractu(PP_LL,#62,#2)
        PP_LL = asl(PP_LL,#62)
        EXPA = add(EXPA,#-62)                   // And adjust exponent of result
    }
    {
        PP_HH = insert(CTMP,#62,#0)             // Insert the 62 saved bits (net left shift by 62)
    }
    {
        LEFTSHIFT = add(clb(PP_HH),#-2)
    }
    .falign
1:
    {
        CTMP = asl(PP_HH,LEFTSHIFT)
        STICKIES |= asl(PP_LL,LEFTSHIFT)
        RIGHTSHIFT = sub(#64,LEFTSHIFT)
        EXPA = sub(EXPA,LEFTSHIFT)
    }
    {
        CTMP |= lsr(PP_LL,RIGHTSHIFT)
        EXACT = cmp.gtu(ONE,STICKIES)
        TMP = #BIAS+BIAS-2
    }
    {
        if (!EXACT) CTMPL = or(CTMPL,S_ONE)
        // If EXPA is overflow/underflow, jump to ovf_unf
        P_TMP = !cmp.gt(EXPA,TMP)
        P_TMP = cmp.gt(EXPA,#1)
        if (!P_TMP.new) jump:nt .Lfma_ovf_unf
    }
    {
        // XXX: FIXME: should PP_HH for check of zero be CTMP?
        P_TMP = cmp.gtu(ONE,CTMP)               // is result true zero?
        A = convert_d2df(CTMP)
        EXPA = add(EXPA,#-BIAS-60)
        PP_HH = memd(r29+#0)
    }
    {
        AH += asl(EXPA,#HI_MANTBITS)
        EXPCA = memd(r29+#8)
        if (!P_TMP) dealloc_return              // not zero, return
    }
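
    // As an illustration only: the pack-and-round step just performed, in
    // rough C (our own sketch; convert_d2df is modeled as a signed 64-bit to
    // double conversion that rounds according to the USR rounding mode):
    //
    //   double d = (double)(long long)ctmp;   // A = convert_d2df(CTMP)
    //   unsigned long long bits;
    //   __builtin_memcpy(&bits, &d, 8);
    //   bits += (unsigned long long)(long long)expa << 52;
    //                                         // AH += asl(EXPA,#HI_MANTBITS);
    //                                         // EXPA already has -BIAS-60 folded in
    //   __builtin_memcpy(&d, &bits, 8);       // safe: over/underflow were excluded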
.Ladd_yields_zero:
    // We had full cancellation. Return +/- zero (-0 when rounding down).
    {
        TMP = USR
        A = #0
    }
    {
        TMP = extractu(TMP,#2,#SR_ROUND_OFF)
        PP_HH = memd(r29+#0)
        EXPCA = memd(r29+#8)
    }
    {
        p0 = cmp.eq(TMP,#2)
        if (p0.new) AH = ##0x80000000
        dealloc_return
    }

#undef RIGHTLEFTSHIFT
#undef RIGHTSHIFT
#undef LEFTSHIFT
#undef CTMP2
#undef CTMP2H
#undef CTMP2L

.Lfma_ovf_unf:
    {
        p0 = cmp.gtu(ONE,CTMP)
        if (p0.new) jump:nt .Ladd_yields_zero
    }
    {
        A = convert_d2df(CTMP)
        EXPA = add(EXPA,#-BIAS-60)
        TMP = EXPA                              // packet: reads the unadjusted EXPA
    }
#define NEW_EXPB r7
#define NEW_EXPA r6
    {
        AH += asl(EXPA,#HI_MANTBITS)
        NEW_EXPB = extractu(AH,#EXPBITS,#HI_MANTBITS)
    }
    {
        NEW_EXPA = add(EXPA,NEW_EXPB)
        PP_HH = memd(r29+#0)
        EXPCA = memd(r29+#8)
#undef PP_HH
#undef PP_HH_H
#undef PP_HH_L
#undef EXPCA
#undef EXPC
#undef EXPA
#undef PP_LL
#undef PP_LL_H
#undef PP_LL_L
#define EXPA r6
#define EXPB r7
#define EXPBA r7:6
#define ATMP r9:8
#define ATMPH r9
#define ATMPL r8
#undef NEW_EXPB
#undef NEW_EXPA
        ATMP = abs(CTMP)
    }
    {
        p0 = cmp.gt(EXPA,##BIAS+BIAS)
        if (p0.new) jump:nt .Lfma_ovf
    }
    {
        p0 = cmp.gt(EXPA,#0)
        if (p0.new) jump:nt .Lpossible_unf
    }
    {
        // TMP has the original (unadjusted) EXPA.
        // ATMP is the corresponding value.
        // Normalize ATMP and shift right to the correct location.
        EXPB = add(clb(ATMP),#-2)               // Amount to shift left to normalize
        EXPA = sub(#1+5,TMP)                    // Amount to shift right to denormalize
        p3 = cmp.gt(CTMPH,#-1)
    }
    // Underflow
    // We know that the infinite-range exponent should be EXPA.
    // CTMP is 2's complement, ATMP is abs(CTMP).
    {
        EXPA = add(EXPA,EXPB)                   // how much to shift back right
        ATMP = asl(ATMP,EXPB)                   // shift left
        AH = USR
        TMP = #63
    }
    {
        EXPB = min(EXPA,TMP)
        EXPA = #0
        AL = #0x0030                            // underflow | inexact flag bits
    }
    {
        B = extractu(ATMP,EXPBA)
        ATMP = asr(ATMP,EXPB)
    }
    {
        p0 = cmp.gtu(ONE,B)
        if (!p0.new) ATMPL = or(ATMPL,S_ONE)
        ATMPH = setbit(ATMPH,#HI_MANTBITS+FUDGE2)
    }
    {
        CTMP = neg(ATMP)
        p1 = bitsclr(ATMPL,#(1<<FUDGE2)-1)
        if (!p1.new) AH = or(AH,AL)
        B = #0
    }
    {
        if (p3) CTMP = ATMP
        USR = AH
        TMP = #-BIAS-(MANTBITS+FUDGE2)
    }
    {
        A = convert_d2df(CTMP)
    }
    {
        AH += asl(TMP,#HI_MANTBITS)
        dealloc_return
    }
.Lpossible_unf:
    // Borderline tiny result (+/-0 or +/-smallest normal): decide whether
    // underflow/inexact must be raised; anything else returns as-is.
    {
        TMP = ##0x7fefffff
        ATMP = abs(CTMP)
    }
    {
        p0 = cmp.eq(AL,#0)
        p0 = bitsclr(AH,TMP)
        if (!p0.new) dealloc_return:t
        TMP = #0x7fff
    }
    {
        p0 = bitsset(ATMPH,TMP)
        BH = USR
        BL = #0x0030                            // underflow | inexact flag bits
    }
    {
        if (p0) BH = or(BH,BL)
    }
    {
        USR = BH
    }
    {
        p0 = dfcmp.eq(A,A)
        dealloc_return
    }
.Lfma_ovf:
    {
        TMP = USR
        CTMP = combine(##0x7fefffff,#-1)
        A = CTMP                                // packet: reads the pre-packet CTMP,
                                                // keeping the sum's sign in bit 63
    }
    {
        ATMP = combine(##0x7ff00000,#0)
        BH = extractu(TMP,#2,#SR_ROUND_OFF)
        TMP = or(TMP,#0x28)                     // overflow | inexact flag bits
    }
    {
        USR = TMP
        BH ^= lsr(AH,#31)
        BL = BH                                 // packet: reads the pre-xor BH (raw mode)
    }
    {
        p0 = !cmp.eq(BL,#1)
        p0 = !cmp.eq(BH,#2)
    }
    {
        p0 = dfcmp.eq(ATMP,ATMP)
        if (p0.new) CTMP = ATMP
    }
    {
        A = insert(CTMP,#63,#0)
        dealloc_return
    }
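
    // As an illustration only: the rounding-mode-dependent overflow result
    // selected by .Lfma_ovf above, in rough C (our reading of the USR.FPRND
    // encoding implied by this file: 0 = nearest, 1 = toward zero, 2 = down,
    // 3 = up):
    //
    //   int k = rm ^ sign;                    // BH ^= lsr(AH,#31)
    //   double mag = (rm != 1 && k != 2)      // p0 = !cmp.eq(BL,#1); !cmp.eq(BH,#2)
    //              ? (double)INFINITY         // 0x7ff00000_00000000
    //              : 0x1.fffffffffffffp1023;  // 0x7fefffff_ffffffff (DBL_MAX)
    //   return sign ? -mag : mag;             // insert(CTMP,#63,#0) keeps the sign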
#undef CTMP
#undef CTMPH
#undef CTMPL
#define BTMP r11:10
#define BTMPH r11
#define BTMPL r10

#undef STICKIES
#undef STICKIESH
#undef STICKIESL
#define C r5:4
#define CH r5
#define CL r4

.Lfma_abnormal_ab:
    {
        ATMP = extractu(A,#63,#0)
        BTMP = extractu(B,#63,#0)
        deallocframe
    }
    {
        p3 = cmp.gtu(ATMP,BTMP)
        if (!p3.new) A = B                      // sort values
        if (!p3.new) B = A
    }
    {
        p0 = dfclass(A,#0x0f)                   // A is a number (not NaN)?
        if (!p0.new) jump:nt .Lnan
        if (!p3) ATMP = BTMP
        if (!p3) BTMP = ATMP
    }
    {
        p1 = dfclass(A,#0x08)                   // A is infinity
        p1 = dfclass(B,#0x0e)                   // B is nonzero
    }
    {
        p0 = dfclass(A,#0x08)                   // A is inf
        p0 = dfclass(B,#0x01)                   // B is zero
    }
    {
        if (p1) jump .Lab_inf
        p2 = dfclass(B,#0x01)
    }
    {
        if (p0) jump .Linvalid
        if (p2) jump .Lab_true_zero
        TMP = ##0x7c000000
    }
    // We are left with a normal or subnormal times a subnormal, A > B.
    // If A and B are both very small, the product contributes only a sticky
    // bit; replace the lower 63 bits of A and B with 0x0010_0000_0000_0000,
    // which yields equivalent results.
    // If A*B might be representable, decrease A's exponent, increase B's
    // exponent, and start over.
    {
        p0 = bitsclr(AH,TMP)
        if (p0.new) jump:nt .Lfma_ab_tiny
    }
    {
        TMP = add(clb(BTMP),#-EXPBITS)
    }
    {
        BTMP = asl(BTMP,TMP)
    }
    {
        B = insert(BTMP,#63,#0)
        AH -= asl(TMP,#HI_MANTBITS)
    }
    jump fma

.Lfma_ab_tiny:
    ATMP = combine(##0x00100000,#0)
    {
        A = insert(ATMP,#63,#0)
        B = insert(ATMP,#63,#0)
    }
    jump fma

.Lab_inf:
    {
        B = lsr(B,#63)
        p0 = dfclass(C,#0x10)
    }
    {
        A ^= asl(B,#63)                         // fold B's sign into the product
        if (p0) jump .Lnan
    }
    {
        p1 = dfclass(C,#0x08)
        if (p1.new) jump:nt .Lfma_inf_plus_inf
    }
    // A*B is +/- inf, C is finite. Return A.
    {
        jumpr r31
    }
    .falign
.Lfma_inf_plus_inf:
    {   // adding infinities of different signs is invalid
        p0 = dfcmp.eq(A,C)
        if (!p0.new) jump:nt .Linvalid
    }
    {
        jumpr r31
    }

.Lnan:
    {
        p0 = dfclass(B,#0x10)
        p1 = dfclass(C,#0x10)
        if (!p0.new) B = A
        if (!p1.new) C = A
    }
    {   // find sNaNs: converting a signaling NaN raises invalid
        BH = convert_df2sf(B)
        BL = convert_df2sf(C)
    }
    {
        BH = convert_df2sf(A)
        A = #-1                                 // return the all-ones NaN pattern
        jumpr r31
    }

.Linvalid:
    {
        TMP = ##0x7f800001                      // single-precision sNaN
    }
    {
        A = convert_sf2df(TMP)                  // conversion raises invalid
        jumpr r31
    }

.Lab_true_zero:
    // B is zero, A is a finite number
    {
        p0 = dfclass(C,#0x10)                   // is C NaN?
        if (p0.new) jump:nt .Lnan
        if (p0.new) A = C
    }
    {
        p0 = dfcmp.eq(B,C)                      // is C also zero?
        AH = lsr(AH,#31)                        // get sign
    }
    {
        BH ^= asl(AH,#31)                       // form correctly signed zero in B
        if (!p0) A = C                          // If C is not zero, return C
        if (!p0) jumpr r31
    }
    // B has correctly signed zero, C is also zero
.Lzero_plus_zero:
    {
        p0 = cmp.eq(B,C)                        // scalar equality: same sign, +0 + +0 or -0 + -0
        if (p0.new) jumpr:t r31
        A = B
    }
    {
        TMP = USR
    }
    {
        TMP = extractu(TMP,#2,#SR_ROUND_OFF)
        A = #0
    }
    {
        p0 = cmp.eq(TMP,#2)
        if (p0.new) AH = ##0x80000000           // round-down: -0
        jumpr r31
    }
#undef BTMP
#undef BTMPH
#undef BTMPL
#define CTMP r11:10
    .falign
.Lfma_abnormal_c:
    // We know that A*B is normal * normal.
    // C is not normal: zero, subnormal, inf, or NaN.
    {
        p0 = dfclass(C,#0x10)                   // is C NaN?
        if (p0.new) jump:nt .Lnan
        if (p0.new) A = C                       // move NaN to A
        deallocframe
    }
    {
        p0 = dfclass(C,#0x08)                   // is C inf?
        if (p0.new) A = C                       // return C
        if (p0.new) jumpr:nt r31
    }
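
    // As an illustration only: the abnormal-C dispatch of this section, in
    // rough C (our own sketch; the dfclass masks are the ones used above and
    // below):
    //
    //   if (isnan(c))  return c;                      // dfclass(C,#0x10)
    //   if (isinf(c))  return c;                      // dfclass(C,#0x08): A*B is
    //                                                 // finite, so C dominates
    //   if (c == 0.0)  return __hexagon_muldf3(a, b); // dfclass(C,#0x01): exact
    //                                                 // product, nothing to add
    //   /* else: subnormal C; rescale and re-enter the main path below */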
    // zero or subnormal
    // If C is zero, and we know A*B is normal*normal, we can just call the
    // normal multiply.
    {
        p0 = dfclass(C,#0x01)                   // is C zero?
        if (p0.new) jump:nt __hexagon_muldf3
        TMP = #1
    }
    // We are left with a subnormal C.
    // Adjust C and jump back to restart the main path.
    {
        allocframe(#STACKSPACE)                 // deallocated above; reallocate the frame
        CTMP = #0                               // no implicit bit for the restarted insert
        CH = insert(TMP,#EXPBITS,#HI_MANTBITS)  // force C's exponent field to 1
        jump .Lfma_abnormal_c_restart
    }
END(fma)
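
// For reference only, the C-level contract of this routine (a sketch, not
// part of the build): fma computes x*y + z as if with unbounded range and
// precision, rounding once to the destination format.
//
//   extern double fma(double x, double y, double z);
//   // double err = fma(a, b, -(a * b));   // rounding error of the product a*b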