1 /*===---- __clang_hip_math.h - Device-side HIP math support ----------------=== 2 * 3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 * See https://llvm.org/LICENSE.txt for license information. 5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 * 7 *===-----------------------------------------------------------------------=== 8 */ 9 #ifndef __CLANG_HIP_MATH_H__ 10 #define __CLANG_HIP_MATH_H__ 11 12 #if !defined(__HIP__) && !defined(__OPENMP_AMDGCN__) 13 #error "This file is for HIP and OpenMP AMDGCN device compilation only." 14 #endif 15 16 #if !defined(__HIPCC_RTC__) 17 #include <limits.h> 18 #include <stdint.h> 19 #ifdef __OPENMP_AMDGCN__ 20 #include <omp.h> 21 #endif 22 #endif // !defined(__HIPCC_RTC__) 23 24 #pragma push_macro("__DEVICE__") 25 26 #ifdef __OPENMP_AMDGCN__ 27 #define __DEVICE__ static inline __attribute__((always_inline, nothrow)) 28 #else 29 #define __DEVICE__ static __device__ inline __attribute__((always_inline)) 30 #endif 31 32 // Device library provides fast low precision and slow full-recision 33 // implementations for some functions. Which one gets selected depends on 34 // __CLANG_GPU_APPROX_TRANSCENDENTALS__ which gets defined by clang if 35 // -ffast-math or -fgpu-approx-transcendentals are in effect. 36 #pragma push_macro("__FAST_OR_SLOW") 37 #if defined(__CLANG_GPU_APPROX_TRANSCENDENTALS__) 38 #define __FAST_OR_SLOW(fast, slow) fast 39 #else 40 #define __FAST_OR_SLOW(fast, slow) slow 41 #endif 42 43 // A few functions return bool type starting only in C++11. 44 #pragma push_macro("__RETURN_TYPE") 45 #ifdef __OPENMP_AMDGCN__ 46 #define __RETURN_TYPE int 47 #else 48 #if defined(__cplusplus) 49 #define __RETURN_TYPE bool 50 #else 51 #define __RETURN_TYPE int 52 #endif 53 #endif // __OPENMP_AMDGCN__ 54 55 #if defined (__cplusplus) && __cplusplus < 201103L 56 // emulate static_assert on type sizes 57 template<bool> 58 struct __compare_result{}; 59 template<> 60 struct __compare_result<true> { 61 static const __device__ bool valid; 62 }; 63 64 __DEVICE__ 65 void __suppress_unused_warning(bool b){}; 66 template <unsigned int S, unsigned int T> 67 __DEVICE__ void __static_assert_equal_size() { 68 __suppress_unused_warning(__compare_result<S == T>::valid); 69 } 70 71 #define __static_assert_type_size_equal(A, B) \ 72 __static_assert_equal_size<A,B>() 73 74 #else 75 #define __static_assert_type_size_equal(A,B) \ 76 static_assert((A) == (B), "") 77 78 #endif 79 80 __DEVICE__ 81 uint64_t __make_mantissa_base8(const char *__tagp __attribute__((nonnull))) { 82 uint64_t __r = 0; 83 while (*__tagp != '\0') { 84 char __tmp = *__tagp; 85 86 if (__tmp >= '0' && __tmp <= '7') 87 __r = (__r * 8u) + __tmp - '0'; 88 else 89 return 0; 90 91 ++__tagp; 92 } 93 94 return __r; 95 } 96 97 __DEVICE__ 98 uint64_t __make_mantissa_base10(const char *__tagp __attribute__((nonnull))) { 99 uint64_t __r = 0; 100 while (*__tagp != '\0') { 101 char __tmp = *__tagp; 102 103 if (__tmp >= '0' && __tmp <= '9') 104 __r = (__r * 10u) + __tmp - '0'; 105 else 106 return 0; 107 108 ++__tagp; 109 } 110 111 return __r; 112 } 113 114 __DEVICE__ 115 uint64_t __make_mantissa_base16(const char *__tagp __attribute__((nonnull))) { 116 uint64_t __r = 0; 117 while (*__tagp != '\0') { 118 char __tmp = *__tagp; 119 120 if (__tmp >= '0' && __tmp <= '9') 121 __r = (__r * 16u) + __tmp - '0'; 122 else if (__tmp >= 'a' && __tmp <= 'f') 123 __r = (__r * 16u) + __tmp - 'a' + 10; 124 else if (__tmp >= 'A' && __tmp <= 'F') 125 __r = (__r * 16u) + __tmp - 'A' + 10; 126 else 127 return 0; 128 129 ++__tagp; 130 } 131 132 return __r; 133 } 134 135 __DEVICE__ 136 uint64_t __make_mantissa(const char *__tagp __attribute__((nonnull))) { 137 if (*__tagp == '0') { 138 ++__tagp; 139 140 if (*__tagp == 'x' || *__tagp == 'X') 141 return __make_mantissa_base16(__tagp); 142 else 143 return __make_mantissa_base8(__tagp); 144 } 145 146 return __make_mantissa_base10(__tagp); 147 } 148 149 // BEGIN FLOAT 150 151 // BEGIN INTRINSICS 152 153 __DEVICE__ 154 float __cosf(float __x) { return __ocml_native_cos_f32(__x); } 155 156 __DEVICE__ 157 float __exp10f(float __x) { 158 const float __log2_10 = 0x1.a934f0p+1f; 159 return __builtin_amdgcn_exp2f(__log2_10 * __x); 160 } 161 162 __DEVICE__ 163 float __expf(float __x) { 164 const float __log2_e = 0x1.715476p+0; 165 return __builtin_amdgcn_exp2f(__log2_e * __x); 166 } 167 168 #if defined OCML_BASIC_ROUNDED_OPERATIONS 169 __DEVICE__ 170 float __fadd_rd(float __x, float __y) { return __ocml_add_rtn_f32(__x, __y); } 171 __DEVICE__ 172 float __fadd_rn(float __x, float __y) { return __ocml_add_rte_f32(__x, __y); } 173 __DEVICE__ 174 float __fadd_ru(float __x, float __y) { return __ocml_add_rtp_f32(__x, __y); } 175 __DEVICE__ 176 float __fadd_rz(float __x, float __y) { return __ocml_add_rtz_f32(__x, __y); } 177 #else 178 __DEVICE__ 179 float __fadd_rn(float __x, float __y) { return __x + __y; } 180 #endif 181 182 #if defined OCML_BASIC_ROUNDED_OPERATIONS 183 __DEVICE__ 184 float __fdiv_rd(float __x, float __y) { return __ocml_div_rtn_f32(__x, __y); } 185 __DEVICE__ 186 float __fdiv_rn(float __x, float __y) { return __ocml_div_rte_f32(__x, __y); } 187 __DEVICE__ 188 float __fdiv_ru(float __x, float __y) { return __ocml_div_rtp_f32(__x, __y); } 189 __DEVICE__ 190 float __fdiv_rz(float __x, float __y) { return __ocml_div_rtz_f32(__x, __y); } 191 #else 192 __DEVICE__ 193 float __fdiv_rn(float __x, float __y) { return __x / __y; } 194 #endif 195 196 __DEVICE__ 197 float __fdividef(float __x, float __y) { return __x / __y; } 198 199 #if defined OCML_BASIC_ROUNDED_OPERATIONS 200 __DEVICE__ 201 float __fmaf_rd(float __x, float __y, float __z) { 202 return __ocml_fma_rtn_f32(__x, __y, __z); 203 } 204 __DEVICE__ 205 float __fmaf_rn(float __x, float __y, float __z) { 206 return __ocml_fma_rte_f32(__x, __y, __z); 207 } 208 __DEVICE__ 209 float __fmaf_ru(float __x, float __y, float __z) { 210 return __ocml_fma_rtp_f32(__x, __y, __z); 211 } 212 __DEVICE__ 213 float __fmaf_rz(float __x, float __y, float __z) { 214 return __ocml_fma_rtz_f32(__x, __y, __z); 215 } 216 #else 217 __DEVICE__ 218 float __fmaf_rn(float __x, float __y, float __z) { 219 return __builtin_fmaf(__x, __y, __z); 220 } 221 #endif 222 223 #if defined OCML_BASIC_ROUNDED_OPERATIONS 224 __DEVICE__ 225 float __fmul_rd(float __x, float __y) { return __ocml_mul_rtn_f32(__x, __y); } 226 __DEVICE__ 227 float __fmul_rn(float __x, float __y) { return __ocml_mul_rte_f32(__x, __y); } 228 __DEVICE__ 229 float __fmul_ru(float __x, float __y) { return __ocml_mul_rtp_f32(__x, __y); } 230 __DEVICE__ 231 float __fmul_rz(float __x, float __y) { return __ocml_mul_rtz_f32(__x, __y); } 232 #else 233 __DEVICE__ 234 float __fmul_rn(float __x, float __y) { return __x * __y; } 235 #endif 236 237 #if defined OCML_BASIC_ROUNDED_OPERATIONS 238 __DEVICE__ 239 float __frcp_rd(float __x) { return __ocml_div_rtn_f32(1.0f, __x); } 240 __DEVICE__ 241 float __frcp_rn(float __x) { return __ocml_div_rte_f32(1.0f, __x); } 242 __DEVICE__ 243 float __frcp_ru(float __x) { return __ocml_div_rtp_f32(1.0f, __x); } 244 __DEVICE__ 245 float __frcp_rz(float __x) { return __ocml_div_rtz_f32(1.0f, __x); } 246 #else 247 __DEVICE__ 248 float __frcp_rn(float __x) { return 1.0f / __x; } 249 #endif 250 251 __DEVICE__ 252 float __frsqrt_rn(float __x) { return __builtin_amdgcn_rsqf(__x); } 253 254 #if defined OCML_BASIC_ROUNDED_OPERATIONS 255 __DEVICE__ 256 float __fsqrt_rd(float __x) { return __ocml_sqrt_rtn_f32(__x); } 257 __DEVICE__ 258 float __fsqrt_rn(float __x) { return __ocml_sqrt_rte_f32(__x); } 259 __DEVICE__ 260 float __fsqrt_ru(float __x) { return __ocml_sqrt_rtp_f32(__x); } 261 __DEVICE__ 262 float __fsqrt_rz(float __x) { return __ocml_sqrt_rtz_f32(__x); } 263 #else 264 __DEVICE__ 265 float __fsqrt_rn(float __x) { return __ocml_native_sqrt_f32(__x); } 266 #endif 267 268 #if defined OCML_BASIC_ROUNDED_OPERATIONS 269 __DEVICE__ 270 float __fsub_rd(float __x, float __y) { return __ocml_sub_rtn_f32(__x, __y); } 271 __DEVICE__ 272 float __fsub_rn(float __x, float __y) { return __ocml_sub_rte_f32(__x, __y); } 273 __DEVICE__ 274 float __fsub_ru(float __x, float __y) { return __ocml_sub_rtp_f32(__x, __y); } 275 __DEVICE__ 276 float __fsub_rz(float __x, float __y) { return __ocml_sub_rtz_f32(__x, __y); } 277 #else 278 __DEVICE__ 279 float __fsub_rn(float __x, float __y) { return __x - __y; } 280 #endif 281 282 __DEVICE__ 283 float __log10f(float __x) { return __builtin_log10f(__x); } 284 285 __DEVICE__ 286 float __log2f(float __x) { return __builtin_amdgcn_logf(__x); } 287 288 __DEVICE__ 289 float __logf(float __x) { return __builtin_logf(__x); } 290 291 __DEVICE__ 292 float __powf(float __x, float __y) { return __ocml_pow_f32(__x, __y); } 293 294 __DEVICE__ 295 float __saturatef(float __x) { return (__x < 0) ? 0 : ((__x > 1) ? 1 : __x); } 296 297 __DEVICE__ 298 void __sincosf(float __x, float *__sinptr, float *__cosptr) { 299 *__sinptr = __ocml_native_sin_f32(__x); 300 *__cosptr = __ocml_native_cos_f32(__x); 301 } 302 303 __DEVICE__ 304 float __sinf(float __x) { return __ocml_native_sin_f32(__x); } 305 306 __DEVICE__ 307 float __tanf(float __x) { 308 return __sinf(__x) * __builtin_amdgcn_rcpf(__cosf(__x)); 309 } 310 // END INTRINSICS 311 312 #if defined(__cplusplus) 313 __DEVICE__ 314 int abs(int __x) { 315 return __builtin_abs(__x); 316 } 317 __DEVICE__ 318 long labs(long __x) { 319 return __builtin_labs(__x); 320 } 321 __DEVICE__ 322 long long llabs(long long __x) { 323 return __builtin_llabs(__x); 324 } 325 #endif 326 327 __DEVICE__ 328 float acosf(float __x) { return __ocml_acos_f32(__x); } 329 330 __DEVICE__ 331 float acoshf(float __x) { return __ocml_acosh_f32(__x); } 332 333 __DEVICE__ 334 float asinf(float __x) { return __ocml_asin_f32(__x); } 335 336 __DEVICE__ 337 float asinhf(float __x) { return __ocml_asinh_f32(__x); } 338 339 __DEVICE__ 340 float atan2f(float __x, float __y) { return __ocml_atan2_f32(__x, __y); } 341 342 __DEVICE__ 343 float atanf(float __x) { return __ocml_atan_f32(__x); } 344 345 __DEVICE__ 346 float atanhf(float __x) { return __ocml_atanh_f32(__x); } 347 348 __DEVICE__ 349 float cbrtf(float __x) { return __ocml_cbrt_f32(__x); } 350 351 __DEVICE__ 352 float ceilf(float __x) { return __builtin_ceilf(__x); } 353 354 __DEVICE__ 355 float copysignf(float __x, float __y) { return __builtin_copysignf(__x, __y); } 356 357 __DEVICE__ 358 float cosf(float __x) { return __FAST_OR_SLOW(__cosf, __ocml_cos_f32)(__x); } 359 360 __DEVICE__ 361 float coshf(float __x) { return __ocml_cosh_f32(__x); } 362 363 __DEVICE__ 364 float cospif(float __x) { return __ocml_cospi_f32(__x); } 365 366 __DEVICE__ 367 float cyl_bessel_i0f(float __x) { return __ocml_i0_f32(__x); } 368 369 __DEVICE__ 370 float cyl_bessel_i1f(float __x) { return __ocml_i1_f32(__x); } 371 372 __DEVICE__ 373 float erfcf(float __x) { return __ocml_erfc_f32(__x); } 374 375 __DEVICE__ 376 float erfcinvf(float __x) { return __ocml_erfcinv_f32(__x); } 377 378 __DEVICE__ 379 float erfcxf(float __x) { return __ocml_erfcx_f32(__x); } 380 381 __DEVICE__ 382 float erff(float __x) { return __ocml_erf_f32(__x); } 383 384 __DEVICE__ 385 float erfinvf(float __x) { return __ocml_erfinv_f32(__x); } 386 387 __DEVICE__ 388 float exp10f(float __x) { return __ocml_exp10_f32(__x); } 389 390 __DEVICE__ 391 float exp2f(float __x) { return __builtin_exp2f(__x); } 392 393 __DEVICE__ 394 float expf(float __x) { return __builtin_expf(__x); } 395 396 __DEVICE__ 397 float expm1f(float __x) { return __ocml_expm1_f32(__x); } 398 399 __DEVICE__ 400 float fabsf(float __x) { return __builtin_fabsf(__x); } 401 402 __DEVICE__ 403 float fdimf(float __x, float __y) { return __ocml_fdim_f32(__x, __y); } 404 405 __DEVICE__ 406 float fdividef(float __x, float __y) { return __x / __y; } 407 408 __DEVICE__ 409 float floorf(float __x) { return __builtin_floorf(__x); } 410 411 __DEVICE__ 412 float fmaf(float __x, float __y, float __z) { 413 return __builtin_fmaf(__x, __y, __z); 414 } 415 416 __DEVICE__ 417 float fmaxf(float __x, float __y) { return __builtin_fmaxf(__x, __y); } 418 419 __DEVICE__ 420 float fminf(float __x, float __y) { return __builtin_fminf(__x, __y); } 421 422 __DEVICE__ 423 float fmodf(float __x, float __y) { return __ocml_fmod_f32(__x, __y); } 424 425 __DEVICE__ 426 float frexpf(float __x, int *__nptr) { 427 return __builtin_frexpf(__x, __nptr); 428 } 429 430 __DEVICE__ 431 float hypotf(float __x, float __y) { return __ocml_hypot_f32(__x, __y); } 432 433 __DEVICE__ 434 int ilogbf(float __x) { return __ocml_ilogb_f32(__x); } 435 436 __DEVICE__ 437 __RETURN_TYPE __finitef(float __x) { return __builtin_isfinite(__x); } 438 439 __DEVICE__ 440 __RETURN_TYPE __isinff(float __x) { return __builtin_isinf(__x); } 441 442 __DEVICE__ 443 __RETURN_TYPE __isnanf(float __x) { return __builtin_isnan(__x); } 444 445 __DEVICE__ 446 float j0f(float __x) { return __ocml_j0_f32(__x); } 447 448 __DEVICE__ 449 float j1f(float __x) { return __ocml_j1_f32(__x); } 450 451 __DEVICE__ 452 float jnf(int __n, float __x) { // TODO: we could use Ahmes multiplication 453 // and the Miller & Brown algorithm 454 // for linear recurrences to get O(log n) steps, but it's unclear if 455 // it'd be beneficial in this case. 456 if (__n == 0) 457 return j0f(__x); 458 if (__n == 1) 459 return j1f(__x); 460 461 float __x0 = j0f(__x); 462 float __x1 = j1f(__x); 463 for (int __i = 1; __i < __n; ++__i) { 464 float __x2 = (2 * __i) / __x * __x1 - __x0; 465 __x0 = __x1; 466 __x1 = __x2; 467 } 468 469 return __x1; 470 } 471 472 __DEVICE__ 473 float ldexpf(float __x, int __e) { return __builtin_amdgcn_ldexpf(__x, __e); } 474 475 __DEVICE__ 476 float lgammaf(float __x) { return __ocml_lgamma_f32(__x); } 477 478 __DEVICE__ 479 long long int llrintf(float __x) { return __builtin_rintf(__x); } 480 481 __DEVICE__ 482 long long int llroundf(float __x) { return __builtin_roundf(__x); } 483 484 __DEVICE__ 485 float log10f(float __x) { return __builtin_log10f(__x); } 486 487 __DEVICE__ 488 float log1pf(float __x) { return __ocml_log1p_f32(__x); } 489 490 __DEVICE__ 491 float log2f(float __x) { return __FAST_OR_SLOW(__log2f, __ocml_log2_f32)(__x); } 492 493 __DEVICE__ 494 float logbf(float __x) { return __ocml_logb_f32(__x); } 495 496 __DEVICE__ 497 float logf(float __x) { return __FAST_OR_SLOW(__logf, __ocml_log_f32)(__x); } 498 499 __DEVICE__ 500 long int lrintf(float __x) { return __builtin_rintf(__x); } 501 502 __DEVICE__ 503 long int lroundf(float __x) { return __builtin_roundf(__x); } 504 505 __DEVICE__ 506 float modff(float __x, float *__iptr) { 507 float __tmp; 508 #ifdef __OPENMP_AMDGCN__ 509 #pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) 510 #endif 511 float __r = 512 __ocml_modf_f32(__x, (__attribute__((address_space(5))) float *)&__tmp); 513 *__iptr = __tmp; 514 return __r; 515 } 516 517 __DEVICE__ 518 float nanf(const char *__tagp __attribute__((nonnull))) { 519 union { 520 float val; 521 struct ieee_float { 522 unsigned int mantissa : 22; 523 unsigned int quiet : 1; 524 unsigned int exponent : 8; 525 unsigned int sign : 1; 526 } bits; 527 } __tmp; 528 __static_assert_type_size_equal(sizeof(__tmp.val), sizeof(__tmp.bits)); 529 530 __tmp.bits.sign = 0u; 531 __tmp.bits.exponent = ~0u; 532 __tmp.bits.quiet = 1u; 533 __tmp.bits.mantissa = __make_mantissa(__tagp); 534 535 return __tmp.val; 536 } 537 538 __DEVICE__ 539 float nearbyintf(float __x) { return __builtin_nearbyintf(__x); } 540 541 __DEVICE__ 542 float nextafterf(float __x, float __y) { 543 return __ocml_nextafter_f32(__x, __y); 544 } 545 546 __DEVICE__ 547 float norm3df(float __x, float __y, float __z) { 548 return __ocml_len3_f32(__x, __y, __z); 549 } 550 551 __DEVICE__ 552 float norm4df(float __x, float __y, float __z, float __w) { 553 return __ocml_len4_f32(__x, __y, __z, __w); 554 } 555 556 __DEVICE__ 557 float normcdff(float __x) { return __ocml_ncdf_f32(__x); } 558 559 __DEVICE__ 560 float normcdfinvf(float __x) { return __ocml_ncdfinv_f32(__x); } 561 562 __DEVICE__ 563 float normf(int __dim, 564 const float *__a) { // TODO: placeholder until OCML adds support. 565 float __r = 0; 566 while (__dim--) { 567 __r += __a[0] * __a[0]; 568 ++__a; 569 } 570 571 return __builtin_sqrtf(__r); 572 } 573 574 __DEVICE__ 575 float powf(float __x, float __y) { return __ocml_pow_f32(__x, __y); } 576 577 __DEVICE__ 578 float powif(float __x, int __y) { return __ocml_pown_f32(__x, __y); } 579 580 __DEVICE__ 581 float rcbrtf(float __x) { return __ocml_rcbrt_f32(__x); } 582 583 __DEVICE__ 584 float remainderf(float __x, float __y) { 585 return __ocml_remainder_f32(__x, __y); 586 } 587 588 __DEVICE__ 589 float remquof(float __x, float __y, int *__quo) { 590 int __tmp; 591 #ifdef __OPENMP_AMDGCN__ 592 #pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) 593 #endif 594 float __r = __ocml_remquo_f32( 595 __x, __y, (__attribute__((address_space(5))) int *)&__tmp); 596 *__quo = __tmp; 597 598 return __r; 599 } 600 601 __DEVICE__ 602 float rhypotf(float __x, float __y) { return __ocml_rhypot_f32(__x, __y); } 603 604 __DEVICE__ 605 float rintf(float __x) { return __builtin_rintf(__x); } 606 607 __DEVICE__ 608 float rnorm3df(float __x, float __y, float __z) { 609 return __ocml_rlen3_f32(__x, __y, __z); 610 } 611 612 __DEVICE__ 613 float rnorm4df(float __x, float __y, float __z, float __w) { 614 return __ocml_rlen4_f32(__x, __y, __z, __w); 615 } 616 617 __DEVICE__ 618 float rnormf(int __dim, 619 const float *__a) { // TODO: placeholder until OCML adds support. 620 float __r = 0; 621 while (__dim--) { 622 __r += __a[0] * __a[0]; 623 ++__a; 624 } 625 626 return __ocml_rsqrt_f32(__r); 627 } 628 629 __DEVICE__ 630 float roundf(float __x) { return __builtin_roundf(__x); } 631 632 __DEVICE__ 633 float rsqrtf(float __x) { return __ocml_rsqrt_f32(__x); } 634 635 __DEVICE__ 636 float scalblnf(float __x, long int __n) { 637 return (__n < INT_MAX) ? __builtin_amdgcn_ldexpf(__x, __n) 638 : __ocml_scalb_f32(__x, __n); 639 } 640 641 __DEVICE__ 642 float scalbnf(float __x, int __n) { return __builtin_amdgcn_ldexpf(__x, __n); } 643 644 __DEVICE__ 645 __RETURN_TYPE __signbitf(float __x) { return __builtin_signbitf(__x); } 646 647 __DEVICE__ 648 void sincosf(float __x, float *__sinptr, float *__cosptr) { 649 float __tmp; 650 #ifdef __OPENMP_AMDGCN__ 651 #pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) 652 #endif 653 #ifdef __CLANG_CUDA_APPROX_TRANSCENDENTALS__ 654 __sincosf(__x, __sinptr, __cosptr); 655 #else 656 *__sinptr = 657 __ocml_sincos_f32(__x, (__attribute__((address_space(5))) float *)&__tmp); 658 *__cosptr = __tmp; 659 #endif 660 } 661 662 __DEVICE__ 663 void sincospif(float __x, float *__sinptr, float *__cosptr) { 664 float __tmp; 665 #ifdef __OPENMP_AMDGCN__ 666 #pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) 667 #endif 668 *__sinptr = __ocml_sincospi_f32( 669 __x, (__attribute__((address_space(5))) float *)&__tmp); 670 *__cosptr = __tmp; 671 } 672 673 __DEVICE__ 674 float sinf(float __x) { return __FAST_OR_SLOW(__sinf, __ocml_sin_f32)(__x); } 675 676 __DEVICE__ 677 float sinhf(float __x) { return __ocml_sinh_f32(__x); } 678 679 __DEVICE__ 680 float sinpif(float __x) { return __ocml_sinpi_f32(__x); } 681 682 __DEVICE__ 683 float sqrtf(float __x) { return __builtin_sqrtf(__x); } 684 685 __DEVICE__ 686 float tanf(float __x) { return __ocml_tan_f32(__x); } 687 688 __DEVICE__ 689 float tanhf(float __x) { return __ocml_tanh_f32(__x); } 690 691 __DEVICE__ 692 float tgammaf(float __x) { return __ocml_tgamma_f32(__x); } 693 694 __DEVICE__ 695 float truncf(float __x) { return __builtin_truncf(__x); } 696 697 __DEVICE__ 698 float y0f(float __x) { return __ocml_y0_f32(__x); } 699 700 __DEVICE__ 701 float y1f(float __x) { return __ocml_y1_f32(__x); } 702 703 __DEVICE__ 704 float ynf(int __n, float __x) { // TODO: we could use Ahmes multiplication 705 // and the Miller & Brown algorithm 706 // for linear recurrences to get O(log n) steps, but it's unclear if 707 // it'd be beneficial in this case. Placeholder until OCML adds 708 // support. 709 if (__n == 0) 710 return y0f(__x); 711 if (__n == 1) 712 return y1f(__x); 713 714 float __x0 = y0f(__x); 715 float __x1 = y1f(__x); 716 for (int __i = 1; __i < __n; ++__i) { 717 float __x2 = (2 * __i) / __x * __x1 - __x0; 718 __x0 = __x1; 719 __x1 = __x2; 720 } 721 722 return __x1; 723 } 724 725 726 // END FLOAT 727 728 // BEGIN DOUBLE 729 __DEVICE__ 730 double acos(double __x) { return __ocml_acos_f64(__x); } 731 732 __DEVICE__ 733 double acosh(double __x) { return __ocml_acosh_f64(__x); } 734 735 __DEVICE__ 736 double asin(double __x) { return __ocml_asin_f64(__x); } 737 738 __DEVICE__ 739 double asinh(double __x) { return __ocml_asinh_f64(__x); } 740 741 __DEVICE__ 742 double atan(double __x) { return __ocml_atan_f64(__x); } 743 744 __DEVICE__ 745 double atan2(double __x, double __y) { return __ocml_atan2_f64(__x, __y); } 746 747 __DEVICE__ 748 double atanh(double __x) { return __ocml_atanh_f64(__x); } 749 750 __DEVICE__ 751 double cbrt(double __x) { return __ocml_cbrt_f64(__x); } 752 753 __DEVICE__ 754 double ceil(double __x) { return __builtin_ceil(__x); } 755 756 __DEVICE__ 757 double copysign(double __x, double __y) { 758 return __builtin_copysign(__x, __y); 759 } 760 761 __DEVICE__ 762 double cos(double __x) { return __ocml_cos_f64(__x); } 763 764 __DEVICE__ 765 double cosh(double __x) { return __ocml_cosh_f64(__x); } 766 767 __DEVICE__ 768 double cospi(double __x) { return __ocml_cospi_f64(__x); } 769 770 __DEVICE__ 771 double cyl_bessel_i0(double __x) { return __ocml_i0_f64(__x); } 772 773 __DEVICE__ 774 double cyl_bessel_i1(double __x) { return __ocml_i1_f64(__x); } 775 776 __DEVICE__ 777 double erf(double __x) { return __ocml_erf_f64(__x); } 778 779 __DEVICE__ 780 double erfc(double __x) { return __ocml_erfc_f64(__x); } 781 782 __DEVICE__ 783 double erfcinv(double __x) { return __ocml_erfcinv_f64(__x); } 784 785 __DEVICE__ 786 double erfcx(double __x) { return __ocml_erfcx_f64(__x); } 787 788 __DEVICE__ 789 double erfinv(double __x) { return __ocml_erfinv_f64(__x); } 790 791 __DEVICE__ 792 double exp(double __x) { return __ocml_exp_f64(__x); } 793 794 __DEVICE__ 795 double exp10(double __x) { return __ocml_exp10_f64(__x); } 796 797 __DEVICE__ 798 double exp2(double __x) { return __ocml_exp2_f64(__x); } 799 800 __DEVICE__ 801 double expm1(double __x) { return __ocml_expm1_f64(__x); } 802 803 __DEVICE__ 804 double fabs(double __x) { return __builtin_fabs(__x); } 805 806 __DEVICE__ 807 double fdim(double __x, double __y) { return __ocml_fdim_f64(__x, __y); } 808 809 __DEVICE__ 810 double floor(double __x) { return __builtin_floor(__x); } 811 812 __DEVICE__ 813 double fma(double __x, double __y, double __z) { 814 return __builtin_fma(__x, __y, __z); 815 } 816 817 __DEVICE__ 818 double fmax(double __x, double __y) { return __builtin_fmax(__x, __y); } 819 820 __DEVICE__ 821 double fmin(double __x, double __y) { return __builtin_fmin(__x, __y); } 822 823 __DEVICE__ 824 double fmod(double __x, double __y) { return __ocml_fmod_f64(__x, __y); } 825 826 __DEVICE__ 827 double frexp(double __x, int *__nptr) { 828 return __builtin_frexp(__x, __nptr); 829 } 830 831 __DEVICE__ 832 double hypot(double __x, double __y) { return __ocml_hypot_f64(__x, __y); } 833 834 __DEVICE__ 835 int ilogb(double __x) { return __ocml_ilogb_f64(__x); } 836 837 __DEVICE__ 838 __RETURN_TYPE __finite(double __x) { return __builtin_isfinite(__x); } 839 840 __DEVICE__ 841 __RETURN_TYPE __isinf(double __x) { return __builtin_isinf(__x); } 842 843 __DEVICE__ 844 __RETURN_TYPE __isnan(double __x) { return __builtin_isnan(__x); } 845 846 __DEVICE__ 847 double j0(double __x) { return __ocml_j0_f64(__x); } 848 849 __DEVICE__ 850 double j1(double __x) { return __ocml_j1_f64(__x); } 851 852 __DEVICE__ 853 double jn(int __n, double __x) { // TODO: we could use Ahmes multiplication 854 // and the Miller & Brown algorithm 855 // for linear recurrences to get O(log n) steps, but it's unclear if 856 // it'd be beneficial in this case. Placeholder until OCML adds 857 // support. 858 if (__n == 0) 859 return j0(__x); 860 if (__n == 1) 861 return j1(__x); 862 863 double __x0 = j0(__x); 864 double __x1 = j1(__x); 865 for (int __i = 1; __i < __n; ++__i) { 866 double __x2 = (2 * __i) / __x * __x1 - __x0; 867 __x0 = __x1; 868 __x1 = __x2; 869 } 870 return __x1; 871 } 872 873 __DEVICE__ 874 double ldexp(double __x, int __e) { return __builtin_amdgcn_ldexp(__x, __e); } 875 876 __DEVICE__ 877 double lgamma(double __x) { return __ocml_lgamma_f64(__x); } 878 879 __DEVICE__ 880 long long int llrint(double __x) { return __builtin_rint(__x); } 881 882 __DEVICE__ 883 long long int llround(double __x) { return __builtin_round(__x); } 884 885 __DEVICE__ 886 double log(double __x) { return __ocml_log_f64(__x); } 887 888 __DEVICE__ 889 double log10(double __x) { return __ocml_log10_f64(__x); } 890 891 __DEVICE__ 892 double log1p(double __x) { return __ocml_log1p_f64(__x); } 893 894 __DEVICE__ 895 double log2(double __x) { return __ocml_log2_f64(__x); } 896 897 __DEVICE__ 898 double logb(double __x) { return __ocml_logb_f64(__x); } 899 900 __DEVICE__ 901 long int lrint(double __x) { return __builtin_rint(__x); } 902 903 __DEVICE__ 904 long int lround(double __x) { return __builtin_round(__x); } 905 906 __DEVICE__ 907 double modf(double __x, double *__iptr) { 908 double __tmp; 909 #ifdef __OPENMP_AMDGCN__ 910 #pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) 911 #endif 912 double __r = 913 __ocml_modf_f64(__x, (__attribute__((address_space(5))) double *)&__tmp); 914 *__iptr = __tmp; 915 916 return __r; 917 } 918 919 __DEVICE__ 920 double nan(const char *__tagp) { 921 #if !_WIN32 922 union { 923 double val; 924 struct ieee_double { 925 uint64_t mantissa : 51; 926 uint32_t quiet : 1; 927 uint32_t exponent : 11; 928 uint32_t sign : 1; 929 } bits; 930 } __tmp; 931 __static_assert_type_size_equal(sizeof(__tmp.val), sizeof(__tmp.bits)); 932 933 __tmp.bits.sign = 0u; 934 __tmp.bits.exponent = ~0u; 935 __tmp.bits.quiet = 1u; 936 __tmp.bits.mantissa = __make_mantissa(__tagp); 937 938 return __tmp.val; 939 #else 940 __static_assert_type_size_equal(sizeof(uint64_t), sizeof(double)); 941 uint64_t __val = __make_mantissa(__tagp); 942 __val |= 0xFFF << 51; 943 return *reinterpret_cast<double *>(&__val); 944 #endif 945 } 946 947 __DEVICE__ 948 double nearbyint(double __x) { return __builtin_nearbyint(__x); } 949 950 __DEVICE__ 951 double nextafter(double __x, double __y) { 952 return __ocml_nextafter_f64(__x, __y); 953 } 954 955 __DEVICE__ 956 double norm(int __dim, 957 const double *__a) { // TODO: placeholder until OCML adds support. 958 double __r = 0; 959 while (__dim--) { 960 __r += __a[0] * __a[0]; 961 ++__a; 962 } 963 964 return __builtin_sqrt(__r); 965 } 966 967 __DEVICE__ 968 double norm3d(double __x, double __y, double __z) { 969 return __ocml_len3_f64(__x, __y, __z); 970 } 971 972 __DEVICE__ 973 double norm4d(double __x, double __y, double __z, double __w) { 974 return __ocml_len4_f64(__x, __y, __z, __w); 975 } 976 977 __DEVICE__ 978 double normcdf(double __x) { return __ocml_ncdf_f64(__x); } 979 980 __DEVICE__ 981 double normcdfinv(double __x) { return __ocml_ncdfinv_f64(__x); } 982 983 __DEVICE__ 984 double pow(double __x, double __y) { return __ocml_pow_f64(__x, __y); } 985 986 __DEVICE__ 987 double powi(double __x, int __y) { return __ocml_pown_f64(__x, __y); } 988 989 __DEVICE__ 990 double rcbrt(double __x) { return __ocml_rcbrt_f64(__x); } 991 992 __DEVICE__ 993 double remainder(double __x, double __y) { 994 return __ocml_remainder_f64(__x, __y); 995 } 996 997 __DEVICE__ 998 double remquo(double __x, double __y, int *__quo) { 999 int __tmp; 1000 #ifdef __OPENMP_AMDGCN__ 1001 #pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) 1002 #endif 1003 double __r = __ocml_remquo_f64( 1004 __x, __y, (__attribute__((address_space(5))) int *)&__tmp); 1005 *__quo = __tmp; 1006 1007 return __r; 1008 } 1009 1010 __DEVICE__ 1011 double rhypot(double __x, double __y) { return __ocml_rhypot_f64(__x, __y); } 1012 1013 __DEVICE__ 1014 double rint(double __x) { return __builtin_rint(__x); } 1015 1016 __DEVICE__ 1017 double rnorm(int __dim, 1018 const double *__a) { // TODO: placeholder until OCML adds support. 1019 double __r = 0; 1020 while (__dim--) { 1021 __r += __a[0] * __a[0]; 1022 ++__a; 1023 } 1024 1025 return __ocml_rsqrt_f64(__r); 1026 } 1027 1028 __DEVICE__ 1029 double rnorm3d(double __x, double __y, double __z) { 1030 return __ocml_rlen3_f64(__x, __y, __z); 1031 } 1032 1033 __DEVICE__ 1034 double rnorm4d(double __x, double __y, double __z, double __w) { 1035 return __ocml_rlen4_f64(__x, __y, __z, __w); 1036 } 1037 1038 __DEVICE__ 1039 double round(double __x) { return __builtin_round(__x); } 1040 1041 __DEVICE__ 1042 double rsqrt(double __x) { return __ocml_rsqrt_f64(__x); } 1043 1044 __DEVICE__ 1045 double scalbln(double __x, long int __n) { 1046 return (__n < INT_MAX) ? __builtin_amdgcn_ldexp(__x, __n) 1047 : __ocml_scalb_f64(__x, __n); 1048 } 1049 __DEVICE__ 1050 double scalbn(double __x, int __n) { return __builtin_amdgcn_ldexp(__x, __n); } 1051 1052 __DEVICE__ 1053 __RETURN_TYPE __signbit(double __x) { return __builtin_signbit(__x); } 1054 1055 __DEVICE__ 1056 double sin(double __x) { return __ocml_sin_f64(__x); } 1057 1058 __DEVICE__ 1059 void sincos(double __x, double *__sinptr, double *__cosptr) { 1060 double __tmp; 1061 #ifdef __OPENMP_AMDGCN__ 1062 #pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) 1063 #endif 1064 *__sinptr = __ocml_sincos_f64( 1065 __x, (__attribute__((address_space(5))) double *)&__tmp); 1066 *__cosptr = __tmp; 1067 } 1068 1069 __DEVICE__ 1070 void sincospi(double __x, double *__sinptr, double *__cosptr) { 1071 double __tmp; 1072 #ifdef __OPENMP_AMDGCN__ 1073 #pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) 1074 #endif 1075 *__sinptr = __ocml_sincospi_f64( 1076 __x, (__attribute__((address_space(5))) double *)&__tmp); 1077 *__cosptr = __tmp; 1078 } 1079 1080 __DEVICE__ 1081 double sinh(double __x) { return __ocml_sinh_f64(__x); } 1082 1083 __DEVICE__ 1084 double sinpi(double __x) { return __ocml_sinpi_f64(__x); } 1085 1086 __DEVICE__ 1087 double sqrt(double __x) { return __builtin_sqrt(__x); } 1088 1089 __DEVICE__ 1090 double tan(double __x) { return __ocml_tan_f64(__x); } 1091 1092 __DEVICE__ 1093 double tanh(double __x) { return __ocml_tanh_f64(__x); } 1094 1095 __DEVICE__ 1096 double tgamma(double __x) { return __ocml_tgamma_f64(__x); } 1097 1098 __DEVICE__ 1099 double trunc(double __x) { return __builtin_trunc(__x); } 1100 1101 __DEVICE__ 1102 double y0(double __x) { return __ocml_y0_f64(__x); } 1103 1104 __DEVICE__ 1105 double y1(double __x) { return __ocml_y1_f64(__x); } 1106 1107 __DEVICE__ 1108 double yn(int __n, double __x) { // TODO: we could use Ahmes multiplication 1109 // and the Miller & Brown algorithm 1110 // for linear recurrences to get O(log n) steps, but it's unclear if 1111 // it'd be beneficial in this case. Placeholder until OCML adds 1112 // support. 1113 if (__n == 0) 1114 return y0(__x); 1115 if (__n == 1) 1116 return y1(__x); 1117 1118 double __x0 = y0(__x); 1119 double __x1 = y1(__x); 1120 for (int __i = 1; __i < __n; ++__i) { 1121 double __x2 = (2 * __i) / __x * __x1 - __x0; 1122 __x0 = __x1; 1123 __x1 = __x2; 1124 } 1125 1126 return __x1; 1127 } 1128 1129 // BEGIN INTRINSICS 1130 #if defined OCML_BASIC_ROUNDED_OPERATIONS 1131 __DEVICE__ 1132 double __dadd_rd(double __x, double __y) { 1133 return __ocml_add_rtn_f64(__x, __y); 1134 } 1135 __DEVICE__ 1136 double __dadd_rn(double __x, double __y) { 1137 return __ocml_add_rte_f64(__x, __y); 1138 } 1139 __DEVICE__ 1140 double __dadd_ru(double __x, double __y) { 1141 return __ocml_add_rtp_f64(__x, __y); 1142 } 1143 __DEVICE__ 1144 double __dadd_rz(double __x, double __y) { 1145 return __ocml_add_rtz_f64(__x, __y); 1146 } 1147 #else 1148 __DEVICE__ 1149 double __dadd_rn(double __x, double __y) { return __x + __y; } 1150 #endif 1151 1152 #if defined OCML_BASIC_ROUNDED_OPERATIONS 1153 __DEVICE__ 1154 double __ddiv_rd(double __x, double __y) { 1155 return __ocml_div_rtn_f64(__x, __y); 1156 } 1157 __DEVICE__ 1158 double __ddiv_rn(double __x, double __y) { 1159 return __ocml_div_rte_f64(__x, __y); 1160 } 1161 __DEVICE__ 1162 double __ddiv_ru(double __x, double __y) { 1163 return __ocml_div_rtp_f64(__x, __y); 1164 } 1165 __DEVICE__ 1166 double __ddiv_rz(double __x, double __y) { 1167 return __ocml_div_rtz_f64(__x, __y); 1168 } 1169 #else 1170 __DEVICE__ 1171 double __ddiv_rn(double __x, double __y) { return __x / __y; } 1172 #endif 1173 1174 #if defined OCML_BASIC_ROUNDED_OPERATIONS 1175 __DEVICE__ 1176 double __dmul_rd(double __x, double __y) { 1177 return __ocml_mul_rtn_f64(__x, __y); 1178 } 1179 __DEVICE__ 1180 double __dmul_rn(double __x, double __y) { 1181 return __ocml_mul_rte_f64(__x, __y); 1182 } 1183 __DEVICE__ 1184 double __dmul_ru(double __x, double __y) { 1185 return __ocml_mul_rtp_f64(__x, __y); 1186 } 1187 __DEVICE__ 1188 double __dmul_rz(double __x, double __y) { 1189 return __ocml_mul_rtz_f64(__x, __y); 1190 } 1191 #else 1192 __DEVICE__ 1193 double __dmul_rn(double __x, double __y) { return __x * __y; } 1194 #endif 1195 1196 #if defined OCML_BASIC_ROUNDED_OPERATIONS 1197 __DEVICE__ 1198 double __drcp_rd(double __x) { return __ocml_div_rtn_f64(1.0, __x); } 1199 __DEVICE__ 1200 double __drcp_rn(double __x) { return __ocml_div_rte_f64(1.0, __x); } 1201 __DEVICE__ 1202 double __drcp_ru(double __x) { return __ocml_div_rtp_f64(1.0, __x); } 1203 __DEVICE__ 1204 double __drcp_rz(double __x) { return __ocml_div_rtz_f64(1.0, __x); } 1205 #else 1206 __DEVICE__ 1207 double __drcp_rn(double __x) { return 1.0 / __x; } 1208 #endif 1209 1210 #if defined OCML_BASIC_ROUNDED_OPERATIONS 1211 __DEVICE__ 1212 double __dsqrt_rd(double __x) { return __ocml_sqrt_rtn_f64(__x); } 1213 __DEVICE__ 1214 double __dsqrt_rn(double __x) { return __ocml_sqrt_rte_f64(__x); } 1215 __DEVICE__ 1216 double __dsqrt_ru(double __x) { return __ocml_sqrt_rtp_f64(__x); } 1217 __DEVICE__ 1218 double __dsqrt_rz(double __x) { return __ocml_sqrt_rtz_f64(__x); } 1219 #else 1220 __DEVICE__ 1221 double __dsqrt_rn(double __x) { return __builtin_sqrt(__x); } 1222 #endif 1223 1224 #if defined OCML_BASIC_ROUNDED_OPERATIONS 1225 __DEVICE__ 1226 double __dsub_rd(double __x, double __y) { 1227 return __ocml_sub_rtn_f64(__x, __y); 1228 } 1229 __DEVICE__ 1230 double __dsub_rn(double __x, double __y) { 1231 return __ocml_sub_rte_f64(__x, __y); 1232 } 1233 __DEVICE__ 1234 double __dsub_ru(double __x, double __y) { 1235 return __ocml_sub_rtp_f64(__x, __y); 1236 } 1237 __DEVICE__ 1238 double __dsub_rz(double __x, double __y) { 1239 return __ocml_sub_rtz_f64(__x, __y); 1240 } 1241 #else 1242 __DEVICE__ 1243 double __dsub_rn(double __x, double __y) { return __x - __y; } 1244 #endif 1245 1246 #if defined OCML_BASIC_ROUNDED_OPERATIONS 1247 __DEVICE__ 1248 double __fma_rd(double __x, double __y, double __z) { 1249 return __ocml_fma_rtn_f64(__x, __y, __z); 1250 } 1251 __DEVICE__ 1252 double __fma_rn(double __x, double __y, double __z) { 1253 return __ocml_fma_rte_f64(__x, __y, __z); 1254 } 1255 __DEVICE__ 1256 double __fma_ru(double __x, double __y, double __z) { 1257 return __ocml_fma_rtp_f64(__x, __y, __z); 1258 } 1259 __DEVICE__ 1260 double __fma_rz(double __x, double __y, double __z) { 1261 return __ocml_fma_rtz_f64(__x, __y, __z); 1262 } 1263 #else 1264 __DEVICE__ 1265 double __fma_rn(double __x, double __y, double __z) { 1266 return __builtin_fma(__x, __y, __z); 1267 } 1268 #endif 1269 // END INTRINSICS 1270 // END DOUBLE 1271 1272 // C only macros 1273 #if !defined(__cplusplus) && __STDC_VERSION__ >= 201112L 1274 #define isfinite(__x) _Generic((__x), float : __finitef, double : __finite)(__x) 1275 #define isinf(__x) _Generic((__x), float : __isinff, double : __isinf)(__x) 1276 #define isnan(__x) _Generic((__x), float : __isnanf, double : __isnan)(__x) 1277 #define signbit(__x) \ 1278 _Generic((__x), float : __signbitf, double : __signbit)(__x) 1279 #endif // !defined(__cplusplus) && __STDC_VERSION__ >= 201112L 1280 1281 #if defined(__cplusplus) 1282 template <class T> __DEVICE__ T min(T __arg1, T __arg2) { 1283 return (__arg1 < __arg2) ? __arg1 : __arg2; 1284 } 1285 1286 template <class T> __DEVICE__ T max(T __arg1, T __arg2) { 1287 return (__arg1 > __arg2) ? __arg1 : __arg2; 1288 } 1289 1290 __DEVICE__ int min(int __arg1, int __arg2) { 1291 return (__arg1 < __arg2) ? __arg1 : __arg2; 1292 } 1293 __DEVICE__ int max(int __arg1, int __arg2) { 1294 return (__arg1 > __arg2) ? __arg1 : __arg2; 1295 } 1296 1297 __DEVICE__ 1298 float max(float __x, float __y) { return __builtin_fmaxf(__x, __y); } 1299 1300 __DEVICE__ 1301 double max(double __x, double __y) { return __builtin_fmax(__x, __y); } 1302 1303 __DEVICE__ 1304 float min(float __x, float __y) { return __builtin_fminf(__x, __y); } 1305 1306 __DEVICE__ 1307 double min(double __x, double __y) { return __builtin_fmin(__x, __y); } 1308 1309 #if !defined(__HIPCC_RTC__) && !defined(__OPENMP_AMDGCN__) 1310 __host__ inline static int min(int __arg1, int __arg2) { 1311 return __arg1 < __arg2 ? __arg1 : __arg2; 1312 } 1313 1314 __host__ inline static int max(int __arg1, int __arg2) { 1315 return __arg1 > __arg2 ? __arg1 : __arg2; 1316 } 1317 #endif // !defined(__HIPCC_RTC__) && !defined(__OPENMP_AMDGCN__) 1318 #endif 1319 1320 #pragma pop_macro("__DEVICE__") 1321 #pragma pop_macro("__RETURN_TYPE") 1322 #pragma pop_macro("__FAST_OR_SLOW") 1323 1324 #endif // __CLANG_HIP_MATH_H__ 1325