10b57cec5SDimitry Andric //===-- NVPTXTargetTransformInfo.cpp - NVPTX specific TTI -----------------===// 20b57cec5SDimitry Andric // 30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information. 50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 60b57cec5SDimitry Andric // 70b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 80b57cec5SDimitry Andric 90b57cec5SDimitry Andric #include "NVPTXTargetTransformInfo.h" 100b57cec5SDimitry Andric #include "NVPTXUtilities.h" 110b57cec5SDimitry Andric #include "llvm/Analysis/LoopInfo.h" 120b57cec5SDimitry Andric #include "llvm/Analysis/TargetTransformInfo.h" 130b57cec5SDimitry Andric #include "llvm/Analysis/ValueTracking.h" 140b57cec5SDimitry Andric #include "llvm/CodeGen/BasicTTIImpl.h" 150b57cec5SDimitry Andric #include "llvm/CodeGen/CostTable.h" 160b57cec5SDimitry Andric #include "llvm/CodeGen/TargetLowering.h" 17480093f4SDimitry Andric #include "llvm/IR/IntrinsicsNVPTX.h" 180b57cec5SDimitry Andric #include "llvm/Support/Debug.h" 19bdd1243dSDimitry Andric #include <optional> 200b57cec5SDimitry Andric using namespace llvm; 210b57cec5SDimitry Andric 220b57cec5SDimitry Andric #define DEBUG_TYPE "NVPTXtti" 230b57cec5SDimitry Andric 240b57cec5SDimitry Andric // Whether the given intrinsic reads threadIdx.x/y/z. 250b57cec5SDimitry Andric static bool readsThreadIndex(const IntrinsicInst *II) { 260b57cec5SDimitry Andric switch (II->getIntrinsicID()) { 270b57cec5SDimitry Andric default: return false; 280b57cec5SDimitry Andric case Intrinsic::nvvm_read_ptx_sreg_tid_x: 290b57cec5SDimitry Andric case Intrinsic::nvvm_read_ptx_sreg_tid_y: 300b57cec5SDimitry Andric case Intrinsic::nvvm_read_ptx_sreg_tid_z: 310b57cec5SDimitry Andric return true; 320b57cec5SDimitry Andric } 330b57cec5SDimitry Andric } 340b57cec5SDimitry Andric 350b57cec5SDimitry Andric static bool readsLaneId(const IntrinsicInst *II) { 360b57cec5SDimitry Andric return II->getIntrinsicID() == Intrinsic::nvvm_read_ptx_sreg_laneid; 370b57cec5SDimitry Andric } 380b57cec5SDimitry Andric 390b57cec5SDimitry Andric // Whether the given intrinsic is an atomic instruction in PTX. 400b57cec5SDimitry Andric static bool isNVVMAtomic(const IntrinsicInst *II) { 410b57cec5SDimitry Andric switch (II->getIntrinsicID()) { 420b57cec5SDimitry Andric default: return false; 430b57cec5SDimitry Andric case Intrinsic::nvvm_atomic_load_inc_32: 440b57cec5SDimitry Andric case Intrinsic::nvvm_atomic_load_dec_32: 450b57cec5SDimitry Andric 460b57cec5SDimitry Andric case Intrinsic::nvvm_atomic_add_gen_f_cta: 470b57cec5SDimitry Andric case Intrinsic::nvvm_atomic_add_gen_f_sys: 480b57cec5SDimitry Andric case Intrinsic::nvvm_atomic_add_gen_i_cta: 490b57cec5SDimitry Andric case Intrinsic::nvvm_atomic_add_gen_i_sys: 500b57cec5SDimitry Andric case Intrinsic::nvvm_atomic_and_gen_i_cta: 510b57cec5SDimitry Andric case Intrinsic::nvvm_atomic_and_gen_i_sys: 520b57cec5SDimitry Andric case Intrinsic::nvvm_atomic_cas_gen_i_cta: 530b57cec5SDimitry Andric case Intrinsic::nvvm_atomic_cas_gen_i_sys: 540b57cec5SDimitry Andric case Intrinsic::nvvm_atomic_dec_gen_i_cta: 550b57cec5SDimitry Andric case Intrinsic::nvvm_atomic_dec_gen_i_sys: 560b57cec5SDimitry Andric case Intrinsic::nvvm_atomic_inc_gen_i_cta: 570b57cec5SDimitry Andric case Intrinsic::nvvm_atomic_inc_gen_i_sys: 580b57cec5SDimitry Andric case Intrinsic::nvvm_atomic_max_gen_i_cta: 590b57cec5SDimitry Andric case Intrinsic::nvvm_atomic_max_gen_i_sys: 600b57cec5SDimitry Andric case Intrinsic::nvvm_atomic_min_gen_i_cta: 610b57cec5SDimitry Andric case Intrinsic::nvvm_atomic_min_gen_i_sys: 620b57cec5SDimitry Andric case Intrinsic::nvvm_atomic_or_gen_i_cta: 630b57cec5SDimitry Andric case Intrinsic::nvvm_atomic_or_gen_i_sys: 640b57cec5SDimitry Andric case Intrinsic::nvvm_atomic_exch_gen_i_cta: 650b57cec5SDimitry Andric case Intrinsic::nvvm_atomic_exch_gen_i_sys: 660b57cec5SDimitry Andric case Intrinsic::nvvm_atomic_xor_gen_i_cta: 670b57cec5SDimitry Andric case Intrinsic::nvvm_atomic_xor_gen_i_sys: 680b57cec5SDimitry Andric return true; 690b57cec5SDimitry Andric } 700b57cec5SDimitry Andric } 710b57cec5SDimitry Andric 720b57cec5SDimitry Andric bool NVPTXTTIImpl::isSourceOfDivergence(const Value *V) { 730b57cec5SDimitry Andric // Without inter-procedural analysis, we conservatively assume that arguments 740b57cec5SDimitry Andric // to __device__ functions are divergent. 750b57cec5SDimitry Andric if (const Argument *Arg = dyn_cast<Argument>(V)) 760b57cec5SDimitry Andric return !isKernelFunction(*Arg->getParent()); 770b57cec5SDimitry Andric 780b57cec5SDimitry Andric if (const Instruction *I = dyn_cast<Instruction>(V)) { 790b57cec5SDimitry Andric // Without pointer analysis, we conservatively assume values loaded from 800b57cec5SDimitry Andric // generic or local address space are divergent. 810b57cec5SDimitry Andric if (const LoadInst *LI = dyn_cast<LoadInst>(I)) { 820b57cec5SDimitry Andric unsigned AS = LI->getPointerAddressSpace(); 830b57cec5SDimitry Andric return AS == ADDRESS_SPACE_GENERIC || AS == ADDRESS_SPACE_LOCAL; 840b57cec5SDimitry Andric } 850b57cec5SDimitry Andric // Atomic instructions may cause divergence. Atomic instructions are 860b57cec5SDimitry Andric // executed sequentially across all threads in a warp. Therefore, an earlier 870b57cec5SDimitry Andric // executed thread may see different memory inputs than a later executed 880b57cec5SDimitry Andric // thread. For example, suppose *a = 0 initially. 890b57cec5SDimitry Andric // 900b57cec5SDimitry Andric // atom.global.add.s32 d, [a], 1 910b57cec5SDimitry Andric // 920b57cec5SDimitry Andric // returns 0 for the first thread that enters the critical region, and 1 for 930b57cec5SDimitry Andric // the second thread. 940b57cec5SDimitry Andric if (I->isAtomic()) 950b57cec5SDimitry Andric return true; 960b57cec5SDimitry Andric if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { 970b57cec5SDimitry Andric // Instructions that read threadIdx are obviously divergent. 980b57cec5SDimitry Andric if (readsThreadIndex(II) || readsLaneId(II)) 990b57cec5SDimitry Andric return true; 10081ad6265SDimitry Andric // Handle the NVPTX atomic intrinsics that cannot be represented as an 1010b57cec5SDimitry Andric // atomic IR instruction. 1020b57cec5SDimitry Andric if (isNVVMAtomic(II)) 1030b57cec5SDimitry Andric return true; 1040b57cec5SDimitry Andric } 1050b57cec5SDimitry Andric // Conservatively consider the return value of function calls as divergent. 1060b57cec5SDimitry Andric // We could analyze callees with bodies more precisely using 1070b57cec5SDimitry Andric // inter-procedural analysis. 1080b57cec5SDimitry Andric if (isa<CallInst>(I)) 1090b57cec5SDimitry Andric return true; 1100b57cec5SDimitry Andric } 1110b57cec5SDimitry Andric 1120b57cec5SDimitry Andric return false; 1130b57cec5SDimitry Andric } 1140b57cec5SDimitry Andric 115e8d8bef9SDimitry Andric // Convert NVVM intrinsics to target-generic LLVM code where possible. 116e8d8bef9SDimitry Andric static Instruction *simplifyNvvmIntrinsic(IntrinsicInst *II, InstCombiner &IC) { 117e8d8bef9SDimitry Andric // Each NVVM intrinsic we can simplify can be replaced with one of: 118e8d8bef9SDimitry Andric // 119e8d8bef9SDimitry Andric // * an LLVM intrinsic, 120e8d8bef9SDimitry Andric // * an LLVM cast operation, 121e8d8bef9SDimitry Andric // * an LLVM binary operation, or 122e8d8bef9SDimitry Andric // * ad-hoc LLVM IR for the particular operation. 123e8d8bef9SDimitry Andric 124e8d8bef9SDimitry Andric // Some transformations are only valid when the module's 125e8d8bef9SDimitry Andric // flush-denormals-to-zero (ftz) setting is true/false, whereas other 126e8d8bef9SDimitry Andric // transformations are valid regardless of the module's ftz setting. 127e8d8bef9SDimitry Andric enum FtzRequirementTy { 128e8d8bef9SDimitry Andric FTZ_Any, // Any ftz setting is ok. 129e8d8bef9SDimitry Andric FTZ_MustBeOn, // Transformation is valid only if ftz is on. 130e8d8bef9SDimitry Andric FTZ_MustBeOff, // Transformation is valid only if ftz is off. 131e8d8bef9SDimitry Andric }; 132e8d8bef9SDimitry Andric // Classes of NVVM intrinsics that can't be replaced one-to-one with a 133e8d8bef9SDimitry Andric // target-generic intrinsic, cast op, or binary op but that we can nonetheless 134e8d8bef9SDimitry Andric // simplify. 135e8d8bef9SDimitry Andric enum SpecialCase { 136e8d8bef9SDimitry Andric SPC_Reciprocal, 137e8d8bef9SDimitry Andric }; 138e8d8bef9SDimitry Andric 139e8d8bef9SDimitry Andric // SimplifyAction is a poor-man's variant (plus an additional flag) that 140e8d8bef9SDimitry Andric // represents how to replace an NVVM intrinsic with target-generic LLVM IR. 141e8d8bef9SDimitry Andric struct SimplifyAction { 142e8d8bef9SDimitry Andric // Invariant: At most one of these Optionals has a value. 143bdd1243dSDimitry Andric std::optional<Intrinsic::ID> IID; 144bdd1243dSDimitry Andric std::optional<Instruction::CastOps> CastOp; 145bdd1243dSDimitry Andric std::optional<Instruction::BinaryOps> BinaryOp; 146bdd1243dSDimitry Andric std::optional<SpecialCase> Special; 147e8d8bef9SDimitry Andric 148e8d8bef9SDimitry Andric FtzRequirementTy FtzRequirement = FTZ_Any; 14981ad6265SDimitry Andric // Denormal handling is guarded by different attributes depending on the 15081ad6265SDimitry Andric // type (denormal-fp-math vs denormal-fp-math-f32), take note of halfs. 15181ad6265SDimitry Andric bool IsHalfTy = false; 152e8d8bef9SDimitry Andric 153e8d8bef9SDimitry Andric SimplifyAction() = default; 154e8d8bef9SDimitry Andric 15581ad6265SDimitry Andric SimplifyAction(Intrinsic::ID IID, FtzRequirementTy FtzReq, 15681ad6265SDimitry Andric bool IsHalfTy = false) 15781ad6265SDimitry Andric : IID(IID), FtzRequirement(FtzReq), IsHalfTy(IsHalfTy) {} 158e8d8bef9SDimitry Andric 159e8d8bef9SDimitry Andric // Cast operations don't have anything to do with FTZ, so we skip that 160e8d8bef9SDimitry Andric // argument. 161e8d8bef9SDimitry Andric SimplifyAction(Instruction::CastOps CastOp) : CastOp(CastOp) {} 162e8d8bef9SDimitry Andric 163e8d8bef9SDimitry Andric SimplifyAction(Instruction::BinaryOps BinaryOp, FtzRequirementTy FtzReq) 164e8d8bef9SDimitry Andric : BinaryOp(BinaryOp), FtzRequirement(FtzReq) {} 165e8d8bef9SDimitry Andric 166e8d8bef9SDimitry Andric SimplifyAction(SpecialCase Special, FtzRequirementTy FtzReq) 167e8d8bef9SDimitry Andric : Special(Special), FtzRequirement(FtzReq) {} 168e8d8bef9SDimitry Andric }; 169e8d8bef9SDimitry Andric 170e8d8bef9SDimitry Andric // Try to generate a SimplifyAction describing how to replace our 171e8d8bef9SDimitry Andric // IntrinsicInstr with target-generic LLVM IR. 172e8d8bef9SDimitry Andric const SimplifyAction Action = [II]() -> SimplifyAction { 173e8d8bef9SDimitry Andric switch (II->getIntrinsicID()) { 174e8d8bef9SDimitry Andric // NVVM intrinsics that map directly to LLVM intrinsics. 175e8d8bef9SDimitry Andric case Intrinsic::nvvm_ceil_d: 176e8d8bef9SDimitry Andric return {Intrinsic::ceil, FTZ_Any}; 177e8d8bef9SDimitry Andric case Intrinsic::nvvm_ceil_f: 178e8d8bef9SDimitry Andric return {Intrinsic::ceil, FTZ_MustBeOff}; 179e8d8bef9SDimitry Andric case Intrinsic::nvvm_ceil_ftz_f: 180e8d8bef9SDimitry Andric return {Intrinsic::ceil, FTZ_MustBeOn}; 181e8d8bef9SDimitry Andric case Intrinsic::nvvm_fabs_d: 182e8d8bef9SDimitry Andric return {Intrinsic::fabs, FTZ_Any}; 183e8d8bef9SDimitry Andric case Intrinsic::nvvm_floor_d: 184e8d8bef9SDimitry Andric return {Intrinsic::floor, FTZ_Any}; 185e8d8bef9SDimitry Andric case Intrinsic::nvvm_floor_f: 186e8d8bef9SDimitry Andric return {Intrinsic::floor, FTZ_MustBeOff}; 187e8d8bef9SDimitry Andric case Intrinsic::nvvm_floor_ftz_f: 188e8d8bef9SDimitry Andric return {Intrinsic::floor, FTZ_MustBeOn}; 189e8d8bef9SDimitry Andric case Intrinsic::nvvm_fma_rn_d: 190e8d8bef9SDimitry Andric return {Intrinsic::fma, FTZ_Any}; 191e8d8bef9SDimitry Andric case Intrinsic::nvvm_fma_rn_f: 192e8d8bef9SDimitry Andric return {Intrinsic::fma, FTZ_MustBeOff}; 193e8d8bef9SDimitry Andric case Intrinsic::nvvm_fma_rn_ftz_f: 194e8d8bef9SDimitry Andric return {Intrinsic::fma, FTZ_MustBeOn}; 19581ad6265SDimitry Andric case Intrinsic::nvvm_fma_rn_f16: 19681ad6265SDimitry Andric return {Intrinsic::fma, FTZ_MustBeOff, true}; 19781ad6265SDimitry Andric case Intrinsic::nvvm_fma_rn_ftz_f16: 19881ad6265SDimitry Andric return {Intrinsic::fma, FTZ_MustBeOn, true}; 19981ad6265SDimitry Andric case Intrinsic::nvvm_fma_rn_f16x2: 20081ad6265SDimitry Andric return {Intrinsic::fma, FTZ_MustBeOff, true}; 20181ad6265SDimitry Andric case Intrinsic::nvvm_fma_rn_ftz_f16x2: 20281ad6265SDimitry Andric return {Intrinsic::fma, FTZ_MustBeOn, true}; 203*06c3fb27SDimitry Andric case Intrinsic::nvvm_fma_rn_bf16: 204*06c3fb27SDimitry Andric return {Intrinsic::fma, FTZ_MustBeOff, true}; 205*06c3fb27SDimitry Andric case Intrinsic::nvvm_fma_rn_ftz_bf16: 206*06c3fb27SDimitry Andric return {Intrinsic::fma, FTZ_MustBeOn, true}; 207*06c3fb27SDimitry Andric case Intrinsic::nvvm_fma_rn_bf16x2: 208*06c3fb27SDimitry Andric return {Intrinsic::fma, FTZ_MustBeOff, true}; 209*06c3fb27SDimitry Andric case Intrinsic::nvvm_fma_rn_ftz_bf16x2: 210*06c3fb27SDimitry Andric return {Intrinsic::fma, FTZ_MustBeOn, true}; 211e8d8bef9SDimitry Andric case Intrinsic::nvvm_fmax_d: 212e8d8bef9SDimitry Andric return {Intrinsic::maxnum, FTZ_Any}; 213e8d8bef9SDimitry Andric case Intrinsic::nvvm_fmax_f: 214e8d8bef9SDimitry Andric return {Intrinsic::maxnum, FTZ_MustBeOff}; 215e8d8bef9SDimitry Andric case Intrinsic::nvvm_fmax_ftz_f: 216e8d8bef9SDimitry Andric return {Intrinsic::maxnum, FTZ_MustBeOn}; 21781ad6265SDimitry Andric case Intrinsic::nvvm_fmax_nan_f: 21881ad6265SDimitry Andric return {Intrinsic::maximum, FTZ_MustBeOff}; 21981ad6265SDimitry Andric case Intrinsic::nvvm_fmax_ftz_nan_f: 22081ad6265SDimitry Andric return {Intrinsic::maximum, FTZ_MustBeOn}; 22181ad6265SDimitry Andric case Intrinsic::nvvm_fmax_f16: 22281ad6265SDimitry Andric return {Intrinsic::maxnum, FTZ_MustBeOff, true}; 22381ad6265SDimitry Andric case Intrinsic::nvvm_fmax_ftz_f16: 22481ad6265SDimitry Andric return {Intrinsic::maxnum, FTZ_MustBeOn, true}; 22581ad6265SDimitry Andric case Intrinsic::nvvm_fmax_f16x2: 22681ad6265SDimitry Andric return {Intrinsic::maxnum, FTZ_MustBeOff, true}; 22781ad6265SDimitry Andric case Intrinsic::nvvm_fmax_ftz_f16x2: 22881ad6265SDimitry Andric return {Intrinsic::maxnum, FTZ_MustBeOn, true}; 22981ad6265SDimitry Andric case Intrinsic::nvvm_fmax_nan_f16: 23081ad6265SDimitry Andric return {Intrinsic::maximum, FTZ_MustBeOff, true}; 23181ad6265SDimitry Andric case Intrinsic::nvvm_fmax_ftz_nan_f16: 23281ad6265SDimitry Andric return {Intrinsic::maximum, FTZ_MustBeOn, true}; 23381ad6265SDimitry Andric case Intrinsic::nvvm_fmax_nan_f16x2: 23481ad6265SDimitry Andric return {Intrinsic::maximum, FTZ_MustBeOff, true}; 23581ad6265SDimitry Andric case Intrinsic::nvvm_fmax_ftz_nan_f16x2: 23681ad6265SDimitry Andric return {Intrinsic::maximum, FTZ_MustBeOn, true}; 237e8d8bef9SDimitry Andric case Intrinsic::nvvm_fmin_d: 238e8d8bef9SDimitry Andric return {Intrinsic::minnum, FTZ_Any}; 239e8d8bef9SDimitry Andric case Intrinsic::nvvm_fmin_f: 240e8d8bef9SDimitry Andric return {Intrinsic::minnum, FTZ_MustBeOff}; 241e8d8bef9SDimitry Andric case Intrinsic::nvvm_fmin_ftz_f: 242e8d8bef9SDimitry Andric return {Intrinsic::minnum, FTZ_MustBeOn}; 24381ad6265SDimitry Andric case Intrinsic::nvvm_fmin_nan_f: 24481ad6265SDimitry Andric return {Intrinsic::minimum, FTZ_MustBeOff}; 24581ad6265SDimitry Andric case Intrinsic::nvvm_fmin_ftz_nan_f: 24681ad6265SDimitry Andric return {Intrinsic::minimum, FTZ_MustBeOn}; 24781ad6265SDimitry Andric case Intrinsic::nvvm_fmin_f16: 24881ad6265SDimitry Andric return {Intrinsic::minnum, FTZ_MustBeOff, true}; 24981ad6265SDimitry Andric case Intrinsic::nvvm_fmin_ftz_f16: 25081ad6265SDimitry Andric return {Intrinsic::minnum, FTZ_MustBeOn, true}; 25181ad6265SDimitry Andric case Intrinsic::nvvm_fmin_f16x2: 25281ad6265SDimitry Andric return {Intrinsic::minnum, FTZ_MustBeOff, true}; 25381ad6265SDimitry Andric case Intrinsic::nvvm_fmin_ftz_f16x2: 25481ad6265SDimitry Andric return {Intrinsic::minnum, FTZ_MustBeOn, true}; 25581ad6265SDimitry Andric case Intrinsic::nvvm_fmin_nan_f16: 25681ad6265SDimitry Andric return {Intrinsic::minimum, FTZ_MustBeOff, true}; 25781ad6265SDimitry Andric case Intrinsic::nvvm_fmin_ftz_nan_f16: 25881ad6265SDimitry Andric return {Intrinsic::minimum, FTZ_MustBeOn, true}; 25981ad6265SDimitry Andric case Intrinsic::nvvm_fmin_nan_f16x2: 26081ad6265SDimitry Andric return {Intrinsic::minimum, FTZ_MustBeOff, true}; 26181ad6265SDimitry Andric case Intrinsic::nvvm_fmin_ftz_nan_f16x2: 26281ad6265SDimitry Andric return {Intrinsic::minimum, FTZ_MustBeOn, true}; 263e8d8bef9SDimitry Andric case Intrinsic::nvvm_sqrt_rn_d: 264e8d8bef9SDimitry Andric return {Intrinsic::sqrt, FTZ_Any}; 265e8d8bef9SDimitry Andric case Intrinsic::nvvm_sqrt_f: 266e8d8bef9SDimitry Andric // nvvm_sqrt_f is a special case. For most intrinsics, foo_ftz_f is the 267e8d8bef9SDimitry Andric // ftz version, and foo_f is the non-ftz version. But nvvm_sqrt_f adopts 268e8d8bef9SDimitry Andric // the ftz-ness of the surrounding code. sqrt_rn_f and sqrt_rn_ftz_f are 269e8d8bef9SDimitry Andric // the versions with explicit ftz-ness. 270e8d8bef9SDimitry Andric return {Intrinsic::sqrt, FTZ_Any}; 271e8d8bef9SDimitry Andric case Intrinsic::nvvm_trunc_d: 272e8d8bef9SDimitry Andric return {Intrinsic::trunc, FTZ_Any}; 273e8d8bef9SDimitry Andric case Intrinsic::nvvm_trunc_f: 274e8d8bef9SDimitry Andric return {Intrinsic::trunc, FTZ_MustBeOff}; 275e8d8bef9SDimitry Andric case Intrinsic::nvvm_trunc_ftz_f: 276e8d8bef9SDimitry Andric return {Intrinsic::trunc, FTZ_MustBeOn}; 277e8d8bef9SDimitry Andric 278e8d8bef9SDimitry Andric // NVVM intrinsics that map to LLVM cast operations. 279e8d8bef9SDimitry Andric // 280e8d8bef9SDimitry Andric // Note that llvm's target-generic conversion operators correspond to the rz 281e8d8bef9SDimitry Andric // (round to zero) versions of the nvvm conversion intrinsics, even though 282e8d8bef9SDimitry Andric // most everything else here uses the rn (round to nearest even) nvvm ops. 283e8d8bef9SDimitry Andric case Intrinsic::nvvm_d2i_rz: 284e8d8bef9SDimitry Andric case Intrinsic::nvvm_f2i_rz: 285e8d8bef9SDimitry Andric case Intrinsic::nvvm_d2ll_rz: 286e8d8bef9SDimitry Andric case Intrinsic::nvvm_f2ll_rz: 287e8d8bef9SDimitry Andric return {Instruction::FPToSI}; 288e8d8bef9SDimitry Andric case Intrinsic::nvvm_d2ui_rz: 289e8d8bef9SDimitry Andric case Intrinsic::nvvm_f2ui_rz: 290e8d8bef9SDimitry Andric case Intrinsic::nvvm_d2ull_rz: 291e8d8bef9SDimitry Andric case Intrinsic::nvvm_f2ull_rz: 292e8d8bef9SDimitry Andric return {Instruction::FPToUI}; 293e8d8bef9SDimitry Andric case Intrinsic::nvvm_i2d_rz: 294e8d8bef9SDimitry Andric case Intrinsic::nvvm_i2f_rz: 295e8d8bef9SDimitry Andric case Intrinsic::nvvm_ll2d_rz: 296e8d8bef9SDimitry Andric case Intrinsic::nvvm_ll2f_rz: 297e8d8bef9SDimitry Andric return {Instruction::SIToFP}; 298e8d8bef9SDimitry Andric case Intrinsic::nvvm_ui2d_rz: 299e8d8bef9SDimitry Andric case Intrinsic::nvvm_ui2f_rz: 300e8d8bef9SDimitry Andric case Intrinsic::nvvm_ull2d_rz: 301e8d8bef9SDimitry Andric case Intrinsic::nvvm_ull2f_rz: 302e8d8bef9SDimitry Andric return {Instruction::UIToFP}; 303e8d8bef9SDimitry Andric 304e8d8bef9SDimitry Andric // NVVM intrinsics that map to LLVM binary ops. 305e8d8bef9SDimitry Andric case Intrinsic::nvvm_div_rn_d: 306e8d8bef9SDimitry Andric return {Instruction::FDiv, FTZ_Any}; 307e8d8bef9SDimitry Andric 308e8d8bef9SDimitry Andric // The remainder of cases are NVVM intrinsics that map to LLVM idioms, but 309e8d8bef9SDimitry Andric // need special handling. 310e8d8bef9SDimitry Andric // 311e8d8bef9SDimitry Andric // We seem to be missing intrinsics for rcp.approx.{ftz.}f32, which is just 312e8d8bef9SDimitry Andric // as well. 313e8d8bef9SDimitry Andric case Intrinsic::nvvm_rcp_rn_d: 314e8d8bef9SDimitry Andric return {SPC_Reciprocal, FTZ_Any}; 315e8d8bef9SDimitry Andric 316e8d8bef9SDimitry Andric // We do not currently simplify intrinsics that give an approximate 317e8d8bef9SDimitry Andric // answer. These include: 318e8d8bef9SDimitry Andric // 319e8d8bef9SDimitry Andric // - nvvm_cos_approx_{f,ftz_f} 320e8d8bef9SDimitry Andric // - nvvm_ex2_approx_{d,f,ftz_f} 321e8d8bef9SDimitry Andric // - nvvm_lg2_approx_{d,f,ftz_f} 322e8d8bef9SDimitry Andric // - nvvm_sin_approx_{f,ftz_f} 323e8d8bef9SDimitry Andric // - nvvm_sqrt_approx_{f,ftz_f} 324e8d8bef9SDimitry Andric // - nvvm_rsqrt_approx_{d,f,ftz_f} 325e8d8bef9SDimitry Andric // - nvvm_div_approx_{ftz_d,ftz_f,f} 326e8d8bef9SDimitry Andric // - nvvm_rcp_approx_ftz_d 327e8d8bef9SDimitry Andric // 328e8d8bef9SDimitry Andric // Ideally we'd encode them as e.g. "fast call @llvm.cos", where "fast" 329e8d8bef9SDimitry Andric // means that fastmath is enabled in the intrinsic. Unfortunately only 330e8d8bef9SDimitry Andric // binary operators (currently) have a fastmath bit in SelectionDAG, so 331e8d8bef9SDimitry Andric // this information gets lost and we can't select on it. 332e8d8bef9SDimitry Andric // 333e8d8bef9SDimitry Andric // TODO: div and rcp are lowered to a binary op, so these we could in 334e8d8bef9SDimitry Andric // theory lower them to "fast fdiv". 335e8d8bef9SDimitry Andric 336e8d8bef9SDimitry Andric default: 337e8d8bef9SDimitry Andric return {}; 338e8d8bef9SDimitry Andric } 339e8d8bef9SDimitry Andric }(); 340e8d8bef9SDimitry Andric 341e8d8bef9SDimitry Andric // If Action.FtzRequirementTy is not satisfied by the module's ftz state, we 342e8d8bef9SDimitry Andric // can bail out now. (Notice that in the case that IID is not an NVVM 343e8d8bef9SDimitry Andric // intrinsic, we don't have to look up any module metadata, as 344e8d8bef9SDimitry Andric // FtzRequirementTy will be FTZ_Any.) 345e8d8bef9SDimitry Andric if (Action.FtzRequirement != FTZ_Any) { 346bdd1243dSDimitry Andric // FIXME: Broken for f64 347bdd1243dSDimitry Andric DenormalMode Mode = II->getFunction()->getDenormalMode( 348bdd1243dSDimitry Andric Action.IsHalfTy ? APFloat::IEEEhalf() : APFloat::IEEEsingle()); 349bdd1243dSDimitry Andric bool FtzEnabled = Mode.Output == DenormalMode::PreserveSign; 350e8d8bef9SDimitry Andric 351e8d8bef9SDimitry Andric if (FtzEnabled != (Action.FtzRequirement == FTZ_MustBeOn)) 352e8d8bef9SDimitry Andric return nullptr; 353e8d8bef9SDimitry Andric } 354e8d8bef9SDimitry Andric 355e8d8bef9SDimitry Andric // Simplify to target-generic intrinsic. 356e8d8bef9SDimitry Andric if (Action.IID) { 357349cc55cSDimitry Andric SmallVector<Value *, 4> Args(II->args()); 358e8d8bef9SDimitry Andric // All the target-generic intrinsics currently of interest to us have one 359e8d8bef9SDimitry Andric // type argument, equal to that of the nvvm intrinsic's argument. 360e8d8bef9SDimitry Andric Type *Tys[] = {II->getArgOperand(0)->getType()}; 361e8d8bef9SDimitry Andric return CallInst::Create( 362e8d8bef9SDimitry Andric Intrinsic::getDeclaration(II->getModule(), *Action.IID, Tys), Args); 363e8d8bef9SDimitry Andric } 364e8d8bef9SDimitry Andric 365e8d8bef9SDimitry Andric // Simplify to target-generic binary op. 366e8d8bef9SDimitry Andric if (Action.BinaryOp) 367e8d8bef9SDimitry Andric return BinaryOperator::Create(*Action.BinaryOp, II->getArgOperand(0), 368e8d8bef9SDimitry Andric II->getArgOperand(1), II->getName()); 369e8d8bef9SDimitry Andric 370e8d8bef9SDimitry Andric // Simplify to target-generic cast op. 371e8d8bef9SDimitry Andric if (Action.CastOp) 372e8d8bef9SDimitry Andric return CastInst::Create(*Action.CastOp, II->getArgOperand(0), II->getType(), 373e8d8bef9SDimitry Andric II->getName()); 374e8d8bef9SDimitry Andric 375e8d8bef9SDimitry Andric // All that's left are the special cases. 376e8d8bef9SDimitry Andric if (!Action.Special) 377e8d8bef9SDimitry Andric return nullptr; 378e8d8bef9SDimitry Andric 379e8d8bef9SDimitry Andric switch (*Action.Special) { 380e8d8bef9SDimitry Andric case SPC_Reciprocal: 381e8d8bef9SDimitry Andric // Simplify reciprocal. 382e8d8bef9SDimitry Andric return BinaryOperator::Create( 383e8d8bef9SDimitry Andric Instruction::FDiv, ConstantFP::get(II->getArgOperand(0)->getType(), 1), 384e8d8bef9SDimitry Andric II->getArgOperand(0), II->getName()); 385e8d8bef9SDimitry Andric } 386e8d8bef9SDimitry Andric llvm_unreachable("All SpecialCase enumerators should be handled in switch."); 387e8d8bef9SDimitry Andric } 388e8d8bef9SDimitry Andric 389bdd1243dSDimitry Andric std::optional<Instruction *> 390e8d8bef9SDimitry Andric NVPTXTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { 391e8d8bef9SDimitry Andric if (Instruction *I = simplifyNvvmIntrinsic(&II, IC)) { 392e8d8bef9SDimitry Andric return I; 393e8d8bef9SDimitry Andric } 394bdd1243dSDimitry Andric return std::nullopt; 395e8d8bef9SDimitry Andric } 396e8d8bef9SDimitry Andric 397fe6060f1SDimitry Andric InstructionCost NVPTXTTIImpl::getArithmeticInstrCost( 3985ffd83dbSDimitry Andric unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, 399bdd1243dSDimitry Andric TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, 400bdd1243dSDimitry Andric ArrayRef<const Value *> Args, 401480093f4SDimitry Andric const Instruction *CxtI) { 4020b57cec5SDimitry Andric // Legalize the type. 403bdd1243dSDimitry Andric std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); 4040b57cec5SDimitry Andric 4050b57cec5SDimitry Andric int ISD = TLI->InstructionOpcodeToISD(Opcode); 4060b57cec5SDimitry Andric 4070b57cec5SDimitry Andric switch (ISD) { 4080b57cec5SDimitry Andric default: 409bdd1243dSDimitry Andric return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, 410bdd1243dSDimitry Andric Op2Info); 4110b57cec5SDimitry Andric case ISD::ADD: 4120b57cec5SDimitry Andric case ISD::MUL: 4130b57cec5SDimitry Andric case ISD::XOR: 4140b57cec5SDimitry Andric case ISD::OR: 4150b57cec5SDimitry Andric case ISD::AND: 4160b57cec5SDimitry Andric // The machine code (SASS) simulates an i64 with two i32. Therefore, we 4170b57cec5SDimitry Andric // estimate that arithmetic operations on i64 are twice as expensive as 4180b57cec5SDimitry Andric // those on types that can fit into one machine register. 4190b57cec5SDimitry Andric if (LT.second.SimpleTy == MVT::i64) 4200b57cec5SDimitry Andric return 2 * LT.first; 4210b57cec5SDimitry Andric // Delegate other cases to the basic TTI. 422bdd1243dSDimitry Andric return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, 423bdd1243dSDimitry Andric Op2Info); 4240b57cec5SDimitry Andric } 4250b57cec5SDimitry Andric } 4260b57cec5SDimitry Andric 4270b57cec5SDimitry Andric void NVPTXTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, 428349cc55cSDimitry Andric TTI::UnrollingPreferences &UP, 429349cc55cSDimitry Andric OptimizationRemarkEmitter *ORE) { 430349cc55cSDimitry Andric BaseT::getUnrollingPreferences(L, SE, UP, ORE); 4310b57cec5SDimitry Andric 4320b57cec5SDimitry Andric // Enable partial unrolling and runtime unrolling, but reduce the 4330b57cec5SDimitry Andric // threshold. This partially unrolls small loops which are often 4340b57cec5SDimitry Andric // unrolled by the PTX to SASS compiler and unrolling earlier can be 4350b57cec5SDimitry Andric // beneficial. 4360b57cec5SDimitry Andric UP.Partial = UP.Runtime = true; 4370b57cec5SDimitry Andric UP.PartialThreshold = UP.Threshold / 4; 4380b57cec5SDimitry Andric } 4395ffd83dbSDimitry Andric 4405ffd83dbSDimitry Andric void NVPTXTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, 4415ffd83dbSDimitry Andric TTI::PeelingPreferences &PP) { 4425ffd83dbSDimitry Andric BaseT::getPeelingPreferences(L, SE, PP); 4435ffd83dbSDimitry Andric } 444