//===-- NVPTXTargetTransformInfo.cpp - NVPTX specific TTI ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "NVPTXTargetTransformInfo.h"
#include "NVPTXUtilities.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/Support/Debug.h"
using namespace llvm;

#define DEBUG_TYPE "NVPTXtti"

// Whether the given intrinsic reads threadIdx.x/y/z.
static bool readsThreadIndex(const IntrinsicInst *II) {
  switch (II->getIntrinsicID()) {
    default: return false;
    case Intrinsic::nvvm_read_ptx_sreg_tid_x:
    case Intrinsic::nvvm_read_ptx_sreg_tid_y:
    case Intrinsic::nvvm_read_ptx_sreg_tid_z:
      return true;
  }
}

static bool readsLaneId(const IntrinsicInst *II) {
  return II->getIntrinsicID() == Intrinsic::nvvm_read_ptx_sreg_laneid;
}

// Whether the given intrinsic is an atomic instruction in PTX.
static bool isNVVMAtomic(const IntrinsicInst *II) {
  switch (II->getIntrinsicID()) {
    default: return false;
    case Intrinsic::nvvm_atomic_load_inc_32:
    case Intrinsic::nvvm_atomic_load_dec_32:

    case Intrinsic::nvvm_atomic_add_gen_f_cta:
    case Intrinsic::nvvm_atomic_add_gen_f_sys:
    case Intrinsic::nvvm_atomic_add_gen_i_cta:
    case Intrinsic::nvvm_atomic_add_gen_i_sys:
    case Intrinsic::nvvm_atomic_and_gen_i_cta:
    case Intrinsic::nvvm_atomic_and_gen_i_sys:
    case Intrinsic::nvvm_atomic_cas_gen_i_cta:
    case Intrinsic::nvvm_atomic_cas_gen_i_sys:
    case Intrinsic::nvvm_atomic_dec_gen_i_cta:
    case Intrinsic::nvvm_atomic_dec_gen_i_sys:
    case Intrinsic::nvvm_atomic_inc_gen_i_cta:
    case Intrinsic::nvvm_atomic_inc_gen_i_sys:
    case Intrinsic::nvvm_atomic_max_gen_i_cta:
    case Intrinsic::nvvm_atomic_max_gen_i_sys:
    case Intrinsic::nvvm_atomic_min_gen_i_cta:
    case Intrinsic::nvvm_atomic_min_gen_i_sys:
    case Intrinsic::nvvm_atomic_or_gen_i_cta:
    case Intrinsic::nvvm_atomic_or_gen_i_sys:
    case Intrinsic::nvvm_atomic_exch_gen_i_cta:
    case Intrinsic::nvvm_atomic_exch_gen_i_sys:
    case Intrinsic::nvvm_atomic_xor_gen_i_cta:
    case Intrinsic::nvvm_atomic_xor_gen_i_sys:
      return true;
  }
}

bool NVPTXTTIImpl::isSourceOfDivergence(const Value *V) {
  // Without inter-procedural analysis, we conservatively assume that arguments
  // to __device__ functions are divergent.
  if (const Argument *Arg = dyn_cast<Argument>(V))
    return !isKernelFunction(*Arg->getParent());

  if (const Instruction *I = dyn_cast<Instruction>(V)) {
    // Without pointer analysis, we conservatively assume values loaded from
    // generic or local address space are divergent.
    if (const LoadInst *LI = dyn_cast<LoadInst>(I)) {
      unsigned AS = LI->getPointerAddressSpace();
      return AS == ADDRESS_SPACE_GENERIC || AS == ADDRESS_SPACE_LOCAL;
    }
    // Atomic instructions may cause divergence. Atomic instructions are
    // executed sequentially across all threads in a warp. Therefore, an
    // earlier executed thread may see different memory inputs than a later
    // executed thread. For example, suppose *a = 0 initially.
    //
    //   atom.global.add.s32 d, [a], 1
    //
    // returns 0 for the first thread that enters the critical region, and 1
    // for the second thread.
    if (I->isAtomic())
      return true;
    if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
      // Instructions that read threadIdx are obviously divergent.
      if (readsThreadIndex(II) || readsLaneId(II))
        return true;
      // Handle the NVPTX atomic intrinsics that cannot be represented as an
      // atomic IR instruction.
      if (isNVVMAtomic(II))
        return true;
    }
    // Conservatively consider the return value of function calls as divergent.
    // We could analyze callees with bodies more precisely using
    // inter-procedural analysis.
    if (isa<CallInst>(I))
      return true;
  }

  return false;
}

// Convert NVVM intrinsics to target-generic LLVM code where possible.
static Instruction *simplifyNvvmIntrinsic(IntrinsicInst *II,
                                          InstCombiner &IC) {
  // Each NVVM intrinsic we can simplify can be replaced with one of:
  //
  //  * an LLVM intrinsic,
  //  * an LLVM cast operation,
  //  * an LLVM binary operation, or
  //  * ad-hoc LLVM IR for the particular operation.

  // Some transformations are only valid when the module's
  // flush-denormals-to-zero (ftz) setting is true/false, whereas other
  // transformations are valid regardless of the module's ftz setting.
  enum FtzRequirementTy {
    FTZ_Any,       // Any ftz setting is ok.
    FTZ_MustBeOn,  // Transformation is valid only if ftz is on.
    FTZ_MustBeOff, // Transformation is valid only if ftz is off.
  };
  // Classes of NVVM intrinsics that can't be replaced one-to-one with a
  // target-generic intrinsic, cast op, or binary op but that we can
  // nonetheless simplify.
  enum SpecialCase {
    SPC_Reciprocal,
  };

  // SimplifyAction is a poor-man's variant (plus an additional flag) that
  // represents how to replace an NVVM intrinsic with target-generic LLVM IR.
  struct SimplifyAction {
    // Invariant: At most one of these Optionals has a value.
    Optional<Intrinsic::ID> IID;
    Optional<Instruction::CastOps> CastOp;
    Optional<Instruction::BinaryOps> BinaryOp;
    Optional<SpecialCase> Special;

    FtzRequirementTy FtzRequirement = FTZ_Any;
    // Denormal handling is guarded by different attributes depending on the
    // type (denormal-fp-math vs denormal-fp-math-f32), take note of halfs.
    bool IsHalfTy = false;

    SimplifyAction() = default;

    SimplifyAction(Intrinsic::ID IID, FtzRequirementTy FtzReq,
                   bool IsHalfTy = false)
        : IID(IID), FtzRequirement(FtzReq), IsHalfTy(IsHalfTy) {}

    // Cast operations don't have anything to do with FTZ, so we skip that
    // argument.
    SimplifyAction(Instruction::CastOps CastOp) : CastOp(CastOp) {}

    SimplifyAction(Instruction::BinaryOps BinaryOp, FtzRequirementTy FtzReq)
        : BinaryOp(BinaryOp), FtzRequirement(FtzReq) {}

    SimplifyAction(SpecialCase Special, FtzRequirementTy FtzReq)
        : Special(Special), FtzRequirement(FtzReq) {}
  };

  // Try to generate a SimplifyAction describing how to replace our
  // IntrinsicInstr with target-generic LLVM IR.
  const SimplifyAction Action = [II]() -> SimplifyAction {
    switch (II->getIntrinsicID()) {
    // NVVM intrinsics that map directly to LLVM intrinsics.
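    // (For example, a call to @llvm.nvvm.ceil.d is rewritten below into a
    // call to the target-generic @llvm.ceil intrinsic on the same double
    // operand.)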
    case Intrinsic::nvvm_ceil_d:
      return {Intrinsic::ceil, FTZ_Any};
    case Intrinsic::nvvm_ceil_f:
      return {Intrinsic::ceil, FTZ_MustBeOff};
    case Intrinsic::nvvm_ceil_ftz_f:
      return {Intrinsic::ceil, FTZ_MustBeOn};
    case Intrinsic::nvvm_fabs_d:
      return {Intrinsic::fabs, FTZ_Any};
    case Intrinsic::nvvm_fabs_f:
      return {Intrinsic::fabs, FTZ_MustBeOff};
    case Intrinsic::nvvm_fabs_ftz_f:
      return {Intrinsic::fabs, FTZ_MustBeOn};
    case Intrinsic::nvvm_floor_d:
      return {Intrinsic::floor, FTZ_Any};
    case Intrinsic::nvvm_floor_f:
      return {Intrinsic::floor, FTZ_MustBeOff};
    case Intrinsic::nvvm_floor_ftz_f:
      return {Intrinsic::floor, FTZ_MustBeOn};
    case Intrinsic::nvvm_fma_rn_d:
      return {Intrinsic::fma, FTZ_Any};
    case Intrinsic::nvvm_fma_rn_f:
      return {Intrinsic::fma, FTZ_MustBeOff};
    case Intrinsic::nvvm_fma_rn_ftz_f:
      return {Intrinsic::fma, FTZ_MustBeOn};
    case Intrinsic::nvvm_fma_rn_f16:
      return {Intrinsic::fma, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fma_rn_ftz_f16:
      return {Intrinsic::fma, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fma_rn_f16x2:
      return {Intrinsic::fma, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fma_rn_ftz_f16x2:
      return {Intrinsic::fma, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fmax_d:
      return {Intrinsic::maxnum, FTZ_Any};
    case Intrinsic::nvvm_fmax_f:
      return {Intrinsic::maxnum, FTZ_MustBeOff};
    case Intrinsic::nvvm_fmax_ftz_f:
      return {Intrinsic::maxnum, FTZ_MustBeOn};
    case Intrinsic::nvvm_fmax_nan_f:
      return {Intrinsic::maximum, FTZ_MustBeOff};
    case Intrinsic::nvvm_fmax_ftz_nan_f:
      return {Intrinsic::maximum, FTZ_MustBeOn};
    case Intrinsic::nvvm_fmax_f16:
      return {Intrinsic::maxnum, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fmax_ftz_f16:
      return {Intrinsic::maxnum, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fmax_f16x2:
      return {Intrinsic::maxnum, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fmax_ftz_f16x2:
      return {Intrinsic::maxnum, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fmax_nan_f16:
      return {Intrinsic::maximum, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fmax_ftz_nan_f16:
      return {Intrinsic::maximum, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fmax_nan_f16x2:
      return {Intrinsic::maximum, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fmax_ftz_nan_f16x2:
      return {Intrinsic::maximum, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fmin_d:
      return {Intrinsic::minnum, FTZ_Any};
    case Intrinsic::nvvm_fmin_f:
      return {Intrinsic::minnum, FTZ_MustBeOff};
    case Intrinsic::nvvm_fmin_ftz_f:
      return {Intrinsic::minnum, FTZ_MustBeOn};
    case Intrinsic::nvvm_fmin_nan_f:
      return {Intrinsic::minimum, FTZ_MustBeOff};
    case Intrinsic::nvvm_fmin_ftz_nan_f:
      return {Intrinsic::minimum, FTZ_MustBeOn};
    case Intrinsic::nvvm_fmin_f16:
      return {Intrinsic::minnum, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fmin_ftz_f16:
      return {Intrinsic::minnum, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fmin_f16x2:
      return {Intrinsic::minnum, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fmin_ftz_f16x2:
      return {Intrinsic::minnum, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fmin_nan_f16:
      return {Intrinsic::minimum, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fmin_ftz_nan_f16:
      return {Intrinsic::minimum, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fmin_nan_f16x2:
      return {Intrinsic::minimum, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fmin_ftz_nan_f16x2:
      return {Intrinsic::minimum, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_round_d:
      return {Intrinsic::round, FTZ_Any};
    case Intrinsic::nvvm_round_f:
      return {Intrinsic::round, FTZ_MustBeOff};
    case Intrinsic::nvvm_round_ftz_f:
      return {Intrinsic::round, FTZ_MustBeOn};
    case Intrinsic::nvvm_sqrt_rn_d:
      return {Intrinsic::sqrt, FTZ_Any};
    case Intrinsic::nvvm_sqrt_f:
      // nvvm_sqrt_f is a special case. For most intrinsics, foo_ftz_f is the
      // ftz version, and foo_f is the non-ftz version. But nvvm_sqrt_f adopts
      // the ftz-ness of the surrounding code. sqrt_rn_f and sqrt_rn_ftz_f are
      // the versions with explicit ftz-ness.
      return {Intrinsic::sqrt, FTZ_Any};
    case Intrinsic::nvvm_sqrt_rn_f:
      return {Intrinsic::sqrt, FTZ_MustBeOff};
    case Intrinsic::nvvm_sqrt_rn_ftz_f:
      return {Intrinsic::sqrt, FTZ_MustBeOn};
    case Intrinsic::nvvm_trunc_d:
      return {Intrinsic::trunc, FTZ_Any};
    case Intrinsic::nvvm_trunc_f:
      return {Intrinsic::trunc, FTZ_MustBeOff};
    case Intrinsic::nvvm_trunc_ftz_f:
      return {Intrinsic::trunc, FTZ_MustBeOn};

    // NVVM intrinsics that map to LLVM cast operations.
    //
    // Note that llvm's target-generic conversion operators correspond to the
    // rz (round to zero) versions of the nvvm conversion intrinsics, even
    // though most everything else here uses the rn (round to nearest even)
    // nvvm ops.
    case Intrinsic::nvvm_d2i_rz:
    case Intrinsic::nvvm_f2i_rz:
    case Intrinsic::nvvm_d2ll_rz:
    case Intrinsic::nvvm_f2ll_rz:
      return {Instruction::FPToSI};
    case Intrinsic::nvvm_d2ui_rz:
    case Intrinsic::nvvm_f2ui_rz:
    case Intrinsic::nvvm_d2ull_rz:
    case Intrinsic::nvvm_f2ull_rz:
      return {Instruction::FPToUI};
    case Intrinsic::nvvm_i2d_rz:
    case Intrinsic::nvvm_i2f_rz:
    case Intrinsic::nvvm_ll2d_rz:
    case Intrinsic::nvvm_ll2f_rz:
      return {Instruction::SIToFP};
    case Intrinsic::nvvm_ui2d_rz:
    case Intrinsic::nvvm_ui2f_rz:
    case Intrinsic::nvvm_ull2d_rz:
    case Intrinsic::nvvm_ull2f_rz:
      return {Instruction::UIToFP};

    // NVVM intrinsics that map to LLVM binary ops.
    case Intrinsic::nvvm_add_rn_d:
      return {Instruction::FAdd, FTZ_Any};
    case Intrinsic::nvvm_add_rn_f:
      return {Instruction::FAdd, FTZ_MustBeOff};
    case Intrinsic::nvvm_add_rn_ftz_f:
      return {Instruction::FAdd, FTZ_MustBeOn};
    case Intrinsic::nvvm_mul_rn_d:
      return {Instruction::FMul, FTZ_Any};
    case Intrinsic::nvvm_mul_rn_f:
      return {Instruction::FMul, FTZ_MustBeOff};
    case Intrinsic::nvvm_mul_rn_ftz_f:
      return {Instruction::FMul, FTZ_MustBeOn};
    case Intrinsic::nvvm_div_rn_d:
      return {Instruction::FDiv, FTZ_Any};
    case Intrinsic::nvvm_div_rn_f:
      return {Instruction::FDiv, FTZ_MustBeOff};
    case Intrinsic::nvvm_div_rn_ftz_f:
      return {Instruction::FDiv, FTZ_MustBeOn};

    // The remainder of cases are NVVM intrinsics that map to LLVM idioms, but
    // need special handling.
    //
    // We seem to be missing intrinsics for rcp.approx.{ftz.}f32, which is just
    // as well.
    case Intrinsic::nvvm_rcp_rn_d:
      return {SPC_Reciprocal, FTZ_Any};
    case Intrinsic::nvvm_rcp_rn_f:
      return {SPC_Reciprocal, FTZ_MustBeOff};
    case Intrinsic::nvvm_rcp_rn_ftz_f:
      return {SPC_Reciprocal, FTZ_MustBeOn};

    // We do not currently simplify intrinsics that give an approximate
    // answer. These include:
    //
    //   - nvvm_cos_approx_{f,ftz_f}
    //   - nvvm_ex2_approx_{d,f,ftz_f}
    //   - nvvm_lg2_approx_{d,f,ftz_f}
    //   - nvvm_sin_approx_{f,ftz_f}
    //   - nvvm_sqrt_approx_{f,ftz_f}
    //   - nvvm_rsqrt_approx_{d,f,ftz_f}
    //   - nvvm_div_approx_{ftz_d,ftz_f,f}
    //   - nvvm_rcp_approx_ftz_d
    //
    // Ideally we'd encode them as e.g. "fast call @llvm.cos", where "fast"
    // means that fastmath is enabled in the intrinsic. Unfortunately only
    // binary operators (currently) have a fastmath bit in SelectionDAG, so
    // this information gets lost and we can't select on it.
    //
    // TODO: div and rcp are lowered to a binary op, so we could in theory
    // lower them to "fast fdiv".

    default:
      return {};
    }
  }();

  // If Action.FtzRequirementTy is not satisfied by the module's ftz state, we
  // can bail out now. (Notice that in the case that IID is not an NVVM
  // intrinsic, we don't have to look up any module metadata, as
  // FtzRequirementTy will be FTZ_Any.)
  if (Action.FtzRequirement != FTZ_Any) {
    const char *AttrName =
        Action.IsHalfTy ? "denormal-fp-math" : "denormal-fp-math-f32";
    StringRef Attr =
        II->getFunction()->getFnAttribute(AttrName).getValueAsString();
    DenormalMode Mode = parseDenormalFPAttribute(Attr);
    bool FtzEnabled = Mode.Output != DenormalMode::IEEE;

    if (FtzEnabled != (Action.FtzRequirement == FTZ_MustBeOn))
      return nullptr;
  }

  // Simplify to target-generic intrinsic.
  if (Action.IID) {
    SmallVector<Value *, 4> Args(II->args());
    // All the target-generic intrinsics currently of interest to us have one
    // type argument, equal to that of the nvvm intrinsic's argument.
    Type *Tys[] = {II->getArgOperand(0)->getType()};
    return CallInst::Create(
        Intrinsic::getDeclaration(II->getModule(), *Action.IID, Tys), Args);
  }

  // Simplify to target-generic binary op.
  if (Action.BinaryOp)
    return BinaryOperator::Create(*Action.BinaryOp, II->getArgOperand(0),
                                  II->getArgOperand(1), II->getName());

  // Simplify to target-generic cast op.
  if (Action.CastOp)
    return CastInst::Create(*Action.CastOp, II->getArgOperand(0),
                            II->getType(), II->getName());

  // All that's left are the special cases.
  if (!Action.Special)
    return nullptr;

  switch (*Action.Special) {
  case SPC_Reciprocal:
    // Simplify reciprocal.
    return BinaryOperator::Create(
        Instruction::FDiv, ConstantFP::get(II->getArgOperand(0)->getType(), 1),
        II->getArgOperand(0), II->getName());
  }
  llvm_unreachable("All SpecialCase enumerators should be handled in switch.");
}

Optional<Instruction *>
NVPTXTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  if (Instruction *I = simplifyNvvmIntrinsic(&II, IC)) {
    return I;
  }
  return None;
}

InstructionCost NVPTXTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info,
    TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
    const Instruction *CxtI) {
  // Legalize the type.
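  // getTypeLegalizationCost returns the number of legal-typed pieces the type
  // splits into (LT.first, 1 if Ty is already legal) and the legalized value
  // type itself (LT.second).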
  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  switch (ISD) {
  default:
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
                                         Opd2Info, Opd1PropInfo, Opd2PropInfo);
  case ISD::ADD:
  case ISD::MUL:
  case ISD::XOR:
  case ISD::OR:
  case ISD::AND:
    // The machine code (SASS) simulates an i64 with two i32. Therefore, we
    // estimate that arithmetic operations on i64 are twice as expensive as
    // those on types that can fit into one machine register.
    if (LT.second.SimpleTy == MVT::i64)
      return 2 * LT.first;
    // Delegate other cases to the basic TTI.
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
                                         Opd2Info, Opd1PropInfo, Opd2PropInfo);
  }
}

void NVPTXTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                           TTI::UnrollingPreferences &UP,
                                           OptimizationRemarkEmitter *ORE) {
  BaseT::getUnrollingPreferences(L, SE, UP, ORE);

  // Enable partial unrolling and runtime unrolling, but reduce the
  // threshold. This partially unrolls small loops which are often
  // unrolled by the PTX to SASS compiler and unrolling earlier can be
  // beneficial.
  UP.Partial = UP.Runtime = true;
  UP.PartialThreshold = UP.Threshold / 4;
}

void NVPTXTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::PeelingPreferences &PP) {
  BaseT::getPeelingPreferences(L, SE, PP);
}