//===-- NVPTXTargetTransformInfo.cpp - NVPTX specific TTI -----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "NVPTXTargetTransformInfo.h"
#include "NVPTXUtilities.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/Support/Debug.h"

using namespace llvm;

#define DEBUG_TYPE "NVPTXtti"

// Whether the given intrinsic reads threadIdx.x/y/z.
static bool readsThreadIndex(const IntrinsicInst *II) {
  switch (II->getIntrinsicID()) {
  default:
    return false;
  case Intrinsic::nvvm_read_ptx_sreg_tid_x:
  case Intrinsic::nvvm_read_ptx_sreg_tid_y:
  case Intrinsic::nvvm_read_ptx_sreg_tid_z:
    return true;
  }
}

static bool readsLaneId(const IntrinsicInst *II) {
  return II->getIntrinsicID() == Intrinsic::nvvm_read_ptx_sreg_laneid;
}

// Whether the given intrinsic is an atomic instruction in PTX.
static bool isNVVMAtomic(const IntrinsicInst *II) {
  switch (II->getIntrinsicID()) {
  default:
    return false;
  case Intrinsic::nvvm_atomic_load_inc_32:
  case Intrinsic::nvvm_atomic_load_dec_32:

  case Intrinsic::nvvm_atomic_add_gen_f_cta:
  case Intrinsic::nvvm_atomic_add_gen_f_sys:
  case Intrinsic::nvvm_atomic_add_gen_i_cta:
  case Intrinsic::nvvm_atomic_add_gen_i_sys:
  case Intrinsic::nvvm_atomic_and_gen_i_cta:
  case Intrinsic::nvvm_atomic_and_gen_i_sys:
  case Intrinsic::nvvm_atomic_cas_gen_i_cta:
  case Intrinsic::nvvm_atomic_cas_gen_i_sys:
  case Intrinsic::nvvm_atomic_dec_gen_i_cta:
  case Intrinsic::nvvm_atomic_dec_gen_i_sys:
  case Intrinsic::nvvm_atomic_inc_gen_i_cta:
  case Intrinsic::nvvm_atomic_inc_gen_i_sys:
  case Intrinsic::nvvm_atomic_max_gen_i_cta:
  case Intrinsic::nvvm_atomic_max_gen_i_sys:
  case Intrinsic::nvvm_atomic_min_gen_i_cta:
  case Intrinsic::nvvm_atomic_min_gen_i_sys:
  case Intrinsic::nvvm_atomic_or_gen_i_cta:
  case Intrinsic::nvvm_atomic_or_gen_i_sys:
  case Intrinsic::nvvm_atomic_exch_gen_i_cta:
  case Intrinsic::nvvm_atomic_exch_gen_i_sys:
  case Intrinsic::nvvm_atomic_xor_gen_i_cta:
  case Intrinsic::nvvm_atomic_xor_gen_i_sys:
    return true;
  }
}

bool NVPTXTTIImpl::isSourceOfDivergence(const Value *V) {
  // Without inter-procedural analysis, we conservatively assume that arguments
  // to __device__ functions are divergent.
  if (const Argument *Arg = dyn_cast<Argument>(V))
    return !isKernelFunction(*Arg->getParent());

  if (const Instruction *I = dyn_cast<Instruction>(V)) {
    // Without pointer analysis, we conservatively assume values loaded from
    // generic or local address space are divergent.
    if (const LoadInst *LI = dyn_cast<LoadInst>(I)) {
      unsigned AS = LI->getPointerAddressSpace();
      return AS == ADDRESS_SPACE_GENERIC || AS == ADDRESS_SPACE_LOCAL;
    }
    // Atomic instructions may cause divergence. Atomic instructions are
    // executed sequentially across all threads in a warp. Therefore, an
    // earlier executed thread may see different memory inputs than a later
    // executed thread. For example, suppose *a = 0 initially.
    //
    //   atom.global.add.s32 d, [a], 1
    //
    // returns 0 for the first thread that enters the critical region, and 1
    // for the second thread.
    if (I->isAtomic())
      return true;
    if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
      // Instructions that read threadIdx are obviously divergent.
      if (readsThreadIndex(II) || readsLaneId(II))
        return true;
      // Handle the NVPTX atomic intrinsics that cannot be represented as an
      // atomic IR instruction.
      if (isNVVMAtomic(II))
        return true;
    }
    // Conservatively consider the return value of function calls as divergent.
    // We could analyze callees with bodies more precisely using
    // inter-procedural analysis.
    if (isa<CallInst>(I))
      return true;
  }

  return false;
}

int NVPTXTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info,
    TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
    const Instruction *CxtI) {
  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  switch (ISD) {
  default:
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
                                         Opd2Info, Opd1PropInfo, Opd2PropInfo);
  case ISD::ADD:
  case ISD::MUL:
  case ISD::XOR:
  case ISD::OR:
  case ISD::AND:
    // The machine code (SASS) simulates an i64 with two i32. Therefore, we
    // estimate that arithmetic operations on i64 are twice as expensive as
    // those on types that can fit into one machine register.
    if (LT.second.SimpleTy == MVT::i64)
      return 2 * LT.first;
    // Delegate other cases to the basic TTI.
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
                                         Opd2Info, Opd1PropInfo, Opd2PropInfo);
  }
}

void NVPTXTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                           TTI::UnrollingPreferences &UP) {
  BaseT::getUnrollingPreferences(L, SE, UP);

  // Enable partial unrolling and runtime unrolling, but reduce the
  // threshold. This partially unrolls small loops which are often
  // unrolled by the PTX to SASS compiler and unrolling earlier can be
  // beneficial.
  UP.Partial = UP.Runtime = true;
  UP.PartialThreshold = UP.Threshold / 4;
}

void NVPTXTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::PeelingPreferences &PP) {
  BaseT::getPeelingPreferences(L, SE, PP);
}