1 //===- HexagonTargetTransformInfo.cpp - Hexagon specific TTI pass ---------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 /// \file 8 /// This file implements a TargetTransformInfo analysis pass specific to the 9 /// Hexagon target machine. It uses the target's detailed information to provide 10 /// more precise answers to certain TTI queries, while letting the target 11 /// independent and default TTI implementations handle the rest. 12 /// 13 //===----------------------------------------------------------------------===// 14 15 #include "HexagonTargetTransformInfo.h" 16 #include "HexagonSubtarget.h" 17 #include "llvm/Analysis/TargetTransformInfo.h" 18 #include "llvm/CodeGen/ValueTypes.h" 19 #include "llvm/IR/InstrTypes.h" 20 #include "llvm/IR/Instructions.h" 21 #include "llvm/IR/User.h" 22 #include "llvm/Support/Casting.h" 23 #include "llvm/Support/CommandLine.h" 24 #include "llvm/Transforms/Utils/LoopPeel.h" 25 #include "llvm/Transforms/Utils/UnrollLoop.h" 26 27 using namespace llvm; 28 29 #define DEBUG_TYPE "hexagontti" 30 31 static cl::opt<bool> HexagonAutoHVX("hexagon-autohvx", cl::init(false), 32 cl::Hidden, cl::desc("Enable loop vectorizer for HVX")); 33 34 static cl::opt<bool> EnableV68FloatAutoHVX( 35 "force-hvx-float", cl::Hidden, 36 cl::desc("Enable auto-vectorization of floatint point types on v68.")); 37 38 static cl::opt<bool> EmitLookupTables("hexagon-emit-lookup-tables", 39 cl::init(true), cl::Hidden, 40 cl::desc("Control lookup table emission on Hexagon target")); 41 42 static cl::opt<bool> HexagonMaskedVMem("hexagon-masked-vmem", cl::init(true), 43 cl::Hidden, cl::desc("Enable masked loads/stores for HVX")); 44 45 // Constant "cost factor" to make floating point operations more expensive 46 // in terms of vectorization cost. This isn't the best way, but it should 47 // do. Ultimately, the cost should use cycles. 48 static const unsigned FloatFactor = 4; 49 50 bool HexagonTTIImpl::useHVX() const { 51 return ST.useHVXOps() && HexagonAutoHVX; 52 } 53 54 bool HexagonTTIImpl::isHVXVectorType(Type *Ty) const { 55 auto *VecTy = dyn_cast<VectorType>(Ty); 56 if (!VecTy) 57 return false; 58 if (!ST.isTypeForHVX(VecTy)) 59 return false; 60 if (ST.useHVXV69Ops() || !VecTy->getElementType()->isFloatingPointTy()) 61 return true; 62 return ST.useHVXV68Ops() && EnableV68FloatAutoHVX; 63 } 64 65 unsigned HexagonTTIImpl::getTypeNumElements(Type *Ty) const { 66 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) 67 return VTy->getNumElements(); 68 assert((Ty->isIntegerTy() || Ty->isFloatingPointTy()) && 69 "Expecting scalar type"); 70 return 1; 71 } 72 73 TargetTransformInfo::PopcntSupportKind 74 HexagonTTIImpl::getPopcntSupport(unsigned IntTyWidthInBit) const { 75 // Return fast hardware support as every input < 64 bits will be promoted 76 // to 64 bits. 77 return TargetTransformInfo::PSK_FastHardware; 78 } 79 80 // The Hexagon target can unroll loops with run-time trip counts. 81 void HexagonTTIImpl::getUnrollingPreferences( 82 Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, 83 OptimizationRemarkEmitter *ORE) const { 84 UP.Runtime = UP.Partial = true; 85 } 86 87 void HexagonTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, 88 TTI::PeelingPreferences &PP) const { 89 BaseT::getPeelingPreferences(L, SE, PP); 90 // Only try to peel innermost loops with small runtime trip counts. 91 if (L && L->isInnermost() && canPeel(L) && 92 SE.getSmallConstantTripCount(L) == 0 && 93 SE.getSmallConstantMaxTripCount(L) > 0 && 94 SE.getSmallConstantMaxTripCount(L) <= 5) { 95 PP.PeelCount = 2; 96 } 97 } 98 99 TTI::AddressingModeKind 100 HexagonTTIImpl::getPreferredAddressingMode(const Loop *L, 101 ScalarEvolution *SE) const { 102 return TTI::AMK_PostIndexed; 103 } 104 105 /// --- Vector TTI begin --- 106 107 unsigned HexagonTTIImpl::getNumberOfRegisters(unsigned ClassID) const { 108 bool Vector = ClassID == 1; 109 if (Vector) 110 return useHVX() ? 32 : 0; 111 return 32; 112 } 113 114 unsigned HexagonTTIImpl::getMaxInterleaveFactor(ElementCount VF) const { 115 return useHVX() ? 2 : 1; 116 } 117 118 TypeSize 119 HexagonTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { 120 switch (K) { 121 case TargetTransformInfo::RGK_Scalar: 122 return TypeSize::getFixed(32); 123 case TargetTransformInfo::RGK_FixedWidthVector: 124 return TypeSize::getFixed(getMinVectorRegisterBitWidth()); 125 case TargetTransformInfo::RGK_ScalableVector: 126 return TypeSize::getScalable(0); 127 } 128 129 llvm_unreachable("Unsupported register kind"); 130 } 131 132 unsigned HexagonTTIImpl::getMinVectorRegisterBitWidth() const { 133 return useHVX() ? ST.getVectorLength()*8 : 32; 134 } 135 136 ElementCount HexagonTTIImpl::getMinimumVF(unsigned ElemWidth, 137 bool IsScalable) const { 138 assert(!IsScalable && "Scalable VFs are not supported for Hexagon"); 139 return ElementCount::getFixed((8 * ST.getVectorLength()) / ElemWidth); 140 } 141 142 InstructionCost 143 HexagonTTIImpl::getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type *> Tys, 144 TTI::TargetCostKind CostKind) const { 145 return BaseT::getCallInstrCost(F, RetTy, Tys, CostKind); 146 } 147 148 InstructionCost 149 HexagonTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, 150 TTI::TargetCostKind CostKind) const { 151 if (ICA.getID() == Intrinsic::bswap) { 152 std::pair<InstructionCost, MVT> LT = 153 getTypeLegalizationCost(ICA.getReturnType()); 154 return LT.first + 2; 155 } 156 return BaseT::getIntrinsicInstrCost(ICA, CostKind); 157 } 158 159 InstructionCost HexagonTTIImpl::getAddressComputationCost(Type *Tp, 160 ScalarEvolution *SE, 161 const SCEV *S) const { 162 return 0; 163 } 164 165 InstructionCost HexagonTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, 166 Align Alignment, 167 unsigned AddressSpace, 168 TTI::TargetCostKind CostKind, 169 TTI::OperandValueInfo OpInfo, 170 const Instruction *I) const { 171 assert(Opcode == Instruction::Load || Opcode == Instruction::Store); 172 // TODO: Handle other cost kinds. 173 if (CostKind != TTI::TCK_RecipThroughput) 174 return 1; 175 176 if (Opcode == Instruction::Store) 177 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, 178 CostKind, OpInfo, I); 179 180 if (Src->isVectorTy()) { 181 VectorType *VecTy = cast<VectorType>(Src); 182 unsigned VecWidth = VecTy->getPrimitiveSizeInBits().getFixedValue(); 183 if (isHVXVectorType(VecTy)) { 184 unsigned RegWidth = 185 getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) 186 .getFixedValue(); 187 assert(RegWidth && "Non-zero vector register width expected"); 188 // Cost of HVX loads. 189 if (VecWidth % RegWidth == 0) 190 return VecWidth / RegWidth; 191 // Cost of constructing HVX vector from scalar loads 192 const Align RegAlign(RegWidth / 8); 193 if (Alignment > RegAlign) 194 Alignment = RegAlign; 195 unsigned AlignWidth = 8 * Alignment.value(); 196 unsigned NumLoads = alignTo(VecWidth, AlignWidth) / AlignWidth; 197 return 3 * NumLoads; 198 } 199 200 // Non-HVX vectors. 201 // Add extra cost for floating point types. 202 unsigned Cost = 203 VecTy->getElementType()->isFloatingPointTy() ? FloatFactor : 1; 204 205 // At this point unspecified alignment is considered as Align(1). 206 const Align BoundAlignment = std::min(Alignment, Align(8)); 207 unsigned AlignWidth = 8 * BoundAlignment.value(); 208 unsigned NumLoads = alignTo(VecWidth, AlignWidth) / AlignWidth; 209 if (Alignment == Align(4) || Alignment == Align(8)) 210 return Cost * NumLoads; 211 // Loads of less than 32 bits will need extra inserts to compose a vector. 212 assert(BoundAlignment <= Align(8)); 213 unsigned LogA = Log2(BoundAlignment); 214 return (3 - LogA) * Cost * NumLoads; 215 } 216 217 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind, 218 OpInfo, I); 219 } 220 221 InstructionCost 222 HexagonTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, 223 Align Alignment, unsigned AddressSpace, 224 TTI::TargetCostKind CostKind) const { 225 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace, 226 CostKind); 227 } 228 229 InstructionCost 230 HexagonTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, 231 VectorType *SrcTy, ArrayRef<int> Mask, 232 TTI::TargetCostKind CostKind, int Index, 233 VectorType *SubTp, ArrayRef<const Value *> Args, 234 const Instruction *CxtI) const { 235 return 1; 236 } 237 238 InstructionCost HexagonTTIImpl::getGatherScatterOpCost( 239 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, 240 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) const { 241 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, 242 Alignment, CostKind, I); 243 } 244 245 InstructionCost HexagonTTIImpl::getInterleavedMemoryOpCost( 246 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, 247 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, 248 bool UseMaskForCond, bool UseMaskForGaps) const { 249 if (Indices.size() != Factor || UseMaskForCond || UseMaskForGaps) 250 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, 251 Alignment, AddressSpace, 252 CostKind, 253 UseMaskForCond, UseMaskForGaps); 254 return getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind); 255 } 256 257 InstructionCost HexagonTTIImpl::getCmpSelInstrCost( 258 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, 259 TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info, 260 TTI::OperandValueInfo Op2Info, const Instruction *I) const { 261 if (ValTy->isVectorTy() && CostKind == TTI::TCK_RecipThroughput) { 262 if (!isHVXVectorType(ValTy) && ValTy->isFPOrFPVectorTy()) 263 return InstructionCost::getMax(); 264 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy); 265 if (Opcode == Instruction::FCmp) 266 return LT.first + FloatFactor * getTypeNumElements(ValTy); 267 } 268 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, 269 Op1Info, Op2Info, I); 270 } 271 272 InstructionCost HexagonTTIImpl::getArithmeticInstrCost( 273 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, 274 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, 275 ArrayRef<const Value *> Args, const Instruction *CxtI) const { 276 // TODO: Handle more cost kinds. 277 if (CostKind != TTI::TCK_RecipThroughput) 278 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, 279 Op2Info, Args, CxtI); 280 281 if (Ty->isVectorTy()) { 282 if (!isHVXVectorType(Ty) && Ty->isFPOrFPVectorTy()) 283 return InstructionCost::getMax(); 284 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); 285 if (LT.second.isFloatingPoint()) 286 return LT.first + FloatFactor * getTypeNumElements(Ty); 287 } 288 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info, 289 Args, CxtI); 290 } 291 292 InstructionCost HexagonTTIImpl::getCastInstrCost(unsigned Opcode, Type *DstTy, 293 Type *SrcTy, 294 TTI::CastContextHint CCH, 295 TTI::TargetCostKind CostKind, 296 const Instruction *I) const { 297 auto isNonHVXFP = [this] (Type *Ty) { 298 return Ty->isVectorTy() && !isHVXVectorType(Ty) && Ty->isFPOrFPVectorTy(); 299 }; 300 if (isNonHVXFP(SrcTy) || isNonHVXFP(DstTy)) 301 return InstructionCost::getMax(); 302 303 if (SrcTy->isFPOrFPVectorTy() || DstTy->isFPOrFPVectorTy()) { 304 unsigned SrcN = SrcTy->isFPOrFPVectorTy() ? getTypeNumElements(SrcTy) : 0; 305 unsigned DstN = DstTy->isFPOrFPVectorTy() ? getTypeNumElements(DstTy) : 0; 306 307 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(SrcTy); 308 std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(DstTy); 309 InstructionCost Cost = 310 std::max(SrcLT.first, DstLT.first) + FloatFactor * (SrcN + DstN); 311 // TODO: Allow non-throughput costs that aren't binary. 312 if (CostKind != TTI::TCK_RecipThroughput) 313 return Cost == 0 ? 0 : 1; 314 return Cost; 315 } 316 return 1; 317 } 318 319 InstructionCost HexagonTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, 320 TTI::TargetCostKind CostKind, 321 unsigned Index, 322 const Value *Op0, 323 const Value *Op1) const { 324 Type *ElemTy = Val->isVectorTy() ? cast<VectorType>(Val)->getElementType() 325 : Val; 326 if (Opcode == Instruction::InsertElement) { 327 // Need two rotations for non-zero index. 328 unsigned Cost = (Index != 0) ? 2 : 0; 329 if (ElemTy->isIntegerTy(32)) 330 return Cost; 331 // If it's not a 32-bit value, there will need to be an extract. 332 return Cost + getVectorInstrCost(Instruction::ExtractElement, Val, CostKind, 333 Index, Op0, Op1); 334 } 335 336 if (Opcode == Instruction::ExtractElement) 337 return 2; 338 339 return 1; 340 } 341 342 bool HexagonTTIImpl::isLegalMaskedStore(Type *DataType, Align /*Alignment*/, 343 unsigned /*AddressSpace*/) const { 344 // This function is called from scalarize-masked-mem-intrin, which runs 345 // in pre-isel. Use ST directly instead of calling isHVXVectorType. 346 return HexagonMaskedVMem && ST.isTypeForHVX(DataType); 347 } 348 349 bool HexagonTTIImpl::isLegalMaskedLoad(Type *DataType, Align /*Alignment*/, 350 unsigned /*AddressSpace*/) const { 351 // This function is called from scalarize-masked-mem-intrin, which runs 352 // in pre-isel. Use ST directly instead of calling isHVXVectorType. 353 return HexagonMaskedVMem && ST.isTypeForHVX(DataType); 354 } 355 356 /// --- Vector TTI end --- 357 358 unsigned HexagonTTIImpl::getPrefetchDistance() const { 359 return ST.getL1PrefetchDistance(); 360 } 361 362 unsigned HexagonTTIImpl::getCacheLineSize() const { 363 return ST.getL1CacheLineSize(); 364 } 365 366 InstructionCost 367 HexagonTTIImpl::getInstructionCost(const User *U, 368 ArrayRef<const Value *> Operands, 369 TTI::TargetCostKind CostKind) const { 370 auto isCastFoldedIntoLoad = [this](const CastInst *CI) -> bool { 371 if (!CI->isIntegerCast()) 372 return false; 373 // Only extensions from an integer type shorter than 32-bit to i32 374 // can be folded into the load. 375 const DataLayout &DL = getDataLayout(); 376 unsigned SBW = DL.getTypeSizeInBits(CI->getSrcTy()); 377 unsigned DBW = DL.getTypeSizeInBits(CI->getDestTy()); 378 if (DBW != 32 || SBW >= DBW) 379 return false; 380 381 const LoadInst *LI = dyn_cast<const LoadInst>(CI->getOperand(0)); 382 // Technically, this code could allow multiple uses of the load, and 383 // check if all the uses are the same extension operation, but this 384 // should be sufficient for most cases. 385 return LI && LI->hasOneUse(); 386 }; 387 388 if (const CastInst *CI = dyn_cast<const CastInst>(U)) 389 if (isCastFoldedIntoLoad(CI)) 390 return TargetTransformInfo::TCC_Free; 391 return BaseT::getInstructionCost(U, Operands, CostKind); 392 } 393 394 bool HexagonTTIImpl::shouldBuildLookupTables() const { 395 return EmitLookupTables; 396 } 397