1 //===- HexagonTargetTransformInfo.cpp - Hexagon specific TTI pass ---------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 /// \file 8 /// This file implements a TargetTransformInfo analysis pass specific to the 9 /// Hexagon target machine. It uses the target's detailed information to provide 10 /// more precise answers to certain TTI queries, while letting the target 11 /// independent and default TTI implementations handle the rest. 12 /// 13 //===----------------------------------------------------------------------===// 14 15 #include "HexagonTargetTransformInfo.h" 16 #include "HexagonSubtarget.h" 17 #include "llvm/Analysis/TargetTransformInfo.h" 18 #include "llvm/CodeGen/ValueTypes.h" 19 #include "llvm/IR/InstrTypes.h" 20 #include "llvm/IR/Instructions.h" 21 #include "llvm/IR/User.h" 22 #include "llvm/Support/Casting.h" 23 #include "llvm/Support/CommandLine.h" 24 #include "llvm/Transforms/Utils/LoopPeel.h" 25 #include "llvm/Transforms/Utils/UnrollLoop.h" 26 27 using namespace llvm; 28 29 #define DEBUG_TYPE "hexagontti" 30 31 static cl::opt<bool> HexagonAutoHVX("hexagon-autohvx", cl::init(false), 32 cl::Hidden, cl::desc("Enable loop vectorizer for HVX")); 33 34 static cl::opt<bool> EnableV68FloatAutoHVX( 35 "force-hvx-float", cl::Hidden, 36 cl::desc("Enable auto-vectorization of floatint point types on v68.")); 37 38 static cl::opt<bool> EmitLookupTables("hexagon-emit-lookup-tables", 39 cl::init(true), cl::Hidden, 40 cl::desc("Control lookup table emission on Hexagon target")); 41 42 static cl::opt<bool> HexagonMaskedVMem("hexagon-masked-vmem", cl::init(true), 43 cl::Hidden, cl::desc("Enable masked loads/stores for HVX")); 44 45 // Constant "cost factor" to make floating point operations more expensive 46 // in terms of vectorization cost. This isn't the best way, but it should 47 // do. Ultimately, the cost should use cycles. 48 static const unsigned FloatFactor = 4; 49 50 bool HexagonTTIImpl::useHVX() const { 51 return ST.useHVXOps() && HexagonAutoHVX; 52 } 53 54 bool HexagonTTIImpl::isHVXVectorType(Type *Ty) const { 55 auto *VecTy = dyn_cast<VectorType>(Ty); 56 if (!VecTy) 57 return false; 58 if (!ST.isTypeForHVX(VecTy)) 59 return false; 60 if (ST.useHVXV69Ops() || !VecTy->getElementType()->isFloatingPointTy()) 61 return true; 62 return ST.useHVXV68Ops() && EnableV68FloatAutoHVX; 63 } 64 65 unsigned HexagonTTIImpl::getTypeNumElements(Type *Ty) const { 66 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) 67 return VTy->getNumElements(); 68 assert((Ty->isIntegerTy() || Ty->isFloatingPointTy()) && 69 "Expecting scalar type"); 70 return 1; 71 } 72 73 TargetTransformInfo::PopcntSupportKind 74 HexagonTTIImpl::getPopcntSupport(unsigned IntTyWidthInBit) const { 75 // Return fast hardware support as every input < 64 bits will be promoted 76 // to 64 bits. 77 return TargetTransformInfo::PSK_FastHardware; 78 } 79 80 // The Hexagon target can unroll loops with run-time trip counts. 81 void HexagonTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, 82 TTI::UnrollingPreferences &UP, 83 OptimizationRemarkEmitter *ORE) { 84 UP.Runtime = UP.Partial = true; 85 } 86 87 void HexagonTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, 88 TTI::PeelingPreferences &PP) { 89 BaseT::getPeelingPreferences(L, SE, PP); 90 // Only try to peel innermost loops with small runtime trip counts. 91 if (L && L->isInnermost() && canPeel(L) && 92 SE.getSmallConstantTripCount(L) == 0 && 93 SE.getSmallConstantMaxTripCount(L) > 0 && 94 SE.getSmallConstantMaxTripCount(L) <= 5) { 95 PP.PeelCount = 2; 96 } 97 } 98 99 TTI::AddressingModeKind 100 HexagonTTIImpl::getPreferredAddressingMode(const Loop *L, 101 ScalarEvolution *SE) const { 102 return TTI::AMK_PostIndexed; 103 } 104 105 /// --- Vector TTI begin --- 106 107 unsigned HexagonTTIImpl::getNumberOfRegisters(bool Vector) const { 108 if (Vector) 109 return useHVX() ? 32 : 0; 110 return 32; 111 } 112 113 unsigned HexagonTTIImpl::getMaxInterleaveFactor(unsigned VF) { 114 return useHVX() ? 2 : 1; 115 } 116 117 TypeSize 118 HexagonTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { 119 switch (K) { 120 case TargetTransformInfo::RGK_Scalar: 121 return TypeSize::getFixed(32); 122 case TargetTransformInfo::RGK_FixedWidthVector: 123 return TypeSize::getFixed(getMinVectorRegisterBitWidth()); 124 case TargetTransformInfo::RGK_ScalableVector: 125 return TypeSize::getScalable(0); 126 } 127 128 llvm_unreachable("Unsupported register kind"); 129 } 130 131 unsigned HexagonTTIImpl::getMinVectorRegisterBitWidth() const { 132 return useHVX() ? ST.getVectorLength()*8 : 32; 133 } 134 135 ElementCount HexagonTTIImpl::getMinimumVF(unsigned ElemWidth, 136 bool IsScalable) const { 137 assert(!IsScalable && "Scalable VFs are not supported for Hexagon"); 138 return ElementCount::getFixed((8 * ST.getVectorLength()) / ElemWidth); 139 } 140 141 InstructionCost HexagonTTIImpl::getScalarizationOverhead( 142 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, 143 TTI::TargetCostKind CostKind) { 144 return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract, 145 CostKind); 146 } 147 148 InstructionCost 149 HexagonTTIImpl::getOperandsScalarizationOverhead(ArrayRef<const Value *> Args, 150 ArrayRef<Type *> Tys, 151 TTI::TargetCostKind CostKind) { 152 return BaseT::getOperandsScalarizationOverhead(Args, Tys, CostKind); 153 } 154 155 InstructionCost HexagonTTIImpl::getCallInstrCost(Function *F, Type *RetTy, 156 ArrayRef<Type *> Tys, 157 TTI::TargetCostKind CostKind) { 158 return BaseT::getCallInstrCost(F, RetTy, Tys, CostKind); 159 } 160 161 InstructionCost 162 HexagonTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, 163 TTI::TargetCostKind CostKind) { 164 if (ICA.getID() == Intrinsic::bswap) { 165 std::pair<InstructionCost, MVT> LT = 166 getTypeLegalizationCost(ICA.getReturnType()); 167 return LT.first + 2; 168 } 169 return BaseT::getIntrinsicInstrCost(ICA, CostKind); 170 } 171 172 InstructionCost HexagonTTIImpl::getAddressComputationCost(Type *Tp, 173 ScalarEvolution *SE, 174 const SCEV *S) { 175 return 0; 176 } 177 178 InstructionCost HexagonTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, 179 MaybeAlign Alignment, 180 unsigned AddressSpace, 181 TTI::TargetCostKind CostKind, 182 TTI::OperandValueInfo OpInfo, 183 const Instruction *I) { 184 assert(Opcode == Instruction::Load || Opcode == Instruction::Store); 185 // TODO: Handle other cost kinds. 186 if (CostKind != TTI::TCK_RecipThroughput) 187 return 1; 188 189 if (Opcode == Instruction::Store) 190 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, 191 CostKind, OpInfo, I); 192 193 if (Src->isVectorTy()) { 194 VectorType *VecTy = cast<VectorType>(Src); 195 unsigned VecWidth = VecTy->getPrimitiveSizeInBits().getFixedValue(); 196 if (isHVXVectorType(VecTy)) { 197 unsigned RegWidth = 198 getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) 199 .getFixedValue(); 200 assert(RegWidth && "Non-zero vector register width expected"); 201 // Cost of HVX loads. 202 if (VecWidth % RegWidth == 0) 203 return VecWidth / RegWidth; 204 // Cost of constructing HVX vector from scalar loads 205 const Align RegAlign(RegWidth / 8); 206 if (!Alignment || *Alignment > RegAlign) 207 Alignment = RegAlign; 208 assert(Alignment); 209 unsigned AlignWidth = 8 * Alignment->value(); 210 unsigned NumLoads = alignTo(VecWidth, AlignWidth) / AlignWidth; 211 return 3 * NumLoads; 212 } 213 214 // Non-HVX vectors. 215 // Add extra cost for floating point types. 216 unsigned Cost = 217 VecTy->getElementType()->isFloatingPointTy() ? FloatFactor : 1; 218 219 // At this point unspecified alignment is considered as Align(1). 220 const Align BoundAlignment = std::min(Alignment.valueOrOne(), Align(8)); 221 unsigned AlignWidth = 8 * BoundAlignment.value(); 222 unsigned NumLoads = alignTo(VecWidth, AlignWidth) / AlignWidth; 223 if (Alignment == Align(4) || Alignment == Align(8)) 224 return Cost * NumLoads; 225 // Loads of less than 32 bits will need extra inserts to compose a vector. 226 assert(BoundAlignment <= Align(8)); 227 unsigned LogA = Log2(BoundAlignment); 228 return (3 - LogA) * Cost * NumLoads; 229 } 230 231 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind, 232 OpInfo, I); 233 } 234 235 InstructionCost 236 HexagonTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, 237 Align Alignment, unsigned AddressSpace, 238 TTI::TargetCostKind CostKind) { 239 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace, 240 CostKind); 241 } 242 243 InstructionCost HexagonTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, 244 ArrayRef<int> Mask, 245 TTI::TargetCostKind CostKind, 246 int Index, Type *SubTp, 247 ArrayRef<const Value *> Args) { 248 return 1; 249 } 250 251 InstructionCost HexagonTTIImpl::getGatherScatterOpCost( 252 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, 253 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) { 254 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, 255 Alignment, CostKind, I); 256 } 257 258 InstructionCost HexagonTTIImpl::getInterleavedMemoryOpCost( 259 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, 260 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, 261 bool UseMaskForCond, bool UseMaskForGaps) { 262 if (Indices.size() != Factor || UseMaskForCond || UseMaskForGaps) 263 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, 264 Alignment, AddressSpace, 265 CostKind, 266 UseMaskForCond, UseMaskForGaps); 267 return getMemoryOpCost(Opcode, VecTy, MaybeAlign(Alignment), AddressSpace, 268 CostKind); 269 } 270 271 InstructionCost HexagonTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, 272 Type *CondTy, 273 CmpInst::Predicate VecPred, 274 TTI::TargetCostKind CostKind, 275 const Instruction *I) { 276 if (ValTy->isVectorTy() && CostKind == TTI::TCK_RecipThroughput) { 277 if (!isHVXVectorType(ValTy) && ValTy->isFPOrFPVectorTy()) 278 return InstructionCost::getMax(); 279 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy); 280 if (Opcode == Instruction::FCmp) 281 return LT.first + FloatFactor * getTypeNumElements(ValTy); 282 } 283 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I); 284 } 285 286 InstructionCost HexagonTTIImpl::getArithmeticInstrCost( 287 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, 288 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, 289 ArrayRef<const Value *> Args, 290 const Instruction *CxtI) { 291 // TODO: Handle more cost kinds. 292 if (CostKind != TTI::TCK_RecipThroughput) 293 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, 294 Op2Info, Args, CxtI); 295 296 if (Ty->isVectorTy()) { 297 if (!isHVXVectorType(Ty) && Ty->isFPOrFPVectorTy()) 298 return InstructionCost::getMax(); 299 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); 300 if (LT.second.isFloatingPoint()) 301 return LT.first + FloatFactor * getTypeNumElements(Ty); 302 } 303 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info, 304 Args, CxtI); 305 } 306 307 InstructionCost HexagonTTIImpl::getCastInstrCost(unsigned Opcode, Type *DstTy, 308 Type *SrcTy, 309 TTI::CastContextHint CCH, 310 TTI::TargetCostKind CostKind, 311 const Instruction *I) { 312 auto isNonHVXFP = [this] (Type *Ty) { 313 return Ty->isVectorTy() && !isHVXVectorType(Ty) && Ty->isFPOrFPVectorTy(); 314 }; 315 if (isNonHVXFP(SrcTy) || isNonHVXFP(DstTy)) 316 return InstructionCost::getMax(); 317 318 if (SrcTy->isFPOrFPVectorTy() || DstTy->isFPOrFPVectorTy()) { 319 unsigned SrcN = SrcTy->isFPOrFPVectorTy() ? getTypeNumElements(SrcTy) : 0; 320 unsigned DstN = DstTy->isFPOrFPVectorTy() ? getTypeNumElements(DstTy) : 0; 321 322 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(SrcTy); 323 std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(DstTy); 324 InstructionCost Cost = 325 std::max(SrcLT.first, DstLT.first) + FloatFactor * (SrcN + DstN); 326 // TODO: Allow non-throughput costs that aren't binary. 327 if (CostKind != TTI::TCK_RecipThroughput) 328 return Cost == 0 ? 0 : 1; 329 return Cost; 330 } 331 return 1; 332 } 333 334 InstructionCost HexagonTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, 335 TTI::TargetCostKind CostKind, 336 unsigned Index, Value *Op0, 337 Value *Op1) { 338 Type *ElemTy = Val->isVectorTy() ? cast<VectorType>(Val)->getElementType() 339 : Val; 340 if (Opcode == Instruction::InsertElement) { 341 // Need two rotations for non-zero index. 342 unsigned Cost = (Index != 0) ? 2 : 0; 343 if (ElemTy->isIntegerTy(32)) 344 return Cost; 345 // If it's not a 32-bit value, there will need to be an extract. 346 return Cost + getVectorInstrCost(Instruction::ExtractElement, Val, CostKind, 347 Index, Op0, Op1); 348 } 349 350 if (Opcode == Instruction::ExtractElement) 351 return 2; 352 353 return 1; 354 } 355 356 bool HexagonTTIImpl::isLegalMaskedStore(Type *DataType, Align /*Alignment*/) { 357 // This function is called from scalarize-masked-mem-intrin, which runs 358 // in pre-isel. Use ST directly instead of calling isHVXVectorType. 359 return HexagonMaskedVMem && ST.isTypeForHVX(DataType); 360 } 361 362 bool HexagonTTIImpl::isLegalMaskedLoad(Type *DataType, Align /*Alignment*/) { 363 // This function is called from scalarize-masked-mem-intrin, which runs 364 // in pre-isel. Use ST directly instead of calling isHVXVectorType. 365 return HexagonMaskedVMem && ST.isTypeForHVX(DataType); 366 } 367 368 /// --- Vector TTI end --- 369 370 unsigned HexagonTTIImpl::getPrefetchDistance() const { 371 return ST.getL1PrefetchDistance(); 372 } 373 374 unsigned HexagonTTIImpl::getCacheLineSize() const { 375 return ST.getL1CacheLineSize(); 376 } 377 378 InstructionCost 379 HexagonTTIImpl::getInstructionCost(const User *U, 380 ArrayRef<const Value *> Operands, 381 TTI::TargetCostKind CostKind) { 382 auto isCastFoldedIntoLoad = [this](const CastInst *CI) -> bool { 383 if (!CI->isIntegerCast()) 384 return false; 385 // Only extensions from an integer type shorter than 32-bit to i32 386 // can be folded into the load. 387 const DataLayout &DL = getDataLayout(); 388 unsigned SBW = DL.getTypeSizeInBits(CI->getSrcTy()); 389 unsigned DBW = DL.getTypeSizeInBits(CI->getDestTy()); 390 if (DBW != 32 || SBW >= DBW) 391 return false; 392 393 const LoadInst *LI = dyn_cast<const LoadInst>(CI->getOperand(0)); 394 // Technically, this code could allow multiple uses of the load, and 395 // check if all the uses are the same extension operation, but this 396 // should be sufficient for most cases. 397 return LI && LI->hasOneUse(); 398 }; 399 400 if (const CastInst *CI = dyn_cast<const CastInst>(U)) 401 if (isCastFoldedIntoLoad(CI)) 402 return TargetTransformInfo::TCC_Free; 403 return BaseT::getInstructionCost(U, Operands, CostKind); 404 } 405 406 bool HexagonTTIImpl::shouldBuildLookupTables() const { 407 return EmitLookupTables; 408 } 409