1 //===-- SystemZTargetTransformInfo.cpp - SystemZ-specific TTI -------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements a TargetTransformInfo analysis pass specific to the 10 // SystemZ target machine. It uses the target's detailed information to provide 11 // more precise answers to certain TTI queries, while letting the target 12 // independent and default TTI implementations handle the rest. 13 // 14 //===----------------------------------------------------------------------===// 15 16 #include "SystemZTargetTransformInfo.h" 17 #include "llvm/Analysis/TargetTransformInfo.h" 18 #include "llvm/CodeGen/BasicTTIImpl.h" 19 #include "llvm/CodeGen/TargetLowering.h" 20 #include "llvm/IR/DerivedTypes.h" 21 #include "llvm/IR/InstIterator.h" 22 #include "llvm/IR/IntrinsicInst.h" 23 #include "llvm/IR/Intrinsics.h" 24 #include "llvm/Support/Debug.h" 25 #include "llvm/Support/InstructionCost.h" 26 #include "llvm/Support/MathExtras.h" 27 28 using namespace llvm; 29 30 #define DEBUG_TYPE "systemztti" 31 32 //===----------------------------------------------------------------------===// 33 // 34 // SystemZ cost model. 35 // 36 //===----------------------------------------------------------------------===// 37 38 static bool isUsedAsMemCpySource(const Value *V, bool &OtherUse) { 39 bool UsedAsMemCpySource = false; 40 for (const User *U : V->users()) 41 if (const Instruction *User = dyn_cast<Instruction>(U)) { 42 if (isa<BitCastInst>(User) || isa<GetElementPtrInst>(User)) { 43 UsedAsMemCpySource |= isUsedAsMemCpySource(User, OtherUse); 44 continue; 45 } 46 if (const MemCpyInst *Memcpy = dyn_cast<MemCpyInst>(User)) { 47 if (Memcpy->getOperand(1) == V && !Memcpy->isVolatile()) { 48 UsedAsMemCpySource = true; 49 continue; 50 } 51 } 52 OtherUse = true; 53 } 54 return UsedAsMemCpySource; 55 } 56 57 static void countNumMemAccesses(const Value *Ptr, unsigned &NumStores, 58 unsigned &NumLoads, const Function *F) { 59 if (!isa<PointerType>(Ptr->getType())) 60 return; 61 for (const User *U : Ptr->users()) 62 if (const Instruction *User = dyn_cast<Instruction>(U)) { 63 if (User->getParent()->getParent() == F) { 64 if (const auto *SI = dyn_cast<StoreInst>(User)) { 65 if (SI->getPointerOperand() == Ptr && !SI->isVolatile()) 66 NumStores++; 67 } else if (const auto *LI = dyn_cast<LoadInst>(User)) { 68 if (LI->getPointerOperand() == Ptr && !LI->isVolatile()) 69 NumLoads++; 70 } else if (const auto *GEP = dyn_cast<GetElementPtrInst>(User)) { 71 if (GEP->getPointerOperand() == Ptr) 72 countNumMemAccesses(GEP, NumStores, NumLoads, F); 73 } 74 } 75 } 76 } 77 78 unsigned SystemZTTIImpl::adjustInliningThreshold(const CallBase *CB) const { 79 unsigned Bonus = 0; 80 const Function *Caller = CB->getParent()->getParent(); 81 const Function *Callee = CB->getCalledFunction(); 82 if (!Callee) 83 return 0; 84 85 // Increase the threshold if an incoming argument is used only as a memcpy 86 // source. 87 for (const Argument &Arg : Callee->args()) { 88 bool OtherUse = false; 89 if (isUsedAsMemCpySource(&Arg, OtherUse) && !OtherUse) { 90 Bonus = 1000; 91 break; 92 } 93 } 94 95 // Give bonus for globals used much in both caller and a relatively small 96 // callee. 
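  // The callee is scanned below, giving up after 200 instructions; a global
  // must be accessed more than 10 times in the callee and more than 10 times
  // in the caller for this bonus to apply.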
97 unsigned InstrCount = 0; 98 SmallDenseMap<const Value *, unsigned> Ptr2NumUses; 99 for (auto &I : instructions(Callee)) { 100 if (++InstrCount == 200) { 101 Ptr2NumUses.clear(); 102 break; 103 } 104 if (const auto *SI = dyn_cast<StoreInst>(&I)) { 105 if (!SI->isVolatile()) 106 if (auto *GV = dyn_cast<GlobalVariable>(SI->getPointerOperand())) 107 Ptr2NumUses[GV]++; 108 } else if (const auto *LI = dyn_cast<LoadInst>(&I)) { 109 if (!LI->isVolatile()) 110 if (auto *GV = dyn_cast<GlobalVariable>(LI->getPointerOperand())) 111 Ptr2NumUses[GV]++; 112 } else if (const auto *GEP = dyn_cast<GetElementPtrInst>(&I)) { 113 if (auto *GV = dyn_cast<GlobalVariable>(GEP->getPointerOperand())) { 114 unsigned NumStores = 0, NumLoads = 0; 115 countNumMemAccesses(GEP, NumStores, NumLoads, Callee); 116 Ptr2NumUses[GV] += NumLoads + NumStores; 117 } 118 } 119 } 120 121 for (auto [Ptr, NumCalleeUses] : Ptr2NumUses) 122 if (NumCalleeUses > 10) { 123 unsigned CallerStores = 0, CallerLoads = 0; 124 countNumMemAccesses(Ptr, CallerStores, CallerLoads, Caller); 125 if (CallerStores + CallerLoads > 10) { 126 Bonus = 1000; 127 break; 128 } 129 } 130 131 // Give bonus when Callee accesses an Alloca of Caller heavily. 132 unsigned NumStores = 0; 133 unsigned NumLoads = 0; 134 for (unsigned OpIdx = 0; OpIdx != Callee->arg_size(); ++OpIdx) { 135 Value *CallerArg = CB->getArgOperand(OpIdx); 136 Argument *CalleeArg = Callee->getArg(OpIdx); 137 if (isa<AllocaInst>(CallerArg)) 138 countNumMemAccesses(CalleeArg, NumStores, NumLoads, Callee); 139 } 140 if (NumLoads > 10) 141 Bonus += NumLoads * 50; 142 if (NumStores > 10) 143 Bonus += NumStores * 50; 144 Bonus = std::min(Bonus, unsigned(1000)); 145 146 LLVM_DEBUG(if (Bonus) 147 dbgs() << "++ SZTTI Adding inlining bonus: " << Bonus << "\n";); 148 return Bonus; 149 } 150 151 InstructionCost 152 SystemZTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty, 153 TTI::TargetCostKind CostKind) const { 154 assert(Ty->isIntegerTy()); 155 156 unsigned BitSize = Ty->getPrimitiveSizeInBits(); 157 // There is no cost model for constants with a bit size of 0. Return TCC_Free 158 // here, so that constant hoisting will ignore this constant. 159 if (BitSize == 0) 160 return TTI::TCC_Free; 161 // No cost model for operations on integers larger than 128 bit implemented yet. 162 if ((!ST->hasVector() && BitSize > 64) || BitSize > 128) 163 return TTI::TCC_Free; 164 165 if (Imm == 0) 166 return TTI::TCC_Free; 167 168 if (Imm.getBitWidth() <= 64) { 169 // Constants loaded via lgfi. 170 if (isInt<32>(Imm.getSExtValue())) 171 return TTI::TCC_Basic; 172 // Constants loaded via llilf. 173 if (isUInt<32>(Imm.getZExtValue())) 174 return TTI::TCC_Basic; 175 // Constants loaded via llihf: 176 if ((Imm.getZExtValue() & 0xffffffff) == 0) 177 return TTI::TCC_Basic; 178 179 return 2 * TTI::TCC_Basic; 180 } 181 182 // i128 immediates loads from Constant Pool 183 return 2 * TTI::TCC_Basic; 184 } 185 186 InstructionCost SystemZTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, 187 const APInt &Imm, Type *Ty, 188 TTI::TargetCostKind CostKind, 189 Instruction *Inst) const { 190 assert(Ty->isIntegerTy()); 191 192 unsigned BitSize = Ty->getPrimitiveSizeInBits(); 193 // There is no cost model for constants with a bit size of 0. Return TCC_Free 194 // here, so that constant hoisting will ignore this constant. 195 if (BitSize == 0) 196 return TTI::TCC_Free; 197 // No cost model for operations on integers larger than 64 bit implemented yet. 
  if (BitSize > 64)
    return TTI::TCC_Free;

  switch (Opcode) {
  default:
    return TTI::TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr. This prevents the
    // creation of new constants for every base constant that gets constant
    // folded with the offset.
    if (Idx == 0)
      return 2 * TTI::TCC_Basic;
    return TTI::TCC_Free;
  case Instruction::Store:
    if (Idx == 0 && Imm.getBitWidth() <= 64) {
      // Any 8-bit immediate store can be implemented via mvi.
      if (BitSize == 8)
        return TTI::TCC_Free;
      // 16-bit immediate values can be stored via mvhhi/mvhi/mvghi.
      if (isInt<16>(Imm.getSExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Instruction::ICmp:
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      // Comparisons against signed 32-bit immediates implemented via cgfi.
      if (isInt<32>(Imm.getSExtValue()))
        return TTI::TCC_Free;
      // Comparisons against unsigned 32-bit immediates implemented via clgfi.
      if (isUInt<32>(Imm.getZExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Instruction::Add:
  case Instruction::Sub:
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      // We use algfi/slgfi to add/subtract 32-bit unsigned immediates.
      if (isUInt<32>(Imm.getZExtValue()))
        return TTI::TCC_Free;
      // Or their negation, by swapping addition vs. subtraction.
      if (isUInt<32>(-Imm.getSExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Instruction::Mul:
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      // We use msgfi to multiply by 32-bit signed immediates.
      if (isInt<32>(Imm.getSExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Instruction::Or:
  case Instruction::Xor:
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      // Masks supported by oilf/xilf.
      if (isUInt<32>(Imm.getZExtValue()))
        return TTI::TCC_Free;
      // Masks supported by oihf/xihf.
      if ((Imm.getZExtValue() & 0xffffffff) == 0)
        return TTI::TCC_Free;
    }
    break;
  case Instruction::And:
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      // Any 32-bit AND operation can be implemented via nilf.
      if (BitSize <= 32)
        return TTI::TCC_Free;
      // 64-bit masks supported by nilf.
      if (isUInt<32>(~Imm.getZExtValue()))
        return TTI::TCC_Free;
      // 64-bit masks supported by nihf.
      if ((Imm.getZExtValue() & 0xffffffff) == 0xffffffff)
        return TTI::TCC_Free;
      // Some 64-bit AND operations can be implemented via risbg.
      const SystemZInstrInfo *TII = ST->getInstrInfo();
      unsigned Start, End;
      if (TII->isRxSBGMask(Imm.getZExtValue(), BitSize, Start, End))
        return TTI::TCC_Free;
    }
    break;
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    // Always return TCC_Free for the shift value of a shift instruction.
282 if (Idx == 1) 283 return TTI::TCC_Free; 284 break; 285 case Instruction::UDiv: 286 case Instruction::SDiv: 287 case Instruction::URem: 288 case Instruction::SRem: 289 case Instruction::Trunc: 290 case Instruction::ZExt: 291 case Instruction::SExt: 292 case Instruction::IntToPtr: 293 case Instruction::PtrToInt: 294 case Instruction::BitCast: 295 case Instruction::PHI: 296 case Instruction::Call: 297 case Instruction::Select: 298 case Instruction::Ret: 299 case Instruction::Load: 300 break; 301 } 302 303 return SystemZTTIImpl::getIntImmCost(Imm, Ty, CostKind); 304 } 305 306 InstructionCost 307 SystemZTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, 308 const APInt &Imm, Type *Ty, 309 TTI::TargetCostKind CostKind) const { 310 assert(Ty->isIntegerTy()); 311 312 unsigned BitSize = Ty->getPrimitiveSizeInBits(); 313 // There is no cost model for constants with a bit size of 0. Return TCC_Free 314 // here, so that constant hoisting will ignore this constant. 315 if (BitSize == 0) 316 return TTI::TCC_Free; 317 // No cost model for operations on integers larger than 64 bit implemented yet. 318 if (BitSize > 64) 319 return TTI::TCC_Free; 320 321 switch (IID) { 322 default: 323 return TTI::TCC_Free; 324 case Intrinsic::sadd_with_overflow: 325 case Intrinsic::uadd_with_overflow: 326 case Intrinsic::ssub_with_overflow: 327 case Intrinsic::usub_with_overflow: 328 // These get expanded to include a normal addition/subtraction. 329 if (Idx == 1 && Imm.getBitWidth() <= 64) { 330 if (isUInt<32>(Imm.getZExtValue())) 331 return TTI::TCC_Free; 332 if (isUInt<32>(-Imm.getSExtValue())) 333 return TTI::TCC_Free; 334 } 335 break; 336 case Intrinsic::smul_with_overflow: 337 case Intrinsic::umul_with_overflow: 338 // These get expanded to include a normal multiplication. 339 if (Idx == 1 && Imm.getBitWidth() <= 64) { 340 if (isInt<32>(Imm.getSExtValue())) 341 return TTI::TCC_Free; 342 } 343 break; 344 case Intrinsic::experimental_stackmap: 345 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) 346 return TTI::TCC_Free; 347 break; 348 case Intrinsic::experimental_patchpoint_void: 349 case Intrinsic::experimental_patchpoint: 350 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) 351 return TTI::TCC_Free; 352 break; 353 } 354 return SystemZTTIImpl::getIntImmCost(Imm, Ty, CostKind); 355 } 356 357 TargetTransformInfo::PopcntSupportKind 358 SystemZTTIImpl::getPopcntSupport(unsigned TyWidth) const { 359 assert(isPowerOf2_32(TyWidth) && "Type width must be power of 2"); 360 if (ST->hasPopulationCount() && TyWidth <= 64) 361 return TTI::PSK_FastHardware; 362 return TTI::PSK_Software; 363 } 364 365 void SystemZTTIImpl::getUnrollingPreferences( 366 Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, 367 OptimizationRemarkEmitter *ORE) const { 368 // Find out if L contains a call, what the machine instruction count 369 // estimate is, and how many stores there are. 370 bool HasCall = false; 371 InstructionCost NumStores = 0; 372 for (auto &BB : L->blocks()) 373 for (auto &I : *BB) { 374 if (isa<CallInst>(&I) || isa<InvokeInst>(&I)) { 375 if (const Function *F = cast<CallBase>(I).getCalledFunction()) { 376 if (isLoweredToCall(F)) 377 HasCall = true; 378 if (F->getIntrinsicID() == Intrinsic::memcpy || 379 F->getIntrinsicID() == Intrinsic::memset) 380 NumStores++; 381 } else { // indirect call. 
382 HasCall = true; 383 } 384 } 385 if (isa<StoreInst>(&I)) { 386 Type *MemAccessTy = I.getOperand(0)->getType(); 387 NumStores += getMemoryOpCost(Instruction::Store, MemAccessTy, Align(), 388 0, TTI::TCK_RecipThroughput); 389 } 390 } 391 392 // The z13 processor will run out of store tags if too many stores 393 // are fed into it too quickly. Therefore make sure there are not 394 // too many stores in the resulting unrolled loop. 395 unsigned const NumStoresVal = NumStores.getValue(); 396 unsigned const Max = (NumStoresVal ? (12 / NumStoresVal) : UINT_MAX); 397 398 if (HasCall) { 399 // Only allow full unrolling if loop has any calls. 400 UP.FullUnrollMaxCount = Max; 401 UP.MaxCount = 1; 402 return; 403 } 404 405 UP.MaxCount = Max; 406 if (UP.MaxCount <= 1) 407 return; 408 409 // Allow partial and runtime trip count unrolling. 410 UP.Partial = UP.Runtime = true; 411 412 UP.PartialThreshold = 75; 413 UP.DefaultUnrollRuntimeCount = 4; 414 415 // Allow expensive instructions in the pre-header of the loop. 416 UP.AllowExpensiveTripCount = true; 417 418 UP.Force = true; 419 } 420 421 void SystemZTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, 422 TTI::PeelingPreferences &PP) const { 423 BaseT::getPeelingPreferences(L, SE, PP); 424 } 425 426 bool SystemZTTIImpl::isLSRCostLess( 427 const TargetTransformInfo::LSRCost &C1, 428 const TargetTransformInfo::LSRCost &C2) const { 429 // SystemZ specific: check instruction count (first), and don't care about 430 // ImmCost, since offsets are checked explicitly. 431 return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost, 432 C1.NumIVMuls, C1.NumBaseAdds, 433 C1.ScaleCost, C1.SetupCost) < 434 std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost, 435 C2.NumIVMuls, C2.NumBaseAdds, 436 C2.ScaleCost, C2.SetupCost); 437 } 438 439 bool SystemZTTIImpl::areInlineCompatible(const Function *Caller, 440 const Function *Callee) const { 441 const TargetMachine &TM = getTLI()->getTargetMachine(); 442 443 const FeatureBitset &CallerBits = 444 TM.getSubtargetImpl(*Caller)->getFeatureBits(); 445 const FeatureBitset &CalleeBits = 446 TM.getSubtargetImpl(*Callee)->getFeatureBits(); 447 448 // Support only equal feature bitsets. Restriction should be relaxed in the 449 // future to allow inlining when callee's bits are subset of the caller's. 450 return CallerBits == CalleeBits; 451 } 452 453 unsigned SystemZTTIImpl::getNumberOfRegisters(unsigned ClassID) const { 454 bool Vector = (ClassID == 1); 455 if (!Vector) 456 // Discount the stack pointer. Also leave out %r0, since it can't 457 // be used in an address. 458 return 14; 459 if (ST->hasVector()) 460 return 32; 461 return 0; 462 } 463 464 TypeSize 465 SystemZTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { 466 switch (K) { 467 case TargetTransformInfo::RGK_Scalar: 468 return TypeSize::getFixed(64); 469 case TargetTransformInfo::RGK_FixedWidthVector: 470 return TypeSize::getFixed(ST->hasVector() ? 128 : 0); 471 case TargetTransformInfo::RGK_ScalableVector: 472 return TypeSize::getScalable(0); 473 } 474 475 llvm_unreachable("Unsupported register kind"); 476 } 477 478 unsigned SystemZTTIImpl::getMinPrefetchStride(unsigned NumMemAccesses, 479 unsigned NumStridedMemAccesses, 480 unsigned NumPrefetches, 481 bool HasCall) const { 482 // Don't prefetch a loop with many far apart accesses. 483 if (NumPrefetches > 16) 484 return UINT_MAX; 485 486 // Emit prefetch instructions for smaller strides in cases where we think 487 // the hardware prefetcher might not be able to keep up. 
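  // E.g. with 40 strided accesses out of 41 in total (and no call),
  // (41 - 40) * 32 = 32 <= 40 holds, so a minimum stride of 1 is returned and
  // prefetches are emitted even for small strides.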
  if (NumStridedMemAccesses > 32 && !HasCall &&
      (NumMemAccesses - NumStridedMemAccesses) * 32 <= NumStridedMemAccesses)
    return 1;

  return ST->hasMiscellaneousExtensions3() ? 8192 : 2048;
}

bool SystemZTTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) const {
  EVT VT = TLI->getValueType(DL, DataType);
  return (VT.isScalarInteger() && TLI->isTypeLegal(VT));
}

static bool isFreeEltLoad(const Value *Op) {
  if (isa<LoadInst>(Op) && Op->hasOneUse()) {
    const Instruction *UserI = cast<Instruction>(*Op->user_begin());
    return !isa<StoreInst>(UserI); // Prefer MVC
  }
  return false;
}

InstructionCost SystemZTTIImpl::getScalarizationOverhead(
    VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
    TTI::TargetCostKind CostKind, bool ForPoisonSrc,
    ArrayRef<Value *> VL) const {
  unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
  InstructionCost Cost = 0;

  if (Insert && Ty->isIntOrIntVectorTy(64)) {
    // VLVGP will insert two GPRs with one instruction, while VLE will load
    // an element directly with no extra cost.
    assert((VL.empty() || VL.size() == NumElts) &&
           "Type does not match the number of values.");
    InstructionCost CurrVectorCost = 0;
    for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
      if (DemandedElts[Idx] && !(VL.size() && isFreeEltLoad(VL[Idx])))
        ++CurrVectorCost;
      if (Idx % 2 == 1) {
        Cost += std::min(InstructionCost(1), CurrVectorCost);
        CurrVectorCost = 0;
      }
    }
    Insert = false;
  }

  Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
                                          CostKind, ForPoisonSrc, VL);
  return Cost;
}

// Return the bit size for the scalar type or vector element
// type. getScalarSizeInBits() returns 0 for a pointer type.
static unsigned getScalarSizeInBits(Type *Ty) {
  unsigned Size =
      (Ty->isPtrOrPtrVectorTy() ? 64U : Ty->getScalarSizeInBits());
  assert(Size > 0 && "Element must have non-zero size.");
  return Size;
}

// getNumberOfParts() calls getTypeLegalizationCost() which splits the vector
// type until it is legal. This would e.g. return 4 for <6 x i64>, instead of
// 3.
static unsigned getNumVectorRegs(Type *Ty) {
  auto *VTy = cast<FixedVectorType>(Ty);
  unsigned WideBits = getScalarSizeInBits(Ty) * VTy->getNumElements();
  assert(WideBits > 0 && "Could not compute size of vector");
  return ((WideBits % 128U) ? ((WideBits / 128U) + 1) : (WideBits / 128U));
}

InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args, const Instruction *CxtI) const {

  // TODO: Handle more cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
                                         Op2Info, Args, CxtI);

  // TODO: return a good value for BB-VECTORIZER that includes the
  // immediate loads, which we do not want to count for the loop
  // vectorizer, since they are hopefully hoisted out of the loop. This
  // would require a new parameter 'InLoop', but not sure if constant
  // args are common enough to motivate this.

  unsigned ScalarBits = Ty->getScalarSizeInBits();

  // There are three cases of division and remainder: Dividing with a register
  // needs a divide instruction.
A divisor which is a power of two constant 576 // can be implemented with a sequence of shifts. Any other constant needs a 577 // multiply and shifts. 578 const unsigned DivInstrCost = 20; 579 const unsigned DivMulSeqCost = 10; 580 const unsigned SDivPow2Cost = 4; 581 582 bool SignedDivRem = 583 Opcode == Instruction::SDiv || Opcode == Instruction::SRem; 584 bool UnsignedDivRem = 585 Opcode == Instruction::UDiv || Opcode == Instruction::URem; 586 587 // Check for a constant divisor. 588 bool DivRemConst = false; 589 bool DivRemConstPow2 = false; 590 if ((SignedDivRem || UnsignedDivRem) && Args.size() == 2) { 591 if (const Constant *C = dyn_cast<Constant>(Args[1])) { 592 const ConstantInt *CVal = 593 (C->getType()->isVectorTy() 594 ? dyn_cast_or_null<const ConstantInt>(C->getSplatValue()) 595 : dyn_cast<const ConstantInt>(C)); 596 if (CVal && (CVal->getValue().isPowerOf2() || 597 CVal->getValue().isNegatedPowerOf2())) 598 DivRemConstPow2 = true; 599 else 600 DivRemConst = true; 601 } 602 } 603 604 if (!Ty->isVectorTy()) { 605 // These FP operations are supported with a dedicated instruction for 606 // float, double and fp128 (base implementation assumes float generally 607 // costs 2). 608 if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub || 609 Opcode == Instruction::FMul || Opcode == Instruction::FDiv) 610 return 1; 611 612 // There is no native support for FRem. 613 if (Opcode == Instruction::FRem) 614 return LIBCALL_COST; 615 616 // Give discount for some combined logical operations if supported. 617 if (Args.size() == 2) { 618 if (Opcode == Instruction::Xor) { 619 for (const Value *A : Args) { 620 if (const Instruction *I = dyn_cast<Instruction>(A)) 621 if (I->hasOneUse() && 622 (I->getOpcode() == Instruction::Or || 623 I->getOpcode() == Instruction::And || 624 I->getOpcode() == Instruction::Xor)) 625 if ((ScalarBits <= 64 && ST->hasMiscellaneousExtensions3()) || 626 (isInt128InVR(Ty) && 627 (I->getOpcode() == Instruction::Or || ST->hasVectorEnhancements1()))) 628 return 0; 629 } 630 } 631 else if (Opcode == Instruction::And || Opcode == Instruction::Or) { 632 for (const Value *A : Args) { 633 if (const Instruction *I = dyn_cast<Instruction>(A)) 634 if ((I->hasOneUse() && I->getOpcode() == Instruction::Xor) && 635 ((ScalarBits <= 64 && ST->hasMiscellaneousExtensions3()) || 636 (isInt128InVR(Ty) && 637 (Opcode == Instruction::And || ST->hasVectorEnhancements1())))) 638 return 0; 639 } 640 } 641 } 642 643 // Or requires one instruction, although it has custom handling for i64. 644 if (Opcode == Instruction::Or) 645 return 1; 646 647 if (Opcode == Instruction::Xor && ScalarBits == 1) { 648 if (ST->hasLoadStoreOnCond2()) 649 return 5; // 2 * (li 0; loc 1); xor 650 return 7; // 2 * ipm sequences ; xor ; shift ; compare 651 } 652 653 if (DivRemConstPow2) 654 return (SignedDivRem ? SDivPow2Cost : 1); 655 if (DivRemConst) 656 return DivMulSeqCost; 657 if (SignedDivRem || UnsignedDivRem) 658 return DivInstrCost; 659 } 660 else if (ST->hasVector()) { 661 auto *VTy = cast<FixedVectorType>(Ty); 662 unsigned VF = VTy->getNumElements(); 663 unsigned NumVectors = getNumVectorRegs(Ty); 664 665 // These vector operations are custom handled, but are still supported 666 // with one instruction per vector, regardless of element size. 667 if (Opcode == Instruction::Shl || Opcode == Instruction::LShr || 668 Opcode == Instruction::AShr) { 669 return NumVectors; 670 } 671 672 if (DivRemConstPow2) 673 return (NumVectors * (SignedDivRem ? 
SDivPow2Cost : 1)); 674 if (DivRemConst) { 675 SmallVector<Type *> Tys(Args.size(), Ty); 676 return VF * DivMulSeqCost + 677 BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind); 678 } 679 if (SignedDivRem || UnsignedDivRem) { 680 if (ST->hasVectorEnhancements3() && ScalarBits >= 32) 681 return NumVectors * DivInstrCost; 682 else if (VF > 4) 683 // Temporary hack: disable high vectorization factors with integer 684 // division/remainder, which will get scalarized and handled with 685 // GR128 registers. The mischeduler is not clever enough to avoid 686 // spilling yet. 687 return 1000; 688 } 689 690 // These FP operations are supported with a single vector instruction for 691 // double (base implementation assumes float generally costs 2). For 692 // FP128, the scalar cost is 1, and there is no overhead since the values 693 // are already in scalar registers. 694 if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub || 695 Opcode == Instruction::FMul || Opcode == Instruction::FDiv) { 696 switch (ScalarBits) { 697 case 32: { 698 // The vector enhancements facility 1 provides v4f32 instructions. 699 if (ST->hasVectorEnhancements1()) 700 return NumVectors; 701 // Return the cost of multiple scalar invocation plus the cost of 702 // inserting and extracting the values. 703 InstructionCost ScalarCost = 704 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind); 705 SmallVector<Type *> Tys(Args.size(), Ty); 706 InstructionCost Cost = 707 (VF * ScalarCost) + 708 BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind); 709 // FIXME: VF 2 for these FP operations are currently just as 710 // expensive as for VF 4. 711 if (VF == 2) 712 Cost *= 2; 713 return Cost; 714 } 715 case 64: 716 case 128: 717 return NumVectors; 718 default: 719 break; 720 } 721 } 722 723 // There is no native support for FRem. 724 if (Opcode == Instruction::FRem) { 725 SmallVector<Type *> Tys(Args.size(), Ty); 726 InstructionCost Cost = 727 (VF * LIBCALL_COST) + 728 BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind); 729 // FIXME: VF 2 for float is currently just as expensive as for VF 4. 730 if (VF == 2 && ScalarBits == 32) 731 Cost *= 2; 732 return Cost; 733 } 734 } 735 736 // Fallback to the default implementation. 737 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info, 738 Args, CxtI); 739 } 740 741 InstructionCost 742 SystemZTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, 743 VectorType *SrcTy, ArrayRef<int> Mask, 744 TTI::TargetCostKind CostKind, int Index, 745 VectorType *SubTp, ArrayRef<const Value *> Args, 746 const Instruction *CxtI) const { 747 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp); 748 if (ST->hasVector()) { 749 unsigned NumVectors = getNumVectorRegs(SrcTy); 750 751 // TODO: Since fp32 is expanded, the shuffle cost should always be 0. 752 753 // FP128 values are always in scalar registers, so there is no work 754 // involved with a shuffle, except for broadcast. In that case register 755 // moves are done with a single instruction per element. 756 if (SrcTy->getScalarType()->isFP128Ty()) 757 return (Kind == TargetTransformInfo::SK_Broadcast ? NumVectors - 1 : 0); 758 759 switch (Kind) { 760 case TargetTransformInfo::SK_ExtractSubvector: 761 // ExtractSubvector Index indicates start offset. 762 763 // Extracting a subvector from first index is a noop. 764 return (Index == 0 ? 
0 : NumVectors); 765 766 case TargetTransformInfo::SK_Broadcast: 767 // Loop vectorizer calls here to figure out the extra cost of 768 // broadcasting a loaded value to all elements of a vector. Since vlrep 769 // loads and replicates with a single instruction, adjust the returned 770 // value. 771 return NumVectors - 1; 772 773 default: 774 775 // SystemZ supports single instruction permutation / replication. 776 return NumVectors; 777 } 778 } 779 780 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index, 781 SubTp); 782 } 783 784 // Return the log2 difference of the element sizes of the two vector types. 785 static unsigned getElSizeLog2Diff(Type *Ty0, Type *Ty1) { 786 unsigned Bits0 = Ty0->getScalarSizeInBits(); 787 unsigned Bits1 = Ty1->getScalarSizeInBits(); 788 789 if (Bits1 > Bits0) 790 return (Log2_32(Bits1) - Log2_32(Bits0)); 791 792 return (Log2_32(Bits0) - Log2_32(Bits1)); 793 } 794 795 // Return the number of instructions needed to truncate SrcTy to DstTy. 796 unsigned SystemZTTIImpl::getVectorTruncCost(Type *SrcTy, Type *DstTy) const { 797 assert (SrcTy->isVectorTy() && DstTy->isVectorTy()); 798 assert(SrcTy->getPrimitiveSizeInBits().getFixedValue() > 799 DstTy->getPrimitiveSizeInBits().getFixedValue() && 800 "Packing must reduce size of vector type."); 801 assert(cast<FixedVectorType>(SrcTy)->getNumElements() == 802 cast<FixedVectorType>(DstTy)->getNumElements() && 803 "Packing should not change number of elements."); 804 805 // TODO: Since fp32 is expanded, the extract cost should always be 0. 806 807 unsigned NumParts = getNumVectorRegs(SrcTy); 808 if (NumParts <= 2) 809 // Up to 2 vector registers can be truncated efficiently with pack or 810 // permute. The latter requires an immediate mask to be loaded, which 811 // typically gets hoisted out of a loop. TODO: return a good value for 812 // BB-VECTORIZER that includes the immediate loads, which we do not want 813 // to count for the loop vectorizer. 814 return 1; 815 816 unsigned Cost = 0; 817 unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy); 818 unsigned VF = cast<FixedVectorType>(SrcTy)->getNumElements(); 819 for (unsigned P = 0; P < Log2Diff; ++P) { 820 if (NumParts > 1) 821 NumParts /= 2; 822 Cost += NumParts; 823 } 824 825 // Currently, a general mix of permutes and pack instructions is output by 826 // isel, which follow the cost computation above except for this case which 827 // is one instruction less: 828 if (VF == 8 && SrcTy->getScalarSizeInBits() == 64 && 829 DstTy->getScalarSizeInBits() == 8) 830 Cost--; 831 832 return Cost; 833 } 834 835 // Return the cost of converting a vector bitmask produced by a compare 836 // (SrcTy), to the type of the select or extend instruction (DstTy). 837 unsigned SystemZTTIImpl::getVectorBitmaskConversionCost(Type *SrcTy, 838 Type *DstTy) const { 839 assert (SrcTy->isVectorTy() && DstTy->isVectorTy() && 840 "Should only be called with vector types."); 841 842 unsigned PackCost = 0; 843 unsigned SrcScalarBits = SrcTy->getScalarSizeInBits(); 844 unsigned DstScalarBits = DstTy->getScalarSizeInBits(); 845 unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy); 846 if (SrcScalarBits > DstScalarBits) 847 // The bitmask will be truncated. 848 PackCost = getVectorTruncCost(SrcTy, DstTy); 849 else if (SrcScalarBits < DstScalarBits) { 850 unsigned DstNumParts = getNumVectorRegs(DstTy); 851 // Each vector select needs its part of the bitmask unpacked. 852 PackCost = Log2Diff * DstNumParts; 853 // Extra cost for moving part of mask before unpacking. 
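    // E.g. widening a <4 x i32> compare mask for a <4 x i64> select takes
    // Log2Diff (1) unpacks for each of the 2 destination parts plus one such
    // move, i.e. 3 instructions in total.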
854 PackCost += DstNumParts - 1; 855 } 856 857 return PackCost; 858 } 859 860 // Return the type of the compared operands. This is needed to compute the 861 // cost for a Select / ZExt or SExt instruction. 862 static Type *getCmpOpsType(const Instruction *I, unsigned VF = 1) { 863 Type *OpTy = nullptr; 864 if (CmpInst *CI = dyn_cast<CmpInst>(I->getOperand(0))) 865 OpTy = CI->getOperand(0)->getType(); 866 else if (Instruction *LogicI = dyn_cast<Instruction>(I->getOperand(0))) 867 if (LogicI->getNumOperands() == 2) 868 if (CmpInst *CI0 = dyn_cast<CmpInst>(LogicI->getOperand(0))) 869 if (isa<CmpInst>(LogicI->getOperand(1))) 870 OpTy = CI0->getOperand(0)->getType(); 871 872 if (OpTy != nullptr) { 873 if (VF == 1) { 874 assert (!OpTy->isVectorTy() && "Expected scalar type"); 875 return OpTy; 876 } 877 // Return the potentially vectorized type based on 'I' and 'VF'. 'I' may 878 // be either scalar or already vectorized with a same or lesser VF. 879 Type *ElTy = OpTy->getScalarType(); 880 return FixedVectorType::get(ElTy, VF); 881 } 882 883 return nullptr; 884 } 885 886 // Get the cost of converting a boolean vector to a vector with same width 887 // and element size as Dst, plus the cost of zero extending if needed. 888 unsigned 889 SystemZTTIImpl::getBoolVecToIntConversionCost(unsigned Opcode, Type *Dst, 890 const Instruction *I) const { 891 auto *DstVTy = cast<FixedVectorType>(Dst); 892 unsigned VF = DstVTy->getNumElements(); 893 unsigned Cost = 0; 894 // If we know what the widths of the compared operands, get any cost of 895 // converting it to match Dst. Otherwise assume same widths. 896 Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr); 897 if (CmpOpTy != nullptr) 898 Cost = getVectorBitmaskConversionCost(CmpOpTy, Dst); 899 if (Opcode == Instruction::ZExt || Opcode == Instruction::UIToFP) 900 // One 'vn' per dst vector with an immediate mask. 901 Cost += getNumVectorRegs(Dst); 902 return Cost; 903 } 904 905 InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, 906 Type *Src, 907 TTI::CastContextHint CCH, 908 TTI::TargetCostKind CostKind, 909 const Instruction *I) const { 910 // FIXME: Can the logic below also be used for these cost kinds? 911 if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency) { 912 auto BaseCost = BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I); 913 return BaseCost == 0 ? BaseCost : 1; 914 } 915 916 unsigned DstScalarBits = Dst->getScalarSizeInBits(); 917 unsigned SrcScalarBits = Src->getScalarSizeInBits(); 918 919 if (!Src->isVectorTy()) { 920 if (Dst->isVectorTy()) 921 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I); 922 923 if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP) { 924 if (Src->isIntegerTy(128)) 925 return LIBCALL_COST; 926 if (SrcScalarBits >= 32 || 927 (I != nullptr && isa<LoadInst>(I->getOperand(0)))) 928 return 1; 929 return SrcScalarBits > 1 ? 
                                   2 /*i8/i16 extend*/ : 5 /*branch seq.*/;
    }

    if ((Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI) &&
        Dst->isIntegerTy(128))
      return LIBCALL_COST;

    if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt)) {
      if (Src->isIntegerTy(1)) {
        if (DstScalarBits == 128) {
          if (Opcode == Instruction::SExt && ST->hasVectorEnhancements3())
            return 0; /*VCEQQ*/
          return 5 /*branch seq.*/;
        }

        if (ST->hasLoadStoreOnCond2())
          return 2; // li 0; loc 1

        // This should be extension of a compare i1 result, which is done with
        // ipm and a varying sequence of instructions.
        unsigned Cost = 0;
        if (Opcode == Instruction::SExt)
          Cost = (DstScalarBits < 64 ? 3 : 4);
        if (Opcode == Instruction::ZExt)
          Cost = 3;
        Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I) : nullptr);
        if (CmpOpTy != nullptr && CmpOpTy->isFloatingPointTy())
          // If operands of an fp-type were compared, this costs +1.
          Cost++;
        return Cost;
      }
      else if (isInt128InVR(Dst)) {
        // Extensions from GPR to i128 (in VR) typically cost two instructions,
        // but a zero-extending load would be just one extra instruction.
        if (Opcode == Instruction::ZExt && I != nullptr)
          if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
            if (Ld->hasOneUse())
              return 1;
        return 2;
      }
    }

    if (Opcode == Instruction::Trunc && isInt128InVR(Src) && I != nullptr) {
      if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
        if (Ld->hasOneUse())
          return 0; // Will be converted to GPR load.
      bool OnlyTruncatingStores = true;
      for (const User *U : I->users())
        if (!isa<StoreInst>(U)) {
          OnlyTruncatingStores = false;
          break;
        }
      if (OnlyTruncatingStores)
        return 0;
      return 2; // Vector element extraction.
    }
  }
  else if (ST->hasVector()) {
    // Vector to scalar cast.
    auto *SrcVecTy = cast<FixedVectorType>(Src);
    auto *DstVecTy = dyn_cast<FixedVectorType>(Dst);
    if (!DstVecTy) {
      // TODO: tune vector-to-scalar cast.
      return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
    }
    unsigned VF = SrcVecTy->getNumElements();
    unsigned NumDstVectors = getNumVectorRegs(Dst);
    unsigned NumSrcVectors = getNumVectorRegs(Src);

    if (Opcode == Instruction::Trunc) {
      if (Src->getScalarSizeInBits() == Dst->getScalarSizeInBits())
        return 0; // Check for NOOP conversions.
      return getVectorTruncCost(Src, Dst);
    }

    if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
      if (SrcScalarBits >= 8) {
        // ZExt will use either a single unpack or a vector permute.
        if (Opcode == Instruction::ZExt)
          return NumDstVectors;

        // SExt will be handled with one unpack per doubling of width.
        unsigned NumUnpacks = getElSizeLog2Diff(Src, Dst);

        // For types that span multiple vector registers, some additional
        // instructions are used to set up the unpacking.
        unsigned NumSrcVectorOps =
            (NumUnpacks > 1 ?
(NumDstVectors - NumSrcVectors) 1017 : (NumDstVectors / 2)); 1018 1019 return (NumUnpacks * NumDstVectors) + NumSrcVectorOps; 1020 } 1021 else if (SrcScalarBits == 1) 1022 return getBoolVecToIntConversionCost(Opcode, Dst, I); 1023 } 1024 1025 if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP || 1026 Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI) { 1027 // TODO: Fix base implementation which could simplify things a bit here 1028 // (seems to miss on differentiating on scalar/vector types). 1029 1030 // Only 64 bit vector conversions are natively supported before z15. 1031 if (DstScalarBits == 64 || ST->hasVectorEnhancements2()) { 1032 if (SrcScalarBits == DstScalarBits) 1033 return NumDstVectors; 1034 1035 if (SrcScalarBits == 1) 1036 return getBoolVecToIntConversionCost(Opcode, Dst, I) + NumDstVectors; 1037 } 1038 1039 // Return the cost of multiple scalar invocation plus the cost of 1040 // inserting and extracting the values. Base implementation does not 1041 // realize float->int gets scalarized. 1042 InstructionCost ScalarCost = getCastInstrCost( 1043 Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind); 1044 InstructionCost TotCost = VF * ScalarCost; 1045 bool NeedsInserts = true, NeedsExtracts = true; 1046 // FP128 registers do not get inserted or extracted. 1047 if (DstScalarBits == 128 && 1048 (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP)) 1049 NeedsInserts = false; 1050 if (SrcScalarBits == 128 && 1051 (Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI)) 1052 NeedsExtracts = false; 1053 1054 TotCost += BaseT::getScalarizationOverhead(SrcVecTy, /*Insert*/ false, 1055 NeedsExtracts, CostKind); 1056 TotCost += BaseT::getScalarizationOverhead(DstVecTy, NeedsInserts, 1057 /*Extract*/ false, CostKind); 1058 1059 // FIXME: VF 2 for float<->i32 is currently just as expensive as for VF 4. 1060 if (VF == 2 && SrcScalarBits == 32 && DstScalarBits == 32) 1061 TotCost *= 2; 1062 1063 return TotCost; 1064 } 1065 1066 if (Opcode == Instruction::FPTrunc) { 1067 if (SrcScalarBits == 128) // fp128 -> double/float + inserts of elements. 1068 return VF /*ldxbr/lexbr*/ + 1069 BaseT::getScalarizationOverhead(DstVecTy, /*Insert*/ true, 1070 /*Extract*/ false, CostKind); 1071 else // double -> float 1072 return VF / 2 /*vledb*/ + std::max(1U, VF / 4 /*vperm*/); 1073 } 1074 1075 if (Opcode == Instruction::FPExt) { 1076 if (SrcScalarBits == 32 && DstScalarBits == 64) { 1077 // float -> double is very rare and currently unoptimized. Instead of 1078 // using vldeb, which can do two at a time, all conversions are 1079 // scalarized. 1080 return VF * 2; 1081 } 1082 // -> fp128. VF * lxdb/lxeb + extraction of elements. 1083 return VF + BaseT::getScalarizationOverhead(SrcVecTy, /*Insert*/ false, 1084 /*Extract*/ true, CostKind); 1085 } 1086 } 1087 1088 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I); 1089 } 1090 1091 // Scalar i8 / i16 operations will typically be made after first extending 1092 // the operands to i32. 1093 static unsigned getOperandsExtensionCost(const Instruction *I) { 1094 unsigned ExtCost = 0; 1095 for (Value *Op : I->operands()) 1096 // A load of i8 or i16 sign/zero extends to i32. 
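    // A constant operand can likewise be materialized directly at the wider
    // width, so neither loads nor constants add to the extension cost.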
1097 if (!isa<LoadInst>(Op) && !isa<ConstantInt>(Op)) 1098 ExtCost++; 1099 1100 return ExtCost; 1101 } 1102 1103 InstructionCost SystemZTTIImpl::getCmpSelInstrCost( 1104 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, 1105 TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info, 1106 TTI::OperandValueInfo Op2Info, const Instruction *I) const { 1107 if (CostKind != TTI::TCK_RecipThroughput) 1108 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, 1109 Op1Info, Op2Info); 1110 1111 if (!ValTy->isVectorTy()) { 1112 switch (Opcode) { 1113 case Instruction::ICmp: { 1114 // A loaded value compared with 0 with multiple users becomes Load and 1115 // Test. The load is then not foldable, so return 0 cost for the ICmp. 1116 unsigned ScalarBits = ValTy->getScalarSizeInBits(); 1117 if (I != nullptr && (ScalarBits == 32 || ScalarBits == 64)) 1118 if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0))) 1119 if (const ConstantInt *C = dyn_cast<ConstantInt>(I->getOperand(1))) 1120 if (!Ld->hasOneUse() && Ld->getParent() == I->getParent() && 1121 C->isZero()) 1122 return 0; 1123 1124 unsigned Cost = 1; 1125 if (ValTy->isIntegerTy() && ValTy->getScalarSizeInBits() <= 16) 1126 Cost += (I != nullptr ? getOperandsExtensionCost(I) : 2); 1127 return Cost; 1128 } 1129 case Instruction::Select: 1130 if (ValTy->isFloatingPointTy()) 1131 return 4; // No LOC for FP - costs a conditional jump. 1132 1133 // When selecting based on an i128 comparison, LOC / VSEL is possible 1134 // if i128 comparisons are directly supported. 1135 if (I != nullptr) 1136 if (ICmpInst *CI = dyn_cast<ICmpInst>(I->getOperand(0))) 1137 if (CI->getOperand(0)->getType()->isIntegerTy(128)) 1138 return ST->hasVectorEnhancements3() ? 1 : 4; 1139 1140 // Load On Condition / Select Register available, except for i128. 1141 return !isInt128InVR(ValTy) ? 1 : 4; 1142 } 1143 } 1144 else if (ST->hasVector()) { 1145 unsigned VF = cast<FixedVectorType>(ValTy)->getNumElements(); 1146 1147 // Called with a compare instruction. 1148 if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) { 1149 unsigned PredicateExtraCost = 0; 1150 if (I != nullptr) { 1151 // Some predicates cost one or two extra instructions. 1152 switch (cast<CmpInst>(I)->getPredicate()) { 1153 case CmpInst::Predicate::ICMP_NE: 1154 case CmpInst::Predicate::ICMP_UGE: 1155 case CmpInst::Predicate::ICMP_ULE: 1156 case CmpInst::Predicate::ICMP_SGE: 1157 case CmpInst::Predicate::ICMP_SLE: 1158 PredicateExtraCost = 1; 1159 break; 1160 case CmpInst::Predicate::FCMP_ONE: 1161 case CmpInst::Predicate::FCMP_ORD: 1162 case CmpInst::Predicate::FCMP_UEQ: 1163 case CmpInst::Predicate::FCMP_UNO: 1164 PredicateExtraCost = 2; 1165 break; 1166 default: 1167 break; 1168 } 1169 } 1170 1171 // Float is handled with 2*vmr[lh]f + 2*vldeb + vfchdb for each pair of 1172 // floats. FIXME: <2 x float> generates same code as <4 x float>. 1173 unsigned CmpCostPerVector = (ValTy->getScalarType()->isFloatTy() ? 10 : 1); 1174 unsigned NumVecs_cmp = getNumVectorRegs(ValTy); 1175 1176 unsigned Cost = (NumVecs_cmp * (CmpCostPerVector + PredicateExtraCost)); 1177 return Cost; 1178 } 1179 else { // Called with a select instruction. 1180 assert (Opcode == Instruction::Select); 1181 1182 // We can figure out the extra cost of packing / unpacking if the 1183 // instruction was passed and the compare instruction is found. 1184 unsigned PackCost = 0; 1185 Type *CmpOpTy = ((I != nullptr) ? 
getCmpOpsType(I, VF) : nullptr); 1186 if (CmpOpTy != nullptr) 1187 PackCost = 1188 getVectorBitmaskConversionCost(CmpOpTy, ValTy); 1189 1190 return getNumVectorRegs(ValTy) /*vsel*/ + PackCost; 1191 } 1192 } 1193 1194 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, 1195 Op1Info, Op2Info); 1196 } 1197 1198 InstructionCost SystemZTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, 1199 TTI::TargetCostKind CostKind, 1200 unsigned Index, 1201 const Value *Op0, 1202 const Value *Op1) const { 1203 if (Opcode == Instruction::InsertElement) { 1204 // Vector Element Load. 1205 if (Op1 != nullptr && isFreeEltLoad(Op1)) 1206 return 0; 1207 1208 // vlvgp will insert two grs into a vector register, so count half the 1209 // number of instructions as an estimate when we don't have the full 1210 // picture (as in getScalarizationOverhead()). 1211 if (Val->isIntOrIntVectorTy(64)) 1212 return ((Index % 2 == 0) ? 1 : 0); 1213 } 1214 1215 if (Opcode == Instruction::ExtractElement) { 1216 int Cost = ((getScalarSizeInBits(Val) == 1) ? 2 /*+test-under-mask*/ : 1); 1217 1218 // Give a slight penalty for moving out of vector pipeline to FXU unit. 1219 if (Index == 0 && Val->isIntOrIntVectorTy()) 1220 Cost += 1; 1221 1222 return Cost; 1223 } 1224 1225 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1); 1226 } 1227 1228 // Check if a load may be folded as a memory operand in its user. 1229 bool SystemZTTIImpl::isFoldableLoad(const LoadInst *Ld, 1230 const Instruction *&FoldedValue) const { 1231 if (!Ld->hasOneUse()) 1232 return false; 1233 FoldedValue = Ld; 1234 const Instruction *UserI = cast<Instruction>(*Ld->user_begin()); 1235 unsigned LoadedBits = getScalarSizeInBits(Ld->getType()); 1236 unsigned TruncBits = 0; 1237 unsigned SExtBits = 0; 1238 unsigned ZExtBits = 0; 1239 if (UserI->hasOneUse()) { 1240 unsigned UserBits = UserI->getType()->getScalarSizeInBits(); 1241 if (isa<TruncInst>(UserI)) 1242 TruncBits = UserBits; 1243 else if (isa<SExtInst>(UserI)) 1244 SExtBits = UserBits; 1245 else if (isa<ZExtInst>(UserI)) 1246 ZExtBits = UserBits; 1247 } 1248 if (TruncBits || SExtBits || ZExtBits) { 1249 FoldedValue = UserI; 1250 UserI = cast<Instruction>(*UserI->user_begin()); 1251 // Load (single use) -> trunc/extend (single use) -> UserI 1252 } 1253 if ((UserI->getOpcode() == Instruction::Sub || 1254 UserI->getOpcode() == Instruction::SDiv || 1255 UserI->getOpcode() == Instruction::UDiv) && 1256 UserI->getOperand(1) != FoldedValue) 1257 return false; // Not commutative, only RHS foldable. 1258 // LoadOrTruncBits holds the number of effectively loaded bits, but 0 if an 1259 // extension was made of the load. 1260 unsigned LoadOrTruncBits = 1261 ((SExtBits || ZExtBits) ? 0 : (TruncBits ? TruncBits : LoadedBits)); 1262 switch (UserI->getOpcode()) { 1263 case Instruction::Add: // SE: 16->32, 16/32->64, z14:16->64. 
ZE: 32->64 1264 case Instruction::Sub: 1265 case Instruction::ICmp: 1266 if (LoadedBits == 32 && ZExtBits == 64) 1267 return true; 1268 [[fallthrough]]; 1269 case Instruction::Mul: // SE: 16->32, 32->64, z14:16->64 1270 if (UserI->getOpcode() != Instruction::ICmp) { 1271 if (LoadedBits == 16 && 1272 (SExtBits == 32 || 1273 (SExtBits == 64 && ST->hasMiscellaneousExtensions2()))) 1274 return true; 1275 if (LoadOrTruncBits == 16) 1276 return true; 1277 } 1278 [[fallthrough]]; 1279 case Instruction::SDiv:// SE: 32->64 1280 if (LoadedBits == 32 && SExtBits == 64) 1281 return true; 1282 [[fallthrough]]; 1283 case Instruction::UDiv: 1284 case Instruction::And: 1285 case Instruction::Or: 1286 case Instruction::Xor: 1287 // This also makes sense for float operations, but disabled for now due 1288 // to regressions. 1289 // case Instruction::FCmp: 1290 // case Instruction::FAdd: 1291 // case Instruction::FSub: 1292 // case Instruction::FMul: 1293 // case Instruction::FDiv: 1294 1295 // All possible extensions of memory checked above. 1296 1297 // Comparison between memory and immediate. 1298 if (UserI->getOpcode() == Instruction::ICmp) 1299 if (ConstantInt *CI = dyn_cast<ConstantInt>(UserI->getOperand(1))) 1300 if (CI->getValue().isIntN(16)) 1301 return true; 1302 return (LoadOrTruncBits == 32 || LoadOrTruncBits == 64); 1303 break; 1304 } 1305 return false; 1306 } 1307 1308 static bool isBswapIntrinsicCall(const Value *V) { 1309 if (const Instruction *I = dyn_cast<Instruction>(V)) 1310 if (auto *CI = dyn_cast<CallInst>(I)) 1311 if (auto *F = CI->getCalledFunction()) 1312 if (F->getIntrinsicID() == Intrinsic::bswap) 1313 return true; 1314 return false; 1315 } 1316 1317 InstructionCost SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, 1318 Align Alignment, 1319 unsigned AddressSpace, 1320 TTI::TargetCostKind CostKind, 1321 TTI::OperandValueInfo OpInfo, 1322 const Instruction *I) const { 1323 assert(!Src->isVoidTy() && "Invalid type"); 1324 1325 // TODO: Handle other cost kinds. 1326 if (CostKind != TTI::TCK_RecipThroughput) 1327 return 1; 1328 1329 if (!Src->isVectorTy() && Opcode == Instruction::Load && I != nullptr) { 1330 // Store the load or its truncated or extended value in FoldedValue. 1331 const Instruction *FoldedValue = nullptr; 1332 if (isFoldableLoad(cast<LoadInst>(I), FoldedValue)) { 1333 const Instruction *UserI = cast<Instruction>(*FoldedValue->user_begin()); 1334 assert (UserI->getNumOperands() == 2 && "Expected a binop."); 1335 1336 // UserI can't fold two loads, so in that case return 0 cost only 1337 // half of the time. 1338 for (unsigned i = 0; i < 2; ++i) { 1339 if (UserI->getOperand(i) == FoldedValue) 1340 continue; 1341 1342 if (Instruction *OtherOp = dyn_cast<Instruction>(UserI->getOperand(i))){ 1343 LoadInst *OtherLoad = dyn_cast<LoadInst>(OtherOp); 1344 if (!OtherLoad && 1345 (isa<TruncInst>(OtherOp) || isa<SExtInst>(OtherOp) || 1346 isa<ZExtInst>(OtherOp))) 1347 OtherLoad = dyn_cast<LoadInst>(OtherOp->getOperand(0)); 1348 if (OtherLoad && isFoldableLoad(OtherLoad, FoldedValue/*dummy*/)) 1349 return i == 0; // Both operands foldable. 1350 } 1351 } 1352 1353 return 0; // Only I is foldable in user. 1354 } 1355 } 1356 1357 // Type legalization (via getNumberOfParts) can't handle structs 1358 if (TLI->getValueType(DL, Src, true) == MVT::Other) 1359 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, 1360 CostKind); 1361 1362 // FP128 is a legal type but kept in a register pair on older CPUs. 
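  // Loading or storing such a value therefore takes two instructions, one for
  // each 64-bit half of the register pair.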
1363 if (Src->isFP128Ty() && !ST->hasVectorEnhancements1()) 1364 return 2; 1365 1366 unsigned NumOps = 1367 (Src->isVectorTy() ? getNumVectorRegs(Src) : getNumberOfParts(Src)); 1368 1369 // Store/Load reversed saves one instruction. 1370 if (((!Src->isVectorTy() && NumOps == 1) || ST->hasVectorEnhancements2()) && 1371 I != nullptr) { 1372 if (Opcode == Instruction::Load && I->hasOneUse()) { 1373 const Instruction *LdUser = cast<Instruction>(*I->user_begin()); 1374 // In case of load -> bswap -> store, return normal cost for the load. 1375 if (isBswapIntrinsicCall(LdUser) && 1376 (!LdUser->hasOneUse() || !isa<StoreInst>(*LdUser->user_begin()))) 1377 return 0; 1378 } 1379 else if (const StoreInst *SI = dyn_cast<StoreInst>(I)) { 1380 const Value *StoredVal = SI->getValueOperand(); 1381 if (StoredVal->hasOneUse() && isBswapIntrinsicCall(StoredVal)) 1382 return 0; 1383 } 1384 } 1385 1386 return NumOps; 1387 } 1388 1389 // The generic implementation of getInterleavedMemoryOpCost() is based on 1390 // adding costs of the memory operations plus all the extracts and inserts 1391 // needed for using / defining the vector operands. The SystemZ version does 1392 // roughly the same but bases the computations on vector permutations 1393 // instead. 1394 InstructionCost SystemZTTIImpl::getInterleavedMemoryOpCost( 1395 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, 1396 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, 1397 bool UseMaskForCond, bool UseMaskForGaps) const { 1398 if (UseMaskForCond || UseMaskForGaps) 1399 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, 1400 Alignment, AddressSpace, CostKind, 1401 UseMaskForCond, UseMaskForGaps); 1402 assert(isa<VectorType>(VecTy) && 1403 "Expect a vector type for interleaved memory op"); 1404 1405 unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements(); 1406 assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor"); 1407 unsigned VF = NumElts / Factor; 1408 unsigned NumEltsPerVecReg = (128U / getScalarSizeInBits(VecTy)); 1409 unsigned NumVectorMemOps = getNumVectorRegs(VecTy); 1410 unsigned NumPermutes = 0; 1411 1412 if (Opcode == Instruction::Load) { 1413 // Loading interleave groups may have gaps, which may mean fewer 1414 // loads. Find out how many vectors will be loaded in total, and in how 1415 // many of them each value will be in. 1416 BitVector UsedInsts(NumVectorMemOps, false); 1417 std::vector<BitVector> ValueVecs(Factor, BitVector(NumVectorMemOps, false)); 1418 for (unsigned Index : Indices) 1419 for (unsigned Elt = 0; Elt < VF; ++Elt) { 1420 unsigned Vec = (Index + Elt * Factor) / NumEltsPerVecReg; 1421 UsedInsts.set(Vec); 1422 ValueVecs[Index].set(Vec); 1423 } 1424 NumVectorMemOps = UsedInsts.count(); 1425 1426 for (unsigned Index : Indices) { 1427 // Estimate that each loaded source vector containing this Index 1428 // requires one operation, except that vperm can handle two input 1429 // registers first time for each dst vector. 1430 unsigned NumSrcVecs = ValueVecs[Index].count(); 1431 unsigned NumDstVecs = divideCeil(VF * getScalarSizeInBits(VecTy), 128U); 1432 assert (NumSrcVecs >= NumDstVecs && "Expected at least as many sources"); 1433 NumPermutes += std::max(1U, NumSrcVecs - NumDstVecs); 1434 } 1435 } else { 1436 // Estimate the permutes for each stored vector as the smaller of the 1437 // number of elements and the number of source vectors. Subtract one per 1438 // dst vector for vperm (S.A.). 
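    // E.g. storing a <16 x i32> group with Factor 4: NumEltsPerVecReg = 4, so
    // NumSrcVecs = 4 and NumDstVecs = 4, giving 4 * 4 - 4 = 12 permutes on
    // top of the 4 vector stores.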
1439 unsigned NumSrcVecs = std::min(NumEltsPerVecReg, Factor); 1440 unsigned NumDstVecs = NumVectorMemOps; 1441 NumPermutes += (NumDstVecs * NumSrcVecs) - NumDstVecs; 1442 } 1443 1444 // Cost of load/store operations and the permutations needed. 1445 return NumVectorMemOps + NumPermutes; 1446 } 1447 1448 InstructionCost getIntAddReductionCost(unsigned NumVec, unsigned ScalarBits) { 1449 InstructionCost Cost = 0; 1450 // Binary Tree of N/2 + N/4 + ... operations yields N - 1 operations total. 1451 Cost += NumVec - 1; 1452 // For integer adds, VSUM creates shorter reductions on the final vector. 1453 Cost += (ScalarBits < 32) ? 3 : 2; 1454 return Cost; 1455 } 1456 1457 InstructionCost getFastReductionCost(unsigned NumVec, unsigned NumElems, 1458 unsigned ScalarBits) { 1459 unsigned NumEltsPerVecReg = (SystemZ::VectorBits / ScalarBits); 1460 InstructionCost Cost = 0; 1461 // Binary Tree of N/2 + N/4 + ... operations yields N - 1 operations total. 1462 Cost += NumVec - 1; 1463 // For each shuffle / arithmetic layer, we need 2 instructions, and we need 1464 // log2(Elements in Last Vector) layers. 1465 Cost += 2 * Log2_32_Ceil(std::min(NumElems, NumEltsPerVecReg)); 1466 return Cost; 1467 } 1468 1469 inline bool customCostReductions(unsigned Opcode) { 1470 return Opcode == Instruction::FAdd || Opcode == Instruction::FMul || 1471 Opcode == Instruction::Add || Opcode == Instruction::Mul; 1472 } 1473 1474 InstructionCost 1475 SystemZTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, 1476 std::optional<FastMathFlags> FMF, 1477 TTI::TargetCostKind CostKind) const { 1478 unsigned ScalarBits = Ty->getScalarSizeInBits(); 1479 // The following is only for subtargets with vector math, non-ordered 1480 // reductions, and reasonable scalar sizes for int and fp add/mul. 1481 if (customCostReductions(Opcode) && ST->hasVector() && 1482 !TTI::requiresOrderedReduction(FMF) && 1483 ScalarBits <= SystemZ::VectorBits) { 1484 unsigned NumVectors = getNumVectorRegs(Ty); 1485 unsigned NumElems = ((FixedVectorType *)Ty)->getNumElements(); 1486 // Integer Add is using custom code gen, that needs to be accounted for. 1487 if (Opcode == Instruction::Add) 1488 return getIntAddReductionCost(NumVectors, ScalarBits); 1489 // The base cost is the same across all other arithmetic instructions 1490 InstructionCost Cost = 1491 getFastReductionCost(NumVectors, NumElems, ScalarBits); 1492 // But we need to account for the final op involving the scalar operand. 1493 if ((Opcode == Instruction::FAdd) || (Opcode == Instruction::FMul)) 1494 Cost += 1; 1495 return Cost; 1496 } 1497 // otherwise, fall back to the standard implementation 1498 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind); 1499 } 1500 1501 InstructionCost 1502 SystemZTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, 1503 FastMathFlags FMF, 1504 TTI::TargetCostKind CostKind) const { 1505 // Return custom costs only on subtargets with vector enhancements. 1506 if (ST->hasVectorEnhancements1()) { 1507 unsigned NumVectors = getNumVectorRegs(Ty); 1508 unsigned NumElems = ((FixedVectorType *)Ty)->getNumElements(); 1509 unsigned ScalarBits = Ty->getScalarSizeInBits(); 1510 InstructionCost Cost = 0; 1511 // Binary Tree of N/2 + N/4 + ... operations yields N - 1 operations total. 1512 Cost += NumVectors - 1; 1513 // For the final vector, we need shuffle + min/max operations, and 1514 // we need #Elements - 1 of them. 
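    // E.g. a <4 x i32> min/max reduction fits in a single vector register: the
    // binary tree above contributes 0 and the final vector needs
    // 2 * (4 - 1) = 6 shuffle and min/max instructions.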
1515 Cost += 2 * (std::min(NumElems, SystemZ::VectorBits / ScalarBits) - 1); 1516 return Cost; 1517 } 1518 // For other targets, fall back to the standard implementation 1519 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind); 1520 } 1521 1522 static int 1523 getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, 1524 const SmallVectorImpl<Type *> &ParamTys) { 1525 if (RetTy->isVectorTy() && ID == Intrinsic::bswap) 1526 return getNumVectorRegs(RetTy); // VPERM 1527 1528 return -1; 1529 } 1530 1531 InstructionCost 1532 SystemZTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, 1533 TTI::TargetCostKind CostKind) const { 1534 InstructionCost Cost = getVectorIntrinsicInstrCost( 1535 ICA.getID(), ICA.getReturnType(), ICA.getArgTypes()); 1536 if (Cost != -1) 1537 return Cost; 1538 return BaseT::getIntrinsicInstrCost(ICA, CostKind); 1539 } 1540 1541 bool SystemZTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const { 1542 // Always expand on Subtargets without vector instructions. 1543 if (!ST->hasVector()) 1544 return true; 1545 1546 // Whether or not to expand is a per-intrinsic decision. 1547 switch (II->getIntrinsicID()) { 1548 default: 1549 return true; 1550 // Do not expand vector.reduce.add... 1551 case Intrinsic::vector_reduce_add: 1552 auto *VType = cast<FixedVectorType>(II->getOperand(0)->getType()); 1553 // ...unless the scalar size is i64 or larger, 1554 // or the operand vector is not full, since the 1555 // performance benefit is dubious in those cases. 1556 return VType->getScalarSizeInBits() >= 64 || 1557 VType->getPrimitiveSizeInBits() < SystemZ::VectorBits; 1558 } 1559 } 1560