//===-- SystemZTargetTransformInfo.cpp - SystemZ-specific TTI -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a TargetTransformInfo analysis pass specific to the
// SystemZ target machine. It uses the target's detailed information to provide
// more precise answers to certain TTI queries, while letting the target
// independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//

#include "SystemZTargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"

using namespace llvm;

#define DEBUG_TYPE "systemztti"

//===----------------------------------------------------------------------===//
//
// SystemZ cost model.
//
//===----------------------------------------------------------------------===//

static bool isUsedAsMemCpySource(const Value *V, bool &OtherUse) {
  bool UsedAsMemCpySource = false;
  for (const User *U : V->users())
    if (const Instruction *User = dyn_cast<Instruction>(U)) {
      if (isa<BitCastInst>(User) || isa<GetElementPtrInst>(User)) {
        UsedAsMemCpySource |= isUsedAsMemCpySource(User, OtherUse);
        continue;
      }
      if (const MemCpyInst *Memcpy = dyn_cast<MemCpyInst>(User)) {
        if (Memcpy->getOperand(1) == V && !Memcpy->isVolatile()) {
          UsedAsMemCpySource = true;
          continue;
        }
      }
      OtherUse = true;
    }
  return UsedAsMemCpySource;
}

static void countNumMemAccesses(const Value *Ptr, unsigned &NumStores,
                                unsigned &NumLoads, const Function *F) {
  if (!isa<PointerType>(Ptr->getType()))
    return;
  for (const User *U : Ptr->users())
    if (const Instruction *User = dyn_cast<Instruction>(U)) {
      if (User->getParent()->getParent() == F) {
        if (const auto *SI = dyn_cast<StoreInst>(User)) {
          if (SI->getPointerOperand() == Ptr && !SI->isVolatile())
            NumStores++;
        } else if (const auto *LI = dyn_cast<LoadInst>(User)) {
          if (LI->getPointerOperand() == Ptr && !LI->isVolatile())
            NumLoads++;
        } else if (const auto *GEP = dyn_cast<GetElementPtrInst>(User)) {
          if (GEP->getPointerOperand() == Ptr)
            countNumMemAccesses(GEP, NumStores, NumLoads, F);
        }
      }
    }
}

unsigned SystemZTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
  unsigned Bonus = 0;
  const Function *Caller = CB->getParent()->getParent();
  const Function *Callee = CB->getCalledFunction();
  if (!Callee)
    return 0;

  // Increase the threshold if an incoming argument is used only as a memcpy
  // source.
  for (const Argument &Arg : Callee->args()) {
    bool OtherUse = false;
    if (isUsedAsMemCpySource(&Arg, OtherUse) && !OtherUse) {
      Bonus = 1000;
      break;
    }
  }

  // Give a bonus for globals that are heavily used in both the caller and a
  // relatively small callee.
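  // For example, a small callee that repeatedly loads and stores a global
  // counter which the caller also accesses frequently would qualify, since
  // inlining then lets those accesses be scheduled and folded together.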
  unsigned InstrCount = 0;
  SmallDenseMap<const Value *, unsigned> Ptr2NumUses;
  for (auto &I : instructions(Callee)) {
    if (++InstrCount == 200) {
      Ptr2NumUses.clear();
      break;
    }
    if (const auto *SI = dyn_cast<StoreInst>(&I)) {
      if (!SI->isVolatile())
        if (auto *GV = dyn_cast<GlobalVariable>(SI->getPointerOperand()))
          Ptr2NumUses[GV]++;
    } else if (const auto *LI = dyn_cast<LoadInst>(&I)) {
      if (!LI->isVolatile())
        if (auto *GV = dyn_cast<GlobalVariable>(LI->getPointerOperand()))
          Ptr2NumUses[GV]++;
    } else if (const auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
      if (auto *GV = dyn_cast<GlobalVariable>(GEP->getPointerOperand())) {
        unsigned NumStores = 0, NumLoads = 0;
        countNumMemAccesses(GEP, NumStores, NumLoads, Callee);
        Ptr2NumUses[GV] += NumLoads + NumStores;
      }
    }
  }

  for (auto [Ptr, NumCalleeUses] : Ptr2NumUses)
    if (NumCalleeUses > 10) {
      unsigned CallerStores = 0, CallerLoads = 0;
      countNumMemAccesses(Ptr, CallerStores, CallerLoads, Caller);
      if (CallerStores + CallerLoads > 10) {
        Bonus = 1000;
        break;
      }
    }

  // Give a bonus when the Callee accesses an Alloca of the Caller heavily.
  unsigned NumStores = 0;
  unsigned NumLoads = 0;
  for (unsigned OpIdx = 0; OpIdx != Callee->arg_size(); ++OpIdx) {
    Value *CallerArg = CB->getArgOperand(OpIdx);
    Argument *CalleeArg = Callee->getArg(OpIdx);
    if (isa<AllocaInst>(CallerArg))
      countNumMemAccesses(CalleeArg, NumStores, NumLoads, Callee);
  }
  if (NumLoads > 10)
    Bonus += NumLoads * 50;
  if (NumStores > 10)
    Bonus += NumStores * 50;
  Bonus = std::min(Bonus, unsigned(1000));

  LLVM_DEBUG(if (Bonus)
               dbgs() << "++ SZTTI Adding inlining bonus: " << Bonus << "\n";);
  return Bonus;
}

InstructionCost
SystemZTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                              TTI::TargetCostKind CostKind) const {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;
  // No cost model has been implemented yet for integers wider than 128 bits.
  if ((!ST->hasVector() && BitSize > 64) || BitSize > 128)
    return TTI::TCC_Free;

  if (Imm == 0)
    return TTI::TCC_Free;

  if (Imm.getBitWidth() <= 64) {
    // Constants loaded via lgfi.
    if (isInt<32>(Imm.getSExtValue()))
      return TTI::TCC_Basic;
    // Constants loaded via llilf.
    if (isUInt<32>(Imm.getZExtValue()))
      return TTI::TCC_Basic;
    // Constants loaded via llihf:
    if ((Imm.getZExtValue() & 0xffffffff) == 0)
      return TTI::TCC_Basic;

    return 2 * TTI::TCC_Basic;
  }

  // i128 immediates are loaded from the constant pool.
  return 2 * TTI::TCC_Basic;
}

InstructionCost SystemZTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                                  const APInt &Imm, Type *Ty,
                                                  TTI::TargetCostKind CostKind,
                                                  Instruction *Inst) const {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;
  // No cost model has been implemented yet for integers wider than 64 bits.
  if (BitSize > 64)
    return TTI::TCC_Free;

  switch (Opcode) {
  default:
    return TTI::TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr. This prevents the
    // creation of new constants for every base constant that gets constant
    // folded with the offset.
    if (Idx == 0)
      return 2 * TTI::TCC_Basic;
    return TTI::TCC_Free;
  case Instruction::Store:
    if (Idx == 0 && Imm.getBitWidth() <= 64) {
      // Any 8-bit immediate store can be implemented via mvi.
      if (BitSize == 8)
        return TTI::TCC_Free;
      // 16-bit immediate values can be stored via mvhhi/mvhi/mvghi.
      if (isInt<16>(Imm.getSExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Instruction::ICmp:
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      // Comparisons against signed 32-bit immediates implemented via cgfi.
      if (isInt<32>(Imm.getSExtValue()))
        return TTI::TCC_Free;
      // Comparisons against unsigned 32-bit immediates implemented via clgfi.
      if (isUInt<32>(Imm.getZExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Instruction::Add:
  case Instruction::Sub:
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      // We use algfi/slgfi to add/subtract 32-bit unsigned immediates.
      if (isUInt<32>(Imm.getZExtValue()))
        return TTI::TCC_Free;
      // Or their negation, by swapping addition vs. subtraction.
      if (isUInt<32>(-Imm.getSExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Instruction::Mul:
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      // We use msgfi to multiply by 32-bit signed immediates.
      if (isInt<32>(Imm.getSExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Instruction::Or:
  case Instruction::Xor:
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      // Masks supported by oilf/xilf.
      if (isUInt<32>(Imm.getZExtValue()))
        return TTI::TCC_Free;
      // Masks supported by oihf/xihf.
      if ((Imm.getZExtValue() & 0xffffffff) == 0)
        return TTI::TCC_Free;
    }
    break;
  case Instruction::And:
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      // Any 32-bit AND operation can be implemented via nilf.
      if (BitSize <= 32)
        return TTI::TCC_Free;
      // 64-bit masks supported by nilf.
      if (isUInt<32>(~Imm.getZExtValue()))
        return TTI::TCC_Free;
      // 64-bit masks supported by nilh.
      if ((Imm.getZExtValue() & 0xffffffff) == 0xffffffff)
        return TTI::TCC_Free;
      // Some 64-bit AND operations can be implemented via risbg.
      const SystemZInstrInfo *TII = ST->getInstrInfo();
      unsigned Start, End;
      if (TII->isRxSBGMask(Imm.getZExtValue(), BitSize, Start, End))
        return TTI::TCC_Free;
    }
    break;
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    // Always return TCC_Free for the shift value of a shift instruction.
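    // An immediate shift amount is encoded directly in the shift instruction,
    // so it never needs to be materialized in a register.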
    if (Idx == 1)
      return TTI::TCC_Free;
    break;
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::Trunc:
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::IntToPtr:
  case Instruction::PtrToInt:
  case Instruction::BitCast:
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Select:
  case Instruction::Ret:
  case Instruction::Load:
    break;
  }

  return SystemZTTIImpl::getIntImmCost(Imm, Ty, CostKind);
}

InstructionCost
SystemZTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
                                    const APInt &Imm, Type *Ty,
                                    TTI::TargetCostKind CostKind) const {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;
  // No cost model has been implemented yet for integers wider than 64 bits.
  if (BitSize > 64)
    return TTI::TCC_Free;

  switch (IID) {
  default:
    return TTI::TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
    // These get expanded to include a normal addition/subtraction.
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      if (isUInt<32>(Imm.getZExtValue()))
        return TTI::TCC_Free;
      if (isUInt<32>(-Imm.getSExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow:
    // These get expanded to include a normal multiplication.
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      if (isInt<32>(Imm.getSExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  }
  return SystemZTTIImpl::getIntImmCost(Imm, Ty, CostKind);
}

TargetTransformInfo::PopcntSupportKind
SystemZTTIImpl::getPopcntSupport(unsigned TyWidth) const {
  assert(isPowerOf2_32(TyWidth) && "Type width must be power of 2");
  if (ST->hasPopulationCount() && TyWidth <= 64)
    return TTI::PSK_FastHardware;
  return TTI::PSK_Software;
}

void SystemZTTIImpl::getUnrollingPreferences(
    Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP,
    OptimizationRemarkEmitter *ORE) const {
  // Find out if L contains a call, what the machine instruction count
  // estimate is, and how many stores there are.
  bool HasCall = false;
  InstructionCost NumStores = 0;
  for (auto &BB : L->blocks())
    for (auto &I : *BB) {
      if (isa<CallInst>(&I) || isa<InvokeInst>(&I)) {
        if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
          if (isLoweredToCall(F))
            HasCall = true;
          if (F->getIntrinsicID() == Intrinsic::memcpy ||
              F->getIntrinsicID() == Intrinsic::memset)
            NumStores++;
        } else { // indirect call.
          HasCall = true;
        }
      }
      if (isa<StoreInst>(&I)) {
        Type *MemAccessTy = I.getOperand(0)->getType();
        NumStores += getMemoryOpCost(Instruction::Store, MemAccessTy, Align(),
                                     0, TTI::TCK_RecipThroughput);
      }
    }

  // The z13 processor will run out of store tags if too many stores
  // are fed into it too quickly. Therefore make sure there are not
  // too many stores in the resulting unrolled loop.
  unsigned const NumStoresVal = NumStores.getValue();
  unsigned const Max = (NumStoresVal ? (12 / NumStoresVal) : UINT_MAX);

  if (HasCall) {
    // If the loop contains calls, only allow full unrolling.
    UP.FullUnrollMaxCount = Max;
    UP.MaxCount = 1;
    return;
  }

  UP.MaxCount = Max;
  if (UP.MaxCount <= 1)
    return;

  // Allow partial and runtime trip count unrolling.
  UP.Partial = UP.Runtime = true;

  UP.PartialThreshold = 75;
  UP.DefaultUnrollRuntimeCount = 4;

  // Allow expensive instructions in the pre-header of the loop.
  UP.AllowExpensiveTripCount = true;

  UP.Force = true;
}

void SystemZTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                           TTI::PeelingPreferences &PP) const {
  BaseT::getPeelingPreferences(L, SE, PP);
}

bool SystemZTTIImpl::isLSRCostLess(
    const TargetTransformInfo::LSRCost &C1,
    const TargetTransformInfo::LSRCost &C2) const {
  // SystemZ specific: check instruction count (first), and don't care about
  // ImmCost, since offsets are checked explicitly.
  return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
                  C1.NumIVMuls, C1.NumBaseAdds,
                  C1.ScaleCost, C1.SetupCost) <
         std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
                  C2.NumIVMuls, C2.NumBaseAdds,
                  C2.ScaleCost, C2.SetupCost);
}

unsigned SystemZTTIImpl::getNumberOfRegisters(unsigned ClassID) const {
  bool Vector = (ClassID == 1);
  if (!Vector)
    // Discount the stack pointer. Also leave out %r0, since it can't
    // be used in an address.
    return 14;
  if (ST->hasVector())
    return 32;
  return 0;
}

TypeSize
SystemZTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(64);
  case TargetTransformInfo::RGK_FixedWidthVector:
    return TypeSize::getFixed(ST->hasVector() ? 128 : 0);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(0);
  }

  llvm_unreachable("Unsupported register kind");
}

unsigned SystemZTTIImpl::getMinPrefetchStride(unsigned NumMemAccesses,
                                              unsigned NumStridedMemAccesses,
                                              unsigned NumPrefetches,
                                              bool HasCall) const {
  // Don't prefetch a loop with many far-apart accesses.
  if (NumPrefetches > 16)
    return UINT_MAX;

  // Emit prefetch instructions for smaller strides in cases where we think
  // the hardware prefetcher might not be able to keep up.
  if (NumStridedMemAccesses > 32 && !HasCall &&
      (NumMemAccesses - NumStridedMemAccesses) * 32 <= NumStridedMemAccesses)
    return 1;

  return ST->hasMiscellaneousExtensions3() ?
             8192 : 2048;
}

bool SystemZTTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) const {
  EVT VT = TLI->getValueType(DL, DataType);
  return (VT.isScalarInteger() && TLI->isTypeLegal(VT));
}

static bool isFreeEltLoad(const Value *Op) {
  if (isa<LoadInst>(Op) && Op->hasOneUse()) {
    const Instruction *UserI = cast<Instruction>(*Op->user_begin());
    return !isa<StoreInst>(UserI); // Prefer MVC
  }
  return false;
}

InstructionCost SystemZTTIImpl::getScalarizationOverhead(
    VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
    TTI::TargetCostKind CostKind, bool ForPoisonSrc,
    ArrayRef<Value *> VL) const {
  unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
  InstructionCost Cost = 0;

  if (Insert && Ty->isIntOrIntVectorTy(64)) {
    // VLVGP will insert two GPRs with one instruction, while VLE will load
    // an element directly with no extra cost.
    assert((VL.empty() || VL.size() == NumElts) &&
           "Type does not match the number of values.");
    InstructionCost CurrVectorCost = 0;
    for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
      if (DemandedElts[Idx] && !(VL.size() && isFreeEltLoad(VL[Idx])))
        ++CurrVectorCost;
      if (Idx % 2 == 1) {
        Cost += std::min(InstructionCost(1), CurrVectorCost);
        CurrVectorCost = 0;
      }
    }
    Insert = false;
  }

  Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
                                          CostKind, ForPoisonSrc, VL);
  return Cost;
}

// Return the bit size for the scalar type or vector element
// type. getScalarSizeInBits() returns 0 for a pointer type.
static unsigned getScalarSizeInBits(Type *Ty) {
  unsigned Size =
      (Ty->isPtrOrPtrVectorTy() ? 64U : Ty->getScalarSizeInBits());
  assert(Size > 0 && "Element must have non-zero size.");
  return Size;
}

// getNumberOfParts() calls getTypeLegalizationCost() which splits the vector
// type until it is legal. This would e.g. return 4 for <6 x i64>, instead of
// 3.
static unsigned getNumVectorRegs(Type *Ty) {
  auto *VTy = cast<FixedVectorType>(Ty);
  unsigned WideBits = getScalarSizeInBits(Ty) * VTy->getNumElements();
  assert(WideBits > 0 && "Could not compute size of vector");
  return ((WideBits % 128U) ? ((WideBits / 128U) + 1) : (WideBits / 128U));
}

InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args, const Instruction *CxtI) const {

  // TODO: Handle more cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
                                         Op2Info, Args, CxtI);

  // TODO: return a good value for BB-VECTORIZER that includes the
  // immediate loads, which we do not want to count for the loop
  // vectorizer, since they are hopefully hoisted out of the loop. This
  // would require a new parameter 'InLoop', but not sure if constant
  // args are common enough to motivate this.

  unsigned ScalarBits = Ty->getScalarSizeInBits();

  // There are three cases of division and remainder: Dividing with a register
  // needs a divide instruction. A divisor which is a power of two constant
  // can be implemented with a sequence of shifts. Any other constant needs a
  // multiply and shifts.
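  // For example, a division by the constant 10 is assumed to become a
  // multiply plus shifts (DivMulSeqCost), while a division by 8 only needs
  // shifts (SDivPow2Cost when signed, a single shift when unsigned).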
  const unsigned DivInstrCost = 20;
  const unsigned DivMulSeqCost = 10;
  const unsigned SDivPow2Cost = 4;

  bool SignedDivRem =
      Opcode == Instruction::SDiv || Opcode == Instruction::SRem;
  bool UnsignedDivRem =
      Opcode == Instruction::UDiv || Opcode == Instruction::URem;

  // Check for a constant divisor.
  bool DivRemConst = false;
  bool DivRemConstPow2 = false;
  if ((SignedDivRem || UnsignedDivRem) && Args.size() == 2) {
    if (const Constant *C = dyn_cast<Constant>(Args[1])) {
      const ConstantInt *CVal =
          (C->getType()->isVectorTy()
               ? dyn_cast_or_null<const ConstantInt>(C->getSplatValue())
               : dyn_cast<const ConstantInt>(C));
      if (CVal && (CVal->getValue().isPowerOf2() ||
                   CVal->getValue().isNegatedPowerOf2()))
        DivRemConstPow2 = true;
      else
        DivRemConst = true;
    }
  }

  if (!Ty->isVectorTy()) {
    // These FP operations are supported with a dedicated instruction for
    // float, double and fp128 (base implementation assumes float generally
    // costs 2).
    if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
        Opcode == Instruction::FMul || Opcode == Instruction::FDiv)
      return 1;

    // There is no native support for FRem.
    if (Opcode == Instruction::FRem)
      return LIBCALL_COST;

    // Give a discount for some combined logical operations if supported.
    if (Args.size() == 2) {
      if (Opcode == Instruction::Xor) {
        for (const Value *A : Args) {
          if (const Instruction *I = dyn_cast<Instruction>(A))
            if (I->hasOneUse() &&
                (I->getOpcode() == Instruction::Or ||
                 I->getOpcode() == Instruction::And ||
                 I->getOpcode() == Instruction::Xor))
              if ((ScalarBits <= 64 && ST->hasMiscellaneousExtensions3()) ||
                  (isInt128InVR(Ty) &&
                   (I->getOpcode() == Instruction::Or ||
                    ST->hasVectorEnhancements1())))
                return 0;
        }
      } else if (Opcode == Instruction::And || Opcode == Instruction::Or) {
        for (const Value *A : Args) {
          if (const Instruction *I = dyn_cast<Instruction>(A))
            if ((I->hasOneUse() && I->getOpcode() == Instruction::Xor) &&
                ((ScalarBits <= 64 && ST->hasMiscellaneousExtensions3()) ||
                 (isInt128InVR(Ty) &&
                  (Opcode == Instruction::And ||
                   ST->hasVectorEnhancements1()))))
              return 0;
        }
      }
    }

    // Or requires one instruction, although it has custom handling for i64.
    if (Opcode == Instruction::Or)
      return 1;

    if (Opcode == Instruction::Xor && ScalarBits == 1) {
      if (ST->hasLoadStoreOnCond2())
        return 5; // 2 * (li 0; loc 1); xor
      return 7; // 2 * ipm sequences ; xor ; shift ; compare
    }

    if (DivRemConstPow2)
      return (SignedDivRem ? SDivPow2Cost : 1);
    if (DivRemConst)
      return DivMulSeqCost;
    if (SignedDivRem || UnsignedDivRem)
      return DivInstrCost;
  } else if (ST->hasVector()) {
    auto *VTy = cast<FixedVectorType>(Ty);
    unsigned VF = VTy->getNumElements();
    unsigned NumVectors = getNumVectorRegs(Ty);

    // These vector operations are custom handled, but are still supported
    // with one instruction per vector, regardless of element size.
    if (Opcode == Instruction::Shl || Opcode == Instruction::LShr ||
        Opcode == Instruction::AShr) {
      return NumVectors;
    }

    if (DivRemConstPow2)
      return (NumVectors * (SignedDivRem ?
                                SDivPow2Cost : 1));
    if (DivRemConst) {
      SmallVector<Type *> Tys(Args.size(), Ty);
      return VF * DivMulSeqCost +
             BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind);
    }
    if (SignedDivRem || UnsignedDivRem) {
      if (ST->hasVectorEnhancements3() && ScalarBits >= 32)
        return NumVectors * DivInstrCost;
      else if (VF > 4)
        // Temporary hack: disable high vectorization factors with integer
        // division/remainder, which will get scalarized and handled with
        // GR128 registers. The mischeduler is not clever enough to avoid
        // spilling yet.
        return 1000;
    }

    // These FP operations are supported with a single vector instruction for
    // double (base implementation assumes float generally costs 2). For
    // FP128, the scalar cost is 1, and there is no overhead since the values
    // are already in scalar registers.
    if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
        Opcode == Instruction::FMul || Opcode == Instruction::FDiv) {
      switch (ScalarBits) {
      case 32: {
        // The vector enhancements facility 1 provides v4f32 instructions.
        if (ST->hasVectorEnhancements1())
          return NumVectors;
        // Return the cost of multiple scalar invocations plus the cost of
        // inserting and extracting the values.
        InstructionCost ScalarCost =
            getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind);
        SmallVector<Type *> Tys(Args.size(), Ty);
        InstructionCost Cost =
            (VF * ScalarCost) +
            BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind);
        // FIXME: VF 2 for these FP operations is currently just as
        // expensive as for VF 4.
        if (VF == 2)
          Cost *= 2;
        return Cost;
      }
      case 64:
      case 128:
        return NumVectors;
      default:
        break;
      }
    }

    // There is no native support for FRem.
    if (Opcode == Instruction::FRem) {
      SmallVector<Type *> Tys(Args.size(), Ty);
      InstructionCost Cost =
          (VF * LIBCALL_COST) +
          BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind);
      // FIXME: VF 2 for float is currently just as expensive as for VF 4.
      if (VF == 2 && ScalarBits == 32)
        Cost *= 2;
      return Cost;
    }
  }

  // Fall back to the default implementation.
  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                       Args, CxtI);
}

InstructionCost
SystemZTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
                               VectorType *SrcTy, ArrayRef<int> Mask,
                               TTI::TargetCostKind CostKind, int Index,
                               VectorType *SubTp, ArrayRef<const Value *> Args,
                               const Instruction *CxtI) const {
  Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
  if (ST->hasVector()) {
    unsigned NumVectors = getNumVectorRegs(SrcTy);

    // TODO: Since fp32 is expanded, the shuffle cost should always be 0.

    // FP128 values are always in scalar registers, so there is no work
    // involved with a shuffle, except for broadcast. In that case register
    // moves are done with a single instruction per element.
    if (SrcTy->getScalarType()->isFP128Ty())
      return (Kind == TargetTransformInfo::SK_Broadcast ? NumVectors - 1 : 0);

    switch (Kind) {
    case TargetTransformInfo::SK_ExtractSubvector:
      // ExtractSubvector Index indicates start offset.

      // Extracting a subvector from the first index is a noop.
      return (Index == 0 ?
                  0 : NumVectors);

    case TargetTransformInfo::SK_Broadcast:
      // Loop vectorizer calls here to figure out the extra cost of
      // broadcasting a loaded value to all elements of a vector. Since vlrep
      // loads and replicates with a single instruction, adjust the returned
      // value.
      return NumVectors - 1;

    default:

      // SystemZ supports single instruction permutation / replication.
      return NumVectors;
    }
  }

  return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
                               SubTp);
}

// Return the log2 difference of the element sizes of the two vector types.
static unsigned getElSizeLog2Diff(Type *Ty0, Type *Ty1) {
  unsigned Bits0 = Ty0->getScalarSizeInBits();
  unsigned Bits1 = Ty1->getScalarSizeInBits();

  if (Bits1 > Bits0)
    return (Log2_32(Bits1) - Log2_32(Bits0));

  return (Log2_32(Bits0) - Log2_32(Bits1));
}

// Return the number of instructions needed to truncate SrcTy to DstTy.
unsigned SystemZTTIImpl::getVectorTruncCost(Type *SrcTy, Type *DstTy) const {
  assert(SrcTy->isVectorTy() && DstTy->isVectorTy());
  assert(SrcTy->getPrimitiveSizeInBits().getFixedValue() >
             DstTy->getPrimitiveSizeInBits().getFixedValue() &&
         "Packing must reduce size of vector type.");
  assert(cast<FixedVectorType>(SrcTy)->getNumElements() ==
             cast<FixedVectorType>(DstTy)->getNumElements() &&
         "Packing should not change number of elements.");

  // TODO: Since fp32 is expanded, the extract cost should always be 0.

  unsigned NumParts = getNumVectorRegs(SrcTy);
  if (NumParts <= 2)
    // Up to 2 vector registers can be truncated efficiently with pack or
    // permute. The latter requires an immediate mask to be loaded, which
    // typically gets hoisted out of a loop. TODO: return a good value for
    // BB-VECTORIZER that includes the immediate loads, which we do not want
    // to count for the loop vectorizer.
    return 1;

  unsigned Cost = 0;
  unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy);
  unsigned VF = cast<FixedVectorType>(SrcTy)->getNumElements();
  for (unsigned P = 0; P < Log2Diff; ++P) {
    if (NumParts > 1)
      NumParts /= 2;
    Cost += NumParts;
  }

  // Currently, isel outputs a general mix of permute and pack instructions
  // that follows the cost computation above, except for this case, which
  // takes one instruction less:
  if (VF == 8 && SrcTy->getScalarSizeInBits() == 64 &&
      DstTy->getScalarSizeInBits() == 8)
    Cost--;

  return Cost;
}

// Return the cost of converting a vector bitmask produced by a compare
// (SrcTy), to the type of the select or extend instruction (DstTy).
unsigned SystemZTTIImpl::getVectorBitmaskConversionCost(Type *SrcTy,
                                                        Type *DstTy) const {
  assert(SrcTy->isVectorTy() && DstTy->isVectorTy() &&
         "Should only be called with vector types.");

  unsigned PackCost = 0;
  unsigned SrcScalarBits = SrcTy->getScalarSizeInBits();
  unsigned DstScalarBits = DstTy->getScalarSizeInBits();
  unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy);
  if (SrcScalarBits > DstScalarBits)
    // The bitmask will be truncated.
    PackCost = getVectorTruncCost(SrcTy, DstTy);
  else if (SrcScalarBits < DstScalarBits) {
    unsigned DstNumParts = getNumVectorRegs(DstTy);
    // Each vector select needs its part of the bitmask unpacked.
    PackCost = Log2Diff * DstNumParts;
    // Extra cost for moving part of mask before unpacking.
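    // (One additional move per destination part beyond the first, hence the
    // DstNumParts - 1 below.)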
    PackCost += DstNumParts - 1;
  }

  return PackCost;
}

// Return the type of the compared operands. This is needed to compute the
// cost for a Select / ZExt or SExt instruction.
static Type *getCmpOpsType(const Instruction *I, unsigned VF = 1) {
  Type *OpTy = nullptr;
  if (CmpInst *CI = dyn_cast<CmpInst>(I->getOperand(0)))
    OpTy = CI->getOperand(0)->getType();
  else if (Instruction *LogicI = dyn_cast<Instruction>(I->getOperand(0)))
    if (LogicI->getNumOperands() == 2)
      if (CmpInst *CI0 = dyn_cast<CmpInst>(LogicI->getOperand(0)))
        if (isa<CmpInst>(LogicI->getOperand(1)))
          OpTy = CI0->getOperand(0)->getType();

  if (OpTy != nullptr) {
    if (VF == 1) {
      assert(!OpTy->isVectorTy() && "Expected scalar type");
      return OpTy;
    }
    // Return the potentially vectorized type based on 'I' and 'VF'. 'I' may
    // be either scalar or already vectorized with the same or a lesser VF.
    Type *ElTy = OpTy->getScalarType();
    return FixedVectorType::get(ElTy, VF);
  }

  return nullptr;
}

// Get the cost of converting a boolean vector to a vector with same width
// and element size as Dst, plus the cost of zero extending if needed.
unsigned
SystemZTTIImpl::getBoolVecToIntConversionCost(unsigned Opcode, Type *Dst,
                                              const Instruction *I) const {
  auto *DstVTy = cast<FixedVectorType>(Dst);
  unsigned VF = DstVTy->getNumElements();
  unsigned Cost = 0;
  // If we know the widths of the compared operands, add any cost of
  // converting them to match Dst. Otherwise assume the widths are the same.
  Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
  if (CmpOpTy != nullptr)
    Cost = getVectorBitmaskConversionCost(CmpOpTy, Dst);
  if (Opcode == Instruction::ZExt || Opcode == Instruction::UIToFP)
    // One 'vn' per dst vector with an immediate mask.
    Cost += getNumVectorRegs(Dst);
  return Cost;
}

InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
                                                 Type *Src,
                                                 TTI::CastContextHint CCH,
                                                 TTI::TargetCostKind CostKind,
                                                 const Instruction *I) const {
  // FIXME: Can the logic below also be used for these cost kinds?
  if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency) {
    auto BaseCost = BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
    return BaseCost == 0 ? BaseCost : 1;
  }

  unsigned DstScalarBits = Dst->getScalarSizeInBits();
  unsigned SrcScalarBits = Src->getScalarSizeInBits();

  if (!Src->isVectorTy()) {
    if (Dst->isVectorTy())
      return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);

    if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP) {
      if (Src->isIntegerTy(128))
        return LIBCALL_COST;
      if (SrcScalarBits >= 32 ||
          (I != nullptr && isa<LoadInst>(I->getOperand(0))))
        return 1;
      return SrcScalarBits > 1 ?
                 2 /*i8/i16 extend*/ : 5 /*branch seq.*/;
    }

    if ((Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI) &&
        Dst->isIntegerTy(128))
      return LIBCALL_COST;

    if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt)) {
      if (Src->isIntegerTy(1)) {
        if (DstScalarBits == 128) {
          if (Opcode == Instruction::SExt && ST->hasVectorEnhancements3())
            return 0; /*VCEQQ*/
          return 5 /*branch seq.*/;
        }

        if (ST->hasLoadStoreOnCond2())
          return 2; // li 0; loc 1

        // This should be an extension of a compare i1 result, which is done
        // with ipm and a varying sequence of instructions.
        unsigned Cost = 0;
        if (Opcode == Instruction::SExt)
          Cost = (DstScalarBits < 64 ? 3 : 4);
        if (Opcode == Instruction::ZExt)
          Cost = 3;
        Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I) : nullptr);
        if (CmpOpTy != nullptr && CmpOpTy->isFloatingPointTy())
          // If operands of an FP type were compared, this costs +1.
          Cost++;
        return Cost;
      } else if (isInt128InVR(Dst)) {
        // Extensions from GPR to i128 (in VR) typically cost two instructions,
        // but a zero-extending load would be just one extra instruction.
        if (Opcode == Instruction::ZExt && I != nullptr)
          if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
            if (Ld->hasOneUse())
              return 1;
        return 2;
      }
    }

    if (Opcode == Instruction::Trunc && isInt128InVR(Src) && I != nullptr) {
      if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
        if (Ld->hasOneUse())
          return 0; // Will be converted to GPR load.
      bool OnlyTruncatingStores = true;
      for (const User *U : I->users())
        if (!isa<StoreInst>(U)) {
          OnlyTruncatingStores = false;
          break;
        }
      if (OnlyTruncatingStores)
        return 0;
      return 2; // Vector element extraction.
    }
  } else if (ST->hasVector()) {
    // Vector to scalar cast.
    auto *SrcVecTy = cast<FixedVectorType>(Src);
    auto *DstVecTy = dyn_cast<FixedVectorType>(Dst);
    if (!DstVecTy) {
      // TODO: tune vector-to-scalar cast.
      return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
    }
    unsigned VF = SrcVecTy->getNumElements();
    unsigned NumDstVectors = getNumVectorRegs(Dst);
    unsigned NumSrcVectors = getNumVectorRegs(Src);

    if (Opcode == Instruction::Trunc) {
      if (Src->getScalarSizeInBits() == Dst->getScalarSizeInBits())
        return 0; // Check for NOOP conversions.
      return getVectorTruncCost(Src, Dst);
    }

    if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
      if (SrcScalarBits >= 8) {
        // ZExt will use either a single unpack or a vector permute.
        if (Opcode == Instruction::ZExt)
          return NumDstVectors;

        // SExt will be handled with one unpack per doubling of width.
        unsigned NumUnpacks = getElSizeLog2Diff(Src, Dst);

        // For types that span multiple vector registers, some additional
        // instructions are used to set up the unpacking.
        unsigned NumSrcVectorOps =
            (NumUnpacks > 1 ?
                 (NumDstVectors - NumSrcVectors) : (NumDstVectors / 2));

        return (NumUnpacks * NumDstVectors) + NumSrcVectorOps;
      } else if (SrcScalarBits == 1)
        return getBoolVecToIntConversionCost(Opcode, Dst, I);
    }

    if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP ||
        Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI) {
      // TODO: Fix base implementation which could simplify things a bit here
      // (seems to miss on differentiating on scalar/vector types).

      // Only 64 bit vector conversions are natively supported before z15.
      if (DstScalarBits == 64 || ST->hasVectorEnhancements2()) {
        if (SrcScalarBits == DstScalarBits)
          return NumDstVectors;

        if (SrcScalarBits == 1)
          return getBoolVecToIntConversionCost(Opcode, Dst, I) + NumDstVectors;
      }

      // Return the cost of multiple scalar invocations plus the cost of
      // inserting and extracting the values. Base implementation does not
      // realize float->int gets scalarized.
      InstructionCost ScalarCost = getCastInstrCost(
          Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind);
      InstructionCost TotCost = VF * ScalarCost;
      bool NeedsInserts = true, NeedsExtracts = true;
      // FP128 registers do not get inserted or extracted.
      if (DstScalarBits == 128 &&
          (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP))
        NeedsInserts = false;
      if (SrcScalarBits == 128 &&
          (Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI))
        NeedsExtracts = false;

      TotCost += BaseT::getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
                                                 NeedsExtracts, CostKind);
      TotCost += BaseT::getScalarizationOverhead(DstVecTy, NeedsInserts,
                                                 /*Extract*/ false, CostKind);

      // FIXME: VF 2 for float<->i32 is currently just as expensive as for VF 4.
      if (VF == 2 && SrcScalarBits == 32 && DstScalarBits == 32)
        TotCost *= 2;

      return TotCost;
    }

    if (Opcode == Instruction::FPTrunc) {
      if (SrcScalarBits == 128) // fp128 -> double/float + inserts of elements.
        return VF /*ldxbr/lexbr*/ +
               BaseT::getScalarizationOverhead(DstVecTy, /*Insert*/ true,
                                               /*Extract*/ false, CostKind);
      else // double -> float
        return VF / 2 /*vledb*/ + std::max(1U, VF / 4 /*vperm*/);
    }

    if (Opcode == Instruction::FPExt) {
      if (SrcScalarBits == 32 && DstScalarBits == 64) {
        // float -> double is very rare and currently unoptimized. Instead of
        // using vldeb, which can do two at a time, all conversions are
        // scalarized.
        return VF * 2;
      }
      // -> fp128. VF * lxdb/lxeb + extraction of elements.
      return VF + BaseT::getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
                                                  /*Extract*/ true, CostKind);
    }
  }

  return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
}

// Scalar i8 / i16 operations will typically be made after first extending
// the operands to i32.
static unsigned getOperandsExtensionCost(const Instruction *I) {
  unsigned ExtCost = 0;
  for (Value *Op : I->operands())
    // A load of i8 or i16 sign/zero extends to i32.
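    // Likewise, a constant operand can be used directly as an immediate, so
    // only other kinds of operands need an explicit extension instruction.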
    if (!isa<LoadInst>(Op) && !isa<ConstantInt>(Op))
      ExtCost++;

  return ExtCost;
}

InstructionCost SystemZTTIImpl::getCmpSelInstrCost(
    unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
    TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
    TTI::OperandValueInfo Op2Info, const Instruction *I) const {
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                     Op1Info, Op2Info);

  if (!ValTy->isVectorTy()) {
    switch (Opcode) {
    case Instruction::ICmp: {
      // A loaded value compared with 0 with multiple users becomes Load and
      // Test. The load is then not foldable, so return 0 cost for the ICmp.
      unsigned ScalarBits = ValTy->getScalarSizeInBits();
      if (I != nullptr && (ScalarBits == 32 || ScalarBits == 64))
        if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
          if (const ConstantInt *C = dyn_cast<ConstantInt>(I->getOperand(1)))
            if (!Ld->hasOneUse() && Ld->getParent() == I->getParent() &&
                C->isZero())
              return 0;

      unsigned Cost = 1;
      if (ValTy->isIntegerTy() && ValTy->getScalarSizeInBits() <= 16)
        Cost += (I != nullptr ? getOperandsExtensionCost(I) : 2);
      return Cost;
    }
    case Instruction::Select:
      if (ValTy->isFloatingPointTy())
        return 4; // No LOC for FP - costs a conditional jump.

      // When selecting based on an i128 comparison, LOC / VSEL is possible
      // if i128 comparisons are directly supported.
      if (I != nullptr)
        if (ICmpInst *CI = dyn_cast<ICmpInst>(I->getOperand(0)))
          if (CI->getOperand(0)->getType()->isIntegerTy(128))
            return ST->hasVectorEnhancements3() ? 1 : 4;

      // Load On Condition / Select Register available, except for i128.
      return !isInt128InVR(ValTy) ? 1 : 4;
    }
  } else if (ST->hasVector()) {
    unsigned VF = cast<FixedVectorType>(ValTy)->getNumElements();

    // Called with a compare instruction.
    if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
      unsigned PredicateExtraCost = 0;
      if (I != nullptr) {
        // Some predicates cost one or two extra instructions.
        switch (cast<CmpInst>(I)->getPredicate()) {
        case CmpInst::Predicate::ICMP_NE:
        case CmpInst::Predicate::ICMP_UGE:
        case CmpInst::Predicate::ICMP_ULE:
        case CmpInst::Predicate::ICMP_SGE:
        case CmpInst::Predicate::ICMP_SLE:
          PredicateExtraCost = 1;
          break;
        case CmpInst::Predicate::FCMP_ONE:
        case CmpInst::Predicate::FCMP_ORD:
        case CmpInst::Predicate::FCMP_UEQ:
        case CmpInst::Predicate::FCMP_UNO:
          PredicateExtraCost = 2;
          break;
        default:
          break;
        }
      }

      // Float is handled with 2*vmr[lh]f + 2*vldeb + vfchdb for each pair of
      // floats. FIXME: <2 x float> generates the same code as <4 x float>.
      unsigned CmpCostPerVector = (ValTy->getScalarType()->isFloatTy() ? 10 : 1);
      unsigned NumVecs_cmp = getNumVectorRegs(ValTy);

      unsigned Cost = (NumVecs_cmp * (CmpCostPerVector + PredicateExtraCost));
      return Cost;
    } else { // Called with a select instruction.
      assert(Opcode == Instruction::Select);

      // We can figure out the extra cost of packing / unpacking if the
      // instruction was passed and the compare instruction is found.
      unsigned PackCost = 0;
      Type *CmpOpTy = ((I != nullptr) ?
                           getCmpOpsType(I, VF) : nullptr);
      if (CmpOpTy != nullptr)
        PackCost =
            getVectorBitmaskConversionCost(CmpOpTy, ValTy);

      return getNumVectorRegs(ValTy) /*vsel*/ + PackCost;
    }
  }

  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                   Op1Info, Op2Info);
}

InstructionCost SystemZTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
                                                   TTI::TargetCostKind CostKind,
                                                   unsigned Index,
                                                   const Value *Op0,
                                                   const Value *Op1) const {
  if (Opcode == Instruction::InsertElement) {
    // Vector Element Load.
    if (Op1 != nullptr && isFreeEltLoad(Op1))
      return 0;

    // vlvgp will insert two GPRs into a vector register, so count half the
    // number of instructions as an estimate when we don't have the full
    // picture (as in getScalarizationOverhead()).
    if (Val->isIntOrIntVectorTy(64))
      return ((Index % 2 == 0) ? 1 : 0);
  }

  if (Opcode == Instruction::ExtractElement) {
    int Cost = ((getScalarSizeInBits(Val) == 1) ? 2 /*+test-under-mask*/ : 1);

    // Give a slight penalty for moving out of vector pipeline to FXU unit.
    if (Index == 0 && Val->isIntOrIntVectorTy())
      Cost += 1;

    return Cost;
  }

  return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
}

// Check if a load may be folded as a memory operand in its user.
bool SystemZTTIImpl::isFoldableLoad(const LoadInst *Ld,
                                    const Instruction *&FoldedValue) const {
  if (!Ld->hasOneUse())
    return false;
  FoldedValue = Ld;
  const Instruction *UserI = cast<Instruction>(*Ld->user_begin());
  unsigned LoadedBits = getScalarSizeInBits(Ld->getType());
  unsigned TruncBits = 0;
  unsigned SExtBits = 0;
  unsigned ZExtBits = 0;
  if (UserI->hasOneUse()) {
    unsigned UserBits = UserI->getType()->getScalarSizeInBits();
    if (isa<TruncInst>(UserI))
      TruncBits = UserBits;
    else if (isa<SExtInst>(UserI))
      SExtBits = UserBits;
    else if (isa<ZExtInst>(UserI))
      ZExtBits = UserBits;
  }
  if (TruncBits || SExtBits || ZExtBits) {
    FoldedValue = UserI;
    UserI = cast<Instruction>(*UserI->user_begin());
    // Load (single use) -> trunc/extend (single use) -> UserI
  }
  if ((UserI->getOpcode() == Instruction::Sub ||
       UserI->getOpcode() == Instruction::SDiv ||
       UserI->getOpcode() == Instruction::UDiv) &&
      UserI->getOperand(1) != FoldedValue)
    return false; // Not commutative, only RHS foldable.
  // LoadOrTruncBits holds the number of effectively loaded bits, but 0 if an
  // extension was made of the load.
  unsigned LoadOrTruncBits =
      ((SExtBits || ZExtBits) ? 0 : (TruncBits ? TruncBits : LoadedBits));
  switch (UserI->getOpcode()) {
  case Instruction::Add: // SE: 16->32, 16/32->64, z14:16->64.
                         // ZE: 32->64
  case Instruction::Sub:
  case Instruction::ICmp:
    if (LoadedBits == 32 && ZExtBits == 64)
      return true;
    [[fallthrough]];
  case Instruction::Mul: // SE: 16->32, 32->64, z14:16->64
    if (UserI->getOpcode() != Instruction::ICmp) {
      if (LoadedBits == 16 &&
          (SExtBits == 32 ||
           (SExtBits == 64 && ST->hasMiscellaneousExtensions2())))
        return true;
      if (LoadOrTruncBits == 16)
        return true;
    }
    [[fallthrough]];
  case Instruction::SDiv: // SE: 32->64
    if (LoadedBits == 32 && SExtBits == 64)
      return true;
    [[fallthrough]];
  case Instruction::UDiv:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
    // This also makes sense for float operations, but disabled for now due
    // to regressions.
    // case Instruction::FCmp:
    // case Instruction::FAdd:
    // case Instruction::FSub:
    // case Instruction::FMul:
    // case Instruction::FDiv:

    // All possible extensions of memory checked above.

    // Comparison between memory and immediate.
    if (UserI->getOpcode() == Instruction::ICmp)
      if (ConstantInt *CI = dyn_cast<ConstantInt>(UserI->getOperand(1)))
        if (CI->getValue().isIntN(16))
          return true;
    return (LoadOrTruncBits == 32 || LoadOrTruncBits == 64);
    break;
  }
  return false;
}

static bool isBswapIntrinsicCall(const Value *V) {
  if (const Instruction *I = dyn_cast<Instruction>(V))
    if (auto *CI = dyn_cast<CallInst>(I))
      if (auto *F = CI->getCalledFunction())
        if (F->getIntrinsicID() == Intrinsic::bswap)
          return true;
  return false;
}

InstructionCost SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                                Align Alignment,
                                                unsigned AddressSpace,
                                                TTI::TargetCostKind CostKind,
                                                TTI::OperandValueInfo OpInfo,
                                                const Instruction *I) const {
  assert(!Src->isVoidTy() && "Invalid type");

  // TODO: Handle other cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return 1;

  if (!Src->isVectorTy() && Opcode == Instruction::Load && I != nullptr) {
    // Store the load or its truncated or extended value in FoldedValue.
    const Instruction *FoldedValue = nullptr;
    if (isFoldableLoad(cast<LoadInst>(I), FoldedValue)) {
      const Instruction *UserI = cast<Instruction>(*FoldedValue->user_begin());
      assert(UserI->getNumOperands() == 2 && "Expected a binop.");

      // UserI can't fold two loads, so in that case return 0 cost only
      // half of the time.
      for (unsigned i = 0; i < 2; ++i) {
        if (UserI->getOperand(i) == FoldedValue)
          continue;

        if (Instruction *OtherOp = dyn_cast<Instruction>(UserI->getOperand(i))) {
          LoadInst *OtherLoad = dyn_cast<LoadInst>(OtherOp);
          if (!OtherLoad &&
              (isa<TruncInst>(OtherOp) || isa<SExtInst>(OtherOp) ||
               isa<ZExtInst>(OtherOp)))
            OtherLoad = dyn_cast<LoadInst>(OtherOp->getOperand(0));
          if (OtherLoad && isFoldableLoad(OtherLoad, FoldedValue /*dummy*/))
            return i == 0; // Both operands foldable.
        }
      }

      return 0; // Only I is foldable in user.
    }
  }

  // Type legalization (via getNumberOfParts) can't handle structs.
  if (TLI->getValueType(DL, Src, true) == MVT::Other)
    return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                  CostKind);

  // FP128 is a legal type but kept in a register pair on older CPUs.
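  // Such an access therefore needs two load/store instructions, one per
  // 64-bit half, which is what the cost of 2 below reflects.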
  if (Src->isFP128Ty() && !ST->hasVectorEnhancements1())
    return 2;

  unsigned NumOps =
      (Src->isVectorTy() ? getNumVectorRegs(Src) : getNumberOfParts(Src));

  // Store/Load reversed saves one instruction.
  if (((!Src->isVectorTy() && NumOps == 1) || ST->hasVectorEnhancements2()) &&
      I != nullptr) {
    if (Opcode == Instruction::Load && I->hasOneUse()) {
      const Instruction *LdUser = cast<Instruction>(*I->user_begin());
      // In case of load -> bswap -> store, return normal cost for the load.
      if (isBswapIntrinsicCall(LdUser) &&
          (!LdUser->hasOneUse() || !isa<StoreInst>(*LdUser->user_begin())))
        return 0;
    } else if (const StoreInst *SI = dyn_cast<StoreInst>(I)) {
      const Value *StoredVal = SI->getValueOperand();
      if (StoredVal->hasOneUse() && isBswapIntrinsicCall(StoredVal))
        return 0;
    }
  }

  return NumOps;
}

// The generic implementation of getInterleavedMemoryOpCost() is based on
// adding costs of the memory operations plus all the extracts and inserts
// needed for using / defining the vector operands. The SystemZ version does
// roughly the same but bases the computations on vector permutations
// instead.
InstructionCost SystemZTTIImpl::getInterleavedMemoryOpCost(
    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
    Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
    bool UseMaskForCond, bool UseMaskForGaps) const {
  if (UseMaskForCond || UseMaskForGaps)
    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                             Alignment, AddressSpace, CostKind,
                                             UseMaskForCond, UseMaskForGaps);
  assert(isa<VectorType>(VecTy) &&
         "Expect a vector type for interleaved memory op");

  unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
  assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
  unsigned VF = NumElts / Factor;
  unsigned NumEltsPerVecReg = (128U / getScalarSizeInBits(VecTy));
  unsigned NumVectorMemOps = getNumVectorRegs(VecTy);
  unsigned NumPermutes = 0;

  if (Opcode == Instruction::Load) {
    // Loading interleave groups may have gaps, which may mean fewer
    // loads. Find out how many vectors will be loaded in total, and in how
    // many of them each value will be in.
    BitVector UsedInsts(NumVectorMemOps, false);
    std::vector<BitVector> ValueVecs(Factor, BitVector(NumVectorMemOps, false));
    for (unsigned Index : Indices)
      for (unsigned Elt = 0; Elt < VF; ++Elt) {
        unsigned Vec = (Index + Elt * Factor) / NumEltsPerVecReg;
        UsedInsts.set(Vec);
        ValueVecs[Index].set(Vec);
      }
    NumVectorMemOps = UsedInsts.count();

    for (unsigned Index : Indices) {
      // Estimate that each loaded source vector containing this Index
      // requires one operation, except that vperm can handle two input
      // registers first time for each dst vector.
      unsigned NumSrcVecs = ValueVecs[Index].count();
      unsigned NumDstVecs = divideCeil(VF * getScalarSizeInBits(VecTy), 128U);
      assert(NumSrcVecs >= NumDstVecs && "Expected at least as many sources");
      NumPermutes += std::max(1U, NumSrcVecs - NumDstVecs);
    }
  } else {
    // Estimate the permutes for each stored vector as the smaller of the
    // number of elements and the number of source vectors. Subtract one per
    // dst vector for vperm (see above).
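    // For example, storing a factor-4 group of <4 x i32> (a <16 x i32> in
    // memory) is estimated here as 4 vector stores plus 4 * 4 - 4 = 12
    // permutes.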
    unsigned NumSrcVecs = std::min(NumEltsPerVecReg, Factor);
    unsigned NumDstVecs = NumVectorMemOps;
    NumPermutes += (NumDstVecs * NumSrcVecs) - NumDstVecs;
  }

  // Cost of load/store operations and the permutations needed.
  return NumVectorMemOps + NumPermutes;
}

InstructionCost getIntAddReductionCost(unsigned NumVec, unsigned ScalarBits) {
  InstructionCost Cost = 0;
  // Binary Tree of N/2 + N/4 + ... operations yields N - 1 operations total.
  Cost += NumVec - 1;
  // For integer adds, VSUM creates shorter reductions on the final vector.
  Cost += (ScalarBits < 32) ? 3 : 2;
  return Cost;
}

InstructionCost getFastReductionCost(unsigned NumVec, unsigned NumElems,
                                     unsigned ScalarBits) {
  unsigned NumEltsPerVecReg = (SystemZ::VectorBits / ScalarBits);
  InstructionCost Cost = 0;
  // Binary Tree of N/2 + N/4 + ... operations yields N - 1 operations total.
  Cost += NumVec - 1;
  // For each shuffle / arithmetic layer, we need 2 instructions, and we need
  // log2(Elements in Last Vector) layers.
  Cost += 2 * Log2_32_Ceil(std::min(NumElems, NumEltsPerVecReg));
  return Cost;
}

inline bool customCostReductions(unsigned Opcode) {
  return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
         Opcode == Instruction::Add || Opcode == Instruction::Mul;
}

InstructionCost
SystemZTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
                                           std::optional<FastMathFlags> FMF,
                                           TTI::TargetCostKind CostKind) const {
  unsigned ScalarBits = Ty->getScalarSizeInBits();
  // The following is only for subtargets with vector math, non-ordered
  // reductions, and reasonable scalar sizes for int and fp add/mul.
  if (customCostReductions(Opcode) && ST->hasVector() &&
      !TTI::requiresOrderedReduction(FMF) &&
      ScalarBits <= SystemZ::VectorBits) {
    unsigned NumVectors = getNumVectorRegs(Ty);
    unsigned NumElems = ((FixedVectorType *)Ty)->getNumElements();
    // Integer Add uses custom code gen, which needs to be accounted for.
    if (Opcode == Instruction::Add)
      return getIntAddReductionCost(NumVectors, ScalarBits);
    // The base cost is the same across all other arithmetic instructions.
    InstructionCost Cost =
        getFastReductionCost(NumVectors, NumElems, ScalarBits);
    // But we need to account for the final op involving the scalar operand.
    if ((Opcode == Instruction::FAdd) || (Opcode == Instruction::FMul))
      Cost += 1;
    return Cost;
  }
  // Otherwise, fall back to the standard implementation.
  return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
}

InstructionCost
SystemZTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
                                       FastMathFlags FMF,
                                       TTI::TargetCostKind CostKind) const {
  // Return custom costs only on subtargets with vector enhancements.
  if (ST->hasVectorEnhancements1()) {
    unsigned NumVectors = getNumVectorRegs(Ty);
    unsigned NumElems = ((FixedVectorType *)Ty)->getNumElements();
    unsigned ScalarBits = Ty->getScalarSizeInBits();
    InstructionCost Cost = 0;
    // Binary Tree of N/2 + N/4 + ... operations yields N - 1 operations total.
    Cost += NumVectors - 1;
    // For the final vector, we need shuffle + min/max operations, and
    // we need #Elements - 1 of them.
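    // For example, a <4 x i32> min/max reduction held in one vector register
    // costs 2 * (4 - 1) = 6 with this formula.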
    Cost += 2 * (std::min(NumElems, SystemZ::VectorBits / ScalarBits) - 1);
    return Cost;
  }
  // For other subtargets, fall back to the standard implementation.
  return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
}

static int
getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
                            const SmallVectorImpl<Type *> &ParamTys) {
  if (RetTy->isVectorTy() && ID == Intrinsic::bswap)
    return getNumVectorRegs(RetTy); // VPERM

  return -1;
}

InstructionCost
SystemZTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                      TTI::TargetCostKind CostKind) const {
  InstructionCost Cost = getVectorIntrinsicInstrCost(
      ICA.getID(), ICA.getReturnType(), ICA.getArgTypes());
  if (Cost != -1)
    return Cost;
  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}

bool SystemZTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const {
  // Always expand on subtargets without vector instructions.
  if (!ST->hasVector())
    return true;

  // Whether or not to expand is a per-intrinsic decision.
  switch (II->getIntrinsicID()) {
  default:
    return true;
  // Do not expand vector.reduce.add...
  case Intrinsic::vector_reduce_add:
    auto *VType = cast<FixedVectorType>(II->getOperand(0)->getType());
    // ...unless the scalar size is i64 or larger,
    // or the operand vector is not full, since the
    // performance benefit is dubious in those cases.
    return VType->getScalarSizeInBits() >= 64 ||
           VType->getPrimitiveSizeInBits() < SystemZ::VectorBits;
  }
}