//===-- SystemZTargetTransformInfo.cpp - SystemZ-specific TTI -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a TargetTransformInfo analysis pass specific to the
// SystemZ target machine. It uses the target's detailed information to provide
// more precise answers to certain TTI queries, while letting the target
// independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//

#include "SystemZTargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/Debug.h"
using namespace llvm;

#define DEBUG_TYPE "systemztti"

//===----------------------------------------------------------------------===//
//
// SystemZ cost model.
//
//===----------------------------------------------------------------------===//

static bool isUsedAsMemCpySource(const Value *V, bool &OtherUse) {
  bool UsedAsMemCpySource = false;
  for (const User *U : V->users())
    if (const Instruction *User = dyn_cast<Instruction>(U)) {
      if (isa<BitCastInst>(User) || isa<GetElementPtrInst>(User)) {
        UsedAsMemCpySource |= isUsedAsMemCpySource(User, OtherUse);
        continue;
      }
      if (const MemCpyInst *Memcpy = dyn_cast<MemCpyInst>(User)) {
        if (Memcpy->getOperand(1) == V && !Memcpy->isVolatile()) {
          UsedAsMemCpySource = true;
          continue;
        }
      }
      OtherUse = true;
    }
  return UsedAsMemCpySource;
}

unsigned SystemZTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
  unsigned Bonus = 0;

  // Increase the threshold if an incoming argument is used only as a memcpy
  // source.
  if (Function *Callee = CB->getCalledFunction())
    for (Argument &Arg : Callee->args()) {
      bool OtherUse = false;
      if (isUsedAsMemCpySource(&Arg, OtherUse) && !OtherUse)
        Bonus += 150;
    }

  LLVM_DEBUG(if (Bonus)
               dbgs() << "++ SZTTI Adding inlining bonus: " << Bonus << "\n";);
  return Bonus;
}

InstructionCost SystemZTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                              TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;
  // There is no cost model yet for operations on integers wider than 64 bits.
  if (BitSize > 64)
    return TTI::TCC_Free;

  if (Imm == 0)
    return TTI::TCC_Free;

  if (Imm.getBitWidth() <= 64) {
    // Constants loaded via lgfi.
    if (isInt<32>(Imm.getSExtValue()))
      return TTI::TCC_Basic;
    // Constants loaded via llilf.
    if (isUInt<32>(Imm.getZExtValue()))
      return TTI::TCC_Basic;
    // Constants loaded via llihf:
    if ((Imm.getZExtValue() & 0xffffffff) == 0)
      return TTI::TCC_Basic;

    return 2 * TTI::TCC_Basic;
  }

  return 4 * TTI::TCC_Basic;
}

InstructionCost SystemZTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                                  const APInt &Imm, Type *Ty,
                                                  TTI::TargetCostKind CostKind,
                                                  Instruction *Inst) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;
  // There is no cost model yet for operations on integers wider than 64 bits.
  if (BitSize > 64)
    return TTI::TCC_Free;

  switch (Opcode) {
  default:
    return TTI::TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr. This prevents the
    // creation of new constants for every base constant that gets constant
    // folded with the offset.
    if (Idx == 0)
      return 2 * TTI::TCC_Basic;
    return TTI::TCC_Free;
  case Instruction::Store:
    if (Idx == 0 && Imm.getBitWidth() <= 64) {
      // Any 8-bit immediate store can be implemented via mvi.
      if (BitSize == 8)
        return TTI::TCC_Free;
      // 16-bit immediate values can be stored via mvhhi/mvhi/mvghi.
      if (isInt<16>(Imm.getSExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Instruction::ICmp:
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      // Comparisons against signed 32-bit immediates implemented via cgfi.
      if (isInt<32>(Imm.getSExtValue()))
        return TTI::TCC_Free;
      // Comparisons against unsigned 32-bit immediates implemented via clgfi.
      if (isUInt<32>(Imm.getZExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Instruction::Add:
  case Instruction::Sub:
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      // We use algfi/slgfi to add/subtract 32-bit unsigned immediates.
      if (isUInt<32>(Imm.getZExtValue()))
        return TTI::TCC_Free;
      // Or their negation, by swapping addition vs. subtraction.
      if (isUInt<32>(-Imm.getSExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Instruction::Mul:
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      // We use msgfi to multiply by 32-bit signed immediates.
      if (isInt<32>(Imm.getSExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Instruction::Or:
  case Instruction::Xor:
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      // Masks supported by oilf/xilf.
      if (isUInt<32>(Imm.getZExtValue()))
        return TTI::TCC_Free;
      // Masks supported by oihf/xihf.
      if ((Imm.getZExtValue() & 0xffffffff) == 0)
        return TTI::TCC_Free;
    }
    break;
  case Instruction::And:
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      // Any 32-bit AND operation can be implemented via nilf.
      if (BitSize <= 32)
        return TTI::TCC_Free;
      // 64-bit masks supported by nilf.
      if (isUInt<32>(~Imm.getZExtValue()))
        return TTI::TCC_Free;
      // 64-bit masks supported by nihf.
      if ((Imm.getZExtValue() & 0xffffffff) == 0xffffffff)
        return TTI::TCC_Free;
      // Some 64-bit AND operations can be implemented via risbg.
      const SystemZInstrInfo *TII = ST->getInstrInfo();
      unsigned Start, End;
      if (TII->isRxSBGMask(Imm.getZExtValue(), BitSize, Start, End))
        return TTI::TCC_Free;
    }
    break;
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    // Always return TCC_Free for the shift value of a shift instruction.
    if (Idx == 1)
      return TTI::TCC_Free;
    break;
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::Trunc:
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::IntToPtr:
  case Instruction::PtrToInt:
  case Instruction::BitCast:
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Select:
  case Instruction::Ret:
  case Instruction::Load:
    break;
  }

  return SystemZTTIImpl::getIntImmCost(Imm, Ty, CostKind);
}

InstructionCost
SystemZTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
                                    const APInt &Imm, Type *Ty,
                                    TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;
  // There is no cost model yet for operations on integers wider than 64 bits.
  if (BitSize > 64)
    return TTI::TCC_Free;

  switch (IID) {
  default:
    return TTI::TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
    // These get expanded to include a normal addition/subtraction.
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      if (isUInt<32>(Imm.getZExtValue()))
        return TTI::TCC_Free;
      if (isUInt<32>(-Imm.getSExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow:
    // These get expanded to include a normal multiplication.
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      if (isInt<32>(Imm.getSExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint_i64:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  }
  return SystemZTTIImpl::getIntImmCost(Imm, Ty, CostKind);
}

TargetTransformInfo::PopcntSupportKind
SystemZTTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Type width must be power of 2");
  if (ST->hasPopulationCount() && TyWidth <= 64)
    return TTI::PSK_FastHardware;
  return TTI::PSK_Software;
}

void SystemZTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                             TTI::UnrollingPreferences &UP,
                                             OptimizationRemarkEmitter *ORE) {
  // Find out if L contains a call, what the machine instruction count
  // estimate is, and how many stores there are.
  bool HasCall = false;
  InstructionCost NumStores = 0;
  for (auto &BB : L->blocks())
    for (auto &I : *BB) {
      if (isa<CallInst>(&I) || isa<InvokeInst>(&I)) {
        if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
          if (isLoweredToCall(F))
            HasCall = true;
          if (F->getIntrinsicID() == Intrinsic::memcpy ||
              F->getIntrinsicID() == Intrinsic::memset)
            NumStores++;
        } else { // indirect call.
          HasCall = true;
        }
      }
      if (isa<StoreInst>(&I)) {
        Type *MemAccessTy = I.getOperand(0)->getType();
        NumStores += getMemoryOpCost(Instruction::Store, MemAccessTy, None, 0,
                                     TTI::TCK_RecipThroughput);
      }
    }

  // The z13 processor will run out of store tags if too many stores
  // are fed into it too quickly. Therefore make sure there are not
  // too many stores in the resulting unrolled loop.
  unsigned const NumStoresVal = *NumStores.getValue();
  unsigned const Max = (NumStoresVal ? (12 / NumStoresVal) : UINT_MAX);

  if (HasCall) {
    // If the loop contains calls, only allow full unrolling (up to Max);
    // disable partial and runtime unrolling.
    UP.FullUnrollMaxCount = Max;
    UP.MaxCount = 1;
    return;
  }

  UP.MaxCount = Max;
  if (UP.MaxCount <= 1)
    return;

  // Allow partial and runtime trip count unrolling.
  UP.Partial = UP.Runtime = true;

  UP.PartialThreshold = 75;
  UP.DefaultUnrollRuntimeCount = 4;

  // Allow expensive instructions in the pre-header of the loop.
  UP.AllowExpensiveTripCount = true;

  UP.Force = true;
}

void SystemZTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                           TTI::PeelingPreferences &PP) {
  BaseT::getPeelingPreferences(L, SE, PP);
}

bool SystemZTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
                                   const TargetTransformInfo::LSRCost &C2) {
  // SystemZ specific: check instruction count (first), and don't care about
  // ImmCost, since offsets are checked explicitly.
  return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
                  C1.NumIVMuls, C1.NumBaseAdds,
                  C1.ScaleCost, C1.SetupCost) <
         std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
                  C2.NumIVMuls, C2.NumBaseAdds,
                  C2.ScaleCost, C2.SetupCost);
}

unsigned SystemZTTIImpl::getNumberOfRegisters(unsigned ClassID) const {
  bool Vector = (ClassID == 1);
  if (!Vector)
    // Discount the stack pointer. Also leave out %r0, since it can't
    // be used in an address.
    return 14;
  if (ST->hasVector())
    return 32;
  return 0;
}

TypeSize
SystemZTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(64);
  case TargetTransformInfo::RGK_FixedWidthVector:
    return TypeSize::getFixed(ST->hasVector() ? 128 : 0);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(0);
  }

  llvm_unreachable("Unsupported register kind");
}

unsigned SystemZTTIImpl::getMinPrefetchStride(unsigned NumMemAccesses,
                                              unsigned NumStridedMemAccesses,
                                              unsigned NumPrefetches,
                                              bool HasCall) const {
  // Don't prefetch a loop with many far apart accesses.
  if (NumPrefetches > 16)
    return UINT_MAX;

  // Emit prefetch instructions for smaller strides in cases where we think
  // the hardware prefetcher might not be able to keep up.
  if (NumStridedMemAccesses > 32 && !HasCall &&
      (NumMemAccesses - NumStridedMemAccesses) * 32 <= NumStridedMemAccesses)
    return 1;

  return ST->hasMiscellaneousExtensions3() ? 8192 : 2048;
}

bool SystemZTTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
  EVT VT = TLI->getValueType(DL, DataType);
  return (VT.isScalarInteger() && TLI->isTypeLegal(VT));
}

// Return the bit size for the scalar type or vector element
// type. getScalarSizeInBits() returns 0 for a pointer type.
static unsigned getScalarSizeInBits(Type *Ty) {
  unsigned Size =
    (Ty->isPtrOrPtrVectorTy() ? 64U : Ty->getScalarSizeInBits());
  assert(Size > 0 && "Element must have non-zero size.");
  return Size;
}

// getNumberOfParts() calls getTypeLegalizationCost() which splits the vector
// type until it is legal. This would e.g. return 4 for <6 x i64>, instead of
// 3.
static unsigned getNumVectorRegs(Type *Ty) {
  auto *VTy = cast<FixedVectorType>(Ty);
  unsigned WideBits = getScalarSizeInBits(Ty) * VTy->getNumElements();
  assert(WideBits > 0 && "Could not compute size of vector");
  return ((WideBits % 128U) ? ((WideBits / 128U) + 1) : (WideBits / 128U));
}

InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
    TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
    const Instruction *CxtI) {

  // TODO: Handle more cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
                                         Op2Info, Opd1PropInfo,
                                         Opd2PropInfo, Args, CxtI);

  // TODO: return a good value for BB-VECTORIZER that includes the
  // immediate loads, which we do not want to count for the loop
  // vectorizer, since they are hopefully hoisted out of the loop. This
  // would require a new parameter 'InLoop', but not sure if constant
  // args are common enough to motivate this.

  unsigned ScalarBits = Ty->getScalarSizeInBits();

  // There are three cases of division and remainder: Dividing with a register
  // needs a divide instruction. A divisor which is a power of two constant
  // can be implemented with a sequence of shifts. Any other constant needs a
  // multiply and shifts.
  const unsigned DivInstrCost = 20;
  const unsigned DivMulSeqCost = 10;
  const unsigned SDivPow2Cost = 4;

  bool SignedDivRem =
      Opcode == Instruction::SDiv || Opcode == Instruction::SRem;
  bool UnsignedDivRem =
      Opcode == Instruction::UDiv || Opcode == Instruction::URem;

  // Check for a constant divisor.
  bool DivRemConst = false;
  bool DivRemConstPow2 = false;
  if ((SignedDivRem || UnsignedDivRem) && Args.size() == 2) {
    if (const Constant *C = dyn_cast<Constant>(Args[1])) {
      const ConstantInt *CVal =
          (C->getType()->isVectorTy()
               ? dyn_cast_or_null<const ConstantInt>(C->getSplatValue())
               : dyn_cast<const ConstantInt>(C));
      if (CVal && (CVal->getValue().isPowerOf2() ||
                   CVal->getValue().isNegatedPowerOf2()))
        DivRemConstPow2 = true;
      else
        DivRemConst = true;
    }
  }

  if (!Ty->isVectorTy()) {
    // These FP operations are supported with a dedicated instruction for
    // float, double and fp128 (base implementation assumes float generally
    // costs 2).
    if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
        Opcode == Instruction::FMul || Opcode == Instruction::FDiv)
      return 1;

    // There is no native support for FRem.
    if (Opcode == Instruction::FRem)
      return LIBCALL_COST;

    // Give discount for some combined logical operations if supported.
    if (Args.size() == 2 && ST->hasMiscellaneousExtensions3()) {
      if (Opcode == Instruction::Xor) {
        for (const Value *A : Args) {
          if (const Instruction *I = dyn_cast<Instruction>(A))
            if (I->hasOneUse() &&
                (I->getOpcode() == Instruction::And ||
                 I->getOpcode() == Instruction::Or ||
                 I->getOpcode() == Instruction::Xor))
              return 0;
        }
      }
      else if (Opcode == Instruction::Or || Opcode == Instruction::And) {
        for (const Value *A : Args) {
          if (const Instruction *I = dyn_cast<Instruction>(A))
            if (I->hasOneUse() && I->getOpcode() == Instruction::Xor)
              return 0;
        }
      }
    }

    // Or requires one instruction, although it has custom handling for i64.
    if (Opcode == Instruction::Or)
      return 1;

    if (Opcode == Instruction::Xor && ScalarBits == 1) {
      if (ST->hasLoadStoreOnCond2())
        return 5; // 2 * (li 0; loc 1); xor
      return 7; // 2 * ipm sequences ; xor ; shift ; compare
    }

    if (DivRemConstPow2)
      return (SignedDivRem ? SDivPow2Cost : 1);
    if (DivRemConst)
      return DivMulSeqCost;
    if (SignedDivRem || UnsignedDivRem)
      return DivInstrCost;
  }
  else if (ST->hasVector()) {
    auto *VTy = cast<FixedVectorType>(Ty);
    unsigned VF = VTy->getNumElements();
    unsigned NumVectors = getNumVectorRegs(Ty);

    // These vector operations are custom handled, but are still supported
    // with one instruction per vector, regardless of element size.
    if (Opcode == Instruction::Shl || Opcode == Instruction::LShr ||
        Opcode == Instruction::AShr) {
      return NumVectors;
    }

    if (DivRemConstPow2)
      return (NumVectors * (SignedDivRem ? SDivPow2Cost : 1));
    if (DivRemConst) {
      SmallVector<Type *> Tys(Args.size(), Ty);
      return VF * DivMulSeqCost + getScalarizationOverhead(VTy, Args, Tys);
    }
    if ((SignedDivRem || UnsignedDivRem) && VF > 4)
      // Temporary hack: disable high vectorization factors with integer
      // division/remainder, which will get scalarized and handled with
      // GR128 registers. The mischeduler is not clever enough to avoid
      // spilling yet.
      return 1000;

    // These FP operations are supported with a single vector instruction for
    // double (base implementation assumes float generally costs 2). For
    // FP128, the scalar cost is 1, and there is no overhead since the values
    // are already in scalar registers.
    if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
        Opcode == Instruction::FMul || Opcode == Instruction::FDiv) {
      switch (ScalarBits) {
      case 32: {
        // The vector enhancements facility 1 provides v4f32 instructions.
        if (ST->hasVectorEnhancements1())
          return NumVectors;
        // Return the cost of multiple scalar invocations plus the cost of
        // inserting and extracting the values.
        InstructionCost ScalarCost =
            getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind);
        SmallVector<Type *> Tys(Args.size(), Ty);
        InstructionCost Cost =
            (VF * ScalarCost) + getScalarizationOverhead(VTy, Args, Tys);
        // FIXME: VF 2 for these FP operations is currently just as
        // expensive as for VF 4.
        if (VF == 2)
          Cost *= 2;
        return Cost;
      }
      case 64:
      case 128:
        return NumVectors;
      default:
        break;
      }
    }

    // There is no native support for FRem.
    if (Opcode == Instruction::FRem) {
      SmallVector<Type *> Tys(Args.size(), Ty);
      InstructionCost Cost =
          (VF * LIBCALL_COST) + getScalarizationOverhead(VTy, Args, Tys);
      // FIXME: VF 2 for float is currently just as expensive as for VF 4.
      if (VF == 2 && ScalarBits == 32)
        Cost *= 2;
      return Cost;
    }
  }

  // Fallback to the default implementation.
  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                       Opd1PropInfo, Opd2PropInfo, Args, CxtI);
}

InstructionCost SystemZTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                               VectorType *Tp,
                                               ArrayRef<int> Mask, int Index,
                                               VectorType *SubTp,
                                               ArrayRef<const Value *> Args) {
  Kind = improveShuffleKindFromMask(Kind, Mask);
  if (ST->hasVector()) {
    unsigned NumVectors = getNumVectorRegs(Tp);

    // TODO: Since fp32 is expanded, the shuffle cost should always be 0.

    // FP128 values are always in scalar registers, so there is no work
    // involved with a shuffle, except for broadcast. In that case register
    // moves are done with a single instruction per element.
    if (Tp->getScalarType()->isFP128Ty())
      return (Kind == TargetTransformInfo::SK_Broadcast ? NumVectors - 1 : 0);

    switch (Kind) {
    case TargetTransformInfo::SK_ExtractSubvector:
      // ExtractSubvector Index indicates start offset.

      // Extracting a subvector from first index is a noop.
      return (Index == 0 ? 0 : NumVectors);

    case TargetTransformInfo::SK_Broadcast:
      // Loop vectorizer calls here to figure out the extra cost of
      // broadcasting a loaded value to all elements of a vector. Since vlrep
      // loads and replicates with a single instruction, adjust the returned
      // value.
      return NumVectors - 1;

    default:

      // SystemZ supports single instruction permutation / replication.
      return NumVectors;
    }
  }

  return BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp);
}

// Return the log2 difference of the element sizes of the two vector types.
static unsigned getElSizeLog2Diff(Type *Ty0, Type *Ty1) {
  unsigned Bits0 = Ty0->getScalarSizeInBits();
  unsigned Bits1 = Ty1->getScalarSizeInBits();

  if (Bits1 > Bits0)
    return (Log2_32(Bits1) - Log2_32(Bits0));

  return (Log2_32(Bits0) - Log2_32(Bits1));
}

// Return the number of instructions needed to truncate SrcTy to DstTy.
unsigned SystemZTTIImpl::
getVectorTruncCost(Type *SrcTy, Type *DstTy) {
  assert(SrcTy->isVectorTy() && DstTy->isVectorTy());
  assert(SrcTy->getPrimitiveSizeInBits().getFixedSize() >
             DstTy->getPrimitiveSizeInBits().getFixedSize() &&
         "Packing must reduce size of vector type.");
  assert(cast<FixedVectorType>(SrcTy)->getNumElements() ==
             cast<FixedVectorType>(DstTy)->getNumElements() &&
         "Packing should not change number of elements.");

  // TODO: Since fp32 is expanded, the extract cost should always be 0.

  unsigned NumParts = getNumVectorRegs(SrcTy);
  if (NumParts <= 2)
    // Up to 2 vector registers can be truncated efficiently with pack or
    // permute. The latter requires an immediate mask to be loaded, which
    // typically gets hoisted out of a loop.
    // TODO: return a good value for BB-VECTORIZER that includes the
    // immediate loads, which we do not want to count for the loop vectorizer.
    return 1;

  unsigned Cost = 0;
  unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy);
  unsigned VF = cast<FixedVectorType>(SrcTy)->getNumElements();
  for (unsigned P = 0; P < Log2Diff; ++P) {
    if (NumParts > 1)
      NumParts /= 2;
    Cost += NumParts;
  }

  // Currently, a general mix of permutes and pack instructions is output by
  // isel, which follows the cost computation above except for this case,
  // which is one instruction less:
  if (VF == 8 && SrcTy->getScalarSizeInBits() == 64 &&
      DstTy->getScalarSizeInBits() == 8)
    Cost--;

  return Cost;
}

// Return the cost of converting a vector bitmask produced by a compare
// (SrcTy), to the type of the select or extend instruction (DstTy).
unsigned SystemZTTIImpl::
getVectorBitmaskConversionCost(Type *SrcTy, Type *DstTy) {
  assert(SrcTy->isVectorTy() && DstTy->isVectorTy() &&
         "Should only be called with vector types.");

  unsigned PackCost = 0;
  unsigned SrcScalarBits = SrcTy->getScalarSizeInBits();
  unsigned DstScalarBits = DstTy->getScalarSizeInBits();
  unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy);
  if (SrcScalarBits > DstScalarBits)
    // The bitmask will be truncated.
    PackCost = getVectorTruncCost(SrcTy, DstTy);
  else if (SrcScalarBits < DstScalarBits) {
    unsigned DstNumParts = getNumVectorRegs(DstTy);
    // Each vector select needs its part of the bitmask unpacked.
    PackCost = Log2Diff * DstNumParts;
    // Extra cost for moving part of mask before unpacking.
    PackCost += DstNumParts - 1;
  }

  return PackCost;
}

// Return the type of the compared operands. This is needed to compute the
// cost for a Select / ZExt or SExt instruction.
static Type *getCmpOpsType(const Instruction *I, unsigned VF = 1) {
  Type *OpTy = nullptr;
  if (CmpInst *CI = dyn_cast<CmpInst>(I->getOperand(0)))
    OpTy = CI->getOperand(0)->getType();
  else if (Instruction *LogicI = dyn_cast<Instruction>(I->getOperand(0)))
    if (LogicI->getNumOperands() == 2)
      if (CmpInst *CI0 = dyn_cast<CmpInst>(LogicI->getOperand(0)))
        if (isa<CmpInst>(LogicI->getOperand(1)))
          OpTy = CI0->getOperand(0)->getType();

  if (OpTy != nullptr) {
    if (VF == 1) {
      assert(!OpTy->isVectorTy() && "Expected scalar type");
      return OpTy;
    }
    // Return the potentially vectorized type based on 'I' and 'VF'. 'I' may
    // be either scalar or already vectorized with a same or lesser VF.
    Type *ElTy = OpTy->getScalarType();
    return FixedVectorType::get(ElTy, VF);
  }

  return nullptr;
}

// Get the cost of converting a boolean vector to a vector with same width
// and element size as Dst, plus the cost of zero extending if needed.
unsigned SystemZTTIImpl::
getBoolVecToIntConversionCost(unsigned Opcode, Type *Dst,
                              const Instruction *I) {
  auto *DstVTy = cast<FixedVectorType>(Dst);
  unsigned VF = DstVTy->getNumElements();
  unsigned Cost = 0;
  // If we know the widths of the compared operands, get the cost of
  // converting them to match Dst. Otherwise assume the same widths.
  Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
  if (CmpOpTy != nullptr)
    Cost = getVectorBitmaskConversionCost(CmpOpTy, Dst);
  if (Opcode == Instruction::ZExt || Opcode == Instruction::UIToFP)
    // One 'vn' per dst vector with an immediate mask.
    Cost += getNumVectorRegs(Dst);
  return Cost;
}

InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
                                                 Type *Src,
                                                 TTI::CastContextHint CCH,
                                                 TTI::TargetCostKind CostKind,
                                                 const Instruction *I) {
  // FIXME: Can the logic below also be used for these cost kinds?
  if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency) {
    auto BaseCost = BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
    return BaseCost == 0 ? BaseCost : 1;
  }

  unsigned DstScalarBits = Dst->getScalarSizeInBits();
  unsigned SrcScalarBits = Src->getScalarSizeInBits();

  if (!Src->isVectorTy()) {
    assert(!Dst->isVectorTy());

    if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP) {
      if (SrcScalarBits >= 32 ||
          (I != nullptr && isa<LoadInst>(I->getOperand(0))))
        return 1;
      return SrcScalarBits > 1 ? 2 /*i8/i16 extend*/ : 5 /*branch seq.*/;
    }

    if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt) &&
        Src->isIntegerTy(1)) {
      if (ST->hasLoadStoreOnCond2())
        return 2; // li 0; loc 1

      // This should be extension of a compare i1 result, which is done with
      // ipm and a varying sequence of instructions.
      unsigned Cost = 0;
      if (Opcode == Instruction::SExt)
        Cost = (DstScalarBits < 64 ? 3 : 4);
      if (Opcode == Instruction::ZExt)
        Cost = 3;
      Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I) : nullptr);
      if (CmpOpTy != nullptr && CmpOpTy->isFloatingPointTy())
        // If operands of an fp-type were compared, this costs +1.
        Cost++;
      return Cost;
    }
  }
  else if (ST->hasVector()) {
    // Vector to scalar cast.
    auto *SrcVecTy = cast<FixedVectorType>(Src);
    auto *DstVecTy = dyn_cast<FixedVectorType>(Dst);
    if (!DstVecTy) {
      // TODO: tune vector-to-scalar cast.
      return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
    }
    unsigned VF = SrcVecTy->getNumElements();
    unsigned NumDstVectors = getNumVectorRegs(Dst);
    unsigned NumSrcVectors = getNumVectorRegs(Src);

    if (Opcode == Instruction::Trunc) {
      if (Src->getScalarSizeInBits() == Dst->getScalarSizeInBits())
        return 0; // Check for NOOP conversions.
      return getVectorTruncCost(Src, Dst);
    }

    if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
      if (SrcScalarBits >= 8) {
        // ZExt will use either a single unpack or a vector permute.
        if (Opcode == Instruction::ZExt)
          return NumDstVectors;

        // SExt will be handled with one unpack per doubling of width.
        unsigned NumUnpacks = getElSizeLog2Diff(Src, Dst);

        // For types that span multiple vector registers, some additional
        // instructions are used to set up the unpacking.
        unsigned NumSrcVectorOps =
            (NumUnpacks > 1 ? (NumDstVectors - NumSrcVectors)
                            : (NumDstVectors / 2));

        return (NumUnpacks * NumDstVectors) + NumSrcVectorOps;
      }
      else if (SrcScalarBits == 1)
        return getBoolVecToIntConversionCost(Opcode, Dst, I);
    }

    if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP ||
        Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI) {
      // TODO: Fix base implementation which could simplify things a bit here
      // (seems to miss on differentiating on scalar/vector types).

      // Only 64 bit vector conversions are natively supported before z15.
      if (DstScalarBits == 64 || ST->hasVectorEnhancements2()) {
        if (SrcScalarBits == DstScalarBits)
          return NumDstVectors;

        if (SrcScalarBits == 1)
          return getBoolVecToIntConversionCost(Opcode, Dst, I) + NumDstVectors;
      }

      // Return the cost of multiple scalar invocations plus the cost of
      // inserting and extracting the values. The base implementation does not
      // realize that float->int gets scalarized.
      InstructionCost ScalarCost = getCastInstrCost(
          Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind);
      InstructionCost TotCost = VF * ScalarCost;
      bool NeedsInserts = true, NeedsExtracts = true;
      // FP128 registers do not get inserted or extracted.
      if (DstScalarBits == 128 &&
          (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP))
        NeedsInserts = false;
      if (SrcScalarBits == 128 &&
          (Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI))
        NeedsExtracts = false;

      TotCost += getScalarizationOverhead(SrcVecTy, false, NeedsExtracts);
      TotCost += getScalarizationOverhead(DstVecTy, NeedsInserts, false);

      // FIXME: VF 2 for float<->i32 is currently just as expensive as for VF 4.
      if (VF == 2 && SrcScalarBits == 32 && DstScalarBits == 32)
        TotCost *= 2;

      return TotCost;
    }

    if (Opcode == Instruction::FPTrunc) {
      if (SrcScalarBits == 128) // fp128 -> double/float + inserts of elements.
        return VF /*ldxbr/lexbr*/ +
               getScalarizationOverhead(DstVecTy, true, false);
      else // double -> float
        return VF / 2 /*vledb*/ + std::max(1U, VF / 4 /*vperm*/);
    }

    if (Opcode == Instruction::FPExt) {
      if (SrcScalarBits == 32 && DstScalarBits == 64) {
        // float -> double is very rare and currently unoptimized. Instead of
        // using vldeb, which can do two at a time, all conversions are
        // scalarized.
        return VF * 2;
      }
      // -> fp128. VF * lxdb/lxeb + extraction of elements.
      return VF + getScalarizationOverhead(SrcVecTy, false, true);
    }
  }

  return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
}

// Scalar i8 / i16 operations will typically be made after first extending
// the operands to i32.
static unsigned getOperandsExtensionCost(const Instruction *I) {
  unsigned ExtCost = 0;
  for (Value *Op : I->operands())
    // A load of i8 or i16 sign/zero extends to i32.
    if (!isa<LoadInst>(Op) && !isa<ConstantInt>(Op))
      ExtCost++;

  return ExtCost;
}

InstructionCost SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                                   Type *CondTy,
                                                   CmpInst::Predicate VecPred,
                                                   TTI::TargetCostKind CostKind,
                                                   const Instruction *I) {
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind);

  if (!ValTy->isVectorTy()) {
    switch (Opcode) {
    case Instruction::ICmp: {
      // A loaded value compared with 0 with multiple users becomes Load and
      // Test. The load is then not foldable, so return 0 cost for the ICmp.
      unsigned ScalarBits = ValTy->getScalarSizeInBits();
      if (I != nullptr && ScalarBits >= 32)
        if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
          if (const ConstantInt *C = dyn_cast<ConstantInt>(I->getOperand(1)))
            if (!Ld->hasOneUse() && Ld->getParent() == I->getParent() &&
                C->isZero())
              return 0;

      unsigned Cost = 1;
      if (ValTy->isIntegerTy() && ValTy->getScalarSizeInBits() <= 16)
        Cost += (I != nullptr ? getOperandsExtensionCost(I) : 2);
      return Cost;
    }
    case Instruction::Select:
      if (ValTy->isFloatingPointTy())
        return 4; // No load on condition for FP - costs a conditional jump.
      return 1; // Load On Condition / Select Register.
    }
  }
  else if (ST->hasVector()) {
    unsigned VF = cast<FixedVectorType>(ValTy)->getNumElements();

    // Called with a compare instruction.
    if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
      unsigned PredicateExtraCost = 0;
      if (I != nullptr) {
        // Some predicates cost one or two extra instructions.
        switch (cast<CmpInst>(I)->getPredicate()) {
        case CmpInst::Predicate::ICMP_NE:
        case CmpInst::Predicate::ICMP_UGE:
        case CmpInst::Predicate::ICMP_ULE:
        case CmpInst::Predicate::ICMP_SGE:
        case CmpInst::Predicate::ICMP_SLE:
          PredicateExtraCost = 1;
          break;
        case CmpInst::Predicate::FCMP_ONE:
        case CmpInst::Predicate::FCMP_ORD:
        case CmpInst::Predicate::FCMP_UEQ:
        case CmpInst::Predicate::FCMP_UNO:
          PredicateExtraCost = 2;
          break;
        default:
          break;
        }
      }

      // Float is handled with 2*vmr[lh]f + 2*vldeb + vfchdb for each pair of
      // floats. FIXME: <2 x float> generates the same code as <4 x float>.
      unsigned CmpCostPerVector = (ValTy->getScalarType()->isFloatTy() ? 10 : 1);
      unsigned NumVecs_cmp = getNumVectorRegs(ValTy);

      unsigned Cost = (NumVecs_cmp * (CmpCostPerVector + PredicateExtraCost));
      return Cost;
    }
    else { // Called with a select instruction.
      assert(Opcode == Instruction::Select);

      // We can figure out the extra cost of packing / unpacking if the
      // instruction was passed and the compare instruction is found.
      unsigned PackCost = 0;
      Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
      if (CmpOpTy != nullptr)
        PackCost =
            getVectorBitmaskConversionCost(CmpOpTy, ValTy);

      return getNumVectorRegs(ValTy) /*vsel*/ + PackCost;
    }
  }

  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind);
}

InstructionCost SystemZTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
                                                   unsigned Index) {
  // vlvgp will insert two grs into a vector register, so only count half the
  // number of instructions.
  if (Opcode == Instruction::InsertElement && Val->isIntOrIntVectorTy(64))
    return ((Index % 2 == 0) ? 1 : 0);

  if (Opcode == Instruction::ExtractElement) {
    int Cost = ((getScalarSizeInBits(Val) == 1) ? 2 /*+test-under-mask*/ : 1);

    // Give a slight penalty for moving out of vector pipeline to FXU unit.
    if (Index == 0 && Val->isIntOrIntVectorTy())
      Cost += 1;

    return Cost;
  }

  return BaseT::getVectorInstrCost(Opcode, Val, Index);
}

// Check if a load may be folded as a memory operand in its user.
bool SystemZTTIImpl::
isFoldableLoad(const LoadInst *Ld, const Instruction *&FoldedValue) {
  if (!Ld->hasOneUse())
    return false;
  FoldedValue = Ld;
  const Instruction *UserI = cast<Instruction>(*Ld->user_begin());
  unsigned LoadedBits = getScalarSizeInBits(Ld->getType());
  unsigned TruncBits = 0;
  unsigned SExtBits = 0;
  unsigned ZExtBits = 0;
  if (UserI->hasOneUse()) {
    unsigned UserBits = UserI->getType()->getScalarSizeInBits();
    if (isa<TruncInst>(UserI))
      TruncBits = UserBits;
    else if (isa<SExtInst>(UserI))
      SExtBits = UserBits;
    else if (isa<ZExtInst>(UserI))
      ZExtBits = UserBits;
  }
  if (TruncBits || SExtBits || ZExtBits) {
    FoldedValue = UserI;
    UserI = cast<Instruction>(*UserI->user_begin());
    // Load (single use) -> trunc/extend (single use) -> UserI
  }
  if ((UserI->getOpcode() == Instruction::Sub ||
       UserI->getOpcode() == Instruction::SDiv ||
       UserI->getOpcode() == Instruction::UDiv) &&
      UserI->getOperand(1) != FoldedValue)
    return false; // Not commutative, only RHS foldable.
  // LoadOrTruncBits holds the number of effectively loaded bits, but 0 if an
  // extension was made of the load.
  unsigned LoadOrTruncBits =
      ((SExtBits || ZExtBits) ? 0 : (TruncBits ? TruncBits : LoadedBits));
  switch (UserI->getOpcode()) {
  case Instruction::Add: // SE: 16->32, 16/32->64, z14:16->64. ZE: 32->64
  case Instruction::Sub:
  case Instruction::ICmp:
    if (LoadedBits == 32 && ZExtBits == 64)
      return true;
    LLVM_FALLTHROUGH;
  case Instruction::Mul: // SE: 16->32, 32->64, z14:16->64
    if (UserI->getOpcode() != Instruction::ICmp) {
      if (LoadedBits == 16 &&
          (SExtBits == 32 ||
           (SExtBits == 64 && ST->hasMiscellaneousExtensions2())))
        return true;
      if (LoadOrTruncBits == 16)
        return true;
    }
    LLVM_FALLTHROUGH;
  case Instruction::SDiv: // SE: 32->64
    if (LoadedBits == 32 && SExtBits == 64)
      return true;
    LLVM_FALLTHROUGH;
  case Instruction::UDiv:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
    // This also makes sense for float operations, but disabled for now due
    // to regressions.
    // case Instruction::FCmp:
    // case Instruction::FAdd:
    // case Instruction::FSub:
    // case Instruction::FMul:
    // case Instruction::FDiv:

    // All possible extensions of memory checked above.

    // Comparison between memory and immediate.
    if (UserI->getOpcode() == Instruction::ICmp)
      if (ConstantInt *CI = dyn_cast<ConstantInt>(UserI->getOperand(1)))
        if (CI->getValue().isIntN(16))
          return true;
    return (LoadOrTruncBits == 32 || LoadOrTruncBits == 64);
    break;
  }
  return false;
}

static bool isBswapIntrinsicCall(const Value *V) {
  if (const Instruction *I = dyn_cast<Instruction>(V))
    if (auto *CI = dyn_cast<CallInst>(I))
      if (auto *F = CI->getCalledFunction())
        if (F->getIntrinsicID() == Intrinsic::bswap)
          return true;
  return false;
}

InstructionCost SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                                MaybeAlign Alignment,
                                                unsigned AddressSpace,
                                                TTI::TargetCostKind CostKind,
                                                const Instruction *I) {
  assert(!Src->isVoidTy() && "Invalid type");

  // TODO: Handle other cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return 1;

  if (!Src->isVectorTy() && Opcode == Instruction::Load && I != nullptr) {
    // Store the load or its truncated or extended value in FoldedValue.
    const Instruction *FoldedValue = nullptr;
    if (isFoldableLoad(cast<LoadInst>(I), FoldedValue)) {
      const Instruction *UserI = cast<Instruction>(*FoldedValue->user_begin());
      assert(UserI->getNumOperands() == 2 && "Expected a binop.");

      // UserI can't fold two loads, so in that case return 0 cost only
      // half of the time.
      for (unsigned i = 0; i < 2; ++i) {
        if (UserI->getOperand(i) == FoldedValue)
          continue;

        if (Instruction *OtherOp = dyn_cast<Instruction>(UserI->getOperand(i))) {
          LoadInst *OtherLoad = dyn_cast<LoadInst>(OtherOp);
          if (!OtherLoad &&
              (isa<TruncInst>(OtherOp) || isa<SExtInst>(OtherOp) ||
               isa<ZExtInst>(OtherOp)))
            OtherLoad = dyn_cast<LoadInst>(OtherOp->getOperand(0));
          if (OtherLoad && isFoldableLoad(OtherLoad, FoldedValue/*dummy*/))
            return i == 0; // Both operands foldable.
        }
      }

      return 0; // Only I is foldable in user.
    }
  }

  unsigned NumOps =
    (Src->isVectorTy() ? getNumVectorRegs(Src) : getNumberOfParts(Src));

  // Store/Load reversed saves one instruction.
  if (((!Src->isVectorTy() && NumOps == 1) || ST->hasVectorEnhancements2()) &&
      I != nullptr) {
    if (Opcode == Instruction::Load && I->hasOneUse()) {
      const Instruction *LdUser = cast<Instruction>(*I->user_begin());
      // In case of load -> bswap -> store, return normal cost for the load.
      if (isBswapIntrinsicCall(LdUser) &&
          (!LdUser->hasOneUse() || !isa<StoreInst>(*LdUser->user_begin())))
        return 0;
    }
    else if (const StoreInst *SI = dyn_cast<StoreInst>(I)) {
      const Value *StoredVal = SI->getValueOperand();
      if (StoredVal->hasOneUse() && isBswapIntrinsicCall(StoredVal))
        return 0;
    }
  }

  if (Src->getScalarSizeInBits() == 128)
    // 128 bit scalars are held in a pair of two 64 bit registers.
    NumOps *= 2;

  return NumOps;
}

// The generic implementation of getInterleavedMemoryOpCost() is based on
// adding costs of the memory operations plus all the extracts and inserts
// needed for using / defining the vector operands. The SystemZ version does
// roughly the same but bases the computations on vector permutations
// instead.
InstructionCost SystemZTTIImpl::getInterleavedMemoryOpCost(
    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
    Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
    bool UseMaskForCond, bool UseMaskForGaps) {
  if (UseMaskForCond || UseMaskForGaps)
    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                             Alignment, AddressSpace, CostKind,
                                             UseMaskForCond, UseMaskForGaps);
  assert(isa<VectorType>(VecTy) &&
         "Expect a vector type for interleaved memory op");

  unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
  assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
  unsigned VF = NumElts / Factor;
  unsigned NumEltsPerVecReg = (128U / getScalarSizeInBits(VecTy));
  unsigned NumVectorMemOps = getNumVectorRegs(VecTy);
  unsigned NumPermutes = 0;

  if (Opcode == Instruction::Load) {
    // Loading interleave groups may have gaps, which may mean fewer
    // loads. Find out how many vectors will be loaded in total, and how many
    // of them each value will appear in.
    BitVector UsedInsts(NumVectorMemOps, false);
    std::vector<BitVector> ValueVecs(Factor, BitVector(NumVectorMemOps, false));
    for (unsigned Index : Indices)
      for (unsigned Elt = 0; Elt < VF; ++Elt) {
        unsigned Vec = (Index + Elt * Factor) / NumEltsPerVecReg;
        UsedInsts.set(Vec);
        ValueVecs[Index].set(Vec);
      }
    NumVectorMemOps = UsedInsts.count();

    for (unsigned Index : Indices) {
      // Estimate that each loaded source vector containing this Index
      // requires one operation, except that vperm can handle two input
      // registers first time for each dst vector.
      unsigned NumSrcVecs = ValueVecs[Index].count();
      unsigned NumDstVecs = divideCeil(VF * getScalarSizeInBits(VecTy), 128U);
      assert(NumSrcVecs >= NumDstVecs && "Expected at least as many sources");
      NumPermutes += std::max(1U, NumSrcVecs - NumDstVecs);
    }
  } else {
    // Estimate the permutes for each stored vector as the smaller of the
    // number of elements and the number of source vectors. Subtract one per
    // dst vector for vperm (S.A.).
    unsigned NumSrcVecs = std::min(NumEltsPerVecReg, Factor);
    unsigned NumDstVecs = NumVectorMemOps;
    assert(NumSrcVecs > 1 && "Expected at least two source vectors.");
    NumPermutes += (NumDstVecs * NumSrcVecs) - NumDstVecs;
  }

  // Cost of load/store operations and the permutations needed.
  return NumVectorMemOps + NumPermutes;
}

static int getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy) {
  if (RetTy->isVectorTy() && ID == Intrinsic::bswap)
    return getNumVectorRegs(RetTy); // VPERM
  return -1;
}

InstructionCost
SystemZTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                      TTI::TargetCostKind CostKind) {
  InstructionCost Cost =
      getVectorIntrinsicInstrCost(ICA.getID(), ICA.getReturnType());
  if (Cost != -1)
    return Cost;
  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}