//===-- SystemZTargetTransformInfo.cpp - SystemZ-specific TTI -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a TargetTransformInfo analysis pass specific to the
// SystemZ target machine. It uses the target's detailed information to provide
// more precise answers to certain TTI queries, while letting the target
// independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//

#include "SystemZTargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/Debug.h"
using namespace llvm;

#define DEBUG_TYPE "systemztti"

//===----------------------------------------------------------------------===//
//
// SystemZ cost model.
//
//===----------------------------------------------------------------------===//

static bool isUsedAsMemCpySource(const Value *V, bool &OtherUse) {
  bool UsedAsMemCpySource = false;
  for (const User *U : V->users())
    if (const Instruction *User = dyn_cast<Instruction>(U)) {
      if (isa<BitCastInst>(User) || isa<GetElementPtrInst>(User)) {
        UsedAsMemCpySource |= isUsedAsMemCpySource(User, OtherUse);
        continue;
      }
      if (const MemCpyInst *Memcpy = dyn_cast<MemCpyInst>(User)) {
        if (Memcpy->getOperand(1) == V && !Memcpy->isVolatile()) {
          UsedAsMemCpySource = true;
          continue;
        }
      }
      OtherUse = true;
    }
  return UsedAsMemCpySource;
}

unsigned SystemZTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
  unsigned Bonus = 0;

  // Increase the threshold if an incoming argument is used only as a memcpy
  // source.
  if (Function *Callee = CB->getCalledFunction())
    for (Argument &Arg : Callee->args()) {
      bool OtherUse = false;
      if (isUsedAsMemCpySource(&Arg, OtherUse) && !OtherUse)
        Bonus += 150;
    }

  LLVM_DEBUG(if (Bonus)
               dbgs() << "++ SZTTI Adding inlining bonus: " << Bonus << "\n";);
  return Bonus;
}

InstructionCost SystemZTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                              TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;
  // No cost model for operations on integers larger than 128 bits implemented
  // yet.
  if ((!ST->hasVector() && BitSize > 64) || BitSize > 128)
    return TTI::TCC_Free;

  if (Imm == 0)
    return TTI::TCC_Free;

  if (Imm.getBitWidth() <= 64) {
    // Constants loaded via lgfi.
    if (isInt<32>(Imm.getSExtValue()))
      return TTI::TCC_Basic;
    // Constants loaded via llilf.
    if (isUInt<32>(Imm.getZExtValue()))
      return TTI::TCC_Basic;
    // Constants loaded via llihf:
    if ((Imm.getZExtValue() & 0xffffffff) == 0)
      return TTI::TCC_Basic;

    return 2 * TTI::TCC_Basic;
  }

  // i128 immediates are loaded from the constant pool.
  return 2 * TTI::TCC_Basic;
}

InstructionCost SystemZTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                                  const APInt &Imm, Type *Ty,
                                                  TTI::TargetCostKind CostKind,
                                                  Instruction *Inst) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;
  // No cost model for operations on integers larger than 64 bits implemented
  // yet.
  if (BitSize > 64)
    return TTI::TCC_Free;

  switch (Opcode) {
  default:
    return TTI::TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr. This prevents the
    // creation of new constants for every base constant that gets constant
    // folded with the offset.
    if (Idx == 0)
      return 2 * TTI::TCC_Basic;
    return TTI::TCC_Free;
  case Instruction::Store:
    if (Idx == 0 && Imm.getBitWidth() <= 64) {
      // Any 8-bit immediate store can be implemented via mvi.
      if (BitSize == 8)
        return TTI::TCC_Free;
      // 16-bit immediate values can be stored via mvhhi/mvhi/mvghi.
      if (isInt<16>(Imm.getSExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Instruction::ICmp:
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      // Comparisons against signed 32-bit immediates implemented via cgfi.
      if (isInt<32>(Imm.getSExtValue()))
        return TTI::TCC_Free;
      // Comparisons against unsigned 32-bit immediates implemented via clgfi.
      if (isUInt<32>(Imm.getZExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Instruction::Add:
  case Instruction::Sub:
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      // We use algfi/slgfi to add/subtract 32-bit unsigned immediates.
      if (isUInt<32>(Imm.getZExtValue()))
        return TTI::TCC_Free;
      // Or their negation, by swapping addition vs. subtraction.
      if (isUInt<32>(-Imm.getSExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Instruction::Mul:
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      // We use msgfi to multiply by 32-bit signed immediates.
      if (isInt<32>(Imm.getSExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Instruction::Or:
  case Instruction::Xor:
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      // Masks supported by oilf/xilf.
      if (isUInt<32>(Imm.getZExtValue()))
        return TTI::TCC_Free;
      // Masks supported by oihf/xihf.
      if ((Imm.getZExtValue() & 0xffffffff) == 0)
        return TTI::TCC_Free;
    }
    break;
  case Instruction::And:
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      // Any 32-bit AND operation can be implemented via nilf.
      if (BitSize <= 32)
        return TTI::TCC_Free;
      // 64-bit masks supported by nilf.
      if (isUInt<32>(~Imm.getZExtValue()))
        return TTI::TCC_Free;
      // 64-bit masks supported by nilh.
      if ((Imm.getZExtValue() & 0xffffffff) == 0xffffffff)
        return TTI::TCC_Free;
      // Some 64-bit AND operations can be implemented via risbg.
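      // (risbg = "rotate then insert selected bits"; roughly, masks that form
      // a single contiguous run of ones, possibly wrapping around, qualify.)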
      const SystemZInstrInfo *TII = ST->getInstrInfo();
      unsigned Start, End;
      if (TII->isRxSBGMask(Imm.getZExtValue(), BitSize, Start, End))
        return TTI::TCC_Free;
    }
    break;
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    // Always return TCC_Free for the shift value of a shift instruction.
    if (Idx == 1)
      return TTI::TCC_Free;
    break;
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::Trunc:
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::IntToPtr:
  case Instruction::PtrToInt:
  case Instruction::BitCast:
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Select:
  case Instruction::Ret:
  case Instruction::Load:
    break;
  }

  return SystemZTTIImpl::getIntImmCost(Imm, Ty, CostKind);
}

InstructionCost
SystemZTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
                                    const APInt &Imm, Type *Ty,
                                    TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;
  // No cost model for operations on integers larger than 64 bits implemented
  // yet.
  if (BitSize > 64)
    return TTI::TCC_Free;

  switch (IID) {
  default:
    return TTI::TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
    // These get expanded to include a normal addition/subtraction.
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      if (isUInt<32>(Imm.getZExtValue()))
        return TTI::TCC_Free;
      if (isUInt<32>(-Imm.getSExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow:
    // These get expanded to include a normal multiplication.
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      if (isInt<32>(Imm.getSExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint_i64:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  }
  return SystemZTTIImpl::getIntImmCost(Imm, Ty, CostKind);
}

TargetTransformInfo::PopcntSupportKind
SystemZTTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Type width must be power of 2");
  if (ST->hasPopulationCount() && TyWidth <= 64)
    return TTI::PSK_FastHardware;
  return TTI::PSK_Software;
}

void SystemZTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                             TTI::UnrollingPreferences &UP,
                                             OptimizationRemarkEmitter *ORE) {
  // Find out if L contains a call and how many stores there are.
  bool HasCall = false;
  InstructionCost NumStores = 0;
  for (auto &BB : L->blocks())
    for (auto &I : *BB) {
      if (isa<CallInst>(&I) || isa<InvokeInst>(&I)) {
        if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
          if (isLoweredToCall(F))
            HasCall = true;
          if (F->getIntrinsicID() == Intrinsic::memcpy ||
              F->getIntrinsicID() == Intrinsic::memset)
            NumStores++;
        } else { // indirect call.
          HasCall = true;
        }
      }
      if (isa<StoreInst>(&I)) {
        Type *MemAccessTy = I.getOperand(0)->getType();
        NumStores += getMemoryOpCost(Instruction::Store, MemAccessTy,
                                     std::nullopt, 0, TTI::TCK_RecipThroughput);
      }
    }

  // The z13 processor will run out of store tags if too many stores
  // are fed into it too quickly. Therefore make sure there are not
  // too many stores in the resulting unrolled loop.
  unsigned const NumStoresVal = *NumStores.getValue();
  unsigned const Max = (NumStoresVal ? (12 / NumStoresVal) : UINT_MAX);

  if (HasCall) {
    // If the loop contains calls, only allow full unrolling (up to Max) and
    // disable partial unrolling.
    UP.FullUnrollMaxCount = Max;
    UP.MaxCount = 1;
    return;
  }

  UP.MaxCount = Max;
  if (UP.MaxCount <= 1)
    return;

  // Allow partial and runtime trip count unrolling.
  UP.Partial = UP.Runtime = true;

  UP.PartialThreshold = 75;
  UP.DefaultUnrollRuntimeCount = 4;

  // Allow expensive instructions in the pre-header of the loop.
  UP.AllowExpensiveTripCount = true;

  UP.Force = true;
}

void SystemZTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                           TTI::PeelingPreferences &PP) {
  BaseT::getPeelingPreferences(L, SE, PP);
}

bool SystemZTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
                                   const TargetTransformInfo::LSRCost &C2) {
  // SystemZ specific: check instruction count (first), and don't care about
  // ImmCost, since offsets are checked explicitly.
  return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
                  C1.NumIVMuls, C1.NumBaseAdds,
                  C1.ScaleCost, C1.SetupCost) <
         std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
                  C2.NumIVMuls, C2.NumBaseAdds,
                  C2.ScaleCost, C2.SetupCost);
}

unsigned SystemZTTIImpl::getNumberOfRegisters(unsigned ClassID) const {
  bool Vector = (ClassID == 1);
  if (!Vector)
    // Discount the stack pointer. Also leave out %r0, since it can't
    // be used in an address.
    return 14;
  if (ST->hasVector())
    return 32;
  return 0;
}

TypeSize
SystemZTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(64);
  case TargetTransformInfo::RGK_FixedWidthVector:
    return TypeSize::getFixed(ST->hasVector() ? 128 : 0);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(0);
  }

  llvm_unreachable("Unsupported register kind");
}

unsigned SystemZTTIImpl::getMinPrefetchStride(unsigned NumMemAccesses,
                                              unsigned NumStridedMemAccesses,
                                              unsigned NumPrefetches,
                                              bool HasCall) const {
  // Don't prefetch a loop with many far-apart accesses.
  if (NumPrefetches > 16)
    return UINT_MAX;

  // Emit prefetch instructions for smaller strides in cases where we think
  // the hardware prefetcher might not be able to keep up.
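  // (Heuristically: the loop has no calls, many strided accesses, and almost
  // no unstrided ones.)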
  if (NumStridedMemAccesses > 32 && !HasCall &&
      (NumMemAccesses - NumStridedMemAccesses) * 32 <= NumStridedMemAccesses)
    return 1;

  return ST->hasMiscellaneousExtensions3() ? 8192 : 2048;
}

bool SystemZTTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
  EVT VT = TLI->getValueType(DL, DataType);
  return (VT.isScalarInteger() && TLI->isTypeLegal(VT));
}

// Return the bit size for the scalar type or vector element
// type. getScalarSizeInBits() returns 0 for a pointer type.
static unsigned getScalarSizeInBits(Type *Ty) {
  unsigned Size =
      (Ty->isPtrOrPtrVectorTy() ? 64U : Ty->getScalarSizeInBits());
  assert(Size > 0 && "Element must have non-zero size.");
  return Size;
}

// getNumberOfParts() calls getTypeLegalizationCost() which splits the vector
// type until it is legal. This would e.g. return 4 for <6 x i64>, instead of
// 3.
static unsigned getNumVectorRegs(Type *Ty) {
  auto *VTy = cast<FixedVectorType>(Ty);
  unsigned WideBits = getScalarSizeInBits(Ty) * VTy->getNumElements();
  assert(WideBits > 0 && "Could not compute size of vector");
  return ((WideBits % 128U) ? ((WideBits / 128U) + 1) : (WideBits / 128U));
}

InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args,
    const Instruction *CxtI) {

  // TODO: Handle more cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
                                         Op2Info, Args, CxtI);

  // TODO: return a good value for BB-VECTORIZER that includes the
  // immediate loads, which we do not want to count for the loop
  // vectorizer, since they are hopefully hoisted out of the loop. This
  // would require a new parameter 'InLoop', but it is not clear whether
  // constant args are common enough to motivate this.

  unsigned ScalarBits = Ty->getScalarSizeInBits();

  // There are three cases of division and remainder: Dividing with a register
  // needs a divide instruction. A divisor which is a power of two constant
  // can be implemented with a sequence of shifts. Any other constant needs a
  // multiply and shifts.
  const unsigned DivInstrCost = 20;
  const unsigned DivMulSeqCost = 10;
  const unsigned SDivPow2Cost = 4;

  bool SignedDivRem =
      Opcode == Instruction::SDiv || Opcode == Instruction::SRem;
  bool UnsignedDivRem =
      Opcode == Instruction::UDiv || Opcode == Instruction::URem;

  // Check for a constant divisor.
  bool DivRemConst = false;
  bool DivRemConstPow2 = false;
  if ((SignedDivRem || UnsignedDivRem) && Args.size() == 2) {
    if (const Constant *C = dyn_cast<Constant>(Args[1])) {
      const ConstantInt *CVal =
          (C->getType()->isVectorTy()
               ? dyn_cast_or_null<const ConstantInt>(C->getSplatValue())
               : dyn_cast<const ConstantInt>(C));
      if (CVal && (CVal->getValue().isPowerOf2() ||
                   CVal->getValue().isNegatedPowerOf2()))
        DivRemConstPow2 = true;
      else
        DivRemConst = true;
    }
  }

  if (!Ty->isVectorTy()) {
    // These FP operations are supported with a dedicated instruction for
    // float, double and fp128 (base implementation assumes float generally
    // costs 2).
    if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
        Opcode == Instruction::FMul || Opcode == Instruction::FDiv)
      return 1;

    // There is no native support for FRem.
    if (Opcode == Instruction::FRem)
      return LIBCALL_COST;

    // Give discount for some combined logical operations if supported.
    if (Args.size() == 2) {
      if (Opcode == Instruction::Xor) {
        for (const Value *A : Args) {
          if (const Instruction *I = dyn_cast<Instruction>(A))
            if (I->hasOneUse() &&
                (I->getOpcode() == Instruction::Or ||
                 I->getOpcode() == Instruction::And ||
                 I->getOpcode() == Instruction::Xor))
              if ((ScalarBits <= 64 && ST->hasMiscellaneousExtensions3()) ||
                  (isInt128InVR(Ty) &&
                   (I->getOpcode() == Instruction::Or ||
                    ST->hasVectorEnhancements1())))
                return 0;
        }
      }
      else if (Opcode == Instruction::And || Opcode == Instruction::Or) {
        for (const Value *A : Args) {
          if (const Instruction *I = dyn_cast<Instruction>(A))
            if ((I->hasOneUse() && I->getOpcode() == Instruction::Xor) &&
                ((ScalarBits <= 64 && ST->hasMiscellaneousExtensions3()) ||
                 (isInt128InVR(Ty) &&
                  (Opcode == Instruction::And ||
                   ST->hasVectorEnhancements1()))))
              return 0;
        }
      }
    }

    // Or requires one instruction, although it has custom handling for i64.
    if (Opcode == Instruction::Or)
      return 1;

    if (Opcode == Instruction::Xor && ScalarBits == 1) {
      if (ST->hasLoadStoreOnCond2())
        return 5; // 2 * (li 0; loc 1); xor
      return 7; // 2 * ipm sequences ; xor ; shift ; compare
    }

    if (DivRemConstPow2)
      return (SignedDivRem ? SDivPow2Cost : 1);
    if (DivRemConst)
      return DivMulSeqCost;
    if (SignedDivRem || UnsignedDivRem)
      return DivInstrCost;
  }
  else if (ST->hasVector()) {
    auto *VTy = cast<FixedVectorType>(Ty);
    unsigned VF = VTy->getNumElements();
    unsigned NumVectors = getNumVectorRegs(Ty);

    // These vector operations are custom handled, but are still supported
    // with one instruction per vector, regardless of element size.
    if (Opcode == Instruction::Shl || Opcode == Instruction::LShr ||
        Opcode == Instruction::AShr) {
      return NumVectors;
    }

    if (DivRemConstPow2)
      return (NumVectors * (SignedDivRem ? SDivPow2Cost : 1));
    if (DivRemConst) {
      SmallVector<Type *> Tys(Args.size(), Ty);
      return VF * DivMulSeqCost +
             getScalarizationOverhead(VTy, Args, Tys, CostKind);
    }
    if ((SignedDivRem || UnsignedDivRem) && VF > 4)
      // Temporary hack: disable high vectorization factors with integer
      // division/remainder, which will get scalarized and handled with
      // GR128 registers. The mischeduler is not clever enough to avoid
      // spilling yet.
      return 1000;

    // These FP operations are supported with a single vector instruction for
    // double (base implementation assumes float generally costs 2). For
    // FP128, the scalar cost is 1, and there is no overhead since the values
    // are already in scalar registers.
    if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
        Opcode == Instruction::FMul || Opcode == Instruction::FDiv) {
      switch (ScalarBits) {
      case 32: {
        // The vector enhancements facility 1 provides v4f32 instructions.
        if (ST->hasVectorEnhancements1())
          return NumVectors;
        // Return the cost of multiple scalar invocations plus the cost of
        // inserting and extracting the values.
        InstructionCost ScalarCost =
            getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind);
        SmallVector<Type *> Tys(Args.size(), Ty);
        InstructionCost Cost =
            (VF * ScalarCost) +
            getScalarizationOverhead(VTy, Args, Tys, CostKind);
        // FIXME: VF 2 for these FP operations is currently just as
        // expensive as for VF 4.
        if (VF == 2)
          Cost *= 2;
        return Cost;
      }
      case 64:
      case 128:
        return NumVectors;
      default:
        break;
      }
    }

    // There is no native support for FRem.
    if (Opcode == Instruction::FRem) {
      SmallVector<Type *> Tys(Args.size(), Ty);
      InstructionCost Cost = (VF * LIBCALL_COST) +
                             getScalarizationOverhead(VTy, Args, Tys, CostKind);
      // FIXME: VF 2 for float is currently just as expensive as for VF 4.
      if (VF == 2 && ScalarBits == 32)
        Cost *= 2;
      return Cost;
    }
  }

  // Fallback to the default implementation.
  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                       Args, CxtI);
}

InstructionCost SystemZTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                               VectorType *Tp,
                                               ArrayRef<int> Mask,
                                               TTI::TargetCostKind CostKind,
                                               int Index, VectorType *SubTp,
                                               ArrayRef<const Value *> Args) {
  Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
  if (ST->hasVector()) {
    unsigned NumVectors = getNumVectorRegs(Tp);

    // TODO: Since fp32 is expanded, the shuffle cost should always be 0.

    // FP128 values are always in scalar registers, so there is no work
    // involved with a shuffle, except for broadcast. In that case register
    // moves are done with a single instruction per element.
    if (Tp->getScalarType()->isFP128Ty())
      return (Kind == TargetTransformInfo::SK_Broadcast ? NumVectors - 1 : 0);

    switch (Kind) {
    case TargetTransformInfo::SK_ExtractSubvector:
      // ExtractSubvector Index indicates start offset.

      // Extracting a subvector from first index is a noop.
      return (Index == 0 ? 0 : NumVectors);

    case TargetTransformInfo::SK_Broadcast:
      // Loop vectorizer calls here to figure out the extra cost of
      // broadcasting a loaded value to all elements of a vector. Since vlrep
      // loads and replicates with a single instruction, adjust the returned
      // value.
      return NumVectors - 1;

    default:

      // SystemZ supports single instruction permutation / replication.
      return NumVectors;
    }
  }

  return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
}

// Return the log2 difference of the element sizes of the two vector types.
static unsigned getElSizeLog2Diff(Type *Ty0, Type *Ty1) {
  unsigned Bits0 = Ty0->getScalarSizeInBits();
  unsigned Bits1 = Ty1->getScalarSizeInBits();

  if (Bits1 > Bits0)
    return (Log2_32(Bits1) - Log2_32(Bits0));

  return (Log2_32(Bits0) - Log2_32(Bits1));
}

// Return the number of instructions needed to truncate SrcTy to DstTy.
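// As a rough illustration of the computation below (a sketch, not a measured
// cost): truncating <8 x i64> (4 source vector registers) to <8 x i8> walks
// three packing steps costing 2 + 1 + 1 instructions, and the special case at
// the end subtracts one, giving a total of 3.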
unsigned SystemZTTIImpl::
getVectorTruncCost(Type *SrcTy, Type *DstTy) {
  assert(SrcTy->isVectorTy() && DstTy->isVectorTy());
  assert(SrcTy->getPrimitiveSizeInBits().getFixedValue() >
             DstTy->getPrimitiveSizeInBits().getFixedValue() &&
         "Packing must reduce size of vector type.");
  assert(cast<FixedVectorType>(SrcTy)->getNumElements() ==
             cast<FixedVectorType>(DstTy)->getNumElements() &&
         "Packing should not change number of elements.");

  // TODO: Since fp32 is expanded, the extract cost should always be 0.

  unsigned NumParts = getNumVectorRegs(SrcTy);
  if (NumParts <= 2)
    // Up to 2 vector registers can be truncated efficiently with pack or
    // permute. The latter requires an immediate mask to be loaded, which
    // typically gets hoisted out of a loop. TODO: return a good value for
    // BB-VECTORIZER that includes the immediate loads, which we do not want
    // to count for the loop vectorizer.
    return 1;

  unsigned Cost = 0;
  unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy);
  unsigned VF = cast<FixedVectorType>(SrcTy)->getNumElements();
  for (unsigned P = 0; P < Log2Diff; ++P) {
    if (NumParts > 1)
      NumParts /= 2;
    Cost += NumParts;
  }

  // Currently, a general mix of permutes and pack instructions is output by
  // isel, which follows the cost computation above except for this case,
  // which is one instruction less:
  if (VF == 8 && SrcTy->getScalarSizeInBits() == 64 &&
      DstTy->getScalarSizeInBits() == 8)
    Cost--;

  return Cost;
}

// Return the cost of converting a vector bitmask produced by a compare
// (SrcTy), to the type of the select or extend instruction (DstTy).
unsigned SystemZTTIImpl::
getVectorBitmaskConversionCost(Type *SrcTy, Type *DstTy) {
  assert(SrcTy->isVectorTy() && DstTy->isVectorTy() &&
         "Should only be called with vector types.");

  unsigned PackCost = 0;
  unsigned SrcScalarBits = SrcTy->getScalarSizeInBits();
  unsigned DstScalarBits = DstTy->getScalarSizeInBits();
  unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy);
  if (SrcScalarBits > DstScalarBits)
    // The bitmask will be truncated.
    PackCost = getVectorTruncCost(SrcTy, DstTy);
  else if (SrcScalarBits < DstScalarBits) {
    unsigned DstNumParts = getNumVectorRegs(DstTy);
    // Each vector select needs its part of the bitmask unpacked.
    PackCost = Log2Diff * DstNumParts;
    // Extra cost for moving part of mask before unpacking.
    PackCost += DstNumParts - 1;
  }

  return PackCost;
}

// Return the type of the compared operands. This is needed to compute the
// cost for a Select / ZExt or SExt instruction.
static Type *getCmpOpsType(const Instruction *I, unsigned VF = 1) {
  Type *OpTy = nullptr;
  if (CmpInst *CI = dyn_cast<CmpInst>(I->getOperand(0)))
    OpTy = CI->getOperand(0)->getType();
  else if (Instruction *LogicI = dyn_cast<Instruction>(I->getOperand(0)))
    if (LogicI->getNumOperands() == 2)
      if (CmpInst *CI0 = dyn_cast<CmpInst>(LogicI->getOperand(0)))
        if (isa<CmpInst>(LogicI->getOperand(1)))
          OpTy = CI0->getOperand(0)->getType();

  if (OpTy != nullptr) {
    if (VF == 1) {
      assert(!OpTy->isVectorTy() && "Expected scalar type");
      return OpTy;
    }
    // Return the potentially vectorized type based on 'I' and 'VF'. 'I' may
    // be either scalar or already vectorized with the same or a lesser VF.
    Type *ElTy = OpTy->getScalarType();
    return FixedVectorType::get(ElTy, VF);
  }

  return nullptr;
}

// Get the cost of converting a boolean vector to a vector with same width
// and element size as Dst, plus the cost of zero extending if needed.
unsigned SystemZTTIImpl::
getBoolVecToIntConversionCost(unsigned Opcode, Type *Dst,
                              const Instruction *I) {
  auto *DstVTy = cast<FixedVectorType>(Dst);
  unsigned VF = DstVTy->getNumElements();
  unsigned Cost = 0;
  // If we know the width of the compared operands, add any cost of
  // converting it to match Dst. Otherwise assume the same widths.
  Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
  if (CmpOpTy != nullptr)
    Cost = getVectorBitmaskConversionCost(CmpOpTy, Dst);
  if (Opcode == Instruction::ZExt || Opcode == Instruction::UIToFP)
    // One 'vn' per dst vector with an immediate mask.
    Cost += getNumVectorRegs(Dst);
  return Cost;
}

InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
                                                 Type *Src,
                                                 TTI::CastContextHint CCH,
                                                 TTI::TargetCostKind CostKind,
                                                 const Instruction *I) {
  // FIXME: Can the logic below also be used for these cost kinds?
  if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency) {
    auto BaseCost = BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
    return BaseCost == 0 ? BaseCost : 1;
  }

  unsigned DstScalarBits = Dst->getScalarSizeInBits();
  unsigned SrcScalarBits = Src->getScalarSizeInBits();

  if (!Src->isVectorTy()) {
    assert(!Dst->isVectorTy());

    if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP) {
      if (Src->isIntegerTy(128))
        return LIBCALL_COST;
      if (SrcScalarBits >= 32 ||
          (I != nullptr && isa<LoadInst>(I->getOperand(0))))
        return 1;
      return SrcScalarBits > 1 ? 2 /*i8/i16 extend*/ : 5 /*branch seq.*/;
    }

    if ((Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI) &&
        Dst->isIntegerTy(128))
      return LIBCALL_COST;

    if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt)) {
      if (Src->isIntegerTy(1)) {
        if (DstScalarBits == 128)
          return 5 /*branch seq.*/;

        if (ST->hasLoadStoreOnCond2())
          return 2; // li 0; loc 1

        // This should be extension of a compare i1 result, which is done with
        // ipm and a varying sequence of instructions.
        unsigned Cost = 0;
        if (Opcode == Instruction::SExt)
          Cost = (DstScalarBits < 64 ? 3 : 4);
        if (Opcode == Instruction::ZExt)
          Cost = 3;
        Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I) : nullptr);
        if (CmpOpTy != nullptr && CmpOpTy->isFloatingPointTy())
          // If operands of an fp-type were compared, this costs +1.
          Cost++;
        return Cost;
      }
      else if (isInt128InVR(Dst)) {
        // Extensions from GPR to i128 (in VR) typically cost two instructions,
        // but a zero-extending load would be just one extra instruction.
        if (Opcode == Instruction::ZExt && I != nullptr)
          if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
            if (Ld->hasOneUse())
              return 1;
        return 2;
      }
    }

    if (Opcode == Instruction::Trunc && isInt128InVR(Src) && I != nullptr) {
      if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
        if (Ld->hasOneUse())
          return 0; // Will be converted to GPR load.
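      // If the truncated value is only stored, the truncation is assumed to
      // be done for free by the (truncating) stores.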
      bool OnlyTruncatingStores = true;
      for (const User *U : I->users())
        if (!isa<StoreInst>(U)) {
          OnlyTruncatingStores = false;
          break;
        }
      if (OnlyTruncatingStores)
        return 0;
      return 2; // Vector element extraction.
    }
  }
  else if (ST->hasVector()) {
    // Vector to scalar cast.
    auto *SrcVecTy = cast<FixedVectorType>(Src);
    auto *DstVecTy = dyn_cast<FixedVectorType>(Dst);
    if (!DstVecTy) {
      // TODO: tune vector-to-scalar cast.
      return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
    }
    unsigned VF = SrcVecTy->getNumElements();
    unsigned NumDstVectors = getNumVectorRegs(Dst);
    unsigned NumSrcVectors = getNumVectorRegs(Src);

    if (Opcode == Instruction::Trunc) {
      if (Src->getScalarSizeInBits() == Dst->getScalarSizeInBits())
        return 0; // Check for NOOP conversions.
      return getVectorTruncCost(Src, Dst);
    }

    if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
      if (SrcScalarBits >= 8) {
        // ZExt will use either a single unpack or a vector permute.
        if (Opcode == Instruction::ZExt)
          return NumDstVectors;

        // SExt will be handled with one unpack per doubling of width.
        unsigned NumUnpacks = getElSizeLog2Diff(Src, Dst);

        // For types that span multiple vector registers, some additional
        // instructions are used to set up the unpacking.
        unsigned NumSrcVectorOps =
            (NumUnpacks > 1 ? (NumDstVectors - NumSrcVectors)
                            : (NumDstVectors / 2));

        return (NumUnpacks * NumDstVectors) + NumSrcVectorOps;
      }
      else if (SrcScalarBits == 1)
        return getBoolVecToIntConversionCost(Opcode, Dst, I);
    }

    if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP ||
        Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI) {
      // TODO: Fix base implementation which could simplify things a bit here
      // (seems to miss differentiating between scalar and vector types).

      // Only 64 bit vector conversions are natively supported before z15.
      if (DstScalarBits == 64 || ST->hasVectorEnhancements2()) {
        if (SrcScalarBits == DstScalarBits)
          return NumDstVectors;

        if (SrcScalarBits == 1)
          return getBoolVecToIntConversionCost(Opcode, Dst, I) + NumDstVectors;
      }

      // Return the cost of multiple scalar invocations plus the cost of
      // inserting and extracting the values. Base implementation does not
      // realize float->int gets scalarized.
      InstructionCost ScalarCost = getCastInstrCost(
          Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind);
      InstructionCost TotCost = VF * ScalarCost;
      bool NeedsInserts = true, NeedsExtracts = true;
      // FP128 registers do not get inserted or extracted.
      if (DstScalarBits == 128 &&
          (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP))
        NeedsInserts = false;
      if (SrcScalarBits == 128 &&
          (Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI))
        NeedsExtracts = false;

      TotCost += getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
                                          NeedsExtracts, CostKind);
      TotCost += getScalarizationOverhead(DstVecTy, NeedsInserts,
                                          /*Extract*/ false, CostKind);

      // FIXME: VF 2 for float<->i32 is currently just as expensive as for VF 4.
      if (VF == 2 && SrcScalarBits == 32 && DstScalarBits == 32)
        TotCost *= 2;

      return TotCost;
    }

    if (Opcode == Instruction::FPTrunc) {
      if (SrcScalarBits == 128)  // fp128 -> double/float + inserts of elements.
        return VF /*ldxbr/lexbr*/ +
               getScalarizationOverhead(DstVecTy, /*Insert*/ true,
                                        /*Extract*/ false, CostKind);
      else // double -> float
        return VF / 2 /*vledb*/ + std::max(1U, VF / 4 /*vperm*/);
    }

    if (Opcode == Instruction::FPExt) {
      if (SrcScalarBits == 32 && DstScalarBits == 64) {
        // float -> double is very rare and currently unoptimized. Instead of
        // using vldeb, which can do two at a time, all conversions are
        // scalarized.
        return VF * 2;
      }
      // -> fp128. VF * lxdb/lxeb + extraction of elements.
      return VF + getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
                                           /*Extract*/ true, CostKind);
    }
  }

  return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
}

// Scalar i8 / i16 operations will typically be performed after first extending
// the operands to i32.
static unsigned getOperandsExtensionCost(const Instruction *I) {
  unsigned ExtCost = 0;
  for (Value *Op : I->operands())
    // A load of i8 or i16 sign/zero extends to i32.
    if (!isa<LoadInst>(Op) && !isa<ConstantInt>(Op))
      ExtCost++;

  return ExtCost;
}

InstructionCost SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                                   Type *CondTy,
                                                   CmpInst::Predicate VecPred,
                                                   TTI::TargetCostKind CostKind,
                                                   const Instruction *I) {
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind);

  if (!ValTy->isVectorTy()) {
    switch (Opcode) {
    case Instruction::ICmp: {
      // A loaded value compared with 0 with multiple users becomes Load and
      // Test. The load is then not foldable, so return 0 cost for the ICmp.
      unsigned ScalarBits = ValTy->getScalarSizeInBits();
      if (I != nullptr && (ScalarBits == 32 || ScalarBits == 64))
        if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
          if (const ConstantInt *C = dyn_cast<ConstantInt>(I->getOperand(1)))
            if (!Ld->hasOneUse() && Ld->getParent() == I->getParent() &&
                C->isZero())
              return 0;

      unsigned Cost = 1;
      if (ValTy->isIntegerTy() && ValTy->getScalarSizeInBits() <= 16)
        Cost += (I != nullptr ? getOperandsExtensionCost(I) : 2);
      return Cost;
    }
    case Instruction::Select:
      if (ValTy->isFloatingPointTy() || isInt128InVR(ValTy))
        return 4; // No LOC for FP / i128 - costs a conditional jump.
      return 1; // Load On Condition / Select Register.
    }
  }
  else if (ST->hasVector()) {
    unsigned VF = cast<FixedVectorType>(ValTy)->getNumElements();

    // Called with a compare instruction.
    if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
      unsigned PredicateExtraCost = 0;
      if (I != nullptr) {
        // Some predicates cost one or two extra instructions.
        switch (cast<CmpInst>(I)->getPredicate()) {
        case CmpInst::Predicate::ICMP_NE:
        case CmpInst::Predicate::ICMP_UGE:
        case CmpInst::Predicate::ICMP_ULE:
        case CmpInst::Predicate::ICMP_SGE:
        case CmpInst::Predicate::ICMP_SLE:
          PredicateExtraCost = 1;
          break;
        case CmpInst::Predicate::FCMP_ONE:
        case CmpInst::Predicate::FCMP_ORD:
        case CmpInst::Predicate::FCMP_UEQ:
        case CmpInst::Predicate::FCMP_UNO:
          PredicateExtraCost = 2;
          break;
        default:
          break;
        }
      }

      // Float is handled with 2*vmr[lh]f + 2*vldeb + vfchdb for each pair of
      // floats. FIXME: <2 x float> generates the same code as <4 x float>.
      unsigned CmpCostPerVector = (ValTy->getScalarType()->isFloatTy() ?
                                       10 : 1);
      unsigned NumVecs_cmp = getNumVectorRegs(ValTy);

      unsigned Cost = (NumVecs_cmp * (CmpCostPerVector + PredicateExtraCost));
      return Cost;
    }
    else { // Called with a select instruction.
      assert(Opcode == Instruction::Select);

      // We can figure out the extra cost of packing / unpacking if the
      // instruction was passed and the compare instruction is found.
      unsigned PackCost = 0;
      Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
      if (CmpOpTy != nullptr)
        PackCost =
            getVectorBitmaskConversionCost(CmpOpTy, ValTy);

      return getNumVectorRegs(ValTy) /*vsel*/ + PackCost;
    }
  }

  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind);
}

InstructionCost SystemZTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
                                                   TTI::TargetCostKind CostKind,
                                                   unsigned Index, Value *Op0,
                                                   Value *Op1) {
  // vlvgp will insert two GRs into a vector register, so only count half the
  // number of instructions.
  if (Opcode == Instruction::InsertElement && Val->isIntOrIntVectorTy(64))
    return ((Index % 2 == 0) ? 1 : 0);

  if (Opcode == Instruction::ExtractElement) {
    int Cost = ((getScalarSizeInBits(Val) == 1) ? 2 /*+test-under-mask*/ : 1);

    // Give a slight penalty for moving out of vector pipeline to FXU unit.
    if (Index == 0 && Val->isIntOrIntVectorTy())
      Cost += 1;

    return Cost;
  }

  return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
}

// Check if a load may be folded as a memory operand in its user.
bool SystemZTTIImpl::
isFoldableLoad(const LoadInst *Ld, const Instruction *&FoldedValue) {
  if (!Ld->hasOneUse())
    return false;
  FoldedValue = Ld;
  const Instruction *UserI = cast<Instruction>(*Ld->user_begin());
  unsigned LoadedBits = getScalarSizeInBits(Ld->getType());
  unsigned TruncBits = 0;
  unsigned SExtBits = 0;
  unsigned ZExtBits = 0;
  if (UserI->hasOneUse()) {
    unsigned UserBits = UserI->getType()->getScalarSizeInBits();
    if (isa<TruncInst>(UserI))
      TruncBits = UserBits;
    else if (isa<SExtInst>(UserI))
      SExtBits = UserBits;
    else if (isa<ZExtInst>(UserI))
      ZExtBits = UserBits;
  }
  if (TruncBits || SExtBits || ZExtBits) {
    FoldedValue = UserI;
    UserI = cast<Instruction>(*UserI->user_begin());
    // Load (single use) -> trunc/extend (single use) -> UserI
  }
  if ((UserI->getOpcode() == Instruction::Sub ||
       UserI->getOpcode() == Instruction::SDiv ||
       UserI->getOpcode() == Instruction::UDiv) &&
      UserI->getOperand(1) != FoldedValue)
    return false; // Not commutative, only RHS foldable.
  // LoadOrTruncBits holds the number of effectively loaded bits, but 0 if
  // the load was extended.
  unsigned LoadOrTruncBits =
      ((SExtBits || ZExtBits) ? 0 : (TruncBits ? TruncBits : LoadedBits));
  switch (UserI->getOpcode()) {
  case Instruction::Add: // SE: 16->32, 16/32->64, z14:16->64.
                         // ZE: 32->64
  case Instruction::Sub:
  case Instruction::ICmp:
    if (LoadedBits == 32 && ZExtBits == 64)
      return true;
    [[fallthrough]];
  case Instruction::Mul: // SE: 16->32, 32->64, z14:16->64
    if (UserI->getOpcode() != Instruction::ICmp) {
      if (LoadedBits == 16 &&
          (SExtBits == 32 ||
           (SExtBits == 64 && ST->hasMiscellaneousExtensions2())))
        return true;
      if (LoadOrTruncBits == 16)
        return true;
    }
    [[fallthrough]];
  case Instruction::SDiv: // SE: 32->64
    if (LoadedBits == 32 && SExtBits == 64)
      return true;
    [[fallthrough]];
  case Instruction::UDiv:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
    // This also makes sense for float operations, but disabled for now due
    // to regressions.
    // case Instruction::FCmp:
    // case Instruction::FAdd:
    // case Instruction::FSub:
    // case Instruction::FMul:
    // case Instruction::FDiv:

    // All possible extensions of memory checked above.

    // Comparison between memory and immediate.
    if (UserI->getOpcode() == Instruction::ICmp)
      if (ConstantInt *CI = dyn_cast<ConstantInt>(UserI->getOperand(1)))
        if (CI->getValue().isIntN(16))
          return true;
    return (LoadOrTruncBits == 32 || LoadOrTruncBits == 64);
    break;
  }
  return false;
}

static bool isBswapIntrinsicCall(const Value *V) {
  if (const Instruction *I = dyn_cast<Instruction>(V))
    if (auto *CI = dyn_cast<CallInst>(I))
      if (auto *F = CI->getCalledFunction())
        if (F->getIntrinsicID() == Intrinsic::bswap)
          return true;
  return false;
}

InstructionCost SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                                MaybeAlign Alignment,
                                                unsigned AddressSpace,
                                                TTI::TargetCostKind CostKind,
                                                TTI::OperandValueInfo OpInfo,
                                                const Instruction *I) {
  assert(!Src->isVoidTy() && "Invalid type");

  // TODO: Handle other cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return 1;

  if (!Src->isVectorTy() && Opcode == Instruction::Load && I != nullptr) {
    // Store the load or its truncated or extended value in FoldedValue.
    const Instruction *FoldedValue = nullptr;
    if (isFoldableLoad(cast<LoadInst>(I), FoldedValue)) {
      const Instruction *UserI = cast<Instruction>(*FoldedValue->user_begin());
      assert(UserI->getNumOperands() == 2 && "Expected a binop.");

      // UserI can't fold two loads, so in that case return 0 cost only
      // half of the time.
      for (unsigned i = 0; i < 2; ++i) {
        if (UserI->getOperand(i) == FoldedValue)
          continue;

        if (Instruction *OtherOp = dyn_cast<Instruction>(UserI->getOperand(i))) {
          LoadInst *OtherLoad = dyn_cast<LoadInst>(OtherOp);
          if (!OtherLoad &&
              (isa<TruncInst>(OtherOp) || isa<SExtInst>(OtherOp) ||
               isa<ZExtInst>(OtherOp)))
            OtherLoad = dyn_cast<LoadInst>(OtherOp->getOperand(0));
          if (OtherLoad && isFoldableLoad(OtherLoad, FoldedValue /*dummy*/))
            return i == 0; // Both operands foldable.
        }
      }

      return 0; // Only I is foldable in user.
    }
  }

  // Type legalization (via getNumberOfParts) can't handle structs.
  if (TLI->getValueType(DL, Src, true) == MVT::Other)
    return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                  CostKind);

  // FP128 is a legal type but kept in a register pair on older CPUs.
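  // A load or store of it is therefore assumed to cost two instructions
  // (one per 64-bit half).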
  if (Src->isFP128Ty() && !ST->hasVectorEnhancements1())
    return 2;

  unsigned NumOps =
      (Src->isVectorTy() ? getNumVectorRegs(Src) : getNumberOfParts(Src));

  // Store/Load reversed saves one instruction.
  if (((!Src->isVectorTy() && NumOps == 1) || ST->hasVectorEnhancements2()) &&
      I != nullptr) {
    if (Opcode == Instruction::Load && I->hasOneUse()) {
      const Instruction *LdUser = cast<Instruction>(*I->user_begin());
      // In case of load -> bswap -> store, return normal cost for the load.
      if (isBswapIntrinsicCall(LdUser) &&
          (!LdUser->hasOneUse() || !isa<StoreInst>(*LdUser->user_begin())))
        return 0;
    }
    else if (const StoreInst *SI = dyn_cast<StoreInst>(I)) {
      const Value *StoredVal = SI->getValueOperand();
      if (StoredVal->hasOneUse() && isBswapIntrinsicCall(StoredVal))
        return 0;
    }
  }

  return NumOps;
}

// The generic implementation of getInterleavedMemoryOpCost() is based on
// adding costs of the memory operations plus all the extracts and inserts
// needed for using / defining the vector operands. The SystemZ version does
// roughly the same but bases the computations on vector permutations
// instead.
InstructionCost SystemZTTIImpl::getInterleavedMemoryOpCost(
    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
    Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
    bool UseMaskForCond, bool UseMaskForGaps) {
  if (UseMaskForCond || UseMaskForGaps)
    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                             Alignment, AddressSpace, CostKind,
                                             UseMaskForCond, UseMaskForGaps);
  assert(isa<VectorType>(VecTy) &&
         "Expect a vector type for interleaved memory op");

  unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
  assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
  unsigned VF = NumElts / Factor;
  unsigned NumEltsPerVecReg = (128U / getScalarSizeInBits(VecTy));
  unsigned NumVectorMemOps = getNumVectorRegs(VecTy);
  unsigned NumPermutes = 0;

  if (Opcode == Instruction::Load) {
    // Loading interleave groups may have gaps, which may mean fewer
    // loads. Find out how many vectors will be loaded in total, and in how
    // many of them each value will be in.
    BitVector UsedInsts(NumVectorMemOps, false);
    std::vector<BitVector> ValueVecs(Factor, BitVector(NumVectorMemOps, false));
    for (unsigned Index : Indices)
      for (unsigned Elt = 0; Elt < VF; ++Elt) {
        unsigned Vec = (Index + Elt * Factor) / NumEltsPerVecReg;
        UsedInsts.set(Vec);
        ValueVecs[Index].set(Vec);
      }
    NumVectorMemOps = UsedInsts.count();

    for (unsigned Index : Indices) {
      // Estimate that each loaded source vector containing this Index
      // requires one operation, except that vperm can handle two input
      // registers the first time for each dst vector.
      unsigned NumSrcVecs = ValueVecs[Index].count();
      unsigned NumDstVecs = divideCeil(VF * getScalarSizeInBits(VecTy), 128U);
      assert(NumSrcVecs >= NumDstVecs && "Expected at least as many sources");
      NumPermutes += std::max(1U, NumSrcVecs - NumDstVecs);
    }
  } else {
    // Estimate the permutes for each stored vector as the smaller of the
    // number of elements and the number of source vectors. Subtract one per
    // dst vector for vperm (see above).
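    // For example (a sketch of the formula below, not a measured cost):
    // storing an interleave group of type <16 x i32> with Factor == 4 uses
    // NumVectorMemOps == 4 and min(4, 4) == 4 source vectors per dst vector,
    // i.e. 4 * 4 - 4 == 12 permutes, for a total cost of 16.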
    unsigned NumSrcVecs = std::min(NumEltsPerVecReg, Factor);
    unsigned NumDstVecs = NumVectorMemOps;
    NumPermutes += (NumDstVecs * NumSrcVecs) - NumDstVecs;
  }

  // Cost of load/store operations and the permutations needed.
  return NumVectorMemOps + NumPermutes;
}

static int getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy) {
  if (RetTy->isVectorTy() && ID == Intrinsic::bswap)
    return getNumVectorRegs(RetTy); // VPERM
  return -1;
}

InstructionCost
SystemZTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                      TTI::TargetCostKind CostKind) {
  InstructionCost Cost =
      getVectorIntrinsicInstrCost(ICA.getID(), ICA.getReturnType());
  if (Cost != -1)
    return Cost;
  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}