1 //===-- PPCTargetTransformInfo.cpp - PPC specific TTI ---------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "PPCTargetTransformInfo.h"
10 #include "llvm/Analysis/CodeMetrics.h"
11 #include "llvm/Analysis/TargetLibraryInfo.h"
12 #include "llvm/Analysis/TargetTransformInfo.h"
13 #include "llvm/CodeGen/BasicTTIImpl.h"
14 #include "llvm/CodeGen/CostTable.h"
15 #include "llvm/CodeGen/TargetLowering.h"
16 #include "llvm/CodeGen/TargetSchedule.h"
17 #include "llvm/IR/IntrinsicsPowerPC.h"
18 #include "llvm/Support/CommandLine.h"
19 #include "llvm/Support/Debug.h"
20 #include "llvm/Support/KnownBits.h"
21 #include "llvm/Transforms/InstCombine/InstCombiner.h"
22 #include "llvm/Transforms/Utils/Local.h"
23 
24 using namespace llvm;
25 
26 #define DEBUG_TYPE "ppctti"
27 
28 static cl::opt<bool> DisablePPCConstHoist("disable-ppc-constant-hoisting",
29 cl::desc("disable constant hoisting on PPC"), cl::init(false), cl::Hidden);
30 
31 static cl::opt<bool>
32 EnablePPCColdCC("ppc-enable-coldcc", cl::Hidden, cl::init(false),
33                 cl::desc("Enable using coldcc calling conv for cold "
34                          "internal functions"));
35 
36 static cl::opt<bool>
37 LsrNoInsnsCost("ppc-lsr-no-insns-cost", cl::Hidden, cl::init(false),
38                cl::desc("Do not add instruction count to lsr cost model"));
39 
40 // The latency of mtctr is only justified if there are more than 4
41 // comparisons that will be removed as a result.
42 static cl::opt<unsigned>
43 SmallCTRLoopThreshold("min-ctr-loop-threshold", cl::init(4), cl::Hidden,
44                       cl::desc("Loops with a constant trip count smaller than "
45                                "this value will not use the count register."));
46 
47 //===----------------------------------------------------------------------===//
48 //
49 // PPC cost model.
50 //
51 //===----------------------------------------------------------------------===//
52 
53 TargetTransformInfo::PopcntSupportKind
54 PPCTTIImpl::getPopcntSupport(unsigned TyWidth) {
55   assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
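  // popcntd is available starting with POWER7 (ISA 2.06); report whether the
  // subtarget implements it fast or slow.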
56   if (ST->hasPOPCNTD() != PPCSubtarget::POPCNTD_Unavailable && TyWidth <= 64)
57     return ST->hasPOPCNTD() == PPCSubtarget::POPCNTD_Slow ?
58              TTI::PSK_SlowHardware : TTI::PSK_FastHardware;
59   return TTI::PSK_Software;
60 }
61 
62 Optional<Instruction *>
63 PPCTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
64   Intrinsic::ID IID = II.getIntrinsicID();
65   switch (IID) {
66   default:
67     break;
68   case Intrinsic::ppc_altivec_lvx:
69   case Intrinsic::ppc_altivec_lvxl:
70     // Turn PPC lvx -> load if the pointer is known aligned.
71     if (getOrEnforceKnownAlignment(
72             II.getArgOperand(0), Align(16), IC.getDataLayout(), &II,
73             &IC.getAssumptionCache(), &IC.getDominatorTree()) >= 16) {
74       Value *Ptr = IC.Builder.CreateBitCast(
75           II.getArgOperand(0), PointerType::getUnqual(II.getType()));
76       return new LoadInst(II.getType(), Ptr, "", false, Align(16));
77     }
78     break;
79   case Intrinsic::ppc_vsx_lxvw4x:
80   case Intrinsic::ppc_vsx_lxvd2x: {
81     // Turn PPC VSX loads into normal loads.
82     Value *Ptr = IC.Builder.CreateBitCast(II.getArgOperand(0),
83                                           PointerType::getUnqual(II.getType()));
84     return new LoadInst(II.getType(), Ptr, Twine(""), false, Align(1));
85   }
86   case Intrinsic::ppc_altivec_stvx:
87   case Intrinsic::ppc_altivec_stvxl:
88     // Turn stvx -> store if the pointer is known aligned.
89     if (getOrEnforceKnownAlignment(
90             II.getArgOperand(1), Align(16), IC.getDataLayout(), &II,
91             &IC.getAssumptionCache(), &IC.getDominatorTree()) >= 16) {
92       Type *OpPtrTy = PointerType::getUnqual(II.getArgOperand(0)->getType());
93       Value *Ptr = IC.Builder.CreateBitCast(II.getArgOperand(1), OpPtrTy);
94       return new StoreInst(II.getArgOperand(0), Ptr, false, Align(16));
95     }
96     break;
97   case Intrinsic::ppc_vsx_stxvw4x:
98   case Intrinsic::ppc_vsx_stxvd2x: {
99     // Turn PPC VSX stores into normal stores.
100     Type *OpPtrTy = PointerType::getUnqual(II.getArgOperand(0)->getType());
101     Value *Ptr = IC.Builder.CreateBitCast(II.getArgOperand(1), OpPtrTy);
102     return new StoreInst(II.getArgOperand(0), Ptr, false, Align(1));
103   }
104   case Intrinsic::ppc_altivec_vperm:
105     // Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant.
106     // Note that ppc_altivec_vperm has a big-endian bias, so when creating
107     // a vector shuffle for little endian, we must undo the transformation
108     // performed on vec_perm in altivec.h.  That is, we must complement
109     // the permutation mask with respect to 31 and reverse the order of
110     // V1 and V2.
111     if (Constant *Mask = dyn_cast<Constant>(II.getArgOperand(2))) {
112       assert(cast<FixedVectorType>(Mask->getType())->getNumElements() == 16 &&
113              "Bad type for intrinsic!");
114 
115       // Check that all of the elements are integer constants or undefs.
116       bool AllEltsOk = true;
117       for (unsigned i = 0; i != 16; ++i) {
118         Constant *Elt = Mask->getAggregateElement(i);
119         if (!Elt || !(isa<ConstantInt>(Elt) || isa<UndefValue>(Elt))) {
120           AllEltsOk = false;
121           break;
122         }
123       }
124 
125       if (AllEltsOk) {
126         // Cast the input vectors to byte vectors.
127         Value *Op0 =
128             IC.Builder.CreateBitCast(II.getArgOperand(0), Mask->getType());
129         Value *Op1 =
130             IC.Builder.CreateBitCast(II.getArgOperand(1), Mask->getType());
131         Value *Result = UndefValue::get(Op0->getType());
132 
133         // Only extract each element once.
134         Value *ExtractedElts[32];
135         memset(ExtractedElts, 0, sizeof(ExtractedElts));
136 
137         for (unsigned i = 0; i != 16; ++i) {
138           if (isa<UndefValue>(Mask->getAggregateElement(i)))
139             continue;
140           unsigned Idx =
141               cast<ConstantInt>(Mask->getAggregateElement(i))->getZExtValue();
142           Idx &= 31; // Match the hardware behavior.
143           if (DL.isLittleEndian())
144             Idx = 31 - Idx;
145 
146           if (!ExtractedElts[Idx]) {
147             Value *Op0ToUse = (DL.isLittleEndian()) ? Op1 : Op0;
148             Value *Op1ToUse = (DL.isLittleEndian()) ? Op0 : Op1;
149             ExtractedElts[Idx] = IC.Builder.CreateExtractElement(
150                 Idx < 16 ? Op0ToUse : Op1ToUse, IC.Builder.getInt32(Idx & 15));
151           }
152 
153           // Insert this value into the result vector.
154           Result = IC.Builder.CreateInsertElement(Result, ExtractedElts[Idx],
155                                                   IC.Builder.getInt32(i));
156         }
157         return CastInst::Create(Instruction::BitCast, Result, II.getType());
158       }
159     }
160     break;
161   }
162   return None;
163 }
164 
165 InstructionCost PPCTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
166                                           TTI::TargetCostKind CostKind) {
167   if (DisablePPCConstHoist)
168     return BaseT::getIntImmCost(Imm, Ty, CostKind);
169 
170   assert(Ty->isIntegerTy());
171 
172   unsigned BitSize = Ty->getPrimitiveSizeInBits();
173   if (BitSize == 0)
174     return ~0U;
175 
176   if (Imm == 0)
177     return TTI::TCC_Free;
178 
179   if (Imm.getBitWidth() <= 64) {
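    // Materializable with a single li (16-bit signed immediate).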
180     if (isInt<16>(Imm.getSExtValue()))
181       return TTI::TCC_Basic;
182 
183     if (isInt<32>(Imm.getSExtValue())) {
184       // A constant that can be materialized using lis.
185       if ((Imm.getZExtValue() & 0xFFFF) == 0)
186         return TTI::TCC_Basic;
187 
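      // Otherwise lis followed by ori: two instructions.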
188       return 2 * TTI::TCC_Basic;
189     }
190   }
191 
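  // Anything wider needs a multi-instruction materialization sequence;
  // approximate it as four instructions.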
192   return 4 * TTI::TCC_Basic;
193 }
194 
195 InstructionCost PPCTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
196                                                 const APInt &Imm, Type *Ty,
197                                                 TTI::TargetCostKind CostKind) {
198   if (DisablePPCConstHoist)
199     return BaseT::getIntImmCostIntrin(IID, Idx, Imm, Ty, CostKind);
200 
201   assert(Ty->isIntegerTy());
202 
203   unsigned BitSize = Ty->getPrimitiveSizeInBits();
204   if (BitSize == 0)
205     return ~0U;
206 
207   switch (IID) {
208   default:
209     return TTI::TCC_Free;
210   case Intrinsic::sadd_with_overflow:
211   case Intrinsic::uadd_with_overflow:
212   case Intrinsic::ssub_with_overflow:
213   case Intrinsic::usub_with_overflow:
214     if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<16>(Imm.getSExtValue()))
215       return TTI::TCC_Free;
216     break;
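  // For stackmaps and patchpoints, the leading meta operands (ID, byte
  // counts, call target, argument count) are free, as is any immediate that
  // fits in 64 bits.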
217   case Intrinsic::experimental_stackmap:
218     if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
219       return TTI::TCC_Free;
220     break;
221   case Intrinsic::experimental_patchpoint_void:
222   case Intrinsic::experimental_patchpoint_i64:
223     if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
224       return TTI::TCC_Free;
225     break;
226   }
227   return PPCTTIImpl::getIntImmCost(Imm, Ty, CostKind);
228 }
229 
230 InstructionCost PPCTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
231                                               const APInt &Imm, Type *Ty,
232                                               TTI::TargetCostKind CostKind,
233                                               Instruction *Inst) {
234   if (DisablePPCConstHoist)
235     return BaseT::getIntImmCostInst(Opcode, Idx, Imm, Ty, CostKind, Inst);
236 
237   assert(Ty->isIntegerTy());
238 
239   unsigned BitSize = Ty->getPrimitiveSizeInBits();
240   if (BitSize == 0)
241     return ~0U;
242 
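  // ShiftedFree: an immediate whose low 16 bits are clear fits the shifted
  // forms (addis/oris/xoris).  RunFree: a contiguous run of ones (or its
  // complement) can be encoded by the rotate-and-mask instructions.
  // UnsignedFree: an unsigned 16-bit immediate fits the unsigned compare
  // forms.  ZeroFree: comparing or selecting against zero costs nothing.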
243   unsigned ImmIdx = ~0U;
244   bool ShiftedFree = false, RunFree = false, UnsignedFree = false,
245        ZeroFree = false;
246   switch (Opcode) {
247   default:
248     return TTI::TCC_Free;
249   case Instruction::GetElementPtr:
250     // Always hoist the base address of a GetElementPtr. This prevents the
251     // creation of new constants for every base constant that gets constant
252     // folded with the offset.
253     if (Idx == 0)
254       return 2 * TTI::TCC_Basic;
255     return TTI::TCC_Free;
256   case Instruction::And:
257     RunFree = true; // (for the rotate-and-mask instructions)
258     LLVM_FALLTHROUGH;
259   case Instruction::Add:
260   case Instruction::Or:
261   case Instruction::Xor:
262     ShiftedFree = true;
263     LLVM_FALLTHROUGH;
264   case Instruction::Sub:
265   case Instruction::Mul:
266   case Instruction::Shl:
267   case Instruction::LShr:
268   case Instruction::AShr:
269     ImmIdx = 1;
270     break;
271   case Instruction::ICmp:
272     UnsignedFree = true;
273     ImmIdx = 1;
274     // Zero comparisons can use record-form instructions.
275     LLVM_FALLTHROUGH;
276   case Instruction::Select:
277     ZeroFree = true;
278     break;
279   case Instruction::PHI:
280   case Instruction::Call:
281   case Instruction::Ret:
282   case Instruction::Load:
283   case Instruction::Store:
284     break;
285   }
286 
287   if (ZeroFree && Imm == 0)
288     return TTI::TCC_Free;
289 
290   if (Idx == ImmIdx && Imm.getBitWidth() <= 64) {
291     if (isInt<16>(Imm.getSExtValue()))
292       return TTI::TCC_Free;
293 
294     if (RunFree) {
295       if (Imm.getBitWidth() <= 32 &&
296           (isShiftedMask_32(Imm.getZExtValue()) ||
297            isShiftedMask_32(~Imm.getZExtValue())))
298         return TTI::TCC_Free;
299 
300       if (ST->isPPC64() &&
301           (isShiftedMask_64(Imm.getZExtValue()) ||
302            isShiftedMask_64(~Imm.getZExtValue())))
303         return TTI::TCC_Free;
304     }
305 
306     if (UnsignedFree && isUInt<16>(Imm.getZExtValue()))
307       return TTI::TCC_Free;
308 
309     if (ShiftedFree && (Imm.getZExtValue() & 0xFFFF) == 0)
310       return TTI::TCC_Free;
311   }
312 
313   return PPCTTIImpl::getIntImmCost(Imm, Ty, CostKind);
314 }
315 
316 // Check if the current Type is an MMA vector type. Valid MMA types are
317 // v256i1 (__vector_pair) and v512i1 (__vector_quad).
318 static bool isMMAType(Type *Ty) {
319   return Ty->isVectorTy() && (Ty->getScalarSizeInBits() == 1) &&
320          (Ty->getPrimitiveSizeInBits() > 128);
321 }
322 
323 InstructionCost PPCTTIImpl::getUserCost(const User *U,
324                                         ArrayRef<const Value *> Operands,
325                                         TTI::TargetCostKind CostKind) {
326   // We already implement getCastInstrCost and getMemoryOpCost where we perform
327   // the vector adjustment there.
328   if (isa<CastInst>(U) || isa<LoadInst>(U) || isa<StoreInst>(U))
329     return BaseT::getUserCost(U, Operands, CostKind);
330 
331   if (U->getType()->isVectorTy()) {
332     // Instructions that need to be split should cost more.
333     std::pair<InstructionCost, MVT> LT =
334         TLI->getTypeLegalizationCost(DL, U->getType());
335     return LT.first * BaseT::getUserCost(U, Operands, CostKind);
336   }
337 
338   return BaseT::getUserCost(U, Operands, CostKind);
339 }
340 
341 // Determining the address of a TLS variable results in a function call in
342 // certain TLS models.
343 static bool memAddrUsesCTR(const Value *MemAddr, const PPCTargetMachine &TM,
344                            SmallPtrSetImpl<const Value *> &Visited) {
345   // No need to traverse again if we already checked this operand.
346   if (!Visited.insert(MemAddr).second)
347     return false;
348   const auto *GV = dyn_cast<GlobalValue>(MemAddr);
349   if (!GV) {
350     // Recurse to check for constants that refer to TLS global variables.
351     if (const auto *CV = dyn_cast<Constant>(MemAddr))
352       for (const auto &CO : CV->operands())
353         if (memAddrUsesCTR(CO, TM, Visited))
354           return true;
355     return false;
356   }
357 
358   if (!GV->isThreadLocal())
359     return false;
360   TLSModel::Model Model = TM.getTLSModel(GV);
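  // The general- and local-dynamic models resolve the address through a call
  // to __tls_get_addr.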
361   return Model == TLSModel::GeneralDynamic || Model == TLSModel::LocalDynamic;
362 }
363 
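// Returns true if the block contains anything that might be lowered to a call
// or that might otherwise clobber the count register, making a CTR-based
// hardware loop unsafe or unprofitable.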
364 bool PPCTTIImpl::mightUseCTR(BasicBlock *BB, TargetLibraryInfo *LibInfo,
365                              SmallPtrSetImpl<const Value *> &Visited) {
366   const PPCTargetMachine &TM = ST->getTargetMachine();
367 
368   // Loop through the inline asm constraints and look for something that
369   // clobbers ctr.
370   auto asmClobbersCTR = [](InlineAsm *IA) {
371     InlineAsm::ConstraintInfoVector CIV = IA->ParseConstraints();
372     for (const InlineAsm::ConstraintInfo &C : CIV) {
373       if (C.Type != InlineAsm::isInput)
374         for (const auto &Code : C.Codes)
375           if (StringRef(Code).equals_insensitive("{ctr}"))
376             return true;
377     }
378     return false;
379   };
380 
381   auto isLargeIntegerTy = [](bool Is32Bit, Type *Ty) {
382     if (IntegerType *ITy = dyn_cast<IntegerType>(Ty))
383       return ITy->getBitWidth() > (Is32Bit ? 32U : 64U);
384 
385     return false;
386   };
387 
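  // Operations that ISA 3.0 (Power9) can perform directly on half precision;
  // anything else involving f16 is assumed to become a library call.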
388   auto supportedHalfPrecisionOp = [](Instruction *Inst) {
389     switch (Inst->getOpcode()) {
390     default:
391       return false;
392     case Instruction::FPTrunc:
393     case Instruction::FPExt:
394     case Instruction::Load:
395     case Instruction::Store:
396     case Instruction::FPToUI:
397     case Instruction::UIToFP:
398     case Instruction::FPToSI:
399     case Instruction::SIToFP:
400       return true;
401     }
402   };
403 
404   for (BasicBlock::iterator J = BB->begin(), JE = BB->end();
405        J != JE; ++J) {
406     // There are no direct operations on half precision so assume that
407     // anything with that type requires a call except for a few select
408     // operations with Power9.
409     if (Instruction *CurrInst = dyn_cast<Instruction>(J)) {
410       for (const auto &Op : CurrInst->operands()) {
411         if (Op->getType()->getScalarType()->isHalfTy() ||
412             CurrInst->getType()->getScalarType()->isHalfTy())
413           return !(ST->isISA3_0() && supportedHalfPrecisionOp(CurrInst));
414       }
415     }
416     if (CallInst *CI = dyn_cast<CallInst>(J)) {
417       // Inline ASM is okay, unless it clobbers the ctr register.
418       if (InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledOperand())) {
419         if (asmClobbersCTR(IA))
420           return true;
421         continue;
422       }
423 
424       if (Function *F = CI->getCalledFunction()) {
425         // Most intrinsics don't become function calls, but some might.
426         // sin, cos, exp and log are always calls.
427         unsigned Opcode = 0;
428         if (F->getIntrinsicID() != Intrinsic::not_intrinsic) {
429           switch (F->getIntrinsicID()) {
430           default: continue;
431           // If we have a call to loop_decrement or set_loop_iterations,
432           // we're definitely using CTR.
433           case Intrinsic::set_loop_iterations:
434           case Intrinsic::loop_decrement:
435             return true;
436 
437           // Binary operations on 128-bit values will use CTR.
438           case Intrinsic::experimental_constrained_fadd:
439           case Intrinsic::experimental_constrained_fsub:
440           case Intrinsic::experimental_constrained_fmul:
441           case Intrinsic::experimental_constrained_fdiv:
442           case Intrinsic::experimental_constrained_frem:
443             if (F->getType()->getScalarType()->isFP128Ty() ||
444                 F->getType()->getScalarType()->isPPC_FP128Ty())
445               return true;
446             break;
447 
448           case Intrinsic::experimental_constrained_fptosi:
449           case Intrinsic::experimental_constrained_fptoui:
450           case Intrinsic::experimental_constrained_sitofp:
451           case Intrinsic::experimental_constrained_uitofp: {
452             Type *SrcType = CI->getArgOperand(0)->getType()->getScalarType();
453             Type *DstType = CI->getType()->getScalarType();
454             if (SrcType->isPPC_FP128Ty() || DstType->isPPC_FP128Ty() ||
455                 isLargeIntegerTy(!TM.isPPC64(), SrcType) ||
456                 isLargeIntegerTy(!TM.isPPC64(), DstType))
457               return true;
458             break;
459           }
460 
461           // Exclude eh_sjlj_setjmp; we don't need to exclude eh_sjlj_longjmp
462           // because, although it does clobber the counter register, the
463           // control can't then return to inside the loop unless there is also
464           // an eh_sjlj_setjmp.
465           case Intrinsic::eh_sjlj_setjmp:
466 
467           case Intrinsic::memcpy:
468           case Intrinsic::memmove:
469           case Intrinsic::memset:
470           case Intrinsic::powi:
471           case Intrinsic::log:
472           case Intrinsic::log2:
473           case Intrinsic::log10:
474           case Intrinsic::exp:
475           case Intrinsic::exp2:
476           case Intrinsic::pow:
477           case Intrinsic::sin:
478           case Intrinsic::cos:
479           case Intrinsic::experimental_constrained_powi:
480           case Intrinsic::experimental_constrained_log:
481           case Intrinsic::experimental_constrained_log2:
482           case Intrinsic::experimental_constrained_log10:
483           case Intrinsic::experimental_constrained_exp:
484           case Intrinsic::experimental_constrained_exp2:
485           case Intrinsic::experimental_constrained_pow:
486           case Intrinsic::experimental_constrained_sin:
487           case Intrinsic::experimental_constrained_cos:
488             return true;
489           case Intrinsic::copysign:
490             if (CI->getArgOperand(0)->getType()->getScalarType()->
491                 isPPC_FP128Ty())
492               return true;
493             else
494               continue; // ISD::FCOPYSIGN is never a library call.
495           case Intrinsic::fmuladd:
496           case Intrinsic::fma:                Opcode = ISD::FMA;        break;
497           case Intrinsic::sqrt:               Opcode = ISD::FSQRT;      break;
498           case Intrinsic::floor:              Opcode = ISD::FFLOOR;     break;
499           case Intrinsic::ceil:               Opcode = ISD::FCEIL;      break;
500           case Intrinsic::trunc:              Opcode = ISD::FTRUNC;     break;
501           case Intrinsic::rint:               Opcode = ISD::FRINT;      break;
502           case Intrinsic::lrint:              Opcode = ISD::LRINT;      break;
503           case Intrinsic::llrint:             Opcode = ISD::LLRINT;     break;
504           case Intrinsic::nearbyint:          Opcode = ISD::FNEARBYINT; break;
505           case Intrinsic::round:              Opcode = ISD::FROUND;     break;
506           case Intrinsic::lround:             Opcode = ISD::LROUND;     break;
507           case Intrinsic::llround:            Opcode = ISD::LLROUND;    break;
508           case Intrinsic::minnum:             Opcode = ISD::FMINNUM;    break;
509           case Intrinsic::maxnum:             Opcode = ISD::FMAXNUM;    break;
510           case Intrinsic::experimental_constrained_fcmp:
511             Opcode = ISD::STRICT_FSETCC;
512             break;
513           case Intrinsic::experimental_constrained_fcmps:
514             Opcode = ISD::STRICT_FSETCCS;
515             break;
516           case Intrinsic::experimental_constrained_fma:
517             Opcode = ISD::STRICT_FMA;
518             break;
519           case Intrinsic::experimental_constrained_sqrt:
520             Opcode = ISD::STRICT_FSQRT;
521             break;
522           case Intrinsic::experimental_constrained_floor:
523             Opcode = ISD::STRICT_FFLOOR;
524             break;
525           case Intrinsic::experimental_constrained_ceil:
526             Opcode = ISD::STRICT_FCEIL;
527             break;
528           case Intrinsic::experimental_constrained_trunc:
529             Opcode = ISD::STRICT_FTRUNC;
530             break;
531           case Intrinsic::experimental_constrained_rint:
532             Opcode = ISD::STRICT_FRINT;
533             break;
534           case Intrinsic::experimental_constrained_lrint:
535             Opcode = ISD::STRICT_LRINT;
536             break;
537           case Intrinsic::experimental_constrained_llrint:
538             Opcode = ISD::STRICT_LLRINT;
539             break;
540           case Intrinsic::experimental_constrained_nearbyint:
541             Opcode = ISD::STRICT_FNEARBYINT;
542             break;
543           case Intrinsic::experimental_constrained_round:
544             Opcode = ISD::STRICT_FROUND;
545             break;
546           case Intrinsic::experimental_constrained_lround:
547             Opcode = ISD::STRICT_LROUND;
548             break;
549           case Intrinsic::experimental_constrained_llround:
550             Opcode = ISD::STRICT_LLROUND;
551             break;
552           case Intrinsic::experimental_constrained_minnum:
553             Opcode = ISD::STRICT_FMINNUM;
554             break;
555           case Intrinsic::experimental_constrained_maxnum:
556             Opcode = ISD::STRICT_FMAXNUM;
557             break;
558           case Intrinsic::umul_with_overflow: Opcode = ISD::UMULO;      break;
559           case Intrinsic::smul_with_overflow: Opcode = ISD::SMULO;      break;
560           }
561         }
562 
563         // PowerPC does not use [US]DIVREM or other library calls for
564         // operations on regular types which are not otherwise library calls
565         // (i.e. soft float or atomics). If adapting for targets that do,
566         // additional care is required here.
567 
568         LibFunc Func;
569         if (!F->hasLocalLinkage() && F->hasName() && LibInfo &&
570             LibInfo->getLibFunc(F->getName(), Func) &&
571             LibInfo->hasOptimizedCodeGen(Func)) {
572           // Non-read-only functions are never treated as intrinsics.
573           if (!CI->onlyReadsMemory())
574             return true;
575 
576           // Conversion happens only for FP calls.
577           if (!CI->getArgOperand(0)->getType()->isFloatingPointTy())
578             return true;
579 
580           switch (Func) {
581           default: return true;
582           case LibFunc_copysign:
583           case LibFunc_copysignf:
584             continue; // ISD::FCOPYSIGN is never a library call.
585           case LibFunc_copysignl:
586             return true;
587           case LibFunc_fabs:
588           case LibFunc_fabsf:
589           case LibFunc_fabsl:
590             continue; // ISD::FABS is never a library call.
591           case LibFunc_sqrt:
592           case LibFunc_sqrtf:
593           case LibFunc_sqrtl:
594             Opcode = ISD::FSQRT; break;
595           case LibFunc_floor:
596           case LibFunc_floorf:
597           case LibFunc_floorl:
598             Opcode = ISD::FFLOOR; break;
599           case LibFunc_nearbyint:
600           case LibFunc_nearbyintf:
601           case LibFunc_nearbyintl:
602             Opcode = ISD::FNEARBYINT; break;
603           case LibFunc_ceil:
604           case LibFunc_ceilf:
605           case LibFunc_ceill:
606             Opcode = ISD::FCEIL; break;
607           case LibFunc_rint:
608           case LibFunc_rintf:
609           case LibFunc_rintl:
610             Opcode = ISD::FRINT; break;
611           case LibFunc_round:
612           case LibFunc_roundf:
613           case LibFunc_roundl:
614             Opcode = ISD::FROUND; break;
615           case LibFunc_trunc:
616           case LibFunc_truncf:
617           case LibFunc_truncl:
618             Opcode = ISD::FTRUNC; break;
619           case LibFunc_fmin:
620           case LibFunc_fminf:
621           case LibFunc_fminl:
622             Opcode = ISD::FMINNUM; break;
623           case LibFunc_fmax:
624           case LibFunc_fmaxf:
625           case LibFunc_fmaxl:
626             Opcode = ISD::FMAXNUM; break;
627           }
628         }
629 
630         if (Opcode) {
631           EVT EVTy =
632               TLI->getValueType(DL, CI->getArgOperand(0)->getType(), true);
633 
634           if (EVTy == MVT::Other)
635             return true;
636 
637           if (TLI->isOperationLegalOrCustom(Opcode, EVTy))
638             continue;
639           else if (EVTy.isVector() &&
640                    TLI->isOperationLegalOrCustom(Opcode, EVTy.getScalarType()))
641             continue;
642 
643           return true;
644         }
645       }
646 
647       return true;
648     } else if ((J->getType()->getScalarType()->isFP128Ty() ||
649                 J->getType()->getScalarType()->isPPC_FP128Ty())) {
650       // Most operations on f128 or ppc_f128 values become calls.
651       return true;
652     } else if (isa<FCmpInst>(J) &&
653                J->getOperand(0)->getType()->getScalarType()->isFP128Ty()) {
654       return true;
655     } else if ((isa<FPTruncInst>(J) || isa<FPExtInst>(J)) &&
656                (cast<CastInst>(J)->getSrcTy()->getScalarType()->isFP128Ty() ||
657                 cast<CastInst>(J)->getDestTy()->getScalarType()->isFP128Ty())) {
658       return true;
659     } else if (isa<UIToFPInst>(J) || isa<SIToFPInst>(J) ||
660                isa<FPToUIInst>(J) || isa<FPToSIInst>(J)) {
661       CastInst *CI = cast<CastInst>(J);
662       if (CI->getSrcTy()->getScalarType()->isPPC_FP128Ty() ||
663           CI->getDestTy()->getScalarType()->isPPC_FP128Ty() ||
664           isLargeIntegerTy(!TM.isPPC64(), CI->getSrcTy()->getScalarType()) ||
665           isLargeIntegerTy(!TM.isPPC64(), CI->getDestTy()->getScalarType()))
666         return true;
667     } else if (isLargeIntegerTy(!TM.isPPC64(),
668                                 J->getType()->getScalarType()) &&
669                (J->getOpcode() == Instruction::UDiv ||
670                 J->getOpcode() == Instruction::SDiv ||
671                 J->getOpcode() == Instruction::URem ||
672                 J->getOpcode() == Instruction::SRem)) {
673       return true;
674     } else if (!TM.isPPC64() &&
675                isLargeIntegerTy(false, J->getType()->getScalarType()) &&
676                (J->getOpcode() == Instruction::Shl ||
677                 J->getOpcode() == Instruction::AShr ||
678                 J->getOpcode() == Instruction::LShr)) {
679       // Only on PPC32, for 128-bit integers (specifically not 64-bit
680       // integers), these might be runtime calls.
681       return true;
682     } else if (isa<IndirectBrInst>(J) || isa<InvokeInst>(J)) {
683       // On PowerPC, indirect jumps use the counter register.
684       return true;
685     } else if (SwitchInst *SI = dyn_cast<SwitchInst>(J)) {
686       if (SI->getNumCases() + 1 >= (unsigned)TLI->getMinimumJumpTableEntries())
687         return true;
688     }
689 
690     // FREM is always a call.
691     if (J->getOpcode() == Instruction::FRem)
692       return true;
693 
694     if (ST->useSoftFloat()) {
695       switch(J->getOpcode()) {
696       case Instruction::FAdd:
697       case Instruction::FSub:
698       case Instruction::FMul:
699       case Instruction::FDiv:
700       case Instruction::FPTrunc:
701       case Instruction::FPExt:
702       case Instruction::FPToUI:
703       case Instruction::FPToSI:
704       case Instruction::UIToFP:
705       case Instruction::SIToFP:
706       case Instruction::FCmp:
707         return true;
708       }
709     }
710 
711     for (Value *Operand : J->operands())
712       if (memAddrUsesCTR(Operand, TM, Visited))
713         return true;
714   }
715 
716   return false;
717 }
718 
719 bool PPCTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
720                                           AssumptionCache &AC,
721                                           TargetLibraryInfo *LibInfo,
722                                           HardwareLoopInfo &HWLoopInfo) {
723   const PPCTargetMachine &TM = ST->getTargetMachine();
724   TargetSchedModel SchedModel;
725   SchedModel.init(ST);
726 
727   // Do not convert small, short loops into CTR loops.
728   unsigned ConstTripCount = SE.getSmallConstantTripCount(L);
729   if (ConstTripCount && ConstTripCount < SmallCTRLoopThreshold) {
730     SmallPtrSet<const Value *, 32> EphValues;
731     CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
732     CodeMetrics Metrics;
733     for (BasicBlock *BB : L->blocks())
734       Metrics.analyzeBasicBlock(BB, *this, EphValues);
735     // 6 is an approximate latency for the mtctr instruction.
736     if (Metrics.NumInsts <= (6 * SchedModel.getIssueWidth()))
737       return false;
738   }
739 
740   // We don't want to spill/restore the counter register, and so we don't
741   // want to use the counter register if the loop contains calls.
742   SmallPtrSet<const Value *, 4> Visited;
743   for (Loop::block_iterator I = L->block_begin(), IE = L->block_end();
744        I != IE; ++I)
745     if (mightUseCTR(*I, LibInfo, Visited))
746       return false;
747 
748   SmallVector<BasicBlock*, 4> ExitingBlocks;
749   L->getExitingBlocks(ExitingBlocks);
750 
751   // If there is an exit edge known to be frequently taken,
752   // we should not transform this loop.
753   for (auto &BB : ExitingBlocks) {
754     Instruction *TI = BB->getTerminator();
755     if (!TI) continue;
756 
757     if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
758       uint64_t TrueWeight = 0, FalseWeight = 0;
759       if (!BI->isConditional() ||
760           !BI->extractProfMetadata(TrueWeight, FalseWeight))
761         continue;
762 
763       // If the exit path is more frequent than the loop path,
764       // we return here without further analysis for this loop.
765       bool TrueIsExit = !L->contains(BI->getSuccessor(0));
766       if (( TrueIsExit && FalseWeight < TrueWeight) ||
767           (!TrueIsExit && FalseWeight > TrueWeight))
768         return false;
769     }
770   }
771 
772   // If an exit block has a PHI that accesses a TLS variable as one of the
773   // incoming values from the loop, we cannot produce a CTR loop because the
774   // address for that value will be computed in the loop.
775   SmallVector<BasicBlock *, 4> ExitBlocks;
776   L->getExitBlocks(ExitBlocks);
777   for (auto &BB : ExitBlocks) {
778     for (auto &PHI : BB->phis()) {
779       for (int Idx = 0, EndIdx = PHI.getNumIncomingValues(); Idx < EndIdx;
780            Idx++) {
781         const BasicBlock *IncomingBB = PHI.getIncomingBlock(Idx);
782         const Value *IncomingValue = PHI.getIncomingValue(Idx);
783         if (L->contains(IncomingBB) &&
784             memAddrUsesCTR(IncomingValue, TM, Visited))
785           return false;
786       }
787     }
788   }
789 
790   LLVMContext &C = L->getHeader()->getContext();
791   HWLoopInfo.CountType = TM.isPPC64() ?
792     Type::getInt64Ty(C) : Type::getInt32Ty(C);
793   HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
794   return true;
795 }
796 
797 void PPCTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
798                                          TTI::UnrollingPreferences &UP,
799                                          OptimizationRemarkEmitter *ORE) {
800   if (ST->getCPUDirective() == PPC::DIR_A2) {
801     // The A2 is in-order with a deep pipeline, and concatenation unrolling
802     // helps expose latency-hiding opportunities to the instruction scheduler.
803     UP.Partial = UP.Runtime = true;
804 
805     // We unroll a lot on the A2 (hundreds of instructions), and the benefits
806     // often outweigh the cost of a division to compute the trip count.
807     UP.AllowExpensiveTripCount = true;
808   }
809 
810   BaseT::getUnrollingPreferences(L, SE, UP, ORE);
811 }
812 
813 void PPCTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
814                                        TTI::PeelingPreferences &PP) {
815   BaseT::getPeelingPreferences(L, SE, PP);
816 }
817 // This function returns true to allow using the coldcc calling convention.
818 // Returning true results in coldcc being used for functions which are cold
819 // at all call sites when their callers do not call any other non-coldcc
820 // functions.
821 bool PPCTTIImpl::useColdCCForColdCall(Function &F) {
822   return EnablePPCColdCC;
823 }
824 
825 bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) {
826   // On the A2, always unroll aggressively.
827   if (ST->getCPUDirective() == PPC::DIR_A2)
828     return true;
829 
830   return LoopHasReductions;
831 }
832 
833 PPCTTIImpl::TTI::MemCmpExpansionOptions
834 PPCTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
835   TTI::MemCmpExpansionOptions Options;
836   Options.LoadSizes = {8, 4, 2, 1};
837   Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
838   return Options;
839 }
840 
841 bool PPCTTIImpl::enableInterleavedAccessVectorization() {
842   return true;
843 }
844 
845 unsigned PPCTTIImpl::getNumberOfRegisters(unsigned ClassID) const {
846   assert(ClassID == GPRRC || ClassID == FPRRC ||
847          ClassID == VRRC || ClassID == VSXRC);
848   if (ST->hasVSX()) {
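    // With VSX, the 32 FPRs and 32 Altivec VRs are aliased into the 64 VSX
    // registers.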
849     assert(ClassID == GPRRC || ClassID == VSXRC || ClassID == VRRC);
850     return ClassID == VSXRC ? 64 : 32;
851   }
852   assert(ClassID == GPRRC || ClassID == FPRRC || ClassID == VRRC);
853   return 32;
854 }
855 
856 unsigned PPCTTIImpl::getRegisterClassForType(bool Vector, Type *Ty) const {
857   if (Vector)
858     return ST->hasVSX() ? VSXRC : VRRC;
859   else if (Ty && (Ty->getScalarType()->isFloatTy() ||
860                   Ty->getScalarType()->isDoubleTy()))
861     return ST->hasVSX() ? VSXRC : FPRRC;
862   else if (Ty && (Ty->getScalarType()->isFP128Ty() ||
863                   Ty->getScalarType()->isPPC_FP128Ty()))
864     return VRRC;
865   else if (Ty && Ty->getScalarType()->isHalfTy())
866     return VSXRC;
867   else
868     return GPRRC;
869 }
870 
871 const char* PPCTTIImpl::getRegisterClassName(unsigned ClassID) const {
872 
873   switch (ClassID) {
874     default:
875       llvm_unreachable("unknown register class");
876       return "PPC::unknown register class";
877     case GPRRC:       return "PPC::GPRRC";
878     case FPRRC:       return "PPC::FPRRC";
879     case VRRC:        return "PPC::VRRC";
880     case VSXRC:       return "PPC::VSXRC";
881   }
882 }
883 
884 TypeSize
885 PPCTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
886   switch (K) {
887   case TargetTransformInfo::RGK_Scalar:
888     return TypeSize::getFixed(ST->isPPC64() ? 64 : 32);
889   case TargetTransformInfo::RGK_FixedWidthVector:
890     return TypeSize::getFixed(ST->hasAltivec() ? 128 : 0);
891   case TargetTransformInfo::RGK_ScalableVector:
892     return TypeSize::getScalable(0);
893   }
894 
895   llvm_unreachable("Unsupported register kind");
896 }
897 
898 unsigned PPCTTIImpl::getCacheLineSize() const {
899   // Starting with P7 we have a cache line size of 128 bytes.
900   unsigned Directive = ST->getCPUDirective();
901   // Assume that Future CPU has the same cache line size as the others.
902   if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8 ||
903       Directive == PPC::DIR_PWR9 || Directive == PPC::DIR_PWR10 ||
904       Directive == PPC::DIR_PWR_FUTURE)
905     return 128;
906 
907   // On other processors return a default of 64 bytes.
908   return 64;
909 }
910 
911 unsigned PPCTTIImpl::getPrefetchDistance() const {
912   return 300;
913 }
914 
915 unsigned PPCTTIImpl::getMaxInterleaveFactor(unsigned VF) {
916   unsigned Directive = ST->getCPUDirective();
917   // The 440 has no SIMD support, but floating-point instructions
918   // have a 5-cycle latency, so unroll by 5x for latency hiding.
919   if (Directive == PPC::DIR_440)
920     return 5;
921 
922   // The A2 has no SIMD support, but floating-point instructions
923   // have a 6-cycle latency, so unroll by 6x for latency hiding.
924   if (Directive == PPC::DIR_A2)
925     return 6;
926 
927   // FIXME: For lack of any better information, do no harm...
928   if (Directive == PPC::DIR_E500mc || Directive == PPC::DIR_E5500)
929     return 1;
930 
931   // For P7 and P8, floating-point instructions have a 6-cycle latency and
932   // there are two execution units, so unroll by 12x for latency hiding.
933   // FIXME: the same for P9 as previous gen until POWER9 scheduling is ready
934   // FIXME: the same for P10 as previous gen until POWER10 scheduling is ready
935   // Assume that future is the same as the others.
936   if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8 ||
937       Directive == PPC::DIR_PWR9 || Directive == PPC::DIR_PWR10 ||
938       Directive == PPC::DIR_PWR_FUTURE)
939     return 12;
940 
941   // For most things, modern systems have two execution units (and
942   // out-of-order execution).
943   return 2;
944 }
945 
946 // Returns a cost adjustment factor to adjust the cost of vector instructions
947 // on targets where there is overlap between the vector and scalar units,
948 // thereby reducing the overall throughput of vector code wrt. scalar code.
949 // An invalid instruction cost is returned if the type is an MMA vector type.
950 InstructionCost PPCTTIImpl::vectorCostAdjustmentFactor(unsigned Opcode,
951                                                        Type *Ty1, Type *Ty2) {
952   // If the vector type is of an MMA type (v256i1, v512i1), an invalid
953   // instruction cost is returned. This is to signify to other cost computing
954   // functions to return the maximum instruction cost in order to prevent any
955   // opportunities for the optimizer to produce MMA types within the IR.
956   if (isMMAType(Ty1))
957     return InstructionCost::getInvalid();
958 
959   if (!ST->vectorsUseTwoUnits() || !Ty1->isVectorTy())
960     return InstructionCost(1);
961 
962   std::pair<InstructionCost, MVT> LT1 = TLI->getTypeLegalizationCost(DL, Ty1);
963   // If type legalization involves splitting the vector, we don't want to
964   // double the cost at every step - only the last step.
965   if (LT1.first != 1 || !LT1.second.isVector())
966     return InstructionCost(1);
967 
968   int ISD = TLI->InstructionOpcodeToISD(Opcode);
969   if (TLI->isOperationExpand(ISD, LT1.second))
970     return InstructionCost(1);
971 
972   if (Ty2) {
973     std::pair<InstructionCost, MVT> LT2 = TLI->getTypeLegalizationCost(DL, Ty2);
974     if (LT2.first != 1 || !LT2.second.isVector())
975       return InstructionCost(1);
976   }
977 
978   return InstructionCost(2);
979 }
980 
981 InstructionCost PPCTTIImpl::getArithmeticInstrCost(
982     unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
983     TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
984     TTI::OperandValueProperties Opd1PropInfo,
985     TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
986     const Instruction *CxtI) {
987   assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");
988 
989   InstructionCost CostFactor = vectorCostAdjustmentFactor(Opcode, Ty, nullptr);
990   if (!CostFactor.isValid())
991     return InstructionCost::getMax();
992 
993   // TODO: Handle more cost kinds.
994   if (CostKind != TTI::TCK_RecipThroughput)
995     return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
996                                          Op2Info, Opd1PropInfo,
997                                          Opd2PropInfo, Args, CxtI);
998 
999   // Fallback to the default implementation.
1000   InstructionCost Cost = BaseT::getArithmeticInstrCost(
1001       Opcode, Ty, CostKind, Op1Info, Op2Info, Opd1PropInfo, Opd2PropInfo);
1002   return Cost * CostFactor;
1003 }
1004 
1005 InstructionCost PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp,
1006                                            ArrayRef<int> Mask, int Index,
1007                                            Type *SubTp,
1008                                            ArrayRef<const Value *> Args) {
1009 
1010   InstructionCost CostFactor =
1011       vectorCostAdjustmentFactor(Instruction::ShuffleVector, Tp, nullptr);
1012   if (!CostFactor.isValid())
1013     return InstructionCost::getMax();
1014 
1015   // Legalize the type.
1016   std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1017 
1018   // PPC, for both Altivec and VSX, supports cheap arbitrary permutations
1019   // (at least in the sense that there need only be one non-loop-invariant
1020   // instruction). We need one such shuffle instruction for each actual
1021   // register (this is not true for arbitrary shuffles, but is true for the
1022   // structured types of shuffles covered by TTI::ShuffleKind).
1023   return LT.first * CostFactor;
1024 }
1025 
1026 InstructionCost PPCTTIImpl::getCFInstrCost(unsigned Opcode,
1027                                            TTI::TargetCostKind CostKind,
1028                                            const Instruction *I) {
1029   if (CostKind != TTI::TCK_RecipThroughput)
1030     return Opcode == Instruction::PHI ? 0 : 1;
1031   // Branches are assumed to be predicted.
1032   return 0;
1033 }
1034 
1035 InstructionCost PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
1036                                              Type *Src,
1037                                              TTI::CastContextHint CCH,
1038                                              TTI::TargetCostKind CostKind,
1039                                              const Instruction *I) {
1040   assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");
1041 
1042   InstructionCost CostFactor = vectorCostAdjustmentFactor(Opcode, Dst, Src);
1043   if (!CostFactor.isValid())
1044     return InstructionCost::getMax();
1045 
1046   InstructionCost Cost =
1047       BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1048   Cost *= CostFactor;
1049   // TODO: Allow non-throughput costs that aren't binary.
1050   if (CostKind != TTI::TCK_RecipThroughput)
1051     return Cost == 0 ? 0 : 1;
1052   return Cost;
1053 }
1054 
1055 InstructionCost PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
1056                                                Type *CondTy,
1057                                                CmpInst::Predicate VecPred,
1058                                                TTI::TargetCostKind CostKind,
1059                                                const Instruction *I) {
1060   InstructionCost CostFactor =
1061       vectorCostAdjustmentFactor(Opcode, ValTy, nullptr);
1062   if (!CostFactor.isValid())
1063     return InstructionCost::getMax();
1064 
1065   InstructionCost Cost =
1066       BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
1067   // TODO: Handle other cost kinds.
1068   if (CostKind != TTI::TCK_RecipThroughput)
1069     return Cost;
1070   return Cost * CostFactor;
1071 }
1072 
1073 InstructionCost PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
1074                                                unsigned Index) {
1075   assert(Val->isVectorTy() && "This must be a vector type");
1076 
1077   int ISD = TLI->InstructionOpcodeToISD(Opcode);
1078   assert(ISD && "Invalid opcode");
1079 
1080   InstructionCost CostFactor = vectorCostAdjustmentFactor(Opcode, Val, nullptr);
1081   if (!CostFactor.isValid())
1082     return InstructionCost::getMax();
1083 
1084   InstructionCost Cost = BaseT::getVectorInstrCost(Opcode, Val, Index);
1085   Cost *= CostFactor;
1086 
1087   if (ST->hasVSX() && Val->getScalarType()->isDoubleTy()) {
1088     // Double-precision scalars are already located in index #0 (or #1 if LE).
1089     if (ISD == ISD::EXTRACT_VECTOR_ELT &&
1090         Index == (ST->isLittleEndian() ? 1 : 0))
1091       return 0;
1092 
1093     return Cost;
1094 
1095   } else if (Val->getScalarType()->isIntegerTy() && Index != -1U) {
1096     if (ST->hasP9Altivec()) {
1097       if (ISD == ISD::INSERT_VECTOR_ELT)
1098         // A move-to VSR and a permute/insert.  Assume vector operation cost
1099         // for both (cost will be 2x on P9).
1100         return 2 * CostFactor;
1101 
1102       // It's an extract.  Maybe we can do a cheap move-from VSR.
1103       unsigned EltSize = Val->getScalarSizeInBits();
1104       if (EltSize == 64) {
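        // mfvsrd can cheaply read only the doubleword that maps to element 1
        // on little-endian and element 0 on big-endian.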
1105         unsigned MfvsrdIndex = ST->isLittleEndian() ? 1 : 0;
1106         if (Index == MfvsrdIndex)
1107           return 1;
1108       } else if (EltSize == 32) {
1109         unsigned MfvsrwzIndex = ST->isLittleEndian() ? 2 : 1;
1110         if (Index == MfvsrwzIndex)
1111           return 1;
1112       }
1113 
1114       // We need a vector extract (or mfvsrld).  Assume vector operation cost.
1115       // The cost of the load constant for a vector extract is disregarded
1116       // (invariant, easily schedulable).
1117       return CostFactor;
1118 
1119     } else if (ST->hasDirectMove())
1120       // Assume permute has standard cost.
1121       // Assume move-to/move-from VSR have 2x standard cost.
1122       return 3;
1123   }
1124 
1125   // Estimated cost of a load-hit-store delay.  This was obtained
1126   // experimentally as a minimum needed to prevent unprofitable
1127   // vectorization for the paq8p benchmark.  It may need to be
1128   // raised further if other unprofitable cases remain.
1129   unsigned LHSPenalty = 2;
1130   if (ISD == ISD::INSERT_VECTOR_ELT)
1131     LHSPenalty += 7;
1132 
1133   // Vector element insert/extract with Altivec is very expensive,
1134   // because they require store and reload with the attendant
1135   // processor stall for load-hit-store.  Until VSX is available,
1136   // these need to be estimated as very costly.
1137   if (ISD == ISD::EXTRACT_VECTOR_ELT ||
1138       ISD == ISD::INSERT_VECTOR_ELT)
1139     return LHSPenalty + Cost;
1140 
1141   return Cost;
1142 }
1143 
1144 InstructionCost PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1145                                             MaybeAlign Alignment,
1146                                             unsigned AddressSpace,
1147                                             TTI::TargetCostKind CostKind,
1148                                             const Instruction *I) {
1149 
1150   InstructionCost CostFactor = vectorCostAdjustmentFactor(Opcode, Src, nullptr);
1151   if (!CostFactor.isValid())
1152     return InstructionCost::getMax();
1153 
1154   if (TLI->getValueType(DL, Src,  true) == MVT::Other)
1155     return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1156                                   CostKind);
1157   // Legalize the type.
1158   std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
1159   assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
1160          "Invalid Opcode");
1161 
1162   InstructionCost Cost =
1163       BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
1164   // TODO: Handle other cost kinds.
1165   if (CostKind != TTI::TCK_RecipThroughput)
1166     return Cost;
1167 
1168   Cost *= CostFactor;
1169 
1170   bool IsAltivecType = ST->hasAltivec() &&
1171                        (LT.second == MVT::v16i8 || LT.second == MVT::v8i16 ||
1172                         LT.second == MVT::v4i32 || LT.second == MVT::v4f32);
1173   bool IsVSXType = ST->hasVSX() &&
1174                    (LT.second == MVT::v2f64 || LT.second == MVT::v2i64);
1175 
1176   // VSX has 32b/64b load instructions. Legalization can handle loading of
1177   // 32b/64b to VSR correctly and cheaply. But BaseT::getMemoryOpCost and
1178   // PPCTargetLowering can't compute the cost appropriately. So here we
1179   // explicitly check this case.
1180   unsigned MemBytes = Src->getPrimitiveSizeInBits(); // Size is in bits.
1181   if (Opcode == Instruction::Load && ST->hasVSX() && IsAltivecType &&
1182       (MemBytes == 64 || (ST->hasP8Vector() && MemBytes == 32)))
1183     return 1;
1184 
1185   // Aligned loads and stores are easy.
1186   unsigned SrcBytes = LT.second.getStoreSize();
1187   if (!SrcBytes || !Alignment || *Alignment >= SrcBytes)
1188     return Cost;
1189 
1190   // If we can use the permutation-based load sequence, then this is also
1191   // relatively cheap (not counting loop-invariant instructions): one load plus
1192   // one permute (the last load in a series has extra cost, but we're
1193   // neglecting that here). Note that on the P7, we could do unaligned loads
1194   // for Altivec types using the VSX instructions, but that's more expensive
1195   // than using the permutation-based load sequence. On the P8, that's no
1196   // longer true.
1197   if (Opcode == Instruction::Load && (!ST->hasP8Vector() && IsAltivecType) &&
1198       *Alignment >= LT.second.getScalarType().getStoreSize())
1199     return Cost + LT.first; // Add the cost of the permutations.
1200 
1201   // For VSX, we can do unaligned loads and stores on Altivec/VSX types. On the
1202   // P7, unaligned vector loads are more expensive than the permutation-based
1203   // load sequence, so that might be used instead, but regardless, the net cost
1204   // is about the same (not counting loop-invariant instructions).
1205   if (IsVSXType || (ST->hasVSX() && IsAltivecType))
1206     return Cost;
1207 
1208   // Newer PPC supports unaligned memory access.
1209   if (TLI->allowsMisalignedMemoryAccesses(LT.second, 0))
1210     return Cost;
1211 
1212   // PPC in general does not support unaligned loads and stores. They'll need
1213   // to be decomposed based on the alignment factor.
1214 
1215   // Add the cost of each scalar load or store.
1216   assert(Alignment);
1217   Cost += LT.first * ((SrcBytes / Alignment->value()) - 1);
1218 
1219   // For a vector type, there is also scalarization overhead (only for
1220   // stores, loads are expanded using the vector-load + permutation sequence,
1221   // which is much less expensive).
1222   if (Src->isVectorTy() && Opcode == Instruction::Store)
1223     for (int i = 0, e = cast<FixedVectorType>(Src)->getNumElements(); i < e;
1224          ++i)
1225       Cost += getVectorInstrCost(Instruction::ExtractElement, Src, i);
1226 
1227   return Cost;
1228 }
1229 
1230 InstructionCost PPCTTIImpl::getInterleavedMemoryOpCost(
1231     unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1232     Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1233     bool UseMaskForCond, bool UseMaskForGaps) {
1234   InstructionCost CostFactor =
1235       vectorCostAdjustmentFactor(Opcode, VecTy, nullptr);
1236   if (!CostFactor.isValid())
1237     return InstructionCost::getMax();
1238 
1239   if (UseMaskForCond || UseMaskForGaps)
1240     return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1241                                              Alignment, AddressSpace, CostKind,
1242                                              UseMaskForCond, UseMaskForGaps);
1243 
1244   assert(isa<VectorType>(VecTy) &&
1245          "Expect a vector type for interleaved memory op");
1246 
1247   // Legalize the type.
1248   std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VecTy);
1249 
1250   // Firstly, the cost of load/store operation.
1251   InstructionCost Cost = getMemoryOpCost(Opcode, VecTy, MaybeAlign(Alignment),
1252                                          AddressSpace, CostKind);
1253 
1254   // PPC, for both Altivec and VSX, supports cheap arbitrary permutations
1255   // (at least in the sense that there need only be one non-loop-invariant
1256   // instruction). For each result vector, we need one shuffle per incoming
1257   // vector (except that the first shuffle can take two incoming vectors
1258   // because it does not need to take itself).
1259   Cost += Factor*(LT.first-1);
1260 
1261   return Cost;
1262 }
1263 
1264 InstructionCost
1265 PPCTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1266                                   TTI::TargetCostKind CostKind) {
1267   return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1268 }
1269 
1270 bool PPCTTIImpl::areTypesABICompatible(const Function *Caller,
1271                                        const Function *Callee,
1272                                        const ArrayRef<Type *> &Types) const {
1273 
1274   // We need to ensure that argument promotion does not
1275   // attempt to promote pointers to MMA types (__vector_pair
1276   // and __vector_quad) since these types explicitly cannot be
1277   // passed as arguments. Both of these types are larger than
1278   // the 128-bit Altivec vectors and have a scalar size of 1 bit.
1279   if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
1280     return false;
1281 
1282   return llvm::none_of(Types, [](Type *Ty) {
1283     if (Ty->isSized())
1284       return Ty->isIntOrIntVectorTy(1) && Ty->getPrimitiveSizeInBits() > 128;
1285     return false;
1286   });
1287 }
1288 
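// Returns true (and reports the exit branch) when the loop is expected to
// become a hardware CTR loop, in which case LSR can rely on the
// decrement-and-branch instruction and save a separate trip-count compare.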
1289 bool PPCTTIImpl::canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE,
1290                             LoopInfo *LI, DominatorTree *DT,
1291                             AssumptionCache *AC, TargetLibraryInfo *LibInfo) {
1292   // Process nested loops first.
1293   for (Loop *I : *L)
1294     if (canSaveCmp(I, BI, SE, LI, DT, AC, LibInfo))
1295       return false; // Stop search.
1296 
1297   HardwareLoopInfo HWLoopInfo(L);
1298 
1299   if (!HWLoopInfo.canAnalyze(*LI))
1300     return false;
1301 
1302   if (!isHardwareLoopProfitable(L, *SE, *AC, LibInfo, HWLoopInfo))
1303     return false;
1304 
1305   if (!HWLoopInfo.isHardwareLoopCandidate(*SE, *LI, *DT))
1306     return false;
1307 
1308   *BI = HWLoopInfo.ExitBranch;
1309   return true;
1310 }
1311 
1312 bool PPCTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
1313                                const TargetTransformInfo::LSRCost &C2) {
1314   // PowerPC default behaviour here is "instruction number 1st priority".
1315   // If LsrNoInsnsCost is set, call default implementation.
1316   if (!LsrNoInsnsCost)
1317     return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost, C1.NumIVMuls,
1318                     C1.NumBaseAdds, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
1319            std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost, C2.NumIVMuls,
1320                     C2.NumBaseAdds, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
1321   else
1322     return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
1323 }
1324 
1325 bool PPCTTIImpl::isNumRegsMajorCostOfLSR() {
1326   return false;
1327 }
1328 
1329 bool PPCTTIImpl::shouldBuildRelLookupTables() const {
1330   const PPCTargetMachine &TM = ST->getTargetMachine();
1331   // XCOFF hasn't implemented lowerRelativeReference; disable non-ELF for now.
1332   if (!TM.isELFv2ABI())
1333     return false;
1334   return BaseT::shouldBuildRelLookupTables();
1335 }
1336 
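// Describe the PPC vector load/store intrinsics as plain memory accesses
// (pointer operand plus read/write behavior) so that generic passes such as
// EarlyCSE can reason about them.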
1337 bool PPCTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
1338                                     MemIntrinsicInfo &Info) {
1339   switch (Inst->getIntrinsicID()) {
1340   case Intrinsic::ppc_altivec_lvx:
1341   case Intrinsic::ppc_altivec_lvxl:
1342   case Intrinsic::ppc_altivec_lvebx:
1343   case Intrinsic::ppc_altivec_lvehx:
1344   case Intrinsic::ppc_altivec_lvewx:
1345   case Intrinsic::ppc_vsx_lxvd2x:
1346   case Intrinsic::ppc_vsx_lxvw4x:
1347   case Intrinsic::ppc_vsx_lxvd2x_be:
1348   case Intrinsic::ppc_vsx_lxvw4x_be:
1349   case Intrinsic::ppc_vsx_lxvl:
1350   case Intrinsic::ppc_vsx_lxvll:
1351   case Intrinsic::ppc_vsx_lxvp: {
1352     Info.PtrVal = Inst->getArgOperand(0);
1353     Info.ReadMem = true;
1354     Info.WriteMem = false;
1355     return true;
1356   }
1357   case Intrinsic::ppc_altivec_stvx:
1358   case Intrinsic::ppc_altivec_stvxl:
1359   case Intrinsic::ppc_altivec_stvebx:
1360   case Intrinsic::ppc_altivec_stvehx:
1361   case Intrinsic::ppc_altivec_stvewx:
1362   case Intrinsic::ppc_vsx_stxvd2x:
1363   case Intrinsic::ppc_vsx_stxvw4x:
1364   case Intrinsic::ppc_vsx_stxvd2x_be:
1365   case Intrinsic::ppc_vsx_stxvw4x_be:
1366   case Intrinsic::ppc_vsx_stxvl:
1367   case Intrinsic::ppc_vsx_stxvll:
1368   case Intrinsic::ppc_vsx_stxvp: {
1369     Info.PtrVal = Inst->getArgOperand(1);
1370     Info.ReadMem = false;
1371     Info.WriteMem = true;
1372     return true;
1373   }
1374   default:
1375     break;
1376   }
1377 
1378   return false;
1379 }
1380 
1381 bool PPCTTIImpl::hasActiveVectorLength(unsigned Opcode, Type *DataType,
1382                                        Align Alignment) const {
1383   // Only load and store instructions can have variable vector length on Power.
1384   if (Opcode != Instruction::Load && Opcode != Instruction::Store)
1385     return false;
1386   // Loads/stores with length instructions use bits 0-7 of the GPR operand and
1387   // therefore cannot be used in 32-bit mode.
1388   if ((!ST->hasP9Vector() && !ST->hasP10Vector()) || !ST->isPPC64())
1389     return false;
1390   if (isa<FixedVectorType>(DataType)) {
1391     unsigned VecWidth = DataType->getPrimitiveSizeInBits();
1392     return VecWidth == 128;
1393   }
1394   Type *ScalarTy = DataType->getScalarType();
1395 
1396   if (ScalarTy->isPointerTy())
1397     return true;
1398 
1399   if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
1400     return true;
1401 
1402   if (!ScalarTy->isIntegerTy())
1403     return false;
1404 
1405   unsigned IntWidth = ScalarTy->getIntegerBitWidth();
1406   return IntWidth == 8 || IntWidth == 16 || IntWidth == 32 || IntWidth == 64;
1407 }
1408 
1409 InstructionCost PPCTTIImpl::getVPMemoryOpCost(unsigned Opcode, Type *Src,
1410                                               Align Alignment,
1411                                               unsigned AddressSpace,
1412                                               TTI::TargetCostKind CostKind,
1413                                               const Instruction *I) {
1414   InstructionCost Cost = BaseT::getVPMemoryOpCost(Opcode, Src, Alignment,
1415                                                   AddressSpace, CostKind, I);
1416   if (TLI->getValueType(DL, Src, true) == MVT::Other)
1417     return Cost;
1418   // TODO: Handle other cost kinds.
1419   if (CostKind != TTI::TCK_RecipThroughput)
1420     return Cost;
1421 
1422   assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
1423          "Invalid Opcode");
1424 
1425   auto *SrcVTy = dyn_cast<FixedVectorType>(Src);
1426   assert(SrcVTy && "Expected a vector type for VP memory operations");
1427 
1428   if (hasActiveVectorLength(Opcode, Src, Alignment)) {
1429     std::pair<InstructionCost, MVT> LT =
1430         TLI->getTypeLegalizationCost(DL, SrcVTy);
1431 
1432     InstructionCost CostFactor =
1433         vectorCostAdjustmentFactor(Opcode, Src, nullptr);
1434     if (!CostFactor.isValid())
1435       return InstructionCost::getMax();
1436 
1437     InstructionCost Cost = LT.first * CostFactor;
1438     assert(Cost.isValid() && "Expected valid cost");
1439 
1440     // On P9 but not on P10, if the op is misaligned then it will cause a
1441     // pipeline flush. Otherwise the VSX masked memops cost the same as unmasked
1442     // ones.
1443     const Align DesiredAlignment(16);
1444     if (Alignment >= DesiredAlignment || ST->getCPUDirective() != PPC::DIR_PWR9)
1445       return Cost;
1446 
1447     // Since alignment may be underestimated, we try to compute the probability
1448     // that the actual address is aligned to the desired boundary. For example
1449     // an 8-byte aligned load is assumed to be actually 16-byte aligned half the
1450     // time, while a 4-byte aligned load has a 25% chance of being 16-byte
1451     // aligned.
1452     float AlignmentProb = ((float)Alignment.value()) / DesiredAlignment.value();
1453     float MisalignmentProb = 1.0 - AlignmentProb;
1454     return (MisalignmentProb * P9PipelineFlushEstimate) +
1455            (AlignmentProb * *Cost.getValue());
1456   }
1457 
1458   // Usually we should not get to this point, but the following is an attempt to
1459   // model the cost of legalization. Currently we can only lower intrinsics with
1460   // evl but no mask, on Power 9/10. Otherwise, we must scalarize.
1461   return getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
1462 }
1463