1 //===------ PPCLoopInstrFormPrep.cpp - Loop Instr Form Prep Pass ----------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements a pass to prepare loops for ppc preferred addressing 10 // modes, leveraging different instruction form. (eg: DS/DQ form, D/DS form with 11 // update) 12 // Additional PHIs are created for loop induction variables used by load/store 13 // instructions so that preferred addressing modes can be used. 14 // 15 // 1: DS/DQ form preparation, prepare the load/store instructions so that they 16 // can satisfy the DS/DQ form displacement requirements. 17 // Generically, this means transforming loops like this: 18 // for (int i = 0; i < n; ++i) { 19 // unsigned long x1 = *(unsigned long *)(p + i + 5); 20 // unsigned long x2 = *(unsigned long *)(p + i + 9); 21 // } 22 // 23 // to look like this: 24 // 25 // unsigned NewP = p + 5; 26 // for (int i = 0; i < n; ++i) { 27 // unsigned long x1 = *(unsigned long *)(i + NewP); 28 // unsigned long x2 = *(unsigned long *)(i + NewP + 4); 29 // } 30 // 31 // 2: D/DS form with update preparation, prepare the load/store instructions so 32 // that we can use update form to do pre-increment. 33 // Generically, this means transforming loops like this: 34 // for (int i = 0; i < n; ++i) 35 // array[i] = c; 36 // 37 // to look like this: 38 // 39 // T *p = array[-1]; 40 // for (int i = 0; i < n; ++i) 41 // *++p = c; 42 // 43 // 3: common multiple chains for the load/stores with same offsets in the loop, 44 // so that we can reuse the offsets and reduce the register pressure in the 45 // loop. This transformation can also increase the loop ILP as now each chain 46 // uses its own loop induction add/addi. But this will increase the number of 47 // add/addi in the loop. 48 // 49 // Generically, this means transforming loops like this: 50 // 51 // char *p; 52 // A1 = p + base1 53 // A2 = p + base1 + offset 54 // B1 = p + base2 55 // B2 = p + base2 + offset 56 // 57 // for (int i = 0; i < n; i++) 58 // unsigned long x1 = *(unsigned long *)(A1 + i); 59 // unsigned long x2 = *(unsigned long *)(A2 + i) 60 // unsigned long x3 = *(unsigned long *)(B1 + i); 61 // unsigned long x4 = *(unsigned long *)(B2 + i); 62 // } 63 // 64 // to look like this: 65 // 66 // A1_new = p + base1 // chain 1 67 // B1_new = p + base2 // chain 2, now inside the loop, common offset is 68 // // reused. 69 // 70 // for (long long i = 0; i < n; i+=count) { 71 // unsigned long x1 = *(unsigned long *)(A1_new + i); 72 // unsigned long x2 = *(unsigned long *)((A1_new + i) + offset); 73 // unsigned long x3 = *(unsigned long *)(B1_new + i); 74 // unsigned long x4 = *(unsigned long *)((B1_new + i) + offset); 75 // } 76 //===----------------------------------------------------------------------===// 77 78 #include "PPC.h" 79 #include "PPCSubtarget.h" 80 #include "PPCTargetMachine.h" 81 #include "llvm/ADT/DepthFirstIterator.h" 82 #include "llvm/ADT/SmallPtrSet.h" 83 #include "llvm/ADT/SmallSet.h" 84 #include "llvm/ADT/SmallVector.h" 85 #include "llvm/ADT/Statistic.h" 86 #include "llvm/Analysis/LoopInfo.h" 87 #include "llvm/Analysis/ScalarEvolution.h" 88 #include "llvm/Analysis/ScalarEvolutionExpressions.h" 89 #include "llvm/IR/BasicBlock.h" 90 #include "llvm/IR/CFG.h" 91 #include "llvm/IR/Dominators.h" 92 #include "llvm/IR/Instruction.h" 93 #include "llvm/IR/Instructions.h" 94 #include "llvm/IR/IntrinsicInst.h" 95 #include "llvm/IR/IntrinsicsPowerPC.h" 96 #include "llvm/IR/Type.h" 97 #include "llvm/IR/Value.h" 98 #include "llvm/Pass.h" 99 #include "llvm/Support/Casting.h" 100 #include "llvm/Support/CommandLine.h" 101 #include "llvm/Support/Debug.h" 102 #include "llvm/Transforms/Scalar.h" 103 #include "llvm/Transforms/Utils.h" 104 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 105 #include "llvm/Transforms/Utils/Local.h" 106 #include "llvm/Transforms/Utils/LoopUtils.h" 107 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" 108 #include <cassert> 109 #include <cmath> 110 #include <utility> 111 112 #define DEBUG_TYPE "ppc-loop-instr-form-prep" 113 114 using namespace llvm; 115 116 static cl::opt<unsigned> 117 MaxVarsPrep("ppc-formprep-max-vars", cl::Hidden, cl::init(24), 118 cl::desc("Potential common base number threshold per function " 119 "for PPC loop prep")); 120 121 static cl::opt<bool> PreferUpdateForm("ppc-formprep-prefer-update", 122 cl::init(true), cl::Hidden, 123 cl::desc("prefer update form when ds form is also a update form")); 124 125 static cl::opt<bool> EnableUpdateFormForNonConstInc( 126 "ppc-formprep-update-nonconst-inc", cl::init(false), cl::Hidden, 127 cl::desc("prepare update form when the load/store increment is a loop " 128 "invariant non-const value.")); 129 130 static cl::opt<bool> EnableChainCommoning( 131 "ppc-formprep-chain-commoning", cl::init(false), cl::Hidden, 132 cl::desc("Enable chain commoning in PPC loop prepare pass.")); 133 134 // Sum of following 3 per loop thresholds for all loops can not be larger 135 // than MaxVarsPrep. 136 // now the thresholds for each kind prep are exterimental values on Power9. 137 static cl::opt<unsigned> MaxVarsUpdateForm("ppc-preinc-prep-max-vars", 138 cl::Hidden, cl::init(3), 139 cl::desc("Potential PHI threshold per loop for PPC loop prep of update " 140 "form")); 141 142 static cl::opt<unsigned> MaxVarsDSForm("ppc-dsprep-max-vars", 143 cl::Hidden, cl::init(3), 144 cl::desc("Potential PHI threshold per loop for PPC loop prep of DS form")); 145 146 static cl::opt<unsigned> MaxVarsDQForm("ppc-dqprep-max-vars", 147 cl::Hidden, cl::init(8), 148 cl::desc("Potential PHI threshold per loop for PPC loop prep of DQ form")); 149 150 // Commoning chain will reduce the register pressure, so we don't consider about 151 // the PHI nodes number. 152 // But commoning chain will increase the addi/add number in the loop and also 153 // increase loop ILP. Maximum chain number should be same with hardware 154 // IssueWidth, because we won't benefit from ILP if the parallel chains number 155 // is bigger than IssueWidth. We assume there are 2 chains in one bucket, so 156 // there would be 4 buckets at most on P9(IssueWidth is 8). 157 static cl::opt<unsigned> MaxVarsChainCommon( 158 "ppc-chaincommon-max-vars", cl::Hidden, cl::init(4), 159 cl::desc("Bucket number per loop for PPC loop chain common")); 160 161 // If would not be profitable if the common base has only one load/store, ISEL 162 // should already be able to choose best load/store form based on offset for 163 // single load/store. Set minimal profitable value default to 2 and make it as 164 // an option. 165 static cl::opt<unsigned> DispFormPrepMinThreshold("ppc-dispprep-min-threshold", 166 cl::Hidden, cl::init(2), 167 cl::desc("Minimal common base load/store instructions triggering DS/DQ form " 168 "preparation")); 169 170 static cl::opt<unsigned> ChainCommonPrepMinThreshold( 171 "ppc-chaincommon-min-threshold", cl::Hidden, cl::init(4), 172 cl::desc("Minimal common base load/store instructions triggering chain " 173 "commoning preparation. Must be not smaller than 4")); 174 175 STATISTIC(PHINodeAlreadyExistsUpdate, "PHI node already in pre-increment form"); 176 STATISTIC(PHINodeAlreadyExistsDS, "PHI node already in DS form"); 177 STATISTIC(PHINodeAlreadyExistsDQ, "PHI node already in DQ form"); 178 STATISTIC(DSFormChainRewritten, "Num of DS form chain rewritten"); 179 STATISTIC(DQFormChainRewritten, "Num of DQ form chain rewritten"); 180 STATISTIC(UpdFormChainRewritten, "Num of update form chain rewritten"); 181 STATISTIC(ChainCommoningRewritten, "Num of commoning chains"); 182 183 namespace { 184 struct BucketElement { 185 BucketElement(const SCEV *O, Instruction *I) : Offset(O), Instr(I) {} 186 BucketElement(Instruction *I) : Offset(nullptr), Instr(I) {} 187 188 const SCEV *Offset; 189 Instruction *Instr; 190 }; 191 192 struct Bucket { 193 Bucket(const SCEV *B, Instruction *I) 194 : BaseSCEV(B), Elements(1, BucketElement(I)) { 195 ChainSize = 0; 196 } 197 198 // The base of the whole bucket. 199 const SCEV *BaseSCEV; 200 201 // All elements in the bucket. In the bucket, the element with the BaseSCEV 202 // has no offset and all other elements are stored as offsets to the 203 // BaseSCEV. 204 SmallVector<BucketElement, 16> Elements; 205 206 // The potential chains size. This is used for chain commoning only. 207 unsigned ChainSize; 208 209 // The base for each potential chain. This is used for chain commoning only. 210 SmallVector<BucketElement, 16> ChainBases; 211 }; 212 213 // "UpdateForm" is not a real PPC instruction form, it stands for dform 214 // load/store with update like ldu/stdu, or Prefetch intrinsic. 215 // For DS form instructions, their displacements must be multiple of 4. 216 // For DQ form instructions, their displacements must be multiple of 16. 217 enum PrepForm { UpdateForm = 1, DSForm = 4, DQForm = 16, ChainCommoning }; 218 219 class PPCLoopInstrFormPrep : public FunctionPass { 220 public: 221 static char ID; // Pass ID, replacement for typeid 222 223 PPCLoopInstrFormPrep(PPCTargetMachine &TM) : FunctionPass(ID), TM(&TM) {} 224 225 void getAnalysisUsage(AnalysisUsage &AU) const override { 226 AU.addPreserved<DominatorTreeWrapperPass>(); 227 AU.addRequired<LoopInfoWrapperPass>(); 228 AU.addPreserved<LoopInfoWrapperPass>(); 229 AU.addRequired<ScalarEvolutionWrapperPass>(); 230 } 231 232 bool runOnFunction(Function &F) override; 233 234 private: 235 PPCTargetMachine *TM = nullptr; 236 const PPCSubtarget *ST; 237 DominatorTree *DT; 238 LoopInfo *LI; 239 ScalarEvolution *SE; 240 bool PreserveLCSSA; 241 bool HasCandidateForPrepare; 242 243 /// Successful preparation number for Update/DS/DQ form in all inner most 244 /// loops. One successful preparation will put one common base out of loop, 245 /// this may leads to register presure like LICM does. 246 /// Make sure total preparation number can be controlled by option. 247 unsigned SuccPrepCount; 248 249 bool runOnLoop(Loop *L); 250 251 /// Check if required PHI node is already exist in Loop \p L. 252 bool alreadyPrepared(Loop *L, Instruction *MemI, 253 const SCEV *BasePtrStartSCEV, 254 const SCEV *BasePtrIncSCEV, PrepForm Form); 255 256 /// Get the value which defines the increment SCEV \p BasePtrIncSCEV. 257 Value *getNodeForInc(Loop *L, Instruction *MemI, 258 const SCEV *BasePtrIncSCEV); 259 260 /// Common chains to reuse offsets for a loop to reduce register pressure. 261 bool chainCommoning(Loop *L, SmallVector<Bucket, 16> &Buckets); 262 263 /// Find out the potential commoning chains and their bases. 264 bool prepareBasesForCommoningChains(Bucket &BucketChain); 265 266 /// Rewrite load/store according to the common chains. 267 bool 268 rewriteLoadStoresForCommoningChains(Loop *L, Bucket &Bucket, 269 SmallSet<BasicBlock *, 16> &BBChanged); 270 271 /// Collect condition matched(\p isValidCandidate() returns true) 272 /// candidates in Loop \p L. 273 SmallVector<Bucket, 16> collectCandidates( 274 Loop *L, 275 std::function<bool(const Instruction *, Value *, const Type *)> 276 isValidCandidate, 277 std::function<bool(const SCEV *)> isValidDiff, 278 unsigned MaxCandidateNum); 279 280 /// Add a candidate to candidates \p Buckets if diff between candidate and 281 /// one base in \p Buckets matches \p isValidDiff. 282 void addOneCandidate(Instruction *MemI, const SCEV *LSCEV, 283 SmallVector<Bucket, 16> &Buckets, 284 std::function<bool(const SCEV *)> isValidDiff, 285 unsigned MaxCandidateNum); 286 287 /// Prepare all candidates in \p Buckets for update form. 288 bool updateFormPrep(Loop *L, SmallVector<Bucket, 16> &Buckets); 289 290 /// Prepare all candidates in \p Buckets for displacement form, now for 291 /// ds/dq. 292 bool dispFormPrep(Loop *L, SmallVector<Bucket, 16> &Buckets, PrepForm Form); 293 294 /// Prepare for one chain \p BucketChain, find the best base element and 295 /// update all other elements in \p BucketChain accordingly. 296 /// \p Form is used to find the best base element. 297 /// If success, best base element must be stored as the first element of 298 /// \p BucketChain. 299 /// Return false if no base element found, otherwise return true. 300 bool prepareBaseForDispFormChain(Bucket &BucketChain, PrepForm Form); 301 302 /// Prepare for one chain \p BucketChain, find the best base element and 303 /// update all other elements in \p BucketChain accordingly. 304 /// If success, best base element must be stored as the first element of 305 /// \p BucketChain. 306 /// Return false if no base element found, otherwise return true. 307 bool prepareBaseForUpdateFormChain(Bucket &BucketChain); 308 309 /// Rewrite load/store instructions in \p BucketChain according to 310 /// preparation. 311 bool rewriteLoadStores(Loop *L, Bucket &BucketChain, 312 SmallSet<BasicBlock *, 16> &BBChanged, 313 PrepForm Form); 314 315 /// Rewrite for the base load/store of a chain. 316 std::pair<Instruction *, Instruction *> 317 rewriteForBase(Loop *L, const SCEVAddRecExpr *BasePtrSCEV, 318 Instruction *BaseMemI, bool CanPreInc, PrepForm Form, 319 SCEVExpander &SCEVE, SmallPtrSet<Value *, 16> &DeletedPtrs); 320 321 /// Rewrite for the other load/stores of a chain according to the new \p 322 /// Base. 323 Instruction * 324 rewriteForBucketElement(std::pair<Instruction *, Instruction *> Base, 325 const BucketElement &Element, Value *OffToBase, 326 SmallPtrSet<Value *, 16> &DeletedPtrs); 327 }; 328 329 } // end anonymous namespace 330 331 char PPCLoopInstrFormPrep::ID = 0; 332 static const char *name = "Prepare loop for ppc preferred instruction forms"; 333 INITIALIZE_PASS_BEGIN(PPCLoopInstrFormPrep, DEBUG_TYPE, name, false, false) 334 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 335 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 336 INITIALIZE_PASS_END(PPCLoopInstrFormPrep, DEBUG_TYPE, name, false, false) 337 338 static constexpr StringRef PHINodeNameSuffix = ".phi"; 339 static constexpr StringRef CastNodeNameSuffix = ".cast"; 340 static constexpr StringRef GEPNodeIncNameSuffix = ".inc"; 341 static constexpr StringRef GEPNodeOffNameSuffix = ".off"; 342 343 FunctionPass *llvm::createPPCLoopInstrFormPrepPass(PPCTargetMachine &TM) { 344 return new PPCLoopInstrFormPrep(TM); 345 } 346 347 static bool IsPtrInBounds(Value *BasePtr) { 348 Value *StrippedBasePtr = BasePtr; 349 while (BitCastInst *BC = dyn_cast<BitCastInst>(StrippedBasePtr)) 350 StrippedBasePtr = BC->getOperand(0); 351 if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(StrippedBasePtr)) 352 return GEP->isInBounds(); 353 354 return false; 355 } 356 357 static std::string getInstrName(const Value *I, StringRef Suffix) { 358 assert(I && "Invalid paramater!"); 359 if (I->hasName()) 360 return (I->getName() + Suffix).str(); 361 else 362 return ""; 363 } 364 365 static Value *getPointerOperandAndType(Value *MemI, 366 Type **PtrElementType = nullptr) { 367 368 Value *PtrValue = nullptr; 369 Type *PointerElementType = nullptr; 370 371 if (LoadInst *LMemI = dyn_cast<LoadInst>(MemI)) { 372 PtrValue = LMemI->getPointerOperand(); 373 PointerElementType = LMemI->getType(); 374 } else if (StoreInst *SMemI = dyn_cast<StoreInst>(MemI)) { 375 PtrValue = SMemI->getPointerOperand(); 376 PointerElementType = SMemI->getValueOperand()->getType(); 377 } else if (IntrinsicInst *IMemI = dyn_cast<IntrinsicInst>(MemI)) { 378 PointerElementType = Type::getInt8Ty(MemI->getContext()); 379 if (IMemI->getIntrinsicID() == Intrinsic::prefetch || 380 IMemI->getIntrinsicID() == Intrinsic::ppc_vsx_lxvp) { 381 PtrValue = IMemI->getArgOperand(0); 382 } else if (IMemI->getIntrinsicID() == Intrinsic::ppc_vsx_stxvp) { 383 PtrValue = IMemI->getArgOperand(1); 384 } 385 } 386 /*Get ElementType if PtrElementType is not null.*/ 387 if (PtrElementType) 388 *PtrElementType = PointerElementType; 389 390 return PtrValue; 391 } 392 393 bool PPCLoopInstrFormPrep::runOnFunction(Function &F) { 394 if (skipFunction(F)) 395 return false; 396 397 LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 398 SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 399 auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); 400 DT = DTWP ? &DTWP->getDomTree() : nullptr; 401 PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); 402 ST = TM ? TM->getSubtargetImpl(F) : nullptr; 403 SuccPrepCount = 0; 404 405 bool MadeChange = false; 406 407 for (Loop *I : *LI) 408 for (Loop *L : depth_first(I)) 409 MadeChange |= runOnLoop(L); 410 411 return MadeChange; 412 } 413 414 // Finding the minimal(chain_number + reusable_offset_number) is a complicated 415 // algorithmic problem. 416 // For now, the algorithm used here is simply adjusted to handle the case for 417 // manually unrolling cases. 418 // FIXME: use a more powerful algorithm to find minimal sum of chain_number and 419 // reusable_offset_number for one base with multiple offsets. 420 bool PPCLoopInstrFormPrep::prepareBasesForCommoningChains(Bucket &CBucket) { 421 // The minimal size for profitable chain commoning: 422 // A1 = base + offset1 423 // A2 = base + offset2 (offset2 - offset1 = X) 424 // A3 = base + offset3 425 // A4 = base + offset4 (offset4 - offset3 = X) 426 // ======> 427 // base1 = base + offset1 428 // base2 = base + offset3 429 // A1 = base1 430 // A2 = base1 + X 431 // A3 = base2 432 // A4 = base2 + X 433 // 434 // There is benefit because of reuse of offest 'X'. 435 436 assert(ChainCommonPrepMinThreshold >= 4 && 437 "Thredhold can not be smaller than 4!\n"); 438 if (CBucket.Elements.size() < ChainCommonPrepMinThreshold) 439 return false; 440 441 // We simply select the FirstOffset as the first reusable offset between each 442 // chain element 1 and element 0. 443 const SCEV *FirstOffset = CBucket.Elements[1].Offset; 444 445 // Figure out how many times above FirstOffset is used in the chain. 446 // For a success commoning chain candidate, offset difference between each 447 // chain element 1 and element 0 must be also FirstOffset. 448 unsigned FirstOffsetReusedCount = 1; 449 450 // Figure out how many times above FirstOffset is used in the first chain. 451 // Chain number is FirstOffsetReusedCount / FirstOffsetReusedCountInFirstChain 452 unsigned FirstOffsetReusedCountInFirstChain = 1; 453 454 unsigned EleNum = CBucket.Elements.size(); 455 bool SawChainSeparater = false; 456 for (unsigned j = 2; j != EleNum; ++j) { 457 if (SE->getMinusSCEV(CBucket.Elements[j].Offset, 458 CBucket.Elements[j - 1].Offset) == FirstOffset) { 459 if (!SawChainSeparater) 460 FirstOffsetReusedCountInFirstChain++; 461 FirstOffsetReusedCount++; 462 } else 463 // For now, if we meet any offset which is not FirstOffset, we assume we 464 // find a new Chain. 465 // This makes us miss some opportunities. 466 // For example, we can common: 467 // 468 // {OffsetA, Offset A, OffsetB, OffsetA, OffsetA, OffsetB} 469 // 470 // as two chains: 471 // {{OffsetA, Offset A, OffsetB}, {OffsetA, OffsetA, OffsetB}} 472 // FirstOffsetReusedCount = 4; FirstOffsetReusedCountInFirstChain = 2 473 // 474 // But we fail to common: 475 // 476 // {OffsetA, OffsetB, OffsetA, OffsetA, OffsetB, OffsetA} 477 // FirstOffsetReusedCount = 4; FirstOffsetReusedCountInFirstChain = 1 478 479 SawChainSeparater = true; 480 } 481 482 // FirstOffset is not reused, skip this bucket. 483 if (FirstOffsetReusedCount == 1) 484 return false; 485 486 unsigned ChainNum = 487 FirstOffsetReusedCount / FirstOffsetReusedCountInFirstChain; 488 489 // All elements are increased by FirstOffset. 490 // The number of chains should be sqrt(EleNum). 491 if (!SawChainSeparater) 492 ChainNum = (unsigned)sqrt((double)EleNum); 493 494 CBucket.ChainSize = (unsigned)(EleNum / ChainNum); 495 496 // If this is not a perfect chain(eg: not all elements can be put inside 497 // commoning chains.), skip now. 498 if (CBucket.ChainSize * ChainNum != EleNum) 499 return false; 500 501 if (SawChainSeparater) { 502 // Check that the offset seqs are the same for all chains. 503 for (unsigned i = 1; i < CBucket.ChainSize; i++) 504 for (unsigned j = 1; j < ChainNum; j++) 505 if (CBucket.Elements[i].Offset != 506 SE->getMinusSCEV(CBucket.Elements[i + j * CBucket.ChainSize].Offset, 507 CBucket.Elements[j * CBucket.ChainSize].Offset)) 508 return false; 509 } 510 511 for (unsigned i = 0; i < ChainNum; i++) 512 CBucket.ChainBases.push_back(CBucket.Elements[i * CBucket.ChainSize]); 513 514 LLVM_DEBUG(dbgs() << "Bucket has " << ChainNum << " chains.\n"); 515 516 return true; 517 } 518 519 bool PPCLoopInstrFormPrep::chainCommoning(Loop *L, 520 SmallVector<Bucket, 16> &Buckets) { 521 bool MadeChange = false; 522 523 if (Buckets.empty()) 524 return MadeChange; 525 526 SmallSet<BasicBlock *, 16> BBChanged; 527 528 for (auto &Bucket : Buckets) { 529 if (prepareBasesForCommoningChains(Bucket)) 530 MadeChange |= rewriteLoadStoresForCommoningChains(L, Bucket, BBChanged); 531 } 532 533 if (MadeChange) 534 for (auto *BB : BBChanged) 535 DeleteDeadPHIs(BB); 536 return MadeChange; 537 } 538 539 bool PPCLoopInstrFormPrep::rewriteLoadStoresForCommoningChains( 540 Loop *L, Bucket &Bucket, SmallSet<BasicBlock *, 16> &BBChanged) { 541 bool MadeChange = false; 542 543 assert(Bucket.Elements.size() == 544 Bucket.ChainBases.size() * Bucket.ChainSize && 545 "invalid bucket for chain commoning!\n"); 546 SmallPtrSet<Value *, 16> DeletedPtrs; 547 548 BasicBlock *Header = L->getHeader(); 549 BasicBlock *LoopPredecessor = L->getLoopPredecessor(); 550 551 SCEVExpander SCEVE(*SE, Header->getDataLayout(), 552 "loopprepare-chaincommon"); 553 554 for (unsigned ChainIdx = 0; ChainIdx < Bucket.ChainBases.size(); ++ChainIdx) { 555 unsigned BaseElemIdx = Bucket.ChainSize * ChainIdx; 556 const SCEV *BaseSCEV = 557 ChainIdx ? SE->getAddExpr(Bucket.BaseSCEV, 558 Bucket.Elements[BaseElemIdx].Offset) 559 : Bucket.BaseSCEV; 560 const SCEVAddRecExpr *BasePtrSCEV = cast<SCEVAddRecExpr>(BaseSCEV); 561 562 // Make sure the base is able to expand. 563 if (!SCEVE.isSafeToExpand(BasePtrSCEV->getStart())) 564 return MadeChange; 565 566 assert(BasePtrSCEV->isAffine() && 567 "Invalid SCEV type for the base ptr for a candidate chain!\n"); 568 569 std::pair<Instruction *, Instruction *> Base = rewriteForBase( 570 L, BasePtrSCEV, Bucket.Elements[BaseElemIdx].Instr, 571 false /* CanPreInc */, ChainCommoning, SCEVE, DeletedPtrs); 572 573 if (!Base.first || !Base.second) 574 return MadeChange; 575 576 // Keep track of the replacement pointer values we've inserted so that we 577 // don't generate more pointer values than necessary. 578 SmallPtrSet<Value *, 16> NewPtrs; 579 NewPtrs.insert(Base.first); 580 581 for (unsigned Idx = BaseElemIdx + 1; Idx < BaseElemIdx + Bucket.ChainSize; 582 ++Idx) { 583 BucketElement &I = Bucket.Elements[Idx]; 584 Value *Ptr = getPointerOperandAndType(I.Instr); 585 assert(Ptr && "No pointer operand"); 586 if (NewPtrs.count(Ptr)) 587 continue; 588 589 const SCEV *OffsetSCEV = 590 BaseElemIdx ? SE->getMinusSCEV(Bucket.Elements[Idx].Offset, 591 Bucket.Elements[BaseElemIdx].Offset) 592 : Bucket.Elements[Idx].Offset; 593 594 // Make sure offset is able to expand. Only need to check one time as the 595 // offsets are reused between different chains. 596 if (!BaseElemIdx) 597 if (!SCEVE.isSafeToExpand(OffsetSCEV)) 598 return false; 599 600 Value *OffsetValue = SCEVE.expandCodeFor( 601 OffsetSCEV, OffsetSCEV->getType(), LoopPredecessor->getTerminator()); 602 603 Instruction *NewPtr = rewriteForBucketElement(Base, Bucket.Elements[Idx], 604 OffsetValue, DeletedPtrs); 605 606 assert(NewPtr && "Wrong rewrite!\n"); 607 NewPtrs.insert(NewPtr); 608 } 609 610 ++ChainCommoningRewritten; 611 } 612 613 // Clear the rewriter cache, because values that are in the rewriter's cache 614 // can be deleted below, causing the AssertingVH in the cache to trigger. 615 SCEVE.clear(); 616 617 for (auto *Ptr : DeletedPtrs) { 618 if (Instruction *IDel = dyn_cast<Instruction>(Ptr)) 619 BBChanged.insert(IDel->getParent()); 620 RecursivelyDeleteTriviallyDeadInstructions(Ptr); 621 } 622 623 MadeChange = true; 624 return MadeChange; 625 } 626 627 // Rewrite the new base according to BasePtrSCEV. 628 // bb.loop.preheader: 629 // %newstart = ... 630 // bb.loop.body: 631 // %phinode = phi [ %newstart, %bb.loop.preheader ], [ %add, %bb.loop.body ] 632 // ... 633 // %add = getelementptr %phinode, %inc 634 // 635 // First returned instruciton is %phinode (or a type cast to %phinode), caller 636 // needs this value to rewrite other load/stores in the same chain. 637 // Second returned instruction is %add, caller needs this value to rewrite other 638 // load/stores in the same chain. 639 std::pair<Instruction *, Instruction *> 640 PPCLoopInstrFormPrep::rewriteForBase(Loop *L, const SCEVAddRecExpr *BasePtrSCEV, 641 Instruction *BaseMemI, bool CanPreInc, 642 PrepForm Form, SCEVExpander &SCEVE, 643 SmallPtrSet<Value *, 16> &DeletedPtrs) { 644 645 LLVM_DEBUG(dbgs() << "PIP: Transforming: " << *BasePtrSCEV << "\n"); 646 647 assert(BasePtrSCEV->getLoop() == L && "AddRec for the wrong loop?"); 648 649 Value *BasePtr = getPointerOperandAndType(BaseMemI); 650 assert(BasePtr && "No pointer operand"); 651 652 Type *I8Ty = Type::getInt8Ty(BaseMemI->getParent()->getContext()); 653 Type *I8PtrTy = 654 PointerType::get(BaseMemI->getParent()->getContext(), 655 BasePtr->getType()->getPointerAddressSpace()); 656 657 bool IsConstantInc = false; 658 const SCEV *BasePtrIncSCEV = BasePtrSCEV->getStepRecurrence(*SE); 659 Value *IncNode = getNodeForInc(L, BaseMemI, BasePtrIncSCEV); 660 661 const SCEVConstant *BasePtrIncConstantSCEV = 662 dyn_cast<SCEVConstant>(BasePtrIncSCEV); 663 if (BasePtrIncConstantSCEV) 664 IsConstantInc = true; 665 666 // No valid representation for the increment. 667 if (!IncNode) { 668 LLVM_DEBUG(dbgs() << "Loop Increasement can not be represented!\n"); 669 return std::make_pair(nullptr, nullptr); 670 } 671 672 if (Form == UpdateForm && !IsConstantInc && !EnableUpdateFormForNonConstInc) { 673 LLVM_DEBUG( 674 dbgs() 675 << "Update form prepare for non-const increment is not enabled!\n"); 676 return std::make_pair(nullptr, nullptr); 677 } 678 679 const SCEV *BasePtrStartSCEV = nullptr; 680 if (CanPreInc) { 681 assert(SE->isLoopInvariant(BasePtrIncSCEV, L) && 682 "Increment is not loop invariant!\n"); 683 BasePtrStartSCEV = SE->getMinusSCEV(BasePtrSCEV->getStart(), 684 IsConstantInc ? BasePtrIncConstantSCEV 685 : BasePtrIncSCEV); 686 } else 687 BasePtrStartSCEV = BasePtrSCEV->getStart(); 688 689 if (alreadyPrepared(L, BaseMemI, BasePtrStartSCEV, BasePtrIncSCEV, Form)) { 690 LLVM_DEBUG(dbgs() << "Instruction form is already prepared!\n"); 691 return std::make_pair(nullptr, nullptr); 692 } 693 694 LLVM_DEBUG(dbgs() << "PIP: New start is: " << *BasePtrStartSCEV << "\n"); 695 696 BasicBlock *Header = L->getHeader(); 697 unsigned HeaderLoopPredCount = pred_size(Header); 698 BasicBlock *LoopPredecessor = L->getLoopPredecessor(); 699 700 PHINode *NewPHI = PHINode::Create(I8PtrTy, HeaderLoopPredCount, 701 getInstrName(BaseMemI, PHINodeNameSuffix)); 702 NewPHI->insertBefore(Header->getFirstNonPHIIt()); 703 704 Value *BasePtrStart = SCEVE.expandCodeFor(BasePtrStartSCEV, I8PtrTy, 705 LoopPredecessor->getTerminator()); 706 707 // Note that LoopPredecessor might occur in the predecessor list multiple 708 // times, and we need to add it the right number of times. 709 for (auto *PI : predecessors(Header)) { 710 if (PI != LoopPredecessor) 711 continue; 712 713 NewPHI->addIncoming(BasePtrStart, LoopPredecessor); 714 } 715 716 Instruction *PtrInc = nullptr; 717 Instruction *NewBasePtr = nullptr; 718 if (CanPreInc) { 719 BasicBlock::iterator InsPoint = Header->getFirstInsertionPt(); 720 PtrInc = GetElementPtrInst::Create( 721 I8Ty, NewPHI, IncNode, getInstrName(BaseMemI, GEPNodeIncNameSuffix), 722 InsPoint); 723 cast<GetElementPtrInst>(PtrInc)->setIsInBounds(IsPtrInBounds(BasePtr)); 724 for (auto *PI : predecessors(Header)) { 725 if (PI == LoopPredecessor) 726 continue; 727 728 NewPHI->addIncoming(PtrInc, PI); 729 } 730 if (PtrInc->getType() != BasePtr->getType()) 731 NewBasePtr = 732 new BitCastInst(PtrInc, BasePtr->getType(), 733 getInstrName(PtrInc, CastNodeNameSuffix), InsPoint); 734 else 735 NewBasePtr = PtrInc; 736 } else { 737 // Note that LoopPredecessor might occur in the predecessor list multiple 738 // times, and we need to make sure no more incoming value for them in PHI. 739 for (auto *PI : predecessors(Header)) { 740 if (PI == LoopPredecessor) 741 continue; 742 743 // For the latch predecessor, we need to insert a GEP just before the 744 // terminator to increase the address. 745 BasicBlock *BB = PI; 746 BasicBlock::iterator InsPoint = BB->getTerminator()->getIterator(); 747 PtrInc = GetElementPtrInst::Create( 748 I8Ty, NewPHI, IncNode, getInstrName(BaseMemI, GEPNodeIncNameSuffix), 749 InsPoint); 750 cast<GetElementPtrInst>(PtrInc)->setIsInBounds(IsPtrInBounds(BasePtr)); 751 752 NewPHI->addIncoming(PtrInc, PI); 753 } 754 PtrInc = NewPHI; 755 if (NewPHI->getType() != BasePtr->getType()) 756 NewBasePtr = new BitCastInst(NewPHI, BasePtr->getType(), 757 getInstrName(NewPHI, CastNodeNameSuffix), 758 Header->getFirstInsertionPt()); 759 else 760 NewBasePtr = NewPHI; 761 } 762 763 BasePtr->replaceAllUsesWith(NewBasePtr); 764 765 DeletedPtrs.insert(BasePtr); 766 767 return std::make_pair(NewBasePtr, PtrInc); 768 } 769 770 Instruction *PPCLoopInstrFormPrep::rewriteForBucketElement( 771 std::pair<Instruction *, Instruction *> Base, const BucketElement &Element, 772 Value *OffToBase, SmallPtrSet<Value *, 16> &DeletedPtrs) { 773 Instruction *NewBasePtr = Base.first; 774 Instruction *PtrInc = Base.second; 775 assert((NewBasePtr && PtrInc) && "base does not exist!\n"); 776 777 Type *I8Ty = Type::getInt8Ty(PtrInc->getParent()->getContext()); 778 779 Value *Ptr = getPointerOperandAndType(Element.Instr); 780 assert(Ptr && "No pointer operand"); 781 782 Instruction *RealNewPtr; 783 if (!Element.Offset || 784 (isa<SCEVConstant>(Element.Offset) && 785 cast<SCEVConstant>(Element.Offset)->getValue()->isZero())) { 786 RealNewPtr = NewBasePtr; 787 } else { 788 std::optional<BasicBlock::iterator> PtrIP = std::nullopt; 789 if (Instruction *I = dyn_cast<Instruction>(Ptr)) 790 PtrIP = I->getIterator(); 791 792 if (PtrIP && isa<Instruction>(NewBasePtr) && 793 cast<Instruction>(NewBasePtr)->getParent() == (*PtrIP)->getParent()) 794 PtrIP = std::nullopt; 795 else if (PtrIP && isa<PHINode>(*PtrIP)) 796 PtrIP = (*PtrIP)->getParent()->getFirstInsertionPt(); 797 else if (!PtrIP) 798 PtrIP = Element.Instr->getIterator(); 799 800 assert(OffToBase && "There should be an offset for non base element!\n"); 801 GetElementPtrInst *NewPtr = GetElementPtrInst::Create( 802 I8Ty, PtrInc, OffToBase, 803 getInstrName(Element.Instr, GEPNodeOffNameSuffix)); 804 if (PtrIP) 805 NewPtr->insertBefore(*(*PtrIP)->getParent(), *PtrIP); 806 else 807 NewPtr->insertAfter(cast<Instruction>(PtrInc)); 808 NewPtr->setIsInBounds(IsPtrInBounds(Ptr)); 809 RealNewPtr = NewPtr; 810 } 811 812 Instruction *ReplNewPtr; 813 if (Ptr->getType() != RealNewPtr->getType()) { 814 ReplNewPtr = new BitCastInst(RealNewPtr, Ptr->getType(), 815 getInstrName(Ptr, CastNodeNameSuffix)); 816 ReplNewPtr->insertAfter(RealNewPtr); 817 } else 818 ReplNewPtr = RealNewPtr; 819 820 Ptr->replaceAllUsesWith(ReplNewPtr); 821 DeletedPtrs.insert(Ptr); 822 823 return ReplNewPtr; 824 } 825 826 void PPCLoopInstrFormPrep::addOneCandidate( 827 Instruction *MemI, const SCEV *LSCEV, SmallVector<Bucket, 16> &Buckets, 828 std::function<bool(const SCEV *)> isValidDiff, unsigned MaxCandidateNum) { 829 assert((MemI && getPointerOperandAndType(MemI)) && 830 "Candidate should be a memory instruction."); 831 assert(LSCEV && "Invalid SCEV for Ptr value."); 832 833 bool FoundBucket = false; 834 for (auto &B : Buckets) { 835 if (cast<SCEVAddRecExpr>(B.BaseSCEV)->getStepRecurrence(*SE) != 836 cast<SCEVAddRecExpr>(LSCEV)->getStepRecurrence(*SE)) 837 continue; 838 const SCEV *Diff = SE->getMinusSCEV(LSCEV, B.BaseSCEV); 839 if (isValidDiff(Diff)) { 840 B.Elements.push_back(BucketElement(Diff, MemI)); 841 FoundBucket = true; 842 break; 843 } 844 } 845 846 if (!FoundBucket) { 847 if (Buckets.size() == MaxCandidateNum) { 848 LLVM_DEBUG(dbgs() << "Can not prepare more chains, reach maximum limit " 849 << MaxCandidateNum << "\n"); 850 return; 851 } 852 Buckets.push_back(Bucket(LSCEV, MemI)); 853 } 854 } 855 856 SmallVector<Bucket, 16> PPCLoopInstrFormPrep::collectCandidates( 857 Loop *L, 858 std::function<bool(const Instruction *, Value *, const Type *)> 859 isValidCandidate, 860 std::function<bool(const SCEV *)> isValidDiff, unsigned MaxCandidateNum) { 861 SmallVector<Bucket, 16> Buckets; 862 863 for (const auto &BB : L->blocks()) 864 for (auto &J : *BB) { 865 Value *PtrValue = nullptr; 866 Type *PointerElementType = nullptr; 867 PtrValue = getPointerOperandAndType(&J, &PointerElementType); 868 869 if (!PtrValue) 870 continue; 871 872 if (PtrValue->getType()->getPointerAddressSpace()) 873 continue; 874 875 if (L->isLoopInvariant(PtrValue)) 876 continue; 877 878 const SCEV *LSCEV = SE->getSCEVAtScope(PtrValue, L); 879 const SCEVAddRecExpr *LARSCEV = dyn_cast<SCEVAddRecExpr>(LSCEV); 880 if (!LARSCEV || LARSCEV->getLoop() != L) 881 continue; 882 883 // Mark that we have candidates for preparing. 884 HasCandidateForPrepare = true; 885 886 if (isValidCandidate(&J, PtrValue, PointerElementType)) 887 addOneCandidate(&J, LSCEV, Buckets, isValidDiff, MaxCandidateNum); 888 } 889 return Buckets; 890 } 891 892 bool PPCLoopInstrFormPrep::prepareBaseForDispFormChain(Bucket &BucketChain, 893 PrepForm Form) { 894 // RemainderOffsetInfo details: 895 // key: value of (Offset urem DispConstraint). For DSForm, it can 896 // be [0, 4). 897 // first of pair: the index of first BucketElement whose remainder is equal 898 // to key. For key 0, this value must be 0. 899 // second of pair: number of load/stores with the same remainder. 900 DenseMap<unsigned, std::pair<unsigned, unsigned>> RemainderOffsetInfo; 901 902 for (unsigned j = 0, je = BucketChain.Elements.size(); j != je; ++j) { 903 if (!BucketChain.Elements[j].Offset) 904 RemainderOffsetInfo[0] = std::make_pair(0, 1); 905 else { 906 unsigned Remainder = cast<SCEVConstant>(BucketChain.Elements[j].Offset) 907 ->getAPInt() 908 .urem(Form); 909 if (!RemainderOffsetInfo.contains(Remainder)) 910 RemainderOffsetInfo[Remainder] = std::make_pair(j, 1); 911 else 912 RemainderOffsetInfo[Remainder].second++; 913 } 914 } 915 // Currently we choose the most profitable base as the one which has the max 916 // number of load/store with same remainder. 917 // FIXME: adjust the base selection strategy according to load/store offset 918 // distribution. 919 // For example, if we have one candidate chain for DS form preparation, which 920 // contains following load/stores with different remainders: 921 // 1: 10 load/store whose remainder is 1; 922 // 2: 9 load/store whose remainder is 2; 923 // 3: 1 for remainder 3 and 0 for remainder 0; 924 // Now we will choose the first load/store whose remainder is 1 as base and 925 // adjust all other load/stores according to new base, so we will get 10 DS 926 // form and 10 X form. 927 // But we should be more clever, for this case we could use two bases, one for 928 // remainder 1 and the other for remainder 2, thus we could get 19 DS form and 929 // 1 X form. 930 unsigned MaxCountRemainder = 0; 931 for (unsigned j = 0; j < (unsigned)Form; j++) 932 if (auto It = RemainderOffsetInfo.find(j); 933 It != RemainderOffsetInfo.end() && 934 It->second.second > RemainderOffsetInfo[MaxCountRemainder].second) 935 MaxCountRemainder = j; 936 937 // Abort when there are too few insts with common base. 938 if (RemainderOffsetInfo[MaxCountRemainder].second < DispFormPrepMinThreshold) 939 return false; 940 941 // If the first value is most profitable, no needed to adjust BucketChain 942 // elements as they are substracted the first value when collecting. 943 if (MaxCountRemainder == 0) 944 return true; 945 946 // Adjust load/store to the new chosen base. 947 const SCEV *Offset = 948 BucketChain.Elements[RemainderOffsetInfo[MaxCountRemainder].first].Offset; 949 BucketChain.BaseSCEV = SE->getAddExpr(BucketChain.BaseSCEV, Offset); 950 for (auto &E : BucketChain.Elements) { 951 if (E.Offset) 952 E.Offset = cast<SCEVConstant>(SE->getMinusSCEV(E.Offset, Offset)); 953 else 954 E.Offset = cast<SCEVConstant>(SE->getNegativeSCEV(Offset)); 955 } 956 957 std::swap(BucketChain.Elements[RemainderOffsetInfo[MaxCountRemainder].first], 958 BucketChain.Elements[0]); 959 return true; 960 } 961 962 // FIXME: implement a more clever base choosing policy. 963 // Currently we always choose an exist load/store offset. This maybe lead to 964 // suboptimal code sequences. For example, for one DS chain with offsets 965 // {-32769, 2003, 2007, 2011}, we choose -32769 as base offset, and left disp 966 // for load/stores are {0, 34772, 34776, 34780}. Though each offset now is a 967 // multipler of 4, it cannot be represented by sint16. 968 bool PPCLoopInstrFormPrep::prepareBaseForUpdateFormChain(Bucket &BucketChain) { 969 // We have a choice now of which instruction's memory operand we use as the 970 // base for the generated PHI. Always picking the first instruction in each 971 // bucket does not work well, specifically because that instruction might 972 // be a prefetch (and there are no pre-increment dcbt variants). Otherwise, 973 // the choice is somewhat arbitrary, because the backend will happily 974 // generate direct offsets from both the pre-incremented and 975 // post-incremented pointer values. Thus, we'll pick the first non-prefetch 976 // instruction in each bucket, and adjust the recurrence and other offsets 977 // accordingly. 978 for (int j = 0, je = BucketChain.Elements.size(); j != je; ++j) { 979 if (auto *II = dyn_cast<IntrinsicInst>(BucketChain.Elements[j].Instr)) 980 if (II->getIntrinsicID() == Intrinsic::prefetch) 981 continue; 982 983 // If we'd otherwise pick the first element anyway, there's nothing to do. 984 if (j == 0) 985 break; 986 987 // If our chosen element has no offset from the base pointer, there's 988 // nothing to do. 989 if (!BucketChain.Elements[j].Offset || 990 cast<SCEVConstant>(BucketChain.Elements[j].Offset)->isZero()) 991 break; 992 993 const SCEV *Offset = BucketChain.Elements[j].Offset; 994 BucketChain.BaseSCEV = SE->getAddExpr(BucketChain.BaseSCEV, Offset); 995 for (auto &E : BucketChain.Elements) { 996 if (E.Offset) 997 E.Offset = cast<SCEVConstant>(SE->getMinusSCEV(E.Offset, Offset)); 998 else 999 E.Offset = cast<SCEVConstant>(SE->getNegativeSCEV(Offset)); 1000 } 1001 1002 std::swap(BucketChain.Elements[j], BucketChain.Elements[0]); 1003 break; 1004 } 1005 return true; 1006 } 1007 1008 bool PPCLoopInstrFormPrep::rewriteLoadStores( 1009 Loop *L, Bucket &BucketChain, SmallSet<BasicBlock *, 16> &BBChanged, 1010 PrepForm Form) { 1011 bool MadeChange = false; 1012 1013 const SCEVAddRecExpr *BasePtrSCEV = 1014 cast<SCEVAddRecExpr>(BucketChain.BaseSCEV); 1015 if (!BasePtrSCEV->isAffine()) 1016 return MadeChange; 1017 1018 BasicBlock *Header = L->getHeader(); 1019 SCEVExpander SCEVE(*SE, Header->getDataLayout(), 1020 "loopprepare-formrewrite"); 1021 if (!SCEVE.isSafeToExpand(BasePtrSCEV->getStart())) 1022 return MadeChange; 1023 1024 SmallPtrSet<Value *, 16> DeletedPtrs; 1025 1026 // For some DS form load/store instructions, it can also be an update form, 1027 // if the stride is constant and is a multipler of 4. Use update form if 1028 // prefer it. 1029 bool CanPreInc = (Form == UpdateForm || 1030 ((Form == DSForm) && 1031 isa<SCEVConstant>(BasePtrSCEV->getStepRecurrence(*SE)) && 1032 !cast<SCEVConstant>(BasePtrSCEV->getStepRecurrence(*SE)) 1033 ->getAPInt() 1034 .urem(4) && 1035 PreferUpdateForm)); 1036 1037 std::pair<Instruction *, Instruction *> Base = 1038 rewriteForBase(L, BasePtrSCEV, BucketChain.Elements.begin()->Instr, 1039 CanPreInc, Form, SCEVE, DeletedPtrs); 1040 1041 if (!Base.first || !Base.second) 1042 return MadeChange; 1043 1044 // Keep track of the replacement pointer values we've inserted so that we 1045 // don't generate more pointer values than necessary. 1046 SmallPtrSet<Value *, 16> NewPtrs; 1047 NewPtrs.insert(Base.first); 1048 1049 for (const BucketElement &BE : llvm::drop_begin(BucketChain.Elements)) { 1050 Value *Ptr = getPointerOperandAndType(BE.Instr); 1051 assert(Ptr && "No pointer operand"); 1052 if (NewPtrs.count(Ptr)) 1053 continue; 1054 1055 Instruction *NewPtr = rewriteForBucketElement( 1056 Base, BE, 1057 BE.Offset ? cast<SCEVConstant>(BE.Offset)->getValue() : nullptr, 1058 DeletedPtrs); 1059 assert(NewPtr && "wrong rewrite!\n"); 1060 NewPtrs.insert(NewPtr); 1061 } 1062 1063 // Clear the rewriter cache, because values that are in the rewriter's cache 1064 // can be deleted below, causing the AssertingVH in the cache to trigger. 1065 SCEVE.clear(); 1066 1067 for (auto *Ptr : DeletedPtrs) { 1068 if (Instruction *IDel = dyn_cast<Instruction>(Ptr)) 1069 BBChanged.insert(IDel->getParent()); 1070 RecursivelyDeleteTriviallyDeadInstructions(Ptr); 1071 } 1072 1073 MadeChange = true; 1074 1075 SuccPrepCount++; 1076 1077 if (Form == DSForm && !CanPreInc) 1078 DSFormChainRewritten++; 1079 else if (Form == DQForm) 1080 DQFormChainRewritten++; 1081 else if (Form == UpdateForm || (Form == DSForm && CanPreInc)) 1082 UpdFormChainRewritten++; 1083 1084 return MadeChange; 1085 } 1086 1087 bool PPCLoopInstrFormPrep::updateFormPrep(Loop *L, 1088 SmallVector<Bucket, 16> &Buckets) { 1089 bool MadeChange = false; 1090 if (Buckets.empty()) 1091 return MadeChange; 1092 SmallSet<BasicBlock *, 16> BBChanged; 1093 for (auto &Bucket : Buckets) 1094 // The base address of each bucket is transformed into a phi and the others 1095 // are rewritten based on new base. 1096 if (prepareBaseForUpdateFormChain(Bucket)) 1097 MadeChange |= rewriteLoadStores(L, Bucket, BBChanged, UpdateForm); 1098 1099 if (MadeChange) 1100 for (auto *BB : BBChanged) 1101 DeleteDeadPHIs(BB); 1102 return MadeChange; 1103 } 1104 1105 bool PPCLoopInstrFormPrep::dispFormPrep(Loop *L, 1106 SmallVector<Bucket, 16> &Buckets, 1107 PrepForm Form) { 1108 bool MadeChange = false; 1109 1110 if (Buckets.empty()) 1111 return MadeChange; 1112 1113 SmallSet<BasicBlock *, 16> BBChanged; 1114 for (auto &Bucket : Buckets) { 1115 if (Bucket.Elements.size() < DispFormPrepMinThreshold) 1116 continue; 1117 if (prepareBaseForDispFormChain(Bucket, Form)) 1118 MadeChange |= rewriteLoadStores(L, Bucket, BBChanged, Form); 1119 } 1120 1121 if (MadeChange) 1122 for (auto *BB : BBChanged) 1123 DeleteDeadPHIs(BB); 1124 return MadeChange; 1125 } 1126 1127 // Find the loop invariant increment node for SCEV BasePtrIncSCEV. 1128 // bb.loop.preheader: 1129 // %start = ... 1130 // bb.loop.body: 1131 // %phinode = phi [ %start, %bb.loop.preheader ], [ %add, %bb.loop.body ] 1132 // ... 1133 // %add = add %phinode, %inc ; %inc is what we want to get. 1134 // 1135 Value *PPCLoopInstrFormPrep::getNodeForInc(Loop *L, Instruction *MemI, 1136 const SCEV *BasePtrIncSCEV) { 1137 // If the increment is a constant, no definition is needed. 1138 // Return the value directly. 1139 if (isa<SCEVConstant>(BasePtrIncSCEV)) 1140 return cast<SCEVConstant>(BasePtrIncSCEV)->getValue(); 1141 1142 if (!SE->isLoopInvariant(BasePtrIncSCEV, L)) 1143 return nullptr; 1144 1145 BasicBlock *BB = MemI->getParent(); 1146 if (!BB) 1147 return nullptr; 1148 1149 BasicBlock *LatchBB = L->getLoopLatch(); 1150 1151 if (!LatchBB) 1152 return nullptr; 1153 1154 // Run through the PHIs and check their operands to find valid representation 1155 // for the increment SCEV. 1156 iterator_range<BasicBlock::phi_iterator> PHIIter = BB->phis(); 1157 for (auto &CurrentPHI : PHIIter) { 1158 PHINode *CurrentPHINode = dyn_cast<PHINode>(&CurrentPHI); 1159 if (!CurrentPHINode) 1160 continue; 1161 1162 if (!SE->isSCEVable(CurrentPHINode->getType())) 1163 continue; 1164 1165 const SCEV *PHISCEV = SE->getSCEVAtScope(CurrentPHINode, L); 1166 1167 const SCEVAddRecExpr *PHIBasePtrSCEV = dyn_cast<SCEVAddRecExpr>(PHISCEV); 1168 if (!PHIBasePtrSCEV) 1169 continue; 1170 1171 const SCEV *PHIBasePtrIncSCEV = PHIBasePtrSCEV->getStepRecurrence(*SE); 1172 1173 if (!PHIBasePtrIncSCEV || (PHIBasePtrIncSCEV != BasePtrIncSCEV)) 1174 continue; 1175 1176 // Get the incoming value from the loop latch and check if the value has 1177 // the add form with the required increment. 1178 if (CurrentPHINode->getBasicBlockIndex(LatchBB) < 0) 1179 continue; 1180 if (Instruction *I = dyn_cast<Instruction>( 1181 CurrentPHINode->getIncomingValueForBlock(LatchBB))) { 1182 Value *StrippedBaseI = I; 1183 while (BitCastInst *BC = dyn_cast<BitCastInst>(StrippedBaseI)) 1184 StrippedBaseI = BC->getOperand(0); 1185 1186 Instruction *StrippedI = dyn_cast<Instruction>(StrippedBaseI); 1187 if (!StrippedI) 1188 continue; 1189 1190 // LSR pass may add a getelementptr instruction to do the loop increment, 1191 // also search in that getelementptr instruction. 1192 if (StrippedI->getOpcode() == Instruction::Add || 1193 (StrippedI->getOpcode() == Instruction::GetElementPtr && 1194 StrippedI->getNumOperands() == 2)) { 1195 if (SE->getSCEVAtScope(StrippedI->getOperand(0), L) == BasePtrIncSCEV) 1196 return StrippedI->getOperand(0); 1197 if (SE->getSCEVAtScope(StrippedI->getOperand(1), L) == BasePtrIncSCEV) 1198 return StrippedI->getOperand(1); 1199 } 1200 } 1201 } 1202 return nullptr; 1203 } 1204 1205 // In order to prepare for the preferred instruction form, a PHI is added. 1206 // This function will check to see if that PHI already exists and will return 1207 // true if it found an existing PHI with the matched start and increment as the 1208 // one we wanted to create. 1209 bool PPCLoopInstrFormPrep::alreadyPrepared(Loop *L, Instruction *MemI, 1210 const SCEV *BasePtrStartSCEV, 1211 const SCEV *BasePtrIncSCEV, 1212 PrepForm Form) { 1213 BasicBlock *BB = MemI->getParent(); 1214 if (!BB) 1215 return false; 1216 1217 BasicBlock *PredBB = L->getLoopPredecessor(); 1218 BasicBlock *LatchBB = L->getLoopLatch(); 1219 1220 if (!PredBB || !LatchBB) 1221 return false; 1222 1223 // Run through the PHIs and see if we have some that looks like a preparation 1224 iterator_range<BasicBlock::phi_iterator> PHIIter = BB->phis(); 1225 for (auto & CurrentPHI : PHIIter) { 1226 PHINode *CurrentPHINode = dyn_cast<PHINode>(&CurrentPHI); 1227 if (!CurrentPHINode) 1228 continue; 1229 1230 if (!SE->isSCEVable(CurrentPHINode->getType())) 1231 continue; 1232 1233 const SCEV *PHISCEV = SE->getSCEVAtScope(CurrentPHINode, L); 1234 1235 const SCEVAddRecExpr *PHIBasePtrSCEV = dyn_cast<SCEVAddRecExpr>(PHISCEV); 1236 if (!PHIBasePtrSCEV) 1237 continue; 1238 1239 const SCEVConstant *PHIBasePtrIncSCEV = 1240 dyn_cast<SCEVConstant>(PHIBasePtrSCEV->getStepRecurrence(*SE)); 1241 if (!PHIBasePtrIncSCEV) 1242 continue; 1243 1244 if (CurrentPHINode->getNumIncomingValues() == 2) { 1245 if ((CurrentPHINode->getIncomingBlock(0) == LatchBB && 1246 CurrentPHINode->getIncomingBlock(1) == PredBB) || 1247 (CurrentPHINode->getIncomingBlock(1) == LatchBB && 1248 CurrentPHINode->getIncomingBlock(0) == PredBB)) { 1249 if (PHIBasePtrIncSCEV == BasePtrIncSCEV) { 1250 // The existing PHI (CurrentPHINode) has the same start and increment 1251 // as the PHI that we wanted to create. 1252 if ((Form == UpdateForm || Form == ChainCommoning ) && 1253 PHIBasePtrSCEV->getStart() == BasePtrStartSCEV) { 1254 ++PHINodeAlreadyExistsUpdate; 1255 return true; 1256 } 1257 if (Form == DSForm || Form == DQForm) { 1258 const SCEVConstant *Diff = dyn_cast<SCEVConstant>( 1259 SE->getMinusSCEV(PHIBasePtrSCEV->getStart(), BasePtrStartSCEV)); 1260 if (Diff && !Diff->getAPInt().urem(Form)) { 1261 if (Form == DSForm) 1262 ++PHINodeAlreadyExistsDS; 1263 else 1264 ++PHINodeAlreadyExistsDQ; 1265 return true; 1266 } 1267 } 1268 } 1269 } 1270 } 1271 } 1272 return false; 1273 } 1274 1275 bool PPCLoopInstrFormPrep::runOnLoop(Loop *L) { 1276 bool MadeChange = false; 1277 1278 // Only prep. the inner-most loop 1279 if (!L->isInnermost()) 1280 return MadeChange; 1281 1282 // Return if already done enough preparation. 1283 if (SuccPrepCount >= MaxVarsPrep) 1284 return MadeChange; 1285 1286 LLVM_DEBUG(dbgs() << "PIP: Examining: " << *L << "\n"); 1287 1288 BasicBlock *LoopPredecessor = L->getLoopPredecessor(); 1289 // If there is no loop predecessor, or the loop predecessor's terminator 1290 // returns a value (which might contribute to determining the loop's 1291 // iteration space), insert a new preheader for the loop. 1292 if (!LoopPredecessor || 1293 !LoopPredecessor->getTerminator()->getType()->isVoidTy()) { 1294 LoopPredecessor = InsertPreheaderForLoop(L, DT, LI, nullptr, PreserveLCSSA); 1295 if (LoopPredecessor) 1296 MadeChange = true; 1297 } 1298 if (!LoopPredecessor) { 1299 LLVM_DEBUG(dbgs() << "PIP fails since no predecessor for current loop.\n"); 1300 return MadeChange; 1301 } 1302 // Check if a load/store has update form. This lambda is used by function 1303 // collectCandidates which can collect candidates for types defined by lambda. 1304 auto isUpdateFormCandidate = [&](const Instruction *I, Value *PtrValue, 1305 const Type *PointerElementType) { 1306 assert((PtrValue && I) && "Invalid parameter!"); 1307 // There are no update forms for Altivec vector load/stores. 1308 if (ST && ST->hasAltivec() && PointerElementType->isVectorTy()) 1309 return false; 1310 // There are no update forms for P10 lxvp/stxvp intrinsic. 1311 auto *II = dyn_cast<IntrinsicInst>(I); 1312 if (II && ((II->getIntrinsicID() == Intrinsic::ppc_vsx_lxvp) || 1313 II->getIntrinsicID() == Intrinsic::ppc_vsx_stxvp)) 1314 return false; 1315 // See getPreIndexedAddressParts, the displacement for LDU/STDU has to 1316 // be 4's multiple (DS-form). For i64 loads/stores when the displacement 1317 // fits in a 16-bit signed field but isn't a multiple of 4, it will be 1318 // useless and possible to break some original well-form addressing mode 1319 // to make this pre-inc prep for it. 1320 if (PointerElementType->isIntegerTy(64)) { 1321 const SCEV *LSCEV = SE->getSCEVAtScope(const_cast<Value *>(PtrValue), L); 1322 const SCEVAddRecExpr *LARSCEV = dyn_cast<SCEVAddRecExpr>(LSCEV); 1323 if (!LARSCEV || LARSCEV->getLoop() != L) 1324 return false; 1325 if (const SCEVConstant *StepConst = 1326 dyn_cast<SCEVConstant>(LARSCEV->getStepRecurrence(*SE))) { 1327 const APInt &ConstInt = StepConst->getValue()->getValue(); 1328 if (ConstInt.isSignedIntN(16) && ConstInt.srem(4) != 0) 1329 return false; 1330 } 1331 } 1332 return true; 1333 }; 1334 1335 // Check if a load/store has DS form. 1336 auto isDSFormCandidate = [](const Instruction *I, Value *PtrValue, 1337 const Type *PointerElementType) { 1338 assert((PtrValue && I) && "Invalid parameter!"); 1339 if (isa<IntrinsicInst>(I)) 1340 return false; 1341 return (PointerElementType->isIntegerTy(64)) || 1342 (PointerElementType->isFloatTy()) || 1343 (PointerElementType->isDoubleTy()) || 1344 (PointerElementType->isIntegerTy(32) && 1345 llvm::any_of(I->users(), 1346 [](const User *U) { return isa<SExtInst>(U); })); 1347 }; 1348 1349 // Check if a load/store has DQ form. 1350 auto isDQFormCandidate = [&](const Instruction *I, Value *PtrValue, 1351 const Type *PointerElementType) { 1352 assert((PtrValue && I) && "Invalid parameter!"); 1353 // Check if it is a P10 lxvp/stxvp intrinsic. 1354 auto *II = dyn_cast<IntrinsicInst>(I); 1355 if (II) 1356 return II->getIntrinsicID() == Intrinsic::ppc_vsx_lxvp || 1357 II->getIntrinsicID() == Intrinsic::ppc_vsx_stxvp; 1358 // Check if it is a P9 vector load/store. 1359 return ST && ST->hasP9Vector() && (PointerElementType->isVectorTy()); 1360 }; 1361 1362 // Check if a load/store is candidate for chain commoning. 1363 // If the SCEV is only with one ptr operand in its start, we can use that 1364 // start as a chain separator. Mark this load/store as a candidate. 1365 auto isChainCommoningCandidate = [&](const Instruction *I, Value *PtrValue, 1366 const Type *PointerElementType) { 1367 const SCEVAddRecExpr *ARSCEV = 1368 cast<SCEVAddRecExpr>(SE->getSCEVAtScope(PtrValue, L)); 1369 if (!ARSCEV) 1370 return false; 1371 1372 if (!ARSCEV->isAffine()) 1373 return false; 1374 1375 const SCEV *Start = ARSCEV->getStart(); 1376 1377 // A single pointer. We can treat it as offset 0. 1378 if (isa<SCEVUnknown>(Start) && Start->getType()->isPointerTy()) 1379 return true; 1380 1381 const SCEVAddExpr *ASCEV = dyn_cast<SCEVAddExpr>(Start); 1382 1383 // We need a SCEVAddExpr to include both base and offset. 1384 if (!ASCEV) 1385 return false; 1386 1387 // Make sure there is only one pointer operand(base) and all other operands 1388 // are integer type. 1389 bool SawPointer = false; 1390 for (const SCEV *Op : ASCEV->operands()) { 1391 if (Op->getType()->isPointerTy()) { 1392 if (SawPointer) 1393 return false; 1394 SawPointer = true; 1395 } else if (!Op->getType()->isIntegerTy()) 1396 return false; 1397 } 1398 1399 return SawPointer; 1400 }; 1401 1402 // Check if the diff is a constant type. This is used for update/DS/DQ form 1403 // preparation. 1404 auto isValidConstantDiff = [](const SCEV *Diff) { 1405 return dyn_cast<SCEVConstant>(Diff) != nullptr; 1406 }; 1407 1408 // Make sure the diff between the base and new candidate is required type. 1409 // This is used for chain commoning preparation. 1410 auto isValidChainCommoningDiff = [](const SCEV *Diff) { 1411 assert(Diff && "Invalid Diff!\n"); 1412 1413 // Don't mess up previous dform prepare. 1414 if (isa<SCEVConstant>(Diff)) 1415 return false; 1416 1417 // A single integer type offset. 1418 if (isa<SCEVUnknown>(Diff) && Diff->getType()->isIntegerTy()) 1419 return true; 1420 1421 const SCEVNAryExpr *ADiff = dyn_cast<SCEVNAryExpr>(Diff); 1422 if (!ADiff) 1423 return false; 1424 1425 for (const SCEV *Op : ADiff->operands()) 1426 if (!Op->getType()->isIntegerTy()) 1427 return false; 1428 1429 return true; 1430 }; 1431 1432 HasCandidateForPrepare = false; 1433 1434 LLVM_DEBUG(dbgs() << "Start to prepare for update form.\n"); 1435 // Collect buckets of comparable addresses used by loads and stores for update 1436 // form. 1437 SmallVector<Bucket, 16> UpdateFormBuckets = collectCandidates( 1438 L, isUpdateFormCandidate, isValidConstantDiff, MaxVarsUpdateForm); 1439 1440 // Prepare for update form. 1441 if (!UpdateFormBuckets.empty()) 1442 MadeChange |= updateFormPrep(L, UpdateFormBuckets); 1443 else if (!HasCandidateForPrepare) { 1444 LLVM_DEBUG( 1445 dbgs() 1446 << "No prepare candidates found, stop praparation for current loop!\n"); 1447 // If no candidate for preparing, return early. 1448 return MadeChange; 1449 } 1450 1451 LLVM_DEBUG(dbgs() << "Start to prepare for DS form.\n"); 1452 // Collect buckets of comparable addresses used by loads and stores for DS 1453 // form. 1454 SmallVector<Bucket, 16> DSFormBuckets = collectCandidates( 1455 L, isDSFormCandidate, isValidConstantDiff, MaxVarsDSForm); 1456 1457 // Prepare for DS form. 1458 if (!DSFormBuckets.empty()) 1459 MadeChange |= dispFormPrep(L, DSFormBuckets, DSForm); 1460 1461 LLVM_DEBUG(dbgs() << "Start to prepare for DQ form.\n"); 1462 // Collect buckets of comparable addresses used by loads and stores for DQ 1463 // form. 1464 SmallVector<Bucket, 16> DQFormBuckets = collectCandidates( 1465 L, isDQFormCandidate, isValidConstantDiff, MaxVarsDQForm); 1466 1467 // Prepare for DQ form. 1468 if (!DQFormBuckets.empty()) 1469 MadeChange |= dispFormPrep(L, DQFormBuckets, DQForm); 1470 1471 // Collect buckets of comparable addresses used by loads and stores for chain 1472 // commoning. With chain commoning, we reuse offsets between the chains, so 1473 // the register pressure will be reduced. 1474 if (!EnableChainCommoning) { 1475 LLVM_DEBUG(dbgs() << "Chain commoning is not enabled.\n"); 1476 return MadeChange; 1477 } 1478 1479 LLVM_DEBUG(dbgs() << "Start to prepare for chain commoning.\n"); 1480 SmallVector<Bucket, 16> Buckets = 1481 collectCandidates(L, isChainCommoningCandidate, isValidChainCommoningDiff, 1482 MaxVarsChainCommon); 1483 1484 // Prepare for chain commoning. 1485 if (!Buckets.empty()) 1486 MadeChange |= chainCommoning(L, Buckets); 1487 1488 return MadeChange; 1489 } 1490