//===- SeparateConstOffsetFromGEP.cpp -------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Loop unrolling may create many similar GEPs for array accesses.
// e.g., a 2-level loop
//
// float a[32][32]; // global variable
//
// for (int i = 0; i < 2; ++i) {
//   for (int j = 0; j < 2; ++j) {
//     ...
//     ... = a[x + i][y + j];
//     ...
//   }
// }
//
// will probably be unrolled to:
//
// gep %a, 0, %x, %y; load
// gep %a, 0, %x, %y + 1; load
// gep %a, 0, %x + 1, %y; load
// gep %a, 0, %x + 1, %y + 1; load
//
// LLVM's GVN does not use partial redundancy elimination yet, and is thus
// unable to reuse (gep %a, 0, %x, %y). As a result, this misoptimization incurs
// significant slowdown in targets with limited addressing modes. For instance,
// because the PTX target does not support the reg+reg addressing mode, the
// NVPTX backend emits PTX code that literally computes the pointer address of
// each GEP, wasting tons of registers. It emits the following PTX for the
// first load and similar PTX for other loads.
//
//   mov.u32         %r1, %x;
//   mov.u32         %r2, %y;
//   mul.wide.u32    %rl2, %r1, 128;
//   mov.u64         %rl3, a;
//   add.s64         %rl4, %rl3, %rl2;
//   mul.wide.u32    %rl5, %r2, 4;
//   add.s64         %rl6, %rl4, %rl5;
//   ld.global.f32   %f1, [%rl6];
//
// To reduce the register pressure, the optimization implemented in this file
// merges the common part of a group of GEPs, so we can compute each pointer
// address by adding a simple offset to the common part, saving many registers.
//
// It works by splitting each GEP into a variadic base and a constant offset.
// The variadic base can be computed once and reused by multiple GEPs, and the
// constant offsets can be nicely folded into the reg+immediate addressing mode
// (supported by most targets) without using any extra register.
//
// For instance, we transform the four GEPs and four loads in the above example
// into:
//
//   base = gep a, 0, x, y
//   load base
//   load base + 1 * sizeof(float)
//   load base + 32 * sizeof(float)
//   load base + 33 * sizeof(float)
//
// Given the transformed IR, a backend that supports the reg+immediate
// addressing mode can easily fold the pointer arithmetics into the loads. For
// example, the NVPTX backend can easily fold the pointer arithmetics into the
// ld.global.f32 instructions, and the resultant PTX uses much fewer registers.
//
//   mov.u32         %r1, %tid.x;
//   mov.u32         %r2, %tid.y;
//   mul.wide.u32    %rl2, %r1, 128;
//   mov.u64         %rl3, a;
//   add.s64         %rl4, %rl3, %rl2;
//   mul.wide.u32    %rl5, %r2, 4;
//   add.s64         %rl6, %rl4, %rl5;
//   ld.global.f32   %f1, [%rl6];     // so far the same as unoptimized PTX
//   ld.global.f32   %f2, [%rl6+4];   // much better
//   ld.global.f32   %f3, [%rl6+128]; // much better
//   ld.global.f32   %f4, [%rl6+132]; // much better
//
// Another improvement enabled by the LowerGEP flag is to lower a GEP with
// multiple indices to either multiple GEPs with a single index or arithmetic
// operations (depending on whether the target uses alias analysis in codegen).
// Such transformations can have the following benefits:
// (1) It can always extract constants in the indices of structure type.
// (2) After such lowering, there are more optimization opportunities such as
//     CSE, LICM and CGP.
//
// E.g. The following GEPs have multiple indices:
//  BB1:
//    %p = getelementptr [10 x %struct]* %ptr, i64 %i, i64 %j1, i32 3
//    load %p
//    ...
//  BB2:
//    %p2 = getelementptr [10 x %struct]* %ptr, i64 %i, i64 %j1, i32 2
//    load %p2
//    ...
//
// We cannot do CSE on the common part related to index "i64 %i". Lowering
// GEPs can achieve such goals.
// If the target does not use alias analysis in codegen, this pass will
// lower a GEP with multiple indices into arithmetic operations:
//  BB1:
//    %1 = ptrtoint [10 x %struct]* %ptr to i64    ; CSE opportunity
//    %2 = mul i64 %i, length_of_10xstruct         ; CSE opportunity
//    %3 = add i64 %1, %2                          ; CSE opportunity
//    %4 = mul i64 %j1, length_of_struct
//    %5 = add i64 %3, %4
//    %6 = add i64 %3, struct_field_3              ; Constant offset
//    %p = inttoptr i64 %6 to i32*
//    load %p
//    ...
//  BB2:
//    %7 = ptrtoint [10 x %struct]* %ptr to i64    ; CSE opportunity
//    %8 = mul i64 %i, length_of_10xstruct         ; CSE opportunity
//    %9 = add i64 %7, %8                          ; CSE opportunity
//    %10 = mul i64 %j2, length_of_struct
//    %11 = add i64 %9, %10
//    %12 = add i64 %11, struct_field_2            ; Constant offset
//    %p2 = inttoptr i64 %12 to i32*
//    load %p2
//    ...
//
// If the target uses alias analysis in codegen, this pass will lower a GEP
// with multiple indices into multiple GEPs with a single index:
//  BB1:
//    %1 = bitcast [10 x %struct]* %ptr to i8*     ; CSE opportunity
//    %2 = mul i64 %i, length_of_10xstruct         ; CSE opportunity
//    %3 = getelementptr i8* %1, i64 %2            ; CSE opportunity
//    %4 = mul i64 %j1, length_of_struct
//    %5 = getelementptr i8* %3, i64 %4
//    %6 = getelementptr i8* %5, struct_field_3    ; Constant offset
//    %p = bitcast i8* %6 to i32*
//    load %p
//    ...
//  BB2:
//    %7 = bitcast [10 x %struct]* %ptr to i8*     ; CSE opportunity
//    %8 = mul i64 %i, length_of_10xstruct         ; CSE opportunity
//    %9 = getelementptr i8* %7, i64 %8            ; CSE opportunity
//    %10 = mul i64 %j2, length_of_struct
//    %11 = getelementptr i8* %9, i64 %10
//    %12 = getelementptr i8* %11, struct_field_2  ; Constant offset
//    %p2 = bitcast i8* %12 to i32*
//    load %p2
//    ...
//
// Lowering GEPs can also benefit other passes such as LICM and CGP.
// LICM (Loop Invariant Code Motion) cannot hoist/sink a GEP with multiple
// indices if one of the indices is variant. If we lower such a GEP into
// invariant parts and variant parts, LICM can hoist/sink the invariant parts.
// CGP (CodeGen Prepare) tries to sink address calculations that match the
// target's addressing modes. A GEP with multiple indices may not match and will
// not be sunk. If we lower such a GEP into smaller parts, CGP may sink some of
// them, so we end up with a better addressing mode.
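//
// For instance (a rough illustration, not taken from an actual test case), a
// lowered part such as
//    %part = getelementptr i8* %base, i64 %variant.offset
// matches the reg+reg addressing mode of many targets, so CGP can sink it next
// to its memory use, while the loop-invariant parts computed before it remain
// hoistable by LICM.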
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Scalar/SeparateConstOffsetFromGEP.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
#include <cstdint>
#include <string>

using namespace llvm;
using namespace llvm::PatternMatch;

static cl::opt<bool> DisableSeparateConstOffsetFromGEP(
    "disable-separate-const-offset-from-gep", cl::init(false),
    cl::desc("Do not separate the constant offset from a GEP instruction"),
    cl::Hidden);

// Setting this flag may emit false positives when the input module already
// contains dead instructions. Therefore, we set it only in unit tests that are
// free of dead code.
static cl::opt<bool>
    VerifyNoDeadCode("reassociate-geps-verify-no-dead-code", cl::init(false),
                     cl::desc("Verify this pass produces no dead code"),
                     cl::Hidden);

namespace {

/// A helper class for separating a constant offset from a GEP index.
///
/// In real programs, a GEP index may be more complicated than a simple addition
/// of something and a constant integer which can be trivially split. For
/// example, to split ((a << 3) | 5) + b, we need to search deeper for the
/// constant offset, so that we can separate the index into (a << 3) + b and 5.
///
/// Therefore, this class looks into the expression that computes a given GEP
/// index, and tries to find a constant integer that can be hoisted to the
/// outermost level of the expression as an addition. Not every constant in an
/// expression can jump out. e.g., we cannot transform (b * (a + 5)) to (b * a +
/// 5); nor can we transform (3 * (a + 5)) to (3 * a + 5); however, in this case,
/// -instcombine probably already optimized (3 * (a + 5)) to (3 * a + 15).
class ConstantOffsetExtractor {
public:
  /// Extracts a constant offset from the given GEP index. It returns the
  /// new index representing the remainder (equal to the original index minus
  /// the constant offset), or nullptr if we cannot extract a constant offset.
  /// \p Idx           The given GEP index
  /// \p GEP           The given GEP
  /// \p UserChainTail Outputs the tail of UserChain so that we can
  ///                  garbage-collect unused instructions in UserChain.
  static Value *Extract(Value *Idx, GetElementPtrInst *GEP,
                        User *&UserChainTail);

  /// Looks for a constant offset from the given GEP index without extracting
  /// it. It returns the numeric value of the extracted constant offset (0 if
  /// failed). The meanings of the arguments are the same as in Extract.
  static int64_t Find(Value *Idx, GetElementPtrInst *GEP);

private:
  ConstantOffsetExtractor(BasicBlock::iterator InsertionPt)
      : IP(InsertionPt), DL(InsertionPt->getDataLayout()) {}

  /// Searches the expression that computes V for a non-zero constant C s.t.
  /// V can be reassociated into the form V' + C. If the search is
  /// successful, returns C and updates UserChain as a def-use chain from C to
  /// V; otherwise, UserChain is empty.
  ///
  /// \p V            The given expression
  /// \p SignExtended Whether V will be sign-extended in the computation of the
  ///                 GEP index
  /// \p ZeroExtended Whether V will be zero-extended in the computation of the
  ///                 GEP index
  /// \p NonNegative  Whether V is guaranteed to be non-negative. For example,
  ///                 an index of an inbounds GEP is guaranteed to be
  ///                 non-negative. Leveraging this, we can better split
  ///                 inbounds GEPs.
  APInt find(Value *V, bool SignExtended, bool ZeroExtended, bool NonNegative);

  /// A helper function to look into both operands of a binary operator.
  APInt findInEitherOperand(BinaryOperator *BO, bool SignExtended,
                            bool ZeroExtended);

  /// After finding the constant offset C from the GEP index I, we build a new
  /// index I' s.t. I' + C = I. This function builds and returns the new
  /// index I' according to UserChain produced by function "find".
  ///
  /// The building conceptually takes two steps:
  /// 1) iteratively distribute s/zext towards the leaves of the expression tree
  ///    that computes I
  /// 2) reassociate the expression tree to the form I' + C.
  ///
  /// For example, to extract the 5 from sext(a + (b + 5)), we first distribute
  /// sext to a, b and 5 so that we have
  ///   sext(a) + (sext(b) + 5).
  /// Then, we reassociate it to
  ///   (sext(a) + sext(b)) + 5.
  /// Given this form, we know I' is sext(a) + sext(b).
  Value *rebuildWithoutConstOffset();

  /// After the first step of rebuilding the GEP index without the constant
  /// offset, distribute s/zext to the operands of all operators in UserChain.
  /// e.g., zext(sext(a + (b + 5))) (assuming no overflow) =>
  ///   zext(sext(a)) + (zext(sext(b)) + zext(sext(5))).
  ///
  /// The function also updates UserChain to point to new subexpressions after
  /// distributing s/zext. e.g., the old UserChain of the above example is
  ///   5 -> b + 5 -> a + (b + 5) -> sext(...) -> zext(sext(...)),
  /// and the new UserChain is
  ///   zext(sext(5)) -> zext(sext(b)) + zext(sext(5)) ->
  ///     zext(sext(a)) + (zext(sext(b)) + zext(sext(5)))
  ///
  /// \p ChainIndex The index to UserChain. ChainIndex is initially
  ///               UserChain.size() - 1, and is decremented during
  ///               the recursion.
  Value *distributeExtsAndCloneChain(unsigned ChainIndex);

  /// Reassociates the GEP index to the form I' + C and returns I'.
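  ///
  /// For instance (a rough illustration; the expression is made up), if
  /// UserChain encodes the index "(b + 5) + a" with constant offset 5, this
  /// returns the rebuilt index "b + a".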
  Value *removeConstOffset(unsigned ChainIndex);

  /// A helper function to apply ExtInsts, a list of s/zext, to value V.
  /// e.g., if ExtInsts = [sext i32 to i64, zext i16 to i32], this function
  /// returns "sext i32 (zext i16 V to i32) to i64".
  Value *applyExts(Value *V);

  /// A helper function that returns whether we can trace into the operands
  /// of binary operator BO for a constant offset.
  ///
  /// \p SignExtended Whether BO is surrounded by sext
  /// \p ZeroExtended Whether BO is surrounded by zext
  /// \p NonNegative  Whether BO is known to be non-negative, e.g., an in-bound
  ///                 array index.
  bool CanTraceInto(bool SignExtended, bool ZeroExtended, BinaryOperator *BO,
                    bool NonNegative);

  /// The path from the constant offset to the old GEP index. e.g., if the GEP
  /// index is "a * b + (c + 5)", after running function find, UserChain[0] will
  /// be the constant 5, UserChain[1] will be the subexpression "c + 5", and
  /// UserChain[2] will be the entire expression "a * b + (c + 5)".
  ///
  /// This path helps to rebuild the new GEP index.
  SmallVector<User *, 8> UserChain;

  /// A data structure used in rebuildWithoutConstOffset. Contains all
  /// sext/zext instructions along UserChain.
  SmallVector<CastInst *, 16> ExtInsts;

  /// Insertion position of cloned instructions.
  BasicBlock::iterator IP;

  const DataLayout &DL;
};

/// A pass that tries to split every GEP in the function into a variadic
/// base and a constant offset. It is a FunctionPass because searching for the
/// constant offset may inspect other basic blocks.
class SeparateConstOffsetFromGEPLegacyPass : public FunctionPass {
public:
  static char ID;

  SeparateConstOffsetFromGEPLegacyPass(bool LowerGEP = false)
      : FunctionPass(ID), LowerGEP(LowerGEP) {
    initializeSeparateConstOffsetFromGEPLegacyPassPass(
        *PassRegistry::getPassRegistry());
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.addRequired<LoopInfoWrapperPass>();
    AU.setPreservesCFG();
    AU.addRequired<TargetLibraryInfoWrapperPass>();
  }

  bool runOnFunction(Function &F) override;

private:
  bool LowerGEP;
};

/// The implementation that tries to split every GEP in the function into a
/// variadic base and a constant offset. It operates on a whole function
/// because searching for the constant offset may inspect other basic blocks.
class SeparateConstOffsetFromGEP {
public:
  SeparateConstOffsetFromGEP(
      DominatorTree *DT, LoopInfo *LI, TargetLibraryInfo *TLI,
      function_ref<TargetTransformInfo &(Function &)> GetTTI, bool LowerGEP)
      : DT(DT), LI(LI), TLI(TLI), GetTTI(GetTTI), LowerGEP(LowerGEP) {}

  bool run(Function &F);

private:
  /// Track the operands of an add or sub.
  using ExprKey = std::pair<Value *, Value *>;

  /// Create a pair for use as a map key for a commutable operation.
  static ExprKey createNormalizedCommutablePair(Value *A, Value *B) {
    if (A < B)
      return {A, B};
    return {B, A};
  }

  /// Tries to split the given GEP into a variadic base and a constant offset,
  /// and returns true if the splitting succeeds.
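  ///
  /// A rough illustration (the IR names are made up, not from a test case):
  ///   %add = add i64 %idx, 5
  ///   %p   = getelementptr float, ptr %a, i64 %add
  /// is rewritten to something like
  ///   %base = getelementptr float, ptr %a, i64 %idx  ; variadic base
  ///   %p    = getelementptr i8, ptr %base, i64 20    ; 5 * sizeof(float)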
  bool splitGEP(GetElementPtrInst *GEP);

  /// Tries to reorder the given GEP with the GEP that produces the base if
  /// doing so results in producing a constant offset as the outermost
  /// index.
  bool reorderGEP(GetElementPtrInst *GEP, TargetTransformInfo &TTI);

  /// Lower a GEP with multiple indices into multiple GEPs with a single index.
  /// Function splitGEP already split the original GEP into a variadic part and
  /// a constant offset (i.e., AccumulativeByteOffset). This function lowers the
  /// variadic part into a set of GEPs with a single index and applies
  /// AccumulativeByteOffset to it.
  /// \p Variadic               The variadic part of the original GEP.
  /// \p AccumulativeByteOffset The constant offset.
  void lowerToSingleIndexGEPs(GetElementPtrInst *Variadic,
                              int64_t AccumulativeByteOffset);

  /// Lower a GEP with multiple indices into ptrtoint+arithmetics+inttoptr form.
  /// Function splitGEP already split the original GEP into a variadic part and
  /// a constant offset (i.e., AccumulativeByteOffset). This function lowers the
  /// variadic part into a set of arithmetic operations and applies
  /// AccumulativeByteOffset to it.
  /// \p Variadic               The variadic part of the original GEP.
  /// \p AccumulativeByteOffset The constant offset.
  void lowerToArithmetics(GetElementPtrInst *Variadic,
                          int64_t AccumulativeByteOffset);

  /// Finds the constant offset within each index and accumulates them. If
  /// LowerGEP is true, it finds in indices of both sequential and structure
  /// types, otherwise it only finds in sequential indices. The output
  /// NeedsExtraction indicates whether we successfully found a non-zero
  /// constant offset.
  int64_t accumulateByteOffset(GetElementPtrInst *GEP, bool &NeedsExtraction);

  /// Canonicalize array indices to pointer-size integers. This helps to
  /// simplify the logic of splitting a GEP. For example, if a + b is a
  /// pointer-size integer, we have
  ///   gep base, a + b = gep (gep base, a), b
  /// However, this equality may not hold if the size of a + b is smaller than
  /// the pointer size, because LLVM conceptually sign-extends GEP indices to
  /// pointer size before computing the address
  /// (http://llvm.org/docs/LangRef.html#id181).
  ///
  /// This canonicalization is very likely already done in clang and
  /// instcombine. Therefore, the program will probably remain the same.
  ///
  /// Returns true if the module changes.
  ///
  /// Verified in @i32_add in split-gep.ll
  bool canonicalizeArrayIndicesToIndexSize(GetElementPtrInst *GEP);

  /// Optimize sext(a)+sext(b) to sext(a+b) when a+b can't sign overflow.
  /// SeparateConstOffsetFromGEP distributes a sext to leaves before extracting
  /// the constant offset. After extraction, it becomes desirable to reunite the
  /// distributed sexts. For example,
  ///
  ///   &a[sext(i +nsw (j +nsw 5))]
  ///   => distribute          &a[sext(i) +nsw (sext(j) +nsw 5)]
  ///   => constant extraction &a[sext(i) + sext(j)] + 5
  ///   => reunion             &a[sext(i +nsw j)] + 5
  bool reuniteExts(Function &F);

  /// A helper that reunites sexts in an instruction.
  bool reuniteExts(Instruction *I);

  /// Find the closest dominator of <Dominatee> that is equivalent to <Key>.
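  /// Here "equivalent" means an add/sub recorded in <DominatingExprs> whose
  /// operand pair equals <Key> (for illustration, with Key = {%a, %b}, a
  /// dominating "add nsw i64 %a, %b"; the names are invented).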
  Instruction *findClosestMatchingDominator(
      ExprKey Key, Instruction *Dominatee,
      DenseMap<ExprKey, SmallVector<Instruction *, 2>> &DominatingExprs);

  /// Verify F is free of dead code.
  void verifyNoDeadCode(Function &F);

  bool hasMoreThanOneUseInLoop(Value *V, Loop *L);

  // Swap the index operands of two GEPs.
  void swapGEPOperand(GetElementPtrInst *First, GetElementPtrInst *Second);

  // Check if it is safe to swap the operands of two GEPs.
  bool isLegalToSwapOperand(GetElementPtrInst *First, GetElementPtrInst *Second,
                            Loop *CurLoop);

  const DataLayout *DL = nullptr;
  DominatorTree *DT = nullptr;
  LoopInfo *LI;
  TargetLibraryInfo *TLI;
  // Retrieved lazily since not always used.
  function_ref<TargetTransformInfo &(Function &)> GetTTI;

  /// Whether to lower a GEP with multiple indices into arithmetic operations or
  /// multiple GEPs with a single index.
  bool LowerGEP;

  DenseMap<ExprKey, SmallVector<Instruction *, 2>> DominatingAdds;
  DenseMap<ExprKey, SmallVector<Instruction *, 2>> DominatingSubs;
};

} // end anonymous namespace

char SeparateConstOffsetFromGEPLegacyPass::ID = 0;

INITIALIZE_PASS_BEGIN(
    SeparateConstOffsetFromGEPLegacyPass, "separate-const-offset-from-gep",
    "Split GEPs to a variadic base and a constant offset for better CSE", false,
    false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_END(
    SeparateConstOffsetFromGEPLegacyPass, "separate-const-offset-from-gep",
    "Split GEPs to a variadic base and a constant offset for better CSE", false,
    false)

FunctionPass *llvm::createSeparateConstOffsetFromGEPPass(bool LowerGEP) {
  return new SeparateConstOffsetFromGEPLegacyPass(LowerGEP);
}

bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended,
                                           bool ZeroExtended,
                                           BinaryOperator *BO,
                                           bool NonNegative) {
  // We only consider ADD, SUB and OR, because a non-zero constant found in
  // expressions composed of these operations can be easily hoisted as a
  // constant offset by reassociation.
  if (BO->getOpcode() != Instruction::Add &&
      BO->getOpcode() != Instruction::Sub &&
      BO->getOpcode() != Instruction::Or) {
    return false;
  }

  Value *LHS = BO->getOperand(0), *RHS = BO->getOperand(1);
  // Do not trace into "or" unless it is equivalent to "add".
  // This is the case if the or's disjoint flag is set.
  if (BO->getOpcode() == Instruction::Or &&
      !cast<PossiblyDisjointInst>(BO)->isDisjoint())
    return false;

  // FIXME: We don't currently support constants from the RHS of subs,
  // when we are zero-extended, because we need a way to zero-extend
  // them before they are negated.
  if (ZeroExtended && !SignExtended && BO->getOpcode() == Instruction::Sub)
    return false;

  // In addition, tracing into BO requires that its surrounding s/zext (if
  // any) is distributable to both operands.
  //
  // Suppose BO = A op B.
  //  SignExtended | ZeroExtended | Distributable?
  // --------------+--------------+----------------------------------
  //       0       |      0       | true because no s/zext exists
  //       0       |      1       | zext(BO) == zext(A) op zext(B)
  //       1       |      0       | sext(BO) == sext(A) op sext(B)
  //       1       |      1       | zext(sext(BO)) ==
  //               |              |   zext(sext(A)) op zext(sext(B))
  if (BO->getOpcode() == Instruction::Add && !ZeroExtended && NonNegative) {
    // If a + b >= 0 and (a >= 0 or b >= 0), then
    //   sext(a + b) = sext(a) + sext(b)
    // even if the addition is not marked nsw.
    //
    // Leveraging this invariant, we can trace into an sext'ed inbound GEP
    // index if the constant offset is non-negative.
    //
    // Verified in @sext_add in split-gep.ll.
    if (ConstantInt *ConstLHS = dyn_cast<ConstantInt>(LHS)) {
      if (!ConstLHS->isNegative())
        return true;
    }
    if (ConstantInt *ConstRHS = dyn_cast<ConstantInt>(RHS)) {
      if (!ConstRHS->isNegative())
        return true;
    }
  }

  // sext (add/sub nsw A, B) == add/sub nsw (sext A), (sext B)
  // zext (add/sub nuw A, B) == add/sub nuw (zext A), (zext B)
  if (BO->getOpcode() == Instruction::Add ||
      BO->getOpcode() == Instruction::Sub) {
    if (SignExtended && !BO->hasNoSignedWrap())
      return false;
    if (ZeroExtended && !BO->hasNoUnsignedWrap())
      return false;
  }

  return true;
}

APInt ConstantOffsetExtractor::findInEitherOperand(BinaryOperator *BO,
                                                   bool SignExtended,
                                                   bool ZeroExtended) {
  // Save off the current height of the chain, in case we need to restore it.
  size_t ChainLength = UserChain.size();

  // BO being non-negative does not shed light on whether its operands are
  // non-negative. Clear the NonNegative flag here.
  APInt ConstantOffset = find(BO->getOperand(0), SignExtended, ZeroExtended,
                              /* NonNegative */ false);
  // If we found a constant offset in the left operand, stop and return that.
  // This shortcut might cause us to miss opportunities of combining the
  // constant offsets in both operands, e.g., (a + 4) + (b + 5) => (a + b) + 9.
  // However, such cases are probably already handled by -instcombine,
  // given this pass runs after the standard optimizations.
  if (ConstantOffset != 0) return ConstantOffset;

  // Reset the chain back to where it was when we started exploring this node,
  // since visiting the LHS didn't pan out.
  UserChain.resize(ChainLength);

  ConstantOffset = find(BO->getOperand(1), SignExtended, ZeroExtended,
                        /* NonNegative */ false);
  // If BO is a sub operator, negate the constant offset found in the right
  // operand.
  if (BO->getOpcode() == Instruction::Sub)
    ConstantOffset = -ConstantOffset;

  // If RHS wasn't a suitable candidate either, reset the chain again.
  if (ConstantOffset == 0)
    UserChain.resize(ChainLength);

  return ConstantOffset;
}

APInt ConstantOffsetExtractor::find(Value *V, bool SignExtended,
                                    bool ZeroExtended, bool NonNegative) {
  // TODO(jingyue): We could trace into integer/pointer casts, such as
  // inttoptr, ptrtoint, bitcast, and addrspacecast. We choose to handle only
  // integers because it gives good enough results for our benchmarks.
  unsigned BitWidth = cast<IntegerType>(V->getType())->getBitWidth();

  // We cannot do much with Values that are not a User, such as an Argument.
  User *U = dyn_cast<User>(V);
  if (U == nullptr) return APInt(BitWidth, 0);

  APInt ConstantOffset(BitWidth, 0);
  if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
    // Hooray, we found it!
    ConstantOffset = CI->getValue();
  } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(V)) {
    // Trace into subexpressions for more hoisting opportunities.
    if (CanTraceInto(SignExtended, ZeroExtended, BO, NonNegative))
      ConstantOffset = findInEitherOperand(BO, SignExtended, ZeroExtended);
  } else if (isa<TruncInst>(V)) {
    ConstantOffset =
        find(U->getOperand(0), SignExtended, ZeroExtended, NonNegative)
            .trunc(BitWidth);
  } else if (isa<SExtInst>(V)) {
    ConstantOffset = find(U->getOperand(0), /* SignExtended */ true,
                          ZeroExtended, NonNegative).sext(BitWidth);
  } else if (isa<ZExtInst>(V)) {
    // As an optimization, we can clear the SignExtended flag because
    // sext(zext(a)) = zext(a). Verified in @sext_zext in split-gep.ll.
    //
    // Clear the NonNegative flag, because zext(a) >= 0 does not imply a >= 0.
    ConstantOffset =
        find(U->getOperand(0), /* SignExtended */ false,
             /* ZeroExtended */ true, /* NonNegative */ false).zext(BitWidth);
  }

  // If we found a non-zero constant offset, add it to the path for
  // rebuildWithoutConstOffset. Zero is a valid constant offset, but doesn't
  // help this optimization.
  if (ConstantOffset != 0)
    UserChain.push_back(U);
  return ConstantOffset;
}

Value *ConstantOffsetExtractor::applyExts(Value *V) {
  Value *Current = V;
  // ExtInsts is built in the use-def order. Therefore, we apply them to V
  // in the reversed order.
  for (CastInst *I : llvm::reverse(ExtInsts)) {
    if (Constant *C = dyn_cast<Constant>(Current)) {
      // Try to constant fold the cast.
      Current = ConstantFoldCastOperand(I->getOpcode(), C, I->getType(), DL);
      if (Current)
        continue;
    }

    Instruction *Ext = I->clone();
    Ext->setOperand(0, Current);
    Ext->insertBefore(*IP->getParent(), IP);
    Current = Ext;
  }
  return Current;
}

Value *ConstantOffsetExtractor::rebuildWithoutConstOffset() {
  distributeExtsAndCloneChain(UserChain.size() - 1);
  // Remove all nullptrs (used to be s/zext) from UserChain.
  unsigned NewSize = 0;
  for (User *I : UserChain) {
    if (I != nullptr) {
      UserChain[NewSize] = I;
      NewSize++;
    }
  }
  UserChain.resize(NewSize);
  return removeConstOffset(UserChain.size() - 1);
}

Value *
ConstantOffsetExtractor::distributeExtsAndCloneChain(unsigned ChainIndex) {
  User *U = UserChain[ChainIndex];
  if (ChainIndex == 0) {
    assert(isa<ConstantInt>(U));
    // If U is a ConstantInt, applyExts will return a ConstantInt as well.
    return UserChain[ChainIndex] = cast<ConstantInt>(applyExts(U));
  }

  if (CastInst *Cast = dyn_cast<CastInst>(U)) {
    assert(
        (isa<SExtInst>(Cast) || isa<ZExtInst>(Cast) || isa<TruncInst>(Cast)) &&
        "Only following instructions can be traced: sext, zext & trunc");
    ExtInsts.push_back(Cast);
    UserChain[ChainIndex] = nullptr;
    return distributeExtsAndCloneChain(ChainIndex - 1);
  }

  // Function find only traces into BinaryOperator and CastInst.
  BinaryOperator *BO = cast<BinaryOperator>(U);
  // OpNo = which operand of BO is UserChain[ChainIndex - 1]
  unsigned OpNo = (BO->getOperand(0) == UserChain[ChainIndex - 1] ?
                       0 : 1);
  Value *TheOther = applyExts(BO->getOperand(1 - OpNo));
  Value *NextInChain = distributeExtsAndCloneChain(ChainIndex - 1);

  BinaryOperator *NewBO = nullptr;
  if (OpNo == 0) {
    NewBO = BinaryOperator::Create(BO->getOpcode(), NextInChain, TheOther,
                                   BO->getName(), IP);
  } else {
    NewBO = BinaryOperator::Create(BO->getOpcode(), TheOther, NextInChain,
                                   BO->getName(), IP);
  }
  return UserChain[ChainIndex] = NewBO;
}

Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
  if (ChainIndex == 0) {
    assert(isa<ConstantInt>(UserChain[ChainIndex]));
    return ConstantInt::getNullValue(UserChain[ChainIndex]->getType());
  }

  BinaryOperator *BO = cast<BinaryOperator>(UserChain[ChainIndex]);
  assert((BO->use_empty() || BO->hasOneUse()) &&
         "distributeExtsAndCloneChain clones each BinaryOperator in "
         "UserChain, so no one should be used more than "
         "once");

  unsigned OpNo = (BO->getOperand(0) == UserChain[ChainIndex - 1] ? 0 : 1);
  assert(BO->getOperand(OpNo) == UserChain[ChainIndex - 1]);
  Value *NextInChain = removeConstOffset(ChainIndex - 1);
  Value *TheOther = BO->getOperand(1 - OpNo);

  // If NextInChain is 0 and not the LHS of a sub, we can simplify the
  // sub-expression to be just TheOther.
  if (ConstantInt *CI = dyn_cast<ConstantInt>(NextInChain)) {
    if (CI->isZero() && !(BO->getOpcode() == Instruction::Sub && OpNo == 0))
      return TheOther;
  }

  BinaryOperator::BinaryOps NewOp = BO->getOpcode();
  if (BO->getOpcode() == Instruction::Or) {
    // Rebuild "or" as "add", because "or" may be invalid for the new
    // expression.
    //
    // For instance, given
    //   a | (b + 5) where a and b + 5 have no common bits,
    // we can extract 5 as the constant offset.
    //
    // However, reusing the "or" in the new index would give us
    //   (a | b) + 5
    // which does not equal a | (b + 5).
    //
    // Replacing the "or" with "add" is fine, because
    //   a | (b + 5) = a + (b + 5) = (a + b) + 5
    NewOp = Instruction::Add;
  }

  BinaryOperator *NewBO;
  if (OpNo == 0) {
    NewBO = BinaryOperator::Create(NewOp, NextInChain, TheOther, "", IP);
  } else {
    NewBO = BinaryOperator::Create(NewOp, TheOther, NextInChain, "", IP);
  }
  NewBO->takeName(BO);
  return NewBO;
}

Value *ConstantOffsetExtractor::Extract(Value *Idx, GetElementPtrInst *GEP,
                                        User *&UserChainTail) {
  ConstantOffsetExtractor Extractor(GEP->getIterator());
  // Find a non-zero constant offset first.
  APInt ConstantOffset =
      Extractor.find(Idx, /* SignExtended */ false, /* ZeroExtended */ false,
                     GEP->isInBounds());
  if (ConstantOffset == 0) {
    UserChainTail = nullptr;
    return nullptr;
  }
  // Separates the constant offset from the GEP index.
  Value *IdxWithoutConstOffset = Extractor.rebuildWithoutConstOffset();
  UserChainTail = Extractor.UserChain.back();
  return IdxWithoutConstOffset;
}

int64_t ConstantOffsetExtractor::Find(Value *Idx, GetElementPtrInst *GEP) {
  // If Idx is an index of an inbound GEP, Idx is guaranteed to be non-negative.
  return ConstantOffsetExtractor(GEP->getIterator())
      .find(Idx, /* SignExtended */ false, /* ZeroExtended */ false,
            GEP->isInBounds())
      .getSExtValue();
}

bool SeparateConstOffsetFromGEP::canonicalizeArrayIndicesToIndexSize(
    GetElementPtrInst *GEP) {
  bool Changed = false;
  Type *PtrIdxTy = DL->getIndexType(GEP->getType());
  gep_type_iterator GTI = gep_type_begin(*GEP);
  for (User::op_iterator I = GEP->op_begin() + 1, E = GEP->op_end();
       I != E; ++I, ++GTI) {
    // Skip struct member indices which must be i32.
    if (GTI.isSequential()) {
      if ((*I)->getType() != PtrIdxTy) {
        *I = CastInst::CreateIntegerCast(*I, PtrIdxTy, true, "idxprom",
                                         GEP->getIterator());
        Changed = true;
      }
    }
  }
  return Changed;
}

int64_t
SeparateConstOffsetFromGEP::accumulateByteOffset(GetElementPtrInst *GEP,
                                                 bool &NeedsExtraction) {
  NeedsExtraction = false;
  int64_t AccumulativeByteOffset = 0;
  gep_type_iterator GTI = gep_type_begin(*GEP);
  for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
    if (GTI.isSequential()) {
      // Constant offsets of scalable types are not really constant.
      if (GTI.getIndexedType()->isScalableTy())
        continue;

      // Tries to extract a constant offset from this GEP index.
      int64_t ConstantOffset =
          ConstantOffsetExtractor::Find(GEP->getOperand(I), GEP);
      if (ConstantOffset != 0) {
        NeedsExtraction = true;
        // A GEP may have multiple indices. We accumulate the extracted
        // constant offset to a byte offset, and later offset the remainder of
        // the original GEP with this byte offset.
        AccumulativeByteOffset +=
            ConstantOffset * GTI.getSequentialElementStride(*DL);
      }
    } else if (LowerGEP) {
      StructType *StTy = GTI.getStructType();
      uint64_t Field = cast<ConstantInt>(GEP->getOperand(I))->getZExtValue();
      // Skip field 0 as the offset is always 0.
      if (Field != 0) {
        NeedsExtraction = true;
        AccumulativeByteOffset +=
            DL->getStructLayout(StTy)->getElementOffset(Field);
      }
    }
  }
  return AccumulativeByteOffset;
}

void SeparateConstOffsetFromGEP::lowerToSingleIndexGEPs(
    GetElementPtrInst *Variadic, int64_t AccumulativeByteOffset) {
  IRBuilder<> Builder(Variadic);
  Type *PtrIndexTy = DL->getIndexType(Variadic->getType());

  Value *ResultPtr = Variadic->getOperand(0);
  Loop *L = LI->getLoopFor(Variadic->getParent());
  // The base is a candidate for the GEP-swapping optimization below only if it
  // is loop invariant and not used more than once in the loop.
  bool isSwapCandidate =
      L && L->isLoopInvariant(ResultPtr) &&
      !hasMoreThanOneUseInLoop(ResultPtr, L);
  Value *FirstResult = nullptr;

  gep_type_iterator GTI = gep_type_begin(*Variadic);
  // Create an ugly GEP for each sequential index. We don't create GEPs for
  // structure indices, as they are accumulated in the constant offset index.
  for (unsigned I = 1, E = Variadic->getNumOperands(); I != E; ++I, ++GTI) {
    if (GTI.isSequential()) {
      Value *Idx = Variadic->getOperand(I);
      // Skip zero indices.
      if (ConstantInt *CI = dyn_cast<ConstantInt>(Idx))
        if (CI->isZero())
          continue;

      APInt ElementSize = APInt(PtrIndexTy->getIntegerBitWidth(),
                                GTI.getSequentialElementStride(*DL));
      // Scale the index by element size.
      if (ElementSize != 1) {
        if (ElementSize.isPowerOf2()) {
          Idx = Builder.CreateShl(
              Idx, ConstantInt::get(PtrIndexTy, ElementSize.logBase2()));
        } else {
          Idx =
              Builder.CreateMul(Idx, ConstantInt::get(PtrIndexTy, ElementSize));
        }
      }
      // Create an ugly GEP with a single index for each index.
      ResultPtr = Builder.CreatePtrAdd(ResultPtr, Idx, "uglygep");
      if (FirstResult == nullptr)
        FirstResult = ResultPtr;
    }
  }

  // Create a GEP with the constant offset index.
  if (AccumulativeByteOffset != 0) {
    Value *Offset = ConstantInt::get(PtrIndexTy, AccumulativeByteOffset);
    ResultPtr = Builder.CreatePtrAdd(ResultPtr, Offset, "uglygep");
  } else
    isSwapCandidate = false;

  // If we created a GEP with constant index, and the base is loop invariant,
  // then we swap the first one with it, so LICM can move constant GEP out
  // later.
  auto *FirstGEP = dyn_cast_or_null<GetElementPtrInst>(FirstResult);
  auto *SecondGEP = dyn_cast<GetElementPtrInst>(ResultPtr);
  if (isSwapCandidate && isLegalToSwapOperand(FirstGEP, SecondGEP, L))
    swapGEPOperand(FirstGEP, SecondGEP);

  Variadic->replaceAllUsesWith(ResultPtr);
  Variadic->eraseFromParent();
}

void
SeparateConstOffsetFromGEP::lowerToArithmetics(GetElementPtrInst *Variadic,
                                               int64_t AccumulativeByteOffset) {
  IRBuilder<> Builder(Variadic);
  Type *IntPtrTy = DL->getIntPtrType(Variadic->getType());
  assert(IntPtrTy == DL->getIndexType(Variadic->getType()) &&
         "Pointer type must match index type for arithmetic-based lowering of "
         "split GEPs");

  Value *ResultPtr = Builder.CreatePtrToInt(Variadic->getOperand(0), IntPtrTy);
  gep_type_iterator GTI = gep_type_begin(*Variadic);
  // Create ADD/SHL/MUL arithmetic operations for each sequential index. We
  // don't create arithmetic operations for structure indices, as they are
  // accumulated in the constant offset index.
  for (unsigned I = 1, E = Variadic->getNumOperands(); I != E; ++I, ++GTI) {
    if (GTI.isSequential()) {
      Value *Idx = Variadic->getOperand(I);
      // Skip zero indices.
      if (ConstantInt *CI = dyn_cast<ConstantInt>(Idx))
        if (CI->isZero())
          continue;

      APInt ElementSize = APInt(IntPtrTy->getIntegerBitWidth(),
                                GTI.getSequentialElementStride(*DL));
      // Scale the index by element size.
      if (ElementSize != 1) {
        if (ElementSize.isPowerOf2()) {
          Idx = Builder.CreateShl(
              Idx, ConstantInt::get(IntPtrTy, ElementSize.logBase2()));
        } else {
          Idx = Builder.CreateMul(Idx, ConstantInt::get(IntPtrTy, ElementSize));
        }
      }
      // Create an ADD for each index.
      ResultPtr = Builder.CreateAdd(ResultPtr, Idx);
    }
  }

  // Create an ADD for the constant offset index.
  if (AccumulativeByteOffset != 0) {
    ResultPtr = Builder.CreateAdd(
        ResultPtr, ConstantInt::get(IntPtrTy, AccumulativeByteOffset));
  }

  ResultPtr = Builder.CreateIntToPtr(ResultPtr, Variadic->getType());
  Variadic->replaceAllUsesWith(ResultPtr);
  Variadic->eraseFromParent();
}

bool SeparateConstOffsetFromGEP::reorderGEP(GetElementPtrInst *GEP,
                                            TargetTransformInfo &TTI) {
  auto PtrGEP = dyn_cast<GetElementPtrInst>(GEP->getPointerOperand());
  if (!PtrGEP)
    return false;

  bool NestedNeedsExtraction;
  int64_t NestedByteOffset =
      accumulateByteOffset(PtrGEP, NestedNeedsExtraction);
  if (!NestedNeedsExtraction)
    return false;

  unsigned AddrSpace = PtrGEP->getPointerAddressSpace();
  if (!TTI.isLegalAddressingMode(GEP->getResultElementType(),
                                 /*BaseGV=*/nullptr, NestedByteOffset,
                                 /*HasBaseReg=*/true, /*Scale=*/0, AddrSpace))
    return false;

  bool GEPInBounds = GEP->isInBounds();
  bool PtrGEPInBounds = PtrGEP->isInBounds();
  bool IsChainInBounds = GEPInBounds && PtrGEPInBounds;
  if (IsChainInBounds) {
    auto IsKnownNonNegative = [this](Value *V) {
      return isKnownNonNegative(V, *DL);
    };
    IsChainInBounds &= all_of(GEP->indices(), IsKnownNonNegative);
    if (IsChainInBounds)
      IsChainInBounds &= all_of(PtrGEP->indices(), IsKnownNonNegative);
  }

  IRBuilder<> Builder(GEP);
  // For trivial GEP chains, we can swap the indices.
  Value *NewSrc = Builder.CreateGEP(
      GEP->getSourceElementType(), PtrGEP->getPointerOperand(),
      SmallVector<Value *, 4>(GEP->indices()), "", IsChainInBounds);
  Value *NewGEP = Builder.CreateGEP(PtrGEP->getSourceElementType(), NewSrc,
                                    SmallVector<Value *, 4>(PtrGEP->indices()),
                                    "", IsChainInBounds);
  GEP->replaceAllUsesWith(NewGEP);
  RecursivelyDeleteTriviallyDeadInstructions(GEP);
  return true;
}

bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
  // Skip vector GEPs.
  if (GEP->getType()->isVectorTy())
    return false;

  // The backend can already nicely handle the case where all indices are
  // constant.
  if (GEP->hasAllConstantIndices())
    return false;

  bool Changed = canonicalizeArrayIndicesToIndexSize(GEP);

  bool NeedsExtraction;
  int64_t AccumulativeByteOffset = accumulateByteOffset(GEP, NeedsExtraction);

  TargetTransformInfo &TTI = GetTTI(*GEP->getFunction());

  if (!NeedsExtraction) {
    Changed |= reorderGEP(GEP, TTI);
    return Changed;
  }

  // If LowerGEP is disabled, before really splitting the GEP, check whether the
  // backend supports the addressing mode we are about to produce. If not, this
  // splitting probably won't be beneficial.
  // If LowerGEP is enabled, even if the extracted constant offset cannot match
  // the addressing mode, we can still do optimizations on the other lowered
  // parts of the variable indices. Therefore, we don't check for addressing
  // modes in that case.
  if (!LowerGEP) {
    unsigned AddrSpace = GEP->getPointerAddressSpace();
    if (!TTI.isLegalAddressingMode(GEP->getResultElementType(),
                                   /*BaseGV=*/nullptr, AccumulativeByteOffset,
                                   /*HasBaseReg=*/true, /*Scale=*/0,
                                   AddrSpace)) {
      return Changed;
    }
  }

  // Remove the constant offset in each sequential index. The resultant GEP
  // computes the variadic base.
  // Notice that we don't remove struct field indices here.
  // If LowerGEP is disabled, a structure index is not accumulated and we still
  // use the old one. If LowerGEP is enabled, a structure index is accumulated
  // in the constant offset. lowerToSingleIndexGEPs or lowerToArithmetics will
  // later handle the constant offset and won't need a new structure index.
  gep_type_iterator GTI = gep_type_begin(*GEP);
  for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
    if (GTI.isSequential()) {
      // Constant offsets of scalable types are not really constant.
      if (GTI.getIndexedType()->isScalableTy())
        continue;

      // Splits this GEP index into a variadic part and a constant offset, and
      // uses the variadic part as the new index.
      Value *OldIdx = GEP->getOperand(I);
      User *UserChainTail;
      Value *NewIdx =
          ConstantOffsetExtractor::Extract(OldIdx, GEP, UserChainTail);
      if (NewIdx != nullptr) {
        // Switches to the index with the constant offset removed.
        GEP->setOperand(I, NewIdx);
        // After switching to the new index, we can garbage-collect UserChain
        // and the old index if they are not used.
        RecursivelyDeleteTriviallyDeadInstructions(UserChainTail);
        RecursivelyDeleteTriviallyDeadInstructions(OldIdx);
      }
    }
  }

  // Clear the inbounds attribute because the new index may be off-bound.
  // e.g.,
  //
  //   b     = add i64 a, 5
  //   addr  = gep inbounds float, float* p, i64 b
  //
  // is transformed to:
  //
  //   addr2 = gep float, float* p, i64 a ; inbounds removed
  //   addr  = gep inbounds float, float* addr2, i64 5
  //
  // If a is -4, although the old index b is in bounds, the new index a is
  // off-bound. http://llvm.org/docs/LangRef.html#id181 says "if the
  // inbounds keyword is not present, the offsets are added to the base
  // address with silently-wrapping two's complement arithmetic".
  // Therefore, the final code will be semantically equivalent.
  //
  // TODO(jingyue): do some range analysis to keep as many inbounds as
  // possible. GEPs with inbounds are more friendly to alias analysis.
  // TODO(gep_nowrap): Preserve nuw at least.
  bool GEPWasInBounds = GEP->isInBounds();
  GEP->setNoWrapFlags(GEPNoWrapFlags::none());

  // Lowers a GEP to either GEPs with a single index or arithmetic operations.
  if (LowerGEP) {
    // As currently BasicAA does not analyze ptrtoint/inttoptr, do not lower to
    // arithmetic operations if the target uses alias analysis in codegen.
    // Additionally, pointers that aren't integral (and so can't be safely
    // converted to integers) or those whose offset size is different from their
    // pointer size (which means that doing integer arithmetic on them could
    // affect that data) can't be lowered in this way.
    unsigned AddrSpace = GEP->getPointerAddressSpace();
    bool PointerHasExtraData = DL->getPointerSizeInBits(AddrSpace) !=
                               DL->getIndexSizeInBits(AddrSpace);
    if (TTI.useAA() || DL->isNonIntegralAddressSpace(AddrSpace) ||
        PointerHasExtraData)
      lowerToSingleIndexGEPs(GEP, AccumulativeByteOffset);
    else
      lowerToArithmetics(GEP, AccumulativeByteOffset);
    return true;
  }

  // No need to create another GEP if the accumulative byte offset is 0.
  if (AccumulativeByteOffset == 0)
    return true;

  // Offsets the base with the accumulative byte offset.
  //
  //   %gep                        ; the base
  //   ... %gep ...
  //
  // => add the offset
  //
  //   %gep2                       ; clone of %gep
  //   %new.gep = gep i8, %gep2, %offset
  //   %gep                        ; will be removed
  //   ... %gep ...
  //
  // => replace all uses of %gep with %new.gep and remove %gep
  //
  //   %gep2                       ; clone of %gep
  //   %new.gep = gep i8, %gep2, %offset
  //   ... %new.gep ...
  Instruction *NewGEP = GEP->clone();
  NewGEP->insertBefore(GEP);

  Type *PtrIdxTy = DL->getIndexType(GEP->getType());
  IRBuilder<> Builder(GEP);
  NewGEP = cast<Instruction>(Builder.CreatePtrAdd(
      NewGEP, ConstantInt::get(PtrIdxTy, AccumulativeByteOffset, true),
      GEP->getName(), GEPWasInBounds));
  NewGEP->copyMetadata(*GEP);

  GEP->replaceAllUsesWith(NewGEP);
  GEP->eraseFromParent();

  return true;
}

bool SeparateConstOffsetFromGEPLegacyPass::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;
  auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
  auto GetTTI = [this](Function &F) -> TargetTransformInfo & {
    return this->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
  };
  SeparateConstOffsetFromGEP Impl(DT, LI, TLI, GetTTI, LowerGEP);
  return Impl.run(F);
}

bool SeparateConstOffsetFromGEP::run(Function &F) {
  if (DisableSeparateConstOffsetFromGEP)
    return false;

  DL = &F.getDataLayout();
  bool Changed = false;
  for (BasicBlock &B : F) {
    if (!DT->isReachableFromEntry(&B))
      continue;

    for (Instruction &I : llvm::make_early_inc_range(B))
      if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I))
        Changed |= splitGEP(GEP);
    // No need to split GEP ConstantExprs because all their indices are
    // constant already.
  }

  Changed |= reuniteExts(F);

  if (VerifyNoDeadCode)
    verifyNoDeadCode(F);

  return Changed;
}

Instruction *SeparateConstOffsetFromGEP::findClosestMatchingDominator(
    ExprKey Key, Instruction *Dominatee,
    DenseMap<ExprKey, SmallVector<Instruction *, 2>> &DominatingExprs) {
  auto Pos = DominatingExprs.find(Key);
  if (Pos == DominatingExprs.end())
    return nullptr;

  auto &Candidates = Pos->second;
  // Because we process the basic blocks in pre-order of the dominator tree, a
  // candidate that doesn't dominate the current instruction won't dominate any
  // future instruction either. Therefore, we pop it out of the stack. This
  // optimization makes the algorithm O(n).
  while (!Candidates.empty()) {
    Instruction *Candidate = Candidates.back();
    if (DT->dominates(Candidate, Dominatee))
      return Candidate;
    Candidates.pop_back();
  }
  return nullptr;
}

bool SeparateConstOffsetFromGEP::reuniteExts(Instruction *I) {
  if (!I->getType()->isIntOrIntVectorTy())
    return false;

  // Dom: LHS+RHS
  // I: sext(LHS)+sext(RHS)
  // If Dom can't sign overflow and Dom dominates I, optimize I to sext(Dom).
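  //
  // A rough illustration (the value names are invented for this comment):
  //   %dom = add nsw i32 %a, %b   ; dominates %i and can't sign overflow
  //   ...
  //   %x = sext i32 %a to i64
  //   %y = sext i32 %b to i64
  //   %i = add i64 %x, %y         ; can be rewritten as sext i32 %dom to i64
  //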
  // TODO: handle zext
  Value *LHS = nullptr, *RHS = nullptr;
  if (match(I, m_Add(m_SExt(m_Value(LHS)), m_SExt(m_Value(RHS))))) {
    if (LHS->getType() == RHS->getType()) {
      ExprKey Key = createNormalizedCommutablePair(LHS, RHS);
      if (auto *Dom = findClosestMatchingDominator(Key, I, DominatingAdds)) {
        Instruction *NewSExt =
            new SExtInst(Dom, I->getType(), "", I->getIterator());
        NewSExt->takeName(I);
        I->replaceAllUsesWith(NewSExt);
        NewSExt->setDebugLoc(I->getDebugLoc());
        RecursivelyDeleteTriviallyDeadInstructions(I);
        return true;
      }
    }
  } else if (match(I, m_Sub(m_SExt(m_Value(LHS)), m_SExt(m_Value(RHS))))) {
    if (LHS->getType() == RHS->getType()) {
      if (auto *Dom =
              findClosestMatchingDominator({LHS, RHS}, I, DominatingSubs)) {
        Instruction *NewSExt =
            new SExtInst(Dom, I->getType(), "", I->getIterator());
        NewSExt->takeName(I);
        I->replaceAllUsesWith(NewSExt);
        NewSExt->setDebugLoc(I->getDebugLoc());
        RecursivelyDeleteTriviallyDeadInstructions(I);
        return true;
      }
    }
  }

  // Add I to DominatingExprs if it's an add/sub that can't sign overflow.
  if (match(I, m_NSWAdd(m_Value(LHS), m_Value(RHS)))) {
    if (programUndefinedIfPoison(I)) {
      ExprKey Key = createNormalizedCommutablePair(LHS, RHS);
      DominatingAdds[Key].push_back(I);
    }
  } else if (match(I, m_NSWSub(m_Value(LHS), m_Value(RHS)))) {
    if (programUndefinedIfPoison(I))
      DominatingSubs[{LHS, RHS}].push_back(I);
  }
  return false;
}

bool SeparateConstOffsetFromGEP::reuniteExts(Function &F) {
  bool Changed = false;
  DominatingAdds.clear();
  DominatingSubs.clear();
  for (const auto Node : depth_first(DT)) {
    BasicBlock *BB = Node->getBlock();
    for (Instruction &I : llvm::make_early_inc_range(*BB))
      Changed |= reuniteExts(&I);
  }
  return Changed;
}

void SeparateConstOffsetFromGEP::verifyNoDeadCode(Function &F) {
  for (BasicBlock &B : F) {
    for (Instruction &I : B) {
      if (isInstructionTriviallyDead(&I)) {
        std::string ErrMessage;
        raw_string_ostream RSO(ErrMessage);
        RSO << "Dead instruction detected!\n" << I << "\n";
        llvm_unreachable(RSO.str().c_str());
      }
    }
  }
}

bool SeparateConstOffsetFromGEP::isLegalToSwapOperand(
    GetElementPtrInst *FirstGEP, GetElementPtrInst *SecondGEP, Loop *CurLoop) {
  if (!FirstGEP || !FirstGEP->hasOneUse())
    return false;

  if (!SecondGEP || FirstGEP->getParent() != SecondGEP->getParent())
    return false;

  if (FirstGEP == SecondGEP)
    return false;

  unsigned FirstNum = FirstGEP->getNumOperands();
  unsigned SecondNum = SecondGEP->getNumOperands();
  // Give up if the number of operands is not 2.
  if (FirstNum != SecondNum || FirstNum != 2)
    return false;

  Value *FirstBase = FirstGEP->getOperand(0);
  Value *SecondBase = SecondGEP->getOperand(0);
  Value *FirstOffset = FirstGEP->getOperand(1);
  // Give up if the index of the first GEP is loop invariant.
  if (CurLoop->isLoopInvariant(FirstOffset))
    return false;

  // Give up if the bases don't have the same type.
  if (FirstBase->getType() != SecondBase->getType())
    return false;

  Instruction *FirstOffsetDef = dyn_cast<Instruction>(FirstOffset);

  // Check if the second operand of the first GEP has a constant coefficient.
  // For example, in the following code, we won't gain anything by hoisting
  // the second GEP out because the second GEP can be folded away.
  //   %scevgep.sum.ur159 = add i64 %idxprom48.ur, 256
  //   %67 = shl i64 %scevgep.sum.ur159, 2
  //   %uglygep160 = getelementptr i8* %65, i64 %67
  //   %uglygep161 = getelementptr i8* %uglygep160, i64 -1024

  // Skip a constant shift instruction which may be generated by splitting GEPs.
  if (FirstOffsetDef && FirstOffsetDef->isShift() &&
      isa<ConstantInt>(FirstOffsetDef->getOperand(1)))
    FirstOffsetDef = dyn_cast<Instruction>(FirstOffsetDef->getOperand(0));

  // Give up if FirstOffsetDef is an Add or Sub with a constant, because the
  // swap may not be profitable at all due to constant folding.
  if (FirstOffsetDef)
    if (BinaryOperator *BO = dyn_cast<BinaryOperator>(FirstOffsetDef)) {
      unsigned opc = BO->getOpcode();
      if ((opc == Instruction::Add || opc == Instruction::Sub) &&
          (isa<ConstantInt>(BO->getOperand(0)) ||
           isa<ConstantInt>(BO->getOperand(1))))
        return false;
    }
  return true;
}

bool SeparateConstOffsetFromGEP::hasMoreThanOneUseInLoop(Value *V, Loop *L) {
  int UsesInLoop = 0;
  for (User *U : V->users()) {
    if (Instruction *User = dyn_cast<Instruction>(U))
      if (L->contains(User))
        if (++UsesInLoop > 1)
          return true;
  }
  return false;
}

void SeparateConstOffsetFromGEP::swapGEPOperand(GetElementPtrInst *First,
                                                GetElementPtrInst *Second) {
  Value *Offset1 = First->getOperand(1);
  Value *Offset2 = Second->getOperand(1);
  First->setOperand(1, Offset2);
  Second->setOperand(1, Offset1);

  // We changed p+o+c to p+c+o; p+c may not be inbounds anymore.
  const DataLayout &DAL = First->getDataLayout();
  APInt Offset(DAL.getIndexSizeInBits(
                   cast<PointerType>(First->getType())->getAddressSpace()),
               0);
  Value *NewBase =
      First->stripAndAccumulateInBoundsConstantOffsets(DAL, Offset);
  uint64_t ObjectSize;
  if (!getObjectSize(NewBase, ObjectSize, DAL, TLI) ||
      Offset.ugt(ObjectSize)) {
    // TODO(gep_nowrap): Make flag preservation more precise.
    First->setNoWrapFlags(GEPNoWrapFlags::none());
    Second->setNoWrapFlags(GEPNoWrapFlags::none());
  } else
    First->setIsInBounds(true);
}

void SeparateConstOffsetFromGEPPass::printPipeline(
    raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
  static_cast<PassInfoMixin<SeparateConstOffsetFromGEPPass> *>(this)
      ->printPipeline(OS, MapClassName2PassName);
  OS << '<';
  if (LowerGEP)
    OS << "lower-gep";
  OS << '>';
}

PreservedAnalyses
SeparateConstOffsetFromGEPPass::run(Function &F, FunctionAnalysisManager &AM) {
  auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
  auto *LI = &AM.getResult<LoopAnalysis>(F);
  auto *TLI = &AM.getResult<TargetLibraryAnalysis>(F);
  auto GetTTI = [&AM](Function &F) -> TargetTransformInfo & {
    return AM.getResult<TargetIRAnalysis>(F);
  };
  SeparateConstOffsetFromGEP Impl(DT, LI, TLI, GetTTI, LowerGEP);
  if (!Impl.run(F))
    return PreservedAnalyses::all();
  PreservedAnalyses PA;
  PA.preserveSet<CFGAnalyses>();
  return PA;
}