//===- SeparateConstOffsetFromGEP.cpp -------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Loop unrolling may create many similar GEPs for array accesses.
// e.g., a 2-level loop
//
//   float a[32][32];  // global variable
//
//   for (int i = 0; i < 2; ++i) {
//     for (int j = 0; j < 2; ++j) {
//       ...
//       ... = a[x + i][y + j];
//       ...
//     }
//   }
//
// will probably be unrolled to:
//
//   gep %a, 0, %x, %y; load
//   gep %a, 0, %x, %y + 1; load
//   gep %a, 0, %x + 1, %y; load
//   gep %a, 0, %x + 1, %y + 1; load
//
// LLVM's GVN does not use partial redundancy elimination yet, and is thus
// unable to reuse (gep %a, 0, %x, %y). As a result, this misoptimization incurs
// a significant slowdown on targets with limited addressing modes. For
// instance, because the PTX target does not support the reg+reg addressing
// mode, the NVPTX backend emits PTX code that literally computes the pointer
// address of each GEP, wasting many registers. It emits the following PTX for
// the first load and similar PTX for the other loads.
//
//   mov.u32         %r1, %x;
//   mov.u32         %r2, %y;
//   mul.wide.u32    %rl2, %r1, 128;
//   mov.u64         %rl3, a;
//   add.s64         %rl4, %rl3, %rl2;
//   mul.wide.u32    %rl5, %r2, 4;
//   add.s64         %rl6, %rl4, %rl5;
//   ld.global.f32   %f1, [%rl6];
//
// To reduce the register pressure, the optimization implemented in this file
// merges the common part of a group of GEPs, so we can compute each pointer
// address by adding a simple offset to the common part, saving many registers.
//
// It works by splitting each GEP into a variadic base and a constant offset.
// The variadic base can be computed once and reused by multiple GEPs, and the
// constant offsets can be nicely folded into the reg+immediate addressing mode
// (supported by most targets) without using any extra register.
//
// For instance, we transform the four GEPs and four loads in the above example
// into:
//
//   base = gep a, 0, x, y
//   load base
//   load base + 1  * sizeof(float)
//   load base + 32 * sizeof(float)
//   load base + 33 * sizeof(float)
//
// Given the transformed IR, a backend that supports the reg+immediate
// addressing mode can easily fold the pointer arithmetic into the loads. For
// example, the NVPTX backend can easily fold the pointer arithmetic into the
// ld.global.f32 instructions, and the resultant PTX uses many fewer registers.
//
//   mov.u32         %r1, %x;
//   mov.u32         %r2, %y;
//   mul.wide.u32    %rl2, %r1, 128;
//   mov.u64         %rl3, a;
//   add.s64         %rl4, %rl3, %rl2;
//   mul.wide.u32    %rl5, %r2, 4;
//   add.s64         %rl6, %rl4, %rl5;
//   ld.global.f32   %f1, [%rl6];     // so far the same as the unoptimized PTX
//   ld.global.f32   %f2, [%rl6+4];   // much better
//   ld.global.f32   %f3, [%rl6+128]; // much better
//   ld.global.f32   %f4, [%rl6+132]; // much better
//
// Another improvement enabled by the LowerGEP flag is to lower a GEP with
// multiple indices to either multiple GEPs with a single index or arithmetic
// operations (depending on whether the target uses alias analysis in codegen).
// Such transformation can have the following benefits:
// (1) It can always extract constants in the indices of structure types.
// (2) After such lowering, there are more optimization opportunities such as
//     CSE, LICM and CGP.
//
// E.g. The following GEPs have multiple indices:
//  BB1:
//    %p = getelementptr [10 x %struct]* %ptr, i64 %i, i64 %j1, i32 3
//    load %p
//    ...
//  BB2:
//    %p2 = getelementptr [10 x %struct]* %ptr, i64 %i, i64 %j2, i32 2
//    load %p2
//    ...
//
// In this form, we cannot do CSE on the common part related to index
// "i64 %i". Lowering GEPs enables such CSE.
// If the target does not use alias analysis in codegen, this pass will
// lower a GEP with multiple indices into arithmetic operations:
//  BB1:
//    %1 = ptrtoint [10 x %struct]* %ptr to i64    ; CSE opportunity
//    %2 = mul i64 %i, length_of_10xstruct         ; CSE opportunity
//    %3 = add i64 %1, %2                          ; CSE opportunity
//    %4 = mul i64 %j1, length_of_struct
//    %5 = add i64 %3, %4
//    %6 = add i64 %5, struct_field_3              ; Constant offset
//    %p = inttoptr i64 %6 to i32*
//    load %p
//    ...
//  BB2:
//    %7 = ptrtoint [10 x %struct]* %ptr to i64    ; CSE opportunity
//    %8 = mul i64 %i, length_of_10xstruct         ; CSE opportunity
//    %9 = add i64 %7, %8                          ; CSE opportunity
//    %10 = mul i64 %j2, length_of_struct
//    %11 = add i64 %9, %10
//    %12 = add i64 %11, struct_field_2            ; Constant offset
//    %p2 = inttoptr i64 %12 to i32*
//    load %p2
//    ...
//
// If the target uses alias analysis in codegen, this pass will lower a GEP
// with multiple indices into multiple GEPs with a single index:
//  BB1:
//    %1 = bitcast [10 x %struct]* %ptr to i8*     ; CSE opportunity
//    %2 = mul i64 %i, length_of_10xstruct         ; CSE opportunity
//    %3 = getelementptr i8* %1, i64 %2            ; CSE opportunity
//    %4 = mul i64 %j1, length_of_struct
//    %5 = getelementptr i8* %3, i64 %4
//    %6 = getelementptr i8* %5, struct_field_3    ; Constant offset
//    %p = bitcast i8* %6 to i32*
//    load %p
//    ...
//  BB2:
//    %7 = bitcast [10 x %struct]* %ptr to i8*     ; CSE opportunity
//    %8 = mul i64 %i, length_of_10xstruct         ; CSE opportunity
//    %9 = getelementptr i8* %7, i64 %8            ; CSE opportunity
//    %10 = mul i64 %j2, length_of_struct
//    %11 = getelementptr i8* %9, i64 %10
//    %12 = getelementptr i8* %11, struct_field_2  ; Constant offset
//    %p2 = bitcast i8* %12 to i32*
//    load %p2
//    ...
//
// Lowering GEPs can also benefit other passes such as LICM and CGP.
// LICM (Loop Invariant Code Motion) cannot hoist/sink a GEP with multiple
// indices if any one of the indices is variant. If we lower such a GEP into
// invariant parts and variant parts, LICM can hoist/sink those invariant
// parts.
// CGP (CodeGen Prepare) tries to sink address calculations that match the
// target's addressing modes. A GEP with multiple indices may not match and
// will not be sunk. If we lower such a GEP into smaller parts, CGP may sink
// some of them, so we end up with a better addressing mode.
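//
// As a minimal single-index illustration (hand-written IR, not taken from a
// test; see splitGEP below for the real transformation), the split performed
// by this pass rewrites
//
//   %b    = add i64 %a, 5
//   %addr = getelementptr inbounds float, float* %p, i64 %b
//
// into
//
//   %base = getelementptr float, float* %p, i64 %a          ; inbounds dropped
//   %addr = getelementptr inbounds float, float* %base, i64 5
//
// so that %base can be reused by GEPs that differ only in the constant.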
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Scalar/SeparateConstOffsetFromGEP.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
#include <cstdint>
#include <string>

using namespace llvm;
using namespace llvm::PatternMatch;

static cl::opt<bool> DisableSeparateConstOffsetFromGEP(
    "disable-separate-const-offset-from-gep", cl::init(false),
    cl::desc("Do not separate the constant offset from a GEP instruction"),
    cl::Hidden);

// Setting this flag may emit false positives when the input module already
// contains dead instructions. Therefore, we set it only in unit tests that are
// free of dead code.
static cl::opt<bool>
    VerifyNoDeadCode("reassociate-geps-verify-no-dead-code", cl::init(false),
                     cl::desc("Verify this pass produces no dead code"),
                     cl::Hidden);

namespace {

/// A helper class for separating a constant offset from a GEP index.
///
/// In real programs, a GEP index may be more complicated than a simple addition
/// of something and a constant integer which can be trivially split. For
/// example, to split ((a << 3) | 5) + b, we need to search deeper for the
/// constant offset, so that we can separate the index into (a << 3) + b and 5.
///
/// Therefore, this class looks into the expression that computes a given GEP
/// index, and tries to find a constant integer that can be hoisted to the
/// outermost level of the expression as an addition. Not every constant in an
/// expression can jump out. e.g., we cannot transform (b * (a + 5)) to (b * a +
/// 5); nor can we transform (3 * (a + 5)) to (3 * a + 5), however in this case,
/// -instcombine probably already optimized (3 * (a + 5)) to (3 * a + 15).
class ConstantOffsetExtractor {
public:
  /// Extracts a constant offset from the given GEP index. It returns the
  /// new index representing the remainder (equal to the original index minus
  /// the constant offset), or nullptr if we cannot extract a constant offset.
  ///
  /// \p Idx           The given GEP index
  /// \p GEP           The given GEP
  /// \p UserChainTail Outputs the tail of UserChain so that we can
  ///                  garbage-collect unused instructions in UserChain.
  static Value *Extract(Value *Idx, GetElementPtrInst *GEP,
                        User *&UserChainTail, const DominatorTree *DT);

  /// Looks for a constant offset from the given GEP index without extracting
  /// it. It returns the numeric value of the extracted constant offset (0 if
  /// the search fails). The meanings of the arguments are the same as in
  /// Extract.
  static int64_t Find(Value *Idx, GetElementPtrInst *GEP,
                      const DominatorTree *DT);

private:
  ConstantOffsetExtractor(Instruction *InsertionPt, const DominatorTree *DT)
      : IP(InsertionPt), DL(InsertionPt->getModule()->getDataLayout()), DT(DT) {
  }

  /// Searches the expression that computes V for a non-zero constant C s.t.
  /// V can be reassociated into the form V' + C. If the search succeeds,
  /// returns C and updates UserChain as a def-use chain from C to V;
  /// otherwise, UserChain is empty.
  ///
  /// \p V            The given expression
  /// \p SignExtended Whether V will be sign-extended in the computation of the
  ///                 GEP index
  /// \p ZeroExtended Whether V will be zero-extended in the computation of the
  ///                 GEP index
  /// \p NonNegative  Whether V is guaranteed to be non-negative. For example,
  ///                 an index of an inbounds GEP is guaranteed to be
  ///                 non-negative. Leveraging this, we can better split
  ///                 inbounds GEPs.
  APInt find(Value *V, bool SignExtended, bool ZeroExtended, bool NonNegative);

  /// A helper function to look into both operands of a binary operator.
  APInt findInEitherOperand(BinaryOperator *BO, bool SignExtended,
                            bool ZeroExtended);

  /// After finding the constant offset C from the GEP index I, we build a new
  /// index I' s.t. I' + C = I. This function builds and returns the new
  /// index I' according to UserChain produced by function "find".
  ///
  /// The building conceptually takes two steps:
  /// 1) iteratively distribute s/zext towards the leaves of the expression
  ///    tree that computes I
  /// 2) reassociate the expression tree to the form I' + C.
  ///
  /// For example, to extract the 5 from sext(a + (b + 5)), we first distribute
  /// sext to a, b and 5 so that we have
  ///   sext(a) + (sext(b) + 5).
  /// Then, we reassociate it to
  ///   (sext(a) + sext(b)) + 5.
  /// Given this form, we know I' is sext(a) + sext(b).
  Value *rebuildWithoutConstOffset();

  /// After the first step of rebuilding the GEP index without the constant
  /// offset, distribute s/zext to the operands of all operators in UserChain.
  /// e.g., zext(sext(a + (b + 5))) (assuming no overflow) =>
  /// zext(sext(a)) + (zext(sext(b)) + zext(sext(5))).
  ///
  /// The function also updates UserChain to point to new subexpressions after
  /// distributing s/zext. e.g., the old UserChain of the above example is
  /// 5 -> b + 5 -> a + (b + 5) -> sext(...) -> zext(sext(...)),
  /// and the new UserChain is
  /// zext(sext(5)) -> zext(sext(b)) + zext(sext(5)) ->
  ///   zext(sext(a)) + (zext(sext(b)) + zext(sext(5)))
  ///
  /// \p ChainIndex The index to UserChain. ChainIndex is initially
  ///               UserChain.size() - 1, and is decremented during
  ///               the recursion.
  Value *distributeExtsAndCloneChain(unsigned ChainIndex);

  /// Reassociates the GEP index to the form I' + C and returns I'.
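  /// e.g., given the UserChain built by find for "a * b + (c + 5)" (see the
  /// UserChain member below), this conceptually returns "a * b + c".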
  Value *removeConstOffset(unsigned ChainIndex);

  /// A helper function to apply ExtInsts, a list of s/zext, to value V.
  /// e.g., if ExtInsts = [sext i32 to i64, zext i16 to i32], this function
  /// returns "sext i32 (zext i16 V to i32) to i64".
  Value *applyExts(Value *V);

  /// A helper function that returns whether we can trace into the operands
  /// of binary operator BO for a constant offset.
  ///
  /// \p SignExtended Whether BO is surrounded by sext
  /// \p ZeroExtended Whether BO is surrounded by zext
  /// \p NonNegative  Whether BO is known to be non-negative, e.g., an index
  ///                 of an inbounds GEP.
  bool CanTraceInto(bool SignExtended, bool ZeroExtended, BinaryOperator *BO,
                    bool NonNegative);

  /// The path from the constant offset to the old GEP index. e.g., if the GEP
  /// index is "a * b + (c + 5)", then after running function find, UserChain[0]
  /// will be the constant 5, UserChain[1] will be the subexpression "c + 5",
  /// and UserChain[2] will be the entire expression "a * b + (c + 5)".
  ///
  /// This path helps to rebuild the new GEP index.
  SmallVector<User *, 8> UserChain;

  /// A data structure used in rebuildWithoutConstOffset. Contains all
  /// sext/zext instructions along UserChain.
  SmallVector<CastInst *, 16> ExtInsts;

  /// Insertion position of cloned instructions.
  Instruction *IP;

  const DataLayout &DL;
  const DominatorTree *DT;
};

/// A pass that tries to split every GEP in the function into a variadic
/// base and a constant offset. It is a FunctionPass because searching for the
/// constant offset may inspect other basic blocks.
class SeparateConstOffsetFromGEPLegacyPass : public FunctionPass {
public:
  static char ID;

  SeparateConstOffsetFromGEPLegacyPass(bool LowerGEP = false)
      : FunctionPass(ID), LowerGEP(LowerGEP) {
    initializeSeparateConstOffsetFromGEPLegacyPassPass(
        *PassRegistry::getPassRegistry());
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.addRequired<LoopInfoWrapperPass>();
    AU.setPreservesCFG();
    AU.addRequired<TargetLibraryInfoWrapperPass>();
  }

  bool runOnFunction(Function &F) override;

private:
  bool LowerGEP;
};

/// The implementation shared by the legacy and new pass managers. It tries to
/// split every GEP in the function into a variadic base and a constant
/// offset; searching for the constant offset may inspect other basic blocks.
class SeparateConstOffsetFromGEP {
public:
  SeparateConstOffsetFromGEP(
      DominatorTree *DT, ScalarEvolution *SE, LoopInfo *LI,
      TargetLibraryInfo *TLI,
      function_ref<TargetTransformInfo &(Function &)> GetTTI, bool LowerGEP)
      : DT(DT), SE(SE), LI(LI), TLI(TLI), GetTTI(GetTTI), LowerGEP(LowerGEP) {}

  bool run(Function &F);

private:
  /// Tries to split the given GEP into a variadic base and a constant offset,
  /// and returns true if the splitting succeeds.
  bool splitGEP(GetElementPtrInst *GEP);

  /// Lower a GEP with multiple indices into multiple GEPs with a single index.
  /// Function splitGEP already split the original GEP into a variadic part and
  /// a constant offset (i.e., AccumulativeByteOffset). This function lowers
  /// the variadic part into a set of GEPs with a single index and applies
  /// AccumulativeByteOffset to it.
  /// \p Variadic               The variadic part of the original GEP.
  /// \p AccumulativeByteOffset The constant offset.
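  ///
  /// For instance (an illustrative sketch; the actual output scales indices
  /// with shl/mul instructions), the variadic part
  ///   gep [10 x float]* %p, i64 %i, i64 %j
  /// with AccumulativeByteOffset 4 becomes roughly
  ///   %0 = bitcast [10 x float]* %p to i8*
  ///   %1 = gep i8* %0, i64 (%i * 40)
  ///   %2 = gep i8* %1, i64 (%j * 4)
  ///   %3 = gep i8* %2, i64 4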
  void lowerToSingleIndexGEPs(GetElementPtrInst *Variadic,
                              int64_t AccumulativeByteOffset);

  /// Lower a GEP with multiple indices into ptrtoint+arithmetic+inttoptr form.
  /// Function splitGEP already split the original GEP into a variadic part and
  /// a constant offset (i.e., AccumulativeByteOffset). This function lowers
  /// the variadic part into a set of arithmetic operations and applies
  /// AccumulativeByteOffset to it.
  /// \p Variadic               The variadic part of the original GEP.
  /// \p AccumulativeByteOffset The constant offset.
  void lowerToArithmetics(GetElementPtrInst *Variadic,
                          int64_t AccumulativeByteOffset);

  /// Finds the constant offset within each index and accumulates them. If
  /// LowerGEP is true, it finds in indices of both sequential and structure
  /// types, otherwise it only finds in sequential indices. The output
  /// NeedsExtraction indicates whether we successfully found a non-zero
  /// constant offset.
  int64_t accumulateByteOffset(GetElementPtrInst *GEP, bool &NeedsExtraction);

  /// Canonicalize array indices to pointer-size integers. This helps to
  /// simplify the logic of splitting a GEP. For example, if a + b is a
  /// pointer-size integer, we have
  ///   gep base, a + b = gep (gep base, a), b
  /// However, this equality may not hold if the size of a + b is smaller than
  /// the pointer size, because LLVM conceptually sign-extends GEP indices to
  /// pointer size before computing the address
  /// (http://llvm.org/docs/LangRef.html#id181).
  ///
  /// This canonicalization is very likely already done in clang and
  /// instcombine. Therefore, the program will probably remain the same.
  ///
  /// Returns true if the module changes.
  ///
  /// Verified in @i32_add in split-gep.ll
  bool canonicalizeArrayIndicesToPointerSize(GetElementPtrInst *GEP);

  /// Optimize sext(a)+sext(b) to sext(a+b) when a+b can't sign overflow.
  /// SeparateConstOffsetFromGEP distributes a sext to leaves before extracting
  /// the constant offset. After extraction, it becomes desirable to reunite
  /// the distributed sexts. For example,
  ///
  ///                              &a[sext(i +nsw (j +nsw 5))]
  ///   => distribute              &a[sext(i) +nsw (sext(j) +nsw 5)]
  ///   => constant extraction     &a[sext(i) + sext(j)] + 5
  ///   => reunion                 &a[sext(i +nsw j)] + 5
  bool reuniteExts(Function &F);

  /// A helper that reunites sexts in an instruction.
  bool reuniteExts(Instruction *I);

  /// Find the closest dominator of <Dominatee> that is equivalent to <Key>.
  Instruction *findClosestMatchingDominator(
      const SCEV *Key, Instruction *Dominatee,
      DenseMap<const SCEV *, SmallVector<Instruction *, 2>> &DominatingExprs);

  /// Verify F is free of dead code.
  void verifyNoDeadCode(Function &F);

  bool hasMoreThanOneUseInLoop(Value *V, Loop *L);

  // Swap the index operands of two GEPs.
  void swapGEPOperand(GetElementPtrInst *First, GetElementPtrInst *Second);

  // Check if it is safe to swap the operands of two GEPs.
  bool isLegalToSwapOperand(GetElementPtrInst *First, GetElementPtrInst *Second,
                            Loop *CurLoop);

  const DataLayout *DL = nullptr;
  DominatorTree *DT = nullptr;
  ScalarEvolution *SE;
  LoopInfo *LI;
  TargetLibraryInfo *TLI;
  // Retrieved lazily since it is not always used.
  function_ref<TargetTransformInfo &(Function &)> GetTTI;

  /// Whether to lower a GEP with multiple indices into arithmetic operations
  /// or multiple GEPs with a single index.
  bool LowerGEP;

  DenseMap<const SCEV *, SmallVector<Instruction *, 2>> DominatingAdds;
  DenseMap<const SCEV *, SmallVector<Instruction *, 2>> DominatingSubs;
};

} // end anonymous namespace

char SeparateConstOffsetFromGEPLegacyPass::ID = 0;

INITIALIZE_PASS_BEGIN(
    SeparateConstOffsetFromGEPLegacyPass, "separate-const-offset-from-gep",
    "Split GEPs to a variadic base and a constant offset for better CSE", false,
    false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_END(
    SeparateConstOffsetFromGEPLegacyPass, "separate-const-offset-from-gep",
    "Split GEPs to a variadic base and a constant offset for better CSE", false,
    false)

FunctionPass *llvm::createSeparateConstOffsetFromGEPPass(bool LowerGEP) {
  return new SeparateConstOffsetFromGEPLegacyPass(LowerGEP);
}

bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended,
                                           bool ZeroExtended,
                                           BinaryOperator *BO,
                                           bool NonNegative) {
  // We only consider ADD, SUB and OR, because a non-zero constant found in
  // expressions composed of these operations can be easily hoisted as a
  // constant offset by reassociation.
  if (BO->getOpcode() != Instruction::Add &&
      BO->getOpcode() != Instruction::Sub &&
      BO->getOpcode() != Instruction::Or) {
    return false;
  }

  Value *LHS = BO->getOperand(0), *RHS = BO->getOperand(1);
  // Do not trace into "or" unless it is equivalent to "add". If LHS and RHS
  // don't have common bits, (LHS | RHS) is equivalent to (LHS + RHS).
  // FIXME: this does not appear to be covered by any tests
  //        (with x86/aarch64 backends at least)
  if (BO->getOpcode() == Instruction::Or &&
      !haveNoCommonBitsSet(LHS, RHS, DL, nullptr, BO, DT))
    return false;

  // In addition, tracing into BO requires that its surrounding s/zext (if
  // any) is distributable to both operands.
  //
  // Suppose BO = A op B.
  //  SignExtended | ZeroExtended | Distributable?
  // --------------+--------------+----------------------------------
  //       0       |      0       | true because no s/zext exists
  //       0       |      1       | zext(BO) == zext(A) op zext(B)
  //       1       |      0       | sext(BO) == sext(A) op sext(B)
  //       1       |      1       | zext(sext(BO)) ==
  //               |              |     zext(sext(A)) op zext(sext(B))
  if (BO->getOpcode() == Instruction::Add && !ZeroExtended && NonNegative) {
    // If a + b >= 0 and (a >= 0 or b >= 0), then
    //   sext(a + b) = sext(a) + sext(b)
    // even if the addition is not marked nsw.
    //
    // Leveraging this invariant, we can trace into an sext'ed inbounds GEP
    // index if the constant offset is non-negative.
    //
    // Verified in @sext_add in split-gep.ll.
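    //
    // For instance (illustrative): for an inbounds GEP index sext(a + 5),
    // the sum a + 5 is known non-negative and the constant 5 is
    // non-negative, so sext(a + 5) == sext(a) + 5 and the 5 can be
    // extracted even without nsw.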
    if (ConstantInt *ConstLHS = dyn_cast<ConstantInt>(LHS)) {
      if (!ConstLHS->isNegative())
        return true;
    }
    if (ConstantInt *ConstRHS = dyn_cast<ConstantInt>(RHS)) {
      if (!ConstRHS->isNegative())
        return true;
    }
  }

  // sext (add/sub nsw A, B) == add/sub nsw (sext A), (sext B)
  // zext (add/sub nuw A, B) == add/sub nuw (zext A), (zext B)
  if (BO->getOpcode() == Instruction::Add ||
      BO->getOpcode() == Instruction::Sub) {
    if (SignExtended && !BO->hasNoSignedWrap())
      return false;
    if (ZeroExtended && !BO->hasNoUnsignedWrap())
      return false;
  }

  return true;
}

APInt ConstantOffsetExtractor::findInEitherOperand(BinaryOperator *BO,
                                                   bool SignExtended,
                                                   bool ZeroExtended) {
  // Save off the current height of the chain, in case we need to restore it.
  size_t ChainLength = UserChain.size();

  // BO being non-negative does not shed light on whether its operands are
  // non-negative. Clear the NonNegative flag here.
  APInt ConstantOffset = find(BO->getOperand(0), SignExtended, ZeroExtended,
                              /* NonNegative */ false);
  // If we found a constant offset in the left operand, stop and return that.
  // This shortcut might cause us to miss opportunities to combine the
  // constant offsets in both operands, e.g., (a + 4) + (b + 5) => (a + b) + 9.
  // However, such cases are probably already handled by -instcombine,
  // given this pass runs after the standard optimizations.
  if (ConstantOffset != 0) return ConstantOffset;

  // Reset the chain back to where it was when we started exploring this node,
  // since visiting the LHS didn't pan out.
  UserChain.resize(ChainLength);

  ConstantOffset = find(BO->getOperand(1), SignExtended, ZeroExtended,
                        /* NonNegative */ false);
  // If BO is a sub operator, negate the constant offset found in the right
  // operand.
  if (BO->getOpcode() == Instruction::Sub)
    ConstantOffset = -ConstantOffset;

  // If RHS wasn't a suitable candidate either, reset the chain again.
  if (ConstantOffset == 0)
    UserChain.resize(ChainLength);

  return ConstantOffset;
}

APInt ConstantOffsetExtractor::find(Value *V, bool SignExtended,
                                    bool ZeroExtended, bool NonNegative) {
  // TODO(jingyue): We could trace into integer/pointer casts, such as
  // inttoptr, ptrtoint, bitcast, and addrspacecast. We choose to handle only
  // integers because it gives good enough results for our benchmarks.
  unsigned BitWidth = cast<IntegerType>(V->getType())->getBitWidth();

  // We cannot do much with Values that are not a User, such as an Argument.
  User *U = dyn_cast<User>(V);
  if (U == nullptr) return APInt(BitWidth, 0);

  APInt ConstantOffset(BitWidth, 0);
  if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
    // Hooray, we found it!
    ConstantOffset = CI->getValue();
  } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(V)) {
    // Trace into subexpressions for more hoisting opportunities.
    if (CanTraceInto(SignExtended, ZeroExtended, BO, NonNegative))
      ConstantOffset = findInEitherOperand(BO, SignExtended, ZeroExtended);
  } else if (isa<TruncInst>(V)) {
    ConstantOffset =
        find(U->getOperand(0), SignExtended, ZeroExtended, NonNegative)
            .trunc(BitWidth);
  } else if (isa<SExtInst>(V)) {
    ConstantOffset = find(U->getOperand(0), /* SignExtended */ true,
                          ZeroExtended, NonNegative).sext(BitWidth);
  } else if (isa<ZExtInst>(V)) {
    // As an optimization, we can clear the SignExtended flag because
    // sext(zext(a)) = zext(a). Verified in @sext_zext in split-gep.ll.
    //
    // Clear the NonNegative flag, because zext(a) >= 0 does not imply a >= 0.
    ConstantOffset =
        find(U->getOperand(0), /* SignExtended */ false,
             /* ZeroExtended */ true, /* NonNegative */ false).zext(BitWidth);
  }

  // If we found a non-zero constant offset, add it to the path for
  // rebuildWithoutConstOffset. Zero is a valid constant offset, but doesn't
  // help this optimization.
  if (ConstantOffset != 0)
    UserChain.push_back(U);
  return ConstantOffset;
}

Value *ConstantOffsetExtractor::applyExts(Value *V) {
  Value *Current = V;
  // ExtInsts is built in the use-def order. Therefore, we apply them to V
  // in reverse order.
  for (CastInst *I : llvm::reverse(ExtInsts)) {
    if (Constant *C = dyn_cast<Constant>(Current)) {
      // If Current is a constant, apply s/zext using ConstantExpr::getCast.
      // ConstantExpr::getCast emits a ConstantInt if C is a ConstantInt.
      Current = ConstantExpr::getCast(I->getOpcode(), C, I->getType());
    } else {
      Instruction *Ext = I->clone();
      Ext->setOperand(0, Current);
      Ext->insertBefore(IP);
      Current = Ext;
    }
  }
  return Current;
}

Value *ConstantOffsetExtractor::rebuildWithoutConstOffset() {
  distributeExtsAndCloneChain(UserChain.size() - 1);
  // Remove all nullptrs (which used to be s/zexts) from UserChain.
  unsigned NewSize = 0;
  for (User *I : UserChain) {
    if (I != nullptr) {
      UserChain[NewSize] = I;
      NewSize++;
    }
  }
  UserChain.resize(NewSize);
  return removeConstOffset(UserChain.size() - 1);
}

Value *
ConstantOffsetExtractor::distributeExtsAndCloneChain(unsigned ChainIndex) {
  User *U = UserChain[ChainIndex];
  if (ChainIndex == 0) {
    assert(isa<ConstantInt>(U));
    // If U is a ConstantInt, applyExts will return a ConstantInt as well.
    return UserChain[ChainIndex] = cast<ConstantInt>(applyExts(U));
  }

  if (CastInst *Cast = dyn_cast<CastInst>(U)) {
    assert(
        (isa<SExtInst>(Cast) || isa<ZExtInst>(Cast) || isa<TruncInst>(Cast)) &&
        "Only the following instructions can be traced: sext, zext & trunc");
    ExtInsts.push_back(Cast);
    UserChain[ChainIndex] = nullptr;
    return distributeExtsAndCloneChain(ChainIndex - 1);
  }

  // Function find only traces into BinaryOperators and CastInsts.
  BinaryOperator *BO = cast<BinaryOperator>(U);
  // OpNo = which operand of BO is UserChain[ChainIndex - 1]
  unsigned OpNo = (BO->getOperand(0) == UserChain[ChainIndex - 1] ? 0 : 1);
  Value *TheOther = applyExts(BO->getOperand(1 - OpNo));
  Value *NextInChain = distributeExtsAndCloneChain(ChainIndex - 1);

  BinaryOperator *NewBO = nullptr;
  if (OpNo == 0) {
    NewBO = BinaryOperator::Create(BO->getOpcode(), NextInChain, TheOther,
                                   BO->getName(), IP);
  } else {
    NewBO = BinaryOperator::Create(BO->getOpcode(), TheOther, NextInChain,
                                   BO->getName(), IP);
  }
  return UserChain[ChainIndex] = NewBO;
}

Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
  if (ChainIndex == 0) {
    assert(isa<ConstantInt>(UserChain[ChainIndex]));
    return ConstantInt::getNullValue(UserChain[ChainIndex]->getType());
  }

  BinaryOperator *BO = cast<BinaryOperator>(UserChain[ChainIndex]);
  assert((BO->use_empty() || BO->hasOneUse()) &&
         "distributeExtsAndCloneChain clones each BinaryOperator in "
         "UserChain, so no one should be used more than "
         "once");

  unsigned OpNo = (BO->getOperand(0) == UserChain[ChainIndex - 1] ? 0 : 1);
  assert(BO->getOperand(OpNo) == UserChain[ChainIndex - 1]);
  Value *NextInChain = removeConstOffset(ChainIndex - 1);
  Value *TheOther = BO->getOperand(1 - OpNo);

  // If NextInChain is 0 and not the LHS of a sub, we can simplify the
  // subexpression to be just TheOther.
  if (ConstantInt *CI = dyn_cast<ConstantInt>(NextInChain)) {
    if (CI->isZero() && !(BO->getOpcode() == Instruction::Sub && OpNo == 0))
      return TheOther;
  }

  BinaryOperator::BinaryOps NewOp = BO->getOpcode();
  if (BO->getOpcode() == Instruction::Or) {
    // Rebuild "or" as "add", because "or" may be invalid for the new
    // expression.
    //
    // For instance, given
    //   a | (b + 5) where a and b + 5 have no common bits,
    // we can extract 5 as the constant offset.
    //
    // However, reusing the "or" in the new index would give us
    //   (a | b) + 5
    // which does not equal a | (b + 5).
    //
    // Replacing the "or" with "add" is fine, because
    //   a | (b + 5) = a + (b + 5) = (a + b) + 5
    NewOp = Instruction::Add;
  }

  BinaryOperator *NewBO;
  if (OpNo == 0) {
    NewBO = BinaryOperator::Create(NewOp, NextInChain, TheOther, "", IP);
  } else {
    NewBO = BinaryOperator::Create(NewOp, TheOther, NextInChain, "", IP);
  }
  NewBO->takeName(BO);
  return NewBO;
}

Value *ConstantOffsetExtractor::Extract(Value *Idx, GetElementPtrInst *GEP,
                                        User *&UserChainTail,
                                        const DominatorTree *DT) {
  ConstantOffsetExtractor Extractor(GEP, DT);
  // Find a non-zero constant offset first.
  APInt ConstantOffset =
      Extractor.find(Idx, /* SignExtended */ false, /* ZeroExtended */ false,
                     GEP->isInBounds());
  if (ConstantOffset == 0) {
    UserChainTail = nullptr;
    return nullptr;
  }
  // Separate the constant offset from the GEP index.
  Value *IdxWithoutConstOffset = Extractor.rebuildWithoutConstOffset();
  UserChainTail = Extractor.UserChain.back();
  return IdxWithoutConstOffset;
}

int64_t ConstantOffsetExtractor::Find(Value *Idx, GetElementPtrInst *GEP,
                                      const DominatorTree *DT) {
  // If Idx is an index of an inbounds GEP, Idx is guaranteed to be
  // non-negative.
  return ConstantOffsetExtractor(GEP, DT)
      .find(Idx, /* SignExtended */ false, /* ZeroExtended */ false,
            GEP->isInBounds())
      .getSExtValue();
}

bool SeparateConstOffsetFromGEP::canonicalizeArrayIndicesToPointerSize(
    GetElementPtrInst *GEP) {
  bool Changed = false;
  Type *IntPtrTy = DL->getIntPtrType(GEP->getType());
  gep_type_iterator GTI = gep_type_begin(*GEP);
  for (User::op_iterator I = GEP->op_begin() + 1, E = GEP->op_end();
       I != E; ++I, ++GTI) {
    // Skip struct member indices, which must be i32.
    if (GTI.isSequential()) {
      if ((*I)->getType() != IntPtrTy) {
        *I = CastInst::CreateIntegerCast(*I, IntPtrTy, true, "idxprom", GEP);
        Changed = true;
      }
    }
  }
  return Changed;
}

int64_t
SeparateConstOffsetFromGEP::accumulateByteOffset(GetElementPtrInst *GEP,
                                                 bool &NeedsExtraction) {
  NeedsExtraction = false;
  int64_t AccumulativeByteOffset = 0;
  gep_type_iterator GTI = gep_type_begin(*GEP);
  for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
    if (GTI.isSequential()) {
      // Try to extract a constant offset from this GEP index.
      int64_t ConstantOffset =
          ConstantOffsetExtractor::Find(GEP->getOperand(I), GEP, DT);
      if (ConstantOffset != 0) {
        NeedsExtraction = true;
        // A GEP may have multiple indices. We accumulate the extracted
        // constant offset into a byte offset, and later offset the remainder
        // of the original GEP with this byte offset.
        AccumulativeByteOffset +=
            ConstantOffset * DL->getTypeAllocSize(GTI.getIndexedType());
      }
    } else if (LowerGEP) {
      StructType *StTy = GTI.getStructType();
      uint64_t Field = cast<ConstantInt>(GEP->getOperand(I))->getZExtValue();
      // Skip field 0 as the offset is always 0.
      if (Field != 0) {
        NeedsExtraction = true;
        AccumulativeByteOffset +=
            DL->getStructLayout(StTy)->getElementOffset(Field);
      }
    }
  }
  return AccumulativeByteOffset;
}

void SeparateConstOffsetFromGEP::lowerToSingleIndexGEPs(
    GetElementPtrInst *Variadic, int64_t AccumulativeByteOffset) {
  IRBuilder<> Builder(Variadic);
  Type *IntPtrTy = DL->getIntPtrType(Variadic->getType());

  Type *I8PtrTy =
      Builder.getInt8PtrTy(Variadic->getType()->getPointerAddressSpace());
  Value *ResultPtr = Variadic->getOperand(0);
  Loop *L = LI->getLoopFor(Variadic->getParent());
  // The base is a candidate for the GEP swap below only if it is loop
  // invariant and has no more than one use inside the loop.
  bool isSwapCandidate =
      L && L->isLoopInvariant(ResultPtr) &&
      !hasMoreThanOneUseInLoop(ResultPtr, L);
  Value *FirstResult = nullptr;

  if (ResultPtr->getType() != I8PtrTy)
    ResultPtr = Builder.CreateBitCast(ResultPtr, I8PtrTy);

  gep_type_iterator GTI = gep_type_begin(*Variadic);
  // Create an ugly GEP for each sequential index. We don't create GEPs for
  // structure indices, as they are accumulated in the constant offset index.
  for (unsigned I = 1, E = Variadic->getNumOperands(); I != E; ++I, ++GTI) {
    if (GTI.isSequential()) {
      Value *Idx = Variadic->getOperand(I);
      // Skip zero indices.
      if (ConstantInt *CI = dyn_cast<ConstantInt>(Idx))
        if (CI->isZero())
          continue;

      APInt ElementSize = APInt(IntPtrTy->getIntegerBitWidth(),
                                DL->getTypeAllocSize(GTI.getIndexedType()));
      // Scale the index by the element size.
      if (ElementSize != 1) {
        if (ElementSize.isPowerOf2()) {
          Idx = Builder.CreateShl(
              Idx, ConstantInt::get(IntPtrTy, ElementSize.logBase2()));
        } else {
          Idx = Builder.CreateMul(Idx, ConstantInt::get(IntPtrTy, ElementSize));
        }
      }
      // Create an ugly GEP with a single index for each index.
      ResultPtr =
          Builder.CreateGEP(Builder.getInt8Ty(), ResultPtr, Idx, "uglygep");
      if (FirstResult == nullptr)
        FirstResult = ResultPtr;
    }
  }

  // Create a GEP with the constant offset index.
  if (AccumulativeByteOffset != 0) {
    Value *Offset = ConstantInt::get(IntPtrTy, AccumulativeByteOffset);
    ResultPtr =
        Builder.CreateGEP(Builder.getInt8Ty(), ResultPtr, Offset, "uglygep");
  } else
    isSwapCandidate = false;

  // If we created a GEP with a constant index, and the base is loop
  // invariant, swap the first GEP with it, so LICM can move the constant
  // GEP out of the loop later.
  auto *FirstGEP = dyn_cast_or_null<GetElementPtrInst>(FirstResult);
  auto *SecondGEP = dyn_cast<GetElementPtrInst>(ResultPtr);
  if (isSwapCandidate && isLegalToSwapOperand(FirstGEP, SecondGEP, L))
    swapGEPOperand(FirstGEP, SecondGEP);

  if (ResultPtr->getType() != Variadic->getType())
    ResultPtr = Builder.CreateBitCast(ResultPtr, Variadic->getType());

  Variadic->replaceAllUsesWith(ResultPtr);
  Variadic->eraseFromParent();
}

void
SeparateConstOffsetFromGEP::lowerToArithmetics(GetElementPtrInst *Variadic,
                                               int64_t AccumulativeByteOffset) {
  IRBuilder<> Builder(Variadic);
  Type *IntPtrTy = DL->getIntPtrType(Variadic->getType());

  Value *ResultPtr = Builder.CreatePtrToInt(Variadic->getOperand(0), IntPtrTy);
  gep_type_iterator GTI = gep_type_begin(*Variadic);
  // Create ADD/SHL/MUL arithmetic operations for each sequential index. We
  // don't create arithmetic for structure indices, as they are accumulated
  // in the constant offset index.
  for (unsigned I = 1, E = Variadic->getNumOperands(); I != E; ++I, ++GTI) {
    if (GTI.isSequential()) {
      Value *Idx = Variadic->getOperand(I);
      // Skip zero indices.
      if (ConstantInt *CI = dyn_cast<ConstantInt>(Idx))
        if (CI->isZero())
          continue;

      APInt ElementSize = APInt(IntPtrTy->getIntegerBitWidth(),
                                DL->getTypeAllocSize(GTI.getIndexedType()));
      // Scale the index by the element size.
      if (ElementSize != 1) {
        if (ElementSize.isPowerOf2()) {
          Idx = Builder.CreateShl(
              Idx, ConstantInt::get(IntPtrTy, ElementSize.logBase2()));
        } else {
          Idx = Builder.CreateMul(Idx, ConstantInt::get(IntPtrTy, ElementSize));
        }
      }
      // Create an ADD for each index.
      ResultPtr = Builder.CreateAdd(ResultPtr, Idx);
    }
  }

  // Create an ADD for the constant offset index.
  if (AccumulativeByteOffset != 0) {
    ResultPtr = Builder.CreateAdd(
        ResultPtr, ConstantInt::get(IntPtrTy, AccumulativeByteOffset));
  }

  ResultPtr = Builder.CreateIntToPtr(ResultPtr, Variadic->getType());
  Variadic->replaceAllUsesWith(ResultPtr);
  Variadic->eraseFromParent();
}

bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
  // Skip vector GEPs.
  if (GEP->getType()->isVectorTy())
    return false;

  // The backend can already nicely handle the case where all indices are
  // constant.
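  // (Such an offset folds straight into the reg+immediate addressing mode
  // that most targets support, so there is nothing useful to separate.)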
  if (GEP->hasAllConstantIndices())
    return false;

  bool Changed = canonicalizeArrayIndicesToPointerSize(GEP);

  bool NeedsExtraction;
  int64_t AccumulativeByteOffset = accumulateByteOffset(GEP, NeedsExtraction);

  if (!NeedsExtraction)
    return Changed;

  TargetTransformInfo &TTI = GetTTI(*GEP->getFunction());

  // If LowerGEP is disabled, before actually splitting the GEP, check whether
  // the backend supports the addressing mode we are about to produce. If not,
  // this splitting probably won't be beneficial.
  // If LowerGEP is enabled, even if the extracted constant offset cannot match
  // the addressing mode, we can still do optimizations on the other lowered
  // parts of the variable indices. Therefore, we don't check for addressing
  // modes in that case.
  if (!LowerGEP) {
    unsigned AddrSpace = GEP->getPointerAddressSpace();
    if (!TTI.isLegalAddressingMode(GEP->getResultElementType(),
                                   /*BaseGV=*/nullptr, AccumulativeByteOffset,
                                   /*HasBaseReg=*/true, /*Scale=*/0,
                                   AddrSpace)) {
      return Changed;
    }
  }

  // Remove the constant offset in each sequential index. The resultant GEP
  // computes the variadic base.
  // Notice that we don't remove struct field indices here. If LowerGEP is
  // disabled, a structure index is not accumulated and we still use the old
  // one. If LowerGEP is enabled, a structure index is accumulated in the
  // constant offset. lowerToSingleIndexGEPs or lowerToArithmetics will later
  // handle the constant offset and won't need a new structure index.
  gep_type_iterator GTI = gep_type_begin(*GEP);
  for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
    if (GTI.isSequential()) {
      // Split this GEP index into a variadic part and a constant offset, and
      // use the variadic part as the new index.
      Value *OldIdx = GEP->getOperand(I);
      User *UserChainTail;
      Value *NewIdx =
          ConstantOffsetExtractor::Extract(OldIdx, GEP, UserChainTail, DT);
      if (NewIdx != nullptr) {
        // Switch to the index with the constant offset removed.
        GEP->setOperand(I, NewIdx);
        // After switching to the new index, we can garbage-collect UserChain
        // and the old index if they are not used.
        RecursivelyDeleteTriviallyDeadInstructions(UserChainTail);
        RecursivelyDeleteTriviallyDeadInstructions(OldIdx);
      }
    }
  }

  // Clear the inbounds attribute because the new index may be out of bounds.
  // e.g.,
  //
  //   b     = add i64 a, 5
  //   addr  = gep inbounds float, float* p, i64 b
  //
  // is transformed to:
  //
  //   addr2 = gep float, float* p, i64 a ; inbounds removed
  //   addr  = gep inbounds float, float* addr2, i64 5
  //
  // If a is -4, although the old index b is in bounds, the new index a is
  // out of bounds. http://llvm.org/docs/LangRef.html#id181 says "if the
  // inbounds keyword is not present, the offsets are added to the base
  // address with silently-wrapping two's complement arithmetic".
  // Therefore, the final code will be semantically equivalent.
  //
  // TODO(jingyue): do some range analysis to keep as many inbounds as
  // possible. GEPs with inbounds are more friendly to alias analysis.
  bool GEPWasInBounds = GEP->isInBounds();
  GEP->setIsInBounds(false);

  // Lower a GEP to either GEPs with a single index or arithmetic operations.
  if (LowerGEP) {
    // Since BasicAA currently does not analyze ptrtoint/inttoptr, do not
    // lower to arithmetic operations if the target uses alias analysis in
    // codegen.
    if (TTI.useAA())
      lowerToSingleIndexGEPs(GEP, AccumulativeByteOffset);
    else
      lowerToArithmetics(GEP, AccumulativeByteOffset);
    return true;
  }

  // No need to create another GEP if the accumulative byte offset is 0.
  if (AccumulativeByteOffset == 0)
    return true;

  // Offset the base with the accumulative byte offset.
  //
  //   %gep                        ; the base
  //   ... %gep ...
  //
  // => add the offset
  //
  //   %gep2                       ; clone of %gep
  //   %new.gep = gep %gep2, <offset / sizeof(*%gep)>
  //   %gep                        ; will be removed
  //   ... %gep ...
  //
  // => replace all uses of %gep with %new.gep and remove %gep
  //
  //   %gep2                       ; clone of %gep
  //   %new.gep = gep %gep2, <offset / sizeof(*%gep)>
  //   ... %new.gep ...
  //
  // If AccumulativeByteOffset is not a multiple of sizeof(*%gep), we emit an
  // uglygep (http://llvm.org/docs/GetElementPtr.html#what-s-an-uglygep):
  // bitcast %gep2 to i8*, add the offset, and bitcast the result back to the
  // type of %gep.
  //
  //   %gep2                       ; clone of %gep
  //   %0       = bitcast %gep2 to i8*
  //   %uglygep = gep %0, <offset>
  //   %new.gep = bitcast %uglygep to <type of %gep>
  //   ... %new.gep ...
  Instruction *NewGEP = GEP->clone();
  NewGEP->insertBefore(GEP);

  // Per the ANSI C standard, signed / unsigned = unsigned and signed %
  // unsigned = unsigned. Therefore, we cast ElementTypeSizeOfGEP to a signed
  // type so that the division and modulo below are done in signed arithmetic.
  int64_t ElementTypeSizeOfGEP = static_cast<int64_t>(
      DL->getTypeAllocSize(GEP->getResultElementType()));
  Type *IntPtrTy = DL->getIntPtrType(GEP->getType());
  if (AccumulativeByteOffset % ElementTypeSizeOfGEP == 0) {
    // Very likely. As long as %gep is naturally aligned, the byte offset we
    // extracted should be a multiple of sizeof(*%gep).
    int64_t Index = AccumulativeByteOffset / ElementTypeSizeOfGEP;
    NewGEP = GetElementPtrInst::Create(GEP->getResultElementType(), NewGEP,
                                       ConstantInt::get(IntPtrTy, Index, true),
                                       GEP->getName(), GEP);
    NewGEP->copyMetadata(*GEP);
    // Inherit the inbounds attribute of the original GEP.
    cast<GetElementPtrInst>(NewGEP)->setIsInBounds(GEPWasInBounds);
  } else {
    // Unlikely but possible. For example,
    //   #pragma pack(1)
    //   struct S {
    //     int a[3];
    //     int64 b[8];
    //   };
    //   #pragma pack()
    //
    // Suppose the gep before extraction is &s[i + 1].b[j + 3]. After
    // extraction, it becomes &s[i].b[j] and AccumulativeByteOffset is
    // sizeof(S) + 3 * sizeof(int64) = 100, which is not a multiple of
    // sizeof(int64).
    //
    // Emit an uglygep in this case.
    Type *I8PtrTy = Type::getInt8PtrTy(GEP->getContext(),
                                       GEP->getPointerAddressSpace());
    NewGEP = new BitCastInst(NewGEP, I8PtrTy, "", GEP);
    NewGEP = GetElementPtrInst::Create(
        Type::getInt8Ty(GEP->getContext()), NewGEP,
        ConstantInt::get(IntPtrTy, AccumulativeByteOffset, true), "uglygep",
        GEP);
    NewGEP->copyMetadata(*GEP);
    // Inherit the inbounds attribute of the original GEP.
    cast<GetElementPtrInst>(NewGEP)->setIsInBounds(GEPWasInBounds);
    if (GEP->getType() != I8PtrTy)
      NewGEP = new BitCastInst(NewGEP, GEP->getType(), GEP->getName(), GEP);
  }

  GEP->replaceAllUsesWith(NewGEP);
  GEP->eraseFromParent();

  return true;
}

bool SeparateConstOffsetFromGEPLegacyPass::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;
  auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
  auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
  auto GetTTI = [this](Function &F) -> TargetTransformInfo & {
    return this->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
  };
  SeparateConstOffsetFromGEP Impl(DT, SE, LI, TLI, GetTTI, LowerGEP);
  return Impl.run(F);
}

bool SeparateConstOffsetFromGEP::run(Function &F) {
  if (DisableSeparateConstOffsetFromGEP)
    return false;

  DL = &F.getParent()->getDataLayout();
  bool Changed = false;
  for (BasicBlock &B : F) {
    if (!DT->isReachableFromEntry(&B))
      continue;

    for (Instruction &I : llvm::make_early_inc_range(B))
      if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I))
        Changed |= splitGEP(GEP);
    // No need to split GEP ConstantExprs because all their indices are
    // constant already.
  }

  Changed |= reuniteExts(F);

  if (VerifyNoDeadCode)
    verifyNoDeadCode(F);

  return Changed;
}

Instruction *SeparateConstOffsetFromGEP::findClosestMatchingDominator(
    const SCEV *Key, Instruction *Dominatee,
    DenseMap<const SCEV *, SmallVector<Instruction *, 2>> &DominatingExprs) {
  auto Pos = DominatingExprs.find(Key);
  if (Pos == DominatingExprs.end())
    return nullptr;

  auto &Candidates = Pos->second;
  // Because we process the basic blocks in pre-order of the dominator tree, a
  // candidate that doesn't dominate the current instruction won't dominate any
  // future instruction either. Therefore, we pop it out of the stack. This
  // optimization makes the algorithm O(n).
  while (!Candidates.empty()) {
    Instruction *Candidate = Candidates.back();
    if (DT->dominates(Candidate, Dominatee))
      return Candidate;
    Candidates.pop_back();
  }
  return nullptr;
}

bool SeparateConstOffsetFromGEP::reuniteExts(Instruction *I) {
  if (!SE->isSCEVable(I->getType()))
    return false;

  // Dom: LHS+RHS
  // I: sext(LHS)+sext(RHS)
  // If Dom can't sign overflow and Dom dominates I, optimize I to sext(Dom).
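  //
  // For example (illustrative):
  //   %dom = add nsw i32 %a, %b
  //   ...
  //   %s1  = sext i32 %a to i64
  //   %s2  = sext i32 %b to i64
  //   %i   = add i64 %s1, %s2
  // becomes "%i = sext i32 %dom to i64", reusing the dominating add.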
  // TODO: handle zext
  Value *LHS = nullptr, *RHS = nullptr;
  if (match(I, m_Add(m_SExt(m_Value(LHS)), m_SExt(m_Value(RHS))))) {
    if (LHS->getType() == RHS->getType()) {
      const SCEV *Key =
          SE->getAddExpr(SE->getUnknown(LHS), SE->getUnknown(RHS));
      if (auto *Dom = findClosestMatchingDominator(Key, I, DominatingAdds)) {
        Instruction *NewSExt = new SExtInst(Dom, I->getType(), "", I);
        NewSExt->takeName(I);
        I->replaceAllUsesWith(NewSExt);
        RecursivelyDeleteTriviallyDeadInstructions(I);
        return true;
      }
    }
  } else if (match(I, m_Sub(m_SExt(m_Value(LHS)), m_SExt(m_Value(RHS))))) {
    if (LHS->getType() == RHS->getType()) {
      const SCEV *Key =
          SE->getAddExpr(SE->getUnknown(LHS), SE->getUnknown(RHS));
      if (auto *Dom = findClosestMatchingDominator(Key, I, DominatingSubs)) {
        Instruction *NewSExt = new SExtInst(Dom, I->getType(), "", I);
        NewSExt->takeName(I);
        I->replaceAllUsesWith(NewSExt);
        RecursivelyDeleteTriviallyDeadInstructions(I);
        return true;
      }
    }
  }

  // Add I to DominatingExprs if it's an add/sub that can't sign overflow.
  if (match(I, m_NSWAdd(m_Value(LHS), m_Value(RHS)))) {
    if (programUndefinedIfPoison(I)) {
      const SCEV *Key =
          SE->getAddExpr(SE->getUnknown(LHS), SE->getUnknown(RHS));
      DominatingAdds[Key].push_back(I);
    }
  } else if (match(I, m_NSWSub(m_Value(LHS), m_Value(RHS)))) {
    if (programUndefinedIfPoison(I)) {
      const SCEV *Key =
          SE->getAddExpr(SE->getUnknown(LHS), SE->getUnknown(RHS));
      DominatingSubs[Key].push_back(I);
    }
  }
  return false;
}

bool SeparateConstOffsetFromGEP::reuniteExts(Function &F) {
  bool Changed = false;
  DominatingAdds.clear();
  DominatingSubs.clear();
  for (const auto Node : depth_first(DT)) {
    BasicBlock *BB = Node->getBlock();
    for (Instruction &I : llvm::make_early_inc_range(*BB))
      Changed |= reuniteExts(&I);
  }
  return Changed;
}

void SeparateConstOffsetFromGEP::verifyNoDeadCode(Function &F) {
  for (BasicBlock &B : F) {
    for (Instruction &I : B) {
      if (isInstructionTriviallyDead(&I)) {
        std::string ErrMessage;
        raw_string_ostream RSO(ErrMessage);
        RSO << "Dead instruction detected!\n" << I << "\n";
        llvm_unreachable(RSO.str().c_str());
      }
    }
  }
}

bool SeparateConstOffsetFromGEP::isLegalToSwapOperand(
    GetElementPtrInst *FirstGEP, GetElementPtrInst *SecondGEP, Loop *CurLoop) {
  if (!FirstGEP || !FirstGEP->hasOneUse())
    return false;

  if (!SecondGEP || FirstGEP->getParent() != SecondGEP->getParent())
    return false;

  if (FirstGEP == SecondGEP)
    return false;

  unsigned FirstNum = FirstGEP->getNumOperands();
  unsigned SecondNum = SecondGEP->getNumOperands();
  // Give up if the number of operands is not 2.
  if (FirstNum != SecondNum || FirstNum != 2)
    return false;

  Value *FirstBase = FirstGEP->getOperand(0);
  Value *SecondBase = SecondGEP->getOperand(0);
  Value *FirstOffset = FirstGEP->getOperand(1);
  // Give up if the index of the first GEP is loop invariant.
  if (CurLoop->isLoopInvariant(FirstOffset))
    return false;

  // Give up if the bases don't have the same type.
  if (FirstBase->getType() != SecondBase->getType())
    return false;

  Instruction *FirstOffsetDef = dyn_cast<Instruction>(FirstOffset);

  // Check if the second operand of the first GEP has a constant coefficient.
  // For example, for the following code, we won't gain anything by hoisting
  // the second GEP out because the second GEP can be folded away.
  //   %scevgep.sum.ur159 = add i64 %idxprom48.ur, 256
  //   %67 = shl i64 %scevgep.sum.ur159, 2
  //   %uglygep160 = getelementptr i8* %65, i64 %67
  //   %uglygep161 = getelementptr i8* %uglygep160, i64 -1024

  // Skip a constant shift instruction, which may have been generated by
  // splitting GEPs.
  if (FirstOffsetDef && FirstOffsetDef->isShift() &&
      isa<ConstantInt>(FirstOffsetDef->getOperand(1)))
    FirstOffsetDef = dyn_cast<Instruction>(FirstOffsetDef->getOperand(0));

  // Give up if FirstOffsetDef is an Add or Sub with a constant, because the
  // swap may not be profitable at all due to constant folding.
  if (FirstOffsetDef)
    if (BinaryOperator *BO = dyn_cast<BinaryOperator>(FirstOffsetDef)) {
      unsigned opc = BO->getOpcode();
      if ((opc == Instruction::Add || opc == Instruction::Sub) &&
          (isa<ConstantInt>(BO->getOperand(0)) ||
           isa<ConstantInt>(BO->getOperand(1))))
        return false;
    }
  return true;
}

bool SeparateConstOffsetFromGEP::hasMoreThanOneUseInLoop(Value *V, Loop *L) {
  int UsesInLoop = 0;
  for (User *U : V->users()) {
    if (Instruction *User = dyn_cast<Instruction>(U))
      if (L->contains(User))
        if (++UsesInLoop > 1)
          return true;
  }
  return false;
}

void SeparateConstOffsetFromGEP::swapGEPOperand(GetElementPtrInst *First,
                                                GetElementPtrInst *Second) {
  Value *Offset1 = First->getOperand(1);
  Value *Offset2 = Second->getOperand(1);
  First->setOperand(1, Offset2);
  Second->setOperand(1, Offset1);

  // We changed p+o+c to p+c+o; p+c may no longer be inbounds.
  const DataLayout &DAL = First->getModule()->getDataLayout();
  APInt Offset(DAL.getIndexSizeInBits(
                   cast<PointerType>(First->getType())->getAddressSpace()),
               0);
  Value *NewBase =
      First->stripAndAccumulateInBoundsConstantOffsets(DAL, Offset);
  uint64_t ObjectSize;
  if (!getObjectSize(NewBase, ObjectSize, DAL, TLI) ||
      Offset.ugt(ObjectSize)) {
    First->setIsInBounds(false);
    Second->setIsInBounds(false);
  } else
    First->setIsInBounds(true);
}

PreservedAnalyses
SeparateConstOffsetFromGEPPass::run(Function &F, FunctionAnalysisManager &AM) {
  auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
  auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
  auto *LI = &AM.getResult<LoopAnalysis>(F);
  auto *TLI = &AM.getResult<TargetLibraryAnalysis>(F);
  auto GetTTI = [&AM](Function &F) -> TargetTransformInfo & {
    return AM.getResult<TargetIRAnalysis>(F);
  };
  SeparateConstOffsetFromGEP Impl(DT, SE, LI, TLI, GetTTI, LowerGEP);
  if (!Impl.run(F))
    return PreservedAnalyses::all();
  PreservedAnalyses PA;
  PA.preserveSet<CFGAnalyses>();
  return PA;
}