//===- AggressiveInstCombine.cpp ------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the aggressive expression pattern combiner classes.
// Currently, it handles expression patterns for:
//  * Truncate instruction
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h"
#include "AggressiveInstCombineInternal.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/BuildLibCalls.h"
#include "llvm/Transforms/Utils/Local.h"

using namespace llvm;
using namespace PatternMatch;

#define DEBUG_TYPE "aggressive-instcombine"

STATISTIC(NumAnyOrAllBitsSet, "Number of any/all-bits-set patterns folded");
STATISTIC(NumGuardedRotates,
          "Number of guarded rotates transformed into funnel shifts");
STATISTIC(NumGuardedFunnelShifts,
          "Number of guarded funnel shifts transformed into funnel shifts");
STATISTIC(NumPopCountRecognized, "Number of popcount idioms recognized");

static cl::opt<unsigned> MaxInstrsToScan(
    "aggressive-instcombine-max-scan-instrs", cl::init(64), cl::Hidden,
    cl::desc("Max number of instructions to scan for aggressive instcombine."));

/// Match a pattern for a bitwise funnel/rotate operation that partially guards
/// against undefined behavior by branching around the funnel-shift/rotation
/// when the shift amount is 0.
static bool foldGuardedFunnelShift(Instruction &I, const DominatorTree &DT) {
  if (I.getOpcode() != Instruction::PHI || I.getNumOperands() != 2)
    return false;

  // As with the one-use checks below, this is not strictly necessary, but we
  // are being cautious to avoid potential perf regressions on targets that
  // do not actually have a funnel/rotate instruction (where the funnel shift
  // would be expanded back into math/shift/logic ops).
  if (!isPowerOf2_32(I.getType()->getScalarSizeInBits()))
    return false;

  // Match V to funnel shift left/right and capture the source operands and
  // shift amount.
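  //
  // As a concrete illustration of the semantics being matched (illustrative
  // values only, not taken from the tests): with a 32-bit width,
  //   fshl(A, B, 3) == (A << 3) | (B >> 29)
  //   fshr(A, B, 3) == (A << 29) | (B >> 3)
  // and a rotate is the special case where A == B.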
  auto matchFunnelShift = [](Value *V, Value *&ShVal0, Value *&ShVal1,
                             Value *&ShAmt) {
    unsigned Width = V->getType()->getScalarSizeInBits();

    // fshl(ShVal0, ShVal1, ShAmt)
    // == (ShVal0 << ShAmt) | (ShVal1 >> (Width - ShAmt))
    if (match(V, m_OneUse(m_c_Or(
                     m_Shl(m_Value(ShVal0), m_Value(ShAmt)),
                     m_LShr(m_Value(ShVal1),
                            m_Sub(m_SpecificInt(Width), m_Deferred(ShAmt))))))) {
      return Intrinsic::fshl;
    }

    // fshr(ShVal0, ShVal1, ShAmt)
    // == (ShVal0 << (Width - ShAmt)) | (ShVal1 >> ShAmt)
    if (match(V,
              m_OneUse(m_c_Or(m_Shl(m_Value(ShVal0), m_Sub(m_SpecificInt(Width),
                                                           m_Value(ShAmt))),
                              m_LShr(m_Value(ShVal1), m_Deferred(ShAmt)))))) {
      return Intrinsic::fshr;
    }

    return Intrinsic::not_intrinsic;
  };

  // One phi operand must be a funnel/rotate operation, and the other phi
  // operand must be the source value of that funnel/rotate operation:
  // phi [ rotate(RotSrc, ShAmt), FunnelBB ], [ RotSrc, GuardBB ]
  // phi [ fshl(ShVal0, ShVal1, ShAmt), FunnelBB ], [ ShVal0, GuardBB ]
  // phi [ fshr(ShVal0, ShVal1, ShAmt), FunnelBB ], [ ShVal1, GuardBB ]
  PHINode &Phi = cast<PHINode>(I);
  unsigned FunnelOp = 0, GuardOp = 1;
  Value *P0 = Phi.getOperand(0), *P1 = Phi.getOperand(1);
  Value *ShVal0, *ShVal1, *ShAmt;
  Intrinsic::ID IID = matchFunnelShift(P0, ShVal0, ShVal1, ShAmt);
  if (IID == Intrinsic::not_intrinsic ||
      (IID == Intrinsic::fshl && ShVal0 != P1) ||
      (IID == Intrinsic::fshr && ShVal1 != P1)) {
    IID = matchFunnelShift(P1, ShVal0, ShVal1, ShAmt);
    if (IID == Intrinsic::not_intrinsic ||
        (IID == Intrinsic::fshl && ShVal0 != P0) ||
        (IID == Intrinsic::fshr && ShVal1 != P0))
      return false;
    assert((IID == Intrinsic::fshl || IID == Intrinsic::fshr) &&
           "Pattern must match funnel shift left or right");
    std::swap(FunnelOp, GuardOp);
  }

  // The incoming block with our source operand must be the "guard" block.
  // That must contain a cmp+branch to avoid the funnel/rotate when the shift
  // amount is equal to 0. The other incoming block is the block with the
  // funnel/rotate.
  BasicBlock *GuardBB = Phi.getIncomingBlock(GuardOp);
  BasicBlock *FunnelBB = Phi.getIncomingBlock(FunnelOp);
  Instruction *TermI = GuardBB->getTerminator();

  // Ensure that the shift values dominate each block.
  if (!DT.dominates(ShVal0, TermI) || !DT.dominates(ShVal1, TermI))
    return false;

  ICmpInst::Predicate Pred;
  BasicBlock *PhiBB = Phi.getParent();
  if (!match(TermI, m_Br(m_ICmp(Pred, m_Specific(ShAmt), m_ZeroInt()),
                         m_SpecificBB(PhiBB), m_SpecificBB(FunnelBB))))
    return false;

  if (Pred != CmpInst::ICMP_EQ)
    return false;

  IRBuilder<> Builder(PhiBB, PhiBB->getFirstInsertionPt());

  if (ShVal0 == ShVal1)
    ++NumGuardedRotates;
  else
    ++NumGuardedFunnelShifts;

  // If this is not a rotate then the guarding branch was blocking poison from
  // the operand that is unused on the 'shift-by-zero' path, but a funnel shift
  // will use both operands unconditionally - so freeze the one that may be
  // poison.
  bool IsFshl = IID == Intrinsic::fshl;
  if (ShVal0 != ShVal1) {
    if (IsFshl && !llvm::isGuaranteedNotToBePoison(ShVal1))
      ShVal1 = Builder.CreateFreeze(ShVal1);
    else if (!IsFshl && !llvm::isGuaranteedNotToBePoison(ShVal0))
      ShVal0 = Builder.CreateFreeze(ShVal0);
  }

  // We matched a variation of this IR pattern:
  // GuardBB:
  //   %cmp = icmp eq i32 %ShAmt, 0
  //   br i1 %cmp, label %PhiBB, label %FunnelBB
  // FunnelBB:
  //   %sub = sub i32 32, %ShAmt
  //   %shr = lshr i32 %ShVal1, %sub
  //   %shl = shl i32 %ShVal0, %ShAmt
  //   %fsh = or i32 %shr, %shl
  //   br label %PhiBB
  // PhiBB:
  //   %cond = phi i32 [ %fsh, %FunnelBB ], [ %ShVal0, %GuardBB ]
  // -->
  // llvm.fshl.i32(i32 %ShVal0, i32 %ShVal1, i32 %ShAmt)
  Function *F = Intrinsic::getDeclaration(Phi.getModule(), IID, Phi.getType());
  Phi.replaceAllUsesWith(Builder.CreateCall(F, {ShVal0, ShVal1, ShAmt}));
  return true;
}

/// This is used by foldAnyOrAllBitsSet() to capture a source value (Root) and
/// the bit indexes (Mask) needed by a masked compare. If we're matching a chain
/// of 'and' ops, then we also need to capture the fact that we saw an
/// "and X, 1", so that's an extra return value for that case.
struct MaskOps {
  Value *Root = nullptr;
  APInt Mask;
  bool MatchAndChain;
  bool FoundAnd1 = false;

  MaskOps(unsigned BitWidth, bool MatchAnds)
      : Mask(APInt::getZero(BitWidth)), MatchAndChain(MatchAnds) {}
};

/// This is a recursive helper for foldAnyOrAllBitsSet() that walks through a
/// chain of 'and' or 'or' instructions looking for shift ops of a common source
/// value. Examples:
///   or (or (or X, (X >> 3)), (X >> 5)), (X >> 8)
/// returns { X, 0x129 }
///   and (and (X >> 1), 1), (X >> 4)
/// returns { X, 0x12 }
static bool matchAndOrChain(Value *V, MaskOps &MOps) {
  Value *Op0, *Op1;
  if (MOps.MatchAndChain) {
    // Recurse through a chain of 'and' operands. This requires an extra check
    // vs. the 'or' matcher: we must find an "and X, 1" instruction somewhere
    // in the chain to know that all of the high bits are cleared.
    if (match(V, m_And(m_Value(Op0), m_One()))) {
      MOps.FoundAnd1 = true;
      return matchAndOrChain(Op0, MOps);
    }
    if (match(V, m_And(m_Value(Op0), m_Value(Op1))))
      return matchAndOrChain(Op0, MOps) && matchAndOrChain(Op1, MOps);
  } else {
    // Recurse through a chain of 'or' operands.
    if (match(V, m_Or(m_Value(Op0), m_Value(Op1))))
      return matchAndOrChain(Op0, MOps) && matchAndOrChain(Op1, MOps);
  }

  // We need a shift-right or a bare value representing a compare of bit 0 of
  // the original source operand.
  Value *Candidate;
  const APInt *BitIndex = nullptr;
  if (!match(V, m_LShr(m_Value(Candidate), m_APInt(BitIndex))))
    Candidate = V;

  // Initialize result source operand.
  if (!MOps.Root)
    MOps.Root = Candidate;

  // The shift constant is out-of-range? This code hasn't been simplified.
  if (BitIndex && BitIndex->uge(MOps.Mask.getBitWidth()))
    return false;

  // Fill in the mask bit derived from the shift constant.
  MOps.Mask.setBit(BitIndex ? BitIndex->getZExtValue() : 0);
  return MOps.Root == Candidate;
}
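
// As an illustration of matchAndOrChain() (hypothetical IR, register names
// invented): the 'or' form matches
//   %s3 = lshr i32 %x, 3
//   %o1 = or i32 %x, %s3
//   %s5 = lshr i32 %x, 5
//   %o2 = or i32 %o1, %s5
// leaving MOps.Root == %x and MOps.Mask == 0x29 (bits 0, 3 and 5 set).
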
/// Match patterns that correspond to "any-bits-set" and "all-bits-set".
/// These will include a chain of 'or' or 'and'-shifted bits from a
/// common source value:
/// and (or  (lshr X, C), ...), 1 --> (X & CMask) != 0
/// and (and (lshr X, C), ...), 1 --> (X & CMask) == CMask
/// Note: "any-bits-clear" and "all-bits-clear" are variations of these patterns
/// that differ only with a final 'not' of the result. We expect that final
/// 'not' to be folded with the compare that we create here (invert predicate).
static bool foldAnyOrAllBitsSet(Instruction &I) {
  // The 'any-bits-set' ('or' chain) pattern is simpler to match because the
  // final "and X, 1" instruction must be the final op in the sequence.
  bool MatchAllBitsSet;
  if (match(&I, m_c_And(m_OneUse(m_And(m_Value(), m_Value())), m_Value())))
    MatchAllBitsSet = true;
  else if (match(&I, m_And(m_OneUse(m_Or(m_Value(), m_Value())), m_One())))
    MatchAllBitsSet = false;
  else
    return false;

  MaskOps MOps(I.getType()->getScalarSizeInBits(), MatchAllBitsSet);
  if (MatchAllBitsSet) {
    if (!matchAndOrChain(cast<BinaryOperator>(&I), MOps) || !MOps.FoundAnd1)
      return false;
  } else {
    if (!matchAndOrChain(cast<BinaryOperator>(&I)->getOperand(0), MOps))
      return false;
  }

  // The pattern was found. Create a masked compare that replaces all of the
  // shift and logic ops.
  IRBuilder<> Builder(&I);
  Constant *Mask = ConstantInt::get(I.getType(), MOps.Mask);
  Value *And = Builder.CreateAnd(MOps.Root, Mask);
  Value *Cmp = MatchAllBitsSet ? Builder.CreateICmpEQ(And, Mask)
                               : Builder.CreateIsNotNull(And);
  Value *Zext = Builder.CreateZExt(Cmp, I.getType());
  I.replaceAllUsesWith(Zext);
  ++NumAnyOrAllBitsSet;
  return true;
}

// Try to recognize the function below as a popcount intrinsic.
// This is the "best" algorithm from
// http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
// Also used in TargetLowering::expandCTPOP().
//
// int popcount(unsigned int i) {
//   i = i - ((i >> 1) & 0x55555555);
//   i = (i & 0x33333333) + ((i >> 2) & 0x33333333);
//   i = ((i + (i >> 4)) & 0x0F0F0F0F);
//   return (i * 0x01010101) >> 24;
// }
static bool tryToRecognizePopCount(Instruction &I) {
  if (I.getOpcode() != Instruction::LShr)
    return false;

  Type *Ty = I.getType();
  if (!Ty->isIntOrIntVectorTy())
    return false;

  unsigned Len = Ty->getScalarSizeInBits();
  // FIXME: fix Len == 8 and other irregular type lengths.
  if (!(Len <= 128 && Len > 8 && Len % 8 == 0))
    return false;

  APInt Mask55 = APInt::getSplat(Len, APInt(8, 0x55));
  APInt Mask33 = APInt::getSplat(Len, APInt(8, 0x33));
  APInt Mask0F = APInt::getSplat(Len, APInt(8, 0x0F));
  APInt Mask01 = APInt::getSplat(Len, APInt(8, 0x01));
  APInt MaskShift = APInt(Len, Len - 8);

  Value *Op0 = I.getOperand(0);
  Value *Op1 = I.getOperand(1);
  Value *MulOp0;
  // Matching "(i * 0x01010101...) >> 24".
  if ((match(Op0, m_Mul(m_Value(MulOp0), m_SpecificInt(Mask01)))) &&
      match(Op1, m_SpecificInt(MaskShift))) {
    Value *ShiftOp0;
    // Matching "((i + (i >> 4)) & 0x0F0F0F0F...)".
    if (match(MulOp0, m_And(m_c_Add(m_LShr(m_Value(ShiftOp0), m_SpecificInt(4)),
                                    m_Deferred(ShiftOp0)),
                            m_SpecificInt(Mask0F)))) {
      Value *AndOp0;
      // Matching "(i & 0x33333333...) + ((i >> 2) & 0x33333333...)".
      if (match(ShiftOp0,
                m_c_Add(m_And(m_Value(AndOp0), m_SpecificInt(Mask33)),
                        m_And(m_LShr(m_Deferred(AndOp0), m_SpecificInt(2)),
                              m_SpecificInt(Mask33))))) {
        Value *Root, *SubOp1;
        // Matching "i - ((i >> 1) & 0x55555555...)".
        if (match(AndOp0, m_Sub(m_Value(Root), m_Value(SubOp1))) &&
            match(SubOp1, m_And(m_LShr(m_Specific(Root), m_SpecificInt(1)),
                                m_SpecificInt(Mask55)))) {
          LLVM_DEBUG(dbgs() << "Recognized popcount intrinsic\n");
          IRBuilder<> Builder(&I);
          Function *Func = Intrinsic::getDeclaration(
              I.getModule(), Intrinsic::ctpop, I.getType());
          I.replaceAllUsesWith(Builder.CreateCall(Func, {Root}));
          ++NumPopCountRecognized;
          return true;
        }
      }
    }
  }

  return false;
}

/// Fold smin(smax(fptosi(x), C1), C2) to llvm.fptosi.sat(x), provided C1 and
/// C2 saturate the value of the fp conversion. The transform is not
/// reversible, as fptosi.sat is more defined than the input: every input
/// produces a valid value for fptosi.sat, whereas inputs that were out of
/// range of the integer conversion produce poison for the original. The
/// reversed pattern may use fmax and fmin instead. As we cannot directly
/// reverse the transform, and it is not always profitable, we make it
/// conditional on the cost being reported as lower by TTI.
static bool tryToFPToSat(Instruction &I, TargetTransformInfo &TTI) {
  // Look for smin(smax(fptosi(x), C1), C2) (in either nesting order),
  // converting to fptosi_sat.
  Value *In;
  const APInt *MinC, *MaxC;
  if (!match(&I, m_SMax(m_OneUse(m_SMin(m_OneUse(m_FPToSI(m_Value(In))),
                                        m_APInt(MinC))),
                        m_APInt(MaxC))) &&
      !match(&I, m_SMin(m_OneUse(m_SMax(m_OneUse(m_FPToSI(m_Value(In))),
                                        m_APInt(MaxC))),
                        m_APInt(MinC))))
    return false;

  // Check that the constants clamp a saturate.
  if (!(*MinC + 1).isPowerOf2() || -*MaxC != *MinC + 1)
    return false;

  Type *IntTy = I.getType();
  Type *FpTy = In->getType();
  Type *SatTy =
      IntegerType::get(IntTy->getContext(), (*MinC + 1).exactLogBase2() + 1);
  if (auto *VecTy = dyn_cast<VectorType>(IntTy))
    SatTy = VectorType::get(SatTy, VecTy->getElementCount());

  // Get the cost of the intrinsic, and check that against the cost of
  // fptosi+smin+smax.
  InstructionCost SatCost = TTI.getIntrinsicInstrCost(
      IntrinsicCostAttributes(Intrinsic::fptosi_sat, SatTy, {In}, {FpTy}),
      TTI::TCK_RecipThroughput);
  SatCost += TTI.getCastInstrCost(Instruction::SExt, IntTy, SatTy,
                                  TTI::CastContextHint::None,
                                  TTI::TCK_RecipThroughput);

  InstructionCost MinMaxCost = TTI.getCastInstrCost(
      Instruction::FPToSI, IntTy, FpTy, TTI::CastContextHint::None,
      TTI::TCK_RecipThroughput);
  MinMaxCost += TTI.getIntrinsicInstrCost(
      IntrinsicCostAttributes(Intrinsic::smin, IntTy, {IntTy}),
      TTI::TCK_RecipThroughput);
  MinMaxCost += TTI.getIntrinsicInstrCost(
      IntrinsicCostAttributes(Intrinsic::smax, IntTy, {IntTy}),
      TTI::TCK_RecipThroughput);

  if (SatCost >= MinMaxCost)
    return false;

  IRBuilder<> Builder(&I);
  Function *Fn = Intrinsic::getDeclaration(I.getModule(), Intrinsic::fptosi_sat,
                                           {SatTy, FpTy});
  Value *Sat = Builder.CreateCall(Fn, In);
  I.replaceAllUsesWith(Builder.CreateSExt(Sat, IntTy));
  return true;
}
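
// Illustrative example of the fold above (hypothetical IR, names invented):
//   %c = fptosi float %x to i32
//   %l = call i32 @llvm.smax.i32(i32 %c, i32 -128)
//   %r = call i32 @llvm.smin.i32(i32 %l, i32 127)
// -->
//   %s = call i8 @llvm.fptosi.sat.i8.f32(float %x)
//   %r = sext i8 %s to i32
// (only performed when TTI reports the saturating form as cheaper).
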
// Check if this array of constants represents a cttz table.
// Iterate over the elements from \p Table by trying to find/match all
// the numbers from 0 to \p InputBits that should represent cttz results.
static bool isCTTZTable(const ConstantDataArray &Table, uint64_t Mul,
                        uint64_t Shift, uint64_t InputBits) {
  unsigned Length = Table.getNumElements();
  if (Length < InputBits || Length > InputBits * 2)
    return false;

  APInt Mask = APInt::getBitsSetFrom(InputBits, Shift);
  unsigned Matched = 0;

  for (unsigned i = 0; i < Length; i++) {
    uint64_t Element = Table.getElementAsInteger(i);
    if (Element >= InputBits)
      continue;

    // Check if \p Element matches a concrete answer. It could fail for some
    // elements that are never accessed, so we keep iterating over each element
    // from the table. The number of matched elements should be equal to the
    // number of potential right answers, which is \p InputBits.
    if ((((Mul << Element) & Mask.getZExtValue()) >> Shift) == i)
      Matched++;
  }

  return Matched == InputBits;
}

// Try to recognize a table-based ctz implementation.
// E.g., an example in C (for more cases please see the llvm/tests):
// int f(unsigned x) {
//   static const char table[32] =
//     {0, 1, 28, 2, 29, 14, 24, 3, 30,
//      22, 20, 15, 25, 17, 4, 8, 31, 27,
//      13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9};
//   return table[((unsigned)((x & -x) * 0x077CB531U)) >> 27];
// }
// This can be lowered to the `cttz` instruction.
// There is also a special case when the element is 0.
//
// Here are some examples of LLVM IR for a 64-bit target:
//
// CASE 1:
//   %sub = sub i32 0, %x
//   %and = and i32 %sub, %x
//   %mul = mul i32 %and, 125613361
//   %shr = lshr i32 %mul, 27
//   %idxprom = zext i32 %shr to i64
//   %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @ctz1.table,
//               i64 0, i64 %idxprom
//   %0 = load i8, i8* %arrayidx, align 1, !tbaa !8
//
// CASE 2:
//   %sub = sub i32 0, %x
//   %and = and i32 %sub, %x
//   %mul = mul i32 %and, 72416175
//   %shr = lshr i32 %mul, 26
//   %idxprom = zext i32 %shr to i64
//   %arrayidx = getelementptr inbounds [64 x i16], [64 x i16]* @ctz2.table,
//               i64 0, i64 %idxprom
//   %0 = load i16, i16* %arrayidx, align 2, !tbaa !8
//
// CASE 3:
//   %sub = sub i32 0, %x
//   %and = and i32 %sub, %x
//   %mul = mul i32 %and, 81224991
//   %shr = lshr i32 %mul, 27
//   %idxprom = zext i32 %shr to i64
//   %arrayidx = getelementptr inbounds [32 x i32], [32 x i32]* @ctz3.table,
//               i64 0, i64 %idxprom
//   %0 = load i32, i32* %arrayidx, align 4, !tbaa !8
//
// CASE 4:
//   %sub = sub i64 0, %x
//   %and = and i64 %sub, %x
//   %mul = mul i64 %and, 283881067100198605
//   %shr = lshr i64 %mul, 58
//   %arrayidx = getelementptr inbounds [64 x i8], [64 x i8]* @table, i64 0,
//               i64 %shr
//   %0 = load i8, i8* %arrayidx, align 1, !tbaa !8
//
// All this can be lowered to @llvm.cttz.i32/64 intrinsic.
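//
// As a rough illustration of the check done in isCTTZTable() (constants taken
// from CASE 1 above): the multiplier 125613361 is the de Bruijn constant
// 0x077CB531, and for every expected result i in [0, 32) there must be a
// table index
//   idx = ((0x077CB531 << i) & 0xFFFFFFFF) >> 27
// with table[idx] == i; isCTTZTable() simply counts how many of the 32
// expected answers are represented this way.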
static bool tryToRecognizeTableBasedCttz(Instruction &I) {
  LoadInst *LI = dyn_cast<LoadInst>(&I);
  if (!LI)
    return false;

  Type *AccessType = LI->getType();
  if (!AccessType->isIntegerTy())
    return false;

  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(LI->getPointerOperand());
  if (!GEP || !GEP->isInBounds() || GEP->getNumIndices() != 2)
    return false;

  if (!GEP->getSourceElementType()->isArrayTy())
    return false;

  uint64_t ArraySize = GEP->getSourceElementType()->getArrayNumElements();
  if (ArraySize != 32 && ArraySize != 64)
    return false;

  GlobalVariable *GVTable = dyn_cast<GlobalVariable>(GEP->getPointerOperand());
  if (!GVTable || !GVTable->hasInitializer() || !GVTable->isConstant())
    return false;

  ConstantDataArray *ConstData =
      dyn_cast<ConstantDataArray>(GVTable->getInitializer());
  if (!ConstData)
    return false;

  if (!match(GEP->idx_begin()->get(), m_ZeroInt()))
    return false;

  Value *Idx2 = std::next(GEP->idx_begin())->get();
  Value *X1;
  uint64_t MulConst, ShiftConst;
  // FIXME: 64-bit targets have `i64` type for the GEP index, so this match will
  // probably fail for other (e.g. 32-bit) targets.
  if (!match(Idx2, m_ZExtOrSelf(
                       m_LShr(m_Mul(m_c_And(m_Neg(m_Value(X1)), m_Deferred(X1)),
                                    m_ConstantInt(MulConst)),
                              m_ConstantInt(ShiftConst)))))
    return false;

  unsigned InputBits = X1->getType()->getScalarSizeInBits();
  if (InputBits != 32 && InputBits != 64)
    return false;

  // Shift should extract the top 5..7 bits.
  if (InputBits - Log2_32(InputBits) != ShiftConst &&
      InputBits - Log2_32(InputBits) - 1 != ShiftConst)
    return false;

  if (!isCTTZTable(*ConstData, MulConst, ShiftConst, InputBits))
    return false;

  auto ZeroTableElem = ConstData->getElementAsInteger(0);
  bool DefinedForZero = ZeroTableElem == InputBits;

  IRBuilder<> B(LI);
  ConstantInt *BoolConst = B.getInt1(!DefinedForZero);
  Type *XType = X1->getType();
  auto Cttz = B.CreateIntrinsic(Intrinsic::cttz, {XType}, {X1, BoolConst});
  Value *ZExtOrTrunc = nullptr;

  if (DefinedForZero) {
    ZExtOrTrunc = B.CreateZExtOrTrunc(Cttz, AccessType);
  } else {
    // If the value in elem 0 isn't the same as InputBits, we still want to
    // produce the value from the table.
    auto Cmp = B.CreateICmpEQ(X1, ConstantInt::get(XType, 0));
    auto Select =
        B.CreateSelect(Cmp, ConstantInt::get(XType, ZeroTableElem), Cttz);

    // NOTE: If table[0] is 0 but cttz(0) is defined by the target, this
    // should be handled as: `cttz(x) & (typeSize - 1)`.

    ZExtOrTrunc = B.CreateZExtOrTrunc(Select, AccessType);
  }

  LI->replaceAllUsesWith(ZExtOrTrunc);

  return true;
}

/// This is used by foldLoadsRecursive() to capture a Root Load node, which is
/// of the form or(load, load), and to recursively build the wide load. Also
/// capture the shift amount, zero-extend type and load size.
struct LoadOps {
  LoadInst *Root = nullptr;
  LoadInst *RootInsert = nullptr;
  bool FoundRoot = false;
  uint64_t LoadSize = 0;
  const APInt *Shift = nullptr;
  Type *ZextType;
  AAMDNodes AATags;
};

// Identify and merge consecutive loads recursively, handling chains of the
// form:
//   (ZExt(L1) << shift1) | (ZExt(L2) << shift2) -> ZExt(L3) << shift1
//   (ZExt(L1) << shift1) | ZExt(L2)             -> ZExt(L3)
static bool foldLoadsRecursive(Value *V, LoadOps &LOps, const DataLayout &DL,
                               AliasAnalysis &AA) {
  const APInt *ShAmt2 = nullptr;
  Value *X;
  Instruction *L1, *L2;

  // Go to the last node with loads.
  if (match(V, m_OneUse(m_c_Or(
                   m_Value(X),
                   m_OneUse(m_Shl(m_OneUse(m_ZExt(m_OneUse(m_Instruction(L2)))),
                                  m_APInt(ShAmt2)))))) ||
      match(V, m_OneUse(m_Or(m_Value(X),
                             m_OneUse(m_ZExt(m_OneUse(m_Instruction(L2)))))))) {
    if (!foldLoadsRecursive(X, LOps, DL, AA) && LOps.FoundRoot)
      // Avoid a partial chain merge.
      return false;
  } else
    return false;

  // Check if the pattern has loads.
  LoadInst *LI1 = LOps.Root;
  const APInt *ShAmt1 = LOps.Shift;
  if (LOps.FoundRoot == false &&
      (match(X, m_OneUse(m_ZExt(m_Instruction(L1)))) ||
       match(X, m_OneUse(m_Shl(m_OneUse(m_ZExt(m_OneUse(m_Instruction(L1)))),
                               m_APInt(ShAmt1)))))) {
    LI1 = dyn_cast<LoadInst>(L1);
  }
  LoadInst *LI2 = dyn_cast<LoadInst>(L2);

  // Bail out if the loads are the same instruction, are not simple (atomic or
  // volatile), or are in different address spaces.
  if (LI1 == LI2 || !LI1 || !LI2 || !LI1->isSimple() || !LI2->isSimple() ||
      LI1->getPointerAddressSpace() != LI2->getPointerAddressSpace())
    return false;

  // Check that the loads are in the same basic block.
  if (LI1->getParent() != LI2->getParent())
    return false;

  // Note the endianness of the target.
  bool IsBigEndian = DL.isBigEndian();

  // Check if the loads are consecutive and of the same size.
  Value *Load1Ptr = LI1->getPointerOperand();
  APInt Offset1(DL.getIndexTypeSizeInBits(Load1Ptr->getType()), 0);
  Load1Ptr =
      Load1Ptr->stripAndAccumulateConstantOffsets(DL, Offset1,
                                                  /* AllowNonInbounds */ true);

  Value *Load2Ptr = LI2->getPointerOperand();
  APInt Offset2(DL.getIndexTypeSizeInBits(Load2Ptr->getType()), 0);
  Load2Ptr =
      Load2Ptr->stripAndAccumulateConstantOffsets(DL, Offset2,
                                                  /* AllowNonInbounds */ true);

  // Verify that both loads have the same base pointer and the same load size.
  uint64_t LoadSize1 = LI1->getType()->getPrimitiveSizeInBits();
  uint64_t LoadSize2 = LI2->getType()->getPrimitiveSizeInBits();
  if (Load1Ptr != Load2Ptr || LoadSize1 != LoadSize2)
    return false;

  // Only support load sizes that are at least 8 bits and a power of 2.
  if (LoadSize1 < 8 || !isPowerOf2_64(LoadSize1))
    return false;

  // Use alias analysis to check for stores between the loads.
  LoadInst *Start = LOps.FoundRoot ? LOps.RootInsert : LI1, *End = LI2;
  MemoryLocation Loc;
  if (!Start->comesBefore(End)) {
    std::swap(Start, End);
    Loc = MemoryLocation::get(End);
    if (LOps.FoundRoot)
      Loc = Loc.getWithNewSize(LOps.LoadSize);
  } else
    Loc = MemoryLocation::get(End);
  unsigned NumScanned = 0;
  for (Instruction &Inst :
       make_range(Start->getIterator(), End->getIterator())) {
    if (Inst.mayWriteToMemory() && isModSet(AA.getModRefInfo(&Inst, Loc)))
      return false;
    if (++NumScanned > MaxInstrsToScan)
      return false;
  }

  // Make sure the load with the lower offset is LI1.
  bool Reverse = false;
  if (Offset2.slt(Offset1)) {
    std::swap(LI1, LI2);
    std::swap(ShAmt1, ShAmt2);
    std::swap(Offset1, Offset2);
    std::swap(Load1Ptr, Load2Ptr);
    std::swap(LoadSize1, LoadSize2);
    Reverse = true;
  }

  // For big endian, swap the shifts.
  if (IsBigEndian)
    std::swap(ShAmt1, ShAmt2);

  // Find the shift values.
  uint64_t Shift1 = 0, Shift2 = 0;
  if (ShAmt1)
    Shift1 = ShAmt1->getZExtValue();
  if (ShAmt2)
    Shift2 = ShAmt2->getZExtValue();

  // The first load is always LI1. This is where we put the new load.
  // Use the merged load size available from LI1 for forward loads.
  if (LOps.FoundRoot) {
    if (!Reverse)
      LoadSize1 = LOps.LoadSize;
    else
      LoadSize2 = LOps.LoadSize;
  }

  // Verify that the shift amounts line up with the load offsets, i.e. that
  // the loads are consecutive.
  uint64_t ShiftDiff = IsBigEndian ? LoadSize2 : LoadSize1;
  uint64_t PrevSize =
      DL.getTypeStoreSize(IntegerType::get(LI1->getContext(), LoadSize1));
  if ((Shift2 - Shift1) != ShiftDiff || (Offset2 - Offset1) != PrevSize)
    return false;

  // Update LOps.
  AAMDNodes AATags1 = LOps.AATags;
  AAMDNodes AATags2 = LI2->getAAMetadata();
  if (LOps.FoundRoot == false) {
    LOps.FoundRoot = true;
    AATags1 = LI1->getAAMetadata();
  }
  LOps.LoadSize = LoadSize1 + LoadSize2;
  LOps.RootInsert = Start;

  // Concatenate the AATags of the merged loads.
  LOps.AATags = AATags1.concat(AATags2);

  LOps.Root = LI1;
  LOps.Shift = ShAmt1;
  LOps.ZextType = X->getType();
  return true;
}
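
// Illustrative example of a chain that foldLoadsRecursive() and
// foldConsecutiveLoads() can merge (hypothetical IR, little-endian target,
// assuming the wider access is legal and fast):
//   %l1 = load i8, ptr %p
//   %g  = getelementptr i8, ptr %p, i64 1
//   %l2 = load i8, ptr %g
//   %e1 = zext i8 %l1 to i16
//   %e2 = zext i8 %l2 to i16
//   %s2 = shl i16 %e2, 8
//   %or = or i16 %e1, %s2
// -->
//   %or = load i16, ptr %p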

// For a given BB instruction, evaluate all loads in the chain that form a
// pattern which suggests that the loads can be combined. The one and only use
// of the loads is to form a wider load.
static bool foldConsecutiveLoads(Instruction &I, const DataLayout &DL,
                                 TargetTransformInfo &TTI, AliasAnalysis &AA,
                                 const DominatorTree &DT) {
  // Only consider load chains of scalar values.
  if (isa<VectorType>(I.getType()))
    return false;

  LoadOps LOps;
  if (!foldLoadsRecursive(&I, LOps, DL, AA) || !LOps.FoundRoot)
    return false;

  IRBuilder<> Builder(&I);
  LoadInst *NewLoad = nullptr, *LI1 = LOps.Root;

  IntegerType *WiderType = IntegerType::get(I.getContext(), LOps.LoadSize);
  // TTI-based check of whether we want to proceed with the wider load.
  bool Allowed = TTI.isTypeLegal(WiderType);
  if (!Allowed)
    return false;

  unsigned AS = LI1->getPointerAddressSpace();
  unsigned Fast = 0;
  Allowed = TTI.allowsMisalignedMemoryAccesses(I.getContext(), LOps.LoadSize,
                                               AS, LI1->getAlign(), &Fast);
  if (!Allowed || !Fast)
    return false;

  // Get the Index and Ptr for the new GEP.
  Value *Load1Ptr = LI1->getPointerOperand();
  Builder.SetInsertPoint(LOps.RootInsert);
  if (!DT.dominates(Load1Ptr, LOps.RootInsert)) {
    APInt Offset1(DL.getIndexTypeSizeInBits(Load1Ptr->getType()), 0);
    Load1Ptr = Load1Ptr->stripAndAccumulateConstantOffsets(
        DL, Offset1, /* AllowNonInbounds */ true);
    Load1Ptr = Builder.CreateGEP(Builder.getInt8Ty(), Load1Ptr,
                                 Builder.getInt32(Offset1.getZExtValue()));
  }
  // Generate the wider load.
  NewLoad = Builder.CreateAlignedLoad(WiderType, Load1Ptr, LI1->getAlign(),
                                      LI1->isVolatile(), "");
  NewLoad->takeName(LI1);
  // Set the new load's AATags metadata.
  if (LOps.AATags)
    NewLoad->setAAMetadata(LOps.AATags);

  Value *NewOp = NewLoad;
  // Check if a zero extend is needed.
  if (LOps.ZextType)
    NewOp = Builder.CreateZExt(NewOp, LOps.ZextType);

  // Check if a shift is needed. We must shift by the amount of LI1's shift,
  // if it is nonzero.
  if (LOps.Shift)
    NewOp = Builder.CreateShl(NewOp,
                              ConstantInt::get(I.getContext(), *LOps.Shift));
  I.replaceAllUsesWith(NewOp);

  return true;
}

// Calculate the GEP stride and the accumulated constant offset (ModOffset),
// and return both.
static std::pair<APInt, APInt>
getStrideAndModOffsetOfGEP(Value *PtrOp, const DataLayout &DL) {
  unsigned BW = DL.getIndexTypeSizeInBits(PtrOp->getType());
  std::optional<APInt> Stride;
  APInt ModOffset(BW, 0);
  // Return a minimum GEP stride: the greatest common divisor of consecutive
  // GEP index scales (c.f. Bézout's identity).
  while (auto *GEP = dyn_cast<GEPOperator>(PtrOp)) {
    MapVector<Value *, APInt> VarOffsets;
    if (!GEP->collectOffset(DL, BW, VarOffsets, ModOffset))
      break;

    for (auto [V, Scale] : VarOffsets) {
      // Only keep a power-of-two factor for non-inbounds GEPs.
      if (!GEP->isInBounds())
        Scale = APInt::getOneBitSet(Scale.getBitWidth(), Scale.countr_zero());

      if (!Stride)
        Stride = Scale;
      else
        Stride = APIntOps::GreatestCommonDivisor(*Stride, Scale);
    }

    PtrOp = GEP->getPointerOperand();
  }

  // Check whether the pointer arrives back at a GlobalVariable via at least
  // one GEP. Even if it doesn't, we can still check by alignment.
  if (!isa<GlobalVariable>(PtrOp) || !Stride)
    return {APInt(BW, 1), APInt(BW, 0)};

  // Because GEP indices may be signed, reduce the offset to its non-negative
  // remainder modulo the minimum GEP stride.
  ModOffset = ModOffset.srem(*Stride);
  if (ModOffset.isNegative())
    ModOffset += *Stride;

  return {*Stride, ModOffset};
}
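
// Illustrative example (hypothetical): for an access such as
//   getelementptr inbounds [256 x i16], ptr @tbl, i64 0, i64 %i
// the only variable index is scaled by 2, so getStrideAndModOffsetOfGEP()
// returns Stride == 2 and ModOffset == 0: every possible offset into @tbl is
// an even number of bytes, which lets foldPatternedLoads() below sample the
// initializer only at those offsets.
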
/// If C is a constant patterned array and all valid loaded results for the
/// given alignment are equal to the same constant, return that constant.
static bool foldPatternedLoads(Instruction &I, const DataLayout &DL) {
  auto *LI = dyn_cast<LoadInst>(&I);
  if (!LI || LI->isVolatile())
    return false;

  // We can only fold the load if it is from a constant global with definitive
  // initializer. Skip expensive logic if this is not the case.
  auto *PtrOp = LI->getPointerOperand();
  auto *GV = dyn_cast<GlobalVariable>(getUnderlyingObject(PtrOp));
  if (!GV || !GV->isConstant() || !GV->hasDefinitiveInitializer())
    return false;

  // Bail for large initializers in excess of 4K to avoid too many scans.
  Constant *C = GV->getInitializer();
  uint64_t GVSize = DL.getTypeAllocSize(C->getType());
  if (!GVSize || 4096 < GVSize)
    return false;

  Type *LoadTy = LI->getType();
  unsigned BW = DL.getIndexTypeSizeInBits(PtrOp->getType());
  auto [Stride, ConstOffset] = getStrideAndModOffsetOfGEP(PtrOp, DL);

  // Any possible offset is a multiple of the GEP stride, and any valid offset
  // is a multiple of the load alignment, so it is sufficient to check only
  // multiples of the larger of the two to establish that the results are
  // equal.
  if (auto LA = LI->getAlign();
      LA <= GV->getAlign().valueOrOne() && Stride.getZExtValue() < LA.value()) {
    ConstOffset = APInt(BW, 0);
    Stride = APInt(BW, LA.value());
  }

  Constant *Ca = ConstantFoldLoadFromConst(C, LoadTy, ConstOffset, DL);
  if (!Ca)
    return false;

  unsigned E = GVSize - DL.getTypeStoreSize(LoadTy);
  for (; ConstOffset.getZExtValue() <= E; ConstOffset += Stride)
    if (Ca != ConstantFoldLoadFromConst(C, LoadTy, ConstOffset, DL))
      return false;

  I.replaceAllUsesWith(Ca);

  return true;
}

/// Try to replace a mathlib call to sqrt with the LLVM intrinsic. This avoids
/// pessimistic codegen that has to account for setting errno and can enable
/// vectorization.
static bool foldSqrt(CallInst *Call, TargetTransformInfo &TTI,
                     TargetLibraryInfo &TLI, AssumptionCache &AC,
                     DominatorTree &DT) {
  Module *M = Call->getModule();

  // If (1) this is a sqrt libcall, (2) we can assume that NAN is not created
  // (because NNAN or the operand arg must not be less than -0.0) and (3) we
  // would not end up lowering to a libcall anyway (which could change the
  // value of errno), then:
  // (1) errno won't be set.
  // (2) it is safe to convert this to an intrinsic call.
  Type *Ty = Call->getType();
  Value *Arg = Call->getArgOperand(0);
  if (TTI.haveFastSqrt(Ty) &&
      (Call->hasNoNaNs() ||
       cannotBeOrderedLessThanZero(Arg, M->getDataLayout(), &TLI, 0, &AC, Call,
                                   &DT))) {
    IRBuilder<> Builder(Call);
    IRBuilderBase::FastMathFlagGuard Guard(Builder);
    Builder.setFastMathFlags(Call->getFastMathFlags());

    Function *Sqrt = Intrinsic::getDeclaration(M, Intrinsic::sqrt, Ty);
    Value *NewSqrt = Builder.CreateCall(Sqrt, Arg, "sqrt");
    Call->replaceAllUsesWith(NewSqrt);

    // Explicitly erase the old call because a call with side effects is not
    // trivially dead.
    Call->eraseFromParent();
    return true;
  }

  return false;
}
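
// Illustrative example of the sqrt fold above (hypothetical IR):
//   %r = call nnan double @sqrt(double %x)
// -->
//   %r = call nnan double @llvm.sqrt.f64(double %x)
// sqrt sets errno only for inputs that would produce a NaN, which the nnan
// flag (or a proof that %x is not less than -0.0) rules out, so the libcall
// can be replaced by the intrinsic.
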
/// Try to expand strcmp(P, "x") calls.
static bool expandStrcmp(CallInst *CI, DominatorTree &DT, bool &MadeCFGChange) {
  Value *Str1P = CI->getArgOperand(0), *Str2P = CI->getArgOperand(1);

  // Trivial cases are optimized during inst combine
  if (Str1P == Str2P)
    return false;

  StringRef Str1, Str2;
  bool HasStr1 = getConstantStringInfo(Str1P, Str1);
  bool HasStr2 = getConstantStringInfo(Str2P, Str2);

  Value *NonConstantP = nullptr;
  StringRef ConstantStr;

  if (!HasStr1 && HasStr2 && Str2.size() == 1) {
    NonConstantP = Str1P;
    ConstantStr = Str2;
  } else if (!HasStr2 && HasStr1 && Str1.size() == 1) {
    NonConstantP = Str2P;
    ConstantStr = Str1;
  } else {
    return false;
  }

  // Check if strcmp result is only used in a comparison with zero
  if (!isOnlyUsedInZeroComparison(CI))
    return false;

  // For strcmp(P, "x") do the following transformation:
  //
  // (before)
  // dst = strcmp(P, "x")
  //
  // (after)
  // v0 = P[0] - 'x'
  // [if v0 == 0]
  //   v1 = P[1]
  // dst = phi(v0, v1)
  //

  IRBuilder<> B(CI->getParent());
  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);

  Type *RetType = CI->getType();

  B.SetInsertPoint(CI);
  BasicBlock *InitialBB = B.GetInsertBlock();
  Value *Str1FirstCharacterValue =
      B.CreateZExt(B.CreateLoad(B.getInt8Ty(), NonConstantP), RetType);
  Value *Str2FirstCharacterValue =
      ConstantInt::get(RetType, static_cast<unsigned char>(ConstantStr[0]));
  Value *FirstCharacterSub =
      B.CreateNSWSub(Str1FirstCharacterValue, Str2FirstCharacterValue);
  Value *IsFirstCharacterSubZero =
      B.CreateICmpEQ(FirstCharacterSub, ConstantInt::get(RetType, 0));
  Instruction *IsFirstCharacterSubZeroBBTerminator = SplitBlockAndInsertIfThen(
      IsFirstCharacterSubZero, CI, /*Unreachable*/ false,
      /*BranchWeights*/ nullptr, &DTU);

  B.SetInsertPoint(IsFirstCharacterSubZeroBBTerminator);
  B.GetInsertBlock()->setName("strcmp_expand_sub_is_zero");
  BasicBlock *IsFirstCharacterSubZeroBB = B.GetInsertBlock();
  Value *Str1SecondCharacterValue = B.CreateZExt(
      B.CreateLoad(B.getInt8Ty(), B.CreateConstInBoundsGEP1_64(
                                      B.getInt8Ty(), NonConstantP, 1)),
      RetType);

  B.SetInsertPoint(CI);
  B.GetInsertBlock()->setName("strcmp_expand_sub_join");

  PHINode *Result = B.CreatePHI(RetType, 2);
  Result->addIncoming(FirstCharacterSub, InitialBB);
  Result->addIncoming(Str1SecondCharacterValue, IsFirstCharacterSubZeroBB);

  CI->replaceAllUsesWith(Result);
  CI->eraseFromParent();

  MadeCFGChange = true;

  return true;
}

static bool foldLibraryCalls(Instruction &I, TargetTransformInfo &TTI,
                             TargetLibraryInfo &TLI, DominatorTree &DT,
                             AssumptionCache &AC, bool &MadeCFGChange) {
  CallInst *CI = dyn_cast<CallInst>(&I);
  if (!CI)
    return false;

  LibFunc Func;
  Module *M = I.getModule();
  if (!TLI.getLibFunc(*CI, Func) || !isLibFuncEmittable(M, &TLI, Func))
    return false;

  switch (Func) {
  case LibFunc_sqrt:
  case LibFunc_sqrtf:
  case LibFunc_sqrtl:
    return foldSqrt(CI, TTI, TLI, AC, DT);
  case LibFunc_strcmp:
    return expandStrcmp(CI, DT, MadeCFGChange);
  default:
    break;
  }

  return false;
}

/// This is the entry point for folds that could be implemented in regular
/// InstCombine, but they are separated because they are not expected to occur
/// frequently and/or have more than a constant-length pattern match.
static bool foldUnusualPatterns(Function &F, DominatorTree &DT,
                                TargetTransformInfo &TTI,
                                TargetLibraryInfo &TLI, AliasAnalysis &AA,
                                AssumptionCache &AC, bool &MadeCFGChange) {
  bool MadeChange = false;
  for (BasicBlock &BB : F) {
    // Ignore unreachable basic blocks.
    if (!DT.isReachableFromEntry(&BB))
      continue;

    const DataLayout &DL = F.getParent()->getDataLayout();

    // Walk the block backwards for efficiency. We're matching a chain of
    // use->defs, so we're more likely to succeed by starting from the bottom.
    // Also, we want to avoid matching partial patterns.
    // TODO: It would be more efficient if we removed dead instructions
    // iteratively in this loop rather than waiting until the end.
    for (Instruction &I : make_early_inc_range(llvm::reverse(BB))) {
      MadeChange |= foldAnyOrAllBitsSet(I);
      MadeChange |= foldGuardedFunnelShift(I, DT);
      MadeChange |= tryToRecognizePopCount(I);
      MadeChange |= tryToFPToSat(I, TTI);
      MadeChange |= tryToRecognizeTableBasedCttz(I);
      MadeChange |= foldConsecutiveLoads(I, DL, TTI, AA, DT);
      MadeChange |= foldPatternedLoads(I, DL);
      // NOTE: This function may erase the instruction `I`, so it must be
      // called last in this sequence; otherwise a later fold could operate on
      // a deleted instruction.
      MadeChange |= foldLibraryCalls(I, TTI, TLI, DT, AC, MadeCFGChange);
    }
  }

  // We're done with transforms, so remove dead instructions.
  if (MadeChange)
    for (BasicBlock &BB : F)
      SimplifyInstructionsInBlock(&BB);

  return MadeChange;
}

/// This is the entry point for all transforms. Pass manager differences are
/// handled in the callers of this function.
static bool runImpl(Function &F, AssumptionCache &AC, TargetTransformInfo &TTI,
                    TargetLibraryInfo &TLI, DominatorTree &DT,
                    AliasAnalysis &AA, bool &ChangedCFG) {
  bool MadeChange = false;
  const DataLayout &DL = F.getParent()->getDataLayout();
  TruncInstCombine TIC(AC, TLI, DL, DT);
  MadeChange |= TIC.run(F);
  MadeChange |= foldUnusualPatterns(F, DT, TTI, TLI, AA, AC, ChangedCFG);
  return MadeChange;
}

PreservedAnalyses AggressiveInstCombinePass::run(Function &F,
                                                 FunctionAnalysisManager &AM) {
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);

  bool MadeCFGChange = false;

  if (!runImpl(F, AC, TTI, TLI, DT, AA, MadeCFGChange)) {
    // No changes, all analyses are preserved.
    return PreservedAnalyses::all();
  }

  // Mark all the analyses that instcombine updates as preserved.
  PreservedAnalyses PA;

  if (MadeCFGChange)
    PA.preserve<DominatorTreeAnalysis>();
  else
    PA.preserveSet<CFGAnalyses>();

  return PA;
}