1 //===- BypassSlowDivision.cpp - Bypass slow division ----------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file contains an optimization for div and rem on architectures that 10 // execute short instructions significantly faster than longer instructions. 11 // For example, on Intel Atom 32-bit divides are slow enough that during 12 // runtime it is profitable to check the value of the operands, and if they are 13 // positive and less than 256 use an unsigned 8-bit divide. 14 // 15 //===----------------------------------------------------------------------===// 16 17 #include "llvm/Transforms/Utils/BypassSlowDivision.h" 18 #include "llvm/ADT/DenseMap.h" 19 #include "llvm/ADT/None.h" 20 #include "llvm/ADT/Optional.h" 21 #include "llvm/ADT/STLExtras.h" 22 #include "llvm/ADT/SmallPtrSet.h" 23 #include "llvm/Transforms/Utils/Local.h" 24 #include "llvm/Analysis/ValueTracking.h" 25 #include "llvm/IR/BasicBlock.h" 26 #include "llvm/IR/Constants.h" 27 #include "llvm/IR/DerivedTypes.h" 28 #include "llvm/IR/Function.h" 29 #include "llvm/IR/IRBuilder.h" 30 #include "llvm/IR/Instruction.h" 31 #include "llvm/IR/Instructions.h" 32 #include "llvm/IR/Module.h" 33 #include "llvm/IR/Type.h" 34 #include "llvm/IR/Value.h" 35 #include "llvm/Support/Casting.h" 36 #include "llvm/Support/KnownBits.h" 37 #include <cassert> 38 #include <cstdint> 39 40 using namespace llvm; 41 42 #define DEBUG_TYPE "bypass-slow-division" 43 44 namespace { 45 46 struct QuotRemPair { 47 Value *Quotient; 48 Value *Remainder; 49 50 QuotRemPair(Value *InQuotient, Value *InRemainder) 51 : Quotient(InQuotient), Remainder(InRemainder) {} 52 }; 53 54 /// A quotient and remainder, plus a BB from which they logically "originate". 55 /// If you use Quotient or Remainder in a Phi node, you should use BB as its 56 /// corresponding predecessor. 57 struct QuotRemWithBB { 58 BasicBlock *BB = nullptr; 59 Value *Quotient = nullptr; 60 Value *Remainder = nullptr; 61 }; 62 63 using DivCacheTy = DenseMap<DivRemMapKey, QuotRemPair>; 64 using BypassWidthsTy = DenseMap<unsigned, unsigned>; 65 using VisitedSetTy = SmallPtrSet<Instruction *, 4>; 66 67 enum ValueRange { 68 /// Operand definitely fits into BypassType. No runtime checks are needed. 69 VALRNG_KNOWN_SHORT, 70 /// A runtime check is required, as value range is unknown. 71 VALRNG_UNKNOWN, 72 /// Operand is unlikely to fit into BypassType. The bypassing should be 73 /// disabled. 74 VALRNG_LIKELY_LONG 75 }; 76 77 class FastDivInsertionTask { 78 bool IsValidTask = false; 79 Instruction *SlowDivOrRem = nullptr; 80 IntegerType *BypassType = nullptr; 81 BasicBlock *MainBB = nullptr; 82 83 bool isHashLikeValue(Value *V, VisitedSetTy &Visited); 84 ValueRange getValueRange(Value *Op, VisitedSetTy &Visited); 85 QuotRemWithBB createSlowBB(BasicBlock *Successor); 86 QuotRemWithBB createFastBB(BasicBlock *Successor); 87 QuotRemPair createDivRemPhiNodes(QuotRemWithBB &LHS, QuotRemWithBB &RHS, 88 BasicBlock *PhiBB); 89 Value *insertOperandRuntimeCheck(Value *Op1, Value *Op2); 90 Optional<QuotRemPair> insertFastDivAndRem(); 91 92 bool isSignedOp() { 93 return SlowDivOrRem->getOpcode() == Instruction::SDiv || 94 SlowDivOrRem->getOpcode() == Instruction::SRem; 95 } 96 97 bool isDivisionOp() { 98 return SlowDivOrRem->getOpcode() == Instruction::SDiv || 99 SlowDivOrRem->getOpcode() == Instruction::UDiv; 100 } 101 102 Type *getSlowType() { return SlowDivOrRem->getType(); } 103 104 public: 105 FastDivInsertionTask(Instruction *I, const BypassWidthsTy &BypassWidths); 106 107 Value *getReplacement(DivCacheTy &Cache); 108 }; 109 110 } // end anonymous namespace 111 112 FastDivInsertionTask::FastDivInsertionTask(Instruction *I, 113 const BypassWidthsTy &BypassWidths) { 114 switch (I->getOpcode()) { 115 case Instruction::UDiv: 116 case Instruction::SDiv: 117 case Instruction::URem: 118 case Instruction::SRem: 119 SlowDivOrRem = I; 120 break; 121 default: 122 // I is not a div/rem operation. 123 return; 124 } 125 126 // Skip division on vector types. Only optimize integer instructions. 127 IntegerType *SlowType = dyn_cast<IntegerType>(SlowDivOrRem->getType()); 128 if (!SlowType) 129 return; 130 131 // Skip if this bitwidth is not bypassed. 132 auto BI = BypassWidths.find(SlowType->getBitWidth()); 133 if (BI == BypassWidths.end()) 134 return; 135 136 // Get type for div/rem instruction with bypass bitwidth. 137 IntegerType *BT = IntegerType::get(I->getContext(), BI->second); 138 BypassType = BT; 139 140 // The original basic block. 141 MainBB = I->getParent(); 142 143 // The instruction is indeed a slow div or rem operation. 144 IsValidTask = true; 145 } 146 147 /// Reuses previously-computed dividend or remainder from the current BB if 148 /// operands and operation are identical. Otherwise calls insertFastDivAndRem to 149 /// perform the optimization and caches the resulting dividend and remainder. 150 /// If no replacement can be generated, nullptr is returned. 151 Value *FastDivInsertionTask::getReplacement(DivCacheTy &Cache) { 152 // First, make sure that the task is valid. 153 if (!IsValidTask) 154 return nullptr; 155 156 // Then, look for a value in Cache. 157 Value *Dividend = SlowDivOrRem->getOperand(0); 158 Value *Divisor = SlowDivOrRem->getOperand(1); 159 DivRemMapKey Key(isSignedOp(), Dividend, Divisor); 160 auto CacheI = Cache.find(Key); 161 162 if (CacheI == Cache.end()) { 163 // If previous instance does not exist, try to insert fast div. 164 Optional<QuotRemPair> OptResult = insertFastDivAndRem(); 165 // Bail out if insertFastDivAndRem has failed. 166 if (!OptResult) 167 return nullptr; 168 CacheI = Cache.insert({Key, *OptResult}).first; 169 } 170 171 QuotRemPair &Value = CacheI->second; 172 return isDivisionOp() ? Value.Quotient : Value.Remainder; 173 } 174 175 /// Check if a value looks like a hash. 176 /// 177 /// The routine is expected to detect values computed using the most common hash 178 /// algorithms. Typically, hash computations end with one of the following 179 /// instructions: 180 /// 181 /// 1) MUL with a constant wider than BypassType 182 /// 2) XOR instruction 183 /// 184 /// And even if we are wrong and the value is not a hash, it is still quite 185 /// unlikely that such values will fit into BypassType. 186 /// 187 /// To detect string hash algorithms like FNV we have to look through PHI-nodes. 188 /// It is implemented as a depth-first search for values that look neither long 189 /// nor hash-like. 190 bool FastDivInsertionTask::isHashLikeValue(Value *V, VisitedSetTy &Visited) { 191 Instruction *I = dyn_cast<Instruction>(V); 192 if (!I) 193 return false; 194 195 switch (I->getOpcode()) { 196 case Instruction::Xor: 197 return true; 198 case Instruction::Mul: { 199 // After Constant Hoisting pass, long constants may be represented as 200 // bitcast instructions. As a result, some constants may look like an 201 // instruction at first, and an additional check is necessary to find out if 202 // an operand is actually a constant. 203 Value *Op1 = I->getOperand(1); 204 ConstantInt *C = dyn_cast<ConstantInt>(Op1); 205 if (!C && isa<BitCastInst>(Op1)) 206 C = dyn_cast<ConstantInt>(cast<BitCastInst>(Op1)->getOperand(0)); 207 return C && C->getValue().getMinSignedBits() > BypassType->getBitWidth(); 208 } 209 case Instruction::PHI: 210 // Stop IR traversal in case of a crazy input code. This limits recursion 211 // depth. 212 if (Visited.size() >= 16) 213 return false; 214 // Do not visit nodes that have been visited already. We return true because 215 // it means that we couldn't find any value that doesn't look hash-like. 216 if (Visited.find(I) != Visited.end()) 217 return true; 218 Visited.insert(I); 219 return llvm::all_of(cast<PHINode>(I)->incoming_values(), [&](Value *V) { 220 // Ignore undef values as they probably don't affect the division 221 // operands. 222 return getValueRange(V, Visited) == VALRNG_LIKELY_LONG || 223 isa<UndefValue>(V); 224 }); 225 default: 226 return false; 227 } 228 } 229 230 /// Check if an integer value fits into our bypass type. 231 ValueRange FastDivInsertionTask::getValueRange(Value *V, 232 VisitedSetTy &Visited) { 233 unsigned ShortLen = BypassType->getBitWidth(); 234 unsigned LongLen = V->getType()->getIntegerBitWidth(); 235 236 assert(LongLen > ShortLen && "Value type must be wider than BypassType"); 237 unsigned HiBits = LongLen - ShortLen; 238 239 const DataLayout &DL = SlowDivOrRem->getModule()->getDataLayout(); 240 KnownBits Known(LongLen); 241 242 computeKnownBits(V, Known, DL); 243 244 if (Known.countMinLeadingZeros() >= HiBits) 245 return VALRNG_KNOWN_SHORT; 246 247 if (Known.countMaxLeadingZeros() < HiBits) 248 return VALRNG_LIKELY_LONG; 249 250 // Long integer divisions are often used in hashtable implementations. It's 251 // not worth bypassing such divisions because hash values are extremely 252 // unlikely to have enough leading zeros. The call below tries to detect 253 // values that are unlikely to fit BypassType (including hashes). 254 if (isHashLikeValue(V, Visited)) 255 return VALRNG_LIKELY_LONG; 256 257 return VALRNG_UNKNOWN; 258 } 259 260 /// Add new basic block for slow div and rem operations and put it before 261 /// SuccessorBB. 262 QuotRemWithBB FastDivInsertionTask::createSlowBB(BasicBlock *SuccessorBB) { 263 QuotRemWithBB DivRemPair; 264 DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "", 265 MainBB->getParent(), SuccessorBB); 266 IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin()); 267 268 Value *Dividend = SlowDivOrRem->getOperand(0); 269 Value *Divisor = SlowDivOrRem->getOperand(1); 270 271 if (isSignedOp()) { 272 DivRemPair.Quotient = Builder.CreateSDiv(Dividend, Divisor); 273 DivRemPair.Remainder = Builder.CreateSRem(Dividend, Divisor); 274 } else { 275 DivRemPair.Quotient = Builder.CreateUDiv(Dividend, Divisor); 276 DivRemPair.Remainder = Builder.CreateURem(Dividend, Divisor); 277 } 278 279 Builder.CreateBr(SuccessorBB); 280 return DivRemPair; 281 } 282 283 /// Add new basic block for fast div and rem operations and put it before 284 /// SuccessorBB. 285 QuotRemWithBB FastDivInsertionTask::createFastBB(BasicBlock *SuccessorBB) { 286 QuotRemWithBB DivRemPair; 287 DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "", 288 MainBB->getParent(), SuccessorBB); 289 IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin()); 290 291 Value *Dividend = SlowDivOrRem->getOperand(0); 292 Value *Divisor = SlowDivOrRem->getOperand(1); 293 Value *ShortDivisorV = 294 Builder.CreateCast(Instruction::Trunc, Divisor, BypassType); 295 Value *ShortDividendV = 296 Builder.CreateCast(Instruction::Trunc, Dividend, BypassType); 297 298 // udiv/urem because this optimization only handles positive numbers. 299 Value *ShortQV = Builder.CreateUDiv(ShortDividendV, ShortDivisorV); 300 Value *ShortRV = Builder.CreateURem(ShortDividendV, ShortDivisorV); 301 DivRemPair.Quotient = 302 Builder.CreateCast(Instruction::ZExt, ShortQV, getSlowType()); 303 DivRemPair.Remainder = 304 Builder.CreateCast(Instruction::ZExt, ShortRV, getSlowType()); 305 Builder.CreateBr(SuccessorBB); 306 307 return DivRemPair; 308 } 309 310 /// Creates Phi nodes for result of Div and Rem. 311 QuotRemPair FastDivInsertionTask::createDivRemPhiNodes(QuotRemWithBB &LHS, 312 QuotRemWithBB &RHS, 313 BasicBlock *PhiBB) { 314 IRBuilder<> Builder(PhiBB, PhiBB->begin()); 315 PHINode *QuoPhi = Builder.CreatePHI(getSlowType(), 2); 316 QuoPhi->addIncoming(LHS.Quotient, LHS.BB); 317 QuoPhi->addIncoming(RHS.Quotient, RHS.BB); 318 PHINode *RemPhi = Builder.CreatePHI(getSlowType(), 2); 319 RemPhi->addIncoming(LHS.Remainder, LHS.BB); 320 RemPhi->addIncoming(RHS.Remainder, RHS.BB); 321 return QuotRemPair(QuoPhi, RemPhi); 322 } 323 324 /// Creates a runtime check to test whether both the divisor and dividend fit 325 /// into BypassType. The check is inserted at the end of MainBB. True return 326 /// value means that the operands fit. Either of the operands may be NULL if it 327 /// doesn't need a runtime check. 328 Value *FastDivInsertionTask::insertOperandRuntimeCheck(Value *Op1, Value *Op2) { 329 assert((Op1 || Op2) && "Nothing to check"); 330 IRBuilder<> Builder(MainBB, MainBB->end()); 331 332 Value *OrV; 333 if (Op1 && Op2) 334 OrV = Builder.CreateOr(Op1, Op2); 335 else 336 OrV = Op1 ? Op1 : Op2; 337 338 // BitMask is inverted to check if the operands are 339 // larger than the bypass type 340 uint64_t BitMask = ~BypassType->getBitMask(); 341 Value *AndV = Builder.CreateAnd(OrV, BitMask); 342 343 // Compare operand values 344 Value *ZeroV = ConstantInt::getSigned(getSlowType(), 0); 345 return Builder.CreateICmpEQ(AndV, ZeroV); 346 } 347 348 /// Substitutes the div/rem instruction with code that checks the value of the 349 /// operands and uses a shorter-faster div/rem instruction when possible. 350 Optional<QuotRemPair> FastDivInsertionTask::insertFastDivAndRem() { 351 Value *Dividend = SlowDivOrRem->getOperand(0); 352 Value *Divisor = SlowDivOrRem->getOperand(1); 353 354 VisitedSetTy SetL; 355 ValueRange DividendRange = getValueRange(Dividend, SetL); 356 if (DividendRange == VALRNG_LIKELY_LONG) 357 return None; 358 359 VisitedSetTy SetR; 360 ValueRange DivisorRange = getValueRange(Divisor, SetR); 361 if (DivisorRange == VALRNG_LIKELY_LONG) 362 return None; 363 364 bool DividendShort = (DividendRange == VALRNG_KNOWN_SHORT); 365 bool DivisorShort = (DivisorRange == VALRNG_KNOWN_SHORT); 366 367 if (DividendShort && DivisorShort) { 368 // If both operands are known to be short then just replace the long 369 // division with a short one in-place. Since we're not introducing control 370 // flow in this case, narrowing the division is always a win, even if the 371 // divisor is a constant (and will later get replaced by a multiplication). 372 373 IRBuilder<> Builder(SlowDivOrRem); 374 Value *TruncDividend = Builder.CreateTrunc(Dividend, BypassType); 375 Value *TruncDivisor = Builder.CreateTrunc(Divisor, BypassType); 376 Value *TruncDiv = Builder.CreateUDiv(TruncDividend, TruncDivisor); 377 Value *TruncRem = Builder.CreateURem(TruncDividend, TruncDivisor); 378 Value *ExtDiv = Builder.CreateZExt(TruncDiv, getSlowType()); 379 Value *ExtRem = Builder.CreateZExt(TruncRem, getSlowType()); 380 return QuotRemPair(ExtDiv, ExtRem); 381 } 382 383 if (isa<ConstantInt>(Divisor)) { 384 // If the divisor is not a constant, DAGCombiner will convert it to a 385 // multiplication by a magic constant. It isn't clear if it is worth 386 // introducing control flow to get a narrower multiply. 387 return None; 388 } 389 390 // After Constant Hoisting pass, long constants may be represented as 391 // bitcast instructions. As a result, some constants may look like an 392 // instruction at first, and an additional check is necessary to find out if 393 // an operand is actually a constant. 394 if (auto *BCI = dyn_cast<BitCastInst>(Divisor)) 395 if (BCI->getParent() == SlowDivOrRem->getParent() && 396 isa<ConstantInt>(BCI->getOperand(0))) 397 return None; 398 399 if (DividendShort && !isSignedOp()) { 400 // If the division is unsigned and Dividend is known to be short, then 401 // either 402 // 1) Divisor is less or equal to Dividend, and the result can be computed 403 // with a short division. 404 // 2) Divisor is greater than Dividend. In this case, no division is needed 405 // at all: The quotient is 0 and the remainder is equal to Dividend. 406 // 407 // So instead of checking at runtime whether Divisor fits into BypassType, 408 // we emit a runtime check to differentiate between these two cases. This 409 // lets us entirely avoid a long div. 410 411 // Split the basic block before the div/rem. 412 BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem); 413 // Remove the unconditional branch from MainBB to SuccessorBB. 414 MainBB->getInstList().back().eraseFromParent(); 415 QuotRemWithBB Long; 416 Long.BB = MainBB; 417 Long.Quotient = ConstantInt::get(getSlowType(), 0); 418 Long.Remainder = Dividend; 419 QuotRemWithBB Fast = createFastBB(SuccessorBB); 420 QuotRemPair Result = createDivRemPhiNodes(Fast, Long, SuccessorBB); 421 IRBuilder<> Builder(MainBB, MainBB->end()); 422 Value *CmpV = Builder.CreateICmpUGE(Dividend, Divisor); 423 Builder.CreateCondBr(CmpV, Fast.BB, SuccessorBB); 424 return Result; 425 } else { 426 // General case. Create both slow and fast div/rem pairs and choose one of 427 // them at runtime. 428 429 // Split the basic block before the div/rem. 430 BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem); 431 // Remove the unconditional branch from MainBB to SuccessorBB. 432 MainBB->getInstList().back().eraseFromParent(); 433 QuotRemWithBB Fast = createFastBB(SuccessorBB); 434 QuotRemWithBB Slow = createSlowBB(SuccessorBB); 435 QuotRemPair Result = createDivRemPhiNodes(Fast, Slow, SuccessorBB); 436 Value *CmpV = insertOperandRuntimeCheck(DividendShort ? nullptr : Dividend, 437 DivisorShort ? nullptr : Divisor); 438 IRBuilder<> Builder(MainBB, MainBB->end()); 439 Builder.CreateCondBr(CmpV, Fast.BB, Slow.BB); 440 return Result; 441 } 442 } 443 444 /// This optimization identifies DIV/REM instructions in a BB that can be 445 /// profitably bypassed and carried out with a shorter, faster divide. 446 bool llvm::bypassSlowDivision(BasicBlock *BB, 447 const BypassWidthsTy &BypassWidths) { 448 DivCacheTy PerBBDivCache; 449 450 bool MadeChange = false; 451 Instruction* Next = &*BB->begin(); 452 while (Next != nullptr) { 453 // We may add instructions immediately after I, but we want to skip over 454 // them. 455 Instruction* I = Next; 456 Next = Next->getNextNode(); 457 458 FastDivInsertionTask Task(I, BypassWidths); 459 if (Value *Replacement = Task.getReplacement(PerBBDivCache)) { 460 I->replaceAllUsesWith(Replacement); 461 I->eraseFromParent(); 462 MadeChange = true; 463 } 464 } 465 466 // Above we eagerly create divs and rems, as pairs, so that we can efficiently 467 // create divrem machine instructions. Now erase any unused divs / rems so we 468 // don't leave extra instructions sitting around. 469 for (auto &KV : PerBBDivCache) 470 for (Value *V : {KV.second.Quotient, KV.second.Remainder}) 471 RecursivelyDeleteTriviallyDeadInstructions(V); 472 473 return MadeChange; 474 } 475