//===- BypassSlowDivision.cpp - Bypass slow division ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains an optimization for div and rem on architectures that
// execute short instructions significantly faster than longer instructions.
// For example, on Intel Atom, 32-bit divides are slow enough that during
// runtime it is profitable to check the value of the operands, and if they are
// positive and less than 256 use an unsigned 8-bit divide.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Utils/BypassSlowDivision.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/KnownBits.h"
#include <cassert>
#include <cstdint>

using namespace llvm;

#define DEBUG_TYPE "bypass-slow-division"

namespace {

struct QuotRemPair {
  Value *Quotient;
  Value *Remainder;

  QuotRemPair(Value *InQuotient, Value *InRemainder)
      : Quotient(InQuotient), Remainder(InRemainder) {}
};

/// A quotient and remainder, plus a BB from which they logically "originate".
/// If you use Quotient or Remainder in a Phi node, you should use BB as its
/// corresponding predecessor.
struct QuotRemWithBB {
  BasicBlock *BB = nullptr;
  Value *Quotient = nullptr;
  Value *Remainder = nullptr;
};

using DivCacheTy = DenseMap<DivRemMapKey, QuotRemPair>;
using BypassWidthsTy = DenseMap<unsigned, unsigned>;
using VisitedSetTy = SmallPtrSet<Instruction *, 4>;

enum ValueRange {
  /// Operand definitely fits into BypassType. No runtime checks are needed.
  VALRNG_KNOWN_SHORT,
  /// A runtime check is required, as value range is unknown.
  VALRNG_UNKNOWN,
  /// Operand is unlikely to fit into BypassType. The bypassing should be
  /// disabled.
  VALRNG_LIKELY_LONG
};
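
// Illustrative sketch (C-like pseudocode, not the emitted IR) of the rewrite
// for a 32-bit unsigned division bypassed to 8 bits; the actual IR is built
// by insertOperandRuntimeCheck and createFastBB/createSlowBB below:
//
//   if (((a | b) & 0xffffff00) == 0)   // both operands fit into 8 bits
//     q = (uint8_t)a / (uint8_t)b;     // fast path: narrow udiv
//   else
//     q = a / b;                       // slow path: original 32-bit udiv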

class FastDivInsertionTask {
  bool IsValidTask = false;
  Instruction *SlowDivOrRem = nullptr;
  IntegerType *BypassType = nullptr;
  BasicBlock *MainBB = nullptr;

  bool isHashLikeValue(Value *V, VisitedSetTy &Visited);
  ValueRange getValueRange(Value *Op, VisitedSetTy &Visited);
  QuotRemWithBB createSlowBB(BasicBlock *Successor);
  QuotRemWithBB createFastBB(BasicBlock *Successor);
  QuotRemPair createDivRemPhiNodes(QuotRemWithBB &LHS, QuotRemWithBB &RHS,
                                   BasicBlock *PhiBB);
  Value *insertOperandRuntimeCheck(Value *Op1, Value *Op2);
  Optional<QuotRemPair> insertFastDivAndRem();

  bool isSignedOp() {
    return SlowDivOrRem->getOpcode() == Instruction::SDiv ||
           SlowDivOrRem->getOpcode() == Instruction::SRem;
  }

  bool isDivisionOp() {
    return SlowDivOrRem->getOpcode() == Instruction::SDiv ||
           SlowDivOrRem->getOpcode() == Instruction::UDiv;
  }

  Type *getSlowType() { return SlowDivOrRem->getType(); }

public:
  FastDivInsertionTask(Instruction *I, const BypassWidthsTy &BypassWidths);

  Value *getReplacement(DivCacheTy &Cache);
};

} // end anonymous namespace

FastDivInsertionTask::FastDivInsertionTask(Instruction *I,
                                           const BypassWidthsTy &BypassWidths) {
  switch (I->getOpcode()) {
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
    SlowDivOrRem = I;
    break;
  default:
    // I is not a div/rem operation.
    return;
  }

  // Skip division on vector types. Only optimize integer instructions.
  IntegerType *SlowType = dyn_cast<IntegerType>(SlowDivOrRem->getType());
  if (!SlowType)
    return;

  // Skip if this bitwidth is not bypassed.
  auto BI = BypassWidths.find(SlowType->getBitWidth());
  if (BI == BypassWidths.end())
    return;

  // Get type for div/rem instruction with bypass bitwidth.
  IntegerType *BT = IntegerType::get(I->getContext(), BI->second);
  BypassType = BT;

  // The original basic block.
  MainBB = I->getParent();

  // The instruction is indeed a slow div or rem operation.
  IsValidTask = true;
}

/// Reuses a previously-computed quotient or remainder from the current BB if
/// the operands and operation are identical. Otherwise calls
/// insertFastDivAndRem to perform the optimization and caches the resulting
/// quotient and remainder. If no replacement can be generated, nullptr is
/// returned.
Value *FastDivInsertionTask::getReplacement(DivCacheTy &Cache) {
  // First, make sure that the task is valid.
  if (!IsValidTask)
    return nullptr;

  // Then, look for a value in Cache.
  Value *Dividend = SlowDivOrRem->getOperand(0);
  Value *Divisor = SlowDivOrRem->getOperand(1);
  DivRemMapKey Key(isSignedOp(), Dividend, Divisor);
  auto CacheI = Cache.find(Key);

  if (CacheI == Cache.end()) {
    // If a previous instance does not exist, try to insert a fast div.
    Optional<QuotRemPair> OptResult = insertFastDivAndRem();
    // Bail out if insertFastDivAndRem has failed.
    if (!OptResult)
      return nullptr;
    CacheI = Cache.insert({Key, *OptResult}).first;
  }

  QuotRemPair &Value = CacheI->second;
  return isDivisionOp() ? Value.Quotient : Value.Remainder;
}
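
// Note that DivRemMapKey only distinguishes signedness and operands, not div
// vs. rem, so e.g. "a / b" and "a % b" in the same block share one expansion;
// the resulting quotient/remainder pair can later be folded into a single
// divrem machine instruction (see the cleanup note in bypassSlowDivision
// below).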

/// Check if a value looks like a hash.
///
/// The routine is expected to detect values computed using the most common
/// hash algorithms. Typically, hash computations end with one of the following
/// instructions:
///
/// 1) MUL with a constant wider than BypassType
/// 2) XOR instruction
///
/// And even if we are wrong and the value is not a hash, it is still quite
/// unlikely that such values will fit into BypassType.
///
/// To detect string hash algorithms like FNV we have to look through
/// PHI nodes. It is implemented as a depth-first search for values that look
/// neither long nor hash-like.
bool FastDivInsertionTask::isHashLikeValue(Value *V, VisitedSetTy &Visited) {
  Instruction *I = dyn_cast<Instruction>(V);
  if (!I)
    return false;

  switch (I->getOpcode()) {
  case Instruction::Xor:
    return true;
  case Instruction::Mul: {
    // After the Constant Hoisting pass, long constants may be represented as
    // bitcast instructions. As a result, some constants may look like an
    // instruction at first, and an additional check is necessary to find out
    // if an operand is actually a constant.
    Value *Op1 = I->getOperand(1);
    ConstantInt *C = dyn_cast<ConstantInt>(Op1);
    if (!C && isa<BitCastInst>(Op1))
      C = dyn_cast<ConstantInt>(cast<BitCastInst>(Op1)->getOperand(0));
    return C && C->getValue().getMinSignedBits() > BypassType->getBitWidth();
  }
  case Instruction::PHI:
    // Stop IR traversal in case of crazy input code. This limits recursion
    // depth.
    if (Visited.size() >= 16)
      return false;
    // Do not visit nodes that have been visited already. We return true
    // because it means that we couldn't find any value that doesn't look
    // hash-like.
    if (!Visited.insert(I).second)
      return true;
    return llvm::all_of(cast<PHINode>(I)->incoming_values(), [&](Value *V) {
      // Ignore undef values as they probably don't affect the division
      // operands.
      return getValueRange(V, Visited) == VALRNG_LIKELY_LONG ||
             isa<UndefValue>(V);
    });
  default:
    return false;
  }
}

/// Check if an integer value fits into our bypass type.
ValueRange FastDivInsertionTask::getValueRange(Value *V,
                                               VisitedSetTy &Visited) {
  unsigned ShortLen = BypassType->getBitWidth();
  unsigned LongLen = V->getType()->getIntegerBitWidth();

  assert(LongLen > ShortLen && "Value type must be wider than BypassType");
  unsigned HiBits = LongLen - ShortLen;

  const DataLayout &DL = SlowDivOrRem->getModule()->getDataLayout();
  KnownBits Known(LongLen);

  computeKnownBits(V, Known, DL);

  if (Known.countMinLeadingZeros() >= HiBits)
    return VALRNG_KNOWN_SHORT;

  if (Known.countMaxLeadingZeros() < HiBits)
    return VALRNG_LIKELY_LONG;

  // Long integer divisions are often used in hashtable implementations. It's
  // not worth bypassing such divisions because hash values are extremely
  // unlikely to have enough leading zeros. The call below tries to detect
  // values that are unlikely to fit BypassType (including hashes).
  if (isHashLikeValue(V, Visited))
    return VALRNG_LIKELY_LONG;

  return VALRNG_UNKNOWN;
}
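
// Example for getValueRange above, assuming a 32-bit division bypassed to
// 8 bits: a value produced by "zext i8 %x to i32" has at least 24 known
// leading zero bits and is VALRNG_KNOWN_SHORT, a value with any of its top
// 24 bits known to be one is VALRNG_LIKELY_LONG, and anything else needs the
// runtime check (VALRNG_UNKNOWN).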

/// Add new basic block for slow div and rem operations and put it before
/// SuccessorBB.
QuotRemWithBB FastDivInsertionTask::createSlowBB(BasicBlock *SuccessorBB) {
  QuotRemWithBB DivRemPair;
  DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "",
                                     MainBB->getParent(), SuccessorBB);
  IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin());
  Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());

  Value *Dividend = SlowDivOrRem->getOperand(0);
  Value *Divisor = SlowDivOrRem->getOperand(1);

  if (isSignedOp()) {
    DivRemPair.Quotient = Builder.CreateSDiv(Dividend, Divisor);
    DivRemPair.Remainder = Builder.CreateSRem(Dividend, Divisor);
  } else {
    DivRemPair.Quotient = Builder.CreateUDiv(Dividend, Divisor);
    DivRemPair.Remainder = Builder.CreateURem(Dividend, Divisor);
  }

  Builder.CreateBr(SuccessorBB);
  return DivRemPair;
}

/// Add new basic block for fast div and rem operations and put it before
/// SuccessorBB.
QuotRemWithBB FastDivInsertionTask::createFastBB(BasicBlock *SuccessorBB) {
  QuotRemWithBB DivRemPair;
  DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "",
                                     MainBB->getParent(), SuccessorBB);
  IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin());
  Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());

  Value *Dividend = SlowDivOrRem->getOperand(0);
  Value *Divisor = SlowDivOrRem->getOperand(1);
  Value *ShortDivisorV =
      Builder.CreateCast(Instruction::Trunc, Divisor, BypassType);
  Value *ShortDividendV =
      Builder.CreateCast(Instruction::Trunc, Dividend, BypassType);

  // udiv/urem because this optimization only handles positive numbers.
  Value *ShortQV = Builder.CreateUDiv(ShortDividendV, ShortDivisorV);
  Value *ShortRV = Builder.CreateURem(ShortDividendV, ShortDivisorV);
  DivRemPair.Quotient =
      Builder.CreateCast(Instruction::ZExt, ShortQV, getSlowType());
  DivRemPair.Remainder =
      Builder.CreateCast(Instruction::ZExt, ShortRV, getSlowType());
  Builder.CreateBr(SuccessorBB);

  return DivRemPair;
}

/// Creates PHI nodes for the results of the div and rem.
QuotRemPair FastDivInsertionTask::createDivRemPhiNodes(QuotRemWithBB &LHS,
                                                       QuotRemWithBB &RHS,
                                                       BasicBlock *PhiBB) {
  IRBuilder<> Builder(PhiBB, PhiBB->begin());
  Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
  PHINode *QuoPhi = Builder.CreatePHI(getSlowType(), 2);
  QuoPhi->addIncoming(LHS.Quotient, LHS.BB);
  QuoPhi->addIncoming(RHS.Quotient, RHS.BB);
  PHINode *RemPhi = Builder.CreatePHI(getSlowType(), 2);
  RemPhi->addIncoming(LHS.Remainder, LHS.BB);
  RemPhi->addIncoming(RHS.Remainder, RHS.BB);
  return QuotRemPair(QuoPhi, RemPhi);
}
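
// Together, createFastBB, createSlowBB, and createDivRemPhiNodes build the
// diamond used by insertFastDivAndRem below: MainBB ends with a conditional
// branch to either the fast or the slow block, both of which branch to
// SuccessorBB, where PHI nodes select the quotient and remainder. In the
// unsigned known-short-dividend case the second incoming arm is MainBB
// itself, contributing a constant-zero quotient and the dividend as
// remainder.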

/// Creates a runtime check to test whether both the divisor and dividend fit
/// into BypassType. The check is inserted at the end of MainBB. True return
/// value means that the operands fit. Either of the operands may be NULL if it
/// doesn't need a runtime check.
Value *FastDivInsertionTask::insertOperandRuntimeCheck(Value *Op1, Value *Op2) {
  assert((Op1 || Op2) && "Nothing to check");
  IRBuilder<> Builder(MainBB, MainBB->end());
  Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());

  Value *OrV;
  if (Op1 && Op2)
    OrV = Builder.CreateOr(Op1, Op2);
  else
    OrV = Op1 ? Op1 : Op2;

  // BitMask is inverted to check if the operands are
  // larger than the bypass type.
  uint64_t BitMask = ~BypassType->getBitMask();
  Value *AndV = Builder.CreateAnd(OrV, BitMask);

  // Compare operand values.
  Value *ZeroV = ConstantInt::getSigned(getSlowType(), 0);
  return Builder.CreateICmpEQ(AndV, ZeroV);
}

/// Substitutes the div/rem instruction with code that checks the value of the
/// operands and uses a shorter, faster div/rem instruction when possible.
Optional<QuotRemPair> FastDivInsertionTask::insertFastDivAndRem() {
  Value *Dividend = SlowDivOrRem->getOperand(0);
  Value *Divisor = SlowDivOrRem->getOperand(1);

  VisitedSetTy SetL;
  ValueRange DividendRange = getValueRange(Dividend, SetL);
  if (DividendRange == VALRNG_LIKELY_LONG)
    return None;

  VisitedSetTy SetR;
  ValueRange DivisorRange = getValueRange(Divisor, SetR);
  if (DivisorRange == VALRNG_LIKELY_LONG)
    return None;

  bool DividendShort = (DividendRange == VALRNG_KNOWN_SHORT);
  bool DivisorShort = (DivisorRange == VALRNG_KNOWN_SHORT);

  if (DividendShort && DivisorShort) {
    // If both operands are known to be short then just replace the long
    // division with a short one in-place. Since we're not introducing control
    // flow in this case, narrowing the division is always a win, even if the
    // divisor is a constant (and will later get replaced by a multiplication).

    IRBuilder<> Builder(SlowDivOrRem);
    Value *TruncDividend = Builder.CreateTrunc(Dividend, BypassType);
    Value *TruncDivisor = Builder.CreateTrunc(Divisor, BypassType);
    Value *TruncDiv = Builder.CreateUDiv(TruncDividend, TruncDivisor);
    Value *TruncRem = Builder.CreateURem(TruncDividend, TruncDivisor);
    Value *ExtDiv = Builder.CreateZExt(TruncDiv, getSlowType());
    Value *ExtRem = Builder.CreateZExt(TruncRem, getSlowType());
    return QuotRemPair(ExtDiv, ExtRem);
  }

  if (isa<ConstantInt>(Divisor)) {
    // The divisor is a constant, so DAGCombiner will convert the division
    // into a multiplication by a magic constant. It isn't clear if it is
    // worth introducing control flow to get a narrower multiply.
    return None;
  }

  // After the Constant Hoisting pass, long constants may be represented as
  // bitcast instructions. As a result, some constants may look like an
  // instruction at first, and an additional check is necessary to find out if
  // an operand is actually a constant.
  if (auto *BCI = dyn_cast<BitCastInst>(Divisor))
    if (BCI->getParent() == SlowDivOrRem->getParent() &&
        isa<ConstantInt>(BCI->getOperand(0)))
      return None;

  IRBuilder<> Builder(MainBB, MainBB->end());
  Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());

  if (DividendShort && !isSignedOp()) {
    // If the division is unsigned and Dividend is known to be short, then
    // either
    // 1) Divisor is less than or equal to Dividend, and the result can be
    //    computed with a short division.
    // 2) Divisor is greater than Dividend. In this case, no division is needed
    //    at all: the quotient is 0 and the remainder is equal to Dividend.
    //
    // So instead of checking at runtime whether Divisor fits into BypassType,
    // we emit a runtime check to differentiate between these two cases. This
    // lets us entirely avoid a long div.

    // Split the basic block before the div/rem.
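    // splitBasicBlock moves SlowDivOrRem and everything after it into
    // SuccessorBB and terminates MainBB with an unconditional branch; that
    // branch is erased just below so a conditional branch can be created at
    // the end of MainBB instead.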
    BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem);
    // Remove the unconditional branch from MainBB to SuccessorBB.
    MainBB->getInstList().back().eraseFromParent();
    QuotRemWithBB Long;
    Long.BB = MainBB;
    Long.Quotient = ConstantInt::get(getSlowType(), 0);
    Long.Remainder = Dividend;
    QuotRemWithBB Fast = createFastBB(SuccessorBB);
    QuotRemPair Result = createDivRemPhiNodes(Fast, Long, SuccessorBB);
    Value *CmpV = Builder.CreateICmpUGE(Dividend, Divisor);
    Builder.CreateCondBr(CmpV, Fast.BB, SuccessorBB);
    return Result;
  } else {
    // General case. Create both slow and fast div/rem pairs and choose one of
    // them at runtime.

    // Split the basic block before the div/rem.
    BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem);
    // Remove the unconditional branch from MainBB to SuccessorBB.
    MainBB->getInstList().back().eraseFromParent();
    QuotRemWithBB Fast = createFastBB(SuccessorBB);
    QuotRemWithBB Slow = createSlowBB(SuccessorBB);
    QuotRemPair Result = createDivRemPhiNodes(Fast, Slow, SuccessorBB);
    Value *CmpV = insertOperandRuntimeCheck(DividendShort ? nullptr : Dividend,
                                            DivisorShort ? nullptr : Divisor);
    Builder.CreateCondBr(CmpV, Fast.BB, Slow.BB);
    return Result;
  }
}

/// This optimization identifies DIV/REM instructions in a BB that can be
/// profitably bypassed and carried out with a shorter, faster divide.
bool llvm::bypassSlowDivision(BasicBlock *BB,
                              const BypassWidthsTy &BypassWidths) {
  DivCacheTy PerBBDivCache;

  bool MadeChange = false;
  Instruction *Next = &*BB->begin();
  while (Next != nullptr) {
    // We may add instructions immediately after I, but we want to skip over
    // them.
    Instruction *I = Next;
    Next = Next->getNextNode();

    // Ignore dead code to save time and avoid bugs.
    if (I->hasNUses(0))
      continue;

    FastDivInsertionTask Task(I, BypassWidths);
    if (Value *Replacement = Task.getReplacement(PerBBDivCache)) {
      I->replaceAllUsesWith(Replacement);
      I->eraseFromParent();
      MadeChange = true;
    }
  }

  // Above we eagerly create divs and rems, as pairs, so that we can efficiently
  // create divrem machine instructions. Now erase any unused divs / rems so we
  // don't leave extra instructions sitting around.
  for (auto &KV : PerBBDivCache)
    for (Value *V : {KV.second.Quotient, KV.second.Remainder})
      RecursivelyDeleteTriviallyDeadInstructions(V);

  return MadeChange;
}
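
// Typical usage (sketch, outside this file): a target registers bypass widths
// via TargetLoweringBase::addBypassSlowDiv (e.g. X86 registers a 32-bit to
// 8-bit bypass on CPUs with slow 32-bit divides), and CodeGenPrepare then
// calls bypassSlowDivision(BB, TLI->getBypassSlowDivWidths()) on each block.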