1 //===- BypassSlowDivision.cpp - Bypass slow division ----------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file contains an optimization for div and rem on architectures that 10 // execute short instructions significantly faster than longer instructions. 11 // For example, on Intel Atom 32-bit divides are slow enough that during 12 // runtime it is profitable to check the value of the operands, and if they are 13 // positive and less than 256 use an unsigned 8-bit divide. 14 // 15 //===----------------------------------------------------------------------===// 16 17 #include "llvm/Transforms/Utils/BypassSlowDivision.h" 18 #include "llvm/ADT/DenseMap.h" 19 #include "llvm/ADT/STLExtras.h" 20 #include "llvm/ADT/SmallPtrSet.h" 21 #include "llvm/Transforms/Utils/Local.h" 22 #include "llvm/Analysis/ValueTracking.h" 23 #include "llvm/IR/BasicBlock.h" 24 #include "llvm/IR/Constants.h" 25 #include "llvm/IR/DerivedTypes.h" 26 #include "llvm/IR/Function.h" 27 #include "llvm/IR/IRBuilder.h" 28 #include "llvm/IR/Instruction.h" 29 #include "llvm/IR/Instructions.h" 30 #include "llvm/IR/Module.h" 31 #include "llvm/IR/Type.h" 32 #include "llvm/IR/Value.h" 33 #include "llvm/Support/Casting.h" 34 #include "llvm/Support/KnownBits.h" 35 #include <cassert> 36 #include <cstdint> 37 38 using namespace llvm; 39 40 #define DEBUG_TYPE "bypass-slow-division" 41 42 namespace { 43 44 struct QuotRemPair { 45 Value *Quotient; 46 Value *Remainder; 47 48 QuotRemPair(Value *InQuotient, Value *InRemainder) 49 : Quotient(InQuotient), Remainder(InRemainder) {} 50 }; 51 52 /// A quotient and remainder, plus a BB from which they logically "originate". 53 /// If you use Quotient or Remainder in a Phi node, you should use BB as its 54 /// corresponding predecessor. 55 struct QuotRemWithBB { 56 BasicBlock *BB = nullptr; 57 Value *Quotient = nullptr; 58 Value *Remainder = nullptr; 59 }; 60 61 using DivCacheTy = DenseMap<DivRemMapKey, QuotRemPair>; 62 using BypassWidthsTy = DenseMap<unsigned, unsigned>; 63 using VisitedSetTy = SmallPtrSet<Instruction *, 4>; 64 65 enum ValueRange { 66 /// Operand definitely fits into BypassType. No runtime checks are needed. 67 VALRNG_KNOWN_SHORT, 68 /// A runtime check is required, as value range is unknown. 69 VALRNG_UNKNOWN, 70 /// Operand is unlikely to fit into BypassType. The bypassing should be 71 /// disabled. 72 VALRNG_LIKELY_LONG 73 }; 74 75 class FastDivInsertionTask { 76 bool IsValidTask = false; 77 Instruction *SlowDivOrRem = nullptr; 78 IntegerType *BypassType = nullptr; 79 BasicBlock *MainBB = nullptr; 80 81 bool isHashLikeValue(Value *V, VisitedSetTy &Visited); 82 ValueRange getValueRange(Value *Op, VisitedSetTy &Visited); 83 QuotRemWithBB createSlowBB(BasicBlock *Successor); 84 QuotRemWithBB createFastBB(BasicBlock *Successor); 85 QuotRemPair createDivRemPhiNodes(QuotRemWithBB &LHS, QuotRemWithBB &RHS, 86 BasicBlock *PhiBB); 87 Value *insertOperandRuntimeCheck(Value *Op1, Value *Op2); 88 std::optional<QuotRemPair> insertFastDivAndRem(); 89 90 bool isSignedOp() { 91 return SlowDivOrRem->getOpcode() == Instruction::SDiv || 92 SlowDivOrRem->getOpcode() == Instruction::SRem; 93 } 94 95 bool isDivisionOp() { 96 return SlowDivOrRem->getOpcode() == Instruction::SDiv || 97 SlowDivOrRem->getOpcode() == Instruction::UDiv; 98 } 99 100 Type *getSlowType() { return SlowDivOrRem->getType(); } 101 102 public: 103 FastDivInsertionTask(Instruction *I, const BypassWidthsTy &BypassWidths); 104 105 Value *getReplacement(DivCacheTy &Cache); 106 }; 107 108 } // end anonymous namespace 109 110 FastDivInsertionTask::FastDivInsertionTask(Instruction *I, 111 const BypassWidthsTy &BypassWidths) { 112 switch (I->getOpcode()) { 113 case Instruction::UDiv: 114 case Instruction::SDiv: 115 case Instruction::URem: 116 case Instruction::SRem: 117 SlowDivOrRem = I; 118 break; 119 default: 120 // I is not a div/rem operation. 121 return; 122 } 123 124 // Skip division on vector types. Only optimize integer instructions. 125 IntegerType *SlowType = dyn_cast<IntegerType>(SlowDivOrRem->getType()); 126 if (!SlowType) 127 return; 128 129 // Skip if this bitwidth is not bypassed. 130 auto BI = BypassWidths.find(SlowType->getBitWidth()); 131 if (BI == BypassWidths.end()) 132 return; 133 134 // Get type for div/rem instruction with bypass bitwidth. 135 IntegerType *BT = IntegerType::get(I->getContext(), BI->second); 136 BypassType = BT; 137 138 // The original basic block. 139 MainBB = I->getParent(); 140 141 // The instruction is indeed a slow div or rem operation. 142 IsValidTask = true; 143 } 144 145 /// Reuses previously-computed dividend or remainder from the current BB if 146 /// operands and operation are identical. Otherwise calls insertFastDivAndRem to 147 /// perform the optimization and caches the resulting dividend and remainder. 148 /// If no replacement can be generated, nullptr is returned. 149 Value *FastDivInsertionTask::getReplacement(DivCacheTy &Cache) { 150 // First, make sure that the task is valid. 151 if (!IsValidTask) 152 return nullptr; 153 154 // Then, look for a value in Cache. 155 Value *Dividend = SlowDivOrRem->getOperand(0); 156 Value *Divisor = SlowDivOrRem->getOperand(1); 157 DivRemMapKey Key(isSignedOp(), Dividend, Divisor); 158 auto CacheI = Cache.find(Key); 159 160 if (CacheI == Cache.end()) { 161 // If previous instance does not exist, try to insert fast div. 162 std::optional<QuotRemPair> OptResult = insertFastDivAndRem(); 163 // Bail out if insertFastDivAndRem has failed. 164 if (!OptResult) 165 return nullptr; 166 CacheI = Cache.insert({Key, *OptResult}).first; 167 } 168 169 QuotRemPair &Value = CacheI->second; 170 return isDivisionOp() ? Value.Quotient : Value.Remainder; 171 } 172 173 /// Check if a value looks like a hash. 174 /// 175 /// The routine is expected to detect values computed using the most common hash 176 /// algorithms. Typically, hash computations end with one of the following 177 /// instructions: 178 /// 179 /// 1) MUL with a constant wider than BypassType 180 /// 2) XOR instruction 181 /// 182 /// And even if we are wrong and the value is not a hash, it is still quite 183 /// unlikely that such values will fit into BypassType. 184 /// 185 /// To detect string hash algorithms like FNV we have to look through PHI-nodes. 186 /// It is implemented as a depth-first search for values that look neither long 187 /// nor hash-like. 188 bool FastDivInsertionTask::isHashLikeValue(Value *V, VisitedSetTy &Visited) { 189 Instruction *I = dyn_cast<Instruction>(V); 190 if (!I) 191 return false; 192 193 switch (I->getOpcode()) { 194 case Instruction::Xor: 195 return true; 196 case Instruction::Mul: { 197 // After Constant Hoisting pass, long constants may be represented as 198 // bitcast instructions. As a result, some constants may look like an 199 // instruction at first, and an additional check is necessary to find out if 200 // an operand is actually a constant. 201 Value *Op1 = I->getOperand(1); 202 ConstantInt *C = dyn_cast<ConstantInt>(Op1); 203 if (!C && isa<BitCastInst>(Op1)) 204 C = dyn_cast<ConstantInt>(cast<BitCastInst>(Op1)->getOperand(0)); 205 return C && C->getValue().getSignificantBits() > BypassType->getBitWidth(); 206 } 207 case Instruction::PHI: 208 // Stop IR traversal in case of a crazy input code. This limits recursion 209 // depth. 210 if (Visited.size() >= 16) 211 return false; 212 // Do not visit nodes that have been visited already. We return true because 213 // it means that we couldn't find any value that doesn't look hash-like. 214 if (!Visited.insert(I).second) 215 return true; 216 return llvm::all_of(cast<PHINode>(I)->incoming_values(), [&](Value *V) { 217 // Ignore undef values as they probably don't affect the division 218 // operands. 219 return getValueRange(V, Visited) == VALRNG_LIKELY_LONG || 220 isa<UndefValue>(V); 221 }); 222 default: 223 return false; 224 } 225 } 226 227 /// Check if an integer value fits into our bypass type. 228 ValueRange FastDivInsertionTask::getValueRange(Value *V, 229 VisitedSetTy &Visited) { 230 unsigned ShortLen = BypassType->getBitWidth(); 231 unsigned LongLen = V->getType()->getIntegerBitWidth(); 232 233 assert(LongLen > ShortLen && "Value type must be wider than BypassType"); 234 unsigned HiBits = LongLen - ShortLen; 235 236 const DataLayout &DL = SlowDivOrRem->getModule()->getDataLayout(); 237 KnownBits Known(LongLen); 238 239 computeKnownBits(V, Known, DL); 240 241 if (Known.countMinLeadingZeros() >= HiBits) 242 return VALRNG_KNOWN_SHORT; 243 244 if (Known.countMaxLeadingZeros() < HiBits) 245 return VALRNG_LIKELY_LONG; 246 247 // Long integer divisions are often used in hashtable implementations. It's 248 // not worth bypassing such divisions because hash values are extremely 249 // unlikely to have enough leading zeros. The call below tries to detect 250 // values that are unlikely to fit BypassType (including hashes). 251 if (isHashLikeValue(V, Visited)) 252 return VALRNG_LIKELY_LONG; 253 254 return VALRNG_UNKNOWN; 255 } 256 257 /// Add new basic block for slow div and rem operations and put it before 258 /// SuccessorBB. 259 QuotRemWithBB FastDivInsertionTask::createSlowBB(BasicBlock *SuccessorBB) { 260 QuotRemWithBB DivRemPair; 261 DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "", 262 MainBB->getParent(), SuccessorBB); 263 IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin()); 264 Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc()); 265 266 Value *Dividend = SlowDivOrRem->getOperand(0); 267 Value *Divisor = SlowDivOrRem->getOperand(1); 268 269 if (isSignedOp()) { 270 DivRemPair.Quotient = Builder.CreateSDiv(Dividend, Divisor); 271 DivRemPair.Remainder = Builder.CreateSRem(Dividend, Divisor); 272 } else { 273 DivRemPair.Quotient = Builder.CreateUDiv(Dividend, Divisor); 274 DivRemPair.Remainder = Builder.CreateURem(Dividend, Divisor); 275 } 276 277 Builder.CreateBr(SuccessorBB); 278 return DivRemPair; 279 } 280 281 /// Add new basic block for fast div and rem operations and put it before 282 /// SuccessorBB. 283 QuotRemWithBB FastDivInsertionTask::createFastBB(BasicBlock *SuccessorBB) { 284 QuotRemWithBB DivRemPair; 285 DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "", 286 MainBB->getParent(), SuccessorBB); 287 IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin()); 288 Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc()); 289 290 Value *Dividend = SlowDivOrRem->getOperand(0); 291 Value *Divisor = SlowDivOrRem->getOperand(1); 292 Value *ShortDivisorV = 293 Builder.CreateCast(Instruction::Trunc, Divisor, BypassType); 294 Value *ShortDividendV = 295 Builder.CreateCast(Instruction::Trunc, Dividend, BypassType); 296 297 // udiv/urem because this optimization only handles positive numbers. 298 Value *ShortQV = Builder.CreateUDiv(ShortDividendV, ShortDivisorV); 299 Value *ShortRV = Builder.CreateURem(ShortDividendV, ShortDivisorV); 300 DivRemPair.Quotient = 301 Builder.CreateCast(Instruction::ZExt, ShortQV, getSlowType()); 302 DivRemPair.Remainder = 303 Builder.CreateCast(Instruction::ZExt, ShortRV, getSlowType()); 304 Builder.CreateBr(SuccessorBB); 305 306 return DivRemPair; 307 } 308 309 /// Creates Phi nodes for result of Div and Rem. 310 QuotRemPair FastDivInsertionTask::createDivRemPhiNodes(QuotRemWithBB &LHS, 311 QuotRemWithBB &RHS, 312 BasicBlock *PhiBB) { 313 IRBuilder<> Builder(PhiBB, PhiBB->begin()); 314 Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc()); 315 PHINode *QuoPhi = Builder.CreatePHI(getSlowType(), 2); 316 QuoPhi->addIncoming(LHS.Quotient, LHS.BB); 317 QuoPhi->addIncoming(RHS.Quotient, RHS.BB); 318 PHINode *RemPhi = Builder.CreatePHI(getSlowType(), 2); 319 RemPhi->addIncoming(LHS.Remainder, LHS.BB); 320 RemPhi->addIncoming(RHS.Remainder, RHS.BB); 321 return QuotRemPair(QuoPhi, RemPhi); 322 } 323 324 /// Creates a runtime check to test whether both the divisor and dividend fit 325 /// into BypassType. The check is inserted at the end of MainBB. True return 326 /// value means that the operands fit. Either of the operands may be NULL if it 327 /// doesn't need a runtime check. 328 Value *FastDivInsertionTask::insertOperandRuntimeCheck(Value *Op1, Value *Op2) { 329 assert((Op1 || Op2) && "Nothing to check"); 330 IRBuilder<> Builder(MainBB, MainBB->end()); 331 Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc()); 332 333 Value *OrV; 334 if (Op1 && Op2) 335 OrV = Builder.CreateOr(Op1, Op2); 336 else 337 OrV = Op1 ? Op1 : Op2; 338 339 // BitMask is inverted to check if the operands are 340 // larger than the bypass type 341 uint64_t BitMask = ~BypassType->getBitMask(); 342 Value *AndV = Builder.CreateAnd(OrV, BitMask); 343 344 // Compare operand values 345 Value *ZeroV = ConstantInt::getSigned(getSlowType(), 0); 346 return Builder.CreateICmpEQ(AndV, ZeroV); 347 } 348 349 /// Substitutes the div/rem instruction with code that checks the value of the 350 /// operands and uses a shorter-faster div/rem instruction when possible. 351 std::optional<QuotRemPair> FastDivInsertionTask::insertFastDivAndRem() { 352 Value *Dividend = SlowDivOrRem->getOperand(0); 353 Value *Divisor = SlowDivOrRem->getOperand(1); 354 355 VisitedSetTy SetL; 356 ValueRange DividendRange = getValueRange(Dividend, SetL); 357 if (DividendRange == VALRNG_LIKELY_LONG) 358 return std::nullopt; 359 360 VisitedSetTy SetR; 361 ValueRange DivisorRange = getValueRange(Divisor, SetR); 362 if (DivisorRange == VALRNG_LIKELY_LONG) 363 return std::nullopt; 364 365 bool DividendShort = (DividendRange == VALRNG_KNOWN_SHORT); 366 bool DivisorShort = (DivisorRange == VALRNG_KNOWN_SHORT); 367 368 if (DividendShort && DivisorShort) { 369 // If both operands are known to be short then just replace the long 370 // division with a short one in-place. Since we're not introducing control 371 // flow in this case, narrowing the division is always a win, even if the 372 // divisor is a constant (and will later get replaced by a multiplication). 373 374 IRBuilder<> Builder(SlowDivOrRem); 375 Value *TruncDividend = Builder.CreateTrunc(Dividend, BypassType); 376 Value *TruncDivisor = Builder.CreateTrunc(Divisor, BypassType); 377 Value *TruncDiv = Builder.CreateUDiv(TruncDividend, TruncDivisor); 378 Value *TruncRem = Builder.CreateURem(TruncDividend, TruncDivisor); 379 Value *ExtDiv = Builder.CreateZExt(TruncDiv, getSlowType()); 380 Value *ExtRem = Builder.CreateZExt(TruncRem, getSlowType()); 381 return QuotRemPair(ExtDiv, ExtRem); 382 } 383 384 if (isa<ConstantInt>(Divisor)) { 385 // If the divisor is not a constant, DAGCombiner will convert it to a 386 // multiplication by a magic constant. It isn't clear if it is worth 387 // introducing control flow to get a narrower multiply. 388 return std::nullopt; 389 } 390 391 // After Constant Hoisting pass, long constants may be represented as 392 // bitcast instructions. As a result, some constants may look like an 393 // instruction at first, and an additional check is necessary to find out if 394 // an operand is actually a constant. 395 if (auto *BCI = dyn_cast<BitCastInst>(Divisor)) 396 if (BCI->getParent() == SlowDivOrRem->getParent() && 397 isa<ConstantInt>(BCI->getOperand(0))) 398 return std::nullopt; 399 400 IRBuilder<> Builder(MainBB, MainBB->end()); 401 Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc()); 402 403 if (DividendShort && !isSignedOp()) { 404 // If the division is unsigned and Dividend is known to be short, then 405 // either 406 // 1) Divisor is less or equal to Dividend, and the result can be computed 407 // with a short division. 408 // 2) Divisor is greater than Dividend. In this case, no division is needed 409 // at all: The quotient is 0 and the remainder is equal to Dividend. 410 // 411 // So instead of checking at runtime whether Divisor fits into BypassType, 412 // we emit a runtime check to differentiate between these two cases. This 413 // lets us entirely avoid a long div. 414 415 // Split the basic block before the div/rem. 416 BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem); 417 // Remove the unconditional branch from MainBB to SuccessorBB. 418 MainBB->back().eraseFromParent(); 419 QuotRemWithBB Long; 420 Long.BB = MainBB; 421 Long.Quotient = ConstantInt::get(getSlowType(), 0); 422 Long.Remainder = Dividend; 423 QuotRemWithBB Fast = createFastBB(SuccessorBB); 424 QuotRemPair Result = createDivRemPhiNodes(Fast, Long, SuccessorBB); 425 Value *CmpV = Builder.CreateICmpUGE(Dividend, Divisor); 426 Builder.CreateCondBr(CmpV, Fast.BB, SuccessorBB); 427 return Result; 428 } else { 429 // General case. Create both slow and fast div/rem pairs and choose one of 430 // them at runtime. 431 432 // Split the basic block before the div/rem. 433 BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem); 434 // Remove the unconditional branch from MainBB to SuccessorBB. 435 MainBB->back().eraseFromParent(); 436 QuotRemWithBB Fast = createFastBB(SuccessorBB); 437 QuotRemWithBB Slow = createSlowBB(SuccessorBB); 438 QuotRemPair Result = createDivRemPhiNodes(Fast, Slow, SuccessorBB); 439 Value *CmpV = insertOperandRuntimeCheck(DividendShort ? nullptr : Dividend, 440 DivisorShort ? nullptr : Divisor); 441 Builder.CreateCondBr(CmpV, Fast.BB, Slow.BB); 442 return Result; 443 } 444 } 445 446 /// This optimization identifies DIV/REM instructions in a BB that can be 447 /// profitably bypassed and carried out with a shorter, faster divide. 448 bool llvm::bypassSlowDivision(BasicBlock *BB, 449 const BypassWidthsTy &BypassWidths) { 450 DivCacheTy PerBBDivCache; 451 452 bool MadeChange = false; 453 Instruction *Next = &*BB->begin(); 454 while (Next != nullptr) { 455 // We may add instructions immediately after I, but we want to skip over 456 // them. 457 Instruction *I = Next; 458 Next = Next->getNextNode(); 459 460 // Ignore dead code to save time and avoid bugs. 461 if (I->hasNUses(0)) 462 continue; 463 464 FastDivInsertionTask Task(I, BypassWidths); 465 if (Value *Replacement = Task.getReplacement(PerBBDivCache)) { 466 I->replaceAllUsesWith(Replacement); 467 I->eraseFromParent(); 468 MadeChange = true; 469 } 470 } 471 472 // Above we eagerly create divs and rems, as pairs, so that we can efficiently 473 // create divrem machine instructions. Now erase any unused divs / rems so we 474 // don't leave extra instructions sitting around. 475 for (auto &KV : PerBBDivCache) 476 for (Value *V : {KV.second.Quotient, KV.second.Remainder}) 477 RecursivelyDeleteTriviallyDeadInstructions(V); 478 479 return MadeChange; 480 } 481