//===- AtomicExpandPass.cpp - Expand atomic instructions ------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass (at IR level) to replace atomic instructions with
// __atomic_* library calls, or target specific instructions which implement
// the same semantics in a way which better fits the target backend. This can
// include the use of (intrinsic-based) load-linked/store-conditional loops,
// AtomicCmpXchg, or type coercions.
//
//===----------------------------------------------------------------------===//

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/AtomicExpandUtils.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include <cassert>
#include <cstdint>
#include <iterator>

using namespace llvm;

#define DEBUG_TYPE "atomic-expand"

namespace {

class AtomicExpand : public FunctionPass {
  const TargetLowering *TLI = nullptr;

public:
  static char ID; // Pass identification, replacement for typeid

  AtomicExpand() : FunctionPass(ID) {
    initializeAtomicExpandPass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override;

private:
  bool bracketInstWithFences(Instruction *I, AtomicOrdering Order);
  IntegerType *getCorrespondingIntegerType(Type *T, const DataLayout &DL);
  LoadInst *convertAtomicLoadToIntegerType(LoadInst *LI);
  bool tryExpandAtomicLoad(LoadInst *LI);
  bool expandAtomicLoadToLL(LoadInst *LI);
  bool expandAtomicLoadToCmpXchg(LoadInst *LI);
  StoreInst *convertAtomicStoreToIntegerType(StoreInst *SI);
  bool expandAtomicStore(StoreInst *SI);
  bool tryExpandAtomicRMW(AtomicRMWInst *AI);
  Value *
  insertRMWLLSCLoop(IRBuilder<> &Builder, Type *ResultTy, Value *Addr,
                    AtomicOrdering MemOpOrder,
                    function_ref<Value *(IRBuilder<> &, Value *)> PerformOp);
  void expandAtomicOpToLLSC(
      Instruction *I, Type *ResultTy, Value *Addr, AtomicOrdering MemOpOrder,
      function_ref<Value *(IRBuilder<> &, Value *)> PerformOp);
  void expandPartwordAtomicRMW(
      AtomicRMWInst *I, TargetLoweringBase::AtomicExpansionKind ExpansionKind);
  AtomicRMWInst *widenPartwordAtomicRMW(AtomicRMWInst *AI);
  bool expandPartwordCmpXchg(AtomicCmpXchgInst *I);
  void expandAtomicRMWToMaskedIntrinsic(AtomicRMWInst *AI);
  void expandAtomicCmpXchgToMaskedIntrinsic(AtomicCmpXchgInst *CI);

  AtomicCmpXchgInst *convertCmpXchgToIntegerType(AtomicCmpXchgInst *CI);
  static Value *insertRMWCmpXchgLoop(
      IRBuilder<> &Builder, Type *ResultType, Value *Addr,
      AtomicOrdering MemOpOrder,
      function_ref<Value *(IRBuilder<> &, Value *)> PerformOp,
      CreateCmpXchgInstFun CreateCmpXchg);
  bool tryExpandAtomicCmpXchg(AtomicCmpXchgInst *CI);

  bool expandAtomicCmpXchg(AtomicCmpXchgInst *CI);
  bool isIdempotentRMW(AtomicRMWInst *RMWI);
  bool simplifyIdempotentRMW(AtomicRMWInst *RMWI);

  bool expandAtomicOpToLibcall(Instruction *I, unsigned Size, Align Alignment,
                               Value *PointerOperand, Value *ValueOperand,
                               Value *CASExpected, AtomicOrdering Ordering,
                               AtomicOrdering Ordering2,
                               ArrayRef<RTLIB::Libcall> Libcalls);
  void expandAtomicLoadToLibcall(LoadInst *LI);
  void expandAtomicStoreToLibcall(StoreInst *LI);
  void expandAtomicRMWToLibcall(AtomicRMWInst *I);
  void expandAtomicCASToLibcall(AtomicCmpXchgInst *I);

  friend bool
  llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI,
                                 CreateCmpXchgInstFun CreateCmpXchg);
};

} // end anonymous namespace

char AtomicExpand::ID = 0;

char &llvm::AtomicExpandID = AtomicExpand::ID;

INITIALIZE_PASS(AtomicExpand, DEBUG_TYPE, "Expand Atomic instructions", false,
                false)

FunctionPass *llvm::createAtomicExpandPass() { return new AtomicExpand(); }

// Helper functions to retrieve the size of atomic instructions.
static unsigned getAtomicOpSize(LoadInst *LI) {
  const DataLayout &DL = LI->getModule()->getDataLayout();
  return DL.getTypeStoreSize(LI->getType());
}

static unsigned getAtomicOpSize(StoreInst *SI) {
  const DataLayout &DL = SI->getModule()->getDataLayout();
  return DL.getTypeStoreSize(SI->getValueOperand()->getType());
}

static unsigned getAtomicOpSize(AtomicRMWInst *RMWI) {
  const DataLayout &DL = RMWI->getModule()->getDataLayout();
  return DL.getTypeStoreSize(RMWI->getValOperand()->getType());
}

static unsigned getAtomicOpSize(AtomicCmpXchgInst *CASI) {
  const DataLayout &DL = CASI->getModule()->getDataLayout();
  return DL.getTypeStoreSize(CASI->getCompareOperand()->getType());
}

// Determine if a particular atomic operation has a supported size,
// and is of appropriate alignment, to be passed through for target
// lowering. (Versus turning into a __atomic libcall.)
template <typename Inst>
static bool atomicSizeSupported(const TargetLowering *TLI, Inst *I) {
  unsigned Size = getAtomicOpSize(I);
  Align Alignment = I->getAlign();
  return Alignment >= Size &&
         Size <= TLI->getMaxAtomicSizeInBitsSupported() / 8;
}

bool AtomicExpand::runOnFunction(Function &F) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  auto &TM = TPC->getTM<TargetMachine>();
  if (!TM.getSubtargetImpl(F)->enableAtomicExpand())
    return false;
  TLI = TM.getSubtargetImpl(F)->getTargetLowering();

  SmallVector<Instruction *, 1> AtomicInsts;

  // Changing control-flow while iterating through it is a bad idea, so gather a
  // list of all atomic instructions before we start.
  for (inst_iterator II = inst_begin(F), E = inst_end(F); II != E; ++II) {
    Instruction *I = &*II;
    if (I->isAtomic() && !isa<FenceInst>(I))
      AtomicInsts.push_back(I);
  }

  bool MadeChange = false;
  for (auto I : AtomicInsts) {
    auto LI = dyn_cast<LoadInst>(I);
    auto SI = dyn_cast<StoreInst>(I);
    auto RMWI = dyn_cast<AtomicRMWInst>(I);
    auto CASI = dyn_cast<AtomicCmpXchgInst>(I);
    assert((LI || SI || RMWI || CASI) && "Unknown atomic instruction");

    // If the Size/Alignment is not supported, replace with a libcall.
    if (LI) {
      if (!atomicSizeSupported(TLI, LI)) {
        expandAtomicLoadToLibcall(LI);
        MadeChange = true;
        continue;
      }
    } else if (SI) {
      if (!atomicSizeSupported(TLI, SI)) {
        expandAtomicStoreToLibcall(SI);
        MadeChange = true;
        continue;
      }
    } else if (RMWI) {
      if (!atomicSizeSupported(TLI, RMWI)) {
        expandAtomicRMWToLibcall(RMWI);
        MadeChange = true;
        continue;
      }
    } else if (CASI) {
      if (!atomicSizeSupported(TLI, CASI)) {
        expandAtomicCASToLibcall(CASI);
        MadeChange = true;
        continue;
      }
    }

    if (TLI->shouldInsertFencesForAtomic(I)) {
      auto FenceOrdering = AtomicOrdering::Monotonic;
      if (LI && isAcquireOrStronger(LI->getOrdering())) {
        FenceOrdering = LI->getOrdering();
        LI->setOrdering(AtomicOrdering::Monotonic);
      } else if (SI && isReleaseOrStronger(SI->getOrdering())) {
        FenceOrdering = SI->getOrdering();
        SI->setOrdering(AtomicOrdering::Monotonic);
      } else if (RMWI && (isReleaseOrStronger(RMWI->getOrdering()) ||
                          isAcquireOrStronger(RMWI->getOrdering()))) {
        FenceOrdering = RMWI->getOrdering();
        RMWI->setOrdering(AtomicOrdering::Monotonic);
      } else if (CASI &&
                 TLI->shouldExpandAtomicCmpXchgInIR(CASI) ==
                     TargetLoweringBase::AtomicExpansionKind::None &&
                 (isReleaseOrStronger(CASI->getSuccessOrdering()) ||
                  isAcquireOrStronger(CASI->getSuccessOrdering()))) {
        // If a compare and swap is lowered to LL/SC, we can do smarter fence
        // insertion, with a stronger one on the success path than on the
        // failure path. As a result, fence insertion is directly done by
        // expandAtomicCmpXchg in that case.
        FenceOrdering = CASI->getSuccessOrdering();
        CASI->setSuccessOrdering(AtomicOrdering::Monotonic);
        CASI->setFailureOrdering(AtomicOrdering::Monotonic);
      }

      if (FenceOrdering != AtomicOrdering::Monotonic) {
        MadeChange |= bracketInstWithFences(I, FenceOrdering);
      }
    }

    if (LI) {
      if (LI->getType()->isFloatingPointTy()) {
        // TODO: add a TLI hook to control this so that each target can
        // convert to lowering the original type one at a time.
        LI = convertAtomicLoadToIntegerType(LI);
        assert(LI->getType()->isIntegerTy() && "invariant broken");
        MadeChange = true;
      }

      MadeChange |= tryExpandAtomicLoad(LI);
    } else if (SI) {
      if (SI->getValueOperand()->getType()->isFloatingPointTy()) {
        // TODO: add a TLI hook to control this so that each target can
        // convert to lowering the original type one at a time.
        SI = convertAtomicStoreToIntegerType(SI);
        assert(SI->getValueOperand()->getType()->isIntegerTy() &&
               "invariant broken");
        MadeChange = true;
      }

      if (TLI->shouldExpandAtomicStoreInIR(SI))
        MadeChange |= expandAtomicStore(SI);
    } else if (RMWI) {
      // There are two different ways of expanding RMW instructions:
      // - into a load if it is idempotent
      // - into a Cmpxchg/LL-SC loop otherwise
      // we try them in that order.

      if (isIdempotentRMW(RMWI) && simplifyIdempotentRMW(RMWI)) {
        MadeChange = true;
      } else {
        unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8;
        unsigned ValueSize = getAtomicOpSize(RMWI);
        AtomicRMWInst::BinOp Op = RMWI->getOperation();
        if (ValueSize < MinCASSize &&
            (Op == AtomicRMWInst::Or || Op == AtomicRMWInst::Xor ||
             Op == AtomicRMWInst::And)) {
          RMWI = widenPartwordAtomicRMW(RMWI);
          MadeChange = true;
        }

        MadeChange |= tryExpandAtomicRMW(RMWI);
      }
    } else if (CASI) {
      // TODO: when we're ready to make the change at the IR level, we can
      // extend convertCmpXchgToInteger for floating point too.
      assert(!CASI->getCompareOperand()->getType()->isFloatingPointTy() &&
             "unimplemented - floating point not legal at IR level");
      if (CASI->getCompareOperand()->getType()->isPointerTy()) {
        // TODO: add a TLI hook to control this so that each target can
        // convert to lowering the original type one at a time.
        CASI = convertCmpXchgToIntegerType(CASI);
        assert(CASI->getCompareOperand()->getType()->isIntegerTy() &&
               "invariant broken");
        MadeChange = true;
      }

      MadeChange |= tryExpandAtomicCmpXchg(CASI);
    }
  }
  return MadeChange;
}

bool AtomicExpand::bracketInstWithFences(Instruction *I, AtomicOrdering Order) {
  IRBuilder<> Builder(I);

  auto LeadingFence = TLI->emitLeadingFence(Builder, I, Order);

  auto TrailingFence = TLI->emitTrailingFence(Builder, I, Order);
  // We have a guard here because not every atomic operation generates a
  // trailing fence.
  if (TrailingFence)
    TrailingFence->moveAfter(I);

  return (LeadingFence || TrailingFence);
}

/// Get the iX type with the same bitwidth as T.
IntegerType *AtomicExpand::getCorrespondingIntegerType(Type *T,
                                                       const DataLayout &DL) {
  EVT VT = TLI->getMemValueType(DL, T);
  unsigned BitWidth = VT.getStoreSizeInBits();
  assert(BitWidth == VT.getSizeInBits() && "must be a power of two");
  return IntegerType::get(T->getContext(), BitWidth);
}

/// Convert an atomic load of a non-integral type to an integer load of the
/// equivalent bitwidth. See the function comment on
/// convertAtomicStoreToIntegerType for background.
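///
/// For illustration only (value names invented here), an atomic float load
/// such as
///   %v = load atomic float, float* %p seq_cst, align 4
/// is rewritten into roughly
///   %addr.cast = bitcast float* %p to i32*
///   %v.int = load atomic i32, i32* %addr.cast seq_cst, align 4
///   %v = bitcast i32 %v.int to float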
LoadInst *AtomicExpand::convertAtomicLoadToIntegerType(LoadInst *LI) {
  auto *M = LI->getModule();
  Type *NewTy = getCorrespondingIntegerType(LI->getType(), M->getDataLayout());

  IRBuilder<> Builder(LI);

  Value *Addr = LI->getPointerOperand();
  Type *PT = PointerType::get(NewTy,
                              Addr->getType()->getPointerAddressSpace());
  Value *NewAddr = Builder.CreateBitCast(Addr, PT);

  auto *NewLI = Builder.CreateLoad(NewTy, NewAddr);
  NewLI->setAlignment(LI->getAlign());
  NewLI->setVolatile(LI->isVolatile());
  NewLI->setAtomic(LI->getOrdering(), LI->getSyncScopeID());
  LLVM_DEBUG(dbgs() << "Replaced " << *LI << " with " << *NewLI << "\n");

  Value *NewVal = Builder.CreateBitCast(NewLI, LI->getType());
  LI->replaceAllUsesWith(NewVal);
  LI->eraseFromParent();
  return NewLI;
}

bool AtomicExpand::tryExpandAtomicLoad(LoadInst *LI) {
  switch (TLI->shouldExpandAtomicLoadInIR(LI)) {
  case TargetLoweringBase::AtomicExpansionKind::None:
    return false;
  case TargetLoweringBase::AtomicExpansionKind::LLSC:
    expandAtomicOpToLLSC(
        LI, LI->getType(), LI->getPointerOperand(), LI->getOrdering(),
        [](IRBuilder<> &Builder, Value *Loaded) { return Loaded; });
    return true;
  case TargetLoweringBase::AtomicExpansionKind::LLOnly:
    return expandAtomicLoadToLL(LI);
  case TargetLoweringBase::AtomicExpansionKind::CmpXChg:
    return expandAtomicLoadToCmpXchg(LI);
  default:
    llvm_unreachable("Unhandled case in tryExpandAtomicLoad");
  }
}

bool AtomicExpand::expandAtomicLoadToLL(LoadInst *LI) {
  IRBuilder<> Builder(LI);

  // On some architectures, load-linked instructions are atomic for larger
  // sizes than normal loads. For example, the only 64-bit load guaranteed
  // to be single-copy atomic by ARM is an ldrexd (A3.5.3).
  Value *Val =
      TLI->emitLoadLinked(Builder, LI->getPointerOperand(), LI->getOrdering());
  TLI->emitAtomicCmpXchgNoStoreLLBalance(Builder);

  LI->replaceAllUsesWith(Val);
  LI->eraseFromParent();

  return true;
}

bool AtomicExpand::expandAtomicLoadToCmpXchg(LoadInst *LI) {
  IRBuilder<> Builder(LI);
  AtomicOrdering Order = LI->getOrdering();
  if (Order == AtomicOrdering::Unordered)
    Order = AtomicOrdering::Monotonic;

  Value *Addr = LI->getPointerOperand();
  Type *Ty = cast<PointerType>(Addr->getType())->getElementType();
  Constant *DummyVal = Constant::getNullValue(Ty);

  Value *Pair = Builder.CreateAtomicCmpXchg(
      Addr, DummyVal, DummyVal, Order,
      AtomicCmpXchgInst::getStrongestFailureOrdering(Order));
  Value *Loaded = Builder.CreateExtractValue(Pair, 0, "loaded");

  LI->replaceAllUsesWith(Loaded);
  LI->eraseFromParent();

  return true;
}

/// Convert an atomic store of a non-integral type to an integer store of the
/// equivalent bitwidth. We used to not support floating point or vector
/// atomics in the IR at all. The backends learned to deal with the bitcast
/// idiom because that was the only way of expressing the notion of an atomic
/// float or vector store. The long term plan is to teach each backend to
/// instruction select from the original atomic store, but as a migration
/// mechanism, we convert back to the old format which the backends understand.
/// Each backend will need individual work to recognize the new format.
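///
/// For illustration only (value names invented here), an atomic float store
/// such as
///   store atomic float %v, float* %p release, align 4
/// is rewritten into roughly
///   %v.int = bitcast float %v to i32
///   %addr.cast = bitcast float* %p to i32*
///   store atomic i32 %v.int, i32* %addr.cast release, align 4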
StoreInst *AtomicExpand::convertAtomicStoreToIntegerType(StoreInst *SI) {
  IRBuilder<> Builder(SI);
  auto *M = SI->getModule();
  Type *NewTy = getCorrespondingIntegerType(SI->getValueOperand()->getType(),
                                            M->getDataLayout());
  Value *NewVal = Builder.CreateBitCast(SI->getValueOperand(), NewTy);

  Value *Addr = SI->getPointerOperand();
  Type *PT = PointerType::get(NewTy,
                              Addr->getType()->getPointerAddressSpace());
  Value *NewAddr = Builder.CreateBitCast(Addr, PT);

  StoreInst *NewSI = Builder.CreateStore(NewVal, NewAddr);
  NewSI->setAlignment(SI->getAlign());
  NewSI->setVolatile(SI->isVolatile());
  NewSI->setAtomic(SI->getOrdering(), SI->getSyncScopeID());
  LLVM_DEBUG(dbgs() << "Replaced " << *SI << " with " << *NewSI << "\n");
  SI->eraseFromParent();
  return NewSI;
}

bool AtomicExpand::expandAtomicStore(StoreInst *SI) {
  // This function is only called on atomic stores that are too large to be
  // atomic if implemented as a native store. So we replace them by an
  // atomic swap, which can be implemented for example as ldrex/strex on ARM
  // or lock cmpxchg8/16b on X86, as these are atomic for larger sizes.
  // It is the responsibility of the target to only signal expansion via
  // shouldExpandAtomicRMW in cases where this is required and possible.
  IRBuilder<> Builder(SI);
  AtomicRMWInst *AI =
      Builder.CreateAtomicRMW(AtomicRMWInst::Xchg, SI->getPointerOperand(),
                              SI->getValueOperand(), SI->getOrdering());
  SI->eraseFromParent();

  // Now we have an appropriate swap instruction, lower it as usual.
  return tryExpandAtomicRMW(AI);
}

static void createCmpXchgInstFun(IRBuilder<> &Builder, Value *Addr,
                                 Value *Loaded, Value *NewVal,
                                 AtomicOrdering MemOpOrder, Value *&Success,
                                 Value *&NewLoaded) {
  Type *OrigTy = NewVal->getType();

  // This code can go away when cmpxchg supports FP types.
  bool NeedBitcast = OrigTy->isFloatingPointTy();
  if (NeedBitcast) {
    IntegerType *IntTy = Builder.getIntNTy(OrigTy->getPrimitiveSizeInBits());
    unsigned AS = Addr->getType()->getPointerAddressSpace();
    Addr = Builder.CreateBitCast(Addr, IntTy->getPointerTo(AS));
    NewVal = Builder.CreateBitCast(NewVal, IntTy);
    Loaded = Builder.CreateBitCast(Loaded, IntTy);
  }

  Value *Pair = Builder.CreateAtomicCmpXchg(
      Addr, Loaded, NewVal, MemOpOrder,
      AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder));
  Success = Builder.CreateExtractValue(Pair, 1, "success");
  NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded");

  if (NeedBitcast)
    NewLoaded = Builder.CreateBitCast(NewLoaded, OrigTy);
}

/// Emit IR to implement the given atomicrmw operation on values in registers,
/// returning the new value.
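///
/// For example, for Nand this emits (names illustrative):
///   %tmp = and iN %loaded, %inc
///   %new = xor iN %tmp, -1
/// while the min/max variants emit an icmp followed by a select.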
static Value *performAtomicOp(AtomicRMWInst::BinOp Op, IRBuilder<> &Builder,
                              Value *Loaded, Value *Inc) {
  Value *NewVal;
  switch (Op) {
  case AtomicRMWInst::Xchg:
    return Inc;
  case AtomicRMWInst::Add:
    return Builder.CreateAdd(Loaded, Inc, "new");
  case AtomicRMWInst::Sub:
    return Builder.CreateSub(Loaded, Inc, "new");
  case AtomicRMWInst::And:
    return Builder.CreateAnd(Loaded, Inc, "new");
  case AtomicRMWInst::Nand:
    return Builder.CreateNot(Builder.CreateAnd(Loaded, Inc), "new");
  case AtomicRMWInst::Or:
    return Builder.CreateOr(Loaded, Inc, "new");
  case AtomicRMWInst::Xor:
    return Builder.CreateXor(Loaded, Inc, "new");
  case AtomicRMWInst::Max:
    NewVal = Builder.CreateICmpSGT(Loaded, Inc);
    return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
  case AtomicRMWInst::Min:
    NewVal = Builder.CreateICmpSLE(Loaded, Inc);
    return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
  case AtomicRMWInst::UMax:
    NewVal = Builder.CreateICmpUGT(Loaded, Inc);
    return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
  case AtomicRMWInst::UMin:
    NewVal = Builder.CreateICmpULE(Loaded, Inc);
    return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
  case AtomicRMWInst::FAdd:
    return Builder.CreateFAdd(Loaded, Inc, "new");
  case AtomicRMWInst::FSub:
    return Builder.CreateFSub(Loaded, Inc, "new");
  default:
    llvm_unreachable("Unknown atomic op");
  }
}

bool AtomicExpand::tryExpandAtomicRMW(AtomicRMWInst *AI) {
  switch (TLI->shouldExpandAtomicRMWInIR(AI)) {
  case TargetLoweringBase::AtomicExpansionKind::None:
    return false;
  case TargetLoweringBase::AtomicExpansionKind::LLSC: {
    unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8;
    unsigned ValueSize = getAtomicOpSize(AI);
    if (ValueSize < MinCASSize) {
      expandPartwordAtomicRMW(AI,
                              TargetLoweringBase::AtomicExpansionKind::LLSC);
    } else {
      auto PerformOp = [&](IRBuilder<> &Builder, Value *Loaded) {
        return performAtomicOp(AI->getOperation(), Builder, Loaded,
                               AI->getValOperand());
      };
      expandAtomicOpToLLSC(AI, AI->getType(), AI->getPointerOperand(),
                           AI->getOrdering(), PerformOp);
    }
    return true;
  }
  case TargetLoweringBase::AtomicExpansionKind::CmpXChg: {
    unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8;
    unsigned ValueSize = getAtomicOpSize(AI);
    if (ValueSize < MinCASSize) {
      // TODO: Handle atomicrmw fadd/fsub
      if (AI->getType()->isFloatingPointTy())
        return false;

      expandPartwordAtomicRMW(AI,
                              TargetLoweringBase::AtomicExpansionKind::CmpXChg);
    } else {
      expandAtomicRMWToCmpXchg(AI, createCmpXchgInstFun);
    }
    return true;
  }
  case TargetLoweringBase::AtomicExpansionKind::MaskedIntrinsic: {
    expandAtomicRMWToMaskedIntrinsic(AI);
    return true;
  }
  default:
    llvm_unreachable("Unhandled case in tryExpandAtomicRMW");
  }
}

namespace {

struct PartwordMaskValues {
  // These three fields are guaranteed to be set by createMaskInstrs.
  Type *WordType = nullptr;
  Type *ValueType = nullptr;
  Value *AlignedAddr = nullptr;
  // The remaining fields can be null.
  Value *ShiftAmt = nullptr;
  Value *Mask = nullptr;
  Value *Inv_Mask = nullptr;
};

LLVM_ATTRIBUTE_UNUSED
raw_ostream &operator<<(raw_ostream &O, const PartwordMaskValues &PMV) {
  auto PrintObj = [&O](auto *V) {
    if (V)
      O << *V;
    else
      O << "nullptr";
    O << '\n';
  };
  O << "PartwordMaskValues {\n";
  O << "  WordType: ";
  PrintObj(PMV.WordType);
  O << "  ValueType: ";
  PrintObj(PMV.ValueType);
  O << "  AlignedAddr: ";
  PrintObj(PMV.AlignedAddr);
  O << "  ShiftAmt: ";
  PrintObj(PMV.ShiftAmt);
  O << "  Mask: ";
  PrintObj(PMV.Mask);
  O << "  Inv_Mask: ";
  PrintObj(PMV.Inv_Mask);
  O << "}\n";
  return O;
}

} // end anonymous namespace

/// This is a helper function which builds instructions to provide
/// values necessary for partword atomic operations. It takes an
/// incoming address, Addr, and ValueType, and constructs the address,
/// shift-amounts and masks needed to work with a larger value of size
/// WordSize.
///
/// AlignedAddr: Addr rounded down to a multiple of WordSize
///
/// ShiftAmt: Number of bits to right-shift a WordSize value loaded
///           from AlignedAddr for it to have the same value as if
///           ValueType was loaded from Addr.
///
/// Mask: Value to mask with the value loaded from AlignedAddr to
///       include only the part that would've been loaded from Addr.
///
/// Inv_Mask: The inverse of Mask.
static PartwordMaskValues createMaskInstrs(IRBuilder<> &Builder, Instruction *I,
                                           Type *ValueType, Value *Addr,
                                           unsigned MinWordSize) {
  PartwordMaskValues PMV;

  Module *M = I->getModule();
  LLVMContext &Ctx = M->getContext();
  const DataLayout &DL = M->getDataLayout();
  unsigned ValueSize = DL.getTypeStoreSize(ValueType);

  PMV.ValueType = ValueType;
  PMV.WordType = MinWordSize > ValueSize ? Type::getIntNTy(Ctx, MinWordSize * 8)
                                         : ValueType;
  if (PMV.ValueType == PMV.WordType) {
    PMV.AlignedAddr = Addr;
    return PMV;
  }

  assert(ValueSize < MinWordSize);

  Type *WordPtrType =
      PMV.WordType->getPointerTo(Addr->getType()->getPointerAddressSpace());

  Value *AddrInt = Builder.CreatePtrToInt(Addr, DL.getIntPtrType(Ctx));
  PMV.AlignedAddr = Builder.CreateIntToPtr(
      Builder.CreateAnd(AddrInt, ~(uint64_t)(MinWordSize - 1)), WordPtrType,
      "AlignedAddr");

  Value *PtrLSB = Builder.CreateAnd(AddrInt, MinWordSize - 1, "PtrLSB");
  if (DL.isLittleEndian()) {
    // turn bytes into bits
    PMV.ShiftAmt = Builder.CreateShl(PtrLSB, 3);
  } else {
    // turn bytes into bits, and count from the other side.
    PMV.ShiftAmt = Builder.CreateShl(
        Builder.CreateXor(PtrLSB, MinWordSize - ValueSize), 3);
  }

  PMV.ShiftAmt = Builder.CreateTrunc(PMV.ShiftAmt, PMV.WordType, "ShiftAmt");
  PMV.Mask = Builder.CreateShl(
      ConstantInt::get(PMV.WordType, (1 << (ValueSize * 8)) - 1), PMV.ShiftAmt,
      "Mask");
  PMV.Inv_Mask = Builder.CreateNot(PMV.Mask, "Inv_Mask");
  return PMV;
}

static Value *extractMaskedValue(IRBuilder<> &Builder, Value *WideWord,
                                 const PartwordMaskValues &PMV) {
  assert(WideWord->getType() == PMV.WordType && "Widened type mismatch");
  if (PMV.WordType == PMV.ValueType)
    return WideWord;

  Value *Shift = Builder.CreateLShr(WideWord, PMV.ShiftAmt, "shifted");
  Value *Trunc = Builder.CreateTrunc(Shift, PMV.ValueType, "extracted");
  return Trunc;
}

static Value *insertMaskedValue(IRBuilder<> &Builder, Value *WideWord,
                                Value *Updated, const PartwordMaskValues &PMV) {
  assert(WideWord->getType() == PMV.WordType && "Widened type mismatch");
  assert(Updated->getType() == PMV.ValueType && "Value type mismatch");
  if (PMV.WordType == PMV.ValueType)
    return Updated;

  Value *ZExt = Builder.CreateZExt(Updated, PMV.WordType, "extended");
  Value *Shift =
      Builder.CreateShl(ZExt, PMV.ShiftAmt, "shifted", /*HasNUW*/ true);
  Value *And = Builder.CreateAnd(WideWord, PMV.Inv_Mask, "unmasked");
  Value *Or = Builder.CreateOr(And, Shift, "inserted");
  return Or;
}

/// Emit IR to implement a masked version of a given atomicrmw
/// operation. (That is, only the bits under the Mask should be
/// affected by the operation)
static Value *performMaskedAtomicOp(AtomicRMWInst::BinOp Op,
                                    IRBuilder<> &Builder, Value *Loaded,
                                    Value *Shifted_Inc, Value *Inc,
                                    const PartwordMaskValues &PMV) {
  // TODO: update to use
  // https://graphics.stanford.edu/~seander/bithacks.html#MaskedMerge in order
  // to merge bits from two values without requiring PMV.Inv_Mask.
  switch (Op) {
  case AtomicRMWInst::Xchg: {
    Value *Loaded_MaskOut = Builder.CreateAnd(Loaded, PMV.Inv_Mask);
    Value *FinalVal = Builder.CreateOr(Loaded_MaskOut, Shifted_Inc);
    return FinalVal;
  }
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::And:
    llvm_unreachable("Or/Xor/And handled by widenPartwordAtomicRMW");
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::Nand: {
    // The other arithmetic ops need to be masked into place.
    Value *NewVal = performAtomicOp(Op, Builder, Loaded, Shifted_Inc);
    Value *NewVal_Masked = Builder.CreateAnd(NewVal, PMV.Mask);
    Value *Loaded_MaskOut = Builder.CreateAnd(Loaded, PMV.Inv_Mask);
    Value *FinalVal = Builder.CreateOr(Loaded_MaskOut, NewVal_Masked);
    return FinalVal;
  }
  case AtomicRMWInst::Max:
  case AtomicRMWInst::Min:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::UMin: {
    // Finally, comparison ops will operate on the full value, so
    // truncate down to the original size, and expand out again after
    // doing the operation.
    Value *Loaded_Extract = extractMaskedValue(Builder, Loaded, PMV);
    Value *NewVal = performAtomicOp(Op, Builder, Loaded_Extract, Inc);
    Value *FinalVal = insertMaskedValue(Builder, Loaded, NewVal, PMV);
    return FinalVal;
  }
  default:
    llvm_unreachable("Unknown atomic op");
  }
}

/// Expand a sub-word atomicrmw operation into an appropriate
/// word-sized operation.
///
/// It will create an LL/SC or cmpxchg loop, as appropriate, the same
/// way as a typical atomicrmw expansion. The only difference here is
/// that the operation inside of the loop may operate upon only a
/// part of the value.
void AtomicExpand::expandPartwordAtomicRMW(
    AtomicRMWInst *AI, TargetLoweringBase::AtomicExpansionKind ExpansionKind) {
  AtomicOrdering MemOpOrder = AI->getOrdering();

  IRBuilder<> Builder(AI);

  PartwordMaskValues PMV =
      createMaskInstrs(Builder, AI, AI->getType(), AI->getPointerOperand(),
                       TLI->getMinCmpXchgSizeInBits() / 8);

  Value *ValOperand_Shifted =
      Builder.CreateShl(Builder.CreateZExt(AI->getValOperand(), PMV.WordType),
                        PMV.ShiftAmt, "ValOperand_Shifted");

  auto PerformPartwordOp = [&](IRBuilder<> &Builder, Value *Loaded) {
    return performMaskedAtomicOp(AI->getOperation(), Builder, Loaded,
                                 ValOperand_Shifted, AI->getValOperand(), PMV);
  };

  Value *OldResult;
  if (ExpansionKind == TargetLoweringBase::AtomicExpansionKind::CmpXChg) {
    OldResult =
        insertRMWCmpXchgLoop(Builder, PMV.WordType, PMV.AlignedAddr, MemOpOrder,
                             PerformPartwordOp, createCmpXchgInstFun);
  } else {
    assert(ExpansionKind == TargetLoweringBase::AtomicExpansionKind::LLSC);
    OldResult = insertRMWLLSCLoop(Builder, PMV.WordType, PMV.AlignedAddr,
                                  MemOpOrder, PerformPartwordOp);
  }

  Value *FinalOldResult = extractMaskedValue(Builder, OldResult, PMV);
  AI->replaceAllUsesWith(FinalOldResult);
  AI->eraseFromParent();
}

// Widen the bitwise atomicrmw (or/xor/and) to the minimum supported width.
AtomicRMWInst *AtomicExpand::widenPartwordAtomicRMW(AtomicRMWInst *AI) {
  IRBuilder<> Builder(AI);
  AtomicRMWInst::BinOp Op = AI->getOperation();

  assert((Op == AtomicRMWInst::Or || Op == AtomicRMWInst::Xor ||
          Op == AtomicRMWInst::And) &&
         "Unable to widen operation");

  PartwordMaskValues PMV =
      createMaskInstrs(Builder, AI, AI->getType(), AI->getPointerOperand(),
                       TLI->getMinCmpXchgSizeInBits() / 8);

  Value *ValOperand_Shifted =
      Builder.CreateShl(Builder.CreateZExt(AI->getValOperand(), PMV.WordType),
                        PMV.ShiftAmt, "ValOperand_Shifted");

  Value *NewOperand;

  if (Op == AtomicRMWInst::And)
    NewOperand =
        Builder.CreateOr(PMV.Inv_Mask, ValOperand_Shifted, "AndOperand");
  else
    NewOperand = ValOperand_Shifted;

  AtomicRMWInst *NewAI = Builder.CreateAtomicRMW(Op, PMV.AlignedAddr,
                                                 NewOperand, AI->getOrdering());

  Value *FinalOldResult = extractMaskedValue(Builder, NewAI, PMV);
  AI->replaceAllUsesWith(FinalOldResult);
  AI->eraseFromParent();
  return NewAI;
}

bool AtomicExpand::expandPartwordCmpXchg(AtomicCmpXchgInst *CI) {
  // The basic idea here is that we're expanding a cmpxchg of a
  // smaller memory size up to a word-sized cmpxchg. To do this, we
  // need to add a retry-loop for strong cmpxchg, so that
  // modifications to other parts of the word don't cause a spurious
  // failure.
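  // (A weak cmpxchg is allowed to fail spuriously anyway, so it branches
  // straight to the end rather than retrying; see the isWeak() check below.)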

  // This generates code like the following:
  //     [[Setup mask values PMV.*]]
  //     %NewVal_Shifted = shl i32 %NewVal, %PMV.ShiftAmt
  //     %Cmp_Shifted = shl i32 %Cmp, %PMV.ShiftAmt
  //     %InitLoaded = load i32* %addr
  //     %InitLoaded_MaskOut = and i32 %InitLoaded, %PMV.Inv_Mask
  //     br partword.cmpxchg.loop
  // partword.cmpxchg.loop:
  //     %Loaded_MaskOut = phi i32 [ %InitLoaded_MaskOut, %entry ],
  //        [ %OldVal_MaskOut, %partword.cmpxchg.failure ]
  //     %FullWord_NewVal = or i32 %Loaded_MaskOut, %NewVal_Shifted
  //     %FullWord_Cmp = or i32 %Loaded_MaskOut, %Cmp_Shifted
  //     %NewCI = cmpxchg i32* %PMV.AlignedAddr, i32 %FullWord_Cmp,
  //        i32 %FullWord_NewVal success_ordering failure_ordering
  //     %OldVal = extractvalue { i32, i1 } %NewCI, 0
  //     %Success = extractvalue { i32, i1 } %NewCI, 1
  //     br i1 %Success, label %partword.cmpxchg.end,
  //        label %partword.cmpxchg.failure
  // partword.cmpxchg.failure:
  //     %OldVal_MaskOut = and i32 %OldVal, %PMV.Inv_Mask
  //     %ShouldContinue = icmp ne i32 %Loaded_MaskOut, %OldVal_MaskOut
  //     br i1 %ShouldContinue, label %partword.cmpxchg.loop,
  //        label %partword.cmpxchg.end
  // partword.cmpxchg.end:
  //     %tmp1 = lshr i32 %OldVal, %PMV.ShiftAmt
  //     %FinalOldVal = trunc i32 %tmp1 to i8
  //     %tmp2 = insertvalue { i8, i1 } undef, i8 %FinalOldVal, 0
  //     %Res = insertvalue { i8, i1 } %tmp2, i1 %Success, 1

  Value *Addr = CI->getPointerOperand();
  Value *Cmp = CI->getCompareOperand();
  Value *NewVal = CI->getNewValOperand();

  BasicBlock *BB = CI->getParent();
  Function *F = BB->getParent();
  IRBuilder<> Builder(CI);
  LLVMContext &Ctx = Builder.getContext();

  const int WordSize = TLI->getMinCmpXchgSizeInBits() / 8;

  BasicBlock *EndBB =
      BB->splitBasicBlock(CI->getIterator(), "partword.cmpxchg.end");
  auto FailureBB =
      BasicBlock::Create(Ctx, "partword.cmpxchg.failure", F, EndBB);
  auto LoopBB = BasicBlock::Create(Ctx, "partword.cmpxchg.loop", F, FailureBB);

  // The split call above "helpfully" added a branch at the end of BB
  // (to the wrong place).
  std::prev(BB->end())->eraseFromParent();
  Builder.SetInsertPoint(BB);

  PartwordMaskValues PMV = createMaskInstrs(
      Builder, CI, CI->getCompareOperand()->getType(), Addr, WordSize);

  // Shift the incoming values over, into the right location in the word.
  Value *NewVal_Shifted =
      Builder.CreateShl(Builder.CreateZExt(NewVal, PMV.WordType), PMV.ShiftAmt);
  Value *Cmp_Shifted =
      Builder.CreateShl(Builder.CreateZExt(Cmp, PMV.WordType), PMV.ShiftAmt);

  // Load the entire current word, and mask into place the expected and new
  // values
  LoadInst *InitLoaded = Builder.CreateLoad(PMV.WordType, PMV.AlignedAddr);
  InitLoaded->setVolatile(CI->isVolatile());
  Value *InitLoaded_MaskOut = Builder.CreateAnd(InitLoaded, PMV.Inv_Mask);
  Builder.CreateBr(LoopBB);

  // partword.cmpxchg.loop:
  Builder.SetInsertPoint(LoopBB);
  PHINode *Loaded_MaskOut = Builder.CreatePHI(PMV.WordType, 2);
  Loaded_MaskOut->addIncoming(InitLoaded_MaskOut, BB);

  // Mask/Or the expected and new values into place in the loaded word.
  Value *FullWord_NewVal = Builder.CreateOr(Loaded_MaskOut, NewVal_Shifted);
  Value *FullWord_Cmp = Builder.CreateOr(Loaded_MaskOut, Cmp_Shifted);
  AtomicCmpXchgInst *NewCI = Builder.CreateAtomicCmpXchg(
      PMV.AlignedAddr, FullWord_Cmp, FullWord_NewVal, CI->getSuccessOrdering(),
      CI->getFailureOrdering(), CI->getSyncScopeID());
  NewCI->setVolatile(CI->isVolatile());
  // When we're building a strong cmpxchg, we need a loop, so you
  // might think we could use a weak cmpxchg inside. But, using strong
  // allows the below comparison for ShouldContinue, and we're
  // expecting the underlying cmpxchg to be a machine instruction,
  // which is strong anyways.
  NewCI->setWeak(CI->isWeak());

  Value *OldVal = Builder.CreateExtractValue(NewCI, 0);
  Value *Success = Builder.CreateExtractValue(NewCI, 1);

  if (CI->isWeak())
    Builder.CreateBr(EndBB);
  else
    Builder.CreateCondBr(Success, EndBB, FailureBB);

  // partword.cmpxchg.failure:
  Builder.SetInsertPoint(FailureBB);
  // Upon failure, check whether the masked-out part of the loaded value has
  // been modified. If it hasn't, the failure must have come from the
  // masked-in part, so give up; otherwise, retry the cmpxchg.
  Value *OldVal_MaskOut = Builder.CreateAnd(OldVal, PMV.Inv_Mask);
  Value *ShouldContinue = Builder.CreateICmpNE(Loaded_MaskOut, OldVal_MaskOut);
  Builder.CreateCondBr(ShouldContinue, LoopBB, EndBB);

  // Add the second value to the phi from above
  Loaded_MaskOut->addIncoming(OldVal_MaskOut, FailureBB);

  // partword.cmpxchg.end:
  Builder.SetInsertPoint(CI);

  Value *FinalOldVal = extractMaskedValue(Builder, OldVal, PMV);
  Value *Res = UndefValue::get(CI->getType());
  Res = Builder.CreateInsertValue(Res, FinalOldVal, 0);
  Res = Builder.CreateInsertValue(Res, Success, 1);

  CI->replaceAllUsesWith(Res);
  CI->eraseFromParent();
  return true;
}

void AtomicExpand::expandAtomicOpToLLSC(
    Instruction *I, Type *ResultType, Value *Addr, AtomicOrdering MemOpOrder,
    function_ref<Value *(IRBuilder<> &, Value *)> PerformOp) {
  IRBuilder<> Builder(I);
  Value *Loaded =
      insertRMWLLSCLoop(Builder, ResultType, Addr, MemOpOrder, PerformOp);

  I->replaceAllUsesWith(Loaded);
  I->eraseFromParent();
}

void AtomicExpand::expandAtomicRMWToMaskedIntrinsic(AtomicRMWInst *AI) {
  IRBuilder<> Builder(AI);

  PartwordMaskValues PMV =
      createMaskInstrs(Builder, AI, AI->getType(), AI->getPointerOperand(),
                       TLI->getMinCmpXchgSizeInBits() / 8);

  // The value operand must be sign-extended for signed min/max so that the
  // target's signed comparison instructions can be used. Otherwise, just
  // zero-ext.
  Instruction::CastOps CastOp = Instruction::ZExt;
  AtomicRMWInst::BinOp RMWOp = AI->getOperation();
  if (RMWOp == AtomicRMWInst::Max || RMWOp == AtomicRMWInst::Min)
    CastOp = Instruction::SExt;

  Value *ValOperand_Shifted = Builder.CreateShl(
      Builder.CreateCast(CastOp, AI->getValOperand(), PMV.WordType),
      PMV.ShiftAmt, "ValOperand_Shifted");
  Value *OldResult = TLI->emitMaskedAtomicRMWIntrinsic(
      Builder, AI, PMV.AlignedAddr, ValOperand_Shifted, PMV.Mask, PMV.ShiftAmt,
      AI->getOrdering());
  Value *FinalOldResult = extractMaskedValue(Builder, OldResult, PMV);
  AI->replaceAllUsesWith(FinalOldResult);
  AI->eraseFromParent();
}

void AtomicExpand::expandAtomicCmpXchgToMaskedIntrinsic(AtomicCmpXchgInst *CI) {
  IRBuilder<> Builder(CI);

  PartwordMaskValues PMV = createMaskInstrs(
      Builder, CI, CI->getCompareOperand()->getType(), CI->getPointerOperand(),
      TLI->getMinCmpXchgSizeInBits() / 8);

  Value *CmpVal_Shifted = Builder.CreateShl(
      Builder.CreateZExt(CI->getCompareOperand(), PMV.WordType), PMV.ShiftAmt,
      "CmpVal_Shifted");
  Value *NewVal_Shifted = Builder.CreateShl(
      Builder.CreateZExt(CI->getNewValOperand(), PMV.WordType), PMV.ShiftAmt,
      "NewVal_Shifted");
  Value *OldVal = TLI->emitMaskedAtomicCmpXchgIntrinsic(
      Builder, CI, PMV.AlignedAddr, CmpVal_Shifted, NewVal_Shifted, PMV.Mask,
      CI->getSuccessOrdering());
  Value *FinalOldVal = extractMaskedValue(Builder, OldVal, PMV);
  Value *Res = UndefValue::get(CI->getType());
  Res = Builder.CreateInsertValue(Res, FinalOldVal, 0);
  Value *Success = Builder.CreateICmpEQ(
      CmpVal_Shifted, Builder.CreateAnd(OldVal, PMV.Mask), "Success");
  Res = Builder.CreateInsertValue(Res, Success, 1);

  CI->replaceAllUsesWith(Res);
  CI->eraseFromParent();
}

Value *AtomicExpand::insertRMWLLSCLoop(
    IRBuilder<> &Builder, Type *ResultTy, Value *Addr,
    AtomicOrdering MemOpOrder,
    function_ref<Value *(IRBuilder<> &, Value *)> PerformOp) {
  LLVMContext &Ctx = Builder.getContext();
  BasicBlock *BB = Builder.GetInsertBlock();
  Function *F = BB->getParent();

  // Given: atomicrmw some_op iN* %addr, iN %incr ordering
  //
  // The standard expansion we produce is:
  //     [...]
  // atomicrmw.start:
  //     %loaded = @load.linked(%addr)
  //     %new = some_op iN %loaded, %incr
  //     %stored = @store_conditional(%new, %addr)
  //     %try_again = icmp i32 ne %stored, 0
  //     br i1 %try_again, label %loop, label %atomicrmw.end
  // atomicrmw.end:
  //     [...]
  BasicBlock *ExitBB =
      BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
  BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB);

  // The split call above "helpfully" added a branch at the end of BB (to the
  // wrong place).
  std::prev(BB->end())->eraseFromParent();
  Builder.SetInsertPoint(BB);
  Builder.CreateBr(LoopBB);

  // Start the main loop block now that we've taken care of the preliminaries.
  Builder.SetInsertPoint(LoopBB);
  Value *Loaded = TLI->emitLoadLinked(Builder, Addr, MemOpOrder);

  Value *NewVal = PerformOp(Builder, Loaded);

  Value *StoreSuccess =
      TLI->emitStoreConditional(Builder, NewVal, Addr, MemOpOrder);
  Value *TryAgain = Builder.CreateICmpNE(
      StoreSuccess, ConstantInt::get(IntegerType::get(Ctx, 32), 0), "tryagain");
  Builder.CreateCondBr(TryAgain, LoopBB, ExitBB);

  Builder.SetInsertPoint(ExitBB, ExitBB->begin());
  return Loaded;
}

/// Convert an atomic cmpxchg of a non-integral type to an integer cmpxchg of
/// the equivalent bitwidth. We used to not support pointer cmpxchg in the
/// IR. As a migration step, we convert back to what used to be the standard
/// way to represent a pointer cmpxchg so that we can update backends one by
/// one.
AtomicCmpXchgInst *
AtomicExpand::convertCmpXchgToIntegerType(AtomicCmpXchgInst *CI) {
  auto *M = CI->getModule();
  Type *NewTy = getCorrespondingIntegerType(CI->getCompareOperand()->getType(),
                                            M->getDataLayout());

  IRBuilder<> Builder(CI);

  Value *Addr = CI->getPointerOperand();
  Type *PT = PointerType::get(NewTy,
                              Addr->getType()->getPointerAddressSpace());
  Value *NewAddr = Builder.CreateBitCast(Addr, PT);

  Value *NewCmp = Builder.CreatePtrToInt(CI->getCompareOperand(), NewTy);
  Value *NewNewVal = Builder.CreatePtrToInt(CI->getNewValOperand(), NewTy);

  auto *NewCI = Builder.CreateAtomicCmpXchg(NewAddr, NewCmp, NewNewVal,
                                            CI->getSuccessOrdering(),
                                            CI->getFailureOrdering(),
                                            CI->getSyncScopeID());
  NewCI->setVolatile(CI->isVolatile());
  NewCI->setWeak(CI->isWeak());
  LLVM_DEBUG(dbgs() << "Replaced " << *CI << " with " << *NewCI << "\n");

  Value *OldVal = Builder.CreateExtractValue(NewCI, 0);
  Value *Succ = Builder.CreateExtractValue(NewCI, 1);

  OldVal = Builder.CreateIntToPtr(OldVal, CI->getCompareOperand()->getType());

  Value *Res = UndefValue::get(CI->getType());
  Res = Builder.CreateInsertValue(Res, OldVal, 0);
  Res = Builder.CreateInsertValue(Res, Succ, 1);

  CI->replaceAllUsesWith(Res);
  CI->eraseFromParent();
  return NewCI;
}

bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
  AtomicOrdering SuccessOrder = CI->getSuccessOrdering();
  AtomicOrdering FailureOrder = CI->getFailureOrdering();
  Value *Addr = CI->getPointerOperand();
  BasicBlock *BB = CI->getParent();
  Function *F = BB->getParent();
  LLVMContext &Ctx = F->getContext();
  // If shouldInsertFencesForAtomic() returns true, then the target does not
  // want to deal with memory orders, and emitLeading/TrailingFence should take
  // care of everything. Otherwise, emitLeading/TrailingFence are no-op and we
  // should preserve the ordering.
  bool ShouldInsertFencesForAtomic = TLI->shouldInsertFencesForAtomic(CI);
  AtomicOrdering MemOpOrder =
      ShouldInsertFencesForAtomic ? AtomicOrdering::Monotonic : SuccessOrder;

  // In implementations which use a barrier to achieve release semantics, we
  // can delay emitting this barrier until we know a store is actually going
  // to be attempted. The cost of this delay is that we need 2 copies of the
  // block emitting the load-linked, affecting code size.
  //
  // Ideally, this logic would be unconditional except for the minsize check
  // since in other cases the extra blocks naturally collapse down to the
  // minimal loop. Unfortunately, this puts too much stress on later
  // optimisations so we avoid emitting the extra logic in those cases too.
  bool HasReleasedLoadBB = !CI->isWeak() && ShouldInsertFencesForAtomic &&
                           SuccessOrder != AtomicOrdering::Monotonic &&
                           SuccessOrder != AtomicOrdering::Acquire &&
                           !F->hasMinSize();

  // There's no overhead for sinking the release barrier in a weak cmpxchg, so
  // do it even on minsize.
  bool UseUnconditionalReleaseBarrier = F->hasMinSize() && !CI->isWeak();

  // Given: cmpxchg some_op iN* %addr, iN %desired, iN %new success_ord fail_ord
  //
  // The full expansion we produce is:
  //     [...]
  //     %aligned.addr = ...
  // cmpxchg.start:
  //     %unreleasedload = @load.linked(%aligned.addr)
  //     %unreleasedload.extract = extract value from %unreleasedload
  //     %should_store = icmp eq %unreleasedload.extract, %desired
  //     br i1 %should_store, label %cmpxchg.releasingstore,
  //                          label %cmpxchg.nostore
  // cmpxchg.releasingstore:
  //     fence?
  //     br label cmpxchg.trystore
  // cmpxchg.trystore:
  //     %loaded.trystore = phi [%unreleasedload, %cmpxchg.releasingstore],
  //                            [%releasedload, %cmpxchg.releasedload]
  //     %updated.new = insert %new into %loaded.trystore
  //     %stored = @store_conditional(%updated.new, %aligned.addr)
  //     %success = icmp eq i32 %stored, 0
  //     br i1 %success, label %cmpxchg.success,
  //                     label %cmpxchg.releasedload/%cmpxchg.failure
  // cmpxchg.releasedload:
  //     %releasedload = @load.linked(%aligned.addr)
  //     %releasedload.extract = extract value from %releasedload
  //     %should_store = icmp eq %releasedload.extract, %desired
  //     br i1 %should_store, label %cmpxchg.trystore,
  //                          label %cmpxchg.failure
  // cmpxchg.success:
  //     fence?
  //     br label %cmpxchg.end
  // cmpxchg.nostore:
  //     %loaded.nostore = phi [%unreleasedload, %cmpxchg.start],
  //                           [%releasedload,
  //                               %cmpxchg.releasedload/%cmpxchg.trystore]
  //     @load_linked_fail_balance()?
  //     br label %cmpxchg.failure
  // cmpxchg.failure:
  //     fence?
  //     br label %cmpxchg.end
  // cmpxchg.end:
  //     %loaded.exit = phi [%loaded.nostore, %cmpxchg.failure],
  //                        [%loaded.trystore, %cmpxchg.trystore]
  //     %success = phi i1 [true, %cmpxchg.success], [false, %cmpxchg.failure]
  //     %loaded = extract value from %loaded.exit
  //     %restmp = insertvalue { iN, i1 } undef, iN %loaded, 0
  //     %res = insertvalue { iN, i1 } %restmp, i1 %success, 1
  //     [...]
  BasicBlock *ExitBB = BB->splitBasicBlock(CI->getIterator(), "cmpxchg.end");
  auto FailureBB = BasicBlock::Create(Ctx, "cmpxchg.failure", F, ExitBB);
  auto NoStoreBB = BasicBlock::Create(Ctx, "cmpxchg.nostore", F, FailureBB);
  auto SuccessBB = BasicBlock::Create(Ctx, "cmpxchg.success", F, NoStoreBB);
  auto ReleasedLoadBB =
      BasicBlock::Create(Ctx, "cmpxchg.releasedload", F, SuccessBB);
  auto TryStoreBB =
      BasicBlock::Create(Ctx, "cmpxchg.trystore", F, ReleasedLoadBB);
  auto ReleasingStoreBB =
      BasicBlock::Create(Ctx, "cmpxchg.fencedstore", F, TryStoreBB);
  auto StartBB = BasicBlock::Create(Ctx, "cmpxchg.start", F, ReleasingStoreBB);

  // This grabs the DebugLoc from CI
  IRBuilder<> Builder(CI);

  // The split call above "helpfully" added a branch at the end of BB (to the
  // wrong place), but we might want a fence too. It's easiest to just remove
  // the branch entirely.
  std::prev(BB->end())->eraseFromParent();
  Builder.SetInsertPoint(BB);
  if (ShouldInsertFencesForAtomic && UseUnconditionalReleaseBarrier)
    TLI->emitLeadingFence(Builder, CI, SuccessOrder);

  PartwordMaskValues PMV =
      createMaskInstrs(Builder, CI, CI->getCompareOperand()->getType(), Addr,
                       TLI->getMinCmpXchgSizeInBits() / 8);
  Builder.CreateBr(StartBB);

  // Start the main loop block now that we've taken care of the preliminaries.
  Builder.SetInsertPoint(StartBB);
  Value *UnreleasedLoad =
      TLI->emitLoadLinked(Builder, PMV.AlignedAddr, MemOpOrder);
  Value *UnreleasedLoadExtract =
      extractMaskedValue(Builder, UnreleasedLoad, PMV);
  Value *ShouldStore = Builder.CreateICmpEQ(
      UnreleasedLoadExtract, CI->getCompareOperand(), "should_store");

  // If the cmpxchg doesn't actually need any ordering when it fails, we can
  // jump straight past that fence instruction (if it exists).
  Builder.CreateCondBr(ShouldStore, ReleasingStoreBB, NoStoreBB);

  Builder.SetInsertPoint(ReleasingStoreBB);
  if (ShouldInsertFencesForAtomic && !UseUnconditionalReleaseBarrier)
    TLI->emitLeadingFence(Builder, CI, SuccessOrder);
  Builder.CreateBr(TryStoreBB);

  Builder.SetInsertPoint(TryStoreBB);
  PHINode *LoadedTryStore =
      Builder.CreatePHI(PMV.WordType, 2, "loaded.trystore");
  LoadedTryStore->addIncoming(UnreleasedLoad, ReleasingStoreBB);
  Value *NewValueInsert =
      insertMaskedValue(Builder, LoadedTryStore, CI->getNewValOperand(), PMV);
  Value *StoreSuccess =
      TLI->emitStoreConditional(Builder, NewValueInsert, PMV.AlignedAddr,
                                MemOpOrder);
  StoreSuccess = Builder.CreateICmpEQ(
      StoreSuccess, ConstantInt::get(Type::getInt32Ty(Ctx), 0), "success");
  BasicBlock *RetryBB = HasReleasedLoadBB ? ReleasedLoadBB : StartBB;
  Builder.CreateCondBr(StoreSuccess, SuccessBB,
                       CI->isWeak() ? FailureBB : RetryBB);

  Builder.SetInsertPoint(ReleasedLoadBB);
  Value *SecondLoad;
  if (HasReleasedLoadBB) {
    SecondLoad = TLI->emitLoadLinked(Builder, PMV.AlignedAddr, MemOpOrder);
    Value *SecondLoadExtract = extractMaskedValue(Builder, SecondLoad, PMV);
    ShouldStore = Builder.CreateICmpEQ(SecondLoadExtract,
                                       CI->getCompareOperand(), "should_store");

    // If the cmpxchg doesn't actually need any ordering when it fails, we can
    // jump straight past that fence instruction (if it exists).
    Builder.CreateCondBr(ShouldStore, TryStoreBB, NoStoreBB);
    // Update PHI node in TryStoreBB.
    LoadedTryStore->addIncoming(SecondLoad, ReleasedLoadBB);
  } else
    Builder.CreateUnreachable();

  // Make sure later instructions don't get reordered with a fence if
  // necessary.
  Builder.SetInsertPoint(SuccessBB);
  if (ShouldInsertFencesForAtomic)
    TLI->emitTrailingFence(Builder, CI, SuccessOrder);
  Builder.CreateBr(ExitBB);

  Builder.SetInsertPoint(NoStoreBB);
  PHINode *LoadedNoStore =
      Builder.CreatePHI(UnreleasedLoad->getType(), 2, "loaded.nostore");
  LoadedNoStore->addIncoming(UnreleasedLoad, StartBB);
  if (HasReleasedLoadBB)
    LoadedNoStore->addIncoming(SecondLoad, ReleasedLoadBB);

  // In the failing case, where we don't execute the store-conditional, the
  // target might want to balance out the load-linked with a dedicated
  // instruction (e.g., on ARM, clearing the exclusive monitor).
  TLI->emitAtomicCmpXchgNoStoreLLBalance(Builder);
  Builder.CreateBr(FailureBB);

  Builder.SetInsertPoint(FailureBB);
  PHINode *LoadedFailure =
      Builder.CreatePHI(UnreleasedLoad->getType(), 2, "loaded.failure");
  LoadedFailure->addIncoming(LoadedNoStore, NoStoreBB);
  if (CI->isWeak())
    LoadedFailure->addIncoming(LoadedTryStore, TryStoreBB);
  if (ShouldInsertFencesForAtomic)
    TLI->emitTrailingFence(Builder, CI, FailureOrder);
  Builder.CreateBr(ExitBB);

  // Finally, we have control-flow based knowledge of whether the cmpxchg
  // succeeded or not. We expose this to later passes by converting any
  // subsequent "icmp eq/ne %loaded, %oldval" into a use of an appropriate
  // PHI.
  Builder.SetInsertPoint(ExitBB, ExitBB->begin());
  PHINode *LoadedExit =
      Builder.CreatePHI(UnreleasedLoad->getType(), 2, "loaded.exit");
  LoadedExit->addIncoming(LoadedTryStore, SuccessBB);
  LoadedExit->addIncoming(LoadedFailure, FailureBB);
  PHINode *Success = Builder.CreatePHI(Type::getInt1Ty(Ctx), 2, "success");
  Success->addIncoming(ConstantInt::getTrue(Ctx), SuccessBB);
  Success->addIncoming(ConstantInt::getFalse(Ctx), FailureBB);

  // This is the "exit value" from the cmpxchg expansion. It may be of
  // a type wider than the one in the cmpxchg instruction.
  Value *LoadedFull = LoadedExit;

  Builder.SetInsertPoint(ExitBB, std::next(Success->getIterator()));
  Value *Loaded = extractMaskedValue(Builder, LoadedFull, PMV);

  // Look for any users of the cmpxchg that are just comparing the loaded value
  // against the desired one, and replace them with the CFG-derived version.
  SmallVector<ExtractValueInst *, 2> PrunedInsts;
  for (auto User : CI->users()) {
    ExtractValueInst *EV = dyn_cast<ExtractValueInst>(User);
    if (!EV)
      continue;

    assert(EV->getNumIndices() == 1 && EV->getIndices()[0] <= 1 &&
           "weird extraction from { iN, i1 }");

    if (EV->getIndices()[0] == 0)
      EV->replaceAllUsesWith(Loaded);
    else
      EV->replaceAllUsesWith(Success);

    PrunedInsts.push_back(EV);
  }

  // We can remove the instructions now we're no longer iterating through them.
  for (auto EV : PrunedInsts)
    EV->eraseFromParent();

  if (!CI->use_empty()) {
    // Some use of the full struct return that we don't understand has
    // happened, so we've got to reconstruct it properly.
    Value *Res;
    Res = Builder.CreateInsertValue(UndefValue::get(CI->getType()), Loaded, 0);
    Res = Builder.CreateInsertValue(Res, Success, 1);

    CI->replaceAllUsesWith(Res);
  }

  CI->eraseFromParent();
  return true;
}

bool AtomicExpand::isIdempotentRMW(AtomicRMWInst *RMWI) {
  auto C = dyn_cast<ConstantInt>(RMWI->getValOperand());
  if (!C)
    return false;

  AtomicRMWInst::BinOp Op = RMWI->getOperation();
  switch (Op) {
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
    return C->isZero();
  case AtomicRMWInst::And:
    return C->isMinusOne();
  // FIXME: we could also treat Min/Max/UMin/UMax by the INT_MIN/INT_MAX/...
  default:
    return false;
  }
}

bool AtomicExpand::simplifyIdempotentRMW(AtomicRMWInst *RMWI) {
  if (auto ResultingLoad = TLI->lowerIdempotentRMWIntoFencedLoad(RMWI)) {
    tryExpandAtomicLoad(ResultingLoad);
    return true;
  }
  return false;
}

Value *AtomicExpand::insertRMWCmpXchgLoop(
    IRBuilder<> &Builder, Type *ResultTy, Value *Addr,
    AtomicOrdering MemOpOrder,
    function_ref<Value *(IRBuilder<> &, Value *)> PerformOp,
    CreateCmpXchgInstFun CreateCmpXchg) {
  LLVMContext &Ctx = Builder.getContext();
  BasicBlock *BB = Builder.GetInsertBlock();
  Function *F = BB->getParent();

  // Given: atomicrmw some_op iN* %addr, iN %incr ordering
  //
  // The standard expansion we produce is:
  //     [...]
  //     %init_loaded = load atomic iN* %addr
  //     br label %loop
  // loop:
  //     %loaded = phi iN [ %init_loaded, %entry ], [ %new_loaded, %loop ]
  //     %new = some_op iN %loaded, %incr
  //     %pair = cmpxchg iN* %addr, iN %loaded, iN %new
  //     %new_loaded = extractvalue { iN, i1 } %pair, 0
  //     %success = extractvalue { iN, i1 } %pair, 1
  //     br i1 %success, label %atomicrmw.end, label %loop
  // atomicrmw.end:
  //     [...]
  BasicBlock *ExitBB =
      BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
  BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB);

  // The split call above "helpfully" added a branch at the end of BB (to the
  // wrong place), but we want a load. It's easiest to just remove
  // the branch entirely.
  std::prev(BB->end())->eraseFromParent();
  Builder.SetInsertPoint(BB);
  LoadInst *InitLoaded = Builder.CreateLoad(ResultTy, Addr);
  // Atomics require at least natural alignment.
  InitLoaded->setAlignment(Align(ResultTy->getPrimitiveSizeInBits() / 8));
  Builder.CreateBr(LoopBB);

  // Start the main loop block now that we've taken care of the preliminaries.
  Builder.SetInsertPoint(LoopBB);
  PHINode *Loaded = Builder.CreatePHI(ResultTy, 2, "loaded");
  Loaded->addIncoming(InitLoaded, BB);

  Value *NewVal = PerformOp(Builder, Loaded);

  Value *NewLoaded = nullptr;
  Value *Success = nullptr;

  CreateCmpXchg(Builder, Addr, Loaded, NewVal,
                MemOpOrder == AtomicOrdering::Unordered
                    ? AtomicOrdering::Monotonic
  CreateCmpXchg(Builder, Addr, Loaded, NewVal,
                MemOpOrder == AtomicOrdering::Unordered
                    ? AtomicOrdering::Monotonic
                    : MemOpOrder,
                Success, NewLoaded);
  assert(Success && NewLoaded);

  Loaded->addIncoming(NewLoaded, LoopBB);

  Builder.CreateCondBr(Success, ExitBB, LoopBB);

  Builder.SetInsertPoint(ExitBB, ExitBB->begin());
  return NewLoaded;
}

bool AtomicExpand::tryExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
  unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8;
  unsigned ValueSize = getAtomicOpSize(CI);

  switch (TLI->shouldExpandAtomicCmpXchgInIR(CI)) {
  default:
    llvm_unreachable("Unhandled case in tryExpandAtomicCmpXchg");
  case TargetLoweringBase::AtomicExpansionKind::None:
    if (ValueSize < MinCASSize)
      return expandPartwordCmpXchg(CI);
    return false;
  case TargetLoweringBase::AtomicExpansionKind::LLSC: {
    return expandAtomicCmpXchg(CI);
  }
  case TargetLoweringBase::AtomicExpansionKind::MaskedIntrinsic:
    expandAtomicCmpXchgToMaskedIntrinsic(CI);
    return true;
  }
}

// Note: This function is exposed externally by AtomicExpandUtils.h
bool llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI,
                                    CreateCmpXchgInstFun CreateCmpXchg) {
  IRBuilder<> Builder(AI);
  Value *Loaded = AtomicExpand::insertRMWCmpXchgLoop(
      Builder, AI->getType(), AI->getPointerOperand(), AI->getOrdering(),
      [&](IRBuilder<> &Builder, Value *Loaded) {
        return performAtomicOp(AI->getOperation(), Builder, Loaded,
                               AI->getValOperand());
      },
      CreateCmpXchg);

  AI->replaceAllUsesWith(Loaded);
  AI->eraseFromParent();
  return true;
}

// In order to use one of the sized library calls such as
// __atomic_fetch_add_4, the alignment must be sufficient, the size
// must be one of the potentially-specialized sizes, and the value
// type must actually exist in C on the target (otherwise, the
// function wouldn't actually be defined.)
static bool canUseSizedAtomicCall(unsigned Size, Align Alignment,
                                  const DataLayout &DL) {
  // TODO: "LargestSize" is an approximation for "largest type that
  // you can express in C". It seems to be the case that int128 is
  // supported on all 64-bit platforms, otherwise only up to 64-bit
  // integers are supported. If we get this wrong, then we'll try to
  // call a sized libcall that doesn't actually exist. There should
  // really be some more reliable way in LLVM of determining integer
  // sizes which are valid in the target's C ABI...
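  // For example, on a typical 64-bit target an aligned 16-byte operation can
  // use __atomic_compare_exchange_16 and friends, while a 32-bit-only target
  // caps the sized calls at 8 bytes, so anything larger must go through the
  // generic __atomic_* entry points.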
  unsigned LargestSize = DL.getLargestLegalIntTypeSizeInBits() >= 64 ? 16 : 8;
  return Alignment >= Size &&
         (Size == 1 || Size == 2 || Size == 4 || Size == 8 || Size == 16) &&
         Size <= LargestSize;
}

void AtomicExpand::expandAtomicLoadToLibcall(LoadInst *I) {
  static const RTLIB::Libcall Libcalls[6] = {
      RTLIB::ATOMIC_LOAD,   RTLIB::ATOMIC_LOAD_1, RTLIB::ATOMIC_LOAD_2,
      RTLIB::ATOMIC_LOAD_4, RTLIB::ATOMIC_LOAD_8, RTLIB::ATOMIC_LOAD_16};
  unsigned Size = getAtomicOpSize(I);

  bool expanded = expandAtomicOpToLibcall(
      I, Size, I->getAlign(), I->getPointerOperand(), nullptr, nullptr,
      I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls);
  (void)expanded;
  assert(expanded && "expandAtomicOpToLibcall shouldn't fail for Load");
}

void AtomicExpand::expandAtomicStoreToLibcall(StoreInst *I) {
  static const RTLIB::Libcall Libcalls[6] = {
      RTLIB::ATOMIC_STORE,   RTLIB::ATOMIC_STORE_1, RTLIB::ATOMIC_STORE_2,
      RTLIB::ATOMIC_STORE_4, RTLIB::ATOMIC_STORE_8, RTLIB::ATOMIC_STORE_16};
  unsigned Size = getAtomicOpSize(I);

  bool expanded = expandAtomicOpToLibcall(
      I, Size, I->getAlign(), I->getPointerOperand(), I->getValueOperand(),
      nullptr, I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls);
  (void)expanded;
  assert(expanded && "expandAtomicOpToLibcall shouldn't fail for Store");
}

void AtomicExpand::expandAtomicCASToLibcall(AtomicCmpXchgInst *I) {
  static const RTLIB::Libcall Libcalls[6] = {
      RTLIB::ATOMIC_COMPARE_EXCHANGE,   RTLIB::ATOMIC_COMPARE_EXCHANGE_1,
      RTLIB::ATOMIC_COMPARE_EXCHANGE_2, RTLIB::ATOMIC_COMPARE_EXCHANGE_4,
      RTLIB::ATOMIC_COMPARE_EXCHANGE_8, RTLIB::ATOMIC_COMPARE_EXCHANGE_16};
  unsigned Size = getAtomicOpSize(I);

  bool expanded = expandAtomicOpToLibcall(
      I, Size, I->getAlign(), I->getPointerOperand(), I->getNewValOperand(),
      I->getCompareOperand(), I->getSuccessOrdering(), I->getFailureOrdering(),
      Libcalls);
  (void)expanded;
  assert(expanded && "expandAtomicOpToLibcall shouldn't fail for CAS");
}

static ArrayRef<RTLIB::Libcall> GetRMWLibcall(AtomicRMWInst::BinOp Op) {
  static const RTLIB::Libcall LibcallsXchg[6] = {
      RTLIB::ATOMIC_EXCHANGE,   RTLIB::ATOMIC_EXCHANGE_1,
      RTLIB::ATOMIC_EXCHANGE_2, RTLIB::ATOMIC_EXCHANGE_4,
      RTLIB::ATOMIC_EXCHANGE_8, RTLIB::ATOMIC_EXCHANGE_16};
  static const RTLIB::Libcall LibcallsAdd[6] = {
      RTLIB::UNKNOWN_LIBCALL,    RTLIB::ATOMIC_FETCH_ADD_1,
      RTLIB::ATOMIC_FETCH_ADD_2, RTLIB::ATOMIC_FETCH_ADD_4,
      RTLIB::ATOMIC_FETCH_ADD_8, RTLIB::ATOMIC_FETCH_ADD_16};
  static const RTLIB::Libcall LibcallsSub[6] = {
      RTLIB::UNKNOWN_LIBCALL,    RTLIB::ATOMIC_FETCH_SUB_1,
      RTLIB::ATOMIC_FETCH_SUB_2, RTLIB::ATOMIC_FETCH_SUB_4,
      RTLIB::ATOMIC_FETCH_SUB_8, RTLIB::ATOMIC_FETCH_SUB_16};
  static const RTLIB::Libcall LibcallsAnd[6] = {
      RTLIB::UNKNOWN_LIBCALL,    RTLIB::ATOMIC_FETCH_AND_1,
      RTLIB::ATOMIC_FETCH_AND_2, RTLIB::ATOMIC_FETCH_AND_4,
      RTLIB::ATOMIC_FETCH_AND_8, RTLIB::ATOMIC_FETCH_AND_16};
  static const RTLIB::Libcall LibcallsOr[6] = {
      RTLIB::UNKNOWN_LIBCALL,   RTLIB::ATOMIC_FETCH_OR_1,
      RTLIB::ATOMIC_FETCH_OR_2, RTLIB::ATOMIC_FETCH_OR_4,
      RTLIB::ATOMIC_FETCH_OR_8, RTLIB::ATOMIC_FETCH_OR_16};
  static const RTLIB::Libcall LibcallsXor[6] = {
      RTLIB::UNKNOWN_LIBCALL,    RTLIB::ATOMIC_FETCH_XOR_1,
      RTLIB::ATOMIC_FETCH_XOR_2, RTLIB::ATOMIC_FETCH_XOR_4,
      RTLIB::ATOMIC_FETCH_XOR_8, RTLIB::ATOMIC_FETCH_XOR_16};
  static const RTLIB::Libcall LibcallsNand[6] = {
      RTLIB::UNKNOWN_LIBCALL,     RTLIB::ATOMIC_FETCH_NAND_1,
      RTLIB::ATOMIC_FETCH_NAND_2, RTLIB::ATOMIC_FETCH_NAND_4,
      RTLIB::ATOMIC_FETCH_NAND_8, RTLIB::ATOMIC_FETCH_NAND_16};

  switch (Op) {
  case AtomicRMWInst::BAD_BINOP:
    llvm_unreachable("Should not have BAD_BINOP.");
  case AtomicRMWInst::Xchg:
    return makeArrayRef(LibcallsXchg);
  case AtomicRMWInst::Add:
    return makeArrayRef(LibcallsAdd);
  case AtomicRMWInst::Sub:
    return makeArrayRef(LibcallsSub);
  case AtomicRMWInst::And:
    return makeArrayRef(LibcallsAnd);
  case AtomicRMWInst::Or:
    return makeArrayRef(LibcallsOr);
  case AtomicRMWInst::Xor:
    return makeArrayRef(LibcallsXor);
  case AtomicRMWInst::Nand:
    return makeArrayRef(LibcallsNand);
  case AtomicRMWInst::Max:
  case AtomicRMWInst::Min:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::UMin:
  case AtomicRMWInst::FAdd:
  case AtomicRMWInst::FSub:
    // No atomic libcalls are available for max/min/umax/umin or the
    // floating-point operations; the caller falls back to a cmpxchg loop.
    return {};
  }
  llvm_unreachable("Unexpected AtomicRMW operation.");
}

void AtomicExpand::expandAtomicRMWToLibcall(AtomicRMWInst *I) {
  ArrayRef<RTLIB::Libcall> Libcalls = GetRMWLibcall(I->getOperation());

  unsigned Size = getAtomicOpSize(I);

  bool Success = false;
  if (!Libcalls.empty())
    Success = expandAtomicOpToLibcall(
        I, Size, I->getAlign(), I->getPointerOperand(), I->getValOperand(),
        nullptr, I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls);

  // The expansion failed: either there were no libcalls at all for
  // the operation (min/max), or there were only size-specialized
  // libcalls (add/sub/etc) and we needed a generic. So, expand to a
  // CAS libcall, via a CAS loop, instead.
  if (!Success) {
    expandAtomicRMWToCmpXchg(I, [this](IRBuilder<> &Builder, Value *Addr,
                                       Value *Loaded, Value *NewVal,
                                       AtomicOrdering MemOpOrder,
                                       Value *&Success, Value *&NewLoaded) {
      // Create the CAS instruction normally...
      AtomicCmpXchgInst *Pair = Builder.CreateAtomicCmpXchg(
          Addr, Loaded, NewVal, MemOpOrder,
          AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder));
      Success = Builder.CreateExtractValue(Pair, 1, "success");
      NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded");

      // ...and then expand the CAS into a libcall.
      expandAtomicCASToLibcall(Pair);
    });
  }
}

// A helper routine for the above expandAtomic*ToLibcall functions.
//
// 'Libcalls' contains an array of enum values for the particular
// ATOMIC libcalls to be emitted. All of the other arguments besides
// 'I' are extracted from the Instruction subclass by the
// caller. Depending on the particular call, some will be null.
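//
// As a rough illustration (operand names are made up), a sufficiently
// aligned 'atomicrmw add i32* %p, i32 1 seq_cst' ends up as a call to the
// sized helper
//   i32 @__atomic_fetch_add_4(i8* %p, i32 1, i32 5)
// where 5 is the C ABI encoding of seq_cst, whereas an under-aligned i32
// exchange falls back to the generic form (assuming a 64-bit size_t)
//   void @__atomic_exchange(i64 4, i8* %p, i8* %val, i8* %ret, i32 5)
// with the value and result operands spilled to stack temporaries.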
bool AtomicExpand::expandAtomicOpToLibcall(
    Instruction *I, unsigned Size, Align Alignment, Value *PointerOperand,
    Value *ValueOperand, Value *CASExpected, AtomicOrdering Ordering,
    AtomicOrdering Ordering2, ArrayRef<RTLIB::Libcall> Libcalls) {
  assert(Libcalls.size() == 6);

  LLVMContext &Ctx = I->getContext();
  Module *M = I->getModule();
  const DataLayout &DL = M->getDataLayout();
  IRBuilder<> Builder(I);
  IRBuilder<> AllocaBuilder(&I->getFunction()->getEntryBlock().front());

  bool UseSizedLibcall = canUseSizedAtomicCall(Size, Alignment, DL);
  Type *SizedIntTy = Type::getIntNTy(Ctx, Size * 8);

  const Align AllocaAlignment = DL.getPrefTypeAlign(SizedIntTy);

  // TODO: the "order" argument type is "int", not int32. So
  // getInt32Ty may be wrong if the arch uses e.g. 16-bit ints.
  ConstantInt *SizeVal64 = ConstantInt::get(Type::getInt64Ty(Ctx), Size);
  assert(Ordering != AtomicOrdering::NotAtomic && "expect atomic MO");
  Constant *OrderingVal =
      ConstantInt::get(Type::getInt32Ty(Ctx), (int)toCABI(Ordering));
  Constant *Ordering2Val = nullptr;
  if (CASExpected) {
    assert(Ordering2 != AtomicOrdering::NotAtomic && "expect atomic MO");
    Ordering2Val =
        ConstantInt::get(Type::getInt32Ty(Ctx), (int)toCABI(Ordering2));
  }
  bool HasResult = I->getType() != Type::getVoidTy(Ctx);

  RTLIB::Libcall RTLibType;
  if (UseSizedLibcall) {
    switch (Size) {
    case 1: RTLibType = Libcalls[1]; break;
    case 2: RTLibType = Libcalls[2]; break;
    case 4: RTLibType = Libcalls[3]; break;
    case 8: RTLibType = Libcalls[4]; break;
    case 16: RTLibType = Libcalls[5]; break;
    }
  } else if (Libcalls[0] != RTLIB::UNKNOWN_LIBCALL) {
    RTLibType = Libcalls[0];
  } else {
    // Can't use sized function, and there's no generic for this
    // operation, so give up.
    return false;
  }

  // Build up the function call. There are two kinds. First, the sized
  // variants. These calls are going to be one of the following (with
  // N=1,2,4,8,16):
  //  iN    __atomic_load_N(iN *ptr, int ordering)
  //  void  __atomic_store_N(iN *ptr, iN val, int ordering)
  //  iN    __atomic_{exchange|fetch_*}_N(iN *ptr, iN val, int ordering)
  //  bool  __atomic_compare_exchange_N(iN *ptr, iN *expected, iN desired,
  //                                    int success_order, int failure_order)
  //
  // Note that these functions can be used for non-integer atomic
  // operations, the values just need to be bitcast to integers on the
  // way in and out.
  //
  // And, then, the generic variants. They look like the following:
  //  void __atomic_load(size_t size, void *ptr, void *ret, int ordering)
  //  void __atomic_store(size_t size, void *ptr, void *val, int ordering)
  //  void __atomic_exchange(size_t size, void *ptr, void *val, void *ret,
  //                         int ordering)
  //  bool __atomic_compare_exchange(size_t size, void *ptr, void *expected,
  //                                 void *desired, int success_order,
  //                                 int failure_order)
  //
  // The different signatures are built up depending on the
  // 'UseSizedLibcall', 'CASExpected', 'ValueOperand', and 'HasResult'
  // variables.
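  //
  // The arguments are collected below in call order: the explicit size
  // (generic form only), the pointer, 'expected' (CAS only), the 'val' or
  // 'desired' operand, the 'ret' buffer (generic form with a result only),
  // and finally the ordering argument(s). By-pointer temporaries are spilled
  // to entry-block allocas bracketed by lifetime markers.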
  AllocaInst *AllocaCASExpected = nullptr;
  Value *AllocaCASExpected_i8 = nullptr;
  AllocaInst *AllocaValue = nullptr;
  Value *AllocaValue_i8 = nullptr;
  AllocaInst *AllocaResult = nullptr;
  Value *AllocaResult_i8 = nullptr;

  Type *ResultTy;
  SmallVector<Value *, 6> Args;
  AttributeList Attr;

  // 'size' argument.
  if (!UseSizedLibcall) {
    // Note, getIntPtrType is assumed equivalent to size_t.
    Args.push_back(ConstantInt::get(DL.getIntPtrType(Ctx), Size));
  }

  // 'ptr' argument.
  // Note: This assumes all address spaces share a common libfunc
  // implementation and that addresses are convertible. For systems without
  // that property, we'd need to extend this mechanism to support AS-specific
  // families of atomic intrinsics.
  auto PtrTypeAS = PointerOperand->getType()->getPointerAddressSpace();
  Value *PtrVal = Builder.CreateBitCast(PointerOperand,
                                        Type::getInt8PtrTy(Ctx, PtrTypeAS));
  PtrVal = Builder.CreateAddrSpaceCast(PtrVal, Type::getInt8PtrTy(Ctx));
  Args.push_back(PtrVal);

  // 'expected' argument, if present.
  if (CASExpected) {
    AllocaCASExpected = AllocaBuilder.CreateAlloca(CASExpected->getType());
    AllocaCASExpected->setAlignment(AllocaAlignment);
    unsigned AllocaAS = AllocaCASExpected->getType()->getPointerAddressSpace();

    AllocaCASExpected_i8 = Builder.CreateBitCast(
        AllocaCASExpected, Type::getInt8PtrTy(Ctx, AllocaAS));
    Builder.CreateLifetimeStart(AllocaCASExpected_i8, SizeVal64);
    Builder.CreateAlignedStore(CASExpected, AllocaCASExpected, AllocaAlignment);
    Args.push_back(AllocaCASExpected_i8);
  }

  // 'val' argument ('desired' for cas), if present.
  if (ValueOperand) {
    if (UseSizedLibcall) {
      Value *IntValue =
          Builder.CreateBitOrPointerCast(ValueOperand, SizedIntTy);
      Args.push_back(IntValue);
    } else {
      AllocaValue = AllocaBuilder.CreateAlloca(ValueOperand->getType());
      AllocaValue->setAlignment(AllocaAlignment);
      AllocaValue_i8 =
          Builder.CreateBitCast(AllocaValue, Type::getInt8PtrTy(Ctx));
      Builder.CreateLifetimeStart(AllocaValue_i8, SizeVal64);
      Builder.CreateAlignedStore(ValueOperand, AllocaValue, AllocaAlignment);
      Args.push_back(AllocaValue_i8);
    }
  }

  // 'ret' argument.
  if (!CASExpected && HasResult && !UseSizedLibcall) {
    AllocaResult = AllocaBuilder.CreateAlloca(I->getType());
    AllocaResult->setAlignment(AllocaAlignment);
    unsigned AllocaAS = AllocaResult->getType()->getPointerAddressSpace();
    AllocaResult_i8 =
        Builder.CreateBitCast(AllocaResult, Type::getInt8PtrTy(Ctx, AllocaAS));
    Builder.CreateLifetimeStart(AllocaResult_i8, SizeVal64);
    Args.push_back(AllocaResult_i8);
  }

  // 'ordering' ('success_order' for cas) argument.
  Args.push_back(OrderingVal);

  // 'failure_order' argument, if present.
  if (Ordering2Val)
    Args.push_back(Ordering2Val);

  // Now, the return type.
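  // CAS libcalls return a zero-extended bool, sized value-returning calls
  // return the loaded value directly, and everything else (including the
  // generic value-returning forms, which write through 'ret') returns void.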
  if (CASExpected) {
    ResultTy = Type::getInt1Ty(Ctx);
    Attr = Attr.addAttribute(Ctx, AttributeList::ReturnIndex, Attribute::ZExt);
  } else if (HasResult && UseSizedLibcall)
    ResultTy = SizedIntTy;
  else
    ResultTy = Type::getVoidTy(Ctx);

  // Done with setting up arguments and return types, create the call:
  SmallVector<Type *, 6> ArgTys;
  for (Value *Arg : Args)
    ArgTys.push_back(Arg->getType());
  FunctionType *FnType = FunctionType::get(ResultTy, ArgTys, false);
  FunctionCallee LibcallFn =
      M->getOrInsertFunction(TLI->getLibcallName(RTLibType), FnType, Attr);
  CallInst *Call = Builder.CreateCall(LibcallFn, Args);
  Call->setAttributes(Attr);
  Value *Result = Call;

  // And then, extract the results...
  if (ValueOperand && !UseSizedLibcall)
    Builder.CreateLifetimeEnd(AllocaValue_i8, SizeVal64);

  if (CASExpected) {
    // The final result from the CAS is {load of 'expected' alloca, bool result
    // from call}
    Type *FinalResultTy = I->getType();
    Value *V = UndefValue::get(FinalResultTy);
    Value *ExpectedOut = Builder.CreateAlignedLoad(
        CASExpected->getType(), AllocaCASExpected, AllocaAlignment);
    Builder.CreateLifetimeEnd(AllocaCASExpected_i8, SizeVal64);
    V = Builder.CreateInsertValue(V, ExpectedOut, 0);
    V = Builder.CreateInsertValue(V, Result, 1);
    I->replaceAllUsesWith(V);
  } else if (HasResult) {
    Value *V;
    if (UseSizedLibcall)
      V = Builder.CreateBitOrPointerCast(Result, I->getType());
    else {
      V = Builder.CreateAlignedLoad(I->getType(), AllocaResult,
                                    AllocaAlignment);
      Builder.CreateLifetimeEnd(AllocaResult_i8, SizeVal64);
    }
    I->replaceAllUsesWith(V);
  }
  I->eraseFromParent();
  return true;
}