//===-- AMDGPULateCodeGenPrepare.cpp --------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR *just* before instruction
/// selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Utils/Local.h"

#define DEBUG_TYPE "amdgpu-late-codegenprepare"

using namespace llvm;

// Scalar load widening needs to run after the load-store vectorizer, as that
// pass doesn't handle overlapping cases. In addition, this pass enhances the
// widening to handle cases where scalar sub-dword loads are naturally aligned
// but not dword aligned.
static cl::opt<bool>
    WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads",
               cl::desc("Widen sub-dword constant address space loads in "
                        "AMDGPULateCodeGenPrepare"),
               cl::ReallyHidden, cl::init(true));

namespace {

class AMDGPULateCodeGenPrepare
    : public FunctionPass,
      public InstVisitor<AMDGPULateCodeGenPrepare, bool> {
  Module *Mod = nullptr;
  const DataLayout *DL = nullptr;

  AssumptionCache *AC = nullptr;
  LegacyDivergenceAnalysis *DA = nullptr;

public:
  static char ID;

  AMDGPULateCodeGenPrepare() : FunctionPass(ID) {}

  StringRef getPassName() const override {
    return "AMDGPU IR late optimizations";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<LegacyDivergenceAnalysis>();
    AU.setPreservesAll();
  }

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  bool visitInstruction(Instruction &) { return false; }

  // Check if the specified value is at least DWORD aligned.
  bool isDWORDAligned(const Value *V) const {
    KnownBits Known = computeKnownBits(V, *DL, 0, AC);
    return Known.countMinTrailingZeros() >= 2;
  }

  bool canWidenScalarExtLoad(LoadInst &LI) const;
  bool visitLoadInst(LoadInst &LI);
};

} // end anonymous namespace

bool AMDGPULateCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  DL = &Mod->getDataLayout();
  return false;
}

bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  DA = &getAnalysis<LegacyDivergenceAnalysis>();

  bool Changed = false;
  for (auto &BB : F)
    for (auto BI = BB.begin(), BE = BB.end(); BI != BE; /*EMPTY*/) {
      Instruction *I = &*BI++;
      Changed |= visit(*I);
    }

  return Changed;
}

bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
  unsigned AS = LI.getPointerAddressSpace();
  // Skip non-constant address spaces.
  if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
      AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;
  // Skip non-simple loads.
  if (!LI.isSimple())
    return false;
  auto *Ty = LI.getType();
  // Skip aggregate types.
  if (Ty->isAggregateType())
    return false;
  unsigned TySize = DL->getTypeStoreSize(Ty);
  // Only handle sub-DWORD loads.
  if (TySize >= 4)
    return false;
  // The load must be at least naturally aligned.
  if (LI.getAlign() < DL->getABITypeAlign(Ty))
    return false;
  // It should be uniform, i.e. a scalar load.
  return DA->isUniform(&LI);
}

bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
  if (!WidenLoads)
    return false;

  // Skip if the load is already at least DWORD aligned, as that case is
  // handled in SDAG.
  if (LI.getAlign() >= 4)
    return false;

  if (!canWidenScalarExtLoad(LI))
    return false;

  int64_t Offset = 0;
  auto *Base =
      GetPointerBaseWithConstantOffset(LI.getPointerOperand(), Offset, *DL);
  // If the base is not DWORD aligned, it's not safe to perform the following
  // transform.
  if (!isDWORDAligned(Base))
    return false;

  int64_t Adjust = Offset & 0x3;
  if (Adjust == 0) {
    // With a zero adjustment, the original alignment can simply be promoted
    // to DWORD alignment.
    LI.setAlignment(Align(4));
    return true;
  }

  IRBuilder<> IRB(&LI);
  IRB.SetCurrentDebugLocation(LI.getDebugLoc());

  unsigned AS = LI.getPointerAddressSpace();
  unsigned LdBits = DL->getTypeStoreSize(LI.getType()) * 8;
  auto IntNTy = Type::getIntNTy(LI.getContext(), LdBits);

  PointerType *Int32PtrTy = Type::getInt32PtrTy(LI.getContext(), AS);
  PointerType *Int8PtrTy = Type::getInt8PtrTy(LI.getContext(), AS);
  auto *NewPtr = IRB.CreateBitCast(
      IRB.CreateConstGEP1_64(
          IRB.getInt8Ty(),
          IRB.CreatePointerBitCastOrAddrSpaceCast(Base, Int8PtrTy),
          Offset - Adjust),
      Int32PtrTy);
  LoadInst *NewLd = IRB.CreateAlignedLoad(IRB.getInt32Ty(), NewPtr, Align(4));
  NewLd->copyMetadata(LI);
  NewLd->setMetadata(LLVMContext::MD_range, nullptr);

  unsigned ShAmt = Adjust * 8;
  auto *NewVal = IRB.CreateBitCast(
      IRB.CreateTrunc(IRB.CreateLShr(NewLd, ShAmt), IntNTy), LI.getType());
  LI.replaceAllUsesWith(NewVal);
  RecursivelyDeleteTriviallyDeadInstructions(&LI);

  return true;
}

INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR late optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                    "AMDGPU IR late optimizations", false, false)

char AMDGPULateCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPULateCodeGenPreparePass() {
  return new AMDGPULateCodeGenPrepare();
}
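
// Illustrative sketch (not part of the pass itself): the rewrite performed by
// visitLoadInst above for a non-zero Adjust, with pointer bitcasts and address
// space casts elided and made-up value names for readability. Given a uniform,
// naturally aligned i16 load from constant memory at base + 2, where the base
// is known to be DWORD aligned:
//
//   %v = load i16, i16 addrspace(4)* %p, align 2
//
// the pass instead loads the enclosing DWORD with DWORD alignment, then shifts
// and truncates to recover the original sub-dword value:
//
//   %wide = load i32, i32 addrspace(4)* %wide.ptr, align 4
//   %shft = lshr i32 %wide, 16          ; ShAmt = Adjust * 8 = 16
//   %v    = trunc i32 %shft to i16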