//===-- AMDGPULateCodeGenPrepare.cpp --------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR *just* before instruction
/// selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Utils/Local.h"

#define DEBUG_TYPE "amdgpu-late-codegenprepare"

using namespace llvm;

// Scalar load widening needs to run after the load-store vectorizer, as that
// pass doesn't handle overlapping cases. In addition, this pass enhances the
// widening to handle cases where scalar sub-dword loads are naturally aligned
// but not dword aligned.
static cl::opt<bool>
    WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads",
               cl::desc("Widen sub-dword constant address space loads in "
                        "AMDGPULateCodeGenPrepare"),
               cl::ReallyHidden, cl::init(true));

namespace {

class AMDGPULateCodeGenPrepare
    : public FunctionPass,
      public InstVisitor<AMDGPULateCodeGenPrepare, bool> {
  Module *Mod = nullptr;
  const DataLayout *DL = nullptr;

  AssumptionCache *AC = nullptr;
  LegacyDivergenceAnalysis *DA = nullptr;

public:
  static char ID;

  AMDGPULateCodeGenPrepare() : FunctionPass(ID) {}

  StringRef getPassName() const override {
    return "AMDGPU IR late optimizations";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<LegacyDivergenceAnalysis>();
    AU.setPreservesAll();
  }

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  bool visitInstruction(Instruction &) { return false; }

  // Check if the specified value is at least DWORD aligned.
  bool isDWORDAligned(const Value *V) const {
    KnownBits Known = computeKnownBits(V, *DL, 0, AC);
    return Known.countMinTrailingZeros() >= 2;
  }

  bool canWidenScalarExtLoad(LoadInst &LI) const;
  bool visitLoadInst(LoadInst &LI);
};

} // end anonymous namespace

bool AMDGPULateCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  DL = &Mod->getDataLayout();
  return false;
}

bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  DA = &getAnalysis<LegacyDivergenceAnalysis>();

  bool Changed = false;
  for (auto &BB : F)
    for (Instruction &I : llvm::make_early_inc_range(BB))
      Changed |= visit(I);

  return Changed;
}

bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
  unsigned AS = LI.getPointerAddressSpace();
  // Skip non-constant address spaces.
  if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
      AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;
  // Skip non-simple loads.
  if (!LI.isSimple())
    return false;
  auto *Ty = LI.getType();
  // Skip aggregate types.
  if (Ty->isAggregateType())
    return false;
  unsigned TySize = DL->getTypeStoreSize(Ty);
  // Only handle sub-DWORD loads.
  if (TySize >= 4)
    return false;
  // The load must be at least naturally aligned.
  if (LI.getAlign() < DL->getABITypeAlign(Ty))
    return false;
  // It should be uniform, i.e. a scalar load.
  return DA->isUniform(&LI);
}

bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
  if (!WidenLoads)
    return false;

  // Skip if the load is already at least DWORD aligned, as that case is
  // handled in SDAG.
  if (LI.getAlign() >= 4)
    return false;

  if (!canWidenScalarExtLoad(LI))
    return false;

  int64_t Offset = 0;
  auto *Base =
      GetPointerBaseWithConstantOffset(LI.getPointerOperand(), Offset, *DL);
  // If the base is not DWORD aligned, it's not safe to perform the following
  // transforms.
  if (!isDWORDAligned(Base))
    return false;

  int64_t Adjust = Offset & 0x3;
  if (Adjust == 0) {
    // With a zero adjust, the original alignment can simply be promoted.
    LI.setAlignment(Align(4));
    return true;
  }

  IRBuilder<> IRB(&LI);
  IRB.SetCurrentDebugLocation(LI.getDebugLoc());

  unsigned AS = LI.getPointerAddressSpace();
  unsigned LdBits = DL->getTypeStoreSize(LI.getType()) * 8;
  auto IntNTy = Type::getIntNTy(LI.getContext(), LdBits);

  // Load a full DWORD from the aligned-down address (Offset - Adjust) bytes
  // past the base.
  PointerType *Int32PtrTy = Type::getInt32PtrTy(LI.getContext(), AS);
  PointerType *Int8PtrTy = Type::getInt8PtrTy(LI.getContext(), AS);
  auto *NewPtr = IRB.CreateBitCast(
      IRB.CreateConstGEP1_64(
          IRB.getInt8Ty(),
          IRB.CreatePointerBitCastOrAddrSpaceCast(Base, Int8PtrTy),
          Offset - Adjust),
      Int32PtrTy);
  LoadInst *NewLd = IRB.CreateAlignedLoad(IRB.getInt32Ty(), NewPtr, Align(4));
  NewLd->copyMetadata(LI);
  // The range metadata no longer applies to the widened value.
  NewLd->setMetadata(LLVMContext::MD_range, nullptr);

  // Shift out the bytes covered by the adjustment and truncate back to the
  // original width.
  unsigned ShAmt = Adjust * 8;
  auto *NewVal = IRB.CreateBitCast(
      IRB.CreateTrunc(IRB.CreateLShr(NewLd, ShAmt), IntNTy), LI.getType());
  LI.replaceAllUsesWith(NewVal);
  RecursivelyDeleteTriviallyDeadInstructions(&LI);

  return true;
}

INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR late optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                    "AMDGPU IR late optimizations", false, false)

char AMDGPULateCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPULateCodeGenPreparePass() {
  return new AMDGPULateCodeGenPrepare();
}