//===-- AMDGPULateCodeGenPrepare.cpp --------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR *just* before instruction
/// selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Utils/Local.h"

#define DEBUG_TYPE "amdgpu-late-codegenprepare"

using namespace llvm;

// Scalar load widening needs to run after the load-store-vectorizer, as that
// pass does not handle overlapping cases. In addition, this pass extends the
// widening to handle cases where scalar sub-dword loads are only naturally
// aligned but not dword aligned.
static cl::opt<bool>
    WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads",
               cl::desc("Widen sub-dword constant address space loads in "
                        "AMDGPULateCodeGenPrepare"),
               cl::ReallyHidden, cl::init(true));

namespace {

class AMDGPULateCodeGenPrepare
    : public FunctionPass,
      public InstVisitor<AMDGPULateCodeGenPrepare, bool> {
  Module *Mod = nullptr;
  const DataLayout *DL = nullptr;

  AssumptionCache *AC = nullptr;
  UniformityInfo *UA = nullptr;

public:
  static char ID;

  AMDGPULateCodeGenPrepare() : FunctionPass(ID) {}

  StringRef getPassName() const override {
    return "AMDGPU IR late optimizations";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<UniformityInfoWrapperPass>();
    AU.setPreservesAll();
  }

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  bool visitInstruction(Instruction &) { return false; }

  // Check if the specified value is at least DWORD aligned.
  bool isDWORDAligned(const Value *V) const {
    KnownBits Known = computeKnownBits(V, *DL, 0, AC);
    return Known.countMinTrailingZeros() >= 2;
  }

  bool canWidenScalarExtLoad(LoadInst &LI) const;
  bool visitLoadInst(LoadInst &LI);
};

} // end anonymous namespace

bool AMDGPULateCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  DL = &Mod->getDataLayout();
  return false;
}

bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();

  bool Changed = false;
  for (auto &BB : F)
    for (Instruction &I : llvm::make_early_inc_range(BB))
      Changed |= visit(I);

  return Changed;
}

bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
  unsigned AS = LI.getPointerAddressSpace();
  // Skip non-constant address space.
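  // Uniform loads from the constant address space are the ones that get
  // selected as scalar (SMEM) loads, which operate at DWORD granularity;
  // only those are worth widening here.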
  if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
      AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;
  // Skip non-simple loads.
  if (!LI.isSimple())
    return false;
  auto *Ty = LI.getType();
  // Skip aggregate types.
  if (Ty->isAggregateType())
    return false;
  unsigned TySize = DL->getTypeStoreSize(Ty);
  // Only handle sub-DWORD loads.
  if (TySize >= 4)
    return false;
  // The load must be at least naturally aligned.
  if (LI.getAlign() < DL->getABITypeAlign(Ty))
    return false;
  // It must be uniform, i.e., a scalar load.
  return UA->isUniform(&LI);
}

bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
  if (!WidenLoads)
    return false;

  // Skip if the load is already at least DWORD aligned, as that case is
  // handled in SDAG.
  if (LI.getAlign() >= 4)
    return false;

  if (!canWidenScalarExtLoad(LI))
    return false;

  int64_t Offset = 0;
  auto *Base =
      GetPointerBaseWithConstantOffset(LI.getPointerOperand(), Offset, *DL);
  // If the base is not DWORD aligned, it is not safe to perform the following
  // transforms.
  if (!isDWORDAligned(Base))
    return false;

  int64_t Adjust = Offset & 0x3;
  if (Adjust == 0) {
    // With a zero adjust, the load itself is DWORD aligned relative to the
    // DWORD-aligned base, so its alignment can simply be promoted.
    LI.setAlignment(Align(4));
    return true;
  }

  // Otherwise, widen the load: load the aligned DWORD containing the original
  // value, then shift and truncate to recover it.
  IRBuilder<> IRB(&LI);
  IRB.SetCurrentDebugLocation(LI.getDebugLoc());

  unsigned LdBits = DL->getTypeStoreSizeInBits(LI.getType());
  auto IntNTy = Type::getIntNTy(LI.getContext(), LdBits);

  auto *NewPtr = IRB.CreateConstGEP1_64(
      IRB.getInt8Ty(),
      IRB.CreateAddrSpaceCast(Base, LI.getPointerOperand()->getType()),
      Offset - Adjust);

  LoadInst *NewLd = IRB.CreateAlignedLoad(IRB.getInt32Ty(), NewPtr, Align(4));
  NewLd->copyMetadata(LI);
  NewLd->setMetadata(LLVMContext::MD_range, nullptr);

  // AMDGPU is little-endian, so the original value sits at bit offset
  // Adjust * 8 within the widened DWORD.
  unsigned ShAmt = Adjust * 8;
  auto *NewVal = IRB.CreateBitCast(
      IRB.CreateTrunc(IRB.CreateLShr(NewLd, ShAmt), IntNTy), LI.getType());
  LI.replaceAllUsesWith(NewVal);
  RecursivelyDeleteTriviallyDeadInstructions(&LI);

  return true;
}

INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR late optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                    "AMDGPU IR late optimizations", false, false)

char AMDGPULateCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPULateCodeGenPreparePass() {
  return new AMDGPULateCodeGenPrepare();
}