//===-- AMDGPULateCodeGenPrepare.cpp --------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR *just* before instruction
/// selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Utils/Local.h"

#define DEBUG_TYPE "amdgpu-late-codegenprepare"

using namespace llvm;

// Scalar load widening needs to run after load-store-vectorizer, as that pass
// doesn't handle overlapping cases. In addition, this pass enhances the
// widening to handle cases where scalar sub-dword loads are naturally aligned
// only but not dword aligned.
static cl::opt<bool>
    WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads",
               cl::desc("Widen sub-dword constant address space loads in "
                        "AMDGPULateCodeGenPrepare"),
               cl::ReallyHidden, cl::init(true));

namespace {

class AMDGPULateCodeGenPrepare
    : public FunctionPass,
      public InstVisitor<AMDGPULateCodeGenPrepare, bool> {
  Module *Mod = nullptr;
  const DataLayout *DL = nullptr;

  AssumptionCache *AC = nullptr;
  UniformityInfo *UA = nullptr;

public:
  static char ID;

  AMDGPULateCodeGenPrepare() : FunctionPass(ID) {}

  StringRef getPassName() const override {
    return "AMDGPU IR late optimizations";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<TargetPassConfig>();
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<UniformityInfoWrapperPass>();
    AU.setPreservesAll();
  }

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  bool visitInstruction(Instruction &) { return false; }

  // Check if the specified value is at least DWORD aligned.
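  // A value is DWORD aligned when its known bits have at least two trailing
  // zeros, i.e. it is a multiple of 4.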
  bool isDWORDAligned(const Value *V) const {
    KnownBits Known = computeKnownBits(V, *DL, 0, AC);
    return Known.countMinTrailingZeros() >= 2;
  }

  bool canWidenScalarExtLoad(LoadInst &LI) const;
  bool visitLoadInst(LoadInst &LI);
};

} // end anonymous namespace

bool AMDGPULateCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  DL = &Mod->getDataLayout();
  return false;
}

bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
  const TargetMachine &TM = TPC.getTM<TargetMachine>();
  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
  if (ST.hasScalarSubwordLoads())
    return false;

  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();

  bool Changed = false;
  for (auto &BB : F)
    for (Instruction &I : llvm::make_early_inc_range(BB))
      Changed |= visit(I);

  return Changed;
}

bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
  unsigned AS = LI.getPointerAddressSpace();
  // Skip non-constant address spaces.
  if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
      AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;
  // Skip non-simple loads.
  if (!LI.isSimple())
    return false;
  auto *Ty = LI.getType();
  // Skip aggregate types.
  if (Ty->isAggregateType())
    return false;
  unsigned TySize = DL->getTypeStoreSize(Ty);
  // Only handle sub-DWORD loads.
  if (TySize >= 4)
    return false;
  // The load must be at least naturally aligned.
  if (LI.getAlign() < DL->getABITypeAlign(Ty))
    return false;
  // It must be uniform, i.e. a scalar load.
  return UA->isUniform(&LI);
}

bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
  if (!WidenLoads)
    return false;

  // Skip if the load is already at least DWORD aligned, as that case is
  // handled in SDAG.
  if (LI.getAlign() >= 4)
    return false;

  if (!canWidenScalarExtLoad(LI))
    return false;

  int64_t Offset = 0;
  auto *Base =
      GetPointerBaseWithConstantOffset(LI.getPointerOperand(), Offset, *DL);
  // If the base is not DWORD aligned, it's not safe to perform the following
  // transformation.
  if (!isDWORDAligned(Base))
    return false;

  int64_t Adjust = Offset & 0x3;
  if (Adjust == 0) {
    // With a zero adjust, the original alignment can simply be promoted to a
    // better one.
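    // The base is DWORD aligned and the offset is a multiple of 4, so the
    // load address itself is DWORD aligned.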
    LI.setAlignment(Align(4));
    return true;
  }

  IRBuilder<> IRB(&LI);
  IRB.SetCurrentDebugLocation(LI.getDebugLoc());

  unsigned LdBits = DL->getTypeStoreSizeInBits(LI.getType());
  auto IntNTy = Type::getIntNTy(LI.getContext(), LdBits);

  auto *NewPtr = IRB.CreateConstGEP1_64(
      IRB.getInt8Ty(),
      IRB.CreateAddrSpaceCast(Base, LI.getPointerOperand()->getType()),
      Offset - Adjust);

  // Create a DWORD-aligned i32 load at the rounded-down offset. Drop any
  // range metadata, as it no longer describes the widened value.
  LoadInst *NewLd = IRB.CreateAlignedLoad(IRB.getInt32Ty(), NewPtr, Align(4));
  NewLd->copyMetadata(LI);
  NewLd->setMetadata(LLVMContext::MD_range, nullptr);

  // Shift and truncate to recover the original sub-DWORD value from the wider
  // load, then bitcast back to the original type.
  unsigned ShAmt = Adjust * 8;
  auto *NewVal = IRB.CreateBitCast(
      IRB.CreateTrunc(IRB.CreateLShr(NewLd, ShAmt), IntNTy), LI.getType());
  LI.replaceAllUsesWith(NewVal);
  RecursivelyDeleteTriviallyDeadInstructions(&LI);

  return true;
}

INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR late optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                    "AMDGPU IR late optimizations", false, false)

char AMDGPULateCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPULateCodeGenPreparePass() {
  return new AMDGPULateCodeGenPrepare();
}