1 //===----- SVEIntrinsicOpts - SVE ACLE Intrinsics Opts --------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // Performs general IR level optimizations on SVE intrinsics. 11 // 12 // This pass performs the following optimizations: 13 // 14 // - removes unnecessary ptrue intrinsics (llvm.aarch64.sve.ptrue), e.g: 15 // %1 = @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) 16 // %2 = @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) 17 // ; (%1 can be replaced with a reinterpret of %2) 18 // 19 // - optimizes ptest intrinsics where the operands are being needlessly 20 // converted to and from svbool_t. 21 // 22 //===----------------------------------------------------------------------===// 23 24 #include "AArch64.h" 25 #include "Utils/AArch64BaseInfo.h" 26 #include "llvm/ADT/PostOrderIterator.h" 27 #include "llvm/ADT/SetVector.h" 28 #include "llvm/IR/Constants.h" 29 #include "llvm/IR/Dominators.h" 30 #include "llvm/IR/IRBuilder.h" 31 #include "llvm/IR/Instructions.h" 32 #include "llvm/IR/IntrinsicInst.h" 33 #include "llvm/IR/IntrinsicsAArch64.h" 34 #include "llvm/IR/LLVMContext.h" 35 #include "llvm/IR/PatternMatch.h" 36 #include "llvm/InitializePasses.h" 37 #include "llvm/Support/Debug.h" 38 39 using namespace llvm; 40 using namespace llvm::PatternMatch; 41 42 #define DEBUG_TYPE "aarch64-sve-intrinsic-opts" 43 44 namespace llvm { 45 void initializeSVEIntrinsicOptsPass(PassRegistry &); 46 } 47 48 namespace { 49 struct SVEIntrinsicOpts : public ModulePass { 50 static char ID; // Pass identification, replacement for typeid 51 SVEIntrinsicOpts() : ModulePass(ID) { 52 initializeSVEIntrinsicOptsPass(*PassRegistry::getPassRegistry()); 53 } 54 55 bool runOnModule(Module &M) override; 56 void getAnalysisUsage(AnalysisUsage &AU) const override; 57 58 private: 59 bool coalescePTrueIntrinsicCalls(BasicBlock &BB, 60 SmallSetVector<IntrinsicInst *, 4> &PTrues); 61 bool optimizePTrueIntrinsicCalls(SmallSetVector<Function *, 4> &Functions); 62 63 /// Operates at the function-scope. I.e., optimizations are applied local to 64 /// the functions themselves. 65 bool optimizeFunctions(SmallSetVector<Function *, 4> &Functions); 66 }; 67 } // end anonymous namespace 68 69 void SVEIntrinsicOpts::getAnalysisUsage(AnalysisUsage &AU) const { 70 AU.addRequired<DominatorTreeWrapperPass>(); 71 AU.setPreservesCFG(); 72 } 73 74 char SVEIntrinsicOpts::ID = 0; 75 static const char *name = "SVE intrinsics optimizations"; 76 INITIALIZE_PASS_BEGIN(SVEIntrinsicOpts, DEBUG_TYPE, name, false, false) 77 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass); 78 INITIALIZE_PASS_END(SVEIntrinsicOpts, DEBUG_TYPE, name, false, false) 79 80 ModulePass *llvm::createSVEIntrinsicOptsPass() { 81 return new SVEIntrinsicOpts(); 82 } 83 84 /// Checks if a ptrue intrinsic call is promoted. The act of promoting a 85 /// ptrue will introduce zeroing. For example: 86 /// 87 /// %1 = <vscale x 4 x i1> call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) 88 /// %2 = <vscale x 16 x i1> call @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %1) 89 /// %3 = <vscale x 8 x i1> call @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %2) 90 /// 91 /// %1 is promoted, because it is converted: 92 /// 93 /// <vscale x 4 x i1> => <vscale x 16 x i1> => <vscale x 8 x i1> 94 /// 95 /// via a sequence of the SVE reinterpret intrinsics convert.{to,from}.svbool. 96 static bool isPTruePromoted(IntrinsicInst *PTrue) { 97 // Find all users of this intrinsic that are calls to convert-to-svbool 98 // reinterpret intrinsics. 99 SmallVector<IntrinsicInst *, 4> ConvertToUses; 100 for (User *User : PTrue->users()) { 101 if (match(User, m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>())) { 102 ConvertToUses.push_back(cast<IntrinsicInst>(User)); 103 } 104 } 105 106 // If no such calls were found, this is ptrue is not promoted. 107 if (ConvertToUses.empty()) 108 return false; 109 110 // Otherwise, try to find users of the convert-to-svbool intrinsics that are 111 // calls to the convert-from-svbool intrinsic, and would result in some lanes 112 // being zeroed. 113 const auto *PTrueVTy = cast<ScalableVectorType>(PTrue->getType()); 114 for (IntrinsicInst *ConvertToUse : ConvertToUses) { 115 for (User *User : ConvertToUse->users()) { 116 auto *IntrUser = dyn_cast<IntrinsicInst>(User); 117 if (IntrUser && IntrUser->getIntrinsicID() == 118 Intrinsic::aarch64_sve_convert_from_svbool) { 119 const auto *IntrUserVTy = cast<ScalableVectorType>(IntrUser->getType()); 120 121 // Would some lanes become zeroed by the conversion? 122 if (IntrUserVTy->getElementCount().getKnownMinValue() > 123 PTrueVTy->getElementCount().getKnownMinValue()) 124 // This is a promoted ptrue. 125 return true; 126 } 127 } 128 } 129 130 // If no matching calls were found, this is not a promoted ptrue. 131 return false; 132 } 133 134 /// Attempts to coalesce ptrues in a basic block. 135 bool SVEIntrinsicOpts::coalescePTrueIntrinsicCalls( 136 BasicBlock &BB, SmallSetVector<IntrinsicInst *, 4> &PTrues) { 137 if (PTrues.size() <= 1) 138 return false; 139 140 // Find the ptrue with the most lanes. 141 auto *MostEncompassingPTrue = *std::max_element( 142 PTrues.begin(), PTrues.end(), [](auto *PTrue1, auto *PTrue2) { 143 auto *PTrue1VTy = cast<ScalableVectorType>(PTrue1->getType()); 144 auto *PTrue2VTy = cast<ScalableVectorType>(PTrue2->getType()); 145 return PTrue1VTy->getElementCount().getKnownMinValue() < 146 PTrue2VTy->getElementCount().getKnownMinValue(); 147 }); 148 149 // Remove the most encompassing ptrue, as well as any promoted ptrues, leaving 150 // behind only the ptrues to be coalesced. 151 PTrues.remove(MostEncompassingPTrue); 152 PTrues.remove_if([](auto *PTrue) { return isPTruePromoted(PTrue); }); 153 154 // Hoist MostEncompassingPTrue to the start of the basic block. It is always 155 // safe to do this, since ptrue intrinsic calls are guaranteed to have no 156 // predecessors. 157 MostEncompassingPTrue->moveBefore(BB, BB.getFirstInsertionPt()); 158 159 LLVMContext &Ctx = BB.getContext(); 160 IRBuilder<> Builder(Ctx); 161 Builder.SetInsertPoint(&BB, ++MostEncompassingPTrue->getIterator()); 162 163 auto *MostEncompassingPTrueVTy = 164 cast<VectorType>(MostEncompassingPTrue->getType()); 165 auto *ConvertToSVBool = Builder.CreateIntrinsic( 166 Intrinsic::aarch64_sve_convert_to_svbool, {MostEncompassingPTrueVTy}, 167 {MostEncompassingPTrue}); 168 169 bool ConvertFromCreated = false; 170 for (auto *PTrue : PTrues) { 171 auto *PTrueVTy = cast<VectorType>(PTrue->getType()); 172 173 // Only create the converts if the types are not already the same, otherwise 174 // just use the most encompassing ptrue. 175 if (MostEncompassingPTrueVTy != PTrueVTy) { 176 ConvertFromCreated = true; 177 178 Builder.SetInsertPoint(&BB, ++ConvertToSVBool->getIterator()); 179 auto *ConvertFromSVBool = 180 Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool, 181 {PTrueVTy}, {ConvertToSVBool}); 182 PTrue->replaceAllUsesWith(ConvertFromSVBool); 183 } else 184 PTrue->replaceAllUsesWith(MostEncompassingPTrue); 185 186 PTrue->eraseFromParent(); 187 } 188 189 // We never used the ConvertTo so remove it 190 if (!ConvertFromCreated) 191 ConvertToSVBool->eraseFromParent(); 192 193 return true; 194 } 195 196 /// The goal of this function is to remove redundant calls to the SVE ptrue 197 /// intrinsic in each basic block within the given functions. 198 /// 199 /// SVE ptrues have two representations in LLVM IR: 200 /// - a logical representation -- an arbitrary-width scalable vector of i1s, 201 /// i.e. <vscale x N x i1>. 202 /// - a physical representation (svbool, <vscale x 16 x i1>) -- a 16-element 203 /// scalable vector of i1s, i.e. <vscale x 16 x i1>. 204 /// 205 /// The SVE ptrue intrinsic is used to create a logical representation of an SVE 206 /// predicate. Suppose that we have two SVE ptrue intrinsic calls: P1 and P2. If 207 /// P1 creates a logical SVE predicate that is at least as wide as the logical 208 /// SVE predicate created by P2, then all of the bits that are true in the 209 /// physical representation of P2 are necessarily also true in the physical 210 /// representation of P1. P1 'encompasses' P2, therefore, the intrinsic call to 211 /// P2 is redundant and can be replaced by an SVE reinterpret of P1 via 212 /// convert.{to,from}.svbool. 213 /// 214 /// Currently, this pass only coalesces calls to SVE ptrue intrinsics 215 /// if they match the following conditions: 216 /// 217 /// - the call to the intrinsic uses either the SV_ALL or SV_POW2 patterns. 218 /// SV_ALL indicates that all bits of the predicate vector are to be set to 219 /// true. SV_POW2 indicates that all bits of the predicate vector up to the 220 /// largest power-of-two are to be set to true. 221 /// - the result of the call to the intrinsic is not promoted to a wider 222 /// predicate. In this case, keeping the extra ptrue leads to better codegen 223 /// -- coalescing here would create an irreducible chain of SVE reinterprets 224 /// via convert.{to,from}.svbool. 225 /// 226 /// EXAMPLE: 227 /// 228 /// %1 = <vscale x 8 x i1> ptrue(i32 SV_ALL) 229 /// ; Logical: <1, 1, 1, 1, 1, 1, 1, 1> 230 /// ; Physical: <1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0> 231 /// ... 232 /// 233 /// %2 = <vscale x 4 x i1> ptrue(i32 SV_ALL) 234 /// ; Logical: <1, 1, 1, 1> 235 /// ; Physical: <1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0> 236 /// ... 237 /// 238 /// Here, %2 can be replaced by an SVE reinterpret of %1, giving, for instance: 239 /// 240 /// %1 = <vscale x 8 x i1> ptrue(i32 i31) 241 /// %2 = <vscale x 16 x i1> convert.to.svbool(<vscale x 8 x i1> %1) 242 /// %3 = <vscale x 4 x i1> convert.from.svbool(<vscale x 16 x i1> %2) 243 /// 244 bool SVEIntrinsicOpts::optimizePTrueIntrinsicCalls( 245 SmallSetVector<Function *, 4> &Functions) { 246 bool Changed = false; 247 248 for (auto *F : Functions) { 249 for (auto &BB : *F) { 250 SmallSetVector<IntrinsicInst *, 4> SVAllPTrues; 251 SmallSetVector<IntrinsicInst *, 4> SVPow2PTrues; 252 253 // For each basic block, collect the used ptrues and try to coalesce them. 254 for (Instruction &I : BB) { 255 if (I.use_empty()) 256 continue; 257 258 auto *IntrI = dyn_cast<IntrinsicInst>(&I); 259 if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue) 260 continue; 261 262 const auto PTruePattern = 263 cast<ConstantInt>(IntrI->getOperand(0))->getZExtValue(); 264 265 if (PTruePattern == AArch64SVEPredPattern::all) 266 SVAllPTrues.insert(IntrI); 267 if (PTruePattern == AArch64SVEPredPattern::pow2) 268 SVPow2PTrues.insert(IntrI); 269 } 270 271 Changed |= coalescePTrueIntrinsicCalls(BB, SVAllPTrues); 272 Changed |= coalescePTrueIntrinsicCalls(BB, SVPow2PTrues); 273 } 274 } 275 276 return Changed; 277 } 278 279 bool SVEIntrinsicOpts::optimizeFunctions( 280 SmallSetVector<Function *, 4> &Functions) { 281 bool Changed = false; 282 283 Changed |= optimizePTrueIntrinsicCalls(Functions); 284 285 return Changed; 286 } 287 288 bool SVEIntrinsicOpts::runOnModule(Module &M) { 289 bool Changed = false; 290 SmallSetVector<Function *, 4> Functions; 291 292 // Check for SVE intrinsic declarations first so that we only iterate over 293 // relevant functions. Where an appropriate declaration is found, store the 294 // function(s) where it is used so we can target these only. 295 for (auto &F : M.getFunctionList()) { 296 if (!F.isDeclaration()) 297 continue; 298 299 switch (F.getIntrinsicID()) { 300 case Intrinsic::aarch64_sve_ptrue: 301 for (User *U : F.users()) 302 Functions.insert(cast<Instruction>(U)->getFunction()); 303 break; 304 default: 305 break; 306 } 307 } 308 309 if (!Functions.empty()) 310 Changed |= optimizeFunctions(Functions); 311 312 return Changed; 313 } 314