//=== lib/CodeGen/GlobalISel/AArch64PreLegalizerCombiner.cpp --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass does combining of machine instructions at the generic MI level,
// before the legalizer.
//
//===----------------------------------------------------------------------===//

#include "AArch64GlobalISelUtils.h"
#include "AArch64TargetMachine.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "aarch64-prelegalizer-combiner"

using namespace llvm;
using namespace MIPatternMatch;

/// Return true if a G_FCONSTANT instruction is known to be better-represented
/// as a G_CONSTANT.
static bool matchFConstantToConstant(MachineInstr &MI,
                                     MachineRegisterInfo &MRI) {
  assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT);
  Register DstReg = MI.getOperand(0).getReg();
  const unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
  if (DstSize != 32 && DstSize != 64)
    return false;

  // When we're storing a value, it doesn't matter what register bank it's on.
  // Since not all floating point constants can be materialized using a fmov,
  // it makes more sense to just use a GPR.
  return all_of(MRI.use_nodbg_instructions(DstReg),
                [](const MachineInstr &Use) { return Use.mayStore(); });
}

/// Change a G_FCONSTANT into a G_CONSTANT.
static void applyFConstantToConstant(MachineInstr &MI) {
  assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT);
  MachineIRBuilder MIB(MI);
  const APFloat &ImmValAPF = MI.getOperand(1).getFPImm()->getValueAPF();
  MIB.buildConstant(MI.getOperand(0).getReg(), ImmValAPF.bitcastToAPInt());
  MI.eraseFromParent();
}
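
// For example (illustrative MIR only, register names made up, not taken from
// an existing test): a float constant that is only stored is rewritten to the
// equivalent integer bit pattern so that it can be materialized on the GPR
// bank:
//
//   %cst:_(s32) = G_FCONSTANT float 1.000000e+00
//   G_STORE %cst(s32), %ptr(p0) :: (store (s32))
//
// becomes
//
//   %cst:_(s32) = G_CONSTANT i32 1065353216
//   G_STORE %cst(s32), %ptr(p0) :: (store (s32))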

/// Try to match a G_ICMP of a G_TRUNC with zero, in which the truncated bits
/// are sign bits. In this case, we can transform the G_ICMP to directly
/// compare the wide value with a zero.
static bool matchICmpRedundantTrunc(MachineInstr &MI, MachineRegisterInfo &MRI,
                                    GISelKnownBits *KB, Register &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_ICMP && KB);

  auto Pred = (CmpInst::Predicate)MI.getOperand(1).getPredicate();
  if (!ICmpInst::isEquality(Pred))
    return false;

  Register LHS = MI.getOperand(2).getReg();
  LLT LHSTy = MRI.getType(LHS);
  if (!LHSTy.isScalar())
    return false;

  Register RHS = MI.getOperand(3).getReg();
  Register WideReg;

  if (!mi_match(LHS, MRI, m_GTrunc(m_Reg(WideReg))) ||
      !mi_match(RHS, MRI, m_SpecificICst(0)))
    return false;

  LLT WideTy = MRI.getType(WideReg);
  if (KB->computeNumSignBits(WideReg) <=
      WideTy.getSizeInBits() - LHSTy.getSizeInBits())
    return false;

  MatchInfo = WideReg;
  return true;
}

static bool applyICmpRedundantTrunc(MachineInstr &MI, MachineRegisterInfo &MRI,
                                    MachineIRBuilder &Builder,
                                    GISelChangeObserver &Observer,
                                    Register &WideReg) {
  assert(MI.getOpcode() == TargetOpcode::G_ICMP);

  LLT WideTy = MRI.getType(WideReg);
  // We're going to directly use the wide register as the LHS, and then use an
  // equivalent size zero for RHS.
  Builder.setInstrAndDebugLoc(MI);
  auto WideZero = Builder.buildConstant(WideTy, 0);
  Observer.changingInstr(MI);
  MI.getOperand(2).setReg(WideReg);
  MI.getOperand(3).setReg(WideZero.getReg(0));
  Observer.changedInstr(MI);
  return true;
}
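
// For example (illustrative MIR only, register names made up, not taken from
// an existing test): if known bits can prove that %wide has at least 25 sign
// bits, the truncation below is redundant and the equality compare can be done
// on the wide value directly:
//
//   %trunc:_(s8) = G_TRUNC %wide(s32)
//   %zero:_(s8) = G_CONSTANT i8 0
//   %cmp:_(s1) = G_ICMP intpred(eq), %trunc(s8), %zero
//
// becomes
//
//   %zero:_(s32) = G_CONSTANT i32 0
//   %cmp:_(s1) = G_ICMP intpred(eq), %wide(s32), %zero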

/// \returns true if it is possible to fold a constant into a G_GLOBAL_VALUE.
///
/// e.g.
///
/// %g = G_GLOBAL_VALUE @x -> %g = G_GLOBAL_VALUE @x + cst
static bool matchFoldGlobalOffset(MachineInstr &MI, MachineRegisterInfo &MRI,
                                  std::pair<uint64_t, uint64_t> &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_GLOBAL_VALUE);
  MachineFunction &MF = *MI.getMF();
  auto &GlobalOp = MI.getOperand(1);
  auto *GV = GlobalOp.getGlobal();
  if (GV->isThreadLocal())
    return false;

  // Don't allow anything that could represent offsets etc.
  if (MF.getSubtarget<AArch64Subtarget>().ClassifyGlobalReference(
          GV, MF.getTarget()) != AArch64II::MO_NO_FLAG)
    return false;

  // Look for a G_GLOBAL_VALUE only used by G_PTR_ADDs against constants:
  //
  //  %g = G_GLOBAL_VALUE @x
  //  %ptr1 = G_PTR_ADD %g, cst1
  //  %ptr2 = G_PTR_ADD %g, cst2
  //  ...
  //  %ptrN = G_PTR_ADD %g, cstN
  //
  // Identify the *smallest* constant. We want to be able to form this:
  //
  //  %offset_g = G_GLOBAL_VALUE @x + min_cst
  //  %g = G_PTR_ADD %offset_g, -min_cst
  //  %ptr1 = G_PTR_ADD %g, cst1
  //  ...
  Register Dst = MI.getOperand(0).getReg();
  uint64_t MinOffset = -1ull;
  for (auto &UseInstr : MRI.use_nodbg_instructions(Dst)) {
    if (UseInstr.getOpcode() != TargetOpcode::G_PTR_ADD)
      return false;
    auto Cst = getIConstantVRegValWithLookThrough(
        UseInstr.getOperand(2).getReg(), MRI);
    if (!Cst)
      return false;
    MinOffset = std::min(MinOffset, Cst->Value.getZExtValue());
  }

  // Require that the new offset is larger than the existing one to avoid
  // infinite loops.
  uint64_t CurrOffset = GlobalOp.getOffset();
  uint64_t NewOffset = MinOffset + CurrOffset;
  if (NewOffset <= CurrOffset)
    return false;

  // Check whether folding this offset is legal. It must not go out of bounds
  // of the referenced object to avoid violating the code model, and must be
  // smaller than 2^20 because this is the largest offset expressible in all
  // object formats. (The IMAGE_REL_ARM64_PAGEBASE_REL21 relocation in COFF
  // stores an immediate signed 21 bit offset.)
  //
  // This check also prevents us from folding negative offsets, which will end
  // up being treated in the same way as large positive ones. They could also
  // cause code model violations, and aren't really common enough to matter.
  if (NewOffset >= (1 << 20))
    return false;

  Type *T = GV->getValueType();
  if (!T->isSized() ||
      NewOffset > GV->getParent()->getDataLayout().getTypeAllocSize(T))
    return false;
  MatchInfo = std::make_pair(NewOffset, MinOffset);
  return true;
}

static bool applyFoldGlobalOffset(MachineInstr &MI, MachineRegisterInfo &MRI,
                                  MachineIRBuilder &B,
                                  GISelChangeObserver &Observer,
                                  std::pair<uint64_t, uint64_t> &MatchInfo) {
  // Change:
  //
  //  %g = G_GLOBAL_VALUE @x
  //  %ptr1 = G_PTR_ADD %g, cst1
  //  %ptr2 = G_PTR_ADD %g, cst2
  //  ...
  //  %ptrN = G_PTR_ADD %g, cstN
  //
  // To:
  //
  //  %offset_g = G_GLOBAL_VALUE @x + min_cst
  //  %g = G_PTR_ADD %offset_g, -min_cst
  //  %ptr1 = G_PTR_ADD %g, cst1
  //  ...
  //  %ptrN = G_PTR_ADD %g, cstN
  //
  // Then, the original G_PTR_ADDs should be folded later on so that they look
  // like this:
  //
  //  %ptrN = G_PTR_ADD %offset_g, cstN - min_cst
  uint64_t Offset, MinOffset;
  std::tie(Offset, MinOffset) = MatchInfo;
  B.setInstrAndDebugLoc(MI);
  Observer.changingInstr(MI);
  auto &GlobalOp = MI.getOperand(1);
  auto *GV = GlobalOp.getGlobal();
  GlobalOp.ChangeToGA(GV, Offset, GlobalOp.getTargetFlags());
  Register Dst = MI.getOperand(0).getReg();
  Register NewGVDst = MRI.cloneVirtualRegister(Dst);
  MI.getOperand(0).setReg(NewGVDst);
  Observer.changedInstr(MI);
  B.buildPtrAdd(
      Dst, NewGVDst,
      B.buildConstant(LLT::scalar(64), -static_cast<int64_t>(MinOffset)));
  return true;
}
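
// For example (illustrative only, in the same shorthand as the comments above,
// and assuming @x refers to a sufficiently large object), with cst1 = 16 and
// cst2 = 64 the minimum offset 16 is folded into the G_GLOBAL_VALUE:
//
//  %g = G_GLOBAL_VALUE @x
//  %ptr1 = G_PTR_ADD %g, 16
//  %ptr2 = G_PTR_ADD %g, 64
//
// becomes
//
//  %offset_g = G_GLOBAL_VALUE @x + 16
//  %g = G_PTR_ADD %offset_g, -16
//  %ptr1 = G_PTR_ADD %g, 16
//  %ptr2 = G_PTR_ADD %g, 64
//
// and later combines are expected to fold %ptr1 into %offset_g and %ptr2 into
// G_PTR_ADD %offset_g, 48.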

static bool tryToSimplifyUADDO(MachineInstr &MI, MachineIRBuilder &B,
                               CombinerHelper &Helper,
                               GISelChangeObserver &Observer) {
  // Try to simplify a G_UADDO with 8 or 16 bit operands to a wide G_ADD and a
  // TBNZ if the result is only used in the no-overflow case. It is restricted
  // to cases where we know that the high bits of the operands are 0. If
  // there's an overflow, then the 9th or 17th bit must be set, which can be
  // checked using TBNZ.
  //
  // Change (for UADDOs on 8 and 16 bits):
  //
  //   %z0 = G_ASSERT_ZEXT _
  //   %op0 = G_TRUNC %z0
  //   %z1 = G_ASSERT_ZEXT _
  //   %op1 = G_TRUNC %z1
  //   %val, %cond = G_UADDO %op0, %op1
  //   G_BRCOND %cond, %error.bb
  //
  // error.bb:
  //   (no successors and no uses of %val)
  //
  // To:
  //
  //   %z0 = G_ASSERT_ZEXT _
  //   %z1 = G_ASSERT_ZEXT _
  //   %add = G_ADD %z0, %z1
  //   %val = G_TRUNC %add
  //   %bit = G_AND %add, 1 << scalar-size-in-bits(%op1)
  //   %cond = G_ICMP NE, %bit, 0
  //   G_BRCOND %cond, %error.bb

  auto &MRI = *B.getMRI();

  MachineOperand *DefOp0 = MRI.getOneDef(MI.getOperand(2).getReg());
  MachineOperand *DefOp1 = MRI.getOneDef(MI.getOperand(3).getReg());
  Register Op0Wide;
  Register Op1Wide;
  if (!mi_match(DefOp0->getParent(), MRI, m_GTrunc(m_Reg(Op0Wide))) ||
      !mi_match(DefOp1->getParent(), MRI, m_GTrunc(m_Reg(Op1Wide))))
    return false;
  LLT WideTy0 = MRI.getType(Op0Wide);
  LLT WideTy1 = MRI.getType(Op1Wide);
  Register ResVal = MI.getOperand(0).getReg();
  LLT OpTy = MRI.getType(ResVal);
  MachineInstr *Op0WideDef = MRI.getVRegDef(Op0Wide);
  MachineInstr *Op1WideDef = MRI.getVRegDef(Op1Wide);

  unsigned OpTySize = OpTy.getScalarSizeInBits();
  // First check that the G_TRUNCs feeding the G_UADDO are no-ops, because the
  // inputs have been zero-extended.
  if (Op0WideDef->getOpcode() != TargetOpcode::G_ASSERT_ZEXT ||
      Op1WideDef->getOpcode() != TargetOpcode::G_ASSERT_ZEXT ||
      OpTySize != Op0WideDef->getOperand(2).getImm() ||
      OpTySize != Op1WideDef->getOperand(2).getImm())
    return false;

  // Only scalar UADDOs with either 8 or 16 bit operands are handled.
  if (!WideTy0.isScalar() || !WideTy1.isScalar() || WideTy0 != WideTy1 ||
      OpTySize >= WideTy0.getScalarSizeInBits() ||
      (OpTySize != 8 && OpTySize != 16))
    return false;

  // The overflow-status result must be used by a branch only.
  Register ResStatus = MI.getOperand(1).getReg();
  if (!MRI.hasOneNonDBGUse(ResStatus))
    return false;
  MachineInstr *CondUser = &*MRI.use_instr_nodbg_begin(ResStatus);
  if (CondUser->getOpcode() != TargetOpcode::G_BRCOND)
    return false;

  // Make sure the computed result is only used in the no-overflow blocks.
  MachineBasicBlock *CurrentMBB = MI.getParent();
  MachineBasicBlock *FailMBB = CondUser->getOperand(1).getMBB();
  if (!FailMBB->succ_empty() || CondUser->getParent() != CurrentMBB)
    return false;
  if (any_of(MRI.use_nodbg_instructions(ResVal),
             [&MI, FailMBB, CurrentMBB](MachineInstr &I) {
               return &MI != &I &&
                      (I.getParent() == FailMBB || I.getParent() == CurrentMBB);
             }))
    return false;

  // Remove the G_UADDO.
  B.setInstrAndDebugLoc(*MI.getNextNode());
  MI.eraseFromParent();

  // Emit wide add.
  Register AddDst = MRI.cloneVirtualRegister(Op0Wide);
  B.buildInstr(TargetOpcode::G_ADD, {AddDst}, {Op0Wide, Op1Wide});

  // Emit check of the 9th or 17th bit and update users (the branch). This will
  // later be folded to TBNZ.
  Register CondBit = MRI.cloneVirtualRegister(Op0Wide);
  B.buildAnd(
      CondBit, AddDst,
      B.buildConstant(LLT::scalar(32), OpTySize == 8 ? 1 << 8 : 1 << 16));
  B.buildICmp(CmpInst::ICMP_NE, ResStatus, CondBit,
              B.buildConstant(LLT::scalar(32), 0));

  // Update ZExt users of the result value. Because all uses are in the
  // no-overflow case, we know that the top bits are 0 and we can ignore ZExts.
  B.buildZExtOrTrunc(ResVal, AddDst);
  for (MachineOperand &U : make_early_inc_range(MRI.use_operands(ResVal))) {
    Register WideReg;
    if (mi_match(U.getParent(), MRI, m_GZExt(m_Reg(WideReg)))) {
      auto OldR = U.getParent()->getOperand(0).getReg();
      Observer.erasingInstr(*U.getParent());
      U.getParent()->eraseFromParent();
      Helper.replaceRegWith(MRI, OldR, AddDst);
    }
  }

  return true;
}
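
// Illustrative only: after legalization and instruction selection, the
// G_AND/G_ICMP pair emitted above is expected to fold into a single
// test-bit-and-branch, e.g. for the 8 bit case something along the lines of:
//
//   add  w8, w8, w9
//   tbnz w8, #8, .LBB0_2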

class AArch64PreLegalizerCombinerHelperState {
protected:
  CombinerHelper &Helper;

public:
  AArch64PreLegalizerCombinerHelperState(CombinerHelper &Helper)
      : Helper(Helper) {}
};

#define AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
#include "AArch64GenPreLegalizeGICombiner.inc"
#undef AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS

namespace {
#define AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
#include "AArch64GenPreLegalizeGICombiner.inc"
#undef AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H

class AArch64PreLegalizerCombinerInfo : public CombinerInfo {
  GISelKnownBits *KB;
  MachineDominatorTree *MDT;
  AArch64GenPreLegalizerCombinerHelperRuleConfig GeneratedRuleCfg;

public:
  AArch64PreLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
                                  GISelKnownBits *KB, MachineDominatorTree *MDT)
      : CombinerInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
                     /*LegalizerInfo*/ nullptr, EnableOpt, OptSize, MinSize),
        KB(KB), MDT(MDT) {
    if (!GeneratedRuleCfg.parseCommandLineOption())
      report_fatal_error("Invalid rule identifier");
  }

  virtual bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
                       MachineIRBuilder &B) const override;
};

bool AArch64PreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
                                              MachineInstr &MI,
                                              MachineIRBuilder &B) const {
  CombinerHelper Helper(Observer, B, KB, MDT);
  AArch64GenPreLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper);

  if (Generated.tryCombineAll(Observer, MI, B))
    return true;

  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  case TargetOpcode::G_CONCAT_VECTORS:
    return Helper.tryCombineConcatVectors(MI);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return Helper.tryCombineShuffleVector(MI);
  case TargetOpcode::G_UADDO:
    return tryToSimplifyUADDO(MI, B, Helper, Observer);
  case TargetOpcode::G_MEMCPY_INLINE:
    return Helper.tryEmitMemcpyInline(MI);
  case TargetOpcode::G_MEMCPY:
  case TargetOpcode::G_MEMMOVE:
  case TargetOpcode::G_MEMSET: {
    // If we're at -O0 set a maxlen of 32 to inline, otherwise let the other
    // heuristics decide.
    unsigned MaxLen = EnableOpt ? 0 : 32;
    // Try to inline memcpy type calls if optimizations are enabled.
    if (Helper.tryCombineMemCpyFamily(MI, MaxLen))
      return true;
    if (Opc == TargetOpcode::G_MEMSET)
      return llvm::AArch64GISelUtils::tryEmitBZero(MI, B, EnableMinSize);
    return false;
  }
  }

  return false;
}

#define AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
#include "AArch64GenPreLegalizeGICombiner.inc"
#undef AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP

// Pass boilerplate
// ================

class AArch64PreLegalizerCombiner : public MachineFunctionPass {
public:
  static char ID;

  AArch64PreLegalizerCombiner();

  StringRef getPassName() const override {
    return "AArch64PreLegalizerCombiner";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;
};
} // end anonymous namespace

void AArch64PreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  AU.addRequired<GISelKnownBitsAnalysis>();
  AU.addPreserved<GISelKnownBitsAnalysis>();
  AU.addRequired<MachineDominatorTree>();
  AU.addPreserved<MachineDominatorTree>();
  AU.addRequired<GISelCSEAnalysisWrapperPass>();
  AU.addPreserved<GISelCSEAnalysisWrapperPass>();
  MachineFunctionPass::getAnalysisUsage(AU);
}

AArch64PreLegalizerCombiner::AArch64PreLegalizerCombiner()
    : MachineFunctionPass(ID) {
  initializeAArch64PreLegalizerCombinerPass(*PassRegistry::getPassRegistry());
}

bool AArch64PreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
  if (MF.getProperties().hasProperty(
          MachineFunctionProperties::Property::FailedISel))
    return false;
  auto &TPC = getAnalysis<TargetPassConfig>();

  // Enable CSE.
  GISelCSEAnalysisWrapper &Wrapper =
      getAnalysis<GISelCSEAnalysisWrapperPass>().getCSEWrapper();
  auto *CSEInfo = &Wrapper.get(TPC.getCSEConfig());

  const Function &F = MF.getFunction();
  bool EnableOpt =
      MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);
  GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
  MachineDominatorTree *MDT = &getAnalysis<MachineDominatorTree>();
  AArch64PreLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
                                         F.hasMinSize(), KB, MDT);
  Combiner C(PCInfo, &TPC);
  return C.combineMachineInstrs(MF, CSEInfo);
}

char AArch64PreLegalizerCombiner::ID = 0;
INITIALIZE_PASS_BEGIN(AArch64PreLegalizerCombiner, DEBUG_TYPE,
                      "Combine AArch64 machine instrs before legalization",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_DEPENDENCY(GISelCSEAnalysisWrapperPass)
INITIALIZE_PASS_END(AArch64PreLegalizerCombiner, DEBUG_TYPE,
                    "Combine AArch64 machine instrs before legalization", false,
                    false)

namespace llvm {
FunctionPass *createAArch64PreLegalizerCombiner() {
  return new AArch64PreLegalizerCombiner();
}
} // end namespace llvm