//=== lib/CodeGen/GlobalISel/AArch64PreLegalizerCombiner.cpp --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass does combining of machine instructions at the generic MI level,
// before the legalizer.
//
//===----------------------------------------------------------------------===//

#include "AArch64GlobalISelUtils.h"
#include "AArch64TargetMachine.h"
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/Debug.h"

#define GET_GICOMBINER_DEPS
#include "AArch64GenPreLegalizeGICombiner.inc"
#undef GET_GICOMBINER_DEPS

#define DEBUG_TYPE "aarch64-prelegalizer-combiner"

using namespace llvm;
using namespace MIPatternMatch;

namespace {

#define GET_GICOMBINER_TYPES
#include "AArch64GenPreLegalizeGICombiner.inc"
#undef GET_GICOMBINER_TYPES

/// Return true if a G_FCONSTANT instruction is known to be better-represented
/// as a G_CONSTANT.
bool matchFConstantToConstant(MachineInstr &MI, MachineRegisterInfo &MRI) {
  assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT);
  Register DstReg = MI.getOperand(0).getReg();
  const unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
  if (DstSize != 32 && DstSize != 64)
    return false;

  // When we're storing a value, it doesn't matter what register bank it's on.
  // Since not all floating point constants can be materialized using a fmov,
  // it makes more sense to just use a GPR.
  return all_of(MRI.use_nodbg_instructions(DstReg),
                [](const MachineInstr &Use) { return Use.mayStore(); });
}

/// Change a G_FCONSTANT into a G_CONSTANT.
void applyFConstantToConstant(MachineInstr &MI) {
  assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT);
  MachineIRBuilder MIB(MI);
  const APFloat &ImmValAPF = MI.getOperand(1).getFPImm()->getValueAPF();
  MIB.buildConstant(MI.getOperand(0).getReg(), ImmValAPF.bitcastToAPInt());
  MI.eraseFromParent();
}

/// Try to match a G_ICMP of a G_TRUNC with zero, in which the truncated bits
/// are sign bits. In this case, we can transform the G_ICMP to directly compare
/// the wide value with a zero.
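/// A sketch of the transform (virtual register names are illustrative):
///
///   %narrow:_(s32) = G_TRUNC %wide:_(s64)
///   %zero:_(s32) = G_CONSTANT i32 0
///   %cmp:_(s1) = G_ICMP intpred(eq), %narrow(s32), %zero
///
/// becomes, when the truncated-away bits of %wide are all sign bits:
///
///   %zero64:_(s64) = G_CONSTANT i64 0
///   %cmp:_(s1) = G_ICMP intpred(eq), %wide(s64), %zero64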
bool matchICmpRedundantTrunc(MachineInstr &MI, MachineRegisterInfo &MRI,
                             GISelKnownBits *KB, Register &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_ICMP && KB);

  auto Pred = (CmpInst::Predicate)MI.getOperand(1).getPredicate();
  if (!ICmpInst::isEquality(Pred))
    return false;

  Register LHS = MI.getOperand(2).getReg();
  LLT LHSTy = MRI.getType(LHS);
  if (!LHSTy.isScalar())
    return false;

  Register RHS = MI.getOperand(3).getReg();
  Register WideReg;

  if (!mi_match(LHS, MRI, m_GTrunc(m_Reg(WideReg))) ||
      !mi_match(RHS, MRI, m_SpecificICst(0)))
    return false;

  LLT WideTy = MRI.getType(WideReg);
  if (KB->computeNumSignBits(WideReg) <=
      WideTy.getSizeInBits() - LHSTy.getSizeInBits())
    return false;

  MatchInfo = WideReg;
  return true;
}

void applyICmpRedundantTrunc(MachineInstr &MI, MachineRegisterInfo &MRI,
                             MachineIRBuilder &Builder,
                             GISelChangeObserver &Observer, Register &WideReg) {
  assert(MI.getOpcode() == TargetOpcode::G_ICMP);

  LLT WideTy = MRI.getType(WideReg);
  // We're going to directly use the wide register as the LHS, and then use an
  // equivalent size zero for RHS.
  Builder.setInstrAndDebugLoc(MI);
  auto WideZero = Builder.buildConstant(WideTy, 0);
  Observer.changingInstr(MI);
  MI.getOperand(2).setReg(WideReg);
  MI.getOperand(3).setReg(WideZero.getReg(0));
  Observer.changedInstr(MI);
}

/// \returns true if it is possible to fold a constant into a G_GLOBAL_VALUE.
///
/// e.g.
///
/// %g = G_GLOBAL_VALUE @x -> %g = G_GLOBAL_VALUE @x + cst
bool matchFoldGlobalOffset(MachineInstr &MI, MachineRegisterInfo &MRI,
                           std::pair<uint64_t, uint64_t> &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_GLOBAL_VALUE);
  MachineFunction &MF = *MI.getMF();
  auto &GlobalOp = MI.getOperand(1);
  auto *GV = GlobalOp.getGlobal();
  if (GV->isThreadLocal())
    return false;

  // Don't allow anything that could represent offsets etc.
  if (MF.getSubtarget<AArch64Subtarget>().ClassifyGlobalReference(
          GV, MF.getTarget()) != AArch64II::MO_NO_FLAG)
    return false;

  // Look for a G_GLOBAL_VALUE only used by G_PTR_ADDs against constants:
  //
  //   %g = G_GLOBAL_VALUE @x
  //   %ptr1 = G_PTR_ADD %g, cst1
  //   %ptr2 = G_PTR_ADD %g, cst2
  //   ...
  //   %ptrN = G_PTR_ADD %g, cstN
  //
  // Identify the *smallest* constant. We want to be able to form this:
  //
  //   %offset_g = G_GLOBAL_VALUE @x + min_cst
  //   %g = G_PTR_ADD %offset_g, -min_cst
  //   %ptr1 = G_PTR_ADD %g, cst1
  //   ...
  Register Dst = MI.getOperand(0).getReg();
  uint64_t MinOffset = -1ull;
  for (auto &UseInstr : MRI.use_nodbg_instructions(Dst)) {
    if (UseInstr.getOpcode() != TargetOpcode::G_PTR_ADD)
      return false;
    auto Cst = getIConstantVRegValWithLookThrough(
        UseInstr.getOperand(2).getReg(), MRI);
    if (!Cst)
      return false;
    MinOffset = std::min(MinOffset, Cst->Value.getZExtValue());
  }

  // Require that the new offset is larger than the existing one to avoid
  // infinite loops.
  uint64_t CurrOffset = GlobalOp.getOffset();
  uint64_t NewOffset = MinOffset + CurrOffset;
  if (NewOffset <= CurrOffset)
    return false;

  // Check whether folding this offset is legal. It must not go out of bounds of
  // the referenced object to avoid violating the code model, and must be
  // smaller than 2^20 because this is the largest offset expressible in all
  // object formats. (The IMAGE_REL_ARM64_PAGEBASE_REL21 relocation in COFF
  // stores an immediate signed 21 bit offset.)
  //
  // This check also prevents us from folding negative offsets, which will end
  // up being treated in the same way as large positive ones. They could also
  // cause code model violations, and aren't really common enough to matter.
  if (NewOffset >= (1 << 20))
    return false;

  Type *T = GV->getValueType();
  if (!T->isSized() ||
      NewOffset > GV->getParent()->getDataLayout().getTypeAllocSize(T))
    return false;
  MatchInfo = std::make_pair(NewOffset, MinOffset);
  return true;
}

void applyFoldGlobalOffset(MachineInstr &MI, MachineRegisterInfo &MRI,
                           MachineIRBuilder &B, GISelChangeObserver &Observer,
                           std::pair<uint64_t, uint64_t> &MatchInfo) {
  // Change:
  //
  //   %g = G_GLOBAL_VALUE @x
  //   %ptr1 = G_PTR_ADD %g, cst1
  //   %ptr2 = G_PTR_ADD %g, cst2
  //   ...
  //   %ptrN = G_PTR_ADD %g, cstN
  //
  // To:
  //
  //   %offset_g = G_GLOBAL_VALUE @x + min_cst
  //   %g = G_PTR_ADD %offset_g, -min_cst
  //   %ptr1 = G_PTR_ADD %g, cst1
  //   ...
  //   %ptrN = G_PTR_ADD %g, cstN
  //
  // Then, the original G_PTR_ADDs should be folded later on so that they look
  // like this:
  //
  //   %ptrN = G_PTR_ADD %offset_g, cstN - min_cst
  uint64_t Offset, MinOffset;
  std::tie(Offset, MinOffset) = MatchInfo;
  B.setInstrAndDebugLoc(*std::next(MI.getIterator()));
  Observer.changingInstr(MI);
  auto &GlobalOp = MI.getOperand(1);
  auto *GV = GlobalOp.getGlobal();
  GlobalOp.ChangeToGA(GV, Offset, GlobalOp.getTargetFlags());
  Register Dst = MI.getOperand(0).getReg();
  Register NewGVDst = MRI.cloneVirtualRegister(Dst);
  MI.getOperand(0).setReg(NewGVDst);
  Observer.changedInstr(MI);
  B.buildPtrAdd(
      Dst, NewGVDst,
      B.buildConstant(LLT::scalar(64), -static_cast<int64_t>(MinOffset)));
}

// Combines vecreduce_add(mul(ext(x), ext(y))) -> vecreduce_add(udot(x, y))
// Or vecreduce_add(ext(x)) -> vecreduce_add(udot(x, 1))
// Similar to performVecReduceAddCombine in SelectionDAG
bool matchExtAddvToUdotAddv(MachineInstr &MI, MachineRegisterInfo &MRI,
                            const AArch64Subtarget &STI,
                            std::tuple<Register, Register, bool> &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_VECREDUCE_ADD &&
         "Expected a G_VECREDUCE_ADD instruction");
  assert(STI.hasDotProd() && "Target should have Dot Product feature");

  MachineInstr *I1 = getDefIgnoringCopies(MI.getOperand(1).getReg(), MRI);
  Register DstReg = MI.getOperand(0).getReg();
  Register MidReg = I1->getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT MidTy = MRI.getType(MidReg);
  if (DstTy.getScalarSizeInBits() != 32 || MidTy.getScalarSizeInBits() != 32)
    return false;

  LLT SrcTy;
  auto I1Opc = I1->getOpcode();
  if (I1Opc == TargetOpcode::G_MUL) {
    // If the result of the G_MUL has more than one use, there is no point in
    // creating a udot instruction.
    if (!MRI.hasOneNonDBGUse(MidReg))
      return false;

    MachineInstr *ExtMI1 =
        getDefIgnoringCopies(I1->getOperand(1).getReg(), MRI);
    MachineInstr *ExtMI2 =
        getDefIgnoringCopies(I1->getOperand(2).getReg(), MRI);
    LLT Ext1DstTy = MRI.getType(ExtMI1->getOperand(0).getReg());
    LLT Ext2DstTy = MRI.getType(ExtMI2->getOperand(0).getReg());

    if (ExtMI1->getOpcode() != ExtMI2->getOpcode() || Ext1DstTy != Ext2DstTy)
      return false;
    I1Opc = ExtMI1->getOpcode();
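    // Both G_MUL operands were extended with the same opcode; record the
    // narrow (pre-extend) sources so the apply step can feed them to the dot
    // product.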
    SrcTy = MRI.getType(ExtMI1->getOperand(1).getReg());
    std::get<0>(MatchInfo) = ExtMI1->getOperand(1).getReg();
    std::get<1>(MatchInfo) = ExtMI2->getOperand(1).getReg();
  } else {
    SrcTy = MRI.getType(I1->getOperand(1).getReg());
    std::get<0>(MatchInfo) = I1->getOperand(1).getReg();
    std::get<1>(MatchInfo) = 0;
  }

  if (I1Opc == TargetOpcode::G_ZEXT)
    std::get<2>(MatchInfo) = 0;
  else if (I1Opc == TargetOpcode::G_SEXT)
    std::get<2>(MatchInfo) = 1;
  else
    return false;

  if (SrcTy.getScalarSizeInBits() != 8 || SrcTy.getNumElements() % 8 != 0)
    return false;

  return true;
}

void applyExtAddvToUdotAddv(MachineInstr &MI, MachineRegisterInfo &MRI,
                            MachineIRBuilder &Builder,
                            GISelChangeObserver &Observer,
                            const AArch64Subtarget &STI,
                            std::tuple<Register, Register, bool> &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_VECREDUCE_ADD &&
         "Expected a G_VECREDUCE_ADD instruction");
  assert(STI.hasDotProd() && "Target should have Dot Product feature");

  // Initialise the variables
  unsigned DotOpcode =
      std::get<2>(MatchInfo) ? AArch64::G_SDOT : AArch64::G_UDOT;
  Register Ext1SrcReg = std::get<0>(MatchInfo);

  // If there is only one source register, use a vector of 1s as the second
  // source register so that the dot product reduces to a plain sum.
  Register Ext2SrcReg;
  if (std::get<1>(MatchInfo) == 0)
    Ext2SrcReg = Builder.buildConstant(MRI.getType(Ext1SrcReg), 1)
                     ->getOperand(0)
                     .getReg();
  else
    Ext2SrcReg = std::get<1>(MatchInfo);

  // Find out how many DOT instructions are needed
  LLT SrcTy = MRI.getType(Ext1SrcReg);
  LLT MidTy;
  unsigned NumOfDotMI;
  if (SrcTy.getNumElements() % 16 == 0) {
    NumOfDotMI = SrcTy.getNumElements() / 16;
    MidTy = LLT::fixed_vector(4, 32);
  } else if (SrcTy.getNumElements() % 8 == 0) {
    NumOfDotMI = SrcTy.getNumElements() / 8;
    MidTy = LLT::fixed_vector(2, 32);
  } else {
    llvm_unreachable("Source type number of elements is not multiple of 8");
  }

  // Handle the case where a single DOT instruction is needed
  if (NumOfDotMI == 1) {
    auto Zeroes = Builder.buildConstant(MidTy, 0)->getOperand(0).getReg();
    auto Dot = Builder.buildInstr(DotOpcode, {MidTy},
                                  {Zeroes, Ext1SrcReg, Ext2SrcReg});
    Builder.buildVecReduceAdd(MI.getOperand(0), Dot->getOperand(0));
  } else {
    // Otherwise, unmerge the sources into v16i8 pieces, padding any trailing
    // v8i8 piece with 0s up to a v16i8.
    SmallVector<Register, 4> Ext1UnmergeReg;
    SmallVector<Register, 4> Ext2UnmergeReg;
    if (SrcTy.getNumElements() % 16 != 0) {
      SmallVector<Register> Leftover1;
      SmallVector<Register> Leftover2;

      // Split the elements into v16i8 and v8i8
      LLT MainTy = LLT::fixed_vector(16, 8);
      LLT LeftoverTy1, LeftoverTy2;
      if ((!extractParts(Ext1SrcReg, MRI.getType(Ext1SrcReg), MainTy,
                         LeftoverTy1, Ext1UnmergeReg, Leftover1, Builder,
                         MRI)) ||
          (!extractParts(Ext2SrcReg, MRI.getType(Ext2SrcReg), MainTy,
                         LeftoverTy2, Ext2UnmergeReg, Leftover2, Builder,
                         MRI))) {
        llvm_unreachable("Unable to split this vector properly");
      }

      // Pad the leftover v8i8 vector with a v8i8 register of 0s
      Register v8Zeroes = Builder.buildConstant(LLT::fixed_vector(8, 8), 0)
                              ->getOperand(0)
                              .getReg();

      Ext1UnmergeReg.push_back(
          Builder
              .buildMergeLikeInstr(LLT::fixed_vector(16, 8),
                                   {Leftover1[0], v8Zeroes})
              .getReg(0));
      Ext2UnmergeReg.push_back(
          Builder
              .buildMergeLikeInstr(LLT::fixed_vector(16, 8),
                                   {Leftover2[0], v8Zeroes})
              .getReg(0));

    } else {
      // Unmerge the source vectors to v16i8
      unsigned SrcNumElts = SrcTy.getNumElements();
      extractParts(Ext1SrcReg, LLT::fixed_vector(16, 8), SrcNumElts / 16,
                   Ext1UnmergeReg, Builder, MRI);
      extractParts(Ext2SrcReg, LLT::fixed_vector(16, 8), SrcNumElts / 16,
                   Ext2UnmergeReg, Builder, MRI);
    }

    // Build the UDOT instructions
    SmallVector<Register, 2> DotReg;
    unsigned NumElements = 0;
    for (unsigned i = 0; i < Ext1UnmergeReg.size(); i++) {
      LLT ZeroesLLT;
      // Check if it is 16 or 8 elements. Set Zeroes to the corresponding size.
      if (MRI.getType(Ext1UnmergeReg[i]).getNumElements() == 16) {
        ZeroesLLT = LLT::fixed_vector(4, 32);
        NumElements += 4;
      } else {
        ZeroesLLT = LLT::fixed_vector(2, 32);
        NumElements += 2;
      }
      auto Zeroes = Builder.buildConstant(ZeroesLLT, 0)->getOperand(0).getReg();
      DotReg.push_back(
          Builder
              .buildInstr(DotOpcode, {MRI.getType(Zeroes)},
                          {Zeroes, Ext1UnmergeReg[i], Ext2UnmergeReg[i]})
              .getReg(0));
    }

    // Merge the output
    auto ConcatMI =
        Builder.buildConcatVectors(LLT::fixed_vector(NumElements, 32), DotReg);

    // Put it through a vector reduction
    Builder.buildVecReduceAdd(MI.getOperand(0).getReg(),
                              ConcatMI->getOperand(0).getReg());
  }

  // Erase the dead instructions
  MI.eraseFromParent();
}

// Matches {U/S}ADDV(ext(x)) => {U/S}ADDLV(x)
// Ensure that the type coming from the extend instruction is the right size
bool matchExtUaddvToUaddlv(MachineInstr &MI, MachineRegisterInfo &MRI,
                           std::pair<Register, bool> &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_VECREDUCE_ADD &&
         "Expected G_VECREDUCE_ADD Opcode");

  // Check if the last instruction is an extend
  MachineInstr *ExtMI = getDefIgnoringCopies(MI.getOperand(1).getReg(), MRI);
  auto ExtOpc = ExtMI->getOpcode();

  if (ExtOpc == TargetOpcode::G_ZEXT)
    std::get<1>(MatchInfo) = 0;
  else if (ExtOpc == TargetOpcode::G_SEXT)
    std::get<1>(MatchInfo) = 1;
  else
    return false;

  // Check if the source register is a valid type
  Register ExtSrcReg = ExtMI->getOperand(1).getReg();
  LLT ExtSrcTy = MRI.getType(ExtSrcReg);
  LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
  if ((DstTy.getScalarSizeInBits() == 16 &&
       ExtSrcTy.getNumElements() % 8 == 0 && ExtSrcTy.getNumElements() < 256) ||
      (DstTy.getScalarSizeInBits() == 32 &&
       ExtSrcTy.getNumElements() % 4 == 0) ||
      (DstTy.getScalarSizeInBits() == 64 &&
       ExtSrcTy.getNumElements() % 4 == 0)) {
    std::get<0>(MatchInfo) = ExtSrcReg;
    return true;
  }
  return false;
}

void applyExtUaddvToUaddlv(MachineInstr &MI, MachineRegisterInfo &MRI,
                           MachineIRBuilder &B, GISelChangeObserver &Observer,
                           std::pair<Register, bool> &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_VECREDUCE_ADD &&
         "Expected G_VECREDUCE_ADD Opcode");

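  // MatchInfo carries the pre-extend source register and whether the extend
  // was signed; choose the corresponding long add-across-vector opcode.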
  unsigned Opc = std::get<1>(MatchInfo) ? AArch64::G_SADDLV : AArch64::G_UADDLV;
  Register SrcReg = std::get<0>(MatchInfo);
  Register DstReg = MI.getOperand(0).getReg();
  LLT SrcTy = MRI.getType(SrcReg);
  LLT DstTy = MRI.getType(DstReg);

  // If SrcTy has more elements than expected, split them into multiple
  // instructions and sum the results
  LLT MainTy;
  SmallVector<Register, 1> WorkingRegisters;
  unsigned SrcScalSize = SrcTy.getScalarSizeInBits();
  unsigned SrcNumElem = SrcTy.getNumElements();
  if ((SrcScalSize == 8 && SrcNumElem > 16) ||
      (SrcScalSize == 16 && SrcNumElem > 8) ||
      (SrcScalSize == 32 && SrcNumElem > 4)) {

    LLT LeftoverTy;
    SmallVector<Register, 4> LeftoverRegs;
    if (SrcScalSize == 8)
      MainTy = LLT::fixed_vector(16, 8);
    else if (SrcScalSize == 16)
      MainTy = LLT::fixed_vector(8, 16);
    else if (SrcScalSize == 32)
      MainTy = LLT::fixed_vector(4, 32);
    else
      llvm_unreachable("Source's Scalar Size not supported");

    // Extract the parts, put each extracted source through U/SADDLV, and
    // collect the values in a small vector.
    extractParts(SrcReg, SrcTy, MainTy, LeftoverTy, WorkingRegisters,
                 LeftoverRegs, B, MRI);
    for (unsigned I = 0; I < LeftoverRegs.size(); I++) {
      WorkingRegisters.push_back(LeftoverRegs[I]);
    }
  } else {
    WorkingRegisters.push_back(SrcReg);
    MainTy = SrcTy;
  }

  unsigned MidScalarSize = MainTy.getScalarSizeInBits() * 2;
  LLT MidScalarLLT = LLT::scalar(MidScalarSize);
  Register zeroReg = B.buildConstant(LLT::scalar(64), 0).getReg(0);
  for (unsigned I = 0; I < WorkingRegisters.size(); I++) {
    // If the number of elements is too small to build an instruction, extend
    // its size before applying addlv
    LLT WorkingRegTy = MRI.getType(WorkingRegisters[I]);
    if ((WorkingRegTy.getScalarSizeInBits() == 8) &&
        (WorkingRegTy.getNumElements() == 4)) {
      WorkingRegisters[I] =
          B.buildInstr(std::get<1>(MatchInfo) ? TargetOpcode::G_SEXT
                                              : TargetOpcode::G_ZEXT,
                       {LLT::fixed_vector(4, 16)}, {WorkingRegisters[I]})
              .getReg(0);
    }

    // Generate the {U/S}ADDLV instruction, whose output element size is always
    // double the source's scalar size.
    LLT addlvTy = MidScalarSize <= 32 ? LLT::fixed_vector(4, 32)
                                      : LLT::fixed_vector(2, 64);
    Register addlvReg =
        B.buildInstr(Opc, {addlvTy}, {WorkingRegisters[I]}).getReg(0);

    // The output from {U/S}ADDLV gets placed in the lowest lane of a v4i32 or
    // v2i64 register.
    //   i16 and i32 results use a v4i32 register;
    //   i64 results use a v2i64 register.
    // We therefore have to extract (and possibly truncate) the value to the
    // right type.
    if (MidScalarSize == 32 || MidScalarSize == 64) {
      WorkingRegisters[I] = B.buildInstr(AArch64::G_EXTRACT_VECTOR_ELT,
                                         {MidScalarLLT}, {addlvReg, zeroReg})
                                .getReg(0);
    } else {
      Register extractReg = B.buildInstr(AArch64::G_EXTRACT_VECTOR_ELT,
                                         {LLT::scalar(32)}, {addlvReg, zeroReg})
                                .getReg(0);
      WorkingRegisters[I] =
          B.buildTrunc({MidScalarLLT}, {extractReg}).getReg(0);
    }
  }

  Register outReg;
  if (WorkingRegisters.size() > 1) {
    outReg = B.buildAdd(MidScalarLLT, WorkingRegisters[0], WorkingRegisters[1])
                 .getReg(0);
    for (unsigned I = 2; I < WorkingRegisters.size(); I++) {
      outReg = B.buildAdd(MidScalarLLT, outReg, WorkingRegisters[I]).getReg(0);
    }
  } else {
    outReg = WorkingRegisters[0];
  }

  if (DstTy.getScalarSizeInBits() > MidScalarSize) {
    // Handle the scalar value if the DstTy's scalar size is more than double
    // the source's scalar size.
    B.buildInstr(std::get<1>(MatchInfo) ? TargetOpcode::G_SEXT
                                        : TargetOpcode::G_ZEXT,
                 {DstReg}, {outReg});
  } else {
    B.buildCopy(DstReg, outReg);
  }

  MI.eraseFromParent();
}

bool tryToSimplifyUADDO(MachineInstr &MI, MachineIRBuilder &B,
                        CombinerHelper &Helper, GISelChangeObserver &Observer) {
  // Try to simplify a G_UADDO with 8 or 16 bit operands to a wide G_ADD and a
  // TBNZ if the result is only used in the no-overflow case. It is restricted
  // to cases where we know that the high-bits of the operands are 0. If there's
  // an overflow, then the 9th or 17th bit must be set, which can be checked
  // using TBNZ.
  //
  // Change (for UADDOs on 8 and 16 bits):
  //
  //   %z0 = G_ASSERT_ZEXT _
  //   %op0 = G_TRUNC %z0
  //   %z1 = G_ASSERT_ZEXT _
  //   %op1 = G_TRUNC %z1
  //   %val, %cond = G_UADDO %op0, %op1
  //   G_BRCOND %cond, %error.bb
  //
  // error.bb:
  //   (no successors and no uses of %val)
  //
  // To:
  //
  //   %z0 = G_ASSERT_ZEXT _
  //   %z1 = G_ASSERT_ZEXT _
  //   %add = G_ADD %z0, %z1
  //   %val = G_TRUNC %add
  //   %bit = G_AND %add, 1 << scalar-size-in-bits(%op1)
  //   %cond = G_ICMP NE, %bit, 0
  //   G_BRCOND %cond, %error.bb

  auto &MRI = *B.getMRI();

  MachineOperand *DefOp0 = MRI.getOneDef(MI.getOperand(2).getReg());
  MachineOperand *DefOp1 = MRI.getOneDef(MI.getOperand(3).getReg());
  Register Op0Wide;
  Register Op1Wide;
  if (!mi_match(DefOp0->getParent(), MRI, m_GTrunc(m_Reg(Op0Wide))) ||
      !mi_match(DefOp1->getParent(), MRI, m_GTrunc(m_Reg(Op1Wide))))
    return false;
  LLT WideTy0 = MRI.getType(Op0Wide);
  LLT WideTy1 = MRI.getType(Op1Wide);
  Register ResVal = MI.getOperand(0).getReg();
  LLT OpTy = MRI.getType(ResVal);
  MachineInstr *Op0WideDef = MRI.getVRegDef(Op0Wide);
  MachineInstr *Op1WideDef = MRI.getVRegDef(Op1Wide);

  unsigned OpTySize = OpTy.getScalarSizeInBits();
  // First check that the G_TRUNC feeding the G_UADDO are no-ops, because the
  // inputs have been zero-extended.
  if (Op0WideDef->getOpcode() != TargetOpcode::G_ASSERT_ZEXT ||
      Op1WideDef->getOpcode() != TargetOpcode::G_ASSERT_ZEXT ||
      OpTySize != Op0WideDef->getOperand(2).getImm() ||
      OpTySize != Op1WideDef->getOperand(2).getImm())
    return false;

  // Only scalar UADDO with either 8 or 16 bit operands is handled.
  if (!WideTy0.isScalar() || !WideTy1.isScalar() || WideTy0 != WideTy1 ||
      OpTySize >= WideTy0.getScalarSizeInBits() ||
      (OpTySize != 8 && OpTySize != 16))
    return false;

  // The overflow-status result must be used by a branch only.
  Register ResStatus = MI.getOperand(1).getReg();
  if (!MRI.hasOneNonDBGUse(ResStatus))
    return false;
  MachineInstr *CondUser = &*MRI.use_instr_nodbg_begin(ResStatus);
  if (CondUser->getOpcode() != TargetOpcode::G_BRCOND)
    return false;

  // Make sure the computed result is only used in the no-overflow blocks.
  MachineBasicBlock *CurrentMBB = MI.getParent();
  MachineBasicBlock *FailMBB = CondUser->getOperand(1).getMBB();
  if (!FailMBB->succ_empty() || CondUser->getParent() != CurrentMBB)
    return false;
  if (any_of(MRI.use_nodbg_instructions(ResVal),
             [&MI, FailMBB, CurrentMBB](MachineInstr &I) {
               return &MI != &I &&
                      (I.getParent() == FailMBB || I.getParent() == CurrentMBB);
             }))
    return false;

  // Remove the G_UADDO.
  B.setInstrAndDebugLoc(*MI.getNextNode());
  MI.eraseFromParent();

  // Emit wide add.
  Register AddDst = MRI.cloneVirtualRegister(Op0Wide);
  B.buildInstr(TargetOpcode::G_ADD, {AddDst}, {Op0Wide, Op1Wide});

  // Emit check of the 9th or 17th bit and update users (the branch). This will
  // later be folded to TBNZ.
  Register CondBit = MRI.cloneVirtualRegister(Op0Wide);
  B.buildAnd(
      CondBit, AddDst,
      B.buildConstant(LLT::scalar(32), OpTySize == 8 ? 1 << 8 : 1 << 16));
  B.buildICmp(CmpInst::ICMP_NE, ResStatus, CondBit,
              B.buildConstant(LLT::scalar(32), 0));

  // Update ZExt users of the result value. Because all uses are in the
  // no-overflow case, we know that the top bits are 0 and we can ignore ZExts.
  B.buildZExtOrTrunc(ResVal, AddDst);
  for (MachineOperand &U : make_early_inc_range(MRI.use_operands(ResVal))) {
    Register WideReg;
    if (mi_match(U.getParent(), MRI, m_GZExt(m_Reg(WideReg)))) {
      auto OldR = U.getParent()->getOperand(0).getReg();
      Observer.erasingInstr(*U.getParent());
      U.getParent()->eraseFromParent();
      Helper.replaceRegWith(MRI, OldR, AddDst);
    }
  }

  return true;
}

class AArch64PreLegalizerCombinerImpl : public Combiner {
protected:
  // TODO: Make CombinerHelper methods const.
  mutable CombinerHelper Helper;
  const AArch64PreLegalizerCombinerImplRuleConfig &RuleConfig;
  const AArch64Subtarget &STI;

public:
  AArch64PreLegalizerCombinerImpl(
      MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
      GISelKnownBits &KB, GISelCSEInfo *CSEInfo,
      const AArch64PreLegalizerCombinerImplRuleConfig &RuleConfig,
      const AArch64Subtarget &STI, MachineDominatorTree *MDT,
      const LegalizerInfo *LI);

  static const char *getName() { return "AArch64PreLegalizerCombiner"; }

  bool tryCombineAll(MachineInstr &I) const override;

  bool tryCombineAllImpl(MachineInstr &I) const;

private:
#define GET_GICOMBINER_CLASS_MEMBERS
#include "AArch64GenPreLegalizeGICombiner.inc"
#undef GET_GICOMBINER_CLASS_MEMBERS
};

#define GET_GICOMBINER_IMPL
#include "AArch64GenPreLegalizeGICombiner.inc"
#undef GET_GICOMBINER_IMPL

AArch64PreLegalizerCombinerImpl::AArch64PreLegalizerCombinerImpl(
    MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
    GISelKnownBits &KB, GISelCSEInfo *CSEInfo,
    const AArch64PreLegalizerCombinerImplRuleConfig &RuleConfig,
    const AArch64Subtarget &STI, MachineDominatorTree *MDT,
    const LegalizerInfo *LI)
    : Combiner(MF, CInfo, TPC, &KB, CSEInfo),
      Helper(Observer, B, /*IsPreLegalize*/ true, &KB, MDT, LI),
      RuleConfig(RuleConfig), STI(STI),
#define GET_GICOMBINER_CONSTRUCTOR_INITS
#include "AArch64GenPreLegalizeGICombiner.inc"
#undef GET_GICOMBINER_CONSTRUCTOR_INITS
{
}

bool AArch64PreLegalizerCombinerImpl::tryCombineAll(MachineInstr &MI) const {
  if (tryCombineAllImpl(MI))
    return true;

  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  case TargetOpcode::G_CONCAT_VECTORS:
    return Helper.tryCombineConcatVectors(MI);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return Helper.tryCombineShuffleVector(MI);
  case TargetOpcode::G_UADDO:
    return tryToSimplifyUADDO(MI, B, Helper, Observer);
  case TargetOpcode::G_MEMCPY_INLINE:
    return Helper.tryEmitMemcpyInline(MI);
  case TargetOpcode::G_MEMCPY:
  case TargetOpcode::G_MEMMOVE:
  case TargetOpcode::G_MEMSET: {
    // If we're at -O0 set a maxlen of 32 to inline, otherwise let the other
    // heuristics decide.
    unsigned MaxLen = CInfo.EnableOpt ? 0 : 32;
    // Try to inline memcpy type calls if optimizations are enabled.
    if (Helper.tryCombineMemCpyFamily(MI, MaxLen))
      return true;
    if (Opc == TargetOpcode::G_MEMSET)
      return llvm::AArch64GISelUtils::tryEmitBZero(MI, B, CInfo.EnableMinSize);
    return false;
  }
  }

  return false;
}

// Pass boilerplate
// ================

class AArch64PreLegalizerCombiner : public MachineFunctionPass {
public:
  static char ID;

  AArch64PreLegalizerCombiner();

  StringRef getPassName() const override {
    return "AArch64PreLegalizerCombiner";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  AArch64PreLegalizerCombinerImplRuleConfig RuleConfig;
};
} // end anonymous namespace

void AArch64PreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  AU.addRequired<GISelKnownBitsAnalysis>();
  AU.addPreserved<GISelKnownBitsAnalysis>();
  AU.addRequired<MachineDominatorTree>();
  AU.addPreserved<MachineDominatorTree>();
  AU.addRequired<GISelCSEAnalysisWrapperPass>();
  AU.addPreserved<GISelCSEAnalysisWrapperPass>();
  MachineFunctionPass::getAnalysisUsage(AU);
}

AArch64PreLegalizerCombiner::AArch64PreLegalizerCombiner()
    : MachineFunctionPass(ID) {
  initializeAArch64PreLegalizerCombinerPass(*PassRegistry::getPassRegistry());

  if (!RuleConfig.parseCommandLineOption())
    report_fatal_error("Invalid rule identifier");
}

bool AArch64PreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
  if (MF.getProperties().hasProperty(
          MachineFunctionProperties::Property::FailedISel))
    return false;
  auto &TPC = getAnalysis<TargetPassConfig>();

  // Enable CSE.
  GISelCSEAnalysisWrapper &Wrapper =
      getAnalysis<GISelCSEAnalysisWrapperPass>().getCSEWrapper();
  auto *CSEInfo = &Wrapper.get(TPC.getCSEConfig());

  const AArch64Subtarget &ST = MF.getSubtarget<AArch64Subtarget>();
  const auto *LI = ST.getLegalizerInfo();

  const Function &F = MF.getFunction();
  bool EnableOpt =
      MF.getTarget().getOptLevel() != CodeGenOptLevel::None && !skipFunction(F);
  GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
  MachineDominatorTree *MDT = &getAnalysis<MachineDominatorTree>();
  CombinerInfo CInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
                     /*LegalizerInfo*/ nullptr, EnableOpt, F.hasOptSize(),
                     F.hasMinSize());
  AArch64PreLegalizerCombinerImpl Impl(MF, CInfo, &TPC, *KB, CSEInfo,
                                       RuleConfig, ST, MDT, LI);
  return Impl.combineMachineInstrs();
}

char AArch64PreLegalizerCombiner::ID = 0;
INITIALIZE_PASS_BEGIN(AArch64PreLegalizerCombiner, DEBUG_TYPE,
                      "Combine AArch64 machine instrs before legalization",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_DEPENDENCY(GISelCSEAnalysisWrapperPass)
INITIALIZE_PASS_END(AArch64PreLegalizerCombiner, DEBUG_TYPE,
                    "Combine AArch64 machine instrs before legalization", false,
                    false)

namespace llvm {
FunctionPass *createAArch64PreLegalizerCombiner() {
  return new AArch64PreLegalizerCombiner();
}
} // end namespace llvm