xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp (revision e3f4a63af63bea70bc86b6c790b14aa5ee99fcd0)
1 //=======- GCNDPPCombine.cpp - optimization for DPP instructions ---==========//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
// The pass combines a V_MOV_B32_dpp instruction with its VALU uses as a DPP
// src0 operand. If any of the use instructions cannot be combined with the
// mov, the whole sequence is reverted.
11 //
12 // $old = ...
13 // $dpp_value = V_MOV_B32_dpp $old, $vgpr_to_be_read_from_other_lane,
14 //                            dpp_controls..., $row_mask, $bank_mask, $bound_ctrl
15 // $res = VALU $dpp_value [, src1]
16 //
17 // to
18 //
19 // $res = VALU_DPP $combined_old, $vgpr_to_be_read_from_other_lane, [src1,]
20 //                 dpp_controls..., $row_mask, $bank_mask, $combined_bound_ctrl
21 //
22 // Combining rules :
23 //
24 // if $row_mask and $bank_mask are fully enabled (0xF) and
25 //    $bound_ctrl==DPP_BOUND_ZERO or $old==0
26 // -> $combined_old = undef,
27 //    $combined_bound_ctrl = DPP_BOUND_ZERO
28 //
29 // if the VALU op is binary and
30 //    $bound_ctrl==DPP_BOUND_OFF and
31 //    $old==identity value (immediate) for the VALU op
32 // -> $combined_old = src1,
33 //    $combined_bound_ctrl = DPP_BOUND_OFF
34 //
35 // Otherwise cancel.
36 //
37 // The mov_dpp instruction should reside in the same BB as all its uses
38 //===----------------------------------------------------------------------===//
39 
40 #include "GCNDPPCombine.h"
41 #include "AMDGPU.h"
42 #include "GCNSubtarget.h"
43 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
44 #include "llvm/ADT/Statistic.h"
45 #include "llvm/CodeGen/MachineFunctionPass.h"
46 
47 using namespace llvm;
48 
49 #define DEBUG_TYPE "gcn-dpp-combine"
50 
51 STATISTIC(NumDPPMovsCombined, "Number of DPP moves combined.");
52 
53 namespace {
54 
// Worker class implementing the DPP-mov combining; shared by the legacy
// pass wrapper and the new pass manager pass below.
class GCNDPPCombine {
  MachineRegisterInfo *MRI;
  const SIInstrInfo *TII;
  const GCNSubtarget *ST;

  using RegSubRegPair = TargetInstrInfo::RegSubRegPair;

  // Tracks the definition of OldOpnd's register and returns the initializing
  // immediate, nullptr if the register is undef (IMPLICIT_DEF), or the
  // operand itself otherwise.
  MachineOperand *getOldOpndValue(MachineOperand &OldOpnd) const;

  // Front-end overload: applies the identity-value combining rule when
  // OldOpnd is an immediate, then delegates to the overload below.
  MachineInstr *createDPPInst(MachineInstr &OrigMI, MachineInstr &MovMI,
                              RegSubRegPair CombOldVGPR,
                              MachineOperand *OldOpnd, bool CombBCZ,
                              bool IsShrinkable) const;

  // Builds the DPP variant of OrigMI, transferring its operands and the DPP
  // controls of MovMI; returns nullptr if any operand would be illegal.
  MachineInstr *createDPPInst(MachineInstr &OrigMI, MachineInstr &MovMI,
                              RegSubRegPair CombOldVGPR, bool CombBCZ,
                              bool IsShrinkable) const;

  // Returns true if MI lacks the OpndName immediate operand, or if its
  // immediate (masked by Mask) equals Value.
  bool hasNoImmOrEqual(MachineInstr &MI, AMDGPU::OpName OpndName, int64_t Value,
                       int64_t Mask = -1) const;

  // Attempts to fold the DPP mov MI into all of its VALU uses; returns true
  // on success (the mov and original uses are erased).
  bool combineDPPMov(MachineInstr &MI) const;

  // Maps Op to a DPP opcode (32-bit form preferred, VOP3 DPP as fallback);
  // returns -1 when no usable DPP encoding exists.
  int getDPPOp(unsigned Op, bool IsShrinkable) const;
  // Returns true if the VOP3 instruction MI may be shrunk to e32 so that a
  // 32-bit DPP form can be used.
  bool isShrinkable(MachineInstr &MI) const;

public:
  bool run(MachineFunction &MF);
};
84 
// Legacy pass manager wrapper around GCNDPPCombine.
class GCNDPPCombineLegacy : public MachineFunctionPass {
public:
  static char ID;

  GCNDPPCombineLegacy() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "GCN DPP Combine"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // Only instructions within blocks are rewritten; the CFG is untouched.
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  MachineFunctionProperties getRequiredProperties() const override {
    // The combine relies on SSA form (single defs, use-list walks).
    return MachineFunctionProperties().setIsSSA();
  }
};
104 
105 } // end anonymous namespace
106 
INITIALIZE_PASS(GCNDPPCombineLegacy, DEBUG_TYPE, "GCN DPP Combine", false,
                false)

// Pass identification for the legacy pass manager.
char GCNDPPCombineLegacy::ID = 0;

char &llvm::GCNDPPCombineLegacyID = GCNDPPCombineLegacy::ID;

/// Factory for the legacy-PM wrapper pass.
FunctionPass *llvm::createGCNDPPCombinePass() {
  return new GCNDPPCombineLegacy();
}
117 
118 bool GCNDPPCombine::isShrinkable(MachineInstr &MI) const {
119   unsigned Op = MI.getOpcode();
120   if (!TII->isVOP3(Op)) {
121     return false;
122   }
123   if (!TII->hasVALU32BitEncoding(Op)) {
124     LLVM_DEBUG(dbgs() << "  Inst hasn't e32 equivalent\n");
125     return false;
126   }
127   // Do not shrink True16 instructions pre-RA to avoid the restriction in
128   // register allocation from only being able to use 128 VGPRs
129   if (AMDGPU::isTrue16Inst(Op))
130     return false;
131   if (const auto *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
132     // Give up if there are any uses of the sdst in carry-out or VOPC.
133     // The shrunken form of the instruction would write it to vcc instead of to
134     // a virtual register. If we rewrote the uses the shrinking would be
135     // possible.
136     if (!MRI->use_nodbg_empty(SDst->getReg()))
137       return false;
138   }
139   // check if other than abs|neg modifiers are set (opsel for example)
140   const int64_t Mask = ~(SISrcMods::ABS | SISrcMods::NEG);
141   if (!hasNoImmOrEqual(MI, AMDGPU::OpName::src0_modifiers, 0, Mask) ||
142       !hasNoImmOrEqual(MI, AMDGPU::OpName::src1_modifiers, 0, Mask) ||
143       !hasNoImmOrEqual(MI, AMDGPU::OpName::clamp, 0) ||
144       !hasNoImmOrEqual(MI, AMDGPU::OpName::omod, 0) ||
145       !hasNoImmOrEqual(MI, AMDGPU::OpName::byte_sel, 0)) {
146     LLVM_DEBUG(dbgs() << "  Inst has non-default modifiers\n");
147     return false;
148   }
149   return true;
150 }
151 
152 int GCNDPPCombine::getDPPOp(unsigned Op, bool IsShrinkable) const {
153   int DPP32 = AMDGPU::getDPPOp32(Op);
154   if (IsShrinkable) {
155     assert(DPP32 == -1);
156     int E32 = AMDGPU::getVOPe32(Op);
157     DPP32 = (E32 == -1) ? -1 : AMDGPU::getDPPOp32(E32);
158   }
159   if (DPP32 != -1 && TII->pseudoToMCOpcode(DPP32) != -1)
160     return DPP32;
161   int DPP64 = -1;
162   if (ST->hasVOP3DPP())
163     DPP64 = AMDGPU::getDPPOp64(Op);
164   if (DPP64 != -1 && TII->pseudoToMCOpcode(DPP64) != -1)
165     return DPP64;
166   return -1;
167 }
168 
169 // tracks the register operand definition and returns:
170 //   1. immediate operand used to initialize the register if found
171 //   2. nullptr if the register operand is undef
172 //   3. the operand itself otherwise
173 MachineOperand *GCNDPPCombine::getOldOpndValue(MachineOperand &OldOpnd) const {
174   auto *Def = getVRegSubRegDef(getRegSubRegPair(OldOpnd), *MRI);
175   if (!Def)
176     return nullptr;
177 
178   switch(Def->getOpcode()) {
179   default: break;
180   case AMDGPU::IMPLICIT_DEF:
181     return nullptr;
182   case AMDGPU::COPY:
183   case AMDGPU::V_MOV_B32_e32:
184   case AMDGPU::V_MOV_B64_PSEUDO:
185   case AMDGPU::V_MOV_B64_e32:
186   case AMDGPU::V_MOV_B64_e64: {
187     auto &Op1 = Def->getOperand(1);
188     if (Op1.isImm())
189       return &Op1;
190     break;
191   }
192   }
193   return &OldOpnd;
194 }
195 
196 [[maybe_unused]] static unsigned getOperandSize(MachineInstr &MI, unsigned Idx,
197                                MachineRegisterInfo &MRI) {
198   int16_t RegClass = MI.getDesc().operands()[Idx].RegClass;
199   if (RegClass == -1)
200     return 0;
201 
202   const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
203   return TRI->getRegSizeInBits(*TRI->getRegClass(RegClass));
204 }
205 
// Builds the DPP variant of \p OrigMI: the new instruction takes OrigMI's
// operands plus the dpp_ctrl/row_mask/bank_mask controls of \p MovMI, with
// \p CombOldVGPR as the 'old' operand and \p CombBCZ as bound_ctrl.
// Operands are appended strictly in encoding order; any illegal operand
// aborts and erases the partially-built instruction, returning nullptr.
MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
                                           MachineInstr &MovMI,
                                           RegSubRegPair CombOldVGPR,
                                           bool CombBCZ,
                                           bool IsShrinkable) const {
  assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp ||
         MovMI.getOpcode() == AMDGPU::V_MOV_B64_dpp ||
         MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);

  bool HasVOP3DPP = ST->hasVOP3DPP();
  auto OrigOp = OrigMI.getOpcode();
  if (ST->useRealTrue16Insts() && AMDGPU::isTrue16Inst(OrigOp)) {
    LLVM_DEBUG(
        dbgs() << "  failed: Did not expect any 16-bit uses of dpp values\n");
    return nullptr;
  }
  auto DPPOp = getDPPOp(OrigOp, IsShrinkable);
  if (DPPOp == -1) {
    LLVM_DEBUG(dbgs() << "  failed: no DPP opcode\n");
    return nullptr;
  }
  int OrigOpE32 = AMDGPU::getVOPe32(OrigOp);
  // Prior checks cover Mask with VOPC condition, but not on purpose
  auto *RowMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask);
  assert(RowMaskOpnd && RowMaskOpnd->isImm());
  auto *BankMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask);
  assert(BankMaskOpnd && BankMaskOpnd->isImm());
  const bool MaskAllLanes =
      RowMaskOpnd->getImm() == 0xF && BankMaskOpnd->getImm() == 0xF;
  (void)MaskAllLanes;
  assert((MaskAllLanes ||
          !(TII->isVOPC(DPPOp) || (TII->isVOP3(DPPOp) && OrigOpE32 != -1 &&
                                   TII->isVOPC(OrigOpE32)))) &&
         "VOPC cannot form DPP unless mask is full");

  auto DPPInst = BuildMI(*OrigMI.getParent(), OrigMI,
                         OrigMI.getDebugLoc(), TII->get(DPPOp))
    .setMIFlags(OrigMI.getFlags());

  // do/break is used as a structured bailout: on any failure, break out and
  // erase the half-built instruction below.
  bool Fail = false;
  do {
    // NumOperands tracks the index of the next operand to append so that
    // explicit operand positions can be asserted against the DPP opcode's
    // operand table.
    int NumOperands = 0;
    if (auto *Dst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst)) {
      DPPInst.add(*Dst);
      ++NumOperands;
    }
    if (auto *SDst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::sdst)) {
      if (TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, SDst)) {
        DPPInst.add(*SDst);
        ++NumOperands;
      }
      // If we shrunk a 64bit vop3b to 32bits, just ignore the sdst
    }

    const int OldIdx = AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::old);
    if (OldIdx != -1) {
      assert(OldIdx == NumOperands);
      assert(isOfRegClass(
          CombOldVGPR,
          *MRI->getRegClass(
              TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst)->getReg()),
          *MRI));
      // An undef 'old' (no reaching def) must carry the Undef reg state.
      auto *Def = getVRegSubRegDef(CombOldVGPR, *MRI);
      DPPInst.addReg(CombOldVGPR.Reg, Def ? 0 : RegState::Undef,
                     CombOldVGPR.SubReg);
      ++NumOperands;
    } else if (TII->isVOPC(DPPOp) || (TII->isVOP3(DPPOp) && OrigOpE32 != -1 &&
                                      TII->isVOPC(OrigOpE32))) {
      // VOPC DPP and VOPC promoted to VOP3 DPP do not have an old operand
      // because they write to SGPRs not VGPRs
    } else {
      // TODO: this discards MAC/FMA instructions for now, let's add it later
      LLVM_DEBUG(dbgs() << "  failed: no old operand in DPP instruction,"
                           " TBD\n");
      Fail = true;
      break;
    }

    auto *Mod0 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0_modifiers);
    if (Mod0) {
      assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp,
                                          AMDGPU::OpName::src0_modifiers));
      assert(HasVOP3DPP ||
             (0LL == (Mod0->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG))));
      DPPInst.addImm(Mod0->getImm());
      ++NumOperands;
    } else if (AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::src0_modifiers)) {
      DPPInst.addImm(0);
      ++NumOperands;
    }
    // src0 of the combined instruction comes from the mov (the value read
    // from the other lane), not from OrigMI.
    auto *Src0 = TII->getNamedOperand(MovMI, AMDGPU::OpName::src0);
    assert(Src0);
    int Src0Idx = NumOperands;
    if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src0)) {
      LLVM_DEBUG(dbgs() << "  failed: src0 is illegal\n");
      Fail = true;
      break;
    }
    DPPInst.add(*Src0);
    // The mov may survive (multiple uses), so src0 must not be marked killed.
    DPPInst->getOperand(NumOperands).setIsKill(false);
    ++NumOperands;

    auto *Mod1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1_modifiers);
    if (Mod1) {
      assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp,
                                          AMDGPU::OpName::src1_modifiers));
      assert(HasVOP3DPP ||
             (0LL == (Mod1->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG))));
      DPPInst.addImm(Mod1->getImm());
      ++NumOperands;
    } else if (AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::src1_modifiers)) {
      DPPInst.addImm(0);
      ++NumOperands;
    }
    auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
    if (Src1) {
      int OpNum = NumOperands;
      // If subtarget does not support SGPRs for src1 operand then the
      // requirements are the same as for src0. We check src0 instead because
      // pseudos are shared between subtargets and allow SGPR for src1 on all.
      if (!ST->hasDPPSrc1SGPR()) {
        assert(getOperandSize(*DPPInst, Src0Idx, *MRI) ==
                   getOperandSize(*DPPInst, NumOperands, *MRI) &&
               "Src0 and Src1 operands should have the same size");
        OpNum = Src0Idx;
      }
      if (!TII->isOperandLegal(*DPPInst.getInstr(), OpNum, Src1)) {
        LLVM_DEBUG(dbgs() << "  failed: src1 is illegal\n");
        Fail = true;
        break;
      }
      DPPInst.add(*Src1);
      ++NumOperands;
    }

    auto *Mod2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2_modifiers);
    if (Mod2) {
      assert(NumOperands ==
             AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::src2_modifiers));
      assert(HasVOP3DPP ||
             (0LL == (Mod2->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG))));
      DPPInst.addImm(Mod2->getImm());
      ++NumOperands;
    }
    auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2);
    if (Src2) {
      if (!TII->getNamedOperand(*DPPInst.getInstr(), AMDGPU::OpName::src2) ||
          !TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src2)) {
        LLVM_DEBUG(dbgs() << "  failed: src2 is illegal\n");
        Fail = true;
        break;
      }
      DPPInst.add(*Src2);
      ++NumOperands;
    }

    if (HasVOP3DPP) {
      // VOP3 DPP carries the extra VOP3/VOP3P operands; each is copied only
      // if the DPP opcode actually encodes it.
      auto *ClampOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::clamp);
      if (ClampOpr && AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::clamp)) {
        DPPInst.addImm(ClampOpr->getImm());
      }
      auto *VdstInOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst_in);
      if (VdstInOpr &&
          AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::vdst_in)) {
        DPPInst.add(*VdstInOpr);
      }
      auto *OmodOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::omod);
      if (OmodOpr && AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::omod)) {
        DPPInst.addImm(OmodOpr->getImm());
      }
      // Validate OP_SEL has to be set to all 0 and OP_SEL_HI has to be set to
      // all 1.
      if (TII->getNamedOperand(OrigMI, AMDGPU::OpName::op_sel)) {
        int64_t OpSel = 0;
        OpSel |= (Mod0 ? (!!(Mod0->getImm() & SISrcMods::OP_SEL_0) << 0) : 0);
        OpSel |= (Mod1 ? (!!(Mod1->getImm() & SISrcMods::OP_SEL_0) << 1) : 0);
        OpSel |= (Mod2 ? (!!(Mod2->getImm() & SISrcMods::OP_SEL_0) << 2) : 0);
        if (Mod0 && TII->isVOP3(OrigMI) && !TII->isVOP3P(OrigMI))
          OpSel |= !!(Mod0->getImm() & SISrcMods::DST_OP_SEL) << 3;

        if (OpSel != 0) {
          LLVM_DEBUG(dbgs() << "  failed: op_sel must be zero\n");
          Fail = true;
          break;
        }
        if (AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::op_sel))
          DPPInst.addImm(OpSel);
      }
      if (TII->getNamedOperand(OrigMI, AMDGPU::OpName::op_sel_hi)) {
        int64_t OpSelHi = 0;
        OpSelHi |= (Mod0 ? (!!(Mod0->getImm() & SISrcMods::OP_SEL_1) << 0) : 0);
        OpSelHi |= (Mod1 ? (!!(Mod1->getImm() & SISrcMods::OP_SEL_1) << 1) : 0);
        OpSelHi |= (Mod2 ? (!!(Mod2->getImm() & SISrcMods::OP_SEL_1) << 2) : 0);

        // Only vop3p has op_sel_hi, and all vop3p have 3 operands, so check
        // the bitmask for 3 op_sel_hi bits set
        assert(Src2 && "Expected vop3p with 3 operands");
        if (OpSelHi != 7) {
          LLVM_DEBUG(dbgs() << "  failed: op_sel_hi must be all set to one\n");
          Fail = true;
          break;
        }
        if (AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::op_sel_hi))
          DPPInst.addImm(OpSelHi);
      }
      auto *NegOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::neg_lo);
      if (NegOpr && AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::neg_lo)) {
        DPPInst.addImm(NegOpr->getImm());
      }
      auto *NegHiOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::neg_hi);
      if (NegHiOpr && AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::neg_hi)) {
        DPPInst.addImm(NegHiOpr->getImm());
      }
      auto *ByteSelOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::byte_sel);
      if (ByteSelOpr &&
          AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::byte_sel)) {
        DPPInst.addImm(ByteSelOpr->getImm());
      }
    }
    // Finally append the DPP controls taken from the mov, with the combined
    // bound_ctrl decided by the caller.
    DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl));
    DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask));
    DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask));
    DPPInst.addImm(CombBCZ ? 1 : 0);
  } while (false);

  if (Fail) {
    DPPInst.getInstr()->eraseFromParent();
    return nullptr;
  }
  LLVM_DEBUG(dbgs() << "  combined:  " << *DPPInst.getInstr());
  return DPPInst.getInstr();
}
438 
439 static bool isIdentityValue(unsigned OrigMIOp, MachineOperand *OldOpnd) {
440   assert(OldOpnd->isImm());
441   switch (OrigMIOp) {
442   default: break;
443   case AMDGPU::V_ADD_U32_e32:
444   case AMDGPU::V_ADD_U32_e64:
445   case AMDGPU::V_ADD_CO_U32_e32:
446   case AMDGPU::V_ADD_CO_U32_e64:
447   case AMDGPU::V_OR_B32_e32:
448   case AMDGPU::V_OR_B32_e64:
449   case AMDGPU::V_SUBREV_U32_e32:
450   case AMDGPU::V_SUBREV_U32_e64:
451   case AMDGPU::V_SUBREV_CO_U32_e32:
452   case AMDGPU::V_SUBREV_CO_U32_e64:
453   case AMDGPU::V_MAX_U32_e32:
454   case AMDGPU::V_MAX_U32_e64:
455   case AMDGPU::V_XOR_B32_e32:
456   case AMDGPU::V_XOR_B32_e64:
457     if (OldOpnd->getImm() == 0)
458       return true;
459     break;
460   case AMDGPU::V_AND_B32_e32:
461   case AMDGPU::V_AND_B32_e64:
462   case AMDGPU::V_MIN_U32_e32:
463   case AMDGPU::V_MIN_U32_e64:
464     if (static_cast<uint32_t>(OldOpnd->getImm()) ==
465         std::numeric_limits<uint32_t>::max())
466       return true;
467     break;
468   case AMDGPU::V_MIN_I32_e32:
469   case AMDGPU::V_MIN_I32_e64:
470     if (static_cast<int32_t>(OldOpnd->getImm()) ==
471         std::numeric_limits<int32_t>::max())
472       return true;
473     break;
474   case AMDGPU::V_MAX_I32_e32:
475   case AMDGPU::V_MAX_I32_e64:
476     if (static_cast<int32_t>(OldOpnd->getImm()) ==
477         std::numeric_limits<int32_t>::min())
478       return true;
479     break;
480   case AMDGPU::V_MUL_I32_I24_e32:
481   case AMDGPU::V_MUL_I32_I24_e64:
482   case AMDGPU::V_MUL_U32_U24_e32:
483   case AMDGPU::V_MUL_U32_U24_e64:
484     if (OldOpnd->getImm() == 1)
485       return true;
486     break;
487   }
488   return false;
489 }
490 
491 MachineInstr *GCNDPPCombine::createDPPInst(
492     MachineInstr &OrigMI, MachineInstr &MovMI, RegSubRegPair CombOldVGPR,
493     MachineOperand *OldOpndValue, bool CombBCZ, bool IsShrinkable) const {
494   assert(CombOldVGPR.Reg);
495   if (!CombBCZ && OldOpndValue && OldOpndValue->isImm()) {
496     auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
497     if (!Src1 || !Src1->isReg()) {
498       LLVM_DEBUG(dbgs() << "  failed: no src1 or it isn't a register\n");
499       return nullptr;
500     }
501     if (!isIdentityValue(OrigMI.getOpcode(), OldOpndValue)) {
502       LLVM_DEBUG(dbgs() << "  failed: old immediate isn't an identity\n");
503       return nullptr;
504     }
505     CombOldVGPR = getRegSubRegPair(*Src1);
506     auto *MovDst = TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst);
507     const TargetRegisterClass *RC = MRI->getRegClass(MovDst->getReg());
508     if (!isOfRegClass(CombOldVGPR, *RC, *MRI)) {
509       LLVM_DEBUG(dbgs() << "  failed: src1 has wrong register class\n");
510       return nullptr;
511     }
512   }
513   return createDPPInst(OrigMI, MovMI, CombOldVGPR, CombBCZ, IsShrinkable);
514 }
515 
516 // returns true if MI doesn't have OpndName immediate operand or the
517 // operand has Value
518 bool GCNDPPCombine::hasNoImmOrEqual(MachineInstr &MI, AMDGPU::OpName OpndName,
519                                     int64_t Value, int64_t Mask) const {
520   auto *Imm = TII->getNamedOperand(MI, OpndName);
521   if (!Imm)
522     return true;
523 
524   assert(Imm->isImm());
525   return (Imm->getImm() & Mask) == Value;
526 }
527 
// Attempts to fold the DPP mov \p MovMI into every VALU use of its result
// (looking through REG_SEQUENCE), per the rules in the file header. Builds
// the combined instructions first; on success erases the mov and original
// uses, on any failure erases the new instructions instead (full rollback).
// Returns true iff the combine was committed.
bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
  assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp ||
         MovMI.getOpcode() == AMDGPU::V_MOV_B64_dpp ||
         MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
  LLVM_DEBUG(dbgs() << "\nDPP combine: " << MovMI);

  auto *DstOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst);
  assert(DstOpnd && DstOpnd->isReg());
  auto DPPMovReg = DstOpnd->getReg();
  if (DPPMovReg.isPhysical()) {
    LLVM_DEBUG(dbgs() << "  failed: dpp move writes physreg\n");
    return false;
  }
  // Moving the DPP read down to its uses is only valid if EXEC is unchanged
  // in between (lane activity determines what the DPP reads).
  if (execMayBeModifiedBeforeAnyUse(*MRI, DPPMovReg, MovMI)) {
    LLVM_DEBUG(dbgs() << "  failed: EXEC mask should remain the same"
                         " for all uses\n");
    return false;
  }

  if (MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO ||
      MovMI.getOpcode() == AMDGPU::V_MOV_B64_dpp) {
    auto *DppCtrl = TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl);
    assert(DppCtrl && DppCtrl->isImm());
    if (!AMDGPU::isLegalDPALU_DPPControl(DppCtrl->getImm())) {
      LLVM_DEBUG(dbgs() << "  failed: 64 bit dpp move uses unsupported"
                           " control value\n");
      // Let it split, then control may become legal.
      return false;
    }
  }

  auto *RowMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask);
  assert(RowMaskOpnd && RowMaskOpnd->isImm());
  auto *BankMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask);
  assert(BankMaskOpnd && BankMaskOpnd->isImm());
  const bool MaskAllLanes = RowMaskOpnd->getImm() == 0xF &&
                            BankMaskOpnd->getImm() == 0xF;

  auto *BCZOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bound_ctrl);
  assert(BCZOpnd && BCZOpnd->isImm());
  bool BoundCtrlZero = BCZOpnd->getImm();

  auto *OldOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::old);
  auto *SrcOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::src0);
  assert(OldOpnd && OldOpnd->isReg());
  assert(SrcOpnd && SrcOpnd->isReg());
  if (OldOpnd->getReg().isPhysical() || SrcOpnd->getReg().isPhysical()) {
    LLVM_DEBUG(dbgs() << "  failed: dpp move reads physreg\n");
    return false;
  }

  auto * const OldOpndValue = getOldOpndValue(*OldOpnd);
  // OldOpndValue is either undef (IMPLICIT_DEF) or immediate or something else
  // We could use: assert(!OldOpndValue || OldOpndValue->isImm())
  // but the third option is used to distinguish undef from non-immediate
  // to reuse IMPLICIT_DEF instruction later
  assert(!OldOpndValue || OldOpndValue->isImm() || OldOpndValue == OldOpnd);

  // Decide the combined bound_ctrl per the rules in the file header.
  bool CombBCZ = false;

  if (MaskAllLanes && BoundCtrlZero) { // [1]
    CombBCZ = true;
  } else {
    if (!OldOpndValue || !OldOpndValue->isImm()) {
      LLVM_DEBUG(dbgs() << "  failed: the DPP mov isn't combinable\n");
      return false;
    }

    if (OldOpndValue->getImm() == 0) {
      if (MaskAllLanes) {
        assert(!BoundCtrlZero); // by check [1]
        CombBCZ = true;
      }
    } else if (BoundCtrlZero) {
      assert(!MaskAllLanes); // by check [1]
      LLVM_DEBUG(dbgs() <<
        "  failed: old!=0 and bctrl:0 and not all lanes isn't combinable\n");
      return false;
    }
  }

  LLVM_DEBUG(dbgs() << "  old=";
    if (!OldOpndValue)
      dbgs() << "undef";
    else
      dbgs() << *OldOpndValue;
    dbgs() << ", bound_ctrl=" << CombBCZ << '\n');

  // OrigMIs: instructions to erase on success; DPPMIs: instructions to erase
  // on rollback. RegSeqWithOpNos remembers which REG_SEQUENCE operands
  // forwarded the mov result, so they can be fixed up after success.
  SmallVector<MachineInstr*, 4> OrigMIs, DPPMIs;
  DenseMap<MachineInstr*, SmallVector<unsigned, 4>> RegSeqWithOpNos;
  auto CombOldVGPR = getRegSubRegPair(*OldOpnd);
  // try to reuse previous old reg if its undefined (IMPLICIT_DEF)
  if (CombBCZ && OldOpndValue) { // CombOldVGPR should be undef
    const TargetRegisterClass *RC = MRI->getRegClass(DPPMovReg);
    CombOldVGPR = RegSubRegPair(
      MRI->createVirtualRegister(RC));
    auto UndefInst = BuildMI(*MovMI.getParent(), MovMI, MovMI.getDebugLoc(),
                             TII->get(AMDGPU::IMPLICIT_DEF), CombOldVGPR.Reg);
    DPPMIs.push_back(UndefInst.getInstr());
  }

  OrigMIs.push_back(&MovMI);
  bool Rollback = true;
  // Worklist of use operands; REG_SEQUENCE handling below appends the
  // forwarded uses to it.
  SmallVector<MachineOperand *, 16> Uses(
      llvm::make_pointer_range(MRI->use_nodbg_operands(DPPMovReg)));

  while (!Uses.empty()) {
    MachineOperand *Use = Uses.pop_back_val();
    Rollback = true;

    auto &OrigMI = *Use->getParent();
    LLVM_DEBUG(dbgs() << "  try: " << OrigMI);

    auto OrigOp = OrigMI.getOpcode();
    assert((TII->get(OrigOp).getSize() != 4 || !AMDGPU::isTrue16Inst(OrigOp)) &&
           "There should not be e32 True16 instructions pre-RA");
    if (OrigOp == AMDGPU::REG_SEQUENCE) {
      // Look through the REG_SEQUENCE: enqueue every use of its result that
      // reads the subregister carrying the mov's value.
      Register FwdReg = OrigMI.getOperand(0).getReg();
      unsigned FwdSubReg = 0;

      if (execMayBeModifiedBeforeAnyUse(*MRI, FwdReg, OrigMI)) {
        LLVM_DEBUG(dbgs() << "  failed: EXEC mask should remain the same"
                             " for all uses\n");
        break;
      }

      // REG_SEQUENCE operands come in (reg, subreg-index) pairs from index 1.
      unsigned OpNo, E = OrigMI.getNumOperands();
      for (OpNo = 1; OpNo < E; OpNo += 2) {
        if (OrigMI.getOperand(OpNo).getReg() == DPPMovReg) {
          FwdSubReg = OrigMI.getOperand(OpNo + 1).getImm();
          break;
        }
      }

      if (!FwdSubReg)
        break;

      for (auto &Op : MRI->use_nodbg_operands(FwdReg)) {
        if (Op.getSubReg() == FwdSubReg)
          Uses.push_back(&Op);
      }
      RegSeqWithOpNos[&OrigMI].push_back(OpNo);
      continue;
    }

    bool IsShrinkable = isShrinkable(OrigMI);
    if (!(IsShrinkable ||
          ((TII->isVOP3P(OrigOp) || TII->isVOPC(OrigOp) ||
            TII->isVOP3(OrigOp)) &&
           ST->hasVOP3DPP()) ||
          TII->isVOP1(OrigOp) || TII->isVOP2(OrigOp))) {
      LLVM_DEBUG(dbgs() << "  failed: not VOP1/2/3/3P/C\n");
      break;
    }
    if (OrigMI.modifiesRegister(AMDGPU::EXEC, ST->getRegisterInfo())) {
      LLVM_DEBUG(dbgs() << "  failed: can't combine v_cmpx\n");
      break;
    }

    // The DPP value can only take the src0 slot; a use on src1 is handled by
    // commuting the instruction first (below).
    auto *Src0 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0);
    auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
    if (Use != Src0 && !(Use == Src1 && OrigMI.isCommutable())) { // [1]
      LLVM_DEBUG(dbgs() << "  failed: no suitable operands\n");
      break;
    }

    auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2);
    assert(Src0 && "Src1 without Src0?");
    if ((Use == Src0 && ((Src1 && Src1->isIdenticalTo(*Src0)) ||
                         (Src2 && Src2->isIdenticalTo(*Src0)))) ||
        (Use == Src1 && (Src1->isIdenticalTo(*Src0) ||
                         (Src2 && Src2->isIdenticalTo(*Src1))))) {
      LLVM_DEBUG(
          dbgs()
          << "  " << OrigMI
          << "  failed: DPP register is used more than once per instruction\n");
      break;
    }

    LLVM_DEBUG(dbgs() << "  combining: " << OrigMI);
    if (Use == Src0) {
      if (auto *DPPInst = createDPPInst(OrigMI, MovMI, CombOldVGPR,
                                        OldOpndValue, CombBCZ, IsShrinkable)) {
        DPPMIs.push_back(DPPInst);
        Rollback = false;
      }
    } else {
      assert(Use == Src1 && OrigMI.isCommutable()); // by check [1]
      // Commute on a clone so the original stays intact for rollback.
      auto *BB = OrigMI.getParent();
      auto *NewMI = BB->getParent()->CloneMachineInstr(&OrigMI);
      BB->insert(OrigMI, NewMI);
      if (TII->commuteInstruction(*NewMI)) {
        LLVM_DEBUG(dbgs() << "  commuted:  " << *NewMI);
        if (auto *DPPInst =
                createDPPInst(*NewMI, MovMI, CombOldVGPR, OldOpndValue, CombBCZ,
                              IsShrinkable)) {
          DPPMIs.push_back(DPPInst);
          Rollback = false;
        }
      } else
        LLVM_DEBUG(dbgs() << "  failed: cannot be commuted\n");
      NewMI->eraseFromParent();
    }
    if (Rollback)
      break;
    OrigMIs.push_back(&OrigMI);
  }

  // A non-empty worklist means we bailed out early on some use.
  Rollback |= !Uses.empty();

  for (auto *MI : *(Rollback? &DPPMIs : &OrigMIs))
    MI->eraseFromParent();

  if (!Rollback) {
    // Clean up REG_SEQUENCEs: drop them when dead, otherwise mark the
    // operands that carried the now-erased mov result as undef.
    for (auto &S : RegSeqWithOpNos) {
      if (MRI->use_nodbg_empty(S.first->getOperand(0).getReg())) {
        S.first->eraseFromParent();
        continue;
      }
      while (!S.second.empty())
        S.first->getOperand(S.second.pop_back_val()).setIsUndef();
    }
  }

  return !Rollback;
}
754 
755 bool GCNDPPCombineLegacy::runOnMachineFunction(MachineFunction &MF) {
756   if (skipFunction(MF.getFunction()))
757     return false;
758 
759   return GCNDPPCombine().run(MF);
760 }
761 
762 bool GCNDPPCombine::run(MachineFunction &MF) {
763   ST = &MF.getSubtarget<GCNSubtarget>();
764   if (!ST->hasDPP())
765     return false;
766 
767   MRI = &MF.getRegInfo();
768   TII = ST->getInstrInfo();
769 
770   bool Changed = false;
771   for (auto &MBB : MF) {
772     for (MachineInstr &MI : llvm::make_early_inc_range(llvm::reverse(MBB))) {
773       if (MI.getOpcode() == AMDGPU::V_MOV_B32_dpp && combineDPPMov(MI)) {
774         Changed = true;
775         ++NumDPPMovsCombined;
776       } else if (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO ||
777                  MI.getOpcode() == AMDGPU::V_MOV_B64_dpp) {
778         if (ST->hasDPALU_DPP() && combineDPPMov(MI)) {
779           Changed = true;
780           ++NumDPPMovsCombined;
781         } else {
782           auto Split = TII->expandMovDPP64(MI);
783           for (auto *M : {Split.first, Split.second}) {
784             if (M && combineDPPMov(*M))
785               ++NumDPPMovsCombined;
786           }
787           Changed = true;
788         }
789       }
790     }
791   }
792   return Changed;
793 }
794 
795 PreservedAnalyses GCNDPPCombinePass::run(MachineFunction &MF,
796                                          MachineFunctionAnalysisManager &) {
797   MFPropsModifier _(*this, MF);
798 
799   if (MF.getFunction().hasOptNone())
800     return PreservedAnalyses::all();
801 
802   bool Changed = GCNDPPCombine().run(MF);
803   if (!Changed)
804     return PreservedAnalyses::all();
805 
806   auto PA = getMachineFunctionPassPreservedAnalyses();
807   PA.preserveSet<CFGAnalyses>();
808   return PA;
809 }
810