//===- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass tries to apply several peephole SDWA patterns.
///
/// E.g. original:
///   V_LSHRREV_B32_e32 %0, 16, %1
///   V_ADD_CO_U32_e32 %2, %0, %3
///   V_LSHLREV_B32_e32 %4, 16, %2
///
/// Replaced with:
///   V_ADD_CO_U32_sdwa %4, %1, %3
///       dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"

using namespace llvm;

#define DEBUG_TYPE "si-peephole-sdwa"
STATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found.");
STATISTIC(NumSDWAInstructionsPeepholed,
          "Number of instructions converted to SDWA.");

namespace {

class SDWAOperand;
class SDWADstOperand;

class SIPeepholeSDWA : public MachineFunctionPass {
public:
  using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>;

private:
  MachineRegisterInfo *MRI;
  const SIRegisterInfo *TRI;
  const SIInstrInfo *TII;

  MapVector<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;
  MapVector<MachineInstr *, SDWAOperandsVector> PotentialMatches;
  SmallVector<MachineInstr *, 8> ConvertedInstructions;

  Optional<int64_t> foldToImm(const MachineOperand &Op) const;

public:
  static char ID;

  SIPeepholeSDWA() : MachineFunctionPass(ID) {
    initializeSIPeepholeSDWAPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
  void matchSDWAOperands(MachineBasicBlock &MBB);
  std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI);
  bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST) const;
  void pseudoOpConvertToVOP2(MachineInstr &MI,
                             const GCNSubtarget &ST) const;
  bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
  void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const;

  StringRef getPassName() const override { return "SI Peephole SDWA"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

class SDWAOperand {
private:
  MachineOperand *Target;   // Operand to be used in the converted instruction
  MachineOperand *Replaced; // Operand that will be replaced by Target

public:
  SDWAOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp)
      : Target(TargetOp), Replaced(ReplacedOp) {
    assert(Target->isReg());
    assert(Replaced->isReg());
  }

  virtual ~SDWAOperand() = default;

  virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) = 0;
  virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0;

  MachineOperand *getTargetOperand() const { return Target; }
  MachineOperand *getReplacedOperand() const { return Replaced; }
  MachineInstr *getParentInst() const { return Target->getParent(); }

  MachineRegisterInfo *getMRI() const {
    return &getParentInst()->getParent()->getParent()->getRegInfo();
  }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  virtual void print(raw_ostream& OS) const = 0;
  void dump() const { print(dbgs()); }
#endif
};
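
// Illustration of the Target/Replaced roles, using the example from the file
// header (register numbers are illustrative):
//   V_LSHRREV_B32_e32 %0, 16, %1   <- parent instruction of this operand
//   V_ADD_CO_U32_e32 %2, %0, %3    <- potential instruction to convert
// The SDWASrcOperand built for the shift has Target = %1 and Replaced = %0;
// converting the add replaces its use of %0 with %1 plus src_sel:WORD_1.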

using namespace AMDGPU::SDWA;

class SDWASrcOperand : public SDWAOperand {
private:
  SdwaSel SrcSel;
  bool Abs;
  bool Neg;
  bool Sext;

public:
  SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel SrcSel_ = DWORD, bool Abs_ = false, bool Neg_ = false,
                 bool Sext_ = false)
      : SDWAOperand(TargetOp, ReplacedOp),
        SrcSel(SrcSel_), Abs(Abs_), Neg(Neg_), Sext(Sext_) {}

  MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;

  SdwaSel getSrcSel() const { return SrcSel; }
  bool getAbs() const { return Abs; }
  bool getNeg() const { return Neg; }
  bool getSext() const { return Sext; }

  uint64_t getSrcMods(const SIInstrInfo *TII,
                      const MachineOperand *SrcOp) const;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};

class SDWADstOperand : public SDWAOperand {
private:
  SdwaSel DstSel;
  DstUnused DstUn;

public:

  SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD)
    : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}

  MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;

  SdwaSel getDstSel() const { return DstSel; }
  DstUnused getDstUnused() const { return DstUn; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};

class SDWADstPreserveOperand : public SDWADstOperand {
private:
  MachineOperand *Preserve;

public:
  SDWADstPreserveOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                         MachineOperand *PreserveOp, SdwaSel DstSel_ = DWORD)
      : SDWADstOperand(TargetOp, ReplacedOp, DstSel_, UNUSED_PRESERVE),
        Preserve(PreserveOp) {}

  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;

  MachineOperand *getPreservedOperand() const { return Preserve; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};

} // end anonymous namespace

INITIALIZE_PASS(SIPeepholeSDWA, DEBUG_TYPE, "SI Peephole SDWA", false, false)

char SIPeepholeSDWA::ID = 0;

char &llvm::SIPeepholeSDWAID = SIPeepholeSDWA::ID;

FunctionPass *llvm::createSIPeepholeSDWAPass() {
  return new SIPeepholeSDWA();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
static raw_ostream& operator<<(raw_ostream &OS, SdwaSel Sel) {
  switch(Sel) {
  case BYTE_0: OS << "BYTE_0"; break;
  case BYTE_1: OS << "BYTE_1"; break;
  case BYTE_2: OS << "BYTE_2"; break;
  case BYTE_3: OS << "BYTE_3"; break;
  case WORD_0: OS << "WORD_0"; break;
  case WORD_1: OS << "WORD_1"; break;
  case DWORD:  OS << "DWORD"; break;
  }
  return OS;
}

static raw_ostream& operator<<(raw_ostream &OS, const DstUnused &Un) {
  switch(Un) {
  case UNUSED_PAD: OS << "UNUSED_PAD"; break;
  case UNUSED_SEXT: OS << "UNUSED_SEXT"; break;
  case UNUSED_PRESERVE: OS << "UNUSED_PRESERVE"; break;
  }
  return OS;
}

LLVM_DUMP_METHOD
void SDWASrcOperand::print(raw_ostream& OS) const {
  OS << "SDWA src: " << *getTargetOperand()
    << " src_sel:" << getSrcSel()
    << " abs:" << getAbs() << " neg:" << getNeg()
    << " sext:" << getSext() << '\n';
}

LLVM_DUMP_METHOD
void SDWADstOperand::print(raw_ostream& OS) const {
  OS << "SDWA dst: " << *getTargetOperand()
    << " dst_sel:" << getDstSel()
    << " dst_unused:" << getDstUnused() << '\n';
}

LLVM_DUMP_METHOD
void SDWADstPreserveOperand::print(raw_ostream& OS) const {
  OS << "SDWA preserve dst: " << *getTargetOperand()
    << " dst_sel:" << getDstSel()
    << " preserve:" << *getPreservedOperand() << '\n';
}

#endif

static void copyRegOperand(MachineOperand &To, const MachineOperand &From) {
  assert(To.isReg() && From.isReg());
  To.setReg(From.getReg());
  To.setSubReg(From.getSubReg());
  To.setIsUndef(From.isUndef());
  if (To.isUse()) {
    To.setIsKill(From.isKill());
  } else {
    To.setIsDead(From.isDead());
  }
}

static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) {
  return LHS.isReg() &&
         RHS.isReg() &&
         LHS.getReg() == RHS.getReg() &&
         LHS.getSubReg() == RHS.getSubReg();
}

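// If all uses of the register defined by Reg occur in a single instruction
// and none of them accesses it through a different subregister, return one of
// those use operands; otherwise (including when there are no uses) return
// nullptr.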
static MachineOperand *findSingleRegUse(const MachineOperand *Reg,
                                        const MachineRegisterInfo *MRI) {
  if (!Reg->isReg() || !Reg->isDef())
    return nullptr;

  MachineOperand *ResMO = nullptr;
  for (MachineOperand &UseMO : MRI->use_nodbg_operands(Reg->getReg())) {
    // If there exists a use of a subreg of Reg then return nullptr
    if (!isSameReg(UseMO, *Reg))
      return nullptr;

    // Check that only one instruction uses Reg
    if (!ResMO) {
      ResMO = &UseMO;
    } else if (ResMO->getParent() != UseMO.getParent()) {
      return nullptr;
    }
  }

  return ResMO;
}

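// Return the explicit def operand of Reg inside its unique defining
// instruction, or nullptr if there is no unique def or if Reg is only
// defined implicitly.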
static MachineOperand *findSingleRegDef(const MachineOperand *Reg,
                                        const MachineRegisterInfo *MRI) {
  if (!Reg->isReg())
    return nullptr;

  MachineInstr *DefInstr = MRI->getUniqueVRegDef(Reg->getReg());
  if (!DefInstr)
    return nullptr;

  for (auto &DefMO : DefInstr->defs()) {
    if (DefMO.isReg() && DefMO.getReg() == Reg->getReg())
      return &DefMO;
  }

  // Ignore implicit defs.
  return nullptr;
}

uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII,
                                    const MachineOperand *SrcOp) const {
  uint64_t Mods = 0;
  const auto *MI = SrcOp->getParent();
  if (TII->getNamedOperand(*MI, AMDGPU::OpName::src0) == SrcOp) {
    if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) {
      Mods = Mod->getImm();
    }
  } else if (TII->getNamedOperand(*MI, AMDGPU::OpName::src1) == SrcOp) {
    if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers)) {
      Mods = Mod->getImm();
    }
  }
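  // Note that NEG below is XORed rather than ORed in, so a negation requested
  // by this operand toggles, and can therefore cancel, a NEG bit already
  // present in the instruction's src modifiers.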
  if (Abs || Neg) {
    assert(!Sext &&
           "Float and integer src modifiers can't be set simultaneously");
    Mods |= Abs ? SISrcMods::ABS : 0u;
    Mods ^= Neg ? SISrcMods::NEG : 0u;
  } else if (Sext) {
    Mods |= SISrcMods::SEXT;
  }

  return Mods;
}

MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII) {
  // For an SDWA src operand, the potential instruction is the one that uses
  // the register defined by the parent instruction.
  MachineOperand *PotentialMO = findSingleRegUse(getReplacedOperand(), getMRI());
  if (!PotentialMO)
    return nullptr;

  return PotentialMO->getParent();
}

bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  // Find the operand in the instruction that matches the replaced operand and
  // rewrite it to the target operand. Set the corresponding src_sel.
  bool IsPreserveSrc = false;
  MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
  MachineOperand *SrcMods =
      TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
  assert(Src && (Src->isReg() || Src->isImm()));
  if (!isSameReg(*Src, *getReplacedOperand())) {
    // If this is not src0 then it could be src1
    Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
    SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);

    if (!Src ||
        !isSameReg(*Src, *getReplacedOperand())) {
      // It's possible this Src is a tied operand for UNUSED_PRESERVE, in
      // which case we can either abandon the peephole attempt, or, if legal,
      // copy the target operand into the tied slot when the preserve
      // operation would produce the same result by overwriting the rest of
      // the dst anyway.
      MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
      MachineOperand *DstUnused =
        TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);

      if (Dst &&
          DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
        // This will work if the tied src is accessing WORD_0, and the dst is
        // writing WORD_1. Modifiers don't matter because all the bits that
        // would be impacted are being overwritten by the dst.
        // Any other case will not work.
        SdwaSel DstSel = static_cast<SdwaSel>(
            TII->getNamedImmOperand(MI, AMDGPU::OpName::dst_sel));
        if (DstSel == AMDGPU::SDWA::SdwaSel::WORD_1 &&
            getSrcSel() == AMDGPU::SDWA::SdwaSel::WORD_0) {
          IsPreserveSrc = true;
          auto DstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                                   AMDGPU::OpName::vdst);
          auto TiedIdx = MI.findTiedOperandIdx(DstIdx);
          Src = &MI.getOperand(TiedIdx);
          SrcSel = nullptr;
          SrcMods = nullptr;
        } else {
          // Not legal to convert this src
          return false;
        }
      }
    }
    assert(Src && Src->isReg());

    if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
         MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
         MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
         MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
         !isSameReg(*Src, *getReplacedOperand())) {
      // In case of v_mac_f16/32_sdwa this pass can try to apply the src
      // operand to src2. This is not allowed.
      return false;
    }

    assert(isSameReg(*Src, *getReplacedOperand()) &&
           (IsPreserveSrc || (SrcSel && SrcMods)));
  }
  copyRegOperand(*Src, *getTargetOperand());
  if (!IsPreserveSrc) {
    SrcSel->setImm(getSrcSel());
    SrcMods->setImm(getSrcMods(TII, Src));
  }
  getTargetOperand()->setIsKill(false);
  return true;
}

MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII) {
  // For an SDWA dst operand, the potential instruction is the one that
  // defines the register that this operand uses.
  MachineRegisterInfo *MRI = getMRI();
  MachineInstr *ParentMI = getParentInst();

  MachineOperand *PotentialMO = findSingleRegDef(getReplacedOperand(), MRI);
  if (!PotentialMO)
    return nullptr;

  // Check that ParentMI is the only instruction that uses the replaced
  // register.
  for (MachineInstr &UseInst :
       MRI->use_nodbg_instructions(PotentialMO->getReg())) {
    if (&UseInst != ParentMI)
      return nullptr;
  }

  return PotentialMO->getParent();
}

bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  // Replace the vdst operand in MI with the target operand. Set dst_sel and
  // dst_unused.

  if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
      getDstSel() != AMDGPU::SDWA::DWORD) {
    // v_mac_f16/32_sdwa allows dst_sel to be only DWORD
    return false;
  }

  MachineOperand *Operand = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  assert(Operand &&
         Operand->isReg() &&
         isSameReg(*Operand, *getReplacedOperand()));
  copyRegOperand(*Operand, *getTargetOperand());
  MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
  assert(DstSel);
  DstSel->setImm(getDstSel());
  MachineOperand *DstUnused =
      TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
  assert(DstUnused);
  DstUnused->setImm(getDstUnused());

  // Remove the original instruction because it would conflict with our new
  // instruction by register definition.
  getParentInst()->eraseFromParent();
  return true;
}

bool SDWADstPreserveOperand::convertToSDWA(MachineInstr &MI,
                                           const SIInstrInfo *TII) {
  // MI should be moved right before v_or_b32.
  // For this we should clear all kill flags on uses of MI src-operands,
  // otherwise we can run into a use of a killed operand.
  for (MachineOperand &MO : MI.uses()) {
    if (!MO.isReg())
      continue;
    getMRI()->clearKillFlags(MO.getReg());
  }

  // Move MI before v_or_b32
  auto MBB = MI.getParent();
  MBB->remove(&MI);
  MBB->insert(getParentInst(), &MI);

  // Add implicit use of the preserved register
  MachineInstrBuilder MIB(*MBB->getParent(), MI);
  MIB.addReg(getPreservedOperand()->getReg(),
             RegState::ImplicitKill,
             getPreservedOperand()->getSubReg());

  // Tie dst to the implicit use
  MI.tieOperands(AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                            AMDGPU::OpName::vdst),
                 MI.getNumOperands() - 1);

  // Convert MI like any other SDWADstOperand and remove v_or_b32
  return SDWADstOperand::convertToSDWA(MI, TII);
}

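// Try to reduce Op to a compile-time constant: either an immediate operand or
// a register holding the result of a foldable copy of an immediate (e.g.
// %1 = S_MOV_B32 255).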
Optional<int64_t> SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const {
  if (Op.isImm()) {
    return Op.getImm();
  }

  // If this is not an immediate then it can be a copy of an immediate value,
  // e.g.:
  // %1 = S_MOV_B32 255
  if (Op.isReg()) {
    for (const MachineOperand &Def : MRI->def_operands(Op.getReg())) {
      if (!isSameReg(Op, Def))
        continue;

      const MachineInstr *DefInst = Def.getParent();
      if (!TII->isFoldableCopy(*DefInst))
        return None;

      const MachineOperand &Copied = DefInst->getOperand(1);
      if (!Copied.isImm())
        return None;

      return Copied.getImm();
    }
  }

  return None;
}

std::unique_ptr<SDWAOperand>
SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  switch (Opcode) {
  case AMDGPU::V_LSHRREV_B32_e32:
  case AMDGPU::V_ASHRREV_I32_e32:
  case AMDGPU::V_LSHLREV_B32_e32:
  case AMDGPU::V_LSHRREV_B32_e64:
  case AMDGPU::V_ASHRREV_I32_e64:
  case AMDGPU::V_LSHLREV_B32_e64: {
    // from: v_lshrrev_b32_e32 v1, 16/24, v0
    // to SDWA src:v0 src_sel:WORD_1/BYTE_3

    // from: v_ashrrev_i32_e32 v1, 16/24, v0
    // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1

    // from: v_lshlrev_b32_e32 v1, 16/24, v0
    // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD
    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    auto Imm = foldToImm(*Src0);
    if (!Imm)
      break;

    if (*Imm != 16 && *Imm != 24)
      break;

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
    if (Src1->getReg().isPhysical() || Dst->getReg().isPhysical())
      break;

    if (Opcode == AMDGPU::V_LSHLREV_B32_e32 ||
        Opcode == AMDGPU::V_LSHLREV_B32_e64) {
      return std::make_unique<SDWADstOperand>(
          Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD);
    } else {
      return std::make_unique<SDWASrcOperand>(
          Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false,
          Opcode != AMDGPU::V_LSHRREV_B32_e32 &&
          Opcode != AMDGPU::V_LSHRREV_B32_e64);
    }
    break;
  }

  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_LSHLREV_B16_e64: {
    // from: v_lshrrev_b16_e32 v1, 8, v0
    // to SDWA src:v0 src_sel:BYTE_1

    // from: v_ashrrev_i16_e32 v1, 8, v0
    // to SDWA src:v0 src_sel:BYTE_1 sext:1

    // from: v_lshlrev_b16_e32 v1, 8, v0
    // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD
    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    auto Imm = foldToImm(*Src0);
    if (!Imm || *Imm != 8)
      break;

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (Src1->getReg().isPhysical() || Dst->getReg().isPhysical())
      break;

    if (Opcode == AMDGPU::V_LSHLREV_B16_e32 ||
        Opcode == AMDGPU::V_LSHLREV_B16_e64) {
      return std::make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
    } else {
      return std::make_unique<SDWASrcOperand>(
            Src1, Dst, BYTE_1, false, false,
            Opcode != AMDGPU::V_LSHRREV_B16_e32 &&
            Opcode != AMDGPU::V_LSHRREV_B16_e64);
    }
    break;
  }

  case AMDGPU::V_BFE_I32_e64:
  case AMDGPU::V_BFE_U32_e64: {
    // e.g.:
    // from: v_bfe_u32 v1, v0, 8, 8
    // to SDWA src:v0 src_sel:BYTE_1

    // offset | width | src_sel
    // ------------------------
    // 0      | 8     | BYTE_0
    // 0      | 16    | WORD_0
    // 0      | 32    | DWORD ?
    // 8      | 8     | BYTE_1
    // 16     | 8     | BYTE_2
    // 16     | 16    | WORD_1
    // 24     | 8     | BYTE_3

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    auto Offset = foldToImm(*Src1);
    if (!Offset)
      break;

    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    auto Width = foldToImm(*Src2);
    if (!Width)
      break;

    SdwaSel SrcSel = DWORD;

    if (*Offset == 0 && *Width == 8)
      SrcSel = BYTE_0;
    else if (*Offset == 0 && *Width == 16)
      SrcSel = WORD_0;
    else if (*Offset == 0 && *Width == 32)
      SrcSel = DWORD;
    else if (*Offset == 8 && *Width == 8)
      SrcSel = BYTE_1;
    else if (*Offset == 16 && *Width == 8)
      SrcSel = BYTE_2;
    else if (*Offset == 16 && *Width == 16)
      SrcSel = WORD_1;
    else if (*Offset == 24 && *Width == 8)
      SrcSel = BYTE_3;
    else
      break;

    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (Src0->getReg().isPhysical() || Dst->getReg().isPhysical())
      break;

    return std::make_unique<SDWASrcOperand>(
          Src0, Dst, SrcSel, false, false, Opcode != AMDGPU::V_BFE_U32_e64);
  }

  case AMDGPU::V_AND_B32_e32:
  case AMDGPU::V_AND_B32_e64: {
    // e.g.:
    // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0
    // to SDWA src:v0 src_sel:WORD_0/BYTE_0

    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    auto ValSrc = Src1;
    auto Imm = foldToImm(*Src0);

    if (!Imm) {
      Imm = foldToImm(*Src1);
      ValSrc = Src0;
    }

    if (!Imm || (*Imm != 0x0000ffff && *Imm != 0x000000ff))
      break;

    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (ValSrc->getReg().isPhysical() || Dst->getReg().isPhysical())
      break;

    return std::make_unique<SDWASrcOperand>(
        ValSrc, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0);
  }

  case AMDGPU::V_OR_B32_e32:
  case AMDGPU::V_OR_B32_e64: {
    // Patterns for dst_unused:UNUSED_PRESERVE.
    // e.g., from:
    // v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD
    //                           src0_sel:WORD_1 src1_sel:WORD_1
    // v_add_f16_e32 v3, v1, v2
    // v_or_b32_e32 v4, v0, v3
    // to SDWA preserve dst:v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE preserve:v3

    // Check if one of the operands of v_or_b32 is an SDWA instruction
    using CheckRetType = Optional<std::pair<MachineOperand *, MachineOperand *>>;
    auto CheckOROperandsForSDWA =
      [&](const MachineOperand *Op1, const MachineOperand *Op2) -> CheckRetType {
        if (!Op1 || !Op1->isReg() || !Op2 || !Op2->isReg())
          return CheckRetType(None);

        MachineOperand *Op1Def = findSingleRegDef(Op1, MRI);
        if (!Op1Def)
          return CheckRetType(None);

        MachineInstr *Op1Inst = Op1Def->getParent();
        if (!TII->isSDWA(*Op1Inst))
          return CheckRetType(None);

        MachineOperand *Op2Def = findSingleRegDef(Op2, MRI);
        if (!Op2Def)
          return CheckRetType(None);

        return CheckRetType(std::make_pair(Op1Def, Op2Def));
      };

    MachineOperand *OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    assert(OrSDWA && OrOther);
    auto Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
    if (!Res) {
      OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
      OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
      assert(OrSDWA && OrOther);
      Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
      if (!Res)
        break;
    }

    MachineOperand *OrSDWADef = Res->first;
    MachineOperand *OrOtherDef = Res->second;
    assert(OrSDWADef && OrOtherDef);

    MachineInstr *SDWAInst = OrSDWADef->getParent();
    MachineInstr *OtherInst = OrOtherDef->getParent();

    // Check that OtherInst is actually bitwise compatible with SDWAInst,
    // i.e. their destination bit patterns don't overlap. A compatible
    // instruction can be either a regular instruction with compatible
    // bitness or an SDWA instruction with a correct dst_sel:
    // SDWAInst | OtherInst bitness / OtherInst dst_sel
    // -----------------------------------------------------
    // DWORD    | no                    / no
    // WORD_0   | no                    / BYTE_2/3, WORD_1
    // WORD_1   | 8/16-bit instructions / BYTE_0/1, WORD_0
    // BYTE_0   | no                    / BYTE_1/2/3, WORD_1
    // BYTE_1   | 8-bit                 / BYTE_0/2/3, WORD_1
    // BYTE_2   | 8/16-bit              / BYTE_0/1/3, WORD_0
    // BYTE_3   | 8/16/24-bit           / BYTE_0/1/2, WORD_0
    // E.g. if SDWAInst is v_add_f16_sdwa dst_sel:WORD_1 then v_add_f16 is OK
    // but v_add_f32 is not.

    // TODO: add support for non-SDWA instructions as OtherInst.
    // For now this only works with SDWA instructions. For regular
    // instructions there is no way to determine if the instruction writes
    // only 8/16/24 bits out of the full register size, and all registers are
    // at least 32 bits wide.
    if (!TII->isSDWA(*OtherInst))
      break;

    SdwaSel DstSel = static_cast<SdwaSel>(
      TII->getNamedImmOperand(*SDWAInst, AMDGPU::OpName::dst_sel));
    SdwaSel OtherDstSel = static_cast<SdwaSel>(
      TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_sel));

    bool DstSelAgree = false;
    switch (DstSel) {
    case WORD_0: DstSelAgree = ((OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case WORD_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == WORD_0));
      break;
    case BYTE_0: DstSelAgree = ((OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case BYTE_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case BYTE_2: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_0));
      break;
    case BYTE_3: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == WORD_0));
      break;
    default: DstSelAgree = false;
    }

    if (!DstSelAgree)
      break;

    // Also OtherInst dst_unused should be UNUSED_PAD
    DstUnused OtherDstUnused = static_cast<DstUnused>(
      TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_unused));
    if (OtherDstUnused != DstUnused::UNUSED_PAD)
      break;

    // Create DstPreserveOperand
    MachineOperand *OrDst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
    assert(OrDst && OrDst->isReg());

    return std::make_unique<SDWADstPreserveOperand>(
      OrDst, OrSDWADef, OrOtherDef, DstSel);

  }
  }

  return std::unique_ptr<SDWAOperand>(nullptr);
}

#if !defined(NDEBUG)
static raw_ostream& operator<<(raw_ostream &OS, const SDWAOperand &Operand) {
  Operand.print(OS);
  return OS;
}
#endif

void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) {
  for (MachineInstr &MI : MBB) {
    if (auto Operand = matchSDWAOperand(MI)) {
      LLVM_DEBUG(dbgs() << "Match: " << MI << "To: " << *Operand << '\n');
      SDWAOperands[&MI] = std::move(Operand);
      ++NumSDWAPatternsFound;
    }
  }
}

// Convert the V_ADDC_U32_e64 into V_ADDC_U32_e32, and
// V_ADD_CO_U32_e64 into V_ADD_CO_U32_e32. This allows isConvertibleToSDWA
// to perform its transformation on V_ADD_CO_U32_e32 into V_ADD_CO_U32_sdwa.
//
// We are transforming from a VOP3 into a VOP2 form of the instruction.
//   %19:vgpr_32 = V_AND_B32_e32 255,
//       killed %16:vgpr_32, implicit $exec
//   %47:vgpr_32, %49:sreg_64_xexec = V_ADD_CO_U32_e64
//       %26.sub0:vreg_64, %19:vgpr_32, implicit $exec
//   %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64
//       %26.sub1:vreg_64, %54:vgpr_32, killed %49:sreg_64_xexec, implicit $exec
//
// becomes
//   %47:vgpr_32 = V_ADD_CO_U32_sdwa
//       0, %26.sub0:vreg_64, 0, killed %16:vgpr_32, 0, 6, 0, 6, 0,
//       implicit-def $vcc, implicit $exec
//   %48:vgpr_32 = V_ADDC_U32_e32
//       0, %26.sub1:vreg_64, implicit-def $vcc, implicit $vcc, implicit $exec
void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
                                           const GCNSubtarget &ST) const {
  int Opc = MI.getOpcode();
  assert((Opc == AMDGPU::V_ADD_CO_U32_e64 || Opc == AMDGPU::V_SUB_CO_U32_e64) &&
         "Currently only handles V_ADD_CO_U32_e64 or V_SUB_CO_U32_e64");

  // Can the candidate MI be shrunk?
  if (!TII->canShrink(MI, *MRI))
    return;
  Opc = AMDGPU::getVOPe32(Opc);
  // Find the related ADD instruction.
  const MachineOperand *Sdst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
  if (!Sdst)
    return;
  MachineOperand *NextOp = findSingleRegUse(Sdst, MRI);
  if (!NextOp)
    return;
  MachineInstr &MISucc = *NextOp->getParent();
  // Can the successor be shrunk?
  if (!TII->canShrink(MISucc, *MRI))
    return;
  int SuccOpc = AMDGPU::getVOPe32(MISucc.getOpcode());
  // Make sure the carry in/out are subsequently unused.
  MachineOperand *CarryIn = TII->getNamedOperand(MISucc, AMDGPU::OpName::src2);
  if (!CarryIn)
    return;
  MachineOperand *CarryOut = TII->getNamedOperand(MISucc, AMDGPU::OpName::sdst);
  if (!CarryOut)
    return;
  if (!MRI->hasOneUse(CarryIn->getReg()) || !MRI->use_empty(CarryOut->getReg()))
    return;
  // Make sure VCC or its subregs are dead before MI.
  MachineBasicBlock &MBB = *MI.getParent();
  auto Liveness = MBB.computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 25);
  if (Liveness != MachineBasicBlock::LQR_Dead)
    return;
  // Check if VCC is referenced in range of (MI,MISucc].
  for (auto I = std::next(MI.getIterator()), E = MISucc.getIterator();
       I != E; ++I) {
    if (I->modifiesRegister(AMDGPU::VCC, TRI))
      return;
  }

  // Make the two new e32 instruction variants.
  // Replace MI with V_{SUB|ADD}_I32_e32
  BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(Opc))
    .add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst))
    .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0))
    .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1))
    .setMIFlags(MI.getFlags());

  MI.eraseFromParent();

  // Replace MISucc with V_{SUBB|ADDC}_U32_e32
  BuildMI(MBB, MISucc, MISucc.getDebugLoc(), TII->get(SuccOpc))
    .add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::vdst))
    .add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src0))
    .add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src1))
    .setMIFlags(MISucc.getFlags());

  MISucc.eraseFromParent();
}

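// Return true if MI can be rewritten in SDWA form: it either is SDWA already,
// or its opcode (possibly after shrinking to VOP2) has an SDWA counterpart
// that the subtarget supports, and its operands satisfy the constraints
// checked below.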
bool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI,
                                         const GCNSubtarget &ST) const {
  // Check if this is already an SDWA instruction
  unsigned Opc = MI.getOpcode();
  if (TII->isSDWA(Opc))
    return true;

  // Check if this instruction has an opcode that supports SDWA
  if (AMDGPU::getSDWAOp(Opc) == -1)
    Opc = AMDGPU::getVOPe32(Opc);

  if (AMDGPU::getSDWAOp(Opc) == -1)
    return false;

  if (!ST.hasSDWAOmod() && TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
    return false;

  if (TII->isVOPC(Opc)) {
    if (!ST.hasSDWASdst()) {
      const MachineOperand *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
      if (SDst && (SDst->getReg() != AMDGPU::VCC &&
                   SDst->getReg() != AMDGPU::VCC_LO))
        return false;
    }

    if (!ST.hasSDWAOutModsVOPC() &&
        (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
         TII->hasModifiersSet(MI, AMDGPU::OpName::omod)))
      return false;

  } else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst) ||
             !TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
    return false;
  }

  if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_FMAC_F16_e32 ||
                           Opc == AMDGPU::V_FMAC_F32_e32 ||
                           Opc == AMDGPU::V_MAC_F16_e32 ||
                           Opc == AMDGPU::V_MAC_F32_e32))
    return false;

  // Check if the target supports this SDWA opcode
  if (TII->pseudoToMCOpcode(Opc) == -1)
    return false;

  // FIXME: has SDWA but requires handling of implicit VCC use
  if (Opc == AMDGPU::V_CNDMASK_B32_e32)
    return false;

  if (MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0)) {
    if (!Src0->isReg() && !Src0->isImm())
      return false;
  }

  if (MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1)) {
    if (!Src1->isReg() && !Src1->isImm())
      return false;
  }

  return true;
}

bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
                                   const SDWAOperandsVector &SDWAOperands) {

  LLVM_DEBUG(dbgs() << "Convert instruction:" << MI);

  // Convert to sdwa
  int SDWAOpcode;
  unsigned Opcode = MI.getOpcode();
  if (TII->isSDWA(Opcode)) {
    SDWAOpcode = Opcode;
  } else {
    SDWAOpcode = AMDGPU::getSDWAOp(Opcode);
    if (SDWAOpcode == -1)
      SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(Opcode));
  }
  assert(SDWAOpcode != -1);

  const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode);

  // Create the SDWA version of instruction MI and initialize its operands
  MachineInstrBuilder SDWAInst =
    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc)
    .setMIFlags(MI.getFlags());

  // Copy dst; if it is present in the original then it should also be present
  // in the SDWA version
  MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  if (Dst) {
    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst) != -1);
    SDWAInst.add(*Dst);
  } else if ((Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst))) {
    assert(Dst &&
           AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1);
    SDWAInst.add(*Dst);
  } else {
    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1);
    SDWAInst.addReg(TRI->getVCC(), RegState::Define);
  }

  // Copy src0 and initialize src0_modifiers. All SDWA instructions have src0
  // and src0_modifiers (except for v_nop_sdwa, but it can't get here)
  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  assert(
    Src0 &&
    AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0) != -1 &&
    AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_modifiers) != -1);
  if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers))
    SDWAInst.addImm(Mod->getImm());
  else
    SDWAInst.addImm(0);
  SDWAInst.add(*Src0);

  // Copy src1 if present, initialize src1_modifiers.
  MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  if (Src1) {
    assert(
      AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1) != -1 &&
      AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_modifiers) != -1);
    if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers))
      SDWAInst.addImm(Mod->getImm());
    else
      SDWAInst.addImm(0);
    SDWAInst.add(*Src1);
  }

  if (SDWAOpcode == AMDGPU::V_FMAC_F16_sdwa ||
      SDWAOpcode == AMDGPU::V_FMAC_F32_sdwa ||
      SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||
      SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) {
    // v_mac_f16/32 has an additional src2 operand tied to vdst
    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    assert(Src2);
    SDWAInst.add(*Src2);
  }

  // Copy clamp if present, initialize otherwise
  assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::clamp) != -1);
  MachineOperand *Clamp = TII->getNamedOperand(MI, AMDGPU::OpName::clamp);
  if (Clamp) {
    SDWAInst.add(*Clamp);
  } else {
    SDWAInst.addImm(0);
  }

  // Copy omod if present, initialize otherwise if needed
  if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::omod) != -1) {
    MachineOperand *OMod = TII->getNamedOperand(MI, AMDGPU::OpName::omod);
    if (OMod) {
      SDWAInst.add(*OMod);
    } else {
      SDWAInst.addImm(0);
    }
  }

  // Copy dst_sel if present, initialize otherwise if needed
  if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_sel) != -1) {
    MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
    if (DstSel) {
      SDWAInst.add(*DstSel);
    } else {
      SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
    }
  }

  // Copy dst_unused if present, initialize otherwise if needed
  if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_unused) != -1) {
    MachineOperand *DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
    if (DstUnused) {
      SDWAInst.add(*DstUnused);
    } else {
      SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD);
    }
  }

  // Copy src0_sel if present, initialize otherwise
  assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_sel) != -1);
  MachineOperand *Src0Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
  if (Src0Sel) {
    SDWAInst.add(*Src0Sel);
  } else {
    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
  }

  // Copy src1_sel if present, initialize otherwise if needed
  if (Src1) {
    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_sel) != -1);
    MachineOperand *Src1Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
    if (Src1Sel) {
      SDWAInst.add(*Src1Sel);
    } else {
      SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
    }
  }

  // Check for a preserved register that needs to be copied.
  auto DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
  if (DstUnused &&
      DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
    // We expect, if we are here, that the instruction was already in its SDWA
    // form, with a tied operand.
    assert(Dst && Dst->isTied());
    assert(Opcode == static_cast<unsigned int>(SDWAOpcode));
    // We also expect a vdst, since sdst can't preserve.
    auto PreserveDstIdx = AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst);
    assert(PreserveDstIdx != -1);

    auto TiedIdx = MI.findTiedOperandIdx(PreserveDstIdx);
    auto Tied = MI.getOperand(TiedIdx);

    SDWAInst.add(Tied);
    SDWAInst->tieOperands(PreserveDstIdx, SDWAInst->getNumOperands() - 1);
  }

  // Apply all SDWA operand patterns.
  bool Converted = false;
  for (auto &Operand : SDWAOperands) {
    LLVM_DEBUG(dbgs() << *SDWAInst << "\nOperand: " << *Operand);
    // There should be no intersection between SDWA operands and potential MIs
    // e.g.:
    // v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0
    // v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0
    // v_add_u32 v3, v4, v2
    //
    // In that example it is possible that we would fold the 2nd instruction
    // into the 3rd (v_add_u32_sdwa) and then try to fold the 1st instruction
    // into the 2nd (which was already destroyed). So if an SDWAOperand is
    // also a potential MI then do not apply it.
    if (PotentialMatches.count(Operand->getParentInst()) == 0)
      Converted |= Operand->convertToSDWA(*SDWAInst, TII);
  }
  if (Converted) {
    ConvertedInstructions.push_back(SDWAInst);
  } else {
    SDWAInst->eraseFromParent();
    return false;
  }

  LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n');
  ++NumSDWAInstructionsPeepholed;

  MI.eraseFromParent();
  return true;
}

// If an instruction was converted to SDWA it should not have immediates or
// SGPR operands (one SGPR operand is allowed on GFX9). Copy its scalar
// operands into VGPRs.
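// E.g. (a sketch; register names and the elided operands are illustrative):
//   %2 = V_ADD_F16_sdwa 0, %0, 0, %sgpr_or_imm, ...
// becomes
//   %3 = V_MOV_B32_e32 %sgpr_or_imm
//   %2 = V_ADD_F16_sdwa 0, %0, 0, %3, ...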
void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI,
                                            const GCNSubtarget &ST) const {
  const MCInstrDesc &Desc = TII->get(MI.getOpcode());
  unsigned ConstantBusCount = 0;
  for (MachineOperand &Op : MI.explicit_uses()) {
    if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg())))
      continue;

    unsigned I = MI.getOperandNo(&Op);
    if (Desc.OpInfo[I].RegClass == -1 ||
        !TRI->hasVGPRs(TRI->getRegClass(Desc.OpInfo[I].RegClass)))
      continue;

    if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() &&
        TRI->isSGPRReg(*MRI, Op.getReg())) {
      ++ConstantBusCount;
      continue;
    }

    Register VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
                        TII->get(AMDGPU::V_MOV_B32_e32), VGPR);
    if (Op.isImm())
      Copy.addImm(Op.getImm());
    else if (Op.isReg())
      Copy.addReg(Op.getReg(), Op.isKill() ? RegState::Kill : 0,
                  Op.getSubReg());
    Op.ChangeToRegister(VGPR, false);
  }
}

bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  if (!ST.hasSDWA() || skipFunction(MF.getFunction()))
    return false;

  MRI = &MF.getRegInfo();
  TRI = ST.getRegisterInfo();
  TII = ST.getInstrInfo();

  // Find all SDWA operands in MF.
  bool Ret = false;
  for (MachineBasicBlock &MBB : MF) {
    bool Changed = false;
    do {
      // Preprocess the ADD/SUB pairs so they could be SDWA'ed.
      // Look for a possible ADD or SUB that resulted from a previously lowered
      // V_{ADD|SUB}_U64_PSEUDO. The function pseudoOpConvertToVOP2
      // lowers the pair of instructions into e32 form.
      matchSDWAOperands(MBB);
      for (const auto &OperandPair : SDWAOperands) {
        const auto &Operand = OperandPair.second;
        MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
        if (PotentialMI &&
            (PotentialMI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 ||
             PotentialMI->getOpcode() == AMDGPU::V_SUB_CO_U32_e64))
          pseudoOpConvertToVOP2(*PotentialMI, ST);
      }
      SDWAOperands.clear();

      // Generate potential match list.
      matchSDWAOperands(MBB);

      for (const auto &OperandPair : SDWAOperands) {
        const auto &Operand = OperandPair.second;
        MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
        if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) {
          PotentialMatches[PotentialMI].push_back(Operand.get());
        }
      }

      for (auto &PotentialPair : PotentialMatches) {
        MachineInstr &PotentialMI = *PotentialPair.first;
        convertToSDWA(PotentialMI, PotentialPair.second);
      }

      PotentialMatches.clear();
      SDWAOperands.clear();

      Changed = !ConvertedInstructions.empty();

      if (Changed)
        Ret = true;
      while (!ConvertedInstructions.empty())
        legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);
    } while (Changed);
  }

  return Ret;
}