xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp (revision 1342eb5a832fa10e689a29faab3acb6054e4778c)
1 //===- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions ---===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file This pass tries to apply several peephole SDWA patterns.
10 ///
11 /// E.g. original:
12 ///   V_LSHRREV_B32_e32 %0, 16, %1
13 ///   V_ADD_CO_U32_e32 %2, %0, %3
14 ///   V_LSHLREV_B32_e32 %4, 16, %2
15 ///
16 /// Replace:
17 ///   V_ADD_CO_U32_sdwa %4, %1, %3
18 ///       dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
19 ///
20 //===----------------------------------------------------------------------===//
21 
22 #include "SIPeepholeSDWA.h"
23 #include "AMDGPU.h"
24 #include "GCNSubtarget.h"
25 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
26 #include "llvm/ADT/MapVector.h"
27 #include "llvm/ADT/Statistic.h"
28 #include "llvm/CodeGen/MachineFunctionPass.h"
29 #include <optional>
30 
31 using namespace llvm;
32 
33 #define DEBUG_TYPE "si-peephole-sdwa"
34 
35 STATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found.");
36 STATISTIC(NumSDWAInstructionsPeepholed,
37           "Number of instruction converted to SDWA.");
38 
39 namespace {
40 
// Forward declaration; the definition appears later in this file.
bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST,
                         const SIInstrInfo *TII);
class SDWAOperand;
class SDWADstOperand;

// All SDWA operand patterns that could be applied to one instruction.
using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>;
// Map from a candidate instruction to the SDWA operands matched for it.
using SDWAOperandsMap = MapVector<MachineInstr *, SDWAOperandsVector>;
48 
/// Implementation of the SDWA peephole, shared by the legacy and new
/// pass-manager wrappers.
class SIPeepholeSDWA {
private:
  MachineRegisterInfo *MRI;
  const SIRegisterInfo *TRI;
  const SIInstrInfo *TII;

  // Operand pattern matched for each instruction that produced one.
  MapVector<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;
  // Candidate instructions together with the operands feeding them.
  SDWAOperandsMap PotentialMatches;
  SmallVector<MachineInstr *, 8> ConvertedInstructions;

  /// If \p Op is an immediate, or a register defined by a foldable copy
  /// of an immediate, return that immediate value.
  std::optional<int64_t> foldToImm(const MachineOperand &Op) const;

  /// Scan \p MBB and record every SDWA operand pattern found.
  void matchSDWAOperands(MachineBasicBlock &MBB);
  /// Try to derive a single SDWA operand pattern from \p MI.
  std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI);
  // NOTE(review): the bodies of the helpers below are outside this chunk;
  // names suggest VOP3->VOP2 shrinking and SDWA materialization — confirm.
  void pseudoOpConvertToVOP2(MachineInstr &MI,
                             const GCNSubtarget &ST) const;
  void convertVcndmaskToVOP2(MachineInstr &MI, const GCNSubtarget &ST) const;
  MachineInstr *createSDWAVersion(MachineInstr &MI);
  bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
  void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const;

public:
  /// Run the peephole over \p MF; presumably returns whether the
  /// function was modified (body not visible in this chunk).
  bool run(MachineFunction &MF);
};
73 
/// Legacy pass-manager wrapper around SIPeepholeSDWA.
class SIPeepholeSDWALegacy : public MachineFunctionPass {
public:
  static char ID;

  SIPeepholeSDWALegacy() : MachineFunctionPass(ID) {}

  StringRef getPassName() const override { return "SI Peephole SDWA"; }

  bool runOnMachineFunction(MachineFunction &MF) override;

  // The peephole rewrites instructions in place and never alters the CFG.
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};
89 
90 using namespace AMDGPU::SDWA;
91 
/// One opportunity to fold an instruction into an SDWA use or def of
/// its operand. Concrete subclasses represent source (SDWASrcOperand)
/// and destination (SDWADstOperand) patterns.
class SDWAOperand {
private:
  MachineOperand *Target; // Operand that would be used in converted instruction
  MachineOperand *Replaced; // Operand that would be replace by Target

  /// Returns true iff the SDWA selection of this SDWAOperand can be combined
  /// with the SDWA selections of its uses in \p MI.
  virtual bool canCombineSelections(const MachineInstr &MI,
                                    const SIInstrInfo *TII) = 0;

public:
  // Both operands must be registers.
  SDWAOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp)
      : Target(TargetOp), Replaced(ReplacedOp) {
    assert(Target->isReg());
    assert(Replaced->isReg());
  }

  virtual ~SDWAOperand() = default;

  /// Find the instruction that could be converted to SDWA using this
  /// operand, or nullptr if there is none. When \p PotentialMatches is
  /// non-null, implementations may instead record several candidates in
  /// the map (see SDWASrcOperand::potentialToConvert).
  virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII,
                                           const GCNSubtarget &ST,
                                           SDWAOperandsMap *PotentialMatches = nullptr) = 0;
  /// Rewrite the SDWA instruction \p MI in place to use this operand.
  /// Returns false if the conversion turns out to be illegal.
  virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0;

  MachineOperand *getTargetOperand() const { return Target; }
  MachineOperand *getReplacedOperand() const { return Replaced; }
  MachineInstr *getParentInst() const { return Target->getParent(); }

  // Register info of the function containing the target operand.
  MachineRegisterInfo *getMRI() const {
    return &getParentInst()->getParent()->getParent()->getRegInfo();
  }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  virtual void print(raw_ostream& OS) const = 0;
  void dump() const { print(dbgs()); }
#endif
};
129 
/// An SDWA pattern where the parent instruction extracts a sub-dword
/// value (optionally with abs/neg/sext semantics) that a later use
/// could read directly through src_sel.
class SDWASrcOperand : public SDWAOperand {
private:
  SdwaSel SrcSel; // Sub-dword lane the use should select.
  bool Abs;       // Apply |x| (float modifier).
  bool Neg;       // Apply -x (float modifier).
  bool Sext;      // Sign-extend the selected lane (integer modifier).

public:
  SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel SrcSel_ = DWORD, bool Abs_ = false, bool Neg_ = false,
                 bool Sext_ = false)
      : SDWAOperand(TargetOp, ReplacedOp), SrcSel(SrcSel_), Abs(Abs_),
        Neg(Neg_), Sext(Sext_) {}

  MachineInstr *potentialToConvert(const SIInstrInfo *TII,
                                   const GCNSubtarget &ST,
                                   SDWAOperandsMap *PotentialMatches = nullptr) override;
  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
  bool canCombineSelections(const MachineInstr &MI,
                            const SIInstrInfo *TII) override;

  SdwaSel getSrcSel() const { return SrcSel; }
  bool getAbs() const { return Abs; }
  bool getNeg() const { return Neg; }
  bool getSext() const { return Sext; }

  /// Build the src_modifiers immediate for the use of \p SrcOp in its
  /// parent instruction, merging this operand's abs/neg/sext flags.
  uint64_t getSrcMods(const SIInstrInfo *TII,
                      const MachineOperand *SrcOp) const;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};
163 
/// An SDWA pattern where the defining instruction could write a
/// sub-dword lane of its result directly through dst_sel, making a
/// following shift/pack instruction redundant.
class SDWADstOperand : public SDWAOperand {
private:
  SdwaSel DstSel;  // Lane of the destination that is written.
  DstUnused DstUn; // Treatment of the lanes that are not written.

public:
  SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD)
      : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}

  MachineInstr *potentialToConvert(const SIInstrInfo *TII,
                                   const GCNSubtarget &ST,
                                   SDWAOperandsMap *PotentialMatches = nullptr) override;
  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
  bool canCombineSelections(const MachineInstr &MI,
                            const SIInstrInfo *TII) override;

  SdwaSel getDstSel() const { return DstSel; }
  DstUnused getDstUnused() const { return DstUn; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};
188 
/// A dst pattern that additionally keeps the unwritten lanes of the
/// destination from another register (UNUSED_PRESERVE); matched from a
/// v_or_b32 that merges an SDWA result with a second value.
class SDWADstPreserveOperand : public SDWADstOperand {
private:
  MachineOperand *Preserve; // Value whose lanes are kept in the result.

public:
  SDWADstPreserveOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                         MachineOperand *PreserveOp, SdwaSel DstSel_ = DWORD)
      : SDWADstOperand(TargetOp, ReplacedOp, DstSel_, UNUSED_PRESERVE),
        Preserve(PreserveOp) {}

  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
  bool canCombineSelections(const MachineInstr &MI,
                            const SIInstrInfo *TII) override;

  MachineOperand *getPreservedOperand() const { return Preserve; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};
209 
210 } // end anonymous namespace
211 
// Register the legacy wrapper with the pass registry.
INITIALIZE_PASS(SIPeepholeSDWALegacy, DEBUG_TYPE, "SI Peephole SDWA", false,
                false)

char SIPeepholeSDWALegacy::ID = 0;

// Exported address used elsewhere to identify this pass by its ID.
char &llvm::SIPeepholeSDWALegacyID = SIPeepholeSDWALegacy::ID;

// Factory for the legacy pass-manager pipeline.
FunctionPass *llvm::createSIPeepholeSDWALegacyPass() {
  return new SIPeepholeSDWALegacy();
}
222 
223 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
224 static raw_ostream& operator<<(raw_ostream &OS, SdwaSel Sel) {
225   switch(Sel) {
226   case BYTE_0: OS << "BYTE_0"; break;
227   case BYTE_1: OS << "BYTE_1"; break;
228   case BYTE_2: OS << "BYTE_2"; break;
229   case BYTE_3: OS << "BYTE_3"; break;
230   case WORD_0: OS << "WORD_0"; break;
231   case WORD_1: OS << "WORD_1"; break;
232   case DWORD:  OS << "DWORD"; break;
233   }
234   return OS;
235 }
236 
237 static raw_ostream& operator<<(raw_ostream &OS, const DstUnused &Un) {
238   switch(Un) {
239   case UNUSED_PAD: OS << "UNUSED_PAD"; break;
240   case UNUSED_SEXT: OS << "UNUSED_SEXT"; break;
241   case UNUSED_PRESERVE: OS << "UNUSED_PRESERVE"; break;
242   }
243   return OS;
244 }
245 
// Debug dump of a matched src operand with its selection and modifiers.
LLVM_DUMP_METHOD
void SDWASrcOperand::print(raw_ostream& OS) const {
  OS << "SDWA src: " << *getTargetOperand()
    << " src_sel:" << getSrcSel()
    << " abs:" << getAbs() << " neg:" << getNeg()
    << " sext:" << getSext() << '\n';
}
253 
// Debug dump of a matched dst operand with its selection and unused mode.
LLVM_DUMP_METHOD
void SDWADstOperand::print(raw_ostream& OS) const {
  OS << "SDWA dst: " << *getTargetOperand()
    << " dst_sel:" << getDstSel()
    << " dst_unused:" << getDstUnused() << '\n';
}
260 
// Debug dump of a matched preserve-dst operand and the preserved value.
LLVM_DUMP_METHOD
void SDWADstPreserveOperand::print(raw_ostream& OS) const {
  OS << "SDWA preserve dst: " << *getTargetOperand()
    << " dst_sel:" << getDstSel()
    << " preserve:" << *getPreservedOperand() << '\n';
}
267 
268 #endif
269 
270 static void copyRegOperand(MachineOperand &To, const MachineOperand &From) {
271   assert(To.isReg() && From.isReg());
272   To.setReg(From.getReg());
273   To.setSubReg(From.getSubReg());
274   To.setIsUndef(From.isUndef());
275   if (To.isUse()) {
276     To.setIsKill(From.isKill());
277   } else {
278     To.setIsDead(From.isDead());
279   }
280 }
281 
282 static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) {
283   return LHS.isReg() &&
284          RHS.isReg() &&
285          LHS.getReg() == RHS.getReg() &&
286          LHS.getSubReg() == RHS.getSubReg();
287 }
288 
289 static MachineOperand *findSingleRegUse(const MachineOperand *Reg,
290                                         const MachineRegisterInfo *MRI) {
291   if (!Reg->isReg() || !Reg->isDef())
292     return nullptr;
293 
294   MachineOperand *ResMO = nullptr;
295   for (MachineOperand &UseMO : MRI->use_nodbg_operands(Reg->getReg())) {
296     // If there exist use of subreg of Reg then return nullptr
297     if (!isSameReg(UseMO, *Reg))
298       return nullptr;
299 
300     // Check that there is only one instruction that uses Reg
301     if (!ResMO) {
302       ResMO = &UseMO;
303     } else if (ResMO->getParent() != UseMO.getParent()) {
304       return nullptr;
305     }
306   }
307 
308   return ResMO;
309 }
310 
311 static MachineOperand *findSingleRegDef(const MachineOperand *Reg,
312                                         const MachineRegisterInfo *MRI) {
313   if (!Reg->isReg())
314     return nullptr;
315 
316   MachineInstr *DefInstr = MRI->getUniqueVRegDef(Reg->getReg());
317   if (!DefInstr)
318     return nullptr;
319 
320   for (auto &DefMO : DefInstr->defs()) {
321     if (DefMO.isReg() && DefMO.getReg() == Reg->getReg())
322       return &DefMO;
323   }
324 
325   // Ignore implicit defs.
326   return nullptr;
327 }
328 
329 /// Combine an SDWA instruction's existing SDWA selection \p Sel with
330 /// the SDWA selection \p OperandSel of its operand. If the selections
331 /// are compatible, return the combined selection, otherwise return a
332 /// nullopt.
333 /// For example, if we have Sel = BYTE_0 Sel and OperandSel = WORD_1:
334 ///     BYTE_0 Sel (WORD_1 Sel (%X)) -> BYTE_2 Sel (%X)
335 static std::optional<SdwaSel> combineSdwaSel(SdwaSel Sel, SdwaSel OperandSel) {
336   if (Sel == SdwaSel::DWORD)
337     return OperandSel;
338 
339   if (Sel == OperandSel || OperandSel == SdwaSel::DWORD)
340     return Sel;
341 
342   if (Sel == SdwaSel::WORD_1 || Sel == SdwaSel::BYTE_2 ||
343       Sel == SdwaSel::BYTE_3)
344     return {};
345 
346   if (OperandSel == SdwaSel::WORD_0)
347     return Sel;
348 
349   if (OperandSel == SdwaSel::WORD_1) {
350     if (Sel == SdwaSel::BYTE_0)
351       return SdwaSel::BYTE_2;
352     if (Sel == SdwaSel::BYTE_1)
353       return SdwaSel::BYTE_3;
354     if (Sel == SdwaSel::WORD_0)
355       return SdwaSel::WORD_1;
356   }
357 
358   return {};
359 }
360 
/// Build the src_modifiers immediate for the use \p SrcOp, starting
/// from the modifiers already present on the matching source slot of
/// its parent instruction and merging this operand's flags.
uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII,
                                    const MachineOperand *SrcOp) const {
  uint64_t Mods = 0;
  const auto *MI = SrcOp->getParent();
  // Seed with the existing modifiers of whichever slot SrcOp occupies.
  if (TII->getNamedOperand(*MI, AMDGPU::OpName::src0) == SrcOp) {
    if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) {
      Mods = Mod->getImm();
    }
  } else if (TII->getNamedOperand(*MI, AMDGPU::OpName::src1) == SrcOp) {
    if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers)) {
      Mods = Mod->getImm();
    }
  }
  if (Abs || Neg) {
    assert(!Sext &&
           "Float and integer src modifiers can't be set simultaneously");
    Mods |= Abs ? SISrcMods::ABS : 0u;
    // XOR rather than OR: if the instruction already negates this
    // source, the two negations cancel out.
    Mods ^= Neg ? SISrcMods::NEG : 0u;
  } else if (Sext) {
    Mods |= SISrcMods::SEXT;
  }

  return Mods;
}
385 
/// Find the instruction(s) that could absorb this src operand. With
/// \p PotentialMatches set, every use of the replaced register is
/// recorded as a candidate (but only if all of them are convertible)
/// and nullptr is returned; otherwise the single use, if any, is
/// returned directly.
MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII,
                                                 const GCNSubtarget &ST,
                                                 SDWAOperandsMap *PotentialMatches) {
  if (PotentialMatches != nullptr) {
    // Fill out the map for all uses if all can be converted
    MachineOperand *Reg = getReplacedOperand();
    if (!Reg->isReg() || !Reg->isDef())
      return nullptr;

    for (MachineInstr &UseMI : getMRI()->use_nodbg_instructions(Reg->getReg()))
      // Check that all instructions that use Reg can be converted
      if (!isConvertibleToSDWA(UseMI, ST, TII) ||
          !canCombineSelections(UseMI, TII))
        return nullptr;

    // Now that it's guaranteed all uses are legal, iterate over the uses again
    // to add them for later conversion.
    for (MachineOperand &UseMO : getMRI()->use_nodbg_operands(Reg->getReg())) {
      // Should not get a subregister here
      assert(isSameReg(UseMO, *Reg));

      SDWAOperandsMap &potentialMatchesMap = *PotentialMatches;
      MachineInstr *UseMI = UseMO.getParent();
      potentialMatchesMap[UseMI].push_back(this);
    }
    // All candidates were recorded in the map; no single result.
    return nullptr;
  }

  // For SDWA src operand potential instruction is one that use register
  // defined by parent instruction
  MachineOperand *PotentialMO = findSingleRegUse(getReplacedOperand(), getMRI());
  if (!PotentialMO)
    return nullptr;

  MachineInstr *Parent = PotentialMO->getParent();

  // Reject the candidate if its existing SDWA selections clash with ours.
  return canCombineSelections(*Parent, TII) ? Parent : nullptr;
}
424 
/// Rewrite the use of this operand inside the SDWA instruction \p MI:
/// locate the matching src slot (src0 or src1, or the tied preserve
/// slot), substitute the target register, and update that slot's
/// src_sel and src_modifiers. Returns false if the conversion is not
/// legal for \p MI.
bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  switch (MI.getOpcode()) {
  case AMDGPU::V_CVT_F32_FP8_sdwa:
  case AMDGPU::V_CVT_F32_BF8_sdwa:
  case AMDGPU::V_CVT_PK_F32_FP8_sdwa:
  case AMDGPU::V_CVT_PK_F32_BF8_sdwa:
    // Does not support input modifiers: noabs, noneg, nosext.
    return false;
  case AMDGPU::V_CNDMASK_B32_sdwa:
    // SISrcMods uses the same bitmask for SEXT and NEG modifiers and
    // hence the compiler can only support one type of modifier for
    // each SDWA instruction.  For V_CNDMASK_B32_sdwa, this is NEG
    // since its operands get printed using
    // AMDGPUInstPrinter::printOperandAndFPInputMods which produces
    // the output intended for NEG if SEXT is set.
    //
    // The ISA does actually support both modifiers on most SDWA
    // instructions.
    //
    // FIXME Accept SEXT here after fixing this issue.
    if (Sext)
      return false;
    break;
  }

  // Find operand in instruction that matches source operand and replace it with
  // target operand. Set corresponding src_sel
  bool IsPreserveSrc = false;
  MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
  MachineOperand *SrcMods =
      TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
  assert(Src && (Src->isReg() || Src->isImm()));
  if (!isSameReg(*Src, *getReplacedOperand())) {
    // If this is not src0 then it could be src1
    Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
    SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);

    if (!Src ||
        !isSameReg(*Src, *getReplacedOperand())) {
      // It's possible this Src is a tied operand for
      // UNUSED_PRESERVE, in which case we can either
      // abandon the peephole attempt, or if legal we can
      // copy the target operand into the tied slot
      // if the preserve operation will effectively cause the same
      // result by overwriting the rest of the dst.
      MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
      MachineOperand *DstUnused =
        TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);

      // NOTE(review): DstUnused is dereferenced without a null check while
      // Dst is checked; presumably every SDWA opcode reaching here carries
      // a dst_unused operand — confirm.
      if (Dst &&
          DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
        // This will work if the tied src is accessing WORD_0, and the dst is
        // writing WORD_1. Modifiers don't matter because all the bits that
        // would be impacted are being overwritten by the dst.
        // Any other case will not work.
        SdwaSel DstSel = static_cast<SdwaSel>(
            TII->getNamedImmOperand(MI, AMDGPU::OpName::dst_sel));
        if (DstSel == AMDGPU::SDWA::SdwaSel::WORD_1 &&
            getSrcSel() == AMDGPU::SDWA::SdwaSel::WORD_0) {
          IsPreserveSrc = true;
          auto DstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                                   AMDGPU::OpName::vdst);
          auto TiedIdx = MI.findTiedOperandIdx(DstIdx);
          Src = &MI.getOperand(TiedIdx);
          SrcSel = nullptr;
          SrcMods = nullptr;
        } else {
          // Not legal to convert this src
          return false;
        }
      }
    }
    assert(Src && Src->isReg());

    if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
         MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
         MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
         MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
         !isSameReg(*Src, *getReplacedOperand())) {
      // In case of v_mac_f16/32_sdwa this pass can try to apply src operand to
      // src2. This is not allowed.
      return false;
    }

    assert(isSameReg(*Src, *getReplacedOperand()) &&
           (IsPreserveSrc || (SrcSel && SrcMods)));
  }
  copyRegOperand(*Src, *getTargetOperand());
  if (!IsPreserveSrc) {
    // Compatibility was established via canCombineSelections() before
    // this point, so the combined selection is known to exist.
    SdwaSel ExistingSel = static_cast<SdwaSel>(SrcSel->getImm());
    SrcSel->setImm(*combineSdwaSel(ExistingSel, getSrcSel()));
    SrcMods->setImm(getSrcMods(TII, Src));
  }
  // The register gained a new use here, so any previous kill marker on
  // the target operand is no longer valid.
  getTargetOperand()->setIsKill(false);
  return true;
}
523 
524 /// Verify that the SDWA selection operand \p SrcSelOpName of the SDWA
525 /// instruction \p MI can be combined with the selection \p OpSel.
526 static bool canCombineOpSel(const MachineInstr &MI, const SIInstrInfo *TII,
527                             AMDGPU::OpName SrcSelOpName, SdwaSel OpSel) {
528   assert(TII->isSDWA(MI.getOpcode()));
529 
530   const MachineOperand *SrcSelOp = TII->getNamedOperand(MI, SrcSelOpName);
531   SdwaSel SrcSel = static_cast<SdwaSel>(SrcSelOp->getImm());
532 
533   return combineSdwaSel(SrcSel, OpSel).has_value();
534 }
535 
536 /// Verify that \p Op is the same register as the operand of the SDWA
537 /// instruction \p MI named by \p SrcOpName and that the SDWA
538 /// selection \p SrcSelOpName can be combined with the \p OpSel.
539 static bool canCombineOpSel(const MachineInstr &MI, const SIInstrInfo *TII,
540                             AMDGPU::OpName SrcOpName,
541                             AMDGPU::OpName SrcSelOpName, MachineOperand *Op,
542                             SdwaSel OpSel) {
543   assert(TII->isSDWA(MI.getOpcode()));
544 
545   const MachineOperand *Src = TII->getNamedOperand(MI, SrcOpName);
546   if (!Src || !isSameReg(*Src, *Op))
547     return true;
548 
549   return canCombineOpSel(MI, TII, SrcSelOpName, OpSel);
550 }
551 
552 bool SDWASrcOperand::canCombineSelections(const MachineInstr &MI,
553                                           const SIInstrInfo *TII) {
554   if (!TII->isSDWA(MI.getOpcode()))
555     return true;
556 
557   using namespace AMDGPU;
558 
559   return canCombineOpSel(MI, TII, OpName::src0, OpName::src0_sel,
560                          getReplacedOperand(), getSrcSel()) &&
561          canCombineOpSel(MI, TII, OpName::src1, OpName::src1_sel,
562                          getReplacedOperand(), getSrcSel());
563 }
564 
565 MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII,
566                                                  const GCNSubtarget &ST,
567                                                  SDWAOperandsMap *PotentialMatches) {
568   // For SDWA dst operand potential instruction is one that defines register
569   // that this operand uses
570   MachineRegisterInfo *MRI = getMRI();
571   MachineInstr *ParentMI = getParentInst();
572 
573   MachineOperand *PotentialMO = findSingleRegDef(getReplacedOperand(), MRI);
574   if (!PotentialMO)
575     return nullptr;
576 
577   // Check that ParentMI is the only instruction that uses replaced register
578   for (MachineInstr &UseInst : MRI->use_nodbg_instructions(PotentialMO->getReg())) {
579     if (&UseInst != ParentMI)
580       return nullptr;
581   }
582 
583   MachineInstr *Parent = PotentialMO->getParent();
584   return canCombineSelections(*Parent, TII) ? Parent : nullptr;
585 }
586 
/// Rewrite the vdst of the SDWA instruction \p MI to this operand's
/// target register, update dst_sel/dst_unused, and erase the parent
/// (shift/pack) instruction whose result is now produced directly.
bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  // Replace vdst operand in MI with target operand. Set dst_sel and dst_unused

  if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
      getDstSel() != AMDGPU::SDWA::DWORD) {
    // v_mac_f16/32_sdwa allow dst_sel to be equal only to DWORD
    return false;
  }

  MachineOperand *Operand = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  assert(Operand &&
         Operand->isReg() &&
         isSameReg(*Operand, *getReplacedOperand()));
  copyRegOperand(*Operand, *getTargetOperand());
  MachineOperand *DstSel= TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
  assert(DstSel);

  // Compatibility was established via canCombineSelections() (see
  // potentialToConvert), so value() cannot fail here.
  SdwaSel ExistingSel = static_cast<SdwaSel>(DstSel->getImm());
  DstSel->setImm(combineSdwaSel(ExistingSel, getDstSel()).value());

  MachineOperand *DstUnused= TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
  assert(DstUnused);
  DstUnused->setImm(getDstUnused());

  // Remove original instruction  because it would conflict with our new
  // instruction by register definition
  getParentInst()->eraseFromParent();
  return true;
}
619 
620 bool SDWADstOperand::canCombineSelections(const MachineInstr &MI,
621                                           const SIInstrInfo *TII) {
622   if (!TII->isSDWA(MI.getOpcode()))
623     return true;
624 
625   return canCombineOpSel(MI, TII, AMDGPU::OpName::dst_sel, getDstSel());
626 }
627 
/// Convert the SDWA def \p MI feeding a v_or_b32 preserve pattern:
/// move \p MI down to the v_or_b32, tie the preserved value to vdst as
/// an implicit use, then perform the ordinary dst conversion (which
/// also erases the v_or_b32).
bool SDWADstPreserveOperand::convertToSDWA(MachineInstr &MI,
                                           const SIInstrInfo *TII) {
  // MI should be moved right before v_or_b32.
  // For this we should clear all kill flags on uses of MI src-operands or else
  // we can encounter problem with use of killed operand.
  for (MachineOperand &MO : MI.uses()) {
    if (!MO.isReg())
      continue;
    getMRI()->clearKillFlags(MO.getReg());
  }

  // Move MI before v_or_b32
  MI.getParent()->remove(&MI);
  getParentInst()->getParent()->insert(getParentInst(), &MI);

  // Add Implicit use of preserved register
  MachineInstrBuilder MIB(*MI.getMF(), MI);
  MIB.addReg(getPreservedOperand()->getReg(),
             RegState::ImplicitKill,
             getPreservedOperand()->getSubReg());

  // Tie dst to implicit use
  MI.tieOperands(AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst),
                 MI.getNumOperands() - 1);

  // Convert MI as any other SDWADstOperand and remove v_or_b32
  return SDWADstOperand::convertToSDWA(MI, TII);
}
656 
// A preserve dst imposes no constraints beyond a plain dst operand.
bool SDWADstPreserveOperand::canCombineSelections(const MachineInstr &MI,
                                                  const SIInstrInfo *TII) {
  return SDWADstOperand::canCombineSelections(MI, TII);
}
661 
662 std::optional<int64_t>
663 SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const {
664   if (Op.isImm()) {
665     return Op.getImm();
666   }
667 
668   // If this is not immediate then it can be copy of immediate value, e.g.:
669   // %1 = S_MOV_B32 255;
670   if (Op.isReg()) {
671     for (const MachineOperand &Def : MRI->def_operands(Op.getReg())) {
672       if (!isSameReg(Op, Def))
673         continue;
674 
675       const MachineInstr *DefInst = Def.getParent();
676       if (!TII->isFoldableCopy(*DefInst))
677         return std::nullopt;
678 
679       const MachineOperand &Copied = DefInst->getOperand(1);
680       if (!Copied.isImm())
681         return std::nullopt;
682 
683       return Copied.getImm();
684     }
685   }
686 
687   return std::nullopt;
688 }
689 
690 std::unique_ptr<SDWAOperand>
691 SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
692   unsigned Opcode = MI.getOpcode();
693   switch (Opcode) {
694   case AMDGPU::V_LSHRREV_B32_e32:
695   case AMDGPU::V_ASHRREV_I32_e32:
696   case AMDGPU::V_LSHLREV_B32_e32:
697   case AMDGPU::V_LSHRREV_B32_e64:
698   case AMDGPU::V_ASHRREV_I32_e64:
699   case AMDGPU::V_LSHLREV_B32_e64: {
700     // from: v_lshrrev_b32_e32 v1, 16/24, v0
701     // to SDWA src:v0 src_sel:WORD_1/BYTE_3
702 
703     // from: v_ashrrev_i32_e32 v1, 16/24, v0
704     // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1
705 
706     // from: v_lshlrev_b32_e32 v1, 16/24, v0
707     // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD
708     MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
709     auto Imm = foldToImm(*Src0);
710     if (!Imm)
711       break;
712 
713     if (*Imm != 16 && *Imm != 24)
714       break;
715 
716     MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
717     MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
718     if (!Src1->isReg() || Src1->getReg().isPhysical() ||
719         Dst->getReg().isPhysical())
720       break;
721 
722     if (Opcode == AMDGPU::V_LSHLREV_B32_e32 ||
723         Opcode == AMDGPU::V_LSHLREV_B32_e64) {
724       return std::make_unique<SDWADstOperand>(
725           Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD);
726     }
727     return std::make_unique<SDWASrcOperand>(
728         Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false,
729         Opcode != AMDGPU::V_LSHRREV_B32_e32 &&
730             Opcode != AMDGPU::V_LSHRREV_B32_e64);
731     break;
732   }
733 
734   case AMDGPU::V_LSHRREV_B16_e32:
735   case AMDGPU::V_ASHRREV_I16_e32:
736   case AMDGPU::V_LSHLREV_B16_e32:
737   case AMDGPU::V_LSHRREV_B16_e64:
738   case AMDGPU::V_LSHRREV_B16_opsel_e64:
739   case AMDGPU::V_ASHRREV_I16_e64:
740   case AMDGPU::V_LSHLREV_B16_opsel_e64:
741   case AMDGPU::V_LSHLREV_B16_e64: {
742     // from: v_lshrrev_b16_e32 v1, 8, v0
743     // to SDWA src:v0 src_sel:BYTE_1
744 
745     // from: v_ashrrev_i16_e32 v1, 8, v0
746     // to SDWA src:v0 src_sel:BYTE_1 sext:1
747 
748     // from: v_lshlrev_b16_e32 v1, 8, v0
749     // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD
750     MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
751     auto Imm = foldToImm(*Src0);
752     if (!Imm || *Imm != 8)
753       break;
754 
755     MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
756     MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
757 
758     if (!Src1->isReg() || Src1->getReg().isPhysical() ||
759         Dst->getReg().isPhysical())
760       break;
761 
762     if (Opcode == AMDGPU::V_LSHLREV_B16_e32 ||
763         Opcode == AMDGPU::V_LSHLREV_B16_opsel_e64 ||
764         Opcode == AMDGPU::V_LSHLREV_B16_e64)
765       return std::make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
766     return std::make_unique<SDWASrcOperand>(
767         Src1, Dst, BYTE_1, false, false,
768         Opcode != AMDGPU::V_LSHRREV_B16_e32 &&
769             Opcode != AMDGPU::V_LSHRREV_B16_opsel_e64 &&
770             Opcode != AMDGPU::V_LSHRREV_B16_e64);
771     break;
772   }
773 
774   case AMDGPU::V_BFE_I32_e64:
775   case AMDGPU::V_BFE_U32_e64: {
776     // e.g.:
777     // from: v_bfe_u32 v1, v0, 8, 8
778     // to SDWA src:v0 src_sel:BYTE_1
779 
780     // offset | width | src_sel
781     // ------------------------
782     // 0      | 8     | BYTE_0
783     // 0      | 16    | WORD_0
784     // 0      | 32    | DWORD ?
785     // 8      | 8     | BYTE_1
786     // 16     | 8     | BYTE_2
787     // 16     | 16    | WORD_1
788     // 24     | 8     | BYTE_3
789 
790     MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
791     auto Offset = foldToImm(*Src1);
792     if (!Offset)
793       break;
794 
795     MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
796     auto Width = foldToImm(*Src2);
797     if (!Width)
798       break;
799 
800     SdwaSel SrcSel = DWORD;
801 
802     if (*Offset == 0 && *Width == 8)
803       SrcSel = BYTE_0;
804     else if (*Offset == 0 && *Width == 16)
805       SrcSel = WORD_0;
806     else if (*Offset == 0 && *Width == 32)
807       SrcSel = DWORD;
808     else if (*Offset == 8 && *Width == 8)
809       SrcSel = BYTE_1;
810     else if (*Offset == 16 && *Width == 8)
811       SrcSel = BYTE_2;
812     else if (*Offset == 16 && *Width == 16)
813       SrcSel = WORD_1;
814     else if (*Offset == 24 && *Width == 8)
815       SrcSel = BYTE_3;
816     else
817       break;
818 
819     MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
820     MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
821 
822     if (!Src0->isReg() || Src0->getReg().isPhysical() ||
823         Dst->getReg().isPhysical())
824       break;
825 
826     return std::make_unique<SDWASrcOperand>(
827           Src0, Dst, SrcSel, false, false, Opcode != AMDGPU::V_BFE_U32_e64);
828   }
829 
830   case AMDGPU::V_AND_B32_e32:
831   case AMDGPU::V_AND_B32_e64: {
832     // e.g.:
833     // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0
834     // to SDWA src:v0 src_sel:WORD_0/BYTE_0
835 
836     MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
837     MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
838     auto *ValSrc = Src1;
839     auto Imm = foldToImm(*Src0);
840 
841     if (!Imm) {
842       Imm = foldToImm(*Src1);
843       ValSrc = Src0;
844     }
845 
846     if (!Imm || (*Imm != 0x0000ffff && *Imm != 0x000000ff))
847       break;
848 
849     MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
850 
851     if (!ValSrc->isReg() || ValSrc->getReg().isPhysical() ||
852         Dst->getReg().isPhysical())
853       break;
854 
855     return std::make_unique<SDWASrcOperand>(
856         ValSrc, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0);
857   }
858 
859   case AMDGPU::V_OR_B32_e32:
860   case AMDGPU::V_OR_B32_e64: {
861     // Patterns for dst_unused:UNUSED_PRESERVE.
862     // e.g., from:
863     // v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD
864     //                           src1_sel:WORD_1 src2_sel:WORD1
865     // v_add_f16_e32 v3, v1, v2
866     // v_or_b32_e32 v4, v0, v3
867     // to SDWA preserve dst:v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE preserve:v3
868 
869     // Check if one of operands of v_or_b32 is SDWA instruction
870     using CheckRetType =
871         std::optional<std::pair<MachineOperand *, MachineOperand *>>;
872     auto CheckOROperandsForSDWA =
873       [&](const MachineOperand *Op1, const MachineOperand *Op2) -> CheckRetType {
874         if (!Op1 || !Op1->isReg() || !Op2 || !Op2->isReg())
875           return CheckRetType(std::nullopt);
876 
877         MachineOperand *Op1Def = findSingleRegDef(Op1, MRI);
878         if (!Op1Def)
879           return CheckRetType(std::nullopt);
880 
881         MachineInstr *Op1Inst = Op1Def->getParent();
882         if (!TII->isSDWA(*Op1Inst))
883           return CheckRetType(std::nullopt);
884 
885         MachineOperand *Op2Def = findSingleRegDef(Op2, MRI);
886         if (!Op2Def)
887           return CheckRetType(std::nullopt);
888 
889         return CheckRetType(std::pair(Op1Def, Op2Def));
890       };
891 
892     MachineOperand *OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
893     MachineOperand *OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
894     assert(OrSDWA && OrOther);
895     auto Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
896     if (!Res) {
897       OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
898       OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
899       assert(OrSDWA && OrOther);
900       Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
901       if (!Res)
902         break;
903     }
904 
905     MachineOperand *OrSDWADef = Res->first;
906     MachineOperand *OrOtherDef = Res->second;
907     assert(OrSDWADef && OrOtherDef);
908 
909     MachineInstr *SDWAInst = OrSDWADef->getParent();
910     MachineInstr *OtherInst = OrOtherDef->getParent();
911 
912     // Check that OtherInstr is actually bitwise compatible with SDWAInst = their
913     // destination patterns don't overlap. Compatible instruction can be either
914     // regular instruction with compatible bitness or SDWA instruction with
915     // correct dst_sel
916     // SDWAInst | OtherInst bitness / OtherInst dst_sel
917     // -----------------------------------------------------
918     // DWORD    | no                    / no
919     // WORD_0   | no                    / BYTE_2/3, WORD_1
920     // WORD_1   | 8/16-bit instructions / BYTE_0/1, WORD_0
921     // BYTE_0   | no                    / BYTE_1/2/3, WORD_1
922     // BYTE_1   | 8-bit                 / BYTE_0/2/3, WORD_1
923     // BYTE_2   | 8/16-bit              / BYTE_0/1/3. WORD_0
924     // BYTE_3   | 8/16/24-bit           / BYTE_0/1/2, WORD_0
925     // E.g. if SDWAInst is v_add_f16_sdwa dst_sel:WORD_1 then v_add_f16 is OK
926     // but v_add_f32 is not.
927 
928     // TODO: add support for non-SDWA instructions as OtherInst.
929     // For now this only works with SDWA instructions. For regular instructions
930     // there is no way to determine if the instruction writes only 8/16/24-bit
931     // out of full register size and all registers are at min 32-bit wide.
932     if (!TII->isSDWA(*OtherInst))
933       break;
934 
935     SdwaSel DstSel = static_cast<SdwaSel>(
936         TII->getNamedImmOperand(*SDWAInst, AMDGPU::OpName::dst_sel));
937     SdwaSel OtherDstSel = static_cast<SdwaSel>(
938       TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_sel));
939 
940     bool DstSelAgree = false;
941     switch (DstSel) {
942     case WORD_0: DstSelAgree = ((OtherDstSel == BYTE_2) ||
943                                 (OtherDstSel == BYTE_3) ||
944                                 (OtherDstSel == WORD_1));
945       break;
946     case WORD_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
947                                 (OtherDstSel == BYTE_1) ||
948                                 (OtherDstSel == WORD_0));
949       break;
950     case BYTE_0: DstSelAgree = ((OtherDstSel == BYTE_1) ||
951                                 (OtherDstSel == BYTE_2) ||
952                                 (OtherDstSel == BYTE_3) ||
953                                 (OtherDstSel == WORD_1));
954       break;
955     case BYTE_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
956                                 (OtherDstSel == BYTE_2) ||
957                                 (OtherDstSel == BYTE_3) ||
958                                 (OtherDstSel == WORD_1));
959       break;
960     case BYTE_2: DstSelAgree = ((OtherDstSel == BYTE_0) ||
961                                 (OtherDstSel == BYTE_1) ||
962                                 (OtherDstSel == BYTE_3) ||
963                                 (OtherDstSel == WORD_0));
964       break;
965     case BYTE_3: DstSelAgree = ((OtherDstSel == BYTE_0) ||
966                                 (OtherDstSel == BYTE_1) ||
967                                 (OtherDstSel == BYTE_2) ||
968                                 (OtherDstSel == WORD_0));
969       break;
970     default: DstSelAgree = false;
971     }
972 
973     if (!DstSelAgree)
974       break;
975 
976     // Also OtherInst dst_unused should be UNUSED_PAD
977     DstUnused OtherDstUnused = static_cast<DstUnused>(
978       TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_unused));
979     if (OtherDstUnused != DstUnused::UNUSED_PAD)
980       break;
981 
982     // Create DstPreserveOperand
983     MachineOperand *OrDst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
984     assert(OrDst && OrDst->isReg());
985 
986     return std::make_unique<SDWADstPreserveOperand>(
987       OrDst, OrSDWADef, OrOtherDef, DstSel);
988 
989   }
990   }
991 
992   return std::unique_ptr<SDWAOperand>(nullptr);
993 }
994 
995 #if !defined(NDEBUG)
996 static raw_ostream& operator<<(raw_ostream &OS, const SDWAOperand &Operand) {
997   Operand.print(OS);
998   return OS;
999 }
1000 #endif
1001 
1002 void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) {
1003   for (MachineInstr &MI : MBB) {
1004     if (auto Operand = matchSDWAOperand(MI)) {
1005       LLVM_DEBUG(dbgs() << "Match: " << MI << "To: " << *Operand << '\n');
1006       SDWAOperands[&MI] = std::move(Operand);
1007       ++NumSDWAPatternsFound;
1008     }
1009   }
1010 }
1011 
// Convert the V_ADD_CO_U32_e64 into V_ADD_CO_U32_e32. This allows
// isConvertibleToSDWA to perform its transformation on V_ADD_CO_U32_e32 into
// V_ADD_CO_U32_sdwa.
//
// We are transforming from a VOP3 into a VOP2 form of the instruction.
//   %19:vgpr_32 = V_AND_B32_e32 255,
//       killed %16:vgpr_32, implicit $exec
//   %47:vgpr_32, %49:sreg_64_xexec = V_ADD_CO_U32_e64
//       %26.sub0:vreg_64, %19:vgpr_32, implicit $exec
//  %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64
//       %26.sub1:vreg_64, %54:vgpr_32, killed %49:sreg_64_xexec, implicit $exec
//
// becomes
//   %47:vgpr_32 = V_ADD_CO_U32_sdwa
//       0, %26.sub0:vreg_64, 0, killed %16:vgpr_32, 0, 6, 0, 6, 0,
//       implicit-def $vcc, implicit $exec
//  %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64
//       %26.sub1:vreg_64, %54:vgpr_32, killed $vcc, implicit $exec
void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
                                           const GCNSubtarget &ST) const {
  int Opc = MI.getOpcode();
  assert((Opc == AMDGPU::V_ADD_CO_U32_e64 || Opc == AMDGPU::V_SUB_CO_U32_e64) &&
         "Currently only handles V_ADD_CO_U32_e64 or V_SUB_CO_U32_e64");

  // Can the candidate MI be shrunk?
  if (!TII->canShrink(MI, *MRI))
    return;
  Opc = AMDGPU::getVOPe32(Opc);
  // Find the related ADD instruction.
  const MachineOperand *Sdst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
  if (!Sdst)
    return;
  // The carry-out must have exactly one user: the matching ADDC/SUBB that
  // consumes it as its carry-in.
  MachineOperand *NextOp = findSingleRegUse(Sdst, MRI);
  if (!NextOp)
    return;
  MachineInstr &MISucc = *NextOp->getParent();

  // Make sure the carry in/out are subsequently unused.
  MachineOperand *CarryIn = TII->getNamedOperand(MISucc, AMDGPU::OpName::src2);
  if (!CarryIn)
    return;
  MachineOperand *CarryOut = TII->getNamedOperand(MISucc, AMDGPU::OpName::sdst);
  if (!CarryOut)
    return;
  if (!MRI->hasOneUse(CarryIn->getReg()) || !MRI->use_empty(CarryOut->getReg()))
    return;
  // Make sure VCC or its subregs are dead before MI.
  // The 25 is the instruction-search window passed to the liveness query;
  // if liveness cannot be determined within it, conservatively bail out.
  MachineBasicBlock &MBB = *MI.getParent();
  MachineBasicBlock::LivenessQueryResult Liveness =
      MBB.computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 25);
  if (Liveness != MachineBasicBlock::LQR_Dead)
    return;
  // Check if VCC is referenced in range of (MI,MISucc].
  // (The loop stops before MISucc itself; MISucc's carry-in use is
  // rewritten to VCC below.)
  for (auto I = std::next(MI.getIterator()), E = MISucc.getIterator();
       I != E; ++I) {
    if (I->modifiesRegister(AMDGPU::VCC, TRI))
      return;
  }

  // Replace MI with V_{SUB|ADD}_I32_e32
  BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(Opc))
    .add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst))
    .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0))
    .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1))
    .setMIFlags(MI.getFlags());

  MI.eraseFromParent();

  // Since the carry output of MI is now VCC, update its use in MISucc.

  MISucc.substituteRegister(CarryIn->getReg(), TRI->getVCC(), 0, *TRI);
}
1084 
/// Try to convert an \p MI in VOP3 which takes an src2 carry-in
/// operand into the corresponding VOP2 form which expects the
/// argument in VCC. To this end, add a copy from the carry-in to
/// VCC.  The conversion will only be applied if \p MI can be shrunk
/// to VOP2 and if VCC can be proven to be dead before \p MI.
void SIPeepholeSDWA::convertVcndmaskToVOP2(MachineInstr &MI,
                                           const GCNSubtarget &ST) const {
  assert(MI.getOpcode() == AMDGPU::V_CNDMASK_B32_e64);

  LLVM_DEBUG(dbgs() << "Attempting VOP2 conversion: " << MI);
  if (!TII->canShrink(MI, *MRI)) {
    LLVM_DEBUG(dbgs() << "Cannot shrink instruction\n");
    return;
  }

  const MachineOperand &CarryIn =
      *TII->getNamedOperand(MI, AMDGPU::OpName::src2);
  Register CarryReg = CarryIn.getReg();
  // The definition is only looked up to verify that one exists; the COPY
  // inserted below reads the carry-in register directly.
  MachineInstr *CarryDef = MRI->getVRegDef(CarryReg);
  if (!CarryDef) {
    LLVM_DEBUG(dbgs() << "Missing carry-in operand definition\n");
    return;
  }

  // Make sure VCC or its subregs are dead before MI.
  MCRegister Vcc = TRI->getVCC();
  MachineBasicBlock &MBB = *MI.getParent();
  MachineBasicBlock::LivenessQueryResult Liveness =
      MBB.computeRegisterLiveness(TRI, Vcc, MI);
  if (Liveness != MachineBasicBlock::LQR_Dead) {
    LLVM_DEBUG(dbgs() << "VCC not known to be dead before instruction\n");
    return;
  }

  // Materialize the carry-in value in VCC right before the converted
  // instruction.
  BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), Vcc).add(CarryIn);

  // Build the VOP2 (e32) form; fixImplicitOperands adds the implicit
  // operands the e32 encoding requires (presumably the VCC read).
  auto Converted = BuildMI(MBB, MI, MI.getDebugLoc(),
                           TII->get(AMDGPU::getVOPe32(MI.getOpcode())))
                       .add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst))
                       .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0))
                       .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1))
                       .setMIFlags(MI.getFlags());
  TII->fixImplicitOperands(*Converted);
  LLVM_DEBUG(dbgs() << "Converted to VOP2: " << *Converted);
  (void)Converted;
  MI.eraseFromParent();
}
1132 
namespace {
/// Return true if \p MI is already an SDWA instruction or has an
/// SDWA-encodable equivalent opcode that subtarget \p ST supports,
/// taking operand and modifier restrictions into account.
bool isConvertibleToSDWA(MachineInstr &MI,
                         const GCNSubtarget &ST,
                         const SIInstrInfo* TII) {
  // Check if this is already an SDWA instruction
  unsigned Opc = MI.getOpcode();
  if (TII->isSDWA(Opc))
    return true;

  // Can only be handled after earlier conversion to
  // AMDGPU::V_CNDMASK_B32_e32 which is not always possible.
  if (Opc == AMDGPU::V_CNDMASK_B32_e64)
    return false;

  // Check if this instruction has opcode that supports SDWA
  // (try the opcode directly first, then its VOP2/e32 form).
  if (AMDGPU::getSDWAOp(Opc) == -1)
    Opc = AMDGPU::getVOPe32(Opc);

  if (AMDGPU::getSDWAOp(Opc) == -1)
    return false;

  // Output modifiers in SDWA form require subtarget support.
  if (!ST.hasSDWAOmod() && TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
    return false;

  if (TII->isVOPC(Opc)) {
    if (!ST.hasSDWASdst()) {
      // Without SDWA scalar-dst support, a VOPC may only write VCC.
      const MachineOperand *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
      if (SDst && (SDst->getReg() != AMDGPU::VCC &&
                   SDst->getReg() != AMDGPU::VCC_LO))
        return false;
    }

    // clamp/omod on VOPC SDWA are gated by a separate subtarget feature.
    if (!ST.hasSDWAOutModsVOPC() &&
        (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
         TII->hasModifiersSet(MI, AMDGPU::OpName::omod)))
      return false;

  } else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst) ||
             !TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
    // Non-VOPC candidates must have a vector dst and no scalar dst.
    return false;
  }

  // MAC/FMAC forms are only SDWA-encodable on subtargets with SDWA MAC.
  if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_FMAC_F16_e32 ||
                           Opc == AMDGPU::V_FMAC_F32_e32 ||
                           Opc == AMDGPU::V_MAC_F16_e32 ||
                           Opc == AMDGPU::V_MAC_F32_e32))
    return false;

  // Check if target supports this SDWA opcode
  if (TII->pseudoToMCOpcode(Opc) == -1)
    return false;

  // Only register or immediate sources can be represented.
  if (MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0)) {
    if (!Src0->isReg() && !Src0->isImm())
      return false;
  }

  if (MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1)) {
    if (!Src1->isReg() && !Src1->isImm())
      return false;
  }

  return true;
}
} // namespace
1198 
/// Build the SDWA version of the non-SDWA instruction \p MI and insert
/// it directly before \p MI. All operands of \p MI are copied over;
/// missing modifier operands are initialized to 0 and the SDWA-specific
/// select operands to neutral defaults (DWORD selects, UNUSED_PAD).
/// \p MI itself is not erased here; the caller is responsible for that.
MachineInstr *SIPeepholeSDWA::createSDWAVersion(MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  assert(!TII->isSDWA(Opcode));

  // Look up the SDWA opcode for MI, falling back to the opcode of its
  // VOP2/e32 form if the original opcode has no direct SDWA equivalent.
  int SDWAOpcode = AMDGPU::getSDWAOp(Opcode);
  if (SDWAOpcode == -1)
    SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(Opcode));
  assert(SDWAOpcode != -1);

  const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode);

  // Create SDWA version of instruction MI and initialize its operands
  MachineInstrBuilder SDWAInst =
    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc)
    .setMIFlags(MI.getFlags());

  // Copy dst, if it is present in original then should also be present in SDWA
  MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  if (Dst) {
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::vdst));
    SDWAInst.add(*Dst);
  } else if ((Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst))) {
    assert(Dst && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::sdst));
    SDWAInst.add(*Dst);
  } else {
    // No explicit vdst/sdst on the original: define VCC on the SDWA form.
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::sdst));
    SDWAInst.addReg(TRI->getVCC(), RegState::Define);
  }

  // Copy src0, initialize src0_modifiers. All sdwa instructions has src0 and
  // src0_modifiers (except for v_nop_sdwa, but it can't get here)
  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  assert(Src0 && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0) &&
         AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0_modifiers));
  if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers))
    SDWAInst.addImm(Mod->getImm());
  else
    SDWAInst.addImm(0);
  SDWAInst.add(*Src0);

  // Copy src1 if present, initialize src1_modifiers.
  MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  if (Src1) {
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1) &&
           AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1_modifiers));
    if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers))
      SDWAInst.addImm(Mod->getImm());
    else
      SDWAInst.addImm(0);
    SDWAInst.add(*Src1);
  }

  if (SDWAOpcode == AMDGPU::V_FMAC_F16_sdwa ||
      SDWAOpcode == AMDGPU::V_FMAC_F32_sdwa ||
      SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||
      SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) {
    // v_mac_f16/32 has additional src2 operand tied to vdst
    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    assert(Src2);
    SDWAInst.add(*Src2);
  }

  // Copy clamp if present, initialize otherwise
  assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::clamp));
  MachineOperand *Clamp = TII->getNamedOperand(MI, AMDGPU::OpName::clamp);
  if (Clamp) {
    SDWAInst.add(*Clamp);
  } else {
    SDWAInst.addImm(0);
  }

  // Copy omod if present, initialize otherwise if needed
  if (AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::omod)) {
    MachineOperand *OMod = TII->getNamedOperand(MI, AMDGPU::OpName::omod);
    if (OMod) {
      SDWAInst.add(*OMod);
    } else {
      SDWAInst.addImm(0);
    }
  }

  // Initialize SDWA specific operands
  if (AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_sel))
    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);

  if (AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_unused))
    SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD);

  assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0_sel));
  SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);

  if (Src1) {
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1_sel));
    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
  }

  // Check for a preserved register that needs to be copied.
  MachineInstr *Ret = SDWAInst.getInstr();
  TII->fixImplicitOperands(*Ret);
  return Ret;
}
1300 
/// Fold the matched \p SDWAOperands into an SDWA version of \p MI.
/// Returns true and erases \p MI if at least one operand pattern was
/// applied; otherwise the tentative SDWA instruction is deleted again
/// and \p MI is left in place unchanged.
bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
                                   const SDWAOperandsVector &SDWAOperands) {
  LLVM_DEBUG(dbgs() << "Convert instruction:" << MI);

  MachineInstr *SDWAInst;
  if (TII->isSDWA(MI.getOpcode())) {
    // Clone the instruction to allow revoking changes
    // made to MI during the processing of the operands
    // if the conversion fails.
    SDWAInst = MI.getParent()->getParent()->CloneMachineInstr(&MI);
    MI.getParent()->insert(MI.getIterator(), SDWAInst);
  } else {
    SDWAInst = createSDWAVersion(MI);
  }

  // Apply all sdwa operand patterns.
  bool Converted = false;
  for (auto &Operand : SDWAOperands) {
    LLVM_DEBUG(dbgs() << *SDWAInst << "\nOperand: " << *Operand);
    // There should be no intersection between SDWA operands and potential MIs
    // e.g.:
    // v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0
    // v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0
    // v_add_u32 v3, v4, v2
    //
    // In that example it is possible that we would fold 2nd instruction into
    // 3rd (v_add_u32_sdwa) and then try to fold 1st instruction into 2nd (that
    // was already destroyed). So if SDWAOperand is also a potential MI then do
    // not apply it.
    if (PotentialMatches.count(Operand->getParentInst()) == 0)
      Converted |= Operand->convertToSDWA(*SDWAInst, TII);
  }

  if (!Converted) {
    // Nothing was folded: drop the tentative instruction, keep MI as-is.
    SDWAInst->eraseFromParent();
    return false;
  }

  // Record the new instruction for later scalar-operand legalization.
  ConvertedInstructions.push_back(SDWAInst);
  // Kill flags on the used registers may no longer be accurate after
  // folding; conservatively clear them.
  for (MachineOperand &MO : SDWAInst->uses()) {
    if (!MO.isReg())
      continue;

    MRI->clearKillFlags(MO.getReg());
  }
  LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n');
  ++NumSDWAInstructionsPeepholed;

  MI.eraseFromParent();
  return true;
}
1352 
// If an instruction was converted to SDWA it should not have immediates or SGPR
// operands (allowed one SGPR on GFX9). Copy its scalar operands into VGPRs.
void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI,
                                            const GCNSubtarget &ST) const {
  const MCInstrDesc &Desc = TII->get(MI.getOpcode());
  unsigned ConstantBusCount = 0;
  for (MachineOperand &Op : MI.explicit_uses()) {
    // Only immediates and non-VGPR registers need legalization.
    if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg())))
      continue;

    // Skip operands whose register class cannot take a VGPR anyway.
    unsigned I = Op.getOperandNo();
    if (Desc.operands()[I].RegClass == -1 ||
        !TRI->isVSSuperClass(TRI->getRegClass(Desc.operands()[I].RegClass)))
      continue;

    // Subtargets with SDWA scalar support (GFX9) allow a single SGPR on
    // the constant bus; keep the first one in place.
    if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() &&
        TRI->isSGPRReg(*MRI, Op.getReg())) {
      ++ConstantBusCount;
      continue;
    }

    // Materialize the immediate/scalar in a fresh VGPR right before MI
    // and rewrite the operand to use it.
    Register VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
                        TII->get(AMDGPU::V_MOV_B32_e32), VGPR);
    if (Op.isImm())
      Copy.addImm(Op.getImm());
    else if (Op.isReg())
      Copy.addReg(Op.getReg(), Op.isKill() ? RegState::Kill : 0,
                  Op.getSubReg());
    Op.ChangeToRegister(VGPR, false);
  }
}
1385 
1386 bool SIPeepholeSDWALegacy::runOnMachineFunction(MachineFunction &MF) {
1387   if (skipFunction(MF.getFunction()))
1388     return false;
1389 
1390   return SIPeepholeSDWA().run(MF);
1391 }
1392 
/// Pass driver: for each basic block, repeatedly (1) preprocess VOP3
/// carry/cndmask instructions into VOP2 form, (2) match SDWA operand
/// patterns and convert the matched instructions to SDWA, and (3)
/// legalize scalar operands of the converted instructions, iterating
/// until a fixed point is reached. Returns true if anything changed.
bool SIPeepholeSDWA::run(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  if (!ST.hasSDWA())
    return false;

  MRI = &MF.getRegInfo();
  TRI = ST.getRegisterInfo();
  TII = ST.getInstrInfo();

  // Find all SDWA operands in MF.
  bool Ret = false;
  for (MachineBasicBlock &MBB : MF) {
    bool Changed = false;
    do {
      // Preprocess the ADD/SUB pairs so they could be SDWA'ed.
      // Look for a possible ADD or SUB that resulted from a previously lowered
      // V_{ADD|SUB}_U64_PSEUDO. The function pseudoOpConvertToVOP2
      // lowers the pair of instructions into e32 form.
      matchSDWAOperands(MBB);
      for (const auto &OperandPair : SDWAOperands) {
        const auto &Operand = OperandPair.second;
        MachineInstr *PotentialMI = Operand->potentialToConvert(TII, ST);
        if (!PotentialMI)
          continue;

        switch (PotentialMI->getOpcode()) {
        case AMDGPU::V_ADD_CO_U32_e64:
        case AMDGPU::V_SUB_CO_U32_e64:
          pseudoOpConvertToVOP2(*PotentialMI, ST);
          break;
        case AMDGPU::V_CNDMASK_B32_e64:
          convertVcndmaskToVOP2(*PotentialMI, ST);
          break;
        };
      }
      // The preprocessing above may have rewritten instructions, so the
      // matches collected so far are stale; rebuild them from scratch.
      SDWAOperands.clear();

      // Generate potential match list.
      matchSDWAOperands(MBB);

      for (const auto &OperandPair : SDWAOperands) {
        const auto &Operand = OperandPair.second;
        MachineInstr *PotentialMI =
            Operand->potentialToConvert(TII, ST, &PotentialMatches);

        if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST, TII))
          PotentialMatches[PotentialMI].push_back(Operand.get());
      }

      for (auto &PotentialPair : PotentialMatches) {
        MachineInstr &PotentialMI = *PotentialPair.first;
        convertToSDWA(PotentialMI, PotentialPair.second);
      }

      PotentialMatches.clear();
      SDWAOperands.clear();

      // convertToSDWA records successes in ConvertedInstructions, so a
      // non-empty list means this iteration made progress.
      Changed = !ConvertedInstructions.empty();

      if (Changed)
        Ret = true;
      // Converted instructions may still carry immediates/SGPRs that the
      // SDWA encoding does not allow; move those into VGPRs.
      while (!ConvertedInstructions.empty())
        legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);
    } while (Changed);
  }

  return Ret;
}
1462 
1463 PreservedAnalyses SIPeepholeSDWAPass::run(MachineFunction &MF,
1464                                           MachineFunctionAnalysisManager &) {
1465   if (MF.getFunction().hasOptNone() || !SIPeepholeSDWA().run(MF))
1466     return PreservedAnalyses::all();
1467 
1468   PreservedAnalyses PA = getMachineFunctionPassPreservedAnalyses();
1469   PA.preserveSet<CFGAnalyses>();
1470   return PA;
1471 }
1472