//===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the RegisterBankInfo class for
/// AMDGPU.
///
/// \par
///
/// AMDGPU has unique register bank constraints that require special high level
/// strategies to deal with. There are two main true physical register banks
/// VGPR (vector), and SGPR (scalar). Additionally the VCC register bank is a
/// sort of pseudo-register bank needed to represent SGPRs used in a vector
/// boolean context. There is also the AGPR bank, which is a special purpose
/// physical register bank present on some subtargets.
///
/// Copying from VGPR to SGPR is generally illegal, unless the value is known to
/// be uniform. It is generally not valid to legalize operands by inserting
/// copies as on other targets. Operations which require uniform, SGPR operands
/// generally require scalarization by repeatedly executing the instruction,
/// activating each set of lanes using a unique set of input values. This is
/// referred to as a waterfall loop.
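///
/// A rough sketch of such a loop (illustrative pseudocode, not actual compiler
/// output; see executeInWaterfallLoop below for the real expansion):
///
///   SavedExec = EXEC
///   loop:
///     Uni   = readfirstlane(Val)  ; scalar value from the first active lane
///     Mask  = icmp eq(Uni, Val)   ; all lanes holding that same value
///     EXEC &= Mask                ; execute only those lanes
///     <use Uni as the uniform SGPR operand>
///     <retire the handled lanes; repeat while any lane remains>
///   EXEC = SavedExec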
27 ///
/// \par Booleans
///
/// Booleans (s1 values) require special consideration. A vector compare result
/// is naturally a bitmask with one bit per lane, in a 32 or 64-bit
/// register. These are represented with the VCC bank. During selection, we need
/// to be able to unambiguously go back from a register class to a register
/// bank. To distinguish whether an SGPR should use the SGPR or VCC register
/// bank, we need to know the use context type. An SGPR s1 value always means a
/// VCC bank value, otherwise it will be the SGPR bank. A scalar compare sets
/// SCC, which is a 1-bit unaddressable register. This will need to be copied to
/// a 32-bit virtual register. Taken together, this means we need to adjust the
/// type of boolean operations to be regbank legal. All SALU booleans need to be
/// widened to 32-bits, and all VALU booleans need to be s1 values.
///
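/// For instance, a regbank-legal AND of two booleans takes one of these two
/// shapes (a sketch in generic-MIR-like notation; register names are
/// illustrative):
///
///   %r:sgpr(s32) = G_AND %a:sgpr(s32), %b:sgpr(s32)  ; SALU: widened to s32
///   %r:vcc(s1)   = G_AND %a:vcc(s1), %b:vcc(s1)      ; VALU: lane mask as s1
///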
/// A noteworthy exception to the s1-means-vcc rule is for legalization artifact
/// casts. G_TRUNC s1 results, and G_SEXT/G_ZEXT/G_ANYEXT sources are never vcc
/// bank. A non-boolean source (such as a truncate from a 1-bit load from
/// memory) will require a copy to the VCC bank which will require clearing the
/// high bits and inserting a compare.
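///
/// A sketch of that non-boolean case (illustrative, in generic-MIR-like
/// notation):
///
///   %w:vgpr(s32) = G_LOAD ...          ; boolean lives in bit 0
///   %t:vgpr(s1)  = G_TRUNC %w          ; artifact; never vcc bank
///   %m:vgpr(s32) = G_AND %w, 1         ; clear the high bits
///   %b:vcc(s1)   = G_ICMP ne %m, 0     ; insert a compare to form the mask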
///
/// \par Constant bus restriction
///
/// VALU instructions have a limitation known as the constant bus
/// restriction. Most VALU instructions can use SGPR operands, but may read at
/// most 1 SGPR or constant literal value (this is relaxed to 2 in gfx10 for
/// most instructions). This is one unique SGPR, so the same SGPR may be used
/// for multiple operands. From a register bank perspective, any combination of
/// operands should be legal as an SGPR, but this is contextually dependent on
/// the SGPR operands all being the same register. It is therefore optimal to
/// choose the SGPR with the most uses to minimize the number of copies.
///
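/// For example (illustrative assembly, assuming a pre-gfx10 single constant
/// bus slot):
///
///   v_fma_f32 v0, s0, s0, v1   ; OK: one unique SGPR, read twice
///   v_fma_f32 v0, s0, s1, v1   ; invalid: two unique SGPRs on the bus
///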
/// We avoid trying to solve this problem in RegBankSelect. Any VALU G_*
/// operation should have its source operands all mapped to VGPRs (except for
/// VCC), inserting copies from any SGPR operands. This is the most trivial
/// legal mapping. Anything beyond the simplest 1:1 instruction selection would
/// be too complicated to solve here. Every optimization pattern or instruction
/// selected to multiple outputs would have to enforce this rule, and there
/// would be additional complexity in tracking this rule for every G_*
/// operation. By forcing all inputs to VGPRs, it also simplifies the task of
/// picking the optimal operand combination from a post-isel optimization pass.
///
//===----------------------------------------------------------------------===//

#include "AMDGPURegisterBankInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/RegisterBank.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define GET_TARGET_REGBANK_IMPL
#include "AMDGPUGenRegisterBank.inc"

// This file will be TableGen'ed at some point.
#include "AMDGPUGenRegisterBankInfo.def"

using namespace llvm;
using namespace MIPatternMatch;


namespace {

// Observer to apply a register bank to new registers created by
// LegalizerHelper.
class ApplyRegBankMapping final : public GISelChangeObserver {
private:
  MachineIRBuilder &B;
  const AMDGPURegisterBankInfo &RBI;
  MachineRegisterInfo &MRI;
  const RegisterBank *NewBank;
  SmallVector<MachineInstr *, 4> NewInsts;

public:
  ApplyRegBankMapping(MachineIRBuilder &B, const AMDGPURegisterBankInfo &RBI_,
                      MachineRegisterInfo &MRI_, const RegisterBank *RB)
      : B(B), RBI(RBI_), MRI(MRI_), NewBank(RB) {
    assert(!B.isObservingChanges());
    B.setChangeObserver(*this);
  }

  ~ApplyRegBankMapping() override {
    for (MachineInstr *MI : NewInsts)
      applyBank(*MI);

    B.stopObservingChanges();
  }

  /// Set any registers that don't have a set register class or bank to SALU.
  void applyBank(MachineInstr &MI) {
    const unsigned Opc = MI.getOpcode();
    if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT ||
        Opc == AMDGPU::G_SEXT) {
      // LegalizerHelper wants to use the basic legalization artifacts when
      // widening etc. We don't handle selection with vcc in artifact sources,
      // so we need to use a select instead to handle these properly.
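      // E.g. (a sketch; register names are illustrative):
      //   %d:vgpr(s32) = G_ZEXT %s:vcc(s1)
      // becomes:
      //   %t:vgpr(s32) = G_CONSTANT i32 1
      //   %f:vgpr(s32) = G_CONSTANT i32 0
      //   %d:vgpr(s32) = G_SELECT %s:vcc(s1), %t, %f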
      Register DstReg = MI.getOperand(0).getReg();
      Register SrcReg = MI.getOperand(1).getReg();
      const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI);
      if (SrcBank == &AMDGPU::VCCRegBank) {
        const LLT S32 = LLT::scalar(32);
        assert(MRI.getType(SrcReg) == LLT::scalar(1));
        assert(MRI.getType(DstReg) == S32);
        assert(NewBank == &AMDGPU::VGPRRegBank);

        // Replace the extension with a select, which really uses the boolean
        // source.
        B.setInsertPt(*MI.getParent(), MI);

        auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? -1 : 1);
        auto False = B.buildConstant(S32, 0);
        B.buildSelect(DstReg, SrcReg, True, False);
        MRI.setRegBank(True.getReg(0), *NewBank);
        MRI.setRegBank(False.getReg(0), *NewBank);
        MI.eraseFromParent();
      }

      assert(!MRI.getRegClassOrRegBank(DstReg));
      MRI.setRegBank(DstReg, *NewBank);
      return;
    }

#ifndef NDEBUG
    if (Opc == AMDGPU::G_TRUNC) {
      Register DstReg = MI.getOperand(0).getReg();
      const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI);
      assert(DstBank != &AMDGPU::VCCRegBank);
    }
#endif

    for (MachineOperand &Op : MI.operands()) {
      if (!Op.isReg())
        continue;

      // We may see physical registers if building a real MI.
      Register Reg = Op.getReg();
      if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg))
        continue;

      const RegisterBank *RB = NewBank;
      if (MRI.getType(Reg) == LLT::scalar(1)) {
        assert(NewBank == &AMDGPU::VGPRRegBank &&
               "s1 operands should only be used for vector bools");
        assert((MI.getOpcode() != AMDGPU::G_TRUNC &&
                MI.getOpcode() != AMDGPU::G_ANYEXT) &&
               "not expecting legalization artifacts here");
        RB = &AMDGPU::VCCRegBank;
      }

      MRI.setRegBank(Reg, *RB);
    }
  }

  void erasingInstr(MachineInstr &MI) override {}

  void createdInstr(MachineInstr &MI) override {
    // At this point, the instruction was just inserted and has no operands.
    NewInsts.push_back(&MI);
  }

  void changingInstr(MachineInstr &MI) override {}
  void changedInstr(MachineInstr &MI) override {
    // FIXME: In principle we should probably add the instruction to NewInsts,
    // but the way the LegalizerHelper uses the observer, we will always see the
    // registers we need to set the regbank on also referenced in a new
    // instruction.
  }
};

} // anonymous namespace

AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
    : Subtarget(ST), TRI(Subtarget.getRegisterInfo()),
      TII(Subtarget.getInstrInfo()) {

  // HACK: Until this is fully tablegen'd.
  static llvm::once_flag InitializeRegisterBankFlag;

  static auto InitializeRegisterBankOnce = [this]() {
    assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank &&
           &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank &&
           &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank);
    (void)this;
  };

  llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce);
}

static bool isVectorRegisterBank(const RegisterBank &Bank) {
  unsigned BankID = Bank.getID();
  return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID;
}

bool AMDGPURegisterBankInfo::isDivergentRegBank(const RegisterBank *RB) const {
  return RB != &AMDGPU::SGPRRegBank;
}

unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
                                          const RegisterBank &Src,
                                          TypeSize Size) const {
  // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
  if (Dst.getID() == AMDGPU::SGPRRegBankID &&
      (isVectorRegisterBank(Src) || Src.getID() == AMDGPU::VCCRegBankID)) {
    return std::numeric_limits<unsigned>::max();
  }

  // Bool values are tricky, because the meaning is based on context. The SCC
  // and VCC banks are for the natural scalar and vector conditions produced by
  // a compare.
  //
  // Legalization doesn't know about the necessary context, so an s1 use may
  // have been a truncate from an arbitrary value, in which case a copy (lowered
  // as a compare with 0) needs to be inserted.
  if (Size == 1 &&
      (Dst.getID() == AMDGPU::SGPRRegBankID) &&
      (isVectorRegisterBank(Src) ||
       Src.getID() == AMDGPU::SGPRRegBankID ||
       Src.getID() == AMDGPU::VCCRegBankID))
    return std::numeric_limits<unsigned>::max();

  // There is no direct copy between AGPRs.
  if (Dst.getID() == AMDGPU::AGPRRegBankID &&
      Src.getID() == AMDGPU::AGPRRegBankID)
    return 4;

  return RegisterBankInfo::copyCost(Dst, Src, Size);
}

unsigned AMDGPURegisterBankInfo::getBreakDownCost(
  const ValueMapping &ValMapping,
  const RegisterBank *CurBank) const {
  // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
  // VGPR.
  // FIXME: Is there a better way to do this?
  if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
    return 10; // This is expensive.

  assert(ValMapping.NumBreakDowns == 2 &&
         ValMapping.BreakDown[0].Length == 32 &&
         ValMapping.BreakDown[0].StartIdx == 0 &&
         ValMapping.BreakDown[1].Length == 32 &&
         ValMapping.BreakDown[1].StartIdx == 32 &&
         ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);

  // 32-bit extract of a 64-bit value is just access of a subregister, so free.
  // TODO: Cost of 0 hits assert, though it's not clear it's what we really
  // want.

  // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
  // alignment restrictions, but this probably isn't important.
  return 1;
}

const RegisterBank &
AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
                                               LLT Ty) const {
  if (&RC == &AMDGPU::SReg_1RegClass)
    return AMDGPU::VCCRegBank;

  // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a
  // VCC-like use.
  if (TRI->isSGPRClass(&RC)) {
    // FIXME: This probably came from a copy from a physical register, which
    // should be inferable from the copied to-type. We don't have many boolean
    // physical register constraints so just assume a normal SGPR for now.
    if (!Ty.isValid())
      return AMDGPU::SGPRRegBank;

    return Ty == LLT::scalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
  }

  return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank;
}

template <unsigned NumOps>
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::addMappingFromTable(
    const MachineInstr &MI, const MachineRegisterInfo &MRI,
    const std::array<unsigned, NumOps> RegSrcOpIdx,
    ArrayRef<OpRegBankEntry<NumOps>> Table) const {

  InstructionMappings AltMappings;

  SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());

  unsigned Sizes[NumOps];
  for (unsigned I = 0; I < NumOps; ++I) {
    Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
    Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
  }

  for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
    unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
    Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
  }

  // getInstrMapping's default mapping uses ID 1, so start at 2.
  unsigned MappingID = 2;
  for (const auto &Entry : Table) {
    for (unsigned I = 0; I < NumOps; ++I) {
      int OpIdx = RegSrcOpIdx[I];
      Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
    }

    AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
                                                 getOperandsMapping(Operands),
                                                 Operands.size()));
  }

  return AltMappings;
}

RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
    const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
  switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
  case Intrinsic::amdgcn_readlane: {
    static const OpRegBankEntry<3> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      // Need a readfirstlane for the index.
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
    };

    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, Table);
  }
  case Intrinsic::amdgcn_writelane: {
    static const OpRegBankEntry<4> Table[4] = {
      // Perfectly legal.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Need readfirstlane of first op
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },

      // Need readfirstlane of second op
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },

      // Need readfirstlane of both ops
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
    };

    // dst, value, lane select, vdst_in
    const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
    return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, Table);
  }
  default:
    return RegisterBankInfo::getInstrAlternativeMappings(MI);
  }
}

RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
    const MachineInstr &MI, const MachineRegisterInfo &MRI) const {

  switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
  case Intrinsic::amdgcn_s_buffer_load: {
    static const OpRegBankEntry<2> Table[4] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      // Only need 1 register in loop
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },

      // Have to waterfall the resource.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },

      // Have to waterfall the resource, and the offset.
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }
    };

    // rsrc, offset
    const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
    return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, Table);
  }
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {
    // VGPR = M0, VGPR
    static const OpRegBankEntry<3> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Need a readfirstlane for m0
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
    };

    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, Table);
  }
  case Intrinsic::amdgcn_s_sendmsg:
  case Intrinsic::amdgcn_s_sendmsghalt: {
    // FIXME: Should have no register for immediate
    static const OpRegBankEntry<1> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID }, 1 },

      // Need readlane
      { { AMDGPU::VGPRRegBankID }, 3 }
    };

    const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
    return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, Table);
  }
  default:
    return RegisterBankInfo::getInstrAlternativeMappings(MI);
  }
}

// FIXME: Returns uniform if there's no source value information. This is
// probably wrong.
bool AMDGPURegisterBankInfo::isScalarLoadLegal(const MachineInstr &MI) const {
  if (!MI.hasOneMemOperand())
    return false;

  const MachineMemOperand *MMO = *MI.memoperands_begin();
  const unsigned AS = MMO->getAddrSpace();
  const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||
                       AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
  const unsigned MemSize = 8 * MMO->getSize().getValue();

  // Require 4-byte alignment.
  return (MMO->getAlign() >= Align(4) ||
          (Subtarget.hasScalarSubwordLoads() &&
           ((MemSize == 16 && MMO->getAlign() >= Align(2)) ||
            (MemSize == 8 && MMO->getAlign() >= Align(1))))) &&
         // Can't do a scalar atomic load.
         !MMO->isAtomic() &&
         // Don't use scalar loads for volatile accesses to non-constant address
         // spaces.
         (IsConst || !MMO->isVolatile()) &&
         // Memory must be known constant, or not written before this load.
         (IsConst || MMO->isInvariant() || (MMO->getFlags() & MONoClobber)) &&
         AMDGPU::isUniformMMO(MMO);
}

RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappings(
    const MachineInstr &MI) const {

  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  InstructionMappings AltMappings;
  switch (MI.getOpcode()) {
  case TargetOpcode::G_CONSTANT:
  case TargetOpcode::G_IMPLICIT_DEF: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    if (Size == 1) {
      static const OpRegBankEntry<1> Table[3] = {
        { { AMDGPU::VGPRRegBankID }, 1 },
        { { AMDGPU::SGPRRegBankID }, 1 },
        { { AMDGPU::VCCRegBankID }, 1 }
      };

      return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
    }

    [[fallthrough]];
  }
  case TargetOpcode::G_FCONSTANT:
  case TargetOpcode::G_FRAME_INDEX:
  case TargetOpcode::G_GLOBAL_VALUE: {
    static const OpRegBankEntry<1> Table[2] = {
      { { AMDGPU::VGPRRegBankID }, 1 },
      { { AMDGPU::SGPRRegBankID }, 1 }
    };

    return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
  }
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);

    if (Size == 1) {
      // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
      const InstructionMapping &SCCMapping = getInstructionMapping(
        1, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32)}),
        3); // Num Operands
      AltMappings.push_back(&SCCMapping);

      const InstructionMapping &VCCMapping0 = getInstructionMapping(
        2, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
        3); // Num Operands
      AltMappings.push_back(&VCCMapping0);
      return AltMappings;
    }

    if (Size != 64)
      break;

    const InstructionMapping &SSMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(
      2, 2, getOperandsMapping(
        {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&VVMapping);
    break;
  }
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_ZEXTLOAD:
  case TargetOpcode::G_SEXTLOAD: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
    unsigned PtrSize = PtrTy.getSizeInBits();
    unsigned AS = PtrTy.getAddressSpace();

    if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
         AS != AMDGPUAS::PRIVATE_ADDRESS) &&
        isScalarLoadLegal(MI)) {
      const InstructionMapping &SSMapping = getInstructionMapping(
          1, 1, getOperandsMapping(
                    {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                     AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}),
          2); // Num Operands
      AltMappings.push_back(&SSMapping);
    }

    const InstructionMapping &VVMapping = getInstructionMapping(
        2, 1,
        getOperandsMapping(
            {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
             AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),
        2); // Num Operands
    AltMappings.push_back(&VVMapping);

    // It may be possible to have a vgpr = load sgpr mapping here, because
    // the mubuf instructions support this kind of load, but probably for only
    // gfx7 and older.  However, the addressing mode matching in the instruction
    // selector should be able to do a better job of detecting and selecting
    // these kinds of loads from the vgpr = load vgpr mapping.

    return AltMappings;
  }
  case TargetOpcode::G_SELECT: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&VVMapping);

    return AltMappings;
  }
  case TargetOpcode::G_UADDE:
  case TargetOpcode::G_USUBE:
  case TargetOpcode::G_SADDE:
  case TargetOpcode::G_SSUBE: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1)}),
      5); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
      5); // Num Operands
    AltMappings.push_back(&VVMapping);
    return AltMappings;
  }
  case AMDGPU::G_BRCOND: {
    assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);

    // TODO: Change type to 32 for scalar
    const InstructionMapping &SMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), nullptr}),
      2); // Num Operands
    AltMappings.push_back(&SMapping);

    const InstructionMapping &VMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr}),
      2); // Num Operands
    AltMappings.push_back(&VMapping);
    return AltMappings;
  }
  case AMDGPU::G_INTRINSIC:
  case AMDGPU::G_INTRINSIC_CONVERGENT:
    return getInstrAlternativeMappingsIntrinsic(MI, MRI);
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
  case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
    return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
  default:
    break;
  }
  return RegisterBankInfo::getInstrAlternativeMappings(MI);
}

void AMDGPURegisterBankInfo::split64BitValueForMapping(
  MachineIRBuilder &B,
  SmallVector<Register, 2> &Regs,
  LLT HalfTy,
  Register Reg) const {
  assert(HalfTy.getSizeInBits() == 32);
  MachineRegisterInfo *MRI = B.getMRI();
  Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
  Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
  const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
  MRI->setRegBank(LoLHS, *Bank);
  MRI->setRegBank(HiLHS, *Bank);

  Regs.push_back(LoLHS);
  Regs.push_back(HiLHS);

  B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
    .addDef(LoLHS)
    .addDef(HiLHS)
    .addUse(Reg);
}

/// Replace the current type each register in \p Regs has with \p NewTy
static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
                          LLT NewTy) {
  for (Register Reg : Regs) {
    assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
    MRI.setType(Reg, NewTy);
  }
}

static LLT getHalfSizedType(LLT Ty) {
  if (Ty.isVector()) {
    assert(Ty.getElementCount().isKnownMultipleOf(2));
    return LLT::scalarOrVector(Ty.getElementCount().divideCoefficientBy(2),
                               Ty.getElementType());
  }

  assert(Ty.getScalarSizeInBits() % 2 == 0);
  return LLT::scalar(Ty.getScalarSizeInBits() / 2);
}

// Build one or more V_READFIRSTLANE_B32 instructions to move the given vector
// source value into a scalar register.
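// For example (a sketch): an s64 VGPR source is unmerged into two s32 pieces,
// each piece is read with V_READFIRSTLANE_B32 into an SReg_32, and the pieces
// are re-merged into an s64 SGPR result.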
Register AMDGPURegisterBankInfo::buildReadFirstLane(MachineIRBuilder &B,
                                                    MachineRegisterInfo &MRI,
                                                    Register Src) const {
  LLT Ty = MRI.getType(Src);
  const RegisterBank *Bank = getRegBank(Src, MRI, *TRI);

  if (Bank == &AMDGPU::SGPRRegBank)
    return Src;

  unsigned Bits = Ty.getSizeInBits();
  assert(Bits % 32 == 0);

  if (Bank != &AMDGPU::VGPRRegBank) {
    // We need to copy from AGPR to VGPR
    Src = B.buildCopy(Ty, Src).getReg(0);
    MRI.setRegBank(Src, AMDGPU::VGPRRegBank);
  }

  LLT S32 = LLT::scalar(32);
  unsigned NumParts = Bits / 32;
  SmallVector<Register, 8> SrcParts;
  SmallVector<Register, 8> DstParts;

  if (Bits == 32) {
    SrcParts.push_back(Src);
  } else {
    auto Unmerge = B.buildUnmerge(S32, Src);
    for (unsigned i = 0; i < NumParts; ++i)
      SrcParts.push_back(Unmerge.getReg(i));
  }

  for (unsigned i = 0; i < NumParts; ++i) {
    Register SrcPart = SrcParts[i];
    Register DstPart = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
    MRI.setType(DstPart, NumParts == 1 ? Ty : S32);

    const TargetRegisterClass *Constrained =
        constrainGenericRegister(SrcPart, AMDGPU::VGPR_32RegClass, MRI);
    (void)Constrained;
    assert(Constrained && "Failed to constrain readfirstlane src reg");

    B.buildInstr(AMDGPU::V_READFIRSTLANE_B32, {DstPart}, {SrcPart});

    DstParts.push_back(DstPart);
  }

  if (Bits == 32)
    return DstParts[0];

  Register Dst = B.buildMergeLikeInstr(Ty, DstParts).getReg(0);
  MRI.setRegBank(Dst, AMDGPU::SGPRRegBank);
  return Dst;
}

/// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
/// any of the required SGPR operands are VGPRs, perform a waterfall loop to
/// execute the instruction for each unique combination of values in all lanes
/// in the wave. The block will be split such that the rest of the instructions
/// are moved to a new block.
///
/// Essentially performs this loop:
///
/// Save Execution Mask
/// For (Lane : Wavefront) {
///   Enable Lane, Disable all other lanes
///   SGPR = read SGPR value for current lane from VGPR
///   VGPRResult[Lane] = use_op SGPR
/// }
/// Restore Execution Mask
///
/// There is additional complexity in comparing values across lanes so that
/// each unique value is processed only once.
bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
    MachineIRBuilder &B, iterator_range<MachineBasicBlock::iterator> Range,
    SmallSet<Register, 4> &SGPROperandRegs) const {
  // Track use registers which have already been expanded with a readfirstlane
  // sequence. This may have multiple uses if moving a sequence.
  DenseMap<Register, Register> WaterfalledRegMap;

  MachineBasicBlock &MBB = B.getMBB();
  MachineFunction *MF = &B.getMF();

  const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
  const unsigned MovExecOpc =
      Subtarget.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
  const unsigned MovExecTermOpc =
      Subtarget.isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;

  const unsigned XorTermOpc = Subtarget.isWave32() ?
    AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
  const unsigned AndSaveExecOpc = Subtarget.isWave32() ?
    AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
  const unsigned ExecReg = Subtarget.isWave32() ?
    AMDGPU::EXEC_LO : AMDGPU::EXEC;

#ifndef NDEBUG
  const int OrigRangeSize = std::distance(Range.begin(), Range.end());
#endif

  MachineRegisterInfo &MRI = *B.getMRI();
  Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
  Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);

  // Don't bother using generic instructions/registers for the exec mask.
  B.buildInstr(TargetOpcode::IMPLICIT_DEF)
    .addDef(InitSaveExecReg);

  Register PhiExec = MRI.createVirtualRegister(WaveRC);
  Register NewExec = MRI.createVirtualRegister(WaveRC);

  // To insert the loop we need to split the block. Move everything before this
  // point to a new block, and insert a new empty block before this instruction.
  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *BodyBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;
  MF->insert(MBBI, LoopBB);
  MF->insert(MBBI, BodyBB);
  MF->insert(MBBI, RestoreExecBB);
  MF->insert(MBBI, RemainderBB);

  LoopBB->addSuccessor(BodyBB);
  BodyBB->addSuccessor(RestoreExecBB);
  BodyBB->addSuccessor(LoopBB);

  // Move the rest of the block into a new block.
  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
  RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());

  MBB.addSuccessor(LoopBB);
  RestoreExecBB->addSuccessor(RemainderBB);

  B.setInsertPt(*LoopBB, LoopBB->end());

  B.buildInstr(TargetOpcode::PHI)
      .addDef(PhiExec)
      .addReg(InitSaveExecReg)
      .addMBB(&MBB)
      .addReg(NewExec)
      .addMBB(BodyBB);

  const DebugLoc &DL = B.getDL();

  MachineInstr &FirstInst = *Range.begin();

  // Move the instruction into the loop body. Note we moved everything after
  // Range.end() already into a new block, so Range.end() is no longer valid.
  BodyBB->splice(BodyBB->end(), &MBB, Range.begin(), MBB.end());

  // Figure out the iterator range after splicing the instructions.
  MachineBasicBlock::iterator NewBegin = FirstInst.getIterator();
  auto NewEnd = BodyBB->end();

  B.setMBB(*LoopBB);

  LLT S1 = LLT::scalar(1);
  Register CondReg;

  assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);

  for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
    for (MachineOperand &Op : MI.all_uses()) {
      Register OldReg = Op.getReg();
      if (!SGPROperandRegs.count(OldReg))
        continue;

      // See if we already processed this register in another instruction in the
      // sequence.
      auto OldVal = WaterfalledRegMap.find(OldReg);
      if (OldVal != WaterfalledRegMap.end()) {
        Op.setReg(OldVal->second);
        continue;
      }

      Register OpReg = Op.getReg();
      LLT OpTy = MRI.getType(OpReg);

      const RegisterBank *OpBank = getRegBank(OpReg, MRI, *TRI);
      if (OpBank != &AMDGPU::VGPRRegBank) {
        // Insert copy from AGPR to VGPR before the loop.
        B.setMBB(MBB);
        OpReg = B.buildCopy(OpTy, OpReg).getReg(0);
        MRI.setRegBank(OpReg, AMDGPU::VGPRRegBank);
        B.setMBB(*LoopBB);
      }

      Register CurrentLaneReg = buildReadFirstLane(B, MRI, OpReg);

      // Build the comparison(s).
      unsigned OpSize = OpTy.getSizeInBits();
      bool Is64 = OpSize % 64 == 0;
      unsigned PartSize = Is64 ? 64 : 32;
      LLT PartTy = LLT::scalar(PartSize);
      unsigned NumParts = OpSize / PartSize;
      SmallVector<Register, 8> OpParts;
      SmallVector<Register, 8> CurrentLaneParts;

      if (NumParts == 1) {
        OpParts.push_back(OpReg);
        CurrentLaneParts.push_back(CurrentLaneReg);
      } else {
        auto UnmergeOp = B.buildUnmerge(PartTy, OpReg);
        auto UnmergeCurrentLane = B.buildUnmerge(PartTy, CurrentLaneReg);
        for (unsigned i = 0; i < NumParts; ++i) {
          OpParts.push_back(UnmergeOp.getReg(i));
          CurrentLaneParts.push_back(UnmergeCurrentLane.getReg(i));
          MRI.setRegBank(OpParts[i], AMDGPU::VGPRRegBank);
          MRI.setRegBank(CurrentLaneParts[i], AMDGPU::SGPRRegBank);
        }
      }

      for (unsigned i = 0; i < NumParts; ++i) {
        auto CmpReg = B.buildICmp(CmpInst::ICMP_EQ, S1, CurrentLaneParts[i],
                                  OpParts[i]).getReg(0);
        MRI.setRegBank(CmpReg, AMDGPU::VCCRegBank);

        if (!CondReg) {
          CondReg = CmpReg;
        } else {
          CondReg = B.buildAnd(S1, CondReg, CmpReg).getReg(0);
          MRI.setRegBank(CondReg, AMDGPU::VCCRegBank);
        }
      }

      Op.setReg(CurrentLaneReg);

      // Make sure we don't re-process this register again.
      WaterfalledRegMap.insert(std::pair(OldReg, Op.getReg()));
    }
  }

  // The ballot becomes a no-op during instruction selection.
  CondReg = B.buildIntrinsic(Intrinsic::amdgcn_ballot,
                             {LLT::scalar(Subtarget.isWave32() ? 32 : 64)})
                .addReg(CondReg)
                .getReg(0);
  MRI.setRegClass(CondReg, WaveRC);

  // Update EXEC, save the original EXEC value to VCC.
  B.buildInstr(AndSaveExecOpc)
    .addDef(NewExec)
    .addReg(CondReg, RegState::Kill);

  MRI.setSimpleHint(NewExec, CondReg);

  B.setInsertPt(*BodyBB, BodyBB->end());

  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
  B.buildInstr(XorTermOpc)
    .addDef(ExecReg)
    .addReg(ExecReg)
    .addReg(NewExec);

  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
  // s_cbranch_scc0?

  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
  B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);

  // Save the EXEC mask before the loop.
  BuildMI(MBB, MBB.end(), DL, TII->get(MovExecOpc), SaveExecReg)
    .addReg(ExecReg);

  // Restore the EXEC mask after the loop.
  B.setMBB(*RestoreExecBB);
  B.buildInstr(MovExecTermOpc)
    .addDef(ExecReg)
    .addReg(SaveExecReg);

  // Set the insert point after the original instruction, so any new
  // instructions will be in the remainder.
  B.setInsertPt(*RemainderBB, RemainderBB->begin());

  return true;
}

// Return any unique registers used by \p MI at \p OpIndices that need to be
// handled in a waterfall loop. Returns these registers in \p
// SGPROperandRegs. Returns true if there are any operands to handle and a
// waterfall loop is necessary.
bool AMDGPURegisterBankInfo::collectWaterfallOperands(
  SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI,
  MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const {
  for (unsigned Op : OpIndices) {
    assert(MI.getOperand(Op).isUse());
    Register Reg = MI.getOperand(Op).getReg();
    const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
    if (OpBank->getID() != AMDGPU::SGPRRegBankID)
      SGPROperandRegs.insert(Reg);
  }

  // If no operands need to be replaced, there is no need to loop.
  return !SGPROperandRegs.empty();
}

bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
    MachineIRBuilder &B, MachineInstr &MI, ArrayRef<unsigned> OpIndices) const {
  // Use a set to avoid extra readfirstlanes in the case where multiple operands
  // are the same register.
  SmallSet<Register, 4> SGPROperandRegs;

  if (!collectWaterfallOperands(SGPROperandRegs, MI, *B.getMRI(), OpIndices))
    return false;

  MachineBasicBlock::iterator I = MI.getIterator();
  return executeInWaterfallLoop(B, make_range(I, std::next(I)),
                                SGPROperandRegs);
}

// Legalize an operand that must be an SGPR by inserting a readfirstlane.
void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
    MachineIRBuilder &B, MachineInstr &MI, unsigned OpIdx) const {
  Register Reg = MI.getOperand(OpIdx).getReg();
  MachineRegisterInfo &MRI = *B.getMRI();
  const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
  if (Bank == &AMDGPU::SGPRRegBank)
    return;

  Reg = buildReadFirstLane(B, MRI, Reg);
  MI.getOperand(OpIdx).setReg(Reg);
}

/// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the
/// rest will be in the remainder.
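/// E.g. splitUnequalType(s96, 64) yields {s64, s32}, and
/// splitUnequalType(<3 x s32>, 64) yields {<2 x s32>, s32}.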
static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) {
  unsigned TotalSize = Ty.getSizeInBits();
  if (!Ty.isVector())
    return {LLT::scalar(FirstSize), LLT::scalar(TotalSize - FirstSize)};

  LLT EltTy = Ty.getElementType();
  unsigned EltSize = EltTy.getSizeInBits();
  assert(FirstSize % EltSize == 0);

  unsigned FirstPartNumElts = FirstSize / EltSize;
  unsigned RemainderElts = (TotalSize - FirstSize) / EltSize;

  return {LLT::scalarOrVector(ElementCount::getFixed(FirstPartNumElts), EltTy),
          LLT::scalarOrVector(ElementCount::getFixed(RemainderElts), EltTy)};
}

static LLT widen96To128(LLT Ty) {
  if (!Ty.isVector())
    return LLT::scalar(128);

  LLT EltTy = Ty.getElementType();
  assert(128 % EltTy.getSizeInBits() == 0);
  return LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);
}

bool AMDGPURegisterBankInfo::applyMappingLoad(
    MachineIRBuilder &B,
    const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
    MachineInstr &MI) const {
  MachineRegisterInfo &MRI = *B.getMRI();
  Register DstReg = MI.getOperand(0).getReg();
  const LLT LoadTy = MRI.getType(DstReg);
  unsigned LoadSize = LoadTy.getSizeInBits();
  MachineMemOperand *MMO = *MI.memoperands_begin();
  const unsigned MaxNonSmrdLoadSize = 128;

  const RegisterBank *DstBank =
      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
  if (DstBank == &AMDGPU::SGPRRegBank) {
    // There are some special cases that we need to look at for 32 bit and 96
    // bit SGPR loads; otherwise we have nothing to do.
    if (LoadSize != 32 && (LoadSize != 96 || Subtarget.hasScalarDwordx3Loads()))
      return false;

    const unsigned MemSize = 8 * MMO->getSize().getValue();
    // Scalar loads of size 8 or 16 bit with proper alignment may be widened to
    // 32 bit. Check to see if we need to widen the memory access; 8 or 16 bit
    // scalar loads should have a load size of 32 but a memory access size of
    // less than 32.
    if (LoadSize == 32 &&
        (MemSize == 32 || LoadTy.isVector() || !isScalarLoadLegal(MI)))
      return false;

    if (LoadSize == 32 &&
        ((MemSize == 8 && MMO->getAlign() >= Align(1)) ||
         (MemSize == 16 && MMO->getAlign() >= Align(2))) &&
        isScalarLoadLegal(MI) &&
        Subtarget.getGeneration() >= AMDGPUSubtarget::GFX12)
      return false;

    Register PtrReg = MI.getOperand(1).getReg();

    ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank);

    if (LoadSize == 32) {
      // This is an extending load from a sub-dword size. Widen the memory
      // access size to 4 bytes and clear the extra high bits appropriately.
      const LLT S32 = LLT::scalar(32);
      if (MI.getOpcode() == AMDGPU::G_SEXTLOAD) {
        // Must extend the sign bit into higher bits for a G_SEXTLOAD
        auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
        B.buildSExtInReg(MI.getOperand(0), WideLoad, MemSize);
      } else if (MI.getOpcode() == AMDGPU::G_ZEXTLOAD) {
        // Must extend zero into higher bits with an AND for a G_ZEXTLOAD
        auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
        B.buildZExtInReg(MI.getOperand(0), WideLoad, MemSize);
      } else
        // We do not need to touch the higher bits for regular loads.
        B.buildLoadFromOffset(MI.getOperand(0), PtrReg, *MMO, 0);
    } else {
      // 96-bit loads are only available for vector loads. We need to split
      // this into a 64-bit part and a 32-bit part (unless we can widen to a
      // 128-bit load).
      if (MMO->getAlign() < Align(16)) {
        LegalizerHelper Helper(B.getMF(), ApplyBank, B);
        LLT Part64, Part32;
        std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64);
        if (Helper.reduceLoadStoreWidth(cast<GAnyLoad>(MI), 0, Part64) !=
            LegalizerHelper::Legalized)
          return false;
        return true;
      }
      LLT WiderTy = widen96To128(LoadTy);
      auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0);
      if (WiderTy.isScalar()) {
        B.buildTrunc(MI.getOperand(0), WideLoad);
      } else {
        B.buildDeleteTrailingVectorElements(MI.getOperand(0).getReg(),
                                            WideLoad);
      }
    }

    MI.eraseFromParent();
    return true;
  }

  // 128-bit loads are supported for all instruction types.
  if (LoadSize <= MaxNonSmrdLoadSize)
    return false;

  SmallVector<Register, 1> SrcRegs(OpdMapper.getVRegs(1));

  if (SrcRegs.empty())
    SrcRegs.push_back(MI.getOperand(1).getReg());

  // RegBankSelect only emits scalar types, so we need to reset the pointer
  // operand to a pointer type.
  Register BasePtrReg = SrcRegs[0];
  LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
  MRI.setType(BasePtrReg, PtrTy);

  // The following are loads that were not split sufficiently during
  // legalization because it was not clear whether they would be scalar (SMEM)
  // or vector (VMEM) loads.
  if (AMDGPU::isExtendedGlobalAddrSpace(MMO->getAddrSpace()) ||
      MMO->getAddrSpace() == AMDGPUAS::BUFFER_RESOURCE) {
    assert(LoadSize % MaxNonSmrdLoadSize == 0);
    unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize;
    const LLT LoadSplitTy = LoadTy.divide(NumSplitParts);
    ApplyRegBankMapping O(B, *this, MRI, &AMDGPU::VGPRRegBank);
    LegalizerHelper Helper(B.getMF(), O, B);
    if (LoadTy.isVector()) {
      if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) !=
          LegalizerHelper::Legalized)
        return false;
    } else {
      if (Helper.narrowScalar(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
        return false;
    }
  }

  MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
  return true;
}

bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc(
    MachineIRBuilder &B,
    const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
    MachineInstr &MI) const {
  MachineRegisterInfo &MRI = *B.getMRI();
  const MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const auto &TFI = *ST.getFrameLowering();

  // Guard in case the stack growth direction ever changes with scratch
  // instructions.
  assert(TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp &&
         "Stack grows upwards for AMDGPU");

  Register Dst = MI.getOperand(0).getReg();
  Register AllocSize = MI.getOperand(1).getReg();
  Align Alignment = assumeAligned(MI.getOperand(2).getImm());

  const RegisterBank *SizeBank = getRegBank(AllocSize, MRI, *TRI);

  if (SizeBank != &AMDGPU::SGPRRegBank) {
    auto WaveReduction =
        B.buildIntrinsic(Intrinsic::amdgcn_wave_reduce_umax, {LLT::scalar(32)})
            .addUse(AllocSize)
            .addImm(0);
    AllocSize = WaveReduction.getReg(0);
  }

  LLT PtrTy = MRI.getType(Dst);
  LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());

  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  Register SPReg = Info->getStackPtrOffsetReg();
  ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::SGPRRegBank);

  // The stack pointer is tracked in wave-level bytes, so scale the per-lane
  // allocation size by the wavefront size.
  auto WaveSize = B.buildConstant(LLT::scalar(32), ST.getWavefrontSizeLog2());
  auto ScaledSize = B.buildShl(IntPtrTy, AllocSize, WaveSize);

  auto OldSP = B.buildCopy(PtrTy, SPReg);
  if (Alignment > TFI.getStackAlign()) {
    auto StackAlignMask = (Alignment.value() << ST.getWavefrontSizeLog2()) - 1;
    auto Tmp1 = B.buildPtrAdd(PtrTy, OldSP,
                              B.buildConstant(LLT::scalar(32), StackAlignMask));
    B.buildMaskLowPtrBits(Dst, Tmp1,
                          Log2(Alignment) + ST.getWavefrontSizeLog2());
  } else {
    B.buildCopy(Dst, OldSP);
  }
  auto PtrAdd = B.buildPtrAdd(PtrTy, Dst, ScaledSize);
  B.buildCopy(SPReg, PtrAdd);
  MI.eraseFromParent();
  return true;
}

bool AMDGPURegisterBankInfo::applyMappingImage(
    MachineIRBuilder &B, MachineInstr &MI,
    const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
    int RsrcIdx) const {
  const int NumDefs = MI.getNumExplicitDefs();

  // The reported argument index is relative to the IR intrinsic call arguments,
  // so we need to shift by the number of defs and the intrinsic ID.
  RsrcIdx += NumDefs + 1;

  // Insert copies to VGPR arguments.
  applyDefaultMapping(OpdMapper);

  // Fixup any SGPR arguments.
  SmallVector<unsigned, 4> SGPRIndexes;
  for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) {
    if (!MI.getOperand(I).isReg())
      continue;

    // If this intrinsic has a sampler, it immediately follows rsrc.
    if (I == RsrcIdx || I == RsrcIdx + 1)
      SGPRIndexes.push_back(I);
  }

  executeInWaterfallLoop(B, MI, SGPRIndexes);
  return true;
}

// Analyze a combined offset from an llvm.amdgcn.s.buffer intrinsic and store
// the three offsets (voffset, soffset and instoffset).
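// For example (a sketch; the exact split depends on the subtarget's immediate
// offset range), a constant combined offset of 4100 may come out as
// voffset = 0, soffset = 4096 and instoffset = 4.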
setBufferOffsets(MachineIRBuilder & B,Register CombinedOffset,Register & VOffsetReg,Register & SOffsetReg,int64_t & InstOffsetVal,Align Alignment) const1256 unsigned AMDGPURegisterBankInfo::setBufferOffsets(
1257     MachineIRBuilder &B, Register CombinedOffset, Register &VOffsetReg,
1258     Register &SOffsetReg, int64_t &InstOffsetVal, Align Alignment) const {
1259   const LLT S32 = LLT::scalar(32);
1260   MachineRegisterInfo *MRI = B.getMRI();
1261 
1262   if (std::optional<int64_t> Imm =
1263           getIConstantVRegSExtVal(CombinedOffset, *MRI)) {
1264     uint32_t SOffset, ImmOffset;
1265     if (TII->splitMUBUFOffset(*Imm, SOffset, ImmOffset, Alignment)) {
1266       VOffsetReg = B.buildConstant(S32, 0).getReg(0);
1267       SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
1268       InstOffsetVal = ImmOffset;
1269 
1270       B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1271       B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1272       return SOffset + ImmOffset;
1273     }
1274   }
1275 
1276   Register Base;
1277   unsigned Offset;
1278 
1279   std::tie(Base, Offset) =
1280       AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset);
1281 
1282   uint32_t SOffset, ImmOffset;
1283   if ((int)Offset > 0 &&
1284       TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
1285     if (getRegBank(Base, *MRI, *TRI) == &AMDGPU::VGPRRegBank) {
1286       VOffsetReg = Base;
1287       SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
1288       B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1289       InstOffsetVal = ImmOffset;
1290       return 0; // XXX - Why is this 0?
1291     }
1292 
1293     // If we have an SGPR base, we can use it for soffset.
1294     if (SOffset == 0) {
1295       VOffsetReg = B.buildConstant(S32, 0).getReg(0);
1296       B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1297       SOffsetReg = Base;
1298       InstOffsetVal = ImmOffset;
1299       return 0; // XXX - Why is this 0?
1300     }
1301   }
1302 
1303   // Handle the variable sgpr + vgpr case.
1304   MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI);
1305   if (Add && (int)Offset >= 0) {
1306     Register Src0 = getSrcRegIgnoringCopies(Add->getOperand(1).getReg(), *MRI);
1307     Register Src1 = getSrcRegIgnoringCopies(Add->getOperand(2).getReg(), *MRI);
1308 
1309     const RegisterBank *Src0Bank = getRegBank(Src0, *MRI, *TRI);
1310     const RegisterBank *Src1Bank = getRegBank(Src1, *MRI, *TRI);
1311 
1312     if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) {
1313       VOffsetReg = Src0;
1314       SOffsetReg = Src1;
1315       return 0;
1316     }
1317 
1318     if (Src0Bank == &AMDGPU::SGPRRegBank && Src1Bank == &AMDGPU::VGPRRegBank) {
1319       VOffsetReg = Src1;
1320       SOffsetReg = Src0;
1321       return 0;
1322     }
1323   }
1324 
1325   // Ensure we have a VGPR for the combined offset. This could be an issue if we
1326   // have an SGPR offset and a VGPR resource.
1327   if (getRegBank(CombinedOffset, *MRI, *TRI) == &AMDGPU::VGPRRegBank) {
1328     VOffsetReg = CombinedOffset;
1329   } else {
1330     VOffsetReg = B.buildCopy(S32, CombinedOffset).getReg(0);
1331     B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1332   }
1333 
1334   SOffsetReg = B.buildConstant(S32, 0).getReg(0);
1335   B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1336   return 0;
1337 }
1338 
1339 static unsigned getSBufferLoadCorrespondingBufferLoadOpcode(unsigned Opc) {
1340   switch (Opc) {
1341   case AMDGPU::G_AMDGPU_S_BUFFER_LOAD:
1342     return AMDGPU::G_AMDGPU_BUFFER_LOAD;
1343   case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE:
1344     return AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
1345   case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE:
1346     return AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE;
1347   case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT:
1348     return AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
1349   case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT:
1350     return AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT;
1351   default:
1352     break;
1353   }
1354   llvm_unreachable("Unexpected s_buffer_load opcode");
1355 }
1356 
1357 bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
1358     MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
1359   MachineInstr &MI = OpdMapper.getMI();
1360   MachineRegisterInfo &MRI = OpdMapper.getMRI();
1361 
1362   const LLT S32 = LLT::scalar(32);
1363   Register Dst = MI.getOperand(0).getReg();
1364   LLT Ty = MRI.getType(Dst);
1365 
1366   const RegisterBank *RSrcBank =
1367     OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1368   const RegisterBank *OffsetBank =
1369     OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1370   if (RSrcBank == &AMDGPU::SGPRRegBank &&
1371       OffsetBank == &AMDGPU::SGPRRegBank)
1372     return true; // Legal mapping
1373 
1374   // FIXME: 96-bit case was widened during legalize. We need to narrow it back
1375   // here but don't have an MMO.
1376 
1377   unsigned LoadSize = Ty.getSizeInBits();
1378   int NumLoads = 1;
1379   if (LoadSize == 256 || LoadSize == 512) {
1380     NumLoads = LoadSize / 128;
1381     Ty = Ty.divide(NumLoads);
1382   }
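       // For example, a 512-bit result is split into four 128-bit parts,
       // loaded at immediate offsets +0, +16, +32 and +48 and merged back
       // together below.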
1383 
1384   // Use the alignment to ensure that the required offsets will fit into the
1385   // immediate offsets.
1386   const Align Alignment = NumLoads > 1 ? Align(16 * NumLoads) : Align(1);
1387 
1388   MachineFunction &MF = B.getMF();
1389 
1390   Register SOffset;
1391   Register VOffset;
1392   int64_t ImmOffset = 0;
1393 
1394   unsigned MMOOffset = setBufferOffsets(B, MI.getOperand(2).getReg(), VOffset,
1395                                         SOffset, ImmOffset, Alignment);
1396 
1397   // TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we
1398   // can, but we need to track an MMO for that.
1399   const unsigned MemSize = (Ty.getSizeInBits() + 7) / 8;
1400   const Align MemAlign(4); // FIXME: ABI type alignment?
1401   MachineMemOperand *BaseMMO = MF.getMachineMemOperand(
1402     MachinePointerInfo(),
1403     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1404     MachineMemOperand::MOInvariant,
1405     MemSize, MemAlign);
1406   if (MMOOffset != 0)
1407     BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset, MemSize);
1408 
1409   // If only the offset is divergent, emit a MUBUF buffer load instead. We can
1410   // assume that the buffer is unswizzled.
1411 
1412   Register RSrc = MI.getOperand(1).getReg();
1413   Register VIndex = B.buildConstant(S32, 0).getReg(0);
1414   B.getMRI()->setRegBank(VIndex, AMDGPU::VGPRRegBank);
1415 
1416   SmallVector<Register, 4> LoadParts(NumLoads);
1417 
1418   MachineBasicBlock::iterator MII = MI.getIterator();
1419   MachineInstrSpan Span(MII, &B.getMBB());
1420 
1421   for (int i = 0; i < NumLoads; ++i) {
1422     if (NumLoads == 1) {
1423       LoadParts[i] = Dst;
1424     } else {
1425       LoadParts[i] = MRI.createGenericVirtualRegister(Ty);
1426       MRI.setRegBank(LoadParts[i], AMDGPU::VGPRRegBank);
1427     }
1428 
1429     MachineMemOperand *MMO = BaseMMO;
1430     if (i != 0)
1431       BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset + 16 * i, MemSize);
1432 
1433     B.buildInstr(getSBufferLoadCorrespondingBufferLoadOpcode(MI.getOpcode()))
1434         .addDef(LoadParts[i])       // vdata
1435         .addUse(RSrc)               // rsrc
1436         .addUse(VIndex)             // vindex
1437         .addUse(VOffset)            // voffset
1438         .addUse(SOffset)            // soffset
1439         .addImm(ImmOffset + 16 * i) // offset(imm)
1440         .addImm(0)                  // cachepolicy, swizzled buffer(imm)
1441         .addImm(0)                  // idxen(imm)
1442         .addMemOperand(MMO);
1443   }
1444 
1445   // TODO: If only the resource is a VGPR, it may be better to execute the
1446   // scalar load in the waterfall loop if the resource is expected to frequently
1447   // be dynamically uniform.
1448   if (RSrcBank != &AMDGPU::SGPRRegBank) {
1449     // Remove the original instruction to avoid potentially confusing the
1450     // waterfall loop logic.
1451     B.setInstr(*Span.begin());
1452     MI.eraseFromParent();
1453 
1454     SmallSet<Register, 4> OpsToWaterfall;
1455 
1456     OpsToWaterfall.insert(RSrc);
1457     executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
1458                            OpsToWaterfall);
1459   }
1460 
1461   if (NumLoads != 1) {
1462     if (Ty.isVector())
1463       B.buildConcatVectors(Dst, LoadParts);
1464     else
1465       B.buildMergeLikeInstr(Dst, LoadParts);
1466   }
1467 
1468   // We removed the instruction earlier with a waterfall loop.
1469   if (RSrcBank == &AMDGPU::SGPRRegBank)
1470     MI.eraseFromParent();
1471 
1472   return true;
1473 }
1474 
1475 bool AMDGPURegisterBankInfo::applyMappingBFE(MachineIRBuilder &B,
1476                                              const OperandsMapper &OpdMapper,
1477                                              bool Signed) const {
1478   MachineInstr &MI = OpdMapper.getMI();
1479   MachineRegisterInfo &MRI = OpdMapper.getMRI();
1480 
1481   // Insert basic copies
1482   applyDefaultMapping(OpdMapper);
1483 
1484   Register DstReg = MI.getOperand(0).getReg();
1485   LLT Ty = MRI.getType(DstReg);
1486 
1487   const LLT S32 = LLT::scalar(32);
1488 
1489   unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
1490   Register SrcReg = MI.getOperand(FirstOpnd).getReg();
1491   Register OffsetReg = MI.getOperand(FirstOpnd + 1).getReg();
1492   Register WidthReg = MI.getOperand(FirstOpnd + 2).getReg();
1493 
1494   const RegisterBank *DstBank =
1495     OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1496   if (DstBank == &AMDGPU::VGPRRegBank) {
1497     if (Ty == S32)
1498       return true;
1499 
1500     // There are no 64-bit vgpr bitfield extract instructions, so the operation
1501     // is expanded into a sequence of instructions that implement it.
1502     ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::VGPRRegBank);
1503 
1504     const LLT S64 = LLT::scalar(64);
1505     // Shift the source operand so that extracted bits start at bit 0.
1506     auto ShiftOffset = Signed ? B.buildAShr(S64, SrcReg, OffsetReg)
1507                               : B.buildLShr(S64, SrcReg, OffsetReg);
1508     auto UnmergeSOffset = B.buildUnmerge({S32, S32}, ShiftOffset);
1509 
1510     // A 64-bit bitfield extract uses the 32-bit bitfield extract instructions
1511     // if the width is a constant.
1512     if (auto ConstWidth = getIConstantVRegValWithLookThrough(WidthReg, MRI)) {
1513       // Depending on the width, operate on either the low or the high 32 bits.
1515       auto Zero = B.buildConstant(S32, 0);
1516       auto WidthImm = ConstWidth->Value.getZExtValue();
1517       if (WidthImm <= 32) {
1518         // Use bitfield extract on the lower 32-bit source, and then sign-extend
1519         // or clear the upper 32-bits.
1520         auto Extract =
1521             Signed ? B.buildSbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg)
1522                    : B.buildUbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg);
1523         auto Extend =
1524             Signed ? B.buildAShr(S32, Extract, B.buildConstant(S32, 31)) : Zero;
1525         B.buildMergeLikeInstr(DstReg, {Extract, Extend});
1526       } else {
1527         // Use bitfield extract on upper 32-bit source, and combine with lower
1528         // 32-bit source.
1529         auto UpperWidth = B.buildConstant(S32, WidthImm - 32);
1530         auto Extract =
1531             Signed
1532                 ? B.buildSbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth)
1533                 : B.buildUbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth);
1534         B.buildMergeLikeInstr(DstReg, {UnmergeSOffset.getReg(0), Extract});
1535       }
1536       MI.eraseFromParent();
1537       return true;
1538     }
1539 
1540     // Expand to Src >> Offset << (64 - Width) >> (64 - Width) using 64-bit
1541     // operations.
1542     auto ExtShift = B.buildSub(S32, B.buildConstant(S32, 64), WidthReg);
1543     auto SignBit = B.buildShl(S64, ShiftOffset, ExtShift);
1544     if (Signed)
1545       B.buildAShr(S64, SignBit, ExtShift);
1546     else
1547       B.buildLShr(S64, SignBit, ExtShift);
1548     MI.eraseFromParent();
1549     return true;
1550   }
1551 
1552   // The scalar form packs the offset and width in a single operand.
1553 
1554   ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::SGPRRegBank);
1555 
1556   // Ensure the high bits are clear to insert the offset.
1557   auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));
1558   auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);
1559 
1560   // The shift zeros out the low bits, so don't bother clamping the width value.
1561   auto ShiftWidth = B.buildShl(S32, WidthReg, B.buildConstant(S32, 16));
1562 
1563   // Pack the offset and width of the BFE into the format expected by
1564   // S_BFE_I32 / S_BFE_U32: in the second source operand, bits [5:0] contain
1565   // the offset and bits [22:16] the width.
1566   auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth);
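       // For example, offset = 8 and width = 16 pack to
       // (16 << 16) | 8 == 0x00100008.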
1567 
1568   // TODO: It might be worth using a pseudo here to avoid scc clobber and
1569   // register class constraints.
1570   unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) :
1571                              (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);
1572 
1573   auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs});
1574   if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
1575     llvm_unreachable("failed to constrain BFE");
1576 
1577   MI.eraseFromParent();
1578   return true;
1579 }
1580 
1581 bool AMDGPURegisterBankInfo::applyMappingMAD_64_32(
1582     MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
1583   MachineInstr &MI = OpdMapper.getMI();
1584   MachineRegisterInfo &MRI = OpdMapper.getMRI();
1585 
1586   // Insert basic copies.
1587   applyDefaultMapping(OpdMapper);
1588 
1589   Register Dst0 = MI.getOperand(0).getReg();
1590   Register Dst1 = MI.getOperand(1).getReg();
1591   Register Src0 = MI.getOperand(2).getReg();
1592   Register Src1 = MI.getOperand(3).getReg();
1593   Register Src2 = MI.getOperand(4).getReg();
1594 
1595   if (MRI.getRegBankOrNull(Src0) == &AMDGPU::VGPRRegBank)
1596     return true;
1597 
1598   bool IsUnsigned = MI.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
1599   LLT S1 = LLT::scalar(1);
1600   LLT S32 = LLT::scalar(32);
1601 
1602   bool DstOnValu = MRI.getRegBankOrNull(Src2) == &AMDGPU::VGPRRegBank;
1603   bool Accumulate = true;
1604 
1605   if (!DstOnValu) {
1606     if (mi_match(Src2, MRI, m_ZeroInt()))
1607       Accumulate = false;
1608   }
1609 
1610   // Keep the multiplication on the SALU.
1611   Register DstHi;
1612   Register DstLo = B.buildMul(S32, Src0, Src1).getReg(0);
1613   bool MulHiInVgpr = false;
1614 
1615   MRI.setRegBank(DstLo, AMDGPU::SGPRRegBank);
1616 
1617   if (Subtarget.hasSMulHi()) {
1618     DstHi = IsUnsigned ? B.buildUMulH(S32, Src0, Src1).getReg(0)
1619                        : B.buildSMulH(S32, Src0, Src1).getReg(0);
1620     MRI.setRegBank(DstHi, AMDGPU::SGPRRegBank);
1621   } else {
1622     Register VSrc0 = B.buildCopy(S32, Src0).getReg(0);
1623     Register VSrc1 = B.buildCopy(S32, Src1).getReg(0);
1624 
1625     MRI.setRegBank(VSrc0, AMDGPU::VGPRRegBank);
1626     MRI.setRegBank(VSrc1, AMDGPU::VGPRRegBank);
1627 
1628     DstHi = IsUnsigned ? B.buildUMulH(S32, VSrc0, VSrc1).getReg(0)
1629                        : B.buildSMulH(S32, VSrc0, VSrc1).getReg(0);
1630     MRI.setRegBank(DstHi, AMDGPU::VGPRRegBank);
1631 
1632     if (!DstOnValu) {
1633       DstHi = buildReadFirstLane(B, MRI, DstHi);
1634     } else {
1635       MulHiInVgpr = true;
1636     }
1637   }
1638 
1639   // Accumulate and produce the "carry-out" bit.
1640   //
1641   // The "carry-out" is defined as bit 64 of the result when computed as a
1642   // big integer. For unsigned multiply-add, this matches the usual definition
1643   // of carry-out. For signed multiply-add, bit 64 is the sign bit of the
1644   // result, which is determined as:
1645   //   sign(Src0 * Src1) + sign(Src2) + carry-out from unsigned 64-bit add
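       // (a sum modulo 2, which is why the signed path below folds the three
       // terms together with G_XORs).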
1646   LLT CarryType = DstOnValu ? S1 : S32;
1647   const RegisterBank &CarryBank =
1648       DstOnValu ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
1649   const RegisterBank &DstBank =
1650       DstOnValu ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank;
1651   Register Carry;
1652   Register Zero;
1653 
1654   if (!IsUnsigned) {
1655     Zero = B.buildConstant(S32, 0).getReg(0);
1656     MRI.setRegBank(Zero,
1657                    MulHiInVgpr ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank);
1658 
1659     Carry = B.buildICmp(CmpInst::ICMP_SLT, MulHiInVgpr ? S1 : S32, DstHi, Zero)
1660                 .getReg(0);
1661     MRI.setRegBank(Carry, MulHiInVgpr ? AMDGPU::VCCRegBank
1662                                       : AMDGPU::SGPRRegBank);
1663 
1664     if (DstOnValu && !MulHiInVgpr) {
1665       Carry = B.buildTrunc(S1, Carry).getReg(0);
1666       MRI.setRegBank(Carry, AMDGPU::VCCRegBank);
1667     }
1668   }
1669 
1670   if (Accumulate) {
1671     if (DstOnValu) {
1672       DstLo = B.buildCopy(S32, DstLo).getReg(0);
1673       DstHi = B.buildCopy(S32, DstHi).getReg(0);
1674       MRI.setRegBank(DstLo, AMDGPU::VGPRRegBank);
1675       MRI.setRegBank(DstHi, AMDGPU::VGPRRegBank);
1676     }
1677 
1678     auto Unmerge = B.buildUnmerge(S32, Src2);
1679     Register Src2Lo = Unmerge.getReg(0);
1680     Register Src2Hi = Unmerge.getReg(1);
1681     MRI.setRegBank(Src2Lo, DstBank);
1682     MRI.setRegBank(Src2Hi, DstBank);
1683 
1684     if (!IsUnsigned) {
1685       auto Src2Sign = B.buildICmp(CmpInst::ICMP_SLT, CarryType, Src2Hi, Zero);
1686       MRI.setRegBank(Src2Sign.getReg(0), CarryBank);
1687 
1688       Carry = B.buildXor(CarryType, Carry, Src2Sign).getReg(0);
1689       MRI.setRegBank(Carry, CarryBank);
1690     }
1691 
1692     auto AddLo = B.buildUAddo(S32, CarryType, DstLo, Src2Lo);
1693     DstLo = AddLo.getReg(0);
1694     Register CarryLo = AddLo.getReg(1);
1695     MRI.setRegBank(DstLo, DstBank);
1696     MRI.setRegBank(CarryLo, CarryBank);
1697 
1698     auto AddHi = B.buildUAdde(S32, CarryType, DstHi, Src2Hi, CarryLo);
1699     DstHi = AddHi.getReg(0);
1700     MRI.setRegBank(DstHi, DstBank);
1701 
1702     Register CarryHi = AddHi.getReg(1);
1703     MRI.setRegBank(CarryHi, CarryBank);
1704 
1705     if (IsUnsigned) {
1706       Carry = CarryHi;
1707     } else {
1708       Carry = B.buildXor(CarryType, Carry, CarryHi).getReg(0);
1709       MRI.setRegBank(Carry, CarryBank);
1710     }
1711   } else {
1712     if (IsUnsigned) {
1713       Carry = B.buildConstant(CarryType, 0).getReg(0);
1714       MRI.setRegBank(Carry, CarryBank);
1715     }
1716   }
1717 
1718   B.buildMergeLikeInstr(Dst0, {DstLo, DstHi});
1719 
1720   if (DstOnValu) {
1721     B.buildCopy(Dst1, Carry);
1722   } else {
1723     B.buildTrunc(Dst1, Carry);
1724   }
1725 
1726   MI.eraseFromParent();
1727   return true;
1728 }
1729 
1730 // Return a suitable opcode for extending the operands of Opc when widening.
1731 static unsigned getExtendOp(unsigned Opc) {
1732   switch (Opc) {
1733   case TargetOpcode::G_ASHR:
1734   case TargetOpcode::G_SMIN:
1735   case TargetOpcode::G_SMAX:
1736     return TargetOpcode::G_SEXT;
1737   case TargetOpcode::G_LSHR:
1738   case TargetOpcode::G_UMIN:
1739   case TargetOpcode::G_UMAX:
1740     return TargetOpcode::G_ZEXT;
1741   default:
1742     return TargetOpcode::G_ANYEXT;
1743   }
1744 }
1745 
1746 // Emit a legalized extension from <2 x s16> to 2 32-bit components, avoiding
1747 // any illegal vector extend or unmerge operations.
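     // For example, with Src = 0xAAAABBBB:
     //   G_ZEXT:   Lo = 0x0000BBBB, Hi = 0x0000AAAA
     //   G_SEXT:   Lo = 0xFFFFBBBB, Hi = 0xFFFFAAAA
     //   G_ANYEXT: Lo = 0xAAAABBBB (only the low 16 bits are meaningful),
     //             Hi = 0x0000AAAA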
1748 static std::pair<Register, Register>
1749 unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) {
1750   const LLT S32 = LLT::scalar(32);
1751   auto Bitcast = B.buildBitcast(S32, Src);
1752 
1753   if (ExtOpcode == TargetOpcode::G_SEXT) {
1754     auto ExtLo = B.buildSExtInReg(S32, Bitcast, 16);
1755     auto ShiftHi = B.buildAShr(S32, Bitcast, B.buildConstant(S32, 16));
1756     return std::pair(ExtLo.getReg(0), ShiftHi.getReg(0));
1757   }
1758 
1759   auto ShiftHi = B.buildLShr(S32, Bitcast, B.buildConstant(S32, 16));
1760   if (ExtOpcode == TargetOpcode::G_ZEXT) {
1761     auto ExtLo = B.buildAnd(S32, Bitcast, B.buildConstant(S32, 0xffff));
1762     return std::pair(ExtLo.getReg(0), ShiftHi.getReg(0));
1763   }
1764 
1765   assert(ExtOpcode == TargetOpcode::G_ANYEXT);
1766   return std::pair(Bitcast.getReg(0), ShiftHi.getReg(0));
1767 }
1768 
1769 // For cases where only a single copy is inserted for matching register banks,
1770 // replace the register in the instruction operand.
1771 static bool substituteSimpleCopyRegs(
1772   const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {
1773   SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx));
1774   if (!SrcReg.empty()) {
1775     assert(SrcReg.size() == 1);
1776     OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]);
1777     return true;
1778   }
1779 
1780   return false;
1781 }
1782 
1783 /// Handle register layout difference for f16 images for some subtargets.
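     /// On subtargets with unpacked D16 VMEM, each 16-bit element occupies the
     /// low half of its own 32-bit register, so e.g. a <4 x s16> payload is
     /// rewritten as a <4 x s32> value here.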
1784 Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B,
1785                                                 MachineRegisterInfo &MRI,
1786                                                 Register Reg) const {
1787   if (!Subtarget.hasUnpackedD16VMem())
1788     return Reg;
1789 
1790   const LLT S16 = LLT::scalar(16);
1791   LLT StoreVT = MRI.getType(Reg);
1792   if (!StoreVT.isVector() || StoreVT.getElementType() != S16)
1793     return Reg;
1794 
1795   auto Unmerge = B.buildUnmerge(S16, Reg);
1796
1798   SmallVector<Register, 4> WideRegs;
1799   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
1800     WideRegs.push_back(Unmerge.getReg(I));
1801 
1802   const LLT S32 = LLT::scalar(32);
1803   int NumElts = StoreVT.getNumElements();
1804 
1805   return B.buildMergeLikeInstr(LLT::fixed_vector(NumElts, S32), WideRegs)
1806       .getReg(0);
1807 }
1808 
1809 static std::pair<Register, unsigned>
1810 getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
1811   int64_t Const;
1812   if (mi_match(Reg, MRI, m_ICst(Const)))
1813     return std::pair(Register(), Const);
1814 
1815   Register Base;
1816   if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const))))
1817     return std::pair(Base, Const);
1818 
1819   // TODO: Handle G_OR used for add case
1820   return std::pair(Reg, 0);
1821 }
1822 
1823 std::pair<Register, unsigned>
1824 AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B,
1825                                            Register OrigOffset) const {
1826   const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(Subtarget);
1827   Register BaseReg;
1828   unsigned ImmOffset;
1829   const LLT S32 = LLT::scalar(32);
1830 
1831   // TODO: Use AMDGPU::getBaseWithConstantOffset() instead.
1832   std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(),
1833                                                            OrigOffset);
1834 
1835   unsigned C1 = 0;
1836   if (ImmOffset != 0) {
1837     // If the immediate value is too big for the immoffset field, put only bits
1838     // that would normally fit in the immoffset field. The remaining value that
1839     // is copied/added for the voffset field is a large power of 2, and it
1840     // stands more chance of being CSEd with the copy/add for another similar
1841     // load/store.
1842     // However, do not do that rounding down if the result would be a negative
1843     // number, as it appears to be illegal to have a negative offset in the
1844     // vgpr, even if adding the immediate offset makes it positive.
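         // For example, assuming MaxImm == 4095 (a 12-bit immediate field):
         //   ImmOffset == 5000 splits into Overflow == 4096 and
         //   ImmOffset == 904, while a negative ImmOffset such as -8 (as
         //   unsigned) is folded entirely back into Overflow, leaving
         //   ImmOffset == 0.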
1845     unsigned Overflow = ImmOffset & ~MaxImm;
1846     ImmOffset -= Overflow;
1847     if ((int32_t)Overflow < 0) {
1848       Overflow += ImmOffset;
1849       ImmOffset = 0;
1850     }
1851 
1852     C1 = ImmOffset;
1853     if (Overflow != 0) {
1854       if (!BaseReg)
1855         BaseReg = B.buildConstant(S32, Overflow).getReg(0);
1856       else {
1857         auto OverflowVal = B.buildConstant(S32, Overflow);
1858         BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
1859       }
1860     }
1861   }
1862 
1863   if (!BaseReg)
1864     BaseReg = B.buildConstant(S32, 0).getReg(0);
1865 
1866   return {BaseReg, C1};
1867 }
1868 
1869 bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg,
1870                                         Register SrcReg) const {
1871   MachineRegisterInfo &MRI = *B.getMRI();
1872   LLT SrcTy = MRI.getType(SrcReg);
1873   if (SrcTy.getSizeInBits() == 32) {
1874     // Use a v_mov_b32 here to make the exec dependency explicit.
1875     B.buildInstr(AMDGPU::V_MOV_B32_e32)
1876       .addDef(DstReg)
1877       .addUse(SrcReg);
1878     return constrainGenericRegister(DstReg, AMDGPU::VGPR_32RegClass, MRI) &&
1879            constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI);
1880   }
1881 
1882   Register TmpReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1883   Register TmpReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1884 
1885   B.buildInstr(AMDGPU::V_MOV_B32_e32)
1886     .addDef(TmpReg0)
1887     .addUse(SrcReg, 0, AMDGPU::sub0);
1888   B.buildInstr(AMDGPU::V_MOV_B32_e32)
1889     .addDef(TmpReg1)
1890     .addUse(SrcReg, 0, AMDGPU::sub1);
1891   B.buildInstr(AMDGPU::REG_SEQUENCE)
1892     .addDef(DstReg)
1893     .addUse(TmpReg0)
1894     .addImm(AMDGPU::sub0)
1895     .addUse(TmpReg1)
1896     .addImm(AMDGPU::sub1);
1897 
1898   return constrainGenericRegister(SrcReg, AMDGPU::SReg_64RegClass, MRI) &&
1899          constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI);
1900 }
1901 
1902 /// Utility function for pushing dynamic vector indexes with a constant offset
1903 /// into waterfall loops.
1904 static void reinsertVectorIndexAdd(MachineIRBuilder &B,
1905                                    MachineInstr &IdxUseInstr,
1906                                    unsigned OpIdx,
1907                                    unsigned ConstOffset) {
1908   MachineRegisterInfo &MRI = *B.getMRI();
1909   const LLT S32 = LLT::scalar(32);
1910   Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg();
1911   B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator());
1912 
1913   auto MaterializedOffset = B.buildConstant(S32, ConstOffset);
1914 
1915   auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset);
1916   MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank);
1917   MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank);
1918   IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0));
1919 }
1920 
1921 /// Implement extending a 32-bit value to a 64-bit value. \p Lo32Reg is the
1922 /// original 32-bit source value (to be inserted in the low part of the combined
1923 /// 64-bit result), and \p Hi32Reg is the high half of the combined 64-bit
1924 /// value.
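     /// For example, sign-extending Lo32Reg == 0x80000000 produces Hi32Reg ==
     /// 0xFFFFFFFF (via an ashr by 31), while zero-extending produces
     /// Hi32Reg == 0.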
1925 static void extendLow32IntoHigh32(MachineIRBuilder &B,
1926                                   Register Hi32Reg, Register Lo32Reg,
1927                                   unsigned ExtOpc,
1928                                   const RegisterBank &RegBank,
1929                                   bool IsBooleanSrc = false) {
1930   if (ExtOpc == AMDGPU::G_ZEXT) {
1931     B.buildConstant(Hi32Reg, 0);
1932   } else if (ExtOpc == AMDGPU::G_SEXT) {
1933     if (IsBooleanSrc) {
1934       // If we know the original source was an s1, the high half is the same as
1935       // the low.
1936       B.buildCopy(Hi32Reg, Lo32Reg);
1937     } else {
1938       // Replicate sign bit from 32-bit extended part.
1939       auto ShiftAmt = B.buildConstant(LLT::scalar(32), 31);
1940       B.getMRI()->setRegBank(ShiftAmt.getReg(0), RegBank);
1941       B.buildAShr(Hi32Reg, Lo32Reg, ShiftAmt);
1942     }
1943   } else {
1944     assert(ExtOpc == AMDGPU::G_ANYEXT && "not an integer extension");
1945     B.buildUndef(Hi32Reg);
1946   }
1947 }
1948 
1949 bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect(
1950     MachineIRBuilder &B, MachineInstr &MI,
1951     const OperandsMapper &OpdMapper) const {
1952   MachineRegisterInfo &MRI = *B.getMRI();
1953 
1954   Register VecReg = MI.getOperand(1).getReg();
1955   Register Idx = MI.getOperand(2).getReg();
1956 
1957   const RegisterBank &IdxBank =
1958     *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1959 
1960   bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
1961 
1962   LLT VecTy = MRI.getType(VecReg);
1963   unsigned EltSize = VecTy.getScalarSizeInBits();
1964   unsigned NumElem = VecTy.getNumElements();
1965 
1966   if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
1967                                                   IsDivergentIdx, &Subtarget))
1968     return false;
1969 
1970   LLT S32 = LLT::scalar(32);
1971 
1972   const RegisterBank &DstBank =
1973     *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1974   const RegisterBank &SrcBank =
1975     *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1976 
1977   const RegisterBank &CCBank =
1978     (DstBank == AMDGPU::SGPRRegBank &&
1979      SrcBank == AMDGPU::SGPRRegBank &&
1980      IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
1981                                      : AMDGPU::VCCRegBank;
1982   LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
1983 
1984   if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
1985     Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
1986     MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
1987   }
1988 
1989   LLT EltTy = VecTy.getScalarType();
1990   SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
1991   unsigned NumLanes = DstRegs.size();
1992   if (!NumLanes)
1993     NumLanes = 1;
1994   else
1995     EltTy = MRI.getType(DstRegs[0]);
1996 
1997   auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
1998   SmallVector<Register, 2> Res(NumLanes);
1999   for (unsigned L = 0; L < NumLanes; ++L)
2000     Res[L] = UnmergeToEltTy.getReg(L);
2001 
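       // Expand the dynamic extract into a chain of compares and selects,
       // e.g. for a four element vector:
       //   Res = V[0];
       //   Res = (Idx == 1) ? V[1] : Res;
       //   Res = (Idx == 2) ? V[2] : Res;
       //   Res = (Idx == 3) ? V[3] : Res;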
2002   for (unsigned I = 1; I < NumElem; ++I) {
2003     auto IC = B.buildConstant(S32, I);
2004     MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
2005     auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
2006     MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
2007 
2008     for (unsigned L = 0; L < NumLanes; ++L) {
2009       auto S = B.buildSelect(EltTy, Cmp,
2010                              UnmergeToEltTy.getReg(I * NumLanes + L), Res[L]);
2011 
2012       for (unsigned N : { 0, 2, 3 })
2013         MRI.setRegBank(S->getOperand(N).getReg(), DstBank);
2014 
2015       Res[L] = S->getOperand(0).getReg();
2016     }
2017   }
2018 
2019   for (unsigned L = 0; L < NumLanes; ++L) {
2020     Register DstReg = (NumLanes == 1) ? MI.getOperand(0).getReg() : DstRegs[L];
2021     B.buildCopy(DstReg, Res[L]);
2022     MRI.setRegBank(DstReg, DstBank);
2023   }
2024 
2025   MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
2026   MI.eraseFromParent();
2027 
2028   return true;
2029 }
2030 
2031 // Insert a cross regbank copy for a register if it already has a bank that
2032 // differs from the one we want to set.
2033 static Register constrainRegToBank(MachineRegisterInfo &MRI,
2034                                    MachineIRBuilder &B, Register &Reg,
2035                                    const RegisterBank &Bank) {
2036   const RegisterBank *CurrBank = MRI.getRegBankOrNull(Reg);
2037   if (CurrBank && *CurrBank != Bank) {
2038     Register Copy = B.buildCopy(MRI.getType(Reg), Reg).getReg(0);
2039     MRI.setRegBank(Copy, Bank);
2040     return Copy;
2041   }
2042 
2043   MRI.setRegBank(Reg, Bank);
2044   return Reg;
2045 }
2046 
2047 bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(
2048     MachineIRBuilder &B, MachineInstr &MI,
2049     const OperandsMapper &OpdMapper) const {
2050 
2051   MachineRegisterInfo &MRI = *B.getMRI();
2052   Register VecReg = MI.getOperand(1).getReg();
2053   Register Idx = MI.getOperand(3).getReg();
2054 
2055   const RegisterBank &IdxBank =
2056     *OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
2057 
2058   bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
2059 
2060   LLT VecTy = MRI.getType(VecReg);
2061   unsigned EltSize = VecTy.getScalarSizeInBits();
2062   unsigned NumElem = VecTy.getNumElements();
2063 
2064   if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
2065                                                   IsDivergentIdx, &Subtarget))
2066     return false;
2067 
2068   LLT S32 = LLT::scalar(32);
2069 
2070   const RegisterBank &DstBank =
2071     *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2072   const RegisterBank &SrcBank =
2073     *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2074   const RegisterBank &InsBank =
2075     *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2076 
2077   const RegisterBank &CCBank =
2078     (DstBank == AMDGPU::SGPRRegBank &&
2079      SrcBank == AMDGPU::SGPRRegBank &&
2080      InsBank == AMDGPU::SGPRRegBank &&
2081      IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
2082                                      : AMDGPU::VCCRegBank;
2083   LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
2084 
2085   if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
2086     Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
2087     MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
2088   }
2089 
2090   LLT EltTy = VecTy.getScalarType();
2091   SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
2092   unsigned NumLanes = InsRegs.size();
2093   if (!NumLanes) {
2094     NumLanes = 1;
2095     InsRegs.push_back(MI.getOperand(2).getReg());
2096   } else {
2097     EltTy = MRI.getType(InsRegs[0]);
2098   }
2099 
2100   auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
2101   SmallVector<Register, 16> Ops(NumElem * NumLanes);
2102 
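       // Expand the dynamic insert into one compare and select per element,
       // i.e. Ops[I] = (Idx == I) ? InsVal : V[I] for each element index I.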
2103   for (unsigned I = 0; I < NumElem; ++I) {
2104     auto IC = B.buildConstant(S32, I);
2105     MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
2106     auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
2107     MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
2108 
2109     for (unsigned L = 0; L < NumLanes; ++L) {
2110       Register Op0 = constrainRegToBank(MRI, B, InsRegs[L], DstBank);
2111       Register Op1 = UnmergeToEltTy.getReg(I * NumLanes + L);
2112       Op1 = constrainRegToBank(MRI, B, Op1, DstBank);
2113 
2114       Register Select = B.buildSelect(EltTy, Cmp, Op0, Op1).getReg(0);
2115       MRI.setRegBank(Select, DstBank);
2116 
2117       Ops[I * NumLanes + L] = Select;
2118     }
2119   }
2120 
2121   LLT MergeTy = LLT::fixed_vector(Ops.size(), EltTy);
2122   if (MergeTy == MRI.getType(MI.getOperand(0).getReg())) {
2123     B.buildBuildVector(MI.getOperand(0), Ops);
2124   } else {
2125     auto Vec = B.buildBuildVector(MergeTy, Ops);
2126     MRI.setRegBank(Vec->getOperand(0).getReg(), DstBank);
2127     B.buildBitcast(MI.getOperand(0).getReg(), Vec);
2128   }
2129 
2130   MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
2131   MI.eraseFromParent();
2132 
2133   return true;
2134 }
2135 
2136 // Break s_mul_u64 into 32-bit vector operations.
2137 void AMDGPURegisterBankInfo::applyMappingSMULU64(
2138     MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
2139   SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2140   SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
2141   SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
2142 
2143   // All inputs are SGPRs, nothing special to do.
2144   if (DefRegs.empty()) {
2145     assert(Src0Regs.empty() && Src1Regs.empty());
2146     applyDefaultMapping(OpdMapper);
2147     return;
2148   }
2149 
2150   assert(DefRegs.size() == 2);
2151   assert(Src0Regs.size() == Src1Regs.size() &&
2152          (Src0Regs.empty() || Src0Regs.size() == 2));
2153 
2154   MachineRegisterInfo &MRI = OpdMapper.getMRI();
2155   MachineInstr &MI = OpdMapper.getMI();
2156   Register DstReg = MI.getOperand(0).getReg();
2157   LLT HalfTy = LLT::scalar(32);
2158 
2159   // Depending on where the source registers came from, the generic code may
2160   // have decided to split the inputs already or not. If not, we still need to
2161   // extract the values.
2162 
2163   if (Src0Regs.empty())
2164     split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
2165   else
2166     setRegsToType(MRI, Src0Regs, HalfTy);
2167 
2168   if (Src1Regs.empty())
2169     split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
2170   else
2171     setRegsToType(MRI, Src1Regs, HalfTy);
2172 
2173   setRegsToType(MRI, DefRegs, HalfTy);
2174 
2175   // The multiplication is done as follows:
2176   //
2177   //                            Op1H  Op1L
2178   //                          * Op0H  Op0L
2179   //                       --------------------
2180   //                       Op1H*Op0L  Op1L*Op0L
2181   //          + Op1H*Op0H  Op1L*Op0H
2182   // -----------------------------------------
2183   // (Op1H*Op0L + Op1L*Op0H + carry)  Op1L*Op0L
2184   //
2185   //  We drop Op1H*Op0H because the result of the multiplication is a 64-bit
2186   //  value and that would overflow.
2187   //  The low 32-bit value is Op1L*Op0L.
2188   //  The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from
2189   //  Op1L*Op0L).
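       //
       //  A minimal scalar sketch of the same decomposition (plain C++, for
       //  illustration only, not part of the lowering):
       //
       //    uint64_t mul64(uint32_t Op0L, uint32_t Op0H,
       //                   uint32_t Op1L, uint32_t Op1H) {
       //      uint32_t Lo = Op0L * Op1L;
       //      uint32_t Hi = (uint32_t)(((uint64_t)Op0L * Op1L) >> 32) // carry
       //                  + Op0L * Op1H + Op0H * Op1L;  // cross terms
       //      return ((uint64_t)Hi << 32) | Lo;
       //    }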
2190 
2191   ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::VGPRRegBank);
2192 
2193   Register Hi = B.buildUMulH(HalfTy, Src0Regs[0], Src1Regs[0]).getReg(0);
2194   Register MulLoHi = B.buildMul(HalfTy, Src0Regs[0], Src1Regs[1]).getReg(0);
2195   Register Add = B.buildAdd(HalfTy, Hi, MulLoHi).getReg(0);
2196   Register MulHiLo = B.buildMul(HalfTy, Src0Regs[1], Src1Regs[0]).getReg(0);
2197   B.buildAdd(DefRegs[1], Add, MulHiLo);
2198   B.buildMul(DefRegs[0], Src0Regs[0], Src1Regs[0]);
2199 
2200   MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2201   MI.eraseFromParent();
2202 }
2203 
2204 void AMDGPURegisterBankInfo::applyMappingImpl(
2205     MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
2206   MachineInstr &MI = OpdMapper.getMI();
2207   B.setInstrAndDebugLoc(MI);
2208   unsigned Opc = MI.getOpcode();
2209   MachineRegisterInfo &MRI = OpdMapper.getMRI();
2210   switch (Opc) {
2211   case AMDGPU::G_CONSTANT:
2212   case AMDGPU::G_IMPLICIT_DEF: {
2213     Register DstReg = MI.getOperand(0).getReg();
2214     LLT DstTy = MRI.getType(DstReg);
2215     if (DstTy != LLT::scalar(1))
2216       break;
2217 
2218     const RegisterBank *DstBank =
2219         OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2220     if (DstBank == &AMDGPU::VCCRegBank)
2221       break;
2222     SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0));
2223     if (DefRegs.empty())
2224       DefRegs.push_back(DstReg);
2225 
2226     B.setInsertPt(*MI.getParent(), ++MI.getIterator());
2227 
2228     Register NewDstReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
2229     LLVMContext &Ctx = B.getMF().getFunction().getContext();
2230 
2231     MI.getOperand(0).setReg(NewDstReg);
2232     if (Opc != AMDGPU::G_IMPLICIT_DEF) {
2233       uint64_t ConstVal = MI.getOperand(1).getCImm()->getZExtValue();
2234       MI.getOperand(1).setCImm(
2235           ConstantInt::get(IntegerType::getInt32Ty(Ctx), ConstVal));
2236     }
2237 
2238     MRI.setRegBank(NewDstReg, *DstBank);
2239     B.buildTrunc(DefRegs[0], NewDstReg);
2240     return;
2241   }
2242   case AMDGPU::G_PHI: {
2243     Register DstReg = MI.getOperand(0).getReg();
2244     LLT DstTy = MRI.getType(DstReg);
2245     if (DstTy != LLT::scalar(1))
2246       break;
2247 
2248     const LLT S32 = LLT::scalar(32);
2249     const RegisterBank *DstBank =
2250       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2251     if (DstBank == &AMDGPU::VCCRegBank) {
2252       applyDefaultMapping(OpdMapper);
2253       // The standard handling only considers the result register bank for
2254       // phis. For VCC, blindly inserting a copy when the phi is lowered will
2255       // produce an invalid copy. We can only copy with some kind of compare to
2256       // get a vector boolean result. Insert a register bank copy that will be
2257       // correctly lowered to a compare.
2258       for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
2259         Register SrcReg = MI.getOperand(I).getReg();
2260         const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
2261 
2262         if (SrcBank != &AMDGPU::VCCRegBank) {
2263           MachineBasicBlock *SrcMBB = MI.getOperand(I + 1).getMBB();
2264           B.setInsertPt(*SrcMBB, SrcMBB->getFirstTerminator());
2265 
2266           auto Copy = B.buildCopy(LLT::scalar(1), SrcReg);
2267           MRI.setRegBank(Copy.getReg(0), AMDGPU::VCCRegBank);
2268           MI.getOperand(I).setReg(Copy.getReg(0));
2269         }
2270       }
2271 
2272       return;
2273     }
2274 
2275     // Phi handling is strange and only considers the bank of the destination.
2276     substituteSimpleCopyRegs(OpdMapper, 0);
2277 
2278     // Promote SGPR/VGPR booleans to s32
2279     ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank);
2280     B.setInsertPt(B.getMBB(), MI);
2281     LegalizerHelper Helper(B.getMF(), ApplyBank, B);
2282 
2283     if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
2284       llvm_unreachable("widen scalar should have succeeded");
2285 
2286     return;
2287   }
2288   case AMDGPU::G_FCMP:
2289     if (!Subtarget.hasSALUFloatInsts())
2290       break;
2291     [[fallthrough]];
2292   case AMDGPU::G_ICMP:
2293   case AMDGPU::G_UADDO:
2294   case AMDGPU::G_USUBO:
2295   case AMDGPU::G_UADDE:
2296   case AMDGPU::G_SADDE:
2297   case AMDGPU::G_USUBE:
2298   case AMDGPU::G_SSUBE: {
2299     unsigned BoolDstOp =
2300         (Opc == AMDGPU::G_ICMP || Opc == AMDGPU::G_FCMP) ? 0 : 1;
2301     Register DstReg = MI.getOperand(BoolDstOp).getReg();
2302 
2303     const RegisterBank *DstBank =
2304       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2305     if (DstBank != &AMDGPU::SGPRRegBank)
2306       break;
2307 
2308     const bool HasCarryIn = MI.getNumOperands() == 5;
2309 
2310     // If this is a scalar compare, promote the result to s32, as the selection
2311     // will end up using a copy to a 32-bit vreg.
2312     const LLT S32 = LLT::scalar(32);
2313     Register NewDstReg = MRI.createGenericVirtualRegister(S32);
2314     MRI.setRegBank(NewDstReg, AMDGPU::SGPRRegBank);
2315     MI.getOperand(BoolDstOp).setReg(NewDstReg);
2316 
2317     if (HasCarryIn) {
2318       Register NewSrcReg = MRI.createGenericVirtualRegister(S32);
2319       MRI.setRegBank(NewSrcReg, AMDGPU::SGPRRegBank);
2320       B.buildZExt(NewSrcReg, MI.getOperand(4).getReg());
2321       MI.getOperand(4).setReg(NewSrcReg);
2322     }
2323 
2324     MachineBasicBlock *MBB = MI.getParent();
2325     B.setInsertPt(*MBB, std::next(MI.getIterator()));
2326 
2327     // If we had a constrained VCC result register, a copy was inserted to VCC
2328     // from SGPR.
2329     SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0));
2330     if (DefRegs.empty())
2331       DefRegs.push_back(DstReg);
2332     B.buildTrunc(DefRegs[0], NewDstReg);
2333     return;
2334   }
2335   case AMDGPU::G_SELECT: {
2336     Register DstReg = MI.getOperand(0).getReg();
2337     LLT DstTy = MRI.getType(DstReg);
2338 
2339     SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(1));
2340     if (CondRegs.empty())
2341       CondRegs.push_back(MI.getOperand(1).getReg());
2342     else {
2343       assert(CondRegs.size() == 1);
2344     }
2345 
2346     const RegisterBank *CondBank = getRegBank(CondRegs[0], MRI, *TRI);
2347     if (CondBank == &AMDGPU::SGPRRegBank) {
2348       const LLT S32 = LLT::scalar(32);
2349       Register NewCondReg = MRI.createGenericVirtualRegister(S32);
2350       MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
2351 
2352       MI.getOperand(1).setReg(NewCondReg);
2353       B.buildZExt(NewCondReg, CondRegs[0]);
2354     }
2355 
2356     if (DstTy.getSizeInBits() != 64)
2357       break;
2358 
2359     LLT HalfTy = getHalfSizedType(DstTy);
2360 
2361     SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2362     SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
2363     SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3));
2364 
2365     // All inputs are SGPRs, nothing special to do.
2366     if (DefRegs.empty()) {
2367       assert(Src1Regs.empty() && Src2Regs.empty());
2368       break;
2369     }
2370 
2371     if (Src1Regs.empty())
2372       split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
2373     else {
2374       setRegsToType(MRI, Src1Regs, HalfTy);
2375     }
2376 
2377     if (Src2Regs.empty())
2378       split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg());
2379     else
2380       setRegsToType(MRI, Src2Regs, HalfTy);
2381 
2382     setRegsToType(MRI, DefRegs, HalfTy);
2383 
2384     auto Flags = MI.getFlags();
2385     B.buildSelect(DefRegs[0], CondRegs[0], Src1Regs[0], Src2Regs[0], Flags);
2386     B.buildSelect(DefRegs[1], CondRegs[0], Src1Regs[1], Src2Regs[1], Flags);
2387 
2388     MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2389     MI.eraseFromParent();
2390     return;
2391   }
2392   case AMDGPU::G_BRCOND: {
2393     Register CondReg = MI.getOperand(0).getReg();
2394     // FIXME: Should use legalizer helper, but should change bool ext type.
2395     const RegisterBank *CondBank =
2396       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2397 
2398     if (CondBank == &AMDGPU::SGPRRegBank) {
2399       const LLT S32 = LLT::scalar(32);
2400       Register NewCondReg = MRI.createGenericVirtualRegister(S32);
2401       MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
2402 
2403       MI.getOperand(0).setReg(NewCondReg);
2404       B.buildZExt(NewCondReg, CondReg);
2405       return;
2406     }
2407 
2408     break;
2409   }
2410   case AMDGPU::G_AND:
2411   case AMDGPU::G_OR:
2412   case AMDGPU::G_XOR: {
2413     // 64-bit and/or/xor is only available on the SALU, so split into two
2414     // 32-bit ops if there is a VGPR input.
2415     Register DstReg = MI.getOperand(0).getReg();
2416     LLT DstTy = MRI.getType(DstReg);
2417 
2418     const RegisterBank *DstBank =
2419         OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2420 
2421     if (DstTy.getSizeInBits() == 1) {
2422       if (DstBank == &AMDGPU::VCCRegBank)
2423         break;
2424 
2425       MachineFunction *MF = MI.getParent()->getParent();
2426       ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank);
2427       LegalizerHelper Helper(*MF, ApplyBank, B);
2428 
2429       if (Helper.widenScalar(MI, 0, LLT::scalar(32)) !=
2430           LegalizerHelper::Legalized)
2431         llvm_unreachable("widen scalar should have succeeded");
2432       return;
2433     }
2434 
2435     if (DstTy.getSizeInBits() == 16 && DstBank == &AMDGPU::SGPRRegBank) {
2436       const LLT S32 = LLT::scalar(32);
2437       MachineBasicBlock *MBB = MI.getParent();
2438       MachineFunction *MF = MBB->getParent();
2439       ApplyRegBankMapping ApplySALU(B, *this, MRI, &AMDGPU::SGPRRegBank);
2440       LegalizerHelper Helper(*MF, ApplySALU, B);
2441       // Widen to S32, but handle `G_XOR x, -1` differently. Legalizer widening
2442       // will use a G_ANYEXT to extend the -1 which prevents matching G_XOR -1
2443       // as "not".
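           // For example, %r:s16 = G_XOR %x, %neg1 becomes:
           //   %xe:s32  = G_ANYEXT %x
           //   %me:s32  = G_SEXT %neg1      (stays all-ones)
           //   %r32:s32 = G_XOR %xe, %me
           //   %r:s16   = G_TRUNC %r32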
2444       if (MI.getOpcode() == AMDGPU::G_XOR &&
2445           mi_match(MI.getOperand(2).getReg(), MRI, m_SpecificICstOrSplat(-1))) {
2446         Helper.widenScalarSrc(MI, S32, 1, AMDGPU::G_ANYEXT);
2447         Helper.widenScalarSrc(MI, S32, 2, AMDGPU::G_SEXT);
2448         Helper.widenScalarDst(MI, S32);
2449       } else {
2450         if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
2451           llvm_unreachable("widen scalar should have succeeded");
2452       }
2453       return;
2454     }
2455 
2456     if (DstTy.getSizeInBits() != 64)
2457       break;
2458 
2459     LLT HalfTy = getHalfSizedType(DstTy);
2460     SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2461     SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
2462     SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
2463 
2464     // All inputs are SGPRs, nothing special to do.
2465     if (DefRegs.empty()) {
2466       assert(Src0Regs.empty() && Src1Regs.empty());
2467       break;
2468     }
2469 
2470     assert(DefRegs.size() == 2);
2471     assert(Src0Regs.size() == Src1Regs.size() &&
2472            (Src0Regs.empty() || Src0Regs.size() == 2));
2473 
2474     // Depending on where the source registers came from, the generic code may
2475     // have decided to split the inputs already or not. If not, we still need to
2476     // extract the values.
2477 
2478     if (Src0Regs.empty())
2479       split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
2480     else
2481       setRegsToType(MRI, Src0Regs, HalfTy);
2482 
2483     if (Src1Regs.empty())
2484       split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
2485     else
2486       setRegsToType(MRI, Src1Regs, HalfTy);
2487 
2488     setRegsToType(MRI, DefRegs, HalfTy);
2489 
2490     auto Flags = MI.getFlags();
2491     B.buildInstr(Opc, {DefRegs[0]}, {Src0Regs[0], Src1Regs[0]}, Flags);
2492     B.buildInstr(Opc, {DefRegs[1]}, {Src0Regs[1], Src1Regs[1]}, Flags);
2493 
2494     MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2495     MI.eraseFromParent();
2496     return;
2497   }
2498   case AMDGPU::G_ABS: {
2499     Register SrcReg = MI.getOperand(1).getReg();
2500     const RegisterBank *SrcBank = MRI.getRegBankOrNull(SrcReg);
2501 
2502     // There is no VALU abs instruction, so we need to replace it with a sub
2503     // and max combination.
2504     if (SrcBank && SrcBank == &AMDGPU::VGPRRegBank) {
2505       MachineFunction *MF = MI.getParent()->getParent();
2506       ApplyRegBankMapping Apply(B, *this, MRI, &AMDGPU::VGPRRegBank);
2507       LegalizerHelper Helper(*MF, Apply, B);
2508 
2509       if (Helper.lowerAbsToMaxNeg(MI) != LegalizerHelper::Legalized)
2510         llvm_unreachable("lowerAbsToMaxNeg should have succeeded");
2511       return;
2512     }
2513     [[fallthrough]];
2514   }
2515   case AMDGPU::G_ADD:
2516   case AMDGPU::G_SUB:
2517   case AMDGPU::G_MUL:
2518   case AMDGPU::G_SHL:
2519   case AMDGPU::G_LSHR:
2520   case AMDGPU::G_ASHR:
2521   case AMDGPU::G_SMIN:
2522   case AMDGPU::G_SMAX:
2523   case AMDGPU::G_UMIN:
2524   case AMDGPU::G_UMAX: {
2525     Register DstReg = MI.getOperand(0).getReg();
2526     LLT DstTy = MRI.getType(DstReg);
2527 
2528     // Special case for s_mul_u64. There is no vector equivalent of s_mul_u64,
2529     // so we have to break it down into 32-bit vector multiplications.
2531     if (Opc == AMDGPU::G_MUL && DstTy.getSizeInBits() == 64) {
2532       applyMappingSMULU64(B, OpdMapper);
2533       return;
2534     }
2535 
2536     // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
2537     // Packed 16-bit operations need to be scalarized and promoted.
2538     if (DstTy != LLT::scalar(16) && DstTy != LLT::fixed_vector(2, 16))
2539       break;
2540 
2541     const RegisterBank *DstBank =
2542         OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2543     if (DstBank == &AMDGPU::VGPRRegBank)
2544       break;
2545 
2546     const LLT S32 = LLT::scalar(32);
2547     MachineBasicBlock *MBB = MI.getParent();
2548     MachineFunction *MF = MBB->getParent();
2549     ApplyRegBankMapping ApplySALU(B, *this, MRI, &AMDGPU::SGPRRegBank);
2550 
2551     if (DstTy.isVector() && Opc == AMDGPU::G_ABS) {
2552       Register WideSrcLo, WideSrcHi;
2553 
2554       std::tie(WideSrcLo, WideSrcHi) =
2555           unpackV2S16ToS32(B, MI.getOperand(1).getReg(), TargetOpcode::G_SEXT);
2556       auto Lo = B.buildInstr(AMDGPU::G_ABS, {S32}, {WideSrcLo});
2557       auto Hi = B.buildInstr(AMDGPU::G_ABS, {S32}, {WideSrcHi});
2558       B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)});
2559       MI.eraseFromParent();
2560       return;
2561     }
2562 
2563     if (DstTy.isVector()) {
2564       Register WideSrc0Lo, WideSrc0Hi;
2565       Register WideSrc1Lo, WideSrc1Hi;
2566 
2567       unsigned ExtendOp = getExtendOp(MI.getOpcode());
2568       std::tie(WideSrc0Lo, WideSrc0Hi)
2569         = unpackV2S16ToS32(B, MI.getOperand(1).getReg(), ExtendOp);
2570       std::tie(WideSrc1Lo, WideSrc1Hi)
2571         = unpackV2S16ToS32(B, MI.getOperand(2).getReg(), ExtendOp);
2572       auto Lo = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Lo, WideSrc1Lo});
2573       auto Hi = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Hi, WideSrc1Hi});
2574       B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)});
2575       MI.eraseFromParent();
2576     } else {
2577       LegalizerHelper Helper(*MF, ApplySALU, B);
2578 
2579       if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
2580         llvm_unreachable("widen scalar should have succeeded");
2581 
2582       // FIXME: s16 shift amounts should be legal.
2583       if (Opc == AMDGPU::G_SHL || Opc == AMDGPU::G_LSHR ||
2584           Opc == AMDGPU::G_ASHR) {
2585         B.setInsertPt(*MBB, MI.getIterator());
2586         if (Helper.widenScalar(MI, 1, S32) != LegalizerHelper::Legalized)
2587           llvm_unreachable("widen scalar should have succeeded");
2588       }
2589     }
2590 
2591     return;
2592   }
2593   case AMDGPU::G_AMDGPU_S_MUL_I64_I32:
2594   case AMDGPU::G_AMDGPU_S_MUL_U64_U32: {
2595     // This is a special case for s_mul_u64. We use the
2596     // G_AMDGPU_S_MUL_I64_I32 opcode to represent an s_mul_u64 operation
2597     // where the 33 higher bits are sign-extended and the
2598     // G_AMDGPU_S_MUL_U64_U32 opcode to represent one where the 32 higher
2599     // bits are zero-extended. If scalar registers are selected, both opcodes
2600     // are lowered to s_mul_u64. If vector registers are selected,
2601     // G_AMDGPU_S_MUL_I64_I32 and G_AMDGPU_S_MUL_U64_U32 are lowered with a
2602     // vector mad instruction.
2603 
2604     // Insert basic copies.
2605     applyDefaultMapping(OpdMapper);
2606 
2607     Register DstReg = MI.getOperand(0).getReg();
2608     Register SrcReg0 = MI.getOperand(1).getReg();
2609     Register SrcReg1 = MI.getOperand(2).getReg();
2610     const LLT S32 = LLT::scalar(32);
2611     const LLT S64 = LLT::scalar(64);
2612     assert(MRI.getType(DstReg) == S64 && "This is a special case for s_mul_u64 "
2613                                          "that handles only 64-bit operands.");
2614     const RegisterBank *DstBank =
2615         OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2616 
2617     // Replace G_AMDGPU_S_MUL_I64_I32 and G_AMDGPU_S_MUL_U64_U32
2618     // with s_mul_u64 operation.
2619     if (DstBank == &AMDGPU::SGPRRegBank) {
2620       MI.setDesc(TII->get(AMDGPU::S_MUL_U64));
2621       MRI.setRegClass(DstReg, &AMDGPU::SGPR_64RegClass);
2622       MRI.setRegClass(SrcReg0, &AMDGPU::SGPR_64RegClass);
2623       MRI.setRegClass(SrcReg1, &AMDGPU::SGPR_64RegClass);
2624       return;
2625     }
2626 
2627     // Replace G_AMDGPU_S_MUL_I64_I32 and G_AMDGPU_S_MUL_U64_U32
2628     // with a vector mad.
2629     assert(MRI.getRegBankOrNull(DstReg) == &AMDGPU::VGPRRegBank &&
2630            "The destination operand should be in vector registers.");
2631 
2632     // Extract the lower subregister from the first operand.
2633     Register Op0L = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2634     MRI.setRegClass(Op0L, &AMDGPU::VGPR_32RegClass);
2635     MRI.setType(Op0L, S32);
2636     B.buildTrunc(Op0L, SrcReg0);
2637 
2638     // Extract the lower subregister from the second operand.
2639     Register Op1L = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2640     MRI.setRegClass(Op1L, &AMDGPU::VGPR_32RegClass);
2641     MRI.setType(Op1L, S32);
2642     B.buildTrunc(Op1L, SrcReg1);
2643 
2644     unsigned NewOpc = Opc == AMDGPU::G_AMDGPU_S_MUL_U64_U32
2645                           ? AMDGPU::G_AMDGPU_MAD_U64_U32
2646                           : AMDGPU::G_AMDGPU_MAD_I64_I32;
2647 
2648     MachineIRBuilder B(MI);
2649     Register Zero64 = B.buildConstant(S64, 0).getReg(0);
2650     MRI.setRegClass(Zero64, &AMDGPU::VReg_64RegClass);
2651     Register CarryOut = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
2652     MRI.setRegClass(CarryOut, &AMDGPU::VReg_64RegClass);
2653     B.buildInstr(NewOpc, {DstReg, CarryOut}, {Op0L, Op1L, Zero64});
2654     MI.eraseFromParent();
2655     return;
2656   }
2657   case AMDGPU::G_SEXT_INREG: {
2658     SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
2659     if (SrcRegs.empty())
2660       break; // Nothing to repair
2661 
2662     const LLT S32 = LLT::scalar(32);
2663     ApplyRegBankMapping O(B, *this, MRI, &AMDGPU::VGPRRegBank);
2664 
2665     // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs
2666     // we would need to further expand, and doesn't let us directly set the
2667     // result registers.
2668     SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
2669 
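    // Illustrative split of a 64-bit G_SEXT_INREG into 32-bit halves:
    //   sext_inreg x:s64, 16  ->  lo = sext_inreg (freeze x.lo), 16
    //                             hi = ashr lo, 31
    //   sext_inreg x:s64, 48  ->  lo = copy x.lo
    //                             hi = sext_inreg x.hi, 16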
2670     int Amt = MI.getOperand(2).getImm();
2671     if (Amt <= 32) {
2672       // Downstream users have expectations for the high bit behavior, so freeze
2673       // incoming undefined bits.
2674       if (Amt == 32) {
2675         // The low bits are unchanged.
2676         B.buildFreeze(DstRegs[0], SrcRegs[0]);
2677       } else {
2678         auto Freeze = B.buildFreeze(S32, SrcRegs[0]);
2679         // Extend in the low bits and propagate the sign bit to the high half.
2680         B.buildSExtInReg(DstRegs[0], Freeze, Amt);
2681       }
2682 
2683       B.buildAShr(DstRegs[1], DstRegs[0], B.buildConstant(S32, 31));
2684     } else {
2685       // The low bits are unchanged; the extension happens entirely in the
2686       // high half, so no freeze is required.
2687       B.buildCopy(DstRegs[0], SrcRegs[0]);
2688       B.buildSExtInReg(DstRegs[1], DstRegs[0], Amt - 32);
2689     }
2690 
2691     Register DstReg = MI.getOperand(0).getReg();
2692     MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2693     MI.eraseFromParent();
2694     return;
2695   }
2696   case AMDGPU::G_CTPOP:
2697   case AMDGPU::G_BITREVERSE: {
2698     const RegisterBank *DstBank =
2699       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2700     if (DstBank == &AMDGPU::SGPRRegBank)
2701       break;
2702 
2703     Register SrcReg = MI.getOperand(1).getReg();
2704     const LLT S32 = LLT::scalar(32);
2705     LLT Ty = MRI.getType(SrcReg);
2706     if (Ty == S32)
2707       break;
2708 
2709     ApplyRegBankMapping ApplyVALU(B, *this, MRI, &AMDGPU::VGPRRegBank);
2710 
2711     MachineFunction &MF = B.getMF();
2712     LegalizerHelper Helper(MF, ApplyVALU, B);
2713 
2714     if (Helper.narrowScalar(MI, 1, S32) != LegalizerHelper::Legalized)
2715       llvm_unreachable("narrowScalar should have succeeded");
2716     return;
2717   }
2718   case AMDGPU::G_AMDGPU_FFBH_U32:
2719   case AMDGPU::G_AMDGPU_FFBL_B32:
2720   case AMDGPU::G_CTLZ_ZERO_UNDEF:
2721   case AMDGPU::G_CTTZ_ZERO_UNDEF: {
2722     const RegisterBank *DstBank =
2723         OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2724     if (DstBank == &AMDGPU::SGPRRegBank)
2725       break;
2726 
2727     Register SrcReg = MI.getOperand(1).getReg();
2728     const LLT S32 = LLT::scalar(32);
2729     LLT Ty = MRI.getType(SrcReg);
2730     if (Ty == S32)
2731       break;
2732 
2733     // We can narrow this more efficiently than the LegalizerHelper by using
2734     // ffbh/ffbl, which return -1 when the input is zero:
2735     // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
2736     // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
2737     // (ffbh hi:lo) -> (umin (ffbh hi), (uaddsat (ffbh lo), 32))
2738     // (ffbl hi:lo) -> (umin (uaddsat (ffbl hi), 32), (ffbl lo))
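    // Worked example: ctlz_zero_undef of 0x0000000000ff0000 evaluates as
    // umin(ffbh(0x0) = -1, ffbh(0x00ff0000) + 32 = 8 + 32) = 40, matching the
    // 40 leading zeros of the 64-bit value.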
2739     ApplyRegBankMapping ApplyVALU(B, *this, MRI, &AMDGPU::VGPRRegBank);
2740     SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
2741     unsigned NewOpc = Opc == AMDGPU::G_CTLZ_ZERO_UNDEF
2742                           ? (unsigned)AMDGPU::G_AMDGPU_FFBH_U32
2743                           : Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
2744                                 ? (unsigned)AMDGPU::G_AMDGPU_FFBL_B32
2745                                 : Opc;
2746     unsigned Idx = NewOpc == AMDGPU::G_AMDGPU_FFBH_U32;
2747     auto X = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx]});
2748     auto Y = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx ^ 1]});
2749     unsigned AddOpc =
2750         Opc == AMDGPU::G_CTLZ_ZERO_UNDEF || Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
2751             ? AMDGPU::G_ADD
2752             : AMDGPU::G_UADDSAT;
2753     Y = B.buildInstr(AddOpc, {S32}, {Y, B.buildConstant(S32, 32)});
2754     Register DstReg = MI.getOperand(0).getReg();
2755     B.buildUMin(DstReg, X, Y);
2756     MI.eraseFromParent();
2757     return;
2758   }
2759   case AMDGPU::G_SEXT:
2760   case AMDGPU::G_ZEXT:
2761   case AMDGPU::G_ANYEXT: {
2762     Register SrcReg = MI.getOperand(1).getReg();
2763     LLT SrcTy = MRI.getType(SrcReg);
2764     const bool Signed = Opc == AMDGPU::G_SEXT;
2765 
2766     assert(OpdMapper.getVRegs(1).empty());
2767 
2768     const RegisterBank *SrcBank =
2769       OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2770 
2771     Register DstReg = MI.getOperand(0).getReg();
2772     LLT DstTy = MRI.getType(DstReg);
2773     if (DstTy.isScalar() &&
2774         SrcBank != &AMDGPU::SGPRRegBank &&
2775         SrcBank != &AMDGPU::VCCRegBank &&
2776         // FIXME: Should handle any type that rounds to s64 when irregular
2777         // breakdowns are supported.
2778         DstTy.getSizeInBits() == 64 &&
2779         SrcTy.getSizeInBits() <= 32) {
2780       SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2781 
2782       // Extend to 32-bit, and then extend the low half.
2783       if (Signed) {
2784         // TODO: Should really be buildSExtOrCopy
2785         B.buildSExtOrTrunc(DefRegs[0], SrcReg);
2786       } else if (Opc == AMDGPU::G_ZEXT) {
2787         B.buildZExtOrTrunc(DefRegs[0], SrcReg);
2788       } else {
2789         B.buildAnyExtOrTrunc(DefRegs[0], SrcReg);
2790       }
2791 
2792       extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank);
2793       MRI.setRegBank(DstReg, *SrcBank);
2794       MI.eraseFromParent();
2795       return;
2796     }
2797 
2798     if (SrcTy != LLT::scalar(1))
2799       return;
2800 
2801     // It is not legal to have a legalization artifact with a VCC source.
2802     // Rather than introducing a copy, directly insert the select that such
2803     // a copy would have been selected to.
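    // Illustrative sketch: a G_ZEXT of a vcc-bank s1 to s32 becomes
    //   %r:vgpr(s32) = G_SELECT %cond:vcc(s1), 1, 0
    // while G_SEXT uses -1 as the true value instead.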
2804     if (SrcBank == &AMDGPU::VCCRegBank) {
2805       SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2806 
2807       const RegisterBank *DstBank = &AMDGPU::VGPRRegBank;
2808 
2809       unsigned DstSize = DstTy.getSizeInBits();
2810       // 64-bit select is SGPR only
2811       const bool UseSel64 = DstSize > 32 &&
2812         SrcBank->getID() == AMDGPU::SGPRRegBankID;
2813 
2814       // TODO: Should s16 select be legal?
2815       LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32);
2816       auto True = B.buildConstant(SelType, Signed ? -1 : 1);
2817       auto False = B.buildConstant(SelType, 0);
2818 
2819       MRI.setRegBank(True.getReg(0), *DstBank);
2820       MRI.setRegBank(False.getReg(0), *DstBank);
2821       MRI.setRegBank(DstReg, *DstBank);
2822 
2823       if (DstSize > 32) {
2824         B.buildSelect(DefRegs[0], SrcReg, True, False);
2825         extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank, true);
2826       } else if (DstSize < 32) {
2827         auto Sel = B.buildSelect(SelType, SrcReg, True, False);
2828         MRI.setRegBank(Sel.getReg(0), *DstBank);
2829         B.buildTrunc(DstReg, Sel);
2830       } else {
2831         B.buildSelect(DstReg, SrcReg, True, False);
2832       }
2833 
2834       MI.eraseFromParent();
2835       return;
2836     }
2837 
2838     break;
2839   }
2840   case AMDGPU::G_EXTRACT_VECTOR_ELT: {
2841     SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
2842 
2843     assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty());
2844 
2845     Register DstReg = MI.getOperand(0).getReg();
2846     Register SrcReg = MI.getOperand(1).getReg();
2847 
2848     const LLT S32 = LLT::scalar(32);
2849     LLT DstTy = MRI.getType(DstReg);
2850     LLT SrcTy = MRI.getType(SrcReg);
2851 
2852     if (foldExtractEltToCmpSelect(B, MI, OpdMapper))
2853       return;
2854 
2855     const ValueMapping &DstMapping
2856       = OpdMapper.getInstrMapping().getOperandMapping(0);
2857     const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank;
2858     const RegisterBank *SrcBank =
2859       OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2860     const RegisterBank *IdxBank =
2861         OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2862 
2863     Register BaseIdxReg;
2864     unsigned ConstOffset;
2865     std::tie(BaseIdxReg, ConstOffset) =
2866         AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(2).getReg());
2867 
2868     // See if the index is an add of a constant, which we can fold by
2869     // moving just the base register of the index into the waterfall loop
2870     // later. This essentially reassociates the add of the constant with
2871     // the readfirstlane.
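    // For example (illustrative), with %idx = G_ADD %base, 5 only %base is
    // read with readfirstlane inside the loop, and the +5 is re-added to the
    // resulting uniform value in the loop body.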
2872     bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
2873                                    ConstOffset > 0 &&
2874                                    ConstOffset < SrcTy.getNumElements();
2875 
2876     // Move the base register. We'll re-insert the add later.
2877     if (ShouldMoveIndexIntoLoop)
2878       MI.getOperand(2).setReg(BaseIdxReg);
2879 
2880     // If this is a VGPR result only because the index was a VGPR value, the
2881     // actual indexing will be done on the SGPR source vector, which will
2882     // produce a scalar result. We need to copy to the VGPR result inside the
2883     // waterfall loop.
2884     const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank &&
2885                                 SrcBank == &AMDGPU::SGPRRegBank;
2886     if (DstRegs.empty()) {
2887       applyDefaultMapping(OpdMapper);
2888 
2889       executeInWaterfallLoop(B, MI, {2});
2890 
2891       if (NeedCopyToVGPR) {
2892         // We don't want a phi for this temporary reg.
2893         Register TmpReg = MRI.createGenericVirtualRegister(DstTy);
2894         MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank);
2895         MI.getOperand(0).setReg(TmpReg);
2896         B.setInsertPt(*MI.getParent(), ++MI.getIterator());
2897 
2898         // Use a v_mov_b32 here to make the exec dependency explicit.
2899         buildVCopy(B, DstReg, TmpReg);
2900       }
2901 
2902       // Re-insert the constant offset add inside the waterfall loop.
2903       if (ShouldMoveIndexIntoLoop)
2904         reinsertVectorIndexAdd(B, MI, 2, ConstOffset);
2905 
2906       return;
2907     }
2908 
2909     assert(DstTy.getSizeInBits() == 64);
2910 
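    // Illustrative sketch: an s64 element extract becomes two s32 extracts on
    // a bitcast source, e.g. for a <2 x s64> vector:
    //   %v32:_(<4 x s32>) = G_BITCAST %vec
    //   %lo:_(s32) = G_EXTRACT_VECTOR_ELT %v32, 2 * %idx
    //   %hi:_(s32) = G_EXTRACT_VECTOR_ELT %v32, 2 * %idx + 1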
2911     LLT Vec32 = LLT::fixed_vector(2 * SrcTy.getNumElements(), 32);
2912 
2913     auto CastSrc = B.buildBitcast(Vec32, SrcReg);
2914     auto One = B.buildConstant(S32, 1);
2915 
2916     MachineBasicBlock::iterator MII = MI.getIterator();
2917 
2918     // Split the vector index into 32-bit pieces. Prepare to move all of the
2919     // new instructions into a waterfall loop if necessary.
2920     //
2921     // Don't put the bitcast or constant in the loop.
2922     MachineInstrSpan Span(MII, &B.getMBB());
2923 
2924     // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
2925     auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
2926     auto IdxHi = B.buildAdd(S32, IdxLo, One);
2927 
2928     auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo);
2929     auto Extract1 = B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi);
2930 
2931     MRI.setRegBank(DstReg, *DstBank);
2932     MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
2933     MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
2934     MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
2935     MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
2936 
2937     SmallSet<Register, 4> OpsToWaterfall;
2938     if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) {
2939       MI.eraseFromParent();
2940       return;
2941     }
2942 
2943     // Remove the original instruction to avoid potentially confusing the
2944     // waterfall loop logic.
2945     B.setInstr(*Span.begin());
2946     MI.eraseFromParent();
2947     executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
2948                            OpsToWaterfall);
2949 
2950     if (NeedCopyToVGPR) {
2951       MachineBasicBlock *LoopBB = Extract1->getParent();
2952       Register TmpReg0 = MRI.createGenericVirtualRegister(S32);
2953       Register TmpReg1 = MRI.createGenericVirtualRegister(S32);
2954       MRI.setRegBank(TmpReg0, AMDGPU::SGPRRegBank);
2955       MRI.setRegBank(TmpReg1, AMDGPU::SGPRRegBank);
2956 
2957       Extract0->getOperand(0).setReg(TmpReg0);
2958       Extract1->getOperand(0).setReg(TmpReg1);
2959 
2960       B.setInsertPt(*LoopBB, ++Extract1->getIterator());
2961 
2962       buildVCopy(B, DstRegs[0], TmpReg0);
2963       buildVCopy(B, DstRegs[1], TmpReg1);
2964     }
2965 
2966     if (ShouldMoveIndexIntoLoop)
2967       reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
2968 
2969     return;
2970   }
2971   case AMDGPU::G_INSERT_VECTOR_ELT: {
2972     SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
2973 
2974     Register DstReg = MI.getOperand(0).getReg();
2975     LLT VecTy = MRI.getType(DstReg);
2976 
2977     assert(OpdMapper.getVRegs(0).empty());
2978     assert(OpdMapper.getVRegs(3).empty());
2979 
2980     if (substituteSimpleCopyRegs(OpdMapper, 1))
2981       MRI.setType(MI.getOperand(1).getReg(), VecTy);
2982 
2983     if (foldInsertEltToCmpSelect(B, MI, OpdMapper))
2984       return;
2985 
2986     const RegisterBank *IdxBank =
2987       OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
2988 
2989     Register SrcReg = MI.getOperand(1).getReg();
2990     Register InsReg = MI.getOperand(2).getReg();
2991     LLT InsTy = MRI.getType(InsReg);
2992     (void)InsTy;
2993 
2994     Register BaseIdxReg;
2995     unsigned ConstOffset;
2996     std::tie(BaseIdxReg, ConstOffset) =
2997         AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(3).getReg());
2998 
2999     // See if the index is an add of a constant, which we can fold by
3000     // moving just the base register of the index into the waterfall loop
3001     // later. This essentially reassociates the add of the constant with
3002     // the readfirstlane.
3003     bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
3004       ConstOffset > 0 &&
3005       ConstOffset < VecTy.getNumElements();
3006 
3007     // Move the base register. We'll re-insert the add later.
3008     if (ShouldMoveIndexIntoLoop)
3009       MI.getOperand(3).setReg(BaseIdxReg);
3010 
3011 
3012     if (InsRegs.empty()) {
3013       executeInWaterfallLoop(B, MI, {3});
3014 
3015       // Re-insert the constant offset add inside the waterfall loop.
3016       if (ShouldMoveIndexIntoLoop) {
3017         reinsertVectorIndexAdd(B, MI, 3, ConstOffset);
3018       }
3019 
3020       return;
3021     }
3022 
3023     assert(InsTy.getSizeInBits() == 64);
3024 
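    // Illustrative sketch: an s64 element insert becomes two chained s32
    // inserts on a bitcast vector, e.g. for a <2 x s64> vector:
    //   %v32:_(<4 x s32>) = G_BITCAST %vec
    //   %t:_(<4 x s32>) = G_INSERT_VECTOR_ELT %v32, %ins.lo, 2 * %idx
    //   %r:_(<4 x s32>) = G_INSERT_VECTOR_ELT %t, %ins.hi, 2 * %idx + 1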
3025     const LLT S32 = LLT::scalar(32);
3026     LLT Vec32 = LLT::fixed_vector(2 * VecTy.getNumElements(), 32);
3027 
3028     auto CastSrc = B.buildBitcast(Vec32, SrcReg);
3029     auto One = B.buildConstant(S32, 1);
3030 
3031     // Split the vector index into 32-bit pieces. Prepare to move all of the
3032     // new instructions into a waterfall loop if necessary.
3033     //
3034     // Don't put the bitcast or constant in the loop.
3035     MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB());
3036 
3037     // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
3038     auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
3039     auto IdxHi = B.buildAdd(S32, IdxLo, One);
3040 
3041     auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo);
3042     auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi);
3043 
3044     const RegisterBank *DstBank =
3045       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
3046     const RegisterBank *SrcBank =
3047       OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
3048     const RegisterBank *InsSrcBank =
3049       OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
3050 
3051     MRI.setRegBank(InsReg, *InsSrcBank);
3052     MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
3053     MRI.setRegBank(InsLo.getReg(0), *DstBank);
3054     MRI.setRegBank(InsHi.getReg(0), *DstBank);
3055     MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
3056     MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
3057     MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
3058 
3059 
3060     SmallSet<Register, 4> OpsToWaterfall;
3061     if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) {
3062       B.setInsertPt(B.getMBB(), MI);
3063       B.buildBitcast(DstReg, InsHi);
3064       MI.eraseFromParent();
3065       return;
3066     }
3067 
3068     B.setInstr(*Span.begin());
3069     MI.eraseFromParent();
3070 
3071     // Figure out the point after the waterfall loop before mangling the control
3072     // flow.
3073     executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
3074                            OpsToWaterfall);
3075 
3076     // The insertion point is now right after the original instruction.
3077     //
3078     // Keep the bitcast to the original vector type out of the loop. Doing
3079     // this saves an extra phi we don't need inside the loop.
3080     B.buildBitcast(DstReg, InsHi);
3081 
3082     // Re-insert the constant offset add inside the waterfall loop.
3083     if (ShouldMoveIndexIntoLoop)
3084       reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
3085 
3086     return;
3087   }
3088   case AMDGPU::G_AMDGPU_BUFFER_LOAD:
3089   case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
3090   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
3091   case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
3092   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
3093   case AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE:
3094   case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE:
3095   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT_TFE:
3096   case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE:
3097   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE_TFE:
3098   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
3099   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE:
3100   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
3101   case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
3102   case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
3103   case AMDGPU::G_AMDGPU_BUFFER_STORE:
3104   case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
3105   case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
3106   case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
3107   case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16:
3108   case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
3109   case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: {
3110     applyDefaultMapping(OpdMapper);
3111     executeInWaterfallLoop(B, MI, {1, 4});
3112     return;
3113   }
3114   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
3115   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
3116   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
3117   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
3118   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
3119   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
3120   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
3121   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
3122   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
3123   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
3124   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
3125   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
3126   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
3127   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
3128   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
3129     applyDefaultMapping(OpdMapper);
3130     executeInWaterfallLoop(B, MI, {2, 5});
3131     return;
3132   }
3133   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
3134     applyDefaultMapping(OpdMapper);
3135     executeInWaterfallLoop(B, MI, {3, 6});
3136     return;
3137   }
3138   case AMDGPU::G_AMDGPU_S_BUFFER_LOAD:
3139   case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE:
3140   case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE:
3141   case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT:
3142   case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT: {
3143     applyMappingSBufferLoad(B, OpdMapper);
3144     return;
3145   }
3146   case AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH:
3147     constrainOpWithReadfirstlane(B, MI, 0);
3148     constrainOpWithReadfirstlane(B, MI, 2);
3149     return;
3150   case AMDGPU::G_INTRINSIC:
3151   case AMDGPU::G_INTRINSIC_CONVERGENT: {
3152     switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
3153     case Intrinsic::amdgcn_readlane: {
3154       substituteSimpleCopyRegs(OpdMapper, 2);
3155 
3156       assert(OpdMapper.getVRegs(0).empty());
3157       assert(OpdMapper.getVRegs(3).empty());
3158 
3159       // Make sure the index is an SGPR. It doesn't make sense to run this in a
3160       // waterfall loop, so assume it's a uniform value.
3161       constrainOpWithReadfirstlane(B, MI, 3); // Index
3162       return;
3163     }
3164     case Intrinsic::amdgcn_writelane: {
3165       assert(OpdMapper.getVRegs(0).empty());
3166       assert(OpdMapper.getVRegs(2).empty());
3167       assert(OpdMapper.getVRegs(3).empty());
3168 
3169       substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val
3170       constrainOpWithReadfirstlane(B, MI, 2); // Source value
3171       constrainOpWithReadfirstlane(B, MI, 3); // Index
3172       return;
3173     }
3174     case Intrinsic::amdgcn_interp_p1:
3175     case Intrinsic::amdgcn_interp_p2:
3176     case Intrinsic::amdgcn_interp_mov:
3177     case Intrinsic::amdgcn_interp_p1_f16:
3178     case Intrinsic::amdgcn_interp_p2_f16:
3179     case Intrinsic::amdgcn_lds_param_load: {
3180       applyDefaultMapping(OpdMapper);
3181 
3182       // Readlane for m0 value, which is always the last operand.
3183       // FIXME: Should this be a waterfall loop instead?
3184       constrainOpWithReadfirstlane(B, MI, MI.getNumOperands() - 1); // Index
3185       return;
3186     }
3187     case Intrinsic::amdgcn_interp_inreg_p10:
3188     case Intrinsic::amdgcn_interp_inreg_p2:
3189     case Intrinsic::amdgcn_interp_inreg_p10_f16:
3190     case Intrinsic::amdgcn_interp_inreg_p2_f16:
3191     case Intrinsic::amdgcn_interp_p10_rtz_f16:
3192     case Intrinsic::amdgcn_interp_p2_rtz_f16:
3193     case Intrinsic::amdgcn_permlane16_swap:
3194     case Intrinsic::amdgcn_permlane32_swap:
3195       applyDefaultMapping(OpdMapper);
3196       return;
3197     case Intrinsic::amdgcn_permlane16:
3198     case Intrinsic::amdgcn_permlanex16: {
3199       // Doing a waterfall loop over these wouldn't make any sense.
3200       substituteSimpleCopyRegs(OpdMapper, 2);
3201       substituteSimpleCopyRegs(OpdMapper, 3);
3202       constrainOpWithReadfirstlane(B, MI, 4);
3203       constrainOpWithReadfirstlane(B, MI, 5);
3204       return;
3205     }
3206     case Intrinsic::amdgcn_sbfe:
3207       applyMappingBFE(B, OpdMapper, true);
3208       return;
3209     case Intrinsic::amdgcn_ubfe:
3210       applyMappingBFE(B, OpdMapper, false);
3211       return;
3212     case Intrinsic::amdgcn_inverse_ballot:
3213     case Intrinsic::amdgcn_s_bitreplicate:
3214     case Intrinsic::amdgcn_s_quadmask:
3215     case Intrinsic::amdgcn_s_wqm:
3216       applyDefaultMapping(OpdMapper);
3217       constrainOpWithReadfirstlane(B, MI, 2); // Mask
3218       return;
3219     case Intrinsic::amdgcn_ballot:
3220       // Use default handling and insert copy to vcc source.
3221       break;
3222     }
3223     break;
3224   }
3225   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
3226   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
3227   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET:
3228   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
3229   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
3230     const AMDGPU::RsrcIntrinsic *RSrcIntrin =
3231         AMDGPU::lookupRsrcIntrinsic(AMDGPU::getIntrinsicID(MI));
3232     assert(RSrcIntrin && RSrcIntrin->IsImage);
3233     // Non-images can have complications from operands that allow both SGPR
3234     // and VGPR. For now it's too complicated to figure out the final opcode
3235     // to derive the register bank from the MCInstrDesc.
3236     applyMappingImage(B, MI, OpdMapper, RSrcIntrin->RsrcArg);
3237     return;
3238   }
3239   case AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY:
3240   case AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY:
3241   case AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY: {
3242     bool IsDualOrBVH8 =
3243         MI.getOpcode() == AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY ||
3244         MI.getOpcode() == AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY;
3245     unsigned NumMods = IsDualOrBVH8 ? 0 : 1; // Has A16 modifier
3246     unsigned LastRegOpIdx = MI.getNumExplicitOperands() - 1 - NumMods;
3247     applyDefaultMapping(OpdMapper);
3248     executeInWaterfallLoop(B, MI, {LastRegOpIdx});
3249     return;
3250   }
3251   case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
3252   case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS: {
3253     auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
3254     switch (IntrID) {
3255     case Intrinsic::amdgcn_ds_ordered_add:
3256     case Intrinsic::amdgcn_ds_ordered_swap: {
3257       // This is only allowed to execute with 1 lane, so readfirstlane is safe.
3258       assert(OpdMapper.getVRegs(0).empty());
3259       substituteSimpleCopyRegs(OpdMapper, 3);
3260       constrainOpWithReadfirstlane(B, MI, 2); // M0
3261       return;
3262     }
3263     case Intrinsic::amdgcn_ds_gws_init:
3264     case Intrinsic::amdgcn_ds_gws_barrier:
3265     case Intrinsic::amdgcn_ds_gws_sema_br: {
3266       // Only the first lane executes, so readfirstlane is safe.
3267       substituteSimpleCopyRegs(OpdMapper, 1);
3268       constrainOpWithReadfirstlane(B, MI, 2); // M0
3269       return;
3270     }
3271     case Intrinsic::amdgcn_ds_gws_sema_v:
3272     case Intrinsic::amdgcn_ds_gws_sema_p:
3273     case Intrinsic::amdgcn_ds_gws_sema_release_all: {
3274       // Only the first lane executes, so readfirstlane is safe.
3275       constrainOpWithReadfirstlane(B, MI, 1); // M0
3276       return;
3277     }
3278     case Intrinsic::amdgcn_ds_append:
3279     case Intrinsic::amdgcn_ds_consume: {
3280       constrainOpWithReadfirstlane(B, MI, 2); // M0
3281       return;
3282     }
3283     case Intrinsic::amdgcn_s_sendmsg:
3284     case Intrinsic::amdgcn_s_sendmsghalt: {
3285       // FIXME: Should this use a waterfall loop?
3286       constrainOpWithReadfirstlane(B, MI, 2); // M0
3287       return;
3288     }
3289     case Intrinsic::amdgcn_s_setreg: {
3290       constrainOpWithReadfirstlane(B, MI, 2);
3291       return;
3292     }
3293     case Intrinsic::amdgcn_s_ttracedata:
3294       constrainOpWithReadfirstlane(B, MI, 1); // M0
3295       return;
3296     case Intrinsic::amdgcn_raw_buffer_load_lds:
3297     case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: {
3298       applyDefaultMapping(OpdMapper);
3299       constrainOpWithReadfirstlane(B, MI, 1); // rsrc
3300       constrainOpWithReadfirstlane(B, MI, 2); // M0
3301       constrainOpWithReadfirstlane(B, MI, 5); // soffset
3302       return;
3303     }
3304     case Intrinsic::amdgcn_struct_buffer_load_lds:
3305     case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
3306       applyDefaultMapping(OpdMapper);
3307       constrainOpWithReadfirstlane(B, MI, 1); // rsrc
3308       constrainOpWithReadfirstlane(B, MI, 2); // M0
3309       constrainOpWithReadfirstlane(B, MI, 6); // soffset
3310       return;
3311     }
3312     case Intrinsic::amdgcn_load_to_lds:
3313     case Intrinsic::amdgcn_global_load_lds: {
3314       applyDefaultMapping(OpdMapper);
3315       constrainOpWithReadfirstlane(B, MI, 2);
3316       return;
3317     }
3318     case Intrinsic::amdgcn_lds_direct_load: {
3319       applyDefaultMapping(OpdMapper);
3320       // Readlane for m0 value, which is always the last operand.
3321       constrainOpWithReadfirstlane(B, MI, MI.getNumOperands() - 1); // Index
3322       return;
3323     }
3324     case Intrinsic::amdgcn_exp_row:
3325       applyDefaultMapping(OpdMapper);
3326       constrainOpWithReadfirstlane(B, MI, 8); // M0
3327       return;
3328     case Intrinsic::amdgcn_s_sleep_var:
3329       assert(OpdMapper.getVRegs(1).empty());
3330       constrainOpWithReadfirstlane(B, MI, 1);
3331       return;
3332     case Intrinsic::amdgcn_s_barrier_signal_var:
3333       constrainOpWithReadfirstlane(B, MI, 1);
3334       constrainOpWithReadfirstlane(B, MI, 2);
3335       return;
3336     case Intrinsic::amdgcn_s_get_barrier_state:
3337     case Intrinsic::amdgcn_s_get_named_barrier_state: {
3338       constrainOpWithReadfirstlane(B, MI, 2);
3339       return;
3340     }
3341     case Intrinsic::amdgcn_s_prefetch_data: {
3342       Register PtrReg = MI.getOperand(1).getReg();
3343       unsigned AS = MRI.getType(PtrReg).getAddressSpace();
3344       if (AMDGPU::isFlatGlobalAddrSpace(AS)) {
3345         constrainOpWithReadfirstlane(B, MI, 1);
3346         constrainOpWithReadfirstlane(B, MI, 2);
3347       } else
3348         MI.eraseFromParent();
3349       return;
3350     }
3351     case Intrinsic::amdgcn_tensor_load_to_lds:
3352     case Intrinsic::amdgcn_tensor_store_from_lds: {
3353       constrainOpWithReadfirstlane(B, MI, 1);
3354       constrainOpWithReadfirstlane(B, MI, 2);
3355       constrainOpWithReadfirstlane(B, MI, 3);
3356       constrainOpWithReadfirstlane(B, MI, 4);
3357       return;
3358     }
3359     case Intrinsic::amdgcn_tensor_load_to_lds_d2:
3360     case Intrinsic::amdgcn_tensor_store_from_lds_d2: {
3361       constrainOpWithReadfirstlane(B, MI, 1);
3362       constrainOpWithReadfirstlane(B, MI, 2);
3363       return;
3364     }
3365     default: {
3366       if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
3367               AMDGPU::lookupRsrcIntrinsic(IntrID)) {
3368         // Non-images can have complications from operands that allow both SGPR
3369         // and VGPR. For now it's too complicated to figure out the final opcode
3370         // to derive the register bank from the MCInstrDesc.
3371         if (RSrcIntrin->IsImage) {
3372           applyMappingImage(B, MI, OpdMapper, RSrcIntrin->RsrcArg);
3373           return;
3374         }
3375       }
3376 
3377       break;
3378     }
3379     }
3380     break;
3381   }
3382   case AMDGPU::G_SI_CALL: {
3383     // Use a set to avoid extra readfirstlanes in the case where multiple
3384     // operands are the same register.
3385     SmallSet<Register, 4> SGPROperandRegs;
3386 
3387     if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, {1}))
3388       break;
3389 
3390     // Move all copies to physical SGPRs that are used by the call instruction
3391     // into the loop block. Search backwards from the call for these copies,
3392     // stopping at the ADJCALLSTACKUP.
3393     unsigned FrameSetupOpcode = AMDGPU::ADJCALLSTACKUP;
3394     unsigned FrameDestroyOpcode = AMDGPU::ADJCALLSTACKDOWN;
3395 
3396     // Move all non-copies before the copies, so that a complete range can be
3397     // moved into the waterfall loop.
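    // Illustrative block layout (made-up instructions) before the splices:
    //   ADJCALLSTACKUP ...
    //   %tmp = ...               ; non-copy, hoisted above the copies below
    //   $sgpr4 = COPY %arg0      ; copies feeding the call
    //   $sgpr5 = COPY %arg1
    //   G_SI_CALL ...
    // Afterwards the copies form one contiguous range ending at the call, so
    // the whole range can be wrapped in the waterfall loop.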
3398     SmallVector<MachineInstr *, 4> NonCopyInstrs;
3399     // Count of NonCopyInstrs found until the current LastCopy.
3400     unsigned NonCopyInstrsLen = 0;
3401     MachineBasicBlock::iterator Start(&MI);
3402     MachineBasicBlock::iterator LastCopy = Start;
3403     MachineBasicBlock *MBB = MI.getParent();
3404     const SIMachineFunctionInfo *Info =
3405         MBB->getParent()->getInfo<SIMachineFunctionInfo>();
3406     while (Start->getOpcode() != FrameSetupOpcode) {
3407       --Start;
3408       bool IsCopy = false;
3409       if (Start->getOpcode() == AMDGPU::COPY) {
3410         auto &Dst = Start->getOperand(0);
3411         if (Dst.isReg()) {
3412           Register Reg = Dst.getReg();
3413           if (Reg.isPhysical() && MI.readsRegister(Reg, TRI)) {
3414             IsCopy = true;
3415           } else {
3416             // Also move the copy from the scratch rsrc descriptor into the loop
3417             // to allow it to be optimized away.
3418             auto &Src = Start->getOperand(1);
3419             if (Src.isReg()) {
3420               Reg = Src.getReg();
3421               IsCopy = Info->getScratchRSrcReg() == Reg;
3422             }
3423           }
3424         }
3425       }
3426 
3427       if (IsCopy) {
3428         LastCopy = Start;
3429         NonCopyInstrsLen = NonCopyInstrs.size();
3430       } else {
3431         NonCopyInstrs.push_back(&*Start);
3432       }
3433     }
3434     NonCopyInstrs.resize(NonCopyInstrsLen);
3435 
3436     for (auto *NonCopy : reverse(NonCopyInstrs)) {
3437       MBB->splice(LastCopy, MBB, NonCopy->getIterator());
3438     }
3439     Start = LastCopy;
3440 
3441     // Do the same for copies after the loop
3442     NonCopyInstrs.clear();
3443     NonCopyInstrsLen = 0;
3444     MachineBasicBlock::iterator End(&MI);
3445     LastCopy = End;
3446     while (End->getOpcode() != FrameDestroyOpcode) {
3447       ++End;
3448       bool IsCopy = false;
3449       if (End->getOpcode() == AMDGPU::COPY) {
3450         auto &Src = End->getOperand(1);
3451         if (Src.isReg()) {
3452           Register Reg = Src.getReg();
3453           IsCopy = Reg.isPhysical() && MI.modifiesRegister(Reg, TRI);
3454         }
3455       }
3456 
3457       if (IsCopy) {
3458         LastCopy = End;
3459         NonCopyInstrsLen = NonCopyInstrs.size();
3460       } else {
3461         NonCopyInstrs.push_back(&*End);
3462       }
3463     }
3464     NonCopyInstrs.resize(NonCopyInstrsLen);
3465 
3466     End = LastCopy;
3467     ++LastCopy;
3468     for (auto *NonCopy : reverse(NonCopyInstrs)) {
3469       MBB->splice(LastCopy, MBB, NonCopy->getIterator());
3470     }
3471 
3472     ++End;
3473     B.setInsertPt(B.getMBB(), Start);
3474     executeInWaterfallLoop(B, make_range(Start, End), SGPROperandRegs);
3475     break;
3476   }
3477   case AMDGPU::G_LOAD:
3478   case AMDGPU::G_ZEXTLOAD:
3479   case AMDGPU::G_SEXTLOAD: {
3480     if (applyMappingLoad(B, OpdMapper, MI))
3481       return;
3482     break;
3483   }
3484   case AMDGPU::G_DYN_STACKALLOC:
3485     applyMappingDynStackAlloc(B, OpdMapper, MI);
3486     return;
3487   case AMDGPU::G_STACKRESTORE: {
3488     applyDefaultMapping(OpdMapper);
3489     constrainOpWithReadfirstlane(B, MI, 0);
3490     return;
3491   }
3492   case AMDGPU::G_SBFX:
3493     applyMappingBFE(B, OpdMapper, /*Signed*/ true);
3494     return;
3495   case AMDGPU::G_UBFX:
3496     applyMappingBFE(B, OpdMapper, /*Signed*/ false);
3497     return;
3498   case AMDGPU::G_AMDGPU_MAD_U64_U32:
3499   case AMDGPU::G_AMDGPU_MAD_I64_I32:
3500     applyMappingMAD_64_32(B, OpdMapper);
3501     return;
3502   case AMDGPU::G_PREFETCH: {
3503     if (!Subtarget.hasPrefetch() || !Subtarget.hasSafeSmemPrefetch()) {
3504       MI.eraseFromParent();
3505       return;
3506     }
3507     Register PtrReg = MI.getOperand(0).getReg();
3508     unsigned PtrBank = getRegBankID(PtrReg, MRI, AMDGPU::SGPRRegBankID);
3509     if (PtrBank == AMDGPU::VGPRRegBankID) {
3510       MI.eraseFromParent();
3511       return;
3512     }
3513     unsigned AS = MRI.getType(PtrReg).getAddressSpace();
3514     if (!AMDGPU::isFlatGlobalAddrSpace(AS) &&
3515         AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
3516       MI.eraseFromParent();
3517       return;
3518     }
3519     applyDefaultMapping(OpdMapper);
3520     return;
3521   }
3522   default:
3523     break;
3524   }
3525 
3526   return applyDefaultMapping(OpdMapper);
3527 }
3528 
3529 // vgpr, sgpr -> vgpr
3530 // vgpr, agpr -> vgpr
3531 // agpr, agpr -> agpr
3532 // agpr, sgpr -> vgpr
3533 static unsigned regBankUnion(unsigned RB0, unsigned RB1) {
3534   if (RB0 == AMDGPU::InvalidRegBankID)
3535     return RB1;
3536   if (RB1 == AMDGPU::InvalidRegBankID)
3537     return RB0;
3538 
3539   if (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID)
3540     return AMDGPU::SGPRRegBankID;
3541 
3542   if (RB0 == AMDGPU::AGPRRegBankID && RB1 == AMDGPU::AGPRRegBankID)
3543     return AMDGPU::AGPRRegBankID;
3544 
3545   return AMDGPU::VGPRRegBankID;
3546 }
3547 
3548 static unsigned regBankBoolUnion(unsigned RB0, unsigned RB1) {
3549   if (RB0 == AMDGPU::InvalidRegBankID)
3550     return RB1;
3551   if (RB1 == AMDGPU::InvalidRegBankID)
3552     return RB0;
3553 
3554   // vcc, vcc -> vcc
3555   // vcc, sgpr -> vcc
3556   // vcc, vgpr -> vcc
3557   if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID)
3558     return AMDGPU::VCCRegBankID;
3559 
3560   // Remaining sgpr/vgpr/agpr combinations use the non-boolean union.
3561   return regBankUnion(RB0, RB1);
3562 }
3563 
3564 unsigned AMDGPURegisterBankInfo::getMappingType(const MachineRegisterInfo &MRI,
3565                                                 const MachineInstr &MI) const {
3566   unsigned RegBank = AMDGPU::InvalidRegBankID;
3567 
3568   for (const MachineOperand &MO : MI.operands()) {
3569     if (!MO.isReg())
3570       continue;
3571     Register Reg = MO.getReg();
3572     if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3573       RegBank = regBankUnion(RegBank, Bank->getID());
3574       if (RegBank == AMDGPU::VGPRRegBankID)
3575         break;
3576     }
3577   }
3578 
3579   return RegBank;
3580 }
3581 
3582 bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
3583   const MachineFunction &MF = *MI.getParent()->getParent();
3584   const MachineRegisterInfo &MRI = MF.getRegInfo();
3585   for (const MachineOperand &MO : MI.operands()) {
3586     if (!MO.isReg())
3587       continue;
3588     Register Reg = MO.getReg();
3589     if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3590       if (Bank->getID() != AMDGPU::SGPRRegBankID)
3591         return false;
3592     }
3593   }
3594   return true;
3595 }
3596 
3597 const RegisterBankInfo::InstructionMapping &
3598 AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
3599   const MachineFunction &MF = *MI.getParent()->getParent();
3600   const MachineRegisterInfo &MRI = MF.getRegInfo();
3601   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3602 
3603   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3604     const MachineOperand &SrcOp = MI.getOperand(i);
3605     if (!SrcOp.isReg())
3606       continue;
3607 
3608     unsigned Size = getSizeInBits(SrcOp.getReg(), MRI, *TRI);
3609     OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3610   }
3611   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3612                                MI.getNumOperands());
3613 }
3614 
3615 const RegisterBankInfo::InstructionMapping &
3616 AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
3617   const MachineFunction &MF = *MI.getParent()->getParent();
3618   const MachineRegisterInfo &MRI = MF.getRegInfo();
3619   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3620 
3621   // Even though we technically could use SGPRs, this would require knowledge of
3622   // the constant bus restriction. Force all sources to VGPR (except for VCC).
3623   //
3624   // TODO: Unary ops are trivially OK, so accept SGPRs?
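  // For example (illustrative): %d:_(s32) = G_FADD %a, %b is mapped with both
  // sources as VGPR even if %a happens to be uniform; applying the mapping
  // then inserts the SGPR->VGPR copy, sidestepping constant bus bookkeeping.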
3625   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3626     const MachineOperand &Src = MI.getOperand(i);
3627     if (!Src.isReg())
3628       continue;
3629 
3630     unsigned Size = getSizeInBits(Src.getReg(), MRI, *TRI);
3631     unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
3632     OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
3633   }
3634 
3635   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3636                                MI.getNumOperands());
3637 }
3638 
3639 const RegisterBankInfo::InstructionMapping &
3640 AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
3641   const MachineFunction &MF = *MI.getParent()->getParent();
3642   const MachineRegisterInfo &MRI = MF.getRegInfo();
3643   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3644 
3645   for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
3646     const MachineOperand &Op = MI.getOperand(I);
3647     if (!Op.isReg())
3648       continue;
3649 
3650     unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI);
3651     OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3652   }
3653 
3654   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3655                                MI.getNumOperands());
3656 }
3657 
3658 const RegisterBankInfo::InstructionMapping &
3659 AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI,
3660                                         const MachineInstr &MI,
3661                                         int RsrcIdx) const {
3662   // The reported argument index is relative to the IR intrinsic call arguments,
3663   // so we need to shift by the number of defs and the intrinsic ID.
3664   RsrcIdx += MI.getNumExplicitDefs() + 1;
3665 
3666   const int NumOps = MI.getNumOperands();
3667   SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps);
3668 
3669   // TODO: Should packed/unpacked D16 difference be reported here as part of
3670   // the value mapping?
3671   for (int I = 0; I != NumOps; ++I) {
3672     if (!MI.getOperand(I).isReg())
3673       continue;
3674 
3675     Register OpReg = MI.getOperand(I).getReg();
3676     // We replace some dead address operands with $noreg
3677     if (!OpReg)
3678       continue;
3679 
3680     unsigned Size = getSizeInBits(OpReg, MRI, *TRI);
3681 
3682     // FIXME: Probably need a new intrinsic register bank searchable table to
3683     // handle arbitrary intrinsics easily.
3684     //
3685     // If this has a sampler, it immediately follows rsrc.
3686     const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1;
3687 
3688     if (MustBeSGPR) {
3689       // This must be an SGPR, so we must report whatever it is as legal.
3690       unsigned NewBank = getRegBankID(OpReg, MRI, AMDGPU::SGPRRegBankID);
3691       OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size);
3692     } else {
3693       // Some operands must be VGPR, and these are easy to copy to.
3694       OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3695     }
3696   }
3697 
3698   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps);
3699 }
3700 
3701 /// Return the mapping for a pointer argument.
3702 const RegisterBankInfo::ValueMapping *
3703 AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI,
3704                                               Register PtrReg) const {
3705   LLT PtrTy = MRI.getType(PtrReg);
3706   unsigned Size = PtrTy.getSizeInBits();
3707   if (Subtarget.useFlatForGlobal() ||
3708       !AMDGPU::isFlatGlobalAddrSpace(PtrTy.getAddressSpace()))
3709     return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3710 
3711   // If we're using MUBUF instructions for global memory, an SGPR base register
3712   // is possible. Otherwise this needs to be a VGPR.
3713   const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3714   return AMDGPU::getValueMapping(PtrBank->getID(), Size);
3715 }
3716 
3717 const RegisterBankInfo::InstructionMapping &
3718 AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
3719 
3720   const MachineFunction &MF = *MI.getParent()->getParent();
3721   const MachineRegisterInfo &MRI = MF.getRegInfo();
3722   SmallVector<const ValueMapping*, 2> OpdsMapping(2);
3723   unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3724   Register PtrReg = MI.getOperand(1).getReg();
3725   LLT PtrTy = MRI.getType(PtrReg);
3726   unsigned AS = PtrTy.getAddressSpace();
3727   unsigned PtrSize = PtrTy.getSizeInBits();
3728 
3729   const ValueMapping *ValMapping;
3730   const ValueMapping *PtrMapping;
3731 
3732   const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3733 
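  // Illustrative outcomes: a uniform global load that passes isScalarLoadLegal
  // maps both value and pointer to SGPR (an SMRD load); a divergent pointer
  // forces the all-VGPR mapping instead.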
3734   if (PtrBank == &AMDGPU::SGPRRegBank && AMDGPU::isFlatGlobalAddrSpace(AS)) {
3735     if (isScalarLoadLegal(MI)) {
3736       // We have a uniform instruction so we want to use an SMRD load
3737       ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3738       PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
3739     } else {
3740       ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3741 
3742       // If we're using MUBUF instructions for global memory, an SGPR base
3743       // register is possible. Otherwise this needs to be a VGPR.
3744       unsigned PtrBankID = Subtarget.useFlatForGlobal() ?
3745         AMDGPU::VGPRRegBankID : AMDGPU::SGPRRegBankID;
3746 
3747       PtrMapping = AMDGPU::getValueMapping(PtrBankID, PtrSize);
3748     }
3749   } else {
3750     ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3751     PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
3752   }
3753 
3754   OpdsMapping[0] = ValMapping;
3755   OpdsMapping[1] = PtrMapping;
3756   const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
3757       1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands());
3758   return Mapping;
3759 
3760   // FIXME: Do we want to add a mapping for FLAT load, or should we just
3761   // handle that during instruction selection?
3762 }
3763 
3764 unsigned
3765 AMDGPURegisterBankInfo::getRegBankID(Register Reg,
3766                                      const MachineRegisterInfo &MRI,
3767                                      unsigned Default) const {
3768   const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
3769   return Bank ? Bank->getID() : Default;
3770 }
3771 
3772 const RegisterBankInfo::ValueMapping *
3773 AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg,
3774                                          const MachineRegisterInfo &MRI,
3775                                          const TargetRegisterInfo &TRI) const {
3776   // Lie and claim anything is legal, even though this needs to be an SGPR;
3777   // applyMapping will have to deal with it as a waterfall loop.
3778   unsigned Bank = getRegBankID(Reg, MRI, AMDGPU::SGPRRegBankID);
3779   unsigned Size = getSizeInBits(Reg, MRI, TRI);
3780   return AMDGPU::getValueMapping(Bank, Size);
3781 }
3782 
3783 const RegisterBankInfo::ValueMapping *
3784 AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
3785                                          const MachineRegisterInfo &MRI,
3786                                          const TargetRegisterInfo &TRI) const {
3787   unsigned Size = getSizeInBits(Reg, MRI, TRI);
3788   return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3789 }
3790 
3791 const RegisterBankInfo::ValueMapping *
3792 AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg,
3793                                          const MachineRegisterInfo &MRI,
3794                                          const TargetRegisterInfo &TRI) const {
3795   unsigned Size = getSizeInBits(Reg, MRI, TRI);
3796   return AMDGPU::getValueMapping(AMDGPU::AGPRRegBankID, Size);
3797 }
3798 
3799 ///
3800 /// This function must return a legal mapping, because
3801 /// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
3802 /// in RegBankSelect::Mode::Fast.  Any mapping that would cause a
3803 /// VGPR-to-SGPR copy to be generated is illegal.
3804 ///
3805 // Operands that must be SGPRs must accept potentially divergent VGPRs as
3806 // legal. These will be dealt with in applyMappingImpl.
3807 //
3808 const RegisterBankInfo::InstructionMapping &
3809 AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
3810   const MachineFunction &MF = *MI.getParent()->getParent();
3811   const MachineRegisterInfo &MRI = MF.getRegInfo();
3812 
3813   if (MI.isCopy() || MI.getOpcode() == AMDGPU::G_FREEZE) {
3814     Register DstReg = MI.getOperand(0).getReg();
3815     Register SrcReg = MI.getOperand(1).getReg();
3816 
3817     // The default logic bothers to analyze impossible alternative mappings. We
3818     // want the most straightforward mapping, so just directly handle this.
3819     const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI);
3820     const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
3821     assert(SrcBank && "src bank should have been assigned already");
3822 
3823     // For COPY between a physical reg and an s1, there is no type associated so
3824     // we need to take the virtual register's type as a hint on how to interpret
3825     // s1 values.
3826     if (!SrcReg.isVirtual() && !DstBank &&
3827         MRI.getType(DstReg) == LLT::scalar(1))
3828       DstBank = &AMDGPU::VCCRegBank;
3829     else if (!DstReg.isVirtual() && MRI.getType(SrcReg) == LLT::scalar(1))
3830       DstBank = &AMDGPU::VCCRegBank;
3831 
3832     if (!DstBank)
3833       DstBank = SrcBank;
3834 
3835     unsigned Size = getSizeInBits(DstReg, MRI, *TRI);
3836     if (MI.getOpcode() != AMDGPU::G_FREEZE &&
3837         cannotCopy(*DstBank, *SrcBank, TypeSize::getFixed(Size)))
3838       return getInvalidInstructionMapping();
3839 
3840     const ValueMapping &ValMap = getValueMapping(0, Size, *DstBank);
3841     unsigned OpdsMappingSize = MI.isCopy() ? 1 : 2;
3842     SmallVector<const ValueMapping *, 1> OpdsMapping(OpdsMappingSize);
3843     OpdsMapping[0] = &ValMap;
3844     if (MI.getOpcode() == AMDGPU::G_FREEZE)
3845       OpdsMapping[1] = &ValMap;
3846 
3847     return getInstructionMapping(
3848         1, /*Cost*/ 1,
3849         /*OperandsMapping*/ getOperandsMapping(OpdsMapping), OpdsMappingSize);
3850   }
3851 
3852   if (MI.isRegSequence()) {
3853     // If any input is a VGPR, the result must be a VGPR. The default handling
3854     // assumes any copy between banks is legal.
3855     unsigned BankID = AMDGPU::SGPRRegBankID;
3856 
3857     for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3858       auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI);
3859       // It doesn't make sense to use vcc or scc banks here, so just ignore
3860       // them.
3861       if (OpBank != AMDGPU::SGPRRegBankID) {
3862         BankID = AMDGPU::VGPRRegBankID;
3863         break;
3864       }
3865     }
3866     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3867 
3868     const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID));
3869     return getInstructionMapping(
3870         1, /*Cost*/ 1,
3871         /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3872   }
3873 
3874   // The default handling is broken and doesn't handle illegal SGPR->VGPR copies
3875   // properly.
3876   //
3877   // TODO: There are additional exec masking dependencies to analyze.
3878   if (auto *PHI = dyn_cast<GPhi>(&MI)) {
3879     unsigned ResultBank = AMDGPU::InvalidRegBankID;
3880     Register DstReg = PHI->getReg(0);
3881 
3882     // Sometimes the result may have already been assigned a bank.
3883     if (const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI))
3884       ResultBank = DstBank->getID();
3885 
3886     for (unsigned I = 0; I < PHI->getNumIncomingValues(); ++I) {
3887       Register Reg = PHI->getIncomingValue(I);
3888       const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
3889 
3890       // FIXME: Assuming VGPR for any undetermined inputs.
3891       if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) {
3892         ResultBank = AMDGPU::VGPRRegBankID;
3893         break;
3894       }
3895 
3896       // FIXME: Need to promote SGPR case to s32
3897       unsigned OpBank = Bank->getID();
3898       ResultBank = regBankBoolUnion(ResultBank, OpBank);
3899     }
3900 
3901     assert(ResultBank != AMDGPU::InvalidRegBankID);
3902 
3903     unsigned Size = MRI.getType(DstReg).getSizeInBits();
3904 
3905     const ValueMapping &ValMap =
3906         getValueMapping(0, Size, getRegBank(ResultBank));
3907     return getInstructionMapping(
3908         1, /*Cost*/ 1,
3909         /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3910   }
3911 
3912   const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
3913   if (Mapping.isValid())
3914     return Mapping;
3915 
3916   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3917 
3918   switch (MI.getOpcode()) {
3919   default:
3920     return getInvalidInstructionMapping();
3921 
3922   case AMDGPU::G_AND:
3923   case AMDGPU::G_OR:
3924   case AMDGPU::G_XOR:
3925   case AMDGPU::G_MUL: {
3926     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3927     if (Size == 1) {
3928       const RegisterBank *DstBank
3929         = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI);
3930 
3931       unsigned TargetBankID = AMDGPU::InvalidRegBankID;
3932       unsigned BankLHS = AMDGPU::InvalidRegBankID;
3933       unsigned BankRHS = AMDGPU::InvalidRegBankID;
3934       if (DstBank) {
3935         TargetBankID = DstBank->getID();
3936         if (DstBank == &AMDGPU::VCCRegBank) {
3937           TargetBankID = AMDGPU::VCCRegBankID;
3938           BankLHS = AMDGPU::VCCRegBankID;
3939           BankRHS = AMDGPU::VCCRegBankID;
3940         } else {
3941           BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
3942                                  AMDGPU::SGPRRegBankID);
3943           BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
3944                                  AMDGPU::SGPRRegBankID);
3945         }
3946       } else {
3947         BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
3948                                AMDGPU::VCCRegBankID);
3949         BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
3950                                AMDGPU::VCCRegBankID);
3951 
3952         // Both inputs should be true booleans to produce a boolean result.
3953         if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) {
3954           TargetBankID = AMDGPU::VGPRRegBankID;
3955         } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) {
3956           TargetBankID = AMDGPU::VCCRegBankID;
3957           BankLHS = AMDGPU::VCCRegBankID;
3958           BankRHS = AMDGPU::VCCRegBankID;
3959         } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) {
3960           TargetBankID = AMDGPU::SGPRRegBankID;
3961         }
3962       }
3963 
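      // For example (illustrative): an s1 G_AND of two vcc-bank compare
      // results stays on the VCC bank (a lane-mask AND), while an s1 G_AND of
      // two SGPR booleans keeps the SGPR mapping.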
3964       OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size);
3965       OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size);
3966       OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size);
3967       break;
3968     }
3969 
3970     if (Size == 64) {
3971 
3972       if (isSALUMapping(MI)) {
3973         OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size);
3974         OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
3975       } else {
3976         OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
3977         unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI /*, DefaultBankID*/);
3978         OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size);
3979 
3980         unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI /*, DefaultBankID*/);
3981         OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size);
3982       }
3983 
3984       break;
3985     }
3986 
3987     [[fallthrough]];
3988   }
3989   case AMDGPU::G_PTR_ADD:
3990   case AMDGPU::G_PTRMASK:
3991   case AMDGPU::G_ADD:
3992   case AMDGPU::G_SUB:
3993   case AMDGPU::G_SHL:
3994   case AMDGPU::G_LSHR:
3995   case AMDGPU::G_ASHR:
3996   case AMDGPU::G_UADDO:
3997   case AMDGPU::G_USUBO:
3998   case AMDGPU::G_UADDE:
3999   case AMDGPU::G_SADDE:
4000   case AMDGPU::G_USUBE:
4001   case AMDGPU::G_SSUBE:
4002   case AMDGPU::G_SMIN:
4003   case AMDGPU::G_SMAX:
4004   case AMDGPU::G_UMIN:
4005   case AMDGPU::G_UMAX:
4006   case AMDGPU::G_ABS:
4007   case AMDGPU::G_SHUFFLE_VECTOR:
4008   case AMDGPU::G_SBFX:
4009   case AMDGPU::G_UBFX:
4010   case AMDGPU::G_AMDGPU_S_MUL_I64_I32:
4011   case AMDGPU::G_AMDGPU_S_MUL_U64_U32:
4012     if (isSALUMapping(MI))
4013       return getDefaultMappingSOP(MI);
4014     return getDefaultMappingVOP(MI);
4015   case AMDGPU::G_FADD:
4016   case AMDGPU::G_FSUB:
4017   case AMDGPU::G_FMUL:
4018   case AMDGPU::G_FMA:
4019   case AMDGPU::G_FFLOOR:
4020   case AMDGPU::G_FCEIL:
4021   case AMDGPU::G_INTRINSIC_ROUNDEVEN:
4022   case AMDGPU::G_FMINNUM:
4023   case AMDGPU::G_FMAXNUM:
4024   case AMDGPU::G_FMINIMUM:
4025   case AMDGPU::G_FMAXIMUM:
4026   case AMDGPU::G_FMINIMUMNUM:
4027   case AMDGPU::G_FMAXIMUMNUM:
4028   case AMDGPU::G_INTRINSIC_TRUNC:
4029   case AMDGPU::G_STRICT_FADD:
4030   case AMDGPU::G_STRICT_FSUB:
4031   case AMDGPU::G_STRICT_FMUL:
4032   case AMDGPU::G_STRICT_FMA: {
4033     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
4034     unsigned Size = Ty.getSizeInBits();
4035     if (Subtarget.hasSALUFloatInsts() && Ty.isScalar() &&
4036         (Size == 32 || Size == 16) && isSALUMapping(MI))
4037       return getDefaultMappingSOP(MI);
4038     return getDefaultMappingVOP(MI);
4039   }
4040   case AMDGPU::G_FPTOSI:
4041   case AMDGPU::G_FPTOUI:
4042   case AMDGPU::G_SITOFP:
4043   case AMDGPU::G_UITOFP: {
4044     unsigned SizeDst = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4045     unsigned SizeSrc = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4046     if (Subtarget.hasSALUFloatInsts() && SizeDst == 32 && SizeSrc == 32 &&
4047         isSALUMapping(MI))
4048       return getDefaultMappingSOP(MI);
4049     return getDefaultMappingVOP(MI);
4050   }
4051   case AMDGPU::G_FPTRUNC:
4052   case AMDGPU::G_FPEXT: {
4053     unsigned SizeDst = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4054     unsigned SizeSrc = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4055     if (Subtarget.hasSALUFloatInsts() && SizeDst != 64 && SizeSrc != 64 &&
4056         isSALUMapping(MI))
4057       return getDefaultMappingSOP(MI);
4058     return getDefaultMappingVOP(MI);
4059   }
4060   case AMDGPU::G_FSQRT:
4061   case AMDGPU::G_FEXP2:
4062   case AMDGPU::G_FLOG2: {
4063     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4064     if (Subtarget.hasPseudoScalarTrans() && (Size == 16 || Size == 32) &&
4065         isSALUMapping(MI))
4066       return getDefaultMappingSOP(MI);
4067     return getDefaultMappingVOP(MI);
4068   }
4069   case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU
4070   case AMDGPU::G_SSUBSAT:
4071   case AMDGPU::G_UADDSAT:
4072   case AMDGPU::G_USUBSAT:
4073   case AMDGPU::G_FMAD:
4074   case AMDGPU::G_FLDEXP:
4075   case AMDGPU::G_FMINNUM_IEEE:
4076   case AMDGPU::G_FMAXNUM_IEEE:
4077   case AMDGPU::G_FCANONICALIZE:
4078   case AMDGPU::G_STRICT_FLDEXP:
4079   case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar?
4080   case AMDGPU::G_FSHR: // TODO: Expand for scalar
4081   case AMDGPU::G_AMDGPU_FMIN_LEGACY:
4082   case AMDGPU::G_AMDGPU_FMAX_LEGACY:
4083   case AMDGPU::G_AMDGPU_RCP_IFLAG:
4084   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
4085   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
4086   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
4087   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
4088   case AMDGPU::G_AMDGPU_CVT_PK_I16_I32:
4089   case AMDGPU::G_AMDGPU_SMED3:
4090   case AMDGPU::G_AMDGPU_FMED3:
4091     return getDefaultMappingVOP(MI);
4092   case AMDGPU::G_UMULH:
4093   case AMDGPU::G_SMULH: {
4094     if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI))
4095       return getDefaultMappingSOP(MI);
4096     return getDefaultMappingVOP(MI);
4097   }
4098   case AMDGPU::G_AMDGPU_MAD_U64_U32:
4099   case AMDGPU::G_AMDGPU_MAD_I64_I32: {
4100     // Three possible mappings:
4101     //
4102     //  - Default SOP
4103     //  - Default VOP
4104     //  - Scalar multiply: src0 and src1 are SGPRs, the rest is VOP.
4105     //
4106     // This allows instruction selection to keep the multiplication part of the
4107     // instruction on the SALU.
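         //
         // Operands as mapped below: 0 is the 64-bit result, 1 is the 1-bit
         // carry-out, 2 and 3 are the 32-bit multiplicands, and 4 is the
         // 64-bit addend.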
4108     bool AllSalu = true;
4109     bool MulSalu = true;
4110     for (unsigned i = 0; i < 5; ++i) {
4111       Register Reg = MI.getOperand(i).getReg();
4112       if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
4113         if (Bank->getID() != AMDGPU::SGPRRegBankID) {
4114           AllSalu = false;
4115           if (i == 2 || i == 3) {
4116             MulSalu = false;
4117             break;
4118           }
4119         }
4120       }
4121     }
4122 
4123     if (AllSalu)
4124       return getDefaultMappingSOP(MI);
4125 
4126     // If the multiply-add is full-rate in VALU, use that even if the
4127     // multiplication part is scalar. Accumulating separately on the VALU would
4128     // take two instructions.
4129     if (!MulSalu || Subtarget.hasFullRate64Ops())
4130       return getDefaultMappingVOP(MI);
4131 
4132     // Keep the multiplication on the SALU, then accumulate on the VALU.
4133     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64);
4134     OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4135     OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
4136     OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
4137     OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64);
4138     break;
4139   }
4140   case AMDGPU::G_IMPLICIT_DEF: {
4141     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4142     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4143     break;
4144   }
4145   case AMDGPU::G_FCONSTANT:
4146   case AMDGPU::G_CONSTANT:
4147   case AMDGPU::G_GLOBAL_VALUE:
4148   case AMDGPU::G_FRAME_INDEX:
4149   case AMDGPU::G_BLOCK_ADDR:
4150   case AMDGPU::G_READSTEADYCOUNTER:
4151   case AMDGPU::G_READCYCLECOUNTER: {
4152     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4153     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4154     break;
4155   }
4156   case AMDGPU::G_DYN_STACKALLOC: {
4157     // Result is always uniform, and a wave reduction is needed for the source.
4158     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
4159     unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
4160     OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, 32);
4161     break;
4162   }
4163   case AMDGPU::G_AMDGPU_WAVE_ADDRESS: {
4164     // This case is weird because we expect a physical register in the source,
4165     // but need to set a bank anyway.
4166     //
4167     // TODO: We could select the result to SGPR or VGPR
4168     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
4169     OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
4170     break;
4171   }
4172   case AMDGPU::G_INSERT: {
4173     unsigned BankID = getMappingType(MRI, MI);
4174     unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4175     unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4176     unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
4177     OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
4178     OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
4179     OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize);
4180     OpdsMapping[3] = nullptr;
4181     break;
4182   }
4183   case AMDGPU::G_EXTRACT: {
4184     unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
4185     unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4186     unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4187     OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
4188     OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
4189     OpdsMapping[2] = nullptr;
4190     break;
4191   }
4192   case AMDGPU::G_BUILD_VECTOR:
4193   case AMDGPU::G_BUILD_VECTOR_TRUNC: {
4194     LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
4195     if (DstTy == LLT::fixed_vector(2, 16)) {
4196       unsigned DstSize = DstTy.getSizeInBits();
4197       unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4198       unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
4199       unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
4200       unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID);
4201 
4202       OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize);
4203       OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize);
4204       OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize);
4205       break;
4206     }
4207 
4208     [[fallthrough]];
4209   }
4210   case AMDGPU::G_MERGE_VALUES:
4211   case AMDGPU::G_CONCAT_VECTORS: {
4212     unsigned Bank = getMappingType(MRI, MI);
4213     unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4214     unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4215 
4216     OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
4217     // Op1 and Dst should use the same register bank.
4218     for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i)
4219       OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize);
4220     break;
4221   }
4222   case AMDGPU::G_BITREVERSE:
4223   case AMDGPU::G_BITCAST:
4224   case AMDGPU::G_INTTOPTR:
4225   case AMDGPU::G_PTRTOINT:
4226   case AMDGPU::G_FABS:
4227   case AMDGPU::G_FNEG: {
4228     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4229     unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
4230     OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
4231     break;
4232   }
4233   case AMDGPU::G_AMDGPU_FFBH_U32:
4234   case AMDGPU::G_AMDGPU_FFBL_B32:
4235   case AMDGPU::G_CTLZ_ZERO_UNDEF:
4236   case AMDGPU::G_CTTZ_ZERO_UNDEF: {
4237     unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4238     unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
4239     OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
4240     OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(BankID, Size);
4241     break;
4242   }
4243   case AMDGPU::G_CTPOP: {
4244     unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4245     unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
4246     OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
4247 
4248     // This should really be getValueMappingSGPR64Only, but allowing the generic
4249     // code to handle the register split just makes using LegalizerHelper more
4250     // difficult.
4251     OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
4252     break;
4253   }
4254   case AMDGPU::G_TRUNC: {
4255     Register Dst = MI.getOperand(0).getReg();
4256     Register Src = MI.getOperand(1).getReg();
4257     unsigned Bank = getRegBankID(Src, MRI);
4258     unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
4259     unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
4260     OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
4261     OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize);
4262     break;
4263   }
4264   case AMDGPU::G_ZEXT:
4265   case AMDGPU::G_SEXT:
4266   case AMDGPU::G_ANYEXT:
4267   case AMDGPU::G_SEXT_INREG: {
4268     Register Dst = MI.getOperand(0).getReg();
4269     Register Src = MI.getOperand(1).getReg();
4270     unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
4271     unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
4272 
4273     unsigned DstBank;
4274     const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI);
4275     assert(SrcBank);
4276     switch (SrcBank->getID()) {
4277     case AMDGPU::SGPRRegBankID:
4278       DstBank = AMDGPU::SGPRRegBankID;
4279       break;
4280     default:
4281       DstBank = AMDGPU::VGPRRegBankID;
4282       break;
4283     }
4284 
4285     // Scalar extend can use 64-bit BFE, but VGPRs require extending to
4286     // 32-bits, and then to 64.
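         // For example, a scalar s1 -> s64 sign extend can presumably be
         // selected as a single S_BFE_I64, while a VGPR extend first produces
         // a 32-bit value that is then widened to 64 bits.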
4287     OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize);
4288     OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(),
4289                                                        SrcSize);
4290     break;
4291   }
4292   case AMDGPU::G_IS_FPCLASS: {
4293     Register SrcReg = MI.getOperand(1).getReg();
4294     unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
4295     unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4296     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
4297     OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4298     break;
4299   }
4300   case AMDGPU::G_STORE: {
4301     assert(MI.getOperand(0).isReg());
4302     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4303 
4304     // FIXME: We need to specify a different reg bank once scalar stores are
4305     // supported.
4306     const ValueMapping *ValMapping =
4307         AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4308     OpdsMapping[0] = ValMapping;
4309     OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
4310     break;
4311   }
4312   case AMDGPU::G_ICMP:
4313   case AMDGPU::G_FCMP: {
4314     unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4315 
4316     // See if the result register has already been constrained to vcc, which may
4317     // happen due to control flow intrinsic lowering.
4318     unsigned DstBank = getRegBankID(MI.getOperand(0).getReg(), MRI,
4319                                     AMDGPU::SGPRRegBankID);
4320     unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);
4321     unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI);
4322 
4323     auto canUseSCCICMP = [&]() {
4324       auto Pred =
4325           static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
4326       return Size == 32 ||
4327              (Size == 64 &&
4328               (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
4329               Subtarget.hasScalarCompareEq64());
4330     };
4331     auto canUseSCCFCMP = [&]() {
4332       return Subtarget.hasSALUFloatInsts() && (Size == 32 || Size == 16);
4333     };
4334 
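         // A compare can only stay on the SALU (writing SCC) when the
         // destination and both sources are already SGPRs and the subtarget
         // has a scalar compare for the operand type and size.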
4335     bool isICMP = MI.getOpcode() == AMDGPU::G_ICMP;
4336     bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID &&
4337                      Op2Bank == AMDGPU::SGPRRegBankID &&
4338                      Op3Bank == AMDGPU::SGPRRegBankID &&
4339                      (isICMP ? canUseSCCICMP() : canUseSCCFCMP());
4340 
4341     DstBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
4342     unsigned SrcBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
4343 
4344     // TODO: Use 32-bit for scalar output size.
4345     // SCC results will need to be copied to a 32-bit SGPR virtual register.
4346     const unsigned ResultSize = 1;
4347 
4348     OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, ResultSize);
4349     OpdsMapping[1] = nullptr; // Predicate Operand.
4350     OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, Size);
4351     OpdsMapping[3] = AMDGPU::getValueMapping(SrcBank, Size);
4352     break;
4353   }
4354   case AMDGPU::G_EXTRACT_VECTOR_ELT: {
4355     // A VGPR index on an SGPR vector is handled with a waterfall loop.
4356     unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
4357     unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4358     unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4359     unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4360     unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI);
4361     unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank);
4362 
4363     OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize);
4364     OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize);
4365 
4366     // The index can be on either bank if the source vector is a VGPR.
4367     OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4368     break;
4369   }
4370   case AMDGPU::G_INSERT_VECTOR_ELT: {
4371     unsigned OutputBankID = isSALUMapping(MI) ?
4372       AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
4373 
4374     unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4375     unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4376     unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
4377     unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
4378     unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI);
4379 
4380     OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
4381     OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize);
4382 
4383     // This is a weird case, because we need to break down the mapping based on
4384     // the register bank of a different operand.
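         // A 64-bit insert into a VGPR vector is presumably selected as two
         // 32-bit inserts, so the inserted value is mapped as two 32-bit
         // halves on its current bank.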
4385     if (InsertSize == 64 && OutputBankID == AMDGPU::VGPRRegBankID) {
4386       OpdsMapping[2] = AMDGPU::getValueMappingSplit64(InsertEltBankID,
4387                                                       InsertSize);
4388     } else {
4389       assert(InsertSize == 32 || InsertSize == 64);
4390       OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBankID, InsertSize);
4391     }
4392 
4393     // The index can be on either bank if the source vector is a VGPR.
4394     OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize);
4395     break;
4396   }
4397   case AMDGPU::G_UNMERGE_VALUES: {
4398     unsigned Bank = getMappingType(MRI, MI);
4399 
4400     // Op1 and Dst should use the same register bank.
4401     // FIXME: Shouldn't this be the default? Why do we need to handle this?
4402     for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
4403       unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
4404       OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size);
4405     }
4406     break;
4407   }
4408   case AMDGPU::G_AMDGPU_BUFFER_LOAD:
4409   case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
4410   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
4411   case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
4412   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
4413   case AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE:
4414   case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE:
4415   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE_TFE:
4416   case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE:
4417   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT_TFE:
4418   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
4419   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE:
4420   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
4421   case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
4422   case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
4423   case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
4424   case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16:
4425   case AMDGPU::G_AMDGPU_BUFFER_STORE:
4426   case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
4427   case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
4428   case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
4429   case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: {
4430     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4431 
4432     // rsrc
4433     OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4434 
4435     // vindex
4436     OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4437 
4438     // voffset
4439     OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4440 
4441     // soffset
4442     OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4443 
4444     // Any remaining operands are immediates and were correctly null
4445     // initialized.
4446     break;
4447   }
4448   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
4449   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
4450   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
4451   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
4452   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
4453   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
4454   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
4455   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
4456   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
4457   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
4458   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
4459   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
4460   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
4461   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
4462   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
4463     // vdata_out
4464     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4465 
4466     // vdata_in
4467     OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4468 
4469     // rsrc
4470     OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4471 
4472     // vindex
4473     OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4474 
4475     // voffset
4476     OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4477 
4478     // soffset
4479     OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4480 
4481     // Any remaining operands are immediates and were correctly null
4482     // initialized.
4483     break;
4484   }
4485   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
4486     // vdata_out
4487     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4488 
4489     // vdata_in
4490     OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4491 
4492     // cmp
4493     OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4494 
4495     // rsrc
4496     OpdsMapping[3] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4497 
4498     // vindex
4499     OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4500 
4501     // voffset
4502     OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4503 
4504     // soffset
4505     OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
4506 
4507     // Any remaining operands are immediates and were correctly null
4508     // initialized.
4509     break;
4510   }
4511   case AMDGPU::G_AMDGPU_S_BUFFER_LOAD:
4512   case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE:
4513   case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE:
4514   case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT:
4515   case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT: {
4516     // Lie and claim everything is legal, even though some need to be
4517     // SGPRs. applyMapping will have to deal with it as a waterfall loop.
4518     OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4519     OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4520 
4521     // We need to convert this to a MUBUF if either the resource or the
4522     // offset is a VGPR.
4523     unsigned RSrcBank = OpdsMapping[1]->BreakDown[0].RegBank->getID();
4524     unsigned OffsetBank = OpdsMapping[2]->BreakDown[0].RegBank->getID();
4525     unsigned ResultBank = regBankUnion(RSrcBank, OffsetBank);
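         // The result is only uniform if both the resource and the offset are
         // uniform, hence the bank union above.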
4526 
4527     unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4528     OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0);
4529     break;
4530   }
4531   case AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH:
4532     OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4533     OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4534     break;
4535   case AMDGPU::G_INTRINSIC:
4536   case AMDGPU::G_INTRINSIC_CONVERGENT: {
4537     switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
4538     default:
4539       return getInvalidInstructionMapping();
4540     case Intrinsic::amdgcn_div_fmas:
4541     case Intrinsic::amdgcn_div_fixup:
4542     case Intrinsic::amdgcn_trig_preop:
4543     case Intrinsic::amdgcn_sin:
4544     case Intrinsic::amdgcn_cos:
4545     case Intrinsic::amdgcn_log_clamp:
4546     case Intrinsic::amdgcn_rcp_legacy:
4547     case Intrinsic::amdgcn_rsq_legacy:
4548     case Intrinsic::amdgcn_rsq_clamp:
4549     case Intrinsic::amdgcn_tanh:
4550     case Intrinsic::amdgcn_fmul_legacy:
4551     case Intrinsic::amdgcn_fma_legacy:
4552     case Intrinsic::amdgcn_frexp_mant:
4553     case Intrinsic::amdgcn_frexp_exp:
4554     case Intrinsic::amdgcn_fract:
4555     case Intrinsic::amdgcn_cvt_pknorm_i16:
4556     case Intrinsic::amdgcn_cvt_pknorm_u16:
4557     case Intrinsic::amdgcn_cvt_pk_i16:
4558     case Intrinsic::amdgcn_cvt_pk_u16:
4559     case Intrinsic::amdgcn_cvt_pk_f16_fp8:
4560     case Intrinsic::amdgcn_cvt_pk_f16_bf8:
4561     case Intrinsic::amdgcn_fmed3:
4562     case Intrinsic::amdgcn_cubeid:
4563     case Intrinsic::amdgcn_cubema:
4564     case Intrinsic::amdgcn_cubesc:
4565     case Intrinsic::amdgcn_cubetc:
4566     case Intrinsic::amdgcn_sffbh:
4567     case Intrinsic::amdgcn_fmad_ftz:
4568     case Intrinsic::amdgcn_mbcnt_lo:
4569     case Intrinsic::amdgcn_mbcnt_hi:
4570     case Intrinsic::amdgcn_mul_u24:
4571     case Intrinsic::amdgcn_mul_i24:
4572     case Intrinsic::amdgcn_mulhi_u24:
4573     case Intrinsic::amdgcn_mulhi_i24:
4574     case Intrinsic::amdgcn_lerp:
4575     case Intrinsic::amdgcn_sad_u8:
4576     case Intrinsic::amdgcn_msad_u8:
4577     case Intrinsic::amdgcn_sad_hi_u8:
4578     case Intrinsic::amdgcn_sad_u16:
4579     case Intrinsic::amdgcn_qsad_pk_u16_u8:
4580     case Intrinsic::amdgcn_mqsad_pk_u16_u8:
4581     case Intrinsic::amdgcn_mqsad_u32_u8:
4582     case Intrinsic::amdgcn_cvt_pk_u8_f32:
4583     case Intrinsic::amdgcn_alignbyte:
4584     case Intrinsic::amdgcn_perm:
4585     case Intrinsic::amdgcn_prng_b32:
4586     case Intrinsic::amdgcn_fdot2:
4587     case Intrinsic::amdgcn_sdot2:
4588     case Intrinsic::amdgcn_udot2:
4589     case Intrinsic::amdgcn_sdot4:
4590     case Intrinsic::amdgcn_udot4:
4591     case Intrinsic::amdgcn_sdot8:
4592     case Intrinsic::amdgcn_udot8:
4593     case Intrinsic::amdgcn_fdot2_bf16_bf16:
4594     case Intrinsic::amdgcn_fdot2_f16_f16:
4595     case Intrinsic::amdgcn_fdot2_f32_bf16:
4596     case Intrinsic::amdgcn_fdot2c_f32_bf16:
4597     case Intrinsic::amdgcn_sudot4:
4598     case Intrinsic::amdgcn_sudot8:
4599     case Intrinsic::amdgcn_dot4_f32_fp8_bf8:
4600     case Intrinsic::amdgcn_dot4_f32_bf8_fp8:
4601     case Intrinsic::amdgcn_dot4_f32_fp8_fp8:
4602     case Intrinsic::amdgcn_dot4_f32_bf8_bf8:
4603     case Intrinsic::amdgcn_cvt_f32_fp8:
4604     case Intrinsic::amdgcn_cvt_f32_fp8_e5m3:
4605     case Intrinsic::amdgcn_cvt_f32_bf8:
4606     case Intrinsic::amdgcn_cvt_off_f32_i4:
4607     case Intrinsic::amdgcn_cvt_pk_f32_fp8:
4608     case Intrinsic::amdgcn_cvt_pk_f32_bf8:
4609     case Intrinsic::amdgcn_cvt_pk_fp8_f32:
4610     case Intrinsic::amdgcn_cvt_pk_bf8_f32:
4611     case Intrinsic::amdgcn_cvt_sr_fp8_f32:
4612     case Intrinsic::amdgcn_cvt_sr_bf8_f32:
4613     case Intrinsic::amdgcn_cvt_sr_bf16_f32:
4614     case Intrinsic::amdgcn_cvt_sr_f16_f32:
4615     case Intrinsic::amdgcn_cvt_f16_fp8:
4616     case Intrinsic::amdgcn_cvt_f16_bf8:
4617     case Intrinsic::amdgcn_cvt_scalef32_pk32_fp6_f16:
4618     case Intrinsic::amdgcn_cvt_scalef32_pk32_bf6_f16:
4619     case Intrinsic::amdgcn_cvt_scalef32_pk32_fp6_bf16:
4620     case Intrinsic::amdgcn_cvt_scalef32_pk32_bf6_bf16:
4621     case Intrinsic::amdgcn_cvt_scalef32_f16_fp8:
4622     case Intrinsic::amdgcn_cvt_scalef32_f16_bf8:
4623     case Intrinsic::amdgcn_cvt_scalef32_f32_fp8:
4624     case Intrinsic::amdgcn_cvt_scalef32_f32_bf8:
4625     case Intrinsic::amdgcn_cvt_scalef32_pk_fp8_f32:
4626     case Intrinsic::amdgcn_cvt_scalef32_pk_bf8_f32:
4627     case Intrinsic::amdgcn_cvt_scalef32_pk_f32_fp8:
4628     case Intrinsic::amdgcn_cvt_scalef32_pk_f32_bf8:
4629     case Intrinsic::amdgcn_cvt_scalef32_pk_fp8_f16:
4630     case Intrinsic::amdgcn_cvt_scalef32_pk_fp8_bf16:
4631     case Intrinsic::amdgcn_cvt_scalef32_pk_bf8_f16:
4632     case Intrinsic::amdgcn_cvt_scalef32_pk_bf8_bf16:
4633     case Intrinsic::amdgcn_cvt_scalef32_pk_f32_fp4:
4634     case Intrinsic::amdgcn_cvt_scalef32_pk_fp4_f32:
4635     case Intrinsic::amdgcn_cvt_scalef32_pk_f16_fp4:
4636     case Intrinsic::amdgcn_cvt_scalef32_pk_bf16_fp4:
4637     case Intrinsic::amdgcn_cvt_scalef32_pk32_f32_fp6:
4638     case Intrinsic::amdgcn_cvt_scalef32_pk32_f32_bf6:
4639     case Intrinsic::amdgcn_cvt_scalef32_pk32_f16_bf6:
4640     case Intrinsic::amdgcn_cvt_scalef32_pk32_bf16_bf6:
4641     case Intrinsic::amdgcn_cvt_scalef32_pk32_f16_fp6:
4642     case Intrinsic::amdgcn_cvt_scalef32_pk32_bf16_fp6:
4643     case Intrinsic::amdgcn_cvt_scalef32_pk_f16_bf8:
4644     case Intrinsic::amdgcn_cvt_scalef32_pk_bf16_bf8:
4645     case Intrinsic::amdgcn_cvt_scalef32_pk_f16_fp8:
4646     case Intrinsic::amdgcn_cvt_scalef32_pk_bf16_fp8:
4647     case Intrinsic::amdgcn_cvt_scalef32_pk_fp4_f16:
4648     case Intrinsic::amdgcn_cvt_scalef32_pk_fp4_bf16:
4649     case Intrinsic::amdgcn_cvt_scalef32_sr_pk_fp4_f16:
4650     case Intrinsic::amdgcn_cvt_scalef32_sr_pk_fp4_bf16:
4651     case Intrinsic::amdgcn_cvt_scalef32_sr_pk_fp4_f32:
4652     case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_bf6_bf16:
4653     case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_bf6_f16:
4654     case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_bf6_f32:
4655     case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_fp6_bf16:
4656     case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_fp6_f16:
4657     case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_fp6_f32:
4658     case Intrinsic::amdgcn_cvt_scalef32_sr_bf8_bf16:
4659     case Intrinsic::amdgcn_cvt_scalef32_sr_bf8_f16:
4660     case Intrinsic::amdgcn_cvt_scalef32_sr_bf8_f32:
4661     case Intrinsic::amdgcn_cvt_scalef32_sr_fp8_bf16:
4662     case Intrinsic::amdgcn_cvt_scalef32_sr_fp8_f16:
4663     case Intrinsic::amdgcn_cvt_scalef32_sr_fp8_f32:
4664     case Intrinsic::amdgcn_ashr_pk_i8_i32:
4665     case Intrinsic::amdgcn_ashr_pk_u8_i32:
4666     case Intrinsic::amdgcn_cvt_scalef32_2xpk16_fp6_f32:
4667     case Intrinsic::amdgcn_cvt_scalef32_2xpk16_bf6_f32:
4668     case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16:
4669     case Intrinsic::amdgcn_wmma_f16_16x16x16_f16:
4670     case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied:
4671     case Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied:
4672     case Intrinsic::amdgcn_wmma_f32_16x16x16_bf16:
4673     case Intrinsic::amdgcn_wmma_f32_16x16x16_f16:
4674     case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4:
4675     case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8:
4676     case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8:
4677     case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8:
4678     case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8:
4679     case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8:
4680     case Intrinsic::amdgcn_wmma_i32_16x16x32_iu4:
4681     case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
4682     case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
4683     case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
4684     case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
4685     case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
4686     case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
4687     case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4:
4688     case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
4689     case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
4690     case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
4691     case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8:
4692       return getDefaultMappingVOP(MI);
4693     case Intrinsic::amdgcn_log:
4694     case Intrinsic::amdgcn_exp2:
4695     case Intrinsic::amdgcn_rcp:
4696     case Intrinsic::amdgcn_rsq:
4697     case Intrinsic::amdgcn_sqrt: {
4698       unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4699       if (Subtarget.hasPseudoScalarTrans() && (Size == 16 || Size == 32) &&
4700           isSALUMapping(MI))
4701         return getDefaultMappingSOP(MI);
4702       return getDefaultMappingVOP(MI);
4703     }
4704     case Intrinsic::amdgcn_sbfe:
4705     case Intrinsic::amdgcn_ubfe:
4706       if (isSALUMapping(MI))
4707         return getDefaultMappingSOP(MI);
4708       return getDefaultMappingVOP(MI);
4709     case Intrinsic::amdgcn_ds_swizzle:
4710     case Intrinsic::amdgcn_ds_permute:
4711     case Intrinsic::amdgcn_ds_bpermute:
4712     case Intrinsic::amdgcn_update_dpp:
4713     case Intrinsic::amdgcn_mov_dpp8:
4714     case Intrinsic::amdgcn_mov_dpp:
4715     case Intrinsic::amdgcn_strict_wwm:
4716     case Intrinsic::amdgcn_wwm:
4717     case Intrinsic::amdgcn_strict_wqm:
4718     case Intrinsic::amdgcn_wqm:
4719     case Intrinsic::amdgcn_softwqm:
4720     case Intrinsic::amdgcn_set_inactive:
4721     case Intrinsic::amdgcn_set_inactive_chain_arg:
4722     case Intrinsic::amdgcn_permlane64:
4723     case Intrinsic::amdgcn_ds_bpermute_fi_b32:
4724       return getDefaultMappingAllVGPR(MI);
4725     case Intrinsic::amdgcn_cvt_pkrtz:
4726       if (Subtarget.hasSALUFloatInsts() && isSALUMapping(MI))
4727         return getDefaultMappingSOP(MI);
4728       return getDefaultMappingVOP(MI);
4729     case Intrinsic::amdgcn_kernarg_segment_ptr:
4730     case Intrinsic::amdgcn_s_getpc:
4731     case Intrinsic::amdgcn_groupstaticsize:
4732     case Intrinsic::amdgcn_reloc_constant:
4733     case Intrinsic::returnaddress: {
4734       unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4735       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4736       break;
4737     }
4738     case Intrinsic::amdgcn_wqm_vote: {
4739       unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4740       OpdsMapping[0] = OpdsMapping[2]
4741         = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size);
4742       break;
4743     }
4744     case Intrinsic::amdgcn_ps_live: {
4745       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4746       break;
4747     }
4748     case Intrinsic::amdgcn_div_scale: {
4749       unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4750       unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4751       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size);
4752       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size);
4753 
4754       unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
4755       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4756       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4757       break;
4758     }
4759     case Intrinsic::amdgcn_class: {
4760       Register Src0Reg = MI.getOperand(2).getReg();
4761       Register Src1Reg = MI.getOperand(3).getReg();
4762       unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits();
4763       unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits();
4764       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4765       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
4766       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src0Size);
4767       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src1Size);
4768       break;
4769     }
4770     case Intrinsic::amdgcn_icmp:
4771     case Intrinsic::amdgcn_fcmp: {
4772       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4773       // This is not VCCRegBank because this is not used in boolean contexts.
4774       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4775       unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4776       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
4777       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
4778       break;
4779     }
4780     case Intrinsic::amdgcn_readlane: {
4781       // This must be an SGPR, but accept a VGPR.
4782       Register IdxReg = MI.getOperand(3).getReg();
4783       unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
4784       unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
4785       OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4786       [[fallthrough]];
4787     }
4788     case Intrinsic::amdgcn_readfirstlane: {
4789       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4790       unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4791       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4792       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4793       break;
4794     }
4795     case Intrinsic::amdgcn_writelane: {
4796       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4797       Register SrcReg = MI.getOperand(2).getReg();
4798       unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
4799       unsigned SrcBank = getRegBankID(SrcReg, MRI, AMDGPU::SGPRRegBankID);
4800       Register IdxReg = MI.getOperand(3).getReg();
4801       unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
4802       unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
4803       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4804 
4805       // These two must be SGPRs, but VGPRs are accepted; readfirstlanes
4806       // will be inserted to legalize them.
4807       OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize);
4808       OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4809       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4810       break;
4811     }
4812     case Intrinsic::amdgcn_if_break: {
4813       unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4814       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4815       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4816       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4817       break;
4818     }
4819     case Intrinsic::amdgcn_permlane16:
4820     case Intrinsic::amdgcn_permlanex16: {
4821       unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4822       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4823       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4824       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4825       OpdsMapping[4] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4826       OpdsMapping[5] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4827       break;
4828     }
4829     case Intrinsic::amdgcn_permlane16_var:
4830     case Intrinsic::amdgcn_permlanex16_var: {
4831       unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4832       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4833       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4834       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4835       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4836       break;
4837     }
4838     case Intrinsic::amdgcn_mfma_f32_4x4x1f32:
4839     case Intrinsic::amdgcn_mfma_f32_4x4x4f16:
4840     case Intrinsic::amdgcn_mfma_i32_4x4x4i8:
4841     case Intrinsic::amdgcn_mfma_f32_4x4x2bf16:
4842     case Intrinsic::amdgcn_mfma_f32_16x16x1f32:
4843     case Intrinsic::amdgcn_mfma_f32_16x16x4f32:
4844     case Intrinsic::amdgcn_mfma_f32_16x16x4f16:
4845     case Intrinsic::amdgcn_mfma_f32_16x16x16f16:
4846     case Intrinsic::amdgcn_mfma_i32_16x16x4i8:
4847     case Intrinsic::amdgcn_mfma_i32_16x16x16i8:
4848     case Intrinsic::amdgcn_mfma_f32_16x16x2bf16:
4849     case Intrinsic::amdgcn_mfma_f32_16x16x8bf16:
4850     case Intrinsic::amdgcn_mfma_f32_32x32x1f32:
4851     case Intrinsic::amdgcn_mfma_f32_32x32x2f32:
4852     case Intrinsic::amdgcn_mfma_f32_32x32x4f16:
4853     case Intrinsic::amdgcn_mfma_f32_32x32x8f16:
4854     case Intrinsic::amdgcn_mfma_i32_32x32x4i8:
4855     case Intrinsic::amdgcn_mfma_i32_32x32x8i8:
4856     case Intrinsic::amdgcn_mfma_f32_32x32x2bf16:
4857     case Intrinsic::amdgcn_mfma_f32_32x32x4bf16:
4858     case Intrinsic::amdgcn_mfma_f32_32x32x4bf16_1k:
4859     case Intrinsic::amdgcn_mfma_f32_16x16x4bf16_1k:
4860     case Intrinsic::amdgcn_mfma_f32_4x4x4bf16_1k:
4861     case Intrinsic::amdgcn_mfma_f32_32x32x8bf16_1k:
4862     case Intrinsic::amdgcn_mfma_f32_16x16x16bf16_1k:
4863     case Intrinsic::amdgcn_mfma_f64_16x16x4f64:
4864     case Intrinsic::amdgcn_mfma_f64_4x4x4f64:
4865     case Intrinsic::amdgcn_mfma_i32_16x16x32_i8:
4866     case Intrinsic::amdgcn_mfma_i32_32x32x16_i8:
4867     case Intrinsic::amdgcn_mfma_f32_16x16x8_xf32:
4868     case Intrinsic::amdgcn_mfma_f32_32x32x4_xf32:
4869     case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_bf8:
4870     case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_fp8:
4871     case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_bf8:
4872     case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_fp8:
4873     case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_bf8:
4874     case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_fp8:
4875     case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_bf8:
4876     case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_fp8:
4877     case Intrinsic::amdgcn_mfma_f32_16x16x32_f16:
4878     case Intrinsic::amdgcn_mfma_f32_32x32x16_f16:
4879     case Intrinsic::amdgcn_mfma_i32_16x16x64_i8:
4880     case Intrinsic::amdgcn_mfma_i32_32x32x32_i8:
4881     case Intrinsic::amdgcn_mfma_f32_16x16x32_bf16: {
4882       // Default for MAI intrinsics.
4883       // srcC can also be an immediate which can be folded later.
4884       // FIXME: Should we eventually add an alternative mapping with AGPR src
4885       // for srcA/srcB?
4886       //
4887       // vdst, srcA, srcB, srcC
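           //
           // vdst and srcC are mapped to AGPRs when the function may need
           // them; otherwise plain VGPRs are used, presumably to avoid
           // unnecessary AGPR<->VGPR copies.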
4888       const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4889       OpdsMapping[0] =
4890           Info->mayNeedAGPRs()
4891               ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI)
4892               : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4893       OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4894       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4895       OpdsMapping[4] =
4896           Info->mayNeedAGPRs()
4897               ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI)
4898               : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4899       break;
4900     }
4901     case Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4:
4902     case Intrinsic::amdgcn_mfma_scale_f32_32x32x64_f8f6f4: {
4903       const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4904       OpdsMapping[0] =
4905           Info->mayNeedAGPRs()
4906               ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI)
4907               : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4908 
4909       OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4910       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4911       OpdsMapping[4] =
4912           Info->mayNeedAGPRs()
4913               ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI)
4914               : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4915 
4916       OpdsMapping[8] = getVGPROpMapping(MI.getOperand(8).getReg(), MRI, *TRI);
4917       OpdsMapping[10] = getVGPROpMapping(MI.getOperand(10).getReg(), MRI, *TRI);
4918       break;
4919     }
4920     case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
4921     case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
4922     case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
4923     case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
4924     case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
4925     case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
4926     case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
4927     case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
4928     case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
4929     case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
4930     case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
4931     case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
4932     case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
4933     case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
4934     case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
4935     case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
4936     case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
4937     case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
4938     case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
4939     case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
4940     case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
4941     case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
4942     case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
4943     case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
4944     case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
4945     case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
4946     case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
4947     case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8: {
4948       // vdst, srcA, srcB, srcC, idx
4949       OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4950       OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4951       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4952       OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4953       OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4954       break;
4955     }
4956     case Intrinsic::amdgcn_interp_p1:
4957     case Intrinsic::amdgcn_interp_p2:
4958     case Intrinsic::amdgcn_interp_mov:
4959     case Intrinsic::amdgcn_interp_p1_f16:
4960     case Intrinsic::amdgcn_interp_p2_f16:
4961     case Intrinsic::amdgcn_lds_param_load: {
4962       const int M0Idx = MI.getNumOperands() - 1;
4963       Register M0Reg = MI.getOperand(M0Idx).getReg();
4964       unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
4965       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4966 
4967       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4968       for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
4969         OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4970 
4971       // This must be an SGPR, but take whatever bank the operand
4972       // currently has and fix it up later.
4973       OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
4974       break;
4975     }
4976     case Intrinsic::amdgcn_interp_inreg_p10:
4977     case Intrinsic::amdgcn_interp_inreg_p2:
4978     case Intrinsic::amdgcn_interp_inreg_p10_f16:
4979     case Intrinsic::amdgcn_interp_inreg_p2_f16:
4980     case Intrinsic::amdgcn_interp_p10_rtz_f16:
4981     case Intrinsic::amdgcn_interp_p2_rtz_f16: {
4982       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4983       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4984       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4985       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4986       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4987       break;
4988     }
4989     case Intrinsic::amdgcn_permlane16_swap:
4990     case Intrinsic::amdgcn_permlane32_swap: {
4991       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4992       OpdsMapping[0] = OpdsMapping[1] = OpdsMapping[3] = OpdsMapping[4] =
4993           AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4994       break;
4995     }
4996     case Intrinsic::amdgcn_ballot: {
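           // The result is a wave-wide bitmask and is naturally an SGPR; the
           // condition input is an s1 value on the vcc bank.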
4997       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4998       unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4999       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
5000       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, SrcSize);
5001       break;
5002     }
5003     case Intrinsic::amdgcn_inverse_ballot: {
5004       // This must be an SGPR, but accept a VGPR.
5005       Register MaskReg = MI.getOperand(2).getReg();
5006       unsigned MaskSize = MRI.getType(MaskReg).getSizeInBits();
5007       unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID);
5008       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
5009       OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize);
5010       break;
5011     }
5012     case Intrinsic::amdgcn_bitop3: {
5013       unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
5014       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
5015       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
5016       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
5017       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
5018       break;
5019     }
5020     case Intrinsic::amdgcn_s_quadmask:
5021     case Intrinsic::amdgcn_s_wqm: {
5022       Register MaskReg = MI.getOperand(2).getReg();
5023       unsigned MaskSize = MRI.getType(MaskReg).getSizeInBits();
5024       unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID);
5025       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, MaskSize);
5026       OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize);
5027       break;
5028     }
5029     case Intrinsic::amdgcn_wave_reduce_add:
5030     case Intrinsic::amdgcn_wave_reduce_sub:
5031     case Intrinsic::amdgcn_wave_reduce_min:
5032     case Intrinsic::amdgcn_wave_reduce_umin:
5033     case Intrinsic::amdgcn_wave_reduce_max:
5034     case Intrinsic::amdgcn_wave_reduce_umax:
5035     case Intrinsic::amdgcn_wave_reduce_and:
5036     case Intrinsic::amdgcn_wave_reduce_or:
5037     case Intrinsic::amdgcn_wave_reduce_xor: {
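           // The result of a wave reduction is always uniform; the source may
           // be on either bank.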
5038       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
5039       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
5040       unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
5041       auto regBankID =
5042           isSALUMapping(MI) ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
5043       OpdsMapping[2] = AMDGPU::getValueMapping(regBankID, OpSize);
5044       break;
5045     }
5046     case Intrinsic::amdgcn_s_bitreplicate:
5047       Register MaskReg = MI.getOperand(2).getReg();
5048       unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID);
5049       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64);
5050       OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, 32);
5051     }
5052     break;
       }
5053   }
5054   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
5055   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
5056   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET:
5057   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
5058   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
5059     auto IntrID = AMDGPU::getIntrinsicID(MI);
5060     const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID);
5061     assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic");
5062     // Non-images can have complications from operands that allow both SGPR
5063     // and VGPR. For now it's too complicated to figure out the final opcode
5064     // to derive the register bank from the MCInstrDesc.
5065     assert(RSrcIntrin->IsImage);
5066     return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg);
5067   }
5068   case AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY:
5069   case AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY:
5070   case AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY: {
5071     bool IsDualOrBVH8 =
5072         MI.getOpcode() == AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY ||
5073         MI.getOpcode() == AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY;
5074     unsigned NumMods = IsDualOrBVH8 ? 0 : 1; // Has A16 modifier
5075     unsigned LastRegOpIdx = MI.getNumExplicitOperands() - 1 - NumMods;
5076     unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
5077     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
5078     if (IsDualOrBVH8) {
5079       OpdsMapping[1] = AMDGPU::getValueMapping(
5080           AMDGPU::VGPRRegBankID,
5081           MRI.getType(MI.getOperand(1).getReg()).getSizeInBits());
5082       OpdsMapping[2] = AMDGPU::getValueMapping(
5083           AMDGPU::VGPRRegBankID,
5084           MRI.getType(MI.getOperand(2).getReg()).getSizeInBits());
5085     }
5086     OpdsMapping[LastRegOpIdx] =
5087         getSGPROpMapping(MI.getOperand(LastRegOpIdx).getReg(), MRI, *TRI);
5088     if (LastRegOpIdx == 3) {
5089       // Sequential form: all operands combined into VGPR256/VGPR512
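           // Anything wider than 256 bits is padded out to the 512-bit
           // mapping.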
5090       unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
5091       if (Size > 256)
5092         Size = 512;
5093       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
5094     } else {
5095       // NSA form
5096       unsigned FirstSrcOpIdx = IsDualOrBVH8 ? 4 : 2;
5097       for (unsigned I = FirstSrcOpIdx; I < LastRegOpIdx; ++I) {
5098         unsigned Size = MRI.getType(MI.getOperand(I).getReg()).getSizeInBits();
5099         OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
5100       }
5101     }
5102     break;
5103   }
5104   case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
5105   case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS: {
5106     auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
5107     switch (IntrID) {
5108     case Intrinsic::amdgcn_s_getreg:
5109     case Intrinsic::amdgcn_s_memtime:
5110     case Intrinsic::amdgcn_s_memrealtime:
5111     case Intrinsic::amdgcn_s_get_waveid_in_workgroup:
5112     case Intrinsic::amdgcn_s_sendmsg_rtn: {
5113       unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
5114       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
5115       break;
5116     }
5117     case Intrinsic::amdgcn_global_atomic_csub:
5118     case Intrinsic::amdgcn_global_atomic_fmin_num:
5119     case Intrinsic::amdgcn_global_atomic_fmax_num:
5120     case Intrinsic::amdgcn_flat_atomic_fmin_num:
5121     case Intrinsic::amdgcn_flat_atomic_fmax_num:
5122     case Intrinsic::amdgcn_atomic_cond_sub_u32:
5123     case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
5124     case Intrinsic::amdgcn_global_load_tr_b64:
5125     case Intrinsic::amdgcn_global_load_tr_b128:
5126     case Intrinsic::amdgcn_global_load_tr4_b64:
5127     case Intrinsic::amdgcn_global_load_tr6_b96:
5128     case Intrinsic::amdgcn_ds_load_tr8_b64:
5129     case Intrinsic::amdgcn_ds_load_tr16_b128:
5130     case Intrinsic::amdgcn_ds_load_tr4_b64:
5131     case Intrinsic::amdgcn_ds_load_tr6_b96:
5132     case Intrinsic::amdgcn_ds_read_tr4_b64:
5133     case Intrinsic::amdgcn_ds_read_tr6_b96:
5134     case Intrinsic::amdgcn_ds_read_tr8_b64:
5135     case Intrinsic::amdgcn_ds_read_tr16_b64:
5136     case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
5137     case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
5138       return getDefaultMappingAllVGPR(MI);
5139     case Intrinsic::amdgcn_ds_ordered_add:
5140     case Intrinsic::amdgcn_ds_ordered_swap: {
5141       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
5142       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
5143       unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
5144                                      AMDGPU::SGPRRegBankID);
5145       OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32);
5146       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
5147       break;
5148     }
5149     case Intrinsic::amdgcn_ds_append:
5150     case Intrinsic::amdgcn_ds_consume: {
5151       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
5152       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
5153       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5154       break;
5155     }
5156     case Intrinsic::amdgcn_exp_compr:
5157       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
5158       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
5159       break;
5160     case Intrinsic::amdgcn_exp:
5161       // FIXME: Could we support packed types here?
5162       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
5163       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
5164       OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
5165       OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
5166       break;
5167     case Intrinsic::amdgcn_exp_row:
5168       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
5169       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
5170       OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
5171       OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
5172       OpdsMapping[8] = getSGPROpMapping(MI.getOperand(8).getReg(), MRI, *TRI);
5173       break;
5174     case Intrinsic::amdgcn_s_sendmsg:
5175     case Intrinsic::amdgcn_s_sendmsghalt: {
5176       // This must be an SGPR, but accept a VGPR.
5177       unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
5178                                    AMDGPU::SGPRRegBankID);
5179       OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
5180       break;
5181     }
5182     case Intrinsic::amdgcn_s_setreg: {
5183       // This must be an SGPR, but accept a VGPR.
5184       unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
5185                                    AMDGPU::SGPRRegBankID);
5186       OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
5187       break;
5188     }
5189     case Intrinsic::amdgcn_s_ttracedata: {
5190       // This must be an SGPR, but accept a VGPR.
5191       unsigned Bank =
5192           getRegBankID(MI.getOperand(1).getReg(), MRI, AMDGPU::SGPRRegBankID);
5193       OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
5194       break;
5195     }
5196     case Intrinsic::amdgcn_end_cf: {
5197       unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
5198       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
5199       break;
5200     }
5201     case Intrinsic::amdgcn_else: {
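           // si.else defines a VCC lane-mask result and a wave-sized scalar
           // mask, and consumes a wave-sized scalar mask (operand 3).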
5202       unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
5203       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
5204       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
5205       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
5206       break;
5207     }
5208     case Intrinsic::amdgcn_init_whole_wave:
5209     case Intrinsic::amdgcn_live_mask: {
5210       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
5211       break;
5212     }
5213     case Intrinsic::amdgcn_wqm_demote:
5214     case Intrinsic::amdgcn_kill: {
5215       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
5216       break;
5217     }
5218     case Intrinsic::amdgcn_raw_buffer_load:
5219     case Intrinsic::amdgcn_raw_ptr_buffer_load:
5220     case Intrinsic::amdgcn_raw_atomic_buffer_load:
5221     case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
5222     case Intrinsic::amdgcn_raw_tbuffer_load:
5223     case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
5224       // FIXME: Should make the intrinsic ID the last operand of the
5225       // instruction; then this would be the same as the store case.
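           // dst = VGPR, rsrc = SGPR, voffset = VGPR, soffset = SGPR.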
5226       OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
5227       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5228       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
5229       OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
5230       break;
5231     }
5232     case Intrinsic::amdgcn_raw_buffer_load_lds:
5233     case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: {
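           // rsrc (1) and the LDS pointer (2) are SGPRs; voffset (4) is a
           // VGPR and soffset (5) an SGPR.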
5234       OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
5235       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5236       OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
5237       OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
5238       break;
5239     }
5240     case Intrinsic::amdgcn_raw_buffer_store:
5241     case Intrinsic::amdgcn_raw_ptr_buffer_store:
5242     case Intrinsic::amdgcn_raw_buffer_store_format:
5243     case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
5244     case Intrinsic::amdgcn_raw_tbuffer_store:
5245     case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
5246       OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
5247       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5248       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
5249       OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
5250       break;
5251     }
5252     case Intrinsic::amdgcn_struct_buffer_load:
5253     case Intrinsic::amdgcn_struct_ptr_buffer_load:
5254     case Intrinsic::amdgcn_struct_tbuffer_load:
5255     case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
5256     case Intrinsic::amdgcn_struct_atomic_buffer_load:
5257     case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
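           // dst = VGPR, rsrc = SGPR, vindex/voffset = VGPRs, soffset = SGPR.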
5258       OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
5259       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5260       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
5261       OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
5262       OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
5263       break;
5264     }
5265     case Intrinsic::amdgcn_struct_buffer_load_lds:
5266     case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
5267       OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
5268       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5269       OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
5270       OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
5271       OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
5272       break;
5273     }
5274     case Intrinsic::amdgcn_struct_buffer_store:
5275     case Intrinsic::amdgcn_struct_ptr_buffer_store:
5276     case Intrinsic::amdgcn_struct_tbuffer_store:
5277     case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
5278       OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
5279       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5280       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
5281       OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
5282       OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
5283       break;
5284     }
5285     case Intrinsic::amdgcn_init_exec_from_input: {
5286       unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
5287       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
5288       break;
5289     }
5290     case Intrinsic::amdgcn_ds_gws_init:
5291     case Intrinsic::amdgcn_ds_gws_barrier:
5292     case Intrinsic::amdgcn_ds_gws_sema_br: {
5293       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
5294 
5295       // This must be an SGPR, but accept a VGPR.
5296       unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
5297                                    AMDGPU::SGPRRegBankID);
5298       OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
5299       break;
5300     }
5301     case Intrinsic::amdgcn_ds_gws_sema_v:
5302     case Intrinsic::amdgcn_ds_gws_sema_p:
5303     case Intrinsic::amdgcn_ds_gws_sema_release_all: {
5304       // This must be an SGPR, but accept a VGPR.
5305       unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI,
5306                                    AMDGPU::SGPRRegBankID);
5307       OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
5308       break;
5309     }
5310     case Intrinsic::amdgcn_load_to_lds:
5311     case Intrinsic::amdgcn_global_load_lds: {
5312       OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
5313       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5314       break;
5315     }
5316     case Intrinsic::amdgcn_lds_direct_load: {
5317       const int M0Idx = MI.getNumOperands() - 1;
5318       Register M0Reg = MI.getOperand(M0Idx).getReg();
5319       unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
5320       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
5321 
5322       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
5323       for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
5324         OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
5325 
5326       // This must be an SGPR, but take whatever the original bank is and
5327       // fix it up later.
5328       OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
5329       break;
5330     }
5331     case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
5332     case Intrinsic::amdgcn_ds_sub_gs_reg_rtn:
5333       OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
5334       OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5335       break;
5336     case Intrinsic::amdgcn_ds_bvh_stack_rtn:
5337     case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
5338     case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
5339     case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
5340       OpdsMapping[0] =
5341           getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); // %vdst
5342       OpdsMapping[1] =
5343           getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); // %addr
5344       OpdsMapping[3] =
5345           getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); // %addr
5346       OpdsMapping[4] =
5347           getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); // %data0
5348       OpdsMapping[5] =
5349           getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); // %data1
5350       break;
5351     }
5352     case Intrinsic::amdgcn_s_sleep_var:
5353       OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
5354       break;
5355     case Intrinsic::amdgcn_s_barrier_signal_var:
5356       OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
5357       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5358       break;
5359     case Intrinsic::amdgcn_s_barrier_signal_isfirst: {
5360       const unsigned ResultSize = 1;
5361       OpdsMapping[0] =
5362           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, ResultSize);
5363       break;
5364     }
5365     case Intrinsic::amdgcn_s_get_barrier_state:
5366     case Intrinsic::amdgcn_s_get_named_barrier_state: {
5367       OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
5368       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5369       break;
5370     }
5371     case Intrinsic::amdgcn_pops_exiting_wave_id:
5372       return getDefaultMappingSOP(MI);
5373     case Intrinsic::amdgcn_tensor_load_to_lds_d2:
5374     case Intrinsic::amdgcn_tensor_store_from_lds_d2:
5375     case Intrinsic::amdgcn_tensor_load_to_lds:
5376     case Intrinsic::amdgcn_tensor_store_from_lds: {
5377       // Lie and claim everything is legal, even though all operands need
5378       // to be SGPRs. applyMapping will have to deal with it via readfirstlane.
5379       for (unsigned I = 1; I < MI.getNumOperands(); ++I) {
5380         if (MI.getOperand(I).isReg()) {
5381           Register Reg = MI.getOperand(I).getReg();
5382           auto OpBank = getRegBankID(Reg, MRI);
5383           unsigned Size = getSizeInBits(Reg, MRI, *TRI);
5384           OpdsMapping[I] = AMDGPU::getValueMapping(OpBank, Size);
5385         }
5386       }
5387       break;
5388     }
5389     case Intrinsic::amdgcn_s_prefetch_data: {
5390       OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
5391       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5392       break;
5393     }
5394     default:
5395       return getInvalidInstructionMapping();
5396     }
5397     break;
5398   }
5399   case AMDGPU::G_SELECT: {
5400     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
5401     unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
5402                                     AMDGPU::SGPRRegBankID);
5403     unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI,
5404                                     AMDGPU::SGPRRegBankID);
5405     bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID &&
5406                     Op3Bank == AMDGPU::SGPRRegBankID;
5407 
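         // The select can stay scalar only if both value inputs and the
         // condition are SGPRs; any VGPR input forces a VALU select with a
         // VCC condition.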
5408     unsigned CondBankDefault = SGPRSrcs ?
5409       AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
5410     unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI,
5411                                      CondBankDefault);
5412     if (CondBank == AMDGPU::SGPRRegBankID)
5413       CondBank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
5414     else if (CondBank == AMDGPU::VGPRRegBankID)
5415       CondBank = AMDGPU::VCCRegBankID;
5416 
5417     unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SGPRRegBankID ?
5418       AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
5419 
5420     assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SGPRRegBankID);
5421 
5422     // TODO: Should report 32-bit for scalar condition type.
5423     if (Size == 64) {
5424       OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
5425       OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
5426       OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
5427       OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
5428     } else {
5429       OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);
5430       OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
5431       OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);
5432       OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);
5433     }
5434 
5435     break;
5436   }
5437 
5438   case AMDGPU::G_SI_CALL: {
5439     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64);
5440     // Lie and claim everything is legal, even though some need to be
5441     // SGPRs. applyMapping will have to deal with it as a waterfall loop.
5442     OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
5443 
5444     // Allow anything for implicit arguments
5445     for (unsigned I = 4; I < MI.getNumOperands(); ++I) {
5446       if (MI.getOperand(I).isReg()) {
5447         Register Reg = MI.getOperand(I).getReg();
5448         auto OpBank = getRegBankID(Reg, MRI);
5449         unsigned Size = getSizeInBits(Reg, MRI, *TRI);
5450         OpdsMapping[I] = AMDGPU::getValueMapping(OpBank, Size);
5451       }
5452     }
5453     break;
5454   }
5455   case AMDGPU::G_LOAD:
5456   case AMDGPU::G_ZEXTLOAD:
5457   case AMDGPU::G_SEXTLOAD:
5458     return getInstrMappingForLoad(MI);
5459 
5460   case AMDGPU::G_ATOMICRMW_XCHG:
5461   case AMDGPU::G_ATOMICRMW_ADD:
5462   case AMDGPU::G_ATOMICRMW_SUB:
5463   case AMDGPU::G_ATOMICRMW_AND:
5464   case AMDGPU::G_ATOMICRMW_OR:
5465   case AMDGPU::G_ATOMICRMW_XOR:
5466   case AMDGPU::G_ATOMICRMW_MAX:
5467   case AMDGPU::G_ATOMICRMW_MIN:
5468   case AMDGPU::G_ATOMICRMW_UMAX:
5469   case AMDGPU::G_ATOMICRMW_UMIN:
5470   case AMDGPU::G_ATOMICRMW_FADD:
5471   case AMDGPU::G_ATOMICRMW_FMIN:
5472   case AMDGPU::G_ATOMICRMW_FMAX:
5473   case AMDGPU::G_ATOMICRMW_UINC_WRAP:
5474   case AMDGPU::G_ATOMICRMW_UDEC_WRAP:
5475   case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: {
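         // Atomic data operands are always VGPRs; the pointer mapping depends
         // on the address space and subtarget (see getValueMappingForPtr).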
5476     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
5477     OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
5478     OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5479     break;
5480   }
5481   case AMDGPU::G_ATOMIC_CMPXCHG: {
5482     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
5483     OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
5484     OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5485     OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
5486     break;
5487   }
5488   case AMDGPU::G_BRCOND: {
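         // A uniform 1-bit condition can stay on the SGPR bank (scalar
         // branch); anything else is treated as a divergent VCC condition.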
5489     unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI,
5490                                  AMDGPU::SGPRRegBankID);
5491     assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
5492     if (Bank != AMDGPU::SGPRRegBankID)
5493       Bank = AMDGPU::VCCRegBankID;
5494 
5495     OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
5496     break;
5497   }
5498   case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
5499     return getDefaultMappingVOP(MI);
5500   case AMDGPU::G_PREFETCH:
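         // The prefetch address must be a scalar (SGPR) value.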
5501     OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
5502     break;
5503   }
5504 
5505   return getInstructionMapping(/*ID*/1, /*Cost*/1,
5506                                getOperandsMapping(OpdsMapping),
5507                                MI.getNumOperands());
5508 }
5509