//===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the RegisterBankInfo class for
/// AMDGPU.
///
/// \par
///
/// AMDGPU has unique register bank constraints that require special high level
/// strategies to deal with. There are two main true physical register banks:
/// VGPR (vector) and SGPR (scalar). Additionally the VCC register bank is a
/// sort of pseudo-register bank needed to represent SGPRs used in a vector
/// boolean context. There is also the AGPR bank, which is a special purpose
/// physical register bank present on some subtargets.
///
/// Copying from VGPR to SGPR is generally illegal, unless the value is known to
/// be uniform. It is generally not valid to legalize operands by inserting
/// copies as on other targets. Operations which require uniform SGPR operands
/// generally require scalarization by repeatedly executing the instruction,
/// activating each set of lanes using a unique set of input values. This is
/// referred to as a waterfall loop.
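///
/// For example (a rough sketch; executeInWaterfallLoop in this file builds the
/// real control flow), a divergent value %v needed as an SGPR operand is
/// handled as:
///
///   save exec
///   loop:
///     %s    = v_readfirstlane %v    ; take one lane's value
///     %cond = (%v == %s)            ; mask of lanes sharing that value
///     exec &= %cond                 ; enable only those lanes
///     ... use %s as the uniform operand ...
///     retire the handled lanes from exec; loop while exec != 0
///   restore exec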
///
/// \par Booleans
///
/// Booleans (s1 values) require special consideration. A vector compare result
/// is naturally a bitmask with one bit per lane, in a 32 or 64-bit
/// register. These are represented with the VCC bank. During selection, we need
/// to be able to unambiguously go back from a register class to a register
/// bank. To distinguish whether an SGPR should use the SGPR or VCC register
/// bank, we need to know the type in the use context. An s1 value in an SGPR
/// always means a VCC bank value; any other type means the SGPR bank. A scalar
/// compare sets SCC, which is a 1-bit unaddressable register. This will need to
/// be copied to a 32-bit virtual register. Taken together, this means we need
/// to adjust the type of boolean operations to be regbank legal. All SALU
/// booleans need to be widened to 32 bits, and all VALU booleans need to be s1
/// values.
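///
/// For example, the same generic compare maps differently depending on
/// divergence (a sketch of post-RegBankSelect MIR):
///
///   %c:sgpr(s32) = G_ICMP intpred(eq), %a:sgpr(s32), %b:sgpr(s32) ; SALU
///   %c:vcc(s1)   = G_ICMP intpred(eq), %a:vgpr(s32), %b:vgpr(s32) ; VALU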
///
/// A noteworthy exception to the s1-means-vcc rule is for legalization artifact
/// casts. G_TRUNC s1 results, and G_SEXT/G_ZEXT/G_ANYEXT sources are never vcc
/// bank. A non-boolean source (such as a truncate from a 1-bit load from
/// memory) will require a copy to the VCC bank which will require clearing the
/// high bits and inserting a compare.
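///
/// For example, copying a 32-bit SGPR value with unknown high bits into a
/// vcc-bank s1 is lowered roughly as (a sketch):
///
///   s_and_b32 s0, s0, 1          ; clear the high bits
///   v_cmp_ne_u32 vcc, s0, 0      ; materialize the lane mask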
///
/// \par Constant bus restriction
///
/// VALU instructions have a limitation known as the constant bus
/// restriction. Most VALU instructions can use SGPR operands, but may read at
/// most 1 SGPR or constant literal value (this increases to 2 in gfx10 for most
/// instructions). This is one unique SGPR, so the same SGPR may be used for
/// multiple operands. From a register bank perspective, any combination of
/// operands should be legal as an SGPR, but this is contextually dependent on
/// the SGPR operands all being the same register. It is therefore optimal to
/// choose the SGPR with the most uses to minimize the number of copies.
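///
/// For example (illustrative, not exact assembly):
///
///   v_add_f32 v0, s0, s0   ; legal, one unique SGPR read twice
///   v_add_f32 v0, s0, s1   ; violates the restriction before gfx10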
///
/// We avoid trying to solve this problem in RegBankSelect. Any VALU G_*
/// operation should have its source operands all mapped to VGPRs (except for
/// VCC), inserting copies from any SGPR operands. This is the most trivial
/// legal mapping. Anything beyond the simplest 1:1 instruction selection would
/// be too complicated to solve here. Every optimization pattern or instruction
/// selected to multiple outputs would have to enforce this rule, and there
/// would be additional complexity in tracking this rule for every G_*
/// operation. By forcing all inputs to VGPRs, it also simplifies the task of
/// picking the optimal operand combination from a post-isel optimization pass.
///
//===----------------------------------------------------------------------===//

#include "AMDGPURegisterBankInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/RegisterBank.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define GET_TARGET_REGBANK_IMPL
#include "AMDGPUGenRegisterBank.inc"

// This file will be TableGen'ed at some point.
#include "AMDGPUGenRegisterBankInfo.def"

using namespace llvm;
using namespace MIPatternMatch;

namespace {

// Observer to apply a register bank to new registers created by
// LegalizerHelper.
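//
// Typical use, as seen later in this file: install the observer by
// constructing it, run a LegalizerHelper step, and let the destructor assign
// the bank to every register the helper created:
//
//   ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::SGPRRegBank);
//   LegalizerHelper Helper(B.getMF(), ApplyBank, B);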
class ApplyRegBankMapping final : public GISelChangeObserver {
private:
  MachineIRBuilder &B;
  const AMDGPURegisterBankInfo &RBI;
  MachineRegisterInfo &MRI;
  const RegisterBank *NewBank;
  SmallVector<MachineInstr *, 4> NewInsts;

public:
  ApplyRegBankMapping(MachineIRBuilder &B, const AMDGPURegisterBankInfo &RBI_,
                      MachineRegisterInfo &MRI_, const RegisterBank *RB)
      : B(B), RBI(RBI_), MRI(MRI_), NewBank(RB) {
    assert(!B.isObservingChanges());
    B.setChangeObserver(*this);
  }

  ~ApplyRegBankMapping() override {
    for (MachineInstr *MI : NewInsts)
      applyBank(*MI);

    B.stopObservingChanges();
  }

  /// Set any registers that don't have a set register class or bank to SALU.
  void applyBank(MachineInstr &MI) {
    const unsigned Opc = MI.getOpcode();
    if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT ||
        Opc == AMDGPU::G_SEXT) {
      // LegalizerHelper wants to use the basic legalization artifacts when
      // widening etc. We don't handle selection with vcc in artifact sources,
      // so we need to use a select instead to handle these properly.
      Register DstReg = MI.getOperand(0).getReg();
      Register SrcReg = MI.getOperand(1).getReg();
      const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI);
      if (SrcBank == &AMDGPU::VCCRegBank) {
        const LLT S32 = LLT::scalar(32);
        assert(MRI.getType(SrcReg) == LLT::scalar(1));
        assert(MRI.getType(DstReg) == S32);
        assert(NewBank == &AMDGPU::VGPRRegBank);

        // Replace the extension with a select, which really uses the boolean
        // source.
        B.setInsertPt(*MI.getParent(), MI);

        auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? -1 : 1);
        auto False = B.buildConstant(S32, 0);
        B.buildSelect(DstReg, SrcReg, True, False);
        MRI.setRegBank(True.getReg(0), *NewBank);
        MRI.setRegBank(False.getReg(0), *NewBank);
        MI.eraseFromParent();
      }

      assert(!MRI.getRegClassOrRegBank(DstReg));
      MRI.setRegBank(DstReg, *NewBank);
      return;
    }

#ifndef NDEBUG
    if (Opc == AMDGPU::G_TRUNC) {
      Register DstReg = MI.getOperand(0).getReg();
      const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI);
      assert(DstBank != &AMDGPU::VCCRegBank);
    }
#endif

    for (MachineOperand &Op : MI.operands()) {
      if (!Op.isReg())
        continue;

      // We may see physical registers if building a real MI.
      Register Reg = Op.getReg();
      if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg))
        continue;

      const RegisterBank *RB = NewBank;
      if (MRI.getType(Reg) == LLT::scalar(1)) {
        assert(NewBank == &AMDGPU::VGPRRegBank &&
               "s1 operands should only be used for vector bools");
        assert((MI.getOpcode() != AMDGPU::G_TRUNC &&
                MI.getOpcode() != AMDGPU::G_ANYEXT) &&
               "not expecting legalization artifacts here");
        RB = &AMDGPU::VCCRegBank;
      }

      MRI.setRegBank(Reg, *RB);
    }
  }

  void erasingInstr(MachineInstr &MI) override {}

  void createdInstr(MachineInstr &MI) override {
    // At this point, the instruction was just inserted and has no operands.
    NewInsts.push_back(&MI);
  }

  void changingInstr(MachineInstr &MI) override {}
  void changedInstr(MachineInstr &MI) override {
    // FIXME: In principle we should probably add the instruction to NewInsts,
    // but the way the LegalizerHelper uses the observer, we will always see the
    // registers we need to set the regbank on also referenced in a new
    // instruction.
  }
};

} // anonymous namespace

AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
    : Subtarget(ST), TRI(Subtarget.getRegisterInfo()),
      TII(Subtarget.getInstrInfo()) {

  // HACK: Until this is fully tablegen'd.
  static llvm::once_flag InitializeRegisterBankFlag;

  static auto InitializeRegisterBankOnce = [this]() {
    assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank &&
           &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank &&
           &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank);
    (void)this;
  };

  llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce);
}

static bool isVectorRegisterBank(const RegisterBank &Bank) {
  unsigned BankID = Bank.getID();
  return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID;
}

bool AMDGPURegisterBankInfo::isDivergentRegBank(const RegisterBank *RB) const {
  return RB != &AMDGPU::SGPRRegBank;
}

unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
                                          const RegisterBank &Src,
                                          TypeSize Size) const {
  // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
  if (Dst.getID() == AMDGPU::SGPRRegBankID &&
      (isVectorRegisterBank(Src) || Src.getID() == AMDGPU::VCCRegBankID)) {
    return std::numeric_limits<unsigned>::max();
  }

  // Bool values are tricky, because the meaning is based on context. The SCC
  // and VCC banks are for the natural scalar and vector conditions produced by
  // a compare.
  //
  // Legalization doesn't know about the necessary context, so an s1 use may
  // have been a truncate from an arbitrary value, in which case a copy (lowered
  // as a compare with 0) needs to be inserted.
  if (Size == 1 &&
      (Dst.getID() == AMDGPU::SGPRRegBankID) &&
      (isVectorRegisterBank(Src) ||
       Src.getID() == AMDGPU::SGPRRegBankID ||
       Src.getID() == AMDGPU::VCCRegBankID))
    return std::numeric_limits<unsigned>::max();

  // There is no direct copy between AGPRs.
  if (Dst.getID() == AMDGPU::AGPRRegBankID &&
      Src.getID() == AMDGPU::AGPRRegBankID)
    return 4;

  return RegisterBankInfo::copyCost(Dst, Src, Size);
}

unsigned AMDGPURegisterBankInfo::getBreakDownCost(
  const ValueMapping &ValMapping,
  const RegisterBank *CurBank) const {
  // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
  // VGPR.
  // FIXME: Is there a better way to do this?
  if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
    return 10; // This is expensive.

  assert(ValMapping.NumBreakDowns == 2 &&
         ValMapping.BreakDown[0].Length == 32 &&
         ValMapping.BreakDown[0].StartIdx == 0 &&
         ValMapping.BreakDown[1].Length == 32 &&
         ValMapping.BreakDown[1].StartIdx == 32 &&
         ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);

  // 32-bit extract of a 64-bit value is just access of a subregister, so free.
  // TODO: Cost of 0 hits assert, though it's not clear it's what we really
  // want.

  // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
  // alignment restrictions, but this probably isn't important.
  return 1;
}

const RegisterBank &
AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
                                               LLT Ty) const {
  if (&RC == &AMDGPU::SReg_1RegClass)
    return AMDGPU::VCCRegBank;

  // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a
  // VCC-like use.
  if (TRI->isSGPRClass(&RC)) {
    // FIXME: This probably came from a copy from a physical register, which
    // should be inferable from the copied to-type. We don't have many boolean
    // physical register constraints so just assume a normal SGPR for now.
    if (!Ty.isValid())
      return AMDGPU::SGPRRegBank;

    return Ty == LLT::scalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
  }

  return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank;
}

template <unsigned NumOps>
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::addMappingFromTable(
    const MachineInstr &MI, const MachineRegisterInfo &MRI,
    const std::array<unsigned, NumOps> RegSrcOpIdx,
    ArrayRef<OpRegBankEntry<NumOps>> Table) const {

  InstructionMappings AltMappings;

  SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());

  unsigned Sizes[NumOps];
  for (unsigned I = 0; I < NumOps; ++I) {
    Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
    Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
  }

  for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
    unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
    Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
  }

  // getInstrMapping's default mapping uses ID 1, so start at 2.
  unsigned MappingID = 2;
  for (const auto &Entry : Table) {
    for (unsigned I = 0; I < NumOps; ++I) {
      int OpIdx = RegSrcOpIdx[I];
      Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
    }

    AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
                                                 getOperandsMapping(Operands),
                                                 Operands.size()));
  }

  return AltMappings;
}

RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
    const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
  switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
  case Intrinsic::amdgcn_readlane: {
    static const OpRegBankEntry<3> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      // Need a readfirstlane for the index.
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
    };

    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, Table);
  }
  case Intrinsic::amdgcn_writelane: {
    static const OpRegBankEntry<4> Table[4] = {
      // Perfectly legal.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Need readfirstlane of first op
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },

      // Need readfirstlane of second op
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },

      // Need readfirstlane of both ops
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
    };

    // dst, value, lane, previous value
    const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
    return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, Table);
  }
  default:
    return RegisterBankInfo::getInstrAlternativeMappings(MI);
  }
}

RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
    const MachineInstr &MI, const MachineRegisterInfo &MRI) const {

  switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
  case Intrinsic::amdgcn_s_buffer_load: {
    static const OpRegBankEntry<2> Table[4] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      // Only need 1 register in loop
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },

      // Have to waterfall the resource.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },

      // Have to waterfall the resource, and the offset.
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }
    };

    // rsrc, offset
    const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
    return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, Table);
  }
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {
    // VGPR = M0, VGPR
    static const OpRegBankEntry<3> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID  }, 1 },

      // Need a readfirstlane for m0
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
    };

    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, Table);
  }
  case Intrinsic::amdgcn_s_sendmsg:
  case Intrinsic::amdgcn_s_sendmsghalt: {
    // FIXME: Should have no register for immediate
    static const OpRegBankEntry<1> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID }, 1 },

      // Need readlane
      { { AMDGPU::VGPRRegBankID }, 3 }
    };

    const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
    return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, Table);
  }
  default:
    return RegisterBankInfo::getInstrAlternativeMappings(MI);
  }
}

// FIXME: Returns uniform if there's no source value information. This is
// probably wrong.
bool AMDGPURegisterBankInfo::isScalarLoadLegal(const MachineInstr &MI) const {
  if (!MI.hasOneMemOperand())
    return false;

  const MachineMemOperand *MMO = *MI.memoperands_begin();
  const unsigned AS = MMO->getAddrSpace();
  const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||
                       AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
  const unsigned MemSize = 8 * MMO->getSize().getValue();

  // Require 4-byte alignment.
  return (MMO->getAlign() >= Align(4) ||
          (Subtarget.hasScalarSubwordLoads() &&
           ((MemSize == 16 && MMO->getAlign() >= Align(2)) ||
            (MemSize == 8 && MMO->getAlign() >= Align(1))))) &&
         // Can't do a scalar atomic load.
         !MMO->isAtomic() &&
         // Don't use scalar loads for volatile accesses to non-constant address
         // spaces.
         (IsConst || !MMO->isVolatile()) &&
         // Memory must be known constant, or not written before this load.
         (IsConst || MMO->isInvariant() || (MMO->getFlags() & MONoClobber)) &&
         AMDGPUInstrInfo::isUniformMMO(MMO);
}

RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappings(
    const MachineInstr &MI) const {

  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();


  InstructionMappings AltMappings;
  switch (MI.getOpcode()) {
  case TargetOpcode::G_CONSTANT:
  case TargetOpcode::G_IMPLICIT_DEF: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    if (Size == 1) {
      static const OpRegBankEntry<1> Table[3] = {
        { { AMDGPU::VGPRRegBankID }, 1 },
        { { AMDGPU::SGPRRegBankID }, 1 },
        { { AMDGPU::VCCRegBankID }, 1 }
      };

      return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
    }

    [[fallthrough]];
  }
  case TargetOpcode::G_FCONSTANT:
  case TargetOpcode::G_FRAME_INDEX:
  case TargetOpcode::G_GLOBAL_VALUE: {
    static const OpRegBankEntry<1> Table[2] = {
      { { AMDGPU::VGPRRegBankID }, 1 },
      { { AMDGPU::SGPRRegBankID }, 1 }
    };

    return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
  }
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);

    if (Size == 1) {
      // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
      const InstructionMapping &SCCMapping = getInstructionMapping(
        1, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32)}),
        3); // Num Operands
      AltMappings.push_back(&SCCMapping);

      const InstructionMapping &VCCMapping0 = getInstructionMapping(
        2, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
        3); // Num Operands
      AltMappings.push_back(&VCCMapping0);
      return AltMappings;
    }

    if (Size != 64)
      break;

    const InstructionMapping &SSMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(
      2, 2, getOperandsMapping(
        {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&VVMapping);
    break;
  }
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_ZEXTLOAD:
  case TargetOpcode::G_SEXTLOAD: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
    unsigned PtrSize = PtrTy.getSizeInBits();
    unsigned AS = PtrTy.getAddressSpace();

    if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
         AS != AMDGPUAS::PRIVATE_ADDRESS) &&
        isScalarLoadLegal(MI)) {
      const InstructionMapping &SSMapping = getInstructionMapping(
          1, 1, getOperandsMapping(
                    {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                     AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}),
          2); // Num Operands
      AltMappings.push_back(&SSMapping);
    }

    const InstructionMapping &VVMapping = getInstructionMapping(
        2, 1,
        getOperandsMapping(
            {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
             AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),
        2); // Num Operands
    AltMappings.push_back(&VVMapping);

    // It may be possible to have a vgpr = load sgpr mapping here, because
    // the mubuf instructions support this kind of load, but probably for only
    // gfx7 and older.  However, the addressing mode matching in the instruction
    // selector should be able to do a better job of detecting and selecting
    // these kinds of loads from the vgpr = load vgpr mapping.

    return AltMappings;

  }
  case TargetOpcode::G_SELECT: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&VVMapping);

    return AltMappings;
  }
  case TargetOpcode::G_UADDE:
  case TargetOpcode::G_USUBE:
  case TargetOpcode::G_SADDE:
  case TargetOpcode::G_SSUBE: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1)}),
      5); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
      5); // Num Operands
    AltMappings.push_back(&VVMapping);
    return AltMappings;
  }
  case AMDGPU::G_BRCOND: {
    assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);

    // TODO: Change type to 32 for scalar
    const InstructionMapping &SMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), nullptr}),
      2); // Num Operands
    AltMappings.push_back(&SMapping);

    const InstructionMapping &VMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr }),
      2); // Num Operands
    AltMappings.push_back(&VMapping);
    return AltMappings;
  }
  case AMDGPU::G_INTRINSIC:
  case AMDGPU::G_INTRINSIC_CONVERGENT:
    return getInstrAlternativeMappingsIntrinsic(MI, MRI);
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
  case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
    return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
  default:
    break;
  }
  return RegisterBankInfo::getInstrAlternativeMappings(MI);
}

void AMDGPURegisterBankInfo::split64BitValueForMapping(
  MachineIRBuilder &B,
  SmallVector<Register, 2> &Regs,
  LLT HalfTy,
  Register Reg) const {
  assert(HalfTy.getSizeInBits() == 32);
  MachineRegisterInfo *MRI = B.getMRI();
  Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
  Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
  const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
  MRI->setRegBank(LoLHS, *Bank);
  MRI->setRegBank(HiLHS, *Bank);

  Regs.push_back(LoLHS);
  Regs.push_back(HiLHS);

  B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
    .addDef(LoLHS)
    .addDef(HiLHS)
    .addUse(Reg);
}

/// Replace the current type each register in \p Regs has with \p NewTy.
static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
                          LLT NewTy) {
  for (Register Reg : Regs) {
    assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
    MRI.setType(Reg, NewTy);
  }
}

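// Return a type covering half of \p Ty: half the element count for vectors,
// half the scalar width otherwise. For example, getHalfSizedType(v4s32) is
// v2s32, and getHalfSizedType(s64) is s32.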
static LLT getHalfSizedType(LLT Ty) {
  if (Ty.isVector()) {
    assert(Ty.getElementCount().isKnownMultipleOf(2));
    return LLT::scalarOrVector(Ty.getElementCount().divideCoefficientBy(2),
                               Ty.getElementType());
  }

  assert(Ty.getScalarSizeInBits() % 2 == 0);
  return LLT::scalar(Ty.getScalarSizeInBits() / 2);
}

// Build one or more V_READFIRSTLANE_B32 instructions to move the given vector
// source value into a scalar register.
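// For example, a 64-bit VGPR source is unmerged into two 32-bit parts, each
// part is read with V_READFIRSTLANE_B32, and the results are merged back into
// a 64-bit SGPR value.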
Register AMDGPURegisterBankInfo::buildReadFirstLane(MachineIRBuilder &B,
                                                    MachineRegisterInfo &MRI,
                                                    Register Src) const {
  LLT Ty = MRI.getType(Src);
  const RegisterBank *Bank = getRegBank(Src, MRI, *TRI);

  if (Bank == &AMDGPU::SGPRRegBank)
    return Src;

  unsigned Bits = Ty.getSizeInBits();
  assert(Bits % 32 == 0);

  if (Bank != &AMDGPU::VGPRRegBank) {
    // We need to copy from AGPR to VGPR
    Src = B.buildCopy(Ty, Src).getReg(0);
    MRI.setRegBank(Src, AMDGPU::VGPRRegBank);
  }

  LLT S32 = LLT::scalar(32);
  unsigned NumParts = Bits / 32;
  SmallVector<Register, 8> SrcParts;
  SmallVector<Register, 8> DstParts;

  if (Bits == 32) {
    SrcParts.push_back(Src);
  } else {
    auto Unmerge = B.buildUnmerge(S32, Src);
    for (unsigned i = 0; i < NumParts; ++i)
      SrcParts.push_back(Unmerge.getReg(i));
  }

  for (unsigned i = 0; i < NumParts; ++i) {
    Register SrcPart = SrcParts[i];
    Register DstPart = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    MRI.setType(DstPart, NumParts == 1 ? Ty : S32);

    const TargetRegisterClass *Constrained =
        constrainGenericRegister(SrcPart, AMDGPU::VGPR_32RegClass, MRI);
    (void)Constrained;
    assert(Constrained && "Failed to constrain readfirstlane src reg");

    B.buildInstr(AMDGPU::V_READFIRSTLANE_B32, {DstPart}, {SrcPart});

    DstParts.push_back(DstPart);
  }

  if (Bits == 32)
    return DstParts[0];

  Register Dst = B.buildMergeLikeInstr(Ty, DstParts).getReg(0);
  MRI.setRegBank(Dst, AMDGPU::SGPRRegBank);
  return Dst;
}

/// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
/// any of the required SGPR operands are VGPRs, perform a waterfall loop to
/// execute the instruction for each unique combination of values in all lanes
/// in the wave. The block will be split such that the rest of the instructions
/// are moved to a new block.
///
/// Essentially performs this loop:
///
/// Save Execution Mask
/// For (Lane : Wavefront) {
///   Enable Lane, Disable all other lanes
///   SGPR = read SGPR value for current lane from VGPR
///   VGPRResult[Lane] = use_op SGPR
/// }
/// Restore Execution Mask
///
/// There is additional complexity in comparing the operand values to identify
/// the unique values actually used.
bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
    MachineIRBuilder &B, iterator_range<MachineBasicBlock::iterator> Range,
    SmallSet<Register, 4> &SGPROperandRegs) const {
  // Track use registers which have already been expanded with a readfirstlane
  // sequence. This may have multiple uses if moving a sequence.
  DenseMap<Register, Register> WaterfalledRegMap;

  MachineBasicBlock &MBB = B.getMBB();
  MachineFunction *MF = &B.getMF();

  const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
  const unsigned MovExecOpc =
      Subtarget.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
  const unsigned MovExecTermOpc =
      Subtarget.isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;

  const unsigned XorTermOpc = Subtarget.isWave32() ?
    AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
  const unsigned AndSaveExecOpc = Subtarget.isWave32() ?
    AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
  const unsigned ExecReg = Subtarget.isWave32() ?
    AMDGPU::EXEC_LO : AMDGPU::EXEC;

#ifndef NDEBUG
  const int OrigRangeSize = std::distance(Range.begin(), Range.end());
#endif

  MachineRegisterInfo &MRI = *B.getMRI();
  Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
  Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);

  // Don't bother using generic instructions/registers for the exec mask.
  B.buildInstr(TargetOpcode::IMPLICIT_DEF)
    .addDef(InitSaveExecReg);

  Register PhiExec = MRI.createVirtualRegister(WaveRC);
  Register NewExec = MRI.createVirtualRegister(WaveRC);

  // To insert the loop we need to split the block. Move everything before this
  // point to a new block, and insert a new empty block before this instruction.
  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *BodyBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;
  MF->insert(MBBI, LoopBB);
  MF->insert(MBBI, BodyBB);
  MF->insert(MBBI, RestoreExecBB);
  MF->insert(MBBI, RemainderBB);

  LoopBB->addSuccessor(BodyBB);
  BodyBB->addSuccessor(RestoreExecBB);
  BodyBB->addSuccessor(LoopBB);

  // Move the rest of the block into a new block.
  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
  RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());

  MBB.addSuccessor(LoopBB);
  RestoreExecBB->addSuccessor(RemainderBB);

  B.setInsertPt(*LoopBB, LoopBB->end());

  B.buildInstr(TargetOpcode::PHI)
      .addDef(PhiExec)
      .addReg(InitSaveExecReg)
      .addMBB(&MBB)
      .addReg(NewExec)
      .addMBB(BodyBB);

  const DebugLoc &DL = B.getDL();

  MachineInstr &FirstInst = *Range.begin();

  // Move the instruction into the loop body. Note we moved everything after
  // Range.end() already into a new block, so Range.end() is no longer valid.
  BodyBB->splice(BodyBB->end(), &MBB, Range.begin(), MBB.end());

  // Figure out the iterator range after splicing the instructions.
  MachineBasicBlock::iterator NewBegin = FirstInst.getIterator();
  auto NewEnd = BodyBB->end();

  B.setMBB(*LoopBB);

  LLT S1 = LLT::scalar(1);
  Register CondReg;

  assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);

  for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
    for (MachineOperand &Op : MI.all_uses()) {
      Register OldReg = Op.getReg();
      if (!SGPROperandRegs.count(OldReg))
        continue;

      // See if we already processed this register in another instruction in the
      // sequence.
      auto OldVal = WaterfalledRegMap.find(OldReg);
      if (OldVal != WaterfalledRegMap.end()) {
        Op.setReg(OldVal->second);
        continue;
      }

      Register OpReg = Op.getReg();
      LLT OpTy = MRI.getType(OpReg);

      const RegisterBank *OpBank = getRegBank(OpReg, MRI, *TRI);
      if (OpBank != &AMDGPU::VGPRRegBank) {
        // Insert copy from AGPR to VGPR before the loop.
        B.setMBB(MBB);
        OpReg = B.buildCopy(OpTy, OpReg).getReg(0);
        MRI.setRegBank(OpReg, AMDGPU::VGPRRegBank);
        B.setMBB(*LoopBB);
      }

      Register CurrentLaneReg = buildReadFirstLane(B, MRI, OpReg);

      // Build the comparison(s).
      unsigned OpSize = OpTy.getSizeInBits();
      bool Is64 = OpSize % 64 == 0;
      unsigned PartSize = Is64 ? 64 : 32;
      LLT PartTy = LLT::scalar(PartSize);
      unsigned NumParts = OpSize / PartSize;
      SmallVector<Register, 8> OpParts;
      SmallVector<Register, 8> CurrentLaneParts;

      if (NumParts == 1) {
        OpParts.push_back(OpReg);
        CurrentLaneParts.push_back(CurrentLaneReg);
      } else {
        auto UnmergeOp = B.buildUnmerge(PartTy, OpReg);
        auto UnmergeCurrentLane = B.buildUnmerge(PartTy, CurrentLaneReg);
        for (unsigned i = 0; i < NumParts; ++i) {
          OpParts.push_back(UnmergeOp.getReg(i));
          CurrentLaneParts.push_back(UnmergeCurrentLane.getReg(i));
          MRI.setRegBank(OpParts[i], AMDGPU::VGPRRegBank);
          MRI.setRegBank(CurrentLaneParts[i], AMDGPU::SGPRRegBank);
        }
      }

      for (unsigned i = 0; i < NumParts; ++i) {
        auto CmpReg = B.buildICmp(CmpInst::ICMP_EQ, S1, CurrentLaneParts[i],
                                  OpParts[i]).getReg(0);
        MRI.setRegBank(CmpReg, AMDGPU::VCCRegBank);

        if (!CondReg) {
          CondReg = CmpReg;
        } else {
          CondReg = B.buildAnd(S1, CondReg, CmpReg).getReg(0);
          MRI.setRegBank(CondReg, AMDGPU::VCCRegBank);
        }
      }

      Op.setReg(CurrentLaneReg);

      // Make sure we don't re-process this register again.
      WaterfalledRegMap.insert(std::pair(OldReg, Op.getReg()));
    }
  }

  // The ballot becomes a no-op during instruction selection.
  CondReg = B.buildIntrinsic(Intrinsic::amdgcn_ballot,
                             {LLT::scalar(Subtarget.isWave32() ? 32 : 64)})
                .addReg(CondReg)
                .getReg(0);
  MRI.setRegClass(CondReg, WaveRC);

  // Update EXEC, save the original EXEC value to VCC.
  B.buildInstr(AndSaveExecOpc)
    .addDef(NewExec)
    .addReg(CondReg, RegState::Kill);

  MRI.setSimpleHint(NewExec, CondReg);

  B.setInsertPt(*BodyBB, BodyBB->end());

  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
  B.buildInstr(XorTermOpc)
    .addDef(ExecReg)
    .addReg(ExecReg)
    .addReg(NewExec);

  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
  // s_cbranch_scc0?

  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
  B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);

  // Save the EXEC mask before the loop.
  BuildMI(MBB, MBB.end(), DL, TII->get(MovExecOpc), SaveExecReg)
    .addReg(ExecReg);

  // Restore the EXEC mask after the loop.
  B.setMBB(*RestoreExecBB);
  B.buildInstr(MovExecTermOpc)
    .addDef(ExecReg)
    .addReg(SaveExecReg);

  // Set the insert point after the original instruction, so any new
  // instructions will be in the remainder.
  B.setInsertPt(*RemainderBB, RemainderBB->begin());

  return true;
}

// Return any unique registers used by \p MI at \p OpIndices that need to be
// handled in a waterfall loop. Returns these registers in \p
// SGPROperandRegs. Returns true if there are any operands to handle and a
// waterfall loop is necessary.
bool AMDGPURegisterBankInfo::collectWaterfallOperands(
  SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI,
  MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const {
  for (unsigned Op : OpIndices) {
    assert(MI.getOperand(Op).isUse());
    Register Reg = MI.getOperand(Op).getReg();
    const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
    if (OpBank->getID() != AMDGPU::SGPRRegBankID)
      SGPROperandRegs.insert(Reg);
  }

  // No operands need to be replaced, so no need to loop.
  return !SGPROperandRegs.empty();
}

bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
    MachineIRBuilder &B, MachineInstr &MI, ArrayRef<unsigned> OpIndices) const {
  // Use a set to avoid extra readfirstlanes in the case where multiple operands
  // are the same register.
  SmallSet<Register, 4> SGPROperandRegs;

  if (!collectWaterfallOperands(SGPROperandRegs, MI, *B.getMRI(), OpIndices))
    return false;

  MachineBasicBlock::iterator I = MI.getIterator();
  return executeInWaterfallLoop(B, make_range(I, std::next(I)),
                                SGPROperandRegs);
}

// Legalize an operand that must be an SGPR by inserting a readfirstlane.
void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
    MachineIRBuilder &B, MachineInstr &MI, unsigned OpIdx) const {
  Register Reg = MI.getOperand(OpIdx).getReg();
  MachineRegisterInfo &MRI = *B.getMRI();
  const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
  if (Bank == &AMDGPU::SGPRRegBank)
    return;

  Reg = buildReadFirstLane(B, MRI, Reg);
  MI.getOperand(OpIdx).setReg(Reg);
}

/// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the
/// rest will be in the remainder.
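/// For example, splitUnequalType(s96, 64) is {s64, s32}, and
/// splitUnequalType(v3s32, 64) is {v2s32, s32}.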
static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) {
  unsigned TotalSize = Ty.getSizeInBits();
  if (!Ty.isVector())
    return {LLT::scalar(FirstSize), LLT::scalar(TotalSize - FirstSize)};

  LLT EltTy = Ty.getElementType();
  unsigned EltSize = EltTy.getSizeInBits();
  assert(FirstSize % EltSize == 0);

  unsigned FirstPartNumElts = FirstSize / EltSize;
  unsigned RemainderElts = (TotalSize - FirstSize) / EltSize;

  return {LLT::scalarOrVector(ElementCount::getFixed(FirstPartNumElts), EltTy),
          LLT::scalarOrVector(ElementCount::getFixed(RemainderElts), EltTy)};
}

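// Widen a 96-bit type to 128 bits, e.g. s96 -> s128 and v3s32 -> v4s32.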
static LLT widen96To128(LLT Ty) {
  if (!Ty.isVector())
    return LLT::scalar(128);

  LLT EltTy = Ty.getElementType();
  assert(128 % EltTy.getSizeInBits() == 0);
  return LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);
}

bool AMDGPURegisterBankInfo::applyMappingLoad(
    MachineIRBuilder &B,
    const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
    MachineInstr &MI) const {
  MachineRegisterInfo &MRI = *B.getMRI();
  Register DstReg = MI.getOperand(0).getReg();
  const LLT LoadTy = MRI.getType(DstReg);
  unsigned LoadSize = LoadTy.getSizeInBits();
  const unsigned MaxNonSmrdLoadSize = 128;

  const RegisterBank *DstBank =
      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
  if (DstBank == &AMDGPU::SGPRRegBank) {
    // There are some special cases that we need to look at for 32-bit and
    // 96-bit SGPR loads; otherwise we have nothing to do.
    if (LoadSize != 32 && (LoadSize != 96 || Subtarget.hasScalarDwordx3Loads()))
      return false;

    MachineMemOperand *MMO = *MI.memoperands_begin();
    const unsigned MemSize = 8 * MMO->getSize().getValue();
    // Scalar loads of size 8 or 16 bit with proper alignment may be widened to
    // 32 bit. Check to see if we need to widen the memory access: 8 or 16 bit
    // scalar loads should have a load size of 32 but a memory access size of
    // less than 32.
    if (LoadSize == 32 &&
        (MemSize == 32 || LoadTy.isVector() || !isScalarLoadLegal(MI)))
      return false;

    if (LoadSize == 32 &&
        ((MemSize == 8 && MMO->getAlign() >= Align(1)) ||
         (MemSize == 16 && MMO->getAlign() >= Align(2))) &&
        isScalarLoadLegal(MI) &&
        Subtarget.getGeneration() >= AMDGPUSubtarget::GFX12)
      return false;

    Register PtrReg = MI.getOperand(1).getReg();

    ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank);

    if (LoadSize == 32) {
      // This is an extending load from a sub-dword size. Widen the memory
      // access size to 4 bytes and clear the extra high bits appropriately.
      const LLT S32 = LLT::scalar(32);
      if (MI.getOpcode() == AMDGPU::G_SEXTLOAD) {
        // Must extend the sign bit into higher bits for a G_SEXTLOAD
        auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
        B.buildSExtInReg(MI.getOperand(0), WideLoad, MemSize);
      } else if (MI.getOpcode() == AMDGPU::G_ZEXTLOAD) {
        // Must extend zero into higher bits with an AND for a G_ZEXTLOAD
        auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
        B.buildZExtInReg(MI.getOperand(0), WideLoad, MemSize);
      } else
        // We do not need to touch the higher bits for regular loads.
        B.buildLoadFromOffset(MI.getOperand(0), PtrReg, *MMO, 0);
    } else {
      // 96-bit loads are only available for vector loads. We need to split this
      // into a 64-bit part and a 32-bit part (unless we can widen to a 128-bit
      // load).
      if (MMO->getAlign() < Align(16)) {
        LegalizerHelper Helper(B.getMF(), ApplyBank, B);
        LLT Part64, Part32;
        std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64);
        if (Helper.reduceLoadStoreWidth(cast<GAnyLoad>(MI), 0, Part64) !=
            LegalizerHelper::Legalized)
          return false;
        return true;
      }
      LLT WiderTy = widen96To128(LoadTy);
      auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0);
      if (WiderTy.isScalar()) {
        B.buildTrunc(MI.getOperand(0), WideLoad);
      } else {
        B.buildDeleteTrailingVectorElements(MI.getOperand(0).getReg(),
                                            WideLoad);
      }
    }

    MI.eraseFromParent();
    return true;
  }

  // 128-bit loads are supported for all instruction types.
  if (LoadSize <= MaxNonSmrdLoadSize)
    return false;

  SmallVector<Register, 16> DefRegs(OpdMapper.getVRegs(0));
  SmallVector<Register, 1> SrcRegs(OpdMapper.getVRegs(1));

  if (SrcRegs.empty())
    SrcRegs.push_back(MI.getOperand(1).getReg());

  assert(LoadSize % MaxNonSmrdLoadSize == 0);

  // RegBankSelect only emits scalar types, so we need to reset the pointer
  // operand to a pointer type.
  Register BasePtrReg = SrcRegs[0];
  LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
  MRI.setType(BasePtrReg, PtrTy);

  unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize;
  const LLT LoadSplitTy = LoadTy.divide(NumSplitParts);
  ApplyRegBankMapping O(B, *this, MRI, &AMDGPU::VGPRRegBank);
  LegalizerHelper Helper(B.getMF(), O, B);

  if (LoadTy.isVector()) {
    if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) !=
        LegalizerHelper::Legalized)
      return false;
  } else {
    if (Helper.narrowScalar(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
      return false;
  }

  MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
  return true;
}

bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc(
    MachineIRBuilder &B,
    const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
    MachineInstr &MI) const {
  MachineRegisterInfo &MRI = *B.getMRI();
  const MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const auto &TFI = *ST.getFrameLowering();

  // Guard in case the stack growth direction ever changes with scratch
  // instructions.
  if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown)
    return false;

  Register Dst = MI.getOperand(0).getReg();
  Register AllocSize = MI.getOperand(1).getReg();
  Align Alignment = assumeAligned(MI.getOperand(2).getImm());

  const RegisterBank *SizeBank = getRegBank(AllocSize, MRI, *TRI);

  // TODO: Need to emit a wave reduction to get the maximum size.
  if (SizeBank != &AMDGPU::SGPRRegBank)
    return false;

  LLT PtrTy = MRI.getType(Dst);
  LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());

  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  Register SPReg = Info->getStackPtrOffsetReg();
  ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::SGPRRegBank);

  auto WaveSize = B.buildConstant(LLT::scalar(32), ST.getWavefrontSizeLog2());
  auto ScaledSize = B.buildShl(IntPtrTy, AllocSize, WaveSize);

  auto SPCopy = B.buildCopy(PtrTy, SPReg);
  if (Alignment > TFI.getStackAlign()) {
    auto PtrAdd = B.buildPtrAdd(PtrTy, SPCopy, ScaledSize);
    B.buildMaskLowPtrBits(Dst, PtrAdd,
                          Log2(Alignment) + ST.getWavefrontSizeLog2());
  } else {
    B.buildPtrAdd(Dst, SPCopy, ScaledSize);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPURegisterBankInfo::applyMappingImage(
    MachineIRBuilder &B, MachineInstr &MI,
    const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
    int RsrcIdx) const {
  const int NumDefs = MI.getNumExplicitDefs();

  // The reported argument index is relative to the IR intrinsic call arguments,
  // so we need to shift by the number of defs and the intrinsic ID.
  RsrcIdx += NumDefs + 1;

  // Insert copies to VGPR arguments.
  applyDefaultMapping(OpdMapper);

  // Fixup any SGPR arguments.
  SmallVector<unsigned, 4> SGPRIndexes;
  for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) {
    if (!MI.getOperand(I).isReg())
      continue;

    // If this intrinsic has a sampler, it immediately follows rsrc.
    if (I == RsrcIdx || I == RsrcIdx + 1)
      SGPRIndexes.push_back(I);
  }

  executeInWaterfallLoop(B, MI, SGPRIndexes);
  return true;
}

// Analyze a combined offset from an llvm.amdgcn.s.buffer intrinsic and store
// the three offsets (voffset, soffset and instoffset).
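// For example (a sketch; the exact split depends on splitMUBUFOffset and the
// subtarget), a constant combined offset of 4100 could come back as
// SOffsetReg = 4096, InstOffsetVal = 4, and VOffsetReg = 0.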
unsigned AMDGPURegisterBankInfo::setBufferOffsets(
    MachineIRBuilder &B, Register CombinedOffset, Register &VOffsetReg,
    Register &SOffsetReg, int64_t &InstOffsetVal, Align Alignment) const {
  const LLT S32 = LLT::scalar(32);
  MachineRegisterInfo *MRI = B.getMRI();

  if (std::optional<int64_t> Imm =
          getIConstantVRegSExtVal(CombinedOffset, *MRI)) {
    uint32_t SOffset, ImmOffset;
    if (TII->splitMUBUFOffset(*Imm, SOffset, ImmOffset, Alignment)) {
      VOffsetReg = B.buildConstant(S32, 0).getReg(0);
      SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
      InstOffsetVal = ImmOffset;

      B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
      B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
      return SOffset + ImmOffset;
    }
  }

  Register Base;
  unsigned Offset;

  std::tie(Base, Offset) =
      AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset);

  uint32_t SOffset, ImmOffset;
  if ((int)Offset > 0 &&
      TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
    if (getRegBank(Base, *MRI, *TRI) == &AMDGPU::VGPRRegBank) {
      VOffsetReg = Base;
      SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
      B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
      InstOffsetVal = ImmOffset;
      return 0; // XXX - Why is this 0?
    }

    // If we have SGPR base, we can use it for soffset.
    if (SOffset == 0) {
      VOffsetReg = B.buildConstant(S32, 0).getReg(0);
      B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
      SOffsetReg = Base;
      InstOffsetVal = ImmOffset;
      return 0; // XXX - Why is this 0?
    }
  }

  // Handle the variable sgpr + vgpr case.
  MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI);
  if (Add && (int)Offset >= 0) {
    Register Src0 = getSrcRegIgnoringCopies(Add->getOperand(1).getReg(), *MRI);
    Register Src1 = getSrcRegIgnoringCopies(Add->getOperand(2).getReg(), *MRI);

    const RegisterBank *Src0Bank = getRegBank(Src0, *MRI, *TRI);
    const RegisterBank *Src1Bank = getRegBank(Src1, *MRI, *TRI);

    if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) {
      VOffsetReg = Src0;
      SOffsetReg = Src1;
      return 0;
    }

    if (Src0Bank == &AMDGPU::SGPRRegBank && Src1Bank == &AMDGPU::VGPRRegBank) {
      VOffsetReg = Src1;
      SOffsetReg = Src0;
      return 0;
    }
  }

  // Ensure we have a VGPR for the combined offset. This could be an issue if we
  // have an SGPR offset and a VGPR resource.
  if (getRegBank(CombinedOffset, *MRI, *TRI) == &AMDGPU::VGPRRegBank) {
    VOffsetReg = CombinedOffset;
  } else {
    VOffsetReg = B.buildCopy(S32, CombinedOffset).getReg(0);
    B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
  }

  SOffsetReg = B.buildConstant(S32, 0).getReg(0);
  B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
  return 0;
}
1328 
1329 bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
1330     MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
1331   MachineInstr &MI = OpdMapper.getMI();
1332   MachineRegisterInfo &MRI = OpdMapper.getMRI();
1333 
1334   const LLT S32 = LLT::scalar(32);
1335   Register Dst = MI.getOperand(0).getReg();
1336   LLT Ty = MRI.getType(Dst);
1337 
1338   const RegisterBank *RSrcBank =
1339     OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1340   const RegisterBank *OffsetBank =
1341     OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1342   if (RSrcBank == &AMDGPU::SGPRRegBank &&
1343       OffsetBank == &AMDGPU::SGPRRegBank)
1344     return true; // Legal mapping
1345 
1346   // FIXME: The 96-bit case was widened during legalization. We need to narrow
1347   // it back here but don't have an MMO.
1348 
1349   unsigned LoadSize = Ty.getSizeInBits();
1350   int NumLoads = 1;
1351   if (LoadSize == 256 || LoadSize == 512) {
1352     NumLoads = LoadSize / 128;
1353     Ty = Ty.divide(NumLoads);
1354   }
1355 
1356   // Use the alignment to ensure that the required offsets will fit into the
1357   // immediate offset fields.
1358   const Align Alignment = NumLoads > 1 ? Align(16 * NumLoads) : Align(1);
1359 
1360   MachineFunction &MF = B.getMF();
1361 
1362   Register SOffset;
1363   Register VOffset;
1364   int64_t ImmOffset = 0;
1365 
1366   unsigned MMOOffset = setBufferOffsets(B, MI.getOperand(2).getReg(), VOffset,
1367                                         SOffset, ImmOffset, Alignment);
1368 
1369   // TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we
1370   // can, but we need to track an MMO for that.
1371   const unsigned MemSize = (Ty.getSizeInBits() + 7) / 8;
1372   const Align MemAlign(4); // FIXME: ABI type alignment?
1373   MachineMemOperand *BaseMMO = MF.getMachineMemOperand(
1374     MachinePointerInfo(),
1375     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1376     MachineMemOperand::MOInvariant,
1377     MemSize, MemAlign);
1378   if (MMOOffset != 0)
1379     BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset, MemSize);
1380 
1381   // If only the offset is divergent, emit a MUBUF buffer load instead. We can
1382   // assume that the buffer is unswizzled.
1383 
1384   Register RSrc = MI.getOperand(1).getReg();
1385   Register VIndex = B.buildConstant(S32, 0).getReg(0);
1386   B.getMRI()->setRegBank(VIndex, AMDGPU::VGPRRegBank);
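       // Even with idxen = 0 the buffer load still takes a vindex operand, so
       // materialize a zero in a VGPR for it.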
1387 
1388   SmallVector<Register, 4> LoadParts(NumLoads);
1389 
1390   MachineBasicBlock::iterator MII = MI.getIterator();
1391   MachineInstrSpan Span(MII, &B.getMBB());
1392 
1393   for (int i = 0; i < NumLoads; ++i) {
1394     if (NumLoads == 1) {
1395       LoadParts[i] = Dst;
1396     } else {
1397       LoadParts[i] = MRI.createGenericVirtualRegister(Ty);
1398       MRI.setRegBank(LoadParts[i], AMDGPU::VGPRRegBank);
1399     }
1400 
1401     MachineMemOperand *MMO = BaseMMO;
1402     if (i != 0)
1403       MMO = MF.getMachineMemOperand(BaseMMO, 16 * i, MemSize);
1404 
1405     B.buildInstr(AMDGPU::G_AMDGPU_BUFFER_LOAD)
1406       .addDef(LoadParts[i])       // vdata
1407       .addUse(RSrc)               // rsrc
1408       .addUse(VIndex)             // vindex
1409       .addUse(VOffset)            // voffset
1410       .addUse(SOffset)            // soffset
1411       .addImm(ImmOffset + 16 * i) // offset(imm)
1412       .addImm(0)                  // cachepolicy, swizzled buffer(imm)
1413       .addImm(0)                  // idxen(imm)
1414       .addMemOperand(MMO);
1415   }
1416 
1417   // TODO: If only the resource is a VGPR, it may be better to execute the
1418   // scalar load in the waterfall loop if the resource is expected to frequently
1419   // be dynamically uniform.
1420   if (RSrcBank != &AMDGPU::SGPRRegBank) {
1421     // Remove the original instruction to avoid potentially confusing the
1422     // waterfall loop logic.
1423     B.setInstr(*Span.begin());
1424     MI.eraseFromParent();
1425 
1426     SmallSet<Register, 4> OpsToWaterfall;
1427 
1428     OpsToWaterfall.insert(RSrc);
1429     executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
1430                            OpsToWaterfall);
1431   }
1432 
1433   if (NumLoads != 1) {
1434     if (Ty.isVector())
1435       B.buildConcatVectors(Dst, LoadParts);
1436     else
1437       B.buildMergeLikeInstr(Dst, LoadParts);
1438   }
1439 
1440   // The waterfall loop path above already erased the original instruction.
1441   if (RSrcBank == &AMDGPU::SGPRRegBank)
1442     MI.eraseFromParent();
1443 
1444   return true;
1445 }
1446 
1447 bool AMDGPURegisterBankInfo::applyMappingBFE(MachineIRBuilder &B,
1448                                              const OperandsMapper &OpdMapper,
1449                                              bool Signed) const {
1450   MachineInstr &MI = OpdMapper.getMI();
1451   MachineRegisterInfo &MRI = OpdMapper.getMRI();
1452 
1453   // Insert basic copies
1454   applyDefaultMapping(OpdMapper);
1455 
1456   Register DstReg = MI.getOperand(0).getReg();
1457   LLT Ty = MRI.getType(DstReg);
1458 
1459   const LLT S32 = LLT::scalar(32);
1460 
1461   unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
1462   Register SrcReg = MI.getOperand(FirstOpnd).getReg();
1463   Register OffsetReg = MI.getOperand(FirstOpnd + 1).getReg();
1464   Register WidthReg = MI.getOperand(FirstOpnd + 2).getReg();
1465 
1466   const RegisterBank *DstBank =
1467     OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1468   if (DstBank == &AMDGPU::VGPRRegBank) {
1469     if (Ty == S32)
1470       return true;
1471 
1472     // There are no 64-bit VGPR bitfield extract instructions, so the operation
1473     // is expanded to a sequence of instructions that implement it.
1474     ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::VGPRRegBank);
1475 
1476     const LLT S64 = LLT::scalar(64);
1477     // Shift the source operand so that extracted bits start at bit 0.
1478     auto ShiftOffset = Signed ? B.buildAShr(S64, SrcReg, OffsetReg)
1479                               : B.buildLShr(S64, SrcReg, OffsetReg);
1480     auto UnmergeSOffset = B.buildUnmerge({S32, S32}, ShiftOffset);
1481 
1482     // A 64-bit bitfield extract uses the 32-bit bitfield extract instructions
1483     // if the width is a constant.
1484     if (auto ConstWidth = getIConstantVRegValWithLookThrough(WidthReg, MRI)) {
1485       // The width is a constant, so use the 32-bit bitfield extract
1486       // instruction on either the low or the high 32 bits of the source.
1487       auto Zero = B.buildConstant(S32, 0);
1488       auto WidthImm = ConstWidth->Value.getZExtValue();
1489       if (WidthImm <= 32) {
1490         // Use bitfield extract on the lower 32-bit source, and then sign-extend
1491         // or clear the upper 32-bits.
1492         auto Extract =
1493             Signed ? B.buildSbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg)
1494                    : B.buildUbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg);
1495         auto Extend =
1496             Signed ? B.buildAShr(S32, Extract, B.buildConstant(S32, 31)) : Zero;
1497         B.buildMergeLikeInstr(DstReg, {Extract, Extend});
1498       } else {
1499         // Use bitfield extract on upper 32-bit source, and combine with lower
1500         // 32-bit source.
1501         auto UpperWidth = B.buildConstant(S32, WidthImm - 32);
1502         auto Extract =
1503             Signed
1504                 ? B.buildSbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth)
1505                 : B.buildUbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth);
1506         B.buildMergeLikeInstr(DstReg, {UnmergeSOffset.getReg(0), Extract});
1507       }
1508       MI.eraseFromParent();
1509       return true;
1510     }
1511 
1512     // Expand to ((Src >> Offset) << (64 - Width)) >> (64 - Width) using 64-bit
1513     // operations.
1514     auto ExtShift = B.buildSub(S32, B.buildConstant(S32, 64), WidthReg);
1515     auto SignBit = B.buildShl(S64, ShiftOffset, ExtShift);
1516     if (Signed)
1517       B.buildAShr(S64, SignBit, ExtShift);
1518     else
1519       B.buildLShr(S64, SignBit, ExtShift);
1520     MI.eraseFromParent();
1521     return true;
1522   }
1523 
1524   // The scalar form packs the offset and width in a single operand.
1525 
1526   ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::SGPRRegBank);
1527 
1528   // Ensure the high bits are clear to insert the offset.
1529   auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));
1530   auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);
1531 
1532   // Zeros out the low bits, so don't bother clamping the input value.
1533   auto ShiftWidth = B.buildShl(S32, WidthReg, B.buildConstant(S32, 16));
1534 
1535   // Pack the offset and width of a BFE into the format expected by
1536   // S_BFE_I32 / S_BFE_U32: in the second source operand, bits [5:0]
1537   // contain the offset and bits [22:16] the width.
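       // For example, offset = 8 and width = 16 pack to 0x00100008.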
1538   auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth);
1539 
1540   // TODO: It might be worth using a pseudo here to avoid scc clobber and
1541   // register class constraints.
1542   unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) :
1543                              (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);
1544 
1545   auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs});
1546   if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
1547     llvm_unreachable("failed to constrain BFE");
1548 
1549   MI.eraseFromParent();
1550   return true;
1551 }
1552 
1553 bool AMDGPURegisterBankInfo::applyMappingMAD_64_32(
1554     MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
1555   MachineInstr &MI = OpdMapper.getMI();
1556   MachineRegisterInfo &MRI = OpdMapper.getMRI();
1557 
1558   // Insert basic copies.
1559   applyDefaultMapping(OpdMapper);
1560 
1561   Register Dst0 = MI.getOperand(0).getReg();
1562   Register Dst1 = MI.getOperand(1).getReg();
1563   Register Src0 = MI.getOperand(2).getReg();
1564   Register Src1 = MI.getOperand(3).getReg();
1565   Register Src2 = MI.getOperand(4).getReg();
1566 
1567   if (MRI.getRegBankOrNull(Src0) == &AMDGPU::VGPRRegBank)
1568     return true;
1569 
1570   bool IsUnsigned = MI.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
1571   LLT S1 = LLT::scalar(1);
1572   LLT S32 = LLT::scalar(32);
1573 
1574   bool DstOnValu = MRI.getRegBankOrNull(Src2) == &AMDGPU::VGPRRegBank;
1575   bool Accumulate = true;
1576 
1577   if (!DstOnValu) {
1578     if (mi_match(Src2, MRI, m_ZeroInt()))
1579       Accumulate = false;
1580   }
1581 
1582   // Keep the multiplication on the SALU.
1583   Register DstHi;
1584   Register DstLo = B.buildMul(S32, Src0, Src1).getReg(0);
1585   bool MulHiInVgpr = false;
1586 
1587   MRI.setRegBank(DstLo, AMDGPU::SGPRRegBank);
1588 
1589   if (Subtarget.hasSMulHi()) {
1590     DstHi = IsUnsigned ? B.buildUMulH(S32, Src0, Src1).getReg(0)
1591                        : B.buildSMulH(S32, Src0, Src1).getReg(0);
1592     MRI.setRegBank(DstHi, AMDGPU::SGPRRegBank);
1593   } else {
1594     Register VSrc0 = B.buildCopy(S32, Src0).getReg(0);
1595     Register VSrc1 = B.buildCopy(S32, Src1).getReg(0);
1596 
1597     MRI.setRegBank(VSrc0, AMDGPU::VGPRRegBank);
1598     MRI.setRegBank(VSrc1, AMDGPU::VGPRRegBank);
1599 
1600     DstHi = IsUnsigned ? B.buildUMulH(S32, VSrc0, VSrc1).getReg(0)
1601                        : B.buildSMulH(S32, VSrc0, VSrc1).getReg(0);
1602     MRI.setRegBank(DstHi, AMDGPU::VGPRRegBank);
1603 
1604     if (!DstOnValu) {
1605       DstHi = buildReadFirstLane(B, MRI, DstHi);
1606     } else {
1607       MulHiInVgpr = true;
1608     }
1609   }
1610 
1611   // Accumulate and produce the "carry-out" bit.
1612   //
1613   // The "carry-out" is defined as bit 64 of the result when computed as a
1614   // big integer. For unsigned multiply-add, this matches the usual definition
1615   // of carry-out. For signed multiply-add, bit 64 is the sign bit of the
1616   // result, which is determined as:
1617   //   sign(Src0 * Src1) + sign(Src2) + carry-out from unsigned 64-bit add
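       //   This is a 1-bit sum, so the additions below are implemented as XORs
       //   of the individual sign and carry bits.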
1618   LLT CarryType = DstOnValu ? S1 : S32;
1619   const RegisterBank &CarryBank =
1620       DstOnValu ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
1621   const RegisterBank &DstBank =
1622       DstOnValu ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank;
1623   Register Carry;
1624   Register Zero;
1625 
1626   if (!IsUnsigned) {
1627     Zero = B.buildConstant(S32, 0).getReg(0);
1628     MRI.setRegBank(Zero,
1629                    MulHiInVgpr ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank);
1630 
1631     Carry = B.buildICmp(CmpInst::ICMP_SLT, MulHiInVgpr ? S1 : S32, DstHi, Zero)
1632                 .getReg(0);
1633     MRI.setRegBank(Carry, MulHiInVgpr ? AMDGPU::VCCRegBank
1634                                       : AMDGPU::SGPRRegBank);
1635 
1636     if (DstOnValu && !MulHiInVgpr) {
1637       Carry = B.buildTrunc(S1, Carry).getReg(0);
1638       MRI.setRegBank(Carry, AMDGPU::VCCRegBank);
1639     }
1640   }
1641 
1642   if (Accumulate) {
1643     if (DstOnValu) {
1644       DstLo = B.buildCopy(S32, DstLo).getReg(0);
1645       DstHi = B.buildCopy(S32, DstHi).getReg(0);
1646       MRI.setRegBank(DstLo, AMDGPU::VGPRRegBank);
1647       MRI.setRegBank(DstHi, AMDGPU::VGPRRegBank);
1648     }
1649 
1650     auto Unmerge = B.buildUnmerge(S32, Src2);
1651     Register Src2Lo = Unmerge.getReg(0);
1652     Register Src2Hi = Unmerge.getReg(1);
1653     MRI.setRegBank(Src2Lo, DstBank);
1654     MRI.setRegBank(Src2Hi, DstBank);
1655 
1656     if (!IsUnsigned) {
1657       auto Src2Sign = B.buildICmp(CmpInst::ICMP_SLT, CarryType, Src2Hi, Zero);
1658       MRI.setRegBank(Src2Sign.getReg(0), CarryBank);
1659 
1660       Carry = B.buildXor(CarryType, Carry, Src2Sign).getReg(0);
1661       MRI.setRegBank(Carry, CarryBank);
1662     }
1663 
1664     auto AddLo = B.buildUAddo(S32, CarryType, DstLo, Src2Lo);
1665     DstLo = AddLo.getReg(0);
1666     Register CarryLo = AddLo.getReg(1);
1667     MRI.setRegBank(DstLo, DstBank);
1668     MRI.setRegBank(CarryLo, CarryBank);
1669 
1670     auto AddHi = B.buildUAdde(S32, CarryType, DstHi, Src2Hi, CarryLo);
1671     DstHi = AddHi.getReg(0);
1672     MRI.setRegBank(DstHi, DstBank);
1673 
1674     Register CarryHi = AddHi.getReg(1);
1675     MRI.setRegBank(CarryHi, CarryBank);
1676 
1677     if (IsUnsigned) {
1678       Carry = CarryHi;
1679     } else {
1680       Carry = B.buildXor(CarryType, Carry, CarryHi).getReg(0);
1681       MRI.setRegBank(Carry, CarryBank);
1682     }
1683   } else {
1684     if (IsUnsigned) {
1685       Carry = B.buildConstant(CarryType, 0).getReg(0);
1686       MRI.setRegBank(Carry, CarryBank);
1687     }
1688   }
1689 
1690   B.buildMergeLikeInstr(Dst0, {DstLo, DstHi});
1691 
1692   if (DstOnValu) {
1693     B.buildCopy(Dst1, Carry);
1694   } else {
1695     B.buildTrunc(Dst1, Carry);
1696   }
1697 
1698   MI.eraseFromParent();
1699   return true;
1700 }
1701 
1702 // Return a suitable opcode for extending the operands of Opc when widening.
1703 static unsigned getExtendOp(unsigned Opc) {
1704   switch (Opc) {
1705   case TargetOpcode::G_ASHR:
1706   case TargetOpcode::G_SMIN:
1707   case TargetOpcode::G_SMAX:
1708     return TargetOpcode::G_SEXT;
1709   case TargetOpcode::G_LSHR:
1710   case TargetOpcode::G_UMIN:
1711   case TargetOpcode::G_UMAX:
1712     return TargetOpcode::G_ZEXT;
1713   default:
1714     return TargetOpcode::G_ANYEXT;
1715   }
1716 }
1717 
1718 // Emit a legalized extension from <2 x s16> to 2 32-bit components, avoiding
1719 // any illegal vector extend or unmerge operations.
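     // For example, a G_ZEXT unpack of a v2s16 value that bitcasts to 0xBBBBAAAA
     // yields the pair (0x0000AAAA, 0x0000BBBB).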
1720 static std::pair<Register, Register>
1721 unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) {
1722   const LLT S32 = LLT::scalar(32);
1723   auto Bitcast = B.buildBitcast(S32, Src);
1724 
1725   if (ExtOpcode == TargetOpcode::G_SEXT) {
1726     auto ExtLo = B.buildSExtInReg(S32, Bitcast, 16);
1727     auto ShiftHi = B.buildAShr(S32, Bitcast, B.buildConstant(S32, 16));
1728     return std::pair(ExtLo.getReg(0), ShiftHi.getReg(0));
1729   }
1730 
1731   auto ShiftHi = B.buildLShr(S32, Bitcast, B.buildConstant(S32, 16));
1732   if (ExtOpcode == TargetOpcode::G_ZEXT) {
1733     auto ExtLo = B.buildAnd(S32, Bitcast, B.buildConstant(S32, 0xffff));
1734     return std::pair(ExtLo.getReg(0), ShiftHi.getReg(0));
1735   }
1736 
1737   assert(ExtOpcode == TargetOpcode::G_ANYEXT);
1738   return std::pair(Bitcast.getReg(0), ShiftHi.getReg(0));
1739 }
1740 
1741 // For cases where only a single copy is inserted for matching register banks,
1742 // replace the register in the instruction operand with that copy.
1743 static bool substituteSimpleCopyRegs(
1744   const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {
1745   SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx));
1746   if (!SrcReg.empty()) {
1747     assert(SrcReg.size() == 1);
1748     OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]);
1749     return true;
1750   }
1751 
1752   return false;
1753 }
1754 
1755 /// Handle register layout difference for f16 images for some subtargets.
1756 Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B,
1757                                                 MachineRegisterInfo &MRI,
1758                                                 Register Reg) const {
1759   if (!Subtarget.hasUnpackedD16VMem())
1760     return Reg;
1761 
1762   const LLT S16 = LLT::scalar(16);
1763   LLT StoreVT = MRI.getType(Reg);
1764   if (!StoreVT.isVector() || StoreVT.getElementType() != S16)
1765     return Reg;
1766 
1767   auto Unmerge = B.buildUnmerge(S16, Reg);
1768 
1769 
1770   SmallVector<Register, 4> WideRegs;
1771   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
1772     WideRegs.push_back(Unmerge.getReg(I));
1773 
1774   const LLT S32 = LLT::scalar(32);
1775   int NumElts = StoreVT.getNumElements();
1776 
1777   return B.buildMergeLikeInstr(LLT::fixed_vector(NumElts, S32), WideRegs)
1778       .getReg(0);
1779 }
1780 
1781 static std::pair<Register, unsigned>
1782 getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
1783   int64_t Const;
1784   if (mi_match(Reg, MRI, m_ICst(Const)))
1785     return std::pair(Register(), Const);
1786 
1787   Register Base;
1788   if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const))))
1789     return std::pair(Base, Const);
1790 
1791   // TODO: Handle G_OR used for add case
1792   return std::pair(Reg, 0);
1793 }
1794 
1795 std::pair<Register, unsigned>
1796 AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B,
1797                                            Register OrigOffset) const {
1798   const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(Subtarget);
1799   Register BaseReg;
1800   unsigned ImmOffset;
1801   const LLT S32 = LLT::scalar(32);
1802 
1803   // TODO: Use AMDGPU::getBaseWithConstantOffset() instead.
1804   std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(),
1805                                                            OrigOffset);
1806 
1807   unsigned C1 = 0;
1808   if (ImmOffset != 0) {
1809     // If the immediate value is too big for the immoffset field, put only bits
1810     // that would normally fit in the immoffset field. The remaining value that
1811     // is copied/added for the voffset field is a multiple of a large power of 2,
1812     // and it stands more chance of being CSEd with the copy/add for another
1813     // similar load/store.
1814     // However, do not do that rounding down if it produces a negative
1815     // number, as it appears to be illegal to have a negative offset in a
1816     // vgpr, even if adding the immediate offset makes it positive.
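         // For example, if the immoffset field holds at most 4095, an ImmOffset
         // of 4100 splits into Overflow = 4096 and C1 = 4.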
1817     unsigned Overflow = ImmOffset & ~MaxImm;
1818     ImmOffset -= Overflow;
1819     if ((int32_t)Overflow < 0) {
1820       Overflow += ImmOffset;
1821       ImmOffset = 0;
1822     }
1823 
1824     C1 = ImmOffset;
1825     if (Overflow != 0) {
1826       if (!BaseReg)
1827         BaseReg = B.buildConstant(S32, Overflow).getReg(0);
1828       else {
1829         auto OverflowVal = B.buildConstant(S32, Overflow);
1830         BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
1831       }
1832     }
1833   }
1834 
1835   if (!BaseReg)
1836     BaseReg = B.buildConstant(S32, 0).getReg(0);
1837 
1838   return {BaseReg, C1};
1839 }
1840 
1841 bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg,
1842                                         Register SrcReg) const {
1843   MachineRegisterInfo &MRI = *B.getMRI();
1844   LLT SrcTy = MRI.getType(SrcReg);
1845   if (SrcTy.getSizeInBits() == 32) {
1846     // Use a v_mov_b32 here to make the exec dependency explicit.
1847     B.buildInstr(AMDGPU::V_MOV_B32_e32)
1848       .addDef(DstReg)
1849       .addUse(SrcReg);
1850     return constrainGenericRegister(DstReg, AMDGPU::VGPR_32RegClass, MRI) &&
1851            constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI);
1852   }
1853 
1854   Register TmpReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1855   Register TmpReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1856 
1857   B.buildInstr(AMDGPU::V_MOV_B32_e32)
1858     .addDef(TmpReg0)
1859     .addUse(SrcReg, 0, AMDGPU::sub0);
1860   B.buildInstr(AMDGPU::V_MOV_B32_e32)
1861     .addDef(TmpReg1)
1862     .addUse(SrcReg, 0, AMDGPU::sub1);
1863   B.buildInstr(AMDGPU::REG_SEQUENCE)
1864     .addDef(DstReg)
1865     .addUse(TmpReg0)
1866     .addImm(AMDGPU::sub0)
1867     .addUse(TmpReg1)
1868     .addImm(AMDGPU::sub1);
1869 
1870   return constrainGenericRegister(SrcReg, AMDGPU::SReg_64RegClass, MRI) &&
1871          constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI);
1872 }
1873 
1874 /// Utility function for pushing dynamic vector indexes with a constant offset
1875 /// into waterfall loops.
1876 static void reinsertVectorIndexAdd(MachineIRBuilder &B,
1877                                    MachineInstr &IdxUseInstr,
1878                                    unsigned OpIdx,
1879                                    unsigned ConstOffset) {
1880   MachineRegisterInfo &MRI = *B.getMRI();
1881   const LLT S32 = LLT::scalar(32);
1882   Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg();
1883   B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator());
1884 
1885   auto MaterializedOffset = B.buildConstant(S32, ConstOffset);
1886 
1887   auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset);
1888   MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank);
1889   MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank);
1890   IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0));
1891 }
1892 
1893 /// Implement extending a 32-bit value to a 64-bit value. \p Lo32Reg is the
1894 /// original 32-bit source value (to be inserted in the low part of the combined
1895 /// 64-bit result), and \p Hi32Reg is the high half of the combined 64-bit
1896 /// value.
1897 static void extendLow32IntoHigh32(MachineIRBuilder &B,
1898                                   Register Hi32Reg, Register Lo32Reg,
1899                                   unsigned ExtOpc,
1900                                   const RegisterBank &RegBank,
1901                                   bool IsBooleanSrc = false) {
1902   if (ExtOpc == AMDGPU::G_ZEXT) {
1903     B.buildConstant(Hi32Reg, 0);
1904   } else if (ExtOpc == AMDGPU::G_SEXT) {
1905     if (IsBooleanSrc) {
1906       // If we know the original source was an s1, the high half is the same as
1907       // the low.
1908       B.buildCopy(Hi32Reg, Lo32Reg);
1909     } else {
1910       // Replicate sign bit from 32-bit extended part.
1911       auto ShiftAmt = B.buildConstant(LLT::scalar(32), 31);
1912       B.getMRI()->setRegBank(ShiftAmt.getReg(0), RegBank);
1913       B.buildAShr(Hi32Reg, Lo32Reg, ShiftAmt);
1914     }
1915   } else {
1916     assert(ExtOpc == AMDGPU::G_ANYEXT && "not an integer extension");
1917     B.buildUndef(Hi32Reg);
1918   }
1919 }
1920 
1921 bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect(
1922     MachineIRBuilder &B, MachineInstr &MI,
1923     const OperandsMapper &OpdMapper) const {
1924   MachineRegisterInfo &MRI = *B.getMRI();
1925 
1926   Register VecReg = MI.getOperand(1).getReg();
1927   Register Idx = MI.getOperand(2).getReg();
1928 
1929   const RegisterBank &IdxBank =
1930     *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1931 
1932   bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
1933 
1934   LLT VecTy = MRI.getType(VecReg);
1935   unsigned EltSize = VecTy.getScalarSizeInBits();
1936   unsigned NumElem = VecTy.getNumElements();
1937 
1938   if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
1939                                                   IsDivergentIdx, &Subtarget))
1940     return false;
1941 
1942   LLT S32 = LLT::scalar(32);
1943 
1944   const RegisterBank &DstBank =
1945     *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1946   const RegisterBank &SrcBank =
1947     *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1948 
1949   const RegisterBank &CCBank =
1950     (DstBank == AMDGPU::SGPRRegBank &&
1951      SrcBank == AMDGPU::SGPRRegBank &&
1952      IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
1953                                      : AMDGPU::VCCRegBank;
1954   LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
1955 
1956   if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
1957     Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
1958     MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
1959   }
1960 
1961   LLT EltTy = VecTy.getScalarType();
1962   SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
1963   unsigned NumLanes = DstRegs.size();
1964   if (!NumLanes)
1965     NumLanes = 1;
1966   else
1967     EltTy = MRI.getType(DstRegs[0]);
1968 
1969   auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
1970   SmallVector<Register, 2> Res(NumLanes);
1971   for (unsigned L = 0; L < NumLanes; ++L)
1972     Res[L] = UnmergeToEltTy.getReg(L);
1973 
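       // Build a compare+select chain: start from element 0 and, for each other
       // element I, select it when the index equals I, roughly:
       //   %cmp = G_ICMP eq %Idx, I
       //   %res = G_SELECT %cmp, %elt_I, %res_prev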
1974   for (unsigned I = 1; I < NumElem; ++I) {
1975     auto IC = B.buildConstant(S32, I);
1976     MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
1977     auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
1978     MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
1979 
1980     for (unsigned L = 0; L < NumLanes; ++L) {
1981       auto S = B.buildSelect(EltTy, Cmp,
1982                              UnmergeToEltTy.getReg(I * NumLanes + L), Res[L]);
1983 
1984       for (unsigned N : { 0, 2, 3 })
1985         MRI.setRegBank(S->getOperand(N).getReg(), DstBank);
1986 
1987       Res[L] = S->getOperand(0).getReg();
1988     }
1989   }
1990 
1991   for (unsigned L = 0; L < NumLanes; ++L) {
1992     Register DstReg = (NumLanes == 1) ? MI.getOperand(0).getReg() : DstRegs[L];
1993     B.buildCopy(DstReg, Res[L]);
1994     MRI.setRegBank(DstReg, DstBank);
1995   }
1996 
1997   MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
1998   MI.eraseFromParent();
1999 
2000   return true;
2001 }
2002 
2003 // Insert a cross regbank copy for a register if it already has a bank that
2004 // differs from the one we want to set.
2005 static Register constrainRegToBank(MachineRegisterInfo &MRI,
2006                                    MachineIRBuilder &B, Register &Reg,
2007                                    const RegisterBank &Bank) {
2008   const RegisterBank *CurrBank = MRI.getRegBankOrNull(Reg);
2009   if (CurrBank && *CurrBank != Bank) {
2010     Register Copy = B.buildCopy(MRI.getType(Reg), Reg).getReg(0);
2011     MRI.setRegBank(Copy, Bank);
2012     return Copy;
2013   }
2014 
2015   MRI.setRegBank(Reg, Bank);
2016   return Reg;
2017 }
2018 
2019 bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(
2020     MachineIRBuilder &B, MachineInstr &MI,
2021     const OperandsMapper &OpdMapper) const {
2022 
2023   MachineRegisterInfo &MRI = *B.getMRI();
2024   Register VecReg = MI.getOperand(1).getReg();
2025   Register Idx = MI.getOperand(3).getReg();
2026 
2027   const RegisterBank &IdxBank =
2028     *OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
2029 
2030   bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
2031 
2032   LLT VecTy = MRI.getType(VecReg);
2033   unsigned EltSize = VecTy.getScalarSizeInBits();
2034   unsigned NumElem = VecTy.getNumElements();
2035 
2036   if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
2037                                                   IsDivergentIdx, &Subtarget))
2038     return false;
2039 
2040   LLT S32 = LLT::scalar(32);
2041 
2042   const RegisterBank &DstBank =
2043     *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2044   const RegisterBank &SrcBank =
2045     *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2046   const RegisterBank &InsBank =
2047     *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2048 
2049   const RegisterBank &CCBank =
2050     (DstBank == AMDGPU::SGPRRegBank &&
2051      SrcBank == AMDGPU::SGPRRegBank &&
2052      InsBank == AMDGPU::SGPRRegBank &&
2053      IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
2054                                      : AMDGPU::VCCRegBank;
2055   LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
2056 
2057   if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
2058     Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
2059     MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
2060   }
2061 
2062   LLT EltTy = VecTy.getScalarType();
2063   SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
2064   unsigned NumLanes = InsRegs.size();
2065   if (!NumLanes) {
2066     NumLanes = 1;
2067     InsRegs.push_back(MI.getOperand(2).getReg());
2068   } else {
2069     EltTy = MRI.getType(InsRegs[0]);
2070   }
2071 
2072   auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
2073   SmallVector<Register, 16> Ops(NumElem * NumLanes);
2074 
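       // For each element I, select the inserted value when the index equals I
       // and the original element otherwise; the vector is rebuilt from the
       // selected values below.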
2075   for (unsigned I = 0; I < NumElem; ++I) {
2076     auto IC = B.buildConstant(S32, I);
2077     MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
2078     auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
2079     MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
2080 
2081     for (unsigned L = 0; L < NumLanes; ++L) {
2082       Register Op0 = constrainRegToBank(MRI, B, InsRegs[L], DstBank);
2083       Register Op1 = UnmergeToEltTy.getReg(I * NumLanes + L);
2084       Op1 = constrainRegToBank(MRI, B, Op1, DstBank);
2085 
2086       Register Select = B.buildSelect(EltTy, Cmp, Op0, Op1).getReg(0);
2087       MRI.setRegBank(Select, DstBank);
2088 
2089       Ops[I * NumLanes + L] = Select;
2090     }
2091   }
2092 
2093   LLT MergeTy = LLT::fixed_vector(Ops.size(), EltTy);
2094   if (MergeTy == MRI.getType(MI.getOperand(0).getReg())) {
2095     B.buildBuildVector(MI.getOperand(0), Ops);
2096   } else {
2097     auto Vec = B.buildBuildVector(MergeTy, Ops);
2098     MRI.setRegBank(Vec->getOperand(0).getReg(), DstBank);
2099     B.buildBitcast(MI.getOperand(0).getReg(), Vec);
2100   }
2101 
2102   MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
2103   MI.eraseFromParent();
2104 
2105   return true;
2106 }
2107 
2108 // Break s_mul_u64 into 32-bit vector operations.
2109 void AMDGPURegisterBankInfo::applyMappingSMULU64(
2110     MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
2111   SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2112   SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
2113   SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
2114 
2115   // All inputs are SGPRs, nothing special to do.
2116   if (DefRegs.empty()) {
2117     assert(Src0Regs.empty() && Src1Regs.empty());
2118     applyDefaultMapping(OpdMapper);
2119     return;
2120   }
2121 
2122   assert(DefRegs.size() == 2);
2123   assert(Src0Regs.size() == Src1Regs.size() &&
2124          (Src0Regs.empty() || Src0Regs.size() == 2));
2125 
2126   MachineRegisterInfo &MRI = OpdMapper.getMRI();
2127   MachineInstr &MI = OpdMapper.getMI();
2128   Register DstReg = MI.getOperand(0).getReg();
2129   LLT HalfTy = LLT::scalar(32);
2130 
2131   // Depending on where the source registers came from, the generic code may
2132   // have decided to split the inputs already or not. If not, we still need to
2133   // extract the values.
2134 
2135   if (Src0Regs.empty())
2136     split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
2137   else
2138     setRegsToType(MRI, Src0Regs, HalfTy);
2139 
2140   if (Src1Regs.empty())
2141     split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
2142   else
2143     setRegsToType(MRI, Src1Regs, HalfTy);
2144 
2145   setRegsToType(MRI, DefRegs, HalfTy);
2146 
2147   // The multiplication is done as follows:
2148   //
2149   //                            Op1H  Op1L
2150   //                          * Op0H  Op0L
2151   //                       --------------------
2152   //                       Op1H*Op0L  Op1L*Op0L
2153   //          + Op1H*Op0H  Op1L*Op0H
2154   // -----------------------------------------
2155   // (Op1H*Op0L + Op1L*Op0H + carry)  Op1L*Op0L
2156   //
2157   //  We drop Op1H*Op0H because it only contributes to bits above bit 63,
2158   //  which are discarded from the 64-bit result.
2159   //  The low 32-bit value is Op1L*Op0L.
2160   //  The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from
2161   //  Op1L*Op0L).
2162 
2163   ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::VGPRRegBank);
2164 
2165   Register Hi = B.buildUMulH(HalfTy, Src0Regs[0], Src1Regs[0]).getReg(0);
2166   Register MulLoHi = B.buildMul(HalfTy, Src0Regs[0], Src1Regs[1]).getReg(0);
2167   Register Add = B.buildAdd(HalfTy, Hi, MulLoHi).getReg(0);
2168   Register MulHiLo = B.buildMul(HalfTy, Src0Regs[1], Src1Regs[0]).getReg(0);
2169   B.buildAdd(DefRegs[1], Add, MulHiLo);
2170   B.buildMul(DefRegs[0], Src0Regs[0], Src1Regs[0]);
2171 
2172   MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2173   MI.eraseFromParent();
2174 }
2175 
2176 void AMDGPURegisterBankInfo::applyMappingImpl(
2177     MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
2178   MachineInstr &MI = OpdMapper.getMI();
2179   B.setInstrAndDebugLoc(MI);
2180   unsigned Opc = MI.getOpcode();
2181   MachineRegisterInfo &MRI = OpdMapper.getMRI();
2182   switch (Opc) {
2183   case AMDGPU::G_CONSTANT:
2184   case AMDGPU::G_IMPLICIT_DEF: {
2185     Register DstReg = MI.getOperand(0).getReg();
2186     LLT DstTy = MRI.getType(DstReg);
2187     if (DstTy != LLT::scalar(1))
2188       break;
2189 
2190     const RegisterBank *DstBank =
2191         OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2192     if (DstBank == &AMDGPU::VCCRegBank)
2193       break;
2194     SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0));
2195     if (DefRegs.empty())
2196       DefRegs.push_back(DstReg);
2197 
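         // Promote the s1 def to s32 in place and truncate back to the original
         // boolean type after the instruction.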
2198     B.setInsertPt(*MI.getParent(), ++MI.getIterator());
2199 
2200     Register NewDstReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
2201     LLVMContext &Ctx = B.getMF().getFunction().getContext();
2202 
2203     MI.getOperand(0).setReg(NewDstReg);
2204     if (Opc != AMDGPU::G_IMPLICIT_DEF) {
2205       uint64_t ConstVal = MI.getOperand(1).getCImm()->getZExtValue();
2206       MI.getOperand(1).setCImm(
2207           ConstantInt::get(IntegerType::getInt32Ty(Ctx), ConstVal));
2208     }
2209 
2210     MRI.setRegBank(NewDstReg, *DstBank);
2211     B.buildTrunc(DefRegs[0], NewDstReg);
2212     return;
2213   }
2214   case AMDGPU::G_PHI: {
2215     Register DstReg = MI.getOperand(0).getReg();
2216     LLT DstTy = MRI.getType(DstReg);
2217     if (DstTy != LLT::scalar(1))
2218       break;
2219 
2220     const LLT S32 = LLT::scalar(32);
2221     const RegisterBank *DstBank =
2222       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2223     if (DstBank == &AMDGPU::VCCRegBank) {
2224       applyDefaultMapping(OpdMapper);
2225       // The standard handling only considers the result register bank for
2226       // phis. For VCC, blindly inserting a copy when the phi is lowered will
2227       // produce an invalid copy. We can only copy with some kind of compare to
2228       // get a vector boolean result. Insert a register bank copy that will be
2229       // correctly lowered to a compare.
2230       for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
2231         Register SrcReg = MI.getOperand(I).getReg();
2232         const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
2233 
2234         if (SrcBank != &AMDGPU::VCCRegBank) {
2235           MachineBasicBlock *SrcMBB = MI.getOperand(I + 1).getMBB();
2236           B.setInsertPt(*SrcMBB, SrcMBB->getFirstTerminator());
2237 
2238           auto Copy = B.buildCopy(LLT::scalar(1), SrcReg);
2239           MRI.setRegBank(Copy.getReg(0), AMDGPU::VCCRegBank);
2240           MI.getOperand(I).setReg(Copy.getReg(0));
2241         }
2242       }
2243 
2244       return;
2245     }
2246 
2247     // Phi handling is strange and only considers the bank of the destination.
2248     substituteSimpleCopyRegs(OpdMapper, 0);
2249 
2250     // Promote SGPR/VGPR booleans to s32
2251     ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank);
2252     B.setInsertPt(B.getMBB(), MI);
2253     LegalizerHelper Helper(B.getMF(), ApplyBank, B);
2254 
2255     if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
2256       llvm_unreachable("widen scalar should have succeeded");
2257 
2258     return;
2259   }
2260   case AMDGPU::G_FCMP:
2261     if (!Subtarget.hasSALUFloatInsts())
2262       break;
2263     [[fallthrough]];
2264   case AMDGPU::G_ICMP:
2265   case AMDGPU::G_UADDO:
2266   case AMDGPU::G_USUBO:
2267   case AMDGPU::G_UADDE:
2268   case AMDGPU::G_SADDE:
2269   case AMDGPU::G_USUBE:
2270   case AMDGPU::G_SSUBE: {
2271     unsigned BoolDstOp =
2272         (Opc == AMDGPU::G_ICMP || Opc == AMDGPU::G_FCMP) ? 0 : 1;
2273     Register DstReg = MI.getOperand(BoolDstOp).getReg();
2274 
2275     const RegisterBank *DstBank =
2276       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2277     if (DstBank != &AMDGPU::SGPRRegBank)
2278       break;
2279 
2280     const bool HasCarryIn = MI.getNumOperands() == 5;
2281 
2282     // If this is a scalar compare, promote the result to s32, as the selection
2283     // will end up using a copy to a 32-bit vreg.
2284     const LLT S32 = LLT::scalar(32);
2285     Register NewDstReg = MRI.createGenericVirtualRegister(S32);
2286     MRI.setRegBank(NewDstReg, AMDGPU::SGPRRegBank);
2287     MI.getOperand(BoolDstOp).setReg(NewDstReg);
2288 
2289     if (HasCarryIn) {
2290       Register NewSrcReg = MRI.createGenericVirtualRegister(S32);
2291       MRI.setRegBank(NewSrcReg, AMDGPU::SGPRRegBank);
2292       B.buildZExt(NewSrcReg, MI.getOperand(4).getReg());
2293       MI.getOperand(4).setReg(NewSrcReg);
2294     }
2295 
2296     MachineBasicBlock *MBB = MI.getParent();
2297     B.setInsertPt(*MBB, std::next(MI.getIterator()));
2298 
2299     // If we had a constrained VCC result register, a copy was inserted to VCC
2300     // from SGPR.
2301     SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0));
2302     if (DefRegs.empty())
2303       DefRegs.push_back(DstReg);
2304     B.buildTrunc(DefRegs[0], NewDstReg);
2305     return;
2306   }
2307   case AMDGPU::G_SELECT: {
2308     Register DstReg = MI.getOperand(0).getReg();
2309     LLT DstTy = MRI.getType(DstReg);
2310 
2311     SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(1));
2312     if (CondRegs.empty())
2313       CondRegs.push_back(MI.getOperand(1).getReg());
2314     else {
2315       assert(CondRegs.size() == 1);
2316     }
2317 
2318     const RegisterBank *CondBank = getRegBank(CondRegs[0], MRI, *TRI);
2319     if (CondBank == &AMDGPU::SGPRRegBank) {
2320       const LLT S32 = LLT::scalar(32);
2321       Register NewCondReg = MRI.createGenericVirtualRegister(S32);
2322       MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
2323 
2324       MI.getOperand(1).setReg(NewCondReg);
2325       B.buildZExt(NewCondReg, CondRegs[0]);
2326     }
2327 
2328     if (DstTy.getSizeInBits() != 64)
2329       break;
2330 
2331     LLT HalfTy = getHalfSizedType(DstTy);
2332 
2333     SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2334     SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
2335     SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3));
2336 
2337     // All inputs are SGPRs, nothing special to do.
2338     if (DefRegs.empty()) {
2339       assert(Src1Regs.empty() && Src2Regs.empty());
2340       break;
2341     }
2342 
2343     if (Src1Regs.empty())
2344       split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
2345     else {
2346       setRegsToType(MRI, Src1Regs, HalfTy);
2347     }
2348 
2349     if (Src2Regs.empty())
2350       split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg());
2351     else
2352       setRegsToType(MRI, Src2Regs, HalfTy);
2353 
2354     setRegsToType(MRI, DefRegs, HalfTy);
2355 
2356     B.buildSelect(DefRegs[0], CondRegs[0], Src1Regs[0], Src2Regs[0]);
2357     B.buildSelect(DefRegs[1], CondRegs[0], Src1Regs[1], Src2Regs[1]);
2358 
2359     MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2360     MI.eraseFromParent();
2361     return;
2362   }
2363   case AMDGPU::G_BRCOND: {
2364     Register CondReg = MI.getOperand(0).getReg();
2365     // FIXME: Should use legalizer helper, but should change bool ext type.
2366     const RegisterBank *CondBank =
2367       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2368 
2369     if (CondBank == &AMDGPU::SGPRRegBank) {
2370       const LLT S32 = LLT::scalar(32);
2371       Register NewCondReg = MRI.createGenericVirtualRegister(S32);
2372       MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
2373 
2374       MI.getOperand(0).setReg(NewCondReg);
2375       B.buildZExt(NewCondReg, CondReg);
2376       return;
2377     }
2378 
2379     break;
2380   }
2381   case AMDGPU::G_AND:
2382   case AMDGPU::G_OR:
2383   case AMDGPU::G_XOR: {
2384     // 64-bit and/or/xor is only available on the SALU, so split into 2 32-bit
2385     // ops if there is a VGPR input.
2386     Register DstReg = MI.getOperand(0).getReg();
2387     LLT DstTy = MRI.getType(DstReg);
2388 
2389     if (DstTy.getSizeInBits() == 1) {
2390       const RegisterBank *DstBank =
2391         OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2392       if (DstBank == &AMDGPU::VCCRegBank)
2393         break;
2394 
2395       MachineFunction *MF = MI.getParent()->getParent();
2396       ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank);
2397       LegalizerHelper Helper(*MF, ApplyBank, B);
2398 
2399       if (Helper.widenScalar(MI, 0, LLT::scalar(32)) !=
2400           LegalizerHelper::Legalized)
2401         llvm_unreachable("widen scalar should have succeeded");
2402       return;
2403     }
2404 
2405     if (DstTy.getSizeInBits() != 64)
2406       break;
2407 
2408     LLT HalfTy = getHalfSizedType(DstTy);
2409     SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2410     SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
2411     SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
2412 
2413     // All inputs are SGPRs, nothing special to do.
2414     if (DefRegs.empty()) {
2415       assert(Src0Regs.empty() && Src1Regs.empty());
2416       break;
2417     }
2418 
2419     assert(DefRegs.size() == 2);
2420     assert(Src0Regs.size() == Src1Regs.size() &&
2421            (Src0Regs.empty() || Src0Regs.size() == 2));
2422 
2423     // Depending on where the source registers came from, the generic code may
2424     // have decided to split the inputs already or not. If not, we still need to
2425     // extract the values.
2426 
2427     if (Src0Regs.empty())
2428       split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
2429     else
2430       setRegsToType(MRI, Src0Regs, HalfTy);
2431 
2432     if (Src1Regs.empty())
2433       split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
2434     else
2435       setRegsToType(MRI, Src1Regs, HalfTy);
2436 
2437     setRegsToType(MRI, DefRegs, HalfTy);
2438 
2439     B.buildInstr(Opc, {DefRegs[0]}, {Src0Regs[0], Src1Regs[0]});
2440     B.buildInstr(Opc, {DefRegs[1]}, {Src0Regs[1], Src1Regs[1]});
2441 
2442     MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2443     MI.eraseFromParent();
2444     return;
2445   }
2446   case AMDGPU::G_ABS: {
2447     Register SrcReg = MI.getOperand(1).getReg();
2448     const RegisterBank *SrcBank = MRI.getRegBankOrNull(SrcReg);
2449 
2450     // There is no VALU abs instruction, so we need to replace it with a sub
2451     // and max combination.
2452     if (SrcBank && SrcBank == &AMDGPU::VGPRRegBank) {
2453       MachineFunction *MF = MI.getParent()->getParent();
2454       ApplyRegBankMapping Apply(B, *this, MRI, &AMDGPU::VGPRRegBank);
2455       LegalizerHelper Helper(*MF, Apply, B);
2456 
2457       if (Helper.lowerAbsToMaxNeg(MI) != LegalizerHelper::Legalized)
2458         llvm_unreachable("lowerAbsToMaxNeg should have succeeded");
2459       return;
2460     }
2461     [[fallthrough]];
2462   }
2463   case AMDGPU::G_ADD:
2464   case AMDGPU::G_SUB:
2465   case AMDGPU::G_MUL:
2466   case AMDGPU::G_SHL:
2467   case AMDGPU::G_LSHR:
2468   case AMDGPU::G_ASHR:
2469   case AMDGPU::G_SMIN:
2470   case AMDGPU::G_SMAX:
2471   case AMDGPU::G_UMIN:
2472   case AMDGPU::G_UMAX: {
2473     Register DstReg = MI.getOperand(0).getReg();
2474     LLT DstTy = MRI.getType(DstReg);
2475 
2476     // Special case for s_mul_u64. There is no vector equivalent of
2477     // s_mul_u64. Hence, we have to break down s_mul_u64 into 32-bit vector
2478     // multiplications.
2479     if (Opc == AMDGPU::G_MUL && DstTy.getSizeInBits() == 64) {
2480       applyMappingSMULU64(B, OpdMapper);
2481       return;
2482     }
2483 
2484     // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
2485     // Packed 16-bit operations need to be scalarized and promoted.
2486     if (DstTy != LLT::scalar(16) && DstTy != LLT::fixed_vector(2, 16))
2487       break;
2488 
2489     const RegisterBank *DstBank =
2490         OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2491     if (DstBank == &AMDGPU::VGPRRegBank)
2492       break;
2493 
2494     const LLT S32 = LLT::scalar(32);
2495     MachineBasicBlock *MBB = MI.getParent();
2496     MachineFunction *MF = MBB->getParent();
2497     ApplyRegBankMapping ApplySALU(B, *this, MRI, &AMDGPU::SGPRRegBank);
2498 
2499     if (DstTy.isVector() && Opc == AMDGPU::G_ABS) {
2500       Register WideSrcLo, WideSrcHi;
2501 
2502       std::tie(WideSrcLo, WideSrcHi) =
2503           unpackV2S16ToS32(B, MI.getOperand(1).getReg(), TargetOpcode::G_SEXT);
2504       auto Lo = B.buildInstr(AMDGPU::G_ABS, {S32}, {WideSrcLo});
2505       auto Hi = B.buildInstr(AMDGPU::G_ABS, {S32}, {WideSrcHi});
2506       B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)});
2507       MI.eraseFromParent();
2508       return;
2509     }
2510 
2511     if (DstTy.isVector()) {
2512       Register WideSrc0Lo, WideSrc0Hi;
2513       Register WideSrc1Lo, WideSrc1Hi;
2514 
2515       unsigned ExtendOp = getExtendOp(MI.getOpcode());
2516       std::tie(WideSrc0Lo, WideSrc0Hi)
2517         = unpackV2S16ToS32(B, MI.getOperand(1).getReg(), ExtendOp);
2518       std::tie(WideSrc1Lo, WideSrc1Hi)
2519         = unpackV2S16ToS32(B, MI.getOperand(2).getReg(), ExtendOp);
2520       auto Lo = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Lo, WideSrc1Lo});
2521       auto Hi = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Hi, WideSrc1Hi});
2522       B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)});
2523       MI.eraseFromParent();
2524     } else {
2525       LegalizerHelper Helper(*MF, ApplySALU, B);
2526 
2527       if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
2528         llvm_unreachable("widen scalar should have succeeded");
2529 
2530       // FIXME: s16 shift amounts should be legal.
2531       if (Opc == AMDGPU::G_SHL || Opc == AMDGPU::G_LSHR ||
2532           Opc == AMDGPU::G_ASHR) {
2533         B.setInsertPt(*MBB, MI.getIterator());
2534         if (Helper.widenScalar(MI, 1, S32) != LegalizerHelper::Legalized)
2535           llvm_unreachable("widen scalar should have succeeded");
2536       }
2537     }
2538 
2539     return;
2540   }
2541   case AMDGPU::G_AMDGPU_S_MUL_I64_I32:
2542   case AMDGPU::G_AMDGPU_S_MUL_U64_U32: {
2543     // This is a special case for s_mul_u64. We use the
2544     // G_AMDGPU_S_MUL_I64_I32 opcode to represent an s_mul_u64 operation
2545     // where the 33 higher bits are sign-extended and the
2546     // G_AMDGPU_S_MUL_U64_U32 opcode to represent an s_mul_u64 operation
2547     // where the 32 higher bits are zero-extended. If scalar registers are
2548     // selected, both opcodes are lowered as s_mul_u64. If vector registers
2549     // are selected, then G_AMDGPU_S_MUL_I64_I32 and
2550     // G_AMDGPU_S_MUL_U64_U32 are lowered with a vector mad instruction.
2551 
2552     // Insert basic copies.
2553     applyDefaultMapping(OpdMapper);
2554 
2555     Register DstReg = MI.getOperand(0).getReg();
2556     Register SrcReg0 = MI.getOperand(1).getReg();
2557     Register SrcReg1 = MI.getOperand(2).getReg();
2558     const LLT S32 = LLT::scalar(32);
2559     const LLT S64 = LLT::scalar(64);
2560     assert(MRI.getType(DstReg) == S64 && "This is a special case for s_mul_u64 "
2561                                          "that handles only 64-bit operands.");
2562     const RegisterBank *DstBank =
2563         OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2564 
2565     // Replace G_AMDGPU_S_MUL_I64_I32 and G_AMDGPU_S_MUL_U64_U32
2566     // with s_mul_u64 operation.
2567     if (DstBank == &AMDGPU::SGPRRegBank) {
2568       MI.setDesc(TII->get(AMDGPU::S_MUL_U64));
2569       MRI.setRegClass(DstReg, &AMDGPU::SGPR_64RegClass);
2570       MRI.setRegClass(SrcReg0, &AMDGPU::SGPR_64RegClass);
2571       MRI.setRegClass(SrcReg1, &AMDGPU::SGPR_64RegClass);
2572       return;
2573     }
2574 
2575     // Replace G_AMDGPU_S_MUL_I64_I32 and G_AMDGPU_S_MUL_U64_U32
2576     // with a vector mad.
2577     assert(MRI.getRegBankOrNull(DstReg) == &AMDGPU::VGPRRegBank &&
2578            "The destination operand should be in vector registers.");
2579 
2580     DebugLoc DL = MI.getDebugLoc();
2581 
2582     // Extract the lower subregister from the first operand.
2583     Register Op0L = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2584     MRI.setRegClass(Op0L, &AMDGPU::VGPR_32RegClass);
2585     MRI.setType(Op0L, S32);
2586     B.buildTrunc(Op0L, SrcReg0);
2587 
2588     // Extract the lower subregister from the second operand.
2589     Register Op1L = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2590     MRI.setRegClass(Op1L, &AMDGPU::VGPR_32RegClass);
2591     MRI.setType(Op1L, S32);
2592     B.buildTrunc(Op1L, SrcReg1);
2593 
2594     unsigned NewOpc = Opc == AMDGPU::G_AMDGPU_S_MUL_U64_U32
2595                           ? AMDGPU::G_AMDGPU_MAD_U64_U32
2596                           : AMDGPU::G_AMDGPU_MAD_I64_I32;
2597 
2598     // Reuse the outer builder B, which is already inserting before MI.
2599     Register Zero64 = B.buildConstant(S64, 0).getReg(0);
2600     MRI.setRegClass(Zero64, &AMDGPU::VReg_64RegClass);
2601     Register CarryOut = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
2602     MRI.setRegClass(CarryOut, &AMDGPU::VReg_64RegClass);
2603     B.buildInstr(NewOpc, {DstReg, CarryOut}, {Op0L, Op1L, Zero64});
2604     MI.eraseFromParent();
2605     return;
2606   }
2607   case AMDGPU::G_SEXT_INREG: {
2608     SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
2609     if (SrcRegs.empty())
2610       break; // Nothing to repair
2611 
2612     const LLT S32 = LLT::scalar(32);
2613     ApplyRegBankMapping O(B, *this, MRI, &AMDGPU::VGPRRegBank);
2614 
2615     // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs
2616     // we would need to further expand, and doesn't let us directly set the
2617     // result registers.
2618     SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
2619 
2620     int Amt = MI.getOperand(2).getImm();
2621     if (Amt <= 32) {
2622       // Downstream users have expectations for the high bit behavior, so freeze
2623       // incoming undefined bits.
2624       if (Amt == 32) {
2625         // The low bits are unchanged.
2626         B.buildFreeze(DstRegs[0], SrcRegs[0]);
2627       } else {
2628         auto Freeze = B.buildFreeze(S32, SrcRegs[0]);
2629         // Extend in the low bits and propagate the sign bit to the high half.
2630         B.buildSExtInReg(DstRegs[0], Freeze, Amt);
2631       }
2632 
2633       B.buildAShr(DstRegs[1], DstRegs[0], B.buildConstant(S32, 31));
2634     } else {
2635       // The low bits are unchanged, and we extend in the high bits.
2636       // No freeze is required.
2637       B.buildCopy(DstRegs[0], SrcRegs[0]);
2638       B.buildSExtInReg(DstRegs[1], SrcRegs[1], Amt - 32);
2639     }
2640 
2641     Register DstReg = MI.getOperand(0).getReg();
2642     MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2643     MI.eraseFromParent();
2644     return;
2645   }
2646   case AMDGPU::G_CTPOP:
2647   case AMDGPU::G_BITREVERSE: {
2648     const RegisterBank *DstBank =
2649       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2650     if (DstBank == &AMDGPU::SGPRRegBank)
2651       break;
2652 
2653     Register SrcReg = MI.getOperand(1).getReg();
2654     const LLT S32 = LLT::scalar(32);
2655     LLT Ty = MRI.getType(SrcReg);
2656     if (Ty == S32)
2657       break;
2658 
2659     ApplyRegBankMapping ApplyVALU(B, *this, MRI, &AMDGPU::VGPRRegBank);
2660 
2661     MachineFunction &MF = B.getMF();
2662     LegalizerHelper Helper(MF, ApplyVALU, B);
2663 
2664     if (Helper.narrowScalar(MI, 1, S32) != LegalizerHelper::Legalized)
2665       llvm_unreachable("narrowScalar should have succeeded");
2666     return;
2667   }
2668   case AMDGPU::G_AMDGPU_FFBH_U32:
2669   case AMDGPU::G_AMDGPU_FFBL_B32:
2670   case AMDGPU::G_CTLZ_ZERO_UNDEF:
2671   case AMDGPU::G_CTTZ_ZERO_UNDEF: {
2672     const RegisterBank *DstBank =
2673         OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2674     if (DstBank == &AMDGPU::SGPRRegBank)
2675       break;
2676 
2677     Register SrcReg = MI.getOperand(1).getReg();
2678     const LLT S32 = LLT::scalar(32);
2679     LLT Ty = MRI.getType(SrcReg);
2680     if (Ty == S32)
2681       break;
2682 
2683     // We can narrow this more efficiently than Helper can by using ffbh/ffbl
2684     // which return -1 when the input is zero:
2685     // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
2686     // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
2687     // (ffbh hi:lo) -> (umin (ffbh hi), (uaddsat (ffbh lo), 32))
2688     // (ffbl hi:lo) -> (umin (uaddsat (ffbl hi), 32), (ffbl lo))
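    // As a concrete illustration of the identities above: for a 64-bit ffbh
    // with hi = 0 and lo = 0x00010000, ffbh(hi) is -1 (0xFFFFFFFF as
    // unsigned), ffbh(lo) is 15, and uaddsat(15, 32) is 47, so the unsigned
    // min yields 47, the leading-zero count of the full 64-bit value.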
2689     ApplyRegBankMapping ApplyVALU(B, *this, MRI, &AMDGPU::VGPRRegBank);
2690     SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
2691     unsigned NewOpc = Opc == AMDGPU::G_CTLZ_ZERO_UNDEF
2692                           ? (unsigned)AMDGPU::G_AMDGPU_FFBH_U32
2693                           : Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
2694                                 ? (unsigned)AMDGPU::G_AMDGPU_FFBL_B32
2695                                 : Opc;
2696     unsigned Idx = NewOpc == AMDGPU::G_AMDGPU_FFBH_U32;
2697     auto X = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx]});
2698     auto Y = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx ^ 1]});
2699     unsigned AddOpc =
2700         Opc == AMDGPU::G_CTLZ_ZERO_UNDEF || Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
2701             ? AMDGPU::G_ADD
2702             : AMDGPU::G_UADDSAT;
2703     Y = B.buildInstr(AddOpc, {S32}, {Y, B.buildConstant(S32, 32)});
2704     Register DstReg = MI.getOperand(0).getReg();
2705     B.buildUMin(DstReg, X, Y);
2706     MI.eraseFromParent();
2707     return;
2708   }
2709   case AMDGPU::G_SEXT:
2710   case AMDGPU::G_ZEXT:
2711   case AMDGPU::G_ANYEXT: {
2712     Register SrcReg = MI.getOperand(1).getReg();
2713     LLT SrcTy = MRI.getType(SrcReg);
2714     const bool Signed = Opc == AMDGPU::G_SEXT;
2715 
2716     assert(OpdMapper.getVRegs(1).empty());
2717 
2718     const RegisterBank *SrcBank =
2719       OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2720 
2721     Register DstReg = MI.getOperand(0).getReg();
2722     LLT DstTy = MRI.getType(DstReg);
2723     if (DstTy.isScalar() &&
2724         SrcBank != &AMDGPU::SGPRRegBank &&
2725         SrcBank != &AMDGPU::VCCRegBank &&
2726         // FIXME: Should handle any type that round to s64 when irregular
2727         // breakdowns supported.
2728         DstTy.getSizeInBits() == 64 &&
2729         SrcTy.getSizeInBits() <= 32) {
2730       SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2731 
2732       // Extend to 32-bit, and then extend the low half.
2733       if (Signed) {
2734         // TODO: Should really be buildSExtOrCopy
2735         B.buildSExtOrTrunc(DefRegs[0], SrcReg);
2736       } else if (Opc == AMDGPU::G_ZEXT) {
2737         B.buildZExtOrTrunc(DefRegs[0], SrcReg);
2738       } else {
2739         B.buildAnyExtOrTrunc(DefRegs[0], SrcReg);
2740       }
2741 
2742       extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank);
2743       MRI.setRegBank(DstReg, *SrcBank);
2744       MI.eraseFromParent();
2745       return;
2746     }
2747 
2748     if (SrcTy != LLT::scalar(1))
2749       return;
2750 
2751     // It is not legal to have a legalization artifact with a VCC source.
2752     // Rather than introducing a copy, directly insert the select that such a
2753     // copy would eventually have been selected to.
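    // For example (names illustrative):
    //   %d:vgpr(s32) = G_ZEXT %c:vcc(s1)
    // becomes
    //   %d:vgpr(s32) = G_SELECT %c, 1, 0
    // while G_SEXT uses -1 as the true value, as built below.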
2754     if (SrcBank == &AMDGPU::VCCRegBank) {
2755       SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2756 
2757       const RegisterBank *DstBank = &AMDGPU::VGPRRegBank;
2758 
2759       unsigned DstSize = DstTy.getSizeInBits();
2760       // 64-bit select is SGPR only
2761       const bool UseSel64 = DstSize > 32 &&
2762         SrcBank->getID() == AMDGPU::SGPRRegBankID;
2763 
2764       // TODO: Should s16 select be legal?
2765       LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32);
2766       auto True = B.buildConstant(SelType, Signed ? -1 : 1);
2767       auto False = B.buildConstant(SelType, 0);
2768 
2769       MRI.setRegBank(True.getReg(0), *DstBank);
2770       MRI.setRegBank(False.getReg(0), *DstBank);
2771       MRI.setRegBank(DstReg, *DstBank);
2772 
2773       if (DstSize > 32) {
2774         B.buildSelect(DefRegs[0], SrcReg, True, False);
2775         extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank, true);
2776       } else if (DstSize < 32) {
2777         auto Sel = B.buildSelect(SelType, SrcReg, True, False);
2778         MRI.setRegBank(Sel.getReg(0), *DstBank);
2779         B.buildTrunc(DstReg, Sel);
2780       } else {
2781         B.buildSelect(DstReg, SrcReg, True, False);
2782       }
2783 
2784       MI.eraseFromParent();
2785       return;
2786     }
2787 
2788     break;
2789   }
2790   case AMDGPU::G_EXTRACT_VECTOR_ELT: {
2791     SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
2792 
2793     assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty());
2794 
2795     Register DstReg = MI.getOperand(0).getReg();
2796     Register SrcReg = MI.getOperand(1).getReg();
2797 
2798     const LLT S32 = LLT::scalar(32);
2799     LLT DstTy = MRI.getType(DstReg);
2800     LLT SrcTy = MRI.getType(SrcReg);
2801 
2802     if (foldExtractEltToCmpSelect(B, MI, OpdMapper))
2803       return;
2804 
2805     const ValueMapping &DstMapping
2806       = OpdMapper.getInstrMapping().getOperandMapping(0);
2807     const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank;
2808     const RegisterBank *SrcBank =
2809       OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2810     const RegisterBank *IdxBank =
2811         OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2812 
2813     Register BaseIdxReg;
2814     unsigned ConstOffset;
2815     std::tie(BaseIdxReg, ConstOffset) =
2816         AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(2).getReg());
2817 
2818     // See if the index is an add of a constant, which we can fold by moving
2819     // only the base register of the index into the waterfall loop and
2820     // re-inserting the add of the constant inside it. This essentially
2821     // reassociates the add of a constant with the readfirstlane.
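    // For example (illustrative): an index of (G_ADD %base, 4) evaluated in a
    // waterfall loop becomes just %base here, with the +4 re-added inside the
    // loop, i.e. add(readfirstlane(%base), 4) rather than
    // readfirstlane(add(%base, 4)).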
2822     bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
2823                                    ConstOffset > 0 &&
2824                                    ConstOffset < SrcTy.getNumElements();
2825 
2826     // Move the base register. We'll re-insert the add later.
2827     if (ShouldMoveIndexIntoLoop)
2828       MI.getOperand(2).setReg(BaseIdxReg);
2829 
2830     // If this is a VGPR result only because the index was a VGPR result, the
2831     // actual indexing will be done on the SGPR source vector, which will
2832     // produce a scalar result. We need to copy to the VGPR result inside the
2833     // waterfall loop.
2834     const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank &&
2835                                 SrcBank == &AMDGPU::SGPRRegBank;
2836     if (DstRegs.empty()) {
2837       applyDefaultMapping(OpdMapper);
2838 
2839       executeInWaterfallLoop(B, MI, {2});
2840 
2841       if (NeedCopyToVGPR) {
2842         // We don't want a phi for this temporary reg.
2843         Register TmpReg = MRI.createGenericVirtualRegister(DstTy);
2844         MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank);
2845         MI.getOperand(0).setReg(TmpReg);
2846         B.setInsertPt(*MI.getParent(), ++MI.getIterator());
2847 
2848         // Use a v_mov_b32 here to make the exec dependency explicit.
2849         buildVCopy(B, DstReg, TmpReg);
2850       }
2851 
2852       // Re-insert the constant offset add inside the waterfall loop.
2853       if (ShouldMoveIndexIntoLoop)
2854         reinsertVectorIndexAdd(B, MI, 2, ConstOffset);
2855 
2856       return;
2857     }
2858 
2859     assert(DstTy.getSizeInBits() == 64);
2860 
2861     LLT Vec32 = LLT::fixed_vector(2 * SrcTy.getNumElements(), 32);
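    // Concretely (illustrative): an s64 extract at index %i from <2 x s64>
    // becomes two s32 extracts from the bitcast <4 x s32>, at indices
    // 2 * %i and 2 * %i + 1.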
2862 
2863     auto CastSrc = B.buildBitcast(Vec32, SrcReg);
2864     auto One = B.buildConstant(S32, 1);
2865 
2866     MachineBasicBlock::iterator MII = MI.getIterator();
2867 
2868     // Split the vector index into 32-bit pieces. Prepare to move all of the
2869     // new instructions into a waterfall loop if necessary.
2870     //
2871     // Don't put the bitcast or constant in the loop.
2872     MachineInstrSpan Span(MII, &B.getMBB());
2873 
2874     // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
2875     auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
2876     auto IdxHi = B.buildAdd(S32, IdxLo, One);
2877 
2878     auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo);
2879     auto Extract1 = B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi);
2880 
2881     MRI.setRegBank(DstReg, *DstBank);
2882     MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
2883     MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
2884     MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
2885     MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
2886 
2887     SmallSet<Register, 4> OpsToWaterfall;
2888     if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) {
2889       MI.eraseFromParent();
2890       return;
2891     }
2892 
2893     // Remove the original instruction to avoid potentially confusing the
2894     // waterfall loop logic.
2895     B.setInstr(*Span.begin());
2896     MI.eraseFromParent();
2897     executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
2898                            OpsToWaterfall);
2899 
2900     if (NeedCopyToVGPR) {
2901       MachineBasicBlock *LoopBB = Extract1->getParent();
2902       Register TmpReg0 = MRI.createGenericVirtualRegister(S32);
2903       Register TmpReg1 = MRI.createGenericVirtualRegister(S32);
2904       MRI.setRegBank(TmpReg0, AMDGPU::SGPRRegBank);
2905       MRI.setRegBank(TmpReg1, AMDGPU::SGPRRegBank);
2906 
2907       Extract0->getOperand(0).setReg(TmpReg0);
2908       Extract1->getOperand(0).setReg(TmpReg1);
2909 
2910       B.setInsertPt(*LoopBB, ++Extract1->getIterator());
2911 
2912       buildVCopy(B, DstRegs[0], TmpReg0);
2913       buildVCopy(B, DstRegs[1], TmpReg1);
2914     }
2915 
2916     if (ShouldMoveIndexIntoLoop)
2917       reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
2918 
2919     return;
2920   }
2921   case AMDGPU::G_INSERT_VECTOR_ELT: {
2922     SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
2923 
2924     Register DstReg = MI.getOperand(0).getReg();
2925     LLT VecTy = MRI.getType(DstReg);
2926 
2927     assert(OpdMapper.getVRegs(0).empty());
2928     assert(OpdMapper.getVRegs(3).empty());
2929 
2930     if (substituteSimpleCopyRegs(OpdMapper, 1))
2931       MRI.setType(MI.getOperand(1).getReg(), VecTy);
2932 
2933     if (foldInsertEltToCmpSelect(B, MI, OpdMapper))
2934       return;
2935 
2936     const RegisterBank *IdxBank =
2937       OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
2938 
2939     Register SrcReg = MI.getOperand(1).getReg();
2940     Register InsReg = MI.getOperand(2).getReg();
2941     LLT InsTy = MRI.getType(InsReg);
2942     (void)InsTy;
2943 
2944     Register BaseIdxReg;
2945     unsigned ConstOffset;
2946     std::tie(BaseIdxReg, ConstOffset) =
2947         AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(3).getReg());
2948 
2949     // See if the index is an add of a constant, which we can fold by moving
2950     // only the base register of the index into the waterfall loop and
2951     // re-inserting the add of the constant inside it. This essentially
2952     // reassociates the add of a constant with the readfirstlane.
2953     bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
2954       ConstOffset > 0 &&
2955       ConstOffset < VecTy.getNumElements();
2956 
2957     // Move the base register. We'll re-insert the add later.
2958     if (ShouldMoveIndexIntoLoop)
2959       MI.getOperand(3).setReg(BaseIdxReg);
2960 
2961 
2962     if (InsRegs.empty()) {
2963       executeInWaterfallLoop(B, MI, {3});
2964 
2965       // Re-insert the constant offset add inside the waterfall loop.
2966       if (ShouldMoveIndexIntoLoop) {
2967         reinsertVectorIndexAdd(B, MI, 3, ConstOffset);
2968       }
2969 
2970       return;
2971     }
2972 
2973     assert(InsTy.getSizeInBits() == 64);
2974 
2975     const LLT S32 = LLT::scalar(32);
2976     LLT Vec32 = LLT::fixed_vector(2 * VecTy.getNumElements(), 32);
2977 
2978     auto CastSrc = B.buildBitcast(Vec32, SrcReg);
2979     auto One = B.buildConstant(S32, 1);
2980 
2981     // Split the vector index into 32-bit pieces. Prepare to move all of the
2982     // new instructions into a waterfall loop if necessary.
2983     //
2984     // Don't put the bitcast or constant in the loop.
2985     MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB());
2986 
2987     // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
2988     auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
2989     auto IdxHi = B.buildAdd(S32, IdxLo, One);
2990 
2991     auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo);
2992     auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi);
2993 
2994     const RegisterBank *DstBank =
2995       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2996     const RegisterBank *SrcBank =
2997       OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2998     const RegisterBank *InsSrcBank =
2999       OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
3000 
3001     MRI.setRegBank(InsReg, *InsSrcBank);
3002     MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
3003     MRI.setRegBank(InsLo.getReg(0), *DstBank);
3004     MRI.setRegBank(InsHi.getReg(0), *DstBank);
3005     MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
3006     MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
3007     MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
3008 
3009 
3010     SmallSet<Register, 4> OpsToWaterfall;
3011     if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) {
3012       B.setInsertPt(B.getMBB(), MI);
3013       B.buildBitcast(DstReg, InsHi);
3014       MI.eraseFromParent();
3015       return;
3016     }
3017 
3018     B.setInstr(*Span.begin());
3019     MI.eraseFromParent();
3020 
3021     // Figure out the point after the waterfall loop before mangling the control
3022     // flow.
3023     executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
3024                            OpsToWaterfall);
3025 
3026     // The insertion point is now right after the original instruction.
3027     //
3028     // Keep the bitcast to the original vector type out of the loop. Doing this
3029     // saves an extra phi we don't need inside the loop.
3030     B.buildBitcast(DstReg, InsHi);
3031 
3032     // Re-insert the constant offset add inside the waterfall loop.
3033     if (ShouldMoveIndexIntoLoop)
3034       reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
3035 
3036     return;
3037   }
3038   case AMDGPU::G_AMDGPU_BUFFER_LOAD:
3039   case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
3040   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
3041   case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
3042   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
3043   case AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE:
3044   case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE:
3045   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT_TFE:
3046   case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE:
3047   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE_TFE:
3048   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
3049   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE:
3050   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
3051   case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
3052   case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
3053   case AMDGPU::G_AMDGPU_BUFFER_STORE:
3054   case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
3055   case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
3056   case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
3057   case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16:
3058   case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
3059   case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: {
3060     applyDefaultMapping(OpdMapper);
3061     executeInWaterfallLoop(B, MI, {1, 4});
3062     return;
3063   }
3064   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
3065   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
3066   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
3067   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
3068   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
3069   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
3070   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
3071   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
3072   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
3073   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
3074   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
3075   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: {
3076     applyDefaultMapping(OpdMapper);
3077     executeInWaterfallLoop(B, MI, {2, 5});
3078     return;
3079   }
3080   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
3081   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
3082   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
3083     applyDefaultMapping(OpdMapper);
3084     executeInWaterfallLoop(B, MI, {2, 5});
3085     return;
3086   }
3087   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
3088     applyDefaultMapping(OpdMapper);
3089     executeInWaterfallLoop(B, MI, {3, 6});
3090     return;
3091   }
3092   case AMDGPU::G_AMDGPU_S_BUFFER_LOAD:
3093   case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE:
3094   case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE:
3095   case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT:
3096   case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT: {
3097     applyMappingSBufferLoad(B, OpdMapper);
3098     return;
3099   }
3100   case AMDGPU::G_INTRINSIC:
3101   case AMDGPU::G_INTRINSIC_CONVERGENT: {
3102     switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
3103     case Intrinsic::amdgcn_readlane: {
3104       substituteSimpleCopyRegs(OpdMapper, 2);
3105 
3106       assert(OpdMapper.getVRegs(0).empty());
3107       assert(OpdMapper.getVRegs(3).empty());
3108 
3109       // Make sure the index is an SGPR. It doesn't make sense to run this in a
3110       // waterfall loop, so assume it's a uniform value.
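      // Sketch of the intended effect: if the index operand currently lives in
      // a VGPR, constrainOpWithReadfirstlane rewrites the operand to a
      // readfirstlane of the original value, producing the uniform SGPR this
      // operand requires.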
3111       constrainOpWithReadfirstlane(B, MI, 3); // Index
3112       return;
3113     }
3114     case Intrinsic::amdgcn_writelane: {
3115       assert(OpdMapper.getVRegs(0).empty());
3116       assert(OpdMapper.getVRegs(2).empty());
3117       assert(OpdMapper.getVRegs(3).empty());
3118 
3119       substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val
3120       constrainOpWithReadfirstlane(B, MI, 2); // Source value
3121       constrainOpWithReadfirstlane(B, MI, 3); // Index
3122       return;
3123     }
3124     case Intrinsic::amdgcn_interp_p1:
3125     case Intrinsic::amdgcn_interp_p2:
3126     case Intrinsic::amdgcn_interp_mov:
3127     case Intrinsic::amdgcn_interp_p1_f16:
3128     case Intrinsic::amdgcn_interp_p2_f16:
3129     case Intrinsic::amdgcn_lds_param_load: {
3130       applyDefaultMapping(OpdMapper);
3131 
3132     // Readfirstlane for the m0 value, which is always the last operand.
3133       // FIXME: Should this be a waterfall loop instead?
3134       constrainOpWithReadfirstlane(B, MI, MI.getNumOperands() - 1); // Index
3135       return;
3136     }
3137     case Intrinsic::amdgcn_interp_inreg_p10:
3138     case Intrinsic::amdgcn_interp_inreg_p2:
3139     case Intrinsic::amdgcn_interp_inreg_p10_f16:
3140     case Intrinsic::amdgcn_interp_inreg_p2_f16:
3141     case Intrinsic::amdgcn_interp_p10_rtz_f16:
3142     case Intrinsic::amdgcn_interp_p2_rtz_f16:
3143       applyDefaultMapping(OpdMapper);
3144       return;
3145     case Intrinsic::amdgcn_permlane16:
3146     case Intrinsic::amdgcn_permlanex16: {
3147       // Doing a waterfall loop over these wouldn't make any sense.
3148       substituteSimpleCopyRegs(OpdMapper, 2);
3149       substituteSimpleCopyRegs(OpdMapper, 3);
3150       constrainOpWithReadfirstlane(B, MI, 4);
3151       constrainOpWithReadfirstlane(B, MI, 5);
3152       return;
3153     }
3154     case Intrinsic::amdgcn_sbfe:
3155       applyMappingBFE(B, OpdMapper, true);
3156       return;
3157     case Intrinsic::amdgcn_ubfe:
3158       applyMappingBFE(B, OpdMapper, false);
3159       return;
3160     case Intrinsic::amdgcn_inverse_ballot:
3161     case Intrinsic::amdgcn_s_bitreplicate:
3162     case Intrinsic::amdgcn_s_quadmask:
3163     case Intrinsic::amdgcn_s_wqm:
3164       applyDefaultMapping(OpdMapper);
3165       constrainOpWithReadfirstlane(B, MI, 2); // Mask
3166       return;
3167     case Intrinsic::amdgcn_ballot:
3168       // Use default handling and insert copy to vcc source.
3169       break;
3170     }
3171     break;
3172   }
3173   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
3174   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
3175   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET:
3176   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
3177   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
3178     const AMDGPU::RsrcIntrinsic *RSrcIntrin =
3179         AMDGPU::lookupRsrcIntrinsic(AMDGPU::getIntrinsicID(MI));
3180     assert(RSrcIntrin && RSrcIntrin->IsImage);
3181     // Non-images can have complications from operands that allow both SGPR
3182     // and VGPR. For now it's too complicated to figure out the final opcode
3183     // to derive the register bank from the MCInstrDesc.
3184     applyMappingImage(B, MI, OpdMapper, RSrcIntrin->RsrcArg);
3185     return;
3186   }
3187   case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
3188     unsigned N = MI.getNumExplicitOperands() - 2;
3189     applyDefaultMapping(OpdMapper);
3190     executeInWaterfallLoop(B, MI, {N});
3191     return;
3192   }
3193   case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
3194   case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS: {
3195     auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
3196     switch (IntrID) {
3197     case Intrinsic::amdgcn_ds_ordered_add:
3198     case Intrinsic::amdgcn_ds_ordered_swap: {
3199       // This is only allowed to execute with 1 lane, so readfirstlane is safe.
3200       assert(OpdMapper.getVRegs(0).empty());
3201       substituteSimpleCopyRegs(OpdMapper, 3);
3202       constrainOpWithReadfirstlane(B, MI, 2); // M0
3203       return;
3204     }
3205     case Intrinsic::amdgcn_ds_gws_init:
3206     case Intrinsic::amdgcn_ds_gws_barrier:
3207     case Intrinsic::amdgcn_ds_gws_sema_br: {
3208       // Only the first lane executes, so readfirstlane is safe.
3209       substituteSimpleCopyRegs(OpdMapper, 1);
3210       constrainOpWithReadfirstlane(B, MI, 2); // M0
3211       return;
3212     }
3213     case Intrinsic::amdgcn_ds_gws_sema_v:
3214     case Intrinsic::amdgcn_ds_gws_sema_p:
3215     case Intrinsic::amdgcn_ds_gws_sema_release_all: {
3216       // Only the first lane executes, so readfirstlane is safe.
3217       constrainOpWithReadfirstlane(B, MI, 1); // M0
3218       return;
3219     }
3220     case Intrinsic::amdgcn_ds_append:
3221     case Intrinsic::amdgcn_ds_consume: {
3222       constrainOpWithReadfirstlane(B, MI, 2); // M0
3223       return;
3224     }
3225     case Intrinsic::amdgcn_s_sendmsg:
3226     case Intrinsic::amdgcn_s_sendmsghalt: {
3227       // FIXME: Should this use a waterfall loop?
3228       constrainOpWithReadfirstlane(B, MI, 2); // M0
3229       return;
3230     }
3231     case Intrinsic::amdgcn_s_setreg: {
3232       constrainOpWithReadfirstlane(B, MI, 2);
3233       return;
3234     }
3235     case Intrinsic::amdgcn_s_ttracedata:
3236       constrainOpWithReadfirstlane(B, MI, 1); // M0
3237       return;
3238     case Intrinsic::amdgcn_raw_buffer_load_lds:
3239     case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: {
3240       applyDefaultMapping(OpdMapper);
3241       constrainOpWithReadfirstlane(B, MI, 1); // rsrc
3242       constrainOpWithReadfirstlane(B, MI, 2); // M0
3243       constrainOpWithReadfirstlane(B, MI, 5); // soffset
3244       return;
3245     }
3246     case Intrinsic::amdgcn_struct_buffer_load_lds:
3247     case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
3248       applyDefaultMapping(OpdMapper);
3249       constrainOpWithReadfirstlane(B, MI, 1); // rsrc
3250       constrainOpWithReadfirstlane(B, MI, 2); // M0
3251       constrainOpWithReadfirstlane(B, MI, 6); // soffset
3252       return;
3253     }
3254     case Intrinsic::amdgcn_global_load_lds: {
3255       applyDefaultMapping(OpdMapper);
3256       constrainOpWithReadfirstlane(B, MI, 2);
3257       return;
3258     }
3259     case Intrinsic::amdgcn_lds_direct_load: {
3260       applyDefaultMapping(OpdMapper);
3261       // Readfirstlane for the m0 value, which is always the last operand.
3262       constrainOpWithReadfirstlane(B, MI, MI.getNumOperands() - 1); // Index
3263       return;
3264     }
3265     case Intrinsic::amdgcn_exp_row:
3266       applyDefaultMapping(OpdMapper);
3267       constrainOpWithReadfirstlane(B, MI, 8); // M0
3268       return;
3269     case Intrinsic::amdgcn_s_sleep_var:
3270       assert(OpdMapper.getVRegs(1).empty());
3271       constrainOpWithReadfirstlane(B, MI, 1);
3272       return;
3273     case Intrinsic::amdgcn_s_barrier_signal_var:
3274     case Intrinsic::amdgcn_s_barrier_join:
3275     case Intrinsic::amdgcn_s_wakeup_barrier:
3276       constrainOpWithReadfirstlane(B, MI, 1);
3277       return;
3278     case Intrinsic::amdgcn_s_barrier_signal_isfirst_var:
3279       constrainOpWithReadfirstlane(B, MI, 2);
3280       return;
3281     case Intrinsic::amdgcn_s_barrier_init:
3282       constrainOpWithReadfirstlane(B, MI, 1);
3283       constrainOpWithReadfirstlane(B, MI, 2);
3284       return;
3285     case Intrinsic::amdgcn_s_get_barrier_state: {
3286       constrainOpWithReadfirstlane(B, MI, 2);
3287       return;
3288     }
3289     default: {
3290       if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
3291               AMDGPU::lookupRsrcIntrinsic(IntrID)) {
3292         // Non-images can have complications from operands that allow both SGPR
3293         // and VGPR. For now it's too complicated to figure out the final opcode
3294         // to derive the register bank from the MCInstrDesc.
3295         if (RSrcIntrin->IsImage) {
3296           applyMappingImage(B, MI, OpdMapper, RSrcIntrin->RsrcArg);
3297           return;
3298         }
3299       }
3300 
3301       break;
3302     }
3303     }
3304     break;
3305   }
3306   case AMDGPU::G_SI_CALL: {
3307     // Use a set to avoid extra readfirstlanes in the case where multiple
3308     // operands are the same register.
3309     SmallSet<Register, 4> SGPROperandRegs;
3310 
3311     if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, {1}))
3312       break;
3313 
3314     // Move all copies to physical SGPRs that are used by the call instruction
3315     // into the loop block. Search backward from the call for these copies
3316     // until reaching the ADJCALLSTACKUP.
3317     unsigned FrameSetupOpcode = AMDGPU::ADJCALLSTACKUP;
3318     unsigned FrameDestroyOpcode = AMDGPU::ADJCALLSTACKDOWN;
3319 
3320     // Move all non-copies before the copies, so that a complete range can be
3321     // moved into the waterfall loop.
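    // Rough before/after picture (illustrative):
    //   ADJCALLSTACKUP; x = ...; COPY $sgpr4 = %a; y = ...; COPY $sgpr5 = %b;
    //   G_SI_CALL
    // is reordered so that x and y precede the first COPY, leaving a
    // contiguous [copies, call] range that can be wrapped in the loop.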
3322     SmallVector<MachineInstr *, 4> NonCopyInstrs;
3323     // Count of NonCopyInstrs found until the current LastCopy.
3324     unsigned NonCopyInstrsLen = 0;
3325     MachineBasicBlock::iterator Start(&MI);
3326     MachineBasicBlock::iterator LastCopy = Start;
3327     MachineBasicBlock *MBB = MI.getParent();
3328     const SIMachineFunctionInfo *Info =
3329         MBB->getParent()->getInfo<SIMachineFunctionInfo>();
3330     while (Start->getOpcode() != FrameSetupOpcode) {
3331       --Start;
3332       bool IsCopy = false;
3333       if (Start->getOpcode() == AMDGPU::COPY) {
3334         auto &Dst = Start->getOperand(0);
3335         if (Dst.isReg()) {
3336           Register Reg = Dst.getReg();
3337           if (Reg.isPhysical() && MI.readsRegister(Reg, TRI)) {
3338             IsCopy = true;
3339           } else {
3340             // Also move the copy from the scratch rsrc descriptor into the loop
3341             // to allow it to be optimized away.
3342             auto &Src = Start->getOperand(1);
3343             if (Src.isReg()) {
3344               Reg = Src.getReg();
3345               IsCopy = Info->getScratchRSrcReg() == Reg;
3346             }
3347           }
3348         }
3349       }
3350 
3351       if (IsCopy) {
3352         LastCopy = Start;
3353         NonCopyInstrsLen = NonCopyInstrs.size();
3354       } else {
3355         NonCopyInstrs.push_back(&*Start);
3356       }
3357     }
3358     NonCopyInstrs.resize(NonCopyInstrsLen);
3359 
3360     for (auto *NonCopy : reverse(NonCopyInstrs)) {
3361       MBB->splice(LastCopy, MBB, NonCopy->getIterator());
3362     }
3363     Start = LastCopy;
3364 
3365     // Do the same for copies after the loop
3366     NonCopyInstrs.clear();
3367     NonCopyInstrsLen = 0;
3368     MachineBasicBlock::iterator End(&MI);
3369     LastCopy = End;
3370     while (End->getOpcode() != FrameDestroyOpcode) {
3371       ++End;
3372       bool IsCopy = false;
3373       if (End->getOpcode() == AMDGPU::COPY) {
3374         auto &Src = End->getOperand(1);
3375         if (Src.isReg()) {
3376           Register Reg = Src.getReg();
3377           IsCopy = Reg.isPhysical() && MI.modifiesRegister(Reg, TRI);
3378         }
3379       }
3380 
3381       if (IsCopy) {
3382         LastCopy = End;
3383         NonCopyInstrsLen = NonCopyInstrs.size();
3384       } else {
3385         NonCopyInstrs.push_back(&*End);
3386       }
3387     }
3388     NonCopyInstrs.resize(NonCopyInstrsLen);
3389 
3390     End = LastCopy;
3391     ++LastCopy;
3392     for (auto *NonCopy : reverse(NonCopyInstrs)) {
3393       MBB->splice(LastCopy, MBB, NonCopy->getIterator());
3394     }
3395 
3396     ++End;
3397     B.setInsertPt(B.getMBB(), Start);
3398     executeInWaterfallLoop(B, make_range(Start, End), SGPROperandRegs);
3399     break;
3400   }
3401   case AMDGPU::G_LOAD:
3402   case AMDGPU::G_ZEXTLOAD:
3403   case AMDGPU::G_SEXTLOAD: {
3404     if (applyMappingLoad(B, OpdMapper, MI))
3405       return;
3406     break;
3407   }
3408   case AMDGPU::G_DYN_STACKALLOC:
3409     applyMappingDynStackAlloc(B, OpdMapper, MI);
3410     return;
3411   case AMDGPU::G_STACKRESTORE: {
3412     applyDefaultMapping(OpdMapper);
3413     constrainOpWithReadfirstlane(B, MI, 0);
3414     return;
3415   }
3416   case AMDGPU::G_SBFX:
3417     applyMappingBFE(B, OpdMapper, /*Signed*/ true);
3418     return;
3419   case AMDGPU::G_UBFX:
3420     applyMappingBFE(B, OpdMapper, /*Signed*/ false);
3421     return;
3422   case AMDGPU::G_AMDGPU_MAD_U64_U32:
3423   case AMDGPU::G_AMDGPU_MAD_I64_I32:
3424     applyMappingMAD_64_32(B, OpdMapper);
3425     return;
3426   case AMDGPU::G_PREFETCH: {
3427     if (!Subtarget.hasPrefetch()) {
3428       MI.eraseFromParent();
3429       return;
3430     }
3431     Register PtrReg = MI.getOperand(0).getReg();
3432     unsigned PtrBank = getRegBankID(PtrReg, MRI, AMDGPU::SGPRRegBankID);
3433     if (PtrBank == AMDGPU::VGPRRegBankID) {
3434       MI.eraseFromParent();
3435       return;
3436     }
3437     unsigned AS = MRI.getType(PtrReg).getAddressSpace();
3438     if (!AMDGPU::isFlatGlobalAddrSpace(AS) &&
3439         AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
3440       MI.eraseFromParent();
3441       return;
3442     }
3443     applyDefaultMapping(OpdMapper);
3444     return;
3445   }
3446   default:
3447     break;
3448   }
3449 
3450   return applyDefaultMapping(OpdMapper);
3451 }
3452 
3453 // vgpr, sgpr -> vgpr
3454 // vgpr, agpr -> vgpr
3455 // agpr, agpr -> agpr
3456 // agpr, sgpr -> vgpr
3457 static unsigned regBankUnion(unsigned RB0, unsigned RB1) {
3458   if (RB0 == AMDGPU::InvalidRegBankID)
3459     return RB1;
3460   if (RB1 == AMDGPU::InvalidRegBankID)
3461     return RB0;
3462 
3463   if (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID)
3464     return AMDGPU::SGPRRegBankID;
3465 
3466   if (RB0 == AMDGPU::AGPRRegBankID && RB1 == AMDGPU::AGPRRegBankID)
3467     return AMDGPU::AGPRRegBankID;
3468 
3469   return AMDGPU::VGPRRegBankID;
3470 }
3471 
3472 static unsigned regBankBoolUnion(unsigned RB0, unsigned RB1) {
3473   if (RB0 == AMDGPU::InvalidRegBankID)
3474     return RB1;
3475   if (RB1 == AMDGPU::InvalidRegBankID)
3476     return RB0;
3477 
3478   // vcc, vcc -> vcc
3479   // vcc, sgpr -> vcc
3480   // vcc, vgpr -> vcc
3481   if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID)
3482     return AMDGPU::VCCRegBankID;
3483 
3484   // Neither operand is vcc at this point; defer to the general bank union.
3485   return regBankUnion(RB0, RB1);
3486 }
3487 
3488 unsigned AMDGPURegisterBankInfo::getMappingType(const MachineRegisterInfo &MRI,
3489                                                 const MachineInstr &MI) const {
3490   unsigned RegBank = AMDGPU::InvalidRegBankID;
3491 
3492   for (const MachineOperand &MO : MI.operands()) {
3493     if (!MO.isReg())
3494       continue;
3495     Register Reg = MO.getReg();
3496     if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3497       RegBank = regBankUnion(RegBank, Bank->getID());
3498       if (RegBank == AMDGPU::VGPRRegBankID)
3499         break;
3500     }
3501   }
3502 
3503   return RegBank;
3504 }
3505 
3506 bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
3507   const MachineFunction &MF = *MI.getParent()->getParent();
3508   const MachineRegisterInfo &MRI = MF.getRegInfo();
3509   for (const MachineOperand &MO : MI.operands()) {
3510     if (!MO.isReg())
3511       continue;
3512     Register Reg = MO.getReg();
3513     if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3514       if (Bank->getID() != AMDGPU::SGPRRegBankID)
3515         return false;
3516     }
3517   }
3518   return true;
3519 }
3520 
3521 const RegisterBankInfo::InstructionMapping &
3522 AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
3523   const MachineFunction &MF = *MI.getParent()->getParent();
3524   const MachineRegisterInfo &MRI = MF.getRegInfo();
3525   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3526 
3527   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3528     const MachineOperand &SrcOp = MI.getOperand(i);
3529     if (!SrcOp.isReg())
3530       continue;
3531 
3532     unsigned Size = getSizeInBits(SrcOp.getReg(), MRI, *TRI);
3533     OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3534   }
3535   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3536                                MI.getNumOperands());
3537 }
3538 
3539 const RegisterBankInfo::InstructionMapping &
3540 AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
3541   const MachineFunction &MF = *MI.getParent()->getParent();
3542   const MachineRegisterInfo &MRI = MF.getRegInfo();
3543   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3544 
3545   // Even though we technically could use SGPRs, this would require knowledge of
3546   // the constant bus restriction. Force all sources to VGPR (except for VCC).
3547   //
3548   // TODO: Unary ops are trivially OK, so accept SGPRs?
3549   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3550     const MachineOperand &Src = MI.getOperand(i);
3551     if (!Src.isReg())
3552       continue;
3553 
3554     unsigned Size = getSizeInBits(Src.getReg(), MRI, *TRI);
3555     unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
3556     OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
3557   }
3558 
3559   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3560                                MI.getNumOperands());
3561 }
3562 
3563 const RegisterBankInfo::InstructionMapping &
3564 AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
3565   const MachineFunction &MF = *MI.getParent()->getParent();
3566   const MachineRegisterInfo &MRI = MF.getRegInfo();
3567   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3568 
3569   for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
3570     const MachineOperand &Op = MI.getOperand(I);
3571     if (!Op.isReg())
3572       continue;
3573 
3574     unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI);
3575     OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3576   }
3577 
3578   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3579                                MI.getNumOperands());
3580 }
3581 
3582 const RegisterBankInfo::InstructionMapping &
3583 AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI,
3584                                         const MachineInstr &MI,
3585                                         int RsrcIdx) const {
3586   // The reported argument index is relative to the IR intrinsic call arguments,
3587   // so we need to shift by the number of defs and the intrinsic ID.
3588   RsrcIdx += MI.getNumExplicitDefs() + 1;
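  // For example (illustrative): with one explicit def the operand list is
  //   [def, intrinsic_id, arg0, arg1, ...]
  // so IR argument i corresponds to machine operand
  // i + getNumExplicitDefs() + 1.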
3589 
3590   const int NumOps = MI.getNumOperands();
3591   SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps);
3592 
3593   // TODO: Should packed/unpacked D16 difference be reported here as part of
3594   // the value mapping?
3595   for (int I = 0; I != NumOps; ++I) {
3596     if (!MI.getOperand(I).isReg())
3597       continue;
3598 
3599     Register OpReg = MI.getOperand(I).getReg();
3600     // We replace some dead address operands with $noreg
3601     if (!OpReg)
3602       continue;
3603 
3604     unsigned Size = getSizeInBits(OpReg, MRI, *TRI);
3605 
3606     // FIXME: Probably need a new intrinsic register bank searchable table to
3607     // handle arbitrary intrinsics easily.
3608     //
3609     // If this has a sampler, it immediately follows rsrc.
3610     const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1;
3611 
3612     if (MustBeSGPR) {
3613       // If this must be an SGPR, report whatever bank it currently has as legal.
3614       unsigned NewBank = getRegBankID(OpReg, MRI, AMDGPU::SGPRRegBankID);
3615       OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size);
3616     } else {
3617       // Some operands must be VGPR, and these are easy to copy to.
3618       OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3619     }
3620   }
3621 
3622   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps);
3623 }
3624 
3625 /// Return the mapping for a pointer argument.
3626 const RegisterBankInfo::ValueMapping *
3627 AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI,
3628                                               Register PtrReg) const {
3629   LLT PtrTy = MRI.getType(PtrReg);
3630   unsigned Size = PtrTy.getSizeInBits();
3631   if (Subtarget.useFlatForGlobal() ||
3632       !AMDGPU::isFlatGlobalAddrSpace(PtrTy.getAddressSpace()))
3633     return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3634 
3635   // If we're using MUBUF instructions for global memory, an SGPR base register
3636   // is possible. Otherwise this needs to be a VGPR.
3637   const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3638   return AMDGPU::getValueMapping(PtrBank->getID(), Size);
3639 }
3640 
3641 const RegisterBankInfo::InstructionMapping &
3642 AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
3643 
3644   const MachineFunction &MF = *MI.getParent()->getParent();
3645   const MachineRegisterInfo &MRI = MF.getRegInfo();
3646   SmallVector<const ValueMapping*, 2> OpdsMapping(2);
3647   unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3648   Register PtrReg = MI.getOperand(1).getReg();
3649   LLT PtrTy = MRI.getType(PtrReg);
3650   unsigned AS = PtrTy.getAddressSpace();
3651   unsigned PtrSize = PtrTy.getSizeInBits();
3652 
3653   const ValueMapping *ValMapping;
3654   const ValueMapping *PtrMapping;
3655 
3656   const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3657 
3658   if (PtrBank == &AMDGPU::SGPRRegBank && AMDGPU::isFlatGlobalAddrSpace(AS)) {
3659     if (isScalarLoadLegal(MI)) {
3660       // We have a uniform instruction so we want to use an SMRD load
3661       ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3662       PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
3663     } else {
3664       ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3665 
3666       // If we're using MUBUF instructions for global memory, an SGPR base
3667       // register is possible. Otherwise this needs to be a VGPR.
3668       unsigned PtrBankID = Subtarget.useFlatForGlobal() ?
3669         AMDGPU::VGPRRegBankID : AMDGPU::SGPRRegBankID;
3670 
3671       PtrMapping = AMDGPU::getValueMapping(PtrBankID, PtrSize);
3672     }
3673   } else {
3674     ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3675     PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
3676   }
3677 
3678   OpdsMapping[0] = ValMapping;
3679   OpdsMapping[1] = PtrMapping;
3680   const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
3681       1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands());
3682   return Mapping;
3683 
3684   // FIXME: Do we want to add a mapping for FLAT load, or should we just
3685   // handle that during instruction selection?
3686 }
3687 
3688 unsigned
3689 AMDGPURegisterBankInfo::getRegBankID(Register Reg,
3690                                      const MachineRegisterInfo &MRI,
3691                                      unsigned Default) const {
3692   const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
3693   return Bank ? Bank->getID() : Default;
3694 }
3695 
3696 const RegisterBankInfo::ValueMapping *
3697 AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg,
3698                                          const MachineRegisterInfo &MRI,
3699                                          const TargetRegisterInfo &TRI) const {
3700   // Lie and claim anything is legal, even though this needs to be an SGPR;
3701   // applyMapping will have to deal with it as a waterfall loop.
3702   unsigned Bank = getRegBankID(Reg, MRI, AMDGPU::SGPRRegBankID);
3703   unsigned Size = getSizeInBits(Reg, MRI, TRI);
3704   return AMDGPU::getValueMapping(Bank, Size);
3705 }
3706 
3707 const RegisterBankInfo::ValueMapping *
3708 AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
3709                                          const MachineRegisterInfo &MRI,
3710                                          const TargetRegisterInfo &TRI) const {
3711   unsigned Size = getSizeInBits(Reg, MRI, TRI);
3712   return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3713 }
3714 
3715 const RegisterBankInfo::ValueMapping *
3716 AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg,
3717                                          const MachineRegisterInfo &MRI,
3718                                          const TargetRegisterInfo &TRI) const {
3719   unsigned Size = getSizeInBits(Reg, MRI, TRI);
3720   return AMDGPU::getValueMapping(AMDGPU::AGPRRegBankID, Size);
3721 }
3722 
3723 ///
3724 /// This function must return a legal mapping, because
3725 /// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
3726 /// in RegBankSelect::Mode::Fast. Any mapping that would cause a VGPR-to-SGPR
3727 /// copy to be generated is illegal.
3728 ///
3729 // Operands that must be SGPRs must accept potentially divergent VGPRs as
3730 // legal. These will be dealt with in applyMappingImpl.
3731 //
3732 const RegisterBankInfo::InstructionMapping &
3733 AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
3734   const MachineFunction &MF = *MI.getParent()->getParent();
3735   const MachineRegisterInfo &MRI = MF.getRegInfo();
3736 
3737   if (MI.isCopy() || MI.getOpcode() == AMDGPU::G_FREEZE) {
3738     // The default logic bothers to analyze impossible alternative mappings. We
3739     // want the most straightforward mapping, so just directly handle this.
3740     const RegisterBank *DstBank = getRegBank(MI.getOperand(0).getReg(), MRI,
3741                                              *TRI);
3742     const RegisterBank *SrcBank = getRegBank(MI.getOperand(1).getReg(), MRI,
3743                                              *TRI);
3744     assert(SrcBank && "src bank should have been assigned already");
3745     if (!DstBank)
3746       DstBank = SrcBank;
3747 
3748     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3749     if (MI.getOpcode() != AMDGPU::G_FREEZE &&
3750         cannotCopy(*DstBank, *SrcBank, TypeSize::getFixed(Size)))
3751       return getInvalidInstructionMapping();
3752 
3753     const ValueMapping &ValMap = getValueMapping(0, Size, *DstBank);
3754     unsigned OpdsMappingSize = MI.isCopy() ? 1 : 2;
3755     SmallVector<const ValueMapping *, 1> OpdsMapping(OpdsMappingSize);
3756     OpdsMapping[0] = &ValMap;
3757     if (MI.getOpcode() == AMDGPU::G_FREEZE)
3758       OpdsMapping[1] = &ValMap;
3759 
3760     return getInstructionMapping(
3761         1, /*Cost*/ 1,
3762         /*OperandsMapping*/ getOperandsMapping(OpdsMapping), OpdsMappingSize);
3763   }
3764 
3765   if (MI.isRegSequence()) {
3766     // If any input is a VGPR, the result must be a VGPR. The default handling
3767     // assumes any copy between banks is legal.
3768     unsigned BankID = AMDGPU::SGPRRegBankID;
3769 
3770     for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3771       auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI);
3772       // It doesn't make sense to use vcc or scc banks here, so just ignore
3773       // them.
3774       if (OpBank != AMDGPU::SGPRRegBankID) {
3775         BankID = AMDGPU::VGPRRegBankID;
3776         break;
3777       }
3778     }
3779     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3780 
3781     const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID));
3782     return getInstructionMapping(
3783         1, /*Cost*/ 1,
3784         /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3785   }
3786 
3787   // The default handling is broken and doesn't handle illegal SGPR->VGPR copies
3788   // properly.
3789   //
3790   // TODO: There are additional exec masking dependencies to analyze.
3791   if (auto *PHI = dyn_cast<GPhi>(&MI)) {
3792     unsigned ResultBank = AMDGPU::InvalidRegBankID;
3793     Register DstReg = PHI->getReg(0);
3794 
3795     // Sometimes the result may have already been assigned a bank.
3796     if (const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI))
3797       ResultBank = DstBank->getID();
3798 
3799     for (unsigned I = 0; I < PHI->getNumIncomingValues(); ++I) {
3800       Register Reg = PHI->getIncomingValue(I);
3801       const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
3802 
3803       // FIXME: Assuming VGPR for any undetermined inputs.
3804       if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) {
3805         ResultBank = AMDGPU::VGPRRegBankID;
3806         break;
3807       }
3808 
3809       // FIXME: Need to promote SGPR case to s32
3810       unsigned OpBank = Bank->getID();
3811       ResultBank = regBankBoolUnion(ResultBank, OpBank);
3812     }
3813 
3814     assert(ResultBank != AMDGPU::InvalidRegBankID);
3815 
3816     unsigned Size = MRI.getType(DstReg).getSizeInBits();
3817 
3818     const ValueMapping &ValMap =
3819         getValueMapping(0, Size, getRegBank(ResultBank));
3820     return getInstructionMapping(
3821         1, /*Cost*/ 1,
3822         /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3823   }
3824 
3825   const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
3826   if (Mapping.isValid())
3827     return Mapping;
3828 
3829   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3830 
3831   switch (MI.getOpcode()) {
3832   default:
3833     return getInvalidInstructionMapping();
3834 
3835   case AMDGPU::G_AND:
3836   case AMDGPU::G_OR:
3837   case AMDGPU::G_XOR:
3838   case AMDGPU::G_MUL: {
3839     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3840     if (Size == 1) {
3841       const RegisterBank *DstBank
3842         = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI);
3843 
3844       unsigned TargetBankID = AMDGPU::InvalidRegBankID;
3845       unsigned BankLHS = AMDGPU::InvalidRegBankID;
3846       unsigned BankRHS = AMDGPU::InvalidRegBankID;
3847       if (DstBank) {
3848         TargetBankID = DstBank->getID();
3849         if (DstBank == &AMDGPU::VCCRegBank) {
3850           TargetBankID = AMDGPU::VCCRegBankID;
3851           BankLHS = AMDGPU::VCCRegBankID;
3852           BankRHS = AMDGPU::VCCRegBankID;
3853         } else {
3854           BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
3855                                  AMDGPU::SGPRRegBankID);
3856           BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
3857                                  AMDGPU::SGPRRegBankID);
3858         }
3859       } else {
3860         BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
3861                                AMDGPU::VCCRegBankID);
3862         BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
3863                                AMDGPU::VCCRegBankID);
3864 
3865         // Both inputs should be true booleans to produce a boolean result.
3866         if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) {
3867           TargetBankID = AMDGPU::VGPRRegBankID;
3868         } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) {
3869           TargetBankID = AMDGPU::VCCRegBankID;
3870           BankLHS = AMDGPU::VCCRegBankID;
3871           BankRHS = AMDGPU::VCCRegBankID;
3872         } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) {
3873           TargetBankID = AMDGPU::SGPRRegBankID;
3874         }
3875       }
3876 
3877       OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size);
3878       OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size);
3879       OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size);
3880       break;
3881     }
3882 
3883     if (Size == 64) {
3884 
3885       if (isSALUMapping(MI)) {
3886         OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size);
3887         OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
3888       } else {
3889         OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
3890         unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI /*, DefaultBankID*/);
3891         OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size);
3892 
3893         unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI /*, DefaultBankID*/);
3894         OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size);
3895       }
3896 
3897       break;
3898     }
3899 
3900     [[fallthrough]];
3901   }
3902   case AMDGPU::G_PTR_ADD:
3903   case AMDGPU::G_PTRMASK:
3904   case AMDGPU::G_ADD:
3905   case AMDGPU::G_SUB:
3906   case AMDGPU::G_SHL:
3907   case AMDGPU::G_LSHR:
3908   case AMDGPU::G_ASHR:
3909   case AMDGPU::G_UADDO:
3910   case AMDGPU::G_USUBO:
3911   case AMDGPU::G_UADDE:
3912   case AMDGPU::G_SADDE:
3913   case AMDGPU::G_USUBE:
3914   case AMDGPU::G_SSUBE:
3915   case AMDGPU::G_SMIN:
3916   case AMDGPU::G_SMAX:
3917   case AMDGPU::G_UMIN:
3918   case AMDGPU::G_UMAX:
3919   case AMDGPU::G_ABS:
3920   case AMDGPU::G_SHUFFLE_VECTOR:
3921   case AMDGPU::G_SBFX:
3922   case AMDGPU::G_UBFX:
3923   case AMDGPU::G_AMDGPU_S_MUL_I64_I32:
3924   case AMDGPU::G_AMDGPU_S_MUL_U64_U32:
3925     if (isSALUMapping(MI))
3926       return getDefaultMappingSOP(MI);
3927     return getDefaultMappingVOP(MI);
3928   case AMDGPU::G_FADD:
3929   case AMDGPU::G_FSUB:
3930   case AMDGPU::G_FMUL:
3931   case AMDGPU::G_FMA:
3932   case AMDGPU::G_FFLOOR:
3933   case AMDGPU::G_FCEIL:
3934   case AMDGPU::G_INTRINSIC_ROUNDEVEN:
3935   case AMDGPU::G_FMINNUM:
3936   case AMDGPU::G_FMAXNUM:
3937   case AMDGPU::G_FMINIMUM:
3938   case AMDGPU::G_FMAXIMUM:
3939   case AMDGPU::G_INTRINSIC_TRUNC:
3940   case AMDGPU::G_STRICT_FADD:
3941   case AMDGPU::G_STRICT_FSUB:
3942   case AMDGPU::G_STRICT_FMUL:
3943   case AMDGPU::G_STRICT_FMA: {
3944     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3945     unsigned Size = Ty.getSizeInBits();
3946     if (Subtarget.hasSALUFloatInsts() && Ty.isScalar() &&
3947         (Size == 32 || Size == 16) && isSALUMapping(MI))
3948       return getDefaultMappingSOP(MI);
3949     return getDefaultMappingVOP(MI);
3950   }
3951   case AMDGPU::G_FPTOSI:
3952   case AMDGPU::G_FPTOUI:
3953   case AMDGPU::G_SITOFP:
3954   case AMDGPU::G_UITOFP: {
3955     unsigned SizeDst = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3956     unsigned SizeSrc = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3957     if (Subtarget.hasSALUFloatInsts() && SizeDst == 32 && SizeSrc == 32 &&
3958         isSALUMapping(MI))
3959       return getDefaultMappingSOP(MI);
3960     return getDefaultMappingVOP(MI);
3961   }
3962   case AMDGPU::G_FPTRUNC:
3963   case AMDGPU::G_FPEXT: {
3964     unsigned SizeDst = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3965     unsigned SizeSrc = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3966     if (Subtarget.hasSALUFloatInsts() && SizeDst != 64 && SizeSrc != 64 &&
3967         isSALUMapping(MI))
3968       return getDefaultMappingSOP(MI);
3969     return getDefaultMappingVOP(MI);
3970   }
3971   case AMDGPU::G_FSQRT:
3972   case AMDGPU::G_FEXP2:
3973   case AMDGPU::G_FLOG2: {
3974     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3975     if (Subtarget.hasPseudoScalarTrans() && (Size == 16 || Size == 32) &&
3976         isSALUMapping(MI))
3977       return getDefaultMappingSOP(MI);
3978     return getDefaultMappingVOP(MI);
3979   }
3980   case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU
3981   case AMDGPU::G_SSUBSAT:
3982   case AMDGPU::G_UADDSAT:
3983   case AMDGPU::G_USUBSAT:
3984   case AMDGPU::G_FMAD:
3985   case AMDGPU::G_FLDEXP:
3986   case AMDGPU::G_FMINNUM_IEEE:
3987   case AMDGPU::G_FMAXNUM_IEEE:
3988   case AMDGPU::G_FCANONICALIZE:
3989   case AMDGPU::G_STRICT_FLDEXP:
3990   case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar?
3991   case AMDGPU::G_FSHR: // TODO: Expand for scalar
3992   case AMDGPU::G_AMDGPU_FMIN_LEGACY:
3993   case AMDGPU::G_AMDGPU_FMAX_LEGACY:
3994   case AMDGPU::G_AMDGPU_RCP_IFLAG:
3995   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
3996   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
3997   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
3998   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
3999   case AMDGPU::G_AMDGPU_CVT_PK_I16_I32:
4000   case AMDGPU::G_AMDGPU_SMED3:
4001   case AMDGPU::G_AMDGPU_FMED3:
4002     return getDefaultMappingVOP(MI);
4003   case AMDGPU::G_UMULH:
4004   case AMDGPU::G_SMULH: {
4005     if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI))
4006       return getDefaultMappingSOP(MI);
4007     return getDefaultMappingVOP(MI);
4008   }
4009   case AMDGPU::G_AMDGPU_MAD_U64_U32:
4010   case AMDGPU::G_AMDGPU_MAD_I64_I32: {
4011     // Three possible mappings:
4012     //
4013     //  - Default SOP
4014     //  - Default VOP
4015     //  - Scalar multiply: src0 and src1 are SGPRs, the rest is VOP.
4016     //
4017     // This allows instruction selection to keep the multiplication part of the
4018     // instruction on the SALU.
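         //
         // For example (illustrative MIR, not from a test):
         //   %d:vgpr(s64), %c:vcc(s1) = G_AMDGPU_MAD_U64_U32 %a:sgpr(s32),
         //       %b:sgpr(s32), %acc:vgpr(s64)
         // keeps the 32x32->64 multiply selectable on the SALU while only the
         // 64-bit accumulate goes to the VALU.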
4019     bool AllSalu = true;
4020     bool MulSalu = true;
4021     for (unsigned i = 0; i < 5; ++i) {
4022       Register Reg = MI.getOperand(i).getReg();
4023       if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
4024         if (Bank->getID() != AMDGPU::SGPRRegBankID) {
4025           AllSalu = false;
4026           if (i == 2 || i == 3) {
4027             MulSalu = false;
4028             break;
4029           }
4030         }
4031       }
4032     }
4033 
4034     if (AllSalu)
4035       return getDefaultMappingSOP(MI);
4036 
4037     // If the multiply-add is full-rate in VALU, use that even if the
4038     // multiplication part is scalar. Accumulating separately on the VALU would
4039     // take two instructions.
4040     if (!MulSalu || Subtarget.hasFullRate64Ops())
4041       return getDefaultMappingVOP(MI);
4042 
4043     // Keep the multiplication on the SALU, then accumulate on the VALU.
4044     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64);
4045     OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4046     OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
4047     OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
4048     OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64);
4049     break;
4050   }
4051   case AMDGPU::G_IMPLICIT_DEF: {
4052     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4053     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4054     break;
4055   }
4056   case AMDGPU::G_FCONSTANT:
4057   case AMDGPU::G_CONSTANT:
4058   case AMDGPU::G_GLOBAL_VALUE:
4059   case AMDGPU::G_BLOCK_ADDR:
4060   case AMDGPU::G_READSTEADYCOUNTER:
4061   case AMDGPU::G_READCYCLECOUNTER: {
4062     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4063     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4064     break;
4065   }
4066   case AMDGPU::G_FRAME_INDEX: {
4067     // TODO: This should be the same as other constants, but eliminateFrameIndex
4068     // currently assumes VALU uses.
4069     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4070     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4071     break;
4072   }
4073   case AMDGPU::G_DYN_STACKALLOC: {
4074     // Result is always uniform, and a wave reduction is needed for the source.
4075     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
4076     unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
4077     OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, 32);
4078     break;
4079   }
4080   case AMDGPU::G_AMDGPU_WAVE_ADDRESS: {
4081     // This case is weird because we expect a physical register in the source,
4082     // but need to set a bank anyway.
4083     //
4084     // TODO: We could select the result to SGPR or VGPR
4085     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
4086     OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
4087     break;
4088   }
4089   case AMDGPU::G_INSERT: {
4090     unsigned BankID = getMappingType(MRI, MI);
4091     unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4092     unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4093     unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
4094     OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
4095     OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
4096     OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize);
4097     OpdsMapping[3] = nullptr;
4098     break;
4099   }
4100   case AMDGPU::G_EXTRACT: {
4101     unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
4102     unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4103     unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4104     OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
4105     OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
4106     OpdsMapping[2] = nullptr;
4107     break;
4108   }
4109   case AMDGPU::G_BUILD_VECTOR:
4110   case AMDGPU::G_BUILD_VECTOR_TRUNC: {
4111     LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
4112     if (DstTy == LLT::fixed_vector(2, 16)) {
4113       unsigned DstSize = DstTy.getSizeInBits();
4114       unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4115       unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
4116       unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
4117       unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID);
4118 
4119       OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize);
4120       OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize);
4121       OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize);
4122       break;
4123     }
4124 
4125     [[fallthrough]];
4126   }
4127   case AMDGPU::G_MERGE_VALUES:
4128   case AMDGPU::G_CONCAT_VECTORS: {
4129     unsigned Bank = getMappingType(MRI, MI);
4130     unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4131     unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4132 
4133     OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
4134     // Op1 and Dst should use the same register bank.
4135     for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i)
4136       OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize);
4137     break;
4138   }
4139   case AMDGPU::G_BITREVERSE:
4140   case AMDGPU::G_BITCAST:
4141   case AMDGPU::G_INTTOPTR:
4142   case AMDGPU::G_PTRTOINT:
4143   case AMDGPU::G_FABS:
4144   case AMDGPU::G_FNEG: {
4145     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4146     unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
4147     OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
4148     break;
4149   }
4150   case AMDGPU::G_AMDGPU_FFBH_U32:
4151   case AMDGPU::G_AMDGPU_FFBL_B32:
4152   case AMDGPU::G_CTLZ_ZERO_UNDEF:
4153   case AMDGPU::G_CTTZ_ZERO_UNDEF: {
4154     unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4155     unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
4156     OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
4157     OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(BankID, Size);
4158     break;
4159   }
4160   case AMDGPU::G_CTPOP: {
4161     unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4162     unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
4163     OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
4164 
4165     // This should really be getValueMappingSGPR64Only, but allowing the generic
4166     // code to handle the register split just makes using LegalizerHelper more
4167     // difficult.
4168     OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
4169     break;
4170   }
4171   case AMDGPU::G_TRUNC: {
4172     Register Dst = MI.getOperand(0).getReg();
4173     Register Src = MI.getOperand(1).getReg();
4174     unsigned Bank = getRegBankID(Src, MRI);
4175     unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
4176     unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
4177     OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
4178     OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize);
4179     break;
4180   }
4181   case AMDGPU::G_ZEXT:
4182   case AMDGPU::G_SEXT:
4183   case AMDGPU::G_ANYEXT:
4184   case AMDGPU::G_SEXT_INREG: {
4185     Register Dst = MI.getOperand(0).getReg();
4186     Register Src = MI.getOperand(1).getReg();
4187     unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
4188     unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
4189 
4190     unsigned DstBank;
4191     const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI);
4192     assert(SrcBank);
4193     switch (SrcBank->getID()) {
4194     case AMDGPU::SGPRRegBankID:
4195       DstBank = AMDGPU::SGPRRegBankID;
4196       break;
4197     default:
4198       DstBank = AMDGPU::VGPRRegBankID;
4199       break;
4200     }
4201 
4202     // A scalar extend can use the 64-bit BFE directly, but VGPRs require
4203     // extending to 32 bits first, and then to 64.
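         // For example (illustrative): a uniform G_SEXT from s16 to s64 can
         // select to a single S_BFE_I64, while the VGPR form first does a
         // 32-bit V_BFE_I32 and then fills the high half with V_ASHRREV_I32.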
4204     OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize);
4205     OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(),
4206                                                        SrcSize);
4207     break;
4208   }
4209   case AMDGPU::G_IS_FPCLASS: {
4210     Register SrcReg = MI.getOperand(1).getReg();
4211     unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
4212     unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4213     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
4214     OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4215     break;
4216   }
4217   case AMDGPU::G_STORE: {
4218     assert(MI.getOperand(0).isReg());
4219     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4220 
4221     // FIXME: We need to specify a different reg bank once scalar stores are
4222     // supported.
4223     const ValueMapping *ValMapping =
4224         AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4225     OpdsMapping[0] = ValMapping;
4226     OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
4227     break;
4228   }
4229   case AMDGPU::G_ICMP:
4230   case AMDGPU::G_FCMP: {
4231     unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4232 
4233     // See if the result register has already been constrained to vcc, which may
4234     // happen due to control flow intrinsic lowering.
4235     unsigned DstBank = getRegBankID(MI.getOperand(0).getReg(), MRI,
4236                                     AMDGPU::SGPRRegBankID);
4237     unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);
4238     unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI);
4239 
4240     auto canUseSCCICMP = [&]() {
4241       auto Pred =
4242           static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
4243       return Size == 32 ||
4244              (Size == 64 &&
4245               (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
4246               Subtarget.hasScalarCompareEq64());
4247     };
4248     auto canUseSCCFCMP = [&]() {
4249       return Subtarget.hasSALUFloatInsts() && (Size == 32 || Size == 16);
4250     };
4251 
4252     bool isICMP = MI.getOpcode() == AMDGPU::G_ICMP;
4253     bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID &&
4254                      Op2Bank == AMDGPU::SGPRRegBankID &&
4255                      Op3Bank == AMDGPU::SGPRRegBankID &&
4256                      (isICMP ? canUseSCCICMP() : canUseSCCFCMP());
4257 
4258     DstBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
4259     unsigned SrcBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
4260 
4261     // TODO: Use 32-bit for scalar output size.
4262     // SCC results will need to be copied to a 32-bit SGPR virtual register.
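         // (Illustratively, selection can materialize that as something like
         // S_CMP_EQ_U32 followed by S_CSELECT_B32 1, 0 into the SGPR.)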
4263     const unsigned ResultSize = 1;
4264 
4265     OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, ResultSize);
4266     OpdsMapping[1] = nullptr; // Predicate Operand.
4267     OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, Size);
4268     OpdsMapping[3] = AMDGPU::getValueMapping(SrcBank, Size);
4269     break;
4270   }
4271   case AMDGPU::G_EXTRACT_VECTOR_ELT: {
4272     // A VGPR index needs a waterfall loop when indexing an SGPR vector.
4273     unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
4274     unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4275     unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4276     unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4277     unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI);
4278     unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank);
4279 
4280     OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize);
4281     OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize);
4282 
4283     // The index can be in either bank if the source vector is VGPR.
4284     OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4285     break;
4286   }
4287   case AMDGPU::G_INSERT_VECTOR_ELT: {
4288     unsigned OutputBankID = isSALUMapping(MI) ?
4289       AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
4290 
4291     unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4292     unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4293     unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
4294     unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
4295     unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI);
4296 
4297     OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
4298     OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize);
4299 
4300     // This is a weird case, because we need to break down the mapping based on
4301     // the register bank of a different operand.
4302     if (InsertSize == 64 && OutputBankID == AMDGPU::VGPRRegBankID) {
4303       OpdsMapping[2] = AMDGPU::getValueMappingSplit64(InsertEltBankID,
4304                                                       InsertSize);
4305     } else {
4306       assert(InsertSize == 32 || InsertSize == 64);
4307       OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBankID, InsertSize);
4308     }
4309 
4310     // The index can be in either bank if the source vector is VGPR.
4311     OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize);
4312     break;
4313   }
4314   case AMDGPU::G_UNMERGE_VALUES: {
4315     unsigned Bank = getMappingType(MRI, MI);
4316 
4317     // Op1 and Dst should use the same register bank.
4318     // FIXME: Shouldn't this be the default? Why do we need to handle this?
4319     for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
4320       unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
4321       OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size);
4322     }
4323     break;
4324   }
4325   case AMDGPU::G_AMDGPU_BUFFER_LOAD:
4326   case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
4327   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
4328   case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
4329   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
4330   case AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE:
4331   case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE:
4332   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE_TFE:
4333   case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE:
4334   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT_TFE:
4335   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
4336   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE:
4337   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
4338   case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
4339   case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
4340   case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
4341   case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16:
4342   case AMDGPU::G_AMDGPU_BUFFER_STORE:
4343   case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
4344   case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
4345   case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
4346   case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: {
4347     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4348 
4349     // rsrc
4350     OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4351 
4352     // vindex
4353     OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4354 
4355     // voffset
4356     OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4357 
4358     // soffset
4359     OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4360 
4361     // Any remaining operands are immediates and were correctly null
4362     // initialized.
4363     break;
4364   }
4365   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
4366   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
4367   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
4368   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
4369   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
4370   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
4371   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
4372   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
4373   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
4374   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
4375   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
4376   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
4377   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
4378   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
4379   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
4380     // vdata_out
4381     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4382 
4383     // vdata_in
4384     OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4385 
4386     // rsrc
4387     OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4388 
4389     // vindex
4390     OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4391 
4392     // voffset
4393     OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4394 
4395     // soffset
4396     OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4397 
4398     // Any remaining operands are immediates and were correctly null
4399     // initialized.
4400     break;
4401   }
4402   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
4403     // vdata_out
4404     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4405 
4406     // vdata_in
4407     OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4408 
4409     // cmp
4410     OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4411 
4412     // rsrc
4413     OpdsMapping[3] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4414 
4415     // vindex
4416     OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4417 
4418     // voffset
4419     OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4420 
4421     // soffset
4422     OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
4423 
4424     // Any remaining operands are immediates and were correctly null
4425     // initialized.
4426     break;
4427   }
4428   case AMDGPU::G_AMDGPU_S_BUFFER_LOAD:
4429   case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE:
4430   case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE:
4431   case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT:
4432   case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT: {
4433     // Lie and claim everything is legal, even though some need to be
4434     // SGPRs. applyMapping will have to deal with it as a waterfall loop.
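         //
         // A sketch of that waterfall loop (illustrative wave-level
         // pseudocode; wide operands are handled per 32-bit piece):
         //   loop:
         //     s_val = readfirstlane(v_val)      ; pick one lane's value
         //     vcc   = (s_val == v_val)          ; lanes holding that value
         //     s_and_saveexec_b64 s_save, vcc    ; run only those lanes
         //     ...the load, now with a uniform s_val...
         //     s_xor_b64 exec, exec, s_save      ; retire finished lanes
         //     s_cbranch_execnz loop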
4435     OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4436     OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4437 
4438     // We need to convert this to a MUBUF if either the resource or the
4439     // offset is VGPR.
4440     unsigned RSrcBank = OpdsMapping[1]->BreakDown[0].RegBank->getID();
4441     unsigned OffsetBank = OpdsMapping[2]->BreakDown[0].RegBank->getID();
4442     unsigned ResultBank = regBankUnion(RSrcBank, OffsetBank);
4443 
4444     unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4445     OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0);
4446     break;
4447   }
4448   case AMDGPU::G_INTRINSIC:
4449   case AMDGPU::G_INTRINSIC_CONVERGENT: {
4450     switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
4451     default:
4452       return getInvalidInstructionMapping();
4453     case Intrinsic::amdgcn_div_fmas:
4454     case Intrinsic::amdgcn_div_fixup:
4455     case Intrinsic::amdgcn_trig_preop:
4456     case Intrinsic::amdgcn_sin:
4457     case Intrinsic::amdgcn_cos:
4458     case Intrinsic::amdgcn_log_clamp:
4459     case Intrinsic::amdgcn_rcp_legacy:
4460     case Intrinsic::amdgcn_rsq_legacy:
4461     case Intrinsic::amdgcn_rsq_clamp:
4462     case Intrinsic::amdgcn_fmul_legacy:
4463     case Intrinsic::amdgcn_fma_legacy:
4464     case Intrinsic::amdgcn_frexp_mant:
4465     case Intrinsic::amdgcn_frexp_exp:
4466     case Intrinsic::amdgcn_fract:
4467     case Intrinsic::amdgcn_cvt_pknorm_i16:
4468     case Intrinsic::amdgcn_cvt_pknorm_u16:
4469     case Intrinsic::amdgcn_cvt_pk_i16:
4470     case Intrinsic::amdgcn_cvt_pk_u16:
4471     case Intrinsic::amdgcn_fmed3:
4472     case Intrinsic::amdgcn_cubeid:
4473     case Intrinsic::amdgcn_cubema:
4474     case Intrinsic::amdgcn_cubesc:
4475     case Intrinsic::amdgcn_cubetc:
4476     case Intrinsic::amdgcn_sffbh:
4477     case Intrinsic::amdgcn_fmad_ftz:
4478     case Intrinsic::amdgcn_mbcnt_lo:
4479     case Intrinsic::amdgcn_mbcnt_hi:
4480     case Intrinsic::amdgcn_mul_u24:
4481     case Intrinsic::amdgcn_mul_i24:
4482     case Intrinsic::amdgcn_mulhi_u24:
4483     case Intrinsic::amdgcn_mulhi_i24:
4484     case Intrinsic::amdgcn_lerp:
4485     case Intrinsic::amdgcn_sad_u8:
4486     case Intrinsic::amdgcn_msad_u8:
4487     case Intrinsic::amdgcn_sad_hi_u8:
4488     case Intrinsic::amdgcn_sad_u16:
4489     case Intrinsic::amdgcn_qsad_pk_u16_u8:
4490     case Intrinsic::amdgcn_mqsad_pk_u16_u8:
4491     case Intrinsic::amdgcn_mqsad_u32_u8:
4492     case Intrinsic::amdgcn_cvt_pk_u8_f32:
4493     case Intrinsic::amdgcn_alignbyte:
4494     case Intrinsic::amdgcn_perm:
4495     case Intrinsic::amdgcn_fdot2:
4496     case Intrinsic::amdgcn_sdot2:
4497     case Intrinsic::amdgcn_udot2:
4498     case Intrinsic::amdgcn_sdot4:
4499     case Intrinsic::amdgcn_udot4:
4500     case Intrinsic::amdgcn_sdot8:
4501     case Intrinsic::amdgcn_udot8:
4502     case Intrinsic::amdgcn_fdot2_bf16_bf16:
4503     case Intrinsic::amdgcn_fdot2_f16_f16:
4504     case Intrinsic::amdgcn_fdot2_f32_bf16:
4505     case Intrinsic::amdgcn_sudot4:
4506     case Intrinsic::amdgcn_sudot8:
4507     case Intrinsic::amdgcn_dot4_f32_fp8_bf8:
4508     case Intrinsic::amdgcn_dot4_f32_bf8_fp8:
4509     case Intrinsic::amdgcn_dot4_f32_fp8_fp8:
4510     case Intrinsic::amdgcn_dot4_f32_bf8_bf8:
4511     case Intrinsic::amdgcn_cvt_f32_fp8:
4512     case Intrinsic::amdgcn_cvt_f32_bf8:
4513     case Intrinsic::amdgcn_cvt_pk_f32_fp8:
4514     case Intrinsic::amdgcn_cvt_pk_f32_bf8:
4515     case Intrinsic::amdgcn_cvt_pk_fp8_f32:
4516     case Intrinsic::amdgcn_cvt_pk_bf8_f32:
4517     case Intrinsic::amdgcn_cvt_sr_fp8_f32:
4518     case Intrinsic::amdgcn_cvt_sr_bf8_f32:
4519     case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16:
4520     case Intrinsic::amdgcn_wmma_f16_16x16x16_f16:
4521     case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied:
4522     case Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied:
4523     case Intrinsic::amdgcn_wmma_f32_16x16x16_bf16:
4524     case Intrinsic::amdgcn_wmma_f32_16x16x16_f16:
4525     case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4:
4526     case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8:
4527     case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8:
4528     case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8:
4529     case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8:
4530     case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8:
4531     case Intrinsic::amdgcn_wmma_i32_16x16x32_iu4:
4532     case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
4533     case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
4534     case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
4535     case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
4536     case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
4537     case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
4538     case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4:
4539     case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
4540     case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
4541     case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
4542     case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8:
4543       return getDefaultMappingVOP(MI);
4544     case Intrinsic::amdgcn_log:
4545     case Intrinsic::amdgcn_exp2:
4546     case Intrinsic::amdgcn_rcp:
4547     case Intrinsic::amdgcn_rsq:
4548     case Intrinsic::amdgcn_sqrt: {
4549       unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4550       if (Subtarget.hasPseudoScalarTrans() && (Size == 16 || Size == 32) &&
4551           isSALUMapping(MI))
4552         return getDefaultMappingSOP(MI);
4553       return getDefaultMappingVOP(MI);
4554     }
4555     case Intrinsic::amdgcn_sbfe:
4556     case Intrinsic::amdgcn_ubfe:
4557       if (isSALUMapping(MI))
4558         return getDefaultMappingSOP(MI);
4559       return getDefaultMappingVOP(MI);
4560     case Intrinsic::amdgcn_ds_swizzle:
4561     case Intrinsic::amdgcn_ds_permute:
4562     case Intrinsic::amdgcn_ds_bpermute:
4563     case Intrinsic::amdgcn_update_dpp:
4564     case Intrinsic::amdgcn_mov_dpp8:
4565     case Intrinsic::amdgcn_mov_dpp:
4566     case Intrinsic::amdgcn_strict_wwm:
4567     case Intrinsic::amdgcn_wwm:
4568     case Intrinsic::amdgcn_strict_wqm:
4569     case Intrinsic::amdgcn_wqm:
4570     case Intrinsic::amdgcn_softwqm:
4571     case Intrinsic::amdgcn_set_inactive:
4572     case Intrinsic::amdgcn_set_inactive_chain_arg:
4573     case Intrinsic::amdgcn_permlane64:
4574       return getDefaultMappingAllVGPR(MI);
4575     case Intrinsic::amdgcn_cvt_pkrtz:
4576       if (Subtarget.hasSALUFloatInsts() && isSALUMapping(MI))
4577         return getDefaultMappingSOP(MI);
4578       return getDefaultMappingVOP(MI);
4579     case Intrinsic::amdgcn_kernarg_segment_ptr:
4580     case Intrinsic::amdgcn_s_getpc:
4581     case Intrinsic::amdgcn_groupstaticsize:
4582     case Intrinsic::amdgcn_reloc_constant:
4583     case Intrinsic::returnaddress: {
4584       unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4585       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4586       break;
4587     }
4588     case Intrinsic::amdgcn_wqm_vote: {
4589       unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4590       OpdsMapping[0] = OpdsMapping[2]
4591         = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size);
4592       break;
4593     }
4594     case Intrinsic::amdgcn_ps_live: {
4595       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4596       break;
4597     }
4598     case Intrinsic::amdgcn_div_scale: {
4599       unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4600       unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4601       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size);
4602       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size);
4603 
4604       unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
4605       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4606       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4607       break;
4608     }
4609     case Intrinsic::amdgcn_class: {
4610       Register Src0Reg = MI.getOperand(2).getReg();
4611       Register Src1Reg = MI.getOperand(3).getReg();
4612       unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits();
4613       unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits();
4614       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4615       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
4616       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src0Size);
4617       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src1Size);
4618       break;
4619     }
4620     case Intrinsic::amdgcn_icmp:
4621     case Intrinsic::amdgcn_fcmp: {
4622       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4623       // This is not VCCRegBank because the result is not used in boolean contexts.
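           // (E.g. the wave-sized i64 result of llvm.amdgcn.icmp is an
           // ordinary lane-mask value read by the program, not a branch
           // condition.)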
4624       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4625       unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4626       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
4627       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
4628       break;
4629     }
4630     case Intrinsic::amdgcn_readlane: {
4631       // This must be an SGPR, but accept a VGPR.
4632       Register IdxReg = MI.getOperand(3).getReg();
4633       unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
4634       unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
4635       OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4636       [[fallthrough]];
4637     }
4638     case Intrinsic::amdgcn_readfirstlane: {
4639       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4640       unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4641       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4642       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4643       break;
4644     }
4645     case Intrinsic::amdgcn_writelane: {
4646       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4647       Register SrcReg = MI.getOperand(2).getReg();
4648       unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
4649       unsigned SrcBank = getRegBankID(SrcReg, MRI, AMDGPU::SGPRRegBankID);
4650       Register IdxReg = MI.getOperand(3).getReg();
4651       unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
4652       unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
4653       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4654 
4655       // These 2 must be SGPRs, but accept VGPRs. Readfirstlane will be inserted
4656       // to legalize.
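           // (Sketch: a divergent value or index gets a V_READFIRSTLANE_B32
           // inserted before V_WRITELANE_B32 is selected.)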
4657       OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize);
4658       OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4659       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4660       break;
4661     }
4662     case Intrinsic::amdgcn_if_break: {
4663       unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4664       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4665       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4666       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4667       break;
4668     }
4669     case Intrinsic::amdgcn_permlane16:
4670     case Intrinsic::amdgcn_permlanex16: {
4671       unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4672       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4673       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4674       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4675       OpdsMapping[4] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4676       OpdsMapping[5] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4677       break;
4678     }
4679     case Intrinsic::amdgcn_permlane16_var:
4680     case Intrinsic::amdgcn_permlanex16_var: {
4681       unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4682       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4683       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4684       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4685       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4686       break;
4687     }
4688     case Intrinsic::amdgcn_mfma_f32_4x4x1f32:
4689     case Intrinsic::amdgcn_mfma_f32_4x4x4f16:
4690     case Intrinsic::amdgcn_mfma_i32_4x4x4i8:
4691     case Intrinsic::amdgcn_mfma_f32_4x4x2bf16:
4692     case Intrinsic::amdgcn_mfma_f32_16x16x1f32:
4693     case Intrinsic::amdgcn_mfma_f32_16x16x4f32:
4694     case Intrinsic::amdgcn_mfma_f32_16x16x4f16:
4695     case Intrinsic::amdgcn_mfma_f32_16x16x16f16:
4696     case Intrinsic::amdgcn_mfma_i32_16x16x4i8:
4697     case Intrinsic::amdgcn_mfma_i32_16x16x16i8:
4698     case Intrinsic::amdgcn_mfma_f32_16x16x2bf16:
4699     case Intrinsic::amdgcn_mfma_f32_16x16x8bf16:
4700     case Intrinsic::amdgcn_mfma_f32_32x32x1f32:
4701     case Intrinsic::amdgcn_mfma_f32_32x32x2f32:
4702     case Intrinsic::amdgcn_mfma_f32_32x32x4f16:
4703     case Intrinsic::amdgcn_mfma_f32_32x32x8f16:
4704     case Intrinsic::amdgcn_mfma_i32_32x32x4i8:
4705     case Intrinsic::amdgcn_mfma_i32_32x32x8i8:
4706     case Intrinsic::amdgcn_mfma_f32_32x32x2bf16:
4707     case Intrinsic::amdgcn_mfma_f32_32x32x4bf16:
4708     case Intrinsic::amdgcn_mfma_f32_32x32x4bf16_1k:
4709     case Intrinsic::amdgcn_mfma_f32_16x16x4bf16_1k:
4710     case Intrinsic::amdgcn_mfma_f32_4x4x4bf16_1k:
4711     case Intrinsic::amdgcn_mfma_f32_32x32x8bf16_1k:
4712     case Intrinsic::amdgcn_mfma_f32_16x16x16bf16_1k:
4713     case Intrinsic::amdgcn_mfma_f64_16x16x4f64:
4714     case Intrinsic::amdgcn_mfma_f64_4x4x4f64:
4715     case Intrinsic::amdgcn_mfma_i32_16x16x32_i8:
4716     case Intrinsic::amdgcn_mfma_i32_32x32x16_i8:
4717     case Intrinsic::amdgcn_mfma_f32_16x16x8_xf32:
4718     case Intrinsic::amdgcn_mfma_f32_32x32x4_xf32:
4719     case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_bf8:
4720     case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_fp8:
4721     case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_bf8:
4722     case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_fp8:
4723     case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_bf8:
4724     case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_fp8:
4725     case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_bf8:
4726     case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_fp8: {
4727       // Default for MAI intrinsics.
4728       // srcC can also be an immediate which can be folded later.
4729       // FIXME: Should we eventually add an alternative mapping with AGPR src
4730       // for srcA/srcB?
4731       //
4732       // vdst, srcA, srcB, srcC
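           //
           // Sketch of the rationale: on subtargets where MFMA can read and
           // write VGPRs directly, mayNeedAGPRs() can be false, and the VGPR
           // mapping avoids cross-bank copies around the intrinsic.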
4733       const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4734       OpdsMapping[0] =
4735           Info->mayNeedAGPRs()
4736               ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI)
4737               : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4738       OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4739       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4740       OpdsMapping[4] =
4741           Info->mayNeedAGPRs()
4742               ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI)
4743               : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4744       break;
4745     }
4746     case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
4747     case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
4748     case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
4749     case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
4750     case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
4751     case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
4752     case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
4753     case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
4754     case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
4755     case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
4756     case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
4757     case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
4758     case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
4759     case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8: {
4760       // vdst, srcA, srcB, srcC, idx
4761       OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4762       OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4763       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4764       OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4765       OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4766       break;
4767     }
4768     case Intrinsic::amdgcn_interp_p1:
4769     case Intrinsic::amdgcn_interp_p2:
4770     case Intrinsic::amdgcn_interp_mov:
4771     case Intrinsic::amdgcn_interp_p1_f16:
4772     case Intrinsic::amdgcn_interp_p2_f16:
4773     case Intrinsic::amdgcn_lds_param_load: {
4774       const int M0Idx = MI.getNumOperands() - 1;
4775       Register M0Reg = MI.getOperand(M0Idx).getReg();
4776       unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
4777       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4778 
4779       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4780       for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
4781         OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4782 
4783       // This must be an SGPR, but take whatever the original bank is and
4784       // fix it up later.
4785       OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
4786       break;
4787     }
4788     case Intrinsic::amdgcn_interp_inreg_p10:
4789     case Intrinsic::amdgcn_interp_inreg_p2:
4790     case Intrinsic::amdgcn_interp_inreg_p10_f16:
4791     case Intrinsic::amdgcn_interp_inreg_p2_f16:
4792     case Intrinsic::amdgcn_interp_p10_rtz_f16:
4793     case Intrinsic::amdgcn_interp_p2_rtz_f16: {
4794       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4795       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4796       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4797       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4798       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4799       break;
4800     }
4801     case Intrinsic::amdgcn_ballot: {
4802       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4803       unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4804       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4805       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, SrcSize);
4806       break;
4807     }
4808     case Intrinsic::amdgcn_inverse_ballot: {
4809       // This must be an SGPR, but accept a VGPR.
4810       Register MaskReg = MI.getOperand(2).getReg();
4811       unsigned MaskSize = MRI.getType(MaskReg).getSizeInBits();
4812       unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID);
4813       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4814       OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize);
4815       break;
4816     }
4817     case Intrinsic::amdgcn_s_quadmask:
4818     case Intrinsic::amdgcn_s_wqm: {
4819       Register MaskReg = MI.getOperand(2).getReg();
4820       unsigned MaskSize = MRI.getType(MaskReg).getSizeInBits();
4821       unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID);
4822       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, MaskSize);
4823       OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize);
4824       break;
4825     }
4826     case Intrinsic::amdgcn_wave_reduce_umin:
4827     case Intrinsic::amdgcn_wave_reduce_umax: {
4828       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4829       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4830       unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4831       auto regBankID =
4832           isSALUMapping(MI) ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
4833       OpdsMapping[2] = AMDGPU::getValueMapping(regBankID, OpSize);
4834       break;
4835     }
4836     case Intrinsic::amdgcn_s_bitreplicate:
4837       Register MaskReg = MI.getOperand(2).getReg();
4838       unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID);
4839       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64);
4840       OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, 32);
4841     }
4842     break;
4843   }
4844   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
4845   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
4846   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET:
4847   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
4848   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
4849     auto IntrID = AMDGPU::getIntrinsicID(MI);
4850     const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID);
4851     assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic");
4852     // Non-images can have complications from operands that allow both SGPR
4853     // and VGPR. For now it's too complicated to figure out the final opcode
4854     // to derive the register bank from the MCInstrDesc.
4855     assert(RSrcIntrin->IsImage);
4856     return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg);
4857   }
4858   case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
4859     unsigned N = MI.getNumExplicitOperands() - 2;
4860     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 128);
4861     OpdsMapping[N] = getSGPROpMapping(MI.getOperand(N).getReg(), MRI, *TRI);
4862     if (N == 3) {
4863       // Sequential form: all operands combined into VGPR256/VGPR512
4864       unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
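           // Widths above 256 bits are padded up to the 512-bit mapping.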
4865       if (Size > 256)
4866         Size = 512;
4867       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4868     } else {
4869       // NSA form
4870       for (unsigned I = 2; I < N; ++I) {
4871         unsigned Size = MRI.getType(MI.getOperand(I).getReg()).getSizeInBits();
4872         OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4873       }
4874     }
4875     break;
4876   }
4877   case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
4878   case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS: {
4879     auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
4880     switch (IntrID) {
4881     case Intrinsic::amdgcn_s_getreg:
4882     case Intrinsic::amdgcn_s_memtime:
4883     case Intrinsic::amdgcn_s_memrealtime:
4884     case Intrinsic::amdgcn_s_get_waveid_in_workgroup:
4885     case Intrinsic::amdgcn_s_sendmsg_rtn: {
4886       unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4887       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4888       break;
4889     }
4890     case Intrinsic::amdgcn_global_atomic_fadd:
4891     case Intrinsic::amdgcn_global_atomic_csub:
4892     case Intrinsic::amdgcn_global_atomic_fmin:
4893     case Intrinsic::amdgcn_global_atomic_fmax:
4894     case Intrinsic::amdgcn_global_atomic_fmin_num:
4895     case Intrinsic::amdgcn_global_atomic_fmax_num:
4896     case Intrinsic::amdgcn_flat_atomic_fadd:
4897     case Intrinsic::amdgcn_flat_atomic_fmin:
4898     case Intrinsic::amdgcn_flat_atomic_fmax:
4899     case Intrinsic::amdgcn_flat_atomic_fmin_num:
4900     case Intrinsic::amdgcn_flat_atomic_fmax_num:
4901     case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
4902     case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
4903     case Intrinsic::amdgcn_atomic_cond_sub_u32:
4904     case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
4905     case Intrinsic::amdgcn_global_load_tr_b64:
4906     case Intrinsic::amdgcn_global_load_tr_b128:
4907       return getDefaultMappingAllVGPR(MI);
4908     case Intrinsic::amdgcn_ds_ordered_add:
4909     case Intrinsic::amdgcn_ds_ordered_swap: {
4910       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4911       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4912       unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4913                                  AMDGPU::SGPRRegBankID);
4914       OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32);
4915       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4916       break;
4917     }
4918     case Intrinsic::amdgcn_ds_append:
4919     case Intrinsic::amdgcn_ds_consume: {
4920       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4921       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4922       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4923       break;
4924     }
4925     case Intrinsic::amdgcn_exp_compr:
4926       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4927       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4928       break;
4929     case Intrinsic::amdgcn_exp:
4930       // FIXME: Could we support packed types here?
4931       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4932       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4933       OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4934       OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4935       break;
4936     case Intrinsic::amdgcn_exp_row:
4937       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4938       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4939       OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4940       OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4941       OpdsMapping[8] = getSGPROpMapping(MI.getOperand(8).getReg(), MRI, *TRI);
4942       break;
4943     case Intrinsic::amdgcn_s_sendmsg:
4944     case Intrinsic::amdgcn_s_sendmsghalt: {
4945       // This must be an SGPR, but accept a VGPR.
4946       unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4947                                    AMDGPU::SGPRRegBankID);
4948       OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
4949       break;
4950     }
4951     case Intrinsic::amdgcn_s_setreg: {
4952       // This must be an SGPR, but accept a VGPR.
4953       unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4954                                    AMDGPU::SGPRRegBankID);
4955       OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
4956       break;
4957     }
4958     case Intrinsic::amdgcn_s_ttracedata: {
4959       // This must be an SGPR, but accept a VGPR.
4960       unsigned Bank =
4961           getRegBankID(MI.getOperand(1).getReg(), MRI, AMDGPU::SGPRRegBankID);
4962       OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
4963       break;
4964     }
4965     case Intrinsic::amdgcn_end_cf: {
4966       unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4967       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4968       break;
4969     }
4970     case Intrinsic::amdgcn_else: {
4971       unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4972       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4973       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
4974       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
4975       break;
4976     }
4977     case Intrinsic::amdgcn_live_mask: {
4978       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4979       break;
4980     }
4981     case Intrinsic::amdgcn_wqm_demote:
4982     case Intrinsic::amdgcn_kill: {
4983       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4984       break;
4985     }
4986     case Intrinsic::amdgcn_raw_buffer_load:
4987     case Intrinsic::amdgcn_raw_ptr_buffer_load:
4988     case Intrinsic::amdgcn_raw_atomic_buffer_load:
4989     case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
4990     case Intrinsic::amdgcn_raw_tbuffer_load:
4991     case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
4992       // FIXME: Should make the intrinsic ID the last operand of the
4993       // instruction; then this would be the same as a store.
4994       OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4995       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4996       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4997       OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4998       break;
4999     }
5000     case Intrinsic::amdgcn_raw_buffer_load_lds:
5001     case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: {
5002       OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
5003       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5004       OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
5005       OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
5006       break;
5007     }
5008     case Intrinsic::amdgcn_raw_buffer_store:
5009     case Intrinsic::amdgcn_raw_ptr_buffer_store:
5010     case Intrinsic::amdgcn_raw_buffer_store_format:
5011     case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
5012     case Intrinsic::amdgcn_raw_tbuffer_store:
5013     case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
5014       OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
5015       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5016       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
5017       OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
5018       break;
5019     }
5020     case Intrinsic::amdgcn_struct_buffer_load:
5021     case Intrinsic::amdgcn_struct_ptr_buffer_load:
5022     case Intrinsic::amdgcn_struct_tbuffer_load:
5023     case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
5024       OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
5025       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5026       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
5027       OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
5028       OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
5029       break;
5030     }
5031     case Intrinsic::amdgcn_struct_buffer_load_lds:
5032     case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
5033       OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
5034       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5035       OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
5036       OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
5037       OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
5038       break;
5039     }
5040     case Intrinsic::amdgcn_struct_buffer_store:
5041     case Intrinsic::amdgcn_struct_ptr_buffer_store:
5042     case Intrinsic::amdgcn_struct_tbuffer_store:
5043     case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
5044       OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
5045       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5046       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
5047       OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
5048       OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
5049       break;
5050     }
5051     case Intrinsic::amdgcn_init_exec_from_input: {
5052       unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
5053       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
5054       break;
5055     }
5056     case Intrinsic::amdgcn_ds_gws_init:
5057     case Intrinsic::amdgcn_ds_gws_barrier:
5058     case Intrinsic::amdgcn_ds_gws_sema_br: {
5059       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
5060 
5061       // This must be an SGPR, but accept a VGPR.
5062       unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
5063                                    AMDGPU::SGPRRegBankID);
5064       OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
5065       break;
5066     }
5067     case Intrinsic::amdgcn_ds_gws_sema_v:
5068     case Intrinsic::amdgcn_ds_gws_sema_p:
5069     case Intrinsic::amdgcn_ds_gws_sema_release_all: {
5070       // This must be an SGPR, but accept a VGPR.
5071       unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI,
5072                                    AMDGPU::SGPRRegBankID);
5073       OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
5074       break;
5075     }
5076     case Intrinsic::amdgcn_global_load_lds: {
5077       OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
5078       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5079       break;
5080     }
5081     case Intrinsic::amdgcn_lds_direct_load: {
5082       const int M0Idx = MI.getNumOperands() - 1;
5083       Register M0Reg = MI.getOperand(M0Idx).getReg();
5084       unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
5085       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
5086 
5087       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
5088       for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
5089         OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
5090 
5091       // This must be an SGPR, but take whatever the original bank is and
5092       // fix it up later.
5093       OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
5094       break;
5095     }
5096     case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
5097     case Intrinsic::amdgcn_ds_sub_gs_reg_rtn:
5098       OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
5099       OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5100       break;
5101     case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
5102       OpdsMapping[0] =
5103           getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); // %vdst
5104       OpdsMapping[1] =
5105           getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); // %addr
5106       OpdsMapping[3] =
5107           getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); // %addr
5108       OpdsMapping[4] =
5109           getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); // %data0
5110       OpdsMapping[5] =
5111           getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); // %data1
5112       break;
5113     }
5114     case Intrinsic::amdgcn_s_sleep_var:
5115       OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
5116       break;
5117     case Intrinsic::amdgcn_s_barrier_signal_var:
5118     case Intrinsic::amdgcn_s_barrier_join:
5119     case Intrinsic::amdgcn_s_wakeup_barrier:
5120       OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
5121       break;
5122     case Intrinsic::amdgcn_s_barrier_init:
5123       OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
5124       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5125       break;
5126     case Intrinsic::amdgcn_s_barrier_signal_isfirst_var: {
5127       const unsigned ResultSize = 1;
5128       OpdsMapping[0] =
5129           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, ResultSize);
5130       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5131       break;
5132     }
5133     case Intrinsic::amdgcn_s_barrier_signal_isfirst:
5134     case Intrinsic::amdgcn_s_barrier_leave: {
5135       const unsigned ResultSize = 1;
5136       OpdsMapping[0] =
5137           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, ResultSize);
5138       break;
5139     }
5140     case Intrinsic::amdgcn_s_get_barrier_state: {
5141       OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
5142       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5143       break;
5144     }
5145     case Intrinsic::amdgcn_pops_exiting_wave_id:
5146       return getDefaultMappingSOP(MI);
5147     default:
5148       return getInvalidInstructionMapping();
5149     }
5150     break;
5151   }
  case AMDGPU::G_SELECT: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                    AMDGPU::SGPRRegBankID);
    unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI,
                                    AMDGPU::SGPRRegBankID);
    bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID &&
                    Op3Bank == AMDGPU::SGPRRegBankID;

    unsigned CondBankDefault = SGPRSrcs ?
      AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
    unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI,
                                     CondBankDefault);
    if (CondBank == AMDGPU::SGPRRegBankID)
      CondBank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
    else if (CondBank == AMDGPU::VGPRRegBankID)
      CondBank = AMDGPU::VCCRegBankID;

    unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SGPRRegBankID ?
      AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;

    assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SGPRRegBankID);

    // TODO: Should report 32-bit for scalar condition type.
    if (Size == 64) {
      OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
      OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
      OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
    } else {
      OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);
      OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);
    }

    break;
  }

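  // For G_SI_CALL, operand 0 is the 64-bit SGPR pair that receives the
  // return address and operand 1 is the callee pointer. Operands from index
  // 4 on are the implicit arguments, which the loop below leaves in whatever
  // bank they already occupy.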
  case AMDGPU::G_SI_CALL: {
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64);
    // Lie and claim everything is legal, even though some need to be
    // SGPRs. applyMapping will have to deal with it as a waterfall loop.
    OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);

    // Allow anything for implicit arguments
    for (unsigned I = 4; I < MI.getNumOperands(); ++I) {
      if (MI.getOperand(I).isReg()) {
        Register Reg = MI.getOperand(I).getReg();
        auto OpBank = getRegBankID(Reg, MRI);
        unsigned Size = getSizeInBits(Reg, MRI, *TRI);
        OpdsMapping[I] = AMDGPU::getValueMapping(OpBank, Size);
      }
    }
    break;
  }
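  // All load flavors are mapped by getInstrMappingForLoad, which can keep a
  // uniform, scalar-legal load on the SGPR bank rather than pessimistically
  // using VGPRs.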
  case AMDGPU::G_LOAD:
  case AMDGPU::G_ZEXTLOAD:
  case AMDGPU::G_SEXTLOAD:
    return getInstrMappingForLoad(MI);

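  // The atomics are mapped as VALU operations: the result and data operands
  // always get VGPRs, while the pointer keeps its natural bank (see
  // getValueMappingForPtr) so a uniform address can still be used as a
  // scalar base.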
  case AMDGPU::G_ATOMICRMW_XCHG:
  case AMDGPU::G_ATOMICRMW_ADD:
  case AMDGPU::G_ATOMICRMW_SUB:
  case AMDGPU::G_ATOMICRMW_AND:
  case AMDGPU::G_ATOMICRMW_OR:
  case AMDGPU::G_ATOMICRMW_XOR:
  case AMDGPU::G_ATOMICRMW_MAX:
  case AMDGPU::G_ATOMICRMW_MIN:
  case AMDGPU::G_ATOMICRMW_UMAX:
  case AMDGPU::G_ATOMICRMW_UMIN:
  case AMDGPU::G_ATOMICRMW_FADD:
  case AMDGPU::G_ATOMICRMW_FMIN:
  case AMDGPU::G_ATOMICRMW_FMAX:
  case AMDGPU::G_ATOMICRMW_UINC_WRAP:
  case AMDGPU::G_ATOMICRMW_UDEC_WRAP:
  case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: {
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
    OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
    break;
  }
  case AMDGPU::G_ATOMIC_CMPXCHG: {
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
    OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
    OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
    break;
  }
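  // The branch condition is an s1 value: an SGPR condition stays scalar (a
  // uniform branch), while anything else is treated as a VCC lane mask.
  // Illustrative MIR (a sketch): G_BRCOND %cond:vcc(s1), %bb.1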
  case AMDGPU::G_BRCOND: {
    unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI,
                                 AMDGPU::SGPRRegBankID);
    assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
    if (Bank != AMDGPU::SGPRRegBankID)
      Bank = AMDGPU::VCCRegBankID;

    OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
    break;
  }
  case AMDGPU::G_FPTRUNC_ROUND_UPWARD:
  case AMDGPU::G_FPTRUNC_ROUND_DOWNWARD:
    return getDefaultMappingVOP(MI);
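  // G_PREFETCH selects to the scalar prefetch instructions, so the address
  // operand is mapped to the SGPR bank.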
  case AMDGPU::G_PREFETCH:
    OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
    break;
  }

  return getInstructionMapping(/*ID*/1, /*Cost*/1,
                               getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}