1 //===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the RegisterBankInfo class for
10 /// AMDGPU.
11 ///
12 /// \par
13 ///
14 /// AMDGPU has unique register bank constraints that require special high level
15 /// strategies to deal with. There are two main true physical register banks,
16 /// VGPR (vector) and SGPR (scalar). Additionally, the VCC register bank is a
17 /// sort of pseudo-register bank needed to represent SGPRs used in a vector
18 /// boolean context. There is also the AGPR bank, which is a special purpose
19 /// physical register bank present on some subtargets.
20 ///
21 /// Copying from VGPR to SGPR is generally illegal, unless the value is known to
22 /// be uniform. It is generally not valid to legalize operands by inserting
23 /// copies as on other targets. Operations which require uniform, SGPR operands
24 /// generally require scalarization by repeatedly executing the instruction,
25 /// activating each set of lanes using a unique set of input values. This is
26 /// referred to as a waterfall loop.
27 ///
28 /// \par Booleans
29 ///
30 /// Booleans (s1 values) require special consideration. A vector compare result
31 /// is naturally a bitmask with one bit per lane, in a 32 or 64-bit
32 /// register. These are represented with the VCC bank. During selection, we need
33 /// to be able to unambiguously go back from a register class to a register
34 /// bank. To distinguish whether an SGPR should use the SGPR or VCC register
35 /// bank, we need to know the use context type. An SGPR-class register with type
36 /// s1 always means a VCC bank value; any wider type uses the SGPR bank. A scalar
37 /// compare sets SCC, which is a 1-bit unaddressable register. This will need to
38 /// be copied to a 32-bit virtual register. Taken together, this means we need to
39 /// adjust the type of boolean operations to be regbank legal. All SALU booleans
40 /// need to be widened to 32 bits, and all VALU booleans need to be s1 values.
41 ///
42 /// A noteworthy exception to the s1-means-vcc rule is for legalization artifact
43 /// casts. G_TRUNC s1 results, and G_SEXT/G_ZEXT/G_ANYEXT sources are never vcc
44 /// bank. A non-boolean source (such as a truncate from a 1-bit load from
45 /// memory) will require a copy to the VCC bank which will require clearing the
46 /// high bits and inserting a compare.
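///
/// As a rough sketch (generic MIR with illustrative register names), the
/// regbank-legal forms of a scalar and a vector boolean produced by compares
/// are:
///
///   %sbool:sgpr(s32) = G_ICMP intpred(eq), %a:sgpr(s32), %b:sgpr(s32)
///   %vbool:vcc(s1) = G_ICMP intpred(eq), %x:vgpr(s32), %y:vgpr(s32)
///
/// The SALU boolean is widened to a 32-bit SGPR value, while the VALU boolean
/// stays an s1 value in the VCC bank.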
47 ///
48 /// \par Constant bus restriction
49 ///
50 /// VALU instructions have a limitation known as the constant bus
51 /// restriction. Most VALU instructions can use SGPR operands, but may read at
52 /// most 1 SGPR or constant literal value (this is raised to 2 in gfx10 for most
53 /// instructions). This is one unique SGPR, so the same SGPR may be used for
54 /// multiple operands. From a register bank perspective, any combination of
55 /// operands should be legal as an SGPR, but this is contextually dependent on
56 /// the SGPR operands all being the same register. It is therefore optimal to
57 /// choose the SGPR with the most uses to minimize the number of copies.
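///
/// As an illustrative ISA-level sketch of the pre-gfx10 restriction:
///
///   v_fma_f32 v0, s0, s0, v1   ; ok: one unique SGPR read for two operands
///   v_fma_f32 v0, s0, s1, v1   ; illegal: two unique SGPRs on the constant bus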
58 ///
59 /// We avoid trying to solve this problem in RegBankSelect. Any VALU G_*
60 /// operation should have its source operands all mapped to VGPRs (except for
61 /// VCC), inserting copies from any SGPR operands. This is the most trivial legal
62 /// mapping. Anything beyond the simplest 1:1 instruction selection would be too
63 /// complicated to solve here. Every optimization pattern or instruction
64 /// selected to multiple outputs would have to enforce this rule, and there
65 /// would be additional complexity in tracking this rule for every G_*
66 /// operation. By forcing all inputs to VGPRs, it also simplifies the task of
67 /// picking the optimal operand combination from a post-isel optimization pass.
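///
/// A minimal sketch of this trivial mapping (illustrative MIR): an SGPR input
/// to a VALU operation is first copied to a VGPR, e.g.
///
///   %xv:vgpr(s32) = COPY %x:sgpr(s32)
///   %d:vgpr(s32) = G_ADD %xv:vgpr(s32), %y:vgpr(s32)
///
/// rather than using %x directly as an SGPR operand of the VALU instruction.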
68 ///
69 //===----------------------------------------------------------------------===//
70 
71 #include "AMDGPURegisterBankInfo.h"
72 
73 #include "AMDGPU.h"
74 #include "AMDGPUGlobalISelUtils.h"
75 #include "AMDGPUInstrInfo.h"
76 #include "GCNSubtarget.h"
77 #include "SIMachineFunctionInfo.h"
78 #include "SIRegisterInfo.h"
79 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
80 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
81 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
82 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
83 #include "llvm/CodeGen/RegisterBank.h"
84 #include "llvm/IR/IntrinsicsAMDGPU.h"
85 
86 #define GET_TARGET_REGBANK_IMPL
87 #include "AMDGPUGenRegisterBank.inc"
88 
89 // This file will be TableGen'ed at some point.
90 #include "AMDGPUGenRegisterBankInfo.def"
91 
92 using namespace llvm;
93 using namespace MIPatternMatch;
94 
95 namespace {
96 
97 // Observer to apply a register bank to new registers created by LegalizerHelper.
98 class ApplyRegBankMapping final : public GISelChangeObserver {
99 private:
100   MachineIRBuilder &B;
101   const AMDGPURegisterBankInfo &RBI;
102   MachineRegisterInfo &MRI;
103   const RegisterBank *NewBank;
104   SmallVector<MachineInstr *, 4> NewInsts;
105 
106 public:
107   ApplyRegBankMapping(MachineIRBuilder &B, const AMDGPURegisterBankInfo &RBI_,
108                       MachineRegisterInfo &MRI_, const RegisterBank *RB)
109       : B(B), RBI(RBI_), MRI(MRI_), NewBank(RB) {
110     assert(!B.isObservingChanges());
111     B.setChangeObserver(*this);
112   }
113 
114   ~ApplyRegBankMapping() {
115     for (MachineInstr *MI : NewInsts)
116       applyBank(*MI);
117 
118     B.stopObservingChanges();
119   }
120 
121   /// Apply the new bank to any registers that don't have a register class or bank set.
122   void applyBank(MachineInstr &MI) {
123     const unsigned Opc = MI.getOpcode();
124     if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT ||
125         Opc == AMDGPU::G_SEXT) {
126       // LegalizerHelper wants to use the basic legalization artifacts when
127       // widening etc. We don't handle selection with vcc in artifact sources,
128       // so we need to use a select instead to handle these properly.
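      //
      // For example (a sketch with illustrative virtual register names):
      //   %ext:vgpr(s32) = G_SEXT %cond:vcc(s1)
      // is rewritten below into
      //   %t:vgpr(s32) = G_CONSTANT i32 -1
      //   %f:vgpr(s32) = G_CONSTANT i32 0
      //   %ext:vgpr(s32) = G_SELECT %cond:vcc(s1), %t:vgpr(s32), %f:vgpr(s32)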
129       Register DstReg = MI.getOperand(0).getReg();
130       Register SrcReg = MI.getOperand(1).getReg();
131       const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI);
132       if (SrcBank == &AMDGPU::VCCRegBank) {
133         const LLT S32 = LLT::scalar(32);
134         assert(MRI.getType(SrcReg) == LLT::scalar(1));
135         assert(MRI.getType(DstReg) == S32);
136         assert(NewBank == &AMDGPU::VGPRRegBank);
137 
138         // Replace the extension with a select, which really uses the boolean
139         // source.
140         B.setInsertPt(*MI.getParent(), MI);
141 
142         auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? -1 : 1);
143         auto False = B.buildConstant(S32, 0);
144         B.buildSelect(DstReg, SrcReg, True, False);
145         MRI.setRegBank(True.getReg(0), *NewBank);
146         MRI.setRegBank(False.getReg(0), *NewBank);
147         MI.eraseFromParent();
148       }
149 
150       assert(!MRI.getRegClassOrRegBank(DstReg));
151       MRI.setRegBank(DstReg, *NewBank);
152       return;
153     }
154 
155 #ifndef NDEBUG
156     if (Opc == AMDGPU::G_TRUNC) {
157       Register DstReg = MI.getOperand(0).getReg();
158       const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI);
159       assert(DstBank != &AMDGPU::VCCRegBank);
160     }
161 #endif
162 
163     for (MachineOperand &Op : MI.operands()) {
164       if (!Op.isReg())
165         continue;
166 
167       // We may see physical registers if building a real MI
168       Register Reg = Op.getReg();
169       if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg))
170         continue;
171 
172       const RegisterBank *RB = NewBank;
173       if (MRI.getType(Reg) == LLT::scalar(1)) {
174         assert(NewBank == &AMDGPU::VGPRRegBank &&
175                "s1 operands should only be used for vector bools");
176         assert((MI.getOpcode() != AMDGPU::G_TRUNC &&
177                 MI.getOpcode() != AMDGPU::G_ANYEXT) &&
178                "not expecting legalization artifacts here");
179         RB = &AMDGPU::VCCRegBank;
180       }
181 
182       MRI.setRegBank(Reg, *RB);
183     }
184   }
185 
186   void erasingInstr(MachineInstr &MI) override {}
187 
188   void createdInstr(MachineInstr &MI) override {
189     // At this point, the instruction was just inserted and has no operands.
190     NewInsts.push_back(&MI);
191   }
192 
193   void changingInstr(MachineInstr &MI) override {}
194   void changedInstr(MachineInstr &MI) override {
195     // FIXME: In principle we should probably add the instruction to NewInsts,
196     // but the way the LegalizerHelper uses the observer, we will always see the
197     // registers we need to set the regbank on also referenced in a new
198     // instruction.
199   }
200 };
201 
202 } // anonymous namespace
203 
204 AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
205     : Subtarget(ST), TRI(Subtarget.getRegisterInfo()),
206       TII(Subtarget.getInstrInfo()) {
207 
208   // HACK: Until this is fully tablegen'd.
209   static llvm::once_flag InitializeRegisterBankFlag;
210 
211   static auto InitializeRegisterBankOnce = [this]() {
212     assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank &&
213            &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank &&
214            &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank);
215     (void)this;
216   };
217 
218   llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce);
219 }
220 
221 static bool isVectorRegisterBank(const RegisterBank &Bank) {
222   unsigned BankID = Bank.getID();
223   return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID;
224 }
225 
226 bool AMDGPURegisterBankInfo::isDivergentRegBank(const RegisterBank *RB) const {
227   return RB != &AMDGPU::SGPRRegBank;
228 }
229 
230 unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
231                                           const RegisterBank &Src,
232                                           TypeSize Size) const {
233   // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
234   if (Dst.getID() == AMDGPU::SGPRRegBankID &&
235       (isVectorRegisterBank(Src) || Src.getID() == AMDGPU::VCCRegBankID)) {
236     return std::numeric_limits<unsigned>::max();
237   }
238 
239   // Bool values are tricky, because the meaning is based on context. The SCC
240   // and VCC banks are for the natural scalar and vector conditions produced by
241   // a compare.
242   //
243   // Legalization doesn't know about the necessary context, so an s1 use may
244   // have been a truncate from an arbitrary value, in which case a copy (lowered
245   // as a compare with 0) needs to be inserted.
246   if (Size == 1 &&
247       (Dst.getID() == AMDGPU::SGPRRegBankID) &&
248       (isVectorRegisterBank(Src) ||
249        Src.getID() == AMDGPU::SGPRRegBankID ||
250        Src.getID() == AMDGPU::VCCRegBankID))
251     return std::numeric_limits<unsigned>::max();
252 
253   // There is no direct copy between AGPRs.
254   if (Dst.getID() == AMDGPU::AGPRRegBankID &&
255       Src.getID() == AMDGPU::AGPRRegBankID)
256     return 4;
257 
258   return RegisterBankInfo::copyCost(Dst, Src, Size);
259 }
260 
261 unsigned AMDGPURegisterBankInfo::getBreakDownCost(
262   const ValueMapping &ValMapping,
263   const RegisterBank *CurBank) const {
264   // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
265   // VGPR.
266   // FIXME: Is there a better way to do this?
267   if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
268     return 10; // This is expensive.
269 
270   assert(ValMapping.NumBreakDowns == 2 &&
271          ValMapping.BreakDown[0].Length == 32 &&
272          ValMapping.BreakDown[0].StartIdx == 0 &&
273          ValMapping.BreakDown[1].Length == 32 &&
274          ValMapping.BreakDown[1].StartIdx == 32 &&
275          ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);
276 
277   // 32-bit extract of a 64-bit value is just access of a subregister, so free.
278   // TODO: Cost of 0 hits assert, though it's not clear it's what we really
279   // want.
280 
281   // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
282   // alignment restrictions, but this probably isn't important.
283   return 1;
284 }
285 
286 const RegisterBank &
287 AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
288                                                LLT Ty) const {
289   if (&RC == &AMDGPU::SReg_1RegClass)
290     return AMDGPU::VCCRegBank;
291 
292   // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a
293   // VCC-like use.
294   if (TRI->isSGPRClass(&RC)) {
295     // FIXME: This probably came from a copy from a physical register, which
296     // should be inferable from the copied to-type. We don't have many boolean
297     // physical register constraints so just assume a normal SGPR for now.
298     if (!Ty.isValid())
299       return AMDGPU::SGPRRegBank;
300 
301     return Ty == LLT::scalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
302   }
303 
304   return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank;
305 }
306 
307 template <unsigned NumOps>
308 RegisterBankInfo::InstructionMappings
309 AMDGPURegisterBankInfo::addMappingFromTable(
310     const MachineInstr &MI, const MachineRegisterInfo &MRI,
311     const std::array<unsigned, NumOps> RegSrcOpIdx,
312     ArrayRef<OpRegBankEntry<NumOps>> Table) const {
313 
314   InstructionMappings AltMappings;
315 
316   SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());
317 
318   unsigned Sizes[NumOps];
319   for (unsigned I = 0; I < NumOps; ++I) {
320     Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
321     Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
322   }
323 
324   for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
325     unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
326     Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
327   }
328 
329   // getInstrMapping's default mapping uses ID 1, so start at 2.
330   unsigned MappingID = 2;
331   for (const auto &Entry : Table) {
332     for (unsigned I = 0; I < NumOps; ++I) {
333       int OpIdx = RegSrcOpIdx[I];
334       Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
335     }
336 
337     AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
338                                                  getOperandsMapping(Operands),
339                                                  Operands.size()));
340   }
341 
342   return AltMappings;
343 }
344 
345 RegisterBankInfo::InstructionMappings
346 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
347     const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
348   switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
349   case Intrinsic::amdgcn_readlane: {
350     static const OpRegBankEntry<3> Table[2] = {
351       // Perfectly legal.
352       { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
353 
354       // Need a readfirstlane for the index.
355       { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
356     };
357 
358     const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
359     return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, Table);
360   }
361   case Intrinsic::amdgcn_writelane: {
362     static const OpRegBankEntry<4> Table[4] = {
363       // Perfectly legal.
364       { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
365 
366       // Need readfirstlane of first op
367       { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
368 
369       // Need readfirstlane of second op
370       { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
371 
372       // Need readfirstlane of both ops
373       { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
374     };
375 
376     // dst, value, lane, original vdst value
377     const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
378     return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, Table);
379   }
380   default:
381     return RegisterBankInfo::getInstrAlternativeMappings(MI);
382   }
383 }
384 
385 RegisterBankInfo::InstructionMappings
386 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
387     const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
388 
389   switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
390   case Intrinsic::amdgcn_s_buffer_load: {
391     static const OpRegBankEntry<2> Table[4] = {
392       // Perfectly legal.
393       { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
394 
395       // Only need 1 register in loop
396       { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },
397 
398       // Have to waterfall the resource.
399       { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },
400 
401       // Have to waterfall the resource, and the offset.
402       { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }
403     };
404 
405     // rsrc, offset
406     const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
407     return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, Table);
408   }
409   case Intrinsic::amdgcn_ds_ordered_add:
410   case Intrinsic::amdgcn_ds_ordered_swap: {
411     // VGPR = M0, VGPR
412     static const OpRegBankEntry<3> Table[2] = {
413       // Perfectly legal.
414       { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID  }, 1 },
415 
416       // Need a readfirstlane for m0
417       { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
418     };
419 
420     const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
421     return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, Table);
422   }
423   case Intrinsic::amdgcn_s_sendmsg:
424   case Intrinsic::amdgcn_s_sendmsghalt: {
425     // FIXME: Should have no register for immediate
426     static const OpRegBankEntry<1> Table[2] = {
427       // Perfectly legal.
428       { { AMDGPU::SGPRRegBankID }, 1 },
429 
430       // Need readlane
431       { { AMDGPU::VGPRRegBankID }, 3 }
432     };
433 
434     const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
435     return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, Table);
436   }
437   default:
438     return RegisterBankInfo::getInstrAlternativeMappings(MI);
439   }
440 }
441 
442 // FIXME: Returns uniform if there's no source value information. This is
443 // probably wrong.
444 bool AMDGPURegisterBankInfo::isScalarLoadLegal(const MachineInstr &MI) const {
445   if (!MI.hasOneMemOperand())
446     return false;
447 
448   const MachineMemOperand *MMO = *MI.memoperands_begin();
449   const unsigned AS = MMO->getAddrSpace();
450   const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||
451                        AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
452   // Require 4-byte alignment.
453   return MMO->getAlign() >= Align(4) &&
454          // Can't do a scalar atomic load.
455          !MMO->isAtomic() &&
456          // Don't use scalar loads for volatile accesses to non-constant address
457          // spaces.
458          (IsConst || !MMO->isVolatile()) &&
459          // Memory must be known constant, or not written before this load.
460          (IsConst || MMO->isInvariant() || (MMO->getFlags() & MONoClobber)) &&
461          AMDGPUInstrInfo::isUniformMMO(MMO);
462 }
463 
464 RegisterBankInfo::InstructionMappings
465 AMDGPURegisterBankInfo::getInstrAlternativeMappings(
466     const MachineInstr &MI) const {
467 
468   const MachineFunction &MF = *MI.getParent()->getParent();
469   const MachineRegisterInfo &MRI = MF.getRegInfo();
470 
471 
472   InstructionMappings AltMappings;
473   switch (MI.getOpcode()) {
474   case TargetOpcode::G_CONSTANT:
475   case TargetOpcode::G_IMPLICIT_DEF: {
476     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
477     if (Size == 1) {
478       static const OpRegBankEntry<1> Table[3] = {
479         { { AMDGPU::VGPRRegBankID }, 1 },
480         { { AMDGPU::SGPRRegBankID }, 1 },
481         { { AMDGPU::VCCRegBankID }, 1 }
482       };
483 
484       return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
485     }
486 
487     [[fallthrough]];
488   }
489   case TargetOpcode::G_FCONSTANT:
490   case TargetOpcode::G_FRAME_INDEX:
491   case TargetOpcode::G_GLOBAL_VALUE: {
492     static const OpRegBankEntry<1> Table[2] = {
493       { { AMDGPU::VGPRRegBankID }, 1 },
494       { { AMDGPU::SGPRRegBankID }, 1 }
495     };
496 
497     return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
498   }
499   case TargetOpcode::G_AND:
500   case TargetOpcode::G_OR:
501   case TargetOpcode::G_XOR: {
502     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
503 
504     if (Size == 1) {
505       // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
506       const InstructionMapping &SCCMapping = getInstructionMapping(
507         1, 1, getOperandsMapping(
508           {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
509            AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
510            AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32)}),
511         3); // Num Operands
512       AltMappings.push_back(&SCCMapping);
513 
514       const InstructionMapping &VCCMapping0 = getInstructionMapping(
515         2, 1, getOperandsMapping(
516           {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
517            AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
518            AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
519         3); // Num Operands
520       AltMappings.push_back(&VCCMapping0);
521       return AltMappings;
522     }
523 
524     if (Size != 64)
525       break;
526 
527     const InstructionMapping &SSMapping = getInstructionMapping(
528       1, 1, getOperandsMapping(
529         {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
530          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
531          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
532       3); // Num Operands
533     AltMappings.push_back(&SSMapping);
534 
535     const InstructionMapping &VVMapping = getInstructionMapping(
536       2, 2, getOperandsMapping(
537         {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
538          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
539          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
540       3); // Num Operands
541     AltMappings.push_back(&VVMapping);
542     break;
543   }
544   case TargetOpcode::G_LOAD:
545   case TargetOpcode::G_ZEXTLOAD:
546   case TargetOpcode::G_SEXTLOAD: {
547     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
548     LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
549     unsigned PtrSize = PtrTy.getSizeInBits();
550     unsigned AS = PtrTy.getAddressSpace();
551 
552     if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
553          AS != AMDGPUAS::PRIVATE_ADDRESS) &&
554         isScalarLoadLegal(MI)) {
555       const InstructionMapping &SSMapping = getInstructionMapping(
556           1, 1, getOperandsMapping(
557                     {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
558                      AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}),
559           2); // Num Operands
560       AltMappings.push_back(&SSMapping);
561     }
562 
563     const InstructionMapping &VVMapping = getInstructionMapping(
564         2, 1,
565         getOperandsMapping(
566             {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
567              AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),
568         2); // Num Operands
569     AltMappings.push_back(&VVMapping);
570 
571     // It may be possible to have a vgpr = load sgpr mapping here, because
572     // the mubuf instructions support this kind of load, but probably for only
573     // gfx7 and older.  However, the addressing mode matching in the instruction
574     // selector should be able to do a better job of detecting and selecting
575     // these kinds of loads from the vgpr = load vgpr mapping.
576 
577     return AltMappings;
578 
579   }
580   case TargetOpcode::G_SELECT: {
581     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
582     const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
583       getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
584                           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
585                           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
586                           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
587       4); // Num Operands
588     AltMappings.push_back(&SSMapping);
589 
590     const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
591       getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
592                           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
593                           AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
594                           AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
595       4); // Num Operands
596     AltMappings.push_back(&VVMapping);
597 
598     return AltMappings;
599   }
600   case TargetOpcode::G_UADDE:
601   case TargetOpcode::G_USUBE:
602   case TargetOpcode::G_SADDE:
603   case TargetOpcode::G_SSUBE: {
604     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
605     const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
606       getOperandsMapping(
607         {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
608          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
609          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
610          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
611          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1)}),
612       5); // Num Operands
613     AltMappings.push_back(&SSMapping);
614 
615     const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
616       getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
617                           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
618                           AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
619                           AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
620                           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
621       5); // Num Operands
622     AltMappings.push_back(&VVMapping);
623     return AltMappings;
624   }
625   case AMDGPU::G_BRCOND: {
626     assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
627 
628     // TODO: Change type to 32 for scalar
629     const InstructionMapping &SMapping = getInstructionMapping(
630       1, 1, getOperandsMapping(
631         {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), nullptr}),
632       2); // Num Operands
633     AltMappings.push_back(&SMapping);
634 
635     const InstructionMapping &VMapping = getInstructionMapping(
636       1, 1, getOperandsMapping(
637         {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr }),
638       2); // Num Operands
639     AltMappings.push_back(&VMapping);
640     return AltMappings;
641   }
642   case AMDGPU::G_INTRINSIC:
643   case AMDGPU::G_INTRINSIC_CONVERGENT:
644     return getInstrAlternativeMappingsIntrinsic(MI, MRI);
645   case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
646   case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
647     return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
648   default:
649     break;
650   }
651   return RegisterBankInfo::getInstrAlternativeMappings(MI);
652 }
653 
654 void AMDGPURegisterBankInfo::split64BitValueForMapping(
655   MachineIRBuilder &B,
656   SmallVector<Register, 2> &Regs,
657   LLT HalfTy,
658   Register Reg) const {
659   assert(HalfTy.getSizeInBits() == 32);
660   MachineRegisterInfo *MRI = B.getMRI();
661   Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
662   Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
663   const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
664   MRI->setRegBank(LoLHS, *Bank);
665   MRI->setRegBank(HiLHS, *Bank);
666 
667   Regs.push_back(LoLHS);
668   Regs.push_back(HiLHS);
669 
670   B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
671     .addDef(LoLHS)
672     .addDef(HiLHS)
673     .addUse(Reg);
674 }
675 
676 /// Replace the current type of each register in \p Regs with \p NewTy
677 static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
678                           LLT NewTy) {
679   for (Register Reg : Regs) {
680     assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
681     MRI.setType(Reg, NewTy);
682   }
683 }
684 
685 static LLT getHalfSizedType(LLT Ty) {
686   if (Ty.isVector()) {
687     assert(Ty.getElementCount().isKnownMultipleOf(2));
688     return LLT::scalarOrVector(Ty.getElementCount().divideCoefficientBy(2),
689                                Ty.getElementType());
690   }
691 
692   assert(Ty.getScalarSizeInBits() % 2 == 0);
693   return LLT::scalar(Ty.getScalarSizeInBits() / 2);
694 }
695 
696 // Build one or more V_READFIRSTLANE_B32 instructions to move the given vector
697 // source value into a scalar register.
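//
// For a 64-bit VGPR source this amounts to roughly (a sketch with illustrative
// register names):
//   %lo:vgpr(s32), %hi:vgpr(s32) = G_UNMERGE_VALUES %src:vgpr(s64)
//   %slo:sgpr(s32) = V_READFIRSTLANE_B32 %lo
//   %shi:sgpr(s32) = V_READFIRSTLANE_B32 %hi
//   %dst:sgpr(s64) = G_MERGE_VALUES %slo, %shi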
698 Register AMDGPURegisterBankInfo::buildReadFirstLane(MachineIRBuilder &B,
699                                                     MachineRegisterInfo &MRI,
700                                                     Register Src) const {
701   LLT Ty = MRI.getType(Src);
702   const RegisterBank *Bank = getRegBank(Src, MRI, *TRI);
703 
704   if (Bank == &AMDGPU::SGPRRegBank)
705     return Src;
706 
707   unsigned Bits = Ty.getSizeInBits();
708   assert(Bits % 32 == 0);
709 
710   if (Bank != &AMDGPU::VGPRRegBank) {
711     // We need to copy from AGPR to VGPR
712     Src = B.buildCopy(Ty, Src).getReg(0);
713     MRI.setRegBank(Src, AMDGPU::VGPRRegBank);
714   }
715 
716   LLT S32 = LLT::scalar(32);
717   unsigned NumParts = Bits / 32;
718   SmallVector<Register, 8> SrcParts;
719   SmallVector<Register, 8> DstParts;
720 
721   if (Bits == 32) {
722     SrcParts.push_back(Src);
723   } else {
724     auto Unmerge = B.buildUnmerge(S32, Src);
725     for (unsigned i = 0; i < NumParts; ++i)
726       SrcParts.push_back(Unmerge.getReg(i));
727   }
728 
729   for (unsigned i = 0; i < NumParts; ++i) {
730     Register SrcPart = SrcParts[i];
731     Register DstPart = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
732     MRI.setType(DstPart, NumParts == 1 ? Ty : S32);
733 
734     const TargetRegisterClass *Constrained =
735         constrainGenericRegister(SrcPart, AMDGPU::VGPR_32RegClass, MRI);
736     (void)Constrained;
737     assert(Constrained && "Failed to constrain readfirstlane src reg");
738 
739     B.buildInstr(AMDGPU::V_READFIRSTLANE_B32, {DstPart}, {SrcPart});
740 
741     DstParts.push_back(DstPart);
742   }
743 
744   if (Bits == 32)
745     return DstParts[0];
746 
747   Register Dst = B.buildMergeLikeInstr(Ty, DstParts).getReg(0);
748   MRI.setRegBank(Dst, AMDGPU::SGPRRegBank);
749   return Dst;
750 }
751 
752 /// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
753 /// any of the required SGPR operands are VGPRs, perform a waterfall loop to
754 /// execute the instruction for each unique combination of values in all lanes
755 /// in the wave. The block will be split such that the rest of the instructions
756 /// are moved to a new block.
757 ///
758 /// Essentially performs this loop:
759 ///
760 /// Save Execution Mask
761 /// For (Lane : Wavefront) {
762 ///   Enable Lane, Disable all other lanes
763 ///   SGPR = read SGPR value for current lane from VGPR
764 ///   VGPRResult[Lane] = use_op SGPR
765 /// }
766 /// Restore Execution Mask
767 ///
768 /// There is additional complexity in comparing the values to identify the
769 /// unique values actually used.
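///
/// A sketch of the block structure this creates (names match the local
/// variables below):
///
///   MBB:           original code before the range; save EXEC
///   LoopBB:        PHI of exec masks, readfirstlanes, compares, ballot,
///                  s_and_saveexec
///   BodyBB:        the rewritten instruction range; mask off the handled
///                  lanes and loop back to LoopBB if any remain
///   RestoreExecBB: restore the saved EXEC mask
///   RemainderBB:   everything that followed the original range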
770 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
771     MachineIRBuilder &B, iterator_range<MachineBasicBlock::iterator> Range,
772     SmallSet<Register, 4> &SGPROperandRegs) const {
773   // Track use registers which have already been expanded with a readfirstlane
774   // sequence. This may have multiple uses if moving a sequence.
775   DenseMap<Register, Register> WaterfalledRegMap;
776 
777   MachineBasicBlock &MBB = B.getMBB();
778   MachineFunction *MF = &B.getMF();
779 
780   const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
781   const unsigned MovExecOpc =
782       Subtarget.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
783   const unsigned MovExecTermOpc =
784       Subtarget.isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
785 
786   const unsigned XorTermOpc = Subtarget.isWave32() ?
787     AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
788   const unsigned AndSaveExecOpc =  Subtarget.isWave32() ?
789     AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
790   const unsigned ExecReg =  Subtarget.isWave32() ?
791     AMDGPU::EXEC_LO : AMDGPU::EXEC;
792 
793 #ifndef NDEBUG
794   const int OrigRangeSize = std::distance(Range.begin(), Range.end());
795 #endif
796 
797   MachineRegisterInfo &MRI = *B.getMRI();
798   Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
799   Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);
800 
801   // Don't bother using generic instructions/registers for the exec mask.
802   B.buildInstr(TargetOpcode::IMPLICIT_DEF)
803     .addDef(InitSaveExecReg);
804 
805   Register PhiExec = MRI.createVirtualRegister(WaveRC);
806   Register NewExec = MRI.createVirtualRegister(WaveRC);
807 
808   // To insert the loop we need to split the block. Move everything before this
809   // point to a new block, and insert a new empty block before this instruction.
810   MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
811   MachineBasicBlock *BodyBB = MF->CreateMachineBasicBlock();
812   MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
813   MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
814   MachineFunction::iterator MBBI(MBB);
815   ++MBBI;
816   MF->insert(MBBI, LoopBB);
817   MF->insert(MBBI, BodyBB);
818   MF->insert(MBBI, RestoreExecBB);
819   MF->insert(MBBI, RemainderBB);
820 
821   LoopBB->addSuccessor(BodyBB);
822   BodyBB->addSuccessor(RestoreExecBB);
823   BodyBB->addSuccessor(LoopBB);
824 
825   // Move the rest of the block into a new block.
826   RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
827   RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());
828 
829   MBB.addSuccessor(LoopBB);
830   RestoreExecBB->addSuccessor(RemainderBB);
831 
832   B.setInsertPt(*LoopBB, LoopBB->end());
833 
834   B.buildInstr(TargetOpcode::PHI)
835       .addDef(PhiExec)
836       .addReg(InitSaveExecReg)
837       .addMBB(&MBB)
838       .addReg(NewExec)
839       .addMBB(BodyBB);
840 
841   const DebugLoc &DL = B.getDL();
842 
843   MachineInstr &FirstInst = *Range.begin();
844 
845   // Move the instruction into the loop body. Note we moved everything after
846   // Range.end() already into a new block, so Range.end() is no longer valid.
847   BodyBB->splice(BodyBB->end(), &MBB, Range.begin(), MBB.end());
848 
849   // Figure out the iterator range after splicing the instructions.
850   MachineBasicBlock::iterator NewBegin = FirstInst.getIterator();
851   auto NewEnd = BodyBB->end();
852 
853   B.setMBB(*LoopBB);
854 
855   LLT S1 = LLT::scalar(1);
856   Register CondReg;
857 
858   assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
859 
860   for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
861     for (MachineOperand &Op : MI.all_uses()) {
862       Register OldReg = Op.getReg();
863       if (!SGPROperandRegs.count(OldReg))
864         continue;
865 
866       // See if we already processed this register in another instruction in the
867       // sequence.
868       auto OldVal = WaterfalledRegMap.find(OldReg);
869       if (OldVal != WaterfalledRegMap.end()) {
870         Op.setReg(OldVal->second);
871         continue;
872       }
873 
874       Register OpReg = Op.getReg();
875       LLT OpTy = MRI.getType(OpReg);
876 
877       const RegisterBank *OpBank = getRegBank(OpReg, MRI, *TRI);
878       if (OpBank != &AMDGPU::VGPRRegBank) {
879         // Insert copy from AGPR to VGPR before the loop.
880         B.setMBB(MBB);
881         OpReg = B.buildCopy(OpTy, OpReg).getReg(0);
882         MRI.setRegBank(OpReg, AMDGPU::VGPRRegBank);
883         B.setMBB(*LoopBB);
884       }
885 
886       Register CurrentLaneReg = buildReadFirstLane(B, MRI, OpReg);
887 
888       // Build the comparison(s).
889       unsigned OpSize = OpTy.getSizeInBits();
890       bool Is64 = OpSize % 64 == 0;
891       unsigned PartSize = Is64 ? 64 : 32;
892       LLT PartTy = LLT::scalar(PartSize);
893       unsigned NumParts = OpSize / PartSize;
894       SmallVector<Register, 8> OpParts;
895       SmallVector<Register, 8> CurrentLaneParts;
896 
897       if (NumParts == 1) {
898         OpParts.push_back(OpReg);
899         CurrentLaneParts.push_back(CurrentLaneReg);
900       } else {
901         auto UnmergeOp = B.buildUnmerge(PartTy, OpReg);
902         auto UnmergeCurrentLane = B.buildUnmerge(PartTy, CurrentLaneReg);
903         for (unsigned i = 0; i < NumParts; ++i) {
904           OpParts.push_back(UnmergeOp.getReg(i));
905           CurrentLaneParts.push_back(UnmergeCurrentLane.getReg(i));
906           MRI.setRegBank(OpParts[i], AMDGPU::VGPRRegBank);
907           MRI.setRegBank(CurrentLaneParts[i], AMDGPU::SGPRRegBank);
908         }
909       }
910 
911       for (unsigned i = 0; i < NumParts; ++i) {
912         auto CmpReg = B.buildICmp(CmpInst::ICMP_EQ, S1, CurrentLaneParts[i],
913                                   OpParts[i]).getReg(0);
914         MRI.setRegBank(CmpReg, AMDGPU::VCCRegBank);
915 
916         if (!CondReg) {
917           CondReg = CmpReg;
918         } else {
919           CondReg = B.buildAnd(S1, CondReg, CmpReg).getReg(0);
920           MRI.setRegBank(CondReg, AMDGPU::VCCRegBank);
921         }
922       }
923 
924       Op.setReg(CurrentLaneReg);
925 
926       // Make sure we don't re-process this register again.
927       WaterfalledRegMap.insert(std::pair(OldReg, Op.getReg()));
928     }
929   }
930 
931   // The ballot becomes a no-op during instruction selection.
932   CondReg = B.buildIntrinsic(Intrinsic::amdgcn_ballot,
933                              {LLT::scalar(Subtarget.isWave32() ? 32 : 64)})
934                 .addReg(CondReg)
935                 .getReg(0);
936   MRI.setRegClass(CondReg, WaveRC);
937 
938   // Update EXEC, save the original EXEC value to VCC.
939   B.buildInstr(AndSaveExecOpc)
940     .addDef(NewExec)
941     .addReg(CondReg, RegState::Kill);
942 
943   MRI.setSimpleHint(NewExec, CondReg);
944 
945   B.setInsertPt(*BodyBB, BodyBB->end());
946 
947   // Update EXEC, switch all done bits to 0 and all todo bits to 1.
948   B.buildInstr(XorTermOpc)
949     .addDef(ExecReg)
950     .addReg(ExecReg)
951     .addReg(NewExec);
952 
953   // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
954   // s_cbranch_scc0?
955 
956   // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
957   B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);
958 
959   // Save the EXEC mask before the loop.
960   BuildMI(MBB, MBB.end(), DL, TII->get(MovExecOpc), SaveExecReg)
961     .addReg(ExecReg);
962 
963   // Restore the EXEC mask after the loop.
964   B.setMBB(*RestoreExecBB);
965   B.buildInstr(MovExecTermOpc)
966     .addDef(ExecReg)
967     .addReg(SaveExecReg);
968 
969   // Set the insert point after the original instruction, so any new
970   // instructions will be in the remainder.
971   B.setInsertPt(*RemainderBB, RemainderBB->begin());
972 
973   return true;
974 }
975 
976 // Return any unique registers used by \p MI at \p OpIndices that need to be
977 // handled in a waterfall loop. Returns these registers in \p
978 // SGPROperandRegs. Returns true if there are any operands to handle and a
979 // waterfall loop is necessary.
980 bool AMDGPURegisterBankInfo::collectWaterfallOperands(
981   SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI,
982   MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const {
983   for (unsigned Op : OpIndices) {
984     assert(MI.getOperand(Op).isUse());
985     Register Reg = MI.getOperand(Op).getReg();
986     const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
987     if (OpBank->getID() != AMDGPU::SGPRRegBankID)
988       SGPROperandRegs.insert(Reg);
989   }
990 
991   // If no operands need to be replaced, there is no need to loop.
992   return !SGPROperandRegs.empty();
993 }
994 
995 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
996     MachineIRBuilder &B, MachineInstr &MI, ArrayRef<unsigned> OpIndices) const {
997   // Use a set to avoid extra readfirstlanes in the case where multiple operands
998   // are the same register.
999   SmallSet<Register, 4> SGPROperandRegs;
1000 
1001   if (!collectWaterfallOperands(SGPROperandRegs, MI, *B.getMRI(), OpIndices))
1002     return false;
1003 
1004   MachineBasicBlock::iterator I = MI.getIterator();
1005   return executeInWaterfallLoop(B, make_range(I, std::next(I)),
1006                                 SGPROperandRegs);
1007 }
1008 
1009 // Legalize an operand that must be an SGPR by inserting a readfirstlane.
1010 void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
1011     MachineIRBuilder &B, MachineInstr &MI, unsigned OpIdx) const {
1012   Register Reg = MI.getOperand(OpIdx).getReg();
1013   MachineRegisterInfo &MRI = *B.getMRI();
1014   const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
1015   if (Bank == &AMDGPU::SGPRRegBank)
1016     return;
1017 
1018   Reg = buildReadFirstLane(B, MRI, Reg);
1019   MI.getOperand(OpIdx).setReg(Reg);
1020 }
1021 
1022 /// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the
1023 /// rest will be in the remainder.
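///
/// For example, splitUnequalType(s96, 64) yields {s64, s32} and
/// splitUnequalType(<3 x s32>, 64) yields {<2 x s32>, s32}.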
1024 static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) {
1025   unsigned TotalSize = Ty.getSizeInBits();
1026   if (!Ty.isVector())
1027     return {LLT::scalar(FirstSize), LLT::scalar(TotalSize - FirstSize)};
1028 
1029   LLT EltTy = Ty.getElementType();
1030   unsigned EltSize = EltTy.getSizeInBits();
1031   assert(FirstSize % EltSize == 0);
1032 
1033   unsigned FirstPartNumElts = FirstSize / EltSize;
1034   unsigned RemainderElts = (TotalSize - FirstSize) / EltSize;
1035 
1036   return {LLT::scalarOrVector(ElementCount::getFixed(FirstPartNumElts), EltTy),
1037           LLT::scalarOrVector(ElementCount::getFixed(RemainderElts), EltTy)};
1038 }
1039 
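// For example (per the code below), widen96To128(s96) is s128 and
// widen96To128(<3 x s32>) is <4 x s32>.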
1040 static LLT widen96To128(LLT Ty) {
1041   if (!Ty.isVector())
1042     return LLT::scalar(128);
1043 
1044   LLT EltTy = Ty.getElementType();
1045   assert(128 % EltTy.getSizeInBits() == 0);
1046   return LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);
1047 }
1048 
1049 bool AMDGPURegisterBankInfo::applyMappingLoad(
1050     MachineIRBuilder &B,
1051     const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1052     MachineInstr &MI) const {
1053   MachineRegisterInfo &MRI = *B.getMRI();
1054   Register DstReg = MI.getOperand(0).getReg();
1055   const LLT LoadTy = MRI.getType(DstReg);
1056   unsigned LoadSize = LoadTy.getSizeInBits();
1057   const unsigned MaxNonSmrdLoadSize = 128;
1058 
1059   const RegisterBank *DstBank =
1060       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1061   if (DstBank == &AMDGPU::SGPRRegBank) {
1062     // There are some special cases that we need to look at for 32-bit and
1063     // 96-bit SGPR loads; otherwise we have nothing to do.
1064     if (LoadSize != 32 && (LoadSize != 96 || Subtarget.hasScalarDwordx3Loads()))
1065       return false;
1066 
1067     MachineMemOperand *MMO = *MI.memoperands_begin();
1068     const unsigned MemSize = 8 * MMO->getSize();
1069     // Scalar loads of 8 or 16 bits with proper alignment may be widened to 32
1070     // bits. Check to see if we need to widen the memory access: 8- or 16-bit
1071     // scalar loads should have a load size of 32 but a memory access size of
1072     // less than 32.
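    //
    // For example (an informal sketch), an 8-bit G_ZEXTLOAD producing an s32
    // SGPR result becomes a 32-bit load whose extra high bits are then cleared:
    //   %wide:sgpr(s32) = G_LOAD %ptr
    //   %dst:sgpr(s32) = G_AND %wide, (mask of the low 8 bits)
    // and a G_SEXTLOAD uses G_SEXT_INREG on the wide result instead.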
1073     if (LoadSize == 32 &&
1074         (MemSize == 32 || LoadTy.isVector() || !isScalarLoadLegal(MI)))
1075       return false;
1076 
1077     Register PtrReg = MI.getOperand(1).getReg();
1078 
1079     ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank);
1080 
1081     if (LoadSize == 32) {
1082       // This is an extending load from a sub-dword size. Widen the memory
1083       // access size to 4 bytes and clear the extra high bits appropriately
1084       const LLT S32 = LLT::scalar(32);
1085       if (MI.getOpcode() == AMDGPU::G_SEXTLOAD) {
1086         // Must extend the sign bit into higher bits for a G_SEXTLOAD
1087         auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
1088         B.buildSExtInReg(MI.getOperand(0), WideLoad, MemSize);
1089       } else if (MI.getOpcode() == AMDGPU::G_ZEXTLOAD) {
1090         // Must extend zero into higher bits with an AND for a G_ZEXTLOAD
1091         auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
1092         B.buildZExtInReg(MI.getOperand(0), WideLoad, MemSize);
1093       } else
1094         // We do not need to touch the higher bits for regular loads.
1095         B.buildLoadFromOffset(MI.getOperand(0), PtrReg, *MMO, 0);
1096     } else {
1097       // 96-bit loads are only available for vector loads. We need to split this
1098       // into a 64-bit and a 32-bit part (unless we can widen to a 128-bit load).
1099       if (MMO->getAlign() < Align(16)) {
1100         LegalizerHelper Helper(B.getMF(), ApplyBank, B);
1101         LLT Part64, Part32;
1102         std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64);
1103         if (Helper.reduceLoadStoreWidth(cast<GAnyLoad>(MI), 0, Part64) !=
1104             LegalizerHelper::Legalized)
1105           return false;
1106         return true;
1107       } else {
1108         LLT WiderTy = widen96To128(LoadTy);
1109         auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0);
1110         if (WiderTy.isScalar())
1111           B.buildTrunc(MI.getOperand(0), WideLoad);
1112         else {
1113           B.buildDeleteTrailingVectorElements(MI.getOperand(0).getReg(),
1114                                               WideLoad);
1115         }
1116       }
1117     }
1118 
1119     MI.eraseFromParent();
1120     return true;
1121   }
1122 
1123   // 128-bit loads are supported for all instruction types.
1124   if (LoadSize <= MaxNonSmrdLoadSize)
1125     return false;
1126 
1127   SmallVector<Register, 16> DefRegs(OpdMapper.getVRegs(0));
1128   SmallVector<Register, 1> SrcRegs(OpdMapper.getVRegs(1));
1129 
1130   if (SrcRegs.empty())
1131     SrcRegs.push_back(MI.getOperand(1).getReg());
1132 
1133   assert(LoadSize % MaxNonSmrdLoadSize == 0);
1134 
1135   // RegBankSelect only emits scalar types, so we need to reset the pointer
1136   // operand to a pointer type.
1137   Register BasePtrReg = SrcRegs[0];
1138   LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
1139   MRI.setType(BasePtrReg, PtrTy);
1140 
1141   unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize;
1142   const LLT LoadSplitTy = LoadTy.divide(NumSplitParts);
1143   ApplyRegBankMapping O(B, *this, MRI, &AMDGPU::VGPRRegBank);
1144   LegalizerHelper Helper(B.getMF(), O, B);
1145 
1146   if (LoadTy.isVector()) {
1147     if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
1148       return false;
1149   } else {
1150     if (Helper.narrowScalar(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
1151       return false;
1152   }
1153 
1154   MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
1155   return true;
1156 }
1157 
1158 bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc(
1159     MachineIRBuilder &B,
1160     const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1161     MachineInstr &MI) const {
1162   MachineRegisterInfo &MRI = *B.getMRI();
1163   const MachineFunction &MF = B.getMF();
1164   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1165   const auto &TFI = *ST.getFrameLowering();
1166 
1167   // Guard in case the stack growth direction ever changes with scratch
1168   // instructions.
1169   if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown)
1170     return false;
1171 
1172   Register Dst = MI.getOperand(0).getReg();
1173   Register AllocSize = MI.getOperand(1).getReg();
1174   Align Alignment = assumeAligned(MI.getOperand(2).getImm());
1175 
1176   const RegisterBank *SizeBank = getRegBank(AllocSize, MRI, *TRI);
1177 
1178   // TODO: Need to emit a wave reduction to get the maximum size.
1179   if (SizeBank != &AMDGPU::SGPRRegBank)
1180     return false;
1181 
1182   LLT PtrTy = MRI.getType(Dst);
1183   LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
1184 
1185   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1186   Register SPReg = Info->getStackPtrOffsetReg();
1187   ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::SGPRRegBank);
1188 
1189   auto WaveSize = B.buildConstant(LLT::scalar(32), ST.getWavefrontSizeLog2());
1190   auto ScaledSize = B.buildShl(IntPtrTy, AllocSize, WaveSize);
1191 
1192   auto SPCopy = B.buildCopy(PtrTy, SPReg);
1193   if (Alignment > TFI.getStackAlign()) {
1194     auto PtrAdd = B.buildPtrAdd(PtrTy, SPCopy, ScaledSize);
1195     B.buildMaskLowPtrBits(Dst, PtrAdd,
1196                           Log2(Alignment) + ST.getWavefrontSizeLog2());
1197   } else {
1198     B.buildPtrAdd(Dst, SPCopy, ScaledSize);
1199   }
1200 
1201   MI.eraseFromParent();
1202   return true;
1203 }
1204 
1205 bool AMDGPURegisterBankInfo::applyMappingImage(
1206     MachineIRBuilder &B, MachineInstr &MI,
1207     const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1208     int RsrcIdx) const {
1209   const int NumDefs = MI.getNumExplicitDefs();
1210 
1211   // The reported argument index is relative to the IR intrinsic call arguments,
1212   // so we need to shift by the number of defs and the intrinsic ID.
1213   RsrcIdx += NumDefs + 1;
1214 
1215   // Insert copies to VGPR arguments.
1216   applyDefaultMapping(OpdMapper);
1217 
1218   // Fixup any SGPR arguments.
1219   SmallVector<unsigned, 4> SGPRIndexes;
1220   for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) {
1221     if (!MI.getOperand(I).isReg())
1222       continue;
1223 
1224     // If this intrinsic has a sampler, it immediately follows rsrc.
1225     if (I == RsrcIdx || I == RsrcIdx + 1)
1226       SGPRIndexes.push_back(I);
1227   }
1228 
1229   executeInWaterfallLoop(B, MI, SGPRIndexes);
1230   return true;
1231 }
1232 
1233 // Analyze a combined offset from an llvm.amdgcn.s.buffer intrinsic and store
1234 // the three offsets (voffset, soffset, and instoffset).
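// For example (a sketch, assuming the constant part satisfies the requested
// alignment and fits the MUBUF immediate field): a CombinedOffset computed as
// an SGPR base plus 64 decomposes into VOffsetReg = 0, SOffsetReg = the SGPR
// base, and InstOffsetVal = 64.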
1235 unsigned AMDGPURegisterBankInfo::setBufferOffsets(
1236     MachineIRBuilder &B, Register CombinedOffset, Register &VOffsetReg,
1237     Register &SOffsetReg, int64_t &InstOffsetVal, Align Alignment) const {
1238   const LLT S32 = LLT::scalar(32);
1239   MachineRegisterInfo *MRI = B.getMRI();
1240 
1241   if (std::optional<int64_t> Imm =
1242           getIConstantVRegSExtVal(CombinedOffset, *MRI)) {
1243     uint32_t SOffset, ImmOffset;
1244     if (TII->splitMUBUFOffset(*Imm, SOffset, ImmOffset, Alignment)) {
1245       VOffsetReg = B.buildConstant(S32, 0).getReg(0);
1246       SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
1247       InstOffsetVal = ImmOffset;
1248 
1249       B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1250       B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1251       return SOffset + ImmOffset;
1252     }
1253   }
1254 
1255   Register Base;
1256   unsigned Offset;
1257 
1258   std::tie(Base, Offset) =
1259       AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset);
1260 
1261   uint32_t SOffset, ImmOffset;
1262   if ((int)Offset > 0 &&
1263       TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
1264     if (getRegBank(Base, *MRI, *TRI) == &AMDGPU::VGPRRegBank) {
1265       VOffsetReg = Base;
1266       SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
1267       B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1268       InstOffsetVal = ImmOffset;
1269       return 0; // XXX - Why is this 0?
1270     }
1271 
1272     // If we have SGPR base, we can use it for soffset.
1273     if (SOffset == 0) {
1274       VOffsetReg = B.buildConstant(S32, 0).getReg(0);
1275       B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1276       SOffsetReg = Base;
1277       InstOffsetVal = ImmOffset;
1278       return 0; // XXX - Why is this 0?
1279     }
1280   }
1281 
1282   // Handle the variable sgpr + vgpr case.
1283   MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI);
1284   if (Add && (int)Offset >= 0) {
1285     Register Src0 = getSrcRegIgnoringCopies(Add->getOperand(1).getReg(), *MRI);
1286     Register Src1 = getSrcRegIgnoringCopies(Add->getOperand(2).getReg(), *MRI);
1287 
1288     const RegisterBank *Src0Bank = getRegBank(Src0, *MRI, *TRI);
1289     const RegisterBank *Src1Bank = getRegBank(Src1, *MRI, *TRI);
1290 
1291     if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) {
1292       VOffsetReg = Src0;
1293       SOffsetReg = Src1;
1294       return 0;
1295     }
1296 
1297     if (Src0Bank == &AMDGPU::SGPRRegBank && Src1Bank == &AMDGPU::VGPRRegBank) {
1298       VOffsetReg = Src1;
1299       SOffsetReg = Src0;
1300       return 0;
1301     }
1302   }
1303 
1304   // Ensure we have a VGPR for the combined offset. This could be an issue if we
1305   // have an SGPR offset and a VGPR resource.
1306   if (getRegBank(CombinedOffset, *MRI, *TRI) == &AMDGPU::VGPRRegBank) {
1307     VOffsetReg = CombinedOffset;
1308   } else {
1309     VOffsetReg = B.buildCopy(S32, CombinedOffset).getReg(0);
1310     B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1311   }
1312 
1313   SOffsetReg = B.buildConstant(S32, 0).getReg(0);
1314   B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1315   return 0;
1316 }
1317 
1318 bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
1319     MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
1320   MachineInstr &MI = OpdMapper.getMI();
1321   MachineRegisterInfo &MRI = OpdMapper.getMRI();
1322 
1323   const LLT S32 = LLT::scalar(32);
1324   Register Dst = MI.getOperand(0).getReg();
1325   LLT Ty = MRI.getType(Dst);
1326 
1327   const RegisterBank *RSrcBank =
1328     OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1329   const RegisterBank *OffsetBank =
1330     OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1331   if (RSrcBank == &AMDGPU::SGPRRegBank &&
1332       OffsetBank == &AMDGPU::SGPRRegBank)
1333     return true; // Legal mapping
1334 
1335   // FIXME: 96-bit case was widened during legalize. We need to narrow it back
1336   // here but don't have an MMO.
1337 
1338   unsigned LoadSize = Ty.getSizeInBits();
1339   int NumLoads = 1;
1340   if (LoadSize == 256 || LoadSize == 512) {
1341     NumLoads = LoadSize / 128;
1342     Ty = Ty.divide(NumLoads);
1343   }
1344 
1345   // Use the alignment to ensure that the required offsets will fit into the
1346   // immediate offsets.
1347   const Align Alignment = NumLoads > 1 ? Align(16 * NumLoads) : Align(1);
1348 
1349   MachineFunction &MF = B.getMF();
1350 
1351   Register SOffset;
1352   Register VOffset;
1353   int64_t ImmOffset = 0;
1354 
1355   unsigned MMOOffset = setBufferOffsets(B, MI.getOperand(2).getReg(), VOffset,
1356                                         SOffset, ImmOffset, Alignment);
1357 
1358   // TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we
1359   // can, but we need to track an MMO for that.
1360   const unsigned MemSize = (Ty.getSizeInBits() + 7) / 8;
1361   const Align MemAlign(4); // FIXME: ABI type alignment?
1362   MachineMemOperand *BaseMMO = MF.getMachineMemOperand(
1363     MachinePointerInfo(),
1364     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1365     MachineMemOperand::MOInvariant,
1366     MemSize, MemAlign);
1367   if (MMOOffset != 0)
1368     BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset, MemSize);
1369 
1370   // If only the offset is divergent, emit a MUBUF buffer load instead. We can
1371   // assume that the buffer is unswizzled.
1372 
1373   Register RSrc = MI.getOperand(1).getReg();
1374   Register VIndex = B.buildConstant(S32, 0).getReg(0);
1375   B.getMRI()->setRegBank(VIndex, AMDGPU::VGPRRegBank);
1376 
1377   SmallVector<Register, 4> LoadParts(NumLoads);
1378 
1379   MachineBasicBlock::iterator MII = MI.getIterator();
1380   MachineInstrSpan Span(MII, &B.getMBB());
1381 
1382   for (int i = 0; i < NumLoads; ++i) {
1383     if (NumLoads == 1) {
1384       LoadParts[i] = Dst;
1385     } else {
1386       LoadParts[i] = MRI.createGenericVirtualRegister(Ty);
1387       MRI.setRegBank(LoadParts[i], AMDGPU::VGPRRegBank);
1388     }
1389 
1390     MachineMemOperand *MMO = BaseMMO;
1391     if (i != 0)
1392       MMO = MF.getMachineMemOperand(BaseMMO, MMOOffset + 16 * i, MemSize);
1393 
1394     B.buildInstr(AMDGPU::G_AMDGPU_BUFFER_LOAD)
1395       .addDef(LoadParts[i])       // vdata
1396       .addUse(RSrc)               // rsrc
1397       .addUse(VIndex)             // vindex
1398       .addUse(VOffset)            // voffset
1399       .addUse(SOffset)            // soffset
1400       .addImm(ImmOffset + 16 * i) // offset(imm)
1401       .addImm(0)                  // cachepolicy, swizzled buffer(imm)
1402       .addImm(0)                  // idxen(imm)
1403       .addMemOperand(MMO);
1404   }
1405 
1406   // TODO: If only the resource is a VGPR, it may be better to execute the
1407   // scalar load in the waterfall loop if the resource is expected to frequently
1408   // be dynamically uniform.
1409   if (RSrcBank != &AMDGPU::SGPRRegBank) {
1410     // Remove the original instruction to avoid potentially confusing the
1411     // waterfall loop logic.
1412     B.setInstr(*Span.begin());
1413     MI.eraseFromParent();
1414 
1415     SmallSet<Register, 4> OpsToWaterfall;
1416 
1417     OpsToWaterfall.insert(RSrc);
1418     executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
1419                            OpsToWaterfall);
1420   }
1421 
1422   if (NumLoads != 1) {
1423     if (Ty.isVector())
1424       B.buildConcatVectors(Dst, LoadParts);
1425     else
1426       B.buildMergeLikeInstr(Dst, LoadParts);
1427   }
1428 
1429   // If we emitted a waterfall loop, the original instruction was already removed.
1430   if (RSrcBank == &AMDGPU::SGPRRegBank)
1431     MI.eraseFromParent();
1432 
1433   return true;
1434 }
1435 
1436 bool AMDGPURegisterBankInfo::applyMappingBFE(MachineIRBuilder &B,
1437                                              const OperandsMapper &OpdMapper,
1438                                              bool Signed) const {
1439   MachineInstr &MI = OpdMapper.getMI();
1440   MachineRegisterInfo &MRI = OpdMapper.getMRI();
1441 
1442   // Insert basic copies
1443   applyDefaultMapping(OpdMapper);
1444 
1445   Register DstReg = MI.getOperand(0).getReg();
1446   LLT Ty = MRI.getType(DstReg);
1447 
1448   const LLT S32 = LLT::scalar(32);
1449 
1450   unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
1451   Register SrcReg = MI.getOperand(FirstOpnd).getReg();
1452   Register OffsetReg = MI.getOperand(FirstOpnd + 1).getReg();
1453   Register WidthReg = MI.getOperand(FirstOpnd + 2).getReg();
1454 
1455   const RegisterBank *DstBank =
1456     OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1457   if (DstBank == &AMDGPU::VGPRRegBank) {
1458     if (Ty == S32)
1459       return true;
1460 
1461     // There is no 64-bit VGPR bitfield extract instruction, so the operation
1462     // is expanded to a sequence of instructions that implement it.
1463     ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::VGPRRegBank);
1464 
1465     const LLT S64 = LLT::scalar(64);
1466     // Shift the source operand so that extracted bits start at bit 0.
1467     auto ShiftOffset = Signed ? B.buildAShr(S64, SrcReg, OffsetReg)
1468                               : B.buildLShr(S64, SrcReg, OffsetReg);
1469     auto UnmergeSOffset = B.buildUnmerge({S32, S32}, ShiftOffset);
1470 
1471     // A 64-bit bitfield extract uses the 32-bit bitfield extract instructions
1472     // if the width is a constant.
1473     if (auto ConstWidth = getIConstantVRegValWithLookThrough(WidthReg, MRI)) {
1474       // The width is a constant, so use the 32-bit bitfield extract
1475       // instruction. Depending on the width, use either the low or high 32 bits.
1476       auto Zero = B.buildConstant(S32, 0);
1477       auto WidthImm = ConstWidth->Value.getZExtValue();
1478       if (WidthImm <= 32) {
1479         // Use bitfield extract on the lower 32-bit source, and then sign-extend
1480         // or clear the upper 32-bits.
1481         auto Extract =
1482             Signed ? B.buildSbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg)
1483                    : B.buildUbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg);
1484         auto Extend =
1485             Signed ? B.buildAShr(S32, Extract, B.buildConstant(S32, 31)) : Zero;
1486         B.buildMergeLikeInstr(DstReg, {Extract, Extend});
1487       } else {
1488         // Use bitfield extract on the upper 32-bit source, and combine it with
1489         // the lower 32-bit source.
1490         auto UpperWidth = B.buildConstant(S32, WidthImm - 32);
1491         auto Extract =
1492             Signed
1493                 ? B.buildSbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth)
1494                 : B.buildUbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth);
1495         B.buildMergeLikeInstr(DstReg, {UnmergeSOffset.getReg(0), Extract});
1496       }
1497       MI.eraseFromParent();
1498       return true;
1499     }
1500 
1501     // Expand to Src >> Offset << (64 - Width) >> (64 - Width) using 64-bit
1502     // operations.
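         // For example, an unsigned extract with Src = 0xff00, Offset = 8 and
         // Width = 4 computes 0xff00 >> 8 = 0xff, then 0xff << 60 >> 60
         // (logical) = 0xf, the contents of bits [11:8].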
1503     auto ExtShift = B.buildSub(S32, B.buildConstant(S32, 64), WidthReg);
1504     auto SignBit = B.buildShl(S64, ShiftOffset, ExtShift);
1505     if (Signed)
1506       B.buildAShr(S64, SignBit, ExtShift);
1507     else
1508       B.buildLShr(S64, SignBit, ExtShift);
1509     MI.eraseFromParent();
1510     return true;
1511   }
1512 
1513   // The scalar form packs the offset and width in a single operand.
1514 
1515   ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::SGPRRegBank);
1516 
1517   // Ensure the high bits are clear to insert the offset.
1518   auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));
1519   auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);
1520 
1521   // Zeros out the low bits, so don't bother clamping the input value.
1522   auto ShiftWidth = B.buildShl(S32, WidthReg, B.buildConstant(S32, 16));
1523 
1524   // Pack the offset and width of the BFE into the format expected by
1525   // S_BFE_I32 / S_BFE_U32: in the second source operand, bits [5:0] contain
1526   // the offset and bits [22:16] the width.
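       // For example, an offset of 8 and a width of 5 pack to
       // (5 << 16) | 8 = 0x50008.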
1527   auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth);
1528 
1529   // TODO: It might be worth using a pseudo here to avoid scc clobber and
1530   // register class constraints.
1531   unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) :
1532                              (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);
1533 
1534   auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs});
1535   if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
1536     llvm_unreachable("failed to constrain BFE");
1537 
1538   MI.eraseFromParent();
1539   return true;
1540 }
1541 
1542 bool AMDGPURegisterBankInfo::applyMappingMAD_64_32(
1543     MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
1544   MachineInstr &MI = OpdMapper.getMI();
1545   MachineRegisterInfo &MRI = OpdMapper.getMRI();
1546 
1547   // Insert basic copies.
1548   applyDefaultMapping(OpdMapper);
1549 
1550   Register Dst0 = MI.getOperand(0).getReg();
1551   Register Dst1 = MI.getOperand(1).getReg();
1552   Register Src0 = MI.getOperand(2).getReg();
1553   Register Src1 = MI.getOperand(3).getReg();
1554   Register Src2 = MI.getOperand(4).getReg();
1555 
1556   if (MRI.getRegBankOrNull(Src0) == &AMDGPU::VGPRRegBank)
1557     return true;
1558 
1559   bool IsUnsigned = MI.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
1560   LLT S1 = LLT::scalar(1);
1561   LLT S32 = LLT::scalar(32);
1562 
1563   bool DstOnValu = MRI.getRegBankOrNull(Src2) == &AMDGPU::VGPRRegBank;
1564   bool Accumulate = true;
1565 
1566   if (!DstOnValu) {
1567     if (mi_match(Src2, MRI, m_ZeroInt()))
1568       Accumulate = false;
1569   }
1570 
1571   // Keep the multiplication on the SALU.
1572   Register DstHi;
1573   Register DstLo = B.buildMul(S32, Src0, Src1).getReg(0);
1574   bool MulHiInVgpr = false;
1575 
1576   MRI.setRegBank(DstLo, AMDGPU::SGPRRegBank);
1577 
1578   if (Subtarget.hasSMulHi()) {
1579     DstHi = IsUnsigned ? B.buildUMulH(S32, Src0, Src1).getReg(0)
1580                        : B.buildSMulH(S32, Src0, Src1).getReg(0);
1581     MRI.setRegBank(DstHi, AMDGPU::SGPRRegBank);
1582   } else {
1583     Register VSrc0 = B.buildCopy(S32, Src0).getReg(0);
1584     Register VSrc1 = B.buildCopy(S32, Src1).getReg(0);
1585 
1586     MRI.setRegBank(VSrc0, AMDGPU::VGPRRegBank);
1587     MRI.setRegBank(VSrc1, AMDGPU::VGPRRegBank);
1588 
1589     DstHi = IsUnsigned ? B.buildUMulH(S32, VSrc0, VSrc1).getReg(0)
1590                        : B.buildSMulH(S32, VSrc0, VSrc1).getReg(0);
1591     MRI.setRegBank(DstHi, AMDGPU::VGPRRegBank);
1592 
1593     if (!DstOnValu) {
1594       DstHi = buildReadFirstLane(B, MRI, DstHi);
1595     } else {
1596       MulHiInVgpr = true;
1597     }
1598   }
1599 
1600   // Accumulate and produce the "carry-out" bit.
1601   //
1602   // The "carry-out" is defined as bit 64 of the result when computed as a
1603   // big integer. For unsigned multiply-add, this matches the usual definition
1604   // of carry-out. For signed multiply-add, bit 64 is the sign bit of the
1605   // result, which is determined as:
1606   //   sign(Src0 * Src1) + sign(Src2) + carry-out from unsigned 64-bit add
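       // For example, for a signed mad with Src0 * Src1 = -1 and Src2 = 1 the
       // exact result is 0, so bit 64 is 0: the two sign bits are 1 and 0, the
       // unsigned add of 0xffffffffffffffff + 1 carries, and 1 ^ 0 ^ 1 = 0.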
1607   LLT CarryType = DstOnValu ? S1 : S32;
1608   const RegisterBank &CarryBank =
1609       DstOnValu ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
1610   const RegisterBank &DstBank =
1611       DstOnValu ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank;
1612   Register Carry;
1613   Register Zero;
1614 
1615   if (!IsUnsigned) {
1616     Zero = B.buildConstant(S32, 0).getReg(0);
1617     MRI.setRegBank(Zero,
1618                    MulHiInVgpr ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank);
1619 
1620     Carry = B.buildICmp(CmpInst::ICMP_SLT, MulHiInVgpr ? S1 : S32, DstHi, Zero)
1621                 .getReg(0);
1622     MRI.setRegBank(Carry, MulHiInVgpr ? AMDGPU::VCCRegBank
1623                                       : AMDGPU::SGPRRegBank);
1624 
1625     if (DstOnValu && !MulHiInVgpr) {
1626       Carry = B.buildTrunc(S1, Carry).getReg(0);
1627       MRI.setRegBank(Carry, AMDGPU::VCCRegBank);
1628     }
1629   }
1630 
1631   if (Accumulate) {
1632     if (DstOnValu) {
1633       DstLo = B.buildCopy(S32, DstLo).getReg(0);
1634       DstHi = B.buildCopy(S32, DstHi).getReg(0);
1635       MRI.setRegBank(DstLo, AMDGPU::VGPRRegBank);
1636       MRI.setRegBank(DstHi, AMDGPU::VGPRRegBank);
1637     }
1638 
1639     auto Unmerge = B.buildUnmerge(S32, Src2);
1640     Register Src2Lo = Unmerge.getReg(0);
1641     Register Src2Hi = Unmerge.getReg(1);
1642     MRI.setRegBank(Src2Lo, DstBank);
1643     MRI.setRegBank(Src2Hi, DstBank);
1644 
1645     if (!IsUnsigned) {
1646       auto Src2Sign = B.buildICmp(CmpInst::ICMP_SLT, CarryType, Src2Hi, Zero);
1647       MRI.setRegBank(Src2Sign.getReg(0), CarryBank);
1648 
1649       Carry = B.buildXor(CarryType, Carry, Src2Sign).getReg(0);
1650       MRI.setRegBank(Carry, CarryBank);
1651     }
1652 
1653     auto AddLo = B.buildUAddo(S32, CarryType, DstLo, Src2Lo);
1654     DstLo = AddLo.getReg(0);
1655     Register CarryLo = AddLo.getReg(1);
1656     MRI.setRegBank(DstLo, DstBank);
1657     MRI.setRegBank(CarryLo, CarryBank);
1658 
1659     auto AddHi = B.buildUAdde(S32, CarryType, DstHi, Src2Hi, CarryLo);
1660     DstHi = AddHi.getReg(0);
1661     MRI.setRegBank(DstHi, DstBank);
1662 
1663     Register CarryHi = AddHi.getReg(1);
1664     MRI.setRegBank(CarryHi, CarryBank);
1665 
1666     if (IsUnsigned) {
1667       Carry = CarryHi;
1668     } else {
1669       Carry = B.buildXor(CarryType, Carry, CarryHi).getReg(0);
1670       MRI.setRegBank(Carry, CarryBank);
1671     }
1672   } else {
1673     if (IsUnsigned) {
1674       Carry = B.buildConstant(CarryType, 0).getReg(0);
1675       MRI.setRegBank(Carry, CarryBank);
1676     }
1677   }
1678 
1679   B.buildMergeLikeInstr(Dst0, {DstLo, DstHi});
1680 
1681   if (DstOnValu) {
1682     B.buildCopy(Dst1, Carry);
1683   } else {
1684     B.buildTrunc(Dst1, Carry);
1685   }
1686 
1687   MI.eraseFromParent();
1688   return true;
1689 }
1690 
1691 // Return a suitable opcode for extending the operands of Opc when widening.
1692 static unsigned getExtendOp(unsigned Opc) {
1693   switch (Opc) {
1694   case TargetOpcode::G_ASHR:
1695   case TargetOpcode::G_SMIN:
1696   case TargetOpcode::G_SMAX:
1697     return TargetOpcode::G_SEXT;
1698   case TargetOpcode::G_LSHR:
1699   case TargetOpcode::G_UMIN:
1700   case TargetOpcode::G_UMAX:
1701     return TargetOpcode::G_ZEXT;
1702   default:
1703     return TargetOpcode::G_ANYEXT;
1704   }
1705 }
1706 
1707 // Emit a legalized extension from <2 x s16> to 2 32-bit components, avoiding
1708 // any illegal vector extend or unmerge operations.
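     // For example, bitcasting a <2 x s16> value whose bits are 0xAAAABBBB and
     // unpacking with G_ZEXT yields Lo = 0x0000BBBB and Hi = 0x0000AAAA; with
     // G_SEXT each half is sign-extended instead.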
1709 static std::pair<Register, Register>
1710 unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) {
1711   const LLT S32 = LLT::scalar(32);
1712   auto Bitcast = B.buildBitcast(S32, Src);
1713 
1714   if (ExtOpcode == TargetOpcode::G_SEXT) {
1715     auto ExtLo = B.buildSExtInReg(S32, Bitcast, 16);
1716     auto ShiftHi = B.buildAShr(S32, Bitcast, B.buildConstant(S32, 16));
1717     return std::pair(ExtLo.getReg(0), ShiftHi.getReg(0));
1718   }
1719 
1720   auto ShiftHi = B.buildLShr(S32, Bitcast, B.buildConstant(S32, 16));
1721   if (ExtOpcode == TargetOpcode::G_ZEXT) {
1722     auto ExtLo = B.buildAnd(S32, Bitcast, B.buildConstant(S32, 0xffff));
1723     return std::pair(ExtLo.getReg(0), ShiftHi.getReg(0));
1724   }
1725 
1726   assert(ExtOpcode == TargetOpcode::G_ANYEXT);
1727   return std::pair(Bitcast.getReg(0), ShiftHi.getReg(0));
1728 }
1729 
1730 // For cases where only a single copy is inserted for matching register banks,
1731 // replace the register in the instruction operand.
1732 static bool substituteSimpleCopyRegs(
1733   const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {
1734   SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx));
1735   if (!SrcReg.empty()) {
1736     assert(SrcReg.size() == 1);
1737     OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]);
1738     return true;
1739   }
1740 
1741   return false;
1742 }
1743 
1744 /// Handle register layout difference for f16 images for some subtargets.
1745 Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B,
1746                                                 MachineRegisterInfo &MRI,
1747                                                 Register Reg) const {
1748   if (!Subtarget.hasUnpackedD16VMem())
1749     return Reg;
1750 
1751   const LLT S16 = LLT::scalar(16);
1752   LLT StoreVT = MRI.getType(Reg);
1753   if (!StoreVT.isVector() || StoreVT.getElementType() != S16)
1754     return Reg;
1755 
1756   auto Unmerge = B.buildUnmerge(S16, Reg);
1757 
1758 
1759   SmallVector<Register, 4> WideRegs;
1760   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
1761     WideRegs.push_back(Unmerge.getReg(I));
1762 
1763   const LLT S32 = LLT::scalar(32);
1764   int NumElts = StoreVT.getNumElements();
1765 
1766   return B.buildMergeLikeInstr(LLT::fixed_vector(NumElts, S32), WideRegs)
1767       .getReg(0);
1768 }
1769 
1770 static std::pair<Register, unsigned>
1771 getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
1772   int64_t Const;
1773   if (mi_match(Reg, MRI, m_ICst(Const)))
1774     return std::pair(Register(), Const);
1775 
1776   Register Base;
1777   if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const))))
1778     return std::pair(Base, Const);
1779 
1780   // TODO: Handle G_OR used for add case
1781   return std::pair(Reg, 0);
1782 }
1783 
1784 std::pair<Register, unsigned>
1785 AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B,
1786                                            Register OrigOffset) const {
1787   const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(Subtarget);
1788   Register BaseReg;
1789   unsigned ImmOffset;
1790   const LLT S32 = LLT::scalar(32);
1791 
1792   // TODO: Use AMDGPU::getBaseWithConstantOffset() instead.
1793   std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(),
1794                                                            OrigOffset);
1795 
1796   unsigned C1 = 0;
1797   if (ImmOffset != 0) {
1798     // If the immediate value is too big for the immoffset field, put only bits
1799     // that would normally fit in the immoffset field. The remaining value that
1800     // is copied/added for the voffset field is a large power of 2, and it
1801     // stands more chance of being CSEd with the copy/add for another similar
1802     // load/store.
1803     // However, do not do that rounding down if the remaining value would be
1804     // negative, as it appears to be illegal to have a negative offset in the
1805     // vgpr, even if adding the immediate offset makes it positive.
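         // For example, with MaxImm = 0xfff an ImmOffset of 0x1234 is split
         // into Overflow = 0x1000, which is added to the base register, and an
         // ImmOffset of 0x234.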
1806     unsigned Overflow = ImmOffset & ~MaxImm;
1807     ImmOffset -= Overflow;
1808     if ((int32_t)Overflow < 0) {
1809       Overflow += ImmOffset;
1810       ImmOffset = 0;
1811     }
1812 
1813     C1 = ImmOffset;
1814     if (Overflow != 0) {
1815       if (!BaseReg)
1816         BaseReg = B.buildConstant(S32, Overflow).getReg(0);
1817       else {
1818         auto OverflowVal = B.buildConstant(S32, Overflow);
1819         BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
1820       }
1821     }
1822   }
1823 
1824   if (!BaseReg)
1825     BaseReg = B.buildConstant(S32, 0).getReg(0);
1826 
1827   return {BaseReg, C1};
1828 }
1829 
1830 bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg,
1831                                         Register SrcReg) const {
1832   MachineRegisterInfo &MRI = *B.getMRI();
1833   LLT SrcTy = MRI.getType(SrcReg);
1834   if (SrcTy.getSizeInBits() == 32) {
1835     // Use a v_mov_b32 here to make the exec dependency explicit.
1836     B.buildInstr(AMDGPU::V_MOV_B32_e32)
1837       .addDef(DstReg)
1838       .addUse(SrcReg);
1839     return constrainGenericRegister(DstReg, AMDGPU::VGPR_32RegClass, MRI) &&
1840            constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI);
1841   }
1842 
1843   Register TmpReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1844   Register TmpReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1845 
1846   B.buildInstr(AMDGPU::V_MOV_B32_e32)
1847     .addDef(TmpReg0)
1848     .addUse(SrcReg, 0, AMDGPU::sub0);
1849   B.buildInstr(AMDGPU::V_MOV_B32_e32)
1850     .addDef(TmpReg1)
1851     .addUse(SrcReg, 0, AMDGPU::sub1);
1852   B.buildInstr(AMDGPU::REG_SEQUENCE)
1853     .addDef(DstReg)
1854     .addUse(TmpReg0)
1855     .addImm(AMDGPU::sub0)
1856     .addUse(TmpReg1)
1857     .addImm(AMDGPU::sub1);
1858 
1859   return constrainGenericRegister(SrcReg, AMDGPU::SReg_64RegClass, MRI) &&
1860          constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI);
1861 }
1862 
1863 /// Utility function for pushing dynamic vector indexes with a constant offset
1864 /// into waterfall loops.
1865 static void reinsertVectorIndexAdd(MachineIRBuilder &B,
1866                                    MachineInstr &IdxUseInstr,
1867                                    unsigned OpIdx,
1868                                    unsigned ConstOffset) {
1869   MachineRegisterInfo &MRI = *B.getMRI();
1870   const LLT S32 = LLT::scalar(32);
1871   Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg();
1872   B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator());
1873 
1874   auto MaterializedOffset = B.buildConstant(S32, ConstOffset);
1875 
1876   auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset);
1877   MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank);
1878   MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank);
1879   IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0));
1880 }
1881 
1882 /// Implement extending a 32-bit value to a 64-bit value. \p Lo32Reg is the
1883 /// original 32-bit source value (to be inserted in the low part of the combined
1884 /// 64-bit result), and \p Hi32Reg is the high half of the combined 64-bit
1885 /// value.
1886 static void extendLow32IntoHigh32(MachineIRBuilder &B,
1887                                   Register Hi32Reg, Register Lo32Reg,
1888                                   unsigned ExtOpc,
1889                                   const RegisterBank &RegBank,
1890                                   bool IsBooleanSrc = false) {
1891   if (ExtOpc == AMDGPU::G_ZEXT) {
1892     B.buildConstant(Hi32Reg, 0);
1893   } else if (ExtOpc == AMDGPU::G_SEXT) {
1894     if (IsBooleanSrc) {
1895       // If we know the original source was an s1, the high half is the same as
1896       // the low.
1897       B.buildCopy(Hi32Reg, Lo32Reg);
1898     } else {
1899       // Replicate sign bit from 32-bit extended part.
1900       auto ShiftAmt = B.buildConstant(LLT::scalar(32), 31);
1901       B.getMRI()->setRegBank(ShiftAmt.getReg(0), RegBank);
1902       B.buildAShr(Hi32Reg, Lo32Reg, ShiftAmt);
1903     }
1904   } else {
1905     assert(ExtOpc == AMDGPU::G_ANYEXT && "not an integer extension");
1906     B.buildUndef(Hi32Reg);
1907   }
1908 }
1909 
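     // Try to lower a dynamically indexed G_EXTRACT_VECTOR_ELT into a chain of
     // compares and selects when that is deemed profitable, e.g. for a four
     // element vector:
     //   Res = V[0]; Res = Idx == 1 ? V[1] : Res; ... Res = Idx == 3 ? V[3] : Res;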
1910 bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect(
1911     MachineIRBuilder &B, MachineInstr &MI,
1912     const OperandsMapper &OpdMapper) const {
1913   MachineRegisterInfo &MRI = *B.getMRI();
1914 
1915   Register VecReg = MI.getOperand(1).getReg();
1916   Register Idx = MI.getOperand(2).getReg();
1917 
1918   const RegisterBank &IdxBank =
1919     *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1920 
1921   bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
1922 
1923   LLT VecTy = MRI.getType(VecReg);
1924   unsigned EltSize = VecTy.getScalarSizeInBits();
1925   unsigned NumElem = VecTy.getNumElements();
1926 
1927   if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
1928                                                   IsDivergentIdx, &Subtarget))
1929     return false;
1930 
1931   LLT S32 = LLT::scalar(32);
1932 
1933   const RegisterBank &DstBank =
1934     *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1935   const RegisterBank &SrcBank =
1936     *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1937 
1938   const RegisterBank &CCBank =
1939     (DstBank == AMDGPU::SGPRRegBank &&
1940      SrcBank == AMDGPU::SGPRRegBank &&
1941      IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
1942                                      : AMDGPU::VCCRegBank;
1943   LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
1944 
1945   if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
1946     Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
1947     MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
1948   }
1949 
1950   LLT EltTy = VecTy.getScalarType();
1951   SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
1952   unsigned NumLanes = DstRegs.size();
1953   if (!NumLanes)
1954     NumLanes = 1;
1955   else
1956     EltTy = MRI.getType(DstRegs[0]);
1957 
1958   auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
1959   SmallVector<Register, 2> Res(NumLanes);
1960   for (unsigned L = 0; L < NumLanes; ++L)
1961     Res[L] = UnmergeToEltTy.getReg(L);
1962 
1963   for (unsigned I = 1; I < NumElem; ++I) {
1964     auto IC = B.buildConstant(S32, I);
1965     MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
1966     auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
1967     MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
1968 
1969     for (unsigned L = 0; L < NumLanes; ++L) {
1970       auto S = B.buildSelect(EltTy, Cmp,
1971                              UnmergeToEltTy.getReg(I * NumLanes + L), Res[L]);
1972 
1973       for (unsigned N : { 0, 2, 3 })
1974         MRI.setRegBank(S->getOperand(N).getReg(), DstBank);
1975 
1976       Res[L] = S->getOperand(0).getReg();
1977     }
1978   }
1979 
1980   for (unsigned L = 0; L < NumLanes; ++L) {
1981     Register DstReg = (NumLanes == 1) ? MI.getOperand(0).getReg() : DstRegs[L];
1982     B.buildCopy(DstReg, Res[L]);
1983     MRI.setRegBank(DstReg, DstBank);
1984   }
1985 
1986   MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
1987   MI.eraseFromParent();
1988 
1989   return true;
1990 }
1991 
1992 // Insert a cross regbank copy for a register if it already has a bank that
1993 // differs from the one we want to set.
1994 static Register constrainRegToBank(MachineRegisterInfo &MRI,
1995                                    MachineIRBuilder &B, Register &Reg,
1996                                    const RegisterBank &Bank) {
1997   const RegisterBank *CurrBank = MRI.getRegBankOrNull(Reg);
1998   if (CurrBank && *CurrBank != Bank) {
1999     Register Copy = B.buildCopy(MRI.getType(Reg), Reg).getReg(0);
2000     MRI.setRegBank(Copy, Bank);
2001     return Copy;
2002   }
2003 
2004   MRI.setRegBank(Reg, Bank);
2005   return Reg;
2006 }
2007 
2008 bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(
2009     MachineIRBuilder &B, MachineInstr &MI,
2010     const OperandsMapper &OpdMapper) const {
2011 
2012   MachineRegisterInfo &MRI = *B.getMRI();
2013   Register VecReg = MI.getOperand(1).getReg();
2014   Register Idx = MI.getOperand(3).getReg();
2015 
2016   const RegisterBank &IdxBank =
2017     *OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
2018 
2019   bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
2020 
2021   LLT VecTy = MRI.getType(VecReg);
2022   unsigned EltSize = VecTy.getScalarSizeInBits();
2023   unsigned NumElem = VecTy.getNumElements();
2024 
2025   if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
2026                                                   IsDivergentIdx, &Subtarget))
2027     return false;
2028 
2029   LLT S32 = LLT::scalar(32);
2030 
2031   const RegisterBank &DstBank =
2032     *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2033   const RegisterBank &SrcBank =
2034     *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2035   const RegisterBank &InsBank =
2036     *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2037 
2038   const RegisterBank &CCBank =
2039     (DstBank == AMDGPU::SGPRRegBank &&
2040      SrcBank == AMDGPU::SGPRRegBank &&
2041      InsBank == AMDGPU::SGPRRegBank &&
2042      IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
2043                                      : AMDGPU::VCCRegBank;
2044   LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
2045 
2046   if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
2047     Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
2048     MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
2049   }
2050 
2051   LLT EltTy = VecTy.getScalarType();
2052   SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
2053   unsigned NumLanes = InsRegs.size();
2054   if (!NumLanes) {
2055     NumLanes = 1;
2056     InsRegs.push_back(MI.getOperand(2).getReg());
2057   } else {
2058     EltTy = MRI.getType(InsRegs[0]);
2059   }
2060 
2061   auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
2062   SmallVector<Register, 16> Ops(NumElem * NumLanes);
2063 
2064   for (unsigned I = 0; I < NumElem; ++I) {
2065     auto IC = B.buildConstant(S32, I);
2066     MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
2067     auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
2068     MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
2069 
2070     for (unsigned L = 0; L < NumLanes; ++L) {
2071       Register Op0 = constrainRegToBank(MRI, B, InsRegs[L], DstBank);
2072       Register Op1 = UnmergeToEltTy.getReg(I * NumLanes + L);
2073       Op1 = constrainRegToBank(MRI, B, Op1, DstBank);
2074 
2075       Register Select = B.buildSelect(EltTy, Cmp, Op0, Op1).getReg(0);
2076       MRI.setRegBank(Select, DstBank);
2077 
2078       Ops[I * NumLanes + L] = Select;
2079     }
2080   }
2081 
2082   LLT MergeTy = LLT::fixed_vector(Ops.size(), EltTy);
2083   if (MergeTy == MRI.getType(MI.getOperand(0).getReg())) {
2084     B.buildBuildVector(MI.getOperand(0), Ops);
2085   } else {
2086     auto Vec = B.buildBuildVector(MergeTy, Ops);
2087     MRI.setRegBank(Vec->getOperand(0).getReg(), DstBank);
2088     B.buildBitcast(MI.getOperand(0).getReg(), Vec);
2089   }
2090 
2091   MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
2092   MI.eraseFromParent();
2093 
2094   return true;
2095 }
2096 
2097 // Break s_mul_u64 into 32-bit vector operations.
2098 void AMDGPURegisterBankInfo::applyMappingSMULU64(
2099     MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
2100   SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2101   SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
2102   SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
2103 
2104   // All inputs are SGPRs, nothing special to do.
2105   if (DefRegs.empty()) {
2106     assert(Src0Regs.empty() && Src1Regs.empty());
2107     applyDefaultMapping(OpdMapper);
2108     return;
2109   }
2110 
2111   assert(DefRegs.size() == 2);
2112   assert(Src0Regs.size() == Src1Regs.size() &&
2113          (Src0Regs.empty() || Src0Regs.size() == 2));
2114 
2115   MachineRegisterInfo &MRI = OpdMapper.getMRI();
2116   MachineInstr &MI = OpdMapper.getMI();
2117   Register DstReg = MI.getOperand(0).getReg();
2118   LLT HalfTy = LLT::scalar(32);
2119 
2120   // Depending on where the source registers came from, the generic code may
2121   // have decided to split the inputs already or not. If not, we still need to
2122   // extract the values.
2123 
2124   if (Src0Regs.empty())
2125     split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
2126   else
2127     setRegsToType(MRI, Src0Regs, HalfTy);
2128 
2129   if (Src1Regs.empty())
2130     split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
2131   else
2132     setRegsToType(MRI, Src1Regs, HalfTy);
2133 
2134   setRegsToType(MRI, DefRegs, HalfTy);
2135 
2136   // The multiplication is done as follows:
2137   //
2138   //                            Op1H  Op1L
2139   //                          * Op0H  Op0L
2140   //                       --------------------
2141   //                       Op1H*Op0L  Op1L*Op0L
2142   //          + Op1H*Op0H  Op1L*Op0H
2143   // -----------------------------------------
2144   // (Op1H*Op0L + Op1L*Op0H + carry)  Op1L*Op0L
2145   //
2146   //  We drop Op1H*Op0H because the result of the multiplication is a 64-bit
2147   //  value and that would overflow.
2148   //  The low 32-bit value is Op1L*Op0L.
2149   //  The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from
2150   //  Op1L*Op0L).
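       //  For example, Op0 = 0x0000000200000003 and Op1 = 0x0000000400000005:
       //  the low half is 3 * 5 = 0xf and the high half is 4 * 3 + 5 * 2 + 0 =
       //  0x16, giving 0x000000160000000f, the truncated 128-bit product.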
2151 
2152   ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::VGPRRegBank);
2153 
2154   Register Hi = B.buildUMulH(HalfTy, Src0Regs[0], Src1Regs[0]).getReg(0);
2155   Register MulLoHi = B.buildMul(HalfTy, Src0Regs[0], Src1Regs[1]).getReg(0);
2156   Register Add = B.buildAdd(HalfTy, Hi, MulLoHi).getReg(0);
2157   Register MulHiLo = B.buildMul(HalfTy, Src0Regs[1], Src1Regs[0]).getReg(0);
2158   B.buildAdd(DefRegs[1], Add, MulHiLo);
2159   B.buildMul(DefRegs[0], Src0Regs[0], Src1Regs[0]);
2160 
2161   MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2162   MI.eraseFromParent();
2163 }
2164 
2165 void AMDGPURegisterBankInfo::applyMappingImpl(
2166     MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
2167   MachineInstr &MI = OpdMapper.getMI();
2168   B.setInstrAndDebugLoc(MI);
2169   unsigned Opc = MI.getOpcode();
2170   MachineRegisterInfo &MRI = OpdMapper.getMRI();
2171   switch (Opc) {
2172   case AMDGPU::G_CONSTANT:
2173   case AMDGPU::G_IMPLICIT_DEF: {
2174     Register DstReg = MI.getOperand(0).getReg();
2175     LLT DstTy = MRI.getType(DstReg);
2176     if (DstTy != LLT::scalar(1))
2177       break;
2178 
2179     const RegisterBank *DstBank =
2180         OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2181     if (DstBank == &AMDGPU::VCCRegBank)
2182       break;
2183     SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0));
2184     if (DefRegs.empty())
2185       DefRegs.push_back(DstReg);
2186 
2187     B.setInsertPt(*MI.getParent(), ++MI.getIterator());
2188 
2189     Register NewDstReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
2190     LLVMContext &Ctx = B.getMF().getFunction().getContext();
2191 
2192     MI.getOperand(0).setReg(NewDstReg);
2193     if (Opc != AMDGPU::G_IMPLICIT_DEF) {
2194       uint64_t ConstVal = MI.getOperand(1).getCImm()->getZExtValue();
2195       MI.getOperand(1).setCImm(
2196           ConstantInt::get(IntegerType::getInt32Ty(Ctx), ConstVal));
2197     }
2198 
2199     MRI.setRegBank(NewDstReg, *DstBank);
2200     B.buildTrunc(DefRegs[0], NewDstReg);
2201     return;
2202   }
2203   case AMDGPU::G_PHI: {
2204     Register DstReg = MI.getOperand(0).getReg();
2205     LLT DstTy = MRI.getType(DstReg);
2206     if (DstTy != LLT::scalar(1))
2207       break;
2208 
2209     const LLT S32 = LLT::scalar(32);
2210     const RegisterBank *DstBank =
2211       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2212     if (DstBank == &AMDGPU::VCCRegBank) {
2213       applyDefaultMapping(OpdMapper);
2214       // The standard handling only considers the result register bank for
2215       // phis. For VCC, blindly inserting a copy when the phi is lowered will
2216       // produce an invalid copy. We can only copy with some kind of compare to
2217       // get a vector boolean result. Insert a register bank copy that will be
2218       // correctly lowered to a compare.
2219       for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
2220         Register SrcReg = MI.getOperand(I).getReg();
2221         const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
2222 
2223         if (SrcBank != &AMDGPU::VCCRegBank) {
2224           MachineBasicBlock *SrcMBB = MI.getOperand(I + 1).getMBB();
2225           B.setInsertPt(*SrcMBB, SrcMBB->getFirstTerminator());
2226 
2227           auto Copy = B.buildCopy(LLT::scalar(1), SrcReg);
2228           MRI.setRegBank(Copy.getReg(0), AMDGPU::VCCRegBank);
2229           MI.getOperand(I).setReg(Copy.getReg(0));
2230         }
2231       }
2232 
2233       return;
2234     }
2235 
2236     // Phi handling is strange and only considers the bank of the destination.
2237     substituteSimpleCopyRegs(OpdMapper, 0);
2238 
2239     // Promote SGPR/VGPR booleans to s32
2240     ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank);
2241     B.setInsertPt(B.getMBB(), MI);
2242     LegalizerHelper Helper(B.getMF(), ApplyBank, B);
2243 
2244     if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
2245       llvm_unreachable("widen scalar should have succeeded");
2246 
2247     return;
2248   }
2249   case AMDGPU::G_FCMP:
2250     if (!Subtarget.hasSALUFloatInsts())
2251       break;
2252     [[fallthrough]];
2253   case AMDGPU::G_ICMP:
2254   case AMDGPU::G_UADDO:
2255   case AMDGPU::G_USUBO:
2256   case AMDGPU::G_UADDE:
2257   case AMDGPU::G_SADDE:
2258   case AMDGPU::G_USUBE:
2259   case AMDGPU::G_SSUBE: {
2260     unsigned BoolDstOp =
2261         (Opc == AMDGPU::G_ICMP || Opc == AMDGPU::G_FCMP) ? 0 : 1;
2262     Register DstReg = MI.getOperand(BoolDstOp).getReg();
2263 
2264     const RegisterBank *DstBank =
2265       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2266     if (DstBank != &AMDGPU::SGPRRegBank)
2267       break;
2268 
2269     const bool HasCarryIn = MI.getNumOperands() == 5;
2270 
2271     // If this is a scalar compare, promote the result to s32, as the selection
2272     // will end up using a copy to a 32-bit vreg.
2273     const LLT S32 = LLT::scalar(32);
2274     Register NewDstReg = MRI.createGenericVirtualRegister(S32);
2275     MRI.setRegBank(NewDstReg, AMDGPU::SGPRRegBank);
2276     MI.getOperand(BoolDstOp).setReg(NewDstReg);
2277 
2278     if (HasCarryIn) {
2279       Register NewSrcReg = MRI.createGenericVirtualRegister(S32);
2280       MRI.setRegBank(NewSrcReg, AMDGPU::SGPRRegBank);
2281       B.buildZExt(NewSrcReg, MI.getOperand(4).getReg());
2282       MI.getOperand(4).setReg(NewSrcReg);
2283     }
2284 
2285     MachineBasicBlock *MBB = MI.getParent();
2286     B.setInsertPt(*MBB, std::next(MI.getIterator()));
2287 
2288     // If we had a constrained VCC result register, a copy was inserted to VCC
2289     // from SGPR.
2290     SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0));
2291     if (DefRegs.empty())
2292       DefRegs.push_back(DstReg);
2293     B.buildTrunc(DefRegs[0], NewDstReg);
2294     return;
2295   }
2296   case AMDGPU::G_SELECT: {
2297     Register DstReg = MI.getOperand(0).getReg();
2298     LLT DstTy = MRI.getType(DstReg);
2299 
2300     SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(1));
2301     if (CondRegs.empty())
2302       CondRegs.push_back(MI.getOperand(1).getReg());
2303     else {
2304       assert(CondRegs.size() == 1);
2305     }
2306 
2307     const RegisterBank *CondBank = getRegBank(CondRegs[0], MRI, *TRI);
2308     if (CondBank == &AMDGPU::SGPRRegBank) {
2309       const LLT S32 = LLT::scalar(32);
2310       Register NewCondReg = MRI.createGenericVirtualRegister(S32);
2311       MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
2312 
2313       MI.getOperand(1).setReg(NewCondReg);
2314       B.buildZExt(NewCondReg, CondRegs[0]);
2315     }
2316 
2317     if (DstTy.getSizeInBits() != 64)
2318       break;
2319 
2320     LLT HalfTy = getHalfSizedType(DstTy);
2321 
2322     SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2323     SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
2324     SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3));
2325 
2326     // All inputs are SGPRs, nothing special to do.
2327     if (DefRegs.empty()) {
2328       assert(Src1Regs.empty() && Src2Regs.empty());
2329       break;
2330     }
2331 
2332     if (Src1Regs.empty())
2333       split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
2334     else {
2335       setRegsToType(MRI, Src1Regs, HalfTy);
2336     }
2337 
2338     if (Src2Regs.empty())
2339       split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg());
2340     else
2341       setRegsToType(MRI, Src2Regs, HalfTy);
2342 
2343     setRegsToType(MRI, DefRegs, HalfTy);
2344 
2345     B.buildSelect(DefRegs[0], CondRegs[0], Src1Regs[0], Src2Regs[0]);
2346     B.buildSelect(DefRegs[1], CondRegs[0], Src1Regs[1], Src2Regs[1]);
2347 
2348     MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2349     MI.eraseFromParent();
2350     return;
2351   }
2352   case AMDGPU::G_BRCOND: {
2353     Register CondReg = MI.getOperand(0).getReg();
2354     // FIXME: Should use legalizer helper, but should change bool ext type.
2355     const RegisterBank *CondBank =
2356       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2357 
2358     if (CondBank == &AMDGPU::SGPRRegBank) {
2359       const LLT S32 = LLT::scalar(32);
2360       Register NewCondReg = MRI.createGenericVirtualRegister(S32);
2361       MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
2362 
2363       MI.getOperand(0).setReg(NewCondReg);
2364       B.buildZExt(NewCondReg, CondReg);
2365       return;
2366     }
2367 
2368     break;
2369   }
2370   case AMDGPU::G_AND:
2371   case AMDGPU::G_OR:
2372   case AMDGPU::G_XOR: {
2373     // 64-bit and/or/xor is only available on the SALU, so split into 2 32-bit
2374     // ops if there is a VGPR input.
2375     Register DstReg = MI.getOperand(0).getReg();
2376     LLT DstTy = MRI.getType(DstReg);
2377 
2378     if (DstTy.getSizeInBits() == 1) {
2379       const RegisterBank *DstBank =
2380         OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2381       if (DstBank == &AMDGPU::VCCRegBank)
2382         break;
2383 
2384       MachineFunction *MF = MI.getParent()->getParent();
2385       ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank);
2386       LegalizerHelper Helper(*MF, ApplyBank, B);
2387 
2388       if (Helper.widenScalar(MI, 0, LLT::scalar(32)) !=
2389           LegalizerHelper::Legalized)
2390         llvm_unreachable("widen scalar should have succeeded");
2391       return;
2392     }
2393 
2394     if (DstTy.getSizeInBits() != 64)
2395       break;
2396 
2397     LLT HalfTy = getHalfSizedType(DstTy);
2398     SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2399     SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
2400     SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
2401 
2402     // All inputs are SGPRs, nothing special to do.
2403     if (DefRegs.empty()) {
2404       assert(Src0Regs.empty() && Src1Regs.empty());
2405       break;
2406     }
2407 
2408     assert(DefRegs.size() == 2);
2409     assert(Src0Regs.size() == Src1Regs.size() &&
2410            (Src0Regs.empty() || Src0Regs.size() == 2));
2411 
2412     // Depending on where the source registers came from, the generic code may
2413     // have decided to split the inputs already or not. If not, we still need to
2414     // extract the values.
2415 
2416     if (Src0Regs.empty())
2417       split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
2418     else
2419       setRegsToType(MRI, Src0Regs, HalfTy);
2420 
2421     if (Src1Regs.empty())
2422       split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
2423     else
2424       setRegsToType(MRI, Src1Regs, HalfTy);
2425 
2426     setRegsToType(MRI, DefRegs, HalfTy);
2427 
2428     B.buildInstr(Opc, {DefRegs[0]}, {Src0Regs[0], Src1Regs[0]});
2429     B.buildInstr(Opc, {DefRegs[1]}, {Src0Regs[1], Src1Regs[1]});
2430 
2431     MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2432     MI.eraseFromParent();
2433     return;
2434   }
2435   case AMDGPU::G_ABS: {
2436     Register SrcReg = MI.getOperand(1).getReg();
2437     const RegisterBank *SrcBank = MRI.getRegBankOrNull(SrcReg);
2438 
2439     // There is no VALU abs instruction so we need to replace it with a sub and
2440     // max combination.
2441     if (SrcBank && SrcBank == &AMDGPU::VGPRRegBank) {
2442       MachineFunction *MF = MI.getParent()->getParent();
2443       ApplyRegBankMapping Apply(B, *this, MRI, &AMDGPU::VGPRRegBank);
2444       LegalizerHelper Helper(*MF, Apply, B);
2445 
2446       if (Helper.lowerAbsToMaxNeg(MI) != LegalizerHelper::Legalized)
2447         llvm_unreachable("lowerAbsToMaxNeg should have succeeded");
2448       return;
2449     }
2450     [[fallthrough]];
2451   }
2452   case AMDGPU::G_ADD:
2453   case AMDGPU::G_SUB:
2454   case AMDGPU::G_MUL:
2455   case AMDGPU::G_SHL:
2456   case AMDGPU::G_LSHR:
2457   case AMDGPU::G_ASHR:
2458   case AMDGPU::G_SMIN:
2459   case AMDGPU::G_SMAX:
2460   case AMDGPU::G_UMIN:
2461   case AMDGPU::G_UMAX: {
2462     Register DstReg = MI.getOperand(0).getReg();
2463     LLT DstTy = MRI.getType(DstReg);
2464 
2465     // Special case for s_mul_u64. There is no vector equivalent of
2466     // s_mul_u64, so we have to break it down into 32-bit vector
2467     // multiplications.
2468     if (Opc == AMDGPU::G_MUL && DstTy.getSizeInBits() == 64) {
2469       applyMappingSMULU64(B, OpdMapper);
2470       return;
2471     }
2472 
2473     // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
2474     // Packed 16-bit operations need to be scalarized and promoted.
2475     if (DstTy != LLT::scalar(16) && DstTy != LLT::fixed_vector(2, 16))
2476       break;
2477 
2478     const RegisterBank *DstBank =
2479         OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2480     if (DstBank == &AMDGPU::VGPRRegBank)
2481       break;
2482 
2483     const LLT S32 = LLT::scalar(32);
2484     MachineBasicBlock *MBB = MI.getParent();
2485     MachineFunction *MF = MBB->getParent();
2486     ApplyRegBankMapping ApplySALU(B, *this, MRI, &AMDGPU::SGPRRegBank);
2487 
2488     if (DstTy.isVector() && Opc == AMDGPU::G_ABS) {
2489       Register WideSrcLo, WideSrcHi;
2490 
2491       std::tie(WideSrcLo, WideSrcHi) =
2492           unpackV2S16ToS32(B, MI.getOperand(1).getReg(), TargetOpcode::G_SEXT);
2493       auto Lo = B.buildInstr(AMDGPU::G_ABS, {S32}, {WideSrcLo});
2494       auto Hi = B.buildInstr(AMDGPU::G_ABS, {S32}, {WideSrcHi});
2495       B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)});
2496       MI.eraseFromParent();
2497       return;
2498     }
2499 
2500     if (DstTy.isVector()) {
2501       Register WideSrc0Lo, WideSrc0Hi;
2502       Register WideSrc1Lo, WideSrc1Hi;
2503 
2504       unsigned ExtendOp = getExtendOp(MI.getOpcode());
2505       std::tie(WideSrc0Lo, WideSrc0Hi)
2506         = unpackV2S16ToS32(B, MI.getOperand(1).getReg(), ExtendOp);
2507       std::tie(WideSrc1Lo, WideSrc1Hi)
2508         = unpackV2S16ToS32(B, MI.getOperand(2).getReg(), ExtendOp);
2509       auto Lo = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Lo, WideSrc1Lo});
2510       auto Hi = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Hi, WideSrc1Hi});
2511       B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)});
2512       MI.eraseFromParent();
2513     } else {
2514       LegalizerHelper Helper(*MF, ApplySALU, B);
2515 
2516       if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
2517         llvm_unreachable("widen scalar should have succeeded");
2518 
2519       // FIXME: s16 shift amounts should be legal.
2520       if (Opc == AMDGPU::G_SHL || Opc == AMDGPU::G_LSHR ||
2521           Opc == AMDGPU::G_ASHR) {
2522         B.setInsertPt(*MBB, MI.getIterator());
2523         if (Helper.widenScalar(MI, 1, S32) != LegalizerHelper::Legalized)
2524           llvm_unreachable("widen scalar should have succeeded");
2525       }
2526     }
2527 
2528     return;
2529   }
2530   case AMDGPU::G_AMDGPU_S_MUL_I64_I32:
2531   case AMDGPU::G_AMDGPU_S_MUL_U64_U32: {
2532     // This is a special case for s_mul_u64. We use the
2533     // G_AMDGPU_S_MUL_I64_I32 opcode to represent an s_mul_u64 operation
2534     // where the 33 higher bits are sign-extended and the
2535     // G_AMDGPU_S_MUL_U64_U32 opcode to represent an s_mul_u64 operation
2536     // where the 32 higher bits are zero-extended. If scalar registers are
2537     // selected, both opcodes are lowered as s_mul_u64. If vector registers
2538     // are selected, then G_AMDGPU_S_MUL_I64_I32 and
2539     // G_AMDGPU_S_MUL_U64_U32 are lowered with a vector mad instruction.
2540 
2541     // Insert basic copies.
2542     applyDefaultMapping(OpdMapper);
2543 
2544     Register DstReg = MI.getOperand(0).getReg();
2545     Register SrcReg0 = MI.getOperand(1).getReg();
2546     Register SrcReg1 = MI.getOperand(2).getReg();
2547     const LLT S32 = LLT::scalar(32);
2548     const LLT S64 = LLT::scalar(64);
2549     assert(MRI.getType(DstReg) == S64 && "This is a special case for s_mul_u64 "
2550                                          "that handles only 64-bit operands.");
2551     const RegisterBank *DstBank =
2552         OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2553 
2554     // Replace G_AMDGPU_S_MUL_I64_I32 and G_AMDGPU_S_MUL_U64_U32
2555     // with s_mul_u64 operation.
2556     if (DstBank == &AMDGPU::SGPRRegBank) {
2557       MI.setDesc(TII->get(AMDGPU::S_MUL_U64));
2558       MRI.setRegClass(DstReg, &AMDGPU::SGPR_64RegClass);
2559       MRI.setRegClass(SrcReg0, &AMDGPU::SGPR_64RegClass);
2560       MRI.setRegClass(SrcReg1, &AMDGPU::SGPR_64RegClass);
2561       return;
2562     }
2563 
2564     // Replace G_AMDGPU_S_MUL_I64_I32 and G_AMDGPU_S_MUL_U64_U32
2565     // with a vector mad.
2566     assert(MRI.getRegBankOrNull(DstReg) == &AMDGPU::VGPRRegBank &&
2567            "The destination operand should be in vector registers.");
2568 
2569     DebugLoc DL = MI.getDebugLoc();
2570 
2571     // Extract the lower subregister from the first operand.
2572     Register Op0L = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2573     MRI.setRegClass(Op0L, &AMDGPU::VGPR_32RegClass);
2574     MRI.setType(Op0L, S32);
2575     B.buildTrunc(Op0L, SrcReg0);
2576 
2577     // Extract the lower subregister from the second operand.
2578     Register Op1L = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2579     MRI.setRegClass(Op1L, &AMDGPU::VGPR_32RegClass);
2580     MRI.setType(Op1L, S32);
2581     B.buildTrunc(Op1L, SrcReg1);
2582 
2583     unsigned NewOpc = Opc == AMDGPU::G_AMDGPU_S_MUL_U64_U32
2584                           ? AMDGPU::G_AMDGPU_MAD_U64_U32
2585                           : AMDGPU::G_AMDGPU_MAD_I64_I32;
2586 
2587     MachineIRBuilder B(MI);
2588     Register Zero64 = B.buildConstant(S64, 0).getReg(0);
2589     MRI.setRegClass(Zero64, &AMDGPU::VReg_64RegClass);
2590     Register CarryOut = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
2591     MRI.setRegClass(CarryOut, &AMDGPU::VReg_64RegClass);
2592     B.buildInstr(NewOpc, {DstReg, CarryOut}, {Op0L, Op1L, Zero64});
2593     MI.eraseFromParent();
2594     return;
2595   }
2596   case AMDGPU::G_SEXT_INREG: {
2597     SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
2598     if (SrcRegs.empty())
2599       break; // Nothing to repair
2600 
2601     const LLT S32 = LLT::scalar(32);
2602     ApplyRegBankMapping O(B, *this, MRI, &AMDGPU::VGPRRegBank);
2603 
2604     // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs
2605     // we would need to further expand, and doesn't let us directly set the
2606     // result registers.
2607     SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
2608 
2609     int Amt = MI.getOperand(2).getImm();
2610     if (Amt <= 32) {
2611       // Downstream users have expectations for the high bit behavior, so freeze
2612       // incoming undefined bits.
2613       if (Amt == 32) {
2614         // The low bits are unchanged.
2615         B.buildFreeze(DstRegs[0], SrcRegs[0]);
2616       } else {
2617         auto Freeze = B.buildFreeze(S32, SrcRegs[0]);
2618         // Extend in the low bits and propagate the sign bit to the high half.
2619         B.buildSExtInReg(DstRegs[0], Freeze, Amt);
2620       }
2621 
2622       B.buildAShr(DstRegs[1], DstRegs[0], B.buildConstant(S32, 31));
2623     } else {
2624       // The low bits are unchanged, and the sign extension happens within the
2625       // high half. No freeze is required.
2626       B.buildCopy(DstRegs[0], SrcRegs[0]);
2627       B.buildSExtInReg(DstRegs[1], SrcRegs[1], Amt - 32);
2628     }
2629 
2630     Register DstReg = MI.getOperand(0).getReg();
2631     MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2632     MI.eraseFromParent();
2633     return;
2634   }
2635   case AMDGPU::G_CTPOP:
2636   case AMDGPU::G_BITREVERSE: {
2637     const RegisterBank *DstBank =
2638       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2639     if (DstBank == &AMDGPU::SGPRRegBank)
2640       break;
2641 
2642     Register SrcReg = MI.getOperand(1).getReg();
2643     const LLT S32 = LLT::scalar(32);
2644     LLT Ty = MRI.getType(SrcReg);
2645     if (Ty == S32)
2646       break;
2647 
2648     ApplyRegBankMapping ApplyVALU(B, *this, MRI, &AMDGPU::VGPRRegBank);
2649 
2650     MachineFunction &MF = B.getMF();
2651     LegalizerHelper Helper(MF, ApplyVALU, B);
2652 
2653     if (Helper.narrowScalar(MI, 1, S32) != LegalizerHelper::Legalized)
2654       llvm_unreachable("narrowScalar should have succeeded");
2655     return;
2656   }
2657   case AMDGPU::G_AMDGPU_FFBH_U32:
2658   case AMDGPU::G_AMDGPU_FFBL_B32:
2659   case AMDGPU::G_CTLZ_ZERO_UNDEF:
2660   case AMDGPU::G_CTTZ_ZERO_UNDEF: {
2661     const RegisterBank *DstBank =
2662         OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2663     if (DstBank == &AMDGPU::SGPRRegBank)
2664       break;
2665 
2666     Register SrcReg = MI.getOperand(1).getReg();
2667     const LLT S32 = LLT::scalar(32);
2668     LLT Ty = MRI.getType(SrcReg);
2669     if (Ty == S32)
2670       break;
2671 
2672     // We can narrow this more efficiently than Helper can by using ffbh/ffbl
2673     // which return -1 when the input is zero:
2674     // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
2675     // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
2676     // (ffbh hi:lo) -> (umin (ffbh hi), (uaddsat (ffbh lo), 32))
2677     // (ffbl hi:lo) -> (umin (uaddsat (ffbl hi), 32), (ffbl lo))
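         // For example, ctlz_zero_undef with hi = 0 and lo = 0x8000 gives
         // ffbh(hi) = -1 and ffbh(lo) + 32 = 16 + 32 = 48; the umin selects 48,
         // the correct count for the 64-bit value.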
2678     ApplyRegBankMapping ApplyVALU(B, *this, MRI, &AMDGPU::VGPRRegBank);
2679     SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
2680     unsigned NewOpc = Opc == AMDGPU::G_CTLZ_ZERO_UNDEF
2681                           ? (unsigned)AMDGPU::G_AMDGPU_FFBH_U32
2682                           : Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
2683                                 ? (unsigned)AMDGPU::G_AMDGPU_FFBL_B32
2684                                 : Opc;
2685     unsigned Idx = NewOpc == AMDGPU::G_AMDGPU_FFBH_U32;
2686     auto X = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx]});
2687     auto Y = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx ^ 1]});
2688     unsigned AddOpc =
2689         Opc == AMDGPU::G_CTLZ_ZERO_UNDEF || Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
2690             ? AMDGPU::G_ADD
2691             : AMDGPU::G_UADDSAT;
2692     Y = B.buildInstr(AddOpc, {S32}, {Y, B.buildConstant(S32, 32)});
2693     Register DstReg = MI.getOperand(0).getReg();
2694     B.buildUMin(DstReg, X, Y);
2695     MI.eraseFromParent();
2696     return;
2697   }
2698   case AMDGPU::G_SEXT:
2699   case AMDGPU::G_ZEXT:
2700   case AMDGPU::G_ANYEXT: {
2701     Register SrcReg = MI.getOperand(1).getReg();
2702     LLT SrcTy = MRI.getType(SrcReg);
2703     const bool Signed = Opc == AMDGPU::G_SEXT;
2704 
2705     assert(OpdMapper.getVRegs(1).empty());
2706 
2707     const RegisterBank *SrcBank =
2708       OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2709 
2710     Register DstReg = MI.getOperand(0).getReg();
2711     LLT DstTy = MRI.getType(DstReg);
2712     if (DstTy.isScalar() &&
2713         SrcBank != &AMDGPU::SGPRRegBank &&
2714         SrcBank != &AMDGPU::VCCRegBank &&
2715         // FIXME: Should handle any type that rounds to s64 when irregular
2716         // breakdowns are supported.
2717         DstTy.getSizeInBits() == 64 &&
2718         SrcTy.getSizeInBits() <= 32) {
2719       SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2720 
2721       // Extend to 32-bit, and then extend the low half.
2722       if (Signed) {
2723         // TODO: Should really be buildSExtOrCopy
2724         B.buildSExtOrTrunc(DefRegs[0], SrcReg);
2725       } else if (Opc == AMDGPU::G_ZEXT) {
2726         B.buildZExtOrTrunc(DefRegs[0], SrcReg);
2727       } else {
2728         B.buildAnyExtOrTrunc(DefRegs[0], SrcReg);
2729       }
2730 
2731       extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank);
2732       MRI.setRegBank(DstReg, *SrcBank);
2733       MI.eraseFromParent();
2734       return;
2735     }
2736 
2737     if (SrcTy != LLT::scalar(1))
2738       return;
2739 
2740     // It is not legal to have a legalization artifact with a VCC source. Rather
2741     // than introducing a copy, emit the select that the copy would be lowered
2742     // to during selection.
2743     if (SrcBank == &AMDGPU::VCCRegBank) {
2744       SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2745 
2746       const RegisterBank *DstBank = &AMDGPU::VGPRRegBank;
2747 
2748       unsigned DstSize = DstTy.getSizeInBits();
2749       // 64-bit select is SGPR only
2750       const bool UseSel64 = DstSize > 32 &&
2751         SrcBank->getID() == AMDGPU::SGPRRegBankID;
2752 
2753       // TODO: Should s16 select be legal?
2754       LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32);
2755       auto True = B.buildConstant(SelType, Signed ? -1 : 1);
2756       auto False = B.buildConstant(SelType, 0);
2757 
2758       MRI.setRegBank(True.getReg(0), *DstBank);
2759       MRI.setRegBank(False.getReg(0), *DstBank);
2760       MRI.setRegBank(DstReg, *DstBank);
2761 
2762       if (DstSize > 32) {
2763         B.buildSelect(DefRegs[0], SrcReg, True, False);
2764         extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank, true);
2765       } else if (DstSize < 32) {
2766         auto Sel = B.buildSelect(SelType, SrcReg, True, False);
2767         MRI.setRegBank(Sel.getReg(0), *DstBank);
2768         B.buildTrunc(DstReg, Sel);
2769       } else {
2770         B.buildSelect(DstReg, SrcReg, True, False);
2771       }
2772 
2773       MI.eraseFromParent();
2774       return;
2775     }
2776 
2777     break;
2778   }
2779   case AMDGPU::G_EXTRACT_VECTOR_ELT: {
2780     SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
2781 
2782     assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty());
2783 
2784     Register DstReg = MI.getOperand(0).getReg();
2785     Register SrcReg = MI.getOperand(1).getReg();
2786 
2787     const LLT S32 = LLT::scalar(32);
2788     LLT DstTy = MRI.getType(DstReg);
2789     LLT SrcTy = MRI.getType(SrcReg);
2790 
2791     if (foldExtractEltToCmpSelect(B, MI, OpdMapper))
2792       return;
2793 
2794     const ValueMapping &DstMapping
2795       = OpdMapper.getInstrMapping().getOperandMapping(0);
2796     const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank;
2797     const RegisterBank *SrcBank =
2798       OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2799     const RegisterBank *IdxBank =
2800         OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2801 
2802     Register BaseIdxReg;
2803     unsigned ConstOffset;
2804     std::tie(BaseIdxReg, ConstOffset) =
2805         AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(2).getReg());
2806 
2807     // See if the index is an add of a constant which will be foldable by moving
2808     // the base register of the index later if this is going to be executed in a
2809     // waterfall loop. This is essentially to reassociate the add of a constant
2810     // with the readfirstlane.
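    //
    // Illustrative sketch (placeholder vregs): for an index of the form
    //   %idx = G_ADD %base, K
    // only %base needs to go through the waterfall loop's readfirstlane; the
    // constant K is re-applied to the index inside the loop by
    // reinsertVectorIndexAdd below.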
2811     bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
2812                                    ConstOffset > 0 &&
2813                                    ConstOffset < SrcTy.getNumElements();
2814 
2815     // Move the base register. We'll re-insert the add later.
2816     if (ShouldMoveIndexIntoLoop)
2817       MI.getOperand(2).setReg(BaseIdxReg);
2818 
2819     // If this is a VGPR result only because the index was a VGPR result, the
2820     // actual indexing will be done on the SGPR source vector, which will
2821     // produce a scalar result. We need to copy to the VGPR result inside the
2822     // waterfall loop.
2823     const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank &&
2824                                 SrcBank == &AMDGPU::SGPRRegBank;
2825     if (DstRegs.empty()) {
2826       applyDefaultMapping(OpdMapper);
2827 
2828       executeInWaterfallLoop(B, MI, {2});
2829 
2830       if (NeedCopyToVGPR) {
2831         // We don't want a phi for this temporary reg.
2832         Register TmpReg = MRI.createGenericVirtualRegister(DstTy);
2833         MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank);
2834         MI.getOperand(0).setReg(TmpReg);
2835         B.setInsertPt(*MI.getParent(), ++MI.getIterator());
2836 
2837         // Use a v_mov_b32 here to make the exec dependency explicit.
2838         buildVCopy(B, DstReg, TmpReg);
2839       }
2840 
2841       // Re-insert the constant offset add inside the waterfall loop.
2842       if (ShouldMoveIndexIntoLoop)
2843         reinsertVectorIndexAdd(B, MI, 2, ConstOffset);
2844 
2845       return;
2846     }
2847 
2848     assert(DstTy.getSizeInBits() == 64);
2849 
2850     LLT Vec32 = LLT::fixed_vector(2 * SrcTy.getNumElements(), 32);
2851 
2852     auto CastSrc = B.buildBitcast(Vec32, SrcReg);
2853     auto One = B.buildConstant(S32, 1);
2854 
2855     MachineBasicBlock::iterator MII = MI.getIterator();
2856 
2857     // Split the vector index into 32-bit pieces. Prepare to move all of the
2858     // new instructions into a waterfall loop if necessary.
2859     //
2860     // Don't put the bitcast or constant in the loop.
2861     MachineInstrSpan Span(MII, &B.getMBB());
2862 
2863     // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
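    // For example (illustrative): extracting s64 element 3 of a <4 x s64>
    // source reads s32 elements 6 and 7 of the bitcast <8 x s32> value.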
2864     auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
2865     auto IdxHi = B.buildAdd(S32, IdxLo, One);
2866 
2867     auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo);
2868     auto Extract1 = B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi);
2869 
2870     MRI.setRegBank(DstReg, *DstBank);
2871     MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
2872     MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
2873     MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
2874     MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
2875 
2876     SmallSet<Register, 4> OpsToWaterfall;
2877     if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) {
2878       MI.eraseFromParent();
2879       return;
2880     }
2881 
2882     // Remove the original instruction to avoid potentially confusing the
2883     // waterfall loop logic.
2884     B.setInstr(*Span.begin());
2885     MI.eraseFromParent();
2886     executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
2887                            OpsToWaterfall);
2888 
2889     if (NeedCopyToVGPR) {
2890       MachineBasicBlock *LoopBB = Extract1->getParent();
2891       Register TmpReg0 = MRI.createGenericVirtualRegister(S32);
2892       Register TmpReg1 = MRI.createGenericVirtualRegister(S32);
2893       MRI.setRegBank(TmpReg0, AMDGPU::SGPRRegBank);
2894       MRI.setRegBank(TmpReg1, AMDGPU::SGPRRegBank);
2895 
2896       Extract0->getOperand(0).setReg(TmpReg0);
2897       Extract1->getOperand(0).setReg(TmpReg1);
2898 
2899       B.setInsertPt(*LoopBB, ++Extract1->getIterator());
2900 
2901       buildVCopy(B, DstRegs[0], TmpReg0);
2902       buildVCopy(B, DstRegs[1], TmpReg1);
2903     }
2904 
2905     if (ShouldMoveIndexIntoLoop)
2906       reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
2907 
2908     return;
2909   }
2910   case AMDGPU::G_INSERT_VECTOR_ELT: {
2911     SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
2912 
2913     Register DstReg = MI.getOperand(0).getReg();
2914     LLT VecTy = MRI.getType(DstReg);
2915 
2916     assert(OpdMapper.getVRegs(0).empty());
2917     assert(OpdMapper.getVRegs(3).empty());
2918 
2919     if (substituteSimpleCopyRegs(OpdMapper, 1))
2920       MRI.setType(MI.getOperand(1).getReg(), VecTy);
2921 
2922     if (foldInsertEltToCmpSelect(B, MI, OpdMapper))
2923       return;
2924 
2925     const RegisterBank *IdxBank =
2926       OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
2927 
2928     Register SrcReg = MI.getOperand(1).getReg();
2929     Register InsReg = MI.getOperand(2).getReg();
2930     LLT InsTy = MRI.getType(InsReg);
2931     (void)InsTy;
2932 
2933     Register BaseIdxReg;
2934     unsigned ConstOffset;
2935     std::tie(BaseIdxReg, ConstOffset) =
2936         AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(3).getReg());
2937 
2938     // See if the index is an add of a constant which will be foldable by moving
2939     // the base register of the index later if this is going to be executed in a
2940     // waterfall loop. This is essentially to reassociate the add of a constant
2941     // with the readfirstlane.
2942     bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
2943       ConstOffset > 0 &&
2944       ConstOffset < VecTy.getNumElements();
2945 
2946     // Move the base register. We'll re-insert the add later.
2947     if (ShouldMoveIndexIntoLoop)
2948       MI.getOperand(3).setReg(BaseIdxReg);
2949 
2950 
2951     if (InsRegs.empty()) {
2952       executeInWaterfallLoop(B, MI, {3});
2953 
2954       // Re-insert the constant offset add inside the waterfall loop.
2955       if (ShouldMoveIndexIntoLoop) {
2956         reinsertVectorIndexAdd(B, MI, 3, ConstOffset);
2957       }
2958 
2959       return;
2960     }
2961 
2962     assert(InsTy.getSizeInBits() == 64);
2963 
2964     const LLT S32 = LLT::scalar(32);
2965     LLT Vec32 = LLT::fixed_vector(2 * VecTy.getNumElements(), 32);
2966 
2967     auto CastSrc = B.buildBitcast(Vec32, SrcReg);
2968     auto One = B.buildConstant(S32, 1);
2969 
2970     // Split the vector index into 32-bit pieces. Prepare to move all of the
2971     // new instructions into a waterfall loop if necessary.
2972     //
2973     // Don't put the bitcast or constant in the loop.
2974     MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB());
2975 
2976     // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
2977     auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
2978     auto IdxHi = B.buildAdd(S32, IdxLo, One);
2979 
2980     auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo);
2981     auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi);
2982 
2983     const RegisterBank *DstBank =
2984       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2985     const RegisterBank *SrcBank =
2986       OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2987     const RegisterBank *InsSrcBank =
2988       OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2989 
2990     MRI.setRegBank(InsReg, *InsSrcBank);
2991     MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
2992     MRI.setRegBank(InsLo.getReg(0), *DstBank);
2993     MRI.setRegBank(InsHi.getReg(0), *DstBank);
2994     MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
2995     MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
2996     MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
2997 
2998 
2999     SmallSet<Register, 4> OpsToWaterfall;
3000     if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) {
3001       B.setInsertPt(B.getMBB(), MI);
3002       B.buildBitcast(DstReg, InsHi);
3003       MI.eraseFromParent();
3004       return;
3005     }
3006 
3007     B.setInstr(*Span.begin());
3008     MI.eraseFromParent();
3009 
3010     // Figure out the point after the waterfall loop before mangling the control
3011     // flow.
3012     executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
3013                            OpsToWaterfall);
3014 
3015     // The insertion point is now right after the original instruction.
3016     //
3017     // Keep the bitcast to the original vector type out of the loop. Doing this
3018     // saves an extra phi we don't need inside the loop.
3019     B.buildBitcast(DstReg, InsHi);
3020 
3021     // Re-insert the constant offset add inside the waterfall loop.
3022     if (ShouldMoveIndexIntoLoop)
3023       reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
3024 
3025     return;
3026   }
3027   case AMDGPU::G_AMDGPU_BUFFER_LOAD:
3028   case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
3029   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
3030   case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
3031   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
3032   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
3033   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE:
3034   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
3035   case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
3036   case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
3037   case AMDGPU::G_AMDGPU_BUFFER_STORE:
3038   case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
3039   case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
3040   case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
3041   case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16:
3042   case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
3043   case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: {
3044     applyDefaultMapping(OpdMapper);
3045     executeInWaterfallLoop(B, MI, {1, 4});
3046     return;
3047   }
3048   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
3049   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
3050   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
3051   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
3052   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
3053   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
3054   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
3055   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
3056   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
3057   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
3058   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
3059   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: {
3060     applyDefaultMapping(OpdMapper);
3061     executeInWaterfallLoop(B, MI, {2, 5});
3062     return;
3063   }
3064   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
3065   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
3066   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
3067     applyDefaultMapping(OpdMapper);
3068     executeInWaterfallLoop(B, MI, {2, 5});
3069     return;
3070   }
3071   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
3072     applyDefaultMapping(OpdMapper);
3073     executeInWaterfallLoop(B, MI, {3, 6});
3074     return;
3075   }
3076   case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
3077     applyMappingSBufferLoad(B, OpdMapper);
3078     return;
3079   }
3080   case AMDGPU::G_INTRINSIC:
3081   case AMDGPU::G_INTRINSIC_CONVERGENT: {
3082     switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
3083     case Intrinsic::amdgcn_readlane: {
3084       substituteSimpleCopyRegs(OpdMapper, 2);
3085 
3086       assert(OpdMapper.getVRegs(0).empty());
3087       assert(OpdMapper.getVRegs(3).empty());
3088 
3089       // Make sure the index is an SGPR. It doesn't make sense to run this in a
3090       // waterfall loop, so assume it's a uniform value.
3091       constrainOpWithReadfirstlane(B, MI, 3); // Index
3092       return;
3093     }
3094     case Intrinsic::amdgcn_writelane: {
3095       assert(OpdMapper.getVRegs(0).empty());
3096       assert(OpdMapper.getVRegs(2).empty());
3097       assert(OpdMapper.getVRegs(3).empty());
3098 
3099       substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val
3100       constrainOpWithReadfirstlane(B, MI, 2); // Source value
3101       constrainOpWithReadfirstlane(B, MI, 3); // Index
3102       return;
3103     }
3104     case Intrinsic::amdgcn_interp_p1:
3105     case Intrinsic::amdgcn_interp_p2:
3106     case Intrinsic::amdgcn_interp_mov:
3107     case Intrinsic::amdgcn_interp_p1_f16:
3108     case Intrinsic::amdgcn_interp_p2_f16:
3109     case Intrinsic::amdgcn_lds_param_load: {
3110       applyDefaultMapping(OpdMapper);
3111 
3112       // Readfirstlane for m0 value, which is always the last operand.
3113       // FIXME: Should this be a waterfall loop instead?
3114       constrainOpWithReadfirstlane(B, MI, MI.getNumOperands() - 1); // Index
3115       return;
3116     }
3117     case Intrinsic::amdgcn_interp_inreg_p10:
3118     case Intrinsic::amdgcn_interp_inreg_p2:
3119     case Intrinsic::amdgcn_interp_inreg_p10_f16:
3120     case Intrinsic::amdgcn_interp_inreg_p2_f16:
3121       applyDefaultMapping(OpdMapper);
3122       return;
3123     case Intrinsic::amdgcn_permlane16:
3124     case Intrinsic::amdgcn_permlanex16: {
3125       // Doing a waterfall loop over these wouldn't make any sense.
3126       substituteSimpleCopyRegs(OpdMapper, 2);
3127       substituteSimpleCopyRegs(OpdMapper, 3);
3128       constrainOpWithReadfirstlane(B, MI, 4);
3129       constrainOpWithReadfirstlane(B, MI, 5);
3130       return;
3131     }
3132     case Intrinsic::amdgcn_sbfe:
3133       applyMappingBFE(B, OpdMapper, true);
3134       return;
3135     case Intrinsic::amdgcn_ubfe:
3136       applyMappingBFE(B, OpdMapper, false);
3137       return;
3138     case Intrinsic::amdgcn_inverse_ballot:
3139     case Intrinsic::amdgcn_s_bitreplicate:
3140     case Intrinsic::amdgcn_s_quadmask:
3141     case Intrinsic::amdgcn_s_wqm:
3142       applyDefaultMapping(OpdMapper);
3143       constrainOpWithReadfirstlane(B, MI, 2); // Mask
3144       return;
3145     case Intrinsic::amdgcn_ballot:
3146       // Use default handling and insert copy to vcc source.
3147       break;
3148     }
3149     break;
3150   }
3151   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
3152   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
3153   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
3154   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
3155     const AMDGPU::RsrcIntrinsic *RSrcIntrin =
3156         AMDGPU::lookupRsrcIntrinsic(AMDGPU::getIntrinsicID(MI));
3157     assert(RSrcIntrin && RSrcIntrin->IsImage);
3158     // Non-images can have complications from operands that allow both SGPR
3159     // and VGPR. For now it's too complicated to figure out the final opcode
3160     // to derive the register bank from the MCInstrDesc.
3161     applyMappingImage(B, MI, OpdMapper, RSrcIntrin->RsrcArg);
3162     return;
3163   }
3164   case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
3165     unsigned N = MI.getNumExplicitOperands() - 2;
3166     applyDefaultMapping(OpdMapper);
3167     executeInWaterfallLoop(B, MI, {N});
3168     return;
3169   }
3170   case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
3171   case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS: {
3172     auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
3173     switch (IntrID) {
3174     case Intrinsic::amdgcn_ds_ordered_add:
3175     case Intrinsic::amdgcn_ds_ordered_swap: {
3176       // This is only allowed to execute with 1 lane, so readfirstlane is safe.
3177       assert(OpdMapper.getVRegs(0).empty());
3178       substituteSimpleCopyRegs(OpdMapper, 3);
3179       constrainOpWithReadfirstlane(B, MI, 2); // M0
3180       return;
3181     }
3182     case Intrinsic::amdgcn_ds_gws_init:
3183     case Intrinsic::amdgcn_ds_gws_barrier:
3184     case Intrinsic::amdgcn_ds_gws_sema_br: {
3185       // Only the first lane executes, so readfirstlane is safe.
3186       substituteSimpleCopyRegs(OpdMapper, 1);
3187       constrainOpWithReadfirstlane(B, MI, 2); // M0
3188       return;
3189     }
3190     case Intrinsic::amdgcn_ds_gws_sema_v:
3191     case Intrinsic::amdgcn_ds_gws_sema_p:
3192     case Intrinsic::amdgcn_ds_gws_sema_release_all: {
3193       // Only the first lane executes, so readfirstlane is safe.
3194       constrainOpWithReadfirstlane(B, MI, 1); // M0
3195       return;
3196     }
3197     case Intrinsic::amdgcn_ds_append:
3198     case Intrinsic::amdgcn_ds_consume: {
3199       constrainOpWithReadfirstlane(B, MI, 2); // M0
3200       return;
3201     }
3202     case Intrinsic::amdgcn_s_sendmsg:
3203     case Intrinsic::amdgcn_s_sendmsghalt: {
3204       // FIXME: Should this use a waterfall loop?
3205       constrainOpWithReadfirstlane(B, MI, 2); // M0
3206       return;
3207     }
3208     case Intrinsic::amdgcn_s_setreg: {
3209       constrainOpWithReadfirstlane(B, MI, 2);
3210       return;
3211     }
3212     case Intrinsic::amdgcn_s_ttracedata:
3213       constrainOpWithReadfirstlane(B, MI, 1); // M0
3214       return;
3215     case Intrinsic::amdgcn_raw_buffer_load_lds:
3216     case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: {
3217       applyDefaultMapping(OpdMapper);
3218       constrainOpWithReadfirstlane(B, MI, 1); // rsrc
3219       constrainOpWithReadfirstlane(B, MI, 2); // M0
3220       constrainOpWithReadfirstlane(B, MI, 5); // soffset
3221       return;
3222     }
3223     case Intrinsic::amdgcn_struct_buffer_load_lds:
3224     case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
3225       applyDefaultMapping(OpdMapper);
3226       constrainOpWithReadfirstlane(B, MI, 1); // rsrc
3227       constrainOpWithReadfirstlane(B, MI, 2); // M0
3228       constrainOpWithReadfirstlane(B, MI, 6); // soffset
3229       return;
3230     }
3231     case Intrinsic::amdgcn_global_load_lds: {
3232       applyDefaultMapping(OpdMapper);
3233       constrainOpWithReadfirstlane(B, MI, 2);
3234       return;
3235     }
3236     case Intrinsic::amdgcn_lds_direct_load: {
3237       applyDefaultMapping(OpdMapper);
3238       // Readfirstlane for m0 value, which is always the last operand.
3239       constrainOpWithReadfirstlane(B, MI, MI.getNumOperands() - 1); // Index
3240       return;
3241     }
3242     case Intrinsic::amdgcn_exp_row:
3243       applyDefaultMapping(OpdMapper);
3244       constrainOpWithReadfirstlane(B, MI, 8); // M0
3245       return;
3246     case Intrinsic::amdgcn_s_sleep_var:
3247       assert(OpdMapper.getVRegs(1).empty());
3248       constrainOpWithReadfirstlane(B, MI, 1);
3249       return;
3250     case Intrinsic::amdgcn_s_barrier_signal_var:
3251     case Intrinsic::amdgcn_s_barrier_join:
3252     case Intrinsic::amdgcn_s_wakeup_barrier:
3253       constrainOpWithReadfirstlane(B, MI, 1);
3254       return;
3255     case Intrinsic::amdgcn_s_barrier_signal_isfirst_var:
3256       constrainOpWithReadfirstlane(B, MI, 2);
3257       return;
3258     case Intrinsic::amdgcn_s_barrier_init:
3259       constrainOpWithReadfirstlane(B, MI, 1);
3260       constrainOpWithReadfirstlane(B, MI, 2);
3261       return;
3262     case Intrinsic::amdgcn_s_get_barrier_state: {
3263       constrainOpWithReadfirstlane(B, MI, 2);
3264       return;
3265     }
3266     default: {
3267       if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
3268               AMDGPU::lookupRsrcIntrinsic(IntrID)) {
3269         // Non-images can have complications from operands that allow both SGPR
3270         // and VGPR. For now it's too complicated to figure out the final opcode
3271         // to derive the register bank from the MCInstrDesc.
3272         if (RSrcIntrin->IsImage) {
3273           applyMappingImage(B, MI, OpdMapper, RSrcIntrin->RsrcArg);
3274           return;
3275         }
3276       }
3277 
3278       break;
3279     }
3280     }
3281     break;
3282   }
3283   case AMDGPU::G_SI_CALL: {
3284     // Use a set to avoid extra readfirstlanes in the case where multiple
3285     // operands are the same register.
3286     SmallSet<Register, 4> SGPROperandRegs;
3287 
3288     if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, {1}))
3289       break;
3290 
3291     // Move all copies to physical SGPRs that are used by the call instruction
3292     // into the loop block. Search backwards from the call for these copies,
3293     // stopping at the ADJCALLSTACKUP.
3294     unsigned FrameSetupOpcode = AMDGPU::ADJCALLSTACKUP;
3295     unsigned FrameDestroyOpcode = AMDGPU::ADJCALLSTACKDOWN;
3296 
3297     // Move all non-copies before the copies, so that a complete range can be
3298     // moved into the waterfall loop.
3299     SmallVector<MachineInstr *, 4> NonCopyInstrs;
3300     // Count of NonCopyInstrs found until the current LastCopy.
3301     unsigned NonCopyInstrsLen = 0;
3302     MachineBasicBlock::iterator Start(&MI);
3303     MachineBasicBlock::iterator LastCopy = Start;
3304     MachineBasicBlock *MBB = MI.getParent();
3305     const SIMachineFunctionInfo *Info =
3306         MBB->getParent()->getInfo<SIMachineFunctionInfo>();
3307     while (Start->getOpcode() != FrameSetupOpcode) {
3308       --Start;
3309       bool IsCopy = false;
3310       if (Start->getOpcode() == AMDGPU::COPY) {
3311         auto &Dst = Start->getOperand(0);
3312         if (Dst.isReg()) {
3313           Register Reg = Dst.getReg();
3314           if (Reg.isPhysical() && MI.readsRegister(Reg, TRI)) {
3315             IsCopy = true;
3316           } else {
3317             // Also move the copy from the scratch rsrc descriptor into the loop
3318             // to allow it to be optimized away.
3319             auto &Src = Start->getOperand(1);
3320             if (Src.isReg()) {
3321               Reg = Src.getReg();
3322               IsCopy = Info->getScratchRSrcReg() == Reg;
3323             }
3324           }
3325         }
3326       }
3327 
3328       if (IsCopy) {
3329         LastCopy = Start;
3330         NonCopyInstrsLen = NonCopyInstrs.size();
3331       } else {
3332         NonCopyInstrs.push_back(&*Start);
3333       }
3334     }
3335     NonCopyInstrs.resize(NonCopyInstrsLen);
3336 
3337     for (auto *NonCopy : reverse(NonCopyInstrs)) {
3338       MBB->splice(LastCopy, MBB, NonCopy->getIterator());
3339     }
3340     Start = LastCopy;
3341 
3342     // Do the same for copies after the loop
3343     NonCopyInstrs.clear();
3344     NonCopyInstrsLen = 0;
3345     MachineBasicBlock::iterator End(&MI);
3346     LastCopy = End;
3347     while (End->getOpcode() != FrameDestroyOpcode) {
3348       ++End;
3349       bool IsCopy = false;
3350       if (End->getOpcode() == AMDGPU::COPY) {
3351         auto &Src = End->getOperand(1);
3352         if (Src.isReg()) {
3353           Register Reg = Src.getReg();
3354           IsCopy = Reg.isPhysical() && MI.modifiesRegister(Reg, TRI);
3355         }
3356       }
3357 
3358       if (IsCopy) {
3359         LastCopy = End;
3360         NonCopyInstrsLen = NonCopyInstrs.size();
3361       } else {
3362         NonCopyInstrs.push_back(&*End);
3363       }
3364     }
3365     NonCopyInstrs.resize(NonCopyInstrsLen);
3366 
3367     End = LastCopy;
3368     ++LastCopy;
3369     for (auto *NonCopy : reverse(NonCopyInstrs)) {
3370       MBB->splice(LastCopy, MBB, NonCopy->getIterator());
3371     }
3372 
3373     ++End;
3374     B.setInsertPt(B.getMBB(), Start);
3375     executeInWaterfallLoop(B, make_range(Start, End), SGPROperandRegs);
3376     break;
3377   }
3378   case AMDGPU::G_LOAD:
3379   case AMDGPU::G_ZEXTLOAD:
3380   case AMDGPU::G_SEXTLOAD: {
3381     if (applyMappingLoad(B, OpdMapper, MI))
3382       return;
3383     break;
3384   }
3385   case AMDGPU::G_DYN_STACKALLOC:
3386     applyMappingDynStackAlloc(B, OpdMapper, MI);
3387     return;
3388   case AMDGPU::G_STACKRESTORE: {
3389     applyDefaultMapping(OpdMapper);
3390     constrainOpWithReadfirstlane(B, MI, 0);
3391     return;
3392   }
3393   case AMDGPU::G_SBFX:
3394     applyMappingBFE(B, OpdMapper, /*Signed*/ true);
3395     return;
3396   case AMDGPU::G_UBFX:
3397     applyMappingBFE(B, OpdMapper, /*Signed*/ false);
3398     return;
3399   case AMDGPU::G_AMDGPU_MAD_U64_U32:
3400   case AMDGPU::G_AMDGPU_MAD_I64_I32:
3401     applyMappingMAD_64_32(B, OpdMapper);
3402     return;
3403   case AMDGPU::G_PREFETCH: {
3404     if (!Subtarget.hasPrefetch()) {
3405       MI.eraseFromParent();
3406       return;
3407     }
3408     Register PtrReg = MI.getOperand(0).getReg();
3409     unsigned PtrBank = getRegBankID(PtrReg, MRI, AMDGPU::SGPRRegBankID);
3410     if (PtrBank == AMDGPU::VGPRRegBankID) {
3411       MI.eraseFromParent();
3412       return;
3413     }
3414     unsigned AS = MRI.getType(PtrReg).getAddressSpace();
3415     if (!AMDGPU::isFlatGlobalAddrSpace(AS) &&
3416         AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
3417       MI.eraseFromParent();
3418       return;
3419     }
3420     applyDefaultMapping(OpdMapper);
3421     return;
3422   }
3423   default:
3424     break;
3425   }
3426 
3427   return applyDefaultMapping(OpdMapper);
3428 }
3429 
3430 // vgpr, sgpr -> vgpr
3431 // vgpr, agpr -> vgpr
3432 // agpr, agpr -> agpr
3433 // agpr, sgpr -> vgpr
3434 static unsigned regBankUnion(unsigned RB0, unsigned RB1) {
3435   if (RB0 == AMDGPU::InvalidRegBankID)
3436     return RB1;
3437   if (RB1 == AMDGPU::InvalidRegBankID)
3438     return RB0;
3439 
3440   if (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID)
3441     return AMDGPU::SGPRRegBankID;
3442 
3443   if (RB0 == AMDGPU::AGPRRegBankID && RB1 == AMDGPU::AGPRRegBankID)
3444     return AMDGPU::AGPRRegBankID;
3445 
3446   return AMDGPU::VGPRRegBankID;
3447 }
3448 
3449 static unsigned regBankBoolUnion(unsigned RB0, unsigned RB1) {
3450   if (RB0 == AMDGPU::InvalidRegBankID)
3451     return RB1;
3452   if (RB1 == AMDGPU::InvalidRegBankID)
3453     return RB0;
3454 
3455   // vcc, vcc -> vcc
3456   // vcc, sgpr -> vcc
3457   // vcc, vgpr -> vcc
3458   if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID)
3459     return AMDGPU::VCCRegBankID;
3460 
3461   // The remaining combinations (no vcc operand) follow regBankUnion.
3462   return regBankUnion(RB0, RB1);
3463 }
3464 
3465 unsigned AMDGPURegisterBankInfo::getMappingType(const MachineRegisterInfo &MRI,
3466                                                 const MachineInstr &MI) const {
3467   unsigned RegBank = AMDGPU::InvalidRegBankID;
3468 
3469   for (const MachineOperand &MO : MI.operands()) {
3470     if (!MO.isReg())
3471       continue;
3472     Register Reg = MO.getReg();
3473     if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3474       RegBank = regBankUnion(RegBank, Bank->getID());
3475       if (RegBank == AMDGPU::VGPRRegBankID)
3476         break;
3477     }
3478   }
3479 
3480   return RegBank;
3481 }
3482 
3483 bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
3484   const MachineFunction &MF = *MI.getParent()->getParent();
3485   const MachineRegisterInfo &MRI = MF.getRegInfo();
3486   for (const MachineOperand &MO : MI.operands()) {
3487     if (!MO.isReg())
3488       continue;
3489     Register Reg = MO.getReg();
3490     if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3491       if (Bank->getID() != AMDGPU::SGPRRegBankID)
3492         return false;
3493     }
3494   }
3495   return true;
3496 }
3497 
3498 const RegisterBankInfo::InstructionMapping &
3499 AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
3500   const MachineFunction &MF = *MI.getParent()->getParent();
3501   const MachineRegisterInfo &MRI = MF.getRegInfo();
3502   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3503 
3504   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3505     const MachineOperand &SrcOp = MI.getOperand(i);
3506     if (!SrcOp.isReg())
3507       continue;
3508 
3509     unsigned Size = getSizeInBits(SrcOp.getReg(), MRI, *TRI);
3510     OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3511   }
3512   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3513                                MI.getNumOperands());
3514 }
3515 
3516 const RegisterBankInfo::InstructionMapping &
3517 AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
3518   const MachineFunction &MF = *MI.getParent()->getParent();
3519   const MachineRegisterInfo &MRI = MF.getRegInfo();
3520   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3521 
3522   // Even though we technically could use SGPRs, this would require knowledge of
3523   // the constant bus restriction. Force all sources to VGPR (except for VCC).
3524   //
3525   // TODO: Unary ops are trivially OK, so accept SGPRs?
3526   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3527     const MachineOperand &Src = MI.getOperand(i);
3528     if (!Src.isReg())
3529       continue;
3530 
3531     unsigned Size = getSizeInBits(Src.getReg(), MRI, *TRI);
3532     unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
3533     OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
3534   }
3535 
3536   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3537                                MI.getNumOperands());
3538 }
3539 
3540 const RegisterBankInfo::InstructionMapping &
3541 AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
3542   const MachineFunction &MF = *MI.getParent()->getParent();
3543   const MachineRegisterInfo &MRI = MF.getRegInfo();
3544   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3545 
3546   for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
3547     const MachineOperand &Op = MI.getOperand(I);
3548     if (!Op.isReg())
3549       continue;
3550 
3551     unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI);
3552     OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3553   }
3554 
3555   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3556                                MI.getNumOperands());
3557 }
3558 
3559 const RegisterBankInfo::InstructionMapping &
3560 AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI,
3561                                         const MachineInstr &MI,
3562                                         int RsrcIdx) const {
3563   // The reported argument index is relative to the IR intrinsic call arguments,
3564   // so we need to shift by the number of defs and the intrinsic ID.
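  // For example (illustrative), with a single def, IR argument 0 corresponds
  // to machine operand 2 (def, intrinsic ID, then the IR arguments).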
3565   RsrcIdx += MI.getNumExplicitDefs() + 1;
3566 
3567   const int NumOps = MI.getNumOperands();
3568   SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps);
3569 
3570   // TODO: Should packed/unpacked D16 difference be reported here as part of
3571   // the value mapping?
3572   for (int I = 0; I != NumOps; ++I) {
3573     if (!MI.getOperand(I).isReg())
3574       continue;
3575 
3576     Register OpReg = MI.getOperand(I).getReg();
3577     // We replace some dead address operands with $noreg
3578     if (!OpReg)
3579       continue;
3580 
3581     unsigned Size = getSizeInBits(OpReg, MRI, *TRI);
3582 
3583     // FIXME: Probably need a new intrinsic register bank searchable table to
3584     // handle arbitrary intrinsics easily.
3585     //
3586     // If this has a sampler, it immediately follows rsrc.
3587     const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1;
3588 
3589     if (MustBeSGPR) {
3590       // If this must be an SGPR, we must report whatever it is as legal.
3591       unsigned NewBank = getRegBankID(OpReg, MRI, AMDGPU::SGPRRegBankID);
3592       OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size);
3593     } else {
3594       // Some operands must be VGPR, and these are easy to copy to.
3595       OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3596     }
3597   }
3598 
3599   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps);
3600 }
3601 
3602 /// Return the mapping for a pointer argument.
3603 const RegisterBankInfo::ValueMapping *
3604 AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI,
3605                                               Register PtrReg) const {
3606   LLT PtrTy = MRI.getType(PtrReg);
3607   unsigned Size = PtrTy.getSizeInBits();
3608   if (Subtarget.useFlatForGlobal() ||
3609       !AMDGPU::isFlatGlobalAddrSpace(PtrTy.getAddressSpace()))
3610     return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3611 
3612   // If we're using MUBUF instructions for global memory, an SGPR base register
3613   // is possible. Otherwise this needs to be a VGPR.
3614   const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3615   return AMDGPU::getValueMapping(PtrBank->getID(), Size);
3616 }
3617 
3618 const RegisterBankInfo::InstructionMapping &
3619 AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
3620 
3621   const MachineFunction &MF = *MI.getParent()->getParent();
3622   const MachineRegisterInfo &MRI = MF.getRegInfo();
3623   SmallVector<const ValueMapping*, 2> OpdsMapping(2);
3624   unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3625   Register PtrReg = MI.getOperand(1).getReg();
3626   LLT PtrTy = MRI.getType(PtrReg);
3627   unsigned AS = PtrTy.getAddressSpace();
3628   unsigned PtrSize = PtrTy.getSizeInBits();
3629 
3630   const ValueMapping *ValMapping;
3631   const ValueMapping *PtrMapping;
3632 
3633   const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3634 
3635   if (PtrBank == &AMDGPU::SGPRRegBank && AMDGPU::isFlatGlobalAddrSpace(AS)) {
3636     if (isScalarLoadLegal(MI)) {
3637       // We have a uniform instruction so we want to use an SMRD load
3638       ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3639       PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
3640     } else {
3641       ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3642 
3643       // If we're using MUBUF instructions for global memory, an SGPR base
3644       // register is possible. Otherwise this needs to be a VGPR.
3645       unsigned PtrBankID = Subtarget.useFlatForGlobal() ?
3646         AMDGPU::VGPRRegBankID : AMDGPU::SGPRRegBankID;
3647 
3648       PtrMapping = AMDGPU::getValueMapping(PtrBankID, PtrSize);
3649     }
3650   } else {
3651     ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3652     PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
3653   }
3654 
3655   OpdsMapping[0] = ValMapping;
3656   OpdsMapping[1] = PtrMapping;
3657   const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
3658       1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands());
3659   return Mapping;
3660 
3661   // FIXME: Do we want to add a mapping for FLAT load, or should we just
3662   // handle that during instruction selection?
3663 }
3664 
3665 unsigned
3666 AMDGPURegisterBankInfo::getRegBankID(Register Reg,
3667                                      const MachineRegisterInfo &MRI,
3668                                      unsigned Default) const {
3669   const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
3670   return Bank ? Bank->getID() : Default;
3671 }
3672 
3673 const RegisterBankInfo::ValueMapping *
3674 AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg,
3675                                          const MachineRegisterInfo &MRI,
3676                                          const TargetRegisterInfo &TRI) const {
3677   // Lie and claim anything is legal, even though this needs to be an SGPR;
3678   // applyMapping will have to deal with it as a waterfall loop.
3679   unsigned Bank = getRegBankID(Reg, MRI, AMDGPU::SGPRRegBankID);
3680   unsigned Size = getSizeInBits(Reg, MRI, TRI);
3681   return AMDGPU::getValueMapping(Bank, Size);
3682 }
3683 
3684 const RegisterBankInfo::ValueMapping *
3685 AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
3686                                          const MachineRegisterInfo &MRI,
3687                                          const TargetRegisterInfo &TRI) const {
3688   unsigned Size = getSizeInBits(Reg, MRI, TRI);
3689   return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3690 }
3691 
3692 const RegisterBankInfo::ValueMapping *
3693 AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg,
3694                                          const MachineRegisterInfo &MRI,
3695                                          const TargetRegisterInfo &TRI) const {
3696   unsigned Size = getSizeInBits(Reg, MRI, TRI);
3697   return AMDGPU::getValueMapping(AMDGPU::AGPRRegBankID, Size);
3698 }
3699 
3700 ///
3701 /// This function must return a legal mapping, because
3702 /// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
3703 /// in RegBankSelect::Mode::Fast.  Any mapping that would cause a
3704 /// VGPR to SGPR copy to be generated is illegal.
3705 ///
3706 // Operands that must be SGPRs must accept potentially divergent VGPRs as
3707 // legal. These will be dealt with in applyMappingImpl.
3708 //
3709 const RegisterBankInfo::InstructionMapping &
3710 AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
3711   const MachineFunction &MF = *MI.getParent()->getParent();
3712   const MachineRegisterInfo &MRI = MF.getRegInfo();
3713 
3714   if (MI.isCopy() || MI.getOpcode() == AMDGPU::G_FREEZE) {
3715     // The default logic bothers to analyze impossible alternative mappings. We
3716     // want the most straightforward mapping, so just directly handle this.
3717     const RegisterBank *DstBank = getRegBank(MI.getOperand(0).getReg(), MRI,
3718                                              *TRI);
3719     const RegisterBank *SrcBank = getRegBank(MI.getOperand(1).getReg(), MRI,
3720                                              *TRI);
3721     assert(SrcBank && "src bank should have been assigned already");
3722     if (!DstBank)
3723       DstBank = SrcBank;
3724 
3725     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3726     if (MI.getOpcode() != AMDGPU::G_FREEZE &&
3727         cannotCopy(*DstBank, *SrcBank, TypeSize::getFixed(Size)))
3728       return getInvalidInstructionMapping();
3729 
3730     const ValueMapping &ValMap = getValueMapping(0, Size, *DstBank);
3731     unsigned OpdsMappingSize = MI.isCopy() ? 1 : 2;
3732     SmallVector<const ValueMapping *, 1> OpdsMapping(OpdsMappingSize);
3733     OpdsMapping[0] = &ValMap;
3734     if (MI.getOpcode() == AMDGPU::G_FREEZE)
3735       OpdsMapping[1] = &ValMap;
3736 
3737     return getInstructionMapping(
3738         1, /*Cost*/ 1,
3739         /*OperandsMapping*/ getOperandsMapping(OpdsMapping), OpdsMappingSize);
3740   }
3741 
3742   if (MI.isRegSequence()) {
3743     // If any input is a VGPR, the result must be a VGPR. The default handling
3744     // assumes any copy between banks is legal.
3745     unsigned BankID = AMDGPU::SGPRRegBankID;
3746 
3747     for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3748       auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI);
3749       // It doesn't make sense to use vcc or scc banks here, so just ignore
3750       // them.
3751       if (OpBank != AMDGPU::SGPRRegBankID) {
3752         BankID = AMDGPU::VGPRRegBankID;
3753         break;
3754       }
3755     }
3756     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3757 
3758     const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID));
3759     return getInstructionMapping(
3760         1, /*Cost*/ 1,
3761         /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3762   }
3763 
3764   // The default handling is broken and doesn't handle illegal SGPR->VGPR copies
3765   // properly.
3766   //
3767   // TODO: There are additional exec masking dependencies to analyze.
3768   if (MI.getOpcode() == TargetOpcode::G_PHI) {
3769     unsigned ResultBank = AMDGPU::InvalidRegBankID;
3770     Register DstReg = MI.getOperand(0).getReg();
3771 
3772     // Sometimes the result may have already been assigned a bank.
3773     if (const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI))
3774       ResultBank = DstBank->getID();
3775 
3776     for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3777       Register Reg = MI.getOperand(I).getReg();
3778       const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
3779 
3780       // FIXME: Assuming VGPR for any undetermined inputs.
3781       if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) {
3782         ResultBank = AMDGPU::VGPRRegBankID;
3783         break;
3784       }
3785 
3786       // FIXME: Need to promote SGPR case to s32
3787       unsigned OpBank = Bank->getID();
3788       ResultBank = regBankBoolUnion(ResultBank, OpBank);
3789     }
3790 
3791     assert(ResultBank != AMDGPU::InvalidRegBankID);
3792 
3793     unsigned Size = MRI.getType(DstReg).getSizeInBits();
3794 
3795     const ValueMapping &ValMap =
3796         getValueMapping(0, Size, getRegBank(ResultBank));
3797     return getInstructionMapping(
3798         1, /*Cost*/ 1,
3799         /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3800   }
3801 
3802   const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
3803   if (Mapping.isValid())
3804     return Mapping;
3805 
3806   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3807 
3808   switch (MI.getOpcode()) {
3809   default:
3810     return getInvalidInstructionMapping();
3811 
3812   case AMDGPU::G_AND:
3813   case AMDGPU::G_OR:
3814   case AMDGPU::G_XOR:
3815   case AMDGPU::G_MUL: {
3816     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3817     if (Size == 1) {
3818       const RegisterBank *DstBank
3819         = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI);
3820 
3821       unsigned TargetBankID = AMDGPU::InvalidRegBankID;
3822       unsigned BankLHS = AMDGPU::InvalidRegBankID;
3823       unsigned BankRHS = AMDGPU::InvalidRegBankID;
3824       if (DstBank) {
3825         TargetBankID = DstBank->getID();
3826         if (DstBank == &AMDGPU::VCCRegBank) {
3827           TargetBankID = AMDGPU::VCCRegBankID;
3828           BankLHS = AMDGPU::VCCRegBankID;
3829           BankRHS = AMDGPU::VCCRegBankID;
3830         } else {
3831           BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
3832                                  AMDGPU::SGPRRegBankID);
3833           BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
3834                                  AMDGPU::SGPRRegBankID);
3835         }
3836       } else {
3837         BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
3838                                AMDGPU::VCCRegBankID);
3839         BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
3840                                AMDGPU::VCCRegBankID);
3841 
3842         // Both inputs should be true booleans to produce a boolean result.
3843         if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) {
3844           TargetBankID = AMDGPU::VGPRRegBankID;
3845         } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) {
3846           TargetBankID = AMDGPU::VCCRegBankID;
3847           BankLHS = AMDGPU::VCCRegBankID;
3848           BankRHS = AMDGPU::VCCRegBankID;
3849         } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) {
3850           TargetBankID = AMDGPU::SGPRRegBankID;
3851         }
3852       }
3853 
3854       OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size);
3855       OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size);
3856       OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size);
3857       break;
3858     }
3859 
3860     if (Size == 64) {
3861 
3862       if (isSALUMapping(MI)) {
3863         OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size);
3864         OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
3865       } else {
3866         OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
3867         unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI /*, DefaultBankID*/);
3868         OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size);
3869 
3870         unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI /*, DefaultBankID*/);
3871         OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size);
3872       }
3873 
3874       break;
3875     }
3876 
3877     [[fallthrough]];
3878   }
3879   case AMDGPU::G_PTR_ADD:
3880   case AMDGPU::G_PTRMASK:
3881   case AMDGPU::G_ADD:
3882   case AMDGPU::G_SUB:
3883   case AMDGPU::G_SHL:
3884   case AMDGPU::G_LSHR:
3885   case AMDGPU::G_ASHR:
3886   case AMDGPU::G_UADDO:
3887   case AMDGPU::G_USUBO:
3888   case AMDGPU::G_UADDE:
3889   case AMDGPU::G_SADDE:
3890   case AMDGPU::G_USUBE:
3891   case AMDGPU::G_SSUBE:
3892   case AMDGPU::G_SMIN:
3893   case AMDGPU::G_SMAX:
3894   case AMDGPU::G_UMIN:
3895   case AMDGPU::G_UMAX:
3896   case AMDGPU::G_ABS:
3897   case AMDGPU::G_SHUFFLE_VECTOR:
3898   case AMDGPU::G_SBFX:
3899   case AMDGPU::G_UBFX:
3900   case AMDGPU::G_AMDGPU_S_MUL_I64_I32:
3901   case AMDGPU::G_AMDGPU_S_MUL_U64_U32:
3902     if (isSALUMapping(MI))
3903       return getDefaultMappingSOP(MI);
3904     return getDefaultMappingVOP(MI);
3905   case AMDGPU::G_FADD:
3906   case AMDGPU::G_FSUB:
3907   case AMDGPU::G_FMUL:
3908   case AMDGPU::G_FMA:
3909   case AMDGPU::G_FFLOOR:
3910   case AMDGPU::G_FCEIL:
3911   case AMDGPU::G_INTRINSIC_ROUNDEVEN:
3912   case AMDGPU::G_FMINNUM:
3913   case AMDGPU::G_FMAXNUM:
3914   case AMDGPU::G_FMINIMUM:
3915   case AMDGPU::G_FMAXIMUM:
3916   case AMDGPU::G_INTRINSIC_TRUNC:
3917   case AMDGPU::G_STRICT_FADD:
3918   case AMDGPU::G_STRICT_FSUB:
3919   case AMDGPU::G_STRICT_FMUL:
3920   case AMDGPU::G_STRICT_FMA: {
3921     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3922     unsigned Size = Ty.getSizeInBits();
3923     if (Subtarget.hasSALUFloatInsts() && Ty.isScalar() &&
3924         (Size == 32 || Size == 16) && isSALUMapping(MI))
3925       return getDefaultMappingSOP(MI);
3926     return getDefaultMappingVOP(MI);
3927   }
3928   case AMDGPU::G_FPTOSI:
3929   case AMDGPU::G_FPTOUI:
3930   case AMDGPU::G_SITOFP:
3931   case AMDGPU::G_UITOFP: {
3932     unsigned SizeDst = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3933     unsigned SizeSrc = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3934     if (Subtarget.hasSALUFloatInsts() && SizeDst == 32 && SizeSrc == 32 &&
3935         isSALUMapping(MI))
3936       return getDefaultMappingSOP(MI);
3937     return getDefaultMappingVOP(MI);
3938   }
3939   case AMDGPU::G_FPTRUNC:
3940   case AMDGPU::G_FPEXT: {
3941     unsigned SizeDst = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3942     unsigned SizeSrc = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3943     if (Subtarget.hasSALUFloatInsts() && SizeDst != 64 && SizeSrc != 64 &&
3944         isSALUMapping(MI))
3945       return getDefaultMappingSOP(MI);
3946     return getDefaultMappingVOP(MI);
3947   }
3948   case AMDGPU::G_FSQRT:
3949   case AMDGPU::G_FEXP2:
3950   case AMDGPU::G_FLOG2: {
3951     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3952     if (Subtarget.hasPseudoScalarTrans() && (Size == 16 || Size == 32) &&
3953         isSALUMapping(MI))
3954       return getDefaultMappingSOP(MI);
3955     return getDefaultMappingVOP(MI);
3956   }
3957   case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU
3958   case AMDGPU::G_SSUBSAT:
3959   case AMDGPU::G_UADDSAT:
3960   case AMDGPU::G_USUBSAT:
3961   case AMDGPU::G_FMAD:
3962   case AMDGPU::G_FLDEXP:
3963   case AMDGPU::G_FMINNUM_IEEE:
3964   case AMDGPU::G_FMAXNUM_IEEE:
3965   case AMDGPU::G_FCANONICALIZE:
3966   case AMDGPU::G_STRICT_FLDEXP:
3967   case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar?
3968   case AMDGPU::G_FSHR: // TODO: Expand for scalar
3969   case AMDGPU::G_AMDGPU_FMIN_LEGACY:
3970   case AMDGPU::G_AMDGPU_FMAX_LEGACY:
3971   case AMDGPU::G_AMDGPU_RCP_IFLAG:
3972   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
3973   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
3974   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
3975   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
3976   case AMDGPU::G_AMDGPU_CVT_PK_I16_I32:
3977   case AMDGPU::G_AMDGPU_SMED3:
3978   case AMDGPU::G_AMDGPU_FMED3:
3979     return getDefaultMappingVOP(MI);
3980   case AMDGPU::G_UMULH:
3981   case AMDGPU::G_SMULH: {
3982     if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI))
3983       return getDefaultMappingSOP(MI);
3984     return getDefaultMappingVOP(MI);
3985   }
3986   case AMDGPU::G_AMDGPU_MAD_U64_U32:
3987   case AMDGPU::G_AMDGPU_MAD_I64_I32: {
3988     // Three possible mappings:
3989     //
3990     //  - Default SOP
3991     //  - Default VOP
3992     //  - Scalar multiply: src0 and src1 are SGPRs, the rest is VOP.
3993     //
3994     // This allows instruction selection to keep the multiplication part of the
3995     // instruction on the SALU.
3996     bool AllSalu = true;
3997     bool MulSalu = true;
3998     for (unsigned i = 0; i < 5; ++i) {
3999       Register Reg = MI.getOperand(i).getReg();
4000       if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
4001         if (Bank->getID() != AMDGPU::SGPRRegBankID) {
4002           AllSalu = false;
4003           if (i == 2 || i == 3) {
4004             MulSalu = false;
4005             break;
4006           }
4007         }
4008       }
4009     }
4010 
4011     if (AllSalu)
4012       return getDefaultMappingSOP(MI);
4013 
4014     // If the multiply-add is full-rate in VALU, use that even if the
4015     // multiplication part is scalar. Accumulating separately on the VALU would
4016     // take two instructions.
4017     if (!MulSalu || Subtarget.hasFullRate64Ops())
4018       return getDefaultMappingVOP(MI);
4019 
4020     // Keep the multiplication on the SALU, then accumulate on the VALU.
4021     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64);
4022     OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4023     OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
4024     OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
4025     OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64);
4026     break;
4027   }
4028   case AMDGPU::G_IMPLICIT_DEF: {
4029     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4030     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4031     break;
4032   }
4033   case AMDGPU::G_FCONSTANT:
4034   case AMDGPU::G_CONSTANT:
4035   case AMDGPU::G_GLOBAL_VALUE:
4036   case AMDGPU::G_BLOCK_ADDR:
4037   case AMDGPU::G_READCYCLECOUNTER: {
4038     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4039     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4040     break;
4041   }
4042   case AMDGPU::G_FRAME_INDEX: {
4043     // TODO: This should be the same as other constants, but eliminateFrameIndex
4044     // currently assumes VALU uses.
4045     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4046     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4047     break;
4048   }
4049   case AMDGPU::G_DYN_STACKALLOC: {
4050     // Result is always uniform, and a wave reduction is needed for the source.
4051     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
4052     unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
4053     OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, 32);
4054     break;
4055   }
4056   case AMDGPU::G_AMDGPU_WAVE_ADDRESS: {
4057     // This case is weird because we expect a physical register in the source,
4058     // but need to set a bank anyway.
4059     //
4060     // TODO: We could select the result to SGPR or VGPR
4061     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
4062     OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
4063     break;
4064   }
4065   case AMDGPU::G_INSERT: {
4066     unsigned BankID = getMappingType(MRI, MI);
4067     unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4068     unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4069     unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
4070     OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
4071     OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
4072     OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize);
4073     OpdsMapping[3] = nullptr;
4074     break;
4075   }
4076   case AMDGPU::G_EXTRACT: {
4077     unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
4078     unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4079     unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4080     OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
4081     OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
4082     OpdsMapping[2] = nullptr;
4083     break;
4084   }
4085   case AMDGPU::G_BUILD_VECTOR:
4086   case AMDGPU::G_BUILD_VECTOR_TRUNC: {
4087     LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
4088     if (DstTy == LLT::fixed_vector(2, 16)) {
4089       unsigned DstSize = DstTy.getSizeInBits();
4090       unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4091       unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
4092       unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
4093       unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID);
4094 
4095       OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize);
4096       OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize);
4097       OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize);
4098       break;
4099     }
4100 
4101     [[fallthrough]];
4102   }
4103   case AMDGPU::G_MERGE_VALUES:
4104   case AMDGPU::G_CONCAT_VECTORS: {
4105     unsigned Bank = getMappingType(MRI, MI);
4106     unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4107     unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4108 
4109     OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
4110     // Op1 and Dst should use the same register bank.
4111     for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i)
4112       OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize);
4113     break;
4114   }
4115   case AMDGPU::G_BITREVERSE:
4116   case AMDGPU::G_BITCAST:
4117   case AMDGPU::G_INTTOPTR:
4118   case AMDGPU::G_PTRTOINT:
4119   case AMDGPU::G_FABS:
4120   case AMDGPU::G_FNEG: {
4121     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4122     unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
4123     OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
4124     break;
4125   }
4126   case AMDGPU::G_AMDGPU_FFBH_U32:
4127   case AMDGPU::G_AMDGPU_FFBL_B32:
4128   case AMDGPU::G_CTLZ_ZERO_UNDEF:
4129   case AMDGPU::G_CTTZ_ZERO_UNDEF: {
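    // The result is always 32 bits, even when the source is 64 bits.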
4130     unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4131     unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
4132     OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
4133     OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(BankID, Size);
4134     break;
4135   }
4136   case AMDGPU::G_CTPOP: {
4137     unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4138     unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
4139     OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
4140 
4141     // This should really be getValueMappingSGPR64Only, but allowing the generic
4142     // code to handle the register split just makes using LegalizerHelper more
4143     // difficult.
4144     OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
4145     break;
4146   }
4147   case AMDGPU::G_TRUNC: {
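    // Both operands keep whatever bank the source already has; the truncate
    // itself imposes no bank constraint here.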
4148     Register Dst = MI.getOperand(0).getReg();
4149     Register Src = MI.getOperand(1).getReg();
4150     unsigned Bank = getRegBankID(Src, MRI);
4151     unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
4152     unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
4153     OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
4154     OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize);
4155     break;
4156   }
4157   case AMDGPU::G_ZEXT:
4158   case AMDGPU::G_SEXT:
4159   case AMDGPU::G_ANYEXT:
4160   case AMDGPU::G_SEXT_INREG: {
4161     Register Dst = MI.getOperand(0).getReg();
4162     Register Src = MI.getOperand(1).getReg();
4163     unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
4164     unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
4165 
4166     unsigned DstBank;
4167     const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI);
4168     assert(SrcBank);
4169     switch (SrcBank->getID()) {
4170     case AMDGPU::SGPRRegBankID:
4171       DstBank = AMDGPU::SGPRRegBankID;
4172       break;
4173     default:
4174       DstBank = AMDGPU::VGPRRegBankID;
4175       break;
4176     }
4177 
4178     // Scalar extend can use 64-bit BFE, but VGPRs require extending to
4179     // 32 bits, and then to 64.
4180     OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize);
4181     OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(),
4182                                                        SrcSize);
4183     break;
4184   }
4185   case AMDGPU::G_IS_FPCLASS: {
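    // The result is a lane-mask boolean (VCC bank) and the floating-point
    // source is mapped to VGPRs.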
4186     Register SrcReg = MI.getOperand(1).getReg();
4187     unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
4188     unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4189     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
4190     OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4191     break;
4192   }
4193   case AMDGPU::G_STORE: {
4194     assert(MI.getOperand(0).isReg());
4195     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4196 
4197     // FIXME: We need to specify a different reg bank once scalar stores are
4198     // supported.
4199     const ValueMapping *ValMapping =
4200         AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4201     OpdsMapping[0] = ValMapping;
4202     OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
4203     break;
4204   }
4205   case AMDGPU::G_ICMP:
4206   case AMDGPU::G_FCMP: {
4207     unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4208 
4209     // See if the result register has already been constrained to vcc, which may
4210     // happen due to control flow intrinsic lowering.
4211     unsigned DstBank = getRegBankID(MI.getOperand(0).getReg(), MRI,
4212                                     AMDGPU::SGPRRegBankID);
4213     unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);
4214     unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI);
4215 
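    // The compare can stay on the SALU (writing SCC) only when the result and
    // both sources are uniform and the subtarget has a scalar compare for this
    // size and predicate; otherwise it produces a VCC lane mask on the VALU.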
4216     auto canUseSCCICMP = [&]() {
4217       auto Pred =
4218           static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
4219       return Size == 32 ||
4220              (Size == 64 &&
4221               (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
4222               Subtarget.hasScalarCompareEq64());
4223     };
4224     auto canUseSCCFCMP = [&]() {
4225       return Subtarget.hasSALUFloatInsts() && (Size == 32 || Size == 16);
4226     };
4227 
4228     bool isICMP = MI.getOpcode() == AMDGPU::G_ICMP;
4229     bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID &&
4230                      Op2Bank == AMDGPU::SGPRRegBankID &&
4231                      Op3Bank == AMDGPU::SGPRRegBankID &&
4232                      (isICMP ? canUseSCCICMP() : canUseSCCFCMP());
4233 
4234     DstBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
4235     unsigned SrcBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
4236 
4237     // TODO: Use 32-bit for scalar output size.
4238     // SCC results will need to be copied to a 32-bit SGPR virtual register.
4239     const unsigned ResultSize = 1;
4240 
4241     OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, ResultSize);
4242     OpdsMapping[1] = nullptr; // Predicate Operand.
4243     OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, Size);
4244     OpdsMapping[3] = AMDGPU::getValueMapping(SrcBank, Size);
4245     break;
4246   }
4247   case AMDGPU::G_EXTRACT_VECTOR_ELT: {
4248     // A VGPR index can be used to index an SGPR vector via a waterfall loop.
4249     unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
4250     unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4251     unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4252     unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4253     unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI);
4254     unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank);
4255 
4256     OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize);
4257     OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize);
4258 
4259     // The index can be in either bank if the source vector is VGPR.
4260     OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4261     break;
4262   }
4263   case AMDGPU::G_INSERT_VECTOR_ELT: {
4264     unsigned OutputBankID = isSALUMapping(MI) ?
4265       AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
4266 
4267     unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4268     unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4269     unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
4270     unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
4271     unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI);
4272 
4273     OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
4274     OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize);
4275 
4276     // This is a weird case, because we need to break down the mapping based on
4277     // the register bank of a different operand.
4278     if (InsertSize == 64 && OutputBankID == AMDGPU::VGPRRegBankID) {
4279       OpdsMapping[2] = AMDGPU::getValueMappingSplit64(InsertEltBankID,
4280                                                       InsertSize);
4281     } else {
4282       assert(InsertSize == 32 || InsertSize == 64);
4283       OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBankID, InsertSize);
4284     }
4285 
4286     // The index can be in either bank if the source vector is VGPR.
4287     OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize);
4288     break;
4289   }
4290   case AMDGPU::G_UNMERGE_VALUES: {
4291     unsigned Bank = getMappingType(MRI, MI);
4292 
4293     // Op1 and Dst should use the same register bank.
4294     // FIXME: Shouldn't this be the default? Why do we need to handle this?
4295     for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
4296       unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
4297       OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size);
4298     }
4299     break;
4300   }
4301   case AMDGPU::G_AMDGPU_BUFFER_LOAD:
4302   case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
4303   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
4304   case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
4305   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
4306   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
4307   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE:
4308   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
4309   case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
4310   case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
4311   case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
4312   case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16:
4313   case AMDGPU::G_AMDGPU_BUFFER_STORE:
4314   case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
4315   case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
4316   case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
4317   case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: {
4318     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4319 
4320     // rsrc
4321     OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4322 
4323     // vindex
4324     OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4325 
4326     // voffset
4327     OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4328 
4329     // soffset
4330     OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4331 
4332     // Any remaining operands are immediates and were correctly null
4333     // initialized.
4334     break;
4335   }
4336   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
4337   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
4338   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
4339   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
4340   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
4341   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
4342   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
4343   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
4344   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
4345   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
4346   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
4347   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
4348   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
4349   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
4350   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
4351     // vdata_out
4352     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4353 
4354     // vdata_in
4355     OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4356 
4357     // rsrc
4358     OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4359 
4360     // vindex
4361     OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4362 
4363     // voffset
4364     OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4365 
4366     // soffset
4367     OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4368 
4369     // Any remaining operands are immediates and were correctly null
4370     // initialized.
4371     break;
4372   }
4373   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
4374     // vdata_out
4375     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4376 
4377     // vdata_in
4378     OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4379 
4380     // cmp
4381     OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4382 
4383     // rsrc
4384     OpdsMapping[3] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4385 
4386     // vindex
4387     OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4388 
4389     // voffset
4390     OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4391 
4392     // soffset
4393     OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
4394 
4395     // Any remaining operands are immediates and were correctly null
4396     // initialized.
4397     break;
4398   }
4399   case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
4400     // Lie and claim everything is legal, even though some need to be
4401     // SGPRs. applyMapping will have to deal with it as a waterfall loop.
4402     OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4403     OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4404 
4405     // We need to convert this to a MUBUF if either the resource or the offset
4406     // is a VGPR.
4407     unsigned RSrcBank = OpdsMapping[1]->BreakDown[0].RegBank->getID();
4408     unsigned OffsetBank = OpdsMapping[2]->BreakDown[0].RegBank->getID();
4409     unsigned ResultBank = regBankUnion(RSrcBank, OffsetBank);
4410 
4411     unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4412     OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0);
4413     break;
4414   }
4415   case AMDGPU::G_INTRINSIC:
4416   case AMDGPU::G_INTRINSIC_CONVERGENT: {
4417     switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
4418     default:
4419       return getInvalidInstructionMapping();
4420     case Intrinsic::amdgcn_div_fmas:
4421     case Intrinsic::amdgcn_div_fixup:
4422     case Intrinsic::amdgcn_trig_preop:
4423     case Intrinsic::amdgcn_sin:
4424     case Intrinsic::amdgcn_cos:
4425     case Intrinsic::amdgcn_log_clamp:
4426     case Intrinsic::amdgcn_rcp_legacy:
4427     case Intrinsic::amdgcn_rsq_legacy:
4428     case Intrinsic::amdgcn_rsq_clamp:
4429     case Intrinsic::amdgcn_fmul_legacy:
4430     case Intrinsic::amdgcn_fma_legacy:
4431     case Intrinsic::amdgcn_frexp_mant:
4432     case Intrinsic::amdgcn_frexp_exp:
4433     case Intrinsic::amdgcn_fract:
4434     case Intrinsic::amdgcn_cvt_pknorm_i16:
4435     case Intrinsic::amdgcn_cvt_pknorm_u16:
4436     case Intrinsic::amdgcn_cvt_pk_i16:
4437     case Intrinsic::amdgcn_cvt_pk_u16:
4438     case Intrinsic::amdgcn_fmed3:
4439     case Intrinsic::amdgcn_cubeid:
4440     case Intrinsic::amdgcn_cubema:
4441     case Intrinsic::amdgcn_cubesc:
4442     case Intrinsic::amdgcn_cubetc:
4443     case Intrinsic::amdgcn_sffbh:
4444     case Intrinsic::amdgcn_fmad_ftz:
4445     case Intrinsic::amdgcn_mbcnt_lo:
4446     case Intrinsic::amdgcn_mbcnt_hi:
4447     case Intrinsic::amdgcn_mul_u24:
4448     case Intrinsic::amdgcn_mul_i24:
4449     case Intrinsic::amdgcn_mulhi_u24:
4450     case Intrinsic::amdgcn_mulhi_i24:
4451     case Intrinsic::amdgcn_lerp:
4452     case Intrinsic::amdgcn_sad_u8:
4453     case Intrinsic::amdgcn_msad_u8:
4454     case Intrinsic::amdgcn_sad_hi_u8:
4455     case Intrinsic::amdgcn_sad_u16:
4456     case Intrinsic::amdgcn_qsad_pk_u16_u8:
4457     case Intrinsic::amdgcn_mqsad_pk_u16_u8:
4458     case Intrinsic::amdgcn_mqsad_u32_u8:
4459     case Intrinsic::amdgcn_cvt_pk_u8_f32:
4460     case Intrinsic::amdgcn_alignbyte:
4461     case Intrinsic::amdgcn_perm:
4462     case Intrinsic::amdgcn_fdot2:
4463     case Intrinsic::amdgcn_sdot2:
4464     case Intrinsic::amdgcn_udot2:
4465     case Intrinsic::amdgcn_sdot4:
4466     case Intrinsic::amdgcn_udot4:
4467     case Intrinsic::amdgcn_sdot8:
4468     case Intrinsic::amdgcn_udot8:
4469     case Intrinsic::amdgcn_fdot2_bf16_bf16:
4470     case Intrinsic::amdgcn_fdot2_f16_f16:
4471     case Intrinsic::amdgcn_fdot2_f32_bf16:
4472     case Intrinsic::amdgcn_sudot4:
4473     case Intrinsic::amdgcn_sudot8:
4474     case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16:
4475     case Intrinsic::amdgcn_wmma_f16_16x16x16_f16:
4476     case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied:
4477     case Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied:
4478     case Intrinsic::amdgcn_wmma_f32_16x16x16_bf16:
4479     case Intrinsic::amdgcn_wmma_f32_16x16x16_f16:
4480     case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4:
4481     case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8:
4482       return getDefaultMappingVOP(MI);
4483     case Intrinsic::amdgcn_log:
4484     case Intrinsic::amdgcn_exp2:
4485     case Intrinsic::amdgcn_rcp:
4486     case Intrinsic::amdgcn_rsq:
4487     case Intrinsic::amdgcn_sqrt: {
4488       unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4489       if (Subtarget.hasPseudoScalarTrans() && (Size == 16 || Size == 32) &&
4490           isSALUMapping(MI))
4491         return getDefaultMappingSOP(MI);
4492       return getDefaultMappingVOP(MI);
4493     }
4494     case Intrinsic::amdgcn_sbfe:
4495     case Intrinsic::amdgcn_ubfe:
4496       if (isSALUMapping(MI))
4497         return getDefaultMappingSOP(MI);
4498       return getDefaultMappingVOP(MI);
4499     case Intrinsic::amdgcn_ds_swizzle:
4500     case Intrinsic::amdgcn_ds_permute:
4501     case Intrinsic::amdgcn_ds_bpermute:
4502     case Intrinsic::amdgcn_update_dpp:
4503     case Intrinsic::amdgcn_mov_dpp8:
4504     case Intrinsic::amdgcn_mov_dpp:
4505     case Intrinsic::amdgcn_strict_wwm:
4506     case Intrinsic::amdgcn_wwm:
4507     case Intrinsic::amdgcn_strict_wqm:
4508     case Intrinsic::amdgcn_wqm:
4509     case Intrinsic::amdgcn_softwqm:
4510     case Intrinsic::amdgcn_set_inactive:
4511     case Intrinsic::amdgcn_set_inactive_chain_arg:
4512     case Intrinsic::amdgcn_permlane64:
4513       return getDefaultMappingAllVGPR(MI);
4514     case Intrinsic::amdgcn_cvt_pkrtz:
4515       if (Subtarget.hasSALUFloatInsts() && isSALUMapping(MI))
4516         return getDefaultMappingSOP(MI);
4517       return getDefaultMappingVOP(MI);
4518     case Intrinsic::amdgcn_kernarg_segment_ptr:
4519     case Intrinsic::amdgcn_s_getpc:
4520     case Intrinsic::amdgcn_groupstaticsize:
4521     case Intrinsic::amdgcn_reloc_constant:
4522     case Intrinsic::returnaddress: {
4523       unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4524       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4525       break;
4526     }
4527     case Intrinsic::amdgcn_wqm_vote: {
4528       unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4529       OpdsMapping[0] = OpdsMapping[2]
4530         = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size);
4531       break;
4532     }
4533     case Intrinsic::amdgcn_ps_live: {
4534       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4535       break;
4536     }
4537     case Intrinsic::amdgcn_div_scale: {
4538       unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4539       unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4540       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size);
4541       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size);
4542 
4543       unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
4544       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4545       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4546       break;
4547     }
4548     case Intrinsic::amdgcn_class: {
4549       Register Src0Reg = MI.getOperand(2).getReg();
4550       Register Src1Reg = MI.getOperand(3).getReg();
4551       unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits();
4552       unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits();
4553       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4554       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
4555       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src0Size);
4556       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src1Size);
4557       break;
4558     }
4559     case Intrinsic::amdgcn_icmp:
4560     case Intrinsic::amdgcn_fcmp: {
4561       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4562       // This is not VCCRegBank because the result is not used in boolean contexts.
4563       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4564       unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4565       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
4566       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
4567       break;
4568     }
4569     case Intrinsic::amdgcn_readlane: {
4570       // This must be an SGPR, but accept a VGPR.
4571       Register IdxReg = MI.getOperand(3).getReg();
4572       unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
4573       unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
4574       OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4575       [[fallthrough]];
4576     }
4577     case Intrinsic::amdgcn_readfirstlane: {
4578       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4579       unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4580       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4581       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4582       break;
4583     }
4584     case Intrinsic::amdgcn_writelane: {
4585       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4586       Register SrcReg = MI.getOperand(2).getReg();
4587       unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
4588       unsigned SrcBank = getRegBankID(SrcReg, MRI, AMDGPU::SGPRRegBankID);
4589       Register IdxReg = MI.getOperand(3).getReg();
4590       unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
4591       unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
4592       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4593 
4594       // These two must be SGPRs, but accept VGPRs. A readfirstlane will be
4595       // inserted to legalize them if needed.
4596       OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize);
4597       OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4598       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4599       break;
4600     }
4601     case Intrinsic::amdgcn_if_break: {
4602       unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4603       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4604       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4605       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4606       break;
4607     }
4608     case Intrinsic::amdgcn_permlane16:
4609     case Intrinsic::amdgcn_permlanex16: {
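      // The two lane-select operands are required to be scalar.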
4610       unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4611       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4612       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4613       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4614       OpdsMapping[4] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4615       OpdsMapping[5] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4616       break;
4617     }
4618     case Intrinsic::amdgcn_permlane16_var:
4619     case Intrinsic::amdgcn_permlanex16_var: {
4620       unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4621       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4622       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4623       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4624       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4625       break;
4626     }
4627     case Intrinsic::amdgcn_mfma_f32_4x4x1f32:
4628     case Intrinsic::amdgcn_mfma_f32_4x4x4f16:
4629     case Intrinsic::amdgcn_mfma_i32_4x4x4i8:
4630     case Intrinsic::amdgcn_mfma_f32_4x4x2bf16:
4631     case Intrinsic::amdgcn_mfma_f32_16x16x1f32:
4632     case Intrinsic::amdgcn_mfma_f32_16x16x4f32:
4633     case Intrinsic::amdgcn_mfma_f32_16x16x4f16:
4634     case Intrinsic::amdgcn_mfma_f32_16x16x16f16:
4635     case Intrinsic::amdgcn_mfma_i32_16x16x4i8:
4636     case Intrinsic::amdgcn_mfma_i32_16x16x16i8:
4637     case Intrinsic::amdgcn_mfma_f32_16x16x2bf16:
4638     case Intrinsic::amdgcn_mfma_f32_16x16x8bf16:
4639     case Intrinsic::amdgcn_mfma_f32_32x32x1f32:
4640     case Intrinsic::amdgcn_mfma_f32_32x32x2f32:
4641     case Intrinsic::amdgcn_mfma_f32_32x32x4f16:
4642     case Intrinsic::amdgcn_mfma_f32_32x32x8f16:
4643     case Intrinsic::amdgcn_mfma_i32_32x32x4i8:
4644     case Intrinsic::amdgcn_mfma_i32_32x32x8i8:
4645     case Intrinsic::amdgcn_mfma_f32_32x32x2bf16:
4646     case Intrinsic::amdgcn_mfma_f32_32x32x4bf16:
4647     case Intrinsic::amdgcn_mfma_f32_32x32x4bf16_1k:
4648     case Intrinsic::amdgcn_mfma_f32_16x16x4bf16_1k:
4649     case Intrinsic::amdgcn_mfma_f32_4x4x4bf16_1k:
4650     case Intrinsic::amdgcn_mfma_f32_32x32x8bf16_1k:
4651     case Intrinsic::amdgcn_mfma_f32_16x16x16bf16_1k:
4652     case Intrinsic::amdgcn_mfma_f64_16x16x4f64:
4653     case Intrinsic::amdgcn_mfma_f64_4x4x4f64:
4654     case Intrinsic::amdgcn_mfma_i32_16x16x32_i8:
4655     case Intrinsic::amdgcn_mfma_i32_32x32x16_i8:
4656     case Intrinsic::amdgcn_mfma_f32_16x16x8_xf32:
4657     case Intrinsic::amdgcn_mfma_f32_32x32x4_xf32:
4658     case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_bf8:
4659     case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_fp8:
4660     case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_bf8:
4661     case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_fp8:
4662     case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_bf8:
4663     case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_fp8:
4664     case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_bf8:
4665     case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_fp8: {
4666       // Default for MAI intrinsics.
4667       // srcC can also be an immediate which can be folded later.
4668       // FIXME: Should we eventually add an alternative mapping with AGPR src
4669       // for srcA/srcB?
4670       //
4671       // vdst, srcA, srcB, srcC
4672       const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4673       OpdsMapping[0] =
4674           Info->mayNeedAGPRs()
4675               ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI)
4676               : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4677       OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4678       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4679       OpdsMapping[4] =
4680           Info->mayNeedAGPRs()
4681               ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI)
4682               : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4683       break;
4684     }
4685     case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
4686     case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
4687     case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
4688     case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
4689     case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
4690     case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
4691     case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
4692     case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
4693     case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
4694     case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
4695     case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
4696     case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
4697     case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
4698     case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8: {
4699       // vdst, srcA, srcB, srcC, idx
4700       OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4701       OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4702       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4703       OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4704       OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4705       break;
4706     }
4707     case Intrinsic::amdgcn_interp_p1:
4708     case Intrinsic::amdgcn_interp_p2:
4709     case Intrinsic::amdgcn_interp_mov:
4710     case Intrinsic::amdgcn_interp_p1_f16:
4711     case Intrinsic::amdgcn_interp_p2_f16:
4712     case Intrinsic::amdgcn_lds_param_load: {
4713       const int M0Idx = MI.getNumOperands() - 1;
4714       Register M0Reg = MI.getOperand(M0Idx).getReg();
4715       unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
4716       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4717 
4718       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4719       for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
4720         OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4721 
4722       // This must be an SGPR, but take whatever the original bank is and fix it
4723       // later.
4724       OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
4725       break;
4726     }
4727     case Intrinsic::amdgcn_interp_inreg_p10:
4728     case Intrinsic::amdgcn_interp_inreg_p2:
4729     case Intrinsic::amdgcn_interp_inreg_p10_f16:
4730     case Intrinsic::amdgcn_interp_inreg_p2_f16: {
4731       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4732       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4733       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4734       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4735       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4736       break;
4737     }
4738     case Intrinsic::amdgcn_ballot: {
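      // ballot copies a VCC-bank boolean into a scalar lane mask, so the
      // result is an SGPR value of the requested width.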
4739       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4740       unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4741       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4742       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, SrcSize);
4743       break;
4744     }
4745     case Intrinsic::amdgcn_inverse_ballot: {
4746       // This must be an SGPR, but accept a VGPR.
4747       Register MaskReg = MI.getOperand(2).getReg();
4748       unsigned MaskSize = MRI.getType(MaskReg).getSizeInBits();
4749       unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID);
4750       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4751       OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize);
4752       break;
4753     }
4754     case Intrinsic::amdgcn_s_quadmask:
4755     case Intrinsic::amdgcn_s_wqm: {
4756       Register MaskReg = MI.getOperand(2).getReg();
4757       unsigned MaskSize = MRI.getType(MaskReg).getSizeInBits();
4758       unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID);
4759       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, MaskSize);
4760       OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize);
4761       break;
4762     }
4763     case Intrinsic::amdgcn_wave_reduce_umin:
4764     case Intrinsic::amdgcn_wave_reduce_umax: {
4765       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4766       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4767       unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4768       auto regBankID =
4769           isSALUMapping(MI) ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
4770       OpdsMapping[2] = AMDGPU::getValueMapping(regBankID, OpSize);
4771       break;
4772     }
4773     case Intrinsic::amdgcn_s_bitreplicate:
4774       Register MaskReg = MI.getOperand(2).getReg();
4775       unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID);
4776       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64);
4777       OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, 32);
4778     }
4779     break;
4780   }
4781   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
4782   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
4783   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
4784   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
4785     auto IntrID = AMDGPU::getIntrinsicID(MI);
4786     const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID);
4787     assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic");
4788     // Non-images can have complications from operands that allow both SGPR
4789     // and VGPR. For now it's too complicated to figure out the final opcode
4790     // to derive the register bank from the MCInstrDesc.
4791     assert(RSrcIntrin->IsImage);
4792     return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg);
4793   }
4794   case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
4795     unsigned N = MI.getNumExplicitOperands() - 2;
4796     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 128);
4797     OpdsMapping[N] = getSGPROpMapping(MI.getOperand(N).getReg(), MRI, *TRI);
4798     if (N == 3) {
4799       // Sequential form: all operands combined into VGPR256/VGPR512
4800       unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4801       if (Size > 256)
4802         Size = 512;
4803       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4804     } else {
4805       // NSA form
4806       for (unsigned I = 2; I < N; ++I) {
4807         unsigned Size = MRI.getType(MI.getOperand(I).getReg()).getSizeInBits();
4808         OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4809       }
4810     }
4811     break;
4812   }
4813   case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
4814   case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS: {
4815     auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
4816     switch (IntrID) {
4817     case Intrinsic::amdgcn_s_getreg:
4818     case Intrinsic::amdgcn_s_memtime:
4819     case Intrinsic::amdgcn_s_memrealtime:
4820     case Intrinsic::amdgcn_s_get_waveid_in_workgroup:
4821     case Intrinsic::amdgcn_s_sendmsg_rtn: {
4822       unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4823       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4824       break;
4825     }
4826     case Intrinsic::amdgcn_global_atomic_fadd:
4827     case Intrinsic::amdgcn_global_atomic_csub:
4828     case Intrinsic::amdgcn_global_atomic_fmin:
4829     case Intrinsic::amdgcn_global_atomic_fmax:
4830     case Intrinsic::amdgcn_global_atomic_fmin_num:
4831     case Intrinsic::amdgcn_global_atomic_fmax_num:
4832     case Intrinsic::amdgcn_flat_atomic_fadd:
4833     case Intrinsic::amdgcn_flat_atomic_fmin:
4834     case Intrinsic::amdgcn_flat_atomic_fmax:
4835     case Intrinsic::amdgcn_flat_atomic_fmin_num:
4836     case Intrinsic::amdgcn_flat_atomic_fmax_num:
4837     case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
4838     case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
4839     case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
4840       return getDefaultMappingAllVGPR(MI);
4841     case Intrinsic::amdgcn_ds_ordered_add:
4842     case Intrinsic::amdgcn_ds_ordered_swap:
4843     case Intrinsic::amdgcn_ds_fadd_v2bf16: {
4844       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4845       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4846       unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4847                                  AMDGPU::SGPRRegBankID);
4848       OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32);
4849       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4850       break;
4851     }
4852     case Intrinsic::amdgcn_ds_append:
4853     case Intrinsic::amdgcn_ds_consume: {
4854       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4855       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4856       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4857       break;
4858     }
4859     case Intrinsic::amdgcn_exp_compr:
4860       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4861       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4862       break;
4863     case Intrinsic::amdgcn_exp:
4864       // FIXME: Could we support packed types here?
4865       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4866       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4867       OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4868       OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4869       break;
4870     case Intrinsic::amdgcn_exp_row:
4871       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4872       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4873       OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4874       OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4875       OpdsMapping[8] = getSGPROpMapping(MI.getOperand(8).getReg(), MRI, *TRI);
4876       break;
4877     case Intrinsic::amdgcn_s_sendmsg:
4878     case Intrinsic::amdgcn_s_sendmsghalt: {
4879       // This must be an SGPR, but accept a VGPR.
4880       unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4881                                    AMDGPU::SGPRRegBankID);
4882       OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
4883       break;
4884     }
4885     case Intrinsic::amdgcn_s_setreg: {
4886       // This must be an SGPR, but accept a VGPR.
4887       unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4888                                    AMDGPU::SGPRRegBankID);
4889       OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
4890       break;
4891     }
4892     case Intrinsic::amdgcn_s_ttracedata: {
4893       // This must be an SGPR, but accept a VGPR.
4894       unsigned Bank =
4895           getRegBankID(MI.getOperand(1).getReg(), MRI, AMDGPU::SGPRRegBankID);
4896       OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
4897       break;
4898     }
4899     case Intrinsic::amdgcn_end_cf: {
4900       unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4901       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4902       break;
4903     }
4904     case Intrinsic::amdgcn_else: {
4905       unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4906       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4907       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
4908       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
4909       break;
4910     }
4911     case Intrinsic::amdgcn_live_mask: {
4912       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4913       break;
4914     }
4915     case Intrinsic::amdgcn_wqm_demote:
4916     case Intrinsic::amdgcn_kill: {
4917       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4918       break;
4919     }
4920     case Intrinsic::amdgcn_raw_buffer_load:
4921     case Intrinsic::amdgcn_raw_ptr_buffer_load:
4922     case Intrinsic::amdgcn_raw_tbuffer_load:
4923     case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
4924       // FIXME: Should make the intrinsic ID the last operand of the instruction;
4925       // then this would be the same as a store.
4926       OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4927       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4928       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4929       OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4930       break;
4931     }
4932     case Intrinsic::amdgcn_raw_buffer_load_lds:
4933     case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: {
4934       OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4935       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4936       OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4937       OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4938       break;
4939     }
4940     case Intrinsic::amdgcn_raw_buffer_store:
4941     case Intrinsic::amdgcn_raw_ptr_buffer_store:
4942     case Intrinsic::amdgcn_raw_buffer_store_format:
4943     case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
4944     case Intrinsic::amdgcn_raw_tbuffer_store:
4945     case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
4946       OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4947       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4948       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4949       OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4950       break;
4951     }
4952     case Intrinsic::amdgcn_struct_buffer_load:
4953     case Intrinsic::amdgcn_struct_ptr_buffer_load:
4954     case Intrinsic::amdgcn_struct_tbuffer_load:
4955     case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
4956       OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4957       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4958       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4959       OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4960       OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4961       break;
4962     }
4963     case Intrinsic::amdgcn_struct_buffer_load_lds:
4964     case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
4965       OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4966       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4967       OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4968       OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4969       OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
4970       break;
4971     }
4972     case Intrinsic::amdgcn_struct_buffer_store:
4973     case Intrinsic::amdgcn_struct_ptr_buffer_store:
4974     case Intrinsic::amdgcn_struct_tbuffer_store:
4975     case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
4976       OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4977       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4978       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4979       OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4980       OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4981       break;
4982     }
4983     case Intrinsic::amdgcn_init_exec_from_input: {
4984       unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4985       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4986       break;
4987     }
4988     case Intrinsic::amdgcn_ds_gws_init:
4989     case Intrinsic::amdgcn_ds_gws_barrier:
4990     case Intrinsic::amdgcn_ds_gws_sema_br: {
4991       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4992 
4993       // This must be an SGPR, but accept a VGPR.
4994       unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4995                                    AMDGPU::SGPRRegBankID);
4996       OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
4997       break;
4998     }
4999     case Intrinsic::amdgcn_ds_gws_sema_v:
5000     case Intrinsic::amdgcn_ds_gws_sema_p:
5001     case Intrinsic::amdgcn_ds_gws_sema_release_all: {
5002       // This must be an SGPR, but accept a VGPR.
5003       unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI,
5004                                    AMDGPU::SGPRRegBankID);
5005       OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
5006       break;
5007     }
5008     case Intrinsic::amdgcn_global_load_lds: {
5009       OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
5010       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5011       break;
5012     }
5013     case Intrinsic::amdgcn_lds_direct_load: {
5014       const int M0Idx = MI.getNumOperands() - 1;
5015       Register M0Reg = MI.getOperand(M0Idx).getReg();
5016       unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
5017       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
5018 
5019       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
5020       for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
5021         OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
5022 
5023       // This must be an SGPR, but take whatever the original bank is and fix it
5024       // later.
5025       OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
5026       break;
5027     }
5028     case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
5029     case Intrinsic::amdgcn_ds_sub_gs_reg_rtn:
5030       OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
5031       OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5032       break;
5033     case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
5034       OpdsMapping[0] =
5035           getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); // %vdst
5036       OpdsMapping[1] =
5037           getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); // %addr
5038       OpdsMapping[3] =
5039           getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); // %addr
5040       OpdsMapping[4] =
5041           getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); // %data0
5042       OpdsMapping[5] =
5043           getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); // %data1
5044       break;
5045     }
5046     case Intrinsic::amdgcn_s_sleep_var:
5047       OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
5048       break;
5049     case Intrinsic::amdgcn_s_barrier_signal_var:
5050     case Intrinsic::amdgcn_s_barrier_join:
5051     case Intrinsic::amdgcn_s_wakeup_barrier:
5052       OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
5053       break;
5054     case Intrinsic::amdgcn_s_barrier_init:
5055       OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
5056       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5057       break;
5058     case Intrinsic::amdgcn_s_barrier_signal_isfirst_var: {
5059       const unsigned ResultSize = 1;
5060       OpdsMapping[0] =
5061           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, ResultSize);
5062       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5063       break;
5064     }
5065     case Intrinsic::amdgcn_s_barrier_signal_isfirst:
5066     case Intrinsic::amdgcn_s_barrier_leave: {
5067       const unsigned ResultSize = 1;
5068       OpdsMapping[0] =
5069           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, ResultSize);
5070       break;
5071     }
5072     case Intrinsic::amdgcn_s_get_barrier_state: {
5073       OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
5074       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5075       break;
5076     }
5077     default:
5078       return getInvalidInstructionMapping();
5079     }
5080     break;
5081   }
5082   case AMDGPU::G_SELECT: {
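    // The condition may use SCC (SGPR bank) only when both selected values are
    // already scalar; otherwise it is treated as a VCC boolean and the selected
    // values are mapped to VGPRs.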
5083     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
5084     unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
5085                                     AMDGPU::SGPRRegBankID);
5086     unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI,
5087                                     AMDGPU::SGPRRegBankID);
5088     bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID &&
5089                     Op3Bank == AMDGPU::SGPRRegBankID;
5090 
5091     unsigned CondBankDefault = SGPRSrcs ?
5092       AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
5093     unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI,
5094                                      CondBankDefault);
5095     if (CondBank == AMDGPU::SGPRRegBankID)
5096       CondBank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
5097     else if (CondBank == AMDGPU::VGPRRegBankID)
5098       CondBank = AMDGPU::VCCRegBankID;
5099 
5100     unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SGPRRegBankID ?
5101       AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
5102 
5103     assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SGPRRegBankID);
5104 
5105     // TODO: Should report 32-bit for scalar condition type.
5106     if (Size == 64) {
5107       OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
5108       OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
5109       OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
5110       OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
5111     } else {
5112       OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);
5113       OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
5114       OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);
5115       OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);
5116     }
5117 
5118     break;
5119   }
5120 
5121   case AMDGPU::G_SI_CALL: {
5122     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64);
5123     // Lie and claim everything is legal, even though some need to be
5124     // SGPRs. applyMapping will have to deal with it as a waterfall loop.
5125     OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
5126 
5127     // Allow anything for implicit arguments
5128     for (unsigned I = 4; I < MI.getNumOperands(); ++I) {
5129       if (MI.getOperand(I).isReg()) {
5130         Register Reg = MI.getOperand(I).getReg();
5131         auto OpBank = getRegBankID(Reg, MRI);
5132         unsigned Size = getSizeInBits(Reg, MRI, *TRI);
5133         OpdsMapping[I] = AMDGPU::getValueMapping(OpBank, Size);
5134       }
5135     }
5136     break;
5137   }
5138   case AMDGPU::G_LOAD:
5139   case AMDGPU::G_ZEXTLOAD:
5140   case AMDGPU::G_SEXTLOAD:
5141     return getInstrMappingForLoad(MI);
5142 
5143   case AMDGPU::G_ATOMICRMW_XCHG:
5144   case AMDGPU::G_ATOMICRMW_ADD:
5145   case AMDGPU::G_ATOMICRMW_SUB:
5146   case AMDGPU::G_ATOMICRMW_AND:
5147   case AMDGPU::G_ATOMICRMW_OR:
5148   case AMDGPU::G_ATOMICRMW_XOR:
5149   case AMDGPU::G_ATOMICRMW_MAX:
5150   case AMDGPU::G_ATOMICRMW_MIN:
5151   case AMDGPU::G_ATOMICRMW_UMAX:
5152   case AMDGPU::G_ATOMICRMW_UMIN:
5153   case AMDGPU::G_ATOMICRMW_FADD:
5154   case AMDGPU::G_ATOMICRMW_UINC_WRAP:
5155   case AMDGPU::G_ATOMICRMW_UDEC_WRAP:
5156   case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
5157   case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
5158   case AMDGPU::G_AMDGPU_ATOMIC_FMAX: {
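    // The result and data operands are VGPRs; the pointer is mapped by
    // getValueMappingForPtr based on its address space and current bank.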
5159     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
5160     OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
5161     OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5162     break;
5163   }
5164   case AMDGPU::G_ATOMIC_CMPXCHG: {
5165     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
5166     OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
5167     OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5168     OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
5169     break;
5170   }
5171   case AMDGPU::G_BRCOND: {
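    // A uniform (SGPR) condition can branch on SCC; any other bank is treated
    // as a VCC lane mask.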
5172     unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI,
5173                                  AMDGPU::SGPRRegBankID);
5174     assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
5175     if (Bank != AMDGPU::SGPRRegBankID)
5176       Bank = AMDGPU::VCCRegBankID;
5177 
5178     OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
5179     break;
5180   }
5181   case AMDGPU::G_FPTRUNC_ROUND_UPWARD:
5182   case AMDGPU::G_FPTRUNC_ROUND_DOWNWARD:
5183     return getDefaultMappingVOP(MI);
5184   case AMDGPU::G_PREFETCH:
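    // The prefetch address must be scalar, so force an SGPR mapping.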
5185     OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
5186     break;
5187   }
5188 
5189   return getInstructionMapping(/*ID*/1, /*Cost*/1,
5190                                getOperandsMapping(OpdsMapping),
5191                                MI.getNumOperands());
5192 }
5193