xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (revision cb14a3fe5122c879eae1fb480ed7ce82a699ddb6)
1 //===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the RegisterBankInfo class for
10 /// AMDGPU.
11 ///
12 /// \par
13 ///
14 /// AMDGPU has unique register bank constraints that require special high level
15 /// strategies to deal with. There are two main true physical register banks
16 /// VGPR (vector), and SGPR (scalar). Additionally the VCC register bank is a
17 /// sort of pseudo-register bank needed to represent SGPRs used in a vector
18 /// boolean context. There is also the AGPR bank, which is a special purpose
19 /// physical register bank present on some subtargets.
20 ///
21 /// Copying from VGPR to SGPR is generally illegal, unless the value is known to
22 /// be uniform. It is generally not valid to legalize operands by inserting
23 /// copies as on other targets. Operations which require uniform, SGPR operands
24 /// generally require scalarization by repeatedly executing the instruction,
25 /// activating each set of lanes using a unique set of input values. This is
26 /// referred to as a waterfall loop.
27 ///
28 /// \par Booleans
29 ///
30 /// Booleans (s1 values) requires special consideration. A vector compare result
31 /// is naturally a bitmask with one bit per lane, in a 32 or 64-bit
32 /// register. These are represented with the VCC bank. During selection, we need
33 /// to be able to unambiguously go back from a register class to a register
34 /// bank. To distinguish whether an SGPR should use the SGPR or VCC register
35 /// bank, we need to know the use context type. An SGPR s1 value always means a
36 /// VCC bank value, otherwise it will be the SGPR bank. A scalar compare sets
37 /// SCC, which is a 1-bit unaddressable register. This will need to be copied to
38 /// a 32-bit virtual register. Taken together, this means we need to adjust the
39 /// type of boolean operations to be regbank legal. All SALU booleans need to be
40 /// widened to 32-bits, and all VALU booleans need to be s1 values.
41 ///
42 /// A noteworthy exception to the s1-means-vcc rule is for legalization artifact
43 /// casts. G_TRUNC s1 results, and G_SEXT/G_ZEXT/G_ANYEXT sources are never vcc
44 /// bank. A non-boolean source (such as a truncate from a 1-bit load from
45 /// memory) will require a copy to the VCC bank which will require clearing the
46 /// high bits and inserting a compare.
47 ///
48 /// \par Constant bus restriction
49 ///
50 /// VALU instructions have a limitation known as the constant bus
51 /// restriction. Most VALU instructions can use SGPR operands, but may read at
52 /// most 1 SGPR or constant literal value (this to 2 in gfx10 for most
53 /// instructions). This is one unique SGPR, so the same SGPR may be used for
54 /// multiple operands. From a register bank perspective, any combination of
55 /// operands should be legal as an SGPR, but this is contextually dependent on
56 /// the SGPR operands all being the same register. There is therefore optimal to
57 /// choose the SGPR with the most uses to minimize the number of copies.
58 ///
59 /// We avoid trying to solve this problem in RegBankSelect. Any VALU G_*
60 /// operation should have its source operands all mapped to VGPRs (except for
61 /// VCC), inserting copies from any SGPR operands. This the most trivial legal
62 /// mapping. Anything beyond the simplest 1:1 instruction selection would be too
63 /// complicated to solve here. Every optimization pattern or instruction
64 /// selected to multiple outputs would have to enforce this rule, and there
65 /// would be additional complexity in tracking this rule for every G_*
66 /// operation. By forcing all inputs to VGPRs, it also simplifies the task of
67 /// picking the optimal operand combination from a post-isel optimization pass.
68 ///
69 //===----------------------------------------------------------------------===//
70 
71 #include "AMDGPURegisterBankInfo.h"
72 
73 #include "AMDGPU.h"
74 #include "AMDGPUGlobalISelUtils.h"
75 #include "AMDGPUInstrInfo.h"
76 #include "GCNSubtarget.h"
77 #include "SIMachineFunctionInfo.h"
78 #include "SIRegisterInfo.h"
79 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
80 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
81 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
82 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
83 #include "llvm/CodeGen/RegisterBank.h"
84 #include "llvm/IR/IntrinsicsAMDGPU.h"
85 
86 #define GET_TARGET_REGBANK_IMPL
87 #include "AMDGPUGenRegisterBank.inc"
88 
89 // This file will be TableGen'ed at some point.
90 #include "AMDGPUGenRegisterBankInfo.def"
91 
92 using namespace llvm;
93 using namespace MIPatternMatch;
94 
95 namespace {
96 
97 // Observer to apply a register bank to new registers created by LegalizerHelper.
98 class ApplyRegBankMapping final : public GISelChangeObserver {
99 private:
100   MachineIRBuilder &B;
101   const AMDGPURegisterBankInfo &RBI;
102   MachineRegisterInfo &MRI;
103   const RegisterBank *NewBank;
104   SmallVector<MachineInstr *, 4> NewInsts;
105 
106 public:
107   ApplyRegBankMapping(MachineIRBuilder &B, const AMDGPURegisterBankInfo &RBI_,
108                       MachineRegisterInfo &MRI_, const RegisterBank *RB)
109       : B(B), RBI(RBI_), MRI(MRI_), NewBank(RB) {
110     assert(!B.isObservingChanges());
111     B.setChangeObserver(*this);
112   }
113 
114   ~ApplyRegBankMapping() {
115     for (MachineInstr *MI : NewInsts)
116       applyBank(*MI);
117 
118     B.stopObservingChanges();
119   }
120 
121   /// Set any registers that don't have a set register class or bank to SALU.
122   void applyBank(MachineInstr &MI) {
123     const unsigned Opc = MI.getOpcode();
124     if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT ||
125         Opc == AMDGPU::G_SEXT) {
126       // LegalizerHelper wants to use the basic legalization artifacts when
127       // widening etc. We don't handle selection with vcc in artifact sources,
128       // so we need to use a select instead to handle these properly.
129       Register DstReg = MI.getOperand(0).getReg();
130       Register SrcReg = MI.getOperand(1).getReg();
131       const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI);
132       if (SrcBank == &AMDGPU::VCCRegBank) {
133         const LLT S32 = LLT::scalar(32);
134         assert(MRI.getType(SrcReg) == LLT::scalar(1));
135         assert(MRI.getType(DstReg) == S32);
136         assert(NewBank == &AMDGPU::VGPRRegBank);
137 
138         // Replace the extension with a select, which really uses the boolean
139         // source.
140         B.setInsertPt(*MI.getParent(), MI);
141 
142         auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? -1 : 1);
143         auto False = B.buildConstant(S32, 0);
144         B.buildSelect(DstReg, SrcReg, True, False);
145         MRI.setRegBank(True.getReg(0), *NewBank);
146         MRI.setRegBank(False.getReg(0), *NewBank);
147         MI.eraseFromParent();
148       }
149 
150       assert(!MRI.getRegClassOrRegBank(DstReg));
151       MRI.setRegBank(DstReg, *NewBank);
152       return;
153     }
154 
155 #ifndef NDEBUG
156     if (Opc == AMDGPU::G_TRUNC) {
157       Register DstReg = MI.getOperand(0).getReg();
158       const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI);
159       assert(DstBank != &AMDGPU::VCCRegBank);
160     }
161 #endif
162 
163     for (MachineOperand &Op : MI.operands()) {
164       if (!Op.isReg())
165         continue;
166 
167       // We may see physical registers if building a real MI
168       Register Reg = Op.getReg();
169       if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg))
170         continue;
171 
172       const RegisterBank *RB = NewBank;
173       if (MRI.getType(Reg) == LLT::scalar(1)) {
174         assert(NewBank == &AMDGPU::VGPRRegBank &&
175                "s1 operands should only be used for vector bools");
176         assert((MI.getOpcode() != AMDGPU::G_TRUNC &&
177                 MI.getOpcode() != AMDGPU::G_ANYEXT) &&
178                "not expecting legalization artifacts here");
179         RB = &AMDGPU::VCCRegBank;
180       }
181 
182       MRI.setRegBank(Reg, *RB);
183     }
184   }
185 
186   void erasingInstr(MachineInstr &MI) override {}
187 
188   void createdInstr(MachineInstr &MI) override {
189     // At this point, the instruction was just inserted and has no operands.
190     NewInsts.push_back(&MI);
191   }
192 
193   void changingInstr(MachineInstr &MI) override {}
194   void changedInstr(MachineInstr &MI) override {
195     // FIXME: In principle we should probably add the instruction to NewInsts,
196     // but the way the LegalizerHelper uses the observer, we will always see the
197     // registers we need to set the regbank on also referenced in a new
198     // instruction.
199   }
200 };
201 
202 }
203 
204 AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
205     : Subtarget(ST), TRI(Subtarget.getRegisterInfo()),
206       TII(Subtarget.getInstrInfo()) {
207 
208   // HACK: Until this is fully tablegen'd.
209   static llvm::once_flag InitializeRegisterBankFlag;
210 
211   static auto InitializeRegisterBankOnce = [this]() {
212     assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank &&
213            &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank &&
214            &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank);
215     (void)this;
216   };
217 
218   llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce);
219 }
220 
221 static bool isVectorRegisterBank(const RegisterBank &Bank) {
222   unsigned BankID = Bank.getID();
223   return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID;
224 }
225 
226 bool AMDGPURegisterBankInfo::isDivergentRegBank(const RegisterBank *RB) const {
227   return RB != &AMDGPU::SGPRRegBank;
228 }
229 
230 unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
231                                           const RegisterBank &Src,
232                                           TypeSize Size) const {
233   // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
234   if (Dst.getID() == AMDGPU::SGPRRegBankID &&
235       (isVectorRegisterBank(Src) || Src.getID() == AMDGPU::VCCRegBankID)) {
236     return std::numeric_limits<unsigned>::max();
237   }
238 
239   // Bool values are tricky, because the meaning is based on context. The SCC
240   // and VCC banks are for the natural scalar and vector conditions produced by
241   // a compare.
242   //
243   // Legalization doesn't know about the necessary context, so an s1 use may
244   // have been a truncate from an arbitrary value, in which case a copy (lowered
245   // as a compare with 0) needs to be inserted.
246   if (Size == 1 &&
247       (Dst.getID() == AMDGPU::SGPRRegBankID) &&
248       (isVectorRegisterBank(Src) ||
249        Src.getID() == AMDGPU::SGPRRegBankID ||
250        Src.getID() == AMDGPU::VCCRegBankID))
251     return std::numeric_limits<unsigned>::max();
252 
253   // There is no direct copy between AGPRs.
254   if (Dst.getID() == AMDGPU::AGPRRegBankID &&
255       Src.getID() == AMDGPU::AGPRRegBankID)
256     return 4;
257 
258   return RegisterBankInfo::copyCost(Dst, Src, Size);
259 }
260 
261 unsigned AMDGPURegisterBankInfo::getBreakDownCost(
262   const ValueMapping &ValMapping,
263   const RegisterBank *CurBank) const {
264   // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
265   // VGPR.
266   // FIXME: Is there a better way to do this?
267   if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
268     return 10; // This is expensive.
269 
270   assert(ValMapping.NumBreakDowns == 2 &&
271          ValMapping.BreakDown[0].Length == 32 &&
272          ValMapping.BreakDown[0].StartIdx == 0 &&
273          ValMapping.BreakDown[1].Length == 32 &&
274          ValMapping.BreakDown[1].StartIdx == 32 &&
275          ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);
276 
277   // 32-bit extract of a 64-bit value is just access of a subregister, so free.
278   // TODO: Cost of 0 hits assert, though it's not clear it's what we really
279   // want.
280 
281   // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
282   // alignment restrictions, but this probably isn't important.
283   return 1;
284 }
285 
286 const RegisterBank &
287 AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
288                                                LLT Ty) const {
289   if (&RC == &AMDGPU::SReg_1RegClass)
290     return AMDGPU::VCCRegBank;
291 
292   // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a
293   // VCC-like use.
294   if (TRI->isSGPRClass(&RC)) {
295     // FIXME: This probably came from a copy from a physical register, which
296     // should be inferable from the copied to-type. We don't have many boolean
297     // physical register constraints so just assume a normal SGPR for now.
298     if (!Ty.isValid())
299       return AMDGPU::SGPRRegBank;
300 
301     return Ty == LLT::scalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
302   }
303 
304   return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank;
305 }
306 
307 template <unsigned NumOps>
308 RegisterBankInfo::InstructionMappings
309 AMDGPURegisterBankInfo::addMappingFromTable(
310     const MachineInstr &MI, const MachineRegisterInfo &MRI,
311     const std::array<unsigned, NumOps> RegSrcOpIdx,
312     ArrayRef<OpRegBankEntry<NumOps>> Table) const {
313 
314   InstructionMappings AltMappings;
315 
316   SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());
317 
318   unsigned Sizes[NumOps];
319   for (unsigned I = 0; I < NumOps; ++I) {
320     Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
321     Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
322   }
323 
324   for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
325     unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
326     Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
327   }
328 
329   // getInstrMapping's default mapping uses ID 1, so start at 2.
330   unsigned MappingID = 2;
331   for (const auto &Entry : Table) {
332     for (unsigned I = 0; I < NumOps; ++I) {
333       int OpIdx = RegSrcOpIdx[I];
334       Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
335     }
336 
337     AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
338                                                  getOperandsMapping(Operands),
339                                                  Operands.size()));
340   }
341 
342   return AltMappings;
343 }
344 
345 RegisterBankInfo::InstructionMappings
346 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
347     const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
348   switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
349   case Intrinsic::amdgcn_readlane: {
350     static const OpRegBankEntry<3> Table[2] = {
351       // Perfectly legal.
352       { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
353 
354       // Need a readfirstlane for the index.
355       { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
356     };
357 
358     const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
359     return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, Table);
360   }
361   case Intrinsic::amdgcn_writelane: {
362     static const OpRegBankEntry<4> Table[4] = {
363       // Perfectly legal.
364       { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
365 
366       // Need readfirstlane of first op
367       { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
368 
369       // Need readfirstlane of second op
370       { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
371 
372       // Need readfirstlane of both ops
373       { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
374     };
375 
376     // rsrc, voffset, offset
377     const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
378     return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, Table);
379   }
380   default:
381     return RegisterBankInfo::getInstrAlternativeMappings(MI);
382   }
383 }
384 
385 RegisterBankInfo::InstructionMappings
386 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
387     const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
388 
389   switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
390   case Intrinsic::amdgcn_s_buffer_load: {
391     static const OpRegBankEntry<2> Table[4] = {
392       // Perfectly legal.
393       { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
394 
395       // Only need 1 register in loop
396       { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },
397 
398       // Have to waterfall the resource.
399       { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },
400 
401       // Have to waterfall the resource, and the offset.
402       { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }
403     };
404 
405     // rsrc, offset
406     const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
407     return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, Table);
408   }
409   case Intrinsic::amdgcn_ds_ordered_add:
410   case Intrinsic::amdgcn_ds_ordered_swap: {
411     // VGPR = M0, VGPR
412     static const OpRegBankEntry<3> Table[2] = {
413       // Perfectly legal.
414       { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID  }, 1 },
415 
416       // Need a readfirstlane for m0
417       { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
418     };
419 
420     const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
421     return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, Table);
422   }
423   case Intrinsic::amdgcn_s_sendmsg:
424   case Intrinsic::amdgcn_s_sendmsghalt: {
425     // FIXME: Should have no register for immediate
426     static const OpRegBankEntry<1> Table[2] = {
427       // Perfectly legal.
428       { { AMDGPU::SGPRRegBankID }, 1 },
429 
430       // Need readlane
431       { { AMDGPU::VGPRRegBankID }, 3 }
432     };
433 
434     const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
435     return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, Table);
436   }
437   default:
438     return RegisterBankInfo::getInstrAlternativeMappings(MI);
439   }
440 }
441 
442 // FIXME: Returns uniform if there's no source value information. This is
443 // probably wrong.
444 static bool isScalarLoadLegal(const MachineInstr &MI) {
445   if (!MI.hasOneMemOperand())
446     return false;
447 
448   const MachineMemOperand *MMO = *MI.memoperands_begin();
449   const unsigned AS = MMO->getAddrSpace();
450   const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||
451                        AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
452   // Require 4-byte alignment.
453   return MMO->getAlign() >= Align(4) &&
454          // Can't do a scalar atomic load.
455          !MMO->isAtomic() &&
456          // Don't use scalar loads for volatile accesses to non-constant address
457          // spaces.
458          (IsConst || !MMO->isVolatile()) &&
459          // Memory must be known constant, or not written before this load.
460          (IsConst || MMO->isInvariant() || (MMO->getFlags() & MONoClobber)) &&
461          AMDGPUInstrInfo::isUniformMMO(MMO);
462 }
463 
464 RegisterBankInfo::InstructionMappings
465 AMDGPURegisterBankInfo::getInstrAlternativeMappings(
466     const MachineInstr &MI) const {
467 
468   const MachineFunction &MF = *MI.getParent()->getParent();
469   const MachineRegisterInfo &MRI = MF.getRegInfo();
470 
471 
472   InstructionMappings AltMappings;
473   switch (MI.getOpcode()) {
474   case TargetOpcode::G_CONSTANT:
475   case TargetOpcode::G_IMPLICIT_DEF: {
476     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
477     if (Size == 1) {
478       static const OpRegBankEntry<1> Table[3] = {
479         { { AMDGPU::VGPRRegBankID }, 1 },
480         { { AMDGPU::SGPRRegBankID }, 1 },
481         { { AMDGPU::VCCRegBankID }, 1 }
482       };
483 
484       return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
485     }
486 
487     [[fallthrough]];
488   }
489   case TargetOpcode::G_FCONSTANT:
490   case TargetOpcode::G_FRAME_INDEX:
491   case TargetOpcode::G_GLOBAL_VALUE: {
492     static const OpRegBankEntry<1> Table[2] = {
493       { { AMDGPU::VGPRRegBankID }, 1 },
494       { { AMDGPU::SGPRRegBankID }, 1 }
495     };
496 
497     return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
498   }
499   case TargetOpcode::G_AND:
500   case TargetOpcode::G_OR:
501   case TargetOpcode::G_XOR: {
502     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
503 
504     if (Size == 1) {
505       // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
506       const InstructionMapping &SCCMapping = getInstructionMapping(
507         1, 1, getOperandsMapping(
508           {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
509            AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
510            AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32)}),
511         3); // Num Operands
512       AltMappings.push_back(&SCCMapping);
513 
514       const InstructionMapping &VCCMapping0 = getInstructionMapping(
515         2, 1, getOperandsMapping(
516           {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
517            AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
518            AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
519         3); // Num Operands
520       AltMappings.push_back(&VCCMapping0);
521       return AltMappings;
522     }
523 
524     if (Size != 64)
525       break;
526 
527     const InstructionMapping &SSMapping = getInstructionMapping(
528       1, 1, getOperandsMapping(
529         {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
530          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
531          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
532       3); // Num Operands
533     AltMappings.push_back(&SSMapping);
534 
535     const InstructionMapping &VVMapping = getInstructionMapping(
536       2, 2, getOperandsMapping(
537         {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
538          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
539          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
540       3); // Num Operands
541     AltMappings.push_back(&VVMapping);
542     break;
543   }
544   case TargetOpcode::G_LOAD:
545   case TargetOpcode::G_ZEXTLOAD:
546   case TargetOpcode::G_SEXTLOAD: {
547     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
548     LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
549     unsigned PtrSize = PtrTy.getSizeInBits();
550     unsigned AS = PtrTy.getAddressSpace();
551 
552     if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
553          AS != AMDGPUAS::PRIVATE_ADDRESS) &&
554         isScalarLoadLegal(MI)) {
555       const InstructionMapping &SSMapping = getInstructionMapping(
556           1, 1, getOperandsMapping(
557                     {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
558                      AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}),
559           2); // Num Operands
560       AltMappings.push_back(&SSMapping);
561     }
562 
563     const InstructionMapping &VVMapping = getInstructionMapping(
564         2, 1,
565         getOperandsMapping(
566             {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
567              AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),
568         2); // Num Operands
569     AltMappings.push_back(&VVMapping);
570 
571     // It may be possible to have a vgpr = load sgpr mapping here, because
572     // the mubuf instructions support this kind of load, but probably for only
573     // gfx7 and older.  However, the addressing mode matching in the instruction
574     // selector should be able to do a better job of detecting and selecting
575     // these kinds of loads from the vgpr = load vgpr mapping.
576 
577     return AltMappings;
578 
579   }
580   case TargetOpcode::G_SELECT: {
581     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
582     const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
583       getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
584                           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
585                           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
586                           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
587       4); // Num Operands
588     AltMappings.push_back(&SSMapping);
589 
590     const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
591       getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
592                           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
593                           AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
594                           AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
595       4); // Num Operands
596     AltMappings.push_back(&VVMapping);
597 
598     return AltMappings;
599   }
600   case TargetOpcode::G_UADDE:
601   case TargetOpcode::G_USUBE:
602   case TargetOpcode::G_SADDE:
603   case TargetOpcode::G_SSUBE: {
604     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
605     const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
606       getOperandsMapping(
607         {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
608          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
609          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
610          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
611          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1)}),
612       5); // Num Operands
613     AltMappings.push_back(&SSMapping);
614 
615     const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
616       getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
617                           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
618                           AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
619                           AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
620                           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
621       5); // Num Operands
622     AltMappings.push_back(&VVMapping);
623     return AltMappings;
624   }
625   case AMDGPU::G_BRCOND: {
626     assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
627 
628     // TODO: Change type to 32 for scalar
629     const InstructionMapping &SMapping = getInstructionMapping(
630       1, 1, getOperandsMapping(
631         {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), nullptr}),
632       2); // Num Operands
633     AltMappings.push_back(&SMapping);
634 
635     const InstructionMapping &VMapping = getInstructionMapping(
636       1, 1, getOperandsMapping(
637         {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr }),
638       2); // Num Operands
639     AltMappings.push_back(&VMapping);
640     return AltMappings;
641   }
642   case AMDGPU::G_INTRINSIC:
643   case AMDGPU::G_INTRINSIC_CONVERGENT:
644     return getInstrAlternativeMappingsIntrinsic(MI, MRI);
645   case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
646   case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
647     return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
648   default:
649     break;
650   }
651   return RegisterBankInfo::getInstrAlternativeMappings(MI);
652 }
653 
654 void AMDGPURegisterBankInfo::split64BitValueForMapping(
655   MachineIRBuilder &B,
656   SmallVector<Register, 2> &Regs,
657   LLT HalfTy,
658   Register Reg) const {
659   assert(HalfTy.getSizeInBits() == 32);
660   MachineRegisterInfo *MRI = B.getMRI();
661   Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
662   Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
663   const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
664   MRI->setRegBank(LoLHS, *Bank);
665   MRI->setRegBank(HiLHS, *Bank);
666 
667   Regs.push_back(LoLHS);
668   Regs.push_back(HiLHS);
669 
670   B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
671     .addDef(LoLHS)
672     .addDef(HiLHS)
673     .addUse(Reg);
674 }
675 
676 /// Replace the current type each register in \p Regs has with \p NewTy
677 static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
678                           LLT NewTy) {
679   for (Register Reg : Regs) {
680     assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
681     MRI.setType(Reg, NewTy);
682   }
683 }
684 
685 static LLT getHalfSizedType(LLT Ty) {
686   if (Ty.isVector()) {
687     assert(Ty.getElementCount().isKnownMultipleOf(2));
688     return LLT::scalarOrVector(Ty.getElementCount().divideCoefficientBy(2),
689                                Ty.getElementType());
690   }
691 
692   assert(Ty.getScalarSizeInBits() % 2 == 0);
693   return LLT::scalar(Ty.getScalarSizeInBits() / 2);
694 }
695 
696 // Build one or more V_READFIRSTLANE_B32 instructions to move the given vector
697 // source value into a scalar register.
698 Register AMDGPURegisterBankInfo::buildReadFirstLane(MachineIRBuilder &B,
699                                                     MachineRegisterInfo &MRI,
700                                                     Register Src) const {
701   LLT Ty = MRI.getType(Src);
702   const RegisterBank *Bank = getRegBank(Src, MRI, *TRI);
703 
704   if (Bank == &AMDGPU::SGPRRegBank)
705     return Src;
706 
707   unsigned Bits = Ty.getSizeInBits();
708   assert(Bits % 32 == 0);
709 
710   if (Bank != &AMDGPU::VGPRRegBank) {
711     // We need to copy from AGPR to VGPR
712     Src = B.buildCopy(Ty, Src).getReg(0);
713     MRI.setRegBank(Src, AMDGPU::VGPRRegBank);
714   }
715 
716   LLT S32 = LLT::scalar(32);
717   unsigned NumParts = Bits / 32;
718   SmallVector<Register, 8> SrcParts;
719   SmallVector<Register, 8> DstParts;
720 
721   if (Bits == 32) {
722     SrcParts.push_back(Src);
723   } else {
724     auto Unmerge = B.buildUnmerge(S32, Src);
725     for (unsigned i = 0; i < NumParts; ++i)
726       SrcParts.push_back(Unmerge.getReg(i));
727   }
728 
729   for (unsigned i = 0; i < NumParts; ++i) {
730     Register SrcPart = SrcParts[i];
731     Register DstPart = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
732     MRI.setType(DstPart, NumParts == 1 ? Ty : S32);
733 
734     const TargetRegisterClass *Constrained =
735         constrainGenericRegister(SrcPart, AMDGPU::VGPR_32RegClass, MRI);
736     (void)Constrained;
737     assert(Constrained && "Failed to constrain readfirstlane src reg");
738 
739     B.buildInstr(AMDGPU::V_READFIRSTLANE_B32, {DstPart}, {SrcPart});
740 
741     DstParts.push_back(DstPart);
742   }
743 
744   if (Bits == 32)
745     return DstParts[0];
746 
747   Register Dst = B.buildMergeLikeInstr(Ty, DstParts).getReg(0);
748   MRI.setRegBank(Dst, AMDGPU::SGPRRegBank);
749   return Dst;
750 }
751 
752 /// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
753 /// any of the required SGPR operands are VGPRs, perform a waterfall loop to
754 /// execute the instruction for each unique combination of values in all lanes
755 /// in the wave. The block will be split such that rest of the instructions are
756 /// moved to a new block.
757 ///
758 /// Essentially performs this loop:
759 //
760 /// Save Execution Mask
761 /// For (Lane : Wavefront) {
762 ///   Enable Lane, Disable all other lanes
763 ///   SGPR = read SGPR value for current lane from VGPR
764 ///   VGPRResult[Lane] = use_op SGPR
765 /// }
766 /// Restore Execution Mask
767 ///
768 /// There is additional complexity to try for compare values to identify the
769 /// unique values used.
770 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
771     MachineIRBuilder &B, iterator_range<MachineBasicBlock::iterator> Range,
772     SmallSet<Register, 4> &SGPROperandRegs) const {
773   // Track use registers which have already been expanded with a readfirstlane
774   // sequence. This may have multiple uses if moving a sequence.
775   DenseMap<Register, Register> WaterfalledRegMap;
776 
777   MachineBasicBlock &MBB = B.getMBB();
778   MachineFunction *MF = &B.getMF();
779 
780   const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
781   const unsigned MovExecOpc =
782       Subtarget.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
783   const unsigned MovExecTermOpc =
784       Subtarget.isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
785 
786   const unsigned XorTermOpc = Subtarget.isWave32() ?
787     AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
788   const unsigned AndSaveExecOpc =  Subtarget.isWave32() ?
789     AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
790   const unsigned ExecReg =  Subtarget.isWave32() ?
791     AMDGPU::EXEC_LO : AMDGPU::EXEC;
792 
793 #ifndef NDEBUG
794   const int OrigRangeSize = std::distance(Range.begin(), Range.end());
795 #endif
796 
797   MachineRegisterInfo &MRI = *B.getMRI();
798   Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
799   Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);
800 
801   // Don't bother using generic instructions/registers for the exec mask.
802   B.buildInstr(TargetOpcode::IMPLICIT_DEF)
803     .addDef(InitSaveExecReg);
804 
805   Register PhiExec = MRI.createVirtualRegister(WaveRC);
806   Register NewExec = MRI.createVirtualRegister(WaveRC);
807 
808   // To insert the loop we need to split the block. Move everything before this
809   // point to a new block, and insert a new empty block before this instruction.
810   MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
811   MachineBasicBlock *BodyBB = MF->CreateMachineBasicBlock();
812   MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
813   MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
814   MachineFunction::iterator MBBI(MBB);
815   ++MBBI;
816   MF->insert(MBBI, LoopBB);
817   MF->insert(MBBI, BodyBB);
818   MF->insert(MBBI, RestoreExecBB);
819   MF->insert(MBBI, RemainderBB);
820 
821   LoopBB->addSuccessor(BodyBB);
822   BodyBB->addSuccessor(RestoreExecBB);
823   BodyBB->addSuccessor(LoopBB);
824 
825   // Move the rest of the block into a new block.
826   RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
827   RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());
828 
829   MBB.addSuccessor(LoopBB);
830   RestoreExecBB->addSuccessor(RemainderBB);
831 
832   B.setInsertPt(*LoopBB, LoopBB->end());
833 
834   B.buildInstr(TargetOpcode::PHI)
835       .addDef(PhiExec)
836       .addReg(InitSaveExecReg)
837       .addMBB(&MBB)
838       .addReg(NewExec)
839       .addMBB(BodyBB);
840 
841   const DebugLoc &DL = B.getDL();
842 
843   MachineInstr &FirstInst = *Range.begin();
844 
845   // Move the instruction into the loop body. Note we moved everything after
846   // Range.end() already into a new block, so Range.end() is no longer valid.
847   BodyBB->splice(BodyBB->end(), &MBB, Range.begin(), MBB.end());
848 
849   // Figure out the iterator range after splicing the instructions.
850   MachineBasicBlock::iterator NewBegin = FirstInst.getIterator();
851   auto NewEnd = BodyBB->end();
852 
853   B.setMBB(*LoopBB);
854 
855   LLT S1 = LLT::scalar(1);
856   Register CondReg;
857 
858   assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
859 
860   for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
861     for (MachineOperand &Op : MI.all_uses()) {
862       Register OldReg = Op.getReg();
863       if (!SGPROperandRegs.count(OldReg))
864         continue;
865 
866       // See if we already processed this register in another instruction in the
867       // sequence.
868       auto OldVal = WaterfalledRegMap.find(OldReg);
869       if (OldVal != WaterfalledRegMap.end()) {
870         Op.setReg(OldVal->second);
871         continue;
872       }
873 
874       Register OpReg = Op.getReg();
875       LLT OpTy = MRI.getType(OpReg);
876 
877       const RegisterBank *OpBank = getRegBank(OpReg, MRI, *TRI);
878       if (OpBank != &AMDGPU::VGPRRegBank) {
879         // Insert copy from AGPR to VGPR before the loop.
880         B.setMBB(MBB);
881         OpReg = B.buildCopy(OpTy, OpReg).getReg(0);
882         MRI.setRegBank(OpReg, AMDGPU::VGPRRegBank);
883         B.setMBB(*LoopBB);
884       }
885 
886       Register CurrentLaneReg = buildReadFirstLane(B, MRI, OpReg);
887 
888       // Build the comparison(s).
889       unsigned OpSize = OpTy.getSizeInBits();
890       bool Is64 = OpSize % 64 == 0;
891       unsigned PartSize = Is64 ? 64 : 32;
892       LLT PartTy = LLT::scalar(PartSize);
893       unsigned NumParts = OpSize / PartSize;
894       SmallVector<Register, 8> OpParts;
895       SmallVector<Register, 8> CurrentLaneParts;
896 
897       if (NumParts == 1) {
898         OpParts.push_back(OpReg);
899         CurrentLaneParts.push_back(CurrentLaneReg);
900       } else {
901         auto UnmergeOp = B.buildUnmerge(PartTy, OpReg);
902         auto UnmergeCurrentLane = B.buildUnmerge(PartTy, CurrentLaneReg);
903         for (unsigned i = 0; i < NumParts; ++i) {
904           OpParts.push_back(UnmergeOp.getReg(i));
905           CurrentLaneParts.push_back(UnmergeCurrentLane.getReg(i));
906           MRI.setRegBank(OpParts[i], AMDGPU::VGPRRegBank);
907           MRI.setRegBank(CurrentLaneParts[i], AMDGPU::SGPRRegBank);
908         }
909       }
910 
911       for (unsigned i = 0; i < NumParts; ++i) {
912         auto CmpReg = B.buildICmp(CmpInst::ICMP_EQ, S1, CurrentLaneParts[i],
913                                   OpParts[i]).getReg(0);
914         MRI.setRegBank(CmpReg, AMDGPU::VCCRegBank);
915 
916         if (!CondReg) {
917           CondReg = CmpReg;
918         } else {
919           CondReg = B.buildAnd(S1, CondReg, CmpReg).getReg(0);
920           MRI.setRegBank(CondReg, AMDGPU::VCCRegBank);
921         }
922       }
923 
924       Op.setReg(CurrentLaneReg);
925 
926       // Make sure we don't re-process this register again.
927       WaterfalledRegMap.insert(std::pair(OldReg, Op.getReg()));
928     }
929   }
930 
931   // The ballot becomes a no-op during instruction selection.
932   CondReg = B.buildIntrinsic(Intrinsic::amdgcn_ballot,
933                              {LLT::scalar(Subtarget.isWave32() ? 32 : 64)})
934                 .addReg(CondReg)
935                 .getReg(0);
936   MRI.setRegClass(CondReg, WaveRC);
937 
938   // Update EXEC, save the original EXEC value to VCC.
939   B.buildInstr(AndSaveExecOpc)
940     .addDef(NewExec)
941     .addReg(CondReg, RegState::Kill);
942 
943   MRI.setSimpleHint(NewExec, CondReg);
944 
945   B.setInsertPt(*BodyBB, BodyBB->end());
946 
947   // Update EXEC, switch all done bits to 0 and all todo bits to 1.
948   B.buildInstr(XorTermOpc)
949     .addDef(ExecReg)
950     .addReg(ExecReg)
951     .addReg(NewExec);
952 
953   // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
954   // s_cbranch_scc0?
955 
956   // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
957   B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);
958 
959   // Save the EXEC mask before the loop.
960   BuildMI(MBB, MBB.end(), DL, TII->get(MovExecOpc), SaveExecReg)
961     .addReg(ExecReg);
962 
963   // Restore the EXEC mask after the loop.
964   B.setMBB(*RestoreExecBB);
965   B.buildInstr(MovExecTermOpc)
966     .addDef(ExecReg)
967     .addReg(SaveExecReg);
968 
969   // Set the insert point after the original instruction, so any new
970   // instructions will be in the remainder.
971   B.setInsertPt(*RemainderBB, RemainderBB->begin());
972 
973   return true;
974 }
975 
976 // Return any unique registers used by \p MI at \p OpIndices that need to be
977 // handled in a waterfall loop. Returns these registers in \p
978 // SGPROperandRegs. Returns true if there are any operands to handle and a
979 // waterfall loop is necessary.
980 bool AMDGPURegisterBankInfo::collectWaterfallOperands(
981   SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI,
982   MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const {
983   for (unsigned Op : OpIndices) {
984     assert(MI.getOperand(Op).isUse());
985     Register Reg = MI.getOperand(Op).getReg();
986     const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
987     if (OpBank->getID() != AMDGPU::SGPRRegBankID)
988       SGPROperandRegs.insert(Reg);
989   }
990 
991   // No operands need to be replaced, so no need to loop.
992   return !SGPROperandRegs.empty();
993 }
994 
995 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
996     MachineIRBuilder &B, MachineInstr &MI, ArrayRef<unsigned> OpIndices) const {
997   // Use a set to avoid extra readfirstlanes in the case where multiple operands
998   // are the same register.
999   SmallSet<Register, 4> SGPROperandRegs;
1000 
1001   if (!collectWaterfallOperands(SGPROperandRegs, MI, *B.getMRI(), OpIndices))
1002     return false;
1003 
1004   MachineBasicBlock::iterator I = MI.getIterator();
1005   return executeInWaterfallLoop(B, make_range(I, std::next(I)),
1006                                 SGPROperandRegs);
1007 }
1008 
1009 // Legalize an operand that must be an SGPR by inserting a readfirstlane.
1010 void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
1011     MachineIRBuilder &B, MachineInstr &MI, unsigned OpIdx) const {
1012   Register Reg = MI.getOperand(OpIdx).getReg();
1013   MachineRegisterInfo &MRI = *B.getMRI();
1014   const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
1015   if (Bank == &AMDGPU::SGPRRegBank)
1016     return;
1017 
1018   Reg = buildReadFirstLane(B, MRI, Reg);
1019   MI.getOperand(OpIdx).setReg(Reg);
1020 }
1021 
1022 /// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the
1023 /// rest will be in the remainder.
1024 static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) {
1025   unsigned TotalSize = Ty.getSizeInBits();
1026   if (!Ty.isVector())
1027     return {LLT::scalar(FirstSize), LLT::scalar(TotalSize - FirstSize)};
1028 
1029   LLT EltTy = Ty.getElementType();
1030   unsigned EltSize = EltTy.getSizeInBits();
1031   assert(FirstSize % EltSize == 0);
1032 
1033   unsigned FirstPartNumElts = FirstSize / EltSize;
1034   unsigned RemainderElts = (TotalSize - FirstSize) / EltSize;
1035 
1036   return {LLT::scalarOrVector(ElementCount::getFixed(FirstPartNumElts), EltTy),
1037           LLT::scalarOrVector(ElementCount::getFixed(RemainderElts), EltTy)};
1038 }
1039 
1040 static LLT widen96To128(LLT Ty) {
1041   if (!Ty.isVector())
1042     return LLT::scalar(128);
1043 
1044   LLT EltTy = Ty.getElementType();
1045   assert(128 % EltTy.getSizeInBits() == 0);
1046   return LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);
1047 }
1048 
1049 bool AMDGPURegisterBankInfo::applyMappingLoad(
1050     MachineIRBuilder &B,
1051     const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1052     MachineInstr &MI) const {
1053   MachineRegisterInfo &MRI = *B.getMRI();
1054   Register DstReg = MI.getOperand(0).getReg();
1055   const LLT LoadTy = MRI.getType(DstReg);
1056   unsigned LoadSize = LoadTy.getSizeInBits();
1057   const unsigned MaxNonSmrdLoadSize = 128;
1058 
1059   const RegisterBank *DstBank =
1060       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1061   if (DstBank == &AMDGPU::SGPRRegBank) {
1062     // There are some special cases that we need to look at for 32 bit and 96
1063     // bit SGPR loads otherwise we have nothing to do.
1064     if (LoadSize != 32 && (LoadSize != 96 || Subtarget.hasScalarDwordx3Loads()))
1065       return false;
1066 
1067     MachineMemOperand *MMO = *MI.memoperands_begin();
1068     const unsigned MemSize = 8 * MMO->getSize();
1069     // Scalar loads of size 8 or 16 bit with proper alignment may be widened to
1070     // 32 bit. Check to see if we need to widen the memory access, 8 or 16 bit
1071     // scalar loads should have a load size of 32 but memory access size of less
1072     // than 32.
1073     if (LoadSize == 32 &&
1074         (MemSize == 32 || LoadTy.isVector() || !isScalarLoadLegal(MI)))
1075       return false;
1076 
1077     Register PtrReg = MI.getOperand(1).getReg();
1078 
1079     ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank);
1080 
1081     if (LoadSize == 32) {
1082       // This is an extending load from a sub-dword size. Widen the memory
1083       // access size to 4 bytes and clear the extra high bits appropriately
1084       const LLT S32 = LLT::scalar(32);
1085       if (MI.getOpcode() == AMDGPU::G_SEXTLOAD) {
1086         // Must extend the sign bit into higher bits for a G_SEXTLOAD
1087         auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
1088         B.buildSExtInReg(MI.getOperand(0), WideLoad, MemSize);
1089       } else if (MI.getOpcode() == AMDGPU::G_ZEXTLOAD) {
1090         // Must extend zero into higher bits with an AND for a G_ZEXTLOAD
1091         auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
1092         B.buildZExtInReg(MI.getOperand(0), WideLoad, MemSize);
1093       } else
1094         // We do not need to touch the higher bits for regular loads.
1095         B.buildLoadFromOffset(MI.getOperand(0), PtrReg, *MMO, 0);
1096     } else {
1097       // 96-bit loads are only available for vector loads. We need to split this
1098       // into a 64-bit part, and 32 (unless we can widen to a 128-bit load).
1099       if (MMO->getAlign() < Align(16)) {
1100         LegalizerHelper Helper(B.getMF(), ApplyBank, B);
1101         LLT Part64, Part32;
1102         std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64);
1103         if (Helper.reduceLoadStoreWidth(cast<GAnyLoad>(MI), 0, Part64) !=
1104             LegalizerHelper::Legalized)
1105           return false;
1106         return true;
1107       } else {
1108         LLT WiderTy = widen96To128(LoadTy);
1109         auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0);
1110         if (WiderTy.isScalar())
1111           B.buildTrunc(MI.getOperand(0), WideLoad);
1112         else {
1113           B.buildDeleteTrailingVectorElements(MI.getOperand(0).getReg(),
1114                                               WideLoad);
1115         }
1116       }
1117     }
1118 
1119     MI.eraseFromParent();
1120     return true;
1121   }
1122 
1123   // 128-bit loads are supported for all instruction types.
1124   if (LoadSize <= MaxNonSmrdLoadSize)
1125     return false;
1126 
1127   SmallVector<Register, 16> DefRegs(OpdMapper.getVRegs(0));
1128   SmallVector<Register, 1> SrcRegs(OpdMapper.getVRegs(1));
1129 
1130   if (SrcRegs.empty())
1131     SrcRegs.push_back(MI.getOperand(1).getReg());
1132 
1133   assert(LoadSize % MaxNonSmrdLoadSize == 0);
1134 
1135   // RegBankSelect only emits scalar types, so we need to reset the pointer
1136   // operand to a pointer type.
1137   Register BasePtrReg = SrcRegs[0];
1138   LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
1139   MRI.setType(BasePtrReg, PtrTy);
1140 
1141   unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize;
1142   const LLT LoadSplitTy = LoadTy.divide(NumSplitParts);
1143   ApplyRegBankMapping O(B, *this, MRI, &AMDGPU::VGPRRegBank);
1144   LegalizerHelper Helper(B.getMF(), O, B);
1145 
1146   if (LoadTy.isVector()) {
1147     if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
1148       return false;
1149   } else {
1150     if (Helper.narrowScalar(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
1151       return false;
1152   }
1153 
1154   MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
1155   return true;
1156 }
1157 
1158 bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc(
1159     MachineIRBuilder &B,
1160     const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1161     MachineInstr &MI) const {
1162   MachineRegisterInfo &MRI = *B.getMRI();
1163   const MachineFunction &MF = B.getMF();
1164   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1165   const auto &TFI = *ST.getFrameLowering();
1166 
1167   // Guard in case the stack growth direction ever changes with scratch
1168   // instructions.
1169   if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown)
1170     return false;
1171 
1172   Register Dst = MI.getOperand(0).getReg();
1173   Register AllocSize = MI.getOperand(1).getReg();
1174   Align Alignment = assumeAligned(MI.getOperand(2).getImm());
1175 
1176   const RegisterBank *SizeBank = getRegBank(AllocSize, MRI, *TRI);
1177 
1178   // TODO: Need to emit a wave reduction to get the maximum size.
1179   if (SizeBank != &AMDGPU::SGPRRegBank)
1180     return false;
1181 
1182   LLT PtrTy = MRI.getType(Dst);
1183   LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
1184 
1185   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1186   Register SPReg = Info->getStackPtrOffsetReg();
1187   ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::SGPRRegBank);
1188 
1189   auto WaveSize = B.buildConstant(LLT::scalar(32), ST.getWavefrontSizeLog2());
1190   auto ScaledSize = B.buildShl(IntPtrTy, AllocSize, WaveSize);
1191 
1192   auto SPCopy = B.buildCopy(PtrTy, SPReg);
1193   if (Alignment > TFI.getStackAlign()) {
1194     auto PtrAdd = B.buildPtrAdd(PtrTy, SPCopy, ScaledSize);
1195     B.buildMaskLowPtrBits(Dst, PtrAdd,
1196                           Log2(Alignment) + ST.getWavefrontSizeLog2());
1197   } else {
1198     B.buildPtrAdd(Dst, SPCopy, ScaledSize);
1199   }
1200 
1201   MI.eraseFromParent();
1202   return true;
1203 }
1204 
1205 bool AMDGPURegisterBankInfo::applyMappingImage(
1206     MachineIRBuilder &B, MachineInstr &MI,
1207     const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1208     int RsrcIdx) const {
1209   const int NumDefs = MI.getNumExplicitDefs();
1210 
1211   // The reported argument index is relative to the IR intrinsic call arguments,
1212   // so we need to shift by the number of defs and the intrinsic ID.
1213   RsrcIdx += NumDefs + 1;
1214 
1215   // Insert copies to VGPR arguments.
1216   applyDefaultMapping(OpdMapper);
1217 
1218   // Fixup any SGPR arguments.
1219   SmallVector<unsigned, 4> SGPRIndexes;
1220   for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) {
1221     if (!MI.getOperand(I).isReg())
1222       continue;
1223 
1224     // If this intrinsic has a sampler, it immediately follows rsrc.
1225     if (I == RsrcIdx || I == RsrcIdx + 1)
1226       SGPRIndexes.push_back(I);
1227   }
1228 
1229   executeInWaterfallLoop(B, MI, SGPRIndexes);
1230   return true;
1231 }
1232 
1233 // Analyze a combined offset from an llvm.amdgcn.s.buffer intrinsic and store
1234 // the three offsets (voffset, soffset and instoffset)
1235 unsigned AMDGPURegisterBankInfo::setBufferOffsets(
1236     MachineIRBuilder &B, Register CombinedOffset, Register &VOffsetReg,
1237     Register &SOffsetReg, int64_t &InstOffsetVal, Align Alignment) const {
1238   const LLT S32 = LLT::scalar(32);
1239   MachineRegisterInfo *MRI = B.getMRI();
1240 
1241   if (std::optional<int64_t> Imm =
1242           getIConstantVRegSExtVal(CombinedOffset, *MRI)) {
1243     uint32_t SOffset, ImmOffset;
1244     if (TII->splitMUBUFOffset(*Imm, SOffset, ImmOffset, Alignment)) {
1245       VOffsetReg = B.buildConstant(S32, 0).getReg(0);
1246       SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
1247       InstOffsetVal = ImmOffset;
1248 
1249       B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1250       B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1251       return SOffset + ImmOffset;
1252     }
1253   }
1254 
1255   Register Base;
1256   unsigned Offset;
1257 
1258   std::tie(Base, Offset) =
1259       AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset);
1260 
1261   uint32_t SOffset, ImmOffset;
1262   if ((int)Offset > 0 &&
1263       TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
1264     if (getRegBank(Base, *MRI, *TRI) == &AMDGPU::VGPRRegBank) {
1265       VOffsetReg = Base;
1266       SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
1267       B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1268       InstOffsetVal = ImmOffset;
1269       return 0; // XXX - Why is this 0?
1270     }
1271 
1272     // If we have SGPR base, we can use it for soffset.
1273     if (SOffset == 0) {
1274       VOffsetReg = B.buildConstant(S32, 0).getReg(0);
1275       B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1276       SOffsetReg = Base;
1277       InstOffsetVal = ImmOffset;
1278       return 0; // XXX - Why is this 0?
1279     }
1280   }
1281 
1282   // Handle the variable sgpr + vgpr case.
1283   MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI);
1284   if (Add && (int)Offset >= 0) {
1285     Register Src0 = getSrcRegIgnoringCopies(Add->getOperand(1).getReg(), *MRI);
1286     Register Src1 = getSrcRegIgnoringCopies(Add->getOperand(2).getReg(), *MRI);
1287 
1288     const RegisterBank *Src0Bank = getRegBank(Src0, *MRI, *TRI);
1289     const RegisterBank *Src1Bank = getRegBank(Src1, *MRI, *TRI);
1290 
1291     if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) {
1292       VOffsetReg = Src0;
1293       SOffsetReg = Src1;
1294       return 0;
1295     }
1296 
1297     if (Src0Bank == &AMDGPU::SGPRRegBank && Src1Bank == &AMDGPU::VGPRRegBank) {
1298       VOffsetReg = Src1;
1299       SOffsetReg = Src0;
1300       return 0;
1301     }
1302   }
1303 
1304   // Ensure we have a VGPR for the combined offset. This could be an issue if we
1305   // have an SGPR offset and a VGPR resource.
1306   if (getRegBank(CombinedOffset, *MRI, *TRI) == &AMDGPU::VGPRRegBank) {
1307     VOffsetReg = CombinedOffset;
1308   } else {
1309     VOffsetReg = B.buildCopy(S32, CombinedOffset).getReg(0);
1310     B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1311   }
1312 
1313   SOffsetReg = B.buildConstant(S32, 0).getReg(0);
1314   B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1315   return 0;
1316 }
1317 
1318 bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
1319     MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
1320   MachineInstr &MI = OpdMapper.getMI();
1321   MachineRegisterInfo &MRI = OpdMapper.getMRI();
1322 
1323   const LLT S32 = LLT::scalar(32);
1324   Register Dst = MI.getOperand(0).getReg();
1325   LLT Ty = MRI.getType(Dst);
1326 
1327   const RegisterBank *RSrcBank =
1328     OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1329   const RegisterBank *OffsetBank =
1330     OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1331   if (RSrcBank == &AMDGPU::SGPRRegBank &&
1332       OffsetBank == &AMDGPU::SGPRRegBank)
1333     return true; // Legal mapping
1334 
1335   // FIXME: 96-bit case was widened during legalize. We need to narrow it back
1336   // here but don't have an MMO.
1337 
1338   unsigned LoadSize = Ty.getSizeInBits();
1339   int NumLoads = 1;
1340   if (LoadSize == 256 || LoadSize == 512) {
1341     NumLoads = LoadSize / 128;
1342     Ty = Ty.divide(NumLoads);
1343   }
1344 
1345   // Use the alignment to ensure that the required offsets will fit into the
1346   // immediate offsets.
1347   const Align Alignment = NumLoads > 1 ? Align(16 * NumLoads) : Align(1);
1348 
1349   MachineFunction &MF = B.getMF();
1350 
1351   Register SOffset;
1352   Register VOffset;
1353   int64_t ImmOffset = 0;
1354 
1355   unsigned MMOOffset = setBufferOffsets(B, MI.getOperand(2).getReg(), VOffset,
1356                                         SOffset, ImmOffset, Alignment);
1357 
1358   // TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we
1359   // can, but we need to track an MMO for that.
1360   const unsigned MemSize = (Ty.getSizeInBits() + 7) / 8;
1361   const Align MemAlign(4); // FIXME: ABI type alignment?
1362   MachineMemOperand *BaseMMO = MF.getMachineMemOperand(
1363     MachinePointerInfo(),
1364     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1365     MachineMemOperand::MOInvariant,
1366     MemSize, MemAlign);
1367   if (MMOOffset != 0)
1368     BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset, MemSize);
1369 
1370   // If only the offset is divergent, emit a MUBUF buffer load instead. We can
1371   // assume that the buffer is unswizzled.
1372 
1373   Register RSrc = MI.getOperand(1).getReg();
1374   Register VIndex = B.buildConstant(S32, 0).getReg(0);
1375   B.getMRI()->setRegBank(VIndex, AMDGPU::VGPRRegBank);
1376 
1377   SmallVector<Register, 4> LoadParts(NumLoads);
1378 
1379   MachineBasicBlock::iterator MII = MI.getIterator();
1380   MachineInstrSpan Span(MII, &B.getMBB());
1381 
1382   for (int i = 0; i < NumLoads; ++i) {
1383     if (NumLoads == 1) {
1384       LoadParts[i] = Dst;
1385     } else {
1386       LoadParts[i] = MRI.createGenericVirtualRegister(Ty);
1387       MRI.setRegBank(LoadParts[i], AMDGPU::VGPRRegBank);
1388     }
1389 
1390     MachineMemOperand *MMO = BaseMMO;
1391     if (i != 0)
1392       BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset + 16 * i, MemSize);
1393 
1394     B.buildInstr(AMDGPU::G_AMDGPU_BUFFER_LOAD)
1395       .addDef(LoadParts[i])       // vdata
1396       .addUse(RSrc)               // rsrc
1397       .addUse(VIndex)             // vindex
1398       .addUse(VOffset)            // voffset
1399       .addUse(SOffset)            // soffset
1400       .addImm(ImmOffset + 16 * i) // offset(imm)
1401       .addImm(0)                  // cachepolicy, swizzled buffer(imm)
1402       .addImm(0)                  // idxen(imm)
1403       .addMemOperand(MMO);
1404   }
1405 
1406   // TODO: If only the resource is a VGPR, it may be better to execute the
1407   // scalar load in the waterfall loop if the resource is expected to frequently
1408   // be dynamically uniform.
1409   if (RSrcBank != &AMDGPU::SGPRRegBank) {
1410     // Remove the original instruction to avoid potentially confusing the
1411     // waterfall loop logic.
1412     B.setInstr(*Span.begin());
1413     MI.eraseFromParent();
1414 
1415     SmallSet<Register, 4> OpsToWaterfall;
1416 
1417     OpsToWaterfall.insert(RSrc);
1418     executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
1419                            OpsToWaterfall);
1420   }
1421 
1422   if (NumLoads != 1) {
1423     if (Ty.isVector())
1424       B.buildConcatVectors(Dst, LoadParts);
1425     else
1426       B.buildMergeLikeInstr(Dst, LoadParts);
1427   }
1428 
1429   // We removed the instruction earlier with a waterfall loop.
1430   if (RSrcBank == &AMDGPU::SGPRRegBank)
1431     MI.eraseFromParent();
1432 
1433   return true;
1434 }
1435 
1436 bool AMDGPURegisterBankInfo::applyMappingBFE(MachineIRBuilder &B,
1437                                              const OperandsMapper &OpdMapper,
1438                                              bool Signed) const {
1439   MachineInstr &MI = OpdMapper.getMI();
1440   MachineRegisterInfo &MRI = OpdMapper.getMRI();
1441 
1442   // Insert basic copies
1443   applyDefaultMapping(OpdMapper);
1444 
1445   Register DstReg = MI.getOperand(0).getReg();
1446   LLT Ty = MRI.getType(DstReg);
1447 
1448   const LLT S32 = LLT::scalar(32);
1449 
1450   unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
1451   Register SrcReg = MI.getOperand(FirstOpnd).getReg();
1452   Register OffsetReg = MI.getOperand(FirstOpnd + 1).getReg();
1453   Register WidthReg = MI.getOperand(FirstOpnd + 2).getReg();
1454 
1455   const RegisterBank *DstBank =
1456     OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1457   if (DstBank == &AMDGPU::VGPRRegBank) {
1458     if (Ty == S32)
1459       return true;
1460 
1461     // There is no 64-bit vgpr bitfield extract instructions so the operation
1462     // is expanded to a sequence of instructions that implement the operation.
1463     ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::VGPRRegBank);
1464 
1465     const LLT S64 = LLT::scalar(64);
1466     // Shift the source operand so that extracted bits start at bit 0.
1467     auto ShiftOffset = Signed ? B.buildAShr(S64, SrcReg, OffsetReg)
1468                               : B.buildLShr(S64, SrcReg, OffsetReg);
1469     auto UnmergeSOffset = B.buildUnmerge({S32, S32}, ShiftOffset);
1470 
1471     // A 64-bit bitfield extract uses the 32-bit bitfield extract instructions
1472     // if the width is a constant.
1473     if (auto ConstWidth = getIConstantVRegValWithLookThrough(WidthReg, MRI)) {
1474       // Use the 32-bit bitfield extract instruction if the width is a constant.
1475       // Depending on the width size, use either the low or high 32-bits.
1476       auto Zero = B.buildConstant(S32, 0);
1477       auto WidthImm = ConstWidth->Value.getZExtValue();
1478       if (WidthImm <= 32) {
1479         // Use bitfield extract on the lower 32-bit source, and then sign-extend
1480         // or clear the upper 32-bits.
1481         auto Extract =
1482             Signed ? B.buildSbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg)
1483                    : B.buildUbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg);
1484         auto Extend =
1485             Signed ? B.buildAShr(S32, Extract, B.buildConstant(S32, 31)) : Zero;
1486         B.buildMergeLikeInstr(DstReg, {Extract, Extend});
1487       } else {
1488         // Use bitfield extract on upper 32-bit source, and combine with lower
1489         // 32-bit source.
1490         auto UpperWidth = B.buildConstant(S32, WidthImm - 32);
1491         auto Extract =
1492             Signed
1493                 ? B.buildSbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth)
1494                 : B.buildUbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth);
1495         B.buildMergeLikeInstr(DstReg, {UnmergeSOffset.getReg(0), Extract});
1496       }
1497       MI.eraseFromParent();
1498       return true;
1499     }
1500 
1501     // Expand to Src >> Offset << (64 - Width) >> (64 - Width) using 64-bit
1502     // operations.
1503     auto ExtShift = B.buildSub(S32, B.buildConstant(S32, 64), WidthReg);
1504     auto SignBit = B.buildShl(S64, ShiftOffset, ExtShift);
1505     if (Signed)
1506       B.buildAShr(S64, SignBit, ExtShift);
1507     else
1508       B.buildLShr(S64, SignBit, ExtShift);
1509     MI.eraseFromParent();
1510     return true;
1511   }
1512 
1513   // The scalar form packs the offset and width in a single operand.
1514 
1515   ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::SGPRRegBank);
1516 
1517   // Ensure the high bits are clear to insert the offset.
1518   auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));
1519   auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);
1520 
1521   // Zeros out the low bits, so don't bother clamping the input value.
1522   auto ShiftWidth = B.buildShl(S32, WidthReg, B.buildConstant(S32, 16));
1523 
1524   // Transformation function, pack the offset and width of a BFE into
1525   // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
1526   // source, bits [5:0] contain the offset and bits [22:16] the width.
1527   auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth);
1528 
1529   // TODO: It might be worth using a pseudo here to avoid scc clobber and
1530   // register class constraints.
1531   unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) :
1532                              (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);
1533 
1534   auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs});
1535   if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
1536     llvm_unreachable("failed to constrain BFE");
1537 
1538   MI.eraseFromParent();
1539   return true;
1540 }
1541 
1542 bool AMDGPURegisterBankInfo::applyMappingMAD_64_32(
1543     MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
1544   MachineInstr &MI = OpdMapper.getMI();
1545   MachineRegisterInfo &MRI = OpdMapper.getMRI();
1546 
1547   // Insert basic copies.
1548   applyDefaultMapping(OpdMapper);
1549 
1550   Register Dst0 = MI.getOperand(0).getReg();
1551   Register Dst1 = MI.getOperand(1).getReg();
1552   Register Src0 = MI.getOperand(2).getReg();
1553   Register Src1 = MI.getOperand(3).getReg();
1554   Register Src2 = MI.getOperand(4).getReg();
1555 
1556   if (MRI.getRegBankOrNull(Src0) == &AMDGPU::VGPRRegBank)
1557     return true;
1558 
1559   bool IsUnsigned = MI.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
1560   LLT S1 = LLT::scalar(1);
1561   LLT S32 = LLT::scalar(32);
1562 
1563   bool DstOnValu = MRI.getRegBankOrNull(Src2) == &AMDGPU::VGPRRegBank;
1564   bool Accumulate = true;
1565 
1566   if (!DstOnValu) {
1567     if (mi_match(Src2, MRI, m_ZeroInt()))
1568       Accumulate = false;
1569   }
1570 
1571   // Keep the multiplication on the SALU.
1572   Register DstHi;
1573   Register DstLo = B.buildMul(S32, Src0, Src1).getReg(0);
1574   bool MulHiInVgpr = false;
1575 
1576   MRI.setRegBank(DstLo, AMDGPU::SGPRRegBank);
1577 
1578   if (Subtarget.hasSMulHi()) {
1579     DstHi = IsUnsigned ? B.buildUMulH(S32, Src0, Src1).getReg(0)
1580                        : B.buildSMulH(S32, Src0, Src1).getReg(0);
1581     MRI.setRegBank(DstHi, AMDGPU::SGPRRegBank);
1582   } else {
1583     Register VSrc0 = B.buildCopy(S32, Src0).getReg(0);
1584     Register VSrc1 = B.buildCopy(S32, Src1).getReg(0);
1585 
1586     MRI.setRegBank(VSrc0, AMDGPU::VGPRRegBank);
1587     MRI.setRegBank(VSrc1, AMDGPU::VGPRRegBank);
1588 
1589     DstHi = IsUnsigned ? B.buildUMulH(S32, VSrc0, VSrc1).getReg(0)
1590                        : B.buildSMulH(S32, VSrc0, VSrc1).getReg(0);
1591     MRI.setRegBank(DstHi, AMDGPU::VGPRRegBank);
1592 
1593     if (!DstOnValu) {
1594       DstHi = buildReadFirstLane(B, MRI, DstHi);
1595     } else {
1596       MulHiInVgpr = true;
1597     }
1598   }
1599 
1600   // Accumulate and produce the "carry-out" bit.
1601   //
1602   // The "carry-out" is defined as bit 64 of the result when computed as a
1603   // big integer. For unsigned multiply-add, this matches the usual definition
1604   // of carry-out. For signed multiply-add, bit 64 is the sign bit of the
1605   // result, which is determined as:
1606   //   sign(Src0 * Src1) + sign(Src2) + carry-out from unsigned 64-bit add
1607   LLT CarryType = DstOnValu ? S1 : S32;
1608   const RegisterBank &CarryBank =
1609       DstOnValu ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
1610   const RegisterBank &DstBank =
1611       DstOnValu ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank;
1612   Register Carry;
1613   Register Zero;
1614 
1615   if (!IsUnsigned) {
1616     Zero = B.buildConstant(S32, 0).getReg(0);
1617     MRI.setRegBank(Zero,
1618                    MulHiInVgpr ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank);
1619 
1620     Carry = B.buildICmp(CmpInst::ICMP_SLT, MulHiInVgpr ? S1 : S32, DstHi, Zero)
1621                 .getReg(0);
1622     MRI.setRegBank(Carry, MulHiInVgpr ? AMDGPU::VCCRegBank
1623                                       : AMDGPU::SGPRRegBank);
1624 
1625     if (DstOnValu && !MulHiInVgpr) {
1626       Carry = B.buildTrunc(S1, Carry).getReg(0);
1627       MRI.setRegBank(Carry, AMDGPU::VCCRegBank);
1628     }
1629   }
1630 
1631   if (Accumulate) {
1632     if (DstOnValu) {
1633       DstLo = B.buildCopy(S32, DstLo).getReg(0);
1634       DstHi = B.buildCopy(S32, DstHi).getReg(0);
1635       MRI.setRegBank(DstLo, AMDGPU::VGPRRegBank);
1636       MRI.setRegBank(DstHi, AMDGPU::VGPRRegBank);
1637     }
1638 
1639     auto Unmerge = B.buildUnmerge(S32, Src2);
1640     Register Src2Lo = Unmerge.getReg(0);
1641     Register Src2Hi = Unmerge.getReg(1);
1642     MRI.setRegBank(Src2Lo, DstBank);
1643     MRI.setRegBank(Src2Hi, DstBank);
1644 
1645     if (!IsUnsigned) {
1646       auto Src2Sign = B.buildICmp(CmpInst::ICMP_SLT, CarryType, Src2Hi, Zero);
1647       MRI.setRegBank(Src2Sign.getReg(0), CarryBank);
1648 
1649       Carry = B.buildXor(CarryType, Carry, Src2Sign).getReg(0);
1650       MRI.setRegBank(Carry, CarryBank);
1651     }
1652 
1653     auto AddLo = B.buildUAddo(S32, CarryType, DstLo, Src2Lo);
1654     DstLo = AddLo.getReg(0);
1655     Register CarryLo = AddLo.getReg(1);
1656     MRI.setRegBank(DstLo, DstBank);
1657     MRI.setRegBank(CarryLo, CarryBank);
1658 
1659     auto AddHi = B.buildUAdde(S32, CarryType, DstHi, Src2Hi, CarryLo);
1660     DstHi = AddHi.getReg(0);
1661     MRI.setRegBank(DstHi, DstBank);
1662 
1663     Register CarryHi = AddHi.getReg(1);
1664     MRI.setRegBank(CarryHi, CarryBank);
1665 
1666     if (IsUnsigned) {
1667       Carry = CarryHi;
1668     } else {
1669       Carry = B.buildXor(CarryType, Carry, CarryHi).getReg(0);
1670       MRI.setRegBank(Carry, CarryBank);
1671     }
1672   } else {
1673     if (IsUnsigned) {
1674       Carry = B.buildConstant(CarryType, 0).getReg(0);
1675       MRI.setRegBank(Carry, CarryBank);
1676     }
1677   }
1678 
1679   B.buildMergeLikeInstr(Dst0, {DstLo, DstHi});
1680 
1681   if (DstOnValu) {
1682     B.buildCopy(Dst1, Carry);
1683   } else {
1684     B.buildTrunc(Dst1, Carry);
1685   }
1686 
1687   MI.eraseFromParent();
1688   return true;
1689 }
1690 
1691 // Return a suitable opcode for extending the operands of Opc when widening.
1692 static unsigned getExtendOp(unsigned Opc) {
1693   switch (Opc) {
1694   case TargetOpcode::G_ASHR:
1695   case TargetOpcode::G_SMIN:
1696   case TargetOpcode::G_SMAX:
1697     return TargetOpcode::G_SEXT;
1698   case TargetOpcode::G_LSHR:
1699   case TargetOpcode::G_UMIN:
1700   case TargetOpcode::G_UMAX:
1701     return TargetOpcode::G_ZEXT;
1702   default:
1703     return TargetOpcode::G_ANYEXT;
1704   }
1705 }
1706 
1707 // Emit a legalized extension from <2 x s16> to 2 32-bit components, avoiding
1708 // any illegal vector extend or unmerge operations.
1709 static std::pair<Register, Register>
1710 unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) {
1711   const LLT S32 = LLT::scalar(32);
1712   auto Bitcast = B.buildBitcast(S32, Src);
1713 
1714   if (ExtOpcode == TargetOpcode::G_SEXT) {
1715     auto ExtLo = B.buildSExtInReg(S32, Bitcast, 16);
1716     auto ShiftHi = B.buildAShr(S32, Bitcast, B.buildConstant(S32, 16));
1717     return std::pair(ExtLo.getReg(0), ShiftHi.getReg(0));
1718   }
1719 
1720   auto ShiftHi = B.buildLShr(S32, Bitcast, B.buildConstant(S32, 16));
1721   if (ExtOpcode == TargetOpcode::G_ZEXT) {
1722     auto ExtLo = B.buildAnd(S32, Bitcast, B.buildConstant(S32, 0xffff));
1723     return std::pair(ExtLo.getReg(0), ShiftHi.getReg(0));
1724   }
1725 
1726   assert(ExtOpcode == TargetOpcode::G_ANYEXT);
1727   return std::pair(Bitcast.getReg(0), ShiftHi.getReg(0));
1728 }
1729 
1730 // For cases where only a single copy is inserted for matching register banks.
1731 // Replace the register in the instruction operand
1732 static bool substituteSimpleCopyRegs(
1733   const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {
1734   SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx));
1735   if (!SrcReg.empty()) {
1736     assert(SrcReg.size() == 1);
1737     OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]);
1738     return true;
1739   }
1740 
1741   return false;
1742 }
1743 
1744 /// Handle register layout difference for f16 images for some subtargets.
1745 Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B,
1746                                                 MachineRegisterInfo &MRI,
1747                                                 Register Reg) const {
1748   if (!Subtarget.hasUnpackedD16VMem())
1749     return Reg;
1750 
1751   const LLT S16 = LLT::scalar(16);
1752   LLT StoreVT = MRI.getType(Reg);
1753   if (!StoreVT.isVector() || StoreVT.getElementType() != S16)
1754     return Reg;
1755 
1756   auto Unmerge = B.buildUnmerge(S16, Reg);
1757 
1758 
1759   SmallVector<Register, 4> WideRegs;
1760   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
1761     WideRegs.push_back(Unmerge.getReg(I));
1762 
1763   const LLT S32 = LLT::scalar(32);
1764   int NumElts = StoreVT.getNumElements();
1765 
1766   return B.buildMergeLikeInstr(LLT::fixed_vector(NumElts, S32), WideRegs)
1767       .getReg(0);
1768 }
1769 
1770 static std::pair<Register, unsigned>
1771 getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
1772   int64_t Const;
1773   if (mi_match(Reg, MRI, m_ICst(Const)))
1774     return std::pair(Register(), Const);
1775 
1776   Register Base;
1777   if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const))))
1778     return std::pair(Base, Const);
1779 
1780   // TODO: Handle G_OR used for add case
1781   return std::pair(Reg, 0);
1782 }
1783 
1784 std::pair<Register, unsigned>
1785 AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B,
1786                                            Register OrigOffset) const {
1787   const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(Subtarget);
1788   Register BaseReg;
1789   unsigned ImmOffset;
1790   const LLT S32 = LLT::scalar(32);
1791 
1792   // TODO: Use AMDGPU::getBaseWithConstantOffset() instead.
1793   std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(),
1794                                                            OrigOffset);
1795 
1796   unsigned C1 = 0;
1797   if (ImmOffset != 0) {
1798     // If the immediate value is too big for the immoffset field, put only bits
1799     // that would normally fit in the immoffset field. The remaining value that
1800     // is copied/added for the voffset field is a large power of 2, and it
1801     // stands more chance of being CSEd with the copy/add for another similar
1802     // load/store.
1803     // However, do not do that rounding down if that is a negative
1804     // number, as it appears to be illegal to have a negative offset in the
1805     // vgpr, even if adding the immediate offset makes it positive.
1806     unsigned Overflow = ImmOffset & ~MaxImm;
1807     ImmOffset -= Overflow;
1808     if ((int32_t)Overflow < 0) {
1809       Overflow += ImmOffset;
1810       ImmOffset = 0;
1811     }
1812 
1813     C1 = ImmOffset;
1814     if (Overflow != 0) {
1815       if (!BaseReg)
1816         BaseReg = B.buildConstant(S32, Overflow).getReg(0);
1817       else {
1818         auto OverflowVal = B.buildConstant(S32, Overflow);
1819         BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
1820       }
1821     }
1822   }
1823 
1824   if (!BaseReg)
1825     BaseReg = B.buildConstant(S32, 0).getReg(0);
1826 
1827   return {BaseReg, C1};
1828 }
1829 
1830 bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg,
1831                                         Register SrcReg) const {
1832   MachineRegisterInfo &MRI = *B.getMRI();
1833   LLT SrcTy = MRI.getType(SrcReg);
1834   if (SrcTy.getSizeInBits() == 32) {
1835     // Use a v_mov_b32 here to make the exec dependency explicit.
1836     B.buildInstr(AMDGPU::V_MOV_B32_e32)
1837       .addDef(DstReg)
1838       .addUse(SrcReg);
1839     return constrainGenericRegister(DstReg, AMDGPU::VGPR_32RegClass, MRI) &&
1840            constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI);
1841   }
1842 
1843   Register TmpReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1844   Register TmpReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1845 
1846   B.buildInstr(AMDGPU::V_MOV_B32_e32)
1847     .addDef(TmpReg0)
1848     .addUse(SrcReg, 0, AMDGPU::sub0);
1849   B.buildInstr(AMDGPU::V_MOV_B32_e32)
1850     .addDef(TmpReg1)
1851     .addUse(SrcReg, 0, AMDGPU::sub1);
1852   B.buildInstr(AMDGPU::REG_SEQUENCE)
1853     .addDef(DstReg)
1854     .addUse(TmpReg0)
1855     .addImm(AMDGPU::sub0)
1856     .addUse(TmpReg1)
1857     .addImm(AMDGPU::sub1);
1858 
1859   return constrainGenericRegister(SrcReg, AMDGPU::SReg_64RegClass, MRI) &&
1860          constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI);
1861 }
1862 
1863 /// Utility function for pushing dynamic vector indexes with a constant offset
1864 /// into waterfall loops.
1865 static void reinsertVectorIndexAdd(MachineIRBuilder &B,
1866                                    MachineInstr &IdxUseInstr,
1867                                    unsigned OpIdx,
1868                                    unsigned ConstOffset) {
1869   MachineRegisterInfo &MRI = *B.getMRI();
1870   const LLT S32 = LLT::scalar(32);
1871   Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg();
1872   B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator());
1873 
1874   auto MaterializedOffset = B.buildConstant(S32, ConstOffset);
1875 
1876   auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset);
1877   MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank);
1878   MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank);
1879   IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0));
1880 }
1881 
1882 /// Implement extending a 32-bit value to a 64-bit value. \p Lo32Reg is the
1883 /// original 32-bit source value (to be inserted in the low part of the combined
1884 /// 64-bit result), and \p Hi32Reg is the high half of the combined 64-bit
1885 /// value.
1886 static void extendLow32IntoHigh32(MachineIRBuilder &B,
1887                                   Register Hi32Reg, Register Lo32Reg,
1888                                   unsigned ExtOpc,
1889                                   const RegisterBank &RegBank,
1890                                   bool IsBooleanSrc = false) {
1891   if (ExtOpc == AMDGPU::G_ZEXT) {
1892     B.buildConstant(Hi32Reg, 0);
1893   } else if (ExtOpc == AMDGPU::G_SEXT) {
1894     if (IsBooleanSrc) {
1895       // If we know the original source was an s1, the high half is the same as
1896       // the low.
1897       B.buildCopy(Hi32Reg, Lo32Reg);
1898     } else {
1899       // Replicate sign bit from 32-bit extended part.
1900       auto ShiftAmt = B.buildConstant(LLT::scalar(32), 31);
1901       B.getMRI()->setRegBank(ShiftAmt.getReg(0), RegBank);
1902       B.buildAShr(Hi32Reg, Lo32Reg, ShiftAmt);
1903     }
1904   } else {
1905     assert(ExtOpc == AMDGPU::G_ANYEXT && "not an integer extension");
1906     B.buildUndef(Hi32Reg);
1907   }
1908 }
1909 
1910 bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect(
1911     MachineIRBuilder &B, MachineInstr &MI,
1912     const OperandsMapper &OpdMapper) const {
1913   MachineRegisterInfo &MRI = *B.getMRI();
1914 
1915   Register VecReg = MI.getOperand(1).getReg();
1916   Register Idx = MI.getOperand(2).getReg();
1917 
1918   const RegisterBank &IdxBank =
1919     *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1920 
1921   bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
1922 
1923   LLT VecTy = MRI.getType(VecReg);
1924   unsigned EltSize = VecTy.getScalarSizeInBits();
1925   unsigned NumElem = VecTy.getNumElements();
1926 
1927   if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
1928                                                   IsDivergentIdx, &Subtarget))
1929     return false;
1930 
1931   LLT S32 = LLT::scalar(32);
1932 
1933   const RegisterBank &DstBank =
1934     *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1935   const RegisterBank &SrcBank =
1936     *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1937 
1938   const RegisterBank &CCBank =
1939     (DstBank == AMDGPU::SGPRRegBank &&
1940      SrcBank == AMDGPU::SGPRRegBank &&
1941      IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
1942                                      : AMDGPU::VCCRegBank;
1943   LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
1944 
1945   if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
1946     Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
1947     MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
1948   }
1949 
1950   LLT EltTy = VecTy.getScalarType();
1951   SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
1952   unsigned NumLanes = DstRegs.size();
1953   if (!NumLanes)
1954     NumLanes = 1;
1955   else
1956     EltTy = MRI.getType(DstRegs[0]);
1957 
1958   auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
1959   SmallVector<Register, 2> Res(NumLanes);
1960   for (unsigned L = 0; L < NumLanes; ++L)
1961     Res[L] = UnmergeToEltTy.getReg(L);
1962 
1963   for (unsigned I = 1; I < NumElem; ++I) {
1964     auto IC = B.buildConstant(S32, I);
1965     MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
1966     auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
1967     MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
1968 
1969     for (unsigned L = 0; L < NumLanes; ++L) {
1970       auto S = B.buildSelect(EltTy, Cmp,
1971                              UnmergeToEltTy.getReg(I * NumLanes + L), Res[L]);
1972 
1973       for (unsigned N : { 0, 2, 3 })
1974         MRI.setRegBank(S->getOperand(N).getReg(), DstBank);
1975 
1976       Res[L] = S->getOperand(0).getReg();
1977     }
1978   }
1979 
1980   for (unsigned L = 0; L < NumLanes; ++L) {
1981     Register DstReg = (NumLanes == 1) ? MI.getOperand(0).getReg() : DstRegs[L];
1982     B.buildCopy(DstReg, Res[L]);
1983     MRI.setRegBank(DstReg, DstBank);
1984   }
1985 
1986   MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
1987   MI.eraseFromParent();
1988 
1989   return true;
1990 }
1991 
1992 // Insert a cross regbank copy for a register if it already has a bank that
1993 // differs from the one we want to set.
1994 static Register constrainRegToBank(MachineRegisterInfo &MRI,
1995                                    MachineIRBuilder &B, Register &Reg,
1996                                    const RegisterBank &Bank) {
1997   const RegisterBank *CurrBank = MRI.getRegBankOrNull(Reg);
1998   if (CurrBank && *CurrBank != Bank) {
1999     Register Copy = B.buildCopy(MRI.getType(Reg), Reg).getReg(0);
2000     MRI.setRegBank(Copy, Bank);
2001     return Copy;
2002   }
2003 
2004   MRI.setRegBank(Reg, Bank);
2005   return Reg;
2006 }
2007 
2008 bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(
2009     MachineIRBuilder &B, MachineInstr &MI,
2010     const OperandsMapper &OpdMapper) const {
2011 
2012   MachineRegisterInfo &MRI = *B.getMRI();
2013   Register VecReg = MI.getOperand(1).getReg();
2014   Register Idx = MI.getOperand(3).getReg();
2015 
2016   const RegisterBank &IdxBank =
2017     *OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
2018 
2019   bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
2020 
2021   LLT VecTy = MRI.getType(VecReg);
2022   unsigned EltSize = VecTy.getScalarSizeInBits();
2023   unsigned NumElem = VecTy.getNumElements();
2024 
2025   if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
2026                                                   IsDivergentIdx, &Subtarget))
2027     return false;
2028 
2029   LLT S32 = LLT::scalar(32);
2030 
2031   const RegisterBank &DstBank =
2032     *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2033   const RegisterBank &SrcBank =
2034     *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2035   const RegisterBank &InsBank =
2036     *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2037 
2038   const RegisterBank &CCBank =
2039     (DstBank == AMDGPU::SGPRRegBank &&
2040      SrcBank == AMDGPU::SGPRRegBank &&
2041      InsBank == AMDGPU::SGPRRegBank &&
2042      IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
2043                                      : AMDGPU::VCCRegBank;
2044   LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
2045 
2046   if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
2047     Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
2048     MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
2049   }
2050 
2051   LLT EltTy = VecTy.getScalarType();
2052   SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
2053   unsigned NumLanes = InsRegs.size();
2054   if (!NumLanes) {
2055     NumLanes = 1;
2056     InsRegs.push_back(MI.getOperand(2).getReg());
2057   } else {
2058     EltTy = MRI.getType(InsRegs[0]);
2059   }
2060 
2061   auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
2062   SmallVector<Register, 16> Ops(NumElem * NumLanes);
2063 
2064   for (unsigned I = 0; I < NumElem; ++I) {
2065     auto IC = B.buildConstant(S32, I);
2066     MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
2067     auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
2068     MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
2069 
2070     for (unsigned L = 0; L < NumLanes; ++L) {
2071       Register Op0 = constrainRegToBank(MRI, B, InsRegs[L], DstBank);
2072       Register Op1 = UnmergeToEltTy.getReg(I * NumLanes + L);
2073       Op1 = constrainRegToBank(MRI, B, Op1, DstBank);
2074 
2075       Register Select = B.buildSelect(EltTy, Cmp, Op0, Op1).getReg(0);
2076       MRI.setRegBank(Select, DstBank);
2077 
2078       Ops[I * NumLanes + L] = Select;
2079     }
2080   }
2081 
2082   LLT MergeTy = LLT::fixed_vector(Ops.size(), EltTy);
2083   if (MergeTy == MRI.getType(MI.getOperand(0).getReg())) {
2084     B.buildBuildVector(MI.getOperand(0), Ops);
2085   } else {
2086     auto Vec = B.buildBuildVector(MergeTy, Ops);
2087     MRI.setRegBank(Vec->getOperand(0).getReg(), DstBank);
2088     B.buildBitcast(MI.getOperand(0).getReg(), Vec);
2089   }
2090 
2091   MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
2092   MI.eraseFromParent();
2093 
2094   return true;
2095 }
2096 
2097 void AMDGPURegisterBankInfo::applyMappingImpl(
2098     MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
2099   MachineInstr &MI = OpdMapper.getMI();
2100   B.setInstrAndDebugLoc(MI);
2101   unsigned Opc = MI.getOpcode();
2102   MachineRegisterInfo &MRI = OpdMapper.getMRI();
2103   switch (Opc) {
2104   case AMDGPU::G_CONSTANT:
2105   case AMDGPU::G_IMPLICIT_DEF: {
2106     Register DstReg = MI.getOperand(0).getReg();
2107     LLT DstTy = MRI.getType(DstReg);
2108     if (DstTy != LLT::scalar(1))
2109       break;
2110 
2111     const RegisterBank *DstBank =
2112         OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2113     if (DstBank == &AMDGPU::VCCRegBank)
2114       break;
2115     SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0));
2116     if (DefRegs.empty())
2117       DefRegs.push_back(DstReg);
2118 
2119     B.setInsertPt(*MI.getParent(), ++MI.getIterator());
2120 
2121     Register NewDstReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
2122     LLVMContext &Ctx = B.getMF().getFunction().getContext();
2123 
2124     MI.getOperand(0).setReg(NewDstReg);
2125     if (Opc != AMDGPU::G_IMPLICIT_DEF) {
2126       uint64_t ConstVal = MI.getOperand(1).getCImm()->getZExtValue();
2127       MI.getOperand(1).setCImm(
2128           ConstantInt::get(IntegerType::getInt32Ty(Ctx), ConstVal));
2129     }
2130 
2131     MRI.setRegBank(NewDstReg, *DstBank);
2132     B.buildTrunc(DefRegs[0], NewDstReg);
2133     return;
2134   }
2135   case AMDGPU::G_PHI: {
2136     Register DstReg = MI.getOperand(0).getReg();
2137     LLT DstTy = MRI.getType(DstReg);
2138     if (DstTy != LLT::scalar(1))
2139       break;
2140 
2141     const LLT S32 = LLT::scalar(32);
2142     const RegisterBank *DstBank =
2143       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2144     if (DstBank == &AMDGPU::VCCRegBank) {
2145       applyDefaultMapping(OpdMapper);
2146       // The standard handling only considers the result register bank for
2147       // phis. For VCC, blindly inserting a copy when the phi is lowered will
2148       // produce an invalid copy. We can only copy with some kind of compare to
2149       // get a vector boolean result. Insert a register bank copy that will be
2150       // correctly lowered to a compare.
2151       for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
2152         Register SrcReg = MI.getOperand(I).getReg();
2153         const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
2154 
2155         if (SrcBank != &AMDGPU::VCCRegBank) {
2156           MachineBasicBlock *SrcMBB = MI.getOperand(I + 1).getMBB();
2157           B.setInsertPt(*SrcMBB, SrcMBB->getFirstTerminator());
2158 
2159           auto Copy = B.buildCopy(LLT::scalar(1), SrcReg);
2160           MRI.setRegBank(Copy.getReg(0), AMDGPU::VCCRegBank);
2161           MI.getOperand(I).setReg(Copy.getReg(0));
2162         }
2163       }
2164 
2165       return;
2166     }
2167 
2168     // Phi handling is strange and only considers the bank of the destination.
2169     substituteSimpleCopyRegs(OpdMapper, 0);
2170 
2171     // Promote SGPR/VGPR booleans to s32
2172     ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank);
2173     B.setInsertPt(B.getMBB(), MI);
2174     LegalizerHelper Helper(B.getMF(), ApplyBank, B);
2175 
2176     if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
2177       llvm_unreachable("widen scalar should have succeeded");
2178 
2179     return;
2180   }
2181   case AMDGPU::G_FCMP:
2182     if (!Subtarget.hasSALUFloatInsts())
2183       break;
2184     LLVM_FALLTHROUGH;
2185   case AMDGPU::G_ICMP:
2186   case AMDGPU::G_UADDO:
2187   case AMDGPU::G_USUBO:
2188   case AMDGPU::G_UADDE:
2189   case AMDGPU::G_SADDE:
2190   case AMDGPU::G_USUBE:
2191   case AMDGPU::G_SSUBE: {
2192     unsigned BoolDstOp =
2193         (Opc == AMDGPU::G_ICMP || Opc == AMDGPU::G_FCMP) ? 0 : 1;
2194     Register DstReg = MI.getOperand(BoolDstOp).getReg();
2195 
2196     const RegisterBank *DstBank =
2197       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2198     if (DstBank != &AMDGPU::SGPRRegBank)
2199       break;
2200 
2201     const bool HasCarryIn = MI.getNumOperands() == 5;
2202 
2203     // If this is a scalar compare, promote the result to s32, as the selection
2204     // will end up using a copy to a 32-bit vreg.
2205     const LLT S32 = LLT::scalar(32);
2206     Register NewDstReg = MRI.createGenericVirtualRegister(S32);
2207     MRI.setRegBank(NewDstReg, AMDGPU::SGPRRegBank);
2208     MI.getOperand(BoolDstOp).setReg(NewDstReg);
2209 
2210     if (HasCarryIn) {
2211       Register NewSrcReg = MRI.createGenericVirtualRegister(S32);
2212       MRI.setRegBank(NewSrcReg, AMDGPU::SGPRRegBank);
2213       B.buildZExt(NewSrcReg, MI.getOperand(4).getReg());
2214       MI.getOperand(4).setReg(NewSrcReg);
2215     }
2216 
2217     MachineBasicBlock *MBB = MI.getParent();
2218     B.setInsertPt(*MBB, std::next(MI.getIterator()));
2219 
2220     // If we had a constrained VCC result register, a copy was inserted to VCC
2221     // from SGPR.
2222     SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0));
2223     if (DefRegs.empty())
2224       DefRegs.push_back(DstReg);
2225     B.buildTrunc(DefRegs[0], NewDstReg);
2226     return;
2227   }
2228   case AMDGPU::G_SELECT: {
2229     Register DstReg = MI.getOperand(0).getReg();
2230     LLT DstTy = MRI.getType(DstReg);
2231 
2232     SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(1));
2233     if (CondRegs.empty())
2234       CondRegs.push_back(MI.getOperand(1).getReg());
2235     else {
2236       assert(CondRegs.size() == 1);
2237     }
2238 
2239     const RegisterBank *CondBank = getRegBank(CondRegs[0], MRI, *TRI);
2240     if (CondBank == &AMDGPU::SGPRRegBank) {
2241       const LLT S32 = LLT::scalar(32);
2242       Register NewCondReg = MRI.createGenericVirtualRegister(S32);
2243       MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
2244 
2245       MI.getOperand(1).setReg(NewCondReg);
2246       B.buildZExt(NewCondReg, CondRegs[0]);
2247     }
2248 
2249     if (DstTy.getSizeInBits() != 64)
2250       break;
2251 
2252     LLT HalfTy = getHalfSizedType(DstTy);
2253 
2254     SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2255     SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
2256     SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3));
2257 
2258     // All inputs are SGPRs, nothing special to do.
2259     if (DefRegs.empty()) {
2260       assert(Src1Regs.empty() && Src2Regs.empty());
2261       break;
2262     }
2263 
2264     if (Src1Regs.empty())
2265       split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
2266     else {
2267       setRegsToType(MRI, Src1Regs, HalfTy);
2268     }
2269 
2270     if (Src2Regs.empty())
2271       split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg());
2272     else
2273       setRegsToType(MRI, Src2Regs, HalfTy);
2274 
2275     setRegsToType(MRI, DefRegs, HalfTy);
2276 
2277     B.buildSelect(DefRegs[0], CondRegs[0], Src1Regs[0], Src2Regs[0]);
2278     B.buildSelect(DefRegs[1], CondRegs[0], Src1Regs[1], Src2Regs[1]);
2279 
2280     MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2281     MI.eraseFromParent();
2282     return;
2283   }
2284   case AMDGPU::G_BRCOND: {
2285     Register CondReg = MI.getOperand(0).getReg();
2286     // FIXME: Should use legalizer helper, but should change bool ext type.
2287     const RegisterBank *CondBank =
2288       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2289 
2290     if (CondBank == &AMDGPU::SGPRRegBank) {
2291       const LLT S32 = LLT::scalar(32);
2292       Register NewCondReg = MRI.createGenericVirtualRegister(S32);
2293       MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
2294 
2295       MI.getOperand(0).setReg(NewCondReg);
2296       B.buildZExt(NewCondReg, CondReg);
2297       return;
2298     }
2299 
2300     break;
2301   }
2302   case AMDGPU::G_AND:
2303   case AMDGPU::G_OR:
2304   case AMDGPU::G_XOR: {
2305     // 64-bit and is only available on the SALU, so split into 2 32-bit ops if
2306     // there is a VGPR input.
2307     Register DstReg = MI.getOperand(0).getReg();
2308     LLT DstTy = MRI.getType(DstReg);
2309 
2310     if (DstTy.getSizeInBits() == 1) {
2311       const RegisterBank *DstBank =
2312         OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2313       if (DstBank == &AMDGPU::VCCRegBank)
2314         break;
2315 
2316       MachineFunction *MF = MI.getParent()->getParent();
2317       ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank);
2318       LegalizerHelper Helper(*MF, ApplyBank, B);
2319 
2320       if (Helper.widenScalar(MI, 0, LLT::scalar(32)) !=
2321           LegalizerHelper::Legalized)
2322         llvm_unreachable("widen scalar should have succeeded");
2323       return;
2324     }
2325 
2326     if (DstTy.getSizeInBits() != 64)
2327       break;
2328 
2329     LLT HalfTy = getHalfSizedType(DstTy);
2330     SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2331     SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
2332     SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
2333 
2334     // All inputs are SGPRs, nothing special to do.
2335     if (DefRegs.empty()) {
2336       assert(Src0Regs.empty() && Src1Regs.empty());
2337       break;
2338     }
2339 
2340     assert(DefRegs.size() == 2);
2341     assert(Src0Regs.size() == Src1Regs.size() &&
2342            (Src0Regs.empty() || Src0Regs.size() == 2));
2343 
2344     // Depending on where the source registers came from, the generic code may
2345     // have decided to split the inputs already or not. If not, we still need to
2346     // extract the values.
2347 
2348     if (Src0Regs.empty())
2349       split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
2350     else
2351       setRegsToType(MRI, Src0Regs, HalfTy);
2352 
2353     if (Src1Regs.empty())
2354       split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
2355     else
2356       setRegsToType(MRI, Src1Regs, HalfTy);
2357 
2358     setRegsToType(MRI, DefRegs, HalfTy);
2359 
2360     B.buildInstr(Opc, {DefRegs[0]}, {Src0Regs[0], Src1Regs[0]});
2361     B.buildInstr(Opc, {DefRegs[1]}, {Src0Regs[1], Src1Regs[1]});
2362 
2363     MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2364     MI.eraseFromParent();
2365     return;
2366   }
2367   case AMDGPU::G_ABS: {
2368     Register SrcReg = MI.getOperand(1).getReg();
2369     const RegisterBank *SrcBank = MRI.getRegBankOrNull(SrcReg);
2370 
2371     // There is no VALU abs instruction so we need to replace it with a sub and
2372     // max combination.
2373     if (SrcBank && SrcBank == &AMDGPU::VGPRRegBank) {
2374       MachineFunction *MF = MI.getParent()->getParent();
2375       ApplyRegBankMapping Apply(B, *this, MRI, &AMDGPU::VGPRRegBank);
2376       LegalizerHelper Helper(*MF, Apply, B);
2377 
2378       if (Helper.lowerAbsToMaxNeg(MI) != LegalizerHelper::Legalized)
2379         llvm_unreachable("lowerAbsToMaxNeg should have succeeded");
2380       return;
2381     }
2382     [[fallthrough]];
2383   }
2384   case AMDGPU::G_ADD:
2385   case AMDGPU::G_SUB:
2386   case AMDGPU::G_MUL:
2387   case AMDGPU::G_SHL:
2388   case AMDGPU::G_LSHR:
2389   case AMDGPU::G_ASHR:
2390   case AMDGPU::G_SMIN:
2391   case AMDGPU::G_SMAX:
2392   case AMDGPU::G_UMIN:
2393   case AMDGPU::G_UMAX: {
2394     Register DstReg = MI.getOperand(0).getReg();
2395     LLT DstTy = MRI.getType(DstReg);
2396 
2397     // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
2398     // Packed 16-bit operations need to be scalarized and promoted.
2399     if (DstTy != LLT::scalar(16) && DstTy != LLT::fixed_vector(2, 16))
2400       break;
2401 
2402     const RegisterBank *DstBank =
2403       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2404     if (DstBank == &AMDGPU::VGPRRegBank)
2405       break;
2406 
2407     const LLT S32 = LLT::scalar(32);
2408     MachineBasicBlock *MBB = MI.getParent();
2409     MachineFunction *MF = MBB->getParent();
2410     ApplyRegBankMapping ApplySALU(B, *this, MRI, &AMDGPU::SGPRRegBank);
2411 
2412     if (DstTy.isVector() && Opc == AMDGPU::G_ABS) {
2413       Register WideSrcLo, WideSrcHi;
2414 
2415       std::tie(WideSrcLo, WideSrcHi) =
2416           unpackV2S16ToS32(B, MI.getOperand(1).getReg(), TargetOpcode::G_SEXT);
2417       auto Lo = B.buildInstr(AMDGPU::G_ABS, {S32}, {WideSrcLo});
2418       auto Hi = B.buildInstr(AMDGPU::G_ABS, {S32}, {WideSrcHi});
2419       B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)});
2420       MI.eraseFromParent();
2421       return;
2422     }
2423 
2424     if (DstTy.isVector()) {
2425       Register WideSrc0Lo, WideSrc0Hi;
2426       Register WideSrc1Lo, WideSrc1Hi;
2427 
2428       unsigned ExtendOp = getExtendOp(MI.getOpcode());
2429       std::tie(WideSrc0Lo, WideSrc0Hi)
2430         = unpackV2S16ToS32(B, MI.getOperand(1).getReg(), ExtendOp);
2431       std::tie(WideSrc1Lo, WideSrc1Hi)
2432         = unpackV2S16ToS32(B, MI.getOperand(2).getReg(), ExtendOp);
2433       auto Lo = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Lo, WideSrc1Lo});
2434       auto Hi = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Hi, WideSrc1Hi});
2435       B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)});
2436       MI.eraseFromParent();
2437     } else {
2438       LegalizerHelper Helper(*MF, ApplySALU, B);
2439 
2440       if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
2441         llvm_unreachable("widen scalar should have succeeded");
2442 
2443       // FIXME: s16 shift amounts should be legal.
2444       if (Opc == AMDGPU::G_SHL || Opc == AMDGPU::G_LSHR ||
2445           Opc == AMDGPU::G_ASHR) {
2446         B.setInsertPt(*MBB, MI.getIterator());
2447         if (Helper.widenScalar(MI, 1, S32) != LegalizerHelper::Legalized)
2448           llvm_unreachable("widen scalar should have succeeded");
2449       }
2450     }
2451 
2452     return;
2453   }
2454   case AMDGPU::G_SEXT_INREG: {
2455     SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
2456     if (SrcRegs.empty())
2457       break; // Nothing to repair
2458 
2459     const LLT S32 = LLT::scalar(32);
2460     ApplyRegBankMapping O(B, *this, MRI, &AMDGPU::VGPRRegBank);
2461 
2462     // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs
2463     // we would need to further expand, and doesn't let us directly set the
2464     // result registers.
2465     SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
2466 
2467     int Amt = MI.getOperand(2).getImm();
2468     if (Amt <= 32) {
2469       // Downstream users have expectations for the high bit behavior, so freeze
2470       // incoming undefined bits.
2471       if (Amt == 32) {
2472         // The low bits are unchanged.
2473         B.buildFreeze(DstRegs[0], SrcRegs[0]);
2474       } else {
2475         auto Freeze = B.buildFreeze(S32, SrcRegs[0]);
2476         // Extend in the low bits and propagate the sign bit to the high half.
2477         B.buildSExtInReg(DstRegs[0], Freeze, Amt);
2478       }
2479 
2480       B.buildAShr(DstRegs[1], DstRegs[0], B.buildConstant(S32, 31));
2481     } else {
2482       // The low bits are unchanged, and extend in the high bits.
2483       // No freeze required
2484       B.buildCopy(DstRegs[0], SrcRegs[0]);
2485       B.buildSExtInReg(DstRegs[1], DstRegs[0], Amt - 32);
2486     }
2487 
2488     Register DstReg = MI.getOperand(0).getReg();
2489     MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2490     MI.eraseFromParent();
2491     return;
2492   }
2493   case AMDGPU::G_CTPOP:
2494   case AMDGPU::G_BITREVERSE: {
2495     const RegisterBank *DstBank =
2496       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2497     if (DstBank == &AMDGPU::SGPRRegBank)
2498       break;
2499 
2500     Register SrcReg = MI.getOperand(1).getReg();
2501     const LLT S32 = LLT::scalar(32);
2502     LLT Ty = MRI.getType(SrcReg);
2503     if (Ty == S32)
2504       break;
2505 
2506     ApplyRegBankMapping ApplyVALU(B, *this, MRI, &AMDGPU::VGPRRegBank);
2507 
2508     MachineFunction &MF = B.getMF();
2509     LegalizerHelper Helper(MF, ApplyVALU, B);
2510 
2511     if (Helper.narrowScalar(MI, 1, S32) != LegalizerHelper::Legalized)
2512       llvm_unreachable("narrowScalar should have succeeded");
2513     return;
2514   }
2515   case AMDGPU::G_AMDGPU_FFBH_U32:
2516   case AMDGPU::G_AMDGPU_FFBL_B32:
2517   case AMDGPU::G_CTLZ_ZERO_UNDEF:
2518   case AMDGPU::G_CTTZ_ZERO_UNDEF: {
2519     const RegisterBank *DstBank =
2520         OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2521     if (DstBank == &AMDGPU::SGPRRegBank)
2522       break;
2523 
2524     Register SrcReg = MI.getOperand(1).getReg();
2525     const LLT S32 = LLT::scalar(32);
2526     LLT Ty = MRI.getType(SrcReg);
2527     if (Ty == S32)
2528       break;
2529 
2530     // We can narrow this more efficiently than Helper can by using ffbh/ffbl
2531     // which return -1 when the input is zero:
2532     // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
2533     // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
2534     // (ffbh hi:lo) -> (umin (ffbh hi), (uaddsat (ffbh lo), 32))
2535     // (ffbl hi:lo) -> (umin (uaddsat (ffbh hi), 32), (ffbh lo))
2536     ApplyRegBankMapping ApplyVALU(B, *this, MRI, &AMDGPU::VGPRRegBank);
2537     SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
2538     unsigned NewOpc = Opc == AMDGPU::G_CTLZ_ZERO_UNDEF
2539                           ? (unsigned)AMDGPU::G_AMDGPU_FFBH_U32
2540                           : Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
2541                                 ? (unsigned)AMDGPU::G_AMDGPU_FFBL_B32
2542                                 : Opc;
2543     unsigned Idx = NewOpc == AMDGPU::G_AMDGPU_FFBH_U32;
2544     auto X = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx]});
2545     auto Y = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx ^ 1]});
2546     unsigned AddOpc =
2547         Opc == AMDGPU::G_CTLZ_ZERO_UNDEF || Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
2548             ? AMDGPU::G_ADD
2549             : AMDGPU::G_UADDSAT;
2550     Y = B.buildInstr(AddOpc, {S32}, {Y, B.buildConstant(S32, 32)});
2551     Register DstReg = MI.getOperand(0).getReg();
2552     B.buildUMin(DstReg, X, Y);
2553     MI.eraseFromParent();
2554     return;
2555   }
2556   case AMDGPU::G_SEXT:
2557   case AMDGPU::G_ZEXT:
2558   case AMDGPU::G_ANYEXT: {
2559     Register SrcReg = MI.getOperand(1).getReg();
2560     LLT SrcTy = MRI.getType(SrcReg);
2561     const bool Signed = Opc == AMDGPU::G_SEXT;
2562 
2563     assert(OpdMapper.getVRegs(1).empty());
2564 
2565     const RegisterBank *SrcBank =
2566       OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2567 
2568     Register DstReg = MI.getOperand(0).getReg();
2569     LLT DstTy = MRI.getType(DstReg);
2570     if (DstTy.isScalar() &&
2571         SrcBank != &AMDGPU::SGPRRegBank &&
2572         SrcBank != &AMDGPU::VCCRegBank &&
2573         // FIXME: Should handle any type that round to s64 when irregular
2574         // breakdowns supported.
2575         DstTy.getSizeInBits() == 64 &&
2576         SrcTy.getSizeInBits() <= 32) {
2577       SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2578 
2579       // Extend to 32-bit, and then extend the low half.
2580       if (Signed) {
2581         // TODO: Should really be buildSExtOrCopy
2582         B.buildSExtOrTrunc(DefRegs[0], SrcReg);
2583       } else if (Opc == AMDGPU::G_ZEXT) {
2584         B.buildZExtOrTrunc(DefRegs[0], SrcReg);
2585       } else {
2586         B.buildAnyExtOrTrunc(DefRegs[0], SrcReg);
2587       }
2588 
2589       extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank);
2590       MRI.setRegBank(DstReg, *SrcBank);
2591       MI.eraseFromParent();
2592       return;
2593     }
2594 
2595     if (SrcTy != LLT::scalar(1))
2596       return;
2597 
2598     // It is not legal to have a legalization artifact with a VCC source. Rather
2599     // than introducing a copy, insert the select we would have to select the
2600     // copy to.
2601     if (SrcBank == &AMDGPU::VCCRegBank) {
2602       SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2603 
2604       const RegisterBank *DstBank = &AMDGPU::VGPRRegBank;
2605 
2606       unsigned DstSize = DstTy.getSizeInBits();
2607       // 64-bit select is SGPR only
2608       const bool UseSel64 = DstSize > 32 &&
2609         SrcBank->getID() == AMDGPU::SGPRRegBankID;
2610 
2611       // TODO: Should s16 select be legal?
2612       LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32);
2613       auto True = B.buildConstant(SelType, Signed ? -1 : 1);
2614       auto False = B.buildConstant(SelType, 0);
2615 
2616       MRI.setRegBank(True.getReg(0), *DstBank);
2617       MRI.setRegBank(False.getReg(0), *DstBank);
2618       MRI.setRegBank(DstReg, *DstBank);
2619 
2620       if (DstSize > 32) {
2621         B.buildSelect(DefRegs[0], SrcReg, True, False);
2622         extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank, true);
2623       } else if (DstSize < 32) {
2624         auto Sel = B.buildSelect(SelType, SrcReg, True, False);
2625         MRI.setRegBank(Sel.getReg(0), *DstBank);
2626         B.buildTrunc(DstReg, Sel);
2627       } else {
2628         B.buildSelect(DstReg, SrcReg, True, False);
2629       }
2630 
2631       MI.eraseFromParent();
2632       return;
2633     }
2634 
2635     break;
2636   }
2637   case AMDGPU::G_EXTRACT_VECTOR_ELT: {
2638     SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
2639 
2640     assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty());
2641 
2642     Register DstReg = MI.getOperand(0).getReg();
2643     Register SrcReg = MI.getOperand(1).getReg();
2644 
2645     const LLT S32 = LLT::scalar(32);
2646     LLT DstTy = MRI.getType(DstReg);
2647     LLT SrcTy = MRI.getType(SrcReg);
2648 
2649     if (foldExtractEltToCmpSelect(B, MI, OpdMapper))
2650       return;
2651 
2652     const ValueMapping &DstMapping
2653       = OpdMapper.getInstrMapping().getOperandMapping(0);
2654     const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank;
2655     const RegisterBank *SrcBank =
2656       OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2657     const RegisterBank *IdxBank =
2658         OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2659 
2660     Register BaseIdxReg;
2661     unsigned ConstOffset;
2662     std::tie(BaseIdxReg, ConstOffset) =
2663         AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(2).getReg());
2664 
2665     // See if the index is an add of a constant which will be foldable by moving
2666     // the base register of the index later if this is going to be executed in a
2667     // waterfall loop. This is essentially to reassociate the add of a constant
2668     // with the readfirstlane.
2669     bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
2670                                    ConstOffset > 0 &&
2671                                    ConstOffset < SrcTy.getNumElements();
2672 
2673     // Move the base register. We'll re-insert the add later.
2674     if (ShouldMoveIndexIntoLoop)
2675       MI.getOperand(2).setReg(BaseIdxReg);
2676 
2677     // If this is a VGPR result only because the index was a VGPR result, the
2678     // actual indexing will be done on the SGPR source vector, which will
2679     // produce a scalar result. We need to copy to the VGPR result inside the
2680     // waterfall loop.
2681     const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank &&
2682                                 SrcBank == &AMDGPU::SGPRRegBank;
2683     if (DstRegs.empty()) {
2684       applyDefaultMapping(OpdMapper);
2685 
2686       executeInWaterfallLoop(B, MI, {2});
2687 
2688       if (NeedCopyToVGPR) {
2689         // We don't want a phi for this temporary reg.
2690         Register TmpReg = MRI.createGenericVirtualRegister(DstTy);
2691         MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank);
2692         MI.getOperand(0).setReg(TmpReg);
2693         B.setInsertPt(*MI.getParent(), ++MI.getIterator());
2694 
2695         // Use a v_mov_b32 here to make the exec dependency explicit.
2696         buildVCopy(B, DstReg, TmpReg);
2697       }
2698 
2699       // Re-insert the constant offset add inside the waterfall loop.
2700       if (ShouldMoveIndexIntoLoop)
2701         reinsertVectorIndexAdd(B, MI, 2, ConstOffset);
2702 
2703       return;
2704     }
2705 
2706     assert(DstTy.getSizeInBits() == 64);
2707 
2708     LLT Vec32 = LLT::fixed_vector(2 * SrcTy.getNumElements(), 32);
2709 
2710     auto CastSrc = B.buildBitcast(Vec32, SrcReg);
2711     auto One = B.buildConstant(S32, 1);
2712 
2713     MachineBasicBlock::iterator MII = MI.getIterator();
2714 
2715     // Split the vector index into 32-bit pieces. Prepare to move all of the
2716     // new instructions into a waterfall loop if necessary.
2717     //
2718     // Don't put the bitcast or constant in the loop.
2719     MachineInstrSpan Span(MII, &B.getMBB());
2720 
2721     // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
2722     auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
2723     auto IdxHi = B.buildAdd(S32, IdxLo, One);
2724 
2725     auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo);
2726     auto Extract1 = B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi);
2727 
2728     MRI.setRegBank(DstReg, *DstBank);
2729     MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
2730     MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
2731     MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
2732     MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
2733 
2734     SmallSet<Register, 4> OpsToWaterfall;
2735     if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) {
2736       MI.eraseFromParent();
2737       return;
2738     }
2739 
2740     // Remove the original instruction to avoid potentially confusing the
2741     // waterfall loop logic.
2742     B.setInstr(*Span.begin());
2743     MI.eraseFromParent();
2744     executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
2745                            OpsToWaterfall);
2746 
2747     if (NeedCopyToVGPR) {
2748       MachineBasicBlock *LoopBB = Extract1->getParent();
2749       Register TmpReg0 = MRI.createGenericVirtualRegister(S32);
2750       Register TmpReg1 = MRI.createGenericVirtualRegister(S32);
2751       MRI.setRegBank(TmpReg0, AMDGPU::SGPRRegBank);
2752       MRI.setRegBank(TmpReg1, AMDGPU::SGPRRegBank);
2753 
2754       Extract0->getOperand(0).setReg(TmpReg0);
2755       Extract1->getOperand(0).setReg(TmpReg1);
2756 
2757       B.setInsertPt(*LoopBB, ++Extract1->getIterator());
2758 
2759       buildVCopy(B, DstRegs[0], TmpReg0);
2760       buildVCopy(B, DstRegs[1], TmpReg1);
2761     }
2762 
2763     if (ShouldMoveIndexIntoLoop)
2764       reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
2765 
2766     return;
2767   }
2768   case AMDGPU::G_INSERT_VECTOR_ELT: {
2769     SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
2770 
2771     Register DstReg = MI.getOperand(0).getReg();
2772     LLT VecTy = MRI.getType(DstReg);
2773 
2774     assert(OpdMapper.getVRegs(0).empty());
2775     assert(OpdMapper.getVRegs(3).empty());
2776 
2777     if (substituteSimpleCopyRegs(OpdMapper, 1))
2778       MRI.setType(MI.getOperand(1).getReg(), VecTy);
2779 
2780     if (foldInsertEltToCmpSelect(B, MI, OpdMapper))
2781       return;
2782 
2783     const RegisterBank *IdxBank =
2784       OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
2785 
2786     Register SrcReg = MI.getOperand(1).getReg();
2787     Register InsReg = MI.getOperand(2).getReg();
2788     LLT InsTy = MRI.getType(InsReg);
2789     (void)InsTy;
2790 
2791     Register BaseIdxReg;
2792     unsigned ConstOffset;
2793     std::tie(BaseIdxReg, ConstOffset) =
2794         AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(3).getReg());
2795 
2796     // See if the index is an add of a constant which will be foldable by moving
2797     // the base register of the index later if this is going to be executed in a
2798     // waterfall loop. This is essentially to reassociate the add of a constant
2799     // with the readfirstlane.
2800     bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
2801       ConstOffset > 0 &&
2802       ConstOffset < VecTy.getNumElements();
2803 
2804     // Move the base register. We'll re-insert the add later.
2805     if (ShouldMoveIndexIntoLoop)
2806       MI.getOperand(3).setReg(BaseIdxReg);
2807 
2808 
2809     if (InsRegs.empty()) {
2810       executeInWaterfallLoop(B, MI, {3});
2811 
2812       // Re-insert the constant offset add inside the waterfall loop.
2813       if (ShouldMoveIndexIntoLoop) {
2814         reinsertVectorIndexAdd(B, MI, 3, ConstOffset);
2815       }
2816 
2817       return;
2818     }
2819 
2820     assert(InsTy.getSizeInBits() == 64);
2821 
2822     const LLT S32 = LLT::scalar(32);
2823     LLT Vec32 = LLT::fixed_vector(2 * VecTy.getNumElements(), 32);
2824 
2825     auto CastSrc = B.buildBitcast(Vec32, SrcReg);
2826     auto One = B.buildConstant(S32, 1);
2827 
2828     // Split the vector index into 32-bit pieces. Prepare to move all of the
2829     // new instructions into a waterfall loop if necessary.
2830     //
2831     // Don't put the bitcast or constant in the loop.
2832     MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB());
2833 
2834     // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
2835     auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
2836     auto IdxHi = B.buildAdd(S32, IdxLo, One);
2837 
2838     auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo);
2839     auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi);
2840 
2841     const RegisterBank *DstBank =
2842       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2843     const RegisterBank *SrcBank =
2844       OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2845     const RegisterBank *InsSrcBank =
2846       OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2847 
2848     MRI.setRegBank(InsReg, *InsSrcBank);
2849     MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
2850     MRI.setRegBank(InsLo.getReg(0), *DstBank);
2851     MRI.setRegBank(InsHi.getReg(0), *DstBank);
2852     MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
2853     MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
2854     MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
2855 
2856 
2857     SmallSet<Register, 4> OpsToWaterfall;
2858     if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) {
2859       B.setInsertPt(B.getMBB(), MI);
2860       B.buildBitcast(DstReg, InsHi);
2861       MI.eraseFromParent();
2862       return;
2863     }
2864 
2865     B.setInstr(*Span.begin());
2866     MI.eraseFromParent();
2867 
2868     // Figure out the point after the waterfall loop before mangling the control
2869     // flow.
2870     executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
2871                            OpsToWaterfall);
2872 
2873     // The insertion point is now right after the original instruction.
2874     //
2875     // Keep the bitcast to the original vector type out of the loop. Doing this
2876     // saved an extra phi we don't need inside the loop.
2877     B.buildBitcast(DstReg, InsHi);
2878 
2879     // Re-insert the constant offset add inside the waterfall loop.
2880     if (ShouldMoveIndexIntoLoop)
2881       reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
2882 
2883     return;
2884   }
2885   case AMDGPU::G_AMDGPU_BUFFER_LOAD:
2886   case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
2887   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
2888   case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
2889   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
2890   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
2891   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE:
2892   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
2893   case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
2894   case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
2895   case AMDGPU::G_AMDGPU_BUFFER_STORE:
2896   case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
2897   case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
2898   case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
2899   case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16:
2900   case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
2901   case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: {
2902     applyDefaultMapping(OpdMapper);
2903     executeInWaterfallLoop(B, MI, {1, 4});
2904     return;
2905   }
2906   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
2907   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
2908   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
2909   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
2910   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
2911   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
2912   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
2913   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
2914   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
2915   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
2916   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
2917   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: {
2918     applyDefaultMapping(OpdMapper);
2919     executeInWaterfallLoop(B, MI, {2, 5});
2920     return;
2921   }
2922   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
2923   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
2924   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
2925     applyDefaultMapping(OpdMapper);
2926     executeInWaterfallLoop(B, MI, {2, 5});
2927     return;
2928   }
2929   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
2930     applyDefaultMapping(OpdMapper);
2931     executeInWaterfallLoop(B, MI, {3, 6});
2932     return;
2933   }
2934   case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
2935     applyMappingSBufferLoad(B, OpdMapper);
2936     return;
2937   }
2938   case AMDGPU::G_INTRINSIC:
2939   case AMDGPU::G_INTRINSIC_CONVERGENT: {
2940     switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
2941     case Intrinsic::amdgcn_readlane: {
2942       substituteSimpleCopyRegs(OpdMapper, 2);
2943 
2944       assert(OpdMapper.getVRegs(0).empty());
2945       assert(OpdMapper.getVRegs(3).empty());
2946 
2947       // Make sure the index is an SGPR. It doesn't make sense to run this in a
2948       // waterfall loop, so assume it's a uniform value.
2949       constrainOpWithReadfirstlane(B, MI, 3); // Index
2950       return;
2951     }
2952     case Intrinsic::amdgcn_writelane: {
2953       assert(OpdMapper.getVRegs(0).empty());
2954       assert(OpdMapper.getVRegs(2).empty());
2955       assert(OpdMapper.getVRegs(3).empty());
2956 
2957       substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val
2958       constrainOpWithReadfirstlane(B, MI, 2); // Source value
2959       constrainOpWithReadfirstlane(B, MI, 3); // Index
2960       return;
2961     }
2962     case Intrinsic::amdgcn_interp_p1:
2963     case Intrinsic::amdgcn_interp_p2:
2964     case Intrinsic::amdgcn_interp_mov:
2965     case Intrinsic::amdgcn_interp_p1_f16:
2966     case Intrinsic::amdgcn_interp_p2_f16:
2967     case Intrinsic::amdgcn_lds_param_load: {
2968       applyDefaultMapping(OpdMapper);
2969 
2970       // Readlane for m0 value, which is always the last operand.
2971       // FIXME: Should this be a waterfall loop instead?
2972       constrainOpWithReadfirstlane(B, MI, MI.getNumOperands() - 1); // Index
2973       return;
2974     }
2975     case Intrinsic::amdgcn_interp_inreg_p10:
2976     case Intrinsic::amdgcn_interp_inreg_p2:
2977     case Intrinsic::amdgcn_interp_inreg_p10_f16:
2978     case Intrinsic::amdgcn_interp_inreg_p2_f16:
2979       applyDefaultMapping(OpdMapper);
2980       return;
2981     case Intrinsic::amdgcn_permlane16:
2982     case Intrinsic::amdgcn_permlanex16: {
2983       // Doing a waterfall loop over these wouldn't make any sense.
2984       substituteSimpleCopyRegs(OpdMapper, 2);
2985       substituteSimpleCopyRegs(OpdMapper, 3);
2986       constrainOpWithReadfirstlane(B, MI, 4);
2987       constrainOpWithReadfirstlane(B, MI, 5);
2988       return;
2989     }
2990     case Intrinsic::amdgcn_sbfe:
2991       applyMappingBFE(B, OpdMapper, true);
2992       return;
2993     case Intrinsic::amdgcn_ubfe:
2994       applyMappingBFE(B, OpdMapper, false);
2995       return;
2996     case Intrinsic::amdgcn_inverse_ballot:
2997     case Intrinsic::amdgcn_s_bitreplicate:
2998     case Intrinsic::amdgcn_s_quadmask:
2999     case Intrinsic::amdgcn_s_wqm:
3000       applyDefaultMapping(OpdMapper);
3001       constrainOpWithReadfirstlane(B, MI, 2); // Mask
3002       return;
3003     case Intrinsic::amdgcn_ballot:
3004       // Use default handling and insert copy to vcc source.
3005       break;
3006     }
3007     break;
3008   }
3009   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
3010   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
3011   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
3012   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
3013     const AMDGPU::RsrcIntrinsic *RSrcIntrin =
3014         AMDGPU::lookupRsrcIntrinsic(AMDGPU::getIntrinsicID(MI));
3015     assert(RSrcIntrin && RSrcIntrin->IsImage);
3016     // Non-images can have complications from operands that allow both SGPR
3017     // and VGPR. For now it's too complicated to figure out the final opcode
3018     // to derive the register bank from the MCInstrDesc.
3019     applyMappingImage(B, MI, OpdMapper, RSrcIntrin->RsrcArg);
3020     return;
3021   }
3022   case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
3023     unsigned N = MI.getNumExplicitOperands() - 2;
3024     applyDefaultMapping(OpdMapper);
3025     executeInWaterfallLoop(B, MI, {N});
3026     return;
3027   }
3028   case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
3029   case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS: {
3030     auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
3031     switch (IntrID) {
3032     case Intrinsic::amdgcn_ds_ordered_add:
3033     case Intrinsic::amdgcn_ds_ordered_swap: {
3034       // This is only allowed to execute with 1 lane, so readfirstlane is safe.
3035       assert(OpdMapper.getVRegs(0).empty());
3036       substituteSimpleCopyRegs(OpdMapper, 3);
3037       constrainOpWithReadfirstlane(B, MI, 2); // M0
3038       return;
3039     }
3040     case Intrinsic::amdgcn_ds_gws_init:
3041     case Intrinsic::amdgcn_ds_gws_barrier:
3042     case Intrinsic::amdgcn_ds_gws_sema_br: {
3043       // Only the first lane is executes, so readfirstlane is safe.
3044       substituteSimpleCopyRegs(OpdMapper, 1);
3045       constrainOpWithReadfirstlane(B, MI, 2); // M0
3046       return;
3047     }
3048     case Intrinsic::amdgcn_ds_gws_sema_v:
3049     case Intrinsic::amdgcn_ds_gws_sema_p:
3050     case Intrinsic::amdgcn_ds_gws_sema_release_all: {
3051       // Only the first lane is executes, so readfirstlane is safe.
3052       constrainOpWithReadfirstlane(B, MI, 1); // M0
3053       return;
3054     }
3055     case Intrinsic::amdgcn_ds_append:
3056     case Intrinsic::amdgcn_ds_consume: {
3057       constrainOpWithReadfirstlane(B, MI, 2); // M0
3058       return;
3059     }
3060     case Intrinsic::amdgcn_s_sendmsg:
3061     case Intrinsic::amdgcn_s_sendmsghalt: {
3062       // FIXME: Should this use a waterfall loop?
3063       constrainOpWithReadfirstlane(B, MI, 2); // M0
3064       return;
3065     }
3066     case Intrinsic::amdgcn_s_setreg: {
3067       constrainOpWithReadfirstlane(B, MI, 2);
3068       return;
3069     }
3070     case Intrinsic::amdgcn_s_ttracedata:
3071       constrainOpWithReadfirstlane(B, MI, 1); // M0
3072       return;
3073     case Intrinsic::amdgcn_raw_buffer_load_lds:
3074     case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: {
3075       applyDefaultMapping(OpdMapper);
3076       constrainOpWithReadfirstlane(B, MI, 1); // rsrc
3077       constrainOpWithReadfirstlane(B, MI, 2); // M0
3078       constrainOpWithReadfirstlane(B, MI, 5); // soffset
3079       return;
3080     }
3081     case Intrinsic::amdgcn_struct_buffer_load_lds:
3082     case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
3083       applyDefaultMapping(OpdMapper);
3084       constrainOpWithReadfirstlane(B, MI, 1); // rsrc
3085       constrainOpWithReadfirstlane(B, MI, 2); // M0
3086       constrainOpWithReadfirstlane(B, MI, 6); // soffset
3087       return;
3088     }
3089     case Intrinsic::amdgcn_global_load_lds: {
3090       applyDefaultMapping(OpdMapper);
3091       constrainOpWithReadfirstlane(B, MI, 2);
3092       return;
3093     }
3094     case Intrinsic::amdgcn_lds_direct_load: {
3095       applyDefaultMapping(OpdMapper);
3096       // Readlane for m0 value, which is always the last operand.
3097       constrainOpWithReadfirstlane(B, MI, MI.getNumOperands() - 1); // Index
3098       return;
3099     }
3100     case Intrinsic::amdgcn_exp_row:
3101       applyDefaultMapping(OpdMapper);
3102       constrainOpWithReadfirstlane(B, MI, 8); // M0
3103       return;
3104     case Intrinsic::amdgcn_s_sleep_var:
3105       assert(OpdMapper.getVRegs(1).empty());
3106       constrainOpWithReadfirstlane(B, MI, 1);
3107       return;
3108     case Intrinsic::amdgcn_s_barrier_signal_var:
3109     case Intrinsic::amdgcn_s_barrier_join:
3110     case Intrinsic::amdgcn_s_wakeup_barrier:
3111       constrainOpWithReadfirstlane(B, MI, 1);
3112       return;
3113     case Intrinsic::amdgcn_s_barrier_signal_isfirst_var:
3114       constrainOpWithReadfirstlane(B, MI, 2);
3115       return;
3116     case Intrinsic::amdgcn_s_barrier_init:
3117       constrainOpWithReadfirstlane(B, MI, 1);
3118       constrainOpWithReadfirstlane(B, MI, 2);
3119       return;
3120     case Intrinsic::amdgcn_s_get_barrier_state: {
3121       constrainOpWithReadfirstlane(B, MI, 2);
3122       return;
3123     }
3124     default: {
3125       if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
3126               AMDGPU::lookupRsrcIntrinsic(IntrID)) {
3127         // Non-images can have complications from operands that allow both SGPR
3128         // and VGPR. For now it's too complicated to figure out the final opcode
3129         // to derive the register bank from the MCInstrDesc.
3130         if (RSrcIntrin->IsImage) {
3131           applyMappingImage(B, MI, OpdMapper, RSrcIntrin->RsrcArg);
3132           return;
3133         }
3134       }
3135 
3136       break;
3137     }
3138     }
3139     break;
3140   }
3141   case AMDGPU::G_SI_CALL: {
3142     // Use a set to avoid extra readfirstlanes in the case where multiple
3143     // operands are the same register.
3144     SmallSet<Register, 4> SGPROperandRegs;
3145 
3146     if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, {1}))
3147       break;
3148 
3149     // Move all copies to physical SGPRs that are used by the call instruction
3150     // into the loop block. Start searching for these copies until the
3151     // ADJCALLSTACKUP.
3152     unsigned FrameSetupOpcode = AMDGPU::ADJCALLSTACKUP;
3153     unsigned FrameDestroyOpcode = AMDGPU::ADJCALLSTACKDOWN;
3154 
3155     // Move all non-copies before the copies, so that a complete range can be
3156     // moved into the waterfall loop.
3157     SmallVector<MachineInstr *, 4> NonCopyInstrs;
3158     // Count of NonCopyInstrs found until the current LastCopy.
3159     unsigned NonCopyInstrsLen = 0;
3160     MachineBasicBlock::iterator Start(&MI);
3161     MachineBasicBlock::iterator LastCopy = Start;
3162     MachineBasicBlock *MBB = MI.getParent();
3163     const SIMachineFunctionInfo *Info =
3164         MBB->getParent()->getInfo<SIMachineFunctionInfo>();
3165     while (Start->getOpcode() != FrameSetupOpcode) {
3166       --Start;
3167       bool IsCopy = false;
3168       if (Start->getOpcode() == AMDGPU::COPY) {
3169         auto &Dst = Start->getOperand(0);
3170         if (Dst.isReg()) {
3171           Register Reg = Dst.getReg();
3172           if (Reg.isPhysical() && MI.readsRegister(Reg, TRI)) {
3173             IsCopy = true;
3174           } else {
3175             // Also move the copy from the scratch rsrc descriptor into the loop
3176             // to allow it to be optimized away.
3177             auto &Src = Start->getOperand(1);
3178             if (Src.isReg()) {
3179               Reg = Src.getReg();
3180               IsCopy = Info->getScratchRSrcReg() == Reg;
3181             }
3182           }
3183         }
3184       }
3185 
3186       if (IsCopy) {
3187         LastCopy = Start;
3188         NonCopyInstrsLen = NonCopyInstrs.size();
3189       } else {
3190         NonCopyInstrs.push_back(&*Start);
3191       }
3192     }
3193     NonCopyInstrs.resize(NonCopyInstrsLen);
3194 
3195     for (auto *NonCopy : reverse(NonCopyInstrs)) {
3196       MBB->splice(LastCopy, MBB, NonCopy->getIterator());
3197     }
3198     Start = LastCopy;
3199 
3200     // Do the same for copies after the loop
3201     NonCopyInstrs.clear();
3202     NonCopyInstrsLen = 0;
3203     MachineBasicBlock::iterator End(&MI);
3204     LastCopy = End;
3205     while (End->getOpcode() != FrameDestroyOpcode) {
3206       ++End;
3207       bool IsCopy = false;
3208       if (End->getOpcode() == AMDGPU::COPY) {
3209         auto &Src = End->getOperand(1);
3210         if (Src.isReg()) {
3211           Register Reg = Src.getReg();
3212           IsCopy = Reg.isPhysical() && MI.modifiesRegister(Reg, TRI);
3213         }
3214       }
3215 
3216       if (IsCopy) {
3217         LastCopy = End;
3218         NonCopyInstrsLen = NonCopyInstrs.size();
3219       } else {
3220         NonCopyInstrs.push_back(&*End);
3221       }
3222     }
3223     NonCopyInstrs.resize(NonCopyInstrsLen);
3224 
3225     End = LastCopy;
3226     ++LastCopy;
3227     for (auto *NonCopy : reverse(NonCopyInstrs)) {
3228       MBB->splice(LastCopy, MBB, NonCopy->getIterator());
3229     }
3230 
3231     ++End;
3232     B.setInsertPt(B.getMBB(), Start);
3233     executeInWaterfallLoop(B, make_range(Start, End), SGPROperandRegs);
3234     break;
3235   }
3236   case AMDGPU::G_LOAD:
3237   case AMDGPU::G_ZEXTLOAD:
3238   case AMDGPU::G_SEXTLOAD: {
3239     if (applyMappingLoad(B, OpdMapper, MI))
3240       return;
3241     break;
3242   }
3243   case AMDGPU::G_DYN_STACKALLOC:
3244     applyMappingDynStackAlloc(B, OpdMapper, MI);
3245     return;
3246   case AMDGPU::G_STACKRESTORE: {
3247     applyDefaultMapping(OpdMapper);
3248     constrainOpWithReadfirstlane(B, MI, 0);
3249     return;
3250   }
3251   case AMDGPU::G_SBFX:
3252     applyMappingBFE(B, OpdMapper, /*Signed*/ true);
3253     return;
3254   case AMDGPU::G_UBFX:
3255     applyMappingBFE(B, OpdMapper, /*Signed*/ false);
3256     return;
3257   case AMDGPU::G_AMDGPU_MAD_U64_U32:
3258   case AMDGPU::G_AMDGPU_MAD_I64_I32:
3259     applyMappingMAD_64_32(B, OpdMapper);
3260     return;
3261   case AMDGPU::G_PREFETCH: {
3262     if (!Subtarget.hasPrefetch()) {
3263       MI.eraseFromParent();
3264       return;
3265     }
3266     unsigned PtrBank =
3267         getRegBankID(MI.getOperand(0).getReg(), MRI, AMDGPU::SGPRRegBankID);
3268     if (PtrBank == AMDGPU::VGPRRegBankID) {
3269       MI.eraseFromParent();
3270       return;
3271     }
3272     // FIXME: There is currently no support for prefetch in global isel.
3273     // There is no node equivalence and what's worse there is no MMO produced
3274     // for a prefetch on global isel path.
3275     // Prefetch does not affect execution so erase it for now.
3276     MI.eraseFromParent();
3277     return;
3278   }
3279   default:
3280     break;
3281   }
3282 
3283   return applyDefaultMapping(OpdMapper);
3284 }
3285 
3286 // vgpr, sgpr -> vgpr
3287 // vgpr, agpr -> vgpr
3288 // agpr, agpr -> agpr
3289 // agpr, sgpr -> vgpr
3290 static unsigned regBankUnion(unsigned RB0, unsigned RB1) {
3291   if (RB0 == AMDGPU::InvalidRegBankID)
3292     return RB1;
3293   if (RB1 == AMDGPU::InvalidRegBankID)
3294     return RB0;
3295 
3296   if (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID)
3297     return AMDGPU::SGPRRegBankID;
3298 
3299   if (RB0 == AMDGPU::AGPRRegBankID && RB1 == AMDGPU::AGPRRegBankID)
3300     return AMDGPU::AGPRRegBankID;
3301 
3302   return AMDGPU::VGPRRegBankID;
3303 }
3304 
3305 static unsigned regBankBoolUnion(unsigned RB0, unsigned RB1) {
3306   if (RB0 == AMDGPU::InvalidRegBankID)
3307     return RB1;
3308   if (RB1 == AMDGPU::InvalidRegBankID)
3309     return RB0;
3310 
3311   // vcc, vcc -> vcc
3312   // vcc, sgpr -> vcc
3313   // vcc, vgpr -> vcc
3314   if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID)
3315     return AMDGPU::VCCRegBankID;
3316 
3317   // vcc, vgpr -> vgpr
3318   return regBankUnion(RB0, RB1);
3319 }
3320 
3321 unsigned AMDGPURegisterBankInfo::getMappingType(const MachineRegisterInfo &MRI,
3322                                                 const MachineInstr &MI) const {
3323   unsigned RegBank = AMDGPU::InvalidRegBankID;
3324 
3325   for (const MachineOperand &MO : MI.operands()) {
3326     if (!MO.isReg())
3327       continue;
3328     Register Reg = MO.getReg();
3329     if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3330       RegBank = regBankUnion(RegBank, Bank->getID());
3331       if (RegBank == AMDGPU::VGPRRegBankID)
3332         break;
3333     }
3334   }
3335 
3336   return RegBank;
3337 }
3338 
3339 bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
3340   const MachineFunction &MF = *MI.getParent()->getParent();
3341   const MachineRegisterInfo &MRI = MF.getRegInfo();
3342   for (const MachineOperand &MO : MI.operands()) {
3343     if (!MO.isReg())
3344       continue;
3345     Register Reg = MO.getReg();
3346     if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3347       if (Bank->getID() != AMDGPU::SGPRRegBankID)
3348         return false;
3349     }
3350   }
3351   return true;
3352 }
3353 
3354 const RegisterBankInfo::InstructionMapping &
3355 AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
3356   const MachineFunction &MF = *MI.getParent()->getParent();
3357   const MachineRegisterInfo &MRI = MF.getRegInfo();
3358   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3359 
3360   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3361     const MachineOperand &SrcOp = MI.getOperand(i);
3362     if (!SrcOp.isReg())
3363       continue;
3364 
3365     unsigned Size = getSizeInBits(SrcOp.getReg(), MRI, *TRI);
3366     OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3367   }
3368   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3369                                MI.getNumOperands());
3370 }
3371 
3372 const RegisterBankInfo::InstructionMapping &
3373 AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
3374   const MachineFunction &MF = *MI.getParent()->getParent();
3375   const MachineRegisterInfo &MRI = MF.getRegInfo();
3376   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3377 
3378   // Even though we technically could use SGPRs, this would require knowledge of
3379   // the constant bus restriction. Force all sources to VGPR (except for VCC).
3380   //
3381   // TODO: Unary ops are trivially OK, so accept SGPRs?
3382   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3383     const MachineOperand &Src = MI.getOperand(i);
3384     if (!Src.isReg())
3385       continue;
3386 
3387     unsigned Size = getSizeInBits(Src.getReg(), MRI, *TRI);
3388     unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
3389     OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
3390   }
3391 
3392   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3393                                MI.getNumOperands());
3394 }
3395 
3396 const RegisterBankInfo::InstructionMapping &
3397 AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
3398   const MachineFunction &MF = *MI.getParent()->getParent();
3399   const MachineRegisterInfo &MRI = MF.getRegInfo();
3400   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3401 
3402   for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
3403     const MachineOperand &Op = MI.getOperand(I);
3404     if (!Op.isReg())
3405       continue;
3406 
3407     unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI);
3408     OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3409   }
3410 
3411   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3412                                MI.getNumOperands());
3413 }
3414 
3415 const RegisterBankInfo::InstructionMapping &
3416 AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI,
3417                                         const MachineInstr &MI,
3418                                         int RsrcIdx) const {
3419   // The reported argument index is relative to the IR intrinsic call arguments,
3420   // so we need to shift by the number of defs and the intrinsic ID.
3421   RsrcIdx += MI.getNumExplicitDefs() + 1;
3422 
3423   const int NumOps = MI.getNumOperands();
3424   SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps);
3425 
3426   // TODO: Should packed/unpacked D16 difference be reported here as part of
3427   // the value mapping?
3428   for (int I = 0; I != NumOps; ++I) {
3429     if (!MI.getOperand(I).isReg())
3430       continue;
3431 
3432     Register OpReg = MI.getOperand(I).getReg();
3433     // We replace some dead address operands with $noreg
3434     if (!OpReg)
3435       continue;
3436 
3437     unsigned Size = getSizeInBits(OpReg, MRI, *TRI);
3438 
3439     // FIXME: Probably need a new intrinsic register bank searchable table to
3440     // handle arbitrary intrinsics easily.
3441     //
3442     // If this has a sampler, it immediately follows rsrc.
3443     const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1;
3444 
3445     if (MustBeSGPR) {
3446       // If this must be an SGPR, so we must report whatever it is as legal.
3447       unsigned NewBank = getRegBankID(OpReg, MRI, AMDGPU::SGPRRegBankID);
3448       OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size);
3449     } else {
3450       // Some operands must be VGPR, and these are easy to copy to.
3451       OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3452     }
3453   }
3454 
3455   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps);
3456 }
3457 
3458 /// Return the mapping for a pointer argument.
3459 const RegisterBankInfo::ValueMapping *
3460 AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI,
3461                                               Register PtrReg) const {
3462   LLT PtrTy = MRI.getType(PtrReg);
3463   unsigned Size = PtrTy.getSizeInBits();
3464   if (Subtarget.useFlatForGlobal() ||
3465       !AMDGPU::isFlatGlobalAddrSpace(PtrTy.getAddressSpace()))
3466     return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3467 
3468   // If we're using MUBUF instructions for global memory, an SGPR base register
3469   // is possible. Otherwise this needs to be a VGPR.
3470   const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3471   return AMDGPU::getValueMapping(PtrBank->getID(), Size);
3472 }
3473 
3474 const RegisterBankInfo::InstructionMapping &
3475 AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
3476 
3477   const MachineFunction &MF = *MI.getParent()->getParent();
3478   const MachineRegisterInfo &MRI = MF.getRegInfo();
3479   SmallVector<const ValueMapping*, 2> OpdsMapping(2);
3480   unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3481   Register PtrReg = MI.getOperand(1).getReg();
3482   LLT PtrTy = MRI.getType(PtrReg);
3483   unsigned AS = PtrTy.getAddressSpace();
3484   unsigned PtrSize = PtrTy.getSizeInBits();
3485 
3486   const ValueMapping *ValMapping;
3487   const ValueMapping *PtrMapping;
3488 
3489   const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3490 
3491   if (PtrBank == &AMDGPU::SGPRRegBank && AMDGPU::isFlatGlobalAddrSpace(AS)) {
3492     if (isScalarLoadLegal(MI)) {
3493       // We have a uniform instruction so we want to use an SMRD load
3494       ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3495       PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
3496     } else {
3497       ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3498 
3499       // If we're using MUBUF instructions for global memory, an SGPR base
3500       // register is possible. Otherwise this needs to be a VGPR.
3501       unsigned PtrBankID = Subtarget.useFlatForGlobal() ?
3502         AMDGPU::VGPRRegBankID : AMDGPU::SGPRRegBankID;
3503 
3504       PtrMapping = AMDGPU::getValueMapping(PtrBankID, PtrSize);
3505     }
3506   } else {
3507     ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3508     PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
3509   }
3510 
3511   OpdsMapping[0] = ValMapping;
3512   OpdsMapping[1] = PtrMapping;
3513   const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
3514       1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands());
3515   return Mapping;
3516 
3517   // FIXME: Do we want to add a mapping for FLAT load, or should we just
3518   // handle that during instruction selection?
3519 }
3520 
3521 unsigned
3522 AMDGPURegisterBankInfo::getRegBankID(Register Reg,
3523                                      const MachineRegisterInfo &MRI,
3524                                      unsigned Default) const {
3525   const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
3526   return Bank ? Bank->getID() : Default;
3527 }
3528 
3529 const RegisterBankInfo::ValueMapping *
3530 AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg,
3531                                          const MachineRegisterInfo &MRI,
3532                                          const TargetRegisterInfo &TRI) const {
3533   // Lie and claim anything is legal, even though this needs to be an SGPR
3534   // applyMapping will have to deal with it as a waterfall loop.
3535   unsigned Bank = getRegBankID(Reg, MRI, AMDGPU::SGPRRegBankID);
3536   unsigned Size = getSizeInBits(Reg, MRI, TRI);
3537   return AMDGPU::getValueMapping(Bank, Size);
3538 }
3539 
3540 const RegisterBankInfo::ValueMapping *
3541 AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
3542                                          const MachineRegisterInfo &MRI,
3543                                          const TargetRegisterInfo &TRI) const {
3544   unsigned Size = getSizeInBits(Reg, MRI, TRI);
3545   return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3546 }
3547 
3548 const RegisterBankInfo::ValueMapping *
3549 AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg,
3550                                          const MachineRegisterInfo &MRI,
3551                                          const TargetRegisterInfo &TRI) const {
3552   unsigned Size = getSizeInBits(Reg, MRI, TRI);
3553   return AMDGPU::getValueMapping(AMDGPU::AGPRRegBankID, Size);
3554 }
3555 
3556 ///
3557 /// This function must return a legal mapping, because
3558 /// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
3559 /// in RegBankSelect::Mode::Fast.  Any mapping that would cause a
3560 /// VGPR to SGPR generated is illegal.
3561 ///
3562 // Operands that must be SGPRs must accept potentially divergent VGPRs as
3563 // legal. These will be dealt with in applyMappingImpl.
3564 //
3565 const RegisterBankInfo::InstructionMapping &
3566 AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
3567   const MachineFunction &MF = *MI.getParent()->getParent();
3568   const MachineRegisterInfo &MRI = MF.getRegInfo();
3569 
3570   if (MI.isCopy() || MI.getOpcode() == AMDGPU::G_FREEZE) {
3571     // The default logic bothers to analyze impossible alternative mappings. We
3572     // want the most straightforward mapping, so just directly handle this.
3573     const RegisterBank *DstBank = getRegBank(MI.getOperand(0).getReg(), MRI,
3574                                              *TRI);
3575     const RegisterBank *SrcBank = getRegBank(MI.getOperand(1).getReg(), MRI,
3576                                              *TRI);
3577     assert(SrcBank && "src bank should have been assigned already");
3578     if (!DstBank)
3579       DstBank = SrcBank;
3580 
3581     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3582     if (MI.getOpcode() != AMDGPU::G_FREEZE &&
3583         cannotCopy(*DstBank, *SrcBank, TypeSize::getFixed(Size)))
3584       return getInvalidInstructionMapping();
3585 
3586     const ValueMapping &ValMap = getValueMapping(0, Size, *DstBank);
3587     unsigned OpdsMappingSize = MI.isCopy() ? 1 : 2;
3588     SmallVector<const ValueMapping *, 1> OpdsMapping(OpdsMappingSize);
3589     OpdsMapping[0] = &ValMap;
3590     if (MI.getOpcode() == AMDGPU::G_FREEZE)
3591       OpdsMapping[1] = &ValMap;
3592 
3593     return getInstructionMapping(
3594         1, /*Cost*/ 1,
3595         /*OperandsMapping*/ getOperandsMapping(OpdsMapping), OpdsMappingSize);
3596   }
3597 
3598   if (MI.isRegSequence()) {
3599     // If any input is a VGPR, the result must be a VGPR. The default handling
3600     // assumes any copy between banks is legal.
3601     unsigned BankID = AMDGPU::SGPRRegBankID;
3602 
3603     for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3604       auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI);
3605       // It doesn't make sense to use vcc or scc banks here, so just ignore
3606       // them.
3607       if (OpBank != AMDGPU::SGPRRegBankID) {
3608         BankID = AMDGPU::VGPRRegBankID;
3609         break;
3610       }
3611     }
3612     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3613 
3614     const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID));
3615     return getInstructionMapping(
3616         1, /*Cost*/ 1,
3617         /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3618   }
3619 
3620   // The default handling is broken and doesn't handle illegal SGPR->VGPR copies
3621   // properly.
3622   //
3623   // TODO: There are additional exec masking dependencies to analyze.
3624   if (MI.getOpcode() == TargetOpcode::G_PHI) {
3625     unsigned ResultBank = AMDGPU::InvalidRegBankID;
3626     Register DstReg = MI.getOperand(0).getReg();
3627 
3628     // Sometimes the result may have already been assigned a bank.
3629     if (const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI))
3630       ResultBank = DstBank->getID();
3631 
3632     for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3633       Register Reg = MI.getOperand(I).getReg();
3634       const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
3635 
3636       // FIXME: Assuming VGPR for any undetermined inputs.
3637       if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) {
3638         ResultBank = AMDGPU::VGPRRegBankID;
3639         break;
3640       }
3641 
3642       // FIXME: Need to promote SGPR case to s32
3643       unsigned OpBank = Bank->getID();
3644       ResultBank = regBankBoolUnion(ResultBank, OpBank);
3645     }
3646 
3647     assert(ResultBank != AMDGPU::InvalidRegBankID);
3648 
3649     unsigned Size = MRI.getType(DstReg).getSizeInBits();
3650 
3651     const ValueMapping &ValMap =
3652         getValueMapping(0, Size, getRegBank(ResultBank));
3653     return getInstructionMapping(
3654         1, /*Cost*/ 1,
3655         /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3656   }
3657 
3658   const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
3659   if (Mapping.isValid())
3660     return Mapping;
3661 
3662   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3663 
3664   switch (MI.getOpcode()) {
3665   default:
3666     return getInvalidInstructionMapping();
3667 
3668   case AMDGPU::G_AND:
3669   case AMDGPU::G_OR:
3670   case AMDGPU::G_XOR: {
3671     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3672     if (Size == 1) {
3673       const RegisterBank *DstBank
3674         = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI);
3675 
3676       unsigned TargetBankID = AMDGPU::InvalidRegBankID;
3677       unsigned BankLHS = AMDGPU::InvalidRegBankID;
3678       unsigned BankRHS = AMDGPU::InvalidRegBankID;
3679       if (DstBank) {
3680         TargetBankID = DstBank->getID();
3681         if (DstBank == &AMDGPU::VCCRegBank) {
3682           TargetBankID = AMDGPU::VCCRegBankID;
3683           BankLHS = AMDGPU::VCCRegBankID;
3684           BankRHS = AMDGPU::VCCRegBankID;
3685         } else {
3686           BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
3687                                  AMDGPU::SGPRRegBankID);
3688           BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
3689                                  AMDGPU::SGPRRegBankID);
3690         }
3691       } else {
3692         BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
3693                                AMDGPU::VCCRegBankID);
3694         BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
3695                                AMDGPU::VCCRegBankID);
3696 
3697         // Both inputs should be true booleans to produce a boolean result.
3698         if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) {
3699           TargetBankID = AMDGPU::VGPRRegBankID;
3700         } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) {
3701           TargetBankID = AMDGPU::VCCRegBankID;
3702           BankLHS = AMDGPU::VCCRegBankID;
3703           BankRHS = AMDGPU::VCCRegBankID;
3704         } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) {
3705           TargetBankID = AMDGPU::SGPRRegBankID;
3706         }
3707       }
3708 
3709       OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size);
3710       OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size);
3711       OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size);
3712       break;
3713     }
3714 
3715     if (Size == 64) {
3716 
3717       if (isSALUMapping(MI)) {
3718         OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size);
3719         OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
3720       } else {
3721         OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
3722         unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI /*, DefaultBankID*/);
3723         OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size);
3724 
3725         unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI /*, DefaultBankID*/);
3726         OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size);
3727       }
3728 
3729       break;
3730     }
3731 
3732     [[fallthrough]];
3733   }
3734   case AMDGPU::G_PTR_ADD:
3735   case AMDGPU::G_PTRMASK:
3736   case AMDGPU::G_ADD:
3737   case AMDGPU::G_SUB:
3738   case AMDGPU::G_MUL:
3739   case AMDGPU::G_SHL:
3740   case AMDGPU::G_LSHR:
3741   case AMDGPU::G_ASHR:
3742   case AMDGPU::G_UADDO:
3743   case AMDGPU::G_USUBO:
3744   case AMDGPU::G_UADDE:
3745   case AMDGPU::G_SADDE:
3746   case AMDGPU::G_USUBE:
3747   case AMDGPU::G_SSUBE:
3748   case AMDGPU::G_SMIN:
3749   case AMDGPU::G_SMAX:
3750   case AMDGPU::G_UMIN:
3751   case AMDGPU::G_UMAX:
3752   case AMDGPU::G_ABS:
3753   case AMDGPU::G_SHUFFLE_VECTOR:
3754   case AMDGPU::G_SBFX:
3755   case AMDGPU::G_UBFX:
3756     if (isSALUMapping(MI))
3757       return getDefaultMappingSOP(MI);
3758     return getDefaultMappingVOP(MI);
3759   case AMDGPU::G_FADD:
3760   case AMDGPU::G_FSUB:
3761   case AMDGPU::G_FMUL:
3762   case AMDGPU::G_FMA:
3763   case AMDGPU::G_FFLOOR:
3764   case AMDGPU::G_FCEIL:
3765   case AMDGPU::G_INTRINSIC_ROUNDEVEN:
3766   case AMDGPU::G_FMINNUM:
3767   case AMDGPU::G_FMAXNUM:
3768   case AMDGPU::G_FMINIMUM:
3769   case AMDGPU::G_FMAXIMUM:
3770   case AMDGPU::G_INTRINSIC_TRUNC:
3771   case AMDGPU::G_STRICT_FADD:
3772   case AMDGPU::G_STRICT_FSUB:
3773   case AMDGPU::G_STRICT_FMUL:
3774   case AMDGPU::G_STRICT_FMA: {
3775     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3776     unsigned Size = Ty.getSizeInBits();
3777     if (Subtarget.hasSALUFloatInsts() && Ty.isScalar() &&
3778         (Size == 32 || Size == 16) && isSALUMapping(MI))
3779       return getDefaultMappingSOP(MI);
3780     return getDefaultMappingVOP(MI);
3781   }
3782   case AMDGPU::G_FPTOSI:
3783   case AMDGPU::G_FPTOUI:
3784   case AMDGPU::G_SITOFP:
3785   case AMDGPU::G_UITOFP: {
3786     unsigned SizeDst = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3787     unsigned SizeSrc = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3788     if (Subtarget.hasSALUFloatInsts() && SizeDst == 32 && SizeSrc == 32 &&
3789         isSALUMapping(MI))
3790       return getDefaultMappingSOP(MI);
3791     return getDefaultMappingVOP(MI);
3792   }
3793   case AMDGPU::G_FPTRUNC:
3794   case AMDGPU::G_FPEXT: {
3795     unsigned SizeDst = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3796     unsigned SizeSrc = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3797     if (Subtarget.hasSALUFloatInsts() && SizeDst != 64 && SizeSrc != 64 &&
3798         isSALUMapping(MI))
3799       return getDefaultMappingSOP(MI);
3800     return getDefaultMappingVOP(MI);
3801   }
3802   case AMDGPU::G_FSQRT:
3803   case AMDGPU::G_FEXP2:
3804   case AMDGPU::G_FLOG2: {
3805     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3806     if (Subtarget.hasPseudoScalarTrans() && (Size == 16 || Size == 32) &&
3807         isSALUMapping(MI))
3808       return getDefaultMappingSOP(MI);
3809     return getDefaultMappingVOP(MI);
3810   }
3811   case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU
3812   case AMDGPU::G_SSUBSAT:
3813   case AMDGPU::G_UADDSAT:
3814   case AMDGPU::G_USUBSAT:
3815   case AMDGPU::G_FMAD:
3816   case AMDGPU::G_FLDEXP:
3817   case AMDGPU::G_FMINNUM_IEEE:
3818   case AMDGPU::G_FMAXNUM_IEEE:
3819   case AMDGPU::G_FCANONICALIZE:
3820   case AMDGPU::G_STRICT_FLDEXP:
3821   case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar?
3822   case AMDGPU::G_FSHR: // TODO: Expand for scalar
3823   case AMDGPU::G_AMDGPU_FMIN_LEGACY:
3824   case AMDGPU::G_AMDGPU_FMAX_LEGACY:
3825   case AMDGPU::G_AMDGPU_RCP_IFLAG:
3826   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
3827   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
3828   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
3829   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
3830   case AMDGPU::G_AMDGPU_CVT_PK_I16_I32:
3831   case AMDGPU::G_AMDGPU_SMED3:
3832   case AMDGPU::G_AMDGPU_FMED3:
3833     return getDefaultMappingVOP(MI);
3834   case AMDGPU::G_UMULH:
3835   case AMDGPU::G_SMULH: {
3836     if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI))
3837       return getDefaultMappingSOP(MI);
3838     return getDefaultMappingVOP(MI);
3839   }
3840   case AMDGPU::G_AMDGPU_MAD_U64_U32:
3841   case AMDGPU::G_AMDGPU_MAD_I64_I32: {
3842     // Three possible mappings:
3843     //
3844     //  - Default SOP
3845     //  - Default VOP
3846     //  - Scalar multiply: src0 and src1 are SGPRs, the rest is VOP.
3847     //
3848     // This allows instruction selection to keep the multiplication part of the
3849     // instruction on the SALU.
3850     bool AllSalu = true;
3851     bool MulSalu = true;
3852     for (unsigned i = 0; i < 5; ++i) {
3853       Register Reg = MI.getOperand(i).getReg();
3854       if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3855         if (Bank->getID() != AMDGPU::SGPRRegBankID) {
3856           AllSalu = false;
3857           if (i == 2 || i == 3) {
3858             MulSalu = false;
3859             break;
3860           }
3861         }
3862       }
3863     }
3864 
3865     if (AllSalu)
3866       return getDefaultMappingSOP(MI);
3867 
3868     // If the multiply-add is full-rate in VALU, use that even if the
3869     // multiplication part is scalar. Accumulating separately on the VALU would
3870     // take two instructions.
3871     if (!MulSalu || Subtarget.hasFullRate64Ops())
3872       return getDefaultMappingVOP(MI);
3873 
3874     // Keep the multiplication on the SALU, then accumulate on the VALU.
3875     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64);
3876     OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
3877     OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
3878     OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
3879     OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64);
3880     break;
3881   }
3882   case AMDGPU::G_IMPLICIT_DEF: {
3883     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3884     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3885     break;
3886   }
3887   case AMDGPU::G_FCONSTANT:
3888   case AMDGPU::G_CONSTANT:
3889   case AMDGPU::G_GLOBAL_VALUE:
3890   case AMDGPU::G_BLOCK_ADDR:
3891   case AMDGPU::G_READCYCLECOUNTER: {
3892     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3893     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3894     break;
3895   }
3896   case AMDGPU::G_FRAME_INDEX: {
3897     // TODO: This should be the same as other constants, but eliminateFrameIndex
3898     // currently assumes VALU uses.
3899     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3900     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3901     break;
3902   }
3903   case AMDGPU::G_DYN_STACKALLOC: {
3904     // Result is always uniform, and a wave reduction is needed for the source.
3905     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
3906     unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3907     OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, 32);
3908     break;
3909   }
3910   case AMDGPU::G_AMDGPU_WAVE_ADDRESS: {
3911     // This case is weird because we expect a physical register in the source,
3912     // but need to set a bank anyway.
3913     //
3914     // TODO: We could select the result to SGPR or VGPR
3915     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
3916     OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
3917     break;
3918   }
3919   case AMDGPU::G_INSERT: {
3920     unsigned BankID = getMappingType(MRI, MI);
3921     unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3922     unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
3923     unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
3924     OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
3925     OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
3926     OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize);
3927     OpdsMapping[3] = nullptr;
3928     break;
3929   }
3930   case AMDGPU::G_EXTRACT: {
3931     unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3932     unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3933     unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
3934     OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
3935     OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
3936     OpdsMapping[2] = nullptr;
3937     break;
3938   }
3939   case AMDGPU::G_BUILD_VECTOR:
3940   case AMDGPU::G_BUILD_VECTOR_TRUNC: {
3941     LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
3942     if (DstTy == LLT::fixed_vector(2, 16)) {
3943       unsigned DstSize = DstTy.getSizeInBits();
3944       unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3945       unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3946       unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
3947       unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID);
3948 
3949       OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize);
3950       OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize);
3951       OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize);
3952       break;
3953     }
3954 
3955     [[fallthrough]];
3956   }
3957   case AMDGPU::G_MERGE_VALUES:
3958   case AMDGPU::G_CONCAT_VECTORS: {
3959     unsigned Bank = getMappingType(MRI, MI);
3960     unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3961     unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3962 
3963     OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
3964     // Op1 and Dst should use the same register bank.
3965     for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i)
3966       OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize);
3967     break;
3968   }
3969   case AMDGPU::G_BITREVERSE:
3970   case AMDGPU::G_BITCAST:
3971   case AMDGPU::G_INTTOPTR:
3972   case AMDGPU::G_PTRTOINT:
3973   case AMDGPU::G_FABS:
3974   case AMDGPU::G_FNEG: {
3975     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3976     unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3977     OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
3978     break;
3979   }
3980   case AMDGPU::G_AMDGPU_FFBH_U32:
3981   case AMDGPU::G_AMDGPU_FFBL_B32:
3982   case AMDGPU::G_CTLZ_ZERO_UNDEF:
3983   case AMDGPU::G_CTTZ_ZERO_UNDEF: {
3984     unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3985     unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3986     OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
3987     OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(BankID, Size);
3988     break;
3989   }
3990   case AMDGPU::G_CTPOP: {
3991     unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3992     unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3993     OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
3994 
3995     // This should really be getValueMappingSGPR64Only, but allowing the generic
3996     // code to handle the register split just makes using LegalizerHelper more
3997     // difficult.
3998     OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
3999     break;
4000   }
4001   case AMDGPU::G_TRUNC: {
4002     Register Dst = MI.getOperand(0).getReg();
4003     Register Src = MI.getOperand(1).getReg();
4004     unsigned Bank = getRegBankID(Src, MRI);
4005     unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
4006     unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
4007     OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
4008     OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize);
4009     break;
4010   }
4011   case AMDGPU::G_ZEXT:
4012   case AMDGPU::G_SEXT:
4013   case AMDGPU::G_ANYEXT:
4014   case AMDGPU::G_SEXT_INREG: {
4015     Register Dst = MI.getOperand(0).getReg();
4016     Register Src = MI.getOperand(1).getReg();
4017     unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
4018     unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
4019 
4020     unsigned DstBank;
4021     const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI);
4022     assert(SrcBank);
4023     switch (SrcBank->getID()) {
4024     case AMDGPU::SGPRRegBankID:
4025       DstBank = AMDGPU::SGPRRegBankID;
4026       break;
4027     default:
4028       DstBank = AMDGPU::VGPRRegBankID;
4029       break;
4030     }
4031 
4032     // Scalar extend can use 64-bit BFE, but VGPRs require extending to
4033     // 32-bits, and then to 64.
4034     OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize);
4035     OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(),
4036                                                        SrcSize);
4037     break;
4038   }
4039   case AMDGPU::G_IS_FPCLASS: {
4040     Register SrcReg = MI.getOperand(1).getReg();
4041     unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
4042     unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4043     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
4044     OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4045     break;
4046   }
4047   case AMDGPU::G_STORE: {
4048     assert(MI.getOperand(0).isReg());
4049     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4050 
4051     // FIXME: We need to specify a different reg bank once scalar stores are
4052     // supported.
4053     const ValueMapping *ValMapping =
4054         AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4055     OpdsMapping[0] = ValMapping;
4056     OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
4057     break;
4058   }
4059   case AMDGPU::G_ICMP:
4060   case AMDGPU::G_FCMP: {
4061     unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4062 
4063     // See if the result register has already been constrained to vcc, which may
4064     // happen due to control flow intrinsic lowering.
4065     unsigned DstBank = getRegBankID(MI.getOperand(0).getReg(), MRI,
4066                                     AMDGPU::SGPRRegBankID);
4067     unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);
4068     unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI);
4069 
4070     auto canUseSCCICMP = [&]() {
4071       auto Pred =
4072           static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
4073       return Size == 32 ||
4074              (Size == 64 &&
4075               (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
4076               Subtarget.hasScalarCompareEq64());
4077     };
4078     auto canUseSCCFCMP = [&]() {
4079       return Subtarget.hasSALUFloatInsts() && (Size == 32 || Size == 16);
4080     };
4081 
4082     bool isICMP = MI.getOpcode() == AMDGPU::G_ICMP;
4083     bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID &&
4084                      Op2Bank == AMDGPU::SGPRRegBankID &&
4085                      Op3Bank == AMDGPU::SGPRRegBankID &&
4086                      (isICMP ? canUseSCCICMP() : canUseSCCFCMP());
4087 
4088     DstBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
4089     unsigned SrcBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
4090 
4091     // TODO: Use 32-bit for scalar output size.
4092     // SCC results will need to be copied to a 32-bit SGPR virtual register.
4093     const unsigned ResultSize = 1;
4094 
4095     OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, ResultSize);
4096     OpdsMapping[1] = nullptr; // Predicate Operand.
4097     OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, Size);
4098     OpdsMapping[3] = AMDGPU::getValueMapping(SrcBank, Size);
4099     break;
4100   }
4101   case AMDGPU::G_EXTRACT_VECTOR_ELT: {
4102     // VGPR index can be used for waterfall when indexing a SGPR vector.
4103     unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
4104     unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4105     unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4106     unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4107     unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI);
4108     unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank);
4109 
4110     OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize);
4111     OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize);
4112 
4113     // The index can be either if the source vector is VGPR.
4114     OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4115     break;
4116   }
4117   case AMDGPU::G_INSERT_VECTOR_ELT: {
4118     unsigned OutputBankID = isSALUMapping(MI) ?
4119       AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
4120 
4121     unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4122     unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4123     unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
4124     unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
4125     unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI);
4126 
4127     OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
4128     OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize);
4129 
4130     // This is a weird case, because we need to break down the mapping based on
4131     // the register bank of a different operand.
4132     if (InsertSize == 64 && OutputBankID == AMDGPU::VGPRRegBankID) {
4133       OpdsMapping[2] = AMDGPU::getValueMappingSplit64(InsertEltBankID,
4134                                                       InsertSize);
4135     } else {
4136       assert(InsertSize == 32 || InsertSize == 64);
4137       OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBankID, InsertSize);
4138     }
4139 
4140     // The index can be either if the source vector is VGPR.
4141     OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize);
4142     break;
4143   }
4144   case AMDGPU::G_UNMERGE_VALUES: {
4145     unsigned Bank = getMappingType(MRI, MI);
4146 
4147     // Op1 and Dst should use the same register bank.
4148     // FIXME: Shouldn't this be the default? Why do we need to handle this?
4149     for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
4150       unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
4151       OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size);
4152     }
4153     break;
4154   }
4155   case AMDGPU::G_AMDGPU_BUFFER_LOAD:
4156   case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
4157   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
4158   case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
4159   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
4160   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
4161   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE:
4162   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
4163   case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
4164   case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
4165   case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
4166   case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16:
4167   case AMDGPU::G_AMDGPU_BUFFER_STORE:
4168   case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
4169   case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
4170   case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
4171   case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: {
4172     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4173 
4174     // rsrc
4175     OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4176 
4177     // vindex
4178     OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4179 
4180     // voffset
4181     OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4182 
4183     // soffset
4184     OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4185 
4186     // Any remaining operands are immediates and were correctly null
4187     // initialized.
4188     break;
4189   }
4190   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
4191   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
4192   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
4193   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
4194   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
4195   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
4196   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
4197   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
4198   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
4199   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
4200   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
4201   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
4202   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
4203   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
4204   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
4205     // vdata_out
4206     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4207 
4208     // vdata_in
4209     OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4210 
4211     // rsrc
4212     OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4213 
4214     // vindex
4215     OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4216 
4217     // voffset
4218     OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4219 
4220     // soffset
4221     OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4222 
4223     // Any remaining operands are immediates and were correctly null
4224     // initialized.
4225     break;
4226   }
4227   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
4228     // vdata_out
4229     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4230 
4231     // vdata_in
4232     OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4233 
4234     // cmp
4235     OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4236 
4237     // rsrc
4238     OpdsMapping[3] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4239 
4240     // vindex
4241     OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4242 
4243     // voffset
4244     OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4245 
4246     // soffset
4247     OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
4248 
4249     // Any remaining operands are immediates and were correctly null
4250     // initialized.
4251     break;
4252   }
4253   case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
4254     // Lie and claim everything is legal, even though some need to be
4255     // SGPRs. applyMapping will have to deal with it as a waterfall loop.
4256     OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4257     OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4258 
4259     // We need to convert this to a MUBUF if either the resource of offset is
4260     // VGPR.
4261     unsigned RSrcBank = OpdsMapping[1]->BreakDown[0].RegBank->getID();
4262     unsigned OffsetBank = OpdsMapping[2]->BreakDown[0].RegBank->getID();
4263     unsigned ResultBank = regBankUnion(RSrcBank, OffsetBank);
4264 
4265     unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4266     OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0);
4267     break;
4268   }
4269   case AMDGPU::G_INTRINSIC:
4270   case AMDGPU::G_INTRINSIC_CONVERGENT: {
4271     switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
4272     default:
4273       return getInvalidInstructionMapping();
4274     case Intrinsic::amdgcn_div_fmas:
4275     case Intrinsic::amdgcn_div_fixup:
4276     case Intrinsic::amdgcn_trig_preop:
4277     case Intrinsic::amdgcn_sin:
4278     case Intrinsic::amdgcn_cos:
4279     case Intrinsic::amdgcn_log_clamp:
4280     case Intrinsic::amdgcn_rcp_legacy:
4281     case Intrinsic::amdgcn_rsq_legacy:
4282     case Intrinsic::amdgcn_rsq_clamp:
4283     case Intrinsic::amdgcn_fmul_legacy:
4284     case Intrinsic::amdgcn_fma_legacy:
4285     case Intrinsic::amdgcn_frexp_mant:
4286     case Intrinsic::amdgcn_frexp_exp:
4287     case Intrinsic::amdgcn_fract:
4288     case Intrinsic::amdgcn_cvt_pknorm_i16:
4289     case Intrinsic::amdgcn_cvt_pknorm_u16:
4290     case Intrinsic::amdgcn_cvt_pk_i16:
4291     case Intrinsic::amdgcn_cvt_pk_u16:
4292     case Intrinsic::amdgcn_fmed3:
4293     case Intrinsic::amdgcn_cubeid:
4294     case Intrinsic::amdgcn_cubema:
4295     case Intrinsic::amdgcn_cubesc:
4296     case Intrinsic::amdgcn_cubetc:
4297     case Intrinsic::amdgcn_sffbh:
4298     case Intrinsic::amdgcn_fmad_ftz:
4299     case Intrinsic::amdgcn_mbcnt_lo:
4300     case Intrinsic::amdgcn_mbcnt_hi:
4301     case Intrinsic::amdgcn_mul_u24:
4302     case Intrinsic::amdgcn_mul_i24:
4303     case Intrinsic::amdgcn_mulhi_u24:
4304     case Intrinsic::amdgcn_mulhi_i24:
4305     case Intrinsic::amdgcn_lerp:
4306     case Intrinsic::amdgcn_sad_u8:
4307     case Intrinsic::amdgcn_msad_u8:
4308     case Intrinsic::amdgcn_sad_hi_u8:
4309     case Intrinsic::amdgcn_sad_u16:
4310     case Intrinsic::amdgcn_qsad_pk_u16_u8:
4311     case Intrinsic::amdgcn_mqsad_pk_u16_u8:
4312     case Intrinsic::amdgcn_mqsad_u32_u8:
4313     case Intrinsic::amdgcn_cvt_pk_u8_f32:
4314     case Intrinsic::amdgcn_alignbyte:
4315     case Intrinsic::amdgcn_perm:
4316     case Intrinsic::amdgcn_fdot2:
4317     case Intrinsic::amdgcn_sdot2:
4318     case Intrinsic::amdgcn_udot2:
4319     case Intrinsic::amdgcn_sdot4:
4320     case Intrinsic::amdgcn_udot4:
4321     case Intrinsic::amdgcn_sdot8:
4322     case Intrinsic::amdgcn_udot8:
4323     case Intrinsic::amdgcn_fdot2_bf16_bf16:
4324     case Intrinsic::amdgcn_fdot2_f16_f16:
4325     case Intrinsic::amdgcn_fdot2_f32_bf16:
4326     case Intrinsic::amdgcn_sudot4:
4327     case Intrinsic::amdgcn_sudot8:
4328     case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16:
4329     case Intrinsic::amdgcn_wmma_f16_16x16x16_f16:
4330     case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied:
4331     case Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied:
4332     case Intrinsic::amdgcn_wmma_f32_16x16x16_bf16:
4333     case Intrinsic::amdgcn_wmma_f32_16x16x16_f16:
4334     case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4:
4335     case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8:
4336       return getDefaultMappingVOP(MI);
4337     case Intrinsic::amdgcn_log:
4338     case Intrinsic::amdgcn_exp2:
4339     case Intrinsic::amdgcn_rcp:
4340     case Intrinsic::amdgcn_rsq:
4341     case Intrinsic::amdgcn_sqrt: {
4342       unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4343       if (Subtarget.hasPseudoScalarTrans() && (Size == 16 || Size == 32) &&
4344           isSALUMapping(MI))
4345         return getDefaultMappingSOP(MI);
4346       return getDefaultMappingVOP(MI);
4347     }
4348     case Intrinsic::amdgcn_sbfe:
4349     case Intrinsic::amdgcn_ubfe:
4350       if (isSALUMapping(MI))
4351         return getDefaultMappingSOP(MI);
4352       return getDefaultMappingVOP(MI);
4353     case Intrinsic::amdgcn_ds_swizzle:
4354     case Intrinsic::amdgcn_ds_permute:
4355     case Intrinsic::amdgcn_ds_bpermute:
4356     case Intrinsic::amdgcn_update_dpp:
4357     case Intrinsic::amdgcn_mov_dpp8:
4358     case Intrinsic::amdgcn_mov_dpp:
4359     case Intrinsic::amdgcn_strict_wwm:
4360     case Intrinsic::amdgcn_wwm:
4361     case Intrinsic::amdgcn_strict_wqm:
4362     case Intrinsic::amdgcn_wqm:
4363     case Intrinsic::amdgcn_softwqm:
4364     case Intrinsic::amdgcn_set_inactive:
4365     case Intrinsic::amdgcn_set_inactive_chain_arg:
4366     case Intrinsic::amdgcn_permlane64:
4367       return getDefaultMappingAllVGPR(MI);
4368     case Intrinsic::amdgcn_cvt_pkrtz:
4369       if (Subtarget.hasSALUFloatInsts() && isSALUMapping(MI))
4370         return getDefaultMappingSOP(MI);
4371       return getDefaultMappingVOP(MI);
4372     case Intrinsic::amdgcn_kernarg_segment_ptr:
4373     case Intrinsic::amdgcn_s_getpc:
4374     case Intrinsic::amdgcn_groupstaticsize:
4375     case Intrinsic::amdgcn_reloc_constant:
4376     case Intrinsic::returnaddress: {
4377       unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4378       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4379       break;
4380     }
4381     case Intrinsic::amdgcn_wqm_vote: {
4382       unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4383       OpdsMapping[0] = OpdsMapping[2]
4384         = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size);
4385       break;
4386     }
4387     case Intrinsic::amdgcn_ps_live: {
4388       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4389       break;
4390     }
4391     case Intrinsic::amdgcn_div_scale: {
4392       unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4393       unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4394       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size);
4395       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size);
4396 
4397       unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
4398       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4399       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4400       break;
4401     }
4402     case Intrinsic::amdgcn_class: {
4403       Register Src0Reg = MI.getOperand(2).getReg();
4404       Register Src1Reg = MI.getOperand(3).getReg();
4405       unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits();
4406       unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits();
4407       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4408       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
4409       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src0Size);
4410       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src1Size);
4411       break;
4412     }
4413     case Intrinsic::amdgcn_icmp:
4414     case Intrinsic::amdgcn_fcmp: {
4415       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4416       // This is not VCCRegBank because this is not used in boolean contexts.
4417       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4418       unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4419       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
4420       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
4421       break;
4422     }
4423     case Intrinsic::amdgcn_readlane: {
4424       // This must be an SGPR, but accept a VGPR.
4425       Register IdxReg = MI.getOperand(3).getReg();
4426       unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
4427       unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
4428       OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4429       [[fallthrough]];
4430     }
4431     case Intrinsic::amdgcn_readfirstlane: {
4432       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4433       unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4434       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4435       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4436       break;
4437     }
4438     case Intrinsic::amdgcn_writelane: {
4439       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4440       Register SrcReg = MI.getOperand(2).getReg();
4441       unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
4442       unsigned SrcBank = getRegBankID(SrcReg, MRI, AMDGPU::SGPRRegBankID);
4443       Register IdxReg = MI.getOperand(3).getReg();
4444       unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
4445       unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
4446       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4447 
4448       // These 2 must be SGPRs, but accept VGPRs. Readfirstlane will be inserted
4449       // to legalize.
4450       OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize);
4451       OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4452       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4453       break;
4454     }
4455     case Intrinsic::amdgcn_if_break: {
4456       unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4457       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4458       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4459       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4460       break;
4461     }
4462     case Intrinsic::amdgcn_permlane16:
4463     case Intrinsic::amdgcn_permlanex16: {
4464       unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4465       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4466       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4467       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4468       OpdsMapping[4] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4469       OpdsMapping[5] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4470       break;
4471     }
4472     case Intrinsic::amdgcn_permlane16_var:
4473     case Intrinsic::amdgcn_permlanex16_var: {
4474       unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4475       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4476       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4477       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4478       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4479       break;
4480     }
4481     case Intrinsic::amdgcn_mfma_f32_4x4x1f32:
4482     case Intrinsic::amdgcn_mfma_f32_4x4x4f16:
4483     case Intrinsic::amdgcn_mfma_i32_4x4x4i8:
4484     case Intrinsic::amdgcn_mfma_f32_4x4x2bf16:
4485     case Intrinsic::amdgcn_mfma_f32_16x16x1f32:
4486     case Intrinsic::amdgcn_mfma_f32_16x16x4f32:
4487     case Intrinsic::amdgcn_mfma_f32_16x16x4f16:
4488     case Intrinsic::amdgcn_mfma_f32_16x16x16f16:
4489     case Intrinsic::amdgcn_mfma_i32_16x16x4i8:
4490     case Intrinsic::amdgcn_mfma_i32_16x16x16i8:
4491     case Intrinsic::amdgcn_mfma_f32_16x16x2bf16:
4492     case Intrinsic::amdgcn_mfma_f32_16x16x8bf16:
4493     case Intrinsic::amdgcn_mfma_f32_32x32x1f32:
4494     case Intrinsic::amdgcn_mfma_f32_32x32x2f32:
4495     case Intrinsic::amdgcn_mfma_f32_32x32x4f16:
4496     case Intrinsic::amdgcn_mfma_f32_32x32x8f16:
4497     case Intrinsic::amdgcn_mfma_i32_32x32x4i8:
4498     case Intrinsic::amdgcn_mfma_i32_32x32x8i8:
4499     case Intrinsic::amdgcn_mfma_f32_32x32x2bf16:
4500     case Intrinsic::amdgcn_mfma_f32_32x32x4bf16:
4501     case Intrinsic::amdgcn_mfma_f32_32x32x4bf16_1k:
4502     case Intrinsic::amdgcn_mfma_f32_16x16x4bf16_1k:
4503     case Intrinsic::amdgcn_mfma_f32_4x4x4bf16_1k:
4504     case Intrinsic::amdgcn_mfma_f32_32x32x8bf16_1k:
4505     case Intrinsic::amdgcn_mfma_f32_16x16x16bf16_1k:
4506     case Intrinsic::amdgcn_mfma_f64_16x16x4f64:
4507     case Intrinsic::amdgcn_mfma_f64_4x4x4f64:
4508     case Intrinsic::amdgcn_mfma_i32_16x16x32_i8:
4509     case Intrinsic::amdgcn_mfma_i32_32x32x16_i8:
4510     case Intrinsic::amdgcn_mfma_f32_16x16x8_xf32:
4511     case Intrinsic::amdgcn_mfma_f32_32x32x4_xf32:
4512     case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_bf8:
4513     case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_fp8:
4514     case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_bf8:
4515     case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_fp8:
4516     case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_bf8:
4517     case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_fp8:
4518     case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_bf8:
4519     case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_fp8: {
4520       // Default for MAI intrinsics.
4521       // srcC can also be an immediate which can be folded later.
4522       // FIXME: Should we eventually add an alternative mapping with AGPR src
4523       // for srcA/srcB?
4524       //
4525       // vdst, srcA, srcB, srcC
4526       const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4527       OpdsMapping[0] =
4528           Info->mayNeedAGPRs()
4529               ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI)
4530               : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4531       OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4532       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4533       OpdsMapping[4] =
4534           Info->mayNeedAGPRs()
4535               ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI)
4536               : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4537       break;
4538     }
4539     case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
4540     case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
4541     case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
4542     case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
4543     case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
4544     case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
4545     case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
4546     case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
4547     case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
4548     case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
4549     case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
4550     case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
4551     case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
4552     case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8: {
4553       // vdst, srcA, srcB, srcC, idx
4554       OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4555       OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4556       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4557       OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4558       OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4559       break;
4560     }
4561     case Intrinsic::amdgcn_interp_p1:
4562     case Intrinsic::amdgcn_interp_p2:
4563     case Intrinsic::amdgcn_interp_mov:
4564     case Intrinsic::amdgcn_interp_p1_f16:
4565     case Intrinsic::amdgcn_interp_p2_f16:
4566     case Intrinsic::amdgcn_lds_param_load: {
4567       const int M0Idx = MI.getNumOperands() - 1;
4568       Register M0Reg = MI.getOperand(M0Idx).getReg();
4569       unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
4570       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4571 
4572       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4573       for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
4574         OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4575 
4576       // Must be SGPR, but we must take whatever the original bank is and fix it
4577       // later.
4578       OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
4579       break;
4580     }
4581     case Intrinsic::amdgcn_interp_inreg_p10:
4582     case Intrinsic::amdgcn_interp_inreg_p2:
4583     case Intrinsic::amdgcn_interp_inreg_p10_f16:
4584     case Intrinsic::amdgcn_interp_inreg_p2_f16: {
4585       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4586       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4587       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4588       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4589       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4590       break;
4591     }
4592     case Intrinsic::amdgcn_ballot: {
4593       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4594       unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4595       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4596       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, SrcSize);
4597       break;
4598     }
4599     case Intrinsic::amdgcn_inverse_ballot: {
4600       // This must be an SGPR, but accept a VGPR.
4601       Register MaskReg = MI.getOperand(2).getReg();
4602       unsigned MaskSize = MRI.getType(MaskReg).getSizeInBits();
4603       unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID);
4604       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4605       OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize);
4606       break;
4607     }
4608     case Intrinsic::amdgcn_s_quadmask:
4609     case Intrinsic::amdgcn_s_wqm: {
4610       Register MaskReg = MI.getOperand(2).getReg();
4611       unsigned MaskSize = MRI.getType(MaskReg).getSizeInBits();
4612       unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID);
4613       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, MaskSize);
4614       OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize);
4615       break;
4616     }
4617     case Intrinsic::amdgcn_wave_reduce_umin:
4618     case Intrinsic::amdgcn_wave_reduce_umax: {
4619       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4620       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4621       unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4622       auto regBankID =
4623           isSALUMapping(MI) ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
4624       OpdsMapping[2] = AMDGPU::getValueMapping(regBankID, OpSize);
4625       break;
4626     }
4627     case Intrinsic::amdgcn_s_bitreplicate:
4628       Register MaskReg = MI.getOperand(2).getReg();
4629       unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID);
4630       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64);
4631       OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, 32);
4632     }
4633     break;
4634   }
4635   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
4636   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
4637   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
4638   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
4639     auto IntrID = AMDGPU::getIntrinsicID(MI);
4640     const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID);
4641     assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic");
4642     // Non-images can have complications from operands that allow both SGPR
4643     // and VGPR. For now it's too complicated to figure out the final opcode
4644     // to derive the register bank from the MCInstrDesc.
4645     assert(RSrcIntrin->IsImage);
4646     return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg);
4647   }
4648   case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
4649     unsigned N = MI.getNumExplicitOperands() - 2;
4650     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 128);
4651     OpdsMapping[N] = getSGPROpMapping(MI.getOperand(N).getReg(), MRI, *TRI);
4652     if (N == 3) {
4653       // Sequential form: all operands combined into VGPR256/VGPR512
4654       unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4655       if (Size > 256)
4656         Size = 512;
4657       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4658     } else {
4659       // NSA form
4660       for (unsigned I = 2; I < N; ++I) {
4661         unsigned Size = MRI.getType(MI.getOperand(I).getReg()).getSizeInBits();
4662         OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4663       }
4664     }
4665     break;
4666   }
4667   case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
4668   case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS: {
4669     auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
4670     switch (IntrID) {
4671     case Intrinsic::amdgcn_s_getreg:
4672     case Intrinsic::amdgcn_s_memtime:
4673     case Intrinsic::amdgcn_s_memrealtime:
4674     case Intrinsic::amdgcn_s_get_waveid_in_workgroup:
4675     case Intrinsic::amdgcn_s_sendmsg_rtn: {
4676       unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4677       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4678       break;
4679     }
4680     case Intrinsic::amdgcn_global_atomic_fadd:
4681     case Intrinsic::amdgcn_global_atomic_csub:
4682     case Intrinsic::amdgcn_global_atomic_fmin:
4683     case Intrinsic::amdgcn_global_atomic_fmax:
4684     case Intrinsic::amdgcn_global_atomic_fmin_num:
4685     case Intrinsic::amdgcn_global_atomic_fmax_num:
4686     case Intrinsic::amdgcn_flat_atomic_fadd:
4687     case Intrinsic::amdgcn_flat_atomic_fmin:
4688     case Intrinsic::amdgcn_flat_atomic_fmax:
4689     case Intrinsic::amdgcn_flat_atomic_fmin_num:
4690     case Intrinsic::amdgcn_flat_atomic_fmax_num:
4691     case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
4692     case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
4693       return getDefaultMappingAllVGPR(MI);
4694     case Intrinsic::amdgcn_ds_ordered_add:
4695     case Intrinsic::amdgcn_ds_ordered_swap:
4696     case Intrinsic::amdgcn_ds_fadd_v2bf16: {
4697       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4698       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4699       unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4700                                  AMDGPU::SGPRRegBankID);
4701       OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32);
4702       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4703       break;
4704     }
4705     case Intrinsic::amdgcn_ds_append:
4706     case Intrinsic::amdgcn_ds_consume: {
4707       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4708       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4709       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4710       break;
4711     }
4712     case Intrinsic::amdgcn_exp_compr:
4713       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4714       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4715       break;
4716     case Intrinsic::amdgcn_exp:
4717       // FIXME: Could we support packed types here?
4718       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4719       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4720       OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4721       OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4722       break;
4723     case Intrinsic::amdgcn_exp_row:
4724       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4725       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4726       OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4727       OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4728       OpdsMapping[8] = getSGPROpMapping(MI.getOperand(8).getReg(), MRI, *TRI);
4729       break;
4730     case Intrinsic::amdgcn_s_sendmsg:
4731     case Intrinsic::amdgcn_s_sendmsghalt: {
4732       // This must be an SGPR, but accept a VGPR.
4733       unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4734                                    AMDGPU::SGPRRegBankID);
4735       OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
4736       break;
4737     }
4738     case Intrinsic::amdgcn_s_setreg: {
4739       // This must be an SGPR, but accept a VGPR.
4740       unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4741                                    AMDGPU::SGPRRegBankID);
4742       OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
4743       break;
4744     }
4745     case Intrinsic::amdgcn_s_ttracedata: {
4746       // This must be an SGPR, but accept a VGPR.
4747       unsigned Bank =
4748           getRegBankID(MI.getOperand(1).getReg(), MRI, AMDGPU::SGPRRegBankID);
4749       OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
4750       break;
4751     }
4752     case Intrinsic::amdgcn_end_cf: {
4753       unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4754       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4755       break;
4756     }
4757     case Intrinsic::amdgcn_else: {
4758       unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4759       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4760       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
4761       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
4762       break;
4763     }
4764     case Intrinsic::amdgcn_live_mask: {
4765       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4766       break;
4767     }
4768     case Intrinsic::amdgcn_wqm_demote:
4769     case Intrinsic::amdgcn_kill: {
4770       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4771       break;
4772     }
4773     case Intrinsic::amdgcn_raw_buffer_load:
4774     case Intrinsic::amdgcn_raw_ptr_buffer_load:
4775     case Intrinsic::amdgcn_raw_tbuffer_load:
4776     case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
4777       // FIXME: Should make intrinsic ID the last operand of the instruction,
4778       // then this would be the same as store
4779       OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4780       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4781       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4782       OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4783       break;
4784     }
4785     case Intrinsic::amdgcn_raw_buffer_load_lds:
4786     case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: {
4787       OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4788       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4789       OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4790       OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4791       break;
4792     }
4793     case Intrinsic::amdgcn_raw_buffer_store:
4794     case Intrinsic::amdgcn_raw_ptr_buffer_store:
4795     case Intrinsic::amdgcn_raw_buffer_store_format:
4796     case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
4797     case Intrinsic::amdgcn_raw_tbuffer_store:
4798     case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
4799       OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4800       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4801       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4802       OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4803       break;
4804     }
4805     case Intrinsic::amdgcn_struct_buffer_load:
4806     case Intrinsic::amdgcn_struct_ptr_buffer_load:
4807     case Intrinsic::amdgcn_struct_tbuffer_load:
4808     case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
4809       OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4810       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4811       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4812       OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4813       OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4814       break;
4815     }
4816     case Intrinsic::amdgcn_struct_buffer_load_lds:
4817     case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
4818       OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4819       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4820       OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4821       OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4822       OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
4823       break;
4824     }
4825     case Intrinsic::amdgcn_struct_buffer_store:
4826     case Intrinsic::amdgcn_struct_ptr_buffer_store:
4827     case Intrinsic::amdgcn_struct_tbuffer_store:
4828     case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
4829       OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4830       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4831       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4832       OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4833       OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4834       break;
4835     }
4836     case Intrinsic::amdgcn_init_exec_from_input: {
4837       unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4838       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4839       break;
4840     }
4841     case Intrinsic::amdgcn_ds_gws_init:
4842     case Intrinsic::amdgcn_ds_gws_barrier:
4843     case Intrinsic::amdgcn_ds_gws_sema_br: {
4844       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4845 
4846       // This must be an SGPR, but accept a VGPR.
4847       unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4848                                    AMDGPU::SGPRRegBankID);
4849       OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
4850       break;
4851     }
4852     case Intrinsic::amdgcn_ds_gws_sema_v:
4853     case Intrinsic::amdgcn_ds_gws_sema_p:
4854     case Intrinsic::amdgcn_ds_gws_sema_release_all: {
4855       // This must be an SGPR, but accept a VGPR.
4856       unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI,
4857                                    AMDGPU::SGPRRegBankID);
4858       OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
4859       break;
4860     }
4861     case Intrinsic::amdgcn_global_load_lds: {
4862       OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4863       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4864       break;
4865     }
4866     case Intrinsic::amdgcn_lds_direct_load: {
4867       const int M0Idx = MI.getNumOperands() - 1;
4868       Register M0Reg = MI.getOperand(M0Idx).getReg();
4869       unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
4870       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4871 
4872       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4873       for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
4874         OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4875 
4876       // Must be SGPR, but we must take whatever the original bank is and fix it
4877       // later.
4878       OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
4879       break;
4880     }
4881     case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
4882     case Intrinsic::amdgcn_ds_sub_gs_reg_rtn:
4883       OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4884       OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4885       break;
4886     case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
4887       OpdsMapping[0] =
4888           getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); // %vdst
4889       OpdsMapping[1] =
4890           getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); // %addr
4891       OpdsMapping[3] =
4892           getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); // %addr
4893       OpdsMapping[4] =
4894           getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); // %data0
4895       OpdsMapping[5] =
4896           getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); // %data1
4897       break;
4898     }
4899     case Intrinsic::amdgcn_s_sleep_var:
4900       OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4901       break;
4902     case Intrinsic::amdgcn_s_barrier_signal_var:
4903     case Intrinsic::amdgcn_s_barrier_join:
4904     case Intrinsic::amdgcn_s_wakeup_barrier:
4905       OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4906       break;
4907     case Intrinsic::amdgcn_s_barrier_init:
4908       OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4909       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4910       break;
4911     case Intrinsic::amdgcn_s_barrier_signal_isfirst_var: {
4912       const unsigned ResultSize = 1;
4913       OpdsMapping[0] =
4914           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, ResultSize);
4915       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4916       break;
4917     }
4918     case Intrinsic::amdgcn_s_barrier_signal_isfirst:
4919     case Intrinsic::amdgcn_s_barrier_leave: {
4920       const unsigned ResultSize = 1;
4921       OpdsMapping[0] =
4922           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, ResultSize);
4923       break;
4924     }
4925     case Intrinsic::amdgcn_s_get_barrier_state: {
4926       OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4927       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4928       break;
4929     }
4930     default:
4931       return getInvalidInstructionMapping();
4932     }
4933     break;
4934   }
4935   case AMDGPU::G_SELECT: {
4936     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4937     unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4938                                     AMDGPU::SGPRRegBankID);
4939     unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI,
4940                                     AMDGPU::SGPRRegBankID);
4941     bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID &&
4942                     Op3Bank == AMDGPU::SGPRRegBankID;
4943 
4944     unsigned CondBankDefault = SGPRSrcs ?
4945       AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
4946     unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI,
4947                                      CondBankDefault);
4948     if (CondBank == AMDGPU::SGPRRegBankID)
4949       CondBank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
4950     else if (CondBank == AMDGPU::VGPRRegBankID)
4951       CondBank = AMDGPU::VCCRegBankID;
4952 
4953     unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SGPRRegBankID ?
4954       AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
4955 
4956     assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SGPRRegBankID);
4957 
4958     // TODO: Should report 32-bit for scalar condition type.
4959     if (Size == 64) {
4960       OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
4961       OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
4962       OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
4963       OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
4964     } else {
4965       OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);
4966       OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
4967       OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);
4968       OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);
4969     }
4970 
4971     break;
4972   }
4973 
4974   case AMDGPU::G_SI_CALL: {
4975     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64);
4976     // Lie and claim everything is legal, even though some need to be
4977     // SGPRs. applyMapping will have to deal with it as a waterfall loop.
4978     OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4979 
4980     // Allow anything for implicit arguments
4981     for (unsigned I = 4; I < MI.getNumOperands(); ++I) {
4982       if (MI.getOperand(I).isReg()) {
4983         Register Reg = MI.getOperand(I).getReg();
4984         auto OpBank = getRegBankID(Reg, MRI);
4985         unsigned Size = getSizeInBits(Reg, MRI, *TRI);
4986         OpdsMapping[I] = AMDGPU::getValueMapping(OpBank, Size);
4987       }
4988     }
4989     break;
4990   }
4991   case AMDGPU::G_LOAD:
4992   case AMDGPU::G_ZEXTLOAD:
4993   case AMDGPU::G_SEXTLOAD:
4994     return getInstrMappingForLoad(MI);
4995 
4996   case AMDGPU::G_ATOMICRMW_XCHG:
4997   case AMDGPU::G_ATOMICRMW_ADD:
4998   case AMDGPU::G_ATOMICRMW_SUB:
4999   case AMDGPU::G_ATOMICRMW_AND:
5000   case AMDGPU::G_ATOMICRMW_OR:
5001   case AMDGPU::G_ATOMICRMW_XOR:
5002   case AMDGPU::G_ATOMICRMW_MAX:
5003   case AMDGPU::G_ATOMICRMW_MIN:
5004   case AMDGPU::G_ATOMICRMW_UMAX:
5005   case AMDGPU::G_ATOMICRMW_UMIN:
5006   case AMDGPU::G_ATOMICRMW_FADD:
5007   case AMDGPU::G_ATOMICRMW_UINC_WRAP:
5008   case AMDGPU::G_ATOMICRMW_UDEC_WRAP:
5009   case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
5010   case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
5011   case AMDGPU::G_AMDGPU_ATOMIC_FMAX: {
5012     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
5013     OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
5014     OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5015     break;
5016   }
5017   case AMDGPU::G_ATOMIC_CMPXCHG: {
5018     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
5019     OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
5020     OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5021     OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
5022     break;
5023   }
5024   case AMDGPU::G_BRCOND: {
5025     unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI,
5026                                  AMDGPU::SGPRRegBankID);
5027     assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
5028     if (Bank != AMDGPU::SGPRRegBankID)
5029       Bank = AMDGPU::VCCRegBankID;
5030 
5031     OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
5032     break;
5033   }
5034   case AMDGPU::G_FPTRUNC_ROUND_UPWARD:
5035   case AMDGPU::G_FPTRUNC_ROUND_DOWNWARD:
5036     return getDefaultMappingVOP(MI);
5037   case AMDGPU::G_PREFETCH:
5038     OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
5039     break;
5040   }
5041 
5042   return getInstructionMapping(/*ID*/1, /*Cost*/1,
5043                                getOperandsMapping(OpdsMapping),
5044                                MI.getNumOperands());
5045 }
5046