1 //===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the RegisterBankInfo class for
10 /// AMDGPU.
11 ///
12 /// \par
13 ///
14 /// AMDGPU has unique register bank constraints that require special high level
15 /// strategies to deal with. There are two main true physical register banks:
16 /// VGPR (vector) and SGPR (scalar). Additionally the VCC register bank is a
17 /// sort of pseudo-register bank needed to represent SGPRs used in a vector
18 /// boolean context. There is also the AGPR bank, which is a special purpose
19 /// physical register bank present on some subtargets.
20 ///
21 /// Copying from VGPR to SGPR is generally illegal, unless the value is known to
22 /// be uniform. It is generally not valid to legalize operands by inserting
23 /// copies as on other targets. Operations which require uniform, SGPR operands
24 /// generally require scalarization by repeatedly executing the instruction,
25 /// activating each set of lanes using a unique set of input values. This is
26 /// referred to as a waterfall loop.
27 ///
28 /// \par Booleans
29 ///
30 /// Booleans (s1 values) require special consideration. A vector compare result
31 /// is naturally a bitmask with one bit per lane, in a 32 or 64-bit
32 /// register. These are represented with the VCC bank. During selection, we need
33 /// to be able to unambiguously go back from a register class to a register
34 /// bank. To distinguish whether an SGPR should use the SGPR or VCC register
35 /// bank, we need to know the use context type. An SGPR s1 value always means a
36 /// VCC bank value; otherwise it will be the SGPR bank. A scalar compare sets
37 /// SCC, which is a 1-bit unaddressable register. This will need to be copied to
38 /// a 32-bit virtual register. Taken together, this means we need to adjust the
39 /// type of boolean operations to be regbank legal. All SALU booleans need to be
40 /// widened to 32-bits, and all VALU booleans need to be s1 values.
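///
/// For example (illustrative MIR, matching the mappings chosen in this file),
/// a uniform i1 AND is widened to a 32-bit SGPR operation, while a divergent
/// i1 AND stays an s1 value on the VCC bank:
///
///   %2:sgpr(s32) = G_AND %0:sgpr(s32), %1:sgpr(s32)
///   %5:vcc(s1) = G_AND %3:vcc(s1), %4:vcc(s1)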
41 ///
42 /// A noteworthy exception to the s1-means-vcc rule is for legalization artifact
43 /// casts. G_TRUNC s1 results, and G_SEXT/G_ZEXT/G_ANYEXT sources are never vcc
44 /// bank. A non-boolean source (such as a truncate from a 1-bit load from
45 /// memory) will require a copy to the VCC bank which will require clearing the
46 /// high bits and inserting a compare.
47 ///
48 /// \par Constant bus restriction
49 ///
50 /// VALU instructions have a limitation known as the constant bus
51 /// restriction. Most VALU instructions can use SGPR operands, but may read at
52 /// most 1 SGPR or constant literal value (this is relaxed to 2 in gfx10 for most
53 /// instructions). This is one unique SGPR, so the same SGPR may be used for
54 /// multiple operands. From a register bank perspective, any combination of
55 /// operands should be legal as an SGPR, but this is contextually dependent on
56 /// the SGPR operands all being the same register. It is therefore optimal to
57 /// choose the SGPR with the most uses to minimize the number of copies.
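///
/// For example (illustrative assembly), the VOP3 form of v_add_f32 may use
/// the same SGPR for both sources, since that is a single unique scalar read,
/// but two distinct SGPR sources only fit within the gfx10 limit of 2:
///
///   v_add_f32_e64 v0, s0, s0   ; one unique SGPR, legal on all targets
///   v_add_f32_e64 v0, s0, s1   ; two unique SGPRs, requires gfx10+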
58 ///
59 /// We avoid trying to solve this problem in RegBankSelect. Any VALU G_*
60 /// operation should have its source operands all mapped to VGPRs (except for
61 /// VCC), inserting copies from any SGPR operands. This is the most trivial legal
62 /// mapping. Anything beyond the simplest 1:1 instruction selection would be too
63 /// complicated to solve here. Every optimization pattern or instruction
64 /// selected to multiple outputs would have to enforce this rule, and there
65 /// would be additional complexity in tracking this rule for every G_*
66 /// operation. By forcing all inputs to VGPRs, it also simplifies the task of
67 /// picking the optimal operand combination from a post-isel optimization pass.
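///
/// For example (illustrative MIR), a divergent G_ADD with one SGPR input is
/// made regbank-legal by copying that input to a VGPR rather than trying to
/// preserve the scalar operand:
///
///   %3:vgpr(s32) = COPY %1:sgpr(s32)
///   %4:vgpr(s32) = G_ADD %0:vgpr(s32), %3:vgpr(s32)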
68 ///
69 //===----------------------------------------------------------------------===//
70
71 #include "AMDGPURegisterBankInfo.h"
72
73 #include "AMDGPU.h"
74 #include "AMDGPUGlobalISelUtils.h"
75 #include "AMDGPUInstrInfo.h"
76 #include "GCNSubtarget.h"
77 #include "SIMachineFunctionInfo.h"
78 #include "SIRegisterInfo.h"
79 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
80 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
81 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
82 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
83 #include "llvm/CodeGen/RegisterBank.h"
84 #include "llvm/IR/IntrinsicsAMDGPU.h"
85
86 #define GET_TARGET_REGBANK_IMPL
87 #include "AMDGPUGenRegisterBank.inc"
88
89 // This file will be TableGen'ed at some point.
90 #include "AMDGPUGenRegisterBankInfo.def"
91
92 using namespace llvm;
93 using namespace MIPatternMatch;
94
95 namespace {
96
97 // Observer to apply a register bank to new registers created by LegalizerHelper.
98 class ApplyRegBankMapping final : public GISelChangeObserver {
99 private:
100 MachineIRBuilder &B;
101 const AMDGPURegisterBankInfo &RBI;
102 MachineRegisterInfo &MRI;
103 const RegisterBank *NewBank;
104 SmallVector<MachineInstr *, 4> NewInsts;
105
106 public:
107 ApplyRegBankMapping(MachineIRBuilder &B, const AMDGPURegisterBankInfo &RBI_,
108 MachineRegisterInfo &MRI_, const RegisterBank *RB)
109 : B(B), RBI(RBI_), MRI(MRI_), NewBank(RB) {
110 assert(!B.isObservingChanges());
111 B.setChangeObserver(*this);
112 }
113
114 ~ApplyRegBankMapping() override {
115 for (MachineInstr *MI : NewInsts)
116 applyBank(*MI);
117
118 B.stopObservingChanges();
119 }
120
121 /// Set any registers that don't have a set register class or bank to SALU.
122 void applyBank(MachineInstr &MI) {
123 const unsigned Opc = MI.getOpcode();
124 if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT ||
125 Opc == AMDGPU::G_SEXT) {
126 // LegalizerHelper wants to use the basic legalization artifacts when
127 // widening etc. We don't handle selection with vcc in artifact sources,
128 // so we need to use a select instead to handle these properly.
129 Register DstReg = MI.getOperand(0).getReg();
130 Register SrcReg = MI.getOperand(1).getReg();
131 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI);
132 if (SrcBank == &AMDGPU::VCCRegBank) {
133 const LLT S32 = LLT::scalar(32);
134 assert(MRI.getType(SrcReg) == LLT::scalar(1));
135 assert(MRI.getType(DstReg) == S32);
136 assert(NewBank == &AMDGPU::VGPRRegBank);
137
138 // Replace the extension with a select, which really uses the boolean
139 // source.
140 B.setInsertPt(*MI.getParent(), MI);
141
142 auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? -1 : 1);
143 auto False = B.buildConstant(S32, 0);
144 B.buildSelect(DstReg, SrcReg, True, False);
145 MRI.setRegBank(True.getReg(0), *NewBank);
146 MRI.setRegBank(False.getReg(0), *NewBank);
147 MI.eraseFromParent();
148 }
149
150 assert(!MRI.getRegClassOrRegBank(DstReg));
151 MRI.setRegBank(DstReg, *NewBank);
152 return;
153 }
154
155 #ifndef NDEBUG
156 if (Opc == AMDGPU::G_TRUNC) {
157 Register DstReg = MI.getOperand(0).getReg();
158 const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI);
159 assert(DstBank != &AMDGPU::VCCRegBank);
160 }
161 #endif
162
163 for (MachineOperand &Op : MI.operands()) {
164 if (!Op.isReg())
165 continue;
166
167 // We may see physical registers if building a real MI
168 Register Reg = Op.getReg();
169 if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg))
170 continue;
171
172 const RegisterBank *RB = NewBank;
173 if (MRI.getType(Reg) == LLT::scalar(1)) {
174 assert(NewBank == &AMDGPU::VGPRRegBank &&
175 "s1 operands should only be used for vector bools");
176 assert((MI.getOpcode() != AMDGPU::G_TRUNC &&
177 MI.getOpcode() != AMDGPU::G_ANYEXT) &&
178 "not expecting legalization artifacts here");
179 RB = &AMDGPU::VCCRegBank;
180 }
181
182 MRI.setRegBank(Reg, *RB);
183 }
184 }
185
186 void erasingInstr(MachineInstr &MI) override {}
187
188 void createdInstr(MachineInstr &MI) override {
189 // At this point, the instruction was just inserted and has no operands.
190 NewInsts.push_back(&MI);
191 }
192
193 void changingInstr(MachineInstr &MI) override {}
194 void changedInstr(MachineInstr &MI) override {
195 // FIXME: In principle we should probably add the instruction to NewInsts,
196 // but the way the LegalizerHelper uses the observer, we will always see the
197 // registers we need to set the regbank on also referenced in a new
198 // instruction.
199 }
200 };
201
202 } // anonymous namespace
203
204 AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
205 : Subtarget(ST), TRI(Subtarget.getRegisterInfo()),
206 TII(Subtarget.getInstrInfo()) {
207
208 // HACK: Until this is fully tablegen'd.
209 static llvm::once_flag InitializeRegisterBankFlag;
210
211 static auto InitializeRegisterBankOnce = [this]() {
212 assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank &&
213 &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank &&
214 &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank);
215 (void)this;
216 };
217
218 llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce);
219 }
220
221 static bool isVectorRegisterBank(const RegisterBank &Bank) {
222 unsigned BankID = Bank.getID();
223 return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID;
224 }
225
226 bool AMDGPURegisterBankInfo::isDivergentRegBank(const RegisterBank *RB) const {
227 return RB != &AMDGPU::SGPRRegBank;
228 }
229
230 unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
231 const RegisterBank &Src,
232 TypeSize Size) const {
233 // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
234 if (Dst.getID() == AMDGPU::SGPRRegBankID &&
235 (isVectorRegisterBank(Src) || Src.getID() == AMDGPU::VCCRegBankID)) {
236 return std::numeric_limits<unsigned>::max();
237 }
238
239 // Bool values are tricky, because the meaning is based on context. The SCC
240 // and VCC banks are for the natural scalar and vector conditions produced by
241 // a compare.
242 //
243 // Legalization doesn't know about the necessary context, so an s1 use may
244 // have been a truncate from an arbitrary value, in which case a copy (lowered
245 // as a compare with 0) needs to be inserted.
246 if (Size == 1 &&
247 (Dst.getID() == AMDGPU::SGPRRegBankID) &&
248 (isVectorRegisterBank(Src) ||
249 Src.getID() == AMDGPU::SGPRRegBankID ||
250 Src.getID() == AMDGPU::VCCRegBankID))
251 return std::numeric_limits<unsigned>::max();
252
253 // There is no direct copy between AGPRs.
254 if (Dst.getID() == AMDGPU::AGPRRegBankID &&
255 Src.getID() == AMDGPU::AGPRRegBankID)
256 return 4;
257
258 return RegisterBankInfo::copyCost(Dst, Src, Size);
259 }
260
261 unsigned AMDGPURegisterBankInfo::getBreakDownCost(
262 const ValueMapping &ValMapping,
263 const RegisterBank *CurBank) const {
264 // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
265 // VGPR.
266 // FIXME: Is there a better way to do this?
267 if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
268 return 10; // This is expensive.
269
270 assert(ValMapping.NumBreakDowns == 2 &&
271 ValMapping.BreakDown[0].Length == 32 &&
272 ValMapping.BreakDown[0].StartIdx == 0 &&
273 ValMapping.BreakDown[1].Length == 32 &&
274 ValMapping.BreakDown[1].StartIdx == 32 &&
275 ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);
276
277 // 32-bit extract of a 64-bit value is just access of a subregister, so free.
278 // TODO: Cost of 0 hits assert, though it's not clear it's what we really
279 // want.
280
281 // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
282 // alignment restrictions, but this probably isn't important.
283 return 1;
284 }
285
286 const RegisterBank &
287 AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
288 LLT Ty) const {
289 if (&RC == &AMDGPU::SReg_1RegClass)
290 return AMDGPU::VCCRegBank;
291
292 // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a
293 // VCC-like use.
294 if (TRI->isSGPRClass(&RC)) {
295 // FIXME: This probably came from a copy from a physical register, which
296 // should be inferable from the copied to-type. We don't have many boolean
297 // physical register constraints so just assume a normal SGPR for now.
298 if (!Ty.isValid())
299 return AMDGPU::SGPRRegBank;
300
301 return Ty == LLT::scalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
302 }
303
304 return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank;
305 }
306
307 template <unsigned NumOps>
308 RegisterBankInfo::InstructionMappings
309 AMDGPURegisterBankInfo::addMappingFromTable(
310 const MachineInstr &MI, const MachineRegisterInfo &MRI,
311 const std::array<unsigned, NumOps> RegSrcOpIdx,
312 ArrayRef<OpRegBankEntry<NumOps>> Table) const {
313
314 InstructionMappings AltMappings;
315
316 SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());
317
318 unsigned Sizes[NumOps];
319 for (unsigned I = 0; I < NumOps; ++I) {
320 Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
321 Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
322 }
323
324 for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
325 unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
326 Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
327 }
328
329 // getInstrMapping's default mapping uses ID 1, so start at 2.
330 unsigned MappingID = 2;
331 for (const auto &Entry : Table) {
332 for (unsigned I = 0; I < NumOps; ++I) {
333 int OpIdx = RegSrcOpIdx[I];
334 Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
335 }
336
337 AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
338 getOperandsMapping(Operands),
339 Operands.size()));
340 }
341
342 return AltMappings;
343 }
344
345 RegisterBankInfo::InstructionMappings
346 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
347 const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
348 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
349 case Intrinsic::amdgcn_readlane: {
350 static const OpRegBankEntry<3> Table[2] = {
351 // Perfectly legal.
352 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
353
354 // Need a readfirstlane for the index.
355 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
356 };
357
358 const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
359 return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, Table);
360 }
361 case Intrinsic::amdgcn_writelane: {
362 static const OpRegBankEntry<4> Table[4] = {
363 // Perfectly legal.
364 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
365
366 // Need readfirstlane of first op
367 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
368
369 // Need readfirstlane of second op
370 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
371
372 // Need readfirstlane of both ops
373 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
374 };
375
376 // dst, value, lane, vdst_in
377 const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
378 return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, Table);
379 }
380 default:
381 return RegisterBankInfo::getInstrAlternativeMappings(MI);
382 }
383 }
384
385 RegisterBankInfo::InstructionMappings
386 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
387 const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
388
389 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
390 case Intrinsic::amdgcn_s_buffer_load: {
391 static const OpRegBankEntry<2> Table[4] = {
392 // Perfectly legal.
393 { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
394
395 // Only need 1 register in loop
396 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },
397
398 // Have to waterfall the resource.
399 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },
400
401 // Have to waterfall the resource, and the offset.
402 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }
403 };
404
405 // rsrc, offset
406 const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
407 return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, Table);
408 }
409 case Intrinsic::amdgcn_ds_ordered_add:
410 case Intrinsic::amdgcn_ds_ordered_swap: {
411 // VGPR = M0, VGPR
412 static const OpRegBankEntry<3> Table[2] = {
413 // Perfectly legal.
414 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
415
416 // Need a readfirstlane for m0
417 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
418 };
419
420 const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
421 return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, Table);
422 }
423 case Intrinsic::amdgcn_s_sendmsg:
424 case Intrinsic::amdgcn_s_sendmsghalt: {
425 // FIXME: Should have no register for immediate
426 static const OpRegBankEntry<1> Table[2] = {
427 // Perfectly legal.
428 { { AMDGPU::SGPRRegBankID }, 1 },
429
430 // Need readlane
431 { { AMDGPU::VGPRRegBankID }, 3 }
432 };
433
434 const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
435 return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, Table);
436 }
437 default:
438 return RegisterBankInfo::getInstrAlternativeMappings(MI);
439 }
440 }
441
442 // FIXME: Returns uniform if there's no source value information. This is
443 // probably wrong.
444 bool AMDGPURegisterBankInfo::isScalarLoadLegal(const MachineInstr &MI) const {
445 if (!MI.hasOneMemOperand())
446 return false;
447
448 const MachineMemOperand *MMO = *MI.memoperands_begin();
449 const unsigned AS = MMO->getAddrSpace();
450 const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||
451 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
452 const unsigned MemSize = 8 * MMO->getSize().getValue();
453
454 // Require 4-byte alignment.
455 return (MMO->getAlign() >= Align(4) ||
456 (Subtarget.hasScalarSubwordLoads() &&
457 ((MemSize == 16 && MMO->getAlign() >= Align(2)) ||
458 (MemSize == 8 && MMO->getAlign() >= Align(1))))) &&
459 // Can't do a scalar atomic load.
460 !MMO->isAtomic() &&
461 // Don't use scalar loads for volatile accesses to non-constant address
462 // spaces.
463 (IsConst || !MMO->isVolatile()) &&
464 // Memory must be known constant, or not written before this load.
465 (IsConst || MMO->isInvariant() || (MMO->getFlags() & MONoClobber)) &&
466 AMDGPUInstrInfo::isUniformMMO(MMO);
467 }
468
469 RegisterBankInfo::InstructionMappings
470 AMDGPURegisterBankInfo::getInstrAlternativeMappings(
471 const MachineInstr &MI) const {
472
473 const MachineFunction &MF = *MI.getParent()->getParent();
474 const MachineRegisterInfo &MRI = MF.getRegInfo();
475
476
477 InstructionMappings AltMappings;
478 switch (MI.getOpcode()) {
479 case TargetOpcode::G_CONSTANT:
480 case TargetOpcode::G_IMPLICIT_DEF: {
481 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
482 if (Size == 1) {
483 static const OpRegBankEntry<1> Table[3] = {
484 { { AMDGPU::VGPRRegBankID }, 1 },
485 { { AMDGPU::SGPRRegBankID }, 1 },
486 { { AMDGPU::VCCRegBankID }, 1 }
487 };
488
489 return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
490 }
491
492 [[fallthrough]];
493 }
494 case TargetOpcode::G_FCONSTANT:
495 case TargetOpcode::G_FRAME_INDEX:
496 case TargetOpcode::G_GLOBAL_VALUE: {
497 static const OpRegBankEntry<1> Table[2] = {
498 { { AMDGPU::VGPRRegBankID }, 1 },
499 { { AMDGPU::SGPRRegBankID }, 1 }
500 };
501
502 return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
503 }
504 case TargetOpcode::G_AND:
505 case TargetOpcode::G_OR:
506 case TargetOpcode::G_XOR: {
507 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
508
509 if (Size == 1) {
510 // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
511 const InstructionMapping &SCCMapping = getInstructionMapping(
512 1, 1, getOperandsMapping(
513 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
514 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
515 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32)}),
516 3); // Num Operands
517 AltMappings.push_back(&SCCMapping);
518
519 const InstructionMapping &VCCMapping0 = getInstructionMapping(
520 2, 1, getOperandsMapping(
521 {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
522 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
523 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
524 3); // Num Operands
525 AltMappings.push_back(&VCCMapping0);
526 return AltMappings;
527 }
528
529 if (Size != 64)
530 break;
531
532 const InstructionMapping &SSMapping = getInstructionMapping(
533 1, 1, getOperandsMapping(
534 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
535 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
536 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
537 3); // Num Operands
538 AltMappings.push_back(&SSMapping);
539
540 const InstructionMapping &VVMapping = getInstructionMapping(
541 2, 2, getOperandsMapping(
542 {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
543 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
544 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
545 3); // Num Operands
546 AltMappings.push_back(&VVMapping);
547 break;
548 }
549 case TargetOpcode::G_LOAD:
550 case TargetOpcode::G_ZEXTLOAD:
551 case TargetOpcode::G_SEXTLOAD: {
552 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
553 LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
554 unsigned PtrSize = PtrTy.getSizeInBits();
555 unsigned AS = PtrTy.getAddressSpace();
556
557 if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
558 AS != AMDGPUAS::PRIVATE_ADDRESS) &&
559 isScalarLoadLegal(MI)) {
560 const InstructionMapping &SSMapping = getInstructionMapping(
561 1, 1, getOperandsMapping(
562 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
563 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}),
564 2); // Num Operands
565 AltMappings.push_back(&SSMapping);
566 }
567
568 const InstructionMapping &VVMapping = getInstructionMapping(
569 2, 1,
570 getOperandsMapping(
571 {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
572 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),
573 2); // Num Operands
574 AltMappings.push_back(&VVMapping);
575
576 // It may be possible to have a vgpr = load sgpr mapping here, because
577 // the mubuf instructions support this kind of load, but probably for only
578 // gfx7 and older. However, the addressing mode matching in the instruction
579 // selector should be able to do a better job of detecting and selecting
580 // these kinds of loads from the vgpr = load vgpr mapping.
581
582 return AltMappings;
583
584 }
585 case TargetOpcode::G_SELECT: {
586 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
587 const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
588 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
589 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
590 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
591 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
592 4); // Num Operands
593 AltMappings.push_back(&SSMapping);
594
595 const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
596 getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
597 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
598 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
599 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
600 4); // Num Operands
601 AltMappings.push_back(&VVMapping);
602
603 return AltMappings;
604 }
605 case TargetOpcode::G_UADDE:
606 case TargetOpcode::G_USUBE:
607 case TargetOpcode::G_SADDE:
608 case TargetOpcode::G_SSUBE: {
609 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
610 const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
611 getOperandsMapping(
612 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
613 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
614 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
615 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
616 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1)}),
617 5); // Num Operands
618 AltMappings.push_back(&SSMapping);
619
620 const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
621 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
622 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
623 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
624 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
625 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
626 5); // Num Operands
627 AltMappings.push_back(&VVMapping);
628 return AltMappings;
629 }
630 case AMDGPU::G_BRCOND: {
631 assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
632
633 // TODO: Change type to 32 for scalar
634 const InstructionMapping &SMapping = getInstructionMapping(
635 1, 1, getOperandsMapping(
636 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), nullptr}),
637 2); // Num Operands
638 AltMappings.push_back(&SMapping);
639
640 const InstructionMapping &VMapping = getInstructionMapping(
641 1, 1, getOperandsMapping(
642 {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr }),
643 2); // Num Operands
644 AltMappings.push_back(&VMapping);
645 return AltMappings;
646 }
647 case AMDGPU::G_INTRINSIC:
648 case AMDGPU::G_INTRINSIC_CONVERGENT:
649 return getInstrAlternativeMappingsIntrinsic(MI, MRI);
650 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
651 case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
652 return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
653 default:
654 break;
655 }
656 return RegisterBankInfo::getInstrAlternativeMappings(MI);
657 }
658
659 void AMDGPURegisterBankInfo::split64BitValueForMapping(
660 MachineIRBuilder &B,
661 SmallVector<Register, 2> &Regs,
662 LLT HalfTy,
663 Register Reg) const {
664 assert(HalfTy.getSizeInBits() == 32);
665 MachineRegisterInfo *MRI = B.getMRI();
666 Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
667 Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
668 const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
669 MRI->setRegBank(LoLHS, *Bank);
670 MRI->setRegBank(HiLHS, *Bank);
671
672 Regs.push_back(LoLHS);
673 Regs.push_back(HiLHS);
674
675 B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
676 .addDef(LoLHS)
677 .addDef(HiLHS)
678 .addUse(Reg);
679 }
680
681 /// Replace the current type each register in \p Regs has with \p NewTy
682 static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
683 LLT NewTy) {
684 for (Register Reg : Regs) {
685 assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
686 MRI.setType(Reg, NewTy);
687 }
688 }
689
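// Return a type with half as many bits, e.g. s64 -> s32, <4 x s16> -> <2 x s16>.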
690 static LLT getHalfSizedType(LLT Ty) {
691 if (Ty.isVector()) {
692 assert(Ty.getElementCount().isKnownMultipleOf(2));
693 return LLT::scalarOrVector(Ty.getElementCount().divideCoefficientBy(2),
694 Ty.getElementType());
695 }
696
697 assert(Ty.getScalarSizeInBits() % 2 == 0);
698 return LLT::scalar(Ty.getScalarSizeInBits() / 2);
699 }
700
701 // Build one or more V_READFIRSTLANE_B32 instructions to move the given vector
702 // source value into a scalar register.
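// For a 64-bit VGPR source this conceptually expands to (illustrative):
//   %lo, %hi = G_UNMERGE_VALUES %src       ; two vgpr(s32) halves
//   %slo = V_READFIRSTLANE_B32 %lo         ; sreg_32 result
//   %shi = V_READFIRSTLANE_B32 %hi
//   %dst = G_MERGE_VALUES %slo, %shi       ; sgpr(s64) result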
703 Register AMDGPURegisterBankInfo::buildReadFirstLane(MachineIRBuilder &B,
704 MachineRegisterInfo &MRI,
705 Register Src) const {
706 LLT Ty = MRI.getType(Src);
707 const RegisterBank *Bank = getRegBank(Src, MRI, *TRI);
708
709 if (Bank == &AMDGPU::SGPRRegBank)
710 return Src;
711
712 unsigned Bits = Ty.getSizeInBits();
713 assert(Bits % 32 == 0);
714
715 if (Bank != &AMDGPU::VGPRRegBank) {
716 // We need to copy from AGPR to VGPR
717 Src = B.buildCopy(Ty, Src).getReg(0);
718 MRI.setRegBank(Src, AMDGPU::VGPRRegBank);
719 }
720
721 LLT S32 = LLT::scalar(32);
722 unsigned NumParts = Bits / 32;
723 SmallVector<Register, 8> SrcParts;
724 SmallVector<Register, 8> DstParts;
725
726 if (Bits == 32) {
727 SrcParts.push_back(Src);
728 } else {
729 auto Unmerge = B.buildUnmerge(S32, Src);
730 for (unsigned i = 0; i < NumParts; ++i)
731 SrcParts.push_back(Unmerge.getReg(i));
732 }
733
734 for (unsigned i = 0; i < NumParts; ++i) {
735 Register SrcPart = SrcParts[i];
736 Register DstPart = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
737 MRI.setType(DstPart, NumParts == 1 ? Ty : S32);
738
739 const TargetRegisterClass *Constrained =
740 constrainGenericRegister(SrcPart, AMDGPU::VGPR_32RegClass, MRI);
741 (void)Constrained;
742 assert(Constrained && "Failed to constrain readfirstlane src reg");
743
744 B.buildInstr(AMDGPU::V_READFIRSTLANE_B32, {DstPart}, {SrcPart});
745
746 DstParts.push_back(DstPart);
747 }
748
749 if (Bits == 32)
750 return DstParts[0];
751
752 Register Dst = B.buildMergeLikeInstr(Ty, DstParts).getReg(0);
753 MRI.setRegBank(Dst, AMDGPU::SGPRRegBank);
754 return Dst;
755 }
756
757 /// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
758 /// any of the required SGPR operands are VGPRs, perform a waterfall loop to
759 /// execute the instruction for each unique combination of values in all lanes
760 /// in the wave. The block will be split such that rest of the instructions are
761 /// moved to a new block.
762 ///
763 /// Essentially performs this loop:
764 ///
765 /// Save Execution Mask
766 /// For (Lane : Wavefront) {
767 /// Enable Lane, Disable all other lanes
768 /// SGPR = read SGPR value for current lane from VGPR
769 /// VGPRResult[Lane] = use_op SGPR
770 /// }
771 /// Restore Execution Mask
772 ///
773 /// There is additional complexity to try for compare values to identify the
774 /// unique values used.
775 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
776 MachineIRBuilder &B, iterator_range<MachineBasicBlock::iterator> Range,
777 SmallSet<Register, 4> &SGPROperandRegs) const {
778 // Track use registers which have already been expanded with a readfirstlane
779 // sequence. This may have multiple uses if moving a sequence.
780 DenseMap<Register, Register> WaterfalledRegMap;
781
782 MachineBasicBlock &MBB = B.getMBB();
783 MachineFunction *MF = &B.getMF();
784
785 const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
786 const unsigned MovExecOpc =
787 Subtarget.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
788 const unsigned MovExecTermOpc =
789 Subtarget.isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
790
791 const unsigned XorTermOpc = Subtarget.isWave32() ?
792 AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
793 const unsigned AndSaveExecOpc = Subtarget.isWave32() ?
794 AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
795 const unsigned ExecReg = Subtarget.isWave32() ?
796 AMDGPU::EXEC_LO : AMDGPU::EXEC;
797
798 #ifndef NDEBUG
799 const int OrigRangeSize = std::distance(Range.begin(), Range.end());
800 #endif
801
802 MachineRegisterInfo &MRI = *B.getMRI();
803 Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
804 Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);
805
806 // Don't bother using generic instructions/registers for the exec mask.
807 B.buildInstr(TargetOpcode::IMPLICIT_DEF)
808 .addDef(InitSaveExecReg);
809
810 Register PhiExec = MRI.createVirtualRegister(WaveRC);
811 Register NewExec = MRI.createVirtualRegister(WaveRC);
812
813 // To insert the loop we need to split the block. Move everything before this
814 // point to a new block, and insert a new empty block before this instruction.
815 MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
816 MachineBasicBlock *BodyBB = MF->CreateMachineBasicBlock();
817 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
818 MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
819 MachineFunction::iterator MBBI(MBB);
820 ++MBBI;
821 MF->insert(MBBI, LoopBB);
822 MF->insert(MBBI, BodyBB);
823 MF->insert(MBBI, RestoreExecBB);
824 MF->insert(MBBI, RemainderBB);
825
826 LoopBB->addSuccessor(BodyBB);
827 BodyBB->addSuccessor(RestoreExecBB);
828 BodyBB->addSuccessor(LoopBB);
829
830 // Move the rest of the block into a new block.
831 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
832 RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());
833
834 MBB.addSuccessor(LoopBB);
835 RestoreExecBB->addSuccessor(RemainderBB);
836
837 B.setInsertPt(*LoopBB, LoopBB->end());
838
839 B.buildInstr(TargetOpcode::PHI)
840 .addDef(PhiExec)
841 .addReg(InitSaveExecReg)
842 .addMBB(&MBB)
843 .addReg(NewExec)
844 .addMBB(BodyBB);
845
846 const DebugLoc &DL = B.getDL();
847
848 MachineInstr &FirstInst = *Range.begin();
849
850 // Move the instruction into the loop body. Note we moved everything after
851 // Range.end() already into a new block, so Range.end() is no longer valid.
852 BodyBB->splice(BodyBB->end(), &MBB, Range.begin(), MBB.end());
853
854 // Figure out the iterator range after splicing the instructions.
855 MachineBasicBlock::iterator NewBegin = FirstInst.getIterator();
856 auto NewEnd = BodyBB->end();
857
858 B.setMBB(*LoopBB);
859
860 LLT S1 = LLT::scalar(1);
861 Register CondReg;
862
863 assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
864
865 for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
866 for (MachineOperand &Op : MI.all_uses()) {
867 Register OldReg = Op.getReg();
868 if (!SGPROperandRegs.count(OldReg))
869 continue;
870
871 // See if we already processed this register in another instruction in the
872 // sequence.
873 auto OldVal = WaterfalledRegMap.find(OldReg);
874 if (OldVal != WaterfalledRegMap.end()) {
875 Op.setReg(OldVal->second);
876 continue;
877 }
878
879 Register OpReg = Op.getReg();
880 LLT OpTy = MRI.getType(OpReg);
881
882 const RegisterBank *OpBank = getRegBank(OpReg, MRI, *TRI);
883 if (OpBank != &AMDGPU::VGPRRegBank) {
884 // Insert copy from AGPR to VGPR before the loop.
885 B.setMBB(MBB);
886 OpReg = B.buildCopy(OpTy, OpReg).getReg(0);
887 MRI.setRegBank(OpReg, AMDGPU::VGPRRegBank);
888 B.setMBB(*LoopBB);
889 }
890
891 Register CurrentLaneReg = buildReadFirstLane(B, MRI, OpReg);
892
893 // Build the comparison(s).
894 unsigned OpSize = OpTy.getSizeInBits();
895 bool Is64 = OpSize % 64 == 0;
896 unsigned PartSize = Is64 ? 64 : 32;
897 LLT PartTy = LLT::scalar(PartSize);
898 unsigned NumParts = OpSize / PartSize;
899 SmallVector<Register, 8> OpParts;
900 SmallVector<Register, 8> CurrentLaneParts;
901
902 if (NumParts == 1) {
903 OpParts.push_back(OpReg);
904 CurrentLaneParts.push_back(CurrentLaneReg);
905 } else {
906 auto UnmergeOp = B.buildUnmerge(PartTy, OpReg);
907 auto UnmergeCurrentLane = B.buildUnmerge(PartTy, CurrentLaneReg);
908 for (unsigned i = 0; i < NumParts; ++i) {
909 OpParts.push_back(UnmergeOp.getReg(i));
910 CurrentLaneParts.push_back(UnmergeCurrentLane.getReg(i));
911 MRI.setRegBank(OpParts[i], AMDGPU::VGPRRegBank);
912 MRI.setRegBank(CurrentLaneParts[i], AMDGPU::SGPRRegBank);
913 }
914 }
915
916 for (unsigned i = 0; i < NumParts; ++i) {
917 auto CmpReg = B.buildICmp(CmpInst::ICMP_EQ, S1, CurrentLaneParts[i],
918 OpParts[i]).getReg(0);
919 MRI.setRegBank(CmpReg, AMDGPU::VCCRegBank);
920
921 if (!CondReg) {
922 CondReg = CmpReg;
923 } else {
924 CondReg = B.buildAnd(S1, CondReg, CmpReg).getReg(0);
925 MRI.setRegBank(CondReg, AMDGPU::VCCRegBank);
926 }
927 }
928
929 Op.setReg(CurrentLaneReg);
930
931 // Make sure we don't re-process this register again.
932 WaterfalledRegMap.insert(std::pair(OldReg, Op.getReg()));
933 }
934 }
935
936 // The ballot becomes a no-op during instruction selection.
937 CondReg = B.buildIntrinsic(Intrinsic::amdgcn_ballot,
938 {LLT::scalar(Subtarget.isWave32() ? 32 : 64)})
939 .addReg(CondReg)
940 .getReg(0);
941 MRI.setRegClass(CondReg, WaveRC);
942
943 // Update EXEC, save the original EXEC value to VCC.
944 B.buildInstr(AndSaveExecOpc)
945 .addDef(NewExec)
946 .addReg(CondReg, RegState::Kill);
947
948 MRI.setSimpleHint(NewExec, CondReg);
949
950 B.setInsertPt(*BodyBB, BodyBB->end());
951
952 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
953 B.buildInstr(XorTermOpc)
954 .addDef(ExecReg)
955 .addReg(ExecReg)
956 .addReg(NewExec);
957
958 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
959 // s_cbranch_scc0?
960
961 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
962 B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);
963
964 // Save the EXEC mask before the loop.
965 BuildMI(MBB, MBB.end(), DL, TII->get(MovExecOpc), SaveExecReg)
966 .addReg(ExecReg);
967
968 // Restore the EXEC mask after the loop.
969 B.setMBB(*RestoreExecBB);
970 B.buildInstr(MovExecTermOpc)
971 .addDef(ExecReg)
972 .addReg(SaveExecReg);
973
974 // Set the insert point after the original instruction, so any new
975 // instructions will be in the remainder.
976 B.setInsertPt(*RemainderBB, RemainderBB->begin());
977
978 return true;
979 }
980
981 // Return any unique registers used by \p MI at \p OpIndices that need to be
982 // handled in a waterfall loop. Returns these registers in \p
983 // SGPROperandRegs. Returns true if there are any operands to handle and a
984 // waterfall loop is necessary.
985 bool AMDGPURegisterBankInfo::collectWaterfallOperands(
986 SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI,
987 MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const {
988 for (unsigned Op : OpIndices) {
989 assert(MI.getOperand(Op).isUse());
990 Register Reg = MI.getOperand(Op).getReg();
991 const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
992 if (OpBank->getID() != AMDGPU::SGPRRegBankID)
993 SGPROperandRegs.insert(Reg);
994 }
995
996 // No operands need to be replaced, so no need to loop.
997 return !SGPROperandRegs.empty();
998 }
999
1000 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
1001 MachineIRBuilder &B, MachineInstr &MI, ArrayRef<unsigned> OpIndices) const {
1002 // Use a set to avoid extra readfirstlanes in the case where multiple operands
1003 // are the same register.
1004 SmallSet<Register, 4> SGPROperandRegs;
1005
1006 if (!collectWaterfallOperands(SGPROperandRegs, MI, *B.getMRI(), OpIndices))
1007 return false;
1008
1009 MachineBasicBlock::iterator I = MI.getIterator();
1010 return executeInWaterfallLoop(B, make_range(I, std::next(I)),
1011 SGPROperandRegs);
1012 }
1013
1014 // Legalize an operand that must be an SGPR by inserting a readfirstlane.
1015 void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
1016 MachineIRBuilder &B, MachineInstr &MI, unsigned OpIdx) const {
1017 Register Reg = MI.getOperand(OpIdx).getReg();
1018 MachineRegisterInfo &MRI = *B.getMRI();
1019 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
1020 if (Bank == &AMDGPU::SGPRRegBank)
1021 return;
1022
1023 Reg = buildReadFirstLane(B, MRI, Reg);
1024 MI.getOperand(OpIdx).setReg(Reg);
1025 }
1026
1027 /// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the
1028 /// rest will be in the remainder.
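/// For example, splitUnequalType(s96, 64) yields {s64, s32}, and
/// splitUnequalType(<3 x s32>, 64) yields {<2 x s32>, s32}.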
1029 static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) {
1030 unsigned TotalSize = Ty.getSizeInBits();
1031 if (!Ty.isVector())
1032 return {LLT::scalar(FirstSize), LLT::scalar(TotalSize - FirstSize)};
1033
1034 LLT EltTy = Ty.getElementType();
1035 unsigned EltSize = EltTy.getSizeInBits();
1036 assert(FirstSize % EltSize == 0);
1037
1038 unsigned FirstPartNumElts = FirstSize / EltSize;
1039 unsigned RemainderElts = (TotalSize - FirstSize) / EltSize;
1040
1041 return {LLT::scalarOrVector(ElementCount::getFixed(FirstPartNumElts), EltTy),
1042 LLT::scalarOrVector(ElementCount::getFixed(RemainderElts), EltTy)};
1043 }
1044
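// Widen a 96-bit type to 128 bits, e.g. s96 -> s128, <3 x s32> -> <4 x s32>.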
1045 static LLT widen96To128(LLT Ty) {
1046 if (!Ty.isVector())
1047 return LLT::scalar(128);
1048
1049 LLT EltTy = Ty.getElementType();
1050 assert(128 % EltTy.getSizeInBits() == 0);
1051 return LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);
1052 }
1053
1054 bool AMDGPURegisterBankInfo::applyMappingLoad(
1055 MachineIRBuilder &B,
1056 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1057 MachineInstr &MI) const {
1058 MachineRegisterInfo &MRI = *B.getMRI();
1059 Register DstReg = MI.getOperand(0).getReg();
1060 const LLT LoadTy = MRI.getType(DstReg);
1061 unsigned LoadSize = LoadTy.getSizeInBits();
1062 const unsigned MaxNonSmrdLoadSize = 128;
1063
1064 const RegisterBank *DstBank =
1065 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1066 if (DstBank == &AMDGPU::SGPRRegBank) {
1067 // There are some special cases that we need to look at for 32 bit and 96
1068 // bit SGPR loads; otherwise we have nothing to do.
1069 if (LoadSize != 32 && (LoadSize != 96 || Subtarget.hasScalarDwordx3Loads()))
1070 return false;
1071
1072 MachineMemOperand *MMO = *MI.memoperands_begin();
1073 const unsigned MemSize = 8 * MMO->getSize().getValue();
1074 // Scalar loads of size 8 or 16 bit with proper alignment may be widened to
1075 // 32 bit. Check to see if we need to widen the memory access: 8 or 16 bit
1076 // scalar loads should have a load size of 32 but memory access size of less
1077 // than 32.
1078 if (LoadSize == 32 &&
1079 (MemSize == 32 || LoadTy.isVector() || !isScalarLoadLegal(MI)))
1080 return false;
1081
1082 if (LoadSize == 32 &&
1083 ((MemSize == 8 && MMO->getAlign() >= Align(1)) ||
1084 (MemSize == 16 && MMO->getAlign() >= Align(2))) &&
1085 isScalarLoadLegal(MI) &&
1086 Subtarget.getGeneration() >= AMDGPUSubtarget::GFX12)
1087 return false;
1088
1089 Register PtrReg = MI.getOperand(1).getReg();
1090
1091 ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank);
1092
1093 if (LoadSize == 32) {
1094 // This is an extending load from a sub-dword size. Widen the memory
1095 // access size to 4 bytes and clear the extra high bits appropriately
1096 const LLT S32 = LLT::scalar(32);
1097 if (MI.getOpcode() == AMDGPU::G_SEXTLOAD) {
1098 // Must extend the sign bit into higher bits for a G_SEXTLOAD
1099 auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
1100 B.buildSExtInReg(MI.getOperand(0), WideLoad, MemSize);
1101 } else if (MI.getOpcode() == AMDGPU::G_ZEXTLOAD) {
1102 // Must extend zero into higher bits with an AND for a G_ZEXTLOAD
1103 auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
1104 B.buildZExtInReg(MI.getOperand(0), WideLoad, MemSize);
1105 } else
1106 // We do not need to touch the higher bits for regular loads.
1107 B.buildLoadFromOffset(MI.getOperand(0), PtrReg, *MMO, 0);
1108 } else {
1109 // 96-bit loads are only available for vector loads. We need to split this
1110 // into a 64-bit part and a 32-bit part (unless we can widen to a 128-bit load).
1111 if (MMO->getAlign() < Align(16)) {
1112 LegalizerHelper Helper(B.getMF(), ApplyBank, B);
1113 LLT Part64, Part32;
1114 std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64);
1115 if (Helper.reduceLoadStoreWidth(cast<GAnyLoad>(MI), 0, Part64) !=
1116 LegalizerHelper::Legalized)
1117 return false;
1118 return true;
1119 }
1120 LLT WiderTy = widen96To128(LoadTy);
1121 auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0);
1122 if (WiderTy.isScalar()) {
1123 B.buildTrunc(MI.getOperand(0), WideLoad);
1124 } else {
1125 B.buildDeleteTrailingVectorElements(MI.getOperand(0).getReg(),
1126 WideLoad);
1127 }
1128 }
1129
1130 MI.eraseFromParent();
1131 return true;
1132 }
1133
1134 // 128-bit loads are supported for all instruction types.
1135 if (LoadSize <= MaxNonSmrdLoadSize)
1136 return false;
1137
1138 SmallVector<Register, 16> DefRegs(OpdMapper.getVRegs(0));
1139 SmallVector<Register, 1> SrcRegs(OpdMapper.getVRegs(1));
1140
1141 if (SrcRegs.empty())
1142 SrcRegs.push_back(MI.getOperand(1).getReg());
1143
1144 assert(LoadSize % MaxNonSmrdLoadSize == 0);
1145
1146 // RegBankSelect only emits scalar types, so we need to reset the pointer
1147 // operand to a pointer type.
1148 Register BasePtrReg = SrcRegs[0];
1149 LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
1150 MRI.setType(BasePtrReg, PtrTy);
1151
1152 unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize;
1153 const LLT LoadSplitTy = LoadTy.divide(NumSplitParts);
1154 ApplyRegBankMapping O(B, *this, MRI, &AMDGPU::VGPRRegBank);
1155 LegalizerHelper Helper(B.getMF(), O, B);
1156
1157 if (LoadTy.isVector()) {
1158 if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
1159 return false;
1160 } else {
1161 if (Helper.narrowScalar(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
1162 return false;
1163 }
1164
1165 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
1166 return true;
1167 }
1168
1169 bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc(
1170 MachineIRBuilder &B,
1171 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1172 MachineInstr &MI) const {
1173 MachineRegisterInfo &MRI = *B.getMRI();
1174 const MachineFunction &MF = B.getMF();
1175 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1176 const auto &TFI = *ST.getFrameLowering();
1177
1178 // Guard in case the stack growth direction ever changes with scratch
1179 // instructions.
1180 if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown)
1181 return false;
1182
1183 Register Dst = MI.getOperand(0).getReg();
1184 Register AllocSize = MI.getOperand(1).getReg();
1185 Align Alignment = assumeAligned(MI.getOperand(2).getImm());
1186
1187 const RegisterBank *SizeBank = getRegBank(AllocSize, MRI, *TRI);
1188
1189 // TODO: Need to emit a wave reduction to get the maximum size.
1190 if (SizeBank != &AMDGPU::SGPRRegBank)
1191 return false;
1192
1193 LLT PtrTy = MRI.getType(Dst);
1194 LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
1195
1196 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1197 Register SPReg = Info->getStackPtrOffsetReg();
1198 ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::SGPRRegBank);
1199
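// Scratch is allocated for the whole wave at once, so the per-lane allocation
// size is scaled by the wavefront size (a shift by its log2).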
1200 auto WaveSize = B.buildConstant(LLT::scalar(32), ST.getWavefrontSizeLog2());
1201 auto ScaledSize = B.buildShl(IntPtrTy, AllocSize, WaveSize);
1202
1203 auto SPCopy = B.buildCopy(PtrTy, SPReg);
1204 if (Alignment > TFI.getStackAlign()) {
1205 auto PtrAdd = B.buildPtrAdd(PtrTy, SPCopy, ScaledSize);
1206 B.buildMaskLowPtrBits(Dst, PtrAdd,
1207 Log2(Alignment) + ST.getWavefrontSizeLog2());
1208 } else {
1209 B.buildPtrAdd(Dst, SPCopy, ScaledSize);
1210 }
1211
1212 MI.eraseFromParent();
1213 return true;
1214 }
1215
1216 bool AMDGPURegisterBankInfo::applyMappingImage(
1217 MachineIRBuilder &B, MachineInstr &MI,
1218 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1219 int RsrcIdx) const {
1220 const int NumDefs = MI.getNumExplicitDefs();
1221
1222 // The reported argument index is relative to the IR intrinsic call arguments,
1223 // so we need to shift by the number of defs and the intrinsic ID.
1224 RsrcIdx += NumDefs + 1;
1225
1226 // Insert copies to VGPR arguments.
1227 applyDefaultMapping(OpdMapper);
1228
1229 // Fixup any SGPR arguments.
1230 SmallVector<unsigned, 4> SGPRIndexes;
1231 for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) {
1232 if (!MI.getOperand(I).isReg())
1233 continue;
1234
1235 // If this intrinsic has a sampler, it immediately follows rsrc.
1236 if (I == RsrcIdx || I == RsrcIdx + 1)
1237 SGPRIndexes.push_back(I);
1238 }
1239
1240 executeInWaterfallLoop(B, MI, SGPRIndexes);
1241 return true;
1242 }
1243
1244 // Analyze a combined offset from an llvm.amdgcn.s.buffer intrinsic and store
1245 // the three offsets (voffset, soffset and instoffset)
1246 unsigned AMDGPURegisterBankInfo::setBufferOffsets(
1247 MachineIRBuilder &B, Register CombinedOffset, Register &VOffsetReg,
1248 Register &SOffsetReg, int64_t &InstOffsetVal, Align Alignment) const {
1249 const LLT S32 = LLT::scalar(32);
1250 MachineRegisterInfo *MRI = B.getMRI();
1251
1252 if (std::optional<int64_t> Imm =
1253 getIConstantVRegSExtVal(CombinedOffset, *MRI)) {
1254 uint32_t SOffset, ImmOffset;
1255 if (TII->splitMUBUFOffset(*Imm, SOffset, ImmOffset, Alignment)) {
1256 VOffsetReg = B.buildConstant(S32, 0).getReg(0);
1257 SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
1258 InstOffsetVal = ImmOffset;
1259
1260 B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1261 B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1262 return SOffset + ImmOffset;
1263 }
1264 }
1265
1266 Register Base;
1267 unsigned Offset;
1268
1269 std::tie(Base, Offset) =
1270 AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset);
1271
1272 uint32_t SOffset, ImmOffset;
1273 if ((int)Offset > 0 &&
1274 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
1275 if (getRegBank(Base, *MRI, *TRI) == &AMDGPU::VGPRRegBank) {
1276 VOffsetReg = Base;
1277 SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
1278 B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1279 InstOffsetVal = ImmOffset;
1280 return 0; // XXX - Why is this 0?
1281 }
1282
1283 // If we have SGPR base, we can use it for soffset.
1284 if (SOffset == 0) {
1285 VOffsetReg = B.buildConstant(S32, 0).getReg(0);
1286 B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1287 SOffsetReg = Base;
1288 InstOffsetVal = ImmOffset;
1289 return 0; // XXX - Why is this 0?
1290 }
1291 }
1292
1293 // Handle the variable sgpr + vgpr case.
1294 MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI);
1295 if (Add && (int)Offset >= 0) {
1296 Register Src0 = getSrcRegIgnoringCopies(Add->getOperand(1).getReg(), *MRI);
1297 Register Src1 = getSrcRegIgnoringCopies(Add->getOperand(2).getReg(), *MRI);
1298
1299 const RegisterBank *Src0Bank = getRegBank(Src0, *MRI, *TRI);
1300 const RegisterBank *Src1Bank = getRegBank(Src1, *MRI, *TRI);
1301
1302 if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) {
1303 VOffsetReg = Src0;
1304 SOffsetReg = Src1;
1305 return 0;
1306 }
1307
1308 if (Src0Bank == &AMDGPU::SGPRRegBank && Src1Bank == &AMDGPU::VGPRRegBank) {
1309 VOffsetReg = Src1;
1310 SOffsetReg = Src0;
1311 return 0;
1312 }
1313 }
1314
1315 // Ensure we have a VGPR for the combined offset. This could be an issue if we
1316 // have an SGPR offset and a VGPR resource.
1317 if (getRegBank(CombinedOffset, *MRI, *TRI) == &AMDGPU::VGPRRegBank) {
1318 VOffsetReg = CombinedOffset;
1319 } else {
1320 VOffsetReg = B.buildCopy(S32, CombinedOffset).getReg(0);
1321 B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1322 }
1323
1324 SOffsetReg = B.buildConstant(S32, 0).getReg(0);
1325 B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1326 return 0;
1327 }
1328
1329 bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
1330 MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
1331 MachineInstr &MI = OpdMapper.getMI();
1332 MachineRegisterInfo &MRI = OpdMapper.getMRI();
1333
1334 const LLT S32 = LLT::scalar(32);
1335 Register Dst = MI.getOperand(0).getReg();
1336 LLT Ty = MRI.getType(Dst);
1337
1338 const RegisterBank *RSrcBank =
1339 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1340 const RegisterBank *OffsetBank =
1341 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1342 if (RSrcBank == &AMDGPU::SGPRRegBank &&
1343 OffsetBank == &AMDGPU::SGPRRegBank)
1344 return true; // Legal mapping
1345
1346 // FIXME: 96-bit case was widened during legalize. We need to narrow it back
1347 // here but don't have an MMO.
1348
1349 unsigned LoadSize = Ty.getSizeInBits();
1350 int NumLoads = 1;
1351 if (LoadSize == 256 || LoadSize == 512) {
1352 NumLoads = LoadSize / 128;
1353 Ty = Ty.divide(NumLoads);
1354 }
1355
1356 // Use the alignment to ensure that the required offsets will fit into the
1357 // immediate offsets.
1358 const Align Alignment = NumLoads > 1 ? Align(16 * NumLoads) : Align(1);
1359
1360 MachineFunction &MF = B.getMF();
1361
1362 Register SOffset;
1363 Register VOffset;
1364 int64_t ImmOffset = 0;
1365
1366 unsigned MMOOffset = setBufferOffsets(B, MI.getOperand(2).getReg(), VOffset,
1367 SOffset, ImmOffset, Alignment);
1368
1369 // TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we
1370 // can, but we need to track an MMO for that.
1371 const unsigned MemSize = (Ty.getSizeInBits() + 7) / 8;
1372 const Align MemAlign(4); // FIXME: ABI type alignment?
1373 MachineMemOperand *BaseMMO = MF.getMachineMemOperand(
1374 MachinePointerInfo(),
1375 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1376 MachineMemOperand::MOInvariant,
1377 MemSize, MemAlign);
1378 if (MMOOffset != 0)
1379 BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset, MemSize);
1380
1381 // If only the offset is divergent, emit a MUBUF buffer load instead. We can
1382 // assume that the buffer is unswizzled.
1383
1384 Register RSrc = MI.getOperand(1).getReg();
1385 Register VIndex = B.buildConstant(S32, 0).getReg(0);
1386 B.getMRI()->setRegBank(VIndex, AMDGPU::VGPRRegBank);
1387
1388 SmallVector<Register, 4> LoadParts(NumLoads);
1389
1390 MachineBasicBlock::iterator MII = MI.getIterator();
1391 MachineInstrSpan Span(MII, &B.getMBB());
1392
1393 for (int i = 0; i < NumLoads; ++i) {
1394 if (NumLoads == 1) {
1395 LoadParts[i] = Dst;
1396 } else {
1397 LoadParts[i] = MRI.createGenericVirtualRegister(Ty);
1398 MRI.setRegBank(LoadParts[i], AMDGPU::VGPRRegBank);
1399 }
1400
1401 MachineMemOperand *MMO = BaseMMO;
1402 if (i != 0)
1403 BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset + 16 * i, MemSize);
1404
1405 B.buildInstr(AMDGPU::G_AMDGPU_BUFFER_LOAD)
1406 .addDef(LoadParts[i]) // vdata
1407 .addUse(RSrc) // rsrc
1408 .addUse(VIndex) // vindex
1409 .addUse(VOffset) // voffset
1410 .addUse(SOffset) // soffset
1411 .addImm(ImmOffset + 16 * i) // offset(imm)
1412 .addImm(0) // cachepolicy, swizzled buffer(imm)
1413 .addImm(0) // idxen(imm)
1414 .addMemOperand(MMO);
1415 }
1416
1417 // TODO: If only the resource is a VGPR, it may be better to execute the
1418 // scalar load in the waterfall loop if the resource is expected to frequently
1419 // be dynamically uniform.
1420 if (RSrcBank != &AMDGPU::SGPRRegBank) {
1421 // Remove the original instruction to avoid potentially confusing the
1422 // waterfall loop logic.
1423 B.setInstr(*Span.begin());
1424 MI.eraseFromParent();
1425
1426 SmallSet<Register, 4> OpsToWaterfall;
1427
1428 OpsToWaterfall.insert(RSrc);
1429 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
1430 OpsToWaterfall);
1431 }
1432
1433 if (NumLoads != 1) {
1434 if (Ty.isVector())
1435 B.buildConcatVectors(Dst, LoadParts);
1436 else
1437 B.buildMergeLikeInstr(Dst, LoadParts);
1438 }
1439
1440 // We removed the instruction earlier with a waterfall loop.
1441 if (RSrcBank == &AMDGPU::SGPRRegBank)
1442 MI.eraseFromParent();
1443
1444 return true;
1445 }
1446
1447 bool AMDGPURegisterBankInfo::applyMappingBFE(MachineIRBuilder &B,
1448 const OperandsMapper &OpdMapper,
1449 bool Signed) const {
1450 MachineInstr &MI = OpdMapper.getMI();
1451 MachineRegisterInfo &MRI = OpdMapper.getMRI();
1452
1453 // Insert basic copies
1454 applyDefaultMapping(OpdMapper);
1455
1456 Register DstReg = MI.getOperand(0).getReg();
1457 LLT Ty = MRI.getType(DstReg);
1458
1459 const LLT S32 = LLT::scalar(32);
1460
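  // For an intrinsic source instruction, operand 1 is the intrinsic ID, so the
  // value operands start at index 2; the G_SBFX/G_UBFX form starts at index 1.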
1461 unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
1462 Register SrcReg = MI.getOperand(FirstOpnd).getReg();
1463 Register OffsetReg = MI.getOperand(FirstOpnd + 1).getReg();
1464 Register WidthReg = MI.getOperand(FirstOpnd + 2).getReg();
1465
1466 const RegisterBank *DstBank =
1467 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1468 if (DstBank == &AMDGPU::VGPRRegBank) {
1469 if (Ty == S32)
1470 return true;
1471
1472 // There are no 64-bit vgpr bitfield extract instructions, so the operation
1473 // is expanded to a sequence of instructions that implement it.
1474 ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::VGPRRegBank);
1475
1476 const LLT S64 = LLT::scalar(64);
1477 // Shift the source operand so that extracted bits start at bit 0.
1478 auto ShiftOffset = Signed ? B.buildAShr(S64, SrcReg, OffsetReg)
1479 : B.buildLShr(S64, SrcReg, OffsetReg);
1480 auto UnmergeSOffset = B.buildUnmerge({S32, S32}, ShiftOffset);
1481
1482 // A 64-bit bitfield extract uses the 32-bit bitfield extract instructions
1483 // if the width is a constant.
1484 if (auto ConstWidth = getIConstantVRegValWithLookThrough(WidthReg, MRI)) {
1485 // Use the 32-bit bitfield extract instruction if the width is a constant.
1486 // Depending on the width size, use either the low or high 32-bits.
1487 auto Zero = B.buildConstant(S32, 0);
1488 auto WidthImm = ConstWidth->Value.getZExtValue();
1489 if (WidthImm <= 32) {
1490 // Use bitfield extract on the lower 32-bit source, and then sign-extend
1491 // or clear the upper 32-bits.
1492 auto Extract =
1493 Signed ? B.buildSbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg)
1494 : B.buildUbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg);
1495 auto Extend =
1496 Signed ? B.buildAShr(S32, Extract, B.buildConstant(S32, 31)) : Zero;
1497 B.buildMergeLikeInstr(DstReg, {Extract, Extend});
1498 } else {
1499 // Use bitfield extract on upper 32-bit source, and combine with lower
1500 // 32-bit source.
1501 auto UpperWidth = B.buildConstant(S32, WidthImm - 32);
1502 auto Extract =
1503 Signed
1504 ? B.buildSbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth)
1505 : B.buildUbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth);
1506 B.buildMergeLikeInstr(DstReg, {UnmergeSOffset.getReg(0), Extract});
1507 }
1508 MI.eraseFromParent();
1509 return true;
1510 }
1511
1512 // Expand to Src >> Offset << (64 - Width) >> (64 - Width) using 64-bit
1513 // operations.
1514 auto ExtShift = B.buildSub(S32, B.buildConstant(S32, 64), WidthReg);
1515 auto SignBit = B.buildShl(S64, ShiftOffset, ExtShift);
1516 if (Signed)
1517 B.buildAShr(S64, SignBit, ExtShift);
1518 else
1519 B.buildLShr(S64, SignBit, ExtShift);
1520 MI.eraseFromParent();
1521 return true;
1522 }
1523
1524 // The scalar form packs the offset and width in a single operand.
1525
1526 ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::SGPRRegBank);
1527
1528 // Ensure the high bits are clear to insert the offset.
1529 auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));
1530 auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);
1531
1532 // Zeros out the low bits, so don't bother clamping the input value.
1533 auto ShiftWidth = B.buildShl(S32, WidthReg, B.buildConstant(S32, 16));
1534
1535 // Transformation function, pack the offset and width of a BFE into
1536 // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
1537 // source, bits [5:0] contain the offset and bits [22:16] the width.
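  // For example, an offset of 8 and a width of 16 pack to
  // (16 << 16) | 8 = 0x100008.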
1538 auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth);
1539
1540 // TODO: It might be worth using a pseudo here to avoid scc clobber and
1541 // register class constraints.
1542 unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) :
1543 (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);
1544
1545 auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs});
1546 if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
1547 llvm_unreachable("failed to constrain BFE");
1548
1549 MI.eraseFromParent();
1550 return true;
1551 }
1552
1553 bool AMDGPURegisterBankInfo::applyMappingMAD_64_32(
1554 MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
1555 MachineInstr &MI = OpdMapper.getMI();
1556 MachineRegisterInfo &MRI = OpdMapper.getMRI();
1557
1558 // Insert basic copies.
1559 applyDefaultMapping(OpdMapper);
1560
1561 Register Dst0 = MI.getOperand(0).getReg();
1562 Register Dst1 = MI.getOperand(1).getReg();
1563 Register Src0 = MI.getOperand(2).getReg();
1564 Register Src1 = MI.getOperand(3).getReg();
1565 Register Src2 = MI.getOperand(4).getReg();
1566
1567 if (MRI.getRegBankOrNull(Src0) == &AMDGPU::VGPRRegBank)
1568 return true;
1569
1570 bool IsUnsigned = MI.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
1571 LLT S1 = LLT::scalar(1);
1572 LLT S32 = LLT::scalar(32);
1573
1574 bool DstOnValu = MRI.getRegBankOrNull(Src2) == &AMDGPU::VGPRRegBank;
1575 bool Accumulate = true;
1576
1577 if (!DstOnValu) {
1578 if (mi_match(Src2, MRI, m_ZeroInt()))
1579 Accumulate = false;
1580 }
1581
1582 // Keep the multiplication on the SALU.
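  // The 32 x 32 -> 32 low multiply is always available on the SALU; only the
  // high half needs special handling when the subtarget lacks s_mul_hi.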
1583 Register DstHi;
1584 Register DstLo = B.buildMul(S32, Src0, Src1).getReg(0);
1585 bool MulHiInVgpr = false;
1586
1587 MRI.setRegBank(DstLo, AMDGPU::SGPRRegBank);
1588
1589 if (Subtarget.hasSMulHi()) {
1590 DstHi = IsUnsigned ? B.buildUMulH(S32, Src0, Src1).getReg(0)
1591 : B.buildSMulH(S32, Src0, Src1).getReg(0);
1592 MRI.setRegBank(DstHi, AMDGPU::SGPRRegBank);
1593 } else {
1594 Register VSrc0 = B.buildCopy(S32, Src0).getReg(0);
1595 Register VSrc1 = B.buildCopy(S32, Src1).getReg(0);
1596
1597 MRI.setRegBank(VSrc0, AMDGPU::VGPRRegBank);
1598 MRI.setRegBank(VSrc1, AMDGPU::VGPRRegBank);
1599
1600 DstHi = IsUnsigned ? B.buildUMulH(S32, VSrc0, VSrc1).getReg(0)
1601 : B.buildSMulH(S32, VSrc0, VSrc1).getReg(0);
1602 MRI.setRegBank(DstHi, AMDGPU::VGPRRegBank);
1603
1604 if (!DstOnValu) {
1605 DstHi = buildReadFirstLane(B, MRI, DstHi);
1606 } else {
1607 MulHiInVgpr = true;
1608 }
1609 }
1610
1611 // Accumulate and produce the "carry-out" bit.
1612 //
1613 // The "carry-out" is defined as bit 64 of the result when computed as a
1614 // big integer. For unsigned multiply-add, this matches the usual definition
1615 // of carry-out. For signed multiply-add, bit 64 is the sign bit of the
1616 // result, which is determined as:
1617 // sign(Src0 * Src1) + sign(Src2) + carry-out from unsigned 64-bit add
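  // Each of these terms is a single bit, so the sums are modulo-2 additions
  // and are implemented with XORs below.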
1618 LLT CarryType = DstOnValu ? S1 : S32;
1619 const RegisterBank &CarryBank =
1620 DstOnValu ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
1621 const RegisterBank &DstBank =
1622 DstOnValu ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank;
1623 Register Carry;
1624 Register Zero;
1625
1626 if (!IsUnsigned) {
1627 Zero = B.buildConstant(S32, 0).getReg(0);
1628 MRI.setRegBank(Zero,
1629 MulHiInVgpr ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank);
1630
1631 Carry = B.buildICmp(CmpInst::ICMP_SLT, MulHiInVgpr ? S1 : S32, DstHi, Zero)
1632 .getReg(0);
1633 MRI.setRegBank(Carry, MulHiInVgpr ? AMDGPU::VCCRegBank
1634 : AMDGPU::SGPRRegBank);
1635
1636 if (DstOnValu && !MulHiInVgpr) {
1637 Carry = B.buildTrunc(S1, Carry).getReg(0);
1638 MRI.setRegBank(Carry, AMDGPU::VCCRegBank);
1639 }
1640 }
1641
1642 if (Accumulate) {
1643 if (DstOnValu) {
1644 DstLo = B.buildCopy(S32, DstLo).getReg(0);
1645 DstHi = B.buildCopy(S32, DstHi).getReg(0);
1646 MRI.setRegBank(DstLo, AMDGPU::VGPRRegBank);
1647 MRI.setRegBank(DstHi, AMDGPU::VGPRRegBank);
1648 }
1649
1650 auto Unmerge = B.buildUnmerge(S32, Src2);
1651 Register Src2Lo = Unmerge.getReg(0);
1652 Register Src2Hi = Unmerge.getReg(1);
1653 MRI.setRegBank(Src2Lo, DstBank);
1654 MRI.setRegBank(Src2Hi, DstBank);
1655
1656 if (!IsUnsigned) {
1657 auto Src2Sign = B.buildICmp(CmpInst::ICMP_SLT, CarryType, Src2Hi, Zero);
1658 MRI.setRegBank(Src2Sign.getReg(0), CarryBank);
1659
1660 Carry = B.buildXor(CarryType, Carry, Src2Sign).getReg(0);
1661 MRI.setRegBank(Carry, CarryBank);
1662 }
1663
1664 auto AddLo = B.buildUAddo(S32, CarryType, DstLo, Src2Lo);
1665 DstLo = AddLo.getReg(0);
1666 Register CarryLo = AddLo.getReg(1);
1667 MRI.setRegBank(DstLo, DstBank);
1668 MRI.setRegBank(CarryLo, CarryBank);
1669
1670 auto AddHi = B.buildUAdde(S32, CarryType, DstHi, Src2Hi, CarryLo);
1671 DstHi = AddHi.getReg(0);
1672 MRI.setRegBank(DstHi, DstBank);
1673
1674 Register CarryHi = AddHi.getReg(1);
1675 MRI.setRegBank(CarryHi, CarryBank);
1676
1677 if (IsUnsigned) {
1678 Carry = CarryHi;
1679 } else {
1680 Carry = B.buildXor(CarryType, Carry, CarryHi).getReg(0);
1681 MRI.setRegBank(Carry, CarryBank);
1682 }
1683 } else {
1684 if (IsUnsigned) {
1685 Carry = B.buildConstant(CarryType, 0).getReg(0);
1686 MRI.setRegBank(Carry, CarryBank);
1687 }
1688 }
1689
1690 B.buildMergeLikeInstr(Dst0, {DstLo, DstHi});
1691
1692 if (DstOnValu) {
1693 B.buildCopy(Dst1, Carry);
1694 } else {
1695 B.buildTrunc(Dst1, Carry);
1696 }
1697
1698 MI.eraseFromParent();
1699 return true;
1700 }
1701
1702 // Return a suitable opcode for extending the operands of Opc when widening.
1703 static unsigned getExtendOp(unsigned Opc) {
1704 switch (Opc) {
1705 case TargetOpcode::G_ASHR:
1706 case TargetOpcode::G_SMIN:
1707 case TargetOpcode::G_SMAX:
1708 return TargetOpcode::G_SEXT;
1709 case TargetOpcode::G_LSHR:
1710 case TargetOpcode::G_UMIN:
1711 case TargetOpcode::G_UMAX:
1712 return TargetOpcode::G_ZEXT;
1713 default:
1714 return TargetOpcode::G_ANYEXT;
1715 }
1716 }
1717
1718 // Emit a legalized extension from <2 x s16> to 2 32-bit components, avoiding
1719 // any illegal vector extend or unmerge operations.
1720 static std::pair<Register, Register>
1721 unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) {
1722 const LLT S32 = LLT::scalar(32);
1723 auto Bitcast = B.buildBitcast(S32, Src);
1724
1725 if (ExtOpcode == TargetOpcode::G_SEXT) {
1726 auto ExtLo = B.buildSExtInReg(S32, Bitcast, 16);
1727 auto ShiftHi = B.buildAShr(S32, Bitcast, B.buildConstant(S32, 16));
1728 return std::pair(ExtLo.getReg(0), ShiftHi.getReg(0));
1729 }
1730
1731 auto ShiftHi = B.buildLShr(S32, Bitcast, B.buildConstant(S32, 16));
1732 if (ExtOpcode == TargetOpcode::G_ZEXT) {
1733 auto ExtLo = B.buildAnd(S32, Bitcast, B.buildConstant(S32, 0xffff));
1734 return std::pair(ExtLo.getReg(0), ShiftHi.getReg(0));
1735 }
1736
1737 assert(ExtOpcode == TargetOpcode::G_ANYEXT);
1738 return std::pair(Bitcast.getReg(0), ShiftHi.getReg(0));
1739 }
1740
1741 // For cases where only a single copy is inserted for matching register banks,
1742 // replace the register in the instruction operand.
1743 static bool substituteSimpleCopyRegs(
1744 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {
1745 SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx));
1746 if (!SrcReg.empty()) {
1747 assert(SrcReg.size() == 1);
1748 OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]);
1749 return true;
1750 }
1751
1752 return false;
1753 }
1754
1755 /// Handle register layout difference for f16 images for some subtargets.
1756 Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B,
1757 MachineRegisterInfo &MRI,
1758 Register Reg) const {
1759 if (!Subtarget.hasUnpackedD16VMem())
1760 return Reg;
1761
1762 const LLT S16 = LLT::scalar(16);
1763 LLT StoreVT = MRI.getType(Reg);
1764 if (!StoreVT.isVector() || StoreVT.getElementType() != S16)
1765 return Reg;
1766
1767 auto Unmerge = B.buildUnmerge(S16, Reg);
1768
1769
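  // With unpacked D16 memory instructions each 16-bit element occupies the low
  // half of its own 32-bit register, so widen every piece to 32 bits before
  // re-merging.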
1770 SmallVector<Register, 4> WideRegs;
1771 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
1772 WideRegs.push_back(B.buildAnyExt(LLT::scalar(32), Unmerge.getReg(I)).getReg(0));
1773
1774 const LLT S32 = LLT::scalar(32);
1775 int NumElts = StoreVT.getNumElements();
1776
1777 return B.buildMergeLikeInstr(LLT::fixed_vector(NumElts, S32), WideRegs)
1778 .getReg(0);
1779 }
1780
1781 static std::pair<Register, unsigned>
1782 getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
1783 int64_t Const;
1784 if (mi_match(Reg, MRI, m_ICst(Const)))
1785 return std::pair(Register(), Const);
1786
1787 Register Base;
1788 if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const))))
1789 return std::pair(Base, Const);
1790
1791 // TODO: Handle G_OR used for add case
1792 return std::pair(Reg, 0);
1793 }
1794
1795 std::pair<Register, unsigned>
1796 AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B,
1797 Register OrigOffset) const {
1798 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(Subtarget);
1799 Register BaseReg;
1800 unsigned ImmOffset;
1801 const LLT S32 = LLT::scalar(32);
1802
1803 // TODO: Use AMDGPU::getBaseWithConstantOffset() instead.
1804 std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(),
1805 OrigOffset);
1806
1807 unsigned C1 = 0;
1808 if (ImmOffset != 0) {
1809 // If the immediate value is too big for the immoffset field, put only bits
1810 // that would normally fit in the immoffset field. The remaining value that
1811 // is copied/added for the voffset field is a large power of 2, and it
1812 // stands more chance of being CSEd with the copy/add for another similar
1813 // load/store.
1814 // However, do not do that rounding down if that is a negative
1815 // number, as it appears to be illegal to have a negative offset in the
1816 // vgpr, even if adding the immediate offset makes it positive.
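    // For example, with a 4095 (0xFFF) immediate limit, an offset of 0x1234 is
    // split into an immediate offset of 0x234 and a 0x1000 add into the
    // voffset register.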
1817 unsigned Overflow = ImmOffset & ~MaxImm;
1818 ImmOffset -= Overflow;
1819 if ((int32_t)Overflow < 0) {
1820 Overflow += ImmOffset;
1821 ImmOffset = 0;
1822 }
1823
1824 C1 = ImmOffset;
1825 if (Overflow != 0) {
1826 if (!BaseReg)
1827 BaseReg = B.buildConstant(S32, Overflow).getReg(0);
1828 else {
1829 auto OverflowVal = B.buildConstant(S32, Overflow);
1830 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
1831 }
1832 }
1833 }
1834
1835 if (!BaseReg)
1836 BaseReg = B.buildConstant(S32, 0).getReg(0);
1837
1838 return {BaseReg, C1};
1839 }
1840
1841 bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg,
1842 Register SrcReg) const {
1843 MachineRegisterInfo &MRI = *B.getMRI();
1844 LLT SrcTy = MRI.getType(SrcReg);
1845 if (SrcTy.getSizeInBits() == 32) {
1846 // Use a v_mov_b32 here to make the exec dependency explicit.
1847 B.buildInstr(AMDGPU::V_MOV_B32_e32)
1848 .addDef(DstReg)
1849 .addUse(SrcReg);
1850 return constrainGenericRegister(DstReg, AMDGPU::VGPR_32RegClass, MRI) &&
1851 constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI);
1852 }
1853
1854 Register TmpReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1855 Register TmpReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1856
1857 B.buildInstr(AMDGPU::V_MOV_B32_e32)
1858 .addDef(TmpReg0)
1859 .addUse(SrcReg, 0, AMDGPU::sub0);
1860 B.buildInstr(AMDGPU::V_MOV_B32_e32)
1861 .addDef(TmpReg1)
1862 .addUse(SrcReg, 0, AMDGPU::sub1);
1863 B.buildInstr(AMDGPU::REG_SEQUENCE)
1864 .addDef(DstReg)
1865 .addUse(TmpReg0)
1866 .addImm(AMDGPU::sub0)
1867 .addUse(TmpReg1)
1868 .addImm(AMDGPU::sub1);
1869
1870 return constrainGenericRegister(SrcReg, AMDGPU::SReg_64RegClass, MRI) &&
1871 constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI);
1872 }
1873
1874 /// Utility function for pushing dynamic vector indexes with a constant offset
1875 /// into waterfall loops.
1876 static void reinsertVectorIndexAdd(MachineIRBuilder &B,
1877 MachineInstr &IdxUseInstr,
1878 unsigned OpIdx,
1879 unsigned ConstOffset) {
1880 MachineRegisterInfo &MRI = *B.getMRI();
1881 const LLT S32 = LLT::scalar(32);
1882 Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg();
1883 B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator());
1884
1885 auto MaterializedOffset = B.buildConstant(S32, ConstOffset);
1886
1887 auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset);
1888 MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank);
1889 MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank);
1890 IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0));
1891 }
1892
1893 /// Implement extending a 32-bit value to a 64-bit value. \p Lo32Reg is the
1894 /// original 32-bit source value (to be inserted in the low part of the combined
1895 /// 64-bit result), and \p Hi32Reg is the high half of the combined 64-bit
1896 /// value.
1897 static void extendLow32IntoHigh32(MachineIRBuilder &B,
1898 Register Hi32Reg, Register Lo32Reg,
1899 unsigned ExtOpc,
1900 const RegisterBank &RegBank,
1901 bool IsBooleanSrc = false) {
1902 if (ExtOpc == AMDGPU::G_ZEXT) {
1903 B.buildConstant(Hi32Reg, 0);
1904 } else if (ExtOpc == AMDGPU::G_SEXT) {
1905 if (IsBooleanSrc) {
1906 // If we know the original source was an s1, the high half is the same as
1907 // the low.
1908 B.buildCopy(Hi32Reg, Lo32Reg);
1909 } else {
1910 // Replicate sign bit from 32-bit extended part.
1911 auto ShiftAmt = B.buildConstant(LLT::scalar(32), 31);
1912 B.getMRI()->setRegBank(ShiftAmt.getReg(0), RegBank);
1913 B.buildAShr(Hi32Reg, Lo32Reg, ShiftAmt);
1914 }
1915 } else {
1916 assert(ExtOpc == AMDGPU::G_ANYEXT && "not an integer extension");
1917 B.buildUndef(Hi32Reg);
1918 }
1919 }
1920
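// Lower a dynamically indexed vector extract into a chain of compare and
// select operations over the unmerged elements, when
// SITargetLowering::shouldExpandVectorDynExt prefers that form.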
1921 bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect(
1922 MachineIRBuilder &B, MachineInstr &MI,
1923 const OperandsMapper &OpdMapper) const {
1924 MachineRegisterInfo &MRI = *B.getMRI();
1925
1926 Register VecReg = MI.getOperand(1).getReg();
1927 Register Idx = MI.getOperand(2).getReg();
1928
1929 const RegisterBank &IdxBank =
1930 *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1931
1932 bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
1933
1934 LLT VecTy = MRI.getType(VecReg);
1935 unsigned EltSize = VecTy.getScalarSizeInBits();
1936 unsigned NumElem = VecTy.getNumElements();
1937
1938 if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
1939 IsDivergentIdx, &Subtarget))
1940 return false;
1941
1942 LLT S32 = LLT::scalar(32);
1943
1944 const RegisterBank &DstBank =
1945 *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1946 const RegisterBank &SrcBank =
1947 *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1948
1949 const RegisterBank &CCBank =
1950 (DstBank == AMDGPU::SGPRRegBank &&
1951 SrcBank == AMDGPU::SGPRRegBank &&
1952 IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
1953 : AMDGPU::VCCRegBank;
1954 LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
1955
1956 if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
1957 Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
1958 MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
1959 }
1960
1961 LLT EltTy = VecTy.getScalarType();
1962 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
1963 unsigned NumLanes = DstRegs.size();
1964 if (!NumLanes)
1965 NumLanes = 1;
1966 else
1967 EltTy = MRI.getType(DstRegs[0]);
1968
1969 auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
1970 SmallVector<Register, 2> Res(NumLanes);
1971 for (unsigned L = 0; L < NumLanes; ++L)
1972 Res[L] = UnmergeToEltTy.getReg(L);
1973
1974 for (unsigned I = 1; I < NumElem; ++I) {
1975 auto IC = B.buildConstant(S32, I);
1976 MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
1977 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
1978 MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
1979
1980 for (unsigned L = 0; L < NumLanes; ++L) {
1981 auto S = B.buildSelect(EltTy, Cmp,
1982 UnmergeToEltTy.getReg(I * NumLanes + L), Res[L]);
1983
1984 for (unsigned N : { 0, 2, 3 })
1985 MRI.setRegBank(S->getOperand(N).getReg(), DstBank);
1986
1987 Res[L] = S->getOperand(0).getReg();
1988 }
1989 }
1990
1991 for (unsigned L = 0; L < NumLanes; ++L) {
1992 Register DstReg = (NumLanes == 1) ? MI.getOperand(0).getReg() : DstRegs[L];
1993 B.buildCopy(DstReg, Res[L]);
1994 MRI.setRegBank(DstReg, DstBank);
1995 }
1996
1997 MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
1998 MI.eraseFromParent();
1999
2000 return true;
2001 }
2002
2003 // Insert a cross regbank copy for a register if it already has a bank that
2004 // differs from the one we want to set.
2005 static Register constrainRegToBank(MachineRegisterInfo &MRI,
2006 MachineIRBuilder &B, Register &Reg,
2007 const RegisterBank &Bank) {
2008 const RegisterBank *CurrBank = MRI.getRegBankOrNull(Reg);
2009 if (CurrBank && *CurrBank != Bank) {
2010 Register Copy = B.buildCopy(MRI.getType(Reg), Reg).getReg(0);
2011 MRI.setRegBank(Copy, Bank);
2012 return Copy;
2013 }
2014
2015 MRI.setRegBank(Reg, Bank);
2016 return Reg;
2017 }
2018
2019 bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(
2020 MachineIRBuilder &B, MachineInstr &MI,
2021 const OperandsMapper &OpdMapper) const {
2022
2023 MachineRegisterInfo &MRI = *B.getMRI();
2024 Register VecReg = MI.getOperand(1).getReg();
2025 Register Idx = MI.getOperand(3).getReg();
2026
2027 const RegisterBank &IdxBank =
2028 *OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
2029
2030 bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
2031
2032 LLT VecTy = MRI.getType(VecReg);
2033 unsigned EltSize = VecTy.getScalarSizeInBits();
2034 unsigned NumElem = VecTy.getNumElements();
2035
2036 if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
2037 IsDivergentIdx, &Subtarget))
2038 return false;
2039
2040 LLT S32 = LLT::scalar(32);
2041
2042 const RegisterBank &DstBank =
2043 *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2044 const RegisterBank &SrcBank =
2045 *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2046 const RegisterBank &InsBank =
2047 *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2048
2049 const RegisterBank &CCBank =
2050 (DstBank == AMDGPU::SGPRRegBank &&
2051 SrcBank == AMDGPU::SGPRRegBank &&
2052 InsBank == AMDGPU::SGPRRegBank &&
2053 IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
2054 : AMDGPU::VCCRegBank;
2055 LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
2056
2057 if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
2058 Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
2059 MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
2060 }
2061
2062 LLT EltTy = VecTy.getScalarType();
2063 SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
2064 unsigned NumLanes = InsRegs.size();
2065 if (!NumLanes) {
2066 NumLanes = 1;
2067 InsRegs.push_back(MI.getOperand(2).getReg());
2068 } else {
2069 EltTy = MRI.getType(InsRegs[0]);
2070 }
2071
2072 auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
2073 SmallVector<Register, 16> Ops(NumElem * NumLanes);
2074
2075 for (unsigned I = 0; I < NumElem; ++I) {
2076 auto IC = B.buildConstant(S32, I);
2077 MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
2078 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
2079 MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
2080
2081 for (unsigned L = 0; L < NumLanes; ++L) {
2082 Register Op0 = constrainRegToBank(MRI, B, InsRegs[L], DstBank);
2083 Register Op1 = UnmergeToEltTy.getReg(I * NumLanes + L);
2084 Op1 = constrainRegToBank(MRI, B, Op1, DstBank);
2085
2086 Register Select = B.buildSelect(EltTy, Cmp, Op0, Op1).getReg(0);
2087 MRI.setRegBank(Select, DstBank);
2088
2089 Ops[I * NumLanes + L] = Select;
2090 }
2091 }
2092
2093 LLT MergeTy = LLT::fixed_vector(Ops.size(), EltTy);
2094 if (MergeTy == MRI.getType(MI.getOperand(0).getReg())) {
2095 B.buildBuildVector(MI.getOperand(0), Ops);
2096 } else {
2097 auto Vec = B.buildBuildVector(MergeTy, Ops);
2098 MRI.setRegBank(Vec->getOperand(0).getReg(), DstBank);
2099 B.buildBitcast(MI.getOperand(0).getReg(), Vec);
2100 }
2101
2102 MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
2103 MI.eraseFromParent();
2104
2105 return true;
2106 }
2107
2108 // Break s_mul_u64 into 32-bit vector operations.
2109 void AMDGPURegisterBankInfo::applyMappingSMULU64(
2110 MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
2111 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2112 SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
2113 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
2114
2115 // All inputs are SGPRs, nothing special to do.
2116 if (DefRegs.empty()) {
2117 assert(Src0Regs.empty() && Src1Regs.empty());
2118 applyDefaultMapping(OpdMapper);
2119 return;
2120 }
2121
2122 assert(DefRegs.size() == 2);
2123 assert(Src0Regs.size() == Src1Regs.size() &&
2124 (Src0Regs.empty() || Src0Regs.size() == 2));
2125
2126 MachineRegisterInfo &MRI = OpdMapper.getMRI();
2127 MachineInstr &MI = OpdMapper.getMI();
2128 Register DstReg = MI.getOperand(0).getReg();
2129 LLT HalfTy = LLT::scalar(32);
2130
2131 // Depending on where the source registers came from, the generic code may
2132 // have decided to split the inputs already or not. If not, we still need to
2133 // extract the values.
2134
2135 if (Src0Regs.empty())
2136 split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
2137 else
2138 setRegsToType(MRI, Src0Regs, HalfTy);
2139
2140 if (Src1Regs.empty())
2141 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
2142 else
2143 setRegsToType(MRI, Src1Regs, HalfTy);
2144
2145 setRegsToType(MRI, DefRegs, HalfTy);
2146
2147 // The multiplication is done as follows:
2148 //
2149 // Op1H Op1L
2150 // * Op0H Op0L
2151 // --------------------
2152 // Op1H*Op0L Op1L*Op0L
2153 // + Op1H*Op0H Op1L*Op0H
2154 // -----------------------------------------
2155 // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L
2156 //
2157 // We drop Op1H*Op0H because the result of the multiplication is a 64-bit
2158 // value and that would overflow.
2159 // The low 32-bit value is Op1L*Op0L.
2160 // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from
2161 // Op1L*Op0L).
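  // (Op1H*Op0H would only contribute to bits [127:64], which are discarded in
  // a 64-bit result.)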
2162
2163 ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::VGPRRegBank);
2164
2165 Register Hi = B.buildUMulH(HalfTy, Src0Regs[0], Src1Regs[0]).getReg(0);
2166 Register MulLoHi = B.buildMul(HalfTy, Src0Regs[0], Src1Regs[1]).getReg(0);
2167 Register Add = B.buildAdd(HalfTy, Hi, MulLoHi).getReg(0);
2168 Register MulHiLo = B.buildMul(HalfTy, Src0Regs[1], Src1Regs[0]).getReg(0);
2169 B.buildAdd(DefRegs[1], Add, MulHiLo);
2170 B.buildMul(DefRegs[0], Src0Regs[0], Src1Regs[0]);
2171
2172 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2173 MI.eraseFromParent();
2174 }
2175
2176 void AMDGPURegisterBankInfo::applyMappingImpl(
2177 MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
2178 MachineInstr &MI = OpdMapper.getMI();
2179 B.setInstrAndDebugLoc(MI);
2180 unsigned Opc = MI.getOpcode();
2181 MachineRegisterInfo &MRI = OpdMapper.getMRI();
2182 switch (Opc) {
2183 case AMDGPU::G_CONSTANT:
2184 case AMDGPU::G_IMPLICIT_DEF: {
2185 Register DstReg = MI.getOperand(0).getReg();
2186 LLT DstTy = MRI.getType(DstReg);
2187 if (DstTy != LLT::scalar(1))
2188 break;
2189
2190 const RegisterBank *DstBank =
2191 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2192 if (DstBank == &AMDGPU::VCCRegBank)
2193 break;
2194 SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0));
2195 if (DefRegs.empty())
2196 DefRegs.push_back(DstReg);
2197
2198 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
2199
2200 Register NewDstReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
2201 LLVMContext &Ctx = B.getMF().getFunction().getContext();
2202
2203 MI.getOperand(0).setReg(NewDstReg);
2204 if (Opc != AMDGPU::G_IMPLICIT_DEF) {
2205 uint64_t ConstVal = MI.getOperand(1).getCImm()->getZExtValue();
2206 MI.getOperand(1).setCImm(
2207 ConstantInt::get(IntegerType::getInt32Ty(Ctx), ConstVal));
2208 }
2209
2210 MRI.setRegBank(NewDstReg, *DstBank);
2211 B.buildTrunc(DefRegs[0], NewDstReg);
2212 return;
2213 }
2214 case AMDGPU::G_PHI: {
2215 Register DstReg = MI.getOperand(0).getReg();
2216 LLT DstTy = MRI.getType(DstReg);
2217 if (DstTy != LLT::scalar(1))
2218 break;
2219
2220 const LLT S32 = LLT::scalar(32);
2221 const RegisterBank *DstBank =
2222 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2223 if (DstBank == &AMDGPU::VCCRegBank) {
2224 applyDefaultMapping(OpdMapper);
2225 // The standard handling only considers the result register bank for
2226 // phis. For VCC, blindly inserting a copy when the phi is lowered will
2227 // produce an invalid copy. We can only copy with some kind of compare to
2228 // get a vector boolean result. Insert a register bank copy that will be
2229 // correctly lowered to a compare.
2230 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
2231 Register SrcReg = MI.getOperand(I).getReg();
2232 const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
2233
2234 if (SrcBank != &AMDGPU::VCCRegBank) {
2235 MachineBasicBlock *SrcMBB = MI.getOperand(I + 1).getMBB();
2236 B.setInsertPt(*SrcMBB, SrcMBB->getFirstTerminator());
2237
2238 auto Copy = B.buildCopy(LLT::scalar(1), SrcReg);
2239 MRI.setRegBank(Copy.getReg(0), AMDGPU::VCCRegBank);
2240 MI.getOperand(I).setReg(Copy.getReg(0));
2241 }
2242 }
2243
2244 return;
2245 }
2246
2247 // Phi handling is strange and only considers the bank of the destination.
2248 substituteSimpleCopyRegs(OpdMapper, 0);
2249
2250 // Promote SGPR/VGPR booleans to s32
2251 ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank);
2252 B.setInsertPt(B.getMBB(), MI);
2253 LegalizerHelper Helper(B.getMF(), ApplyBank, B);
2254
2255 if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
2256 llvm_unreachable("widen scalar should have succeeded");
2257
2258 return;
2259 }
2260 case AMDGPU::G_FCMP:
2261 if (!Subtarget.hasSALUFloatInsts())
2262 break;
2263 [[fallthrough]];
2264 case AMDGPU::G_ICMP:
2265 case AMDGPU::G_UADDO:
2266 case AMDGPU::G_USUBO:
2267 case AMDGPU::G_UADDE:
2268 case AMDGPU::G_SADDE:
2269 case AMDGPU::G_USUBE:
2270 case AMDGPU::G_SSUBE: {
2271 unsigned BoolDstOp =
2272 (Opc == AMDGPU::G_ICMP || Opc == AMDGPU::G_FCMP) ? 0 : 1;
2273 Register DstReg = MI.getOperand(BoolDstOp).getReg();
2274
2275 const RegisterBank *DstBank =
2276 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2277 if (DstBank != &AMDGPU::SGPRRegBank)
2278 break;
2279
2280 const bool HasCarryIn = MI.getNumOperands() == 5;
2281
2282 // If this is a scalar compare, promote the result to s32, as the selection
2283 // will end up using a copy to a 32-bit vreg.
2284 const LLT S32 = LLT::scalar(32);
2285 Register NewDstReg = MRI.createGenericVirtualRegister(S32);
2286 MRI.setRegBank(NewDstReg, AMDGPU::SGPRRegBank);
2287 MI.getOperand(BoolDstOp).setReg(NewDstReg);
2288
2289 if (HasCarryIn) {
2290 Register NewSrcReg = MRI.createGenericVirtualRegister(S32);
2291 MRI.setRegBank(NewSrcReg, AMDGPU::SGPRRegBank);
2292 B.buildZExt(NewSrcReg, MI.getOperand(4).getReg());
2293 MI.getOperand(4).setReg(NewSrcReg);
2294 }
2295
2296 MachineBasicBlock *MBB = MI.getParent();
2297 B.setInsertPt(*MBB, std::next(MI.getIterator()));
2298
2299 // If we had a constrained VCC result register, a copy was inserted to VCC
2300 // from SGPR.
2301 SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0));
2302 if (DefRegs.empty())
2303 DefRegs.push_back(DstReg);
2304 B.buildTrunc(DefRegs[0], NewDstReg);
2305 return;
2306 }
2307 case AMDGPU::G_SELECT: {
2308 Register DstReg = MI.getOperand(0).getReg();
2309 LLT DstTy = MRI.getType(DstReg);
2310
2311 SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(1));
2312 if (CondRegs.empty())
2313 CondRegs.push_back(MI.getOperand(1).getReg());
2314 else {
2315 assert(CondRegs.size() == 1);
2316 }
2317
2318 const RegisterBank *CondBank = getRegBank(CondRegs[0], MRI, *TRI);
2319 if (CondBank == &AMDGPU::SGPRRegBank) {
2320 const LLT S32 = LLT::scalar(32);
2321 Register NewCondReg = MRI.createGenericVirtualRegister(S32);
2322 MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
2323
2324 MI.getOperand(1).setReg(NewCondReg);
2325 B.buildZExt(NewCondReg, CondRegs[0]);
2326 }
2327
2328 if (DstTy.getSizeInBits() != 64)
2329 break;
2330
2331 LLT HalfTy = getHalfSizedType(DstTy);
2332
2333 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2334 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
2335 SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3));
2336
2337 // All inputs are SGPRs, nothing special to do.
2338 if (DefRegs.empty()) {
2339 assert(Src1Regs.empty() && Src2Regs.empty());
2340 break;
2341 }
2342
2343 if (Src1Regs.empty())
2344 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
2345 else {
2346 setRegsToType(MRI, Src1Regs, HalfTy);
2347 }
2348
2349 if (Src2Regs.empty())
2350 split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg());
2351 else
2352 setRegsToType(MRI, Src2Regs, HalfTy);
2353
2354 setRegsToType(MRI, DefRegs, HalfTy);
2355
2356 B.buildSelect(DefRegs[0], CondRegs[0], Src1Regs[0], Src2Regs[0]);
2357 B.buildSelect(DefRegs[1], CondRegs[0], Src1Regs[1], Src2Regs[1]);
2358
2359 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2360 MI.eraseFromParent();
2361 return;
2362 }
2363 case AMDGPU::G_BRCOND: {
2364 Register CondReg = MI.getOperand(0).getReg();
2365 // FIXME: Should use legalizer helper, but should change bool ext type.
2366 const RegisterBank *CondBank =
2367 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2368
2369 if (CondBank == &AMDGPU::SGPRRegBank) {
2370 const LLT S32 = LLT::scalar(32);
2371 Register NewCondReg = MRI.createGenericVirtualRegister(S32);
2372 MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
2373
2374 MI.getOperand(0).setReg(NewCondReg);
2375 B.buildZExt(NewCondReg, CondReg);
2376 return;
2377 }
2378
2379 break;
2380 }
2381 case AMDGPU::G_AND:
2382 case AMDGPU::G_OR:
2383 case AMDGPU::G_XOR: {
2384 // 64-bit and/or/xor are only available on the SALU, so split into 2 32-bit
2385 // ops if there is a VGPR input.
2386 Register DstReg = MI.getOperand(0).getReg();
2387 LLT DstTy = MRI.getType(DstReg);
2388
2389 if (DstTy.getSizeInBits() == 1) {
2390 const RegisterBank *DstBank =
2391 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2392 if (DstBank == &AMDGPU::VCCRegBank)
2393 break;
2394
2395 MachineFunction *MF = MI.getParent()->getParent();
2396 ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank);
2397 LegalizerHelper Helper(*MF, ApplyBank, B);
2398
2399 if (Helper.widenScalar(MI, 0, LLT::scalar(32)) !=
2400 LegalizerHelper::Legalized)
2401 llvm_unreachable("widen scalar should have succeeded");
2402 return;
2403 }
2404
2405 if (DstTy.getSizeInBits() != 64)
2406 break;
2407
2408 LLT HalfTy = getHalfSizedType(DstTy);
2409 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2410 SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
2411 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
2412
2413 // All inputs are SGPRs, nothing special to do.
2414 if (DefRegs.empty()) {
2415 assert(Src0Regs.empty() && Src1Regs.empty());
2416 break;
2417 }
2418
2419 assert(DefRegs.size() == 2);
2420 assert(Src0Regs.size() == Src1Regs.size() &&
2421 (Src0Regs.empty() || Src0Regs.size() == 2));
2422
2423 // Depending on where the source registers came from, the generic code may
2424 // have decided to split the inputs already or not. If not, we still need to
2425 // extract the values.
2426
2427 if (Src0Regs.empty())
2428 split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
2429 else
2430 setRegsToType(MRI, Src0Regs, HalfTy);
2431
2432 if (Src1Regs.empty())
2433 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
2434 else
2435 setRegsToType(MRI, Src1Regs, HalfTy);
2436
2437 setRegsToType(MRI, DefRegs, HalfTy);
2438
2439 B.buildInstr(Opc, {DefRegs[0]}, {Src0Regs[0], Src1Regs[0]});
2440 B.buildInstr(Opc, {DefRegs[1]}, {Src0Regs[1], Src1Regs[1]});
2441
2442 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2443 MI.eraseFromParent();
2444 return;
2445 }
2446 case AMDGPU::G_ABS: {
2447 Register SrcReg = MI.getOperand(1).getReg();
2448 const RegisterBank *SrcBank = MRI.getRegBankOrNull(SrcReg);
2449
2450 // There is no VALU abs instruction so we need to replace it with a sub and
2451 // max combination.
2452 if (SrcBank && SrcBank == &AMDGPU::VGPRRegBank) {
2453 MachineFunction *MF = MI.getParent()->getParent();
2454 ApplyRegBankMapping Apply(B, *this, MRI, &AMDGPU::VGPRRegBank);
2455 LegalizerHelper Helper(*MF, Apply, B);
2456
2457 if (Helper.lowerAbsToMaxNeg(MI) != LegalizerHelper::Legalized)
2458 llvm_unreachable("lowerAbsToMaxNeg should have succeeded");
2459 return;
2460 }
2461 [[fallthrough]];
2462 }
2463 case AMDGPU::G_ADD:
2464 case AMDGPU::G_SUB:
2465 case AMDGPU::G_MUL:
2466 case AMDGPU::G_SHL:
2467 case AMDGPU::G_LSHR:
2468 case AMDGPU::G_ASHR:
2469 case AMDGPU::G_SMIN:
2470 case AMDGPU::G_SMAX:
2471 case AMDGPU::G_UMIN:
2472 case AMDGPU::G_UMAX: {
2473 Register DstReg = MI.getOperand(0).getReg();
2474 LLT DstTy = MRI.getType(DstReg);
2475
2476 // Special case for s_mul_u64. There is not a vector equivalent of
2477 // s_mul_u64. Hence, we have to break down s_mul_u64 into 32-bit vector
2478 // multiplications.
2479 if (Opc == AMDGPU::G_MUL && DstTy.getSizeInBits() == 64) {
2480 applyMappingSMULU64(B, OpdMapper);
2481 return;
2482 }
2483
2484 // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
2485 // Packed 16-bit operations need to be scalarized and promoted.
2486 if (DstTy != LLT::scalar(16) && DstTy != LLT::fixed_vector(2, 16))
2487 break;
2488
2489 const RegisterBank *DstBank =
2490 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2491 if (DstBank == &AMDGPU::VGPRRegBank)
2492 break;
2493
2494 const LLT S32 = LLT::scalar(32);
2495 MachineBasicBlock *MBB = MI.getParent();
2496 MachineFunction *MF = MBB->getParent();
2497 ApplyRegBankMapping ApplySALU(B, *this, MRI, &AMDGPU::SGPRRegBank);
2498
2499 if (DstTy.isVector() && Opc == AMDGPU::G_ABS) {
2500 Register WideSrcLo, WideSrcHi;
2501
2502 std::tie(WideSrcLo, WideSrcHi) =
2503 unpackV2S16ToS32(B, MI.getOperand(1).getReg(), TargetOpcode::G_SEXT);
2504 auto Lo = B.buildInstr(AMDGPU::G_ABS, {S32}, {WideSrcLo});
2505 auto Hi = B.buildInstr(AMDGPU::G_ABS, {S32}, {WideSrcHi});
2506 B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)});
2507 MI.eraseFromParent();
2508 return;
2509 }
2510
2511 if (DstTy.isVector()) {
2512 Register WideSrc0Lo, WideSrc0Hi;
2513 Register WideSrc1Lo, WideSrc1Hi;
2514
2515 unsigned ExtendOp = getExtendOp(MI.getOpcode());
2516 std::tie(WideSrc0Lo, WideSrc0Hi)
2517 = unpackV2S16ToS32(B, MI.getOperand(1).getReg(), ExtendOp);
2518 std::tie(WideSrc1Lo, WideSrc1Hi)
2519 = unpackV2S16ToS32(B, MI.getOperand(2).getReg(), ExtendOp);
2520 auto Lo = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Lo, WideSrc1Lo});
2521 auto Hi = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Hi, WideSrc1Hi});
2522 B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)});
2523 MI.eraseFromParent();
2524 } else {
2525 LegalizerHelper Helper(*MF, ApplySALU, B);
2526
2527 if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
2528 llvm_unreachable("widen scalar should have succeeded");
2529
2530 // FIXME: s16 shift amounts should be legal.
2531 if (Opc == AMDGPU::G_SHL || Opc == AMDGPU::G_LSHR ||
2532 Opc == AMDGPU::G_ASHR) {
2533 B.setInsertPt(*MBB, MI.getIterator());
2534 if (Helper.widenScalar(MI, 1, S32) != LegalizerHelper::Legalized)
2535 llvm_unreachable("widen scalar should have succeeded");
2536 }
2537 }
2538
2539 return;
2540 }
2541 case AMDGPU::G_AMDGPU_S_MUL_I64_I32:
2542 case AMDGPU::G_AMDGPU_S_MUL_U64_U32: {
2543 // This is a special case for s_mul_u64. We use
2544 // G_AMDGPU_S_MUL_I64_I32 opcode to represent an s_mul_u64 operation
2545 // where the 33 higher bits are sign-extended and
2546 // G_AMDGPU_S_MUL_U64_U32 opcode to represent an s_mul_u64 operation
2547 // where the 32 higher bits are zero-extended. In case scalar registers are
2548 // selected, both opcodes are lowered as s_mul_u64. If the vector registers
2549 // are selected, then G_AMDGPU_S_MUL_I64_I32 and
2550 // G_AMDGPU_S_MUL_U64_U32 are lowered with a vector mad instruction.
2551
2552 // Insert basic copies.
2553 applyDefaultMapping(OpdMapper);
2554
2555 Register DstReg = MI.getOperand(0).getReg();
2556 Register SrcReg0 = MI.getOperand(1).getReg();
2557 Register SrcReg1 = MI.getOperand(2).getReg();
2558 const LLT S32 = LLT::scalar(32);
2559 const LLT S64 = LLT::scalar(64);
2560 assert(MRI.getType(DstReg) == S64 && "This is a special case for s_mul_u64 "
2561 "that handles only 64-bit operands.");
2562 const RegisterBank *DstBank =
2563 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2564
2565 // Replace G_AMDGPU_S_MUL_I64_I32 and G_AMDGPU_S_MUL_U64_U32
2566 // with s_mul_u64 operation.
2567 if (DstBank == &AMDGPU::SGPRRegBank) {
2568 MI.setDesc(TII->get(AMDGPU::S_MUL_U64));
2569 MRI.setRegClass(DstReg, &AMDGPU::SGPR_64RegClass);
2570 MRI.setRegClass(SrcReg0, &AMDGPU::SGPR_64RegClass);
2571 MRI.setRegClass(SrcReg1, &AMDGPU::SGPR_64RegClass);
2572 return;
2573 }
2574
2575 // Replace G_AMDGPU_S_MUL_I64_I32 and G_AMDGPU_S_MUL_U64_U32
2576 // with a vector mad.
2577 assert(MRI.getRegBankOrNull(DstReg) == &AMDGPU::VGPRRegBank &&
2578 "The destination operand should be in vector registers.");
2579
2580 DebugLoc DL = MI.getDebugLoc();
2581
2582 // Extract the lower subregister from the first operand.
2583 Register Op0L = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2584 MRI.setRegClass(Op0L, &AMDGPU::VGPR_32RegClass);
2585 MRI.setType(Op0L, S32);
2586 B.buildTrunc(Op0L, SrcReg0);
2587
2588 // Extract the lower subregister from the second operand.
2589 Register Op1L = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2590 MRI.setRegClass(Op1L, &AMDGPU::VGPR_32RegClass);
2591 MRI.setType(Op1L, S32);
2592 B.buildTrunc(Op1L, SrcReg1);
2593
2594 unsigned NewOpc = Opc == AMDGPU::G_AMDGPU_S_MUL_U64_U32
2595 ? AMDGPU::G_AMDGPU_MAD_U64_U32
2596 : AMDGPU::G_AMDGPU_MAD_I64_I32;
2597
2598 MachineIRBuilder B(MI);
2599 Register Zero64 = B.buildConstant(S64, 0).getReg(0);
2600 MRI.setRegClass(Zero64, &AMDGPU::VReg_64RegClass);
2601 Register CarryOut = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
2602 MRI.setRegClass(CarryOut, &AMDGPU::VReg_64RegClass);
2603 B.buildInstr(NewOpc, {DstReg, CarryOut}, {Op0L, Op1L, Zero64});
2604 MI.eraseFromParent();
2605 return;
2606 }
2607 case AMDGPU::G_SEXT_INREG: {
2608 SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
2609 if (SrcRegs.empty())
2610 break; // Nothing to repair
2611
2612 const LLT S32 = LLT::scalar(32);
2613 ApplyRegBankMapping O(B, *this, MRI, &AMDGPU::VGPRRegBank);
2614
2615 // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs
2616 // we would need to further expand, and doesn't let us directly set the
2617 // result registers.
2618 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
2619
2620 int Amt = MI.getOperand(2).getImm();
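    // For example, G_SEXT_INREG %x, 16 on the split halves becomes
    // lo = sext_inreg(freeze(lo), 16) and hi = ashr(lo, 31).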
2621 if (Amt <= 32) {
2622 // Downstream users have expectations for the high bit behavior, so freeze
2623 // incoming undefined bits.
2624 if (Amt == 32) {
2625 // The low bits are unchanged.
2626 B.buildFreeze(DstRegs[0], SrcRegs[0]);
2627 } else {
2628 auto Freeze = B.buildFreeze(S32, SrcRegs[0]);
2629 // Extend in the low bits and propagate the sign bit to the high half.
2630 B.buildSExtInReg(DstRegs[0], Freeze, Amt);
2631 }
2632
2633 B.buildAShr(DstRegs[1], DstRegs[0], B.buildConstant(S32, 31));
2634 } else {
2635 // The low bits are unchanged; extend into the high bits.
2636 // No freeze is required.
2637 B.buildCopy(DstRegs[0], SrcRegs[0]);
2638 B.buildSExtInReg(DstRegs[1], DstRegs[0], Amt - 32);
2639 }
2640
2641 Register DstReg = MI.getOperand(0).getReg();
2642 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2643 MI.eraseFromParent();
2644 return;
2645 }
2646 case AMDGPU::G_CTPOP:
2647 case AMDGPU::G_BITREVERSE: {
2648 const RegisterBank *DstBank =
2649 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2650 if (DstBank == &AMDGPU::SGPRRegBank)
2651 break;
2652
2653 Register SrcReg = MI.getOperand(1).getReg();
2654 const LLT S32 = LLT::scalar(32);
2655 LLT Ty = MRI.getType(SrcReg);
2656 if (Ty == S32)
2657 break;
2658
2659 ApplyRegBankMapping ApplyVALU(B, *this, MRI, &AMDGPU::VGPRRegBank);
2660
2661 MachineFunction &MF = B.getMF();
2662 LegalizerHelper Helper(MF, ApplyVALU, B);
2663
2664 if (Helper.narrowScalar(MI, 1, S32) != LegalizerHelper::Legalized)
2665 llvm_unreachable("narrowScalar should have succeeded");
2666 return;
2667 }
2668 case AMDGPU::G_AMDGPU_FFBH_U32:
2669 case AMDGPU::G_AMDGPU_FFBL_B32:
2670 case AMDGPU::G_CTLZ_ZERO_UNDEF:
2671 case AMDGPU::G_CTTZ_ZERO_UNDEF: {
2672 const RegisterBank *DstBank =
2673 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2674 if (DstBank == &AMDGPU::SGPRRegBank)
2675 break;
2676
2677 Register SrcReg = MI.getOperand(1).getReg();
2678 const LLT S32 = LLT::scalar(32);
2679 LLT Ty = MRI.getType(SrcReg);
2680 if (Ty == S32)
2681 break;
2682
2683 // We can narrow this more efficiently than Helper can by using ffbh/ffbl
2684 // which return -1 when the input is zero:
2685 // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
2686 // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
2687 // (ffbh hi:lo) -> (umin (ffbh hi), (uaddsat (ffbh lo), 32))
2688 // (ffbl hi:lo) -> (umin (uaddsat (ffbh hi), 32), (ffbh lo))
2689 ApplyRegBankMapping ApplyVALU(B, *this, MRI, &AMDGPU::VGPRRegBank);
2690 SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
2691 unsigned NewOpc = Opc == AMDGPU::G_CTLZ_ZERO_UNDEF
2692 ? (unsigned)AMDGPU::G_AMDGPU_FFBH_U32
2693 : Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
2694 ? (unsigned)AMDGPU::G_AMDGPU_FFBL_B32
2695 : Opc;
2696 unsigned Idx = NewOpc == AMDGPU::G_AMDGPU_FFBH_U32;
2697 auto X = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx]});
2698 auto Y = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx ^ 1]});
2699 unsigned AddOpc =
2700 Opc == AMDGPU::G_CTLZ_ZERO_UNDEF || Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
2701 ? AMDGPU::G_ADD
2702 : AMDGPU::G_UADDSAT;
2703 Y = B.buildInstr(AddOpc, {S32}, {Y, B.buildConstant(S32, 32)});
2704 Register DstReg = MI.getOperand(0).getReg();
2705 B.buildUMin(DstReg, X, Y);
2706 MI.eraseFromParent();
2707 return;
2708 }
2709 case AMDGPU::G_SEXT:
2710 case AMDGPU::G_ZEXT:
2711 case AMDGPU::G_ANYEXT: {
2712 Register SrcReg = MI.getOperand(1).getReg();
2713 LLT SrcTy = MRI.getType(SrcReg);
2714 const bool Signed = Opc == AMDGPU::G_SEXT;
2715
2716 assert(OpdMapper.getVRegs(1).empty());
2717
2718 const RegisterBank *SrcBank =
2719 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2720
2721 Register DstReg = MI.getOperand(0).getReg();
2722 LLT DstTy = MRI.getType(DstReg);
2723 if (DstTy.isScalar() &&
2724 SrcBank != &AMDGPU::SGPRRegBank &&
2725 SrcBank != &AMDGPU::VCCRegBank &&
2726 // FIXME: Should handle any type that rounds to s64 when irregular
2727 // breakdowns are supported.
2728 DstTy.getSizeInBits() == 64 &&
2729 SrcTy.getSizeInBits() <= 32) {
2730 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2731
2732 // Extend to 32-bit, and then extend the low half.
2733 if (Signed) {
2734 // TODO: Should really be buildSExtOrCopy
2735 B.buildSExtOrTrunc(DefRegs[0], SrcReg);
2736 } else if (Opc == AMDGPU::G_ZEXT) {
2737 B.buildZExtOrTrunc(DefRegs[0], SrcReg);
2738 } else {
2739 B.buildAnyExtOrTrunc(DefRegs[0], SrcReg);
2740 }
2741
2742 extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank);
2743 MRI.setRegBank(DstReg, *SrcBank);
2744 MI.eraseFromParent();
2745 return;
2746 }
2747
2748 if (SrcTy != LLT::scalar(1))
2749 return;
2750
2751 // It is not legal to have a legalization artifact with a VCC source. Rather
2752 // than introducing a copy, insert the select we would have to select the
2753 // copy to.
2754 if (SrcBank == &AMDGPU::VCCRegBank) {
2755 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2756
2757 const RegisterBank *DstBank = &AMDGPU::VGPRRegBank;
2758
2759 unsigned DstSize = DstTy.getSizeInBits();
2760 // 64-bit select is SGPR only
2761 const bool UseSel64 = DstSize > 32 &&
2762 SrcBank->getID() == AMDGPU::SGPRRegBankID;
2763
2764 // TODO: Should s16 select be legal?
2765 LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32);
2766 auto True = B.buildConstant(SelType, Signed ? -1 : 1);
2767 auto False = B.buildConstant(SelType, 0);
2768
2769 MRI.setRegBank(True.getReg(0), *DstBank);
2770 MRI.setRegBank(False.getReg(0), *DstBank);
2771 MRI.setRegBank(DstReg, *DstBank);
2772
2773 if (DstSize > 32) {
2774 B.buildSelect(DefRegs[0], SrcReg, True, False);
2775 extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank, true);
2776 } else if (DstSize < 32) {
2777 auto Sel = B.buildSelect(SelType, SrcReg, True, False);
2778 MRI.setRegBank(Sel.getReg(0), *DstBank);
2779 B.buildTrunc(DstReg, Sel);
2780 } else {
2781 B.buildSelect(DstReg, SrcReg, True, False);
2782 }
2783
2784 MI.eraseFromParent();
2785 return;
2786 }
2787
2788 break;
2789 }
2790 case AMDGPU::G_EXTRACT_VECTOR_ELT: {
2791 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
2792
2793 assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty());
2794
2795 Register DstReg = MI.getOperand(0).getReg();
2796 Register SrcReg = MI.getOperand(1).getReg();
2797
2798 const LLT S32 = LLT::scalar(32);
2799 LLT DstTy = MRI.getType(DstReg);
2800 LLT SrcTy = MRI.getType(SrcReg);
2801
2802 if (foldExtractEltToCmpSelect(B, MI, OpdMapper))
2803 return;
2804
2805 const ValueMapping &DstMapping
2806 = OpdMapper.getInstrMapping().getOperandMapping(0);
2807 const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank;
2808 const RegisterBank *SrcBank =
2809 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2810 const RegisterBank *IdxBank =
2811 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2812
2813 Register BaseIdxReg;
2814 unsigned ConstOffset;
2815 std::tie(BaseIdxReg, ConstOffset) =
2816 AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(2).getReg());
2817
2818 // See if the index is an add of a constant which will be foldable by moving
2819 // the base register of the index later if this is going to be executed in a
2820 // waterfall loop. This is essentially to reassociate the add of a constant
2821 // with the readfirstlane.
2822 bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
2823 ConstOffset > 0 &&
2824 ConstOffset < SrcTy.getNumElements();
2825
2826 // Move the base register. We'll re-insert the add later.
2827 if (ShouldMoveIndexIntoLoop)
2828 MI.getOperand(2).setReg(BaseIdxReg);
2829
2830 // If this is a VGPR result only because the index was a VGPR result, the
2831 // actual indexing will be done on the SGPR source vector, which will
2832 // produce a scalar result. We need to copy to the VGPR result inside the
2833 // waterfall loop.
2834 const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank &&
2835 SrcBank == &AMDGPU::SGPRRegBank;
2836 if (DstRegs.empty()) {
2837 applyDefaultMapping(OpdMapper);
2838
2839 executeInWaterfallLoop(B, MI, {2});
2840
2841 if (NeedCopyToVGPR) {
2842 // We don't want a phi for this temporary reg.
2843 Register TmpReg = MRI.createGenericVirtualRegister(DstTy);
2844 MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank);
2845 MI.getOperand(0).setReg(TmpReg);
2846 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
2847
2848 // Use a v_mov_b32 here to make the exec dependency explicit.
2849 buildVCopy(B, DstReg, TmpReg);
2850 }
2851
2852 // Re-insert the constant offset add inside the waterfall loop.
2853 if (ShouldMoveIndexIntoLoop)
2854 reinsertVectorIndexAdd(B, MI, 2, ConstOffset);
2855
2856 return;
2857 }
2858
2859 assert(DstTy.getSizeInBits() == 64);
2860
2861 LLT Vec32 = LLT::fixed_vector(2 * SrcTy.getNumElements(), 32);
2862
2863 auto CastSrc = B.buildBitcast(Vec32, SrcReg);
2864 auto One = B.buildConstant(S32, 1);
2865
2866 MachineBasicBlock::iterator MII = MI.getIterator();
2867
2868 // Split the vector index into 32-bit pieces. Prepare to move all of the
2869 // new instructions into a waterfall loop if necessary.
2870 //
2871 // Don't put the bitcast or constant in the loop.
2872 MachineInstrSpan Span(MII, &B.getMBB());
2873
2874 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
2875 auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
2876 auto IdxHi = B.buildAdd(S32, IdxLo, One);
2877
2878 auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo);
2879 auto Extract1 = B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi);
2880
2881 MRI.setRegBank(DstReg, *DstBank);
2882 MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
2883 MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
2884 MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
2885 MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
2886
2887 SmallSet<Register, 4> OpsToWaterfall;
2888 if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) {
2889 MI.eraseFromParent();
2890 return;
2891 }
2892
2893 // Remove the original instruction to avoid potentially confusing the
2894 // waterfall loop logic.
2895 B.setInstr(*Span.begin());
2896 MI.eraseFromParent();
2897 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
2898 OpsToWaterfall);
2899
2900 if (NeedCopyToVGPR) {
2901 MachineBasicBlock *LoopBB = Extract1->getParent();
2902 Register TmpReg0 = MRI.createGenericVirtualRegister(S32);
2903 Register TmpReg1 = MRI.createGenericVirtualRegister(S32);
2904 MRI.setRegBank(TmpReg0, AMDGPU::SGPRRegBank);
2905 MRI.setRegBank(TmpReg1, AMDGPU::SGPRRegBank);
2906
2907 Extract0->getOperand(0).setReg(TmpReg0);
2908 Extract1->getOperand(0).setReg(TmpReg1);
2909
2910 B.setInsertPt(*LoopBB, ++Extract1->getIterator());
2911
2912 buildVCopy(B, DstRegs[0], TmpReg0);
2913 buildVCopy(B, DstRegs[1], TmpReg1);
2914 }
2915
2916 if (ShouldMoveIndexIntoLoop)
2917 reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
2918
2919 return;
2920 }
2921 case AMDGPU::G_INSERT_VECTOR_ELT: {
2922 SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
2923
2924 Register DstReg = MI.getOperand(0).getReg();
2925 LLT VecTy = MRI.getType(DstReg);
2926
2927 assert(OpdMapper.getVRegs(0).empty());
2928 assert(OpdMapper.getVRegs(3).empty());
2929
2930 if (substituteSimpleCopyRegs(OpdMapper, 1))
2931 MRI.setType(MI.getOperand(1).getReg(), VecTy);
2932
2933 if (foldInsertEltToCmpSelect(B, MI, OpdMapper))
2934 return;
2935
2936 const RegisterBank *IdxBank =
2937 OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
2938
2939 Register SrcReg = MI.getOperand(1).getReg();
2940 Register InsReg = MI.getOperand(2).getReg();
2941 LLT InsTy = MRI.getType(InsReg);
2942 (void)InsTy;
2943
2944 Register BaseIdxReg;
2945 unsigned ConstOffset;
2946 std::tie(BaseIdxReg, ConstOffset) =
2947 AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(3).getReg());
2948
2949 // See if the index is an add of a constant which will be foldable by moving
2950 // the base register of the index later if this is going to be executed in a
2951 // waterfall loop. This is essentially to reassociate the add of a constant
2952 // with the readfirstlane.
2953 bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
2954 ConstOffset > 0 &&
2955 ConstOffset < VecTy.getNumElements();
2956
2957 // Move the base register. We'll re-insert the add later.
2958 if (ShouldMoveIndexIntoLoop)
2959 MI.getOperand(3).setReg(BaseIdxReg);
2960
2961
2962 if (InsRegs.empty()) {
2963 executeInWaterfallLoop(B, MI, {3});
2964
2965 // Re-insert the constant offset add inside the waterfall loop.
2966 if (ShouldMoveIndexIntoLoop) {
2967 reinsertVectorIndexAdd(B, MI, 3, ConstOffset);
2968 }
2969
2970 return;
2971 }
2972
2973 assert(InsTy.getSizeInBits() == 64);
2974
2975 const LLT S32 = LLT::scalar(32);
2976 LLT Vec32 = LLT::fixed_vector(2 * VecTy.getNumElements(), 32);
2977
2978 auto CastSrc = B.buildBitcast(Vec32, SrcReg);
2979 auto One = B.buildConstant(S32, 1);
2980
2981 // Split the vector index into 32-bit pieces. Prepare to move all of the
2982 // new instructions into a waterfall loop if necessary.
2983 //
2984 // Don't put the bitcast or constant in the loop.
2985 MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB());
2986
2987 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
2988 auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
2989 auto IdxHi = B.buildAdd(S32, IdxLo, One);
2990
2991 auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo);
2992 auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi);
2993
2994 const RegisterBank *DstBank =
2995 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2996 const RegisterBank *SrcBank =
2997 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2998 const RegisterBank *InsSrcBank =
2999 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
3000
3001 MRI.setRegBank(InsReg, *InsSrcBank);
3002 MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
3003 MRI.setRegBank(InsLo.getReg(0), *DstBank);
3004 MRI.setRegBank(InsHi.getReg(0), *DstBank);
3005 MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
3006 MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
3007 MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
3008
3009
3010 SmallSet<Register, 4> OpsToWaterfall;
3011 if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) {
3012 B.setInsertPt(B.getMBB(), MI);
3013 B.buildBitcast(DstReg, InsHi);
3014 MI.eraseFromParent();
3015 return;
3016 }
3017
3018 B.setInstr(*Span.begin());
3019 MI.eraseFromParent();
3020
3021 // Figure out the point after the waterfall loop before mangling the control
3022 // flow.
3023 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
3024 OpsToWaterfall);
3025
3026 // The insertion point is now right after the original instruction.
3027 //
3028 // Keep the bitcast to the original vector type out of the loop. Doing this
3029 // saves an extra phi we don't need inside the loop.
3030 B.buildBitcast(DstReg, InsHi);
3031
3032 // Re-insert the constant offset add inside the waterfall loop.
3033 if (ShouldMoveIndexIntoLoop)
3034 reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
3035
3036 return;
3037 }
3038 case AMDGPU::G_AMDGPU_BUFFER_LOAD:
3039 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
3040 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
3041 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
3042 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
3043 case AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE:
3044 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE:
3045 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT_TFE:
3046 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE:
3047 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE_TFE:
3048 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
3049 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE:
3050 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
3051 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
3052 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
3053 case AMDGPU::G_AMDGPU_BUFFER_STORE:
3054 case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
3055 case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
3056 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
3057 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16:
3058 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
3059 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: {
3060 applyDefaultMapping(OpdMapper);
3061 executeInWaterfallLoop(B, MI, {1, 4});
3062 return;
3063 }
3064 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
3065 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
3066 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
3067 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
3068 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
3069 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
3070 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
3071 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
3072 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
3073 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
3074 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
3075 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: {
3076 applyDefaultMapping(OpdMapper);
3077 executeInWaterfallLoop(B, MI, {2, 5});
3078 return;
3079 }
3080 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
3081 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
3082 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
3083 applyDefaultMapping(OpdMapper);
3084 executeInWaterfallLoop(B, MI, {2, 5});
3085 return;
3086 }
3087 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
3088 applyDefaultMapping(OpdMapper);
3089 executeInWaterfallLoop(B, MI, {3, 6});
3090 return;
3091 }
3092 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD:
3093 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE:
3094 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE:
3095 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT:
3096 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT: {
3097 applyMappingSBufferLoad(B, OpdMapper);
3098 return;
3099 }
3100 case AMDGPU::G_INTRINSIC:
3101 case AMDGPU::G_INTRINSIC_CONVERGENT: {
3102 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
3103 case Intrinsic::amdgcn_readlane: {
3104 substituteSimpleCopyRegs(OpdMapper, 2);
3105
3106 assert(OpdMapper.getVRegs(0).empty());
3107 assert(OpdMapper.getVRegs(3).empty());
3108
3109 // Make sure the index is an SGPR. It doesn't make sense to run this in a
3110 // waterfall loop, so assume it's a uniform value.
3111 constrainOpWithReadfirstlane(B, MI, 3); // Index
3112 return;
3113 }
3114 case Intrinsic::amdgcn_writelane: {
3115 assert(OpdMapper.getVRegs(0).empty());
3116 assert(OpdMapper.getVRegs(2).empty());
3117 assert(OpdMapper.getVRegs(3).empty());
3118
3119 substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val
3120 constrainOpWithReadfirstlane(B, MI, 2); // Source value
3121 constrainOpWithReadfirstlane(B, MI, 3); // Index
3122 return;
3123 }
3124 case Intrinsic::amdgcn_interp_p1:
3125 case Intrinsic::amdgcn_interp_p2:
3126 case Intrinsic::amdgcn_interp_mov:
3127 case Intrinsic::amdgcn_interp_p1_f16:
3128 case Intrinsic::amdgcn_interp_p2_f16:
3129 case Intrinsic::amdgcn_lds_param_load: {
3130 applyDefaultMapping(OpdMapper);
3131
3132 // Readlane for m0 value, which is always the last operand.
3133 // FIXME: Should this be a waterfall loop instead?
3134 constrainOpWithReadfirstlane(B, MI, MI.getNumOperands() - 1); // Index
3135 return;
3136 }
3137 case Intrinsic::amdgcn_interp_inreg_p10:
3138 case Intrinsic::amdgcn_interp_inreg_p2:
3139 case Intrinsic::amdgcn_interp_inreg_p10_f16:
3140 case Intrinsic::amdgcn_interp_inreg_p2_f16:
3141 case Intrinsic::amdgcn_interp_p10_rtz_f16:
3142 case Intrinsic::amdgcn_interp_p2_rtz_f16:
3143 applyDefaultMapping(OpdMapper);
3144 return;
3145 case Intrinsic::amdgcn_permlane16:
3146 case Intrinsic::amdgcn_permlanex16: {
3147 // Doing a waterfall loop over these wouldn't make any sense.
3148 substituteSimpleCopyRegs(OpdMapper, 2);
3149 substituteSimpleCopyRegs(OpdMapper, 3);
3150 constrainOpWithReadfirstlane(B, MI, 4);
3151 constrainOpWithReadfirstlane(B, MI, 5);
3152 return;
3153 }
3154 case Intrinsic::amdgcn_sbfe:
3155 applyMappingBFE(B, OpdMapper, true);
3156 return;
3157 case Intrinsic::amdgcn_ubfe:
3158 applyMappingBFE(B, OpdMapper, false);
3159 return;
3160 case Intrinsic::amdgcn_inverse_ballot:
3161 case Intrinsic::amdgcn_s_bitreplicate:
3162 case Intrinsic::amdgcn_s_quadmask:
3163 case Intrinsic::amdgcn_s_wqm:
3164 applyDefaultMapping(OpdMapper);
3165 constrainOpWithReadfirstlane(B, MI, 2); // Mask
3166 return;
3167 case Intrinsic::amdgcn_ballot:
3168 // Use default handling and insert copy to vcc source.
3169 break;
3170 }
3171 break;
3172 }
3173 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
3174 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
3175 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET:
3176 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
3177 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
3178 const AMDGPU::RsrcIntrinsic *RSrcIntrin =
3179 AMDGPU::lookupRsrcIntrinsic(AMDGPU::getIntrinsicID(MI));
3180 assert(RSrcIntrin && RSrcIntrin->IsImage);
3181 // Non-images can have complications from operands that allow both SGPR
3182 // and VGPR. For now it's too complicated to figure out the final opcode
3183 // to derive the register bank from the MCInstrDesc.
3184 applyMappingImage(B, MI, OpdMapper, RSrcIntrin->RsrcArg);
3185 return;
3186 }
3187 case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
3188 unsigned N = MI.getNumExplicitOperands() - 2;
3189 applyDefaultMapping(OpdMapper);
3190 executeInWaterfallLoop(B, MI, {N});
3191 return;
3192 }
3193 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
3194 case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS: {
3195 auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
3196 switch (IntrID) {
3197 case Intrinsic::amdgcn_ds_ordered_add:
3198 case Intrinsic::amdgcn_ds_ordered_swap: {
3199 // This is only allowed to execute with 1 lane, so readfirstlane is safe.
3200 assert(OpdMapper.getVRegs(0).empty());
3201 substituteSimpleCopyRegs(OpdMapper, 3);
3202 constrainOpWithReadfirstlane(B, MI, 2); // M0
3203 return;
3204 }
3205 case Intrinsic::amdgcn_ds_gws_init:
3206 case Intrinsic::amdgcn_ds_gws_barrier:
3207 case Intrinsic::amdgcn_ds_gws_sema_br: {
3208 // Only the first lane executes, so readfirstlane is safe.
3209 substituteSimpleCopyRegs(OpdMapper, 1);
3210 constrainOpWithReadfirstlane(B, MI, 2); // M0
3211 return;
3212 }
3213 case Intrinsic::amdgcn_ds_gws_sema_v:
3214 case Intrinsic::amdgcn_ds_gws_sema_p:
3215 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
3216 // Only the first lane executes, so readfirstlane is safe.
3217 constrainOpWithReadfirstlane(B, MI, 1); // M0
3218 return;
3219 }
3220 case Intrinsic::amdgcn_ds_append:
3221 case Intrinsic::amdgcn_ds_consume: {
3222 constrainOpWithReadfirstlane(B, MI, 2); // M0
3223 return;
3224 }
3225 case Intrinsic::amdgcn_s_sendmsg:
3226 case Intrinsic::amdgcn_s_sendmsghalt: {
3227 // FIXME: Should this use a waterfall loop?
3228 constrainOpWithReadfirstlane(B, MI, 2); // M0
3229 return;
3230 }
3231 case Intrinsic::amdgcn_s_setreg: {
3232 constrainOpWithReadfirstlane(B, MI, 2);
3233 return;
3234 }
3235 case Intrinsic::amdgcn_s_ttracedata:
3236 constrainOpWithReadfirstlane(B, MI, 1); // M0
3237 return;
3238 case Intrinsic::amdgcn_raw_buffer_load_lds:
3239 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: {
3240 applyDefaultMapping(OpdMapper);
3241 constrainOpWithReadfirstlane(B, MI, 1); // rsrc
3242 constrainOpWithReadfirstlane(B, MI, 2); // M0
3243 constrainOpWithReadfirstlane(B, MI, 5); // soffset
3244 return;
3245 }
3246 case Intrinsic::amdgcn_struct_buffer_load_lds:
3247 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
3248 applyDefaultMapping(OpdMapper);
3249 constrainOpWithReadfirstlane(B, MI, 1); // rsrc
3250 constrainOpWithReadfirstlane(B, MI, 2); // M0
3251 constrainOpWithReadfirstlane(B, MI, 6); // soffset
3252 return;
3253 }
3254 case Intrinsic::amdgcn_global_load_lds: {
3255 applyDefaultMapping(OpdMapper);
3256 constrainOpWithReadfirstlane(B, MI, 2);
3257 return;
3258 }
3259 case Intrinsic::amdgcn_lds_direct_load: {
3260 applyDefaultMapping(OpdMapper);
3261 // Readlane for m0 value, which is always the last operand.
3262 constrainOpWithReadfirstlane(B, MI, MI.getNumOperands() - 1); // Index
3263 return;
3264 }
3265 case Intrinsic::amdgcn_exp_row:
3266 applyDefaultMapping(OpdMapper);
3267 constrainOpWithReadfirstlane(B, MI, 8); // M0
3268 return;
3269 case Intrinsic::amdgcn_s_sleep_var:
3270 assert(OpdMapper.getVRegs(1).empty());
3271 constrainOpWithReadfirstlane(B, MI, 1);
3272 return;
3273 case Intrinsic::amdgcn_s_barrier_signal_var:
3274 case Intrinsic::amdgcn_s_barrier_join:
3275 case Intrinsic::amdgcn_s_wakeup_barrier:
3276 constrainOpWithReadfirstlane(B, MI, 1);
3277 return;
3278 case Intrinsic::amdgcn_s_barrier_signal_isfirst_var:
3279 constrainOpWithReadfirstlane(B, MI, 2);
3280 return;
3281 case Intrinsic::amdgcn_s_barrier_init:
3282 constrainOpWithReadfirstlane(B, MI, 1);
3283 constrainOpWithReadfirstlane(B, MI, 2);
3284 return;
3285 case Intrinsic::amdgcn_s_get_barrier_state: {
3286 constrainOpWithReadfirstlane(B, MI, 2);
3287 return;
3288 }
3289 default: {
3290 if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
3291 AMDGPU::lookupRsrcIntrinsic(IntrID)) {
3292 // Non-images can have complications from operands that allow both SGPR
3293 // and VGPR. For now it's too complicated to figure out the final opcode
3294 // to derive the register bank from the MCInstrDesc.
3295 if (RSrcIntrin->IsImage) {
3296 applyMappingImage(B, MI, OpdMapper, RSrcIntrin->RsrcArg);
3297 return;
3298 }
3299 }
3300
3301 break;
3302 }
3303 }
3304 break;
3305 }
3306 case AMDGPU::G_SI_CALL: {
3307 // Use a set to avoid extra readfirstlanes in the case where multiple
3308 // operands are the same register.
3309 SmallSet<Register, 4> SGPROperandRegs;
3310
3311 if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, {1}))
3312 break;
3313
3314 // Move all copies to physical SGPRs that are used by the call instruction
3315 // into the loop block. Start searching backwards from the call for these
3316 // copies until the ADJCALLSTACKUP is reached.
3317 unsigned FrameSetupOpcode = AMDGPU::ADJCALLSTACKUP;
3318 unsigned FrameDestroyOpcode = AMDGPU::ADJCALLSTACKDOWN;
3319
3320 // Move all non-copies before the copies, so that a complete range can be
3321 // moved into the waterfall loop.
3322 SmallVector<MachineInstr *, 4> NonCopyInstrs;
3323 // Count of NonCopyInstrs found until the current LastCopy.
3324 unsigned NonCopyInstrsLen = 0;
3325 MachineBasicBlock::iterator Start(&MI);
3326 MachineBasicBlock::iterator LastCopy = Start;
3327 MachineBasicBlock *MBB = MI.getParent();
3328 const SIMachineFunctionInfo *Info =
3329 MBB->getParent()->getInfo<SIMachineFunctionInfo>();
3330 while (Start->getOpcode() != FrameSetupOpcode) {
3331 --Start;
3332 bool IsCopy = false;
3333 if (Start->getOpcode() == AMDGPU::COPY) {
3334 auto &Dst = Start->getOperand(0);
3335 if (Dst.isReg()) {
3336 Register Reg = Dst.getReg();
3337 if (Reg.isPhysical() && MI.readsRegister(Reg, TRI)) {
3338 IsCopy = true;
3339 } else {
3340 // Also move the copy from the scratch rsrc descriptor into the loop
3341 // to allow it to be optimized away.
3342 auto &Src = Start->getOperand(1);
3343 if (Src.isReg()) {
3344 Reg = Src.getReg();
3345 IsCopy = Info->getScratchRSrcReg() == Reg;
3346 }
3347 }
3348 }
3349 }
3350
3351 if (IsCopy) {
3352 LastCopy = Start;
3353 NonCopyInstrsLen = NonCopyInstrs.size();
3354 } else {
3355 NonCopyInstrs.push_back(&*Start);
3356 }
3357 }
3358 NonCopyInstrs.resize(NonCopyInstrsLen);
3359
3360 for (auto *NonCopy : reverse(NonCopyInstrs)) {
3361 MBB->splice(LastCopy, MBB, NonCopy->getIterator());
3362 }
3363 Start = LastCopy;
3364
3365 // Do the same for the copies after the call.
3366 NonCopyInstrs.clear();
3367 NonCopyInstrsLen = 0;
3368 MachineBasicBlock::iterator End(&MI);
3369 LastCopy = End;
3370 while (End->getOpcode() != FrameDestroyOpcode) {
3371 ++End;
3372 bool IsCopy = false;
3373 if (End->getOpcode() == AMDGPU::COPY) {
3374 auto &Src = End->getOperand(1);
3375 if (Src.isReg()) {
3376 Register Reg = Src.getReg();
3377 IsCopy = Reg.isPhysical() && MI.modifiesRegister(Reg, TRI);
3378 }
3379 }
3380
3381 if (IsCopy) {
3382 LastCopy = End;
3383 NonCopyInstrsLen = NonCopyInstrs.size();
3384 } else {
3385 NonCopyInstrs.push_back(&*End);
3386 }
3387 }
3388 NonCopyInstrs.resize(NonCopyInstrsLen);
3389
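    // Move the non-copies that sit between the call and its last trailing copy
    // to just past that copy, then extend End so the waterfall range covers
    // the call together with all of its argument and result copies.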
3390 End = LastCopy;
3391 ++LastCopy;
3392 for (auto *NonCopy : reverse(NonCopyInstrs)) {
3393 MBB->splice(LastCopy, MBB, NonCopy->getIterator());
3394 }
3395
3396 ++End;
3397 B.setInsertPt(B.getMBB(), Start);
3398 executeInWaterfallLoop(B, make_range(Start, End), SGPROperandRegs);
3399 break;
3400 }
3401 case AMDGPU::G_LOAD:
3402 case AMDGPU::G_ZEXTLOAD:
3403 case AMDGPU::G_SEXTLOAD: {
3404 if (applyMappingLoad(B, OpdMapper, MI))
3405 return;
3406 break;
3407 }
3408 case AMDGPU::G_DYN_STACKALLOC:
3409 applyMappingDynStackAlloc(B, OpdMapper, MI);
3410 return;
3411 case AMDGPU::G_STACKRESTORE: {
3412 applyDefaultMapping(OpdMapper);
3413 constrainOpWithReadfirstlane(B, MI, 0);
3414 return;
3415 }
3416 case AMDGPU::G_SBFX:
3417 applyMappingBFE(B, OpdMapper, /*Signed*/ true);
3418 return;
3419 case AMDGPU::G_UBFX:
3420 applyMappingBFE(B, OpdMapper, /*Signed*/ false);
3421 return;
3422 case AMDGPU::G_AMDGPU_MAD_U64_U32:
3423 case AMDGPU::G_AMDGPU_MAD_I64_I32:
3424 applyMappingMAD_64_32(B, OpdMapper);
3425 return;
3426 case AMDGPU::G_PREFETCH: {
3427 if (!Subtarget.hasPrefetch()) {
3428 MI.eraseFromParent();
3429 return;
3430 }
3431 Register PtrReg = MI.getOperand(0).getReg();
3432 unsigned PtrBank = getRegBankID(PtrReg, MRI, AMDGPU::SGPRRegBankID);
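    // A divergent (VGPR) pointer can't be handled here; just drop the
    // prefetch.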
3433 if (PtrBank == AMDGPU::VGPRRegBankID) {
3434 MI.eraseFromParent();
3435 return;
3436 }
3437 unsigned AS = MRI.getType(PtrReg).getAddressSpace();
3438 if (!AMDGPU::isFlatGlobalAddrSpace(AS) &&
3439 AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
3440 MI.eraseFromParent();
3441 return;
3442 }
3443 applyDefaultMapping(OpdMapper);
3444 return;
3445 }
3446 default:
3447 break;
3448 }
3449
3450 return applyDefaultMapping(OpdMapper);
3451 }
3452
3453 // vgpr, sgpr -> vgpr
3454 // vgpr, agpr -> vgpr
3455 // agpr, agpr -> agpr
3456 // agpr, sgpr -> vgpr
3457 static unsigned regBankUnion(unsigned RB0, unsigned RB1) {
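  // An unassigned (invalid) bank on either side doesn't constrain the result.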
3458 if (RB0 == AMDGPU::InvalidRegBankID)
3459 return RB1;
3460 if (RB1 == AMDGPU::InvalidRegBankID)
3461 return RB0;
3462
3463 if (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID)
3464 return AMDGPU::SGPRRegBankID;
3465
3466 if (RB0 == AMDGPU::AGPRRegBankID && RB1 == AMDGPU::AGPRRegBankID)
3467 return AMDGPU::AGPRRegBankID;
3468
3469 return AMDGPU::VGPRRegBankID;
3470 }
3471
3472 static unsigned regBankBoolUnion(unsigned RB0, unsigned RB1) {
3473 if (RB0 == AMDGPU::InvalidRegBankID)
3474 return RB1;
3475 if (RB1 == AMDGPU::InvalidRegBankID)
3476 return RB0;
3477
3478 // vcc, vcc -> vcc
3479 // vcc, sgpr -> vcc
3480 // vcc, vgpr -> vcc
3481 if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID)
3482 return AMDGPU::VCCRegBankID;
3483
3484 // vcc, vgpr -> vgpr
3485 return regBankUnion(RB0, RB1);
3486 }
3487
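/// Compute the common register bank for all register operands of \p MI by
/// folding their banks together with regBankUnion. Stops early once the
/// result has degraded to VGPR, since no further operand can change it.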
3488 unsigned AMDGPURegisterBankInfo::getMappingType(const MachineRegisterInfo &MRI,
3489 const MachineInstr &MI) const {
3490 unsigned RegBank = AMDGPU::InvalidRegBankID;
3491
3492 for (const MachineOperand &MO : MI.operands()) {
3493 if (!MO.isReg())
3494 continue;
3495 Register Reg = MO.getReg();
3496 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3497 RegBank = regBankUnion(RegBank, Bank->getID());
3498 if (RegBank == AMDGPU::VGPRRegBankID)
3499 break;
3500 }
3501 }
3502
3503 return RegBank;
3504 }
3505
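/// Return true if every register operand of \p MI that already has an
/// assigned bank is in the SGPR bank, i.e. the instruction can use a pure
/// scalar (SALU) mapping.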
3506 bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
3507 const MachineFunction &MF = *MI.getParent()->getParent();
3508 const MachineRegisterInfo &MRI = MF.getRegInfo();
3509 for (const MachineOperand &MO : MI.operands()) {
3510 if (!MO.isReg())
3511 continue;
3512 Register Reg = MO.getReg();
3513 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3514 if (Bank->getID() != AMDGPU::SGPRRegBankID)
3515 return false;
3516 }
3517 }
3518 return true;
3519 }
3520
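/// Build a mapping that places every register operand of \p MI in the SGPR
/// bank, using each operand's size in bits.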
3521 const RegisterBankInfo::InstructionMapping &
3522 AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
3523 const MachineFunction &MF = *MI.getParent()->getParent();
3524 const MachineRegisterInfo &MRI = MF.getRegInfo();
3525 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3526
3527 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3528 const MachineOperand &SrcOp = MI.getOperand(i);
3529 if (!SrcOp.isReg())
3530 continue;
3531
3532 unsigned Size = getSizeInBits(SrcOp.getReg(), MRI, *TRI);
3533 OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3534 }
3535 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3536 MI.getNumOperands());
3537 }
3538
3539 const RegisterBankInfo::InstructionMapping &
3540 AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
3541 const MachineFunction &MF = *MI.getParent()->getParent();
3542 const MachineRegisterInfo &MRI = MF.getRegInfo();
3543 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3544
3545 // Even though we technically could use SGPRs, this would require knowledge of
3546 // the constant bus restriction. Force all sources to VGPR (except for VCC).
3547 //
3548 // TODO: Unary ops are trivially OK, so accept SGPRs?
3549 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3550 const MachineOperand &Src = MI.getOperand(i);
3551 if (!Src.isReg())
3552 continue;
3553
3554 unsigned Size = getSizeInBits(Src.getReg(), MRI, *TRI);
3555 unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
3556 OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
3557 }
3558
3559 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3560 MI.getNumOperands());
3561 }
3562
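/// Build a mapping that places every register operand of \p MI in the VGPR
/// bank.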
3563 const RegisterBankInfo::InstructionMapping &
3564 AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
3565 const MachineFunction &MF = *MI.getParent()->getParent();
3566 const MachineRegisterInfo &MRI = MF.getRegInfo();
3567 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3568
3569 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
3570 const MachineOperand &Op = MI.getOperand(I);
3571 if (!Op.isReg())
3572 continue;
3573
3574 unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI);
3575 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3576 }
3577
3578 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3579 MI.getNumOperands());
3580 }
3581
3582 const RegisterBankInfo::InstructionMapping &
3583 AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI,
3584 const MachineInstr &MI,
3585 int RsrcIdx) const {
3586 // The reported argument index is relative to the IR intrinsic call arguments,
3587 // so we need to shift by the number of defs and the intrinsic ID.
3588 RsrcIdx += MI.getNumExplicitDefs() + 1;
3589
3590 const int NumOps = MI.getNumOperands();
3591 SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps);
3592
3593 // TODO: Should packed/unpacked D16 difference be reported here as part of
3594 // the value mapping?
3595 for (int I = 0; I != NumOps; ++I) {
3596 if (!MI.getOperand(I).isReg())
3597 continue;
3598
3599 Register OpReg = MI.getOperand(I).getReg();
3600 // We replace some dead address operands with $noreg
3601 if (!OpReg)
3602 continue;
3603
3604 unsigned Size = getSizeInBits(OpReg, MRI, *TRI);
3605
3606 // FIXME: Probably need a new intrinsic register bank searchable table to
3607 // handle arbitrary intrinsics easily.
3608 //
3609 // If this has a sampler, it immediately follows rsrc.
3610 const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1;
3611
3612 if (MustBeSGPR) {
3613 // If this must be an SGPR, we must report whatever it is as legal.
3614 unsigned NewBank = getRegBankID(OpReg, MRI, AMDGPU::SGPRRegBankID);
3615 OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size);
3616 } else {
3617 // Some operands must be VGPR, and these are easy to copy to.
3618 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3619 }
3620 }
3621
3622 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps);
3623 }
3624
3625 /// Return the mapping for a pointer argument.
3626 const RegisterBankInfo::ValueMapping *
3627 AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI,
3628 Register PtrReg) const {
3629 LLT PtrTy = MRI.getType(PtrReg);
3630 unsigned Size = PtrTy.getSizeInBits();
3631 if (Subtarget.useFlatForGlobal() ||
3632 !AMDGPU::isFlatGlobalAddrSpace(PtrTy.getAddressSpace()))
3633 return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3634
3635 // If we're using MUBUF instructions for global memory, an SGPR base register
3636 // is possible. Otherwise this needs to be a VGPR.
3637 const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3638 return AMDGPU::getValueMapping(PtrBank->getID(), Size);
3639 }
3640
3641 const RegisterBankInfo::InstructionMapping &
3642 AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
3643
3644 const MachineFunction &MF = *MI.getParent()->getParent();
3645 const MachineRegisterInfo &MRI = MF.getRegInfo();
3646 SmallVector<const ValueMapping*, 2> OpdsMapping(2);
3647 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3648 Register PtrReg = MI.getOperand(1).getReg();
3649 LLT PtrTy = MRI.getType(PtrReg);
3650 unsigned AS = PtrTy.getAddressSpace();
3651 unsigned PtrSize = PtrTy.getSizeInBits();
3652
3653 const ValueMapping *ValMapping;
3654 const ValueMapping *PtrMapping;
3655
3656 const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3657
3658 if (PtrBank == &AMDGPU::SGPRRegBank && AMDGPU::isFlatGlobalAddrSpace(AS)) {
3659 if (isScalarLoadLegal(MI)) {
3660 // We have a uniform instruction so we want to use an SMRD load
3661 ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3662 PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
3663 } else {
3664 ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3665
3666 // If we're using MUBUF instructions for global memory, an SGPR base
3667 // register is possible. Otherwise this needs to be a VGPR.
3668 unsigned PtrBankID = Subtarget.useFlatForGlobal() ?
3669 AMDGPU::VGPRRegBankID : AMDGPU::SGPRRegBankID;
3670
3671 PtrMapping = AMDGPU::getValueMapping(PtrBankID, PtrSize);
3672 }
3673 } else {
3674 ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3675 PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
3676 }
3677
3678 OpdsMapping[0] = ValMapping;
3679 OpdsMapping[1] = PtrMapping;
3680 const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
3681 1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands());
3682 return Mapping;
3683
3684 // FIXME: Do we want to add a mapping for FLAT load, or should we just
3685 // handle that during instruction selection?
3686 }
3687
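/// Return the ID of the bank currently assigned to \p Reg, or \p Default if
/// no bank has been assigned yet.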
3688 unsigned
3689 AMDGPURegisterBankInfo::getRegBankID(Register Reg,
3690 const MachineRegisterInfo &MRI,
3691 unsigned Default) const {
3692 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
3693 return Bank ? Bank->getID() : Default;
3694 }
3695
3696 const RegisterBankInfo::ValueMapping *
3697 AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg,
3698 const MachineRegisterInfo &MRI,
3699 const TargetRegisterInfo &TRI) const {
3700 // Lie and claim anything is legal, even though this needs to be an SGPR.
3701 // applyMapping will have to deal with it as a waterfall loop.
3702 unsigned Bank = getRegBankID(Reg, MRI, AMDGPU::SGPRRegBankID);
3703 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3704 return AMDGPU::getValueMapping(Bank, Size);
3705 }
3706
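/// Return a value mapping that places \p Reg in the VGPR bank.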
3707 const RegisterBankInfo::ValueMapping *
3708 AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
3709 const MachineRegisterInfo &MRI,
3710 const TargetRegisterInfo &TRI) const {
3711 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3712 return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3713 }
3714
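/// Return a value mapping that places \p Reg in the AGPR bank.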
3715 const RegisterBankInfo::ValueMapping *
3716 AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg,
3717 const MachineRegisterInfo &MRI,
3718 const TargetRegisterInfo &TRI) const {
3719 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3720 return AMDGPU::getValueMapping(AMDGPU::AGPRRegBankID, Size);
3721 }
3722
3723 ///
3724 /// This function must return a legal mapping, because
3725 /// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
3726 /// in RegBankSelect::Mode::Fast. Any mapping that would cause a
3727 /// VGPR to SGPR copy to be generated is illegal.
3728 ///
3729 // Operands that must be SGPRs must accept potentially divergent VGPRs as
3730 // legal. These will be dealt with in applyMappingImpl.
3731 //
3732 const RegisterBankInfo::InstructionMapping &
3733 AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
3734 const MachineFunction &MF = *MI.getParent()->getParent();
3735 const MachineRegisterInfo &MRI = MF.getRegInfo();
3736
3737 if (MI.isCopy() || MI.getOpcode() == AMDGPU::G_FREEZE) {
3738 // The default logic bothers to analyze impossible alternative mappings. We
3739 // want the most straightforward mapping, so just directly handle this.
3740 const RegisterBank *DstBank = getRegBank(MI.getOperand(0).getReg(), MRI,
3741 *TRI);
3742 const RegisterBank *SrcBank = getRegBank(MI.getOperand(1).getReg(), MRI,
3743 *TRI);
3744 assert(SrcBank && "src bank should have been assigned already");
3745 if (!DstBank)
3746 DstBank = SrcBank;
3747
3748 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3749 if (MI.getOpcode() != AMDGPU::G_FREEZE &&
3750 cannotCopy(*DstBank, *SrcBank, TypeSize::getFixed(Size)))
3751 return getInvalidInstructionMapping();
3752
3753 const ValueMapping &ValMap = getValueMapping(0, Size, *DstBank);
3754 unsigned OpdsMappingSize = MI.isCopy() ? 1 : 2;
3755 SmallVector<const ValueMapping *, 1> OpdsMapping(OpdsMappingSize);
3756 OpdsMapping[0] = &ValMap;
3757 if (MI.getOpcode() == AMDGPU::G_FREEZE)
3758 OpdsMapping[1] = &ValMap;
3759
3760 return getInstructionMapping(
3761 1, /*Cost*/ 1,
3762 /*OperandsMapping*/ getOperandsMapping(OpdsMapping), OpdsMappingSize);
3763 }
3764
3765 if (MI.isRegSequence()) {
3766 // If any input is a VGPR, the result must be a VGPR. The default handling
3767 // assumes any copy between banks is legal.
3768 unsigned BankID = AMDGPU::SGPRRegBankID;
3769
3770 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3771 auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI);
3772 // It doesn't make sense to use vcc or scc banks here, so just ignore
3773 // them.
3774 if (OpBank != AMDGPU::SGPRRegBankID) {
3775 BankID = AMDGPU::VGPRRegBankID;
3776 break;
3777 }
3778 }
3779 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3780
3781 const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID));
3782 return getInstructionMapping(
3783 1, /*Cost*/ 1,
3784 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3785 }
3786
3787 // The default handling is broken and doesn't handle illegal SGPR->VGPR copies
3788 // properly.
3789 //
3790 // TODO: There are additional exec masking dependencies to analyze.
3791 if (auto *PHI = dyn_cast<GPhi>(&MI)) {
3792 unsigned ResultBank = AMDGPU::InvalidRegBankID;
3793 Register DstReg = PHI->getReg(0);
3794
3795 // Sometimes the result may have already been assigned a bank.
3796 if (const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI))
3797 ResultBank = DstBank->getID();
3798
3799 for (unsigned I = 0; I < PHI->getNumIncomingValues(); ++I) {
3800 Register Reg = PHI->getIncomingValue(I);
3801 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
3802
3803 // FIXME: Assuming VGPR for any undetermined inputs.
3804 if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) {
3805 ResultBank = AMDGPU::VGPRRegBankID;
3806 break;
3807 }
3808
3809 // FIXME: Need to promote SGPR case to s32
3810 unsigned OpBank = Bank->getID();
3811 ResultBank = regBankBoolUnion(ResultBank, OpBank);
3812 }
3813
3814 assert(ResultBank != AMDGPU::InvalidRegBankID);
3815
3816 unsigned Size = MRI.getType(DstReg).getSizeInBits();
3817
3818 const ValueMapping &ValMap =
3819 getValueMapping(0, Size, getRegBank(ResultBank));
3820 return getInstructionMapping(
3821 1, /*Cost*/ 1,
3822 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3823 }
3824
3825 const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
3826 if (Mapping.isValid())
3827 return Mapping;
3828
3829 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3830
3831 switch (MI.getOpcode()) {
3832 default:
3833 return getInvalidInstructionMapping();
3834
3835 case AMDGPU::G_AND:
3836 case AMDGPU::G_OR:
3837 case AMDGPU::G_XOR:
3838 case AMDGPU::G_MUL: {
3839 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3840 if (Size == 1) {
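      // s1 values may be scalar (SGPR) or vector (VCC) booleans. If the
      // destination already has a bank, follow it; otherwise infer the target
      // bank from whatever banks the sources were already assigned.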
3841 const RegisterBank *DstBank
3842 = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI);
3843
3844 unsigned TargetBankID = AMDGPU::InvalidRegBankID;
3845 unsigned BankLHS = AMDGPU::InvalidRegBankID;
3846 unsigned BankRHS = AMDGPU::InvalidRegBankID;
3847 if (DstBank) {
3848 TargetBankID = DstBank->getID();
3849 if (DstBank == &AMDGPU::VCCRegBank) {
3850 TargetBankID = AMDGPU::VCCRegBankID;
3851 BankLHS = AMDGPU::VCCRegBankID;
3852 BankRHS = AMDGPU::VCCRegBankID;
3853 } else {
3854 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
3855 AMDGPU::SGPRRegBankID);
3856 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
3857 AMDGPU::SGPRRegBankID);
3858 }
3859 } else {
3860 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
3861 AMDGPU::VCCRegBankID);
3862 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
3863 AMDGPU::VCCRegBankID);
3864
3865 // Both inputs should be true booleans to produce a boolean result.
3866 if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) {
3867 TargetBankID = AMDGPU::VGPRRegBankID;
3868 } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) {
3869 TargetBankID = AMDGPU::VCCRegBankID;
3870 BankLHS = AMDGPU::VCCRegBankID;
3871 BankRHS = AMDGPU::VCCRegBankID;
3872 } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) {
3873 TargetBankID = AMDGPU::SGPRRegBankID;
3874 }
3875 }
3876
3877 OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size);
3878 OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size);
3879 OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size);
3880 break;
3881 }
3882
3883 if (Size == 64) {
3884
3885 if (isSALUMapping(MI)) {
3886 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size);
3887 OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
3888 } else {
3889 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
3890 unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI /*, DefaultBankID*/);
3891 OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size);
3892
3893 unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI /*, DefaultBankID*/);
3894 OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size);
3895 }
3896
3897 break;
3898 }
3899
3900 [[fallthrough]];
3901 }
3902 case AMDGPU::G_PTR_ADD:
3903 case AMDGPU::G_PTRMASK:
3904 case AMDGPU::G_ADD:
3905 case AMDGPU::G_SUB:
3906 case AMDGPU::G_SHL:
3907 case AMDGPU::G_LSHR:
3908 case AMDGPU::G_ASHR:
3909 case AMDGPU::G_UADDO:
3910 case AMDGPU::G_USUBO:
3911 case AMDGPU::G_UADDE:
3912 case AMDGPU::G_SADDE:
3913 case AMDGPU::G_USUBE:
3914 case AMDGPU::G_SSUBE:
3915 case AMDGPU::G_SMIN:
3916 case AMDGPU::G_SMAX:
3917 case AMDGPU::G_UMIN:
3918 case AMDGPU::G_UMAX:
3919 case AMDGPU::G_ABS:
3920 case AMDGPU::G_SHUFFLE_VECTOR:
3921 case AMDGPU::G_SBFX:
3922 case AMDGPU::G_UBFX:
3923 case AMDGPU::G_AMDGPU_S_MUL_I64_I32:
3924 case AMDGPU::G_AMDGPU_S_MUL_U64_U32:
3925 if (isSALUMapping(MI))
3926 return getDefaultMappingSOP(MI);
3927 return getDefaultMappingVOP(MI);
3928 case AMDGPU::G_FADD:
3929 case AMDGPU::G_FSUB:
3930 case AMDGPU::G_FMUL:
3931 case AMDGPU::G_FMA:
3932 case AMDGPU::G_FFLOOR:
3933 case AMDGPU::G_FCEIL:
3934 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
3935 case AMDGPU::G_FMINNUM:
3936 case AMDGPU::G_FMAXNUM:
3937 case AMDGPU::G_FMINIMUM:
3938 case AMDGPU::G_FMAXIMUM:
3939 case AMDGPU::G_INTRINSIC_TRUNC:
3940 case AMDGPU::G_STRICT_FADD:
3941 case AMDGPU::G_STRICT_FSUB:
3942 case AMDGPU::G_STRICT_FMUL:
3943 case AMDGPU::G_STRICT_FMA: {
3944 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3945 unsigned Size = Ty.getSizeInBits();
3946 if (Subtarget.hasSALUFloatInsts() && Ty.isScalar() &&
3947 (Size == 32 || Size == 16) && isSALUMapping(MI))
3948 return getDefaultMappingSOP(MI);
3949 return getDefaultMappingVOP(MI);
3950 }
3951 case AMDGPU::G_FPTOSI:
3952 case AMDGPU::G_FPTOUI:
3953 case AMDGPU::G_SITOFP:
3954 case AMDGPU::G_UITOFP: {
3955 unsigned SizeDst = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3956 unsigned SizeSrc = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3957 if (Subtarget.hasSALUFloatInsts() && SizeDst == 32 && SizeSrc == 32 &&
3958 isSALUMapping(MI))
3959 return getDefaultMappingSOP(MI);
3960 return getDefaultMappingVOP(MI);
3961 }
3962 case AMDGPU::G_FPTRUNC:
3963 case AMDGPU::G_FPEXT: {
3964 unsigned SizeDst = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3965 unsigned SizeSrc = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3966 if (Subtarget.hasSALUFloatInsts() && SizeDst != 64 && SizeSrc != 64 &&
3967 isSALUMapping(MI))
3968 return getDefaultMappingSOP(MI);
3969 return getDefaultMappingVOP(MI);
3970 }
3971 case AMDGPU::G_FSQRT:
3972 case AMDGPU::G_FEXP2:
3973 case AMDGPU::G_FLOG2: {
3974 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3975 if (Subtarget.hasPseudoScalarTrans() && (Size == 16 || Size == 32) &&
3976 isSALUMapping(MI))
3977 return getDefaultMappingSOP(MI);
3978 return getDefaultMappingVOP(MI);
3979 }
3980 case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU
3981 case AMDGPU::G_SSUBSAT:
3982 case AMDGPU::G_UADDSAT:
3983 case AMDGPU::G_USUBSAT:
3984 case AMDGPU::G_FMAD:
3985 case AMDGPU::G_FLDEXP:
3986 case AMDGPU::G_FMINNUM_IEEE:
3987 case AMDGPU::G_FMAXNUM_IEEE:
3988 case AMDGPU::G_FCANONICALIZE:
3989 case AMDGPU::G_STRICT_FLDEXP:
3990 case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar?
3991 case AMDGPU::G_FSHR: // TODO: Expand for scalar
3992 case AMDGPU::G_AMDGPU_FMIN_LEGACY:
3993 case AMDGPU::G_AMDGPU_FMAX_LEGACY:
3994 case AMDGPU::G_AMDGPU_RCP_IFLAG:
3995 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
3996 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
3997 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
3998 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
3999 case AMDGPU::G_AMDGPU_CVT_PK_I16_I32:
4000 case AMDGPU::G_AMDGPU_SMED3:
4001 case AMDGPU::G_AMDGPU_FMED3:
4002 return getDefaultMappingVOP(MI);
4003 case AMDGPU::G_UMULH:
4004 case AMDGPU::G_SMULH: {
4005 if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI))
4006 return getDefaultMappingSOP(MI);
4007 return getDefaultMappingVOP(MI);
4008 }
4009 case AMDGPU::G_AMDGPU_MAD_U64_U32:
4010 case AMDGPU::G_AMDGPU_MAD_I64_I32: {
4011 // Three possible mappings:
4012 //
4013 // - Default SOP
4014 // - Default VOP
4015 // - Scalar multiply: src0 and src1 are SGPRs, the rest is VOP.
4016 //
4017 // This allows instruction selection to keep the multiplication part of the
4018 // instruction on the SALU.
4019 bool AllSalu = true;
4020 bool MulSalu = true;
4021 for (unsigned i = 0; i < 5; ++i) {
4022 Register Reg = MI.getOperand(i).getReg();
4023 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
4024 if (Bank->getID() != AMDGPU::SGPRRegBankID) {
4025 AllSalu = false;
4026 if (i == 2 || i == 3) {
4027 MulSalu = false;
4028 break;
4029 }
4030 }
4031 }
4032 }
4033
4034 if (AllSalu)
4035 return getDefaultMappingSOP(MI);
4036
4037 // If the multiply-add is full-rate in VALU, use that even if the
4038 // multiplication part is scalar. Accumulating separately on the VALU would
4039 // take two instructions.
4040 if (!MulSalu || Subtarget.hasFullRate64Ops())
4041 return getDefaultMappingVOP(MI);
4042
4043 // Keep the multiplication on the SALU, then accumulate on the VALU.
4044 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64);
4045 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4046 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
4047 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
4048 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64);
4049 break;
4050 }
4051 case AMDGPU::G_IMPLICIT_DEF: {
4052 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4053 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4054 break;
4055 }
4056 case AMDGPU::G_FCONSTANT:
4057 case AMDGPU::G_CONSTANT:
4058 case AMDGPU::G_GLOBAL_VALUE:
4059 case AMDGPU::G_BLOCK_ADDR:
4060 case AMDGPU::G_READSTEADYCOUNTER:
4061 case AMDGPU::G_READCYCLECOUNTER: {
4062 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4063 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4064 break;
4065 }
4066 case AMDGPU::G_FRAME_INDEX: {
4067 // TODO: This should be the same as other constants, but eliminateFrameIndex
4068 // currently assumes VALU uses.
4069 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4070 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4071 break;
4072 }
4073 case AMDGPU::G_DYN_STACKALLOC: {
4074 // Result is always uniform, and a wave reduction is needed for the source.
4075 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
4076 unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
4077 OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, 32);
4078 break;
4079 }
4080 case AMDGPU::G_AMDGPU_WAVE_ADDRESS: {
4081 // This case is weird because we expect a physical register in the source,
4082 // but need to set a bank anyway.
4083 //
4084 // TODO: We could select the result to SGPR or VGPR
4085 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
4086 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
4087 break;
4088 }
4089 case AMDGPU::G_INSERT: {
4090 unsigned BankID = getMappingType(MRI, MI);
4091 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4092 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4093 unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
4094 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
4095 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
4096 OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize);
4097 OpdsMapping[3] = nullptr;
4098 break;
4099 }
4100 case AMDGPU::G_EXTRACT: {
4101 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
4102 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4103 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4104 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
4105 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
4106 OpdsMapping[2] = nullptr;
4107 break;
4108 }
4109 case AMDGPU::G_BUILD_VECTOR:
4110 case AMDGPU::G_BUILD_VECTOR_TRUNC: {
4111 LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
4112 if (DstTy == LLT::fixed_vector(2, 16)) {
4113 unsigned DstSize = DstTy.getSizeInBits();
4114 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4115 unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
4116 unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
4117 unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID);
4118
4119 OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize);
4120 OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize);
4121 OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize);
4122 break;
4123 }
4124
4125 [[fallthrough]];
4126 }
4127 case AMDGPU::G_MERGE_VALUES:
4128 case AMDGPU::G_CONCAT_VECTORS: {
4129 unsigned Bank = getMappingType(MRI, MI);
4130 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4131 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4132
4133 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
4134 // Op1 and Dst should use the same register bank.
4135 for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i)
4136 OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize);
4137 break;
4138 }
4139 case AMDGPU::G_BITREVERSE:
4140 case AMDGPU::G_BITCAST:
4141 case AMDGPU::G_INTTOPTR:
4142 case AMDGPU::G_PTRTOINT:
4143 case AMDGPU::G_FABS:
4144 case AMDGPU::G_FNEG: {
4145 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4146 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
4147 OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
4148 break;
4149 }
4150 case AMDGPU::G_AMDGPU_FFBH_U32:
4151 case AMDGPU::G_AMDGPU_FFBL_B32:
4152 case AMDGPU::G_CTLZ_ZERO_UNDEF:
4153 case AMDGPU::G_CTTZ_ZERO_UNDEF: {
4154 unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4155 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
4156 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
4157 OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(BankID, Size);
4158 break;
4159 }
4160 case AMDGPU::G_CTPOP: {
4161 unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4162 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
4163 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
4164
4165 // This should really be getValueMappingSGPR64Only, but allowing the generic
4166 // code to handle the register split just makes using LegalizerHelper more
4167 // difficult.
4168 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
4169 break;
4170 }
4171 case AMDGPU::G_TRUNC: {
4172 Register Dst = MI.getOperand(0).getReg();
4173 Register Src = MI.getOperand(1).getReg();
4174 unsigned Bank = getRegBankID(Src, MRI);
4175 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
4176 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
4177 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
4178 OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize);
4179 break;
4180 }
4181 case AMDGPU::G_ZEXT:
4182 case AMDGPU::G_SEXT:
4183 case AMDGPU::G_ANYEXT:
4184 case AMDGPU::G_SEXT_INREG: {
4185 Register Dst = MI.getOperand(0).getReg();
4186 Register Src = MI.getOperand(1).getReg();
4187 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
4188 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
4189
4190 unsigned DstBank;
4191 const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI);
4192 assert(SrcBank);
4193 switch (SrcBank->getID()) {
4194 case AMDGPU::SGPRRegBankID:
4195 DstBank = AMDGPU::SGPRRegBankID;
4196 break;
4197 default:
4198 DstBank = AMDGPU::VGPRRegBankID;
4199 break;
4200 }
4201
4202 // Scalar extend can use 64-bit BFE, but VGPRs require extending to
4203 // 32-bits, and then to 64.
4204 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize);
4205 OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(),
4206 SrcSize);
4207 break;
4208 }
4209 case AMDGPU::G_IS_FPCLASS: {
4210 Register SrcReg = MI.getOperand(1).getReg();
4211 unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
4212 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4213 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
4214 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4215 break;
4216 }
4217 case AMDGPU::G_STORE: {
4218 assert(MI.getOperand(0).isReg());
4219 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4220
4221 // FIXME: We need to specify a different reg bank once scalar stores are
4222 // supported.
4223 const ValueMapping *ValMapping =
4224 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4225 OpdsMapping[0] = ValMapping;
4226 OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
4227 break;
4228 }
4229 case AMDGPU::G_ICMP:
4230 case AMDGPU::G_FCMP: {
4231 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4232
4233 // See if the result register has already been constrained to vcc, which may
4234 // happen due to control flow intrinsic lowering.
4235 unsigned DstBank = getRegBankID(MI.getOperand(0).getReg(), MRI,
4236 AMDGPU::SGPRRegBankID);
4237 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);
4238 unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI);
4239
4240 auto canUseSCCICMP = [&]() {
4241 auto Pred =
4242 static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
4243 return Size == 32 ||
4244 (Size == 64 &&
4245 (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
4246 Subtarget.hasScalarCompareEq64());
4247 };
4248 auto canUseSCCFCMP = [&]() {
4249 return Subtarget.hasSALUFloatInsts() && (Size == 32 || Size == 16);
4250 };
4251
4252 bool isICMP = MI.getOpcode() == AMDGPU::G_ICMP;
4253 bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID &&
4254 Op2Bank == AMDGPU::SGPRRegBankID &&
4255 Op3Bank == AMDGPU::SGPRRegBankID &&
4256 (isICMP ? canUseSCCICMP() : canUseSCCFCMP());
4257
4258 DstBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
4259 unsigned SrcBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
4260
4261 // TODO: Use 32-bit for scalar output size.
4262 // SCC results will need to be copied to a 32-bit SGPR virtual register.
4263 const unsigned ResultSize = 1;
4264
4265 OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, ResultSize);
4266 OpdsMapping[1] = nullptr; // Predicate Operand.
4267 OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, Size);
4268 OpdsMapping[3] = AMDGPU::getValueMapping(SrcBank, Size);
4269 break;
4270 }
4271 case AMDGPU::G_EXTRACT_VECTOR_ELT: {
4272 // VGPR index can be used for waterfall when indexing an SGPR vector.
4273 unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
4274 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4275 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4276 unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4277 unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI);
4278 unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank);
4279
4280 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize);
4281 OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize);
4282
4283 // The index can be in either bank if the source vector is VGPR.
4284 OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4285 break;
4286 }
4287 case AMDGPU::G_INSERT_VECTOR_ELT: {
4288 unsigned OutputBankID = isSALUMapping(MI) ?
4289 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
4290
4291 unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4292 unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4293 unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
4294 unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
4295 unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI);
4296
4297 OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
4298 OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize);
4299
4300 // This is a weird case, because we need to break down the mapping based on
4301 // the register bank of a different operand.
4302 if (InsertSize == 64 && OutputBankID == AMDGPU::VGPRRegBankID) {
4303 OpdsMapping[2] = AMDGPU::getValueMappingSplit64(InsertEltBankID,
4304 InsertSize);
4305 } else {
4306 assert(InsertSize == 32 || InsertSize == 64);
4307 OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBankID, InsertSize);
4308 }
4309
4310 // The index can be in either bank if the source vector is VGPR.
4311 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize);
4312 break;
4313 }
4314 case AMDGPU::G_UNMERGE_VALUES: {
4315 unsigned Bank = getMappingType(MRI, MI);
4316
4317 // Op1 and Dst should use the same register bank.
4318 // FIXME: Shouldn't this be the default? Why do we need to handle this?
4319 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
4320 unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
4321 OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size);
4322 }
4323 break;
4324 }
4325 case AMDGPU::G_AMDGPU_BUFFER_LOAD:
4326 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
4327 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
4328 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
4329 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
4330 case AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE:
4331 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE:
4332 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE_TFE:
4333 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE:
4334 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT_TFE:
4335 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
4336 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE:
4337 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
4338 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
4339 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
4340 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
4341 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16:
4342 case AMDGPU::G_AMDGPU_BUFFER_STORE:
4343 case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
4344 case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
4345 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
4346 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: {
4347 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4348
4349 // rsrc
4350 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4351
4352 // vindex
4353 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4354
4355 // voffset
4356 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4357
4358 // soffset
4359 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4360
4361 // Any remaining operands are immediates and were correctly null
4362 // initialized.
4363 break;
4364 }
4365 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
4366 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
4367 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
4368 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
4369 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
4370 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
4371 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
4372 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
4373 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
4374 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
4375 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
4376 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
4377 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
4378 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
4379 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
4380 // vdata_out
4381 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4382
4383 // vdata_in
4384 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4385
4386 // rsrc
4387 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4388
4389 // vindex
4390 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4391
4392 // voffset
4393 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4394
4395 // soffset
4396 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4397
4398 // Any remaining operands are immediates and were correctly null
4399 // initialized.
4400 break;
4401 }
4402 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
4403 // vdata_out
4404 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4405
4406 // vdata_in
4407 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4408
4409 // cmp
4410 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4411
4412 // rsrc
4413 OpdsMapping[3] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4414
4415 // vindex
4416 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4417
4418 // voffset
4419 OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4420
4421 // soffset
4422 OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
4423
4424 // Any remaining operands are immediates and were correctly null
4425 // initialized.
4426 break;
4427 }
4428 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD:
4429 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE:
4430 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE:
4431 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT:
4432 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT: {
4433 // Lie and claim everything is legal, even though some need to be
4434 // SGPRs. applyMapping will have to deal with it as a waterfall loop.
4435 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4436 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4437
4438 // We need to convert this to a MUBUF if either the resource or the offset
4439 // is a VGPR.
4440 unsigned RSrcBank = OpdsMapping[1]->BreakDown[0].RegBank->getID();
4441 unsigned OffsetBank = OpdsMapping[2]->BreakDown[0].RegBank->getID();
4442 unsigned ResultBank = regBankUnion(RSrcBank, OffsetBank);
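// regBankUnion only stays SGPR when both inputs are SGPR; if either the
// resource or the offset is a VGPR, the result is reported as VGPR and the
// scalar buffer load has to be rewritten to the VMEM (MUBUF) form when the
// mapping is applied.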
4443
4444 unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4445 OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0);
4446 break;
4447 }
4448 case AMDGPU::G_INTRINSIC:
4449 case AMDGPU::G_INTRINSIC_CONVERGENT: {
4450 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
4451 default:
4452 return getInvalidInstructionMapping();
4453 case Intrinsic::amdgcn_div_fmas:
4454 case Intrinsic::amdgcn_div_fixup:
4455 case Intrinsic::amdgcn_trig_preop:
4456 case Intrinsic::amdgcn_sin:
4457 case Intrinsic::amdgcn_cos:
4458 case Intrinsic::amdgcn_log_clamp:
4459 case Intrinsic::amdgcn_rcp_legacy:
4460 case Intrinsic::amdgcn_rsq_legacy:
4461 case Intrinsic::amdgcn_rsq_clamp:
4462 case Intrinsic::amdgcn_fmul_legacy:
4463 case Intrinsic::amdgcn_fma_legacy:
4464 case Intrinsic::amdgcn_frexp_mant:
4465 case Intrinsic::amdgcn_frexp_exp:
4466 case Intrinsic::amdgcn_fract:
4467 case Intrinsic::amdgcn_cvt_pknorm_i16:
4468 case Intrinsic::amdgcn_cvt_pknorm_u16:
4469 case Intrinsic::amdgcn_cvt_pk_i16:
4470 case Intrinsic::amdgcn_cvt_pk_u16:
4471 case Intrinsic::amdgcn_fmed3:
4472 case Intrinsic::amdgcn_cubeid:
4473 case Intrinsic::amdgcn_cubema:
4474 case Intrinsic::amdgcn_cubesc:
4475 case Intrinsic::amdgcn_cubetc:
4476 case Intrinsic::amdgcn_sffbh:
4477 case Intrinsic::amdgcn_fmad_ftz:
4478 case Intrinsic::amdgcn_mbcnt_lo:
4479 case Intrinsic::amdgcn_mbcnt_hi:
4480 case Intrinsic::amdgcn_mul_u24:
4481 case Intrinsic::amdgcn_mul_i24:
4482 case Intrinsic::amdgcn_mulhi_u24:
4483 case Intrinsic::amdgcn_mulhi_i24:
4484 case Intrinsic::amdgcn_lerp:
4485 case Intrinsic::amdgcn_sad_u8:
4486 case Intrinsic::amdgcn_msad_u8:
4487 case Intrinsic::amdgcn_sad_hi_u8:
4488 case Intrinsic::amdgcn_sad_u16:
4489 case Intrinsic::amdgcn_qsad_pk_u16_u8:
4490 case Intrinsic::amdgcn_mqsad_pk_u16_u8:
4491 case Intrinsic::amdgcn_mqsad_u32_u8:
4492 case Intrinsic::amdgcn_cvt_pk_u8_f32:
4493 case Intrinsic::amdgcn_alignbyte:
4494 case Intrinsic::amdgcn_perm:
4495 case Intrinsic::amdgcn_fdot2:
4496 case Intrinsic::amdgcn_sdot2:
4497 case Intrinsic::amdgcn_udot2:
4498 case Intrinsic::amdgcn_sdot4:
4499 case Intrinsic::amdgcn_udot4:
4500 case Intrinsic::amdgcn_sdot8:
4501 case Intrinsic::amdgcn_udot8:
4502 case Intrinsic::amdgcn_fdot2_bf16_bf16:
4503 case Intrinsic::amdgcn_fdot2_f16_f16:
4504 case Intrinsic::amdgcn_fdot2_f32_bf16:
4505 case Intrinsic::amdgcn_sudot4:
4506 case Intrinsic::amdgcn_sudot8:
4507 case Intrinsic::amdgcn_dot4_f32_fp8_bf8:
4508 case Intrinsic::amdgcn_dot4_f32_bf8_fp8:
4509 case Intrinsic::amdgcn_dot4_f32_fp8_fp8:
4510 case Intrinsic::amdgcn_dot4_f32_bf8_bf8:
4511 case Intrinsic::amdgcn_cvt_f32_fp8:
4512 case Intrinsic::amdgcn_cvt_f32_bf8:
4513 case Intrinsic::amdgcn_cvt_pk_f32_fp8:
4514 case Intrinsic::amdgcn_cvt_pk_f32_bf8:
4515 case Intrinsic::amdgcn_cvt_pk_fp8_f32:
4516 case Intrinsic::amdgcn_cvt_pk_bf8_f32:
4517 case Intrinsic::amdgcn_cvt_sr_fp8_f32:
4518 case Intrinsic::amdgcn_cvt_sr_bf8_f32:
4519 case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16:
4520 case Intrinsic::amdgcn_wmma_f16_16x16x16_f16:
4521 case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied:
4522 case Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied:
4523 case Intrinsic::amdgcn_wmma_f32_16x16x16_bf16:
4524 case Intrinsic::amdgcn_wmma_f32_16x16x16_f16:
4525 case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4:
4526 case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8:
4527 case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8:
4528 case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8:
4529 case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8:
4530 case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8:
4531 case Intrinsic::amdgcn_wmma_i32_16x16x32_iu4:
4532 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
4533 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
4534 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
4535 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
4536 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
4537 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
4538 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4:
4539 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
4540 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
4541 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
4542 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8:
4543 return getDefaultMappingVOP(MI);
4544 case Intrinsic::amdgcn_log:
4545 case Intrinsic::amdgcn_exp2:
4546 case Intrinsic::amdgcn_rcp:
4547 case Intrinsic::amdgcn_rsq:
4548 case Intrinsic::amdgcn_sqrt: {
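// Subtargets with pseudo-scalar transcendental instructions can execute the
// 16-bit and 32-bit forms of these operations on the SALU when every input
// is uniform, so prefer the SOP mapping in that case; everything else is
// VALU only.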
4549 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4550 if (Subtarget.hasPseudoScalarTrans() && (Size == 16 || Size == 32) &&
4551 isSALUMapping(MI))
4552 return getDefaultMappingSOP(MI);
4553 return getDefaultMappingVOP(MI);
4554 }
4555 case Intrinsic::amdgcn_sbfe:
4556 case Intrinsic::amdgcn_ubfe:
4557 if (isSALUMapping(MI))
4558 return getDefaultMappingSOP(MI);
4559 return getDefaultMappingVOP(MI);
4560 case Intrinsic::amdgcn_ds_swizzle:
4561 case Intrinsic::amdgcn_ds_permute:
4562 case Intrinsic::amdgcn_ds_bpermute:
4563 case Intrinsic::amdgcn_update_dpp:
4564 case Intrinsic::amdgcn_mov_dpp8:
4565 case Intrinsic::amdgcn_mov_dpp:
4566 case Intrinsic::amdgcn_strict_wwm:
4567 case Intrinsic::amdgcn_wwm:
4568 case Intrinsic::amdgcn_strict_wqm:
4569 case Intrinsic::amdgcn_wqm:
4570 case Intrinsic::amdgcn_softwqm:
4571 case Intrinsic::amdgcn_set_inactive:
4572 case Intrinsic::amdgcn_set_inactive_chain_arg:
4573 case Intrinsic::amdgcn_permlane64:
4574 return getDefaultMappingAllVGPR(MI);
4575 case Intrinsic::amdgcn_cvt_pkrtz:
4576 if (Subtarget.hasSALUFloatInsts() && isSALUMapping(MI))
4577 return getDefaultMappingSOP(MI);
4578 return getDefaultMappingVOP(MI);
4579 case Intrinsic::amdgcn_kernarg_segment_ptr:
4580 case Intrinsic::amdgcn_s_getpc:
4581 case Intrinsic::amdgcn_groupstaticsize:
4582 case Intrinsic::amdgcn_reloc_constant:
4583 case Intrinsic::returnaddress: {
4584 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4585 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4586 break;
4587 }
4588 case Intrinsic::amdgcn_wqm_vote: {
4589 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4590 OpdsMapping[0] = OpdsMapping[2]
4591 = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size);
4592 break;
4593 }
4594 case Intrinsic::amdgcn_ps_live: {
4595 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4596 break;
4597 }
4598 case Intrinsic::amdgcn_div_scale: {
4599 unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4600 unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4601 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size);
4602 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size);
4603
4604 unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
4605 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4606 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4607 break;
4608 }
4609 case Intrinsic::amdgcn_class: {
4610 Register Src0Reg = MI.getOperand(2).getReg();
4611 Register Src1Reg = MI.getOperand(3).getReg();
4612 unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits();
4613 unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits();
4614 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4615 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
4616 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src0Size);
4617 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src1Size);
4618 break;
4619 }
4620 case Intrinsic::amdgcn_icmp:
4621 case Intrinsic::amdgcn_fcmp: {
4622 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4623 // This is not VCCRegBank because this is not used in boolean contexts.
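// amdgcn_icmp/amdgcn_fcmp return the raw wave-wide comparison mask as an
// ordinary 32/64-bit integer value, while the compared operands are still
// VALU inputs.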
4624 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4625 unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4626 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
4627 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
4628 break;
4629 }
4630 case Intrinsic::amdgcn_readlane: {
4631 // This must be an SGPR, but accept a VGPR.
4632 Register IdxReg = MI.getOperand(3).getReg();
4633 unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
4634 unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
4635 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4636 [[fallthrough]];
4637 }
4638 case Intrinsic::amdgcn_readfirstlane: {
4639 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4640 unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4641 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4642 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4643 break;
4644 }
4645 case Intrinsic::amdgcn_writelane: {
4646 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4647 Register SrcReg = MI.getOperand(2).getReg();
4648 unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
4649 unsigned SrcBank = getRegBankID(SrcReg, MRI, AMDGPU::SGPRRegBankID);
4650 Register IdxReg = MI.getOperand(3).getReg();
4651 unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
4652 unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
4653 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4654
4655 // These two must be SGPRs, but accept VGPRs; a readfirstlane will be
4656 // inserted to legalize them.
4657 OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize);
4658 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4659 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4660 break;
4661 }
4662 case Intrinsic::amdgcn_if_break: {
4663 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4664 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4665 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4666 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4667 break;
4668 }
4669 case Intrinsic::amdgcn_permlane16:
4670 case Intrinsic::amdgcn_permlanex16: {
4671 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4672 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4673 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4674 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4675 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4676 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4677 break;
4678 }
4679 case Intrinsic::amdgcn_permlane16_var:
4680 case Intrinsic::amdgcn_permlanex16_var: {
4681 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4682 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4683 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4684 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4685 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4686 break;
4687 }
4688 case Intrinsic::amdgcn_mfma_f32_4x4x1f32:
4689 case Intrinsic::amdgcn_mfma_f32_4x4x4f16:
4690 case Intrinsic::amdgcn_mfma_i32_4x4x4i8:
4691 case Intrinsic::amdgcn_mfma_f32_4x4x2bf16:
4692 case Intrinsic::amdgcn_mfma_f32_16x16x1f32:
4693 case Intrinsic::amdgcn_mfma_f32_16x16x4f32:
4694 case Intrinsic::amdgcn_mfma_f32_16x16x4f16:
4695 case Intrinsic::amdgcn_mfma_f32_16x16x16f16:
4696 case Intrinsic::amdgcn_mfma_i32_16x16x4i8:
4697 case Intrinsic::amdgcn_mfma_i32_16x16x16i8:
4698 case Intrinsic::amdgcn_mfma_f32_16x16x2bf16:
4699 case Intrinsic::amdgcn_mfma_f32_16x16x8bf16:
4700 case Intrinsic::amdgcn_mfma_f32_32x32x1f32:
4701 case Intrinsic::amdgcn_mfma_f32_32x32x2f32:
4702 case Intrinsic::amdgcn_mfma_f32_32x32x4f16:
4703 case Intrinsic::amdgcn_mfma_f32_32x32x8f16:
4704 case Intrinsic::amdgcn_mfma_i32_32x32x4i8:
4705 case Intrinsic::amdgcn_mfma_i32_32x32x8i8:
4706 case Intrinsic::amdgcn_mfma_f32_32x32x2bf16:
4707 case Intrinsic::amdgcn_mfma_f32_32x32x4bf16:
4708 case Intrinsic::amdgcn_mfma_f32_32x32x4bf16_1k:
4709 case Intrinsic::amdgcn_mfma_f32_16x16x4bf16_1k:
4710 case Intrinsic::amdgcn_mfma_f32_4x4x4bf16_1k:
4711 case Intrinsic::amdgcn_mfma_f32_32x32x8bf16_1k:
4712 case Intrinsic::amdgcn_mfma_f32_16x16x16bf16_1k:
4713 case Intrinsic::amdgcn_mfma_f64_16x16x4f64:
4714 case Intrinsic::amdgcn_mfma_f64_4x4x4f64:
4715 case Intrinsic::amdgcn_mfma_i32_16x16x32_i8:
4716 case Intrinsic::amdgcn_mfma_i32_32x32x16_i8:
4717 case Intrinsic::amdgcn_mfma_f32_16x16x8_xf32:
4718 case Intrinsic::amdgcn_mfma_f32_32x32x4_xf32:
4719 case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_bf8:
4720 case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_fp8:
4721 case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_bf8:
4722 case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_fp8:
4723 case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_bf8:
4724 case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_fp8:
4725 case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_bf8:
4726 case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_fp8: {
4727 // Default for MAI intrinsics.
4728 // srcC can also be an immediate which can be folded later.
4729 // FIXME: Should we eventually add an alternative mapping with AGPR src
4730 // for srcA/srcB?
4731 //
4732 // vdst, srcA, srcB, srcC
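// The accumulator operands (vdst and srcC) may have to live in AGPRs
// (mandatory on subtargets without VGPR-form MFMAs); mayNeedAGPRs() reports
// whether that is a possibility for this function, otherwise plain VGPRs
// are preferred.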
4733 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4734 OpdsMapping[0] =
4735 Info->mayNeedAGPRs()
4736 ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI)
4737 : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4738 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4739 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4740 OpdsMapping[4] =
4741 Info->mayNeedAGPRs()
4742 ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI)
4743 : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4744 break;
4745 }
4746 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
4747 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
4748 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
4749 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
4750 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
4751 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
4752 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
4753 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
4754 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
4755 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
4756 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
4757 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
4758 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
4759 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8: {
4760 // vdst, srcA, srcB, srcC, idx
4761 OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4762 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4763 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4764 OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4765 OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4766 break;
4767 }
4768 case Intrinsic::amdgcn_interp_p1:
4769 case Intrinsic::amdgcn_interp_p2:
4770 case Intrinsic::amdgcn_interp_mov:
4771 case Intrinsic::amdgcn_interp_p1_f16:
4772 case Intrinsic::amdgcn_interp_p2_f16:
4773 case Intrinsic::amdgcn_lds_param_load: {
4774 const int M0Idx = MI.getNumOperands() - 1;
4775 Register M0Reg = MI.getOperand(M0Idx).getReg();
4776 unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
4777 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4778
4779 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4780 for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
4781 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4782
4783 // This must be an SGPR, but take whatever the original bank is; it will be
4784 // fixed later.
4785 OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
4786 break;
4787 }
4788 case Intrinsic::amdgcn_interp_inreg_p10:
4789 case Intrinsic::amdgcn_interp_inreg_p2:
4790 case Intrinsic::amdgcn_interp_inreg_p10_f16:
4791 case Intrinsic::amdgcn_interp_inreg_p2_f16:
4792 case Intrinsic::amdgcn_interp_p10_rtz_f16:
4793 case Intrinsic::amdgcn_interp_p2_rtz_f16: {
4794 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4795 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4796 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4797 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4798 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4799 break;
4800 }
4801 case Intrinsic::amdgcn_ballot: {
4802 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4803 unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
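// The ballot result is the wave-wide mask as a plain integer, so it lives
// in SGPRs; the source is a per-lane boolean and therefore uses the VCC
// bank.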
4804 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4805 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, SrcSize);
4806 break;
4807 }
4808 case Intrinsic::amdgcn_inverse_ballot: {
4809 // This must be an SGPR, but accept a VGPR.
4810 Register MaskReg = MI.getOperand(2).getReg();
4811 unsigned MaskSize = MRI.getType(MaskReg).getSizeInBits();
4812 unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID);
4813 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4814 OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize);
4815 break;
4816 }
4817 case Intrinsic::amdgcn_s_quadmask:
4818 case Intrinsic::amdgcn_s_wqm: {
4819 Register MaskReg = MI.getOperand(2).getReg();
4820 unsigned MaskSize = MRI.getType(MaskReg).getSizeInBits();
4821 unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID);
4822 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, MaskSize);
4823 OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize);
4824 break;
4825 }
4826 case Intrinsic::amdgcn_wave_reduce_umin:
4827 case Intrinsic::amdgcn_wave_reduce_umax: {
4828 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4829 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4830 unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4831 auto regBankID =
4832 isSALUMapping(MI) ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
4833 OpdsMapping[2] = AMDGPU::getValueMapping(regBankID, OpSize);
4834 break;
4835 }
4836 case Intrinsic::amdgcn_s_bitreplicate:
4837 Register MaskReg = MI.getOperand(2).getReg();
4838 unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID);
4839 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64);
4840 OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, 32);
4841 }
4842 break;
4843 }
4844 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
4845 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
4846 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET:
4847 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
4848 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
4849 auto IntrID = AMDGPU::getIntrinsicID(MI);
4850 const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID);
4851 assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic");
4852 // Non-image cases can have complications from operands that allow both SGPR
4853 // and VGPR. For now it's too complicated to figure out the final opcode
4854 // needed to derive the register bank from the MCInstrDesc.
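// getImageMapping reports the current bank for the resource descriptor (and
// the sampler that follows it, when present), since those must ultimately be
// SGPRs, and maps every other register operand to a VGPR.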
4855 assert(RSrcIntrin->IsImage);
4856 return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg);
4857 }
4858 case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
4859 unsigned N = MI.getNumExplicitOperands() - 2;
4860 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 128);
4861 OpdsMapping[N] = getSGPROpMapping(MI.getOperand(N).getReg(), MRI, *TRI);
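// Operand N is the 128-bit texture descriptor, which must ultimately be
// uniform (hence the SGPR mapping); the node pointer and ray data are
// VGPRs, either packed into a single wide register (sequential form) or
// passed as separate registers (NSA form).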
4862 if (N == 3) {
4863 // Sequential form: all operands combined into VGPR256/VGPR512
4864 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4865 if (Size > 256)
4866 Size = 512;
4867 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4868 } else {
4869 // NSA form
4870 for (unsigned I = 2; I < N; ++I) {
4871 unsigned Size = MRI.getType(MI.getOperand(I).getReg()).getSizeInBits();
4872 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4873 }
4874 }
4875 break;
4876 }
4877 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
4878 case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS: {
4879 auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
4880 switch (IntrID) {
4881 case Intrinsic::amdgcn_s_getreg:
4882 case Intrinsic::amdgcn_s_memtime:
4883 case Intrinsic::amdgcn_s_memrealtime:
4884 case Intrinsic::amdgcn_s_get_waveid_in_workgroup:
4885 case Intrinsic::amdgcn_s_sendmsg_rtn: {
4886 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4887 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4888 break;
4889 }
4890 case Intrinsic::amdgcn_global_atomic_fadd:
4891 case Intrinsic::amdgcn_global_atomic_csub:
4892 case Intrinsic::amdgcn_global_atomic_fmin:
4893 case Intrinsic::amdgcn_global_atomic_fmax:
4894 case Intrinsic::amdgcn_global_atomic_fmin_num:
4895 case Intrinsic::amdgcn_global_atomic_fmax_num:
4896 case Intrinsic::amdgcn_flat_atomic_fadd:
4897 case Intrinsic::amdgcn_flat_atomic_fmin:
4898 case Intrinsic::amdgcn_flat_atomic_fmax:
4899 case Intrinsic::amdgcn_flat_atomic_fmin_num:
4900 case Intrinsic::amdgcn_flat_atomic_fmax_num:
4901 case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
4902 case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
4903 case Intrinsic::amdgcn_atomic_cond_sub_u32:
4904 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
4905 case Intrinsic::amdgcn_global_load_tr_b64:
4906 case Intrinsic::amdgcn_global_load_tr_b128:
4907 return getDefaultMappingAllVGPR(MI);
4908 case Intrinsic::amdgcn_ds_ordered_add:
4909 case Intrinsic::amdgcn_ds_ordered_swap: {
4910 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4911 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4912 unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4913 AMDGPU::SGPRRegBankID);
4914 OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32);
4915 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4916 break;
4917 }
4918 case Intrinsic::amdgcn_ds_append:
4919 case Intrinsic::amdgcn_ds_consume: {
4920 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4921 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4922 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4923 break;
4924 }
4925 case Intrinsic::amdgcn_exp_compr:
4926 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4927 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4928 break;
4929 case Intrinsic::amdgcn_exp:
4930 // FIXME: Could we support packed types here?
4931 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4932 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4933 OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4934 OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4935 break;
4936 case Intrinsic::amdgcn_exp_row:
4937 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4938 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4939 OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4940 OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
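// The row index ends up in M0 during selection, so it must be a uniform
// (SGPR) value.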
4941 OpdsMapping[8] = getSGPROpMapping(MI.getOperand(8).getReg(), MRI, *TRI);
4942 break;
4943 case Intrinsic::amdgcn_s_sendmsg:
4944 case Intrinsic::amdgcn_s_sendmsghalt: {
4945 // This must be an SGPR, but accept a VGPR.
4946 unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4947 AMDGPU::SGPRRegBankID);
4948 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
4949 break;
4950 }
4951 case Intrinsic::amdgcn_s_setreg: {
4952 // This must be an SGPR, but accept a VGPR.
4953 unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4954 AMDGPU::SGPRRegBankID);
4955 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
4956 break;
4957 }
4958 case Intrinsic::amdgcn_s_ttracedata: {
4959 // This must be an SGPR, but accept a VGPR.
4960 unsigned Bank =
4961 getRegBankID(MI.getOperand(1).getReg(), MRI, AMDGPU::SGPRRegBankID);
4962 OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
4963 break;
4964 }
4965 case Intrinsic::amdgcn_end_cf: {
4966 unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4967 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4968 break;
4969 }
4970 case Intrinsic::amdgcn_else: {
4971 unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4972 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4973 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
4974 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
4975 break;
4976 }
4977 case Intrinsic::amdgcn_live_mask: {
4978 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4979 break;
4980 }
4981 case Intrinsic::amdgcn_wqm_demote:
4982 case Intrinsic::amdgcn_kill: {
4983 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4984 break;
4985 }
4986 case Intrinsic::amdgcn_raw_buffer_load:
4987 case Intrinsic::amdgcn_raw_ptr_buffer_load:
4988 case Intrinsic::amdgcn_raw_atomic_buffer_load:
4989 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
4990 case Intrinsic::amdgcn_raw_tbuffer_load:
4991 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
4992 // FIXME: Should make the intrinsic ID the last operand of the instruction;
4993 // then this would be the same as the store case.
4994 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4995 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4996 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4997 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4998 break;
4999 }
5000 case Intrinsic::amdgcn_raw_buffer_load_lds:
5001 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: {
5002 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
5003 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5004 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
5005 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
5006 break;
5007 }
5008 case Intrinsic::amdgcn_raw_buffer_store:
5009 case Intrinsic::amdgcn_raw_ptr_buffer_store:
5010 case Intrinsic::amdgcn_raw_buffer_store_format:
5011 case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
5012 case Intrinsic::amdgcn_raw_tbuffer_store:
5013 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
5014 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
5015 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5016 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
5017 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
5018 break;
5019 }
5020 case Intrinsic::amdgcn_struct_buffer_load:
5021 case Intrinsic::amdgcn_struct_ptr_buffer_load:
5022 case Intrinsic::amdgcn_struct_tbuffer_load:
5023 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
5024 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
5025 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5026 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
5027 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
5028 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
5029 break;
5030 }
5031 case Intrinsic::amdgcn_struct_buffer_load_lds:
5032 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
5033 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
5034 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5035 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
5036 OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
5037 OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
5038 break;
5039 }
5040 case Intrinsic::amdgcn_struct_buffer_store:
5041 case Intrinsic::amdgcn_struct_ptr_buffer_store:
5042 case Intrinsic::amdgcn_struct_tbuffer_store:
5043 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
5044 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
5045 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5046 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
5047 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
5048 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
5049 break;
5050 }
5051 case Intrinsic::amdgcn_init_exec_from_input: {
5052 unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
5053 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
5054 break;
5055 }
5056 case Intrinsic::amdgcn_ds_gws_init:
5057 case Intrinsic::amdgcn_ds_gws_barrier:
5058 case Intrinsic::amdgcn_ds_gws_sema_br: {
5059 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
5060
5061 // This must be an SGPR, but accept a VGPR.
5062 unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
5063 AMDGPU::SGPRRegBankID);
5064 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
5065 break;
5066 }
5067 case Intrinsic::amdgcn_ds_gws_sema_v:
5068 case Intrinsic::amdgcn_ds_gws_sema_p:
5069 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
5070 // This must be an SGPR, but accept a VGPR.
5071 unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI,
5072 AMDGPU::SGPRRegBankID);
5073 OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
5074 break;
5075 }
5076 case Intrinsic::amdgcn_global_load_lds: {
5077 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
5078 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5079 break;
5080 }
5081 case Intrinsic::amdgcn_lds_direct_load: {
5082 const int M0Idx = MI.getNumOperands() - 1;
5083 Register M0Reg = MI.getOperand(M0Idx).getReg();
5084 unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
5085 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
5086
5087 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
5088 for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
5089 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
5090
5091 // This must be an SGPR, but take whatever the original bank is; it will be
5092 // fixed later.
5093 OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
5094 break;
5095 }
5096 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
5097 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn:
5098 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
5099 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5100 break;
5101 case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
5102 OpdsMapping[0] =
5103 getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); // %vdst
5104 OpdsMapping[1] =
5105 getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); // %addr
5106 OpdsMapping[3] =
5107 getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); // %addr
5108 OpdsMapping[4] =
5109 getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); // %data0
5110 OpdsMapping[5] =
5111 getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); // %data1
5112 break;
5113 }
5114 case Intrinsic::amdgcn_s_sleep_var:
5115 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
5116 break;
5117 case Intrinsic::amdgcn_s_barrier_signal_var:
5118 case Intrinsic::amdgcn_s_barrier_join:
5119 case Intrinsic::amdgcn_s_wakeup_barrier:
5120 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
5121 break;
5122 case Intrinsic::amdgcn_s_barrier_init:
5123 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
5124 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5125 break;
5126 case Intrinsic::amdgcn_s_barrier_signal_isfirst_var: {
5127 const unsigned ResultSize = 1;
5128 OpdsMapping[0] =
5129 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, ResultSize);
5130 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5131 break;
5132 }
5133 case Intrinsic::amdgcn_s_barrier_signal_isfirst:
5134 case Intrinsic::amdgcn_s_barrier_leave: {
5135 const unsigned ResultSize = 1;
5136 OpdsMapping[0] =
5137 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, ResultSize);
5138 break;
5139 }
5140 case Intrinsic::amdgcn_s_get_barrier_state: {
5141 OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
5142 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5143 break;
5144 }
5145 case Intrinsic::amdgcn_pops_exiting_wave_id:
5146 return getDefaultMappingSOP(MI);
5147 default:
5148 return getInvalidInstructionMapping();
5149 }
5150 break;
5151 }
5152 case AMDGPU::G_SELECT: {
5153 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
5154 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
5155 AMDGPU::SGPRRegBankID);
5156 unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI,
5157 AMDGPU::SGPRRegBankID);
5158 bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID &&
5159 Op3Bank == AMDGPU::SGPRRegBankID;
5160
5161 unsigned CondBankDefault = SGPRSrcs ?
5162 AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
5163 unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI,
5164 CondBankDefault);
5165 if (CondBank == AMDGPU::SGPRRegBankID)
5166 CondBank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
5167 else if (CondBank == AMDGPU::VGPRRegBankID)
5168 CondBank = AMDGPU::VCCRegBankID;
5169
5170 unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SGPRRegBankID ?
5171 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
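// The select can only stay on the SALU if both value inputs and the
// condition are scalar; a VCC condition means this is really a per-lane
// select, so the values must be VGPRs.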
5172
5173 assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SGPRRegBankID);
5174
5175 // TODO: Should report 32-bit for scalar condition type.
5176 if (Size == 64) {
5177 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
5178 OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
5179 OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
5180 OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
5181 } else {
5182 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);
5183 OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
5184 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);
5185 OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);
5186 }
5187
5188 break;
5189 }
5190
5191 case AMDGPU::G_SI_CALL: {
5192 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64);
5193 // Lie and claim everything is legal, even though some need to be
5194 // SGPRs. applyMapping will have to deal with it as a waterfall loop.
5195 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
5196
5197 // Allow anything for implicit arguments
5198 for (unsigned I = 4; I < MI.getNumOperands(); ++I) {
5199 if (MI.getOperand(I).isReg()) {
5200 Register Reg = MI.getOperand(I).getReg();
5201 auto OpBank = getRegBankID(Reg, MRI);
5202 unsigned Size = getSizeInBits(Reg, MRI, *TRI);
5203 OpdsMapping[I] = AMDGPU::getValueMapping(OpBank, Size);
5204 }
5205 }
5206 break;
5207 }
5208 case AMDGPU::G_LOAD:
5209 case AMDGPU::G_ZEXTLOAD:
5210 case AMDGPU::G_SEXTLOAD:
5211 return getInstrMappingForLoad(MI);
5212
5213 case AMDGPU::G_ATOMICRMW_XCHG:
5214 case AMDGPU::G_ATOMICRMW_ADD:
5215 case AMDGPU::G_ATOMICRMW_SUB:
5216 case AMDGPU::G_ATOMICRMW_AND:
5217 case AMDGPU::G_ATOMICRMW_OR:
5218 case AMDGPU::G_ATOMICRMW_XOR:
5219 case AMDGPU::G_ATOMICRMW_MAX:
5220 case AMDGPU::G_ATOMICRMW_MIN:
5221 case AMDGPU::G_ATOMICRMW_UMAX:
5222 case AMDGPU::G_ATOMICRMW_UMIN:
5223 case AMDGPU::G_ATOMICRMW_FADD:
5224 case AMDGPU::G_ATOMICRMW_FMIN:
5225 case AMDGPU::G_ATOMICRMW_FMAX:
5226 case AMDGPU::G_ATOMICRMW_UINC_WRAP:
5227 case AMDGPU::G_ATOMICRMW_UDEC_WRAP:
5228 case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: {
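// Atomics are memory operations, so the result and the data operands are
// always VGPRs. The pointer mapping comes from getValueMappingForPtr, which
// only preserves an SGPR address when the address space supports a scalar
// base; otherwise the address is a VGPR as well.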
5229 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
5230 OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
5231 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5232 break;
5233 }
5234 case AMDGPU::G_ATOMIC_CMPXCHG: {
5235 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
5236 OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
5237 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5238 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
5239 break;
5240 }
5241 case AMDGPU::G_BRCOND: {
5242 unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI,
5243 AMDGPU::SGPRRegBankID);
5244 assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
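// An SGPR s1 condition is a uniform branch (it will be selected around
// SCC); anything else is a divergent lane-mask condition and belongs in
// the VCC bank.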
5245 if (Bank != AMDGPU::SGPRRegBankID)
5246 Bank = AMDGPU::VCCRegBankID;
5247
5248 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
5249 break;
5250 }
5251 case AMDGPU::G_FPTRUNC_ROUND_UPWARD:
5252 case AMDGPU::G_FPTRUNC_ROUND_DOWNWARD:
5253 return getDefaultMappingVOP(MI);
5254 case AMDGPU::G_PREFETCH:
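// Prefetches are only selected to scalar instructions, so report an SGPR
// address; a divergent address is not something we can prefetch and is
// expected to be dropped when the mapping is applied.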
5255 OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
5256 break;
5257 }
5258
5259 return getInstructionMapping(/*ID*/1, /*Cost*/1,
5260 getOperandsMapping(OpdsMapping),
5261 MI.getNumOperands());
5262 }
5263