//===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the RegisterBankInfo class for
/// AMDGPU.
///
/// \par
///
/// AMDGPU has unique register bank constraints that require special high level
/// strategies to deal with. There are two main true physical register banks:
/// VGPR (vector) and SGPR (scalar). Additionally the VCC register bank is a
/// sort of pseudo-register bank needed to represent SGPRs used in a vector
/// boolean context. There is also the AGPR bank, which is a special purpose
/// physical register bank present on some subtargets.
///
/// Copying from VGPR to SGPR is generally illegal, unless the value is known to
/// be uniform. It is generally not valid to legalize operands by inserting
/// copies as on other targets. Operations which require uniform, SGPR operands
/// generally require scalarization by repeatedly executing the instruction,
/// activating each set of lanes using a unique set of input values. This is
/// referred to as a waterfall loop.
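///
/// A sketch of the expansion (simplified; the real sequence is built by
/// executeInWaterfallLoop below):
///
///   %saved_exec = save EXEC
/// loop:
///   %s    = V_READFIRSTLANE_B32 %v  ; pick one lane's value
///   %mask = V_CMP_EQ %s, %v         ; all lanes sharing that value
///   S_AND_SAVEEXEC %mask            ; enable only those lanes
///   ... use %s as a uniform operand ...
///   S_XOR_term EXEC, EXEC, ...      ; retire the handled lanes
///   S_CBRANCH loop                  ; repeat while any lanes remain
///   restore EXEC from %saved_exec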
///
/// \par Booleans
///
/// Booleans (s1 values) require special consideration. A vector compare result
/// is naturally a bitmask with one bit per lane, in a 32 or 64-bit
/// register. These are represented with the VCC bank. During selection, we need
/// to be able to unambiguously go back from a register class to a register
/// bank. To distinguish whether an SGPR should use the SGPR or VCC register
/// bank, we need to know the use context type. An SGPR s1 value always means a
/// VCC bank value, otherwise it will be the SGPR bank. A scalar compare sets
/// SCC, which is a 1-bit unaddressable register. This will need to be copied to
/// a 32-bit virtual register. Taken together, this means we need to adjust the
/// type of boolean operations to be regbank legal. All SALU booleans need to be
/// widened to 32-bits, and all VALU booleans need to be s1 values.
///
/// A noteworthy exception to the s1-means-vcc rule is for legalization artifact
/// casts. G_TRUNC s1 results, and G_SEXT/G_ZEXT/G_ANYEXT sources are never vcc
/// bank. A non-boolean source (such as a truncate from a 1-bit load from
/// memory) will require a copy to the VCC bank which will require clearing the
/// high bits and inserting a compare.
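///
/// A small sketch of the resulting regbank-legal forms (simplified MIR):
///
///   %sbool:sgpr(s32) = G_ICMP intpred(eq), %a:sgpr, %b:sgpr ; SALU bool, s32
///   %vbool:vcc(s1)   = G_ICMP intpred(eq), %x:vgpr, %y:vgpr ; VALU bool, s1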
///
/// \par Constant bus restriction
///
/// VALU instructions have a limitation known as the constant bus
/// restriction. Most VALU instructions can use SGPR operands, but may read at
/// most 1 SGPR or constant literal value (this is relaxed to 2 in gfx10 for
/// most instructions). This is one unique SGPR, so the same SGPR may be used
/// for multiple operands. From a register bank perspective, any combination of
/// operands should be legal as an SGPR, but this is contextually dependent on
/// the SGPR operands all being the same register. It is therefore optimal to
/// choose the SGPR with the most uses to minimize the number of copies.
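///
/// For example, on a pre-gfx10 target:
///
///   v_add_f32 v0, s0, s0  ; legal: one unique SGPR, read for both operands
///   v_add_f32 v0, s0, s1  ; illegal: two unique SGPRs on the constant bus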
///
/// We avoid trying to solve this problem in RegBankSelect. Any VALU G_*
/// operation should have its source operands all mapped to VGPRs (except for
/// VCC), inserting copies from any SGPR operands. This is the most trivial
/// legal mapping. Anything beyond the simplest 1:1 instruction selection would
/// be too complicated to solve here. Every optimization pattern or instruction
/// selected to multiple outputs would have to enforce this rule, and there
/// would be additional complexity in tracking this rule for every G_*
/// operation. By forcing all inputs to VGPRs, it also simplifies the task of
/// picking the optimal operand combination from a post-isel optimization pass.
///
//===----------------------------------------------------------------------===//

#include "AMDGPURegisterBankInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/RegisterBank.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define GET_TARGET_REGBANK_IMPL
#include "AMDGPUGenRegisterBank.inc"

// This file will be TableGen'ed at some point.
#include "AMDGPUGenRegisterBankInfo.def"

using namespace llvm;
using namespace MIPatternMatch;

namespace {

// Observer to apply a register bank to new registers created by
// LegalizerHelper.
class ApplyRegBankMapping final : public GISelChangeObserver {
private:
  MachineIRBuilder &B;
  const AMDGPURegisterBankInfo &RBI;
  MachineRegisterInfo &MRI;
  const RegisterBank *NewBank;
  SmallVector<MachineInstr *, 4> NewInsts;

public:
  ApplyRegBankMapping(MachineIRBuilder &B, const AMDGPURegisterBankInfo &RBI_,
                      MachineRegisterInfo &MRI_, const RegisterBank *RB)
      : B(B), RBI(RBI_), MRI(MRI_), NewBank(RB) {
    assert(!B.isObservingChanges());
    B.setChangeObserver(*this);
  }

  ~ApplyRegBankMapping() override {
    for (MachineInstr *MI : NewInsts)
      applyBank(*MI);

    B.stopObservingChanges();
  }

  /// Set any registers that don't have a set register class or bank to SALU.
  void applyBank(MachineInstr &MI) {
    const unsigned Opc = MI.getOpcode();
    if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT ||
        Opc == AMDGPU::G_SEXT) {
      // LegalizerHelper wants to use the basic legalization artifacts when
      // widening etc. We don't handle selection with vcc in artifact sources,
      // so we need to use a select instead to handle these properly.
      Register DstReg = MI.getOperand(0).getReg();
      Register SrcReg = MI.getOperand(1).getReg();
      const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI);
      if (SrcBank == &AMDGPU::VCCRegBank) {
        const LLT S32 = LLT::scalar(32);
        assert(MRI.getType(SrcReg) == LLT::scalar(1));
        assert(MRI.getType(DstReg) == S32);
        assert(NewBank == &AMDGPU::VGPRRegBank);

        // Replace the extension with a select, which really uses the boolean
        // source.
        B.setInsertPt(*MI.getParent(), MI);

        auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? -1 : 1);
        auto False = B.buildConstant(S32, 0);
        B.buildSelect(DstReg, SrcReg, True, False);
        MRI.setRegBank(True.getReg(0), *NewBank);
        MRI.setRegBank(False.getReg(0), *NewBank);
        MI.eraseFromParent();
      }

      assert(!MRI.getRegClassOrRegBank(DstReg));
      MRI.setRegBank(DstReg, *NewBank);
      return;
    }

#ifndef NDEBUG
    if (Opc == AMDGPU::G_TRUNC) {
      Register DstReg = MI.getOperand(0).getReg();
      const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI);
      assert(DstBank != &AMDGPU::VCCRegBank);
    }
#endif

    for (MachineOperand &Op : MI.operands()) {
      if (!Op.isReg())
        continue;

      // We may see physical registers if building a real MI
      Register Reg = Op.getReg();
      if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg))
        continue;

      const RegisterBank *RB = NewBank;
      if (MRI.getType(Reg) == LLT::scalar(1)) {
        assert(NewBank == &AMDGPU::VGPRRegBank &&
               "s1 operands should only be used for vector bools");
        assert((MI.getOpcode() != AMDGPU::G_TRUNC &&
                MI.getOpcode() != AMDGPU::G_ANYEXT) &&
               "not expecting legalization artifacts here");
        RB = &AMDGPU::VCCRegBank;
      }

      MRI.setRegBank(Reg, *RB);
    }
  }

  void erasingInstr(MachineInstr &MI) override {}

  void createdInstr(MachineInstr &MI) override {
    // At this point, the instruction was just inserted and has no operands.
    NewInsts.push_back(&MI);
  }

  void changingInstr(MachineInstr &MI) override {}
  void changedInstr(MachineInstr &MI) override {
    // FIXME: In principle we should probably add the instruction to NewInsts,
    // but the way the LegalizerHelper uses the observer, we will always see the
    // registers we need to set the regbank on also referenced in a new
    // instruction.
  }
};

} // anonymous namespace

AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
    : Subtarget(ST), TRI(Subtarget.getRegisterInfo()),
      TII(Subtarget.getInstrInfo()) {

  // HACK: Until this is fully tablegen'd.
  static llvm::once_flag InitializeRegisterBankFlag;

  static auto InitializeRegisterBankOnce = [this]() {
    assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank &&
           &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank &&
           &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank);
    (void)this;
  };

  llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce);
}

static bool isVectorRegisterBank(const RegisterBank &Bank) {
  unsigned BankID = Bank.getID();
  return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID;
}

bool AMDGPURegisterBankInfo::isDivergentRegBank(const RegisterBank *RB) const {
  return RB != &AMDGPU::SGPRRegBank;
}

unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
                                          const RegisterBank &Src,
                                          TypeSize Size) const {
  // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
  if (Dst.getID() == AMDGPU::SGPRRegBankID &&
      (isVectorRegisterBank(Src) || Src.getID() == AMDGPU::VCCRegBankID)) {
    return std::numeric_limits<unsigned>::max();
  }

  // Bool values are tricky, because the meaning is based on context. The SCC
  // and VCC banks are for the natural scalar and vector conditions produced by
  // a compare.
  //
  // Legalization doesn't know about the necessary context, so an s1 use may
  // have been a truncate from an arbitrary value, in which case a copy (lowered
  // as a compare with 0) needs to be inserted.
  if (Size == 1 &&
      (Dst.getID() == AMDGPU::SGPRRegBankID) &&
      (isVectorRegisterBank(Src) ||
       Src.getID() == AMDGPU::SGPRRegBankID ||
       Src.getID() == AMDGPU::VCCRegBankID))
    return std::numeric_limits<unsigned>::max();

  // There is no direct copy between AGPRs.
  if (Dst.getID() == AMDGPU::AGPRRegBankID &&
      Src.getID() == AMDGPU::AGPRRegBankID)
    return 4;

  return RegisterBankInfo::copyCost(Dst, Src, Size);
}

unsigned AMDGPURegisterBankInfo::getBreakDownCost(
    const ValueMapping &ValMapping,
    const RegisterBank *CurBank) const {
  // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
  // VGPR.
  // FIXME: Is there a better way to do this?
  if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
    return 10; // This is expensive.

  assert(ValMapping.NumBreakDowns == 2 &&
         ValMapping.BreakDown[0].Length == 32 &&
         ValMapping.BreakDown[0].StartIdx == 0 &&
         ValMapping.BreakDown[1].Length == 32 &&
         ValMapping.BreakDown[1].StartIdx == 32 &&
         ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);

  // 32-bit extract of a 64-bit value is just access of a subregister, so free.
  // TODO: Cost of 0 hits assert, though it's not clear it's what we really
  // want.

  // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
  // alignment restrictions, but this probably isn't important.
  return 1;
}

const RegisterBank &
AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
                                               LLT Ty) const {
  if (&RC == &AMDGPU::SReg_1RegClass)
    return AMDGPU::VCCRegBank;

  // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a
  // VCC-like use.
  if (TRI->isSGPRClass(&RC)) {
    // FIXME: This probably came from a copy from a physical register, which
    // should be inferable from the copied to-type. We don't have many boolean
    // physical register constraints so just assume a normal SGPR for now.
    if (!Ty.isValid())
      return AMDGPU::SGPRRegBank;

    return Ty == LLT::scalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
  }

  return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank;
}

template <unsigned NumOps>
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::addMappingFromTable(
    const MachineInstr &MI, const MachineRegisterInfo &MRI,
    const std::array<unsigned, NumOps> RegSrcOpIdx,
    ArrayRef<OpRegBankEntry<NumOps>> Table) const {

  InstructionMappings AltMappings;

  SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());

  unsigned Sizes[NumOps];
  for (unsigned I = 0; I < NumOps; ++I) {
    Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
    Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
  }

  for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
    unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
    Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
  }

  // getInstrMapping's default mapping uses ID 1, so start at 2.
  unsigned MappingID = 2;
  for (const auto &Entry : Table) {
    for (unsigned I = 0; I < NumOps; ++I) {
      int OpIdx = RegSrcOpIdx[I];
      Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
    }

    AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
                                                 getOperandsMapping(Operands),
                                                 Operands.size()));
  }

  return AltMappings;
}

RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
    const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
  switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
  case Intrinsic::amdgcn_readlane: {
    static const OpRegBankEntry<3> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      // Need a readfirstlane for the index.
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
    };

    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, Table);
  }
  case Intrinsic::amdgcn_writelane: {
    static const OpRegBankEntry<4> Table[4] = {
      // Perfectly legal.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Need readfirstlane of first op
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },

      // Need readfirstlane of second op
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },

      // Need readfirstlane of both ops
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
    };

    // dst, value, lane select, previous value (vdst_in)
    const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
    return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, Table);
  }
  default:
    return RegisterBankInfo::getInstrAlternativeMappings(MI);
  }
}

RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
    const MachineInstr &MI, const MachineRegisterInfo &MRI) const {

  switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
  case Intrinsic::amdgcn_s_buffer_load: {
    static const OpRegBankEntry<2> Table[4] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      // Only need 1 register in loop
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },

      // Have to waterfall the resource.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },

      // Have to waterfall the resource, and the offset.
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }
    };

    // rsrc, offset
    const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
    return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, Table);
  }
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {
    // VGPR = M0, VGPR
    static const OpRegBankEntry<3> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Need a readfirstlane for m0
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
    };

    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, Table);
  }
  case Intrinsic::amdgcn_s_sendmsg:
  case Intrinsic::amdgcn_s_sendmsghalt: {
    // FIXME: Should have no register for immediate
    static const OpRegBankEntry<1> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID }, 1 },

      // Need readlane
      { { AMDGPU::VGPRRegBankID }, 3 }
    };

    const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
    return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, Table);
  }
  default:
    return RegisterBankInfo::getInstrAlternativeMappings(MI);
  }
}

// FIXME: Returns uniform if there's no source value information. This is
// probably wrong.
bool AMDGPURegisterBankInfo::isScalarLoadLegal(const MachineInstr &MI) const {
  if (!MI.hasOneMemOperand())
    return false;

  const MachineMemOperand *MMO = *MI.memoperands_begin();
  const unsigned AS = MMO->getAddrSpace();
  const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||
                       AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
  const unsigned MemSize = 8 * MMO->getSize().getValue();

  // Require 4-byte alignment.
  return (MMO->getAlign() >= Align(4) ||
          (Subtarget.hasScalarSubwordLoads() &&
           ((MemSize == 16 && MMO->getAlign() >= Align(2)) ||
            (MemSize == 8 && MMO->getAlign() >= Align(1))))) &&
         // Can't do a scalar atomic load.
         !MMO->isAtomic() &&
         // Don't use scalar loads for volatile accesses to non-constant address
         // spaces.
         (IsConst || !MMO->isVolatile()) &&
         // Memory must be known constant, or not written before this load.
         (IsConst || MMO->isInvariant() || (MMO->getFlags() & MONoClobber)) &&
         AMDGPU::isUniformMMO(MMO);
}

RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappings(
    const MachineInstr &MI) const {

  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  InstructionMappings AltMappings;
  switch (MI.getOpcode()) {
  case TargetOpcode::G_CONSTANT:
  case TargetOpcode::G_IMPLICIT_DEF: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    if (Size == 1) {
      static const OpRegBankEntry<1> Table[3] = {
        { { AMDGPU::VGPRRegBankID }, 1 },
        { { AMDGPU::SGPRRegBankID }, 1 },
        { { AMDGPU::VCCRegBankID }, 1 }
      };

      return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
    }

    [[fallthrough]];
  }
  case TargetOpcode::G_FCONSTANT:
  case TargetOpcode::G_FRAME_INDEX:
  case TargetOpcode::G_GLOBAL_VALUE: {
    static const OpRegBankEntry<1> Table[2] = {
      { { AMDGPU::VGPRRegBankID }, 1 },
      { { AMDGPU::SGPRRegBankID }, 1 }
    };

    return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
  }
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);

    if (Size == 1) {
      // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
      const InstructionMapping &SCCMapping = getInstructionMapping(
        1, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32)}),
        3); // Num Operands
      AltMappings.push_back(&SCCMapping);

      const InstructionMapping &VCCMapping0 = getInstructionMapping(
        2, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
        3); // Num Operands
      AltMappings.push_back(&VCCMapping0);
      return AltMappings;
    }

    if (Size != 64)
      break;

    const InstructionMapping &SSMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(
      2, 2, getOperandsMapping(
        {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&VVMapping);
    break;
  }
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_ZEXTLOAD:
  case TargetOpcode::G_SEXTLOAD: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
    unsigned PtrSize = PtrTy.getSizeInBits();
    unsigned AS = PtrTy.getAddressSpace();

    if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
         AS != AMDGPUAS::PRIVATE_ADDRESS) &&
        isScalarLoadLegal(MI)) {
      const InstructionMapping &SSMapping = getInstructionMapping(
          1, 1, getOperandsMapping(
              {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
               AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}),
          2); // Num Operands
      AltMappings.push_back(&SSMapping);
    }

    const InstructionMapping &VVMapping = getInstructionMapping(
        2, 1,
        getOperandsMapping(
            {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
             AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),
        2); // Num Operands
    AltMappings.push_back(&VVMapping);

    // It may be possible to have a vgpr = load sgpr mapping here, because
    // the mubuf instructions support this kind of load, but probably for only
    // gfx7 and older. However, the addressing mode matching in the instruction
    // selector should be able to do a better job of detecting and selecting
    // these kinds of loads from the vgpr = load vgpr mapping.

    return AltMappings;
  }
  case TargetOpcode::G_SELECT: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&VVMapping);

    return AltMappings;
  }
  case TargetOpcode::G_UADDE:
  case TargetOpcode::G_USUBE:
  case TargetOpcode::G_SADDE:
  case TargetOpcode::G_SSUBE: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1)}),
      5); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
      5); // Num Operands
    AltMappings.push_back(&VVMapping);
    return AltMappings;
  }
  case AMDGPU::G_BRCOND: {
    assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);

    // TODO: Change type to 32 for scalar
    const InstructionMapping &SMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), nullptr}),
      2); // Num Operands
    AltMappings.push_back(&SMapping);

    const InstructionMapping &VMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr }),
      2); // Num Operands
    AltMappings.push_back(&VMapping);
    return AltMappings;
  }
  case AMDGPU::G_INTRINSIC:
  case AMDGPU::G_INTRINSIC_CONVERGENT:
    return getInstrAlternativeMappingsIntrinsic(MI, MRI);
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
  case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
    return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
  default:
    break;
  }
  return RegisterBankInfo::getInstrAlternativeMappings(MI);
}

void AMDGPURegisterBankInfo::split64BitValueForMapping(
    MachineIRBuilder &B,
    SmallVector<Register, 2> &Regs,
    LLT HalfTy,
    Register Reg) const {
  assert(HalfTy.getSizeInBits() == 32);
  MachineRegisterInfo *MRI = B.getMRI();
  Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
  Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
  const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
  MRI->setRegBank(LoLHS, *Bank);
  MRI->setRegBank(HiLHS, *Bank);

  Regs.push_back(LoLHS);
  Regs.push_back(HiLHS);

  B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
    .addDef(LoLHS)
    .addDef(HiLHS)
    .addUse(Reg);
}

/// Replace the current type each register in \p Regs has with \p NewTy
static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
                          LLT NewTy) {
  for (Register Reg : Regs) {
    assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
    MRI.setType(Reg, NewTy);
  }
}

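// For example (per the code below): getHalfSizedType(s64) -> s32, and
// getHalfSizedType(<4 x s16>) -> <2 x s16>.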
static LLT getHalfSizedType(LLT Ty) {
  if (Ty.isVector()) {
    assert(Ty.getElementCount().isKnownMultipleOf(2));
    return LLT::scalarOrVector(Ty.getElementCount().divideCoefficientBy(2),
                               Ty.getElementType());
  }

  assert(Ty.getScalarSizeInBits() % 2 == 0);
  return LLT::scalar(Ty.getScalarSizeInBits() / 2);
}

// Build one or more V_READFIRSTLANE_B32 instructions to move the given vector
// source value into a scalar register.
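//
// Conceptually, for a 64-bit VGPR source this builds the following sketch
// (simplified; the actual sequence is built in the loop below):
//   %lo:vgpr(s32), %hi:vgpr(s32) = G_UNMERGE_VALUES %src
//   %slo:sreg_32 = V_READFIRSTLANE_B32 %lo
//   %shi:sreg_32 = V_READFIRSTLANE_B32 %hi
//   %dst:sgpr(s64) = G_MERGE_VALUES %slo, %shi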
Register AMDGPURegisterBankInfo::buildReadFirstLane(MachineIRBuilder &B,
                                                    MachineRegisterInfo &MRI,
                                                    Register Src) const {
  LLT Ty = MRI.getType(Src);
  const RegisterBank *Bank = getRegBank(Src, MRI, *TRI);

  if (Bank == &AMDGPU::SGPRRegBank)
    return Src;

  unsigned Bits = Ty.getSizeInBits();
  assert(Bits % 32 == 0);

  if (Bank != &AMDGPU::VGPRRegBank) {
    // We need to copy from AGPR to VGPR
    Src = B.buildCopy(Ty, Src).getReg(0);
    MRI.setRegBank(Src, AMDGPU::VGPRRegBank);
  }

  LLT S32 = LLT::scalar(32);
  unsigned NumParts = Bits / 32;
  SmallVector<Register, 8> SrcParts;
  SmallVector<Register, 8> DstParts;

  if (Bits == 32) {
    SrcParts.push_back(Src);
  } else {
    auto Unmerge = B.buildUnmerge(S32, Src);
    for (unsigned i = 0; i < NumParts; ++i)
      SrcParts.push_back(Unmerge.getReg(i));
  }

  for (unsigned i = 0; i < NumParts; ++i) {
    Register SrcPart = SrcParts[i];
    Register DstPart = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
    MRI.setType(DstPart, NumParts == 1 ? Ty : S32);

    const TargetRegisterClass *Constrained =
        constrainGenericRegister(SrcPart, AMDGPU::VGPR_32RegClass, MRI);
    (void)Constrained;
    assert(Constrained && "Failed to constrain readfirstlane src reg");

    B.buildInstr(AMDGPU::V_READFIRSTLANE_B32, {DstPart}, {SrcPart});

    DstParts.push_back(DstPart);
  }

  if (Bits == 32)
    return DstParts[0];

  Register Dst = B.buildMergeLikeInstr(Ty, DstParts).getReg(0);
  MRI.setRegBank(Dst, AMDGPU::SGPRRegBank);
  return Dst;
}

/// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
/// any of the required SGPR operands are VGPRs, perform a waterfall loop to
/// execute the instruction for each unique combination of values in all lanes
/// in the wave. The block will be split such that the rest of the instructions
/// are moved to a new block.
///
/// Essentially performs this loop:
///
/// Save Execution Mask
/// For (Lane : Wavefront) {
///   Enable Lane, Disable all other lanes
///   SGPR = read SGPR value for current lane from VGPR
///   VGPRResult[Lane] = use_op SGPR
/// }
/// Restore Execution Mask
///
/// There is additional complexity: values are compared across lanes so that
/// each unique value is only processed once.
bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
    MachineIRBuilder &B, iterator_range<MachineBasicBlock::iterator> Range,
    SmallSet<Register, 4> &SGPROperandRegs) const {
  // Track use registers which have already been expanded with a readfirstlane
  // sequence. This may have multiple uses if moving a sequence.
  DenseMap<Register, Register> WaterfalledRegMap;

  MachineBasicBlock &MBB = B.getMBB();
  MachineFunction *MF = &B.getMF();

  const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
  const unsigned MovExecOpc =
      Subtarget.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
  const unsigned MovExecTermOpc =
      Subtarget.isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;

  const unsigned XorTermOpc = Subtarget.isWave32() ?
    AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
  const unsigned AndSaveExecOpc = Subtarget.isWave32() ?
    AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
  const unsigned ExecReg = Subtarget.isWave32() ?
    AMDGPU::EXEC_LO : AMDGPU::EXEC;

#ifndef NDEBUG
  const int OrigRangeSize = std::distance(Range.begin(), Range.end());
#endif

  MachineRegisterInfo &MRI = *B.getMRI();
  Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
  Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);

  // Don't bother using generic instructions/registers for the exec mask.
  B.buildInstr(TargetOpcode::IMPLICIT_DEF)
    .addDef(InitSaveExecReg);

  Register PhiExec = MRI.createVirtualRegister(WaveRC);
  Register NewExec = MRI.createVirtualRegister(WaveRC);

  // To insert the loop we need to split the block. Move everything before this
  // point to a new block, and insert a new empty block before this instruction.
  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *BodyBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;
  MF->insert(MBBI, LoopBB);
  MF->insert(MBBI, BodyBB);
  MF->insert(MBBI, RestoreExecBB);
  MF->insert(MBBI, RemainderBB);

  LoopBB->addSuccessor(BodyBB);
  BodyBB->addSuccessor(RestoreExecBB);
  BodyBB->addSuccessor(LoopBB);

  // Move the rest of the block into a new block.
  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
  RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());

  MBB.addSuccessor(LoopBB);
  RestoreExecBB->addSuccessor(RemainderBB);

  B.setInsertPt(*LoopBB, LoopBB->end());

  B.buildInstr(TargetOpcode::PHI)
      .addDef(PhiExec)
      .addReg(InitSaveExecReg)
      .addMBB(&MBB)
      .addReg(NewExec)
      .addMBB(BodyBB);

  const DebugLoc &DL = B.getDL();

  MachineInstr &FirstInst = *Range.begin();

  // Move the instruction into the loop body. Note we moved everything after
  // Range.end() already into a new block, so Range.end() is no longer valid.
  BodyBB->splice(BodyBB->end(), &MBB, Range.begin(), MBB.end());

  // Figure out the iterator range after splicing the instructions.
  MachineBasicBlock::iterator NewBegin = FirstInst.getIterator();
  auto NewEnd = BodyBB->end();

  B.setMBB(*LoopBB);

  LLT S1 = LLT::scalar(1);
  Register CondReg;

  assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);

  for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
    for (MachineOperand &Op : MI.all_uses()) {
      Register OldReg = Op.getReg();
      if (!SGPROperandRegs.count(OldReg))
        continue;

      // See if we already processed this register in another instruction in
      // the sequence.
      auto OldVal = WaterfalledRegMap.find(OldReg);
      if (OldVal != WaterfalledRegMap.end()) {
        Op.setReg(OldVal->second);
        continue;
      }

      Register OpReg = Op.getReg();
      LLT OpTy = MRI.getType(OpReg);

      const RegisterBank *OpBank = getRegBank(OpReg, MRI, *TRI);
      if (OpBank != &AMDGPU::VGPRRegBank) {
        // Insert copy from AGPR to VGPR before the loop.
        B.setMBB(MBB);
        OpReg = B.buildCopy(OpTy, OpReg).getReg(0);
        MRI.setRegBank(OpReg, AMDGPU::VGPRRegBank);
        B.setMBB(*LoopBB);
      }

      Register CurrentLaneReg = buildReadFirstLane(B, MRI, OpReg);

      // Build the comparison(s).
      unsigned OpSize = OpTy.getSizeInBits();
      bool Is64 = OpSize % 64 == 0;
      unsigned PartSize = Is64 ? 64 : 32;
      LLT PartTy = LLT::scalar(PartSize);
      unsigned NumParts = OpSize / PartSize;
      SmallVector<Register, 8> OpParts;
      SmallVector<Register, 8> CurrentLaneParts;

      if (NumParts == 1) {
        OpParts.push_back(OpReg);
        CurrentLaneParts.push_back(CurrentLaneReg);
      } else {
        auto UnmergeOp = B.buildUnmerge(PartTy, OpReg);
        auto UnmergeCurrentLane = B.buildUnmerge(PartTy, CurrentLaneReg);
        for (unsigned i = 0; i < NumParts; ++i) {
          OpParts.push_back(UnmergeOp.getReg(i));
          CurrentLaneParts.push_back(UnmergeCurrentLane.getReg(i));
          MRI.setRegBank(OpParts[i], AMDGPU::VGPRRegBank);
          MRI.setRegBank(CurrentLaneParts[i], AMDGPU::SGPRRegBank);
        }
      }

      for (unsigned i = 0; i < NumParts; ++i) {
        auto CmpReg = B.buildICmp(CmpInst::ICMP_EQ, S1, CurrentLaneParts[i],
                                  OpParts[i]).getReg(0);
        MRI.setRegBank(CmpReg, AMDGPU::VCCRegBank);

        if (!CondReg) {
          CondReg = CmpReg;
        } else {
          CondReg = B.buildAnd(S1, CondReg, CmpReg).getReg(0);
          MRI.setRegBank(CondReg, AMDGPU::VCCRegBank);
        }
      }

      Op.setReg(CurrentLaneReg);

      // Make sure we don't re-process this register again.
      WaterfalledRegMap.insert(std::pair(OldReg, Op.getReg()));
    }
  }

  // The ballot becomes a no-op during instruction selection.
  CondReg = B.buildIntrinsic(Intrinsic::amdgcn_ballot,
                             {LLT::scalar(Subtarget.isWave32() ? 32 : 64)})
                .addReg(CondReg)
                .getReg(0);
  MRI.setRegClass(CondReg, WaveRC);

  // Update EXEC, save the original EXEC value to VCC.
  B.buildInstr(AndSaveExecOpc)
    .addDef(NewExec)
    .addReg(CondReg, RegState::Kill);

  MRI.setSimpleHint(NewExec, CondReg);

  B.setInsertPt(*BodyBB, BodyBB->end());

  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
  B.buildInstr(XorTermOpc)
    .addDef(ExecReg)
    .addReg(ExecReg)
    .addReg(NewExec);

  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
  // s_cbranch_scc0?

  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
  B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);

  // Save the EXEC mask before the loop.
  BuildMI(MBB, MBB.end(), DL, TII->get(MovExecOpc), SaveExecReg)
    .addReg(ExecReg);

  // Restore the EXEC mask after the loop.
  B.setMBB(*RestoreExecBB);
  B.buildInstr(MovExecTermOpc)
    .addDef(ExecReg)
    .addReg(SaveExecReg);

  // Set the insert point after the original instruction, so any new
  // instructions will be in the remainder.
  B.setInsertPt(*RemainderBB, RemainderBB->begin());

  return true;
}

// Return any unique registers used by \p MI at \p OpIndices that need to be
// handled in a waterfall loop. Returns these registers in \p
// SGPROperandRegs. Returns true if there are any operands to handle and a
// waterfall loop is necessary.
bool AMDGPURegisterBankInfo::collectWaterfallOperands(
    SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI,
    MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const {
  for (unsigned Op : OpIndices) {
    assert(MI.getOperand(Op).isUse());
    Register Reg = MI.getOperand(Op).getReg();
    const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
    if (OpBank->getID() != AMDGPU::SGPRRegBankID)
      SGPROperandRegs.insert(Reg);
  }

  // No operands need to be replaced, so no need to loop.
  return !SGPROperandRegs.empty();
}

bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
    MachineIRBuilder &B, MachineInstr &MI, ArrayRef<unsigned> OpIndices) const {
  // Use a set to avoid extra readfirstlanes in the case where multiple operands
  // are the same register.
  SmallSet<Register, 4> SGPROperandRegs;

  if (!collectWaterfallOperands(SGPROperandRegs, MI, *B.getMRI(), OpIndices))
    return false;

  MachineBasicBlock::iterator I = MI.getIterator();
  return executeInWaterfallLoop(B, make_range(I, std::next(I)),
                                SGPROperandRegs);
}

// Legalize an operand that must be an SGPR by inserting a readfirstlane.
void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
    MachineIRBuilder &B, MachineInstr &MI, unsigned OpIdx) const {
  Register Reg = MI.getOperand(OpIdx).getReg();
  MachineRegisterInfo &MRI = *B.getMRI();
  const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
  if (Bank == &AMDGPU::SGPRRegBank)
    return;

  Reg = buildReadFirstLane(B, MRI, Reg);
  MI.getOperand(OpIdx).setReg(Reg);
}

/// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the
/// rest will be in the remainder.
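/// For example (per the logic below): splitUnequalType(s96, 64) returns
/// {s64, s32}, and splitUnequalType(<3 x s32>, 64) returns {<2 x s32>, s32}.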
static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) {
  unsigned TotalSize = Ty.getSizeInBits();
  if (!Ty.isVector())
    return {LLT::scalar(FirstSize), LLT::scalar(TotalSize - FirstSize)};

  LLT EltTy = Ty.getElementType();
  unsigned EltSize = EltTy.getSizeInBits();
  assert(FirstSize % EltSize == 0);

  unsigned FirstPartNumElts = FirstSize / EltSize;
  unsigned RemainderElts = (TotalSize - FirstSize) / EltSize;

  return {LLT::scalarOrVector(ElementCount::getFixed(FirstPartNumElts), EltTy),
          LLT::scalarOrVector(ElementCount::getFixed(RemainderElts), EltTy)};
}

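// Widen a 96-bit type to the next 128-bit type with the same element type,
// e.g. (per the code below) s96 -> s128 and <3 x s32> -> <4 x s32>.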
static LLT widen96To128(LLT Ty) {
  if (!Ty.isVector())
    return LLT::scalar(128);

  LLT EltTy = Ty.getElementType();
  assert(128 % EltTy.getSizeInBits() == 0);
  return LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);
}

bool AMDGPURegisterBankInfo::applyMappingLoad(
    MachineIRBuilder &B,
    const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
    MachineInstr &MI) const {
  MachineRegisterInfo &MRI = *B.getMRI();
  Register DstReg = MI.getOperand(0).getReg();
  const LLT LoadTy = MRI.getType(DstReg);
  unsigned LoadSize = LoadTy.getSizeInBits();
  MachineMemOperand *MMO = *MI.memoperands_begin();
  const unsigned MaxNonSmrdLoadSize = 128;

  const RegisterBank *DstBank =
      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
  if (DstBank == &AMDGPU::SGPRRegBank) {
    // There are some special cases that we need to look at for 32 bit and 96
    // bit SGPR loads otherwise we have nothing to do.
    if (LoadSize != 32 && (LoadSize != 96 || Subtarget.hasScalarDwordx3Loads()))
      return false;

    const unsigned MemSize = 8 * MMO->getSize().getValue();
    // Scalar loads of size 8 or 16 bit with proper alignment may be widened to
    // 32 bit. Check to see if we need to widen the memory access, 8 or 16 bit
    // scalar loads should have a load size of 32 but memory access size of less
    // than 32.
    if (LoadSize == 32 &&
        (MemSize == 32 || LoadTy.isVector() || !isScalarLoadLegal(MI)))
      return false;

    if (LoadSize == 32 &&
        ((MemSize == 8 && MMO->getAlign() >= Align(1)) ||
         (MemSize == 16 && MMO->getAlign() >= Align(2))) &&
        isScalarLoadLegal(MI) &&
        Subtarget.getGeneration() >= AMDGPUSubtarget::GFX12)
      return false;

    Register PtrReg = MI.getOperand(1).getReg();

    ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank);

    if (LoadSize == 32) {
      // This is an extending load from a sub-dword size. Widen the memory
      // access size to 4 bytes and clear the extra high bits appropriately
      const LLT S32 = LLT::scalar(32);
      if (MI.getOpcode() == AMDGPU::G_SEXTLOAD) {
        // Must extend the sign bit into higher bits for a G_SEXTLOAD
        auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
        B.buildSExtInReg(MI.getOperand(0), WideLoad, MemSize);
      } else if (MI.getOpcode() == AMDGPU::G_ZEXTLOAD) {
        // Must extend zero into higher bits with an AND for a G_ZEXTLOAD
        auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
        B.buildZExtInReg(MI.getOperand(0), WideLoad, MemSize);
      } else
        // We do not need to touch the higher bits for regular loads.
        B.buildLoadFromOffset(MI.getOperand(0), PtrReg, *MMO, 0);
    } else {
      // 96-bit loads are only available for vector loads. We need to split
      // this into a 64-bit part and a 32-bit part (unless we can widen to a
      // 128-bit load).
      if (MMO->getAlign() < Align(16)) {
        LegalizerHelper Helper(B.getMF(), ApplyBank, B);
        LLT Part64, Part32;
        std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64);
        if (Helper.reduceLoadStoreWidth(cast<GAnyLoad>(MI), 0, Part64) !=
            LegalizerHelper::Legalized)
          return false;
        return true;
      }
      LLT WiderTy = widen96To128(LoadTy);
      auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0);
      if (WiderTy.isScalar()) {
        B.buildTrunc(MI.getOperand(0), WideLoad);
      } else {
        B.buildDeleteTrailingVectorElements(MI.getOperand(0).getReg(),
                                            WideLoad);
      }
    }

    MI.eraseFromParent();
    return true;
  }

  // 128-bit loads are supported for all instruction types.
  if (LoadSize <= MaxNonSmrdLoadSize)
    return false;

  SmallVector<Register, 1> SrcRegs(OpdMapper.getVRegs(1));

  if (SrcRegs.empty())
    SrcRegs.push_back(MI.getOperand(1).getReg());

  // RegBankSelect only emits scalar types, so we need to reset the pointer
  // operand to a pointer type.
  Register BasePtrReg = SrcRegs[0];
  LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
  MRI.setType(BasePtrReg, PtrTy);

  // The following are loads that were not split sufficiently during
  // legalization because it was not clear whether they are SMEM or VMEM loads.
  if (AMDGPU::isExtendedGlobalAddrSpace(MMO->getAddrSpace()) ||
      MMO->getAddrSpace() == AMDGPUAS::BUFFER_RESOURCE) {
    assert(LoadSize % MaxNonSmrdLoadSize == 0);
    unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize;
    const LLT LoadSplitTy = LoadTy.divide(NumSplitParts);
    ApplyRegBankMapping O(B, *this, MRI, &AMDGPU::VGPRRegBank);
    LegalizerHelper Helper(B.getMF(), O, B);
    if (LoadTy.isVector()) {
      if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) !=
          LegalizerHelper::Legalized)
        return false;
    } else {
      if (Helper.narrowScalar(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
        return false;
    }
  }

  MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
  return true;
}

bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc(
    MachineIRBuilder &B,
    const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
    MachineInstr &MI) const {
  MachineRegisterInfo &MRI = *B.getMRI();
  const MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const auto &TFI = *ST.getFrameLowering();

  // Guard in case the stack growth direction ever changes with scratch
  // instructions.
  assert(TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp &&
         "Stack grows upwards for AMDGPU");

  Register Dst = MI.getOperand(0).getReg();
  Register AllocSize = MI.getOperand(1).getReg();
  Align Alignment = assumeAligned(MI.getOperand(2).getImm());

  const RegisterBank *SizeBank = getRegBank(AllocSize, MRI, *TRI);

  if (SizeBank != &AMDGPU::SGPRRegBank) {
    auto WaveReduction =
        B.buildIntrinsic(Intrinsic::amdgcn_wave_reduce_umax, {LLT::scalar(32)})
            .addUse(AllocSize)
            .addImm(0);
    AllocSize = WaveReduction.getReg(0);
  }

  LLT PtrTy = MRI.getType(Dst);
  LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());

  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  Register SPReg = Info->getStackPtrOffsetReg();
  ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::SGPRRegBank);

  auto WaveSize = B.buildConstant(LLT::scalar(32), ST.getWavefrontSizeLog2());
  auto ScaledSize = B.buildShl(IntPtrTy, AllocSize, WaveSize);

  auto OldSP = B.buildCopy(PtrTy, SPReg);
  if (Alignment > TFI.getStackAlign()) {
    auto StackAlignMask = (Alignment.value() << ST.getWavefrontSizeLog2()) - 1;
    auto Tmp1 = B.buildPtrAdd(PtrTy, OldSP,
                              B.buildConstant(LLT::scalar(32), StackAlignMask));
    B.buildMaskLowPtrBits(Dst, Tmp1,
                          Log2(Alignment) + ST.getWavefrontSizeLog2());
  } else {
    B.buildCopy(Dst, OldSP);
  }
  auto PtrAdd = B.buildPtrAdd(PtrTy, Dst, ScaledSize);
  B.buildCopy(SPReg, PtrAdd);
  MI.eraseFromParent();
  return true;
}

bool AMDGPURegisterBankInfo::applyMappingImage(
    MachineIRBuilder &B, MachineInstr &MI,
    const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
    int RsrcIdx) const {
  const int NumDefs = MI.getNumExplicitDefs();

  // The reported argument index is relative to the IR intrinsic call arguments,
  // so we need to shift by the number of defs and the intrinsic ID.
  RsrcIdx += NumDefs + 1;

  // Insert copies to VGPR arguments.
  applyDefaultMapping(OpdMapper);

  // Fixup any SGPR arguments.
  SmallVector<unsigned, 4> SGPRIndexes;
  for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) {
    if (!MI.getOperand(I).isReg())
      continue;

    // If this intrinsic has a sampler, it immediately follows rsrc.
    if (I == RsrcIdx || I == RsrcIdx + 1)
      SGPRIndexes.push_back(I);
  }

  executeInWaterfallLoop(B, MI, SGPRIndexes);
  return true;
}

// Analyze a combined offset from an llvm.amdgcn.s.buffer intrinsic and store
// the three offsets (voffset, soffset and instoffset)
unsigned AMDGPURegisterBankInfo::setBufferOffsets(
    MachineIRBuilder &B, Register CombinedOffset, Register &VOffsetReg,
    Register &SOffsetReg, int64_t &InstOffsetVal, Align Alignment) const {
  const LLT S32 = LLT::scalar(32);
  MachineRegisterInfo *MRI = B.getMRI();

  if (std::optional<int64_t> Imm =
          getIConstantVRegSExtVal(CombinedOffset, *MRI)) {
    uint32_t SOffset, ImmOffset;
    if (TII->splitMUBUFOffset(*Imm, SOffset, ImmOffset, Alignment)) {
      VOffsetReg = B.buildConstant(S32, 0).getReg(0);
      SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
      InstOffsetVal = ImmOffset;

      B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
      B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
      return SOffset + ImmOffset;
    }
  }

  Register Base;
  unsigned Offset;

  std::tie(Base, Offset) =
      AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset);

  uint32_t SOffset, ImmOffset;
  if ((int)Offset > 0 &&
      TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
    if (getRegBank(Base, *MRI, *TRI) == &AMDGPU::VGPRRegBank) {
      VOffsetReg = Base;
      SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
      B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
      InstOffsetVal = ImmOffset;
      return 0; // XXX - Why is this 0?
    }

    // If we have SGPR base, we can use it for soffset.
    if (SOffset == 0) {
      VOffsetReg = B.buildConstant(S32, 0).getReg(0);
      B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
      SOffsetReg = Base;
      InstOffsetVal = ImmOffset;
      return 0; // XXX - Why is this 0?
    }
  }

  // Handle the variable sgpr + vgpr case.
  MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI);
  if (Add && (int)Offset >= 0) {
    Register Src0 = getSrcRegIgnoringCopies(Add->getOperand(1).getReg(), *MRI);
    Register Src1 = getSrcRegIgnoringCopies(Add->getOperand(2).getReg(), *MRI);

    const RegisterBank *Src0Bank = getRegBank(Src0, *MRI, *TRI);
    const RegisterBank *Src1Bank = getRegBank(Src1, *MRI, *TRI);

    if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) {
      VOffsetReg = Src0;
      SOffsetReg = Src1;
      return 0;
    }

    if (Src0Bank == &AMDGPU::SGPRRegBank && Src1Bank == &AMDGPU::VGPRRegBank) {
      VOffsetReg = Src1;
      SOffsetReg = Src0;
      return 0;
    }
  }

  // Ensure we have a VGPR for the combined offset. This could be an issue if we
  // have an SGPR offset and a VGPR resource.
  if (getRegBank(CombinedOffset, *MRI, *TRI) == &AMDGPU::VGPRRegBank) {
    VOffsetReg = CombinedOffset;
  } else {
    VOffsetReg = B.buildCopy(S32, CombinedOffset).getReg(0);
    B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
  }

  SOffsetReg = B.buildConstant(S32, 0).getReg(0);
  B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
  return 0;
}

static unsigned getSBufferLoadCorrespondingBufferLoadOpcode(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD:
    return AMDGPU::G_AMDGPU_BUFFER_LOAD;
  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE:
    return AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE:
    return AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE;
  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT:
    return AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT:
    return AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT;
  default:
    break;
  }
  llvm_unreachable("Unexpected s_buffer_load opcode");
}
1356
applyMappingSBufferLoad(MachineIRBuilder & B,const OperandsMapper & OpdMapper) const1357 bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
1358 MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
1359 MachineInstr &MI = OpdMapper.getMI();
1360 MachineRegisterInfo &MRI = OpdMapper.getMRI();
1361
1362 const LLT S32 = LLT::scalar(32);
1363 Register Dst = MI.getOperand(0).getReg();
1364 LLT Ty = MRI.getType(Dst);
1365
1366 const RegisterBank *RSrcBank =
1367 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1368 const RegisterBank *OffsetBank =
1369 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1370 if (RSrcBank == &AMDGPU::SGPRRegBank &&
1371 OffsetBank == &AMDGPU::SGPRRegBank)
1372 return true; // Legal mapping
1373
1374 // FIXME: 96-bit case was widened during legalize. We need to narrow it back
1375 // here but don't have an MMO.
1376
1377 unsigned LoadSize = Ty.getSizeInBits();
1378 int NumLoads = 1;
1379 if (LoadSize == 256 || LoadSize == 512) {
1380 NumLoads = LoadSize / 128;
1381 Ty = Ty.divide(NumLoads);
1382 }

  // Use the alignment to ensure that the required offsets will fit into the
  // immediate offsets.
  const Align Alignment = NumLoads > 1 ? Align(16 * NumLoads) : Align(1);

  MachineFunction &MF = B.getMF();

  Register SOffset;
  Register VOffset;
  int64_t ImmOffset = 0;

  unsigned MMOOffset = setBufferOffsets(B, MI.getOperand(2).getReg(), VOffset,
                                        SOffset, ImmOffset, Alignment);

  // TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we
  // can, but we need to track an MMO for that.
  const unsigned MemSize = (Ty.getSizeInBits() + 7) / 8;
  const Align MemAlign(4); // FIXME: ABI type alignment?
  MachineMemOperand *BaseMMO = MF.getMachineMemOperand(
      MachinePointerInfo(),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      MemSize, MemAlign);
  if (MMOOffset != 0)
    BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset, MemSize);

  // If only the offset is divergent, emit a MUBUF buffer load instead. We can
  // assume that the buffer is unswizzled.

  Register RSrc = MI.getOperand(1).getReg();
  Register VIndex = B.buildConstant(S32, 0).getReg(0);
  B.getMRI()->setRegBank(VIndex, AMDGPU::VGPRRegBank);

  SmallVector<Register, 4> LoadParts(NumLoads);

  MachineBasicBlock::iterator MII = MI.getIterator();
  MachineInstrSpan Span(MII, &B.getMBB());

  for (int i = 0; i < NumLoads; ++i) {
    if (NumLoads == 1) {
      LoadParts[i] = Dst;
    } else {
      LoadParts[i] = MRI.createGenericVirtualRegister(Ty);
      MRI.setRegBank(LoadParts[i], AMDGPU::VGPRRegBank);
    }
    MachineMemOperand *MMO = BaseMMO;
    if (i != 0)
      MMO = MF.getMachineMemOperand(BaseMMO, 16 * i, MemSize);

    B.buildInstr(getSBufferLoadCorrespondingBufferLoadOpcode(MI.getOpcode()))
        .addDef(LoadParts[i])       // vdata
        .addUse(RSrc)               // rsrc
        .addUse(VIndex)             // vindex
        .addUse(VOffset)            // voffset
        .addUse(SOffset)            // soffset
        .addImm(ImmOffset + 16 * i) // offset(imm)
        .addImm(0)                  // cachepolicy, swizzled buffer(imm)
        .addImm(0)                  // idxen(imm)
        .addMemOperand(MMO);
  }

  // TODO: If only the resource is a VGPR, it may be better to execute the
  // scalar load in the waterfall loop if the resource is expected to frequently
  // be dynamically uniform.
  if (RSrcBank != &AMDGPU::SGPRRegBank) {
    // Remove the original instruction to avoid potentially confusing the
    // waterfall loop logic.
    B.setInstr(*Span.begin());
    MI.eraseFromParent();

    SmallSet<Register, 4> OpsToWaterfall;

    OpsToWaterfall.insert(RSrc);
    executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
                           OpsToWaterfall);
  }

  if (NumLoads != 1) {
    if (Ty.isVector())
      B.buildConcatVectors(Dst, LoadParts);
    else
      B.buildMergeLikeInstr(Dst, LoadParts);
  }

  // We removed the instruction earlier with a waterfall loop.
  if (RSrcBank == &AMDGPU::SGPRRegBank)
    MI.eraseFromParent();

  return true;
}

bool AMDGPURegisterBankInfo::applyMappingBFE(MachineIRBuilder &B,
                                             const OperandsMapper &OpdMapper,
                                             bool Signed) const {
  MachineInstr &MI = OpdMapper.getMI();
  MachineRegisterInfo &MRI = OpdMapper.getMRI();

  // Insert basic copies.
  applyDefaultMapping(OpdMapper);

  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);

  const LLT S32 = LLT::scalar(32);

  unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
  Register SrcReg = MI.getOperand(FirstOpnd).getReg();
  Register OffsetReg = MI.getOperand(FirstOpnd + 1).getReg();
  Register WidthReg = MI.getOperand(FirstOpnd + 2).getReg();

  const RegisterBank *DstBank =
      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
  if (DstBank == &AMDGPU::VGPRRegBank) {
    if (Ty == S32)
      return true;
    // There is no 64-bit VGPR bitfield extract instruction, so the operation
    // is expanded to a sequence of instructions that implement it.
    ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::VGPRRegBank);

    const LLT S64 = LLT::scalar(64);
    // Shift the source operand so that the extracted bits start at bit 0.
    auto ShiftOffset = Signed ? B.buildAShr(S64, SrcReg, OffsetReg)
                              : B.buildLShr(S64, SrcReg, OffsetReg);
    auto UnmergeSOffset = B.buildUnmerge({S32, S32}, ShiftOffset);

    // A 64-bit bitfield extract can use the 32-bit bitfield extract
    // instructions if the width is a constant.
    if (auto ConstWidth = getIConstantVRegValWithLookThrough(WidthReg, MRI)) {
      // Depending on the width, operate on either the low or high 32 bits.
      auto Zero = B.buildConstant(S32, 0);
      auto WidthImm = ConstWidth->Value.getZExtValue();
      if (WidthImm <= 32) {
        // Use bitfield extract on the lower 32-bit source, and then sign-extend
        // or clear the upper 32-bits.
        auto Extract =
            Signed ? B.buildSbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg)
                   : B.buildUbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg);
        auto Extend =
            Signed ? B.buildAShr(S32, Extract, B.buildConstant(S32, 31)) : Zero;
        B.buildMergeLikeInstr(DstReg, {Extract, Extend});
      } else {
        // Use bitfield extract on upper 32-bit source, and combine with lower
        // 32-bit source.
        auto UpperWidth = B.buildConstant(S32, WidthImm - 32);
        auto Extract =
            Signed
                ? B.buildSbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth)
                : B.buildUbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth);
        B.buildMergeLikeInstr(DstReg, {UnmergeSOffset.getReg(0), Extract});
      }
      MI.eraseFromParent();
      return true;
    }

    // Expand to ((Src >> Offset) << (64 - Width)) >> (64 - Width) using 64-bit
    // operations.
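    // E.g. for Offset = 8 and Width = 16 this computes
    // ((Src >> 8) << 48) >> 48, leaving bits [23:8] of Src in the low 16 bits
    // of the result, with the final arithmetic or logical shift providing the
    // sign or zero extension.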
    auto ExtShift = B.buildSub(S32, B.buildConstant(S32, 64), WidthReg);
    auto SignBit = B.buildShl(S64, ShiftOffset, ExtShift);
    if (Signed)
      B.buildAShr(S64, SignBit, ExtShift);
    else
      B.buildLShr(S64, SignBit, ExtShift);
    MI.eraseFromParent();
    return true;
  }

  // The scalar form packs the offset and width in a single operand.

  ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::SGPRRegBank);

  // Ensure the high bits are clear to insert the offset.
  auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));
  auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);

  // Zeros out the low bits, so don't bother clamping the input value.
  auto ShiftWidth = B.buildShl(S32, WidthReg, B.buildConstant(S32, 16));

  // Pack the offset and width of the BFE into the format expected by
  // S_BFE_I32 / S_BFE_U32: in the second source operand, bits [5:0] contain
  // the offset and bits [22:16] the width.
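  // E.g. Offset = 8 and Width = 16 pack to (16 << 16) | 8 = 0x00100008.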
  auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth);

  // TODO: It might be worth using a pseudo here to avoid scc clobber and
  // register class constraints.
  unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32)
                           : (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);

  auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs});
  if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
    llvm_unreachable("failed to constrain BFE");

  MI.eraseFromParent();
  return true;
}

bool AMDGPURegisterBankInfo::applyMappingMAD_64_32(
    MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
  MachineInstr &MI = OpdMapper.getMI();
  MachineRegisterInfo &MRI = OpdMapper.getMRI();

  // Insert basic copies.
  applyDefaultMapping(OpdMapper);

  Register Dst0 = MI.getOperand(0).getReg();
  Register Dst1 = MI.getOperand(1).getReg();
  Register Src0 = MI.getOperand(2).getReg();
  Register Src1 = MI.getOperand(3).getReg();
  Register Src2 = MI.getOperand(4).getReg();

  if (MRI.getRegBankOrNull(Src0) == &AMDGPU::VGPRRegBank)
    return true;

  bool IsUnsigned = MI.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
  LLT S1 = LLT::scalar(1);
  LLT S32 = LLT::scalar(32);

  bool DstOnValu = MRI.getRegBankOrNull(Src2) == &AMDGPU::VGPRRegBank;
  bool Accumulate = true;

  if (!DstOnValu) {
    if (mi_match(Src2, MRI, m_ZeroInt()))
      Accumulate = false;
  }

  // Keep the multiplication on the SALU.
  Register DstHi;
  Register DstLo = B.buildMul(S32, Src0, Src1).getReg(0);
  bool MulHiInVgpr = false;

  MRI.setRegBank(DstLo, AMDGPU::SGPRRegBank);

  if (Subtarget.hasSMulHi()) {
    DstHi = IsUnsigned ? B.buildUMulH(S32, Src0, Src1).getReg(0)
                       : B.buildSMulH(S32, Src0, Src1).getReg(0);
    MRI.setRegBank(DstHi, AMDGPU::SGPRRegBank);
  } else {
    Register VSrc0 = B.buildCopy(S32, Src0).getReg(0);
    Register VSrc1 = B.buildCopy(S32, Src1).getReg(0);

    MRI.setRegBank(VSrc0, AMDGPU::VGPRRegBank);
    MRI.setRegBank(VSrc1, AMDGPU::VGPRRegBank);

    DstHi = IsUnsigned ? B.buildUMulH(S32, VSrc0, VSrc1).getReg(0)
                       : B.buildSMulH(S32, VSrc0, VSrc1).getReg(0);
    MRI.setRegBank(DstHi, AMDGPU::VGPRRegBank);

    if (!DstOnValu) {
      DstHi = buildReadFirstLane(B, MRI, DstHi);
    } else {
      MulHiInVgpr = true;
    }
  }

  // Accumulate and produce the "carry-out" bit.
  //
  // The "carry-out" is defined as bit 64 of the result when computed as a
  // big integer. For unsigned multiply-add, this matches the usual definition
  // of carry-out. For signed multiply-add, bit 64 is the sign bit of the
  // result, which is determined as:
  //   sign(Src0 * Src1) + sign(Src2) + carry-out from unsigned 64-bit add
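  // Since bit 64 is a single bit, that sum is taken modulo 2; the code below
  // computes it as the XOR of the two sign bits (signed compares against
  // zero) with the carry-out of the unsigned 64-bit add.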
  LLT CarryType = DstOnValu ? S1 : S32;
  const RegisterBank &CarryBank =
      DstOnValu ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
  const RegisterBank &DstBank =
      DstOnValu ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank;
  Register Carry;
  Register Zero;

  if (!IsUnsigned) {
    Zero = B.buildConstant(S32, 0).getReg(0);
    MRI.setRegBank(Zero,
                   MulHiInVgpr ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank);

    Carry = B.buildICmp(CmpInst::ICMP_SLT, MulHiInVgpr ? S1 : S32, DstHi, Zero)
                .getReg(0);
    MRI.setRegBank(Carry, MulHiInVgpr ? AMDGPU::VCCRegBank
                                      : AMDGPU::SGPRRegBank);

    if (DstOnValu && !MulHiInVgpr) {
      Carry = B.buildTrunc(S1, Carry).getReg(0);
      MRI.setRegBank(Carry, AMDGPU::VCCRegBank);
    }
  }

  if (Accumulate) {
    if (DstOnValu) {
      DstLo = B.buildCopy(S32, DstLo).getReg(0);
      DstHi = B.buildCopy(S32, DstHi).getReg(0);
      MRI.setRegBank(DstLo, AMDGPU::VGPRRegBank);
      MRI.setRegBank(DstHi, AMDGPU::VGPRRegBank);
    }

    auto Unmerge = B.buildUnmerge(S32, Src2);
    Register Src2Lo = Unmerge.getReg(0);
    Register Src2Hi = Unmerge.getReg(1);
    MRI.setRegBank(Src2Lo, DstBank);
    MRI.setRegBank(Src2Hi, DstBank);

    if (!IsUnsigned) {
      auto Src2Sign = B.buildICmp(CmpInst::ICMP_SLT, CarryType, Src2Hi, Zero);
      MRI.setRegBank(Src2Sign.getReg(0), CarryBank);

      Carry = B.buildXor(CarryType, Carry, Src2Sign).getReg(0);
      MRI.setRegBank(Carry, CarryBank);
    }

    auto AddLo = B.buildUAddo(S32, CarryType, DstLo, Src2Lo);
    DstLo = AddLo.getReg(0);
    Register CarryLo = AddLo.getReg(1);
    MRI.setRegBank(DstLo, DstBank);
    MRI.setRegBank(CarryLo, CarryBank);

    auto AddHi = B.buildUAdde(S32, CarryType, DstHi, Src2Hi, CarryLo);
    DstHi = AddHi.getReg(0);
    MRI.setRegBank(DstHi, DstBank);

    Register CarryHi = AddHi.getReg(1);
    MRI.setRegBank(CarryHi, CarryBank);

    if (IsUnsigned) {
      Carry = CarryHi;
    } else {
      Carry = B.buildXor(CarryType, Carry, CarryHi).getReg(0);
      MRI.setRegBank(Carry, CarryBank);
    }
  } else {
    if (IsUnsigned) {
      Carry = B.buildConstant(CarryType, 0).getReg(0);
      MRI.setRegBank(Carry, CarryBank);
    }
  }

  B.buildMergeLikeInstr(Dst0, {DstLo, DstHi});

  if (DstOnValu) {
    B.buildCopy(Dst1, Carry);
  } else {
    B.buildTrunc(Dst1, Carry);
  }

  MI.eraseFromParent();
  return true;
}

// Return a suitable opcode for extending the operands of Opc when widening.
static unsigned getExtendOp(unsigned Opc) {
  switch (Opc) {
  case TargetOpcode::G_ASHR:
  case TargetOpcode::G_SMIN:
  case TargetOpcode::G_SMAX:
    return TargetOpcode::G_SEXT;
  case TargetOpcode::G_LSHR:
  case TargetOpcode::G_UMIN:
  case TargetOpcode::G_UMAX:
    return TargetOpcode::G_ZEXT;
  default:
    return TargetOpcode::G_ANYEXT;
  }
}

// Emit a legalized extension from <2 x s16> to 2 32-bit components, avoiding
// any illegal vector extend or unmerge operations.
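// A <2 x s16> value occupies a single 32-bit register, with element 0 in bits
// [15:0] and element 1 in bits [31:16], so a bitcast plus shift/mask recovers
// both elements.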
static std::pair<Register, Register>
unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) {
  const LLT S32 = LLT::scalar(32);
  auto Bitcast = B.buildBitcast(S32, Src);

  if (ExtOpcode == TargetOpcode::G_SEXT) {
    auto ExtLo = B.buildSExtInReg(S32, Bitcast, 16);
    auto ShiftHi = B.buildAShr(S32, Bitcast, B.buildConstant(S32, 16));
    return std::pair(ExtLo.getReg(0), ShiftHi.getReg(0));
  }

  auto ShiftHi = B.buildLShr(S32, Bitcast, B.buildConstant(S32, 16));
  if (ExtOpcode == TargetOpcode::G_ZEXT) {
    auto ExtLo = B.buildAnd(S32, Bitcast, B.buildConstant(S32, 0xffff));
    return std::pair(ExtLo.getReg(0), ShiftHi.getReg(0));
  }

  assert(ExtOpcode == TargetOpcode::G_ANYEXT);
  return std::pair(Bitcast.getReg(0), ShiftHi.getReg(0));
}

// For cases where only a single copy is inserted for matching register banks,
// replace the register in the instruction operand with the copied register.
static bool substituteSimpleCopyRegs(
    const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {
  SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx));
  if (!SrcReg.empty()) {
    assert(SrcReg.size() == 1);
    OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]);
    return true;
  }

  return false;
}

/// Handle register layout difference for f16 images for some subtargets.
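/// On subtargets with unpacked D16 VMEM, each 16-bit data element occupies the
/// low half of its own 32-bit register; e.g. a <4 x s16> store value is
/// rewritten below into a <4 x s32> built by widening each s16 element.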
Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B,
                                                MachineRegisterInfo &MRI,
                                                Register Reg) const {
  if (!Subtarget.hasUnpackedD16VMem())
    return Reg;

  const LLT S16 = LLT::scalar(16);
  LLT StoreVT = MRI.getType(Reg);
  if (!StoreVT.isVector() || StoreVT.getElementType() != S16)
    return Reg;

  auto Unmerge = B.buildUnmerge(S16, Reg);

  SmallVector<Register, 4> WideRegs;
  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
    WideRegs.push_back(Unmerge.getReg(I));

  const LLT S32 = LLT::scalar(32);
  int NumElts = StoreVT.getNumElements();

  return B.buildMergeLikeInstr(LLT::fixed_vector(NumElts, S32), WideRegs)
      .getReg(0);
}

static std::pair<Register, unsigned>
getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
  int64_t Const;
  if (mi_match(Reg, MRI, m_ICst(Const)))
    return std::pair(Register(), Const);

  Register Base;
  if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const))))
    return std::pair(Base, Const);

  // TODO: Handle G_OR used for add case
  return std::pair(Reg, 0);
}

std::pair<Register, unsigned>
AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B,
                                           Register OrigOffset) const {
  const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(Subtarget);
  Register BaseReg;
  unsigned ImmOffset;
  const LLT S32 = LLT::scalar(32);

  // TODO: Use AMDGPU::getBaseWithConstantOffset() instead.
  std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(),
                                                           OrigOffset);

  unsigned C1 = 0;
  if (ImmOffset != 0) {
    // If the immediate value is too big for the immoffset field, keep only the
    // bits that would normally fit in the immoffset field. The remaining value
    // that is copied/added for the voffset field is a multiple of a large
    // power of 2, and it stands more chance of being CSEd with the copy/add
    // for another similar load/store.
    // However, do not do that rounding down if it yields a negative number, as
    // it appears to be illegal to have a negative offset in the vgpr, even if
    // adding the immediate offset makes it positive.
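    // E.g. if MaxImm were 4095, an incoming offset of 5000 would be split into
    // Overflow = 4096 for the voffset copy/add and ImmOffset = 904 for the
    // immediate field.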
    unsigned Overflow = ImmOffset & ~MaxImm;
    ImmOffset -= Overflow;
    if ((int32_t)Overflow < 0) {
      Overflow += ImmOffset;
      ImmOffset = 0;
    }

    C1 = ImmOffset;
    if (Overflow != 0) {
      if (!BaseReg)
        BaseReg = B.buildConstant(S32, Overflow).getReg(0);
      else {
        auto OverflowVal = B.buildConstant(S32, Overflow);
        BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
      }
    }
  }

  if (!BaseReg)
    BaseReg = B.buildConstant(S32, 0).getReg(0);

  return {BaseReg, C1};
}

bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg,
                                        Register SrcReg) const {
  MachineRegisterInfo &MRI = *B.getMRI();
  LLT SrcTy = MRI.getType(SrcReg);
  if (SrcTy.getSizeInBits() == 32) {
    // Use a v_mov_b32 here to make the exec dependency explicit.
    B.buildInstr(AMDGPU::V_MOV_B32_e32)
        .addDef(DstReg)
        .addUse(SrcReg);
    return constrainGenericRegister(DstReg, AMDGPU::VGPR_32RegClass, MRI) &&
           constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI);
  }

  Register TmpReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register TmpReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  B.buildInstr(AMDGPU::V_MOV_B32_e32)
      .addDef(TmpReg0)
      .addUse(SrcReg, 0, AMDGPU::sub0);
  B.buildInstr(AMDGPU::V_MOV_B32_e32)
      .addDef(TmpReg1)
      .addUse(SrcReg, 0, AMDGPU::sub1);
  B.buildInstr(AMDGPU::REG_SEQUENCE)
      .addDef(DstReg)
      .addUse(TmpReg0)
      .addImm(AMDGPU::sub0)
      .addUse(TmpReg1)
      .addImm(AMDGPU::sub1);

  return constrainGenericRegister(SrcReg, AMDGPU::SReg_64RegClass, MRI) &&
         constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI);
}

/// Utility function for pushing dynamic vector indexes with a constant offset
/// into waterfall loops.
static void reinsertVectorIndexAdd(MachineIRBuilder &B,
                                   MachineInstr &IdxUseInstr,
                                   unsigned OpIdx,
                                   unsigned ConstOffset) {
  MachineRegisterInfo &MRI = *B.getMRI();
  const LLT S32 = LLT::scalar(32);
  Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg();
  B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator());

  auto MaterializedOffset = B.buildConstant(S32, ConstOffset);

  auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset);
  MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank);
  MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank);
  IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0));
}

/// Implement extending a 32-bit value to a 64-bit value. \p Lo32Reg is the
/// original 32-bit source value (to be inserted in the low part of the
/// combined 64-bit result), and \p Hi32Reg is the high half of the combined
/// 64-bit value.
static void extendLow32IntoHigh32(MachineIRBuilder &B,
                                  Register Hi32Reg, Register Lo32Reg,
                                  unsigned ExtOpc,
                                  const RegisterBank &RegBank,
                                  bool IsBooleanSrc = false) {
  if (ExtOpc == AMDGPU::G_ZEXT) {
    B.buildConstant(Hi32Reg, 0);
  } else if (ExtOpc == AMDGPU::G_SEXT) {
    if (IsBooleanSrc) {
      // If we know the original source was an s1, the high half is the same as
      // the low.
      B.buildCopy(Hi32Reg, Lo32Reg);
    } else {
      // Replicate sign bit from 32-bit extended part.
      auto ShiftAmt = B.buildConstant(LLT::scalar(32), 31);
      B.getMRI()->setRegBank(ShiftAmt.getReg(0), RegBank);
      B.buildAShr(Hi32Reg, Lo32Reg, ShiftAmt);
    }
  } else {
    assert(ExtOpc == AMDGPU::G_ANYEXT && "not an integer extension");
    B.buildUndef(Hi32Reg);
  }
}

bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect(
    MachineIRBuilder &B, MachineInstr &MI,
    const OperandsMapper &OpdMapper) const {
  MachineRegisterInfo &MRI = *B.getMRI();

  Register VecReg = MI.getOperand(1).getReg();
  Register Idx = MI.getOperand(2).getReg();

  const RegisterBank &IdxBank =
      *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;

  bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;

  LLT VecTy = MRI.getType(VecReg);
  unsigned EltSize = VecTy.getScalarSizeInBits();
  unsigned NumElem = VecTy.getNumElements();

  if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
                                                  IsDivergentIdx, &Subtarget))
    return false;

  LLT S32 = LLT::scalar(32);

  const RegisterBank &DstBank =
      *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
  const RegisterBank &SrcBank =
      *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;

  const RegisterBank &CCBank =
      (DstBank == AMDGPU::SGPRRegBank &&
       SrcBank == AMDGPU::SGPRRegBank &&
       IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
                                       : AMDGPU::VCCRegBank;
  LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);

  if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
    Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
    MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
  }

  LLT EltTy = VecTy.getScalarType();
  SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
  unsigned NumLanes = DstRegs.size();
  if (!NumLanes)
    NumLanes = 1;
  else
    EltTy = MRI.getType(DstRegs[0]);

  auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
  SmallVector<Register, 2> Res(NumLanes);
  for (unsigned L = 0; L < NumLanes; ++L)
    Res[L] = UnmergeToEltTy.getReg(L);

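  // Expand the dynamic extract into a compare + select chain: starting from
  // element 0, compare the index against each constant element index and
  // select that element's lanes when it matches.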
  for (unsigned I = 1; I < NumElem; ++I) {
    auto IC = B.buildConstant(S32, I);
    MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
    auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
    MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);

    for (unsigned L = 0; L < NumLanes; ++L) {
      auto S = B.buildSelect(EltTy, Cmp,
                             UnmergeToEltTy.getReg(I * NumLanes + L), Res[L]);

      for (unsigned N : { 0, 2, 3 })
        MRI.setRegBank(S->getOperand(N).getReg(), DstBank);

      Res[L] = S->getOperand(0).getReg();
    }
  }

  for (unsigned L = 0; L < NumLanes; ++L) {
    Register DstReg = (NumLanes == 1) ? MI.getOperand(0).getReg() : DstRegs[L];
    B.buildCopy(DstReg, Res[L]);
    MRI.setRegBank(DstReg, DstBank);
  }

  MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
  MI.eraseFromParent();

  return true;
}

// Insert a cross regbank copy for a register if it already has a bank that
// differs from the one we want to set.
static Register constrainRegToBank(MachineRegisterInfo &MRI,
                                   MachineIRBuilder &B, Register &Reg,
                                   const RegisterBank &Bank) {
  const RegisterBank *CurrBank = MRI.getRegBankOrNull(Reg);
  if (CurrBank && *CurrBank != Bank) {
    Register Copy = B.buildCopy(MRI.getType(Reg), Reg).getReg(0);
    MRI.setRegBank(Copy, Bank);
    return Copy;
  }

  MRI.setRegBank(Reg, Bank);
  return Reg;
}

bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(
    MachineIRBuilder &B, MachineInstr &MI,
    const OperandsMapper &OpdMapper) const {

  MachineRegisterInfo &MRI = *B.getMRI();
  Register VecReg = MI.getOperand(1).getReg();
  Register Idx = MI.getOperand(3).getReg();

  const RegisterBank &IdxBank =
      *OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;

  bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;

  LLT VecTy = MRI.getType(VecReg);
  unsigned EltSize = VecTy.getScalarSizeInBits();
  unsigned NumElem = VecTy.getNumElements();

  if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
                                                  IsDivergentIdx, &Subtarget))
    return false;

  LLT S32 = LLT::scalar(32);

  const RegisterBank &DstBank =
      *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
  const RegisterBank &SrcBank =
      *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
  const RegisterBank &InsBank =
      *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;

  const RegisterBank &CCBank =
      (DstBank == AMDGPU::SGPRRegBank &&
       SrcBank == AMDGPU::SGPRRegBank &&
       InsBank == AMDGPU::SGPRRegBank &&
       IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
                                       : AMDGPU::VCCRegBank;
  LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);

  if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
    Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
    MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
  }

  LLT EltTy = VecTy.getScalarType();
  SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
  unsigned NumLanes = InsRegs.size();
  if (!NumLanes) {
    NumLanes = 1;
    InsRegs.push_back(MI.getOperand(2).getReg());
  } else {
    EltTy = MRI.getType(InsRegs[0]);
  }

  auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
  SmallVector<Register, 16> Ops(NumElem * NumLanes);

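  // Expand the dynamic insert into a compare + select per element: each result
  // element takes the new value when the index matches its position, and the
  // original vector element otherwise.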
  for (unsigned I = 0; I < NumElem; ++I) {
    auto IC = B.buildConstant(S32, I);
    MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
    auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
    MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);

    for (unsigned L = 0; L < NumLanes; ++L) {
      Register Op0 = constrainRegToBank(MRI, B, InsRegs[L], DstBank);
      Register Op1 = UnmergeToEltTy.getReg(I * NumLanes + L);
      Op1 = constrainRegToBank(MRI, B, Op1, DstBank);

      Register Select = B.buildSelect(EltTy, Cmp, Op0, Op1).getReg(0);
      MRI.setRegBank(Select, DstBank);

      Ops[I * NumLanes + L] = Select;
    }
  }

  LLT MergeTy = LLT::fixed_vector(Ops.size(), EltTy);
  if (MergeTy == MRI.getType(MI.getOperand(0).getReg())) {
    B.buildBuildVector(MI.getOperand(0), Ops);
  } else {
    auto Vec = B.buildBuildVector(MergeTy, Ops);
    MRI.setRegBank(Vec->getOperand(0).getReg(), DstBank);
    B.buildBitcast(MI.getOperand(0).getReg(), Vec);
  }

  MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
  MI.eraseFromParent();

  return true;
}

// Break s_mul_u64 into 32-bit vector operations.
void AMDGPURegisterBankInfo::applyMappingSMULU64(
    MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
  SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
  SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
  SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));

  // All inputs are SGPRs, nothing special to do.
  if (DefRegs.empty()) {
    assert(Src0Regs.empty() && Src1Regs.empty());
    applyDefaultMapping(OpdMapper);
    return;
  }

  assert(DefRegs.size() == 2);
  assert(Src0Regs.size() == Src1Regs.size() &&
         (Src0Regs.empty() || Src0Regs.size() == 2));

  MachineRegisterInfo &MRI = OpdMapper.getMRI();
  MachineInstr &MI = OpdMapper.getMI();
  Register DstReg = MI.getOperand(0).getReg();
  LLT HalfTy = LLT::scalar(32);

  // Depending on where the source registers came from, the generic code may
  // have decided to split the inputs already or not. If not, we still need to
  // extract the values.

  if (Src0Regs.empty())
    split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
  else
    setRegsToType(MRI, Src0Regs, HalfTy);

  if (Src1Regs.empty())
    split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
  else
    setRegsToType(MRI, Src1Regs, HalfTy);

  setRegsToType(MRI, DefRegs, HalfTy);

  // The multiplication is done as follows:
  //
  //                            Op1H  Op1L
  //                          * Op0H  Op0L
  //                   --------------------
  //                   Op1H*Op0L  Op1L*Op0L
  //        + Op1H*Op0H  Op1L*Op0H
  //   -----------------------------------------
  //   (Op1H*Op0L + Op1L*Op0H + carry)  Op1L*Op0L
  //
  // We drop Op1H*Op0H because the result of the multiplication is a 64-bit
  // value and that would overflow.
  // The low 32-bit value is Op1L*Op0L.
  // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from
  // Op1L*Op0L).

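  // In the code below, Hi is the carry (the high half of Op0L*Op1L from
  // G_UMULH), MulLoHi is Op0L*Op1H, and MulHiLo is Op0H*Op1L; DefRegs[1] sums
  // all three, while DefRegs[0] is the plain low product.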
  ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::VGPRRegBank);

  Register Hi = B.buildUMulH(HalfTy, Src0Regs[0], Src1Regs[0]).getReg(0);
  Register MulLoHi = B.buildMul(HalfTy, Src0Regs[0], Src1Regs[1]).getReg(0);
  Register Add = B.buildAdd(HalfTy, Hi, MulLoHi).getReg(0);
  Register MulHiLo = B.buildMul(HalfTy, Src0Regs[1], Src1Regs[0]).getReg(0);
  B.buildAdd(DefRegs[1], Add, MulHiLo);
  B.buildMul(DefRegs[0], Src0Regs[0], Src1Regs[0]);

  MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
  MI.eraseFromParent();
}

void AMDGPURegisterBankInfo::applyMappingImpl(
    MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
  MachineInstr &MI = OpdMapper.getMI();
  B.setInstrAndDebugLoc(MI);
  unsigned Opc = MI.getOpcode();
  MachineRegisterInfo &MRI = OpdMapper.getMRI();
  switch (Opc) {
  case AMDGPU::G_CONSTANT:
  case AMDGPU::G_IMPLICIT_DEF: {
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);
    if (DstTy != LLT::scalar(1))
      break;

    const RegisterBank *DstBank =
        OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
    if (DstBank == &AMDGPU::VCCRegBank)
      break;
    SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0));
    if (DefRegs.empty())
      DefRegs.push_back(DstReg);

    B.setInsertPt(*MI.getParent(), ++MI.getIterator());

    Register NewDstReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
    LLVMContext &Ctx = B.getMF().getFunction().getContext();

    MI.getOperand(0).setReg(NewDstReg);
    if (Opc != AMDGPU::G_IMPLICIT_DEF) {
      uint64_t ConstVal = MI.getOperand(1).getCImm()->getZExtValue();
      MI.getOperand(1).setCImm(
          ConstantInt::get(IntegerType::getInt32Ty(Ctx), ConstVal));
    }

    MRI.setRegBank(NewDstReg, *DstBank);
    B.buildTrunc(DefRegs[0], NewDstReg);
    return;
  }
  case AMDGPU::G_PHI: {
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);
    if (DstTy != LLT::scalar(1))
      break;

    const LLT S32 = LLT::scalar(32);
    const RegisterBank *DstBank =
        OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
    if (DstBank == &AMDGPU::VCCRegBank) {
      applyDefaultMapping(OpdMapper);
      // The standard handling only considers the result register bank for
      // phis. For VCC, blindly inserting a copy when the phi is lowered will
      // produce an invalid copy. We can only copy with some kind of compare to
      // get a vector boolean result. Insert a register bank copy that will be
      // correctly lowered to a compare.
      for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
        Register SrcReg = MI.getOperand(I).getReg();
        const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);

        if (SrcBank != &AMDGPU::VCCRegBank) {
          MachineBasicBlock *SrcMBB = MI.getOperand(I + 1).getMBB();
          B.setInsertPt(*SrcMBB, SrcMBB->getFirstTerminator());

          auto Copy = B.buildCopy(LLT::scalar(1), SrcReg);
          MRI.setRegBank(Copy.getReg(0), AMDGPU::VCCRegBank);
          MI.getOperand(I).setReg(Copy.getReg(0));
        }
      }

      return;
    }

    // Phi handling is strange and only considers the bank of the destination.
    substituteSimpleCopyRegs(OpdMapper, 0);

    // Promote SGPR/VGPR booleans to s32.
    ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank);
    B.setInsertPt(B.getMBB(), MI);
    LegalizerHelper Helper(B.getMF(), ApplyBank, B);

    if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
      llvm_unreachable("widen scalar should have succeeded");

    return;
  }
  case AMDGPU::G_FCMP:
    if (!Subtarget.hasSALUFloatInsts())
      break;
    [[fallthrough]];
  case AMDGPU::G_ICMP:
  case AMDGPU::G_UADDO:
  case AMDGPU::G_USUBO:
  case AMDGPU::G_UADDE:
  case AMDGPU::G_SADDE:
  case AMDGPU::G_USUBE:
  case AMDGPU::G_SSUBE: {
    unsigned BoolDstOp =
        (Opc == AMDGPU::G_ICMP || Opc == AMDGPU::G_FCMP) ? 0 : 1;
    Register DstReg = MI.getOperand(BoolDstOp).getReg();

    const RegisterBank *DstBank =
        OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
    if (DstBank != &AMDGPU::SGPRRegBank)
      break;

    const bool HasCarryIn = MI.getNumOperands() == 5;

    // If this is a scalar compare, promote the result to s32, as the selection
    // will end up using a copy to a 32-bit vreg.
    const LLT S32 = LLT::scalar(32);
    Register NewDstReg = MRI.createGenericVirtualRegister(S32);
    MRI.setRegBank(NewDstReg, AMDGPU::SGPRRegBank);
    MI.getOperand(BoolDstOp).setReg(NewDstReg);

    if (HasCarryIn) {
      Register NewSrcReg = MRI.createGenericVirtualRegister(S32);
      MRI.setRegBank(NewSrcReg, AMDGPU::SGPRRegBank);
      B.buildZExt(NewSrcReg, MI.getOperand(4).getReg());
      MI.getOperand(4).setReg(NewSrcReg);
    }

    MachineBasicBlock *MBB = MI.getParent();
    B.setInsertPt(*MBB, std::next(MI.getIterator()));

    // If we had a constrained VCC result register, a copy was inserted to VCC
    // from SGPR.
    SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0));
    if (DefRegs.empty())
      DefRegs.push_back(DstReg);
    B.buildTrunc(DefRegs[0], NewDstReg);
    return;
  }
  case AMDGPU::G_SELECT: {
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);

    SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(1));
    if (CondRegs.empty())
      CondRegs.push_back(MI.getOperand(1).getReg());
    else {
      assert(CondRegs.size() == 1);
    }

    const RegisterBank *CondBank = getRegBank(CondRegs[0], MRI, *TRI);
    if (CondBank == &AMDGPU::SGPRRegBank) {
      const LLT S32 = LLT::scalar(32);
      Register NewCondReg = MRI.createGenericVirtualRegister(S32);
      MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);

      MI.getOperand(1).setReg(NewCondReg);
      B.buildZExt(NewCondReg, CondRegs[0]);
    }

    if (DstTy.getSizeInBits() != 64)
      break;

    LLT HalfTy = getHalfSizedType(DstTy);

    SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
    SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
    SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3));

    // All inputs are SGPRs, nothing special to do.
    if (DefRegs.empty()) {
      assert(Src1Regs.empty() && Src2Regs.empty());
      break;
    }

    if (Src1Regs.empty())
      split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
    else {
      setRegsToType(MRI, Src1Regs, HalfTy);
    }

    if (Src2Regs.empty())
      split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg());
    else
      setRegsToType(MRI, Src2Regs, HalfTy);

    setRegsToType(MRI, DefRegs, HalfTy);

    auto Flags = MI.getFlags();
    B.buildSelect(DefRegs[0], CondRegs[0], Src1Regs[0], Src2Regs[0], Flags);
    B.buildSelect(DefRegs[1], CondRegs[0], Src1Regs[1], Src2Regs[1], Flags);

    MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
    MI.eraseFromParent();
    return;
  }
  case AMDGPU::G_BRCOND: {
    Register CondReg = MI.getOperand(0).getReg();
    // FIXME: Should use legalizer helper, but should change bool ext type.
    const RegisterBank *CondBank =
        OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;

    if (CondBank == &AMDGPU::SGPRRegBank) {
      const LLT S32 = LLT::scalar(32);
      Register NewCondReg = MRI.createGenericVirtualRegister(S32);
      MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);

      MI.getOperand(0).setReg(NewCondReg);
      B.buildZExt(NewCondReg, CondReg);
      return;
    }

    break;
  }
  case AMDGPU::G_AND:
  case AMDGPU::G_OR:
  case AMDGPU::G_XOR: {
    // 64-bit G_AND/G_OR/G_XOR are only available on the SALU, so split into
    // two 32-bit ops if there is a VGPR input.
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);

    const RegisterBank *DstBank =
        OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;

    if (DstTy.getSizeInBits() == 1) {
      if (DstBank == &AMDGPU::VCCRegBank)
        break;

      MachineFunction *MF = MI.getParent()->getParent();
      ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank);
      LegalizerHelper Helper(*MF, ApplyBank, B);

      if (Helper.widenScalar(MI, 0, LLT::scalar(32)) !=
          LegalizerHelper::Legalized)
        llvm_unreachable("widen scalar should have succeeded");
      return;
    }

    if (DstTy.getSizeInBits() == 16 && DstBank == &AMDGPU::SGPRRegBank) {
      const LLT S32 = LLT::scalar(32);
      MachineBasicBlock *MBB = MI.getParent();
      MachineFunction *MF = MBB->getParent();
      ApplyRegBankMapping ApplySALU(B, *this, MRI, &AMDGPU::SGPRRegBank);
      LegalizerHelper Helper(*MF, ApplySALU, B);
      // Widen to S32, but handle `G_XOR x, -1` differently. Legalizer widening
      // will use a G_ANYEXT to extend the -1 which prevents matching G_XOR -1
      // as "not".
      if (MI.getOpcode() == AMDGPU::G_XOR &&
          mi_match(MI.getOperand(2).getReg(), MRI, m_SpecificICstOrSplat(-1))) {
        Helper.widenScalarSrc(MI, S32, 1, AMDGPU::G_ANYEXT);
        Helper.widenScalarSrc(MI, S32, 2, AMDGPU::G_SEXT);
        Helper.widenScalarDst(MI, S32);
      } else {
        if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
          llvm_unreachable("widen scalar should have succeeded");
      }
      return;
    }

    if (DstTy.getSizeInBits() != 64)
      break;

    LLT HalfTy = getHalfSizedType(DstTy);
    SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
    SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
    SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));

    // All inputs are SGPRs, nothing special to do.
    if (DefRegs.empty()) {
      assert(Src0Regs.empty() && Src1Regs.empty());
      break;
    }

    assert(DefRegs.size() == 2);
    assert(Src0Regs.size() == Src1Regs.size() &&
           (Src0Regs.empty() || Src0Regs.size() == 2));

    // Depending on where the source registers came from, the generic code may
    // have decided to split the inputs already or not. If not, we still need to
    // extract the values.

    if (Src0Regs.empty())
      split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
    else
      setRegsToType(MRI, Src0Regs, HalfTy);

    if (Src1Regs.empty())
      split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
    else
      setRegsToType(MRI, Src1Regs, HalfTy);

    setRegsToType(MRI, DefRegs, HalfTy);

    auto Flags = MI.getFlags();
    B.buildInstr(Opc, {DefRegs[0]}, {Src0Regs[0], Src1Regs[0]}, Flags);
    B.buildInstr(Opc, {DefRegs[1]}, {Src0Regs[1], Src1Regs[1]}, Flags);

    MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
    MI.eraseFromParent();
    return;
  }
  case AMDGPU::G_ABS: {
    Register SrcReg = MI.getOperand(1).getReg();
    const RegisterBank *SrcBank = MRI.getRegBankOrNull(SrcReg);

    // There is no VALU abs instruction so we need to replace it with a sub and
    // max combination.
    if (SrcBank && SrcBank == &AMDGPU::VGPRRegBank) {
      MachineFunction *MF = MI.getParent()->getParent();
      ApplyRegBankMapping Apply(B, *this, MRI, &AMDGPU::VGPRRegBank);
      LegalizerHelper Helper(*MF, Apply, B);

      if (Helper.lowerAbsToMaxNeg(MI) != LegalizerHelper::Legalized)
        llvm_unreachable("lowerAbsToMaxNeg should have succeeded");
      return;
    }
    [[fallthrough]];
  }
  case AMDGPU::G_ADD:
  case AMDGPU::G_SUB:
  case AMDGPU::G_MUL:
  case AMDGPU::G_SHL:
  case AMDGPU::G_LSHR:
  case AMDGPU::G_ASHR:
  case AMDGPU::G_SMIN:
  case AMDGPU::G_SMAX:
  case AMDGPU::G_UMIN:
  case AMDGPU::G_UMAX: {
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);

    // Special case for s_mul_u64: there is no vector equivalent, so it has to
    // be broken down into 32-bit vector multiplications.
    if (Opc == AMDGPU::G_MUL && DstTy.getSizeInBits() == 64) {
      applyMappingSMULU64(B, OpdMapper);
      return;
    }

    // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
    // Packed 16-bit operations need to be scalarized and promoted.
    if (DstTy != LLT::scalar(16) && DstTy != LLT::fixed_vector(2, 16))
      break;

    const RegisterBank *DstBank =
        OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
    if (DstBank == &AMDGPU::VGPRRegBank)
      break;

    const LLT S32 = LLT::scalar(32);
    MachineBasicBlock *MBB = MI.getParent();
    MachineFunction *MF = MBB->getParent();
    ApplyRegBankMapping ApplySALU(B, *this, MRI, &AMDGPU::SGPRRegBank);

    if (DstTy.isVector() && Opc == AMDGPU::G_ABS) {
      Register WideSrcLo, WideSrcHi;

      std::tie(WideSrcLo, WideSrcHi) =
          unpackV2S16ToS32(B, MI.getOperand(1).getReg(), TargetOpcode::G_SEXT);
      auto Lo = B.buildInstr(AMDGPU::G_ABS, {S32}, {WideSrcLo});
      auto Hi = B.buildInstr(AMDGPU::G_ABS, {S32}, {WideSrcHi});
      B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)});
      MI.eraseFromParent();
      return;
    }

    if (DstTy.isVector()) {
      Register WideSrc0Lo, WideSrc0Hi;
      Register WideSrc1Lo, WideSrc1Hi;

      unsigned ExtendOp = getExtendOp(MI.getOpcode());
      std::tie(WideSrc0Lo, WideSrc0Hi)
          = unpackV2S16ToS32(B, MI.getOperand(1).getReg(), ExtendOp);
      std::tie(WideSrc1Lo, WideSrc1Hi)
          = unpackV2S16ToS32(B, MI.getOperand(2).getReg(), ExtendOp);
      auto Lo = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Lo, WideSrc1Lo});
      auto Hi = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Hi, WideSrc1Hi});
      B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)});
      MI.eraseFromParent();
    } else {
      LegalizerHelper Helper(*MF, ApplySALU, B);

      if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
        llvm_unreachable("widen scalar should have succeeded");

      // FIXME: s16 shift amounts should be legal.
      if (Opc == AMDGPU::G_SHL || Opc == AMDGPU::G_LSHR ||
          Opc == AMDGPU::G_ASHR) {
        B.setInsertPt(*MBB, MI.getIterator());
        if (Helper.widenScalar(MI, 1, S32) != LegalizerHelper::Legalized)
          llvm_unreachable("widen scalar should have succeeded");
      }
    }

    return;
  }
  case AMDGPU::G_AMDGPU_S_MUL_I64_I32:
  case AMDGPU::G_AMDGPU_S_MUL_U64_U32: {
    // This is a special case for s_mul_u64. We use G_AMDGPU_S_MUL_I64_I32 to
    // represent an s_mul_u64 operation where the 33 higher bits are
    // sign-extended and G_AMDGPU_S_MUL_U64_U32 to represent an s_mul_u64
    // operation where the 32 higher bits are zero-extended. If scalar
    // registers are selected, both opcodes are lowered as s_mul_u64. If vector
    // registers are selected, both opcodes are lowered with a vector mad
    // instruction.

    // Insert basic copies.
    applyDefaultMapping(OpdMapper);

    Register DstReg = MI.getOperand(0).getReg();
    Register SrcReg0 = MI.getOperand(1).getReg();
    Register SrcReg1 = MI.getOperand(2).getReg();
    const LLT S32 = LLT::scalar(32);
    const LLT S64 = LLT::scalar(64);
    assert(MRI.getType(DstReg) == S64 && "This is a special case for s_mul_u64 "
                                         "that handles only 64-bit operands.");
    const RegisterBank *DstBank =
        OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;

    // Replace G_AMDGPU_S_MUL_I64_I32 and G_AMDGPU_S_MUL_U64_U32
    // with s_mul_u64 operation.
    if (DstBank == &AMDGPU::SGPRRegBank) {
      MI.setDesc(TII->get(AMDGPU::S_MUL_U64));
      MRI.setRegClass(DstReg, &AMDGPU::SGPR_64RegClass);
      MRI.setRegClass(SrcReg0, &AMDGPU::SGPR_64RegClass);
      MRI.setRegClass(SrcReg1, &AMDGPU::SGPR_64RegClass);
      return;
    }

    // Replace G_AMDGPU_S_MUL_I64_I32 and G_AMDGPU_S_MUL_U64_U32
    // with a vector mad.
    assert(MRI.getRegBankOrNull(DstReg) == &AMDGPU::VGPRRegBank &&
           "The destination operand should be in vector registers.");

    // Extract the lower subregister from the first operand.
    Register Op0L = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    MRI.setRegClass(Op0L, &AMDGPU::VGPR_32RegClass);
    MRI.setType(Op0L, S32);
    B.buildTrunc(Op0L, SrcReg0);

    // Extract the lower subregister from the second operand.
    Register Op1L = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    MRI.setRegClass(Op1L, &AMDGPU::VGPR_32RegClass);
    MRI.setType(Op1L, S32);
    B.buildTrunc(Op1L, SrcReg1);

    unsigned NewOpc = Opc == AMDGPU::G_AMDGPU_S_MUL_U64_U32
                          ? AMDGPU::G_AMDGPU_MAD_U64_U32
                          : AMDGPU::G_AMDGPU_MAD_I64_I32;
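
    // With a zero accumulator, the mad computes exactly the zero- or
    // sign-extended 64-bit product of the two low 32-bit halves.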
    Register Zero64 = B.buildConstant(S64, 0).getReg(0);
    MRI.setRegClass(Zero64, &AMDGPU::VReg_64RegClass);
    Register CarryOut = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
    MRI.setRegClass(CarryOut, &AMDGPU::VReg_64RegClass);
    B.buildInstr(NewOpc, {DstReg, CarryOut}, {Op0L, Op1L, Zero64});
    MI.eraseFromParent();
    return;
  }
  case AMDGPU::G_SEXT_INREG: {
    SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
    if (SrcRegs.empty())
      break; // Nothing to repair

    const LLT S32 = LLT::scalar(32);
    ApplyRegBankMapping O(B, *this, MRI, &AMDGPU::VGPRRegBank);

    // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs
    // we would need to further expand, and doesn't let us directly set the
    // result registers.
    SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));

    int Amt = MI.getOperand(2).getImm();
    if (Amt <= 32) {
      // Downstream users have expectations for the high bit behavior, so
      // freeze incoming undefined bits.
      if (Amt == 32) {
        // The low bits are unchanged.
        B.buildFreeze(DstRegs[0], SrcRegs[0]);
      } else {
        auto Freeze = B.buildFreeze(S32, SrcRegs[0]);
        // Extend in the low bits and propagate the sign bit to the high half.
        B.buildSExtInReg(DstRegs[0], Freeze, Amt);
      }

      B.buildAShr(DstRegs[1], DstRegs[0], B.buildConstant(S32, 31));
    } else {
      // The low bits are unchanged, and extend in the high bits.
      // No freeze required.
      B.buildCopy(DstRegs[0], SrcRegs[0]);
      B.buildSExtInReg(DstRegs[1], SrcRegs[1], Amt - 32);
    }
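    // E.g. for Amt = 8, DstRegs[0] = sext_inreg(freeze(lo), 8) and DstRegs[1]
    // replicates its sign bit; for Amt = 40, the low half is copied through
    // and DstRegs[1] = sext_inreg(hi, 8).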

    Register DstReg = MI.getOperand(0).getReg();
    MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
    MI.eraseFromParent();
    return;
  }
  case AMDGPU::G_CTPOP:
  case AMDGPU::G_BITREVERSE: {
    const RegisterBank *DstBank =
        OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
    if (DstBank == &AMDGPU::SGPRRegBank)
      break;

    Register SrcReg = MI.getOperand(1).getReg();
    const LLT S32 = LLT::scalar(32);
    LLT Ty = MRI.getType(SrcReg);
    if (Ty == S32)
      break;

    ApplyRegBankMapping ApplyVALU(B, *this, MRI, &AMDGPU::VGPRRegBank);

    MachineFunction &MF = B.getMF();
    LegalizerHelper Helper(MF, ApplyVALU, B);

    if (Helper.narrowScalar(MI, 1, S32) != LegalizerHelper::Legalized)
      llvm_unreachable("narrowScalar should have succeeded");
    return;
  }
  case AMDGPU::G_AMDGPU_FFBH_U32:
  case AMDGPU::G_AMDGPU_FFBL_B32:
  case AMDGPU::G_CTLZ_ZERO_UNDEF:
  case AMDGPU::G_CTTZ_ZERO_UNDEF: {
    const RegisterBank *DstBank =
        OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
    if (DstBank == &AMDGPU::SGPRRegBank)
      break;

    Register SrcReg = MI.getOperand(1).getReg();
    const LLT S32 = LLT::scalar(32);
    LLT Ty = MRI.getType(SrcReg);
    if (Ty == S32)
      break;

    // We can narrow this more efficiently than Helper can by using ffbh/ffbl
    // which return -1 when the input is zero:
    // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
    // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
    // (ffbh hi:lo) -> (umin (ffbh hi), (uaddsat (ffbh lo), 32))
    // (ffbl hi:lo) -> (umin (uaddsat (ffbl hi), 32), (ffbl lo))
    ApplyRegBankMapping ApplyVALU(B, *this, MRI, &AMDGPU::VGPRRegBank);
    SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
    unsigned NewOpc = Opc == AMDGPU::G_CTLZ_ZERO_UNDEF
                          ? (unsigned)AMDGPU::G_AMDGPU_FFBH_U32
                          : Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
                                ? (unsigned)AMDGPU::G_AMDGPU_FFBL_B32
                                : Opc;
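    // For ffbh (counting from the MSB), inspect the high half first
    // (SrcRegs[1]); for ffbl, the low half. Idx selects the half whose result
    // is used unmodified; the other half's result is offset by 32 before
    // taking the minimum.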
    unsigned Idx = NewOpc == AMDGPU::G_AMDGPU_FFBH_U32;
    auto X = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx]});
    auto Y = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx ^ 1]});
    unsigned AddOpc =
        Opc == AMDGPU::G_CTLZ_ZERO_UNDEF || Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
            ? AMDGPU::G_ADD
            : AMDGPU::G_UADDSAT;
    Y = B.buildInstr(AddOpc, {S32}, {Y, B.buildConstant(S32, 32)});
    Register DstReg = MI.getOperand(0).getReg();
    B.buildUMin(DstReg, X, Y);
    MI.eraseFromParent();
    return;
  }
  case AMDGPU::G_SEXT:
  case AMDGPU::G_ZEXT:
  case AMDGPU::G_ANYEXT: {
    Register SrcReg = MI.getOperand(1).getReg();
    LLT SrcTy = MRI.getType(SrcReg);
    const bool Signed = Opc == AMDGPU::G_SEXT;

    assert(OpdMapper.getVRegs(1).empty());

    const RegisterBank *SrcBank =
        OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;

    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);
    if (DstTy.isScalar() &&
        SrcBank != &AMDGPU::SGPRRegBank &&
        SrcBank != &AMDGPU::VCCRegBank &&
        // FIXME: Should handle any type that rounds to s64 when irregular
        // breakdowns are supported.
        DstTy.getSizeInBits() == 64 &&
        SrcTy.getSizeInBits() <= 32) {
      SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));

      // Extend to 32-bit, and then extend the low half.
      if (Signed) {
        // TODO: Should really be buildSExtOrCopy
        B.buildSExtOrTrunc(DefRegs[0], SrcReg);
      } else if (Opc == AMDGPU::G_ZEXT) {
        B.buildZExtOrTrunc(DefRegs[0], SrcReg);
      } else {
        B.buildAnyExtOrTrunc(DefRegs[0], SrcReg);
      }

      extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank);
      MRI.setRegBank(DstReg, *SrcBank);
      MI.eraseFromParent();
      return;
    }

    if (SrcTy != LLT::scalar(1))
      return;

    // It is not legal to have a legalization artifact with a VCC source.
    // Rather than introducing a copy, directly insert the select that such a
    // copy would have to be selected to anyway.
2804 if (SrcBank == &AMDGPU::VCCRegBank) {
2805 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2806
2807 const RegisterBank *DstBank = &AMDGPU::VGPRRegBank;
2808
2809 unsigned DstSize = DstTy.getSizeInBits();
2810 // 64-bit select is SGPR only
2811 const bool UseSel64 = DstSize > 32 &&
2812 SrcBank->getID() == AMDGPU::SGPRRegBankID;
2813
2814 // TODO: Should s16 select be legal?
2815 LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32);
2816 auto True = B.buildConstant(SelType, Signed ? -1 : 1);
2817 auto False = B.buildConstant(SelType, 0);
2818
2819 MRI.setRegBank(True.getReg(0), *DstBank);
2820 MRI.setRegBank(False.getReg(0), *DstBank);
2821 MRI.setRegBank(DstReg, *DstBank);
2822
2823 if (DstSize > 32) {
2824 B.buildSelect(DefRegs[0], SrcReg, True, False);
2825 extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank, true);
2826 } else if (DstSize < 32) {
2827 auto Sel = B.buildSelect(SelType, SrcReg, True, False);
2828 MRI.setRegBank(Sel.getReg(0), *DstBank);
2829 B.buildTrunc(DstReg, Sel);
2830 } else {
2831 B.buildSelect(DstReg, SrcReg, True, False);
2832 }
2833
2834 MI.eraseFromParent();
2835 return;
2836 }
2837
2838 break;
2839 }
2840 case AMDGPU::G_EXTRACT_VECTOR_ELT: {
2841 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
2842
2843 assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty());
2844
2845 Register DstReg = MI.getOperand(0).getReg();
2846 Register SrcReg = MI.getOperand(1).getReg();
2847
2848 const LLT S32 = LLT::scalar(32);
2849 LLT DstTy = MRI.getType(DstReg);
2850 LLT SrcTy = MRI.getType(SrcReg);
2851
2852 if (foldExtractEltToCmpSelect(B, MI, OpdMapper))
2853 return;
2854
2855 const ValueMapping &DstMapping
2856 = OpdMapper.getInstrMapping().getOperandMapping(0);
2857 const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank;
2858 const RegisterBank *SrcBank =
2859 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2860 const RegisterBank *IdxBank =
2861 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2862
2863 Register BaseIdxReg;
2864 unsigned ConstOffset;
2865 std::tie(BaseIdxReg, ConstOffset) =
2866 AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(2).getReg());
2867
2868 // See if the index is an add of a constant which will be foldable by moving
2869 // the base register of the index later if this is going to be executed in a
2870 // waterfall loop. This is essentially to reassociate the add of a constant
2871 // with the readfirstlane.
2872 bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
2873 ConstOffset > 0 &&
2874 ConstOffset < SrcTy.getNumElements();
2875
2876 // Move the base register. We'll re-insert the add later.
2877 if (ShouldMoveIndexIntoLoop)
2878 MI.getOperand(2).setReg(BaseIdxReg);
2879
2880 // If this is a VGPR result only because the index was a VGPR result, the
2881 // actual indexing will be done on the SGPR source vector, which will
2882 // produce a scalar result. We need to copy to the VGPR result inside the
2883 // waterfall loop.
2884 const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank &&
2885 SrcBank == &AMDGPU::SGPRRegBank;
2886 if (DstRegs.empty()) {
2887 applyDefaultMapping(OpdMapper);
2888
2889 executeInWaterfallLoop(B, MI, {2});
2890
2891 if (NeedCopyToVGPR) {
2892 // We don't want a phi for this temporary reg.
2893 Register TmpReg = MRI.createGenericVirtualRegister(DstTy);
2894 MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank);
2895 MI.getOperand(0).setReg(TmpReg);
2896 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
2897
2898 // Use a v_mov_b32 here to make the exec dependency explicit.
2899 buildVCopy(B, DstReg, TmpReg);
2900 }
2901
2902 // Re-insert the constant offset add inside the waterfall loop.
2903 if (ShouldMoveIndexIntoLoop)
2904 reinsertVectorIndexAdd(B, MI, 2, ConstOffset);
2905
2906 return;
2907 }
2908
2909 assert(DstTy.getSizeInBits() == 64);
2910
2911 LLT Vec32 = LLT::fixed_vector(2 * SrcTy.getNumElements(), 32);
2912
2913 auto CastSrc = B.buildBitcast(Vec32, SrcReg);
2914 auto One = B.buildConstant(S32, 1);
2915
2916 MachineBasicBlock::iterator MII = MI.getIterator();
2917
2918 // Split the vector index into 32-bit pieces. Prepare to move all of the
2919 // new instructions into a waterfall loop if necessary.
2920 //
2921 // Don't put the bitcast or constant in the loop.
2922 MachineInstrSpan Span(MII, &B.getMBB());
2923
2924 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
2925 auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
2926 auto IdxHi = B.buildAdd(S32, IdxLo, One);
2927
2928 auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo);
2929 auto Extract1 = B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi);
2930
2931 MRI.setRegBank(DstReg, *DstBank);
2932 MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
2933 MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
2934 MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
2935 MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
2936
2937 SmallSet<Register, 4> OpsToWaterfall;
2938 if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) {
2939 MI.eraseFromParent();
2940 return;
2941 }
2942
2943 // Remove the original instruction to avoid potentially confusing the
2944 // waterfall loop logic.
2945 B.setInstr(*Span.begin());
2946 MI.eraseFromParent();
2947 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
2948 OpsToWaterfall);
2949
2950 if (NeedCopyToVGPR) {
2951 MachineBasicBlock *LoopBB = Extract1->getParent();
2952 Register TmpReg0 = MRI.createGenericVirtualRegister(S32);
2953 Register TmpReg1 = MRI.createGenericVirtualRegister(S32);
2954 MRI.setRegBank(TmpReg0, AMDGPU::SGPRRegBank);
2955 MRI.setRegBank(TmpReg1, AMDGPU::SGPRRegBank);
2956
2957 Extract0->getOperand(0).setReg(TmpReg0);
2958 Extract1->getOperand(0).setReg(TmpReg1);
2959
2960 B.setInsertPt(*LoopBB, ++Extract1->getIterator());
2961
2962 buildVCopy(B, DstRegs[0], TmpReg0);
2963 buildVCopy(B, DstRegs[1], TmpReg1);
2964 }
2965
2966 if (ShouldMoveIndexIntoLoop)
2967 reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
2968
2969 return;
2970 }
2971 case AMDGPU::G_INSERT_VECTOR_ELT: {
2972 SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
2973
2974 Register DstReg = MI.getOperand(0).getReg();
2975 LLT VecTy = MRI.getType(DstReg);
2976
2977 assert(OpdMapper.getVRegs(0).empty());
2978 assert(OpdMapper.getVRegs(3).empty());
2979
2980 if (substituteSimpleCopyRegs(OpdMapper, 1))
2981 MRI.setType(MI.getOperand(1).getReg(), VecTy);
2982
2983 if (foldInsertEltToCmpSelect(B, MI, OpdMapper))
2984 return;
2985
2986 const RegisterBank *IdxBank =
2987 OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
2988
2989 Register SrcReg = MI.getOperand(1).getReg();
2990 Register InsReg = MI.getOperand(2).getReg();
2991 LLT InsTy = MRI.getType(InsReg);
2992 (void)InsTy;
2993
2994 Register BaseIdxReg;
2995 unsigned ConstOffset;
2996 std::tie(BaseIdxReg, ConstOffset) =
2997 AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(3).getReg());
2998
2999     // See if the index is an add of a constant, which would be foldable by
3000     // moving just the base register of the index into the waterfall loop and
3001     // re-inserting the add there. This essentially reassociates the add of a
3002     // constant with the readfirstlane.
3003 bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
3004 ConstOffset > 0 &&
3005 ConstOffset < VecTy.getNumElements();
3006
3007 // Move the base register. We'll re-insert the add later.
3008 if (ShouldMoveIndexIntoLoop)
3009 MI.getOperand(3).setReg(BaseIdxReg);
3010
3011
3012 if (InsRegs.empty()) {
3013 executeInWaterfallLoop(B, MI, {3});
3014
3015 // Re-insert the constant offset add inside the waterfall loop.
3016 if (ShouldMoveIndexIntoLoop) {
3017 reinsertVectorIndexAdd(B, MI, 3, ConstOffset);
3018 }
3019
3020 return;
3021 }
3022
3023 assert(InsTy.getSizeInBits() == 64);
3024
3025 const LLT S32 = LLT::scalar(32);
3026 LLT Vec32 = LLT::fixed_vector(2 * VecTy.getNumElements(), 32);
3027
3028 auto CastSrc = B.buildBitcast(Vec32, SrcReg);
3029 auto One = B.buildConstant(S32, 1);
3030
3031 // Split the vector index into 32-bit pieces. Prepare to move all of the
3032 // new instructions into a waterfall loop if necessary.
3033 //
3034 // Don't put the bitcast or constant in the loop.
3035 MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB());
3036
3037 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
3038 auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
3039 auto IdxHi = B.buildAdd(S32, IdxLo, One);
3040
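    // The two 32-bit halves are inserted back-to-back; InsHi consumes InsLo's
    // result, so the pair forms a single def-use chain that can be moved into
    // the waterfall loop together.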
3041 auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo);
3042 auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi);
3043
3044 const RegisterBank *DstBank =
3045 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
3046 const RegisterBank *SrcBank =
3047 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
3048 const RegisterBank *InsSrcBank =
3049 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
3050
3051 MRI.setRegBank(InsReg, *InsSrcBank);
3052 MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
3053 MRI.setRegBank(InsLo.getReg(0), *DstBank);
3054 MRI.setRegBank(InsHi.getReg(0), *DstBank);
3055 MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
3056 MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
3057 MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
3058
3059
3060 SmallSet<Register, 4> OpsToWaterfall;
3061 if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) {
3062 B.setInsertPt(B.getMBB(), MI);
3063 B.buildBitcast(DstReg, InsHi);
3064 MI.eraseFromParent();
3065 return;
3066 }
3067
3068 B.setInstr(*Span.begin());
3069 MI.eraseFromParent();
3070
3071 // Figure out the point after the waterfall loop before mangling the control
3072 // flow.
3073 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
3074 OpsToWaterfall);
3075
3076 // The insertion point is now right after the original instruction.
3077 //
3078     // Keep the bitcast to the original vector type out of the loop. Doing this
3079     // saves an extra phi we don't need inside the loop.
3080 B.buildBitcast(DstReg, InsHi);
3081
3082 // Re-insert the constant offset add inside the waterfall loop.
3083 if (ShouldMoveIndexIntoLoop)
3084 reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
3085
3086 return;
3087 }
3088 case AMDGPU::G_AMDGPU_BUFFER_LOAD:
3089 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
3090 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
3091 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
3092 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
3093 case AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE:
3094 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE:
3095 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT_TFE:
3096 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE:
3097 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE_TFE:
3098 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
3099 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE:
3100 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
3101 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
3102 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
3103 case AMDGPU::G_AMDGPU_BUFFER_STORE:
3104 case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
3105 case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
3106 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
3107 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16:
3108 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
3109 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: {
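    // As mapped in getInstrMapping below, operand 1 is the rsrc and operand 4
    // the soffset; both must be uniform, so waterfall over them if necessary.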
3110 applyDefaultMapping(OpdMapper);
3111 executeInWaterfallLoop(B, MI, {1, 4});
3112 return;
3113 }
3114 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
3115 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
3116 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
3117 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
3118 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
3119 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
3120 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
3121 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
3122 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
3123 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
3124 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
3125 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
3126 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
3127 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
3128 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
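    // For the atomics, vdata_out and vdata_in occupy operands 0 and 1, so the
    // potentially non-uniform SGPR operands shift to 2 (rsrc) and 5 (soffset).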
3129 applyDefaultMapping(OpdMapper);
3130 executeInWaterfallLoop(B, MI, {2, 5});
3131 return;
3132 }
3133 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
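    // The extra cmp operand shifts the uniform operands once more, to 3 (rsrc)
    // and 6 (soffset).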
3134 applyDefaultMapping(OpdMapper);
3135 executeInWaterfallLoop(B, MI, {3, 6});
3136 return;
3137 }
3138 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD:
3139 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE:
3140 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE:
3141 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT:
3142 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT: {
3143 applyMappingSBufferLoad(B, OpdMapper);
3144 return;
3145 }
3146 case AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH:
3147 constrainOpWithReadfirstlane(B, MI, 0);
3148 constrainOpWithReadfirstlane(B, MI, 2);
3149 return;
3150 case AMDGPU::G_INTRINSIC:
3151 case AMDGPU::G_INTRINSIC_CONVERGENT: {
3152 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
3153 case Intrinsic::amdgcn_readlane: {
3154 substituteSimpleCopyRegs(OpdMapper, 2);
3155
3156 assert(OpdMapper.getVRegs(0).empty());
3157 assert(OpdMapper.getVRegs(3).empty());
3158
3159 // Make sure the index is an SGPR. It doesn't make sense to run this in a
3160 // waterfall loop, so assume it's a uniform value.
3161 constrainOpWithReadfirstlane(B, MI, 3); // Index
3162 return;
3163 }
3164 case Intrinsic::amdgcn_writelane: {
3165 assert(OpdMapper.getVRegs(0).empty());
3166 assert(OpdMapper.getVRegs(2).empty());
3167 assert(OpdMapper.getVRegs(3).empty());
3168
3169 substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val
3170 constrainOpWithReadfirstlane(B, MI, 2); // Source value
3171 constrainOpWithReadfirstlane(B, MI, 3); // Index
3172 return;
3173 }
3174 case Intrinsic::amdgcn_interp_p1:
3175 case Intrinsic::amdgcn_interp_p2:
3176 case Intrinsic::amdgcn_interp_mov:
3177 case Intrinsic::amdgcn_interp_p1_f16:
3178 case Intrinsic::amdgcn_interp_p2_f16:
3179 case Intrinsic::amdgcn_lds_param_load: {
3180 applyDefaultMapping(OpdMapper);
3181
3182 // Readlane for m0 value, which is always the last operand.
3183 // FIXME: Should this be a waterfall loop instead?
3184 constrainOpWithReadfirstlane(B, MI, MI.getNumOperands() - 1); // Index
3185 return;
3186 }
3187 case Intrinsic::amdgcn_interp_inreg_p10:
3188 case Intrinsic::amdgcn_interp_inreg_p2:
3189 case Intrinsic::amdgcn_interp_inreg_p10_f16:
3190 case Intrinsic::amdgcn_interp_inreg_p2_f16:
3191 case Intrinsic::amdgcn_interp_p10_rtz_f16:
3192 case Intrinsic::amdgcn_interp_p2_rtz_f16:
3193 case Intrinsic::amdgcn_permlane16_swap:
3194 case Intrinsic::amdgcn_permlane32_swap:
3195 applyDefaultMapping(OpdMapper);
3196 return;
3197 case Intrinsic::amdgcn_permlane16:
3198 case Intrinsic::amdgcn_permlanex16: {
3199 // Doing a waterfall loop over these wouldn't make any sense.
3200 substituteSimpleCopyRegs(OpdMapper, 2);
3201 substituteSimpleCopyRegs(OpdMapper, 3);
3202 constrainOpWithReadfirstlane(B, MI, 4);
3203 constrainOpWithReadfirstlane(B, MI, 5);
3204 return;
3205 }
3206 case Intrinsic::amdgcn_sbfe:
3207 applyMappingBFE(B, OpdMapper, true);
3208 return;
3209 case Intrinsic::amdgcn_ubfe:
3210 applyMappingBFE(B, OpdMapper, false);
3211 return;
3212 case Intrinsic::amdgcn_inverse_ballot:
3213 case Intrinsic::amdgcn_s_bitreplicate:
3214 case Intrinsic::amdgcn_s_quadmask:
3215 case Intrinsic::amdgcn_s_wqm:
3216 applyDefaultMapping(OpdMapper);
3217 constrainOpWithReadfirstlane(B, MI, 2); // Mask
3218 return;
3219 case Intrinsic::amdgcn_ballot:
3220 // Use default handling and insert copy to vcc source.
3221 break;
3222 }
3223 break;
3224 }
3225 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
3226 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
3227 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET:
3228 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
3229 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
3230 const AMDGPU::RsrcIntrinsic *RSrcIntrin =
3231 AMDGPU::lookupRsrcIntrinsic(AMDGPU::getIntrinsicID(MI));
3232 assert(RSrcIntrin && RSrcIntrin->IsImage);
3233 // Non-images can have complications from operands that allow both SGPR
3234 // and VGPR. For now it's too complicated to figure out the final opcode
3235 // to derive the register bank from the MCInstrDesc.
3236 applyMappingImage(B, MI, OpdMapper, RSrcIntrin->RsrcArg);
3237 return;
3238 }
3239 case AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY:
3240 case AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY:
3241 case AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY: {
3242 bool IsDualOrBVH8 =
3243 MI.getOpcode() == AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY ||
3244 MI.getOpcode() == AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY;
3245 unsigned NumMods = IsDualOrBVH8 ? 0 : 1; // Has A16 modifier
3246 unsigned LastRegOpIdx = MI.getNumExplicitOperands() - 1 - NumMods;
3247 applyDefaultMapping(OpdMapper);
3248 executeInWaterfallLoop(B, MI, {LastRegOpIdx});
3249 return;
3250 }
3251 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
3252 case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS: {
3253 auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
3254 switch (IntrID) {
3255 case Intrinsic::amdgcn_ds_ordered_add:
3256 case Intrinsic::amdgcn_ds_ordered_swap: {
3257 // This is only allowed to execute with 1 lane, so readfirstlane is safe.
3258 assert(OpdMapper.getVRegs(0).empty());
3259 substituteSimpleCopyRegs(OpdMapper, 3);
3260 constrainOpWithReadfirstlane(B, MI, 2); // M0
3261 return;
3262 }
3263 case Intrinsic::amdgcn_ds_gws_init:
3264 case Intrinsic::amdgcn_ds_gws_barrier:
3265 case Intrinsic::amdgcn_ds_gws_sema_br: {
3266       // Only the first lane executes, so readfirstlane is safe.
3267 substituteSimpleCopyRegs(OpdMapper, 1);
3268 constrainOpWithReadfirstlane(B, MI, 2); // M0
3269 return;
3270 }
3271 case Intrinsic::amdgcn_ds_gws_sema_v:
3272 case Intrinsic::amdgcn_ds_gws_sema_p:
3273 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
3274       // Only the first lane executes, so readfirstlane is safe.
3275 constrainOpWithReadfirstlane(B, MI, 1); // M0
3276 return;
3277 }
3278 case Intrinsic::amdgcn_ds_append:
3279 case Intrinsic::amdgcn_ds_consume: {
3280 constrainOpWithReadfirstlane(B, MI, 2); // M0
3281 return;
3282 }
3283 case Intrinsic::amdgcn_s_sendmsg:
3284 case Intrinsic::amdgcn_s_sendmsghalt: {
3285 // FIXME: Should this use a waterfall loop?
3286 constrainOpWithReadfirstlane(B, MI, 2); // M0
3287 return;
3288 }
3289 case Intrinsic::amdgcn_s_setreg: {
3290 constrainOpWithReadfirstlane(B, MI, 2);
3291 return;
3292 }
3293 case Intrinsic::amdgcn_s_ttracedata:
3294 constrainOpWithReadfirstlane(B, MI, 1); // M0
3295 return;
3296 case Intrinsic::amdgcn_raw_buffer_load_lds:
3297 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: {
3298 applyDefaultMapping(OpdMapper);
3299 constrainOpWithReadfirstlane(B, MI, 1); // rsrc
3300 constrainOpWithReadfirstlane(B, MI, 2); // M0
3301 constrainOpWithReadfirstlane(B, MI, 5); // soffset
3302 return;
3303 }
3304 case Intrinsic::amdgcn_struct_buffer_load_lds:
3305 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
3306 applyDefaultMapping(OpdMapper);
3307 constrainOpWithReadfirstlane(B, MI, 1); // rsrc
3308 constrainOpWithReadfirstlane(B, MI, 2); // M0
3309 constrainOpWithReadfirstlane(B, MI, 6); // soffset
3310 return;
3311 }
3312 case Intrinsic::amdgcn_load_to_lds:
3313 case Intrinsic::amdgcn_global_load_lds: {
3314 applyDefaultMapping(OpdMapper);
3315 constrainOpWithReadfirstlane(B, MI, 2);
3316 return;
3317 }
3318 case Intrinsic::amdgcn_lds_direct_load: {
3319 applyDefaultMapping(OpdMapper);
3320 // Readlane for m0 value, which is always the last operand.
3321 constrainOpWithReadfirstlane(B, MI, MI.getNumOperands() - 1); // Index
3322 return;
3323 }
3324 case Intrinsic::amdgcn_exp_row:
3325 applyDefaultMapping(OpdMapper);
3326 constrainOpWithReadfirstlane(B, MI, 8); // M0
3327 return;
3328 case Intrinsic::amdgcn_s_sleep_var:
3329 assert(OpdMapper.getVRegs(1).empty());
3330 constrainOpWithReadfirstlane(B, MI, 1);
3331 return;
3332 case Intrinsic::amdgcn_s_barrier_signal_var:
3333 constrainOpWithReadfirstlane(B, MI, 1);
3334 constrainOpWithReadfirstlane(B, MI, 2);
3335 return;
3336 case Intrinsic::amdgcn_s_get_barrier_state:
3337 case Intrinsic::amdgcn_s_get_named_barrier_state: {
3338 constrainOpWithReadfirstlane(B, MI, 2);
3339 return;
3340 }
3341 case Intrinsic::amdgcn_s_prefetch_data: {
3342 Register PtrReg = MI.getOperand(1).getReg();
3343 unsigned AS = MRI.getType(PtrReg).getAddressSpace();
3344 if (AMDGPU::isFlatGlobalAddrSpace(AS)) {
3345 constrainOpWithReadfirstlane(B, MI, 1);
3346 constrainOpWithReadfirstlane(B, MI, 2);
3347 } else
3348 MI.eraseFromParent();
3349 return;
3350 }
3351 case Intrinsic::amdgcn_tensor_load_to_lds:
3352 case Intrinsic::amdgcn_tensor_store_from_lds: {
3353 constrainOpWithReadfirstlane(B, MI, 1);
3354 constrainOpWithReadfirstlane(B, MI, 2);
3355 constrainOpWithReadfirstlane(B, MI, 3);
3356 constrainOpWithReadfirstlane(B, MI, 4);
3357 return;
3358 }
3359 case Intrinsic::amdgcn_tensor_load_to_lds_d2:
3360 case Intrinsic::amdgcn_tensor_store_from_lds_d2: {
3361 constrainOpWithReadfirstlane(B, MI, 1);
3362 constrainOpWithReadfirstlane(B, MI, 2);
3363 return;
3364 }
3365 default: {
3366 if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
3367 AMDGPU::lookupRsrcIntrinsic(IntrID)) {
3368 // Non-images can have complications from operands that allow both SGPR
3369 // and VGPR. For now it's too complicated to figure out the final opcode
3370 // to derive the register bank from the MCInstrDesc.
3371 if (RSrcIntrin->IsImage) {
3372 applyMappingImage(B, MI, OpdMapper, RSrcIntrin->RsrcArg);
3373 return;
3374 }
3375 }
3376
3377 break;
3378 }
3379 }
3380 break;
3381 }
3382 case AMDGPU::G_SI_CALL: {
3383 // Use a set to avoid extra readfirstlanes in the case where multiple
3384 // operands are the same register.
3385 SmallSet<Register, 4> SGPROperandRegs;
3386
3387 if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, {1}))
3388 break;
3389
3390     // Move all copies to physical SGPRs that are used by the call instruction
3391     // into the loop block. Search backwards from the call for these copies,
3392     // stopping at the ADJCALLSTACKUP.
3393 unsigned FrameSetupOpcode = AMDGPU::ADJCALLSTACKUP;
3394 unsigned FrameDestroyOpcode = AMDGPU::ADJCALLSTACKDOWN;
3395
3396 // Move all non-copies before the copies, so that a complete range can be
3397 // moved into the waterfall loop.
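    //
    // A hypothetical layout before the shuffle (instructions invented for
    // illustration):
    //   ADJCALLSTACKUP ...
    //   %x = G_FOO ...       ; non-copy, hoisted above the first copy
    //   $sgpr4 = COPY %arg0  ; argument copy
    //   %y = G_BAR ...       ; non-copy, hoisted above the first copy
    //   $sgpr5 = COPY %arg1  ; argument copy
    //   G_SI_CALL ...
    // After hoisting, [first copy, call] is one contiguous range that can be
    // spliced into the waterfall loop.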
3398 SmallVector<MachineInstr *, 4> NonCopyInstrs;
3399 // Count of NonCopyInstrs found until the current LastCopy.
3400 unsigned NonCopyInstrsLen = 0;
3401 MachineBasicBlock::iterator Start(&MI);
3402 MachineBasicBlock::iterator LastCopy = Start;
3403 MachineBasicBlock *MBB = MI.getParent();
3404 const SIMachineFunctionInfo *Info =
3405 MBB->getParent()->getInfo<SIMachineFunctionInfo>();
3406 while (Start->getOpcode() != FrameSetupOpcode) {
3407 --Start;
3408 bool IsCopy = false;
3409 if (Start->getOpcode() == AMDGPU::COPY) {
3410 auto &Dst = Start->getOperand(0);
3411 if (Dst.isReg()) {
3412 Register Reg = Dst.getReg();
3413 if (Reg.isPhysical() && MI.readsRegister(Reg, TRI)) {
3414 IsCopy = true;
3415 } else {
3416 // Also move the copy from the scratch rsrc descriptor into the loop
3417 // to allow it to be optimized away.
3418 auto &Src = Start->getOperand(1);
3419 if (Src.isReg()) {
3420 Reg = Src.getReg();
3421 IsCopy = Info->getScratchRSrcReg() == Reg;
3422 }
3423 }
3424 }
3425 }
3426
3427 if (IsCopy) {
3428 LastCopy = Start;
3429 NonCopyInstrsLen = NonCopyInstrs.size();
3430 } else {
3431 NonCopyInstrs.push_back(&*Start);
3432 }
3433 }
3434 NonCopyInstrs.resize(NonCopyInstrsLen);
3435
3436 for (auto *NonCopy : reverse(NonCopyInstrs)) {
3437 MBB->splice(LastCopy, MBB, NonCopy->getIterator());
3438 }
3439 Start = LastCopy;
3440
3441 // Do the same for copies after the loop
3442 NonCopyInstrs.clear();
3443 NonCopyInstrsLen = 0;
3444 MachineBasicBlock::iterator End(&MI);
3445 LastCopy = End;
3446 while (End->getOpcode() != FrameDestroyOpcode) {
3447 ++End;
3448 bool IsCopy = false;
3449 if (End->getOpcode() == AMDGPU::COPY) {
3450 auto &Src = End->getOperand(1);
3451 if (Src.isReg()) {
3452 Register Reg = Src.getReg();
3453 IsCopy = Reg.isPhysical() && MI.modifiesRegister(Reg, TRI);
3454 }
3455 }
3456
3457 if (IsCopy) {
3458 LastCopy = End;
3459 NonCopyInstrsLen = NonCopyInstrs.size();
3460 } else {
3461 NonCopyInstrs.push_back(&*End);
3462 }
3463 }
3464 NonCopyInstrs.resize(NonCopyInstrsLen);
3465
3466 End = LastCopy;
3467 ++LastCopy;
3468 for (auto *NonCopy : reverse(NonCopyInstrs)) {
3469 MBB->splice(LastCopy, MBB, NonCopy->getIterator());
3470 }
3471
3472 ++End;
3473 B.setInsertPt(B.getMBB(), Start);
3474 executeInWaterfallLoop(B, make_range(Start, End), SGPROperandRegs);
3475 break;
3476 }
3477 case AMDGPU::G_LOAD:
3478 case AMDGPU::G_ZEXTLOAD:
3479 case AMDGPU::G_SEXTLOAD: {
3480 if (applyMappingLoad(B, OpdMapper, MI))
3481 return;
3482 break;
3483 }
3484 case AMDGPU::G_DYN_STACKALLOC:
3485 applyMappingDynStackAlloc(B, OpdMapper, MI);
3486 return;
3487 case AMDGPU::G_STACKRESTORE: {
3488 applyDefaultMapping(OpdMapper);
3489 constrainOpWithReadfirstlane(B, MI, 0);
3490 return;
3491 }
3492 case AMDGPU::G_SBFX:
3493 applyMappingBFE(B, OpdMapper, /*Signed*/ true);
3494 return;
3495 case AMDGPU::G_UBFX:
3496 applyMappingBFE(B, OpdMapper, /*Signed*/ false);
3497 return;
3498 case AMDGPU::G_AMDGPU_MAD_U64_U32:
3499 case AMDGPU::G_AMDGPU_MAD_I64_I32:
3500 applyMappingMAD_64_32(B, OpdMapper);
3501 return;
3502 case AMDGPU::G_PREFETCH: {
3503 if (!Subtarget.hasPrefetch() || !Subtarget.hasSafeSmemPrefetch()) {
3504 MI.eraseFromParent();
3505 return;
3506 }
3507 Register PtrReg = MI.getOperand(0).getReg();
3508 unsigned PtrBank = getRegBankID(PtrReg, MRI, AMDGPU::SGPRRegBankID);
3509 if (PtrBank == AMDGPU::VGPRRegBankID) {
3510 MI.eraseFromParent();
3511 return;
3512 }
3513 unsigned AS = MRI.getType(PtrReg).getAddressSpace();
3514 if (!AMDGPU::isFlatGlobalAddrSpace(AS) &&
3515 AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
3516 MI.eraseFromParent();
3517 return;
3518 }
3519 applyDefaultMapping(OpdMapper);
3520 return;
3521 }
3522 default:
3523 break;
3524 }
3525
3526 return applyDefaultMapping(OpdMapper);
3527 }
3528
3529 // vgpr, sgpr -> vgpr
3530 // vgpr, agpr -> vgpr
3531 // agpr, agpr -> agpr
3532 // agpr, sgpr -> vgpr
3533 static unsigned regBankUnion(unsigned RB0, unsigned RB1) {
3534 if (RB0 == AMDGPU::InvalidRegBankID)
3535 return RB1;
3536 if (RB1 == AMDGPU::InvalidRegBankID)
3537 return RB0;
3538
3539 if (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID)
3540 return AMDGPU::SGPRRegBankID;
3541
3542 if (RB0 == AMDGPU::AGPRRegBankID && RB1 == AMDGPU::AGPRRegBankID)
3543 return AMDGPU::AGPRRegBankID;
3544
3545 return AMDGPU::VGPRRegBankID;
3546 }
3547
3548 static unsigned regBankBoolUnion(unsigned RB0, unsigned RB1) {
3549 if (RB0 == AMDGPU::InvalidRegBankID)
3550 return RB1;
3551 if (RB1 == AMDGPU::InvalidRegBankID)
3552 return RB0;
3553
3554 // vcc, vcc -> vcc
3555 // vcc, sgpr -> vcc
3556 // vcc, vgpr -> vcc
3557 if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID)
3558 return AMDGPU::VCCRegBankID;
3559
3560   // Neither side is vcc; fall back to the normal bank union.
3561 return regBankUnion(RB0, RB1);
3562 }
3563
3564 unsigned AMDGPURegisterBankInfo::getMappingType(const MachineRegisterInfo &MRI,
3565 const MachineInstr &MI) const {
3566 unsigned RegBank = AMDGPU::InvalidRegBankID;
3567
3568 for (const MachineOperand &MO : MI.operands()) {
3569 if (!MO.isReg())
3570 continue;
3571 Register Reg = MO.getReg();
3572 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3573 RegBank = regBankUnion(RegBank, Bank->getID());
3574 if (RegBank == AMDGPU::VGPRRegBankID)
3575 break;
3576 }
3577 }
3578
3579 return RegBank;
3580 }
3581
3582 bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
3583 const MachineFunction &MF = *MI.getParent()->getParent();
3584 const MachineRegisterInfo &MRI = MF.getRegInfo();
3585 for (const MachineOperand &MO : MI.operands()) {
3586 if (!MO.isReg())
3587 continue;
3588 Register Reg = MO.getReg();
3589 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3590 if (Bank->getID() != AMDGPU::SGPRRegBankID)
3591 return false;
3592 }
3593 }
3594 return true;
3595 }
3596
3597 const RegisterBankInfo::InstructionMapping &
3598 AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
3599 const MachineFunction &MF = *MI.getParent()->getParent();
3600 const MachineRegisterInfo &MRI = MF.getRegInfo();
3601 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3602
3603 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3604 const MachineOperand &SrcOp = MI.getOperand(i);
3605 if (!SrcOp.isReg())
3606 continue;
3607
3608 unsigned Size = getSizeInBits(SrcOp.getReg(), MRI, *TRI);
3609 OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3610 }
3611 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3612 MI.getNumOperands());
3613 }
3614
3615 const RegisterBankInfo::InstructionMapping &
3616 AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
3617 const MachineFunction &MF = *MI.getParent()->getParent();
3618 const MachineRegisterInfo &MRI = MF.getRegInfo();
3619 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3620
3621 // Even though we technically could use SGPRs, this would require knowledge of
3622 // the constant bus restriction. Force all sources to VGPR (except for VCC).
3623 //
3624 // TODO: Unary ops are trivially OK, so accept SGPRs?
3625 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3626 const MachineOperand &Src = MI.getOperand(i);
3627 if (!Src.isReg())
3628 continue;
3629
3630 unsigned Size = getSizeInBits(Src.getReg(), MRI, *TRI);
3631 unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
3632 OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
3633 }
3634
3635 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3636 MI.getNumOperands());
3637 }
3638
3639 const RegisterBankInfo::InstructionMapping &
3640 AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
3641 const MachineFunction &MF = *MI.getParent()->getParent();
3642 const MachineRegisterInfo &MRI = MF.getRegInfo();
3643 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3644
3645 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
3646 const MachineOperand &Op = MI.getOperand(I);
3647 if (!Op.isReg())
3648 continue;
3649
3650 unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI);
3651 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3652 }
3653
3654 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3655 MI.getNumOperands());
3656 }
3657
3658 const RegisterBankInfo::InstructionMapping &
3659 AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI,
3660 const MachineInstr &MI,
3661 int RsrcIdx) const {
3662 // The reported argument index is relative to the IR intrinsic call arguments,
3663 // so we need to shift by the number of defs and the intrinsic ID.
3664 RsrcIdx += MI.getNumExplicitDefs() + 1;
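  // E.g. with a single def the operand layout is [def, intrinsic ID, args...],
  // so an IR argument index of 0 corresponds to machine operand 2.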
3665
3666 const int NumOps = MI.getNumOperands();
3667 SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps);
3668
3669 // TODO: Should packed/unpacked D16 difference be reported here as part of
3670 // the value mapping?
3671 for (int I = 0; I != NumOps; ++I) {
3672 if (!MI.getOperand(I).isReg())
3673 continue;
3674
3675 Register OpReg = MI.getOperand(I).getReg();
3676 // We replace some dead address operands with $noreg
3677 if (!OpReg)
3678 continue;
3679
3680 unsigned Size = getSizeInBits(OpReg, MRI, *TRI);
3681
3682 // FIXME: Probably need a new intrinsic register bank searchable table to
3683 // handle arbitrary intrinsics easily.
3684 //
3685 // If this has a sampler, it immediately follows rsrc.
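    // (The image resource descriptor is 8 dwords and the sampler descriptor 4
    // dwords; both must ultimately be SGPRs.)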
3686 const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1;
3687
3688 if (MustBeSGPR) {
3689       // This must be an SGPR, but report whatever bank it currently has as legal.
3690 unsigned NewBank = getRegBankID(OpReg, MRI, AMDGPU::SGPRRegBankID);
3691 OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size);
3692 } else {
3693 // Some operands must be VGPR, and these are easy to copy to.
3694 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3695 }
3696 }
3697
3698 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps);
3699 }
3700
3701 /// Return the mapping for a pointer argument.
3702 const RegisterBankInfo::ValueMapping *
3703 AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI,
3704 Register PtrReg) const {
3705 LLT PtrTy = MRI.getType(PtrReg);
3706 unsigned Size = PtrTy.getSizeInBits();
3707 if (Subtarget.useFlatForGlobal() ||
3708 !AMDGPU::isFlatGlobalAddrSpace(PtrTy.getAddressSpace()))
3709 return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3710
3711 // If we're using MUBUF instructions for global memory, an SGPR base register
3712 // is possible. Otherwise this needs to be a VGPR.
3713 const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3714 return AMDGPU::getValueMapping(PtrBank->getID(), Size);
3715 }
3716
3717 const RegisterBankInfo::InstructionMapping &
3718 AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
3719
3720 const MachineFunction &MF = *MI.getParent()->getParent();
3721 const MachineRegisterInfo &MRI = MF.getRegInfo();
3722 SmallVector<const ValueMapping*, 2> OpdsMapping(2);
3723 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3724 Register PtrReg = MI.getOperand(1).getReg();
3725 LLT PtrTy = MRI.getType(PtrReg);
3726 unsigned AS = PtrTy.getAddressSpace();
3727 unsigned PtrSize = PtrTy.getSizeInBits();
3728
3729 const ValueMapping *ValMapping;
3730 const ValueMapping *PtrMapping;
3731
3732 const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3733
3734 if (PtrBank == &AMDGPU::SGPRRegBank && AMDGPU::isFlatGlobalAddrSpace(AS)) {
3735 if (isScalarLoadLegal(MI)) {
3736 // We have a uniform instruction so we want to use an SMRD load
3737 ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3738 PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
3739 } else {
3740 ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3741
3742 // If we're using MUBUF instructions for global memory, an SGPR base
3743 // register is possible. Otherwise this needs to be a VGPR.
3744 unsigned PtrBankID = Subtarget.useFlatForGlobal() ?
3745 AMDGPU::VGPRRegBankID : AMDGPU::SGPRRegBankID;
3746
3747 PtrMapping = AMDGPU::getValueMapping(PtrBankID, PtrSize);
3748 }
3749 } else {
3750 ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3751 PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
3752 }
3753
3754 OpdsMapping[0] = ValMapping;
3755 OpdsMapping[1] = PtrMapping;
3756 const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
3757 1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands());
3758 return Mapping;
3759
3760 // FIXME: Do we want to add a mapping for FLAT load, or should we just
3761 // handle that during instruction selection?
3762 }
3763
3764 unsigned
3765 AMDGPURegisterBankInfo::getRegBankID(Register Reg,
3766 const MachineRegisterInfo &MRI,
3767 unsigned Default) const {
3768 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
3769 return Bank ? Bank->getID() : Default;
3770 }
3771
3772 const RegisterBankInfo::ValueMapping *
3773 AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg,
3774 const MachineRegisterInfo &MRI,
3775 const TargetRegisterInfo &TRI) const {
3776   // Lie and claim anything is legal, even though this needs to be an SGPR;
3777   // applyMapping will have to deal with it as a waterfall loop.
3778 unsigned Bank = getRegBankID(Reg, MRI, AMDGPU::SGPRRegBankID);
3779 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3780 return AMDGPU::getValueMapping(Bank, Size);
3781 }
3782
3783 const RegisterBankInfo::ValueMapping *
3784 AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
3785 const MachineRegisterInfo &MRI,
3786 const TargetRegisterInfo &TRI) const {
3787 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3788 return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3789 }
3790
3791 const RegisterBankInfo::ValueMapping *
3792 AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg,
3793 const MachineRegisterInfo &MRI,
3794 const TargetRegisterInfo &TRI) const {
3795 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3796 return AMDGPU::getValueMapping(AMDGPU::AGPRRegBankID, Size);
3797 }
3798
3799 ///
3800 /// This function must return a legal mapping, because
3801 /// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
3802 /// in RegBankSelect::Mode::Fast. Any mapping that would cause a
3803 /// VGPR-to-SGPR copy to be generated is illegal.
3804 ///
3805 // Operands that must be SGPRs must accept potentially divergent VGPRs as
3806 // legal. These will be dealt with in applyMappingImpl.
3807 //
3808 const RegisterBankInfo::InstructionMapping &
3809 AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
3810 const MachineFunction &MF = *MI.getParent()->getParent();
3811 const MachineRegisterInfo &MRI = MF.getRegInfo();
3812
3813 if (MI.isCopy() || MI.getOpcode() == AMDGPU::G_FREEZE) {
3814 Register DstReg = MI.getOperand(0).getReg();
3815 Register SrcReg = MI.getOperand(1).getReg();
3816
3817 // The default logic bothers to analyze impossible alternative mappings. We
3818 // want the most straightforward mapping, so just directly handle this.
3819 const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI);
3820 const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
3821 assert(SrcBank && "src bank should have been assigned already");
3822
3823 // For COPY between a physical reg and an s1, there is no type associated so
3824 // we need to take the virtual register's type as a hint on how to interpret
3825 // s1 values.
3826 if (!SrcReg.isVirtual() && !DstBank &&
3827 MRI.getType(DstReg) == LLT::scalar(1))
3828 DstBank = &AMDGPU::VCCRegBank;
3829 else if (!DstReg.isVirtual() && MRI.getType(SrcReg) == LLT::scalar(1))
3830 DstBank = &AMDGPU::VCCRegBank;
3831
3832 if (!DstBank)
3833 DstBank = SrcBank;
3834
3835 unsigned Size = getSizeInBits(DstReg, MRI, *TRI);
3836 if (MI.getOpcode() != AMDGPU::G_FREEZE &&
3837 cannotCopy(*DstBank, *SrcBank, TypeSize::getFixed(Size)))
3838 return getInvalidInstructionMapping();
3839
3840 const ValueMapping &ValMap = getValueMapping(0, Size, *DstBank);
3841 unsigned OpdsMappingSize = MI.isCopy() ? 1 : 2;
3842 SmallVector<const ValueMapping *, 1> OpdsMapping(OpdsMappingSize);
3843 OpdsMapping[0] = &ValMap;
3844 if (MI.getOpcode() == AMDGPU::G_FREEZE)
3845 OpdsMapping[1] = &ValMap;
3846
3847 return getInstructionMapping(
3848 1, /*Cost*/ 1,
3849 /*OperandsMapping*/ getOperandsMapping(OpdsMapping), OpdsMappingSize);
3850 }
3851
3852 if (MI.isRegSequence()) {
3853 // If any input is a VGPR, the result must be a VGPR. The default handling
3854 // assumes any copy between banks is legal.
3855 unsigned BankID = AMDGPU::SGPRRegBankID;
3856
3857 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3858 auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI);
3859 // It doesn't make sense to use vcc or scc banks here, so just ignore
3860 // them.
3861 if (OpBank != AMDGPU::SGPRRegBankID) {
3862 BankID = AMDGPU::VGPRRegBankID;
3863 break;
3864 }
3865 }
3866 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3867
3868 const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID));
3869 return getInstructionMapping(
3870 1, /*Cost*/ 1,
3871 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3872 }
3873
3874 // The default handling is broken and doesn't handle illegal SGPR->VGPR copies
3875 // properly.
3876 //
3877 // TODO: There are additional exec masking dependencies to analyze.
3878 if (auto *PHI = dyn_cast<GPhi>(&MI)) {
3879 unsigned ResultBank = AMDGPU::InvalidRegBankID;
3880 Register DstReg = PHI->getReg(0);
3881
3882 // Sometimes the result may have already been assigned a bank.
3883 if (const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI))
3884 ResultBank = DstBank->getID();
3885
3886 for (unsigned I = 0; I < PHI->getNumIncomingValues(); ++I) {
3887 Register Reg = PHI->getIncomingValue(I);
3888 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
3889
3890 // FIXME: Assuming VGPR for any undetermined inputs.
3891 if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) {
3892 ResultBank = AMDGPU::VGPRRegBankID;
3893 break;
3894 }
3895
3896 // FIXME: Need to promote SGPR case to s32
3897 unsigned OpBank = Bank->getID();
3898 ResultBank = regBankBoolUnion(ResultBank, OpBank);
3899 }
3900
3901 assert(ResultBank != AMDGPU::InvalidRegBankID);
3902
3903 unsigned Size = MRI.getType(DstReg).getSizeInBits();
3904
3905 const ValueMapping &ValMap =
3906 getValueMapping(0, Size, getRegBank(ResultBank));
3907 return getInstructionMapping(
3908 1, /*Cost*/ 1,
3909 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3910 }
3911
3912 const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
3913 if (Mapping.isValid())
3914 return Mapping;
3915
3916 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3917
3918 switch (MI.getOpcode()) {
3919 default:
3920 return getInvalidInstructionMapping();
3921
3922 case AMDGPU::G_AND:
3923 case AMDGPU::G_OR:
3924 case AMDGPU::G_XOR:
3925 case AMDGPU::G_MUL: {
3926 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3927 if (Size == 1) {
3928 const RegisterBank *DstBank
3929 = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI);
3930
3931 unsigned TargetBankID = AMDGPU::InvalidRegBankID;
3932 unsigned BankLHS = AMDGPU::InvalidRegBankID;
3933 unsigned BankRHS = AMDGPU::InvalidRegBankID;
3934 if (DstBank) {
3935 TargetBankID = DstBank->getID();
3936 if (DstBank == &AMDGPU::VCCRegBank) {
3937 TargetBankID = AMDGPU::VCCRegBankID;
3938 BankLHS = AMDGPU::VCCRegBankID;
3939 BankRHS = AMDGPU::VCCRegBankID;
3940 } else {
3941 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
3942 AMDGPU::SGPRRegBankID);
3943 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
3944 AMDGPU::SGPRRegBankID);
3945 }
3946 } else {
3947 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
3948 AMDGPU::VCCRegBankID);
3949 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
3950 AMDGPU::VCCRegBankID);
3951
3952 // Both inputs should be true booleans to produce a boolean result.
3953 if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) {
3954 TargetBankID = AMDGPU::VGPRRegBankID;
3955 } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) {
3956 TargetBankID = AMDGPU::VCCRegBankID;
3957 BankLHS = AMDGPU::VCCRegBankID;
3958 BankRHS = AMDGPU::VCCRegBankID;
3959 } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) {
3960 TargetBankID = AMDGPU::SGPRRegBankID;
3961 }
3962 }
3963
3964 OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size);
3965 OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size);
3966 OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size);
3967 break;
3968 }
3969
3970 if (Size == 64) {
3971
3972 if (isSALUMapping(MI)) {
3973 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size);
3974 OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
3975 } else {
3976 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
3977 unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI /*, DefaultBankID*/);
3978 OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size);
3979
3980 unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI /*, DefaultBankID*/);
3981 OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size);
3982 }
3983
3984 break;
3985 }
3986
3987 [[fallthrough]];
3988 }
3989 case AMDGPU::G_PTR_ADD:
3990 case AMDGPU::G_PTRMASK:
3991 case AMDGPU::G_ADD:
3992 case AMDGPU::G_SUB:
3993 case AMDGPU::G_SHL:
3994 case AMDGPU::G_LSHR:
3995 case AMDGPU::G_ASHR:
3996 case AMDGPU::G_UADDO:
3997 case AMDGPU::G_USUBO:
3998 case AMDGPU::G_UADDE:
3999 case AMDGPU::G_SADDE:
4000 case AMDGPU::G_USUBE:
4001 case AMDGPU::G_SSUBE:
4002 case AMDGPU::G_SMIN:
4003 case AMDGPU::G_SMAX:
4004 case AMDGPU::G_UMIN:
4005 case AMDGPU::G_UMAX:
4006 case AMDGPU::G_ABS:
4007 case AMDGPU::G_SHUFFLE_VECTOR:
4008 case AMDGPU::G_SBFX:
4009 case AMDGPU::G_UBFX:
4010 case AMDGPU::G_AMDGPU_S_MUL_I64_I32:
4011 case AMDGPU::G_AMDGPU_S_MUL_U64_U32:
4012 if (isSALUMapping(MI))
4013 return getDefaultMappingSOP(MI);
4014 return getDefaultMappingVOP(MI);
4015 case AMDGPU::G_FADD:
4016 case AMDGPU::G_FSUB:
4017 case AMDGPU::G_FMUL:
4018 case AMDGPU::G_FMA:
4019 case AMDGPU::G_FFLOOR:
4020 case AMDGPU::G_FCEIL:
4021 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
4022 case AMDGPU::G_FMINNUM:
4023 case AMDGPU::G_FMAXNUM:
4024 case AMDGPU::G_FMINIMUM:
4025 case AMDGPU::G_FMAXIMUM:
4026 case AMDGPU::G_FMINIMUMNUM:
4027 case AMDGPU::G_FMAXIMUMNUM:
4028 case AMDGPU::G_INTRINSIC_TRUNC:
4029 case AMDGPU::G_STRICT_FADD:
4030 case AMDGPU::G_STRICT_FSUB:
4031 case AMDGPU::G_STRICT_FMUL:
4032 case AMDGPU::G_STRICT_FMA: {
4033 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
4034 unsigned Size = Ty.getSizeInBits();
4035 if (Subtarget.hasSALUFloatInsts() && Ty.isScalar() &&
4036 (Size == 32 || Size == 16) && isSALUMapping(MI))
4037 return getDefaultMappingSOP(MI);
4038 return getDefaultMappingVOP(MI);
4039 }
4040 case AMDGPU::G_FPTOSI:
4041 case AMDGPU::G_FPTOUI:
4042 case AMDGPU::G_SITOFP:
4043 case AMDGPU::G_UITOFP: {
4044 unsigned SizeDst = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4045 unsigned SizeSrc = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4046 if (Subtarget.hasSALUFloatInsts() && SizeDst == 32 && SizeSrc == 32 &&
4047 isSALUMapping(MI))
4048 return getDefaultMappingSOP(MI);
4049 return getDefaultMappingVOP(MI);
4050 }
4051 case AMDGPU::G_FPTRUNC:
4052 case AMDGPU::G_FPEXT: {
4053 unsigned SizeDst = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4054 unsigned SizeSrc = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4055 if (Subtarget.hasSALUFloatInsts() && SizeDst != 64 && SizeSrc != 64 &&
4056 isSALUMapping(MI))
4057 return getDefaultMappingSOP(MI);
4058 return getDefaultMappingVOP(MI);
4059 }
4060 case AMDGPU::G_FSQRT:
4061 case AMDGPU::G_FEXP2:
4062 case AMDGPU::G_FLOG2: {
4063 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4064 if (Subtarget.hasPseudoScalarTrans() && (Size == 16 || Size == 32) &&
4065 isSALUMapping(MI))
4066 return getDefaultMappingSOP(MI);
4067 return getDefaultMappingVOP(MI);
4068 }
4069 case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU
4070 case AMDGPU::G_SSUBSAT:
4071 case AMDGPU::G_UADDSAT:
4072 case AMDGPU::G_USUBSAT:
4073 case AMDGPU::G_FMAD:
4074 case AMDGPU::G_FLDEXP:
4075 case AMDGPU::G_FMINNUM_IEEE:
4076 case AMDGPU::G_FMAXNUM_IEEE:
4077 case AMDGPU::G_FCANONICALIZE:
4078 case AMDGPU::G_STRICT_FLDEXP:
4079 case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar?
4080 case AMDGPU::G_FSHR: // TODO: Expand for scalar
4081 case AMDGPU::G_AMDGPU_FMIN_LEGACY:
4082 case AMDGPU::G_AMDGPU_FMAX_LEGACY:
4083 case AMDGPU::G_AMDGPU_RCP_IFLAG:
4084 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
4085 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
4086 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
4087 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
4088 case AMDGPU::G_AMDGPU_CVT_PK_I16_I32:
4089 case AMDGPU::G_AMDGPU_SMED3:
4090 case AMDGPU::G_AMDGPU_FMED3:
4091 return getDefaultMappingVOP(MI);
4092 case AMDGPU::G_UMULH:
4093 case AMDGPU::G_SMULH: {
4094 if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI))
4095 return getDefaultMappingSOP(MI);
4096 return getDefaultMappingVOP(MI);
4097 }
4098 case AMDGPU::G_AMDGPU_MAD_U64_U32:
4099 case AMDGPU::G_AMDGPU_MAD_I64_I32: {
4100 // Three possible mappings:
4101 //
4102 // - Default SOP
4103 // - Default VOP
4104 // - Scalar multiply: src0 and src1 are SGPRs, the rest is VOP.
4105 //
4106 // This allows instruction selection to keep the multiplication part of the
4107 // instruction on the SALU.
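    // The operand layout is (dst 0, carry-out 1, src0 2, src1 3, src2 4); src0
    // and src1 are the multiply inputs checked below.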
4108 bool AllSalu = true;
4109 bool MulSalu = true;
4110 for (unsigned i = 0; i < 5; ++i) {
4111 Register Reg = MI.getOperand(i).getReg();
4112 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
4113 if (Bank->getID() != AMDGPU::SGPRRegBankID) {
4114 AllSalu = false;
4115 if (i == 2 || i == 3) {
4116 MulSalu = false;
4117 break;
4118 }
4119 }
4120 }
4121 }
4122
4123 if (AllSalu)
4124 return getDefaultMappingSOP(MI);
4125
4126 // If the multiply-add is full-rate in VALU, use that even if the
4127 // multiplication part is scalar. Accumulating separately on the VALU would
4128 // take two instructions.
4129 if (!MulSalu || Subtarget.hasFullRate64Ops())
4130 return getDefaultMappingVOP(MI);
4131
4132 // Keep the multiplication on the SALU, then accumulate on the VALU.
4133 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64);
4134 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4135 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
4136 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
4137 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64);
4138 break;
4139 }
4140 case AMDGPU::G_IMPLICIT_DEF: {
4141 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4142 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4143 break;
4144 }
4145 case AMDGPU::G_FCONSTANT:
4146 case AMDGPU::G_CONSTANT:
4147 case AMDGPU::G_GLOBAL_VALUE:
4148 case AMDGPU::G_FRAME_INDEX:
4149 case AMDGPU::G_BLOCK_ADDR:
4150 case AMDGPU::G_READSTEADYCOUNTER:
4151 case AMDGPU::G_READCYCLECOUNTER: {
4152 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4153 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4154 break;
4155 }
4156 case AMDGPU::G_DYN_STACKALLOC: {
4157 // Result is always uniform, and a wave reduction is needed for the source.
4158 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
4159 unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
4160 OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, 32);
4161 break;
4162 }
4163 case AMDGPU::G_AMDGPU_WAVE_ADDRESS: {
4164 // This case is weird because we expect a physical register in the source,
4165 // but need to set a bank anyway.
4166 //
4167 // TODO: We could select the result to SGPR or VGPR
4168 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
4169 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
4170 break;
4171 }
4172 case AMDGPU::G_INSERT: {
4173 unsigned BankID = getMappingType(MRI, MI);
4174 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4175 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4176 unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
4177 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
4178 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
4179 OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize);
4180 OpdsMapping[3] = nullptr;
4181 break;
4182 }
4183 case AMDGPU::G_EXTRACT: {
4184 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
4185 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4186 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4187 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
4188 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
4189 OpdsMapping[2] = nullptr;
4190 break;
4191 }
4192 case AMDGPU::G_BUILD_VECTOR:
4193 case AMDGPU::G_BUILD_VECTOR_TRUNC: {
4194 LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
4195 if (DstTy == LLT::fixed_vector(2, 16)) {
4196 unsigned DstSize = DstTy.getSizeInBits();
4197 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4198 unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
4199 unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
4200 unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID);
4201
4202 OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize);
4203 OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize);
4204 OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize);
4205 break;
4206 }
4207
4208 [[fallthrough]];
4209 }
4210 case AMDGPU::G_MERGE_VALUES:
4211 case AMDGPU::G_CONCAT_VECTORS: {
4212 unsigned Bank = getMappingType(MRI, MI);
4213 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4214 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4215
4216 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
4217 // Op1 and Dst should use the same register bank.
4218 for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i)
4219 OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize);
4220 break;
4221 }
4222 case AMDGPU::G_BITREVERSE:
4223 case AMDGPU::G_BITCAST:
4224 case AMDGPU::G_INTTOPTR:
4225 case AMDGPU::G_PTRTOINT:
4226 case AMDGPU::G_FABS:
4227 case AMDGPU::G_FNEG: {
4228 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4229 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
4230 OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
4231 break;
4232 }
4233 case AMDGPU::G_AMDGPU_FFBH_U32:
4234 case AMDGPU::G_AMDGPU_FFBL_B32:
4235 case AMDGPU::G_CTLZ_ZERO_UNDEF:
4236 case AMDGPU::G_CTTZ_ZERO_UNDEF: {
4237 unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4238 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
4239 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
4240 OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(BankID, Size);
4241 break;
4242 }
4243 case AMDGPU::G_CTPOP: {
4244 unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4245 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
4246 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
4247
4248 // This should really be getValueMappingSGPR64Only, but allowing the generic
4249 // code to handle the register split just makes using LegalizerHelper more
4250 // difficult.
4251 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
4252 break;
4253 }
4254 case AMDGPU::G_TRUNC: {
4255 Register Dst = MI.getOperand(0).getReg();
4256 Register Src = MI.getOperand(1).getReg();
4257 unsigned Bank = getRegBankID(Src, MRI);
4258 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
4259 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
4260 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
4261 OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize);
4262 break;
4263 }
4264 case AMDGPU::G_ZEXT:
4265 case AMDGPU::G_SEXT:
4266 case AMDGPU::G_ANYEXT:
4267 case AMDGPU::G_SEXT_INREG: {
4268 Register Dst = MI.getOperand(0).getReg();
4269 Register Src = MI.getOperand(1).getReg();
4270 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
4271 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
4272
4273 unsigned DstBank;
4274 const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI);
4275 assert(SrcBank);
4276 switch (SrcBank->getID()) {
4277 case AMDGPU::SGPRRegBankID:
4278 DstBank = AMDGPU::SGPRRegBankID;
4279 break;
4280 default:
4281 DstBank = AMDGPU::VGPRRegBankID;
4282 break;
4283 }
4284
4285 // Scalar extend can use 64-bit BFE, but VGPRs require extending to
4286 // 32-bits, and then to 64.
4287 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize);
4288 OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(),
4289 SrcSize);
4290 break;
4291 }
4292 case AMDGPU::G_IS_FPCLASS: {
4293 Register SrcReg = MI.getOperand(1).getReg();
4294 unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
4295 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4296 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
4297 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4298 break;
4299 }
4300 case AMDGPU::G_STORE: {
4301 assert(MI.getOperand(0).isReg());
4302 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4303
4304 // FIXME: We need to specify a different reg bank once scalar stores are
4305 // supported.
4306 const ValueMapping *ValMapping =
4307 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4308 OpdsMapping[0] = ValMapping;
4309 OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
4310 break;
4311 }
4312 case AMDGPU::G_ICMP:
4313 case AMDGPU::G_FCMP: {
4314 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4315
4316 // See if the result register has already been constrained to vcc, which may
4317 // happen due to control flow intrinsic lowering.
4318 unsigned DstBank = getRegBankID(MI.getOperand(0).getReg(), MRI,
4319 AMDGPU::SGPRRegBankID);
4320 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);
4321 unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI);
4322
4323 auto canUseSCCICMP = [&]() {
4324 auto Pred =
4325 static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
4326 return Size == 32 ||
4327 (Size == 64 &&
4328 (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
4329 Subtarget.hasScalarCompareEq64());
4330 };
4331 auto canUseSCCFCMP = [&]() {
4332 return Subtarget.hasSALUFloatInsts() && (Size == 32 || Size == 16);
4333 };
4334
4335 bool isICMP = MI.getOpcode() == AMDGPU::G_ICMP;
4336 bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID &&
4337 Op2Bank == AMDGPU::SGPRRegBankID &&
4338 Op3Bank == AMDGPU::SGPRRegBankID &&
4339 (isICMP ? canUseSCCICMP() : canUseSCCFCMP());
4340
4341 DstBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
4342 unsigned SrcBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
4343
4344 // TODO: Use 32-bit for scalar output size.
4345 // SCC results will need to be copied to a 32-bit SGPR virtual register.
4346 const unsigned ResultSize = 1;
4347
4348 OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, ResultSize);
4349 OpdsMapping[1] = nullptr; // Predicate Operand.
4350 OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, Size);
4351 OpdsMapping[3] = AMDGPU::getValueMapping(SrcBank, Size);
4352 break;
4353 }
4354 case AMDGPU::G_EXTRACT_VECTOR_ELT: {
4355     // VGPR index can be used for waterfall when indexing an SGPR vector.
4356 unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
4357 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4358 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4359 unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4360 unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI);
4361 unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank);
4362
4363 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize);
4364 OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize);
4365
4366     // The index can be either bank if the source vector is VGPR.
4367 OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4368 break;
4369 }
4370 case AMDGPU::G_INSERT_VECTOR_ELT: {
4371 unsigned OutputBankID = isSALUMapping(MI) ?
4372 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
4373
4374 unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4375 unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4376 unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
4377 unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
4378 unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI);
4379
4380 OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
4381 OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize);
4382
4383 // This is a weird case, because we need to break down the mapping based on
4384 // the register bank of a different operand.
4385 if (InsertSize == 64 && OutputBankID == AMDGPU::VGPRRegBankID) {
4386 OpdsMapping[2] = AMDGPU::getValueMappingSplit64(InsertEltBankID,
4387 InsertSize);
4388 } else {
4389 assert(InsertSize == 32 || InsertSize == 64);
4390 OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBankID, InsertSize);
4391 }
4392
4393     // The index can be either bank if the source vector is VGPR.
4394 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize);
4395 break;
4396 }
4397 case AMDGPU::G_UNMERGE_VALUES: {
4398 unsigned Bank = getMappingType(MRI, MI);
4399
4400 // Op1 and Dst should use the same register bank.
4401 // FIXME: Shouldn't this be the default? Why do we need to handle this?
4402 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
4403 unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
4404 OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size);
4405 }
4406 break;
4407 }
4408 case AMDGPU::G_AMDGPU_BUFFER_LOAD:
4409 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
4410 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
4411 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
4412 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
4413 case AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE:
4414 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE:
4415 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE_TFE:
4416 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE:
4417 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT_TFE:
4418 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
4419 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE:
4420 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
4421 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
4422 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
4423 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
4424 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16:
4425 case AMDGPU::G_AMDGPU_BUFFER_STORE:
4426 case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
4427 case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
4428 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
4429 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: {
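    // Rough operand shape for a load (sketch, matching the per-operand
    // mappings below):
    //   %dst:vgpr = G_AMDGPU_BUFFER_LOAD %rsrc:sgpr(<4 x s32>), %vindex:vgpr,
    //               %voffset:vgpr, %soffset:sgpr, <immediates>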
4430 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4431
4432 // rsrc
4433 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4434
4435 // vindex
4436 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4437
4438 // voffset
4439 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4440
4441 // soffset
4442 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4443
4444 // Any remaining operands are immediates and were correctly null
4445 // initialized.
4446 break;
4447 }
4448 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
4449 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
4450 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
4451 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
4452 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
4453 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
4454 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
4455 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
4456 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
4457 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
4458 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
4459 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
4460 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
4461 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
4462 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
4463 // vdata_out
4464 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4465
4466 // vdata_in
4467 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4468
4469 // rsrc
4470 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4471
4472 // vindex
4473 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4474
4475 // voffset
4476 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4477
4478 // soffset
4479 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4480
4481 // Any remaining operands are immediates and were correctly null
4482 // initialized.
4483 break;
4484 }
4485 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
4486 // vdata_out
4487 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4488
4489 // vdata_in
4490 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4491
4492 // cmp
4493 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4494
4495 // rsrc
4496 OpdsMapping[3] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4497
4498 // vindex
4499 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4500
4501 // voffset
4502 OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4503
4504 // soffset
4505 OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
4506
4507 // Any remaining operands are immediates and were correctly null
4508 // initialized.
4509 break;
4510 }
4511 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD:
4512 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE:
4513 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE:
4514 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT:
4515 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT: {
4516 // Lie and claim everything is legal, even though some need to be
4517 // SGPRs. applyMapping will have to deal with it as a waterfall loop.
4518 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4519 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4520
    // We need to convert this to a MUBUF if either the resource or the offset
    // is a VGPR.
4523 unsigned RSrcBank = OpdsMapping[1]->BreakDown[0].RegBank->getID();
4524 unsigned OffsetBank = OpdsMapping[2]->BreakDown[0].RegBank->getID();
4525 unsigned ResultBank = regBankUnion(RSrcBank, OffsetBank);
4526
4527 unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4528 OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0);
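    // E.g. a divergent offset makes the result divergent too; applyMapping
    // then has to rewrite the scalar (SMEM) load as a MUBUF access.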
4529 break;
4530 }
4531 case AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH:
4532 OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4533 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4534 break;
4535 case AMDGPU::G_INTRINSIC:
4536 case AMDGPU::G_INTRINSIC_CONVERGENT: {
4537 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
4538 default:
4539 return getInvalidInstructionMapping();
4540 case Intrinsic::amdgcn_div_fmas:
4541 case Intrinsic::amdgcn_div_fixup:
4542 case Intrinsic::amdgcn_trig_preop:
4543 case Intrinsic::amdgcn_sin:
4544 case Intrinsic::amdgcn_cos:
4545 case Intrinsic::amdgcn_log_clamp:
4546 case Intrinsic::amdgcn_rcp_legacy:
4547 case Intrinsic::amdgcn_rsq_legacy:
4548 case Intrinsic::amdgcn_rsq_clamp:
4549 case Intrinsic::amdgcn_tanh:
4550 case Intrinsic::amdgcn_fmul_legacy:
4551 case Intrinsic::amdgcn_fma_legacy:
4552 case Intrinsic::amdgcn_frexp_mant:
4553 case Intrinsic::amdgcn_frexp_exp:
4554 case Intrinsic::amdgcn_fract:
4555 case Intrinsic::amdgcn_cvt_pknorm_i16:
4556 case Intrinsic::amdgcn_cvt_pknorm_u16:
4557 case Intrinsic::amdgcn_cvt_pk_i16:
4558 case Intrinsic::amdgcn_cvt_pk_u16:
4559 case Intrinsic::amdgcn_cvt_pk_f16_fp8:
4560 case Intrinsic::amdgcn_cvt_pk_f16_bf8:
4561 case Intrinsic::amdgcn_fmed3:
4562 case Intrinsic::amdgcn_cubeid:
4563 case Intrinsic::amdgcn_cubema:
4564 case Intrinsic::amdgcn_cubesc:
4565 case Intrinsic::amdgcn_cubetc:
4566 case Intrinsic::amdgcn_sffbh:
4567 case Intrinsic::amdgcn_fmad_ftz:
4568 case Intrinsic::amdgcn_mbcnt_lo:
4569 case Intrinsic::amdgcn_mbcnt_hi:
4570 case Intrinsic::amdgcn_mul_u24:
4571 case Intrinsic::amdgcn_mul_i24:
4572 case Intrinsic::amdgcn_mulhi_u24:
4573 case Intrinsic::amdgcn_mulhi_i24:
4574 case Intrinsic::amdgcn_lerp:
4575 case Intrinsic::amdgcn_sad_u8:
4576 case Intrinsic::amdgcn_msad_u8:
4577 case Intrinsic::amdgcn_sad_hi_u8:
4578 case Intrinsic::amdgcn_sad_u16:
4579 case Intrinsic::amdgcn_qsad_pk_u16_u8:
4580 case Intrinsic::amdgcn_mqsad_pk_u16_u8:
4581 case Intrinsic::amdgcn_mqsad_u32_u8:
4582 case Intrinsic::amdgcn_cvt_pk_u8_f32:
4583 case Intrinsic::amdgcn_alignbyte:
4584 case Intrinsic::amdgcn_perm:
4585 case Intrinsic::amdgcn_prng_b32:
4586 case Intrinsic::amdgcn_fdot2:
4587 case Intrinsic::amdgcn_sdot2:
4588 case Intrinsic::amdgcn_udot2:
4589 case Intrinsic::amdgcn_sdot4:
4590 case Intrinsic::amdgcn_udot4:
4591 case Intrinsic::amdgcn_sdot8:
4592 case Intrinsic::amdgcn_udot8:
4593 case Intrinsic::amdgcn_fdot2_bf16_bf16:
4594 case Intrinsic::amdgcn_fdot2_f16_f16:
4595 case Intrinsic::amdgcn_fdot2_f32_bf16:
4596 case Intrinsic::amdgcn_fdot2c_f32_bf16:
4597 case Intrinsic::amdgcn_sudot4:
4598 case Intrinsic::amdgcn_sudot8:
4599 case Intrinsic::amdgcn_dot4_f32_fp8_bf8:
4600 case Intrinsic::amdgcn_dot4_f32_bf8_fp8:
4601 case Intrinsic::amdgcn_dot4_f32_fp8_fp8:
4602 case Intrinsic::amdgcn_dot4_f32_bf8_bf8:
4603 case Intrinsic::amdgcn_cvt_f32_fp8:
4604 case Intrinsic::amdgcn_cvt_f32_fp8_e5m3:
4605 case Intrinsic::amdgcn_cvt_f32_bf8:
4606 case Intrinsic::amdgcn_cvt_off_f32_i4:
4607 case Intrinsic::amdgcn_cvt_pk_f32_fp8:
4608 case Intrinsic::amdgcn_cvt_pk_f32_bf8:
4609 case Intrinsic::amdgcn_cvt_pk_fp8_f32:
4610 case Intrinsic::amdgcn_cvt_pk_bf8_f32:
4611 case Intrinsic::amdgcn_cvt_sr_fp8_f32:
4612 case Intrinsic::amdgcn_cvt_sr_bf8_f32:
4613 case Intrinsic::amdgcn_cvt_sr_bf16_f32:
4614 case Intrinsic::amdgcn_cvt_sr_f16_f32:
4615 case Intrinsic::amdgcn_cvt_f16_fp8:
4616 case Intrinsic::amdgcn_cvt_f16_bf8:
4617 case Intrinsic::amdgcn_cvt_scalef32_pk32_fp6_f16:
4618 case Intrinsic::amdgcn_cvt_scalef32_pk32_bf6_f16:
4619 case Intrinsic::amdgcn_cvt_scalef32_pk32_fp6_bf16:
4620 case Intrinsic::amdgcn_cvt_scalef32_pk32_bf6_bf16:
4621 case Intrinsic::amdgcn_cvt_scalef32_f16_fp8:
4622 case Intrinsic::amdgcn_cvt_scalef32_f16_bf8:
4623 case Intrinsic::amdgcn_cvt_scalef32_f32_fp8:
4624 case Intrinsic::amdgcn_cvt_scalef32_f32_bf8:
4625 case Intrinsic::amdgcn_cvt_scalef32_pk_fp8_f32:
4626 case Intrinsic::amdgcn_cvt_scalef32_pk_bf8_f32:
4627 case Intrinsic::amdgcn_cvt_scalef32_pk_f32_fp8:
4628 case Intrinsic::amdgcn_cvt_scalef32_pk_f32_bf8:
4629 case Intrinsic::amdgcn_cvt_scalef32_pk_fp8_f16:
4630 case Intrinsic::amdgcn_cvt_scalef32_pk_fp8_bf16:
4631 case Intrinsic::amdgcn_cvt_scalef32_pk_bf8_f16:
4632 case Intrinsic::amdgcn_cvt_scalef32_pk_bf8_bf16:
4633 case Intrinsic::amdgcn_cvt_scalef32_pk_f32_fp4:
4634 case Intrinsic::amdgcn_cvt_scalef32_pk_fp4_f32:
4635 case Intrinsic::amdgcn_cvt_scalef32_pk_f16_fp4:
4636 case Intrinsic::amdgcn_cvt_scalef32_pk_bf16_fp4:
4637 case Intrinsic::amdgcn_cvt_scalef32_pk32_f32_fp6:
4638 case Intrinsic::amdgcn_cvt_scalef32_pk32_f32_bf6:
4639 case Intrinsic::amdgcn_cvt_scalef32_pk32_f16_bf6:
4640 case Intrinsic::amdgcn_cvt_scalef32_pk32_bf16_bf6:
4641 case Intrinsic::amdgcn_cvt_scalef32_pk32_f16_fp6:
4642 case Intrinsic::amdgcn_cvt_scalef32_pk32_bf16_fp6:
4643 case Intrinsic::amdgcn_cvt_scalef32_pk_f16_bf8:
4644 case Intrinsic::amdgcn_cvt_scalef32_pk_bf16_bf8:
4645 case Intrinsic::amdgcn_cvt_scalef32_pk_f16_fp8:
4646 case Intrinsic::amdgcn_cvt_scalef32_pk_bf16_fp8:
4647 case Intrinsic::amdgcn_cvt_scalef32_pk_fp4_f16:
4648 case Intrinsic::amdgcn_cvt_scalef32_pk_fp4_bf16:
4649 case Intrinsic::amdgcn_cvt_scalef32_sr_pk_fp4_f16:
4650 case Intrinsic::amdgcn_cvt_scalef32_sr_pk_fp4_bf16:
4651 case Intrinsic::amdgcn_cvt_scalef32_sr_pk_fp4_f32:
4652 case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_bf6_bf16:
4653 case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_bf6_f16:
4654 case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_bf6_f32:
4655 case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_fp6_bf16:
4656 case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_fp6_f16:
4657 case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_fp6_f32:
4658 case Intrinsic::amdgcn_cvt_scalef32_sr_bf8_bf16:
4659 case Intrinsic::amdgcn_cvt_scalef32_sr_bf8_f16:
4660 case Intrinsic::amdgcn_cvt_scalef32_sr_bf8_f32:
4661 case Intrinsic::amdgcn_cvt_scalef32_sr_fp8_bf16:
4662 case Intrinsic::amdgcn_cvt_scalef32_sr_fp8_f16:
4663 case Intrinsic::amdgcn_cvt_scalef32_sr_fp8_f32:
4664 case Intrinsic::amdgcn_ashr_pk_i8_i32:
4665 case Intrinsic::amdgcn_ashr_pk_u8_i32:
4666 case Intrinsic::amdgcn_cvt_scalef32_2xpk16_fp6_f32:
4667 case Intrinsic::amdgcn_cvt_scalef32_2xpk16_bf6_f32:
4668 case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16:
4669 case Intrinsic::amdgcn_wmma_f16_16x16x16_f16:
4670 case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied:
4671 case Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied:
4672 case Intrinsic::amdgcn_wmma_f32_16x16x16_bf16:
4673 case Intrinsic::amdgcn_wmma_f32_16x16x16_f16:
4674 case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4:
4675 case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8:
4676 case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8:
4677 case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8:
4678 case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8:
4679 case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8:
4680 case Intrinsic::amdgcn_wmma_i32_16x16x32_iu4:
4681 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
4682 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
4683 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
4684 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
4685 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
4686 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
4687 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4:
4688 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
4689 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
4690 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
4691 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8:
4692 return getDefaultMappingVOP(MI);
4693 case Intrinsic::amdgcn_log:
4694 case Intrinsic::amdgcn_exp2:
4695 case Intrinsic::amdgcn_rcp:
4696 case Intrinsic::amdgcn_rsq:
4697 case Intrinsic::amdgcn_sqrt: {
4698 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
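      // Subtargets with pseudo-scalar transcendental instructions can keep
      // these scalar for uniform 16/32-bit operands; 64-bit forms stay VALU.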
4699 if (Subtarget.hasPseudoScalarTrans() && (Size == 16 || Size == 32) &&
4700 isSALUMapping(MI))
4701 return getDefaultMappingSOP(MI);
4702 return getDefaultMappingVOP(MI);
4703 }
4704 case Intrinsic::amdgcn_sbfe:
4705 case Intrinsic::amdgcn_ubfe:
4706 if (isSALUMapping(MI))
4707 return getDefaultMappingSOP(MI);
4708 return getDefaultMappingVOP(MI);
4709 case Intrinsic::amdgcn_ds_swizzle:
4710 case Intrinsic::amdgcn_ds_permute:
4711 case Intrinsic::amdgcn_ds_bpermute:
4712 case Intrinsic::amdgcn_update_dpp:
4713 case Intrinsic::amdgcn_mov_dpp8:
4714 case Intrinsic::amdgcn_mov_dpp:
4715 case Intrinsic::amdgcn_strict_wwm:
4716 case Intrinsic::amdgcn_wwm:
4717 case Intrinsic::amdgcn_strict_wqm:
4718 case Intrinsic::amdgcn_wqm:
4719 case Intrinsic::amdgcn_softwqm:
4720 case Intrinsic::amdgcn_set_inactive:
4721 case Intrinsic::amdgcn_set_inactive_chain_arg:
4722 case Intrinsic::amdgcn_permlane64:
4723 case Intrinsic::amdgcn_ds_bpermute_fi_b32:
4724 return getDefaultMappingAllVGPR(MI);
4725 case Intrinsic::amdgcn_cvt_pkrtz:
4726 if (Subtarget.hasSALUFloatInsts() && isSALUMapping(MI))
4727 return getDefaultMappingSOP(MI);
4728 return getDefaultMappingVOP(MI);
4729 case Intrinsic::amdgcn_kernarg_segment_ptr:
4730 case Intrinsic::amdgcn_s_getpc:
4731 case Intrinsic::amdgcn_groupstaticsize:
4732 case Intrinsic::amdgcn_reloc_constant:
4733 case Intrinsic::returnaddress: {
4734 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4735 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4736 break;
4737 }
4738 case Intrinsic::amdgcn_wqm_vote: {
4739 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4740 OpdsMapping[0] = OpdsMapping[2]
4741 = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size);
4742 break;
4743 }
4744 case Intrinsic::amdgcn_ps_live: {
4745 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4746 break;
4747 }
4748 case Intrinsic::amdgcn_div_scale: {
4749 unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4750 unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4751 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size);
4752 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size);
4753
4754 unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
4755 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4756 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4757 break;
4758 }
4759 case Intrinsic::amdgcn_class: {
4760 Register Src0Reg = MI.getOperand(2).getReg();
4761 Register Src1Reg = MI.getOperand(3).getReg();
4762 unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits();
4763 unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits();
4764 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4765 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
4766 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src0Size);
4767 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src1Size);
4768 break;
4769 }
4770 case Intrinsic::amdgcn_icmp:
4771 case Intrinsic::amdgcn_fcmp: {
4772 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4773 // This is not VCCRegBank because this is not used in boolean contexts.
4774 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4775 unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4776 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
4777 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
4778 break;
4779 }
4780 case Intrinsic::amdgcn_readlane: {
4781 // This must be an SGPR, but accept a VGPR.
4782 Register IdxReg = MI.getOperand(3).getReg();
4783 unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
4784 unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
4785 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
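      // The result and source mapping below is shared with readfirstlane.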
4786 [[fallthrough]];
4787 }
4788 case Intrinsic::amdgcn_readfirstlane: {
4789 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4790 unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4791 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4792 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4793 break;
4794 }
4795 case Intrinsic::amdgcn_writelane: {
4796 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4797 Register SrcReg = MI.getOperand(2).getReg();
4798 unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
4799 unsigned SrcBank = getRegBankID(SrcReg, MRI, AMDGPU::SGPRRegBankID);
4800 Register IdxReg = MI.getOperand(3).getReg();
4801 unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
4802 unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
4803 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4804
      // These 2 must be SGPRs, but accept VGPRs. A readfirstlane will be
      // inserted to legalize them.
4807 OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize);
4808 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4809 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4810 break;
4811 }
4812 case Intrinsic::amdgcn_if_break: {
4813 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4814 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4815 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4816 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
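      // E.g. (sketch, wave64):
      //   %newmask:sgpr(s64) = llvm.amdgcn.if.break(%cond:vcc(s1),
      //                                             %oldmask:sgpr(s64))
      // The mask operands are always wave-size scalar registers.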
4817 break;
4818 }
4819 case Intrinsic::amdgcn_permlane16:
4820 case Intrinsic::amdgcn_permlanex16: {
4821 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4822 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4823 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4824 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4825 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4826 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4827 break;
4828 }
4829 case Intrinsic::amdgcn_permlane16_var:
4830 case Intrinsic::amdgcn_permlanex16_var: {
4831 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4832 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4833 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4834 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4835 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4836 break;
4837 }
4838 case Intrinsic::amdgcn_mfma_f32_4x4x1f32:
4839 case Intrinsic::amdgcn_mfma_f32_4x4x4f16:
4840 case Intrinsic::amdgcn_mfma_i32_4x4x4i8:
4841 case Intrinsic::amdgcn_mfma_f32_4x4x2bf16:
4842 case Intrinsic::amdgcn_mfma_f32_16x16x1f32:
4843 case Intrinsic::amdgcn_mfma_f32_16x16x4f32:
4844 case Intrinsic::amdgcn_mfma_f32_16x16x4f16:
4845 case Intrinsic::amdgcn_mfma_f32_16x16x16f16:
4846 case Intrinsic::amdgcn_mfma_i32_16x16x4i8:
4847 case Intrinsic::amdgcn_mfma_i32_16x16x16i8:
4848 case Intrinsic::amdgcn_mfma_f32_16x16x2bf16:
4849 case Intrinsic::amdgcn_mfma_f32_16x16x8bf16:
4850 case Intrinsic::amdgcn_mfma_f32_32x32x1f32:
4851 case Intrinsic::amdgcn_mfma_f32_32x32x2f32:
4852 case Intrinsic::amdgcn_mfma_f32_32x32x4f16:
4853 case Intrinsic::amdgcn_mfma_f32_32x32x8f16:
4854 case Intrinsic::amdgcn_mfma_i32_32x32x4i8:
4855 case Intrinsic::amdgcn_mfma_i32_32x32x8i8:
4856 case Intrinsic::amdgcn_mfma_f32_32x32x2bf16:
4857 case Intrinsic::amdgcn_mfma_f32_32x32x4bf16:
4858 case Intrinsic::amdgcn_mfma_f32_32x32x4bf16_1k:
4859 case Intrinsic::amdgcn_mfma_f32_16x16x4bf16_1k:
4860 case Intrinsic::amdgcn_mfma_f32_4x4x4bf16_1k:
4861 case Intrinsic::amdgcn_mfma_f32_32x32x8bf16_1k:
4862 case Intrinsic::amdgcn_mfma_f32_16x16x16bf16_1k:
4863 case Intrinsic::amdgcn_mfma_f64_16x16x4f64:
4864 case Intrinsic::amdgcn_mfma_f64_4x4x4f64:
4865 case Intrinsic::amdgcn_mfma_i32_16x16x32_i8:
4866 case Intrinsic::amdgcn_mfma_i32_32x32x16_i8:
4867 case Intrinsic::amdgcn_mfma_f32_16x16x8_xf32:
4868 case Intrinsic::amdgcn_mfma_f32_32x32x4_xf32:
4869 case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_bf8:
4870 case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_fp8:
4871 case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_bf8:
4872 case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_fp8:
4873 case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_bf8:
4874 case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_fp8:
4875 case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_bf8:
4876 case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_fp8:
4877 case Intrinsic::amdgcn_mfma_f32_16x16x32_f16:
4878 case Intrinsic::amdgcn_mfma_f32_32x32x16_f16:
4879 case Intrinsic::amdgcn_mfma_i32_16x16x64_i8:
4880 case Intrinsic::amdgcn_mfma_i32_32x32x32_i8:
4881 case Intrinsic::amdgcn_mfma_f32_16x16x32_bf16: {
4882 // Default for MAI intrinsics.
4883 // srcC can also be an immediate which can be folded later.
4884 // FIXME: Should we eventually add an alternative mapping with AGPR src
4885 // for srcA/srcB?
4886 //
4887 // vdst, srcA, srcB, srcC
4888 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4889 OpdsMapping[0] =
4890 Info->mayNeedAGPRs()
4891 ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI)
4892 : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4893 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4894 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4895 OpdsMapping[4] =
4896 Info->mayNeedAGPRs()
4897 ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI)
4898 : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
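      // Where the subtarget requires MAI accumulator operands to live in
      // AGPRs, vdst and srcC get AGPR mappings; otherwise plain VGPRs are
      // used to avoid AGPR copies.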
4899 break;
4900 }
4901 case Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4:
4902 case Intrinsic::amdgcn_mfma_scale_f32_32x32x64_f8f6f4: {
4903 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4904 OpdsMapping[0] =
4905 Info->mayNeedAGPRs()
4906 ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI)
4907 : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4908
4909 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4910 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4911 OpdsMapping[4] =
4912 Info->mayNeedAGPRs()
4913 ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI)
4914 : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4915
4916 OpdsMapping[8] = getVGPROpMapping(MI.getOperand(8).getReg(), MRI, *TRI);
4917 OpdsMapping[10] = getVGPROpMapping(MI.getOperand(10).getReg(), MRI, *TRI);
4918 break;
4919 }
4920 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
4921 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
4922 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
4923 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
4924 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
4925 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
4926 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
4927 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
4928 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
4929 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
4930 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
4931 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
4932 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
4933 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
4934 case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
4935 case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
4936 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
4937 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
4938 case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
4939 case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
4940 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
4941 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
4942 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
4943 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
4944 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
4945 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
4946 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
4947 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8: {
4948 // vdst, srcA, srcB, srcC, idx
4949 OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4950 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4951 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4952 OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4953 OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4954 break;
4955 }
4956 case Intrinsic::amdgcn_interp_p1:
4957 case Intrinsic::amdgcn_interp_p2:
4958 case Intrinsic::amdgcn_interp_mov:
4959 case Intrinsic::amdgcn_interp_p1_f16:
4960 case Intrinsic::amdgcn_interp_p2_f16:
4961 case Intrinsic::amdgcn_lds_param_load: {
4962 const int M0Idx = MI.getNumOperands() - 1;
4963 Register M0Reg = MI.getOperand(M0Idx).getReg();
4964 unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
4965 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4966
4967 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4968 for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
4969 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4970
      // This must be an SGPR, but take whatever the original bank is; it will
      // be fixed later.
4973 OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
4974 break;
4975 }
4976 case Intrinsic::amdgcn_interp_inreg_p10:
4977 case Intrinsic::amdgcn_interp_inreg_p2:
4978 case Intrinsic::amdgcn_interp_inreg_p10_f16:
4979 case Intrinsic::amdgcn_interp_inreg_p2_f16:
4980 case Intrinsic::amdgcn_interp_p10_rtz_f16:
4981 case Intrinsic::amdgcn_interp_p2_rtz_f16: {
4982 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4983 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4984 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4985 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4986 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4987 break;
4988 }
4989 case Intrinsic::amdgcn_permlane16_swap:
4990 case Intrinsic::amdgcn_permlane32_swap: {
4991 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4992 OpdsMapping[0] = OpdsMapping[1] = OpdsMapping[3] = OpdsMapping[4] =
4993 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4994 break;
4995 }
4996 case Intrinsic::amdgcn_ballot: {
4997 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4998 unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4999 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
5000 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, SrcSize);
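      // E.g. (sketch, wave64): %mask:sgpr(s64) = llvm.amdgcn.ballot(%c:vcc(s1))
      // -- a divergent boolean becomes a uniform per-wave bitmask.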
5001 break;
5002 }
5003 case Intrinsic::amdgcn_inverse_ballot: {
5004 // This must be an SGPR, but accept a VGPR.
5005 Register MaskReg = MI.getOperand(2).getReg();
5006 unsigned MaskSize = MRI.getType(MaskReg).getSizeInBits();
5007 unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID);
5008 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
5009 OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize);
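      // The inverse of ballot (sketch, wave64):
      //   %c:vcc(s1) = llvm.amdgcn.inverse.ballot(%mask:sgpr(s64))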
5010 break;
5011 }
5012 case Intrinsic::amdgcn_bitop3: {
5013 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
5014 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
5015 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
5016 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
5017 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
5018 break;
5019 }
5020 case Intrinsic::amdgcn_s_quadmask:
5021 case Intrinsic::amdgcn_s_wqm: {
5022 Register MaskReg = MI.getOperand(2).getReg();
5023 unsigned MaskSize = MRI.getType(MaskReg).getSizeInBits();
5024 unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID);
5025 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, MaskSize);
5026 OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize);
5027 break;
5028 }
5029 case Intrinsic::amdgcn_wave_reduce_add:
5030 case Intrinsic::amdgcn_wave_reduce_sub:
5031 case Intrinsic::amdgcn_wave_reduce_min:
5032 case Intrinsic::amdgcn_wave_reduce_umin:
5033 case Intrinsic::amdgcn_wave_reduce_max:
5034 case Intrinsic::amdgcn_wave_reduce_umax:
5035 case Intrinsic::amdgcn_wave_reduce_and:
5036 case Intrinsic::amdgcn_wave_reduce_or:
5037 case Intrinsic::amdgcn_wave_reduce_xor: {
5038 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
5039 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
5040 unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
5041 auto regBankID =
5042 isSALUMapping(MI) ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
5043 OpdsMapping[2] = AMDGPU::getValueMapping(regBankID, OpSize);
5044 break;
5045 }
5046 case Intrinsic::amdgcn_s_bitreplicate:
5047 Register MaskReg = MI.getOperand(2).getReg();
5048 unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID);
5049 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64);
5050 OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, 32);
5051 }
5052 break;
5053 }
5054 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
5055 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
5056 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET:
5057 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
5058 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
5059 auto IntrID = AMDGPU::getIntrinsicID(MI);
5060 const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID);
5061 assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic");
5062 // Non-images can have complications from operands that allow both SGPR
5063 // and VGPR. For now it's too complicated to figure out the final opcode
5064 // to derive the register bank from the MCInstrDesc.
5065 assert(RSrcIntrin->IsImage);
5066 return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg);
5067 }
5068 case AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY:
5069 case AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY:
5070 case AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY: {
5071 bool IsDualOrBVH8 =
5072 MI.getOpcode() == AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY ||
5073 MI.getOpcode() == AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY;
5074 unsigned NumMods = IsDualOrBVH8 ? 0 : 1; // Has A16 modifier
5075 unsigned LastRegOpIdx = MI.getNumExplicitOperands() - 1 - NumMods;
5076 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
5077 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
5078 if (IsDualOrBVH8) {
5079 OpdsMapping[1] = AMDGPU::getValueMapping(
5080 AMDGPU::VGPRRegBankID,
5081 MRI.getType(MI.getOperand(1).getReg()).getSizeInBits());
5082 OpdsMapping[2] = AMDGPU::getValueMapping(
5083 AMDGPU::VGPRRegBankID,
5084 MRI.getType(MI.getOperand(2).getReg()).getSizeInBits());
5085 }
5086 OpdsMapping[LastRegOpIdx] =
5087 getSGPROpMapping(MI.getOperand(LastRegOpIdx).getReg(), MRI, *TRI);
5088 if (LastRegOpIdx == 3) {
5089 // Sequential form: all operands combined into VGPR256/VGPR512
5090 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
5091 if (Size > 256)
5092 Size = 512;
5093 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
5094 } else {
5095 // NSA form
5096 unsigned FirstSrcOpIdx = IsDualOrBVH8 ? 4 : 2;
5097 for (unsigned I = FirstSrcOpIdx; I < LastRegOpIdx; ++I) {
5098 unsigned Size = MRI.getType(MI.getOperand(I).getReg()).getSizeInBits();
5099 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
5100 }
5101 }
5102 break;
5103 }
5104 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
5105 case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS: {
5106 auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
5107 switch (IntrID) {
5108 case Intrinsic::amdgcn_s_getreg:
5109 case Intrinsic::amdgcn_s_memtime:
5110 case Intrinsic::amdgcn_s_memrealtime:
5111 case Intrinsic::amdgcn_s_get_waveid_in_workgroup:
5112 case Intrinsic::amdgcn_s_sendmsg_rtn: {
5113 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
5114 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
5115 break;
5116 }
5117 case Intrinsic::amdgcn_global_atomic_csub:
5118 case Intrinsic::amdgcn_global_atomic_fmin_num:
5119 case Intrinsic::amdgcn_global_atomic_fmax_num:
5120 case Intrinsic::amdgcn_flat_atomic_fmin_num:
5121 case Intrinsic::amdgcn_flat_atomic_fmax_num:
5122 case Intrinsic::amdgcn_atomic_cond_sub_u32:
5123 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
5124 case Intrinsic::amdgcn_global_load_tr_b64:
5125 case Intrinsic::amdgcn_global_load_tr_b128:
5126 case Intrinsic::amdgcn_global_load_tr4_b64:
5127 case Intrinsic::amdgcn_global_load_tr6_b96:
5128 case Intrinsic::amdgcn_ds_load_tr8_b64:
5129 case Intrinsic::amdgcn_ds_load_tr16_b128:
5130 case Intrinsic::amdgcn_ds_load_tr4_b64:
5131 case Intrinsic::amdgcn_ds_load_tr6_b96:
5132 case Intrinsic::amdgcn_ds_read_tr4_b64:
5133 case Intrinsic::amdgcn_ds_read_tr6_b96:
5134 case Intrinsic::amdgcn_ds_read_tr8_b64:
5135 case Intrinsic::amdgcn_ds_read_tr16_b64:
5136 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
5137 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
5138 return getDefaultMappingAllVGPR(MI);
5139 case Intrinsic::amdgcn_ds_ordered_add:
5140 case Intrinsic::amdgcn_ds_ordered_swap: {
5141 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
5142 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
5143 unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
5144 AMDGPU::SGPRRegBankID);
5145 OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32);
5146 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
5147 break;
5148 }
5149 case Intrinsic::amdgcn_ds_append:
5150 case Intrinsic::amdgcn_ds_consume: {
5151 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
5152 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
5153 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5154 break;
5155 }
5156 case Intrinsic::amdgcn_exp_compr:
5157 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
5158 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
5159 break;
5160 case Intrinsic::amdgcn_exp:
5161 // FIXME: Could we support packed types here?
5162 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
5163 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
5164 OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
5165 OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
5166 break;
5167 case Intrinsic::amdgcn_exp_row:
5168 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
5169 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
5170 OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
5171 OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
5172 OpdsMapping[8] = getSGPROpMapping(MI.getOperand(8).getReg(), MRI, *TRI);
5173 break;
5174 case Intrinsic::amdgcn_s_sendmsg:
5175 case Intrinsic::amdgcn_s_sendmsghalt: {
5176 // This must be an SGPR, but accept a VGPR.
5177 unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
5178 AMDGPU::SGPRRegBankID);
5179 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
5180 break;
5181 }
5182 case Intrinsic::amdgcn_s_setreg: {
5183 // This must be an SGPR, but accept a VGPR.
5184 unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
5185 AMDGPU::SGPRRegBankID);
5186 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
5187 break;
5188 }
5189 case Intrinsic::amdgcn_s_ttracedata: {
5190 // This must be an SGPR, but accept a VGPR.
5191 unsigned Bank =
5192 getRegBankID(MI.getOperand(1).getReg(), MRI, AMDGPU::SGPRRegBankID);
5193 OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
5194 break;
5195 }
5196 case Intrinsic::amdgcn_end_cf: {
5197 unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
5198 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
5199 break;
5200 }
5201 case Intrinsic::amdgcn_else: {
5202 unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
5203 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
5204 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
5205 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
5206 break;
5207 }
5208 case Intrinsic::amdgcn_init_whole_wave:
5209 case Intrinsic::amdgcn_live_mask: {
5210 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
5211 break;
5212 }
5213 case Intrinsic::amdgcn_wqm_demote:
5214 case Intrinsic::amdgcn_kill: {
5215 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
5216 break;
5217 }
5218 case Intrinsic::amdgcn_raw_buffer_load:
5219 case Intrinsic::amdgcn_raw_ptr_buffer_load:
5220 case Intrinsic::amdgcn_raw_atomic_buffer_load:
5221 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
5222 case Intrinsic::amdgcn_raw_tbuffer_load:
5223 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
      // FIXME: Should make the intrinsic ID the last operand of the
      // instruction; then this would be the same as the store case.
5226 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
5227 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5228 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
5229 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
5230 break;
5231 }
5232 case Intrinsic::amdgcn_raw_buffer_load_lds:
5233 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: {
5234 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
5235 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5236 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
5237 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
5238 break;
5239 }
5240 case Intrinsic::amdgcn_raw_buffer_store:
5241 case Intrinsic::amdgcn_raw_ptr_buffer_store:
5242 case Intrinsic::amdgcn_raw_buffer_store_format:
5243 case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
5244 case Intrinsic::amdgcn_raw_tbuffer_store:
5245 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
5246 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
5247 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5248 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
5249 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
5250 break;
5251 }
5252 case Intrinsic::amdgcn_struct_buffer_load:
5253 case Intrinsic::amdgcn_struct_ptr_buffer_load:
5254 case Intrinsic::amdgcn_struct_tbuffer_load:
5255 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
5256 case Intrinsic::amdgcn_struct_atomic_buffer_load:
5257 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
5258 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
5259 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5260 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
5261 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
5262 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
5263 break;
5264 }
5265 case Intrinsic::amdgcn_struct_buffer_load_lds:
5266 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
5267 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
5268 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5269 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
5270 OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
5271 OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
5272 break;
5273 }
5274 case Intrinsic::amdgcn_struct_buffer_store:
5275 case Intrinsic::amdgcn_struct_ptr_buffer_store:
5276 case Intrinsic::amdgcn_struct_tbuffer_store:
5277 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
5278 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
5279 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5280 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
5281 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
5282 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
5283 break;
5284 }
5285 case Intrinsic::amdgcn_init_exec_from_input: {
5286 unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
5287 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
5288 break;
5289 }
5290 case Intrinsic::amdgcn_ds_gws_init:
5291 case Intrinsic::amdgcn_ds_gws_barrier:
5292 case Intrinsic::amdgcn_ds_gws_sema_br: {
5293 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
5294
5295 // This must be an SGPR, but accept a VGPR.
5296 unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
5297 AMDGPU::SGPRRegBankID);
5298 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
5299 break;
5300 }
5301 case Intrinsic::amdgcn_ds_gws_sema_v:
5302 case Intrinsic::amdgcn_ds_gws_sema_p:
5303 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
5304 // This must be an SGPR, but accept a VGPR.
5305 unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI,
5306 AMDGPU::SGPRRegBankID);
5307 OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
5308 break;
5309 }
5310 case Intrinsic::amdgcn_load_to_lds:
5311 case Intrinsic::amdgcn_global_load_lds: {
5312 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
5313 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5314 break;
5315 }
5316 case Intrinsic::amdgcn_lds_direct_load: {
5317 const int M0Idx = MI.getNumOperands() - 1;
5318 Register M0Reg = MI.getOperand(M0Idx).getReg();
5319 unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
5320 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
5321
5322 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
5323 for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
5324 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
5325
      // This must be an SGPR, but take whatever the original bank is; it will
      // be fixed later.
5328 OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
5329 break;
5330 }
5331 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
5332 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn:
5333 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
5334 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5335 break;
5336 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
5337 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
5338 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
5339 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
5340 OpdsMapping[0] =
5341 getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); // %vdst
5342 OpdsMapping[1] =
5343 getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); // %addr
5344 OpdsMapping[3] =
5345 getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); // %addr
5346 OpdsMapping[4] =
5347 getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); // %data0
5348 OpdsMapping[5] =
5349 getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); // %data1
5350 break;
5351 }
5352 case Intrinsic::amdgcn_s_sleep_var:
5353 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
5354 break;
5355 case Intrinsic::amdgcn_s_barrier_signal_var:
5356 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
5357 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5358 break;
5359 case Intrinsic::amdgcn_s_barrier_signal_isfirst: {
5360 const unsigned ResultSize = 1;
5361 OpdsMapping[0] =
5362 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, ResultSize);
5363 break;
5364 }
5365 case Intrinsic::amdgcn_s_get_barrier_state:
5366 case Intrinsic::amdgcn_s_get_named_barrier_state: {
5367 OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
5368 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5369 break;
5370 }
5371 case Intrinsic::amdgcn_pops_exiting_wave_id:
5372 return getDefaultMappingSOP(MI);
5373 case Intrinsic::amdgcn_tensor_load_to_lds_d2:
5374 case Intrinsic::amdgcn_tensor_store_from_lds_d2:
5375 case Intrinsic::amdgcn_tensor_load_to_lds:
5376 case Intrinsic::amdgcn_tensor_store_from_lds: {
      // Lie and claim everything is legal, even though all operands need to be
      // SGPRs. applyMapping will have to deal with it using readfirstlane.
5379 for (unsigned I = 1; I < MI.getNumOperands(); ++I) {
5380 if (MI.getOperand(I).isReg()) {
5381 Register Reg = MI.getOperand(I).getReg();
5382 auto OpBank = getRegBankID(Reg, MRI);
5383 unsigned Size = getSizeInBits(Reg, MRI, *TRI);
5384 OpdsMapping[I] = AMDGPU::getValueMapping(OpBank, Size);
5385 }
5386 }
5387 break;
5388 }
5389 case Intrinsic::amdgcn_s_prefetch_data: {
5390 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
5391 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5392 break;
5393 }
5394 default:
5395 return getInvalidInstructionMapping();
5396 }
5397 break;
5398 }
5399 case AMDGPU::G_SELECT: {
5400 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
5401 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
5402 AMDGPU::SGPRRegBankID);
5403 unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI,
5404 AMDGPU::SGPRRegBankID);
5405 bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID &&
5406 Op3Bank == AMDGPU::SGPRRegBankID;
5407
5408 unsigned CondBankDefault = SGPRSrcs ?
5409 AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
5410 unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI,
5411 CondBankDefault);
5412 if (CondBank == AMDGPU::SGPRRegBankID)
5413 CondBank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
5414 else if (CondBank == AMDGPU::VGPRRegBankID)
5415 CondBank = AMDGPU::VCCRegBankID;
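    // By now the condition is either a scalar boolean (uniform select,
    // roughly an S_CSELECT after selection) or a VCC boolean (per-lane,
    // V_CNDMASK-style select).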
5416
5417 unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SGPRRegBankID ?
5418 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
5419
    assert(CondBank == AMDGPU::VCCRegBankID ||
           CondBank == AMDGPU::SGPRRegBankID);
5421
5422 // TODO: Should report 32-bit for scalar condition type.
5423 if (Size == 64) {
5424 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
5425 OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
5426 OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
5427 OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
5428 } else {
5429 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);
5430 OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
5431 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);
5432 OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);
5433 }
5434
5435 break;
5436 }
5437
5438 case AMDGPU::G_SI_CALL: {
5439 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64);
5440 // Lie and claim everything is legal, even though some need to be
5441 // SGPRs. applyMapping will have to deal with it as a waterfall loop.
5442 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
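    // If the callee address turns out to be divergent, applyMapping builds
    // the waterfall loop around this operand.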
5443
5444 // Allow anything for implicit arguments
5445 for (unsigned I = 4; I < MI.getNumOperands(); ++I) {
5446 if (MI.getOperand(I).isReg()) {
5447 Register Reg = MI.getOperand(I).getReg();
5448 auto OpBank = getRegBankID(Reg, MRI);
5449 unsigned Size = getSizeInBits(Reg, MRI, *TRI);
5450 OpdsMapping[I] = AMDGPU::getValueMapping(OpBank, Size);
5451 }
5452 }
5453 break;
5454 }
5455 case AMDGPU::G_LOAD:
5456 case AMDGPU::G_ZEXTLOAD:
5457 case AMDGPU::G_SEXTLOAD:
5458 return getInstrMappingForLoad(MI);
5459
5460 case AMDGPU::G_ATOMICRMW_XCHG:
5461 case AMDGPU::G_ATOMICRMW_ADD:
5462 case AMDGPU::G_ATOMICRMW_SUB:
5463 case AMDGPU::G_ATOMICRMW_AND:
5464 case AMDGPU::G_ATOMICRMW_OR:
5465 case AMDGPU::G_ATOMICRMW_XOR:
5466 case AMDGPU::G_ATOMICRMW_MAX:
5467 case AMDGPU::G_ATOMICRMW_MIN:
5468 case AMDGPU::G_ATOMICRMW_UMAX:
5469 case AMDGPU::G_ATOMICRMW_UMIN:
5470 case AMDGPU::G_ATOMICRMW_FADD:
5471 case AMDGPU::G_ATOMICRMW_FMIN:
5472 case AMDGPU::G_ATOMICRMW_FMAX:
5473 case AMDGPU::G_ATOMICRMW_UINC_WRAP:
5474 case AMDGPU::G_ATOMICRMW_UDEC_WRAP:
5475 case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: {
5476 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
5477 OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
5478 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
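    // Note the pointer keeps whatever bank it already has: a uniform address
    // can still be useful to selection (e.g. for a scalar-base addressing
    // form), so it is not forced into VGPRs here.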
5479 break;
5480 }
5481 case AMDGPU::G_ATOMIC_CMPXCHG: {
5482 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
5483 OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
5484 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5485 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
5486 break;
5487 }
5488 case AMDGPU::G_BRCOND: {
5489 unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI,
5490 AMDGPU::SGPRRegBankID);
5491 assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
5492 if (Bank != AMDGPU::SGPRRegBankID)
5493 Bank = AMDGPU::VCCRegBankID;
5494
5495 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
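    // A scalar (SGPR) condition stays a uniform branch; anything else is
    // treated as divergent control flow on VCC.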
5496 break;
5497 }
5498 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
5499 return getDefaultMappingVOP(MI);
5500 case AMDGPU::G_PREFETCH:
5501 OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
5502 break;
5503 }
5504
5505 return getInstructionMapping(/*ID*/1, /*Cost*/1,
5506 getOperandsMapping(OpdsMapping),
5507 MI.getNumOperands());
5508 }
5509