//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the InstructionSelector class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPUInstructionSelector.h"
#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
#include "llvm/CodeGen/GlobalISel/GISelValueTracking.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include <optional>

#define DEBUG_TYPE "amdgpu-isel"

using namespace llvm;
using namespace MIPatternMatch;

#define GET_GLOBALISEL_IMPL
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_IMPL
#undef AMDGPUSubtarget

AMDGPUInstructionSelector::AMDGPUInstructionSelector(
    const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
    const AMDGPUTargetMachine &TM)
    : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
      STI(STI),
#define GET_GLOBALISEL_PREDICATES_INIT
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_INIT
#define GET_GLOBALISEL_TEMPORARIES_INIT
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_TEMPORARIES_INIT
{
}

const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }

void AMDGPUInstructionSelector::setupMF(MachineFunction &MF,
                                        GISelValueTracking *VT,
                                        CodeGenCoverage *CoverageInfo,
                                        ProfileSummaryInfo *PSI,
                                        BlockFrequencyInfo *BFI) {
  MRI = &MF.getRegInfo();
  Subtarget = &MF.getSubtarget<GCNSubtarget>();
  Subtarget->checkSubtargetFeatures(MF.getFunction());
  InstructionSelector::setupMF(MF, VT, CoverageInfo, PSI, BFI);
}

// Return the wave level SGPR base address if this is a wave address.
static Register getWaveAddress(const MachineInstr *Def) {
  return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS
             ? Def->getOperand(1).getReg()
             : Register();
}

bool AMDGPUInstructionSelector::isVCC(Register Reg,
                                      const MachineRegisterInfo &MRI) const {
  // The verifier is oblivious to s1 being a valid value for wavesize
  // registers.
  if (Reg.isPhysical())
    return false;

  auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
  const TargetRegisterClass *RC =
      dyn_cast<const TargetRegisterClass *>(RegClassOrBank);
  if (RC) {
    const LLT Ty = MRI.getType(Reg);
    if (!Ty.isValid() || Ty.getSizeInBits() != 1)
      return false;
    // G_TRUNC s1 result is never vcc.
    return MRI.getVRegDef(Reg)->getOpcode() != AMDGPU::G_TRUNC &&
           RC->hasSuperClassEq(TRI.getBoolRC());
  }

  const RegisterBank *RB = cast<const RegisterBank *>(RegClassOrBank);
  return RB->getID() == AMDGPU::VCCRegBankID;
}

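// Illustrative effect of the rewrite below, e.g. for amdgcn_wqm (MIR sketch;
// register names are hypothetical):
//   %dst:vgpr_32 = G_INTRINSIC intrinsic(@llvm.amdgcn.wqm), %src:vgpr_32
// becomes
//   %dst:vgpr_32 = WQM %src:vgpr_32, implicit $exec
// after the intrinsic-ID operand is dropped.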
bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
                                                        unsigned NewOpc) const {
  MI.setDesc(TII.get(NewOpc));
  MI.removeOperand(1); // Remove intrinsic ID.
  MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));

  MachineOperand &Dst = MI.getOperand(0);
  MachineOperand &Src = MI.getOperand(1);

  // TODO: This should be legalized to s32 if needed
  if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
    return false;

  const TargetRegisterClass *DstRC
    = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
  const TargetRegisterClass *SrcRC
    = TRI.getConstrainedRegClassForOperand(Src, *MRI);
  if (!DstRC || DstRC != SrcRC)
    return false;

  return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
         RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
}

bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
  const DebugLoc &DL = I.getDebugLoc();
  MachineBasicBlock *BB = I.getParent();
  I.setDesc(TII.get(TargetOpcode::COPY));

  const MachineOperand &Src = I.getOperand(1);
  MachineOperand &Dst = I.getOperand(0);
  Register DstReg = Dst.getReg();
  Register SrcReg = Src.getReg();

  if (isVCC(DstReg, *MRI)) {
    if (SrcReg == AMDGPU::SCC) {
      const TargetRegisterClass *RC
        = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
      if (!RC)
        return true;
      return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
    }

    if (!isVCC(SrcReg, *MRI)) {
      // TODO: Should probably leave the copy and let copyPhysReg expand it.
      if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
        return false;

      const TargetRegisterClass *SrcRC
        = TRI.getConstrainedRegClassForOperand(Src, *MRI);

      std::optional<ValueAndVReg> ConstVal =
          getIConstantVRegValWithLookThrough(SrcReg, *MRI, true);
      if (ConstVal) {
        unsigned MovOpc =
            STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
        BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg)
            .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
      } else {
        Register MaskedReg = MRI->createVirtualRegister(SrcRC);

        // We can't trust the high bits at this point, so clear them.

        // TODO: Skip masking high bits if def is known boolean.

        if (AMDGPU::getRegBitWidth(SrcRC->getID()) == 16) {
          assert(Subtarget->useRealTrue16Insts());
          const int64_t NoMods = 0;
          BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_AND_B16_t16_e64), MaskedReg)
              .addImm(NoMods)
              .addImm(1)
              .addImm(NoMods)
              .addReg(SrcReg)
              .addImm(NoMods);
          BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U16_t16_e64), DstReg)
              .addImm(NoMods)
              .addImm(0)
              .addImm(NoMods)
              .addReg(MaskedReg)
              .addImm(NoMods);
        } else {
          bool IsSGPR = TRI.isSGPRClass(SrcRC);
          unsigned AndOpc = IsSGPR ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
          auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
                         .addImm(1)
                         .addReg(SrcReg);
          if (IsSGPR)
            And.setOperandDead(3); // Dead scc

          BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
              .addImm(0)
              .addReg(MaskedReg);
        }
      }

      if (!MRI->getRegClassOrNull(SrcReg))
        MRI->setRegClass(SrcReg, SrcRC);
      I.eraseFromParent();
      return true;
    }

    const TargetRegisterClass *RC =
        TRI.getConstrainedRegClassForOperand(Dst, *MRI);
    if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
      return false;

    return true;
  }

  for (const MachineOperand &MO : I.operands()) {
    if (MO.getReg().isPhysical())
      continue;

    const TargetRegisterClass *RC =
        TRI.getConstrainedRegClassForOperand(MO, *MRI);
    if (!RC)
      continue;
    RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
  }
  return true;
}

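// Illustrative MIR for the lowering below (wave32 shown; wave64 uses
// S_CMP_LG_U64):
//   S_CMP_LG_U32 %src, 0, implicit-def $scc
//   %dst:sreg_32 = COPY $scc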
bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(MachineInstr &I) const {
  const DebugLoc &DL = I.getDebugLoc();
  MachineBasicBlock *BB = I.getParent();

  unsigned CmpOpc =
      STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32;
  MachineInstr *Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpc))
                          .addReg(I.getOperand(1).getReg())
                          .addImm(0);
  if (!constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI))
    return false;

  Register DstReg = I.getOperand(0).getReg();
  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(AMDGPU::SCC);

  I.eraseFromParent();
  return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
}

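// Sketch of the non-constant path below (wave32 shown; wave64 uses S_MOV_B64
// and S_CSELECT_B64):
//   $scc = COPY %src
//   %dst = S_CSELECT_B32 $exec_lo, 0, implicit $scc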
bool AMDGPUInstructionSelector::selectCOPY_VCC_SCC(MachineInstr &I) const {
  const DebugLoc &DL = I.getDebugLoc();
  MachineBasicBlock *BB = I.getParent();

  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();
  std::optional<ValueAndVReg> Arg =
      getIConstantVRegValWithLookThrough(I.getOperand(1).getReg(), *MRI);

  if (Arg) {
    const int64_t Value = Arg->Value.getZExtValue();
    if (Value == 0) {
      unsigned Opcode = STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
      BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
    } else {
      assert(Value == 1);
      BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(TRI.getExec());
    }
    I.eraseFromParent();
    return RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI);
  }

  // RegBankLegalize ensures that SrcReg is bool in reg (high bits are 0).
  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC).addReg(SrcReg);

  unsigned SelectOpcode =
      STI.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
  MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
                             .addReg(TRI.getExec())
                             .addImm(0);

  I.eraseFromParent();
  return constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectReadAnyLane(MachineInstr &I) const {
  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();

  const DebugLoc &DL = I.getDebugLoc();
  MachineBasicBlock *BB = I.getParent();

  auto RFL = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
                 .addReg(SrcReg);

  I.eraseFromParent();
  return constrainSelectedInstRegOperands(*RFL, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
  const Register DefReg = I.getOperand(0).getReg();
  const LLT DefTy = MRI->getType(DefReg);

  // S1 G_PHIs should not be selected in instruction-select, instead:
  // - divergent S1 G_PHI should go through lane mask merging algorithm
  //   and be fully inst-selected in AMDGPUGlobalISelDivergenceLowering
  // - uniform S1 G_PHI should be lowered into S32 G_PHI in AMDGPURegBankSelect
  if (DefTy == LLT::scalar(1))
    return false;

  // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)

  const RegClassOrRegBank &RegClassOrBank =
      MRI->getRegClassOrRegBank(DefReg);

  const TargetRegisterClass *DefRC =
      dyn_cast<const TargetRegisterClass *>(RegClassOrBank);
  if (!DefRC) {
    if (!DefTy.isValid()) {
      LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
      return false;
    }

    const RegisterBank &RB = *cast<const RegisterBank *>(RegClassOrBank);
    DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB);
    if (!DefRC) {
      LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
      return false;
    }
  }

  // If inputs have register bank, assign corresponding reg class.
  // Note: registers don't need to have the same reg bank.
  for (unsigned i = 1; i != I.getNumOperands(); i += 2) {
    const Register SrcReg = I.getOperand(i).getReg();

    const RegisterBank *RB = MRI->getRegBankOrNull(SrcReg);
    if (RB) {
      const LLT SrcTy = MRI->getType(SrcReg);
      const TargetRegisterClass *SrcRC =
          TRI.getRegClassForTypeOnBank(SrcTy, *RB);
      if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
        return false;
    }
  }

  I.setDesc(TII.get(TargetOpcode::PHI));
  return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
}

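// For the immediate case below, e.g. splitting the 64-bit immediate
// 0x1111222233334444 yields 0x33334444 for sub0 and 0x11112222 for sub1.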
MachineOperand
AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
                                           const TargetRegisterClass &SubRC,
                                           unsigned SubIdx) const {

  MachineInstr *MI = MO.getParent();
  MachineBasicBlock *BB = MO.getParent()->getParent();
  Register DstReg = MRI->createVirtualRegister(&SubRC);

  if (MO.isReg()) {
    unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
    Register Reg = MO.getReg();
    BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
        .addReg(Reg, 0, ComposedSubIdx);

    return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
                                     MO.isKill(), MO.isDead(), MO.isUndef(),
                                     MO.isEarlyClobber(), 0, MO.isDebug(),
                                     MO.isInternalRead());
  }

  assert(MO.isImm());

  APInt Imm(64, MO.getImm());

  switch (SubIdx) {
  default:
    llvm_unreachable("do not know how to split immediate with this sub index.");
  case AMDGPU::sub0:
    return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
  case AMDGPU::sub1:
    return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
  }
}

static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
  switch (Opc) {
  case AMDGPU::G_AND:
    return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
  case AMDGPU::G_OR:
    return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
  case AMDGPU::G_XOR:
    return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
  default:
    llvm_unreachable("not a bit op");
  }
}

bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
  Register DstReg = I.getOperand(0).getReg();
  unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);

  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
      DstRB->getID() != AMDGPU::VCCRegBankID)
    return false;

  bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
                            STI.isWave64());
  I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));

  // Dead implicit-def of scc
  I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
                                         true,              // isImp
                                         false,             // isKill
                                         true));            // isDead
  return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}

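// Sketch of the 64-bit VALU expansion emitted below (illustrative MIR;
// virtual register names are hypothetical):
//   %lo, %carry = V_ADD_CO_U32_e64 %lo1, %lo2, 0
//   %hi, %dead = V_ADDC_U32_e64 %hi1, %hi2, %carry, 0
//   %dst = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1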
bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  Register DstReg = I.getOperand(0).getReg();
  const DebugLoc &DL = I.getDebugLoc();
  LLT Ty = MRI->getType(DstReg);
  if (Ty.isVector())
    return false;

  unsigned Size = Ty.getSizeInBits();
  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
  const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;

  if (Size == 32) {
    if (IsSALU) {
      const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
      MachineInstr *Add =
          BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
              .add(I.getOperand(1))
              .add(I.getOperand(2))
              .setOperandDead(3); // Dead scc
      I.eraseFromParent();
      return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
    }

    if (STI.hasAddNoCarry()) {
      const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
      I.setDesc(TII.get(Opc));
      I.addOperand(*MF, MachineOperand::CreateImm(0));
      I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
      return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
    }

    const unsigned Opc =
        Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;

    Register UnusedCarry =
        MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
    MachineInstr *Add
      = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
            .addDef(UnusedCarry, RegState::Dead)
            .add(I.getOperand(1))
            .add(I.getOperand(2))
            .addImm(0);
    I.eraseFromParent();
    return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
  }

  assert(!Sub && "illegal sub should not reach here");

  const TargetRegisterClass &RC
    = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
  const TargetRegisterClass &HalfRC
    = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;

  MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
  MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
  MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
  MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));

  Register DstLo = MRI->createVirtualRegister(&HalfRC);
  Register DstHi = MRI->createVirtualRegister(&HalfRC);

  if (IsSALU) {
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
        .add(Lo1)
        .add(Lo2);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
        .add(Hi1)
        .add(Hi2)
        .setOperandDead(3); // Dead scc
  } else {
    const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
    Register CarryReg = MRI->createVirtualRegister(CarryRC);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
        .addDef(CarryReg)
        .add(Lo1)
        .add(Lo2)
        .addImm(0);
    MachineInstr *Addc =
        BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
            .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
            .add(Hi1)
            .add(Hi2)
            .addReg(CarryReg, RegState::Kill)
            .addImm(0);

    if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
      return false;
  }

  BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
      .addReg(DstLo)
      .addImm(AMDGPU::sub0)
      .addReg(DstHi)
      .addImm(AMDGPU::sub1);

  if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
    return false;

  I.eraseFromParent();
  return true;
}

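// Illustrative scalar path below for G_UADDE, with carry in and out routed
// through SCC (MIR sketch; register names are hypothetical):
//   $scc = COPY %carry_in
//   %dst0 = S_ADDC_U32 %src0, %src1, implicit-def $scc, implicit $scc
//   %dst1:sreg_32 = COPY $scc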
bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
    MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  const DebugLoc &DL = I.getDebugLoc();
  Register Dst0Reg = I.getOperand(0).getReg();
  Register Dst1Reg = I.getOperand(1).getReg();
  const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
                     I.getOpcode() == AMDGPU::G_UADDE;
  const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
                          I.getOpcode() == AMDGPU::G_USUBE;

  if (isVCC(Dst1Reg, *MRI)) {
    unsigned NoCarryOpc =
        IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
    unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
    I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
    I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
    I.addOperand(*MF, MachineOperand::CreateImm(0));
    return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
  }

  Register Src0Reg = I.getOperand(2).getReg();
  Register Src1Reg = I.getOperand(3).getReg();

  if (HasCarryIn) {
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
        .addReg(I.getOperand(4).getReg());
  }

  unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
  unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;

  auto CarryInst = BuildMI(*BB, &I, DL,
                           TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
                       .add(I.getOperand(2))
                       .add(I.getOperand(3));

  if (MRI->use_nodbg_empty(Dst1Reg)) {
    CarryInst.setOperandDead(3); // Dead scc
  } else {
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
        .addReg(AMDGPU::SCC);
    if (!MRI->getRegClassOrNull(Dst1Reg))
      MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
  }

  if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  if (HasCarryIn &&
      !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
                                    AMDGPU::SReg_32RegClass, *MRI))
    return false;

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32(
    MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;

  unsigned Opc;
  if (Subtarget->hasMADIntraFwdBug())
    Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64
                     : AMDGPU::V_MAD_I64_I32_gfx11_e64;
  else
    Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64;
  I.setDesc(TII.get(Opc));
  I.addOperand(*MF, MachineOperand::CreateImm(0));
  I.addImplicitDefUseOperands(*MF);
  return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}

// TODO: We should probably legalize these to only using 32-bit results.
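// E.g. extracting bits [63:32] of a 128-bit source becomes a subregister
// copy, %dst:sreg_32 = COPY %src.sub1 (illustrative; the subregister index
// comes from getSubRegFromChannel(Offset / 32, DstSize / 32)).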
bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();
  LLT DstTy = MRI->getType(DstReg);
  LLT SrcTy = MRI->getType(SrcReg);
  const unsigned SrcSize = SrcTy.getSizeInBits();
  unsigned DstSize = DstTy.getSizeInBits();

  // TODO: Should handle any multiple of 32 offset.
  unsigned Offset = I.getOperand(2).getImm();
  if (Offset % 32 != 0 || DstSize > 128)
    return false;

  // 16-bit operations really use 32-bit registers.
  // FIXME: Probably should not allow 16-bit G_EXTRACT results.
  if (DstSize == 16)
    DstSize = 32;

  const TargetRegisterClass *DstRC =
      TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
  if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;

  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
  const TargetRegisterClass *SrcRC =
      TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
  if (!SrcRC)
    return false;
  unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32,
                                                         DstSize / 32);
  SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
  if (!SrcRC)
    return false;

  SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
                                    *SrcRC, I.getOperand(1));
  const DebugLoc &DL = I.getDebugLoc();
  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
      .addReg(SrcReg, 0, SubReg);

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
  MachineBasicBlock *BB = MI.getParent();
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI->getType(DstReg);
  LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());

  const unsigned SrcSize = SrcTy.getSizeInBits();
  if (SrcSize < 32)
    return selectImpl(MI, *CoverageInfo);

  const DebugLoc &DL = MI.getDebugLoc();
  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
  const unsigned DstSize = DstTy.getSizeInBits();
  const TargetRegisterClass *DstRC =
      TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
  if (!DstRC)
    return false;

  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
  MachineInstrBuilder MIB =
      BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
  for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
    MachineOperand &Src = MI.getOperand(I + 1);
    MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
    MIB.addImm(SubRegs[I]);

    const TargetRegisterClass *SrcRC
      = TRI.getConstrainedRegClassForOperand(Src, *MRI);
    if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
      return false;
  }

  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;

  MI.eraseFromParent();
  return true;
}

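// E.g. unmerging a 64-bit source into two 32-bit pieces becomes a pair of
// subregister copies (illustrative MIR):
//   %dst0 = COPY %src.sub0
//   %dst1 = COPY %src.sub1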
bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
  MachineBasicBlock *BB = MI.getParent();
  const int NumDst = MI.getNumOperands() - 1;

  MachineOperand &Src = MI.getOperand(NumDst);

  Register SrcReg = Src.getReg();
  Register DstReg0 = MI.getOperand(0).getReg();
  LLT DstTy = MRI->getType(DstReg0);
  LLT SrcTy = MRI->getType(SrcReg);

  const unsigned DstSize = DstTy.getSizeInBits();
  const unsigned SrcSize = SrcTy.getSizeInBits();
  const DebugLoc &DL = MI.getDebugLoc();
  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);

  const TargetRegisterClass *SrcRC =
      TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
  if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
    return false;

  // Note we could have mixed SGPR and VGPR destination banks for an SGPR
  // source, and this relies on the fact that the same subregister indices are
  // used for both.
  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
  for (int I = 0, E = NumDst; I != E; ++I) {
    MachineOperand &Dst = MI.getOperand(I);
    BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
        .addReg(SrcReg, 0, SubRegs[I]);

    // Make sure the subregister index is valid for the source register.
    SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
    if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
      return false;

    const TargetRegisterClass *DstRC =
        TRI.getConstrainedRegClassForOperand(Dst, *MRI);
    if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
      return false;
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
  assert(MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC ||
         MI.getOpcode() == AMDGPU::G_BUILD_VECTOR);

  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();
  LLT SrcTy = MRI->getType(Src0);
  const unsigned SrcSize = SrcTy.getSizeInBits();

  // BUILD_VECTOR with >=32 bits source is handled by MERGE_VALUE.
  if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR && SrcSize >= 32) {
    return selectG_MERGE_VALUES(MI);
  }

  // Selection logic below is for V2S16 only.
  // For G_BUILD_VECTOR_TRUNC, additionally check that the operands are s32.
  Register Dst = MI.getOperand(0).getReg();
  if (MRI->getType(Dst) != LLT::fixed_vector(2, 16) ||
      (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC &&
       SrcTy != LLT::scalar(32)))
    return selectImpl(MI, *CoverageInfo);

  const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
  if (DstBank->getID() == AMDGPU::AGPRRegBankID)
    return false;

  assert(DstBank->getID() == AMDGPU::SGPRRegBankID ||
         DstBank->getID() == AMDGPU::VGPRRegBankID);
  const bool IsVector = DstBank->getID() == AMDGPU::VGPRRegBankID;

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *BB = MI.getParent();

  // First, before trying TableGen patterns, check if both sources are
  // constants. In those cases, we can trivially compute the final constant
  // and emit a simple move.
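  // E.g. a G_BUILD_VECTOR_TRUNC of the constants 1 and 2 folds to a single
  // move of 0x00020001.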
  auto ConstSrc1 = getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
  if (ConstSrc1) {
    auto ConstSrc0 =
        getAnyConstantVRegValWithLookThrough(Src0, *MRI, true, true);
    if (ConstSrc0) {
      const int64_t K0 = ConstSrc0->Value.getSExtValue();
      const int64_t K1 = ConstSrc1->Value.getSExtValue();
      uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff;
      uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff;
      uint32_t Imm = Lo16 | (Hi16 << 16);

      // VALU
      if (IsVector) {
        BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), Dst).addImm(Imm);
        MI.eraseFromParent();
        return RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI);
      }

      // SALU
      BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst).addImm(Imm);
      MI.eraseFromParent();
      return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
    }
  }

  // Now try TableGen patterns.
  if (selectImpl(MI, *CoverageInfo))
    return true;

  // TODO: This should probably be a combine somewhere
  // (build_vector $src0, undef) -> copy $src0
  MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
  if (Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
    MI.setDesc(TII.get(AMDGPU::COPY));
    MI.removeOperand(2);
    const auto &RC =
        IsVector ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
    return RBI.constrainGenericRegister(Dst, RC, *MRI) &&
           RBI.constrainGenericRegister(Src0, RC, *MRI);
  }

  // TODO: Can be improved?
  if (IsVector) {
    Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    auto MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_AND_B32_e32), TmpReg)
                   .addImm(0xFFFF)
                   .addReg(Src0);
    if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
      return false;

    MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), Dst)
              .addReg(Src1)
              .addImm(16)
              .addReg(TmpReg);
    if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
      return false;

    MI.eraseFromParent();
    return true;
  }

  Register ShiftSrc0;
  Register ShiftSrc1;

  // With multiple uses of the shift, this will duplicate the shift and
  // increase register pressure.
  //
  // (build_vector (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16))
  //  => (S_PACK_HH_B32_B16 $src0, $src1)
  // (build_vector (lshr_oneuse SReg_32:$src0, 16), $src1)
  //  => (S_PACK_HL_B32_B16 $src0, $src1)
  // (build_vector $src0, (lshr_oneuse SReg_32:$src1, 16))
  //  => (S_PACK_LH_B32_B16 $src0, $src1)
  // (build_vector $src0, $src1)
  //  => (S_PACK_LL_B32_B16 $src0, $src1)

  bool Shift0 = mi_match(
      Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_SpecificICst(16))));

  bool Shift1 = mi_match(
      Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_SpecificICst(16))));

  unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
  if (Shift0 && Shift1) {
    Opc = AMDGPU::S_PACK_HH_B32_B16;
    MI.getOperand(1).setReg(ShiftSrc0);
    MI.getOperand(2).setReg(ShiftSrc1);
  } else if (Shift1) {
    Opc = AMDGPU::S_PACK_LH_B32_B16;
    MI.getOperand(2).setReg(ShiftSrc1);
  } else if (Shift0) {
    auto ConstSrc1 =
        getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
    if (ConstSrc1 && ConstSrc1->Value == 0) {
      // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
      auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
                     .addReg(ShiftSrc0)
                     .addImm(16)
                     .setOperandDead(3); // Dead scc

      MI.eraseFromParent();
      return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
    }
    if (STI.hasSPackHL()) {
      Opc = AMDGPU::S_PACK_HL_B32_B16;
      MI.getOperand(1).setReg(ShiftSrc0);
    }
  }

  MI.setDesc(TII.get(Opc));
  return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
  const MachineOperand &MO = I.getOperand(0);

  // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
  // regbank check here is to know why getConstrainedRegClassForOperand failed.
  const TargetRegisterClass *RC =
      TRI.getConstrainedRegClassForOperand(MO, *MRI);
  if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
      (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
    I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
    return true;
  }

  return false;
}

bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();

  Register DstReg = I.getOperand(0).getReg();
  Register Src0Reg = I.getOperand(1).getReg();
  Register Src1Reg = I.getOperand(2).getReg();
  LLT Src1Ty = MRI->getType(Src1Reg);

  unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
  unsigned InsSize = Src1Ty.getSizeInBits();

  int64_t Offset = I.getOperand(3).getImm();

  // FIXME: These cases should have been illegal and unnecessary to check here.
  if (Offset % 32 != 0 || InsSize % 32 != 0)
    return false;

  // Currently not handled by getSubRegFromChannel.
  if (InsSize > 128)
    return false;

  unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
  if (SubReg == AMDGPU::NoSubRegister)
    return false;

  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
  const TargetRegisterClass *DstRC =
      TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
  if (!DstRC)
    return false;

  const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
  const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
  const TargetRegisterClass *Src0RC =
      TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank);
  const TargetRegisterClass *Src1RC =
      TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank);

  // Deal with weird cases where the class only partially supports the subreg
  // index.
  Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
  if (!Src0RC || !Src1RC)
    return false;

  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
      !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
      !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
    return false;

  const DebugLoc &DL = I.getDebugLoc();
  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
      .addReg(Src0Reg)
      .addReg(Src1Reg)
      .addImm(SubReg);

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  Register OffsetReg = MI.getOperand(2).getReg();
  Register WidthReg = MI.getOperand(3).getReg();

  assert(RBI.getRegBank(DstReg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID &&
         "scalar BFX instructions are expanded in regbankselect");
  assert(MRI->getType(MI.getOperand(0).getReg()).getSizeInBits() == 32 &&
         "64-bit vector BFX instructions are expanded in regbankselect");

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  bool IsSigned = MI.getOpcode() == TargetOpcode::G_SBFX;
  unsigned Opc = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), DstReg)
                 .addReg(SrcReg)
                 .addReg(OffsetReg)
                 .addReg(WidthReg);
  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
  if (STI.getLDSBankCount() != 16)
    return selectImpl(MI, *CoverageInfo);

  Register Dst = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(2).getReg();
  Register M0Val = MI.getOperand(6).getReg();
  if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
    return false;

  // This requires 2 instructions. It is possible to write a pattern to support
  // this, but the generated isel emitter doesn't correctly deal with multiple
  // output instructions using the same physical register input. The copy to m0
  // is incorrectly placed before the second instruction.
  //
  // TODO: Match source modifiers.

  Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
      .addReg(M0Val);
  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
      .addImm(2)
      .addImm(MI.getOperand(4).getImm())  // $attr
      .addImm(MI.getOperand(3).getImm()); // $attrchan

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
      .addImm(0)                          // $src0_modifiers
      .addReg(Src0)                       // $src0
      .addImm(MI.getOperand(4).getImm())  // $attr
      .addImm(MI.getOperand(3).getImm())  // $attrchan
      .addImm(0)                          // $src2_modifiers
      .addReg(InterpMov)                  // $src2 - 2 f16 values selected by high
      .addImm(MI.getOperand(5).getImm())  // $high
      .addImm(0)                          // $clamp
      .addImm(0);                         // $omod

  MI.eraseFromParent();
  return true;
}

// Writelane is special in that it can use SGPR and M0 (which would normally
// count as using the constant bus twice - but in this case it is allowed since
// the lane selector doesn't count as a use of the constant bus). However, it is
// still required to abide by the 1 SGPR rule. Fix this up if we might have
// multiple SGPRs.
bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
  // With a constant bus limit of at least 2, there's no issue.
  if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
    return selectImpl(MI, *CoverageInfo);

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  Register VDst = MI.getOperand(0).getReg();
  Register Val = MI.getOperand(2).getReg();
  Register LaneSelect = MI.getOperand(3).getReg();
  Register VDstIn = MI.getOperand(4).getReg();

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);

  std::optional<ValueAndVReg> ConstSelect =
      getIConstantVRegValWithLookThrough(LaneSelect, *MRI);
  if (ConstSelect) {
    // The selector has to be an inline immediate, so we can use whatever for
    // the other operands.
    MIB.addReg(Val);
    MIB.addImm(ConstSelect->Value.getSExtValue() &
               maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
  } else {
    std::optional<ValueAndVReg> ConstVal =
        getIConstantVRegValWithLookThrough(Val, *MRI);

    // If the value written is an inline immediate, we can get away without a
    // copy to m0.
    if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value.getSExtValue(),
                                                 STI.hasInv2PiInlineImm())) {
      MIB.addImm(ConstVal->Value.getSExtValue());
      MIB.addReg(LaneSelect);
    } else {
      MIB.addReg(Val);

      // If the lane selector was originally in a VGPR and copied with
      // readfirstlane, there's a hazard to read the same SGPR from the
      // VALU. Constrain to a different SGPR to help avoid needing a nop later.
      RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass,
                                   *MRI);

      BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
          .addReg(LaneSelect);
      MIB.addReg(AMDGPU::M0);
    }
  }

  MIB.addReg(VDstIn);

  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
  Register Dst0 = MI.getOperand(0).getReg();
  Register Dst1 = MI.getOperand(1).getReg();

  LLT Ty = MRI->getType(Dst0);
  unsigned Opc;
  if (Ty == LLT::scalar(32))
    Opc = AMDGPU::V_DIV_SCALE_F32_e64;
  else if (Ty == LLT::scalar(64))
    Opc = AMDGPU::V_DIV_SCALE_F64_e64;
  else
    return false;

  // TODO: Match source modifiers.

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  Register Numer = MI.getOperand(3).getReg();
  Register Denom = MI.getOperand(4).getReg();
  unsigned ChooseDenom = MI.getOperand(5).getImm();

  Register Src0 = ChooseDenom != 0 ? Numer : Denom;

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
                 .addDef(Dst1)
                 .addImm(0)     // $src0_modifiers
                 .addUse(Src0)  // $src0
                 .addImm(0)     // $src1_modifiers
                 .addUse(Denom) // $src1
                 .addImm(0)     // $src2_modifiers
                 .addUse(Numer) // $src2
                 .addImm(0)     // $clamp
                 .addImm(0);    // $omod

  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
  Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_if_break: {
    MachineBasicBlock *BB = I.getParent();

    // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
    // SelectionDAG uses for wave32 vs wave64.
    BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
        .add(I.getOperand(0))
        .add(I.getOperand(2))
        .add(I.getOperand(3));

    Register DstReg = I.getOperand(0).getReg();
    Register Src0Reg = I.getOperand(2).getReg();
    Register Src1Reg = I.getOperand(3).getReg();

    I.eraseFromParent();

    for (Register Reg : { DstReg, Src0Reg, Src1Reg })
      MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());

    return true;
  }
  case Intrinsic::amdgcn_interp_p1_f16:
    return selectInterpP1F16(I);
  case Intrinsic::amdgcn_wqm:
    return constrainCopyLikeIntrin(I, AMDGPU::WQM);
  case Intrinsic::amdgcn_softwqm:
    return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
  case Intrinsic::amdgcn_strict_wwm:
  case Intrinsic::amdgcn_wwm:
    return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
  case Intrinsic::amdgcn_strict_wqm:
    return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
  case Intrinsic::amdgcn_writelane:
    return selectWritelane(I);
  case Intrinsic::amdgcn_div_scale:
    return selectDivScale(I);
  case Intrinsic::amdgcn_icmp:
  case Intrinsic::amdgcn_fcmp:
    if (selectImpl(I, *CoverageInfo))
      return true;
    return selectIntrinsicCmp(I);
  case Intrinsic::amdgcn_ballot:
    return selectBallot(I);
  case Intrinsic::amdgcn_reloc_constant:
    return selectRelocConstant(I);
  case Intrinsic::amdgcn_groupstaticsize:
    return selectGroupStaticSize(I);
  case Intrinsic::returnaddress:
    return selectReturnAddress(I);
  case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
  case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
  case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
  case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
  case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
  case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
  case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
  case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
    return selectSMFMACIntrin(I);
  case Intrinsic::amdgcn_permlane16_swap:
  case Intrinsic::amdgcn_permlane32_swap:
    return selectPermlaneSwapIntrin(I, IntrinsicID);
  default:
    return selectImpl(I, *CoverageInfo);
  }
}

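// Usage sketch: getV_CMPOpcode(CmpInst::ICMP_EQ, 32, ST) yields
// AMDGPU::V_CMP_EQ_U32_e64; for Size == 16 the variant additionally depends
// on the subtarget's true16 support, as selected by the lambda below.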
static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size,
                          const GCNSubtarget &ST) {
  if (Size != 16 && Size != 32 && Size != 64)
    return -1;

  if (Size == 16 && !ST.has16BitInsts())
    return -1;

  const auto Select = [&](unsigned S16Opc, unsigned TrueS16Opc,
                          unsigned FakeS16Opc, unsigned S32Opc,
                          unsigned S64Opc) {
    if (Size == 16)
      return ST.hasTrue16BitInsts()
                 ? ST.useRealTrue16Insts() ? TrueS16Opc : FakeS16Opc
                 : S16Opc;
    if (Size == 32)
      return S32Opc;
    return S64Opc;
  };

  switch (P) {
  default:
    llvm_unreachable("Unknown condition code!");
  case CmpInst::ICMP_NE:
    return Select(AMDGPU::V_CMP_NE_U16_e64, AMDGPU::V_CMP_NE_U16_t16_e64,
                  AMDGPU::V_CMP_NE_U16_fake16_e64, AMDGPU::V_CMP_NE_U32_e64,
                  AMDGPU::V_CMP_NE_U64_e64);
  case CmpInst::ICMP_EQ:
    return Select(AMDGPU::V_CMP_EQ_U16_e64, AMDGPU::V_CMP_EQ_U16_t16_e64,
                  AMDGPU::V_CMP_EQ_U16_fake16_e64, AMDGPU::V_CMP_EQ_U32_e64,
                  AMDGPU::V_CMP_EQ_U64_e64);
  case CmpInst::ICMP_SGT:
    return Select(AMDGPU::V_CMP_GT_I16_e64, AMDGPU::V_CMP_GT_I16_t16_e64,
                  AMDGPU::V_CMP_GT_I16_fake16_e64, AMDGPU::V_CMP_GT_I32_e64,
                  AMDGPU::V_CMP_GT_I64_e64);
  case CmpInst::ICMP_SGE:
    return Select(AMDGPU::V_CMP_GE_I16_e64, AMDGPU::V_CMP_GE_I16_t16_e64,
                  AMDGPU::V_CMP_GE_I16_fake16_e64, AMDGPU::V_CMP_GE_I32_e64,
                  AMDGPU::V_CMP_GE_I64_e64);
  case CmpInst::ICMP_SLT:
    return Select(AMDGPU::V_CMP_LT_I16_e64, AMDGPU::V_CMP_LT_I16_t16_e64,
                  AMDGPU::V_CMP_LT_I16_fake16_e64, AMDGPU::V_CMP_LT_I32_e64,
                  AMDGPU::V_CMP_LT_I64_e64);
  case CmpInst::ICMP_SLE:
    return Select(AMDGPU::V_CMP_LE_I16_e64, AMDGPU::V_CMP_LE_I16_t16_e64,
                  AMDGPU::V_CMP_LE_I16_fake16_e64, AMDGPU::V_CMP_LE_I32_e64,
                  AMDGPU::V_CMP_LE_I64_e64);
  case CmpInst::ICMP_UGT:
    return Select(AMDGPU::V_CMP_GT_U16_e64, AMDGPU::V_CMP_GT_U16_t16_e64,
                  AMDGPU::V_CMP_GT_U16_fake16_e64, AMDGPU::V_CMP_GT_U32_e64,
                  AMDGPU::V_CMP_GT_U64_e64);
  case CmpInst::ICMP_UGE:
    return Select(AMDGPU::V_CMP_GE_U16_e64, AMDGPU::V_CMP_GE_U16_t16_e64,
                  AMDGPU::V_CMP_GE_U16_fake16_e64, AMDGPU::V_CMP_GE_U32_e64,
                  AMDGPU::V_CMP_GE_U64_e64);
  case CmpInst::ICMP_ULT:
    return Select(AMDGPU::V_CMP_LT_U16_e64, AMDGPU::V_CMP_LT_U16_t16_e64,
                  AMDGPU::V_CMP_LT_U16_fake16_e64, AMDGPU::V_CMP_LT_U32_e64,
                  AMDGPU::V_CMP_LT_U64_e64);
  case CmpInst::ICMP_ULE:
    return Select(AMDGPU::V_CMP_LE_U16_e64, AMDGPU::V_CMP_LE_U16_t16_e64,
                  AMDGPU::V_CMP_LE_U16_fake16_e64, AMDGPU::V_CMP_LE_U32_e64,
                  AMDGPU::V_CMP_LE_U64_e64);

  case CmpInst::FCMP_OEQ:
    return Select(AMDGPU::V_CMP_EQ_F16_e64, AMDGPU::V_CMP_EQ_F16_t16_e64,
                  AMDGPU::V_CMP_EQ_F16_fake16_e64, AMDGPU::V_CMP_EQ_F32_e64,
                  AMDGPU::V_CMP_EQ_F64_e64);
  case CmpInst::FCMP_OGT:
    return Select(AMDGPU::V_CMP_GT_F16_e64, AMDGPU::V_CMP_GT_F16_t16_e64,
                  AMDGPU::V_CMP_GT_F16_fake16_e64, AMDGPU::V_CMP_GT_F32_e64,
                  AMDGPU::V_CMP_GT_F64_e64);
  case CmpInst::FCMP_OGE:
    return Select(AMDGPU::V_CMP_GE_F16_e64, AMDGPU::V_CMP_GE_F16_t16_e64,
                  AMDGPU::V_CMP_GE_F16_fake16_e64, AMDGPU::V_CMP_GE_F32_e64,
                  AMDGPU::V_CMP_GE_F64_e64);
  case CmpInst::FCMP_OLT:
    return Select(AMDGPU::V_CMP_LT_F16_e64, AMDGPU::V_CMP_LT_F16_t16_e64,
                  AMDGPU::V_CMP_LT_F16_fake16_e64, AMDGPU::V_CMP_LT_F32_e64,
                  AMDGPU::V_CMP_LT_F64_e64);
  case CmpInst::FCMP_OLE:
    return Select(AMDGPU::V_CMP_LE_F16_e64, AMDGPU::V_CMP_LE_F16_t16_e64,
                  AMDGPU::V_CMP_LE_F16_fake16_e64, AMDGPU::V_CMP_LE_F32_e64,
                  AMDGPU::V_CMP_LE_F64_e64);
  case CmpInst::FCMP_ONE:
    return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
                  AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
                  AMDGPU::V_CMP_NEQ_F64_e64);
  case CmpInst::FCMP_ORD:
    return Select(AMDGPU::V_CMP_O_F16_e64, AMDGPU::V_CMP_O_F16_t16_e64,
                  AMDGPU::V_CMP_O_F16_fake16_e64, AMDGPU::V_CMP_O_F32_e64,
                  AMDGPU::V_CMP_O_F64_e64);
  case CmpInst::FCMP_UNO:
    return Select(AMDGPU::V_CMP_U_F16_e64, AMDGPU::V_CMP_U_F16_t16_e64,
                  AMDGPU::V_CMP_U_F16_fake16_e64, AMDGPU::V_CMP_U_F32_e64,
                  AMDGPU::V_CMP_U_F64_e64);
  case CmpInst::FCMP_UEQ:
    return Select(AMDGPU::V_CMP_NLG_F16_e64, AMDGPU::V_CMP_NLG_F16_t16_e64,
                  AMDGPU::V_CMP_NLG_F16_fake16_e64, AMDGPU::V_CMP_NLG_F32_e64,
                  AMDGPU::V_CMP_NLG_F64_e64);
  case CmpInst::FCMP_UGT:
    return Select(AMDGPU::V_CMP_NLE_F16_e64, AMDGPU::V_CMP_NLE_F16_t16_e64,
                  AMDGPU::V_CMP_NLE_F16_fake16_e64, AMDGPU::V_CMP_NLE_F32_e64,
                  AMDGPU::V_CMP_NLE_F64_e64);
  case CmpInst::FCMP_UGE:
    return Select(AMDGPU::V_CMP_NLT_F16_e64, AMDGPU::V_CMP_NLT_F16_t16_e64,
                  AMDGPU::V_CMP_NLT_F16_fake16_e64, AMDGPU::V_CMP_NLT_F32_e64,
                  AMDGPU::V_CMP_NLT_F64_e64);
  case CmpInst::FCMP_ULT:
    return Select(AMDGPU::V_CMP_NGE_F16_e64, AMDGPU::V_CMP_NGE_F16_t16_e64,
                  AMDGPU::V_CMP_NGE_F16_fake16_e64, AMDGPU::V_CMP_NGE_F32_e64,
                  AMDGPU::V_CMP_NGE_F64_e64);
  case CmpInst::FCMP_ULE:
    return Select(AMDGPU::V_CMP_NGT_F16_e64, AMDGPU::V_CMP_NGT_F16_t16_e64,
                  AMDGPU::V_CMP_NGT_F16_fake16_e64, AMDGPU::V_CMP_NGT_F32_e64,
                  AMDGPU::V_CMP_NGT_F64_e64);
  case CmpInst::FCMP_UNE:
    return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
                  AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
                  AMDGPU::V_CMP_NEQ_F64_e64);
  case CmpInst::FCMP_TRUE:
    return Select(AMDGPU::V_CMP_TRU_F16_e64, AMDGPU::V_CMP_TRU_F16_t16_e64,
                  AMDGPU::V_CMP_TRU_F16_fake16_e64, AMDGPU::V_CMP_TRU_F32_e64,
                  AMDGPU::V_CMP_TRU_F64_e64);
  case CmpInst::FCMP_FALSE:
    return Select(AMDGPU::V_CMP_F_F16_e64, AMDGPU::V_CMP_F_F16_t16_e64,
                  AMDGPU::V_CMP_F_F16_fake16_e64, AMDGPU::V_CMP_F_F32_e64,
                  AMDGPU::V_CMP_F_F64_e64);
  }
}

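// E.g. getS_CMPOpcode(CmpInst::ICMP_ULT, 32) yields AMDGPU::S_CMP_LT_U32.
// 64-bit scalar comparisons only support equality, and 16-bit comparisons
// require SALU float instructions.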
int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
                                              unsigned Size) const {
  if (Size == 64) {
    if (!STI.hasScalarCompareEq64())
      return -1;

    switch (P) {
    case CmpInst::ICMP_NE:
      return AMDGPU::S_CMP_LG_U64;
    case CmpInst::ICMP_EQ:
      return AMDGPU::S_CMP_EQ_U64;
    default:
      return -1;
    }
  }

  if (Size == 32) {
    switch (P) {
    case CmpInst::ICMP_NE:
      return AMDGPU::S_CMP_LG_U32;
    case CmpInst::ICMP_EQ:
      return AMDGPU::S_CMP_EQ_U32;
    case CmpInst::ICMP_SGT:
      return AMDGPU::S_CMP_GT_I32;
    case CmpInst::ICMP_SGE:
      return AMDGPU::S_CMP_GE_I32;
    case CmpInst::ICMP_SLT:
      return AMDGPU::S_CMP_LT_I32;
    case CmpInst::ICMP_SLE:
      return AMDGPU::S_CMP_LE_I32;
    case CmpInst::ICMP_UGT:
      return AMDGPU::S_CMP_GT_U32;
    case CmpInst::ICMP_UGE:
      return AMDGPU::S_CMP_GE_U32;
    case CmpInst::ICMP_ULT:
      return AMDGPU::S_CMP_LT_U32;
    case CmpInst::ICMP_ULE:
      return AMDGPU::S_CMP_LE_U32;
    case CmpInst::FCMP_OEQ:
      return AMDGPU::S_CMP_EQ_F32;
    case CmpInst::FCMP_OGT:
      return AMDGPU::S_CMP_GT_F32;
    case CmpInst::FCMP_OGE:
      return AMDGPU::S_CMP_GE_F32;
    case CmpInst::FCMP_OLT:
      return AMDGPU::S_CMP_LT_F32;
    case CmpInst::FCMP_OLE:
      return AMDGPU::S_CMP_LE_F32;
    case CmpInst::FCMP_ONE:
      return AMDGPU::S_CMP_LG_F32;
    case CmpInst::FCMP_ORD:
      return AMDGPU::S_CMP_O_F32;
    case CmpInst::FCMP_UNO:
      return AMDGPU::S_CMP_U_F32;
    case CmpInst::FCMP_UEQ:
      return AMDGPU::S_CMP_NLG_F32;
    case CmpInst::FCMP_UGT:
      return AMDGPU::S_CMP_NLE_F32;
    case CmpInst::FCMP_UGE:
      return AMDGPU::S_CMP_NLT_F32;
    case CmpInst::FCMP_ULT:
      return AMDGPU::S_CMP_NGE_F32;
    case CmpInst::FCMP_ULE:
      return AMDGPU::S_CMP_NGT_F32;
    case CmpInst::FCMP_UNE:
      return AMDGPU::S_CMP_NEQ_F32;
    default:
      llvm_unreachable("Unknown condition code!");
    }
  }

  if (Size == 16) {
    if (!STI.hasSALUFloatInsts())
      return -1;

    switch (P) {
    case CmpInst::FCMP_OEQ:
      return AMDGPU::S_CMP_EQ_F16;
    case CmpInst::FCMP_OGT:
      return AMDGPU::S_CMP_GT_F16;
    case CmpInst::FCMP_OGE:
      return AMDGPU::S_CMP_GE_F16;
    case CmpInst::FCMP_OLT:
      return AMDGPU::S_CMP_LT_F16;
    case CmpInst::FCMP_OLE:
      return AMDGPU::S_CMP_LE_F16;
    case CmpInst::FCMP_ONE:
      return AMDGPU::S_CMP_LG_F16;
    case CmpInst::FCMP_ORD:
      return AMDGPU::S_CMP_O_F16;
    case CmpInst::FCMP_UNO:
      return AMDGPU::S_CMP_U_F16;
    case CmpInst::FCMP_UEQ:
      return AMDGPU::S_CMP_NLG_F16;
    case CmpInst::FCMP_UGT:
      return AMDGPU::S_CMP_NLE_F16;
    case CmpInst::FCMP_UGE:
      return AMDGPU::S_CMP_NLT_F16;
    case CmpInst::FCMP_ULT:
      return AMDGPU::S_CMP_NGE_F16;
    case CmpInst::FCMP_ULE:
      return AMDGPU::S_CMP_NGT_F16;
    case CmpInst::FCMP_UNE:
      return AMDGPU::S_CMP_NEQ_F16;
    default:
      llvm_unreachable("Unknown condition code!");
    }
  }

  return -1;
}

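// Illustrative scalar (SCC) path below for a uniform 32-bit integer equality
// compare:
//   S_CMP_EQ_U32 %lhs, %rhs, implicit-def $scc
//   %dst:sreg_32 = COPY $scc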
selectG_ICMP_or_FCMP(MachineInstr & I) const1442 bool AMDGPUInstructionSelector::selectG_ICMP_or_FCMP(MachineInstr &I) const {
1443
1444 MachineBasicBlock *BB = I.getParent();
1445 const DebugLoc &DL = I.getDebugLoc();
1446
1447 Register SrcReg = I.getOperand(2).getReg();
1448 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1449
1450 auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
1451
1452 Register CCReg = I.getOperand(0).getReg();
1453 if (!isVCC(CCReg, *MRI)) {
1454 int Opcode = getS_CMPOpcode(Pred, Size);
1455 if (Opcode == -1)
1456 return false;
1457 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
1458 .add(I.getOperand(2))
1459 .add(I.getOperand(3));
1460 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
1461 .addReg(AMDGPU::SCC);
1462 bool Ret =
1463 constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
1464 RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
1465 I.eraseFromParent();
1466 return Ret;
1467 }
1468
1469 if (I.getOpcode() == AMDGPU::G_FCMP)
1470 return false;
1471
1472 int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1473 if (Opcode == -1)
1474 return false;
1475
1476 MachineInstrBuilder ICmp;
1477 // t16 instructions
1478 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers)) {
1479 ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), I.getOperand(0).getReg())
1480 .addImm(0)
1481 .add(I.getOperand(2))
1482 .addImm(0)
1483 .add(I.getOperand(3))
1484 .addImm(0); // op_sel
1485 } else {
1486 ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), I.getOperand(0).getReg())
1487 .add(I.getOperand(2))
1488 .add(I.getOperand(3));
1489 }
1490
1491 RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
1492 *TRI.getBoolRC(), *MRI);
1493 bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1494 I.eraseFromParent();
1495 return Ret;
1496 }
1497
selectIntrinsicCmp(MachineInstr & I) const1498 bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
1499 Register Dst = I.getOperand(0).getReg();
1500 if (isVCC(Dst, *MRI))
1501 return false;
1502
1503 LLT DstTy = MRI->getType(Dst);
1504 if (DstTy.getSizeInBits() != STI.getWavefrontSize())
1505 return false;
1506
1507 MachineBasicBlock *BB = I.getParent();
1508 const DebugLoc &DL = I.getDebugLoc();
1509 Register SrcReg = I.getOperand(2).getReg();
1510 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1511
1512 // i1 inputs are not supported in GlobalISel.
1513 if (Size == 1)
1514 return false;
1515
1516 auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
1517 if (!CmpInst::isIntPredicate(Pred) && !CmpInst::isFPPredicate(Pred)) {
1518 BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Dst);
1519 I.eraseFromParent();
1520 return RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1521 }
1522
1523 const int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1524 if (Opcode == -1)
1525 return false;
1526
1527 MachineInstrBuilder SelectedMI;
1528 MachineOperand &LHS = I.getOperand(2);
1529 MachineOperand &RHS = I.getOperand(3);
1530 auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS.getReg());
1531 auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS.getReg());
1532 Register Src0Reg =
1533 copyToVGPRIfSrcFolded(Src0, Src0Mods, LHS, &I, /*ForceVGPR*/ true);
1534 Register Src1Reg =
1535 copyToVGPRIfSrcFolded(Src1, Src1Mods, RHS, &I, /*ForceVGPR*/ true);
1536 SelectedMI = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst);
1537 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers))
1538 SelectedMI.addImm(Src0Mods);
1539 SelectedMI.addReg(Src0Reg);
1540 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src1_modifiers))
1541 SelectedMI.addImm(Src1Mods);
1542 SelectedMI.addReg(Src1Reg);
1543 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::clamp))
1544 SelectedMI.addImm(0); // clamp
1545 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel))
1546 SelectedMI.addImm(0); // op_sel
1547
1548 RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1549 if (!constrainSelectedInstRegOperands(*SelectedMI, TII, TRI, RBI))
1550 return false;
1551
1552 I.eraseFromParent();
1553 return true;
1554 }
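// For reference, a hypothetical IR-level use of the intrinsic handled above
// (wave64, with the eq predicate encoded as CmpInst::ICMP_EQ == 32):
//   %mask = call i64 @llvm.amdgcn.icmp.i64.i32(i32 %a, i32 %b, i32 32)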
1555
1556 // Ballot has to zero out the bits of the input lane-mask that are zero in the
1557 // current exec; this is done as an AND with exec. For inputs produced by an
1558 // instruction that implicitly uses the same exec (for example, a compare in
1559 // the same basic block, or an SCC-to-VCC copy), a plain copy suffices.
1560 static bool isLaneMaskFromSameBlock(Register Reg, MachineRegisterInfo &MRI,
1561 MachineBasicBlock *MBB) {
1562 MachineInstr *MI = MRI.getVRegDef(Reg);
1563 if (MI->getParent() != MBB)
1564 return false;
1565
1566 // Lane mask generated by SCC to VCC copy.
1567 if (MI->getOpcode() == AMDGPU::COPY) {
1568 auto DstRB = MRI.getRegBankOrNull(MI->getOperand(0).getReg());
1569 auto SrcRB = MRI.getRegBankOrNull(MI->getOperand(1).getReg());
1570 if (DstRB && SrcRB && DstRB->getID() == AMDGPU::VCCRegBankID &&
1571 SrcRB->getID() == AMDGPU::SGPRRegBankID)
1572 return true;
1573 }
1574
1575 // Lane mask generated using compare with same exec.
1576 if (isa<GAnyCmp>(MI))
1577 return true;
1578
1579 Register LHS, RHS;
1580 // Look through AND.
1581 if (mi_match(Reg, MRI, m_GAnd(m_Reg(LHS), m_Reg(RHS))))
1582 return isLaneMaskFromSameBlock(LHS, MRI, MBB) ||
1583 isLaneMaskFromSameBlock(RHS, MRI, MBB);
1584
1585 return false;
1586 }
1587
1588 bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
1589 MachineBasicBlock *BB = I.getParent();
1590 const DebugLoc &DL = I.getDebugLoc();
1591 Register DstReg = I.getOperand(0).getReg();
1592 Register SrcReg = I.getOperand(2).getReg();
1593 const unsigned BallotSize = MRI->getType(DstReg).getSizeInBits();
1594 const unsigned WaveSize = STI.getWavefrontSize();
1595
1596 // In the common case, the return type matches the wave size.
1597 // However, we also support emitting i64 ballots in wave32 mode.
1598 if (BallotSize != WaveSize && (BallotSize != 64 || WaveSize != 32))
1599 return false;
1600
1601 std::optional<ValueAndVReg> Arg =
1602 getIConstantVRegValWithLookThrough(SrcReg, *MRI);
1603
1604 Register Dst = DstReg;
1605 // For an i64 ballot on wave32, first compute the wave-size (i32) ballot into a new Dst.
1606 if (BallotSize != WaveSize) {
1607 Dst = MRI->createVirtualRegister(TRI.getBoolRC());
1608 }
1609
1610 if (Arg) {
1611 const int64_t Value = Arg->Value.getZExtValue();
1612 if (Value == 0) {
1613 // Dst = S_MOV 0
1614 unsigned Opcode = WaveSize == 64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1615 BuildMI(*BB, &I, DL, TII.get(Opcode), Dst).addImm(0);
1616 } else {
1617 // Dst = COPY EXEC
1618 assert(Value == 1);
1619 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(TRI.getExec());
1620 }
1621 if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
1622 return false;
1623 } else {
1624 if (isLaneMaskFromSameBlock(SrcReg, *MRI, BB)) {
1625 // Dst = COPY SrcReg
1626 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(SrcReg);
1627 if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
1628 return false;
1629 } else {
1630 // Dst = S_AND SrcReg, EXEC
1631 unsigned AndOpc = WaveSize == 64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
1632 auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), Dst)
1633 .addReg(SrcReg)
1634 .addReg(TRI.getExec())
1635 .setOperandDead(3); // Dead scc
1636 if (!constrainSelectedInstRegOperands(*And, TII, TRI, RBI))
1637 return false;
1638 }
1639 }
1640
1641 // i64 ballot on Wave32: zero-extend i32 ballot to i64.
1642 if (BallotSize != WaveSize) {
1643 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1644 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0);
1645 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
1646 .addReg(Dst)
1647 .addImm(AMDGPU::sub0)
1648 .addReg(HiReg)
1649 .addImm(AMDGPU::sub1);
1650 }
1651
1652 I.eraseFromParent();
1653 return true;
1654 }
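// Sketch of the i64-on-wave32 case above, assuming a non-constant source
// that needs the exec mask applied (register names are illustrative):
//   %lo:sreg_32 = S_AND_B32 %src, $exec_lo, implicit-def dead $scc
//   %hi:sreg_32 = S_MOV_B32 0
//   %dst:sreg_64 = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1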
1655
1656 bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
1657 Register DstReg = I.getOperand(0).getReg();
1658 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1659 const TargetRegisterClass *DstRC = TRI.getRegClassForSizeOnBank(32, *DstBank);
1660 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
1661 return false;
1662
1663 const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;
1664
1665 Module *M = MF->getFunction().getParent();
1666 const MDNode *Metadata = I.getOperand(2).getMetadata();
1667 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
1668 auto *RelocSymbol = cast<GlobalVariable>(
1669 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
1670
1671 MachineBasicBlock *BB = I.getParent();
1672 BuildMI(*BB, &I, I.getDebugLoc(),
1673 TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
1674 .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO);
1675
1676 I.eraseFromParent();
1677 return true;
1678 }
1679
1680 bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
1681 Triple::OSType OS = MF->getTarget().getTargetTriple().getOS();
1682
1683 Register DstReg = I.getOperand(0).getReg();
1684 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1685 unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ?
1686 AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1687
1688 MachineBasicBlock *MBB = I.getParent();
1689 const DebugLoc &DL = I.getDebugLoc();
1690
1691 auto MIB = BuildMI(*MBB, &I, DL, TII.get(Mov), DstReg);
1692
1693 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
1694 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1695 MIB.addImm(MFI->getLDSSize());
1696 } else {
1697 Module *M = MF->getFunction().getParent();
1698 const GlobalValue *GV =
1699 Intrinsic::getOrInsertDeclaration(M, Intrinsic::amdgcn_groupstaticsize);
1700 MIB.addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
1701 }
1702
1703 I.eraseFromParent();
1704 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1705 }
1706
1707 bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
1708 MachineBasicBlock *MBB = I.getParent();
1709 MachineFunction &MF = *MBB->getParent();
1710 const DebugLoc &DL = I.getDebugLoc();
1711
1712 MachineOperand &Dst = I.getOperand(0);
1713 Register DstReg = Dst.getReg();
1714 unsigned Depth = I.getOperand(2).getImm();
1715
1716 const TargetRegisterClass *RC
1717 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
1718 if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
1719 !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
1720 return false;
1721
1722 // Check for kernel and shader functions
1723 if (Depth != 0 ||
1724 MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1725 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
1726 .addImm(0);
1727 I.eraseFromParent();
1728 return true;
1729 }
1730
1731 MachineFrameInfo &MFI = MF.getFrameInfo();
1732 // There is a call to @llvm.returnaddress in this function
1733 MFI.setReturnAddressIsTaken(true);
1734
1735 // Get the return address reg and mark it as an implicit live-in
1736 Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
1737 Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
1738 AMDGPU::SReg_64RegClass, DL);
1739 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1740 .addReg(LiveIn);
1741 I.eraseFromParent();
1742 return true;
1743 }
1744
1745 bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
1746 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1747 // SelectionDAG uses for wave32 vs wave64.
1748 MachineBasicBlock *BB = MI.getParent();
1749 BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
1750 .add(MI.getOperand(1));
1751
1752 Register Reg = MI.getOperand(1).getReg();
1753 MI.eraseFromParent();
1754
1755 if (!MRI->getRegClassOrNull(Reg))
1756 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1757 return true;
1758 }
1759
1760 bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
1761 MachineInstr &MI, Intrinsic::ID IntrID) const {
1762 MachineBasicBlock *MBB = MI.getParent();
1763 MachineFunction *MF = MBB->getParent();
1764 const DebugLoc &DL = MI.getDebugLoc();
1765
1766 unsigned IndexOperand = MI.getOperand(7).getImm();
1767 bool WaveRelease = MI.getOperand(8).getImm() != 0;
1768 bool WaveDone = MI.getOperand(9).getImm() != 0;
1769
1770 if (WaveDone && !WaveRelease) {
1771 // TODO: Move this to IR verifier
1772 const Function &Fn = MF->getFunction();
1773 Fn.getContext().diagnose(DiagnosticInfoUnsupported(
1774 Fn, "ds_ordered_count: wave_done requires wave_release", DL));
1775 }
1776
1777 unsigned OrderedCountIndex = IndexOperand & 0x3f;
1778 IndexOperand &= ~0x3f;
1779 unsigned CountDw = 0;
1780
1781 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
1782 CountDw = (IndexOperand >> 24) & 0xf;
1783 IndexOperand &= ~(0xf << 24);
1784
1785 if (CountDw < 1 || CountDw > 4) {
1786 const Function &Fn = MF->getFunction();
1787 Fn.getContext().diagnose(DiagnosticInfoUnsupported(
1788 Fn, "ds_ordered_count: dword count must be between 1 and 4", DL));
1789 CountDw = 1;
1790 }
1791 }
1792
1793 if (IndexOperand) {
1794 const Function &Fn = MF->getFunction();
1795 Fn.getContext().diagnose(DiagnosticInfoUnsupported(
1796 Fn, "ds_ordered_count: bad index operand", DL));
1797 }
1798
1799 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
1800 unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);
1801
1802 unsigned Offset0 = OrderedCountIndex << 2;
1803 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
1804
1805 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
1806 Offset1 |= (CountDw - 1) << 6;
1807
1808 if (STI.getGeneration() < AMDGPUSubtarget::GFX11)
1809 Offset1 |= ShaderType << 2;
1810
1811 unsigned Offset = Offset0 | (Offset1 << 8);
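// Rough layout of the offset computed above (sketch for GFX10, where both
// the shader-type and dword-count fields are present):
//   Offset[7:2]   = ordered count index
//   Offset[8]     = wave_release, Offset[9] = wave_done
//   Offset[11:10] = shader type
//   Offset[12]    = instruction (0 = add, 1 = swap)
//   Offset[15:14] = dword count - 1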
1812
1813 Register M0Val = MI.getOperand(2).getReg();
1814 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1815 .addReg(M0Val);
1816
1817 Register DstReg = MI.getOperand(0).getReg();
1818 Register ValReg = MI.getOperand(3).getReg();
1819 MachineInstrBuilder DS =
1820 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
1821 .addReg(ValReg)
1822 .addImm(Offset)
1823 .cloneMemRefs(MI);
1824
1825 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
1826 return false;
1827
1828 bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
1829 MI.eraseFromParent();
1830 return Ret;
1831 }
1832
1833 static unsigned gwsIntrinToOpcode(unsigned IntrID) {
1834 switch (IntrID) {
1835 case Intrinsic::amdgcn_ds_gws_init:
1836 return AMDGPU::DS_GWS_INIT;
1837 case Intrinsic::amdgcn_ds_gws_barrier:
1838 return AMDGPU::DS_GWS_BARRIER;
1839 case Intrinsic::amdgcn_ds_gws_sema_v:
1840 return AMDGPU::DS_GWS_SEMA_V;
1841 case Intrinsic::amdgcn_ds_gws_sema_br:
1842 return AMDGPU::DS_GWS_SEMA_BR;
1843 case Intrinsic::amdgcn_ds_gws_sema_p:
1844 return AMDGPU::DS_GWS_SEMA_P;
1845 case Intrinsic::amdgcn_ds_gws_sema_release_all:
1846 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
1847 default:
1848 llvm_unreachable("not a gws intrinsic");
1849 }
1850 }
1851
1852 bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
1853 Intrinsic::ID IID) const {
1854 if (!STI.hasGWS() || (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
1855 !STI.hasGWSSemaReleaseAll()))
1856 return false;
1857
1858 // intrinsic ID, vsrc, offset
1859 const bool HasVSrc = MI.getNumOperands() == 3;
1860 assert(HasVSrc || MI.getNumOperands() == 2);
1861
1862 Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
1863 const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
1864 if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
1865 return false;
1866
1867 MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1868 unsigned ImmOffset;
1869
1870 MachineBasicBlock *MBB = MI.getParent();
1871 const DebugLoc &DL = MI.getDebugLoc();
1872
1873 MachineInstr *Readfirstlane = nullptr;
1874
1875 // If we legalized the VGPR input, strip out the readfirstlane to analyze the
1876 // incoming offset, in case there's an add of a constant. We'll have to put it
1877 // back later.
1878 if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
1879 Readfirstlane = OffsetDef;
1880 BaseOffset = OffsetDef->getOperand(1).getReg();
1881 OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1882 }
1883
1884 if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
1885 // If we have a constant offset, try to use the 0 in m0 as the base.
1886 // TODO: Look into changing the default m0 initialization value. If the
1887 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
1888 // the immediate offset.
1889
1890 ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
1891 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1892 .addImm(0);
1893 } else {
1894 std::tie(BaseOffset, ImmOffset) =
1895 AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset, VT);
1896
1897 if (Readfirstlane) {
1898 // We have the constant offset now, so put the readfirstlane back on the
1899 // variable component.
1900 if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
1901 return false;
1902
1903 Readfirstlane->getOperand(1).setReg(BaseOffset);
1904 BaseOffset = Readfirstlane->getOperand(0).getReg();
1905 } else {
1906 if (!RBI.constrainGenericRegister(BaseOffset,
1907 AMDGPU::SReg_32RegClass, *MRI))
1908 return false;
1909 }
1910
1911 Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1912 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
1913 .addReg(BaseOffset)
1914 .addImm(16)
1915 .setOperandDead(3); // Dead scc
1916
1917 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1918 .addReg(M0Base);
1919 }
1920
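// At this point M0[21:16] holds the variable part of the resource id. A
// sketch of what the non-constant path above just emitted (illustrative
// register names):
//   %m0base:sreg_32 = S_LSHL_B32 %baseoffset, 16, implicit-def dead $scc
//   $m0 = COPY %m0base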
1921 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
1922 // offset field) % 64. Some versions of the programming guide omit the m0
1923 // part, or claim it's from offset 0.
1924 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID)));
1925
1926 if (HasVSrc) {
1927 Register VSrc = MI.getOperand(1).getReg();
1928 MIB.addReg(VSrc);
1929
1930 if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
1931 return false;
1932 }
1933
1934 MIB.addImm(ImmOffset)
1935 .cloneMemRefs(MI);
1936
1937 TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::data0);
1938
1939 MI.eraseFromParent();
1940 return true;
1941 }
1942
1943 bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
1944 bool IsAppend) const {
1945 Register PtrBase = MI.getOperand(2).getReg();
1946 LLT PtrTy = MRI->getType(PtrBase);
1947 bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
1948
1949 unsigned Offset;
1950 std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));
1951
1952 // TODO: Should this try to look through readfirstlane like GWS?
1953 if (!isDSOffsetLegal(PtrBase, Offset)) {
1954 PtrBase = MI.getOperand(2).getReg();
1955 Offset = 0;
1956 }
1957
1958 MachineBasicBlock *MBB = MI.getParent();
1959 const DebugLoc &DL = MI.getDebugLoc();
1960 const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
1961
1962 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1963 .addReg(PtrBase);
1964 if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
1965 return false;
1966
1967 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
1968 .addImm(Offset)
1969 .addImm(IsGDS ? -1 : 0)
1970 .cloneMemRefs(MI);
1971 MI.eraseFromParent();
1972 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1973 }
1974
1975 bool AMDGPUInstructionSelector::selectInitWholeWave(MachineInstr &MI) const {
1976 MachineFunction *MF = MI.getParent()->getParent();
1977 SIMachineFunctionInfo *MFInfo = MF->getInfo<SIMachineFunctionInfo>();
1978
1979 MFInfo->setInitWholeWave();
1980 return selectImpl(MI, *CoverageInfo);
1981 }
1982
1983 bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
1984 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
1985 if (TM.getOptLevel() > CodeGenOptLevel::None) {
1986 unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second;
1987 if (WGSize <= STI.getWavefrontSize()) {
1988 // If the workgroup fits in a wave, remove s_barrier_signal and lower
1989 // s_barrier/s_barrier_wait to wave_barrier.
1990 if (IntrinsicID == Intrinsic::amdgcn_s_barrier ||
1991 IntrinsicID == Intrinsic::amdgcn_s_barrier_wait) {
1992 MachineBasicBlock *MBB = MI.getParent();
1993 const DebugLoc &DL = MI.getDebugLoc();
1994 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::WAVE_BARRIER));
1995 }
1996 MI.eraseFromParent();
1997 return true;
1998 }
1999 }
2000
2001 if (STI.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) {
2002 // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
2003 MachineBasicBlock *MBB = MI.getParent();
2004 const DebugLoc &DL = MI.getDebugLoc();
2005 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM))
2006 .addImm(AMDGPU::Barrier::WORKGROUP);
2007 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_WAIT))
2008 .addImm(AMDGPU::Barrier::WORKGROUP);
2009 MI.eraseFromParent();
2010 return true;
2011 }
2012
2013 return selectImpl(MI, *CoverageInfo);
2014 }
2015
2016 static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
2017 bool &IsTexFail) {
2018 if (TexFailCtrl)
2019 IsTexFail = true;
2020
2021 TFE = TexFailCtrl & 0x1;
2022 TexFailCtrl &= ~(uint64_t)0x1;
2023 LWE = TexFailCtrl & 0x2;
2024 TexFailCtrl &= ~(uint64_t)0x2;
2025
2026 return TexFailCtrl == 0;
2027 }
2028
2029 bool AMDGPUInstructionSelector::selectImageIntrinsic(
2030 MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
2031 MachineBasicBlock *MBB = MI.getParent();
2032 const DebugLoc &DL = MI.getDebugLoc();
2033
2034 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
2035 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
2036
2037 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
2038 unsigned IntrOpcode = Intr->BaseOpcode;
2039 const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
2040 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI);
2041 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
2042
2043 const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;
2044
2045 Register VDataIn, VDataOut;
2046 LLT VDataTy;
2047 int NumVDataDwords = -1;
2048 bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 ||
2049 MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16;
2050
2051 bool Unorm;
2052 if (!BaseOpcode->Sampler)
2053 Unorm = true;
2054 else
2055 Unorm = MI.getOperand(ArgOffset + Intr->UnormIndex).getImm() != 0;
2056
2057 bool TFE;
2058 bool LWE;
2059 bool IsTexFail = false;
2060 if (!parseTexFail(MI.getOperand(ArgOffset + Intr->TexFailCtrlIndex).getImm(),
2061 TFE, LWE, IsTexFail))
2062 return false;
2063
2064 const int Flags = MI.getOperand(ArgOffset + Intr->NumArgs).getImm();
2065 const bool IsA16 = (Flags & 1) != 0;
2066 const bool IsG16 = (Flags & 2) != 0;
2067
2068 // A16 implies 16 bit gradients if subtarget doesn't support G16
2069 if (IsA16 && !STI.hasG16() && !IsG16)
2070 return false;
2071
2072 unsigned DMask = 0;
2073 unsigned DMaskLanes = 0;
2074
2075 if (BaseOpcode->Atomic) {
2076 VDataOut = MI.getOperand(0).getReg();
2077 VDataIn = MI.getOperand(2).getReg();
2078 LLT Ty = MRI->getType(VDataIn);
2079
2080 // Be careful to allow atomic swap on 16-bit element vectors.
2081 const bool Is64Bit = BaseOpcode->AtomicX2 ?
2082 Ty.getSizeInBits() == 128 :
2083 Ty.getSizeInBits() == 64;
2084
2085 if (BaseOpcode->AtomicX2) {
2086 assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
2087
2088 DMask = Is64Bit ? 0xf : 0x3;
2089 NumVDataDwords = Is64Bit ? 4 : 2;
2090 } else {
2091 DMask = Is64Bit ? 0x3 : 0x1;
2092 NumVDataDwords = Is64Bit ? 2 : 1;
2093 }
2094 } else {
2095 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
2096 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
2097
2098 if (BaseOpcode->Store) {
2099 VDataIn = MI.getOperand(1).getReg();
2100 VDataTy = MRI->getType(VDataIn);
2101 NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
2102 } else if (BaseOpcode->NoReturn) {
2103 NumVDataDwords = 0;
2104 } else {
2105 VDataOut = MI.getOperand(0).getReg();
2106 VDataTy = MRI->getType(VDataOut);
2107 NumVDataDwords = DMaskLanes;
2108
2109 if (IsD16 && !STI.hasUnpackedD16VMem())
2110 NumVDataDwords = (DMaskLanes + 1) / 2;
2111 }
2112 }
2113
2114 // Set G16 opcode
2115 if (Subtarget->hasG16() && IsG16) {
2116 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
2117 AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
2118 assert(G16MappingInfo);
2119 IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
2120 }
2121
2122 // TODO: Check this in verifier.
2123 assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
2124
2125 unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
2126 if (BaseOpcode->Atomic)
2127 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
2128 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
2129 AMDGPU::CPol::VOLATILE))
2130 return false;
2131
2132 int NumVAddrRegs = 0;
2133 int NumVAddrDwords = 0;
2134 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
2135 // Skip the $noregs and 0s inserted during legalization.
2136 MachineOperand &AddrOp = MI.getOperand(ArgOffset + I);
2137 if (!AddrOp.isReg())
2138 continue; // XXX - Break?
2139
2140 Register Addr = AddrOp.getReg();
2141 if (!Addr)
2142 break;
2143
2144 ++NumVAddrRegs;
2145 NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
2146 }
2147
2148 // The legalizer preprocessed the intrinsic arguments. If we aren't using
2149 // NSA, these should have been packed into a single value in the first
2150 // address register.
2151 const bool UseNSA =
2152 NumVAddrRegs != 1 &&
2153 (STI.hasPartialNSAEncoding() ? NumVAddrDwords >= NumVAddrRegs
2154 : NumVAddrDwords == NumVAddrRegs);
2155 if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
2156 LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
2157 return false;
2158 }
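// For example, five separate 32-bit address registers (NumVAddrRegs == 5,
// NumVAddrDwords == 5) qualify for NSA, while a single packed 128-bit
// address register (NumVAddrRegs == 1) never does.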
2159
2160 if (IsTexFail)
2161 ++NumVDataDwords;
2162
2163 int Opcode = -1;
2164 if (IsGFX12Plus) {
2165 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
2166 NumVDataDwords, NumVAddrDwords);
2167 } else if (IsGFX11Plus) {
2168 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
2169 UseNSA ? AMDGPU::MIMGEncGfx11NSA
2170 : AMDGPU::MIMGEncGfx11Default,
2171 NumVDataDwords, NumVAddrDwords);
2172 } else if (IsGFX10Plus) {
2173 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
2174 UseNSA ? AMDGPU::MIMGEncGfx10NSA
2175 : AMDGPU::MIMGEncGfx10Default,
2176 NumVDataDwords, NumVAddrDwords);
2177 } else {
2178 if (Subtarget->hasGFX90AInsts()) {
2179 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
2180 NumVDataDwords, NumVAddrDwords);
2181 if (Opcode == -1) {
2182 LLVM_DEBUG(
2183 dbgs()
2184 << "requested image instruction is not supported on this GPU\n");
2185 return false;
2186 }
2187 }
2188 if (Opcode == -1 &&
2189 STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
2190 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
2191 NumVDataDwords, NumVAddrDwords);
2192 if (Opcode == -1)
2193 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
2194 NumVDataDwords, NumVAddrDwords);
2195 }
2196 if (Opcode == -1)
2197 return false;
2198
2199 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
2200 .cloneMemRefs(MI);
2201
2202 if (VDataOut) {
2203 if (BaseOpcode->AtomicX2) {
2204 const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;
2205
2206 Register TmpReg = MRI->createVirtualRegister(
2207 Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
2208 unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
2209
2210 MIB.addDef(TmpReg);
2211 if (!MRI->use_empty(VDataOut)) {
2212 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
2213 .addReg(TmpReg, RegState::Kill, SubReg);
2214 }
2215
2216 } else {
2217 MIB.addDef(VDataOut); // vdata output
2218 }
2219 }
2220
2221 if (VDataIn)
2222 MIB.addReg(VDataIn); // vdata input
2223
2224 for (int I = 0; I != NumVAddrRegs; ++I) {
2225 MachineOperand &SrcOp = MI.getOperand(ArgOffset + Intr->VAddrStart + I);
2226 if (SrcOp.isReg()) {
2227 assert(SrcOp.getReg() != 0);
2228 MIB.addReg(SrcOp.getReg());
2229 }
2230 }
2231
2232 MIB.addReg(MI.getOperand(ArgOffset + Intr->RsrcIndex).getReg());
2233 if (BaseOpcode->Sampler)
2234 MIB.addReg(MI.getOperand(ArgOffset + Intr->SampIndex).getReg());
2235
2236 MIB.addImm(DMask); // dmask
2237
2238 if (IsGFX10Plus)
2239 MIB.addImm(DimInfo->Encoding);
2240 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::unorm))
2241 MIB.addImm(Unorm);
2242
2243 MIB.addImm(CPol);
2244 MIB.addImm(IsA16 && // a16 or r128
2245 STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
2246 if (IsGFX10Plus)
2247 MIB.addImm(IsA16 ? -1 : 0);
2248
2249 if (!Subtarget->hasGFX90AInsts()) {
2250 MIB.addImm(TFE); // tfe
2251 } else if (TFE) {
2252 LLVM_DEBUG(dbgs() << "TFE is not supported on this GPU\n");
2253 return false;
2254 }
2255
2256 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::lwe))
2257 MIB.addImm(LWE); // lwe
2258 if (!IsGFX10Plus)
2259 MIB.addImm(DimInfo->DA ? -1 : 0);
2260 if (BaseOpcode->HasD16)
2261 MIB.addImm(IsD16 ? -1 : 0);
2262
2263 MI.eraseFromParent();
2264 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2265 TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::vaddr);
2266 return true;
2267 }
2268
2269 // We need to handle this here because tablegen doesn't support matching
2270 // instructions with multiple outputs.
2271 bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic(
2272 MachineInstr &MI) const {
2273 Register Dst0 = MI.getOperand(0).getReg();
2274 Register Dst1 = MI.getOperand(1).getReg();
2275
2276 const DebugLoc &DL = MI.getDebugLoc();
2277 MachineBasicBlock *MBB = MI.getParent();
2278
2279 Register Addr = MI.getOperand(3).getReg();
2280 Register Data0 = MI.getOperand(4).getReg();
2281 Register Data1 = MI.getOperand(5).getReg();
2282 unsigned Offset = MI.getOperand(6).getImm();
2283
2284 unsigned Opc;
2285 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
2286 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2287 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2288 Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
2289 break;
2290 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2291 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP1_RTN_B32;
2292 break;
2293 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
2294 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP2_RTN_B64;
2295 break;
2296 }
2297
2298 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
2299 .addDef(Dst1)
2300 .addUse(Addr)
2301 .addUse(Data0)
2302 .addUse(Data1)
2303 .addImm(Offset)
2304 .cloneMemRefs(MI);
2305
2306 MI.eraseFromParent();
2307 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2308 }
2309
2310 bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
2311 MachineInstr &I) const {
2312 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
2313 switch (IntrinsicID) {
2314 case Intrinsic::amdgcn_end_cf:
2315 return selectEndCfIntrinsic(I);
2316 case Intrinsic::amdgcn_ds_ordered_add:
2317 case Intrinsic::amdgcn_ds_ordered_swap:
2318 return selectDSOrderedIntrinsic(I, IntrinsicID);
2319 case Intrinsic::amdgcn_ds_gws_init:
2320 case Intrinsic::amdgcn_ds_gws_barrier:
2321 case Intrinsic::amdgcn_ds_gws_sema_v:
2322 case Intrinsic::amdgcn_ds_gws_sema_br:
2323 case Intrinsic::amdgcn_ds_gws_sema_p:
2324 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2325 return selectDSGWSIntrinsic(I, IntrinsicID);
2326 case Intrinsic::amdgcn_ds_append:
2327 return selectDSAppendConsume(I, true);
2328 case Intrinsic::amdgcn_ds_consume:
2329 return selectDSAppendConsume(I, false);
2330 case Intrinsic::amdgcn_init_whole_wave:
2331 return selectInitWholeWave(I);
2332 case Intrinsic::amdgcn_s_barrier:
2333 case Intrinsic::amdgcn_s_barrier_signal:
2334 case Intrinsic::amdgcn_s_barrier_wait:
2335 return selectSBarrier(I);
2336 case Intrinsic::amdgcn_raw_buffer_load_lds:
2337 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
2338 case Intrinsic::amdgcn_struct_buffer_load_lds:
2339 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
2340 return selectBufferLoadLds(I);
2341 // Until we can store both the address space of the global and the LDS
2342 // arguments by having two MachineMemOperands on an intrinsic, we just trust
2343 // that the argument is a global pointer (buffer pointers have been handled by
2344 // an LLVM IR-level lowering).
2345 case Intrinsic::amdgcn_load_to_lds:
2346 case Intrinsic::amdgcn_global_load_lds:
2347 return selectGlobalLoadLds(I);
2348 case Intrinsic::amdgcn_exp_compr:
2349 if (!STI.hasCompressedExport()) {
2350 Function &F = I.getMF()->getFunction();
2351 F.getContext().diagnose(
2352 DiagnosticInfoUnsupported(F, "intrinsic not supported on subtarget",
2353 I.getDebugLoc(), DS_Error));
2354 return false;
2355 }
2356 break;
2357 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2358 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2359 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2360 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
2361 return selectDSBvhStackIntrinsic(I);
2362 case Intrinsic::amdgcn_s_barrier_signal_var:
2363 return selectNamedBarrierInit(I, IntrinsicID);
2364 case Intrinsic::amdgcn_s_get_named_barrier_state:
2365 return selectNamedBarrierInst(I, IntrinsicID);
2366 case Intrinsic::amdgcn_s_get_barrier_state:
2367 return selectSGetBarrierState(I, IntrinsicID);
2368 case Intrinsic::amdgcn_s_barrier_signal_isfirst:
2369 return selectSBarrierSignalIsfirst(I, IntrinsicID);
2370 }
2371 return selectImpl(I, *CoverageInfo);
2372 }
2373
2374 bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
2375 if (selectImpl(I, *CoverageInfo))
2376 return true;
2377
2378 MachineBasicBlock *BB = I.getParent();
2379 const DebugLoc &DL = I.getDebugLoc();
2380
2381 Register DstReg = I.getOperand(0).getReg();
2382 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
2383 assert(Size <= 32 || Size == 64);
2384 const MachineOperand &CCOp = I.getOperand(1);
2385 Register CCReg = CCOp.getReg();
2386 if (!isVCC(CCReg, *MRI)) {
2387 unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
2388 AMDGPU::S_CSELECT_B32;
2389 MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
2390 .addReg(CCReg);
2391
2392 // The generic constrainSelectedInstRegOperands doesn't work for the scc
2393 // register bank, because it does not cover the register class used to
2394 // represent it. So we need to set the register class manually here.
2395 if (!MRI->getRegClassOrNull(CCReg))
2396 MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
2397 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
2398 .add(I.getOperand(2))
2399 .add(I.getOperand(3));
2400
2401 bool Ret = false;
2402 Ret |= constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
2403 Ret |= constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
2404 I.eraseFromParent();
2405 return Ret;
2406 }
2407
2408 // Wide VGPR select should have been split in RegBankSelect.
2409 if (Size > 32)
2410 return false;
2411
2412 MachineInstr *Select =
2413 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2414 .addImm(0)
2415 .add(I.getOperand(3))
2416 .addImm(0)
2417 .add(I.getOperand(2))
2418 .add(I.getOperand(1));
2419
2420 bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
2421 I.eraseFromParent();
2422 return Ret;
2423 }
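// Sketch of the two selections above (illustrative register names). A
// uniform 32-bit select becomes
//   $scc = COPY %cc
//   %dst = S_CSELECT_B32 %tval, %fval, implicit $scc
// and a divergent one becomes
//   %dst = V_CNDMASK_B32_e64 0, %fval, 0, %tval, %cc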
2424
2425 bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
2426 Register DstReg = I.getOperand(0).getReg();
2427 Register SrcReg = I.getOperand(1).getReg();
2428 const LLT DstTy = MRI->getType(DstReg);
2429 const LLT SrcTy = MRI->getType(SrcReg);
2430 const LLT S1 = LLT::scalar(1);
2431
2432 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2433 const RegisterBank *DstRB;
2434 if (DstTy == S1) {
2435 // This is a special case. We don't treat s1 for legalization artifacts as
2436 // vcc booleans.
2437 DstRB = SrcRB;
2438 } else {
2439 DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2440 if (SrcRB != DstRB)
2441 return false;
2442 }
2443
2444 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
2445
2446 unsigned DstSize = DstTy.getSizeInBits();
2447 unsigned SrcSize = SrcTy.getSizeInBits();
2448
2449 const TargetRegisterClass *SrcRC =
2450 TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB);
2451 const TargetRegisterClass *DstRC =
2452 TRI.getRegClassForSizeOnBank(DstSize, *DstRB);
2453 if (!SrcRC || !DstRC)
2454 return false;
2455
2456 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2457 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
2458 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
2459 return false;
2460 }
2461
2462 if (DstRC == &AMDGPU::VGPR_16RegClass && SrcSize == 32) {
2463 assert(STI.useRealTrue16Insts());
2464 const DebugLoc &DL = I.getDebugLoc();
2465 MachineBasicBlock *MBB = I.getParent();
2466 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), DstReg)
2467 .addReg(SrcReg, 0, AMDGPU::lo16);
2468 I.eraseFromParent();
2469 return true;
2470 }
2471
2472 if (DstTy == LLT::fixed_vector(2, 16) && SrcTy == LLT::fixed_vector(2, 32)) {
2473 MachineBasicBlock *MBB = I.getParent();
2474 const DebugLoc &DL = I.getDebugLoc();
2475
2476 Register LoReg = MRI->createVirtualRegister(DstRC);
2477 Register HiReg = MRI->createVirtualRegister(DstRC);
2478 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
2479 .addReg(SrcReg, 0, AMDGPU::sub0);
2480 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
2481 .addReg(SrcReg, 0, AMDGPU::sub1);
2482
2483 if (IsVALU && STI.hasSDWA()) {
2484 // Write the low 16-bits of the high element into the high 16-bits of the
2485 // low element.
2486 MachineInstr *MovSDWA =
2487 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2488 .addImm(0) // $src0_modifiers
2489 .addReg(HiReg) // $src0
2490 .addImm(0) // $clamp
2491 .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel
2492 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2493 .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel
2494 .addReg(LoReg, RegState::Implicit);
2495 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2496 } else {
2497 Register TmpReg0 = MRI->createVirtualRegister(DstRC);
2498 Register TmpReg1 = MRI->createVirtualRegister(DstRC);
2499 Register ImmReg = MRI->createVirtualRegister(DstRC);
2500 if (IsVALU) {
2501 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
2502 .addImm(16)
2503 .addReg(HiReg);
2504 } else {
2505 BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
2506 .addReg(HiReg)
2507 .addImm(16)
2508 .setOperandDead(3); // Dead scc
2509 }
2510
2511 unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
2512 unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2513 unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
2514
2515 BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
2516 .addImm(0xffff);
2517 auto And = BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
2518 .addReg(LoReg)
2519 .addReg(ImmReg);
2520 auto Or = BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
2521 .addReg(TmpReg0)
2522 .addReg(TmpReg1);
2523
2524 if (!IsVALU) {
2525 And.setOperandDead(3); // Dead scc
2526 Or.setOperandDead(3); // Dead scc
2527 }
2528 }
2529
2530 I.eraseFromParent();
2531 return true;
2532 }
2533
2534 if (!DstTy.isScalar())
2535 return false;
2536
2537 if (SrcSize > 32) {
2538 unsigned SubRegIdx = DstSize < 32
2539 ? static_cast<unsigned>(AMDGPU::sub0)
2540 : TRI.getSubRegFromChannel(0, DstSize / 32);
2541 if (SubRegIdx == AMDGPU::NoSubRegister)
2542 return false;
2543
2544 // Deal with weird cases where the class only partially supports the subreg
2545 // index.
2546 const TargetRegisterClass *SrcWithSubRC
2547 = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
2548 if (!SrcWithSubRC)
2549 return false;
2550
2551 if (SrcWithSubRC != SrcRC) {
2552 if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
2553 return false;
2554 }
2555
2556 I.getOperand(1).setSubReg(SubRegIdx);
2557 }
2558
2559 I.setDesc(TII.get(TargetOpcode::COPY));
2560 return true;
2561 }
2562
2563 /// \returns true if a bitmask for \p Size bits will be an inline immediate.
2564 static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
2565 Mask = maskTrailingOnes<unsigned>(Size);
2566 int SignedMask = static_cast<int>(Mask);
2567 return SignedMask >= -16 && SignedMask <= 64;
2568 }
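// E.g. for Size == 4 the mask 0xf (15) is an inline immediate, so a plain
// AND is preferable; for Size == 16 the mask 0xffff (65535) is not, and a
// BFE is used instead.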
2569
2570 // Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
2571 const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
2572 Register Reg, const MachineRegisterInfo &MRI,
2573 const TargetRegisterInfo &TRI) const {
2574 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
2575 if (auto *RB = dyn_cast<const RegisterBank *>(RegClassOrBank))
2576 return RB;
2577
2578 // Ignore the type, since we don't use vcc in artifacts.
2579 if (auto *RC = dyn_cast<const TargetRegisterClass *>(RegClassOrBank))
2580 return &RBI.getRegBankFromRegClass(*RC, LLT());
2581 return nullptr;
2582 }
2583
2584 bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
2585 bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
2586 bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
2587 const DebugLoc &DL = I.getDebugLoc();
2588 MachineBasicBlock &MBB = *I.getParent();
2589 const Register DstReg = I.getOperand(0).getReg();
2590 const Register SrcReg = I.getOperand(1).getReg();
2591
2592 const LLT DstTy = MRI->getType(DstReg);
2593 const LLT SrcTy = MRI->getType(SrcReg);
2594 const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
2595 I.getOperand(2).getImm() : SrcTy.getSizeInBits();
2596 const unsigned DstSize = DstTy.getSizeInBits();
2597 if (!DstTy.isScalar())
2598 return false;
2599
2600 // Artifact casts should never use vcc.
2601 const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);
2602
2603 // FIXME: This should probably be illegal and split earlier.
2604 if (I.getOpcode() == AMDGPU::G_ANYEXT) {
2605 if (DstSize <= 32)
2606 return selectCOPY(I);
2607
2608 const TargetRegisterClass *SrcRC =
2609 TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank);
2610 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
2611 const TargetRegisterClass *DstRC =
2612 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
2613
2614 Register UndefReg = MRI->createVirtualRegister(SrcRC);
2615 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2616 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2617 .addReg(SrcReg)
2618 .addImm(AMDGPU::sub0)
2619 .addReg(UndefReg)
2620 .addImm(AMDGPU::sub1);
2621 I.eraseFromParent();
2622
2623 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&
2624 RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);
2625 }
2626
2627 if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
2628 // 64-bit should have been split up in RegBankSelect
2629
2630 // Try to use an and with a mask if it will save code size.
2631 unsigned Mask;
2632 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2633 MachineInstr *ExtI =
2634 BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
2635 .addImm(Mask)
2636 .addReg(SrcReg);
2637 I.eraseFromParent();
2638 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2639 }
2640
2641 const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2642 MachineInstr *ExtI =
2643 BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
2644 .addReg(SrcReg)
2645 .addImm(0) // Offset
2646 .addImm(SrcSize); // Width
2647 I.eraseFromParent();
2648 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2649 }
2650
2651 if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
2652 const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
2653 AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
2654 if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
2655 return false;
2656
2657 if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
2658 const unsigned SextOpc = SrcSize == 8 ?
2659 AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
2660 BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
2661 .addReg(SrcReg);
2662 I.eraseFromParent();
2663 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2664 }
2665
2666 // Using a single 32-bit SALU to calculate the high half is smaller than
2667 // S_BFE with a literal constant operand.
2668 if (DstSize > 32 && SrcSize == 32) {
2669 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2670 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2671 if (Signed) {
2672 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_ASHR_I32), HiReg)
2673 .addReg(SrcReg, 0, SubReg)
2674 .addImm(31)
2675 .setOperandDead(3); // Dead scc
2676 } else {
2677 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg)
2678 .addImm(0);
2679 }
2680 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2681 .addReg(SrcReg, 0, SubReg)
2682 .addImm(AMDGPU::sub0)
2683 .addReg(HiReg)
2684 .addImm(AMDGPU::sub1);
2685 I.eraseFromParent();
2686 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass,
2687 *MRI);
2688 }
2689
2690 const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
2691 const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2692
2693 // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16] = width.
2694 if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
2695 // We need a 64-bit register source, but the high bits don't matter.
2696 Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
2697 Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2698 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2699
2700 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2701 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
2702 .addReg(SrcReg, 0, SubReg)
2703 .addImm(AMDGPU::sub0)
2704 .addReg(UndefReg)
2705 .addImm(AMDGPU::sub1);
2706
2707 BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
2708 .addReg(ExtReg)
2709 .addImm(SrcSize << 16);
2710
2711 I.eraseFromParent();
2712 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
2713 }
2714
2715 unsigned Mask;
2716 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2717 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
2718 .addReg(SrcReg)
2719 .addImm(Mask)
2720 .setOperandDead(3); // Dead scc
2721 } else {
2722 BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
2723 .addReg(SrcReg)
2724 .addImm(SrcSize << 16);
2725 }
2726
2727 I.eraseFromParent();
2728 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2729 }
2730
2731 return false;
2732 }
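// Typical results of the above (sketch, illustrative register names): a
// divergent zero-extend from i16 selects to
//   %dst = V_BFE_U32_e64 %src, 0, 16
// while a uniform sign-extend from i16 selects to
//   %dst = S_SEXT_I32_I16 %src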
2733
2734 static Register stripCopy(Register Reg, MachineRegisterInfo &MRI) {
2735 return getDefSrcRegIgnoringCopies(Reg, MRI)->Reg;
2736 }
2737
2738 static Register stripBitCast(Register Reg, MachineRegisterInfo &MRI) {
2739 Register BitcastSrc;
2740 if (mi_match(Reg, MRI, m_GBitcast(m_Reg(BitcastSrc))))
2741 Reg = BitcastSrc;
2742 return Reg;
2743 }
2744
2745 static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In,
2746 Register &Out) {
2747 Register Trunc;
2748 if (!mi_match(In, MRI, m_GTrunc(m_Reg(Trunc))))
2749 return false;
2750
2751 Register LShlSrc;
2752 Register Cst;
2753 if (mi_match(Trunc, MRI, m_GLShr(m_Reg(LShlSrc), m_Reg(Cst)))) {
2754 Cst = stripCopy(Cst, MRI);
2755 if (mi_match(Cst, MRI, m_SpecificICst(16))) {
2756 Out = stripBitCast(LShlSrc, MRI);
2757 return true;
2758 }
2759 }
2760
2761 MachineInstr *Shuffle = MRI.getVRegDef(Trunc);
2762 if (Shuffle->getOpcode() != AMDGPU::G_SHUFFLE_VECTOR)
2763 return false;
2764
2765 assert(MRI.getType(Shuffle->getOperand(0).getReg()) ==
2766 LLT::fixed_vector(2, 16));
2767
2768 ArrayRef<int> Mask = Shuffle->getOperand(3).getShuffleMask();
2769 assert(Mask.size() == 2);
2770
2771 if (Mask[0] == 1 && Mask[1] <= 1) {
2772 Out = Shuffle->getOperand(0).getReg();
2773 return true;
2774 }
2775
2776 return false;
2777 }
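// This matches patterns such as (G_TRUNC (G_LSHR %x, 16)), or a shuffle of
// a <2 x s16> vector that selects element 1, returning in Out the packed
// value whose high half is being extracted.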
2778
2779 bool AMDGPUInstructionSelector::selectG_FPEXT(MachineInstr &I) const {
2780 if (!Subtarget->hasSALUFloatInsts())
2781 return false;
2782
2783 Register Dst = I.getOperand(0).getReg();
2784 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2785 if (DstRB->getID() != AMDGPU::SGPRRegBankID)
2786 return false;
2787
2788 Register Src = I.getOperand(1).getReg();
2789
2790 if (MRI->getType(Dst) == LLT::scalar(32) &&
2791 MRI->getType(Src) == LLT::scalar(16)) {
2792 if (isExtractHiElt(*MRI, Src, Src)) {
2793 MachineBasicBlock *BB = I.getParent();
2794 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_CVT_HI_F32_F16), Dst)
2795 .addUse(Src);
2796 I.eraseFromParent();
2797 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
2798 }
2799 }
2800
2801 return false;
2802 }
2803
2804 bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
2805 // Only manually handle the f64 SGPR case.
2806 //
2807 // FIXME: This is a workaround for 2.5 different tablegen problems. Because
2808 // the bit ops theoretically have a second result due to the implicit def of
2809 // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
2810 // that is easy by disabling the check. The result works, but uses a
2811 // nonsensical sreg32orlds_and_sreg_1 regclass.
2812 //
2813 // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32 to
2814 // the variadic REG_SEQUENCE operands.
2815
2816 Register Dst = MI.getOperand(0).getReg();
2817 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2818 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2819 MRI->getType(Dst) != LLT::scalar(64))
2820 return false;
2821
2822 Register Src = MI.getOperand(1).getReg();
2823 MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
2824 if (Fabs)
2825 Src = Fabs->getOperand(1).getReg();
2826
2827 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2828 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2829 return false;
2830
2831 MachineBasicBlock *BB = MI.getParent();
2832 const DebugLoc &DL = MI.getDebugLoc();
2833 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2834 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2835 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2836 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2837
2838 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2839 .addReg(Src, 0, AMDGPU::sub0);
2840 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2841 .addReg(Src, 0, AMDGPU::sub1);
2842 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2843 .addImm(0x80000000);
2844
2845 // Set or toggle sign bit.
2846 unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
2847 BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
2848 .addReg(HiReg)
2849 .addReg(ConstReg)
2850 .setOperandDead(3); // Dead scc
2851 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2852 .addReg(LoReg)
2853 .addImm(AMDGPU::sub0)
2854 .addReg(OpReg)
2855 .addImm(AMDGPU::sub1);
2856 MI.eraseFromParent();
2857 return true;
2858 }
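// Sketch of the s64 SGPR sequence built above (illustrative register names):
//   %lo  = COPY %src.sub0
//   %hi  = COPY %src.sub1
//   %k   = S_MOV_B32 0x80000000
//   %op  = S_XOR_B32 %hi, %k   (S_OR_B32 when an fabs was folded in)
//   %dst = REG_SEQUENCE %lo, %subreg.sub0, %op, %subreg.sub1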
2859
2860 // FIXME: This is a workaround for the same tablegen problems as G_FNEG
2861 bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
2862 Register Dst = MI.getOperand(0).getReg();
2863 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2864 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2865 MRI->getType(Dst) != LLT::scalar(64))
2866 return false;
2867
2868 Register Src = MI.getOperand(1).getReg();
2869 MachineBasicBlock *BB = MI.getParent();
2870 const DebugLoc &DL = MI.getDebugLoc();
2871 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2872 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2873 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2874 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2875
2876 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2877 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2878 return false;
2879
2880 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2881 .addReg(Src, 0, AMDGPU::sub0);
2882 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2883 .addReg(Src, 0, AMDGPU::sub1);
2884 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2885 .addImm(0x7fffffff);
2886
2887 // Clear sign bit.
2888 // TODO: Should this use S_BITSET0_*?
2889 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
2890 .addReg(HiReg)
2891 .addReg(ConstReg)
2892 .setOperandDead(3); // Dead scc
2893 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2894 .addReg(LoReg)
2895 .addImm(AMDGPU::sub0)
2896 .addReg(OpReg)
2897 .addImm(AMDGPU::sub1);
2898
2899 MI.eraseFromParent();
2900 return true;
2901 }
2902
2903 static bool isConstant(const MachineInstr &MI) {
2904 return MI.getOpcode() == TargetOpcode::G_CONSTANT;
2905 }
2906
2907 void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
2908 const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
2909
2910 unsigned OpNo = Load.getOpcode() == AMDGPU::G_PREFETCH ? 0 : 1;
2911 const MachineInstr *PtrMI =
2912 MRI.getUniqueVRegDef(Load.getOperand(OpNo).getReg());
2913
2914 assert(PtrMI);
2915
2916 if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
2917 return;
2918
2919 GEPInfo GEPInfo;
2920
2921 for (unsigned i = 1; i != 3; ++i) {
2922 const MachineOperand &GEPOp = PtrMI->getOperand(i);
2923 const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
2924 assert(OpDef);
2925 if (i == 2 && isConstant(*OpDef)) {
2926 // TODO: Could handle constant base + variable offset, but a combine
2927 // probably should have commuted it.
2928 assert(GEPInfo.Imm == 0);
2929 GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
2930 continue;
2931 }
2932 const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
2933 if (OpBank->getID() == AMDGPU::SGPRRegBankID)
2934 GEPInfo.SgprParts.push_back(GEPOp.getReg());
2935 else
2936 GEPInfo.VgprParts.push_back(GEPOp.getReg());
2937 }
2938
2939 AddrInfo.push_back(GEPInfo);
2940 getAddrModeInfo(*PtrMI, MRI, AddrInfo);
2941 }
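// E.g. for a load whose pointer is %p = G_PTR_ADD %base, G_CONSTANT 16,
// this records a GEPInfo with Imm == 16 and %base in SgprParts or
// VgprParts, then recurses in case %base is itself a G_PTR_ADD.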
2942
2943 bool AMDGPUInstructionSelector::isSGPR(Register Reg) const {
2944 return RBI.getRegBank(Reg, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
2945 }
2946
2947 bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
2948 if (!MI.hasOneMemOperand())
2949 return false;
2950
2951 const MachineMemOperand *MMO = *MI.memoperands_begin();
2952 const Value *Ptr = MMO->getValue();
2953
2954 // UndefValue means this is a load of a kernel input. These are uniform.
2955 // Sometimes LDS instructions have constant pointers.
2956 // If Ptr is null, then that means this mem operand contains a
2957 // PseudoSourceValue like GOT.
2958 if (!Ptr || isa<UndefValue, Argument, Constant, GlobalValue>(Ptr))
2959 return true;
2960
2961 if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
2962 return true;
2963
2964 if (MI.getOpcode() == AMDGPU::G_PREFETCH)
2965 return RBI.getRegBank(MI.getOperand(0).getReg(), *MRI, TRI)->getID() ==
2966 AMDGPU::SGPRRegBankID;
2967
2968 const Instruction *I = dyn_cast<Instruction>(Ptr);
2969 return I && I->getMetadata("amdgpu.uniform");
2970 }
2971
2972 bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
2973 for (const GEPInfo &GEPInfo : AddrInfo) {
2974 if (!GEPInfo.VgprParts.empty())
2975 return true;
2976 }
2977 return false;
2978 }
2979
2980 void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
2981 const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
2982 unsigned AS = PtrTy.getAddressSpace();
2983 if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
2984 STI.ldsRequiresM0Init()) {
2985 MachineBasicBlock *BB = I.getParent();
2986
2987 // If DS instructions require M0 initialization, insert it before selecting.
2988 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2989 .addImm(-1);
2990 }
2991 }
2992
2993 bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
2994 MachineInstr &I) const {
2995 initM0(I);
2996 return selectImpl(I, *CoverageInfo);
2997 }
2998
2999 static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI) {
3000 if (Reg.isPhysical())
3001 return false;
3002
3003 MachineInstr &MI = *MRI.getUniqueVRegDef(Reg);
3004 const unsigned Opcode = MI.getOpcode();
3005
3006 if (Opcode == AMDGPU::COPY)
3007 return isVCmpResult(MI.getOperand(1).getReg(), MRI);
3008
3009 if (Opcode == AMDGPU::G_AND || Opcode == AMDGPU::G_OR ||
3010 Opcode == AMDGPU::G_XOR)
3011 return isVCmpResult(MI.getOperand(1).getReg(), MRI) &&
3012 isVCmpResult(MI.getOperand(2).getReg(), MRI);
3013
3014 if (auto *GI = dyn_cast<GIntrinsic>(&MI))
3015 return GI->is(Intrinsic::amdgcn_class);
3016
3017 return Opcode == AMDGPU::G_ICMP || Opcode == AMDGPU::G_FCMP;
3018 }
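// E.g. the AND of two compare results is itself a well-formed lane mask
// (V_CMP writes zeros to all inactive lanes), so selectG_BRCOND below can
// skip the AND with exec for it.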
3019
3020 bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
3021 MachineBasicBlock *BB = I.getParent();
3022 MachineOperand &CondOp = I.getOperand(0);
3023 Register CondReg = CondOp.getReg();
3024 const DebugLoc &DL = I.getDebugLoc();
3025
3026 unsigned BrOpcode;
3027 Register CondPhysReg;
3028 const TargetRegisterClass *ConstrainRC;
3029
3030 // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
3031 // whether the branch is uniform when selecting the instruction. In
3032 // GlobalISel, we should push that decision into RegBankSelect. Assume for now
3033 // RegBankSelect knows what it's doing if the branch condition is scc, even
3034 // though it currently does not.
3035 if (!isVCC(CondReg, *MRI)) {
3036 if (MRI->getType(CondReg) != LLT::scalar(32))
3037 return false;
3038
3039 CondPhysReg = AMDGPU::SCC;
3040 BrOpcode = AMDGPU::S_CBRANCH_SCC1;
3041 ConstrainRC = &AMDGPU::SReg_32RegClass;
3042 } else {
    // FIXME: Should scc->vcc copies AND with exec?

    // Unless the value of CondReg is the result of a V_CMP* instruction, we
    // need to insert an AND with exec.
    if (!isVCmpResult(CondReg, *MRI)) {
      const bool Is64 = STI.isWave64();
      const unsigned Opcode = Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
      const Register Exec = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;

      Register TmpReg = MRI->createVirtualRegister(TRI.getBoolRC());
      BuildMI(*BB, &I, DL, TII.get(Opcode), TmpReg)
          .addReg(CondReg)
          .addReg(Exec)
          .setOperandDead(3); // Dead scc
      CondReg = TmpReg;
    }

    CondPhysReg = TRI.getVCC();
    BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
    ConstrainRC = TRI.getBoolRC();
  }

  if (!MRI->getRegClassOrNull(CondReg))
    MRI->setRegClass(CondReg, ConstrainRC);

  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
      .addReg(CondReg);
  BuildMI(*BB, &I, DL, TII.get(BrOpcode))
      .addMBB(I.getOperand(1).getMBB());

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(MachineInstr &I) const {
  Register DstReg = I.getOperand(0).getReg();
  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
  I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
  if (IsVGPR)
    I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));

  return RBI.constrainGenericRegister(
      DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
}

bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();
  Register MaskReg = I.getOperand(2).getReg();
  LLT Ty = MRI->getType(DstReg);
  LLT MaskTy = MRI->getType(MaskReg);
  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();

  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
  const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
  const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
  if (DstRB != SrcRB) // Should only happen for hand written MIR.
    return false;

  // Try to avoid emitting a bit operation when we only need to touch half of
  // the 64-bit pointer.
  APInt MaskOnes = VT->getKnownOnes(MaskReg).zext(64);
  const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
  const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);

  const bool CanCopyLow32 = (MaskOnes & MaskLo32) == MaskLo32;
  const bool CanCopyHi32 = (MaskOnes & MaskHi32) == MaskHi32;
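  // For example, a mask known to be 0xffffffff00000000 has all ones in the
  // high half, so the high dword of the pointer can simply be copied through
  // and only the low dword needs an AND.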

  if (!IsVGPR && Ty.getSizeInBits() == 64 && !CanCopyLow32 && !CanCopyHi32) {
    auto MIB = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_AND_B64), DstReg)
                   .addReg(SrcReg)
                   .addReg(MaskReg)
                   .setOperandDead(3); // Dead scc
    I.eraseFromParent();
    return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
  }

  unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
  const TargetRegisterClass &RegRC =
      IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;

  const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB);
  const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB);
  const TargetRegisterClass *MaskRC =
      TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB);

  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
      !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
      !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
    return false;

  if (Ty.getSizeInBits() == 32) {
    assert(MaskTy.getSizeInBits() == 32 &&
           "ptrmask should have been narrowed during legalize");

    auto NewOp = BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
                     .addReg(SrcReg)
                     .addReg(MaskReg);

    if (!IsVGPR)
      NewOp.setOperandDead(3); // Dead scc
    I.eraseFromParent();
    return true;
  }

  Register HiReg = MRI->createVirtualRegister(&RegRC);
  Register LoReg = MRI->createVirtualRegister(&RegRC);

  // Extract the subregisters from the source pointer.
  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
      .addReg(SrcReg, 0, AMDGPU::sub0);
  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
      .addReg(SrcReg, 0, AMDGPU::sub1);

  Register MaskedLo, MaskedHi;

  if (CanCopyLow32) {
    // If all the bits in the low half are 1, we only need a copy for it.
    MaskedLo = LoReg;
  } else {
    // Extract the mask subregister and apply the and.
    Register MaskLo = MRI->createVirtualRegister(&RegRC);
    MaskedLo = MRI->createVirtualRegister(&RegRC);

    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
        .addReg(MaskReg, 0, AMDGPU::sub0);
    BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
        .addReg(LoReg)
        .addReg(MaskLo);
  }

  if (CanCopyHi32) {
    // If all the bits in the high half are 1, we only need a copy for it.
    MaskedHi = HiReg;
  } else {
    Register MaskHi = MRI->createVirtualRegister(&RegRC);
    MaskedHi = MRI->createVirtualRegister(&RegRC);

    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
        .addReg(MaskReg, 0, AMDGPU::sub1);
    BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
        .addReg(HiReg)
        .addReg(MaskHi);
  }

  BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
      .addReg(MaskedLo)
      .addImm(AMDGPU::sub0)
      .addReg(MaskedHi)
      .addImm(AMDGPU::sub1);
  I.eraseFromParent();
  return true;
}

/// Return the register to use for the index value, and the subregister to use
/// for the indirectly accessed register.
static std::pair<Register, unsigned>
computeIndirectRegIndex(MachineRegisterInfo &MRI, const SIRegisterInfo &TRI,
                        const TargetRegisterClass *SuperRC, Register IdxReg,
                        unsigned EltSize, GISelValueTracking &ValueTracking) {
  Register IdxBaseReg;
  int Offset;

  std::tie(IdxBaseReg, Offset) =
      AMDGPU::getBaseWithConstantOffset(MRI, IdxReg, &ValueTracking);
  if (IdxBaseReg == AMDGPU::NoRegister) {
    // This will happen if the index is a known constant. This should
    // ordinarily be legalized out, but handle it as a register just in case.
    assert(Offset == 0);
    IdxBaseReg = IdxReg;
  }

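  // E.g. for a 128-bit super register class with EltSize = 4, SubRegs is
  // {sub0, sub1, sub2, sub3}; a constant offset folds into the subregister
  // choice, so base + 2 is addressed as SubRegs[2] relative to the base.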
  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);

  // Skip out of bounds offsets, or else we would end up using an undefined
  // register.
  if (static_cast<unsigned>(Offset) >= SubRegs.size())
    return std::pair(IdxReg, SubRegs[0]);
  return std::pair(IdxBaseReg, SubRegs[Offset]);
}

bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
    MachineInstr &MI) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  Register IdxReg = MI.getOperand(2).getReg();

  LLT DstTy = MRI->getType(DstReg);
  LLT SrcTy = MRI->getType(SrcReg);

  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
  const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);

  // The index must be scalar. If it wasn't, RegBankSelect should have moved
  // this into a waterfall loop.
  if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
    return false;

  const TargetRegisterClass *SrcRC =
      TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB);
  const TargetRegisterClass *DstRC =
      TRI.getRegClassForTypeOnBank(DstTy, *DstRB);
  if (!SrcRC || !DstRC)
    return false;
  if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
      !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
      !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  MachineBasicBlock *BB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  const bool Is64 = DstTy.getSizeInBits() == 64;

  unsigned SubReg;
  std::tie(IdxReg, SubReg) = computeIndirectRegIndex(
      *MRI, TRI, SrcRC, IdxReg, DstTy.getSizeInBits() / 8, *VT);

  if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
    if (DstTy.getSizeInBits() != 32 && !Is64)
      return false;

    BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
        .addReg(IdxReg);

    unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
    BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
        .addReg(SrcReg, 0, SubReg)
        .addReg(SrcReg, RegState::Implicit);
    MI.eraseFromParent();
    return true;
  }

  if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
    return false;

  if (!STI.useVGPRIndexMode()) {
    BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
        .addReg(IdxReg);
    BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
        .addReg(SrcReg, 0, SubReg)
        .addReg(SrcReg, RegState::Implicit);
    MI.eraseFromParent();
    return true;
  }

  const MCInstrDesc &GPRIDXDesc =
      TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*SrcRC), true);
  BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
      .addReg(SrcReg)
      .addReg(IdxReg)
      .addImm(SubReg);

  MI.eraseFromParent();
  return true;
}

// TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
    MachineInstr &MI) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register VecReg = MI.getOperand(1).getReg();
  Register ValReg = MI.getOperand(2).getReg();
  Register IdxReg = MI.getOperand(3).getReg();

  LLT VecTy = MRI->getType(DstReg);
  LLT ValTy = MRI->getType(ValReg);
  unsigned VecSize = VecTy.getSizeInBits();
  unsigned ValSize = ValTy.getSizeInBits();

  const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
  const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
  const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);

  assert(VecTy.getElementType() == ValTy);

  // The index must be scalar. If it wasn't, RegBankSelect should have moved
  // this into a waterfall loop.
  if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
    return false;

  const TargetRegisterClass *VecRC =
      TRI.getRegClassForTypeOnBank(VecTy, *VecRB);
  const TargetRegisterClass *ValRC =
      TRI.getRegClassForTypeOnBank(ValTy, *ValRB);

  if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
      !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
      !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
      !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
    return false;

  unsigned SubReg;
  std::tie(IdxReg, SubReg) =
      computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg, ValSize / 8, *VT);

  const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
                         STI.useVGPRIndexMode();

  MachineBasicBlock *BB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  if (!IndexMode) {
    BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
        .addReg(IdxReg);

    const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo(
        VecSize, ValSize, VecRB->getID() == AMDGPU::SGPRRegBankID);
    BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
        .addReg(VecReg)
        .addReg(ValReg)
        .addImm(SubReg);
    MI.eraseFromParent();
    return true;
  }

  const MCInstrDesc &GPRIDXDesc =
      TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
  BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
      .addReg(VecReg)
      .addReg(ValReg)
      .addReg(IdxReg)
      .addImm(SubReg);

  MI.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
  if (!Subtarget->hasVMemToLDSLoad())
    return false;
  unsigned Opc;
  unsigned Size = MI.getOperand(3).getImm();

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == 9;
  Register VIndex;
  int OpOffset = 0;
  if (HasVIndex) {
    VIndex = MI.getOperand(4).getReg();
    OpOffset = 1;
  }

  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
  std::optional<ValueAndVReg> MaybeVOffset =
      getIConstantVRegValWithLookThrough(VOffset, *MRI);
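  // Treat the voffset as present unless it is a known-zero constant; a zero
  // voffset selects the OFFSET/IDXEN forms instead of OFFEN/BOTHEN.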
  const bool HasVOffset = !MaybeVOffset || MaybeVOffset->Value.getZExtValue();

  switch (Size) {
  default:
    return false;
  case 1:
    Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
                                 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
                    : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
                                 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
    break;
  case 2:
    Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
                                 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
                    : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
                                 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
    break;
  case 4:
    Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
                                 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
                    : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
                                 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
    break;
  case 12:
    if (!Subtarget->hasLDSLoadB96_B128())
      return false;

    Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
                                 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
                    : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
                                 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
    break;
  case 16:
    if (!Subtarget->hasLDSLoadB96_B128())
      return false;

    Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
                                 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
                    : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
                                 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
    break;
  }

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
      .add(MI.getOperand(2));

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc));

  if (HasVIndex && HasVOffset) {
    Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class());
    BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
        .addReg(VIndex)
        .addImm(AMDGPU::sub0)
        .addReg(VOffset)
        .addImm(AMDGPU::sub1);

    MIB.addReg(IdxReg);
  } else if (HasVIndex) {
    MIB.addReg(VIndex);
  } else if (HasVOffset) {
    MIB.addReg(VOffset);
  }

  MIB.add(MI.getOperand(1));            // rsrc
  MIB.add(MI.getOperand(5 + OpOffset)); // soffset
  MIB.add(MI.getOperand(6 + OpOffset)); // imm offset
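  // The aux operand packs the cache policy and swizzle bits; keep only the
  // bits that are meaningful for this generation's encoding.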
  bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
  unsigned Aux = MI.getOperand(7 + OpOffset).getImm();
  MIB.addImm(Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL
                                : AMDGPU::CPol::ALL_pregfx12)); // cpol
  MIB.addImm(
      Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
          ? 1
          : 0); // swz

  MachineMemOperand *LoadMMO = *MI.memoperands_begin();
  MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
  LoadPtrI.Offset = MI.getOperand(6 + OpOffset).getImm();
  MachinePointerInfo StorePtrI = LoadPtrI;
  StorePtrI.V = nullptr;
  StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;

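  // The instruction both loads from the buffer and stores to LDS, so it needs
  // separate load and store memory operands.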
  auto F = LoadMMO->getFlags() &
           ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
  LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
                                     Size, LoadMMO->getBaseAlign());

  MachineMemOperand *StoreMMO =
      MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
                               sizeof(int32_t), LoadMMO->getBaseAlign());

  MIB.setMemRefs({LoadMMO, StoreMMO});

  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

/// Match a zero extend from a 32-bit value to 64-bits.
static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
  Register ZExtSrc;
  if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc))))
    return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();

  // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
  const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
  if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
    return Register();

  assert(Def->getNumOperands() == 3 &&
         MRI.getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
  if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) {
    return Def->getOperand(1).getReg();
  }

  return Register();
}

bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const {
  if (!Subtarget->hasVMemToLDSLoad())
    return false;

  unsigned Opc;
  unsigned Size = MI.getOperand(3).getImm();

  switch (Size) {
  default:
    return false;
  case 1:
    Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
    break;
  case 2:
    Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
    break;
  case 4:
    Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
    break;
  case 12:
    if (!Subtarget->hasLDSLoadB96_B128())
      return false;
    Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
    break;
  case 16:
    if (!Subtarget->hasLDSLoadB96_B128())
      return false;
    Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
    break;
  }

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
      .add(MI.getOperand(2));

  Register Addr = MI.getOperand(1).getReg();
  Register VOffset;
  // Try to split SAddr and VOffset. Global and LDS pointers share the same
  // immediate offset, so we cannot use a regular SelectGlobalSAddr().
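  // E.g. (G_PTR_ADD (SGPR base), (G_ZEXT (s32 VGPR off))) splits into a
  // scalar saddr plus a 32-bit voffset below.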
  if (!isSGPR(Addr)) {
    auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
    if (isSGPR(AddrDef->Reg)) {
      Addr = AddrDef->Reg;
    } else if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
      Register SAddr =
          getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
      if (isSGPR(SAddr)) {
        Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
        if (Register Off = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
          Addr = SAddr;
          VOffset = Off;
        }
      }
    }
  }

  if (isSGPR(Addr)) {
    Opc = AMDGPU::getGlobalSaddrOp(Opc);
    if (!VOffset) {
      VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
      BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
          .addImm(0);
    }
  }

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
                 .addReg(Addr);

  if (isSGPR(Addr))
    MIB.addReg(VOffset);

  MIB.add(MI.getOperand(4))  // offset
     .add(MI.getOperand(5)); // cpol

  MachineMemOperand *LoadMMO = *MI.memoperands_begin();
  MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
  LoadPtrI.Offset = MI.getOperand(4).getImm();
  MachinePointerInfo StorePtrI = LoadPtrI;
  LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
  StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
  auto F = LoadMMO->getFlags() &
           ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
  LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
                                     Size, LoadMMO->getBaseAlign());
  MachineMemOperand *StoreMMO =
      MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
                               sizeof(int32_t), Align(4));

  MIB.setMemRefs({LoadMMO, StoreMMO});

  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectBVHIntersectRayIntrinsic(
    MachineInstr &MI) const {
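  // The concrete MC opcode was chosen earlier and stashed as an immediate
  // operand; install it as the instruction's descriptor and drop the operand.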
  unsigned OpcodeOpIdx =
      MI.getOpcode() == AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY ? 1 : 3;
  MI.setDesc(TII.get(MI.getOperand(OpcodeOpIdx).getImm()));
  MI.removeOperand(OpcodeOpIdx);
  MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
  return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
}

// FIXME: This should be removed in favor of letting the patterns select. We
// just need the AGPR/VGPR combination versions.
bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
  unsigned Opc;
  switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
  case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
    Opc = AMDGPU::V_SMFMAC_F32_16X16X32_F16_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
    Opc = AMDGPU::V_SMFMAC_F32_32X32X16_F16_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
    Opc = AMDGPU::V_SMFMAC_F32_16X16X32_BF16_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
    Opc = AMDGPU::V_SMFMAC_F32_32X32X16_BF16_e64;
    break;
  case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
    Opc = AMDGPU::V_SMFMAC_I32_16X16X64_I8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
    Opc = AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
    Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_BF8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
    Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_FP8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
    Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_BF8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
    Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_FP8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
    Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_BF8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
    Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_FP8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
    Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_BF8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
    Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_FP8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
    Opc = AMDGPU::V_SMFMAC_F32_16X16X64_F16_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
    Opc = AMDGPU::V_SMFMAC_F32_32X32X32_F16_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
    Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF16_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
    Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF16_e64;
    break;
  case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
    Opc = AMDGPU::V_SMFMAC_I32_16X16X128_I8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
    Opc = AMDGPU::V_SMFMAC_I32_32X32X64_I8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
    Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_BF8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
    Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_FP8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
    Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_BF8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
    Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_FP8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
    Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_BF8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
    Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_FP8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
    Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_BF8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
    Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_FP8_e64;
    break;
  default:
    llvm_unreachable("unhandled smfmac intrinsic");
  }

  auto VDst_In = MI.getOperand(4);

  MI.setDesc(TII.get(Opc));
  MI.removeOperand(4); // VDst_In
  MI.removeOperand(1); // Intrinsic ID
  MI.addOperand(VDst_In); // Re-add VDst_In to the end
  MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
  return true;
}

bool AMDGPUInstructionSelector::selectPermlaneSwapIntrin(
    MachineInstr &MI, Intrinsic::ID IntrID) const {
  if (IntrID == Intrinsic::amdgcn_permlane16_swap &&
      !Subtarget->hasPermlane16Swap())
    return false;
  if (IntrID == Intrinsic::amdgcn_permlane32_swap &&
      !Subtarget->hasPermlane32Swap())
    return false;

  unsigned Opcode = IntrID == Intrinsic::amdgcn_permlane16_swap
                        ? AMDGPU::V_PERMLANE16_SWAP_B32_e64
                        : AMDGPU::V_PERMLANE32_SWAP_B32_e64;

  MI.removeOperand(2);
  MI.setDesc(TII.get(Opcode));
  MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));

  MachineOperand &FI = MI.getOperand(4);
  FI.setImm(FI.getImm() ? AMDGPU::DPP::DPP_FI_1 : AMDGPU::DPP::DPP_FI_0);

  return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

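  // The wave address is the scratch offset divided by the wavefront size,
  // hence the right shift by log2(wavesize).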
  if (IsVALU) {
    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
        .addImm(Subtarget->getWavefrontSizeLog2())
        .addReg(SrcReg);
  } else {
    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
        .addReg(SrcReg)
        .addImm(Subtarget->getWavefrontSizeLog2())
        .setOperandDead(3); // Dead scc
  }

  const TargetRegisterClass &RC =
      IsVALU ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
  if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
    return false;

  MI.eraseFromParent();
  return true;
}

// Match a BITOP3 operation and return the number of matched instructions plus
// the truth table.
static std::pair<unsigned, uint8_t> BitOp3_Op(Register R,
                                              SmallVectorImpl<Register> &Src,
                                              const MachineRegisterInfo &MRI) {
  unsigned NumOpcodes = 0;
  uint8_t LHSBits, RHSBits;

  auto getOperandBits = [&Src, R, &MRI](Register Op, uint8_t &Bits) -> bool {
    // Define truth table given Src0, Src1, Src2 bits permutations:
    // 0 0 0
    // 0 0 1
    // 0 1 0
    // 0 1 1
    // 1 0 0
    // 1 0 1
    // 1 1 0
    // 1 1 1
    const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
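    // With these leaf encodings the truth table of a whole expression follows
    // from plain bitwise ops on the bytes; e.g. f(a, b, c) = (a & b) | c gives
    // TTbl = (0xf0 & 0xcc) | 0xaa = 0xea.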

    if (mi_match(Op, MRI, m_AllOnesInt())) {
      Bits = 0xff;
      return true;
    }
    if (mi_match(Op, MRI, m_ZeroInt())) {
      Bits = 0;
      return true;
    }

    for (unsigned I = 0; I < Src.size(); ++I) {
      // Try to find existing reused operand
      if (Src[I] == Op) {
        Bits = SrcBits[I];
        return true;
      }
      // Try to replace parent operator
      if (Src[I] == R) {
        Bits = SrcBits[I];
        Src[I] = Op;
        return true;
      }
    }

    if (Src.size() == 3) {
      // No room left for operands. Try one last time, there can be a 'not' of
      // one of our source operands. In this case we can compute the bits
      // without growing Src vector.
      Register LHS;
      if (mi_match(Op, MRI, m_Not(m_Reg(LHS)))) {
        LHS = getSrcRegIgnoringCopies(LHS, MRI);
        for (unsigned I = 0; I < Src.size(); ++I) {
          if (Src[I] == LHS) {
            Bits = ~SrcBits[I];
            return true;
          }
        }
      }

      return false;
    }

    Bits = SrcBits[Src.size()];
    Src.push_back(Op);
    return true;
  };

  MachineInstr *MI = MRI.getVRegDef(R);
  switch (MI->getOpcode()) {
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR: {
    Register LHS = getSrcRegIgnoringCopies(MI->getOperand(1).getReg(), MRI);
    Register RHS = getSrcRegIgnoringCopies(MI->getOperand(2).getReg(), MRI);

    SmallVector<Register, 3> Backup(Src.begin(), Src.end());
    if (!getOperandBits(LHS, LHSBits) ||
        !getOperandBits(RHS, RHSBits)) {
      Src = Backup;
      return std::make_pair(0, 0);
    }

    // Recursion is naturally limited by the size of the operand vector.
    auto Op = BitOp3_Op(LHS, Src, MRI);
    if (Op.first) {
      NumOpcodes += Op.first;
      LHSBits = Op.second;
    }

    Op = BitOp3_Op(RHS, Src, MRI);
    if (Op.first) {
      NumOpcodes += Op.first;
      RHSBits = Op.second;
    }
    break;
  }
  default:
    return std::make_pair(0, 0);
  }

  uint8_t TTbl;
  switch (MI->getOpcode()) {
  case TargetOpcode::G_AND:
    TTbl = LHSBits & RHSBits;
    break;
  case TargetOpcode::G_OR:
    TTbl = LHSBits | RHSBits;
    break;
  case TargetOpcode::G_XOR:
    TTbl = LHSBits ^ RHSBits;
    break;
  default:
    break;
  }

  return std::make_pair(NumOpcodes + 1, TTbl);
}

bool AMDGPUInstructionSelector::selectBITOP3(MachineInstr &MI) const {
  if (!Subtarget->hasBitOp3Insts())
    return false;

  Register DstReg = MI.getOperand(0).getReg();
  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
  if (!IsVALU)
    return false;

  SmallVector<Register, 3> Src;
  uint8_t TTbl;
  unsigned NumOpcodes;

  std::tie(NumOpcodes, TTbl) = BitOp3_Op(DstReg, Src, *MRI);

  // The Src.empty() case can happen if all operands are constant zeros or
  // ones. Normally that is optimized out before reaching this point.
  if (NumOpcodes < 2 || Src.empty())
    return false;

  const bool IsB32 = MRI->getType(DstReg) == LLT::scalar(32);
  if (NumOpcodes == 2 && IsB32) {
    // Avoid using BITOP3 for OR3, XOR3, AND_OR. This is not faster but makes
    // the asm more readable. This cannot be modeled with AddedComplexity
    // because the selector does not know how many operations we matched.
    if (mi_match(MI, *MRI, m_GXor(m_GXor(m_Reg(), m_Reg()), m_Reg())) ||
        mi_match(MI, *MRI, m_GOr(m_GOr(m_Reg(), m_Reg()), m_Reg())) ||
        mi_match(MI, *MRI, m_GOr(m_GAnd(m_Reg(), m_Reg()), m_Reg())))
      return false;
  } else if (NumOpcodes < 4) {
    // For the uniform case the threshold should be higher to account for moves
    // between VGPRs and SGPRs. It needs one operand in a VGPR; the remaining
    // two can be in SGPRs, with a readfirstlane after.
    return false;
  }

  unsigned Opc = IsB32 ? AMDGPU::V_BITOP3_B32_e64 : AMDGPU::V_BITOP3_B16_e64;
  unsigned CBL = STI.getConstantBusLimit(Opc);
  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  for (unsigned I = 0; I < Src.size(); ++I) {
    const RegisterBank *RB = RBI.getRegBank(Src[I], *MRI, TRI);
    if (RB->getID() != AMDGPU::SGPRRegBankID)
      continue;
    if (CBL > 0) {
      --CBL;
      continue;
    }
    Register NewReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), NewReg)
        .addReg(Src[I]);
    Src[I] = NewReg;
  }

  // The last operand can be ignored, turning a ternary operation into a
  // binary one. For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can
  // replace 'c' with 'a' here without changing the answer. In some
  // pathological cases it should even be possible to get an operation with a
  // single operand if the optimizer does not catch it.
  while (Src.size() < 3)
    Src.push_back(Src[0]);

  auto MIB = BuildMI(*MBB, MI, DL, TII.get(Opc), DstReg);
  if (!IsB32)
    MIB.addImm(0); // src_mod0
  MIB.addReg(Src[0]);
  if (!IsB32)
    MIB.addImm(0); // src_mod1
  MIB.addReg(Src[1]);
  if (!IsB32)
    MIB.addImm(0); // src_mod2
  MIB.addReg(Src[2])
     .addImm(TTbl);
  if (!IsB32)
    MIB.addImm(0); // op_sel

  constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
  MI.eraseFromParent();

  return true;
}

bool AMDGPUInstructionSelector::selectStackRestore(MachineInstr &MI) const {
  Register SrcReg = MI.getOperand(0).getReg();
  if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
  Register SP =
      Subtarget->getTargetLowering()->getStackPointerRegisterToSaveRestore();
  Register WaveAddr = getWaveAddress(DefMI);
  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  if (!WaveAddr) {
    WaveAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), WaveAddr)
        .addReg(SrcReg)
        .addImm(Subtarget->getWavefrontSizeLog2())
        .setOperandDead(3); // Dead scc
  }

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), SP)
      .addReg(WaveAddr);

  MI.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::select(MachineInstr &I) {
  if (!I.isPreISelOpcode()) {
    if (I.isCopy())
      return selectCOPY(I);
    return true;
  }

  switch (I.getOpcode()) {
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR:
    if (selectBITOP3(I))
      return true;
    if (selectImpl(I, *CoverageInfo))
      return true;
    return selectG_AND_OR_XOR(I);
  case TargetOpcode::G_ADD:
  case TargetOpcode::G_SUB:
  case TargetOpcode::G_PTR_ADD:
    if (selectImpl(I, *CoverageInfo))
      return true;
    return selectG_ADD_SUB(I);
  case TargetOpcode::G_UADDO:
  case TargetOpcode::G_USUBO:
  case TargetOpcode::G_UADDE:
  case TargetOpcode::G_USUBE:
    return selectG_UADDO_USUBO_UADDE_USUBE(I);
  case AMDGPU::G_AMDGPU_MAD_U64_U32:
  case AMDGPU::G_AMDGPU_MAD_I64_I32:
    return selectG_AMDGPU_MAD_64_32(I);
  case TargetOpcode::G_INTTOPTR:
  case TargetOpcode::G_BITCAST:
  case TargetOpcode::G_PTRTOINT:
  case TargetOpcode::G_FREEZE:
    return selectCOPY(I);
  case TargetOpcode::G_FNEG:
    if (selectImpl(I, *CoverageInfo))
      return true;
    return selectG_FNEG(I);
  case TargetOpcode::G_FABS:
    if (selectImpl(I, *CoverageInfo))
      return true;
    return selectG_FABS(I);
  case TargetOpcode::G_EXTRACT:
    return selectG_EXTRACT(I);
  case TargetOpcode::G_MERGE_VALUES:
  case TargetOpcode::G_CONCAT_VECTORS:
    return selectG_MERGE_VALUES(I);
  case TargetOpcode::G_UNMERGE_VALUES:
    return selectG_UNMERGE_VALUES(I);
  case TargetOpcode::G_BUILD_VECTOR:
  case TargetOpcode::G_BUILD_VECTOR_TRUNC:
    return selectG_BUILD_VECTOR(I);
  case TargetOpcode::G_IMPLICIT_DEF:
    return selectG_IMPLICIT_DEF(I);
  case TargetOpcode::G_INSERT:
    return selectG_INSERT(I);
  case TargetOpcode::G_INTRINSIC:
  case TargetOpcode::G_INTRINSIC_CONVERGENT:
    return selectG_INTRINSIC(I);
  case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
  case TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
    return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
  case TargetOpcode::G_ICMP:
  case TargetOpcode::G_FCMP:
    if (selectG_ICMP_or_FCMP(I))
      return true;
    return selectImpl(I, *CoverageInfo);
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_ZEXTLOAD:
  case TargetOpcode::G_SEXTLOAD:
  case TargetOpcode::G_STORE:
  case TargetOpcode::G_ATOMIC_CMPXCHG:
  case TargetOpcode::G_ATOMICRMW_XCHG:
  case TargetOpcode::G_ATOMICRMW_ADD:
  case TargetOpcode::G_ATOMICRMW_SUB:
  case TargetOpcode::G_ATOMICRMW_AND:
  case TargetOpcode::G_ATOMICRMW_OR:
  case TargetOpcode::G_ATOMICRMW_XOR:
  case TargetOpcode::G_ATOMICRMW_MIN:
  case TargetOpcode::G_ATOMICRMW_MAX:
  case TargetOpcode::G_ATOMICRMW_UMIN:
  case TargetOpcode::G_ATOMICRMW_UMAX:
  case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
  case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
  case TargetOpcode::G_ATOMICRMW_FADD:
  case TargetOpcode::G_ATOMICRMW_FMIN:
  case TargetOpcode::G_ATOMICRMW_FMAX:
    return selectG_LOAD_STORE_ATOMICRMW(I);
  case TargetOpcode::G_SELECT:
    return selectG_SELECT(I);
  case TargetOpcode::G_TRUNC:
    return selectG_TRUNC(I);
  case TargetOpcode::G_SEXT:
  case TargetOpcode::G_ZEXT:
  case TargetOpcode::G_ANYEXT:
  case TargetOpcode::G_SEXT_INREG:
    // This is a workaround. For extensions from type i1, `selectImpl()` uses
    // patterns from the TD file and generates an illegal VGPR-to-SGPR COPY,
    // since type i1 can only be held in an SGPR class.
    if (MRI->getType(I.getOperand(1).getReg()) != LLT::scalar(1) &&
        selectImpl(I, *CoverageInfo))
      return true;
    return selectG_SZA_EXT(I);
  case TargetOpcode::G_FPEXT:
    if (selectG_FPEXT(I))
      return true;
    return selectImpl(I, *CoverageInfo);
  case TargetOpcode::G_BRCOND:
    return selectG_BRCOND(I);
  case TargetOpcode::G_GLOBAL_VALUE:
    return selectG_GLOBAL_VALUE(I);
  case TargetOpcode::G_PTRMASK:
    return selectG_PTRMASK(I);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return selectG_EXTRACT_VECTOR_ELT(I);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return selectG_INSERT_VECTOR_ELT(I);
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
    const AMDGPU::ImageDimIntrinsicInfo *Intr =
        AMDGPU::getImageDimIntrinsicInfo(AMDGPU::getIntrinsicID(I));
    assert(Intr && "not an image intrinsic with image pseudo");
    return selectImageIntrinsic(I, Intr);
  }
  case AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY:
  case AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY:
  case AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY:
    return selectBVHIntersectRayIntrinsic(I);
  case AMDGPU::G_SBFX:
  case AMDGPU::G_UBFX:
    return selectG_SBFX_UBFX(I);
  case AMDGPU::G_SI_CALL:
    I.setDesc(TII.get(AMDGPU::SI_CALL));
    return true;
  case AMDGPU::G_AMDGPU_WAVE_ADDRESS:
    return selectWaveAddress(I);
  case AMDGPU::G_STACKRESTORE:
    return selectStackRestore(I);
  case AMDGPU::G_PHI:
    return selectPHI(I);
  case AMDGPU::G_AMDGPU_COPY_SCC_VCC:
    return selectCOPY_SCC_VCC(I);
  case AMDGPU::G_AMDGPU_COPY_VCC_SCC:
    return selectCOPY_VCC_SCC(I);
  case AMDGPU::G_AMDGPU_READANYLANE:
    return selectReadAnyLane(I);
  case TargetOpcode::G_CONSTANT:
  case TargetOpcode::G_FCONSTANT:
  default:
    return selectImpl(I, *CoverageInfo);
  }
  return false;
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
  }};
}

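// Look through G_FNEG/G_FABS (and an fsub-from-zero when canonicalizing)
// feeding Src and fold them into SISrcMods; e.g. (fneg (fabs x)) selects x
// with the NEG and ABS bits set.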
std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl(
    Register Src, bool IsCanonicalizing, bool AllowAbs, bool OpSel) const {
  unsigned Mods = 0;
  MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);

  if (MI->getOpcode() == AMDGPU::G_FNEG) {
    Src = MI->getOperand(1).getReg();
    Mods |= SISrcMods::NEG;
    MI = getDefIgnoringCopies(Src, *MRI);
  } else if (MI->getOpcode() == AMDGPU::G_FSUB && IsCanonicalizing) {
    // Fold fsub [+-]0 into fneg. This may not have folded depending on the
    // denormal mode, but we're implicitly canonicalizing in a source operand.
    const ConstantFP *LHS =
        getConstantFPVRegVal(MI->getOperand(1).getReg(), *MRI);
    if (LHS && LHS->isZero()) {
      Mods |= SISrcMods::NEG;
      Src = MI->getOperand(2).getReg();
    }
  }

  if (AllowAbs && MI->getOpcode() == AMDGPU::G_FABS) {
    Src = MI->getOperand(1).getReg();
    Mods |= SISrcMods::ABS;
  }

  if (OpSel)
    Mods |= SISrcMods::OP_SEL_0;

  return std::pair(Src, Mods);
}

Register AMDGPUInstructionSelector::copyToVGPRIfSrcFolded(
    Register Src, unsigned Mods, MachineOperand Root, MachineInstr *InsertPt,
    bool ForceVGPR) const {
  if ((Mods != 0 || ForceVGPR) &&
      RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {

    // If we looked through copies to find source modifiers on an SGPR operand,
    // we now have an SGPR register source. To avoid potentially violating the
    // constant bus restriction, we need to insert a copy to a VGPR.
    Register VGPRSrc = MRI->cloneVirtualRegister(Root.getReg());
    BuildMI(*InsertPt->getParent(), InsertPt, InsertPt->getDebugLoc(),
            TII.get(AMDGPU::COPY), VGPRSrc)
        .addReg(Src);
    Src = VGPRSrc;
  }

  return Src;
}

///
/// This will select either an SGPR or VGPR operand and will save us from
/// having to write an extra tablegen pattern.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
  Register Src;
  unsigned Mods;
  std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());

  return {{
      [=](MachineInstrBuilder &MIB) {
        MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
      },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },    // clamp
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }     // omod
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
  Register Src;
  unsigned Mods;
  std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
                                           /*IsCanonicalizing=*/true,
                                           /*AllowAbs=*/false);

  return {{
      [=](MachineInstrBuilder &MIB) {
        MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
      },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },    // clamp
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }     // omod
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }  // omod
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
  Register Src;
  unsigned Mods;
  std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());

  return {{
      [=](MachineInstrBuilder &MIB) {
        MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
      },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3ModsNonCanonicalizing(
    MachineOperand &Root) const {
  Register Src;
  unsigned Mods;
  std::tie(Src, Mods) =
      selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/false);

  return {{
      [=](MachineInstrBuilder &MIB) {
        MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
      },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
  Register Src;
  unsigned Mods;
  std::tie(Src, Mods) =
      selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/true,
                         /*AllowAbs=*/false);

  return {{
      [=](MachineInstrBuilder &MIB) {
        MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
      },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
  Register Reg = Root.getReg();
  const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
  if (Def->getOpcode() == AMDGPU::G_FNEG || Def->getOpcode() == AMDGPU::G_FABS)
    return {};
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
  }};
}

enum class SrcStatus {
  IS_SAME,
  IS_UPPER_HALF,
  IS_LOWER_HALF,
  IS_UPPER_HALF_NEG,
  // This means current op = [op_upper, op_lower] and src = -op_lower.
  IS_LOWER_HALF_NEG,
  IS_HI_NEG,
  // This means current op = [op_upper, op_lower] and src = [op_upper,
  // -op_lower].
  IS_LO_NEG,
  IS_BOTH_NEG,
  INVALID,
  NEG_START = IS_UPPER_HALF_NEG,
  NEG_END = IS_BOTH_NEG,
  HALF_START = IS_UPPER_HALF,
  HALF_END = IS_LOWER_HALF_NEG
};

/// Test if the MI is truncating to half, such as `%reg0:n = G_TRUNC %reg1:2n`.
static bool isTruncHalf(const MachineInstr *MI,
                        const MachineRegisterInfo &MRI) {
  if (MI->getOpcode() != AMDGPU::G_TRUNC)
    return false;

  unsigned DstSize = MRI.getType(MI->getOperand(0).getReg()).getSizeInBits();
  unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
  return DstSize * 2 == SrcSize;
}

/// Test if the MI is a logical shift right by half the bit width,
/// such as `%reg0:2n = G_LSHR %reg1:2n, CONST(n)`.
static bool isLshrHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {
  if (MI->getOpcode() != AMDGPU::G_LSHR)
    return false;

  Register ShiftSrc;
  std::optional<ValueAndVReg> ShiftAmt;
  if (mi_match(MI->getOperand(0).getReg(), MRI,
               m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt)))) {
    unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
    unsigned Shift = ShiftAmt->Value.getZExtValue();
    return Shift * 2 == SrcSize;
  }
  return false;
}

/// Test if the MI is a shift left by half the bit width,
/// such as `%reg0:2n = G_SHL %reg1:2n, CONST(n)`.
static bool isShlHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {
  if (MI->getOpcode() != AMDGPU::G_SHL)
    return false;

  Register ShiftSrc;
  std::optional<ValueAndVReg> ShiftAmt;
  if (mi_match(MI->getOperand(0).getReg(), MRI,
               m_GShl(m_Reg(ShiftSrc), m_GCst(ShiftAmt)))) {
    unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
    unsigned Shift = ShiftAmt->Value.getZExtValue();
    return Shift * 2 == SrcSize;
  }
  return false;
}

/// Test if the MI is `%reg0:n, %reg1:n = G_UNMERGE_VALUES %reg2:2n`.
static bool isUnmergeHalf(const MachineInstr *MI,
                          const MachineRegisterInfo &MRI) {
  if (MI->getOpcode() != AMDGPU::G_UNMERGE_VALUES)
    return false;
  return MI->getNumOperands() == 3 && MI->getOperand(0).isDef() &&
         MI->getOperand(1).isDef() && !MI->getOperand(2).isDef();
}

enum class TypeClass { VECTOR_OF_TWO, SCALAR, NONE_OF_LISTED };

static TypeClass isVectorOfTwoOrScalar(Register Reg,
                                       const MachineRegisterInfo &MRI) {
  LLT OpTy = MRI.getType(Reg);
  if (OpTy.isScalar())
    return TypeClass::SCALAR;
  if (OpTy.isVector() && OpTy.getNumElements() == 2)
    return TypeClass::VECTOR_OF_TWO;
  return TypeClass::NONE_OF_LISTED;
}

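// Given that \p Reg is negated (G_FNEG) and the walk so far has status \p S,
// return the status relative to the fneg's operand. An fneg of a vector of
// two negates both halves, while a scalar fneg only flips the sign bit, which
// lives in the upper half.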
getNegStatus(Register Reg,SrcStatus S,const MachineRegisterInfo & MRI)4414 static SrcStatus getNegStatus(Register Reg, SrcStatus S,
4415 const MachineRegisterInfo &MRI) {
4416 TypeClass NegType = isVectorOfTwoOrScalar(Reg, MRI);
4417 if (NegType != TypeClass::VECTOR_OF_TWO && NegType != TypeClass::SCALAR)
4418 return SrcStatus::INVALID;
4419
4420 switch (S) {
4421 case SrcStatus::IS_SAME:
4422 if (NegType == TypeClass::VECTOR_OF_TWO) {
4423 // Vector of 2:
4424 // [SrcHi, SrcLo] = [CurrHi, CurrLo]
4425 // [CurrHi, CurrLo] = neg [OpHi, OpLo](2 x Type)
4426 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4427 // [SrcHi, SrcLo] = [-OpHi, -OpLo]
4428 return SrcStatus::IS_BOTH_NEG;
4429 }
4430 if (NegType == TypeClass::SCALAR) {
4431 // Scalar:
4432 // [SrcHi, SrcLo] = [CurrHi, CurrLo]
4433 // [CurrHi, CurrLo] = neg [OpHi, OpLo](Type)
4434 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4435 // [SrcHi, SrcLo] = [-OpHi, OpLo]
4436 return SrcStatus::IS_HI_NEG;
4437 }
4438 break;
4439 case SrcStatus::IS_HI_NEG:
4440 if (NegType == TypeClass::VECTOR_OF_TWO) {
4441 // Vector of 2:
4442 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4443 // [CurrHi, CurrLo] = neg [OpHi, OpLo](2 x Type)
4444 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4445 // [SrcHi, SrcLo] = [-(-OpHi), -OpLo] = [OpHi, -OpLo]
4446 return SrcStatus::IS_LO_NEG;
4447 }
4448 if (NegType == TypeClass::SCALAR) {
4449 // Scalar:
4450 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4451 // [CurrHi, CurrLo] = neg [OpHi, OpLo](Type)
4452 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4453 // [SrcHi, SrcLo] = [-(-OpHi), OpLo] = [OpHi, OpLo]
4454 return SrcStatus::IS_SAME;
4455 }
4456 break;
4457 case SrcStatus::IS_LO_NEG:
4458 if (NegType == TypeClass::VECTOR_OF_TWO) {
4459 // Vector of 2:
4460 // [SrcHi, SrcLo] = [CurrHi, -CurrLo]
4461 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](2 x Type)
4462 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4463 // [SrcHi, SrcLo] = [-OpHi, -(-OpLo)] = [-OpHi, OpLo]
4464 return SrcStatus::IS_HI_NEG;
4465 }
4466 if (NegType == TypeClass::SCALAR) {
4467 // Scalar:
4468 // [SrcHi, SrcLo] = [CurrHi, -CurrLo]
4469 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](Type)
4470 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4471 // [SrcHi, SrcLo] = [-OpHi, -OpLo]
4472 return SrcStatus::IS_BOTH_NEG;
4473 }
4474 break;
4475 case SrcStatus::IS_BOTH_NEG:
4476 if (NegType == TypeClass::VECTOR_OF_TWO) {
4477 // Vector of 2:
4478 // [SrcHi, SrcLo] = [-CurrHi, -CurrLo]
4479 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](2 x Type)
4480 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4481 // [SrcHi, SrcLo] = [OpHi, OpLo]
4482 return SrcStatus::IS_SAME;
4483 }
4484 if (NegType == TypeClass::SCALAR) {
4485 // Scalar:
4486 // [SrcHi, SrcLo] = [-CurrHi, -CurrLo]
4487 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](Type)
4488 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4489 // [SrcHi, SrcLo] = [OpHi, -OpLo]
4490 return SrcStatus::IS_LO_NEG;
4491 }
4492 break;
4493 case SrcStatus::IS_UPPER_HALF:
4494 // Vector of 2:
4495 // Src = CurrUpper
4496 // Curr = [CurrUpper, CurrLower]
4497 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4498 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4499 // Src = -OpUpper
4500 //
4501 // Scalar:
4502 // Src = CurrUpper
4503 // Curr = [CurrUpper, CurrLower]
4504 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4505 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4506 // Src = -OpUpper
4507 return SrcStatus::IS_UPPER_HALF_NEG;
4508 case SrcStatus::IS_LOWER_HALF:
4509 if (NegType == TypeClass::VECTOR_OF_TWO) {
4510 // Vector of 2:
4511 // Src = CurrLower
4512 // Curr = [CurrUpper, CurrLower]
4513 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4514 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4515 // Src = -OpLower
4516 return SrcStatus::IS_LOWER_HALF_NEG;
4517 }
4518 if (NegType == TypeClass::SCALAR) {
4519 // Scalar:
4520 // Src = CurrLower
4521 // Curr = [CurrUpper, CurrLower]
4522 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4523 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4524 // Src = OpLower
4525 return SrcStatus::IS_LOWER_HALF;
4526 }
4527 break;
4528 case SrcStatus::IS_UPPER_HALF_NEG:
4529 // Vector of 2:
4530 // Src = -CurrUpper
4531 // Curr = [CurrUpper, CurrLower]
4532 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4533 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4534 // Src = -(-OpUpper) = OpUpper
4535 //
4536 // Scalar:
4537 // Src = -CurrUpper
4538 // Curr = [CurrUpper, CurrLower]
4539 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4540 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4541 // Src = -(-OpUpper) = OpUpper
4542 return SrcStatus::IS_UPPER_HALF;
4543 case SrcStatus::IS_LOWER_HALF_NEG:
4544 if (NegType == TypeClass::VECTOR_OF_TWO) {
4545 // Vector of 2:
4546 // Src = -CurrLower
4547 // Curr = [CurrUpper, CurrLower]
4548 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4549 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4550 // Src = -(-OpLower) = OpLower
4551 return SrcStatus::IS_LOWER_HALF;
4552 }
4553 if (NegType == TypeClass::SCALAR) {
4554 // Scalar:
4555 // Src = -CurrLower
4556 // Curr = [CurrUpper, CurrLower]
4557 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4558 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4559 // Src = -OpLower
4560 return SrcStatus::IS_LOWER_HALF_NEG;
4561 }
4562 break;
4563 default:
4564 break;
4565 }
4566 llvm_unreachable("unexpected SrcStatus & NegType combination");
4567 }
4568
4569 static std::optional<std::pair<Register, SrcStatus>>
calcNextStatus(std::pair<Register,SrcStatus> Curr,const MachineRegisterInfo & MRI)4570 calcNextStatus(std::pair<Register, SrcStatus> Curr,
4571 const MachineRegisterInfo &MRI) {
4572 const MachineInstr *MI = MRI.getVRegDef(Curr.first);
4573
4574 unsigned Opc = MI->getOpcode();
4575
4576 // Handle general Opc cases.
4577 switch (Opc) {
4578 case AMDGPU::G_BITCAST:
4579 return std::optional<std::pair<Register, SrcStatus>>(
4580 {MI->getOperand(1).getReg(), Curr.second});
4581 case AMDGPU::COPY:
4582 if (MI->getOperand(1).getReg().isPhysical())
4583 return std::nullopt;
4584 return std::optional<std::pair<Register, SrcStatus>>(
4585 {MI->getOperand(1).getReg(), Curr.second});
4586 case AMDGPU::G_FNEG: {
4587 SrcStatus Stat = getNegStatus(Curr.first, Curr.second, MRI);
4588 if (Stat == SrcStatus::INVALID)
4589 return std::nullopt;
4590 return std::optional<std::pair<Register, SrcStatus>>(
4591 {MI->getOperand(1).getReg(), Stat});
4592 }
4593 default:
4594 break;
4595 }
4596
4597 // Compute the next status from the current one.
4598 switch (Curr.second) {
4599 case SrcStatus::IS_SAME:
4600 if (isTruncHalf(MI, MRI))
4601 return std::optional<std::pair<Register, SrcStatus>>(
4602 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF});
4603 else if (isUnmergeHalf(MI, MRI)) {
4604 if (Curr.first == MI->getOperand(0).getReg())
4605 return std::optional<std::pair<Register, SrcStatus>>(
4606 {MI->getOperand(2).getReg(), SrcStatus::IS_LOWER_HALF});
4607 return std::optional<std::pair<Register, SrcStatus>>(
4608 {MI->getOperand(2).getReg(), SrcStatus::IS_UPPER_HALF});
4609 }
4610 break;
4611 case SrcStatus::IS_HI_NEG:
4612 if (isTruncHalf(MI, MRI)) {
4613 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4614 // [CurrHi, CurrLo] = trunc [OpUpper, OpLower] = OpLower
4615 // = [OpLowerHi, OpLowerLo]
4616 // Src = [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4617 // = [-OpLowerHi, OpLowerLo]
4618 // = -OpLower
4619 return std::optional<std::pair<Register, SrcStatus>>(
4620 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
4621 }
4622 if (isUnmergeHalf(MI, MRI)) {
4623 if (Curr.first == MI->getOperand(0).getReg())
4624 return std::optional<std::pair<Register, SrcStatus>>(
4625 {MI->getOperand(2).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
4626 return std::optional<std::pair<Register, SrcStatus>>(
4627 {MI->getOperand(2).getReg(), SrcStatus::IS_UPPER_HALF_NEG});
4628 }
4629 break;
4630 case SrcStatus::IS_UPPER_HALF:
4631 if (isShlHalf(MI, MRI))
4632 return std::optional<std::pair<Register, SrcStatus>>(
4633 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF});
4634 break;
4635 case SrcStatus::IS_LOWER_HALF:
4636 if (isLshrHalf(MI, MRI))
4637 return std::optional<std::pair<Register, SrcStatus>>(
4638 {MI->getOperand(1).getReg(), SrcStatus::IS_UPPER_HALF});
4639 break;
4640 case SrcStatus::IS_UPPER_HALF_NEG:
4641 if (isShlHalf(MI, MRI))
4642 return std::optional<std::pair<Register, SrcStatus>>(
4643 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
4644 break;
4645 case SrcStatus::IS_LOWER_HALF_NEG:
4646 if (isLshrHalf(MI, MRI))
4647 return std::optional<std::pair<Register, SrcStatus>>(
4648 {MI->getOperand(1).getReg(), SrcStatus::IS_UPPER_HALF_NEG});
4649 break;
4650 default:
4651 break;
4652 }
4653 return std::nullopt;
4654 }
4655
4656 /// This is used to control the valid statuses that the current MI supports.
4657 /// For example, a non-floating-point intrinsic such as @llvm.amdgcn.sdot2 does
4658 /// not support the NEG bit on VOP3P.
4659 /// The class can be further extended to recognize support for the SEL, NEG and
4660 /// ABS bits for different MIs on different architectures.
4661 class SearchOptions {
4662 private:
4663 bool HasNeg = false;
4664 // Assume all complex patterns of VOP3P have opsel.
4665 bool HasOpsel = true;
4666
4667 public:
4668 SearchOptions(Register Reg, const MachineRegisterInfo &MRI) {
4669 const MachineInstr *MI = MRI.getVRegDef(Reg);
4670 unsigned Opc = MI->getOpcode();
4671
4672 if (Opc < TargetOpcode::GENERIC_OP_END) {
4673 // Assume NEG is supported for all generic opcodes.
4674 HasNeg = true;
4675 } else if (Opc == TargetOpcode::G_INTRINSIC) {
4676 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(*MI).getIntrinsicID();
4677 // Only floating-point intrinsics have the neg & neg_hi bits.
4678 if (IntrinsicID == Intrinsic::amdgcn_fdot2)
4679 HasNeg = true;
4680 }
4681 }
4682 bool checkOptions(SrcStatus Stat) const {
4683 if (!HasNeg &&
4684 (Stat >= SrcStatus::NEG_START && Stat <= SrcStatus::NEG_END)) {
4685 return false;
4686 }
4687 if (!HasOpsel &&
4688 (Stat >= SrcStatus::HALF_START && Stat <= SrcStatus::HALF_END)) {
4689 return false;
4690 }
4691 return true;
4692 }
4693 };
4694
4695 static SmallVector<std::pair<Register, SrcStatus>>
4696 getSrcStats(Register Reg, const MachineRegisterInfo &MRI, SearchOptions SO,
4697 int MaxDepth = 3) {
4698 int Depth = 0;
4699 auto Curr = calcNextStatus({Reg, SrcStatus::IS_SAME}, MRI);
4700 SmallVector<std::pair<Register, SrcStatus>> Statlist;
4701
4702 while (Depth <= MaxDepth && Curr.has_value()) {
4703 Depth++;
4704 if (SO.checkOptions(Curr.value().second))
4705 Statlist.push_back(Curr.value());
4706 Curr = calcNextStatus(Curr.value(), MRI);
4707 }
4708
4709 return Statlist;
4710 }
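// For example (illustrative registers, assuming the search options allow
// half selection):
//   %1:vgpr(s32) = G_LSHR %0, %c16
//   %2:vgpr(s16) = G_TRUNC %1
// getSrcStats(%2, ...) records {%1, IS_LOWER_HALF} after looking through
// the trunc, then {%0, IS_UPPER_HALF} after looking through the shift.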
4711
4712 static std::pair<Register, SrcStatus>
4713 getLastSameOrNeg(Register Reg, const MachineRegisterInfo &MRI, SearchOptions SO,
4714 int MaxDepth = 3) {
4715 int Depth = 0;
4716 std::pair<Register, SrcStatus> LastSameOrNeg = {Reg, SrcStatus::IS_SAME};
4717 auto Curr = calcNextStatus(LastSameOrNeg, MRI);
4718
4719 while (Depth <= MaxDepth && Curr.has_value()) {
4720 Depth++;
4721 SrcStatus Stat = Curr.value().second;
4722 if (SO.checkOptions(Stat)) {
4723 if (Stat == SrcStatus::IS_SAME || Stat == SrcStatus::IS_HI_NEG ||
4724 Stat == SrcStatus::IS_LO_NEG || Stat == SrcStatus::IS_BOTH_NEG)
4725 LastSameOrNeg = Curr.value();
4726 }
4727 Curr = calcNextStatus(Curr.value(), MRI);
4728 }
4729
4730 return LastSameOrNeg;
4731 }
4732
4733 static bool isSameBitWidth(Register Reg1, Register Reg2,
4734 const MachineRegisterInfo &MRI) {
4735 unsigned Width1 = MRI.getType(Reg1).getSizeInBits();
4736 unsigned Width2 = MRI.getType(Reg2).getSizeInBits();
4737 return Width1 == Width2;
4738 }
4739
4740 static unsigned updateMods(SrcStatus HiStat, SrcStatus LoStat, unsigned Mods) {
4741 // SrcStatus::IS_LOWER_HALF leaves Mods unchanged (0).
4742 if (HiStat == SrcStatus::IS_UPPER_HALF_NEG) {
4743 Mods ^= SISrcMods::NEG_HI;
4744 Mods |= SISrcMods::OP_SEL_1;
4745 } else if (HiStat == SrcStatus::IS_UPPER_HALF)
4746 Mods |= SISrcMods::OP_SEL_1;
4747 else if (HiStat == SrcStatus::IS_LOWER_HALF_NEG)
4748 Mods ^= SISrcMods::NEG_HI;
4749 else if (HiStat == SrcStatus::IS_HI_NEG)
4750 Mods ^= SISrcMods::NEG_HI;
4751
4752 if (LoStat == SrcStatus::IS_UPPER_HALF_NEG) {
4753 Mods ^= SISrcMods::NEG;
4754 Mods |= SISrcMods::OP_SEL_0;
4755 } else if (LoStat == SrcStatus::IS_UPPER_HALF)
4756 Mods |= SISrcMods::OP_SEL_0;
4757 else if (LoStat == SrcStatus::IS_LOWER_HALF_NEG)
4758 Mods |= SISrcMods::NEG;
4759 else if (LoStat == SrcStatus::IS_HI_NEG)
4760 Mods ^= SISrcMods::NEG;
4761
4762 return Mods;
4763 }
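// For example, updateMods with HiStat = IS_UPPER_HALF and
// LoStat = IS_LOWER_HALF sets only OP_SEL_1, i.e. the default VOP3P
// selection (hi lane from the upper half, lo lane from the lower half)
// with no negation.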
4764
4765 static bool isValidToPack(SrcStatus HiStat, SrcStatus LoStat, Register NewReg,
4766 Register RootReg, const SIInstrInfo &TII,
4767 const MachineRegisterInfo &MRI) {
4768 auto IsHalfState = [](SrcStatus S) {
4769 return S == SrcStatus::IS_UPPER_HALF || S == SrcStatus::IS_UPPER_HALF_NEG ||
4770 S == SrcStatus::IS_LOWER_HALF || S == SrcStatus::IS_LOWER_HALF_NEG;
4771 };
4772 return isSameBitWidth(NewReg, RootReg, MRI) && IsHalfState(LoStat) &&
4773 IsHalfState(HiStat);
4774 }
4775
4776 std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3PModsImpl(
4777 Register RootReg, const MachineRegisterInfo &MRI, bool IsDOT) const {
4778 unsigned Mods = 0;
4779 // No modification if the Root type is not of the form <2 x Type>.
4780 if (isVectorOfTwoOrScalar(RootReg, MRI) != TypeClass::VECTOR_OF_TWO) {
4781 Mods |= SISrcMods::OP_SEL_1;
4782 return {RootReg, Mods};
4783 }
4784
4785 SearchOptions SO(RootReg, MRI);
4786
4787 std::pair<Register, SrcStatus> Stat = getLastSameOrNeg(RootReg, MRI, SO);
4788
4789 if (Stat.second == SrcStatus::IS_BOTH_NEG)
4790 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
4791 else if (Stat.second == SrcStatus::IS_HI_NEG)
4792 Mods ^= SISrcMods::NEG_HI;
4793 else if (Stat.second == SrcStatus::IS_LO_NEG)
4794 Mods ^= SISrcMods::NEG;
4795
4796 MachineInstr *MI = MRI.getVRegDef(Stat.first);
4797
4798 if (MI->getOpcode() != AMDGPU::G_BUILD_VECTOR || MI->getNumOperands() != 3 ||
4799 (IsDOT && Subtarget->hasDOTOpSelHazard())) {
4800 Mods |= SISrcMods::OP_SEL_1;
4801 return {Stat.first, Mods};
4802 }
4803
4804 SmallVector<std::pair<Register, SrcStatus>> StatlistHi =
4805 getSrcStats(MI->getOperand(2).getReg(), MRI, SO);
4806
4807 if (StatlistHi.empty()) {
4808 Mods |= SISrcMods::OP_SEL_1;
4809 return {Stat.first, Mods};
4810 }
4811
4812 SmallVector<std::pair<Register, SrcStatus>> StatlistLo =
4813 getSrcStats(MI->getOperand(1).getReg(), MRI, SO);
4814
4815 if (StatlistLo.empty()) {
4816 Mods |= SISrcMods::OP_SEL_1;
4817 return {Stat.first, Mods};
4818 }
4819
4820 for (int I = StatlistHi.size() - 1; I >= 0; I--) {
4821 for (int J = StatlistLo.size() - 1; J >= 0; J--) {
4822 if (StatlistHi[I].first == StatlistLo[J].first &&
4823 isValidToPack(StatlistHi[I].second, StatlistLo[J].second,
4824 StatlistHi[I].first, RootReg, TII, MRI))
4825 return {StatlistHi[I].first,
4826 updateMods(StatlistHi[I].second, StatlistLo[J].second, Mods)};
4827 }
4828 }
4829 // Packed instructions do not have abs modifiers.
4830 Mods |= SISrcMods::OP_SEL_1;
4831
4832 return {Stat.first, Mods};
4833 }
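// A sketch of the overall fold implemented above (illustrative registers):
//   %s:vgpr(s32) = G_LSHR %x, %c16
//   %hi:vgpr(s16) = G_TRUNC %s
//   %lo:vgpr(s16) = G_TRUNC %x
//   %v:vgpr(<2 x s16>) = G_BUILD_VECTOR %lo, %hi
// Both halves trace back to %x, so %v can be selected as %x itself with
// Mods = OP_SEL_1, avoiding materialization of the build_vector.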
4834
4835
4836
4837 static bool checkRB(Register Reg, unsigned int RBNo,
4838 const AMDGPURegisterBankInfo &RBI,
4839 const MachineRegisterInfo &MRI,
4840 const TargetRegisterInfo &TRI) {
4841 const RegisterBank *RB = RBI.getRegBank(Reg, MRI, TRI);
4842 return RB->getID() == RBNo;
4843 }
4844
4845 // This function is used to get a register with a register bank that is legal
4846 // for the returned reg. Assume:
4847 // 1. VOP3P is always legal for VGPRs.
4848 // 2. RootOp's register bank is legal.
4849 // Thus:
4850 // 1. If RootOp is an SGPR, then NewOp can be an SGPR or a VGPR.
4851 // 2. If RootOp is a VGPR, then NewOp must be a VGPR.
4852 static Register getLegalRegBank(Register NewReg, Register RootReg,
4853 const AMDGPURegisterBankInfo &RBI,
4854 MachineRegisterInfo &MRI,
4855 const TargetRegisterInfo &TRI,
4856 const SIInstrInfo &TII) {
4857 // RootOp can only be a VGPR or an SGPR (some hand-written cases such as
4858 // inst-select-ashr.v2s16.mir::ashr_v2s16_vs).
4859 if (checkRB(RootReg, AMDGPU::SGPRRegBankID, RBI, MRI, TRI) ||
4860 checkRB(NewReg, AMDGPU::VGPRRegBankID, RBI, MRI, TRI))
4861 return NewReg;
4862
4863 MachineInstr *MI = MRI.getVRegDef(RootReg);
4864 if (MI->getOpcode() == AMDGPU::COPY && NewReg == MI->getOperand(1).getReg()) {
4865 // RootOp is VGPR, NewOp is not VGPR, but RootOp = COPY NewOp.
4866 return RootReg;
4867 }
4868
4869 MachineBasicBlock *BB = MI->getParent();
4870 Register DstReg = MRI.cloneVirtualRegister(RootReg);
4871
4872 MachineInstrBuilder MIB =
4873 BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
4874 .addReg(NewReg);
4875
4876 // Only accept VGPR.
4877 return MIB->getOperand(0).getReg();
4878 }
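// E.g. if RootReg is in the VGPR bank and NewReg is not (and RootReg is not
// simply a COPY of NewReg), the code above inserts
//   %dst = COPY %NewReg
// with %dst cloned from RootReg, so the operand remains VGPR-legal.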
4879
4880 InstructionSelector::ComplexRendererFns
4881 AMDGPUInstructionSelector::selectVOP3PRetHelper(MachineOperand &Root,
4882 bool IsDOT) const {
4883 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
4884 Register Reg;
4885 unsigned Mods;
4886 std::tie(Reg, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, IsDOT);
4887
4888 Reg = getLegalRegBank(Reg, Root.getReg(), RBI, MRI, TRI, TII);
4889 return {{
4890 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
4891 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4892 }};
4893 }
4894
4895 InstructionSelector::ComplexRendererFns
4896 AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
4897
4898 return selectVOP3PRetHelper(Root);
4899 }
4900
4901 InstructionSelector::ComplexRendererFns
4902 AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
4903
4904 return selectVOP3PRetHelper(Root, true);
4905 }
4906
4907 InstructionSelector::ComplexRendererFns
4908 AMDGPUInstructionSelector::selectVOP3PModsNeg(MachineOperand &Root) const {
4909 // A literal i1 value set in the intrinsic represents the SrcMods for the next
4910 // operand. The value is in the Imm operand as an i1 sign-extended to int64_t.
4911 // 1 (-1) promotes packed values to signed, 0 treats them as unsigned.
4912 assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
4913 "expected i1 value");
4914 unsigned Mods = SISrcMods::OP_SEL_1;
4915 if (Root.getImm() == -1)
4916 Mods ^= SISrcMods::NEG;
4917 return {{
4918 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4919 }};
4920 }
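// E.g. an immediate of -1 yields OP_SEL_1 | NEG, while 0 yields OP_SEL_1
// alone (the default hi-half selection).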
4921
4922 InstructionSelector::ComplexRendererFns
4923 AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods(
4924 MachineOperand &Root) const {
4925 assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
4926 "expected i1 value");
4927 unsigned Mods = SISrcMods::OP_SEL_1;
4928 if (Root.getImm() != 0)
4929 Mods |= SISrcMods::OP_SEL_0;
4930
4931 return {{
4932 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4933 }};
4934 }
4935
4936 static Register buildRegSequence(SmallVectorImpl<Register> &Elts,
4937 MachineInstr *InsertPt,
4938 MachineRegisterInfo &MRI) {
4939 const TargetRegisterClass *DstRegClass;
4940 switch (Elts.size()) {
4941 case 8:
4942 DstRegClass = &AMDGPU::VReg_256RegClass;
4943 break;
4944 case 4:
4945 DstRegClass = &AMDGPU::VReg_128RegClass;
4946 break;
4947 case 2:
4948 DstRegClass = &AMDGPU::VReg_64RegClass;
4949 break;
4950 default:
4951 llvm_unreachable("unhandled Reg sequence size");
4952 }
4953
4954 MachineIRBuilder B(*InsertPt);
4955 auto MIB = B.buildInstr(AMDGPU::REG_SEQUENCE)
4956 .addDef(MRI.createVirtualRegister(DstRegClass));
4957 for (unsigned i = 0; i < Elts.size(); ++i) {
4958 MIB.addReg(Elts[i]);
4959 MIB.addImm(SIRegisterInfo::getSubRegFromChannel(i));
4960 }
4961 return MIB->getOperand(0).getReg();
4962 }
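// For a two-element input this builds (illustrative registers):
//   %seq:vreg_64 = REG_SEQUENCE %elt0, %subreg.sub0, %elt1, %subreg.sub1
// since getSubRegFromChannel(0) and getSubRegFromChannel(1) return sub0 and
// sub1 respectively.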
4963
4964 static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
4965 SmallVectorImpl<Register> &Elts, Register &Src,
4966 MachineInstr *InsertPt,
4967 MachineRegisterInfo &MRI) {
4968 if (ModOpcode == TargetOpcode::G_FNEG) {
4969 Mods |= SISrcMods::NEG;
4970 // Check if all elements also have abs modifier
4971 SmallVector<Register, 8> NegAbsElts;
4972 for (auto El : Elts) {
4973 Register FabsSrc;
4974 if (!mi_match(El, MRI, m_GFabs(m_Reg(FabsSrc))))
4975 break;
4976 NegAbsElts.push_back(FabsSrc);
4977 }
4978 if (Elts.size() != NegAbsElts.size()) {
4979 // Neg
4980 Src = buildRegSequence(Elts, InsertPt, MRI);
4981 } else {
4982 // Neg and Abs
4983 Mods |= SISrcMods::NEG_HI;
4984 Src = buildRegSequence(NegAbsElts, InsertPt, MRI);
4985 }
4986 } else {
4987 assert(ModOpcode == TargetOpcode::G_FABS);
4988 // Abs
4989 Mods |= SISrcMods::NEG_HI;
4990 Src = buildRegSequence(Elts, InsertPt, MRI);
4991 }
4992 }
4993
4994 InstructionSelector::ComplexRendererFns
4995 AMDGPUInstructionSelector::selectWMMAModsF32NegAbs(MachineOperand &Root) const {
4996 Register Src = Root.getReg();
4997 unsigned Mods = SISrcMods::OP_SEL_1;
4998 SmallVector<Register, 8> EltsF32;
4999
5000 if (GBuildVector *BV = dyn_cast<GBuildVector>(MRI->getVRegDef(Src))) {
5001 assert(BV->getNumSources() > 0);
5002 // Based on the first element, decide which mod we match: neg or abs.
5003 MachineInstr *ElF32 = MRI->getVRegDef(BV->getSourceReg(0));
5004 unsigned ModOpcode = (ElF32->getOpcode() == AMDGPU::G_FNEG)
5005 ? AMDGPU::G_FNEG
5006 : AMDGPU::G_FABS;
5007 for (unsigned i = 0; i < BV->getNumSources(); ++i) {
5008 ElF32 = MRI->getVRegDef(BV->getSourceReg(i));
5009 if (ElF32->getOpcode() != ModOpcode)
5010 break;
5011 EltsF32.push_back(ElF32->getOperand(1).getReg());
5012 }
5013
5014 // All elements had the ModOpcode modifier.
5015 if (BV->getNumSources() == EltsF32.size()) {
5016 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, Root.getParent(),
5017 *MRI);
5018 }
5019 }
5020
5021 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5022 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
5023 }
5024
5025 InstructionSelector::ComplexRendererFns
5026 AMDGPUInstructionSelector::selectWMMAModsF16Neg(MachineOperand &Root) const {
5027 Register Src = Root.getReg();
5028 unsigned Mods = SISrcMods::OP_SEL_1;
5029 SmallVector<Register, 8> EltsV2F16;
5030
5031 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
5032 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
5033 Register FNegSrc;
5034 if (!mi_match(CV->getSourceReg(i), *MRI, m_GFNeg(m_Reg(FNegSrc))))
5035 break;
5036 EltsV2F16.push_back(FNegSrc);
5037 }
5038
5039 // All elements had the fneg modifier.
5040 if (CV->getNumSources() == EltsV2F16.size()) {
5041 Mods |= SISrcMods::NEG;
5042 Mods |= SISrcMods::NEG_HI;
5043 Src = buildRegSequence(EltsV2F16, Root.getParent(), *MRI);
5044 }
5045 }
5046
5047 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5048 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
5049 }
5050
5051 InstructionSelector::ComplexRendererFns
5052 AMDGPUInstructionSelector::selectWMMAModsF16NegAbs(MachineOperand &Root) const {
5053 Register Src = Root.getReg();
5054 unsigned Mods = SISrcMods::OP_SEL_1;
5055 SmallVector<Register, 8> EltsV2F16;
5056
5057 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
5058 assert(CV->getNumSources() > 0);
5059 MachineInstr *ElV2F16 = MRI->getVRegDef(CV->getSourceReg(0));
5060 // Based on the first element, decide which mod we match: neg or abs.
5061 unsigned ModOpcode = (ElV2F16->getOpcode() == AMDGPU::G_FNEG)
5062 ? AMDGPU::G_FNEG
5063 : AMDGPU::G_FABS;
5064
5065 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
5066 ElV2F16 = MRI->getVRegDef(CV->getSourceReg(i));
5067 if (ElV2F16->getOpcode() != ModOpcode)
5068 break;
5069 EltsV2F16.push_back(ElV2F16->getOperand(1).getReg());
5070 }
5071
5072 // All elements had the ModOpcode modifier.
5073 if (CV->getNumSources() == EltsV2F16.size()) {
5074 MachineIRBuilder B(*Root.getParent());
5075 selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, Root.getParent(),
5076 *MRI);
5077 }
5078 }
5079
5080 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5081 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
5082 }
5083
5084 InstructionSelector::ComplexRendererFns
5085 AMDGPUInstructionSelector::selectWMMAVISrc(MachineOperand &Root) const {
5086 std::optional<FPValueAndVReg> FPValReg;
5087 if (mi_match(Root.getReg(), *MRI, m_GFCstOrSplat(FPValReg))) {
5088 if (TII.isInlineConstant(FPValReg->Value)) {
5089 return {{[=](MachineInstrBuilder &MIB) {
5090 MIB.addImm(FPValReg->Value.bitcastToAPInt().getSExtValue());
5091 }}};
5092 }
5093 // Non-inlineable splat floats should not fall through to the integer
5094 // immediate checks.
5095 return {};
5096 }
5097
5098 APInt ICst;
5099 if (mi_match(Root.getReg(), *MRI, m_ICstOrSplat(ICst))) {
5100 if (TII.isInlineConstant(ICst)) {
5101 return {
5102 {[=](MachineInstrBuilder &MIB) { MIB.addImm(ICst.getSExtValue()); }}};
5103 }
5104 }
5105
5106 return {};
5107 }
5108
5109 InstructionSelector::ComplexRendererFns
5110 AMDGPUInstructionSelector::selectSWMMACIndex8(MachineOperand &Root) const {
5111 Register Src =
5112 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
5113 unsigned Key = 0;
5114
5115 Register ShiftSrc;
5116 std::optional<ValueAndVReg> ShiftAmt;
5117 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
5118 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
5119 ShiftAmt->Value.getZExtValue() % 8 == 0) {
5120 Key = ShiftAmt->Value.getZExtValue() / 8;
5121 Src = ShiftSrc;
5122 }
5123
5124 return {{
5125 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5126 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
5127 }};
5128 }
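// E.g. a 32-bit %src shifted right by 16 selects byte 2, so the shift is
// folded away and index_key is set to 2.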
5129
5130 InstructionSelector::ComplexRendererFns
5131 AMDGPUInstructionSelector::selectSWMMACIndex16(MachineOperand &Root) const {
5132
5133 Register Src =
5134 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
5135 unsigned Key = 0;
5136
5137 Register ShiftSrc;
5138 std::optional<ValueAndVReg> ShiftAmt;
5139 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
5140 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
5141 ShiftAmt->Value.getZExtValue() == 16) {
5142 Src = ShiftSrc;
5143 Key = 1;
5144 }
5145
5146 return {{
5147 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5148 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
5149 }};
5150 }
5151
5152 InstructionSelector::ComplexRendererFns
5153 AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
5154 Register Src;
5155 unsigned Mods;
5156 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
5157
5158 // FIXME: Handle op_sel
5159 return {{
5160 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5161 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5162 }};
5163 }
5164
5165 // FIXME-TRUE16 remove when fake16 is removed
5166 InstructionSelector::ComplexRendererFns
5167 AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const {
5168 Register Src;
5169 unsigned Mods;
5170 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
5171 /*IsCanonicalizing=*/true,
5172 /*AllowAbs=*/false,
5173 /*OpSel=*/false);
5174
5175 return {{
5176 [=](MachineInstrBuilder &MIB) {
5177 MIB.addReg(
5178 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
5179 },
5180 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
5181 }};
5182 }
5183
5184 InstructionSelector::ComplexRendererFns
5185 AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
5186 Register Src;
5187 unsigned Mods;
5188 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
5189 /*IsCanonicalizing=*/true,
5190 /*AllowAbs=*/false,
5191 /*OpSel=*/true);
5192
5193 return {{
5194 [=](MachineInstrBuilder &MIB) {
5195 MIB.addReg(
5196 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
5197 },
5198 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
5199 }};
5200 }
5201
5202 bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
5203 Register &Base,
5204 Register *SOffset,
5205 int64_t *Offset) const {
5206 MachineInstr *MI = Root.getParent();
5207 MachineBasicBlock *MBB = MI->getParent();
5208
5209 // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
5210 // then we can select all ptr + 32-bit offsets.
5211 SmallVector<GEPInfo, 4> AddrInfo;
5212 getAddrModeInfo(*MI, *MRI, AddrInfo);
5213
5214 if (AddrInfo.empty())
5215 return false;
5216
5217 const GEPInfo &GEPI = AddrInfo[0];
5218 std::optional<int64_t> EncodedImm;
5219
5220 if (SOffset && Offset) {
5221 EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
5222 /*HasSOffset=*/true);
5223 if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm &&
5224 AddrInfo.size() > 1) {
5225 const GEPInfo &GEPI2 = AddrInfo[1];
5226 if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) {
5227 if (Register OffsetReg =
5228 matchZeroExtendFromS32(*MRI, GEPI2.SgprParts[1])) {
5229 Base = GEPI2.SgprParts[0];
5230 *SOffset = OffsetReg;
5231 *Offset = *EncodedImm;
5232 if (*Offset >= 0 || !AMDGPU::hasSMRDSignedImmOffset(STI))
5233 return true;
5234
5235 // For unbuffered smem loads, it is illegal for the Immediate Offset
5236 // to be negative if the resulting (Offset + (M0 or SOffset or zero))
5237 // is negative. Handle the case where the Immediate Offset + SOffset
5238 // is negative.
5239 auto SKnown = VT->getKnownBits(*SOffset);
5240 if (*Offset + SKnown.getMinValue().getSExtValue() < 0)
5241 return false;
5242
5243 return true;
5244 }
5245 }
5246 }
5247 return false;
5248 }
5249
5250 EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
5251 /*HasSOffset=*/false);
5252 if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) {
5253 Base = GEPI.SgprParts[0];
5254 *Offset = *EncodedImm;
5255 return true;
5256 }
5257
5258 // SGPR offset is unsigned.
5259 if (SOffset && GEPI.SgprParts.size() == 1 && isUInt<32>(GEPI.Imm) &&
5260 GEPI.Imm != 0) {
5261 // If we make it this far we have a load with a 32-bit immediate offset.
5262 // It is OK to select this using a sgpr offset, because we have already
5263 // failed trying to select this load into one of the _IMM variants since
5264 // the _IMM Patterns are considered before the _SGPR patterns.
5265 Base = GEPI.SgprParts[0];
5266 *SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5267 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), *SOffset)
5268 .addImm(GEPI.Imm);
5269 return true;
5270 }
5271
5272 if (SOffset && GEPI.SgprParts.size() == 2 && GEPI.Imm == 0) {
5273 if (Register OffsetReg = matchZeroExtendFromS32(*MRI, GEPI.SgprParts[1])) {
5274 Base = GEPI.SgprParts[0];
5275 *SOffset = OffsetReg;
5276 return true;
5277 }
5278 }
5279
5280 return false;
5281 }
5282
5283 InstructionSelector::ComplexRendererFns
5284 AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
5285 Register Base;
5286 int64_t Offset;
5287 if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset))
5288 return std::nullopt;
5289
5290 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
5291 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
5292 }
5293
5294 InstructionSelector::ComplexRendererFns
5295 AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
5296 SmallVector<GEPInfo, 4> AddrInfo;
5297 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
5298
5299 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
5300 return std::nullopt;
5301
5302 const GEPInfo &GEPInfo = AddrInfo[0];
5303 Register PtrReg = GEPInfo.SgprParts[0];
5304 std::optional<int64_t> EncodedImm =
5305 AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
5306 if (!EncodedImm)
5307 return std::nullopt;
5308
5309 return {{
5310 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
5311 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
5312 }};
5313 }
5314
5315 InstructionSelector::ComplexRendererFns
5316 AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
5317 Register Base, SOffset;
5318 if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr))
5319 return std::nullopt;
5320
5321 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
5322 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
5323 }
5324
5325 InstructionSelector::ComplexRendererFns
5326 AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const {
5327 Register Base, SOffset;
5328 int64_t Offset;
5329 if (!selectSmrdOffset(Root, Base, &SOffset, &Offset))
5330 return std::nullopt;
5331
5332 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
5333 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
5334 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
5335 }
5336
5337 std::pair<Register, int>
5338 AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
5339 uint64_t FlatVariant) const {
5340 MachineInstr *MI = Root.getParent();
5341
5342 auto Default = std::pair(Root.getReg(), 0);
5343
5344 if (!STI.hasFlatInstOffsets())
5345 return Default;
5346
5347 Register PtrBase;
5348 int64_t ConstOffset;
5349 std::tie(PtrBase, ConstOffset) =
5350 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
5351
5352 if (ConstOffset == 0 || (FlatVariant == SIInstrFlags::FlatScratch &&
5353 !isFlatScratchBaseLegal(Root.getReg())))
5354 return Default;
5355
5356 unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
5357 if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, FlatVariant))
5358 return Default;
5359
5360 return std::pair(PtrBase, ConstOffset);
5361 }
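// For example, when the constant offset is legal for the given flat variant
// (illustrative registers):
//   %off:vgpr(s64) = G_CONSTANT i64 16
//   %addr:vgpr(p1) = G_PTR_ADD %base, %off
// this returns {%base, 16}, so the 16 can be encoded in the instruction's
// immediate offset field instead of a separate add.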
5362
5363 InstructionSelector::ComplexRendererFns
5364 AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
5365 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FLAT);
5366
5367 return {{
5368 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
5369 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
5370 }};
5371 }
5372
5373 InstructionSelector::ComplexRendererFns
5374 AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const {
5375 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatGlobal);
5376
5377 return {{
5378 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
5379 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
5380 }};
5381 }
5382
5383 InstructionSelector::ComplexRendererFns
5384 AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
5385 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatScratch);
5386
5387 return {{
5388 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
5389 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
5390 }};
5391 }
5392
5393 // Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
5394 InstructionSelector::ComplexRendererFns
5395 AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
5396 Register Addr = Root.getReg();
5397 Register PtrBase;
5398 int64_t ConstOffset;
5399 int64_t ImmOffset = 0;
5400
5401 // Match the immediate offset first, which canonically is moved as low as
5402 // possible.
5403 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
5404
5405 if (ConstOffset != 0) {
5406 if (TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
5407 SIInstrFlags::FlatGlobal)) {
5408 Addr = PtrBase;
5409 ImmOffset = ConstOffset;
5410 } else {
5411 auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI);
5412 if (isSGPR(PtrBaseDef->Reg)) {
5413 if (ConstOffset > 0) {
5414 // Offset is too large.
5415 //
5416 // saddr + large_offset -> saddr +
5417 // (voffset = large_offset & ~MaxOffset) +
5418 // (large_offset & MaxOffset);
5419 int64_t SplitImmOffset, RemainderOffset;
5420 std::tie(SplitImmOffset, RemainderOffset) = TII.splitFlatOffset(
5421 ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
5422
5423 if (isUInt<32>(RemainderOffset)) {
5424 MachineInstr *MI = Root.getParent();
5425 MachineBasicBlock *MBB = MI->getParent();
5426 Register HighBits =
5427 MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5428
5429 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
5430 HighBits)
5431 .addImm(RemainderOffset);
5432
5433 return {{
5434 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr
5435 [=](MachineInstrBuilder &MIB) {
5436 MIB.addReg(HighBits);
5437 }, // voffset
5438 [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
5439 }};
5440 }
5441 }
5442
5443 // We are adding a 64-bit SGPR and a constant. If the constant bus limit
5444 // is 1 we would need to perform 1 or 2 extra moves for each half of
5445 // the constant and it is better to do a scalar add and then issue a
5446 // single VALU instruction to materialize zero. Otherwise it takes fewer
5447 // instructions to perform VALU adds with immediates or inline literals.
5448 unsigned NumLiterals =
5449 !TII.isInlineConstant(APInt(32, Lo_32(ConstOffset))) +
5450 !TII.isInlineConstant(APInt(32, Hi_32(ConstOffset)));
5451 if (STI.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
5452 return std::nullopt;
5453 }
5454 }
5455 }
5456
5457 // Match the variable offset.
5458 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
5459 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
5460 // Look through the SGPR->VGPR copy.
5461 Register SAddr =
5462 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
5463
5464 if (isSGPR(SAddr)) {
5465 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
5466
5467 // It's possible voffset is an SGPR here, but the copy to VGPR will be
5468 // inserted later.
5469 if (Register VOffset = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
5470 return {{[=](MachineInstrBuilder &MIB) { // saddr
5471 MIB.addReg(SAddr);
5472 },
5473 [=](MachineInstrBuilder &MIB) { // voffset
5474 MIB.addReg(VOffset);
5475 },
5476 [=](MachineInstrBuilder &MIB) { // offset
5477 MIB.addImm(ImmOffset);
5478 }}};
5479 }
5480 }
5481 }
5482
5483 // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
5484 // drop this.
5485 if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
5486 AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg))
5487 return std::nullopt;
5488
5489 // It's cheaper to materialize a single 32-bit zero for vaddr than the two
5490 // moves required to copy a 64-bit SGPR to VGPR.
5491 MachineInstr *MI = Root.getParent();
5492 MachineBasicBlock *MBB = MI->getParent();
5493 Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5494
5495 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
5496 .addImm(0);
5497
5498 return {{
5499 [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
5500 [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
5501 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
5502 }};
5503 }
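// E.g. for `saddr + large_offset` with an SGPR base, the code above splits
// the constant per TII.splitFlatOffset: the remainder goes into a
// V_MOV_B32-materialized voffset and the rest into the immediate field.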
5504
5505 InstructionSelector::ComplexRendererFns
5506 AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
5507 Register Addr = Root.getReg();
5508 Register PtrBase;
5509 int64_t ConstOffset;
5510 int64_t ImmOffset = 0;
5511
5512 // Match the immediate offset first, which canonically is moved as low as
5513 // possible.
5514 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
5515
5516 if (ConstOffset != 0 && isFlatScratchBaseLegal(Addr) &&
5517 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
5518 SIInstrFlags::FlatScratch)) {
5519 Addr = PtrBase;
5520 ImmOffset = ConstOffset;
5521 }
5522
5523 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
5524 if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
5525 int FI = AddrDef->MI->getOperand(1).getIndex();
5526 return {{
5527 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
5528 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
5529 }};
5530 }
5531
5532 Register SAddr = AddrDef->Reg;
5533
5534 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
5535 Register LHS = AddrDef->MI->getOperand(1).getReg();
5536 Register RHS = AddrDef->MI->getOperand(2).getReg();
5537 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
5538 auto RHSDef = getDefSrcRegIgnoringCopies(RHS, *MRI);
5539
5540 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
5541 isSGPR(RHSDef->Reg)) {
5542 int FI = LHSDef->MI->getOperand(1).getIndex();
5543 MachineInstr &I = *Root.getParent();
5544 MachineBasicBlock *BB = I.getParent();
5545 const DebugLoc &DL = I.getDebugLoc();
5546 SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5547
5548 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_I32), SAddr)
5549 .addFrameIndex(FI)
5550 .addReg(RHSDef->Reg)
5551 .setOperandDead(3); // Dead scc
5552 }
5553 }
5554
5555 if (!isSGPR(SAddr))
5556 return std::nullopt;
5557
5558 return {{
5559 [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr
5560 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
5561 }};
5562 }
5563
5564 // Check whether the flat scratch SVS swizzle bug affects this access.
5565 bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug(
5566 Register VAddr, Register SAddr, uint64_t ImmOffset) const {
5567 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
5568 return false;
5569
5570 // The bug affects the swizzling of SVS accesses if there is any carry out
5571 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
5572 // voffset to (soffset + inst_offset).
5573 auto VKnown = VT->getKnownBits(VAddr);
5574 auto SKnown = KnownBits::add(VT->getKnownBits(SAddr),
5575 KnownBits::makeConstant(APInt(32, ImmOffset)));
5576 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
5577 uint64_t SMax = SKnown.getMaxValue().getZExtValue();
5578 return (VMax & 3) + (SMax & 3) >= 4;
5579 }
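// E.g. if the low two bits of VAddr may be as large as 3 and the low two
// bits of (SAddr + ImmOffset) as large as 2, then 3 + 2 >= 4, so a carry
// from bit 1 into bit 2 cannot be ruled out and the access is rejected.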
5580
5581 InstructionSelector::ComplexRendererFns
5582 AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
5583 Register Addr = Root.getReg();
5584 Register PtrBase;
5585 int64_t ConstOffset;
5586 int64_t ImmOffset = 0;
5587
5588 // Match the immediate offset first, which canonically is moved as low as
5589 // possible.
5590 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
5591
5592 Register OrigAddr = Addr;
5593 if (ConstOffset != 0 &&
5594 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
5595 SIInstrFlags::FlatScratch)) {
5596 Addr = PtrBase;
5597 ImmOffset = ConstOffset;
5598 }
5599
5600 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
5601 if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD)
5602 return std::nullopt;
5603
5604 Register RHS = AddrDef->MI->getOperand(2).getReg();
5605 if (RBI.getRegBank(RHS, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID)
5606 return std::nullopt;
5607
5608 Register LHS = AddrDef->MI->getOperand(1).getReg();
5609 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
5610
5611 if (OrigAddr != Addr) {
5612 if (!isFlatScratchBaseLegalSVImm(OrigAddr))
5613 return std::nullopt;
5614 } else {
5615 if (!isFlatScratchBaseLegalSV(OrigAddr))
5616 return std::nullopt;
5617 }
5618
5619 if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset))
5620 return std::nullopt;
5621
5622 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
5623 int FI = LHSDef->MI->getOperand(1).getIndex();
5624 return {{
5625 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
5626 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
5627 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
5628 }};
5629 }
5630
5631 if (!isSGPR(LHS))
5632 return std::nullopt;
5633
5634 return {{
5635 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
5636 [=](MachineInstrBuilder &MIB) { MIB.addReg(LHS); }, // saddr
5637 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
5638 }};
5639 }
5640
5641 InstructionSelector::ComplexRendererFns
5642 AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
5643 MachineInstr *MI = Root.getParent();
5644 MachineBasicBlock *MBB = MI->getParent();
5645 MachineFunction *MF = MBB->getParent();
5646 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
5647
5648 int64_t Offset = 0;
5649 if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
5650 Offset != TM.getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) {
5651 Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5652
5653 // TODO: Should this be inside the render function? The iterator seems to
5654 // move.
5655 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
5656 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
5657 HighBits)
5658 .addImm(Offset & ~MaxOffset);
5659
5660 return {{[=](MachineInstrBuilder &MIB) { // rsrc
5661 MIB.addReg(Info->getScratchRSrcReg());
5662 },
5663 [=](MachineInstrBuilder &MIB) { // vaddr
5664 MIB.addReg(HighBits);
5665 },
5666 [=](MachineInstrBuilder &MIB) { // soffset
5667 // Use constant zero for soffset and rely on eliminateFrameIndex
5668 // to choose the appropriate frame register if need be.
5669 MIB.addImm(0);
5670 },
5671 [=](MachineInstrBuilder &MIB) { // offset
5672 MIB.addImm(Offset & MaxOffset);
5673 }}};
5674 }
5675
5676 assert(Offset == 0 || Offset == -1);
5677
5678 // Try to fold a frame index directly into the MUBUF vaddr field, and any
5679 // offsets.
5680 std::optional<int> FI;
5681 Register VAddr = Root.getReg();
5682
5683 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
5684 Register PtrBase;
5685 int64_t ConstOffset;
5686 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(VAddr, *MRI);
5687 if (ConstOffset != 0) {
5688 if (TII.isLegalMUBUFImmOffset(ConstOffset) &&
5689 (!STI.privateMemoryResourceIsRangeChecked() ||
5690 VT->signBitIsZero(PtrBase))) {
5691 const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase);
5692 if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
5693 FI = PtrBaseDef->getOperand(1).getIndex();
5694 else
5695 VAddr = PtrBase;
5696 Offset = ConstOffset;
5697 }
5698 } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
5699 FI = RootDef->getOperand(1).getIndex();
5700 }
5701
5702 return {{[=](MachineInstrBuilder &MIB) { // rsrc
5703 MIB.addReg(Info->getScratchRSrcReg());
5704 },
5705 [=](MachineInstrBuilder &MIB) { // vaddr
5706 if (FI)
5707 MIB.addFrameIndex(*FI);
5708 else
5709 MIB.addReg(VAddr);
5710 },
5711 [=](MachineInstrBuilder &MIB) { // soffset
5712 // Use constant zero for soffset and rely on eliminateFrameIndex
5713 // to choose the appropriate frame register if need be.
5714 MIB.addImm(0);
5715 },
5716 [=](MachineInstrBuilder &MIB) { // offset
5717 MIB.addImm(Offset);
5718 }}};
5719 }
5720
5721 bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
5722 int64_t Offset) const {
5723 if (!isUInt<16>(Offset))
5724 return false;
5725
5726 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
5727 return true;
5728
5729 // On Southern Islands, instructions with a negative base value and an offset
5730 // don't seem to work.
5731 return VT->signBitIsZero(Base);
5732 }
5733
5734 bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
5735 int64_t Offset1,
5736 unsigned Size) const {
5737 if (Offset0 % Size != 0 || Offset1 % Size != 0)
5738 return false;
5739 if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
5740 return false;
5741
5742 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
5743 return true;
5744
5745 // On Southern Islands, instructions with a negative base value and an offset
5746 // don't seem to work.
5747 return VT->signBitIsZero(Base);
5748 }
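// E.g. with Size = 4, offsets 0 and 1020 are legal (1020 / 4 = 255 fits in
// 8 bits), while 1024 is not (1024 / 4 = 256 does not).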
5749
5750 // Return whether the operation has NoUnsignedWrap property.
5751 static bool isNoUnsignedWrap(MachineInstr *Addr) {
5752 return Addr->getOpcode() == TargetOpcode::G_OR ||
5753 (Addr->getOpcode() == TargetOpcode::G_PTR_ADD &&
5754 Addr->getFlag(MachineInstr::NoUWrap));
5755 }
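// Treating G_OR as non-wrapping mirrors the assumption that an address
// built with OR combines disjoint bits, i.e. acts as an add that cannot
// carry; G_PTR_ADD must carry an explicit NoUWrap flag.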
5756
5757 // Check that the base address of a flat scratch load/store in the form of
5758 // `base + offset` is legal to be put in an SGPR/VGPR (i.e. unsigned per the
5759 // hardware requirement). We always treat the first operand as the base address.
5760 bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(Register Addr) const {
5761 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
5762
5763 if (isNoUnsignedWrap(AddrMI))
5764 return true;
5765
5766 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
5767 // values.
5768 if (STI.hasSignedScratchOffsets())
5769 return true;
5770
5771 Register LHS = AddrMI->getOperand(1).getReg();
5772 Register RHS = AddrMI->getOperand(2).getReg();
5773
5774 if (AddrMI->getOpcode() == TargetOpcode::G_PTR_ADD) {
5775 std::optional<ValueAndVReg> RhsValReg =
5776 getIConstantVRegValWithLookThrough(RHS, *MRI);
5777 // If the immediate offset is negative and within a certain range, the base
5778 // address cannot also be negative. If the base is also negative, the sum
5779 // would be either negative or much larger than the valid range of scratch
5780 // memory a thread can access.
5781 if (RhsValReg && RhsValReg->Value.getSExtValue() < 0 &&
5782 RhsValReg->Value.getSExtValue() > -0x40000000)
5783 return true;
5784 }
5785
5786 return VT->signBitIsZero(LHS);
5787 }
5788
5789 // Check that the address values in the SGPR/VGPR are legal for flat scratch
5790 // in the form of: SGPR + VGPR.
5791 bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(Register Addr) const {
5792 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
5793
5794 if (isNoUnsignedWrap(AddrMI))
5795 return true;
5796
5797 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
5798 // values.
5799 if (STI.hasSignedScratchOffsets())
5800 return true;
5801
5802 Register LHS = AddrMI->getOperand(1).getReg();
5803 Register RHS = AddrMI->getOperand(2).getReg();
5804 return VT->signBitIsZero(RHS) && VT->signBitIsZero(LHS);
5805 }
5806
5807 // Check that the address values in the SGPR/VGPR are legal for flat scratch
5808 // in the form of: SGPR + VGPR + Imm.
5809 bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSVImm(
5810 Register Addr) const {
5811 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
5812 // values.
5813 if (STI.hasSignedScratchOffsets())
5814 return true;
5815
5816 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
5817 Register Base = AddrMI->getOperand(1).getReg();
5818 std::optional<DefinitionAndSourceRegister> BaseDef =
5819 getDefSrcRegIgnoringCopies(Base, *MRI);
5820 std::optional<ValueAndVReg> RHSOffset =
5821 getIConstantVRegValWithLookThrough(AddrMI->getOperand(2).getReg(), *MRI);
5822 assert(RHSOffset);
5823
5824 // If the immediate offset is negative and within a certain range, the base
5825 // address cannot also be negative. If the base is also negative, the sum
5826 // would be either negative or much larger than the valid range of scratch
5827 // memory a thread can access.
5828 if (isNoUnsignedWrap(BaseDef->MI) &&
5829 (isNoUnsignedWrap(AddrMI) ||
5830 (RHSOffset->Value.getSExtValue() < 0 &&
5831 RHSOffset->Value.getSExtValue() > -0x40000000)))
5832 return true;
5833
5834 Register LHS = BaseDef->MI->getOperand(1).getReg();
5835 Register RHS = BaseDef->MI->getOperand(2).getReg();
5836 return VT->signBitIsZero(RHS) && VT->signBitIsZero(LHS);
5837 }
5838
5839 bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
5840 unsigned ShAmtBits) const {
5841 assert(MI.getOpcode() == TargetOpcode::G_AND);
5842
5843 std::optional<APInt> RHS =
5844 getIConstantVRegVal(MI.getOperand(2).getReg(), *MRI);
5845 if (!RHS)
5846 return false;
5847
5848 if (RHS->countr_one() >= ShAmtBits)
5849 return true;
5850
5851 const APInt &LHSKnownZeros = VT->getKnownZeroes(MI.getOperand(1).getReg());
5852 return (LHSKnownZeros | *RHS).countr_one() >= ShAmtBits;
5853 }
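// E.g. for a 32-bit shift (ShAmtBits = 5), an AND mask of 0x1f is
// redundant; the LHS's known-zero bits can make other masks redundant too.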
5854
5855 InstructionSelector::ComplexRendererFns
5856 AMDGPUInstructionSelector::selectMUBUFScratchOffset(
5857 MachineOperand &Root) const {
5858 Register Reg = Root.getReg();
5859 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
5860
5861 std::optional<DefinitionAndSourceRegister> Def =
5862 getDefSrcRegIgnoringCopies(Reg, *MRI);
5863 assert(Def && "this shouldn't be an optional result");
5864 Reg = Def->Reg;
5865
5866 if (Register WaveBase = getWaveAddress(Def->MI)) {
5867 return {{
5868 [=](MachineInstrBuilder &MIB) { // rsrc
5869 MIB.addReg(Info->getScratchRSrcReg());
5870 },
5871 [=](MachineInstrBuilder &MIB) { // soffset
5872 MIB.addReg(WaveBase);
5873 },
5874 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // offset
5875 }};
5876 }
5877
5878 int64_t Offset = 0;
5879
5880 // FIXME: Copy check is a hack
5881 Register BasePtr;
5882 if (mi_match(Reg, *MRI,
5883 m_GPtrAdd(m_Reg(BasePtr),
5884 m_any_of(m_ICst(Offset), m_Copy(m_ICst(Offset)))))) {
5885 if (!TII.isLegalMUBUFImmOffset(Offset))
5886 return {};
5887 MachineInstr *BasePtrDef = getDefIgnoringCopies(BasePtr, *MRI);
5888 Register WaveBase = getWaveAddress(BasePtrDef);
5889 if (!WaveBase)
5890 return {};
5891
5892 return {{
5893 [=](MachineInstrBuilder &MIB) { // rsrc
5894 MIB.addReg(Info->getScratchRSrcReg());
5895 },
5896 [=](MachineInstrBuilder &MIB) { // soffset
5897 MIB.addReg(WaveBase);
5898 },
5899 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
5900 }};
5901 }
5902
5903 if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
5904 !TII.isLegalMUBUFImmOffset(Offset))
5905 return {};
5906
5907 return {{
5908 [=](MachineInstrBuilder &MIB) { // rsrc
5909 MIB.addReg(Info->getScratchRSrcReg());
5910 },
5911 [=](MachineInstrBuilder &MIB) { // soffset
5912 MIB.addImm(0);
5913 },
5914 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
5915 }};
5916 }
5917
5918 std::pair<Register, unsigned>
5919 AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
5920 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
5921 int64_t ConstAddr = 0;
5922
5923 Register PtrBase;
5924 int64_t Offset;
5925 std::tie(PtrBase, Offset) =
5926 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
5927
5928 if (Offset) {
5929 if (isDSOffsetLegal(PtrBase, Offset)) {
5930 // (add n0, c0)
5931 return std::pair(PtrBase, Offset);
5932 }
5933 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
5934 // TODO
5935
5936
5937 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
5938 // TODO
5939
5940 }
5941
5942 return std::pair(Root.getReg(), 0);
5943 }
5944
5945 InstructionSelector::ComplexRendererFns
5946 AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
5947 Register Reg;
5948 unsigned Offset;
5949 std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
5950 return {{
5951 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
5952 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
5953 }};
5954 }
5955
5956 InstructionSelector::ComplexRendererFns
5957 AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
5958 return selectDSReadWrite2(Root, 4);
5959 }
5960
5961 InstructionSelector::ComplexRendererFns
5962 AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
5963 return selectDSReadWrite2(Root, 8);
5964 }
5965
5966 InstructionSelector::ComplexRendererFns
5967 AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
5968 unsigned Size) const {
5969 Register Reg;
5970 unsigned Offset;
5971 std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, Size);
5972 return {{
5973 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
5974 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
5975 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
5976 }};
5977 }
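// E.g. a base pointer plus constant offset 8 with Size = 4 yields
// offset0 = 2 and offset1 = 3 above; both offsets are in units of Size.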
5978
5979 std::pair<Register, unsigned>
5980 AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
5981 unsigned Size) const {
5982 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
5983 int64_t ConstAddr = 0;
5984
5985 Register PtrBase;
5986 int64_t Offset;
5987 std::tie(PtrBase, Offset) =
5988 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
5989
5990 if (Offset) {
5991 int64_t OffsetValue0 = Offset;
5992 int64_t OffsetValue1 = Offset + Size;
5993 if (isDSOffset2Legal(PtrBase, OffsetValue0, OffsetValue1, Size)) {
5994 // (add n0, c0)
5995 return std::pair(PtrBase, OffsetValue0 / Size);
5996 }
5997 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
5998 // TODO
5999
6000 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
6001 // TODO
6002
6003 }
6004
6005 return std::pair(Root.getReg(), 0);
6006 }
6007
6008 /// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
6009 /// the base value with the constant offset. There may be intervening copies
6010 /// between \p Root and the identified constant. Returns \p Root, 0 if this does
6011 /// not match the pattern.
6012 std::pair<Register, int64_t>
6013 AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
6014 Register Root, const MachineRegisterInfo &MRI) const {
6015 MachineInstr *RootI = getDefIgnoringCopies(Root, MRI);
6016 if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
6017 return {Root, 0};
6018
6019 MachineOperand &RHS = RootI->getOperand(2);
6020 std::optional<ValueAndVReg> MaybeOffset =
6021 getIConstantVRegValWithLookThrough(RHS.getReg(), MRI);
6022 if (!MaybeOffset)
6023 return {Root, 0};
6024 return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue()};
6025 }
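// For example (illustrative registers):
//   %c:_(s32) = G_CONSTANT i32 16
//   %p:_(p3) = G_PTR_ADD %base, %c
// returns {%base, 16}; any other shape returns {Root, 0}.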
6026
6027 static void addZeroImm(MachineInstrBuilder &MIB) {
6028 MIB.addImm(0);
6029 }
6030
6031 /// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
6032 /// BasePtr is not valid, a null base pointer will be used.
6033 static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
6034 uint32_t FormatLo, uint32_t FormatHi,
6035 Register BasePtr) {
6036 Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6037 Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6038 Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
6039 Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
6040
6041 B.buildInstr(AMDGPU::S_MOV_B32)
6042 .addDef(RSrc2)
6043 .addImm(FormatLo);
6044 B.buildInstr(AMDGPU::S_MOV_B32)
6045 .addDef(RSrc3)
6046 .addImm(FormatHi);
6047
6048 // Build the half of the subregister with the constants before building the
6049 // full 128-bit register. If we are building multiple resource descriptors,
6050 // this will allow CSEing of the 2-component register.
6051 B.buildInstr(AMDGPU::REG_SEQUENCE)
6052 .addDef(RSrcHi)
6053 .addReg(RSrc2)
6054 .addImm(AMDGPU::sub0)
6055 .addReg(RSrc3)
6056 .addImm(AMDGPU::sub1);
6057
6058 Register RSrcLo = BasePtr;
6059 if (!BasePtr) {
6060 RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
6061 B.buildInstr(AMDGPU::S_MOV_B64)
6062 .addDef(RSrcLo)
6063 .addImm(0);
6064 }
6065
6066 B.buildInstr(AMDGPU::REG_SEQUENCE)
6067 .addDef(RSrc)
6068 .addReg(RSrcLo)
6069 .addImm(AMDGPU::sub0_sub1)
6070 .addReg(RSrcHi)
6071 .addImm(AMDGPU::sub2_sub3);
6072
6073 return RSrc;
6074 }
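// The resulting 128-bit descriptor is laid out as:
//   sub0_sub1 = BasePtr (or a zero pointer), sub2 = FormatLo,
//   sub3 = FormatHi
// per the two REG_SEQUENCEs above.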
6075
6076 static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
6077 const SIInstrInfo &TII, Register BasePtr) {
6078 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
6079
6080 // FIXME: Why are half the "default" bits ignored based on the addressing
6081 // mode?
6082 return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
6083 }
6084
6085 static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
6086 const SIInstrInfo &TII, Register BasePtr) {
6087 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
6088
6089 // FIXME: Why are half the "default" bits ignored based on the addressing
6090 // mode?
6091 return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
6092 }
6093
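/// Decompose a MUBUF address into the components tracked by MUBUFAddressData.
/// As an illustrative sketch (virtual register names invented), an address of
/// the form
///   %sum:_(p1) = G_PTR_ADD %n2, %n3
///   %src:_(p1) = G_PTR_ADD %sum, (G_CONSTANT i64 64)
/// parses to N0 = %sum, N2 = %n2, N3 = %n3, Offset = 64.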
AMDGPUInstructionSelector::MUBUFAddressData
AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
  MUBUFAddressData Data;
  Data.N0 = Src;

  Register PtrBase;
  int64_t Offset;

  std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI);
  if (isUInt<32>(Offset)) {
    Data.N0 = PtrBase;
    Data.Offset = Offset;
  }

  if (MachineInstr *InputAdd
      = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
    Data.N2 = InputAdd->getOperand(1).getReg();
    Data.N3 = InputAdd->getOperand(2).getReg();

    // FIXME: Need to fix extra SGPR->VGPR copies inserted
    // FIXME: Don't know that this was defined by operand 0
    //
    // TODO: Remove this when we have copy folding optimizations after
    // RegBankSelect.
    Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
    Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
  }

  return Data;
}

/// Return true if the addr64 MUBUF mode should be used for the given address.
bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
  // (ptr_add N2, N3) -> addr64, or
  // (ptr_add (ptr_add N2, N3), C1) -> addr64
  if (Addr.N2)
    return true;

  const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
  return N0Bank->getID() == AMDGPU::VGPRRegBankID;
}

/// Split an immediate offset \p ImmOffset depending on whether it fits in the
/// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
/// component.
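/// E.g. if \p ImmOffset does not pass isLegalMUBUFImmOffset for this
/// subtarget, the whole value is moved into an SGPR via S_MOV_B32 and
/// \p ImmOffset is reset to 0; despite the name, no partial split is
/// performed.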
void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
    MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
  if (TII.isLegalMUBUFImmOffset(ImmOffset))
    return;

  // Illegal offset, store it in soffset.
  SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  B.buildInstr(AMDGPU::S_MOV_B32)
      .addDef(SOffset)
      .addImm(ImmOffset);
  ImmOffset = 0;
}

bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
    MachineOperand &Root, Register &VAddr, Register &RSrcReg,
    Register &SOffset, int64_t &Offset) const {
  // FIXME: Predicates should stop this from reaching here.
  // addr64 bit was removed for Volcanic Islands.
  if (!STI.hasAddr64() || STI.useFlatForGlobal())
    return false;

  MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
  if (!shouldUseAddr64(AddrData))
    return false;

  Register N0 = AddrData.N0;
  Register N2 = AddrData.N2;
  Register N3 = AddrData.N3;
  Offset = AddrData.Offset;

  // Base pointer for the SRD.
  Register SRDPtr;

  if (N2) {
    if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
      assert(N3);
      if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
        // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
        // addr64, and construct the default resource from a 0 address.
        VAddr = N0;
      } else {
        SRDPtr = N3;
        VAddr = N2;
      }
    } else {
      // N2 is not divergent.
      SRDPtr = N2;
      VAddr = N3;
    }
  } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
    // Use the default null pointer in the resource.
    VAddr = N0;
  } else {
    // N0 -> offset, or
    // (N0 + C1) -> offset
    SRDPtr = N0;
  }

  MachineIRBuilder B(*Root.getParent());
  RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
  splitIllegalMUBUFOffset(B, SOffset, Offset);
  return true;
}

bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
    MachineOperand &Root, Register &RSrcReg, Register &SOffset,
    int64_t &Offset) const {
  // FIXME: Pattern should not reach here.
  if (STI.useFlatForGlobal())
    return false;

  MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
  if (shouldUseAddr64(AddrData))
    return false;

  // N0 -> offset, or
  // (N0 + C1) -> offset
  Register SRDPtr = AddrData.N0;
  Offset = AddrData.Offset;

  // TODO: Look through extensions for 32-bit soffset.
  MachineIRBuilder B(*Root.getParent());

  RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
  splitIllegalMUBUFOffset(B, SOffset, Offset);
  return true;
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
  Register VAddr;
  Register RSrcReg;
  Register SOffset;
  int64_t Offset = 0;

  if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
    return {};

  // FIXME: Use defaulted operands for trailing 0s and remove from the complex
  // pattern.
  return {{
      [=](MachineInstrBuilder &MIB) { // rsrc
        MIB.addReg(RSrcReg);
      },
      [=](MachineInstrBuilder &MIB) { // vaddr
        MIB.addReg(VAddr);
      },
      [=](MachineInstrBuilder &MIB) { // soffset
        if (SOffset)
          MIB.addReg(SOffset);
        else if (STI.hasRestrictedSOffset())
          MIB.addReg(AMDGPU::SGPR_NULL);
        else
          MIB.addImm(0);
      },
      [=](MachineInstrBuilder &MIB) { // offset
        MIB.addImm(Offset);
      },
      addZeroImm, // cpol
      addZeroImm, // tfe
      addZeroImm  // swz
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
  Register RSrcReg;
  Register SOffset;
  int64_t Offset = 0;

  if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
    return {};

  return {{
      [=](MachineInstrBuilder &MIB) { // rsrc
        MIB.addReg(RSrcReg);
      },
      [=](MachineInstrBuilder &MIB) { // soffset
        if (SOffset)
          MIB.addReg(SOffset);
        else if (STI.hasRestrictedSOffset())
          MIB.addReg(AMDGPU::SGPR_NULL);
        else
          MIB.addImm(0);
      },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
      addZeroImm, // cpol
      addZeroImm, // tfe
      addZeroImm, // swz
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectBUFSOffset(MachineOperand &Root) const {
  Register SOffset = Root.getReg();

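  // If the subtarget restricts the soffset operand, fold a soffset that is a
  // materialized constant zero to the null register instead.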
  if (STI.hasRestrictedSOffset() && mi_match(SOffset, *MRI, m_ZeroInt()))
    SOffset = AMDGPU::SGPR_NULL;

  return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
}

/// Get an immediate that must be 32 bits, and treated as zero extended.
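/// For example, a looked-through G_CONSTANT of i32 -1 is read back sign
/// extended as -1, passes the isInt<32> check, and is returned as 0xffffffff.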
static std::optional<uint64_t>
getConstantZext32Val(Register Reg, const MachineRegisterInfo &MRI) {
  // getIConstantVRegVal sexts any values, so see if that matters.
  std::optional<int64_t> OffsetVal = getIConstantVRegSExtVal(Reg, MRI);
  if (!OffsetVal || !isInt<32>(*OffsetVal))
    return std::nullopt;
  return Lo_32(*OffsetVal);
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
  std::optional<uint64_t> OffsetVal =
      Root.isImm() ? Root.getImm() : getConstantZext32Val(Root.getReg(), *MRI);
  if (!OffsetVal)
    return {};

  std::optional<int64_t> EncodedImm =
      AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
  if (!EncodedImm)
    return {};

  return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }}};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
  assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);

  std::optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
  if (!OffsetVal)
    return {};

  std::optional<int64_t> EncodedImm =
      AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal);
  if (!EncodedImm)
    return {};

  return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }}};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
  // Match the (soffset + offset) pair as a 32-bit register base and
  // an immediate offset.
  Register SOffset;
  unsigned Offset;
  std::tie(SOffset, Offset) = AMDGPU::getBaseWithConstantOffset(
      *MRI, Root.getReg(), VT, /*CheckNUW*/ true);
  if (!SOffset)
    return std::nullopt;

  std::optional<int64_t> EncodedOffset =
      AMDGPU::getSMRDEncodedOffset(STI, Offset, /* IsBuffer */ true);
  if (!EncodedOffset)
    return std::nullopt;

  assert(MRI->getType(SOffset) == LLT::scalar(32));
  return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
           [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedOffset); }}};
}

std::pair<Register, unsigned>
AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
                                                     bool &Matched) const {
  Matched = false;

  Register Src;
  unsigned Mods;
  std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());

  if (mi_match(Src, *MRI, m_GFPExt(m_Reg(Src)))) {
    assert(MRI->getType(Src) == LLT::scalar(16));

    // Only change Src if a src modifier could be gained. In such cases the
    // new Src could be an sgpr, but this does not violate the constant bus
    // restriction for the instruction that is being selected.
    Src = stripBitCast(Src, *MRI);

    const auto CheckAbsNeg = [&]() {
      // Be careful about folding modifiers if we already have an abs. fneg is
      // applied last, so we don't want to apply an earlier fneg.
      if ((Mods & SISrcMods::ABS) == 0) {
        unsigned ModsTmp;
        std::tie(Src, ModsTmp) = selectVOP3ModsImpl(Src);

        if ((ModsTmp & SISrcMods::NEG) != 0)
          Mods ^= SISrcMods::NEG;

        if ((ModsTmp & SISrcMods::ABS) != 0)
          Mods |= SISrcMods::ABS;
      }
    };

    CheckAbsNeg();

    // op_sel/op_sel_hi decide the source type and source.
    // If the source's op_sel_hi is set, it indicates to do a conversion from
    // fp16. If the source's op_sel is set, it picks the high half of the
    // source register.
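    // E.g. when the source is fpext(hi half of %reg): OP_SEL_1 requests the
    // f16-to-f32 conversion, and OP_SEL_0 (set below when isExtractHiElt
    // matches) selects the high 16 bits of %reg.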

    Mods |= SISrcMods::OP_SEL_1;

    if (isExtractHiElt(*MRI, Src, Src)) {
      Mods |= SISrcMods::OP_SEL_0;
      CheckAbsNeg();
    }

    Matched = true;
  }

  return {Src, Mods};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3PMadMixModsExt(
    MachineOperand &Root) const {
  Register Src;
  unsigned Mods;
  bool Matched;
  std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
  if (!Matched)
    return {};

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3PMadMixMods(MachineOperand &Root) const {
  Register Src;
  unsigned Mods;
  bool Matched;
  std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
  }};
}

bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst(
    MachineInstr &I, Intrinsic::ID IntrID) const {
  MachineBasicBlock *MBB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  Register CCReg = I.getOperand(0).getReg();

  // Set SCC to true, in case the barrier instruction gets converted to a NOP.
  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_CMP_EQ_U32)).addImm(0).addImm(0);

  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM))
      .addImm(I.getOperand(2).getImm());

  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);

  I.eraseFromParent();
  return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass,
                                      *MRI);
}

bool AMDGPUInstructionSelector::selectSGetBarrierState(
    MachineInstr &I, Intrinsic::ID IntrID) const {
  MachineBasicBlock *MBB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  MachineOperand BarOp = I.getOperand(2);
  std::optional<int64_t> BarValImm =
      getIConstantVRegSExtVal(BarOp.getReg(), *MRI);

  if (!BarValImm) {
    auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
                       .addReg(BarOp.getReg());
    constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
  }
  MachineInstrBuilder MIB;
  unsigned Opc = BarValImm ? AMDGPU::S_GET_BARRIER_STATE_IMM
                           : AMDGPU::S_GET_BARRIER_STATE_M0;
  MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));

  auto DstReg = I.getOperand(0).getReg();
  const TargetRegisterClass *DstRC =
      TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
  if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;
  MIB.addDef(DstReg);
  if (BarValImm) {
    MIB.addImm(*BarValImm);
  }
  I.eraseFromParent();
  return true;
}

unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
  if (HasInlineConst) {
    switch (IntrID) {
    default:
      llvm_unreachable("not a named barrier op");
    case Intrinsic::amdgcn_s_get_named_barrier_state:
      return AMDGPU::S_GET_BARRIER_STATE_IMM;
    }
  } else {
    switch (IntrID) {
    default:
      llvm_unreachable("not a named barrier op");
    case Intrinsic::amdgcn_s_get_named_barrier_state:
      return AMDGPU::S_GET_BARRIER_STATE_M0;
    }
  }
}

bool AMDGPUInstructionSelector::selectNamedBarrierInit(
    MachineInstr &I, Intrinsic::ID IntrID) const {
  MachineBasicBlock *MBB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  MachineOperand BarOp = I.getOperand(1);
  MachineOperand CntOp = I.getOperand(2);

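  // Worked example (illustrative values): BarOp = 0x57 gives
  // BarID = (0x57 >> 4) & 0x3F = 5; with CntOp = 8, the value written to M0
  // below is (8 << 16) | 5 = 0x80005.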
  // BarID = (BarOp >> 4) & 0x3F
  Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
      .add(BarOp)
      .addImm(4u)
      .setOperandDead(3); // Dead scc

  Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
      .addReg(TmpReg0)
      .addImm(0x3F)
      .setOperandDead(3); // Dead scc

  // MO = ((CntOp & 0x3F) << ShAmt) | BarID
  Register TmpReg2 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg2)
      .add(CntOp)
      .addImm(0x3F)
      .setOperandDead(3); // Dead scc

  Register TmpReg3 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  constexpr unsigned ShAmt = 16;
  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg3)
      .addReg(TmpReg2)
      .addImm(ShAmt)
      .setOperandDead(3); // Dead scc

  Register TmpReg4 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_OR_B32), TmpReg4)
      .addReg(TmpReg1)
      .addReg(TmpReg3)
      .setOperandDead(3); // Dead scc

  auto CopyMIB =
      BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(TmpReg4);
  constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);

  MachineInstrBuilder MIB;
  MIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_M0));

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectNamedBarrierInst(
    MachineInstr &I, Intrinsic::ID IntrID) const {
  MachineBasicBlock *MBB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  MachineOperand BarOp = IntrID == Intrinsic::amdgcn_s_get_named_barrier_state
                             ? I.getOperand(2)
                             : I.getOperand(1);
  std::optional<int64_t> BarValImm =
      getIConstantVRegSExtVal(BarOp.getReg(), *MRI);

  if (!BarValImm) {
    // BarID = (BarOp >> 4) & 0x3F
    Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
        .addReg(BarOp.getReg())
        .addImm(4u)
        .setOperandDead(3); // Dead scc

    Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
        .addReg(TmpReg0)
        .addImm(0x3F)
        .setOperandDead(3); // Dead scc

    auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
                       .addReg(TmpReg1);
    constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
  }

  MachineInstrBuilder MIB;
  unsigned Opc = getNamedBarrierOp(BarValImm.has_value(), IntrID);
  MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));

  if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
    auto DstReg = I.getOperand(0).getReg();
    const TargetRegisterClass *DstRC =
        TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
    if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
      return false;
    MIB.addDef(DstReg);
  }

  if (BarValImm) {
    auto BarId = ((*BarValImm) >> 4) & 0x3F;
    MIB.addImm(BarId);
  }

  I.eraseFromParent();
  return true;
}

void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
}

void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
                                                const MachineInstr &MI,
                                                int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
}

void AMDGPUInstructionSelector::renderBitcastFPImm(MachineInstrBuilder &MIB,
                                                   const MachineInstr &MI,
                                                   int OpIdx) const {
  const MachineOperand &Op = MI.getOperand(1);
  assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1);
  MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
}

void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
                                                const MachineInstr &MI,
                                                int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  MIB.addImm(MI.getOperand(1).getCImm()->getValue().popcount());
}

/// This only really exists to satisfy DAG type checking machinery, so is a
/// no-op here.
void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
                                                const MachineInstr &MI,
                                                int OpIdx) const {
  const MachineOperand &Op = MI.getOperand(OpIdx);
  int64_t Imm;
  if (Op.isReg() && mi_match(Op.getReg(), *MRI, m_ICst(Imm)))
    MIB.addImm(Imm);
  else
    MIB.addImm(Op.getImm());
}

void AMDGPUInstructionSelector::renderZextBoolTImm(MachineInstrBuilder &MIB,
                                                   const MachineInstr &MI,
                                                   int OpIdx) const {
  MIB.addImm(MI.getOperand(OpIdx).getImm() != 0);
}

void AMDGPUInstructionSelector::renderOpSelTImm(MachineInstrBuilder &MIB,
                                                const MachineInstr &MI,
                                                int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)SISrcMods::OP_SEL_0 : 0);
}

void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_0(
    MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm(
      (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
}

void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_1(
    MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2)
                 ? (int64_t)(SISrcMods::OP_SEL_0 | SISrcMods::DST_OP_SEL)
                 : (int64_t)SISrcMods::DST_OP_SEL);
}

void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_0(
    MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm(
      (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
}

void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_1(
    MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x1)
                 ? (int64_t)(SISrcMods::OP_SEL_0)
                 : 0);
}

void AMDGPUInstructionSelector::renderDstSelToOpSelXForm(
    MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)(SISrcMods::DST_OP_SEL)
                                           : 0);
}

void AMDGPUInstructionSelector::renderSrcSelToOpSelXForm(
    MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)(SISrcMods::OP_SEL_0)
                                           : 0);
}

void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_2_0(
    MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm(
      (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
}

void AMDGPUInstructionSelector::renderDstSelToOpSel3XFormXForm(
    MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm(
      (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::DST_OP_SEL
                                            : 0);
}

void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
                                                  const MachineInstr &MI,
                                                  int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm(MI.getOperand(OpIdx).getImm() &
             (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
                                       : AMDGPU::CPol::ALL_pregfx12));
}

void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  const bool Swizzle = MI.getOperand(OpIdx).getImm() &
                       (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::SWZ
                                                 : AMDGPU::CPol::SWZ_pregfx12);
  MIB.addImm(Swizzle);
}

void AMDGPUInstructionSelector::renderExtractCpolSetGLC(
    MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  const uint32_t Cpol = MI.getOperand(OpIdx).getImm() &
                        (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
                                                  : AMDGPU::CPol::ALL_pregfx12);
  MIB.addImm(Cpol | AMDGPU::CPol::GLC);
}

void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  MIB.addFrameIndex(MI.getOperand(1).getIndex());
}

void AMDGPUInstructionSelector::renderFPPow2ToExponent(MachineInstrBuilder &MIB,
                                                       const MachineInstr &MI,
                                                       int OpIdx) const {
  const APFloat &APF = MI.getOperand(1).getFPImm()->getValueAPF();
  int ExpVal = APF.getExactLog2Abs();
  assert(ExpVal != INT_MIN);
  MIB.addImm(ExpVal);
}

void AMDGPUInstructionSelector::renderRoundMode(MachineInstrBuilder &MIB,
                                                const MachineInstr &MI,
                                                int OpIdx) const {
  // "round.towardzero" -> TowardZero 0        -> FP_ROUND_ROUND_TO_ZERO 3
  // "round.tonearest"  -> NearestTiesToEven 1 -> FP_ROUND_ROUND_TO_NEAREST 0
  // "round.upward"     -> TowardPositive 2    -> FP_ROUND_ROUND_TO_INF 1
  // "round.downward"   -> TowardNegative 3    -> FP_ROUND_ROUND_TO_NEGINF 2
  MIB.addImm((MI.getOperand(OpIdx).getImm() + 3) % 4);
}

/// Convert a 2-bit value to the enum values used for op_sel* source modifiers.
void AMDGPUInstructionSelector::renderScaledMAIIntrinsicOperand(
    MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
  unsigned Val = MI.getOperand(OpIdx).getImm();
  unsigned New = 0;
  if (Val & 0x1)
    New |= SISrcMods::OP_SEL_0;
  if (Val & 0x2)
    New |= SISrcMods::OP_SEL_1;
  MIB.addImm(New);
}

bool AMDGPUInstructionSelector::isInlineImmediate(const APInt &Imm) const {
  return TII.isInlineConstant(Imm);
}

bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
  return TII.isInlineConstant(Imm);
}