//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the InstructionSelector class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPUInstructionSelector.h"
#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include <optional>

#define DEBUG_TYPE "amdgpu-isel"

using namespace llvm;
using namespace MIPatternMatch;

#define GET_GLOBALISEL_IMPL
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_IMPL
#undef AMDGPUSubtarget

AMDGPUInstructionSelector::AMDGPUInstructionSelector(
    const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
    const AMDGPUTargetMachine &TM)
    : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
      STI(STI),
      EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG),
#define GET_GLOBALISEL_PREDICATES_INIT
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_INIT
#define GET_GLOBALISEL_TEMPORARIES_INIT
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_TEMPORARIES_INIT
{
}

const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }

void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits *KB,
                                        CodeGenCoverage *CoverageInfo,
                                        ProfileSummaryInfo *PSI,
                                        BlockFrequencyInfo *BFI) {
  MRI = &MF.getRegInfo();
  Subtarget = &MF.getSubtarget<GCNSubtarget>();
  Subtarget->checkSubtargetFeatures(MF.getFunction());
  InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI);
}

// Return the wave level SGPR base address if this is a wave address.
static Register getWaveAddress(const MachineInstr *Def) {
  return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS
             ? Def->getOperand(1).getReg()
             : Register();
}

bool AMDGPUInstructionSelector::isVCC(Register Reg,
                                      const MachineRegisterInfo &MRI) const {
  // The verifier is oblivious to s1 being a valid value for wavesize registers.
  if (Reg.isPhysical())
    return false;

  auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
  const TargetRegisterClass *RC =
      RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
  if (RC) {
    const LLT Ty = MRI.getType(Reg);
    if (!Ty.isValid() || Ty.getSizeInBits() != 1)
      return false;
    // G_TRUNC s1 result is never vcc.
    return MRI.getVRegDef(Reg)->getOpcode() != AMDGPU::G_TRUNC &&
           RC->hasSuperClassEq(TRI.getBoolRC());
  }

  const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
  return RB->getID() == AMDGPU::VCCRegBankID;
}

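// Lower a copy-like intrinsic (wqm, softwqm, strict.wwm, strict.wqm) to the
// given pseudo opcode, adding the implicit EXEC use and constraining source
// and destination to a common register class.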
bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
                                                        unsigned NewOpc) const {
  MI.setDesc(TII.get(NewOpc));
  MI.removeOperand(1); // Remove intrinsic ID.
  MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));

  MachineOperand &Dst = MI.getOperand(0);
  MachineOperand &Src = MI.getOperand(1);

  // TODO: This should be legalized to s32 if needed
  if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
    return false;

  const TargetRegisterClass *DstRC
    = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
  const TargetRegisterClass *SrcRC
    = TRI.getConstrainedRegClassForOperand(Src, *MRI);
  if (!DstRC || DstRC != SrcRC)
    return false;

  return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
         RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
}

bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
  const DebugLoc &DL = I.getDebugLoc();
  MachineBasicBlock *BB = I.getParent();
  I.setDesc(TII.get(TargetOpcode::COPY));

  const MachineOperand &Src = I.getOperand(1);
  MachineOperand &Dst = I.getOperand(0);
  Register DstReg = Dst.getReg();
  Register SrcReg = Src.getReg();

  if (isVCC(DstReg, *MRI)) {
    if (SrcReg == AMDGPU::SCC) {
      const TargetRegisterClass *RC
        = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
      if (!RC)
        return true;
      return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
    }

    if (!isVCC(SrcReg, *MRI)) {
      // TODO: Should probably leave the copy and let copyPhysReg expand it.
      if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
        return false;

      const TargetRegisterClass *SrcRC
        = TRI.getConstrainedRegClassForOperand(Src, *MRI);

      std::optional<ValueAndVReg> ConstVal =
          getIConstantVRegValWithLookThrough(SrcReg, *MRI, true);
      if (ConstVal) {
        unsigned MovOpc =
            STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
        BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg)
            .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
      } else {
        Register MaskedReg = MRI->createVirtualRegister(SrcRC);

        // We can't trust the high bits at this point, so clear them.

        // TODO: Skip masking high bits if def is known boolean.

        bool IsSGPR = TRI.isSGPRClass(SrcRC);
        unsigned AndOpc =
            IsSGPR ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
        auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
            .addImm(1)
            .addReg(SrcReg);
        if (IsSGPR)
          And.setOperandDead(3); // Dead scc

        BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
            .addImm(0)
            .addReg(MaskedReg);
      }

      if (!MRI->getRegClassOrNull(SrcReg))
        MRI->setRegClass(SrcReg, SrcRC);
      I.eraseFromParent();
      return true;
    }

    const TargetRegisterClass *RC =
        TRI.getConstrainedRegClassForOperand(Dst, *MRI);
    if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
      return false;

    return true;
  }

  for (const MachineOperand &MO : I.operands()) {
    if (MO.getReg().isPhysical())
      continue;

    const TargetRegisterClass *RC =
        TRI.getConstrainedRegClassForOperand(MO, *MRI);
    if (!RC)
      continue;
    RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
  }
  return true;
}

bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
  const Register DefReg = I.getOperand(0).getReg();
  const LLT DefTy = MRI->getType(DefReg);

  // S1 G_PHIs should not be selected in instruction-select, instead:
  // - divergent S1 G_PHI should go through lane mask merging algorithm
  //   and be fully inst-selected in AMDGPUGlobalISelDivergenceLowering
  // - uniform S1 G_PHI should be lowered into S32 G_PHI in AMDGPURegBankSelect
  if (DefTy == LLT::scalar(1))
    return false;

  // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)

  const RegClassOrRegBank &RegClassOrBank =
    MRI->getRegClassOrRegBank(DefReg);

  const TargetRegisterClass *DefRC
    = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
  if (!DefRC) {
    if (!DefTy.isValid()) {
      LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
      return false;
    }

    const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
    DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB);
    if (!DefRC) {
      LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
      return false;
    }
  }

  // TODO: Verify that all registers have the same bank
  I.setDesc(TII.get(TargetOpcode::PHI));
  return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
}

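// Return a MachineOperand for the requested 32-bit half (sub0 or sub1) of a
// 64-bit register or immediate operand, emitting a COPY for register inputs.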
MachineOperand
AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
                                           const TargetRegisterClass &SubRC,
                                           unsigned SubIdx) const {

  MachineInstr *MI = MO.getParent();
  MachineBasicBlock *BB = MO.getParent()->getParent();
  Register DstReg = MRI->createVirtualRegister(&SubRC);

  if (MO.isReg()) {
    unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
    Register Reg = MO.getReg();
    BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
      .addReg(Reg, 0, ComposedSubIdx);

    return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
                                     MO.isKill(), MO.isDead(), MO.isUndef(),
                                     MO.isEarlyClobber(), 0, MO.isDebug(),
                                     MO.isInternalRead());
  }

  assert(MO.isImm());

  APInt Imm(64, MO.getImm());

  switch (SubIdx) {
  default:
    llvm_unreachable("do not know how to split immediate with this sub index.");
  case AMDGPU::sub0:
    return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
  case AMDGPU::sub1:
    return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
  }
}

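// Map a generic G_AND/G_OR/G_XOR opcode to the corresponding 32-bit or 64-bit
// scalar ALU opcode.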
static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
  switch (Opc) {
  case AMDGPU::G_AND:
    return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
  case AMDGPU::G_OR:
    return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
  case AMDGPU::G_XOR:
    return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
  default:
    llvm_unreachable("not a bit op");
  }
}

bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
  Register DstReg = I.getOperand(0).getReg();
  unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);

  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
      DstRB->getID() != AMDGPU::VCCRegBankID)
    return false;

  bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
                            STI.isWave64());
  I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));

  // Dead implicit-def of scc
  I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
                                         true,  // isImp
                                         false, // isKill
                                         true)); // isDead
  return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  Register DstReg = I.getOperand(0).getReg();
  const DebugLoc &DL = I.getDebugLoc();
  LLT Ty = MRI->getType(DstReg);
  if (Ty.isVector())
    return false;

  unsigned Size = Ty.getSizeInBits();
  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
  const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;

  if (Size == 32) {
    if (IsSALU) {
      const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
      MachineInstr *Add =
        BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
          .add(I.getOperand(1))
          .add(I.getOperand(2))
          .setOperandDead(3); // Dead scc
      I.eraseFromParent();
      return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
    }

    if (STI.hasAddNoCarry()) {
      const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
      I.setDesc(TII.get(Opc));
      I.addOperand(*MF, MachineOperand::CreateImm(0));
      I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
      return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
    }

    const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;

    Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
    MachineInstr *Add
      = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
          .addDef(UnusedCarry, RegState::Dead)
          .add(I.getOperand(1))
          .add(I.getOperand(2))
          .addImm(0);
    I.eraseFromParent();
    return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
  }

  assert(!Sub && "illegal sub should not reach here");

  const TargetRegisterClass &RC
    = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
  const TargetRegisterClass &HalfRC
    = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;

  MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
  MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
  MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
  MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));

  Register DstLo = MRI->createVirtualRegister(&HalfRC);
  Register DstHi = MRI->createVirtualRegister(&HalfRC);

  if (IsSALU) {
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
      .add(Lo1)
      .add(Lo2);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
      .add(Hi1)
      .add(Hi2)
      .setOperandDead(3); // Dead scc
  } else {
    const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
    Register CarryReg = MRI->createVirtualRegister(CarryRC);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
      .addDef(CarryReg)
      .add(Lo1)
      .add(Lo2)
      .addImm(0);
    MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
      .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
      .add(Hi1)
      .add(Hi2)
      .addReg(CarryReg, RegState::Kill)
      .addImm(0);

    if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
      return false;
  }

  BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
    .addReg(DstLo)
    .addImm(AMDGPU::sub0)
    .addReg(DstHi)
    .addImm(AMDGPU::sub1);

  if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
    return false;

  I.eraseFromParent();
  return true;
}

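// Select G_UADDO/G_USUBO/G_UADDE/G_USUBE. A VCC-bank carry is selected to the
// VALU carry add/sub instructions; otherwise the carry is modeled with SCC and
// copied to/from a 32-bit SGPR.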
bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
    MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  const DebugLoc &DL = I.getDebugLoc();
  Register Dst0Reg = I.getOperand(0).getReg();
  Register Dst1Reg = I.getOperand(1).getReg();
  const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
                     I.getOpcode() == AMDGPU::G_UADDE;
  const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
                          I.getOpcode() == AMDGPU::G_USUBE;

  if (isVCC(Dst1Reg, *MRI)) {
    unsigned NoCarryOpc =
        IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
    unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
    I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
    I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
    I.addOperand(*MF, MachineOperand::CreateImm(0));
    return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
  }

  Register Src0Reg = I.getOperand(2).getReg();
  Register Src1Reg = I.getOperand(3).getReg();

  if (HasCarryIn) {
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
      .addReg(I.getOperand(4).getReg());
  }

  unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
  unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;

  auto CarryInst = BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
    .add(I.getOperand(2))
    .add(I.getOperand(3));

  if (MRI->use_nodbg_empty(Dst1Reg)) {
    CarryInst.setOperandDead(3); // Dead scc
  } else {
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
      .addReg(AMDGPU::SCC);
    if (!MRI->getRegClassOrNull(Dst1Reg))
      MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
  }

  if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  if (HasCarryIn &&
      !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
                                    AMDGPU::SReg_32RegClass, *MRI))
    return false;

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32(
    MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;

  unsigned Opc;
  if (Subtarget->hasMADIntraFwdBug())
    Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64
                     : AMDGPU::V_MAD_I64_I32_gfx11_e64;
  else
    Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64;
  I.setDesc(TII.get(Opc));
  I.addOperand(*MF, MachineOperand::CreateImm(0));
  I.addImplicitDefUseOperands(*MF);
  return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}

// TODO: We should probably legalize these to only using 32-bit results.
bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();
  LLT DstTy = MRI->getType(DstReg);
  LLT SrcTy = MRI->getType(SrcReg);
  const unsigned SrcSize = SrcTy.getSizeInBits();
  unsigned DstSize = DstTy.getSizeInBits();

  // TODO: Should handle any multiple of 32 offset.
  unsigned Offset = I.getOperand(2).getImm();
  if (Offset % 32 != 0 || DstSize > 128)
    return false;

  // 16-bit operations really use 32-bit registers.
  // FIXME: Probably should not allow 16-bit G_EXTRACT results.
  if (DstSize == 16)
    DstSize = 32;

  const TargetRegisterClass *DstRC =
    TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
  if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;

  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
  const TargetRegisterClass *SrcRC =
      TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
  if (!SrcRC)
    return false;
  unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32,
                                                         DstSize / 32);
  SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
  if (!SrcRC)
    return false;

  SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
                                    *SrcRC, I.getOperand(1));
  const DebugLoc &DL = I.getDebugLoc();
  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
    .addReg(SrcReg, 0, SubReg);

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
  MachineBasicBlock *BB = MI.getParent();
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI->getType(DstReg);
  LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());

  const unsigned SrcSize = SrcTy.getSizeInBits();
  if (SrcSize < 32)
    return selectImpl(MI, *CoverageInfo);

  const DebugLoc &DL = MI.getDebugLoc();
  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
  const unsigned DstSize = DstTy.getSizeInBits();
  const TargetRegisterClass *DstRC =
      TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
  if (!DstRC)
    return false;

  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
  MachineInstrBuilder MIB =
    BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
  for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
    MachineOperand &Src = MI.getOperand(I + 1);
    MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
    MIB.addImm(SubRegs[I]);

    const TargetRegisterClass *SrcRC
      = TRI.getConstrainedRegClassForOperand(Src, *MRI);
    if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
      return false;
  }

  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;

  MI.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
  MachineBasicBlock *BB = MI.getParent();
  const int NumDst = MI.getNumOperands() - 1;

  MachineOperand &Src = MI.getOperand(NumDst);

  Register SrcReg = Src.getReg();
  Register DstReg0 = MI.getOperand(0).getReg();
  LLT DstTy = MRI->getType(DstReg0);
  LLT SrcTy = MRI->getType(SrcReg);

  const unsigned DstSize = DstTy.getSizeInBits();
  const unsigned SrcSize = SrcTy.getSizeInBits();
  const DebugLoc &DL = MI.getDebugLoc();
  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);

  const TargetRegisterClass *SrcRC =
      TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
  if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
    return false;

  // Note we could have mixed SGPR and VGPR destination banks for an SGPR
  // source, and this relies on the fact that the same subregister indices are
  // used for both.
  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
  for (int I = 0, E = NumDst; I != E; ++I) {
    MachineOperand &Dst = MI.getOperand(I);
    BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
      .addReg(SrcReg, 0, SubRegs[I]);

    // Make sure the subregister index is valid for the source register.
    SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
    if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
      return false;

    const TargetRegisterClass *DstRC =
      TRI.getConstrainedRegClassForOperand(Dst, *MRI);
    if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
      return false;
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
  assert(MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC ||
         MI.getOpcode() == AMDGPU::G_BUILD_VECTOR);

  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();
  LLT SrcTy = MRI->getType(Src0);
  const unsigned SrcSize = SrcTy.getSizeInBits();

  // BUILD_VECTOR with >=32 bits source is handled by MERGE_VALUE.
  if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR && SrcSize >= 32) {
    return selectG_MERGE_VALUES(MI);
  }

  // Selection logic below is for V2S16 only.
  // For G_BUILD_VECTOR_TRUNC, additionally check that the operands are s32.
  Register Dst = MI.getOperand(0).getReg();
  if (MRI->getType(Dst) != LLT::fixed_vector(2, 16) ||
      (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC &&
       SrcTy != LLT::scalar(32)))
    return selectImpl(MI, *CoverageInfo);

  const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
  if (DstBank->getID() == AMDGPU::AGPRRegBankID)
    return false;

  assert(DstBank->getID() == AMDGPU::SGPRRegBankID ||
         DstBank->getID() == AMDGPU::VGPRRegBankID);
  const bool IsVector = DstBank->getID() == AMDGPU::VGPRRegBankID;

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *BB = MI.getParent();

  // First, before trying TableGen patterns, check if both sources are
  // constants. In those cases, we can trivially compute the final constant
  // and emit a simple move.
  auto ConstSrc1 = getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
  if (ConstSrc1) {
    auto ConstSrc0 =
        getAnyConstantVRegValWithLookThrough(Src0, *MRI, true, true);
    if (ConstSrc0) {
      const int64_t K0 = ConstSrc0->Value.getSExtValue();
      const int64_t K1 = ConstSrc1->Value.getSExtValue();
      uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff;
      uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff;
      uint32_t Imm = Lo16 | (Hi16 << 16);

      // VALU
      if (IsVector) {
        BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), Dst).addImm(Imm);
        MI.eraseFromParent();
        return RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI);
      }

      // SALU
      BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst).addImm(Imm);
      MI.eraseFromParent();
      return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
    }
  }

  // Now try TableGen patterns.
  if (selectImpl(MI, *CoverageInfo))
    return true;

  // TODO: This should probably be a combine somewhere
  // (build_vector $src0, undef) -> copy $src0
  MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
  if (Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
    MI.setDesc(TII.get(AMDGPU::COPY));
    MI.removeOperand(2);
    const auto &RC =
      IsVector ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
    return RBI.constrainGenericRegister(Dst, RC, *MRI) &&
           RBI.constrainGenericRegister(Src0, RC, *MRI);
  }

  // TODO: Can be improved?
  if (IsVector) {
    Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    auto MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_AND_B32_e32), TmpReg)
                   .addImm(0xFFFF)
                   .addReg(Src0);
    if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
      return false;

    MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), Dst)
              .addReg(Src1)
              .addImm(16)
              .addReg(TmpReg);
    if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
      return false;

    MI.eraseFromParent();
    return true;
  }

  Register ShiftSrc0;
  Register ShiftSrc1;

  // With multiple uses of the shift, this will duplicate the shift and
  // increase register pressure.
  //
  // (build_vector (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16))
  //  => (S_PACK_HH_B32_B16 $src0, $src1)
  // (build_vector (lshr_oneuse SReg_32:$src0, 16), $src1)
  //  => (S_PACK_HL_B32_B16 $src0, $src1)
  // (build_vector $src0, (lshr_oneuse SReg_32:$src1, 16))
  //  => (S_PACK_LH_B32_B16 $src0, $src1)
  // (build_vector $src0, $src1)
  //  => (S_PACK_LL_B32_B16 $src0, $src1)

  bool Shift0 = mi_match(
      Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_SpecificICst(16))));

  bool Shift1 = mi_match(
      Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_SpecificICst(16))));

  unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
  if (Shift0 && Shift1) {
    Opc = AMDGPU::S_PACK_HH_B32_B16;
    MI.getOperand(1).setReg(ShiftSrc0);
    MI.getOperand(2).setReg(ShiftSrc1);
  } else if (Shift1) {
    Opc = AMDGPU::S_PACK_LH_B32_B16;
    MI.getOperand(2).setReg(ShiftSrc1);
  } else if (Shift0) {
    auto ConstSrc1 =
        getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
    if (ConstSrc1 && ConstSrc1->Value == 0) {
      // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
      auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
                     .addReg(ShiftSrc0)
                     .addImm(16)
                     .setOperandDead(3); // Dead scc

      MI.eraseFromParent();
      return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
    }
    if (STI.hasSPackHL()) {
      Opc = AMDGPU::S_PACK_HL_B32_B16;
      MI.getOperand(1).setReg(ShiftSrc0);
    }
  }

  MI.setDesc(TII.get(Opc));
  return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
  const MachineOperand &MO = I.getOperand(0);

  // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
  // regbank check here is to know why getConstrainedRegClassForOperand failed.
  const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
  if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
      (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
    I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
    return true;
  }

  return false;
}

bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();

  Register DstReg = I.getOperand(0).getReg();
  Register Src0Reg = I.getOperand(1).getReg();
  Register Src1Reg = I.getOperand(2).getReg();
  LLT Src1Ty = MRI->getType(Src1Reg);

  unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
  unsigned InsSize = Src1Ty.getSizeInBits();

  int64_t Offset = I.getOperand(3).getImm();

  // FIXME: These cases should have been illegal and unnecessary to check here.
  if (Offset % 32 != 0 || InsSize % 32 != 0)
    return false;

  // Currently not handled by getSubRegFromChannel.
  if (InsSize > 128)
    return false;

  unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
  if (SubReg == AMDGPU::NoSubRegister)
    return false;

  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
  const TargetRegisterClass *DstRC =
      TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
  if (!DstRC)
    return false;

  const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
  const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
  const TargetRegisterClass *Src0RC =
      TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank);
  const TargetRegisterClass *Src1RC =
      TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank);

  // Deal with weird cases where the class only partially supports the subreg
  // index.
  Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
  if (!Src0RC || !Src1RC)
    return false;

  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
      !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
      !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
    return false;

  const DebugLoc &DL = I.getDebugLoc();
  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
    .addReg(Src0Reg)
    .addReg(Src1Reg)
    .addImm(SubReg);

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  Register OffsetReg = MI.getOperand(2).getReg();
  Register WidthReg = MI.getOperand(3).getReg();

  assert(RBI.getRegBank(DstReg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID &&
         "scalar BFX instructions are expanded in regbankselect");
  assert(MRI->getType(MI.getOperand(0).getReg()).getSizeInBits() == 32 &&
         "64-bit vector BFX instructions are expanded in regbankselect");

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  bool IsSigned = MI.getOpcode() == TargetOpcode::G_SBFX;
  unsigned Opc = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), DstReg)
                 .addReg(SrcReg)
                 .addReg(OffsetReg)
                 .addReg(WidthReg);
  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
  if (STI.getLDSBankCount() != 16)
    return selectImpl(MI, *CoverageInfo);

  Register Dst = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(2).getReg();
  Register M0Val = MI.getOperand(6).getReg();
  if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
    return false;

  // This requires 2 instructions. It is possible to write a pattern to support
  // this, but the generated isel emitter doesn't correctly deal with multiple
  // output instructions using the same physical register input. The copy to m0
  // is incorrectly placed before the second instruction.
  //
  // TODO: Match source modifiers.

  Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
    .addReg(M0Val);
  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
    .addImm(2)
    .addImm(MI.getOperand(4).getImm())  // $attr
    .addImm(MI.getOperand(3).getImm()); // $attrchan

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
    .addImm(0)                          // $src0_modifiers
    .addReg(Src0)                       // $src0
    .addImm(MI.getOperand(4).getImm())  // $attr
    .addImm(MI.getOperand(3).getImm())  // $attrchan
    .addImm(0)                          // $src2_modifiers
    .addReg(InterpMov)                  // $src2 - 2 f16 values selected by high
    .addImm(MI.getOperand(5).getImm())  // $high
    .addImm(0)                          // $clamp
    .addImm(0);                         // $omod

  MI.eraseFromParent();
  return true;
}

// Writelane is special in that it can use SGPR and M0 (which would normally
// count as using the constant bus twice - but in this case it is allowed since
// the lane selector doesn't count as a use of the constant bus). However, it is
// still required to abide by the 1 SGPR rule. Fix this up if we might have
// multiple SGPRs.
bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
  // With a constant bus limit of at least 2, there's no issue.
  if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
    return selectImpl(MI, *CoverageInfo);

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  Register VDst = MI.getOperand(0).getReg();
  Register Val = MI.getOperand(2).getReg();
  Register LaneSelect = MI.getOperand(3).getReg();
  Register VDstIn = MI.getOperand(4).getReg();

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);

  std::optional<ValueAndVReg> ConstSelect =
      getIConstantVRegValWithLookThrough(LaneSelect, *MRI);
  if (ConstSelect) {
    // The selector has to be an inline immediate, so we can use whatever for
    // the other operands.
    MIB.addReg(Val);
    MIB.addImm(ConstSelect->Value.getSExtValue() &
               maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
  } else {
    std::optional<ValueAndVReg> ConstVal =
        getIConstantVRegValWithLookThrough(Val, *MRI);

    // If the value written is an inline immediate, we can get away without a
    // copy to m0.
    if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value.getSExtValue(),
                                                 STI.hasInv2PiInlineImm())) {
      MIB.addImm(ConstVal->Value.getSExtValue());
      MIB.addReg(LaneSelect);
    } else {
      MIB.addReg(Val);

      // If the lane selector was originally in a VGPR and copied with
      // readfirstlane, there's a hazard to read the same SGPR from the
      // VALU. Constrain to a different SGPR to help avoid needing a nop later.
      RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI);

      BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
        .addReg(LaneSelect);
      MIB.addReg(AMDGPU::M0);
    }
  }

  MIB.addReg(VDstIn);

  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
  Register Dst0 = MI.getOperand(0).getReg();
  Register Dst1 = MI.getOperand(1).getReg();

  LLT Ty = MRI->getType(Dst0);
  unsigned Opc;
  if (Ty == LLT::scalar(32))
    Opc = AMDGPU::V_DIV_SCALE_F32_e64;
  else if (Ty == LLT::scalar(64))
    Opc = AMDGPU::V_DIV_SCALE_F64_e64;
  else
    return false;

  // TODO: Match source modifiers.

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  Register Numer = MI.getOperand(3).getReg();
  Register Denom = MI.getOperand(4).getReg();
  unsigned ChooseDenom = MI.getOperand(5).getImm();

  Register Src0 = ChooseDenom != 0 ? Numer : Denom;

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
    .addDef(Dst1)
    .addImm(0)     // $src0_modifiers
    .addUse(Src0)  // $src0
    .addImm(0)     // $src1_modifiers
    .addUse(Denom) // $src1
    .addImm(0)     // $src2_modifiers
    .addUse(Numer) // $src2
    .addImm(0)     // $clamp
    .addImm(0);    // $omod

  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
  Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_if_break: {
    MachineBasicBlock *BB = I.getParent();

    // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
    // SelectionDAG uses for wave32 vs wave64.
    BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
      .add(I.getOperand(0))
      .add(I.getOperand(2))
      .add(I.getOperand(3));

    Register DstReg = I.getOperand(0).getReg();
    Register Src0Reg = I.getOperand(2).getReg();
    Register Src1Reg = I.getOperand(3).getReg();

    I.eraseFromParent();

    for (Register Reg : { DstReg, Src0Reg, Src1Reg })
      MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());

    return true;
  }
  case Intrinsic::amdgcn_interp_p1_f16:
    return selectInterpP1F16(I);
  case Intrinsic::amdgcn_wqm:
    return constrainCopyLikeIntrin(I, AMDGPU::WQM);
  case Intrinsic::amdgcn_softwqm:
    return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
  case Intrinsic::amdgcn_strict_wwm:
  case Intrinsic::amdgcn_wwm:
    return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
  case Intrinsic::amdgcn_strict_wqm:
    return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
  case Intrinsic::amdgcn_writelane:
    return selectWritelane(I);
  case Intrinsic::amdgcn_div_scale:
    return selectDivScale(I);
  case Intrinsic::amdgcn_icmp:
  case Intrinsic::amdgcn_fcmp:
    if (selectImpl(I, *CoverageInfo))
      return true;
    return selectIntrinsicCmp(I);
  case Intrinsic::amdgcn_ballot:
    return selectBallot(I);
  case Intrinsic::amdgcn_reloc_constant:
    return selectRelocConstant(I);
  case Intrinsic::amdgcn_groupstaticsize:
    return selectGroupStaticSize(I);
  case Intrinsic::returnaddress:
    return selectReturnAddress(I);
  case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
  case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
  case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
  case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
  case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
  case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
    return selectSMFMACIntrin(I);
  default:
    return selectImpl(I, *CoverageInfo);
  }
}

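// Return the VALU (V_CMP) opcode for the given predicate and comparison size,
// or -1 if there is no suitable instruction.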
static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size,
                          const GCNSubtarget &ST) {
  if (Size != 16 && Size != 32 && Size != 64)
    return -1;

  if (Size == 16 && !ST.has16BitInsts())
    return -1;

  const auto Select = [&](unsigned S16Opc, unsigned TrueS16Opc, unsigned S32Opc,
                          unsigned S64Opc) {
    if (Size == 16)
      return ST.hasTrue16BitInsts() ? TrueS16Opc : S16Opc;
    if (Size == 32)
      return S32Opc;
    return S64Opc;
  };

  switch (P) {
  default:
    llvm_unreachable("Unknown condition code!");
  case CmpInst::ICMP_NE:
    return Select(AMDGPU::V_CMP_NE_U16_e64, AMDGPU::V_CMP_NE_U16_t16_e64,
                  AMDGPU::V_CMP_NE_U32_e64, AMDGPU::V_CMP_NE_U64_e64);
  case CmpInst::ICMP_EQ:
    return Select(AMDGPU::V_CMP_EQ_U16_e64, AMDGPU::V_CMP_EQ_U16_t16_e64,
                  AMDGPU::V_CMP_EQ_U32_e64, AMDGPU::V_CMP_EQ_U64_e64);
  case CmpInst::ICMP_SGT:
    return Select(AMDGPU::V_CMP_GT_I16_e64, AMDGPU::V_CMP_GT_I16_t16_e64,
                  AMDGPU::V_CMP_GT_I32_e64, AMDGPU::V_CMP_GT_I64_e64);
  case CmpInst::ICMP_SGE:
    return Select(AMDGPU::V_CMP_GE_I16_e64, AMDGPU::V_CMP_GE_I16_t16_e64,
                  AMDGPU::V_CMP_GE_I32_e64, AMDGPU::V_CMP_GE_I64_e64);
  case CmpInst::ICMP_SLT:
    return Select(AMDGPU::V_CMP_LT_I16_e64, AMDGPU::V_CMP_LT_I16_t16_e64,
                  AMDGPU::V_CMP_LT_I32_e64, AMDGPU::V_CMP_LT_I64_e64);
  case CmpInst::ICMP_SLE:
    return Select(AMDGPU::V_CMP_LE_I16_e64, AMDGPU::V_CMP_LE_I16_t16_e64,
                  AMDGPU::V_CMP_LE_I32_e64, AMDGPU::V_CMP_LE_I64_e64);
  case CmpInst::ICMP_UGT:
    return Select(AMDGPU::V_CMP_GT_U16_e64, AMDGPU::V_CMP_GT_U16_t16_e64,
                  AMDGPU::V_CMP_GT_U32_e64, AMDGPU::V_CMP_GT_U64_e64);
  case CmpInst::ICMP_UGE:
    return Select(AMDGPU::V_CMP_GE_U16_e64, AMDGPU::V_CMP_GE_U16_t16_e64,
                  AMDGPU::V_CMP_GE_U32_e64, AMDGPU::V_CMP_GE_U64_e64);
  case CmpInst::ICMP_ULT:
    return Select(AMDGPU::V_CMP_LT_U16_e64, AMDGPU::V_CMP_LT_U16_t16_e64,
                  AMDGPU::V_CMP_LT_U32_e64, AMDGPU::V_CMP_LT_U64_e64);
  case CmpInst::ICMP_ULE:
    return Select(AMDGPU::V_CMP_LE_U16_e64, AMDGPU::V_CMP_LE_U16_t16_e64,
                  AMDGPU::V_CMP_LE_U32_e64, AMDGPU::V_CMP_LE_U64_e64);

  case CmpInst::FCMP_OEQ:
    return Select(AMDGPU::V_CMP_EQ_F16_e64, AMDGPU::V_CMP_EQ_F16_t16_e64,
                  AMDGPU::V_CMP_EQ_F32_e64, AMDGPU::V_CMP_EQ_F64_e64);
  case CmpInst::FCMP_OGT:
    return Select(AMDGPU::V_CMP_GT_F16_e64, AMDGPU::V_CMP_GT_F16_t16_e64,
                  AMDGPU::V_CMP_GT_F32_e64, AMDGPU::V_CMP_GT_F64_e64);
  case CmpInst::FCMP_OGE:
    return Select(AMDGPU::V_CMP_GE_F16_e64, AMDGPU::V_CMP_GE_F16_t16_e64,
                  AMDGPU::V_CMP_GE_F32_e64, AMDGPU::V_CMP_GE_F64_e64);
  case CmpInst::FCMP_OLT:
    return Select(AMDGPU::V_CMP_LT_F16_e64, AMDGPU::V_CMP_LT_F16_t16_e64,
                  AMDGPU::V_CMP_LT_F32_e64, AMDGPU::V_CMP_LT_F64_e64);
  case CmpInst::FCMP_OLE:
    return Select(AMDGPU::V_CMP_LE_F16_e64, AMDGPU::V_CMP_LE_F16_t16_e64,
                  AMDGPU::V_CMP_LE_F32_e64, AMDGPU::V_CMP_LE_F64_e64);
  case CmpInst::FCMP_ONE:
    return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
                  AMDGPU::V_CMP_NEQ_F32_e64, AMDGPU::V_CMP_NEQ_F64_e64);
  case CmpInst::FCMP_ORD:
    return Select(AMDGPU::V_CMP_O_F16_e64, AMDGPU::V_CMP_O_F16_t16_e64,
                  AMDGPU::V_CMP_O_F32_e64, AMDGPU::V_CMP_O_F64_e64);
  case CmpInst::FCMP_UNO:
    return Select(AMDGPU::V_CMP_U_F16_e64, AMDGPU::V_CMP_U_F16_t16_e64,
                  AMDGPU::V_CMP_U_F32_e64, AMDGPU::V_CMP_U_F64_e64);
  case CmpInst::FCMP_UEQ:
    return Select(AMDGPU::V_CMP_NLG_F16_e64, AMDGPU::V_CMP_NLG_F16_t16_e64,
                  AMDGPU::V_CMP_NLG_F32_e64, AMDGPU::V_CMP_NLG_F64_e64);
  case CmpInst::FCMP_UGT:
    return Select(AMDGPU::V_CMP_NLE_F16_e64, AMDGPU::V_CMP_NLE_F16_t16_e64,
                  AMDGPU::V_CMP_NLE_F32_e64, AMDGPU::V_CMP_NLE_F64_e64);
  case CmpInst::FCMP_UGE:
    return Select(AMDGPU::V_CMP_NLT_F16_e64, AMDGPU::V_CMP_NLT_F16_t16_e64,
                  AMDGPU::V_CMP_NLT_F32_e64, AMDGPU::V_CMP_NLT_F64_e64);
  case CmpInst::FCMP_ULT:
    return Select(AMDGPU::V_CMP_NGE_F16_e64, AMDGPU::V_CMP_NGE_F16_t16_e64,
                  AMDGPU::V_CMP_NGE_F32_e64, AMDGPU::V_CMP_NGE_F64_e64);
  case CmpInst::FCMP_ULE:
    return Select(AMDGPU::V_CMP_NGT_F16_e64, AMDGPU::V_CMP_NGT_F16_t16_e64,
                  AMDGPU::V_CMP_NGT_F32_e64, AMDGPU::V_CMP_NGT_F64_e64);
  case CmpInst::FCMP_UNE:
    return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
                  AMDGPU::V_CMP_NEQ_F32_e64, AMDGPU::V_CMP_NEQ_F64_e64);
  case CmpInst::FCMP_TRUE:
    return Select(AMDGPU::V_CMP_TRU_F16_e64, AMDGPU::V_CMP_TRU_F16_t16_e64,
                  AMDGPU::V_CMP_TRU_F32_e64, AMDGPU::V_CMP_TRU_F64_e64);
  case CmpInst::FCMP_FALSE:
    return Select(AMDGPU::V_CMP_F_F16_e64, AMDGPU::V_CMP_F_F16_t16_e64,
                  AMDGPU::V_CMP_F_F32_e64, AMDGPU::V_CMP_F_F64_e64);
  }
}

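// Return the scalar (S_CMP) opcode for the given predicate and comparison
// size, or -1 if the comparison cannot be done on the SALU.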
int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
                                              unsigned Size) const {
  if (Size == 64) {
    if (!STI.hasScalarCompareEq64())
      return -1;

    switch (P) {
    case CmpInst::ICMP_NE:
      return AMDGPU::S_CMP_LG_U64;
    case CmpInst::ICMP_EQ:
      return AMDGPU::S_CMP_EQ_U64;
    default:
      return -1;
    }
  }

  if (Size == 32) {
    switch (P) {
    case CmpInst::ICMP_NE:
      return AMDGPU::S_CMP_LG_U32;
    case CmpInst::ICMP_EQ:
      return AMDGPU::S_CMP_EQ_U32;
    case CmpInst::ICMP_SGT:
      return AMDGPU::S_CMP_GT_I32;
    case CmpInst::ICMP_SGE:
      return AMDGPU::S_CMP_GE_I32;
    case CmpInst::ICMP_SLT:
      return AMDGPU::S_CMP_LT_I32;
    case CmpInst::ICMP_SLE:
      return AMDGPU::S_CMP_LE_I32;
    case CmpInst::ICMP_UGT:
      return AMDGPU::S_CMP_GT_U32;
    case CmpInst::ICMP_UGE:
      return AMDGPU::S_CMP_GE_U32;
    case CmpInst::ICMP_ULT:
      return AMDGPU::S_CMP_LT_U32;
    case CmpInst::ICMP_ULE:
      return AMDGPU::S_CMP_LE_U32;
    case CmpInst::FCMP_OEQ:
      return AMDGPU::S_CMP_EQ_F32;
    case CmpInst::FCMP_OGT:
      return AMDGPU::S_CMP_GT_F32;
    case CmpInst::FCMP_OGE:
      return AMDGPU::S_CMP_GE_F32;
    case CmpInst::FCMP_OLT:
      return AMDGPU::S_CMP_LT_F32;
    case CmpInst::FCMP_OLE:
      return AMDGPU::S_CMP_LE_F32;
    case CmpInst::FCMP_ONE:
      return AMDGPU::S_CMP_LG_F32;
    case CmpInst::FCMP_ORD:
      return AMDGPU::S_CMP_O_F32;
    case CmpInst::FCMP_UNO:
      return AMDGPU::S_CMP_U_F32;
    case CmpInst::FCMP_UEQ:
      return AMDGPU::S_CMP_NLG_F32;
    case CmpInst::FCMP_UGT:
      return AMDGPU::S_CMP_NLE_F32;
    case CmpInst::FCMP_UGE:
      return AMDGPU::S_CMP_NLT_F32;
    case CmpInst::FCMP_ULT:
      return AMDGPU::S_CMP_NGE_F32;
    case CmpInst::FCMP_ULE:
      return AMDGPU::S_CMP_NGT_F32;
    case CmpInst::FCMP_UNE:
      return AMDGPU::S_CMP_NEQ_F32;
    default:
      llvm_unreachable("Unknown condition code!");
    }
  }

  if (Size == 16) {
    if (!STI.hasSALUFloatInsts())
      return -1;

    switch (P) {
    case CmpInst::FCMP_OEQ:
      return AMDGPU::S_CMP_EQ_F16;
    case CmpInst::FCMP_OGT:
      return AMDGPU::S_CMP_GT_F16;
    case CmpInst::FCMP_OGE:
      return AMDGPU::S_CMP_GE_F16;
    case CmpInst::FCMP_OLT:
      return AMDGPU::S_CMP_LT_F16;
    case CmpInst::FCMP_OLE:
      return AMDGPU::S_CMP_LE_F16;
    case CmpInst::FCMP_ONE:
      return AMDGPU::S_CMP_LG_F16;
    case CmpInst::FCMP_ORD:
      return AMDGPU::S_CMP_O_F16;
    case CmpInst::FCMP_UNO:
      return AMDGPU::S_CMP_U_F16;
    case CmpInst::FCMP_UEQ:
      return AMDGPU::S_CMP_NLG_F16;
    case CmpInst::FCMP_UGT:
      return AMDGPU::S_CMP_NLE_F16;
    case CmpInst::FCMP_UGE:
      return AMDGPU::S_CMP_NLT_F16;
    case CmpInst::FCMP_ULT:
      return AMDGPU::S_CMP_NGE_F16;
    case CmpInst::FCMP_ULE:
      return AMDGPU::S_CMP_NGT_F16;
    case CmpInst::FCMP_UNE:
      return AMDGPU::S_CMP_NEQ_F16;
    default:
      llvm_unreachable("Unknown condition code!");
    }
  }

  return -1;
}

bool AMDGPUInstructionSelector::selectG_ICMP_or_FCMP(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();

  Register SrcReg = I.getOperand(2).getReg();
  unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);

  auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();

  Register CCReg = I.getOperand(0).getReg();
  if (!isVCC(CCReg, *MRI)) {
    int Opcode = getS_CMPOpcode(Pred, Size);
    if (Opcode == -1)
      return false;
    MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
      .add(I.getOperand(2))
      .add(I.getOperand(3));
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
      .addReg(AMDGPU::SCC);
    bool Ret =
        constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
        RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
    I.eraseFromParent();
    return Ret;
  }

  if (I.getOpcode() == AMDGPU::G_FCMP)
    return false;

  int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
  if (Opcode == -1)
    return false;

  MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode),
                               I.getOperand(0).getReg())
    .add(I.getOperand(2))
    .add(I.getOperand(3));
  RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
                               *TRI.getBoolRC(), *MRI);
  bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
  I.eraseFromParent();
  return Ret;
}

bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
  Register Dst = I.getOperand(0).getReg();
  if (isVCC(Dst, *MRI))
    return false;

  LLT DstTy = MRI->getType(Dst);
  if (DstTy.getSizeInBits() != STI.getWavefrontSize())
    return false;

  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  Register SrcReg = I.getOperand(2).getReg();
  unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);

  // i1 inputs are not supported in GlobalISel.
  if (Size == 1)
    return false;

  auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
  if (!CmpInst::isIntPredicate(Pred) && !CmpInst::isFPPredicate(Pred)) {
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Dst);
    I.eraseFromParent();
    return RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
  }

  const int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
  if (Opcode == -1)
    return false;

  MachineInstrBuilder SelectedMI;
  MachineOperand &LHS = I.getOperand(2);
  MachineOperand &RHS = I.getOperand(3);
  auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS);
  auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS);
  Register Src0Reg =
      copyToVGPRIfSrcFolded(Src0, Src0Mods, LHS, &I, /*ForceVGPR*/ true);
  Register Src1Reg =
      copyToVGPRIfSrcFolded(Src1, Src1Mods, RHS, &I, /*ForceVGPR*/ true);
  SelectedMI = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst);
  if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers))
    SelectedMI.addImm(Src0Mods);
  SelectedMI.addReg(Src0Reg);
  if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src1_modifiers))
    SelectedMI.addImm(Src1Mods);
  SelectedMI.addReg(Src1Reg);
  if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::clamp))
    SelectedMI.addImm(0); // clamp
  if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel))
    SelectedMI.addImm(0); // op_sel

  RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
  if (!constrainSelectedInstRegOperands(*SelectedMI, TII, TRI, RBI))
    return false;

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  Register DstReg = I.getOperand(0).getReg();
  const unsigned Size = MRI->getType(DstReg).getSizeInBits();
  const bool Is64 = Size == 64;
  const bool IsWave32 = (STI.getWavefrontSize() == 32);

  // In the common case, the return type matches the wave size.
  // However we also support emitting i64 ballots in wave32 mode.
  if (Size != STI.getWavefrontSize() && (!Is64 || !IsWave32))
    return false;

  std::optional<ValueAndVReg> Arg =
      getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI);

  const auto BuildCopy = [&](Register SrcReg) {
    if (Size == STI.getWavefrontSize()) {
      BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
        .addReg(SrcReg);
      return;
    }

    // If emitting an i64 ballot in wave32, fill the upper bits with zeroes.
    Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
      .addReg(SrcReg)
      .addImm(AMDGPU::sub0)
      .addReg(HiReg)
      .addImm(AMDGPU::sub1);
  };

  if (Arg) {
    const int64_t Value = Arg->Value.getSExtValue();
    if (Value == 0) {
      unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
      BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
    } else if (Value == -1) // all ones
      BuildCopy(IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC);
    else
      return false;
  } else
    BuildCopy(I.getOperand(2).getReg());

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
  Register DstReg = I.getOperand(0).getReg();
  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
  const TargetRegisterClass *DstRC = TRI.getRegClassForSizeOnBank(32, *DstBank);
  if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;

  const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;

  Module *M = MF->getFunction().getParent();
  const MDNode *Metadata = I.getOperand(2).getMetadata();
  auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
  auto RelocSymbol = cast<GlobalVariable>(
    M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));

  MachineBasicBlock *BB = I.getParent();
  BuildMI(*BB, &I, I.getDebugLoc(),
          TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
    .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO);

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
  Triple::OSType OS = MF->getTarget().getTargetTriple().getOS();

  Register DstReg = I.getOperand(0).getReg();
  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ?
    AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;

  MachineBasicBlock *MBB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();

  auto MIB = BuildMI(*MBB, &I, DL, TII.get(Mov), DstReg);

  if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
    const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
    MIB.addImm(MFI->getLDSSize());
  } else {
    Module *M = MF->getFunction().getParent();
    const GlobalValue *GV
      = Intrinsic::getDeclaration(M, Intrinsic::amdgcn_groupstaticsize);
    MIB.addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
  }

  I.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}
1500
selectReturnAddress(MachineInstr & I) const1501 bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
1502 MachineBasicBlock *MBB = I.getParent();
1503 MachineFunction &MF = *MBB->getParent();
1504 const DebugLoc &DL = I.getDebugLoc();
1505
1506 MachineOperand &Dst = I.getOperand(0);
1507 Register DstReg = Dst.getReg();
1508 unsigned Depth = I.getOperand(2).getImm();
1509
1510 const TargetRegisterClass *RC
1511 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
1512 if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
1513 !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
1514 return false;
1515
1516 // Check for kernel and shader functions
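// Entry functions (kernels and shaders) have no return address, and walking
// up the stack (Depth != 0) is not implemented, so a null address is returned.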
1517 if (Depth != 0 ||
1518 MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1519 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
1520 .addImm(0);
1521 I.eraseFromParent();
1522 return true;
1523 }
1524
1525 MachineFrameInfo &MFI = MF.getFrameInfo();
1526 // There is a call to @llvm.returnaddress in this function
1527 MFI.setReturnAddressIsTaken(true);
1528
1529 // Get the return address reg and mark it as an implicit live-in
1530 Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
1531 Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
1532 AMDGPU::SReg_64RegClass, DL);
1533 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1534 .addReg(LiveIn);
1535 I.eraseFromParent();
1536 return true;
1537 }
1538
1539 bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
1540 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1541 // SelectionDAG uses for wave32 vs wave64.
1542 MachineBasicBlock *BB = MI.getParent();
1543 BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
1544 .add(MI.getOperand(1));
1545
1546 Register Reg = MI.getOperand(1).getReg();
1547 MI.eraseFromParent();
1548
1549 if (!MRI->getRegClassOrNull(Reg))
1550 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1551 return true;
1552 }
1553
1554 bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
1555 MachineInstr &MI, Intrinsic::ID IntrID) const {
1556 MachineBasicBlock *MBB = MI.getParent();
1557 MachineFunction *MF = MBB->getParent();
1558 const DebugLoc &DL = MI.getDebugLoc();
1559
1560 unsigned IndexOperand = MI.getOperand(7).getImm();
1561 bool WaveRelease = MI.getOperand(8).getImm() != 0;
1562 bool WaveDone = MI.getOperand(9).getImm() != 0;
1563
1564 if (WaveDone && !WaveRelease)
1565 report_fatal_error("ds_ordered_count: wave_done requires wave_release");
1566
1567 unsigned OrderedCountIndex = IndexOperand & 0x3f;
1568 IndexOperand &= ~0x3f;
1569 unsigned CountDw = 0;
1570
1571 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
1572 CountDw = (IndexOperand >> 24) & 0xf;
1573 IndexOperand &= ~(0xf << 24);
1574
1575 if (CountDw < 1 || CountDw > 4) {
1576 report_fatal_error(
1577 "ds_ordered_count: dword count must be between 1 and 4");
1578 }
1579 }
1580
1581 if (IndexOperand)
1582 report_fatal_error("ds_ordered_count: bad index operand");
1583
1584 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
1585 unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);
1586
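// Pack the DS_ORDERED_COUNT offset field (mirrors the computation below):
// offset0 (low byte) holds the ordered-count index scaled by 4; offset1
// (high byte) packs wave_release in bit 0, wave_done in bit 1, the shader
// type in bits 3:2 (pre-GFX11 only), the add/swap selector in bit 4, and
// dword count - 1 in bits 7:6 (GFX10+).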
1587 unsigned Offset0 = OrderedCountIndex << 2;
1588 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
1589
1590 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
1591 Offset1 |= (CountDw - 1) << 6;
1592
1593 if (STI.getGeneration() < AMDGPUSubtarget::GFX11)
1594 Offset1 |= ShaderType << 2;
1595
1596 unsigned Offset = Offset0 | (Offset1 << 8);
1597
1598 Register M0Val = MI.getOperand(2).getReg();
1599 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1600 .addReg(M0Val);
1601
1602 Register DstReg = MI.getOperand(0).getReg();
1603 Register ValReg = MI.getOperand(3).getReg();
1604 MachineInstrBuilder DS =
1605 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
1606 .addReg(ValReg)
1607 .addImm(Offset)
1608 .cloneMemRefs(MI);
1609
1610 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
1611 return false;
1612
1613 bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
1614 MI.eraseFromParent();
1615 return Ret;
1616 }
1617
1618 static unsigned gwsIntrinToOpcode(unsigned IntrID) {
1619 switch (IntrID) {
1620 case Intrinsic::amdgcn_ds_gws_init:
1621 return AMDGPU::DS_GWS_INIT;
1622 case Intrinsic::amdgcn_ds_gws_barrier:
1623 return AMDGPU::DS_GWS_BARRIER;
1624 case Intrinsic::amdgcn_ds_gws_sema_v:
1625 return AMDGPU::DS_GWS_SEMA_V;
1626 case Intrinsic::amdgcn_ds_gws_sema_br:
1627 return AMDGPU::DS_GWS_SEMA_BR;
1628 case Intrinsic::amdgcn_ds_gws_sema_p:
1629 return AMDGPU::DS_GWS_SEMA_P;
1630 case Intrinsic::amdgcn_ds_gws_sema_release_all:
1631 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
1632 default:
1633 llvm_unreachable("not a gws intrinsic");
1634 }
1635 }
1636
1637 bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
1638 Intrinsic::ID IID) const {
1639 if (!STI.hasGWS() || (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
1640 !STI.hasGWSSemaReleaseAll()))
1641 return false;
1642
1643 // intrinsic ID, vsrc, offset
1644 const bool HasVSrc = MI.getNumOperands() == 3;
1645 assert(HasVSrc || MI.getNumOperands() == 2);
1646
1647 Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
1648 const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
1649 if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
1650 return false;
1651
1652 MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1653 unsigned ImmOffset;
1654
1655 MachineBasicBlock *MBB = MI.getParent();
1656 const DebugLoc &DL = MI.getDebugLoc();
1657
1658 MachineInstr *Readfirstlane = nullptr;
1659
1660 // If we legalized the VGPR input, strip out the readfirstlane to analyze the
1661 // incoming offset, in case there's an add of a constant. We'll have to put it
1662 // back later.
1663 if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
1664 Readfirstlane = OffsetDef;
1665 BaseOffset = OffsetDef->getOperand(1).getReg();
1666 OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1667 }
1668
1669 if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
1670 // If we have a constant offset, try to use the 0 in m0 as the base.
1671 // TODO: Look into changing the default m0 initialization value. If the
1672 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
1673 // the immediate offset.
1674
1675 ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
1676 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1677 .addImm(0);
1678 } else {
1679 std::tie(BaseOffset, ImmOffset) =
1680 AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset, KB);
1681
1682 if (Readfirstlane) {
1683 // We have the constant offset now, so put the readfirstlane back on the
1684 // variable component.
1685 if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
1686 return false;
1687
1688 Readfirstlane->getOperand(1).setReg(BaseOffset);
1689 BaseOffset = Readfirstlane->getOperand(0).getReg();
1690 } else {
1691 if (!RBI.constrainGenericRegister(BaseOffset,
1692 AMDGPU::SReg_32RegClass, *MRI))
1693 return false;
1694 }
1695
1696 Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1697 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
1698 .addReg(BaseOffset)
1699 .addImm(16)
1700 .setOperandDead(3); // Dead scc
1701
1702 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1703 .addReg(M0Base);
1704 }
1705
1706 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
1707 // offset field) % 64. Some versions of the programming guide omit the m0
1708 // part, or claim it's from offset 0.
1709 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID)));
1710
1711 if (HasVSrc) {
1712 Register VSrc = MI.getOperand(1).getReg();
1713 MIB.addReg(VSrc);
1714
1715 if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
1716 return false;
1717 }
1718
1719 MIB.addImm(ImmOffset)
1720 .cloneMemRefs(MI);
1721
1722 TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::data0);
1723
1724 MI.eraseFromParent();
1725 return true;
1726 }
1727
1728 bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
1729 bool IsAppend) const {
1730 Register PtrBase = MI.getOperand(2).getReg();
1731 LLT PtrTy = MRI->getType(PtrBase);
1732 bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
1733
1734 unsigned Offset;
1735 std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));
1736
1737 // TODO: Should this try to look through readfirstlane like GWS?
1738 if (!isDSOffsetLegal(PtrBase, Offset)) {
1739 PtrBase = MI.getOperand(2).getReg();
1740 Offset = 0;
1741 }
1742
1743 MachineBasicBlock *MBB = MI.getParent();
1744 const DebugLoc &DL = MI.getDebugLoc();
1745 const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
1746
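// The base pointer is passed in m0; the instruction itself only carries the
// immediate offset and the GDS flag.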
1747 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1748 .addReg(PtrBase);
1749 if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
1750 return false;
1751
1752 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
1753 .addImm(Offset)
1754 .addImm(IsGDS ? -1 : 0)
1755 .cloneMemRefs(MI);
1756 MI.eraseFromParent();
1757 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1758 }
1759
1760 bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
1761 if (TM.getOptLevel() > CodeGenOptLevel::None) {
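// A barrier in a workgroup that fits in a single wave is a no-op at run
// time; WAVE_BARRIER only constrains scheduling.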
1762 unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second;
1763 if (WGSize <= STI.getWavefrontSize()) {
1764 MachineBasicBlock *MBB = MI.getParent();
1765 const DebugLoc &DL = MI.getDebugLoc();
1766 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::WAVE_BARRIER));
1767 MI.eraseFromParent();
1768 return true;
1769 }
1770 }
1771
1772 // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
1773 if (STI.hasSplitBarriers()) {
1774 MachineBasicBlock *MBB = MI.getParent();
1775 const DebugLoc &DL = MI.getDebugLoc();
1776 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM))
1777 .addImm(AMDGPU::Barrier::WORKGROUP);
1778 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_WAIT))
1779 .addImm(AMDGPU::Barrier::WORKGROUP);
1780 MI.eraseFromParent();
1781 return true;
1782 }
1783
1784 return selectImpl(MI, *CoverageInfo);
1785 }
1786
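// Decode the texture-fail control bits: bit 0 is TFE and bit 1 is LWE; any
// nonzero value marks the operation as a tex-fail variant. Returns false if
// bits other than TFE/LWE are set.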
1787 static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
1788 bool &IsTexFail) {
1789 if (TexFailCtrl)
1790 IsTexFail = true;
1791
1792 TFE = (TexFailCtrl & 0x1) != 0;
1793 TexFailCtrl &= ~(uint64_t)0x1;
1794 LWE = (TexFailCtrl & 0x2) != 0;
1795 TexFailCtrl &= ~(uint64_t)0x2;
1796
1797 return TexFailCtrl == 0;
1798 }
1799
1800 bool AMDGPUInstructionSelector::selectImageIntrinsic(
1801 MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
1802 MachineBasicBlock *MBB = MI.getParent();
1803 const DebugLoc &DL = MI.getDebugLoc();
1804
1805 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
1806 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1807
1808 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
1809 unsigned IntrOpcode = Intr->BaseOpcode;
1810 const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
1811 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI);
1812 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
1813
1814 const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;
1815
1816 Register VDataIn, VDataOut;
1817 LLT VDataTy;
1818 int NumVDataDwords = -1;
1819 bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 ||
1820 MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16;
1821
1822 bool Unorm;
1823 if (!BaseOpcode->Sampler)
1824 Unorm = true;
1825 else
1826 Unorm = MI.getOperand(ArgOffset + Intr->UnormIndex).getImm() != 0;
1827
1828 bool TFE;
1829 bool LWE;
1830 bool IsTexFail = false;
1831 if (!parseTexFail(MI.getOperand(ArgOffset + Intr->TexFailCtrlIndex).getImm(),
1832 TFE, LWE, IsTexFail))
1833 return false;
1834
1835 const int Flags = MI.getOperand(ArgOffset + Intr->NumArgs).getImm();
1836 const bool IsA16 = (Flags & 1) != 0;
1837 const bool IsG16 = (Flags & 2) != 0;
1838
1839 // A16 implies 16 bit gradients if subtarget doesn't support G16
1840 if (IsA16 && !STI.hasG16() && !IsG16)
1841 return false;
1842
1843 unsigned DMask = 0;
1844 unsigned DMaskLanes = 0;
1845
1846 if (BaseOpcode->Atomic) {
1847 VDataOut = MI.getOperand(0).getReg();
1848 VDataIn = MI.getOperand(2).getReg();
1849 LLT Ty = MRI->getType(VDataIn);
1850
1851 // Be careful to allow atomic swap on 16-bit element vectors.
1852 const bool Is64Bit = BaseOpcode->AtomicX2 ?
1853 Ty.getSizeInBits() == 128 :
1854 Ty.getSizeInBits() == 64;
1855
1856 if (BaseOpcode->AtomicX2) {
1857 assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
1858
1859 DMask = Is64Bit ? 0xf : 0x3;
1860 NumVDataDwords = Is64Bit ? 4 : 2;
1861 } else {
1862 DMask = Is64Bit ? 0x3 : 0x1;
1863 NumVDataDwords = Is64Bit ? 2 : 1;
1864 }
1865 } else {
1866 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
1867 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
1868
1869 if (BaseOpcode->Store) {
1870 VDataIn = MI.getOperand(1).getReg();
1871 VDataTy = MRI->getType(VDataIn);
1872 NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
1873 } else if (BaseOpcode->NoReturn) {
1874 NumVDataDwords = 0;
1875 } else {
1876 VDataOut = MI.getOperand(0).getReg();
1877 VDataTy = MRI->getType(VDataOut);
1878 NumVDataDwords = DMaskLanes;
1879
1880 if (IsD16 && !STI.hasUnpackedD16VMem())
1881 NumVDataDwords = (DMaskLanes + 1) / 2;
1882 }
1883 }
1884
1885 // Set G16 opcode
1886 if (Subtarget->hasG16() && IsG16) {
1887 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
1888 AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
1889 assert(G16MappingInfo);
1890 IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
1891 }
1892
1893 // TODO: Check this in verifier.
1894 assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
1895
1896 unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
1897 if (BaseOpcode->Atomic)
1898 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
1899 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
1900 AMDGPU::CPol::VOLATILE))
1901 return false;
1902
1903 int NumVAddrRegs = 0;
1904 int NumVAddrDwords = 0;
1905 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
1906 // Skip the $noregs and 0s inserted during legalization.
1907 MachineOperand &AddrOp = MI.getOperand(ArgOffset + I);
1908 if (!AddrOp.isReg())
1909 continue; // XXX - Break?
1910
1911 Register Addr = AddrOp.getReg();
1912 if (!Addr)
1913 break;
1914
1915 ++NumVAddrRegs;
1916 NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
1917 }
1918
1919 // The legalizer preprocessed the intrinsic arguments. If we aren't using
1920 // NSA, these should have been packed into a single value in the first
1921 // address register
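// With the partial NSA encoding, trailing addresses can remain packed in the
// last register, so the dword count may exceed the register count.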
1922 const bool UseNSA =
1923 NumVAddrRegs != 1 &&
1924 (STI.hasPartialNSAEncoding() ? NumVAddrDwords >= NumVAddrRegs
1925 : NumVAddrDwords == NumVAddrRegs);
1926 if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
1927 LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
1928 return false;
1929 }
1930
1931 if (IsTexFail)
1932 ++NumVDataDwords;
1933
1934 int Opcode = -1;
1935 if (IsGFX12Plus) {
1936 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
1937 NumVDataDwords, NumVAddrDwords);
1938 } else if (IsGFX11Plus) {
1939 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
1940 UseNSA ? AMDGPU::MIMGEncGfx11NSA
1941 : AMDGPU::MIMGEncGfx11Default,
1942 NumVDataDwords, NumVAddrDwords);
1943 } else if (IsGFX10Plus) {
1944 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
1945 UseNSA ? AMDGPU::MIMGEncGfx10NSA
1946 : AMDGPU::MIMGEncGfx10Default,
1947 NumVDataDwords, NumVAddrDwords);
1948 } else {
1949 if (Subtarget->hasGFX90AInsts()) {
1950 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
1951 NumVDataDwords, NumVAddrDwords);
1952 if (Opcode == -1) {
1953 LLVM_DEBUG(
1954 dbgs()
1955 << "requested image instruction is not supported on this GPU\n");
1956 return false;
1957 }
1958 }
1959 if (Opcode == -1 &&
1960 STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
1961 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
1962 NumVDataDwords, NumVAddrDwords);
1963 if (Opcode == -1)
1964 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
1965 NumVDataDwords, NumVAddrDwords);
1966 }
1967 if (Opcode == -1)
1968 return false;
1969
1970 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
1971 .cloneMemRefs(MI);
1972
1973 if (VDataOut) {
1974 if (BaseOpcode->AtomicX2) {
1975 const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;
1976
1977 Register TmpReg = MRI->createVirtualRegister(
1978 Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
1979 unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
1980
1981 MIB.addDef(TmpReg);
1982 if (!MRI->use_empty(VDataOut)) {
1983 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
1984 .addReg(TmpReg, RegState::Kill, SubReg);
1985 }
1986
1987 } else {
1988 MIB.addDef(VDataOut); // vdata output
1989 }
1990 }
1991
1992 if (VDataIn)
1993 MIB.addReg(VDataIn); // vdata input
1994
1995 for (int I = 0; I != NumVAddrRegs; ++I) {
1996 MachineOperand &SrcOp = MI.getOperand(ArgOffset + Intr->VAddrStart + I);
1997 if (SrcOp.isReg()) {
1998 assert(SrcOp.getReg() != 0);
1999 MIB.addReg(SrcOp.getReg());
2000 }
2001 }
2002
2003 MIB.addReg(MI.getOperand(ArgOffset + Intr->RsrcIndex).getReg());
2004 if (BaseOpcode->Sampler)
2005 MIB.addReg(MI.getOperand(ArgOffset + Intr->SampIndex).getReg());
2006
2007 MIB.addImm(DMask); // dmask
2008
2009 if (IsGFX10Plus)
2010 MIB.addImm(DimInfo->Encoding);
2011 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::unorm))
2012 MIB.addImm(Unorm);
2013
2014 MIB.addImm(CPol);
2015 MIB.addImm(IsA16 && // a16 or r128
2016 STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
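// This operand is r128, or the a16 flag on subtargets with FeatureR128A16;
// GFX10+ carries a16 as its own operand, added below.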
2017 if (IsGFX10Plus)
2018 MIB.addImm(IsA16 ? -1 : 0);
2019
2020 if (!Subtarget->hasGFX90AInsts()) {
2021 MIB.addImm(TFE); // tfe
2022 } else if (TFE) {
2023 LLVM_DEBUG(dbgs() << "TFE is not supported on this GPU\n");
2024 return false;
2025 }
2026
2027 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::lwe))
2028 MIB.addImm(LWE); // lwe
2029 if (!IsGFX10Plus)
2030 MIB.addImm(DimInfo->DA ? -1 : 0);
2031 if (BaseOpcode->HasD16)
2032 MIB.addImm(IsD16 ? -1 : 0);
2033
2034 MI.eraseFromParent();
2035 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2036 TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::vaddr);
2037 return true;
2038 }
2039
2040 // We need to handle this here because tablegen doesn't support matching
2041 // instructions with multiple outputs.
2042 bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic(
2043 MachineInstr &MI) const {
2044 Register Dst0 = MI.getOperand(0).getReg();
2045 Register Dst1 = MI.getOperand(1).getReg();
2046
2047 const DebugLoc &DL = MI.getDebugLoc();
2048 MachineBasicBlock *MBB = MI.getParent();
2049
2050 Register Addr = MI.getOperand(3).getReg();
2051 Register Data0 = MI.getOperand(4).getReg();
2052 Register Data1 = MI.getOperand(5).getReg();
2053 unsigned Offset = MI.getOperand(6).getImm();
2054
2055 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_BVH_STACK_RTN_B32), Dst0)
2056 .addDef(Dst1)
2057 .addUse(Addr)
2058 .addUse(Data0)
2059 .addUse(Data1)
2060 .addImm(Offset)
2061 .cloneMemRefs(MI);
2062
2063 MI.eraseFromParent();
2064 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2065 }
2066
2067 bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
2068 MachineInstr &I) const {
2069 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
2070 switch (IntrinsicID) {
2071 case Intrinsic::amdgcn_end_cf:
2072 return selectEndCfIntrinsic(I);
2073 case Intrinsic::amdgcn_ds_ordered_add:
2074 case Intrinsic::amdgcn_ds_ordered_swap:
2075 return selectDSOrderedIntrinsic(I, IntrinsicID);
2076 case Intrinsic::amdgcn_ds_gws_init:
2077 case Intrinsic::amdgcn_ds_gws_barrier:
2078 case Intrinsic::amdgcn_ds_gws_sema_v:
2079 case Intrinsic::amdgcn_ds_gws_sema_br:
2080 case Intrinsic::amdgcn_ds_gws_sema_p:
2081 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2082 return selectDSGWSIntrinsic(I, IntrinsicID);
2083 case Intrinsic::amdgcn_ds_append:
2084 return selectDSAppendConsume(I, true);
2085 case Intrinsic::amdgcn_ds_consume:
2086 return selectDSAppendConsume(I, false);
2087 case Intrinsic::amdgcn_s_barrier:
2088 return selectSBarrier(I);
2089 case Intrinsic::amdgcn_raw_buffer_load_lds:
2090 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
2091 case Intrinsic::amdgcn_struct_buffer_load_lds:
2092 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
2093 return selectBufferLoadLds(I);
2094 case Intrinsic::amdgcn_global_load_lds:
2095 return selectGlobalLoadLds(I);
2096 case Intrinsic::amdgcn_exp_compr:
2097 if (!STI.hasCompressedExport()) {
2098 Function &F = I.getMF()->getFunction();
2099 DiagnosticInfoUnsupported NoFpRet(
2100 F, "intrinsic not supported on subtarget", I.getDebugLoc(), DS_Error);
2101 F.getContext().diagnose(NoFpRet);
2102 return false;
2103 }
2104 break;
2105 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2106 return selectDSBvhStackIntrinsic(I);
2107 case Intrinsic::amdgcn_s_barrier_init:
2108 case Intrinsic::amdgcn_s_barrier_join:
2109 case Intrinsic::amdgcn_s_wakeup_barrier:
2110 case Intrinsic::amdgcn_s_get_barrier_state:
2111 return selectNamedBarrierInst(I, IntrinsicID);
2112 case Intrinsic::amdgcn_s_barrier_signal_isfirst:
2113 case Intrinsic::amdgcn_s_barrier_signal_isfirst_var:
2114 return selectSBarrierSignalIsfirst(I, IntrinsicID);
2115 case Intrinsic::amdgcn_s_barrier_leave:
2116 return selectSBarrierLeave(I);
2117 }
2118 return selectImpl(I, *CoverageInfo);
2119 }
2120
2121 bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
2122 if (selectImpl(I, *CoverageInfo))
2123 return true;
2124
2125 MachineBasicBlock *BB = I.getParent();
2126 const DebugLoc &DL = I.getDebugLoc();
2127
2128 Register DstReg = I.getOperand(0).getReg();
2129 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
2130 assert(Size <= 32 || Size == 64);
2131 const MachineOperand &CCOp = I.getOperand(1);
2132 Register CCReg = CCOp.getReg();
2133 if (!isVCC(CCReg, *MRI)) {
2134 unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
2135 AMDGPU::S_CSELECT_B32;
2136 MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
2137 .addReg(CCReg);
2138
2139 // The generic constrainSelectedInstRegOperands doesn't work for the scc
2140 // register bank, because it does not cover the register class used to
2141 // represent it, so the register class has to be set manually here.
2142 if (!MRI->getRegClassOrNull(CCReg))
2143 MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
2144 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
2145 .add(I.getOperand(2))
2146 .add(I.getOperand(3));
2147
2148 bool Ret = false;
2149 Ret |= constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
2150 Ret |= constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
2151 I.eraseFromParent();
2152 return Ret;
2153 }
2154
2155 // Wide VGPR select should have been split in RegBankSelect.
2156 if (Size > 32)
2157 return false;
2158
2159 MachineInstr *Select =
2160 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2161 .addImm(0)
2162 .add(I.getOperand(3))
2163 .addImm(0)
2164 .add(I.getOperand(2))
2165 .add(I.getOperand(1));
2166
2167 bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
2168 I.eraseFromParent();
2169 return Ret;
2170 }
2171
2172 static int sizeToSubRegIndex(unsigned Size) {
2173 switch (Size) {
2174 case 32:
2175 return AMDGPU::sub0;
2176 case 64:
2177 return AMDGPU::sub0_sub1;
2178 case 96:
2179 return AMDGPU::sub0_sub1_sub2;
2180 case 128:
2181 return AMDGPU::sub0_sub1_sub2_sub3;
2182 case 256:
2183 return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7;
2184 default:
2185 if (Size < 32)
2186 return AMDGPU::sub0;
2187 if (Size > 256)
2188 return -1;
2189 return sizeToSubRegIndex(llvm::bit_ceil(Size));
2190 }
2191 }
2192
2193 bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
2194 Register DstReg = I.getOperand(0).getReg();
2195 Register SrcReg = I.getOperand(1).getReg();
2196 const LLT DstTy = MRI->getType(DstReg);
2197 const LLT SrcTy = MRI->getType(SrcReg);
2198 const LLT S1 = LLT::scalar(1);
2199
2200 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2201 const RegisterBank *DstRB;
2202 if (DstTy == S1) {
2203 // This is a special case. We don't treat s1 for legalization artifacts as
2204 // vcc booleans.
2205 DstRB = SrcRB;
2206 } else {
2207 DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2208 if (SrcRB != DstRB)
2209 return false;
2210 }
2211
2212 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
2213
2214 unsigned DstSize = DstTy.getSizeInBits();
2215 unsigned SrcSize = SrcTy.getSizeInBits();
2216
2217 const TargetRegisterClass *SrcRC =
2218 TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB);
2219 const TargetRegisterClass *DstRC =
2220 TRI.getRegClassForSizeOnBank(DstSize, *DstRB);
2221 if (!SrcRC || !DstRC)
2222 return false;
2223
2224 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2225 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
2226 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
2227 return false;
2228 }
2229
2230 if (DstTy == LLT::fixed_vector(2, 16) && SrcTy == LLT::fixed_vector(2, 32)) {
2231 MachineBasicBlock *MBB = I.getParent();
2232 const DebugLoc &DL = I.getDebugLoc();
2233
2234 Register LoReg = MRI->createVirtualRegister(DstRC);
2235 Register HiReg = MRI->createVirtualRegister(DstRC);
2236 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
2237 .addReg(SrcReg, 0, AMDGPU::sub0);
2238 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
2239 .addReg(SrcReg, 0, AMDGPU::sub1);
2240
2241 if (IsVALU && STI.hasSDWA()) {
2242 // Write the low 16-bits of the high element into the high 16-bits of the
2243 // low element.
2244 MachineInstr *MovSDWA =
2245 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2246 .addImm(0) // $src0_modifiers
2247 .addReg(HiReg) // $src0
2248 .addImm(0) // $clamp
2249 .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel
2250 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2251 .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel
2252 .addReg(LoReg, RegState::Implicit);
2253 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2254 } else {
2255 Register TmpReg0 = MRI->createVirtualRegister(DstRC);
2256 Register TmpReg1 = MRI->createVirtualRegister(DstRC);
2257 Register ImmReg = MRI->createVirtualRegister(DstRC);
2258 if (IsVALU) {
2259 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
2260 .addImm(16)
2261 .addReg(HiReg);
2262 } else {
2263 BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
2264 .addReg(HiReg)
2265 .addImm(16)
2266 .setOperandDead(3); // Dead scc
2267 }
2268
2269 unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
2270 unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2271 unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
2272
2273 BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
2274 .addImm(0xffff);
2275 auto And = BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
2276 .addReg(LoReg)
2277 .addReg(ImmReg);
2278 auto Or = BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
2279 .addReg(TmpReg0)
2280 .addReg(TmpReg1);
2281
2282 if (!IsVALU) {
2283 And.setOperandDead(3); // Dead scc
2284 Or.setOperandDead(3); // Dead scc
2285 }
2286 }
2287
2288 I.eraseFromParent();
2289 return true;
2290 }
2291
2292 if (!DstTy.isScalar())
2293 return false;
2294
2295 if (SrcSize > 32) {
2296 int SubRegIdx = sizeToSubRegIndex(DstSize);
2297 if (SubRegIdx == -1)
2298 return false;
2299
2300 // Deal with weird cases where the class only partially supports the subreg
2301 // index.
2302 const TargetRegisterClass *SrcWithSubRC
2303 = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
2304 if (!SrcWithSubRC)
2305 return false;
2306
2307 if (SrcWithSubRC != SrcRC) {
2308 if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
2309 return false;
2310 }
2311
2312 I.getOperand(1).setSubReg(SubRegIdx);
2313 }
2314
2315 I.setDesc(TII.get(TargetOpcode::COPY));
2316 return true;
2317 }
2318
2319 /// \returns true if a bitmask for \p Size bits will be an inline immediate.
2320 static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
2321 Mask = maskTrailingOnes<unsigned>(Size);
2322 int SignedMask = static_cast<int>(Mask);
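// [-16, 64] is the integer inline-immediate range, so such a mask avoids a
// literal operand.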
2323 return SignedMask >= -16 && SignedMask <= 64;
2324 }
2325
2326 // Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
2327 const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
2328 Register Reg, const MachineRegisterInfo &MRI,
2329 const TargetRegisterInfo &TRI) const {
2330 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
2331 if (auto *RB = RegClassOrBank.dyn_cast<const RegisterBank *>())
2332 return RB;
2333
2334 // Ignore the type, since we don't use vcc in artifacts.
2335 if (auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>())
2336 return &RBI.getRegBankFromRegClass(*RC, LLT());
2337 return nullptr;
2338 }
2339
2340 bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
2341 bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
2342 bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
2343 const DebugLoc &DL = I.getDebugLoc();
2344 MachineBasicBlock &MBB = *I.getParent();
2345 const Register DstReg = I.getOperand(0).getReg();
2346 const Register SrcReg = I.getOperand(1).getReg();
2347
2348 const LLT DstTy = MRI->getType(DstReg);
2349 const LLT SrcTy = MRI->getType(SrcReg);
2350 const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
2351 I.getOperand(2).getImm() : SrcTy.getSizeInBits();
2352 const unsigned DstSize = DstTy.getSizeInBits();
2353 if (!DstTy.isScalar())
2354 return false;
2355
2356 // Artifact casts should never use vcc.
2357 const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);
2358
2359 // FIXME: This should probably be illegal and split earlier.
2360 if (I.getOpcode() == AMDGPU::G_ANYEXT) {
2361 if (DstSize <= 32)
2362 return selectCOPY(I);
2363
2364 const TargetRegisterClass *SrcRC =
2365 TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank);
2366 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
2367 const TargetRegisterClass *DstRC =
2368 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
2369
2370 Register UndefReg = MRI->createVirtualRegister(SrcRC);
2371 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2372 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2373 .addReg(SrcReg)
2374 .addImm(AMDGPU::sub0)
2375 .addReg(UndefReg)
2376 .addImm(AMDGPU::sub1);
2377 I.eraseFromParent();
2378
2379 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&
2380 RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);
2381 }
2382
2383 if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
2384 // 64-bit should have been split up in RegBankSelect
2385
2386 // Try to use an and with a mask if it will save code size.
2387 unsigned Mask;
2388 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2389 MachineInstr *ExtI =
2390 BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
2391 .addImm(Mask)
2392 .addReg(SrcReg);
2393 I.eraseFromParent();
2394 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2395 }
2396
2397 const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2398 MachineInstr *ExtI =
2399 BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
2400 .addReg(SrcReg)
2401 .addImm(0) // Offset
2402 .addImm(SrcSize); // Width
2403 I.eraseFromParent();
2404 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2405 }
2406
2407 if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
2408 const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
2409 AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
2410 if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
2411 return false;
2412
2413 if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
2414 const unsigned SextOpc = SrcSize == 8 ?
2415 AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
2416 BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
2417 .addReg(SrcReg);
2418 I.eraseFromParent();
2419 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2420 }
2421
2422 // Using a single 32-bit SALU to calculate the high half is smaller than
2423 // S_BFE with a literal constant operand.
2424 if (DstSize > 32 && SrcSize == 32) {
2425 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2426 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2427 if (Signed) {
2428 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_ASHR_I32), HiReg)
2429 .addReg(SrcReg, 0, SubReg)
2430 .addImm(31)
2431 .setOperandDead(3); // Dead scc
2432 } else {
2433 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg)
2434 .addImm(0);
2435 }
2436 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2437 .addReg(SrcReg, 0, SubReg)
2438 .addImm(AMDGPU::sub0)
2439 .addReg(HiReg)
2440 .addImm(AMDGPU::sub1);
2441 I.eraseFromParent();
2442 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass,
2443 *MRI);
2444 }
2445
2446 const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
2447 const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2448
2449 // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16] = width.
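// For example, a 16-bit extend uses the immediate (16 << 16) = 0x100000:
// width 16, offset 0.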
2450 if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
2451 // We need a 64-bit register source, but the high bits don't matter.
2452 Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
2453 Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2454 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2455
2456 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2457 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
2458 .addReg(SrcReg, 0, SubReg)
2459 .addImm(AMDGPU::sub0)
2460 .addReg(UndefReg)
2461 .addImm(AMDGPU::sub1);
2462
2463 BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
2464 .addReg(ExtReg)
2465 .addImm(SrcSize << 16);
2466
2467 I.eraseFromParent();
2468 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
2469 }
2470
2471 unsigned Mask;
2472 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2473 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
2474 .addReg(SrcReg)
2475 .addImm(Mask)
2476 .setOperandDead(3); // Dead scc
2477 } else {
2478 BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
2479 .addReg(SrcReg)
2480 .addImm(SrcSize << 16);
2481 }
2482
2483 I.eraseFromParent();
2484 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2485 }
2486
2487 return false;
2488 }
2489
2490 static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In,
2491 Register &Out) {
2492 Register LShlSrc;
2493 if (mi_match(In, MRI,
2494 m_GTrunc(m_GLShr(m_Reg(LShlSrc), m_SpecificICst(16))))) {
2495 Out = LShlSrc;
2496 return true;
2497 }
2498 return false;
2499 }
2500
2501 bool AMDGPUInstructionSelector::selectG_FPEXT(MachineInstr &I) const {
2502 if (!Subtarget->hasSALUFloatInsts())
2503 return false;
2504
2505 Register Dst = I.getOperand(0).getReg();
2506 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2507 if (DstRB->getID() != AMDGPU::SGPRRegBankID)
2508 return false;
2509
2510 Register Src = I.getOperand(1).getReg();
2511
2512 if (MRI->getType(Dst) == LLT::scalar(32) &&
2513 MRI->getType(Src) == LLT::scalar(16)) {
2514 if (isExtractHiElt(*MRI, Src, Src)) {
2515 MachineBasicBlock *BB = I.getParent();
2516 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_CVT_HI_F32_F16), Dst)
2517 .addUse(Src);
2518 I.eraseFromParent();
2519 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
2520 }
2521 }
2522
2523 return false;
2524 }
2525
2526 bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
2527 MachineBasicBlock *BB = I.getParent();
2528 MachineOperand &ImmOp = I.getOperand(1);
2529 Register DstReg = I.getOperand(0).getReg();
2530 unsigned Size = MRI->getType(DstReg).getSizeInBits();
2531 bool IsFP = false;
2532
2533 // The AMDGPU backend only supports Imm operands and not CImm or FPImm.
2534 if (ImmOp.isFPImm()) {
2535 const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt();
2536 ImmOp.ChangeToImmediate(Imm.getZExtValue());
2537 IsFP = true;
2538 } else if (ImmOp.isCImm()) {
2539 ImmOp.ChangeToImmediate(ImmOp.getCImm()->getSExtValue());
2540 } else {
2541 llvm_unreachable("Not supported by g_constants");
2542 }
2543
2544 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2545 const bool IsSgpr = DstRB->getID() == AMDGPU::SGPRRegBankID;
2546
2547 unsigned Opcode;
2548 if (DstRB->getID() == AMDGPU::VCCRegBankID) {
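// A constant on the VCC bank is a wave mask, so it needs the wave-sized
// scalar move.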
2549 Opcode = STI.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
2550 } else if (Size == 64 &&
2551 AMDGPU::isValid32BitLiteral(I.getOperand(1).getImm(), IsFP)) {
2552 Opcode = IsSgpr ? AMDGPU::S_MOV_B64_IMM_PSEUDO : AMDGPU::V_MOV_B64_PSEUDO;
2553 I.setDesc(TII.get(Opcode));
2554 I.addImplicitDefUseOperands(*MF);
2555 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2556 } else {
2557 Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
2558
2559 // We should never produce s1 values on banks other than VCC. If the user of
2560 // this already constrained the register, we may incorrectly think it's VCC
2561 // if it wasn't originally.
2562 if (Size == 1)
2563 return false;
2564 }
2565
2566 if (Size != 64) {
2567 I.setDesc(TII.get(Opcode));
2568 I.addImplicitDefUseOperands(*MF);
2569 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2570 }
2571
2572 const DebugLoc &DL = I.getDebugLoc();
2573
2574 APInt Imm(Size, I.getOperand(1).getImm());
2575
2576 MachineInstr *ResInst;
2577 if (IsSgpr && TII.isInlineConstant(Imm)) {
2578 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
2579 .addImm(I.getOperand(1).getImm());
2580 } else {
2581 const TargetRegisterClass *RC = IsSgpr ?
2582 &AMDGPU::SReg_32RegClass : &AMDGPU::VGPR_32RegClass;
2583 Register LoReg = MRI->createVirtualRegister(RC);
2584 Register HiReg = MRI->createVirtualRegister(RC);
2585
2586 BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg)
2587 .addImm(Imm.trunc(32).getZExtValue());
2588
2589 BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg)
2590 .addImm(Imm.ashr(32).getZExtValue());
2591
2592 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2593 .addReg(LoReg)
2594 .addImm(AMDGPU::sub0)
2595 .addReg(HiReg)
2596 .addImm(AMDGPU::sub1);
2597 }
2598
2599 // We can't call constrainSelectedInstRegOperands here, because it doesn't
2600 // work for target independent opcodes
2601 I.eraseFromParent();
2602 const TargetRegisterClass *DstRC =
2603 TRI.getConstrainedRegClassForOperand(ResInst->getOperand(0), *MRI);
2604 if (!DstRC)
2605 return true;
2606 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI);
2607 }
2608
2609 bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
2610 // Only manually handle the f64 SGPR case.
2611 //
2612 // FIXME: This is a workaround for 2.5 different tablegen problems. Because
2613 // the bit ops theoretically have a second result due to the implicit def of
2614 // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
2615 // that is easy by disabling the check. The result works, but uses a
2616 // nonsensical sreg32orlds_and_sreg_1 regclass.
2617 //
2618 // The DAG emitter is more problematic, and incorrectly adds both results of
2619 // the S_XOR_B32 to the variadic REG_SEQUENCE operands.
2620
2621 Register Dst = MI.getOperand(0).getReg();
2622 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2623 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2624 MRI->getType(Dst) != LLT::scalar(64))
2625 return false;
2626
2627 Register Src = MI.getOperand(1).getReg();
2628 MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
2629 if (Fabs)
2630 Src = Fabs->getOperand(1).getReg();
2631
2632 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2633 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2634 return false;
2635
2636 MachineBasicBlock *BB = MI.getParent();
2637 const DebugLoc &DL = MI.getDebugLoc();
2638 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2639 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2640 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2641 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2642
2643 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2644 .addReg(Src, 0, AMDGPU::sub0);
2645 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2646 .addReg(Src, 0, AMDGPU::sub1);
2647 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2648 .addImm(0x80000000);
2649
2650 // Set or toggle sign bit.
2651 unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
2652 BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
2653 .addReg(HiReg)
2654 .addReg(ConstReg)
2655 .setOperandDead(3); // Dead scc
2656 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2657 .addReg(LoReg)
2658 .addImm(AMDGPU::sub0)
2659 .addReg(OpReg)
2660 .addImm(AMDGPU::sub1);
2661 MI.eraseFromParent();
2662 return true;
2663 }
2664
2665 // FIXME: This is a workaround for the same tablegen problems as G_FNEG
2666 bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
2667 Register Dst = MI.getOperand(0).getReg();
2668 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2669 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2670 MRI->getType(Dst) != LLT::scalar(64))
2671 return false;
2672
2673 Register Src = MI.getOperand(1).getReg();
2674 MachineBasicBlock *BB = MI.getParent();
2675 const DebugLoc &DL = MI.getDebugLoc();
2676 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2677 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2678 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2679 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2680
2681 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2682 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2683 return false;
2684
2685 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2686 .addReg(Src, 0, AMDGPU::sub0);
2687 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2688 .addReg(Src, 0, AMDGPU::sub1);
2689 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2690 .addImm(0x7fffffff);
2691
2692 // Clear sign bit.
2693 // TODO: Should this use S_BITSET0_*?
2694 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
2695 .addReg(HiReg)
2696 .addReg(ConstReg)
2697 .setOperandDead(3); // Dead scc
2698 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2699 .addReg(LoReg)
2700 .addImm(AMDGPU::sub0)
2701 .addReg(OpReg)
2702 .addImm(AMDGPU::sub1);
2703
2704 MI.eraseFromParent();
2705 return true;
2706 }
2707
2708 static bool isConstant(const MachineInstr &MI) {
2709 return MI.getOpcode() == TargetOpcode::G_CONSTANT;
2710 }
2711
2712 void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
2713 const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
2714
2715 unsigned OpNo = Load.getOpcode() == AMDGPU::G_PREFETCH ? 0 : 1;
2716 const MachineInstr *PtrMI =
2717 MRI.getUniqueVRegDef(Load.getOperand(OpNo).getReg());
2718
2719 assert(PtrMI);
2720
2721 if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
2722 return;
2723
2724 GEPInfo GEPInfo;
2725
2726 for (unsigned i = 1; i != 3; ++i) {
2727 const MachineOperand &GEPOp = PtrMI->getOperand(i);
2728 const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
2729 assert(OpDef);
2730 if (i == 2 && isConstant(*OpDef)) {
2731 // TODO: Could handle constant base + variable offset, but a combine
2732 // probably should have commuted it.
2733 assert(GEPInfo.Imm == 0);
2734 GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
2735 continue;
2736 }
2737 const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
2738 if (OpBank->getID() == AMDGPU::SGPRRegBankID)
2739 GEPInfo.SgprParts.push_back(GEPOp.getReg());
2740 else
2741 GEPInfo.VgprParts.push_back(GEPOp.getReg());
2742 }
2743
2744 AddrInfo.push_back(GEPInfo);
2745 getAddrModeInfo(*PtrMI, MRI, AddrInfo);
2746 }
2747
2748 bool AMDGPUInstructionSelector::isSGPR(Register Reg) const {
2749 return RBI.getRegBank(Reg, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
2750 }
2751
2752 bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
2753 if (!MI.hasOneMemOperand())
2754 return false;
2755
2756 const MachineMemOperand *MMO = *MI.memoperands_begin();
2757 const Value *Ptr = MMO->getValue();
2758
2759 // UndefValue means this is a load of a kernel input. These are uniform.
2760 // Sometimes LDS instructions have constant pointers.
2761 // If Ptr is null, then that means this mem operand contains a
2762 // PseudoSourceValue like GOT.
2763 if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
2764 isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
2765 return true;
2766
2767 if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
2768 return true;
2769
2770 if (MI.getOpcode() == AMDGPU::G_PREFETCH)
2771 return RBI.getRegBank(MI.getOperand(0).getReg(), *MRI, TRI)->getID() ==
2772 AMDGPU::SGPRRegBankID;
2773
2774 const Instruction *I = dyn_cast<Instruction>(Ptr);
2775 return I && I->getMetadata("amdgpu.uniform");
2776 }
2777
2778 bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
2779 for (const GEPInfo &GEPInfo : AddrInfo) {
2780 if (!GEPInfo.VgprParts.empty())
2781 return true;
2782 }
2783 return false;
2784 }
2785
2786 void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
2787 const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
2788 unsigned AS = PtrTy.getAddressSpace();
2789 if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
2790 STI.ldsRequiresM0Init()) {
2791 MachineBasicBlock *BB = I.getParent();
2792
2793 // If DS instructions require M0 initialization, insert it before selecting.
2794 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2795 .addImm(-1);
2796 }
2797 }
2798
2799 bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
2800 MachineInstr &I) const {
2801 initM0(I);
2802 return selectImpl(I, *CoverageInfo);
2803 }
2804
2805 static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI) {
2806 if (Reg.isPhysical())
2807 return false;
2808
2809 MachineInstr &MI = *MRI.getUniqueVRegDef(Reg);
2810 const unsigned Opcode = MI.getOpcode();
2811
2812 if (Opcode == AMDGPU::COPY)
2813 return isVCmpResult(MI.getOperand(1).getReg(), MRI);
2814
2815 if (Opcode == AMDGPU::G_AND || Opcode == AMDGPU::G_OR ||
2816 Opcode == AMDGPU::G_XOR)
2817 return isVCmpResult(MI.getOperand(1).getReg(), MRI) &&
2818 isVCmpResult(MI.getOperand(2).getReg(), MRI);
2819
2820 if (auto *GI = dyn_cast<GIntrinsic>(&MI))
2821 return GI->is(Intrinsic::amdgcn_class);
2822
2823 return Opcode == AMDGPU::G_ICMP || Opcode == AMDGPU::G_FCMP;
2824 }
2825
2826 bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
2827 MachineBasicBlock *BB = I.getParent();
2828 MachineOperand &CondOp = I.getOperand(0);
2829 Register CondReg = CondOp.getReg();
2830 const DebugLoc &DL = I.getDebugLoc();
2831
2832 unsigned BrOpcode;
2833 Register CondPhysReg;
2834 const TargetRegisterClass *ConstrainRC;
2835
2836 // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
2837 // whether the branch is uniform when selecting the instruction. In
2838 // GlobalISel, we should push that decision into RegBankSelect. Assume for now
2839 // RegBankSelect knows what it's doing if the branch condition is scc, even
2840 // though it currently does not.
2841 if (!isVCC(CondReg, *MRI)) {
2842 if (MRI->getType(CondReg) != LLT::scalar(32))
2843 return false;
2844
2845 CondPhysReg = AMDGPU::SCC;
2846 BrOpcode = AMDGPU::S_CBRANCH_SCC1;
2847 ConstrainRC = &AMDGPU::SReg_32RegClass;
2848 } else {
2849 // FIXME: Should scc->vcc copies and with exec?
2850
2851 // Unless the value of CondReg is a result of a V_CMP* instruction, we need
2852 // to insert an and with exec.
2853 if (!isVCmpResult(CondReg, *MRI)) {
2854 const bool Is64 = STI.isWave64();
2855 const unsigned Opcode = Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
2856 const Register Exec = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
2857
2858 Register TmpReg = MRI->createVirtualRegister(TRI.getBoolRC());
2859 BuildMI(*BB, &I, DL, TII.get(Opcode), TmpReg)
2860 .addReg(CondReg)
2861 .addReg(Exec)
2862 .setOperandDead(3); // Dead scc
2863 CondReg = TmpReg;
2864 }
2865
2866 CondPhysReg = TRI.getVCC();
2867 BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
2868 ConstrainRC = TRI.getBoolRC();
2869 }
2870
2871 if (!MRI->getRegClassOrNull(CondReg))
2872 MRI->setRegClass(CondReg, ConstrainRC);
2873
2874 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
2875 .addReg(CondReg);
2876 BuildMI(*BB, &I, DL, TII.get(BrOpcode))
2877 .addMBB(I.getOperand(1).getMBB());
2878
2879 I.eraseFromParent();
2880 return true;
2881 }
2882
2883 bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
2884 MachineInstr &I) const {
2885 Register DstReg = I.getOperand(0).getReg();
2886 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2887 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2888 I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
2889 if (IsVGPR)
2890 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
2891
2892 return RBI.constrainGenericRegister(
2893 DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
2894 }
2895
2896 bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
2897 Register DstReg = I.getOperand(0).getReg();
2898 Register SrcReg = I.getOperand(1).getReg();
2899 Register MaskReg = I.getOperand(2).getReg();
2900 LLT Ty = MRI->getType(DstReg);
2901 LLT MaskTy = MRI->getType(MaskReg);
2902 MachineBasicBlock *BB = I.getParent();
2903 const DebugLoc &DL = I.getDebugLoc();
2904
2905 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2906 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2907 const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
2908 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2909 if (DstRB != SrcRB) // Should only happen for hand written MIR.
2910 return false;
2911
2912 // Try to avoid emitting a bit operation when we only need to touch half of
2913 // the 64-bit pointer.
2914 APInt MaskOnes = KB->getKnownOnes(MaskReg).zext(64);
2915 const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
2916 const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
2917
2918 const bool CanCopyLow32 = (MaskOnes & MaskLo32) == MaskLo32;
2919 const bool CanCopyHi32 = (MaskOnes & MaskHi32) == MaskHi32;
2920
2921 if (!IsVGPR && Ty.getSizeInBits() == 64 &&
2922 !CanCopyLow32 && !CanCopyHi32) {
2923 auto MIB = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_AND_B64), DstReg)
2924 .addReg(SrcReg)
2925 .addReg(MaskReg)
2926 .setOperandDead(3); // Dead scc
2927 I.eraseFromParent();
2928 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2929 }
2930
2931 unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2932 const TargetRegisterClass &RegRC
2933 = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
2934
2935 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB);
2936 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB);
2937 const TargetRegisterClass *MaskRC =
2938 TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB);
2939
2940 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
2941 !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2942 !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
2943 return false;
2944
2945 if (Ty.getSizeInBits() == 32) {
2946 assert(MaskTy.getSizeInBits() == 32 &&
2947 "ptrmask should have been narrowed during legalize");
2948
2949 auto NewOp = BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
2950 .addReg(SrcReg)
2951 .addReg(MaskReg);
2952
2953 if (!IsVGPR)
2954 NewOp.setOperandDead(3); // Dead scc
2955 I.eraseFromParent();
2956 return true;
2957 }
2958
2959 Register HiReg = MRI->createVirtualRegister(&RegRC);
2960 Register LoReg = MRI->createVirtualRegister(&RegRC);
2961
2962 // Extract the subregisters from the source pointer.
2963 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
2964 .addReg(SrcReg, 0, AMDGPU::sub0);
2965 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
2966 .addReg(SrcReg, 0, AMDGPU::sub1);
2967
2968 Register MaskedLo, MaskedHi;
2969
2970 if (CanCopyLow32) {
2971 // If all the bits in the low half are 1, we only need a copy for it.
2972 MaskedLo = LoReg;
2973 } else {
2974 // Extract the mask subregister and apply the and.
2975 Register MaskLo = MRI->createVirtualRegister(&RegRC);
2976 MaskedLo = MRI->createVirtualRegister(&RegRC);
2977
2978 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
2979 .addReg(MaskReg, 0, AMDGPU::sub0);
2980 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
2981 .addReg(LoReg)
2982 .addReg(MaskLo);
2983 }
2984
2985 if (CanCopyHi32) {
2986 // If all the bits in the high half are 1, we only need a copy for it.
2987 MaskedHi = HiReg;
2988 } else {
2989 Register MaskHi = MRI->createVirtualRegister(&RegRC);
2990 MaskedHi = MRI->createVirtualRegister(&RegRC);
2991
2992 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
2993 .addReg(MaskReg, 0, AMDGPU::sub1);
2994 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
2995 .addReg(HiReg)
2996 .addReg(MaskHi);
2997 }
2998
2999 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
3000 .addReg(MaskedLo)
3001 .addImm(AMDGPU::sub0)
3002 .addReg(MaskedHi)
3003 .addImm(AMDGPU::sub1);
3004 I.eraseFromParent();
3005 return true;
3006 }
3007
3008 /// Return the register to use for the index value, and the subregister to use
3009 /// for the indirectly accessed register.
3010 static std::pair<Register, unsigned>
3011 computeIndirectRegIndex(MachineRegisterInfo &MRI, const SIRegisterInfo &TRI,
3012 const TargetRegisterClass *SuperRC, Register IdxReg,
3013 unsigned EltSize, GISelKnownBits &KnownBits) {
3014 Register IdxBaseReg;
3015 int Offset;
3016
3017 std::tie(IdxBaseReg, Offset) =
3018 AMDGPU::getBaseWithConstantOffset(MRI, IdxReg, &KnownBits);
3019 if (IdxBaseReg == AMDGPU::NoRegister) {
3020 // This will happen if the index is a known constant. This should ordinarily
3021 // be legalized out, but handle it as a register just in case.
3022 assert(Offset == 0);
3023 IdxBaseReg = IdxReg;
3024 }
3025
3026 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);
3027
3028 // Skip out of bounds offsets, or else we would end up using an undefined
3029 // register.
3030 if (static_cast<unsigned>(Offset) >= SubRegs.size())
3031 return std::pair(IdxReg, SubRegs[0]);
3032 return std::pair(IdxBaseReg, SubRegs[Offset]);
3033 }
3034
3035 bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
3036 MachineInstr &MI) const {
3037 Register DstReg = MI.getOperand(0).getReg();
3038 Register SrcReg = MI.getOperand(1).getReg();
3039 Register IdxReg = MI.getOperand(2).getReg();
3040
3041 LLT DstTy = MRI->getType(DstReg);
3042 LLT SrcTy = MRI->getType(SrcReg);
3043
3044 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3045 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
3046 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3047
3048   // The index must be scalar. If it wasn't, RegBankSelect should have moved
3049   // this into a waterfall loop.
3050 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3051 return false;
3052
3053 const TargetRegisterClass *SrcRC =
3054 TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB);
3055 const TargetRegisterClass *DstRC =
3056 TRI.getRegClassForTypeOnBank(DstTy, *DstRB);
3057 if (!SrcRC || !DstRC)
3058 return false;
3059 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
3060 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
3061 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3062 return false;
3063
3064 MachineBasicBlock *BB = MI.getParent();
3065 const DebugLoc &DL = MI.getDebugLoc();
3066 const bool Is64 = DstTy.getSizeInBits() == 64;
3067
3068 unsigned SubReg;
3069 std::tie(IdxReg, SubReg) = computeIndirectRegIndex(
3070 *MRI, TRI, SrcRC, IdxReg, DstTy.getSizeInBits() / 8, *KB);
3071
3072 if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
3073 if (DstTy.getSizeInBits() != 32 && !Is64)
3074 return false;
3075
3076 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3077 .addReg(IdxReg);
3078
3079 unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
3080 BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
3081 .addReg(SrcReg, 0, SubReg)
3082 .addReg(SrcReg, RegState::Implicit);
3083 MI.eraseFromParent();
3084 return true;
3085 }
3086
3087 if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
3088 return false;
3089
3090 if (!STI.useVGPRIndexMode()) {
3091 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3092 .addReg(IdxReg);
3093 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
3094 .addReg(SrcReg, 0, SubReg)
3095 .addReg(SrcReg, RegState::Implicit);
3096 MI.eraseFromParent();
3097 return true;
3098 }
3099
3100 const MCInstrDesc &GPRIDXDesc =
3101 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*SrcRC), true);
3102 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3103 .addReg(SrcReg)
3104 .addReg(IdxReg)
3105 .addImm(SubReg);
3106
3107 MI.eraseFromParent();
3108 return true;
3109 }
3110
3111 // TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
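// Selection overview for G_INSERT_VECTOR_ELT: the scalar index is copied into
// M0 and the write uses an indirect reg-write movrel pseudo, or the
// GPR-index-mode pseudo when the vector lives in VGPRs and the subtarget
// supports VGPR index mode. On the VGPR path the inserted value must be 32
// bits wide.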
3112 bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
3113 MachineInstr &MI) const {
3114 Register DstReg = MI.getOperand(0).getReg();
3115 Register VecReg = MI.getOperand(1).getReg();
3116 Register ValReg = MI.getOperand(2).getReg();
3117 Register IdxReg = MI.getOperand(3).getReg();
3118
3119 LLT VecTy = MRI->getType(DstReg);
3120 LLT ValTy = MRI->getType(ValReg);
3121 unsigned VecSize = VecTy.getSizeInBits();
3122 unsigned ValSize = ValTy.getSizeInBits();
3123
3124 const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
3125 const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
3126 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3127
3128 assert(VecTy.getElementType() == ValTy);
3129
3130   // The index must be scalar. If it wasn't, RegBankSelect should have moved
3131   // this into a waterfall loop.
3132 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3133 return false;
3134
3135 const TargetRegisterClass *VecRC =
3136 TRI.getRegClassForTypeOnBank(VecTy, *VecRB);
3137 const TargetRegisterClass *ValRC =
3138 TRI.getRegClassForTypeOnBank(ValTy, *ValRB);
3139
3140 if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
3141 !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
3142 !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
3143 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3144 return false;
3145
3146 if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
3147 return false;
3148
3149 unsigned SubReg;
3150 std::tie(IdxReg, SubReg) =
3151 computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg, ValSize / 8, *KB);
3152
3153 const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
3154 STI.useVGPRIndexMode();
3155
3156 MachineBasicBlock *BB = MI.getParent();
3157 const DebugLoc &DL = MI.getDebugLoc();
3158
3159 if (!IndexMode) {
3160 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3161 .addReg(IdxReg);
3162
3163 const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo(
3164 VecSize, ValSize, VecRB->getID() == AMDGPU::SGPRRegBankID);
3165 BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
3166 .addReg(VecReg)
3167 .addReg(ValReg)
3168 .addImm(SubReg);
3169 MI.eraseFromParent();
3170 return true;
3171 }
3172
3173 const MCInstrDesc &GPRIDXDesc =
3174 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
3175 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3176 .addReg(VecReg)
3177 .addReg(ValReg)
3178 .addReg(IdxReg)
3179 .addImm(SubReg);
3180
3181 MI.eraseFromParent();
3182 return true;
3183 }
3184
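// Selects the raw/struct buffer-load-to-LDS intrinsics (pre-GFX12 only). The
// LDS destination address is copied into M0, and the addressing variant
// (OFFSET/OFFEN/IDXEN/BOTHEN) is picked from whether a vindex operand is
// present and whether the voffset is not known to be zero; for the BOTHEN
// form, vindex and voffset are packed into a 64-bit VGPR pair.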
3185 bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
3186 assert(!AMDGPU::isGFX12Plus(STI));
3187 unsigned Opc;
3188 unsigned Size = MI.getOperand(3).getImm();
3189
3190 // The struct intrinsic variants add one additional operand over raw.
3191 const bool HasVIndex = MI.getNumOperands() == 9;
3192 Register VIndex;
3193 int OpOffset = 0;
3194 if (HasVIndex) {
3195 VIndex = MI.getOperand(4).getReg();
3196 OpOffset = 1;
3197 }
3198
3199 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3200 std::optional<ValueAndVReg> MaybeVOffset =
3201 getIConstantVRegValWithLookThrough(VOffset, *MRI);
3202 const bool HasVOffset = !MaybeVOffset || MaybeVOffset->Value.getZExtValue();
3203
3204 switch (Size) {
3205 default:
3206 return false;
3207 case 1:
3208 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
3209 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
3210 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
3211 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
3212 break;
3213 case 2:
3214 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
3215 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
3216 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
3217 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
3218 break;
3219 case 4:
3220 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
3221 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
3222 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
3223 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
3224 break;
3225 }
3226
3227 MachineBasicBlock *MBB = MI.getParent();
3228 const DebugLoc &DL = MI.getDebugLoc();
3229 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3230 .add(MI.getOperand(2));
3231
3232 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc));
3233
3234 if (HasVIndex && HasVOffset) {
3235 Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class());
3236 BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
3237 .addReg(VIndex)
3238 .addImm(AMDGPU::sub0)
3239 .addReg(VOffset)
3240 .addImm(AMDGPU::sub1);
3241
3242 MIB.addReg(IdxReg);
3243 } else if (HasVIndex) {
3244 MIB.addReg(VIndex);
3245 } else if (HasVOffset) {
3246 MIB.addReg(VOffset);
3247 }
3248
3249 MIB.add(MI.getOperand(1)); // rsrc
3250 MIB.add(MI.getOperand(5 + OpOffset)); // soffset
3251 MIB.add(MI.getOperand(6 + OpOffset)); // imm offset
3252 unsigned Aux = MI.getOperand(7 + OpOffset).getImm();
3253 MIB.addImm(Aux & AMDGPU::CPol::ALL); // cpol
3254 MIB.addImm(Aux & AMDGPU::CPol::SWZ_pregfx12 ? 1 : 0); // swz
3255
3256 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3257 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3258 LoadPtrI.Offset = MI.getOperand(6 + OpOffset).getImm();
3259 MachinePointerInfo StorePtrI = LoadPtrI;
3260 StorePtrI.V = nullptr;
3261 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
3262
3263 auto F = LoadMMO->getFlags() &
3264 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3265 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3266 Size, LoadMMO->getBaseAlign());
3267
3268 MachineMemOperand *StoreMMO =
3269 MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3270 sizeof(int32_t), LoadMMO->getBaseAlign());
3271
3272 MIB.setMemRefs({LoadMMO, StoreMMO});
3273
3274 MI.eraseFromParent();
3275 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3276 }
3277
3278 /// Match a zero extend from a 32-bit value to 64-bits.
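/// This recognizes either a G_ZEXT from s32, or the already-legalized form
///   %zext:_(s64) = G_MERGE_VALUES %lo:_(s32), 0
/// and returns the 32-bit source register, or an invalid Register on failure.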
3279 static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
3280 Register ZExtSrc;
3281 if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc))))
3282 return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
3283
3284 // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
3285 const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
3286 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3287 return Register();
3288
3289 assert(Def->getNumOperands() == 3 &&
3290 MRI.getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3291 if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) {
3292 return Def->getOperand(1).getReg();
3293 }
3294
3295 return Register();
3296 }
3297
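// Selects the global.load.lds intrinsic into GLOBAL_LOAD_LDS_*. The LDS
// destination pointer is copied into M0. If the address (or the base of a
// ptr-add whose offset is a zero-extended 32-bit value) is uniform, the SADDR
// form is used, with the remaining offset in a VGPR (zero if there is none).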
3298 bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const {
3299 unsigned Opc;
3300 unsigned Size = MI.getOperand(3).getImm();
3301
3302 switch (Size) {
3303 default:
3304 return false;
3305 case 1:
3306 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
3307 break;
3308 case 2:
3309 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
3310 break;
3311 case 4:
3312 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
3313 break;
3314 }
3315
3316 MachineBasicBlock *MBB = MI.getParent();
3317 const DebugLoc &DL = MI.getDebugLoc();
3318 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3319 .add(MI.getOperand(2));
3320
3321 Register Addr = MI.getOperand(1).getReg();
3322 Register VOffset;
3323 // Try to split SAddr and VOffset. Global and LDS pointers share the same
3324 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
3325 if (!isSGPR(Addr)) {
3326 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
3327 if (isSGPR(AddrDef->Reg)) {
3328 Addr = AddrDef->Reg;
3329 } else if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
3330 Register SAddr =
3331 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
3332 if (isSGPR(SAddr)) {
3333 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
3334 if (Register Off = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
3335 Addr = SAddr;
3336 VOffset = Off;
3337 }
3338 }
3339 }
3340 }
3341
3342 if (isSGPR(Addr)) {
3343 Opc = AMDGPU::getGlobalSaddrOp(Opc);
3344 if (!VOffset) {
3345 VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3346 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
3347 .addImm(0);
3348 }
3349 }
3350
3351 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
3352 .addReg(Addr);
3353
3354 if (isSGPR(Addr))
3355 MIB.addReg(VOffset);
3356
3357 MIB.add(MI.getOperand(4)) // offset
3358 .add(MI.getOperand(5)); // cpol
3359
3360 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3361 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3362 LoadPtrI.Offset = MI.getOperand(4).getImm();
3363 MachinePointerInfo StorePtrI = LoadPtrI;
3364 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
3365 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
3366 auto F = LoadMMO->getFlags() &
3367 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3368 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3369 Size, LoadMMO->getBaseAlign());
3370 MachineMemOperand *StoreMMO =
3371 MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3372 sizeof(int32_t), Align(4));
3373
3374 MIB.setMemRefs({LoadMMO, StoreMMO});
3375
3376 MI.eraseFromParent();
3377 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3378 }
3379
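// The BVH intersect-ray pseudo already carries the concrete target opcode as
// an immediate operand; selection just installs that opcode's descriptor,
// drops the immediate, and re-adds the implicit operands.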
3380 bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const {
3381 MI.setDesc(TII.get(MI.getOperand(1).getImm()));
3382 MI.removeOperand(1);
3383 MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
3384 return true;
3385 }
3386
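// Maps each sparse-matrix FMA (SMFMAC) intrinsic onto its V_SMFMAC_*_e64
// pseudo. The intrinsic ID operand is dropped and the VDst_In operand (the
// tied destination input) is moved to the end of the operand list, where the
// pseudo expects it.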
3387 bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
3388 unsigned Opc;
3389 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
3390 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
3391 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_F16_e64;
3392 break;
3393 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
3394 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_F16_e64;
3395 break;
3396 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
3397 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_BF16_e64;
3398 break;
3399 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
3400 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_BF16_e64;
3401 break;
3402 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
3403 Opc = AMDGPU::V_SMFMAC_I32_16X16X64_I8_e64;
3404 break;
3405 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
3406 Opc = AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64;
3407 break;
3408 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
3409 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_BF8_e64;
3410 break;
3411 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
3412 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_FP8_e64;
3413 break;
3414 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
3415 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_BF8_e64;
3416 break;
3417 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
3418 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_FP8_e64;
3419 break;
3420 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
3421 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_BF8_e64;
3422 break;
3423 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
3424 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_FP8_e64;
3425 break;
3426 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
3427 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_BF8_e64;
3428 break;
3429 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
3430 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_FP8_e64;
3431 break;
3432 default:
3433 llvm_unreachable("unhandled smfmac intrinsic");
3434 }
3435
3436 auto VDst_In = MI.getOperand(4);
3437
3438 MI.setDesc(TII.get(Opc));
3439 MI.removeOperand(4); // VDst_In
3440 MI.removeOperand(1); // Intrinsic ID
3441 MI.addOperand(VDst_In); // Readd VDst_In to the end
3442 MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
3443 return true;
3444 }
3445
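// G_AMDGPU_WAVE_ADDRESS is lowered as a right-shift of the source by
// log2(wavefront size): V_LSHRREV_B32 when the result is on the VGPR bank,
// S_LSHR_B32 (with dead SCC) otherwise.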
3446 bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
3447 Register DstReg = MI.getOperand(0).getReg();
3448 Register SrcReg = MI.getOperand(1).getReg();
3449 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3450 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
3451 MachineBasicBlock *MBB = MI.getParent();
3452 const DebugLoc &DL = MI.getDebugLoc();
3453
3454 if (IsVALU) {
3455 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
3456 .addImm(Subtarget->getWavefrontSizeLog2())
3457 .addReg(SrcReg);
3458 } else {
3459 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
3460 .addReg(SrcReg)
3461 .addImm(Subtarget->getWavefrontSizeLog2())
3462 .setOperandDead(3); // Dead scc
3463 }
3464
3465 const TargetRegisterClass &RC =
3466 IsVALU ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3467 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
3468 return false;
3469
3470 MI.eraseFromParent();
3471 return true;
3472 }
3473
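// For G_STACKRESTORE, copy the saved value into the stack pointer register.
// If the source was produced by G_AMDGPU_WAVE_ADDRESS, its SGPR base is copied
// directly; otherwise the value is first converted back to a wave address with
// an S_LSHR_B32 by log2(wavefront size).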
3474 bool AMDGPUInstructionSelector::selectStackRestore(MachineInstr &MI) const {
3475 Register SrcReg = MI.getOperand(0).getReg();
3476 if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI))
3477 return false;
3478
3479 MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
3480 Register SP =
3481 Subtarget->getTargetLowering()->getStackPointerRegisterToSaveRestore();
3482 Register WaveAddr = getWaveAddress(DefMI);
3483 MachineBasicBlock *MBB = MI.getParent();
3484 const DebugLoc &DL = MI.getDebugLoc();
3485
3486 if (!WaveAddr) {
3487 WaveAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3488 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), WaveAddr)
3489 .addReg(SrcReg)
3490 .addImm(Subtarget->getWavefrontSizeLog2())
3491 .setOperandDead(3); // Dead scc
3492 }
3493
3494 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), SP)
3495 .addReg(WaveAddr);
3496
3497 MI.eraseFromParent();
3498 return true;
3499 }
3500
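// Top-level selection entry point. Generic opcodes are dispatched to the
// manual handlers above; for several opcodes the TableGen'erated selectImpl()
// patterns are tried first and the manual path is only a fallback.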
3501 bool AMDGPUInstructionSelector::select(MachineInstr &I) {
3502
3503 if (!I.isPreISelOpcode()) {
3504 if (I.isCopy())
3505 return selectCOPY(I);
3506 return true;
3507 }
3508
3509 switch (I.getOpcode()) {
3510 case TargetOpcode::G_AND:
3511 case TargetOpcode::G_OR:
3512 case TargetOpcode::G_XOR:
3513 if (selectImpl(I, *CoverageInfo))
3514 return true;
3515 return selectG_AND_OR_XOR(I);
3516 case TargetOpcode::G_ADD:
3517 case TargetOpcode::G_SUB:
3518 case TargetOpcode::G_PTR_ADD:
3519 if (selectImpl(I, *CoverageInfo))
3520 return true;
3521 return selectG_ADD_SUB(I);
3522 case TargetOpcode::G_UADDO:
3523 case TargetOpcode::G_USUBO:
3524 case TargetOpcode::G_UADDE:
3525 case TargetOpcode::G_USUBE:
3526 return selectG_UADDO_USUBO_UADDE_USUBE(I);
3527 case AMDGPU::G_AMDGPU_MAD_U64_U32:
3528 case AMDGPU::G_AMDGPU_MAD_I64_I32:
3529 return selectG_AMDGPU_MAD_64_32(I);
3530 case TargetOpcode::G_INTTOPTR:
3531 case TargetOpcode::G_BITCAST:
3532 case TargetOpcode::G_PTRTOINT:
3533 case TargetOpcode::G_FREEZE:
3534 return selectCOPY(I);
3535 case TargetOpcode::G_CONSTANT:
3536 case TargetOpcode::G_FCONSTANT:
3537 return selectG_CONSTANT(I);
3538 case TargetOpcode::G_FNEG:
3539 if (selectImpl(I, *CoverageInfo))
3540 return true;
3541 return selectG_FNEG(I);
3542 case TargetOpcode::G_FABS:
3543 if (selectImpl(I, *CoverageInfo))
3544 return true;
3545 return selectG_FABS(I);
3546 case TargetOpcode::G_EXTRACT:
3547 return selectG_EXTRACT(I);
3548 case TargetOpcode::G_MERGE_VALUES:
3549 case TargetOpcode::G_CONCAT_VECTORS:
3550 return selectG_MERGE_VALUES(I);
3551 case TargetOpcode::G_UNMERGE_VALUES:
3552 return selectG_UNMERGE_VALUES(I);
3553 case TargetOpcode::G_BUILD_VECTOR:
3554 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
3555 return selectG_BUILD_VECTOR(I);
3556 case TargetOpcode::G_IMPLICIT_DEF:
3557 return selectG_IMPLICIT_DEF(I);
3558 case TargetOpcode::G_INSERT:
3559 return selectG_INSERT(I);
3560 case TargetOpcode::G_INTRINSIC:
3561 case TargetOpcode::G_INTRINSIC_CONVERGENT:
3562 return selectG_INTRINSIC(I);
3563 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
3564 case TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
3565 return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
3566 case TargetOpcode::G_ICMP:
3567 case TargetOpcode::G_FCMP:
3568 if (selectG_ICMP_or_FCMP(I))
3569 return true;
3570 return selectImpl(I, *CoverageInfo);
3571 case TargetOpcode::G_LOAD:
3572 case TargetOpcode::G_STORE:
3573 case TargetOpcode::G_ATOMIC_CMPXCHG:
3574 case TargetOpcode::G_ATOMICRMW_XCHG:
3575 case TargetOpcode::G_ATOMICRMW_ADD:
3576 case TargetOpcode::G_ATOMICRMW_SUB:
3577 case TargetOpcode::G_ATOMICRMW_AND:
3578 case TargetOpcode::G_ATOMICRMW_OR:
3579 case TargetOpcode::G_ATOMICRMW_XOR:
3580 case TargetOpcode::G_ATOMICRMW_MIN:
3581 case TargetOpcode::G_ATOMICRMW_MAX:
3582 case TargetOpcode::G_ATOMICRMW_UMIN:
3583 case TargetOpcode::G_ATOMICRMW_UMAX:
3584 case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
3585 case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
3586 case TargetOpcode::G_ATOMICRMW_FADD:
3587 case TargetOpcode::G_ATOMICRMW_FMIN:
3588 case TargetOpcode::G_ATOMICRMW_FMAX:
3589 return selectG_LOAD_STORE_ATOMICRMW(I);
3590 case TargetOpcode::G_SELECT:
3591 return selectG_SELECT(I);
3592 case TargetOpcode::G_TRUNC:
3593 return selectG_TRUNC(I);
3594 case TargetOpcode::G_SEXT:
3595 case TargetOpcode::G_ZEXT:
3596 case TargetOpcode::G_ANYEXT:
3597 case TargetOpcode::G_SEXT_INREG:
3598     // This is a workaround. For extension from type i1, `selectImpl()` uses
3599     // patterns from the TD file and generates an illegal VGPR-to-SGPR COPY, as
3600     // type i1 can only be held in an SGPR class.
3601 if (MRI->getType(I.getOperand(1).getReg()) != LLT::scalar(1) &&
3602 selectImpl(I, *CoverageInfo))
3603 return true;
3604 return selectG_SZA_EXT(I);
3605 case TargetOpcode::G_FPEXT:
3606 if (selectG_FPEXT(I))
3607 return true;
3608 return selectImpl(I, *CoverageInfo);
3609 case TargetOpcode::G_BRCOND:
3610 return selectG_BRCOND(I);
3611 case TargetOpcode::G_GLOBAL_VALUE:
3612 return selectG_GLOBAL_VALUE(I);
3613 case TargetOpcode::G_PTRMASK:
3614 return selectG_PTRMASK(I);
3615 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
3616 return selectG_EXTRACT_VECTOR_ELT(I);
3617 case TargetOpcode::G_INSERT_VECTOR_ELT:
3618 return selectG_INSERT_VECTOR_ELT(I);
3619 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
3620 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
3621 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET:
3622 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
3623 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
3624 const AMDGPU::ImageDimIntrinsicInfo *Intr =
3625 AMDGPU::getImageDimIntrinsicInfo(AMDGPU::getIntrinsicID(I));
3626 assert(Intr && "not an image intrinsic with image pseudo");
3627 return selectImageIntrinsic(I, Intr);
3628 }
3629 case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY:
3630 return selectBVHIntrinsic(I);
3631 case AMDGPU::G_SBFX:
3632 case AMDGPU::G_UBFX:
3633 return selectG_SBFX_UBFX(I);
3634 case AMDGPU::G_SI_CALL:
3635 I.setDesc(TII.get(AMDGPU::SI_CALL));
3636 return true;
3637 case AMDGPU::G_AMDGPU_WAVE_ADDRESS:
3638 return selectWaveAddress(I);
3639 case AMDGPU::G_STACKRESTORE:
3640 return selectStackRestore(I);
3641 case AMDGPU::G_PHI:
3642 return selectPHI(I);
3643 default:
3644 return selectImpl(I, *CoverageInfo);
3645 }
3646 return false;
3647 }
3648
3649 InstructionSelector::ComplexRendererFns
3650 AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
3651 return {{
3652 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
3653 }};
3654
3655 }
3656
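// Peels source modifiers off a VOP3 operand: G_FNEG (or an fsub from +/-0 when
// canonicalizing) sets NEG, G_FABS sets ABS when allowed, and OpSel requests
// OP_SEL_0. Returns the stripped source register and the accumulated SISrcMods
// mask. e.g. with abs allowed, %r = G_FNEG (G_FABS %x) yields {%x, NEG | ABS}.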
3657 std::pair<Register, unsigned>
3658 AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root,
3659 bool IsCanonicalizing,
3660 bool AllowAbs, bool OpSel) const {
3661 Register Src = Root.getReg();
3662 unsigned Mods = 0;
3663 MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
3664
3665 if (MI->getOpcode() == AMDGPU::G_FNEG) {
3666 Src = MI->getOperand(1).getReg();
3667 Mods |= SISrcMods::NEG;
3668 MI = getDefIgnoringCopies(Src, *MRI);
3669 } else if (MI->getOpcode() == AMDGPU::G_FSUB && IsCanonicalizing) {
3670 // Fold fsub [+-]0 into fneg. This may not have folded depending on the
3671 // denormal mode, but we're implicitly canonicalizing in a source operand.
3672 const ConstantFP *LHS =
3673 getConstantFPVRegVal(MI->getOperand(1).getReg(), *MRI);
3674 if (LHS && LHS->isZero()) {
3675 Mods |= SISrcMods::NEG;
3676 Src = MI->getOperand(2).getReg();
3677 }
3678 }
3679
3680 if (AllowAbs && MI->getOpcode() == AMDGPU::G_FABS) {
3681 Src = MI->getOperand(1).getReg();
3682 Mods |= SISrcMods::ABS;
3683 }
3684
3685 if (OpSel)
3686 Mods |= SISrcMods::OP_SEL_0;
3687
3688 return std::pair(Src, Mods);
3689 }
3690
3691 Register AMDGPUInstructionSelector::copyToVGPRIfSrcFolded(
3692 Register Src, unsigned Mods, MachineOperand Root, MachineInstr *InsertPt,
3693 bool ForceVGPR) const {
3694 if ((Mods != 0 || ForceVGPR) &&
3695 RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
3696
3697 // If we looked through copies to find source modifiers on an SGPR operand,
3698 // we now have an SGPR register source. To avoid potentially violating the
3699 // constant bus restriction, we need to insert a copy to a VGPR.
3700 Register VGPRSrc = MRI->cloneVirtualRegister(Root.getReg());
3701 BuildMI(*InsertPt->getParent(), InsertPt, InsertPt->getDebugLoc(),
3702 TII.get(AMDGPU::COPY), VGPRSrc)
3703 .addReg(Src);
3704 Src = VGPRSrc;
3705 }
3706
3707 return Src;
3708 }
3709
3710 ///
3711 /// This will select either an SGPR or VGPR operand and will save us from
3712 /// having to write an extra tablegen pattern.
3713 InstructionSelector::ComplexRendererFns
3714 AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
3715 return {{
3716 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
3717 }};
3718 }
3719
3720 InstructionSelector::ComplexRendererFns
3721 AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
3722 Register Src;
3723 unsigned Mods;
3724 std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3725
3726 return {{
3727 [=](MachineInstrBuilder &MIB) {
3728 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3729 },
3730 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
3731 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3732 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
3733 }};
3734 }
3735
3736 InstructionSelector::ComplexRendererFns
3737 AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
3738 Register Src;
3739 unsigned Mods;
3740 std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
3741 /*IsCanonicalizing=*/true,
3742 /*AllowAbs=*/false);
3743
3744 return {{
3745 [=](MachineInstrBuilder &MIB) {
3746 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3747 },
3748 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
3749 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3750 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
3751 }};
3752 }
3753
3754 InstructionSelector::ComplexRendererFns
3755 AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
3756 return {{
3757 [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
3758 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3759 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
3760 }};
3761 }
3762
3763 InstructionSelector::ComplexRendererFns
3764 AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
3765 Register Src;
3766 unsigned Mods;
3767 std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3768
3769 return {{
3770 [=](MachineInstrBuilder &MIB) {
3771 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3772 },
3773 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3774 }};
3775 }
3776
3777 InstructionSelector::ComplexRendererFns
3778 AMDGPUInstructionSelector::selectVOP3ModsNonCanonicalizing(
3779 MachineOperand &Root) const {
3780 Register Src;
3781 unsigned Mods;
3782 std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /*IsCanonicalizing=*/false);
3783
3784 return {{
3785 [=](MachineInstrBuilder &MIB) {
3786 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3787 },
3788 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3789 }};
3790 }
3791
3792 InstructionSelector::ComplexRendererFns
3793 AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
3794 Register Src;
3795 unsigned Mods;
3796 std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /*IsCanonicalizing=*/true,
3797 /*AllowAbs=*/false);
3798
3799 return {{
3800 [=](MachineInstrBuilder &MIB) {
3801 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3802 },
3803 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3804 }};
3805 }
3806
3807 InstructionSelector::ComplexRendererFns
3808 AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
3809 Register Reg = Root.getReg();
3810 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3811 if (Def->getOpcode() == AMDGPU::G_FNEG || Def->getOpcode() == AMDGPU::G_FABS)
3812 return {};
3813 return {{
3814 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3815 }};
3816 }
3817
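// Packed (VOP3P) source modifiers: an fneg of a whole v2f16 value toggles both
// NEG and NEG_HI, and OP_SEL_1 is always set, as the default op_sel_hi for
// packed operands. Packed operands have no abs modifier.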
3818 std::pair<Register, unsigned>
3819 AMDGPUInstructionSelector::selectVOP3PModsImpl(
3820 Register Src, const MachineRegisterInfo &MRI, bool IsDOT) const {
3821 unsigned Mods = 0;
3822 MachineInstr *MI = MRI.getVRegDef(Src);
3823
3824 if (MI && MI->getOpcode() == AMDGPU::G_FNEG &&
3825 // It's possible to see an f32 fneg here, but unlikely.
3826 // TODO: Treat f32 fneg as only high bit.
3827 MRI.getType(Src) == LLT::fixed_vector(2, 16)) {
3828 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
3829 Src = MI->getOperand(1).getReg();
3830 MI = MRI.getVRegDef(Src);
3831 }
3832
3833 // TODO: Handle G_FSUB 0 as fneg
3834
3835 // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector.
3836 (void)IsDOT; // DOTs do not use OPSEL on gfx940+, check ST.hasDOTOpSelHazard()
3837
3838 // Packed instructions do not have abs modifiers.
3839 Mods |= SISrcMods::OP_SEL_1;
3840
3841 return std::pair(Src, Mods);
3842 }
3843
3844 InstructionSelector::ComplexRendererFns
3845 AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
3846 MachineRegisterInfo &MRI
3847 = Root.getParent()->getParent()->getParent()->getRegInfo();
3848
3849 Register Src;
3850 unsigned Mods;
3851 std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI);
3852
3853 return {{
3854 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3855 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3856 }};
3857 }
3858
3859 InstructionSelector::ComplexRendererFns
3860 AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
3861 MachineRegisterInfo &MRI
3862 = Root.getParent()->getParent()->getParent()->getRegInfo();
3863
3864 Register Src;
3865 unsigned Mods;
3866 std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, true);
3867
3868 return {{
3869 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3870 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3871 }};
3872 }
3873
3874 InstructionSelector::ComplexRendererFns
3875 AMDGPUInstructionSelector::selectVOP3PModsNeg(MachineOperand &Root) const {
3876   // Literal i1 value set in the intrinsic; it represents SrcMods for the next
3877   // operand. The value is in the Imm operand as i1 sign extended to int64_t:
3878   // 1 (i.e. -1) promotes packed values to signed, 0 treats them as unsigned.
3879 assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
3880 "expected i1 value");
3881 unsigned Mods = SISrcMods::OP_SEL_1;
3882 if (Root.getImm() == -1)
3883 Mods ^= SISrcMods::NEG;
3884 return {{
3885 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3886 }};
3887 }
3888
3889 InstructionSelector::ComplexRendererFns
3890 AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods(
3891 MachineOperand &Root) const {
3892 assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
3893 "expected i1 value");
3894 unsigned Mods = SISrcMods::OP_SEL_1;
3895 if (Root.getImm() != 0)
3896 Mods |= SISrcMods::OP_SEL_0;
3897
3898 return {{
3899 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3900 }};
3901 }
3902
3903 static Register buildRegSequence(SmallVectorImpl<Register> &Elts,
3904 MachineInstr *InsertPt,
3905 MachineRegisterInfo &MRI) {
3906 const TargetRegisterClass *DstRegClass;
3907 switch (Elts.size()) {
3908 case 8:
3909 DstRegClass = &AMDGPU::VReg_256RegClass;
3910 break;
3911 case 4:
3912 DstRegClass = &AMDGPU::VReg_128RegClass;
3913 break;
3914 case 2:
3915 DstRegClass = &AMDGPU::VReg_64RegClass;
3916 break;
3917 default:
3918 llvm_unreachable("unhandled Reg sequence size");
3919 }
3920
3921 MachineIRBuilder B(*InsertPt);
3922 auto MIB = B.buildInstr(AMDGPU::REG_SEQUENCE)
3923 .addDef(MRI.createVirtualRegister(DstRegClass));
3924 for (unsigned i = 0; i < Elts.size(); ++i) {
3925 MIB.addReg(Elts[i]);
3926 MIB.addImm(SIRegisterInfo::getSubRegFromChannel(i));
3927 }
3928 return MIB->getOperand(0).getReg();
3929 }
3930
3931 static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
3932 SmallVectorImpl<Register> &Elts, Register &Src,
3933 MachineInstr *InsertPt,
3934 MachineRegisterInfo &MRI) {
3935 if (ModOpcode == TargetOpcode::G_FNEG) {
3936 Mods |= SISrcMods::NEG;
3937 // Check if all elements also have abs modifier
3938 SmallVector<Register, 8> NegAbsElts;
3939 for (auto El : Elts) {
3940 Register FabsSrc;
3941 if (!mi_match(El, MRI, m_GFabs(m_Reg(FabsSrc))))
3942 break;
3943 NegAbsElts.push_back(FabsSrc);
3944 }
3945 if (Elts.size() != NegAbsElts.size()) {
3946 // Neg
3947 Src = buildRegSequence(Elts, InsertPt, MRI);
3948 } else {
3949 // Neg and Abs
3950 Mods |= SISrcMods::NEG_HI;
3951 Src = buildRegSequence(NegAbsElts, InsertPt, MRI);
3952 }
3953 } else {
3954 assert(ModOpcode == TargetOpcode::G_FABS);
3955 // Abs
3956 Mods |= SISrcMods::NEG_HI;
3957 Src = buildRegSequence(Elts, InsertPt, MRI);
3958 }
3959 }
3960
3961 InstructionSelector::ComplexRendererFns
3962 AMDGPUInstructionSelector::selectWMMAModsF32NegAbs(MachineOperand &Root) const {
3963 Register Src = Root.getReg();
3964 unsigned Mods = SISrcMods::OP_SEL_1;
3965 SmallVector<Register, 8> EltsF32;
3966
3967 if (GBuildVector *BV = dyn_cast<GBuildVector>(MRI->getVRegDef(Src))) {
3968 assert(BV->getNumSources() > 0);
3969     // Based on the first element, decide which modifier we match: neg or abs.
3970 MachineInstr *ElF32 = MRI->getVRegDef(BV->getSourceReg(0));
3971 unsigned ModOpcode = (ElF32->getOpcode() == AMDGPU::G_FNEG)
3972 ? AMDGPU::G_FNEG
3973 : AMDGPU::G_FABS;
3974 for (unsigned i = 0; i < BV->getNumSources(); ++i) {
3975 ElF32 = MRI->getVRegDef(BV->getSourceReg(i));
3976 if (ElF32->getOpcode() != ModOpcode)
3977 break;
3978 EltsF32.push_back(ElF32->getOperand(1).getReg());
3979 }
3980
3981 // All elements had ModOpcode modifier
3982 if (BV->getNumSources() == EltsF32.size()) {
3983 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, Root.getParent(),
3984 *MRI);
3985 }
3986 }
3987
3988 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3989 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
3990 }
3991
3992 InstructionSelector::ComplexRendererFns
3993 AMDGPUInstructionSelector::selectWMMAModsF16Neg(MachineOperand &Root) const {
3994 Register Src = Root.getReg();
3995 unsigned Mods = SISrcMods::OP_SEL_1;
3996 SmallVector<Register, 8> EltsV2F16;
3997
3998 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
3999 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
4000 Register FNegSrc;
4001 if (!mi_match(CV->getSourceReg(i), *MRI, m_GFNeg(m_Reg(FNegSrc))))
4002 break;
4003 EltsV2F16.push_back(FNegSrc);
4004 }
4005
4006     // All elements had the fneg modifier
4007 if (CV->getNumSources() == EltsV2F16.size()) {
4008 Mods |= SISrcMods::NEG;
4009 Mods |= SISrcMods::NEG_HI;
4010 Src = buildRegSequence(EltsV2F16, Root.getParent(), *MRI);
4011 }
4012 }
4013
4014 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4015 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
4016 }
4017
4018 InstructionSelector::ComplexRendererFns
4019 AMDGPUInstructionSelector::selectWMMAModsF16NegAbs(MachineOperand &Root) const {
4020 Register Src = Root.getReg();
4021 unsigned Mods = SISrcMods::OP_SEL_1;
4022 SmallVector<Register, 8> EltsV2F16;
4023
4024 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
4025 assert(CV->getNumSources() > 0);
4026 MachineInstr *ElV2F16 = MRI->getVRegDef(CV->getSourceReg(0));
4027     // Based on the first element, decide which modifier we match: neg or abs.
4028 unsigned ModOpcode = (ElV2F16->getOpcode() == AMDGPU::G_FNEG)
4029 ? AMDGPU::G_FNEG
4030 : AMDGPU::G_FABS;
4031
4032 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
4033 ElV2F16 = MRI->getVRegDef(CV->getSourceReg(i));
4034 if (ElV2F16->getOpcode() != ModOpcode)
4035 break;
4036 EltsV2F16.push_back(ElV2F16->getOperand(1).getReg());
4037 }
4038
4039 // All elements had ModOpcode modifier
4040 if (CV->getNumSources() == EltsV2F16.size()) {
4041 MachineIRBuilder B(*Root.getParent());
4042 selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, Root.getParent(),
4043 *MRI);
4044 }
4045 }
4046
4047 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4048 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
4049 }
4050
4051 InstructionSelector::ComplexRendererFns
4052 AMDGPUInstructionSelector::selectWMMAVISrc(MachineOperand &Root) const {
4053 std::optional<FPValueAndVReg> FPValReg;
4054 if (mi_match(Root.getReg(), *MRI, m_GFCstOrSplat(FPValReg))) {
4055 if (TII.isInlineConstant(FPValReg->Value)) {
4056 return {{[=](MachineInstrBuilder &MIB) {
4057 MIB.addImm(FPValReg->Value.bitcastToAPInt().getSExtValue());
4058 }}};
4059 }
4060     // Non-inlineable splat floats should not fall through to the integer
4061     // immediate checks.
4062 return {};
4063 }
4064
4065 APInt ICst;
4066 if (mi_match(Root.getReg(), *MRI, m_ICstOrSplat(ICst))) {
4067 if (TII.isInlineConstant(ICst)) {
4068 return {
4069 {[=](MachineInstrBuilder &MIB) { MIB.addImm(ICst.getSExtValue()); }}};
4070 }
4071 }
4072
4073 return {};
4074 }
4075
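// SWMMAC index operands pack the selected lane index into a 32-bit value. If
// the index is a logical shift right of a 32-bit source by a multiple of 8,
// the shift is folded away and index_key is set to shift/8; otherwise the
// value is used as-is with index_key 0.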
4076 InstructionSelector::ComplexRendererFns
4077 AMDGPUInstructionSelector::selectSWMMACIndex8(MachineOperand &Root) const {
4078 Register Src =
4079 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
4080 unsigned Key = 0;
4081
4082 Register ShiftSrc;
4083 std::optional<ValueAndVReg> ShiftAmt;
4084 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
4085 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
4086 ShiftAmt->Value.getZExtValue() % 8 == 0) {
4087 Key = ShiftAmt->Value.getZExtValue() / 8;
4088 Src = ShiftSrc;
4089 }
4090
4091 return {{
4092 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4093 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
4094 }};
4095 }
4096
4097 InstructionSelector::ComplexRendererFns
4098 AMDGPUInstructionSelector::selectSWMMACIndex16(MachineOperand &Root) const {
4099
4100 Register Src =
4101 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
4102 unsigned Key = 0;
4103
4104 Register ShiftSrc;
4105 std::optional<ValueAndVReg> ShiftAmt;
4106 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
4107 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
4108 ShiftAmt->Value.getZExtValue() == 16) {
4109 Src = ShiftSrc;
4110 Key = 1;
4111 }
4112
4113 return {{
4114 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4115 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
4116 }};
4117 }
4118
4119 InstructionSelector::ComplexRendererFns
4120 AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
4121 Register Src;
4122 unsigned Mods;
4123 std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
4124
4125 // FIXME: Handle op_sel
4126 return {{
4127 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4128 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4129 }};
4130 }
4131
4132 InstructionSelector::ComplexRendererFns
4133 AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const {
4134 Register Src;
4135 unsigned Mods;
4136 std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
4137 /*IsCanonicalizing=*/true,
4138 /*AllowAbs=*/false,
4139 /*OpSel=*/false);
4140
4141 return {{
4142 [=](MachineInstrBuilder &MIB) {
4143 MIB.addReg(
4144 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
4145 },
4146 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4147 }};
4148 }
4149
4150 InstructionSelector::ComplexRendererFns
4151 AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
4152 Register Src;
4153 unsigned Mods;
4154 std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
4155 /*IsCanonicalizing=*/true,
4156 /*AllowAbs=*/false,
4157 /*OpSel=*/true);
4158
4159 return {{
4160 [=](MachineInstrBuilder &MIB) {
4161 MIB.addReg(
4162 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
4163 },
4164 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4165 }};
4166 }
4167
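// Decomposes an SMRD address into an SGPR base plus, depending on which of
// SOffset/Offset the caller passes, an SGPR soffset and/or an encoded
// immediate offset, using the GEP info gathered for the root operand. Returns
// false if no legal combination is found.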
4168 bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
4169 Register &Base,
4170 Register *SOffset,
4171 int64_t *Offset) const {
4172 MachineInstr *MI = Root.getParent();
4173 MachineBasicBlock *MBB = MI->getParent();
4174
4175 // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
4176 // then we can select all ptr + 32-bit offsets.
4177 SmallVector<GEPInfo, 4> AddrInfo;
4178 getAddrModeInfo(*MI, *MRI, AddrInfo);
4179
4180 if (AddrInfo.empty())
4181 return false;
4182
4183 const GEPInfo &GEPI = AddrInfo[0];
4184 std::optional<int64_t> EncodedImm;
4185
4186 if (SOffset && Offset) {
4187 EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
4188 /*HasSOffset=*/true);
4189 if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm &&
4190 AddrInfo.size() > 1) {
4191 const GEPInfo &GEPI2 = AddrInfo[1];
4192 if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) {
4193 if (Register OffsetReg =
4194 matchZeroExtendFromS32(*MRI, GEPI2.SgprParts[1])) {
4195 Base = GEPI2.SgprParts[0];
4196 *SOffset = OffsetReg;
4197 *Offset = *EncodedImm;
4198 if (*Offset >= 0 || !AMDGPU::hasSMRDSignedImmOffset(STI))
4199 return true;
4200
4201 // For unbuffered smem loads, it is illegal for the Immediate Offset
4202           // to be negative if the resulting (Offset + (M0 or SOffset or zero))
4203 // is negative. Handle the case where the Immediate Offset + SOffset
4204 // is negative.
4205 auto SKnown = KB->getKnownBits(*SOffset);
4206 if (*Offset + SKnown.getMinValue().getSExtValue() < 0)
4207 return false;
4208
4209 return true;
4210 }
4211 }
4212 }
4213 return false;
4214 }
4215
4216 EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
4217 /*HasSOffset=*/false);
4218 if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) {
4219 Base = GEPI.SgprParts[0];
4220 *Offset = *EncodedImm;
4221 return true;
4222 }
4223
4224 // SGPR offset is unsigned.
4225 if (SOffset && GEPI.SgprParts.size() == 1 && isUInt<32>(GEPI.Imm) &&
4226 GEPI.Imm != 0) {
4227     // If we make it this far, we have a load with a 32-bit immediate offset.
4228 // It is OK to select this using a sgpr offset, because we have already
4229 // failed trying to select this load into one of the _IMM variants since
4230 // the _IMM Patterns are considered before the _SGPR patterns.
4231 Base = GEPI.SgprParts[0];
4232 *SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4233 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), *SOffset)
4234 .addImm(GEPI.Imm);
4235 return true;
4236 }
4237
4238 if (SOffset && GEPI.SgprParts.size() && GEPI.Imm == 0) {
4239 if (Register OffsetReg = matchZeroExtendFromS32(*MRI, GEPI.SgprParts[1])) {
4240 Base = GEPI.SgprParts[0];
4241 *SOffset = OffsetReg;
4242 return true;
4243 }
4244 }
4245
4246 return false;
4247 }
4248
4249 InstructionSelector::ComplexRendererFns
4250 AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
4251 Register Base;
4252 int64_t Offset;
4253 if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset))
4254 return std::nullopt;
4255
4256 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
4257 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
4258 }
4259
4260 InstructionSelector::ComplexRendererFns
4261 AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
4262 SmallVector<GEPInfo, 4> AddrInfo;
4263 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
4264
4265 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
4266 return std::nullopt;
4267
4268 const GEPInfo &GEPInfo = AddrInfo[0];
4269 Register PtrReg = GEPInfo.SgprParts[0];
4270 std::optional<int64_t> EncodedImm =
4271 AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
4272 if (!EncodedImm)
4273 return std::nullopt;
4274
4275 return {{
4276 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
4277 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
4278 }};
4279 }
4280
4281 InstructionSelector::ComplexRendererFns
4282 AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
4283 Register Base, SOffset;
4284 if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr))
4285 return std::nullopt;
4286
4287 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
4288 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
4289 }
4290
4291 InstructionSelector::ComplexRendererFns
4292 AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const {
4293 Register Base, SOffset;
4294 int64_t Offset;
4295 if (!selectSmrdOffset(Root, Base, &SOffset, &Offset))
4296 return std::nullopt;
4297
4298 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
4299 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
4300 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
4301 }
4302
4303 std::pair<Register, int>
4304 AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
4305 uint64_t FlatVariant) const {
4306 MachineInstr *MI = Root.getParent();
4307
4308 auto Default = std::pair(Root.getReg(), 0);
4309
4310 if (!STI.hasFlatInstOffsets())
4311 return Default;
4312
4313 Register PtrBase;
4314 int64_t ConstOffset;
4315 std::tie(PtrBase, ConstOffset) =
4316 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
4317
4318 if (ConstOffset == 0 || (FlatVariant == SIInstrFlags::FlatScratch &&
4319 !isFlatScratchBaseLegal(Root.getReg())))
4320 return Default;
4321
4322 unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
4323 if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, FlatVariant))
4324 return Default;
4325
4326 return std::pair(PtrBase, ConstOffset);
4327 }
4328
4329 InstructionSelector::ComplexRendererFns
4330 AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
4331 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FLAT);
4332
4333 return {{
4334 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
4335 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
4336 }};
4337 }
4338
4339 InstructionSelector::ComplexRendererFns
4340 AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const {
4341 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatGlobal);
4342
4343 return {{
4344 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
4345 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
4346 }};
4347 }
4348
4349 InstructionSelector::ComplexRendererFns
4350 AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
4351 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatScratch);
4352
4353 return {{
4354 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
4355 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
4356 }};
4357 }
4358
4359 // Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
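// e.g., roughly: (G_PTR_ADD (G_PTR_ADD %sgpr_base, (zext %vgpr_off)), imm)
// selects to saddr = %sgpr_base, voffset = %vgpr_off, offset = imm, provided
// the immediate is a legal FLAT global offset.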
4360 InstructionSelector::ComplexRendererFns
4361 AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
4362 Register Addr = Root.getReg();
4363 Register PtrBase;
4364 int64_t ConstOffset;
4365 int64_t ImmOffset = 0;
4366
4367 // Match the immediate offset first, which canonically is moved as low as
4368 // possible.
4369 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
4370
4371 if (ConstOffset != 0) {
4372 if (TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
4373 SIInstrFlags::FlatGlobal)) {
4374 Addr = PtrBase;
4375 ImmOffset = ConstOffset;
4376 } else {
4377 auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI);
4378 if (isSGPR(PtrBaseDef->Reg)) {
4379 if (ConstOffset > 0) {
4380 // Offset is too large.
4381 //
4382 // saddr + large_offset -> saddr +
4383 // (voffset = large_offset & ~MaxOffset) +
4384 // (large_offset & MaxOffset);
4385 int64_t SplitImmOffset, RemainderOffset;
4386 std::tie(SplitImmOffset, RemainderOffset) = TII.splitFlatOffset(
4387 ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
4388
4389 if (isUInt<32>(RemainderOffset)) {
4390 MachineInstr *MI = Root.getParent();
4391 MachineBasicBlock *MBB = MI->getParent();
4392 Register HighBits =
4393 MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4394
4395 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
4396 HighBits)
4397 .addImm(RemainderOffset);
4398
4399 return {{
4400 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr
4401 [=](MachineInstrBuilder &MIB) {
4402 MIB.addReg(HighBits);
4403 }, // voffset
4404 [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
4405 }};
4406 }
4407 }
4408
4409         // We are adding a 64-bit SGPR and a constant. If the constant bus limit
4410         // is 1, we would need 1 or 2 extra moves for each half of the constant, so
4411         // it is better to do a scalar add and then issue a single VALU instruction
4412         // to materialize zero. Otherwise it takes fewer instructions to perform
4413         // VALU adds with immediates or inline literals.
4414 unsigned NumLiterals =
4415 !TII.isInlineConstant(APInt(32, ConstOffset & 0xffffffff)) +
4416 !TII.isInlineConstant(APInt(32, ConstOffset >> 32));
4417 if (STI.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
4418 return std::nullopt;
4419 }
4420 }
4421 }
4422
4423 // Match the variable offset.
4424 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
4425 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
4426 // Look through the SGPR->VGPR copy.
4427 Register SAddr =
4428 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
4429
4430 if (isSGPR(SAddr)) {
4431 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
4432
4433 // It's possible voffset is an SGPR here, but the copy to VGPR will be
4434 // inserted later.
4435 if (Register VOffset = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
4436 return {{[=](MachineInstrBuilder &MIB) { // saddr
4437 MIB.addReg(SAddr);
4438 },
4439 [=](MachineInstrBuilder &MIB) { // voffset
4440 MIB.addReg(VOffset);
4441 },
4442 [=](MachineInstrBuilder &MIB) { // offset
4443 MIB.addImm(ImmOffset);
4444 }}};
4445 }
4446 }
4447 }
4448
4449 // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
4450 // drop this.
4451 if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
4452 AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg))
4453 return std::nullopt;
4454
4455 // It's cheaper to materialize a single 32-bit zero for vaddr than the two
4456 // moves required to copy a 64-bit SGPR to VGPR.
4457 MachineInstr *MI = Root.getParent();
4458 MachineBasicBlock *MBB = MI->getParent();
4459 Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4460
4461 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
4462 .addImm(0);
4463
4464 return {{
4465 [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
4466 [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
4467 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4468 }};
4469 }
4470
4471 InstructionSelector::ComplexRendererFns
4472 AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
4473 Register Addr = Root.getReg();
4474 Register PtrBase;
4475 int64_t ConstOffset;
4476 int64_t ImmOffset = 0;
4477
4478 // Match the immediate offset first, which canonically is moved as low as
4479 // possible.
4480 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
4481
4482 if (ConstOffset != 0 && isFlatScratchBaseLegal(Addr) &&
4483 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
4484 SIInstrFlags::FlatScratch)) {
4485 Addr = PtrBase;
4486 ImmOffset = ConstOffset;
4487 }
4488
4489 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
4490 if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
4491 int FI = AddrDef->MI->getOperand(1).getIndex();
4492 return {{
4493 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
4494 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4495 }};
4496 }
4497
4498 Register SAddr = AddrDef->Reg;
4499
4500 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
4501 Register LHS = AddrDef->MI->getOperand(1).getReg();
4502 Register RHS = AddrDef->MI->getOperand(2).getReg();
4503 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
4504 auto RHSDef = getDefSrcRegIgnoringCopies(RHS, *MRI);
4505
4506 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
4507 isSGPR(RHSDef->Reg)) {
4508 int FI = LHSDef->MI->getOperand(1).getIndex();
4509 MachineInstr &I = *Root.getParent();
4510 MachineBasicBlock *BB = I.getParent();
4511 const DebugLoc &DL = I.getDebugLoc();
4512 SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4513
4514 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_I32), SAddr)
4515 .addFrameIndex(FI)
4516 .addReg(RHSDef->Reg)
4517 .setOperandDead(3); // Dead scc
4518 }
4519 }
4520
4521 if (!isSGPR(SAddr))
4522 return std::nullopt;
4523
4524 return {{
4525 [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr
4526 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4527 }};
4528 }
4529
4530 // Check whether the flat scratch SVS swizzle bug affects this access.
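// Conservatively, a carry out of bit 1 is considered possible whenever the
// known maximum low two bits of vaddr and of (saddr + inst_offset) sum to 4
// or more.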
4531 bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug(
4532 Register VAddr, Register SAddr, uint64_t ImmOffset) const {
4533 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
4534 return false;
4535
4536 // The bug affects the swizzling of SVS accesses if there is any carry out
4537 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
4538 // voffset to (soffset + inst_offset).
4539 auto VKnown = KB->getKnownBits(VAddr);
4540 auto SKnown = KnownBits::computeForAddSub(
4541 /*Add=*/true, /*NSW=*/false, /*NUW=*/false, KB->getKnownBits(SAddr),
4542 KnownBits::makeConstant(APInt(32, ImmOffset)));
4543 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
4544 uint64_t SMax = SKnown.getMaxValue().getZExtValue();
4545 return (VMax & 3) + (SMax & 3) >= 4;
4546 }
4547
4548 InstructionSelector::ComplexRendererFns
4549 AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
4550 Register Addr = Root.getReg();
4551 Register PtrBase;
4552 int64_t ConstOffset;
4553 int64_t ImmOffset = 0;
4554
4555 // Match the immediate offset first, which canonically is moved as low as
4556 // possible.
4557 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
4558
4559 Register OrigAddr = Addr;
4560 if (ConstOffset != 0 &&
4561 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, true)) {
4562 Addr = PtrBase;
4563 ImmOffset = ConstOffset;
4564 }
4565
4566 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
4567 if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD)
4568 return std::nullopt;
4569
4570 Register RHS = AddrDef->MI->getOperand(2).getReg();
4571 if (RBI.getRegBank(RHS, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID)
4572 return std::nullopt;
4573
4574 Register LHS = AddrDef->MI->getOperand(1).getReg();
4575 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
4576
4577 if (OrigAddr != Addr) {
4578 if (!isFlatScratchBaseLegalSVImm(OrigAddr))
4579 return std::nullopt;
4580 } else {
4581 if (!isFlatScratchBaseLegalSV(OrigAddr))
4582 return std::nullopt;
4583 }
4584
4585 if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset))
4586 return std::nullopt;
4587
4588 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
4589 int FI = LHSDef->MI->getOperand(1).getIndex();
4590 return {{
4591 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
4592 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
4593 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4594 }};
4595 }
4596
4597 if (!isSGPR(LHS))
4598 return std::nullopt;
4599
4600 return {{
4601 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
4602 [=](MachineInstrBuilder &MIB) { MIB.addReg(LHS); }, // saddr
4603 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4604 }};
4605 }
4606
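// Select the rsrc/vaddr/soffset/offset operands for a MUBUF scratch access in
// the offen addressing mode, folding frame indexes and legal immediate
// offsets where possible.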
4607 InstructionSelector::ComplexRendererFns
4608 AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
4609 MachineInstr *MI = Root.getParent();
4610 MachineBasicBlock *MBB = MI->getParent();
4611 MachineFunction *MF = MBB->getParent();
4612 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
4613
4614 int64_t Offset = 0;
4615 if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
4616 Offset != TM.getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) {
4617 Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4618
4619 // TODO: Should this be inside the render function? The iterator seems to
4620 // move.
4621 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
4622 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
4623 HighBits)
4624 .addImm(Offset & ~MaxOffset);
4625
4626 return {{[=](MachineInstrBuilder &MIB) { // rsrc
4627 MIB.addReg(Info->getScratchRSrcReg());
4628 },
4629 [=](MachineInstrBuilder &MIB) { // vaddr
4630 MIB.addReg(HighBits);
4631 },
4632 [=](MachineInstrBuilder &MIB) { // soffset
4633 // Use constant zero for soffset and rely on eliminateFrameIndex
4634 // to choose the appropriate frame register if need be.
4635 MIB.addImm(0);
4636 },
4637 [=](MachineInstrBuilder &MIB) { // offset
4638 MIB.addImm(Offset & MaxOffset);
4639 }}};
4640 }
4641
4642 assert(Offset == 0 || Offset == -1);
4643
4644 // Try to fold a frame index directly into the MUBUF vaddr field, and any
4645 // offsets.
4646 std::optional<int> FI;
4647 Register VAddr = Root.getReg();
4648 if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) {
4649 Register PtrBase;
4650 int64_t ConstOffset;
4651 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(VAddr, *MRI);
4652 if (ConstOffset != 0) {
4653 if (TII.isLegalMUBUFImmOffset(ConstOffset) &&
4654 (!STI.privateMemoryResourceIsRangeChecked() ||
4655 KB->signBitIsZero(PtrBase))) {
4656 const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase);
4657 if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
4658 FI = PtrBaseDef->getOperand(1).getIndex();
4659 else
4660 VAddr = PtrBase;
4661 Offset = ConstOffset;
4662 }
4663 } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
4664 FI = RootDef->getOperand(1).getIndex();
4665 }
4666 }
4667
4668 return {{[=](MachineInstrBuilder &MIB) { // rsrc
4669 MIB.addReg(Info->getScratchRSrcReg());
4670 },
4671 [=](MachineInstrBuilder &MIB) { // vaddr
4672 if (FI)
4673 MIB.addFrameIndex(*FI);
4674 else
4675 MIB.addReg(VAddr);
4676 },
4677 [=](MachineInstrBuilder &MIB) { // soffset
4678 // Use constant zero for soffset and rely on eliminateFrameIndex
4679 // to choose the appropriate frame register if need be.
4680 MIB.addImm(0);
4681 },
4682 [=](MachineInstrBuilder &MIB) { // offset
4683 MIB.addImm(Offset);
4684 }}};
4685 }
4686
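// Return true if Offset fits in the 16-bit DS instruction offset field. On
// subtargets where a negative base address misbehaves, also require the sign
// bit of Base to be known zero.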
4687 bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
4688 int64_t Offset) const {
4689 if (!isUInt<16>(Offset))
4690 return false;
4691
4692 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
4693 return true;
4694
4695   // On Southern Islands, instructions with a negative base value and an
4696   // offset don't seem to work.
4697 return KB->signBitIsZero(Base);
4698 }
4699
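// Return true if Offset0 and Offset1 are multiples of Size whose scaled
// values fit in the 8-bit offset fields of a DS read2/write2 style
// instruction, subject to the same base address restriction as
// isDSOffsetLegal.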
4700 bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
4701 int64_t Offset1,
4702 unsigned Size) const {
4703 if (Offset0 % Size != 0 || Offset1 % Size != 0)
4704 return false;
4705 if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
4706 return false;
4707
4708 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
4709 return true;
4710
4711   // On Southern Islands, instructions with a negative base value and an
4712   // offset don't seem to work.
4713 return KB->signBitIsZero(Base);
4714 }
4715
4716 // Return whether the operation has the NoUnsignedWrap property.
4717 static bool isNoUnsignedWrap(MachineInstr *Addr) {
4718 return Addr->getOpcode() == TargetOpcode::G_OR ||
4719 (Addr->getOpcode() == TargetOpcode::G_PTR_ADD &&
4720 Addr->getFlag(MachineInstr::NoUWrap));
4721 }
4722
4723 // Check that the base address of a flat scratch load/store in the form of
4724 // `base + offset` is legal to be put in an SGPR/VGPR (i.e. unsigned per the
4725 // hardware requirement). We always treat the first operand as the base address here.
4726 bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(Register Addr) const {
4727 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
4728
4729 if (isNoUnsignedWrap(AddrMI))
4730 return true;
4731
4732 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
4733 // values.
4734 if (STI.hasSignedScratchOffsets())
4735 return true;
4736
4737 Register LHS = AddrMI->getOperand(1).getReg();
4738 Register RHS = AddrMI->getOperand(2).getReg();
4739
4740 if (AddrMI->getOpcode() == TargetOpcode::G_PTR_ADD) {
4741 std::optional<ValueAndVReg> RhsValReg =
4742 getIConstantVRegValWithLookThrough(RHS, *MRI);
4743     // If the immediate offset is negative and within a certain range, the base
4744 // address cannot also be negative. If the base is also negative, the sum
4745 // would be either negative or much larger than the valid range of scratch
4746 // memory a thread can access.
4747 if (RhsValReg && RhsValReg->Value.getSExtValue() < 0 &&
4748 RhsValReg->Value.getSExtValue() > -0x40000000)
4749 return true;
4750 }
4751
4752 return KB->signBitIsZero(LHS);
4753 }
4754
4755 // Check that the address values in SGPR/VGPR are legal for flat scratch in
4756 // the form of: SGPR + VGPR.
4757 bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(Register Addr) const {
4758 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
4759
4760 if (isNoUnsignedWrap(AddrMI))
4761 return true;
4762
4763 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
4764 // values.
4765 if (STI.hasSignedScratchOffsets())
4766 return true;
4767
4768 Register LHS = AddrMI->getOperand(1).getReg();
4769 Register RHS = AddrMI->getOperand(2).getReg();
4770 return KB->signBitIsZero(RHS) && KB->signBitIsZero(LHS);
4771 }
4772
4773 // Check that the address values in SGPR/VGPR are legal for flat scratch in
4774 // the form of: SGPR + VGPR + Imm.
4775 bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSVImm(
4776 Register Addr) const {
4777 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
4778 // values.
4779 if (STI.hasSignedScratchOffsets())
4780 return true;
4781
4782 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
4783 Register Base = AddrMI->getOperand(1).getReg();
4784 std::optional<DefinitionAndSourceRegister> BaseDef =
4785 getDefSrcRegIgnoringCopies(Base, *MRI);
4786 std::optional<ValueAndVReg> RHSOffset =
4787 getIConstantVRegValWithLookThrough(AddrMI->getOperand(2).getReg(), *MRI);
4788 assert(RHSOffset);
4789
4790   // If the immediate offset is negative and within a certain range, the base
4791 // address cannot also be negative. If the base is also negative, the sum
4792 // would be either negative or much larger than the valid range of scratch
4793 // memory a thread can access.
4794 if (isNoUnsignedWrap(BaseDef->MI) &&
4795 (isNoUnsignedWrap(AddrMI) ||
4796 (RHSOffset->Value.getSExtValue() < 0 &&
4797 RHSOffset->Value.getSExtValue() > -0x40000000)))
4798 return true;
4799
4800 Register LHS = BaseDef->MI->getOperand(1).getReg();
4801 Register RHS = BaseDef->MI->getOperand(2).getReg();
4802 return KB->signBitIsZero(RHS) && KB->signBitIsZero(LHS);
4803 }
4804
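// Return true if the G_AND mask preserves the low ShAmtBits bits, i.e. any
// bit it clears in that range is already known to be zero, so the mask is
// redundant for a shift amount operand.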
4805 bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
4806 unsigned ShAmtBits) const {
4807 assert(MI.getOpcode() == TargetOpcode::G_AND);
4808
4809 std::optional<APInt> RHS =
4810 getIConstantVRegVal(MI.getOperand(2).getReg(), *MRI);
4811 if (!RHS)
4812 return false;
4813
4814 if (RHS->countr_one() >= ShAmtBits)
4815 return true;
4816
4817 const APInt &LHSKnownZeros = KB->getKnownZeroes(MI.getOperand(1).getReg());
4818 return (LHSKnownZeros | *RHS).countr_one() >= ShAmtBits;
4819 }
4820
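// Select the rsrc/soffset/offset operands for a MUBUF scratch access that
// needs no vaddr: a wave address, a wave address plus a legal constant, or a
// plain constant offset.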
4821 InstructionSelector::ComplexRendererFns
4822 AMDGPUInstructionSelector::selectMUBUFScratchOffset(
4823 MachineOperand &Root) const {
4824 Register Reg = Root.getReg();
4825 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
4826
4827 std::optional<DefinitionAndSourceRegister> Def =
4828 getDefSrcRegIgnoringCopies(Reg, *MRI);
4829 assert(Def && "this shouldn't be an optional result");
4830 Reg = Def->Reg;
4831
4832 if (Register WaveBase = getWaveAddress(Def->MI)) {
4833 return {{
4834 [=](MachineInstrBuilder &MIB) { // rsrc
4835 MIB.addReg(Info->getScratchRSrcReg());
4836 },
4837 [=](MachineInstrBuilder &MIB) { // soffset
4838 MIB.addReg(WaveBase);
4839 },
4840 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // offset
4841 }};
4842 }
4843
4844 int64_t Offset = 0;
4845
4846 // FIXME: Copy check is a hack
4847 Register BasePtr;
4848 if (mi_match(Reg, *MRI,
4849 m_GPtrAdd(m_Reg(BasePtr),
4850 m_any_of(m_ICst(Offset), m_Copy(m_ICst(Offset)))))) {
4851 if (!TII.isLegalMUBUFImmOffset(Offset))
4852 return {};
4853 MachineInstr *BasePtrDef = getDefIgnoringCopies(BasePtr, *MRI);
4854 Register WaveBase = getWaveAddress(BasePtrDef);
4855 if (!WaveBase)
4856 return {};
4857
4858 return {{
4859 [=](MachineInstrBuilder &MIB) { // rsrc
4860 MIB.addReg(Info->getScratchRSrcReg());
4861 },
4862 [=](MachineInstrBuilder &MIB) { // soffset
4863 MIB.addReg(WaveBase);
4864 },
4865 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
4866 }};
4867 }
4868
4869 if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
4870 !TII.isLegalMUBUFImmOffset(Offset))
4871 return {};
4872
4873 return {{
4874 [=](MachineInstrBuilder &MIB) { // rsrc
4875 MIB.addReg(Info->getScratchRSrcReg());
4876 },
4877 [=](MachineInstrBuilder &MIB) { // soffset
4878 MIB.addImm(0);
4879 },
4880 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
4881 }};
4882 }
4883
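// Match a DS address as a base register plus a legal immediate offset,
// falling back to the original register with a zero offset.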
4884 std::pair<Register, unsigned>
4885 AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
4886 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
4887 if (!RootDef)
4888 return std::pair(Root.getReg(), 0);
4889
4890 int64_t ConstAddr = 0;
4891
4892 Register PtrBase;
4893 int64_t Offset;
4894 std::tie(PtrBase, Offset) =
4895 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
4896
4897 if (Offset) {
4898 if (isDSOffsetLegal(PtrBase, Offset)) {
4899 // (add n0, c0)
4900 return std::pair(PtrBase, Offset);
4901 }
4902 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
4903 // TODO
4904
4905
4906 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
4907 // TODO
4908
4909 }
4910
4911 return std::pair(Root.getReg(), 0);
4912 }
4913
4914 InstructionSelector::ComplexRendererFns
4915 AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
4916 Register Reg;
4917 unsigned Offset;
4918 std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
4919 return {{
4920 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
4921 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
4922 }};
4923 }
4924
4925 InstructionSelector::ComplexRendererFns
4926 AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
4927 return selectDSReadWrite2(Root, 4);
4928 }
4929
4930 InstructionSelector::ComplexRendererFns
4931 AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
4932 return selectDSReadWrite2(Root, 8);
4933 }
4934
4935 InstructionSelector::ComplexRendererFns
4936 AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
4937 unsigned Size) const {
4938 Register Reg;
4939 unsigned Offset;
4940 std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, Size);
4941 return {{
4942 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
4943 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
4944 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
4945 }};
4946 }
4947
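// Match a base register and a pair of adjacent Size-aligned offsets for
// ds_read2/ds_write2 style instructions; the returned offset is scaled by
// Size. For example, with Size == 4 a constant byte offset of 8 is returned
// as 2, and the caller renders offset+1 for the second element.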
4948 std::pair<Register, unsigned>
4949 AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
4950 unsigned Size) const {
4951 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
4952 if (!RootDef)
4953 return std::pair(Root.getReg(), 0);
4954
4955 int64_t ConstAddr = 0;
4956
4957 Register PtrBase;
4958 int64_t Offset;
4959 std::tie(PtrBase, Offset) =
4960 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
4961
4962 if (Offset) {
4963 int64_t OffsetValue0 = Offset;
4964 int64_t OffsetValue1 = Offset + Size;
4965 if (isDSOffset2Legal(PtrBase, OffsetValue0, OffsetValue1, Size)) {
4966 // (add n0, c0)
4967 return std::pair(PtrBase, OffsetValue0 / Size);
4968 }
4969 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
4970 // TODO
4971
4972 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
4973 // TODO
4974
4975 }
4976
4977 return std::pair(Root.getReg(), 0);
4978 }
4979
4980 /// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
4981 /// the base value with the constant offset. There may be intervening copies
4982 /// between \p Root and the identified constant. Returns \p Root, 0 if this does
4983 /// not match the pattern.
4984 std::pair<Register, int64_t>
4985 AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
4986 Register Root, const MachineRegisterInfo &MRI) const {
4987 MachineInstr *RootI = getDefIgnoringCopies(Root, MRI);
4988 if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
4989 return {Root, 0};
4990
4991 MachineOperand &RHS = RootI->getOperand(2);
4992 std::optional<ValueAndVReg> MaybeOffset =
4993 getIConstantVRegValWithLookThrough(RHS.getReg(), MRI);
4994 if (!MaybeOffset)
4995 return {Root, 0};
4996 return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue()};
4997 }
4998
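// Operand renderer that adds a zero immediate, used for trailing optional
// operands such as cpol, tfe and swz.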
4999 static void addZeroImm(MachineInstrBuilder &MIB) {
5000 MIB.addImm(0);
5001 }
5002
5003 /// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
5004 /// BasePtr is not valid, a null base pointer will be used.
5005 static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
5006 uint32_t FormatLo, uint32_t FormatHi,
5007 Register BasePtr) {
5008 Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5009 Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5010 Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5011 Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
5012
5013 B.buildInstr(AMDGPU::S_MOV_B32)
5014 .addDef(RSrc2)
5015 .addImm(FormatLo);
5016 B.buildInstr(AMDGPU::S_MOV_B32)
5017 .addDef(RSrc3)
5018 .addImm(FormatHi);
5019
5020 // Build the half of the subregister with the constants before building the
5021 // full 128-bit register. If we are building multiple resource descriptors,
5022 // this will allow CSEing of the 2-component register.
5023 B.buildInstr(AMDGPU::REG_SEQUENCE)
5024 .addDef(RSrcHi)
5025 .addReg(RSrc2)
5026 .addImm(AMDGPU::sub0)
5027 .addReg(RSrc3)
5028 .addImm(AMDGPU::sub1);
5029
5030 Register RSrcLo = BasePtr;
5031 if (!BasePtr) {
5032 RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5033 B.buildInstr(AMDGPU::S_MOV_B64)
5034 .addDef(RSrcLo)
5035 .addImm(0);
5036 }
5037
5038 B.buildInstr(AMDGPU::REG_SEQUENCE)
5039 .addDef(RSrc)
5040 .addReg(RSrcLo)
5041 .addImm(AMDGPU::sub0_sub1)
5042 .addReg(RSrcHi)
5043 .addImm(AMDGPU::sub2_sub3);
5044
5045 return RSrc;
5046 }
5047
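// Build a resource descriptor with the subtarget's default data format for
// use with MUBUF addr64 addressing.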
5048 static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
5049 const SIInstrInfo &TII, Register BasePtr) {
5050 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
5051
5052 // FIXME: Why are half the "default" bits ignored based on the addressing
5053 // mode?
5054 return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
5055 }
5056
5057 static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
5058 const SIInstrInfo &TII, Register BasePtr) {
5059 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
5060
5061 // FIXME: Why are half the "default" bits ignored based on the addressing
5062 // mode?
5063 return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
5064 }
5065
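// Split a MUBUF address into its components: a base (N0), an optional inner
// pointer add (N2 + N3), and a constant offset.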
5066 AMDGPUInstructionSelector::MUBUFAddressData
5067 AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
5068 MUBUFAddressData Data;
5069 Data.N0 = Src;
5070
5071 Register PtrBase;
5072 int64_t Offset;
5073
5074 std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI);
5075 if (isUInt<32>(Offset)) {
5076 Data.N0 = PtrBase;
5077 Data.Offset = Offset;
5078 }
5079
5080 if (MachineInstr *InputAdd
5081 = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
5082 Data.N2 = InputAdd->getOperand(1).getReg();
5083 Data.N3 = InputAdd->getOperand(2).getReg();
5084
5085     // FIXME: Need to fix extra SGPR->VGPR copies inserted
5086     // FIXME: Don't know if this was defined by operand 0
5087 //
5088 // TODO: Remove this when we have copy folding optimizations after
5089 // RegBankSelect.
5090 Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
5091 Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
5092 }
5093
5094 return Data;
5095 }
5096
5097 /// Return whether the addr64 MUBUF mode should be used for the given address.
5098 bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
5099 // (ptr_add N2, N3) -> addr64, or
5100 // (ptr_add (ptr_add N2, N3), C1) -> addr64
5101 if (Addr.N2)
5102 return true;
5103
5104 const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
5105 return N0Bank->getID() == AMDGPU::VGPRRegBankID;
5106 }
5107
5108 /// Split an immediate offset \p ImmOffset depending on whether it fits in the
5109 /// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
5110 /// component.
5111 void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
5112 MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
5113 if (TII.isLegalMUBUFImmOffset(ImmOffset))
5114 return;
5115
5116 // Illegal offset, store it in soffset.
5117 SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5118 B.buildInstr(AMDGPU::S_MOV_B32)
5119 .addDef(SOffset)
5120 .addImm(ImmOffset);
5121 ImmOffset = 0;
5122 }
5123
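// Compute the vaddr, resource descriptor, soffset and immediate offset for a
// MUBUF addr64 access. Returns false if addr64 cannot or should not be used.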
5124 bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
5125 MachineOperand &Root, Register &VAddr, Register &RSrcReg,
5126 Register &SOffset, int64_t &Offset) const {
5127 // FIXME: Predicates should stop this from reaching here.
5128   // The addr64 bit was removed for Volcanic Islands.
5129 if (!STI.hasAddr64() || STI.useFlatForGlobal())
5130 return false;
5131
5132 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
5133 if (!shouldUseAddr64(AddrData))
5134 return false;
5135
5136 Register N0 = AddrData.N0;
5137 Register N2 = AddrData.N2;
5138 Register N3 = AddrData.N3;
5139 Offset = AddrData.Offset;
5140
5141 // Base pointer for the SRD.
5142 Register SRDPtr;
5143
5144 if (N2) {
5145 if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
5146 assert(N3);
5147 if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
5148 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
5149 // addr64, and construct the default resource from a 0 address.
5150 VAddr = N0;
5151 } else {
5152 SRDPtr = N3;
5153 VAddr = N2;
5154 }
5155 } else {
5156 // N2 is not divergent.
5157 SRDPtr = N2;
5158 VAddr = N3;
5159 }
5160 } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
5161 // Use the default null pointer in the resource
5162 VAddr = N0;
5163 } else {
5164 // N0 -> offset, or
5165 // (N0 + C1) -> offset
5166 SRDPtr = N0;
5167 }
5168
5169 MachineIRBuilder B(*Root.getParent());
5170 RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
5171 splitIllegalMUBUFOffset(B, SOffset, Offset);
5172 return true;
5173 }
5174
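// Compute the resource descriptor, soffset and immediate offset for a MUBUF
// access that uses only the offset addressing mode.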
5175 bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
5176 MachineOperand &Root, Register &RSrcReg, Register &SOffset,
5177 int64_t &Offset) const {
5178
5179 // FIXME: Pattern should not reach here.
5180 if (STI.useFlatForGlobal())
5181 return false;
5182
5183 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
5184 if (shouldUseAddr64(AddrData))
5185 return false;
5186
5187 // N0 -> offset, or
5188 // (N0 + C1) -> offset
5189 Register SRDPtr = AddrData.N0;
5190 Offset = AddrData.Offset;
5191
5192 // TODO: Look through extensions for 32-bit soffset.
5193 MachineIRBuilder B(*Root.getParent());
5194
5195 RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
5196 splitIllegalMUBUFOffset(B, SOffset, Offset);
5197 return true;
5198 }
5199
5200 InstructionSelector::ComplexRendererFns
5201 AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
5202 Register VAddr;
5203 Register RSrcReg;
5204 Register SOffset;
5205 int64_t Offset = 0;
5206
5207 if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
5208 return {};
5209
5210 // FIXME: Use defaulted operands for trailing 0s and remove from the complex
5211 // pattern.
5212 return {{
5213 [=](MachineInstrBuilder &MIB) { // rsrc
5214 MIB.addReg(RSrcReg);
5215 },
5216 [=](MachineInstrBuilder &MIB) { // vaddr
5217 MIB.addReg(VAddr);
5218 },
5219 [=](MachineInstrBuilder &MIB) { // soffset
5220 if (SOffset)
5221 MIB.addReg(SOffset);
5222 else if (STI.hasRestrictedSOffset())
5223 MIB.addReg(AMDGPU::SGPR_NULL);
5224 else
5225 MIB.addImm(0);
5226 },
5227 [=](MachineInstrBuilder &MIB) { // offset
5228 MIB.addImm(Offset);
5229 },
5230 addZeroImm, // cpol
5231 addZeroImm, // tfe
5232 addZeroImm // swz
5233 }};
5234 }
5235
5236 InstructionSelector::ComplexRendererFns
5237 AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
5238 Register RSrcReg;
5239 Register SOffset;
5240 int64_t Offset = 0;
5241
5242 if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
5243 return {};
5244
5245 return {{
5246 [=](MachineInstrBuilder &MIB) { // rsrc
5247 MIB.addReg(RSrcReg);
5248 },
5249 [=](MachineInstrBuilder &MIB) { // soffset
5250 if (SOffset)
5251 MIB.addReg(SOffset);
5252 else if (STI.hasRestrictedSOffset())
5253 MIB.addReg(AMDGPU::SGPR_NULL);
5254 else
5255 MIB.addImm(0);
5256 },
5257 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
5258 addZeroImm, // cpol
5259 addZeroImm, // tfe
5260 addZeroImm, // swz
5261 }};
5262 }
5263
5264 InstructionSelector::ComplexRendererFns
5265 AMDGPUInstructionSelector::selectBUFSOffset(MachineOperand &Root) const {
5266
5267 Register SOffset = Root.getReg();
5268
5269 if (STI.hasRestrictedSOffset() && mi_match(SOffset, *MRI, m_ZeroInt()))
5270 SOffset = AMDGPU::SGPR_NULL;
5271
5272 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
5273 }
5274
5275 /// Get an immediate that must be 32 bits, and is treated as zero extended.
5276 static std::optional<uint64_t>
5277 getConstantZext32Val(Register Reg, const MachineRegisterInfo &MRI) {
5278 // getIConstantVRegVal sexts any values, so see if that matters.
5279 std::optional<int64_t> OffsetVal = getIConstantVRegSExtVal(Reg, MRI);
5280 if (!OffsetVal || !isInt<32>(*OffsetVal))
5281 return std::nullopt;
5282 return Lo_32(*OffsetVal);
5283 }
5284
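// Match a constant buffer offset and render it in its encoded SMRD form.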
5285 InstructionSelector::ComplexRendererFns
5286 AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
5287 std::optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
5288 if (!OffsetVal)
5289 return {};
5290
5291 std::optional<int64_t> EncodedImm =
5292 AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
5293 if (!EncodedImm)
5294 return {};
5295
5296 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
5297 }
5298
5299 InstructionSelector::ComplexRendererFns
5300 AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
5301 assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
5302
5303 std::optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
5304 if (!OffsetVal)
5305 return {};
5306
5307 std::optional<int64_t> EncodedImm =
5308 AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal);
5309 if (!EncodedImm)
5310 return {};
5311
5312 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
5313 }
5314
5315 InstructionSelector::ComplexRendererFns
5316 AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
5317 // Match the (soffset + offset) pair as a 32-bit register base and
5318 // an immediate offset.
5319 Register SOffset;
5320 unsigned Offset;
5321 std::tie(SOffset, Offset) = AMDGPU::getBaseWithConstantOffset(
5322 *MRI, Root.getReg(), KB, /*CheckNUW*/ true);
5323 if (!SOffset)
5324 return std::nullopt;
5325
5326 std::optional<int64_t> EncodedOffset =
5327 AMDGPU::getSMRDEncodedOffset(STI, Offset, /* IsBuffer */ true);
5328 if (!EncodedOffset)
5329 return std::nullopt;
5330
5331 assert(MRI->getType(SOffset) == LLT::scalar(32));
5332 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
5333 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedOffset); }}};
5334 }
5335
5336 // Variant of stripBitCast that returns the instruction instead of a
5337 // MachineOperand.
5338 static MachineInstr *stripBitCast(MachineInstr *MI, MachineRegisterInfo &MRI) {
5339 if (MI->getOpcode() == AMDGPU::G_BITCAST)
5340 return getDefIgnoringCopies(MI->getOperand(1).getReg(), MRI);
5341 return MI;
5342 }
5343
5344 // Figure out if this is really an extract of the high 16 bits of a dword;
5345 // returns nullptr if it isn't.
5346 static MachineInstr *isExtractHiElt(MachineInstr *Inst,
5347 MachineRegisterInfo &MRI) {
5348 Inst = stripBitCast(Inst, MRI);
5349
5350 if (Inst->getOpcode() != AMDGPU::G_TRUNC)
5351 return nullptr;
5352
5353 MachineInstr *TruncOp =
5354 getDefIgnoringCopies(Inst->getOperand(1).getReg(), MRI);
5355 TruncOp = stripBitCast(TruncOp, MRI);
5356
5357 // G_LSHR x, (G_CONSTANT i32 16)
5358 if (TruncOp->getOpcode() == AMDGPU::G_LSHR) {
5359 auto SrlAmount = getIConstantVRegValWithLookThrough(
5360 TruncOp->getOperand(2).getReg(), MRI);
5361 if (SrlAmount && SrlAmount->Value.getZExtValue() == 16) {
5362 MachineInstr *SrlOp =
5363 getDefIgnoringCopies(TruncOp->getOperand(1).getReg(), MRI);
5364 return stripBitCast(SrlOp, MRI);
5365 }
5366 }
5367
5368 // G_SHUFFLE_VECTOR x, y, shufflemask(1, 1|0)
5369 // 1, 0 swaps the low/high 16 bits.
5370 // 1, 1 sets the high 16 bits to be the same as the low 16.
5371   // In any case, it selects the high elements.
5372 if (TruncOp->getOpcode() == AMDGPU::G_SHUFFLE_VECTOR) {
5373 assert(MRI.getType(TruncOp->getOperand(0).getReg()) ==
5374 LLT::fixed_vector(2, 16));
5375
5376 ArrayRef<int> Mask = TruncOp->getOperand(3).getShuffleMask();
5377 assert(Mask.size() == 2);
5378
5379 if (Mask[0] == 1 && Mask[1] <= 1) {
5380 MachineInstr *LHS =
5381 getDefIgnoringCopies(TruncOp->getOperand(1).getReg(), MRI);
5382 return stripBitCast(LHS, MRI);
5383 }
5384 }
5385
5386 return nullptr;
5387 }
5388
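// Select the source and modifiers for a mad_mix/fma_mix style operand.
// Matched is set if an f16-to-f32 fpext was folded into the op_sel/op_sel_hi
// modifiers.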
5389 std::pair<Register, unsigned>
5390 AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
5391 bool &Matched) const {
5392 Matched = false;
5393
5394 Register Src;
5395 unsigned Mods;
5396 std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
5397
5398 MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
5399 if (MI->getOpcode() == AMDGPU::G_FPEXT) {
5400 MachineOperand *MO = &MI->getOperand(1);
5401 Src = MO->getReg();
5402 MI = getDefIgnoringCopies(Src, *MRI);
5403
5404 assert(MRI->getType(Src) == LLT::scalar(16));
5405
5406 // See through bitcasts.
5407 // FIXME: Would be nice to use stripBitCast here.
5408 if (MI->getOpcode() == AMDGPU::G_BITCAST) {
5409 MO = &MI->getOperand(1);
5410 Src = MO->getReg();
5411 MI = getDefIgnoringCopies(Src, *MRI);
5412 }
5413
5414 const auto CheckAbsNeg = [&]() {
5415 // Be careful about folding modifiers if we already have an abs. fneg is
5416 // applied last, so we don't want to apply an earlier fneg.
5417 if ((Mods & SISrcMods::ABS) == 0) {
5418 unsigned ModsTmp;
5419 std::tie(Src, ModsTmp) = selectVOP3ModsImpl(*MO);
5420 MI = getDefIgnoringCopies(Src, *MRI);
5421
5422 if ((ModsTmp & SISrcMods::NEG) != 0)
5423 Mods ^= SISrcMods::NEG;
5424
5425 if ((ModsTmp & SISrcMods::ABS) != 0)
5426 Mods |= SISrcMods::ABS;
5427 }
5428 };
5429
5430 CheckAbsNeg();
5431
5432 // op_sel/op_sel_hi decide the source type and source.
5433 // If the source's op_sel_hi is set, it indicates to do a conversion from
5434     // fp16. If the source's op_sel is set, it picks the high half of the
5435 // source register.
5436
5437 Mods |= SISrcMods::OP_SEL_1;
5438
5439 if (MachineInstr *ExtractHiEltMI = isExtractHiElt(MI, *MRI)) {
5440 Mods |= SISrcMods::OP_SEL_0;
5441 MI = ExtractHiEltMI;
5442 MO = &MI->getOperand(0);
5443 Src = MO->getReg();
5444
5445 CheckAbsNeg();
5446 }
5447
5448 Matched = true;
5449 }
5450
5451 return {Src, Mods};
5452 }
5453
5454 InstructionSelector::ComplexRendererFns
5455 AMDGPUInstructionSelector::selectVOP3PMadMixModsExt(
5456 MachineOperand &Root) const {
5457 Register Src;
5458 unsigned Mods;
5459 bool Matched;
5460 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
5461 if (!Matched)
5462 return {};
5463
5464 return {{
5465 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5466 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5467 }};
5468 }
5469
5470 InstructionSelector::ComplexRendererFns
5471 AMDGPUInstructionSelector::selectVOP3PMadMixMods(MachineOperand &Root) const {
5472 Register Src;
5473 unsigned Mods;
5474 bool Matched;
5475 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
5476
5477 return {{
5478 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5479 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5480 }};
5481 }
5482
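// Select s_barrier_signal_isfirst(_var), copying a variable barrier id into
// M0 and the resulting SCC into the result register.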
5483 bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst(
5484 MachineInstr &I, Intrinsic::ID IntrID) const {
5485 MachineBasicBlock *MBB = I.getParent();
5486 const DebugLoc &DL = I.getDebugLoc();
5487 Register CCReg = I.getOperand(0).getReg();
5488
5489 bool HasM0 = IntrID == Intrinsic::amdgcn_s_barrier_signal_isfirst_var;
5490
5491 if (HasM0) {
5492 auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
5493 .addReg(I.getOperand(2).getReg());
5494 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0));
5495 if (!constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI))
5496 return false;
5497 } else {
5498 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM))
5499 .addImm(I.getOperand(2).getImm());
5500 }
5501
5502 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);
5503
5504 I.eraseFromParent();
5505 return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass,
5506 *MRI);
5507 }
5508
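// Return the IMM or M0 form of the machine opcode for a named barrier
// intrinsic.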
5509 unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
5510 if (HasInlineConst) {
5511 switch (IntrID) {
5512 default:
5513 llvm_unreachable("not a named barrier op");
5514 case Intrinsic::amdgcn_s_barrier_init:
5515 return AMDGPU::S_BARRIER_INIT_IMM;
5516 case Intrinsic::amdgcn_s_barrier_join:
5517 return AMDGPU::S_BARRIER_JOIN_IMM;
5518 case Intrinsic::amdgcn_s_wakeup_barrier:
5519 return AMDGPU::S_WAKEUP_BARRIER_IMM;
5520 case Intrinsic::amdgcn_s_get_barrier_state:
5521 return AMDGPU::S_GET_BARRIER_STATE_IMM;
5522 };
5523 } else {
5524 switch (IntrID) {
5525 default:
5526 llvm_unreachable("not a named barrier op");
5527 case Intrinsic::amdgcn_s_barrier_init:
5528 return AMDGPU::S_BARRIER_INIT_M0;
5529 case Intrinsic::amdgcn_s_barrier_join:
5530 return AMDGPU::S_BARRIER_JOIN_M0;
5531 case Intrinsic::amdgcn_s_wakeup_barrier:
5532 return AMDGPU::S_WAKEUP_BARRIER_M0;
5533 case Intrinsic::amdgcn_s_get_barrier_state:
5534 return AMDGPU::S_GET_BARRIER_STATE_M0;
5535 };
5536 }
5537 }
5538
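// Select the named barrier intrinsics. S_BARRIER_INIT always passes the
// member count in M0, and a barrier id that is not an inline constant is also
// routed through M0.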
5539 bool AMDGPUInstructionSelector::selectNamedBarrierInst(
5540 MachineInstr &I, Intrinsic::ID IntrID) const {
5541 MachineBasicBlock *MBB = I.getParent();
5542 const DebugLoc &DL = I.getDebugLoc();
5543 MachineOperand BarOp = IntrID == Intrinsic::amdgcn_s_get_barrier_state
5544 ? I.getOperand(2)
5545 : I.getOperand(1);
5546 std::optional<int64_t> BarValImm =
5547 getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
5548 Register M0Val;
5549 Register TmpReg0;
5550
5551   // For S_BARRIER_INIT, the member count will always be read from M0[16:22].
5552 if (IntrID == Intrinsic::amdgcn_s_barrier_init) {
5553 Register MemberCount = I.getOperand(2).getReg();
5554 TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5555     // TODO: This should be expanded during legalization so that the S_LSHL
5556     // and S_OR can be constant-folded.
5557 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
5558 .addImm(16)
5559 .addReg(MemberCount);
5560 M0Val = TmpReg0;
5561 }
5562
5563   // If not inlinable, get a reference to the barrier depending on the instruction.
5564 if (!BarValImm) {
5565 if (IntrID == Intrinsic::amdgcn_s_barrier_init) {
5566       // If the reference to the barrier id is not an inlinable constant, then it
5567       // must be referenced with M0[4:0]. Perform an OR with the member count to
5568       // include it in M0 for S_BARRIER_INIT.
5569 Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5570 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_OR_B32), TmpReg1)
5571 .addReg(BarOp.getReg())
5572 .addReg(TmpReg0);
5573 M0Val = TmpReg1;
5574 } else {
5575 M0Val = BarOp.getReg();
5576 }
5577 }
5578
5579 // Build copy to M0 if needed. For S_BARRIER_INIT, M0 is always required.
5580 if (M0Val) {
5581 auto CopyMIB =
5582 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(M0Val);
5583 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
5584 }
5585
5586 MachineInstrBuilder MIB;
5587 unsigned Opc = getNamedBarrierOp(BarValImm.has_value(), IntrID);
5588 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
5589
5590 if (IntrID == Intrinsic::amdgcn_s_get_barrier_state)
5591 MIB.addDef(I.getOperand(0).getReg());
5592
5593 if (BarValImm)
5594 MIB.addImm(*BarValImm);
5595
5596 I.eraseFromParent();
5597 return true;
5598 }
5599
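// Select s_barrier_leave and copy the resulting SCC into the result register.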
5600 bool AMDGPUInstructionSelector::selectSBarrierLeave(MachineInstr &I) const {
5601 MachineBasicBlock *BB = I.getParent();
5602 const DebugLoc &DL = I.getDebugLoc();
5603 Register CCReg = I.getOperand(0).getReg();
5604
5605 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_BARRIER_LEAVE));
5606 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);
5607
5608 I.eraseFromParent();
5609 return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass,
5610 *MRI);
5611 }
5612
5613 void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
5614 const MachineInstr &MI,
5615 int OpIdx) const {
5616 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
5617 "Expected G_CONSTANT");
5618 MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
5619 }
5620
5621 void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
5622 const MachineInstr &MI,
5623 int OpIdx) const {
5624 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
5625 "Expected G_CONSTANT");
5626 MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
5627 }
5628
5629 void AMDGPUInstructionSelector::renderBitcastImm(MachineInstrBuilder &MIB,
5630 const MachineInstr &MI,
5631 int OpIdx) const {
5632 assert(OpIdx == -1);
5633
5634 const MachineOperand &Op = MI.getOperand(1);
5635 if (MI.getOpcode() == TargetOpcode::G_FCONSTANT)
5636 MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
5637 else {
5638 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT");
5639 MIB.addImm(Op.getCImm()->getSExtValue());
5640 }
5641 }
5642
5643 void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
5644 const MachineInstr &MI,
5645 int OpIdx) const {
5646 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
5647 "Expected G_CONSTANT");
5648 MIB.addImm(MI.getOperand(1).getCImm()->getValue().popcount());
5649 }
5650
5651 /// This only really exists to satisfy DAG type checking machinery, so is a
5652 /// no-op here.
5653 void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
5654 const MachineInstr &MI,
5655 int OpIdx) const {
5656 MIB.addImm(MI.getOperand(OpIdx).getImm());
5657 }
5658
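// Render a boolean op_sel operand as the OP_SEL_0 source modifier bit (or 0).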
5659 void AMDGPUInstructionSelector::renderOpSelTImm(MachineInstrBuilder &MIB,
5660 const MachineInstr &MI,
5661 int OpIdx) const {
5662 assert(OpIdx >= 0 && "expected to match an immediate operand");
5663 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)SISrcMods::OP_SEL_0 : 0);
5664 }
5665
5666 void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
5667 const MachineInstr &MI,
5668 int OpIdx) const {
5669 assert(OpIdx >= 0 && "expected to match an immediate operand");
5670 MIB.addImm(MI.getOperand(OpIdx).getImm() &
5671 (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
5672 : AMDGPU::CPol::ALL_pregfx12));
5673 }
5674
5675 void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
5676 const MachineInstr &MI,
5677 int OpIdx) const {
5678 assert(OpIdx >= 0 && "expected to match an immediate operand");
5679 const bool Swizzle = MI.getOperand(OpIdx).getImm() &
5680 (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::SWZ
5681 : AMDGPU::CPol::SWZ_pregfx12);
5682 MIB.addImm(Swizzle);
5683 }
5684
5685 void AMDGPUInstructionSelector::renderExtractCpolSetGLC(
5686 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
5687 assert(OpIdx >= 0 && "expected to match an immediate operand");
5688 const uint32_t Cpol = MI.getOperand(OpIdx).getImm() &
5689 (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
5690 : AMDGPU::CPol::ALL_pregfx12);
5691 MIB.addImm(Cpol | AMDGPU::CPol::GLC);
5692 }
5693
5694 void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
5695 const MachineInstr &MI,
5696 int OpIdx) const {
5697 MIB.addFrameIndex(MI.getOperand(1).getIndex());
5698 }
5699
5700 void AMDGPUInstructionSelector::renderFPPow2ToExponent(MachineInstrBuilder &MIB,
5701 const MachineInstr &MI,
5702 int OpIdx) const {
5703 const APFloat &APF = MI.getOperand(1).getFPImm()->getValueAPF();
5704 int ExpVal = APF.getExactLog2Abs();
5705 assert(ExpVal != INT_MIN);
5706 MIB.addImm(ExpVal);
5707 }
5708
5709 bool AMDGPUInstructionSelector::isInlineImmediate(const APInt &Imm) const {
5710 return TII.isInlineConstant(Imm);
5711 }
5712
5713 bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
5714 return TII.isInlineConstant(Imm);
5715 }
5716