xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp (revision 700637cbb5e582861067a11aaca4d053546871d2)
1 //===-- AMDGPURegBankLegalizeRules.cpp ------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// Definitions of RegBankLegalize Rules for all opcodes.
10 /// Implementation of container for all the Rules and search.
11 /// Fast search for most common case when Rule.Predicate checks LLT and
12 /// uniformity of register in operand 0.
13 //
14 //===----------------------------------------------------------------------===//
15 
#include "AMDGPURegBankLegalizeRules.h"
#include "AMDGPUInstrInfo.h"
#include "GCNSubtarget.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Support/AMDGPUAddrSpace.h"
#include <utility>
23 
24 #define DEBUG_TYPE "amdgpu-regbanklegalize"
25 
26 using namespace llvm;
27 using namespace AMDGPU;
28 
isAnyPtr(LLT Ty,unsigned Width)29 bool AMDGPU::isAnyPtr(LLT Ty, unsigned Width) {
30   return Ty.isPointer() && Ty.getSizeInBits() == Width;
31 }
32 
// Mapping of register banks / LLTs to apply to an instruction's destination
// and source operands, plus the lowering method used to legalize it.
RegBankLLTMapping::RegBankLLTMapping(
    std::initializer_list<RegBankLLTMappingApplyID> DstOpMappingList,
    std::initializer_list<RegBankLLTMappingApplyID> SrcOpMappingList,
    LoweringMethodID LoweringMethod)
    : DstOpMapping(DstOpMappingList), SrcOpMapping(SrcOpMappingList),
      LoweringMethod(LoweringMethod) {}
39 
PredicateMapping(std::initializer_list<UniformityLLTOpPredicateID> OpList,std::function<bool (const MachineInstr &)> TestFunc)40 PredicateMapping::PredicateMapping(
41     std::initializer_list<UniformityLLTOpPredicateID> OpList,
42     std::function<bool(const MachineInstr &)> TestFunc)
43     : OpUniformityAndTypes(OpList), TestFunc(TestFunc) {}
44 
// Return true if register Reg satisfies the predicate UniID. Predicates come
// in three groups: type-only (match LLT regardless of uniformity), Uni*
// (type + MUI.isUniform) and Div* (type + MUI.isDivergent). B* predicates
// match on total size in bits only; Ptr* match any address space of the
// given pointer width via isAnyPtr.
bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID,
                           const MachineUniformityInfo &MUI,
                           const MachineRegisterInfo &MRI) {
  switch (UniID) {
  // Type-only predicates: uniformity of Reg is ignored.
  case S1:
    return MRI.getType(Reg) == LLT::scalar(1);
  case S16:
    return MRI.getType(Reg) == LLT::scalar(16);
  case S32:
    return MRI.getType(Reg) == LLT::scalar(32);
  case S64:
    return MRI.getType(Reg) == LLT::scalar(64);
  case S128:
    return MRI.getType(Reg) == LLT::scalar(128);
  case P0:
    return MRI.getType(Reg) == LLT::pointer(0, 64);
  case P1:
    return MRI.getType(Reg) == LLT::pointer(1, 64);
  case P3:
    return MRI.getType(Reg) == LLT::pointer(3, 32);
  case P4:
    return MRI.getType(Reg) == LLT::pointer(4, 64);
  case P5:
    return MRI.getType(Reg) == LLT::pointer(5, 32);
  case Ptr32:
    return isAnyPtr(MRI.getType(Reg), 32);
  case Ptr64:
    return isAnyPtr(MRI.getType(Reg), 64);
  case Ptr128:
    return isAnyPtr(MRI.getType(Reg), 128);
  case V2S32:
    return MRI.getType(Reg) == LLT::fixed_vector(2, 32);
  case V4S32:
    return MRI.getType(Reg) == LLT::fixed_vector(4, 32);
  // Size-only predicates: match any LLT with the given size in bits.
  case B32:
    return MRI.getType(Reg).getSizeInBits() == 32;
  case B64:
    return MRI.getType(Reg).getSizeInBits() == 64;
  case B96:
    return MRI.getType(Reg).getSizeInBits() == 96;
  case B128:
    return MRI.getType(Reg).getSizeInBits() == 128;
  case B256:
    return MRI.getType(Reg).getSizeInBits() == 256;
  case B512:
    return MRI.getType(Reg).getSizeInBits() == 512;
  // Uniform predicates: type check plus MUI.isUniform.
  case UniS1:
    return MRI.getType(Reg) == LLT::scalar(1) && MUI.isUniform(Reg);
  case UniS16:
    return MRI.getType(Reg) == LLT::scalar(16) && MUI.isUniform(Reg);
  case UniS32:
    return MRI.getType(Reg) == LLT::scalar(32) && MUI.isUniform(Reg);
  case UniS64:
    return MRI.getType(Reg) == LLT::scalar(64) && MUI.isUniform(Reg);
  case UniS128:
    return MRI.getType(Reg) == LLT::scalar(128) && MUI.isUniform(Reg);
  case UniP0:
    return MRI.getType(Reg) == LLT::pointer(0, 64) && MUI.isUniform(Reg);
  case UniP1:
    return MRI.getType(Reg) == LLT::pointer(1, 64) && MUI.isUniform(Reg);
  case UniP3:
    return MRI.getType(Reg) == LLT::pointer(3, 32) && MUI.isUniform(Reg);
  case UniP4:
    return MRI.getType(Reg) == LLT::pointer(4, 64) && MUI.isUniform(Reg);
  case UniP5:
    return MRI.getType(Reg) == LLT::pointer(5, 32) && MUI.isUniform(Reg);
  case UniPtr32:
    return isAnyPtr(MRI.getType(Reg), 32) && MUI.isUniform(Reg);
  case UniPtr64:
    return isAnyPtr(MRI.getType(Reg), 64) && MUI.isUniform(Reg);
  case UniPtr128:
    return isAnyPtr(MRI.getType(Reg), 128) && MUI.isUniform(Reg);
  case UniV2S16:
    return MRI.getType(Reg) == LLT::fixed_vector(2, 16) && MUI.isUniform(Reg);
  case UniB32:
    return MRI.getType(Reg).getSizeInBits() == 32 && MUI.isUniform(Reg);
  case UniB64:
    return MRI.getType(Reg).getSizeInBits() == 64 && MUI.isUniform(Reg);
  case UniB96:
    return MRI.getType(Reg).getSizeInBits() == 96 && MUI.isUniform(Reg);
  case UniB128:
    return MRI.getType(Reg).getSizeInBits() == 128 && MUI.isUniform(Reg);
  case UniB256:
    return MRI.getType(Reg).getSizeInBits() == 256 && MUI.isUniform(Reg);
  case UniB512:
    return MRI.getType(Reg).getSizeInBits() == 512 && MUI.isUniform(Reg);
  // Divergent predicates: type check plus MUI.isDivergent.
  case DivS1:
    return MRI.getType(Reg) == LLT::scalar(1) && MUI.isDivergent(Reg);
  case DivS16:
    return MRI.getType(Reg) == LLT::scalar(16) && MUI.isDivergent(Reg);
  case DivS32:
    return MRI.getType(Reg) == LLT::scalar(32) && MUI.isDivergent(Reg);
  case DivS64:
    return MRI.getType(Reg) == LLT::scalar(64) && MUI.isDivergent(Reg);
  case DivS128:
    return MRI.getType(Reg) == LLT::scalar(128) && MUI.isDivergent(Reg);
  case DivP0:
    return MRI.getType(Reg) == LLT::pointer(0, 64) && MUI.isDivergent(Reg);
  case DivP1:
    return MRI.getType(Reg) == LLT::pointer(1, 64) && MUI.isDivergent(Reg);
  case DivP3:
    return MRI.getType(Reg) == LLT::pointer(3, 32) && MUI.isDivergent(Reg);
  case DivP4:
    return MRI.getType(Reg) == LLT::pointer(4, 64) && MUI.isDivergent(Reg);
  case DivP5:
    return MRI.getType(Reg) == LLT::pointer(5, 32) && MUI.isDivergent(Reg);
  case DivPtr32:
    return isAnyPtr(MRI.getType(Reg), 32) && MUI.isDivergent(Reg);
  case DivPtr64:
    return isAnyPtr(MRI.getType(Reg), 64) && MUI.isDivergent(Reg);
  case DivPtr128:
    return isAnyPtr(MRI.getType(Reg), 128) && MUI.isDivergent(Reg);
  case DivV2S16:
    return MRI.getType(Reg) == LLT::fixed_vector(2, 16) && MUI.isDivergent(Reg);
  case DivB32:
    return MRI.getType(Reg).getSizeInBits() == 32 && MUI.isDivergent(Reg);
  case DivB64:
    return MRI.getType(Reg).getSizeInBits() == 64 && MUI.isDivergent(Reg);
  case DivB96:
    return MRI.getType(Reg).getSizeInBits() == 96 && MUI.isDivergent(Reg);
  case DivB128:
    return MRI.getType(Reg).getSizeInBits() == 128 && MUI.isDivergent(Reg);
  case DivB256:
    return MRI.getType(Reg).getSizeInBits() == 256 && MUI.isDivergent(Reg);
  case DivB512:
    return MRI.getType(Reg).getSizeInBits() == 512 && MUI.isDivergent(Reg);
  // Wildcard: matches any register.
  case _:
    return true;
  default:
    llvm_unreachable("missing matchUniformityAndLLT");
  }
}
177 
match(const MachineInstr & MI,const MachineUniformityInfo & MUI,const MachineRegisterInfo & MRI) const178 bool PredicateMapping::match(const MachineInstr &MI,
179                              const MachineUniformityInfo &MUI,
180                              const MachineRegisterInfo &MRI) const {
181   // Check LLT signature.
182   for (unsigned i = 0; i < OpUniformityAndTypes.size(); ++i) {
183     if (OpUniformityAndTypes[i] == _) {
184       if (MI.getOperand(i).isReg())
185         return false;
186       continue;
187     }
188 
189     // Remaining IDs check registers.
190     if (!MI.getOperand(i).isReg())
191       return false;
192 
193     if (!matchUniformityAndLLT(MI.getOperand(i).getReg(),
194                                OpUniformityAndTypes[i], MUI, MRI))
195       return false;
196   }
197 
198   // More complex check.
199   if (TestFunc)
200     return TestFunc(MI);
201 
202   return true;
203 }
204 
SetOfRulesForOpcode()205 SetOfRulesForOpcode::SetOfRulesForOpcode() {}
206 
// Rule set with a fast-lookup table keyed by the FastTypes family; slots are
// filled later via addFastRuleUniform/addFastRuleDivergent.
SetOfRulesForOpcode::SetOfRulesForOpcode(FastRulesTypes FastTypes)
    : FastTypes(FastTypes) {}
209 
// Map an exact LLT to the predicate ID used by the Standard/Vector fast-rule
// tables. Types not listed here fall through to '_' (no fast slot).
UniformityLLTOpPredicateID LLTToId(LLT Ty) {
  if (Ty == LLT::scalar(16))
    return S16;
  if (Ty == LLT::scalar(32))
    return S32;
  if (Ty == LLT::scalar(64))
    return S64;
  if (Ty == LLT::fixed_vector(2, 16))
    return V2S16;
  if (Ty == LLT::fixed_vector(2, 32))
    return V2S32;
  if (Ty == LLT::fixed_vector(3, 32))
    return V3S32;
  if (Ty == LLT::fixed_vector(4, 32))
    return V4S32;
  return _;
}
227 
// Map an LLT to a size-only B* predicate ID used by the StandardB fast-rule
// table. Several distinct LLTs of equal size (scalar, vector, any pointer of
// that width) share one B* ID. Unlisted types fall through to '_'.
UniformityLLTOpPredicateID LLTToBId(LLT Ty) {
  if (Ty == LLT::scalar(32) || Ty == LLT::fixed_vector(2, 16) ||
      isAnyPtr(Ty, 32))
    return B32;
  if (Ty == LLT::scalar(64) || Ty == LLT::fixed_vector(2, 32) ||
      Ty == LLT::fixed_vector(4, 16) || isAnyPtr(Ty, 64))
    return B64;
  if (Ty == LLT::fixed_vector(3, 32))
    return B96;
  if (Ty == LLT::fixed_vector(4, 32) || isAnyPtr(Ty, 128))
    return B128;
  return _;
}
241 
// Find the RegBankLLTMapping to apply to MI. First tries the fast table
// (keyed only by operand 0's type and uniformity), then falls back to a
// linear scan of the slow rules. Aborts if nothing matches.
const RegBankLLTMapping &
SetOfRulesForOpcode::findMappingForMI(const MachineInstr &MI,
                                      const MachineRegisterInfo &MRI,
                                      const MachineUniformityInfo &MUI) const {
  // Search in "Fast Rules".
  // Note: if fast rules are enabled, RegBankLLTMapping must be added in each
  // slot that could "match fast Predicate". If not, InvalidMapping is
  // returned which results in failure, does not search "Slow Rules".
  if (FastTypes != NoFastRules) {
    // Fast rules key off the type and uniformity of operand 0 only.
    Register Reg = MI.getOperand(0).getReg();
    int Slot;
    if (FastTypes == StandardB)
      Slot = getFastPredicateSlot(LLTToBId(MRI.getType(Reg)));
    else
      Slot = getFastPredicateSlot(LLTToId(MRI.getType(Reg)));

    if (Slot != -1)
      return MUI.isUniform(Reg) ? Uni[Slot] : Div[Slot];
  }

  // Slow search for more complex rules.
  for (const RegBankLegalizeRule &Rule : Rules) {
    if (Rule.Predicate.match(MI, MUI, MRI))
      return Rule.OperandMapping;
  }

  LLVM_DEBUG(dbgs() << "MI: "; MI.dump(););
  llvm_unreachable("None of the rules defined for MI's opcode matched MI");
}
271 
addRule(RegBankLegalizeRule Rule)272 void SetOfRulesForOpcode::addRule(RegBankLegalizeRule Rule) {
273   Rules.push_back(Rule);
274 }
275 
addFastRuleDivergent(UniformityLLTOpPredicateID Ty,RegBankLLTMapping RuleApplyIDs)276 void SetOfRulesForOpcode::addFastRuleDivergent(UniformityLLTOpPredicateID Ty,
277                                                RegBankLLTMapping RuleApplyIDs) {
278   int Slot = getFastPredicateSlot(Ty);
279   assert(Slot != -1 && "Ty unsupported in this FastRulesTypes");
280   Div[Slot] = RuleApplyIDs;
281 }
282 
addFastRuleUniform(UniformityLLTOpPredicateID Ty,RegBankLLTMapping RuleApplyIDs)283 void SetOfRulesForOpcode::addFastRuleUniform(UniformityLLTOpPredicateID Ty,
284                                              RegBankLLTMapping RuleApplyIDs) {
285   int Slot = getFastPredicateSlot(Ty);
286   assert(Slot != -1 && "Ty unsupported in this FastRulesTypes");
287   Uni[Slot] = RuleApplyIDs;
288 }
289 
// Translate a predicate ID into an index into the Uni/Div fast tables for
// this rule set's FastTypes family, or -1 if Ty has no fast slot. Each
// family maps exactly four type IDs onto slots 0-3.
int SetOfRulesForOpcode::getFastPredicateSlot(
    UniformityLLTOpPredicateID Ty) const {
  switch (FastTypes) {
  case Standard: {
    // Slots: 0 = S32, 1 = S16, 2 = S64, 3 = V2S16.
    switch (Ty) {
    case S32:
      return 0;
    case S16:
      return 1;
    case S64:
      return 2;
    case V2S16:
      return 3;
    default:
      return -1;
    }
  }
  case StandardB: {
    // Size-only variant. Slots: 0 = B32, 1 = B64, 2 = B96, 3 = B128.
    switch (Ty) {
    case B32:
      return 0;
    case B64:
      return 1;
    case B96:
      return 2;
    case B128:
      return 3;
    default:
      return -1;
    }
  }
  case Vector: {
    // Slots: 0 = S32, 1 = V2S32, 2 = V3S32, 3 = V4S32.
    switch (Ty) {
    case S32:
      return 0;
    case V2S32:
      return 1;
    case V3S32:
      return 2;
    case V4S32:
      return 3;
    default:
      return -1;
    }
  }
  default:
    // NoFastRules (or any other family): nothing is fast.
    return -1;
  }
}
339 
// Begin declaring rules for a set of generic (G_*) opcodes. All opcodes in
// OpcList alias to one SetOfRulesForOpcode; the returned initializer chains
// .Any/.Uni/.Div rule declarations onto it.
RegBankLegalizeRules::RuleSetInitializer
RegBankLegalizeRules::addRulesForGOpcs(std::initializer_list<unsigned> OpcList,
                                       FastRulesTypes FastTypes) {
  return RuleSetInitializer(OpcList, GRulesAlias, GRules, FastTypes);
}
345 
// Begin declaring rules for a set of intrinsic IDs. Same chaining mechanism
// as addRulesForGOpcs, but keyed by intrinsic ID in IRulesAlias/IRules.
RegBankLegalizeRules::RuleSetInitializer
RegBankLegalizeRules::addRulesForIOpcs(std::initializer_list<unsigned> OpcList,
                                       FastRulesTypes FastTypes) {
  return RuleSetInitializer(OpcList, IRulesAlias, IRules, FastTypes);
}
351 
// Look up the rule set for MI's opcode. G_INTRINSIC* instructions are keyed
// by their intrinsic ID, all other generic instructions by opcode. Aborts if
// no rules were registered for the key.
const SetOfRulesForOpcode &
RegBankLegalizeRules::getRulesForOpc(MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();
  if (Opc == AMDGPU::G_INTRINSIC || Opc == AMDGPU::G_INTRINSIC_CONVERGENT ||
      Opc == AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS ||
      Opc == AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS) {
    // Intrinsics are keyed by intrinsic ID, not generic opcode.
    unsigned IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
    auto IRAIt = IRulesAlias.find(IntrID);
    if (IRAIt == IRulesAlias.end()) {
      LLVM_DEBUG(dbgs() << "MI: "; MI.dump(););
      llvm_unreachable("No rules defined for intrinsic opcode");
    }
    return IRules.at(IRAIt->second);
  }

  auto GRAIt = GRulesAlias.find(Opc);
  if (GRAIt == GRulesAlias.end()) {
    LLVM_DEBUG(dbgs() << "MI: "; MI.dump(););
    llvm_unreachable("No rules defined for generic opcode");
  }
  return GRules.at(GRAIt->second);
}
374 
375 // Syntactic sugar wrapper for predicate lambda that enables '&&', '||' and '!'.
376 class Predicate {
377 private:
378   struct Elt {
379     // Save formula composed of Pred, '&&', '||' and '!' as a jump table.
380     // Sink ! to Pred. For example !((A && !B) || C) -> (!A || B) && !C
381     // Sequences of && and || will be represented by jumps, for example:
382     // (A && B && ... X) or (A && B && ... X) || Y
383     //   A == true jump to B
384     //   A == false jump to end or Y, result is A(false) or Y
385     // (A || B || ... X) or (A || B || ... X) && Y
386     //   A == true jump to end or Y, result is A(true) or Y
387     //   A == false jump to B
388     // Notice that when negating expression, we simply flip Neg on each Pred
389     // and swap TJumpOffset and FJumpOffset (&& becomes ||, || becomes &&).
390     std::function<bool(const MachineInstr &)> Pred;
391     bool Neg; // Neg of Pred is calculated before jump
392     unsigned TJumpOffset;
393     unsigned FJumpOffset;
394   };
395 
396   SmallVector<Elt, 8> Expression;
397 
Predicate(SmallVectorImpl<Elt> && Expr)398   Predicate(SmallVectorImpl<Elt> &&Expr) { Expression.swap(Expr); };
399 
400 public:
Predicate(std::function<bool (const MachineInstr &)> Pred)401   Predicate(std::function<bool(const MachineInstr &)> Pred) {
402     Expression.push_back({Pred, false, 1, 1});
403   };
404 
operator ()(const MachineInstr & MI) const405   bool operator()(const MachineInstr &MI) const {
406     unsigned Idx = 0;
407     unsigned ResultIdx = Expression.size();
408     bool Result;
409     do {
410       Result = Expression[Idx].Pred(MI);
411       Result = Expression[Idx].Neg ? !Result : Result;
412       if (Result) {
413         Idx += Expression[Idx].TJumpOffset;
414       } else {
415         Idx += Expression[Idx].FJumpOffset;
416       }
417     } while ((Idx != ResultIdx));
418 
419     return Result;
420   };
421 
operator !() const422   Predicate operator!() const {
423     SmallVector<Elt, 8> NegExpression;
424     for (const Elt &ExprElt : Expression) {
425       NegExpression.push_back({ExprElt.Pred, !ExprElt.Neg, ExprElt.FJumpOffset,
426                                ExprElt.TJumpOffset});
427     }
428     return Predicate(std::move(NegExpression));
429   };
430 
operator &&(const Predicate & RHS) const431   Predicate operator&&(const Predicate &RHS) const {
432     SmallVector<Elt, 8> AndExpression = Expression;
433 
434     unsigned RHSSize = RHS.Expression.size();
435     unsigned ResultIdx = Expression.size();
436     for (unsigned i = 0; i < ResultIdx; ++i) {
437       // LHS results in false, whole expression results in false.
438       if (i + AndExpression[i].FJumpOffset == ResultIdx)
439         AndExpression[i].FJumpOffset += RHSSize;
440     }
441 
442     AndExpression.append(RHS.Expression);
443 
444     return Predicate(std::move(AndExpression));
445   }
446 
operator ||(const Predicate & RHS) const447   Predicate operator||(const Predicate &RHS) const {
448     SmallVector<Elt, 8> OrExpression = Expression;
449 
450     unsigned RHSSize = RHS.Expression.size();
451     unsigned ResultIdx = Expression.size();
452     for (unsigned i = 0; i < ResultIdx; ++i) {
453       // LHS results in true, whole expression results in true.
454       if (i + OrExpression[i].TJumpOffset == ResultIdx)
455         OrExpression[i].TJumpOffset += RHSSize;
456     }
457 
458     OrExpression.append(RHS.Expression);
459 
460     return Predicate(std::move(OrExpression));
461   }
462 };
463 
464 // Initialize rules
// Initialize rules. This constructor builds the full per-opcode rule tables:
// for each generic opcode (or intrinsic), the register-bank/LLT mapping to
// apply for each combination of operand types and uniformity. Some rules are
// gated on subtarget features queried from ST.
RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
                                           MachineRegisterInfo &_MRI)
    : ST(&_ST), MRI(&_MRI) {

  // Integer add/sub/mul: SALU for uniform 32-bit, VALU for divergent.
  addRulesForGOpcs({G_ADD, G_SUB}, Standard)
      .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
      .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}});

  addRulesForGOpcs({G_MUL}, Standard).Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}});

  // Bitwise logic. S1 booleans: SGPR-truncated when uniform, VCC when
  // divergent. 64-bit divergent case is split into two 32-bit ops.
  addRulesForGOpcs({G_XOR, G_OR, G_AND}, StandardB)
      .Any({{UniS1}, {{Sgpr32Trunc}, {Sgpr32AExt, Sgpr32AExt}}})
      .Any({{DivS1}, {{Vcc}, {Vcc, Vcc}}})
      .Any({{UniS16}, {{Sgpr16}, {Sgpr16, Sgpr16}}})
      .Any({{DivS16}, {{Vgpr16}, {Vgpr16, Vgpr16}}})
      .Uni(B32, {{SgprB32}, {SgprB32, SgprB32}})
      .Div(B32, {{VgprB32}, {VgprB32, VgprB32}})
      .Uni(B64, {{SgprB64}, {SgprB64, SgprB64}})
      .Div(B64, {{VgprB64}, {VgprB64, VgprB64}, SplitTo32});

  // Shifts. Uniform S16 is widened to 32 bits (extension of the value
  // operand differs per shift kind); uniform V2S16 is unpacked.
  addRulesForGOpcs({G_SHL}, Standard)
      .Uni(S16, {{Sgpr32Trunc}, {Sgpr32AExt, Sgpr32ZExt}})
      .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
      .Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, UnpackBitShift})
      .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
      .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
      .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr32}})
      .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
      .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}});

  addRulesForGOpcs({G_LSHR}, Standard)
      .Uni(S16, {{Sgpr32Trunc}, {Sgpr32ZExt, Sgpr32ZExt}})
      .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
      .Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, UnpackBitShift})
      .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
      .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
      .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr32}})
      .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
      .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}});

  addRulesForGOpcs({G_ASHR}, Standard)
      .Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt, Sgpr32ZExt}})
      .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
      .Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, UnpackBitShift})
      .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
      .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
      .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr32}})
      .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
      .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}});

  addRulesForGOpcs({G_FRAME_INDEX}).Any({{UniP5, _}, {{SgprP5}, {None}}});

  // Bitfield extracts: scalar form uses S_BFE lowering, divergent 64-bit
  // uses V_BFE.
  addRulesForGOpcs({G_UBFX, G_SBFX}, Standard)
      .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32, Sgpr32}, S_BFE})
      .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32, Vgpr32}})
      .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr32, Sgpr32}, S_BFE})
      .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32, Vgpr32}, V_BFE});

  // Note: we only write S1 rules for G_IMPLICIT_DEF, G_CONSTANT, G_FCONSTANT
  // and G_FREEZE here, rest is trivially regbankselected earlier
  addRulesForGOpcs({G_IMPLICIT_DEF}).Any({{UniS1}, {{Sgpr32Trunc}, {}}});
  addRulesForGOpcs({G_CONSTANT})
      .Any({{UniS1, _}, {{Sgpr32Trunc}, {None}, UniCstExt}});
  addRulesForGOpcs({G_FREEZE}).Any({{DivS1}, {{Vcc}, {Vcc}}});

  // Compares: operand 1 is the predicate (non-register), hence '_'.
  addRulesForGOpcs({G_ICMP})
      .Any({{UniS1, _, S32}, {{Sgpr32Trunc}, {None, Sgpr32, Sgpr32}}})
      .Any({{DivS1, _, S32}, {{Vcc}, {None, Vgpr32, Vgpr32}}});

  addRulesForGOpcs({G_FCMP})
      .Any({{UniS1, _, S32}, {{UniInVcc}, {None, Vgpr32, Vgpr32}}})
      .Any({{DivS1, _, S32}, {{Vcc}, {None, Vgpr32, Vgpr32}}});

  // Control flow.
  addRulesForGOpcs({G_BRCOND})
      .Any({{UniS1}, {{}, {Sgpr32AExtBoolInReg}}})
      .Any({{DivS1}, {{}, {Vcc}}});

  addRulesForGOpcs({G_BR}).Any({{_}, {{}, {None}}});

  // Select: condition bank depends on divergence of the result.
  addRulesForGOpcs({G_SELECT}, StandardB)
      .Any({{DivS16}, {{Vgpr16}, {Vcc, Vgpr16, Vgpr16}}})
      .Any({{UniS16}, {{Sgpr16}, {Sgpr32AExtBoolInReg, Sgpr16, Sgpr16}}})
      .Div(B32, {{VgprB32}, {Vcc, VgprB32, VgprB32}})
      .Uni(B32, {{SgprB32}, {Sgpr32AExtBoolInReg, SgprB32, SgprB32}})
      .Div(B64, {{VgprB64}, {Vcc, VgprB64, VgprB64}, SplitTo32Select})
      .Uni(B64, {{SgprB64}, {Sgpr32AExtBoolInReg, SgprB64, SgprB64}});

  // Extensions. Divergent extends of S1 lower to a select on VCC.
  addRulesForGOpcs({G_ANYEXT})
      .Any({{UniS16, S1}, {{None}, {None}}}) // should be combined away
      .Any({{UniS32, S1}, {{None}, {None}}}) // should be combined away
      .Any({{UniS64, S1}, {{None}, {None}}}) // should be combined away
      .Any({{DivS16, S1}, {{Vgpr16}, {Vcc}, VccExtToSel}})
      .Any({{DivS32, S1}, {{Vgpr32}, {Vcc}, VccExtToSel}})
      .Any({{DivS64, S1}, {{Vgpr64}, {Vcc}, VccExtToSel}})
      .Any({{UniS64, S32}, {{Sgpr64}, {Sgpr32}, Ext32To64}})
      .Any({{DivS64, S32}, {{Vgpr64}, {Vgpr32}, Ext32To64}})
      .Any({{UniS32, S16}, {{Sgpr32}, {Sgpr16}}})
      .Any({{DivS32, S16}, {{Vgpr32}, {Vgpr16}}});

  // In global-isel G_TRUNC in-reg is treated as no-op, inst selected into COPY.
  // It is up to user to deal with truncated bits.
  addRulesForGOpcs({G_TRUNC})
      .Any({{UniS1, UniS16}, {{None}, {None}}}) // should be combined away
      .Any({{UniS1, UniS32}, {{None}, {None}}}) // should be combined away
      .Any({{UniS1, UniS64}, {{None}, {None}}}) // should be combined away
      .Any({{UniS16, S32}, {{Sgpr16}, {Sgpr32}}})
      .Any({{DivS16, S32}, {{Vgpr16}, {Vgpr32}}})
      .Any({{UniS32, S64}, {{Sgpr32}, {Sgpr64}}})
      .Any({{DivS32, S64}, {{Vgpr32}, {Vgpr64}}})
      .Any({{UniV2S16, V2S32}, {{SgprV2S16}, {SgprV2S32}}})
      .Any({{DivV2S16, V2S32}, {{VgprV2S16}, {VgprV2S32}}})
      // This is non-trivial. VgprToVccCopy is done using compare instruction.
      .Any({{DivS1, DivS16}, {{Vcc}, {Vgpr16}, VgprToVccCopy}})
      .Any({{DivS1, DivS32}, {{Vcc}, {Vgpr32}, VgprToVccCopy}})
      .Any({{DivS1, DivS64}, {{Vcc}, {Vgpr64}, VgprToVccCopy}});

  addRulesForGOpcs({G_ZEXT})
      .Any({{UniS16, S1}, {{Sgpr32Trunc}, {Sgpr32AExtBoolInReg}, UniExtToSel}})
      .Any({{UniS32, S1}, {{Sgpr32}, {Sgpr32AExtBoolInReg}, UniExtToSel}})
      .Any({{UniS64, S1}, {{Sgpr64}, {Sgpr32AExtBoolInReg}, UniExtToSel}})
      .Any({{DivS16, S1}, {{Vgpr16}, {Vcc}, VccExtToSel}})
      .Any({{DivS32, S1}, {{Vgpr32}, {Vcc}, VccExtToSel}})
      .Any({{DivS64, S1}, {{Vgpr64}, {Vcc}, VccExtToSel}})
      .Any({{UniS64, S32}, {{Sgpr64}, {Sgpr32}, Ext32To64}})
      .Any({{DivS64, S32}, {{Vgpr64}, {Vgpr32}, Ext32To64}})
      // not extending S16 to S32 is questionable.
      .Any({{UniS64, S16}, {{Sgpr64}, {Sgpr32ZExt}, Ext32To64}})
      .Any({{DivS64, S16}, {{Vgpr64}, {Vgpr32ZExt}, Ext32To64}})
      .Any({{UniS32, S16}, {{Sgpr32}, {Sgpr16}}})
      .Any({{DivS32, S16}, {{Vgpr32}, {Vgpr16}}});

  addRulesForGOpcs({G_SEXT})
      .Any({{UniS16, S1}, {{Sgpr32Trunc}, {Sgpr32AExtBoolInReg}, UniExtToSel}})
      .Any({{UniS32, S1}, {{Sgpr32}, {Sgpr32AExtBoolInReg}, UniExtToSel}})
      .Any({{UniS64, S1}, {{Sgpr64}, {Sgpr32AExtBoolInReg}, UniExtToSel}})
      .Any({{DivS16, S1}, {{Vgpr16}, {Vcc}, VccExtToSel}})
      .Any({{DivS32, S1}, {{Vgpr32}, {Vcc}, VccExtToSel}})
      .Any({{DivS64, S1}, {{Vgpr64}, {Vcc}, VccExtToSel}})
      .Any({{UniS64, S32}, {{Sgpr64}, {Sgpr32}, Ext32To64}})
      .Any({{DivS64, S32}, {{Vgpr64}, {Vgpr32}, Ext32To64}})
      // not extending S16 to S32 is questionable.
      .Any({{UniS64, S16}, {{Sgpr64}, {Sgpr32SExt}, Ext32To64}})
      .Any({{DivS64, S16}, {{Vgpr64}, {Vgpr32SExt}, Ext32To64}})
      .Any({{UniS32, S16}, {{Sgpr32}, {Sgpr16}}})
      .Any({{DivS32, S16}, {{Vgpr32}, {Vgpr16}}});

  addRulesForGOpcs({G_SEXT_INREG})
      .Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}})
      .Any({{DivS32, S32}, {{Vgpr32}, {Vgpr32}}})
      .Any({{UniS64, S64}, {{Sgpr64}, {Sgpr64}}})
      .Any({{DivS64, S64}, {{Vgpr64}, {Vgpr64}, SplitTo32SExtInReg}});

  // Subtarget features that gate the load rules below.
  bool hasUnalignedLoads = ST->getGeneration() >= AMDGPUSubtarget::GFX12;
  bool hasSMRDSmall = ST->hasScalarSubwordLoads();

  // MMO predicates combined below via Predicate's &&/||/! operators.
  Predicate isAlign16([](const MachineInstr &MI) -> bool {
    return (*MI.memoperands_begin())->getAlign() >= Align(16);
  });

  Predicate isAlign4([](const MachineInstr &MI) -> bool {
    return (*MI.memoperands_begin())->getAlign() >= Align(4);
  });

  Predicate isAtomicMMO([](const MachineInstr &MI) -> bool {
    return (*MI.memoperands_begin())->isAtomic();
  });

  Predicate isUniMMO([](const MachineInstr &MI) -> bool {
    return AMDGPU::isUniformMMO(*MI.memoperands_begin());
  });

  Predicate isConst([](const MachineInstr &MI) -> bool {
    // Address space in MMO may be different than address space on pointer.
    const MachineMemOperand *MMO = *MI.memoperands_begin();
    const unsigned AS = MMO->getAddrSpace();
    return AS == AMDGPUAS::CONSTANT_ADDRESS ||
           AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
  });

  Predicate isVolatileMMO([](const MachineInstr &MI) -> bool {
    return (*MI.memoperands_begin())->isVolatile();
  });

  Predicate isInvMMO([](const MachineInstr &MI) -> bool {
    return (*MI.memoperands_begin())->isInvariant();
  });

  Predicate isNoClobberMMO([](const MachineInstr &MI) -> bool {
    return (*MI.memoperands_begin())->getFlags() & MONoClobber;
  });

  Predicate isNaturalAlignedSmall([](const MachineInstr &MI) -> bool {
    const MachineMemOperand *MMO = *MI.memoperands_begin();
    const unsigned MemSize = 8 * MMO->getSize().getValue();
    return (MemSize == 16 && MMO->getAlign() >= Align(2)) ||
           (MemSize == 8 && MMO->getAlign() >= Align(1));
  });

  // "Uniform load": non-atomic, uniform MMO, and either constant address
  // space or (non-volatile and invariant-or-no-clobber).
  auto isUL = !isAtomicMMO && isUniMMO && (isConst || !isVolatileMMO) &&
              (isConst || isInvMMO || isNoClobberMMO);

  // clang-format off
  addRulesForGOpcs({G_LOAD})
      .Any({{DivB32, DivP0}, {{VgprB32}, {VgprP0}}})

      .Any({{DivB32, DivP1}, {{VgprB32}, {VgprP1}}})
      .Any({{{UniB256, UniP1}, isAlign4 && isUL}, {{SgprB256}, {SgprP1}}})
      .Any({{{UniB512, UniP1}, isAlign4 && isUL}, {{SgprB512}, {SgprP1}}})
      .Any({{{UniB32, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB32}, {SgprP1}}})
      .Any({{{UniB256, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB256}, {VgprP1}, SplitLoad}})
      .Any({{{UniB512, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB512}, {VgprP1}, SplitLoad}})

      .Any({{DivB32, UniP3}, {{VgprB32}, {VgprP3}}})
      .Any({{{UniB32, UniP3}, isAlign4 && isUL}, {{SgprB32}, {SgprP3}}})
      .Any({{{UniB32, UniP3}, !isAlign4 || !isUL}, {{UniInVgprB32}, {VgprP3}}})

      .Any({{{DivB256, DivP4}}, {{VgprB256}, {VgprP4}, SplitLoad}})
      .Any({{{UniB32, UniP4}, isNaturalAlignedSmall && isUL}, {{SgprB32}, {SgprP4}}}, hasSMRDSmall) // i8 and i16 load
      .Any({{{UniB32, UniP4}, isAlign4 && isUL}, {{SgprB32}, {SgprP4}}})
      .Any({{{UniB96, UniP4}, isAlign16 && isUL}, {{SgprB96}, {SgprP4}, WidenLoad}}, !hasUnalignedLoads)
      .Any({{{UniB96, UniP4}, isAlign4 && !isAlign16 && isUL}, {{SgprB96}, {SgprP4}, SplitLoad}}, !hasUnalignedLoads)
      .Any({{{UniB96, UniP4}, isAlign4 && isUL}, {{SgprB96}, {SgprP4}}}, hasUnalignedLoads)
      .Any({{{UniB256, UniP4}, isAlign4 && isUL}, {{SgprB256}, {SgprP4}}})
      .Any({{{UniB512, UniP4}, isAlign4 && isUL}, {{SgprB512}, {SgprP4}}})
      .Any({{{UniB32, UniP4}, !isNaturalAlignedSmall || !isUL}, {{UniInVgprB32}, {VgprP4}}}, hasSMRDSmall) // i8 and i16 load
      .Any({{{UniB32, UniP4}, !isAlign4 || !isUL}, {{UniInVgprB32}, {VgprP4}}})
      .Any({{{UniB256, UniP4}, !isAlign4 || !isUL}, {{UniInVgprB256}, {VgprP4}, SplitLoad}})
      .Any({{{UniB512, UniP4}, !isAlign4 || !isUL}, {{UniInVgprB512}, {VgprP4}, SplitLoad}})

      .Any({{DivB32, P5}, {{VgprB32}, {VgprP5}}});

  addRulesForGOpcs({G_ZEXTLOAD}) // i8 and i16 zero-extending loads
      .Any({{{UniB32, UniP3}, !isAlign4 || !isUL}, {{UniInVgprB32}, {VgprP3}}})
      .Any({{{UniB32, UniP4}, !isAlign4 || !isUL}, {{UniInVgprB32}, {VgprP4}}});
  // clang-format on

  addRulesForGOpcs({G_AMDGPU_BUFFER_LOAD}, Vector)
      .Div(S32, {{Vgpr32}, {SgprV4S32, Vgpr32, Vgpr32, Sgpr32}})
      .Uni(S32, {{UniInVgprS32}, {SgprV4S32, Vgpr32, Vgpr32, Sgpr32}})
      .Div(V4S32, {{VgprV4S32}, {SgprV4S32, Vgpr32, Vgpr32, Sgpr32}})
      .Uni(V4S32, {{UniInVgprV4S32}, {SgprV4S32, Vgpr32, Vgpr32, Sgpr32}});

  addRulesForGOpcs({G_STORE})
      .Any({{S32, P0}, {{}, {Vgpr32, VgprP0}}})
      .Any({{S32, P1}, {{}, {Vgpr32, VgprP1}}})
      .Any({{S64, P1}, {{}, {Vgpr64, VgprP1}}})
      .Any({{V4S32, P1}, {{}, {VgprV4S32, VgprP1}}});

  addRulesForGOpcs({G_AMDGPU_BUFFER_STORE})
      .Any({{S32}, {{}, {Vgpr32, SgprV4S32, Vgpr32, Vgpr32, Sgpr32}}});

  // Pointer arithmetic and pointer/integer conversions.
  addRulesForGOpcs({G_PTR_ADD})
      .Any({{UniP1}, {{SgprP1}, {SgprP1, Sgpr64}}})
      .Any({{DivP1}, {{VgprP1}, {VgprP1, Vgpr64}}})
      .Any({{DivP0}, {{VgprP0}, {VgprP0, Vgpr64}}});

  addRulesForGOpcs({G_INTTOPTR})
      .Any({{UniPtr32}, {{SgprPtr32}, {Sgpr32}}})
      .Any({{DivPtr32}, {{VgprPtr32}, {Vgpr32}}})
      .Any({{UniPtr64}, {{SgprPtr64}, {Sgpr64}}})
      .Any({{DivPtr64}, {{VgprPtr64}, {Vgpr64}}})
      .Any({{UniPtr128}, {{SgprPtr128}, {Sgpr128}}})
      .Any({{DivPtr128}, {{VgprPtr128}, {Vgpr128}}});

  addRulesForGOpcs({G_PTRTOINT})
      .Any({{UniS32}, {{Sgpr32}, {SgprPtr32}}})
      .Any({{DivS32}, {{Vgpr32}, {VgprPtr32}}})
      .Any({{UniS64}, {{Sgpr64}, {SgprPtr64}}})
      .Any({{DivS64}, {{Vgpr64}, {VgprPtr64}}})
      .Any({{UniS128}, {{Sgpr128}, {SgprPtr128}}})
      .Any({{DivS128}, {{Vgpr128}, {VgprPtr128}}});

  addRulesForGOpcs({G_ABS}, Standard).Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt}});

  // FP ops: SALU when the subtarget has scalar float instructions, otherwise
  // uniform results are computed on VALU and read back (UniInVgpr*).
  bool hasSALUFloat = ST->hasSALUFloatInsts();

  addRulesForGOpcs({G_FADD}, Standard)
      .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}, hasSALUFloat)
      .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}, !hasSALUFloat)
      .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}});

  addRulesForGOpcs({G_FPTOUI})
      .Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}}, hasSALUFloat)
      .Any({{UniS32, S32}, {{UniInVgprS32}, {Vgpr32}}}, !hasSALUFloat);

  addRulesForGOpcs({G_UITOFP})
      .Any({{DivS32, S32}, {{Vgpr32}, {Vgpr32}}})
      .Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}}, hasSALUFloat)
      .Any({{UniS32, S32}, {{UniInVgprS32}, {Vgpr32}}}, !hasSALUFloat);

  using namespace Intrinsic;

  addRulesForIOpcs({amdgcn_s_getpc}).Any({{UniS64, _}, {{Sgpr64}, {None}}});

  // This is "intrinsic lane mask" it was set to i32/i64 in llvm-ir.
  addRulesForIOpcs({amdgcn_end_cf}).Any({{_, S32}, {{}, {None, Sgpr32}}});

  addRulesForIOpcs({amdgcn_if_break}, Standard)
      .Uni(S32, {{Sgpr32}, {IntrId, Vcc, Sgpr32}});

  addRulesForIOpcs({amdgcn_mbcnt_lo, amdgcn_mbcnt_hi}, Standard)
      .Div(S32, {{}, {Vgpr32, None, Vgpr32, Vgpr32}});

  addRulesForIOpcs({amdgcn_readfirstlane})
      .Any({{UniS32, _, DivS32}, {{}, {Sgpr32, None, Vgpr32}}})
      // this should not exist in the first place, it is from call lowering
      // readfirstlaning just in case register is not in sgpr.
      .Any({{UniS32, _, UniS32}, {{}, {Sgpr32, None, Vgpr32}}});

} // end initialize rules
775