//===-- SIInstructions.td - SI Instruction Definitions --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// This file was originally auto-generated from a GPU register header file and
// all the instruction definitions were originally commented out.  Instructions
// that are not yet supported remain commented out.
//===----------------------------------------------------------------------===//

class GCNPat<dag pattern, dag result> : Pat<pattern, result>, GCNPredicateControl {

}

include "SOPInstructions.td"
include "VOPInstructions.td"
include "SMInstructions.td"
include "FLATInstructions.td"
include "BUFInstructions.td"

//===----------------------------------------------------------------------===//
// EXP Instructions
//===----------------------------------------------------------------------===//

defm EXP : EXP_m<0>;
defm EXP_DONE : EXP_m<1>;

class ExpPattern<ValueType vt, Instruction Inst, int done_val> : GCNPat<
  (int_amdgcn_exp timm:$tgt, timm:$en,
                  (vt ExpSrc0:$src0), (vt ExpSrc1:$src1),
                  (vt ExpSrc2:$src2), (vt ExpSrc3:$src3),
                  done_val, timm:$vm),
  (Inst timm:$tgt, ExpSrc0:$src0, ExpSrc1:$src1,
        ExpSrc2:$src2, ExpSrc3:$src3, timm:$vm, 0, timm:$en)
>;

class ExpComprPattern<ValueType vt, Instruction Inst, int done_val> : GCNPat<
  (int_amdgcn_exp_compr timm:$tgt, timm:$en,
                        (vt ExpSrc0:$src0), (vt ExpSrc1:$src1),
                        done_val, timm:$vm),
  (Inst timm:$tgt, ExpSrc0:$src0, ExpSrc1:$src1,
        (IMPLICIT_DEF), (IMPLICIT_DEF), timm:$vm, 1, timm:$en)
>;

// FIXME: The generated DAG matcher seems to have strange behavior
// with a 1-bit literal to match, so use a -1 for checking a true
// 1-bit value.
def : ExpPattern<i32, EXP, 0>;
def : ExpPattern<i32, EXP_DONE, -1>;
def : ExpPattern<f32, EXP, 0>;
def : ExpPattern<f32, EXP_DONE, -1>;

def : ExpComprPattern<v2i16, EXP, 0>;
def : ExpComprPattern<v2i16, EXP_DONE, -1>;
def : ExpComprPattern<v2f16, EXP, 0>;
def : ExpComprPattern<v2f16, EXP_DONE, -1>;

//===----------------------------------------------------------------------===//
// VINTRP Instructions
//===----------------------------------------------------------------------===//

// Used to inject printing of "_e32" suffix for VI (there are "_e64" variants for VI)
def VINTRPDst : VINTRPDstOperand <VGPR_32>;

let Uses = [MODE, M0, EXEC] in {

// FIXME: Specify SchedRW for VINTRP instructions.

multiclass V_INTERP_P1_F32_m : VINTRP_m <
  0x00000000,
  (outs VINTRPDst:$vdst),
  (ins VGPR_32:$vsrc, Attr:$attr, AttrChan:$attrchan),
  "v_interp_p1_f32$vdst, $vsrc, $attr$attrchan",
  [(set f32:$vdst, (int_amdgcn_interp_p1 f32:$vsrc,
                   (i32 timm:$attrchan), (i32 timm:$attr), M0))]
>;

let OtherPredicates = [has32BankLDS] in {

defm V_INTERP_P1_F32 : V_INTERP_P1_F32_m;

} // End OtherPredicates = [has32BankLDS]

let OtherPredicates = [has16BankLDS], Constraints = "@earlyclobber $vdst", isAsmParserOnly=1 in {

defm V_INTERP_P1_F32_16bank : V_INTERP_P1_F32_m;

} // End OtherPredicates = [has16BankLDS], Constraints = "@earlyclobber $vdst", isAsmParserOnly=1

let DisableEncoding = "$src0", Constraints = "$src0 = $vdst" in {

defm V_INTERP_P2_F32 : VINTRP_m <
  0x00000001,
  (outs VINTRPDst:$vdst),
  (ins VGPR_32:$src0, VGPR_32:$vsrc, Attr:$attr, AttrChan:$attrchan),
  "v_interp_p2_f32$vdst, $vsrc, $attr$attrchan",
  [(set f32:$vdst, (int_amdgcn_interp_p2 f32:$src0, f32:$vsrc,
                   (i32 timm:$attrchan), (i32 timm:$attr), M0))]>;

} // End DisableEncoding = "$src0", Constraints = "$src0 = $vdst"

defm V_INTERP_MOV_F32 : VINTRP_m <
  0x00000002,
  (outs VINTRPDst:$vdst),
  (ins InterpSlot:$vsrc, Attr:$attr, AttrChan:$attrchan),
  "v_interp_mov_f32$vdst, $vsrc, $attr$attrchan",
  [(set f32:$vdst, (int_amdgcn_interp_mov (i32 timm:$vsrc),
                   (i32 timm:$attrchan), (i32 timm:$attr), M0))]>;

} // End Uses = [MODE, M0, EXEC]

//===----------------------------------------------------------------------===//
// Pseudo Instructions
//===----------------------------------------------------------------------===//
def ATOMIC_FENCE : SPseudoInstSI<
  (outs), (ins i32imm:$ordering, i32imm:$scope),
  [(atomic_fence (i32 timm:$ordering), (i32 timm:$scope))],
  "ATOMIC_FENCE $ordering, $scope"> {
  let hasSideEffects = 1;
  let maybeAtomic = 1;
}

def VOP_I64_I64_DPP : VOPProfile <[i64, i64, untyped, untyped]> {
  let HasExt = 1;
  let HasExtDPP = 1;
}

let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {

// For use in patterns
def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$vdst),
  (ins VSrc_b64:$src0, VSrc_b64:$src1, SSrc_b64:$src2), "", []> {
  let isPseudo = 1;
  let isCodeGenOnly = 1;
  let usesCustomInserter = 1;
}

// 64-bit vector move instruction. This is mainly used by the
// SIFoldOperands pass to enable folding of inline immediates.
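// For illustration (a sketch, not an exact listing): a def such as
//   %0:vreg_64 = V_MOV_B64_PSEUDO 0
// keeps the constant visible as one 64-bit immediate for folding, and is
// only expanded after register allocation into two v_mov_b32, one per
// 32-bit half.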
def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst),
                                      (ins VSrc_b64:$src0)>;

// 64-bit vector move with dpp. Expanded post-RA.
def V_MOV_B64_DPP_PSEUDO : VOP_DPP_Pseudo <"v_mov_b64_dpp", VOP_I64_I64_DPP> {
  let Size = 16; // Requires two 8-byte v_mov_b32_dpp to complete.
}

// Pseudoinstruction for @llvm.amdgcn.wqm. It is turned into a copy after the
// WQM pass processes it.
def WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;

// Pseudoinstruction for @llvm.amdgcn.softwqm. Like @llvm.amdgcn.wqm it is
// turned into a copy by WQM pass, but does not seed WQM requirements.
def SOFT_WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;

// Pseudoinstruction for @llvm.amdgcn.wwm. It is turned into a copy post-RA, so
// that the @earlyclobber is respected. The @earlyclobber is to make sure that
// the instruction that defines $src0 (which is run in WWM) doesn't
// accidentally clobber inactive channels of $vdst.
let Constraints = "@earlyclobber $vdst" in {
def WWM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
}

} // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC]

def ENTER_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins i64imm:$src0)> {
  let Uses = [EXEC];
  let Defs = [EXEC, SCC];
  let hasSideEffects = 0;
  let mayLoad = 0;
  let mayStore = 0;
}

def EXIT_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins SReg_1:$src0)> {
  let hasSideEffects = 0;
  let mayLoad = 0;
  let mayStore = 0;
}

// Invert the exec mask and overwrite the inactive lanes of dst with inactive,
// restoring it after we're done.
def V_SET_INACTIVE_B32 : VPseudoInstSI <(outs VGPR_32:$vdst),
  (ins VGPR_32: $src, VSrc_b32:$inactive),
  [(set i32:$vdst, (int_amdgcn_set_inactive i32:$src, i32:$inactive))]> {
  let Constraints = "$src = $vdst";
}

def V_SET_INACTIVE_B64 : VPseudoInstSI <(outs VReg_64:$vdst),
  (ins VReg_64: $src, VSrc_b64:$inactive),
  [(set i64:$vdst, (int_amdgcn_set_inactive i64:$src, i64:$inactive))]> {
  let Constraints = "$src = $vdst";
}

let usesCustomInserter = 1, Defs = [VCC, EXEC] in {
def V_ADD_U64_PSEUDO : VPseudoInstSI <
  (outs VReg_64:$vdst), (ins VSrc_b64:$src0, VSrc_b64:$src1),
  [(set VReg_64:$vdst, (getDivergentFrag<add>.ret i64:$src0, i64:$src1))]
>;

def V_SUB_U64_PSEUDO : VPseudoInstSI <
  (outs VReg_64:$vdst), (ins VSrc_b64:$src0, VSrc_b64:$src1),
  [(set VReg_64:$vdst, (getDivergentFrag<sub>.ret i64:$src0, i64:$src1))]
>;
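
// Sketch of the expected expansion (done in the custom inserter): the 64-bit
// add is split over the 32-bit halves with the carry held in VCC, roughly
//   v_add_co_u32  dst.lo, vcc, src0.lo, src1.lo
//   v_addc_co_u32 dst.hi, vcc, src0.hi, src1.hi, vcc
// and V_SUB_U64_PSEUDO takes the same shape with borrow instead of carry.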
} // End usesCustomInserter = 1, Defs = [VCC, EXEC]

let usesCustomInserter = 1, Defs = [SCC] in {
def S_ADD_U64_PSEUDO : SPseudoInstSI <
  (outs SReg_64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1),
  [(set SReg_64:$sdst, (UniformBinFrag<add> i64:$src0, i64:$src1))]
>;

def S_SUB_U64_PSEUDO : SPseudoInstSI <
  (outs SReg_64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1),
  [(set SReg_64:$sdst, (UniformBinFrag<sub> i64:$src0, i64:$src1))]
>;
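
// Similarly, a sketch of the scalar expansion: the custom inserter splits
// these into an add / add-with-carry pair over sub0/sub1, roughly
//   s_add_u32  dst.lo, src0.lo, src1.lo
//   s_addc_u32 dst.hi, src0.hi, src1.hi  // consumes the SCC carry out
// which is why this block declares Defs = [SCC].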

def S_ADD_U64_CO_PSEUDO : SPseudoInstSI <
  (outs SReg_64:$vdst, VOPDstS64orS32:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1)
>;

def S_SUB_U64_CO_PSEUDO : SPseudoInstSI <
  (outs SReg_64:$vdst, VOPDstS64orS32:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1)
>;

def S_ADD_CO_PSEUDO : SPseudoInstSI <
  (outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1, SSrc_i1:$scc_in)
>;

def S_SUB_CO_PSEUDO : SPseudoInstSI <
  (outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1, SSrc_i1:$scc_in)
>;

def S_UADDO_PSEUDO : SPseudoInstSI <
  (outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1)
>;

def S_USUBO_PSEUDO : SPseudoInstSI <
  (outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1)
>;

} // End usesCustomInserter = 1, Defs = [SCC]

let usesCustomInserter = 1 in {
def GET_GROUPSTATICSIZE : SPseudoInstSI <(outs SReg_32:$sdst), (ins),
  [(set SReg_32:$sdst, (int_amdgcn_groupstaticsize))]>;
} // End usesCustomInserter = 1

// Wrap an instruction by duplicating it, except for setting isTerminator.
class WrapTerminatorInst<SOP_Pseudo base_inst> : SPseudoInstSI<
      base_inst.OutOperandList,
      base_inst.InOperandList> {
  let Uses = base_inst.Uses;
  let Defs = base_inst.Defs;
  let isTerminator = 1;
  let isAsCheapAsAMove = base_inst.isAsCheapAsAMove;
  let hasSideEffects = base_inst.hasSideEffects;
  let UseNamedOperandTable = base_inst.UseNamedOperandTable;
  let CodeSize = base_inst.CodeSize;
  let SchedRW = base_inst.SchedRW;
}

let WaveSizePredicate = isWave64 in {
def S_MOV_B64_term : WrapTerminatorInst<S_MOV_B64>;
def S_XOR_B64_term : WrapTerminatorInst<S_XOR_B64>;
def S_ANDN2_B64_term : WrapTerminatorInst<S_ANDN2_B64>;
}

let WaveSizePredicate = isWave32 in {
def S_MOV_B32_term : WrapTerminatorInst<S_MOV_B32>;
def S_XOR_B32_term : WrapTerminatorInst<S_XOR_B32>;
def S_OR_B32_term : WrapTerminatorInst<S_OR_B32>;
def S_ANDN2_B32_term : WrapTerminatorInst<S_ANDN2_B32>;
}


def WAVE_BARRIER : SPseudoInstSI<(outs), (ins),
  [(int_amdgcn_wave_barrier)]> {
  let SchedRW = [];
  let hasNoSchedulingInfo = 1;
  let hasSideEffects = 1;
  let mayLoad = 0;
  let mayStore = 0;
  let isConvergent = 1;
  let FixedSize = 1;
  let Size = 0;
}

// SI pseudo instructions. These are used by the CFG structurizer pass
// and should be lowered to ISA instructions prior to codegen.

// Dummy terminator instruction to use after control flow instructions
// replaced with exec mask operations.
def SI_MASK_BRANCH : VPseudoInstSI <
  (outs), (ins brtarget:$target)> {
  let isBranch = 0;
  let isTerminator = 1;
  let isBarrier = 0;
  let SchedRW = [];
  let hasNoSchedulingInfo = 1;
  let FixedSize = 1;
  let Size = 0;
}

let isTerminator = 1 in {

let OtherPredicates = [EnableLateCFGStructurize] in {
def SI_NON_UNIFORM_BRCOND_PSEUDO : CFPseudoInstSI <
  (outs),
  (ins SReg_1:$vcc, brtarget:$target),
  [(brcond i1:$vcc, bb:$target)]> {
    let Size = 12;
}
}

def SI_IF: CFPseudoInstSI <
  (outs SReg_1:$dst), (ins SReg_1:$vcc, brtarget:$target),
  [(set i1:$dst, (AMDGPUif i1:$vcc, bb:$target))], 1, 1> {
  let Constraints = "";
  let Size = 12;
  let hasSideEffects = 1;
}

def SI_ELSE : CFPseudoInstSI <
  (outs SReg_1:$dst),
  (ins SReg_1:$src, brtarget:$target, i1imm:$execfix), [], 1, 1> {
  let Size = 12;
  let hasSideEffects = 1;
}

def SI_LOOP : CFPseudoInstSI <
  (outs), (ins SReg_1:$saved, brtarget:$target),
  [(AMDGPUloop i1:$saved, bb:$target)], 1, 1> {
  let Size = 8;
  let isBranch = 1;
  let hasSideEffects = 1;
}

} // End isTerminator = 1

def SI_END_CF : CFPseudoInstSI <
  (outs), (ins SReg_1:$saved), [], 1, 1> {
  let Size = 4;
  let isAsCheapAsAMove = 1;
  let isReMaterializable = 1;
  let hasSideEffects = 1;
  let mayLoad = 1; // FIXME: Should not need memory flags
  let mayStore = 1;
}

def SI_IF_BREAK : CFPseudoInstSI <
  (outs SReg_1:$dst), (ins SReg_1:$vcc, SReg_1:$src), []> {
  let Size = 4;
  let isAsCheapAsAMove = 1;
  let isReMaterializable = 1;
}

let Uses = [EXEC] in {

multiclass PseudoInstKill <dag ins> {
  // Even though this pseudo can usually be expanded without an SCC def, we
  // conservatively assume that it has an SCC def, both because it is sometimes
  // required in degenerate cases (when V_CMPX cannot be used due to constant
  // bus limitations) and because it allows us to avoid having to track SCC
  // liveness across basic blocks.
  let Defs = [EXEC,VCC,SCC] in
  def _PSEUDO : PseudoInstSI <(outs), ins> {
    let isConvergent = 1;
    let usesCustomInserter = 1;
  }

  let Defs = [EXEC,VCC,SCC] in
  def _TERMINATOR : SPseudoInstSI <(outs), ins> {
    let isTerminator = 1;
  }
}

defm SI_KILL_I1 : PseudoInstKill <(ins SCSrc_i1:$src, i1imm:$killvalue)>;
defm SI_KILL_F32_COND_IMM : PseudoInstKill <(ins VSrc_b32:$src0, i32imm:$src1, i32imm:$cond)>;

let Defs = [EXEC] in
def SI_KILL_CLEANUP : SPseudoInstSI <(outs), (ins)>;

let Defs = [EXEC,VCC] in
def SI_ILLEGAL_COPY : SPseudoInstSI <
  (outs unknown:$dst), (ins unknown:$src),
  [], " ; illegal copy $src to $dst">;

} // End Uses = [EXEC]

// Branch on undef scc. Used to avoid intermediate copy from
// IMPLICIT_DEF to SCC.
def SI_BR_UNDEF : SPseudoInstSI <(outs), (ins sopp_brtarget:$simm16)> {
  let isTerminator = 1;
  let usesCustomInserter = 1;
  let isBranch = 1;
}

def SI_PS_LIVE : PseudoInstSI <
  (outs SReg_1:$dst), (ins),
  [(set i1:$dst, (int_amdgcn_ps_live))]> {
  let SALU = 1;
}

def SI_MASKED_UNREACHABLE : SPseudoInstSI <(outs), (ins),
  [(int_amdgcn_unreachable)],
  "; divergent unreachable"> {
  let Size = 0;
  let hasNoSchedulingInfo = 1;
  let FixedSize = 1;
}

// Used as an isel pseudo to directly emit initialization with an
// s_mov_b32 rather than a copy of another initialized
// register. MachineCSE skips copies, and we don't want to have to
// fold operands before it runs.
def SI_INIT_M0 : SPseudoInstSI <(outs), (ins SSrc_b32:$src)> {
  let Defs = [M0];
  let usesCustomInserter = 1;
  let isAsCheapAsAMove = 1;
  let isReMaterializable = 1;
}

def SI_INIT_EXEC : SPseudoInstSI <
  (outs), (ins i64imm:$src),
  [(int_amdgcn_init_exec (i64 timm:$src))]> {
  let Defs = [EXEC];
  let usesCustomInserter = 1;
  let isAsCheapAsAMove = 1;
  let WaveSizePredicate = isWave64;
}

// FIXME: Intrinsic should be mangled for wave size.
def SI_INIT_EXEC_LO : SPseudoInstSI <
  (outs), (ins i32imm:$src), []> {
  let Defs = [EXEC_LO];
  let usesCustomInserter = 1;
  let isAsCheapAsAMove = 1;
  let WaveSizePredicate = isWave32;
}

// FIXME: Wave32 version
def SI_INIT_EXEC_FROM_INPUT : SPseudoInstSI <
  (outs), (ins SSrc_b32:$input, i32imm:$shift),
  [(int_amdgcn_init_exec_from_input i32:$input, (i32 timm:$shift))]> {
  let Defs = [EXEC];
  let usesCustomInserter = 1;
}

def : GCNPat <
  (int_amdgcn_init_exec timm:$src),
  (SI_INIT_EXEC_LO (as_i32timm timm:$src))> {
  let WaveSizePredicate = isWave32;
}

// Return for returning shaders to a shader variant epilog.
def SI_RETURN_TO_EPILOG : SPseudoInstSI <
  (outs), (ins variable_ops), [(AMDGPUreturn_to_epilog)]> {
  let isTerminator = 1;
  let isBarrier = 1;
  let isReturn = 1;
  let hasNoSchedulingInfo = 1;
  let DisableWQM = 1;
  let FixedSize = 1;
}

// Return for returning function calls.
def SI_RETURN : SPseudoInstSI <
  (outs), (ins), [],
  "; return"> {
  let isTerminator = 1;
  let isBarrier = 1;
  let isReturn = 1;
  let SchedRW = [WriteBranch];
}

// Return for returning function calls without output register.
//
// This version is only needed so we can fill in the output register
// in the custom inserter.
def SI_CALL_ISEL : SPseudoInstSI <
  (outs), (ins SSrc_b64:$src0, unknown:$callee),
  [(AMDGPUcall i64:$src0, tglobaladdr:$callee)]> {
  let Size = 4;
  let isCall = 1;
  let SchedRW = [WriteBranch];
  let usesCustomInserter = 1;
  // TODO: Should really base this on the call target
  let isConvergent = 1;
}

def : GCNPat<
  (AMDGPUcall i64:$src0, (i64 0)),
  (SI_CALL_ISEL $src0, (i64 0))
>;

// Wrapper around s_swappc_b64 with extra $callee parameter to track
// the called function after regalloc.
def SI_CALL : SPseudoInstSI <
  (outs SReg_64:$dst), (ins SSrc_b64:$src0, unknown:$callee)> {
  let Size = 4;
  let isCall = 1;
  let UseNamedOperandTable = 1;
  let SchedRW = [WriteBranch];
  // TODO: Should really base this on the call target
  let isConvergent = 1;
}

// Tail call handling pseudo
def SI_TCRETURN : SPseudoInstSI <(outs),
  (ins SSrc_b64:$src0, unknown:$callee, i32imm:$fpdiff),
  [(AMDGPUtc_return i64:$src0, tglobaladdr:$callee, i32:$fpdiff)]> {
  let Size = 4;
  let isCall = 1;
  let isTerminator = 1;
  let isReturn = 1;
  let isBarrier = 1;
  let UseNamedOperandTable = 1;
  let SchedRW = [WriteBranch];
  // TODO: Should really base this on the call target
  let isConvergent = 1;
}


def ADJCALLSTACKUP : SPseudoInstSI<
  (outs), (ins i32imm:$amt0, i32imm:$amt1),
  [(callseq_start timm:$amt0, timm:$amt1)],
  "; adjcallstackup $amt0 $amt1"> {
  let Size = 8; // Worst case. (s_add_u32 + constant)
  let FixedSize = 1;
  let hasSideEffects = 1;
  let usesCustomInserter = 1;
  let SchedRW = [WriteSALU];
  let Defs = [SCC];
}

def ADJCALLSTACKDOWN : SPseudoInstSI<
  (outs), (ins i32imm:$amt1, i32imm:$amt2),
  [(callseq_end timm:$amt1, timm:$amt2)],
  "; adjcallstackdown $amt1"> {
  let Size = 8; // Worst case. (s_add_u32 + constant)
  let hasSideEffects = 1;
  let usesCustomInserter = 1;
  let SchedRW = [WriteSALU];
  let Defs = [SCC];
}

let Defs = [M0, EXEC, SCC],
  UseNamedOperandTable = 1 in {

// SI_INDIRECT_SRC/DST are only used by the legacy SelectionDAG indirect
// addressing implementation.
class SI_INDIRECT_SRC<RegisterClass rc> : VPseudoInstSI <
  (outs VGPR_32:$vdst),
  (ins rc:$src, VS_32:$idx, i32imm:$offset)> {
  let usesCustomInserter = 1;
}

class SI_INDIRECT_DST<RegisterClass rc> : VPseudoInstSI <
  (outs rc:$vdst),
  (ins rc:$src, VS_32:$idx, i32imm:$offset, VGPR_32:$val)> {
  let Constraints = "$src = $vdst";
  let usesCustomInserter = 1;
}

def SI_INDIRECT_SRC_V1 : SI_INDIRECT_SRC<VGPR_32>;
def SI_INDIRECT_SRC_V2 : SI_INDIRECT_SRC<VReg_64>;
def SI_INDIRECT_SRC_V4 : SI_INDIRECT_SRC<VReg_128>;
def SI_INDIRECT_SRC_V8 : SI_INDIRECT_SRC<VReg_256>;
def SI_INDIRECT_SRC_V16 : SI_INDIRECT_SRC<VReg_512>;
def SI_INDIRECT_SRC_V32 : SI_INDIRECT_SRC<VReg_1024>;

def SI_INDIRECT_DST_V1 : SI_INDIRECT_DST<VGPR_32>;
def SI_INDIRECT_DST_V2 : SI_INDIRECT_DST<VReg_64>;
def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST<VReg_128>;
def SI_INDIRECT_DST_V8 : SI_INDIRECT_DST<VReg_256>;
def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST<VReg_512>;
def SI_INDIRECT_DST_V32 : SI_INDIRECT_DST<VReg_1024>;

} // End Defs = [M0, EXEC, SCC], UseNamedOperandTable = 1


// This is a pseudo variant of the v_movreld_b32 (or v_mov_b32
// expecting to be executed with gpr indexing mode enabled)
// instruction in which the vector operand appears only twice, once as
// def and once as use. Using this pseudo avoids problems with the Two
// Address instructions pass.
class INDIRECT_REG_WRITE_pseudo<RegisterClass rc,
                                RegisterOperand val_ty> : PseudoInstSI <
  (outs rc:$vdst), (ins rc:$vsrc, val_ty:$val, i32imm:$subreg)> {
  let Constraints = "$vsrc = $vdst";
  let Uses = [M0];
}

class V_INDIRECT_REG_WRITE_B32_pseudo<RegisterClass rc> :
  INDIRECT_REG_WRITE_pseudo<rc, VSrc_b32> {
  let VALU = 1;
  let VOP1 = 1;
  let Uses = [M0, EXEC];
}

class S_INDIRECT_REG_WRITE_pseudo<RegisterClass rc,
                                  RegisterOperand val_ty> :
  INDIRECT_REG_WRITE_pseudo<rc, val_ty> {
  let SALU = 1;
  let SOP1 = 1;
  let Uses = [M0];
}

class S_INDIRECT_REG_WRITE_B32_pseudo<RegisterClass rc> :
  S_INDIRECT_REG_WRITE_pseudo<rc, SSrc_b32>;
class S_INDIRECT_REG_WRITE_B64_pseudo<RegisterClass rc> :
  S_INDIRECT_REG_WRITE_pseudo<rc, SSrc_b64>;


def V_INDIRECT_REG_WRITE_B32_V1 : V_INDIRECT_REG_WRITE_B32_pseudo<VGPR_32>;
def V_INDIRECT_REG_WRITE_B32_V2 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_64>;
def V_INDIRECT_REG_WRITE_B32_V3 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_96>;
def V_INDIRECT_REG_WRITE_B32_V4 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_128>;
def V_INDIRECT_REG_WRITE_B32_V5 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_160>;
def V_INDIRECT_REG_WRITE_B32_V8 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_256>;
def V_INDIRECT_REG_WRITE_B32_V16 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_512>;
def V_INDIRECT_REG_WRITE_B32_V32 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_1024>;

def S_INDIRECT_REG_WRITE_B32_V1 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_32>;
def S_INDIRECT_REG_WRITE_B32_V2 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_64>;
def S_INDIRECT_REG_WRITE_B32_V3 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_96>;
def S_INDIRECT_REG_WRITE_B32_V4 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_128>;
def S_INDIRECT_REG_WRITE_B32_V5 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_160>;
def S_INDIRECT_REG_WRITE_B32_V8 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_256>;
def S_INDIRECT_REG_WRITE_B32_V16 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_512>;
def S_INDIRECT_REG_WRITE_B32_V32 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_1024>;

def S_INDIRECT_REG_WRITE_B64_V1 : S_INDIRECT_REG_WRITE_B64_pseudo<SReg_64>;
def S_INDIRECT_REG_WRITE_B64_V2 : S_INDIRECT_REG_WRITE_B64_pseudo<SReg_128>;
def S_INDIRECT_REG_WRITE_B64_V4 : S_INDIRECT_REG_WRITE_B64_pseudo<SReg_256>;
def S_INDIRECT_REG_WRITE_B64_V8 : S_INDIRECT_REG_WRITE_B64_pseudo<SReg_512>;
def S_INDIRECT_REG_WRITE_B64_V16 : S_INDIRECT_REG_WRITE_B64_pseudo<SReg_1024>;


multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> {
  let UseNamedOperandTable = 1, SGPRSpill = 1, Uses = [EXEC] in {
    def _SAVE : PseudoInstSI <
      (outs),
      (ins sgpr_class:$data, i32imm:$addr)> {
      let mayStore = 1;
      let mayLoad = 0;
    }

    def _RESTORE : PseudoInstSI <
      (outs sgpr_class:$data),
      (ins i32imm:$addr)> {
      let mayStore = 0;
      let mayLoad = 1;
    }
  } // End UseNamedOperandTable = 1, SGPRSpill = 1, Uses = [EXEC]
}

// You cannot use M0 as the output of v_readlane_b32 instructions or
// use it in the sdata operand of SMEM instructions. We still need to
// be able to spill the physical register m0, so allow it for
// SI_SPILL_32_* instructions.
defm SI_SPILL_S32  : SI_SPILL_SGPR <SReg_32>;
defm SI_SPILL_S64  : SI_SPILL_SGPR <SReg_64>;
defm SI_SPILL_S96  : SI_SPILL_SGPR <SReg_96>;
defm SI_SPILL_S128 : SI_SPILL_SGPR <SReg_128>;
defm SI_SPILL_S160 : SI_SPILL_SGPR <SReg_160>;
defm SI_SPILL_S192 : SI_SPILL_SGPR <SReg_192>;
defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>;
defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>;
defm SI_SPILL_S1024 : SI_SPILL_SGPR <SReg_1024>;

multiclass SI_SPILL_VGPR <RegisterClass vgpr_class> {
  let UseNamedOperandTable = 1, VGPRSpill = 1,
       SchedRW = [WriteVMEM] in {
    def _SAVE : VPseudoInstSI <
      (outs),
      (ins vgpr_class:$vdata, i32imm:$vaddr, SReg_128:$srsrc,
           SReg_32:$soffset, i32imm:$offset)> {
      let mayStore = 1;
      let mayLoad = 0;
      // (2 * 4) + (8 * num_subregs) bytes maximum
      int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), 3), 8);
      // Size field is unsigned char and cannot fit more.
      let Size = !if(!le(MaxSize, 256), MaxSize, 252);
    }

    def _RESTORE : VPseudoInstSI <
      (outs vgpr_class:$vdata),
      (ins i32imm:$vaddr, SReg_128:$srsrc, SReg_32:$soffset,
           i32imm:$offset)> {
      let mayStore = 0;
      let mayLoad = 1;

      // (2 * 4) + (8 * num_subregs) bytes maximum
      int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), 3), 8);
      // Size field is unsigned char and cannot fit more.
      let Size = !if(!le(MaxSize, 256), MaxSize, 252);
    }
  } // End UseNamedOperandTable = 1, VGPRSpill = 1, SchedRW = [WriteVMEM]
}
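
// Worked example of the Size bound above: SI_SPILL_V128_SAVE spills a
// VReg_128, so num_subregs = 128 / 32 = 4 and
// MaxSize = (2 * 4) + (8 * 4) = 40 bytes, i.e. up to 8 bytes per 32-bit
// subregister store plus two 4-byte setup instructions.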

defm SI_SPILL_V32  : SI_SPILL_VGPR <VGPR_32>;
defm SI_SPILL_V64  : SI_SPILL_VGPR <VReg_64>;
defm SI_SPILL_V96  : SI_SPILL_VGPR <VReg_96>;
defm SI_SPILL_V128 : SI_SPILL_VGPR <VReg_128>;
defm SI_SPILL_V160 : SI_SPILL_VGPR <VReg_160>;
defm SI_SPILL_V192 : SI_SPILL_VGPR <VReg_192>;
defm SI_SPILL_V256 : SI_SPILL_VGPR <VReg_256>;
defm SI_SPILL_V512 : SI_SPILL_VGPR <VReg_512>;
defm SI_SPILL_V1024 : SI_SPILL_VGPR <VReg_1024>;

multiclass SI_SPILL_AGPR <RegisterClass vgpr_class> {
  let UseNamedOperandTable = 1, VGPRSpill = 1,
      Constraints = "@earlyclobber $tmp",
      SchedRW = [WriteVMEM] in {
    def _SAVE : VPseudoInstSI <
      (outs VGPR_32:$tmp),
      (ins vgpr_class:$vdata, i32imm:$vaddr, SReg_128:$srsrc,
           SReg_32:$soffset, i32imm:$offset)> {
      let mayStore = 1;
      let mayLoad = 0;
      // (2 * 4) + (16 * num_subregs) bytes maximum
      int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), 4), 8);
      // Size field is unsigned char and cannot fit more.
      let Size = !if(!le(MaxSize, 256), MaxSize, 252);
    }

    def _RESTORE : VPseudoInstSI <
      (outs vgpr_class:$vdata, VGPR_32:$tmp),
      (ins i32imm:$vaddr, SReg_128:$srsrc, SReg_32:$soffset,
           i32imm:$offset)> {
      let mayStore = 0;
      let mayLoad = 1;

      // (2 * 4) + (16 * num_subregs) bytes maximum
      int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), 4), 8);
      // Size field is unsigned char and cannot fit more.
      let Size = !if(!le(MaxSize, 256), MaxSize, 252);
    }
  } // End UseNamedOperandTable = 1, VGPRSpill = 1, SchedRW = [WriteVMEM]
}

defm SI_SPILL_A32  : SI_SPILL_AGPR <AGPR_32>;
defm SI_SPILL_A64  : SI_SPILL_AGPR <AReg_64>;
defm SI_SPILL_A128 : SI_SPILL_AGPR <AReg_128>;
defm SI_SPILL_A512 : SI_SPILL_AGPR <AReg_512>;
defm SI_SPILL_A1024 : SI_SPILL_AGPR <AReg_1024>;

def SI_PC_ADD_REL_OFFSET : SPseudoInstSI <
  (outs SReg_64:$dst),
  (ins si_ga:$ptr_lo, si_ga:$ptr_hi),
  [(set SReg_64:$dst,
      (i64 (SIpc_add_rel_offset tglobaladdr:$ptr_lo, tglobaladdr:$ptr_hi)))]> {
  let Defs = [SCC];
}

def : GCNPat <
  (SIpc_add_rel_offset tglobaladdr:$ptr_lo, 0),
  (SI_PC_ADD_REL_OFFSET $ptr_lo, (i32 0))
>;

def : GCNPat<
  (AMDGPUtrap timm:$trapid),
  (S_TRAP $trapid)
>;

def : GCNPat<
  (AMDGPUelse i1:$src, bb:$target),
  (SI_ELSE $src, $target, 0)
>;

def : Pat <
  (int_amdgcn_kill i1:$src),
  (SI_KILL_I1_PSEUDO SCSrc_i1:$src, 0)
>;

def : Pat <
  (int_amdgcn_kill (i1 (not i1:$src))),
  (SI_KILL_I1_PSEUDO SCSrc_i1:$src, -1)
>;

def : Pat <
  (int_amdgcn_kill (i1 (setcc f32:$src, InlineImmFP32:$imm, cond:$cond))),
  (SI_KILL_F32_COND_IMM_PSEUDO VSrc_b32:$src, (bitcast_fpimm_to_i32 $imm), (cond_as_i32imm $cond))
>;

// TODO: we could add more variants for other types of conditionals

def : Pat <
  (i64 (int_amdgcn_icmp i1:$src, (i1 0), (i32 33))),
  (COPY $src) // Return the SGPRs representing i1 src
>;

def : Pat <
  (i32 (int_amdgcn_icmp i1:$src, (i1 0), (i32 33))),
  (COPY $src) // Return the SGPRs representing i1 src
>;

//===----------------------------------------------------------------------===//
// VOP1 Patterns
//===----------------------------------------------------------------------===//

let OtherPredicates = [UnsafeFPMath] in {

//def : RcpPat<V_RCP_F64_e32, f64>;
//defm : RsqPat<V_RSQ_F64_e32, f64>;
//defm : RsqPat<V_RSQ_F32_e32, f32>;

def : RsqPat<V_RSQ_F32_e32, f32>;
def : RsqPat<V_RSQ_F64_e32, f64>;

// Convert (x - floor(x)) to fract(x)
def : GCNPat <
  (f32 (fsub (f32 (VOP3Mods f32:$x, i32:$mods)),
             (f32 (ffloor (f32 (VOP3Mods f32:$x, i32:$mods)))))),
  (V_FRACT_F32_e64 $mods, $x)
>;

// Convert (x + (-floor(x))) to fract(x)
def : GCNPat <
  (f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)),
             (f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))),
  (V_FRACT_F64_e64 $mods, $x)
>;

} // End OtherPredicates = [UnsafeFPMath]


// f16_to_fp patterns
def : GCNPat <
  (f32 (f16_to_fp i32:$src0)),
  (V_CVT_F32_F16_e64 SRCMODS.NONE, $src0)
>;

def : GCNPat <
  (f32 (f16_to_fp (and_oneuse i32:$src0, 0x7fff))),
  (V_CVT_F32_F16_e64 SRCMODS.ABS, $src0)
>;

def : GCNPat <
  (f32 (f16_to_fp (i32 (srl_oneuse (and_oneuse i32:$src0, 0x7fff0000), (i32 16))))),
  (V_CVT_F32_F16_e64 SRCMODS.ABS, (i32 (V_LSHRREV_B32_e64 (i32 16), i32:$src0)))
>;

def : GCNPat <
  (f32 (f16_to_fp (or_oneuse i32:$src0, 0x8000))),
  (V_CVT_F32_F16_e64 SRCMODS.NEG_ABS, $src0)
>;

def : GCNPat <
  (f32 (f16_to_fp (xor_oneuse i32:$src0, 0x8000))),
  (V_CVT_F32_F16_e64 SRCMODS.NEG, $src0)
>;

def : GCNPat <
  (f64 (fpextend f16:$src)),
  (V_CVT_F64_F32_e32 (V_CVT_F32_F16_e32 $src))
>;

// fp_to_fp16 patterns
def : GCNPat <
  (i32 (AMDGPUfp_to_f16 (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
  (V_CVT_F16_F32_e64 $src0_modifiers, f32:$src0)
>;

def : GCNPat <
  (i32 (fp_to_sint f16:$src)),
  (V_CVT_I32_F32_e32 (V_CVT_F32_F16_e32 VSrc_b32:$src))
>;

def : GCNPat <
  (i32 (fp_to_uint f16:$src)),
  (V_CVT_U32_F32_e32 (V_CVT_F32_F16_e32 VSrc_b32:$src))
>;

def : GCNPat <
  (f16 (sint_to_fp i32:$src)),
  (V_CVT_F16_F32_e32 (V_CVT_F32_I32_e32 VSrc_b32:$src))
>;

def : GCNPat <
  (f16 (uint_to_fp i32:$src)),
  (V_CVT_F16_F32_e32 (V_CVT_F32_U32_e32 VSrc_b32:$src))
>;

//===----------------------------------------------------------------------===//
// VOP2 Patterns
//===----------------------------------------------------------------------===//

// TODO: Check only no src2 mods?
class FMADPat <ValueType vt, Instruction inst, SDPatternOperator node>
  : GCNPat <(vt (node (vt (VOP3NoMods vt:$src0)),
                      (vt (VOP3NoMods vt:$src1)),
                      (vt (VOP3NoMods vt:$src2)))),
    (inst SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
          SRCMODS.NONE, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
>;


// Prefer mac form when there are no modifiers.
let AddedComplexity = 9 in {
def : FMADPat <f32, V_MAC_F32_e64, fmad>;
def : FMADPat <f32, V_MAC_F32_e64, AMDGPUfmad_ftz>;

let SubtargetPredicate = Has16BitInsts in {
def : FMADPat <f16, V_MAC_F16_e64, fmad>;
def : FMADPat <f16, V_MAC_F16_e64, AMDGPUfmad_ftz>;
}

}

class FMADModsPat<ValueType Ty, Instruction inst, SDPatternOperator mad_opr>
  : GCNPat<
  (Ty (mad_opr (Ty (VOP3Mods Ty:$src0, i32:$src0_mod)),
               (Ty (VOP3Mods Ty:$src1, i32:$src1_mod)),
               (Ty (VOP3Mods Ty:$src2, i32:$src2_mod)))),
  (inst $src0_mod, $src0, $src1_mod, $src1,
  $src2_mod, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
>;

let SubtargetPredicate = HasMadMacF32Insts in
def : FMADModsPat<f32, V_MAD_F32, AMDGPUfmad_ftz>;
def : FMADModsPat<f16, V_MAD_F16, AMDGPUfmad_ftz> {
  let SubtargetPredicate = Has16BitInsts;
}

class VOPSelectModsPat <ValueType vt> : GCNPat <
  (vt (select i1:$src0, (VOP3Mods vt:$src1, i32:$src1_mods),
                        (VOP3Mods vt:$src2, i32:$src2_mods))),
  (V_CNDMASK_B32_e64 FP32InputMods:$src2_mods, VSrc_b32:$src2,
                     FP32InputMods:$src1_mods, VSrc_b32:$src1, SSrc_i1:$src0)
>;

class VOPSelectPat <ValueType vt> : GCNPat <
  (vt (select i1:$src0, vt:$src1, vt:$src2)),
  (V_CNDMASK_B32_e64 0, VSrc_b32:$src2, 0, VSrc_b32:$src1, SSrc_i1:$src0)
>;

def : VOPSelectModsPat <i32>;
def : VOPSelectModsPat <f32>;
def : VOPSelectPat <f16>;
def : VOPSelectPat <i16>;

let AddedComplexity = 1 in {
def : GCNPat <
  (i32 (add (i32 (getDivergentFrag<ctpop>.ret i32:$popcnt)), i32:$val)),
  (V_BCNT_U32_B32_e64 $popcnt, $val)
>;
}

def : GCNPat <
  (i32 (ctpop i32:$popcnt)),
  (V_BCNT_U32_B32_e64 VSrc_b32:$popcnt, (i32 0))
>;

def : GCNPat <
  (i16 (add (i16 (trunc (i32 (getDivergentFrag<ctpop>.ret i32:$popcnt)))), i16:$val)),
  (V_BCNT_U32_B32_e64 $popcnt, $val)
>;

/********** ============================================ **********/
/********** Extraction, Insertion, Building and Casting  **********/
/********** ============================================ **********/

foreach Index = 0-2 in {
  def Extract_Element_v2i32_#Index : Extract_Element <
    i32, v2i32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v2i32_#Index : Insert_Element <
    i32, v2i32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Extract_Element_v2f32_#Index : Extract_Element <
    f32, v2f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v2f32_#Index : Insert_Element <
    f32, v2f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
}

foreach Index = 0-2 in {
  def Extract_Element_v3i32_#Index : Extract_Element <
    i32, v3i32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v3i32_#Index : Insert_Element <
    i32, v3i32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Extract_Element_v3f32_#Index : Extract_Element <
    f32, v3f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v3f32_#Index : Insert_Element <
    f32, v3f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
}

foreach Index = 0-3 in {
  def Extract_Element_v4i32_#Index : Extract_Element <
    i32, v4i32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v4i32_#Index : Insert_Element <
    i32, v4i32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Extract_Element_v4f32_#Index : Extract_Element <
    f32, v4f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v4f32_#Index : Insert_Element <
    f32, v4f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
}

foreach Index = 0-4 in {
  def Extract_Element_v5i32_#Index : Extract_Element <
    i32, v5i32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v5i32_#Index : Insert_Element <
    i32, v5i32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Extract_Element_v5f32_#Index : Extract_Element <
    f32, v5f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v5f32_#Index : Insert_Element <
    f32, v5f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
}

foreach Index = 0-7 in {
  def Extract_Element_v8i32_#Index : Extract_Element <
    i32, v8i32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v8i32_#Index : Insert_Element <
    i32, v8i32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Extract_Element_v8f32_#Index : Extract_Element <
    f32, v8f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v8f32_#Index : Insert_Element <
    f32, v8f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
}

foreach Index = 0-15 in {
  def Extract_Element_v16i32_#Index : Extract_Element <
    i32, v16i32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v16i32_#Index : Insert_Element <
    i32, v16i32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Extract_Element_v16f32_#Index : Extract_Element <
    f32, v16f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v16f32_#Index : Insert_Element <
    f32, v16f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
}


def : Pat <
  (extract_subvector v4i16:$vec, (i32 0)),
  (v2i16 (EXTRACT_SUBREG v4i16:$vec, sub0))
>;

def : Pat <
  (extract_subvector v4i16:$vec, (i32 2)),
  (v2i16 (EXTRACT_SUBREG v4i16:$vec, sub1))
>;

def : Pat <
  (extract_subvector v4f16:$vec, (i32 0)),
  (v2f16 (EXTRACT_SUBREG v4f16:$vec, sub0))
>;

def : Pat <
  (extract_subvector v4f16:$vec, (i32 2)),
  (v2f16 (EXTRACT_SUBREG v4f16:$vec, sub1))
>;

foreach Index = 0-31 in {
  def Extract_Element_v32i32_#Index : Extract_Element <
    i32, v32i32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Insert_Element_v32i32_#Index : Insert_Element <
    i32, v32i32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Extract_Element_v32f32_#Index : Extract_Element <
    f32, v32f32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Insert_Element_v32f32_#Index : Insert_Element <
    f32, v32f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
}

// FIXME: Why are only some of these type combinations defined for SReg and
// VReg?
// 16-bit bitcast
def : BitConvert <i16, f16, VGPR_32>;
def : BitConvert <f16, i16, VGPR_32>;
def : BitConvert <i16, f16, SReg_32>;
def : BitConvert <f16, i16, SReg_32>;

// 32-bit bitcast
def : BitConvert <i32, f32, VGPR_32>;
def : BitConvert <f32, i32, VGPR_32>;
def : BitConvert <i32, f32, SReg_32>;
def : BitConvert <f32, i32, SReg_32>;
def : BitConvert <v2i16, i32, SReg_32>;
def : BitConvert <i32, v2i16, SReg_32>;
def : BitConvert <v2f16, i32, SReg_32>;
def : BitConvert <i32, v2f16, SReg_32>;
def : BitConvert <v2i16, v2f16, SReg_32>;
def : BitConvert <v2f16, v2i16, SReg_32>;
def : BitConvert <v2f16, f32, SReg_32>;
def : BitConvert <f32, v2f16, SReg_32>;
def : BitConvert <v2i16, f32, SReg_32>;
def : BitConvert <f32, v2i16, SReg_32>;

// 64-bit bitcast
def : BitConvert <i64, f64, VReg_64>;
def : BitConvert <f64, i64, VReg_64>;
def : BitConvert <v2i32, v2f32, VReg_64>;
def : BitConvert <v2f32, v2i32, VReg_64>;
def : BitConvert <i64, v2i32, VReg_64>;
def : BitConvert <v2i32, i64, VReg_64>;
def : BitConvert <i64, v2f32, VReg_64>;
def : BitConvert <v2f32, i64, VReg_64>;
def : BitConvert <f64, v2f32, VReg_64>;
def : BitConvert <v2f32, f64, VReg_64>;
def : BitConvert <f64, v2i32, VReg_64>;
def : BitConvert <v2i32, f64, VReg_64>;
def : BitConvert <v4i16, v4f16, VReg_64>;
def : BitConvert <v4f16, v4i16, VReg_64>;

// FIXME: Make SGPR
def : BitConvert <v2i32, v4f16, VReg_64>;
def : BitConvert <v4f16, v2i32, VReg_64>;
def : BitConvert <v2i32, v4i16, VReg_64>;
def : BitConvert <v4i16, v2i32, VReg_64>;
def : BitConvert <v2f32, v4f16, VReg_64>;
def : BitConvert <v4f16, v2f32, VReg_64>;
def : BitConvert <v2f32, v4i16, VReg_64>;
def : BitConvert <v4i16, v2f32, VReg_64>;
def : BitConvert <v4i16, f64, VReg_64>;
def : BitConvert <v4f16, f64, VReg_64>;
def : BitConvert <f64, v4i16, VReg_64>;
def : BitConvert <f64, v4f16, VReg_64>;
def : BitConvert <v4i16, i64, VReg_64>;
def : BitConvert <v4f16, i64, VReg_64>;
def : BitConvert <i64, v4i16, VReg_64>;
def : BitConvert <i64, v4f16, VReg_64>;

def : BitConvert <v4i32, v4f32, VReg_128>;
def : BitConvert <v4f32, v4i32, VReg_128>;

// 96-bit bitcast
def : BitConvert <v3i32, v3f32, SGPR_96>;
def : BitConvert <v3f32, v3i32, SGPR_96>;

// 128-bit bitcast
def : BitConvert <v2i64, v4i32, SReg_128>;
def : BitConvert <v4i32, v2i64, SReg_128>;
def : BitConvert <v2f64, v4f32, VReg_128>;
def : BitConvert <v2f64, v4i32, VReg_128>;
def : BitConvert <v4f32, v2f64, VReg_128>;
def : BitConvert <v4i32, v2f64, VReg_128>;
def : BitConvert <v2i64, v2f64, VReg_128>;
def : BitConvert <v2f64, v2i64, VReg_128>;
def : BitConvert <v4f32, v2i64, VReg_128>;
def : BitConvert <v2i64, v4f32, VReg_128>;

// 160-bit bitcast
def : BitConvert <v5i32, v5f32, SGPR_160>;
def : BitConvert <v5f32, v5i32, SGPR_160>;

// 256-bit bitcast
def : BitConvert <v8i32, v8f32, SReg_256>;
def : BitConvert <v8f32, v8i32, SReg_256>;
def : BitConvert <v8i32, v8f32, VReg_256>;
def : BitConvert <v8f32, v8i32, VReg_256>;
def : BitConvert <v4i64, v4f64, VReg_256>;
def : BitConvert <v4f64, v4i64, VReg_256>;
def : BitConvert <v4i64, v8i32, VReg_256>;
def : BitConvert <v4i64, v8f32, VReg_256>;
def : BitConvert <v4f64, v8i32, VReg_256>;
def : BitConvert <v4f64, v8f32, VReg_256>;
def : BitConvert <v8i32, v4i64, VReg_256>;
def : BitConvert <v8f32, v4i64, VReg_256>;
def : BitConvert <v8i32, v4f64, VReg_256>;
def : BitConvert <v8f32, v4f64, VReg_256>;


// 512-bit bitcast
def : BitConvert <v16i32, v16f32, VReg_512>;
def : BitConvert <v16f32, v16i32, VReg_512>;
def : BitConvert <v8i64,  v8f64,  VReg_512>;
def : BitConvert <v8f64,  v8i64,  VReg_512>;
def : BitConvert <v8i64,  v16i32, VReg_512>;
def : BitConvert <v8f64,  v16i32, VReg_512>;
def : BitConvert <v16i32, v8i64,  VReg_512>;
def : BitConvert <v16i32, v8f64,  VReg_512>;
def : BitConvert <v8i64,  v16f32, VReg_512>;
def : BitConvert <v8f64,  v16f32, VReg_512>;
def : BitConvert <v16f32, v8i64,  VReg_512>;
def : BitConvert <v16f32, v8f64,  VReg_512>;

// 1024-bit bitcast
def : BitConvert <v32i32, v32f32, VReg_1024>;
def : BitConvert <v32f32, v32i32, VReg_1024>;
def : BitConvert <v16i64, v16f64, VReg_1024>;
def : BitConvert <v16f64, v16i64, VReg_1024>;
def : BitConvert <v16i64, v32i32, VReg_1024>;
def : BitConvert <v32i32, v16i64, VReg_1024>;
def : BitConvert <v16f64, v32f32, VReg_1024>;
def : BitConvert <v32f32, v16f64, VReg_1024>;
def : BitConvert <v16i64, v32f32, VReg_1024>;
def : BitConvert <v32i32, v16f64, VReg_1024>;
def : BitConvert <v16f64, v32i32, VReg_1024>;
def : BitConvert <v32f32, v16i64, VReg_1024>;


/********** =================== **********/
/********** Src & Dst modifiers **********/
/********** =================== **********/


// If denormals are not enabled, it only impacts the compare of the
// inputs. The output result is not flushed.
class ClampPat<Instruction inst, ValueType vt> : GCNPat <
  (vt (AMDGPUclamp (VOP3Mods vt:$src0, i32:$src0_modifiers))),
  (inst i32:$src0_modifiers, vt:$src0,
        i32:$src0_modifiers, vt:$src0, DSTCLAMP.ENABLE, DSTOMOD.NONE)
>;
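
// That is, clamp(x) becomes a max of x with itself with the clamp bit set,
// e.g. for f32 roughly: v_max_f32_e64 v0, v1, v1 clamp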

def : ClampPat<V_MAX_F32_e64, f32>;
def : ClampPat<V_MAX_F64, f64>;
def : ClampPat<V_MAX_F16_e64, f16>;

let SubtargetPredicate = HasVOP3PInsts in {
def : GCNPat <
  (v2f16 (AMDGPUclamp (VOP3PMods v2f16:$src0, i32:$src0_modifiers))),
  (V_PK_MAX_F16 $src0_modifiers, $src0,
                $src0_modifiers, $src0, DSTCLAMP.ENABLE)
>;
}

/********** ================================ **********/
/********** Floating point absolute/negative **********/
/********** ================================ **********/

// Prevent expanding both fneg and fabs.
// TODO: Add IgnoredBySelectionDAG bit?
let AddedComplexity = 1 in { // Prefer SALU to VALU patterns for DAG

def : GCNPat <
  (fneg (fabs (f32 SReg_32:$src))),
  (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80000000))) // Set sign bit
>;

def : GCNPat <
  (fabs (f32 SReg_32:$src)),
  (S_AND_B32 SReg_32:$src, (S_MOV_B32 (i32 0x7fffffff)))
>;

def : GCNPat <
  (fneg (f32 SReg_32:$src)),
  (S_XOR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80000000)))
>;

def : GCNPat <
  (fneg (f16 SReg_32:$src)),
  (S_XOR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00008000)))
>;

def : GCNPat <
  (fneg (f16 VGPR_32:$src)),
  (V_XOR_B32_e32 (S_MOV_B32 (i32 0x00008000)), VGPR_32:$src)
>;

def : GCNPat <
  (fabs (f16 SReg_32:$src)),
  (S_AND_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00007fff)))
>;

def : GCNPat <
  (fneg (fabs (f16 SReg_32:$src))),
  (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00008000))) // Set sign bit
>;

def : GCNPat <
  (fneg (fabs (f16 VGPR_32:$src))),
  (V_OR_B32_e32 (S_MOV_B32 (i32 0x00008000)), VGPR_32:$src) // Set sign bit
>;

def : GCNPat <
  (fneg (v2f16 SReg_32:$src)),
  (S_XOR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000)))
>;

def : GCNPat <
  (fabs (v2f16 SReg_32:$src)),
  (S_AND_B32 SReg_32:$src, (S_MOV_B32 (i32 0x7fff7fff)))
>;

// This is really (fneg (fabs v2f16:$src))
//
// fabs is not reported as free because there is a modifier for it in
// VOP3P instructions, so it is turned into the bit op.
def : GCNPat <
  (fneg (v2f16 (bitconvert (and_oneuse (i32 SReg_32:$src), 0x7fff7fff)))),
  (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit
>;

def : GCNPat <
  (fneg (v2f16 (fabs SReg_32:$src))),
  (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit
>;

// FIXME: The implicit-def of scc from S_[X]OR/AND_B32 is mishandled
// def : GCNPat <
//   (fneg (f64 SReg_64:$src)),
//   (REG_SEQUENCE SReg_64,
//     (i32 (EXTRACT_SUBREG SReg_64:$src, sub0)),
//     sub0,
//     (S_XOR_B32 (i32 (EXTRACT_SUBREG SReg_64:$src, sub1)),
//                (i32 (S_MOV_B32 (i32 0x80000000)))),
//     sub1)
// >;

// def : GCNPat <
//   (fneg (fabs (f64 SReg_64:$src))),
//   (REG_SEQUENCE SReg_64,
//     (i32 (EXTRACT_SUBREG SReg_64:$src, sub0)),
//     sub0,
//     (S_OR_B32 (i32 (EXTRACT_SUBREG SReg_64:$src, sub1)),
//               (S_MOV_B32 (i32 0x80000000))), // Set sign bit.
//     sub1)
// >;

// FIXME: Use S_BITSET0_B32/B64?
// def : GCNPat <
//   (fabs (f64 SReg_64:$src)),
//   (REG_SEQUENCE SReg_64,
//     (i32 (EXTRACT_SUBREG SReg_64:$src, sub0)),
//     sub0,
//     (S_AND_B32 (i32 (EXTRACT_SUBREG SReg_64:$src, sub1)),
//                (i32 (S_MOV_B32 (i32 0x7fffffff)))),
//     sub1)
// >;

} // End let AddedComplexity = 1

def : GCNPat <
  (fabs (f32 VGPR_32:$src)),
  (V_AND_B32_e32 (S_MOV_B32 (i32 0x7fffffff)), VGPR_32:$src)
>;

def : GCNPat <
  (fneg (f32 VGPR_32:$src)),
  (V_XOR_B32_e32 (S_MOV_B32 (i32 0x80000000)), VGPR_32:$src)
>;

def : GCNPat <
  (fabs (f16 VGPR_32:$src)),
  (V_AND_B32_e32 (S_MOV_B32 (i32 0x00007fff)), VGPR_32:$src)
>;

def : GCNPat <
  (fneg (v2f16 VGPR_32:$src)),
  (V_XOR_B32_e32 (S_MOV_B32 (i32 0x80008000)), VGPR_32:$src)
>;

def : GCNPat <
  (fabs (v2f16 VGPR_32:$src)),
  (V_AND_B32_e32 (S_MOV_B32 (i32 0x7fff7fff)), VGPR_32:$src)
>;

def : GCNPat <
  (fneg (v2f16 (fabs VGPR_32:$src))),
  (V_OR_B32_e32 (S_MOV_B32 (i32 0x80008000)), VGPR_32:$src) // Set sign bit
>;

def : GCNPat <
  (fabs (f64 VReg_64:$src)),
  (REG_SEQUENCE VReg_64,
    (i32 (EXTRACT_SUBREG VReg_64:$src, sub0)),
    sub0,
    (V_AND_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$src, sub1)),
                   (V_MOV_B32_e32 (i32 0x7fffffff))), // Clear sign bit.
    sub1)
>;

// TODO: Use SGPR for constant
def : GCNPat <
  (fneg (f64 VReg_64:$src)),
  (REG_SEQUENCE VReg_64,
    (i32 (EXTRACT_SUBREG VReg_64:$src, sub0)),
    sub0,
    (V_XOR_B32_e32 (i32 (EXTRACT_SUBREG VReg_64:$src, sub1)),
                   (i32 (V_MOV_B32_e32 (i32 0x80000000)))),
    sub1)
>;

// TODO: Use SGPR for constant
def : GCNPat <
  (fneg (fabs (f64 VReg_64:$src))),
  (REG_SEQUENCE VReg_64,
    (i32 (EXTRACT_SUBREG VReg_64:$src, sub0)),
    sub0,
    (V_OR_B32_e32 (i32 (EXTRACT_SUBREG VReg_64:$src, sub1)),
                  (V_MOV_B32_e32 (i32 0x80000000))), // Set sign bit.
    sub1)
>;

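// The fcopysign patterns below build on v_bfi_b32, which computes
// (src0 & src1) | (~src0 & src2): with a mask of the non-sign bits as src0,
// the magnitude comes from the first value and the sign bit from the
// second. The shifts move the f16 sign bit (bit 15) into or out of place
// when the operands have different widths.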
def : GCNPat <
  (fcopysign f16:$src0, f16:$src1),
  (V_BFI_B32 (S_MOV_B32 (i32 0x00007fff)), $src0, $src1)
>;

def : GCNPat <
  (fcopysign f32:$src0, f16:$src1),
  (V_BFI_B32 (S_MOV_B32 (i32 0x7fffffff)), $src0,
             (V_LSHLREV_B32_e64 (i32 16), $src1))
>;

def : GCNPat <
  (fcopysign f64:$src0, f16:$src1),
  (REG_SEQUENCE SReg_64,
    (i32 (EXTRACT_SUBREG $src0, sub0)), sub0,
    (V_BFI_B32 (S_MOV_B32 (i32 0x7fffffff)), (i32 (EXTRACT_SUBREG $src0, sub1)),
               (V_LSHLREV_B32_e64 (i32 16), $src1)), sub1)
>;

def : GCNPat <
  (fcopysign f16:$src0, f32:$src1),
  (V_BFI_B32 (S_MOV_B32 (i32 0x00007fff)), $src0,
             (V_LSHRREV_B32_e64 (i32 16), $src1))
>;

def : GCNPat <
  (fcopysign f16:$src0, f64:$src1),
  (V_BFI_B32 (S_MOV_B32 (i32 0x00007fff)), $src0,
             (V_LSHRREV_B32_e64 (i32 16), (EXTRACT_SUBREG $src1, sub1)))
>;

/********** ================== **********/
/********** Immediate Patterns **********/
/********** ================== **********/

def : GCNPat <
  (VGPRImm<(i32 imm)>:$imm),
  (V_MOV_B32_e32 imm:$imm)
>;

def : GCNPat <
  (VGPRImm<(f32 fpimm)>:$imm),
  (V_MOV_B32_e32 (f32 (bitcast_fpimm_to_i32 $imm)))
>;

def : GCNPat <
  (i32 imm:$imm),
  (S_MOV_B32 imm:$imm)
>;

def : GCNPat <
  (VGPRImm<(SIlds tglobaladdr:$ga)>),
  (V_MOV_B32_e32 $ga)
>;

def : GCNPat <
  (SIlds tglobaladdr:$ga),
  (S_MOV_B32 $ga)
>;

// FIXME: Workaround for ordering issue with peephole optimizer where
// a register class copy interferes with immediate folding.  Should
// use s_mov_b32, which can be shrunk to s_movk_i32
def : GCNPat <
  (VGPRImm<(f16 fpimm)>:$imm),
  (V_MOV_B32_e32 (f16 (bitcast_fpimm_to_i32 $imm)))
>;

def : GCNPat <
  (f32 fpimm:$imm),
  (S_MOV_B32 (f32 (bitcast_fpimm_to_i32 $imm)))
>;

def : GCNPat <
  (f16 fpimm:$imm),
  (S_MOV_B32 (i32 (bitcast_fpimm_to_i32 $imm)))
>;

def : GCNPat <
 (i32 frameindex:$fi),
 (V_MOV_B32_e32 (i32 (frameindex_to_targetframeindex $fi)))
>;

def : GCNPat <
  (i64 InlineImm64:$imm),
  (S_MOV_B64 InlineImm64:$imm)
>;

// XXX - Should this use an s_cmp to set SCC?

// Set to sign-extended 64-bit value (true = -1, false = 0)
def : GCNPat <
  (i1 imm:$imm),
  (S_MOV_B64 (i64 (as_i64imm $imm)))
> {
  let WaveSizePredicate = isWave64;
}

def : GCNPat <
  (i1 imm:$imm),
  (S_MOV_B32 (i32 (as_i32imm $imm)))
> {
  let WaveSizePredicate = isWave32;
}

def : GCNPat <
  (f64 InlineImmFP64:$imm),
  (S_MOV_B64 (f64 (bitcast_fpimm_to_i64 InlineImmFP64:$imm)))
>;

/********** ================== **********/
/********** Intrinsic Patterns **********/
/********** ================== **********/

// FIXME: Should use _e64 and select source modifiers.
def : POW_Common <V_LOG_F32_e32, V_EXP_F32_e32, V_MUL_LEGACY_F32_e32>;

def : GCNPat <
  (i32 (sext i1:$src0)),
  (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
                     /*src1mod*/(i32 0), /*src1*/(i32 -1), $src0)
>;

class Ext32Pat <SDNode ext> : GCNPat <
  (i32 (ext i1:$src0)),
  (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
                     /*src1mod*/(i32 0), /*src1*/(i32 1), $src0)
>;

def : Ext32Pat <zext>;
def : Ext32Pat <anyext>;

// The multiplication scales from [0,1) to the unsigned integer range,
// rounding down a bit to avoid unwanted overflow.
def : GCNPat <
  (AMDGPUurecip i32:$src0),
  (V_CVT_U32_F32_e32
    (V_MUL_F32_e32 (i32 CONST.FP_4294966784),
                   (V_RCP_IFLAG_F32_e32 (V_CVT_F32_U32_e32 $src0))))
>;
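// (4294966784 is 2^32 - 512, an f32-representable value slightly below
// 2^32, so the scaled result stays below 2^32 before the conversion.)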

//===----------------------------------------------------------------------===//
// VOP3 Patterns
//===----------------------------------------------------------------------===//

def : IMad24Pat<V_MAD_I32_I24, 1>;
def : UMad24Pat<V_MAD_U32_U24, 1>;

// FIXME: This should only be done for VALU inputs
defm : BFIPatterns <V_BFI_B32, S_MOV_B32, SReg_64>;
def : ROTRPattern <V_ALIGNBIT_B32>;

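// v_alignbit_b32 extracts 32 contiguous bits of the 64-bit value
// {src0, src1} starting at bit src2[4:0] (a funnel-shift right), so a
// truncating 64-bit shift right by a masked or known-small amount can be
// folded into a single instruction: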
def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))),
          (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
                          (i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>;

def : GCNPat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))),
          (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
                          (i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>;

/********** ====================== **********/
/**********   Indirect addressing  **********/
/********** ====================== **********/

multiclass SI_INDIRECT_Pattern <ValueType vt, ValueType eltvt, string VecSize> {
  // Extract with offset
  def : GCNPat<
    (eltvt (extractelt vt:$src, (MOVRELOffset i32:$idx, (i32 imm:$offset)))),
    (!cast<Instruction>("SI_INDIRECT_SRC_"#VecSize) $src, $idx, imm:$offset)
  >;

  // Insert with offset
  def : GCNPat<
    (insertelt vt:$src, eltvt:$val, (MOVRELOffset i32:$idx, (i32 imm:$offset))),
    (!cast<Instruction>("SI_INDIRECT_DST_"#VecSize) $src, $idx, imm:$offset, $val)
  >;
}

defm : SI_INDIRECT_Pattern <v2f32, f32, "V2">;
defm : SI_INDIRECT_Pattern <v4f32, f32, "V4">;
defm : SI_INDIRECT_Pattern <v8f32, f32, "V8">;
defm : SI_INDIRECT_Pattern <v16f32, f32, "V16">;
defm : SI_INDIRECT_Pattern <v32f32, f32, "V32">;

defm : SI_INDIRECT_Pattern <v2i32, i32, "V2">;
defm : SI_INDIRECT_Pattern <v4i32, i32, "V4">;
defm : SI_INDIRECT_Pattern <v8i32, i32, "V8">;
defm : SI_INDIRECT_Pattern <v16i32, i32, "V16">;
defm : SI_INDIRECT_Pattern <v32i32, i32, "V32">;

//===----------------------------------------------------------------------===//
// SAD Patterns
//===----------------------------------------------------------------------===//

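// v_sad_u32 computes |src0 - src1| + src2; the patterns below match the two
// common DAG forms of an absolute difference, max-min and a
// compare-and-select of the two subtraction orders, feeding an add.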
def : GCNPat <
  (add (sub_oneuse (umax i32:$src0, i32:$src1),
                   (umin i32:$src0, i32:$src1)),
       i32:$src2),
  (V_SAD_U32 $src0, $src1, $src2, (i1 0))
>;

def : GCNPat <
  (add (select_oneuse (i1 (setugt i32:$src0, i32:$src1)),
                      (sub i32:$src0, i32:$src1),
                      (sub i32:$src1, i32:$src0)),
       i32:$src2),
  (V_SAD_U32 $src0, $src1, $src2, (i1 0))
>;

//===----------------------------------------------------------------------===//
// Conversion Patterns
//===----------------------------------------------------------------------===//

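// S_BFE takes a packed field descriptor: the low bits hold the bit offset
// and bits [22:16] the field width, hence the (offset | width << 16)
// immediates annotated below; e.g. 0x10000 is offset 0, width 1.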
def : GCNPat<(i32 (sext_inreg i32:$src, i1)),
  (S_BFE_I32 i32:$src, (i32 65536))>; // 0 | 1 << 16

// Handle sext_inreg in i64
def : GCNPat <
  (i64 (sext_inreg i64:$src, i1)),
  (S_BFE_I64 i64:$src, (i32 0x10000)) // 0 | 1 << 16
>;

def : GCNPat <
  (i16 (sext_inreg i16:$src, i1)),
  (S_BFE_I32 $src, (i32 0x00010000)) // 0 | 1 << 16
>;

def : GCNPat <
  (i16 (sext_inreg i16:$src, i8)),
  (S_BFE_I32 $src, (i32 0x80000)) // 0 | 8 << 16
>;

def : GCNPat <
  (i64 (sext_inreg i64:$src, i8)),
  (S_BFE_I64 i64:$src, (i32 0x80000)) // 0 | 8 << 16
>;

def : GCNPat <
  (i64 (sext_inreg i64:$src, i16)),
  (S_BFE_I64 i64:$src, (i32 0x100000)) // 0 | 16 << 16
>;

def : GCNPat <
  (i64 (sext_inreg i64:$src, i32)),
  (S_BFE_I64 i64:$src, (i32 0x200000)) // 0 | 32 << 16
>;

def : GCNPat <
  (i64 (zext i32:$src)),
  (REG_SEQUENCE SReg_64, $src, sub0, (S_MOV_B32 (i32 0)), sub1)
>;

def : GCNPat <
  (i64 (anyext i32:$src)),
  (REG_SEQUENCE SReg_64, $src, sub0, (i32 (IMPLICIT_DEF)), sub1)
>;

class ZExt_i64_i1_Pat <SDNode ext> : GCNPat <
  (i64 (ext i1:$src)),
    (REG_SEQUENCE VReg_64,
      (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
                         /*src1mod*/(i32 0), /*src1*/(i32 1), $src),
      sub0, (S_MOV_B32 (i32 0)), sub1)
>;


def : ZExt_i64_i1_Pat<zext>;
def : ZExt_i64_i1_Pat<anyext>;

// FIXME: We need to use COPY_TO_REGCLASS to work around the fact that
// REG_SEQUENCE patterns don't support instructions with multiple outputs.
def : GCNPat <
  (i64 (sext i32:$src)),
    (REG_SEQUENCE SReg_64, $src, sub0,
    (i32 (COPY_TO_REGCLASS (S_ASHR_I32 $src, (i32 31)), SReg_32_XM0)), sub1)
>;

def : GCNPat <
  (i64 (sext i1:$src)),
  (REG_SEQUENCE VReg_64,
    (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
                       /*src1mod*/(i32 0), /*src1*/(i32 -1), $src), sub0,
    (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
                       /*src1mod*/(i32 0), /*src1*/(i32 -1), $src), sub1)
>;

class FPToI1Pat<Instruction Inst, int KOne, ValueType kone_type, ValueType vt, SDPatternOperator fp_to_int> : GCNPat <
  (i1 (fp_to_int (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)))),
  (i1 (Inst 0, (kone_type KOne), $src0_modifiers, $src0, DSTCLAMP.NONE))
>;

def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_ONE, i32, f32, fp_to_uint>;
def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_NEG_ONE, i32, f32, fp_to_sint>;
def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_ONE, i64, f64, fp_to_uint>;
def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_NEG_ONE, i64, f64, fp_to_sint>;

// To perform a logical operation on i1 values, we have to use vector
// comparison results, since there is only one SCC register. Vector
// comparisons write either a pair of SGPRs (wave64) or a single SGPR
// (wave32), so treat these as 64-bit or 32-bit operations. When SGPR
// copies are legalized, instructions that consume a copy from SCC
// will be moved to the VALU.
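// (In wave64 an i1 value is held as a 64-bit lane mask, one bit per
// lane, so a single S_AND_B64/S_OR_B64/S_XOR_B64 performs the operation
// for every lane at once; wave32 uses the 32-bit forms below.)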

let WaveSizePredicate = isWave64 in {
def : GCNPat <
  (i1 (and i1:$src0, i1:$src1)),
  (S_AND_B64 $src0, $src1)
>;

def : GCNPat <
  (i1 (or i1:$src0, i1:$src1)),
  (S_OR_B64 $src0, $src1)
>;

def : GCNPat <
  (i1 (xor i1:$src0, i1:$src1)),
  (S_XOR_B64 $src0, $src1)
>;

def : GCNPat <
  (i1 (add i1:$src0, i1:$src1)),
  (S_XOR_B64 $src0, $src1)
>;

def : GCNPat <
  (i1 (sub i1:$src0, i1:$src1)),
  (S_XOR_B64 $src0, $src1)
>;

let AddedComplexity = 1 in {
def : GCNPat <
  (i1 (add i1:$src0, (i1 -1))),
  (S_NOT_B64 $src0)
>;

def : GCNPat <
  (i1 (sub i1:$src0, (i1 -1))),
  (S_NOT_B64 $src0)
>;
}
} // end isWave64

let WaveSizePredicate = isWave32 in {
def : GCNPat <
  (i1 (and i1:$src0, i1:$src1)),
  (S_AND_B32 $src0, $src1)
>;

def : GCNPat <
  (i1 (or i1:$src0, i1:$src1)),
  (S_OR_B32 $src0, $src1)
>;

def : GCNPat <
  (i1 (xor i1:$src0, i1:$src1)),
  (S_XOR_B32 $src0, $src1)
>;

def : GCNPat <
  (i1 (add i1:$src0, i1:$src1)),
  (S_XOR_B32 $src0, $src1)
>;

def : GCNPat <
  (i1 (sub i1:$src0, i1:$src1)),
  (S_XOR_B32 $src0, $src1)
>;

let AddedComplexity = 1 in {
def : GCNPat <
  (i1 (add i1:$src0, (i1 -1))),
  (S_NOT_B32 $src0)
>;

def : GCNPat <
  (i1 (sub i1:$src0, (i1 -1))),
  (S_NOT_B32 $src0)
>;
}
} // end isWave32

def : GCNPat <
  (f16 (sint_to_fp i1:$src)),
  (V_CVT_F16_F32_e32 (
      V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
                        /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_NEG_ONE),
                        SSrc_i1:$src))
>;

def : GCNPat <
  (f16 (uint_to_fp i1:$src)),
  (V_CVT_F16_F32_e32 (
      V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
                        /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE),
                        SSrc_i1:$src))
>;

def : GCNPat <
  (f32 (sint_to_fp i1:$src)),
  (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
                     /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_NEG_ONE),
                     SSrc_i1:$src)
>;

def : GCNPat <
  (f32 (uint_to_fp i1:$src)),
  (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
                     /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE),
                     SSrc_i1:$src)
>;

def : GCNPat <
  (f64 (sint_to_fp i1:$src)),
  (V_CVT_F64_I32_e32 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
                                        /*src1mod*/(i32 0), /*src1*/(i32 -1),
                                        SSrc_i1:$src))
>;

def : GCNPat <
  (f64 (uint_to_fp i1:$src)),
  (V_CVT_F64_U32_e32 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
                                        /*src1mod*/(i32 0), /*src1*/(i32 1),
                                        SSrc_i1:$src))
>;

//===----------------------------------------------------------------------===//
// Miscellaneous Patterns
//===----------------------------------------------------------------------===//
def : GCNPat <
  (i32 (AMDGPUfp16_zext f16:$src)),
  (COPY $src)
>;


def : GCNPat <
  (i32 (trunc i64:$a)),
  (EXTRACT_SUBREG $a, sub0)
>;

def : GCNPat <
  (i1 (trunc i32:$a)),
  (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), $a), (i32 1))
>;

def : GCNPat <
  (i1 (trunc i16:$a)),
  (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), $a), (i32 1))
>;

def : GCNPat <
  (i1 (trunc i64:$a)),
  (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1),
                    (i32 (EXTRACT_SUBREG $a, sub0))), (i32 1))
>;

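// v_alignbit_b32 with both vector inputs equal is a rotate right: rotr(a, 24)
// has bytes [2,1,0,3] and rotr(a, 8) has bytes [0,3,2,1] (MSB first).
// v_bfi_b32 with mask 0x00ff00ff then takes bytes 2 and 0 from the first
// operand and bytes 3 and 1 from the second, giving the reversed [0,1,2,3].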
def : GCNPat <
  (i32 (bswap i32:$a)),
  (V_BFI_B32 (S_MOV_B32 (i32 0x00ff00ff)),
             (V_ALIGNBIT_B32 VSrc_b32:$a, VSrc_b32:$a, (i32 24)),
             (V_ALIGNBIT_B32 VSrc_b32:$a, VSrc_b32:$a, (i32 8)))
>;

// FIXME: This should have been narrowed to i32 during legalization.
// This pattern should also be skipped for GlobalISel
def : GCNPat <
  (i64 (bswap i64:$a)),
  (REG_SEQUENCE VReg_64,
  (V_BFI_B32 (S_MOV_B32 (i32 0x00ff00ff)),
             (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
                             (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
                             (i32 24)),
             (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
                             (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
                             (i32 8))),
  sub0,
  (V_BFI_B32 (S_MOV_B32 (i32 0x00ff00ff)),
             (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
                             (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
                             (i32 24)),
             (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
                             (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
                             (i32 8))),
  sub1)
>;

// FIXME: The AddedComplexity should not be needed, but in GlobalISel
// the BFI pattern ends up taking precedence without it.
let SubtargetPredicate = isGFX8Plus, AddedComplexity = 1 in {
// Magic number: 3 | (2 << 8) | (1 << 16) | (0 << 24)
//
// My reading of the manual suggests we should be using src0 for the
// register value, but this is what seems to work.
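// (V_PERM_B32 assembles each result byte from a selector byte in src2:
// selector values 0-3 pick the corresponding byte of src1, 4-7 pick a
// byte of src0, and 12 produces 0x00.)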
def : GCNPat <
  (i32 (bswap i32:$a)),
  (V_PERM_B32 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x00010203)))
>;

// FIXME: This should have been narrowed to i32 during legalization.
// This pattern should also be skipped for GlobalISel
def : GCNPat <
  (i64 (bswap i64:$a)),
  (REG_SEQUENCE VReg_64,
  (V_PERM_B32 (i32 0), (EXTRACT_SUBREG VReg_64:$a, sub1),
              (S_MOV_B32 (i32 0x00010203))),
  sub0,
  (V_PERM_B32 (i32 0), (EXTRACT_SUBREG VReg_64:$a, sub0),
              (S_MOV_B32 (i32 0x00010203))),
  sub1)
>;

// Magic number: 1 | (0 << 8) | (12 << 16) | (12 << 24)
// The 12s emit 0s.
def : GCNPat <
  (i16 (bswap i16:$a)),
  (V_PERM_B32 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x0c0c0001)))
>;

def : GCNPat <
  (i32 (zext (bswap i16:$a))),
  (V_PERM_B32 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x0c0c0001)))
>;

// Magic number: 1 | (0 << 8) | (3 << 16) | (2 << 24)
def : GCNPat <
  (v2i16 (bswap v2i16:$a)),
  (V_PERM_B32 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x02030001)))
>;

}


// Prefer selecting to max when legal, but using mul is always valid.
let AddedComplexity = -5 in {
def : GCNPat<
  (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))),
  (V_MUL_F16_e64 0, (i32 CONST.FP16_ONE), $src_mods, $src)
>;

def : GCNPat<
  (fcanonicalize (f16 (fneg (VOP3Mods f16:$src, i32:$src_mods)))),
  (V_MUL_F16_e64 0, (i32 CONST.FP16_NEG_ONE), $src_mods, $src)
>;

def : GCNPat<
  (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))),
  (V_PK_MUL_F16 0, (i32 CONST.FP16_ONE), $src_mods, $src, DSTCLAMP.NONE)
>;

def : GCNPat<
  (fcanonicalize (f32 (VOP3Mods f32:$src, i32:$src_mods))),
  (V_MUL_F32_e64 0, (i32 CONST.FP32_ONE), $src_mods, $src)
>;

def : GCNPat<
  (fcanonicalize (f32 (fneg (VOP3Mods f32:$src, i32:$src_mods)))),
  (V_MUL_F32_e64 0, (i32 CONST.FP32_NEG_ONE), $src_mods, $src)
>;

// TODO: Handle fneg like other types.
def : GCNPat<
  (fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))),
  (V_MUL_F64 0, CONST.FP64_ONE, $src_mods, $src)
>;
} // End AddedComplexity = -5

multiclass SelectCanonicalizeAsMax<
  list<Predicate> f32_preds = [],
  list<Predicate> f64_preds = [],
  list<Predicate> f16_preds = []> {
  def : GCNPat<
    (fcanonicalize (f32 (VOP3Mods f32:$src, i32:$src_mods))),
    (V_MAX_F32_e64 $src_mods, $src, $src_mods, $src)> {
    let OtherPredicates = f32_preds;
  }

  def : GCNPat<
    (fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))),
    (V_MAX_F64 $src_mods, $src, $src_mods, $src)> {
    let OtherPredicates = f64_preds;
  }

  def : GCNPat<
    (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))),
    (V_MAX_F16_e64 $src_mods, $src, $src_mods, $src, 0, 0)> {
    // FIXME: Should have 16-bit inst subtarget predicate
    let OtherPredicates = f16_preds;
  }

  def : GCNPat<
    (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))),
    (V_PK_MAX_F16 $src_mods, $src, $src_mods, $src, DSTCLAMP.NONE)> {
    // FIXME: Should have VOP3P subtarget predicate
    let OtherPredicates = f16_preds;
  }
}

// On pre-gfx9 targets, v_max_*/v_min_* did not respect the denormal
// mode, and would never flush. For f64, it's faster to implement
// this with a max. For f16/f32 it's a wash, but prefer max when
// valid.
//
// FIXME: Lowering f32/f16 with max is worse since we can use a
// smaller encoding if the input is fneg'd. It also adds an extra
// register use.
let SubtargetPredicate = HasMinMaxDenormModes in {
  defm : SelectCanonicalizeAsMax<[], [], []>;
} // End SubtargetPredicate = HasMinMaxDenormModes

let SubtargetPredicate = NotHasMinMaxDenormModes in {
  // Use the max lowering if we don't need to flush.

  // FIXME: We don't use this for f32 as a workaround for the
  // library being compiled with the default ieee mode, but
  // potentially being called from flushing kernels. Really we should
  // not be mixing code expecting different default FP modes, but mul
  // works in any FP environment.
  defm : SelectCanonicalizeAsMax<[FalsePredicate], [FP64Denormals], [FP16Denormals]>;
} // End SubtargetPredicate = NotHasMinMaxDenormModes


let OtherPredicates = [HasDLInsts] in {
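// V_FMAC's addend is tied to the destination register, so no source
// modifiers can be applied to it; hence VOP3NoMods on $src2.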
def : GCNPat <
  (fma (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)),
       (f32 (VOP3Mods f32:$src1, i32:$src1_modifiers)),
       (f32 (VOP3NoMods f32:$src2))),
  (V_FMAC_F32_e64 $src0_modifiers, $src0, $src1_modifiers, $src1,
                  SRCMODS.NONE, $src2)
>;
} // End OtherPredicates = [HasDLInsts]

let SubtargetPredicate = isGFX10Plus in
def : GCNPat <
  (fma (f16 (VOP3Mods f16:$src0, i32:$src0_modifiers)),
       (f16 (VOP3Mods f16:$src1, i32:$src1_modifiers)),
       (f16 (VOP3NoMods f16:$src2))),
  (V_FMAC_F16_e64 $src0_modifiers, $src0, $src1_modifiers, $src1,
                  SRCMODS.NONE, $src2)
>;

// The COPY_TO_REGCLASS in the patterns below works around a tablegen bug
// with multiple outputs: S_LSHL_B32 has an implicit scc def in addition
// to its result.
def : GCNPat <
  (v2i16 (build_vector (i16 0), (i16 SReg_32:$src1))),
  (S_LSHL_B32 SReg_32:$src1, (i16 16))
>;

def : GCNPat <
  (v2i16 (build_vector (i16 SReg_32:$src0), (i16 undef))),
  (COPY_TO_REGCLASS SReg_32:$src0, SReg_32)
>;

def : GCNPat <
  (v2i16 (build_vector (i16 VGPR_32:$src0), (i16 undef))),
  (COPY_TO_REGCLASS VGPR_32:$src0, VGPR_32)
>;

def : GCNPat <
  (v2f16 (build_vector f16:$src0, (f16 undef))),
  (COPY $src0)
>;

def : GCNPat <
  (v2i16 (build_vector (i16 undef), (i16 SReg_32:$src1))),
  (S_LSHL_B32 SReg_32:$src1, (i32 16))
>;

def : GCNPat <
  (v2f16 (build_vector (f16 undef), (f16 SReg_32:$src1))),
  (S_LSHL_B32 SReg_32:$src1, (i32 16))
>;

let SubtargetPredicate = HasVOP3PInsts in {
def : GCNPat <
  (v2i16 (build_vector (i16 SReg_32:$src0), (i16 SReg_32:$src1))),
  (S_PACK_LL_B32_B16 SReg_32:$src0, SReg_32:$src1)
>;

// With multiple uses of the shift, this will duplicate the shift and
// increase register pressure.
def : GCNPat <
  (v2i16 (build_vector (i16 SReg_32:$src0), (i16 (trunc (srl_oneuse SReg_32:$src1, (i32 16)))))),
  (v2i16 (S_PACK_LH_B32_B16 SReg_32:$src0, SReg_32:$src1))
>;


def : GCNPat <
  (v2i16 (build_vector (i16 (trunc (srl_oneuse SReg_32:$src0, (i32 16)))),
                       (i16 (trunc (srl_oneuse SReg_32:$src1, (i32 16)))))),
  (S_PACK_HH_B32_B16 SReg_32:$src0, SReg_32:$src1)
>;

// TODO: Should source modifiers be matched to v_pack_b32_f16?
def : GCNPat <
  (v2f16 (build_vector (f16 SReg_32:$src0), (f16 SReg_32:$src1))),
  (S_PACK_LL_B32_B16 SReg_32:$src0, SReg_32:$src1)
>;

} // End SubtargetPredicate = HasVOP3PInsts


def : GCNPat <
  (v2f16 (scalar_to_vector f16:$src0)),
  (COPY $src0)
>;

def : GCNPat <
  (v2i16 (scalar_to_vector i16:$src0)),
  (COPY $src0)
>;

def : GCNPat <
  (v4i16 (scalar_to_vector i16:$src0)),
  (INSERT_SUBREG (IMPLICIT_DEF), $src0, sub0)
>;

def : GCNPat <
  (v4f16 (scalar_to_vector f16:$src0)),
  (INSERT_SUBREG (IMPLICIT_DEF), $src0, sub0)
>;

def : GCNPat <
  (i64 (int_amdgcn_mov_dpp i64:$src, timm:$dpp_ctrl, timm:$row_mask,
                           timm:$bank_mask, timm:$bound_ctrl)),
  (V_MOV_B64_DPP_PSEUDO VReg_64:$src, VReg_64:$src,
                        (as_i32timm $dpp_ctrl), (as_i32timm $row_mask),
                        (as_i32timm $bank_mask),
                        (as_i1timm $bound_ctrl))
>;

def : GCNPat <
  (i64 (int_amdgcn_update_dpp i64:$old, i64:$src, timm:$dpp_ctrl, timm:$row_mask,
                              timm:$bank_mask, timm:$bound_ctrl)),
  (V_MOV_B64_DPP_PSEUDO VReg_64:$old, VReg_64:$src, (as_i32timm $dpp_ctrl),
                        (as_i32timm $row_mask), (as_i32timm $bank_mask),
                        (as_i1timm $bound_ctrl))
>;

//===----------------------------------------------------------------------===//
// Fract Patterns
//===----------------------------------------------------------------------===//

let SubtargetPredicate = isGFX6 in {

// V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) is
// used instead. However, SI doesn't have V_FLOOR_F64, so the most efficient
// way to implement it is using V_FRACT_F64.
// The workaround for the V_FRACT bug is:
//    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)

// Convert floor(x) to (x - fract(x))

// Don't bother handling this for GlobalISel, it's handled during
// lowering.
//
// FIXME: DAG should also custom lower this.
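// 0x3fefffffffffffff is the largest f64 strictly below 1.0 (the
// 0.99999999999999999 in the formula above).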
def : GCNPat <
  (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))),
  (V_ADD_F64
      $mods,
      $x,
      SRCMODS.NEG,
      (V_CNDMASK_B64_PSEUDO
         (V_MIN_F64
             SRCMODS.NONE,
             (V_FRACT_F64_e64 $mods, $x),
             SRCMODS.NONE,
             (V_MOV_B64_PSEUDO 0x3fefffffffffffff)),
         $x,
         (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, (i32 3 /*NaN*/))))
>;

} // End SubtargetPredicate = isGFX6

//============================================================================//
// Miscellaneous Optimization Patterns
//============================================================================//

// Undo sub x, c -> add x, -c canonicalization since c is more likely
// an inline immediate than -c.
// TODO: Also do for 64-bit.
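// (For example, (sub x, 64) is canonicalized to (add x, -64): 64 fits the
// integer inline immediate range of -16..64, while -64 needs a literal.)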
def : GCNPat<
  (add i32:$src0, (i32 NegSubInlineConst32:$src1)),
  (S_SUB_I32 SReg_32:$src0, NegSubInlineConst32:$src1)
>;

def : GCNPat<
  (add i32:$src0, (i32 NegSubInlineConst32:$src1)),
  (V_SUB_U32_e64 VS_32:$src0, NegSubInlineConst32:$src1)> {
  let SubtargetPredicate = HasAddNoCarryInsts;
}

def : GCNPat<
  (add i32:$src0, (i32 NegSubInlineConst32:$src1)),
  (V_SUB_I32_e64 VS_32:$src0, NegSubInlineConst32:$src1)> {
  let SubtargetPredicate = NotHasAddNoCarryInsts;
}


// Avoid pointlessly materializing a constant in VGPR.
// FIXME: Should also do this for readlane, but tablegen crashes on
// the ignored src1.
def : GCNPat<
  (int_amdgcn_readfirstlane (i32 imm:$src)),
  (S_MOV_B32 SReg_32:$src)
>;

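// S_BFM_B32 computes ((1 << src0) - 1) << src1, i.e. a field of src0
// consecutive set bits starting at bit src1, matching the DAGs below.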
multiclass BFMPatterns <ValueType vt, InstSI BFM, InstSI MOV> {
  def : GCNPat <
    (vt (shl (vt (add (vt (shl 1, vt:$a)), -1)), vt:$b)),
    (BFM $a, $b)
  >;

  def : GCNPat <
    (vt (add (vt (shl 1, vt:$a)), -1)),
    (BFM $a, (MOV (i32 0)))
  >;
}

defm : BFMPatterns <i32, S_BFM_B32, S_MOV_B32>;
// FIXME: defm : BFMPatterns <i64, S_BFM_B64, S_MOV_B64>;

defm : BFEPattern <V_BFE_U32, V_BFE_I32, S_MOV_B32>;
defm : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e64, SReg_64>;

multiclass IntMed3Pat<Instruction med3Inst,
                      SDPatternOperator min,
                      SDPatternOperator max,
                      SDPatternOperator min_oneuse,
                      SDPatternOperator max_oneuse> {

  // This matches 16 permutations of
  // min(max(a, b), max(min(a, b), c))
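  // (min and max commute, so each of the four nodes admits two operand
  // orders: 2^4 = 16 equivalent DAG shapes.)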
  def : AMDGPUPat <
  (min (max_oneuse i32:$src0, i32:$src1),
       (max_oneuse (min_oneuse i32:$src0, i32:$src1), i32:$src2)),
  (med3Inst VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2)
>;

  // This matches 16 permutations of
  // max(min(x, y), min(max(x, y), z))
  def : AMDGPUPat <
  (max (min_oneuse i32:$src0, i32:$src1),
       (min_oneuse (max_oneuse i32:$src0, i32:$src1), i32:$src2)),
  (med3Inst VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2)
>;
}

defm : IntMed3Pat<V_MED3_I32, smin, smax, smin_oneuse, smax_oneuse>;
defm : IntMed3Pat<V_MED3_U32, umin, umax, umin_oneuse, umax_oneuse>;

// This matches 16 permutations of
// max(min(x, y), min(max(x, y), z))
class FPMed3Pat<ValueType vt,
                //SDPatternOperator max, SDPatternOperator min,
                Instruction med3Inst> : GCNPat<
  (fmaxnum_like (fminnum_like_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
                                     (VOP3Mods_nnan vt:$src1, i32:$src1_mods)),
                (fminnum_like_oneuse (fmaxnum_like_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
                                                          (VOP3Mods_nnan vt:$src1, i32:$src1_mods)),
                                     (vt (VOP3Mods_nnan vt:$src2, i32:$src2_mods)))),
  (med3Inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
>;

class FP16Med3Pat<ValueType vt,
                  Instruction med3Inst> : GCNPat<
  (fmaxnum_like (fminnum_like_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
                                     (VOP3Mods_nnan vt:$src1, i32:$src1_mods)),
                (fminnum_like_oneuse (fmaxnum_like_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
                                                          (VOP3Mods_nnan vt:$src1, i32:$src1_mods)),
                                     (vt (VOP3Mods_nnan vt:$src2, i32:$src2_mods)))),
  (med3Inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, DSTCLAMP.NONE)
>;

multiclass Int16Med3Pat<Instruction med3Inst,
                        SDPatternOperator min,
                        SDPatternOperator max,
                        SDPatternOperator max_oneuse,
                        SDPatternOperator min_oneuse> {
  // This matches 16 permutations of
  // max(min(x, y), min(max(x, y), z))
  def : GCNPat <
  (max (min_oneuse i16:$src0, i16:$src1),
       (min_oneuse (max_oneuse i16:$src0, i16:$src1), i16:$src2)),
  (med3Inst SRCMODS.NONE, VSrc_b16:$src0, SRCMODS.NONE, VSrc_b16:$src1, SRCMODS.NONE, VSrc_b16:$src2, DSTCLAMP.NONE)
>;

  // This matches 16 permutations of
  // min(max(a, b), max(min(a, b), c))
  def : GCNPat <
  (min (max_oneuse i16:$src0, i16:$src1),
       (max_oneuse (min_oneuse i16:$src0, i16:$src1), i16:$src2)),
  (med3Inst SRCMODS.NONE, VSrc_b16:$src0, SRCMODS.NONE, VSrc_b16:$src1, SRCMODS.NONE, VSrc_b16:$src2, DSTCLAMP.NONE)
>;
}

def : FPMed3Pat<f32, V_MED3_F32>;

let OtherPredicates = [isGFX9Plus] in {
def : FP16Med3Pat<f16, V_MED3_F16>;
defm : Int16Med3Pat<V_MED3_I16, smin, smax, smax_oneuse, smin_oneuse>;
defm : Int16Med3Pat<V_MED3_U16, umin, umax, umax_oneuse, umin_oneuse>;
} // End OtherPredicates = [isGFX9Plus]

class AMDGPUGenericInstruction : GenericInstruction {
  let Namespace = "AMDGPU";
}

def G_AMDGPU_FFBH_U32 : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type1:$src);
  let hasSideEffects = 0;
}

def G_AMDGPU_RCP_IFLAG : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type1:$src);
  let hasSideEffects = 0;
}

class BufferLoadGenericInstruction : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type1:$rsrc, type2:$vindex, type2:$voffset,
                           type2:$soffset, untyped_imm_0:$offset,
                           untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
  let hasSideEffects = 0;
  let mayLoad = 1;
}

class TBufferLoadGenericInstruction : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type1:$rsrc, type2:$vindex, type2:$voffset,
                           type2:$soffset, untyped_imm_0:$offset, untyped_imm_0:$format,
                           untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
  let hasSideEffects = 0;
  let mayLoad = 1;
}

def G_AMDGPU_BUFFER_LOAD_UBYTE : BufferLoadGenericInstruction;
def G_AMDGPU_BUFFER_LOAD_SBYTE : BufferLoadGenericInstruction;
def G_AMDGPU_BUFFER_LOAD_USHORT : BufferLoadGenericInstruction;
def G_AMDGPU_BUFFER_LOAD_SSHORT : BufferLoadGenericInstruction;
def G_AMDGPU_BUFFER_LOAD : BufferLoadGenericInstruction;
def G_AMDGPU_BUFFER_LOAD_FORMAT : BufferLoadGenericInstruction;
def G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : BufferLoadGenericInstruction;
def G_AMDGPU_TBUFFER_LOAD_FORMAT : TBufferLoadGenericInstruction;
def G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : TBufferLoadGenericInstruction;

class BufferStoreGenericInstruction : AMDGPUGenericInstruction {
  let OutOperandList = (outs);
  let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset,
                           type2:$soffset, untyped_imm_0:$offset,
                           untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
  let hasSideEffects = 0;
  let mayStore = 1;
}

class TBufferStoreGenericInstruction : AMDGPUGenericInstruction {
  let OutOperandList = (outs);
  let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset,
                           type2:$soffset, untyped_imm_0:$offset,
                           untyped_imm_0:$format,
                           untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
  let hasSideEffects = 0;
  let mayStore = 1;
}

def G_AMDGPU_BUFFER_STORE : BufferStoreGenericInstruction;
def G_AMDGPU_BUFFER_STORE_BYTE : BufferStoreGenericInstruction;
def G_AMDGPU_BUFFER_STORE_SHORT : BufferStoreGenericInstruction;
def G_AMDGPU_BUFFER_STORE_FORMAT : BufferStoreGenericInstruction;
def G_AMDGPU_BUFFER_STORE_FORMAT_D16 : BufferStoreGenericInstruction;
def G_AMDGPU_TBUFFER_STORE_FORMAT : TBufferStoreGenericInstruction;
def G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : TBufferStoreGenericInstruction;

def G_AMDGPU_FMIN_LEGACY : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type0:$src0, type0:$src1);
  let hasSideEffects = 0;
}

def G_AMDGPU_FMAX_LEGACY : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type0:$src0, type0:$src1);
  let hasSideEffects = 0;
}

foreach N = 0-3 in {
def G_AMDGPU_CVT_F32_UBYTE#N : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type0:$src0);
  let hasSideEffects = 0;
}
}

// Atomic cmpxchg. $cmpval and $newval are packed in a single vector
// operand. Expects a MachineMemOperand in addition to explicit
// operands.
def G_AMDGPU_ATOMIC_CMPXCHG : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$oldval);
  let InOperandList = (ins ptype1:$addr, type0:$cmpval_newval);
  let hasSideEffects = 0;
  let mayLoad = 1;
  let mayStore = 1;
}

let Namespace = "AMDGPU" in {
def G_AMDGPU_ATOMIC_INC : G_ATOMICRMW_OP;
def G_AMDGPU_ATOMIC_DEC : G_ATOMICRMW_OP;
}

class BufferAtomicGenericInstruction : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset,
                           type2:$soffset, untyped_imm_0:$offset,
                           untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
  let hasSideEffects = 0;
  let mayLoad = 1;
  let mayStore = 1;
}

def G_AMDGPU_BUFFER_ATOMIC_SWAP : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_ADD : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_SUB : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_SMIN : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_UMIN : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_SMAX : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_UMAX : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_AND : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_OR : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_XOR : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_INC : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_DEC : BufferAtomicGenericInstruction;

def G_AMDGPU_BUFFER_ATOMIC_CMPSWAP : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type0:$vdata, type0:$cmp, type1:$rsrc, type2:$vindex,
                           type2:$voffset, type2:$soffset, untyped_imm_0:$offset,
                           untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
  let hasSideEffects = 0;
  let mayLoad = 1;
  let mayStore = 1;
}

// Wrapper around llvm.amdgcn.s.buffer.load. This is mostly needed as
// a workaround for the intrinsic being defined as readnone even
// though it really needs a memory operand.
def G_AMDGPU_S_BUFFER_LOAD : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type1:$rsrc, type2:$offset, untyped_imm_0:$cachepolicy);
  let hasSideEffects = 0;
  let mayLoad = 1;
  let mayStore = 0;
}

// This is equivalent to the G_INTRINSIC*, but the operands may have
// been legalized depending on the subtarget requirements.
def G_AMDGPU_INTRIN_IMAGE_LOAD : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins unknown:$intrin, variable_ops);
  let hasSideEffects = 0;
  let mayLoad = 1;

  // FIXME: Use separate opcode for atomics.
  let mayStore = 1;
}

// This is equivalent to the G_INTRINSIC*, but the operands may have
// been legalized depending on the subtarget requirements.
def G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPUGenericInstruction {
  let OutOperandList = (outs);
  let InOperandList = (ins unknown:$intrin, variable_ops);
  let hasSideEffects = 0;
  let mayStore = 1;
}
