xref: /freebsd/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td (revision 7fdf597e96a02165cfe22ff357b857d5fa15ed8a)
1//===- NVPTXInstrInfo.td - NVPTX Instruction defs -------------*- tblgen-*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file describes the PTX instructions in TableGen format.
10//
11//===----------------------------------------------------------------------===//
12
13include "NVPTXInstrFormats.td"
14
// Immediate operands for the two 16-bit floating-point value types.  Each
// record sets its own OperandType to OPERAND_IMMEDIATE.
def f16imm : Operand<f16> {
  let OperandType = "OPERAND_IMMEDIATE";
}
def bf16imm : Operand<bf16> {
  let OperandType = "OPERAND_IMMEDIATE";
}
20
// Vector-specific instruction properties.  Every record wraps one distinct
// VecInstTypeEnum value; 8 through 14 are not assigned here.
def isVecLD : VecInstTypeEnum<1>;
def isVecST : VecInstTypeEnum<2>;
def isVecBuild : VecInstTypeEnum<3>;
def isVecShuffle : VecInstTypeEnum<4>;
def isVecExtract : VecInstTypeEnum<5>;
def isVecInsert : VecInstTypeEnum<6>;
def isVecDest : VecInstTypeEnum<7>;
def isVecOther : VecInstTypeEnum<15>;
30
31//===----------------------------------------------------------------------===//
32// NVPTX Operand Definitions.
33//===----------------------------------------------------------------------===//
34
// Operand of type OtherVT; going by its name it carries a branch target.
def brtarget : Operand<OtherVT>;
36
// CVT conversion modes.
// The numeric values must stay in sync with the enum in NVPTX.h.

// Base modes.
def CvtNONE : PatLeaf<(i32 0x0)>;
def CvtRNI : PatLeaf<(i32 0x1)>;
def CvtRZI : PatLeaf<(i32 0x2)>;
def CvtRMI : PatLeaf<(i32 0x3)>;
def CvtRPI : PatLeaf<(i32 0x4)>;
def CvtRN : PatLeaf<(i32 0x5)>;
def CvtRZ : PatLeaf<(i32 0x6)>;
def CvtRM : PatLeaf<(i32 0x7)>;
def CvtRP : PatLeaf<(i32 0x8)>;
def CvtRNA : PatLeaf<(i32 0x9)>;

// FTZ forms: the matching base value plus 0x10.
def CvtNONE_FTZ : PatLeaf<(i32 0x10)>;
def CvtRNI_FTZ : PatLeaf<(i32 0x11)>;
def CvtRZI_FTZ : PatLeaf<(i32 0x12)>;
def CvtRMI_FTZ : PatLeaf<(i32 0x13)>;
def CvtRPI_FTZ : PatLeaf<(i32 0x14)>;
def CvtRN_FTZ : PatLeaf<(i32 0x15)>;
def CvtRZ_FTZ : PatLeaf<(i32 0x16)>;
def CvtRM_FTZ : PatLeaf<(i32 0x17)>;
def CvtRP_FTZ : PatLeaf<(i32 0x18)>;

// Saturating forms.
def CvtSAT : PatLeaf<(i32 0x20)>;
def CvtSAT_FTZ : PatLeaf<(i32 0x30)>;

// RELU forms: the matching base value plus 0x40.
def CvtNONE_RELU : PatLeaf<(i32 0x40)>;
def CvtRN_RELU : PatLeaf<(i32 0x45)>;
def CvtRZ_RELU : PatLeaf<(i32 0x46)>;

// i32 operand holding one of the mode values above; the asm printer renders
// it through printCvtMode.
def CvtMode : Operand<i32> {
  let PrintMethod = "printCvtMode";
}
70
// Compare modes.
// The numeric values must stay in sync with the enum in NVPTX.h.

// Base modes.
def CmpEQ : PatLeaf<(i32 0x0)>;
def CmpNE : PatLeaf<(i32 0x1)>;
def CmpLT : PatLeaf<(i32 0x2)>;
def CmpLE : PatLeaf<(i32 0x3)>;
def CmpGT : PatLeaf<(i32 0x4)>;
def CmpGE : PatLeaf<(i32 0x5)>;
def CmpLO : PatLeaf<(i32 0x6)>;
def CmpLS : PatLeaf<(i32 0x7)>;
def CmpHI : PatLeaf<(i32 0x8)>;
def CmpHS : PatLeaf<(i32 0x9)>;
def CmpEQU : PatLeaf<(i32 0xA)>;
def CmpNEU : PatLeaf<(i32 0xB)>;
def CmpLTU : PatLeaf<(i32 0xC)>;
def CmpLEU : PatLeaf<(i32 0xD)>;
def CmpGTU : PatLeaf<(i32 0xE)>;
def CmpGEU : PatLeaf<(i32 0xF)>;
def CmpNUM : PatLeaf<(i32 0x10)>;
def CmpNAN : PatLeaf<(i32 0x11)>;

// FTZ forms: the matching base value plus 0x100.  There are no FTZ
// counterparts for CmpLO, CmpLS, CmpHI and CmpHS.
def CmpEQ_FTZ : PatLeaf<(i32 0x100)>;
def CmpNE_FTZ : PatLeaf<(i32 0x101)>;
def CmpLT_FTZ : PatLeaf<(i32 0x102)>;
def CmpLE_FTZ : PatLeaf<(i32 0x103)>;
def CmpGT_FTZ : PatLeaf<(i32 0x104)>;
def CmpGE_FTZ : PatLeaf<(i32 0x105)>;
def CmpEQU_FTZ : PatLeaf<(i32 0x10A)>;
def CmpNEU_FTZ : PatLeaf<(i32 0x10B)>;
def CmpLTU_FTZ : PatLeaf<(i32 0x10C)>;
def CmpLEU_FTZ : PatLeaf<(i32 0x10D)>;
def CmpGTU_FTZ : PatLeaf<(i32 0x10E)>;
def CmpGEU_FTZ : PatLeaf<(i32 0x10F)>;
def CmpNUM_FTZ : PatLeaf<(i32 0x110)>;
def CmpNAN_FTZ : PatLeaf<(i32 0x111)>;

// i32 operand holding one of the mode values above; the asm printer renders
// it through printCmpMode.
def CmpMode : Operand<i32> {
  let PrintMethod = "printCmpMode";
}
// i32 operand rendered by the asm printer through printVecElement.
def VecElement : Operand<i32> {
  let PrintMethod = "printVecElement";
}
113
// PRMT modes.
// The numeric values must stay in sync with the enum in NVPTX.h.
def PrmtNONE : PatLeaf<(i32 0x0)>;
def PrmtF4E : PatLeaf<(i32 0x1)>;
def PrmtB4E : PatLeaf<(i32 0x2)>;
def PrmtRC8 : PatLeaf<(i32 0x3)>;
def PrmtECL : PatLeaf<(i32 0x4)>;
def PrmtECR : PatLeaf<(i32 0x5)>;
def PrmtRC16 : PatLeaf<(i32 0x6)>;

// i32 operand holding one of the mode values above; the asm printer renders
// it through printPrmtMode.
def PrmtMode : Operand<i32> {
  let PrintMethod = "printPrmtMode";
}
127
128
129//===----------------------------------------------------------------------===//
130// NVPTX Instruction Predicate Definitions
131//===----------------------------------------------------------------------===//
132
133
// Predicates that forward to a query on the subtarget.
def hasAtomAddF64    : Predicate<"Subtarget->hasAtomAddF64()">;
def hasAtomScope     : Predicate<"Subtarget->hasAtomScope()">;
def hasAtomBitwise64 : Predicate<"Subtarget->hasAtomBitwise64()">;
def hasAtomMinMax64  : Predicate<"Subtarget->hasAtomMinMax64()">;
def hasVote          : Predicate<"Subtarget->hasVote()">;
def hasDouble        : Predicate<"Subtarget->hasDouble()">;
def hasLDG           : Predicate<"Subtarget->hasLDG()">;
def hasLDU           : Predicate<"Subtarget->hasLDU()">;

// f32 flush-to-zero selection and the rsqrt optimization switch.
def doF32FTZ   : Predicate<"useF32FTZ()">;
def doNoF32FTZ : Predicate<"!useF32FTZ()">;
def doRsqrtOpt : Predicate<"doRsqrtOpt()">;

// Note: unlike its neighbours, this predicate string is a bare identifier,
// not a call expression.
def doMulWide : Predicate<"doMulWide">;

// FMA contraction and unsafe-fp-math, each with its negation.
def allowFMA          : Predicate<"allowFMA()">;
def noFMA             : Predicate<"!allowFMA()">;
def allowUnsafeFPMath : Predicate<"allowUnsafeFPMath()">;
def noUnsafeFPMath    : Predicate<"!allowUnsafeFPMath()">;

// f32 division level: 0 selects the approximate form, 1 the full form.
def do_DIVF32_APPROX : Predicate<"getDivF32Level()==0">;
def do_DIVF32_FULL   : Predicate<"getDivF32Level()==1">;

// f32 square root: approximate unless precise sqrt is requested.
def do_SQRTF32_APPROX : Predicate<"!usePrecSqrtF32()">;
def do_SQRTF32_RN     : Predicate<"usePrecSqrtF32()">;

// Hardware 32-bit rotate support and its negation.
def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">;
def noHWROT32  : Predicate<"!Subtarget->hasHWROT32()">;

// Constant predicates.
def True  : Predicate<"true">;
def False : Predicate<"false">;
165
// Minimum-version predicates: hasPTX<N> / hasSM<N> hold when the subtarget's
// PTX / SM version is at least N.
class hasPTX<int version>
  : Predicate<"Subtarget->getPTXVersion() >= " # version>;
class hasSM<int version>
  : Predicate<"Subtarget->getSmVersion() >= " # version>;

// Explicit records for arch-accelerated SM versions.  This one is an exact
// match on the full SM version, not a lower bound.
def hasSM90a : Predicate<"Subtarget->getFullSmVersion() == 901">;
171
// Non-sync shfl instructions are not available on sm_70+ in PTX6.4+, so this
// predicate is the negation of (SM >= 70 and PTX >= 64).
def hasSHFL
  : Predicate<"!(Subtarget->getSmVersion() >= 70"
              "&& Subtarget->getPTXVersion() >= 64)">;

// Short-pointer predicates: true for a 64-bit target whose pointers in the
// named address space are 32 bits wide.
def useShortPtrLocal
  : Predicate<"TM.is64Bit() && TM.getPointerSizeInBits(ADDRESS_SPACE_LOCAL) == 32">;
def useShortPtrShared
  : Predicate<"TM.is64Bit() && TM.getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32">;
def useShortPtrConst
  : Predicate<"TM.is64Bit() && TM.getPointerSizeInBits(ADDRESS_SPACE_CONST) == 32">;

// Half-precision math availability as reported by the subtarget.
def useFP16Math : Predicate<"Subtarget->allowFP16Math()">;
def hasBF16Math : Predicate<"Subtarget->hasBF16Math()">;
182
// Helper class to aid conversion between ValueType and a matching RegisterClass.
//
// `name` is the record name of T converted to a string (e.g. "i32", "v2f16")
// and `ret` is the register class chosen for that name.  16-bit scalar types
// (i16, f16, bf16) map to Int16Regs, the two-element 16-bit vector types map
// to Int32Regs, and f32/f64 map to the float register classes.  The !cond has
// no catch-all arm, so a type name not listed here has no mapping.
class ValueToRegClass<ValueType T> {
   string name = !cast<string>(T);
   NVPTXRegClass ret = !cond(
     !eq(name, "i1"): Int1Regs,
     !eq(name, "i16"): Int16Regs,
     !eq(name, "v2i16"): Int32Regs,
     !eq(name, "i32"): Int32Regs,
     !eq(name, "i64"): Int64Regs,
     !eq(name, "f16"): Int16Regs,
     !eq(name, "v2f16"): Int32Regs,
     !eq(name, "bf16"): Int16Regs,
     !eq(name, "v2bf16"): Int32Regs,
     !eq(name, "f32"): Float32Regs,
     !eq(name, "f64"): Float64Regs,
     // Argument register classes, keyed "a<type>".
     !eq(name, "ai32"): Int32ArgRegs,
     !eq(name, "ai64"): Int64ArgRegs,
     !eq(name, "af32"): Float32ArgRegs,
     !eq(name, "af64"): Float64ArgRegs,
     // "if64" breaks the "a<type>" naming of its siblings and looks like a
     // misspelling of "af64"; it is kept so any existing lookup by that key
     // still resolves.
     !eq(name, "if64"): Float64ArgRegs,
    );
}
205
206
207//===----------------------------------------------------------------------===//
208// Some Common Instruction Class Templates
209//===----------------------------------------------------------------------===//
210
// Two-source integer instruction template for 64-, 32- and 16-bit operands.
// Each width gets a register/register ("rr") and a register/immediate ("ri")
// form.  The mnemonic is "<OpcStr><Width>" (e.g. "add.s64"), so OpcStr is
// expected to end with the type letter.
multiclass I3<string OpcStr, SDNode OpNode> {
  // 64-bit forms.
  def i64rr : NVPTXInst<(outs Int64Regs:$dst),
                        (ins Int64Regs:$a, Int64Regs:$b),
                        OpcStr # "64 \t$dst, $a, $b;",
                        [(set Int64Regs:$dst,
                              (OpNode Int64Regs:$a, Int64Regs:$b))]>;
  def i64ri : NVPTXInst<(outs Int64Regs:$dst),
                        (ins Int64Regs:$a, i64imm:$b),
                        OpcStr # "64 \t$dst, $a, $b;",
                        [(set Int64Regs:$dst,
                              (OpNode Int64Regs:$a, imm:$b))]>;

  // 32-bit forms.  Register operands are typed explicitly as i32.
  def i32rr : NVPTXInst<(outs Int32Regs:$dst),
                        (ins Int32Regs:$a, Int32Regs:$b),
                        OpcStr # "32 \t$dst, $a, $b;",
                        [(set Int32Regs:$dst,
                              (OpNode (i32 Int32Regs:$a),
                                      (i32 Int32Regs:$b)))]>;
  def i32ri : NVPTXInst<(outs Int32Regs:$dst),
                        (ins Int32Regs:$a, i32imm:$b),
                        OpcStr # "32 \t$dst, $a, $b;",
                        [(set Int32Regs:$dst,
                              (OpNode (i32 Int32Regs:$a), imm:$b))]>;

  // 16-bit forms.
  def i16rr : NVPTXInst<(outs Int16Regs:$dst),
                        (ins Int16Regs:$a, Int16Regs:$b),
                        OpcStr # "16 \t$dst, $a, $b;",
                        [(set Int16Regs:$dst,
                              (OpNode Int16Regs:$a, Int16Regs:$b))]>;
  def i16ri : NVPTXInst<(outs Int16Regs:$dst),
                        (ins Int16Regs:$a, i16imm:$b),
                        OpcStr # "16 \t$dst, $a, $b;",
                        [(set Int16Regs:$dst,
                              (OpNode Int16Regs:$a, (imm):$b))]>;
}
239
// Two-source v2i16 instruction held in 32-bit registers.  The mnemonic is
// "<OpcStr>16x2"; requires PTX >= 80 and SM >= 90.
class I16x2<string OpcStr, SDNode OpNode>
  : NVPTXInst<(outs Int32Regs:$dst),
              (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # "16x2 \t$dst, $a, $b;",
              [(set Int32Regs:$dst,
                    (OpNode (v2i16 Int32Regs:$a), (v2i16 Int32Regs:$b)))]>,
    Requires<[hasPTX<80>, hasSM<90>]>;
245
// Template for the carry-using add/sub instructions.  Provides 32-bit forms
// named "<OpcStr>.s32" (e.g. "addc.cc.s32") and 64-bit forms named
// "<OpcStr>.s64"; the 64-bit forms additionally require PTX >= 43.  All
// variants are marked as having side effects.
multiclass ADD_SUB_INT_CARRY<string OpcStr, SDNode OpNode> {
  let hasSideEffects = 1 in {
    // 32-bit forms.
    def i32rr : NVPTXInst<(outs Int32Regs:$dst),
                          (ins Int32Regs:$a, Int32Regs:$b),
                          OpcStr # ".s32 \t$dst, $a, $b;",
                          [(set Int32Regs:$dst,
                                (OpNode (i32 Int32Regs:$a),
                                        (i32 Int32Regs:$b)))]>;
    def i32ri : NVPTXInst<(outs Int32Regs:$dst),
                          (ins Int32Regs:$a, i32imm:$b),
                          OpcStr # ".s32 \t$dst, $a, $b;",
                          [(set Int32Regs:$dst,
                                (OpNode (i32 Int32Regs:$a), imm:$b))]>;

    // 64-bit forms.
    def i64rr : NVPTXInst<(outs Int64Regs:$dst),
                          (ins Int64Regs:$a, Int64Regs:$b),
                          OpcStr # ".s64 \t$dst, $a, $b;",
                          [(set Int64Regs:$dst,
                                (OpNode Int64Regs:$a, Int64Regs:$b))]>,
                Requires<[hasPTX<43>]>;
    def i64ri : NVPTXInst<(outs Int64Regs:$dst),
                          (ins Int64Regs:$a, i64imm:$b),
                          OpcStr # ".s64 \t$dst, $a, $b;",
                          [(set Int64Regs:$dst,
                                (OpNode Int64Regs:$a, imm:$b))]>,
                Requires<[hasPTX<43>]>;
  }
}
270
// Two-source floating-point instruction template.  The instructions are named
// "<OpcStr>.<type>" (e.g. "min.f64") and cover f64, f32, f16, f16x2, bf16 and
// bf16x2.  f64 and f32 have register/register ("rr") and register/immediate
// ("ri") forms; the 16-bit types only have "rr" forms.
//
// Every type except f64 also gets an ".ftz" variant (flush subnormal inputs
// and results to sign-preserving zero), selected by doF32FTZ.  Each ftz
// variant is defined before its plain counterpart.
//
// This multiclass should be used for nodes that cannot be folded into FMAs.
// For nodes that can be folded into FMAs (i.e. adds and muls), use
// F3_fma_component.
multiclass F3<string OpcStr, SDNode OpNode> {
  // ---- f64 ----
  def f64rr : NVPTXInst<(outs Float64Regs:$dst),
                        (ins Float64Regs:$a, Float64Regs:$b),
                        OpcStr # ".f64 \t$dst, $a, $b;",
                        [(set Float64Regs:$dst,
                              (OpNode Float64Regs:$a, Float64Regs:$b))]>;
  def f64ri : NVPTXInst<(outs Float64Regs:$dst),
                        (ins Float64Regs:$a, f64imm:$b),
                        OpcStr # ".f64 \t$dst, $a, $b;",
                        [(set Float64Regs:$dst,
                              (OpNode Float64Regs:$a, fpimm:$b))]>;

  // ---- f32 ----
  def f32rr_ftz : NVPTXInst<(outs Float32Regs:$dst),
                            (ins Float32Regs:$a, Float32Regs:$b),
                            OpcStr # ".ftz.f32 \t$dst, $a, $b;",
                            [(set Float32Regs:$dst,
                                  (OpNode Float32Regs:$a, Float32Regs:$b))]>,
                  Requires<[doF32FTZ]>;
  def f32ri_ftz : NVPTXInst<(outs Float32Regs:$dst),
                            (ins Float32Regs:$a, f32imm:$b),
                            OpcStr # ".ftz.f32 \t$dst, $a, $b;",
                            [(set Float32Regs:$dst,
                                  (OpNode Float32Regs:$a, fpimm:$b))]>,
                  Requires<[doF32FTZ]>;
  def f32rr : NVPTXInst<(outs Float32Regs:$dst),
                        (ins Float32Regs:$a, Float32Regs:$b),
                        OpcStr # ".f32 \t$dst, $a, $b;",
                        [(set Float32Regs:$dst,
                              (OpNode Float32Regs:$a, Float32Regs:$b))]>;
  def f32ri : NVPTXInst<(outs Float32Regs:$dst),
                        (ins Float32Regs:$a, f32imm:$b),
                        OpcStr # ".f32 \t$dst, $a, $b;",
                        [(set Float32Regs:$dst,
                              (OpNode Float32Regs:$a, fpimm:$b))]>;

  // ---- f16 (held in Int16Regs) ----
  def f16rr_ftz : NVPTXInst<(outs Int16Regs:$dst),
                            (ins Int16Regs:$a, Int16Regs:$b),
                            OpcStr # ".ftz.f16 \t$dst, $a, $b;",
                            [(set Int16Regs:$dst,
                                  (OpNode (f16 Int16Regs:$a),
                                          (f16 Int16Regs:$b)))]>,
                  Requires<[useFP16Math, doF32FTZ]>;
  def f16rr : NVPTXInst<(outs Int16Regs:$dst),
                        (ins Int16Regs:$a, Int16Regs:$b),
                        OpcStr # ".f16 \t$dst, $a, $b;",
                        [(set Int16Regs:$dst,
                              (OpNode (f16 Int16Regs:$a),
                                      (f16 Int16Regs:$b)))]>,
              Requires<[useFP16Math]>;

  // ---- f16x2 (held in Int32Regs) ----
  def f16x2rr_ftz : NVPTXInst<(outs Int32Regs:$dst),
                              (ins Int32Regs:$a, Int32Regs:$b),
                              OpcStr # ".ftz.f16x2 \t$dst, $a, $b;",
                              [(set Int32Regs:$dst,
                                    (OpNode (v2f16 Int32Regs:$a),
                                            (v2f16 Int32Regs:$b)))]>,
                    Requires<[useFP16Math, doF32FTZ]>;
  def f16x2rr : NVPTXInst<(outs Int32Regs:$dst),
                          (ins Int32Regs:$a, Int32Regs:$b),
                          OpcStr # ".f16x2 \t$dst, $a, $b;",
                          [(set Int32Regs:$dst,
                                (OpNode (v2f16 Int32Regs:$a),
                                        (v2f16 Int32Regs:$b)))]>,
                Requires<[useFP16Math]>;

  // ---- bf16 (held in Int16Regs) ----
  def bf16rr_ftz : NVPTXInst<(outs Int16Regs:$dst),
                             (ins Int16Regs:$a, Int16Regs:$b),
                             OpcStr # ".ftz.bf16 \t$dst, $a, $b;",
                             [(set Int16Regs:$dst,
                                   (OpNode (bf16 Int16Regs:$a),
                                           (bf16 Int16Regs:$b)))]>,
                   Requires<[hasBF16Math, doF32FTZ]>;
  def bf16rr : NVPTXInst<(outs Int16Regs:$dst),
                         (ins Int16Regs:$a, Int16Regs:$b),
                         OpcStr # ".bf16 \t$dst, $a, $b;",
                         [(set Int16Regs:$dst,
                               (OpNode (bf16 Int16Regs:$a),
                                       (bf16 Int16Regs:$b)))]>,
               Requires<[hasBF16Math]>;

  // ---- bf16x2 (held in Int32Regs) ----
  def bf16x2rr_ftz : NVPTXInst<(outs Int32Regs:$dst),
                               (ins Int32Regs:$a, Int32Regs:$b),
                               OpcStr # ".ftz.bf16x2 \t$dst, $a, $b;",
                               [(set Int32Regs:$dst,
                                     (OpNode (v2bf16 Int32Regs:$a),
                                             (v2bf16 Int32Regs:$b)))]>,
                     Requires<[hasBF16Math, doF32FTZ]>;
  def bf16x2rr : NVPTXInst<(outs Int32Regs:$dst),
                           (ins Int32Regs:$a, Int32Regs:$b),
                           OpcStr # ".bf16x2 \t$dst, $a, $b;",
                           [(set Int32Regs:$dst,
                                 (OpNode (v2bf16 Int32Regs:$a),
                                         (v2bf16 Int32Regs:$b)))]>,
                 Requires<[hasBF16Math]>;
}
365
// Two-source floating-point instruction template for nodes that can be folded
// to make fma ops.  The instructions are named "<OpcStr>.<type>"
// (e.g. "add.f64") and cover f64, f32, f16, f16x2, bf16 and bf16x2.
//
// Every type except f64 also gets an ".ftz" variant (flush subnormal inputs
// and results to sign-preserving zero), selected by doF32FTZ.
//
// Two parallel families are defined:
//   * the plain forms, selected when allowFMA holds;
//   * the ".rn" forms, selected when noFMA holds.  ".rn" behaves just like
//     the non-".rn" op but prevents ptxas from creating FMAs.
multiclass F3_fma_component<string OpcStr, SDNode OpNode> {
  //===--- FMA contraction allowed: plain mnemonics -----------------------===//

  // ---- f64 ----
  def f64rr : NVPTXInst<(outs Float64Regs:$dst),
                        (ins Float64Regs:$a, Float64Regs:$b),
                        OpcStr # ".f64 \t$dst, $a, $b;",
                        [(set Float64Regs:$dst,
                              (OpNode Float64Regs:$a, Float64Regs:$b))]>,
              Requires<[allowFMA]>;
  def f64ri : NVPTXInst<(outs Float64Regs:$dst),
                        (ins Float64Regs:$a, f64imm:$b),
                        OpcStr # ".f64 \t$dst, $a, $b;",
                        [(set Float64Regs:$dst,
                              (OpNode Float64Regs:$a, fpimm:$b))]>,
              Requires<[allowFMA]>;

  // ---- f32 ----
  def f32rr_ftz : NVPTXInst<(outs Float32Regs:$dst),
                            (ins Float32Regs:$a, Float32Regs:$b),
                            OpcStr # ".ftz.f32 \t$dst, $a, $b;",
                            [(set Float32Regs:$dst,
                                  (OpNode Float32Regs:$a, Float32Regs:$b))]>,
                  Requires<[allowFMA, doF32FTZ]>;
  def f32ri_ftz : NVPTXInst<(outs Float32Regs:$dst),
                            (ins Float32Regs:$a, f32imm:$b),
                            OpcStr # ".ftz.f32 \t$dst, $a, $b;",
                            [(set Float32Regs:$dst,
                                  (OpNode Float32Regs:$a, fpimm:$b))]>,
                  Requires<[allowFMA, doF32FTZ]>;
  def f32rr : NVPTXInst<(outs Float32Regs:$dst),
                        (ins Float32Regs:$a, Float32Regs:$b),
                        OpcStr # ".f32 \t$dst, $a, $b;",
                        [(set Float32Regs:$dst,
                              (OpNode Float32Regs:$a, Float32Regs:$b))]>,
              Requires<[allowFMA]>;
  def f32ri : NVPTXInst<(outs Float32Regs:$dst),
                        (ins Float32Regs:$a, f32imm:$b),
                        OpcStr # ".f32 \t$dst, $a, $b;",
                        [(set Float32Regs:$dst,
                              (OpNode Float32Regs:$a, fpimm:$b))]>,
              Requires<[allowFMA]>;

  // ---- f16 (held in Int16Regs) ----
  def f16rr_ftz : NVPTXInst<(outs Int16Regs:$dst),
                            (ins Int16Regs:$a, Int16Regs:$b),
                            OpcStr # ".ftz.f16 \t$dst, $a, $b;",
                            [(set Int16Regs:$dst,
                                  (OpNode (f16 Int16Regs:$a),
                                          (f16 Int16Regs:$b)))]>,
                  Requires<[useFP16Math, allowFMA, doF32FTZ]>;
  def f16rr : NVPTXInst<(outs Int16Regs:$dst),
                        (ins Int16Regs:$a, Int16Regs:$b),
                        OpcStr # ".f16 \t$dst, $a, $b;",
                        [(set Int16Regs:$dst,
                              (OpNode (f16 Int16Regs:$a),
                                      (f16 Int16Regs:$b)))]>,
              Requires<[useFP16Math, allowFMA]>;

  // ---- f16x2 (held in Int32Regs) ----
  // The ftz form types its destination explicitly as v2f16.
  def f16x2rr_ftz : NVPTXInst<(outs Int32Regs:$dst),
                              (ins Int32Regs:$a, Int32Regs:$b),
                              OpcStr # ".ftz.f16x2 \t$dst, $a, $b;",
                              [(set (v2f16 Int32Regs:$dst),
                                    (OpNode (v2f16 Int32Regs:$a),
                                            (v2f16 Int32Regs:$b)))]>,
                    Requires<[useFP16Math, allowFMA, doF32FTZ]>;
  def f16x2rr : NVPTXInst<(outs Int32Regs:$dst),
                          (ins Int32Regs:$a, Int32Regs:$b),
                          OpcStr # ".f16x2 \t$dst, $a, $b;",
                          [(set Int32Regs:$dst,
                                (OpNode (v2f16 Int32Regs:$a),
                                        (v2f16 Int32Regs:$b)))]>,
                Requires<[useFP16Math, allowFMA]>;

  // ---- bf16 (held in Int16Regs) ----
  def bf16rr_ftz : NVPTXInst<(outs Int16Regs:$dst),
                             (ins Int16Regs:$a, Int16Regs:$b),
                             OpcStr # ".ftz.bf16 \t$dst, $a, $b;",
                             [(set Int16Regs:$dst,
                                   (OpNode (bf16 Int16Regs:$a),
                                           (bf16 Int16Regs:$b)))]>,
                   Requires<[hasBF16Math, allowFMA, doF32FTZ]>;
  def bf16rr : NVPTXInst<(outs Int16Regs:$dst),
                         (ins Int16Regs:$a, Int16Regs:$b),
                         OpcStr # ".bf16 \t$dst, $a, $b;",
                         [(set Int16Regs:$dst,
                               (OpNode (bf16 Int16Regs:$a),
                                       (bf16 Int16Regs:$b)))]>,
               Requires<[hasBF16Math, allowFMA]>;

  // ---- bf16x2 (held in Int32Regs) ----
  // The ftz form types its destination explicitly as v2bf16.
  def bf16x2rr_ftz : NVPTXInst<(outs Int32Regs:$dst),
                               (ins Int32Regs:$a, Int32Regs:$b),
                               OpcStr # ".ftz.bf16x2 \t$dst, $a, $b;",
                               [(set (v2bf16 Int32Regs:$dst),
                                     (OpNode (v2bf16 Int32Regs:$a),
                                             (v2bf16 Int32Regs:$b)))]>,
                     Requires<[hasBF16Math, allowFMA, doF32FTZ]>;
  def bf16x2rr : NVPTXInst<(outs Int32Regs:$dst),
                           (ins Int32Regs:$a, Int32Regs:$b),
                           OpcStr # ".bf16x2 \t$dst, $a, $b;",
                           [(set Int32Regs:$dst,
                                 (OpNode (v2bf16 Int32Regs:$a),
                                         (v2bf16 Int32Regs:$b)))]>,
                 Requires<[hasBF16Math, allowFMA]>;

  //===--- FMA contraction disabled: ".rn" mnemonics ----------------------===//
  // These have strange names so we don't perturb existing mir tests.

  // ---- f64 ----
  def _rnf64rr : NVPTXInst<(outs Float64Regs:$dst),
                           (ins Float64Regs:$a, Float64Regs:$b),
                           OpcStr # ".rn.f64 \t$dst, $a, $b;",
                           [(set Float64Regs:$dst,
                                 (OpNode Float64Regs:$a, Float64Regs:$b))]>,
                 Requires<[noFMA]>;
  def _rnf64ri : NVPTXInst<(outs Float64Regs:$dst),
                           (ins Float64Regs:$a, f64imm:$b),
                           OpcStr # ".rn.f64 \t$dst, $a, $b;",
                           [(set Float64Regs:$dst,
                                 (OpNode Float64Regs:$a, fpimm:$b))]>,
                 Requires<[noFMA]>;

  // ---- f32 ----
  def _rnf32rr_ftz : NVPTXInst<(outs Float32Regs:$dst),
                               (ins Float32Regs:$a, Float32Regs:$b),
                               OpcStr # ".rn.ftz.f32 \t$dst, $a, $b;",
                               [(set Float32Regs:$dst,
                                     (OpNode Float32Regs:$a,
                                             Float32Regs:$b))]>,
                     Requires<[noFMA, doF32FTZ]>;
  def _rnf32ri_ftz : NVPTXInst<(outs Float32Regs:$dst),
                               (ins Float32Regs:$a, f32imm:$b),
                               OpcStr # ".rn.ftz.f32 \t$dst, $a, $b;",
                               [(set Float32Regs:$dst,
                                     (OpNode Float32Regs:$a, fpimm:$b))]>,
                     Requires<[noFMA, doF32FTZ]>;
  def _rnf32rr : NVPTXInst<(outs Float32Regs:$dst),
                           (ins Float32Regs:$a, Float32Regs:$b),
                           OpcStr # ".rn.f32 \t$dst, $a, $b;",
                           [(set Float32Regs:$dst,
                                 (OpNode Float32Regs:$a, Float32Regs:$b))]>,
                 Requires<[noFMA]>;
  def _rnf32ri : NVPTXInst<(outs Float32Regs:$dst),
                           (ins Float32Regs:$a, f32imm:$b),
                           OpcStr # ".rn.f32 \t$dst, $a, $b;",
                           [(set Float32Regs:$dst,
                                 (OpNode Float32Regs:$a, fpimm:$b))]>,
                 Requires<[noFMA]>;

  // ---- f16 (held in Int16Regs) ----
  def _rnf16rr_ftz : NVPTXInst<(outs Int16Regs:$dst),
                               (ins Int16Regs:$a, Int16Regs:$b),
                               OpcStr # ".rn.ftz.f16 \t$dst, $a, $b;",
                               [(set Int16Regs:$dst,
                                     (OpNode (f16 Int16Regs:$a),
                                             (f16 Int16Regs:$b)))]>,
                     Requires<[useFP16Math, noFMA, doF32FTZ]>;
  def _rnf16rr : NVPTXInst<(outs Int16Regs:$dst),
                           (ins Int16Regs:$a, Int16Regs:$b),
                           OpcStr # ".rn.f16 \t$dst, $a, $b;",
                           [(set Int16Regs:$dst,
                                 (OpNode (f16 Int16Regs:$a),
                                         (f16 Int16Regs:$b)))]>,
                 Requires<[useFP16Math, noFMA]>;

  // ---- f16x2 (held in Int32Regs) ----
  def _rnf16x2rr_ftz : NVPTXInst<(outs Int32Regs:$dst),
                                 (ins Int32Regs:$a, Int32Regs:$b),
                                 OpcStr # ".rn.ftz.f16x2 \t$dst, $a, $b;",
                                 [(set Int32Regs:$dst,
                                       (OpNode (v2f16 Int32Regs:$a),
                                               (v2f16 Int32Regs:$b)))]>,
                       Requires<[useFP16Math, noFMA, doF32FTZ]>;
  def _rnf16x2rr : NVPTXInst<(outs Int32Regs:$dst),
                             (ins Int32Regs:$a, Int32Regs:$b),
                             OpcStr # ".rn.f16x2 \t$dst, $a, $b;",
                             [(set Int32Regs:$dst,
                                   (OpNode (v2f16 Int32Regs:$a),
                                           (v2f16 Int32Regs:$b)))]>,
                   Requires<[useFP16Math, noFMA]>;

  // ---- bf16 (held in Int16Regs) ----
  def _rnbf16rr_ftz : NVPTXInst<(outs Int16Regs:$dst),
                                (ins Int16Regs:$a, Int16Regs:$b),
                                OpcStr # ".rn.ftz.bf16 \t$dst, $a, $b;",
                                [(set Int16Regs:$dst,
                                      (OpNode (bf16 Int16Regs:$a),
                                              (bf16 Int16Regs:$b)))]>,
                      Requires<[hasBF16Math, noFMA, doF32FTZ]>;
  def _rnbf16rr : NVPTXInst<(outs Int16Regs:$dst),
                            (ins Int16Regs:$a, Int16Regs:$b),
                            OpcStr # ".rn.bf16 \t$dst, $a, $b;",
                            [(set Int16Regs:$dst,
                                  (OpNode (bf16 Int16Regs:$a),
                                          (bf16 Int16Regs:$b)))]>,
                  Requires<[hasBF16Math, noFMA]>;

  // ---- bf16x2 (held in Int32Regs) ----
  def _rnbf16x2rr_ftz : NVPTXInst<(outs Int32Regs:$dst),
                                  (ins Int32Regs:$a, Int32Regs:$b),
                                  OpcStr # ".rn.ftz.bf16x2 \t$dst, $a, $b;",
                                  [(set Int32Regs:$dst,
                                        (OpNode (v2bf16 Int32Regs:$a),
                                                (v2bf16 Int32Regs:$b)))]>,
                        Requires<[hasBF16Math, noFMA, doF32FTZ]>;
  def _rnbf16x2rr : NVPTXInst<(outs Int32Regs:$dst),
                              (ins Int32Regs:$a, Int32Regs:$b),
                              OpcStr # ".rn.bf16x2 \t$dst, $a, $b;",
                              [(set Int32Regs:$dst,
                                    (OpNode (v2bf16 Int32Regs:$a),
                                            (v2bf16 Int32Regs:$b)))]>,
                    Requires<[hasBF16Math, noFMA]>;
}
549
// Single-source f32/f64 instruction template (one destination, one source).
// Provides three instructions: <OpcStr>.f64, <OpcStr>.ftz.f32 (flush
// subnormal inputs and results to zero; selected by doF32FTZ) and
// <OpcStr>.f32.
multiclass F2<string OpcStr, SDNode OpNode> {
  def f64 : NVPTXInst<(outs Float64Regs:$dst),
                      (ins Float64Regs:$a),
                      OpcStr # ".f64 \t$dst, $a;",
                      [(set Float64Regs:$dst, (OpNode Float64Regs:$a))]>;
  def f32_ftz : NVPTXInst<(outs Float32Regs:$dst),
                          (ins Float32Regs:$a),
                          OpcStr # ".ftz.f32 \t$dst, $a;",
                          [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>,
                Requires<[doF32FTZ]>;
  def f32 : NVPTXInst<(outs Float32Regs:$dst),
                      (ins Float32Regs:$a),
                      OpcStr # ".f32 \t$dst, $a;",
                      [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>;
}
565
// Single-source instruction template for the 16-bit floating-point types.
// Scalars live in Int16Regs and the two-element vectors in Int32Regs.
//   * bf16 / bf16x2 require SM >= 80 and PTX >= 70 and have no ftz form.
//   * f16 / f16x2 require SM >= 53 and PTX >= 65; their ".ftz" forms
//     additionally require doF32FTZ and are defined before the plain forms.
multiclass F2_Support_Half<string OpcStr, SDNode OpNode> {
  // ---- bf16 ----
  def bf16 : NVPTXInst<(outs Int16Regs:$dst),
                       (ins Int16Regs:$a),
                       OpcStr # ".bf16 \t$dst, $a;",
                       [(set Int16Regs:$dst, (OpNode (bf16 Int16Regs:$a)))]>,
             Requires<[hasSM<80>, hasPTX<70>]>;
  def bf16x2 : NVPTXInst<(outs Int32Regs:$dst),
                         (ins Int32Regs:$a),
                         OpcStr # ".bf16x2 \t$dst, $a;",
                         [(set Int32Regs:$dst,
                               (OpNode (v2bf16 Int32Regs:$a)))]>,
               Requires<[hasSM<80>, hasPTX<70>]>;

  // ---- f16, flush-to-zero ----
  def f16_ftz : NVPTXInst<(outs Int16Regs:$dst),
                          (ins Int16Regs:$a),
                          OpcStr # ".ftz.f16 \t$dst, $a;",
                          [(set Int16Regs:$dst,
                                (OpNode (f16 Int16Regs:$a)))]>,
                Requires<[hasSM<53>, hasPTX<65>, doF32FTZ]>;
  def f16x2_ftz : NVPTXInst<(outs Int32Regs:$dst),
                            (ins Int32Regs:$a),
                            OpcStr # ".ftz.f16x2 \t$dst, $a;",
                            [(set Int32Regs:$dst,
                                  (OpNode (v2f16 Int32Regs:$a)))]>,
                  Requires<[hasSM<53>, hasPTX<65>, doF32FTZ]>;

  // ---- f16 ----
  def f16 : NVPTXInst<(outs Int16Regs:$dst),
                      (ins Int16Regs:$a),
                      OpcStr # ".f16 \t$dst, $a;",
                      [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a)))]>,
            Requires<[hasSM<53>, hasPTX<65>]>;
  def f16x2 : NVPTXInst<(outs Int32Regs:$dst),
                        (ins Int32Regs:$a),
                        OpcStr # ".f16x2 \t$dst, $a;",
                        [(set Int32Regs:$dst,
                              (OpNode (v2f16 Int32Regs:$a)))]>,
              Requires<[hasSM<53>, hasPTX<65>]>;
}
593
594//===----------------------------------------------------------------------===//
595// NVPTX Instructions.
596//===----------------------------------------------------------------------===//
597
598//-----------------------------------
599// Type Conversion
600//-----------------------------------
601
602let hasSideEffects = false in {
603  // Generate a cvt to the given type from all possible types.  Each instance
604  // takes a CvtMode immediate that defines the conversion mode to use.  It can
605  // be CvtNONE to omit a conversion mode.
606  multiclass CVT_FROM_ALL<string ToType, RegisterClass RC, list<Predicate> Preds = []> {
607    def _s8 :
608      NVPTXInst<(outs RC:$dst),
609                (ins Int16Regs:$src, CvtMode:$mode),
610                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
611                ToType, ".s8 \t$dst, $src;"), []>,
612      Requires<Preds>;
613    def _u8 :
614      NVPTXInst<(outs RC:$dst),
615                (ins Int16Regs:$src, CvtMode:$mode),
616                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
617                ToType, ".u8 \t$dst, $src;"), []>,
618      Requires<Preds>;
619    def _s16 :
620      NVPTXInst<(outs RC:$dst),
621                (ins Int16Regs:$src, CvtMode:$mode),
622                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
623                ToType, ".s16 \t$dst, $src;"), []>,
624      Requires<Preds>;
625    def _u16 :
626      NVPTXInst<(outs RC:$dst),
627                (ins Int16Regs:$src, CvtMode:$mode),
628                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
629                ToType, ".u16 \t$dst, $src;"), []>,
630      Requires<Preds>;
631    def _s32 :
632      NVPTXInst<(outs RC:$dst),
633                (ins Int32Regs:$src, CvtMode:$mode),
634                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
635                ToType, ".s32 \t$dst, $src;"), []>,
636      Requires<Preds>;
637    def _u32 :
638      NVPTXInst<(outs RC:$dst),
639                (ins Int32Regs:$src, CvtMode:$mode),
640                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
641                ToType, ".u32 \t$dst, $src;"), []>,
642      Requires<Preds>;
643    def _s64 :
644      NVPTXInst<(outs RC:$dst),
645                (ins Int64Regs:$src, CvtMode:$mode),
646                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
647                ToType, ".s64 \t$dst, $src;"), []>,
648      Requires<Preds>;
649    def _u64 :
650      NVPTXInst<(outs RC:$dst),
651                (ins Int64Regs:$src, CvtMode:$mode),
652                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
653                ToType, ".u64 \t$dst, $src;"), []>,
654      Requires<Preds>;
655    def _f16 :
656      NVPTXInst<(outs RC:$dst),
657                (ins Int16Regs:$src, CvtMode:$mode),
658                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
659                ToType, ".f16 \t$dst, $src;"), []>,
660      Requires<Preds>;
661    def _bf16 :
662      NVPTXInst<(outs RC:$dst),
663                (ins Int16Regs:$src, CvtMode:$mode),
664                !strconcat("cvt${mode:base}${mode:ftz}${mode:relu}${mode:sat}.",
665                ToType, ".bf16 \t$dst, $src;"), []>,
666      Requires<!if(!eq(ToType, "f32"),
667                   // bf16->f32 was introduced early.
668                   [hasPTX<71>, hasSM<80>],
669                   // bf16->everything else needs sm90/ptx78
670                   [hasPTX<78>, hasSM<90>])>;
671    def _f32 :
672      NVPTXInst<(outs RC:$dst),
673                (ins Float32Regs:$src, CvtMode:$mode),
674                !strconcat("cvt${mode:base}${mode:ftz}${mode:relu}${mode:sat}.",
675                ToType, ".f32 \t$dst, $src;"), []>,
676      Requires<!if(!eq(ToType, "bf16"),
677                   // f32->bf16 was introduced early.
678                   [hasPTX<70>, hasSM<80>],
679                   Preds)>;
680    def _f64 :
681      NVPTXInst<(outs RC:$dst),
682                (ins Float64Regs:$src, CvtMode:$mode),
683                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
684                ToType, ".f64 \t$dst, $src;"), []>,
685      Requires<Preds>;
686  }
687
688  // Generate cvts from all types to all types.
689  defm CVT_s8  : CVT_FROM_ALL<"s8",  Int16Regs>;
690  defm CVT_u8  : CVT_FROM_ALL<"u8",  Int16Regs>;
691  defm CVT_s16 : CVT_FROM_ALL<"s16", Int16Regs>;
692  defm CVT_u16 : CVT_FROM_ALL<"u16", Int16Regs>;
693  defm CVT_s32 : CVT_FROM_ALL<"s32", Int32Regs>;
694  defm CVT_u32 : CVT_FROM_ALL<"u32", Int32Regs>;
695  defm CVT_s64 : CVT_FROM_ALL<"s64", Int64Regs>;
696  defm CVT_u64 : CVT_FROM_ALL<"u64", Int64Regs>;
697  defm CVT_f16 : CVT_FROM_ALL<"f16", Int16Regs>;
698  defm CVT_bf16 : CVT_FROM_ALL<"bf16", Int16Regs, [hasPTX<78>, hasSM<90>]>;
699  defm CVT_f32 : CVT_FROM_ALL<"f32", Float32Regs>;
700  defm CVT_f64 : CVT_FROM_ALL<"f64", Float64Regs>;
701
702  // These cvts are different from those above: The source and dest registers
703  // are of the same type.
704  def CVT_INREG_s16_s8 :  NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
705                                    "cvt.s16.s8 \t$dst, $src;", []>;
706  def CVT_INREG_s32_s8 :  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
707                                    "cvt.s32.s8 \t$dst, $src;", []>;
708  def CVT_INREG_s32_s16 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
709                                    "cvt.s32.s16 \t$dst, $src;", []>;
710  def CVT_INREG_s64_s8 :  NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
711                                    "cvt.s64.s8 \t$dst, $src;", []>;
712  def CVT_INREG_s64_s16 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
713                                    "cvt.s64.s16 \t$dst, $src;", []>;
714  def CVT_INREG_s64_s32 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
715                                    "cvt.s64.s32 \t$dst, $src;", []>;
716
717  multiclass CVT_FROM_FLOAT_V2_SM80<string FromName, RegisterClass RC> {
718    def _f32 :
719      NVPTXInst<(outs RC:$dst),
720                (ins Float32Regs:$src1, Float32Regs:$src2,  CvtMode:$mode),
721                !strconcat("cvt${mode:base}${mode:relu}.",
722                FromName, ".f32 \t$dst, $src1, $src2;"), []>,
723    Requires<[hasPTX<70>, hasSM<80>]>;
724  }
725
726  defm CVT_f16x2 : CVT_FROM_FLOAT_V2_SM80<"f16x2", Int32Regs>;
727  defm CVT_bf16x2 : CVT_FROM_FLOAT_V2_SM80<"bf16x2", Int32Regs>;
728}
729
730//-----------------------------------
731// Selection instructions (selp)
732//-----------------------------------
733
734// TODO: Missing slct
735
736// selp instructions that don't have any pattern matches; we explicitly use
737// them within this file.
738let hasSideEffects = false in {
739  multiclass SELP<string TypeStr, RegisterClass RC, Operand ImmCls> {
740    def rr : NVPTXInst<(outs RC:$dst),
741                       (ins RC:$a, RC:$b, Int1Regs:$p),
742                       !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>;
743    def ri : NVPTXInst<(outs RC:$dst),
744                       (ins RC:$a, ImmCls:$b, Int1Regs:$p),
745                       !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>;
746    def ir : NVPTXInst<(outs RC:$dst),
747                       (ins ImmCls:$a, RC:$b, Int1Regs:$p),
748                       !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>;
749    def ii : NVPTXInst<(outs RC:$dst),
750                       (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),
751                       !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>;
752  }
753
754  multiclass SELP_PATTERN<string TypeStr, ValueType T, RegisterClass RC,
755                          Operand ImmCls, SDNode ImmNode> {
756    def rr :
757      NVPTXInst<(outs RC:$dst),
758                (ins RC:$a, RC:$b, Int1Regs:$p),
759                !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
760                [(set (T RC:$dst), (select Int1Regs:$p, (T RC:$a), (T RC:$b)))]>;
761    def ri :
762      NVPTXInst<(outs RC:$dst),
763                (ins RC:$a, ImmCls:$b, Int1Regs:$p),
764                !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
765                [(set (T RC:$dst), (select Int1Regs:$p, (T RC:$a), (T ImmNode:$b)))]>;
766    def ir :
767      NVPTXInst<(outs RC:$dst),
768                (ins ImmCls:$a, RC:$b, Int1Regs:$p),
769                !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
770                [(set (T RC:$dst), (select Int1Regs:$p, ImmNode:$a, (T RC:$b)))]>;
771    def ii :
772      NVPTXInst<(outs RC:$dst),
773                (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),
774                !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
775                [(set (T RC:$dst), (select Int1Regs:$p, ImmNode:$a, ImmNode:$b))]>;
776  }
777}
778
// For integers only the bit-typed selp.b{16,32,64} forms carry selection
// patterns; selp.{s,u}{16,32,64} would be no better, so those are defined
// without patterns.
defm SELP_b16  : SELP_PATTERN<"b16", i16, Int16Regs, i16imm, imm>;
defm SELP_s16  : SELP<"s16", Int16Regs, i16imm>;
defm SELP_u16  : SELP<"u16", Int16Regs, i16imm>;
defm SELP_b32  : SELP_PATTERN<"b32", i32, Int32Regs, i32imm, imm>;
defm SELP_s32  : SELP<"s32", Int32Regs, i32imm>;
defm SELP_u32  : SELP<"u32", Int32Regs, i32imm>;
defm SELP_b64  : SELP_PATTERN<"b64", i64, Int64Regs, i64imm, imm>;
defm SELP_s64  : SELP<"s64", Int64Regs, i64imm>;
defm SELP_u64  : SELP<"u64", Int64Regs, i64imm>;
// Half-precision values live in Int16Regs and are selected with selp.b16.
defm SELP_f16  : SELP_PATTERN<"b16", f16, Int16Regs, f16imm, fpimm>;
defm SELP_bf16 : SELP_PATTERN<"b16", bf16, Int16Regs, bf16imm, fpimm>;

// f32 and f64 use their own typed selp forms.
defm SELP_f32  : SELP_PATTERN<"f32", f32, Float32Regs, f32imm, fpimm>;
defm SELP_f64  : SELP_PATTERN<"f64", f64, Float64Regs, f64imm, fpimm>;
795
796// This does not work as tablegen fails to infer the type of 'imm'.
797// def v2f16imm : Operand<v2f16>;
798// defm SELP_f16x2 : SELP_PATTERN<"b32", v2f16, Int32Regs, v2f16imm, imm>;
799
// Selects on the packed vector types held in Int32Regs reuse the
// register-register selp.b32.
foreach VecTy = [v2f16, v2bf16, v2i16, v4i8] in
  def : Pat<(VecTy (select Int1Regs:$p,
                           (VecTy Int32Regs:$a), (VecTy Int32Regs:$b))),
            (SELP_b32rr Int32Regs:$a, Int32Regs:$b, Int1Regs:$p)>;
804
805//-----------------------------------
806// Test Instructions
807//-----------------------------------
808
// testp.infinite for f32 and f64.  None of these carries a selection pattern.
// The trailing 'r' / 'i' says whether $a is a register or an immediate.
def TESTINF_f32r :
  NVPTXInst<(outs Int1Regs:$p), (ins Float32Regs:$a),
            "testp.infinite.f32 \t$p, $a;", []>;
def TESTINF_f32i :
  NVPTXInst<(outs Int1Regs:$p), (ins f32imm:$a),
            "testp.infinite.f32 \t$p, $a;", []>;
def TESTINF_f64r :
  NVPTXInst<(outs Int1Regs:$p), (ins Float64Regs:$a),
            "testp.infinite.f64 \t$p, $a;", []>;
def TESTINF_f64i :
  NVPTXInst<(outs Int1Regs:$p), (ins f64imm:$a),
            "testp.infinite.f64 \t$p, $a;", []>;
821
822//-----------------------------------
823// Integer Arithmetic
824//-----------------------------------
825
// Single-bit add/sub.  Both variants are emitted as xor.pred; OpNode supplies
// the DAG operation that the instruction is selected from.
multiclass ADD_SUB_i1<SDNode OpNode> {
  // Register, register.
  def _rr :
    NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
              "xor.pred \t$dst, $a, $b;",
              [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>;
  // Register, immediate.
  def _ri :
    NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
              "xor.pred \t$dst, $a, $b;",
              [(set Int1Regs:$dst, (OpNode Int1Regs:$a, (imm):$b))]>;
}

// On i1, addition and subtraction are both just xor.
defm ADD_i1 : ADD_SUB_i1<add>;
defm SUB_i1 : ADD_SUB_i1<sub>;
839
// Signed add/sub for int16, int32, and int64.  NVPTX is two's complement, so
// the same instructions also serve unsigned arithmetic.
defm ADD : I3<"add.s", add>;
defm SUB : I3<"sub.s", sub>;

// Packed 16x2 addition.
def ADD16x2 : I16x2<"add.s", add>;

// int32 and int64 add/sub producing a carry-out.
defm ADDCC : ADD_SUB_INT_CARRY<"add.cc", addc>;
defm SUBCC : ADD_SUB_INT_CARRY<"sub.cc", subc>;

// int32 and int64 add/sub consuming a carry-in and producing a carry-out.
defm ADDCCC : ADD_SUB_INT_CARRY<"addc.cc", adde>;
defm SUBCCC : ADD_SUB_INT_CARRY<"subc.cc", sube>;

// Low-half multiply.
defm MULT : I3<"mul.lo.s", mul>;

// High-half multiply, signed and unsigned.
defm MULTHS : I3<"mul.hi.s", mulhs>;
defm MULTHU : I3<"mul.hi.u", mulhu>;

// Division, signed and unsigned.
defm SDIV : I3<"div.s", sdiv>;
defm UDIV : I3<"div.u", udiv>;

// Remainder.  The ri variants of rem.s and rem.u are never selected, because
// DAGCombiner::visitSREM lowers them first.
defm SREM : I3<"rem.s", srem>;
defm UREM : I3<"rem.u", urem>;
867
868// Integer absolute value.  NumBits should be one minus the bit width of RC.
869// This idiom implements the algorithm at
870// http://graphics.stanford.edu/~seander/bithacks.html#IntegerAbs.
871multiclass ABS<ValueType T, RegisterClass RC, string SizeName> {
872  def : NVPTXInst<(outs RC:$dst), (ins RC:$a),
873                  !strconcat("abs", SizeName, " \t$dst, $a;"),
874                  [(set (T RC:$dst), (abs (T RC:$a)))]>;
875}
876defm ABS_16 : ABS<i16, Int16Regs, ".s16">;
877defm ABS_32 : ABS<i32, Int32Regs, ".s32">;
878defm ABS_64 : ABS<i64, Int64Regs, ".s64">;
879
880// Integer min/max.
881defm SMAX : I3<"max.s", smax>;
882defm UMAX : I3<"max.u", umax>;
883defm SMIN : I3<"min.s", smin>;
884defm UMIN : I3<"min.u", umin>;
885
886def SMAX16x2 : I16x2<"max.s", smax>;
887def UMAX16x2 : I16x2<"max.u", umax>;
888def SMIN16x2 : I16x2<"min.s", smin>;
889def UMIN16x2 : I16x2<"min.u", umin>;
890
891
892//
893// Wide multiplication
894//
// mul.wide with 32-bit sources and a 64-bit result.  No selection patterns
// are attached to the instructions themselves.  Variants:
//   (no suffix)  register   x register
//   Imm          register   x i32imm
//   Imm64        register   x i64imm
def MULWIDES64 : NVPTXInst<(outs Int64Regs:$dst),
                           (ins Int32Regs:$a, Int32Regs:$b),
                           "mul.wide.s32 \t$dst, $a, $b;", []>;
def MULWIDES64Imm : NVPTXInst<(outs Int64Regs:$dst),
                              (ins Int32Regs:$a, i32imm:$b),
                              "mul.wide.s32 \t$dst, $a, $b;", []>;
def MULWIDES64Imm64 : NVPTXInst<(outs Int64Regs:$dst),
                                (ins Int32Regs:$a, i64imm:$b),
                                "mul.wide.s32 \t$dst, $a, $b;", []>;

// Unsigned counterparts.
def MULWIDEU64 : NVPTXInst<(outs Int64Regs:$dst),
                           (ins Int32Regs:$a, Int32Regs:$b),
                           "mul.wide.u32 \t$dst, $a, $b;", []>;
def MULWIDEU64Imm : NVPTXInst<(outs Int64Regs:$dst),
                              (ins Int32Regs:$a, i32imm:$b),
                              "mul.wide.u32 \t$dst, $a, $b;", []>;
def MULWIDEU64Imm64 : NVPTXInst<(outs Int64Regs:$dst),
                                (ins Int32Regs:$a, i64imm:$b),
                                "mul.wide.u32 \t$dst, $a, $b;", []>;
914
// mul.wide with 16-bit sources and a 32-bit result.  No selection patterns
// are attached to the instructions themselves.  Variants:
//   (no suffix)  register   x register
//   Imm          register   x i16imm
//   Imm32        register   x i32imm
def MULWIDES32 : NVPTXInst<(outs Int32Regs:$dst),
                           (ins Int16Regs:$a, Int16Regs:$b),
                           "mul.wide.s16 \t$dst, $a, $b;", []>;
def MULWIDES32Imm : NVPTXInst<(outs Int32Regs:$dst),
                              (ins Int16Regs:$a, i16imm:$b),
                              "mul.wide.s16 \t$dst, $a, $b;", []>;
def MULWIDES32Imm32 : NVPTXInst<(outs Int32Regs:$dst),
                                (ins Int16Regs:$a, i32imm:$b),
                                "mul.wide.s16 \t$dst, $a, $b;", []>;

// Unsigned counterparts.
def MULWIDEU32 : NVPTXInst<(outs Int32Regs:$dst),
                           (ins Int16Regs:$a, Int16Regs:$b),
                           "mul.wide.u16 \t$dst, $a, $b;", []>;
def MULWIDEU32Imm : NVPTXInst<(outs Int32Regs:$dst),
                              (ins Int16Regs:$a, i16imm:$b),
                              "mul.wide.u16 \t$dst, $a, $b;", []>;
def MULWIDEU32Imm32 : NVPTXInst<(outs Int32Regs:$dst),
                                (ins Int16Regs:$a, i32imm:$b),
                                "mul.wide.u16 \t$dst, $a, $b;", []>;
934
// Type profile shared by the target mul.wide nodes: one result and two
// operands, with both operands constrained to the same type.
def SDTMulWide : SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>]>;

// Target-specific DAG nodes for signed and unsigned widening multiply.
def mul_wide_signed   : SDNode<"NVPTXISD::MUL_WIDE_SIGNED",   SDTMulWide>;
def mul_wide_unsigned : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide>;
938
// Matchers for the signed and unsigned mul.wide ISD nodes with 16-bit
// operands and a 32-bit result.  All are gated on doMulWide.
//
// Register operands in the output patterns are spelled with their register
// class (Int16Regs), the same way the i32 -> i64 matchers spell theirs.
def : Pat<(i32 (mul_wide_signed i16:$a, i16:$b)),
          (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>,
      Requires<[doMulWide]>;
def : Pat<(i32 (mul_wide_signed Int16Regs:$a, imm:$b)),
          (MULWIDES32Imm Int16Regs:$a, imm:$b)>,
      Requires<[doMulWide]>;
def : Pat<(i32 (mul_wide_unsigned i16:$a, i16:$b)),
          (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>,
      Requires<[doMulWide]>;
def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, imm:$b)),
          (MULWIDEU32Imm Int16Regs:$a, imm:$b)>,
      Requires<[doMulWide]>;
952
// Matchers for the mul.wide ISD nodes with 32-bit operands and a 64-bit
// result.  Every pattern in the group is gated on doMulWide.
let Predicates = [doMulWide] in {
  // Signed: register x register, then register x immediate.
  def : Pat<(i64 (mul_wide_signed i32:$a, i32:$b)),
            (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>;
  def : Pat<(i64 (mul_wide_signed (i32 Int32Regs:$a), imm:$b)),
            (MULWIDES64Imm Int32Regs:$a, imm:$b)>;
  // Unsigned: register x register, then register x immediate.
  def : Pat<(i64 (mul_wide_unsigned i32:$a, i32:$b)),
            (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>;
  def : Pat<(i64 (mul_wide_unsigned (i32 Int32Regs:$a), imm:$b)),
            (MULWIDEU64Imm Int32Regs:$a, imm:$b)>;
}
965
// Immediate predicates used when converting some patterns to mul.wide.  Each
// one accepts a constant only if it fits the named width and signedness.

// Fits in a signed 32-bit integer.
def SInt32Const : PatLeaf<(imm), [{
  return N->getAPIntValue().isSignedIntN(32);
}]>;

// Fits in an unsigned 32-bit integer.
def UInt32Const : PatLeaf<(imm), [{
  return N->getAPIntValue().isIntN(32);
}]>;

// Fits in a signed 16-bit integer.
def SInt16Const : PatLeaf<(imm), [{
  return N->getAPIntValue().isSignedIntN(16);
}]>;

// Fits in an unsigned 16-bit integer.
def UInt16Const : PatLeaf<(imm), [{
  return N->getAPIntValue().isIntN(16);
}]>;
986
// Shift amount in [0, 30]: only then does the result of (x << amount) still
// fit an int32.
def IntConst_0_30 : PatLeaf<(imm), [{
  const APInt &ShAmt = N->getAPIntValue();
  if (ShAmt.slt(0))
    return false;
  return ShAmt.slt(31);
}]>;

// Shift amount in [0, 14]: only then does the result of (x << amount) still
// fit an int16.
def IntConst_0_14 : PatLeaf<(imm), [{
  const APInt &ShAmt = N->getAPIntValue();
  if (ShAmt.slt(0))
    return false;
  return ShAmt.slt(15);
}]>;
998
// Turn a shift amount N into the i32 target constant (1 << N).
def SHL2MUL32 : SDNodeXForm<imm, [{
  APInt One(32, 1);
  const APInt Factor = One.shl(N->getAPIntValue());
  return CurDAG->getTargetConstant(Factor, SDLoc(N), MVT::i32);
}]>;

// Turn a shift amount N into the i16 target constant (1 << N).
def SHL2MUL16 : SDNodeXForm<imm, [{
  APInt One(16, 1);
  const APInt Factor = One.shl(N->getAPIntValue());
  return CurDAG->getTargetConstant(Factor, SDLoc(N), MVT::i16);
}]>;
1010
// Rewrite "sign/zero-extend, then shift left by an immediate" as a mul.wide
// by (1 << amount).  Every pattern in the group is gated on doMulWide.
let Predicates = [doMulWide] in {
  // 32-bit source: shift amounts accepted by IntConst_0_30.
  def : Pat<(shl (sext Int32Regs:$a), (i32 IntConst_0_30:$b)),
            (MULWIDES64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>;
  def : Pat<(shl (zext Int32Regs:$a), (i32 IntConst_0_30:$b)),
            (MULWIDEU64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>;

  // 16-bit source: shift amounts accepted by IntConst_0_14.
  def : Pat<(shl (sext Int16Regs:$a), (i16 IntConst_0_14:$b)),
            (MULWIDES32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>;
  def : Pat<(shl (zext Int16Regs:$a), (i16 IntConst_0_14:$b)),
            (MULWIDEU32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>;
}
1025
// Rewrite "sign/zero-extend, then multiply" as mul.wide.  Every pattern in the
// group is gated on doMulWide.  The constant forms require the immediate to
// fit the narrow source width with the matching signedness.
let Predicates = [doMulWide] in {
  // Signed, 32-bit sources.
  def : Pat<(mul (sext Int32Regs:$a), (sext Int32Regs:$b)),
            (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>;
  def : Pat<(mul (sext Int32Regs:$a), (i64 SInt32Const:$b)),
            (MULWIDES64Imm64 Int32Regs:$a, (i64 SInt32Const:$b))>;

  // Unsigned, 32-bit sources.
  def : Pat<(mul (zext Int32Regs:$a), (zext Int32Regs:$b)),
            (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>;
  def : Pat<(mul (zext Int32Regs:$a), (i64 UInt32Const:$b)),
            (MULWIDEU64Imm64 Int32Regs:$a, (i64 UInt32Const:$b))>;

  // Signed, 16-bit sources.
  def : Pat<(mul (sext Int16Regs:$a), (sext Int16Regs:$b)),
            (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>;
  def : Pat<(mul (sext Int16Regs:$a), (i32 SInt16Const:$b)),
            (MULWIDES32Imm32 Int16Regs:$a, (i32 SInt16Const:$b))>;

  // Unsigned, 16-bit sources.
  def : Pat<(mul (zext Int16Regs:$a), (zext Int16Regs:$b)),
            (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>;
  def : Pat<(mul (zext Int16Regs:$a), (i32 UInt16Const:$b)),
            (MULWIDEU32Imm32 Int16Regs:$a, (i32 UInt16Const:$b))>;
}
1054
1055//
1056// Integer multiply-add
1057//
// Type profile for the integer multiply-add node: one result and three
// operands, all of one integer type.
def SDTIMAD : SDTypeProfile<1, 3, [
  SDTCisSameAs<0, 1>, SDTCisInt<0>, SDTCisInt<2>,
  SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>
]>;

// Target-specific integer multiply-add DAG node.
def imad : SDNode<"NVPTXISD::IMAD", SDTIMAD>;
1062
// Integer multiply-add instructions selected from the `imad` node.
//
// One multiclass replaces twelve hand-written definitions.  It produces four
// records per width whose names are the defm prefix plus:
//   rrr  $b register,  $c register
//   rri  $b register,  $c immediate
//   rir  $b immediate, $c register
//   rii  $b immediate, $c immediate
// $a and $dst are always registers.
//
// Parameters:
//   OpcStr  full PTX mnemonic, e.g. "mad.lo.s32"
//   T       value type of the result and of every operand
//   RC      register class holding values of type T
//   ImmCls  immediate operand class for the 'i' positions
//
// Register operands are typed explicitly with T so the patterns stay
// unambiguous for register classes that can hold more than one value type.
multiclass IMAD_LO<string OpcStr, ValueType T, RegisterClass RC,
                   Operand ImmCls> {
  def rrr :
    NVPTXInst<(outs RC:$dst),
              (ins RC:$a, RC:$b, RC:$c),
              OpcStr # " \t$dst, $a, $b, $c;",
              [(set (T RC:$dst), (imad (T RC:$a), (T RC:$b), (T RC:$c)))]>;
  def rri :
    NVPTXInst<(outs RC:$dst),
              (ins RC:$a, RC:$b, ImmCls:$c),
              OpcStr # " \t$dst, $a, $b, $c;",
              [(set (T RC:$dst), (imad (T RC:$a), (T RC:$b), imm:$c))]>;
  def rir :
    NVPTXInst<(outs RC:$dst),
              (ins RC:$a, ImmCls:$b, RC:$c),
              OpcStr # " \t$dst, $a, $b, $c;",
              [(set (T RC:$dst), (imad (T RC:$a), imm:$b, (T RC:$c)))]>;
  def rii :
    NVPTXInst<(outs RC:$dst),
              (ins RC:$a, ImmCls:$b, ImmCls:$c),
              OpcStr # " \t$dst, $a, $b, $c;",
              [(set (T RC:$dst), (imad (T RC:$a), imm:$b, imm:$c))]>;
}

// Yields MAD16rrr .. MAD64rii, in the same order as before.
defm MAD16 : IMAD_LO<"mad.lo.s16", i16, Int16Regs, i16imm>;
defm MAD32 : IMAD_LO<"mad.lo.s32", i32, Int32Regs, i32imm>;
defm MAD64 : IMAD_LO<"mad.lo.s64", i64, Int64Regs, i64imm>;
1125
// Integer negation, selected from `ineg`, for 16-, 32- and 64-bit registers.
def INEG16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
                       "neg.s16 \t$dst, $src;",
                       [(set Int16Regs:$dst, (ineg Int16Regs:$src))]>;
// The 32-bit form names i32 explicitly in its pattern.
def INEG32 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
                       "neg.s32 \t$dst, $src;",
                       [(set (i32 Int32Regs:$dst),
                             (ineg (i32 Int32Regs:$src)))]>;
def INEG64 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
                       "neg.s64 \t$dst, $src;",
                       [(set Int64Regs:$dst, (ineg Int64Regs:$src))]>;
1138
1139//-----------------------------------
1140// Floating Point Arithmetic
1141//-----------------------------------
1142
// Matches the single-precision constant 1.0f.  The semantics check comes
// first, so the value is only converted once it is known to be IEEE single.
def FloatConst1 : PatLeaf<(fpimm), [{
  const llvm::APFloat &Val = N->getValueAPF();
  if (&Val.getSemantics() != &llvm::APFloat::IEEEsingle())
    return false;
  return Val.convertToFloat() == 1.0f;
}]>;
// Matches the double-precision constant 1.0.
def DoubleConst1 : PatLeaf<(fpimm), [{
  const llvm::APFloat &Val = N->getValueAPF();
  if (&Val.getSemantics() != &llvm::APFloat::IEEEdouble())
    return false;
  return Val.convertToDouble() == 1.0;
}]>;
// Matches the double-precision constant -1.0.
def DoubleConstNeg1 : PatLeaf<(fpimm), [{
  const llvm::APFloat &Val = N->getValueAPF();
  if (&Val.getSemantics() != &llvm::APFloat::IEEEdouble())
    return false;
  return Val.convertToDouble() == -1.0;
}]>;
1158
1159
// Transform an FP immediate X into the f64 target constant -X.
def NegDoubleConst : SDNodeXForm<fpimm, [{
  const llvm::APFloat Negated = -(N->getValueAPF());
  return CurDAG->getTargetConstantFP(Negated, SDLoc(N), MVT::f64);
}]>;
1165
// Materialize a half-precision constant in a register.
//
// ptxas has no hex representation for fp16, so fp16 immediates cannot be used
// directly in .f16 instructions.  The constant is instead moved into a 16-bit
// register with mov.b16.  The bf16 variant is emitted the same way.
def LOAD_CONST_F16 : NVPTXInst<(outs Int16Regs:$dst), (ins f16imm:$a),
                               "mov.b16 \t$dst, $a;", []>;
def LOAD_CONST_BF16 : NVPTXInst<(outs Int16Regs:$dst), (ins bf16imm:$a),
                                "mov.b16 \t$dst, $a;", []>;
// Add, subtract and multiply.
defm FADD : F3_fma_component<"add", fadd>;
defm FSUB : F3_fma_component<"sub", fsub>;
defm FMUL : F3_fma_component<"mul", fmul>;

// min/max selected from fminnum / fmaxnum.
defm FMIN : F3<"min", fminnum>;
defm FMAX : F3<"max", fmaxnum>;
// min/max selected from fminimum / fmaximum.
// Note: min.NaN.f64 and max.NaN.f64 do not actually exist.
defm FMINNAN : F3<"min.NaN", fminimum>;
defm FMAXNAN : F3<"max.NaN", fmaximum>;

// Absolute value and negation, plus their half-precision counterparts.
defm FABS   : F2<"abs", fabs>;
defm FNEG   : F2<"neg", fneg>;
defm FABS_H : F2_Support_Half<"abs", fabs>;
defm FNEG_H : F2_Support_Half<"neg", fneg>;

// Square root, emitted as sqrt.rn.
defm FSQRT : F2<"sqrt.rn", fsqrt>;
1193
1194//
1195// F16 NEG
1196//
// Template for f16 / f16x2 negation selected from `fneg`.  Besides the
// per-instance predicate Pred, every instance requires useFP16Math,
// hasPTX<60> and hasSM<53>.
class FNEG_F16_F16X2<string OpcStr, ValueType T, RegisterClass RC,
                     Predicate Pred>
  : NVPTXInst<(outs RC:$dst), (ins RC:$src),
              OpcStr # " \t$dst, $src;",
              [(set RC:$dst, (fneg (T RC:$src)))]>,
    Requires<[useFP16Math, hasPTX<60>, hasSM<53>, Pred]>;

// Scalar forms; the flush-to-zero form is listed first.
def FNEG16_ftz   : FNEG_F16_F16X2<"neg.ftz.f16", f16, Int16Regs, doF32FTZ>;
def FNEG16       : FNEG_F16_F16X2<"neg.f16", f16, Int16Regs, True>;
// Packed forms.
def FNEG16x2_ftz : FNEG_F16_F16X2<"neg.ftz.f16x2", v2f16, Int32Regs, doF32FTZ>;
def FNEG16x2     : FNEG_F16_F16X2<"neg.f16x2", v2f16, Int32Regs, True>;
1206
1207//
1208// BF16 NEG
1209//
1210
// Template for bf16 / bf16x2 negation selected from `fneg`.  Besides the
// per-instance predicate Pred, every instance requires hasBF16Math,
// hasPTX<70> and hasSM<80>.
class FNEG_BF16_F16X2<string OpcStr, ValueType T, RegisterClass RC,
                      Predicate Pred>
  : NVPTXInst<(outs RC:$dst), (ins RC:$src),
              OpcStr # " \t$dst, $src;",
              [(set RC:$dst, (fneg (T RC:$src)))]>,
    Requires<[hasBF16Math, hasPTX<70>, hasSM<80>, Pred]>;

// Scalar forms; the flush-to-zero form is listed first.
def BFNEG16_ftz   : FNEG_BF16_F16X2<"neg.ftz.bf16", bf16, Int16Regs, doF32FTZ>;
def BFNEG16       : FNEG_BF16_F16X2<"neg.bf16", bf16, Int16Regs, True>;
// Packed forms.
def BFNEG16x2_ftz : FNEG_BF16_F16X2<"neg.ftz.bf16x2", v2bf16, Int32Regs, doF32FTZ>;
def BFNEG16x2     : FNEG_BF16_F16X2<"neg.bf16x2", v2bf16, Int32Regs, True>;
1220
1221//
1222// F64 division
1223//
// 1.0 / b is emitted as rcp.rn.f64.  The $a operand carries the matched
// constant and does not appear in the assembly string.
def FDIV641r : NVPTXInst<(outs Float64Regs:$dst),
                         (ins f64imm:$a, Float64Regs:$b),
                         "rcp.rn.f64 \t$dst, $b;",
                         [(set Float64Regs:$dst,
                               (fdiv DoubleConst1:$a, Float64Regs:$b))]>;
// General f64 division: register / register.
def FDIV64rr : NVPTXInst<(outs Float64Regs:$dst),
                         (ins Float64Regs:$a, Float64Regs:$b),
                         "div.rn.f64 \t$dst, $a, $b;",
                         [(set Float64Regs:$dst,
                               (fdiv Float64Regs:$a, Float64Regs:$b))]>;
// General f64 division: register / immediate.
def FDIV64ri : NVPTXInst<(outs Float64Regs:$dst),
                         (ins Float64Regs:$a, f64imm:$b),
                         "div.rn.f64 \t$dst, $a, $b;",
                         [(set Float64Regs:$dst,
                               (fdiv Float64Regs:$a, fpimm:$b))]>;
1239
// Division of -1.0 by X is turned into a negated reciprocal:
//   fdiv -1.0, X  =>  fneg (rcp.rn X)
// NegDoubleConst flips the matched -1.0 to the 1.0 operand FDIV641r carries.
def : Pat<(fdiv DoubleConstNeg1:$a, Float64Regs:$b),
          (FNEGf64
            (FDIV641r (NegDoubleConst node:$a), Float64Regs:$b))>;
1244
1245//
1246// F32 Approximate reciprocal
1247//
// 1.0f / b as rcp.approx when do_DIVF32_APPROX holds.  The flush-to-zero form
// is defined first and additionally requires doF32FTZ.  $a carries the matched
// constant and is not printed.
let Predicates = [do_DIVF32_APPROX, doF32FTZ] in
def FDIV321r_ftz :
  NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$a, Float32Regs:$b),
            "rcp.approx.ftz.f32 \t$dst, $b;",
            [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>;
let Predicates = [do_DIVF32_APPROX] in
def FDIV321r :
  NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$a, Float32Regs:$b),
            "rcp.approx.f32 \t$dst, $b;",
            [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>;
1260//
1261// F32 Approximate division
1262//
// div.approx when do_DIVF32_APPROX holds.  The flush-to-zero forms come first
// and additionally require doF32FTZ.  'rr' divides by a register, 'ri' by an
// immediate.
let Predicates = [do_DIVF32_APPROX, doF32FTZ] in {
  def FDIV32approxrr_ftz :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
              "div.approx.ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>;
  def FDIV32approxri_ftz :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
              "div.approx.ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>;
}
let Predicates = [do_DIVF32_APPROX] in {
  def FDIV32approxrr :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
              "div.approx.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>;
  def FDIV32approxri :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
              "div.approx.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>;
}
1287//
1288// F32 Semi-accurate reciprocal
1289//
1290// rcp.approx gives the same result as div.full(1.0f, a) and is faster.
1291//
// 1.0f / b when do_DIVF32_FULL holds, still emitted as rcp.approx.  The
// flush-to-zero form comes first and additionally requires doF32FTZ.
let Predicates = [do_DIVF32_FULL, doF32FTZ] in
def FDIV321r_approx_ftz :
  NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$a, Float32Regs:$b),
            "rcp.approx.ftz.f32 \t$dst, $b;",
            [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>;
let Predicates = [do_DIVF32_FULL] in
def FDIV321r_approx :
  NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$a, Float32Regs:$b),
            "rcp.approx.f32 \t$dst, $b;",
            [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>;
1304//
1305// F32 Semi-accurate division
1306//
// div.full when do_DIVF32_FULL holds.  The flush-to-zero forms come first and
// additionally require doF32FTZ.  'rr' divides by a register, 'ri' by an
// immediate.
let Predicates = [do_DIVF32_FULL, doF32FTZ] in {
  def FDIV32rr_ftz :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
              "div.full.ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>;
  def FDIV32ri_ftz :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
              "div.full.ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>;
}
let Predicates = [do_DIVF32_FULL] in {
  def FDIV32rr :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
              "div.full.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>;
  def FDIV32ri :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
              "div.full.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>;
}
1331//
1332// F32 Accurate reciprocal
1333//
// 1.0f / b as rcp.rn.  The flush-to-zero form requires doF32FTZ; the plain
// form carries no predicate.  $a carries the matched constant and is not
// printed.
let Predicates = [doF32FTZ] in
def FDIV321r_prec_ftz :
  NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$a, Float32Regs:$b),
            "rcp.rn.ftz.f32 \t$dst, $b;",
            [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>;
def FDIV321r_prec :
  NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$a, Float32Regs:$b),
            "rcp.rn.f32 \t$dst, $b;",
            [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>;
1345//
1346// F32 Accurate division
1347//
// div.rn.  The flush-to-zero forms require doF32FTZ; the plain forms carry no
// predicate.  'rr' divides by a register, 'ri' by an immediate.
let Predicates = [doF32FTZ] in {
  def FDIV32rr_prec_ftz :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
              "div.rn.ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>;
  def FDIV32ri_prec_ftz :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
              "div.rn.ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>;
}
def FDIV32rr_prec :
  NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
            "div.rn.f32 \t$dst, $a, $b;",
            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>;
def FDIV32ri_prec :
  NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
            "div.rn.f32 \t$dst, $a, $b;",
            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>;
1370
1371//
1372// FMA
1373//
1374
// Fused multiply-add selected from `fma`, gated on Pred.  $a is always a
// register.  The suffix letters say whether $b and $c are registers ('r') or
// immediates of ImmCls ('i'): rrr, rri, rir, rii.
multiclass FMA<string OpcStr, RegisterClass RC, Operand ImmCls, Predicate Pred> {
  // All four variants print the same assembly text.
  defvar Asm = OpcStr # " \t$dst, $a, $b, $c;";

  def rrr :
    NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c), Asm,
              [(set RC:$dst, (fma RC:$a, RC:$b, RC:$c))]>,
    Requires<[Pred]>;
  def rri :
    NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, ImmCls:$c), Asm,
              [(set RC:$dst, (fma RC:$a, RC:$b, fpimm:$c))]>,
    Requires<[Pred]>;
  def rir :
    NVPTXInst<(outs RC:$dst), (ins RC:$a, ImmCls:$b, RC:$c), Asm,
              [(set RC:$dst, (fma RC:$a, fpimm:$b, RC:$c))]>,
    Requires<[Pred]>;
  def rii :
    NVPTXInst<(outs RC:$dst), (ins RC:$a, ImmCls:$b, ImmCls:$c), Asm,
              [(set RC:$dst, (fma RC:$a, fpimm:$b, fpimm:$c))]>,
    Requires<[Pred]>;
}
1396
// Register-only fma for f16 types.  T is the value type used to type the
// operands in the pattern.  Requires useFP16Math in addition to Pred.
multiclass FMA_F16<string OpcStr, ValueType T, RegisterClass RC, Predicate Pred> {
  def rrr :
    NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
              OpcStr # " \t$dst, $a, $b, $c;",
              [(set RC:$dst, (fma (T RC:$a), (T RC:$b), (T RC:$c)))]>,
    Requires<[useFP16Math, Pred]>;
}
1403
// Register-only fused multiply-add for value type T stored in register class
// RC.  Requires hasBF16Math in addition to the caller-supplied Pred.
multiclass FMA_BF16<string OpcStr, ValueType T, RegisterClass RC, Predicate Pred> {
  def rrr
    : NVPTXInst<(outs RC:$dst),
                (ins RC:$a, RC:$b, RC:$c),
                OpcStr # " \t$dst, $a, $b, $c;",
                [(set RC:$dst,
                  (fma (T RC:$a), (T RC:$b), (T RC:$c)))]>,
      Requires<[hasBF16Math, Pred]>;
}
1410
// FMA instantiations.  Each .ftz form is defined ahead of its non-ftz
// counterpart and is guarded by doF32FTZ; the definition order is kept as is.

// f16 scalar (Int16Regs) and packed f16x2 (Int32Regs).
defm FMA16_ftz : FMA_F16<"fma.rn.ftz.f16", f16, Int16Regs, doF32FTZ>;
defm FMA16 : FMA_F16<"fma.rn.f16", f16, Int16Regs, True>;
defm FMA16x2_ftz : FMA_F16<"fma.rn.ftz.f16x2", v2f16, Int32Regs, doF32FTZ>;
defm FMA16x2 : FMA_F16<"fma.rn.f16x2", v2f16, Int32Regs, True>;

// bf16 scalar (Int16Regs) and packed bf16x2 (Int32Regs).
defm BFMA16_ftz : FMA_BF16<"fma.rn.ftz.bf16", bf16, Int16Regs, doF32FTZ>;
defm BFMA16 : FMA_BF16<"fma.rn.bf16", bf16, Int16Regs, True>;
defm BFMA16x2_ftz : FMA_BF16<"fma.rn.ftz.bf16x2", v2bf16, Int32Regs, doF32FTZ>;
defm BFMA16x2 : FMA_BF16<"fma.rn.bf16x2", v2bf16, Int32Regs, True>;

// f32 and f64, with register and immediate operand variants.
defm FMA32_ftz : FMA<"fma.rn.ftz.f32", Float32Regs, f32imm, doF32FTZ>;
defm FMA32 : FMA<"fma.rn.f32", Float32Regs, f32imm, True>;
defm FMA64 : FMA<"fma.rn.f64", Float64Regs, f64imm, True>;
1422
// sin/cos: f32 fsin/fcos are selected to the PTX ".approx" instructions, and
// only when allowUnsafeFPMath holds.
def SINF
    : NVPTXInst<(outs Float32Regs:$dst),
                (ins Float32Regs:$src),
                "sin.approx.f32 \t$dst, $src;",
                [(set Float32Regs:$dst, (fsin Float32Regs:$src))]>,
      Requires<[allowUnsafeFPMath]>;
def COSF
    : NVPTXInst<(outs Float32Regs:$dst),
                (ins Float32Regs:$src),
                "cos.approx.f32 \t$dst, $src;",
                [(set Float32Regs:$dst, (fcos Float32Regs:$src))]>,
      Requires<[allowUnsafeFPMath]>;
1432
1433// Lower (frem x, y) into (sub x, (mul (ftrunc (div x, y)) y)),
1434// i.e. "poor man's fmod()". When y is infinite, x is returned. This matches the
1435// semantics of LLVM's frem.
1436
// frem - f32 FTZ

// allowUnsafeFPMath: plain x - trunc(x / y) * y, with no infinity check on y.
def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
          (FSUBf32rr_ftz
            Float32Regs:$x,
            (FMULf32rr_ftz
              (CVT_f32_f32
                (FDIV32rr_prec_ftz Float32Regs:$x, Float32Regs:$y),
                CvtRZI_FTZ),
              Float32Regs:$y))>,
      Requires<[doF32FTZ, allowUnsafeFPMath]>;
def : Pat<(frem Float32Regs:$x, fpimm:$y),
          (FSUBf32rr_ftz
            Float32Regs:$x,
            (FMULf32ri_ftz
              (CVT_f32_f32
                (FDIV32ri_prec_ftz Float32Regs:$x, fpimm:$y),
                CvtRZI_FTZ),
              fpimm:$y))>,
      Requires<[doF32FTZ, allowUnsafeFPMath]>;

// noUnsafeFPMath: the same computation wrapped in a select on TESTINF(y).
def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
          (SELP_f32rr
            Float32Regs:$x,
            (FSUBf32rr_ftz
              Float32Regs:$x,
              (FMULf32rr_ftz
                (CVT_f32_f32
                  (FDIV32rr_prec_ftz Float32Regs:$x, Float32Regs:$y),
                  CvtRZI_FTZ),
                Float32Regs:$y)),
            (TESTINF_f32r Float32Regs:$y))>,
      Requires<[doF32FTZ, noUnsafeFPMath]>;
def : Pat<(frem Float32Regs:$x, fpimm:$y),
          (SELP_f32rr
            Float32Regs:$x,
            (FSUBf32rr_ftz
              Float32Regs:$x,
              (FMULf32ri_ftz
                (CVT_f32_f32
                  (FDIV32ri_prec_ftz Float32Regs:$x, fpimm:$y),
                  CvtRZI_FTZ),
                fpimm:$y)),
            (TESTINF_f32i fpimm:$y))>,
      Requires<[doF32FTZ, noUnsafeFPMath]>;
1463
// frem - f32

// allowUnsafeFPMath: plain x - trunc(x / y) * y, with no infinity check on y.
def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
          (FSUBf32rr
            Float32Regs:$x,
            (FMULf32rr
              (CVT_f32_f32
                (FDIV32rr_prec Float32Regs:$x, Float32Regs:$y),
                CvtRZI),
              Float32Regs:$y))>,
      Requires<[allowUnsafeFPMath]>;
def : Pat<(frem Float32Regs:$x, fpimm:$y),
          (FSUBf32rr
            Float32Regs:$x,
            (FMULf32ri
              (CVT_f32_f32
                (FDIV32ri_prec Float32Regs:$x, fpimm:$y),
                CvtRZI),
              fpimm:$y))>,
      Requires<[allowUnsafeFPMath]>;

// noUnsafeFPMath: the same computation wrapped in a select on TESTINF(y).
def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
          (SELP_f32rr
            Float32Regs:$x,
            (FSUBf32rr
              Float32Regs:$x,
              (FMULf32rr
                (CVT_f32_f32
                  (FDIV32rr_prec Float32Regs:$x, Float32Regs:$y),
                  CvtRZI),
                Float32Regs:$y)),
            (TESTINF_f32r Float32Regs:$y))>,
      Requires<[noUnsafeFPMath]>;
def : Pat<(frem Float32Regs:$x, fpimm:$y),
          (SELP_f32rr
            Float32Regs:$x,
            (FSUBf32rr
              Float32Regs:$x,
              (FMULf32ri
                (CVT_f32_f32
                  (FDIV32ri_prec Float32Regs:$x, fpimm:$y),
                  CvtRZI),
                fpimm:$y)),
            (TESTINF_f32i fpimm:$y))>,
      Requires<[noUnsafeFPMath]>;
1490
// frem - f64

// allowUnsafeFPMath: plain x - trunc(x / y) * y, with no infinity check on y.
def : Pat<(frem Float64Regs:$x, Float64Regs:$y),
          (FSUBf64rr
            Float64Regs:$x,
            (FMULf64rr
              (CVT_f64_f64
                (FDIV64rr Float64Regs:$x, Float64Regs:$y),
                CvtRZI),
              Float64Regs:$y))>,
      Requires<[allowUnsafeFPMath]>;
def : Pat<(frem Float64Regs:$x, fpimm:$y),
          (FSUBf64rr
            Float64Regs:$x,
            (FMULf64ri
              (CVT_f64_f64
                (FDIV64ri Float64Regs:$x, fpimm:$y),
                CvtRZI),
              fpimm:$y))>,
      Requires<[allowUnsafeFPMath]>;

// noUnsafeFPMath, register divisor: select on TESTINF(y).
def : Pat<(frem Float64Regs:$x, Float64Regs:$y),
          (SELP_f64rr
            Float64Regs:$x,
            (FSUBf64rr
              Float64Regs:$x,
              (FMULf64rr
                (CVT_f64_f64
                  (FDIV64rr Float64Regs:$x, Float64Regs:$y),
                  CvtRZI),
                Float64Regs:$y)),
            (TESTINF_f64r Float64Regs:$y))>,
      Requires<[noUnsafeFPMath]>;
// frem - f64, immediate divisor, noUnsafeFPMath.
// $y is matched as fpimm here, so the infinity test has to be the
// immediate-operand form, as in the f32 immediate patterns (TESTINF_f32i).
// The register form (TESTINF_f64r Float64Regs:$y) would hand an immediate to
// an instruction whose input is a Float64Regs register.
def : Pat<(frem Float64Regs:$x, fpimm:$y),
          (SELP_f64rr Float64Regs:$x,
            (FSUBf64rr Float64Regs:$x, (FMULf64ri (CVT_f64_f64
              (FDIV64ri Float64Regs:$x, fpimm:$y), CvtRZI),
              fpimm:$y)),
            (TESTINF_f64i fpimm:$y))>,
          Requires<[noUnsafeFPMath]>;
1517
1518//-----------------------------------
1519// Bitwise operations
1520//-----------------------------------
1521
// Template for three-operand bitwise instructions (dst, a, b).  Creates
// .pred (predicate registers -- i.e., i1), .b16, .b32 and .b64 versions of
// OpcStr, each in a register/register (rr) and a register/immediate (ri) form.
multiclass BITWISE<string OpcStr, SDNode OpNode> {
  // i1 (predicate)
  def b1rr
    : NVPTXInst<(outs Int1Regs:$dst),
                (ins Int1Regs:$a, Int1Regs:$b),
                OpcStr # ".pred  \t$dst, $a, $b;",
                [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>;
  def b1ri
    : NVPTXInst<(outs Int1Regs:$dst),
                (ins Int1Regs:$a, i1imm:$b),
                OpcStr # ".pred  \t$dst, $a, $b;",
                [(set Int1Regs:$dst, (OpNode Int1Regs:$a, imm:$b))]>;

  // i16
  def b16rr
    : NVPTXInst<(outs Int16Regs:$dst),
                (ins Int16Regs:$a, Int16Regs:$b),
                OpcStr # ".b16  \t$dst, $a, $b;",
                [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>;
  def b16ri
    : NVPTXInst<(outs Int16Regs:$dst),
                (ins Int16Regs:$a, i16imm:$b),
                OpcStr # ".b16  \t$dst, $a, $b;",
                [(set Int16Regs:$dst, (OpNode Int16Regs:$a, imm:$b))]>;

  // i32 -- register sources are typed explicitly as i32 in the patterns.
  def b32rr
    : NVPTXInst<(outs Int32Regs:$dst),
                (ins Int32Regs:$a, Int32Regs:$b),
                OpcStr # ".b32  \t$dst, $a, $b;",
                [(set Int32Regs:$dst,
                  (OpNode (i32 Int32Regs:$a), (i32 Int32Regs:$b)))]>;
  def b32ri
    : NVPTXInst<(outs Int32Regs:$dst),
                (ins Int32Regs:$a, i32imm:$b),
                OpcStr # ".b32  \t$dst, $a, $b;",
                [(set Int32Regs:$dst,
                  (OpNode (i32 Int32Regs:$a), imm:$b))]>;

  // i64
  def b64rr
    : NVPTXInst<(outs Int64Regs:$dst),
                (ins Int64Regs:$a, Int64Regs:$b),
                OpcStr # ".b64  \t$dst, $a, $b;",
                [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>;
  def b64ri
    : NVPTXInst<(outs Int64Regs:$dst),
                (ins Int64Regs:$a, i64imm:$b),
                OpcStr # ".b64  \t$dst, $a, $b;",
                [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;
}
1558
defm OR : BITWISE<"or", or>;
defm AND : BITWISE<"and", and>;
defm XOR : BITWISE<"xor", xor>;

// PTX does not support mul on predicates, so an i1 multiply is selected as an
// 'and' instruction instead.
def : Pat<(mul Int1Regs:$a, Int1Regs:$b),
          (ANDb1rr Int1Regs:$a, Int1Regs:$b)>;
def : Pat<(mul Int1Regs:$a, (i1 imm:$b)),
          (ANDb1ri Int1Regs:$a, imm:$b)>;

// select(a, b, 0) -> and(a, b) and select(a, 1, b) -> or(a, b) on predicates.
// These transformations were once reliably performed by instcombine, but
// because of poison semantics they are no longer safe for LLVM IR, so they are
// performed here instead.
def : Pat<(select Int1Regs:$a, Int1Regs:$b, 0),
          (ANDb1rr Int1Regs:$a, Int1Regs:$b)>;
def : Pat<(select Int1Regs:$a, 1, Int1Regs:$b),
          (ORb1rr Int1Regs:$a, Int1Regs:$b)>;
1572
// Logical ops on v2i16/v4i8 values (held in Int32Regs) are lowered to the
// corresponding bitwise b32 instructions.
foreach vt = [v2i16, v4i8] in {
  // Register / register forms.
  def : Pat<(or (vt Int32Regs:$a), (vt Int32Regs:$b)),
            (ORb32rr Int32Regs:$a, Int32Regs:$b)>;
  def : Pat<(xor (vt Int32Regs:$a), (vt Int32Regs:$b)),
            (XORb32rr Int32Regs:$a, Int32Regs:$b)>;
  def : Pat<(and (vt Int32Regs:$a), (vt Int32Regs:$b)),
            (ANDb32rr Int32Regs:$a, Int32Regs:$b)>;

  // Register / constant forms.  The constants get legalized into a bitcast
  // from i32, so that is the shape matched here.
  def : Pat<(or Int32Regs:$a, (vt (bitconvert (i32 imm:$b)))),
            (ORb32ri Int32Regs:$a, imm:$b)>;
  def : Pat<(xor Int32Regs:$a, (vt (bitconvert (i32 imm:$b)))),
            (XORb32ri Int32Regs:$a, imm:$b)>;
  def : Pat<(and Int32Regs:$a, (vt (bitconvert (i32 imm:$b)))),
            (ANDb32ri Int32Regs:$a, imm:$b)>;
}
1591
// Bitwise complement for predicate, 16-, 32- and 64-bit registers.
def NOT1
    : NVPTXInst<(outs Int1Regs:$dst),
                (ins Int1Regs:$src),
                "not.pred \t$dst, $src;",
                [(set Int1Regs:$dst, (not Int1Regs:$src))]>;
def NOT16
    : NVPTXInst<(outs Int16Regs:$dst),
                (ins Int16Regs:$src),
                "not.b16 \t$dst, $src;",
                [(set Int16Regs:$dst, (not Int16Regs:$src))]>;
def NOT32
    : NVPTXInst<(outs Int32Regs:$dst),
                (ins Int32Regs:$src),
                "not.b32 \t$dst, $src;",
                [(set (i32 Int32Regs:$dst), (not (i32 Int32Regs:$src)))]>;
def NOT64
    : NVPTXInst<(outs Int64Regs:$dst),
                (ins Int64Regs:$src),
                "not.b64 \t$dst, $src;",
                [(set Int64Regs:$dst, (not Int64Regs:$src))]>;
1604
// Template for left/right shifts.  Operands are
//   [dest (reg), src (reg), shift amount (reg or imm)].
// dest and src may be int64, int32, or int16; the shift amount is always
// int32.  OpcStr is the mnemonic up to the bit width, which each def appends.
//
// A 32-bit (imm, imm) form is defined as well.
multiclass SHIFT<string OpcStr, SDNode OpNode> {
  // 64-bit value
  def i64rr
    : NVPTXInst<(outs Int64Regs:$dst),
                (ins Int64Regs:$a, Int32Regs:$b),
                OpcStr # "64 \t$dst, $a, $b;",
                [(set Int64Regs:$dst,
                  (OpNode Int64Regs:$a, (i32 Int32Regs:$b)))]>;
  def i64ri
    : NVPTXInst<(outs Int64Regs:$dst),
                (ins Int64Regs:$a, i32imm:$b),
                OpcStr # "64 \t$dst, $a, $b;",
                [(set Int64Regs:$dst,
                  (OpNode Int64Regs:$a, (i32 imm:$b)))]>;

  // 32-bit value
  def i32rr
    : NVPTXInst<(outs Int32Regs:$dst),
                (ins Int32Regs:$a, Int32Regs:$b),
                OpcStr # "32 \t$dst, $a, $b;",
                [(set Int32Regs:$dst,
                  (OpNode (i32 Int32Regs:$a), (i32 Int32Regs:$b)))]>;
  def i32ri
    : NVPTXInst<(outs Int32Regs:$dst),
                (ins Int32Regs:$a, i32imm:$b),
                OpcStr # "32 \t$dst, $a, $b;",
                [(set Int32Regs:$dst,
                  (OpNode (i32 Int32Regs:$a), (i32 imm:$b)))]>;
  def i32ii
    : NVPTXInst<(outs Int32Regs:$dst),
                (ins i32imm:$a, i32imm:$b),
                OpcStr # "32 \t$dst, $a, $b;",
                [(set Int32Regs:$dst,
                  (OpNode (i32 imm:$a), (i32 imm:$b)))]>;

  // 16-bit value
  def i16rr
    : NVPTXInst<(outs Int16Regs:$dst),
                (ins Int16Regs:$a, Int32Regs:$b),
                OpcStr # "16 \t$dst, $a, $b;",
                [(set Int16Regs:$dst,
                  (OpNode Int16Regs:$a, (i32 Int32Regs:$b)))]>;
  def i16ri
    : NVPTXInst<(outs Int16Regs:$dst),
                (ins Int16Regs:$a, i32imm:$b),
                OpcStr # "16 \t$dst, $a, $b;",
                [(set Int16Regs:$dst,
                  (OpNode Int16Regs:$a, (i32 imm:$b)))]>;
}

defm SHL : SHIFT<"shl.b", shl>;
defm SRA : SHIFT<"shr.s", sra>;
defm SRL : SHIFT<"shr.u", srl>;
1644
// Bit-reverse: selects bitreverse on i32 / i64 to brev.b32 / brev.b64.
def BREV32
    : NVPTXInst<(outs Int32Regs:$dst),
                (ins Int32Regs:$a),
                "brev.b32 \t$dst, $a;",
                [(set Int32Regs:$dst, (bitreverse (i32 Int32Regs:$a)))]>;
def BREV64
    : NVPTXInst<(outs Int64Regs:$dst),
                (ins Int64Regs:$a),
                "brev.b64 \t$dst, $a;",
                [(set Int64Regs:$dst, (bitreverse Int64Regs:$a))]>;
1654
1655//
1656// Rotate: Use ptx shf instruction if available.
1657//
1658
// 32-bit rotate left, available under hasHWROT32:
//        r2 = rotl r1, n
//    =>
//        r2 = shf.l r1, r1, n
def ROTL32imm_hw
    : NVPTXInst<(outs Int32Regs:$dst),
                (ins Int32Regs:$src, i32imm:$amt),
                "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
                [(set Int32Regs:$dst,
                  (rotl (i32 Int32Regs:$src), (i32 imm:$amt)))]>,
      Requires<[hasHWROT32]>;

def ROTL32reg_hw
    : NVPTXInst<(outs Int32Regs:$dst),
                (ins Int32Regs:$src, Int32Regs:$amt),
                "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
                [(set Int32Regs:$dst,
                  (rotl (i32 Int32Regs:$src), (i32 Int32Regs:$amt)))]>,
      Requires<[hasHWROT32]>;

// 32-bit rotate right, available under hasHWROT32:
//        r2 = rotr r1, n
//    =>
//        r2 = shf.r r1, r1, n
def ROTR32imm_hw
    : NVPTXInst<(outs Int32Regs:$dst),
                (ins Int32Regs:$src, i32imm:$amt),
                "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
                [(set Int32Regs:$dst,
                  (rotr (i32 Int32Regs:$src), (i32 imm:$amt)))]>,
      Requires<[hasHWROT32]>;

def ROTR32reg_hw
    : NVPTXInst<(outs Int32Regs:$dst),
                (ins Int32Regs:$src, Int32Regs:$amt),
                "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
                [(set Int32Regs:$dst,
                  (rotr (i32 Int32Regs:$src), (i32 Int32Regs:$amt)))]>,
      Requires<[hasHWROT32]>;
1688
// 32-bit software rotate by immediate, built from a left shift by $amt1 and a
// right shift by $amt2 whose results are added.  $amt2 should equal
// 32 - $amt1.  The instruction has no pattern of its own; the Pat<> entries
// below select it.
def ROT32imm_sw
    : NVPTXInst<(outs Int32Regs:$dst),
                (ins Int32Regs:$src, i32imm:$amt1, i32imm:$amt2),
                "{{\n\t"
                ".reg .b32 %lhs;\n\t"
                ".reg .b32 %rhs;\n\t"
                "shl.b32 \t%lhs, $src, $amt1;\n\t"
                "shr.b32 \t%rhs, $src, $amt2;\n\t"
                "add.u32 \t$dst, %lhs, %rhs;\n\t"
                "}}",
                []>;

// Maps an immediate N to the i32 target constant 32 - N.
def SUB_FRM_32 : SDNodeXForm<imm, [{
  return CurDAG->getTargetConstant(32 - N->getZExtValue(), SDLoc(N), MVT::i32);
}]>;

// rotl by amt: shift left by amt, right by 32 - amt.
def : Pat<(rotl (i32 Int32Regs:$src), (i32 imm:$amt)),
          (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>,
      Requires<[noHWROT32]>;
// rotr by amt: shift left by 32 - amt, right by amt.
def : Pat<(rotr (i32 Int32Regs:$src), (i32 imm:$amt)),
          (ROT32imm_sw Int32Regs:$src, (SUB_FRM_32 node:$amt), imm:$amt)>,
      Requires<[noHWROT32]>;
1712
// 32-bit software rotate left by register, used under noHWROT32.
//
// The amount is first reduced modulo 32 (and.b32 ... 31), matching the 64-bit
// software rotate, which masks with 63, and the hardware "shf.l.wrap" form.
// Without the mask an amount of 32 or more would not wrap: `32 - $amt` no
// longer names the complementary shift (see the PTX ISA description of
// shl/shr for how out-of-range shift amounts are treated).
def ROTL32reg_sw :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
            "{{\n\t"
            ".reg .b32 %lhs;\n\t"
            ".reg .b32 %rhs;\n\t"
            ".reg .b32 %amt2;\n\t"
            "and.b32 \t%amt2, $amt, 31;\n\t"
            "shl.b32 \t%lhs, $src, %amt2;\n\t"
            "sub.s32 \t%amt2, 32, %amt2;\n\t"
            "shr.b32 \t%rhs, $src, %amt2;\n\t"
            "add.u32 \t$dst, %lhs, %rhs;\n\t"
            "}}",
            [(set Int32Regs:$dst, (rotl (i32 Int32Regs:$src), (i32 Int32Regs:$amt)))]>,
           Requires<[noHWROT32]>;
1727
// 32-bit software rotate right by register, used under noHWROT32.
//
// The amount is first reduced modulo 32 (and.b32 ... 31), matching the 64-bit
// software rotate, which masks with 63, and the hardware "shf.r.wrap" form.
// Without the mask an amount of 32 or more would not wrap: `32 - $amt` no
// longer names the complementary shift (see the PTX ISA description of
// shl/shr for how out-of-range shift amounts are treated).
def ROTR32reg_sw :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
            "{{\n\t"
            ".reg .b32 %lhs;\n\t"
            ".reg .b32 %rhs;\n\t"
            ".reg .b32 %amt2;\n\t"
            "and.b32 \t%amt2, $amt, 31;\n\t"
            "shr.b32 \t%lhs, $src, %amt2;\n\t"
            "sub.s32 \t%amt2, 32, %amt2;\n\t"
            "shl.b32 \t%rhs, $src, %amt2;\n\t"
            "add.u32 \t$dst, %lhs, %rhs;\n\t"
            "}}",
            [(set Int32Regs:$dst, (rotr (i32 Int32Regs:$src), (i32 Int32Regs:$amt)))]>,
           Requires<[noHWROT32]>;
1742
// 64-bit software rotate by immediate, built from a left shift by $amt1 and a
// right shift by $amt2 whose results are added.  $amt2 should equal
// 64 - $amt1.  The instruction has no pattern of its own; the Pat<> entries
// below select it.
def ROT64imm_sw
    : NVPTXInst<(outs Int64Regs:$dst),
                (ins Int64Regs:$src, i32imm:$amt1, i32imm:$amt2),
                "{{\n\t"
                ".reg .b64 %lhs;\n\t"
                ".reg .b64 %rhs;\n\t"
                "shl.b64 \t%lhs, $src, $amt1;\n\t"
                "shr.b64 \t%rhs, $src, $amt2;\n\t"
                "add.u64 \t$dst, %lhs, %rhs;\n\t"
                "}}",
                []>;

// Maps an immediate N to the i32 target constant 64 - N.
def SUB_FRM_64 : SDNodeXForm<imm, [{
    return CurDAG->getTargetConstant(64-N->getZExtValue(), SDLoc(N), MVT::i32);
}]>;

// rotl by amt: shift left by amt, right by 64 - amt.
def : Pat<(rotl Int64Regs:$src, (i32 imm:$amt)),
          (ROT64imm_sw Int64Regs:$src, imm:$amt, (SUB_FRM_64 node:$amt))>;
// rotr by amt: shift left by 64 - amt, right by amt.
def : Pat<(rotr Int64Regs:$src, (i32 imm:$amt)),
          (ROT64imm_sw Int64Regs:$src, (SUB_FRM_64 node:$amt), imm:$amt)>;
1764
// 64-bit software rotate left by register.  The amount is masked with 63
// before it is used for the two shifts.
def ROTL64reg_sw
    : NVPTXInst<(outs Int64Regs:$dst),
                (ins Int64Regs:$src, Int32Regs:$amt),
                "{{\n\t"
                ".reg .b64 %lhs;\n\t"
                ".reg .b64 %rhs;\n\t"
                ".reg .u32 %amt2;\n\t"
                "and.b32 \t%amt2, $amt, 63;\n\t"
                "shl.b64 \t%lhs, $src, %amt2;\n\t"
                "sub.u32 \t%amt2, 64, %amt2;\n\t"
                "shr.b64 \t%rhs, $src, %amt2;\n\t"
                "add.u64 \t$dst, %lhs, %rhs;\n\t"
                "}}",
                [(set Int64Regs:$dst,
                  (rotl Int64Regs:$src, (i32 Int32Regs:$amt)))]>;

// 64-bit software rotate right by register.  Same scheme with the shift
// directions swapped.
def ROTR64reg_sw
    : NVPTXInst<(outs Int64Regs:$dst),
                (ins Int64Regs:$src, Int32Regs:$amt),
                "{{\n\t"
                ".reg .b64 %lhs;\n\t"
                ".reg .b64 %rhs;\n\t"
                ".reg .u32 %amt2;\n\t"
                "and.b32 \t%amt2, $amt, 63;\n\t"
                "shr.b64 \t%lhs, $src, %amt2;\n\t"
                "sub.u32 \t%amt2, 64, %amt2;\n\t"
                "shl.b64 \t%rhs, $src, %amt2;\n\t"
                "add.u64 \t$dst, %lhs, %rhs;\n\t"
                "}}",
                [(set Int64Regs:$dst,
                  (rotr Int64Regs:$src, (i32 Int32Regs:$amt)))]>;
1793
1794//
// Funnel shift in clamp mode
1796//
1797
// SDNodes for the clamp-mode funnel shifts, created so that they can be used
// from the DAG code, e.g. NVPTXISelLowering (LowerShiftLeftParts and
// LowerShiftRightParts).
def FUN_SHFL_CLAMP : SDNode<"NVPTXISD::FUN_SHFL_CLAMP", SDTIntShiftDOp, []>;
def FUN_SHFR_CLAMP : SDNode<"NVPTXISD::FUN_SHFR_CLAMP", SDTIntShiftDOp, []>;

// shf.l.clamp.b32 dst, lo, hi, amt
def FUNSHFLCLAMP
    : NVPTXInst<(outs Int32Regs:$dst),
                (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
                "shf.l.clamp.b32 \t$dst, $lo, $hi, $amt;",
                [(set Int32Regs:$dst,
                  (FUN_SHFL_CLAMP (i32 Int32Regs:$lo),
                                  (i32 Int32Regs:$hi),
                                  (i32 Int32Regs:$amt)))]>;

// shf.r.clamp.b32 dst, lo, hi, amt
def FUNSHFRCLAMP
    : NVPTXInst<(outs Int32Regs:$dst),
                (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
                "shf.r.clamp.b32 \t$dst, $lo, $hi, $amt;",
                [(set Int32Regs:$dst,
                  (FUN_SHFR_CLAMP (i32 Int32Regs:$lo),
                                  (i32 Int32Regs:$hi),
                                  (i32 Int32Regs:$amt)))]>;
1816
1817//
1818// BFE - bit-field extract
1819//
1820
// Template for BFE/BFI instructions.
// BFE args: [dest (reg), src (reg), start (reg or imm), length (reg or imm)].
// Start may be an imm only if length is also an imm.  FIXME: Is this a
// restriction in PTX?
//
// dest and src may be int32 or int64, but start and length are always int32.
// bfe: one integer result with the same type as operand 1, followed by two
// i32 operands.
def SDTBFE : SDTypeProfile<1, 3, [
  SDTCisSameAs<0, 1>, SDTCisInt<0>,
  SDTCisVT<2, i32>, SDTCisVT<3, i32>
]>;
def bfe : SDNode<"NVPTXISD::BFE", SDTBFE>;

// bfi: one integer result with the same type as operands 1 and 2, followed by
// two i32 operands.
def SDTBFI : SDTypeProfile<1, 4, [
  SDTCisInt<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
  SDTCisVT<3, i32>, SDTCisVT<4, i32>
]>;
def bfi : SDNode<"NVPTXISD::BFI", SDTBFI>;

// prmt: the result and all four operands are i32.
def SDTPRMT : SDTypeProfile<1, 4, [
  SDTCisVT<0, i32>, SDTCisVT<1, i32>,
  SDTCisVT<2, i32>, SDTCisVT<3, i32>, SDTCisVT<4, i32>
]>;
def prmt : SDNode<"NVPTXISD::PRMT", SDTPRMT>;
1841
// Bit-field extract on value type T in register class RC.  The suffix gives
// the kind of ($a, $b, $c): $a is always a register; $b and $c are i32
// registers or i32 immediates (rrr, rri, rii).
multiclass BFE<string Instr, ValueType T, RegisterClass RC> {
  def rrr : NVPTXInst<
      (outs RC:$d),
      (ins RC:$a, Int32Regs:$b, Int32Regs:$c),
      Instr # " \t$d, $a, $b, $c;",
      [(set (T RC:$d),
        (bfe (T RC:$a), (i32 Int32Regs:$b), (i32 Int32Regs:$c)))]>;
  def rri : NVPTXInst<
      (outs RC:$d),
      (ins RC:$a, Int32Regs:$b, i32imm:$c),
      Instr # " \t$d, $a, $b, $c;",
      [(set (T RC:$d),
        (bfe (T RC:$a), (i32 Int32Regs:$b), (i32 imm:$c)))]>;
  def rii : NVPTXInst<
      (outs RC:$d),
      (ins RC:$a, i32imm:$b, i32imm:$c),
      Instr # " \t$d, $a, $b, $c;",
      [(set (T RC:$d),
        (bfe (T RC:$a), (i32 imm:$b), (i32 imm:$c)))]>;
}
1859
// Bit-field insert on value type T in register class RC.  The suffix gives
// the kind of ($a, $b, $c, $d): $a is a register or an ImmCls immediate, $b
// is always a register, and $c / $d are i32 registers or i32 immediates.
multiclass BFI<string Instr, ValueType T, RegisterClass RC, Operand ImmCls> {
  // $a in a register.
  def rrrr : NVPTXInst<
      (outs RC:$f),
      (ins RC:$a, RC:$b, Int32Regs:$c, Int32Regs:$d),
      Instr # " \t$f, $a, $b, $c, $d;",
      [(set (T RC:$f),
        (bfi (T RC:$a), (T RC:$b),
             (i32 Int32Regs:$c), (i32 Int32Regs:$d)))]>;
  def rrri : NVPTXInst<
      (outs RC:$f),
      (ins RC:$a, RC:$b, Int32Regs:$c, i32imm:$d),
      Instr # " \t$f, $a, $b, $c, $d;",
      [(set (T RC:$f),
        (bfi (T RC:$a), (T RC:$b),
             (i32 Int32Regs:$c), (i32 imm:$d)))]>;
  def rrii : NVPTXInst<
      (outs RC:$f),
      (ins RC:$a, RC:$b, i32imm:$c, i32imm:$d),
      Instr # " \t$f, $a, $b, $c, $d;",
      [(set (T RC:$f),
        (bfi (T RC:$a), (T RC:$b),
             (i32 imm:$c), (i32 imm:$d)))]>;

  // $a as an immediate.
  def irrr : NVPTXInst<
      (outs RC:$f),
      (ins ImmCls:$a, RC:$b, Int32Regs:$c, Int32Regs:$d),
      Instr # " \t$f, $a, $b, $c, $d;",
      [(set (T RC:$f),
        (bfi (T imm:$a), (T RC:$b),
             (i32 Int32Regs:$c), (i32 Int32Regs:$d)))]>;
  def irri : NVPTXInst<
      (outs RC:$f),
      (ins ImmCls:$a, RC:$b, Int32Regs:$c, i32imm:$d),
      Instr # " \t$f, $a, $b, $c, $d;",
      [(set (T RC:$f),
        (bfi (T imm:$a), (T RC:$b),
             (i32 Int32Regs:$c), (i32 imm:$d)))]>;
  def irii : NVPTXInst<
      (outs RC:$f),
      (ins ImmCls:$a, RC:$b, i32imm:$c, i32imm:$d),
      Instr # " \t$f, $a, $b, $c, $d;",
      [(set (T RC:$f),
        (bfi (T imm:$a), (T RC:$b),
             (i32 imm:$c), (i32 imm:$d)))]>;
}
1892
// prmt.b32 with a printed ${mode} suffix taken from the PrmtMode operand.
// The def suffix gives the kind of ($a, $b, $c): $a is always a register;
// $b and $c are registers or i32 immediates (rrr, rri, rii).
multiclass PRMT<ValueType T, RegisterClass RC> {
  def rrr : NVPTXInst<
      (outs RC:$d),
      (ins RC:$a, Int32Regs:$b, Int32Regs:$c, PrmtMode:$mode),
      "prmt.b32${mode} \t$d, $a, $b, $c;",
      [(set (T RC:$d),
        (prmt (T RC:$a), (T RC:$b), (i32 Int32Regs:$c), imm:$mode))]>;
  def rri : NVPTXInst<
      (outs RC:$d),
      (ins RC:$a, Int32Regs:$b, i32imm:$c, PrmtMode:$mode),
      "prmt.b32${mode} \t$d, $a, $b, $c;",
      [(set (T RC:$d),
        (prmt (T RC:$a), (T RC:$b), (i32 imm:$c), imm:$mode))]>;
  def rii : NVPTXInst<
      (outs RC:$d),
      (ins RC:$a, i32imm:$b, i32imm:$c, PrmtMode:$mode),
      "prmt.b32${mode} \t$d, $a, $b, $c;",
      [(set (T RC:$d),
        (prmt (T RC:$a), (T imm:$b), (i32 imm:$c), imm:$mode))]>;
}
1910
let hasSideEffects = false in {
  // Bit-field extract.  Definition order matters here: the signed and
  // unsigned variants match the same patterns, so the first one wins.
  // Putting the unsigned form first means byte extraction always leaves zero
  // in the unused bits, which makes some optimizations easier (e.g. no need
  // to mask them).
  defm BFE_U32 : BFE<"bfe.u32", i32, Int32Regs>;
  defm BFE_S32 : BFE<"bfe.s32", i32, Int32Regs>;
  defm BFE_U64 : BFE<"bfe.u64", i64, Int64Regs>;
  defm BFE_S64 : BFE<"bfe.s64", i64, Int64Regs>;

  // Bit-field insert.
  defm BFI_B32 : BFI<"bfi.b32", i32, Int32Regs, i32imm>;
  defm BFI_B64 : BFI<"bfi.b64", i64, Int64Regs, i64imm>;

  // Byte permute.
  defm PRMT_B32 : PRMT<i32, Int32Regs>;
}
1926
1927
// Byte extraction (bfe of length 8) followed by an explicit extension to i32.
// Sign extension selects the signed BFE...
def : Pat<(i32 (sext_inreg
                 (bfe (i32 Int32Regs:$s), (i32 Int32Regs:$o), 8), i8)),
          (BFE_S32rri Int32Regs:$s, Int32Regs:$o, 8)>;
def : Pat<(i32 (sext_inreg
                 (bfe (i32 Int32Regs:$s), (i32 imm:$o), 8), i8)),
          (BFE_S32rii Int32Regs:$s, imm:$o, 8)>;
// ...and masking with 255 selects the unsigned BFE.
def : Pat<(i32 (and (bfe (i32 Int32Regs:$s), (i32 Int32Regs:$o), 8), 255)),
          (BFE_U32rri Int32Regs:$s, Int32Regs:$o, 8)>;
def : Pat<(i32 (and (bfe (i32 Int32Regs:$s), (i32 imm:$o), 8), 255)),
          (BFE_U32rii Int32Regs:$s, imm:$o, 8)>;

// Byte extraction + signed extension to i16: signed BFE, then a CVT_s8_s32.
def : Pat<(i16 (sext_inreg
                 (trunc (bfe (i32 Int32Regs:$s), (i32 imm:$o), 8)), i8)),
          (CVT_s8_s32 (BFE_S32rii Int32Regs:$s, imm:$o, 8), CvtNONE)>;
1941
1942
// Byte extraction expressed through shift/trunc/sext.

// From a 32-bit source.
def : Pat<(i16 (sext_inreg (trunc Int32Regs:$s), i8)),
          (CVT_s8_s32 Int32Regs:$s, CvtNONE)>;
def : Pat<(i16 (sext_inreg
                 (trunc (srl (i32 Int32Regs:$s), (i32 imm:$o))), i8)),
          (CVT_s8_s32 (BFE_S32rii Int32Regs:$s, imm:$o, 8), CvtNONE)>;
def : Pat<(sext_inreg (srl (i32 Int32Regs:$s), (i32 imm:$o)), i8),
          (BFE_S32rii Int32Regs:$s, imm:$o, 8)>;
def : Pat<(i16 (sra (i16 (trunc Int32Regs:$s)), (i32 8))),
          (CVT_s8_s32 (BFE_S32rii Int32Regs:$s, 8, 8), CvtNONE)>;

// From a 64-bit source.
def : Pat<(sext_inreg (srl (i64 Int64Regs:$s), (i32 imm:$o)), i8),
          (BFE_S64rii Int64Regs:$s, imm:$o, 8)>;
def : Pat<(i16 (sext_inreg (trunc Int64Regs:$s), i8)),
          (CVT_s8_s64 Int64Regs:$s, CvtNONE)>;
def : Pat<(i16 (sext_inreg
                 (trunc (srl (i64 Int64Regs:$s), (i32 imm:$o))), i8)),
          (CVT_s8_s64 (BFE_S64rii Int64Regs:$s, imm:$o, 8), CvtNONE)>;
1958
1959//-----------------------------------
1960// Comparison instructions (setp, set)
1961//-----------------------------------
1962
1963// FIXME: This doesn't cover versions of set and setp that combine with a
1964// boolean predicate, e.g. setp.eq.and.b16.
1965
let hasSideEffects = false in {
  // setp for operand type TypeStr, writing a predicate register.  The
  // comparison comes from the CmpMode operand, printed through its "base" and
  // "ftz" modifiers.  Variants: rr (reg, reg), ri (reg, imm), ir (imm, reg).
  // None of them carries a selection pattern.
  multiclass SETP<string TypeStr, RegisterClass RC, Operand ImmCls> {
    def rr : NVPTXInst<
        (outs Int1Regs:$dst),
        (ins RC:$a, RC:$b, CmpMode:$cmp),
        "setp${cmp:base}${cmp:ftz}." # TypeStr # " \t$dst, $a, $b;",
        []>;
    def ri : NVPTXInst<
        (outs Int1Regs:$dst),
        (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
        "setp${cmp:base}${cmp:ftz}." # TypeStr # " \t$dst, $a, $b;",
        []>;
    def ir : NVPTXInst<
        (outs Int1Regs:$dst),
        (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
        "setp${cmp:base}${cmp:ftz}." # TypeStr # " \t$dst, $a, $b;",
        []>;
  }
}
1982
// Integer setp variants.
defm SETP_b16 : SETP<"b16", Int16Regs, i16imm>;
defm SETP_s16 : SETP<"s16", Int16Regs, i16imm>;
defm SETP_u16 : SETP<"u16", Int16Regs, i16imm>;
defm SETP_b32 : SETP<"b32", Int32Regs, i32imm>;
defm SETP_s32 : SETP<"s32", Int32Regs, i32imm>;
defm SETP_u32 : SETP<"u32", Int32Regs, i32imm>;
defm SETP_b64 : SETP<"b64", Int64Regs, i64imm>;
defm SETP_s64 : SETP<"s64", Int64Regs, i64imm>;
defm SETP_u64 : SETP<"u64", Int64Regs, i64imm>;

// f32 / f64 setp variants.
defm SETP_f32 : SETP<"f32", Float32Regs, f32imm>;
defm SETP_f64 : SETP<"f64", Float64Regs, f64imm>;

// f16 setp: register/register only, operands in Int16Regs.
def SETP_f16rr
    : NVPTXInst<(outs Int1Regs:$dst),
                (ins Int16Regs:$a, Int16Regs:$b, CmpMode:$cmp),
                "setp${cmp:base}${cmp:ftz}.f16 \t$dst, $a, $b;",
                []>,
      Requires<[useFP16Math]>;

// f16x2 setp: operands in Int32Regs, two predicate results printed "$p|$q".
def SETP_f16x2rr
    : NVPTXInst<(outs Int1Regs:$p, Int1Regs:$q),
                (ins Int32Regs:$a, Int32Regs:$b, CmpMode:$cmp),
                "setp${cmp:base}${cmp:ftz}.f16x2 \t$p|$q, $a, $b;",
                []>,
      Requires<[useFP16Math]>;

// bf16 setp: register/register only, operands in Int16Regs.
def SETP_bf16rr
    : NVPTXInst<(outs Int1Regs:$dst),
                (ins Int16Regs:$a, Int16Regs:$b, CmpMode:$cmp),
                "setp${cmp:base}${cmp:ftz}.bf16 \t$dst, $a, $b;",
                []>,
      Requires<[hasBF16Math, hasPTX<78>, hasSM<90>]>;

// bf16x2 setp: operands in Int32Regs, two predicate results printed "$p|$q".
def SETP_bf16x2rr
    : NVPTXInst<(outs Int1Regs:$p, Int1Regs:$q),
                (ins Int32Regs:$a, Int32Regs:$b, CmpMode:$cmp),
                "setp${cmp:base}${cmp:ftz}.bf16x2 \t$p|$q, $a, $b;",
                []>,
      Requires<[hasBF16Math, hasPTX<78>, hasSM<90>]>;
2018
2019
2020// FIXME: This doesn't appear to be correct.  The "set" mnemonic has the form
2021// "set.CmpOp{.ftz}.dtype.stype", where dtype is the type of the destination
2022// reg, either u32, s32, or f32.  Anyway these aren't used at the moment.
2023
let hasSideEffects = false in {
  // "set" for operand type TypeStr, writing an Int32Regs result.  The CmpMode
  // operand is printed directly after the "set" mnemonic.  Variants:
  // rr (reg, reg), ri (reg, imm), ir (imm, reg).  None carries a selection
  // pattern.
  multiclass SET<string TypeStr, RegisterClass RC, Operand ImmCls> {
    def rr : NVPTXInst<
        (outs Int32Regs:$dst),
        (ins RC:$a, RC:$b, CmpMode:$cmp),
        "set$cmp." # TypeStr # " \t$dst, $a, $b;",
        []>;
    def ri : NVPTXInst<
        (outs Int32Regs:$dst),
        (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
        "set$cmp." # TypeStr # " \t$dst, $a, $b;",
        []>;
    def ir : NVPTXInst<
        (outs Int32Regs:$dst),
        (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
        "set$cmp." # TypeStr # " \t$dst, $a, $b;",
        []>;
  }
}
2037
// Integer "set" variants.
defm SET_b16 : SET<"b16", Int16Regs, i16imm>;
defm SET_s16 : SET<"s16", Int16Regs, i16imm>;
defm SET_u16 : SET<"u16", Int16Regs, i16imm>;
defm SET_b32 : SET<"b32", Int32Regs, i32imm>;
defm SET_s32 : SET<"s32", Int32Regs, i32imm>;
defm SET_u32 : SET<"u32", Int32Regs, i32imm>;
defm SET_b64 : SET<"b64", Int64Regs, i64imm>;
defm SET_s64 : SET<"s64", Int64Regs, i64imm>;
defm SET_u64 : SET<"u64", Int64Regs, i64imm>;

// Floating-point "set" variants; f16 and bf16 operands live in Int16Regs.
defm SET_f16 : SET<"f16", Int16Regs, f16imm>;
defm SET_bf16 : SET<"bf16", Int16Regs, bf16imm>,
                Requires<[hasPTX<78>, hasSM<90>]>;
defm SET_f32 : SET<"f32", Float32Regs, f32imm>;
defm SET_f64 : SET<"f64", Float64Regs, f64imm>;
2051
2052//-----------------------------------
2053// Data Movement (Load / Store, Move)
2054//-----------------------------------
2055
// Addressing-mode complex patterns.  Each names the C++ selector routine that
// implements it.

// Two-operand address, 32-bit pointers; rooted at frameindex.
def ADDRri
    : ComplexPattern<i32, 2, "SelectADDRri", [frameindex], [SDNPWantRoot]>;
// Two-operand address, 64-bit pointers; rooted at frameindex.
def ADDRri64
    : ComplexPattern<i64, 2, "SelectADDRri64", [frameindex], [SDNPWantRoot]>;
// Single-operand direct address.
def ADDRvar
    : ComplexPattern<iPTR, 1, "SelectDirectAddr", [], []>;
2061
// Memory operands made of a register plus an immediate, both printed by
// printMemOperand.
let PrintMethod = "printMemOperand" in {
  def MEMri   : Operand<i32> { let MIOperandInfo = (ops Int32Regs, i32imm); }
  def MEMri64 : Operand<i64> { let MIOperandInfo = (ops Int64Regs, i64imm); }
}

// Pointer-typed operands printed by the generic operand printer.
let PrintMethod = "printOperand" in {
  def imem    : Operand<iPTR>;
  def imemAny : Operand<iPTRAny>;
}

// i32 operands with dedicated printers.
def LdStCode : Operand<i32> { let PrintMethod = "printLdStCode"; }
def MmaCode  : Operand<i32> { let PrintMethod = "printMmaCode"; }
2086
// NVPTXISD::Wrapper: one pointer-typed result whose type equals that of its
// single operand.
def SDTWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
def Wrapper : SDNode<"NVPTXISD::Wrapper", SDTWrapper>;

// Move a wrapped global address into a 32-bit (u32) or 64-bit (u64)
// register.
def MOV_ADDR
    : NVPTXInst<(outs Int32Regs:$dst), (ins imem:$a),
                "mov.u32 \t$dst, $a;",
                [(set Int32Regs:$dst, (Wrapper tglobaladdr:$a))]>;
def MOV_ADDR64
    : NVPTXInst<(outs Int64Regs:$dst), (ins imem:$a),
                "mov.u64 \t$dst, $a;",
                [(set Int64Regs:$dst, (Wrapper tglobaladdr:$a))]>;
2097
// Get a pointer to the local stack: move the symbol __local_depot<num> into
// a 32-bit or 64-bit register.  Neither form has a selection pattern, and
// both are marked as free of side effects.
def MOV_DEPOT_ADDR
    : NVPTXInst<(outs Int32Regs:$d), (ins i32imm:$num),
                "mov.u32 \t$d, __local_depot$num;", []> {
  let hasSideEffects = false;
}
def MOV_DEPOT_ADDR_64
    : NVPTXInst<(outs Int64Regs:$d), (ins i32imm:$num),
                "mov.u64 \t$d, __local_depot$num;", []> {
  let hasSideEffects = false;
}
2105
2106
// Register-to-register moves.  None of them has a selection pattern;
// copyPhysReg is hard-coded in NVPTXInstrInfo.cpp.
let hasSideEffects = 0, IsSimpleMove = 1 in {
  // Integer moves, typed as predicate / unsigned.
  def IMOV1rr   : NVPTXInst<(outs Int1Regs:$dst),
                            (ins Int1Regs:$sss), "mov.pred \t$dst, $sss;", []>;
  def IMOV16rr  : NVPTXInst<(outs Int16Regs:$dst),
                            (ins Int16Regs:$sss), "mov.u16 \t$dst, $sss;", []>;
  def IMOV32rr  : NVPTXInst<(outs Int32Regs:$dst),
                            (ins Int32Regs:$sss), "mov.u32 \t$dst, $sss;", []>;
  def IMOV64rr  : NVPTXInst<(outs Int64Regs:$dst),
                            (ins Int64Regs:$sss), "mov.u64 \t$dst, $sss;", []>;
  def IMOV128rr : NVPTXInst<(outs Int128Regs:$dst),
                            (ins Int128Regs:$sss), "mov.b128 \t$dst, $sss;", []>;

  // Untyped (.bNN) moves.
  def IMOVB16rr : NVPTXInst<(outs Int16Regs:$dst),
                            (ins Int16Regs:$sss), "mov.b16 \t$dst, $sss;", []>;
  def IMOVB32rr : NVPTXInst<(outs Int32Regs:$dst),
                            (ins Int32Regs:$sss), "mov.b32 \t$dst, $sss;", []>;
  def IMOVB64rr : NVPTXInst<(outs Int64Regs:$dst),
                            (ins Int64Regs:$sss), "mov.b64 \t$dst, $sss;", []>;

  // Floating-point moves.
  // We have to use .b16 for the 16-bit form as there's no mov.f16.
  def FMOV16rr  : NVPTXInst<(outs Int16Regs:$dst),
                            (ins Int16Regs:$src), "mov.b16 \t$dst, $src;", []>;
  def FMOV32rr  : NVPTXInst<(outs Float32Regs:$dst),
                            (ins Float32Regs:$src), "mov.f32 \t$dst, $src;", []>;
  def FMOV64rr  : NVPTXInst<(outs Float64Regs:$dst),
                            (ins Float64Regs:$src), "mov.f64 \t$dst, $src;", []>;
}
2135
// Immediate-to-register integer moves; each one selects an `imm` of the
// destination type.
def IMOV1ri
    : NVPTXInst<(outs Int1Regs:$dst), (ins i1imm:$src),
                "mov.pred \t$dst, $src;",
                [(set Int1Regs:$dst, imm:$src)]>;
def IMOV16ri
    : NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src),
                "mov.u16 \t$dst, $src;",
                [(set Int16Regs:$dst, imm:$src)]>;
def IMOV32ri
    : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src),
                "mov.u32 \t$dst, $src;",
                [(set (i32 Int32Regs:$dst), imm:$src)]>;
def IMOV64ri
    : NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src),
                "mov.u64 \t$dst, $src;",
                [(set Int64Regs:$dst, imm:$src)]>;

// Untyped (.bNN) immediate moves; these carry no selection pattern.
def IMOVB16ri
    : NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src),
                "mov.b16 \t$dst, $src;", []>;
def IMOVB32ri
    : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src),
                "mov.b32 \t$dst, $src;", []>;
def IMOVB64ri
    : NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src),
                "mov.b64 \t$dst, $src;", []>;

// Floating-point immediate moves; each one selects an `fpimm`.
def FMOV32ri
    : NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$src),
                "mov.f32 \t$dst, $src;",
                [(set Float32Regs:$dst, fpimm:$src)]>;
def FMOV64ri
    : NVPTXInst<(outs Float64Regs:$dst), (ins f64imm:$src),
                "mov.f64 \t$dst, $src;",
                [(set Float64Regs:$dst, fpimm:$src)]>;

// A wrapped external symbol is materialized with the integer immediate move
// of the matching width.
def : Pat<(i32 (Wrapper texternalsym:$dst)),
          (IMOV32ri texternalsym:$dst)>;
def : Pat<(i64 (Wrapper texternalsym:$dst)),
          (IMOV64ri texternalsym:$dst)>;
2165
//---- Copy Frame Index ----
// An address matched by ADDRri/ADDRri64 is copied into a register with an
// `add`; the memory operand is printed using its "add" modifier.
def LEA_ADDRi
    : NVPTXInst<(outs Int32Regs:$dst), (ins MEMri:$addr),
                "add.u32 \t$dst, ${addr:add};",
                [(set Int32Regs:$dst, ADDRri:$addr)]>;
def LEA_ADDRi64
    : NVPTXInst<(outs Int64Regs:$dst), (ins MEMri64:$addr),
                "add.u64 \t$dst, ${addr:add};",
                [(set Int64Regs:$dst, ADDRri64:$addr)]>;
2173
2174//-----------------------------------
2175// Comparison and Selection
2176//-----------------------------------
2177
// Selection patterns for one integer comparison operator (OpNode) emitted
// with comparison mode Mode.  For every operand width (16/32/64 bits) and
// every operand form (reg/reg, reg/imm, imm/reg) there are two patterns:
// one producing an i1 predicate via the supplied setp_* instruction and one
// producing an i32 via the supplied set_* instruction.
multiclass ISET_FORMAT<PatFrag OpNode, PatLeaf Mode,
                       Instruction setp_16rr, Instruction setp_16ri,
                       Instruction setp_16ir,
                       Instruction setp_32rr, Instruction setp_32ri,
                       Instruction setp_32ir,
                       Instruction setp_64rr, Instruction setp_64ri,
                       Instruction setp_64ir,
                       Instruction set_16rr, Instruction set_16ri,
                       Instruction set_16ir,
                       Instruction set_32rr, Instruction set_32ri,
                       Instruction set_32ir,
                       Instruction set_64rr, Instruction set_64ri,
                       Instruction set_64ir> {
  //===--- Results as predicates (i1) ---===//

  // i16 operands
  def : Pat<(i1 (OpNode i16:$lhs, i16:$rhs)),
            (setp_16rr Int16Regs:$lhs, Int16Regs:$rhs, Mode)>;
  def : Pat<(i1 (OpNode Int16Regs:$lhs, imm:$rhs)),
            (setp_16ri Int16Regs:$lhs, imm:$rhs, Mode)>;
  def : Pat<(i1 (OpNode imm:$lhs, Int16Regs:$rhs)),
            (setp_16ir imm:$lhs, Int16Regs:$rhs, Mode)>;

  // i32 operands
  def : Pat<(i1 (OpNode i32:$lhs, i32:$rhs)),
            (setp_32rr Int32Regs:$lhs, Int32Regs:$rhs, Mode)>;
  def : Pat<(i1 (OpNode (i32 Int32Regs:$lhs), imm:$rhs)),
            (setp_32ri Int32Regs:$lhs, imm:$rhs, Mode)>;
  def : Pat<(i1 (OpNode imm:$lhs, (i32 Int32Regs:$rhs))),
            (setp_32ir imm:$lhs, Int32Regs:$rhs, Mode)>;

  // i64 operands
  def : Pat<(i1 (OpNode Int64Regs:$lhs, Int64Regs:$rhs)),
            (setp_64rr Int64Regs:$lhs, Int64Regs:$rhs, Mode)>;
  def : Pat<(i1 (OpNode Int64Regs:$lhs, imm:$rhs)),
            (setp_64ri Int64Regs:$lhs, imm:$rhs, Mode)>;
  def : Pat<(i1 (OpNode imm:$lhs, Int64Regs:$rhs)),
            (setp_64ir imm:$lhs, Int64Regs:$rhs, Mode)>;

  //===--- Results as i32 ---===//

  // i16 operands
  def : Pat<(i32 (OpNode i16:$lhs, i16:$rhs)),
            (set_16rr Int16Regs:$lhs, Int16Regs:$rhs, Mode)>;
  def : Pat<(i32 (OpNode Int16Regs:$lhs, imm:$rhs)),
            (set_16ri Int16Regs:$lhs, imm:$rhs, Mode)>;
  def : Pat<(i32 (OpNode imm:$lhs, Int16Regs:$rhs)),
            (set_16ir imm:$lhs, Int16Regs:$rhs, Mode)>;

  // i32 operands
  def : Pat<(i32 (OpNode i32:$lhs, i32:$rhs)),
            (set_32rr Int32Regs:$lhs, Int32Regs:$rhs, Mode)>;
  def : Pat<(i32 (OpNode (i32 Int32Regs:$lhs), imm:$rhs)),
            (set_32ri Int32Regs:$lhs, imm:$rhs, Mode)>;
  def : Pat<(i32 (OpNode imm:$lhs, (i32 Int32Regs:$rhs))),
            (set_32ir imm:$lhs, Int32Regs:$rhs, Mode)>;

  // i64 operands
  def : Pat<(i32 (OpNode Int64Regs:$lhs, Int64Regs:$rhs)),
            (set_64rr Int64Regs:$lhs, Int64Regs:$rhs, Mode)>;
  def : Pat<(i32 (OpNode Int64Regs:$lhs, imm:$rhs)),
            (set_64ri Int64Regs:$lhs, imm:$rhs, Mode)>;
  def : Pat<(i32 (OpNode imm:$lhs, Int64Regs:$rhs)),
            (set_64ir imm:$lhs, Int64Regs:$rhs, Mode)>;
}
2241
// ISET_FORMAT bound to the signed (.sNN) setp/set instructions.
multiclass ISET_FORMAT_SIGNED<PatFrag OpNode, PatLeaf Mode>
  : ISET_FORMAT<OpNode, Mode,
                // i1-producing forms
                SETP_s16rr, SETP_s16ri, SETP_s16ir,
                SETP_s32rr, SETP_s32ri, SETP_s32ir,
                SETP_s64rr, SETP_s64ri, SETP_s64ir,
                // i32-producing forms
                SET_s16rr, SET_s16ri, SET_s16ir,
                SET_s32rr, SET_s32ri, SET_s32ir,
                SET_s64rr, SET_s64ri, SET_s64ir> {
  // Placeholder record: TableGen doesn't like empty multiclasses.
  def : PatLeaf<(i32 0)>;
}
2253
// ISET_FORMAT bound to the unsigned (.uNN) setp/set instructions.
multiclass ISET_FORMAT_UNSIGNED<PatFrag OpNode, PatLeaf Mode>
  : ISET_FORMAT<OpNode, Mode,
                // i1-producing forms
                SETP_u16rr, SETP_u16ri, SETP_u16ir,
                SETP_u32rr, SETP_u32ri, SETP_u32ir,
                SETP_u64rr, SETP_u64ri, SETP_u64ir,
                // i32-producing forms
                SET_u16rr, SET_u16ri, SET_u16ir,
                SET_u32rr, SET_u32ri, SET_u32ir,
                SET_u64rr, SET_u64ri, SET_u64ir> {
  // Placeholder record: TableGen doesn't like empty multiclasses.
  def : PatLeaf<(i32 0)>;
}
2265
// Signed-instruction patterns for the plain set* condition codes.
defm : ISET_FORMAT_SIGNED<setgt,    CmpGT>;
defm : ISET_FORMAT_SIGNED<setlt,    CmpLT>;
defm : ISET_FORMAT_SIGNED<setge,    CmpGE>;
defm : ISET_FORMAT_SIGNED<setle,    CmpLE>;
defm : ISET_FORMAT_SIGNED<seteq,    CmpEQ>;
defm : ISET_FORMAT_SIGNED<setne,    CmpNE>;

// Unsigned-instruction patterns for the setu* condition codes.
defm : ISET_FORMAT_UNSIGNED<setugt, CmpGT>;
defm : ISET_FORMAT_UNSIGNED<setult, CmpLT>;
defm : ISET_FORMAT_UNSIGNED<setuge, CmpGE>;
defm : ISET_FORMAT_UNSIGNED<setule, CmpLE>;
defm : ISET_FORMAT_UNSIGNED<setueq, CmpEQ>;
defm : ISET_FORMAT_UNSIGNED<setune, CmpNE>;
2278
// i1 compares.
// Inequality of two predicates is their XOR ...
def : Pat<(setne  Int1Regs:$a, Int1Regs:$b),
          (XORb1rr Int1Regs:$a, Int1Regs:$b)>;
def : Pat<(setune Int1Regs:$a, Int1Regs:$b),
          (XORb1rr Int1Regs:$a, Int1Regs:$b)>;

// ... and equality is the negation of that XOR.
def : Pat<(seteq  Int1Regs:$a, Int1Regs:$b),
          (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
def : Pat<(setueq Int1Regs:$a, Int1Regs:$b),
          (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
2289
// Signed comparisons of i8 values that were extracted from an i32 with BFE.
// It's faster to compare the i32 results of a sign-extending BFE directly
// than to go through the long truncate / sign-extend-in-register sequence.
// For every condition code the register-offset form comes first, followed by
// the immediate-offset form.
foreach CC = [["setgt", "CmpGT"], ["setge", "CmpGE"],
              ["setlt", "CmpLT"], ["setle", "CmpLE"]] in {
  defvar CmpNode = !cast<PatFrag>(CC[0]);
  defvar CmpKind = !cast<PatLeaf>(CC[1]);

  // Bit offsets held in registers.
  def : Pat<(CmpNode
               (i16 (sext_inreg (i16 (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8))), i8)),
               (i16 (sext_inreg (i16 (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8))), i8))),
            (SETP_s32rr (BFE_S32rri $a, $oa, 8), (BFE_S32rri $b, $ob, 8),
                        CmpKind)>;

  // Bit offsets given as immediates.
  def : Pat<(CmpNode
               (i16 (sext_inreg (trunc (bfe Int32Regs:$a, imm:$oa, 8)), i8)),
               (i16 (sext_inreg (trunc (bfe Int32Regs:$b, imm:$ob, 8)), i8))),
            (SETP_s32rr (BFE_S32rii $a, imm:$oa, 8), (BFE_S32rii $b, imm:$ob, 8),
                        CmpKind)>;
}

2317
// Unsigned and equality comparisons of i8 values extracted from an i32 with
// BFE and masked with 255: compare the i32 results of a zero-extending BFE
// directly.  For every condition code the register-offset form comes first,
// followed by the immediate-offset form.
foreach CC = [["setugt", "CmpHI"], ["setuge", "CmpHS"],
              ["setult", "CmpLO"], ["setule", "CmpLS"],
              ["seteq",  "CmpEQ"], ["setne",  "CmpNE"]] in {
  defvar CmpNode = !cast<PatFrag>(CC[0]);
  defvar CmpKind = !cast<PatLeaf>(CC[1]);

  // Bit offsets held in registers.
  def : Pat<(CmpNode
               (i16 (and (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8)), 255)),
               (i16 (and (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8)), 255))),
            (SETP_u32rr (BFE_U32rri $a, $oa, 8), (BFE_U32rri $b, $ob, 8),
                        CmpKind)>;

  // Bit offsets given as immediates.
  def : Pat<(CmpNode
               (i16 (and (trunc (bfe Int32Regs:$a, imm:$oa, 8)), 255)),
               (i16 (and (trunc (bfe Int32Regs:$b, imm:$ob, 8)), 255))),
            (SETP_u32rr (BFE_U32rii $a, imm:$oa, 8), (BFE_U32rii $b, imm:$ob, 8),
                        CmpKind)>;
}
2354
// i1 compare -> i32
// The XOR of the two predicates is true exactly when they differ.  SELP
// picks its first value operand when that predicate is true, so:
//   setne: differ -> -1, equal ->  0
//   seteq: differ ->  0, equal -> -1
// (The second pattern used to match setne as well, which left it
// unreachable behind the first and gave seteq no i32 lowering.)
def : Pat<(i32 (setne Int1Regs:$a, Int1Regs:$b)),
          (SELP_u32ii -1, 0, (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
def : Pat<(i32 (seteq Int1Regs:$a, Int1Regs:$b)),
          (SELP_u32ii 0, -1, (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
2360
2361
2362
// Selection patterns for one floating-point comparison operator (OpNode).
//
//   Mode    - comparison mode used by the default patterns.
//   ModeFTZ - comparison mode used by the patterns guarded by doF32FTZ.
//
// For f16, bf16 and f32 every operand form is emitted twice: first guarded
// by doF32FTZ using ModeFTZ, then without that guard using Mode.  The guarded
// pattern deliberately precedes its unguarded twin; keep that order
// (presumably it is what lets the FTZ form win when both apply).  f64 has
// only the Mode form.  Each group exists once with an i1 result (SETP_*)
// and once with an i32 result (SET_*).
//
// f16/bf16 constants are first materialized into a register with
// LOAD_CONST_F16/LOAD_CONST_BF16, so those patterns always use the
// register/register instruction form, whichever side the constant is on.
multiclass FSET_FORMAT<PatFrag OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {
  // f16 -> pred
  def : Pat<(i1 (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b))),
            (SETP_f16rr Int16Regs:$a, Int16Regs:$b, ModeFTZ)>,
        Requires<[useFP16Math,doF32FTZ]>;
  def : Pat<(i1 (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b))),
            (SETP_f16rr Int16Regs:$a, Int16Regs:$b, Mode)>,
        Requires<[useFP16Math]>;
  def : Pat<(i1 (OpNode (f16 Int16Regs:$a), fpimm:$b)),
            (SETP_f16rr Int16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>,
        Requires<[useFP16Math,doF32FTZ]>;
  def : Pat<(i1 (OpNode (f16 Int16Regs:$a), fpimm:$b)),
            (SETP_f16rr Int16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>,
        Requires<[useFP16Math]>;
  def : Pat<(i1 (OpNode fpimm:$a, (f16 Int16Regs:$b))),
            (SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Int16Regs:$b, ModeFTZ)>,
        Requires<[useFP16Math,doF32FTZ]>;
  def : Pat<(i1 (OpNode fpimm:$a, (f16 Int16Regs:$b))),
            (SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Int16Regs:$b, Mode)>,
        Requires<[useFP16Math]>;

  // bf16 -> pred
  def : Pat<(i1 (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b))),
            (SETP_bf16rr Int16Regs:$a, Int16Regs:$b, ModeFTZ)>,
        Requires<[hasBF16Math,doF32FTZ]>;
  def : Pat<(i1 (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b))),
            (SETP_bf16rr Int16Regs:$a, Int16Regs:$b, Mode)>,
        Requires<[hasBF16Math]>;
  def : Pat<(i1 (OpNode (bf16 Int16Regs:$a), fpimm:$b)),
            (SETP_bf16rr Int16Regs:$a, (LOAD_CONST_BF16 fpimm:$b), ModeFTZ)>,
        Requires<[hasBF16Math,doF32FTZ]>;
  def : Pat<(i1 (OpNode (bf16 Int16Regs:$a), fpimm:$b)),
            (SETP_bf16rr Int16Regs:$a, (LOAD_CONST_BF16 fpimm:$b), Mode)>,
        Requires<[hasBF16Math]>;
  def : Pat<(i1 (OpNode fpimm:$a, (bf16 Int16Regs:$b))),
            (SETP_bf16rr (LOAD_CONST_BF16 fpimm:$a), Int16Regs:$b, ModeFTZ)>,
        Requires<[hasBF16Math,doF32FTZ]>;
  def : Pat<(i1 (OpNode fpimm:$a, (bf16 Int16Regs:$b))),
            (SETP_bf16rr (LOAD_CONST_BF16 fpimm:$a), Int16Regs:$b, Mode)>,
        Requires<[hasBF16Math]>;

  // f32 -> pred
  def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)),
            (SETP_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>,
        Requires<[doF32FTZ]>;
  def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)),
            (SETP_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>;
  def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)),
            (SETP_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>,
        Requires<[doF32FTZ]>;
  def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)),
            (SETP_f32ri Float32Regs:$a, fpimm:$b, Mode)>;
  def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)),
            (SETP_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>,
        Requires<[doF32FTZ]>;
  def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)),
            (SETP_f32ir fpimm:$a, Float32Regs:$b, Mode)>;

  // f64 -> pred
  def : Pat<(i1 (OpNode Float64Regs:$a, Float64Regs:$b)),
            (SETP_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>;
  def : Pat<(i1 (OpNode Float64Regs:$a, fpimm:$b)),
            (SETP_f64ri Float64Regs:$a, fpimm:$b, Mode)>;
  def : Pat<(i1 (OpNode fpimm:$a, Float64Regs:$b)),
            (SETP_f64ir fpimm:$a, Float64Regs:$b, Mode)>;

  // f16 -> i32
  def : Pat<(i32 (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b))),
            (SET_f16rr Int16Regs:$a, Int16Regs:$b, ModeFTZ)>,
        Requires<[useFP16Math, doF32FTZ]>;
  def : Pat<(i32 (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b))),
            (SET_f16rr Int16Regs:$a, Int16Regs:$b, Mode)>,
        Requires<[useFP16Math]>;
  def : Pat<(i32 (OpNode (f16 Int16Regs:$a), fpimm:$b)),
            (SET_f16rr Int16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>,
        Requires<[useFP16Math, doF32FTZ]>;
  def : Pat<(i32 (OpNode (f16 Int16Regs:$a), fpimm:$b)),
            (SET_f16rr Int16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>,
        Requires<[useFP16Math]>;
  // The constant has been loaded into a register, so the rr form is the one
  // whose operand classes match; the ir form expects an f16imm first operand.
  def : Pat<(i32 (OpNode fpimm:$a, (f16 Int16Regs:$b))),
            (SET_f16rr (LOAD_CONST_F16 fpimm:$a), Int16Regs:$b, ModeFTZ)>,
        Requires<[useFP16Math, doF32FTZ]>;
  def : Pat<(i32 (OpNode fpimm:$a, (f16 Int16Regs:$b))),
            (SET_f16rr (LOAD_CONST_F16 fpimm:$a), Int16Regs:$b, Mode)>,
        Requires<[useFP16Math]>;

  // bf16 -> i32
  def : Pat<(i32 (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b))),
            (SET_bf16rr Int16Regs:$a, Int16Regs:$b, ModeFTZ)>,
        Requires<[hasBF16Math, doF32FTZ]>;
  def : Pat<(i32 (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b))),
            (SET_bf16rr Int16Regs:$a, Int16Regs:$b, Mode)>,
        Requires<[hasBF16Math]>;
  def : Pat<(i32 (OpNode (bf16 Int16Regs:$a), fpimm:$b)),
            (SET_bf16rr Int16Regs:$a, (LOAD_CONST_BF16 fpimm:$b), ModeFTZ)>,
        Requires<[hasBF16Math, doF32FTZ]>;
  def : Pat<(i32 (OpNode (bf16 Int16Regs:$a), fpimm:$b)),
            (SET_bf16rr Int16Regs:$a, (LOAD_CONST_BF16 fpimm:$b), Mode)>,
        Requires<[hasBF16Math]>;
  // As for f16: the loaded constant is a register, so use the rr form.
  def : Pat<(i32 (OpNode fpimm:$a, (bf16 Int16Regs:$b))),
            (SET_bf16rr (LOAD_CONST_BF16 fpimm:$a), Int16Regs:$b, ModeFTZ)>,
        Requires<[hasBF16Math, doF32FTZ]>;
  def : Pat<(i32 (OpNode fpimm:$a, (bf16 Int16Regs:$b))),
            (SET_bf16rr (LOAD_CONST_BF16 fpimm:$a), Int16Regs:$b, Mode)>,
        Requires<[hasBF16Math]>;

  // f32 -> i32
  def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)),
            (SET_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>,
        Requires<[doF32FTZ]>;
  def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)),
            (SET_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>;
  def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)),
            (SET_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>,
        Requires<[doF32FTZ]>;
  def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)),
            (SET_f32ri Float32Regs:$a, fpimm:$b, Mode)>;
  def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)),
            (SET_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>,
        Requires<[doF32FTZ]>;
  def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)),
            (SET_f32ir fpimm:$a, Float32Regs:$b, Mode)>;

  // f64 -> i32
  def : Pat<(i32 (OpNode Float64Regs:$a, Float64Regs:$b)),
            (SET_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>;
  def : Pat<(i32 (OpNode Float64Regs:$a, fpimm:$b)),
            (SET_f64ri Float64Regs:$a, fpimm:$b, Mode)>;
  def : Pat<(i32 (OpNode fpimm:$a, Float64Regs:$b)),
            (SET_f64ir fpimm:$a, Float64Regs:$b, Mode)>;
}
2494
// Ordered condition codes.
defm FSetOGT : FSET_FORMAT<setogt, CmpGT,  CmpGT_FTZ>;
defm FSetOLT : FSET_FORMAT<setolt, CmpLT,  CmpLT_FTZ>;
defm FSetOGE : FSET_FORMAT<setoge, CmpGE,  CmpGE_FTZ>;
defm FSetOLE : FSET_FORMAT<setole, CmpLE,  CmpLE_FTZ>;
defm FSetOEQ : FSET_FORMAT<setoeq, CmpEQ,  CmpEQ_FTZ>;
defm FSetONE : FSET_FORMAT<setone, CmpNE,  CmpNE_FTZ>;

// Unordered condition codes, mapped to the *U comparison modes.
defm FSetUGT : FSET_FORMAT<setugt, CmpGTU, CmpGTU_FTZ>;
defm FSetULT : FSET_FORMAT<setult, CmpLTU, CmpLTU_FTZ>;
defm FSetUGE : FSET_FORMAT<setuge, CmpGEU, CmpGEU_FTZ>;
defm FSetULE : FSET_FORMAT<setule, CmpLEU, CmpLEU_FTZ>;
defm FSetUEQ : FSET_FORMAT<setueq, CmpEQU, CmpEQU_FTZ>;
defm FSetUNE : FSET_FORMAT<setune, CmpNEU, CmpNEU_FTZ>;

// Condition codes without an ordered/unordered prefix use the same modes as
// the ordered ones.
defm FSetGT  : FSET_FORMAT<setgt,  CmpGT,  CmpGT_FTZ>;
defm FSetLT  : FSET_FORMAT<setlt,  CmpLT,  CmpLT_FTZ>;
defm FSetGE  : FSET_FORMAT<setge,  CmpGE,  CmpGE_FTZ>;
defm FSetLE  : FSET_FORMAT<setle,  CmpLE,  CmpLE_FTZ>;
defm FSetEQ  : FSET_FORMAT<seteq,  CmpEQ,  CmpEQ_FTZ>;
defm FSetNE  : FSET_FORMAT<setne,  CmpNE,  CmpNE_FTZ>;

// seto / setuo map to the NUM / NAN comparison modes.
defm FSetNUM : FSET_FORMAT<seto,   CmpNUM, CmpNUM_FTZ>;
defm FSetNAN : FSET_FORMAT<setuo,  CmpNAN, CmpNAN_FTZ>;
2518
// Type profiles (result count, operand count, constraints) for the
// NVPTX-specific SelectionDAG nodes declared after them.

// Parameter declaration.
def SDTDeclareParamProfile
    : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>;
def SDTDeclareScalarParamProfile
    : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>;

// Parameter loads: one, two or four results.
def SDTLoadParamProfile
    : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>;
def SDTLoadParamV2Profile
    : SDTypeProfile<2, 2, [SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisInt<3>]>;
def SDTLoadParamV4Profile
    : SDTypeProfile<4, 2, [SDTCisInt<4>, SDTCisInt<5>]>;

// Call printing.
def SDTPrintCallProfile
    : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
def SDTPrintCallUniProfile
    : SDTypeProfile<0, 1, [SDTCisInt<0>]>;

// Parameter stores.
def SDTStoreParamProfile
    : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;
def SDTStoreParamV2Profile
    : SDTypeProfile<0, 4, [SDTCisInt<0>, SDTCisInt<1>]>;
def SDTStoreParamV4Profile
    : SDTypeProfile<0, 6, [SDTCisInt<0>, SDTCisInt<1>]>;
def SDTStoreParam32Profile
    : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;

// Call arguments, call markers and calls.
def SDTCallArgProfile
    : SDTypeProfile<0, 2, [SDTCisInt<0>]>;
def SDTCallArgMarkProfile
    : SDTypeProfile<0, 0, []>;
def SDTCallVoidProfile
    : SDTypeProfile<0, 1, []>;
def SDTCallValProfile
    : SDTypeProfile<1, 0, []>;
def SDTMoveParamProfile
    : SDTypeProfile<1, 1, []>;

// Return-value stores.
def SDTStoreRetvalProfile
    : SDTypeProfile<0, 2, [SDTCisInt<0>]>;
def SDTStoreRetvalV2Profile
    : SDTypeProfile<0, 3, [SDTCisInt<0>]>;
def SDTStoreRetvalV4Profile
    : SDTypeProfile<0, 5, [SDTCisInt<0>]>;

// Miscellaneous.
def SDTPseudoUseParamProfile
    : SDTypeProfile<0, 1, []>;
def SDTProxyRegProfile
    : SDTypeProfile<1, 1, []>;
2542
// NVPTX-specific SelectionDAG nodes.  Nodes that share a property list are
// grouped under a common `let Properties`; the declaration order is kept.

// Chained, glued on both sides, side-effecting.
let Properties = [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect] in {
  def DeclareParam
      : SDNode<"NVPTXISD::DeclareParam", SDTDeclareParamProfile>;
  def DeclareScalarParam
      : SDNode<"NVPTXISD::DeclareScalarParam", SDTDeclareScalarParamProfile>;
  def DeclareRetParam
      : SDNode<"NVPTXISD::DeclareRetParam", SDTDeclareParamProfile>;
  def DeclareRet
      : SDNode<"NVPTXISD::DeclareRet", SDTDeclareScalarParamProfile>;
}

// Chained, glued loads.
let Properties = [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue] in {
  def LoadParam
      : SDNode<"NVPTXISD::LoadParam", SDTLoadParamProfile>;
  def LoadParamV2
      : SDNode<"NVPTXISD::LoadParamV2", SDTLoadParamV2Profile>;
  def LoadParamV4
      : SDNode<"NVPTXISD::LoadParamV4", SDTLoadParamV4Profile>;
}

// Chained, glued on both sides, side-effecting.
let Properties = [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect] in {
  def PrintCall
      : SDNode<"NVPTXISD::PrintCall", SDTPrintCallProfile>;
  def PrintConvergentCall
      : SDNode<"NVPTXISD::PrintConvergentCall", SDTPrintCallProfile>;
  def PrintCallUni
      : SDNode<"NVPTXISD::PrintCallUni", SDTPrintCallUniProfile>;
  def PrintConvergentCallUni
      : SDNode<"NVPTXISD::PrintConvergentCallUni", SDTPrintCallUniProfile>;
  def StoreParam
      : SDNode<"NVPTXISD::StoreParam", SDTStoreParamProfile>;
  def StoreParamV2
      : SDNode<"NVPTXISD::StoreParamV2", SDTStoreParamV2Profile>;
  def StoreParamV4
      : SDNode<"NVPTXISD::StoreParamV4", SDTStoreParamV4Profile>;
  def StoreParamU32
      : SDNode<"NVPTXISD::StoreParamU32", SDTStoreParam32Profile>;
  def StoreParamS32
      : SDNode<"NVPTXISD::StoreParamS32", SDTStoreParam32Profile>;
  def CallArgBegin
      : SDNode<"NVPTXISD::CallArgBegin", SDTCallArgMarkProfile>;
  def CallArg
      : SDNode<"NVPTXISD::CallArg", SDTCallArgProfile>;
  def LastCallArg
      : SDNode<"NVPTXISD::LastCallArg", SDTCallArgProfile>;
  def CallArgEnd
      : SDNode<"NVPTXISD::CallArgEnd", SDTCallVoidProfile>;
  def CallVoid
      : SDNode<"NVPTXISD::CallVoid", SDTCallVoidProfile>;
  def Prototype
      : SDNode<"NVPTXISD::Prototype", SDTCallVoidProfile>;
  def CallVal
      : SDNode<"NVPTXISD::CallVal", SDTCallValProfile>;
}

// No properties at all.
def MoveParam
    : SDNode<"NVPTXISD::MoveParam", SDTMoveParamProfile, []>;

// Chained and side-effecting, but not glued.
let Properties = [SDNPHasChain, SDNPSideEffect] in {
  def StoreRetval
      : SDNode<"NVPTXISD::StoreRetval", SDTStoreRetvalProfile>;
  def StoreRetvalV2
      : SDNode<"NVPTXISD::StoreRetvalV2", SDTStoreRetvalV2Profile>;
  def StoreRetvalV4
      : SDNode<"NVPTXISD::StoreRetvalV4", SDTStoreRetvalV4Profile>;
}

def PseudoUseParam
    : SDNode<"NVPTXISD::PseudoUseParam", SDTPseudoUseParamProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def RETURNNode
    : SDNode<"NVPTXISD::RETURN", SDTCallArgMarkProfile,
             [SDNPHasChain, SDNPSideEffect]>;
def ProxyReg
    : SDNode<"NVPTXISD::ProxyReg", SDTProxyRegProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
2632
// Loads from the return-value parameter `retval0` at byte offset $b, in
// scalar, two-element and four-element vector forms.  `opstr` supplies the
// type suffix appended to the mnemonic.  None of them carries a selection
// pattern.
let mayLoad = true in {
  class LoadParamMemInst<NVPTXRegClass regclass, string opstr>
    : NVPTXInst<(outs regclass:$dst),
                (ins i32imm:$b),
                "ld.param" # opstr # " \t$dst, [retval0+$b];",
                []>;

  class LoadParamV2MemInst<NVPTXRegClass regclass, string opstr>
    : NVPTXInst<(outs regclass:$dst, regclass:$dst2),
                (ins i32imm:$b),
                "ld.param.v2" # opstr # " \t{{$dst, $dst2}}, [retval0+$b];",
                []>;

  class LoadParamV4MemInst<NVPTXRegClass regclass, string opstr>
    : NVPTXInst<(outs regclass:$dst, regclass:$dst2,
                      regclass:$dst3, regclass:$dst4),
                (ins i32imm:$b),
                "ld.param.v4" # opstr
                  # " \t{{$dst, $dst2, $dst3, $dst4}}, [retval0+$b];",
                []>;
}
2652
// Register form of a parameter load: a `mov` from `retval<b>`.  It is
// selected for a LoadParam node whose first operand is the constant 0 and
// whose second operand is the immediate $b.
class LoadParamRegInst<NVPTXRegClass regclass, string opstr>
  : NVPTXInst<(outs regclass:$dst),
              (ins i32imm:$b),
              "mov" # opstr # " \t$dst, retval$b;",
              [(set regclass:$dst, (LoadParam (i32 0), (i32 imm:$b)))]>;
2657
// Templates for `st.param` instructions.  The StoreParam* multiclasses write
// to `param$a` at offset $b; the StoreRetval* classes write to `func_retval0`
// at offset $a.  None of them carries a selection pattern.
let mayStore = true in {

  // Scalar store.  Emits an "_i" def taking IMMType and an "_r" def taking
  // regclass; the "_i" def is skipped when support_imm is false.
  multiclass StoreParamInst<NVPTXRegClass regclass, Operand IMMType,
                            string opstr, bit support_imm = true> {
    foreach op = [IMMType, regclass] in
      if !or(support_imm, !isa<NVPTXRegClass>(op)) then
        def _ # !if(!isa<NVPTXRegClass>(op), "r", "i")
            : NVPTXInst<(outs), (ins op:$val, i32imm:$a, i32imm:$b),
                        "st.param" # opstr # " \t[param$a+$b], $val;", []>;
  }

  // Two-element store.  One def per immediate/register combination of the
  // two value operands; the suffix has one letter ("i" or "r") per operand.
  multiclass StoreParamV2Inst<NVPTXRegClass regclass, Operand IMMType,
                              string opstr> {
    foreach op1 = [IMMType, regclass] in
      foreach op2 = [IMMType, regclass] in
        def _ # !if(!isa<NVPTXRegClass>(op1), "r", "i")
              # !if(!isa<NVPTXRegClass>(op2), "r", "i")
            : NVPTXInst<(outs),
                        (ins op1:$val1, op2:$val2, i32imm:$a, i32imm:$b),
                        "st.param.v2" # opstr
                            # " \t[param$a+$b], {{$val1, $val2}};",
                        []>;
  }

  // Four-element store.  One def per immediate/register combination of the
  // four value operands, again with one suffix letter per operand.
  multiclass StoreParamV4Inst<NVPTXRegClass regclass, Operand IMMType,
                              string opstr> {
    foreach op1 = [IMMType, regclass] in
      foreach op2 = [IMMType, regclass] in
        foreach op3 = [IMMType, regclass] in
          foreach op4 = [IMMType, regclass] in
            def _ # !if(!isa<NVPTXRegClass>(op1), "r", "i")
                  # !if(!isa<NVPTXRegClass>(op2), "r", "i")
                  # !if(!isa<NVPTXRegClass>(op3), "r", "i")
                  # !if(!isa<NVPTXRegClass>(op4), "r", "i")
                : NVPTXInst<(outs),
                            (ins op1:$val1, op2:$val2, op3:$val3, op4:$val4,
                                 i32imm:$a, i32imm:$b),
                            "st.param.v4" # opstr
                                # " \t[param$a+$b], {{$val1, $val2, $val3, $val4}};",
                            []>;
  }

  // Scalar store to func_retval0; register operand only.
  class StoreRetvalInst<NVPTXRegClass regclass, string opstr>
      : NVPTXInst<(outs),
                  (ins regclass:$val, i32imm:$a),
                  "st.param" # opstr # " \t[func_retval0+$a], $val;",
                  []>;

  // Two-element store to func_retval0.
  class StoreRetvalV2Inst<NVPTXRegClass regclass, string opstr>
      : NVPTXInst<(outs),
                  (ins regclass:$val, regclass:$val2, i32imm:$a),
                  "st.param.v2" # opstr
                      # " \t[func_retval0+$a], {{$val, $val2}};",
                  []>;

  // Four-element store to func_retval0.
  class StoreRetvalV4Inst<NVPTXRegClass regclass, string opstr>
      : NVPTXInst<(outs),
                  (ins regclass:$val, regclass:$val2,
                       regclass:$val3, regclass:$val4, i32imm:$a),
                  "st.param.v4" # opstr
                      # " \t[func_retval0+$a], {{$val, $val2, $val3, $val4}};",
                  []>;
}
2719
// Leading fragment of a call.  Each def matches OpNode with the constant
// operand N and prints OpcStr followed by the parenthesised list
// retval0 .. retval(N-1); the N = 0 form prints OpcStr and a space only.
let isCall = 1 in {
  multiclass CALL<string OpcStr, SDNode OpNode> {
    def PrintCallNoRetInst
        : NVPTXInst<(outs), (ins), OpcStr # " ", [(OpNode (i32 0))]>;
    def PrintCallRetInst1
        : NVPTXInst<(outs), (ins), OpcStr # " (retval0), ",
                    [(OpNode (i32 1))]>;
    def PrintCallRetInst2
        : NVPTXInst<(outs), (ins), OpcStr # " (retval0, retval1), ",
                    [(OpNode (i32 2))]>;
    def PrintCallRetInst3
        : NVPTXInst<(outs), (ins), OpcStr # " (retval0, retval1, retval2), ",
                    [(OpNode (i32 3))]>;
    def PrintCallRetInst4
        : NVPTXInst<(outs), (ins),
                    OpcStr # " (retval0, retval1, retval2, retval3), ",
                    [(OpNode (i32 4))]>;
    def PrintCallRetInst5
        : NVPTXInst<(outs), (ins),
                    OpcStr # " (retval0, retval1, retval2, retval3, retval4), ",
                    [(OpNode (i32 5))]>;
    def PrintCallRetInst6
        : NVPTXInst<(outs), (ins),
                    OpcStr # " (retval0, retval1, retval2, retval3, retval4, "
                           # "retval5), ",
                    [(OpNode (i32 6))]>;
    def PrintCallRetInst7
        : NVPTXInst<(outs), (ins),
                    OpcStr # " (retval0, retval1, retval2, retval3, retval4, "
                           # "retval5, retval6), ",
                    [(OpNode (i32 7))]>;
    def PrintCallRetInst8
        : NVPTXInst<(outs), (ins),
                    OpcStr # " (retval0, retval1, retval2, retval3, retval4, "
                           # "retval5, retval6, retval7), ",
                    [(OpNode (i32 8))]>;
  }
}
2750
// Plain and `.uni` call fragments.
defm Call    : CALL<"call", PrintCall>;
defm CallUni : CALL<"call.uni", PrintCallUni>;

// Convergent counterparts.  They print exactly like the regular calls above;
// the only difference is that the isConvergent bit is set.
let isConvergent = 1 in {
  defm ConvergentCall    : CALL<"call", PrintConvergentCall>;
  defm ConvergentCallUni : CALL<"call.uni", PrintConvergentCallUni>;
}
2760
// Concrete ld.param instructions.  The .b8 forms use Int16Regs, the same
// register class as the .b16 forms.

// Integer, scalar.
def LoadParamMemI64   : LoadParamMemInst<Int64Regs, ".b64">;
def LoadParamMemI32   : LoadParamMemInst<Int32Regs, ".b32">;
def LoadParamMemI16   : LoadParamMemInst<Int16Regs, ".b16">;
def LoadParamMemI8    : LoadParamMemInst<Int16Regs, ".b8">;
// Integer, two elements.
def LoadParamMemV2I64 : LoadParamV2MemInst<Int64Regs, ".b64">;
def LoadParamMemV2I32 : LoadParamV2MemInst<Int32Regs, ".b32">;
def LoadParamMemV2I16 : LoadParamV2MemInst<Int16Regs, ".b16">;
def LoadParamMemV2I8  : LoadParamV2MemInst<Int16Regs, ".b8">;
// Integer, four elements.
def LoadParamMemV4I32 : LoadParamV4MemInst<Int32Regs, ".b32">;
def LoadParamMemV4I16 : LoadParamV4MemInst<Int16Regs, ".b16">;
def LoadParamMemV4I8  : LoadParamV4MemInst<Int16Regs, ".b8">;
// Floating point.
def LoadParamMemF32   : LoadParamMemInst<Float32Regs, ".f32">;
def LoadParamMemF64   : LoadParamMemInst<Float64Regs, ".f64">;
def LoadParamMemV2F32 : LoadParamV2MemInst<Float32Regs, ".f32">;
def LoadParamMemV2F64 : LoadParamV2MemInst<Float64Regs, ".f64">;
def LoadParamMemV4F32 : LoadParamV4MemInst<Float32Regs, ".f32">;
2777
// Concrete st.param instructions for outgoing call parameters.

// Integer, scalar.
defm StoreParamI64 : StoreParamInst<Int64Regs, i64imm, ".b64">;
defm StoreParamI32 : StoreParamInst<Int32Regs, i32imm, ".b32">;
defm StoreParamI16 : StoreParamInst<Int16Regs, i16imm, ".b16">;
defm StoreParamI8  : StoreParamInst<Int16Regs, i8imm, ".b8">;

// .b8 stores whose value lives in a 32- or 64-bit register.  Only the
// register ("_r") def is generated for these.
defm StoreParamI8TruncI32
    : StoreParamInst<Int32Regs, i8imm, ".b8", /* support_imm */ false>;
defm StoreParamI8TruncI64
    : StoreParamInst<Int64Regs, i8imm, ".b8", /* support_imm */ false>;

// Integer, two elements.
defm StoreParamV2I64 : StoreParamV2Inst<Int64Regs, i64imm, ".b64">;
defm StoreParamV2I32 : StoreParamV2Inst<Int32Regs, i32imm, ".b32">;
defm StoreParamV2I16 : StoreParamV2Inst<Int16Regs, i16imm, ".b16">;
defm StoreParamV2I8  : StoreParamV2Inst<Int16Regs, i8imm, ".b8">;

// Integer, four elements.
defm StoreParamV4I32 : StoreParamV4Inst<Int32Regs, i32imm, ".b32">;
defm StoreParamV4I16 : StoreParamV4Inst<Int16Regs, i16imm, ".b16">;
defm StoreParamV4I8  : StoreParamV4Inst<Int16Regs, i8imm, ".b8">;

// Floating point, scalar.
defm StoreParamF32 : StoreParamInst<Float32Regs, f32imm, ".f32">;
defm StoreParamF64 : StoreParamInst<Float64Regs, f64imm, ".f64">;

// Floating point, two elements.
defm StoreParamV2F32 : StoreParamV2Inst<Float32Regs, f32imm, ".f32">;
defm StoreParamV2F64 : StoreParamV2Inst<Float64Regs, f64imm, ".f64">;

// Floating point, four elements.
defm StoreParamV4F32 : StoreParamV4Inst<Float32Regs, f32imm, ".f32">;
2802
// Concrete st.param instructions targeting func_retval0.

// Integer, scalar.
def StoreRetvalI64        : StoreRetvalInst<Int64Regs, ".b64">;
def StoreRetvalI32        : StoreRetvalInst<Int32Regs, ".b32">;
def StoreRetvalI16        : StoreRetvalInst<Int16Regs, ".b16">;
def StoreRetvalI8         : StoreRetvalInst<Int16Regs, ".b8">;
// .b8 stores whose value lives in a 32- or 64-bit register.
def StoreRetvalI8TruncI32 : StoreRetvalInst<Int32Regs, ".b8">;
def StoreRetvalI8TruncI64 : StoreRetvalInst<Int64Regs, ".b8">;
// Integer, two elements.
def StoreRetvalV2I64      : StoreRetvalV2Inst<Int64Regs, ".b64">;
def StoreRetvalV2I32      : StoreRetvalV2Inst<Int32Regs, ".b32">;
def StoreRetvalV2I16      : StoreRetvalV2Inst<Int16Regs, ".b16">;
def StoreRetvalV2I8       : StoreRetvalV2Inst<Int16Regs, ".b8">;
// Integer, four elements.
def StoreRetvalV4I32      : StoreRetvalV4Inst<Int32Regs, ".b32">;
def StoreRetvalV4I16      : StoreRetvalV4Inst<Int16Regs, ".b16">;
def StoreRetvalV4I8       : StoreRetvalV4Inst<Int16Regs, ".b8">;

// Floating point.
def StoreRetvalF64        : StoreRetvalInst<Float64Regs, ".f64">;
def StoreRetvalF32        : StoreRetvalInst<Float32Regs, ".f32">;
def StoreRetvalV2F64      : StoreRetvalV2Inst<Float64Regs, ".f64">;
def StoreRetvalV2F32      : StoreRetvalV2Inst<Float32Regs, ".f32">;
def StoreRetvalV4F32      : StoreRetvalV4Inst<Float32Regs, ".f32">;
2822
// Punctuation around a call's argument list, plus the return instruction.
// CallArgEnd with operand 1 closes the list with ");", with operand 0 it
// prints only ")".
def CallArgBeginInst
    : NVPTXInst<(outs), (ins), "(", [(CallArgBegin)]>;
def CallArgEndInst1
    : NVPTXInst<(outs), (ins), ");", [(CallArgEnd (i32 1))]>;
def CallArgEndInst0
    : NVPTXInst<(outs), (ins), ")", [(CallArgEnd (i32 0))]>;
def RETURNInst
    : NVPTXInst<(outs), (ins), "ret;", [(RETURNNode)]>;
2827
// Templates that print one register call argument.  The CallArg forms print
// "$a, " (with a trailing separator); the LastCallArg forms print "$a" alone.
// All match their node with a leading (i32 0) operand.  The *VT variants
// match on an explicit value type instead of the register class.
class CallArgInst<NVPTXRegClass regclass>
    : NVPTXInst<(outs), (ins regclass:$a),
                "$a, ",
                [(CallArg (i32 0), regclass:$a)]>;

class CallArgInstVT<NVPTXRegClass regclass, ValueType vt>
    : NVPTXInst<(outs), (ins regclass:$a),
                "$a, ",
                [(CallArg (i32 0), vt:$a)]>;

class LastCallArgInst<NVPTXRegClass regclass>
    : NVPTXInst<(outs), (ins regclass:$a),
                "$a",
                [(LastCallArg (i32 0), regclass:$a)]>;

class LastCallArgInstVT<NVPTXRegClass regclass, ValueType vt>
    : NVPTXInst<(outs), (ins regclass:$a),
                "$a",
                [(LastCallArg (i32 0), vt:$a)]>;
2842
// Register call arguments followed by a separator.  The i32 and i16 forms
// match on an explicit value type; the others match on the register class.
def CallArgI64 : CallArgInst<Int64Regs>;
def CallArgI32 : CallArgInstVT<Int32Regs, i32>;
def CallArgI16 : CallArgInstVT<Int16Regs, i16>;
def CallArgF64 : CallArgInst<Float64Regs>;
def CallArgF32 : CallArgInst<Float32Regs>;

// The same set for the final argument of a call (no separator printed).
def LastCallArgI64 : LastCallArgInst<Int64Regs>;
def LastCallArgI32 : LastCallArgInstVT<Int32Regs, i32>;
def LastCallArgI16 : LastCallArgInstVT<Int16Regs, i16>;
def LastCallArgF64 : LastCallArgInst<Float64Regs>;
def LastCallArgF32 : LastCallArgInst<Float32Regs>;
2854
// Immediate i32 call arguments: node operand 0 is (i32 0) and the immediate
// itself is printed.
def CallArgI32imm
    : NVPTXInst<(outs), (ins i32imm:$a),
                "$a, ",
                [(CallArg (i32 0), (i32 imm:$a))]>;
def LastCallArgI32imm
    : NVPTXInst<(outs), (ins i32imm:$a),
                "$a",
                [(LastCallArg (i32 0), (i32 imm:$a))]>;

// Param-symbol call arguments: node operand 0 is (i32 1) and the immediate
// is printed as the index in `param<N>`.
def CallArgParam
    : NVPTXInst<(outs), (ins i32imm:$a),
                "param$a, ",
                [(CallArg (i32 1), (i32 imm:$a))]>;
def LastCallArgParam
    : NVPTXInst<(outs), (ins i32imm:$a),
                "param$a",
                [(LastCallArg (i32 1), (i32 imm:$a))]>;
2864
// Call-target fragment: prints the target followed by ", ".  The target is
// either a wrapped global address or an address held in a 32- or 64-bit
// integer register.
def CallVoidInst
    : NVPTXInst<(outs), (ins imem:$addr),
                "$addr, ",
                [(CallVoid (Wrapper tglobaladdr:$addr))]>;
def CallVoidInstReg
    : NVPTXInst<(outs), (ins Int32Regs:$addr),
                "$addr, ",
                [(CallVoid i32:$addr)]>;
def CallVoidInstReg64
    : NVPTXInst<(outs), (ins Int64Regs:$addr),
                "$addr, ",
                [(CallVoid Int64Regs:$addr)]>;

// Prototype reference fragment: prints ", prototype_<val>;".
def PrototypeInst
    : NVPTXInst<(outs), (ins i32imm:$val),
                ", prototype_$val;",
                [(Prototype (i32 imm:$val))]>;
2873
// Declarations of the `retval<num>` symbols.
//   DeclareRetMemInst    - aligned .b8 array in .param space.
//   DeclareRetScalarInst - .param scalar; DeclareRet operand 0 is 1.
//   DeclareRetRegInst    - .reg scalar;   DeclareRet operand 0 is 2.
def DeclareRetMemInst
    : NVPTXInst<(outs),
                (ins i32imm:$align, i32imm:$size, i32imm:$num),
                ".param .align $align .b8 retval$num[$size];",
                [(DeclareRetParam (i32 imm:$align), (i32 imm:$size),
                                  (i32 imm:$num))]>;
def DeclareRetScalarInst
    : NVPTXInst<(outs),
                (ins i32imm:$size, i32imm:$num),
                ".param .b$size retval$num;",
                [(DeclareRet (i32 1), (i32 imm:$size), (i32 imm:$num))]>;
def DeclareRetRegInst
    : NVPTXInst<(outs),
                (ins i32imm:$size, i32imm:$num),
                ".reg .b$size retval$num;",
                [(DeclareRet (i32 2), (i32 imm:$size), (i32 imm:$num))]>;

// Declarations of the `param<a>` symbols.
//   DeclareParamInst       - aligned .b8 array in .param space.
//   DeclareScalarParamInst - .param scalar; last node operand is 0.
//   DeclareScalarRegInst   - .reg scalar;   last node operand is 1.
def DeclareParamInst
    : NVPTXInst<(outs),
                (ins i32imm:$align, i32imm:$a, i32imm:$size),
                ".param .align $align .b8 param$a[$size];",
                [(DeclareParam (i32 imm:$align), (i32 imm:$a),
                               (i32 imm:$size))]>;
def DeclareScalarParamInst
    : NVPTXInst<(outs),
                (ins i32imm:$a, i32imm:$size),
                ".param .b$size param$a;",
                [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 0))]>;
def DeclareScalarRegInst
    : NVPTXInst<(outs),
                (ins i32imm:$a, i32imm:$size),
                ".reg .b$size param$a;",
                [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 1))]>;
2899
// MoveParam of a register value: selected for (MoveParam reg) and printed as
// a `mov` with the given type suffix.
class MoveParamInst<ValueType T, NVPTXRegClass regclass, string asmstr>
    : NVPTXInst<(outs regclass:$dst),
                (ins regclass:$src),
                "mov" # asmstr # " \t$dst, $src;",
                [(set (T regclass:$dst), (MoveParam (T regclass:$src)))]>;

// MoveParam of an external symbol: the source is carried as an immediate
// operand of type `srcty` and matched as texternalsym.
class MoveParamSymbolInst<NVPTXRegClass regclass, Operand srcty, ValueType vt,
                          string asmstr>
    : NVPTXInst<(outs regclass:$dst),
                (ins srcty:$src),
                "mov" # asmstr # " \t$dst, $src;",
                [(set vt:$dst, (MoveParam texternalsym:$src))]>;

def MoveParamI64 : MoveParamInst<i64, Int64Regs, ".b64">;
def MoveParamI32 : MoveParamInst<i32, Int32Regs, ".b32">;

def MoveParamSymbolI64 : MoveParamSymbolInst<Int64Regs, i64imm, i64, ".b64">;
def MoveParamSymbolI32 : MoveParamSymbolInst<Int32Regs, i32imm, i32, ".b32">;

// The i16 form does not use the MoveParamInst template: it prints a cvt
// rather than a mov, although both operands are Int16Regs.
def MoveParamI16
    : NVPTXInst<(outs Int16Regs:$dst),
                (ins Int16Regs:$src),
                "cvt.u16.u32 \t$dst, $src;", // ??? Why cvt.u16.u32 ?
                [(set i16:$dst, (MoveParam i16:$src))]>;

def MoveParamF64 : MoveParamInst<f64, Float64Regs, ".f64">;
def MoveParamF32 : MoveParamInst<f32, Float32Regs, ".f32">;
2923
// Selected for (PseudoUseParam value).  Has no outputs and prints only a
// comment naming the source register.
class PseudoUseParamInst<NVPTXRegClass regclass, ValueType vt>
    : NVPTXInst<(outs),
                (ins regclass:$src),
                "// Pseudo use of $src",
                [(PseudoUseParam vt:$src)]>;

def PseudoUseParamI64 : PseudoUseParamInst<Int64Regs,   i64>;
def PseudoUseParamI32 : PseudoUseParamInst<Int32Regs,   i32>;
def PseudoUseParamI16 : PseudoUseParamInst<Int16Regs,   i16>;
def PseudoUseParamF64 : PseudoUseParamInst<Float64Regs, f64>;
def PseudoUseParamF32 : PseudoUseParamInst<Float32Regs, f32>;
2934
// Selected for (ProxyReg value) and printed as a register-to-register `mov`
// with type suffix SzStr; source and destination share one register class.
class ProxyRegInst<string SzStr, ValueType T, NVPTXRegClass regclass>
    : NVPTXInst<(outs regclass:$dst),
                (ins regclass:$src),
                "mov." # SzStr # " \t$dst, $src;",
                [(set (T regclass:$dst), (ProxyReg (T regclass:$src)))]>;

def ProxyRegI1  : ProxyRegInst<"pred", i1,  Int1Regs>;
def ProxyRegI16 : ProxyRegInst<"b16",  i16, Int16Regs>;
def ProxyRegI32 : ProxyRegInst<"b32",  i32, Int32Regs>;
def ProxyRegI64 : ProxyRegInst<"b64",  i64, Int64Regs>;
def ProxyRegF32 : ProxyRegInst<"f32",  f32, Float32Regs>;
def ProxyRegF64 : ProxyRegInst<"f64",  f64, Float64Regs>;

// f16 and bf16 values go through the 16-bit integer instruction.
foreach halfTy = [f16, bf16] in {
  def : Pat<(halfTy (ProxyReg halfTy:$src)),
            (ProxyRegI16 Int16Regs:$src)>;
}

// These vector types go through the 32-bit integer instruction.
foreach packedTy = [v2f16, v2bf16, v2i16, v4i8] in {
  def : Pat<(packedTy (ProxyReg packedTy:$src)),
            (ProxyRegI32 Int32Regs:$src)>;
}
2954
2955//
2956// Load / Store Handling
2957//
// Scalar load templates: one def per addressing form, none with a selection
// pattern.  Suffixes:
//   _avar    - imem address
//   _areg    - 32-bit register address
//   _areg_64 - 64-bit register address
//   _ari     - 32-bit register address plus immediate offset
//   _ari_64  - 64-bit register address plus immediate offset
//   _asi     - imem address plus immediate offset
multiclass LD<NVPTXRegClass regclass> {
  // Mnemonic and modifier operands common to every form.
  defvar LdAsmPrefix =
      "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth ";

  def _avar : NVPTXInst<
      (outs regclass:$dst),
      (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
           i32imm:$fromWidth, imem:$addr),
      LdAsmPrefix # "\t$dst, [$addr];", []>;

  def _areg : NVPTXInst<
      (outs regclass:$dst),
      (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
           i32imm:$fromWidth, Int32Regs:$addr),
      LdAsmPrefix # "\t$dst, [$addr];", []>;

  def _areg_64 : NVPTXInst<
      (outs regclass:$dst),
      (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
           i32imm:$fromWidth, Int64Regs:$addr),
      LdAsmPrefix # "\t$dst, [$addr];", []>;

  def _ari : NVPTXInst<
      (outs regclass:$dst),
      (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
           i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
      LdAsmPrefix # "\t$dst, [$addr+$offset];", []>;

  def _ari_64 : NVPTXInst<
      (outs regclass:$dst),
      (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
           i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
      LdAsmPrefix # "\t$dst, [$addr+$offset];", []>;

  def _asi : NVPTXInst<
      (outs regclass:$dst),
      (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
           i32imm:$fromWidth, imem:$addr, i32imm:$offset),
      LdAsmPrefix # "\t$dst, [$addr+$offset];", []>;
}
2996
// Scalar loads per value type.  i8 uses Int16Regs, the same register class
// as i16.
let mayLoad = 1, hasSideEffects = 0 in {
  defm LD_i8  : LD<Int16Regs>;
  defm LD_i16 : LD<Int16Regs>;
  defm LD_i32 : LD<Int32Regs>;
  defm LD_i64 : LD<Int64Regs>;
  defm LD_f32 : LD<Float32Regs>;
  defm LD_f64 : LD<Float64Regs>;
}
3005
// Scalar store templates: one def per addressing form, none with a selection
// pattern.  Suffixes:
//   _avar    - imem address
//   _areg    - 32-bit register address
//   _areg_64 - 64-bit register address
//   _ari     - 32-bit register address plus immediate offset
//   _ari_64  - 64-bit register address plus immediate offset
//   _asi     - imem address plus immediate offset
multiclass ST<NVPTXRegClass regclass> {
  // Mnemonic and modifier operands common to every form.
  defvar StAsmPrefix =
      "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth";

  def _avar : NVPTXInst<
      (outs),
      (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
           LdStCode:$Sign, i32imm:$toWidth, imem:$addr),
      StAsmPrefix # " \t[$addr], $src;", []>;

  def _areg : NVPTXInst<
      (outs),
      (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
           LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr),
      StAsmPrefix # " \t[$addr], $src;", []>;

  def _areg_64 : NVPTXInst<
      (outs),
      (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
           LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr),
      StAsmPrefix # " \t[$addr], $src;", []>;

  def _ari : NVPTXInst<
      (outs),
      (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
           LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr, i32imm:$offset),
      StAsmPrefix # " \t[$addr+$offset], $src;", []>;

  def _ari_64 : NVPTXInst<
      (outs),
      (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
           LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr, i32imm:$offset),
      StAsmPrefix # " \t[$addr+$offset], $src;", []>;

  def _asi : NVPTXInst<
      (outs),
      (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
           LdStCode:$Sign, i32imm:$toWidth, imem:$addr, i32imm:$offset),
      StAsmPrefix # " \t[$addr+$offset], $src;", []>;
}
3044
// Scalar stores per value type.  i8 uses Int16Regs, the same register class
// as i16.
let mayStore = 1, hasSideEffects = 0 in {
  defm ST_i8  : ST<Int16Regs>;
  defm ST_i16 : ST<Int16Regs>;
  defm ST_i32 : ST<Int32Regs>;
  defm ST_i64 : ST<Int64Regs>;
  defm ST_f32 : ST<Float32Regs>;
  defm ST_f64 : ST<Float64Regs>;
}
3053
// The following is used only in and after vector elementization.  Vector
// elementization happens at the machine instruction level, so the following
// instructions never appear in the DAG.
//
// Each addressing form (_avar, _areg, _areg_64, _ari, _ari_64, _asi) exists
// in a two-destination (_v2) and a four-destination (_v4) flavour.
multiclass LD_VEC<NVPTXRegClass regclass> {
  // Mnemonic and modifier operands common to every form.
  defvar LdAsmPrefix =
      "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth ";

  // ---- two destinations ----
  def _v2_avar : NVPTXInst<
      (outs regclass:$dst1, regclass:$dst2),
      (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
           i32imm:$fromWidth, imem:$addr),
      LdAsmPrefix # "\t{{$dst1, $dst2}}, [$addr];", []>;

  def _v2_areg : NVPTXInst<
      (outs regclass:$dst1, regclass:$dst2),
      (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
           i32imm:$fromWidth, Int32Regs:$addr),
      LdAsmPrefix # "\t{{$dst1, $dst2}}, [$addr];", []>;

  def _v2_areg_64 : NVPTXInst<
      (outs regclass:$dst1, regclass:$dst2),
      (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
           i32imm:$fromWidth, Int64Regs:$addr),
      LdAsmPrefix # "\t{{$dst1, $dst2}}, [$addr];", []>;

  def _v2_ari : NVPTXInst<
      (outs regclass:$dst1, regclass:$dst2),
      (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
           i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
      LdAsmPrefix # "\t{{$dst1, $dst2}}, [$addr+$offset];", []>;

  def _v2_ari_64 : NVPTXInst<
      (outs regclass:$dst1, regclass:$dst2),
      (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
           i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
      LdAsmPrefix # "\t{{$dst1, $dst2}}, [$addr+$offset];", []>;

  def _v2_asi : NVPTXInst<
      (outs regclass:$dst1, regclass:$dst2),
      (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
           i32imm:$fromWidth, imem:$addr, i32imm:$offset),
      LdAsmPrefix # "\t{{$dst1, $dst2}}, [$addr+$offset];", []>;

  // ---- four destinations ----
  def _v4_avar : NVPTXInst<
      (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
      (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
           i32imm:$fromWidth, imem:$addr),
      LdAsmPrefix # "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;

  def _v4_areg : NVPTXInst<
      (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
      (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
           i32imm:$fromWidth, Int32Regs:$addr),
      LdAsmPrefix # "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;

  def _v4_areg_64 : NVPTXInst<
      (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
      (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
           i32imm:$fromWidth, Int64Regs:$addr),
      LdAsmPrefix # "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;

  def _v4_ari : NVPTXInst<
      (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
      (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
           i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
      LdAsmPrefix # "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;

  def _v4_ari_64 : NVPTXInst<
      (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
      (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
           i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
      LdAsmPrefix # "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;

  def _v4_asi : NVPTXInst<
      (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
      (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
           i32imm:$fromWidth, imem:$addr, i32imm:$offset),
      LdAsmPrefix # "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
}
// Vector loads per element type.  i8 uses Int16Regs, the same register class
// as i16.
let mayLoad = 1, hasSideEffects = 0 in {
  defm LDV_i8  : LD_VEC<Int16Regs>;
  defm LDV_i16 : LD_VEC<Int16Regs>;
  defm LDV_i32 : LD_VEC<Int32Regs>;
  defm LDV_i64 : LD_VEC<Int64Regs>;
  defm LDV_f32 : LD_VEC<Float32Regs>;
  defm LDV_f64 : LD_VEC<Float64Regs>;
}
3139
// Vector store templates, none with a selection pattern.  Each addressing
// form (_avar, _areg, _areg_64, _ari, _ari_64, _asi) exists in a two-source
// (_v2) and a four-source (_v4) flavour.  The width operand is named
// $fromWidth here, as in the load templates.
multiclass ST_VEC<NVPTXRegClass regclass> {
  // Mnemonic and modifier operands common to every form.
  defvar StAsmPrefix =
      "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth ";

  // ---- two sources ----
  def _v2_avar : NVPTXInst<
      (outs),
      (ins regclass:$src1, regclass:$src2,
           LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
           i32imm:$fromWidth, imem:$addr),
      StAsmPrefix # "\t[$addr], {{$src1, $src2}};", []>;

  def _v2_areg : NVPTXInst<
      (outs),
      (ins regclass:$src1, regclass:$src2,
           LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
           i32imm:$fromWidth, Int32Regs:$addr),
      StAsmPrefix # "\t[$addr], {{$src1, $src2}};", []>;

  def _v2_areg_64 : NVPTXInst<
      (outs),
      (ins regclass:$src1, regclass:$src2,
           LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
           i32imm:$fromWidth, Int64Regs:$addr),
      StAsmPrefix # "\t[$addr], {{$src1, $src2}};", []>;

  def _v2_ari : NVPTXInst<
      (outs),
      (ins regclass:$src1, regclass:$src2,
           LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
           i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
      StAsmPrefix # "\t[$addr+$offset], {{$src1, $src2}};", []>;

  def _v2_ari_64 : NVPTXInst<
      (outs),
      (ins regclass:$src1, regclass:$src2,
           LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
           i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
      StAsmPrefix # "\t[$addr+$offset], {{$src1, $src2}};", []>;

  def _v2_asi : NVPTXInst<
      (outs),
      (ins regclass:$src1, regclass:$src2,
           LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
           i32imm:$fromWidth, imem:$addr, i32imm:$offset),
      StAsmPrefix # "\t[$addr+$offset], {{$src1, $src2}};", []>;

  // ---- four sources ----
  def _v4_avar : NVPTXInst<
      (outs),
      (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
           LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
           i32imm:$fromWidth, imem:$addr),
      StAsmPrefix # "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;

  def _v4_areg : NVPTXInst<
      (outs),
      (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
           LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
           i32imm:$fromWidth, Int32Regs:$addr),
      StAsmPrefix # "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;

  def _v4_areg_64 : NVPTXInst<
      (outs),
      (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
           LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
           i32imm:$fromWidth, Int64Regs:$addr),
      StAsmPrefix # "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;

  def _v4_ari : NVPTXInst<
      (outs),
      (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
           LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
           i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
      StAsmPrefix # "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;

  def _v4_ari_64 : NVPTXInst<
      (outs),
      (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
           LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
           i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
      StAsmPrefix # "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;

  def _v4_asi : NVPTXInst<
      (outs),
      (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
           LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
           i32imm:$fromWidth, imem:$addr, i32imm:$offset),
      StAsmPrefix # "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
}
3223
// Vector stores per element type.  i8 uses Int16Regs, the same register
// class as i16.
let mayStore = 1, hasSideEffects = 0 in {
  defm STV_i8  : ST_VEC<Int16Regs>;
  defm STV_i16 : ST_VEC<Int16Regs>;
  defm STV_i32 : ST_VEC<Int32Regs>;
  defm STV_i64 : ST_VEC<Int64Regs>;
  defm STV_f32 : ST_VEC<Float32Regs>;
  defm STV_f64 : ST_VEC<Float64Regs>;
}
3232
3233//---- Conversion ----
3234
// Bitcast between two value types, printed as `mov.b<SzStr>`.  The register
// classes default to whatever ValueToRegClass yields for TIn and TOut.
class F_BITCONVERT<string SzStr, ValueType TIn, ValueType TOut,
                   NVPTXRegClass regclassIn = ValueToRegClass<TIn>.ret,
                   NVPTXRegClass regclassOut = ValueToRegClass<TOut>.ret>
    : NVPTXInst<(outs regclassOut:$d),
                (ins regclassIn:$a),
                "mov.b" # SzStr # " \t$d, $a;",
                [(set (TOut regclassOut:$d),
                      (bitconvert (TIn regclassIn:$a)))]>;

def BITCONVERT_32_I2F : F_BITCONVERT<"32", i32, f32>;
def BITCONVERT_32_F2I : F_BITCONVERT<"32", f32, i32>;
def BITCONVERT_64_I2F : F_BITCONVERT<"64", i64, f64>;
def BITCONVERT_64_F2I : F_BITCONVERT<"64", f64, i64>;
3246
// Bitcasts between f32 and the vector types held in Int32Regs reuse the
// 32-bit int<->float moves.
foreach packedTy = [v2f16, v2bf16, v2i16, v4i8] in {
  def : Pat<(packedTy (bitconvert (f32 Float32Regs:$a))),
            (BITCONVERT_32_F2I Float32Regs:$a)>;
  def : Pat<(f32 (bitconvert (packedTy Int32Regs:$a))),
            (BITCONVERT_32_I2F Int32Regs:$a)>;
}

// Bitcasts between i16 and the half types: a constant becomes a 16-bit
// immediate move, a register goes through ProxyRegI16 in either direction.
foreach halfTy = [f16, bf16] in {
  def : Pat<(halfTy (bitconvert (i16 UInt16Const:$a))),
            (IMOVB16ri UInt16Const:$a)>;
  def : Pat<(halfTy (bitconvert (i16 Int16Regs:$a))),
            (ProxyRegI16 Int16Regs:$a)>;
  def : Pat<(i16 (bitconvert (halfTy Int16Regs:$a))),
            (ProxyRegI16 Int16Regs:$a)>;
}

// Bitcasts among the types held in Int32Regs: a constant becomes a 32-bit
// immediate move, and every pair of distinct types goes through ProxyRegI32.
foreach dstTy = [v2f16, v2bf16, v2i16, v4i8, i32] in {
  def : Pat<(dstTy (bitconvert (i32 UInt32Const:$a))),
            (IMOVB32ri UInt32Const:$a)>;
  foreach srcTy = [v2f16, v2bf16, v2i16, v4i8, i32] in {
    if !ne(dstTy, srcTy) then {
      def : Pat<(dstTy (bitconvert (srcTy Int32Regs:$a))),
                (ProxyRegI32 Int32Regs:$a)>;
    }
  }
}
3272
// NOTE: pred->fp conversions are currently sub-optimal due to an issue in
// TableGen where we cannot specify floating-point literals in isel patterns.
// Therefore, we use an integer selp to materialize the predicate as an i32
// and then cvt that to floating-point.
//
// The value selected for a true predicate depends on signedness: an i1
// holding 1 is 1 when read as unsigned but -1 when read as signed, so
// uint_to_fp selects 1 (giving 1.0) and sint_to_fp selects -1 (giving -1.0).

// sint -> f16
def : Pat<(f16 (sint_to_fp Int1Regs:$a)),
          (CVT_f16_s32 (SELP_s32ii -1, 0, Int1Regs:$a), CvtRN)>;
def : Pat<(f16 (sint_to_fp Int16Regs:$a)),
          (CVT_f16_s16 Int16Regs:$a, CvtRN)>;
def : Pat<(f16 (sint_to_fp Int32Regs:$a)),
          (CVT_f16_s32 Int32Regs:$a, CvtRN)>;
def : Pat<(f16 (sint_to_fp Int64Regs:$a)),
          (CVT_f16_s64 Int64Regs:$a, CvtRN)>;

// uint -> f16
def : Pat<(f16 (uint_to_fp Int1Regs:$a)),
          (CVT_f16_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
def : Pat<(f16 (uint_to_fp Int16Regs:$a)),
          (CVT_f16_u16 Int16Regs:$a, CvtRN)>;
def : Pat<(f16 (uint_to_fp Int32Regs:$a)),
          (CVT_f16_u32 Int32Regs:$a, CvtRN)>;
def : Pat<(f16 (uint_to_fp Int64Regs:$a)),
          (CVT_f16_u64 Int64Regs:$a, CvtRN)>;

// sint -> bf16 (gated on PTX 7.8 and SM 90 via the Requires predicates)
def : Pat<(bf16 (sint_to_fp Int1Regs:$a)),
          (CVT_bf16_s32 (SELP_s32ii -1, 0, Int1Regs:$a), CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
def : Pat<(bf16 (sint_to_fp Int16Regs:$a)),
          (CVT_bf16_s16 Int16Regs:$a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
def : Pat<(bf16 (sint_to_fp Int32Regs:$a)),
          (CVT_bf16_s32 Int32Regs:$a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
def : Pat<(bf16 (sint_to_fp Int64Regs:$a)),
          (CVT_bf16_s64 Int64Regs:$a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;

// uint -> bf16 (gated on PTX 7.8 and SM 90 via the Requires predicates)
def : Pat<(bf16 (uint_to_fp Int1Regs:$a)),
          (CVT_bf16_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
def : Pat<(bf16 (uint_to_fp Int16Regs:$a)),
          (CVT_bf16_u16 Int16Regs:$a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
def : Pat<(bf16 (uint_to_fp Int32Regs:$a)),
          (CVT_bf16_u32 Int32Regs:$a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
def : Pat<(bf16 (uint_to_fp Int64Regs:$a)),
          (CVT_bf16_u64 Int64Regs:$a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;

// sint -> f32
def : Pat<(f32 (sint_to_fp Int1Regs:$a)),
          (CVT_f32_s32 (SELP_s32ii -1, 0, Int1Regs:$a), CvtRN)>;
def : Pat<(f32 (sint_to_fp Int16Regs:$a)),
          (CVT_f32_s16 Int16Regs:$a, CvtRN)>;
def : Pat<(f32 (sint_to_fp Int32Regs:$a)),
          (CVT_f32_s32 Int32Regs:$a, CvtRN)>;
def : Pat<(f32 (sint_to_fp Int64Regs:$a)),
          (CVT_f32_s64 Int64Regs:$a, CvtRN)>;
3326
3327// uint -> f32
3328def : Pat<(f32 (uint_to_fp Int1Regs:$a)),
3329          (CVT_f32_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
3330def : Pat<(f32 (uint_to_fp Int16Regs:$a)),
3331          (CVT_f32_u16 Int16Regs:$a, CvtRN)>;
3332def : Pat<(f32 (uint_to_fp Int32Regs:$a)),
3333          (CVT_f32_u32 Int32Regs:$a, CvtRN)>;
3334def : Pat<(f32 (uint_to_fp Int64Regs:$a)),
3335          (CVT_f32_u64 Int64Regs:$a, CvtRN)>;
3336
3337// sint -> f64
3338def : Pat<(f64 (sint_to_fp Int1Regs:$a)),
3339          (CVT_f64_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
3340def : Pat<(f64 (sint_to_fp Int16Regs:$a)),
3341          (CVT_f64_s16 Int16Regs:$a, CvtRN)>;
3342def : Pat<(f64 (sint_to_fp Int32Regs:$a)),
3343          (CVT_f64_s32 Int32Regs:$a, CvtRN)>;
3344def : Pat<(f64 (sint_to_fp Int64Regs:$a)),
3345          (CVT_f64_s64 Int64Regs:$a, CvtRN)>;
3346
3347// uint -> f64
3348def : Pat<(f64 (uint_to_fp Int1Regs:$a)),
3349          (CVT_f64_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
3350def : Pat<(f64 (uint_to_fp Int16Regs:$a)),
3351          (CVT_f64_u16 Int16Regs:$a, CvtRN)>;
3352def : Pat<(f64 (uint_to_fp Int32Regs:$a)),
3353          (CVT_f64_u32 Int32Regs:$a, CvtRN)>;
3354def : Pat<(f64 (uint_to_fp Int64Regs:$a)),
3355          (CVT_f64_u64 Int64Regs:$a, CvtRN)>;
3356
3357
// f16 / bf16 -> integer conversions.  The i16/i32/i64 results use the
// matching CVT_* instruction with the CvtRZI mode operand.  The i1 results are
// produced by a 16-bit setp comparing the source register against 0.
// NOTE(review): the i1 patterns use CmpEQ, i.e. the predicate is true when
// the source compares equal to 0 -- confirm this is the intended sense.
3358// f16 -> sint
3359def : Pat<(i1 (fp_to_sint (f16 Int16Regs:$a))),
3360          (SETP_b16ri Int16Regs:$a, 0, CmpEQ)>;
3361def : Pat<(i16 (fp_to_sint (f16 Int16Regs:$a))),
3362          (CVT_s16_f16 (f16 Int16Regs:$a), CvtRZI)>;
3363def : Pat<(i32 (fp_to_sint (f16 Int16Regs:$a))),
3364          (CVT_s32_f16 (f16 Int16Regs:$a), CvtRZI)>;
3365def : Pat<(i64 (fp_to_sint (f16 Int16Regs:$a))),
3366          (CVT_s64_f16 Int16Regs:$a, CvtRZI)>;
3367
3368// f16 -> uint
3369def : Pat<(i1 (fp_to_uint (f16 Int16Regs:$a))),
3370          (SETP_b16ri Int16Regs:$a, 0, CmpEQ)>;
3371def : Pat<(i16 (fp_to_uint (f16 Int16Regs:$a))),
3372          (CVT_u16_f16 Int16Regs:$a, CvtRZI)>;
3373def : Pat<(i32 (fp_to_uint (f16 Int16Regs:$a))),
3374          (CVT_u32_f16 Int16Regs:$a, CvtRZI)>;
3375def : Pat<(i64 (fp_to_uint (f16 Int16Regs:$a))),
3376          (CVT_u64_f16 Int16Regs:$a, CvtRZI)>;
3377
3378// bf16 -> sint
3379def : Pat<(i1 (fp_to_sint (bf16 Int16Regs:$a))),
3380          (SETP_b16ri Int16Regs:$a, 0, CmpEQ)>;
3381def : Pat<(i16 (fp_to_sint (bf16 Int16Regs:$a))),
3382          (CVT_s16_bf16 (bf16 Int16Regs:$a), CvtRZI)>;
3383def : Pat<(i32 (fp_to_sint (bf16 Int16Regs:$a))),
3384          (CVT_s32_bf16 (bf16 Int16Regs:$a), CvtRZI)>;
3385def : Pat<(i64 (fp_to_sint (bf16 Int16Regs:$a))),
3386          (CVT_s64_bf16 Int16Regs:$a, CvtRZI)>;
3387
3388// bf16 -> uint
3389def : Pat<(i1 (fp_to_uint (bf16 Int16Regs:$a))),
3390          (SETP_b16ri Int16Regs:$a, 0, CmpEQ)>;
3391def : Pat<(i16 (fp_to_uint (bf16 Int16Regs:$a))),
3392          (CVT_u16_bf16 Int16Regs:$a, CvtRZI)>;
3393def : Pat<(i32 (fp_to_uint (bf16 Int16Regs:$a))),
3394          (CVT_u32_bf16 Int16Regs:$a, CvtRZI)>;
3395def : Pat<(i64 (fp_to_uint (bf16 Int16Regs:$a))),
3396          (CVT_u64_bf16 Int16Regs:$a, CvtRZI)>;
// f32 -> integer conversions.  Each i16/i32/i64 result has two patterns for
// the same DAG: the one guarded by doF32FTZ passes CvtRZI_FTZ and is listed
// first; the unguarded one passes CvtRZI.  The i1 results bit-convert the
// float to a 32-bit integer and compare it against 0 with setp.
// NOTE(review): the i1 patterns use CmpEQ (true when the bits are all zero)
// -- confirm this is the intended sense.
3397// f32 -> sint
3398def : Pat<(i1 (fp_to_sint Float32Regs:$a)),
3399          (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>;
3400def : Pat<(i16 (fp_to_sint Float32Regs:$a)),
3401          (CVT_s16_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
3402def : Pat<(i16 (fp_to_sint Float32Regs:$a)),
3403          (CVT_s16_f32 Float32Regs:$a, CvtRZI)>;
3404def : Pat<(i32 (fp_to_sint Float32Regs:$a)),
3405          (CVT_s32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
3406def : Pat<(i32 (fp_to_sint Float32Regs:$a)),
3407          (CVT_s32_f32 Float32Regs:$a, CvtRZI)>;
3408def : Pat<(i64 (fp_to_sint Float32Regs:$a)),
3409          (CVT_s64_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
3410def : Pat<(i64 (fp_to_sint Float32Regs:$a)),
3411          (CVT_s64_f32 Float32Regs:$a, CvtRZI)>;
3412
3413// f32 -> uint
3414def : Pat<(i1 (fp_to_uint Float32Regs:$a)),
3415          (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>;
3416def : Pat<(i16 (fp_to_uint Float32Regs:$a)),
3417          (CVT_u16_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
3418def : Pat<(i16 (fp_to_uint Float32Regs:$a)),
3419          (CVT_u16_f32 Float32Regs:$a, CvtRZI)>;
3420def : Pat<(i32 (fp_to_uint Float32Regs:$a)),
3421          (CVT_u32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
3422def : Pat<(i32 (fp_to_uint Float32Regs:$a)),
3423          (CVT_u32_f32 Float32Regs:$a, CvtRZI)>;
3424def : Pat<(i64 (fp_to_uint Float32Regs:$a)),
3425          (CVT_u64_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
3426def : Pat<(i64 (fp_to_uint Float32Regs:$a)),
3427          (CVT_u64_f32 Float32Regs:$a, CvtRZI)>;
3428
// f64 -> integer conversions.  The i16/i32/i64 results use CVT_*_f64 with the
// CvtRZI mode operand (no FTZ variants are defined here).  The i1 results
// bit-convert the double to a 64-bit integer and compare it against 0.
// NOTE(review): the i1 patterns use CmpEQ (true when the bits are all zero)
// -- confirm this is the intended sense.
3429// f64 -> sint
3430def : Pat<(i1 (fp_to_sint Float64Regs:$a)),
3431          (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>;
3432def : Pat<(i16 (fp_to_sint Float64Regs:$a)),
3433          (CVT_s16_f64 Float64Regs:$a, CvtRZI)>;
3434def : Pat<(i32 (fp_to_sint Float64Regs:$a)),
3435          (CVT_s32_f64 Float64Regs:$a, CvtRZI)>;
3436def : Pat<(i64 (fp_to_sint Float64Regs:$a)),
3437          (CVT_s64_f64 Float64Regs:$a, CvtRZI)>;
3438
3439// f64 -> uint
3440def : Pat<(i1 (fp_to_uint Float64Regs:$a)),
3441          (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>;
3442def : Pat<(i16 (fp_to_uint Float64Regs:$a)),
3443          (CVT_u16_f64 Float64Regs:$a, CvtRZI)>;
3444def : Pat<(i32 (fp_to_uint Float64Regs:$a)),
3445          (CVT_u32_f64 Float64Regs:$a, CvtRZI)>;
3446def : Pat<(i64 (fp_to_uint Float64Regs:$a)),
3447          (CVT_u64_f64 Float64Regs:$a, CvtRZI)>;
3448
// Extensions of an i1 predicate are implemented with an immediate selp on the
// predicate: the first immediate is the result for true, the second for false.
3449// sext i1
// true -> -1 (all ones), false -> 0.
3450def : Pat<(i16 (sext Int1Regs:$a)),
3451          (SELP_s16ii -1, 0, Int1Regs:$a)>;
3452def : Pat<(i32 (sext Int1Regs:$a)),
3453          (SELP_s32ii -1, 0, Int1Regs:$a)>;
3454def : Pat<(i64 (sext Int1Regs:$a)),
3455          (SELP_s64ii -1, 0, Int1Regs:$a)>;
3456
3457// zext i1
// true -> 1, false -> 0.
3458def : Pat<(i16 (zext Int1Regs:$a)),
3459          (SELP_u16ii 1, 0, Int1Regs:$a)>;
3460def : Pat<(i32 (zext Int1Regs:$a)),
3461          (SELP_u32ii 1, 0, Int1Regs:$a)>;
3462def : Pat<(i64 (zext Int1Regs:$a)),
3463          (SELP_u64ii 1, 0, Int1Regs:$a)>;
3464
3465// anyext i1
// The upper bits are unspecified for anyext; these patterns pick -1 for true
// and 0 for false.
3466def : Pat<(i16 (anyext Int1Regs:$a)),
3467          (SELP_u16ii -1, 0, Int1Regs:$a)>;
3468def : Pat<(i32 (anyext Int1Regs:$a)),
3469          (SELP_u32ii -1, 0, Int1Regs:$a)>;
3470def : Pat<(i64 (anyext Int1Regs:$a)),
3471          (SELP_u64ii -1, 0, Int1Regs:$a)>;
3472
// Extensions of i16 and i32 values use integer cvt with the CvtNONE mode
// operand: signed cvt for sext, unsigned cvt for zext.  anyext selects the
// same unsigned cvt as zext.
3473// sext i16
3474def : Pat<(i32 (sext Int16Regs:$a)),
3475          (CVT_s32_s16 Int16Regs:$a, CvtNONE)>;
3476def : Pat<(i64 (sext Int16Regs:$a)),
3477          (CVT_s64_s16 Int16Regs:$a, CvtNONE)>;
3478
3479// zext i16
3480def : Pat<(i32 (zext Int16Regs:$a)),
3481          (CVT_u32_u16 Int16Regs:$a, CvtNONE)>;
3482def : Pat<(i64 (zext Int16Regs:$a)),
3483          (CVT_u64_u16 Int16Regs:$a, CvtNONE)>;
3484
3485// anyext i16
3486def : Pat<(i32 (anyext Int16Regs:$a)),
3487          (CVT_u32_u16 Int16Regs:$a, CvtNONE)>;
3488def : Pat<(i64 (anyext Int16Regs:$a)),
3489          (CVT_u64_u16 Int16Regs:$a, CvtNONE)>;
3490
3491// sext i32
3492def : Pat<(i64 (sext Int32Regs:$a)),
3493          (CVT_s64_s32 Int32Regs:$a, CvtNONE)>;
3494
3495// zext i32
3496def : Pat<(i64 (zext Int32Regs:$a)),
3497          (CVT_u64_u32 Int32Regs:$a, CvtNONE)>;
3498
3499// anyext i32
3500def : Pat<(i64 (anyext Int32Regs:$a)),
3501          (CVT_u64_u32 Int32Regs:$a, CvtNONE)>;
3502
3503
// Integer truncation.  Truncation to i16/i32 uses an unsigned cvt with the
// CvtNONE mode operand.  Truncation to i1 keeps only bit 0: the source is
// ANDed with 1 and the result compared for equality with 1.
3504// truncate i64
3505def : Pat<(i32 (trunc Int64Regs:$a)),
3506          (CVT_u32_u64 Int64Regs:$a, CvtNONE)>;
3507def : Pat<(i16 (trunc Int64Regs:$a)),
3508          (CVT_u16_u64 Int64Regs:$a, CvtNONE)>;
3509def : Pat<(i1 (trunc Int64Regs:$a)),
3510          (SETP_b64ri (ANDb64ri Int64Regs:$a, 1), 1, CmpEQ)>;
3511
3512// truncate i32
3513def : Pat<(i16 (trunc Int32Regs:$a)),
3514          (CVT_u16_u32 Int32Regs:$a, CvtNONE)>;
3515def : Pat<(i1 (trunc Int32Regs:$a)),
3516          (SETP_b32ri (ANDb32ri Int32Regs:$a, 1), 1, CmpEQ)>;
3517
3518// truncate i16
3519def : Pat<(i1 (trunc Int16Regs:$a)),
3520          (SETP_b16ri (ANDb16ri Int16Regs:$a, 1), 1, CmpEQ)>;
3521
3522// sext_inreg
// One pattern per (register width, narrow type) pair; each selects the
// CVT_INREG_s<width>_s<narrow> instruction of the same name.
3523def : Pat<(sext_inreg Int16Regs:$a, i8), (CVT_INREG_s16_s8 Int16Regs:$a)>;
3524def : Pat<(sext_inreg Int32Regs:$a, i8), (CVT_INREG_s32_s8 Int32Regs:$a)>;
3525def : Pat<(sext_inreg Int32Regs:$a, i16), (CVT_INREG_s32_s16 Int32Regs:$a)>;
3526def : Pat<(sext_inreg Int64Regs:$a, i8), (CVT_INREG_s64_s8 Int64Regs:$a)>;
3527def : Pat<(sext_inreg Int64Regs:$a, i16), (CVT_INREG_s64_s16 Int64Regs:$a)>;
3528def : Pat<(sext_inreg Int64Regs:$a, i32), (CVT_INREG_s64_s32 Int64Regs:$a)>;
3529
3530
3531// Select instructions with 32-bit predicates
// The i32 condition is turned into an i1 predicate by masking bit 0
// (ANDb32ri ..., 1) and comparing it for equality with 1; that predicate then
// drives the register-register selp of the value type.
3532def : Pat<(select (i32 Int32Regs:$pred), i16:$a, i16:$b),
3533          (SELP_b16rr Int16Regs:$a, Int16Regs:$b,
3534          (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
3535def : Pat<(select (i32 Int32Regs:$pred), i32:$a, i32:$b),
3536          (SELP_b32rr Int32Regs:$a, Int32Regs:$b,
3537          (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
3538def : Pat<(select (i32 Int32Regs:$pred), Int64Regs:$a, Int64Regs:$b),
3539          (SELP_b64rr Int64Regs:$a, Int64Regs:$b,
3540          (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
3541def : Pat<(select (i32 Int32Regs:$pred), (f16 Int16Regs:$a), (f16 Int16Regs:$b)),
3542          (SELP_f16rr Int16Regs:$a, Int16Regs:$b,
3543          (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
3544def : Pat<(select (i32 Int32Regs:$pred), (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)),
3545          (SELP_bf16rr Int16Regs:$a, Int16Regs:$b,
3546          (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
3547def : Pat<(select (i32 Int32Regs:$pred), Float32Regs:$a, Float32Regs:$b),
3548          (SELP_f32rr Float32Regs:$a, Float32Regs:$b,
3549          (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
3550def : Pat<(select (i32 Int32Regs:$pred), Float64Regs:$a, Float64Regs:$b),
3551          (SELP_f64rr Float64Regs:$a, Float64Regs:$b,
3552          (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
3553
3554
// Pack/unpack instructions built on `mov` with a brace-enclosed register
// list.  None of them carries a selection pattern (the pattern list is empty);
// they are referenced explicitly from Pat<> results elsewhere in this file.
3555let hasSideEffects = false in {
3556  // pack a set of smaller int registers to a larger int register
3557  def V4I16toI64 : NVPTXInst<(outs Int64Regs:$d),
3558                             (ins Int16Regs:$s1, Int16Regs:$s2,
3559                                  Int16Regs:$s3, Int16Regs:$s4),
3560                             "mov.b64 \t$d, {{$s1, $s2, $s3, $s4}};", []>;
3561  def V2I16toI32 : NVPTXInst<(outs Int32Regs:$d),
3562                             (ins Int16Regs:$s1, Int16Regs:$s2),
3563                             "mov.b32 \t$d, {{$s1, $s2}};", []>;
3564  def V2I32toI64 : NVPTXInst<(outs Int64Regs:$d),
3565                             (ins Int32Regs:$s1, Int32Regs:$s2),
3566                             "mov.b64 \t$d, {{$s1, $s2}};", []>;
3567  def V2I64toI128 : NVPTXInst<(outs Int128Regs:$d),
3568                              (ins Int64Regs:$s1, Int64Regs:$s2),
3569                              "mov.b128 \t$d, {{$s1, $s2}};", []>;
3570  def V2F32toF64 : NVPTXInst<(outs Float64Regs:$d),
3571                             (ins Float32Regs:$s1, Float32Regs:$s2),
3572                             "mov.b64 \t$d, {{$s1, $s2}};", []>;
3573
3574  // unpack a larger int register to a set of smaller int registers
3575  def I64toV4I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2,
3576                                   Int16Regs:$d3, Int16Regs:$d4),
3577                             (ins Int64Regs:$s),
3578                             "mov.b64 \t{{$d1, $d2, $d3, $d4}}, $s;", []>;
3579  def I32toV2I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2),
3580                             (ins Int32Regs:$s),
3581                             "mov.b32 \t{{$d1, $d2}}, $s;", []>;
3582  def I64toV2I32 : NVPTXInst<(outs Int32Regs:$d1, Int32Regs:$d2),
3583                             (ins Int64Regs:$s),
3584                             "mov.b64 \t{{$d1, $d2}}, $s;", []>;
3585  def I128toV2I64: NVPTXInst<(outs Int64Regs:$d1, Int64Regs:$d2),
3586                              (ins Int128Regs:$s),
3587                              "mov.b128 \t{{$d1, $d2}}, $s;", []>;
3588  def F64toV2F32 : NVPTXInst<(outs Float32Regs:$d1, Float32Regs:$d2),
3589                             (ins Float64Regs:$s),
3590                             "mov.b64 \t{{$d1, $d2}}, $s;", []>;
3591
  // Extract only one half of a register.  The emitted text declares a
  // temporary register `tmp` inside a brace-delimited block and unpacks the
  // unwanted half into it: the first list element receives the low half
  // ($low), the second the high half ($high).
3592  def I32toI16H  : NVPTXInst<(outs Int16Regs:$high),
3593                             (ins Int32Regs:$s),
3594                             "{{ .reg .b16 tmp; mov.b32 {tmp, $high}, $s; }}",
3595                             []>;
3596  def I32toI16L  : NVPTXInst<(outs Int16Regs:$low),
3597                             (ins Int32Regs:$s),
3598                             "{{ .reg .b16 tmp; mov.b32 {$low, tmp}, $s; }}",
3599                             []>;
3600  def I64toI32H  : NVPTXInst<(outs Int32Regs:$high),
3601                             (ins Int64Regs:$s),
3602                             "{{ .reg .b32 tmp; mov.b64 {tmp, $high}, $s; }}",
3603                             []>;
3604  def I64toI32L  : NVPTXInst<(outs Int32Regs:$low),
3605                             (ins Int64Regs:$s),
3606                             "{{ .reg .b32 tmp; mov.b64 {$low, tmp}, $s; }}",
3607                             []>;
3608
3609}
3610
3611// Using partial vectorized move produces better SASS code for extraction of
3612// upper/lower parts of an integer.
// Both the logical (srl) and arithmetic (sra) shift by half the width are
// matched: after truncation to the half-width type only the original upper
// half remains, so the two shifts give the same result.
3613def : Pat<(i16 (trunc (srl Int32Regs:$s, (i32 16)))),
3614          (I32toI16H Int32Regs:$s)>;
3615def : Pat<(i16 (trunc (sra Int32Regs:$s, (i32 16)))),
3616          (I32toI16H Int32Regs:$s)>;
3617def : Pat<(i32 (trunc (srl Int64Regs:$s, (i32 32)))),
3618          (I64toI32H Int64Regs:$s)>;
3619def : Pat<(i32 (trunc (sra Int64Regs:$s, (i32 32)))),
3620          (I64toI32H Int64Regs:$s)>;
3621
// Two-element 16-bit vectors (v2f16, v2bf16, v2i16) are matched in Int32Regs.

// Sign-extending element 0 of a v2i16 is a sign-extension in register of the
// low 16 bits of the 32-bit container.
3622def: Pat<(i32 (sext (extractelt (v2i16 Int32Regs:$src), 0))),
3623         (CVT_INREG_s32_s16 Int32Regs:$src)>;
3624
// Element 0 is the low half of the container, element 1 the high half.
3625foreach vt = [v2f16, v2bf16, v2i16] in {
3626def : Pat<(extractelt (vt Int32Regs:$src), 0),
3627          (I32toI16L Int32Regs:$src)>;
3628def : Pat<(extractelt (vt Int32Regs:$src), 1),
3629          (I32toI16H Int32Regs:$src)>;
3630}
// build_vector packs the two 16-bit registers with V2I16toI32, first operand
// first in the register list.
3631def : Pat<(v2f16 (build_vector (f16 Int16Regs:$a), (f16 Int16Regs:$b))),
3632          (V2I16toI32 Int16Regs:$a, Int16Regs:$b)>;
3633def : Pat<(v2bf16 (build_vector (bf16 Int16Regs:$a), (bf16 Int16Regs:$b))),
3634          (V2I16toI32 Int16Regs:$a, Int16Regs:$b)>;
3635def : Pat<(v2i16 (build_vector (i16 Int16Regs:$a), (i16 Int16Regs:$b))),
3636          (V2I16toI32 Int16Regs:$a, Int16Regs:$b)>;
3637
// scalar_to_vector places the scalar in element 0 via an unsigned 16 -> 32
// bit cvt.
3638def: Pat<(v2i16 (scalar_to_vector (i16 Int16Regs:$a))),
3639         (CVT_u32_u16 Int16Regs:$a, CvtNONE)>;
3640
3641// Count leading zeros
// Both instructions write a 32-bit result register, including the one that
// takes a 64-bit source.
3642let hasSideEffects = false in {
3643  def CLZr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
3644                         "clz.b32 \t$d, $a;", []>;
3645  def CLZr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
3646                         "clz.b64 \t$d, $a;", []>;
3647}
3648
3649// 32-bit has a direct PTX instruction
3650def : Pat<(i32 (ctlz (i32 Int32Regs:$a))), (CLZr32 Int32Regs:$a)>;
3651
3652// The return type of the ctlz ISD node is the same as its input, but the PTX
3653// clz instruction always returns a 32-bit value.  For ctlz.i64, convert the
3654// ptx value to 64 bits to match the ISD node's semantics, unless we know we're
3655// truncating back down to 32 bits.
3656def : Pat<(i64 (ctlz Int64Regs:$a)), (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>;
3657def : Pat<(i32 (trunc (i64 (ctlz Int64Regs:$a)))), (CLZr64 Int64Regs:$a)>;
3658
3659// For 16-bit ctlz, we zero-extend to 32-bit, perform the count, then trunc the
3660// result back to 16-bits if necessary.  We also need to subtract 16 because
3661// the high-order 16 zeros were counted.
3662//
3663// TODO: NVPTX has a mov.b32 b32reg, {imm, b16reg} instruction, which we could
3664// use to save one SASS instruction (on sm_35 anyway):
3665//
3666//   mov.b32 $tmp, {0xffff, $a}
3667//   clz.b32 $result, $tmp
3668//
3669// That is, instead of zero-extending the input to 32 bits, we'd "one-extend"
3670// and then ctlz that value.  This way we don't have to subtract 16 from the
3671// result.  Unfortunately today we don't have a way to generate
3672// "mov b32reg, {b16imm, b16reg}", so we don't do this optimization.
3673def : Pat<(i16 (ctlz Int16Regs:$a)),
3674          (SUBi16ri (CVT_u16_u32
3675           (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE), 16)>;
// When the i16 result is immediately zero-extended to i32, the truncation
// back to 16 bits is skipped and the subtraction is done in 32 bits.
3676def : Pat<(i32 (zext (i16 (ctlz Int16Regs:$a)))),
3677          (SUBi32ri (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), 16)>;
3678
3679// Population count
// Both instructions write a 32-bit result register, including the one that
// takes a 64-bit source.
3680let hasSideEffects = false in {
3681  def POPCr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
3682                          "popc.b32 \t$d, $a;", []>;
3683  def POPCr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
3684                          "popc.b64 \t$d, $a;", []>;
3685}
3686
3687// 32-bit has a direct PTX instruction
3688def : Pat<(i32 (ctpop (i32 Int32Regs:$a))), (POPCr32 Int32Regs:$a)>;
3689
3690// For 64-bit, the result in PTX is actually 32-bit so we zero-extend to 64-bit
3691// to match the LLVM semantics.  Just as with ctlz.i64, we provide a second
3692// pattern that avoids the type conversion if we're truncating the result to
3693// i32 anyway.
3694def : Pat<(ctpop Int64Regs:$a), (CVT_u64_u32 (POPCr64 Int64Regs:$a), CvtNONE)>;
3695def : Pat<(i32 (trunc (i64 (ctpop Int64Regs:$a)))), (POPCr64 Int64Regs:$a)>;
3696
3697// For 16-bit, we zero-extend to 32-bit, then trunc the result back to 16-bits.
3698// If we know that we're storing into an i32, we can avoid the final trunc.
3699def : Pat<(ctpop Int16Regs:$a),
3700          (CVT_u16_u32 (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE)>;
3701def : Pat<(i32 (zext (i16 (ctpop Int16Regs:$a)))),
3702          (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE))>;
3703
// Floating-point narrowing.  Every pattern passes a round-to-nearest mode
// operand (CvtRN, or CvtRN_FTZ under doF32FTZ for the f32 result).  The bf16
// results are additionally guarded by PTX/SM version predicates.
3704// fpround f32 -> f16
3705def : Pat<(f16 (fpround Float32Regs:$a)),
3706          (CVT_f16_f32 Float32Regs:$a, CvtRN)>;
3707
3708// fpround f32 -> bf16
3709def : Pat<(bf16 (fpround Float32Regs:$a)),
3710          (CVT_bf16_f32 Float32Regs:$a, CvtRN)>, Requires<[hasPTX<70>, hasSM<80>]>;
3711
3712// fpround f64 -> f16
3713def : Pat<(f16 (fpround Float64Regs:$a)),
3714          (CVT_f16_f64 Float64Regs:$a, CvtRN)>;
3715
3716// fpround f64 -> bf16
3717def : Pat<(bf16 (fpround Float64Regs:$a)),
3718          (CVT_bf16_f64 Float64Regs:$a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
3719// fpround f64 -> f32
// The doF32FTZ-guarded pattern is listed before the unguarded fallback.
3720def : Pat<(f32 (fpround Float64Regs:$a)),
3721          (CVT_f32_f64 Float64Regs:$a, CvtRN_FTZ)>, Requires<[doF32FTZ]>;
3722def : Pat<(f32 (fpround Float64Regs:$a)),
3723          (CVT_f32_f64 Float64Regs:$a, CvtRN)>;
3724
// Floating-point widening.  All patterns pass the CvtNONE mode operand, or
// CvtNONE_FTZ in the variant guarded by doF32FTZ, which is listed before the
// plain variant.
3725// fpextend f16 -> f32
3726def : Pat<(f32 (fpextend (f16 Int16Regs:$a))),
3727          (CVT_f32_f16 Int16Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
3728def : Pat<(f32 (fpextend (f16 Int16Regs:$a))),
3729          (CVT_f32_f16 Int16Regs:$a, CvtNONE)>;
3730// fpextend bf16 -> f32
// Both variants select the same CVT_f32_bf16 instruction, so the FTZ variant
// carries the same PTX/SM version guard as the plain one.
3731def : Pat<(f32 (fpextend (bf16 Int16Regs:$a))),
3732          (CVT_f32_bf16 Int16Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ, hasPTX<71>, hasSM<80>]>;
3733def : Pat<(f32 (fpextend (bf16 Int16Regs:$a))),
3734          (CVT_f32_bf16 Int16Regs:$a, CvtNONE)>, Requires<[hasPTX<71>, hasSM<80>]>;
3735
3736// fpextend f16 -> f64
3737def : Pat<(f64 (fpextend (f16 Int16Regs:$a))),
3738          (CVT_f64_f16 Int16Regs:$a, CvtNONE)>;
3739
3740// fpextend bf16 -> f64
3741def : Pat<(f64 (fpextend (bf16 Int16Regs:$a))),
3742          (CVT_f64_bf16 Int16Regs:$a, CvtNONE)>, Requires<[hasPTX<78>, hasSM<90>]>;
3743
3744// fpextend f32 -> f64
3745def : Pat<(f64 (fpextend Float32Regs:$a)),
3746          (CVT_f64_f32 Float32Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
3747def : Pat<(f64 (fpextend Float32Regs:$a)),
3748          (CVT_f64_f32 Float32Regs:$a, CvtNONE)>;
3749
// SelectionDAG node for NVPTXISD::RET_GLUE: no operands or results (SDTNone),
// chained, with optional incoming glue.  Matched by the Return instruction.
3750def retglue : SDNode<"NVPTXISD::RET_GLUE", SDTNone,
3751                     [SDNPHasChain, SDNPOptInGlue]>;
3752
3753// fceil, ffloor, froundeven, ftrunc.
3754
// Emits the patterns that lower the rounding node `OpNode` to a same-type cvt
// for f16, bf16, f32 and f64.
//   OpNode  - the SelectionDAG rounding node to match.
//   Mode    - conversion-mode operand used for f16, bf16, f64 and non-FTZ f32.
//   ModeFTZ - conversion-mode operand used for f32 when doF32FTZ holds.
// The two f32 patterns are mutually exclusive (doF32FTZ vs. doNoF32FTZ).
3755multiclass CVT_ROUND<SDNode OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {
3756  def : Pat<(OpNode (f16 Int16Regs:$a)),
3757            (CVT_f16_f16 Int16Regs:$a, Mode)>;
3758  def : Pat<(OpNode (bf16 Int16Regs:$a)),
3759            (CVT_bf16_bf16 Int16Regs:$a, Mode)>;
3760  def : Pat<(OpNode Float32Regs:$a),
3761            (CVT_f32_f32 Float32Regs:$a, ModeFTZ)>, Requires<[doF32FTZ]>;
3762  def : Pat<(OpNode Float32Regs:$a),
3763            (CVT_f32_f32 Float32Regs:$a, Mode)>, Requires<[doNoF32FTZ]>;
3764  def : Pat<(OpNode Float64Regs:$a),
3765            (CVT_f64_f64 Float64Regs:$a, Mode)>;
3766}
3767
3768defm : CVT_ROUND<fceil, CvtRPI, CvtRPI_FTZ>;
3769defm : CVT_ROUND<ffloor, CvtRMI, CvtRMI_FTZ>;
3770defm : CVT_ROUND<froundeven, CvtRNI, CvtRNI_FTZ>;
3771defm : CVT_ROUND<ftrunc, CvtRZI, CvtRZI_FTZ>;
3772
3773// nearbyint and rint are implemented as rounding to nearest even.  This isn't
3774// strictly correct, because it causes us to ignore the rounding mode.  But it
3775// matches what CUDA's "libm" does.
3776
3777defm : CVT_ROUND<fnearbyint, CvtRNI, CvtRNI_FTZ>;
3778defm : CVT_ROUND<frint, CvtRNI, CvtRNI_FTZ>;
3779
3780//-----------------------------------
3781// Control-flow
3782//-----------------------------------
3783
// Terminator instructions:
//   Return       - "ret;", matches retglue.
//   CBranch      - branch taken when predicate $a is set ("@$a bra"); matches
//                  brcond on an i1.
//   CBranchOther - branch taken when predicate $a is clear ("@!$a bra"); has no
//                  pattern of its own and is selected by an explicit Pat<>.
//   GOTO         - unconditional "bra.uni"; matches br.
3784let isTerminator=1 in {
3785   let isReturn=1, isBarrier=1 in
3786      def Return : NVPTXInst<(outs), (ins), "ret;", [(retglue)]>;
3787
3788   let isBranch=1 in
3789      def CBranch : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
3790                              "@$a bra \t$target;",
3791                              [(brcond Int1Regs:$a, bb:$target)]>;
3792   let isBranch=1 in
3793      def CBranchOther : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
3794                                   "@!$a bra \t$target;", []>;
3795
3796   let isBranch=1, isBarrier=1 in
3797      def GOTO : NVPTXInst<(outs), (ins brtarget:$target),
3798                           "bra.uni \t$target;", [(br bb:$target)]>;
3799}
3800
// A branch on an i32 condition first derives an i1 predicate by comparing the
// value for inequality with 0.
3801def : Pat<(brcond (i32 Int32Regs:$a), bb:$target),
3802          (CBranch (SETP_u32ri Int32Regs:$a, 0, CmpNE), bb:$target)>;
3803
3804// SelectionDAGBuilder::visitSwitchCase() will invert the condition of a
3805// conditional branch if the target block is the next block so that the code
3806// can fall through to the target block.  The inversion is done by 'xor
3807// condition, 1', which will be translated to (setne condition, -1).  Since ptx
3808// supports '@!pred bra target', we should use it.
3809def : Pat<(brcond (i1 (setne Int1Regs:$a, -1)), bb:$target),
3810          (CBranchOther Int1Regs:$a, bb:$target)>;
3811
3812// Call
// Type profiles for the call-sequence markers: both operands are i32.
3813def SDT_NVPTXCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>,
3814                                            SDTCisVT<1, i32>]>;
3815def SDT_NVPTXCallSeqEnd   : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
3816
// Target-local definitions of the generic ISD::CALLSEQ_START / CALLSEQ_END
// nodes using the profiles above.
3817def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_NVPTXCallSeqStart,
3818                           [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
3819def callseq_end   : SDNode<"ISD::CALLSEQ_END", SDT_NVPTXCallSeqEnd,
3820                           [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
3821                            SDNPSideEffect]>;
3822
// NVPTXISD::CALL takes a single i32 operand (the callee) and has no results.
3823def SDT_NVPTXCall : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
3824def call          : SDNode<"NVPTXISD::CALL", SDT_NVPTXCall,
3825                           [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
3826def calltarget : Operand<i32>;
3827let isCall=1 in {
3828   def CALL : NVPTXInst<(outs), (ins calltarget:$dst), "call \t$dst, (1);", []>;
3829}
3830
// CALL has no inline pattern; it is selected for global-address and
// external-symbol callees by the two patterns below.
3831def : Pat<(call tglobaladdr:$dst), (CALL tglobaladdr:$dst)>;
3832def : Pat<(call texternalsym:$dst), (CALL texternalsym:$dst)>;
3833
3834// Pseudo instructions.
// Pseudo is a plain alias of NVPTXInst; it adds no fields of its own.
3835class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
3836   : NVPTXInst<outs, ins, asmstr, pattern>;
3837
// The call-sequence markers print an opening / closing brace followed by a
// "// callseq" comment carrying the immediate operand(s).
3838def Callseq_Start :
3839  NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2),
3840            "\\{ // callseq $amt1, $amt2",
3841            [(callseq_start timm:$amt1, timm:$amt2)]>;
3842def Callseq_End :
3843  NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2),
3844            "\\} // callseq $amt1",
3845            [(callseq_end timm:$amt1, timm:$amt2)]>;
3846
3847// trap instruction
3848// Emit an `exit` as well to convey to ptxas that `trap` exits the CFG.
3849// This won't be necessary in a future version of ptxas.
// Matches the generic `trap` node; takes no operands.
3850def trapinst : NVPTXInst<(outs), (ins), "trap; exit;", [(trap)]>;
3851
3852// Call prototype wrapper
// NVPTXISD::CallPrototype has one integer operand and no results.
3853def SDTCallPrototype : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
3854def CallPrototype :
3855  SDNode<"NVPTXISD::CallPrototype", SDTCallPrototype,
3856         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
// Operand printed through the printProtoIdent method.
3857def ProtoIdent : Operand<i32> {
3858  let PrintMethod = "printProtoIdent";
3859}
// The instruction's entire assembly text is the printed $ident operand.
3860def CALL_PROTOTYPE :
3861  NVPTXInst<(outs), (ins ProtoIdent:$ident),
3862            "$ident", [(CallPrototype (i32 texternalsym:$ident))]>;
3863
// Dynamic stack allocation.  The node has one result and two integer
// operands (size, alignment); the result has the same type as the size.
3864def SDTDynAllocaOp :
3865  SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, SDTCisInt<1>, SDTCisInt<2>]>;
3866
3867def dyn_alloca :
3868  SDNode<"NVPTXISD::DYNAMIC_STACKALLOC", SDTDynAllocaOp,
3869         [SDNPHasChain, SDNPSideEffect]>;
3870
// Each variant emits two instructions: `alloca` writes $ptr, then
// `cvta.local` rewrites the same register in place.  The alignment is an
// immediate.  Both variants require PTX 7.3 and sm_52.
3871def DYNAMIC_STACKALLOC32 :
3872  NVPTXInst<(outs Int32Regs:$ptr),
3873            (ins Int32Regs:$size, i32imm:$align),
3874            "alloca.u32 \t$ptr, $size, $align;\n\t"
3875            "cvta.local.u32 \t$ptr, $ptr;",
3876            [(set (i32 Int32Regs:$ptr), (dyn_alloca Int32Regs:$size, (i32 timm:$align)))]>,
3877            Requires<[hasPTX<73>, hasSM<52>]>;
3878
3879def DYNAMIC_STACKALLOC64 :
3880  NVPTXInst<(outs Int64Regs:$ptr),
3881            (ins Int64Regs:$size, i32imm:$align),
3882            "alloca.u64 \t$ptr, $size, $align;\n\t"
3883            "cvta.local.u64 \t$ptr, $ptr;",
3884            [(set Int64Regs:$ptr, (dyn_alloca Int64Regs:$size, (i32 timm:$align)))]>,
3885            Requires<[hasPTX<73>, hasSM<52>]>;
3886
3887include "NVPTXIntrinsics.td"
3888
3889//-----------------------------------
3890// Notes
3891//-----------------------------------
3892// BSWAP is currently expanded. The following would be more efficient:
3893// - for < sm_20, use vector scalar mov, as Tesla supports native 16-bit registers
3894// - for sm_20, use prmt (use vector scalar mov to get the pack and
3895//   unpack). sm_20 supports native 32-bit registers, but not native 16-bit
3896//   registers.
3897
// Byte swaps are selected to INT_NVVM_PRMT with a zero second source and a
// constant selector.
// NOTE(review): selector meanings below follow the PTX `prmt` encoding (one
// hex digit per result byte, least-significant digit first) -- confirm
// against the PTX ISA.

// i32: selector 0x0123 reverses all four bytes.
3898def : Pat <
3899  (i32 (bswap i32:$a)),
3900  (INT_NVVM_PRMT Int32Regs:$a, (i32 0), (i32 0x0123))>;
3901
// v2i16: selector 0x2301 swaps the two bytes inside each 16-bit element.
3902def : Pat <
3903  (v2i16 (bswap v2i16:$a)),
3904  (INT_NVVM_PRMT Int32Regs:$a, (i32 0), (i32 0x2301))>;
3905
// i64: each 32-bit half is byte-reversed separately and the halves are
// exchanged when repacked (the reversed high half is passed first to
// V2I32toI64, the reversed low half second).
3906def : Pat <
3907  (i64 (bswap i64:$a)),
3908  (V2I32toI64
3909    (INT_NVVM_PRMT (I64toI32H Int64Regs:$a), (i32 0), (i32 0x0123)),
3910    (INT_NVVM_PRMT (I64toI32L Int64Regs:$a), (i32 0), (i32 0x0123)))>;
3911