xref: /freebsd/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td (revision 8ddb146abcdf061be9f2c0db7e391697dafad85c)
1//===- NVPTXInstrInfo.td - NVPTX Instruction defs -------------*- tblgen-*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file describes the PTX instructions in TableGen format.
10//
11//===----------------------------------------------------------------------===//
12
13include "NVPTXInstrFormats.td"
14
// No-op instruction: no operands, an empty asm string, and no selection
// patterns.
let hasSideEffects = false in
def NOP : NVPTXInst<(outs), (ins), "", []>;
19
// f16 operand whose OperandType is OPERAND_IMMEDIATE.
def f16imm : Operand<f16> {
  let OperandType = "OPERAND_IMMEDIATE";
}
23
// Vector-specific instruction properties, one VecInstTypeEnum value each.
def isVecLD : VecInstTypeEnum<1>;
def isVecST : VecInstTypeEnum<2>;
def isVecBuild : VecInstTypeEnum<3>;
def isVecShuffle : VecInstTypeEnum<4>;
def isVecExtract : VecInstTypeEnum<5>;
def isVecInsert : VecInstTypeEnum<6>;
def isVecDest : VecInstTypeEnum<7>;
def isVecOther : VecInstTypeEnum<15>;
33
34//===----------------------------------------------------------------------===//
35// NVPTX Operand Definitions.
36//===----------------------------------------------------------------------===//
37
// Branch-target operand, typed as OtherVT.
def brtarget : Operand<OtherVT>;
39
// CVT conversion modes.
// The numeric values below must match the enum in NVPTX.h.

// Base modes.
def CvtNONE : PatLeaf<(i32 0x0)>;
def CvtRNI : PatLeaf<(i32 0x1)>;
def CvtRZI : PatLeaf<(i32 0x2)>;
def CvtRMI : PatLeaf<(i32 0x3)>;
def CvtRPI : PatLeaf<(i32 0x4)>;
def CvtRN : PatLeaf<(i32 0x5)>;
def CvtRZ : PatLeaf<(i32 0x6)>;
def CvtRM : PatLeaf<(i32 0x7)>;
def CvtRP : PatLeaf<(i32 0x8)>;
def CvtRNA : PatLeaf<(i32 0x9)>;

// FTZ variants: base mode value with 0x10 set.
def CvtNONE_FTZ : PatLeaf<(i32 0x10)>;
def CvtRNI_FTZ : PatLeaf<(i32 0x11)>;
def CvtRZI_FTZ : PatLeaf<(i32 0x12)>;
def CvtRMI_FTZ : PatLeaf<(i32 0x13)>;
def CvtRPI_FTZ : PatLeaf<(i32 0x14)>;
def CvtRN_FTZ : PatLeaf<(i32 0x15)>;
def CvtRZ_FTZ : PatLeaf<(i32 0x16)>;
def CvtRM_FTZ : PatLeaf<(i32 0x17)>;
def CvtRP_FTZ : PatLeaf<(i32 0x18)>;

// Saturating variants.
def CvtSAT : PatLeaf<(i32 0x20)>;
def CvtSAT_FTZ : PatLeaf<(i32 0x30)>;

// RELU variants: base mode value with 0x40 set.
def CvtNONE_RELU : PatLeaf<(i32 0x40)>;
def CvtRN_RELU : PatLeaf<(i32 0x45)>;
def CvtRZ_RELU : PatLeaf<(i32 0x46)>;

// i32 operand carrying one of the conversion modes above; printed through
// the "printCvtMode" print method.
def CvtMode : Operand<i32> {
  let PrintMethod = "printCvtMode";
}
73
// Compare modes.
// The numeric values below must match the enum in NVPTX.h.

def CmpEQ : PatLeaf<(i32 0)>;
def CmpNE : PatLeaf<(i32 1)>;
def CmpLT : PatLeaf<(i32 2)>;
def CmpLE : PatLeaf<(i32 3)>;
def CmpGT : PatLeaf<(i32 4)>;
def CmpGE : PatLeaf<(i32 5)>;
def CmpEQU : PatLeaf<(i32 10)>;
def CmpNEU : PatLeaf<(i32 11)>;
def CmpLTU : PatLeaf<(i32 12)>;
def CmpLEU : PatLeaf<(i32 13)>;
def CmpGTU : PatLeaf<(i32 14)>;
def CmpGEU : PatLeaf<(i32 15)>;
def CmpNUM : PatLeaf<(i32 16)>;
def CmpNAN : PatLeaf<(i32 17)>;

// FTZ variants: the corresponding mode value above with 0x100 set.
def CmpEQ_FTZ : PatLeaf<(i32 0x100)>;
def CmpNE_FTZ : PatLeaf<(i32 0x101)>;
def CmpLT_FTZ : PatLeaf<(i32 0x102)>;
def CmpLE_FTZ : PatLeaf<(i32 0x103)>;
def CmpGT_FTZ : PatLeaf<(i32 0x104)>;
def CmpGE_FTZ : PatLeaf<(i32 0x105)>;
def CmpEQU_FTZ : PatLeaf<(i32 0x10A)>;
def CmpNEU_FTZ : PatLeaf<(i32 0x10B)>;
def CmpLTU_FTZ : PatLeaf<(i32 0x10C)>;
def CmpLEU_FTZ : PatLeaf<(i32 0x10D)>;
def CmpGTU_FTZ : PatLeaf<(i32 0x10E)>;
def CmpGEU_FTZ : PatLeaf<(i32 0x10F)>;
def CmpNUM_FTZ : PatLeaf<(i32 0x110)>;
def CmpNAN_FTZ : PatLeaf<(i32 0x111)>;

// i32 operand carrying one of the compare modes above; printed through the
// "printCmpMode" print method.
def CmpMode : Operand<i32> {
  let PrintMethod = "printCmpMode";
}
// i32 operand printed through the "printVecElement" print method.
def VecElement : Operand<i32> {
  let PrintMethod = "printVecElement";
}
112
113//===----------------------------------------------------------------------===//
114// NVPTX Instruction Predicate Definitions
115//===----------------------------------------------------------------------===//
116
117
// Subtarget feature queries.
def hasAtomAddF64 : Predicate<"Subtarget->hasAtomAddF64()">;
def hasAtomScope : Predicate<"Subtarget->hasAtomScope()">;
def hasAtomBitwise64 : Predicate<"Subtarget->hasAtomBitwise64()">;
def hasAtomMinMax64 : Predicate<"Subtarget->hasAtomMinMax64()">;
def hasVote : Predicate<"Subtarget->hasVote()">;
def hasDouble : Predicate<"Subtarget->hasDouble()">;
def hasLDG : Predicate<"Subtarget->hasLDG()">;
def hasLDU : Predicate<"Subtarget->hasLDU()">;

// f32 flush-to-zero on / off.
def doF32FTZ : Predicate<"useF32FTZ()">;
def doNoF32FTZ : Predicate<"!useF32FTZ()">;

def doMulWide : Predicate<"doMulWide">;

// FMA contraction allowed / disallowed, and unsafe FP math.
def allowFMA : Predicate<"allowFMA()">;
def noFMA : Predicate<"!allowFMA()">;
def allowUnsafeFPMath : Predicate<"allowUnsafeFPMath()">;

// f32 division level.
def do_DIVF32_APPROX : Predicate<"getDivF32Level()==0">;
def do_DIVF32_FULL : Predicate<"getDivF32Level()==1">;

// f32 square-root precision.
def do_SQRTF32_APPROX : Predicate<"!usePrecSqrtF32()">;
def do_SQRTF32_RN : Predicate<"usePrecSqrtF32()">;

// 32-bit hardware rotate present / absent.
def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">;
def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">;

// Always-true predicate.
def True : Predicate<"true">;

// Minimum PTX version checks.
def hasPTX31 : Predicate<"Subtarget->getPTXVersion() >= 31">;
def hasPTX60 : Predicate<"Subtarget->getPTXVersion() >= 60">;
def hasPTX61 : Predicate<"Subtarget->getPTXVersion() >= 61">;
def hasPTX63 : Predicate<"Subtarget->getPTXVersion() >= 63">;
def hasPTX64 : Predicate<"Subtarget->getPTXVersion() >= 64">;
def hasPTX65 : Predicate<"Subtarget->getPTXVersion() >= 65">;
def hasPTX70 : Predicate<"Subtarget->getPTXVersion() >= 70">;
def hasPTX71 : Predicate<"Subtarget->getPTXVersion() >= 71">;

// Minimum SM version checks.
def hasSM30 : Predicate<"Subtarget->getSmVersion() >= 30">;
def hasSM70 : Predicate<"Subtarget->getSmVersion() >= 70">;
def hasSM72 : Predicate<"Subtarget->getSmVersion() >= 72">;
def hasSM75 : Predicate<"Subtarget->getSmVersion() >= 75">;
def hasSM80 : Predicate<"Subtarget->getSmVersion() >= 80">;

// Non-sync shfl instructions are not available on sm_70+ in PTX6.4+.
// (The two adjacent string literals are concatenated by TableGen.)
def hasSHFL : Predicate<"!(Subtarget->getSmVersion() >= 70"
                        "&& Subtarget->getPTXVersion() >= 64)">;

def useShortPtr : Predicate<"useShortPointers()">;
def useFP16Math : Predicate<"Subtarget->allowFP16Math()">;
168
169//===----------------------------------------------------------------------===//
170// Some Common Instruction Class Templates
171//===----------------------------------------------------------------------===//
172
// Three-operand integer instructions at 64-, 32-, and 16-bit widths.  Each
// width gets a register-register (rr) and a register-immediate (ri) form.
// The mnemonic is "<OpcStr><Width>" (e.g. "add.s64").
multiclass I3<string OpcStr, SDNode OpNode> {
  // 64-bit forms.
  def i64rr : NVPTXInst<
      (outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
      OpcStr # "64 \t$dst, $a, $b;",
      [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>;
  def i64ri : NVPTXInst<
      (outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
      OpcStr # "64 \t$dst, $a, $b;",
      [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;

  // 32-bit forms.
  def i32rr : NVPTXInst<
      (outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
      OpcStr # "32 \t$dst, $a, $b;",
      [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
  def i32ri : NVPTXInst<
      (outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
      OpcStr # "32 \t$dst, $a, $b;",
      [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;

  // 16-bit forms.
  def i16rr : NVPTXInst<
      (outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
      OpcStr # "16 \t$dst, $a, $b;",
      [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>;
  def i16ri : NVPTXInst<
      (outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
      OpcStr # "16 \t$dst, $a, $b;",
      [(set Int16Regs:$dst, (OpNode Int16Regs:$a, imm:$b))]>;
}
201
// Three-operand 32-bit integer instructions, in register-register and
// register-immediate forms.  The mnemonic is "<OpcStr>.s32"
// (e.g. "addc.cc.s32").
multiclass ADD_SUB_INT_32<string OpcStr, SDNode OpNode> {
  def i32rr : NVPTXInst<
      (outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
      OpcStr # ".s32 \t$dst, $a, $b;",
      [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
  def i32ri : NVPTXInst<
      (outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
      OpcStr # ".s32 \t$dst, $a, $b;",
      [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
}
214
// Three-operand floating-point instructions.  The mnemonic is
// "<OpcStr>.f<Width>" (e.g. "min.f64").
//
// Defines f64 and f32 forms (rr and ri) plus f16 and f16x2 forms (rr only).
// The f32, f16 and f16x2 forms also get ".ftz" variants (flush subnormal
// inputs and results to sign-preserving zero), guarded by doF32FTZ.
//
// Use this multiclass for nodes that cannot be folded into FMAs.  For nodes
// that can be folded into FMAs (i.e. adds and muls), use F3_fma_component.
multiclass F3<string OpcStr, SDNode OpNode> {
  // f64.
  def f64rr : NVPTXInst<
      (outs Float64Regs:$dst), (ins Float64Regs:$a, Float64Regs:$b),
      OpcStr # ".f64 \t$dst, $a, $b;",
      [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>;
  def f64ri : NVPTXInst<
      (outs Float64Regs:$dst), (ins Float64Regs:$a, f64imm:$b),
      OpcStr # ".f64 \t$dst, $a, $b;",
      [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>;

  // f32, with the ftz forms listed ahead of the plain forms.
  def f32rr_ftz : NVPTXInst<
      (outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
      OpcStr # ".ftz.f32 \t$dst, $a, $b;",
      [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
    Requires<[doF32FTZ]>;
  def f32ri_ftz : NVPTXInst<
      (outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
      OpcStr # ".ftz.f32 \t$dst, $a, $b;",
      [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
    Requires<[doF32FTZ]>;
  def f32rr : NVPTXInst<
      (outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
      OpcStr # ".f32 \t$dst, $a, $b;",
      [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>;
  def f32ri : NVPTXInst<
      (outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
      OpcStr # ".f32 \t$dst, $a, $b;",
      [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>;

  // f16.
  def f16rr_ftz : NVPTXInst<
      (outs Float16Regs:$dst), (ins Float16Regs:$a, Float16Regs:$b),
      OpcStr # ".ftz.f16 \t$dst, $a, $b;",
      [(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>,
    Requires<[useFP16Math, doF32FTZ]>;
  def f16rr : NVPTXInst<
      (outs Float16Regs:$dst), (ins Float16Regs:$a, Float16Regs:$b),
      OpcStr # ".f16 \t$dst, $a, $b;",
      [(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>,
    Requires<[useFP16Math]>;

  // f16x2.
  def f16x2rr_ftz : NVPTXInst<
      (outs Float16x2Regs:$dst), (ins Float16x2Regs:$a, Float16x2Regs:$b),
      OpcStr # ".ftz.f16x2 \t$dst, $a, $b;",
      [(set Float16x2Regs:$dst,
            (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,
    Requires<[useFP16Math, doF32FTZ]>;
  def f16x2rr : NVPTXInst<
      (outs Float16x2Regs:$dst), (ins Float16x2Regs:$a, Float16x2Regs:$b),
      OpcStr # ".f16x2 \t$dst, $a, $b;",
      [(set Float16x2Regs:$dst,
            (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,
    Requires<[useFP16Math]>;
}
284
// Three-operand floating-point instructions whose nodes can be folded into
// FMAs.  The mnemonic is "<OpcStr>.f<Width>" (e.g. "add.f64").
//
// Also defines ftz (flush subnormal inputs and results to sign-preserving
// zero) variants for the fp32/fp16 forms.
//
// Every form exists twice: the plain form is selected when FMA is allowed,
// and an ".rn" form is selected when FMA is disabled.  The ".rn" variant
// behaves just like the non-".rn" op, but prevents ptxas from creating FMAs.
multiclass F3_fma_component<string OpcStr, SDNode OpNode> {
  //
  // Plain forms, used when FMA contraction is allowed.
  //
  def f64rr : NVPTXInst<
      (outs Float64Regs:$dst), (ins Float64Regs:$a, Float64Regs:$b),
      OpcStr # ".f64 \t$dst, $a, $b;",
      [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>,
    Requires<[allowFMA]>;
  def f64ri : NVPTXInst<
      (outs Float64Regs:$dst), (ins Float64Regs:$a, f64imm:$b),
      OpcStr # ".f64 \t$dst, $a, $b;",
      [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>,
    Requires<[allowFMA]>;
  def f32rr_ftz : NVPTXInst<
      (outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
      OpcStr # ".ftz.f32 \t$dst, $a, $b;",
      [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
    Requires<[allowFMA, doF32FTZ]>;
  def f32ri_ftz : NVPTXInst<
      (outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
      OpcStr # ".ftz.f32 \t$dst, $a, $b;",
      [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
    Requires<[allowFMA, doF32FTZ]>;
  def f32rr : NVPTXInst<
      (outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
      OpcStr # ".f32 \t$dst, $a, $b;",
      [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
    Requires<[allowFMA]>;
  def f32ri : NVPTXInst<
      (outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
      OpcStr # ".f32 \t$dst, $a, $b;",
      [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
    Requires<[allowFMA]>;

  def f16rr_ftz : NVPTXInst<
      (outs Float16Regs:$dst), (ins Float16Regs:$a, Float16Regs:$b),
      OpcStr # ".ftz.f16 \t$dst, $a, $b;",
      [(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>,
    Requires<[useFP16Math, allowFMA, doF32FTZ]>;
  def f16rr : NVPTXInst<
      (outs Float16Regs:$dst), (ins Float16Regs:$a, Float16Regs:$b),
      OpcStr # ".f16 \t$dst, $a, $b;",
      [(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>,
    Requires<[useFP16Math, allowFMA]>;

  def f16x2rr_ftz : NVPTXInst<
      (outs Float16x2Regs:$dst), (ins Float16x2Regs:$a, Float16x2Regs:$b),
      OpcStr # ".ftz.f16x2 \t$dst, $a, $b;",
      [(set Float16x2Regs:$dst,
            (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,
    Requires<[useFP16Math, allowFMA, doF32FTZ]>;
  def f16x2rr : NVPTXInst<
      (outs Float16x2Regs:$dst), (ins Float16x2Regs:$a, Float16x2Regs:$b),
      OpcStr # ".f16x2 \t$dst, $a, $b;",
      [(set Float16x2Regs:$dst,
            (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,
    Requires<[useFP16Math, allowFMA]>;

  //
  // ".rn" forms, used when FMA contraction is disabled.
  // These have strange names so we don't perturb existing mir tests.
  //
  def _rnf64rr : NVPTXInst<
      (outs Float64Regs:$dst), (ins Float64Regs:$a, Float64Regs:$b),
      OpcStr # ".rn.f64 \t$dst, $a, $b;",
      [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>,
    Requires<[noFMA]>;
  def _rnf64ri : NVPTXInst<
      (outs Float64Regs:$dst), (ins Float64Regs:$a, f64imm:$b),
      OpcStr # ".rn.f64 \t$dst, $a, $b;",
      [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>,
    Requires<[noFMA]>;
  def _rnf32rr_ftz : NVPTXInst<
      (outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
      OpcStr # ".rn.ftz.f32 \t$dst, $a, $b;",
      [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
    Requires<[noFMA, doF32FTZ]>;
  def _rnf32ri_ftz : NVPTXInst<
      (outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
      OpcStr # ".rn.ftz.f32 \t$dst, $a, $b;",
      [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
    Requires<[noFMA, doF32FTZ]>;
  def _rnf32rr : NVPTXInst<
      (outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
      OpcStr # ".rn.f32 \t$dst, $a, $b;",
      [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
    Requires<[noFMA]>;
  def _rnf32ri : NVPTXInst<
      (outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
      OpcStr # ".rn.f32 \t$dst, $a, $b;",
      [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
    Requires<[noFMA]>;
  def _rnf16rr_ftz : NVPTXInst<
      (outs Float16Regs:$dst), (ins Float16Regs:$a, Float16Regs:$b),
      OpcStr # ".rn.ftz.f16 \t$dst, $a, $b;",
      [(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>,
    Requires<[useFP16Math, noFMA, doF32FTZ]>;
  def _rnf16rr : NVPTXInst<
      (outs Float16Regs:$dst), (ins Float16Regs:$a, Float16Regs:$b),
      OpcStr # ".rn.f16 \t$dst, $a, $b;",
      [(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>,
    Requires<[useFP16Math, noFMA]>;
  def _rnf16x2rr_ftz : NVPTXInst<
      (outs Float16x2Regs:$dst), (ins Float16x2Regs:$a, Float16x2Regs:$b),
      OpcStr # ".rn.ftz.f16x2 \t$dst, $a, $b;",
      [(set Float16x2Regs:$dst,
            (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,
    Requires<[useFP16Math, noFMA, doF32FTZ]>;
  def _rnf16x2rr : NVPTXInst<
      (outs Float16x2Regs:$dst), (ins Float16x2Regs:$a, Float16x2Regs:$b),
      OpcStr # ".rn.f16x2 \t$dst, $a, $b;",
      [(set Float16x2Regs:$dst,
            (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,
    Requires<[useFP16Math, noFMA]>;
}
420
// Two-operand (one result, one source) f32/f64 instructions.  Emits three
// defs: <OpcStr>.f64, <OpcStr>.ftz.f32 (flush subnormal inputs and results to
// zero; requires doF32FTZ), and <OpcStr>.f32.
multiclass F2<string OpcStr, SDNode OpNode> {
  def f64 : NVPTXInst<
      (outs Float64Regs:$dst), (ins Float64Regs:$a),
      OpcStr # ".f64 \t$dst, $a;",
      [(set Float64Regs:$dst, (OpNode Float64Regs:$a))]>;
  def f32_ftz : NVPTXInst<
      (outs Float32Regs:$dst), (ins Float32Regs:$a),
      OpcStr # ".ftz.f32 \t$dst, $a;",
      [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>,
    Requires<[doF32FTZ]>;
  def f32 : NVPTXInst<
      (outs Float32Regs:$dst), (ins Float32Regs:$a),
      OpcStr # ".f32 \t$dst, $a;",
      [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>;
}
436
437//===----------------------------------------------------------------------===//
438// NVPTX Instructions.
439//===----------------------------------------------------------------------===//
440
441//-----------------------------------
442// Type Conversion
443//-----------------------------------
444
let hasSideEffects = false in {
  // Emits a cvt producing the type named by FromName (held in register class
  // RC) from every possible source type.  Each def takes a CvtMode immediate
  // selecting the conversion mode; pass CvtNONE to omit a conversion mode.
  // None of these defs carries a selection pattern.
  multiclass CVT_FROM_ALL<string FromName, RegisterClass RC> {
    // 8- and 16-bit integer sources all live in Int16Regs.
    def _s8 : NVPTXInst<
        (outs RC:$dst), (ins Int16Regs:$src, CvtMode:$mode),
        "cvt${mode:base}${mode:ftz}${mode:sat}." # FromName
          # ".s8 \t$dst, $src;",
        []>;
    def _u8 : NVPTXInst<
        (outs RC:$dst), (ins Int16Regs:$src, CvtMode:$mode),
        "cvt${mode:base}${mode:ftz}${mode:sat}." # FromName
          # ".u8 \t$dst, $src;",
        []>;
    def _s16 : NVPTXInst<
        (outs RC:$dst), (ins Int16Regs:$src, CvtMode:$mode),
        "cvt${mode:base}${mode:ftz}${mode:sat}." # FromName
          # ".s16 \t$dst, $src;",
        []>;
    def _u16 : NVPTXInst<
        (outs RC:$dst), (ins Int16Regs:$src, CvtMode:$mode),
        "cvt${mode:base}${mode:ftz}${mode:sat}." # FromName
          # ".u16 \t$dst, $src;",
        []>;

    // 32-bit integer sources.
    def _s32 : NVPTXInst<
        (outs RC:$dst), (ins Int32Regs:$src, CvtMode:$mode),
        "cvt${mode:base}${mode:ftz}${mode:sat}." # FromName
          # ".s32 \t$dst, $src;",
        []>;
    def _u32 : NVPTXInst<
        (outs RC:$dst), (ins Int32Regs:$src, CvtMode:$mode),
        "cvt${mode:base}${mode:ftz}${mode:sat}." # FromName
          # ".u32 \t$dst, $src;",
        []>;

    // 64-bit integer sources.
    def _s64 : NVPTXInst<
        (outs RC:$dst), (ins Int64Regs:$src, CvtMode:$mode),
        "cvt${mode:base}${mode:ftz}${mode:sat}." # FromName
          # ".s64 \t$dst, $src;",
        []>;
    def _u64 : NVPTXInst<
        (outs RC:$dst), (ins Int64Regs:$src, CvtMode:$mode),
        "cvt${mode:base}${mode:ftz}${mode:sat}." # FromName
          # ".u64 \t$dst, $src;",
        []>;

    // Floating-point sources.
    def _f16 : NVPTXInst<
        (outs RC:$dst), (ins Float16Regs:$src, CvtMode:$mode),
        "cvt${mode:base}${mode:ftz}${mode:sat}." # FromName
          # ".f16 \t$dst, $src;",
        []>;
    def _f32 : NVPTXInst<
        (outs RC:$dst), (ins Float32Regs:$src, CvtMode:$mode),
        "cvt${mode:base}${mode:ftz}${mode:sat}." # FromName
          # ".f32 \t$dst, $src;",
        []>;
    def _f64 : NVPTXInst<
        (outs RC:$dst), (ins Float64Regs:$src, CvtMode:$mode),
        "cvt${mode:base}${mode:ftz}${mode:sat}." # FromName
          # ".f64 \t$dst, $src;",
        []>;
  }

  // Generate cvts from all types to all types.
  defm CVT_s8 : CVT_FROM_ALL<"s8", Int16Regs>;
  defm CVT_u8 : CVT_FROM_ALL<"u8", Int16Regs>;
  defm CVT_s16 : CVT_FROM_ALL<"s16", Int16Regs>;
  defm CVT_u16 : CVT_FROM_ALL<"u16", Int16Regs>;
  defm CVT_s32 : CVT_FROM_ALL<"s32", Int32Regs>;
  defm CVT_u32 : CVT_FROM_ALL<"u32", Int32Regs>;
  defm CVT_s64 : CVT_FROM_ALL<"s64", Int64Regs>;
  defm CVT_u64 : CVT_FROM_ALL<"u64", Int64Regs>;
  defm CVT_f16 : CVT_FROM_ALL<"f16", Float16Regs>;
  defm CVT_f32 : CVT_FROM_ALL<"f32", Float32Regs>;
  defm CVT_f64 : CVT_FROM_ALL<"f64", Float64Regs>;

  // Unlike the cvts above, these use the same register class for source and
  // destination.
  def CVT_INREG_s16_s8 :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
              "cvt.s16.s8 \t$dst, $src;", []>;
  def CVT_INREG_s32_s8 :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
              "cvt.s32.s8 \t$dst, $src;", []>;
  def CVT_INREG_s32_s16 :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
              "cvt.s32.s16 \t$dst, $src;", []>;
  def CVT_INREG_s64_s8 :
    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
              "cvt.s64.s8 \t$dst, $src;", []>;
  def CVT_INREG_s64_s16 :
    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
              "cvt.s64.s16 \t$dst, $src;", []>;
  def CVT_INREG_s64_s32 :
    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
              "cvt.s64.s32 \t$dst, $src;", []>;

  // cvt from a single f32 source into the type named by FromName; gated on
  // PTX 7.0 and SM 80.  The asm string prints the mode's base and relu parts.
  multiclass CVT_FROM_FLOAT_SM80<string FromName, RegisterClass RC> {
    def _f32 : NVPTXInst<
        (outs RC:$dst), (ins Float32Regs:$src, CvtMode:$mode),
        "cvt${mode:base}${mode:relu}." # FromName # ".f32 \t$dst, $src;",
        []>,
      Requires<[hasPTX70, hasSM80]>;
  }

  defm CVT_bf16 : CVT_FROM_FLOAT_SM80<"bf16", Int16Regs>;

  // cvt from two f32 sources into the type named by FromName; gated on
  // PTX 7.0 and SM 80.
  multiclass CVT_FROM_FLOAT_V2_SM80<string FromName, RegisterClass RC> {
    def _f32 : NVPTXInst<
        (outs RC:$dst),
        (ins Float32Regs:$src1, Float32Regs:$src2, CvtMode:$mode),
        "cvt${mode:base}${mode:relu}." # FromName
          # ".f32 \t$dst, $src1, $src2;",
        []>,
      Requires<[hasPTX70, hasSM80]>;
  }

  defm CVT_f16x2 : CVT_FROM_FLOAT_V2_SM80<"f16x2", Float16x2Regs>;
  defm CVT_bf16x2 : CVT_FROM_FLOAT_V2_SM80<"bf16x2", Int32Regs>;
}
558
559//-----------------------------------
560// Integer Arithmetic
561//-----------------------------------
562
// int1 "arithmetic" implemented as xor.pred.  Emits a register-register (_rr)
// and a register-immediate (_ri) form matching OpNode on Int1Regs.
multiclass ADD_SUB_i1<SDNode OpNode> {
  def _rr : NVPTXInst<
      (outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
      "xor.pred \t$dst, $a, $b;",
      [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>;
  def _ri : NVPTXInst<
      (outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
      "xor.pred \t$dst, $a, $b;",
      [(set Int1Regs:$dst, (OpNode Int1Regs:$a, imm:$b))]>;
}
572
// int1 addition and subtraction are both just xor.
defm ADD_i1 : ADD_SUB_i1<add>;
defm SUB_i1 : ADD_SUB_i1<sub>;

// int16, int32, and int64 signed addition and subtraction.  Since nvptx is
// 2's complement, these are also used for unsigned arithmetic.
defm ADD : I3<"add.s", add>;
defm SUB : I3<"sub.s", sub>;

// int32 addition and subtraction with carry-out.
// FIXME: PTX 4.3 adds a 64-bit add.cc (and maybe also 64-bit addc.cc?).
defm ADDCC : ADD_SUB_INT_32<"add.cc", addc>;
defm SUBCC : ADD_SUB_INT_32<"sub.cc", subc>;

// int32 addition and subtraction with carry-in and carry-out.
defm ADDCCC : ADD_SUB_INT_32<"addc.cc", adde>;
defm SUBCCC : ADD_SUB_INT_32<"subc.cc", sube>;

// Low-half multiply.
defm MULT : I3<"mul.lo.s", mul>;

// High-half multiply, signed and unsigned.
defm MULTHS : I3<"mul.hi.s", mulhs>;
defm MULTHU : I3<"mul.hi.u", mulhu>;

// Division, signed and unsigned.
defm SDIV : I3<"div.s", sdiv>;
defm UDIV : I3<"div.u", udiv>;

// Remainder, signed and unsigned.  The ri versions of rem.s and rem.u won't
// be selected; DAGCombiner::visitSREM will lower it.
defm SREM : I3<"rem.s", srem>;
defm UREM : I3<"rem.u", urem>;
603
// Integer absolute value, selected directly to the PTX "abs" instruction.
// RC is the register class of both the source and the result.  SizeName is
// the type suffix appended to the mnemonic (e.g. ".s32") and should
// correspond to the width of RC.
multiclass ABS<RegisterClass RC, string SizeName> {
  // Anonymous def: matches (abs RC:$a) and prints "abs<SizeName> $dst, $a;".
  def : NVPTXInst<(outs RC:$dst), (ins RC:$a),
                  !strconcat("abs", SizeName, " \t$dst, $a;"),
                  [(set RC:$dst, (abs RC:$a))]>;
}
defm ABS_16 : ABS<Int16Regs, ".s16">;
defm ABS_32 : ABS<Int32Regs, ".s32">;
defm ABS_64 : ABS<Int64Regs, ".s64">;
615
// Integer max and min, signed and unsigned, at every width I3 provides.
defm SMAX : I3<"max.s", smax>;
defm UMAX : I3<"max.u", umax>;
defm SMIN : I3<"min.s", smin>;
defm UMIN : I3<"min.u", umin>;
621
622//
623// Wide multiplication
624//
// mul.wide.s32: 32-bit sources, 64-bit result.  The second source is a
// register, an i32 immediate, or an i64 immediate.  None of these defs
// carries a pattern.
def MULWIDES64 : NVPTXInst<
    (outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
    "mul.wide.s32 \t$dst, $a, $b;", []>;
def MULWIDES64Imm : NVPTXInst<
    (outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
    "mul.wide.s32 \t$dst, $a, $b;", []>;
def MULWIDES64Imm64 : NVPTXInst<
    (outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
    "mul.wide.s32 \t$dst, $a, $b;", []>;

// mul.wide.u32: unsigned counterpart of the above.
def MULWIDEU64 : NVPTXInst<
    (outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
    "mul.wide.u32 \t$dst, $a, $b;", []>;
def MULWIDEU64Imm : NVPTXInst<
    (outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
    "mul.wide.u32 \t$dst, $a, $b;", []>;
def MULWIDEU64Imm64 : NVPTXInst<
    (outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
    "mul.wide.u32 \t$dst, $a, $b;", []>;

// mul.wide.s16: 16-bit sources, 32-bit result.  The second source is a
// register, an i16 immediate, or an i32 immediate.
def MULWIDES32 : NVPTXInst<
    (outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
    "mul.wide.s16 \t$dst, $a, $b;", []>;
def MULWIDES32Imm : NVPTXInst<
    (outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
    "mul.wide.s16 \t$dst, $a, $b;", []>;
def MULWIDES32Imm32 : NVPTXInst<
    (outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
    "mul.wide.s16 \t$dst, $a, $b;", []>;

// mul.wide.u16: unsigned counterpart of the above.
def MULWIDEU32 : NVPTXInst<
    (outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
    "mul.wide.u16 \t$dst, $a, $b;", []>;
def MULWIDEU32Imm : NVPTXInst<
    (outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
    "mul.wide.u16 \t$dst, $a, $b;", []>;
def MULWIDEU32Imm32 : NVPTXInst<
    (outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
    "mul.wide.u16 \t$dst, $a, $b;", []>;
664
// Type profile for the mul.wide nodes: one result, two operands, and the two
// operands have the same type.
def SDTMulWide : SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>]>;

// Signed and unsigned widening-multiply nodes.
def mul_wide_signed : SDNode<"NVPTXISD::MUL_WIDE_SIGNED", SDTMulWide>;
def mul_wide_unsigned : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide>;
668
// Matchers for the signed and unsigned mul.wide ISD nodes.  Every pattern is
// gated on doMulWide.

// 16-bit sources -> 32-bit result.
def : Pat<(i32 (mul_wide_signed Int16Regs:$a, Int16Regs:$b)),
          (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>,
  Requires<[doMulWide]>;
def : Pat<(i32 (mul_wide_signed Int16Regs:$a, imm:$b)),
          (MULWIDES32Imm Int16Regs:$a, imm:$b)>,
  Requires<[doMulWide]>;
def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, Int16Regs:$b)),
          (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>,
  Requires<[doMulWide]>;
def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, imm:$b)),
          (MULWIDEU32Imm Int16Regs:$a, imm:$b)>,
  Requires<[doMulWide]>;

// 32-bit sources -> 64-bit result.
def : Pat<(i64 (mul_wide_signed Int32Regs:$a, Int32Regs:$b)),
          (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>,
  Requires<[doMulWide]>;
def : Pat<(i64 (mul_wide_signed Int32Regs:$a, imm:$b)),
          (MULWIDES64Imm Int32Regs:$a, imm:$b)>,
  Requires<[doMulWide]>;
def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, Int32Regs:$b)),
          (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>,
  Requires<[doMulWide]>;
def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, imm:$b)),
          (MULWIDEU64Imm Int32Regs:$a, imm:$b)>,
  Requires<[doMulWide]>;
695
// Immediate predicates used when rewriting patterns into mul.wide.

// Immediate that fits in a signed 32-bit integer.
def SInt32Const : PatLeaf<(imm), [{
  return N->getAPIntValue().isSignedIntN(32);
}]>;

// Immediate that fits in an unsigned 32-bit integer.
def UInt32Const : PatLeaf<(imm), [{
  return N->getAPIntValue().isIntN(32);
}]>;

// Immediate that fits in a signed 16-bit integer.
def SInt16Const : PatLeaf<(imm), [{
  return N->getAPIntValue().isSignedIntN(16);
}]>;

// Immediate that fits in an unsigned 16-bit integer.
def UInt16Const : PatLeaf<(imm), [{
  return N->getAPIntValue().isIntN(16);
}]>;

// Shift amount in [0, 32): only then does (x << amount) still fit an int32.
def Int5Const : PatLeaf<(imm), [{
  const APInt &ShAmt = N->getAPIntValue();
  return ShAmt.sge(0) && ShAmt.slt(32);
}]>;

// Shift amount in [0, 16): only then does (x << amount) still fit an int16.
def Int4Const : PatLeaf<(imm), [{
  const APInt &ShAmt = N->getAPIntValue();
  return ShAmt.sge(0) && ShAmt.slt(16);
}]>;
728
// Turn a shift-amount immediate into the equivalent multiplier (1 << amount),
// materialised as a 32-bit target constant.
def SHL2MUL32 : SDNodeXForm<imm, [{
  APInt One(32, 1);
  return CurDAG->getTargetConstant(One.shl(N->getAPIntValue()), SDLoc(N),
                                   MVT::i32);
}]>;

// Same transformation, producing a 16-bit target constant.
def SHL2MUL16 : SDNodeXForm<imm, [{
  APInt One(16, 1);
  return CurDAG->getTargetConstant(One.shl(N->getAPIntValue()), SDLoc(N),
                                   MVT::i16);
}]>;
740
// A sign/zero-extend followed by a left shift by an in-range immediate is
// selected as mul.wide by (1 << amount); SHL2MUL32/SHL2MUL16 compute the
// multiplier from the shift amount.
let Predicates = [doMulWide] in {
  // 32-bit source extended and shifted: 64-bit mul.wide.
  def : Pat<(shl (sext Int32Regs:$a), (i32 Int5Const:$b)),
            (MULWIDES64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>;
  def : Pat<(shl (zext Int32Regs:$a), (i32 Int5Const:$b)),
            (MULWIDEU64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>;

  // 16-bit source extended and shifted: 32-bit mul.wide.
  def : Pat<(shl (sext Int16Regs:$a), (i16 Int4Const:$b)),
            (MULWIDES32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>;
  def : Pat<(shl (zext Int16Regs:$a), (i16 Int4Const:$b)),
            (MULWIDEU32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>;
}
755
// A multiply whose operands are both sign-extended (or both zero-extended),
// or one extended operand times an immediate that fits the narrow type, is
// selected as mul.wide on the narrow operands.
let Predicates = [doMulWide] in {
  // Signed, 32-bit sources.
  def : Pat<(mul (sext Int32Regs:$a), (sext Int32Regs:$b)),
            (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>;
  def : Pat<(mul (sext Int32Regs:$a), (i64 SInt32Const:$b)),
            (MULWIDES64Imm64 Int32Regs:$a, (i64 SInt32Const:$b))>;

  // Unsigned, 32-bit sources.
  def : Pat<(mul (zext Int32Regs:$a), (zext Int32Regs:$b)),
            (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>;
  def : Pat<(mul (zext Int32Regs:$a), (i64 UInt32Const:$b)),
            (MULWIDEU64Imm64 Int32Regs:$a, (i64 UInt32Const:$b))>;

  // Signed, 16-bit sources.
  def : Pat<(mul (sext Int16Regs:$a), (sext Int16Regs:$b)),
            (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>;
  def : Pat<(mul (sext Int16Regs:$a), (i32 SInt16Const:$b)),
            (MULWIDES32Imm32 Int16Regs:$a, (i32 SInt16Const:$b))>;

  // Unsigned, 16-bit sources.
  def : Pat<(mul (zext Int16Regs:$a), (zext Int16Regs:$b)),
            (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>;
  def : Pat<(mul (zext Int16Regs:$a), (i32 UInt16Const:$b)),
            (MULWIDEU32Imm32 Int16Regs:$a, (i32 UInt16Const:$b))>;
}
784
//
// Integer multiply-add
//

// Type profile for the integer multiply-add node: one result and three
// operands.  The constraints below force the result and all three operands
// to be integers of one and the same type.
def SDTIMAD
  : SDTypeProfile<1, 3,
                  [SDTCisSameAs<0, 1>, SDTCisInt<0>, SDTCisInt<2>,
                   SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>]>;

def imad : SDNode<"NVPTXISD::IMAD", SDTIMAD>;
792
// Instructions selected from the imad node and printed as mad.lo.<TypeStr>.
// For each width this expands to four records named <prefix>rrr, <prefix>rri,
// <prefix>rir and <prefix>rii; the suffix letters give, in order, the kind
// (r = register, i = immediate) of the $a, $b and $c operands.  $a is always
// a register.
multiclass MadLoInsts<string TypeStr, RegisterClass RC, Operand ImmCls> {
  def rrr :
    NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
              "mad.lo." # TypeStr # " \t$dst, $a, $b, $c;",
              [(set RC:$dst, (imad RC:$a, RC:$b, RC:$c))]>;
  def rri :
    NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, ImmCls:$c),
              "mad.lo." # TypeStr # " \t$dst, $a, $b, $c;",
              [(set RC:$dst, (imad RC:$a, RC:$b, imm:$c))]>;
  def rir :
    NVPTXInst<(outs RC:$dst), (ins RC:$a, ImmCls:$b, RC:$c),
              "mad.lo." # TypeStr # " \t$dst, $a, $b, $c;",
              [(set RC:$dst, (imad RC:$a, imm:$b, RC:$c))]>;
  def rii :
    NVPTXInst<(outs RC:$dst), (ins RC:$a, ImmCls:$b, ImmCls:$c),
              "mad.lo." # TypeStr # " \t$dst, $a, $b, $c;",
              [(set RC:$dst, (imad RC:$a, imm:$b, imm:$c))]>;
}

defm MAD16 : MadLoInsts<"s16", Int16Regs, i16imm>;
defm MAD32 : MadLoInsts<"s32", Int32Regs, i32imm>;
defm MAD64 : MadLoInsts<"s64", Int64Regs, i64imm>;
855
// Integer negation: selects the ineg pattern fragment on a register of class
// RC and prints neg.s<Width>.
class INegInst<string Width, RegisterClass RC>
  : NVPTXInst<(outs RC:$dst), (ins RC:$src),
              "neg.s" # Width # " \t$dst, $src;",
              [(set RC:$dst, (ineg RC:$src))]>;

def INEG16 : INegInst<"16", Int16Regs>;
def INEG32 : INegInst<"32", Int32Regs>;
def INEG64 : INegInst<"64", Int64Regs>;
868
//-----------------------------------
// Floating Point Arithmetic
//-----------------------------------

// Matches an FP immediate that has IEEE single semantics and the value 1.0f.
def FloatConst1 : PatLeaf<(fpimm), [{
  const llvm::APFloat &Val = N->getValueAPF();
  return &Val.getSemantics() == &llvm::APFloat::IEEEsingle() &&
         Val.convertToFloat() == 1.0f;
}]>;

// Matches an FP immediate that has IEEE double semantics and the value 1.0.
def DoubleConst1 : PatLeaf<(fpimm), [{
  const llvm::APFloat &Val = N->getValueAPF();
  return &Val.getSemantics() == &llvm::APFloat::IEEEdouble() &&
         Val.convertToDouble() == 1.0;
}]>;
883
// Materialise an FP16 constant in a register.
//
// ptxas has no hex representation for fp16, so fp16 immediates cannot be used
// directly as operands of .f16 instructions.  The constant is moved into a
// register with mov.b16 instead.  No selection pattern is attached here.
def LOAD_CONST_F16
  : NVPTXInst<(outs Float16Regs:$dst),
              (ins f16imm:$a),
              "mov.b16 \t$dst, $a;",
              []>;
892
// Binary FP arithmetic.  add/sub/mul are instantiated from
// F3_fma_component rather than plain F3 (both multiclasses are defined
// elsewhere in this file).
defm FADD : F3_fma_component<"add", fadd>;
defm FSUB : F3_fma_component<"sub", fsub>;
defm FMUL : F3_fma_component<"mul", fmul>;

// Minimum / maximum.
defm FMIN : F3<"min", fminnum>;
defm FMAX : F3<"max", fmaxnum>;

// NaN-propagating minimum / maximum.
// Note: min.NaN.f64 and max.NaN.f64 do not actually exist.
defm FMINNAN : F3<"min.NaN", fminimum>;
defm FMAXNAN : F3<"max.NaN", fmaximum>;

// Unary FP operations.
defm FABS  : F2<"abs", fabs>;
defm FNEG  : F2<"neg", fneg>;
defm FSQRT : F2<"sqrt.rn", fsqrt>;
906
//
// F64 division
//

// 1.0 / $b is printed as a reciprocal; the matched constant $a stays an
// operand of the instruction but does not appear in the assembly string.
def FDIV641r
  : NVPTXInst<(outs Float64Regs:$dst), (ins f64imm:$a, Float64Regs:$b),
              "rcp.rn.f64 \t$dst, $b;",
              [(set Float64Regs:$dst,
                    (fdiv DoubleConst1:$a, Float64Regs:$b))]>;

// Register / register.
def FDIV64rr
  : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a, Float64Regs:$b),
              "div.rn.f64 \t$dst, $a, $b;",
              [(set Float64Regs:$dst,
                    (fdiv Float64Regs:$a, Float64Regs:$b))]>;

// Register / immediate.
def FDIV64ri
  : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a, f64imm:$b),
              "div.rn.f64 \t$dst, $a, $b;",
              [(set Float64Regs:$dst,
                    (fdiv Float64Regs:$a, fpimm:$b))]>;
925
//
// F32 Approximate reciprocal
//

// 1.0f / $b when approximate f32 division is enabled.  The FTZ variant is
// defined first and additionally requires doF32FTZ.
let Predicates = [do_DIVF32_APPROX, doF32FTZ] in
def FDIV321r_ftz
  : NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$a, Float32Regs:$b),
              "rcp.approx.ftz.f32 \t$dst, $b;",
              [(set Float32Regs:$dst,
                    (fdiv FloatConst1:$a, Float32Regs:$b))]>;

let Predicates = [do_DIVF32_APPROX] in
def FDIV321r
  : NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$a, Float32Regs:$b),
              "rcp.approx.f32 \t$dst, $b;",
              [(set Float32Regs:$dst,
                    (fdiv FloatConst1:$a, Float32Regs:$b))]>;
//
// F32 Approximate division
//

// FTZ variants (register and immediate divisor).
let Predicates = [do_DIVF32_APPROX, doF32FTZ] in {
  def FDIV32approxrr_ftz
    : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
                "div.approx.ftz.f32 \t$dst, $a, $b;",
                [(set Float32Regs:$dst,
                      (fdiv Float32Regs:$a, Float32Regs:$b))]>;
  def FDIV32approxri_ftz
    : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
                "div.approx.ftz.f32 \t$dst, $a, $b;",
                [(set Float32Regs:$dst,
                      (fdiv Float32Regs:$a, fpimm:$b))]>;
}

// Non-FTZ variants (register and immediate divisor).
let Predicates = [do_DIVF32_APPROX] in {
  def FDIV32approxrr
    : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
                "div.approx.f32 \t$dst, $a, $b;",
                [(set Float32Regs:$dst,
                      (fdiv Float32Regs:$a, Float32Regs:$b))]>;
  def FDIV32approxri
    : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
                "div.approx.f32 \t$dst, $a, $b;",
                [(set Float32Regs:$dst,
                      (fdiv Float32Regs:$a, fpimm:$b))]>;
}
//
// F32 Semi-accurate reciprocal
//
// rcp.approx gives the same result as div.full(1.0f, a) and is faster, so it
// is also used for 1.0f / $b under do_DIVF32_FULL.
//
let Predicates = [do_DIVF32_FULL, doF32FTZ] in
def FDIV321r_approx_ftz
  : NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$a, Float32Regs:$b),
              "rcp.approx.ftz.f32 \t$dst, $b;",
              [(set Float32Regs:$dst,
                    (fdiv FloatConst1:$a, Float32Regs:$b))]>;

let Predicates = [do_DIVF32_FULL] in
def FDIV321r_approx
  : NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$a, Float32Regs:$b),
              "rcp.approx.f32 \t$dst, $b;",
              [(set Float32Regs:$dst,
                    (fdiv FloatConst1:$a, Float32Regs:$b))]>;
//
// F32 Semi-accurate division
//

// FTZ variants (register and immediate divisor).
let Predicates = [do_DIVF32_FULL, doF32FTZ] in {
  def FDIV32rr_ftz
    : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
                "div.full.ftz.f32 \t$dst, $a, $b;",
                [(set Float32Regs:$dst,
                      (fdiv Float32Regs:$a, Float32Regs:$b))]>;
  def FDIV32ri_ftz
    : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
                "div.full.ftz.f32 \t$dst, $a, $b;",
                [(set Float32Regs:$dst,
                      (fdiv Float32Regs:$a, fpimm:$b))]>;
}

// Non-FTZ variants (register and immediate divisor).
let Predicates = [do_DIVF32_FULL] in {
  def FDIV32rr
    : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
                "div.full.f32 \t$dst, $a, $b;",
                [(set Float32Regs:$dst,
                      (fdiv Float32Regs:$a, Float32Regs:$b))]>;
  def FDIV32ri
    : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
                "div.full.f32 \t$dst, $a, $b;",
                [(set Float32Regs:$dst,
                      (fdiv Float32Regs:$a, fpimm:$b))]>;
}
//
// F32 Accurate reciprocal
//

// 1.0f / $b printed as rcp.rn.  The FTZ variant requires doF32FTZ; the plain
// variant carries no predicate.
let Predicates = [doF32FTZ] in
def FDIV321r_prec_ftz
  : NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$a, Float32Regs:$b),
              "rcp.rn.ftz.f32 \t$dst, $b;",
              [(set Float32Regs:$dst,
                    (fdiv FloatConst1:$a, Float32Regs:$b))]>;

def FDIV321r_prec
  : NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$a, Float32Regs:$b),
              "rcp.rn.f32 \t$dst, $b;",
              [(set Float32Regs:$dst,
                    (fdiv FloatConst1:$a, Float32Regs:$b))]>;
//
// F32 Accurate division
//

// FTZ variants (register and immediate divisor).
let Predicates = [doF32FTZ] in {
  def FDIV32rr_prec_ftz
    : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
                "div.rn.ftz.f32 \t$dst, $a, $b;",
                [(set Float32Regs:$dst,
                      (fdiv Float32Regs:$a, Float32Regs:$b))]>;
  def FDIV32ri_prec_ftz
    : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
                "div.rn.ftz.f32 \t$dst, $a, $b;",
                [(set Float32Regs:$dst,
                      (fdiv Float32Regs:$a, fpimm:$b))]>;
}

// Variants without any predicate (register and immediate divisor).
def FDIV32rr_prec
  : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
              "div.rn.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                    (fdiv Float32Regs:$a, Float32Regs:$b))]>;
def FDIV32ri_prec
  : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
              "div.rn.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                    (fdiv Float32Regs:$a, fpimm:$b))]>;
1051
//
// FMA
//

// Fused multiply-add on register class RC, guarded by Pred.  Defines four
// variants whose suffix gives the kind (r = register, i = immediate) of the
// $a, $b and $c operands; $a is always a register and immediates use ImmCls.
multiclass FMA<string OpcStr, RegisterClass RC, Operand ImmCls, Predicate Pred> {
  let Predicates = [Pred] in {
    def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
                        OpcStr # " \t$dst, $a, $b, $c;",
                        [(set RC:$dst, (fma RC:$a, RC:$b, RC:$c))]>;
    def rri : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, ImmCls:$c),
                        OpcStr # " \t$dst, $a, $b, $c;",
                        [(set RC:$dst, (fma RC:$a, RC:$b, fpimm:$c))]>;
    def rir : NVPTXInst<(outs RC:$dst), (ins RC:$a, ImmCls:$b, RC:$c),
                        OpcStr # " \t$dst, $a, $b, $c;",
                        [(set RC:$dst, (fma RC:$a, fpimm:$b, RC:$c))]>;
    def rii : NVPTXInst<(outs RC:$dst), (ins RC:$a, ImmCls:$b, ImmCls:$c),
                        OpcStr # " \t$dst, $a, $b, $c;",
                        [(set RC:$dst, (fma RC:$a, fpimm:$b, fpimm:$c))]>;
  }
}
1077
// Fused multiply-add for the f16 register classes.  Only the all-register
// form is defined, and it requires useFP16Math in addition to Pred.
multiclass FMA_F16<string OpcStr, RegisterClass RC, Predicate Pred> {
  let Predicates = [useFP16Math, Pred] in
  def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
                      OpcStr # " \t$dst, $a, $b, $c;",
                      [(set RC:$dst, (fma RC:$a, RC:$b, RC:$c))]>;
}
1084
// f16 and f16x2: the FTZ form (guarded by doF32FTZ) is instantiated before
// the unguarded form.
defm FMA16_ftz   : FMA_F16<"fma.rn.ftz.f16", Float16Regs, doF32FTZ>;
defm FMA16       : FMA_F16<"fma.rn.f16", Float16Regs, True>;
defm FMA16x2_ftz : FMA_F16<"fma.rn.ftz.f16x2", Float16x2Regs, doF32FTZ>;
defm FMA16x2     : FMA_F16<"fma.rn.f16x2", Float16x2Regs, True>;

// f32 (FTZ and plain) and f64 (plain only).
defm FMA32_ftz   : FMA<"fma.rn.ftz.f32", Float32Regs, f32imm, doF32FTZ>;
defm FMA32       : FMA<"fma.rn.f32", Float32Regs, f32imm, True>;
defm FMA64       : FMA<"fma.rn.f64", Float64Regs, f64imm, True>;
1092
// sin/cos
//
// f32 sine and cosine are selected to the .approx instructions, and only when
// allowUnsafeFPMath holds.
let Predicates = [allowUnsafeFPMath] in {
  def SINF : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
                       "sin.approx.f32 \t$dst, $src;",
                       [(set Float32Regs:$dst, (fsin Float32Regs:$src))]>;
  def COSF : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
                       "cos.approx.f32 \t$dst, $src;",
                       [(set Float32Regs:$dst, (fcos Float32Regs:$src))]>;
}
1102
// Lower (frem x, y) into (sub x, (mul (ftrunc (div x, y)) y)),
// i.e. "poor man's fmod()".
//
// The quotient comes from the accurate (div.rn) division and is rounded to an
// integer with a CVT in RZI mode before being multiplied back by y.

// frem - f32 FTZ
let Predicates = [doF32FTZ] in {
  def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
            (FSUBf32rr_ftz
              Float32Regs:$x,
              (FMULf32rr_ftz
                (CVT_f32_f32
                  (FDIV32rr_prec_ftz Float32Regs:$x, Float32Regs:$y),
                  CvtRZI_FTZ),
                Float32Regs:$y))>;
  def : Pat<(frem Float32Regs:$x, fpimm:$y),
            (FSUBf32rr_ftz
              Float32Regs:$x,
              (FMULf32ri_ftz
                (CVT_f32_f32
                  (FDIV32ri_prec_ftz Float32Regs:$x, fpimm:$y),
                  CvtRZI_FTZ),
                fpimm:$y))>;
}

// frem - f32
def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
          (FSUBf32rr
            Float32Regs:$x,
            (FMULf32rr
              (CVT_f32_f32
                (FDIV32rr_prec Float32Regs:$x, Float32Regs:$y),
                CvtRZI),
              Float32Regs:$y))>;
def : Pat<(frem Float32Regs:$x, fpimm:$y),
          (FSUBf32rr
            Float32Regs:$x,
            (FMULf32ri
              (CVT_f32_f32
                (FDIV32ri_prec Float32Regs:$x, fpimm:$y),
                CvtRZI),
              fpimm:$y))>;

// frem - f64
def : Pat<(frem Float64Regs:$x, Float64Regs:$y),
          (FSUBf64rr
            Float64Regs:$x,
            (FMULf64rr
              (CVT_f64_f64
                (FDIV64rr Float64Regs:$x, Float64Regs:$y),
                CvtRZI),
              Float64Regs:$y))>;
def : Pat<(frem Float64Regs:$x, fpimm:$y),
          (FSUBf64rr
            Float64Regs:$x,
            (FMULf64ri
              (CVT_f64_f64
                (FDIV64ri Float64Regs:$x, fpimm:$y),
                CvtRZI),
              fpimm:$y))>;
1137
//-----------------------------------
// Bitwise operations
//-----------------------------------

// Template for two-source bitwise operations (destination plus two sources).
// For the given OpcStr it creates .pred (predicate registers, i.e. i1), .b16,
// .b32 and .b64 versions, each in a register/register (rr) and a
// register/immediate (ri) form.
multiclass BITWISE<string OpcStr, SDNode OpNode> {
  // Predicate (i1) registers.
  def b1rr :
    NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
              OpcStr # ".pred  \t$dst, $a, $b;",
              [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>;
  def b1ri :
    NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
              OpcStr # ".pred  \t$dst, $a, $b;",
              [(set Int1Regs:$dst, (OpNode Int1Regs:$a, imm:$b))]>;

  // 16-bit registers.
  def b16rr :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
              OpcStr # ".b16  \t$dst, $a, $b;",
              [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>;
  def b16ri :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
              OpcStr # ".b16  \t$dst, $a, $b;",
              [(set Int16Regs:$dst, (OpNode Int16Regs:$a, imm:$b))]>;

  // 32-bit registers.
  def b32rr :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # ".b32  \t$dst, $a, $b;",
              [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
  def b32ri :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
              OpcStr # ".b32  \t$dst, $a, $b;",
              [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;

  // 64-bit registers.
  def b64rr :
    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
              OpcStr # ".b64  \t$dst, $a, $b;",
              [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>;
  def b64ri :
    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
              OpcStr # ".b64  \t$dst, $a, $b;",
              [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;
}
1178
// Instantiate the BITWISE template for or, and, xor.
defm OR  : BITWISE<"or",  or>;
defm AND : BITWISE<"and", and>;
defm XOR : BITWISE<"xor", xor>;
1182
// Bitwise complement: selects the `not` pattern fragment on a register of
// class RC and prints not.<TypeStr>.
class NotInst<string TypeStr, RegisterClass RC>
  : NVPTXInst<(outs RC:$dst), (ins RC:$src),
              "not." # TypeStr # " \t$dst, $src;",
              [(set RC:$dst, (not RC:$src))]>;

def NOT1  : NotInst<"pred", Int1Regs>;
def NOT16 : NotInst<"b16", Int16Regs>;
def NOT32 : NotInst<"b32", Int32Regs>;
def NOT64 : NotInst<"b64", Int64Regs>;
1195
// Template for left/right shifts.  Each instruction has three operands:
//   [dest (reg), src (reg), shift (reg or imm)].
// dest and src may be int64, int32, or int16; the shift amount is always
// int32.
//
// In addition, a 32-bit form whose source and shift amount are both
// immediates (i32ii) is defined.
multiclass SHIFT<string OpcStr, SDNode OpNode> {
  // 64-bit value.
  def i64rr :
    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int32Regs:$b),
              OpcStr # "64 \t$dst, $a, $b;",
              [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int32Regs:$b))]>;
  def i64ri :
    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i32imm:$b),
              OpcStr # "64 \t$dst, $a, $b;",
              [(set Int64Regs:$dst, (OpNode Int64Regs:$a, (i32 imm:$b)))]>;

  // 32-bit value.
  def i32rr :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # "32 \t$dst, $a, $b;",
              [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
  def i32ri :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
              OpcStr # "32 \t$dst, $a, $b;",
              [(set Int32Regs:$dst, (OpNode Int32Regs:$a, (i32 imm:$b)))]>;
  def i32ii :
    NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$a, i32imm:$b),
              OpcStr # "32 \t$dst, $a, $b;",
              [(set Int32Regs:$dst, (OpNode (i32 imm:$a), (i32 imm:$b)))]>;

  // 16-bit value.
  def i16rr :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int32Regs:$b),
              OpcStr # "16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int32Regs:$b))]>;
  def i16ri :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
              OpcStr # "16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (i32 imm:$b)))]>;
}
1231
// Instantiate the SHIFT template.  The template appends the bit width to the
// mnemonic prefix given here (e.g. "shl.b" becomes "shl.b32").
defm SHL : SHIFT<"shl.b", shl>;  // left shift
defm SRA : SHIFT<"shr.s", sra>;  // arithmetic right shift
defm SRL : SHIFT<"shr.u", srl>;  // logical right shift
1235
// Bit-reverse: selects the bitreverse node on a register of class RC and
// prints brev.b<Width>.
class BRevInst<string Width, RegisterClass RC>
  : NVPTXInst<(outs RC:$dst), (ins RC:$a),
              "brev.b" # Width # " \t$dst, $a;",
              [(set RC:$dst, (bitreverse RC:$a))]>;

def BREV32 : BRevInst<"32", Int32Regs>;
def BREV64 : BRevInst<"64", Int64Regs>;
1245
//
// Rotate: Use ptx shf instruction if available.
//
// A 32-bit rotate is a funnel shift in wrap mode whose two data inputs are
// the same register.  All four forms require hasHWROT32.
let Predicates = [hasHWROT32] in {
  // 32 bit r2 = rotl r1, n
  //    =>
  //        r2 = shf.l r1, r1, n
  def ROTL32imm_hw
    : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt),
                "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
                [(set Int32Regs:$dst,
                      (rotl Int32Regs:$src, (i32 imm:$amt)))]>;

  def ROTL32reg_hw
    : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
                "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
                [(set Int32Regs:$dst,
                      (rotl Int32Regs:$src, Int32Regs:$amt))]>;

  // 32 bit r2 = rotr r1, n
  //    =>
  //        r2 = shf.r r1, r1, n
  def ROTR32imm_hw
    : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt),
                "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
                [(set Int32Regs:$dst,
                      (rotr Int32Regs:$src, (i32 imm:$amt)))]>;

  def ROTR32reg_hw
    : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
                "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
                [(set Int32Regs:$dst,
                      (rotr Int32Regs:$src, Int32Regs:$amt))]>;
}
1279
// 32-bit software rotate by immediate.  $amt2 should equal 32 - $amt1.
// The instruction shifts $src left by $amt1 and right by $amt2 into two
// scratch registers and adds the halves.  It has no pattern of its own; the
// Pat records below select it.
def ROT32imm_sw
  : NVPTXInst<(outs Int32Regs:$dst),
              (ins Int32Regs:$src, i32imm:$amt1, i32imm:$amt2),
              "{{\n\t"
              ".reg .b32 %lhs;\n\t"
              ".reg .b32 %rhs;\n\t"
              "shl.b32 \t%lhs, $src, $amt1;\n\t"
              "shr.b32 \t%rhs, $src, $amt2;\n\t"
              "add.u32 \t$dst, %lhs, %rhs;\n\t"
              "}}",
              []>;

// Maps an immediate amount to (32 - amount) as an i32 target constant.
def SUB_FRM_32 : SDNodeXForm<imm, [{
  uint64_t Amt = N->getZExtValue();
  return CurDAG->getTargetConstant(32 - Amt, SDLoc(N), MVT::i32);
}]>;

// Without hardware rotate support, rotl uses ($amt, 32 - $amt) and rotr uses
// (32 - $amt, $amt) as the (left, right) shift amounts.
let Predicates = [noHWROT32] in {
  def : Pat<(rotl Int32Regs:$src, (i32 imm:$amt)),
            (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>;
  def : Pat<(rotr Int32Regs:$src, (i32 imm:$amt)),
            (ROT32imm_sw Int32Regs:$src, (SUB_FRM_32 node:$amt), imm:$amt)>;
}
1303
// 32-bit software rotates by a register amount, used when noHWROT32 holds.
// Each expands to a braced PTX sequence that computes 32 - $amt into a
// scratch register, shifts $src in both directions and adds the two halves.
let Predicates = [noHWROT32] in {
  // Rotate left.
  def ROTL32reg_sw
    : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
                "{{\n\t"
                ".reg .b32 %lhs;\n\t"
                ".reg .b32 %rhs;\n\t"
                ".reg .b32 %amt2;\n\t"
                "shl.b32 \t%lhs, $src, $amt;\n\t"
                "sub.s32 \t%amt2, 32, $amt;\n\t"
                "shr.b32 \t%rhs, $src, %amt2;\n\t"
                "add.u32 \t$dst, %lhs, %rhs;\n\t"
                "}}",
                [(set Int32Regs:$dst,
                      (rotl Int32Regs:$src, Int32Regs:$amt))]>;

  // Rotate right.
  def ROTR32reg_sw
    : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
                "{{\n\t"
                ".reg .b32 %lhs;\n\t"
                ".reg .b32 %rhs;\n\t"
                ".reg .b32 %amt2;\n\t"
                "shr.b32 \t%lhs, $src, $amt;\n\t"
                "sub.s32 \t%amt2, 32, $amt;\n\t"
                "shl.b32 \t%rhs, $src, %amt2;\n\t"
                "add.u32 \t$dst, %lhs, %rhs;\n\t"
                "}}",
                [(set Int32Regs:$dst,
                      (rotr Int32Regs:$src, Int32Regs:$amt))]>;
}
1333
// 64-bit software rotate by immediate.  $amt2 should equal 64 - $amt1.
// The instruction shifts $src left by $amt1 and right by $amt2 into two
// scratch registers and adds the halves.  It has no pattern of its own; the
// Pat records below select it.
def ROT64imm_sw
  : NVPTXInst<(outs Int64Regs:$dst),
              (ins Int64Regs:$src, i32imm:$amt1, i32imm:$amt2),
              "{{\n\t"
              ".reg .b64 %lhs;\n\t"
              ".reg .b64 %rhs;\n\t"
              "shl.b64 \t%lhs, $src, $amt1;\n\t"
              "shr.b64 \t%rhs, $src, $amt2;\n\t"
              "add.u64 \t$dst, %lhs, %rhs;\n\t"
              "}}",
              []>;

// Maps an immediate amount to (64 - amount) as an i32 target constant.
def SUB_FRM_64 : SDNodeXForm<imm, [{
  uint64_t Amt = N->getZExtValue();
  return CurDAG->getTargetConstant(64 - Amt, SDLoc(N), MVT::i32);
}]>;

// rotl uses ($amt, 64 - $amt) and rotr uses (64 - $amt, $amt) as the
// (left, right) shift amounts.  These patterns carry no predicate.
def : Pat<(rotl Int64Regs:$src, (i32 imm:$amt)),
          (ROT64imm_sw Int64Regs:$src, imm:$amt, (SUB_FRM_64 node:$amt))>;
def : Pat<(rotr Int64Regs:$src, (i32 imm:$amt)),
          (ROT64imm_sw Int64Regs:$src, (SUB_FRM_64 node:$amt), imm:$amt)>;
1355
// 64-bit software rotate left by register.  The braced PTX sequence computes
// 64 - $amt into a scratch register, shifts $src in both directions and adds
// the two halves.  The rotate amount is a 32-bit register.
def ROTL64reg_sw
  : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt),
              "{{\n\t"
              ".reg .b64 %lhs;\n\t"
              ".reg .b64 %rhs;\n\t"
              ".reg .u32 %amt2;\n\t"
              "shl.b64 \t%lhs, $src, $amt;\n\t"
              "sub.u32 \t%amt2, 64, $amt;\n\t"
              "shr.b64 \t%rhs, $src, %amt2;\n\t"
              "add.u64 \t$dst, %lhs, %rhs;\n\t"
              "}}",
              [(set Int64Regs:$dst,
                    (rotl Int64Regs:$src, Int32Regs:$amt))]>;

// 64-bit software rotate right by register; same sequence with the two shift
// directions swapped.
def ROTR64reg_sw
  : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt),
              "{{\n\t"
              ".reg .b64 %lhs;\n\t"
              ".reg .b64 %rhs;\n\t"
              ".reg .u32 %amt2;\n\t"
              "shr.b64 \t%lhs, $src, $amt;\n\t"
              "sub.u32 \t%amt2, 64, $amt;\n\t"
              "shl.b64 \t%rhs, $src, %amt2;\n\t"
              "add.u64 \t$dst, %lhs, %rhs;\n\t"
              "}}",
              [(set Int64Regs:$dst,
                    (rotr Int64Regs:$src, Int32Regs:$amt))]>;
1382
1383//
// Funnel shift in clamp mode
1385//
1386
// SDNodes for the clamp-mode funnel shifts, so that they can be created from
// the DAG code, e.g. NVPTXISelLowering (LowerShiftLeftParts and
// LowerShiftRightParts).
def FUN_SHFL_CLAMP
  : SDNode<"NVPTXISD::FUN_SHFL_CLAMP", SDTIntShiftDOp, []>;
def FUN_SHFR_CLAMP
  : SDNode<"NVPTXISD::FUN_SHFR_CLAMP", SDTIntShiftDOp, []>;

// Left funnel shift of the ($lo, $hi) pair by $amt, clamp mode.
def FUNSHFLCLAMP
  : NVPTXInst<(outs Int32Regs:$dst),
              (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
              "shf.l.clamp.b32 \t$dst, $lo, $hi, $amt;",
              [(set Int32Regs:$dst,
                    (FUN_SHFL_CLAMP Int32Regs:$lo, Int32Regs:$hi,
                                    Int32Regs:$amt))]>;

// Right funnel shift of the ($lo, $hi) pair by $amt, clamp mode.
def FUNSHFRCLAMP
  : NVPTXInst<(outs Int32Regs:$dst),
              (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
              "shf.r.clamp.b32 \t$dst, $lo, $hi, $amt;",
              [(set Int32Regs:$dst,
                    (FUN_SHFR_CLAMP Int32Regs:$lo, Int32Regs:$hi,
                                    Int32Regs:$amt))]>;
1405
//
// BFE - bit-field extract
//

// Template for BFE instructions.  Takes four args,
//   [dest (reg), src (reg), start (reg or imm), len (reg or imm)].
// PTX defines the last two operands of bfe as the starting bit position and
// the length in bits of the field (not an end position).
// Start may be an imm only if len is also an imm.  FIXME: Is this a
// restriction in PTX?
//
// dest and src may be int32 or int64, but start and len are always int32.
//
// None of these instructions carries a selection pattern.
multiclass BFE<string TyStr, RegisterClass RC> {
  // start: register, len: register.
  def rrr
    : NVPTXInst<(outs RC:$d),
                (ins RC:$a, Int32Regs:$b, Int32Regs:$c),
                "bfe." # TyStr # " \t$d, $a, $b, $c;", []>;
  // start: register, len: immediate.
  def rri
    : NVPTXInst<(outs RC:$d),
                (ins RC:$a, Int32Regs:$b, i32imm:$c),
                "bfe." # TyStr # " \t$d, $a, $b, $c;", []>;
  // start: immediate, len: immediate.
  def rii
    : NVPTXInst<(outs RC:$d),
                (ins RC:$a, i32imm:$b, i32imm:$c),
                "bfe." # TyStr # " \t$d, $a, $b, $c;", []>;
}

// Signed and unsigned extracts for 32- and 64-bit sources.
let hasSideEffects = false in {
  defm BFE_S32 : BFE<"s32", Int32Regs>;
  defm BFE_U32 : BFE<"u32", Int32Regs>;
  defm BFE_S64 : BFE<"s64", Int64Regs>;
  defm BFE_U64 : BFE<"u64", Int64Regs>;
}
1437
1438//-----------------------------------
1439// Comparison instructions (setp, set)
1440//-----------------------------------
1441
1442// FIXME: This doesn't cover versions of set and setp that combine with a
1443// boolean predicate, e.g. setp.eq.and.b16.
1444
// setp: compare $a with $b and write the outcome to the predicate register
// $dst.  The comparison operator and the optional ftz flag are both printed
// from the CmpMode operand $cmp (via its "base" and "ftz" modifiers).
//   TypeStr - PTX type suffix of the compared operands.
//   RC      - register class of a register operand.
//   ImmCls  - operand class of an immediate operand.
// None of these carry selection patterns.
let hasSideEffects = false in {
  multiclass SETP<string TypeStr, RegisterClass RC, Operand ImmCls> {
    // register, register
    def rr : NVPTXInst<(outs Int1Regs:$dst),
                       (ins RC:$a, RC:$b, CmpMode:$cmp),
                       "setp${cmp:base}${cmp:ftz}." # TypeStr
                         # " \t$dst, $a, $b;",
                       []>;
    // register, immediate
    def ri : NVPTXInst<(outs Int1Regs:$dst),
                       (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
                       "setp${cmp:base}${cmp:ftz}." # TypeStr
                         # " \t$dst, $a, $b;",
                       []>;
    // immediate, register
    def ir : NVPTXInst<(outs Int1Regs:$dst),
                       (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
                       "setp${cmp:base}${cmp:ftz}." # TypeStr
                         # " \t$dst, $a, $b;",
                       []>;
  }
}

// 16-bit integer operands.
defm SETP_b16 : SETP<"b16", Int16Regs, i16imm>;
defm SETP_s16 : SETP<"s16", Int16Regs, i16imm>;
defm SETP_u16 : SETP<"u16", Int16Regs, i16imm>;
// 32-bit integer operands.
defm SETP_b32 : SETP<"b32", Int32Regs, i32imm>;
defm SETP_s32 : SETP<"s32", Int32Regs, i32imm>;
defm SETP_u32 : SETP<"u32", Int32Regs, i32imm>;
// 64-bit integer operands.
defm SETP_b64 : SETP<"b64", Int64Regs, i64imm>;
defm SETP_s64 : SETP<"s64", Int64Regs, i64imm>;
defm SETP_u64 : SETP<"u64", Int64Regs, i64imm>;
// Floating-point operands.
defm SETP_f32 : SETP<"f32", Float32Regs, f32imm>;
defm SETP_f64 : SETP<"f64", Float64Regs, f64imm>;
// Half-precision setp.  Only register/register forms are defined, and both
// are predicated on useFP16Math.  The f16x2 form writes two predicates, $p
// and $q, printed as "$p|$q".
let Predicates = [useFP16Math] in {
  def SETP_f16rr
    : NVPTXInst<(outs Int1Regs:$dst),
                (ins Float16Regs:$a, Float16Regs:$b, CmpMode:$cmp),
                "setp${cmp:base}${cmp:ftz}.f16 \t$dst, $a, $b;",
                []>;

  def SETP_f16x2rr
    : NVPTXInst<(outs Int1Regs:$p, Int1Regs:$q),
                (ins Float16x2Regs:$a, Float16x2Regs:$b, CmpMode:$cmp),
                "setp${cmp:base}${cmp:ftz}.f16x2 \t$p|$q, $a, $b;",
                []>;
}
1485
1486
1487// FIXME: This doesn't appear to be correct.  The "set" mnemonic has the form
1488// "set.CmpOp{.ftz}.dtype.stype", where dtype is the type of the destination
1489// reg, either u32, s32, or f32.  Anyway these aren't used at the moment.
1490
// set: compare $a with $b and write the outcome to a 32-bit register.  The
// comparison is printed directly from the CmpMode operand $cmp.
//   TypeStr - PTX type suffix of the compared operands.
//   RC      - register class of a register operand.
//   ImmCls  - operand class of an immediate operand.
// None of these carry selection patterns.
let hasSideEffects = false in {
  multiclass SET<string TypeStr, RegisterClass RC, Operand ImmCls> {
    // register, register
    def rr
      : NVPTXInst<(outs Int32Regs:$dst), (ins RC:$a, RC:$b, CmpMode:$cmp),
                  "set$cmp." # TypeStr # " \t$dst, $a, $b;", []>;
    // register, immediate
    def ri
      : NVPTXInst<(outs Int32Regs:$dst), (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
                  "set$cmp." # TypeStr # " \t$dst, $a, $b;", []>;
    // immediate, register
    def ir
      : NVPTXInst<(outs Int32Regs:$dst), (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
                  "set$cmp." # TypeStr # " \t$dst, $a, $b;", []>;
  }
}

// 16-bit integer operands.
defm SET_b16 : SET<"b16", Int16Regs, i16imm>;
defm SET_s16 : SET<"s16", Int16Regs, i16imm>;
defm SET_u16 : SET<"u16", Int16Regs, i16imm>;
// 32-bit integer operands.
defm SET_b32 : SET<"b32", Int32Regs, i32imm>;
defm SET_s32 : SET<"s32", Int32Regs, i32imm>;
defm SET_u32 : SET<"u32", Int32Regs, i32imm>;
// 64-bit integer operands.
defm SET_b64 : SET<"b64", Int64Regs, i64imm>;
defm SET_s64 : SET<"s64", Int64Regs, i64imm>;
defm SET_u64 : SET<"u64", Int64Regs, i64imm>;
// Floating-point operands.
defm SET_f16 : SET<"f16", Float16Regs, f16imm>;
defm SET_f32 : SET<"f32", Float32Regs, f32imm>;
defm SET_f64 : SET<"f64", Float64Regs, f64imm>;
1517
1518//-----------------------------------
1519// Selection instructions (selp)
1520//-----------------------------------
1521
1522// FIXME: Missing slct
1523
// selp is printed as "selp.<type> $dst, $a, $b, $p".  Two families are
// defined with the same four operand forms (rr, ri, ir, ii):
//   SELP         - no selection patterns; these are referenced explicitly
//                  from within this file.
//   SELP_PATTERN - selected from (select $p, $a, $b); ImmNode is the node
//                  that matches an immediate operand (imm or fpimm).
let hasSideEffects = false in {
  multiclass SELP<string TypeStr, RegisterClass RC, Operand ImmCls> {
    def rr
      : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, Int1Regs:$p),
                  "selp." # TypeStr # " \t$dst, $a, $b, $p;", []>;
    def ri
      : NVPTXInst<(outs RC:$dst), (ins RC:$a, ImmCls:$b, Int1Regs:$p),
                  "selp." # TypeStr # " \t$dst, $a, $b, $p;", []>;
    def ir
      : NVPTXInst<(outs RC:$dst), (ins ImmCls:$a, RC:$b, Int1Regs:$p),
                  "selp." # TypeStr # " \t$dst, $a, $b, $p;", []>;
    def ii
      : NVPTXInst<(outs RC:$dst), (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),
                  "selp." # TypeStr # " \t$dst, $a, $b, $p;", []>;
  }

  multiclass SELP_PATTERN<string TypeStr, RegisterClass RC, Operand ImmCls,
                          SDNode ImmNode> {
    def rr
      : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, Int1Regs:$p),
                  "selp." # TypeStr # " \t$dst, $a, $b, $p;",
                  [(set RC:$dst, (select Int1Regs:$p, RC:$a, RC:$b))]>;
    def ri
      : NVPTXInst<(outs RC:$dst), (ins RC:$a, ImmCls:$b, Int1Regs:$p),
                  "selp." # TypeStr # " \t$dst, $a, $b, $p;",
                  [(set RC:$dst, (select Int1Regs:$p, RC:$a, ImmNode:$b))]>;
    def ir
      : NVPTXInst<(outs RC:$dst), (ins ImmCls:$a, RC:$b, Int1Regs:$p),
                  "selp." # TypeStr # " \t$dst, $a, $b, $p;",
                  [(set RC:$dst, (select Int1Regs:$p, ImmNode:$a, RC:$b))]>;
    def ii
      : NVPTXInst<(outs RC:$dst), (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),
                  "selp." # TypeStr # " \t$dst, $a, $b, $p;",
                  [(set RC:$dst,
                        (select Int1Regs:$p, ImmNode:$a, ImmNode:$b))]>;
  }
}

// Don't pattern match on selp.{s,u}{16,32,64} -- selp.b{16,32,64} is just as
// good.  Only the .b integer variants therefore get selection patterns.
// 16-bit integers.
defm SELP_b16 : SELP_PATTERN<"b16", Int16Regs, i16imm, imm>;
defm SELP_s16 : SELP<"s16", Int16Regs, i16imm>;
defm SELP_u16 : SELP<"u16", Int16Regs, i16imm>;
// 32-bit integers.
defm SELP_b32 : SELP_PATTERN<"b32", Int32Regs, i32imm, imm>;
defm SELP_s32 : SELP<"s32", Int32Regs, i32imm>;
defm SELP_u32 : SELP<"u32", Int32Regs, i32imm>;
// 64-bit integers.
defm SELP_b64 : SELP_PATTERN<"b64", Int64Regs, i64imm, imm>;
defm SELP_s64 : SELP<"s64", Int64Regs, i64imm>;
defm SELP_u64 : SELP<"u64", Int64Regs, i64imm>;
// Floating point.  The f16 variant is emitted with the b16 type suffix.
defm SELP_f16 : SELP_PATTERN<"b16", Float16Regs, f16imm, fpimm>;
defm SELP_f32 : SELP_PATTERN<"f32", Float32Regs, f32imm, fpimm>;
defm SELP_f64 : SELP_PATTERN<"f64", Float64Regs, f64imm, fpimm>;

// Select between two f16x2 registers; emitted as a 32-bit selp.
def SELP_f16x2rr
  : NVPTXInst<(outs Float16x2Regs:$dst),
              (ins Float16x2Regs:$a, Float16x2Regs:$b, Int1Regs:$p),
              "selp.b32 \t$dst, $a, $b, $p;",
              [(set Float16x2Regs:$dst,
                    (select Int1Regs:$p, Float16x2Regs:$a,
                            Float16x2Regs:$b))]>;
1588
1589//-----------------------------------
1590// Data Movement (Load / Store, Move)
1591//-----------------------------------
1592
// Address-matching complex patterns.  Each names the C++ selection routine
// that performs the match; the ri forms yield two operands and are rooted at
// frameindex, the direct-address form yields one.
def ADDRri
  : ComplexPattern<i32, 2, "SelectADDRri", [frameindex], [SDNPWantRoot]>;
def ADDRri64
  : ComplexPattern<i64, 2, "SelectADDRri64", [frameindex], [SDNPWantRoot]>;
def ADDRvar : ComplexPattern<iPTR, 1, "SelectDirectAddr">;

// Register + immediate memory operands, 32- and 64-bit.
let PrintMethod = "printMemOperand" in {
  def MEMri   : Operand<i32> { let MIOperandInfo = (ops Int32Regs, i32imm); }
  def MEMri64 : Operand<i64> { let MIOperandInfo = (ops Int64Regs, i64imm); }
}

// Pointer-typed operands printed with the generic operand printer.
let PrintMethod = "printOperand" in {
  def imem    : Operand<iPTR>;
  def imemAny : Operand<iPTRAny>;
}

// i32 operands with dedicated printers.
let PrintMethod = "printLdStCode" in
def LdStCode : Operand<i32>;

let PrintMethod = "printMmaCode" in
def MmaCode : Operand<i32>;

// NVPTXISD::Wrapper: one pointer-typed result with the same type as its
// single operand.
def SDTWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
def Wrapper    : SDNode<"NVPTXISD::Wrapper", SDTWrapper>;

// Load a memory address into a u32 or u64 register.
def MOV_ADDR
  : NVPTXInst<(outs Int32Regs:$dst), (ins imem:$a),
              "mov.u32 \t$dst, $a;",
              [(set Int32Regs:$dst, (Wrapper tglobaladdr:$a))]>;
def MOV_ADDR64
  : NVPTXInst<(outs Int64Regs:$dst), (ins imem:$a),
              "mov.u64 \t$dst, $a;",
              [(set Int64Regs:$dst, (Wrapper tglobaladdr:$a))]>;
1634
// Get pointer to local stack.  $num is spliced directly after the
// "__local_depot" symbol prefix.
let hasSideEffects = false in {
  def MOV_DEPOT_ADDR
    : NVPTXInst<(outs Int32Regs:$d), (ins i32imm:$num),
                "mov.u32 \t$d, __local_depot$num;", []>;
  def MOV_DEPOT_ADDR_64
    : NVPTXInst<(outs Int64Regs:$d), (ins i32imm:$num),
                "mov.u64 \t$d, __local_depot$num;", []>;
}


// Register-to-register moves, flagged as simple moves.  They have no
// selection patterns: copyPhysreg is hard-coded in NVPTXInstrInfo.cpp.
let IsSimpleMove = true, hasSideEffects = false in {
  // Predicate and integer registers.
  def IMOV1rr
    : NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$sss),
                "mov.pred \t$dst, $sss;", []>;
  def IMOV16rr
    : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$sss),
                "mov.u16 \t$dst, $sss;", []>;
  def IMOV32rr
    : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$sss),
                "mov.u32 \t$dst, $sss;", []>;
  def IMOV64rr
    : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$sss),
                "mov.u64 \t$dst, $sss;", []>;

  // Floating-point registers.
  def FMOV16rr
    : NVPTXInst<(outs Float16Regs:$dst), (ins Float16Regs:$src),
                // We have to use .b16 here as there's no mov.f16.
                "mov.b16 \t$dst, $src;", []>;
  def FMOV32rr
    : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
                "mov.f32 \t$dst, $src;", []>;
  def FMOV64rr
    : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$src),
                "mov.f64 \t$dst, $src;", []>;
}

// Integer immediate moves; each is selected from a plain immediate of the
// destination's type.
def IMOV1ri
  : NVPTXInst<(outs Int1Regs:$dst), (ins i1imm:$src),
              "mov.pred \t$dst, $src;",
              [(set Int1Regs:$dst, imm:$src)]>;
def IMOV16ri
  : NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src),
              "mov.u16 \t$dst, $src;",
              [(set Int16Regs:$dst, imm:$src)]>;
def IMOV32ri
  : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src),
              "mov.u32 \t$dst, $src;",
              [(set Int32Regs:$dst, imm:$src)]>;
def IMOV64i
  : NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src),
              "mov.u64 \t$dst, $src;",
              [(set Int64Regs:$dst, imm:$src)]>;

// Floating-point immediate moves.
def FMOV32ri
  : NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$src),
              "mov.f32 \t$dst, $src;",
              [(set Float32Regs:$dst, fpimm:$src)]>;
def FMOV64ri
  : NVPTXInst<(outs Float64Regs:$dst), (ins f64imm:$src),
              "mov.f64 \t$dst, $src;",
              [(set Float64Regs:$dst, fpimm:$src)]>;
1683
// Materialize the address of a wrapped external symbol with an immediate
// move.  Both pointer widths are covered, so a 64-bit wrapped symbol is
// selectable as well as a 32-bit one.
def : Pat<(i32 (Wrapper texternalsym:$dst)), (IMOV32ri texternalsym:$dst)>;
def : Pat<(i64 (Wrapper texternalsym:$dst)), (IMOV64i texternalsym:$dst)>;
1685
//---- Copy Frame Index ----
// Selected from an address matched by ADDRri / ADDRri64.  The memory operand
// is printed with the "add" modifier.
// NOTE(review): presumably the modifier makes printMemOperand emit base and
// offset as the two add sources -- confirm in the printer.
def LEA_ADDRi
  : NVPTXInst<(outs Int32Regs:$dst), (ins MEMri:$addr),
              "add.u32 \t$dst, ${addr:add};",
              [(set Int32Regs:$dst, ADDRri:$addr)]>;
def LEA_ADDRi64
  : NVPTXInst<(outs Int64Regs:$dst), (ins MEMri64:$addr),
              "add.u64 \t$dst, ${addr:add};",
              [(set Int64Regs:$dst, ADDRri64:$addr)]>;
1693
1694//-----------------------------------
1695// Comparison and Selection
1696//-----------------------------------
1697
// Helper for ISET_FORMAT: emits the three operand-form patterns (reg/reg,
// reg/imm, imm/reg) that select comparison OpNode on operands of register
// class RC, producing a value of type ResTy, onto the rr/ri/ir instructions
// with comparison mode Mode.
multiclass ISET_FORMAT_TYPED<PatFrag OpNode, PatLeaf Mode, ValueType ResTy,
                             RegisterClass RC, Instruction rr,
                             Instruction ri, Instruction ir> {
  def : Pat<(ResTy (OpNode RC:$a, RC:$b)), (rr RC:$a, RC:$b, Mode)>;
  def : Pat<(ResTy (OpNode RC:$a, imm:$b)), (ri RC:$a, imm:$b, Mode)>;
  def : Pat<(ResTy (OpNode imm:$a, RC:$b)), (ir imm:$a, RC:$b, Mode)>;
}

// Selection patterns for one integer comparison node.  For each operand
// width (16/32/64 bits) the comparison is mapped to a setp instruction when
// the result is an i1 predicate and to a set instruction when the result is
// an i32.  Mode is the CmpMode value handed to the instruction.
multiclass ISET_FORMAT<PatFrag OpNode, PatLeaf Mode,
                       Instruction setp_16rr,
                       Instruction setp_16ri,
                       Instruction setp_16ir,
                       Instruction setp_32rr,
                       Instruction setp_32ri,
                       Instruction setp_32ir,
                       Instruction setp_64rr,
                       Instruction setp_64ri,
                       Instruction setp_64ir,
                       Instruction set_16rr,
                       Instruction set_16ri,
                       Instruction set_16ir,
                       Instruction set_32rr,
                       Instruction set_32ri,
                       Instruction set_32ir,
                       Instruction set_64rr,
                       Instruction set_64ri,
                       Instruction set_64ir> {
  // i16 -> pred
  defm : ISET_FORMAT_TYPED<OpNode, Mode, i1, Int16Regs,
                           setp_16rr, setp_16ri, setp_16ir>;
  // i32 -> pred
  defm : ISET_FORMAT_TYPED<OpNode, Mode, i1, Int32Regs,
                           setp_32rr, setp_32ri, setp_32ir>;
  // i64 -> pred
  defm : ISET_FORMAT_TYPED<OpNode, Mode, i1, Int64Regs,
                           setp_64rr, setp_64ri, setp_64ir>;

  // i16 -> i32
  defm : ISET_FORMAT_TYPED<OpNode, Mode, i32, Int16Regs,
                           set_16rr, set_16ri, set_16ir>;
  // i32 -> i32
  defm : ISET_FORMAT_TYPED<OpNode, Mode, i32, Int32Regs,
                           set_32rr, set_32ri, set_32ir>;
  // i64 -> i32
  defm : ISET_FORMAT_TYPED<OpNode, Mode, i32, Int64Regs,
                           set_64rr, set_64ri, set_64ir>;
}

// ISET_FORMAT bound to the signed (.s16/.s32/.s64) setp and set instructions.
multiclass ISET_FORMAT_SIGNED<PatFrag OpNode, PatLeaf Mode>
  : ISET_FORMAT<OpNode, Mode,
                SETP_s16rr, SETP_s16ri, SETP_s16ir,
                SETP_s32rr, SETP_s32ri, SETP_s32ir,
                SETP_s64rr, SETP_s64ri, SETP_s64ir,
                SET_s16rr,  SET_s16ri,  SET_s16ir,
                SET_s32rr,  SET_s32ri,  SET_s32ir,
                SET_s64rr,  SET_s64ri,  SET_s64ir> {
  // TableGen doesn't like empty multiclasses.
  def : PatLeaf<(i32 0)>;
}

// ISET_FORMAT bound to the unsigned (.u16/.u32/.u64) setp and set
// instructions.
multiclass ISET_FORMAT_UNSIGNED<PatFrag OpNode, PatLeaf Mode>
  : ISET_FORMAT<OpNode, Mode,
                SETP_u16rr, SETP_u16ri, SETP_u16ir,
                SETP_u32rr, SETP_u32ri, SETP_u32ir,
                SETP_u64rr, SETP_u64ri, SETP_u64ir,
                SET_u16rr,  SET_u16ri,  SET_u16ir,
                SET_u32rr,  SET_u32ri,  SET_u32ir,
                SET_u64rr,  SET_u64ri,  SET_u64ir> {
  // TableGen doesn't like empty multiclasses.
  def : PatLeaf<(i32 0)>;
}

// Signed comparisons.
defm : ISET_FORMAT_SIGNED<setgt, CmpGT>;
defm : ISET_FORMAT_SIGNED<setlt, CmpLT>;
defm : ISET_FORMAT_SIGNED<setge, CmpGE>;
defm : ISET_FORMAT_SIGNED<setle, CmpLE>;
defm : ISET_FORMAT_SIGNED<seteq, CmpEQ>;
defm : ISET_FORMAT_SIGNED<setne, CmpNE>;
// Unsigned comparisons.
defm : ISET_FORMAT_UNSIGNED<setugt, CmpGT>;
defm : ISET_FORMAT_UNSIGNED<setult, CmpLT>;
defm : ISET_FORMAT_UNSIGNED<setuge, CmpGE>;
defm : ISET_FORMAT_UNSIGNED<setule, CmpLE>;
defm : ISET_FORMAT_UNSIGNED<setueq, CmpEQ>;
defm : ISET_FORMAT_UNSIGNED<setune, CmpNE>;
1798
// i1 compares
// Two predicates differ exactly when their xor is true, so (in)equality on
// i1 is lowered to an xor, negated for the equality forms.
def : Pat<(setne Int1Regs:$a, Int1Regs:$b),
          (XORb1rr Int1Regs:$a, Int1Regs:$b)>;
def : Pat<(setune Int1Regs:$a, Int1Regs:$b),
          (XORb1rr Int1Regs:$a, Int1Regs:$b)>;

def : Pat<(seteq Int1Regs:$a, Int1Regs:$b),
          (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
def : Pat<(setueq Int1Regs:$a, Int1Regs:$b),
          (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>;

// i1 compare -> i32
// The xor of the operands drives a selp between -1 and 0.  For setne the
// result is -1 when the operands differ; for seteq the two immediates are
// swapped so the result is -1 when they are equal.
def : Pat<(i32 (setne Int1Regs:$a, Int1Regs:$b)),
          (SELP_u32ii -1, 0, (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
def : Pat<(i32 (seteq Int1Regs:$a, Int1Regs:$b)),
          (SELP_u32ii 0, -1, (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
1815
1816
1817
// Selection patterns for one floating-point comparison node.
//   OpNode  - the comparison node being matched.
//   Mode    - CmpMode value used by default.
//   ModeFTZ - CmpMode value used when doF32FTZ holds (f16 and f32 only; the
//             f64 patterns always use Mode).
// A comparison producing an i1 is mapped to setp, one producing an i32 to
// set.  All f16 patterns additionally require useFP16Math.  f16 immediates
// are first materialized into a register with LOAD_CONST_F16, so every f16
// pattern uses the register/register instruction form.
multiclass FSET_FORMAT<PatFrag OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {
  // f16 -> pred
  def : Pat<(i1 (OpNode Float16Regs:$a, Float16Regs:$b)),
            (SETP_f16rr Float16Regs:$a, Float16Regs:$b, ModeFTZ)>,
        Requires<[useFP16Math,doF32FTZ]>;
  def : Pat<(i1 (OpNode Float16Regs:$a, Float16Regs:$b)),
            (SETP_f16rr Float16Regs:$a, Float16Regs:$b, Mode)>,
        Requires<[useFP16Math]>;
  def : Pat<(i1 (OpNode Float16Regs:$a, fpimm:$b)),
            (SETP_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>,
        Requires<[useFP16Math,doF32FTZ]>;
  def : Pat<(i1 (OpNode Float16Regs:$a, fpimm:$b)),
            (SETP_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>,
        Requires<[useFP16Math]>;
  def : Pat<(i1 (OpNode fpimm:$a, Float16Regs:$b)),
            (SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, ModeFTZ)>,
        Requires<[useFP16Math,doF32FTZ]>;
  def : Pat<(i1 (OpNode fpimm:$a, Float16Regs:$b)),
            (SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, Mode)>,
        Requires<[useFP16Math]>;

  // f32 -> pred
  def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)),
            (SETP_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>,
        Requires<[doF32FTZ]>;
  def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)),
            (SETP_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>;
  def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)),
            (SETP_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>,
        Requires<[doF32FTZ]>;
  def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)),
            (SETP_f32ri Float32Regs:$a, fpimm:$b, Mode)>;
  def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)),
            (SETP_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>,
        Requires<[doF32FTZ]>;
  def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)),
            (SETP_f32ir fpimm:$a, Float32Regs:$b, Mode)>;

  // f64 -> pred
  def : Pat<(i1 (OpNode Float64Regs:$a, Float64Regs:$b)),
            (SETP_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>;
  def : Pat<(i1 (OpNode Float64Regs:$a, fpimm:$b)),
            (SETP_f64ri Float64Regs:$a, fpimm:$b, Mode)>;
  def : Pat<(i1 (OpNode fpimm:$a, Float64Regs:$b)),
            (SETP_f64ir fpimm:$a, Float64Regs:$b, Mode)>;

  // f16 -> i32
  def : Pat<(i32 (OpNode Float16Regs:$a, Float16Regs:$b)),
            (SET_f16rr Float16Regs:$a, Float16Regs:$b, ModeFTZ)>,
        Requires<[useFP16Math, doF32FTZ]>;
  def : Pat<(i32 (OpNode Float16Regs:$a, Float16Regs:$b)),
            (SET_f16rr Float16Regs:$a, Float16Regs:$b, Mode)>,
        Requires<[useFP16Math]>;
  def : Pat<(i32 (OpNode Float16Regs:$a, fpimm:$b)),
            (SET_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>,
        Requires<[useFP16Math, doF32FTZ]>;
  def : Pat<(i32 (OpNode Float16Regs:$a, fpimm:$b)),
            (SET_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>,
        Requires<[useFP16Math]>;
  // The left-hand immediate has been loaded into a register, so the rr form
  // is the one whose operand kinds match (the ir form expects an immediate
  // as its first source).
  def : Pat<(i32 (OpNode fpimm:$a, Float16Regs:$b)),
            (SET_f16rr (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, ModeFTZ)>,
        Requires<[useFP16Math, doF32FTZ]>;
  def : Pat<(i32 (OpNode fpimm:$a, Float16Regs:$b)),
            (SET_f16rr (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, Mode)>,
        Requires<[useFP16Math]>;

  // f32 -> i32
  def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)),
            (SET_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>,
        Requires<[doF32FTZ]>;
  def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)),
            (SET_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>;
  def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)),
            (SET_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>,
        Requires<[doF32FTZ]>;
  def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)),
            (SET_f32ri Float32Regs:$a, fpimm:$b, Mode)>;
  def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)),
            (SET_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>,
        Requires<[doF32FTZ]>;
  def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)),
            (SET_f32ir fpimm:$a, Float32Regs:$b, Mode)>;

  // f64 -> i32
  def : Pat<(i32 (OpNode Float64Regs:$a, Float64Regs:$b)),
            (SET_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>;
  def : Pat<(i32 (OpNode Float64Regs:$a, fpimm:$b)),
            (SET_f64ri Float64Regs:$a, fpimm:$b, Mode)>;
  def : Pat<(i32 (OpNode fpimm:$a, Float64Regs:$b)),
            (SET_f64ir fpimm:$a, Float64Regs:$b, Mode)>;
}

// Ordered comparisons.
defm FSetOGT : FSET_FORMAT<setogt, CmpGT, CmpGT_FTZ>;
defm FSetOLT : FSET_FORMAT<setolt, CmpLT, CmpLT_FTZ>;
defm FSetOGE : FSET_FORMAT<setoge, CmpGE, CmpGE_FTZ>;
defm FSetOLE : FSET_FORMAT<setole, CmpLE, CmpLE_FTZ>;
defm FSetOEQ : FSET_FORMAT<setoeq, CmpEQ, CmpEQ_FTZ>;
defm FSetONE : FSET_FORMAT<setone, CmpNE, CmpNE_FTZ>;

// Unordered comparisons; these use the *U comparison modes.
defm FSetUGT : FSET_FORMAT<setugt, CmpGTU, CmpGTU_FTZ>;
defm FSetULT : FSET_FORMAT<setult, CmpLTU, CmpLTU_FTZ>;
defm FSetUGE : FSET_FORMAT<setuge, CmpGEU, CmpGEU_FTZ>;
defm FSetULE : FSET_FORMAT<setule, CmpLEU, CmpLEU_FTZ>;
defm FSetUEQ : FSET_FORMAT<setueq, CmpEQU, CmpEQU_FTZ>;
defm FSetUNE : FSET_FORMAT<setune, CmpNEU, CmpNEU_FTZ>;

// Comparisons without an ordered/unordered qualifier use the same modes as
// the ordered ones.
defm FSetGT : FSET_FORMAT<setgt, CmpGT, CmpGT_FTZ>;
defm FSetLT : FSET_FORMAT<setlt, CmpLT, CmpLT_FTZ>;
defm FSetGE : FSET_FORMAT<setge, CmpGE, CmpGE_FTZ>;
defm FSetLE : FSET_FORMAT<setle, CmpLE, CmpLE_FTZ>;
defm FSetEQ : FSET_FORMAT<seteq, CmpEQ, CmpEQ_FTZ>;
defm FSetNE : FSET_FORMAT<setne, CmpNE, CmpNE_FTZ>;

// seto / setuo map to the NUM / NAN comparison modes.
defm FSetNUM : FSET_FORMAT<seto, CmpNUM, CmpNUM_FTZ>;
defm FSetNAN : FSET_FORMAT<setuo, CmpNAN, CmpNAN_FTZ>;
1933
1934// FIXME: What is this doing here?  Can it be deleted?
1935// def ld_param         : SDNode<"NVPTXISD::LOAD_PARAM", SDTLoad,
1936//                         [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
1937
// Type profiles of the NVPTXISD nodes declared below.  SDTypeProfile takes
// <number of results, number of operands, type constraints>.
def SDTDeclareParamProfile
  : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>;
def SDTDeclareScalarParamProfile
  : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>;
def SDTLoadParamProfile
  : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>;
def SDTLoadParamV2Profile
  : SDTypeProfile<2, 2, [SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisInt<3>]>;
def SDTLoadParamV4Profile
  : SDTypeProfile<4, 2, [SDTCisInt<4>, SDTCisInt<5>]>;
def SDTPrintCallProfile
  : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
def SDTPrintCallUniProfile
  : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
def SDTStoreParamProfile
  : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;
def SDTStoreParamV2Profile
  : SDTypeProfile<0, 4, [SDTCisInt<0>, SDTCisInt<1>]>;
def SDTStoreParamV4Profile
  : SDTypeProfile<0, 6, [SDTCisInt<0>, SDTCisInt<1>]>;
def SDTStoreParam32Profile
  : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;
def SDTCallArgProfile
  : SDTypeProfile<0, 2, [SDTCisInt<0>]>;
def SDTCallArgMarkProfile
  : SDTypeProfile<0, 0, []>;
def SDTCallVoidProfile
  : SDTypeProfile<0, 1, []>;
def SDTCallValProfile
  : SDTypeProfile<1, 0, []>;
def SDTMoveParamProfile
  : SDTypeProfile<1, 1, []>;
def SDTStoreRetvalProfile
  : SDTypeProfile<0, 2, [SDTCisInt<0>]>;
def SDTStoreRetvalV2Profile
  : SDTypeProfile<0, 3, [SDTCisInt<0>]>;
def SDTStoreRetvalV4Profile
  : SDTypeProfile<0, 5, [SDTCisInt<0>]>;
def SDTPseudoUseParamProfile
  : SDTypeProfile<0, 1, []>;
def SDTProxyRegProfile
  : SDTypeProfile<1, 1, []>;

// The nodes themselves, grouped by the SDNode property list they share.

// Chained, glued on both sides, and side-effecting.
let Properties = [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect] in {
  def DeclareParam
    : SDNode<"NVPTXISD::DeclareParam", SDTDeclareParamProfile>;
  def DeclareScalarParam
    : SDNode<"NVPTXISD::DeclareScalarParam", SDTDeclareScalarParamProfile>;
  def DeclareRetParam
    : SDNode<"NVPTXISD::DeclareRetParam", SDTDeclareParamProfile>;
  def DeclareRet
    : SDNode<"NVPTXISD::DeclareRet", SDTDeclareScalarParamProfile>;
}

// Chained, glued loads.
let Properties = [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue] in {
  def LoadParam
    : SDNode<"NVPTXISD::LoadParam", SDTLoadParamProfile>;
  def LoadParamV2
    : SDNode<"NVPTXISD::LoadParamV2", SDTLoadParamV2Profile>;
  def LoadParamV4
    : SDNode<"NVPTXISD::LoadParamV4", SDTLoadParamV4Profile>;
}

// Chained, glued on both sides, and side-effecting.
let Properties = [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect] in {
  def PrintCall
    : SDNode<"NVPTXISD::PrintCall", SDTPrintCallProfile>;
  def PrintConvergentCall
    : SDNode<"NVPTXISD::PrintConvergentCall", SDTPrintCallProfile>;
  def PrintCallUni
    : SDNode<"NVPTXISD::PrintCallUni", SDTPrintCallUniProfile>;
  def PrintConvergentCallUni
    : SDNode<"NVPTXISD::PrintConvergentCallUni", SDTPrintCallUniProfile>;
  def StoreParam
    : SDNode<"NVPTXISD::StoreParam", SDTStoreParamProfile>;
  def StoreParamV2
    : SDNode<"NVPTXISD::StoreParamV2", SDTStoreParamV2Profile>;
  def StoreParamV4
    : SDNode<"NVPTXISD::StoreParamV4", SDTStoreParamV4Profile>;
  def StoreParamU32
    : SDNode<"NVPTXISD::StoreParamU32", SDTStoreParam32Profile>;
  def StoreParamS32
    : SDNode<"NVPTXISD::StoreParamS32", SDTStoreParam32Profile>;
  def CallArgBegin
    : SDNode<"NVPTXISD::CallArgBegin", SDTCallArgMarkProfile>;
  def CallArg
    : SDNode<"NVPTXISD::CallArg", SDTCallArgProfile>;
  def LastCallArg
    : SDNode<"NVPTXISD::LastCallArg", SDTCallArgProfile>;
  def CallArgEnd
    : SDNode<"NVPTXISD::CallArgEnd", SDTCallVoidProfile>;
  def CallVoid
    : SDNode<"NVPTXISD::CallVoid", SDTCallVoidProfile>;
  def Prototype
    : SDNode<"NVPTXISD::Prototype", SDTCallVoidProfile>;
  def CallVal
    : SDNode<"NVPTXISD::CallVal", SDTCallValProfile>;
}

// No properties at all.
def MoveParam
  : SDNode<"NVPTXISD::MoveParam", SDTMoveParamProfile>;

// Chained and side-effecting, but not glued.
let Properties = [SDNPHasChain, SDNPSideEffect] in {
  def StoreRetval
    : SDNode<"NVPTXISD::StoreRetval", SDTStoreRetvalProfile>;
  def StoreRetvalV2
    : SDNode<"NVPTXISD::StoreRetvalV2", SDTStoreRetvalV2Profile>;
  def StoreRetvalV4
    : SDNode<"NVPTXISD::StoreRetvalV4", SDTStoreRetvalV4Profile>;
}

let Properties = [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect] in
def PseudoUseParam
  : SDNode<"NVPTXISD::PseudoUseParam", SDTPseudoUseParamProfile>;

let Properties = [SDNPHasChain, SDNPSideEffect] in
def RETURNNode
  : SDNode<"NVPTXISD::RETURN", SDTCallArgMarkProfile>;

let Properties = [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect] in
def ProxyReg
  : SDNode<"NVPTXISD::ProxyReg", SDTProxyRegProfile>;
2051
// Instruction classes that read a value back from the retval0 parameter at
// byte offset $b.  opstr is the type suffix appended to the mnemonic.  They
// carry no selection pattern, so mayLoad is set explicitly.
let mayLoad = true in {
  // Scalar load.
  class LoadParamMemInst<NVPTXRegClass regclass, string opstr>
    : NVPTXInst<(outs regclass:$dst), (ins i32imm:$b),
                "ld.param" # opstr # " \t$dst, [retval0+$b];",
                []>;

  // Two-element vector load.
  class LoadParamV2MemInst<NVPTXRegClass regclass, string opstr>
    : NVPTXInst<(outs regclass:$dst, regclass:$dst2), (ins i32imm:$b),
                "ld.param.v2" # opstr
                  # " \t{{$dst, $dst2}}, [retval0+$b];",
                []>;

  // Four-element vector load.
  class LoadParamV4MemInst<NVPTXRegClass regclass, string opstr>
    : NVPTXInst<(outs regclass:$dst, regclass:$dst2, regclass:$dst3,
                      regclass:$dst4),
                (ins i32imm:$b),
                "ld.param.v4" # opstr
                  # " \t{{$dst, $dst2, $dst3, $dst4}}, [retval0+$b];",
                []>;
}

// Register form: a mov from "retval$b", selected from LoadParam whose first
// operand is the constant 0.
class LoadParamRegInst<NVPTXRegClass regclass, string opstr>
  : NVPTXInst<(outs regclass:$dst), (ins i32imm:$b),
              "mov" # opstr # " \t$dst, retval$b;",
              [(set regclass:$dst, (LoadParam (i32 0), (i32 imm:$b)))]>;
2076
// Instruction classes that write values into parameter space.  opstr is the
// type suffix appended to the mnemonic.  They carry no selection pattern, so
// mayStore is set explicitly.
//   StoreParam*  - store to "param$a" at byte offset $b (outgoing argument).
//   StoreRetval* - store to "func_retval0" at byte offset $a.
let mayStore = true in {
  // Scalar argument store.
  class StoreParamInst<NVPTXRegClass regclass, string opstr>
    : NVPTXInst<(outs), (ins regclass:$val, i32imm:$a, i32imm:$b),
                "st.param" # opstr # " \t[param$a+$b], $val;",
                []>;

  // Two-element vector argument store.
  class StoreParamV2Inst<NVPTXRegClass regclass, string opstr>
    : NVPTXInst<(outs),
                (ins regclass:$val, regclass:$val2, i32imm:$a, i32imm:$b),
                "st.param.v2" # opstr
                  # " \t[param$a+$b], {{$val, $val2}};",
                []>;

  // Four-element vector argument store.
  class StoreParamV4Inst<NVPTXRegClass regclass, string opstr>
    : NVPTXInst<(outs),
                (ins regclass:$val, regclass:$val2, regclass:$val3,
                     regclass:$val4, i32imm:$a, i32imm:$b),
                "st.param.v4" # opstr
                  # " \t[param$a+$b], {{$val, $val2, $val3, $val4}};",
                []>;

  // Scalar return-value store.
  class StoreRetvalInst<NVPTXRegClass regclass, string opstr>
    : NVPTXInst<(outs), (ins regclass:$val, i32imm:$a),
                "st.param" # opstr # " \t[func_retval0+$a], $val;",
                []>;

  // Two-element vector return-value store.
  class StoreRetvalV2Inst<NVPTXRegClass regclass, string opstr>
    : NVPTXInst<(outs), (ins regclass:$val, regclass:$val2, i32imm:$a),
                "st.param.v2" # opstr
                  # " \t[func_retval0+$a], {{$val, $val2}};",
                []>;

  // Four-element vector return-value store.
  class StoreRetvalV4Inst<NVPTXRegClass regclass, string opstr>
    : NVPTXInst<(outs),
                (ins regclass:$val, regclass:$val2, regclass:$val3,
                     regclass:$val4, i32imm:$a),
                "st.param.v4" # opstr
                  # " \t[func_retval0+$a], {{$val, $val2, $val3, $val4}};",
                []>;
}
2117
// Records that print the opening of a call.  Each one matches
// (OpNode (i32 N)) for N = 0..8 and prints the opcode followed by a
// parenthesised list of N retval symbols (nothing but a space for N = 0).
let isCall = 1 in {
  multiclass CALL<string OpcStr, SDNode OpNode> {
    // Comma-separated retval lists, each one extending the previous.
    defvar Ret1 = "retval0";
    defvar Ret2 = Ret1 # ", retval1";
    defvar Ret3 = Ret2 # ", retval2";
    defvar Ret4 = Ret3 # ", retval3";
    defvar Ret5 = Ret4 # ", retval4";
    defvar Ret6 = Ret5 # ", retval5";
    defvar Ret7 = Ret6 # ", retval6";
    defvar Ret8 = Ret7 # ", retval7";

    def PrintCallNoRetInst
      : NVPTXInst<(outs), (ins), OpcStr # " ", [(OpNode (i32 0))]>;
    def PrintCallRetInst1
      : NVPTXInst<(outs), (ins), OpcStr # " (" # Ret1 # "), ",
                  [(OpNode (i32 1))]>;
    def PrintCallRetInst2
      : NVPTXInst<(outs), (ins), OpcStr # " (" # Ret2 # "), ",
                  [(OpNode (i32 2))]>;
    def PrintCallRetInst3
      : NVPTXInst<(outs), (ins), OpcStr # " (" # Ret3 # "), ",
                  [(OpNode (i32 3))]>;
    def PrintCallRetInst4
      : NVPTXInst<(outs), (ins), OpcStr # " (" # Ret4 # "), ",
                  [(OpNode (i32 4))]>;
    def PrintCallRetInst5
      : NVPTXInst<(outs), (ins), OpcStr # " (" # Ret5 # "), ",
                  [(OpNode (i32 5))]>;
    def PrintCallRetInst6
      : NVPTXInst<(outs), (ins), OpcStr # " (" # Ret6 # "), ",
                  [(OpNode (i32 6))]>;
    def PrintCallRetInst7
      : NVPTXInst<(outs), (ins), OpcStr # " (" # Ret7 # "), ",
                  [(OpNode (i32 7))]>;
    def PrintCallRetInst8
      : NVPTXInst<(outs), (ins), OpcStr # " (" # Ret8 # "), ",
                  [(OpNode (i32 8))]>;
  }
}
2148
// Regular call openers: "call" and "call.uni".
defm Call    : CALL<"call",     PrintCall>;
defm CallUni : CALL<"call.uni", PrintCallUni>;

// Convergent call openers.  Same asm strings as the regular ones, but matched
// from the convergent SDNodes and emitted with the isConvergent bit set.
let isConvergent = 1 in {
  defm ConvergentCall    : CALL<"call",     PrintConvergentCall>;
  defm ConvergentCallUni : CALL<"call.uni", PrintConvergentCallUni>;
}
2158
// ld.param instantiations, grouped by vector width.  Note that i8 values live
// in Int16Regs, and f16 / f16x2 use the untyped .b16 / .b32 suffixes.

// Scalar.
def LoadParamMemI64     : LoadParamMemInst<Int64Regs,     ".b64">;
def LoadParamMemI32     : LoadParamMemInst<Int32Regs,     ".b32">;
def LoadParamMemI16     : LoadParamMemInst<Int16Regs,     ".b16">;
def LoadParamMemI8      : LoadParamMemInst<Int16Regs,     ".b8">;
def LoadParamMemF16     : LoadParamMemInst<Float16Regs,   ".b16">;
def LoadParamMemF16x2   : LoadParamMemInst<Float16x2Regs, ".b32">;
def LoadParamMemF32     : LoadParamMemInst<Float32Regs,   ".f32">;
def LoadParamMemF64     : LoadParamMemInst<Float64Regs,   ".f64">;

// Two-element vectors.
def LoadParamMemV2I64   : LoadParamV2MemInst<Int64Regs,     ".b64">;
def LoadParamMemV2I32   : LoadParamV2MemInst<Int32Regs,     ".b32">;
def LoadParamMemV2I16   : LoadParamV2MemInst<Int16Regs,     ".b16">;
def LoadParamMemV2I8    : LoadParamV2MemInst<Int16Regs,     ".b8">;
def LoadParamMemV2F16   : LoadParamV2MemInst<Float16Regs,   ".b16">;
def LoadParamMemV2F16x2 : LoadParamV2MemInst<Float16x2Regs, ".b32">;
def LoadParamMemV2F32   : LoadParamV2MemInst<Float32Regs,   ".f32">;
def LoadParamMemV2F64   : LoadParamV2MemInst<Float64Regs,   ".f64">;

// Four-element vectors (no 64-bit element forms are defined).
def LoadParamMemV4I32   : LoadParamV4MemInst<Int32Regs,     ".b32">;
def LoadParamMemV4I16   : LoadParamV4MemInst<Int16Regs,     ".b16">;
def LoadParamMemV4I8    : LoadParamV4MemInst<Int16Regs,     ".b8">;
def LoadParamMemV4F16   : LoadParamV4MemInst<Float16Regs,   ".b16">;
def LoadParamMemV4F16x2 : LoadParamV4MemInst<Float16x2Regs, ".b32">;
def LoadParamMemV4F32   : LoadParamV4MemInst<Float32Regs,   ".f32">;
2181
// st.param [param$a+$b] instantiations, grouped by vector width.  i8 values
// are held in Int16Regs; f16 / f16x2 use the .b16 / .b32 suffixes.

// Scalar.
def StoreParamI64     : StoreParamInst<Int64Regs,     ".b64">;
def StoreParamI32     : StoreParamInst<Int32Regs,     ".b32">;
def StoreParamI16     : StoreParamInst<Int16Regs,     ".b16">;
def StoreParamI8      : StoreParamInst<Int16Regs,     ".b8">;
def StoreParamF16     : StoreParamInst<Float16Regs,   ".b16">;
def StoreParamF16x2   : StoreParamInst<Float16x2Regs, ".b32">;
def StoreParamF32     : StoreParamInst<Float32Regs,   ".f32">;
def StoreParamF64     : StoreParamInst<Float64Regs,   ".f64">;

// Two-element vectors.
def StoreParamV2I64   : StoreParamV2Inst<Int64Regs,     ".b64">;
def StoreParamV2I32   : StoreParamV2Inst<Int32Regs,     ".b32">;
def StoreParamV2I16   : StoreParamV2Inst<Int16Regs,     ".b16">;
def StoreParamV2I8    : StoreParamV2Inst<Int16Regs,     ".b8">;
def StoreParamV2F16   : StoreParamV2Inst<Float16Regs,   ".b16">;
def StoreParamV2F16x2 : StoreParamV2Inst<Float16x2Regs, ".b32">;
def StoreParamV2F32   : StoreParamV2Inst<Float32Regs,   ".f32">;
def StoreParamV2F64   : StoreParamV2Inst<Float64Regs,   ".f64">;

// Four-element vectors (no 64-bit element forms are defined).
def StoreParamV4I32   : StoreParamV4Inst<Int32Regs,     ".b32">;
def StoreParamV4I16   : StoreParamV4Inst<Int16Regs,     ".b16">;
def StoreParamV4I8    : StoreParamV4Inst<Int16Regs,     ".b8">;
def StoreParamV4F16   : StoreParamV4Inst<Float16Regs,   ".b16">;
def StoreParamV4F16x2 : StoreParamV4Inst<Float16x2Regs, ".b32">;
def StoreParamV4F32   : StoreParamV4Inst<Float32Regs,   ".f32">;
2207
// st.param [func_retval0+$a] instantiations, grouped by vector width.  i8
// values are held in Int16Regs; f16 / f16x2 use the .b16 / .b32 suffixes.

// Scalar.
def StoreRetvalI64     : StoreRetvalInst<Int64Regs,     ".b64">;
def StoreRetvalI32     : StoreRetvalInst<Int32Regs,     ".b32">;
def StoreRetvalI16     : StoreRetvalInst<Int16Regs,     ".b16">;
def StoreRetvalI8      : StoreRetvalInst<Int16Regs,     ".b8">;
def StoreRetvalF64     : StoreRetvalInst<Float64Regs,   ".f64">;
def StoreRetvalF32     : StoreRetvalInst<Float32Regs,   ".f32">;
def StoreRetvalF16     : StoreRetvalInst<Float16Regs,   ".b16">;
def StoreRetvalF16x2   : StoreRetvalInst<Float16x2Regs, ".b32">;

// Two-element vectors.
def StoreRetvalV2I64   : StoreRetvalV2Inst<Int64Regs,     ".b64">;
def StoreRetvalV2I32   : StoreRetvalV2Inst<Int32Regs,     ".b32">;
def StoreRetvalV2I16   : StoreRetvalV2Inst<Int16Regs,     ".b16">;
def StoreRetvalV2I8    : StoreRetvalV2Inst<Int16Regs,     ".b8">;
def StoreRetvalV2F64   : StoreRetvalV2Inst<Float64Regs,   ".f64">;
def StoreRetvalV2F32   : StoreRetvalV2Inst<Float32Regs,   ".f32">;
def StoreRetvalV2F16   : StoreRetvalV2Inst<Float16Regs,   ".b16">;
def StoreRetvalV2F16x2 : StoreRetvalV2Inst<Float16x2Regs, ".b32">;

// Four-element vectors (no 64-bit element forms are defined).
def StoreRetvalV4I32   : StoreRetvalV4Inst<Int32Regs,     ".b32">;
def StoreRetvalV4I16   : StoreRetvalV4Inst<Int16Regs,     ".b16">;
def StoreRetvalV4I8    : StoreRetvalV4Inst<Int16Regs,     ".b8">;
def StoreRetvalV4F32   : StoreRetvalV4Inst<Float32Regs,   ".f32">;
def StoreRetvalV4F16   : StoreRetvalV4Inst<Float16Regs,   ".b16">;
def StoreRetvalV4F16x2 : StoreRetvalV4Inst<Float16x2Regs, ".b32">;
2231
// Punctuation of a printed call's argument list, plus the return instruction.
// None of these records takes an operand.

// Opening parenthesis of the argument list.
def CallArgBeginInst
  : NVPTXInst<(outs), (ins), "(", [(CallArgBegin)]>;

// Closing parenthesis: (CallArgEnd 1) also prints the terminating ';',
// (CallArgEnd 0) prints the parenthesis alone.
def CallArgEndInst1
  : NVPTXInst<(outs), (ins), ");", [(CallArgEnd (i32 1))]>;
def CallArgEndInst0
  : NVPTXInst<(outs), (ins), ")", [(CallArgEnd (i32 0))]>;

// "ret;" for the RETURN node.
def RETURNInst
  : NVPTXInst<(outs), (ins), "ret;", [(RETURNNode)]>;
2236
// A register call argument that is followed by further arguments: prints the
// register and a trailing ", ".  Matches (CallArg 0, reg).
class CallArgInst<NVPTXRegClass regclass>
  : NVPTXInst<(outs),
              (ins regclass:$a),
              "$a, ",
              [(CallArg (i32 0), regclass:$a)]>;

// The final register call argument: prints the register with no separator.
// Matches (LastCallArg 0, reg).
class LastCallArgInst<NVPTXRegClass regclass>
  : NVPTXInst<(outs),
              (ins regclass:$a),
              "$a",
              [(LastCallArg (i32 0), regclass:$a)]>;
2244
// Non-final register arguments, one record per register class.
def CallArgI64 : CallArgInst<Int64Regs>;
def CallArgI32 : CallArgInst<Int32Regs>;
def CallArgI16 : CallArgInst<Int16Regs>;
def CallArgF64 : CallArgInst<Float64Regs>;
def CallArgF32 : CallArgInst<Float32Regs>;

// Final register arguments, same set of register classes.
def LastCallArgI64 : LastCallArgInst<Int64Regs>;
def LastCallArgI32 : LastCallArgInst<Int32Regs>;
def LastCallArgI16 : LastCallArgInst<Int16Regs>;
def LastCallArgF64 : LastCallArgInst<Float64Regs>;
def LastCallArgF32 : LastCallArgInst<Float32Regs>;
2256
// Immediate call arguments.  With a leading (i32 0) the immediate itself is
// printed; the non-final form appends ", ".
def CallArgI32imm
  : NVPTXInst<(outs), (ins i32imm:$a),
              "$a, ",
              [(CallArg (i32 0), (i32 imm:$a))]>;
def LastCallArgI32imm
  : NVPTXInst<(outs), (ins i32imm:$a),
              "$a",
              [(LastCallArg (i32 0), (i32 imm:$a))]>;

// With a leading (i32 1) the immediate is printed as the suffix of a "param"
// symbol name (param$a).
def CallArgParam
  : NVPTXInst<(outs), (ins i32imm:$a),
              "param$a, ",
              [(CallArg (i32 1), (i32 imm:$a))]>;
def LastCallArgParam
  : NVPTXInst<(outs), (ins i32imm:$a),
              "param$a",
              [(LastCallArg (i32 1), (i32 imm:$a))]>;
2266
// Call target printing.  Each CallVoid form prints its address operand
// followed by ", ".

// Target given as a wrapped global address.
def CallVoidInst
  : NVPTXInst<(outs), (ins imem:$addr),
              "$addr, ",
              [(CallVoid (Wrapper tglobaladdr:$addr))]>;

// Target held in a 32-bit register.
def CallVoidInstReg
  : NVPTXInst<(outs), (ins Int32Regs:$addr),
              "$addr, ",
              [(CallVoid Int32Regs:$addr)]>;

// Target held in a 64-bit register.
def CallVoidInstReg64
  : NVPTXInst<(outs), (ins Int64Regs:$addr),
              "$addr, ",
              [(CallVoid Int64Regs:$addr)]>;

// Prints ", prototype_<val>;" for a Prototype node.
def PrototypeInst
  : NVPTXInst<(outs), (ins i32imm:$val),
              ", prototype_$val;",
              [(Prototype (i32 imm:$val))]>;
2275
// Declarations of the retval<num> symbols.

// Aligned .b8 array: ".param .align <align> .b8 retval<num>[<size>];"
def DeclareRetMemInst
  : NVPTXInst<(outs),
              (ins i32imm:$align, i32imm:$size, i32imm:$num),
              ".param .align $align .b8 retval$num[$size];",
              [(DeclareRetParam (i32 imm:$align), (i32 imm:$size),
                                (i32 imm:$num))]>;

// Scalar in .param space; selected when DeclareRet's first operand is 1.
def DeclareRetScalarInst
  : NVPTXInst<(outs),
              (ins i32imm:$size, i32imm:$num),
              ".param .b$size retval$num;",
              [(DeclareRet (i32 1), (i32 imm:$size), (i32 imm:$num))]>;

// Scalar declared as .reg; selected when DeclareRet's first operand is 2.
def DeclareRetRegInst
  : NVPTXInst<(outs),
              (ins i32imm:$size, i32imm:$num),
              ".reg .b$size retval$num;",
              [(DeclareRet (i32 2), (i32 imm:$size), (i32 imm:$num))]>;
2288
// Declarations of the param<a> symbols.

// Aligned .b8 array: ".param .align <align> .b8 param<a>[<size>];"
def DeclareParamInst
  : NVPTXInst<(outs),
              (ins i32imm:$align, i32imm:$a, i32imm:$size),
              ".param .align $align .b8 param$a[$size];",
              [(DeclareParam (i32 imm:$align), (i32 imm:$a),
                             (i32 imm:$size))]>;

// Scalar in .param space; selected when DeclareScalarParam's last operand
// is 0.
def DeclareScalarParamInst
  : NVPTXInst<(outs),
              (ins i32imm:$a, i32imm:$size),
              ".param .b$size param$a;",
              [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 0))]>;

// Scalar declared as .reg; selected when DeclareScalarParam's last operand
// is 1.
def DeclareScalarRegInst
  : NVPTXInst<(outs),
              (ins i32imm:$a, i32imm:$size),
              ".reg .b$size param$a;",
              [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 1))]>;
2301
// MoveParam with a register source: printed as "mov<asmstr> dst, src;".
class MoveParamInst<NVPTXRegClass regclass, string asmstr>
  : NVPTXInst<(outs regclass:$dst),
              (ins regclass:$src),
              "mov" # asmstr # " \t$dst, $src;",
              [(set regclass:$dst, (MoveParam regclass:$src))]>;

// MoveParam with an external-symbol source.  `srcty` is the operand type used
// to carry the symbol; the printed text has the same shape as above.
class MoveParamSymbolInst<NVPTXRegClass regclass, Operand srcty, string asmstr>
  : NVPTXInst<(outs regclass:$dst),
              (ins srcty:$src),
              "mov" # asmstr # " \t$dst, $src;",
              [(set regclass:$dst, (MoveParam texternalsym:$src))]>;
2312
// Integer register-to-register MoveParam.
def MoveParamI64 : MoveParamInst<Int64Regs, ".b64">;
def MoveParamI32 : MoveParamInst<Int32Regs, ".b32">;

// MoveParam from an external symbol.
def MoveParamSymbolI64 : MoveParamSymbolInst<Int64Regs, i64imm, ".b64">;
def MoveParamSymbolI32 : MoveParamSymbolInst<Int32Regs, i32imm, ".b32">;

// The 16-bit integer form is written out by hand because it prints
// "cvt.u16.u32" rather than a "mov".
def MoveParamI16
  : NVPTXInst<(outs Int16Regs:$dst),
              (ins Int16Regs:$src),
              "cvt.u16.u32 \t$dst, $src;",
              [(set Int16Regs:$dst, (MoveParam Int16Regs:$src))]>;

// Floating-point register-to-register MoveParam.
def MoveParamF64 : MoveParamInst<Float64Regs, ".f64">;
def MoveParamF32 : MoveParamInst<Float32Regs, ".f32">;
def MoveParamF16 : MoveParamInst<Float16Regs, ".f16">;
2326
// Matches (PseudoUseParam reg) and prints only a PTX comment naming the
// register; no result is produced.
class PseudoUseParamInst<NVPTXRegClass regclass>
  : NVPTXInst<(outs),
              (ins regclass:$src),
              "// Pseudo use of $src",
              [(PseudoUseParam regclass:$src)]>;

def PseudoUseParamI64 : PseudoUseParamInst<Int64Regs>;
def PseudoUseParamI32 : PseudoUseParamInst<Int32Regs>;
def PseudoUseParamI16 : PseudoUseParamInst<Int16Regs>;
def PseudoUseParamF64 : PseudoUseParamInst<Float64Regs>;
def PseudoUseParamF32 : PseudoUseParamInst<Float32Regs>;
2337
// Same-class register copy selected for the ProxyReg node, printed as
// "mov.<SzStr> dst, src;".
class ProxyRegInst<string SzStr, NVPTXRegClass regclass>
  : NVPTXInst<(outs regclass:$dst),
              (ins regclass:$src),
              "mov." # SzStr # " \t$dst, $src;",
              [(set regclass:$dst, (ProxyReg regclass:$src))]>;

// One record per register class, all flagged as codegen-only pseudos.
let isCodeGenOnly = 1, isPseudo = 1 in {
  def ProxyRegI1    : ProxyRegInst<"pred", Int1Regs>;
  def ProxyRegI16   : ProxyRegInst<"b16", Int16Regs>;
  def ProxyRegI32   : ProxyRegInst<"b32", Int32Regs>;
  def ProxyRegI64   : ProxyRegInst<"b64", Int64Regs>;
  def ProxyRegF16   : ProxyRegInst<"b16", Float16Regs>;
  def ProxyRegF32   : ProxyRegInst<"f32", Float32Regs>;
  def ProxyRegF64   : ProxyRegInst<"f64", Float64Regs>;
  def ProxyRegF16x2 : ProxyRegInst<"b32", Float16x2Regs>;
}
2353
//
// Load / Store Handling
//

// Scalar load into `regclass`.  One record is produced per addressing form:
//   _avar    : imem address
//   _areg    : 32-bit register address
//   _areg_64 : 64-bit register address
//   _ari     : 32-bit register + immediate offset
//   _ari_64  : 64-bit register + immediate offset
//   _asi     : imem address + immediate offset
// None of the records carries a selection pattern.
multiclass LD<NVPTXRegClass regclass> {
  // Modifier operands that precede the address in every form.
  defvar LdMods = (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
                       LdStCode:$Sign, i32imm:$fromWidth);
  // Opcode, modifiers and destination; only the bracketed address differs.
  defvar LdHead = "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}"
                  "$fromWidth \t$dst, ";

  def _avar
    : NVPTXInst<(outs regclass:$dst),
                !con(LdMods, (ins imem:$addr)),
                LdHead # "[$addr];", []>;
  def _areg
    : NVPTXInst<(outs regclass:$dst),
                !con(LdMods, (ins Int32Regs:$addr)),
                LdHead # "[$addr];", []>;
  def _areg_64
    : NVPTXInst<(outs regclass:$dst),
                !con(LdMods, (ins Int64Regs:$addr)),
                LdHead # "[$addr];", []>;
  def _ari
    : NVPTXInst<(outs regclass:$dst),
                !con(LdMods, (ins Int32Regs:$addr, i32imm:$offset)),
                LdHead # "[$addr+$offset];", []>;
  def _ari_64
    : NVPTXInst<(outs regclass:$dst),
                !con(LdMods, (ins Int64Regs:$addr, i32imm:$offset)),
                LdHead # "[$addr+$offset];", []>;
  def _asi
    : NVPTXInst<(outs regclass:$dst),
                !con(LdMods, (ins imem:$addr, i32imm:$offset)),
                LdHead # "[$addr+$offset];", []>;
}
2395
// Scalar load instantiations.  All are marked as loads without other side
// effects; the i8 variant uses Int16Regs, like i16.
let mayLoad = 1, hasSideEffects = 0 in {
  defm LD_i8    : LD<Int16Regs>;
  defm LD_i16   : LD<Int16Regs>;
  defm LD_i32   : LD<Int32Regs>;
  defm LD_i64   : LD<Int64Regs>;
  defm LD_f16   : LD<Float16Regs>;
  defm LD_f16x2 : LD<Float16x2Regs>;
  defm LD_f32   : LD<Float32Regs>;
  defm LD_f64   : LD<Float64Regs>;
}
2406
// Scalar store from `regclass`.  One record is produced per addressing form:
//   _avar    : imem address
//   _areg    : 32-bit register address
//   _areg_64 : 64-bit register address
//   _ari     : 32-bit register + immediate offset
//   _ari_64  : 64-bit register + immediate offset
//   _asi     : imem address + immediate offset
// None of the records carries a selection pattern.
multiclass ST<NVPTXRegClass regclass> {
  // Modifier operands that sit between the stored value and the address.
  defvar StMods = (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
                       LdStCode:$Sign, i32imm:$toWidth);
  // Opcode and modifiers; the address and source are appended per form.
  defvar StHead = "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}"
                  "$toWidth \t";

  def _avar
    : NVPTXInst<(outs),
                !con((ins regclass:$src), StMods, (ins imem:$addr)),
                StHead # "[$addr], $src;", []>;
  def _areg
    : NVPTXInst<(outs),
                !con((ins regclass:$src), StMods, (ins Int32Regs:$addr)),
                StHead # "[$addr], $src;", []>;
  def _areg_64
    : NVPTXInst<(outs),
                !con((ins regclass:$src), StMods, (ins Int64Regs:$addr)),
                StHead # "[$addr], $src;", []>;
  def _ari
    : NVPTXInst<(outs),
                !con((ins regclass:$src), StMods,
                     (ins Int32Regs:$addr, i32imm:$offset)),
                StHead # "[$addr+$offset], $src;", []>;
  def _ari_64
    : NVPTXInst<(outs),
                !con((ins regclass:$src), StMods,
                     (ins Int64Regs:$addr, i32imm:$offset)),
                StHead # "[$addr+$offset], $src;", []>;
  def _asi
    : NVPTXInst<(outs),
                !con((ins regclass:$src), StMods,
                     (ins imem:$addr, i32imm:$offset)),
                StHead # "[$addr+$offset], $src;", []>;
}
2445
// Scalar store instantiations.  All are marked as stores without other side
// effects; the i8 variant uses Int16Regs, like i16.
let mayStore = 1, hasSideEffects = 0 in {
  defm ST_i8    : ST<Int16Regs>;
  defm ST_i16   : ST<Int16Regs>;
  defm ST_i32   : ST<Int32Regs>;
  defm ST_i64   : ST<Int64Regs>;
  defm ST_f16   : ST<Float16Regs>;
  defm ST_f16x2 : ST<Float16x2Regs>;
  defm ST_f32   : ST<Float32Regs>;
  defm ST_f64   : ST<Float64Regs>;
}
2456
// The following is used only in and after vector elementizations.  Vector
// elementization happens at the machine instruction level, so the following
// instructions never appear in the DAG.
//
// Vector loads with two (_v2_*) or four (_v4_*) destination registers.  Each
// width is provided for the six addressing forms avar / areg / areg_64 / ari /
// ari_64 / asi (imem, 32-bit reg, 64-bit reg, and the same with an immediate
// offset).
multiclass LD_VEC<NVPTXRegClass regclass> {
  // Modifier operands that precede the address in every form.
  defvar LdvMods = (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
                        LdStCode:$Sign, i32imm:$fromWidth);
  // Opcode and modifiers, followed by the braced destination list.
  defvar LdvOp = "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}"
                 "$fromWidth \t";
  defvar V2Head = LdvOp # "{{$dst1, $dst2}}, ";
  defvar V4Head = LdvOp # "{{$dst1, $dst2, $dst3, $dst4}}, ";

  // ---- two destinations ----
  def _v2_avar
    : NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
                !con(LdvMods, (ins imem:$addr)),
                V2Head # "[$addr];", []>;
  def _v2_areg
    : NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
                !con(LdvMods, (ins Int32Regs:$addr)),
                V2Head # "[$addr];", []>;
  def _v2_areg_64
    : NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
                !con(LdvMods, (ins Int64Regs:$addr)),
                V2Head # "[$addr];", []>;
  def _v2_ari
    : NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
                !con(LdvMods, (ins Int32Regs:$addr, i32imm:$offset)),
                V2Head # "[$addr+$offset];", []>;
  def _v2_ari_64
    : NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
                !con(LdvMods, (ins Int64Regs:$addr, i32imm:$offset)),
                V2Head # "[$addr+$offset];", []>;
  def _v2_asi
    : NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
                !con(LdvMods, (ins imem:$addr, i32imm:$offset)),
                V2Head # "[$addr+$offset];", []>;

  // ---- four destinations ----
  def _v4_avar
    : NVPTXInst<(outs regclass:$dst1, regclass:$dst2,
                      regclass:$dst3, regclass:$dst4),
                !con(LdvMods, (ins imem:$addr)),
                V4Head # "[$addr];", []>;
  def _v4_areg
    : NVPTXInst<(outs regclass:$dst1, regclass:$dst2,
                      regclass:$dst3, regclass:$dst4),
                !con(LdvMods, (ins Int32Regs:$addr)),
                V4Head # "[$addr];", []>;
  def _v4_areg_64
    : NVPTXInst<(outs regclass:$dst1, regclass:$dst2,
                      regclass:$dst3, regclass:$dst4),
                !con(LdvMods, (ins Int64Regs:$addr)),
                V4Head # "[$addr];", []>;
  def _v4_ari
    : NVPTXInst<(outs regclass:$dst1, regclass:$dst2,
                      regclass:$dst3, regclass:$dst4),
                !con(LdvMods, (ins Int32Regs:$addr, i32imm:$offset)),
                V4Head # "[$addr+$offset];", []>;
  def _v4_ari_64
    : NVPTXInst<(outs regclass:$dst1, regclass:$dst2,
                      regclass:$dst3, regclass:$dst4),
                !con(LdvMods, (ins Int64Regs:$addr, i32imm:$offset)),
                V4Head # "[$addr+$offset];", []>;
  def _v4_asi
    : NVPTXInst<(outs regclass:$dst1, regclass:$dst2,
                      regclass:$dst3, regclass:$dst4),
                !con(LdvMods, (ins imem:$addr, i32imm:$offset)),
                V4Head # "[$addr+$offset];", []>;
}
// Vector load instantiations.  All are marked as loads without other side
// effects; the i8 variant uses Int16Regs, like i16.
let mayLoad = 1, hasSideEffects = 0 in {
  defm LDV_i8    : LD_VEC<Int16Regs>;
  defm LDV_i16   : LD_VEC<Int16Regs>;
  defm LDV_i32   : LD_VEC<Int32Regs>;
  defm LDV_i64   : LD_VEC<Int64Regs>;
  defm LDV_f16   : LD_VEC<Float16Regs>;
  defm LDV_f16x2 : LD_VEC<Float16x2Regs>;
  defm LDV_f32   : LD_VEC<Float32Regs>;
  defm LDV_f64   : LD_VEC<Float64Regs>;
}
2544
// Vector stores with two (_v2_*) or four (_v4_*) source registers.  Each width
// is provided for the six addressing forms avar / areg / areg_64 / ari /
// ari_64 / asi (imem, 32-bit reg, 64-bit reg, and the same with an immediate
// offset).  The width operand is named $fromWidth here, unlike the scalar ST
// multiclass, which calls it $toWidth.
multiclass ST_VEC<NVPTXRegClass regclass> {
  // Modifier operands that sit between the stored values and the address.
  defvar StvMods = (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
                        LdStCode:$Sign, i32imm:$fromWidth);
  // Opcode and modifiers; address and braced source list are appended.
  defvar StvOp = "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}"
                 "$fromWidth \t";
  defvar V2Tail = ", {{$src1, $src2}};";
  defvar V4Tail = ", {{$src1, $src2, $src3, $src4}};";

  // ---- two sources ----
  def _v2_avar
    : NVPTXInst<(outs),
                !con((ins regclass:$src1, regclass:$src2), StvMods,
                     (ins imem:$addr)),
                StvOp # "[$addr]" # V2Tail, []>;
  def _v2_areg
    : NVPTXInst<(outs),
                !con((ins regclass:$src1, regclass:$src2), StvMods,
                     (ins Int32Regs:$addr)),
                StvOp # "[$addr]" # V2Tail, []>;
  def _v2_areg_64
    : NVPTXInst<(outs),
                !con((ins regclass:$src1, regclass:$src2), StvMods,
                     (ins Int64Regs:$addr)),
                StvOp # "[$addr]" # V2Tail, []>;
  def _v2_ari
    : NVPTXInst<(outs),
                !con((ins regclass:$src1, regclass:$src2), StvMods,
                     (ins Int32Regs:$addr, i32imm:$offset)),
                StvOp # "[$addr+$offset]" # V2Tail, []>;
  def _v2_ari_64
    : NVPTXInst<(outs),
                !con((ins regclass:$src1, regclass:$src2), StvMods,
                     (ins Int64Regs:$addr, i32imm:$offset)),
                StvOp # "[$addr+$offset]" # V2Tail, []>;
  def _v2_asi
    : NVPTXInst<(outs),
                !con((ins regclass:$src1, regclass:$src2), StvMods,
                     (ins imem:$addr, i32imm:$offset)),
                StvOp # "[$addr+$offset]" # V2Tail, []>;

  // ---- four sources ----
  def _v4_avar
    : NVPTXInst<(outs),
                !con((ins regclass:$src1, regclass:$src2,
                          regclass:$src3, regclass:$src4), StvMods,
                     (ins imem:$addr)),
                StvOp # "[$addr]" # V4Tail, []>;
  def _v4_areg
    : NVPTXInst<(outs),
                !con((ins regclass:$src1, regclass:$src2,
                          regclass:$src3, regclass:$src4), StvMods,
                     (ins Int32Regs:$addr)),
                StvOp # "[$addr]" # V4Tail, []>;
  def _v4_areg_64
    : NVPTXInst<(outs),
                !con((ins regclass:$src1, regclass:$src2,
                          regclass:$src3, regclass:$src4), StvMods,
                     (ins Int64Regs:$addr)),
                StvOp # "[$addr]" # V4Tail, []>;
  def _v4_ari
    : NVPTXInst<(outs),
                !con((ins regclass:$src1, regclass:$src2,
                          regclass:$src3, regclass:$src4), StvMods,
                     (ins Int32Regs:$addr, i32imm:$offset)),
                StvOp # "[$addr+$offset]" # V4Tail, []>;
  def _v4_ari_64
    : NVPTXInst<(outs),
                !con((ins regclass:$src1, regclass:$src2,
                          regclass:$src3, regclass:$src4), StvMods,
                     (ins Int64Regs:$addr, i32imm:$offset)),
                StvOp # "[$addr+$offset]" # V4Tail, []>;
  def _v4_asi
    : NVPTXInst<(outs),
                !con((ins regclass:$src1, regclass:$src2,
                          regclass:$src3, regclass:$src4), StvMods,
                     (ins imem:$addr, i32imm:$offset)),
                StvOp # "[$addr+$offset]" # V4Tail, []>;
}
2628
// Vector store instantiations.  All are marked as stores without other side
// effects; the i8 variant uses Int16Regs, like i16.
let mayStore = 1, hasSideEffects = 0 in {
  defm STV_i8    : ST_VEC<Int16Regs>;
  defm STV_i16   : ST_VEC<Int16Regs>;
  defm STV_i32   : ST_VEC<Int32Regs>;
  defm STV_i64   : ST_VEC<Int64Regs>;
  defm STV_f16   : ST_VEC<Float16Regs>;
  defm STV_f16x2 : ST_VEC<Float16x2Regs>;
  defm STV_f32   : ST_VEC<Float32Regs>;
  defm STV_f64   : ST_VEC<Float64Regs>;
}
2639
2640//---- Conversion ----
2641
// bitconvert between two register classes of equal width, printed as
// "mov.b<SzStr> d, a;".
class F_BITCONVERT<string SzStr, NVPTXRegClass regclassIn,
                   NVPTXRegClass regclassOut>
  : NVPTXInst<(outs regclassOut:$d),
              (ins regclassIn:$a),
              "mov.b" # SzStr # " \t$d, $a;",
              [(set regclassOut:$d, (bitconvert regclassIn:$a))]>;

// 16-bit: i16 <-> f16.
def BITCONVERT_16_I2F     : F_BITCONVERT<"16", Int16Regs, Float16Regs>;
def BITCONVERT_16_F2I     : F_BITCONVERT<"16", Float16Regs, Int16Regs>;
// 32-bit: i32 <-> f32.
def BITCONVERT_32_I2F     : F_BITCONVERT<"32", Int32Regs, Float32Regs>;
def BITCONVERT_32_F2I     : F_BITCONVERT<"32", Float32Regs, Int32Regs>;
// 64-bit: i64 <-> f64.
def BITCONVERT_64_I2F     : F_BITCONVERT<"64", Int64Regs, Float64Regs>;
def BITCONVERT_64_F2I     : F_BITCONVERT<"64", Float64Regs, Int64Regs>;
// 32-bit: i32 <-> f16x2.
def BITCONVERT_32_I2F16x2 : F_BITCONVERT<"32", Int32Regs, Float16x2Regs>;
def BITCONVERT_32_F16x22I : F_BITCONVERT<"32", Float16x2Regs, Int32Regs>;
2656
// NOTE: pred->fp are currently sub-optimal due to an issue in TableGen where
// we cannot specify floating-point literals in isel patterns.  Therefore, we
// use an integer selp to select either 1 or 0 and then cvt to floating-point.

// Integer register -> floating point: a single cvt with the CvtRN mode.
class NVPTXIntToFPPat<ValueType fpVT, SDPatternOperator cvtOp,
                      NVPTXRegClass srcRC, Instruction cvtInst>
  : Pat<(fpVT (cvtOp srcRC:$a)),
        (cvtInst srcRC:$a, CvtRN)>;

// Predicate -> floating point: SELP_u32ii picks 1 or 0, which is then
// converted with the CvtRN mode.
class NVPTXPredToFPPat<ValueType fpVT, SDPatternOperator cvtOp,
                       Instruction cvtInst>
  : Pat<(fpVT (cvtOp Int1Regs:$a)),
        (cvtInst (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;

// sint -> f16
def : NVPTXPredToFPPat<f16, sint_to_fp, CVT_f16_s32>;
def : NVPTXIntToFPPat<f16, sint_to_fp, Int16Regs, CVT_f16_s16>;
def : NVPTXIntToFPPat<f16, sint_to_fp, Int32Regs, CVT_f16_s32>;
def : NVPTXIntToFPPat<f16, sint_to_fp, Int64Regs, CVT_f16_s64>;

// uint -> f16
def : NVPTXPredToFPPat<f16, uint_to_fp, CVT_f16_u32>;
def : NVPTXIntToFPPat<f16, uint_to_fp, Int16Regs, CVT_f16_u16>;
def : NVPTXIntToFPPat<f16, uint_to_fp, Int32Regs, CVT_f16_u32>;
def : NVPTXIntToFPPat<f16, uint_to_fp, Int64Regs, CVT_f16_u64>;

// sint -> f32
def : NVPTXPredToFPPat<f32, sint_to_fp, CVT_f32_s32>;
def : NVPTXIntToFPPat<f32, sint_to_fp, Int16Regs, CVT_f32_s16>;
def : NVPTXIntToFPPat<f32, sint_to_fp, Int32Regs, CVT_f32_s32>;
def : NVPTXIntToFPPat<f32, sint_to_fp, Int64Regs, CVT_f32_s64>;

// uint -> f32
def : NVPTXPredToFPPat<f32, uint_to_fp, CVT_f32_u32>;
def : NVPTXIntToFPPat<f32, uint_to_fp, Int16Regs, CVT_f32_u16>;
def : NVPTXIntToFPPat<f32, uint_to_fp, Int32Regs, CVT_f32_u32>;
def : NVPTXIntToFPPat<f32, uint_to_fp, Int64Regs, CVT_f32_u64>;

// sint -> f64
def : NVPTXPredToFPPat<f64, sint_to_fp, CVT_f64_s32>;
def : NVPTXIntToFPPat<f64, sint_to_fp, Int16Regs, CVT_f64_s16>;
def : NVPTXIntToFPPat<f64, sint_to_fp, Int32Regs, CVT_f64_s32>;
def : NVPTXIntToFPPat<f64, sint_to_fp, Int64Regs, CVT_f64_s64>;

// uint -> f64
def : NVPTXPredToFPPat<f64, uint_to_fp, CVT_f64_u32>;
def : NVPTXIntToFPPat<f64, uint_to_fp, Int16Regs, CVT_f64_u16>;
def : NVPTXIntToFPPat<f64, uint_to_fp, Int32Regs, CVT_f64_u32>;
def : NVPTXIntToFPPat<f64, uint_to_fp, Int64Regs, CVT_f64_u64>;
2720
2721
// f16 -> sint
// The i1 result is bit 0 of the value truncated toward zero, so convert to a
// 32-bit integer first and then test the low bit, the same way the integer
// trunc-to-i1 patterns do.  Comparing the raw f16 bit pattern against zero
// with CmpEQ would instead yield true for +0.0 and false for any input whose
// converted value has bit 0 set.
def : Pat<(i1 (fp_to_sint Float16Regs:$a)),
          (SETP_b32ri (ANDb32ri (CVT_s32_f16 Float16Regs:$a, CvtRZI), 1),
                      1, CmpEQ)>;
def : Pat<(i16 (fp_to_sint Float16Regs:$a)),
          (CVT_s16_f16 Float16Regs:$a, CvtRZI)>;
def : Pat<(i32 (fp_to_sint Float16Regs:$a)),
          (CVT_s32_f16 Float16Regs:$a, CvtRZI)>;
def : Pat<(i64 (fp_to_sint Float16Regs:$a)),
          (CVT_s64_f16 Float16Regs:$a, CvtRZI)>;

// f16 -> uint
// Same i1 handling as above, going through the unsigned 32-bit conversion.
def : Pat<(i1 (fp_to_uint Float16Regs:$a)),
          (SETP_b32ri (ANDb32ri (CVT_u32_f16 Float16Regs:$a, CvtRZI), 1),
                      1, CmpEQ)>;
def : Pat<(i16 (fp_to_uint Float16Regs:$a)),
          (CVT_u16_f16 Float16Regs:$a, CvtRZI)>;
def : Pat<(i32 (fp_to_uint Float16Regs:$a)),
          (CVT_u32_f16 Float16Regs:$a, CvtRZI)>;
def : Pat<(i64 (fp_to_uint Float16Regs:$a)),
          (CVT_u64_f16 Float16Regs:$a, CvtRZI)>;
2741
// f32 -> sint
// Every conversion comes as a pair: the FTZ form guarded by doF32FTZ is listed
// first, followed by the unguarded non-FTZ fallback.
//
// The i1 result is bit 0 of the value truncated toward zero, so convert to a
// 32-bit integer first and then test the low bit, the same way the integer
// trunc-to-i1 patterns do.  Comparing the raw f32 bit pattern against zero
// with CmpEQ would instead yield true for +0.0 and false for any input whose
// converted value has bit 0 set.
def : Pat<(i1 (fp_to_sint Float32Regs:$a)),
          (SETP_b32ri (ANDb32ri (CVT_s32_f32 Float32Regs:$a, CvtRZI_FTZ), 1),
                      1, CmpEQ)>, Requires<[doF32FTZ]>;
def : Pat<(i1 (fp_to_sint Float32Regs:$a)),
          (SETP_b32ri (ANDb32ri (CVT_s32_f32 Float32Regs:$a, CvtRZI), 1),
                      1, CmpEQ)>;
def : Pat<(i16 (fp_to_sint Float32Regs:$a)),
          (CVT_s16_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(i16 (fp_to_sint Float32Regs:$a)),
          (CVT_s16_f32 Float32Regs:$a, CvtRZI)>;
def : Pat<(i32 (fp_to_sint Float32Regs:$a)),
          (CVT_s32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(i32 (fp_to_sint Float32Regs:$a)),
          (CVT_s32_f32 Float32Regs:$a, CvtRZI)>;
def : Pat<(i64 (fp_to_sint Float32Regs:$a)),
          (CVT_s64_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(i64 (fp_to_sint Float32Regs:$a)),
          (CVT_s64_f32 Float32Regs:$a, CvtRZI)>;

// f32 -> uint
// Same structure as the signed group, using the unsigned conversions.
def : Pat<(i1 (fp_to_uint Float32Regs:$a)),
          (SETP_b32ri (ANDb32ri (CVT_u32_f32 Float32Regs:$a, CvtRZI_FTZ), 1),
                      1, CmpEQ)>, Requires<[doF32FTZ]>;
def : Pat<(i1 (fp_to_uint Float32Regs:$a)),
          (SETP_b32ri (ANDb32ri (CVT_u32_f32 Float32Regs:$a, CvtRZI), 1),
                      1, CmpEQ)>;
def : Pat<(i16 (fp_to_uint Float32Regs:$a)),
          (CVT_u16_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(i16 (fp_to_uint Float32Regs:$a)),
          (CVT_u16_f32 Float32Regs:$a, CvtRZI)>;
def : Pat<(i32 (fp_to_uint Float32Regs:$a)),
          (CVT_u32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(i32 (fp_to_uint Float32Regs:$a)),
          (CVT_u32_f32 Float32Regs:$a, CvtRZI)>;
def : Pat<(i64 (fp_to_uint Float32Regs:$a)),
          (CVT_u64_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(i64 (fp_to_uint Float32Regs:$a)),
          (CVT_u64_f32 Float32Regs:$a, CvtRZI)>;
2773
// f64 -> sint
// The i1 result is bit 0 of the value truncated toward zero, so convert to a
// 32-bit integer first and then test the low bit, the same way the integer
// trunc-to-i1 patterns do.  Comparing the raw f64 bit pattern against zero
// with CmpEQ would instead yield true for +0.0 and false for any input whose
// converted value has bit 0 set.
def : Pat<(i1 (fp_to_sint Float64Regs:$a)),
          (SETP_b32ri (ANDb32ri (CVT_s32_f64 Float64Regs:$a, CvtRZI), 1),
                      1, CmpEQ)>;
def : Pat<(i16 (fp_to_sint Float64Regs:$a)),
          (CVT_s16_f64 Float64Regs:$a, CvtRZI)>;
def : Pat<(i32 (fp_to_sint Float64Regs:$a)),
          (CVT_s32_f64 Float64Regs:$a, CvtRZI)>;
def : Pat<(i64 (fp_to_sint Float64Regs:$a)),
          (CVT_s64_f64 Float64Regs:$a, CvtRZI)>;

// f64 -> uint
// Same i1 handling as above, going through the unsigned 32-bit conversion.
def : Pat<(i1 (fp_to_uint Float64Regs:$a)),
          (SETP_b32ri (ANDb32ri (CVT_u32_f64 Float64Regs:$a, CvtRZI), 1),
                      1, CmpEQ)>;
def : Pat<(i16 (fp_to_uint Float64Regs:$a)),
          (CVT_u16_f64 Float64Regs:$a, CvtRZI)>;
def : Pat<(i32 (fp_to_uint Float64Regs:$a)),
          (CVT_u32_f64 Float64Regs:$a, CvtRZI)>;
def : Pat<(i64 (fp_to_uint Float64Regs:$a)),
          (CVT_u64_f64 Float64Regs:$a, CvtRZI)>;
2793
// sext i1: a true predicate becomes -1 (all ones), false becomes 0.
def : Pat<(i16 (sext Int1Regs:$a)), (SELP_s16ii -1, 0, Int1Regs:$a)>;
def : Pat<(i32 (sext Int1Regs:$a)), (SELP_s32ii -1, 0, Int1Regs:$a)>;
def : Pat<(i64 (sext Int1Regs:$a)), (SELP_s64ii -1, 0, Int1Regs:$a)>;

// zext i1: a true predicate becomes 1, false becomes 0.
def : Pat<(i16 (zext Int1Regs:$a)), (SELP_u16ii 1, 0, Int1Regs:$a)>;
def : Pat<(i32 (zext Int1Regs:$a)), (SELP_u32ii 1, 0, Int1Regs:$a)>;
def : Pat<(i64 (zext Int1Regs:$a)), (SELP_u64ii 1, 0, Int1Regs:$a)>;

// anyext i1: these patterns select between -1 and 0, like sext, but through
// the unsigned SELP variants.
def : Pat<(i16 (anyext Int1Regs:$a)), (SELP_u16ii -1, 0, Int1Regs:$a)>;
def : Pat<(i32 (anyext Int1Regs:$a)), (SELP_u32ii -1, 0, Int1Regs:$a)>;
def : Pat<(i64 (anyext Int1Regs:$a)), (SELP_u64ii -1, 0, Int1Regs:$a)>;
2817
// sext i16: signed widening cvt, no rounding mode (CvtNONE).
def : Pat<(i32 (sext Int16Regs:$a)), (CVT_s32_s16 Int16Regs:$a, CvtNONE)>;
def : Pat<(i64 (sext Int16Regs:$a)), (CVT_s64_s16 Int16Regs:$a, CvtNONE)>;

// zext i16: unsigned widening cvt.
def : Pat<(i32 (zext Int16Regs:$a)), (CVT_u32_u16 Int16Regs:$a, CvtNONE)>;
def : Pat<(i64 (zext Int16Regs:$a)), (CVT_u64_u16 Int16Regs:$a, CvtNONE)>;

// anyext i16: selected exactly like zext.
def : Pat<(i32 (anyext Int16Regs:$a)), (CVT_u32_u16 Int16Regs:$a, CvtNONE)>;
def : Pat<(i64 (anyext Int16Regs:$a)), (CVT_u64_u16 Int16Regs:$a, CvtNONE)>;
2835
// sext i32: signed widening cvt, no rounding mode (CvtNONE).
def : Pat<(i64 (sext Int32Regs:$a)), (CVT_s64_s32 Int32Regs:$a, CvtNONE)>;

// zext i32: unsigned widening cvt.
def : Pat<(i64 (zext Int32Regs:$a)), (CVT_u64_u32 Int32Regs:$a, CvtNONE)>;

// anyext i32: selected exactly like zext.
def : Pat<(i64 (anyext Int32Regs:$a)), (CVT_u64_u32 Int32Regs:$a, CvtNONE)>;
2847
2848
// truncate i64
// Narrowing to i32/i16 is an unsigned cvt.  Narrowing to i1 keeps only bit 0:
// the source is masked with 1 and the result compared for equality with 1.
def : Pat<(i32 (trunc Int64Regs:$a)), (CVT_u32_u64 Int64Regs:$a, CvtNONE)>;
def : Pat<(i16 (trunc Int64Regs:$a)), (CVT_u16_u64 Int64Regs:$a, CvtNONE)>;
def : Pat<(i1 (trunc Int64Regs:$a)),
          (SETP_b64ri (ANDb64ri Int64Regs:$a, 1), 1, CmpEQ)>;

// truncate i32
def : Pat<(i16 (trunc Int32Regs:$a)), (CVT_u16_u32 Int32Regs:$a, CvtNONE)>;
def : Pat<(i1 (trunc Int32Regs:$a)),
          (SETP_b32ri (ANDb32ri Int32Regs:$a, 1), 1, CmpEQ)>;

// truncate i16
def : Pat<(i1 (trunc Int16Regs:$a)),
          (SETP_b16ri (ANDb16ri Int16Regs:$a, 1), 1, CmpEQ)>;
2866
// sext_inreg
// One CVT_INREG instruction exists per (register width, source width) pair;
// the i8/i16/i32 type operand of sext_inreg picks the source width.
def : Pat<(sext_inreg Int16Regs:$a, i8),
          (CVT_INREG_s16_s8 Int16Regs:$a)>;
def : Pat<(sext_inreg Int32Regs:$a, i8),
          (CVT_INREG_s32_s8 Int32Regs:$a)>;
def : Pat<(sext_inreg Int32Regs:$a, i16),
          (CVT_INREG_s32_s16 Int32Regs:$a)>;
def : Pat<(sext_inreg Int64Regs:$a, i8),
          (CVT_INREG_s64_s8 Int64Regs:$a)>;
def : Pat<(sext_inreg Int64Regs:$a, i16),
          (CVT_INREG_s64_s16 Int64Regs:$a)>;
def : Pat<(sext_inreg Int64Regs:$a, i32),
          (CVT_INREG_s64_s32 Int64Regs:$a)>;
2874
2875
// Select instructions with 32-bit predicates.
// The i32 condition is reduced to a predicate by testing its low bit
// ((pred & 1) == 1); that predicate then drives the SELP of the value type.
def : Pat<(select Int32Regs:$pred, Int16Regs:$a, Int16Regs:$b),
          (SELP_b16rr Int16Regs:$a, Int16Regs:$b,
                      (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
def : Pat<(select Int32Regs:$pred, Int32Regs:$a, Int32Regs:$b),
          (SELP_b32rr Int32Regs:$a, Int32Regs:$b,
                      (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
def : Pat<(select Int32Regs:$pred, Int64Regs:$a, Int64Regs:$b),
          (SELP_b64rr Int64Regs:$a, Int64Regs:$b,
                      (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
def : Pat<(select Int32Regs:$pred, Float16Regs:$a, Float16Regs:$b),
          (SELP_f16rr Float16Regs:$a, Float16Regs:$b,
                      (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
def : Pat<(select Int32Regs:$pred, Float32Regs:$a, Float32Regs:$b),
          (SELP_f32rr Float32Regs:$a, Float32Regs:$b,
                      (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
def : Pat<(select Int32Regs:$pred, Float64Regs:$a, Float64Regs:$b),
          (SELP_f64rr Float64Regs:$a, Float64Regs:$b,
                      (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
2895
2896
// Register packing and unpacking through the vector form of mov.  All of
// these are declared without selection patterns (empty pattern lists).
let hasSideEffects = false in {
  // Pack a set of smaller registers into one larger register.
  def V4I16toI64 :
    NVPTXInst<(outs Int64Regs:$d),
              (ins Int16Regs:$s1, Int16Regs:$s2, Int16Regs:$s3, Int16Regs:$s4),
              "mov.b64 \t$d, {{$s1, $s2, $s3, $s4}};", []>;
  def V2I16toI32 :
    NVPTXInst<(outs Int32Regs:$d),
              (ins Int16Regs:$s1, Int16Regs:$s2),
              "mov.b32 \t$d, {{$s1, $s2}};", []>;
  def V2I32toI64 :
    NVPTXInst<(outs Int64Regs:$d),
              (ins Int32Regs:$s1, Int32Regs:$s2),
              "mov.b64 \t$d, {{$s1, $s2}};", []>;
  def V2F32toF64 :
    NVPTXInst<(outs Float64Regs:$d),
              (ins Float32Regs:$s1, Float32Regs:$s2),
              "mov.b64 \t$d, {{$s1, $s2}};", []>;

  // Unpack one larger register into a set of smaller registers.
  def I64toV4I16 :
    NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2, Int16Regs:$d3, Int16Regs:$d4),
              (ins Int64Regs:$s),
              "mov.b64 \t{{$d1, $d2, $d3, $d4}}, $s;", []>;
  def I32toV2I16 :
    NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2),
              (ins Int32Regs:$s),
              "mov.b32 \t{{$d1, $d2}}, $s;", []>;
  def I64toV2I32 :
    NVPTXInst<(outs Int32Regs:$d1, Int32Regs:$d2),
              (ins Int64Regs:$s),
              "mov.b64 \t{{$d1, $d2}}, $s;", []>;
  def F64toV2F32 :
    NVPTXInst<(outs Float32Regs:$d1, Float32Regs:$d2),
              (ins Float64Regs:$s),
              "mov.b64 \t{{$d1, $d2}}, $s;", []>;
}
2929
let hasSideEffects = false in {
  // Extract one element of an f16x2 register.  PTX does not provide any way
  // to access the elements of an f16x2 vector directly, so the register is
  // unpacked with the unwanted half landing in a temporary register.
  def F16x2toF16_0 :
    NVPTXInst<(outs Float16Regs:$dst), (ins Float16x2Regs:$src),
              "{{ .reg .b16 \t%tmp_hi;\n\t"
              "  mov.b32 \t{$dst, %tmp_hi}, $src; }}",
              [(set Float16Regs:$dst,
                    (extractelt (v2f16 Float16x2Regs:$src), 0))]>;
  def F16x2toF16_1 :
    NVPTXInst<(outs Float16Regs:$dst), (ins Float16x2Regs:$src),
              "{{ .reg .b16 \t%tmp_lo;\n\t"
              "  mov.b32 \t{%tmp_lo, $dst}, $src; }}",
              [(set Float16Regs:$dst,
                    (extractelt (v2f16 Float16x2Regs:$src), 1))]>;

  // Coalesce two f16 registers into one f16x2 register.
  def BuildF16x2 :
    NVPTXInst<(outs Float16x2Regs:$dst), (ins Float16Regs:$a, Float16Regs:$b),
              "mov.b32 \t$dst, {{$a, $b}};",
              [(set Float16x2Regs:$dst,
                    (build_vector (f16 Float16Regs:$a),
                                  (f16 Float16Regs:$b)))]>;

  // Directly initializing the underlying b32 register takes one less SASS
  // instruction than a vector-packing move.
  def BuildF16x2i :
    NVPTXInst<(outs Float16x2Regs:$dst), (ins i32imm:$src),
              "mov.b32 \t$dst, $src;", []>;

  // Split an f16x2 register into two f16 registers.
  def SplitF16x2 :
    NVPTXInst<(outs Float16Regs:$lo, Float16Regs:$hi), (ins Float16x2Regs:$src),
              "mov.b32 \t{{$lo, $hi}}, $src;", []>;

  // Split an i32 register into two f16 registers.
  def SplitI32toF16x2 :
    NVPTXInst<(outs Float16Regs:$lo, Float16Regs:$hi), (ins Int32Regs:$src),
              "mov.b32 \t{{$lo, $hi}}, $src;", []>;
}
2971
// Count leading zeros.  Both instruction forms write a 32-bit result,
// including the one that takes a 64-bit source.
let hasSideEffects = false in {
  def CLZr32 :
    NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
              "clz.b32 \t$d, $a;", []>;
  def CLZr64 :
    NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
              "clz.b64 \t$d, $a;", []>;
}

// 32-bit has a direct PTX instruction.
def : Pat<(ctlz Int32Regs:$a),
          (CLZr32 Int32Regs:$a)>;

// The return type of the ctlz ISD node is the same as its input, but the PTX
// clz instruction always returns a 32-bit value.  For ctlz.i64, convert the
// PTX value to 64 bits to match the ISD node's semantics, unless we know we're
// truncating back down to 32 bits.
def : Pat<(i64 (ctlz Int64Regs:$a)),
          (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>;
def : Pat<(i32 (trunc (ctlz Int64Regs:$a))),
          (CLZr64 Int64Regs:$a)>;
2989
// For 16-bit ctlz, we zero-extend to 32 bits, perform the count, then truncate
// the result back to 16 bits if necessary.  We also need to subtract 16
// because the 16 high-order zeros introduced by the extension were counted.
//
// TODO: NVPTX has a mov.b32 b32reg, {imm, b16reg} instruction, which we could
// use to save one SASS instruction (on sm_35 anyway):
//
//   mov.b32 $tmp, {0xffff, $a}
//   clz.b32 $result, $tmp
//
// That is, instead of zero-extending the input to 32 bits, we'd "one-extend"
// it and then count the leading zeros of that value.  This way we wouldn't
// have to subtract 16 from the result.  Unfortunately today we don't have a
// way to generate "mov b32reg, {b16imm, b16reg}", so we don't do this
// optimization.
def : Pat<(i16 (ctlz Int16Regs:$a)),
          (SUBi16ri
            (CVT_u16_u32 (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)),
                         CvtNONE),
            16)>;
// When the i16 count is immediately zero-extended to i32, the truncation is
// skipped and the subtraction is done in 32 bits.
def : Pat<(i32 (zext (i16 (ctlz Int16Regs:$a)))),
          (SUBi32ri (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)),
                    16)>;
3009
// Population count.  Both instruction forms write a 32-bit result, including
// the one that takes a 64-bit source.
let hasSideEffects = false in {
  def POPCr32 :
    NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
              "popc.b32 \t$d, $a;", []>;
  def POPCr64 :
    NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
              "popc.b64 \t$d, $a;", []>;
}

// 32-bit has a direct PTX instruction.
def : Pat<(ctpop Int32Regs:$a),
          (POPCr32 Int32Regs:$a)>;

// For 64-bit, the result in PTX is actually 32-bit, so we zero-extend it to
// 64 bits to match the LLVM semantics.  Just as with ctlz.i64, a second
// pattern avoids the type conversion when the result is truncated to i32
// anyway.
def : Pat<(ctpop Int64Regs:$a),
          (CVT_u64_u32 (POPCr64 Int64Regs:$a), CvtNONE)>;
def : Pat<(i32 (trunc (ctpop Int64Regs:$a))),
          (POPCr64 Int64Regs:$a)>;

// For 16-bit, we zero-extend to 32 bits, count, then truncate the result back
// to 16 bits.  If the result is zero-extended to i32, the final truncation is
// not needed.
def : Pat<(ctpop Int16Regs:$a),
          (CVT_u16_u32 (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)),
                       CvtNONE)>;
def : Pat<(i32 (zext (i16 (ctpop Int16Regs:$a)))),
          (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE))>;
3034
// Floating-point narrowing (fpround) uses the CvtRN mode; widening (fpextend)
// uses CvtNONE.  Whenever f32 is the source or the destination, the FTZ form
// guarded by doF32FTZ is listed first and the plain form is the fallback;
// the f16 <-> f64 conversions have a single form.

// fpround f32 -> f16
def : Pat<(f16 (fpround Float32Regs:$a)),
    (CVT_f16_f32 Float32Regs:$a, CvtRN)>;

// fpround f64 -> f16
def : Pat<(f16 (fpround Float64Regs:$a)),
    (CVT_f16_f64 Float64Regs:$a, CvtRN)>;

// fpround f64 -> f32
def : Pat<(f32 (fpround Float64Regs:$a)),
    (CVT_f32_f64 Float64Regs:$a, CvtRN_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(f32 (fpround Float64Regs:$a)),
    (CVT_f32_f64 Float64Regs:$a, CvtRN)>;

// fpextend f16 -> f32
def : Pat<(f32 (fpextend Float16Regs:$a)),
    (CVT_f32_f16 Float16Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(f32 (fpextend Float16Regs:$a)),
    (CVT_f32_f16 Float16Regs:$a, CvtNONE)>;

// fpextend f16 -> f64
def : Pat<(f64 (fpextend Float16Regs:$a)),
    (CVT_f64_f16 Float16Regs:$a, CvtNONE)>;

// fpextend f32 -> f64
def : Pat<(f64 (fpextend Float32Regs:$a)),
    (CVT_f64_f32 Float32Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(f64 (fpextend Float32Regs:$a)),
    (CVT_f64_f32 Float32Regs:$a, CvtNONE)>;
3064
// Return node: no operands or results (SDTNone), chained, with optional
// incoming glue.
def retflag :
  SDNode<"NVPTXISD::RET_FLAG", SDTNone, [SDNPHasChain, SDNPOptInGlue]>;
3067
// fceil, ffloor, ftrunc, fnearbyint, frint.
//
// Each of these is selected as a same-type cvt whose conversion mode performs
// the rounding.  The patterns differ only in the matched node and the mode,
// so they are stamped out from one multiclass:
//   - f16 and f64 always use Mode;
//   - f32 uses ModeFTZ under doF32FTZ and Mode under doNoF32FTZ.
multiclass CVT_ROUND<SDNode OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {
  def : Pat<(OpNode Float16Regs:$a),
            (CVT_f16_f16 Float16Regs:$a, Mode)>;
  def : Pat<(OpNode Float32Regs:$a),
            (CVT_f32_f32 Float32Regs:$a, ModeFTZ)>, Requires<[doF32FTZ]>;
  def : Pat<(OpNode Float32Regs:$a),
            (CVT_f32_f32 Float32Regs:$a, Mode)>, Requires<[doNoF32FTZ]>;
  def : Pat<(OpNode Float64Regs:$a),
            (CVT_f64_f64 Float64Regs:$a, Mode)>;
}

defm : CVT_ROUND<fceil,  CvtRPI, CvtRPI_FTZ>;
defm : CVT_ROUND<ffloor, CvtRMI, CvtRMI_FTZ>;
defm : CVT_ROUND<ftrunc, CvtRZI, CvtRZI_FTZ>;

// nearbyint and rint are implemented as rounding to nearest even.  This isn't
// strictly correct, because it causes us to ignore the rounding mode.  But it
// matches what CUDA's "libm" does.
defm : CVT_ROUND<fnearbyint, CvtRNI, CvtRNI_FTZ>;
defm : CVT_ROUND<frint,      CvtRNI, CvtRNI_FTZ>;
3118
3119
//-----------------------------------
// Control-flow
//-----------------------------------

// Every instruction in this group is a terminator.
let isTerminator = 1 in {
  // Function return; matched from the retflag node.
  let isReturn = 1, isBarrier = 1 in
  def Return : NVPTXInst<(outs), (ins), "ret;", [(retflag)]>;

  let isBranch = 1 in {
    // Conditional branch, taken when predicate $a is set.
    def CBranch :
      NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
                "@$a bra \t$target;",
                [(brcond Int1Regs:$a, bb:$target)]>;

    // Conditional branch on the negated predicate.  It has no selection
    // pattern of its own.
    def CBranchOther :
      NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
                "@!$a bra \t$target;", []>;

    // Unconditional branch.
    let isBarrier = 1 in
    def GOTO :
      NVPTXInst<(outs), (ins brtarget:$target),
                "bra.uni \t$target;", [(br bb:$target)]>;
  }
}
3140
// A 32-bit branch condition is first turned into a predicate by comparing it
// with zero (CmpNE); the predicate then feeds the conditional branch.
def : Pat<(brcond Int32Regs:$a, bb:$target),
          (CBranch (SETP_u32ri Int32Regs:$a, 0, CmpNE),
                   bb:$target)>;

// SelectionDAGBuilder::visitSwitchCase() will invert the condition of a
// conditional branch if the target block is the next block, so that the code
// can fall through to the target block.  The inversion is done by 'xor
// condition, 1', which will be translated to (setne condition, -1).  Since PTX
// supports '@!pred bra target', we should use it.
def : Pat<(brcond (i1 (setne Int1Regs:$a, -1)), bb:$target),
          (CBranchOther Int1Regs:$a,
                        bb:$target)>;
3151
// Call
// Type profiles for the call-sequence markers: both operands are i32.
def SDT_NVPTXCallSeqStart :
  SDCallSeqStart<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
def SDT_NVPTXCallSeqEnd :
  SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;

// Call-sequence start/end nodes.  Both are chained, produce glue and are
// marked as having side effects; the end node additionally accepts optional
// incoming glue.
def callseq_start :
  SDNode<"ISD::CALLSEQ_START", SDT_NVPTXCallSeqStart,
         [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
def callseq_end :
  SDNode<"ISD::CALLSEQ_END", SDT_NVPTXCallSeqEnd,
         [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPSideEffect]>;
3162
// NVPTXISD::CALL: no results, one i32 operand (the callee).
def SDT_NVPTXCall : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
def call :
  SDNode<"NVPTXISD::CALL", SDT_NVPTXCall,
         [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;

// Operand type used for the callee of CALL.
def calltarget : Operand<i32>;

// The call instruction itself carries no pattern; the two Pats below select
// it for global-address and external-symbol callees.
let isCall = 1 in
def CALL :
  NVPTXInst<(outs), (ins calltarget:$dst), "call \t$dst, (1);", []>;

def : Pat<(call tglobaladdr:$dst),
          (CALL tglobaladdr:$dst)>;
def : Pat<(call texternalsym:$dst),
          (CALL texternalsym:$dst)>;
3173
// Pseudo instructions.
// Pseudo forwards all of its arguments to NVPTXInst unchanged.
class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern> :
  NVPTXInst<outs, ins, asmstr, pattern>;

// Start of a call sequence: prints an opening brace followed by a comment
// carrying both amounts, then declares temp_param_reg inside the new scope.
def Callseq_Start :
  NVPTXInst<(outs),
            (ins i32imm:$amt1, i32imm:$amt2),
            "\\{ // callseq $amt1, $amt2\n"
            "\t.reg .b32 temp_param_reg;",
            [(callseq_start timm:$amt1, timm:$amt2)]>;

// End of a call sequence: prints the matching closing brace.
def Callseq_End :
  NVPTXInst<(outs),
            (ins i32imm:$amt1, i32imm:$amt2),
            "\\} // callseq $amt1",
            [(callseq_end timm:$amt1, timm:$amt2)]>;
3187
// Trap instruction: no operands, selected directly from the trap node.
def trapinst :
  NVPTXInst<(outs), (ins),
            "trap;",
            [(trap)]>;
3190
// Call prototype wrapper.
// The node has no results and a single integer operand.
def SDTCallPrototype : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
def CallPrototype : SDNode<"NVPTXISD::CallPrototype", SDTCallPrototype,
                           [SDNPHasChain, SDNPOutGlue, SDNPInGlue,
                            SDNPSideEffect]>;

// Operand printed through the printProtoIdent method.
let PrintMethod = "printProtoIdent" in
def ProtoIdent : Operand<i32>;

// The assembly text of this instruction is just the printed operand.
def CALL_PROTOTYPE : NVPTXInst<(outs), (ins ProtoIdent:$ident),
                               "$ident",
                               [(CallPrototype (i32 texternalsym:$ident))]>;
3202
3203
3204include "NVPTXIntrinsics.td"
3205
3206
3207//-----------------------------------
3208// Notes
3209//-----------------------------------
// BSWAP is currently expanded. The following would be more efficient:
// - for < sm_20, use a vector/scalar mov, as Tesla supports native 16-bit
//   registers;
// - for sm_20, use prmt (with a vector/scalar mov to do the pack and
//   unpack). sm_20 supports native 32-bit registers, but not native 16-bit
//   registers.
3215