//===- NVPTXInstrInfo.td - NVPTX Instruction defs -------------*- tblgen-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file describes the PTX instructions in TableGen format.
//
//===----------------------------------------------------------------------===//

include "NVPTXInstrFormats.td"

// A NOP instruction: no operands, an empty assembly string and no selection
// pattern.
let hasSideEffects = false in
def NOP : NVPTXInst<(outs), (ins), "", []>;

// Immediate operand of type f16.
let OperandType = "OPERAND_IMMEDIATE" in
def f16imm : Operand<f16>;

// List of vector specific properties
def isVecLD      : VecInstTypeEnum<1>;
def isVecST      : VecInstTypeEnum<2>;
def isVecBuild   : VecInstTypeEnum<3>;
def isVecShuffle : VecInstTypeEnum<4>;
def isVecExtract : VecInstTypeEnum<5>;
def isVecInsert  : VecInstTypeEnum<6>;
def isVecDest    : VecInstTypeEnum<7>;
def isVecOther   : VecInstTypeEnum<15>;

//===----------------------------------------------------------------------===//
// NVPTX Operand Definitions.
//===----------------------------------------------------------------------===//

// Branch target operand.
def brtarget : Operand<OtherVT>;

// CVT conversion modes
// These must match the enum in NVPTX.h
def CvtNONE : PatLeaf<(i32 0x0)>;
def CvtRNI  : PatLeaf<(i32 0x1)>;
def CvtRZI  : PatLeaf<(i32 0x2)>;
def CvtRMI  : PatLeaf<(i32 0x3)>;
def CvtRPI  : PatLeaf<(i32 0x4)>;
def CvtRN   : PatLeaf<(i32 0x5)>;
def CvtRZ   : PatLeaf<(i32 0x6)>;
def CvtRM   : PatLeaf<(i32 0x7)>;
def CvtRP   : PatLeaf<(i32 0x8)>;
def CvtRNA  : PatLeaf<(i32 0x9)>;

// The *_FTZ leaves are the base values above with bit 0x10 set.
def CvtNONE_FTZ : PatLeaf<(i32 0x10)>;
def CvtRNI_FTZ  : PatLeaf<(i32 0x11)>;
def CvtRZI_FTZ  : PatLeaf<(i32 0x12)>;
def CvtRMI_FTZ  : PatLeaf<(i32 0x13)>;
def CvtRPI_FTZ  : PatLeaf<(i32 0x14)>;
def CvtRN_FTZ   : PatLeaf<(i32 0x15)>;
def CvtRZ_FTZ   : PatLeaf<(i32 0x16)>;
def CvtRM_FTZ   : PatLeaf<(i32 0x17)>;
def CvtRP_FTZ   : PatLeaf<(i32 0x18)>;

def CvtSAT     : PatLeaf<(i32 0x20)>;
def CvtSAT_FTZ : PatLeaf<(i32 0x30)>;

// The *_RELU leaves are the base values above with bit 0x40 set.
def CvtNONE_RELU : PatLeaf<(i32 0x40)>;
def CvtRN_RELU   : PatLeaf<(i32 0x45)>;
def CvtRZ_RELU   : PatLeaf<(i32 0x46)>;

// Conversion-mode operand; rendered by the printer's printCvtMode method.
def CvtMode : Operand<i32> {
  let PrintMethod = "printCvtMode";
}

// Compare modes
// These must match the enum in NVPTX.h
def CmpEQ  : PatLeaf<(i32 0)>;
def CmpNE  : PatLeaf<(i32 1)>;
def CmpLT  : PatLeaf<(i32 2)>;
def CmpLE  : PatLeaf<(i32 3)>;
def CmpGT  : PatLeaf<(i32 4)>;
def CmpGE  : PatLeaf<(i32 5)>;
def CmpEQU : PatLeaf<(i32 10)>;
def CmpNEU : PatLeaf<(i32 11)>;
def CmpLTU : PatLeaf<(i32 12)>;
def CmpLEU : PatLeaf<(i32 13)>;
def CmpGTU : PatLeaf<(i32 14)>;
def CmpGEU : PatLeaf<(i32 15)>;
def CmpNUM : PatLeaf<(i32 16)>;
def CmpNAN : PatLeaf<(i32 17)>;

// The *_FTZ leaves are the base values above with bit 0x100 set.
def CmpEQ_FTZ  : PatLeaf<(i32 0x100)>;
def CmpNE_FTZ  : PatLeaf<(i32 0x101)>;
def CmpLT_FTZ  : PatLeaf<(i32 0x102)>;
def CmpLE_FTZ  : PatLeaf<(i32 0x103)>;
def CmpGT_FTZ  : PatLeaf<(i32 0x104)>;
def CmpGE_FTZ  : PatLeaf<(i32 0x105)>;
def CmpEQU_FTZ : PatLeaf<(i32 0x10A)>;
def CmpNEU_FTZ : PatLeaf<(i32 0x10B)>;
def CmpLTU_FTZ : PatLeaf<(i32 0x10C)>;
def CmpLEU_FTZ : PatLeaf<(i32 0x10D)>;
def CmpGTU_FTZ : PatLeaf<(i32 0x10E)>;
def CmpGEU_FTZ : PatLeaf<(i32 0x10F)>;
def CmpNUM_FTZ : PatLeaf<(i32 0x110)>;
def CmpNAN_FTZ : PatLeaf<(i32 0x111)>;

// Compare-mode operand; rendered by the printer's printCmpMode method.
def CmpMode : Operand<i32> {
  let PrintMethod = "printCmpMode";
}
// Vector-element operand; rendered by the printer's printVecElement method.
def VecElement : Operand<i32> {
  let PrintMethod = "printVecElement";
}

//===----------------------------------------------------------------------===//
// NVPTX Instruction Predicate Definitions
//===----------------------------------------------------------------------===//

// Feature queries forwarded to the subtarget.
def hasAtomAddF64    : Predicate<"Subtarget->hasAtomAddF64()">;
def hasAtomScope     : Predicate<"Subtarget->hasAtomScope()">;
def hasAtomBitwise64 : Predicate<"Subtarget->hasAtomBitwise64()">;
def hasAtomMinMax64  : Predicate<"Subtarget->hasAtomMinMax64()">;
def hasVote          : Predicate<"Subtarget->hasVote()">;
def hasDouble        : Predicate<"Subtarget->hasDouble()">;
def hasLDG           : Predicate<"Subtarget->hasLDG()">;
def hasLDU           : Predicate<"Subtarget->hasLDU()">;

// Each of the following pairs is a condition and its negation.
def doF32FTZ   : Predicate<"useF32FTZ()">;
def doNoF32FTZ : Predicate<"!useF32FTZ()">;

def doMulWide : Predicate<"doMulWide">;

def allowFMA          : Predicate<"allowFMA()">;
def noFMA             : Predicate<"!allowFMA()">;
def allowUnsafeFPMath : Predicate<"allowUnsafeFPMath()">;

def do_DIVF32_APPROX : Predicate<"getDivF32Level()==0">;
def do_DIVF32_FULL   : Predicate<"getDivF32Level()==1">;

def do_SQRTF32_APPROX : Predicate<"!usePrecSqrtF32()">;
def do_SQRTF32_RN     : Predicate<"usePrecSqrtF32()">;

def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">;
def noHWROT32  : Predicate<"!Subtarget->hasHWROT32()">;

// Always-true predicate.
def True : Predicate<"true">;

// Minimum PTX version checks.
def hasPTX31 : Predicate<"Subtarget->getPTXVersion() >= 31">;
def hasPTX42 : Predicate<"Subtarget->getPTXVersion() >= 42">;
def hasPTX43 : Predicate<"Subtarget->getPTXVersion() >= 43">;
def hasPTX60 : Predicate<"Subtarget->getPTXVersion() >= 60">;
def hasPTX61 : Predicate<"Subtarget->getPTXVersion() >= 61">;
def hasPTX63 : Predicate<"Subtarget->getPTXVersion() >= 63">;
def hasPTX64 : Predicate<"Subtarget->getPTXVersion() >= 64">;
def hasPTX65 : Predicate<"Subtarget->getPTXVersion() >= 65">;
def hasPTX70 : Predicate<"Subtarget->getPTXVersion() >= 70">;
def hasPTX71 : Predicate<"Subtarget->getPTXVersion() >= 71">;
def hasPTX72 : Predicate<"Subtarget->getPTXVersion() >= 72">;

// Minimum SM version checks.
def hasSM30 : Predicate<"Subtarget->getSmVersion() >= 30">;
def hasSM32 : Predicate<"Subtarget->getSmVersion() >= 32">;
def hasSM53 : Predicate<"Subtarget->getSmVersion() >= 53">;
def hasSM70 : Predicate<"Subtarget->getSmVersion() >= 70">;
def hasSM72 : Predicate<"Subtarget->getSmVersion() >= 72">;
def hasSM75 : Predicate<"Subtarget->getSmVersion() >= 75">;
def hasSM80 : Predicate<"Subtarget->getSmVersion() >= 80">;
def hasSM86 : Predicate<"Subtarget->getSmVersion() >= 86">;

// non-sync shfl instructions are not available on sm_70+ in PTX6.4+
def hasSHFL : Predicate<"!(Subtarget->getSmVersion() >= 70"
                        "&& Subtarget->getPTXVersion() >= 64)">;

def useShortPtr : Predicate<"useShortPointers()">;
def useFP16Math : Predicate<"Subtarget->allowFP16Math()">;

//===----------------------------------------------------------------------===//
// Some Common Instruction Class Templates
//===----------------------------------------------------------------------===//

// Template for instructions which take three int64, int32, or int16 args.
// The instructions are named "<OpcStr><Width>" (e.g. "add.s64").
multiclass I3<string OpcStr, SDNode OpNode> {
  // 64-bit: register/register and register/immediate.
  def i64rr :
    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
              OpcStr # "64 \t$dst, $a, $b;",
              [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>;
  def i64ri :
    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
              OpcStr # "64 \t$dst, $a, $b;",
              [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;

  // 32-bit: register/register and register/immediate.
  def i32rr :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # "32 \t$dst, $a, $b;",
              [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
  def i32ri :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
              OpcStr # "32 \t$dst, $a, $b;",
              [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;

  // 16-bit: register/register and register/immediate.
  def i16rr :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
              OpcStr # "16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>;
  def i16ri :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
              OpcStr # "16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (imm):$b))]>;
}

// Template for instructions which take 3 int32 or int64 args. The instructions
// are named "<OpcStr>.s32" or "<OpcStr>.s64" (e.g. "addc.cc.s32").
multiclass ADD_SUB_INT_CARRY<string OpcStr, SDNode OpNode> {
  // Every variant is flagged as having side effects.
  let hasSideEffects = 1 in {
    def i32rr :
      NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
                OpcStr # ".s32 \t$dst, $a, $b;",
                [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
    def i32ri :
      NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
                OpcStr # ".s32 \t$dst, $a, $b;",
                [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;

    // The 64-bit forms are only selectable when hasPTX43 holds.
    def i64rr :
      NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
                OpcStr # ".s64 \t$dst, $a, $b;",
                [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>,
      Requires<[hasPTX43]>;
    def i64ri :
      NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
                OpcStr # ".s64 \t$dst, $a, $b;",
                [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>,
      Requires<[hasPTX43]>;
  }
}

// Template for instructions which take three fp64, fp32, fp16 or fp16x2 args.
// The instructions are named "<OpcStr>.f<Width>" (e.g. "min.f64").
//
// Also defines ftz (flush subnormal inputs and results to sign-preserving
// zero) variants for fp32, fp16 and fp16x2 functions.
//
// This multiclass should be used for nodes that cannot be folded into FMAs.
// For nodes that can be folded into FMAs (i.e. adds and muls), use
// F3_fma_component.
multiclass F3<string OpcStr, SDNode OpNode> {
  // fp64 variants.
  def f64rr :
    NVPTXInst<(outs Float64Regs:$dst),
              (ins Float64Regs:$a, Float64Regs:$b),
              OpcStr # ".f64 \t$dst, $a, $b;",
              [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>;
  def f64ri :
    NVPTXInst<(outs Float64Regs:$dst),
              (ins Float64Regs:$a, f64imm:$b),
              OpcStr # ".f64 \t$dst, $a, $b;",
              [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>;

  // fp32 variants; the ftz forms are only selectable under doF32FTZ.
  def f32rr_ftz :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins Float32Regs:$a, Float32Regs:$b),
              OpcStr # ".ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
    Requires<[doF32FTZ]>;
  def f32ri_ftz :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins Float32Regs:$a, f32imm:$b),
              OpcStr # ".ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
    Requires<[doF32FTZ]>;
  def f32rr :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins Float32Regs:$a, Float32Regs:$b),
              OpcStr # ".f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>;
  def f32ri :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins Float32Regs:$a, f32imm:$b),
              OpcStr # ".f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>;

  // fp16 variants (register/register only), gated on useFP16Math.
  def f16rr_ftz :
    NVPTXInst<(outs Float16Regs:$dst),
              (ins Float16Regs:$a, Float16Regs:$b),
              OpcStr # ".ftz.f16 \t$dst, $a, $b;",
              [(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>,
    Requires<[useFP16Math, doF32FTZ]>;
  def f16rr :
    NVPTXInst<(outs Float16Regs:$dst),
              (ins Float16Regs:$a, Float16Regs:$b),
              OpcStr # ".f16 \t$dst, $a, $b;",
              [(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>,
    Requires<[useFP16Math]>;

  // fp16x2 variants (register/register only), gated on useFP16Math.
  def f16x2rr_ftz :
    NVPTXInst<(outs Float16x2Regs:$dst),
              (ins Float16x2Regs:$a, Float16x2Regs:$b),
              OpcStr # ".ftz.f16x2 \t$dst, $a, $b;",
              [(set Float16x2Regs:$dst, (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,
    Requires<[useFP16Math, doF32FTZ]>;
  def f16x2rr :
    NVPTXInst<(outs Float16x2Regs:$dst),
              (ins Float16x2Regs:$a, Float16x2Regs:$b),
              OpcStr # ".f16x2 \t$dst, $a, $b;",
              [(set Float16x2Regs:$dst, (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,
    Requires<[useFP16Math]>;
}

// Template for instructions which take three FP args.  The
// instructions are named "<OpcStr>.f<Width>" (e.g. "add.f64").
//
// Also defines ftz (flush subnormal inputs and results to sign-preserving
// zero) variants for fp32/fp16 functions.
//
// This multiclass should be used for nodes that can be folded to make fma ops.
// In this case, we use the ".rn" variant when FMA is disabled, as this behaves
// just like the non ".rn" op, but prevents ptxas from creating FMAs.
multiclass F3_fma_component<string OpcStr, SDNode OpNode> {
  // Variants without a rounding modifier; selectable only under allowFMA.
  def f64rr :
    NVPTXInst<(outs Float64Regs:$dst),
              (ins Float64Regs:$a, Float64Regs:$b),
              OpcStr # ".f64 \t$dst, $a, $b;",
              [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>,
    Requires<[allowFMA]>;
  def f64ri :
    NVPTXInst<(outs Float64Regs:$dst),
              (ins Float64Regs:$a, f64imm:$b),
              OpcStr # ".f64 \t$dst, $a, $b;",
              [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>,
    Requires<[allowFMA]>;
  def f32rr_ftz :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins Float32Regs:$a, Float32Regs:$b),
              OpcStr # ".ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
    Requires<[allowFMA, doF32FTZ]>;
  def f32ri_ftz :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins Float32Regs:$a, f32imm:$b),
              OpcStr # ".ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
    Requires<[allowFMA, doF32FTZ]>;
  def f32rr :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins Float32Regs:$a, Float32Regs:$b),
              OpcStr # ".f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
    Requires<[allowFMA]>;
  def f32ri :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins Float32Regs:$a, f32imm:$b),
              OpcStr # ".f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
    Requires<[allowFMA]>;

  def f16rr_ftz :
    NVPTXInst<(outs Float16Regs:$dst),
              (ins Float16Regs:$a, Float16Regs:$b),
              OpcStr # ".ftz.f16 \t$dst, $a, $b;",
              [(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>,
    Requires<[useFP16Math, allowFMA, doF32FTZ]>;
  def f16rr :
    NVPTXInst<(outs Float16Regs:$dst),
              (ins Float16Regs:$a, Float16Regs:$b),
              OpcStr # ".f16 \t$dst, $a, $b;",
              [(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>,
    Requires<[useFP16Math, allowFMA]>;

  def f16x2rr_ftz :
    NVPTXInst<(outs Float16x2Regs:$dst),
              (ins Float16x2Regs:$a, Float16x2Regs:$b),
              OpcStr # ".ftz.f16x2 \t$dst, $a, $b;",
              [(set Float16x2Regs:$dst, (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,
    Requires<[useFP16Math, allowFMA, doF32FTZ]>;
  def f16x2rr :
    NVPTXInst<(outs Float16x2Regs:$dst),
              (ins Float16x2Regs:$a, Float16x2Regs:$b),
              OpcStr # ".f16x2 \t$dst, $a, $b;",
              [(set Float16x2Regs:$dst, (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,
    Requires<[useFP16Math, allowFMA]>;

  // ".rn" variants; selectable only under noFMA.
  // These have strange names so we don't perturb existing mir tests.
  def _rnf64rr :
    NVPTXInst<(outs Float64Regs:$dst),
              (ins Float64Regs:$a, Float64Regs:$b),
              OpcStr # ".rn.f64 \t$dst, $a, $b;",
              [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>,
    Requires<[noFMA]>;
  def _rnf64ri :
    NVPTXInst<(outs Float64Regs:$dst),
              (ins Float64Regs:$a, f64imm:$b),
              OpcStr # ".rn.f64 \t$dst, $a, $b;",
              [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>,
    Requires<[noFMA]>;
  def _rnf32rr_ftz :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins Float32Regs:$a, Float32Regs:$b),
              OpcStr # ".rn.ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
    Requires<[noFMA, doF32FTZ]>;
  def _rnf32ri_ftz :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins Float32Regs:$a, f32imm:$b),
              OpcStr # ".rn.ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
    Requires<[noFMA, doF32FTZ]>;
  def _rnf32rr :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins Float32Regs:$a, Float32Regs:$b),
              OpcStr # ".rn.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
    Requires<[noFMA]>;
  def _rnf32ri :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins Float32Regs:$a, f32imm:$b),
              OpcStr # ".rn.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
    Requires<[noFMA]>;
  def _rnf16rr_ftz :
    NVPTXInst<(outs Float16Regs:$dst),
              (ins Float16Regs:$a, Float16Regs:$b),
              OpcStr # ".rn.ftz.f16 \t$dst, $a, $b;",
              [(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>,
    Requires<[useFP16Math, noFMA, doF32FTZ]>;
  def _rnf16rr :
    NVPTXInst<(outs Float16Regs:$dst),
              (ins Float16Regs:$a, Float16Regs:$b),
              OpcStr # ".rn.f16 \t$dst, $a, $b;",
              [(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>,
    Requires<[useFP16Math, noFMA]>;
  def _rnf16x2rr_ftz :
    NVPTXInst<(outs Float16x2Regs:$dst),
              (ins Float16x2Regs:$a, Float16x2Regs:$b),
              OpcStr # ".rn.ftz.f16x2 \t$dst, $a, $b;",
              [(set Float16x2Regs:$dst, (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,
    Requires<[useFP16Math, noFMA, doF32FTZ]>;
  def _rnf16x2rr :
    NVPTXInst<(outs Float16x2Regs:$dst),
              (ins Float16x2Regs:$a, Float16x2Regs:$b),
              OpcStr # ".rn.f16x2 \t$dst, $a, $b;",
              [(set Float16x2Regs:$dst, (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,
    Requires<[useFP16Math, noFMA]>;
}

// Template for operations which take two f32 or f64 operands.  Provides three
// instructions: <OpcStr>.f64, <OpcStr>.f32, and <OpcStr>.ftz.f32 (flush
// subnormal inputs and results to zero).
multiclass F2<string OpcStr, SDNode OpNode> {
  def f64 :
    NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a),
              OpcStr # ".f64 \t$dst, $a;",
              [(set Float64Regs:$dst, (OpNode Float64Regs:$a))]>;
  def f32_ftz :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
              OpcStr # ".ftz.f32 \t$dst, $a;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>,
    Requires<[doF32FTZ]>;
  def f32 :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
              OpcStr # ".f32 \t$dst, $a;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>;
}

//===----------------------------------------------------------------------===//
// NVPTX Instructions.
//===----------------------------------------------------------------------===//

//-----------------------------------
// Type Conversion
//-----------------------------------

let hasSideEffects = false in {
  // Generate a cvt to the given type from all possible types.  Each instance
  // takes a CvtMode immediate that defines the conversion mode to use.  It can
  // be CvtNONE to omit a conversion mode.
  //
  // None of these carry a selection pattern.  The 8-bit and 16-bit integer
  // sources both use Int16Regs.
  multiclass CVT_FROM_ALL<string FromName, RegisterClass RC> {
    def _s8 :
      NVPTXInst<(outs RC:$dst),
                (ins Int16Regs:$src, CvtMode:$mode),
                "cvt${mode:base}${mode:ftz}${mode:sat}." #
                FromName # ".s8 \t$dst, $src;", []>;
    def _u8 :
      NVPTXInst<(outs RC:$dst),
                (ins Int16Regs:$src, CvtMode:$mode),
                "cvt${mode:base}${mode:ftz}${mode:sat}." #
                FromName # ".u8 \t$dst, $src;", []>;
    def _s16 :
      NVPTXInst<(outs RC:$dst),
                (ins Int16Regs:$src, CvtMode:$mode),
                "cvt${mode:base}${mode:ftz}${mode:sat}." #
                FromName # ".s16 \t$dst, $src;", []>;
    def _u16 :
      NVPTXInst<(outs RC:$dst),
                (ins Int16Regs:$src, CvtMode:$mode),
                "cvt${mode:base}${mode:ftz}${mode:sat}." #
                FromName # ".u16 \t$dst, $src;", []>;
    def _s32 :
      NVPTXInst<(outs RC:$dst),
                (ins Int32Regs:$src, CvtMode:$mode),
                "cvt${mode:base}${mode:ftz}${mode:sat}." #
                FromName # ".s32 \t$dst, $src;", []>;
    def _u32 :
      NVPTXInst<(outs RC:$dst),
                (ins Int32Regs:$src, CvtMode:$mode),
                "cvt${mode:base}${mode:ftz}${mode:sat}." #
                FromName # ".u32 \t$dst, $src;", []>;
    def _s64 :
      NVPTXInst<(outs RC:$dst),
                (ins Int64Regs:$src, CvtMode:$mode),
                "cvt${mode:base}${mode:ftz}${mode:sat}." #
                FromName # ".s64 \t$dst, $src;", []>;
    def _u64 :
      NVPTXInst<(outs RC:$dst),
                (ins Int64Regs:$src, CvtMode:$mode),
                "cvt${mode:base}${mode:ftz}${mode:sat}." #
                FromName # ".u64 \t$dst, $src;", []>;
    def _f16 :
      NVPTXInst<(outs RC:$dst),
                (ins Float16Regs:$src, CvtMode:$mode),
                "cvt${mode:base}${mode:ftz}${mode:sat}." #
                FromName # ".f16 \t$dst, $src;", []>;
    def _f32 :
      NVPTXInst<(outs RC:$dst),
                (ins Float32Regs:$src, CvtMode:$mode),
                "cvt${mode:base}${mode:ftz}${mode:sat}." #
                FromName # ".f32 \t$dst, $src;", []>;
    def _f64 :
      NVPTXInst<(outs RC:$dst),
                (ins Float64Regs:$src, CvtMode:$mode),
                "cvt${mode:base}${mode:ftz}${mode:sat}." #
                FromName # ".f64 \t$dst, $src;", []>;
  }

  // Generate cvts from all types to all types.
  defm CVT_s8  : CVT_FROM_ALL<"s8",  Int16Regs>;
  defm CVT_u8  : CVT_FROM_ALL<"u8",  Int16Regs>;
  defm CVT_s16 : CVT_FROM_ALL<"s16", Int16Regs>;
  defm CVT_u16 : CVT_FROM_ALL<"u16", Int16Regs>;
  defm CVT_s32 : CVT_FROM_ALL<"s32", Int32Regs>;
  defm CVT_u32 : CVT_FROM_ALL<"u32", Int32Regs>;
  defm CVT_s64 : CVT_FROM_ALL<"s64", Int64Regs>;
  defm CVT_u64 : CVT_FROM_ALL<"u64", Int64Regs>;
  defm CVT_f16 : CVT_FROM_ALL<"f16", Float16Regs>;
  defm CVT_f32 : CVT_FROM_ALL<"f32", Float32Regs>;
  defm CVT_f64 : CVT_FROM_ALL<"f64", Float64Regs>;

  // These cvts are different from those above: The source and dest registers
  // are of the same type.
  def CVT_INREG_s16_s8 :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
              "cvt.s16.s8 \t$dst, $src;", []>;
  def CVT_INREG_s32_s8 :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
              "cvt.s32.s8 \t$dst, $src;", []>;
  def CVT_INREG_s32_s16 :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
              "cvt.s32.s16 \t$dst, $src;", []>;
  def CVT_INREG_s64_s8 :
    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
              "cvt.s64.s8 \t$dst, $src;", []>;
  def CVT_INREG_s64_s16 :
    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
              "cvt.s64.s16 \t$dst, $src;", []>;
  def CVT_INREG_s64_s32 :
    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
              "cvt.s64.s32 \t$dst, $src;", []>;

  // cvt from a single f32 source; requires hasPTX70 and hasSM80.  The asm
  // string uses the mode's base and relu parts only.
  multiclass CVT_FROM_FLOAT_SM80<string FromName, RegisterClass RC> {
    def _f32 :
      NVPTXInst<(outs RC:$dst),
                (ins Float32Regs:$src, CvtMode:$mode),
                "cvt${mode:base}${mode:relu}." #
                FromName # ".f32 \t$dst, $src;", []>,
      Requires<[hasPTX70, hasSM80]>;
  }

  defm CVT_bf16 : CVT_FROM_FLOAT_SM80<"bf16", Int16Regs>;

  // cvt from two f32 sources into one destination register; requires hasPTX70
  // and hasSM80.
  multiclass CVT_FROM_FLOAT_V2_SM80<string FromName, RegisterClass RC> {
    def _f32 :
      NVPTXInst<(outs RC:$dst),
                (ins Float32Regs:$src1, Float32Regs:$src2, CvtMode:$mode),
                "cvt${mode:base}${mode:relu}." #
                FromName # ".f32 \t$dst, $src1, $src2;", []>,
      Requires<[hasPTX70, hasSM80]>;
  }

  defm CVT_f16x2  : CVT_FROM_FLOAT_V2_SM80<"f16x2", Float16x2Regs>;
  defm CVT_bf16x2 : CVT_FROM_FLOAT_V2_SM80<"bf16x2", Int32Regs>;
}

//-----------------------------------
// Integer Arithmetic
//-----------------------------------

// Template for xor masquerading as int1 arithmetic.
multiclass ADD_SUB_i1<SDNode OpNode> {
  def _rr :
    NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
              "xor.pred \t$dst, $a, $b;",
              [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>;
  def _ri :
    NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
              "xor.pred \t$dst, $a, $b;",
              [(set Int1Regs:$dst, (OpNode Int1Regs:$a, (imm):$b))]>;
}

// int1 addition and subtraction are both just xor.
defm ADD_i1 : ADD_SUB_i1<add>;
defm SUB_i1 : ADD_SUB_i1<sub>;

// int16, int32, and int64 signed addition.  Since nvptx is 2's complement, we
// also use these for unsigned arithmetic.
defm ADD : I3<"add.s", add>;
defm SUB : I3<"sub.s", sub>;

// int32 and int64 addition and subtraction with carry-out.
defm ADDCC : ADD_SUB_INT_CARRY<"add.cc", addc>;
defm SUBCC : ADD_SUB_INT_CARRY<"sub.cc", subc>;

// int32 and int64 addition and subtraction with carry-in and carry-out.
defm ADDCCC : ADD_SUB_INT_CARRY<"addc.cc", adde>;
defm SUBCCC : ADD_SUB_INT_CARRY<"subc.cc", sube>;

defm MULT : I3<"mul.lo.s", mul>;

defm MULTHS : I3<"mul.hi.s", mulhs>;
defm MULTHU : I3<"mul.hi.u", mulhu>;

defm SDIV : I3<"div.s", sdiv>;
defm UDIV : I3<"div.u", udiv>;

// The ri versions of rem.s and rem.u won't be selected; DAGCombiner::visitSREM
// will lower it.
defm SREM : I3<"rem.s", srem>;
defm UREM : I3<"rem.u", urem>;

// Integer absolute value.  Emits "abs<SizeName>" on register class RC and
// selects it for the generic abs node.
multiclass ABS<RegisterClass RC, string SizeName> {
  def : NVPTXInst<(outs RC:$dst), (ins RC:$a),
                  "abs" # SizeName # " \t$dst, $a;",
                  [(set RC:$dst, (abs RC:$a))]>;
}
defm ABS_16 : ABS<Int16Regs, ".s16">;
defm ABS_32 : ABS<Int32Regs, ".s32">;
defm ABS_64 : ABS<Int64Regs, ".s64">;

// Integer min/max.
defm SMAX : I3<"max.s", smax>;
defm UMAX : I3<"max.u", umax>;
defm SMIN : I3<"min.s", smin>;
defm UMIN : I3<"min.u", umin>;

//
// Wide multiplication
//
// The instructions below carry no pattern of their own; they are selected
// through the explicit Pat<> definitions in this file.

// 32 x 32 -> 64, signed.
def MULWIDES64 :
  NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
            "mul.wide.s32 \t$dst, $a, $b;", []>;
def MULWIDES64Imm :
  NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
            "mul.wide.s32 \t$dst, $a, $b;", []>;
def MULWIDES64Imm64 :
  NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
            "mul.wide.s32 \t$dst, $a, $b;", []>;

// 32 x 32 -> 64, unsigned.
def MULWIDEU64 :
  NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
            "mul.wide.u32 \t$dst, $a, $b;", []>;
def MULWIDEU64Imm :
  NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
            "mul.wide.u32 \t$dst, $a, $b;", []>;
def MULWIDEU64Imm64 :
  NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
            "mul.wide.u32 \t$dst, $a, $b;", []>;

// 16 x 16 -> 32, signed.
def MULWIDES32 :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
            "mul.wide.s16 \t$dst, $a, $b;", []>;
def MULWIDES32Imm :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
            "mul.wide.s16 \t$dst, $a, $b;", []>;
def MULWIDES32Imm32 :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
            "mul.wide.s16 \t$dst, $a, $b;", []>;

// 16 x 16 -> 32, unsigned.
def MULWIDEU32 :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
            "mul.wide.u16 \t$dst, $a, $b;", []>;
def MULWIDEU32Imm :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
            "mul.wide.u16 \t$dst, $a, $b;", []>;
def MULWIDEU32Imm32 :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
            "mul.wide.u16 \t$dst, $a, $b;", []>;

// Target nodes for wide multiplies: one result, two operands of the same type.
def SDTMulWide : SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>]>;
def mul_wide_signed   : SDNode<"NVPTXISD::MUL_WIDE_SIGNED", SDTMulWide>;
def mul_wide_unsigned : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide>;

// Matchers for signed, unsigned mul.wide ISD nodes.
let Predicates = [doMulWide] in {
  def : Pat<(i32 (mul_wide_signed Int16Regs:$a, Int16Regs:$b)),
            (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>;
  def : Pat<(i32 (mul_wide_signed Int16Regs:$a, imm:$b)),
            (MULWIDES32Imm Int16Regs:$a, imm:$b)>;
  def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, Int16Regs:$b)),
            (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>;
  def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, imm:$b)),
            (MULWIDEU32Imm Int16Regs:$a, imm:$b)>;

  def : Pat<(i64 (mul_wide_signed Int32Regs:$a, Int32Regs:$b)),
            (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>;
  def : Pat<(i64 (mul_wide_signed Int32Regs:$a, imm:$b)),
            (MULWIDES64Imm Int32Regs:$a, imm:$b)>;
  def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, Int32Regs:$b)),
            (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>;
  def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, imm:$b)),
            (MULWIDEU64Imm Int32Regs:$a, imm:$b)>;
}

// Predicates used for converting some patterns to mul.wide.
// Immediate fits in a signed 32-bit integer.
def SInt32Const : PatLeaf<(imm), [{
  const APInt &v = N->getAPIntValue();
  return v.isSignedIntN(32);
}]>;

// Immediate fits in an unsigned 32-bit integer.
def UInt32Const : PatLeaf<(imm), [{
  const APInt &v = N->getAPIntValue();
  return v.isIntN(32);
}]>;

// Immediate fits in a signed 16-bit integer.
def SInt16Const : PatLeaf<(imm), [{
  const APInt &v = N->getAPIntValue();
  return v.isSignedIntN(16);
}]>;

// Immediate fits in an unsigned 16-bit integer.
def UInt16Const : PatLeaf<(imm), [{
  const APInt &v = N->getAPIntValue();
  return v.isIntN(16);
}]>;

def Int5Const : PatLeaf<(imm), [{
  // Check if 0 <= v < 32; only then will the result of (x << v) be an int32.
  const APInt &v = N->getAPIntValue();
  return v.sge(0) && v.slt(32);
}]>;

def Int4Const : PatLeaf<(imm), [{
  // Check if 0 <= v < 16; only then will the result of (x << v) be an int16.
  const APInt &v = N->getAPIntValue();
  return v.sge(0) && v.slt(16);
}]>;

// Shift amounts that may be rewritten into a *signed* mul.wide.  The
// multiplier 1 << v is emitted as an immediate of the narrow signed type, so
// it must stay positive there: v == 31 would give 0x80000000, i.e. -2^31 as an
// s32, and the product would have the wrong sign.
def ShlAmtForMulWideS32 : PatLeaf<(imm), [{
  // Check if 0 <= v < 31 so that (1 << v) is a positive s32.
  const APInt &v = N->getAPIntValue();
  return v.sge(0) && v.slt(31);
}]>;

// Same restriction for the 16-bit signed form: v == 15 would give 0x8000,
// i.e. -2^15 as an s16.
def ShlAmtForMulWideS16 : PatLeaf<(imm), [{
  // Check if 0 <= v < 15 so that (1 << v) is a positive s16.
  const APInt &v = N->getAPIntValue();
  return v.sge(0) && v.slt(15);
}]>;

// Rewrites a shift amount v into the i32 target constant (1 << v).
def SHL2MUL32 : SDNodeXForm<imm, [{
  const APInt &v = N->getAPIntValue();
  APInt temp(32, 1);
  return CurDAG->getTargetConstant(temp.shl(v), SDLoc(N), MVT::i32);
}]>;

// Rewrites a shift amount v into the i16 target constant (1 << v).
def SHL2MUL16 : SDNodeXForm<imm, [{
  const APInt &v = N->getAPIntValue();
  APInt temp(16, 1);
  return CurDAG->getTargetConstant(temp.shl(v), SDLoc(N), MVT::i16);
}]>;

// Convert "sign/zero-extend, then shift left by an immediate" to mul.wide.
//
// The signed forms use the narrower ShlAmtForMulWideS* leaves; a shift by the
// top bit position is left to the generic extend + shl selection.  The
// unsigned forms keep the full range.
def : Pat<(shl (sext Int32Regs:$a), (i32 ShlAmtForMulWideS32:$b)),
          (MULWIDES64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>,
      Requires<[doMulWide]>;
def : Pat<(shl (zext Int32Regs:$a), (i32 Int5Const:$b)),
          (MULWIDEU64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>,
      Requires<[doMulWide]>;

def : Pat<(shl (sext Int16Regs:$a), (i16 ShlAmtForMulWideS16:$b)),
          (MULWIDES32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>,
      Requires<[doMulWide]>;
def : Pat<(shl (zext Int16Regs:$a), (i16 Int4Const:$b)),
          (MULWIDEU32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>,
      Requires<[doMulWide]>;

// Convert "sign/zero-extend then multiply" to mul.wide.
// One instantiation per (extension kind, source width).  Each contributes the
// reg*reg pattern followed by the reg*imm pattern; ImmLeaf restricts the
// immediate to constants that fit the narrow type.  All patterns are gated on
// doMulWide.
multiclass MULWIDE_PATS<SDNode ExtOp, RegisterClass NarrowRC, ValueType WideVT,
                        PatLeaf ImmLeaf, Instruction InstRR,
                        Instruction InstRI> {
  def : Pat<(mul (ExtOp NarrowRC:$a), (ExtOp NarrowRC:$b)),
            (InstRR NarrowRC:$a, NarrowRC:$b)>,
        Requires<[doMulWide]>;
  def : Pat<(mul (ExtOp NarrowRC:$a), (WideVT ImmLeaf:$b)),
            (InstRI NarrowRC:$a, (WideVT ImmLeaf:$b))>,
        Requires<[doMulWide]>;
}

// i32 * i32 -> i64
defm : MULWIDE_PATS<sext, Int32Regs, i64, SInt32Const,
                    MULWIDES64, MULWIDES64Imm64>;
defm : MULWIDE_PATS<zext, Int32Regs, i64, UInt32Const,
                    MULWIDEU64, MULWIDEU64Imm64>;

// i16 * i16 -> i32
defm : MULWIDE_PATS<sext, Int16Regs, i32, SInt16Const,
                    MULWIDES32, MULWIDES32Imm32>;
defm : MULWIDE_PATS<zext, Int16Regs, i32, UInt16Const,
                    MULWIDEU32, MULWIDEU32Imm32>;

//
// Integer multiply-add
//

// One integer result and three operands, all constrained to the same type.
def SDTIMAD : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>,
                                   SDTCisInt<0>,
                                   SDTCisInt<2>,
                                   SDTCisSameAs<0, 2>,
                                   SDTCisSameAs<0, 3>]>;
def imad : SDNode<"NVPTXISD::IMAD", SDTIMAD>;

// mad.lo for one integer width.  $a is always a register; $b and $c may each
// be a register (r) or an immediate (i), giving the rrr/rri/rir/rii variants.
multiclass MAD_LO<string TypeStr, RegisterClass RC, Operand ImmCls> {
  def rrr : NVPTXInst<(outs RC:$dst),
                      (ins RC:$a, RC:$b, RC:$c),
                      "mad.lo." # TypeStr # " \t$dst, $a, $b, $c;",
                      [(set RC:$dst, (imad RC:$a, RC:$b, RC:$c))]>;
  def rri : NVPTXInst<(outs RC:$dst),
                      (ins RC:$a, RC:$b, ImmCls:$c),
                      "mad.lo." # TypeStr # " \t$dst, $a, $b, $c;",
                      [(set RC:$dst, (imad RC:$a, RC:$b, imm:$c))]>;
  def rir : NVPTXInst<(outs RC:$dst),
                      (ins RC:$a, ImmCls:$b, RC:$c),
                      "mad.lo." # TypeStr # " \t$dst, $a, $b, $c;",
                      [(set RC:$dst, (imad RC:$a, imm:$b, RC:$c))]>;
  def rii : NVPTXInst<(outs RC:$dst),
                      (ins RC:$a, ImmCls:$b, ImmCls:$c),
                      "mad.lo." # TypeStr # " \t$dst, $a, $b, $c;",
                      [(set RC:$dst, (imad RC:$a, imm:$b, imm:$c))]>;
}

defm MAD16 : MAD_LO<"s16", Int16Regs, i16imm>;
defm MAD32 : MAD_LO<"s32", Int32Regs, i32imm>;
defm MAD64 : MAD_LO<"s64", Int64Regs, i64imm>;

// Integer negation, one definition per register width.
def INEG16 : NVPTXInst<(outs Int16Regs:$dst),
                       (ins Int16Regs:$src),
                       "neg.s16 \t$dst, $src;",
                       [(set Int16Regs:$dst, (ineg Int16Regs:$src))]>;
def INEG32 : NVPTXInst<(outs Int32Regs:$dst),
                       (ins Int32Regs:$src),
                       "neg.s32 \t$dst, $src;",
                       [(set Int32Regs:$dst, (ineg Int32Regs:$src))]>;
def INEG64 : NVPTXInst<(outs Int64Regs:$dst),
                       (ins Int64Regs:$src),
                       "neg.s64 \t$dst, $src;",
                       [(set Int64Regs:$dst, (ineg Int64Regs:$src))]>;

//-----------------------------------
// Floating Point Arithmetic
//-----------------------------------

// Single-precision constant exactly equal to 1.0f.  The semantics check comes
// first so the value is only converted when it is an IEEE single.
def FloatConst1 : PatLeaf<(fpimm), [{
  const llvm::APFloat &Val = N->getValueAPF();
  return &Val.getSemantics() == &llvm::APFloat::IEEEsingle() &&
         Val.convertToFloat() == 1.0f;
}]>;
// Double-precision constant exactly equal to 1.0.
def DoubleConst1 : PatLeaf<(fpimm), [{
  const llvm::APFloat &Val = N->getValueAPF();
  return &Val.getSemantics() == &llvm::APFloat::IEEEdouble() &&
         Val.convertToDouble() == 1.0;
}]>;

// Materializes an FP16 constant in a register.
//
// ptxas has no hex representation for fp16, so fp16 immediates cannot appear
// directly in .f16 instructions; the constant is moved into a register with
// mov.b16 instead.  No selection pattern is attached here.
def LOAD_CONST_F16 : NVPTXInst<(outs Float16Regs:$dst),
                               (ins f16imm:$a),
                               "mov.b16 \t$dst, $a;",
                               []>;

defm FADD : F3_fma_component<"add", fadd>;
defm FSUB : F3_fma_component<"sub", fsub>;
defm FMUL : F3_fma_component<"mul", fmul>;

defm FMIN : F3<"min", fminnum>;
defm FMAX : F3<"max", fmaxnum>;
// Note: min.NaN.f64 and max.NaN.f64 do not actually exist.
defm FMINNAN : F3<"min.NaN", fminimum>;
defm FMAXNAN : F3<"max.NaN", fmaximum>;

defm FABS  : F2<"abs", fabs>;
defm FNEG  : F2<"neg", fneg>;
defm FSQRT : F2<"sqrt.rn", fsqrt>;

//
// F64 division
//
// FDIV641r matches 1.0 / b and prints only $b; $a exists to bind the constant.
def FDIV641r : NVPTXInst<(outs Float64Regs:$dst),
                         (ins f64imm:$a, Float64Regs:$b),
                         "rcp.rn.f64 \t$dst, $b;",
                         [(set Float64Regs:$dst,
                               (fdiv DoubleConst1:$a, Float64Regs:$b))]>;
def FDIV64rr : NVPTXInst<(outs Float64Regs:$dst),
                         (ins Float64Regs:$a, Float64Regs:$b),
                         "div.rn.f64 \t$dst, $a, $b;",
                         [(set Float64Regs:$dst,
                               (fdiv Float64Regs:$a, Float64Regs:$b))]>;
def FDIV64ri : NVPTXInst<(outs Float64Regs:$dst),
                         (ins Float64Regs:$a, f64imm:$b),
                         "div.rn.f64 \t$dst, $a, $b;",
                         [(set Float64Regs:$dst,
                               (fdiv Float64Regs:$a, fpimm:$b))]>;

//
// F32 division and reciprocal
//
// The three shapes below are shared by every F32 variant; only the mnemonic
// and the predicates differ.
// NOTE(review): many of these definitions share one source pattern and differ
// only in predicates, so their relative order is kept exactly as it was in
// case selection priority depends on it.

// 1.0f / b, printed as a reciprocal of $b ($a only binds the constant).
class FDIV32_RCP_INST<string Ptx>
  : NVPTXInst<(outs Float32Regs:$dst),
              (ins f32imm:$a, Float32Regs:$b),
              Ptx # " \t$dst, $b;",
              [(set Float32Regs:$dst,
                    (fdiv FloatConst1:$a, Float32Regs:$b))]>;

// a / b with both operands in registers.
class FDIV32_RR_INST<string Ptx>
  : NVPTXInst<(outs Float32Regs:$dst),
              (ins Float32Regs:$a, Float32Regs:$b),
              Ptx # " \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                    (fdiv Float32Regs:$a, Float32Regs:$b))]>;

// a / b with an immediate divisor.
class FDIV32_RI_INST<string Ptx>
  : NVPTXInst<(outs Float32Regs:$dst),
              (ins Float32Regs:$a, f32imm:$b),
              Ptx # " \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>;

//
// F32 Approximate reciprocal
//
def FDIV321r_ftz : FDIV32_RCP_INST<"rcp.approx.ftz.f32">,
                   Requires<[do_DIVF32_APPROX, doF32FTZ]>;
def FDIV321r     : FDIV32_RCP_INST<"rcp.approx.f32">,
                   Requires<[do_DIVF32_APPROX]>;
//
// F32 Approximate division
//
def FDIV32approxrr_ftz : FDIV32_RR_INST<"div.approx.ftz.f32">,
                         Requires<[do_DIVF32_APPROX, doF32FTZ]>;
def FDIV32approxri_ftz : FDIV32_RI_INST<"div.approx.ftz.f32">,
                         Requires<[do_DIVF32_APPROX, doF32FTZ]>;
def FDIV32approxrr     : FDIV32_RR_INST<"div.approx.f32">,
                         Requires<[do_DIVF32_APPROX]>;
def FDIV32approxri     : FDIV32_RI_INST<"div.approx.f32">,
                         Requires<[do_DIVF32_APPROX]>;
//
// F32 Semi-accurate reciprocal
//
// rcp.approx gives the same result as div.full(1.0f, a) and is faster.
//
def FDIV321r_approx_ftz : FDIV32_RCP_INST<"rcp.approx.ftz.f32">,
                          Requires<[do_DIVF32_FULL, doF32FTZ]>;
def FDIV321r_approx     : FDIV32_RCP_INST<"rcp.approx.f32">,
                          Requires<[do_DIVF32_FULL]>;
//
// F32 Semi-accurate division
//
def FDIV32rr_ftz : FDIV32_RR_INST<"div.full.ftz.f32">,
                   Requires<[do_DIVF32_FULL, doF32FTZ]>;
def FDIV32ri_ftz : FDIV32_RI_INST<"div.full.ftz.f32">,
                   Requires<[do_DIVF32_FULL, doF32FTZ]>;
def FDIV32rr     : FDIV32_RR_INST<"div.full.f32">,
                   Requires<[do_DIVF32_FULL]>;
def FDIV32ri     : FDIV32_RI_INST<"div.full.f32">,
                   Requires<[do_DIVF32_FULL]>;
//
// F32 Accurate reciprocal
//
def FDIV321r_prec_ftz : FDIV32_RCP_INST<"rcp.rn.ftz.f32">,
                        Requires<[doF32FTZ]>;
def FDIV321r_prec     : FDIV32_RCP_INST<"rcp.rn.f32">;
//
// F32 Accurate division
//
def FDIV32rr_prec_ftz : FDIV32_RR_INST<"div.rn.ftz.f32">,
                        Requires<[doF32FTZ]>;
def FDIV32ri_prec_ftz : FDIV32_RI_INST<"div.rn.ftz.f32">,
                        Requires<[doF32FTZ]>;
def FDIV32rr_prec     : FDIV32_RR_INST<"div.rn.f32">;
def FDIV32ri_prec     : FDIV32_RI_INST<"div.rn.f32">;

//
// FMA
//

// Fused multiply-add for one register class.  $a is always a register; $b and
// $c may each be a register or an immediate of class ImmCls.  Every variant
// requires Pred.
multiclass FMA<string OpcStr, RegisterClass RC, Operand ImmCls,
               Predicate Pred> {
  def rrr : NVPTXInst<(outs RC:$dst),
                      (ins RC:$a, RC:$b, RC:$c),
                      OpcStr # " \t$dst, $a, $b, $c;",
                      [(set RC:$dst, (fma RC:$a, RC:$b, RC:$c))]>,
            Requires<[Pred]>;
  def rri : NVPTXInst<(outs RC:$dst),
                      (ins RC:$a, RC:$b, ImmCls:$c),
                      OpcStr # " \t$dst, $a, $b, $c;",
                      [(set RC:$dst, (fma RC:$a, RC:$b, fpimm:$c))]>,
            Requires<[Pred]>;
  def rir : NVPTXInst<(outs RC:$dst),
                      (ins RC:$a, ImmCls:$b, RC:$c),
                      OpcStr # " \t$dst, $a, $b, $c;",
                      [(set RC:$dst, (fma RC:$a, fpimm:$b, RC:$c))]>,
            Requires<[Pred]>;
  def rii : NVPTXInst<(outs RC:$dst),
                      (ins RC:$a, ImmCls:$b, ImmCls:$c),
                      OpcStr # " \t$dst, $a, $b, $c;",
                      [(set RC:$dst, (fma RC:$a, fpimm:$b, fpimm:$c))]>,
            Requires<[Pred]>;
}

// FP16 flavour: register-only, and additionally requires useFP16Math.
multiclass FMA_F16<string OpcStr, RegisterClass RC, Predicate Pred> {
  def rrr : NVPTXInst<(outs RC:$dst),
                      (ins RC:$a, RC:$b, RC:$c),
                      OpcStr # " \t$dst, $a, $b, $c;",
                      [(set RC:$dst, (fma RC:$a, RC:$b, RC:$c))]>,
            Requires<[useFP16Math, Pred]>;
}

defm FMA16_ftz   : FMA_F16<"fma.rn.ftz.f16", Float16Regs, doF32FTZ>;
defm FMA16       : FMA_F16<"fma.rn.f16", Float16Regs, True>;
defm FMA16x2_ftz : FMA_F16<"fma.rn.ftz.f16x2", Float16x2Regs, doF32FTZ>;
defm FMA16x2     : FMA_F16<"fma.rn.f16x2", Float16x2Regs, True>;
defm FMA32_ftz   : FMA<"fma.rn.ftz.f32", Float32Regs, f32imm, doF32FTZ>;
defm FMA32       : FMA<"fma.rn.f32", Float32Regs, f32imm, True>;
defm FMA64       : FMA<"fma.rn.f64", Float64Regs, f64imm, True>;

// sin/cos: approximate forms, selected only under allowUnsafeFPMath.
def SINF : NVPTXInst<(outs Float32Regs:$dst),
                     (ins Float32Regs:$src),
                     "sin.approx.f32 \t$dst, $src;",
                     [(set Float32Regs:$dst, (fsin Float32Regs:$src))]>,
           Requires<[allowUnsafeFPMath]>;
def COSF : NVPTXInst<(outs Float32Regs:$dst),
                     (ins Float32Regs:$src),
                     "cos.approx.f32 \t$dst, $src;",
                     [(set Float32Regs:$dst, (fcos Float32Regs:$src))]>,
           Requires<[allowUnsafeFPMath]>;

// Lower (frem x, y) into (sub x, (mul (ftrunc (div x, y)) y)),
// i.e. "poor man's fmod()"

// frem - f32 FTZ
def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
          (FSUBf32rr_ftz
             Float32Regs:$x,
             (FMULf32rr_ftz
                (CVT_f32_f32
                   (FDIV32rr_prec_ftz Float32Regs:$x, Float32Regs:$y),
                   CvtRZI_FTZ),
                Float32Regs:$y))>,
      Requires<[doF32FTZ]>;
def : Pat<(frem Float32Regs:$x, fpimm:$y),
          (FSUBf32rr_ftz
             Float32Regs:$x,
             (FMULf32ri_ftz
                (CVT_f32_f32
                   (FDIV32ri_prec_ftz Float32Regs:$x, fpimm:$y),
                   CvtRZI_FTZ),
                fpimm:$y))>,
      Requires<[doF32FTZ]>;

// frem - f32
def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
          (FSUBf32rr
             Float32Regs:$x,
             (FMULf32rr
                (CVT_f32_f32
                   (FDIV32rr_prec Float32Regs:$x, Float32Regs:$y),
                   CvtRZI),
                Float32Regs:$y))>;
def : Pat<(frem Float32Regs:$x, fpimm:$y),
          (FSUBf32rr
             Float32Regs:$x,
             (FMULf32ri
                (CVT_f32_f32
                   (FDIV32ri_prec Float32Regs:$x, fpimm:$y),
                   CvtRZI),
                fpimm:$y))>;

// frem - f64
def : Pat<(frem Float64Regs:$x, Float64Regs:$y),
          (FSUBf64rr
             Float64Regs:$x,
             (FMULf64rr
                (CVT_f64_f64
                   (FDIV64rr Float64Regs:$x, Float64Regs:$y),
                   CvtRZI),
                Float64Regs:$y))>;
def : Pat<(frem Float64Regs:$x, fpimm:$y),
          (FSUBf64rr
             Float64Regs:$x,
             (FMULf64ri
                (CVT_f64_f64
                   (FDIV64ri Float64Regs:$x, fpimm:$y),
                   CvtRZI),
                fpimm:$y))>;

//-----------------------------------
// Bitwise operations
//-----------------------------------

// Template for three-arg bitwise operations.  Takes three args, Creates .b16,
// .b32, .b64, and .pred (predicate registers -- i.e., i1) versions of OpcStr.
// Register/register and register/immediate forms of one bitwise operation at
// one width.  Asm is the mnemonic including its type suffix.
multiclass BITWISE_TYPED<string Asm, SDNode OpNode, RegisterClass RC,
                         Operand ImmCls> {
  def rr : NVPTXInst<(outs RC:$dst),
                     (ins RC:$a, RC:$b),
                     Asm # " \t$dst, $a, $b;",
                     [(set RC:$dst, (OpNode RC:$a, RC:$b))]>;
  def ri : NVPTXInst<(outs RC:$dst),
                     (ins RC:$a, ImmCls:$b),
                     Asm # " \t$dst, $a, $b;",
                     [(set RC:$dst, (OpNode RC:$a, imm:$b))]>;
}

// Expands to <NAME>b1rr, <NAME>b1ri, <NAME>b16rr, ... <NAME>b64ri.
multiclass BITWISE<string OpcStr, SDNode OpNode> {
  defm b1  : BITWISE_TYPED<!strconcat(OpcStr, ".pred"), OpNode,
                           Int1Regs, i1imm>;
  defm b16 : BITWISE_TYPED<!strconcat(OpcStr, ".b16"), OpNode,
                           Int16Regs, i16imm>;
  defm b32 : BITWISE_TYPED<!strconcat(OpcStr, ".b32"), OpNode,
                           Int32Regs, i32imm>;
  defm b64 : BITWISE_TYPED<!strconcat(OpcStr, ".b64"), OpNode,
                           Int64Regs, i64imm>;
}

defm OR  : BITWISE<"or", or>;
defm AND : BITWISE<"and", and>;
defm XOR : BITWISE<"xor", xor>;

// Bitwise complement for predicates and for each integer width.
def NOT1  : NVPTXInst<(outs Int1Regs:$dst),
                      (ins Int1Regs:$src),
                      "not.pred \t$dst, $src;",
                      [(set Int1Regs:$dst, (not Int1Regs:$src))]>;
def NOT16 : NVPTXInst<(outs Int16Regs:$dst),
                      (ins Int16Regs:$src),
                      "not.b16 \t$dst, $src;",
                      [(set Int16Regs:$dst, (not Int16Regs:$src))]>;
def NOT32 : NVPTXInst<(outs Int32Regs:$dst),
                      (ins Int32Regs:$src),
                      "not.b32 \t$dst, $src;",
                      [(set Int32Regs:$dst, (not Int32Regs:$src))]>;
def NOT64 : NVPTXInst<(outs Int64Regs:$dst),
                      (ins Int64Regs:$src),
                      "not.b64 \t$dst, $src;",
                      [(set Int64Regs:$dst, (not Int64Regs:$src))]>;

// Template for left/right shifts.  Operands are
//   [dest (reg), src (reg), shift (reg or imm)].
// dest and src are int64, int32 or int16; the shift amount is always int32.
// OpcStr is the mnemonic up to, but not including, the width digits.
//
// A 32-bit (imm, imm) form is defined as well.
multiclass SHIFT<string OpcStr, SDNode OpNode> {
  def i64rr : NVPTXInst<(outs Int64Regs:$dst),
                        (ins Int64Regs:$a, Int32Regs:$b),
                        OpcStr # "64 \t$dst, $a, $b;",
                        [(set Int64Regs:$dst,
                              (OpNode Int64Regs:$a, Int32Regs:$b))]>;
  def i64ri : NVPTXInst<(outs Int64Regs:$dst),
                        (ins Int64Regs:$a, i32imm:$b),
                        OpcStr # "64 \t$dst, $a, $b;",
                        [(set Int64Regs:$dst,
                              (OpNode Int64Regs:$a, (i32 imm:$b)))]>;
  def i32rr : NVPTXInst<(outs Int32Regs:$dst),
                        (ins Int32Regs:$a, Int32Regs:$b),
                        OpcStr # "32 \t$dst, $a, $b;",
                        [(set Int32Regs:$dst,
                              (OpNode Int32Regs:$a, Int32Regs:$b))]>;
  def i32ri : NVPTXInst<(outs Int32Regs:$dst),
                        (ins Int32Regs:$a, i32imm:$b),
                        OpcStr # "32 \t$dst, $a, $b;",
                        [(set Int32Regs:$dst,
                              (OpNode Int32Regs:$a, (i32 imm:$b)))]>;
  def i32ii : NVPTXInst<(outs Int32Regs:$dst),
                        (ins i32imm:$a, i32imm:$b),
                        OpcStr # "32 \t$dst, $a, $b;",
                        [(set Int32Regs:$dst,
                              (OpNode (i32 imm:$a), (i32 imm:$b)))]>;
  def i16rr : NVPTXInst<(outs Int16Regs:$dst),
                        (ins Int16Regs:$a, Int32Regs:$b),
                        OpcStr # "16 \t$dst, $a, $b;",
                        [(set Int16Regs:$dst,
                              (OpNode Int16Regs:$a, Int32Regs:$b))]>;
  def i16ri : NVPTXInst<(outs Int16Regs:$dst),
                        (ins Int16Regs:$a, i32imm:$b),
                        OpcStr # "16 \t$dst, $a, $b;",
                        [(set Int16Regs:$dst,
                              (OpNode Int16Regs:$a, (i32 imm:$b)))]>;
}

defm SHL : SHIFT<"shl.b", shl>;
defm SRA : SHIFT<"shr.s", sra>;
defm SRL : SHIFT<"shr.u", srl>;

// Bit-reverse
def BREV32 : NVPTXInst<(outs Int32Regs:$dst),
                       (ins Int32Regs:$a),
                       "brev.b32 \t$dst, $a;",
                       [(set Int32Regs:$dst, (bitreverse Int32Regs:$a))]>;
def BREV64 : NVPTXInst<(outs Int64Regs:$dst),
                       (ins Int64Regs:$a),
                       "brev.b64 \t$dst, $a;",
                       [(set Int64Regs:$dst, (bitreverse Int64Regs:$a))]>;

//
// Rotate: Use ptx shf instruction if available.
//

// 32 bit r2 = rotl r1, n
//    =>
//        r2 = shf.l r1, r1, n
def ROTL32imm_hw : NVPTXInst<(outs Int32Regs:$dst),
                             (ins Int32Regs:$src, i32imm:$amt),
                             "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
                             [(set Int32Regs:$dst,
                                   (rotl Int32Regs:$src, (i32 imm:$amt)))]>,
                   Requires<[hasHWROT32]>;

def ROTL32reg_hw : NVPTXInst<(outs Int32Regs:$dst),
                             (ins Int32Regs:$src, Int32Regs:$amt),
                             "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
                             [(set Int32Regs:$dst,
                                   (rotl Int32Regs:$src, Int32Regs:$amt))]>,
                   Requires<[hasHWROT32]>;

// 32 bit r2 = rotr r1, n
//    =>
//        r2 = shf.r r1, r1, n
def ROTR32imm_hw : NVPTXInst<(outs Int32Regs:$dst),
                             (ins Int32Regs:$src, i32imm:$amt),
                             "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
                             [(set Int32Regs:$dst,
                                   (rotr Int32Regs:$src, (i32 imm:$amt)))]>,
                   Requires<[hasHWROT32]>;

def ROTR32reg_hw : NVPTXInst<(outs Int32Regs:$dst),
                             (ins Int32Regs:$src, Int32Regs:$amt),
                             "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
                             [(set Int32Regs:$dst,
                                   (rotr Int32Regs:$src, Int32Regs:$amt))]>,
                   Requires<[hasHWROT32]>;

// 32-bit software rotate by immediate.  $amt2 should equal 32 - $amt1.
// Software rotate by immediate: shifts $src left by $amt1 and right by $amt2
// and adds the two halves.  It has no selection pattern of its own; the
// rotl/rotr patterns below supply the two amounts.
def ROT32imm_sw :
  NVPTXInst<(outs Int32Regs:$dst),
            (ins Int32Regs:$src, i32imm:$amt1, i32imm:$amt2),
            "{{\n\t"
            ".reg .b32 %lhs;\n\t"
            ".reg .b32 %rhs;\n\t"
            "shl.b32 \t%lhs, $src, $amt1;\n\t"
            "shr.b32 \t%rhs, $src, $amt2;\n\t"
            "add.u32 \t$dst, %lhs, %rhs;\n\t"
            "}}",
            []>;

// Computes the complementary shift amount, 32 - amt, as an i32 target
// constant.
// NOTE(review): assumes the immediate has already been reduced to at most 32;
// a larger value wraps around in the unsigned subtraction -- confirm that
// callers never pass one.
def SUB_FRM_32 : SDNodeXForm<imm, [{
  return CurDAG->getTargetConstant(32 - N->getZExtValue(), SDLoc(N), MVT::i32);
}]>;

// rotl: left by amt, right by 32 - amt.  rotr swaps the two amounts.
def : Pat<(rotl Int32Regs:$src, (i32 imm:$amt)),
          (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>,
      Requires<[noHWROT32]>;
def : Pat<(rotr Int32Regs:$src, (i32 imm:$amt)),
          (ROT32imm_sw Int32Regs:$src, (SUB_FRM_32 node:$amt), imm:$amt)>,
      Requires<[noHWROT32]>;

// 32-bit software rotate left by register.
//
// The amount is first reduced modulo 32 (%amt1 = $amt & 31).  Rotate amounts
// are interpreted modulo the bit width, whereas PTX shl/shr clamp an oversized
// amount instead of wrapping it; without the mask any $amt above 32 would
// produce 0.  When %amt1 is 0, %amt2 is 32 and the right shift contributes 0,
// so the result is $src.
def ROTL32reg_sw :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
            "{{\n\t"
            ".reg .b32 %lhs;\n\t"
            ".reg .b32 %rhs;\n\t"
            ".reg .b32 %amt1;\n\t"
            ".reg .b32 %amt2;\n\t"
            "and.b32 \t%amt1, $amt, 31;\n\t"
            "shl.b32 \t%lhs, $src, %amt1;\n\t"
            "sub.s32 \t%amt2, 32, %amt1;\n\t"
            "shr.b32 \t%rhs, $src, %amt2;\n\t"
            "add.u32 \t$dst, %lhs, %rhs;\n\t"
            "}}",
            [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>,
  Requires<[noHWROT32]>;

// 32-bit software rotate right by register.  Same structure as the left
// rotate with the two shift directions exchanged; the amount is masked to
// 0..31 for the same reason.
def ROTR32reg_sw :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
            "{{\n\t"
            ".reg .b32 %lhs;\n\t"
            ".reg .b32 %rhs;\n\t"
            ".reg .b32 %amt1;\n\t"
            ".reg .b32 %amt2;\n\t"
            "and.b32 \t%amt1, $amt, 31;\n\t"
            "shr.b32 \t%lhs, $src, %amt1;\n\t"
            "sub.s32 \t%amt2, 32, %amt1;\n\t"
            "shl.b32 \t%rhs, $src, %amt2;\n\t"
            "add.u32 \t$dst, %lhs, %rhs;\n\t"
            "}}",
            [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>,
  Requires<[noHWROT32]>;

// 64-bit software rotate by immediate.  $amt2 should equal 64 - $amt1.
// Software rotate by immediate: shifts $src left by $amt1 and right by $amt2
// and adds the two halves.  It has no selection pattern of its own; the
// rotl/rotr patterns below supply the two amounts.
def ROT64imm_sw :
  NVPTXInst<(outs Int64Regs:$dst),
            (ins Int64Regs:$src, i32imm:$amt1, i32imm:$amt2),
            "{{\n\t"
            ".reg .b64 %lhs;\n\t"
            ".reg .b64 %rhs;\n\t"
            "shl.b64 \t%lhs, $src, $amt1;\n\t"
            "shr.b64 \t%rhs, $src, $amt2;\n\t"
            "add.u64 \t$dst, %lhs, %rhs;\n\t"
            "}}",
            []>;

// Computes the complementary shift amount, 64 - amt, as an i32 target
// constant.
// NOTE(review): assumes the immediate has already been reduced to at most 64;
// a larger value wraps around in the unsigned subtraction -- confirm that
// callers never pass one.
def SUB_FRM_64 : SDNodeXForm<imm, [{
  return CurDAG->getTargetConstant(64 - N->getZExtValue(), SDLoc(N), MVT::i32);
}]>;

// rotl: left by amt, right by 64 - amt.  rotr swaps the two amounts.
def : Pat<(rotl Int64Regs:$src, (i32 imm:$amt)),
          (ROT64imm_sw Int64Regs:$src, imm:$amt, (SUB_FRM_64 node:$amt))>;
def : Pat<(rotr Int64Regs:$src, (i32 imm:$amt)),
          (ROT64imm_sw Int64Regs:$src, (SUB_FRM_64 node:$amt), imm:$amt)>;

// 64-bit software rotate left by register.
//
// The amount is first reduced modulo 64 (%amt1 = $amt & 63).  Rotate amounts
// are interpreted modulo the bit width, whereas PTX shl/shr clamp an oversized
// amount instead of wrapping it; without the mask any $amt above 64 would
// produce 0.  When %amt1 is 0, %amt2 is 64 and the right shift contributes 0,
// so the result is $src.
def ROTL64reg_sw :
  NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt),
            "{{\n\t"
            ".reg .b64 %lhs;\n\t"
            ".reg .b64 %rhs;\n\t"
            ".reg .u32 %amt1;\n\t"
            ".reg .u32 %amt2;\n\t"
            "and.b32 \t%amt1, $amt, 63;\n\t"
            "shl.b64 \t%lhs, $src, %amt1;\n\t"
            "sub.u32 \t%amt2, 64, %amt1;\n\t"
            "shr.b64 \t%rhs, $src, %amt2;\n\t"
            "add.u64 \t$dst, %lhs, %rhs;\n\t"
            "}}",
            [(set Int64Regs:$dst, (rotl Int64Regs:$src, Int32Regs:$amt))]>;

// 64-bit software rotate right by register.  Same structure as the left
// rotate with the two shift directions exchanged; the amount is masked to
// 0..63 for the same reason.
def ROTR64reg_sw :
  NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt),
            "{{\n\t"
            ".reg .b64 %lhs;\n\t"
            ".reg .b64 %rhs;\n\t"
            ".reg .u32 %amt1;\n\t"
            ".reg .u32 %amt2;\n\t"
            "and.b32 \t%amt1, $amt, 63;\n\t"
            "shr.b64 \t%lhs, $src, %amt1;\n\t"
            "sub.u32 \t%amt2, 64, %amt1;\n\t"
            "shl.b64 \t%rhs, $src, %amt2;\n\t"
            "add.u64 \t$dst, %lhs, %rhs;\n\t"
            "}}",
            [(set Int64Regs:$dst, (rotr Int64Regs:$src, Int32Regs:$amt))]>;

//
// Funnel shift in clamp mode
//

// Create SDNodes so they can be used in the DAG code, e.g.
// NVPTXISelLowering (LowerShiftLeftParts and LowerShiftRightParts)
def FUN_SHFL_CLAMP : SDNode<"NVPTXISD::FUN_SHFL_CLAMP", SDTIntShiftDOp, []>;
def FUN_SHFR_CLAMP : SDNode<"NVPTXISD::FUN_SHFR_CLAMP", SDTIntShiftDOp, []>;

// 32-bit funnel shift left over the ($lo, $hi) pair, clamp mode.
def FUNSHFLCLAMP :
  NVPTXInst<(outs Int32Regs:$dst),
            (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
            "shf.l.clamp.b32 \t$dst, $lo, $hi, $amt;",
            [(set Int32Regs:$dst,
               (FUN_SHFL_CLAMP Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt))]>;

// 32-bit funnel shift right over the ($lo, $hi) pair, clamp mode.
def FUNSHFRCLAMP :
  NVPTXInst<(outs Int32Regs:$dst),
            (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
            "shf.r.clamp.b32 \t$dst, $lo, $hi, $amt;",
            [(set Int32Regs:$dst,
               (FUN_SHFR_CLAMP Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt))]>;

//
// BFE - bit-field extract
//

// Template for BFE instructions.  Takes four args,
//   [dest (reg), src (reg), start (reg or imm), len (reg or imm)].
// In PTX the last two operands are the starting bit position and the field
// length in bits (not an end position).
// Start may be an imm only if len is also an imm.  FIXME: Is this a
// restriction in PTX?
//
// dest and src may be int32 or int64, but start and len are always int32.
// Variants by kind of the two trailing i32 operands: both registers (rrr),
// register then immediate (rri), both immediates (rii).  None carries a
// selection pattern.
multiclass BFE<string TyStr, RegisterClass RC> {
  def rrr : NVPTXInst<(outs RC:$d),
                      (ins RC:$a, Int32Regs:$b, Int32Regs:$c),
                      "bfe." # TyStr # " \t$d, $a, $b, $c;",
                      []>;
  def rri : NVPTXInst<(outs RC:$d),
                      (ins RC:$a, Int32Regs:$b, i32imm:$c),
                      "bfe." # TyStr # " \t$d, $a, $b, $c;",
                      []>;
  def rii : NVPTXInst<(outs RC:$d),
                      (ins RC:$a, i32imm:$b, i32imm:$c),
                      "bfe." # TyStr # " \t$d, $a, $b, $c;",
                      []>;
}

let hasSideEffects = false in {
  defm BFE_S32 : BFE<"s32", Int32Regs>;
  defm BFE_U32 : BFE<"u32", Int32Regs>;
  defm BFE_S64 : BFE<"s64", Int64Regs>;
  defm BFE_U64 : BFE<"u64", Int64Regs>;
}

//-----------------------------------
// Comparison instructions (setp, set)
//-----------------------------------

// FIXME: Forms of set and setp that also combine with a boolean predicate
// (e.g. setp.eq.and.b16) are not covered.

// setp for one operand type: compares $a with $b under the comparison mode
// $cmp and writes a predicate.  Either operand, but not both, may be an
// immediate (rr / ri / ir).  No selection patterns are attached.
let hasSideEffects = false in {
  multiclass SETP<string TypeStr, RegisterClass RC, Operand ImmCls> {
    def rr : NVPTXInst<(outs Int1Regs:$dst),
                       (ins RC:$a, RC:$b, CmpMode:$cmp),
                       "setp${cmp:base}${cmp:ftz}." # TypeStr #
                         " \t$dst, $a, $b;",
                       []>;
    def ri : NVPTXInst<(outs Int1Regs:$dst),
                       (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
                       "setp${cmp:base}${cmp:ftz}." # TypeStr #
                         " \t$dst, $a, $b;",
                       []>;
    def ir : NVPTXInst<(outs Int1Regs:$dst),
                       (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
                       "setp${cmp:base}${cmp:ftz}." # TypeStr #
                         " \t$dst, $a, $b;",
                       []>;
  }
}

defm SETP_b16 : SETP<"b16", Int16Regs, i16imm>;
defm SETP_s16 : SETP<"s16", Int16Regs, i16imm>;
defm SETP_u16 : SETP<"u16", Int16Regs, i16imm>;
defm SETP_b32 : SETP<"b32", Int32Regs, i32imm>;
defm SETP_s32 : SETP<"s32", Int32Regs, i32imm>;
defm SETP_u32 : SETP<"u32", Int32Regs, i32imm>;
defm SETP_b64 : SETP<"b64", Int64Regs, i64imm>;
defm SETP_s64 : SETP<"s64", Int64Regs, i64imm>;
defm SETP_u64 : SETP<"u64", Int64Regs, i64imm>;
defm SETP_f32 : SETP<"f32", Float32Regs, f32imm>;
defm SETP_f64 : SETP<"f64", Float64Regs, f64imm>;

// FP16 compare: register-only, requires useFP16Math.
def SETP_f16rr : NVPTXInst<(outs Int1Regs:$dst),
                           (ins Float16Regs:$a, Float16Regs:$b, CmpMode:$cmp),
                           "setp${cmp:base}${cmp:ftz}.f16 \t$dst, $a, $b;",
                           []>,
                 Requires<[useFP16Math]>;

// Packed FP16 compare: writes two predicates, printed as $p|$q.
def SETP_f16x2rr : NVPTXInst<(outs Int1Regs:$p, Int1Regs:$q),
                             (ins Float16x2Regs:$a, Float16x2Regs:$b,
                                  CmpMode:$cmp),
                             "setp${cmp:base}${cmp:ftz}.f16x2 \t$p|$q, $a, $b;",
                             []>,
                   Requires<[useFP16Math]>;


// FIXME: This looks wrong.  The "set" mnemonic is spelled
// "set.CmpOp{.ftz}.dtype.stype", with dtype the destination register type
// (u32, s32, or f32).  These definitions are not used at the moment anyway.

let hasSideEffects = false in {
  multiclass SET<string TypeStr, RegisterClass RC, Operand ImmCls> {
    def rr : NVPTXInst<(outs Int32Regs:$dst),
                       (ins RC:$a, RC:$b, CmpMode:$cmp),
                       "set$cmp." # TypeStr # " \t$dst, $a, $b;",
                       []>;
    def ri : NVPTXInst<(outs Int32Regs:$dst),
                       (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
                       "set$cmp." # TypeStr # " \t$dst, $a, $b;",
                       []>;
    def ir : NVPTXInst<(outs Int32Regs:$dst),
                       (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
                       "set$cmp." # TypeStr # " \t$dst, $a, $b;",
                       []>;
  }
}

defm SET_b16 : SET<"b16", Int16Regs, i16imm>;
defm SET_s16 : SET<"s16", Int16Regs, i16imm>;
defm SET_u16 : SET<"u16", Int16Regs, i16imm>;
defm SET_b32 : SET<"b32", Int32Regs, i32imm>;
defm SET_s32 : SET<"s32", Int32Regs, i32imm>;
defm SET_u32 : SET<"u32", Int32Regs, i32imm>;
defm SET_b64 : SET<"b64", Int64Regs, i64imm>;
defm SET_s64 : SET<"s64", Int64Regs, i64imm>;
defm SET_u64 : SET<"u64", Int64Regs, i64imm>;
defm SET_f16 : SET<"f16", Float16Regs, f16imm>;
defm SET_f32 : SET<"f32", Float32Regs, f32imm>;
defm SET_f64 : SET<"f64", Float64Regs, f64imm>;

//-----------------------------------
// Selection instructions (selp)
//-----------------------------------

// FIXME: Missing slct

let hasSideEffects = false in {
  // selp without selection patterns; these are referenced explicitly from
  // within this file.  $a and $b may each be a register or an immediate.
  multiclass SELP<string TypeStr, RegisterClass RC, Operand ImmCls> {
    def rr : NVPTXInst<(outs RC:$dst),
                       (ins RC:$a, RC:$b, Int1Regs:$p),
                       "selp." # TypeStr # " \t$dst, $a, $b, $p;",
                       []>;
    def ri : NVPTXInst<(outs RC:$dst),
                       (ins RC:$a, ImmCls:$b, Int1Regs:$p),
                       "selp." # TypeStr # " \t$dst, $a, $b, $p;",
                       []>;
    def ir : NVPTXInst<(outs RC:$dst),
                       (ins ImmCls:$a, RC:$b, Int1Regs:$p),
                       "selp." # TypeStr # " \t$dst, $a, $b, $p;",
                       []>;
    def ii : NVPTXInst<(outs RC:$dst),
                       (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),
                       "selp." # TypeStr # " \t$dst, $a, $b, $p;",
                       []>;
  }

  // Same four shapes, each matching (select $p, $a, $b).  ImmNode is the
  // immediate node kind to match (imm or fpimm at the uses below).
  multiclass SELP_PATTERN<string TypeStr, RegisterClass RC, Operand ImmCls,
                          SDNode ImmNode> {
    def rr : NVPTXInst<(outs RC:$dst),
                       (ins RC:$a, RC:$b, Int1Regs:$p),
                       "selp." # TypeStr # " \t$dst, $a, $b, $p;",
                       [(set RC:$dst, (select Int1Regs:$p, RC:$a, RC:$b))]>;
    def ri : NVPTXInst<(outs RC:$dst),
                       (ins RC:$a, ImmCls:$b, Int1Regs:$p),
                       "selp." # TypeStr # " \t$dst, $a, $b, $p;",
                       [(set RC:$dst,
                             (select Int1Regs:$p, RC:$a, ImmNode:$b))]>;
    def ir : NVPTXInst<(outs RC:$dst),
                       (ins ImmCls:$a, RC:$b, Int1Regs:$p),
                       "selp." # TypeStr # " \t$dst, $a, $b, $p;",
                       [(set RC:$dst,
                             (select Int1Regs:$p, ImmNode:$a, RC:$b))]>;
    def ii : NVPTXInst<(outs RC:$dst),
                       (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),
                       "selp." # TypeStr # " \t$dst, $a, $b, $p;",
                       [(set RC:$dst,
                             (select Int1Regs:$p, ImmNode:$a, ImmNode:$b))]>;
  }
}

// Only the selp.b{16,32,64} forms carry patterns; matching on
// selp.{s,u}{16,32,64} as well would gain nothing.
defm SELP_b16 : SELP_PATTERN<"b16", Int16Regs, i16imm, imm>;
defm SELP_s16 : SELP<"s16", Int16Regs, i16imm>;
defm SELP_u16 : SELP<"u16", Int16Regs, i16imm>;
defm SELP_b32 : SELP_PATTERN<"b32", Int32Regs, i32imm, imm>;
defm SELP_s32 : SELP<"s32", Int32Regs, i32imm>;
defm SELP_u32 : SELP<"u32", Int32Regs, i32imm>;
defm SELP_b64 : SELP_PATTERN<"b64", Int64Regs, i64imm, imm>;
defm SELP_s64 : SELP<"s64", Int64Regs, i64imm>;
defm SELP_u64 : SELP<"u64", Int64Regs, i64imm>;
defm SELP_f16 : SELP_PATTERN<"b16", Float16Regs, f16imm, fpimm>;
defm SELP_f32 : SELP_PATTERN<"f32", Float32Regs, f32imm, fpimm>;
defm SELP_f64 : SELP_PATTERN<"f64", Float64Regs, f64imm, fpimm>;

// Packed FP16 select, printed as a 32-bit selp.
def SELP_f16x2rr : NVPTXInst<(outs Float16x2Regs:$dst),
                             (ins Float16x2Regs:$a, Float16x2Regs:$b,
                                  Int1Regs:$p),
                             "selp.b32 \t$dst, $a, $b, $p;",
                             [(set Float16x2Regs:$dst,
                                   (select Int1Regs:$p, Float16x2Regs:$a,
                                           Float16x2Regs:$b))]>;

//-----------------------------------
// Data Movement (Load / Store, Move)
//-----------------------------------

// Addressing-mode matchers, each bound to the named C++ selector.
def ADDRri   : ComplexPattern<i32, 2, "SelectADDRri",
                              [frameindex], [SDNPWantRoot]>;
def ADDRri64 : ComplexPattern<i64, 2, "SelectADDRri64",
                              [frameindex], [SDNPWantRoot]>;
def ADDRvar  : ComplexPattern<iPTR, 1, "SelectDirectAddr", [], []>;

// Memory operands made of a base register plus an immediate.
def MEMri : Operand<i32> {
  let PrintMethod   = "printMemOperand";
  let MIOperandInfo = (ops Int32Regs, i32imm);
}
def MEMri64 : Operand<i64> {
  let PrintMethod   = "printMemOperand";
  let MIOperandInfo = (ops Int64Regs, i64imm);
}

def imem    : Operand<iPTR>    { let PrintMethod = "printOperand"; }
def imemAny : Operand<iPTRAny> { let PrintMethod = "printOperand"; }

// i32 operands with dedicated printers.
def LdStCode : Operand<i32> { let PrintMethod = "printLdStCode"; }
def MmaCode  : Operand<i32> { let PrintMethod = "printMmaCode"; }

def SDTWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
def Wrapper    : SDNode<"NVPTXISD::Wrapper", SDTWrapper>;

// Materialize a wrapped global address in a u32 or u64 register.
def MOV_ADDR   : NVPTXInst<(outs Int32Regs:$dst),
                           (ins imem:$a),
                           "mov.u32 \t$dst, $a;",
                           [(set Int32Regs:$dst, (Wrapper tglobaladdr:$a))]>;
def MOV_ADDR64 : NVPTXInst<(outs Int64Regs:$dst),
                           (ins imem:$a),
                           "mov.u64 \t$dst, $a;",
                           [(set Int64Regs:$dst, (Wrapper tglobaladdr:$a))]>;

// Get pointer to local stack.
let hasSideEffects = false in {
  def MOV_DEPOT_ADDR    : NVPTXInst<(outs Int32Regs:$d),
                                    (ins i32imm:$num),
                                    "mov.u32 \t$d, __local_depot$num;",
                                    []>;
  def MOV_DEPOT_ADDR_64 : NVPTXInst<(outs Int64Regs:$d),
                                    (ins i32imm:$num),
                                    "mov.u64 \t$d, __local_depot$num;",
                                    []>;
}


// Register-to-register moves.  copyPhysreg is hard-coded in
// NVPTXInstrInfo.cpp.
let IsSimpleMove = 1, hasSideEffects = 0 in {
  def IMOV1rr  : NVPTXInst<(outs Int1Regs:$dst),
                           (ins Int1Regs:$sss),
                           "mov.pred \t$dst, $sss;", []>;
  def IMOV16rr : NVPTXInst<(outs Int16Regs:$dst),
                           (ins Int16Regs:$sss),
                           "mov.u16 \t$dst, $sss;", []>;
  def IMOV32rr : NVPTXInst<(outs Int32Regs:$dst),
                           (ins Int32Regs:$sss),
                           "mov.u32 \t$dst, $sss;", []>;
  def IMOV64rr : NVPTXInst<(outs Int64Regs:$dst),
                           (ins Int64Regs:$sss),
                           "mov.u64 \t$dst, $sss;", []>;

  // There is no mov.f16, so the FP16 move is spelled with .b16.
  def FMOV16rr : NVPTXInst<(outs Float16Regs:$dst),
                           (ins Float16Regs:$src),
                           "mov.b16 \t$dst, $src;", []>;
  def FMOV32rr : NVPTXInst<(outs Float32Regs:$dst),
                           (ins Float32Regs:$src),
                           "mov.f32 \t$dst, $src;", []>;
  def FMOV64rr : NVPTXInst<(outs Float64Regs:$dst),
                           (ins Float64Regs:$src),
                           "mov.f64 \t$dst, $src;", []>;
}

// Immediate-to-register moves; these select plain integer constants.
def IMOV1ri  : NVPTXInst<(outs Int1Regs:$dst),
                         (ins i1imm:$src),
                         "mov.pred \t$dst, $src;",
                         [(set Int1Regs:$dst, imm:$src)]>;
def IMOV16ri : NVPTXInst<(outs Int16Regs:$dst),
                         (ins i16imm:$src),
                         "mov.u16 \t$dst, $src;",
                         [(set Int16Regs:$dst, imm:$src)]>;
def IMOV32ri : NVPTXInst<(outs Int32Regs:$dst),
                         (ins i32imm:$src),
                         "mov.u32 \t$dst, $src;",
                         [(set Int32Regs:$dst, imm:$src)]>;
def IMOV64i  : NVPTXInst<(outs Int64Regs:$dst),
                         (ins i64imm:$src),
                         "mov.u64 \t$dst, $src;",
                         [(set Int64Regs:$dst, imm:$src)]>;

// Floating-point immediates.
def FMOV32ri : NVPTXInst<(outs Float32Regs:$dst),
                         (ins f32imm:$src),
                         "mov.f32 \t$dst, $src;",
                         [(set Float32Regs:$dst, fpimm:$src)]>;
def FMOV64ri : NVPTXInst<(outs Float64Regs:$dst),
                         (ins f64imm:$src),
                         "mov.f64 \t$dst, $src;",
                         [(set Float64Regs:$dst, fpimm:$src)]>;

// A wrapped external symbol used as an i32 goes through IMOV32ri.
def : Pat<(i32 (Wrapper texternalsym:$dst)),
          (IMOV32ri texternalsym:$dst)>;

//---- Copy Frame Index ----
def LEA_ADDRi   : NVPTXInst<(outs Int32Regs:$dst),
                            (ins MEMri:$addr),
                            "add.u32 \t$dst, ${addr:add};",
                            [(set Int32Regs:$dst, ADDRri:$addr)]>;
def LEA_ADDRi64 : NVPTXInst<(outs Int64Regs:$dst),
                            (ins MEMri64:$addr),
                            "add.u64 \t$dst, ${addr:add};",
                            [(set Int64Regs:$dst, ADDRri64:$addr)]>;

//-----------------------------------
// Comparison and Selection
//-----------------------------------

multiclass ISET_FORMAT<PatFrag OpNode, PatLeaf Mode,
                       Instruction setp_16rr,
                       Instruction setp_16ri,
                       Instruction setp_16ir,
                       Instruction setp_32rr,
                       Instruction setp_32ri,
                       Instruction setp_32ir,
                       Instruction setp_64rr,
                       Instruction setp_64ri,
                       Instruction
setp_64ir, 1725 Instruction set_16rr, 1726 Instruction set_16ri, 1727 Instruction set_16ir, 1728 Instruction set_32rr, 1729 Instruction set_32ri, 1730 Instruction set_32ir, 1731 Instruction set_64rr, 1732 Instruction set_64ri, 1733 Instruction set_64ir> { 1734 // i16 -> pred 1735 def : Pat<(i1 (OpNode Int16Regs:$a, Int16Regs:$b)), 1736 (setp_16rr Int16Regs:$a, Int16Regs:$b, Mode)>; 1737 def : Pat<(i1 (OpNode Int16Regs:$a, imm:$b)), 1738 (setp_16ri Int16Regs:$a, imm:$b, Mode)>; 1739 def : Pat<(i1 (OpNode imm:$a, Int16Regs:$b)), 1740 (setp_16ir imm:$a, Int16Regs:$b, Mode)>; 1741 // i32 -> pred 1742 def : Pat<(i1 (OpNode Int32Regs:$a, Int32Regs:$b)), 1743 (setp_32rr Int32Regs:$a, Int32Regs:$b, Mode)>; 1744 def : Pat<(i1 (OpNode Int32Regs:$a, imm:$b)), 1745 (setp_32ri Int32Regs:$a, imm:$b, Mode)>; 1746 def : Pat<(i1 (OpNode imm:$a, Int32Regs:$b)), 1747 (setp_32ir imm:$a, Int32Regs:$b, Mode)>; 1748 // i64 -> pred 1749 def : Pat<(i1 (OpNode Int64Regs:$a, Int64Regs:$b)), 1750 (setp_64rr Int64Regs:$a, Int64Regs:$b, Mode)>; 1751 def : Pat<(i1 (OpNode Int64Regs:$a, imm:$b)), 1752 (setp_64ri Int64Regs:$a, imm:$b, Mode)>; 1753 def : Pat<(i1 (OpNode imm:$a, Int64Regs:$b)), 1754 (setp_64ir imm:$a, Int64Regs:$b, Mode)>; 1755 1756 // i16 -> i32 1757 def : Pat<(i32 (OpNode Int16Regs:$a, Int16Regs:$b)), 1758 (set_16rr Int16Regs:$a, Int16Regs:$b, Mode)>; 1759 def : Pat<(i32 (OpNode Int16Regs:$a, imm:$b)), 1760 (set_16ri Int16Regs:$a, imm:$b, Mode)>; 1761 def : Pat<(i32 (OpNode imm:$a, Int16Regs:$b)), 1762 (set_16ir imm:$a, Int16Regs:$b, Mode)>; 1763 // i32 -> i32 1764 def : Pat<(i32 (OpNode Int32Regs:$a, Int32Regs:$b)), 1765 (set_32rr Int32Regs:$a, Int32Regs:$b, Mode)>; 1766 def : Pat<(i32 (OpNode Int32Regs:$a, imm:$b)), 1767 (set_32ri Int32Regs:$a, imm:$b, Mode)>; 1768 def : Pat<(i32 (OpNode imm:$a, Int32Regs:$b)), 1769 (set_32ir imm:$a, Int32Regs:$b, Mode)>; 1770 // i64 -> i32 1771 def : Pat<(i32 (OpNode Int64Regs:$a, Int64Regs:$b)), 1772 (set_64rr Int64Regs:$a, Int64Regs:$b, 
Mode)>; 1773 def : Pat<(i32 (OpNode Int64Regs:$a, imm:$b)), 1774 (set_64ri Int64Regs:$a, imm:$b, Mode)>; 1775 def : Pat<(i32 (OpNode imm:$a, Int64Regs:$b)), 1776 (set_64ir imm:$a, Int64Regs:$b, Mode)>; 1777} 1778 1779multiclass ISET_FORMAT_SIGNED<PatFrag OpNode, PatLeaf Mode> 1780 : ISET_FORMAT<OpNode, Mode, 1781 SETP_s16rr, SETP_s16ri, SETP_s16ir, 1782 SETP_s32rr, SETP_s32ri, SETP_s32ir, 1783 SETP_s64rr, SETP_s64ri, SETP_s64ir, 1784 SET_s16rr, SET_s16ri, SET_s16ir, 1785 SET_s32rr, SET_s32ri, SET_s32ir, 1786 SET_s64rr, SET_s64ri, SET_s64ir> { 1787 // TableGen doesn't like empty multiclasses. 1788 def : PatLeaf<(i32 0)>; 1789} 1790 1791multiclass ISET_FORMAT_UNSIGNED<PatFrag OpNode, PatLeaf Mode> 1792 : ISET_FORMAT<OpNode, Mode, 1793 SETP_u16rr, SETP_u16ri, SETP_u16ir, 1794 SETP_u32rr, SETP_u32ri, SETP_u32ir, 1795 SETP_u64rr, SETP_u64ri, SETP_u64ir, 1796 SET_u16rr, SET_u16ri, SET_u16ir, 1797 SET_u32rr, SET_u32ri, SET_u32ir, 1798 SET_u64rr, SET_u64ri, SET_u64ir> { 1799 // TableGen doesn't like empty multiclasses. 
1800 def : PatLeaf<(i32 0)>; 1801} 1802 1803defm : ISET_FORMAT_SIGNED<setgt, CmpGT>; 1804defm : ISET_FORMAT_SIGNED<setlt, CmpLT>; 1805defm : ISET_FORMAT_SIGNED<setge, CmpGE>; 1806defm : ISET_FORMAT_SIGNED<setle, CmpLE>; 1807defm : ISET_FORMAT_SIGNED<seteq, CmpEQ>; 1808defm : ISET_FORMAT_SIGNED<setne, CmpNE>; 1809defm : ISET_FORMAT_UNSIGNED<setugt, CmpGT>; 1810defm : ISET_FORMAT_UNSIGNED<setult, CmpLT>; 1811defm : ISET_FORMAT_UNSIGNED<setuge, CmpGE>; 1812defm : ISET_FORMAT_UNSIGNED<setule, CmpLE>; 1813defm : ISET_FORMAT_UNSIGNED<setueq, CmpEQ>; 1814defm : ISET_FORMAT_UNSIGNED<setune, CmpNE>; 1815 1816// i1 compares 1817def : Pat<(setne Int1Regs:$a, Int1Regs:$b), 1818 (XORb1rr Int1Regs:$a, Int1Regs:$b)>; 1819def : Pat<(setune Int1Regs:$a, Int1Regs:$b), 1820 (XORb1rr Int1Regs:$a, Int1Regs:$b)>; 1821 1822def : Pat<(seteq Int1Regs:$a, Int1Regs:$b), 1823 (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>; 1824def : Pat<(setueq Int1Regs:$a, Int1Regs:$b), 1825 (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>; 1826 1827// i1 compare -> i32 1828def : Pat<(i32 (setne Int1Regs:$a, Int1Regs:$b)), 1829 (SELP_u32ii -1, 0, (XORb1rr Int1Regs:$a, Int1Regs:$b))>; 1830def : Pat<(i32 (setne Int1Regs:$a, Int1Regs:$b)), 1831 (SELP_u32ii 0, -1, (XORb1rr Int1Regs:$a, Int1Regs:$b))>; 1832 1833 1834 1835multiclass FSET_FORMAT<PatFrag OpNode, PatLeaf Mode, PatLeaf ModeFTZ> { 1836 // f16 -> pred 1837 def : Pat<(i1 (OpNode Float16Regs:$a, Float16Regs:$b)), 1838 (SETP_f16rr Float16Regs:$a, Float16Regs:$b, ModeFTZ)>, 1839 Requires<[useFP16Math,doF32FTZ]>; 1840 def : Pat<(i1 (OpNode Float16Regs:$a, Float16Regs:$b)), 1841 (SETP_f16rr Float16Regs:$a, Float16Regs:$b, Mode)>, 1842 Requires<[useFP16Math]>; 1843 def : Pat<(i1 (OpNode Float16Regs:$a, fpimm:$b)), 1844 (SETP_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>, 1845 Requires<[useFP16Math,doF32FTZ]>; 1846 def : Pat<(i1 (OpNode Float16Regs:$a, fpimm:$b)), 1847 (SETP_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>, 1848 
Requires<[useFP16Math]>; 1849 def : Pat<(i1 (OpNode fpimm:$a, Float16Regs:$b)), 1850 (SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, ModeFTZ)>, 1851 Requires<[useFP16Math,doF32FTZ]>; 1852 def : Pat<(i1 (OpNode fpimm:$a, Float16Regs:$b)), 1853 (SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, Mode)>, 1854 Requires<[useFP16Math]>; 1855 1856 // f32 -> pred 1857 def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)), 1858 (SETP_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>, 1859 Requires<[doF32FTZ]>; 1860 def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)), 1861 (SETP_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>; 1862 def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)), 1863 (SETP_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>, 1864 Requires<[doF32FTZ]>; 1865 def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)), 1866 (SETP_f32ri Float32Regs:$a, fpimm:$b, Mode)>; 1867 def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)), 1868 (SETP_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>, 1869 Requires<[doF32FTZ]>; 1870 def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)), 1871 (SETP_f32ir fpimm:$a, Float32Regs:$b, Mode)>; 1872 1873 // f64 -> pred 1874 def : Pat<(i1 (OpNode Float64Regs:$a, Float64Regs:$b)), 1875 (SETP_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>; 1876 def : Pat<(i1 (OpNode Float64Regs:$a, fpimm:$b)), 1877 (SETP_f64ri Float64Regs:$a, fpimm:$b, Mode)>; 1878 def : Pat<(i1 (OpNode fpimm:$a, Float64Regs:$b)), 1879 (SETP_f64ir fpimm:$a, Float64Regs:$b, Mode)>; 1880 1881 // f16 -> i32 1882 def : Pat<(i32 (OpNode Float16Regs:$a, Float16Regs:$b)), 1883 (SET_f16rr Float16Regs:$a, Float16Regs:$b, ModeFTZ)>, 1884 Requires<[useFP16Math, doF32FTZ]>; 1885 def : Pat<(i32 (OpNode Float16Regs:$a, Float16Regs:$b)), 1886 (SET_f16rr Float16Regs:$a, Float16Regs:$b, Mode)>, 1887 Requires<[useFP16Math]>; 1888 def : Pat<(i32 (OpNode Float16Regs:$a, fpimm:$b)), 1889 (SET_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>, 1890 Requires<[useFP16Math, doF32FTZ]>; 1891 def : Pat<(i32 (OpNode 
Float16Regs:$a, fpimm:$b)), 1892 (SET_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>, 1893 Requires<[useFP16Math]>; 1894 def : Pat<(i32 (OpNode fpimm:$a, Float16Regs:$b)), 1895 (SET_f16ir (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, ModeFTZ)>, 1896 Requires<[useFP16Math, doF32FTZ]>; 1897 def : Pat<(i32 (OpNode fpimm:$a, Float16Regs:$b)), 1898 (SET_f16ir (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, Mode)>, 1899 Requires<[useFP16Math]>; 1900 1901 // f32 -> i32 1902 def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)), 1903 (SET_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>, 1904 Requires<[doF32FTZ]>; 1905 def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)), 1906 (SET_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>; 1907 def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)), 1908 (SET_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>, 1909 Requires<[doF32FTZ]>; 1910 def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)), 1911 (SET_f32ri Float32Regs:$a, fpimm:$b, Mode)>; 1912 def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)), 1913 (SET_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>, 1914 Requires<[doF32FTZ]>; 1915 def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)), 1916 (SET_f32ir fpimm:$a, Float32Regs:$b, Mode)>; 1917 1918 // f64 -> i32 1919 def : Pat<(i32 (OpNode Float64Regs:$a, Float64Regs:$b)), 1920 (SET_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>; 1921 def : Pat<(i32 (OpNode Float64Regs:$a, fpimm:$b)), 1922 (SET_f64ri Float64Regs:$a, fpimm:$b, Mode)>; 1923 def : Pat<(i32 (OpNode fpimm:$a, Float64Regs:$b)), 1924 (SET_f64ir fpimm:$a, Float64Regs:$b, Mode)>; 1925} 1926 1927defm FSetOGT : FSET_FORMAT<setogt, CmpGT, CmpGT_FTZ>; 1928defm FSetOLT : FSET_FORMAT<setolt, CmpLT, CmpLT_FTZ>; 1929defm FSetOGE : FSET_FORMAT<setoge, CmpGE, CmpGE_FTZ>; 1930defm FSetOLE : FSET_FORMAT<setole, CmpLE, CmpLE_FTZ>; 1931defm FSetOEQ : FSET_FORMAT<setoeq, CmpEQ, CmpEQ_FTZ>; 1932defm FSetONE : FSET_FORMAT<setone, CmpNE, CmpNE_FTZ>; 1933 1934defm FSetUGT : FSET_FORMAT<setugt, CmpGTU, CmpGTU_FTZ>; 
// Remaining unordered floating-point comparisons (the *U compare modes).
defm FSetULT : FSET_FORMAT<setult, CmpLTU, CmpLTU_FTZ>;
defm FSetUGE : FSET_FORMAT<setuge, CmpGEU, CmpGEU_FTZ>;
defm FSetULE : FSET_FORMAT<setule, CmpLEU, CmpLEU_FTZ>;
defm FSetUEQ : FSET_FORMAT<setueq, CmpEQU, CmpEQU_FTZ>;
defm FSetUNE : FSET_FORMAT<setune, CmpNEU, CmpNEU_FTZ>;

// Condition codes without an ordered/unordered prefix map to the same
// compare modes as the ordered ones.
defm FSetGT : FSET_FORMAT<setgt, CmpGT, CmpGT_FTZ>;
defm FSetLT : FSET_FORMAT<setlt, CmpLT, CmpLT_FTZ>;
defm FSetGE : FSET_FORMAT<setge, CmpGE, CmpGE_FTZ>;
defm FSetLE : FSET_FORMAT<setle, CmpLE, CmpLE_FTZ>;
defm FSetEQ : FSET_FORMAT<seteq, CmpEQ, CmpEQ_FTZ>;
defm FSetNE : FSET_FORMAT<setne, CmpNE, CmpNE_FTZ>;

// seto / setuo map to the NUM / NAN compare modes.
defm FSetNUM : FSET_FORMAT<seto, CmpNUM, CmpNUM_FTZ>;
defm FSetNAN : FSET_FORMAT<setuo, CmpNAN, CmpNAN_FTZ>;

// FIXME: What is this doing here?  Can it be deleted?
// def ld_param         : SDNode<"NVPTXISD::LOAD_PARAM", SDTLoad,
//                         [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;

// Type profiles for the NVPTX-specific SelectionDAG nodes declared below.
// Each is SDTypeProfile<number of results, number of operands, constraints>.
def SDTDeclareParamProfile
    : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>;
def SDTDeclareScalarParamProfile
    : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>;
def SDTLoadParamProfile
    : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>;
def SDTLoadParamV2Profile
    : SDTypeProfile<2, 2, [SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisInt<3>]>;
def SDTLoadParamV4Profile
    : SDTypeProfile<4, 2, [SDTCisInt<4>, SDTCisInt<5>]>;
def SDTPrintCallProfile
    : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
def SDTPrintCallUniProfile
    : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
def SDTStoreParamProfile
    : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;
def SDTStoreParamV2Profile
    : SDTypeProfile<0, 4, [SDTCisInt<0>, SDTCisInt<1>]>;
def SDTStoreParamV4Profile
    : SDTypeProfile<0, 6, [SDTCisInt<0>, SDTCisInt<1>]>;
def SDTStoreParam32Profile
    : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;
def SDTCallArgProfile        : SDTypeProfile<0, 2, [SDTCisInt<0>]>;
def SDTCallArgMarkProfile    : SDTypeProfile<0, 0, []>;
def SDTCallVoidProfile       : SDTypeProfile<0, 1, []>;
def SDTCallValProfile        : SDTypeProfile<1, 0, []>;
def SDTMoveParamProfile      : SDTypeProfile<1, 1, []>;
def SDTStoreRetvalProfile    : SDTypeProfile<0, 2, [SDTCisInt<0>]>;
def SDTStoreRetvalV2Profile  : SDTypeProfile<0, 3, [SDTCisInt<0>]>;
def SDTStoreRetvalV4Profile  : SDTypeProfile<0, 5, [SDTCisInt<0>]>;
def SDTPseudoUseParamProfile : SDTypeProfile<0, 1, []>;
def SDTProxyRegProfile       : SDTypeProfile<1, 1, []>;

// SelectionDAG nodes.  The first string is the NVPTXISD opcode each record
// stands for; the list gives the node properties.

// Parameter / return-value declarations.
def DeclareParam
    : SDNode<"NVPTXISD::DeclareParam", SDTDeclareParamProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def DeclareScalarParam
    : SDNode<"NVPTXISD::DeclareScalarParam", SDTDeclareScalarParamProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def DeclareRetParam
    : SDNode<"NVPTXISD::DeclareRetParam", SDTDeclareParamProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def DeclareRet
    : SDNode<"NVPTXISD::DeclareRet", SDTDeclareScalarParamProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;

// Parameter loads (scalar, two-element and four-element forms).
def LoadParam
    : SDNode<"NVPTXISD::LoadParam", SDTLoadParamProfile,
             [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
def LoadParamV2
    : SDNode<"NVPTXISD::LoadParamV2", SDTLoadParamV2Profile,
             [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
def LoadParamV4
    : SDNode<"NVPTXISD::LoadParamV4", SDTLoadParamV4Profile,
             [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;

// Nodes that print the call opcode itself.
def PrintCall
    : SDNode<"NVPTXISD::PrintCall", SDTPrintCallProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def PrintConvergentCall
    : SDNode<"NVPTXISD::PrintConvergentCall", SDTPrintCallProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def PrintCallUni
    : SDNode<"NVPTXISD::PrintCallUni", SDTPrintCallUniProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def PrintConvergentCallUni
    : SDNode<"NVPTXISD::PrintConvergentCallUni", SDTPrintCallUniProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;

// Parameter stores.
def StoreParam
    : SDNode<"NVPTXISD::StoreParam", SDTStoreParamProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def StoreParamV2
    : SDNode<"NVPTXISD::StoreParamV2", SDTStoreParamV2Profile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def StoreParamV4
    : SDNode<"NVPTXISD::StoreParamV4", SDTStoreParamV4Profile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def StoreParamU32
    : SDNode<"NVPTXISD::StoreParamU32", SDTStoreParam32Profile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def StoreParamS32
    : SDNode<"NVPTXISD::StoreParamS32", SDTStoreParam32Profile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;

// Pieces of a call's argument list and callee.
def CallArgBegin
    : SDNode<"NVPTXISD::CallArgBegin", SDTCallArgMarkProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def CallArg
    : SDNode<"NVPTXISD::CallArg", SDTCallArgProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def LastCallArg
    : SDNode<"NVPTXISD::LastCallArg", SDTCallArgProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def CallArgEnd
    : SDNode<"NVPTXISD::CallArgEnd", SDTCallVoidProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def CallVoid
    : SDNode<"NVPTXISD::CallVoid", SDTCallVoidProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def Prototype
    : SDNode<"NVPTXISD::Prototype", SDTCallVoidProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def CallVal
    : SDNode<"NVPTXISD::CallVal", SDTCallValProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;

// MoveParam is the only node here with no properties.
def MoveParam
    : SDNode<"NVPTXISD::MoveParam", SDTMoveParamProfile, []>;

// Return-value stores are chained side effects without glue.
def StoreRetval
    : SDNode<"NVPTXISD::StoreRetval", SDTStoreRetvalProfile,
             [SDNPHasChain, SDNPSideEffect]>;
def StoreRetvalV2
    : SDNode<"NVPTXISD::StoreRetvalV2", SDTStoreRetvalV2Profile,
             [SDNPHasChain, SDNPSideEffect]>;
def StoreRetvalV4
    : SDNode<"NVPTXISD::StoreRetvalV4", SDTStoreRetvalV4Profile,
             [SDNPHasChain, SDNPSideEffect]>;

def PseudoUseParam
    : SDNode<"NVPTXISD::PseudoUseParam", SDTPseudoUseParamProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def RETURNNode
    : SDNode<"NVPTXISD::RETURN", SDTCallArgMarkProfile,
             [SDNPHasChain, SDNPSideEffect]>;
def ProxyReg
    : SDNode<"NVPTXISD::ProxyReg", SDTProxyRegProfile,
             [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;

// Loads of one, two or four values from retval0 at byte offset $b.
// opstr is the type suffix spliced in after the ld.param opcode.
let mayLoad = true in {
  class LoadParamMemInst<NVPTXRegClass regclass, string opstr>
      : NVPTXInst<(outs regclass:$dst), (ins i32imm:$b),
                  "ld.param" # opstr # " \t$dst, [retval0+$b];",
                  []>;

  class LoadParamV2MemInst<NVPTXRegClass regclass, string opstr>
      : NVPTXInst<(outs regclass:$dst, regclass:$dst2), (ins i32imm:$b),
                  "ld.param.v2" # opstr # " \t{{$dst, $dst2}}, [retval0+$b];",
                  []>;

  class LoadParamV4MemInst<NVPTXRegClass regclass, string opstr>
      : NVPTXInst<(outs regclass:$dst, regclass:$dst2, regclass:$dst3,
                        regclass:$dst4),
                  (ins i32imm:$b),
                  "ld.param.v4" # opstr
                      # " \t{{$dst, $dst2, $dst3, $dst4}}, [retval0+$b];",
                  []>;
}

// Register form of a parameter load: a mov from retval$b, selected for
// LoadParam whose first operand is the constant 0.
class LoadParamRegInst<NVPTXRegClass regclass, string opstr>
    : NVPTXInst<(outs regclass:$dst), (ins i32imm:$b),
                "mov" # opstr # " \t$dst, retval$b;",
                [(set regclass:$dst, (LoadParam (i32 0), (i32 imm:$b)))]>;

// Stores to an outgoing parameter (param$a at offset $b) and to the function
// return value (func_retval0 at offset $a), in one-, two- and four-value
// forms.  opstr is the type suffix spliced in after the st.param opcode.
let mayStore = true in {
  class StoreParamInst<NVPTXRegClass regclass, string opstr>
      : NVPTXInst<(outs), (ins regclass:$val, i32imm:$a, i32imm:$b),
                  "st.param" # opstr # " \t[param$a+$b], $val;",
                  []>;

  class StoreParamV2Inst<NVPTXRegClass regclass, string opstr>
      : NVPTXInst<(outs),
                  (ins regclass:$val, regclass:$val2, i32imm:$a, i32imm:$b),
                  "st.param.v2" # opstr # " \t[param$a+$b], {{$val, $val2}};",
                  []>;

  class StoreParamV4Inst<NVPTXRegClass regclass, string opstr>
      : NVPTXInst<(outs),
                  (ins regclass:$val, regclass:$val2, regclass:$val3,
                       regclass:$val4, i32imm:$a, i32imm:$b),
                  "st.param.v4" # opstr
                      # " \t[param$a+$b], {{$val, $val2, $val3, $val4}};",
                  []>;

  class StoreRetvalInst<NVPTXRegClass regclass, string opstr>
      : NVPTXInst<(outs), (ins regclass:$val, i32imm:$a),
                  "st.param" # opstr # " \t[func_retval0+$a], $val;",
                  []>;

  class StoreRetvalV2Inst<NVPTXRegClass regclass, string opstr>
      : NVPTXInst<(outs), (ins regclass:$val, regclass:$val2, i32imm:$a),
                  "st.param.v2" # opstr
                      # " \t[func_retval0+$a], {{$val, $val2}};",
                  []>;

  class StoreRetvalV4Inst<NVPTXRegClass regclass, string opstr>
      : NVPTXInst<(outs),
                  (ins regclass:$val, regclass:$val2, regclass:$val3,
                       regclass:$val4, i32imm:$a),
                  "st.param.v4" # opstr
                      # " \t[func_retval0+$a], {{$val, $val2, $val3, $val4}};",
                  []>;
}

// Instructions that print the call opcode followed by the list of return
// value names.  The i32 operand of OpNode picks the variant: 0 prints no
// return list, N in 1..8 prints (retval0, ..., retval<N-1>).
let isCall = 1 in {
  multiclass CALL<string OpcStr, SDNode OpNode> {
    def PrintCallNoRetInst
        : NVPTXInst<(outs), (ins),
                    !strconcat(OpcStr, " "),
                    [(OpNode (i32 0))]>;
    def PrintCallRetInst1
        : NVPTXInst<(outs), (ins),
                    !strconcat(OpcStr, " (retval0), "),
                    [(OpNode (i32 1))]>;
    def PrintCallRetInst2
        : NVPTXInst<(outs), (ins),
                    !strconcat(OpcStr, " (retval0, retval1), "),
                    [(OpNode (i32 2))]>;
    def PrintCallRetInst3
        : NVPTXInst<(outs), (ins),
                    !strconcat(OpcStr, " (retval0, retval1, retval2), "),
                    [(OpNode (i32 3))]>;
    def PrintCallRetInst4
        : NVPTXInst<(outs), (ins),
                    !strconcat(OpcStr,
                               " (retval0, retval1, retval2, retval3), "),
                    [(OpNode (i32 4))]>;
    def PrintCallRetInst5
        : NVPTXInst<(outs), (ins),
                    !strconcat(OpcStr,
                               " (retval0, retval1, retval2, retval3, retval4), "),
                    [(OpNode (i32 5))]>;
    def PrintCallRetInst6
        : NVPTXInst<(outs), (ins),
                    !strconcat(OpcStr,
                               " (retval0, retval1, retval2, retval3, retval4, "
                               "retval5), "),
                    [(OpNode (i32 6))]>;
    def PrintCallRetInst7
        : NVPTXInst<(outs), (ins),
                    !strconcat(OpcStr,
                               " (retval0, retval1, retval2, retval3, retval4, "
                               "retval5, retval6), "),
                    [(OpNode (i32 7))]>;
    def PrintCallRetInst8
        : NVPTXInst<(outs), (ins),
                    !strconcat(OpcStr,
                               " (retval0, retval1, retval2, retval3, retval4, "
                               "retval5, retval6, retval7), "),
                    [(OpNode (i32 8))]>;
  }
}

defm Call : CALL<"call", PrintCall>;
defm CallUni : CALL<"call.uni", PrintCallUni>;

// Convergent call instructions.  These are identical to regular calls, except
// they have the isConvergent bit set.
let isConvergent = 1 in {
  defm ConvergentCall : CALL<"call", PrintConvergentCall>;
  defm ConvergentCallUni : CALL<"call.uni", PrintConvergentCallUni>;
}

// Parameter loads per value type.  The I8 forms use Int16Regs with the .b8
// suffix; f16 uses .b16 and f16x2 uses .b32.
def LoadParamMemI64    : LoadParamMemInst<Int64Regs, ".b64">;
def LoadParamMemI32    : LoadParamMemInst<Int32Regs, ".b32">;
def LoadParamMemI16    : LoadParamMemInst<Int16Regs, ".b16">;
def LoadParamMemI8     : LoadParamMemInst<Int16Regs, ".b8">;
def LoadParamMemV2I64  : LoadParamV2MemInst<Int64Regs, ".b64">;
def LoadParamMemV2I32  : LoadParamV2MemInst<Int32Regs, ".b32">;
def LoadParamMemV2I16  : LoadParamV2MemInst<Int16Regs, ".b16">;
def LoadParamMemV2I8   : LoadParamV2MemInst<Int16Regs, ".b8">;
def LoadParamMemV4I32  : LoadParamV4MemInst<Int32Regs, ".b32">;
def LoadParamMemV4I16  : LoadParamV4MemInst<Int16Regs, ".b16">;
def LoadParamMemV4I8   : LoadParamV4MemInst<Int16Regs, ".b8">;
def LoadParamMemF16    : LoadParamMemInst<Float16Regs, ".b16">;
def LoadParamMemF16x2  : LoadParamMemInst<Float16x2Regs, ".b32">;
def LoadParamMemF32    : LoadParamMemInst<Float32Regs, ".f32">;
def LoadParamMemF64    : LoadParamMemInst<Float64Regs, ".f64">;
def LoadParamMemV2F16  : LoadParamV2MemInst<Float16Regs, ".b16">;
def LoadParamMemV2F16x2: LoadParamV2MemInst<Float16x2Regs, ".b32">;
def LoadParamMemV2F32  : LoadParamV2MemInst<Float32Regs, ".f32">;
def LoadParamMemV2F64  : LoadParamV2MemInst<Float64Regs, ".f64">;
def LoadParamMemV4F16  : LoadParamV4MemInst<Float16Regs, ".b16">;
def LoadParamMemV4F16x2: LoadParamV4MemInst<Float16x2Regs, ".b32">;
def LoadParamMemV4F32  : LoadParamV4MemInst<Float32Regs, ".f32">;

// Parameter stores per value type, using the same suffix scheme as the loads.
def StoreParamI64    : StoreParamInst<Int64Regs, ".b64">;
def StoreParamI32    : StoreParamInst<Int32Regs, ".b32">;

def StoreParamI16    : StoreParamInst<Int16Regs, ".b16">;
def StoreParamI8     : StoreParamInst<Int16Regs, ".b8">;
def StoreParamV2I64  : StoreParamV2Inst<Int64Regs, ".b64">;
def StoreParamV2I32  : StoreParamV2Inst<Int32Regs, ".b32">;
def StoreParamV2I16  : StoreParamV2Inst<Int16Regs, ".b16">;
def StoreParamV2I8   : StoreParamV2Inst<Int16Regs, ".b8">;

def StoreParamV4I32  : StoreParamV4Inst<Int32Regs, ".b32">;
def StoreParamV4I16  : StoreParamV4Inst<Int16Regs, ".b16">;
def StoreParamV4I8   : StoreParamV4Inst<Int16Regs, ".b8">;

def StoreParamF16      : StoreParamInst<Float16Regs, ".b16">;
def StoreParamF16x2    : StoreParamInst<Float16x2Regs, ".b32">;
def StoreParamF32      : StoreParamInst<Float32Regs, ".f32">;
def StoreParamF64      : StoreParamInst<Float64Regs, ".f64">;
def StoreParamV2F16    : StoreParamV2Inst<Float16Regs, ".b16">;
def StoreParamV2F16x2  : StoreParamV2Inst<Float16x2Regs, ".b32">;
def StoreParamV2F32    : StoreParamV2Inst<Float32Regs, ".f32">;
def StoreParamV2F64    : StoreParamV2Inst<Float64Regs, ".f64">;
def StoreParamV4F16    : StoreParamV4Inst<Float16Regs, ".b16">;
def StoreParamV4F16x2  : StoreParamV4Inst<Float16x2Regs, ".b32">;
def StoreParamV4F32    : StoreParamV4Inst<Float32Regs, ".f32">;
// Return-value stores per value type.  The string is the type suffix passed
// to the StoreRetval* classes.  The I8 forms use Int16Regs with the .b8
// suffix; f16 uses .b16 and f16x2 uses .b32.
def StoreRetvalI64    : StoreRetvalInst<Int64Regs, ".b64">;
def StoreRetvalI32    : StoreRetvalInst<Int32Regs, ".b32">;
def StoreRetvalI16    : StoreRetvalInst<Int16Regs, ".b16">;
def StoreRetvalI8     : StoreRetvalInst<Int16Regs, ".b8">;
def StoreRetvalV2I64  : StoreRetvalV2Inst<Int64Regs, ".b64">;
def StoreRetvalV2I32  : StoreRetvalV2Inst<Int32Regs, ".b32">;
def StoreRetvalV2I16  : StoreRetvalV2Inst<Int16Regs, ".b16">;
def StoreRetvalV2I8   : StoreRetvalV2Inst<Int16Regs, ".b8">;
def StoreRetvalV4I32  : StoreRetvalV4Inst<Int32Regs, ".b32">;
def StoreRetvalV4I16  : StoreRetvalV4Inst<Int16Regs, ".b16">;
def StoreRetvalV4I8   : StoreRetvalV4Inst<Int16Regs, ".b8">;

def StoreRetvalF64    : StoreRetvalInst<Float64Regs, ".f64">;
def StoreRetvalF32    : StoreRetvalInst<Float32Regs, ".f32">;
def StoreRetvalF16    : StoreRetvalInst<Float16Regs, ".b16">;
def StoreRetvalF16x2  : StoreRetvalInst<Float16x2Regs, ".b32">;
def StoreRetvalV2F64  : StoreRetvalV2Inst<Float64Regs, ".f64">;
def StoreRetvalV2F32  : StoreRetvalV2Inst<Float32Regs, ".f32">;
def StoreRetvalV2F16  : StoreRetvalV2Inst<Float16Regs, ".b16">;
def StoreRetvalV2F16x2: StoreRetvalV2Inst<Float16x2Regs, ".b32">;
def StoreRetvalV4F32  : StoreRetvalV4Inst<Float32Regs, ".f32">;
def StoreRetvalV4F16  : StoreRetvalV4Inst<Float16Regs, ".b16">;
def StoreRetvalV4F16x2: StoreRetvalV4Inst<Float16x2Regs, ".b32">;

// Punctuation of a call's argument list.  CallArgEnd with operand 1 prints
// ");" and with operand 0 prints ")" (no terminating semicolon).
def CallArgBeginInst : NVPTXInst<(outs), (ins), "(", [(CallArgBegin)]>;
def CallArgEndInst1  : NVPTXInst<(outs), (ins), ");", [(CallArgEnd (i32 1))]>;
def CallArgEndInst0  : NVPTXInst<(outs), (ins), ")", [(CallArgEnd (i32 0))]>;
def RETURNInst       : NVPTXInst<(outs), (ins), "ret;", [(RETURNNode)]>;

// A register call argument followed by a separator.  Matches CallArg whose
// first operand is the constant 0.
class CallArgInst<NVPTXRegClass regclass> :
  NVPTXInst<(outs), (ins regclass:$a), "$a, ",
            [(CallArg (i32 0), regclass:$a)]>;

// The final register call argument: same as CallArgInst without the trailing
// separator.
class LastCallArgInst<NVPTXRegClass regclass> :
  NVPTXInst<(outs), (ins regclass:$a), "$a",
            [(LastCallArg (i32 0), regclass:$a)]>;

def CallArgI64 : CallArgInst<Int64Regs>;
def CallArgI32 : CallArgInst<Int32Regs>;
def CallArgI16 : CallArgInst<Int16Regs>;
def CallArgF64 : CallArgInst<Float64Regs>;
def CallArgF32 : CallArgInst<Float32Regs>;

def LastCallArgI64 : LastCallArgInst<Int64Regs>;
def LastCallArgI32 : LastCallArgInst<Int32Regs>;
def LastCallArgI16 : LastCallArgInst<Int16Regs>;
def LastCallArgF64 : LastCallArgInst<Float64Regs>;
def LastCallArgF32 : LastCallArgInst<Float32Regs>;

// Immediate i32 call arguments (first CallArg operand 0).
def CallArgI32imm : NVPTXInst<(outs), (ins i32imm:$a), "$a, ",
                              [(CallArg (i32 0), (i32 imm:$a))]>;
def LastCallArgI32imm : NVPTXInst<(outs), (ins i32imm:$a), "$a",
                                  [(LastCallArg (i32 0), (i32 imm:$a))]>;

// Call arguments that name a declared parameter: first CallArg operand 1,
// printed as param<N>.
def CallArgParam : NVPTXInst<(outs), (ins i32imm:$a), "param$a, ",
                             [(CallArg (i32 1), (i32 imm:$a))]>;
def LastCallArgParam : NVPTXInst<(outs), (ins i32imm:$a), "param$a",
                                 [(LastCallArg (i32 1), (i32 imm:$a))]>;

// The call target: a wrapped global address, or an address held in a 32- or
// 64-bit register.
def CallVoidInst :      NVPTXInst<(outs), (ins imem:$addr), "$addr, ",
                                  [(CallVoid (Wrapper tglobaladdr:$addr))]>;
def CallVoidInstReg :   NVPTXInst<(outs), (ins Int32Regs:$addr), "$addr, ",
                                  [(CallVoid Int32Regs:$addr)]>;
def CallVoidInstReg64 : NVPTXInst<(outs), (ins Int64Regs:$addr), "$addr, ",
                                  [(CallVoid Int64Regs:$addr)]>;
def PrototypeInst :     NVPTXInst<(outs), (ins i32imm:$val),
                                  ", prototype_$val;",
                                  [(Prototype (i32 imm:$val))]>;

// Return-value declarations.  DeclareRetParam declares an aligned byte array;
// DeclareRet declares a scalar, as .param when its first operand is 1 and as
// .reg when it is 2.
def DeclareRetMemInst :
  NVPTXInst<(outs), (ins i32imm:$align, i32imm:$size, i32imm:$num),
            ".param .align $align .b8 retval$num[$size];",
            [(DeclareRetParam (i32 imm:$align), (i32 imm:$size),
                              (i32 imm:$num))]>;
def DeclareRetScalarInst :
  NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num),
            ".param .b$size retval$num;",
            [(DeclareRet (i32 1), (i32 imm:$size), (i32 imm:$num))]>;
def DeclareRetRegInst :
  NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num),
            ".reg .b$size retval$num;",
            [(DeclareRet (i32 2), (i32 imm:$size), (i32 imm:$num))]>;

// Outgoing parameter declarations.  DeclareParam declares an aligned byte
// array; DeclareScalarParam declares a scalar, as .param when its last
// operand is 0 and as .reg when it is 1.
def DeclareParamInst :
  NVPTXInst<(outs), (ins i32imm:$align, i32imm:$a, i32imm:$size),
            ".param .align $align .b8 param$a[$size];",
            [(DeclareParam (i32 imm:$align), (i32 imm:$a), (i32 imm:$size))]>;
def DeclareScalarParamInst :
  NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size),
            ".param .b$size param$a;",
            [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 0))]>;
def DeclareScalarRegInst :
  NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size),
            ".reg .b$size param$a;",
            [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 1))]>;

// MoveParam on a register operand: a mov with the given type suffix.
class MoveParamInst<NVPTXRegClass regclass, string asmstr> :
  NVPTXInst<(outs regclass:$dst), (ins regclass:$src),
            !strconcat("mov", asmstr, " \t$dst, $src;"),
            [(set regclass:$dst, (MoveParam regclass:$src))]>;

// MoveParam on an external symbol; srcty is the operand type that carries
// the symbol.
class MoveParamSymbolInst<NVPTXRegClass regclass, Operand srcty,
                          string asmstr> :
  NVPTXInst<(outs regclass:$dst), (ins srcty:$src),
            !strconcat("mov", asmstr, " \t$dst, $src;"),
            [(set regclass:$dst, (MoveParam texternalsym:$src))]>;

def MoveParamI64 : MoveParamInst<Int64Regs, ".b64">;
def MoveParamI32 : MoveParamInst<Int32Regs, ".b32">;

def MoveParamSymbolI64 : MoveParamSymbolInst<Int64Regs, i64imm, ".b64">;
def MoveParamSymbolI32 : MoveParamSymbolInst<Int32Regs, i32imm, ".b32">;

// The 16-bit form is not built from MoveParamInst; it prints a cvt.u16.u32.
// NOTE(review): the source operand is an Int16Regs register although the
// instruction names a u32 source type -- confirm this is intended.
def MoveParamI16 :
  NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
            "cvt.u16.u32 \t$dst, $src;",
            [(set Int16Regs:$dst, (MoveParam Int16Regs:$src))]>;
def MoveParamF64 : MoveParamInst<Float64Regs, ".f64">;
def MoveParamF32 : MoveParamInst<Float32Regs, ".f32">;
// PTX has no mov.f16, so f16 values are moved as .b16 (FMOV16rr and
// ProxyRegF16 in this file do the same).
def MoveParamF16 : MoveParamInst<Float16Regs, ".b16">;

// Prints only an assembly comment mentioning $src; matches PseudoUseParam.
class PseudoUseParamInst<NVPTXRegClass regclass> :
  NVPTXInst<(outs), (ins regclass:$src),
            "// Pseudo use of $src",
            [(PseudoUseParam regclass:$src)]>;

def PseudoUseParamI64 : PseudoUseParamInst<Int64Regs>;
def PseudoUseParamI32 : PseudoUseParamInst<Int32Regs>;
def PseudoUseParamI16 : PseudoUseParamInst<Int16Regs>;
def PseudoUseParamF64 : PseudoUseParamInst<Float64Regs>;
def PseudoUseParamF32 : PseudoUseParamInst<Float32Regs>;

// ProxyReg: a same-class register copy printed as mov.<SzStr>.
class ProxyRegInst<string SzStr, NVPTXRegClass regclass> :
  NVPTXInst<(outs regclass:$dst), (ins regclass:$src),
            !strconcat("mov.", SzStr, " \t$dst, $src;"),
            [(set regclass:$dst, (ProxyReg regclass:$src))]>;

let isCodeGenOnly=1, isPseudo=1 in {
  def ProxyRegI1    : ProxyRegInst<"pred", Int1Regs>;
  def ProxyRegI16   : ProxyRegInst<"b16",  Int16Regs>;
  def ProxyRegI32   : ProxyRegInst<"b32",  Int32Regs>;
  def ProxyRegI64   : ProxyRegInst<"b64",  Int64Regs>;
  def ProxyRegF16   : ProxyRegInst<"b16",  Float16Regs>;
  def ProxyRegF32   : ProxyRegInst<"f32",  Float32Regs>;
  def ProxyRegF64   : ProxyRegInst<"f64",  Float64Regs>;
  def ProxyRegF16x2 : ProxyRegInst<"b32",  Float16x2Regs>;
}

//
// Load / Store Handling
//

// Scalar loads into regclass.  Every variant takes the same modifier
// operands (isVol, addsp, Vec, Sign, each printed through printLdStCode with
// the named modifier) and the access width, and differs only in how the
// address is given:
//   _avar    symbol                 _areg    32-bit register
//   _areg_64 64-bit register        _ari     32-bit register + offset
//   _ari_64  64-bit register + offset
//   _asi     symbol + offset
// None of the variants carries a selection pattern.
multiclass LD<NVPTXRegClass regclass> {
  def _avar : NVPTXInst<
    (outs regclass:$dst),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, imem:$addr),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t$dst, [$addr];", []>;
  def _areg : NVPTXInst<
    (outs regclass:$dst),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, Int32Regs:$addr),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t$dst, [$addr];", []>;
  def _areg_64 : NVPTXInst<
    (outs regclass:$dst),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, Int64Regs:$addr),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t$dst, [$addr];", []>;
  def _ari : NVPTXInst<
    (outs regclass:$dst),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t$dst, [$addr+$offset];", []>;
  def _ari_64 : NVPTXInst<
    (outs regclass:$dst),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
         LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t$dst, [$addr+$offset];", []>;
  def _asi : NVPTXInst<
    (outs regclass:$dst),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
         LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t$dst, [$addr+$offset];", []>;
}

// i8 loads use Int16Regs, like i16.
let mayLoad=1, hasSideEffects=0 in {
  defm LD_i8  : LD<Int16Regs>;
  defm LD_i16 : LD<Int16Regs>;
  defm LD_i32 : LD<Int32Regs>;
  defm LD_i64 : LD<Int64Regs>;
  defm LD_f16 : LD<Float16Regs>;
  defm LD_f16x2 : LD<Float16x2Regs>;
  defm LD_f32 : LD<Float32Regs>;
  defm LD_f64 : LD<Float64Regs>;
}

// Scalar stores from regclass.  The variants mirror those of LD: the same
// modifier operands plus the access width, with the six addressing forms
// _avar, _areg, _areg_64, _ari, _ari_64 and _asi.  None carries a selection
// pattern.
multiclass ST<NVPTXRegClass regclass> {
  def _avar : NVPTXInst<
    (outs),
    (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
         LdStCode:$Sign, i32imm:$toWidth, imem:$addr),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
    " \t[$addr], $src;", []>;
  def _areg : NVPTXInst<
    (outs),
    (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp,
         LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
    " \t[$addr], $src;", []>;
  def _areg_64 : NVPTXInst<
    (outs),
    (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
         LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
    " \t[$addr], $src;", []>;
  def _ari : NVPTXInst<
    (outs),
    (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
         LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr, i32imm:$offset),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
    " \t[$addr+$offset], $src;", []>;
  def _ari_64 : NVPTXInst<
    (outs),
    (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
         LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr, i32imm:$offset),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
    " \t[$addr+$offset], $src;", []>;
  def _asi : NVPTXInst<
    (outs),
    (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
         LdStCode:$Sign, i32imm:$toWidth, imem:$addr, i32imm:$offset),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
    " \t[$addr+$offset], $src;", []>;
}

// i8 stores use Int16Regs, like i16.
let mayStore=1, hasSideEffects=0 in {
  defm ST_i8  : ST<Int16Regs>;
  defm ST_i16 : ST<Int16Regs>;
  defm ST_i32 : ST<Int32Regs>;
  defm ST_i64 : ST<Int64Regs>;
  defm ST_f16 : ST<Float16Regs>;
  defm ST_f16x2 : ST<Float16x2Regs>;
  defm ST_f32 : ST<Float32Regs>;
  defm ST_f64 : ST<Float64Regs>;
}

// The following is used only in and after vector elementizations.  Vector
// elementization happens at the machine instruction level, so the following
// instructions never appear in the DAG.
// Vector loads producing two (_v2_*) or four (_v4_*) `regclass` results.  The
// input operands and addressing forms are the same as for the scalar loads:
// four LdStCode operands, the i32 $fromWidth immediate, then the address.
multiclass LD_VEC<NVPTXRegClass regclass> {
  // ---- two-element forms ----
  def _v2_avar : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, imem:$addr),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t{{$dst1, $dst2}}, [$addr];", []>;
  def _v2_areg : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, Int32Regs:$addr),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t{{$dst1, $dst2}}, [$addr];", []>;
  def _v2_areg_64 : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, Int64Regs:$addr),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t{{$dst1, $dst2}}, [$addr];", []>;
  def _v2_ari : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
  def _v2_ari_64 : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
  def _v2_asi : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, imem:$addr, i32imm:$offset),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t{{$dst1, $dst2}}, [$addr+$offset];", []>;

  // ---- four-element forms ----
  def _v4_avar : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, imem:$addr),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
  def _v4_areg : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, Int32Regs:$addr),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
  def _v4_areg_64 : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, Int64Regs:$addr),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
  def _v4_ari : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
  def _v4_ari_64 : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
  def _v4_asi : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, imem:$addr, i32imm:$offset),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
}
let mayLoad=1, hasSideEffects=0 in {
  defm LDV_i8    : LD_VEC<Int16Regs>;
  defm LDV_i16   : LD_VEC<Int16Regs>;
  defm LDV_i32   : LD_VEC<Int32Regs>;
  defm LDV_i64   : LD_VEC<Int64Regs>;
  defm LDV_f16   : LD_VEC<Float16Regs>;
  defm LDV_f16x2 : LD_VEC<Float16x2Regs>;
  defm LDV_f32   : LD_VEC<Float32Regs>;
  defm LDV_f64   : LD_VEC<Float64Regs>;
}

// Vector stores of two (_v2_*) or four (_v4_*) `regclass` values.  The values
// come first, followed by the four LdStCode operands, the i32 width immediate
// (named $fromWidth here, unlike the scalar ST's $toWidth) and the address.
multiclass ST_VEC<NVPTXRegClass regclass> {
  // ---- two-element forms ----
  def _v2_avar : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
         LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t[$addr], {{$src1, $src2}};", []>;
  def _v2_areg : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
         LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t[$addr], {{$src1, $src2}};", []>;
  def _v2_areg_64 : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
         LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t[$addr], {{$src1, $src2}};", []>;
  def _v2_ari : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
         LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr,
         i32imm:$offset),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t[$addr+$offset], {{$src1, $src2}};", []>;
  def _v2_ari_64 : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
         LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr,
         i32imm:$offset),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t[$addr+$offset], {{$src1, $src2}};", []>;
  def _v2_asi : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
         LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr,
         i32imm:$offset),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t[$addr+$offset], {{$src1, $src2}};", []>;

  // ---- four-element forms ----
  def _v4_avar : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
         LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, imem:$addr),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
  def _v4_areg : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
         LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, Int32Regs:$addr),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
  def _v4_areg_64 : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
         LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, Int64Regs:$addr),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
  def _v4_ari : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
         LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
  def _v4_ari_64 : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
         LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
  def _v4_asi : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
         LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, imem:$addr, i32imm:$offset),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}"
    "$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
}

let mayStore=1, hasSideEffects=0 in {
  defm STV_i8    : ST_VEC<Int16Regs>;
  defm STV_i16   : ST_VEC<Int16Regs>;
  defm STV_i32   : ST_VEC<Int32Regs>;
  defm STV_i64   : ST_VEC<Int64Regs>;
  defm STV_f16   : ST_VEC<Float16Regs>;
  defm STV_f16x2 : ST_VEC<Float16x2Regs>;
  defm STV_f32   : ST_VEC<Float32Regs>;
  defm STV_f64   : ST_VEC<Float64Regs>;
}

//---- Conversion ----

// "mov.b<SzStr>" between two register classes; selected for `bitconvert`.
class F_BITCONVERT<string SzStr, NVPTXRegClass regclassIn,
                   NVPTXRegClass regclassOut>
  : NVPTXInst<(outs regclassOut:$d), (ins regclassIn:$a),
              "mov.b" # SzStr # " \t$d, $a;",
              [(set regclassOut:$d, (bitconvert regclassIn:$a))]>;

def BITCONVERT_16_I2F     : F_BITCONVERT<"16", Int16Regs, Float16Regs>;
def BITCONVERT_16_F2I     : F_BITCONVERT<"16", Float16Regs, Int16Regs>;
def BITCONVERT_32_I2F     : F_BITCONVERT<"32", Int32Regs, Float32Regs>;
def BITCONVERT_32_F2I     : F_BITCONVERT<"32", Float32Regs, Int32Regs>;
def BITCONVERT_64_I2F     : F_BITCONVERT<"64", Int64Regs, Float64Regs>;
def BITCONVERT_64_F2I     : F_BITCONVERT<"64", Float64Regs, Int64Regs>;
def BITCONVERT_32_I2F16x2 : F_BITCONVERT<"32", Int32Regs, Float16x2Regs>;
def BITCONVERT_32_F16x22I : F_BITCONVERT<"32", Float16x2Regs, Int32Regs>;
def BITCONVERT_32_F2F16x2 : F_BITCONVERT<"32", Float32Regs, Float16x2Regs>;
def BITCONVERT_32_F16x22F : F_BITCONVERT<"32", Float16x2Regs, Float32Regs>;

// NOTE: pred->fp conversions are currently sub-optimal due to an issue in
// TableGen where we cannot specify floating-point literals in isel patterns.
// Therefore, we use an integer selp to select either 1 or 0 and then cvt the
// result to floating-point.

// sint -> f16
def : Pat<(f16 (sint_to_fp Int1Regs:$a)),
          (CVT_f16_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
def : Pat<(f16 (sint_to_fp Int16Regs:$a)),
          (CVT_f16_s16 Int16Regs:$a, CvtRN)>;
def : Pat<(f16 (sint_to_fp Int32Regs:$a)),
          (CVT_f16_s32 Int32Regs:$a, CvtRN)>;
def : Pat<(f16 (sint_to_fp Int64Regs:$a)),
          (CVT_f16_s64 Int64Regs:$a, CvtRN)>;

// uint -> f16
def : Pat<(f16 (uint_to_fp Int1Regs:$a)),
          (CVT_f16_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
def : Pat<(f16 (uint_to_fp Int16Regs:$a)),
          (CVT_f16_u16 Int16Regs:$a, CvtRN)>;
def : Pat<(f16 (uint_to_fp Int32Regs:$a)),
          (CVT_f16_u32 Int32Regs:$a, CvtRN)>;
def : Pat<(f16 (uint_to_fp Int64Regs:$a)),
          (CVT_f16_u64 Int64Regs:$a, CvtRN)>;

// sint -> f32
def : Pat<(f32 (sint_to_fp Int1Regs:$a)),
          (CVT_f32_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
def : Pat<(f32 (sint_to_fp Int16Regs:$a)),
          (CVT_f32_s16 Int16Regs:$a, CvtRN)>;
def : Pat<(f32 (sint_to_fp Int32Regs:$a)),
          (CVT_f32_s32 Int32Regs:$a, CvtRN)>;
def : Pat<(f32 (sint_to_fp Int64Regs:$a)),
          (CVT_f32_s64 Int64Regs:$a, CvtRN)>;

// uint -> f32
def : Pat<(f32 (uint_to_fp Int1Regs:$a)),
          (CVT_f32_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
def : Pat<(f32 (uint_to_fp Int16Regs:$a)),
          (CVT_f32_u16 Int16Regs:$a, CvtRN)>;
def : Pat<(f32 (uint_to_fp Int32Regs:$a)),
          (CVT_f32_u32 Int32Regs:$a, CvtRN)>;
def : Pat<(f32 (uint_to_fp Int64Regs:$a)),
          (CVT_f32_u64 Int64Regs:$a, CvtRN)>;

// sint -> f64
def : Pat<(f64 (sint_to_fp Int1Regs:$a)),
          (CVT_f64_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
def : Pat<(f64 (sint_to_fp Int16Regs:$a)),
          (CVT_f64_s16 Int16Regs:$a, CvtRN)>;
def : Pat<(f64 (sint_to_fp Int32Regs:$a)),
          (CVT_f64_s32 Int32Regs:$a, CvtRN)>;
def : Pat<(f64 (sint_to_fp Int64Regs:$a)),
          (CVT_f64_s64 Int64Regs:$a, CvtRN)>;

// uint -> f64
def : Pat<(f64 (uint_to_fp Int1Regs:$a)),
          (CVT_f64_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
def : Pat<(f64 (uint_to_fp Int16Regs:$a)),
          (CVT_f64_u16 Int16Regs:$a, CvtRN)>;
def : Pat<(f64 (uint_to_fp Int32Regs:$a)),
          (CVT_f64_u32 Int32Regs:$a, CvtRN)>;
def : Pat<(f64 (uint_to_fp Int64Regs:$a)),
          (CVT_f64_u64 Int64Regs:$a, CvtRN)>;


// NOTE(review): every fp -> i1 pattern below compares the raw bit pattern of
// the source against 0 with CmpEQ, so the predicate is set when all bits are
// zero -- confirm that this polarity is intended.

// f16 -> sint
def : Pat<(i1 (fp_to_sint Float16Regs:$a)),
          (SETP_b16ri (BITCONVERT_16_F2I Float16Regs:$a), 0, CmpEQ)>;
def : Pat<(i16 (fp_to_sint Float16Regs:$a)),
          (CVT_s16_f16 Float16Regs:$a, CvtRZI)>;
def : Pat<(i32 (fp_to_sint Float16Regs:$a)),
          (CVT_s32_f16 Float16Regs:$a, CvtRZI)>;
def : Pat<(i64 (fp_to_sint Float16Regs:$a)),
          (CVT_s64_f16 Float16Regs:$a, CvtRZI)>;

// f16 -> uint
def : Pat<(i1 (fp_to_uint Float16Regs:$a)),
          (SETP_b16ri (BITCONVERT_16_F2I Float16Regs:$a), 0, CmpEQ)>;
def : Pat<(i16 (fp_to_uint Float16Regs:$a)),
          (CVT_u16_f16 Float16Regs:$a, CvtRZI)>;
def : Pat<(i32 (fp_to_uint Float16Regs:$a)),
          (CVT_u32_f16 Float16Regs:$a, CvtRZI)>;
def : Pat<(i64 (fp_to_uint Float16Regs:$a)),
          (CVT_u64_f16 Float16Regs:$a, CvtRZI)>;

// f32 -> sint
// Each width has a doF32FTZ-guarded pattern using the _FTZ conversion mode,
// followed by an unguarded pattern using the plain mode.
def : Pat<(i1 (fp_to_sint Float32Regs:$a)),
          (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>;
def : Pat<(i16 (fp_to_sint Float32Regs:$a)),
          (CVT_s16_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(i16 (fp_to_sint Float32Regs:$a)),
          (CVT_s16_f32 Float32Regs:$a, CvtRZI)>;
def : Pat<(i32 (fp_to_sint Float32Regs:$a)),
          (CVT_s32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(i32 (fp_to_sint Float32Regs:$a)),
          (CVT_s32_f32 Float32Regs:$a, CvtRZI)>;
def : Pat<(i64 (fp_to_sint Float32Regs:$a)),
          (CVT_s64_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(i64 (fp_to_sint Float32Regs:$a)),
          (CVT_s64_f32 Float32Regs:$a, CvtRZI)>;

// f32 -> uint
def : Pat<(i1 (fp_to_uint Float32Regs:$a)),
          (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>;
def : Pat<(i16 (fp_to_uint Float32Regs:$a)),
          (CVT_u16_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(i16 (fp_to_uint Float32Regs:$a)),
          (CVT_u16_f32 Float32Regs:$a, CvtRZI)>;
def : Pat<(i32 (fp_to_uint Float32Regs:$a)),
          (CVT_u32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(i32 (fp_to_uint Float32Regs:$a)),
          (CVT_u32_f32 Float32Regs:$a, CvtRZI)>;
def : Pat<(i64 (fp_to_uint Float32Regs:$a)),
          (CVT_u64_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(i64 (fp_to_uint Float32Regs:$a)),
          (CVT_u64_f32 Float32Regs:$a, CvtRZI)>;

// f64 -> sint
def : Pat<(i1 (fp_to_sint Float64Regs:$a)),
          (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>;
def : Pat<(i16 (fp_to_sint Float64Regs:$a)),
          (CVT_s16_f64 Float64Regs:$a, CvtRZI)>;
def : Pat<(i32 (fp_to_sint Float64Regs:$a)),
          (CVT_s32_f64 Float64Regs:$a, CvtRZI)>;
def : Pat<(i64 (fp_to_sint Float64Regs:$a)),
          (CVT_s64_f64 Float64Regs:$a, CvtRZI)>;

// f64 -> uint
def : Pat<(i1 (fp_to_uint Float64Regs:$a)),
          (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>;
def : Pat<(i16 (fp_to_uint Float64Regs:$a)),
          (CVT_u16_f64 Float64Regs:$a, CvtRZI)>;
def : Pat<(i32 (fp_to_uint Float64Regs:$a)),
          (CVT_u32_f64 Float64Regs:$a, CvtRZI)>;
def : Pat<(i64 (fp_to_uint Float64Regs:$a)),
          (CVT_u64_f64 Float64Regs:$a, CvtRZI)>;

// sext i1
def : Pat<(i16 (sext Int1Regs:$a)),
          (SELP_s16ii -1, 0, Int1Regs:$a)>;
def : Pat<(i32 (sext Int1Regs:$a)),
          (SELP_s32ii -1, 0, Int1Regs:$a)>;
def : Pat<(i64 (sext Int1Regs:$a)),
          (SELP_s64ii -1, 0, Int1Regs:$a)>;

// zext i1
def : Pat<(i16 (zext Int1Regs:$a)),
          (SELP_u16ii 1, 0, Int1Regs:$a)>;
def : Pat<(i32 (zext Int1Regs:$a)),
          (SELP_u32ii 1, 0, Int1Regs:$a)>;
def : Pat<(i64 (zext Int1Regs:$a)),
          (SELP_u64ii 1, 0, Int1Regs:$a)>;

// anyext i1
def : Pat<(i16 (anyext Int1Regs:$a)),
          (SELP_u16ii -1, 0, Int1Regs:$a)>;
def : Pat<(i32 (anyext Int1Regs:$a)),
          (SELP_u32ii -1, 0, Int1Regs:$a)>;
def : Pat<(i64 (anyext Int1Regs:$a)),
          (SELP_u64ii -1, 0, Int1Regs:$a)>;

// sext i16
def : Pat<(i32 (sext Int16Regs:$a)),
          (CVT_s32_s16 Int16Regs:$a, CvtNONE)>;
def : Pat<(i64 (sext Int16Regs:$a)),
          (CVT_s64_s16 Int16Regs:$a, CvtNONE)>;

// zext i16
def : Pat<(i32 (zext Int16Regs:$a)),
          (CVT_u32_u16 Int16Regs:$a, CvtNONE)>;
def : Pat<(i64 (zext Int16Regs:$a)),
          (CVT_u64_u16 Int16Regs:$a, CvtNONE)>;

// anyext i16 (same instructions as zext i16)
def : Pat<(i32 (anyext Int16Regs:$a)),
          (CVT_u32_u16 Int16Regs:$a, CvtNONE)>;
def : Pat<(i64 (anyext Int16Regs:$a)),
          (CVT_u64_u16 Int16Regs:$a, CvtNONE)>;

// sext i32
def : Pat<(i64 (sext Int32Regs:$a)),
          (CVT_s64_s32 Int32Regs:$a, CvtNONE)>;

// zext i32
def : Pat<(i64 (zext Int32Regs:$a)),
          (CVT_u64_u32 Int32Regs:$a, CvtNONE)>;

// anyext i32 (same instruction as zext i32)
def : Pat<(i64 (anyext Int32Regs:$a)),
          (CVT_u64_u32 Int32Regs:$a, CvtNONE)>;


// truncate i64
// Truncation to i1 masks bit 0 and compares the result with 1.
def : Pat<(i32 (trunc Int64Regs:$a)),
          (CVT_u32_u64 Int64Regs:$a, CvtNONE)>;
def : Pat<(i16 (trunc Int64Regs:$a)),
          (CVT_u16_u64 Int64Regs:$a, CvtNONE)>;
def : Pat<(i1 (trunc Int64Regs:$a)),
          (SETP_b64ri (ANDb64ri Int64Regs:$a, 1), 1, CmpEQ)>;

// truncate i32
def : Pat<(i16 (trunc Int32Regs:$a)),
          (CVT_u16_u32 Int32Regs:$a, CvtNONE)>;
def : Pat<(i1 (trunc Int32Regs:$a)),
          (SETP_b32ri (ANDb32ri Int32Regs:$a, 1), 1, CmpEQ)>;

// truncate i16
def : Pat<(i1 (trunc Int16Regs:$a)),
          (SETP_b16ri (ANDb16ri Int16Regs:$a, 1), 1, CmpEQ)>;

// sext_inreg
def : Pat<(sext_inreg Int16Regs:$a, i8), (CVT_INREG_s16_s8 Int16Regs:$a)>;
def : Pat<(sext_inreg Int32Regs:$a, i8), (CVT_INREG_s32_s8 Int32Regs:$a)>;
def : Pat<(sext_inreg Int32Regs:$a, i16), (CVT_INREG_s32_s16 Int32Regs:$a)>;
def : Pat<(sext_inreg Int64Regs:$a, i8), (CVT_INREG_s64_s8 Int64Regs:$a)>;
def : Pat<(sext_inreg Int64Regs:$a, i16), (CVT_INREG_s64_s16 Int64Regs:$a)>;
def : Pat<(sext_inreg Int64Regs:$a, i32), (CVT_INREG_s64_s32 Int64Regs:$a)>;


// Select instructions with 32-bit predicates
// The 32-bit condition is reduced to a predicate by masking bit 0 and
// comparing the result with 1.
def : Pat<(select Int32Regs:$pred, Int16Regs:$a, Int16Regs:$b),
          (SELP_b16rr Int16Regs:$a, Int16Regs:$b,
           (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
def : Pat<(select Int32Regs:$pred, Int32Regs:$a, Int32Regs:$b),
          (SELP_b32rr Int32Regs:$a, Int32Regs:$b,
           (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
def : Pat<(select Int32Regs:$pred, Int64Regs:$a, Int64Regs:$b),
          (SELP_b64rr Int64Regs:$a, Int64Regs:$b,
           (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
def : Pat<(select Int32Regs:$pred, Float16Regs:$a, Float16Regs:$b),
          (SELP_f16rr Float16Regs:$a, Float16Regs:$b,
           (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
def : Pat<(select Int32Regs:$pred, Float32Regs:$a, Float32Regs:$b),
          (SELP_f32rr Float32Regs:$a, Float32Regs:$b,
           (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
def : Pat<(select Int32Regs:$pred, Float64Regs:$a, Float64Regs:$b),
          (SELP_f64rr Float64Regs:$a, Float64Regs:$b,
           (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;


let hasSideEffects = false in {
  // pack a set of smaller int registers to a larger int register
  def V4I16toI64 : NVPTXInst<(outs Int64Regs:$d),
                             (ins Int16Regs:$s1, Int16Regs:$s2,
                                  Int16Regs:$s3, Int16Regs:$s4),
                             "mov.b64 \t$d, {{$s1, $s2, $s3, $s4}};", []>;
  def V2I16toI32 : NVPTXInst<(outs Int32Regs:$d),
                             (ins Int16Regs:$s1, Int16Regs:$s2),
                             "mov.b32 \t$d, {{$s1, $s2}};", []>;
  def V2I32toI64 : NVPTXInst<(outs Int64Regs:$d),
                             (ins Int32Regs:$s1, Int32Regs:$s2),
                             "mov.b64 \t$d, {{$s1, $s2}};", []>;
  def V2F32toF64 : NVPTXInst<(outs Float64Regs:$d),
                             (ins Float32Regs:$s1, Float32Regs:$s2),
                             "mov.b64 \t$d, {{$s1, $s2}};", []>;

  // unpack a larger int register to a set of smaller int registers
  def I64toV4I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2,
                                   Int16Regs:$d3, Int16Regs:$d4),
                             (ins Int64Regs:$s),
                             "mov.b64 \t{{$d1, $d2, $d3, $d4}}, $s;", []>;
  def I32toV2I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2),
                             (ins Int32Regs:$s),
                             "mov.b32 \t{{$d1, $d2}}, $s;", []>;
  def I64toV2I32 : NVPTXInst<(outs Int32Regs:$d1, Int32Regs:$d2),
                             (ins Int64Regs:$s),
                             "mov.b64 \t{{$d1, $d2}}, $s;", []>;
  def F64toV2F32 : NVPTXInst<(outs Float32Regs:$d1, Float32Regs:$d2),
                             (ins Float64Regs:$s),
                             "mov.b64 \t{{$d1, $d2}}, $s;", []>;

}

let hasSideEffects = false in {
  // Extract element of f16x2 register. PTX does not provide any way
  // to access elements of f16x2 vector directly, so we need to
  // extract it using a temporary register.
  def F16x2toF16_0 : NVPTXInst<(outs Float16Regs:$dst),
                               (ins Float16x2Regs:$src),
                               "{{ .reg .b16 \t%tmp_hi;\n\t"
                               " mov.b32 \t{$dst, %tmp_hi}, $src; }}",
                               [(set Float16Regs:$dst,
                                 (extractelt (v2f16 Float16x2Regs:$src), 0))]>;
  def F16x2toF16_1 : NVPTXInst<(outs Float16Regs:$dst),
                               (ins Float16x2Regs:$src),
                               "{{ .reg .b16 \t%tmp_lo;\n\t"
                               " mov.b32 \t{%tmp_lo, $dst}, $src; }}",
                               [(set Float16Regs:$dst,
                                 (extractelt (v2f16 Float16x2Regs:$src), 1))]>;

  // Coalesce two f16 registers into f16x2
  def BuildF16x2 : NVPTXInst<(outs Float16x2Regs:$dst),
                             (ins Float16Regs:$a, Float16Regs:$b),
                             "mov.b32 \t$dst, {{$a, $b}};",
                             [(set Float16x2Regs:$dst,
                               (build_vector (f16 Float16Regs:$a), (f16 Float16Regs:$b)))]>;

  // Directly initializing the underlying b32 register is one less SASS
  // instruction than the vector-packing move.
  def BuildF16x2i : NVPTXInst<(outs Float16x2Regs:$dst), (ins i32imm:$src),
                              "mov.b32 \t$dst, $src;",
                              []>;

  // Split f16x2 into two f16 registers.
  def SplitF16x2 : NVPTXInst<(outs Float16Regs:$lo, Float16Regs:$hi),
                             (ins Float16x2Regs:$src),
                             "mov.b32 \t{{$lo, $hi}}, $src;",
                             []>;
  // Split an i32 into two f16
  def SplitI32toF16x2 : NVPTXInst<(outs Float16Regs:$lo, Float16Regs:$hi),
                                  (ins Int32Regs:$src),
                                  "mov.b32 \t{{$lo, $hi}}, $src;",
                                  []>;
}

// Count leading zeros
// Both widths write their result to an Int32Regs register.
let hasSideEffects = false in {
  def CLZr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
                         "clz.b32 \t$d, $a;", []>;
  def CLZr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
                         "clz.b64 \t$d, $a;", []>;
}

// 32-bit has a direct PTX instruction
def : Pat<(ctlz Int32Regs:$a), (CLZr32 Int32Regs:$a)>;

// The return type of the ctlz ISD node is the same as its input, but the PTX
// clz instruction always returns a 32-bit value.  For ctlz.i64, convert the
// ptx value to 64 bits to match the ISD node's semantics, unless we know we're
// truncating back down to 32 bits.
def : Pat<(i64 (ctlz Int64Regs:$a)), (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>;
def : Pat<(i32 (trunc (ctlz Int64Regs:$a))), (CLZr64 Int64Regs:$a)>;

// For 16-bit ctlz, we zero-extend to 32-bit, perform the count, then trunc the
// result back to 16-bits if necessary.  We also need to subtract 16 because
// the high-order 16 zeros were counted.
//
// TODO: NVPTX has a mov.b32 b32reg, {imm, b16reg} instruction, which we could
// use to save one SASS instruction (on sm_35 anyway):
//
//   mov.b32 $tmp, {0xffff, $a}
//   clz.b32 $result, $tmp
//
// That is, instead of zero-extending the input to 32 bits, we'd "one-extend"
// and then ctlz that value.  This way we don't have to subtract 16 from the
// result.  Unfortunately today we don't have a way to generate
// "mov b32reg, {b16imm, b16reg}", so we don't do this optimization.
def : Pat<(i16 (ctlz Int16Regs:$a)),
          (SUBi16ri (CVT_u16_u32
           (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE), 16)>;
def : Pat<(i32 (zext (i16 (ctlz Int16Regs:$a)))),
          (SUBi32ri (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), 16)>;

// Population count
// Both widths write their result to an Int32Regs register.
let hasSideEffects = false in {
  def POPCr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
                          "popc.b32 \t$d, $a;", []>;
  def POPCr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
                          "popc.b64 \t$d, $a;", []>;
}

// 32-bit has a direct PTX instruction
def : Pat<(ctpop Int32Regs:$a), (POPCr32 Int32Regs:$a)>;

// For 64-bit, the result in PTX is actually 32-bit so we zero-extend to 64-bit
// to match the LLVM semantics.  Just as with ctlz.i64, we provide a second
// pattern that avoids the type conversion if we're truncating the result to
// i32 anyway.
def : Pat<(ctpop Int64Regs:$a),
          (CVT_u64_u32 (POPCr64 Int64Regs:$a), CvtNONE)>;
def : Pat<(i32 (trunc (ctpop Int64Regs:$a))),
          (POPCr64 Int64Regs:$a)>;

// For 16-bit, we zero-extend to 32-bit, then trunc the result back to 16-bits.
// If we know that we're storing into an i32, we can avoid the final trunc.
def : Pat<(ctpop Int16Regs:$a),
          (CVT_u16_u32 (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE)>;
def : Pat<(i32 (zext (i16 (ctpop Int16Regs:$a)))),
          (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE))>;

// fpround f32 -> f16
def : Pat<(f16 (fpround Float32Regs:$a)),
          (CVT_f16_f32 Float32Regs:$a, CvtRN)>;

// fpround f64 -> f16
def : Pat<(f16 (fpround Float64Regs:$a)),
          (CVT_f16_f64 Float64Regs:$a, CvtRN)>;

// fpround f64 -> f32
// The doF32FTZ-guarded pattern uses the _FTZ mode; the unguarded pattern
// that follows uses the plain mode.
def : Pat<(f32 (fpround Float64Regs:$a)),
          (CVT_f32_f64 Float64Regs:$a, CvtRN_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(f32 (fpround Float64Regs:$a)),
          (CVT_f32_f64 Float64Regs:$a, CvtRN)>;

// fpextend f16 -> f32
def : Pat<(f32 (fpextend Float16Regs:$a)),
          (CVT_f32_f16 Float16Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(f32 (fpextend Float16Regs:$a)),
          (CVT_f32_f16 Float16Regs:$a, CvtNONE)>;

// fpextend f16 -> f64
def : Pat<(f64 (fpextend Float16Regs:$a)),
          (CVT_f64_f16 Float16Regs:$a, CvtNONE)>;

// fpextend f32 -> f64
def : Pat<(f64 (fpextend Float32Regs:$a)),
          (CVT_f64_f32 Float32Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(f64 (fpextend Float32Regs:$a)),
          (CVT_f64_f32 Float32Regs:$a, CvtNONE)>;

// DAG node for NVPTXISD::RET_FLAG: chained, with optional incoming glue.
def retflag : SDNode<"NVPTXISD::RET_FLAG", SDTNone,
                     [SDNPHasChain, SDNPOptInGlue]>;

// fceil, ffloor, fround, ftrunc.

// fceil: cvt with the RPI mode.  The two f32 patterns are mutually exclusive,
// guarded by doF32FTZ and doNoF32FTZ respectively.
def : Pat<(fceil Float16Regs:$a),
          (CVT_f16_f16 Float16Regs:$a, CvtRPI)>;
def : Pat<(fceil Float32Regs:$a),
          (CVT_f32_f32 Float32Regs:$a, CvtRPI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(fceil Float32Regs:$a),
          (CVT_f32_f32 Float32Regs:$a, CvtRPI)>, Requires<[doNoF32FTZ]>;
def : Pat<(fceil Float64Regs:$a),
          (CVT_f64_f64 Float64Regs:$a, CvtRPI)>;

// ffloor: cvt with the RMI mode.
def : Pat<(ffloor Float16Regs:$a),
          (CVT_f16_f16 Float16Regs:$a, CvtRMI)>;
def : Pat<(ffloor Float32Regs:$a),
          (CVT_f32_f32 Float32Regs:$a, CvtRMI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(ffloor Float32Regs:$a),
          (CVT_f32_f32 Float32Regs:$a, CvtRMI)>, Requires<[doNoF32FTZ]>;
def : Pat<(ffloor Float64Regs:$a),
          (CVT_f64_f64 Float64Regs:$a, CvtRMI)>;

// ftrunc: cvt with the RZI mode.
def : Pat<(ftrunc Float16Regs:$a),
          (CVT_f16_f16 Float16Regs:$a, CvtRZI)>;
def : Pat<(ftrunc Float32Regs:$a),
          (CVT_f32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(ftrunc Float32Regs:$a),
          (CVT_f32_f32 Float32Regs:$a, CvtRZI)>, Requires<[doNoF32FTZ]>;
def : Pat<(ftrunc Float64Regs:$a),
          (CVT_f64_f64 Float64Regs:$a, CvtRZI)>;

// nearbyint and rint are implemented as rounding to nearest even.  This isn't
// strictly correct, because it causes us to ignore the rounding mode.  But it
// matches what CUDA's "libm" does.
// fnearbyint: same-type CVT with the RNI mode (FTZ flavour for f32 when
// doF32FTZ holds).
def : Pat<(fnearbyint Float16Regs:$src),
          (CVT_f16_f16 Float16Regs:$src, CvtRNI)>;
let Predicates = [doF32FTZ] in
def : Pat<(fnearbyint Float32Regs:$src),
          (CVT_f32_f32 Float32Regs:$src, CvtRNI_FTZ)>;
let Predicates = [doNoF32FTZ] in
def : Pat<(fnearbyint Float32Regs:$src),
          (CVT_f32_f32 Float32Regs:$src, CvtRNI)>;
def : Pat<(fnearbyint Float64Regs:$src),
          (CVT_f64_f64 Float64Regs:$src, CvtRNI)>;

// frint: selected exactly like fnearbyint.
def : Pat<(frint Float16Regs:$src),
          (CVT_f16_f16 Float16Regs:$src, CvtRNI)>;
let Predicates = [doF32FTZ] in
def : Pat<(frint Float32Regs:$src),
          (CVT_f32_f32 Float32Regs:$src, CvtRNI_FTZ)>;
let Predicates = [doNoF32FTZ] in
def : Pat<(frint Float32Regs:$src),
          (CVT_f32_f32 Float32Regs:$src, CvtRNI)>;
def : Pat<(frint Float64Regs:$src),
          (CVT_f64_f64 Float64Regs:$src, CvtRNI)>;


//-----------------------------------
// Control-flow
//-----------------------------------

// Function return; selected from retflag.
let isTerminator = 1, isReturn = 1, isBarrier = 1 in
def Return : NVPTXInst<(outs), (ins), "ret;", [(retflag)]>;

let isTerminator = 1, isBranch = 1 in {
  // Branch taken when predicate $a is set; selected from brcond on an i1.
  def CBranch : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
                          "@$a bra \t$target;",
                          [(brcond Int1Regs:$a, bb:$target)]>;

  // Branch on the negated predicate. It carries no pattern of its own and is
  // only reached through explicit Pat<> definitions.
  def CBranchOther : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
                               "@!$a bra \t$target;", []>;

  // Unconditional branch.
  let isBarrier = 1 in
  def GOTO : NVPTXInst<(outs), (ins brtarget:$target),
                       "bra.uni \t$target;", [(br bb:$target)]>;
}

// brcond on an i32 condition: compare it against 0 with SETP_u32ri/CmpNE and
// branch on the resulting predicate.
def : Pat<(brcond Int32Regs:$cond, bb:$target),
          (CBranch (SETP_u32ri Int32Regs:$cond, 0, CmpNE), bb:$target)>;

// SelectionDAGBuilder::visitSwitchCase() will invert the condition of a
// conditional branch if the target block is the next block, so that the code
// can fall through to the target block. The inversion is done by
// 'xor condition, 1', which will be translated to (setne condition, -1).
// Since ptx supports '@!pred bra target', we should use it.
def : Pat<(brcond (i1 (setne Int1Regs:$pred, -1)), bb:$target),
          (CBranchOther Int1Regs:$pred, bb:$target)>;

// Call

// Type profiles for the call-sequence markers: both operands are i32.
def SDT_NVPTXCallSeqStart :
  SDCallSeqStart<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
def SDT_NVPTXCallSeqEnd :
  SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;

def callseq_start :
  SDNode<"ISD::CALLSEQ_START", SDT_NVPTXCallSeqStart,
         [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
def callseq_end :
  SDNode<"ISD::CALLSEQ_END", SDT_NVPTXCallSeqEnd,
         [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPSideEffect]>;

// The call node itself: no results, one i32 operand (the callee).
def SDT_NVPTXCall : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
def call :
  SDNode<"NVPTXISD::CALL", SDT_NVPTXCall,
         [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;

def calltarget : Operand<i32>;

let isCall = 1 in
def CALL : NVPTXInst<(outs), (ins calltarget:$dst), "call \t$dst, (1);", []>;

// CALL has no inline pattern; it is selected for global-address and
// external-symbol callees through these two patterns.
def : Pat<(call tglobaladdr:$callee), (CALL tglobaladdr:$callee)>;
def : Pat<(call texternalsym:$callee), (CALL texternalsym:$callee)>;

// Pseudo instructions.
// Thin alias of NVPTXInst that forwards all four arguments unchanged.
class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
  : NVPTXInst<outs, ins, asmstr, pattern>;

// Call-sequence start: prints an opening brace, a "callseq" comment with both
// amounts, and declares the temp_param_reg register.
def Callseq_Start :
  NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2),
            "\\{ // callseq $amt1, $amt2\n"
            "\t.reg .b32 temp_param_reg;",
            [(callseq_start timm:$amt1, timm:$amt2)]>;

// Call-sequence end: prints the matching closing brace.
def Callseq_End :
  NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2),
            "\\} // callseq $amt1",
            [(callseq_end timm:$amt1, timm:$amt2)]>;

// trap instruction
def trapinst : NVPTXInst<(outs), (ins), "trap;", [(trap)]>;

// Call prototype wrapper
def SDTCallPrototype : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
def CallPrototype :
  SDNode<"NVPTXISD::CallPrototype", SDTCallPrototype,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;

// Operand whose text is produced by the printProtoIdent print method.
let PrintMethod = "printProtoIdent" in
def ProtoIdent : Operand<i32>;

// The asm string is just the printed prototype identifier.
def CALL_PROTOTYPE :
  NVPTXInst<(outs), (ins ProtoIdent:$ident),
            "$ident", [(CallPrototype (i32 texternalsym:$ident))]>;


include "NVPTXIntrinsics.td"


//-----------------------------------
// Notes
//-----------------------------------
// BSWAP is currently expanded. A more efficient lowering would be:
// - for < sm_20, use vector scalar mov, as Tesla supports native 16-bit
//   registers;
// - for sm_20, use prmt (use vector scalar mov to get the pack and unpack).
//   sm_20 supports native 32-bit registers, but not native 16-bit registers.
// NOTE(review): the original note spelled the instruction "pmpt"; PTX's byte
// permute instruction is "prmt" -- confirm that is what was meant.