//===- NVPTXInstrInfo.td - NVPTX Instruction defs -------------*- tblgen-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file describes the PTX instructions in TableGen format.
//
//===----------------------------------------------------------------------===//

include "NVPTXInstrFormats.td"

// A NOP instruction: no outputs, no inputs, an empty assembly string and no
// selection pattern.
let hasSideEffects = false in {
  def NOP : NVPTXInst<(outs), (ins), "", []>;
}

// Operand carrying an f16 value, tagged with the OPERAND_IMMEDIATE operand
// type.
let OperandType = "OPERAND_IMMEDIATE" in {
  def f16imm : Operand<f16>;
}

// List of vector specific properties.
// Each def wraps one integer tag in a VecInstTypeEnum record; the values are
// 1..7 plus 15 for isVecOther.
def isVecLD      : VecInstTypeEnum<1>;
def isVecST      : VecInstTypeEnum<2>;
def isVecBuild   : VecInstTypeEnum<3>;
def isVecShuffle : VecInstTypeEnum<4>;
def isVecExtract : VecInstTypeEnum<5>;
def isVecInsert  : VecInstTypeEnum<6>;
def isVecDest    : VecInstTypeEnum<7>;
def isVecOther   : VecInstTypeEnum<15>;

//===----------------------------------------------------------------------===//
// NVPTX Operand Definitions.
36//===----------------------------------------------------------------------===// 37 38def brtarget : Operand<OtherVT>; 39 40// CVT conversion modes 41// These must match the enum in NVPTX.h 42def CvtNONE : PatLeaf<(i32 0x0)>; 43def CvtRNI : PatLeaf<(i32 0x1)>; 44def CvtRZI : PatLeaf<(i32 0x2)>; 45def CvtRMI : PatLeaf<(i32 0x3)>; 46def CvtRPI : PatLeaf<(i32 0x4)>; 47def CvtRN : PatLeaf<(i32 0x5)>; 48def CvtRZ : PatLeaf<(i32 0x6)>; 49def CvtRM : PatLeaf<(i32 0x7)>; 50def CvtRP : PatLeaf<(i32 0x8)>; 51def CvtRNA : PatLeaf<(i32 0x9)>; 52 53def CvtNONE_FTZ : PatLeaf<(i32 0x10)>; 54def CvtRNI_FTZ : PatLeaf<(i32 0x11)>; 55def CvtRZI_FTZ : PatLeaf<(i32 0x12)>; 56def CvtRMI_FTZ : PatLeaf<(i32 0x13)>; 57def CvtRPI_FTZ : PatLeaf<(i32 0x14)>; 58def CvtRN_FTZ : PatLeaf<(i32 0x15)>; 59def CvtRZ_FTZ : PatLeaf<(i32 0x16)>; 60def CvtRM_FTZ : PatLeaf<(i32 0x17)>; 61def CvtRP_FTZ : PatLeaf<(i32 0x18)>; 62 63def CvtSAT : PatLeaf<(i32 0x20)>; 64def CvtSAT_FTZ : PatLeaf<(i32 0x30)>; 65 66def CvtNONE_RELU : PatLeaf<(i32 0x40)>; 67def CvtRN_RELU : PatLeaf<(i32 0x45)>; 68def CvtRZ_RELU : PatLeaf<(i32 0x46)>; 69 70def CvtMode : Operand<i32> { 71 let PrintMethod = "printCvtMode"; 72} 73 74// Compare modes 75// These must match the enum in NVPTX.h 76def CmpEQ : PatLeaf<(i32 0)>; 77def CmpNE : PatLeaf<(i32 1)>; 78def CmpLT : PatLeaf<(i32 2)>; 79def CmpLE : PatLeaf<(i32 3)>; 80def CmpGT : PatLeaf<(i32 4)>; 81def CmpGE : PatLeaf<(i32 5)>; 82def CmpEQU : PatLeaf<(i32 10)>; 83def CmpNEU : PatLeaf<(i32 11)>; 84def CmpLTU : PatLeaf<(i32 12)>; 85def CmpLEU : PatLeaf<(i32 13)>; 86def CmpGTU : PatLeaf<(i32 14)>; 87def CmpGEU : PatLeaf<(i32 15)>; 88def CmpNUM : PatLeaf<(i32 16)>; 89def CmpNAN : PatLeaf<(i32 17)>; 90 91def CmpEQ_FTZ : PatLeaf<(i32 0x100)>; 92def CmpNE_FTZ : PatLeaf<(i32 0x101)>; 93def CmpLT_FTZ : PatLeaf<(i32 0x102)>; 94def CmpLE_FTZ : PatLeaf<(i32 0x103)>; 95def CmpGT_FTZ : PatLeaf<(i32 0x104)>; 96def CmpGE_FTZ : PatLeaf<(i32 0x105)>; 97def CmpEQU_FTZ : PatLeaf<(i32 0x10A)>; 98def 
CmpNEU_FTZ : PatLeaf<(i32 0x10B)>; 99def CmpLTU_FTZ : PatLeaf<(i32 0x10C)>; 100def CmpLEU_FTZ : PatLeaf<(i32 0x10D)>; 101def CmpGTU_FTZ : PatLeaf<(i32 0x10E)>; 102def CmpGEU_FTZ : PatLeaf<(i32 0x10F)>; 103def CmpNUM_FTZ : PatLeaf<(i32 0x110)>; 104def CmpNAN_FTZ : PatLeaf<(i32 0x111)>; 105 106def CmpMode : Operand<i32> { 107 let PrintMethod = "printCmpMode"; 108} 109def VecElement : Operand<i32> { 110 let PrintMethod = "printVecElement"; 111} 112 113//===----------------------------------------------------------------------===// 114// NVPTX Instruction Predicate Definitions 115//===----------------------------------------------------------------------===// 116 117 118def hasAtomAddF64 : Predicate<"Subtarget->hasAtomAddF64()">; 119def hasAtomScope : Predicate<"Subtarget->hasAtomScope()">; 120def hasAtomBitwise64 : Predicate<"Subtarget->hasAtomBitwise64()">; 121def hasAtomMinMax64 : Predicate<"Subtarget->hasAtomMinMax64()">; 122def hasVote : Predicate<"Subtarget->hasVote()">; 123def hasDouble : Predicate<"Subtarget->hasDouble()">; 124def hasLDG : Predicate<"Subtarget->hasLDG()">; 125def hasLDU : Predicate<"Subtarget->hasLDU()">; 126 127def doF32FTZ : Predicate<"useF32FTZ()">; 128def doNoF32FTZ : Predicate<"!useF32FTZ()">; 129 130def doMulWide : Predicate<"doMulWide">; 131 132def allowFMA : Predicate<"allowFMA()">; 133def noFMA : Predicate<"!allowFMA()">; 134def allowUnsafeFPMath : Predicate<"allowUnsafeFPMath()">; 135def noUnsafeFPMath : Predicate<"!allowUnsafeFPMath()">; 136 137def do_DIVF32_APPROX : Predicate<"getDivF32Level()==0">; 138def do_DIVF32_FULL : Predicate<"getDivF32Level()==1">; 139 140def do_SQRTF32_APPROX : Predicate<"!usePrecSqrtF32()">; 141def do_SQRTF32_RN : Predicate<"usePrecSqrtF32()">; 142 143def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">; 144def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">; 145 146def True : Predicate<"true">; 147 148def hasPTX31 : Predicate<"Subtarget->getPTXVersion() >= 31">; 149def hasPTX42 : 
Predicate<"Subtarget->getPTXVersion() >= 42">; 150def hasPTX43 : Predicate<"Subtarget->getPTXVersion() >= 43">; 151def hasPTX60 : Predicate<"Subtarget->getPTXVersion() >= 60">; 152def hasPTX61 : Predicate<"Subtarget->getPTXVersion() >= 61">; 153def hasPTX63 : Predicate<"Subtarget->getPTXVersion() >= 63">; 154def hasPTX64 : Predicate<"Subtarget->getPTXVersion() >= 64">; 155def hasPTX65 : Predicate<"Subtarget->getPTXVersion() >= 65">; 156def hasPTX70 : Predicate<"Subtarget->getPTXVersion() >= 70">; 157def hasPTX71 : Predicate<"Subtarget->getPTXVersion() >= 71">; 158def hasPTX72 : Predicate<"Subtarget->getPTXVersion() >= 72">; 159 160def hasSM30 : Predicate<"Subtarget->getSmVersion() >= 30">; 161def hasSM32 : Predicate<"Subtarget->getSmVersion() >= 32">; 162def hasSM53 : Predicate<"Subtarget->getSmVersion() >= 53">; 163def hasSM70 : Predicate<"Subtarget->getSmVersion() >= 70">; 164def hasSM72 : Predicate<"Subtarget->getSmVersion() >= 72">; 165def hasSM75 : Predicate<"Subtarget->getSmVersion() >= 75">; 166def hasSM80 : Predicate<"Subtarget->getSmVersion() >= 80">; 167def hasSM86 : Predicate<"Subtarget->getSmVersion() >= 86">; 168 169// non-sync shfl instructions are not available on sm_70+ in PTX6.4+ 170def hasSHFL : Predicate<"!(Subtarget->getSmVersion() >= 70" 171 "&& Subtarget->getPTXVersion() >= 64)">; 172 173def useShortPtr : Predicate<"useShortPointers()">; 174def useFP16Math: Predicate<"Subtarget->allowFP16Math()">; 175 176// Helper class to aid conversion between ValueType and a matching RegisterClass. 
// ValueToRegClass<T>.ret is the NVPTXRegClass selected by the *name* of the
// ValueType T (obtained with !cast<string>). bf16/v2bf16 share the f16/v2f16
// register classes.
class ValueToRegClass<ValueType T> {
   string name = !cast<string>(T);
   NVPTXRegClass ret = !cond(
     !eq(name, "i1"): Int1Regs,
     !eq(name, "i16"): Int16Regs,
     !eq(name, "i32"): Int32Regs,
     !eq(name, "i64"): Int64Regs,
     !eq(name, "f16"): Float16Regs,
     !eq(name, "v2f16"): Float16x2Regs,
     !eq(name, "bf16"): Float16Regs,
     !eq(name, "v2bf16"): Float16x2Regs,
     !eq(name, "f32"): Float32Regs,
     !eq(name, "f64"): Float64Regs,
     !eq(name, "ai32"): Int32ArgRegs,
     !eq(name, "ai64"): Int64ArgRegs,
     !eq(name, "af32"): Float32ArgRegs,
     // "af64" follows the ai32/ai64/af32 naming of the entries above.
     !eq(name, "af64"): Float64ArgRegs,
     // Historical spelling, presumably a typo for "af64"; retained so that any
     // lookup that resolved before still resolves.
     !eq(name, "if64"): Float64ArgRegs,
    );
}


//===----------------------------------------------------------------------===//
// Some Common Instruction Class Templates
//===----------------------------------------------------------------------===//

// Template for instructions which take three int64, int32, or int16 args.
// The instructions are named "<OpcStr><Width>" (e.g. "add.s64").
//
// For every width a register/register (rr) and a register/immediate (ri)
// form is defined; both match OpNode applied to $a and $b.
multiclass I3<string OpcStr, SDNode OpNode> {
  def i64rr :
    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
              !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
              [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>;
  def i64ri :
    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
              !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
              [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;
  def i32rr :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
              !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
              [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
  def i32ri :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
              !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
              [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
  def i16rr :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
              !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
              [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>;
  // The immediate is written `imm:$b` like the i32ri/i64ri forms above (it
  // was spelled `(imm):$b` here, which denotes the same leaf).
  def i16ri :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
              !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
              [(set Int16Regs:$dst, (OpNode Int16Regs:$a, imm:$b))]>;
}

// Template for instructions which take 3 int args. The instructions are
// named "<OpcStr>.s32" (e.g. "addc.cc.s32") and "<OpcStr>.s64"; the 64-bit
// forms additionally require hasPTX43. All forms are marked as having side
// effects.
multiclass ADD_SUB_INT_CARRY<string OpcStr, SDNode OpNode> {
  let hasSideEffects = 1 in {
    def i32rr :
      NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
                !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"),
                [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
    def i32ri :
      NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
                !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"),
                [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
    def i64rr :
      NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
                !strconcat(OpcStr, ".s64 \t$dst, $a, $b;"),
                [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>,
      Requires<[hasPTX43]>;
    def i64ri :
      NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
                !strconcat(OpcStr, ".s64 \t$dst, $a, $b;"),
                [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>,
      Requires<[hasPTX43]>;
  }
}

// Template for instructions which take three fp64, fp32, or fp16 args. The
// instructions are named "<OpcStr>.f<Width>" (e.g. "min.f64").
//
// Also defines ftz (flush subnormal inputs and results to sign-preserving
// zero) variants for fp32 and fp16 functions.
//
// This multiclass should be used for nodes that cannot be folded into FMAs.
// For nodes that can be folded into FMAs (i.e. adds and muls), use
// F3_fma_component.
multiclass F3<string OpcStr, SDNode OpNode> {
  // Each def matches OpNode on ($a, $b). The "rr" forms take two registers,
  // the "ri" forms take a register and a floating-point immediate. The
  // assembly mnemonic is built by pasting OpcStr onto the type suffix.

  // ---- f64 ----
  def f64rr :
    NVPTXInst<(outs Float64Regs:$dst),
              (ins Float64Regs:$a, Float64Regs:$b),
              OpcStr # ".f64 \t$dst, $a, $b;",
              [(set Float64Regs:$dst,
                    (OpNode Float64Regs:$a, Float64Regs:$b))]>;
  def f64ri :
    NVPTXInst<(outs Float64Regs:$dst),
              (ins Float64Regs:$a, f64imm:$b),
              OpcStr # ".f64 \t$dst, $a, $b;",
              [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>;

  // ---- f32: the .ftz forms are only available under doF32FTZ ----
  def f32rr_ftz :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins Float32Regs:$a, Float32Regs:$b),
              OpcStr # ".ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                    (OpNode Float32Regs:$a, Float32Regs:$b))]>,
    Requires<[doF32FTZ]>;
  def f32ri_ftz :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins Float32Regs:$a, f32imm:$b),
              OpcStr # ".ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
    Requires<[doF32FTZ]>;
  def f32rr :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins Float32Regs:$a, Float32Regs:$b),
              OpcStr # ".f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                    (OpNode Float32Regs:$a, Float32Regs:$b))]>;
  def f32ri :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins Float32Regs:$a, f32imm:$b),
              OpcStr # ".f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>;

  // ---- f16: register/register only, gated on useFP16Math ----
  def f16rr_ftz :
    NVPTXInst<(outs Float16Regs:$dst),
              (ins Float16Regs:$a, Float16Regs:$b),
              OpcStr # ".ftz.f16 \t$dst, $a, $b;",
              [(set Float16Regs:$dst,
                    (OpNode (f16 Float16Regs:$a), (f16 Float16Regs:$b)))]>,
    Requires<[useFP16Math, doF32FTZ]>;
  def f16rr :
    NVPTXInst<(outs Float16Regs:$dst),
              (ins Float16Regs:$a, Float16Regs:$b),
              OpcStr # ".f16 \t$dst, $a, $b;",
              [(set Float16Regs:$dst,
                    (OpNode (f16 Float16Regs:$a), (f16 Float16Regs:$b)))]>,
    Requires<[useFP16Math]>;

  // ---- f16x2 (v2f16): register/register only, gated on useFP16Math ----
  def f16x2rr_ftz :
    NVPTXInst<(outs Float16x2Regs:$dst),
              (ins Float16x2Regs:$a, Float16x2Regs:$b),
              OpcStr # ".ftz.f16x2 \t$dst, $a, $b;",
              [(set Float16x2Regs:$dst,
                    (OpNode (v2f16 Float16x2Regs:$a),
                            (v2f16 Float16x2Regs:$b)))]>,
    Requires<[useFP16Math, doF32FTZ]>;
  def f16x2rr :
    NVPTXInst<(outs Float16x2Regs:$dst),
              (ins Float16x2Regs:$a, Float16x2Regs:$b),
              OpcStr # ".f16x2 \t$dst, $a, $b;",
              [(set Float16x2Regs:$dst,
                    (OpNode (v2f16 Float16x2Regs:$a),
                            (v2f16 Float16x2Regs:$b)))]>,
    Requires<[useFP16Math]>;
}

// Template for instructions which take three FP args. The
// instructions are named "<OpcStr>.f<Width>" (e.g. "add.f64").
//
// Also defines ftz (flush subnormal inputs and results to sign-preserving
// zero) variants for fp32/fp16 functions.
//
// This multiclass should be used for nodes that can be folded to make fma ops.
// In this case, we use the ".rn" variant when FMA is disabled, as this behaves
// just like the non ".rn" op, but prevents ptxas from creating FMAs.
multiclass F3_fma_component<string OpcStr, SDNode OpNode> {
  // First group: plain mnemonics, selected only when allowFMA holds.
  // Second group (the "_rn*" defs): ".rn" mnemonics, selected only when noFMA
  // holds. Within each group the set of widths and the ftz/useFP16Math gating
  // mirror each other.

  // ---- allowFMA: f64 ----
  def f64rr :
    NVPTXInst<(outs Float64Regs:$dst),
              (ins Float64Regs:$a, Float64Regs:$b),
              OpcStr # ".f64 \t$dst, $a, $b;",
              [(set Float64Regs:$dst,
                    (OpNode Float64Regs:$a, Float64Regs:$b))]>,
    Requires<[allowFMA]>;
  def f64ri :
    NVPTXInst<(outs Float64Regs:$dst),
              (ins Float64Regs:$a, f64imm:$b),
              OpcStr # ".f64 \t$dst, $a, $b;",
              [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>,
    Requires<[allowFMA]>;

  // ---- allowFMA: f32 ----
  def f32rr_ftz :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins Float32Regs:$a, Float32Regs:$b),
              OpcStr # ".ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                    (OpNode Float32Regs:$a, Float32Regs:$b))]>,
    Requires<[allowFMA, doF32FTZ]>;
  def f32ri_ftz :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins Float32Regs:$a, f32imm:$b),
              OpcStr # ".ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
    Requires<[allowFMA, doF32FTZ]>;
  def f32rr :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins Float32Regs:$a, Float32Regs:$b),
              OpcStr # ".f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                    (OpNode Float32Regs:$a, Float32Regs:$b))]>,
    Requires<[allowFMA]>;
  def f32ri :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins Float32Regs:$a, f32imm:$b),
              OpcStr # ".f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
    Requires<[allowFMA]>;

  // ---- allowFMA: f16 ----
  def f16rr_ftz :
    NVPTXInst<(outs Float16Regs:$dst),
              (ins Float16Regs:$a, Float16Regs:$b),
              OpcStr # ".ftz.f16 \t$dst, $a, $b;",
              [(set Float16Regs:$dst,
                    (OpNode (f16 Float16Regs:$a), (f16 Float16Regs:$b)))]>,
    Requires<[useFP16Math, allowFMA, doF32FTZ]>;
  def f16rr :
    NVPTXInst<(outs Float16Regs:$dst),
              (ins Float16Regs:$a, Float16Regs:$b),
              OpcStr # ".f16 \t$dst, $a, $b;",
              [(set Float16Regs:$dst,
                    (OpNode (f16 Float16Regs:$a), (f16 Float16Regs:$b)))]>,
    Requires<[useFP16Math, allowFMA]>;

  // ---- allowFMA: f16x2 ----
  def f16x2rr_ftz :
    NVPTXInst<(outs Float16x2Regs:$dst),
              (ins Float16x2Regs:$a, Float16x2Regs:$b),
              OpcStr # ".ftz.f16x2 \t$dst, $a, $b;",
              [(set (v2f16 Float16x2Regs:$dst),
                    (OpNode (v2f16 Float16x2Regs:$a),
                            (v2f16 Float16x2Regs:$b)))]>,
    Requires<[useFP16Math, allowFMA, doF32FTZ]>;
  def f16x2rr :
    NVPTXInst<(outs Float16x2Regs:$dst),
              (ins Float16x2Regs:$a, Float16x2Regs:$b),
              OpcStr # ".f16x2 \t$dst, $a, $b;",
              [(set Float16x2Regs:$dst,
                    (OpNode (v2f16 Float16x2Regs:$a),
                            (v2f16 Float16x2Regs:$b)))]>,
    Requires<[useFP16Math, allowFMA]>;

  // These have strange names so we don't perturb existing mir tests.

  // ---- noFMA: f64 ----
  def _rnf64rr :
    NVPTXInst<(outs Float64Regs:$dst),
              (ins Float64Regs:$a, Float64Regs:$b),
              OpcStr # ".rn.f64 \t$dst, $a, $b;",
              [(set Float64Regs:$dst,
                    (OpNode Float64Regs:$a, Float64Regs:$b))]>,
    Requires<[noFMA]>;
  def _rnf64ri :
    NVPTXInst<(outs Float64Regs:$dst),
              (ins Float64Regs:$a, f64imm:$b),
              OpcStr # ".rn.f64 \t$dst, $a, $b;",
              [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>,
    Requires<[noFMA]>;

  // ---- noFMA: f32 ----
  def _rnf32rr_ftz :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins Float32Regs:$a, Float32Regs:$b),
              OpcStr # ".rn.ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                    (OpNode Float32Regs:$a, Float32Regs:$b))]>,
    Requires<[noFMA, doF32FTZ]>;
  def _rnf32ri_ftz :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins Float32Regs:$a, f32imm:$b),
              OpcStr # ".rn.ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
    Requires<[noFMA, doF32FTZ]>;
  def _rnf32rr :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins Float32Regs:$a, Float32Regs:$b),
              OpcStr # ".rn.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                    (OpNode Float32Regs:$a, Float32Regs:$b))]>,
    Requires<[noFMA]>;
  def _rnf32ri :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins Float32Regs:$a, f32imm:$b),
              OpcStr # ".rn.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
    Requires<[noFMA]>;

  // ---- noFMA: f16 ----
  def _rnf16rr_ftz :
    NVPTXInst<(outs Float16Regs:$dst),
              (ins Float16Regs:$a, Float16Regs:$b),
              OpcStr # ".rn.ftz.f16 \t$dst, $a, $b;",
              [(set Float16Regs:$dst,
                    (OpNode (f16 Float16Regs:$a), (f16 Float16Regs:$b)))]>,
    Requires<[useFP16Math, noFMA, doF32FTZ]>;
  def _rnf16rr :
    NVPTXInst<(outs Float16Regs:$dst),
              (ins Float16Regs:$a, Float16Regs:$b),
              OpcStr # ".rn.f16 \t$dst, $a, $b;",
              [(set Float16Regs:$dst,
                    (OpNode (f16 Float16Regs:$a), (f16 Float16Regs:$b)))]>,
    Requires<[useFP16Math, noFMA]>;

  // ---- noFMA: f16x2 ----
  def _rnf16x2rr_ftz :
    NVPTXInst<(outs Float16x2Regs:$dst),
              (ins Float16x2Regs:$a, Float16x2Regs:$b),
              OpcStr # ".rn.ftz.f16x2 \t$dst, $a, $b;",
              [(set Float16x2Regs:$dst,
                    (OpNode (v2f16 Float16x2Regs:$a),
                            (v2f16 Float16x2Regs:$b)))]>,
    Requires<[useFP16Math, noFMA, doF32FTZ]>;
  def _rnf16x2rr :
    NVPTXInst<(outs Float16x2Regs:$dst),
              (ins Float16x2Regs:$a, Float16x2Regs:$b),
              OpcStr # ".rn.f16x2 \t$dst, $a, $b;",
              [(set Float16x2Regs:$dst,
                    (OpNode (v2f16 Float16x2Regs:$a),
                            (v2f16 Float16x2Regs:$b)))]>,
    Requires<[useFP16Math, noFMA]>;
}

// Template for operations which take two f32 or f64 operands.  Provides three
// instructions: <OpcStr>.f64, <OpcStr>.f32, and <OpcStr>.ftz.f32 (flush
// subnormal inputs and results to zero).
multiclass F2<string OpcStr, SDNode OpNode> {
  // One destination and one source register per def; each matches
  // (OpNode $a). The .ftz.f32 form is only available under doF32FTZ.
  def f64 :
    NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a),
              OpcStr # ".f64 \t$dst, $a;",
              [(set Float64Regs:$dst, (OpNode Float64Regs:$a))]>;
  def f32_ftz :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
              OpcStr # ".ftz.f32 \t$dst, $a;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>,
    Requires<[doF32FTZ]>;
  def f32 :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
              OpcStr # ".f32 \t$dst, $a;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>;
}

//===----------------------------------------------------------------------===//
// NVPTX Instructions.
//===----------------------------------------------------------------------===//

//-----------------------------------
// Type Conversion
//-----------------------------------

let hasSideEffects = false in {
  // Generate a cvt to the given type from all possible types.  Each instance
  // takes a CvtMode immediate that defines the conversion mode to use.  It can
  // be CvtNONE to omit a conversion mode.
  //
  // FromName is pasted in as the first type suffix of the mnemonic and RC is
  // the class of the result register. None of these defs has a selection
  // pattern. The 8-bit and 16-bit sources are both read from Int16Regs.
  multiclass CVT_FROM_ALL<string FromName, RegisterClass RC> {
    def _s8 :
      NVPTXInst<(outs RC:$dst), (ins Int16Regs:$src, CvtMode:$mode),
                "cvt${mode:base}${mode:ftz}${mode:sat}." # FromName
                  # ".s8 \t$dst, $src;",
                []>;
    def _u8 :
      NVPTXInst<(outs RC:$dst), (ins Int16Regs:$src, CvtMode:$mode),
                "cvt${mode:base}${mode:ftz}${mode:sat}." # FromName
                  # ".u8 \t$dst, $src;",
                []>;
    def _s16 :
      NVPTXInst<(outs RC:$dst), (ins Int16Regs:$src, CvtMode:$mode),
                "cvt${mode:base}${mode:ftz}${mode:sat}." # FromName
                  # ".s16 \t$dst, $src;",
                []>;
    def _u16 :
      NVPTXInst<(outs RC:$dst), (ins Int16Regs:$src, CvtMode:$mode),
                "cvt${mode:base}${mode:ftz}${mode:sat}." # FromName
                  # ".u16 \t$dst, $src;",
                []>;
    def _s32 :
      NVPTXInst<(outs RC:$dst), (ins Int32Regs:$src, CvtMode:$mode),
                "cvt${mode:base}${mode:ftz}${mode:sat}." # FromName
                  # ".s32 \t$dst, $src;",
                []>;
    def _u32 :
      NVPTXInst<(outs RC:$dst), (ins Int32Regs:$src, CvtMode:$mode),
                "cvt${mode:base}${mode:ftz}${mode:sat}." # FromName
                  # ".u32 \t$dst, $src;",
                []>;
    def _s64 :
      NVPTXInst<(outs RC:$dst), (ins Int64Regs:$src, CvtMode:$mode),
                "cvt${mode:base}${mode:ftz}${mode:sat}." # FromName
                  # ".s64 \t$dst, $src;",
                []>;
    def _u64 :
      NVPTXInst<(outs RC:$dst), (ins Int64Regs:$src, CvtMode:$mode),
                "cvt${mode:base}${mode:ftz}${mode:sat}." # FromName
                  # ".u64 \t$dst, $src;",
                []>;
    def _f16 :
      NVPTXInst<(outs RC:$dst), (ins Float16Regs:$src, CvtMode:$mode),
                "cvt${mode:base}${mode:ftz}${mode:sat}." # FromName
                  # ".f16 \t$dst, $src;",
                []>;
    def _f32 :
      NVPTXInst<(outs RC:$dst), (ins Float32Regs:$src, CvtMode:$mode),
                "cvt${mode:base}${mode:ftz}${mode:sat}." # FromName
                  # ".f32 \t$dst, $src;",
                []>;
    def _f64 :
      NVPTXInst<(outs RC:$dst), (ins Float64Regs:$src, CvtMode:$mode),
                "cvt${mode:base}${mode:ftz}${mode:sat}." # FromName
                  # ".f64 \t$dst, $src;",
                []>;
  }

  // Generate cvts from all types to all types.
  defm CVT_s8  : CVT_FROM_ALL<"s8",  Int16Regs>;
  defm CVT_u8  : CVT_FROM_ALL<"u8",  Int16Regs>;
  defm CVT_s16 : CVT_FROM_ALL<"s16", Int16Regs>;
  defm CVT_u16 : CVT_FROM_ALL<"u16", Int16Regs>;
  defm CVT_s32 : CVT_FROM_ALL<"s32", Int32Regs>;
  defm CVT_u32 : CVT_FROM_ALL<"u32", Int32Regs>;
  defm CVT_s64 : CVT_FROM_ALL<"s64", Int64Regs>;
  defm CVT_u64 : CVT_FROM_ALL<"u64", Int64Regs>;
  defm CVT_f16 : CVT_FROM_ALL<"f16", Float16Regs>;
  defm CVT_f32 : CVT_FROM_ALL<"f32", Float32Regs>;
  defm CVT_f64 : CVT_FROM_ALL<"f64", Float64Regs>;

  // These cvts are different from those above: The source and dest registers
  // are of the same type.
  def CVT_INREG_s16_s8 :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
              "cvt.s16.s8 \t$dst, $src;", []>;
  def CVT_INREG_s32_s8 :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
              "cvt.s32.s8 \t$dst, $src;", []>;
  def CVT_INREG_s32_s16 :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
              "cvt.s32.s16 \t$dst, $src;", []>;
  def CVT_INREG_s64_s8 :
    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
              "cvt.s64.s8 \t$dst, $src;", []>;
  def CVT_INREG_s64_s16 :
    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
              "cvt.s64.s16 \t$dst, $src;", []>;
  def CVT_INREG_s64_s32 :
    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
              "cvt.s64.s32 \t$dst, $src;", []>;

  // cvt from a single f32 source; the mode operand contributes the
  // ${mode:base} and ${mode:relu} modifiers. Requires hasPTX70 and hasSM80.
  multiclass CVT_FROM_FLOAT_SM80<string FromName, RegisterClass RC> {
    def _f32 :
      NVPTXInst<(outs RC:$dst), (ins Float32Regs:$src, CvtMode:$mode),
                "cvt${mode:base}${mode:relu}." # FromName
                  # ".f32 \t$dst, $src;",
                []>,
      Requires<[hasPTX70, hasSM80]>;
  }

  defm CVT_bf16 : CVT_FROM_FLOAT_SM80<"bf16", Int16Regs>;

  // Same as above, but taking two f32 sources ($src1, $src2).
  multiclass CVT_FROM_FLOAT_V2_SM80<string FromName, RegisterClass RC> {
    def _f32 :
      NVPTXInst<(outs RC:$dst),
                (ins Float32Regs:$src1, Float32Regs:$src2, CvtMode:$mode),
                "cvt${mode:base}${mode:relu}." # FromName
                  # ".f32 \t$dst, $src1, $src2;",
                []>,
      Requires<[hasPTX70, hasSM80]>;
  }

  defm CVT_f16x2  : CVT_FROM_FLOAT_V2_SM80<"f16x2",  Float16x2Regs>;
  defm CVT_bf16x2 : CVT_FROM_FLOAT_V2_SM80<"bf16x2", Int32Regs>;
}

//-----------------------------------
// Selection instructions (selp)
//-----------------------------------

// TODO: Missing slct

// selp instructions that don't have any pattern matches; we explicitly use
// them within this file.
let hasSideEffects = false in {
  // Four operand shapes per type: rr, ri, ir, ii, where "r" is a register of
  // class RC and "i" is an ImmCls immediate. $p is the Int1Regs predicate.
  multiclass SELP<string TypeStr, RegisterClass RC, Operand ImmCls> {
    def rr :
      NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, Int1Regs:$p),
                "selp." # TypeStr # " \t$dst, $a, $b, $p;", []>;
    def ri :
      NVPTXInst<(outs RC:$dst), (ins RC:$a, ImmCls:$b, Int1Regs:$p),
                "selp." # TypeStr # " \t$dst, $a, $b, $p;", []>;
    def ir :
      NVPTXInst<(outs RC:$dst), (ins ImmCls:$a, RC:$b, Int1Regs:$p),
                "selp." # TypeStr # " \t$dst, $a, $b, $p;", []>;
    def ii :
      NVPTXInst<(outs RC:$dst), (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),
                "selp." # TypeStr # " \t$dst, $a, $b, $p;", []>;
  }

  // Same four shapes, but each def also carries a pattern matching
  // (select $p, $a, $b) with value type T; immediates match ImmNode.
  multiclass SELP_PATTERN<string TypeStr, ValueType T, RegisterClass RC,
                          Operand ImmCls, SDNode ImmNode> {
    def rr :
      NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, Int1Regs:$p),
                "selp." # TypeStr # " \t$dst, $a, $b, $p;",
                [(set (T RC:$dst),
                      (select Int1Regs:$p, (T RC:$a), (T RC:$b)))]>;
    def ri :
      NVPTXInst<(outs RC:$dst), (ins RC:$a, ImmCls:$b, Int1Regs:$p),
                "selp." # TypeStr # " \t$dst, $a, $b, $p;",
                [(set (T RC:$dst),
                      (select Int1Regs:$p, (T RC:$a), (T ImmNode:$b)))]>;
    def ir :
      NVPTXInst<(outs RC:$dst), (ins ImmCls:$a, RC:$b, Int1Regs:$p),
                "selp." # TypeStr # " \t$dst, $a, $b, $p;",
                [(set (T RC:$dst),
                      (select Int1Regs:$p, ImmNode:$a, (T RC:$b)))]>;
    def ii :
      NVPTXInst<(outs RC:$dst), (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),
                "selp." # TypeStr # " \t$dst, $a, $b, $p;",
                [(set (T RC:$dst),
                      (select Int1Regs:$p, ImmNode:$a, ImmNode:$b))]>;
  }
}

// Don't pattern match on selp.{s,u}{16,32,64} -- selp.b{16,32,64} is just as
// good.
defm SELP_b16 : SELP_PATTERN<"b16", i16, Int16Regs, i16imm, imm>;
defm SELP_s16 : SELP<"s16", Int16Regs, i16imm>;
defm SELP_u16 : SELP<"u16", Int16Regs, i16imm>;
defm SELP_b32 : SELP_PATTERN<"b32", i32, Int32Regs, i32imm, imm>;
defm SELP_s32 : SELP<"s32", Int32Regs, i32imm>;
defm SELP_u32 : SELP<"u32", Int32Regs, i32imm>;
defm SELP_b64 : SELP_PATTERN<"b64", i64, Int64Regs, i64imm, imm>;
defm SELP_s64 : SELP<"s64", Int64Regs, i64imm>;
defm SELP_u64 : SELP<"u64", Int64Regs, i64imm>;
defm SELP_f16 : SELP_PATTERN<"b16", f16, Float16Regs, f16imm, fpimm>;

defm SELP_f32 : SELP_PATTERN<"f32", f32, Float32Regs, f32imm, fpimm>;
defm SELP_f64 : SELP_PATTERN<"f64", f64, Float64Regs, f64imm, fpimm>;

// This does not work as tablegen fails to infer the type of 'imm'.
// def v2f16imm : Operand<v2f16>;
// defm SELP_f16x2 : SELP_PATTERN<"b32", v2f16, Float16x2Regs, v2f16imm, imm>;

// Register/register select on v2f16 values, emitted as selp.b32.
def SELP_f16x2rr :
    NVPTXInst<(outs Float16x2Regs:$dst),
              (ins Float16x2Regs:$a, Float16x2Regs:$b, Int1Regs:$p),
              "selp.b32 \t$dst, $a, $b, $p;",
              [(set Float16x2Regs:$dst,
                    (select Int1Regs:$p, (v2f16 Float16x2Regs:$a), (v2f16 Float16x2Regs:$b)))]>;

//-----------------------------------
// Test Instructions
//-----------------------------------

// testp.infinite for f32/f64, with a register (r) or immediate (i) source.
// The result is a predicate in Int1Regs. None of these has a selection
// pattern.
def TESTINF_f32r : NVPTXInst<(outs Int1Regs:$p), (ins Float32Regs:$a),
                             "testp.infinite.f32 \t$p, $a;",
                             []>;
def TESTINF_f32i : NVPTXInst<(outs Int1Regs:$p), (ins f32imm:$a),
                             "testp.infinite.f32 \t$p, $a;",
                             []>;
def TESTINF_f64r : NVPTXInst<(outs Int1Regs:$p), (ins Float64Regs:$a),
                             "testp.infinite.f64 \t$p, $a;",
                             []>;
def TESTINF_f64i : NVPTXInst<(outs Int1Regs:$p), (ins f64imm:$a),
                             "testp.infinite.f64 \t$p, $a;",
                             []>;

//-----------------------------------
// Integer Arithmetic
//-----------------------------------

// Template for xor masquerading as int1 arithmetic.
// Both forms emit "xor.pred" regardless of OpNode; OpNode only determines
// which DAG node the pattern matches.
multiclass ADD_SUB_i1<SDNode OpNode> {
   def _rr: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
                      "xor.pred \t$dst, $a, $b;",
                      [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>;
   def _ri: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
                      "xor.pred \t$dst, $a, $b;",
                      [(set Int1Regs:$dst, (OpNode Int1Regs:$a, (imm):$b))]>;
}

// int1 addition and subtraction are both just xor.
defm ADD_i1 : ADD_SUB_i1<add>;
defm SUB_i1 : ADD_SUB_i1<sub>;

// int16, int32, and int64 signed addition.  Since nvptx is 2's complement, we
// also use these for unsigned arithmetic.
defm ADD : I3<"add.s", add>;
defm SUB : I3<"sub.s", sub>;

// int32 and int64 addition and subtraction with carry-out.
defm ADDCC : ADD_SUB_INT_CARRY<"add.cc", addc>;
defm SUBCC : ADD_SUB_INT_CARRY<"sub.cc", subc>;

// int32 and int64 addition and subtraction with carry-in and carry-out.
defm ADDCCC : ADD_SUB_INT_CARRY<"addc.cc", adde>;
defm SUBCCC : ADD_SUB_INT_CARRY<"subc.cc", sube>;

// Low half of the product.
defm MULT : I3<"mul.lo.s", mul>;

// High half of the product, signed and unsigned.
defm MULTHS : I3<"mul.hi.s", mulhs>;
defm MULTHU : I3<"mul.hi.u", mulhu>;

defm SDIV : I3<"div.s", sdiv>;
defm UDIV : I3<"div.u", udiv>;

// The ri versions of rem.s and rem.u won't be selected; DAGCombiner::visitSREM
// will lower it.
defm SREM : I3<"rem.s", srem>;
defm UREM : I3<"rem.u", urem>;

// Integer absolute value.  Each instance defines one anonymous instruction
// that emits "abs<SizeName>" and matches the `abs` node on register class RC.
// NOTE(review): this comment previously said "NumBits should be one minus the
// bit width of RC" and described the idiom from
// http://graphics.stanford.edu/~seander/bithacks.html#IntegerAbs; the
// multiclass below has no NumBits parameter and emits a single abs
// instruction, so that text appears to be stale.
multiclass ABS<RegisterClass RC, string SizeName> {
  def : NVPTXInst<(outs RC:$dst), (ins RC:$a),
                  !strconcat("abs", SizeName, " \t$dst, $a;"),
                  [(set RC:$dst, (abs RC:$a))]>;
}
defm ABS_16 : ABS<Int16Regs, ".s16">;
defm ABS_32 : ABS<Int32Regs, ".s32">;
defm ABS_64 : ABS<Int64Regs, ".s64">;

// Integer min/max.
defm SMAX : I3<"max.s", smax>;
defm UMAX : I3<"max.u", umax>;
defm SMIN : I3<"min.s", smin>;
defm UMIN : I3<"min.u", umin>;

//
// Wide multiplication
//
// The instruction records carry no selection pattern of their own (the
// pattern list is empty); they are selected through the Pat records further
// down.  Naming: <S|U><result width>[Imm[<immediate width>]].

// 32 x 32 -> 64, signed.
def MULWIDES64 :
  NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
            "mul.wide.s32 \t$dst, $a, $b;", []>;
def MULWIDES64Imm :
  NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
            "mul.wide.s32 \t$dst, $a, $b;", []>;
def MULWIDES64Imm64 :
  NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
            "mul.wide.s32 \t$dst, $a, $b;", []>;

// 32 x 32 -> 64, unsigned.
def MULWIDEU64 :
  NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
            "mul.wide.u32 \t$dst, $a, $b;", []>;
def MULWIDEU64Imm :
  NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
            "mul.wide.u32 \t$dst, $a, $b;", []>;
def MULWIDEU64Imm64 :
  NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
            "mul.wide.u32 \t$dst, $a, $b;", []>;

// 16 x 16 -> 32, signed.
def MULWIDES32 :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
            "mul.wide.s16 \t$dst, $a, $b;", []>;
def MULWIDES32Imm :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
            "mul.wide.s16 \t$dst, $a, $b;", []>;
def MULWIDES32Imm32 :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
            "mul.wide.s16 \t$dst, $a, $b;", []>;

// 16 x 16 -> 32, unsigned.
def MULWIDEU32 :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
            "mul.wide.u16 \t$dst, $a, $b;", []>;
def MULWIDEU32Imm :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
            "mul.wide.u16 \t$dst, $a, $b;", []>;
def MULWIDEU32Imm32 :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
            "mul.wide.u16 \t$dst, $a, $b;", []>;

// Target-specific wide-multiply nodes: one result, two operands that must
// share a type.
def SDTMulWide : SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>]>;
def mul_wide_signed : SDNode<"NVPTXISD::MUL_WIDE_SIGNED", SDTMulWide>;
def mul_wide_unsigned : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide>;

// Matchers for signed, unsigned mul.wide ISD nodes.
let Predicates = [doMulWide] in {
  def : Pat<(i32 (mul_wide_signed Int16Regs:$a, Int16Regs:$b)),
            (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>;
  def : Pat<(i32 (mul_wide_signed Int16Regs:$a, imm:$b)),
            (MULWIDES32Imm Int16Regs:$a, imm:$b)>;
  def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, Int16Regs:$b)),
            (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>;
  def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, imm:$b)),
            (MULWIDEU32Imm Int16Regs:$a, imm:$b)>;

  def : Pat<(i64 (mul_wide_signed Int32Regs:$a, Int32Regs:$b)),
            (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>;
  def : Pat<(i64 (mul_wide_signed Int32Regs:$a, imm:$b)),
            (MULWIDES64Imm Int32Regs:$a, imm:$b)>;
  def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, Int32Regs:$b)),
            (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>;
  def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, imm:$b)),
            (MULWIDEU64Imm Int32Regs:$a, imm:$b)>;
}

// Predicates used for converting some patterns to mul.wide.

// Immediate representable as a signed 32-bit integer.
def SInt32Const : PatLeaf<(imm), [{
  return N->getAPIntValue().isSignedIntN(32);
}]>;

// Immediate representable as an unsigned 32-bit integer.
def UInt32Const : PatLeaf<(imm), [{
  return N->getAPIntValue().isIntN(32);
}]>;

// Immediate representable as a signed 16-bit integer.
def SInt16Const : PatLeaf<(imm), [{
  return N->getAPIntValue().isSignedIntN(16);
}]>;

// Immediate representable as an unsigned 16-bit integer.
def UInt16Const : PatLeaf<(imm), [{
  return N->getAPIntValue().isIntN(16);
}]>;

def IntConst_0_30 : PatLeaf<(imm), [{
  // Accept only 0 <= amount < 31; only then will the result of (x << amount)
  // be an int32.
  const APInt &Amount = N->getAPIntValue();
  return Amount.sge(0) && Amount.slt(31);
}]>;

def IntConst_0_14 : PatLeaf<(imm), [{
  // Accept only 0 <= amount < 15; only then will the result of (x << amount)
  // be an int16.
  const APInt &Amount = N->getAPIntValue();
  return Amount.sge(0) && Amount.slt(15);
}]>;

// Rewrite a shift amount n as the 32-bit multiplier (1 << n).
def SHL2MUL32 : SDNodeXForm<imm, [{
  return CurDAG->getTargetConstant(APInt(32, 1).shl(N->getAPIntValue()),
                                   SDLoc(N), MVT::i32);
}]>;

// Rewrite a shift amount n as the 16-bit multiplier (1 << n).
def SHL2MUL16 : SDNodeXForm<imm, [{
  return CurDAG->getTargetConstant(APInt(16, 1).shl(N->getAPIntValue()),
                                   SDLoc(N), MVT::i16);
}]>;

// Convert "sign/zero-extend, then shift left by an immediate" to mul.wide.
let Predicates = [doMulWide] in {
  def : Pat<(shl (sext Int32Regs:$a), (i32 IntConst_0_30:$b)),
            (MULWIDES64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>;
  def : Pat<(shl (zext Int32Regs:$a), (i32 IntConst_0_30:$b)),
            (MULWIDEU64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>;

  def : Pat<(shl (sext Int16Regs:$a), (i16 IntConst_0_14:$b)),
            (MULWIDES32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>;
  def : Pat<(shl (zext Int16Regs:$a), (i16 IntConst_0_14:$b)),
            (MULWIDEU32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>;
}

// Convert "sign/zero-extend then multiply" to mul.wide.
let Predicates = [doMulWide] in {
  // 32-bit operands widened to a 64-bit product.
  def : Pat<(mul (sext Int32Regs:$a), (sext Int32Regs:$b)),
            (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>;
  def : Pat<(mul (sext Int32Regs:$a), (i64 SInt32Const:$b)),
            (MULWIDES64Imm64 Int32Regs:$a, (i64 SInt32Const:$b))>;

  def : Pat<(mul (zext Int32Regs:$a), (zext Int32Regs:$b)),
            (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>;
  def : Pat<(mul (zext Int32Regs:$a), (i64 UInt32Const:$b)),
            (MULWIDEU64Imm64 Int32Regs:$a, (i64 UInt32Const:$b))>;

  // 16-bit operands widened to a 32-bit product.
  def : Pat<(mul (sext Int16Regs:$a), (sext Int16Regs:$b)),
            (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>;
  def : Pat<(mul (sext Int16Regs:$a), (i32 SInt16Const:$b)),
            (MULWIDES32Imm32 Int16Regs:$a, (i32 SInt16Const:$b))>;

  def : Pat<(mul (zext Int16Regs:$a), (zext Int16Regs:$b)),
            (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>;
  def : Pat<(mul (zext Int16Regs:$a), (i32 UInt16Const:$b)),
            (MULWIDEU32Imm32 Int16Regs:$a, (i32 UInt16Const:$b))>;
}

//
// Integer multiply-add
//
// imad: one integer result, three operands, all of the same type.
def SDTIMAD :
  SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisInt<0>, SDTCisInt<2>,
                       SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>]>;
def imad : SDNode<"NVPTXISD::IMAD", SDTIMAD>;

// One multiply-add opcode family.  The suffix letters give the kind of the
// $a, $b, $c operands: r = register of class RC, i = immediate of ImmCls.
// Opc is the full mnemonic, e.g. "mad.lo.s32".
multiclass MAD_LO<string Opc, RegisterClass RC, Operand ImmCls> {
  def rrr :
    NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
              !strconcat(Opc, " \t$dst, $a, $b, $c;"),
              [(set RC:$dst, (imad RC:$a, RC:$b, RC:$c))]>;
  def rri :
    NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, ImmCls:$c),
              !strconcat(Opc, " \t$dst, $a, $b, $c;"),
              [(set RC:$dst, (imad RC:$a, RC:$b, imm:$c))]>;
  def rir :
    NVPTXInst<(outs RC:$dst), (ins RC:$a, ImmCls:$b, RC:$c),
              !strconcat(Opc, " \t$dst, $a, $b, $c;"),
              [(set RC:$dst, (imad RC:$a, imm:$b, RC:$c))]>;
  def rii :
    NVPTXInst<(outs RC:$dst), (ins RC:$a, ImmCls:$b, ImmCls:$c),
              !strconcat(Opc, " \t$dst, $a, $b, $c;"),
              [(set RC:$dst, (imad RC:$a, imm:$b, imm:$c))]>;
}

// Expands to MAD16rrr/rri/rir/rii, MAD32rrr/..., MAD64rrr/...
defm MAD16 : MAD_LO<"mad.lo.s16", Int16Regs, i16imm>;
defm MAD32 : MAD_LO<"mad.lo.s32", Int32Regs, i32imm>;
defm MAD64 : MAD_LO<"mad.lo.s64", Int64Regs, i64imm>;

// Integer negate; Ty is the type suffix of the "neg" mnemonic.
class INEG_INST<string Ty, RegisterClass RC> :
  NVPTXInst<(outs RC:$dst), (ins RC:$src),
            !strconcat("neg.", Ty, " \t$dst, $src;"),
            [(set RC:$dst, (ineg RC:$src))]>;

def INEG16 : INEG_INST<"s16", Int16Regs>;
def INEG32 : INEG_INST<"s32", Int32Regs>;
def INEG64 : INEG_INST<"s64", Int64Regs>;

//-----------------------------------
// Floating Point Arithmetic
//-----------------------------------

// Constant 1.0f
def FloatConst1 : PatLeaf<(fpimm), [{
  const APFloat &Val = N->getValueAPF();
  return &Val.getSemantics() == &llvm::APFloat::IEEEsingle() &&
         Val.convertToFloat() == 1.0f;
}]>;
// Constant 1.0 (double)
def DoubleConst1 : PatLeaf<(fpimm), [{
  const APFloat &Val = N->getValueAPF();
  return &Val.getSemantics() == &llvm::APFloat::IEEEdouble() &&
         Val.convertToDouble() == 1.0;
}]>;

// Loads FP16 constant into a register.
//
// ptxas does not have hex representation for fp16, so we can't use
// fp16 immediate values in .f16 instructions. Instead we have to load
// the constant into a register using mov.b16.
def LOAD_CONST_F16 :
  NVPTXInst<(outs Float16Regs:$dst), (ins f16imm:$a),
            "mov.b16 \t$dst, $a;", []>;

defm FADD : F3_fma_component<"add", fadd>;
defm FSUB : F3_fma_component<"sub", fsub>;
defm FMUL : F3_fma_component<"mul", fmul>;

defm FMIN : F3<"min", fminnum>;
defm FMAX : F3<"max", fmaxnum>;
// Note: min.NaN.f64 and max.NaN.f64 do not actually exist.
defm FMINNAN : F3<"min.NaN", fminimum>;
defm FMAXNAN : F3<"max.NaN", fmaximum>;

defm FABS  : F2<"abs", fabs>;
defm FNEG  : F2<"neg", fneg>;
defm FSQRT : F2<"sqrt.rn", fsqrt>;

//
// F16 NEG
//
// OpcStr is the complete mnemonic, T the value type matched by the pattern,
// RC the register class, and Pred an extra predicate added to the fixed
// FP16/PTX/SM requirements.
class FNEG_F16_F16X2<string OpcStr, ValueType T, RegisterClass RC,
                     Predicate Pred> :
  NVPTXInst<(outs RC:$dst), (ins RC:$src),
            OpcStr # " \t$dst, $src;",
            [(set RC:$dst, (fneg (T RC:$src)))]>,
  Requires<[useFP16Math, hasPTX60, hasSM53, Pred]>;

def FNEG16_ftz   : FNEG_F16_F16X2<"neg.ftz.f16", f16, Float16Regs, doF32FTZ>;
def FNEG16       : FNEG_F16_F16X2<"neg.f16", f16, Float16Regs, True>;
def FNEG16x2_ftz : FNEG_F16_F16X2<"neg.ftz.f16x2", v2f16, Float16x2Regs, doF32FTZ>;
def FNEG16x2     : FNEG_F16_F16X2<"neg.f16x2", v2f16, Float16x2Regs, True>;

// NOTE(review): from here on, several records match the same fdiv DAG and
// differ only in their predicate lists.  Their relative order is kept exactly
// as it was; confirm how ties are broken before reordering any of them.

//
// F64 division
//
// 1.0 / b is emitted as a reciprocal; $a is matched but not printed.
def FDIV641r :
  NVPTXInst<(outs Float64Regs:$dst), (ins f64imm:$a, Float64Regs:$b),
            "rcp.rn.f64 \t$dst, $b;",
            [(set Float64Regs:$dst,
                  (fdiv DoubleConst1:$a, Float64Regs:$b))]>;
def FDIV64rr :
  NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a, Float64Regs:$b),
            "div.rn.f64 \t$dst, $a, $b;",
            [(set Float64Regs:$dst,
                  (fdiv Float64Regs:$a, Float64Regs:$b))]>;
def FDIV64ri :
  NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a, f64imm:$b),
            "div.rn.f64 \t$dst, $a, $b;",
            [(set Float64Regs:$dst, (fdiv Float64Regs:$a, fpimm:$b))]>;

//
// F32 Approximate reciprocal
//
let Predicates = [do_DIVF32_APPROX, doF32FTZ] in
def FDIV321r_ftz :
  NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$a, Float32Regs:$b),
            "rcp.approx.ftz.f32 \t$dst, $b;",
            [(set Float32Regs:$dst,
                  (fdiv FloatConst1:$a, Float32Regs:$b))]>;
let Predicates = [do_DIVF32_APPROX] in
def FDIV321r :
  NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$a, Float32Regs:$b),
            "rcp.approx.f32 \t$dst, $b;",
            [(set Float32Regs:$dst,
                  (fdiv FloatConst1:$a, Float32Regs:$b))]>;
//
// F32 Approximate division
//
let Predicates = [do_DIVF32_APPROX, doF32FTZ] in {
  def FDIV32approxrr_ftz :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
              "div.approx.ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                    (fdiv Float32Regs:$a, Float32Regs:$b))]>;
  def FDIV32approxri_ftz :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
              "div.approx.ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>;
}
let Predicates = [do_DIVF32_APPROX] in {
  def FDIV32approxrr :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
              "div.approx.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                    (fdiv Float32Regs:$a, Float32Regs:$b))]>;
  def FDIV32approxri :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
              "div.approx.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>;
}
//
// F32 Semi-accurate reciprocal
//
// rcp.approx gives the same result as div.full(1.0f, a) and is faster.
//
let Predicates = [do_DIVF32_FULL, doF32FTZ] in
def FDIV321r_approx_ftz :
  NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$a, Float32Regs:$b),
            "rcp.approx.ftz.f32 \t$dst, $b;",
            [(set Float32Regs:$dst,
                  (fdiv FloatConst1:$a, Float32Regs:$b))]>;
let Predicates = [do_DIVF32_FULL] in
def FDIV321r_approx :
  NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$a, Float32Regs:$b),
            "rcp.approx.f32 \t$dst, $b;",
            [(set Float32Regs:$dst,
                  (fdiv FloatConst1:$a, Float32Regs:$b))]>;
//
// F32 Semi-accurate division
//
let Predicates = [do_DIVF32_FULL, doF32FTZ] in {
  def FDIV32rr_ftz :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
              "div.full.ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                    (fdiv Float32Regs:$a, Float32Regs:$b))]>;
  def FDIV32ri_ftz :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
              "div.full.ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>;
}
let Predicates = [do_DIVF32_FULL] in {
  def FDIV32rr :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
              "div.full.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                    (fdiv Float32Regs:$a, Float32Regs:$b))]>;
  def FDIV32ri :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
              "div.full.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>;
}
//
// F32 Accurate reciprocal
//
let Predicates = [doF32FTZ] in
def FDIV321r_prec_ftz :
  NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$a, Float32Regs:$b),
            "rcp.rn.ftz.f32 \t$dst, $b;",
            [(set Float32Regs:$dst,
                  (fdiv FloatConst1:$a, Float32Regs:$b))]>;
def FDIV321r_prec :
  NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$a, Float32Regs:$b),
            "rcp.rn.f32 \t$dst, $b;",
            [(set Float32Regs:$dst,
                  (fdiv FloatConst1:$a, Float32Regs:$b))]>;
//
// F32 Accurate division
//
let Predicates = [doF32FTZ] in {
  def FDIV32rr_prec_ftz :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
              "div.rn.ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                    (fdiv Float32Regs:$a, Float32Regs:$b))]>;
  def FDIV32ri_prec_ftz :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
              "div.rn.ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>;
}
// No predicate on the last two: they apply whenever none of the guarded
// f32 division records above is enabled.
def FDIV32rr_prec :
  NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
            "div.rn.f32 \t$dst, $a, $b;",
            [(set Float32Regs:$dst,
                  (fdiv Float32Regs:$a, Float32Regs:$b))]>;
def FDIV32ri_prec :
  NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
            "div.rn.f32 \t$dst, $a, $b;",
            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>;

//
// FMA
//

// Fused multiply-add for register class RC with immediates of ImmCls.
// Suffix letters give the kind of $a, $b, $c (r = register, i = immediate).
// Every variant is guarded by Pred.
multiclass FMA<string OpcStr, RegisterClass RC, Operand ImmCls,
               Predicate Pred> {
  let Predicates = [Pred] in {
    def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
                        OpcStr # " \t$dst, $a, $b, $c;",
                        [(set RC:$dst, (fma RC:$a, RC:$b, RC:$c))]>;
    def rri : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, ImmCls:$c),
                        OpcStr # " \t$dst, $a, $b, $c;",
                        [(set RC:$dst, (fma RC:$a, RC:$b, fpimm:$c))]>;
    def rir : NVPTXInst<(outs RC:$dst), (ins RC:$a, ImmCls:$b, RC:$c),
                        OpcStr # " \t$dst, $a, $b, $c;",
                        [(set RC:$dst, (fma RC:$a, fpimm:$b, RC:$c))]>;
    def rii : NVPTXInst<(outs RC:$dst), (ins RC:$a, ImmCls:$b, ImmCls:$c),
                        OpcStr # " \t$dst, $a, $b, $c;",
                        [(set RC:$dst, (fma RC:$a, fpimm:$b, fpimm:$c))]>;
  }
}

// Register-only FMA for the f16 / f16x2 types; T is the value type the
// pattern operands are cast to.  Guarded by useFP16Math plus Pred.
multiclass FMA_F16<string OpcStr, ValueType T, RegisterClass RC,
                   Predicate Pred> {
  let Predicates = [useFP16Math, Pred] in
  def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
                      OpcStr # " \t$dst, $a, $b, $c;",
                      [(set RC:$dst, (fma (T RC:$a), (T RC:$b), (T RC:$c)))]>;
}

defm FMA16_ftz   : FMA_F16<"fma.rn.ftz.f16", f16, Float16Regs, doF32FTZ>;
defm FMA16       : FMA_F16<"fma.rn.f16", f16, Float16Regs, True>;
defm FMA16x2_ftz : FMA_F16<"fma.rn.ftz.f16x2", v2f16, Float16x2Regs, doF32FTZ>;
defm FMA16x2     : FMA_F16<"fma.rn.f16x2", v2f16, Float16x2Regs, True>;
defm FMA32_ftz   : FMA<"fma.rn.ftz.f32", Float32Regs, f32imm, doF32FTZ>;
defm FMA32       : FMA<"fma.rn.f32", Float32Regs, f32imm, True>;
defm FMA64       : FMA<"fma.rn.f64", Float64Regs, f64imm, True>;

// sin/cos
// Only the approximate f32 forms are provided, and only under
// allowUnsafeFPMath.
let Predicates = [allowUnsafeFPMath] in {
  def SINF : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
                       "sin.approx.f32 \t$dst, $src;",
                       [(set Float32Regs:$dst, (fsin Float32Regs:$src))]>;
  def COSF : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
                       "cos.approx.f32 \t$dst, $src;",
                       [(set Float32Regs:$dst, (fcos Float32Regs:$src))]>;
}

// Lower (frem x, y) into (sub x, (mul (ftrunc (div x, y)) y)),
// i.e. "poor man's fmod()". When y is infinite, x is returned. This matches the
// semantics of LLVM's frem.

// Each type below has two pattern pairs (register $y, immediate $y):
//  * under allowUnsafeFPMath, the plain x - trunc(x / y) * y expansion;
//  * under noUnsafeFPMath, the same expansion wrapped in a select that
//    yields $x when the infinity test on $y is true.

// frem - f32 FTZ
def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
          (FSUBf32rr_ftz Float32Regs:$x, (FMULf32rr_ftz (CVT_f32_f32
            (FDIV32rr_prec_ftz Float32Regs:$x, Float32Regs:$y), CvtRZI_FTZ),
            Float32Regs:$y))>,
      Requires<[doF32FTZ, allowUnsafeFPMath]>;
def : Pat<(frem Float32Regs:$x, fpimm:$y),
          (FSUBf32rr_ftz Float32Regs:$x, (FMULf32ri_ftz (CVT_f32_f32
            (FDIV32ri_prec_ftz Float32Regs:$x, fpimm:$y), CvtRZI_FTZ),
            fpimm:$y))>,
      Requires<[doF32FTZ, allowUnsafeFPMath]>;

def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
          (SELP_f32rr Float32Regs:$x,
            (FSUBf32rr_ftz Float32Regs:$x, (FMULf32rr_ftz (CVT_f32_f32
              (FDIV32rr_prec_ftz Float32Regs:$x, Float32Regs:$y), CvtRZI_FTZ),
              Float32Regs:$y)),
            (TESTINF_f32r Float32Regs:$y))>,
      Requires<[doF32FTZ, noUnsafeFPMath]>;
def : Pat<(frem Float32Regs:$x, fpimm:$y),
          (SELP_f32rr Float32Regs:$x,
            (FSUBf32rr_ftz Float32Regs:$x, (FMULf32ri_ftz (CVT_f32_f32
              (FDIV32ri_prec_ftz Float32Regs:$x, fpimm:$y), CvtRZI_FTZ),
              fpimm:$y)),
            (TESTINF_f32i fpimm:$y))>,
      Requires<[doF32FTZ, noUnsafeFPMath]>;

// frem - f32
def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
          (FSUBf32rr Float32Regs:$x, (FMULf32rr (CVT_f32_f32
            (FDIV32rr_prec Float32Regs:$x, Float32Regs:$y), CvtRZI),
            Float32Regs:$y))>,
      Requires<[allowUnsafeFPMath]>;
def : Pat<(frem Float32Regs:$x, fpimm:$y),
          (FSUBf32rr Float32Regs:$x, (FMULf32ri (CVT_f32_f32
            (FDIV32ri_prec Float32Regs:$x, fpimm:$y), CvtRZI),
            fpimm:$y))>,
      Requires<[allowUnsafeFPMath]>;

def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
          (SELP_f32rr Float32Regs:$x,
            (FSUBf32rr Float32Regs:$x, (FMULf32rr (CVT_f32_f32
              (FDIV32rr_prec Float32Regs:$x, Float32Regs:$y), CvtRZI),
              Float32Regs:$y)),
            (TESTINF_f32r Float32Regs:$y))>,
      Requires<[noUnsafeFPMath]>;
def : Pat<(frem Float32Regs:$x, fpimm:$y),
          (SELP_f32rr Float32Regs:$x,
            (FSUBf32rr Float32Regs:$x, (FMULf32ri (CVT_f32_f32
              (FDIV32ri_prec Float32Regs:$x, fpimm:$y), CvtRZI),
              fpimm:$y)),
            (TESTINF_f32i fpimm:$y))>,
      Requires<[noUnsafeFPMath]>;

// frem - f64
def : Pat<(frem Float64Regs:$x, Float64Regs:$y),
          (FSUBf64rr Float64Regs:$x, (FMULf64rr (CVT_f64_f64
            (FDIV64rr Float64Regs:$x, Float64Regs:$y), CvtRZI),
            Float64Regs:$y))>,
      Requires<[allowUnsafeFPMath]>;
def : Pat<(frem Float64Regs:$x, fpimm:$y),
          (FSUBf64rr Float64Regs:$x, (FMULf64ri (CVT_f64_f64
            (FDIV64ri Float64Regs:$x, fpimm:$y), CvtRZI),
            fpimm:$y))>,
      Requires<[allowUnsafeFPMath]>;

def : Pat<(frem Float64Regs:$x, Float64Regs:$y),
          (SELP_f64rr Float64Regs:$x,
            (FSUBf64rr Float64Regs:$x, (FMULf64rr (CVT_f64_f64
              (FDIV64rr Float64Regs:$x, Float64Regs:$y), CvtRZI),
              Float64Regs:$y)),
            (TESTINF_f64r Float64Regs:$y))>,
      Requires<[noUnsafeFPMath]>;
// $y is an immediate in this pattern, so the infinity test must use the
// immediate form, as the two f32 immediate patterns above do.  Feeding
// the fpimm operand to the register form (TESTINF_f64r Float64Regs:$y) put a
// constant node into a register operand.
def : Pat<(frem Float64Regs:$x, fpimm:$y),
          (SELP_f64rr Float64Regs:$x,
            (FSUBf64rr Float64Regs:$x, (FMULf64ri (CVT_f64_f64
              (FDIV64ri Float64Regs:$x, fpimm:$y), CvtRZI),
              fpimm:$y)),
            (TESTINF_f64i fpimm:$y))>,
      Requires<[noUnsafeFPMath]>;

//-----------------------------------
// Bitwise operations
//-----------------------------------

// Template for three-operand bitwise operations.  Takes the opcode string and
// the SDNode, and creates .b16, .b32, .b64, and .pred (predicate registers --
// i.e., i1) versions of OpcStr.
multiclass BITWISE<string OpcStr, SDNode OpNode> {
  // Predicate (i1) registers.
  def b1rr :
    NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
              OpcStr # ".pred \t$dst, $a, $b;",
              [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>;
  def b1ri :
    NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
              OpcStr # ".pred \t$dst, $a, $b;",
              [(set Int1Regs:$dst, (OpNode Int1Regs:$a, imm:$b))]>;
  // 16-bit.
  def b16rr :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
              OpcStr # ".b16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>;
  def b16ri :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
              OpcStr # ".b16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst, (OpNode Int16Regs:$a, imm:$b))]>;
  // 32-bit.
  def b32rr :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # ".b32 \t$dst, $a, $b;",
              [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
  def b32ri :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
              OpcStr # ".b32 \t$dst, $a, $b;",
              [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
  // 64-bit.
  def b64rr :
    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
              OpcStr # ".b64 \t$dst, $a, $b;",
              [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>;
  def b64ri :
    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
              OpcStr # ".b64 \t$dst, $a, $b;",
              [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;
}

defm OR  : BITWISE<"or", or>;
defm AND : BITWISE<"and", and>;
defm XOR : BITWISE<"xor", xor>;

// Bitwise complement; Ty is the type suffix of the "not" mnemonic.
class NOT_INST<string Ty, RegisterClass RC> :
  NVPTXInst<(outs RC:$dst), (ins RC:$src),
            "not." # Ty # " \t$dst, $src;",
            [(set RC:$dst, (not RC:$src))]>;

def NOT1  : NOT_INST<"pred", Int1Regs>;
def NOT16 : NOT_INST<"b16", Int16Regs>;
def NOT32 : NOT_INST<"b32", Int32Regs>;
def NOT64 : NOT_INST<"b64", Int64Regs>;

// Template for left/right shifts.  Takes three operands,
//   [dest (reg), src (reg), shift (reg or imm)].
// dest and src may be int64, int32, or int16, but shift is always int32.
//
// This template also defines a 32-bit shift (imm, imm) instruction.
//
// OpcStr is the mnemonic up to, but not including, the bit width; the width
// ("64", "32", "16") is appended here.
multiclass SHIFT<string OpcStr, SDNode OpNode> {
  def i64rr :
    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int32Regs:$b),
              OpcStr # "64 \t$dst, $a, $b;",
              [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int32Regs:$b))]>;
  def i64ri :
    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i32imm:$b),
              OpcStr # "64 \t$dst, $a, $b;",
              [(set Int64Regs:$dst, (OpNode Int64Regs:$a, (i32 imm:$b)))]>;
  def i32rr :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # "32 \t$dst, $a, $b;",
              [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
  def i32ri :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
              OpcStr # "32 \t$dst, $a, $b;",
              [(set Int32Regs:$dst, (OpNode Int32Regs:$a, (i32 imm:$b)))]>;
  def i32ii :
    NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$a, i32imm:$b),
              OpcStr # "32 \t$dst, $a, $b;",
              [(set Int32Regs:$dst, (OpNode (i32 imm:$a), (i32 imm:$b)))]>;
  def i16rr :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int32Regs:$b),
              OpcStr # "16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int32Regs:$b))]>;
  def i16ri :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
              OpcStr # "16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (i32 imm:$b)))]>;
}

defm SHL : SHIFT<"shl.b", shl>;
defm SRA : SHIFT<"shr.s", sra>;
defm SRL : SHIFT<"shr.u", srl>;

// Bit-reverse
class BREV_INST<string Ty, RegisterClass RC> :
  NVPTXInst<(outs RC:$dst), (ins RC:$a),
            "brev." # Ty # " \t$dst, $a;",
            [(set RC:$dst, (bitreverse RC:$a))]>;

def BREV32 : BREV_INST<"b32", Int32Regs>;
def BREV64 : BREV_INST<"b64", Int64Regs>;

//
// Rotate: Use ptx shf instruction if available.
//
// All four records are guarded by hasHWROT32 and pass $src as both data
// operands of shf.

let Predicates = [hasHWROT32] in {
  // 32 bit r2 = rotl r1, n
  //    =>
  //        r2 = shf.l r1, r1, n
  def ROTL32imm_hw :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt),
              "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
              [(set Int32Regs:$dst, (rotl Int32Regs:$src, (i32 imm:$amt)))]>;

  def ROTL32reg_hw :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
              "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
              [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>;

  // 32 bit r2 = rotr r1, n
  //    =>
  //        r2 = shf.r r1, r1, n
  def ROTR32imm_hw :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt),
              "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
              [(set Int32Regs:$dst, (rotr Int32Regs:$src, (i32 imm:$amt)))]>;

  def ROTR32reg_hw :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
              "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
              [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>;
}

// 32-bit software rotate by immediate.  $amt2 should equal 32 - $amt1.
// Emitted as a braced PTX block with two scratch registers: shift left by
// $amt1, shift right by $amt2, and add the halves.  No pattern is attached;
// the rotl/rotr Pat records below select it.
def ROT32imm_sw :
  NVPTXInst<(outs Int32Regs:$dst),
            (ins Int32Regs:$src, i32imm:$amt1, i32imm:$amt2),
            "{{\n\t"
            ".reg .b32 %lhs;\n\t"
            ".reg .b32 %rhs;\n\t"
            "shl.b32 \t%lhs, $src, $amt1;\n\t"
            "shr.b32 \t%rhs, $src, $amt2;\n\t"
            "add.u32 \t$dst, %lhs, %rhs;\n\t"
            "}}",
            []>;

// Maps an immediate n to the i32 target constant 32 - n.
def SUB_FRM_32 : SDNodeXForm<imm, [{
  uint64_t Complement = 32 - N->getZExtValue();
  return CurDAG->getTargetConstant(Complement, SDLoc(N), MVT::i32);
}]>;

// Immediate rotates when the hardware rotate is unavailable.  A rotate right
// by n is expressed with the two shift amounts swapped.
let Predicates = [noHWROT32] in {
  def : Pat<(rotl Int32Regs:$src, (i32 imm:$amt)),
            (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>;
  def : Pat<(rotr Int32Regs:$src, (i32 imm:$amt)),
            (ROT32imm_sw Int32Regs:$src, (SUB_FRM_32 node:$amt), imm:$amt)>;
}

let Predicates = [noHWROT32] in {
  // 32-bit software rotate left by register.
  def ROTL32reg_sw :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
              "{{\n\t"
              ".reg .b32 %lhs;\n\t"
              ".reg .b32 %rhs;\n\t"
              ".reg .b32 %amt2;\n\t"
              "shl.b32 \t%lhs, $src, $amt;\n\t"
              "sub.s32 \t%amt2, 32, $amt;\n\t"
              "shr.b32 \t%rhs, $src, %amt2;\n\t"
              "add.u32 \t$dst, %lhs, %rhs;\n\t"
              "}}",
              [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>;

  // 32-bit software rotate right by register.
  def ROTR32reg_sw :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
              "{{\n\t"
              ".reg .b32 %lhs;\n\t"
              ".reg .b32 %rhs;\n\t"
              ".reg .b32 %amt2;\n\t"
              "shr.b32 \t%lhs, $src, $amt;\n\t"
              "sub.s32 \t%amt2, 32, $amt;\n\t"
              "shl.b32 \t%rhs, $src, %amt2;\n\t"
              "add.u32 \t$dst, %lhs, %rhs;\n\t"
              "}}",
              [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>;
}

// 64-bit software rotate by immediate.  $amt2 should equal 64 - $amt1.
// Emitted as a braced PTX block with two scratch registers: shift left by
// $amt1, shift right by $amt2, and add the halves.  No pattern is attached;
// the rotl/rotr Pat records below select it.
def ROT64imm_sw :
  NVPTXInst<(outs Int64Regs:$dst),
            (ins Int64Regs:$src, i32imm:$amt1, i32imm:$amt2),
            "{{\n\t"
            ".reg .b64 %lhs;\n\t"
            ".reg .b64 %rhs;\n\t"
            "shl.b64 \t%lhs, $src, $amt1;\n\t"
            "shr.b64 \t%rhs, $src, $amt2;\n\t"
            "add.u64 \t$dst, %lhs, %rhs;\n\t"
            "}}",
            []>;

// Maps an immediate n to the i32 target constant 64 - n.
def SUB_FRM_64 : SDNodeXForm<imm, [{
  uint64_t Complement = 64 - N->getZExtValue();
  return CurDAG->getTargetConstant(Complement, SDLoc(N), MVT::i32);
}]>;

// 64-bit immediate rotates; a rotate right by n swaps the two shift amounts.
// These patterns carry no predicate.
def : Pat<(rotl Int64Regs:$src, (i32 imm:$amt)),
          (ROT64imm_sw Int64Regs:$src, imm:$amt, (SUB_FRM_64 node:$amt))>;
def : Pat<(rotr Int64Regs:$src, (i32 imm:$amt)),
          (ROT64imm_sw Int64Regs:$src, (SUB_FRM_64 node:$amt), imm:$amt)>;

// 64-bit software rotate left by register.
def ROTL64reg_sw :
  NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt),
            "{{\n\t"
            ".reg .b64 %lhs;\n\t"
            ".reg .b64 %rhs;\n\t"
            ".reg .u32 %amt2;\n\t"
            "shl.b64 \t%lhs, $src, $amt;\n\t"
            "sub.u32 \t%amt2, 64, $amt;\n\t"
            "shr.b64 \t%rhs, $src, %amt2;\n\t"
            "add.u64 \t$dst, %lhs, %rhs;\n\t"
            "}}",
            [(set Int64Regs:$dst, (rotl Int64Regs:$src, Int32Regs:$amt))]>;

// 64-bit software rotate right by register.
def ROTR64reg_sw :
  NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt),
            "{{\n\t"
            ".reg .b64 %lhs;\n\t"
            ".reg .b64 %rhs;\n\t"
            ".reg .u32 %amt2;\n\t"
            "shr.b64 \t%lhs, $src, $amt;\n\t"
            "sub.u32 \t%amt2, 64, $amt;\n\t"
            "shl.b64 \t%rhs, $src, %amt2;\n\t"
            "add.u64 \t$dst, %lhs, %rhs;\n\t"
            "}}",
            [(set Int64Regs:$dst, (rotr Int64Regs:$src, Int32Regs:$amt))]>;

//
// Funnel shift in clamp mode
//

// Create SDNodes so they can be used in the DAG code, e.g.
// NVPTXISelLowering (LowerShiftLeftParts and LowerShiftRightParts)
def FUN_SHFL_CLAMP : SDNode<"NVPTXISD::FUN_SHFL_CLAMP", SDTIntShiftDOp, []>;
def FUN_SHFR_CLAMP : SDNode<"NVPTXISD::FUN_SHFR_CLAMP", SDTIntShiftDOp, []>;

// Left funnel shift: operands are printed in the order $lo, $hi, $amt.
def FUNSHFLCLAMP :
  NVPTXInst<(outs Int32Regs:$dst),
            (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
            "shf.l.clamp.b32 \t$dst, $lo, $hi, $amt;",
            [(set Int32Regs:$dst,
                  (FUN_SHFL_CLAMP Int32Regs:$lo, Int32Regs:$hi,
                                  Int32Regs:$amt))]>;

// Right funnel shift: operands are printed in the order $lo, $hi, $amt.
def FUNSHFRCLAMP :
  NVPTXInst<(outs Int32Regs:$dst),
            (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
            "shf.r.clamp.b32 \t$dst, $lo, $hi, $amt;",
            [(set Int32Regs:$dst,
                  (FUN_SHFR_CLAMP Int32Regs:$lo, Int32Regs:$hi,
                                  Int32Regs:$amt))]>;

//
// BFE - bit-field extract
//

// Template for BFE instructions.  Takes four args,
//   [dest (reg), src (reg), start (reg or imm), end (reg or imm)].
// Start may be an imm only if end is also an imm.  FIXME: Is this a
// restriction in PTX?
// NOTE(review): the PTX ISA describes the last operand as a field length
// rather than an end position -- confirm and reword if so.
//
// dest and src may be int32 or int64, but start and end are always int32.
// TyStr is the type suffix placed after "bfe."; RC is the register class of
// $d and $a.  Suffix letters give the kind of $a, $b, $c (r = register,
// i = immediate).  No selection patterns are attached.
multiclass BFE<string TyStr, RegisterClass RC> {
  def rrr :
    NVPTXInst<(outs RC:$d), (ins RC:$a, Int32Regs:$b, Int32Regs:$c),
              "bfe." # TyStr # " \t$d, $a, $b, $c;", []>;
  def rri :
    NVPTXInst<(outs RC:$d), (ins RC:$a, Int32Regs:$b, i32imm:$c),
              "bfe." # TyStr # " \t$d, $a, $b, $c;", []>;
  def rii :
    NVPTXInst<(outs RC:$d), (ins RC:$a, i32imm:$b, i32imm:$c),
              "bfe." # TyStr # " \t$d, $a, $b, $c;", []>;
}

let hasSideEffects = false in {
  defm BFE_S32 : BFE<"s32", Int32Regs>;
  defm BFE_U32 : BFE<"u32", Int32Regs>;
  defm BFE_S64 : BFE<"s64", Int64Regs>;
  defm BFE_U64 : BFE<"u64", Int64Regs>;
}

//-----------------------------------
// Comparison instructions (setp, set)
//-----------------------------------

// FIXME: This doesn't cover versions of set and setp that combine with a
// boolean predicate, e.g. setp.eq.and.b16.

let hasSideEffects = false in {
  // setp: compare $a with $b and write an i1 predicate.  The comparison is
  // carried as the $cmp operand and printed through its "base" and "ftz"
  // modifiers.  Variants: rr, ri (immediate right), ir (immediate left).
  multiclass SETP<string TypeStr, RegisterClass RC, Operand ImmCls> {
    def rr :
      NVPTXInst<(outs Int1Regs:$dst), (ins RC:$a, RC:$b, CmpMode:$cmp),
                "setp${cmp:base}${cmp:ftz}." # TypeStr #
                " \t$dst, $a, $b;", []>;
    def ri :
      NVPTXInst<(outs Int1Regs:$dst), (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
                "setp${cmp:base}${cmp:ftz}." # TypeStr #
                " \t$dst, $a, $b;", []>;
    def ir :
      NVPTXInst<(outs Int1Regs:$dst), (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
                "setp${cmp:base}${cmp:ftz}." # TypeStr #
                " \t$dst, $a, $b;", []>;
  }
}

defm SETP_b16 : SETP<"b16", Int16Regs, i16imm>;
defm SETP_s16 : SETP<"s16", Int16Regs, i16imm>;
defm SETP_u16 : SETP<"u16", Int16Regs, i16imm>;
defm SETP_b32 : SETP<"b32", Int32Regs, i32imm>;
defm SETP_s32 : SETP<"s32", Int32Regs, i32imm>;
defm SETP_u32 : SETP<"u32", Int32Regs, i32imm>;
defm SETP_b64 : SETP<"b64", Int64Regs, i64imm>;
defm SETP_s64 : SETP<"s64", Int64Regs, i64imm>;
defm SETP_u64 : SETP<"u64", Int64Regs, i64imm>;
defm SETP_f32 : SETP<"f32", Float32Regs, f32imm>;
defm SETP_f64 : SETP<"f64", Float64Regs, f64imm>;

// f16 compare; only the register/register form is defined, and it is gated
// on useFP16Math.
def SETP_f16rr :
  NVPTXInst<(outs Int1Regs:$dst),
            (ins Float16Regs:$a, Float16Regs:$b, CmpMode:$cmp),
            "setp${cmp:base}${cmp:ftz}.f16 \t$dst, $a, $b;",
            []>,
  Requires<[useFP16Math]>;

// f16x2 compare; writes two predicates ($p|$q).
def SETP_f16x2rr :
  NVPTXInst<(outs Int1Regs:$p, Int1Regs:$q),
            (ins Float16x2Regs:$a, Float16x2Regs:$b, CmpMode:$cmp),
            "setp${cmp:base}${cmp:ftz}.f16x2 \t$p|$q, $a, $b;",
            []>,
  Requires<[useFP16Math]>;


// FIXME: This doesn't appear to be correct.  The "set" mnemonic has the form
// "set.CmpOp{.ftz}.dtype.stype", where dtype is the type of the destination
// reg, either u32, s32, or f32.  Anyway these aren't used at the moment.

let hasSideEffects = false in {
  // set<cmp>.<TypeStr>: like SETP but the result is written to a 32-bit
  // integer register.  Variants rr / ri / ir follow the same operand-kind
  // naming as SETP.
  multiclass SET<string TypeStr, RegisterClass RC, Operand ImmCls> {
    def rr :
      NVPTXInst<(outs Int32Regs:$dst),
                (ins RC:$a, RC:$b, CmpMode:$cmp),
                !strconcat("set$cmp.", TypeStr, " \t$dst, $a, $b;"), []>;
    def ri :
      NVPTXInst<(outs Int32Regs:$dst),
                (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
                !strconcat("set$cmp.", TypeStr, " \t$dst, $a, $b;"), []>;
    def ir :
      NVPTXInst<(outs Int32Regs:$dst),
                (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
                !strconcat("set$cmp.", TypeStr, " \t$dst, $a, $b;"), []>;
  }
}

defm SET_b16 : SET<"b16", Int16Regs, i16imm>;
defm SET_s16 : SET<"s16", Int16Regs, i16imm>;
defm SET_u16 : SET<"u16", Int16Regs, i16imm>;
defm SET_b32 : SET<"b32", Int32Regs, i32imm>;
defm SET_s32 : SET<"s32", Int32Regs, i32imm>;
defm SET_u32 : SET<"u32", Int32Regs, i32imm>;
defm SET_b64 : SET<"b64", Int64Regs, i64imm>;
defm SET_s64 : SET<"s64", Int64Regs, i64imm>;
defm SET_u64 : SET<"u64", Int64Regs, i64imm>;
defm SET_f16 : SET<"f16", Float16Regs, f16imm>;
defm SET_f32 : SET<"f32", Float32Regs, f32imm>;
defm SET_f64 : SET<"f64", Float64Regs, f64imm>;

//-----------------------------------
// Data Movement (Load / Store, Move)
//-----------------------------------

// Addressing-mode matchers; each names the C++ selector routine that
// implements it.
def ADDRri   : ComplexPattern<i32, 2, "SelectADDRri", [frameindex],
                              [SDNPWantRoot]>;
def ADDRri64 : ComplexPattern<i64, 2, "SelectADDRri64", [frameindex],
                              [SDNPWantRoot]>;
def ADDRvar  : ComplexPattern<iPTR, 1, "SelectDirectAddr", [], []>;

// Memory operands made of a register and an immediate sub-operand.
def MEMri : Operand<i32> {
  let PrintMethod = "printMemOperand";
  let MIOperandInfo = (ops Int32Regs, i32imm);
}
def MEMri64 : Operand<i64> {
  let PrintMethod = "printMemOperand";
  let MIOperandInfo = (ops Int64Regs, i64imm);
}

def imem : Operand<iPTR> {
  let PrintMethod = "printOperand";
}

def imemAny : Operand<iPTRAny> {
  let PrintMethod = "printOperand";
}

// i32 operands printed by dedicated printer methods.
def LdStCode : Operand<i32> {
  let PrintMethod = "printLdStCode";
}

def MmaCode : Operand<i32> {
  let PrintMethod = "printMmaCode";
}

def SDTWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
def Wrapper : SDNode<"NVPTXISD::Wrapper", SDTWrapper>;

// Load a memory address into a u32 or u64 register.
def MOV_ADDR :
  NVPTXInst<(outs Int32Regs:$dst), (ins imem:$a),
            "mov.u32 \t$dst, $a;",
            [(set Int32Regs:$dst, (Wrapper tglobaladdr:$a))]>;
def MOV_ADDR64 :
  NVPTXInst<(outs Int64Regs:$dst), (ins imem:$a),
            "mov.u64 \t$dst, $a;",
            [(set Int64Regs:$dst, (Wrapper tglobaladdr:$a))]>;

// Get pointer to local stack.
let hasSideEffects = false in {
  def MOV_DEPOT_ADDR :
    NVPTXInst<(outs Int32Regs:$d), (ins i32imm:$num),
              "mov.u32 \t$d, __local_depot$num;", []>;
  def MOV_DEPOT_ADDR_64 :
    NVPTXInst<(outs Int64Regs:$d), (ins i32imm:$num),
              "mov.u64 \t$d, __local_depot$num;", []>;
}


// copyPhysreg is hard-coded in NVPTXInstrInfo.cpp
let IsSimpleMove=1, hasSideEffects=0 in {
  def IMOV1rr :  NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$sss),
                           "mov.pred \t$dst, $sss;", []>;
  def IMOV16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$sss),
                           "mov.u16 \t$dst, $sss;", []>;
  def IMOV32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$sss),
                           "mov.u32 \t$dst, $sss;", []>;
  def IMOV64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$sss),
                           "mov.u64 \t$dst, $sss;", []>;

  def FMOV16rr : NVPTXInst<(outs Float16Regs:$dst), (ins Float16Regs:$src),
                           // We have to use .b16 here as there's no mov.f16.
                           "mov.b16 \t$dst, $src;", []>;
  def FMOV32rr : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
                           "mov.f32 \t$dst, $src;", []>;
  def FMOV64rr : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$src),
                           "mov.f64 \t$dst, $src;", []>;
}

// Immediate materialization: integer and floating-point constants are
// selected straight into "mov".
def IMOV1ri :  NVPTXInst<(outs Int1Regs:$dst), (ins i1imm:$src),
                         "mov.pred \t$dst, $src;",
                         [(set Int1Regs:$dst, imm:$src)]>;
def IMOV16ri : NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src),
                         "mov.u16 \t$dst, $src;",
                         [(set Int16Regs:$dst, imm:$src)]>;
def IMOV32ri : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src),
                         "mov.u32 \t$dst, $src;",
                         [(set Int32Regs:$dst, imm:$src)]>;
def IMOV64ri : NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src),
                         "mov.u64 \t$dst, $src;",
                         [(set Int64Regs:$dst, imm:$src)]>;

def FMOV32ri : NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$src),
                         "mov.f32 \t$dst, $src;",
                         [(set Float32Regs:$dst, fpimm:$src)]>;
def FMOV64ri : NVPTXInst<(outs Float64Regs:$dst), (ins f64imm:$src),
                         "mov.f64 \t$dst, $src;",
                         [(set Float64Regs:$dst, fpimm:$src)]>;

// A wrapped external symbol is materialized with the integer mov-immediate
// instructions.
def : Pat<(i32 (Wrapper texternalsym:$dst)), (IMOV32ri texternalsym:$dst)>;
def : Pat<(i64 (Wrapper texternalsym:$dst)), (IMOV64ri texternalsym:$dst)>;

//---- Copy Frame Index ----
def LEA_ADDRi :   NVPTXInst<(outs Int32Regs:$dst), (ins MEMri:$addr),
                            "add.u32 \t$dst, ${addr:add};",
                            [(set Int32Regs:$dst, ADDRri:$addr)]>;
def LEA_ADDRi64 : NVPTXInst<(outs Int64Regs:$dst), (ins MEMri64:$addr),
                            "add.u64 \t$dst, ${addr:add};",
                            [(set Int64Regs:$dst, ADDRri64:$addr)]>;

//-----------------------------------
// Comparison and Selection
//-----------------------------------

// Integer compare patterns for one condition code.  OpNode is the setcc
// fragment and Mode the CmpMode leaf handed to the selected instruction.
// The eighteen Instruction parameters are the setp (i1 result) and set (i32
// result) instructions for 16/32/64-bit operands in rr/ri/ir form.
multiclass ISET_FORMAT<PatFrag OpNode, PatLeaf Mode,
                       Instruction setp_16rr,
                       Instruction setp_16ri,
                       Instruction setp_16ir,
                       Instruction setp_32rr,
                       Instruction setp_32ri,
                       Instruction setp_32ir,
                       Instruction setp_64rr,
                       Instruction setp_64ri,
                       Instruction setp_64ir,
                       Instruction set_16rr,
                       Instruction set_16ri,
                       Instruction set_16ir,
                       Instruction set_32rr,
                       Instruction set_32ri,
                       Instruction set_32ir,
                       Instruction set_64rr,
                       Instruction set_64ri,
                       Instruction set_64ir> {
  // i16 -> pred
  def : Pat<(i1 (OpNode Int16Regs:$a, Int16Regs:$b)),
            (setp_16rr Int16Regs:$a, Int16Regs:$b, Mode)>;
  def : Pat<(i1 (OpNode Int16Regs:$a, imm:$b)),
            (setp_16ri Int16Regs:$a, imm:$b, Mode)>;
  def : Pat<(i1 (OpNode imm:$a, Int16Regs:$b)),
            (setp_16ir imm:$a, Int16Regs:$b, Mode)>;
  // i32 -> pred
  def : Pat<(i1 (OpNode Int32Regs:$a, Int32Regs:$b)),
            (setp_32rr Int32Regs:$a, Int32Regs:$b, Mode)>;
  def : Pat<(i1 (OpNode Int32Regs:$a, imm:$b)),
            (setp_32ri Int32Regs:$a, imm:$b, Mode)>;
  def : Pat<(i1 (OpNode imm:$a, Int32Regs:$b)),
            (setp_32ir imm:$a, Int32Regs:$b, Mode)>;
  // i64 -> pred
  def : Pat<(i1 (OpNode Int64Regs:$a, Int64Regs:$b)),
            (setp_64rr Int64Regs:$a, Int64Regs:$b, Mode)>;
  def : Pat<(i1 (OpNode Int64Regs:$a, imm:$b)),
            (setp_64ri Int64Regs:$a, imm:$b, Mode)>;
  def : Pat<(i1 (OpNode imm:$a, Int64Regs:$b)),
            (setp_64ir imm:$a, Int64Regs:$b, Mode)>;

  // i16 -> i32
  def : Pat<(i32 (OpNode Int16Regs:$a, Int16Regs:$b)),
            (set_16rr Int16Regs:$a, Int16Regs:$b, Mode)>;
  def : Pat<(i32 (OpNode Int16Regs:$a, imm:$b)),
            (set_16ri Int16Regs:$a, imm:$b, Mode)>;
  def : Pat<(i32 (OpNode imm:$a, Int16Regs:$b)),
            (set_16ir imm:$a, Int16Regs:$b, Mode)>;
  // i32 -> i32
  def : Pat<(i32 (OpNode Int32Regs:$a, Int32Regs:$b)),
            (set_32rr Int32Regs:$a, Int32Regs:$b, Mode)>;
  def : Pat<(i32 (OpNode Int32Regs:$a, imm:$b)),
            (set_32ri Int32Regs:$a, imm:$b, Mode)>;
  def : Pat<(i32 (OpNode imm:$a, Int32Regs:$b)),
            (set_32ir imm:$a, Int32Regs:$b, Mode)>;
  // i64 -> i32
  def : Pat<(i32 (OpNode Int64Regs:$a, Int64Regs:$b)),
            (set_64rr Int64Regs:$a, Int64Regs:$b, Mode)>;
  def : Pat<(i32 (OpNode Int64Regs:$a, imm:$b)),
            (set_64ri Int64Regs:$a, imm:$b, Mode)>;
  def : Pat<(i32 (OpNode imm:$a, Int64Regs:$b)),
            (set_64ir imm:$a, Int64Regs:$b, Mode)>;
}

// ISET_FORMAT bound to the signed (.sNN) compare instructions.
multiclass ISET_FORMAT_SIGNED<PatFrag OpNode, PatLeaf Mode>
  : ISET_FORMAT<OpNode, Mode,
                SETP_s16rr, SETP_s16ri, SETP_s16ir,
                SETP_s32rr, SETP_s32ri, SETP_s32ir,
                SETP_s64rr, SETP_s64ri, SETP_s64ir,
                SET_s16rr, SET_s16ri, SET_s16ir,
                SET_s32rr, SET_s32ri, SET_s32ir,
                SET_s64rr, SET_s64ri, SET_s64ir> {
  // TableGen doesn't like empty multiclasses.
  def : PatLeaf<(i32 0)>;
}

// ISET_FORMAT bound to the unsigned (.uNN) compare instructions.
multiclass ISET_FORMAT_UNSIGNED<PatFrag OpNode, PatLeaf Mode>
  : ISET_FORMAT<OpNode, Mode,
                SETP_u16rr, SETP_u16ri, SETP_u16ir,
                SETP_u32rr, SETP_u32ri, SETP_u32ir,
                SETP_u64rr, SETP_u64ri, SETP_u64ir,
                SET_u16rr, SET_u16ri, SET_u16ir,
                SET_u32rr, SET_u32ri, SET_u32ir,
                SET_u64rr, SET_u64ri, SET_u64ir> {
  // TableGen doesn't like empty multiclasses.
  def : PatLeaf<(i32 0)>;
}

defm : ISET_FORMAT_SIGNED<setgt, CmpGT>;
defm : ISET_FORMAT_SIGNED<setlt, CmpLT>;
defm : ISET_FORMAT_SIGNED<setge, CmpGE>;
defm : ISET_FORMAT_SIGNED<setle, CmpLE>;
defm : ISET_FORMAT_SIGNED<seteq, CmpEQ>;
defm : ISET_FORMAT_SIGNED<setne, CmpNE>;
defm : ISET_FORMAT_UNSIGNED<setugt, CmpGT>;
defm : ISET_FORMAT_UNSIGNED<setult, CmpLT>;
defm : ISET_FORMAT_UNSIGNED<setuge, CmpGE>;
defm : ISET_FORMAT_UNSIGNED<setule, CmpLE>;
defm : ISET_FORMAT_UNSIGNED<setueq, CmpEQ>;
defm : ISET_FORMAT_UNSIGNED<setune, CmpNE>;

// i1 compares
// Inequality of two predicates is their XOR; equality is the negated XOR.
def : Pat<(setne Int1Regs:$a, Int1Regs:$b),
          (XORb1rr Int1Regs:$a, Int1Regs:$b)>;
def : Pat<(setune Int1Regs:$a, Int1Regs:$b),
          (XORb1rr Int1Regs:$a, Int1Regs:$b)>;

def : Pat<(seteq Int1Regs:$a, Int1Regs:$b),
          (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
def : Pat<(setueq Int1Regs:$a, Int1Regs:$b),
          (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>;

// i1 compare -> i32
// The XOR predicate is true when the inputs differ, so setne selects
// (-1, 0) on it and seteq selects the swapped pair (0, -1).
def : Pat<(i32 (setne Int1Regs:$a, Int1Regs:$b)),
          (SELP_u32ii -1, 0, (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
def : Pat<(i32 (seteq Int1Regs:$a, Int1Regs:$b)),
          (SELP_u32ii 0, -1, (XORb1rr Int1Regs:$a, Int1Regs:$b))>;



// Floating-point compare patterns for one condition code.  Mode is the plain
// CmpMode leaf and ModeFTZ its flush-to-zero counterpart; the FTZ pattern of
// each pair is gated on doF32FTZ.  f16 patterns are additionally gated on
// useFP16Math and first load f16 immediates into a register with
// LOAD_CONST_F16, so they always use the register/register instruction form.
// The f64 patterns have no FTZ variant.
multiclass FSET_FORMAT<PatFrag OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {
  // f16 -> pred
  def : Pat<(i1 (OpNode (f16 Float16Regs:$a), (f16 Float16Regs:$b))),
            (SETP_f16rr Float16Regs:$a, Float16Regs:$b, ModeFTZ)>,
        Requires<[useFP16Math,doF32FTZ]>;
  def : Pat<(i1 (OpNode (f16 Float16Regs:$a), (f16 Float16Regs:$b))),
            (SETP_f16rr Float16Regs:$a, Float16Regs:$b, Mode)>,
        Requires<[useFP16Math]>;
  def : Pat<(i1 (OpNode (f16 Float16Regs:$a), fpimm:$b)),
            (SETP_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>,
        Requires<[useFP16Math,doF32FTZ]>;
  def : Pat<(i1 (OpNode (f16 Float16Regs:$a), fpimm:$b)),
            (SETP_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>,
        Requires<[useFP16Math]>;
  def : Pat<(i1 (OpNode fpimm:$a, (f16 Float16Regs:$b))),
            (SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, ModeFTZ)>,
        Requires<[useFP16Math,doF32FTZ]>;
  def : Pat<(i1 (OpNode fpimm:$a, (f16 Float16Regs:$b))),
            (SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, Mode)>,
        Requires<[useFP16Math]>;

  // f32 -> pred
  def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)),
            (SETP_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>,
        Requires<[doF32FTZ]>;
  def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)),
            (SETP_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>;
  def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)),
            (SETP_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>,
        Requires<[doF32FTZ]>;
  def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)),
            (SETP_f32ri Float32Regs:$a, fpimm:$b, Mode)>;
  def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)),
            (SETP_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>,
        Requires<[doF32FTZ]>;
  def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)),
            (SETP_f32ir fpimm:$a, Float32Regs:$b, Mode)>;

  // f64 -> pred
  def : Pat<(i1 (OpNode Float64Regs:$a, Float64Regs:$b)),
            (SETP_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>;
  def : Pat<(i1 (OpNode Float64Regs:$a, fpimm:$b)),
            (SETP_f64ri Float64Regs:$a, fpimm:$b, Mode)>;
  def : Pat<(i1 (OpNode fpimm:$a, Float64Regs:$b)),
            (SETP_f64ir fpimm:$a, Float64Regs:$b, Mode)>;

  // f16 -> i32
  def : Pat<(i32 (OpNode (f16 Float16Regs:$a), (f16 Float16Regs:$b))),
            (SET_f16rr Float16Regs:$a, Float16Regs:$b, ModeFTZ)>,
        Requires<[useFP16Math, doF32FTZ]>;
  def : Pat<(i32 (OpNode (f16 Float16Regs:$a), (f16 Float16Regs:$b))),
            (SET_f16rr Float16Regs:$a, Float16Regs:$b, Mode)>,
        Requires<[useFP16Math]>;
  def : Pat<(i32 (OpNode (f16 Float16Regs:$a), fpimm:$b)),
            (SET_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>,
        Requires<[useFP16Math, doF32FTZ]>;
  def : Pat<(i32 (OpNode (f16 Float16Regs:$a), fpimm:$b)),
            (SET_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>,
        Requires<[useFP16Math]>;
  // The immediate has been loaded into a register by LOAD_CONST_F16, so the
  // register/register form is the one whose operand kinds match.
  def : Pat<(i32 (OpNode fpimm:$a, (f16 Float16Regs:$b))),
            (SET_f16rr (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, ModeFTZ)>,
        Requires<[useFP16Math, doF32FTZ]>;
  def : Pat<(i32 (OpNode fpimm:$a, (f16 Float16Regs:$b))),
            (SET_f16rr (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, Mode)>,
        Requires<[useFP16Math]>;

  // f32 -> i32
  def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)),
            (SET_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>,
        Requires<[doF32FTZ]>;
  def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)),
            (SET_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>;
  def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)),
            (SET_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>,
        Requires<[doF32FTZ]>;
  def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)),
            (SET_f32ri Float32Regs:$a, fpimm:$b, Mode)>;
  def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)),
            (SET_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>,
        Requires<[doF32FTZ]>;
  def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)),
            (SET_f32ir fpimm:$a, Float32Regs:$b, Mode)>;

  // f64 -> i32
  def : Pat<(i32 (OpNode Float64Regs:$a, Float64Regs:$b)),
            (SET_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>;
  def : Pat<(i32 (OpNode Float64Regs:$a, fpimm:$b)),
            (SET_f64ri Float64Regs:$a, fpimm:$b, Mode)>;
  def : Pat<(i32 (OpNode fpimm:$a, Float64Regs:$b)),
            (SET_f64ir fpimm:$a, Float64Regs:$b, Mode)>;
}

defm FSetOGT : FSET_FORMAT<setogt, CmpGT, CmpGT_FTZ>;
defm FSetOLT : FSET_FORMAT<setolt, CmpLT, CmpLT_FTZ>;
defm FSetOGE : FSET_FORMAT<setoge, CmpGE, CmpGE_FTZ>;
defm FSetOLE : FSET_FORMAT<setole, CmpLE, CmpLE_FTZ>;
defm FSetOEQ : FSET_FORMAT<setoeq, CmpEQ, CmpEQ_FTZ>;
defm FSetONE : FSET_FORMAT<setone, CmpNE, CmpNE_FTZ>;

defm FSetUGT : FSET_FORMAT<setugt, CmpGTU, CmpGTU_FTZ>;
defm FSetULT : FSET_FORMAT<setult, CmpLTU, CmpLTU_FTZ>;
defm FSetUGE : FSET_FORMAT<setuge, CmpGEU, CmpGEU_FTZ>;
defm FSetULE : FSET_FORMAT<setule, CmpLEU, CmpLEU_FTZ>;
defm FSetUEQ : FSET_FORMAT<setueq, CmpEQU, CmpEQU_FTZ>;
defm FSetUNE : FSET_FORMAT<setune, CmpNEU, CmpNEU_FTZ>;

defm FSetGT : FSET_FORMAT<setgt, CmpGT, CmpGT_FTZ>;
defm FSetLT : FSET_FORMAT<setlt, CmpLT, CmpLT_FTZ>;
defm FSetGE : FSET_FORMAT<setge, CmpGE, CmpGE_FTZ>;
defm FSetLE : FSET_FORMAT<setle, CmpLE, CmpLE_FTZ>;
defm FSetEQ : FSET_FORMAT<seteq, CmpEQ, CmpEQ_FTZ>;
defm FSetNE : FSET_FORMAT<setne, CmpNE, CmpNE_FTZ>;

defm FSetNUM : FSET_FORMAT<seto, CmpNUM, CmpNUM_FTZ>;
defm FSetNAN : FSET_FORMAT<setuo, CmpNAN, CmpNAN_FTZ>;

// FIXME: What is this doing here?  Can it be deleted?
// def ld_param         : SDNode<"NVPTXISD::LOAD_PARAM", SDTLoad,
//                         [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;

// Type profiles for the call-sequence SDNodes declared below.  The first two
// SDTypeProfile arguments are the number of results and of operands.
def SDTDeclareParamProfile :
  SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>;
def SDTDeclareScalarParamProfile :
  SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>;
def SDTLoadParamProfile :
  SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>;
def SDTLoadParamV2Profile :
  SDTypeProfile<2, 2, [SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisInt<3>]>;
def SDTLoadParamV4Profile :
  SDTypeProfile<4, 2, [SDTCisInt<4>, SDTCisInt<5>]>;
def SDTPrintCallProfile :
  SDTypeProfile<0, 1, [SDTCisInt<0>]>;
def SDTPrintCallUniProfile :
  SDTypeProfile<0, 1, [SDTCisInt<0>]>;
def SDTStoreParamProfile :
  SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;
def SDTStoreParamV2Profile :
  SDTypeProfile<0, 4, [SDTCisInt<0>, SDTCisInt<1>]>;
def SDTStoreParamV4Profile :
  SDTypeProfile<0, 6, [SDTCisInt<0>, SDTCisInt<1>]>;
def SDTStoreParam32Profile :
  SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;
def SDTCallArgProfile :
  SDTypeProfile<0, 2, [SDTCisInt<0>]>;
def SDTCallArgMarkProfile :
  SDTypeProfile<0, 0, []>;
def SDTCallVoidProfile :
  SDTypeProfile<0, 1, []>;
def SDTCallValProfile :
  SDTypeProfile<1, 0, []>;
def SDTMoveParamProfile :
  SDTypeProfile<1, 1, []>;
def SDTStoreRetvalProfile :
  SDTypeProfile<0, 2, [SDTCisInt<0>]>;
def SDTStoreRetvalV2Profile :
  SDTypeProfile<0, 3, [SDTCisInt<0>]>;
def SDTStoreRetvalV4Profile :
  SDTypeProfile<0, 5, [SDTCisInt<0>]>;
def SDTPseudoUseParamProfile :
  SDTypeProfile<0, 1, []>;
def SDTProxyRegProfile :
  SDTypeProfile<1, 1, []>;

def DeclareParam :
  SDNode<"NVPTXISD::DeclareParam", SDTDeclareParamProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def DeclareScalarParam :
  SDNode<"NVPTXISD::DeclareScalarParam", SDTDeclareScalarParamProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def DeclareRetParam :
  SDNode<"NVPTXISD::DeclareRetParam", SDTDeclareParamProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def DeclareRet :
  SDNode<"NVPTXISD::DeclareRet", SDTDeclareScalarParamProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def LoadParam :
  SDNode<"NVPTXISD::LoadParam", SDTLoadParamProfile,
         [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
def LoadParamV2 :
  SDNode<"NVPTXISD::LoadParamV2", SDTLoadParamV2Profile,
         [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
def LoadParamV4 :
  SDNode<"NVPTXISD::LoadParamV4", SDTLoadParamV4Profile,
         [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
def PrintCall :
  SDNode<"NVPTXISD::PrintCall", SDTPrintCallProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def PrintConvergentCall :
  SDNode<"NVPTXISD::PrintConvergentCall", SDTPrintCallProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def PrintCallUni :
  SDNode<"NVPTXISD::PrintCallUni", SDTPrintCallUniProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def PrintConvergentCallUni :
  SDNode<"NVPTXISD::PrintConvergentCallUni", SDTPrintCallUniProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def StoreParam :
  SDNode<"NVPTXISD::StoreParam", SDTStoreParamProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def StoreParamV2 :
  SDNode<"NVPTXISD::StoreParamV2", SDTStoreParamV2Profile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def StoreParamV4 :
  SDNode<"NVPTXISD::StoreParamV4", SDTStoreParamV4Profile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def StoreParamU32 :
  SDNode<"NVPTXISD::StoreParamU32", SDTStoreParam32Profile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def StoreParamS32 :
  SDNode<"NVPTXISD::StoreParamS32", SDTStoreParam32Profile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def CallArgBegin :
  SDNode<"NVPTXISD::CallArgBegin", SDTCallArgMarkProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def CallArg :
  SDNode<"NVPTXISD::CallArg", SDTCallArgProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def LastCallArg :
  SDNode<"NVPTXISD::LastCallArg", SDTCallArgProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def CallArgEnd :
  SDNode<"NVPTXISD::CallArgEnd", SDTCallVoidProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def CallVoid :
  SDNode<"NVPTXISD::CallVoid", SDTCallVoidProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def Prototype :
  SDNode<"NVPTXISD::Prototype", SDTCallVoidProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def CallVal :
  SDNode<"NVPTXISD::CallVal", SDTCallValProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def MoveParam :
  SDNode<"NVPTXISD::MoveParam", SDTMoveParamProfile, []>;
def StoreRetval :
  SDNode<"NVPTXISD::StoreRetval", SDTStoreRetvalProfile,
         [SDNPHasChain, SDNPSideEffect]>;
def StoreRetvalV2 :
  SDNode<"NVPTXISD::StoreRetvalV2", SDTStoreRetvalV2Profile,
         [SDNPHasChain, SDNPSideEffect]>;
def StoreRetvalV4 :
  SDNode<"NVPTXISD::StoreRetvalV4", SDTStoreRetvalV4Profile,
         [SDNPHasChain, SDNPSideEffect]>;
def PseudoUseParam :
  SDNode<"NVPTXISD::PseudoUseParam", SDTPseudoUseParamProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def RETURNNode :
  SDNode<"NVPTXISD::RETURN", SDTCallArgMarkProfile,
         [SDNPHasChain, SDNPSideEffect]>;
def ProxyReg :
  SDNode<"NVPTXISD::ProxyReg", SDTProxyRegProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;

// Scalar, 2-element and 4-element "ld.param" from [retval0+$b].  opstr is the
// type suffix appended to the mnemonic.  None of these carries a pattern.
let mayLoad = true in {
  class LoadParamMemInst<NVPTXRegClass regclass, string opstr> :
    NVPTXInst<(outs regclass:$dst), (ins i32imm:$b),
              !strconcat("ld.param", opstr, " \t$dst, [retval0+$b];"),
              []>;

  class LoadParamV2MemInst<NVPTXRegClass regclass, string opstr> :
    NVPTXInst<(outs regclass:$dst, regclass:$dst2), (ins i32imm:$b),
              !strconcat("ld.param.v2", opstr,
                         " \t{{$dst, $dst2}}, [retval0+$b];"), []>;

  class LoadParamV4MemInst<NVPTXRegClass regclass, string opstr> :
    NVPTXInst<(outs regclass:$dst, regclass:$dst2, regclass:$dst3,
                    regclass:$dst4),
              (ins i32imm:$b),
              !strconcat("ld.param.v4", opstr,
                         " \t{{$dst, $dst2, $dst3, $dst4}}, [retval0+$b];"),
              []>;
}

// "mov" from the retval$b name; selected from LoadParam whose first operand
// is the constant 0.
class LoadParamRegInst<NVPTXRegClass regclass, string opstr> :
  NVPTXInst<(outs regclass:$dst), (ins i32imm:$b),
            !strconcat("mov", opstr, " \t$dst, retval$b;"),
            [(set regclass:$dst, (LoadParam (i32 0), (i32 imm:$b)))]>;

// "st.param" to [param$a+$b] (outgoing arguments) and to [func_retval0+$a]
// (return value), each in scalar, v2 and v4 form.
let mayStore = true in {
  class StoreParamInst<NVPTXRegClass regclass, string opstr> :
    NVPTXInst<(outs), (ins regclass:$val, i32imm:$a, i32imm:$b),
              !strconcat("st.param", opstr, " \t[param$a+$b], $val;"),
              []>;

  class StoreParamV2Inst<NVPTXRegClass regclass, string opstr> :
    NVPTXInst<(outs), (ins regclass:$val, regclass:$val2,
                           i32imm:$a, i32imm:$b),
              !strconcat("st.param.v2", opstr,
                         " \t[param$a+$b], {{$val, $val2}};"),
              []>;

  class StoreParamV4Inst<NVPTXRegClass regclass, string opstr> :
    NVPTXInst<(outs), (ins regclass:$val, regclass:$val2, regclass:$val3,
                           regclass:$val4, i32imm:$a,
                           i32imm:$b),
              !strconcat("st.param.v4", opstr,
                         " \t[param$a+$b], {{$val, $val2, $val3, $val4}};"),
              []>;

  class StoreRetvalInst<NVPTXRegClass regclass, string opstr> :
    NVPTXInst<(outs), (ins regclass:$val, i32imm:$a),
              !strconcat("st.param", opstr, " \t[func_retval0+$a], $val;"),
              []>;

  class StoreRetvalV2Inst<NVPTXRegClass regclass, string opstr> :
    NVPTXInst<(outs), (ins regclass:$val, regclass:$val2, i32imm:$a),
              !strconcat("st.param.v2", opstr,
                         " \t[func_retval0+$a], {{$val, $val2}};"),
              []>;

  class StoreRetvalV4Inst<NVPTXRegClass regclass, string opstr> :
    NVPTXInst<(outs),
              (ins regclass:$val, regclass:$val2, regclass:$val3,
                   regclass:$val4, i32imm:$a),
              !strconcat("st.param.v4", opstr,
                         " \t[func_retval0+$a], {{$val, $val2, $val3, $val4}};"),
              []>;
}

let isCall=1 in {
  // Prints the opening of a call statement: the mnemonic followed by the
  // parenthesised list of retval names.  The i32 operand of OpNode (0..8)
  // selects how many retvals are listed.
  multiclass CALL<string OpcStr, SDNode OpNode> {
    def PrintCallNoRetInst : NVPTXInst<(outs), (ins),
      !strconcat(OpcStr, " "), [(OpNode (i32 0))]>;
    def PrintCallRetInst1 : NVPTXInst<(outs), (ins),
      !strconcat(OpcStr, " (retval0), "), [(OpNode (i32 1))]>;
    def PrintCallRetInst2 : NVPTXInst<(outs), (ins),
      !strconcat(OpcStr, " (retval0, retval1), "), [(OpNode (i32 2))]>;
    def PrintCallRetInst3 : NVPTXInst<(outs), (ins),
      !strconcat(OpcStr, " (retval0, retval1, retval2), "), [(OpNode (i32 3))]>;
    def PrintCallRetInst4 : NVPTXInst<(outs), (ins),
      !strconcat(OpcStr, " (retval0, retval1, retval2, retval3), "),
      [(OpNode (i32 4))]>;
    def PrintCallRetInst5 : NVPTXInst<(outs), (ins),
      !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4), "),
      [(OpNode (i32 5))]>;
    def PrintCallRetInst6 : NVPTXInst<(outs), (ins),
      !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4, "
                         "retval5), "),
      [(OpNode (i32 6))]>;
    def PrintCallRetInst7 : NVPTXInst<(outs), (ins),
      !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4, "
                         "retval5, retval6), "),
      [(OpNode (i32 7))]>;
    def PrintCallRetInst8 : NVPTXInst<(outs), (ins),
      !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4, "
                         "retval5, retval6, retval7), "),
      [(OpNode (i32 8))]>;
  }
}

defm Call : CALL<"call", PrintCall>;
defm CallUni : CALL<"call.uni", PrintCallUni>;
// Convergent call instructions.  These are identical to regular calls, except
// they have the isConvergent bit set.
let isConvergent=1 in {
  defm ConvergentCall : CALL<"call", PrintConvergentCall>;
  defm ConvergentCallUni : CALL<"call.uni", PrintConvergentCallUni>;
}

// ld.param instantiations.  The I8 variants use 16-bit registers, and the
// f16 / f16x2 variants use the untyped .b16 / .b32 suffixes.
def LoadParamMemI64    : LoadParamMemInst<Int64Regs, ".b64">;
def LoadParamMemI32    : LoadParamMemInst<Int32Regs, ".b32">;
def LoadParamMemI16    : LoadParamMemInst<Int16Regs, ".b16">;
def LoadParamMemI8     : LoadParamMemInst<Int16Regs, ".b8">;
def LoadParamMemV2I64  : LoadParamV2MemInst<Int64Regs, ".b64">;
def LoadParamMemV2I32  : LoadParamV2MemInst<Int32Regs, ".b32">;
def LoadParamMemV2I16  : LoadParamV2MemInst<Int16Regs, ".b16">;
def LoadParamMemV2I8   : LoadParamV2MemInst<Int16Regs, ".b8">;
def LoadParamMemV4I32  : LoadParamV4MemInst<Int32Regs, ".b32">;
def LoadParamMemV4I16  : LoadParamV4MemInst<Int16Regs, ".b16">;
def LoadParamMemV4I8   : LoadParamV4MemInst<Int16Regs, ".b8">;
def LoadParamMemF16    : LoadParamMemInst<Float16Regs, ".b16">;
def LoadParamMemF16x2  : LoadParamMemInst<Float16x2Regs, ".b32">;
def LoadParamMemF32    : LoadParamMemInst<Float32Regs, ".f32">;
def LoadParamMemF64    : LoadParamMemInst<Float64Regs, ".f64">;
def LoadParamMemV2F16  : LoadParamV2MemInst<Float16Regs, ".b16">;
def LoadParamMemV2F16x2: LoadParamV2MemInst<Float16x2Regs, ".b32">;
def LoadParamMemV2F32  : LoadParamV2MemInst<Float32Regs, ".f32">;
def LoadParamMemV2F64  : LoadParamV2MemInst<Float64Regs, ".f64">;
def LoadParamMemV4F16  : LoadParamV4MemInst<Float16Regs, ".b16">;
def LoadParamMemV4F16x2: LoadParamV4MemInst<Float16x2Regs, ".b32">;
def LoadParamMemV4F32  : LoadParamV4MemInst<Float32Regs, ".f32">;

// st.param instantiations for outgoing arguments.
def StoreParamI64    : StoreParamInst<Int64Regs, ".b64">;
def StoreParamI32    : StoreParamInst<Int32Regs, ".b32">;

def StoreParamI16    : StoreParamInst<Int16Regs, ".b16">;
def StoreParamI8     : StoreParamInst<Int16Regs, ".b8">;
def StoreParamV2I64  : StoreParamV2Inst<Int64Regs, ".b64">;
def StoreParamV2I32  : StoreParamV2Inst<Int32Regs, ".b32">;
def StoreParamV2I16  : StoreParamV2Inst<Int16Regs, ".b16">;
def StoreParamV2I8   : StoreParamV2Inst<Int16Regs, ".b8">;

def StoreParamV4I32  : StoreParamV4Inst<Int32Regs, ".b32">;
def StoreParamV4I16  : StoreParamV4Inst<Int16Regs, ".b16">;
def StoreParamV4I8   : StoreParamV4Inst<Int16Regs, ".b8">;

def StoreParamF16      : StoreParamInst<Float16Regs, ".b16">;
def StoreParamF16x2    : StoreParamInst<Float16x2Regs, ".b32">;
def StoreParamF32      : StoreParamInst<Float32Regs, ".f32">;
def StoreParamF64      : StoreParamInst<Float64Regs, ".f64">;
def StoreParamV2F16    : StoreParamV2Inst<Float16Regs, ".b16">;
def StoreParamV2F16x2  : StoreParamV2Inst<Float16x2Regs, ".b32">;
def StoreParamV2F32    : StoreParamV2Inst<Float32Regs, ".f32">;
def StoreParamV2F64    : StoreParamV2Inst<Float64Regs, ".f64">;
def StoreParamV4F16    : StoreParamV4Inst<Float16Regs, ".b16">;
def StoreParamV4F16x2  : StoreParamV4Inst<Float16x2Regs, ".b32">;
def StoreParamV4F32    : StoreParamV4Inst<Float32Regs, ".f32">;

// st.param instantiations for the function return value.
def StoreRetvalI64    : StoreRetvalInst<Int64Regs, ".b64">;
def StoreRetvalI32    : StoreRetvalInst<Int32Regs, ".b32">;
def StoreRetvalI16    : StoreRetvalInst<Int16Regs, ".b16">;
def StoreRetvalI8     : StoreRetvalInst<Int16Regs, ".b8">;
def StoreRetvalV2I64  : StoreRetvalV2Inst<Int64Regs, ".b64">;
def StoreRetvalV2I32  : StoreRetvalV2Inst<Int32Regs, ".b32">;
def StoreRetvalV2I16  : StoreRetvalV2Inst<Int16Regs, ".b16">;
def StoreRetvalV2I8   : StoreRetvalV2Inst<Int16Regs, ".b8">;
def StoreRetvalV4I32  : StoreRetvalV4Inst<Int32Regs, ".b32">;
def StoreRetvalV4I16  : StoreRetvalV4Inst<Int16Regs, ".b16">;
def StoreRetvalV4I8   : StoreRetvalV4Inst<Int16Regs, ".b8">;

def StoreRetvalF64    : StoreRetvalInst<Float64Regs, ".f64">;
def StoreRetvalF32    : StoreRetvalInst<Float32Regs, ".f32">;
def StoreRetvalF16    : StoreRetvalInst<Float16Regs, ".b16">;
def StoreRetvalF16x2  : StoreRetvalInst<Float16x2Regs, ".b32">;
def StoreRetvalV2F64  : StoreRetvalV2Inst<Float64Regs, ".f64">;
def StoreRetvalV2F32  : StoreRetvalV2Inst<Float32Regs, ".f32">;
def StoreRetvalV2F16  : StoreRetvalV2Inst<Float16Regs, ".b16">;
def StoreRetvalV2F16x2: StoreRetvalV2Inst<Float16x2Regs, ".b32">;
def StoreRetvalV4F32  : StoreRetvalV4Inst<Float32Regs, ".f32">;
def StoreRetvalV4F16  : StoreRetvalV4Inst<Float16Regs, ".b16">;
def StoreRetvalV4F16x2: StoreRetvalV4Inst<Float16x2Regs, ".b32">;

// Punctuation pieces of a printed call: the opening "(", the closing ")"
// with or without a trailing ";" (chosen by the CallArgEnd operand), and
// "ret;".
def CallArgBeginInst : NVPTXInst<(outs), (ins), "(", [(CallArgBegin)]>;
def CallArgEndInst1  : NVPTXInst<(outs), (ins), ");", [(CallArgEnd (i32 1))]>;
def CallArgEndInst0  : NVPTXInst<(outs), (ins), ")", [(CallArgEnd (i32 0))]>;
def RETURNInst       : NVPTXInst<(outs), (ins), "ret;", [(RETURNNode)]>;

// A register call argument followed by ", ".
class CallArgInst<NVPTXRegClass regclass> :
  NVPTXInst<(outs), (ins regclass:$a), "$a, ",
            [(CallArg (i32 0), regclass:$a)]>;

// The final register call argument; no trailing separator.
class LastCallArgInst<NVPTXRegClass regclass> :
  NVPTXInst<(outs), (ins regclass:$a), "$a",
            [(LastCallArg (i32 0), regclass:$a)]>;

def CallArgI64 : CallArgInst<Int64Regs>;
def CallArgI32 : CallArgInst<Int32Regs>;
def CallArgI16 : CallArgInst<Int16Regs>;
def CallArgF64 : CallArgInst<Float64Regs>;
def CallArgF32 : CallArgInst<Float32Regs>;

def LastCallArgI64 : LastCallArgInst<Int64Regs>;
def LastCallArgI32 : LastCallArgInst<Int32Regs>;
def LastCallArgI16 : LastCallArgInst<Int16Regs>;
def LastCallArgF64 : LastCallArgInst<Float64Regs>;
def LastCallArgF32 : LastCallArgInst<Float32Regs>;

// Immediate call arguments (first CallArg operand 0) print the value itself;
// with first operand 1 the immediate is printed as the name "param<N>".
def CallArgI32imm : NVPTXInst<(outs), (ins i32imm:$a), "$a, ",
                              [(CallArg (i32 0), (i32 imm:$a))]>;
def LastCallArgI32imm : NVPTXInst<(outs), (ins i32imm:$a), "$a",
                                  [(LastCallArg (i32 0), (i32 imm:$a))]>;

def CallArgParam : NVPTXInst<(outs), (ins i32imm:$a), "param$a, ",
                             [(CallArg (i32 1), (i32 imm:$a))]>;
def LastCallArgParam : NVPTXInst<(outs), (ins i32imm:$a), "param$a",
                                 [(LastCallArg (i32 1), (i32 imm:$a))]>;

// Call target: a wrapped global address, or an address held in a 32-bit or
// 64-bit register.
def CallVoidInst :      NVPTXInst<(outs), (ins imem:$addr), "$addr, ",
                                  [(CallVoid (Wrapper tglobaladdr:$addr))]>;
def CallVoidInstReg :   NVPTXInst<(outs), (ins Int32Regs:$addr), "$addr, ",
                                  [(CallVoid Int32Regs:$addr)]>;
def CallVoidInstReg64 : NVPTXInst<(outs), (ins Int64Regs:$addr), "$addr, ",
                                  [(CallVoid Int64Regs:$addr)]>;
def PrototypeInst :     NVPTXInst<(outs), (ins i32imm:$val), ", prototype_$val;",
                                  [(Prototype (i32 imm:$val))]>;

// Declarations of the retval<N> names.  DeclareRet's first operand picks the
// .param (1) or .reg (2) form.
def DeclareRetMemInst :
  NVPTXInst<(outs), (ins i32imm:$align, i32imm:$size, i32imm:$num),
            ".param .align $align .b8 retval$num[$size];",
            [(DeclareRetParam (i32 imm:$align), (i32 imm:$size), (i32 imm:$num))]>;
def DeclareRetScalarInst :
  NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num),
            ".param .b$size retval$num;",
            [(DeclareRet (i32 1), (i32 imm:$size), (i32 imm:$num))]>;
def DeclareRetRegInst :
  NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num),
            ".reg .b$size retval$num;",
            [(DeclareRet (i32 2), (i32 imm:$size), (i32 imm:$num))]>;

// Declarations of the param<N> names.  DeclareScalarParam's last operand
// picks the .param (0) or .reg (1) form.
def DeclareParamInst :
  NVPTXInst<(outs), (ins i32imm:$align, i32imm:$a, i32imm:$size),
            ".param .align $align .b8 param$a[$size];",
            [(DeclareParam (i32 imm:$align), (i32 imm:$a), (i32 imm:$size))]>;
def DeclareScalarParamInst :
  NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size),
            ".param .b$size param$a;",
            [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 0))]>;
def DeclareScalarRegInst :
  NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size),
            ".reg .b$size param$a;",
            [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 1))]>;

// Register-to-register "mov<asmstr>" selected from MoveParam.
class MoveParamInst<ValueType T, NVPTXRegClass regclass, string asmstr> :
  NVPTXInst<(outs regclass:$dst), (ins regclass:$src),
            !strconcat("mov", asmstr, " \t$dst, $src;"),
            [(set (T regclass:$dst), (MoveParam (T regclass:$src)))]>;

// "mov<asmstr>" of an external symbol selected from MoveParam.
class MoveParamSymbolInst<NVPTXRegClass regclass, Operand srcty,
                          string asmstr> :
  NVPTXInst<(outs regclass:$dst), (ins srcty:$src),
            !strconcat("mov", asmstr, " \t$dst, $src;"),
            [(set regclass:$dst, (MoveParam texternalsym:$src))]>;

def MoveParamI64 : MoveParamInst<i64, Int64Regs, ".b64">;
def MoveParamI32 : MoveParamInst<i32, Int32Regs, ".b32">;

def MoveParamSymbolI64 : MoveParamSymbolInst<Int64Regs, i64imm, ".b64">;
def MoveParamSymbolI32 : MoveParamSymbolInst<Int32Regs, i32imm, ".b32">;

def MoveParamI16 :
  NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
            "cvt.u16.u32 \t$dst, $src;",
            [(set Int16Regs:$dst, (MoveParam Int16Regs:$src))]>;
def MoveParamF64 : MoveParamInst<f64, Float64Regs, ".f64">;
def MoveParamF32 : MoveParamInst<f32, Float32Regs, ".f32">;
// There is no mov.f16 in PTX, so the f16 move uses the untyped .b16 form,
// the same way FMOV16rr does.
def MoveParamF16 : MoveParamInst<f16, Float16Regs, ".b16">;

// Emits only an assembly comment naming $src; selected from PseudoUseParam.
class PseudoUseParamInst<NVPTXRegClass regclass> :
  NVPTXInst<(outs), (ins regclass:$src),
            "// Pseudo use of $src",
            [(PseudoUseParam regclass:$src)]>;

def PseudoUseParamI64 : PseudoUseParamInst<Int64Regs>;
def PseudoUseParamI32 : PseudoUseParamInst<Int32Regs>;
def PseudoUseParamI16 : PseudoUseParamInst<Int16Regs>;
def PseudoUseParamF64 : PseudoUseParamInst<Float64Regs>;
def PseudoUseParamF32 : PseudoUseParamInst<Float32Regs>;

// "mov.<SzStr>" between two registers of the same class, selected from
// ProxyReg.
class ProxyRegInst<string SzStr, ValueType T, NVPTXRegClass regclass> :
  NVPTXInst<(outs regclass:$dst), (ins regclass:$src),
            !strconcat("mov.", SzStr, " \t$dst, $src;"),
            [(set (T regclass:$dst), (ProxyReg (T regclass:$src)))]>;

let isCodeGenOnly=1, isPseudo=1 in {
  def ProxyRegI1    : ProxyRegInst<"pred", i1, Int1Regs>;
  def ProxyRegI16   : ProxyRegInst<"b16", i16, Int16Regs>;
  def ProxyRegI32   :
ProxyRegInst<"b32", i32, Int32Regs>; 2474 def ProxyRegI64 : ProxyRegInst<"b64", i64, Int64Regs>; 2475 def ProxyRegF16 : ProxyRegInst<"b16", f16, Float16Regs>; 2476 def ProxyRegBF16 : ProxyRegInst<"b16", bf16, Float16Regs>; 2477 def ProxyRegF32 : ProxyRegInst<"f32", f32, Float32Regs>; 2478 def ProxyRegF64 : ProxyRegInst<"f64", f64, Float64Regs>; 2479 def ProxyRegF16x2 : ProxyRegInst<"b32", v2f16, Float16x2Regs>; 2480 def ProxyRegBF16x2 : ProxyRegInst<"b32", v2bf16, Float16x2Regs>; 2481} 2482 2483// 2484// Load / Store Handling 2485// 2486multiclass LD<NVPTXRegClass regclass> { 2487 def _avar : NVPTXInst< 2488 (outs regclass:$dst), 2489 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 2490 i32imm:$fromWidth, imem:$addr), 2491 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2492 "\t$dst, [$addr];", []>; 2493 def _areg : NVPTXInst< 2494 (outs regclass:$dst), 2495 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 2496 i32imm:$fromWidth, Int32Regs:$addr), 2497 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2498 "\t$dst, [$addr];", []>; 2499 def _areg_64 : NVPTXInst< 2500 (outs regclass:$dst), 2501 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 2502 i32imm:$fromWidth, Int64Regs:$addr), 2503 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2504 "\t$dst, [$addr];", []>; 2505 def _ari : NVPTXInst< 2506 (outs regclass:$dst), 2507 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 2508 i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), 2509 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2510 "\t$dst, [$addr+$offset];", []>; 2511 def _ari_64 : NVPTXInst< 2512 (outs regclass:$dst), 2513 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, 2514 LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), 2515 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2516 "\t$dst, 
[$addr+$offset];", []>; 2517 def _asi : NVPTXInst< 2518 (outs regclass:$dst), 2519 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, 2520 LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset), 2521 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2522 "\t$dst, [$addr+$offset];", []>; 2523} 2524 2525let mayLoad=1, hasSideEffects=0 in { 2526 defm LD_i8 : LD<Int16Regs>; 2527 defm LD_i16 : LD<Int16Regs>; 2528 defm LD_i32 : LD<Int32Regs>; 2529 defm LD_i64 : LD<Int64Regs>; 2530 defm LD_f16 : LD<Float16Regs>; 2531 defm LD_f16x2 : LD<Float16x2Regs>; 2532 defm LD_f32 : LD<Float32Regs>; 2533 defm LD_f64 : LD<Float64Regs>; 2534} 2535 2536multiclass ST<NVPTXRegClass regclass> { 2537 def _avar : NVPTXInst< 2538 (outs), 2539 (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, 2540 LdStCode:$Sign, i32imm:$toWidth, imem:$addr), 2541 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" 2542 " \t[$addr], $src;", []>; 2543 def _areg : NVPTXInst< 2544 (outs), 2545 (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, 2546 LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr), 2547 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" 2548 " \t[$addr], $src;", []>; 2549 def _areg_64 : NVPTXInst< 2550 (outs), 2551 (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, 2552 LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr), 2553 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" 2554 " \t[$addr], $src;", []>; 2555 def _ari : NVPTXInst< 2556 (outs), 2557 (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, 2558 LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr, i32imm:$offset), 2559 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" 2560 " \t[$addr+$offset], $src;", []>; 2561 def _ari_64 : NVPTXInst< 2562 (outs), 2563 (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, 2564 LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr, 
i32imm:$offset), 2565 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" 2566 " \t[$addr+$offset], $src;", []>; 2567 def _asi : NVPTXInst< 2568 (outs), 2569 (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, 2570 LdStCode:$Sign, i32imm:$toWidth, imem:$addr, i32imm:$offset), 2571 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" 2572 " \t[$addr+$offset], $src;", []>; 2573} 2574 2575let mayStore=1, hasSideEffects=0 in { 2576 defm ST_i8 : ST<Int16Regs>; 2577 defm ST_i16 : ST<Int16Regs>; 2578 defm ST_i32 : ST<Int32Regs>; 2579 defm ST_i64 : ST<Int64Regs>; 2580 defm ST_f16 : ST<Float16Regs>; 2581 defm ST_f16x2 : ST<Float16x2Regs>; 2582 defm ST_f32 : ST<Float32Regs>; 2583 defm ST_f64 : ST<Float64Regs>; 2584} 2585 2586// The following is used only in and after vector elementizations. Vector 2587// elementization happens at the machine instruction level, so the following 2588// instructions never appear in the DAG. 2589multiclass LD_VEC<NVPTXRegClass regclass> { 2590 def _v2_avar : NVPTXInst< 2591 (outs regclass:$dst1, regclass:$dst2), 2592 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 2593 i32imm:$fromWidth, imem:$addr), 2594 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2595 "\t{{$dst1, $dst2}}, [$addr];", []>; 2596 def _v2_areg : NVPTXInst< 2597 (outs regclass:$dst1, regclass:$dst2), 2598 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 2599 i32imm:$fromWidth, Int32Regs:$addr), 2600 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2601 "\t{{$dst1, $dst2}}, [$addr];", []>; 2602 def _v2_areg_64 : NVPTXInst< 2603 (outs regclass:$dst1, regclass:$dst2), 2604 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 2605 i32imm:$fromWidth, Int64Regs:$addr), 2606 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2607 "\t{{$dst1, $dst2}}, [$addr];", []>; 2608 def _v2_ari : NVPTXInst< 2609 (outs 
regclass:$dst1, regclass:$dst2), 2610 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 2611 i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), 2612 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2613 "\t{{$dst1, $dst2}}, [$addr+$offset];", []>; 2614 def _v2_ari_64 : NVPTXInst< 2615 (outs regclass:$dst1, regclass:$dst2), 2616 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 2617 i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), 2618 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2619 "\t{{$dst1, $dst2}}, [$addr+$offset];", []>; 2620 def _v2_asi : NVPTXInst< 2621 (outs regclass:$dst1, regclass:$dst2), 2622 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 2623 i32imm:$fromWidth, imem:$addr, i32imm:$offset), 2624 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2625 "\t{{$dst1, $dst2}}, [$addr+$offset];", []>; 2626 def _v4_avar : NVPTXInst< 2627 (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), 2628 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 2629 i32imm:$fromWidth, imem:$addr), 2630 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2631 "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>; 2632 def _v4_areg : NVPTXInst< 2633 (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), 2634 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 2635 i32imm:$fromWidth, Int32Regs:$addr), 2636 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2637 "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>; 2638 def _v4_areg_64 : NVPTXInst< 2639 (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), 2640 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 2641 i32imm:$fromWidth, Int64Regs:$addr), 2642 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2643 "\t{{$dst1, $dst2, $dst3, $dst4}}, 
[$addr];", []>; 2644 def _v4_ari : NVPTXInst< 2645 (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), 2646 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 2647 i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), 2648 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2649 "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>; 2650 def _v4_ari_64 : NVPTXInst< 2651 (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), 2652 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 2653 i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), 2654 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2655 "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>; 2656 def _v4_asi : NVPTXInst< 2657 (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), 2658 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 2659 i32imm:$fromWidth, imem:$addr, i32imm:$offset), 2660 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2661 "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>; 2662} 2663let mayLoad=1, hasSideEffects=0 in { 2664 defm LDV_i8 : LD_VEC<Int16Regs>; 2665 defm LDV_i16 : LD_VEC<Int16Regs>; 2666 defm LDV_i32 : LD_VEC<Int32Regs>; 2667 defm LDV_i64 : LD_VEC<Int64Regs>; 2668 defm LDV_f16 : LD_VEC<Float16Regs>; 2669 defm LDV_f16x2 : LD_VEC<Float16x2Regs>; 2670 defm LDV_f32 : LD_VEC<Float32Regs>; 2671 defm LDV_f64 : LD_VEC<Float64Regs>; 2672} 2673 2674multiclass ST_VEC<NVPTXRegClass regclass> { 2675 def _v2_avar : NVPTXInst< 2676 (outs), 2677 (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, 2678 LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr), 2679 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2680 "\t[$addr], {{$src1, $src2}};", []>; 2681 def _v2_areg : NVPTXInst< 2682 (outs), 2683 (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, 
LdStCode:$addsp, 2684 LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr), 2685 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2686 "\t[$addr], {{$src1, $src2}};", []>; 2687 def _v2_areg_64 : NVPTXInst< 2688 (outs), 2689 (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, 2690 LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr), 2691 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2692 "\t[$addr], {{$src1, $src2}};", []>; 2693 def _v2_ari : NVPTXInst< 2694 (outs), 2695 (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, 2696 LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, 2697 i32imm:$offset), 2698 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2699 "\t[$addr+$offset], {{$src1, $src2}};", []>; 2700 def _v2_ari_64 : NVPTXInst< 2701 (outs), 2702 (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, 2703 LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, 2704 i32imm:$offset), 2705 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2706 "\t[$addr+$offset], {{$src1, $src2}};", []>; 2707 def _v2_asi : NVPTXInst< 2708 (outs), 2709 (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, 2710 LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, 2711 i32imm:$offset), 2712 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2713 "\t[$addr+$offset], {{$src1, $src2}};", []>; 2714 def _v4_avar : NVPTXInst< 2715 (outs), 2716 (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, 2717 LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 2718 i32imm:$fromWidth, imem:$addr), 2719 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2720 "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>; 2721 def _v4_areg : NVPTXInst< 2722 (outs), 2723 (ins regclass:$src1, regclass:$src2, regclass:$src3, 
regclass:$src4, 2724 LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 2725 i32imm:$fromWidth, Int32Regs:$addr), 2726 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2727 "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>; 2728 def _v4_areg_64 : NVPTXInst< 2729 (outs), 2730 (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, 2731 LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 2732 i32imm:$fromWidth, Int64Regs:$addr), 2733 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2734 "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>; 2735 def _v4_ari : NVPTXInst< 2736 (outs), 2737 (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, 2738 LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 2739 i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), 2740 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2741 "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>; 2742 def _v4_ari_64 : NVPTXInst< 2743 (outs), 2744 (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, 2745 LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 2746 i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), 2747 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2748 "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>; 2749 def _v4_asi : NVPTXInst< 2750 (outs), 2751 (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, 2752 LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 2753 i32imm:$fromWidth, imem:$addr, i32imm:$offset), 2754 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}" 2755 "$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>; 2756} 2757 2758let mayStore=1, hasSideEffects=0 in { 2759 defm STV_i8 : ST_VEC<Int16Regs>; 2760 defm STV_i16 : ST_VEC<Int16Regs>; 2761 defm STV_i32 : ST_VEC<Int32Regs>; 2762 defm STV_i64 : ST_VEC<Int64Regs>; 2763 defm STV_f16 
: ST_VEC<Float16Regs>; 2764 defm STV_f16x2 : ST_VEC<Float16x2Regs>; 2765 defm STV_f32 : ST_VEC<Float32Regs>; 2766 defm STV_f64 : ST_VEC<Float64Regs>; 2767} 2768 2769//---- Conversion ---- 2770 2771class F_BITCONVERT<string SzStr, ValueType TIn, ValueType TOut, 2772 NVPTXRegClass regclassIn = ValueToRegClass<TIn>.ret, 2773 NVPTXRegClass regclassOut = ValueToRegClass<TOut>.ret> : 2774 NVPTXInst<(outs regclassOut:$d), (ins regclassIn:$a), 2775 !strconcat("mov.b", SzStr, " \t$d, $a;"), 2776 [(set (TOut regclassOut:$d), (bitconvert (TIn regclassIn:$a)))]>; 2777 2778def BITCONVERT_16_I2F : F_BITCONVERT<"16", i16, f16>; 2779def BITCONVERT_16_F2I : F_BITCONVERT<"16", f16, i16>; 2780def BITCONVERT_16_I2BF : F_BITCONVERT<"16", i16, bf16>; 2781def BITCONVERT_16_BF2I : F_BITCONVERT<"16", bf16, i16>; 2782def BITCONVERT_32_I2F : F_BITCONVERT<"32", i32, f32>; 2783def BITCONVERT_32_F2I : F_BITCONVERT<"32", f32, i32>; 2784def BITCONVERT_64_I2F : F_BITCONVERT<"64", i64, f64>; 2785def BITCONVERT_64_F2I : F_BITCONVERT<"64", f64, i64>; 2786def BITCONVERT_32_I2F16x2 : F_BITCONVERT<"32", i32, v2f16>; 2787def BITCONVERT_32_F16x22I : F_BITCONVERT<"32", v2f16, i32>; 2788def BITCONVERT_32_F2F16x2 : F_BITCONVERT<"32", f32, v2f16>; 2789def BITCONVERT_32_F16x22F : F_BITCONVERT<"32", v2f16, f32>; 2790def BITCONVERT_32_I2BF16x2 : F_BITCONVERT<"32", i32, v2bf16>; 2791def BITCONVERT_32_BF16x22I : F_BITCONVERT<"32", v2bf16, i32>; 2792def BITCONVERT_32_F2BF16x2 : F_BITCONVERT<"32", f32, v2bf16>; 2793def BITCONVERT_32_BF16x22F : F_BITCONVERT<"32", v2bf16, f32>; 2794 2795// NOTE: pred->fp are currently sub-optimal due to an issue in TableGen where 2796// we cannot specify floating-point literals in isel patterns. Therefore, we 2797// use an integer selp to select either 1 or 0 and then cvt to floating-point. 

// Integer <-> floating-point conversion patterns.
//
// An i1 source has no direct cvt form, so the predicate is first turned into
// an i32 with a select and that integer is converted.  For the *signed*
// conversions a true i1 has the value -1, so the select must produce -1 / 0
// (SELP_s32ii -1, 0), giving -1.0 for true.  The unsigned conversions treat
// true as 1 and keep SELP_u32ii 1, 0.

// sint -> f16
def : Pat<(f16 (sint_to_fp Int1Regs:$a)),
          (CVT_f16_s32 (SELP_s32ii -1, 0, Int1Regs:$a), CvtRN)>;
def : Pat<(f16 (sint_to_fp Int16Regs:$a)),
          (CVT_f16_s16 Int16Regs:$a, CvtRN)>;
def : Pat<(f16 (sint_to_fp Int32Regs:$a)),
          (CVT_f16_s32 Int32Regs:$a, CvtRN)>;
def : Pat<(f16 (sint_to_fp Int64Regs:$a)),
          (CVT_f16_s64 Int64Regs:$a, CvtRN)>;

// uint -> f16
def : Pat<(f16 (uint_to_fp Int1Regs:$a)),
          (CVT_f16_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
def : Pat<(f16 (uint_to_fp Int16Regs:$a)),
          (CVT_f16_u16 Int16Regs:$a, CvtRN)>;
def : Pat<(f16 (uint_to_fp Int32Regs:$a)),
          (CVT_f16_u32 Int32Regs:$a, CvtRN)>;
def : Pat<(f16 (uint_to_fp Int64Regs:$a)),
          (CVT_f16_u64 Int64Regs:$a, CvtRN)>;

// sint -> f32
def : Pat<(f32 (sint_to_fp Int1Regs:$a)),
          (CVT_f32_s32 (SELP_s32ii -1, 0, Int1Regs:$a), CvtRN)>;
def : Pat<(f32 (sint_to_fp Int16Regs:$a)),
          (CVT_f32_s16 Int16Regs:$a, CvtRN)>;
def : Pat<(f32 (sint_to_fp Int32Regs:$a)),
          (CVT_f32_s32 Int32Regs:$a, CvtRN)>;
def : Pat<(f32 (sint_to_fp Int64Regs:$a)),
          (CVT_f32_s64 Int64Regs:$a, CvtRN)>;

// uint -> f32
def : Pat<(f32 (uint_to_fp Int1Regs:$a)),
          (CVT_f32_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
def : Pat<(f32 (uint_to_fp Int16Regs:$a)),
          (CVT_f32_u16 Int16Regs:$a, CvtRN)>;
def : Pat<(f32 (uint_to_fp Int32Regs:$a)),
          (CVT_f32_u32 Int32Regs:$a, CvtRN)>;
def : Pat<(f32 (uint_to_fp Int64Regs:$a)),
          (CVT_f32_u64 Int64Regs:$a, CvtRN)>;

// sint -> f64
def : Pat<(f64 (sint_to_fp Int1Regs:$a)),
          (CVT_f64_s32 (SELP_s32ii -1, 0, Int1Regs:$a), CvtRN)>;
def : Pat<(f64 (sint_to_fp Int16Regs:$a)),
          (CVT_f64_s16 Int16Regs:$a, CvtRN)>;
def : Pat<(f64 (sint_to_fp Int32Regs:$a)),
          (CVT_f64_s32 Int32Regs:$a, CvtRN)>;
def : Pat<(f64 (sint_to_fp Int64Regs:$a)),
          (CVT_f64_s64 Int64Regs:$a, CvtRN)>;

// uint -> f64
def : Pat<(f64 (uint_to_fp Int1Regs:$a)),
          (CVT_f64_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
def : Pat<(f64 (uint_to_fp Int16Regs:$a)),
          (CVT_f64_u16 Int16Regs:$a, CvtRN)>;
def : Pat<(f64 (uint_to_fp Int32Regs:$a)),
          (CVT_f64_u32 Int32Regs:$a, CvtRN)>;
def : Pat<(f64 (uint_to_fp Int64Regs:$a)),
          (CVT_f64_u64 Int64Regs:$a, CvtRN)>;


// Floating-point -> integer conversions use the CvtRZI mode (CvtRZI_FTZ for
// f32 sources when doF32FTZ holds; the predicated pattern is listed first).
//
// NOTE(review): every fp -> i1 pattern below compares the raw bit pattern of
// the source against 0 with CmpEQ, which looks as if it yields true exactly
// for +0.0 -- confirm that this is the intended i1 result.

// f16 -> sint
def : Pat<(i1 (fp_to_sint (f16 Float16Regs:$a))),
          (SETP_b16ri (BITCONVERT_16_F2I Float16Regs:$a), 0, CmpEQ)>;
def : Pat<(i16 (fp_to_sint (f16 Float16Regs:$a))),
          (CVT_s16_f16 (f16 Float16Regs:$a), CvtRZI)>;
def : Pat<(i32 (fp_to_sint (f16 Float16Regs:$a))),
          (CVT_s32_f16 (f16 Float16Regs:$a), CvtRZI)>;
def : Pat<(i64 (fp_to_sint (f16 Float16Regs:$a))),
          (CVT_s64_f16 Float16Regs:$a, CvtRZI)>;

// f16 -> uint
def : Pat<(i1 (fp_to_uint (f16 Float16Regs:$a))),
          (SETP_b16ri (BITCONVERT_16_F2I Float16Regs:$a), 0, CmpEQ)>;
def : Pat<(i16 (fp_to_uint (f16 Float16Regs:$a))),
          (CVT_u16_f16 Float16Regs:$a, CvtRZI)>;
def : Pat<(i32 (fp_to_uint (f16 Float16Regs:$a))),
          (CVT_u32_f16 Float16Regs:$a, CvtRZI)>;
def : Pat<(i64 (fp_to_uint (f16 Float16Regs:$a))),
          (CVT_u64_f16 Float16Regs:$a, CvtRZI)>;

// f32 -> sint
def : Pat<(i1 (fp_to_sint Float32Regs:$a)),
          (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>;
def : Pat<(i16 (fp_to_sint Float32Regs:$a)),
          (CVT_s16_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(i16 (fp_to_sint Float32Regs:$a)),
          (CVT_s16_f32 Float32Regs:$a, CvtRZI)>;
def : Pat<(i32 (fp_to_sint Float32Regs:$a)),
          (CVT_s32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(i32 (fp_to_sint Float32Regs:$a)),
          (CVT_s32_f32 Float32Regs:$a, CvtRZI)>;
def : Pat<(i64 (fp_to_sint Float32Regs:$a)),
          (CVT_s64_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(i64 (fp_to_sint Float32Regs:$a)),
          (CVT_s64_f32 Float32Regs:$a, CvtRZI)>;

// f32 -> uint
def : Pat<(i1 (fp_to_uint Float32Regs:$a)),
          (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>;
def : Pat<(i16 (fp_to_uint Float32Regs:$a)),
          (CVT_u16_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(i16 (fp_to_uint Float32Regs:$a)),
          (CVT_u16_f32 Float32Regs:$a, CvtRZI)>;
def : Pat<(i32 (fp_to_uint Float32Regs:$a)),
          (CVT_u32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(i32 (fp_to_uint Float32Regs:$a)),
          (CVT_u32_f32 Float32Regs:$a, CvtRZI)>;
def : Pat<(i64 (fp_to_uint Float32Regs:$a)),
          (CVT_u64_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(i64 (fp_to_uint Float32Regs:$a)),
          (CVT_u64_f32 Float32Regs:$a, CvtRZI)>;

// f64 -> sint
def : Pat<(i1 (fp_to_sint Float64Regs:$a)),
          (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>;
def : Pat<(i16 (fp_to_sint Float64Regs:$a)),
          (CVT_s16_f64 Float64Regs:$a, CvtRZI)>;
def : Pat<(i32 (fp_to_sint Float64Regs:$a)),
          (CVT_s32_f64 Float64Regs:$a, CvtRZI)>;
def : Pat<(i64 (fp_to_sint Float64Regs:$a)),
          (CVT_s64_f64 Float64Regs:$a, CvtRZI)>;

// f64 -> uint
def : Pat<(i1 (fp_to_uint Float64Regs:$a)),
          (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>;
def : Pat<(i16 (fp_to_uint Float64Regs:$a)),
          (CVT_u16_f64 Float64Regs:$a, CvtRZI)>;
def : Pat<(i32 (fp_to_uint Float64Regs:$a)),
          (CVT_u32_f64 Float64Regs:$a, CvtRZI)>;
def : Pat<(i64 (fp_to_uint Float64Regs:$a)),
          (CVT_u64_f64 Float64Regs:$a, CvtRZI)>;

// sext i1: a true predicate becomes all-ones (-1), false becomes 0.
def : Pat<(i16 (sext Int1Regs:$a)),
          (SELP_s16ii -1, 0, Int1Regs:$a)>;
def : Pat<(i32 (sext Int1Regs:$a)),
          (SELP_s32ii -1, 0, Int1Regs:$a)>;
def : Pat<(i64 (sext Int1Regs:$a)),
          (SELP_s64ii -1, 0, Int1Regs:$a)>;

// zext i1: a true predicate becomes 1, false becomes 0.
def : Pat<(i16 (zext Int1Regs:$a)),
          (SELP_u16ii 1, 0, Int1Regs:$a)>;
def : Pat<(i32 (zext Int1Regs:$a)),
          (SELP_u32ii 1, 0, Int1Regs:$a)>;
def : Pat<(i64 (zext Int1Regs:$a)),
          (SELP_u64ii 1, 0, Int1Regs:$a)>;

// anyext i1: selects -1 for a true predicate and 0 for false.
def : Pat<(i16 (anyext Int1Regs:$a)),
          (SELP_u16ii -1, 0, Int1Regs:$a)>;
def : Pat<(i32 (anyext Int1Regs:$a)),
          (SELP_u32ii -1, 0, Int1Regs:$a)>;
def : Pat<(i64 (anyext Int1Regs:$a)),
          (SELP_u64ii -1, 0, Int1Regs:$a)>;

// sext i16
def : Pat<(i32 (sext Int16Regs:$a)),
          (CVT_s32_s16 Int16Regs:$a, CvtNONE)>;
def : Pat<(i64 (sext Int16Regs:$a)),
          (CVT_s64_s16 Int16Regs:$a, CvtNONE)>;

// zext i16
def : Pat<(i32 (zext Int16Regs:$a)),
          (CVT_u32_u16 Int16Regs:$a, CvtNONE)>;
def : Pat<(i64 (zext Int16Regs:$a)),
          (CVT_u64_u16 Int16Regs:$a, CvtNONE)>;

// anyext i16 (selected exactly like zext i16)
def : Pat<(i32 (anyext Int16Regs:$a)),
          (CVT_u32_u16 Int16Regs:$a, CvtNONE)>;
def : Pat<(i64 (anyext Int16Regs:$a)),
          (CVT_u64_u16 Int16Regs:$a, CvtNONE)>;

// sext i32
def : Pat<(i64 (sext Int32Regs:$a)),
          (CVT_s64_s32 Int32Regs:$a, CvtNONE)>;

// zext i32
def : Pat<(i64 (zext Int32Regs:$a)),
          (CVT_u64_u32 Int32Regs:$a, CvtNONE)>;

// anyext i32 (selected exactly like zext i32)
def : Pat<(i64 (anyext Int32Regs:$a)),
          (CVT_u64_u32 Int32Regs:$a, CvtNONE)>;


// Truncation to i16/i32 is an unsigned cvt.  Truncation to i1 masks the low
// bit with an AND and compares the result against 1.

// truncate i64
def : Pat<(i32 (trunc Int64Regs:$a)),
          (CVT_u32_u64 Int64Regs:$a, CvtNONE)>;
def : Pat<(i16 (trunc Int64Regs:$a)),
          (CVT_u16_u64 Int64Regs:$a, CvtNONE)>;
def : Pat<(i1 (trunc Int64Regs:$a)),
          (SETP_b64ri (ANDb64ri Int64Regs:$a, 1), 1, CmpEQ)>;

// truncate i32
def : Pat<(i16 (trunc Int32Regs:$a)),
          (CVT_u16_u32 Int32Regs:$a, CvtNONE)>;
def : Pat<(i1 (trunc Int32Regs:$a)),
          (SETP_b32ri (ANDb32ri Int32Regs:$a, 1), 1, CmpEQ)>;

// truncate i16
def : Pat<(i1 (trunc Int16Regs:$a)),
          (SETP_b16ri (ANDb16ri Int16Regs:$a, 1), 1, CmpEQ)>;

// sext_inreg
def : Pat<(sext_inreg Int16Regs:$a, i8), (CVT_INREG_s16_s8 Int16Regs:$a)>;
def : Pat<(sext_inreg Int32Regs:$a, i8), (CVT_INREG_s32_s8 Int32Regs:$a)>;
def : Pat<(sext_inreg Int32Regs:$a, i16), (CVT_INREG_s32_s16 Int32Regs:$a)>;
def : Pat<(sext_inreg Int64Regs:$a, i8), (CVT_INREG_s64_s8 Int64Regs:$a)>;
def : Pat<(sext_inreg Int64Regs:$a, i16), (CVT_INREG_s64_s16 Int64Regs:$a)>;
def : Pat<(sext_inreg Int64Regs:$a, i32), (CVT_INREG_s64_s32 Int64Regs:$a)>;


// Select instructions with 32-bit predicates
// The i32 condition is reduced to a predicate by testing its low bit
// ((pred & 1) == 1) and then fed to the matching SELP.
def : Pat<(select Int32Regs:$pred, Int16Regs:$a, Int16Regs:$b),
          (SELP_b16rr Int16Regs:$a, Int16Regs:$b,
          (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
def : Pat<(select Int32Regs:$pred, Int32Regs:$a, Int32Regs:$b),
          (SELP_b32rr Int32Regs:$a, Int32Regs:$b,
          (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
def : Pat<(select Int32Regs:$pred, Int64Regs:$a, Int64Regs:$b),
          (SELP_b64rr Int64Regs:$a, Int64Regs:$b,
          (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
def : Pat<(select Int32Regs:$pred, (f16 Float16Regs:$a), (f16 Float16Regs:$b)),
          (SELP_f16rr Float16Regs:$a, Float16Regs:$b,
          (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
def : Pat<(select Int32Regs:$pred, Float32Regs:$a, Float32Regs:$b),
          (SELP_f32rr Float32Regs:$a, Float32Regs:$b,
          (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
def : Pat<(select Int32Regs:$pred, Float64Regs:$a, Float64Regs:$b),
          (SELP_f64rr Float64Regs:$a, Float64Regs:$b,
          (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;


// Pack/unpack moves.  None of these carries a selection pattern.
let hasSideEffects = false in {
  // pack a set of smaller int registers to a larger int register
  def V4I16toI64 : NVPTXInst<(outs Int64Regs:$d),
                             (ins Int16Regs:$s1, Int16Regs:$s2,
                                  Int16Regs:$s3, Int16Regs:$s4),
                             "mov.b64 \t$d, {{$s1, $s2, $s3, $s4}};", []>;
  def V2I16toI32 : NVPTXInst<(outs Int32Regs:$d),
                             (ins Int16Regs:$s1, Int16Regs:$s2),
                             "mov.b32 \t$d, {{$s1, $s2}};", []>;
  def V2I32toI64 : NVPTXInst<(outs Int64Regs:$d),
                             (ins Int32Regs:$s1, Int32Regs:$s2),
                             "mov.b64 \t$d, {{$s1, $s2}};", []>;
  def V2F32toF64 : NVPTXInst<(outs Float64Regs:$d),
                             (ins Float32Regs:$s1, Float32Regs:$s2),
                             "mov.b64 \t$d, {{$s1, $s2}};", []>;

  // unpack a larger int register to a set of smaller int registers
  def I64toV4I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2,
                                   Int16Regs:$d3, Int16Regs:$d4),
                             (ins Int64Regs:$s),
                             "mov.b64 \t{{$d1, $d2, $d3, $d4}}, $s;", []>;
  def I32toV2I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2),
                             (ins Int32Regs:$s),
                             "mov.b32 \t{{$d1, $d2}}, $s;", []>;
  def I64toV2I32 : NVPTXInst<(outs Int32Regs:$d1, Int32Regs:$d2),
                             (ins Int64Regs:$s),
                             "mov.b64 \t{{$d1, $d2}}, $s;", []>;
  def F64toV2F32 : NVPTXInst<(outs Float32Regs:$d1, Float32Regs:$d2),
                             (ins Float64Regs:$s),
                             "mov.b64 \t{{$d1, $d2}}, $s;", []>;

}

let hasSideEffects = false in {
  // Extract element of f16x2 register. PTX does not provide any way
  // to access elements of f16x2 vector directly, so we need to
  // extract it using a temporary register.
  // Element 0 is the first register of the unpacking mov; the other half is
  // written to a scratch .b16 register declared inside the emitted block.
  def F16x2toF16_0 : NVPTXInst<(outs Float16Regs:$dst),
                               (ins Float16x2Regs:$src),
                               "{{ .reg .b16 \t%tmp_hi;\n\t"
                               " mov.b32 \t{$dst, %tmp_hi}, $src; }}",
                               [(set Float16Regs:$dst,
                                 (extractelt (v2f16 Float16x2Regs:$src), 0))]>;
  def F16x2toF16_1 : NVPTXInst<(outs Float16Regs:$dst),
                               (ins Float16x2Regs:$src),
                               "{{ .reg .b16 \t%tmp_lo;\n\t"
                               " mov.b32 \t{%tmp_lo, $dst}, $src; }}",
                               [(set Float16Regs:$dst,
                                 (extractelt (v2f16 Float16x2Regs:$src), 1))]>;

  // Coalesce two f16 registers into f16x2
  def BuildF16x2 : NVPTXInst<(outs Float16x2Regs:$dst),
                             (ins Float16Regs:$a, Float16Regs:$b),
                             "mov.b32 \t$dst, {{$a, $b}};",
                             [(set (v2f16 Float16x2Regs:$dst),
                               (build_vector (f16 Float16Regs:$a), (f16 Float16Regs:$b)))]>;

  // Directly initializing the underlying b32 register takes one less SASS
  // instruction than a vector-packing move.
  def BuildF16x2i : NVPTXInst<(outs Float16x2Regs:$dst), (ins i32imm:$src),
                              "mov.b32 \t$dst, $src;",
                              []>;

  // Split f16x2 into two f16 registers.
  def SplitF16x2 : NVPTXInst<(outs Float16Regs:$lo, Float16Regs:$hi),
                             (ins Float16x2Regs:$src),
                             "mov.b32 \t{{$lo, $hi}}, $src;",
                             []>;
  // Split an i32 into two f16
  def SplitI32toF16x2 : NVPTXInst<(outs Float16Regs:$lo, Float16Regs:$hi),
                                  (ins Int32Regs:$src),
                                  "mov.b32 \t{{$lo, $hi}}, $src;",
                                  []>;
}

// Count leading zeros
// Both widths write their result to a 32-bit register.
let hasSideEffects = false in {
  def CLZr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
                         "clz.b32 \t$d, $a;", []>;
  def CLZr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
                         "clz.b64 \t$d, $a;", []>;
}

// 32-bit has a direct PTX instruction
def : Pat<(ctlz Int32Regs:$a), (CLZr32 Int32Regs:$a)>;

// The return type of the ctlz ISD node is the same as its input, but the PTX
// clz instruction always returns a 32-bit value.  For ctlz.i64, convert the
// ptx value to 64 bits to match the ISD node's semantics, unless we know we're
// truncating back down to 32 bits.
def : Pat<(i64 (ctlz Int64Regs:$a)), (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>;
def : Pat<(i32 (trunc (ctlz Int64Regs:$a))), (CLZr64 Int64Regs:$a)>;

// For 16-bit ctlz, we zero-extend to 32-bit, perform the count, then trunc the
// result back to 16-bits if necessary.  We also need to subtract 16 because
// the high-order 16 zeros were counted.
//
// TODO: NVPTX has a mov.b32 b32reg, {imm, b16reg} instruction, which we could
// use to save one SASS instruction (on sm_35 anyway):
//
//   mov.b32 $tmp, {0xffff, $a}
//   clz.b32 $result, $tmp
//
// That is, instead of zero-extending the input to 32 bits, we'd "one-extend"
// and then ctlz that value.  This way we don't have to subtract 16 from the
// result.  Unfortunately today we don't have a way to generate
// "mov b32reg, {b16imm, b16reg}", so we don't do this optimization.
def : Pat<(i16 (ctlz Int16Regs:$a)),
          (SUBi16ri (CVT_u16_u32
           (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE), 16)>;
def : Pat<(i32 (zext (i16 (ctlz Int16Regs:$a)))),
          (SUBi32ri (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), 16)>;

// Population count
// Both widths write their result to a 32-bit register.
let hasSideEffects = false in {
  def POPCr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
                          "popc.b32 \t$d, $a;", []>;
  def POPCr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
                          "popc.b64 \t$d, $a;", []>;
}

// 32-bit has a direct PTX instruction
def : Pat<(ctpop Int32Regs:$a), (POPCr32 Int32Regs:$a)>;

// For 64-bit, the result in PTX is actually 32-bit so we zero-extend to 64-bit
// to match the LLVM semantics.  Just as with ctlz.i64, we provide a second
// pattern that avoids the type conversion if we're truncating the result to
// i32 anyway.
def : Pat<(ctpop Int64Regs:$a), (CVT_u64_u32 (POPCr64 Int64Regs:$a), CvtNONE)>;
def : Pat<(i32 (trunc (ctpop Int64Regs:$a))), (POPCr64 Int64Regs:$a)>;

// For 16-bit, we zero-extend to 32-bit, then trunc the result back to 16-bits.
// If we know that we're storing into an i32, we can avoid the final trunc.
def : Pat<(ctpop Int16Regs:$a),
          (CVT_u16_u32 (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE)>;
def : Pat<(i32 (zext (i16 (ctpop Int16Regs:$a)))),
          (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE))>;

// fpround f32 -> f16
def : Pat<(f16 (fpround Float32Regs:$a)),
          (CVT_f16_f32 Float32Regs:$a, CvtRN)>;

// fpround f64 -> f16
def : Pat<(f16 (fpround Float64Regs:$a)),
          (CVT_f16_f64 Float64Regs:$a, CvtRN)>;

// fpround f64 -> f32
// The FTZ form is only available when doF32FTZ holds; the second pattern
// carries no predicate.
def : Pat<(f32 (fpround Float64Regs:$a)),
          (CVT_f32_f64 Float64Regs:$a, CvtRN_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(f32 (fpround Float64Regs:$a)),
          (CVT_f32_f64 Float64Regs:$a, CvtRN)>;

// fpextend f16 -> f32
// Same pairing: FTZ form guarded by doF32FTZ, then an unpredicated form.
def : Pat<(f32 (fpextend (f16 Float16Regs:$a))),
          (CVT_f32_f16 Float16Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(f32 (fpextend (f16 Float16Regs:$a))),
          (CVT_f32_f16 Float16Regs:$a, CvtNONE)>;

// fpextend f16 -> f64
def : Pat<(f64 (fpextend (f16 Float16Regs:$a))),
          (CVT_f64_f16 Float16Regs:$a, CvtNONE)>;

// fpextend f32 -> f64
// Same pairing: FTZ form guarded by doF32FTZ, then an unpredicated form.
def : Pat<(f64 (fpextend Float32Regs:$a)),
          (CVT_f64_f32 Float32Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(f64 (fpextend Float32Regs:$a)),
          (CVT_f64_f32 Float32Regs:$a, CvtNONE)>;

// SelectionDAG node for NVPTXISD::RET_FLAG: no operands or results (SDTNone),
// chained, with an optional incoming glue.
def retflag : SDNode<"NVPTXISD::RET_FLAG", SDTNone,
                     [SDNPHasChain, SDNPOptInGlue]>;

// fceil, ffloor, froundeven, ftrunc.

// Emits the selection patterns that lower the unary rounding node OpNode to a
// same-type cvt for f16, f32 and f64.
//   OpNode  - the SelectionDAG node to match.
//   Mode    - conversion mode used for f16 and f64, and for f32 when
//             doNoF32FTZ holds.
//   ModeFTZ - conversion mode used for f32 when doF32FTZ holds.
multiclass CVT_ROUND<SDNode OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {
  def : Pat<(OpNode (f16 Float16Regs:$a)),
            (CVT_f16_f16 Float16Regs:$a, Mode)>;
  def : Pat<(OpNode Float32Regs:$a),
            (CVT_f32_f32 Float32Regs:$a, ModeFTZ)>, Requires<[doF32FTZ]>;
  def : Pat<(OpNode Float32Regs:$a),
            (CVT_f32_f32 Float32Regs:$a, Mode)>, Requires<[doNoF32FTZ]>;
  def : Pat<(OpNode Float64Regs:$a),
            (CVT_f64_f64 Float64Regs:$a, Mode)>;
}

defm : CVT_ROUND<fceil, CvtRPI, CvtRPI_FTZ>;
defm : CVT_ROUND<ffloor, CvtRMI, CvtRMI_FTZ>;
defm : CVT_ROUND<froundeven, CvtRNI, CvtRNI_FTZ>;
defm : CVT_ROUND<ftrunc, CvtRZI, CvtRZI_FTZ>;

// nearbyint and rint are implemented as rounding to nearest even.  This isn't
// strictly correct, because it causes us to ignore the rounding mode.  But it
// matches what CUDA's "libm" does.

defm : CVT_ROUND<fnearbyint, CvtRNI, CvtRNI_FTZ>;
defm : CVT_ROUND<frint, CvtRNI, CvtRNI_FTZ>;

//-----------------------------------
// Control-flow
//-----------------------------------

let isTerminator=1 in {
  let isReturn=1, isBarrier=1 in
    def Return : NVPTXInst<(outs), (ins), "ret;", [(retflag)]>;

  // Branch to $target when predicate $a is set.
  let isBranch=1 in
    def CBranch : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
                            "@$a bra \t$target;",
                            [(brcond Int1Regs:$a, bb:$target)]>;
  // Branch on the negated predicate ("@!$a").  It has no pattern of its own;
  // it is selected by the explicit Pat further down.
  let isBranch=1 in
    def CBranchOther : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
                                 "@!$a bra \t$target;", []>;

  // Unconditional branch.
  let isBranch=1, isBarrier=1 in
    def GOTO : NVPTXInst<(outs), (ins brtarget:$target),
                         "bra.uni \t$target;", [(br bb:$target)]>;
}

// A brcond on an i32 value is first turned into a predicate by comparing the
// value against 0 with CmpNE.
def : Pat<(brcond Int32Regs:$a, bb:$target),
          (CBranch (SETP_u32ri Int32Regs:$a, 0, CmpNE), bb:$target)>;

// SelectionDAGBuilder::visitSwitchCase() will invert the condition of a
// conditional branch if the target block is the next block so that the code
// can fall through to the target block.  The inversion is done by 'xor
// condition, 1', which will be translated to (setne condition, -1).  Since ptx
// supports '@!pred bra target', we should use it.
def : Pat<(brcond (i1 (setne Int1Regs:$a, -1)), bb:$target),
          (CBranchOther Int1Regs:$a, bb:$target)>;

// Call
def SDT_NVPTXCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>,
                                            SDTCisVT<1, i32>]>;
def SDT_NVPTXCallSeqEnd   : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;

def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_NVPTXCallSeqStart,
                           [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
def callseq_end   : SDNode<"ISD::CALLSEQ_END", SDT_NVPTXCallSeqEnd,
                           [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
                            SDNPSideEffect]>;

// NVPTXISD::CALL takes a single i32 operand and produces no results.
def SDT_NVPTXCall : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
def call          : SDNode<"NVPTXISD::CALL", SDT_NVPTXCall,
                           [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
def calltarget : Operand<i32>;
let isCall=1 in {
   def CALL : NVPTXInst<(outs), (ins calltarget:$dst), "call \t$dst, (1);", []>;
}

def : Pat<(call tglobaladdr:$dst), (CALL tglobaladdr:$dst)>;
def : Pat<(call texternalsym:$dst), (CALL texternalsym:$dst)>;

// Pseudo instructions.
// Thin wrapper that forwards all four arguments to NVPTXInst unchanged and
// adds no fields of its own.
class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
   : NVPTXInst<outs, ins, asmstr, pattern>;

// Start of a call sequence: prints an opening brace followed by the
// declaration of the .b32 register temp_param_reg.
def Callseq_Start :
  NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2),
            "\\{ // callseq $amt1, $amt2\n"
            "\t.reg .b32 temp_param_reg;",
            [(callseq_start timm:$amt1, timm:$amt2)]>;
// End of a call sequence: prints the matching closing brace.
def Callseq_End :
  NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2),
            "\\} // callseq $amt1",
            [(callseq_end timm:$amt1, timm:$amt2)]>;

// trap instruction
def trapinst : NVPTXInst<(outs), (ins), "trap;", [(trap)]>;

// Call prototype wrapper
def SDTCallPrototype : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
def CallPrototype :
  SDNode<"NVPTXISD::CallPrototype", SDTCallPrototype,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
// Operand whose text is produced by the printProtoIdent print method.
def ProtoIdent : Operand<i32> {
  let PrintMethod = "printProtoIdent";
}
// The asm string is just the printed $ident operand.
def CALL_PROTOTYPE :
  NVPTXInst<(outs), (ins ProtoIdent:$ident),
            "$ident", [(CallPrototype (i32 texternalsym:$ident))]>;


include "NVPTXIntrinsics.td"


//-----------------------------------
// Notes
//-----------------------------------
// BSWAP is currently expanded. The following would be more efficient:
// - for < sm_20, use vector scalar mov, as Tesla supports native 16-bit
//   registers
// - for sm_20, use prmt (use vector scalar mov to get the pack and
//   unpack). sm_20 supports native 32-bit registers, but not native 16-bit
//   registers.