1//===- NVPTXInstrInfo.td - NVPTX Instruction defs -------------*- tblgen-*-===// 2// 3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4// See https://llvm.org/LICENSE.txt for license information. 5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6// 7//===----------------------------------------------------------------------===// 8// 9// This file describes the PTX instructions in TableGen format. 10// 11//===----------------------------------------------------------------------===// 12 13include "NVPTXInstrFormats.td" 14 15// A NOP instruction 16let hasSideEffects = false in { 17 def NOP : NVPTXInst<(outs), (ins), "", []>; 18} 19 20let OperandType = "OPERAND_IMMEDIATE" in { 21 def f16imm : Operand<f16>; 22 def bf16imm : Operand<bf16>; 23 24} 25 26// List of vector specific properties 27def isVecLD : VecInstTypeEnum<1>; 28def isVecST : VecInstTypeEnum<2>; 29def isVecBuild : VecInstTypeEnum<3>; 30def isVecShuffle : VecInstTypeEnum<4>; 31def isVecExtract : VecInstTypeEnum<5>; 32def isVecInsert : VecInstTypeEnum<6>; 33def isVecDest : VecInstTypeEnum<7>; 34def isVecOther : VecInstTypeEnum<15>; 35 36//===----------------------------------------------------------------------===// 37// NVPTX Operand Definitions. 38//===----------------------------------------------------------------------===// 39 40def brtarget : Operand<OtherVT>; 41 42// CVT conversion modes 43// These must match the enum in NVPTX.h 44def CvtNONE : PatLeaf<(i32 0x0)>; 45def CvtRNI : PatLeaf<(i32 0x1)>; 46def CvtRZI : PatLeaf<(i32 0x2)>; 47def CvtRMI : PatLeaf<(i32 0x3)>; 48def CvtRPI : PatLeaf<(i32 0x4)>; 49def CvtRN : PatLeaf<(i32 0x5)>; 50def CvtRZ : PatLeaf<(i32 0x6)>; 51def CvtRM : PatLeaf<(i32 0x7)>; 52def CvtRP : PatLeaf<(i32 0x8)>; 53def CvtRNA : PatLeaf<(i32 0x9)>; 54 55def CvtNONE_FTZ : PatLeaf<(i32 0x10)>; 56def CvtRNI_FTZ : PatLeaf<(i32 0x11)>; 57def CvtRZI_FTZ : PatLeaf<(i32 0x12)>; 58def CvtRMI_FTZ : PatLeaf<(i32 0x13)>; 59def CvtRPI_FTZ : PatLeaf<(i32 0x14)>; 60def CvtRN_FTZ : PatLeaf<(i32 0x15)>; 61def CvtRZ_FTZ : PatLeaf<(i32 0x16)>; 62def CvtRM_FTZ : PatLeaf<(i32 0x17)>; 63def CvtRP_FTZ : PatLeaf<(i32 0x18)>; 64 65def CvtSAT : PatLeaf<(i32 0x20)>; 66def CvtSAT_FTZ : PatLeaf<(i32 0x30)>; 67 68def CvtNONE_RELU : PatLeaf<(i32 0x40)>; 69def CvtRN_RELU : PatLeaf<(i32 0x45)>; 70def CvtRZ_RELU : PatLeaf<(i32 0x46)>; 71 72def CvtMode : Operand<i32> { 73 let PrintMethod = "printCvtMode"; 74} 75 76// Compare modes 77// These must match the enum in NVPTX.h 78def CmpEQ : PatLeaf<(i32 0)>; 79def CmpNE : PatLeaf<(i32 1)>; 80def CmpLT : PatLeaf<(i32 2)>; 81def CmpLE : PatLeaf<(i32 3)>; 82def CmpGT : PatLeaf<(i32 4)>; 83def CmpGE : PatLeaf<(i32 5)>; 84def CmpEQU : PatLeaf<(i32 10)>; 85def CmpNEU : PatLeaf<(i32 11)>; 86def CmpLTU : PatLeaf<(i32 12)>; 87def CmpLEU : PatLeaf<(i32 13)>; 88def CmpGTU : PatLeaf<(i32 14)>; 89def CmpGEU : PatLeaf<(i32 15)>; 90def CmpNUM : PatLeaf<(i32 16)>; 91def CmpNAN : PatLeaf<(i32 17)>; 92 93def CmpEQ_FTZ : PatLeaf<(i32 0x100)>; 94def CmpNE_FTZ : PatLeaf<(i32 0x101)>; 95def CmpLT_FTZ : PatLeaf<(i32 0x102)>; 96def CmpLE_FTZ : PatLeaf<(i32 0x103)>; 97def CmpGT_FTZ : PatLeaf<(i32 0x104)>; 98def CmpGE_FTZ : PatLeaf<(i32 0x105)>; 99def CmpEQU_FTZ : PatLeaf<(i32 0x10A)>; 100def CmpNEU_FTZ : PatLeaf<(i32 0x10B)>; 101def CmpLTU_FTZ : PatLeaf<(i32 0x10C)>; 102def CmpLEU_FTZ : PatLeaf<(i32 0x10D)>; 103def CmpGTU_FTZ : PatLeaf<(i32 0x10E)>; 104def CmpGEU_FTZ : PatLeaf<(i32 0x10F)>; 105def CmpNUM_FTZ : PatLeaf<(i32 0x110)>; 106def CmpNAN_FTZ : PatLeaf<(i32 0x111)>; 107 108def CmpMode : Operand<i32> { 109 let PrintMethod = "printCmpMode"; 110} 111def VecElement : Operand<i32> { 112 let PrintMethod = "printVecElement"; 113} 114 115//===----------------------------------------------------------------------===// 116// NVPTX Instruction Predicate Definitions 117//===----------------------------------------------------------------------===// 118 119 120def hasAtomAddF64 : Predicate<"Subtarget->hasAtomAddF64()">; 121def hasAtomScope : Predicate<"Subtarget->hasAtomScope()">; 122def hasAtomBitwise64 : Predicate<"Subtarget->hasAtomBitwise64()">; 123def hasAtomMinMax64 : Predicate<"Subtarget->hasAtomMinMax64()">; 124def hasVote : Predicate<"Subtarget->hasVote()">; 125def hasDouble : Predicate<"Subtarget->hasDouble()">; 126def hasLDG : Predicate<"Subtarget->hasLDG()">; 127def hasLDU : Predicate<"Subtarget->hasLDU()">; 128 129def doF32FTZ : Predicate<"useF32FTZ()">; 130def doNoF32FTZ : Predicate<"!useF32FTZ()">; 131 132def doMulWide : Predicate<"doMulWide">; 133 134def allowFMA : Predicate<"allowFMA()">; 135def noFMA : Predicate<"!allowFMA()">; 136def allowUnsafeFPMath : Predicate<"allowUnsafeFPMath()">; 137def noUnsafeFPMath : Predicate<"!allowUnsafeFPMath()">; 138 139def do_DIVF32_APPROX : Predicate<"getDivF32Level()==0">; 140def do_DIVF32_FULL : Predicate<"getDivF32Level()==1">; 141 142def do_SQRTF32_APPROX : Predicate<"!usePrecSqrtF32()">; 143def do_SQRTF32_RN : Predicate<"usePrecSqrtF32()">; 144 145def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">; 146def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">; 147 148def True : Predicate<"true">; 149 150class hasPTX<int version>: Predicate<"Subtarget->getPTXVersion() >= " # version>; 151class hasSM<int version>: Predicate<"Subtarget->getSmVersion() >= " # version>; 152 153// non-sync shfl instructions are not available on sm_70+ in PTX6.4+ 154def hasSHFL : Predicate<"!(Subtarget->getSmVersion() >= 70" 155 "&& Subtarget->getPTXVersion() >= 64)">; 156 157def useShortPtr : Predicate<"useShortPointers()">; 158def useFP16Math: Predicate<"Subtarget->allowFP16Math()">; 159def hasBF16Math: Predicate<"Subtarget->hasBF16Math()">; 160 161// Helper class to aid conversion between ValueType and a matching RegisterClass. 162 163class ValueToRegClass<ValueType T> { 164 string name = !cast<string>(T); 165 NVPTXRegClass ret = !cond( 166 !eq(name, "i1"): Int1Regs, 167 !eq(name, "i16"): Int16Regs, 168 !eq(name, "i32"): Int32Regs, 169 !eq(name, "i64"): Int64Regs, 170 !eq(name, "f16"): Int16Regs, 171 !eq(name, "v2f16"): Int32Regs, 172 !eq(name, "bf16"): Int16Regs, 173 !eq(name, "v2bf16"): Int32Regs, 174 !eq(name, "f32"): Float32Regs, 175 !eq(name, "f64"): Float64Regs, 176 !eq(name, "ai32"): Int32ArgRegs, 177 !eq(name, "ai64"): Int64ArgRegs, 178 !eq(name, "af32"): Float32ArgRegs, 179 !eq(name, "if64"): Float64ArgRegs, 180 ); 181} 182 183 184//===----------------------------------------------------------------------===// 185// Some Common Instruction Class Templates 186//===----------------------------------------------------------------------===// 187 188// Template for instructions which take three int64, int32, or int16 args. 189// The instructions are named "<OpcStr><Width>" (e.g. "add.s64"). 190multiclass I3<string OpcStr, SDNode OpNode> { 191 def i64rr : 192 NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b), 193 !strconcat(OpcStr, "64 \t$dst, $a, $b;"), 194 [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>; 195 def i64ri : 196 NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b), 197 !strconcat(OpcStr, "64 \t$dst, $a, $b;"), 198 [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>; 199 def i32rr : 200 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), 201 !strconcat(OpcStr, "32 \t$dst, $a, $b;"), 202 [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>; 203 def i32ri : 204 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b), 205 !strconcat(OpcStr, "32 \t$dst, $a, $b;"), 206 [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>; 207 def i16rr : 208 NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b), 209 !strconcat(OpcStr, "16 \t$dst, $a, $b;"), 210 [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>; 211 def i16ri : 212 NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b), 213 !strconcat(OpcStr, "16 \t$dst, $a, $b;"), 214 [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (imm):$b))]>; 215} 216 217// Template for instructions which take 3 int args. The instructions are 218// named "<OpcStr>.s32" (e.g. "addc.cc.s32"). 219multiclass ADD_SUB_INT_CARRY<string OpcStr, SDNode OpNode> { 220 let hasSideEffects = 1 in { 221 def i32rr : 222 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), 223 !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"), 224 [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>; 225 def i32ri : 226 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b), 227 !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"), 228 [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>; 229 def i64rr : 230 NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b), 231 !strconcat(OpcStr, ".s64 \t$dst, $a, $b;"), 232 [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>, 233 Requires<[hasPTX<43>]>; 234 def i64ri : 235 NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b), 236 !strconcat(OpcStr, ".s64 \t$dst, $a, $b;"), 237 [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>, 238 Requires<[hasPTX<43>]>; 239 } 240} 241 242// Template for instructions which take three fp64 or fp32 args. The 243// instructions are named "<OpcStr>.f<Width>" (e.g. "min.f64"). 244// 245// Also defines ftz (flush subnormal inputs and results to sign-preserving 246// zero) variants for fp32 functions. 247// 248// This multiclass should be used for nodes that cannot be folded into FMAs. 249// For nodes that can be folded into FMAs (i.e. adds and muls), use 250// F3_fma_component. 251multiclass F3<string OpcStr, SDNode OpNode> { 252 def f64rr : 253 NVPTXInst<(outs Float64Regs:$dst), 254 (ins Float64Regs:$a, Float64Regs:$b), 255 !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"), 256 [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>; 257 def f64ri : 258 NVPTXInst<(outs Float64Regs:$dst), 259 (ins Float64Regs:$a, f64imm:$b), 260 !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"), 261 [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>; 262 def f32rr_ftz : 263 NVPTXInst<(outs Float32Regs:$dst), 264 (ins Float32Regs:$a, Float32Regs:$b), 265 !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"), 266 [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>, 267 Requires<[doF32FTZ]>; 268 def f32ri_ftz : 269 NVPTXInst<(outs Float32Regs:$dst), 270 (ins Float32Regs:$a, f32imm:$b), 271 !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"), 272 [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>, 273 Requires<[doF32FTZ]>; 274 def f32rr : 275 NVPTXInst<(outs Float32Regs:$dst), 276 (ins Float32Regs:$a, Float32Regs:$b), 277 !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"), 278 [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>; 279 def f32ri : 280 NVPTXInst<(outs Float32Regs:$dst), 281 (ins Float32Regs:$a, f32imm:$b), 282 !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"), 283 [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>; 284 285 def f16rr_ftz : 286 NVPTXInst<(outs Int16Regs:$dst), 287 (ins Int16Regs:$a, Int16Regs:$b), 288 !strconcat(OpcStr, ".ftz.f16 \t$dst, $a, $b;"), 289 [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>, 290 Requires<[useFP16Math, doF32FTZ]>; 291 def f16rr : 292 NVPTXInst<(outs Int16Regs:$dst), 293 (ins Int16Regs:$a, Int16Regs:$b), 294 !strconcat(OpcStr, ".f16 \t$dst, $a, $b;"), 295 [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>, 296 Requires<[useFP16Math]>; 297 298 def f16x2rr_ftz : 299 NVPTXInst<(outs Int32Regs:$dst), 300 (ins Int32Regs:$a, Int32Regs:$b), 301 !strconcat(OpcStr, ".ftz.f16x2 \t$dst, $a, $b;"), 302 [(set Int32Regs:$dst, (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>, 303 Requires<[useFP16Math, doF32FTZ]>; 304 def f16x2rr : 305 NVPTXInst<(outs Int32Regs:$dst), 306 (ins Int32Regs:$a, Int32Regs:$b), 307 !strconcat(OpcStr, ".f16x2 \t$dst, $a, $b;"), 308 [(set Int32Regs:$dst, (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>, 309 Requires<[useFP16Math]>; 310 def bf16rr_ftz : 311 NVPTXInst<(outs Int16Regs:$dst), 312 (ins Int16Regs:$a, Int16Regs:$b), 313 !strconcat(OpcStr, ".ftz.bf16 \t$dst, $a, $b;"), 314 [(set Int16Regs:$dst, (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>, 315 Requires<[hasBF16Math, doF32FTZ]>; 316 def bf16rr : 317 NVPTXInst<(outs Int16Regs:$dst), 318 (ins Int16Regs:$a, Int16Regs:$b), 319 !strconcat(OpcStr, ".bf16 \t$dst, $a, $b;"), 320 [(set Int16Regs:$dst, (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>, 321 Requires<[hasBF16Math]>; 322 323 def bf16x2rr_ftz : 324 NVPTXInst<(outs Int32Regs:$dst), 325 (ins Int32Regs:$a, Int32Regs:$b), 326 !strconcat(OpcStr, ".ftz.bf16x2 \t$dst, $a, $b;"), 327 [(set Int32Regs:$dst, (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>, 328 Requires<[hasBF16Math, doF32FTZ]>; 329 def bf16x2rr : 330 NVPTXInst<(outs Int32Regs:$dst), 331 (ins Int32Regs:$a, Int32Regs:$b), 332 !strconcat(OpcStr, ".bf16x2 \t$dst, $a, $b;"), 333 [(set Int32Regs:$dst, (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>, 334 Requires<[hasBF16Math]>; 335} 336 337// Template for instructions which take three FP args. The 338// instructions are named "<OpcStr>.f<Width>" (e.g. "add.f64"). 339// 340// Also defines ftz (flush subnormal inputs and results to sign-preserving 341// zero) variants for fp32/fp16 functions. 342// 343// This multiclass should be used for nodes that can be folded to make fma ops. 344// In this case, we use the ".rn" variant when FMA is disabled, as this behaves 345// just like the non ".rn" op, but prevents ptxas from creating FMAs. 346multiclass F3_fma_component<string OpcStr, SDNode OpNode> { 347 def f64rr : 348 NVPTXInst<(outs Float64Regs:$dst), 349 (ins Float64Regs:$a, Float64Regs:$b), 350 !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"), 351 [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>, 352 Requires<[allowFMA]>; 353 def f64ri : 354 NVPTXInst<(outs Float64Regs:$dst), 355 (ins Float64Regs:$a, f64imm:$b), 356 !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"), 357 [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>, 358 Requires<[allowFMA]>; 359 def f32rr_ftz : 360 NVPTXInst<(outs Float32Regs:$dst), 361 (ins Float32Regs:$a, Float32Regs:$b), 362 !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"), 363 [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>, 364 Requires<[allowFMA, doF32FTZ]>; 365 def f32ri_ftz : 366 NVPTXInst<(outs Float32Regs:$dst), 367 (ins Float32Regs:$a, f32imm:$b), 368 !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"), 369 [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>, 370 Requires<[allowFMA, doF32FTZ]>; 371 def f32rr : 372 NVPTXInst<(outs Float32Regs:$dst), 373 (ins Float32Regs:$a, Float32Regs:$b), 374 !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"), 375 [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>, 376 Requires<[allowFMA]>; 377 def f32ri : 378 NVPTXInst<(outs Float32Regs:$dst), 379 (ins Float32Regs:$a, f32imm:$b), 380 !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"), 381 [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>, 382 Requires<[allowFMA]>; 383 384 def f16rr_ftz : 385 NVPTXInst<(outs Int16Regs:$dst), 386 (ins Int16Regs:$a, Int16Regs:$b), 387 !strconcat(OpcStr, ".ftz.f16 \t$dst, $a, $b;"), 388 [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>, 389 Requires<[useFP16Math, allowFMA, doF32FTZ]>; 390 def f16rr : 391 NVPTXInst<(outs Int16Regs:$dst), 392 (ins Int16Regs:$a, Int16Regs:$b), 393 !strconcat(OpcStr, ".f16 \t$dst, $a, $b;"), 394 [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>, 395 Requires<[useFP16Math, allowFMA]>; 396 397 def f16x2rr_ftz : 398 NVPTXInst<(outs Int32Regs:$dst), 399 (ins Int32Regs:$a, Int32Regs:$b), 400 !strconcat(OpcStr, ".ftz.f16x2 \t$dst, $a, $b;"), 401 [(set (v2f16 Int32Regs:$dst), (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>, 402 Requires<[useFP16Math, allowFMA, doF32FTZ]>; 403 def f16x2rr : 404 NVPTXInst<(outs Int32Regs:$dst), 405 (ins Int32Regs:$a, Int32Regs:$b), 406 !strconcat(OpcStr, ".f16x2 \t$dst, $a, $b;"), 407 [(set Int32Regs:$dst, (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>, 408 Requires<[useFP16Math, allowFMA]>; 409 def bf16rr_ftz : 410 NVPTXInst<(outs Int16Regs:$dst), 411 (ins Int16Regs:$a, Int16Regs:$b), 412 !strconcat(OpcStr, ".ftz.bf16 \t$dst, $a, $b;"), 413 [(set Int16Regs:$dst, (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>, 414 Requires<[hasBF16Math, allowFMA, doF32FTZ]>; 415 def bf16rr : 416 NVPTXInst<(outs Int16Regs:$dst), 417 (ins Int16Regs:$a, Int16Regs:$b), 418 !strconcat(OpcStr, ".bf16 \t$dst, $a, $b;"), 419 [(set Int16Regs:$dst, (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>, 420 Requires<[hasBF16Math, allowFMA]>; 421 422 def bf16x2rr_ftz : 423 NVPTXInst<(outs Int32Regs:$dst), 424 (ins Int32Regs:$a, Int32Regs:$b), 425 !strconcat(OpcStr, ".ftz.bf16x2 \t$dst, $a, $b;"), 426 [(set (v2bf16 Int32Regs:$dst), (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>, 427 Requires<[hasBF16Math, allowFMA, doF32FTZ]>; 428 def bf16x2rr : 429 NVPTXInst<(outs Int32Regs:$dst), 430 (ins Int32Regs:$a, Int32Regs:$b), 431 !strconcat(OpcStr, ".bf16x2 \t$dst, $a, $b;"), 432 [(set Int32Regs:$dst, (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>, 433 Requires<[hasBF16Math, allowFMA]>; 434 // These have strange names so we don't perturb existing mir tests. 435 def _rnf64rr : 436 NVPTXInst<(outs Float64Regs:$dst), 437 (ins Float64Regs:$a, Float64Regs:$b), 438 !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"), 439 [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>, 440 Requires<[noFMA]>; 441 def _rnf64ri : 442 NVPTXInst<(outs Float64Regs:$dst), 443 (ins Float64Regs:$a, f64imm:$b), 444 !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"), 445 [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>, 446 Requires<[noFMA]>; 447 def _rnf32rr_ftz : 448 NVPTXInst<(outs Float32Regs:$dst), 449 (ins Float32Regs:$a, Float32Regs:$b), 450 !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"), 451 [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>, 452 Requires<[noFMA, doF32FTZ]>; 453 def _rnf32ri_ftz : 454 NVPTXInst<(outs Float32Regs:$dst), 455 (ins Float32Regs:$a, f32imm:$b), 456 !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"), 457 [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>, 458 Requires<[noFMA, doF32FTZ]>; 459 def _rnf32rr : 460 NVPTXInst<(outs Float32Regs:$dst), 461 (ins Float32Regs:$a, Float32Regs:$b), 462 !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"), 463 [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>, 464 Requires<[noFMA]>; 465 def _rnf32ri : 466 NVPTXInst<(outs Float32Regs:$dst), 467 (ins Float32Regs:$a, f32imm:$b), 468 !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"), 469 [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>, 470 Requires<[noFMA]>; 471 def _rnf16rr_ftz : 472 NVPTXInst<(outs Int16Regs:$dst), 473 (ins Int16Regs:$a, Int16Regs:$b), 474 !strconcat(OpcStr, ".rn.ftz.f16 \t$dst, $a, $b;"), 475 [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>, 476 Requires<[useFP16Math, noFMA, doF32FTZ]>; 477 def _rnf16rr : 478 NVPTXInst<(outs Int16Regs:$dst), 479 (ins Int16Regs:$a, Int16Regs:$b), 480 !strconcat(OpcStr, ".rn.f16 \t$dst, $a, $b;"), 481 [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>, 482 Requires<[useFP16Math, noFMA]>; 483 def _rnf16x2rr_ftz : 484 NVPTXInst<(outs Int32Regs:$dst), 485 (ins Int32Regs:$a, Int32Regs:$b), 486 !strconcat(OpcStr, ".rn.ftz.f16x2 \t$dst, $a, $b;"), 487 [(set Int32Regs:$dst, (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>, 488 Requires<[useFP16Math, noFMA, doF32FTZ]>; 489 def _rnf16x2rr : 490 NVPTXInst<(outs Int32Regs:$dst), 491 (ins Int32Regs:$a, Int32Regs:$b), 492 !strconcat(OpcStr, ".rn.f16x2 \t$dst, $a, $b;"), 493 [(set Int32Regs:$dst, (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>, 494 Requires<[useFP16Math, noFMA]>; 495 def _rnbf16rr_ftz : 496 NVPTXInst<(outs Int16Regs:$dst), 497 (ins Int16Regs:$a, Int16Regs:$b), 498 !strconcat(OpcStr, ".rn.ftz.bf16 \t$dst, $a, $b;"), 499 [(set Int16Regs:$dst, (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>, 500 Requires<[hasBF16Math, noFMA, doF32FTZ]>; 501 def _rnbf16rr : 502 NVPTXInst<(outs Int16Regs:$dst), 503 (ins Int16Regs:$a, Int16Regs:$b), 504 !strconcat(OpcStr, ".rn.bf16 \t$dst, $a, $b;"), 505 [(set Int16Regs:$dst, (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>, 506 Requires<[hasBF16Math, noFMA]>; 507 def _rnbf16x2rr_ftz : 508 NVPTXInst<(outs Int32Regs:$dst), 509 (ins Int32Regs:$a, Int32Regs:$b), 510 !strconcat(OpcStr, ".rn.ftz.bf16x2 \t$dst, $a, $b;"), 511 [(set Int32Regs:$dst, (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>, 512 Requires<[hasBF16Math, noFMA, doF32FTZ]>; 513 def _rnbf16x2rr : 514 NVPTXInst<(outs Int32Regs:$dst), 515 (ins Int32Regs:$a, Int32Regs:$b), 516 !strconcat(OpcStr, ".rn.bf16x2 \t$dst, $a, $b;"), 517 [(set Int32Regs:$dst, (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>, 518 Requires<[hasBF16Math, noFMA]>; 519} 520 521// Template for operations which take two f32 or f64 operands. Provides three 522// instructions: <OpcStr>.f64, <OpcStr>.f32, and <OpcStr>.ftz.f32 (flush 523// subnormal inputs and results to zero). 524multiclass F2<string OpcStr, SDNode OpNode> { 525 def f64 : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a), 526 !strconcat(OpcStr, ".f64 \t$dst, $a;"), 527 [(set Float64Regs:$dst, (OpNode Float64Regs:$a))]>; 528 def f32_ftz : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a), 529 !strconcat(OpcStr, ".ftz.f32 \t$dst, $a;"), 530 [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>, 531 Requires<[doF32FTZ]>; 532 def f32 : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a), 533 !strconcat(OpcStr, ".f32 \t$dst, $a;"), 534 [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>; 535} 536 537//===----------------------------------------------------------------------===// 538// NVPTX Instructions. 539//===----------------------------------------------------------------------===// 540 541//----------------------------------- 542// Type Conversion 543//----------------------------------- 544 545let hasSideEffects = false in { 546 // Generate a cvt to the given type from all possible types. Each instance 547 // takes a CvtMode immediate that defines the conversion mode to use. It can 548 // be CvtNONE to omit a conversion mode. 549 multiclass CVT_FROM_ALL<string ToType, RegisterClass RC, list<Predicate> Preds = []> { 550 def _s8 : 551 NVPTXInst<(outs RC:$dst), 552 (ins Int16Regs:$src, CvtMode:$mode), 553 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", 554 ToType, ".s8 \t$dst, $src;"), []>, 555 Requires<Preds>; 556 def _u8 : 557 NVPTXInst<(outs RC:$dst), 558 (ins Int16Regs:$src, CvtMode:$mode), 559 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", 560 ToType, ".u8 \t$dst, $src;"), []>, 561 Requires<Preds>; 562 def _s16 : 563 NVPTXInst<(outs RC:$dst), 564 (ins Int16Regs:$src, CvtMode:$mode), 565 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", 566 ToType, ".s16 \t$dst, $src;"), []>, 567 Requires<Preds>; 568 def _u16 : 569 NVPTXInst<(outs RC:$dst), 570 (ins Int16Regs:$src, CvtMode:$mode), 571 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", 572 ToType, ".u16 \t$dst, $src;"), []>, 573 Requires<Preds>; 574 def _s32 : 575 NVPTXInst<(outs RC:$dst), 576 (ins Int32Regs:$src, CvtMode:$mode), 577 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", 578 ToType, ".s32 \t$dst, $src;"), []>, 579 Requires<Preds>; 580 def _u32 : 581 NVPTXInst<(outs RC:$dst), 582 (ins Int32Regs:$src, CvtMode:$mode), 583 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", 584 ToType, ".u32 \t$dst, $src;"), []>, 585 Requires<Preds>; 586 def _s64 : 587 NVPTXInst<(outs RC:$dst), 588 (ins Int64Regs:$src, CvtMode:$mode), 589 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", 590 ToType, ".s64 \t$dst, $src;"), []>, 591 Requires<Preds>; 592 def _u64 : 593 NVPTXInst<(outs RC:$dst), 594 (ins Int64Regs:$src, CvtMode:$mode), 595 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", 596 ToType, ".u64 \t$dst, $src;"), []>, 597 Requires<Preds>; 598 def _f16 : 599 NVPTXInst<(outs RC:$dst), 600 (ins Int16Regs:$src, CvtMode:$mode), 601 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", 602 ToType, ".f16 \t$dst, $src;"), []>, 603 Requires<Preds>; 604 def _bf16 : 605 NVPTXInst<(outs RC:$dst), 606 (ins Int16Regs:$src, CvtMode:$mode), 607 !strconcat("cvt${mode:base}${mode:ftz}${mode:relu}${mode:sat}.", 608 ToType, ".bf16 \t$dst, $src;"), []>, 609 Requires<!if(!eq(ToType, "f32"), 610 // bf16->f32 was introduced early. 611 [hasPTX<71>, hasSM<80>], 612 // bf16->everything else needs sm90/ptx78 613 [hasPTX<78>, hasSM<90>])>; 614 def _f32 : 615 NVPTXInst<(outs RC:$dst), 616 (ins Float32Regs:$src, CvtMode:$mode), 617 !strconcat("cvt${mode:base}${mode:ftz}${mode:relu}${mode:sat}.", 618 ToType, ".f32 \t$dst, $src;"), []>, 619 Requires<!if(!eq(ToType, "bf16"), 620 // f32->bf16 was introduced early. 621 [hasPTX<70>, hasSM<80>], 622 Preds)>; 623 def _f64 : 624 NVPTXInst<(outs RC:$dst), 625 (ins Float64Regs:$src, CvtMode:$mode), 626 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", 627 ToType, ".f64 \t$dst, $src;"), []>, 628 Requires<Preds>; 629 } 630 631 // Generate cvts from all types to all types. 632 defm CVT_s8 : CVT_FROM_ALL<"s8", Int16Regs>; 633 defm CVT_u8 : CVT_FROM_ALL<"u8", Int16Regs>; 634 defm CVT_s16 : CVT_FROM_ALL<"s16", Int16Regs>; 635 defm CVT_u16 : CVT_FROM_ALL<"u16", Int16Regs>; 636 defm CVT_s32 : CVT_FROM_ALL<"s32", Int32Regs>; 637 defm CVT_u32 : CVT_FROM_ALL<"u32", Int32Regs>; 638 defm CVT_s64 : CVT_FROM_ALL<"s64", Int64Regs>; 639 defm CVT_u64 : CVT_FROM_ALL<"u64", Int64Regs>; 640 defm CVT_f16 : CVT_FROM_ALL<"f16", Int16Regs>; 641 defm CVT_bf16 : CVT_FROM_ALL<"bf16", Int16Regs, [hasPTX<78>, hasSM<90>]>; 642 defm CVT_f32 : CVT_FROM_ALL<"f32", Float32Regs>; 643 defm CVT_f64 : CVT_FROM_ALL<"f64", Float64Regs>; 644 645 // These cvts are different from those above: The source and dest registers 646 // are of the same type. 647 def CVT_INREG_s16_s8 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src), 648 "cvt.s16.s8 \t$dst, $src;", []>; 649 def CVT_INREG_s32_s8 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src), 650 "cvt.s32.s8 \t$dst, $src;", []>; 651 def CVT_INREG_s32_s16 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src), 652 "cvt.s32.s16 \t$dst, $src;", []>; 653 def CVT_INREG_s64_s8 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src), 654 "cvt.s64.s8 \t$dst, $src;", []>; 655 def CVT_INREG_s64_s16 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src), 656 "cvt.s64.s16 \t$dst, $src;", []>; 657 def CVT_INREG_s64_s32 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src), 658 "cvt.s64.s32 \t$dst, $src;", []>; 659 660 multiclass CVT_FROM_FLOAT_V2_SM80<string FromName, RegisterClass RC> { 661 def _f32 : 662 NVPTXInst<(outs RC:$dst), 663 (ins Float32Regs:$src1, Float32Regs:$src2, CvtMode:$mode), 664 !strconcat("cvt${mode:base}${mode:relu}.", 665 FromName, ".f32 \t$dst, $src1, $src2;"), []>, 666 Requires<[hasPTX<70>, hasSM<80>]>; 667 } 668 669 defm CVT_f16x2 : CVT_FROM_FLOAT_V2_SM80<"f16x2", Int32Regs>; 670 defm CVT_bf16x2 : CVT_FROM_FLOAT_V2_SM80<"bf16x2", Int32Regs>; 671} 672 673//----------------------------------- 674// Selection instructions (selp) 675//----------------------------------- 676 677// TODO: Missing slct 678 679// selp instructions that don't have any pattern matches; we explicitly use 680// them within this file. 681let hasSideEffects = false in { 682 multiclass SELP<string TypeStr, RegisterClass RC, Operand ImmCls> { 683 def rr : NVPTXInst<(outs RC:$dst), 684 (ins RC:$a, RC:$b, Int1Regs:$p), 685 !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>; 686 def ri : NVPTXInst<(outs RC:$dst), 687 (ins RC:$a, ImmCls:$b, Int1Regs:$p), 688 !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>; 689 def ir : NVPTXInst<(outs RC:$dst), 690 (ins ImmCls:$a, RC:$b, Int1Regs:$p), 691 !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>; 692 def ii : NVPTXInst<(outs RC:$dst), 693 (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p), 694 !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>; 695 } 696 697 multiclass SELP_PATTERN<string TypeStr, ValueType T, RegisterClass RC, 698 Operand ImmCls, SDNode ImmNode> { 699 def rr : 700 NVPTXInst<(outs RC:$dst), 701 (ins RC:$a, RC:$b, Int1Regs:$p), 702 !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), 703 [(set (T RC:$dst), (select Int1Regs:$p, (T RC:$a), (T RC:$b)))]>; 704 def ri : 705 NVPTXInst<(outs RC:$dst), 706 (ins RC:$a, ImmCls:$b, Int1Regs:$p), 707 !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), 708 [(set (T RC:$dst), (select Int1Regs:$p, (T RC:$a), (T ImmNode:$b)))]>; 709 def ir : 710 NVPTXInst<(outs RC:$dst), 711 (ins ImmCls:$a, RC:$b, Int1Regs:$p), 712 !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), 713 [(set (T RC:$dst), (select Int1Regs:$p, ImmNode:$a, (T RC:$b)))]>; 714 def ii : 715 NVPTXInst<(outs RC:$dst), 716 (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p), 717 !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), 718 [(set (T RC:$dst), (select Int1Regs:$p, ImmNode:$a, ImmNode:$b))]>; 719 } 720} 721 722// Don't pattern match on selp.{s,u}{16,32,64} -- selp.b{16,32,64} is just as 723// good. 724defm SELP_b16 : SELP_PATTERN<"b16", i16, Int16Regs, i16imm, imm>; 725defm SELP_s16 : SELP<"s16", Int16Regs, i16imm>; 726defm SELP_u16 : SELP<"u16", Int16Regs, i16imm>; 727defm SELP_b32 : SELP_PATTERN<"b32", i32, Int32Regs, i32imm, imm>; 728defm SELP_s32 : SELP<"s32", Int32Regs, i32imm>; 729defm SELP_u32 : SELP<"u32", Int32Regs, i32imm>; 730defm SELP_b64 : SELP_PATTERN<"b64", i64, Int64Regs, i64imm, imm>; 731defm SELP_s64 : SELP<"s64", Int64Regs, i64imm>; 732defm SELP_u64 : SELP<"u64", Int64Regs, i64imm>; 733defm SELP_f16 : SELP_PATTERN<"b16", f16, Int16Regs, f16imm, fpimm>; 734defm SELP_bf16 : SELP_PATTERN<"b16", bf16, Int16Regs, bf16imm, fpimm>; 735 736defm SELP_f32 : SELP_PATTERN<"f32", f32, Float32Regs, f32imm, fpimm>; 737defm SELP_f64 : SELP_PATTERN<"f64", f64, Float64Regs, f64imm, fpimm>; 738 739// This does not work as tablegen fails to infer the type of 'imm'. 740// def v2f16imm : Operand<v2f16>; 741// defm SELP_f16x2 : SELP_PATTERN<"b32", v2f16, Int32Regs, v2f16imm, imm>; 742 743def SELP_f16x2rr : 744 NVPTXInst<(outs Int32Regs:$dst), 745 (ins Int32Regs:$a, Int32Regs:$b, Int1Regs:$p), 746 "selp.b32 \t$dst, $a, $b, $p;", 747 [(set Int32Regs:$dst, 748 (select Int1Regs:$p, (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>; 749 750//----------------------------------- 751// Test Instructions 752//----------------------------------- 753 754def TESTINF_f32r : NVPTXInst<(outs Int1Regs:$p), (ins Float32Regs:$a), 755 "testp.infinite.f32 \t$p, $a;", 756 []>; 757def TESTINF_f32i : NVPTXInst<(outs Int1Regs:$p), (ins f32imm:$a), 758 "testp.infinite.f32 \t$p, $a;", 759 []>; 760def TESTINF_f64r : NVPTXInst<(outs Int1Regs:$p), (ins Float64Regs:$a), 761 "testp.infinite.f64 \t$p, $a;", 762 []>; 763def TESTINF_f64i : NVPTXInst<(outs Int1Regs:$p), (ins f64imm:$a), 764 "testp.infinite.f64 \t$p, $a;", 765 []>; 766 767//----------------------------------- 768// Integer Arithmetic 769//----------------------------------- 770 771// Template for xor masquerading as int1 arithmetic. 772multiclass ADD_SUB_i1<SDNode OpNode> { 773 def _rr: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b), 774 "xor.pred \t$dst, $a, $b;", 775 [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>; 776 def _ri: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b), 777 "xor.pred \t$dst, $a, $b;", 778 [(set Int1Regs:$dst, (OpNode Int1Regs:$a, (imm):$b))]>; 779} 780 781// int1 addition and subtraction are both just xor. 782defm ADD_i1 : ADD_SUB_i1<add>; 783defm SUB_i1 : ADD_SUB_i1<sub>; 784 785// int16, int32, and int64 signed addition. Since nvptx is 2's complement, we 786// also use these for unsigned arithmetic. 787defm ADD : I3<"add.s", add>; 788defm SUB : I3<"sub.s", sub>; 789 790// in32 and int64 addition and subtraction with carry-out. 791defm ADDCC : ADD_SUB_INT_CARRY<"add.cc", addc>; 792defm SUBCC : ADD_SUB_INT_CARRY<"sub.cc", subc>; 793 794// int32 and int64 addition and subtraction with carry-in and carry-out. 795defm ADDCCC : ADD_SUB_INT_CARRY<"addc.cc", adde>; 796defm SUBCCC : ADD_SUB_INT_CARRY<"subc.cc", sube>; 797 798defm MULT : I3<"mul.lo.s", mul>; 799 800defm MULTHS : I3<"mul.hi.s", mulhs>; 801defm MULTHU : I3<"mul.hi.u", mulhu>; 802 803defm SDIV : I3<"div.s", sdiv>; 804defm UDIV : I3<"div.u", udiv>; 805 806// The ri versions of rem.s and rem.u won't be selected; DAGCombiner::visitSREM 807// will lower it. 808defm SREM : I3<"rem.s", srem>; 809defm UREM : I3<"rem.u", urem>; 810 811// Integer absolute value. NumBits should be one minus the bit width of RC. 812// This idiom implements the algorithm at 813// http://graphics.stanford.edu/~seander/bithacks.html#IntegerAbs. 814multiclass ABS<RegisterClass RC, string SizeName> { 815 def : NVPTXInst<(outs RC:$dst), (ins RC:$a), 816 !strconcat("abs", SizeName, " \t$dst, $a;"), 817 [(set RC:$dst, (abs RC:$a))]>; 818} 819defm ABS_16 : ABS<Int16Regs, ".s16">; 820defm ABS_32 : ABS<Int32Regs, ".s32">; 821defm ABS_64 : ABS<Int64Regs, ".s64">; 822 823// Integer min/max. 824defm SMAX : I3<"max.s", smax>; 825defm UMAX : I3<"max.u", umax>; 826defm SMIN : I3<"min.s", smin>; 827defm UMIN : I3<"min.u", umin>; 828 829// 830// Wide multiplication 831// 832def MULWIDES64 : 833 NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), 834 "mul.wide.s32 \t$dst, $a, $b;", []>; 835def MULWIDES64Imm : 836 NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b), 837 "mul.wide.s32 \t$dst, $a, $b;", []>; 838def MULWIDES64Imm64 : 839 NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b), 840 "mul.wide.s32 \t$dst, $a, $b;", []>; 841 842def MULWIDEU64 : 843 NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), 844 "mul.wide.u32 \t$dst, $a, $b;", []>; 845def MULWIDEU64Imm : 846 NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b), 847 "mul.wide.u32 \t$dst, $a, $b;", []>; 848def MULWIDEU64Imm64 : 849 NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b), 850 "mul.wide.u32 \t$dst, $a, $b;", []>; 851 852def MULWIDES32 : 853 NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b), 854 "mul.wide.s16 \t$dst, $a, $b;", []>; 855def MULWIDES32Imm : 856 NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b), 857 "mul.wide.s16 \t$dst, $a, $b;", []>; 858def MULWIDES32Imm32 : 859 NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b), 860 "mul.wide.s16 \t$dst, $a, $b;", []>; 861 862def MULWIDEU32 : 863 NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b), 864 "mul.wide.u16 \t$dst, $a, $b;", []>; 865def MULWIDEU32Imm : 866 NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b), 867 "mul.wide.u16 \t$dst, $a, $b;", []>; 868def MULWIDEU32Imm32 : 869 NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b), 870 "mul.wide.u16 \t$dst, $a, $b;", []>; 871 872def SDTMulWide : SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>]>; 873def mul_wide_signed : SDNode<"NVPTXISD::MUL_WIDE_SIGNED", SDTMulWide>; 874def mul_wide_unsigned : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide>; 875 876// Matchers for signed, unsigned mul.wide ISD nodes. 877def : Pat<(i32 (mul_wide_signed i16:$a, i16:$b)), 878 (MULWIDES32 i16:$a, i16:$b)>, 879 Requires<[doMulWide]>; 880def : Pat<(i32 (mul_wide_signed Int16Regs:$a, imm:$b)), 881 (MULWIDES32Imm Int16Regs:$a, imm:$b)>, 882 Requires<[doMulWide]>; 883def : Pat<(i32 (mul_wide_unsigned i16:$a, i16:$b)), 884 (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>, 885 Requires<[doMulWide]>; 886def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, imm:$b)), 887 (MULWIDEU32Imm Int16Regs:$a, imm:$b)>, 888 Requires<[doMulWide]>; 889 890def : Pat<(i64 (mul_wide_signed i32:$a, i32:$b)), 891 (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>, 892 Requires<[doMulWide]>; 893def : Pat<(i64 (mul_wide_signed Int32Regs:$a, imm:$b)), 894 (MULWIDES64Imm Int32Regs:$a, imm:$b)>, 895 Requires<[doMulWide]>; 896def : Pat<(i64 (mul_wide_unsigned i32:$a, i32:$b)), 897 (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>, 898 Requires<[doMulWide]>; 899def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, imm:$b)), 900 (MULWIDEU64Imm Int32Regs:$a, imm:$b)>, 901 Requires<[doMulWide]>; 902 903// Predicates used for converting some patterns to mul.wide. 904def SInt32Const : PatLeaf<(imm), [{ 905 const APInt &v = N->getAPIntValue(); 906 return v.isSignedIntN(32); 907}]>; 908 909def UInt32Const : PatLeaf<(imm), [{ 910 const APInt &v = N->getAPIntValue(); 911 return v.isIntN(32); 912}]>; 913 914def SInt16Const : PatLeaf<(imm), [{ 915 const APInt &v = N->getAPIntValue(); 916 return v.isSignedIntN(16); 917}]>; 918 919def UInt16Const : PatLeaf<(imm), [{ 920 const APInt &v = N->getAPIntValue(); 921 return v.isIntN(16); 922}]>; 923 924def IntConst_0_30 : PatLeaf<(imm), [{ 925 // Check if 0 <= v < 31; only then will the result of (x << v) be an int32. 926 const APInt &v = N->getAPIntValue(); 927 return v.sge(0) && v.slt(31); 928}]>; 929 930def IntConst_0_14 : PatLeaf<(imm), [{ 931 // Check if 0 <= v < 15; only then will the result of (x << v) be an int16. 932 const APInt &v = N->getAPIntValue(); 933 return v.sge(0) && v.slt(15); 934}]>; 935 936def SHL2MUL32 : SDNodeXForm<imm, [{ 937 const APInt &v = N->getAPIntValue(); 938 APInt temp(32, 1); 939 return CurDAG->getTargetConstant(temp.shl(v), SDLoc(N), MVT::i32); 940}]>; 941 942def SHL2MUL16 : SDNodeXForm<imm, [{ 943 const APInt &v = N->getAPIntValue(); 944 APInt temp(16, 1); 945 return CurDAG->getTargetConstant(temp.shl(v), SDLoc(N), MVT::i16); 946}]>; 947 948// Convert "sign/zero-extend, then shift left by an immediate" to mul.wide. 949def : Pat<(shl (sext Int32Regs:$a), (i32 IntConst_0_30:$b)), 950 (MULWIDES64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>, 951 Requires<[doMulWide]>; 952def : Pat<(shl (zext Int32Regs:$a), (i32 IntConst_0_30:$b)), 953 (MULWIDEU64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>, 954 Requires<[doMulWide]>; 955 956def : Pat<(shl (sext Int16Regs:$a), (i16 IntConst_0_14:$b)), 957 (MULWIDES32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>, 958 Requires<[doMulWide]>; 959def : Pat<(shl (zext Int16Regs:$a), (i16 IntConst_0_14:$b)), 960 (MULWIDEU32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>, 961 Requires<[doMulWide]>; 962 963// Convert "sign/zero-extend then multiply" to mul.wide. 964def : Pat<(mul (sext Int32Regs:$a), (sext Int32Regs:$b)), 965 (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>, 966 Requires<[doMulWide]>; 967def : Pat<(mul (sext Int32Regs:$a), (i64 SInt32Const:$b)), 968 (MULWIDES64Imm64 Int32Regs:$a, (i64 SInt32Const:$b))>, 969 Requires<[doMulWide]>; 970 971def : Pat<(mul (zext Int32Regs:$a), (zext Int32Regs:$b)), 972 (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>, 973 Requires<[doMulWide]>; 974def : Pat<(mul (zext Int32Regs:$a), (i64 UInt32Const:$b)), 975 (MULWIDEU64Imm64 Int32Regs:$a, (i64 UInt32Const:$b))>, 976 Requires<[doMulWide]>; 977 978def : Pat<(mul (sext Int16Regs:$a), (sext Int16Regs:$b)), 979 (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>, 980 Requires<[doMulWide]>; 981def : Pat<(mul (sext Int16Regs:$a), (i32 SInt16Const:$b)), 982 (MULWIDES32Imm32 Int16Regs:$a, (i32 SInt16Const:$b))>, 983 Requires<[doMulWide]>; 984 985def : Pat<(mul (zext Int16Regs:$a), (zext Int16Regs:$b)), 986 (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>, 987 Requires<[doMulWide]>; 988def : Pat<(mul (zext Int16Regs:$a), (i32 UInt16Const:$b)), 989 (MULWIDEU32Imm32 Int16Regs:$a, (i32 UInt16Const:$b))>, 990 Requires<[doMulWide]>; 991 992// 993// Integer multiply-add 994// 995def SDTIMAD : 996 SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisInt<0>, SDTCisInt<2>, 997 SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>]>; 998def imad : SDNode<"NVPTXISD::IMAD", SDTIMAD>; 999 1000def MAD16rrr : 1001 NVPTXInst<(outs Int16Regs:$dst), 1002 (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c), 1003 "mad.lo.s16 \t$dst, $a, $b, $c;", 1004 [(set Int16Regs:$dst, (imad Int16Regs:$a, Int16Regs:$b, Int16Regs:$c))]>; 1005def MAD16rri : 1006 NVPTXInst<(outs Int16Regs:$dst), 1007 (ins Int16Regs:$a, Int16Regs:$b, i16imm:$c), 1008 "mad.lo.s16 \t$dst, $a, $b, $c;", 1009 [(set Int16Regs:$dst, (imad Int16Regs:$a, Int16Regs:$b, imm:$c))]>; 1010def MAD16rir : 1011 NVPTXInst<(outs Int16Regs:$dst), 1012 (ins Int16Regs:$a, i16imm:$b, Int16Regs:$c), 1013 "mad.lo.s16 \t$dst, $a, $b, $c;", 1014 [(set Int16Regs:$dst, (imad Int16Regs:$a, imm:$b, Int16Regs:$c))]>; 1015def MAD16rii : 1016 NVPTXInst<(outs Int16Regs:$dst), 1017 (ins Int16Regs:$a, i16imm:$b, i16imm:$c), 1018 "mad.lo.s16 \t$dst, $a, $b, $c;", 1019 [(set Int16Regs:$dst, (imad Int16Regs:$a, imm:$b, imm:$c))]>; 1020 1021def MAD32rrr : 1022 NVPTXInst<(outs Int32Regs:$dst), 1023 (ins Int32Regs:$a, Int32Regs:$b, Int32Regs:$c), 1024 "mad.lo.s32 \t$dst, $a, $b, $c;", 1025 [(set Int32Regs:$dst, (imad Int32Regs:$a, Int32Regs:$b, Int32Regs:$c))]>; 1026def MAD32rri : 1027 NVPTXInst<(outs Int32Regs:$dst), 1028 (ins Int32Regs:$a, Int32Regs:$b, i32imm:$c), 1029 "mad.lo.s32 \t$dst, $a, $b, $c;", 1030 [(set Int32Regs:$dst, (imad Int32Regs:$a, Int32Regs:$b, imm:$c))]>; 1031def MAD32rir : 1032 NVPTXInst<(outs Int32Regs:$dst), 1033 (ins Int32Regs:$a, i32imm:$b, Int32Regs:$c), 1034 "mad.lo.s32 \t$dst, $a, $b, $c;", 1035 [(set Int32Regs:$dst, (imad Int32Regs:$a, imm:$b, Int32Regs:$c))]>; 1036def MAD32rii : 1037 NVPTXInst<(outs Int32Regs:$dst), 1038 (ins Int32Regs:$a, i32imm:$b, i32imm:$c), 1039 "mad.lo.s32 \t$dst, $a, $b, $c;", 1040 [(set Int32Regs:$dst, (imad Int32Regs:$a, imm:$b, imm:$c))]>; 1041 1042def MAD64rrr : 1043 NVPTXInst<(outs Int64Regs:$dst), 1044 (ins Int64Regs:$a, Int64Regs:$b, Int64Regs:$c), 1045 "mad.lo.s64 \t$dst, $a, $b, $c;", 1046 [(set Int64Regs:$dst, (imad Int64Regs:$a, Int64Regs:$b, Int64Regs:$c))]>; 1047def MAD64rri : 1048 NVPTXInst<(outs Int64Regs:$dst), 1049 (ins Int64Regs:$a, Int64Regs:$b, i64imm:$c), 1050 "mad.lo.s64 \t$dst, $a, $b, $c;", 1051 [(set Int64Regs:$dst, (imad Int64Regs:$a, Int64Regs:$b, imm:$c))]>; 1052def MAD64rir : 1053 NVPTXInst<(outs Int64Regs:$dst), 1054 (ins Int64Regs:$a, i64imm:$b, Int64Regs:$c), 1055 "mad.lo.s64 \t$dst, $a, $b, $c;", 1056 [(set Int64Regs:$dst, (imad Int64Regs:$a, imm:$b, Int64Regs:$c))]>; 1057def MAD64rii : 1058 NVPTXInst<(outs Int64Regs:$dst), 1059 (ins Int64Regs:$a, i64imm:$b, i64imm:$c), 1060 "mad.lo.s64 \t$dst, $a, $b, $c;", 1061 [(set Int64Regs:$dst, (imad Int64Regs:$a, imm:$b, imm:$c))]>; 1062 1063def INEG16 : 1064 NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src), 1065 "neg.s16 \t$dst, $src;", 1066 [(set Int16Regs:$dst, (ineg Int16Regs:$src))]>; 1067def INEG32 : 1068 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src), 1069 "neg.s32 \t$dst, $src;", 1070 [(set Int32Regs:$dst, (ineg Int32Regs:$src))]>; 1071def INEG64 : 1072 NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src), 1073 "neg.s64 \t$dst, $src;", 1074 [(set Int64Regs:$dst, (ineg Int64Regs:$src))]>; 1075 1076//----------------------------------- 1077// Floating Point Arithmetic 1078//----------------------------------- 1079 1080// Constant 1.0f 1081def FloatConst1 : PatLeaf<(fpimm), [{ 1082 return &N->getValueAPF().getSemantics() == &llvm::APFloat::IEEEsingle() && 1083 N->getValueAPF().convertToFloat() == 1.0f; 1084}]>; 1085// Constant 1.0 (double) 1086def DoubleConst1 : PatLeaf<(fpimm), [{ 1087 return &N->getValueAPF().getSemantics() == &llvm::APFloat::IEEEdouble() && 1088 N->getValueAPF().convertToDouble() == 1.0; 1089}]>; 1090 1091// Loads FP16 constant into a register. 1092// 1093// ptxas does not have hex representation for fp16, so we can't use 1094// fp16 immediate values in .f16 instructions. Instead we have to load 1095// the constant into a register using mov.b16. 1096def LOAD_CONST_F16 : 1097 NVPTXInst<(outs Int16Regs:$dst), (ins f16imm:$a), 1098 "mov.b16 \t$dst, $a;", []>; 1099def LOAD_CONST_BF16 : 1100 NVPTXInst<(outs Int16Regs:$dst), (ins bf16imm:$a), 1101 "mov.b16 \t$dst, $a;", []>; 1102defm FADD : F3_fma_component<"add", fadd>; 1103defm FSUB : F3_fma_component<"sub", fsub>; 1104defm FMUL : F3_fma_component<"mul", fmul>; 1105 1106defm FMIN : F3<"min", fminnum>; 1107defm FMAX : F3<"max", fmaxnum>; 1108// Note: min.NaN.f64 and max.NaN.f64 do not actually exist. 1109defm FMINNAN : F3<"min.NaN", fminimum>; 1110defm FMAXNAN : F3<"max.NaN", fmaximum>; 1111 1112defm FABS : F2<"abs", fabs>; 1113defm FNEG : F2<"neg", fneg>; 1114defm FSQRT : F2<"sqrt.rn", fsqrt>; 1115 1116// 1117// F16 NEG 1118// 1119class FNEG_F16_F16X2<string OpcStr, ValueType T, RegisterClass RC, Predicate Pred> : 1120 NVPTXInst<(outs RC:$dst), (ins RC:$src), 1121 !strconcat(OpcStr, " \t$dst, $src;"), 1122 [(set RC:$dst, (fneg (T RC:$src)))]>, 1123 Requires<[useFP16Math, hasPTX<60>, hasSM<53>, Pred]>; 1124def FNEG16_ftz : FNEG_F16_F16X2<"neg.ftz.f16", f16, Int16Regs, doF32FTZ>; 1125def FNEG16 : FNEG_F16_F16X2<"neg.f16", f16, Int16Regs, True>; 1126def FNEG16x2_ftz : FNEG_F16_F16X2<"neg.ftz.f16x2", v2f16, Int32Regs, doF32FTZ>; 1127def FNEG16x2 : FNEG_F16_F16X2<"neg.f16x2", v2f16, Int32Regs, True>; 1128 1129// 1130// BF16 NEG 1131// 1132 1133class FNEG_BF16_F16X2<string OpcStr, ValueType T, RegisterClass RC, Predicate Pred> : 1134 NVPTXInst<(outs RC:$dst), (ins RC:$src), 1135 !strconcat(OpcStr, " \t$dst, $src;"), 1136 [(set RC:$dst, (fneg (T RC:$src)))]>, 1137 Requires<[hasBF16Math, hasPTX<70>, hasSM<80>, Pred]>; 1138def BFNEG16_ftz : FNEG_BF16_F16X2<"neg.ftz.bf16", bf16, Int16Regs, doF32FTZ>; 1139def BFNEG16 : FNEG_BF16_F16X2<"neg.bf16", bf16, Int16Regs, True>; 1140def BFNEG16x2_ftz : FNEG_BF16_F16X2<"neg.ftz.bf16x2", v2bf16, Int32Regs, doF32FTZ>; 1141def BFNEG16x2 : FNEG_BF16_F16X2<"neg.bf16x2", v2bf16, Int32Regs, True>; 1142 1143// 1144// F64 division 1145// 1146def FDIV641r : 1147 NVPTXInst<(outs Float64Regs:$dst), 1148 (ins f64imm:$a, Float64Regs:$b), 1149 "rcp.rn.f64 \t$dst, $b;", 1150 [(set Float64Regs:$dst, (fdiv DoubleConst1:$a, Float64Regs:$b))]>; 1151def FDIV64rr : 1152 NVPTXInst<(outs Float64Regs:$dst), 1153 (ins Float64Regs:$a, Float64Regs:$b), 1154 "div.rn.f64 \t$dst, $a, $b;", 1155 [(set Float64Regs:$dst, (fdiv Float64Regs:$a, Float64Regs:$b))]>; 1156def FDIV64ri : 1157 NVPTXInst<(outs Float64Regs:$dst), 1158 (ins Float64Regs:$a, f64imm:$b), 1159 "div.rn.f64 \t$dst, $a, $b;", 1160 [(set Float64Regs:$dst, (fdiv Float64Regs:$a, fpimm:$b))]>; 1161 1162// 1163// F32 Approximate reciprocal 1164// 1165def FDIV321r_ftz : 1166 NVPTXInst<(outs Float32Regs:$dst), 1167 (ins f32imm:$a, Float32Regs:$b), 1168 "rcp.approx.ftz.f32 \t$dst, $b;", 1169 [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>, 1170 Requires<[do_DIVF32_APPROX, doF32FTZ]>; 1171def FDIV321r : 1172 NVPTXInst<(outs Float32Regs:$dst), 1173 (ins f32imm:$a, Float32Regs:$b), 1174 "rcp.approx.f32 \t$dst, $b;", 1175 [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>, 1176 Requires<[do_DIVF32_APPROX]>; 1177// 1178// F32 Approximate division 1179// 1180def FDIV32approxrr_ftz : 1181 NVPTXInst<(outs Float32Regs:$dst), 1182 (ins Float32Regs:$a, Float32Regs:$b), 1183 "div.approx.ftz.f32 \t$dst, $a, $b;", 1184 [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>, 1185 Requires<[do_DIVF32_APPROX, doF32FTZ]>; 1186def FDIV32approxri_ftz : 1187 NVPTXInst<(outs Float32Regs:$dst), 1188 (ins Float32Regs:$a, f32imm:$b), 1189 "div.approx.ftz.f32 \t$dst, $a, $b;", 1190 [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>, 1191 Requires<[do_DIVF32_APPROX, doF32FTZ]>; 1192def FDIV32approxrr : 1193 NVPTXInst<(outs Float32Regs:$dst), 1194 (ins Float32Regs:$a, Float32Regs:$b), 1195 "div.approx.f32 \t$dst, $a, $b;", 1196 [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>, 1197 Requires<[do_DIVF32_APPROX]>; 1198def FDIV32approxri : 1199 NVPTXInst<(outs Float32Regs:$dst), 1200 (ins Float32Regs:$a, f32imm:$b), 1201 "div.approx.f32 \t$dst, $a, $b;", 1202 [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>, 1203 Requires<[do_DIVF32_APPROX]>; 1204// 1205// F32 Semi-accurate reciprocal 1206// 1207// rcp.approx gives the same result as div.full(1.0f, a) and is faster. 1208// 1209def FDIV321r_approx_ftz : 1210 NVPTXInst<(outs Float32Regs:$dst), 1211 (ins f32imm:$a, Float32Regs:$b), 1212 "rcp.approx.ftz.f32 \t$dst, $b;", 1213 [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>, 1214 Requires<[do_DIVF32_FULL, doF32FTZ]>; 1215def FDIV321r_approx : 1216 NVPTXInst<(outs Float32Regs:$dst), 1217 (ins f32imm:$a, Float32Regs:$b), 1218 "rcp.approx.f32 \t$dst, $b;", 1219 [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>, 1220 Requires<[do_DIVF32_FULL]>; 1221// 1222// F32 Semi-accurate division 1223// 1224def FDIV32rr_ftz : 1225 NVPTXInst<(outs Float32Regs:$dst), 1226 (ins Float32Regs:$a, Float32Regs:$b), 1227 "div.full.ftz.f32 \t$dst, $a, $b;", 1228 [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>, 1229 Requires<[do_DIVF32_FULL, doF32FTZ]>; 1230def FDIV32ri_ftz : 1231 NVPTXInst<(outs Float32Regs:$dst), 1232 (ins Float32Regs:$a, f32imm:$b), 1233 "div.full.ftz.f32 \t$dst, $a, $b;", 1234 [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>, 1235 Requires<[do_DIVF32_FULL, doF32FTZ]>; 1236def FDIV32rr : 1237 NVPTXInst<(outs Float32Regs:$dst), 1238 (ins Float32Regs:$a, Float32Regs:$b), 1239 "div.full.f32 \t$dst, $a, $b;", 1240 [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>, 1241 Requires<[do_DIVF32_FULL]>; 1242def FDIV32ri : 1243 NVPTXInst<(outs Float32Regs:$dst), 1244 (ins Float32Regs:$a, f32imm:$b), 1245 "div.full.f32 \t$dst, $a, $b;", 1246 [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>, 1247 Requires<[do_DIVF32_FULL]>; 1248// 1249// F32 Accurate reciprocal 1250// 1251def FDIV321r_prec_ftz : 1252 NVPTXInst<(outs Float32Regs:$dst), 1253 (ins f32imm:$a, Float32Regs:$b), 1254 "rcp.rn.ftz.f32 \t$dst, $b;", 1255 [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>, 1256 Requires<[doF32FTZ]>; 1257def FDIV321r_prec : 1258 NVPTXInst<(outs Float32Regs:$dst), 1259 (ins f32imm:$a, Float32Regs:$b), 1260 "rcp.rn.f32 \t$dst, $b;", 1261 [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>; 1262// 1263// F32 Accurate division 1264// 1265def FDIV32rr_prec_ftz : 1266 NVPTXInst<(outs Float32Regs:$dst), 1267 (ins Float32Regs:$a, Float32Regs:$b), 1268 "div.rn.ftz.f32 \t$dst, $a, $b;", 1269 [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>, 1270 Requires<[doF32FTZ]>; 1271def FDIV32ri_prec_ftz : 1272 NVPTXInst<(outs Float32Regs:$dst), 1273 (ins Float32Regs:$a, f32imm:$b), 1274 "div.rn.ftz.f32 \t$dst, $a, $b;", 1275 [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>, 1276 Requires<[doF32FTZ]>; 1277def FDIV32rr_prec : 1278 NVPTXInst<(outs Float32Regs:$dst), 1279 (ins Float32Regs:$a, Float32Regs:$b), 1280 "div.rn.f32 \t$dst, $a, $b;", 1281 [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>; 1282def FDIV32ri_prec : 1283 NVPTXInst<(outs Float32Regs:$dst), 1284 (ins Float32Regs:$a, f32imm:$b), 1285 "div.rn.f32 \t$dst, $a, $b;", 1286 [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>; 1287 1288// 1289// FMA 1290// 1291 1292multiclass FMA<string OpcStr, RegisterClass RC, Operand ImmCls, Predicate Pred> { 1293 def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c), 1294 !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), 1295 [(set RC:$dst, (fma RC:$a, RC:$b, RC:$c))]>, 1296 Requires<[Pred]>; 1297 def rri : NVPTXInst<(outs RC:$dst), 1298 (ins RC:$a, RC:$b, ImmCls:$c), 1299 !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), 1300 [(set RC:$dst, (fma RC:$a, RC:$b, fpimm:$c))]>, 1301 Requires<[Pred]>; 1302 def rir : NVPTXInst<(outs RC:$dst), 1303 (ins RC:$a, ImmCls:$b, RC:$c), 1304 !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), 1305 [(set RC:$dst, (fma RC:$a, fpimm:$b, RC:$c))]>, 1306 Requires<[Pred]>; 1307 def rii : NVPTXInst<(outs RC:$dst), 1308 (ins RC:$a, ImmCls:$b, ImmCls:$c), 1309 !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), 1310 [(set RC:$dst, (fma RC:$a, fpimm:$b, fpimm:$c))]>, 1311 Requires<[Pred]>; 1312} 1313 1314multiclass FMA_F16<string OpcStr, ValueType T, RegisterClass RC, Predicate Pred> { 1315 def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c), 1316 !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), 1317 [(set RC:$dst, (fma (T RC:$a), (T RC:$b), (T RC:$c)))]>, 1318 Requires<[useFP16Math, Pred]>; 1319} 1320 1321multiclass FMA_BF16<string OpcStr, ValueType T, RegisterClass RC, Predicate Pred> { 1322 def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c), 1323 !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), 1324 [(set RC:$dst, (fma (T RC:$a), (T RC:$b), (T RC:$c)))]>, 1325 Requires<[hasBF16Math, Pred]>; 1326} 1327 1328defm FMA16_ftz : FMA_F16<"fma.rn.ftz.f16", f16, Int16Regs, doF32FTZ>; 1329defm FMA16 : FMA_F16<"fma.rn.f16", f16, Int16Regs, True>; 1330defm FMA16x2_ftz : FMA_F16<"fma.rn.ftz.f16x2", v2f16, Int32Regs, doF32FTZ>; 1331defm FMA16x2 : FMA_F16<"fma.rn.f16x2", v2f16, Int32Regs, True>; 1332defm BFMA16_ftz : FMA_BF16<"fma.rn.ftz.bf16", bf16, Int16Regs, doF32FTZ>; 1333defm BFMA16 : FMA_BF16<"fma.rn.bf16", bf16, Int16Regs, True>; 1334defm BFMA16x2_ftz : FMA_BF16<"fma.rn.ftz.bf16x2", v2bf16, Int32Regs, doF32FTZ>; 1335defm BFMA16x2 : FMA_BF16<"fma.rn.bf16x2", v2bf16, Int32Regs, True>; 1336defm FMA32_ftz : FMA<"fma.rn.ftz.f32", Float32Regs, f32imm, doF32FTZ>; 1337defm FMA32 : FMA<"fma.rn.f32", Float32Regs, f32imm, True>; 1338defm FMA64 : FMA<"fma.rn.f64", Float64Regs, f64imm, True>; 1339 1340// sin/cos 1341def SINF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src), 1342 "sin.approx.f32 \t$dst, $src;", 1343 [(set Float32Regs:$dst, (fsin Float32Regs:$src))]>, 1344 Requires<[allowUnsafeFPMath]>; 1345def COSF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src), 1346 "cos.approx.f32 \t$dst, $src;", 1347 [(set Float32Regs:$dst, (fcos Float32Regs:$src))]>, 1348 Requires<[allowUnsafeFPMath]>; 1349 1350// Lower (frem x, y) into (sub x, (mul (ftrunc (div x, y)) y)), 1351// i.e. "poor man's fmod()". When y is infinite, x is returned. This matches the 1352// semantics of LLVM's frem. 1353 1354// frem - f32 FTZ 1355def : Pat<(frem Float32Regs:$x, Float32Regs:$y), 1356 (FSUBf32rr_ftz Float32Regs:$x, (FMULf32rr_ftz (CVT_f32_f32 1357 (FDIV32rr_prec_ftz Float32Regs:$x, Float32Regs:$y), CvtRZI_FTZ), 1358 Float32Regs:$y))>, 1359 Requires<[doF32FTZ, allowUnsafeFPMath]>; 1360def : Pat<(frem Float32Regs:$x, fpimm:$y), 1361 (FSUBf32rr_ftz Float32Regs:$x, (FMULf32ri_ftz (CVT_f32_f32 1362 (FDIV32ri_prec_ftz Float32Regs:$x, fpimm:$y), CvtRZI_FTZ), 1363 fpimm:$y))>, 1364 Requires<[doF32FTZ, allowUnsafeFPMath]>; 1365 1366def : Pat<(frem Float32Regs:$x, Float32Regs:$y), 1367 (SELP_f32rr Float32Regs:$x, 1368 (FSUBf32rr_ftz Float32Regs:$x, (FMULf32rr_ftz (CVT_f32_f32 1369 (FDIV32rr_prec_ftz Float32Regs:$x, Float32Regs:$y), CvtRZI_FTZ), 1370 Float32Regs:$y)), 1371 (TESTINF_f32r Float32Regs:$y))>, 1372 Requires<[doF32FTZ, noUnsafeFPMath]>; 1373def : Pat<(frem Float32Regs:$x, fpimm:$y), 1374 (SELP_f32rr Float32Regs:$x, 1375 (FSUBf32rr_ftz Float32Regs:$x, (FMULf32ri_ftz (CVT_f32_f32 1376 (FDIV32ri_prec_ftz Float32Regs:$x, fpimm:$y), CvtRZI_FTZ), 1377 fpimm:$y)), 1378 (TESTINF_f32i fpimm:$y))>, 1379 Requires<[doF32FTZ, noUnsafeFPMath]>; 1380 1381// frem - f32 1382def : Pat<(frem Float32Regs:$x, Float32Regs:$y), 1383 (FSUBf32rr Float32Regs:$x, (FMULf32rr (CVT_f32_f32 1384 (FDIV32rr_prec Float32Regs:$x, Float32Regs:$y), CvtRZI), 1385 Float32Regs:$y))>, 1386 Requires<[allowUnsafeFPMath]>; 1387def : Pat<(frem Float32Regs:$x, fpimm:$y), 1388 (FSUBf32rr Float32Regs:$x, (FMULf32ri (CVT_f32_f32 1389 (FDIV32ri_prec Float32Regs:$x, fpimm:$y), CvtRZI), 1390 fpimm:$y))>, 1391 Requires<[allowUnsafeFPMath]>; 1392 1393def : Pat<(frem Float32Regs:$x, Float32Regs:$y), 1394 (SELP_f32rr Float32Regs:$x, 1395 (FSUBf32rr Float32Regs:$x, (FMULf32rr (CVT_f32_f32 1396 (FDIV32rr_prec Float32Regs:$x, Float32Regs:$y), CvtRZI), 1397 Float32Regs:$y)), 1398 (TESTINF_f32r Float32Regs:$y))>, 1399 Requires<[noUnsafeFPMath]>; 1400def : Pat<(frem Float32Regs:$x, fpimm:$y), 1401 (SELP_f32rr Float32Regs:$x, 1402 (FSUBf32rr Float32Regs:$x, (FMULf32ri (CVT_f32_f32 1403 (FDIV32ri_prec Float32Regs:$x, fpimm:$y), CvtRZI), 1404 fpimm:$y)), 1405 (TESTINF_f32i fpimm:$y))>, 1406 Requires<[noUnsafeFPMath]>; 1407 1408// frem - f64 1409def : Pat<(frem Float64Regs:$x, Float64Regs:$y), 1410 (FSUBf64rr Float64Regs:$x, (FMULf64rr (CVT_f64_f64 1411 (FDIV64rr Float64Regs:$x, Float64Regs:$y), CvtRZI), 1412 Float64Regs:$y))>, 1413 Requires<[allowUnsafeFPMath]>; 1414def : Pat<(frem Float64Regs:$x, fpimm:$y), 1415 (FSUBf64rr Float64Regs:$x, (FMULf64ri (CVT_f64_f64 1416 (FDIV64ri Float64Regs:$x, fpimm:$y), CvtRZI), 1417 fpimm:$y))>, 1418 Requires<[allowUnsafeFPMath]>; 1419 1420def : Pat<(frem Float64Regs:$x, Float64Regs:$y), 1421 (SELP_f64rr Float64Regs:$x, 1422 (FSUBf64rr Float64Regs:$x, (FMULf64rr (CVT_f64_f64 1423 (FDIV64rr Float64Regs:$x, Float64Regs:$y), CvtRZI), 1424 Float64Regs:$y)), 1425 (TESTINF_f64r Float64Regs:$y))>, 1426 Requires<[noUnsafeFPMath]>; 1427def : Pat<(frem Float64Regs:$x, fpimm:$y), 1428 (SELP_f64rr Float64Regs:$x, 1429 (FSUBf64rr Float64Regs:$x, (FMULf64ri (CVT_f64_f64 1430 (FDIV64ri Float64Regs:$x, fpimm:$y), CvtRZI), 1431 fpimm:$y)), 1432 (TESTINF_f64r Float64Regs:$y))>, 1433 Requires<[noUnsafeFPMath]>; 1434 1435//----------------------------------- 1436// Bitwise operations 1437//----------------------------------- 1438 1439// Template for three-arg bitwise operations. Takes three args, Creates .b16, 1440// .b32, .b64, and .pred (predicate registers -- i.e., i1) versions of OpcStr. 1441multiclass BITWISE<string OpcStr, SDNode OpNode> { 1442 def b1rr : 1443 NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b), 1444 !strconcat(OpcStr, ".pred \t$dst, $a, $b;"), 1445 [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>; 1446 def b1ri : 1447 NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b), 1448 !strconcat(OpcStr, ".pred \t$dst, $a, $b;"), 1449 [(set Int1Regs:$dst, (OpNode Int1Regs:$a, imm:$b))]>; 1450 def b16rr : 1451 NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b), 1452 !strconcat(OpcStr, ".b16 \t$dst, $a, $b;"), 1453 [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>; 1454 def b16ri : 1455 NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b), 1456 !strconcat(OpcStr, ".b16 \t$dst, $a, $b;"), 1457 [(set Int16Regs:$dst, (OpNode Int16Regs:$a, imm:$b))]>; 1458 def b32rr : 1459 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), 1460 !strconcat(OpcStr, ".b32 \t$dst, $a, $b;"), 1461 [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>; 1462 def b32ri : 1463 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b), 1464 !strconcat(OpcStr, ".b32 \t$dst, $a, $b;"), 1465 [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>; 1466 def b64rr : 1467 NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b), 1468 !strconcat(OpcStr, ".b64 \t$dst, $a, $b;"), 1469 [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>; 1470 def b64ri : 1471 NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b), 1472 !strconcat(OpcStr, ".b64 \t$dst, $a, $b;"), 1473 [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>; 1474} 1475 1476defm OR : BITWISE<"or", or>; 1477defm AND : BITWISE<"and", and>; 1478defm XOR : BITWISE<"xor", xor>; 1479 1480def NOT1 : NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$src), 1481 "not.pred \t$dst, $src;", 1482 [(set Int1Regs:$dst, (not Int1Regs:$src))]>; 1483def NOT16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src), 1484 "not.b16 \t$dst, $src;", 1485 [(set Int16Regs:$dst, (not Int16Regs:$src))]>; 1486def NOT32 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src), 1487 "not.b32 \t$dst, $src;", 1488 [(set Int32Regs:$dst, (not Int32Regs:$src))]>; 1489def NOT64 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src), 1490 "not.b64 \t$dst, $src;", 1491 [(set Int64Regs:$dst, (not Int64Regs:$src))]>; 1492 1493// Template for left/right shifts. Takes three operands, 1494// [dest (reg), src (reg), shift (reg or imm)]. 1495// dest and src may be int64, int32, or int16, but shift is always int32. 1496// 1497// This template also defines a 32-bit shift (imm, imm) instruction. 1498multiclass SHIFT<string OpcStr, SDNode OpNode> { 1499 def i64rr : 1500 NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int32Regs:$b), 1501 !strconcat(OpcStr, "64 \t$dst, $a, $b;"), 1502 [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int32Regs:$b))]>; 1503 def i64ri : 1504 NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i32imm:$b), 1505 !strconcat(OpcStr, "64 \t$dst, $a, $b;"), 1506 [(set Int64Regs:$dst, (OpNode Int64Regs:$a, (i32 imm:$b)))]>; 1507 def i32rr : 1508 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), 1509 !strconcat(OpcStr, "32 \t$dst, $a, $b;"), 1510 [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>; 1511 def i32ri : 1512 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b), 1513 !strconcat(OpcStr, "32 \t$dst, $a, $b;"), 1514 [(set Int32Regs:$dst, (OpNode Int32Regs:$a, (i32 imm:$b)))]>; 1515 def i32ii : 1516 NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$a, i32imm:$b), 1517 !strconcat(OpcStr, "32 \t$dst, $a, $b;"), 1518 [(set Int32Regs:$dst, (OpNode (i32 imm:$a), (i32 imm:$b)))]>; 1519 def i16rr : 1520 NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int32Regs:$b), 1521 !strconcat(OpcStr, "16 \t$dst, $a, $b;"), 1522 [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int32Regs:$b))]>; 1523 def i16ri : 1524 NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i32imm:$b), 1525 !strconcat(OpcStr, "16 \t$dst, $a, $b;"), 1526 [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (i32 imm:$b)))]>; 1527} 1528 1529defm SHL : SHIFT<"shl.b", shl>; 1530defm SRA : SHIFT<"shr.s", sra>; 1531defm SRL : SHIFT<"shr.u", srl>; 1532 1533// Bit-reverse 1534def BREV32 : 1535 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a), 1536 "brev.b32 \t$dst, $a;", 1537 [(set Int32Regs:$dst, (bitreverse Int32Regs:$a))]>; 1538def BREV64 : 1539 NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a), 1540 "brev.b64 \t$dst, $a;", 1541 [(set Int64Regs:$dst, (bitreverse Int64Regs:$a))]>; 1542 1543// 1544// Rotate: Use ptx shf instruction if available. 1545// 1546 1547// 32 bit r2 = rotl r1, n 1548// => 1549// r2 = shf.l r1, r1, n 1550def ROTL32imm_hw : 1551 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt), 1552 "shf.l.wrap.b32 \t$dst, $src, $src, $amt;", 1553 [(set Int32Regs:$dst, (rotl Int32Regs:$src, (i32 imm:$amt)))]>, 1554 Requires<[hasHWROT32]>; 1555 1556def ROTL32reg_hw : 1557 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt), 1558 "shf.l.wrap.b32 \t$dst, $src, $src, $amt;", 1559 [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>, 1560 Requires<[hasHWROT32]>; 1561 1562// 32 bit r2 = rotr r1, n 1563// => 1564// r2 = shf.r r1, r1, n 1565def ROTR32imm_hw : 1566 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt), 1567 "shf.r.wrap.b32 \t$dst, $src, $src, $amt;", 1568 [(set Int32Regs:$dst, (rotr Int32Regs:$src, (i32 imm:$amt)))]>, 1569 Requires<[hasHWROT32]>; 1570 1571def ROTR32reg_hw : 1572 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt), 1573 "shf.r.wrap.b32 \t$dst, $src, $src, $amt;", 1574 [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>, 1575 Requires<[hasHWROT32]>; 1576 1577// 32-bit software rotate by immediate. $amt2 should equal 32 - $amt1. 1578def ROT32imm_sw : 1579 NVPTXInst<(outs Int32Regs:$dst), 1580 (ins Int32Regs:$src, i32imm:$amt1, i32imm:$amt2), 1581 "{{\n\t" 1582 ".reg .b32 %lhs;\n\t" 1583 ".reg .b32 %rhs;\n\t" 1584 "shl.b32 \t%lhs, $src, $amt1;\n\t" 1585 "shr.b32 \t%rhs, $src, $amt2;\n\t" 1586 "add.u32 \t$dst, %lhs, %rhs;\n\t" 1587 "}}", 1588 []>; 1589 1590def SUB_FRM_32 : SDNodeXForm<imm, [{ 1591 return CurDAG->getTargetConstant(32 - N->getZExtValue(), SDLoc(N), MVT::i32); 1592}]>; 1593 1594def : Pat<(rotl Int32Regs:$src, (i32 imm:$amt)), 1595 (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>, 1596 Requires<[noHWROT32]>; 1597def : Pat<(rotr Int32Regs:$src, (i32 imm:$amt)), 1598 (ROT32imm_sw Int32Regs:$src, (SUB_FRM_32 node:$amt), imm:$amt)>, 1599 Requires<[noHWROT32]>; 1600 1601// 32-bit software rotate left by register. 1602def ROTL32reg_sw : 1603 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt), 1604 "{{\n\t" 1605 ".reg .b32 %lhs;\n\t" 1606 ".reg .b32 %rhs;\n\t" 1607 ".reg .b32 %amt2;\n\t" 1608 "shl.b32 \t%lhs, $src, $amt;\n\t" 1609 "sub.s32 \t%amt2, 32, $amt;\n\t" 1610 "shr.b32 \t%rhs, $src, %amt2;\n\t" 1611 "add.u32 \t$dst, %lhs, %rhs;\n\t" 1612 "}}", 1613 [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>, 1614 Requires<[noHWROT32]>; 1615 1616// 32-bit software rotate right by register. 1617def ROTR32reg_sw : 1618 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt), 1619 "{{\n\t" 1620 ".reg .b32 %lhs;\n\t" 1621 ".reg .b32 %rhs;\n\t" 1622 ".reg .b32 %amt2;\n\t" 1623 "shr.b32 \t%lhs, $src, $amt;\n\t" 1624 "sub.s32 \t%amt2, 32, $amt;\n\t" 1625 "shl.b32 \t%rhs, $src, %amt2;\n\t" 1626 "add.u32 \t$dst, %lhs, %rhs;\n\t" 1627 "}}", 1628 [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>, 1629 Requires<[noHWROT32]>; 1630 1631// 64-bit software rotate by immediate. $amt2 should equal 64 - $amt1. 1632def ROT64imm_sw : 1633 NVPTXInst<(outs Int64Regs:$dst), 1634 (ins Int64Regs:$src, i32imm:$amt1, i32imm:$amt2), 1635 "{{\n\t" 1636 ".reg .b64 %lhs;\n\t" 1637 ".reg .b64 %rhs;\n\t" 1638 "shl.b64 \t%lhs, $src, $amt1;\n\t" 1639 "shr.b64 \t%rhs, $src, $amt2;\n\t" 1640 "add.u64 \t$dst, %lhs, %rhs;\n\t" 1641 "}}", 1642 []>; 1643 1644def SUB_FRM_64 : SDNodeXForm<imm, [{ 1645 return CurDAG->getTargetConstant(64-N->getZExtValue(), SDLoc(N), MVT::i32); 1646}]>; 1647 1648def : Pat<(rotl Int64Regs:$src, (i32 imm:$amt)), 1649 (ROT64imm_sw Int64Regs:$src, imm:$amt, (SUB_FRM_64 node:$amt))>; 1650def : Pat<(rotr Int64Regs:$src, (i32 imm:$amt)), 1651 (ROT64imm_sw Int64Regs:$src, (SUB_FRM_64 node:$amt), imm:$amt)>; 1652 1653// 64-bit software rotate left by register. 1654def ROTL64reg_sw : 1655 NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt), 1656 "{{\n\t" 1657 ".reg .b64 %lhs;\n\t" 1658 ".reg .b64 %rhs;\n\t" 1659 ".reg .u32 %amt2;\n\t" 1660 "shl.b64 \t%lhs, $src, $amt;\n\t" 1661 "sub.u32 \t%amt2, 64, $amt;\n\t" 1662 "shr.b64 \t%rhs, $src, %amt2;\n\t" 1663 "add.u64 \t$dst, %lhs, %rhs;\n\t" 1664 "}}", 1665 [(set Int64Regs:$dst, (rotl Int64Regs:$src, Int32Regs:$amt))]>; 1666 1667def ROTR64reg_sw : 1668 NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt), 1669 "{{\n\t" 1670 ".reg .b64 %lhs;\n\t" 1671 ".reg .b64 %rhs;\n\t" 1672 ".reg .u32 %amt2;\n\t" 1673 "shr.b64 \t%lhs, $src, $amt;\n\t" 1674 "sub.u32 \t%amt2, 64, $amt;\n\t" 1675 "shl.b64 \t%rhs, $src, %amt2;\n\t" 1676 "add.u64 \t$dst, %lhs, %rhs;\n\t" 1677 "}}", 1678 [(set Int64Regs:$dst, (rotr Int64Regs:$src, Int32Regs:$amt))]>; 1679 1680// 1681// Funnnel shift in clamp mode 1682// 1683 1684// Create SDNodes so they can be used in the DAG code, e.g. 1685// NVPTXISelLowering (LowerShiftLeftParts and LowerShiftRightParts) 1686def FUN_SHFL_CLAMP : SDNode<"NVPTXISD::FUN_SHFL_CLAMP", SDTIntShiftDOp, []>; 1687def FUN_SHFR_CLAMP : SDNode<"NVPTXISD::FUN_SHFR_CLAMP", SDTIntShiftDOp, []>; 1688 1689def FUNSHFLCLAMP : 1690 NVPTXInst<(outs Int32Regs:$dst), 1691 (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt), 1692 "shf.l.clamp.b32 \t$dst, $lo, $hi, $amt;", 1693 [(set Int32Regs:$dst, 1694 (FUN_SHFL_CLAMP Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt))]>; 1695 1696def FUNSHFRCLAMP : 1697 NVPTXInst<(outs Int32Regs:$dst), 1698 (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt), 1699 "shf.r.clamp.b32 \t$dst, $lo, $hi, $amt;", 1700 [(set Int32Regs:$dst, 1701 (FUN_SHFR_CLAMP Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt))]>; 1702 1703// 1704// BFE - bit-field extract 1705// 1706 1707// Template for BFE instructions. Takes four args, 1708// [dest (reg), src (reg), start (reg or imm), end (reg or imm)]. 1709// Start may be an imm only if end is also an imm. FIXME: Is this a 1710// restriction in PTX? 1711// 1712// dest and src may be int32 or int64, but start and end are always int32. 1713multiclass BFE<string TyStr, RegisterClass RC> { 1714 def rrr 1715 : NVPTXInst<(outs RC:$d), 1716 (ins RC:$a, Int32Regs:$b, Int32Regs:$c), 1717 !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>; 1718 def rri 1719 : NVPTXInst<(outs RC:$d), 1720 (ins RC:$a, Int32Regs:$b, i32imm:$c), 1721 !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>; 1722 def rii 1723 : NVPTXInst<(outs RC:$d), 1724 (ins RC:$a, i32imm:$b, i32imm:$c), 1725 !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>; 1726} 1727 1728let hasSideEffects = false in { 1729 defm BFE_S32 : BFE<"s32", Int32Regs>; 1730 defm BFE_U32 : BFE<"u32", Int32Regs>; 1731 defm BFE_S64 : BFE<"s64", Int64Regs>; 1732 defm BFE_U64 : BFE<"u64", Int64Regs>; 1733} 1734 1735//----------------------------------- 1736// Comparison instructions (setp, set) 1737//----------------------------------- 1738 1739// FIXME: This doesn't cover versions of set and setp that combine with a 1740// boolean predicate, e.g. setp.eq.and.b16. 1741 1742let hasSideEffects = false in { 1743 multiclass SETP<string TypeStr, RegisterClass RC, Operand ImmCls> { 1744 def rr : 1745 NVPTXInst<(outs Int1Regs:$dst), (ins RC:$a, RC:$b, CmpMode:$cmp), 1746 !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr, 1747 " \t$dst, $a, $b;"), []>; 1748 def ri : 1749 NVPTXInst<(outs Int1Regs:$dst), (ins RC:$a, ImmCls:$b, CmpMode:$cmp), 1750 !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr, 1751 " \t$dst, $a, $b;"), []>; 1752 def ir : 1753 NVPTXInst<(outs Int1Regs:$dst), (ins ImmCls:$a, RC:$b, CmpMode:$cmp), 1754 !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr, 1755 " \t$dst, $a, $b;"), []>; 1756 } 1757} 1758 1759defm SETP_b16 : SETP<"b16", Int16Regs, i16imm>; 1760defm SETP_s16 : SETP<"s16", Int16Regs, i16imm>; 1761defm SETP_u16 : SETP<"u16", Int16Regs, i16imm>; 1762defm SETP_b32 : SETP<"b32", Int32Regs, i32imm>; 1763defm SETP_s32 : SETP<"s32", Int32Regs, i32imm>; 1764defm SETP_u32 : SETP<"u32", Int32Regs, i32imm>; 1765defm SETP_b64 : SETP<"b64", Int64Regs, i64imm>; 1766defm SETP_s64 : SETP<"s64", Int64Regs, i64imm>; 1767defm SETP_u64 : SETP<"u64", Int64Regs, i64imm>; 1768defm SETP_f32 : SETP<"f32", Float32Regs, f32imm>; 1769defm SETP_f64 : SETP<"f64", Float64Regs, f64imm>; 1770def SETP_f16rr : 1771 NVPTXInst<(outs Int1Regs:$dst), 1772 (ins Int16Regs:$a, Int16Regs:$b, CmpMode:$cmp), 1773 "setp${cmp:base}${cmp:ftz}.f16 \t$dst, $a, $b;", 1774 []>, Requires<[useFP16Math]>; 1775 1776def SETP_f16x2rr : 1777 NVPTXInst<(outs Int1Regs:$p, Int1Regs:$q), 1778 (ins Int32Regs:$a, Int32Regs:$b, CmpMode:$cmp), 1779 "setp${cmp:base}${cmp:ftz}.f16x2 \t$p|$q, $a, $b;", 1780 []>, 1781 Requires<[useFP16Math]>; 1782def SETP_bf16rr : 1783 NVPTXInst<(outs Int1Regs:$dst), 1784 (ins Int16Regs:$a, Int16Regs:$b, CmpMode:$cmp), 1785 "setp${cmp:base}${cmp:ftz}.bf16 \t$dst, $a, $b;", 1786 []>, Requires<[hasBF16Math]>; 1787 1788def SETP_bf16x2rr : 1789 NVPTXInst<(outs Int1Regs:$p, Int1Regs:$q), 1790 (ins Int32Regs:$a, Int32Regs:$b, CmpMode:$cmp), 1791 "setp${cmp:base}${cmp:ftz}.bf16x2 \t$p|$q, $a, $b;", 1792 []>, 1793 Requires<[hasBF16Math]>; 1794 1795 1796// FIXME: This doesn't appear to be correct. The "set" mnemonic has the form 1797// "set.CmpOp{.ftz}.dtype.stype", where dtype is the type of the destination 1798// reg, either u32, s32, or f32. Anyway these aren't used at the moment. 1799 1800let hasSideEffects = false in { 1801 multiclass SET<string TypeStr, RegisterClass RC, Operand ImmCls> { 1802 def rr : NVPTXInst<(outs Int32Regs:$dst), 1803 (ins RC:$a, RC:$b, CmpMode:$cmp), 1804 !strconcat("set$cmp.", TypeStr, " \t$dst, $a, $b;"), []>; 1805 def ri : NVPTXInst<(outs Int32Regs:$dst), 1806 (ins RC:$a, ImmCls:$b, CmpMode:$cmp), 1807 !strconcat("set$cmp.", TypeStr, " \t$dst, $a, $b;"), []>; 1808 def ir : NVPTXInst<(outs Int32Regs:$dst), 1809 (ins ImmCls:$a, RC:$b, CmpMode:$cmp), 1810 !strconcat("set$cmp.", TypeStr, " \t$dst, $a, $b;"), []>; 1811 } 1812} 1813 1814defm SET_b16 : SET<"b16", Int16Regs, i16imm>; 1815defm SET_s16 : SET<"s16", Int16Regs, i16imm>; 1816defm SET_u16 : SET<"u16", Int16Regs, i16imm>; 1817defm SET_b32 : SET<"b32", Int32Regs, i32imm>; 1818defm SET_s32 : SET<"s32", Int32Regs, i32imm>; 1819defm SET_u32 : SET<"u32", Int32Regs, i32imm>; 1820defm SET_b64 : SET<"b64", Int64Regs, i64imm>; 1821defm SET_s64 : SET<"s64", Int64Regs, i64imm>; 1822defm SET_u64 : SET<"u64", Int64Regs, i64imm>; 1823defm SET_f16 : SET<"f16", Int16Regs, f16imm>; 1824defm SET_bf16 : SET<"bf16", Int16Regs, bf16imm>; 1825defm SET_f32 : SET<"f32", Float32Regs, f32imm>; 1826defm SET_f64 : SET<"f64", Float64Regs, f64imm>; 1827 1828//----------------------------------- 1829// Data Movement (Load / Store, Move) 1830//----------------------------------- 1831 1832def ADDRri : ComplexPattern<i32, 2, "SelectADDRri", [frameindex], 1833 [SDNPWantRoot]>; 1834def ADDRri64 : ComplexPattern<i64, 2, "SelectADDRri64", [frameindex], 1835 [SDNPWantRoot]>; 1836def ADDRvar : ComplexPattern<iPTR, 1, "SelectDirectAddr", [], []>; 1837 1838def MEMri : Operand<i32> { 1839 let PrintMethod = "printMemOperand"; 1840 let MIOperandInfo = (ops Int32Regs, i32imm); 1841} 1842def MEMri64 : Operand<i64> { 1843 let PrintMethod = "printMemOperand"; 1844 let MIOperandInfo = (ops Int64Regs, i64imm); 1845} 1846 1847def imem : Operand<iPTR> { 1848 let PrintMethod = "printOperand"; 1849} 1850 1851def imemAny : Operand<iPTRAny> { 1852 let PrintMethod = "printOperand"; 1853} 1854 1855def LdStCode : Operand<i32> { 1856 let PrintMethod = "printLdStCode"; 1857} 1858 1859def MmaCode : Operand<i32> { 1860 let PrintMethod = "printMmaCode"; 1861} 1862 1863def SDTWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>; 1864def Wrapper : SDNode<"NVPTXISD::Wrapper", SDTWrapper>; 1865 1866// Load a memory address into a u32 or u64 register. 1867def MOV_ADDR : NVPTXInst<(outs Int32Regs:$dst), (ins imem:$a), 1868 "mov.u32 \t$dst, $a;", 1869 [(set Int32Regs:$dst, (Wrapper tglobaladdr:$a))]>; 1870def MOV_ADDR64 : NVPTXInst<(outs Int64Regs:$dst), (ins imem:$a), 1871 "mov.u64 \t$dst, $a;", 1872 [(set Int64Regs:$dst, (Wrapper tglobaladdr:$a))]>; 1873 1874// Get pointer to local stack. 1875let hasSideEffects = false in { 1876 def MOV_DEPOT_ADDR : NVPTXInst<(outs Int32Regs:$d), (ins i32imm:$num), 1877 "mov.u32 \t$d, __local_depot$num;", []>; 1878 def MOV_DEPOT_ADDR_64 : NVPTXInst<(outs Int64Regs:$d), (ins i32imm:$num), 1879 "mov.u64 \t$d, __local_depot$num;", []>; 1880} 1881 1882 1883// copyPhysreg is hard-coded in NVPTXInstrInfo.cpp 1884let IsSimpleMove=1, hasSideEffects=0 in { 1885 def IMOV1rr : NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$sss), 1886 "mov.pred \t$dst, $sss;", []>; 1887 def IMOV16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$sss), 1888 "mov.u16 \t$dst, $sss;", []>; 1889 def IMOV32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$sss), 1890 "mov.u32 \t$dst, $sss;", []>; 1891 def IMOV64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$sss), 1892 "mov.u64 \t$dst, $sss;", []>; 1893 1894 def IMOVB16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$sss), 1895 "mov.b16 \t$dst, $sss;", []>; 1896 def IMOVB32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$sss), 1897 "mov.b32 \t$dst, $sss;", []>; 1898 def IMOVB64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$sss), 1899 "mov.b64 \t$dst, $sss;", []>; 1900 1901 def FMOV16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src), 1902 // We have to use .b16 here as there's no mov.f16. 1903 "mov.b16 \t$dst, $src;", []>; 1904 def FMOV32rr : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src), 1905 "mov.f32 \t$dst, $src;", []>; 1906 def FMOV64rr : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$src), 1907 "mov.f64 \t$dst, $src;", []>; 1908} 1909 1910def IMOV1ri : NVPTXInst<(outs Int1Regs:$dst), (ins i1imm:$src), 1911 "mov.pred \t$dst, $src;", 1912 [(set Int1Regs:$dst, imm:$src)]>; 1913def IMOV16ri : NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src), 1914 "mov.u16 \t$dst, $src;", 1915 [(set Int16Regs:$dst, imm:$src)]>; 1916def IMOV32ri : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src), 1917 "mov.u32 \t$dst, $src;", 1918 [(set Int32Regs:$dst, imm:$src)]>; 1919def IMOV64ri : NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src), 1920 "mov.u64 \t$dst, $src;", 1921 [(set Int64Regs:$dst, imm:$src)]>; 1922 1923def IMOVB16ri : NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src), 1924 "mov.b16 \t$dst, $src;", []>; 1925def IMOVB32ri : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src), 1926 "mov.b32 \t$dst, $src;", []>; 1927def IMOVB64ri : NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src), 1928 "mov.b64 \t$dst, $src;", []>; 1929 1930def FMOV32ri : NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$src), 1931 "mov.f32 \t$dst, $src;", 1932 [(set Float32Regs:$dst, fpimm:$src)]>; 1933def FMOV64ri : NVPTXInst<(outs Float64Regs:$dst), (ins f64imm:$src), 1934 "mov.f64 \t$dst, $src;", 1935 [(set Float64Regs:$dst, fpimm:$src)]>; 1936 1937def : Pat<(i32 (Wrapper texternalsym:$dst)), (IMOV32ri texternalsym:$dst)>; 1938def : Pat<(i64 (Wrapper texternalsym:$dst)), (IMOV64ri texternalsym:$dst)>; 1939 1940//---- Copy Frame Index ---- 1941def LEA_ADDRi : NVPTXInst<(outs Int32Regs:$dst), (ins MEMri:$addr), 1942 "add.u32 \t$dst, ${addr:add};", 1943 [(set Int32Regs:$dst, ADDRri:$addr)]>; 1944def LEA_ADDRi64 : NVPTXInst<(outs Int64Regs:$dst), (ins MEMri64:$addr), 1945 "add.u64 \t$dst, ${addr:add};", 1946 [(set Int64Regs:$dst, ADDRri64:$addr)]>; 1947 1948//----------------------------------- 1949// Comparison and Selection 1950//----------------------------------- 1951 1952multiclass ISET_FORMAT<PatFrag OpNode, PatLeaf Mode, 1953 Instruction setp_16rr, 1954 Instruction setp_16ri, 1955 Instruction setp_16ir, 1956 Instruction setp_32rr, 1957 Instruction setp_32ri, 1958 Instruction setp_32ir, 1959 Instruction setp_64rr, 1960 Instruction setp_64ri, 1961 Instruction setp_64ir, 1962 Instruction set_16rr, 1963 Instruction set_16ri, 1964 Instruction set_16ir, 1965 Instruction set_32rr, 1966 Instruction set_32ri, 1967 Instruction set_32ir, 1968 Instruction set_64rr, 1969 Instruction set_64ri, 1970 Instruction set_64ir> { 1971 // i16 -> pred 1972 def : Pat<(i1 (OpNode i16:$a, i16:$b)), 1973 (setp_16rr Int16Regs:$a, Int16Regs:$b, Mode)>; 1974 def : Pat<(i1 (OpNode Int16Regs:$a, imm:$b)), 1975 (setp_16ri Int16Regs:$a, imm:$b, Mode)>; 1976 def : Pat<(i1 (OpNode imm:$a, Int16Regs:$b)), 1977 (setp_16ir imm:$a, Int16Regs:$b, Mode)>; 1978 // i32 -> pred 1979 def : Pat<(i1 (OpNode i32:$a, i32:$b)), 1980 (setp_32rr Int32Regs:$a, Int32Regs:$b, Mode)>; 1981 def : Pat<(i1 (OpNode Int32Regs:$a, imm:$b)), 1982 (setp_32ri Int32Regs:$a, imm:$b, Mode)>; 1983 def : Pat<(i1 (OpNode imm:$a, Int32Regs:$b)), 1984 (setp_32ir imm:$a, Int32Regs:$b, Mode)>; 1985 // i64 -> pred 1986 def : Pat<(i1 (OpNode Int64Regs:$a, Int64Regs:$b)), 1987 (setp_64rr Int64Regs:$a, Int64Regs:$b, Mode)>; 1988 def : Pat<(i1 (OpNode Int64Regs:$a, imm:$b)), 1989 (setp_64ri Int64Regs:$a, imm:$b, Mode)>; 1990 def : Pat<(i1 (OpNode imm:$a, Int64Regs:$b)), 1991 (setp_64ir imm:$a, Int64Regs:$b, Mode)>; 1992 1993 // i16 -> i32 1994 def : Pat<(i32 (OpNode i16:$a, i16:$b)), 1995 (set_16rr Int16Regs:$a, Int16Regs:$b, Mode)>; 1996 def : Pat<(i32 (OpNode Int16Regs:$a, imm:$b)), 1997 (set_16ri Int16Regs:$a, imm:$b, Mode)>; 1998 def : Pat<(i32 (OpNode imm:$a, Int16Regs:$b)), 1999 (set_16ir imm:$a, Int16Regs:$b, Mode)>; 2000 // i32 -> i32 2001 def : Pat<(i32 (OpNode i32:$a, i32:$b)), 2002 (set_32rr Int32Regs:$a, Int32Regs:$b, Mode)>; 2003 def : Pat<(i32 (OpNode Int32Regs:$a, imm:$b)), 2004 (set_32ri Int32Regs:$a, imm:$b, Mode)>; 2005 def : Pat<(i32 (OpNode imm:$a, Int32Regs:$b)), 2006 (set_32ir imm:$a, Int32Regs:$b, Mode)>; 2007 // i64 -> i32 2008 def : Pat<(i32 (OpNode Int64Regs:$a, Int64Regs:$b)), 2009 (set_64rr Int64Regs:$a, Int64Regs:$b, Mode)>; 2010 def : Pat<(i32 (OpNode Int64Regs:$a, imm:$b)), 2011 (set_64ri Int64Regs:$a, imm:$b, Mode)>; 2012 def : Pat<(i32 (OpNode imm:$a, Int64Regs:$b)), 2013 (set_64ir imm:$a, Int64Regs:$b, Mode)>; 2014} 2015 2016multiclass ISET_FORMAT_SIGNED<PatFrag OpNode, PatLeaf Mode> 2017 : ISET_FORMAT<OpNode, Mode, 2018 SETP_s16rr, SETP_s16ri, SETP_s16ir, 2019 SETP_s32rr, SETP_s32ri, SETP_s32ir, 2020 SETP_s64rr, SETP_s64ri, SETP_s64ir, 2021 SET_s16rr, SET_s16ri, SET_s16ir, 2022 SET_s32rr, SET_s32ri, SET_s32ir, 2023 SET_s64rr, SET_s64ri, SET_s64ir> { 2024 // TableGen doesn't like empty multiclasses. 2025 def : PatLeaf<(i32 0)>; 2026} 2027 2028multiclass ISET_FORMAT_UNSIGNED<PatFrag OpNode, PatLeaf Mode> 2029 : ISET_FORMAT<OpNode, Mode, 2030 SETP_u16rr, SETP_u16ri, SETP_u16ir, 2031 SETP_u32rr, SETP_u32ri, SETP_u32ir, 2032 SETP_u64rr, SETP_u64ri, SETP_u64ir, 2033 SET_u16rr, SET_u16ri, SET_u16ir, 2034 SET_u32rr, SET_u32ri, SET_u32ir, 2035 SET_u64rr, SET_u64ri, SET_u64ir> { 2036 // TableGen doesn't like empty multiclasses. 2037 def : PatLeaf<(i32 0)>; 2038} 2039 2040defm : ISET_FORMAT_SIGNED<setgt, CmpGT>; 2041defm : ISET_FORMAT_SIGNED<setlt, CmpLT>; 2042defm : ISET_FORMAT_SIGNED<setge, CmpGE>; 2043defm : ISET_FORMAT_SIGNED<setle, CmpLE>; 2044defm : ISET_FORMAT_SIGNED<seteq, CmpEQ>; 2045defm : ISET_FORMAT_SIGNED<setne, CmpNE>; 2046defm : ISET_FORMAT_UNSIGNED<setugt, CmpGT>; 2047defm : ISET_FORMAT_UNSIGNED<setult, CmpLT>; 2048defm : ISET_FORMAT_UNSIGNED<setuge, CmpGE>; 2049defm : ISET_FORMAT_UNSIGNED<setule, CmpLE>; 2050defm : ISET_FORMAT_UNSIGNED<setueq, CmpEQ>; 2051defm : ISET_FORMAT_UNSIGNED<setune, CmpNE>; 2052 2053// i1 compares 2054def : Pat<(setne Int1Regs:$a, Int1Regs:$b), 2055 (XORb1rr Int1Regs:$a, Int1Regs:$b)>; 2056def : Pat<(setune Int1Regs:$a, Int1Regs:$b), 2057 (XORb1rr Int1Regs:$a, Int1Regs:$b)>; 2058 2059def : Pat<(seteq Int1Regs:$a, Int1Regs:$b), 2060 (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>; 2061def : Pat<(setueq Int1Regs:$a, Int1Regs:$b), 2062 (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>; 2063 2064// i1 compare -> i32 2065def : Pat<(i32 (setne Int1Regs:$a, Int1Regs:$b)), 2066 (SELP_u32ii -1, 0, (XORb1rr Int1Regs:$a, Int1Regs:$b))>; 2067def : Pat<(i32 (setne Int1Regs:$a, Int1Regs:$b)), 2068 (SELP_u32ii 0, -1, (XORb1rr Int1Regs:$a, Int1Regs:$b))>; 2069 2070 2071 2072multiclass FSET_FORMAT<PatFrag OpNode, PatLeaf Mode, PatLeaf ModeFTZ> { 2073 // f16 -> pred 2074 def : Pat<(i1 (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b))), 2075 (SETP_f16rr Int16Regs:$a, Int16Regs:$b, ModeFTZ)>, 2076 Requires<[useFP16Math,doF32FTZ]>; 2077 def : Pat<(i1 (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b))), 2078 (SETP_f16rr Int16Regs:$a, Int16Regs:$b, Mode)>, 2079 Requires<[useFP16Math]>; 2080 def : Pat<(i1 (OpNode (f16 Int16Regs:$a), fpimm:$b)), 2081 (SETP_f16rr Int16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>, 2082 Requires<[useFP16Math,doF32FTZ]>; 2083 def : Pat<(i1 (OpNode (f16 Int16Regs:$a), fpimm:$b)), 2084 (SETP_f16rr Int16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>, 2085 Requires<[useFP16Math]>; 2086 def : Pat<(i1 (OpNode fpimm:$a, (f16 Int16Regs:$b))), 2087 (SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Int16Regs:$b, ModeFTZ)>, 2088 Requires<[useFP16Math,doF32FTZ]>; 2089 def : Pat<(i1 (OpNode fpimm:$a, (f16 Int16Regs:$b))), 2090 (SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Int16Regs:$b, Mode)>, 2091 Requires<[useFP16Math]>; 2092 2093 // bf16 -> pred 2094 def : Pat<(i1 (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b))), 2095 (SETP_bf16rr Int16Regs:$a, Int16Regs:$b, ModeFTZ)>, 2096 Requires<[hasBF16Math,doF32FTZ]>; 2097 def : Pat<(i1 (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b))), 2098 (SETP_bf16rr Int16Regs:$a, Int16Regs:$b, Mode)>, 2099 Requires<[hasBF16Math]>; 2100 def : Pat<(i1 (OpNode (bf16 Int16Regs:$a), fpimm:$b)), 2101 (SETP_bf16rr Int16Regs:$a, (LOAD_CONST_BF16 fpimm:$b), ModeFTZ)>, 2102 Requires<[hasBF16Math,doF32FTZ]>; 2103 def : Pat<(i1 (OpNode (bf16 Int16Regs:$a), fpimm:$b)), 2104 (SETP_bf16rr Int16Regs:$a, (LOAD_CONST_BF16 fpimm:$b), Mode)>, 2105 Requires<[hasBF16Math]>; 2106 def : Pat<(i1 (OpNode fpimm:$a, (bf16 Int16Regs:$b))), 2107 (SETP_bf16rr (LOAD_CONST_BF16 fpimm:$a), Int16Regs:$b, ModeFTZ)>, 2108 Requires<[hasBF16Math,doF32FTZ]>; 2109 def : Pat<(i1 (OpNode fpimm:$a, (bf16 Int16Regs:$b))), 2110 (SETP_bf16rr (LOAD_CONST_BF16 fpimm:$a), Int16Regs:$b, Mode)>, 2111 Requires<[hasBF16Math]>; 2112 2113 // f32 -> pred 2114 def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)), 2115 (SETP_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>, 2116 Requires<[doF32FTZ]>; 2117 def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)), 2118 (SETP_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>; 2119 def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)), 2120 (SETP_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>, 2121 Requires<[doF32FTZ]>; 2122 def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)), 2123 (SETP_f32ri Float32Regs:$a, fpimm:$b, Mode)>; 2124 def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)), 2125 (SETP_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>, 2126 Requires<[doF32FTZ]>; 2127 def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)), 2128 (SETP_f32ir fpimm:$a, Float32Regs:$b, Mode)>; 2129 2130 // f64 -> pred 2131 def : Pat<(i1 (OpNode Float64Regs:$a, Float64Regs:$b)), 2132 (SETP_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>; 2133 def : Pat<(i1 (OpNode Float64Regs:$a, fpimm:$b)), 2134 (SETP_f64ri Float64Regs:$a, fpimm:$b, Mode)>; 2135 def : Pat<(i1 (OpNode fpimm:$a, Float64Regs:$b)), 2136 (SETP_f64ir fpimm:$a, Float64Regs:$b, Mode)>; 2137 2138 // f16 -> i32 2139 def : Pat<(i32 (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b))), 2140 (SET_f16rr Int16Regs:$a, Int16Regs:$b, ModeFTZ)>, 2141 Requires<[useFP16Math, doF32FTZ]>; 2142 def : Pat<(i32 (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b))), 2143 (SET_f16rr Int16Regs:$a, Int16Regs:$b, Mode)>, 2144 Requires<[useFP16Math]>; 2145 def : Pat<(i32 (OpNode (f16 Int16Regs:$a), fpimm:$b)), 2146 (SET_f16rr Int16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>, 2147 Requires<[useFP16Math, doF32FTZ]>; 2148 def : Pat<(i32 (OpNode (f16 Int16Regs:$a), fpimm:$b)), 2149 (SET_f16rr Int16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>, 2150 Requires<[useFP16Math]>; 2151 def : Pat<(i32 (OpNode fpimm:$a, (f16 Int16Regs:$b))), 2152 (SET_f16ir (LOAD_CONST_F16 fpimm:$a), Int16Regs:$b, ModeFTZ)>, 2153 Requires<[useFP16Math, doF32FTZ]>; 2154 def : Pat<(i32 (OpNode fpimm:$a, (f16 Int16Regs:$b))), 2155 (SET_f16ir (LOAD_CONST_F16 fpimm:$a), Int16Regs:$b, Mode)>, 2156 Requires<[useFP16Math]>; 2157 2158 // bf16 -> i32 2159 def : Pat<(i32 (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b))), 2160 (SET_bf16rr Int16Regs:$a, Int16Regs:$b, ModeFTZ)>, 2161 Requires<[hasBF16Math, doF32FTZ]>; 2162 def : Pat<(i32 (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b))), 2163 (SET_bf16rr Int16Regs:$a, Int16Regs:$b, Mode)>, 2164 Requires<[hasBF16Math]>; 2165 def : Pat<(i32 (OpNode (bf16 Int16Regs:$a), fpimm:$b)), 2166 (SET_bf16rr Int16Regs:$a, (LOAD_CONST_BF16 fpimm:$b), ModeFTZ)>, 2167 Requires<[hasBF16Math, doF32FTZ]>; 2168 def : Pat<(i32 (OpNode (bf16 Int16Regs:$a), fpimm:$b)), 2169 (SET_bf16rr Int16Regs:$a, (LOAD_CONST_BF16 fpimm:$b), Mode)>, 2170 Requires<[hasBF16Math]>; 2171 def : Pat<(i32 (OpNode fpimm:$a, (bf16 Int16Regs:$b))), 2172 (SET_bf16ir (LOAD_CONST_BF16 fpimm:$a), Int16Regs:$b, ModeFTZ)>, 2173 Requires<[hasBF16Math, doF32FTZ]>; 2174 def : Pat<(i32 (OpNode fpimm:$a, (bf16 Int16Regs:$b))), 2175 (SET_bf16ir (LOAD_CONST_BF16 fpimm:$a), Int16Regs:$b, Mode)>, 2176 Requires<[hasBF16Math]>; 2177 2178 // f32 -> i32 2179 def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)), 2180 (SET_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>, 2181 Requires<[doF32FTZ]>; 2182 def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)), 2183 (SET_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>; 2184 def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)), 2185 (SET_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>, 2186 Requires<[doF32FTZ]>; 2187 def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)), 2188 (SET_f32ri Float32Regs:$a, fpimm:$b, Mode)>; 2189 def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)), 2190 (SET_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>, 2191 Requires<[doF32FTZ]>; 2192 def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)), 2193 (SET_f32ir fpimm:$a, Float32Regs:$b, Mode)>; 2194 2195 // f64 -> i32 2196 def : Pat<(i32 (OpNode Float64Regs:$a, Float64Regs:$b)), 2197 (SET_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>; 2198 def : Pat<(i32 (OpNode Float64Regs:$a, fpimm:$b)), 2199 (SET_f64ri Float64Regs:$a, fpimm:$b, Mode)>; 2200 def : Pat<(i32 (OpNode fpimm:$a, Float64Regs:$b)), 2201 (SET_f64ir fpimm:$a, Float64Regs:$b, Mode)>; 2202} 2203 2204defm FSetOGT : FSET_FORMAT<setogt, CmpGT, CmpGT_FTZ>; 2205defm FSetOLT : FSET_FORMAT<setolt, CmpLT, CmpLT_FTZ>; 2206defm FSetOGE : FSET_FORMAT<setoge, CmpGE, CmpGE_FTZ>; 2207defm FSetOLE : FSET_FORMAT<setole, CmpLE, CmpLE_FTZ>; 2208defm FSetOEQ : FSET_FORMAT<setoeq, CmpEQ, CmpEQ_FTZ>; 2209defm FSetONE : FSET_FORMAT<setone, CmpNE, CmpNE_FTZ>; 2210 2211defm FSetUGT : FSET_FORMAT<setugt, CmpGTU, CmpGTU_FTZ>; 2212defm FSetULT : FSET_FORMAT<setult, CmpLTU, CmpLTU_FTZ>; 2213defm FSetUGE : FSET_FORMAT<setuge, CmpGEU, CmpGEU_FTZ>; 2214defm FSetULE : FSET_FORMAT<setule, CmpLEU, CmpLEU_FTZ>; 2215defm FSetUEQ : FSET_FORMAT<setueq, CmpEQU, CmpEQU_FTZ>; 2216defm FSetUNE : FSET_FORMAT<setune, CmpNEU, CmpNEU_FTZ>; 2217 2218defm FSetGT : FSET_FORMAT<setgt, CmpGT, CmpGT_FTZ>; 2219defm FSetLT : FSET_FORMAT<setlt, CmpLT, CmpLT_FTZ>; 2220defm FSetGE : FSET_FORMAT<setge, CmpGE, CmpGE_FTZ>; 2221defm FSetLE : FSET_FORMAT<setle, CmpLE, CmpLE_FTZ>; 2222defm FSetEQ : FSET_FORMAT<seteq, CmpEQ, CmpEQ_FTZ>; 2223defm FSetNE : FSET_FORMAT<setne, CmpNE, CmpNE_FTZ>; 2224 2225defm FSetNUM : FSET_FORMAT<seto, CmpNUM, CmpNUM_FTZ>; 2226defm FSetNAN : FSET_FORMAT<setuo, CmpNAN, CmpNAN_FTZ>; 2227 2228// FIXME: What is this doing here? Can it be deleted? 2229// def ld_param : SDNode<"NVPTXISD::LOAD_PARAM", SDTLoad, 2230// [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; 2231 2232def SDTDeclareParamProfile : 2233 SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>; 2234def SDTDeclareScalarParamProfile : 2235 SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>; 2236def SDTLoadParamProfile : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>; 2237def SDTLoadParamV2Profile : SDTypeProfile<2, 2, [SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisInt<3>]>; 2238def SDTLoadParamV4Profile : SDTypeProfile<4, 2, [SDTCisInt<4>, SDTCisInt<5>]>; 2239def SDTPrintCallProfile : SDTypeProfile<0, 1, [SDTCisInt<0>]>; 2240def SDTPrintCallUniProfile : SDTypeProfile<0, 1, [SDTCisInt<0>]>; 2241def SDTStoreParamProfile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>; 2242def SDTStoreParamV2Profile : SDTypeProfile<0, 4, [SDTCisInt<0>, SDTCisInt<1>]>; 2243def SDTStoreParamV4Profile : SDTypeProfile<0, 6, [SDTCisInt<0>, SDTCisInt<1>]>; 2244def SDTStoreParam32Profile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>; 2245def SDTCallArgProfile : SDTypeProfile<0, 2, [SDTCisInt<0>]>; 2246def SDTCallArgMarkProfile : SDTypeProfile<0, 0, []>; 2247def SDTCallVoidProfile : SDTypeProfile<0, 1, []>; 2248def SDTCallValProfile : SDTypeProfile<1, 0, []>; 2249def SDTMoveParamProfile : SDTypeProfile<1, 1, []>; 2250def SDTStoreRetvalProfile : SDTypeProfile<0, 2, [SDTCisInt<0>]>; 2251def SDTStoreRetvalV2Profile : SDTypeProfile<0, 3, [SDTCisInt<0>]>; 2252def SDTStoreRetvalV4Profile : SDTypeProfile<0, 5, [SDTCisInt<0>]>; 2253def SDTPseudoUseParamProfile : SDTypeProfile<0, 1, []>; 2254def SDTProxyRegProfile : SDTypeProfile<1, 1, []>; 2255 2256def DeclareParam : 2257 SDNode<"NVPTXISD::DeclareParam", SDTDeclareParamProfile, 2258 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; 2259def DeclareScalarParam : 2260 SDNode<"NVPTXISD::DeclareScalarParam", SDTDeclareScalarParamProfile, 2261 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; 2262def DeclareRetParam : 2263 SDNode<"NVPTXISD::DeclareRetParam", SDTDeclareParamProfile, 2264 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; 2265def DeclareRet : 2266 SDNode<"NVPTXISD::DeclareRet", SDTDeclareScalarParamProfile, 2267 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; 2268def LoadParam : 2269 SDNode<"NVPTXISD::LoadParam", SDTLoadParamProfile, 2270 [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>; 2271def LoadParamV2 : 2272 SDNode<"NVPTXISD::LoadParamV2", SDTLoadParamV2Profile, 2273 [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>; 2274def LoadParamV4 : 2275 SDNode<"NVPTXISD::LoadParamV4", SDTLoadParamV4Profile, 2276 [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>; 2277def PrintCall : 2278 SDNode<"NVPTXISD::PrintCall", SDTPrintCallProfile, 2279 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; 2280def PrintConvergentCall : 2281 SDNode<"NVPTXISD::PrintConvergentCall", SDTPrintCallProfile, 2282 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; 2283def PrintCallUni : 2284 SDNode<"NVPTXISD::PrintCallUni", SDTPrintCallUniProfile, 2285 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; 2286def PrintConvergentCallUni : 2287 SDNode<"NVPTXISD::PrintConvergentCallUni", SDTPrintCallUniProfile, 2288 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; 2289def StoreParam : 2290 SDNode<"NVPTXISD::StoreParam", SDTStoreParamProfile, 2291 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; 2292def StoreParamV2 : 2293 SDNode<"NVPTXISD::StoreParamV2", SDTStoreParamV2Profile, 2294 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; 2295def StoreParamV4 : 2296 SDNode<"NVPTXISD::StoreParamV4", SDTStoreParamV4Profile, 2297 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; 2298def StoreParamU32 : 2299 SDNode<"NVPTXISD::StoreParamU32", SDTStoreParam32Profile, 2300 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; 2301def StoreParamS32 : 2302 SDNode<"NVPTXISD::StoreParamS32", SDTStoreParam32Profile, 2303 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; 2304def CallArgBegin : 2305 SDNode<"NVPTXISD::CallArgBegin", SDTCallArgMarkProfile, 2306 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; 2307def CallArg : 2308 SDNode<"NVPTXISD::CallArg", SDTCallArgProfile, 2309 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; 2310def LastCallArg : 2311 SDNode<"NVPTXISD::LastCallArg", SDTCallArgProfile, 2312 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; 2313def CallArgEnd : 2314 SDNode<"NVPTXISD::CallArgEnd", SDTCallVoidProfile, 2315 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; 2316def CallVoid : 2317 SDNode<"NVPTXISD::CallVoid", SDTCallVoidProfile, 2318 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; 2319def Prototype : 2320 SDNode<"NVPTXISD::Prototype", SDTCallVoidProfile, 2321 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; 2322def CallVal : 2323 SDNode<"NVPTXISD::CallVal", SDTCallValProfile, 2324 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; 2325def MoveParam : 2326 SDNode<"NVPTXISD::MoveParam", SDTMoveParamProfile, []>; 2327def StoreRetval : 2328 SDNode<"NVPTXISD::StoreRetval", SDTStoreRetvalProfile, 2329 [SDNPHasChain, SDNPSideEffect]>; 2330def StoreRetvalV2 : 2331 SDNode<"NVPTXISD::StoreRetvalV2", SDTStoreRetvalV2Profile, 2332 [SDNPHasChain, SDNPSideEffect]>; 2333def StoreRetvalV4 : 2334 SDNode<"NVPTXISD::StoreRetvalV4", SDTStoreRetvalV4Profile, 2335 [SDNPHasChain, SDNPSideEffect]>; 2336def PseudoUseParam : 2337 SDNode<"NVPTXISD::PseudoUseParam", SDTPseudoUseParamProfile, 2338 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; 2339def RETURNNode : 2340 SDNode<"NVPTXISD::RETURN", SDTCallArgMarkProfile, 2341 [SDNPHasChain, SDNPSideEffect]>; 2342def ProxyReg : 2343 SDNode<"NVPTXISD::ProxyReg", SDTProxyRegProfile, 2344 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; 2345 2346let mayLoad = true in { 2347 class LoadParamMemInst<NVPTXRegClass regclass, string opstr> : 2348 NVPTXInst<(outs regclass:$dst), (ins i32imm:$b), 2349 !strconcat("ld.param", opstr, " \t$dst, [retval0+$b];"), 2350 []>; 2351 2352 class LoadParamV2MemInst<NVPTXRegClass regclass, string opstr> : 2353 NVPTXInst<(outs regclass:$dst, regclass:$dst2), (ins i32imm:$b), 2354 !strconcat("ld.param.v2", opstr, 2355 " \t{{$dst, $dst2}}, [retval0+$b];"), []>; 2356 2357 class LoadParamV4MemInst<NVPTXRegClass regclass, string opstr> : 2358 NVPTXInst<(outs regclass:$dst, regclass:$dst2, regclass:$dst3, 2359 regclass:$dst4), 2360 (ins i32imm:$b), 2361 !strconcat("ld.param.v4", opstr, 2362 " \t{{$dst, $dst2, $dst3, $dst4}}, [retval0+$b];"), 2363 []>; 2364} 2365 2366class LoadParamRegInst<NVPTXRegClass regclass, string opstr> : 2367 NVPTXInst<(outs regclass:$dst), (ins i32imm:$b), 2368 !strconcat("mov", opstr, " \t$dst, retval$b;"), 2369 [(set regclass:$dst, (LoadParam (i32 0), (i32 imm:$b)))]>; 2370 2371let mayStore = true in { 2372 class StoreParamInst<NVPTXRegClass regclass, string opstr> : 2373 NVPTXInst<(outs), (ins regclass:$val, i32imm:$a, i32imm:$b), 2374 !strconcat("st.param", opstr, " \t[param$a+$b], $val;"), 2375 []>; 2376 2377 class StoreParamV2Inst<NVPTXRegClass regclass, string opstr> : 2378 NVPTXInst<(outs), (ins regclass:$val, regclass:$val2, 2379 i32imm:$a, i32imm:$b), 2380 !strconcat("st.param.v2", opstr, 2381 " \t[param$a+$b], {{$val, $val2}};"), 2382 []>; 2383 2384 class StoreParamV4Inst<NVPTXRegClass regclass, string opstr> : 2385 NVPTXInst<(outs), (ins regclass:$val, regclass:$val2, regclass:$val3, 2386 regclass:$val4, i32imm:$a, 2387 i32imm:$b), 2388 !strconcat("st.param.v4", opstr, 2389 " \t[param$a+$b], {{$val, $val2, $val3, $val4}};"), 2390 []>; 2391 2392 class StoreRetvalInst<NVPTXRegClass regclass, string opstr> : 2393 NVPTXInst<(outs), (ins regclass:$val, i32imm:$a), 2394 !strconcat("st.param", opstr, " \t[func_retval0+$a], $val;"), 2395 []>; 2396 2397 class StoreRetvalV2Inst<NVPTXRegClass regclass, string opstr> : 2398 NVPTXInst<(outs), (ins regclass:$val, regclass:$val2, i32imm:$a), 2399 !strconcat("st.param.v2", opstr, 2400 " \t[func_retval0+$a], {{$val, $val2}};"), 2401 []>; 2402 2403 class StoreRetvalV4Inst<NVPTXRegClass regclass, string opstr> : 2404 NVPTXInst<(outs), 2405 (ins regclass:$val, regclass:$val2, regclass:$val3, 2406 regclass:$val4, i32imm:$a), 2407 !strconcat("st.param.v4", opstr, 2408 " \t[func_retval0+$a], {{$val, $val2, $val3, $val4}};"), 2409 []>; 2410} 2411 2412let isCall=1 in { 2413 multiclass CALL<string OpcStr, SDNode OpNode> { 2414 def PrintCallNoRetInst : NVPTXInst<(outs), (ins), 2415 !strconcat(OpcStr, " "), [(OpNode (i32 0))]>; 2416 def PrintCallRetInst1 : NVPTXInst<(outs), (ins), 2417 !strconcat(OpcStr, " (retval0), "), [(OpNode (i32 1))]>; 2418 def PrintCallRetInst2 : NVPTXInst<(outs), (ins), 2419 !strconcat(OpcStr, " (retval0, retval1), "), [(OpNode (i32 2))]>; 2420 def PrintCallRetInst3 : NVPTXInst<(outs), (ins), 2421 !strconcat(OpcStr, " (retval0, retval1, retval2), "), [(OpNode (i32 3))]>; 2422 def PrintCallRetInst4 : NVPTXInst<(outs), (ins), 2423 !strconcat(OpcStr, " (retval0, retval1, retval2, retval3), "), 2424 [(OpNode (i32 4))]>; 2425 def PrintCallRetInst5 : NVPTXInst<(outs), (ins), 2426 !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4), "), 2427 [(OpNode (i32 5))]>; 2428 def PrintCallRetInst6 : NVPTXInst<(outs), (ins), 2429 !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4, " 2430 "retval5), "), 2431 [(OpNode (i32 6))]>; 2432 def PrintCallRetInst7 : NVPTXInst<(outs), (ins), 2433 !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4, " 2434 "retval5, retval6), "), 2435 [(OpNode (i32 7))]>; 2436 def PrintCallRetInst8 : NVPTXInst<(outs), (ins), 2437 !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4, " 2438 "retval5, retval6, retval7), "), 2439 [(OpNode (i32 8))]>; 2440 } 2441} 2442 2443defm Call : CALL<"call", PrintCall>; 2444defm CallUni : CALL<"call.uni", PrintCallUni>; 2445 2446// Convergent call instructions. These are identical to regular calls, except 2447// they have the isConvergent bit set. 2448let isConvergent=1 in { 2449 defm ConvergentCall : CALL<"call", PrintConvergentCall>; 2450 defm ConvergentCallUni : CALL<"call.uni", PrintConvergentCallUni>; 2451} 2452 2453def LoadParamMemI64 : LoadParamMemInst<Int64Regs, ".b64">; 2454def LoadParamMemI32 : LoadParamMemInst<Int32Regs, ".b32">; 2455def LoadParamMemI16 : LoadParamMemInst<Int16Regs, ".b16">; 2456def LoadParamMemI8 : LoadParamMemInst<Int16Regs, ".b8">; 2457def LoadParamMemV2I64 : LoadParamV2MemInst<Int64Regs, ".b64">; 2458def LoadParamMemV2I32 : LoadParamV2MemInst<Int32Regs, ".b32">; 2459def LoadParamMemV2I16 : LoadParamV2MemInst<Int16Regs, ".b16">; 2460def LoadParamMemV2I8 : LoadParamV2MemInst<Int16Regs, ".b8">; 2461def LoadParamMemV4I32 : LoadParamV4MemInst<Int32Regs, ".b32">; 2462def LoadParamMemV4I16 : LoadParamV4MemInst<Int16Regs, ".b16">; 2463def LoadParamMemV4I8 : LoadParamV4MemInst<Int16Regs, ".b8">; 2464def LoadParamMemF32 : LoadParamMemInst<Float32Regs, ".f32">; 2465def LoadParamMemF64 : LoadParamMemInst<Float64Regs, ".f64">; 2466def LoadParamMemV2F32 : LoadParamV2MemInst<Float32Regs, ".f32">; 2467def LoadParamMemV2F64 : LoadParamV2MemInst<Float64Regs, ".f64">; 2468def LoadParamMemV4F32 : LoadParamV4MemInst<Float32Regs, ".f32">; 2469 2470def StoreParamI64 : StoreParamInst<Int64Regs, ".b64">; 2471def StoreParamI32 : StoreParamInst<Int32Regs, ".b32">; 2472 2473def StoreParamI16 : StoreParamInst<Int16Regs, ".b16">; 2474def StoreParamI8 : StoreParamInst<Int16Regs, ".b8">; 2475def StoreParamV2I64 : StoreParamV2Inst<Int64Regs, ".b64">; 2476def StoreParamV2I32 : StoreParamV2Inst<Int32Regs, ".b32">; 2477def StoreParamV2I16 : StoreParamV2Inst<Int16Regs, ".b16">; 2478def StoreParamV2I8 : StoreParamV2Inst<Int16Regs, ".b8">; 2479 2480def StoreParamV4I32 : StoreParamV4Inst<Int32Regs, ".b32">; 2481def StoreParamV4I16 : StoreParamV4Inst<Int16Regs, ".b16">; 2482def StoreParamV4I8 : StoreParamV4Inst<Int16Regs, ".b8">; 2483 2484def StoreParamF32 : StoreParamInst<Float32Regs, ".f32">; 2485def StoreParamF64 : StoreParamInst<Float64Regs, ".f64">; 2486def StoreParamV2F32 : StoreParamV2Inst<Float32Regs, ".f32">; 2487def StoreParamV2F64 : StoreParamV2Inst<Float64Regs, ".f64">; 2488def StoreParamV4F32 : StoreParamV4Inst<Float32Regs, ".f32">; 2489 2490def StoreRetvalI64 : StoreRetvalInst<Int64Regs, ".b64">; 2491def StoreRetvalI32 : StoreRetvalInst<Int32Regs, ".b32">; 2492def StoreRetvalI16 : StoreRetvalInst<Int16Regs, ".b16">; 2493def StoreRetvalI8 : StoreRetvalInst<Int16Regs, ".b8">; 2494def StoreRetvalV2I64 : StoreRetvalV2Inst<Int64Regs, ".b64">; 2495def StoreRetvalV2I32 : StoreRetvalV2Inst<Int32Regs, ".b32">; 2496def StoreRetvalV2I16 : StoreRetvalV2Inst<Int16Regs, ".b16">; 2497def StoreRetvalV2I8 : StoreRetvalV2Inst<Int16Regs, ".b8">; 2498def StoreRetvalV4I32 : StoreRetvalV4Inst<Int32Regs, ".b32">; 2499def StoreRetvalV4I16 : StoreRetvalV4Inst<Int16Regs, ".b16">; 2500def StoreRetvalV4I8 : StoreRetvalV4Inst<Int16Regs, ".b8">; 2501 2502def StoreRetvalF64 : StoreRetvalInst<Float64Regs, ".f64">; 2503def StoreRetvalF32 : StoreRetvalInst<Float32Regs, ".f32">; 2504def StoreRetvalV2F64 : StoreRetvalV2Inst<Float64Regs, ".f64">; 2505def StoreRetvalV2F32 : StoreRetvalV2Inst<Float32Regs, ".f32">; 2506def StoreRetvalV4F32 : StoreRetvalV4Inst<Float32Regs, ".f32">; 2507 2508def CallArgBeginInst : NVPTXInst<(outs), (ins), "(", [(CallArgBegin)]>; 2509def CallArgEndInst1 : NVPTXInst<(outs), (ins), ");", [(CallArgEnd (i32 1))]>; 2510def CallArgEndInst0 : NVPTXInst<(outs), (ins), ")", [(CallArgEnd (i32 0))]>; 2511def RETURNInst : NVPTXInst<(outs), (ins), "ret;", [(RETURNNode)]>; 2512 2513class CallArgInst<NVPTXRegClass regclass> : 2514 NVPTXInst<(outs), (ins regclass:$a), "$a, ", 2515 [(CallArg (i32 0), regclass:$a)]>; 2516 2517class CallArgInstVT<NVPTXRegClass regclass, ValueType vt> : 2518 NVPTXInst<(outs), (ins regclass:$a), "$a, ", 2519 [(CallArg (i32 0), vt:$a)]>; 2520 2521class LastCallArgInst<NVPTXRegClass regclass> : 2522 NVPTXInst<(outs), (ins regclass:$a), "$a", 2523 [(LastCallArg (i32 0), regclass:$a)]>; 2524class LastCallArgInstVT<NVPTXRegClass regclass, ValueType vt> : 2525 NVPTXInst<(outs), (ins regclass:$a), "$a", 2526 [(LastCallArg (i32 0), vt:$a)]>; 2527 2528def CallArgI64 : CallArgInst<Int64Regs>; 2529def CallArgI32 : CallArgInstVT<Int32Regs, i32>; 2530def CallArgI16 : CallArgInstVT<Int16Regs, i16>; 2531def CallArgF64 : CallArgInst<Float64Regs>; 2532def CallArgF32 : CallArgInst<Float32Regs>; 2533 2534def LastCallArgI64 : LastCallArgInst<Int64Regs>; 2535def LastCallArgI32 : LastCallArgInstVT<Int32Regs, i32>; 2536def LastCallArgI16 : LastCallArgInstVT<Int16Regs, i16>; 2537def LastCallArgF64 : LastCallArgInst<Float64Regs>; 2538def LastCallArgF32 : LastCallArgInst<Float32Regs>; 2539 2540def CallArgI32imm : NVPTXInst<(outs), (ins i32imm:$a), "$a, ", 2541 [(CallArg (i32 0), (i32 imm:$a))]>; 2542def LastCallArgI32imm : NVPTXInst<(outs), (ins i32imm:$a), "$a", 2543 [(LastCallArg (i32 0), (i32 imm:$a))]>; 2544 2545def CallArgParam : NVPTXInst<(outs), (ins i32imm:$a), "param$a, ", 2546 [(CallArg (i32 1), (i32 imm:$a))]>; 2547def LastCallArgParam : NVPTXInst<(outs), (ins i32imm:$a), "param$a", 2548 [(LastCallArg (i32 1), (i32 imm:$a))]>; 2549 2550def CallVoidInst : NVPTXInst<(outs), (ins imem:$addr), "$addr, ", 2551 [(CallVoid (Wrapper tglobaladdr:$addr))]>; 2552def CallVoidInstReg : NVPTXInst<(outs), (ins Int32Regs:$addr), "$addr, ", 2553 [(CallVoid i32:$addr)]>; 2554def CallVoidInstReg64 : NVPTXInst<(outs), (ins Int64Regs:$addr), "$addr, ", 2555 [(CallVoid Int64Regs:$addr)]>; 2556def PrototypeInst : NVPTXInst<(outs), (ins i32imm:$val), ", prototype_$val;", 2557 [(Prototype (i32 imm:$val))]>; 2558 2559def DeclareRetMemInst : 2560 NVPTXInst<(outs), (ins i32imm:$align, i32imm:$size, i32imm:$num), 2561 ".param .align $align .b8 retval$num[$size];", 2562 [(DeclareRetParam (i32 imm:$align), (i32 imm:$size), (i32 imm:$num))]>; 2563def DeclareRetScalarInst : 2564 NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num), 2565 ".param .b$size retval$num;", 2566 [(DeclareRet (i32 1), (i32 imm:$size), (i32 imm:$num))]>; 2567def DeclareRetRegInst : 2568 NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num), 2569 ".reg .b$size retval$num;", 2570 [(DeclareRet (i32 2), (i32 imm:$size), (i32 imm:$num))]>; 2571 2572def DeclareParamInst : 2573 NVPTXInst<(outs), (ins i32imm:$align, i32imm:$a, i32imm:$size), 2574 ".param .align $align .b8 param$a[$size];", 2575 [(DeclareParam (i32 imm:$align), (i32 imm:$a), (i32 imm:$size))]>; 2576def DeclareScalarParamInst : 2577 NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size), 2578 ".param .b$size param$a;", 2579 [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 0))]>; 2580def DeclareScalarRegInst : 2581 NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size), 2582 ".reg .b$size param$a;", 2583 [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 1))]>; 2584 2585class MoveParamInst<ValueType T, NVPTXRegClass regclass, string asmstr> : 2586 NVPTXInst<(outs regclass:$dst), (ins regclass:$src), 2587 !strconcat("mov", asmstr, " \t$dst, $src;"), 2588 [(set (T regclass:$dst), (MoveParam (T regclass:$src)))]>; 2589 2590class MoveParamSymbolInst<NVPTXRegClass regclass, Operand srcty, ValueType vt, 2591 string asmstr> : 2592 NVPTXInst<(outs regclass:$dst), (ins srcty:$src), 2593 !strconcat("mov", asmstr, " \t$dst, $src;"), 2594 [(set vt:$dst, (MoveParam texternalsym:$src))]>; 2595 2596def MoveParamI64 : MoveParamInst<i64, Int64Regs, ".b64">; 2597def MoveParamI32 : MoveParamInst<i32, Int32Regs, ".b32">; 2598 2599def MoveParamSymbolI64 : MoveParamSymbolInst<Int64Regs, i64imm, i64, ".b64">; 2600def MoveParamSymbolI32 : MoveParamSymbolInst<Int32Regs, i32imm, i32, ".b32">; 2601 2602def MoveParamI16 : 2603 NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src), 2604 "cvt.u16.u32 \t$dst, $src;", // ??? Why cvt.u16.u32 ? 2605 [(set i16:$dst, (MoveParam i16:$src))]>; 2606def MoveParamF64 : MoveParamInst<f64, Float64Regs, ".f64">; 2607def MoveParamF32 : MoveParamInst<f32, Float32Regs, ".f32">; 2608 2609class PseudoUseParamInst<NVPTXRegClass regclass, ValueType vt> : 2610 NVPTXInst<(outs), (ins regclass:$src), 2611 "// Pseudo use of $src", 2612 [(PseudoUseParam vt:$src)]>; 2613 2614def PseudoUseParamI64 : PseudoUseParamInst<Int64Regs, i64>; 2615def PseudoUseParamI32 : PseudoUseParamInst<Int32Regs, i32>; 2616def PseudoUseParamI16 : PseudoUseParamInst<Int16Regs, i16>; 2617def PseudoUseParamF64 : PseudoUseParamInst<Float64Regs, f64>; 2618def PseudoUseParamF32 : PseudoUseParamInst<Float32Regs, f32>; 2619 2620class ProxyRegInst<string SzStr, ValueType T, NVPTXRegClass regclass> : 2621 NVPTXInst<(outs regclass:$dst), (ins regclass:$src), 2622 !strconcat("mov.", SzStr, " \t$dst, $src;"), 2623 [(set (T regclass:$dst), (ProxyReg (T regclass:$src)))]>; 2624 2625def ProxyRegI1 : ProxyRegInst<"pred", i1, Int1Regs>; 2626def ProxyRegI16 : ProxyRegInst<"b16", i16, Int16Regs>; 2627def ProxyRegI32 : ProxyRegInst<"b32", i32, Int32Regs>; 2628def ProxyRegI64 : ProxyRegInst<"b64", i64, Int64Regs>; 2629def ProxyRegF32 : ProxyRegInst<"f32", f32, Float32Regs>; 2630def ProxyRegF64 : ProxyRegInst<"f64", f64, Float64Regs>; 2631 2632foreach vt = [f16, bf16] in { 2633 def: Pat<(vt (ProxyReg vt:$src)), (ProxyRegI16 Int16Regs:$src)>; 2634} 2635 2636foreach vt = [v2f16, v2bf16] in { 2637 def: Pat<(vt (ProxyReg vt:$src)), (ProxyRegI32 Int32Regs:$src)>; 2638} 2639 2640// 2641// Load / Store Handling 2642// 2643multiclass LD<NVPTXRegClass regclass> { 2644 def _avar : NVPTXInst< 2645 (outs regclass:$dst), 2646 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 2647 i32imm:$fromWidth, imem:$addr), 2648 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2649 "\t$dst, [$addr];", []>; 2650 def _areg : NVPTXInst< 2651 (outs regclass:$dst), 2652 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 2653 i32imm:$fromWidth, Int32Regs:$addr), 2654 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2655 "\t$dst, [$addr];", []>; 2656 def _areg_64 : NVPTXInst< 2657 (outs regclass:$dst), 2658 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 2659 i32imm:$fromWidth, Int64Regs:$addr), 2660 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2661 "\t$dst, [$addr];", []>; 2662 def _ari : NVPTXInst< 2663 (outs regclass:$dst), 2664 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 2665 i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), 2666 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2667 "\t$dst, [$addr+$offset];", []>; 2668 def _ari_64 : NVPTXInst< 2669 (outs regclass:$dst), 2670 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, 2671 LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), 2672 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2673 "\t$dst, [$addr+$offset];", []>; 2674 def _asi : NVPTXInst< 2675 (outs regclass:$dst), 2676 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, 2677 LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset), 2678 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2679 "\t$dst, [$addr+$offset];", []>; 2680} 2681 2682let mayLoad=1, hasSideEffects=0 in { 2683 defm LD_i8 : LD<Int16Regs>; 2684 defm LD_i16 : LD<Int16Regs>; 2685 defm LD_i32 : LD<Int32Regs>; 2686 defm LD_i64 : LD<Int64Regs>; 2687 defm LD_f32 : LD<Float32Regs>; 2688 defm LD_f64 : LD<Float64Regs>; 2689} 2690 2691multiclass ST<NVPTXRegClass regclass> { 2692 def _avar : NVPTXInst< 2693 (outs), 2694 (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, 2695 LdStCode:$Sign, i32imm:$toWidth, imem:$addr), 2696 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" 2697 " \t[$addr], $src;", []>; 2698 def _areg : NVPTXInst< 2699 (outs), 2700 (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, 2701 LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr), 2702 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" 2703 " \t[$addr], $src;", []>; 2704 def _areg_64 : NVPTXInst< 2705 (outs), 2706 (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, 2707 LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr), 2708 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" 2709 " \t[$addr], $src;", []>; 2710 def _ari : NVPTXInst< 2711 (outs), 2712 (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, 2713 LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr, i32imm:$offset), 2714 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" 2715 " \t[$addr+$offset], $src;", []>; 2716 def _ari_64 : NVPTXInst< 2717 (outs), 2718 (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, 2719 LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr, i32imm:$offset), 2720 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" 2721 " \t[$addr+$offset], $src;", []>; 2722 def _asi : NVPTXInst< 2723 (outs), 2724 (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, 2725 LdStCode:$Sign, i32imm:$toWidth, imem:$addr, i32imm:$offset), 2726 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" 2727 " \t[$addr+$offset], $src;", []>; 2728} 2729 2730let mayStore=1, hasSideEffects=0 in { 2731 defm ST_i8 : ST<Int16Regs>; 2732 defm ST_i16 : ST<Int16Regs>; 2733 defm ST_i32 : ST<Int32Regs>; 2734 defm ST_i64 : ST<Int64Regs>; 2735 defm ST_f32 : ST<Float32Regs>; 2736 defm ST_f64 : ST<Float64Regs>; 2737} 2738 2739// The following is used only in and after vector elementizations. Vector 2740// elementization happens at the machine instruction level, so the following 2741// instructions never appear in the DAG. 2742multiclass LD_VEC<NVPTXRegClass regclass> { 2743 def _v2_avar : NVPTXInst< 2744 (outs regclass:$dst1, regclass:$dst2), 2745 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 2746 i32imm:$fromWidth, imem:$addr), 2747 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2748 "\t{{$dst1, $dst2}}, [$addr];", []>; 2749 def _v2_areg : NVPTXInst< 2750 (outs regclass:$dst1, regclass:$dst2), 2751 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 2752 i32imm:$fromWidth, Int32Regs:$addr), 2753 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2754 "\t{{$dst1, $dst2}}, [$addr];", []>; 2755 def _v2_areg_64 : NVPTXInst< 2756 (outs regclass:$dst1, regclass:$dst2), 2757 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 2758 i32imm:$fromWidth, Int64Regs:$addr), 2759 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2760 "\t{{$dst1, $dst2}}, [$addr];", []>; 2761 def _v2_ari : NVPTXInst< 2762 (outs regclass:$dst1, regclass:$dst2), 2763 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 2764 i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), 2765 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2766 "\t{{$dst1, $dst2}}, [$addr+$offset];", []>; 2767 def _v2_ari_64 : NVPTXInst< 2768 (outs regclass:$dst1, regclass:$dst2), 2769 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 2770 i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), 2771 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2772 "\t{{$dst1, $dst2}}, [$addr+$offset];", []>; 2773 def _v2_asi : NVPTXInst< 2774 (outs regclass:$dst1, regclass:$dst2), 2775 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 2776 i32imm:$fromWidth, imem:$addr, i32imm:$offset), 2777 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2778 "\t{{$dst1, $dst2}}, [$addr+$offset];", []>; 2779 def _v4_avar : NVPTXInst< 2780 (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), 2781 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 2782 i32imm:$fromWidth, imem:$addr), 2783 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2784 "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>; 2785 def _v4_areg : NVPTXInst< 2786 (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), 2787 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 2788 i32imm:$fromWidth, Int32Regs:$addr), 2789 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2790 "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>; 2791 def _v4_areg_64 : NVPTXInst< 2792 (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), 2793 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 2794 i32imm:$fromWidth, Int64Regs:$addr), 2795 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2796 "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>; 2797 def _v4_ari : NVPTXInst< 2798 (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), 2799 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 2800 i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), 2801 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2802 "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>; 2803 def _v4_ari_64 : NVPTXInst< 2804 (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), 2805 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 2806 i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), 2807 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2808 "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>; 2809 def _v4_asi : NVPTXInst< 2810 (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), 2811 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 2812 i32imm:$fromWidth, imem:$addr, i32imm:$offset), 2813 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2814 "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>; 2815} 2816let mayLoad=1, hasSideEffects=0 in { 2817 defm LDV_i8 : LD_VEC<Int16Regs>; 2818 defm LDV_i16 : LD_VEC<Int16Regs>; 2819 defm LDV_i32 : LD_VEC<Int32Regs>; 2820 defm LDV_i64 : LD_VEC<Int64Regs>; 2821 defm LDV_f32 : LD_VEC<Float32Regs>; 2822 defm LDV_f64 : LD_VEC<Float64Regs>; 2823} 2824 2825multiclass ST_VEC<NVPTXRegClass regclass> { 2826 def _v2_avar : NVPTXInst< 2827 (outs), 2828 (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, 2829 LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr), 2830 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2831 "\t[$addr], {{$src1, $src2}};", []>; 2832 def _v2_areg : NVPTXInst< 2833 (outs), 2834 (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, 2835 LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr), 2836 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2837 "\t[$addr], {{$src1, $src2}};", []>; 2838 def _v2_areg_64 : NVPTXInst< 2839 (outs), 2840 (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, 2841 LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr), 2842 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2843 "\t[$addr], {{$src1, $src2}};", []>; 2844 def _v2_ari : NVPTXInst< 2845 (outs), 2846 (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, 2847 LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, 2848 i32imm:$offset), 2849 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2850 "\t[$addr+$offset], {{$src1, $src2}};", []>; 2851 def _v2_ari_64 : NVPTXInst< 2852 (outs), 2853 (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, 2854 LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, 2855 i32imm:$offset), 2856 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2857 "\t[$addr+$offset], {{$src1, $src2}};", []>; 2858 def _v2_asi : NVPTXInst< 2859 (outs), 2860 (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, 2861 LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, 2862 i32imm:$offset), 2863 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2864 "\t[$addr+$offset], {{$src1, $src2}};", []>; 2865 def _v4_avar : NVPTXInst< 2866 (outs), 2867 (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, 2868 LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 2869 i32imm:$fromWidth, imem:$addr), 2870 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2871 "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>; 2872 def _v4_areg : NVPTXInst< 2873 (outs), 2874 (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, 2875 LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 2876 i32imm:$fromWidth, Int32Regs:$addr), 2877 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2878 "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>; 2879 def _v4_areg_64 : NVPTXInst< 2880 (outs), 2881 (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, 2882 LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 2883 i32imm:$fromWidth, Int64Regs:$addr), 2884 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2885 "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>; 2886 def _v4_ari : NVPTXInst< 2887 (outs), 2888 (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, 2889 LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 2890 i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), 2891 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2892 "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>; 2893 def _v4_ari_64 : NVPTXInst< 2894 (outs), 2895 (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, 2896 LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 2897 i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), 2898 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2899 "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>; 2900 def _v4_asi : NVPTXInst< 2901 (outs), 2902 (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, 2903 LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 2904 i32imm:$fromWidth, imem:$addr, i32imm:$offset), 2905 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}" 2906 "$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>; 2907} 2908 2909let mayStore=1, hasSideEffects=0 in { 2910 defm STV_i8 : ST_VEC<Int16Regs>; 2911 defm STV_i16 : ST_VEC<Int16Regs>; 2912 defm STV_i32 : ST_VEC<Int32Regs>; 2913 defm STV_i64 : ST_VEC<Int64Regs>; 2914 defm STV_f32 : ST_VEC<Float32Regs>; 2915 defm STV_f64 : ST_VEC<Float64Regs>; 2916} 2917 2918//---- Conversion ---- 2919 2920class F_BITCONVERT<string SzStr, ValueType TIn, ValueType TOut, 2921 NVPTXRegClass regclassIn = ValueToRegClass<TIn>.ret, 2922 NVPTXRegClass regclassOut = ValueToRegClass<TOut>.ret> : 2923 NVPTXInst<(outs regclassOut:$d), (ins regclassIn:$a), 2924 !strconcat("mov.b", SzStr, " \t$d, $a;"), 2925 [(set (TOut regclassOut:$d), (bitconvert (TIn regclassIn:$a)))]>; 2926 2927def BITCONVERT_32_I2F : F_BITCONVERT<"32", i32, f32>; 2928def BITCONVERT_32_F2I : F_BITCONVERT<"32", f32, i32>; 2929def BITCONVERT_64_I2F : F_BITCONVERT<"64", i64, f64>; 2930def BITCONVERT_64_F2I : F_BITCONVERT<"64", f64, i64>; 2931 2932foreach vt = [v2f16, v2bf16] in { 2933def: Pat<(vt (bitconvert (i32 UInt32Const:$a))), 2934 (IMOVB32ri UInt32Const:$a)>; 2935def: Pat<(vt (bitconvert (i32 Int32Regs:$a))), 2936 (ProxyRegI32 Int32Regs:$a)>; 2937def: Pat<(i32 (bitconvert (vt Int32Regs:$a))), 2938 (ProxyRegI32 Int32Regs:$a)>; 2939def: Pat<(vt (bitconvert (f32 Float32Regs:$a))), 2940 (BITCONVERT_32_F2I Float32Regs:$a)>; 2941} 2942foreach vt = [f16, bf16] in { 2943def: Pat<(vt (bitconvert (i16 UInt16Const:$a))), 2944 (IMOVB16ri UInt16Const:$a)>; 2945def: Pat<(vt (bitconvert (i16 Int16Regs:$a))), 2946 (ProxyRegI16 Int16Regs:$a)>; 2947def: Pat<(i16 (bitconvert (vt Int16Regs:$a))), 2948 (ProxyRegI16 Int16Regs:$a)>; 2949} 2950 2951// NOTE: pred->fp are currently sub-optimal due to an issue in TableGen where 2952// we cannot specify floating-point literals in isel patterns. Therefore, we 2953// use an integer selp to select either 1 or 0 and then cvt to floating-point. 2954 2955// sint -> f16 2956def : Pat<(f16 (sint_to_fp Int1Regs:$a)), 2957 (CVT_f16_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>; 2958def : Pat<(f16 (sint_to_fp Int16Regs:$a)), 2959 (CVT_f16_s16 Int16Regs:$a, CvtRN)>; 2960def : Pat<(f16 (sint_to_fp Int32Regs:$a)), 2961 (CVT_f16_s32 Int32Regs:$a, CvtRN)>; 2962def : Pat<(f16 (sint_to_fp Int64Regs:$a)), 2963 (CVT_f16_s64 Int64Regs:$a, CvtRN)>; 2964 2965// uint -> f16 2966def : Pat<(f16 (uint_to_fp Int1Regs:$a)), 2967 (CVT_f16_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>; 2968def : Pat<(f16 (uint_to_fp Int16Regs:$a)), 2969 (CVT_f16_u16 Int16Regs:$a, CvtRN)>; 2970def : Pat<(f16 (uint_to_fp Int32Regs:$a)), 2971 (CVT_f16_u32 Int32Regs:$a, CvtRN)>; 2972def : Pat<(f16 (uint_to_fp Int64Regs:$a)), 2973 (CVT_f16_u64 Int64Regs:$a, CvtRN)>; 2974 2975// sint -> bf16 2976def : Pat<(bf16 (sint_to_fp Int1Regs:$a)), 2977 (CVT_bf16_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>; 2978def : Pat<(bf16 (sint_to_fp Int16Regs:$a)), 2979 (CVT_bf16_s16 Int16Regs:$a, CvtRN)>; 2980def : Pat<(bf16 (sint_to_fp Int32Regs:$a)), 2981 (CVT_bf16_s32 Int32Regs:$a, CvtRN)>; 2982def : Pat<(bf16 (sint_to_fp Int64Regs:$a)), 2983 (CVT_bf16_s64 Int64Regs:$a, CvtRN)>; 2984 2985// uint -> bf16 2986def : Pat<(bf16 (uint_to_fp Int1Regs:$a)), 2987 (CVT_bf16_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>; 2988def : Pat<(bf16 (uint_to_fp Int16Regs:$a)), 2989 (CVT_bf16_u16 Int16Regs:$a, CvtRN)>; 2990def : Pat<(bf16 (uint_to_fp Int32Regs:$a)), 2991 (CVT_bf16_u32 Int32Regs:$a, CvtRN)>; 2992def : Pat<(bf16 (uint_to_fp Int64Regs:$a)), 2993 (CVT_bf16_u64 Int64Regs:$a, CvtRN)>; 2994 2995// sint -> f32 2996def : Pat<(f32 (sint_to_fp Int1Regs:$a)), 2997 (CVT_f32_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>; 2998def : Pat<(f32 (sint_to_fp Int16Regs:$a)), 2999 (CVT_f32_s16 Int16Regs:$a, CvtRN)>; 3000def : Pat<(f32 (sint_to_fp Int32Regs:$a)), 3001 (CVT_f32_s32 Int32Regs:$a, CvtRN)>; 3002def : Pat<(f32 (sint_to_fp Int64Regs:$a)), 3003 (CVT_f32_s64 Int64Regs:$a, CvtRN)>; 3004 3005// uint -> f32 3006def : Pat<(f32 (uint_to_fp Int1Regs:$a)), 3007 (CVT_f32_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>; 3008def : Pat<(f32 (uint_to_fp Int16Regs:$a)), 3009 (CVT_f32_u16 Int16Regs:$a, CvtRN)>; 3010def : Pat<(f32 (uint_to_fp Int32Regs:$a)), 3011 (CVT_f32_u32 Int32Regs:$a, CvtRN)>; 3012def : Pat<(f32 (uint_to_fp Int64Regs:$a)), 3013 (CVT_f32_u64 Int64Regs:$a, CvtRN)>; 3014 3015// sint -> f64 3016def : Pat<(f64 (sint_to_fp Int1Regs:$a)), 3017 (CVT_f64_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>; 3018def : Pat<(f64 (sint_to_fp Int16Regs:$a)), 3019 (CVT_f64_s16 Int16Regs:$a, CvtRN)>; 3020def : Pat<(f64 (sint_to_fp Int32Regs:$a)), 3021 (CVT_f64_s32 Int32Regs:$a, CvtRN)>; 3022def : Pat<(f64 (sint_to_fp Int64Regs:$a)), 3023 (CVT_f64_s64 Int64Regs:$a, CvtRN)>; 3024 3025// uint -> f64 3026def : Pat<(f64 (uint_to_fp Int1Regs:$a)), 3027 (CVT_f64_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>; 3028def : Pat<(f64 (uint_to_fp Int16Regs:$a)), 3029 (CVT_f64_u16 Int16Regs:$a, CvtRN)>; 3030def : Pat<(f64 (uint_to_fp Int32Regs:$a)), 3031 (CVT_f64_u32 Int32Regs:$a, CvtRN)>; 3032def : Pat<(f64 (uint_to_fp Int64Regs:$a)), 3033 (CVT_f64_u64 Int64Regs:$a, CvtRN)>; 3034 3035 3036// f16 -> sint 3037def : Pat<(i1 (fp_to_sint (f16 Int16Regs:$a))), 3038 (SETP_b16ri Int16Regs:$a, 0, CmpEQ)>; 3039def : Pat<(i16 (fp_to_sint (f16 Int16Regs:$a))), 3040 (CVT_s16_f16 (f16 Int16Regs:$a), CvtRZI)>; 3041def : Pat<(i32 (fp_to_sint (f16 Int16Regs:$a))), 3042 (CVT_s32_f16 (f16 Int16Regs:$a), CvtRZI)>; 3043def : Pat<(i64 (fp_to_sint (f16 Int16Regs:$a))), 3044 (CVT_s64_f16 Int16Regs:$a, CvtRZI)>; 3045 3046// f16 -> uint 3047def : Pat<(i1 (fp_to_uint (f16 Int16Regs:$a))), 3048 (SETP_b16ri Int16Regs:$a, 0, CmpEQ)>; 3049def : Pat<(i16 (fp_to_uint (f16 Int16Regs:$a))), 3050 (CVT_u16_f16 Int16Regs:$a, CvtRZI)>; 3051def : Pat<(i32 (fp_to_uint (f16 Int16Regs:$a))), 3052 (CVT_u32_f16 Int16Regs:$a, CvtRZI)>; 3053def : Pat<(i64 (fp_to_uint (f16 Int16Regs:$a))), 3054 (CVT_u64_f16 Int16Regs:$a, CvtRZI)>; 3055 3056// bf16 -> sint 3057def : Pat<(i1 (fp_to_sint (bf16 Int16Regs:$a))), 3058 (SETP_b16ri Int16Regs:$a, 0, CmpEQ)>; 3059def : Pat<(i16 (fp_to_sint (bf16 Int16Regs:$a))), 3060 (CVT_s16_bf16 (bf16 Int16Regs:$a), CvtRZI)>; 3061def : Pat<(i32 (fp_to_sint (bf16 Int16Regs:$a))), 3062 (CVT_s32_bf16 (bf16 Int16Regs:$a), CvtRZI)>; 3063def : Pat<(i64 (fp_to_sint (bf16 Int16Regs:$a))), 3064 (CVT_s64_bf16 Int16Regs:$a, CvtRZI)>; 3065 3066// bf16 -> uint 3067def : Pat<(i1 (fp_to_uint (bf16 Int16Regs:$a))), 3068 (SETP_b16ri Int16Regs:$a, 0, CmpEQ)>; 3069def : Pat<(i16 (fp_to_uint (bf16 Int16Regs:$a))), 3070 (CVT_u16_bf16 Int16Regs:$a, CvtRZI)>; 3071def : Pat<(i32 (fp_to_uint (bf16 Int16Regs:$a))), 3072 (CVT_u32_bf16 Int16Regs:$a, CvtRZI)>; 3073def : Pat<(i64 (fp_to_uint (bf16 Int16Regs:$a))), 3074 (CVT_u64_bf16 Int16Regs:$a, CvtRZI)>; 3075// f32 -> sint 3076def : Pat<(i1 (fp_to_sint Float32Regs:$a)), 3077 (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>; 3078def : Pat<(i16 (fp_to_sint Float32Regs:$a)), 3079 (CVT_s16_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>; 3080def : Pat<(i16 (fp_to_sint Float32Regs:$a)), 3081 (CVT_s16_f32 Float32Regs:$a, CvtRZI)>; 3082def : Pat<(i32 (fp_to_sint Float32Regs:$a)), 3083 (CVT_s32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>; 3084def : Pat<(i32 (fp_to_sint Float32Regs:$a)), 3085 (CVT_s32_f32 Float32Regs:$a, CvtRZI)>; 3086def : Pat<(i64 (fp_to_sint Float32Regs:$a)), 3087 (CVT_s64_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>; 3088def : Pat<(i64 (fp_to_sint Float32Regs:$a)), 3089 (CVT_s64_f32 Float32Regs:$a, CvtRZI)>; 3090 3091// f32 -> uint 3092def : Pat<(i1 (fp_to_uint Float32Regs:$a)), 3093 (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>; 3094def : Pat<(i16 (fp_to_uint Float32Regs:$a)), 3095 (CVT_u16_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>; 3096def : Pat<(i16 (fp_to_uint Float32Regs:$a)), 3097 (CVT_u16_f32 Float32Regs:$a, CvtRZI)>; 3098def : Pat<(i32 (fp_to_uint Float32Regs:$a)), 3099 (CVT_u32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>; 3100def : Pat<(i32 (fp_to_uint Float32Regs:$a)), 3101 (CVT_u32_f32 Float32Regs:$a, CvtRZI)>; 3102def : Pat<(i64 (fp_to_uint Float32Regs:$a)), 3103 (CVT_u64_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>; 3104def : Pat<(i64 (fp_to_uint Float32Regs:$a)), 3105 (CVT_u64_f32 Float32Regs:$a, CvtRZI)>; 3106 3107// f64 -> sint 3108def : Pat<(i1 (fp_to_sint Float64Regs:$a)), 3109 (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>; 3110def : Pat<(i16 (fp_to_sint Float64Regs:$a)), 3111 (CVT_s16_f64 Float64Regs:$a, CvtRZI)>; 3112def : Pat<(i32 (fp_to_sint Float64Regs:$a)), 3113 (CVT_s32_f64 Float64Regs:$a, CvtRZI)>; 3114def : Pat<(i64 (fp_to_sint Float64Regs:$a)), 3115 (CVT_s64_f64 Float64Regs:$a, CvtRZI)>; 3116 3117// f64 -> uint 3118def : Pat<(i1 (fp_to_uint Float64Regs:$a)), 3119 (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>; 3120def : Pat<(i16 (fp_to_uint Float64Regs:$a)), 3121 (CVT_u16_f64 Float64Regs:$a, CvtRZI)>; 3122def : Pat<(i32 (fp_to_uint Float64Regs:$a)), 3123 (CVT_u32_f64 Float64Regs:$a, CvtRZI)>; 3124def : Pat<(i64 (fp_to_uint Float64Regs:$a)), 3125 (CVT_u64_f64 Float64Regs:$a, CvtRZI)>; 3126 3127// sext i1 3128def : Pat<(i16 (sext Int1Regs:$a)), 3129 (SELP_s16ii -1, 0, Int1Regs:$a)>; 3130def : Pat<(i32 (sext Int1Regs:$a)), 3131 (SELP_s32ii -1, 0, Int1Regs:$a)>; 3132def : Pat<(i64 (sext Int1Regs:$a)), 3133 (SELP_s64ii -1, 0, Int1Regs:$a)>; 3134 3135// zext i1 3136def : Pat<(i16 (zext Int1Regs:$a)), 3137 (SELP_u16ii 1, 0, Int1Regs:$a)>; 3138def : Pat<(i32 (zext Int1Regs:$a)), 3139 (SELP_u32ii 1, 0, Int1Regs:$a)>; 3140def : Pat<(i64 (zext Int1Regs:$a)), 3141 (SELP_u64ii 1, 0, Int1Regs:$a)>; 3142 3143// anyext i1 3144def : Pat<(i16 (anyext Int1Regs:$a)), 3145 (SELP_u16ii -1, 0, Int1Regs:$a)>; 3146def : Pat<(i32 (anyext Int1Regs:$a)), 3147 (SELP_u32ii -1, 0, Int1Regs:$a)>; 3148def : Pat<(i64 (anyext Int1Regs:$a)), 3149 (SELP_u64ii -1, 0, Int1Regs:$a)>; 3150 3151// sext i16 3152def : Pat<(i32 (sext Int16Regs:$a)), 3153 (CVT_s32_s16 Int16Regs:$a, CvtNONE)>; 3154def : Pat<(i64 (sext Int16Regs:$a)), 3155 (CVT_s64_s16 Int16Regs:$a, CvtNONE)>; 3156 3157// zext i16 3158def : Pat<(i32 (zext Int16Regs:$a)), 3159 (CVT_u32_u16 Int16Regs:$a, CvtNONE)>; 3160def : Pat<(i64 (zext Int16Regs:$a)), 3161 (CVT_u64_u16 Int16Regs:$a, CvtNONE)>; 3162 3163// anyext i16 3164def : Pat<(i32 (anyext Int16Regs:$a)), 3165 (CVT_u32_u16 Int16Regs:$a, CvtNONE)>; 3166def : Pat<(i64 (anyext Int16Regs:$a)), 3167 (CVT_u64_u16 Int16Regs:$a, CvtNONE)>; 3168 3169// sext i32 3170def : Pat<(i64 (sext Int32Regs:$a)), 3171 (CVT_s64_s32 Int32Regs:$a, CvtNONE)>; 3172 3173// zext i32 3174def : Pat<(i64 (zext Int32Regs:$a)), 3175 (CVT_u64_u32 Int32Regs:$a, CvtNONE)>; 3176 3177// anyext i32 3178def : Pat<(i64 (anyext Int32Regs:$a)), 3179 (CVT_u64_u32 Int32Regs:$a, CvtNONE)>; 3180 3181 3182// truncate i64 3183def : Pat<(i32 (trunc Int64Regs:$a)), 3184 (CVT_u32_u64 Int64Regs:$a, CvtNONE)>; 3185def : Pat<(i16 (trunc Int64Regs:$a)), 3186 (CVT_u16_u64 Int64Regs:$a, CvtNONE)>; 3187def : Pat<(i1 (trunc Int64Regs:$a)), 3188 (SETP_b64ri (ANDb64ri Int64Regs:$a, 1), 1, CmpEQ)>; 3189 3190// truncate i32 3191def : Pat<(i16 (trunc Int32Regs:$a)), 3192 (CVT_u16_u32 Int32Regs:$a, CvtNONE)>; 3193def : Pat<(i1 (trunc Int32Regs:$a)), 3194 (SETP_b32ri (ANDb32ri Int32Regs:$a, 1), 1, CmpEQ)>; 3195 3196// truncate i16 3197def : Pat<(i1 (trunc Int16Regs:$a)), 3198 (SETP_b16ri (ANDb16ri Int16Regs:$a, 1), 1, CmpEQ)>; 3199 3200// sext_inreg 3201def : Pat<(sext_inreg Int16Regs:$a, i8), (CVT_INREG_s16_s8 Int16Regs:$a)>; 3202def : Pat<(sext_inreg Int32Regs:$a, i8), (CVT_INREG_s32_s8 Int32Regs:$a)>; 3203def : Pat<(sext_inreg Int32Regs:$a, i16), (CVT_INREG_s32_s16 Int32Regs:$a)>; 3204def : Pat<(sext_inreg Int64Regs:$a, i8), (CVT_INREG_s64_s8 Int64Regs:$a)>; 3205def : Pat<(sext_inreg Int64Regs:$a, i16), (CVT_INREG_s64_s16 Int64Regs:$a)>; 3206def : Pat<(sext_inreg Int64Regs:$a, i32), (CVT_INREG_s64_s32 Int64Regs:$a)>; 3207 3208 3209// Select instructions with 32-bit predicates 3210def : Pat<(select Int32Regs:$pred, i16:$a, i16:$b), 3211 (SELP_b16rr Int16Regs:$a, Int16Regs:$b, 3212 (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>; 3213def : Pat<(select Int32Regs:$pred, i32:$a, i32:$b), 3214 (SELP_b32rr Int32Regs:$a, Int32Regs:$b, 3215 (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>; 3216def : Pat<(select Int32Regs:$pred, Int64Regs:$a, Int64Regs:$b), 3217 (SELP_b64rr Int64Regs:$a, Int64Regs:$b, 3218 (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>; 3219def : Pat<(select Int32Regs:$pred, (f16 Int16Regs:$a), (f16 Int16Regs:$b)), 3220 (SELP_f16rr Int16Regs:$a, Int16Regs:$b, 3221 (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>; 3222def : Pat<(select Int32Regs:$pred, (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)), 3223 (SELP_bf16rr Int16Regs:$a, Int16Regs:$b, 3224 (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>; 3225def : Pat<(select Int32Regs:$pred, Float32Regs:$a, Float32Regs:$b), 3226 (SELP_f32rr Float32Regs:$a, Float32Regs:$b, 3227 (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>; 3228def : Pat<(select Int32Regs:$pred, Float64Regs:$a, Float64Regs:$b), 3229 (SELP_f64rr Float64Regs:$a, Float64Regs:$b, 3230 (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>; 3231 3232 3233let hasSideEffects = false in { 3234 // pack a set of smaller int registers to a larger int register 3235 def V4I16toI64 : NVPTXInst<(outs Int64Regs:$d), 3236 (ins Int16Regs:$s1, Int16Regs:$s2, 3237 Int16Regs:$s3, Int16Regs:$s4), 3238 "mov.b64 \t$d, {{$s1, $s2, $s3, $s4}};", []>; 3239 def V2I16toI32 : NVPTXInst<(outs Int32Regs:$d), 3240 (ins Int16Regs:$s1, Int16Regs:$s2), 3241 "mov.b32 \t$d, {{$s1, $s2}};", []>; 3242 def V2I32toI64 : NVPTXInst<(outs Int64Regs:$d), 3243 (ins Int32Regs:$s1, Int32Regs:$s2), 3244 "mov.b64 \t$d, {{$s1, $s2}};", []>; 3245 def V2F32toF64 : NVPTXInst<(outs Float64Regs:$d), 3246 (ins Float32Regs:$s1, Float32Regs:$s2), 3247 "mov.b64 \t$d, {{$s1, $s2}};", []>; 3248 3249 // unpack a larger int register to a set of smaller int registers 3250 def I64toV4I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2, 3251 Int16Regs:$d3, Int16Regs:$d4), 3252 (ins Int64Regs:$s), 3253 "mov.b64 \t{{$d1, $d2, $d3, $d4}}, $s;", []>; 3254 def I32toV2I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2), 3255 (ins Int32Regs:$s), 3256 "mov.b32 \t{{$d1, $d2}}, $s;", []>; 3257 def I64toV2I32 : NVPTXInst<(outs Int32Regs:$d1, Int32Regs:$d2), 3258 (ins Int64Regs:$s), 3259 "mov.b64 \t{{$d1, $d2}}, $s;", []>; 3260 def F64toV2F32 : NVPTXInst<(outs Float32Regs:$d1, Float32Regs:$d2), 3261 (ins Float64Regs:$s), 3262 "mov.b64 \t{{$d1, $d2}}, $s;", []>; 3263 3264 def I32toI16H : NVPTXInst<(outs Int16Regs:$high), 3265 (ins Int32Regs:$s), 3266 "{{ .reg .b16 tmp; mov.b32 {tmp, $high}, $s; }}", 3267 []>; 3268 def I32toI16L : NVPTXInst<(outs Int16Regs:$low), 3269 (ins Int32Regs:$s), 3270 "{{ .reg .b16 tmp; mov.b32 {$low, tmp}, $s; }}", 3271 []>; 3272 def I64toI32H : NVPTXInst<(outs Int32Regs:$high), 3273 (ins Int64Regs:$s), 3274 "{{ .reg .b32 tmp; mov.b64 {tmp, $high}, $s; }}", 3275 []>; 3276} 3277 3278// Using partial vectorized move produces better SASS code for extraction of 3279// upper/lower parts of an integer. 3280def : Pat<(i16 (trunc (srl Int32Regs:$s, (i32 16)))), 3281 (I32toI16H Int32Regs:$s)>; 3282def : Pat<(i16 (trunc (sra Int32Regs:$s, (i32 16)))), 3283 (I32toI16H Int32Regs:$s)>; 3284def : Pat<(i32 (trunc (srl Int64Regs:$s, (i32 32)))), 3285 (I64toI32H Int64Regs:$s)>; 3286def : Pat<(i32 (trunc (sra Int64Regs:$s, (i32 32)))), 3287 (I64toI32H Int64Regs:$s)>; 3288 3289def : Pat<(f16 (extractelt (v2f16 Int32Regs:$src), 0)), 3290 (I32toI16L Int32Regs:$src)>; 3291def : Pat<(f16 (extractelt (v2f16 Int32Regs:$src), 1)), 3292 (I32toI16H Int32Regs:$src)>; 3293def : Pat<(v2f16 (build_vector (f16 Int16Regs:$a), (f16 Int16Regs:$b))), 3294 (V2I16toI32 Int16Regs:$a, Int16Regs:$b)>; 3295 3296def : Pat<(bf16 (extractelt (v2bf16 Int32Regs:$src), 0)), 3297 (I32toI16L Int32Regs:$src)>; 3298def : Pat<(bf16 (extractelt (v2bf16 Int32Regs:$src), 1)), 3299 (I32toI16H Int32Regs:$src)>; 3300def : Pat<(v2bf16 (build_vector (bf16 Int16Regs:$a), (bf16 Int16Regs:$b))), 3301 (V2I16toI32 Int16Regs:$a, Int16Regs:$b)>; 3302 3303// Count leading zeros 3304let hasSideEffects = false in { 3305 def CLZr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a), 3306 "clz.b32 \t$d, $a;", []>; 3307 def CLZr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), 3308 "clz.b64 \t$d, $a;", []>; 3309} 3310 3311// 32-bit has a direct PTX instruction 3312def : Pat<(ctlz Int32Regs:$a), (CLZr32 Int32Regs:$a)>; 3313 3314// The return type of the ctlz ISD node is the same as its input, but the PTX 3315// ctz instruction always returns a 32-bit value. For ctlz.i64, convert the 3316// ptx value to 64 bits to match the ISD node's semantics, unless we know we're 3317// truncating back down to 32 bits. 3318def : Pat<(i64 (ctlz Int64Regs:$a)), (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>; 3319def : Pat<(i32 (trunc (ctlz Int64Regs:$a))), (CLZr64 Int64Regs:$a)>; 3320 3321// For 16-bit ctlz, we zero-extend to 32-bit, perform the count, then trunc the 3322// result back to 16-bits if necessary. We also need to subtract 16 because 3323// the high-order 16 zeros were counted. 3324// 3325// TODO: NVPTX has a mov.b32 b32reg, {imm, b16reg} instruction, which we could 3326// use to save one SASS instruction (on sm_35 anyway): 3327// 3328// mov.b32 $tmp, {0xffff, $a} 3329// ctlz.b32 $result, $tmp 3330// 3331// That is, instead of zero-extending the input to 32 bits, we'd "one-extend" 3332// and then ctlz that value. This way we don't have to subtract 16 from the 3333// result. Unfortunately today we don't have a way to generate 3334// "mov b32reg, {b16imm, b16reg}", so we don't do this optimization. 3335def : Pat<(i16 (ctlz Int16Regs:$a)), 3336 (SUBi16ri (CVT_u16_u32 3337 (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE), 16)>; 3338def : Pat<(i32 (zext (i16 (ctlz Int16Regs:$a)))), 3339 (SUBi32ri (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), 16)>; 3340 3341// Population count 3342let hasSideEffects = false in { 3343 def POPCr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a), 3344 "popc.b32 \t$d, $a;", []>; 3345 def POPCr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), 3346 "popc.b64 \t$d, $a;", []>; 3347} 3348 3349// 32-bit has a direct PTX instruction 3350def : Pat<(ctpop Int32Regs:$a), (POPCr32 Int32Regs:$a)>; 3351 3352// For 64-bit, the result in PTX is actually 32-bit so we zero-extend to 64-bit 3353// to match the LLVM semantics. Just as with ctlz.i64, we provide a second 3354// pattern that avoids the type conversion if we're truncating the result to 3355// i32 anyway. 3356def : Pat<(ctpop Int64Regs:$a), (CVT_u64_u32 (POPCr64 Int64Regs:$a), CvtNONE)>; 3357def : Pat<(i32 (trunc (ctpop Int64Regs:$a))), (POPCr64 Int64Regs:$a)>; 3358 3359// For 16-bit, we zero-extend to 32-bit, then trunc the result back to 16-bits. 3360// If we know that we're storing into an i32, we can avoid the final trunc. 3361def : Pat<(ctpop Int16Regs:$a), 3362 (CVT_u16_u32 (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE)>; 3363def : Pat<(i32 (zext (i16 (ctpop Int16Regs:$a)))), 3364 (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE))>; 3365 3366// fpround f32 -> f16 3367def : Pat<(f16 (fpround Float32Regs:$a)), 3368 (CVT_f16_f32 Float32Regs:$a, CvtRN)>; 3369 3370// fpround f32 -> bf16 3371def : Pat<(bf16 (fpround Float32Regs:$a)), 3372 (CVT_bf16_f32 Float32Regs:$a, CvtRN)>; 3373 3374// fpround f64 -> f16 3375def : Pat<(f16 (fpround Float64Regs:$a)), 3376 (CVT_f16_f64 Float64Regs:$a, CvtRN)>; 3377 3378// fpround f64 -> bf16 3379def : Pat<(bf16 (fpround Float64Regs:$a)), 3380 (CVT_bf16_f64 Float64Regs:$a, CvtRN)>; 3381// fpround f64 -> f32 3382def : Pat<(f32 (fpround Float64Regs:$a)), 3383 (CVT_f32_f64 Float64Regs:$a, CvtRN_FTZ)>, Requires<[doF32FTZ]>; 3384def : Pat<(f32 (fpround Float64Regs:$a)), 3385 (CVT_f32_f64 Float64Regs:$a, CvtRN)>; 3386 3387// fpextend f16 -> f32 3388def : Pat<(f32 (fpextend (f16 Int16Regs:$a))), 3389 (CVT_f32_f16 Int16Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>; 3390def : Pat<(f32 (fpextend (f16 Int16Regs:$a))), 3391 (CVT_f32_f16 Int16Regs:$a, CvtNONE)>; 3392// fpextend bf16 -> f32 3393def : Pat<(f32 (fpextend (bf16 Int16Regs:$a))), 3394 (CVT_f32_bf16 Int16Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>; 3395def : Pat<(f32 (fpextend (bf16 Int16Regs:$a))), 3396 (CVT_f32_bf16 Int16Regs:$a, CvtNONE)>; 3397 3398// fpextend f16 -> f64 3399def : Pat<(f64 (fpextend (f16 Int16Regs:$a))), 3400 (CVT_f64_f16 Int16Regs:$a, CvtNONE)>; 3401 3402// fpextend bf16 -> f64 3403def : Pat<(f64 (fpextend (bf16 Int16Regs:$a))), 3404 (CVT_f64_bf16 Int16Regs:$a, CvtNONE)>; 3405 3406// fpextend f32 -> f64 3407def : Pat<(f64 (fpextend Float32Regs:$a)), 3408 (CVT_f64_f32 Float32Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>; 3409def : Pat<(f64 (fpextend Float32Regs:$a)), 3410 (CVT_f64_f32 Float32Regs:$a, CvtNONE)>; 3411 3412def retglue : SDNode<"NVPTXISD::RET_GLUE", SDTNone, 3413 [SDNPHasChain, SDNPOptInGlue]>; 3414 3415// fceil, ffloor, froundeven, ftrunc. 3416 3417multiclass CVT_ROUND<SDNode OpNode, PatLeaf Mode, PatLeaf ModeFTZ> { 3418 def : Pat<(OpNode (f16 Int16Regs:$a)), 3419 (CVT_f16_f16 Int16Regs:$a, Mode)>; 3420 def : Pat<(OpNode (bf16 Int16Regs:$a)), 3421 (CVT_bf16_bf16 Int16Regs:$a, Mode)>; 3422 def : Pat<(OpNode Float32Regs:$a), 3423 (CVT_f32_f32 Float32Regs:$a, ModeFTZ)>, Requires<[doF32FTZ]>; 3424 def : Pat<(OpNode Float32Regs:$a), 3425 (CVT_f32_f32 Float32Regs:$a, Mode)>, Requires<[doNoF32FTZ]>; 3426 def : Pat<(OpNode Float64Regs:$a), 3427 (CVT_f64_f64 Float64Regs:$a, Mode)>; 3428} 3429 3430defm : CVT_ROUND<fceil, CvtRPI, CvtRPI_FTZ>; 3431defm : CVT_ROUND<ffloor, CvtRMI, CvtRMI_FTZ>; 3432defm : CVT_ROUND<froundeven, CvtRNI, CvtRNI_FTZ>; 3433defm : CVT_ROUND<ftrunc, CvtRZI, CvtRZI_FTZ>; 3434 3435// nearbyint and rint are implemented as rounding to nearest even. This isn't 3436// strictly correct, because it causes us to ignore the rounding mode. But it 3437// matches what CUDA's "libm" does. 3438 3439defm : CVT_ROUND<fnearbyint, CvtRNI, CvtRNI_FTZ>; 3440defm : CVT_ROUND<frint, CvtRNI, CvtRNI_FTZ>; 3441 3442//----------------------------------- 3443// Control-flow 3444//----------------------------------- 3445 3446let isTerminator=1 in { 3447 let isReturn=1, isBarrier=1 in 3448 def Return : NVPTXInst<(outs), (ins), "ret;", [(retglue)]>; 3449 3450 let isBranch=1 in 3451 def CBranch : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target), 3452 "@$a bra \t$target;", 3453 [(brcond Int1Regs:$a, bb:$target)]>; 3454 let isBranch=1 in 3455 def CBranchOther : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target), 3456 "@!$a bra \t$target;", []>; 3457 3458 let isBranch=1, isBarrier=1 in 3459 def GOTO : NVPTXInst<(outs), (ins brtarget:$target), 3460 "bra.uni \t$target;", [(br bb:$target)]>; 3461} 3462 3463def : Pat<(brcond Int32Regs:$a, bb:$target), 3464 (CBranch (SETP_u32ri Int32Regs:$a, 0, CmpNE), bb:$target)>; 3465 3466// SelectionDAGBuilder::visitSWitchCase() will invert the condition of a 3467// conditional branch if the target block is the next block so that the code 3468// can fall through to the target block. The invertion is done by 'xor 3469// condition, 1', which will be translated to (setne condition, -1). Since ptx 3470// supports '@!pred bra target', we should use it. 3471def : Pat<(brcond (i1 (setne Int1Regs:$a, -1)), bb:$target), 3472 (CBranchOther Int1Regs:$a, bb:$target)>; 3473 3474// Call 3475def SDT_NVPTXCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>, 3476 SDTCisVT<1, i32>]>; 3477def SDT_NVPTXCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>; 3478 3479def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_NVPTXCallSeqStart, 3480 [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>; 3481def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_NVPTXCallSeqEnd, 3482 [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, 3483 SDNPSideEffect]>; 3484 3485def SDT_NVPTXCall : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>; 3486def call : SDNode<"NVPTXISD::CALL", SDT_NVPTXCall, 3487 [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; 3488def calltarget : Operand<i32>; 3489let isCall=1 in { 3490 def CALL : NVPTXInst<(outs), (ins calltarget:$dst), "call \t$dst, (1);", []>; 3491} 3492 3493def : Pat<(call tglobaladdr:$dst), (CALL tglobaladdr:$dst)>; 3494def : Pat<(call texternalsym:$dst), (CALL texternalsym:$dst)>; 3495 3496// Pseudo instructions. 3497class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern> 3498 : NVPTXInst<outs, ins, asmstr, pattern>; 3499 3500def Callseq_Start : 3501 NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2), 3502 "\\{ // callseq $amt1, $amt2\n" 3503 "\t.reg .b32 temp_param_reg;", 3504 [(callseq_start timm:$amt1, timm:$amt2)]>; 3505def Callseq_End : 3506 NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2), 3507 "\\} // callseq $amt1", 3508 [(callseq_end timm:$amt1, timm:$amt2)]>; 3509 3510// trap instruction 3511def trapinst : NVPTXInst<(outs), (ins), "trap;", [(trap)]>; 3512 3513// Call prototype wrapper 3514def SDTCallPrototype : SDTypeProfile<0, 1, [SDTCisInt<0>]>; 3515def CallPrototype : 3516 SDNode<"NVPTXISD::CallPrototype", SDTCallPrototype, 3517 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; 3518def ProtoIdent : Operand<i32> { 3519 let PrintMethod = "printProtoIdent"; 3520} 3521def CALL_PROTOTYPE : 3522 NVPTXInst<(outs), (ins ProtoIdent:$ident), 3523 "$ident", [(CallPrototype (i32 texternalsym:$ident))]>; 3524 3525 3526include "NVPTXIntrinsics.td" 3527 3528 3529//----------------------------------- 3530// Notes 3531//----------------------------------- 3532// BSWAP is currently expanded. The following is a more efficient 3533// - for < sm_20, use vector scalar mov, as tesla support native 16-bit register 3534// - for sm_20, use pmpt (use vector scalar mov to get the pack and 3535// unpack). sm_20 supports native 32-bit register, but not native 16-bit 3536// register. 3537