//===- NVPTXInstrInfo.td - NVPTX Instruction defs -------------*- tblgen-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file describes the PTX instructions in TableGen format.
//
//===----------------------------------------------------------------------===//

include "NVPTXInstrFormats.td"

// Immediate operands for the two 16-bit floating-point value types. Both are
// tagged with the OPERAND_IMMEDIATE operand type.
let OperandType = "OPERAND_IMMEDIATE" in {
  def f16imm  : Operand<f16>;
  def bf16imm : Operand<bf16>;
}

// List of vector specific properties.
def isVecLD      : VecInstTypeEnum<1>;
def isVecST      : VecInstTypeEnum<2>;
def isVecBuild   : VecInstTypeEnum<3>;
def isVecShuffle : VecInstTypeEnum<4>;
def isVecExtract : VecInstTypeEnum<5>;
def isVecInsert  : VecInstTypeEnum<6>;
def isVecDest    : VecInstTypeEnum<7>;
def isVecOther   : VecInstTypeEnum<15>;

//===----------------------------------------------------------------------===//
// NVPTX Operand Definitions.
33//===----------------------------------------------------------------------===// 34 35def brtarget : Operand<OtherVT>; 36 37// CVT conversion modes 38// These must match the enum in NVPTX.h 39def CvtNONE : PatLeaf<(i32 0x0)>; 40def CvtRNI : PatLeaf<(i32 0x1)>; 41def CvtRZI : PatLeaf<(i32 0x2)>; 42def CvtRMI : PatLeaf<(i32 0x3)>; 43def CvtRPI : PatLeaf<(i32 0x4)>; 44def CvtRN : PatLeaf<(i32 0x5)>; 45def CvtRZ : PatLeaf<(i32 0x6)>; 46def CvtRM : PatLeaf<(i32 0x7)>; 47def CvtRP : PatLeaf<(i32 0x8)>; 48def CvtRNA : PatLeaf<(i32 0x9)>; 49 50def CvtNONE_FTZ : PatLeaf<(i32 0x10)>; 51def CvtRNI_FTZ : PatLeaf<(i32 0x11)>; 52def CvtRZI_FTZ : PatLeaf<(i32 0x12)>; 53def CvtRMI_FTZ : PatLeaf<(i32 0x13)>; 54def CvtRPI_FTZ : PatLeaf<(i32 0x14)>; 55def CvtRN_FTZ : PatLeaf<(i32 0x15)>; 56def CvtRZ_FTZ : PatLeaf<(i32 0x16)>; 57def CvtRM_FTZ : PatLeaf<(i32 0x17)>; 58def CvtRP_FTZ : PatLeaf<(i32 0x18)>; 59 60def CvtSAT : PatLeaf<(i32 0x20)>; 61def CvtSAT_FTZ : PatLeaf<(i32 0x30)>; 62 63def CvtNONE_RELU : PatLeaf<(i32 0x40)>; 64def CvtRN_RELU : PatLeaf<(i32 0x45)>; 65def CvtRZ_RELU : PatLeaf<(i32 0x46)>; 66 67def CvtMode : Operand<i32> { 68 let PrintMethod = "printCvtMode"; 69} 70 71// Compare modes 72// These must match the enum in NVPTX.h 73def CmpEQ : PatLeaf<(i32 0)>; 74def CmpNE : PatLeaf<(i32 1)>; 75def CmpLT : PatLeaf<(i32 2)>; 76def CmpLE : PatLeaf<(i32 3)>; 77def CmpGT : PatLeaf<(i32 4)>; 78def CmpGE : PatLeaf<(i32 5)>; 79def CmpLO : PatLeaf<(i32 6)>; 80def CmpLS : PatLeaf<(i32 7)>; 81def CmpHI : PatLeaf<(i32 8)>; 82def CmpHS : PatLeaf<(i32 9)>; 83def CmpEQU : PatLeaf<(i32 10)>; 84def CmpNEU : PatLeaf<(i32 11)>; 85def CmpLTU : PatLeaf<(i32 12)>; 86def CmpLEU : PatLeaf<(i32 13)>; 87def CmpGTU : PatLeaf<(i32 14)>; 88def CmpGEU : PatLeaf<(i32 15)>; 89def CmpNUM : PatLeaf<(i32 16)>; 90def CmpNAN : PatLeaf<(i32 17)>; 91 92def CmpEQ_FTZ : PatLeaf<(i32 0x100)>; 93def CmpNE_FTZ : PatLeaf<(i32 0x101)>; 94def CmpLT_FTZ : PatLeaf<(i32 0x102)>; 95def CmpLE_FTZ : PatLeaf<(i32 0x103)>; 
def CmpGT_FTZ  : PatLeaf<(i32 0x104)>;
def CmpGE_FTZ  : PatLeaf<(i32 0x105)>;
def CmpEQU_FTZ : PatLeaf<(i32 0x10A)>;
def CmpNEU_FTZ : PatLeaf<(i32 0x10B)>;
def CmpLTU_FTZ : PatLeaf<(i32 0x10C)>;
def CmpLEU_FTZ : PatLeaf<(i32 0x10D)>;
def CmpGTU_FTZ : PatLeaf<(i32 0x10E)>;
def CmpGEU_FTZ : PatLeaf<(i32 0x10F)>;
def CmpNUM_FTZ : PatLeaf<(i32 0x110)>;
def CmpNAN_FTZ : PatLeaf<(i32 0x111)>;

// i32 operand carrying a compare mode; printed through printCmpMode.
def CmpMode : Operand<i32> {
  let PrintMethod = "printCmpMode";
}

// i32 operand printed through printVecElement.
def VecElement : Operand<i32> {
  let PrintMethod = "printVecElement";
}

// PRMT modes.
// These must match the enum in NVPTX.h.
def PrmtNONE : PatLeaf<(i32 0x0)>;
def PrmtF4E  : PatLeaf<(i32 0x1)>;
def PrmtB4E  : PatLeaf<(i32 0x2)>;
def PrmtRC8  : PatLeaf<(i32 0x3)>;
def PrmtECL  : PatLeaf<(i32 0x4)>;
def PrmtECR  : PatLeaf<(i32 0x5)>;
def PrmtRC16 : PatLeaf<(i32 0x6)>;

// i32 operand carrying a PRMT mode; printed through printPrmtMode.
def PrmtMode : Operand<i32> {
  let PrintMethod = "printPrmtMode";
}


//===----------------------------------------------------------------------===//
// NVPTX Instruction Predicate Definitions
//===----------------------------------------------------------------------===//

// Subtarget feature queries.
def hasAtomAddF64    : Predicate<"Subtarget->hasAtomAddF64()">;
def hasAtomScope     : Predicate<"Subtarget->hasAtomScope()">;
def hasAtomBitwise64 : Predicate<"Subtarget->hasAtomBitwise64()">;
def hasAtomMinMax64  : Predicate<"Subtarget->hasAtomMinMax64()">;
def hasVote          : Predicate<"Subtarget->hasVote()">;
def hasDouble        : Predicate<"Subtarget->hasDouble()">;
def hasLDG           : Predicate<"Subtarget->hasLDG()">;
def hasLDU           : Predicate<"Subtarget->hasLDU()">;

// Complementary pair keyed on useF32FTZ().
def doF32FTZ   : Predicate<"useF32FTZ()">;
def doNoF32FTZ : Predicate<"!useF32FTZ()">;

def doMulWide : Predicate<"doMulWide">;

// Complementary pairs keyed on allowFMA() and allowUnsafeFPMath().
def allowFMA          : Predicate<"allowFMA()">;
def noFMA             : Predicate<"!allowFMA()">;
def allowUnsafeFPMath : Predicate<"allowUnsafeFPMath()">;
def noUnsafeFPMath    : Predicate<"!allowUnsafeFPMath()">;

def do_DIVF32_APPROX : Predicate<"getDivF32Level()==0">;
def do_DIVF32_FULL   : Predicate<"getDivF32Level()==1">;

def do_SQRTF32_APPROX : Predicate<"!usePrecSqrtF32()">;
def do_SQRTF32_RN     : Predicate<"usePrecSqrtF32()">;

def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">;
def noHWROT32  : Predicate<"!Subtarget->hasHWROT32()">;

// Predicate that always holds.
def True : Predicate<"true">;

// Minimum-version predicates: hold when the subtarget's PTX / SM version is
// at least `version`.
class hasPTX<int version>
    : Predicate<"Subtarget->getPTXVersion() >= " # version>;
class hasSM<int version>
    : Predicate<"Subtarget->getSmVersion() >= " # version>;

// Explicit records for arch-accelerated SM versions.
def hasSM90a : Predicate<"Subtarget->getFullSmVersion() == 901">;

// non-sync shfl instructions are not available on sm_70+ in PTX6.4+
def hasSHFL : Predicate<"!(Subtarget->getSmVersion() >= 70"
                        "&& Subtarget->getPTXVersion() >= 64)">;

def useShortPtr : Predicate<"useShortPointers()">;
def useFP16Math : Predicate<"Subtarget->allowFP16Math()">;
def hasBF16Math : Predicate<"Subtarget->hasBF16Math()">;

// Helper class to aid conversion between ValueType and a matching RegisterClass.

// Maps a ValueType record to a register class.
//
// The lookup is keyed on the record name of T (obtained with !cast<string>),
// and the selected class is exposed as the `ret` field:
//   ValueToRegClass<i32>.ret  ->  Int32Regs
//
// 16-bit scalars (i16, f16, bf16) map to Int16Regs and the packed two-element
// vectors (v2i16, v2f16, v2bf16) map to Int32Regs. The "a*" keys select the
// corresponding *ArgRegs classes.
class ValueToRegClass<ValueType T> {
  string name = !cast<string>(T);
  NVPTXRegClass ret = !cond(
    !eq(name, "i1"): Int1Regs,
    !eq(name, "i16"): Int16Regs,
    !eq(name, "v2i16"): Int32Regs,
    !eq(name, "i32"): Int32Regs,
    !eq(name, "i64"): Int64Regs,
    !eq(name, "f16"): Int16Regs,
    !eq(name, "v2f16"): Int32Regs,
    !eq(name, "bf16"): Int16Regs,
    !eq(name, "v2bf16"): Int32Regs,
    !eq(name, "f32"): Float32Regs,
    !eq(name, "f64"): Float64Regs,
    !eq(name, "ai32"): Int32ArgRegs,
    !eq(name, "ai64"): Int64ArgRegs,
    !eq(name, "af32"): Float32ArgRegs,
    // Was spelled "if64", which broke the ai32/ai64/af32 naming scheme used
    // by the sibling *ArgRegs keys.
    !eq(name, "af64"): Float64ArgRegs,
  );
}


//===----------------------------------------------------------------------===//
// Some Common Instruction Class Templates
//===----------------------------------------------------------------------===//

// Template for instructions which take three int64, int32, or int16 args.
// The instructions are named "<OpcStr><Width>" (e.g. "add.s64").
multiclass I3<string OpcStr, SDNode OpNode> {
  // 64-bit forms: register/register and register/immediate.
  def i64rr :
    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
              OpcStr # "64 \t$dst, $a, $b;",
              [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>;
  def i64ri :
    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
              OpcStr # "64 \t$dst, $a, $b;",
              [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;

  // 32-bit forms. The register operands carry an explicit i32 type in the
  // pattern.
  def i32rr :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # "32 \t$dst, $a, $b;",
              [(set Int32Regs:$dst,
                    (OpNode (i32 Int32Regs:$a), (i32 Int32Regs:$b)))]>;
  def i32ri :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
              OpcStr # "32 \t$dst, $a, $b;",
              [(set Int32Regs:$dst, (OpNode (i32 Int32Regs:$a), imm:$b))]>;

  // 16-bit forms.
  def i16rr :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
              OpcStr # "16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>;
  def i16ri :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
              OpcStr # "16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (imm):$b))]>;
}

// Packed 2 x i16 binary operation held in a 32-bit register. The instruction
// is named "<OpcStr>16x2" and requires PTX 8.0+ and sm_90+.
class I16x2<string OpcStr, SDNode OpNode> :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
            OpcStr # "16x2 \t$dst, $a, $b;",
            [(set Int32Regs:$dst,
                  (OpNode (v2i16 Int32Regs:$a), (v2i16 Int32Regs:$b)))]>,
  Requires<[hasPTX<80>, hasSM<90>]>;

// Template for instructions which take 3 int args. The instructions are
// named "<OpcStr>.s32" or "<OpcStr>.s64" (e.g. "addc.cc.s32").
multiclass ADD_SUB_INT_CARRY<string OpcStr, SDNode OpNode> {
  // Every variant is marked as having side effects.
  let hasSideEffects = 1 in {
    def i32rr :
      NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
                OpcStr # ".s32 \t$dst, $a, $b;",
                [(set Int32Regs:$dst,
                      (OpNode (i32 Int32Regs:$a), (i32 Int32Regs:$b)))]>;
    def i32ri :
      NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
                OpcStr # ".s32 \t$dst, $a, $b;",
                [(set Int32Regs:$dst, (OpNode (i32 Int32Regs:$a), imm:$b))]>;

    // The 64-bit forms additionally require PTX 4.3 or newer.
    def i64rr :
      NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
                OpcStr # ".s64 \t$dst, $a, $b;",
                [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>,
      Requires<[hasPTX<43>]>;
    def i64ri :
      NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
                OpcStr # ".s64 \t$dst, $a, $b;",
                [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>,
      Requires<[hasPTX<43>]>;
  }
}

// Template for instructions which take three floating-point args (fp64, fp32,
// f16, f16x2, bf16 or bf16x2). The instructions are named
// "<OpcStr>.<type>" (e.g. "min.f64").
//
// Also defines ftz (flush subnormal inputs and results to sign-preserving
// zero) variants for every type except fp64.
//
// This multiclass should be used for nodes that cannot be folded into FMAs.
// For nodes that can be folded into FMAs (i.e. adds and muls), use
// F3_fma_component.
multiclass F3<string OpcStr, SDNode OpNode> {
  // ---- f64 ----
  def f64rr :
    NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a, Float64Regs:$b),
              OpcStr # ".f64 \t$dst, $a, $b;",
              [(set Float64Regs:$dst,
                    (OpNode Float64Regs:$a, Float64Regs:$b))]>;
  def f64ri :
    NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a, f64imm:$b),
              OpcStr # ".f64 \t$dst, $a, $b;",
              [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>;

  // ---- f32: the ftz forms come first and are gated on doF32FTZ ----
  def f32rr_ftz :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
              OpcStr # ".ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                    (OpNode Float32Regs:$a, Float32Regs:$b))]>,
    Requires<[doF32FTZ]>;
  def f32ri_ftz :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
              OpcStr # ".ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
    Requires<[doF32FTZ]>;
  def f32rr :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
              OpcStr # ".f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                    (OpNode Float32Regs:$a, Float32Regs:$b))]>;
  def f32ri :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
              OpcStr # ".f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>;

  // ---- f16 (held in Int16Regs), gated on useFP16Math ----
  def f16rr_ftz :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
              OpcStr # ".ftz.f16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst,
                    (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>,
    Requires<[useFP16Math, doF32FTZ]>;
  def f16rr :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
              OpcStr # ".f16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst,
                    (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>,
    Requires<[useFP16Math]>;

  // ---- f16x2 (held in Int32Regs), gated on useFP16Math ----
  def f16x2rr_ftz :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # ".ftz.f16x2 \t$dst, $a, $b;",
              [(set Int32Regs:$dst,
                    (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>,
    Requires<[useFP16Math, doF32FTZ]>;
  def f16x2rr :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # ".f16x2 \t$dst, $a, $b;",
              [(set Int32Regs:$dst,
                    (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>,
    Requires<[useFP16Math]>;

  // ---- bf16 (held in Int16Regs), gated on hasBF16Math ----
  def bf16rr_ftz :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
              OpcStr # ".ftz.bf16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst,
                    (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>,
    Requires<[hasBF16Math, doF32FTZ]>;
  def bf16rr :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
              OpcStr # ".bf16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst,
                    (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>,
    Requires<[hasBF16Math]>;

  // ---- bf16x2 (held in Int32Regs), gated on hasBF16Math ----
  def bf16x2rr_ftz :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # ".ftz.bf16x2 \t$dst, $a, $b;",
              [(set Int32Regs:$dst,
                    (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>,
    Requires<[hasBF16Math, doF32FTZ]>;
  def bf16x2rr :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # ".bf16x2 \t$dst, $a, $b;",
              [(set Int32Regs:$dst,
                    (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>,
    Requires<[hasBF16Math]>;
}

// Template for instructions which take three FP args. The
// instructions are named "<OpcStr>.f<Width>" (e.g. "add.f64").
//
// Also defines ftz (flush subnormal inputs and results to sign-preserving
// zero) variants for fp32/fp16 functions.
//
// This multiclass should be used for nodes that can be folded to make fma ops.
// In this case, we use the ".rn" variant when FMA is disabled, as this behaves
// just like the non ".rn" op, but prevents ptxas from creating FMAs.
multiclass F3_fma_component<string OpcStr, SDNode OpNode> {
  //===--- Variants selected when allowFMA holds (no ".rn" modifier) ---===//

  def f64rr :
    NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a, Float64Regs:$b),
              OpcStr # ".f64 \t$dst, $a, $b;",
              [(set Float64Regs:$dst,
                    (OpNode Float64Regs:$a, Float64Regs:$b))]>,
    Requires<[allowFMA]>;
  def f64ri :
    NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a, f64imm:$b),
              OpcStr # ".f64 \t$dst, $a, $b;",
              [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>,
    Requires<[allowFMA]>;

  def f32rr_ftz :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
              OpcStr # ".ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                    (OpNode Float32Regs:$a, Float32Regs:$b))]>,
    Requires<[allowFMA, doF32FTZ]>;
  def f32ri_ftz :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
              OpcStr # ".ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
    Requires<[allowFMA, doF32FTZ]>;
  def f32rr :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
              OpcStr # ".f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                    (OpNode Float32Regs:$a, Float32Regs:$b))]>,
    Requires<[allowFMA]>;
  def f32ri :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
              OpcStr # ".f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
    Requires<[allowFMA]>;

  def f16rr_ftz :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
              OpcStr # ".ftz.f16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst,
                    (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>,
    Requires<[useFP16Math, allowFMA, doF32FTZ]>;
  def f16rr :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
              OpcStr # ".f16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst,
                    (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>,
    Requires<[useFP16Math, allowFMA]>;

  // Note: the ftz form types its result as v2f16 in the pattern; the non-ftz
  // form leaves the result untyped.
  def f16x2rr_ftz :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # ".ftz.f16x2 \t$dst, $a, $b;",
              [(set (v2f16 Int32Regs:$dst),
                    (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>,
    Requires<[useFP16Math, allowFMA, doF32FTZ]>;
  def f16x2rr :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # ".f16x2 \t$dst, $a, $b;",
              [(set Int32Regs:$dst,
                    (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>,
    Requires<[useFP16Math, allowFMA]>;

  def bf16rr_ftz :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
              OpcStr # ".ftz.bf16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst,
                    (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>,
    Requires<[hasBF16Math, allowFMA, doF32FTZ]>;
  def bf16rr :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
              OpcStr # ".bf16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst,
                    (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>,
    Requires<[hasBF16Math, allowFMA]>;

  // Note: as with f16x2, only the ftz form types its result (v2bf16).
  def bf16x2rr_ftz :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # ".ftz.bf16x2 \t$dst, $a, $b;",
              [(set (v2bf16 Int32Regs:$dst),
                    (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>,
    Requires<[hasBF16Math, allowFMA, doF32FTZ]>;
  def bf16x2rr :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # ".bf16x2 \t$dst, $a, $b;",
              [(set Int32Regs:$dst,
                    (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>,
    Requires<[hasBF16Math, allowFMA]>;

  //===--- Variants selected when noFMA holds (".rn" modifier) ---===//

  // These have strange names so we don't perturb existing mir tests.
  def _rnf64rr :
    NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a, Float64Regs:$b),
              OpcStr # ".rn.f64 \t$dst, $a, $b;",
              [(set Float64Regs:$dst,
                    (OpNode Float64Regs:$a, Float64Regs:$b))]>,
    Requires<[noFMA]>;
  def _rnf64ri :
    NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a, f64imm:$b),
              OpcStr # ".rn.f64 \t$dst, $a, $b;",
              [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>,
    Requires<[noFMA]>;

  def _rnf32rr_ftz :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
              OpcStr # ".rn.ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                    (OpNode Float32Regs:$a, Float32Regs:$b))]>,
    Requires<[noFMA, doF32FTZ]>;
  def _rnf32ri_ftz :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
              OpcStr # ".rn.ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
    Requires<[noFMA, doF32FTZ]>;
  def _rnf32rr :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
              OpcStr # ".rn.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                    (OpNode Float32Regs:$a, Float32Regs:$b))]>,
    Requires<[noFMA]>;
  def _rnf32ri :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
              OpcStr # ".rn.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
    Requires<[noFMA]>;

  def _rnf16rr_ftz :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
              OpcStr # ".rn.ftz.f16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst,
                    (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>,
    Requires<[useFP16Math, noFMA, doF32FTZ]>;
  def _rnf16rr :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
              OpcStr # ".rn.f16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst,
                    (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>,
    Requires<[useFP16Math, noFMA]>;

  def _rnf16x2rr_ftz :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # ".rn.ftz.f16x2 \t$dst, $a, $b;",
              [(set Int32Regs:$dst,
                    (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>,
    Requires<[useFP16Math, noFMA, doF32FTZ]>;
  def _rnf16x2rr :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # ".rn.f16x2 \t$dst, $a, $b;",
              [(set Int32Regs:$dst,
                    (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>,
    Requires<[useFP16Math, noFMA]>;

  def _rnbf16rr_ftz :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
              OpcStr # ".rn.ftz.bf16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst,
                    (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>,
    Requires<[hasBF16Math, noFMA, doF32FTZ]>;
  def _rnbf16rr :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
              OpcStr # ".rn.bf16 \t$dst, $a, $b;",
              [(set Int16Regs:$dst,
                    (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>,
    Requires<[hasBF16Math, noFMA]>;

  def _rnbf16x2rr_ftz :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # ".rn.ftz.bf16x2 \t$dst, $a, $b;",
              [(set Int32Regs:$dst,
                    (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>,
    Requires<[hasBF16Math, noFMA, doF32FTZ]>;
  def _rnbf16x2rr :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # ".rn.bf16x2 \t$dst, $a, $b;",
              [(set Int32Regs:$dst,
                    (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>,
    Requires<[hasBF16Math, noFMA]>;
}

// Template for operations which take two f32 or f64 operands. Provides three
// instructions: <OpcStr>.f64, <OpcStr>.f32, and <OpcStr>.ftz.f32 (flush
// subnormal inputs and results to zero).
multiclass F2<string OpcStr, SDNode OpNode> {
  def f64 :
    NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a),
              OpcStr # ".f64 \t$dst, $a;",
              [(set Float64Regs:$dst, (OpNode Float64Regs:$a))]>;
  // The ftz form is defined ahead of the plain f32 form and is gated on
  // doF32FTZ.
  def f32_ftz :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
              OpcStr # ".ftz.f32 \t$dst, $a;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>,
    Requires<[doF32FTZ]>;
  def f32 :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
              OpcStr # ".f32 \t$dst, $a;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>;
}

// Single-source counterpart of F2 for the 16-bit types: bf16/bf16x2 (sm_80+,
// PTX 7.0+) and f16/f16x2 (sm_53+, PTX 6.5+), the latter also with ftz forms.
// Scalars live in Int16Regs and the packed x2 types in Int32Regs.
multiclass F2_Support_Half<string OpcStr, SDNode OpNode> {
  def bf16 :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a),
              OpcStr # ".bf16 \t$dst, $a;",
              [(set Int16Regs:$dst, (OpNode (bf16 Int16Regs:$a)))]>,
    Requires<[hasSM<80>, hasPTX<70>]>;
  def bf16x2 :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a),
              OpcStr # ".bf16x2 \t$dst, $a;",
              [(set Int32Regs:$dst, (OpNode (v2bf16 Int32Regs:$a)))]>,
    Requires<[hasSM<80>, hasPTX<70>]>;
  def f16_ftz :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a),
              OpcStr # ".ftz.f16 \t$dst, $a;",
              [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a)))]>,
    Requires<[hasSM<53>, hasPTX<65>, doF32FTZ]>;
  def f16x2_ftz :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a),
              OpcStr # ".ftz.f16x2 \t$dst, $a;",
              [(set Int32Regs:$dst, (OpNode (v2f16 Int32Regs:$a)))]>,
    Requires<[hasSM<53>, hasPTX<65>, doF32FTZ]>;
  def f16 :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a),
              OpcStr # ".f16 \t$dst, $a;",
              [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a)))]>,
    Requires<[hasSM<53>, hasPTX<65>]>;
  def f16x2 :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a),
              OpcStr # ".f16x2 \t$dst, $a;",
              [(set Int32Regs:$dst, (OpNode (v2f16 Int32Regs:$a)))]>,
    Requires<[hasSM<53>, hasPTX<65>]>;

}

589//===----------------------------------------------------------------------===// 590// NVPTX Instructions. 591//===----------------------------------------------------------------------===// 592 593//----------------------------------- 594// Type Conversion 595//----------------------------------- 596 597let hasSideEffects = false in { 598 // Generate a cvt to the given type from all possible types. Each instance 599 // takes a CvtMode immediate that defines the conversion mode to use. It can 600 // be CvtNONE to omit a conversion mode. 601 multiclass CVT_FROM_ALL<string ToType, RegisterClass RC, list<Predicate> Preds = []> { 602 def _s8 : 603 NVPTXInst<(outs RC:$dst), 604 (ins Int16Regs:$src, CvtMode:$mode), 605 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", 606 ToType, ".s8 \t$dst, $src;"), []>, 607 Requires<Preds>; 608 def _u8 : 609 NVPTXInst<(outs RC:$dst), 610 (ins Int16Regs:$src, CvtMode:$mode), 611 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", 612 ToType, ".u8 \t$dst, $src;"), []>, 613 Requires<Preds>; 614 def _s16 : 615 NVPTXInst<(outs RC:$dst), 616 (ins Int16Regs:$src, CvtMode:$mode), 617 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", 618 ToType, ".s16 \t$dst, $src;"), []>, 619 Requires<Preds>; 620 def _u16 : 621 NVPTXInst<(outs RC:$dst), 622 (ins Int16Regs:$src, CvtMode:$mode), 623 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", 624 ToType, ".u16 \t$dst, $src;"), []>, 625 Requires<Preds>; 626 def _s32 : 627 NVPTXInst<(outs RC:$dst), 628 (ins Int32Regs:$src, CvtMode:$mode), 629 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", 630 ToType, ".s32 \t$dst, $src;"), []>, 631 Requires<Preds>; 632 def _u32 : 633 NVPTXInst<(outs RC:$dst), 634 (ins Int32Regs:$src, CvtMode:$mode), 635 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", 636 ToType, ".u32 \t$dst, $src;"), []>, 637 Requires<Preds>; 638 def _s64 : 639 NVPTXInst<(outs RC:$dst), 640 (ins Int64Regs:$src, CvtMode:$mode), 641 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", 
642 ToType, ".s64 \t$dst, $src;"), []>, 643 Requires<Preds>; 644 def _u64 : 645 NVPTXInst<(outs RC:$dst), 646 (ins Int64Regs:$src, CvtMode:$mode), 647 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", 648 ToType, ".u64 \t$dst, $src;"), []>, 649 Requires<Preds>; 650 def _f16 : 651 NVPTXInst<(outs RC:$dst), 652 (ins Int16Regs:$src, CvtMode:$mode), 653 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", 654 ToType, ".f16 \t$dst, $src;"), []>, 655 Requires<Preds>; 656 def _bf16 : 657 NVPTXInst<(outs RC:$dst), 658 (ins Int16Regs:$src, CvtMode:$mode), 659 !strconcat("cvt${mode:base}${mode:ftz}${mode:relu}${mode:sat}.", 660 ToType, ".bf16 \t$dst, $src;"), []>, 661 Requires<!if(!eq(ToType, "f32"), 662 // bf16->f32 was introduced early. 663 [hasPTX<71>, hasSM<80>], 664 // bf16->everything else needs sm90/ptx78 665 [hasPTX<78>, hasSM<90>])>; 666 def _f32 : 667 NVPTXInst<(outs RC:$dst), 668 (ins Float32Regs:$src, CvtMode:$mode), 669 !strconcat("cvt${mode:base}${mode:ftz}${mode:relu}${mode:sat}.", 670 ToType, ".f32 \t$dst, $src;"), []>, 671 Requires<!if(!eq(ToType, "bf16"), 672 // f32->bf16 was introduced early. 673 [hasPTX<70>, hasSM<80>], 674 Preds)>; 675 def _f64 : 676 NVPTXInst<(outs RC:$dst), 677 (ins Float64Regs:$src, CvtMode:$mode), 678 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", 679 ToType, ".f64 \t$dst, $src;"), []>, 680 Requires<Preds>; 681 } 682 683 // Generate cvts from all types to all types. 
684 defm CVT_s8 : CVT_FROM_ALL<"s8", Int16Regs>; 685 defm CVT_u8 : CVT_FROM_ALL<"u8", Int16Regs>; 686 defm CVT_s16 : CVT_FROM_ALL<"s16", Int16Regs>; 687 defm CVT_u16 : CVT_FROM_ALL<"u16", Int16Regs>; 688 defm CVT_s32 : CVT_FROM_ALL<"s32", Int32Regs>; 689 defm CVT_u32 : CVT_FROM_ALL<"u32", Int32Regs>; 690 defm CVT_s64 : CVT_FROM_ALL<"s64", Int64Regs>; 691 defm CVT_u64 : CVT_FROM_ALL<"u64", Int64Regs>; 692 defm CVT_f16 : CVT_FROM_ALL<"f16", Int16Regs>; 693 defm CVT_bf16 : CVT_FROM_ALL<"bf16", Int16Regs, [hasPTX<78>, hasSM<90>]>; 694 defm CVT_f32 : CVT_FROM_ALL<"f32", Float32Regs>; 695 defm CVT_f64 : CVT_FROM_ALL<"f64", Float64Regs>; 696 697 // These cvts are different from those above: The source and dest registers 698 // are of the same type. 699 def CVT_INREG_s16_s8 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src), 700 "cvt.s16.s8 \t$dst, $src;", []>; 701 def CVT_INREG_s32_s8 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src), 702 "cvt.s32.s8 \t$dst, $src;", []>; 703 def CVT_INREG_s32_s16 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src), 704 "cvt.s32.s16 \t$dst, $src;", []>; 705 def CVT_INREG_s64_s8 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src), 706 "cvt.s64.s8 \t$dst, $src;", []>; 707 def CVT_INREG_s64_s16 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src), 708 "cvt.s64.s16 \t$dst, $src;", []>; 709 def CVT_INREG_s64_s32 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src), 710 "cvt.s64.s32 \t$dst, $src;", []>; 711 712 multiclass CVT_FROM_FLOAT_V2_SM80<string FromName, RegisterClass RC> { 713 def _f32 : 714 NVPTXInst<(outs RC:$dst), 715 (ins Float32Regs:$src1, Float32Regs:$src2, CvtMode:$mode), 716 !strconcat("cvt${mode:base}${mode:relu}.", 717 FromName, ".f32 \t$dst, $src1, $src2;"), []>, 718 Requires<[hasPTX<70>, hasSM<80>]>; 719 } 720 721 defm CVT_f16x2 : CVT_FROM_FLOAT_V2_SM80<"f16x2", Int32Regs>; 722 defm CVT_bf16x2 : CVT_FROM_FLOAT_V2_SM80<"bf16x2", Int32Regs>; 723} 724 725//----------------------------------- 726// 
Selection instructions (selp) 727//----------------------------------- 728 729// TODO: Missing slct 730 731// selp instructions that don't have any pattern matches; we explicitly use 732// them within this file. 733let hasSideEffects = false in { 734 multiclass SELP<string TypeStr, RegisterClass RC, Operand ImmCls> { 735 def rr : NVPTXInst<(outs RC:$dst), 736 (ins RC:$a, RC:$b, Int1Regs:$p), 737 !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>; 738 def ri : NVPTXInst<(outs RC:$dst), 739 (ins RC:$a, ImmCls:$b, Int1Regs:$p), 740 !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>; 741 def ir : NVPTXInst<(outs RC:$dst), 742 (ins ImmCls:$a, RC:$b, Int1Regs:$p), 743 !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>; 744 def ii : NVPTXInst<(outs RC:$dst), 745 (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p), 746 !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>; 747 } 748 749 multiclass SELP_PATTERN<string TypeStr, ValueType T, RegisterClass RC, 750 Operand ImmCls, SDNode ImmNode> { 751 def rr : 752 NVPTXInst<(outs RC:$dst), 753 (ins RC:$a, RC:$b, Int1Regs:$p), 754 !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), 755 [(set (T RC:$dst), (select Int1Regs:$p, (T RC:$a), (T RC:$b)))]>; 756 def ri : 757 NVPTXInst<(outs RC:$dst), 758 (ins RC:$a, ImmCls:$b, Int1Regs:$p), 759 !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), 760 [(set (T RC:$dst), (select Int1Regs:$p, (T RC:$a), (T ImmNode:$b)))]>; 761 def ir : 762 NVPTXInst<(outs RC:$dst), 763 (ins ImmCls:$a, RC:$b, Int1Regs:$p), 764 !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), 765 [(set (T RC:$dst), (select Int1Regs:$p, ImmNode:$a, (T RC:$b)))]>; 766 def ii : 767 NVPTXInst<(outs RC:$dst), 768 (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p), 769 !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), 770 [(set (T RC:$dst), (select Int1Regs:$p, ImmNode:$a, ImmNode:$b))]>; 771 } 772} 773 774// Don't pattern match on selp.{s,u}{16,32,64} -- selp.b{16,32,64} is just as 775// good. 
// 16-bit selects.
defm SELP_b16 : SELP_PATTERN<"b16", i16, Int16Regs, i16imm, imm>;
defm SELP_s16 : SELP<"s16", Int16Regs, i16imm>;
defm SELP_u16 : SELP<"u16", Int16Regs, i16imm>;
// 32-bit selects.
defm SELP_b32 : SELP_PATTERN<"b32", i32, Int32Regs, i32imm, imm>;
defm SELP_s32 : SELP<"s32", Int32Regs, i32imm>;
defm SELP_u32 : SELP<"u32", Int32Regs, i32imm>;
// 64-bit selects.
defm SELP_b64 : SELP_PATTERN<"b64", i64, Int64Regs, i64imm, imm>;
defm SELP_s64 : SELP<"s64", Int64Regs, i64imm>;
defm SELP_u64 : SELP<"u64", Int64Regs, i64imm>;
// f16 and bf16 selects are emitted as selp.b16 on Int16Regs.
defm SELP_f16  : SELP_PATTERN<"b16", f16, Int16Regs, f16imm, fpimm>;
defm SELP_bf16 : SELP_PATTERN<"b16", bf16, Int16Regs, bf16imm, fpimm>;

defm SELP_f32 : SELP_PATTERN<"f32", f32, Float32Regs, f32imm, fpimm>;
defm SELP_f64 : SELP_PATTERN<"f64", f64, Float64Regs, f64imm, fpimm>;

// This does not work as tablegen fails to infer the type of 'imm'.
// def v2f16imm : Operand<v2f16>;
// defm SELP_f16x2 : SELP_PATTERN<"b32", v2f16, Int32Regs, v2f16imm, imm>;

// Register/register selects of the packed 32-bit vector types reuse
// SELP_b32rr.
foreach vt = [v2f16, v2bf16, v2i16, v4i8] in {
  def : Pat<(vt (select Int1Regs:$p, (vt Int32Regs:$a), (vt Int32Regs:$b))),
            (SELP_b32rr Int32Regs:$a, Int32Regs:$b, Int1Regs:$p)>;
}

//-----------------------------------
// Test Instructions
//-----------------------------------

// testp.infinite for f32/f64, with a register (r) or immediate (i) source.
// No selection patterns are attached.
def TESTINF_f32r :
  NVPTXInst<(outs Int1Regs:$p), (ins Float32Regs:$a),
            "testp.infinite.f32 \t$p, $a;", []>;
def TESTINF_f32i :
  NVPTXInst<(outs Int1Regs:$p), (ins f32imm:$a),
            "testp.infinite.f32 \t$p, $a;", []>;
def TESTINF_f64r :
  NVPTXInst<(outs Int1Regs:$p), (ins Float64Regs:$a),
            "testp.infinite.f64 \t$p, $a;", []>;
def TESTINF_f64i :
  NVPTXInst<(outs Int1Regs:$p), (ins f64imm:$a),
            "testp.infinite.f64 \t$p, $a;", []>;

//-----------------------------------
// Integer Arithmetic
//-----------------------------------

// Template for xor masquerading as int1 arithmetic.
// OpNode is the DAG node to match (add or sub); both variants print
// xor.pred.
multiclass ADD_SUB_i1<SDNode OpNode> {
  def _rr :
    NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
              "xor.pred \t$dst, $a, $b;",
              [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>;
  def _ri :
    NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
              "xor.pred \t$dst, $a, $b;",
              [(set Int1Regs:$dst, (OpNode Int1Regs:$a, (imm):$b))]>;
}

// int1 addition and subtraction are both just xor.
defm ADD_i1 : ADD_SUB_i1<add>;
defm SUB_i1 : ADD_SUB_i1<sub>;

// int16, int32, and int64 signed addition.  Since nvptx is 2's complement, we
// also use these for unsigned arithmetic.
defm ADD : I3<"add.s", add>;
defm SUB : I3<"sub.s", sub>;

// Packed 16x2 addition and subtraction.
def ADD16x2 : I16x2<"add.s", add>;
def SUB16x2 : I16x2<"sub.s", sub>;

// int32 and int64 addition and subtraction with carry-out.
defm ADDCC : ADD_SUB_INT_CARRY<"add.cc", addc>;
defm SUBCC : ADD_SUB_INT_CARRY<"sub.cc", subc>;

// int32 and int64 addition and subtraction with carry-in and carry-out.
defm ADDCCC : ADD_SUB_INT_CARRY<"addc.cc", adde>;
defm SUBCCC : ADD_SUB_INT_CARRY<"subc.cc", sube>;

// Low half of the product.
defm MULT : I3<"mul.lo.s", mul>;

// High half of the product, signed and unsigned.
defm MULTHS : I3<"mul.hi.s", mulhs>;
defm MULTHU : I3<"mul.hi.u", mulhu>;

defm SDIV : I3<"div.s", sdiv>;
defm UDIV : I3<"div.u", udiv>;

// The ri versions of rem.s and rem.u won't be selected; DAGCombiner::visitSREM
// will lower it.
defm SREM : I3<"rem.s", srem>;
defm UREM : I3<"rem.u", urem>;

// Integer absolute value.  The ABS multiclass that follows matches the `abs`
// node and prints the PTX abs instruction directly; it takes the value type,
// the register class and the type suffix (e.g. ".s32").
// T is the value type, RC the register class holding it and SizeName the
// type suffix appended to "abs" (including the leading dot).
multiclass ABS<ValueType T, RegisterClass RC, string SizeName> {
  def : NVPTXInst<(outs RC:$dst), (ins RC:$a),
                  "abs" # SizeName # " \t$dst, $a;",
                  [(set (T RC:$dst), (abs (T RC:$a)))]>;
}
defm ABS_16 : ABS<i16, Int16Regs, ".s16">;
defm ABS_32 : ABS<i32, Int32Regs, ".s32">;
defm ABS_64 : ABS<i64, Int64Regs, ".s64">;

// Integer min/max.
defm SMAX : I3<"max.s", smax>;
defm UMAX : I3<"max.u", umax>;
defm SMIN : I3<"min.s", smin>;
defm UMIN : I3<"min.u", umin>;

// Packed 16x2 integer min/max.
def SMAX16x2 : I16x2<"max.s", smax>;
def UMAX16x2 : I16x2<"max.u", umax>;
def SMIN16x2 : I16x2<"min.s", smin>;
def UMIN16x2 : I16x2<"min.u", umin>;


//
// Wide multiplication
//
// The instructions below have no patterns of their own; they are selected by
// the explicit Pat<> records that follow.  Each flavour comes in three forms:
// register x register, register x immediate of the source width ("Imm"), and
// register x immediate of the result width ("Imm64" / "Imm32").

// 32 x 32 -> 64, signed.
def MULWIDES64 :
  NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
            "mul.wide.s32 \t$dst, $a, $b;", []>;
def MULWIDES64Imm :
  NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
            "mul.wide.s32 \t$dst, $a, $b;", []>;
def MULWIDES64Imm64 :
  NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
            "mul.wide.s32 \t$dst, $a, $b;", []>;

// 32 x 32 -> 64, unsigned.
def MULWIDEU64 :
  NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
            "mul.wide.u32 \t$dst, $a, $b;", []>;
def MULWIDEU64Imm :
  NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
            "mul.wide.u32 \t$dst, $a, $b;", []>;
def MULWIDEU64Imm64 :
  NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
            "mul.wide.u32 \t$dst, $a, $b;", []>;

// 16 x 16 -> 32, signed.
def MULWIDES32 :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
            "mul.wide.s16 \t$dst, $a, $b;", []>;
def MULWIDES32Imm :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
            "mul.wide.s16 \t$dst, $a, $b;", []>;
def MULWIDES32Imm32 :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
            "mul.wide.s16 \t$dst, $a, $b;", []>;

// 16 x 16 -> 32, unsigned.
def MULWIDEU32 :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
            "mul.wide.u16 \t$dst, $a, $b;", []>;
def MULWIDEU32Imm :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
            "mul.wide.u16 \t$dst, $a, $b;", []>;
def MULWIDEU32Imm32 :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
            "mul.wide.u16 \t$dst, $a, $b;", []>;

// Type profile of the target mul.wide nodes: one result, two operands that
// must have the same type as each other.
def SDTMulWide : SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>]>;
def mul_wide_signed : SDNode<"NVPTXISD::MUL_WIDE_SIGNED", SDTMulWide>;
def mul_wide_unsigned : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide>;

// Matchers for signed, unsigned mul.wide ISD nodes.
// 16-bit sources, 32-bit result.
def : Pat<(i32 (mul_wide_signed i16:$a, i16:$b)),
          (MULWIDES32 i16:$a, i16:$b)>,
      Requires<[doMulWide]>;
def : Pat<(i32 (mul_wide_signed Int16Regs:$a, imm:$b)),
          (MULWIDES32Imm Int16Regs:$a, imm:$b)>,
      Requires<[doMulWide]>;
def : Pat<(i32 (mul_wide_unsigned i16:$a, i16:$b)),
          (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>,
      Requires<[doMulWide]>;
def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, imm:$b)),
          (MULWIDEU32Imm Int16Regs:$a, imm:$b)>,
      Requires<[doMulWide]>;

// 32-bit sources, 64-bit result.
def : Pat<(i64 (mul_wide_signed i32:$a, i32:$b)),
          (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>,
      Requires<[doMulWide]>;
def : Pat<(i64 (mul_wide_signed (i32 Int32Regs:$a), imm:$b)),
          (MULWIDES64Imm Int32Regs:$a, imm:$b)>,
      Requires<[doMulWide]>;
def : Pat<(i64 (mul_wide_unsigned i32:$a, i32:$b)),
          (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>,
      Requires<[doMulWide]>;
def : Pat<(i64 (mul_wide_unsigned (i32 Int32Regs:$a), imm:$b)),
          (MULWIDEU64Imm Int32Regs:$a, imm:$b)>,
      Requires<[doMulWide]>;

// Predicates used for converting some patterns to mul.wide.
// Immediate that fits in a signed 32-bit integer.
def SInt32Const : PatLeaf<(imm), [{
  return N->getAPIntValue().isSignedIntN(32);
}]>;

// Immediate that fits in an unsigned 32-bit integer.
def UInt32Const : PatLeaf<(imm), [{
  return N->getAPIntValue().isIntN(32);
}]>;

// Immediate that fits in a signed 16-bit integer.
def SInt16Const : PatLeaf<(imm), [{
  return N->getAPIntValue().isSignedIntN(16);
}]>;

// Immediate that fits in an unsigned 16-bit integer.
def UInt16Const : PatLeaf<(imm), [{
  return N->getAPIntValue().isIntN(16);
}]>;

def IntConst_0_30 : PatLeaf<(imm), [{
  // Check if 0 <= ShAmt < 31; only then will the result of (x << ShAmt) be
  // an int32.
  const APInt &ShAmt = N->getAPIntValue();
  return ShAmt.sge(0) && ShAmt.slt(31);
}]>;

def IntConst_0_14 : PatLeaf<(imm), [{
  // Check if 0 <= ShAmt < 15; only then will the result of (x << ShAmt) be
  // an int16.
  const APInt &ShAmt = N->getAPIntValue();
  return ShAmt.sge(0) && ShAmt.slt(15);
}]>;

// Turns a shift amount into the equivalent multiplier (1 << amount) as an
// i32 target constant.
def SHL2MUL32 : SDNodeXForm<imm, [{
  APInt One(32, 1);
  return CurDAG->getTargetConstant(One.shl(N->getAPIntValue()), SDLoc(N),
                                   MVT::i32);
}]>;

// Turns a shift amount into the equivalent multiplier (1 << amount) as an
// i16 target constant.
def SHL2MUL16 : SDNodeXForm<imm, [{
  APInt One(16, 1);
  return CurDAG->getTargetConstant(One.shl(N->getAPIntValue()), SDLoc(N),
                                   MVT::i16);
}]>;

// Convert "sign/zero-extend, then shift left by an immediate" to mul.wide.
// The shift amount is rewritten into a multiplier by SHL2MUL32 / SHL2MUL16.
// 32-bit source, 64-bit result.
def : Pat<(shl (sext Int32Regs:$a), (i32 IntConst_0_30:$b)),
          (MULWIDES64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>,
      Requires<[doMulWide]>;
def : Pat<(shl (zext Int32Regs:$a), (i32 IntConst_0_30:$b)),
          (MULWIDEU64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>,
      Requires<[doMulWide]>;

// 16-bit source, 32-bit result.
def : Pat<(shl (sext Int16Regs:$a), (i16 IntConst_0_14:$b)),
          (MULWIDES32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>,
      Requires<[doMulWide]>;
def : Pat<(shl (zext Int16Regs:$a), (i16 IntConst_0_14:$b)),
          (MULWIDEU32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>,
      Requires<[doMulWide]>;

// Convert "sign/zero-extend then multiply" to mul.wide.
// Signed 32 x 32 -> 64.
def : Pat<(mul (sext Int32Regs:$a), (sext Int32Regs:$b)),
          (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>,
      Requires<[doMulWide]>;
def : Pat<(mul (sext Int32Regs:$a), (i64 SInt32Const:$b)),
          (MULWIDES64Imm64 Int32Regs:$a, (i64 SInt32Const:$b))>,
      Requires<[doMulWide]>;

// Unsigned 32 x 32 -> 64.
def : Pat<(mul (zext Int32Regs:$a), (zext Int32Regs:$b)),
          (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>,
      Requires<[doMulWide]>;
def : Pat<(mul (zext Int32Regs:$a), (i64 UInt32Const:$b)),
          (MULWIDEU64Imm64 Int32Regs:$a, (i64 UInt32Const:$b))>,
      Requires<[doMulWide]>;

// Signed 16 x 16 -> 32.
def : Pat<(mul (sext Int16Regs:$a), (sext Int16Regs:$b)),
          (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>,
      Requires<[doMulWide]>;
def : Pat<(mul (sext Int16Regs:$a), (i32 SInt16Const:$b)),
          (MULWIDES32Imm32 Int16Regs:$a, (i32 SInt16Const:$b))>,
      Requires<[doMulWide]>;

// Unsigned 16 x 16 -> 32.
def : Pat<(mul (zext Int16Regs:$a), (zext Int16Regs:$b)),
          (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>,
      Requires<[doMulWide]>;
def : Pat<(mul (zext Int16Regs:$a), (i32 UInt16Const:$b)),
          (MULWIDEU32Imm32 Int16Regs:$a, (i32 UInt16Const:$b))>,
      Requires<[doMulWide]>;

//
// Integer multiply-add
//
// imad has one integer result and three operands, all of the same type.
def SDTIMAD :
  SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisInt<0>, SDTCisInt<2>,
                       SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>]>;
def imad : SDNode<"NVPTXISD::IMAD", SDTIMAD>;

// mad.lo.s16.  The suffix gives the kind of $a, $b, $c (r = register,
// i = immediate); $a is always a register.
def MAD16rrr :
  NVPTXInst<(outs Int16Regs:$dst),
            (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c),
            "mad.lo.s16 \t$dst, $a, $b, $c;",
            [(set Int16Regs:$dst,
                  (imad Int16Regs:$a, Int16Regs:$b, Int16Regs:$c))]>;
def MAD16rri :
  NVPTXInst<(outs Int16Regs:$dst),
            (ins Int16Regs:$a, Int16Regs:$b, i16imm:$c),
            "mad.lo.s16 \t$dst, $a, $b, $c;",
            [(set Int16Regs:$dst,
                  (imad Int16Regs:$a, Int16Regs:$b, imm:$c))]>;
def MAD16rir :
  NVPTXInst<(outs Int16Regs:$dst),
            (ins Int16Regs:$a, i16imm:$b, Int16Regs:$c),
            "mad.lo.s16 \t$dst, $a, $b, $c;",
            [(set Int16Regs:$dst,
                  (imad Int16Regs:$a, imm:$b, Int16Regs:$c))]>;
def MAD16rii :
  NVPTXInst<(outs Int16Regs:$dst),
            (ins Int16Regs:$a, i16imm:$b, i16imm:$c),
            "mad.lo.s16 \t$dst, $a, $b, $c;",
            [(set Int16Regs:$dst,
                  (imad Int16Regs:$a, imm:$b, imm:$c))]>;

// mad.lo.s32.  Register operands are typed explicitly as i32 in the patterns.
def MAD32rrr :
  NVPTXInst<(outs Int32Regs:$dst),
            (ins Int32Regs:$a, Int32Regs:$b, Int32Regs:$c),
            "mad.lo.s32 \t$dst, $a, $b, $c;",
            [(set (i32 Int32Regs:$dst),
                  (imad (i32 Int32Regs:$a), (i32 Int32Regs:$b),
                        (i32 Int32Regs:$c)))]>;
def MAD32rri :
  NVPTXInst<(outs Int32Regs:$dst),
            (ins Int32Regs:$a, Int32Regs:$b, i32imm:$c),
            "mad.lo.s32 \t$dst, $a, $b, $c;",
            [(set (i32 Int32Regs:$dst),
                  (imad (i32 Int32Regs:$a), (i32 Int32Regs:$b), imm:$c))]>;
def MAD32rir :
  NVPTXInst<(outs Int32Regs:$dst),
            (ins Int32Regs:$a, i32imm:$b, Int32Regs:$c),
            "mad.lo.s32 \t$dst, $a, $b, $c;",
            [(set (i32 Int32Regs:$dst),
                  (imad (i32 Int32Regs:$a), imm:$b, (i32 Int32Regs:$c)))]>;
def MAD32rii :
  NVPTXInst<(outs Int32Regs:$dst),
            (ins Int32Regs:$a, i32imm:$b, i32imm:$c),
            "mad.lo.s32 \t$dst, $a, $b, $c;",
            [(set (i32 Int32Regs:$dst),
                  (imad (i32 Int32Regs:$a), imm:$b, imm:$c))]>;

// mad.lo.s64.
def MAD64rrr :
  NVPTXInst<(outs Int64Regs:$dst),
            (ins Int64Regs:$a, Int64Regs:$b, Int64Regs:$c),
            "mad.lo.s64 \t$dst, $a, $b, $c;",
            [(set Int64Regs:$dst,
                  (imad Int64Regs:$a, Int64Regs:$b, Int64Regs:$c))]>;
def MAD64rri :
  NVPTXInst<(outs Int64Regs:$dst),
            (ins Int64Regs:$a, Int64Regs:$b, i64imm:$c),
            "mad.lo.s64 \t$dst, $a, $b, $c;",
            [(set Int64Regs:$dst,
                  (imad Int64Regs:$a, Int64Regs:$b, imm:$c))]>;
def MAD64rir :
  NVPTXInst<(outs Int64Regs:$dst),
            (ins Int64Regs:$a, i64imm:$b, Int64Regs:$c),
            "mad.lo.s64 \t$dst, $a, $b, $c;",
            [(set Int64Regs:$dst,
                  (imad Int64Regs:$a, imm:$b, Int64Regs:$c))]>;
def MAD64rii :
  NVPTXInst<(outs Int64Regs:$dst),
            (ins Int64Regs:$a, i64imm:$b, i64imm:$c),
            "mad.lo.s64 \t$dst, $a, $b, $c;",
            [(set Int64Regs:$dst,
                  (imad Int64Regs:$a, imm:$b, imm:$c))]>;

// Integer negation.
def INEG16 :
  NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
            "neg.s16 \t$dst, $src;",
            [(set Int16Regs:$dst, (ineg Int16Regs:$src))]>;
def INEG32 :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
            "neg.s32 \t$dst, $src;",
            [(set (i32 Int32Regs:$dst), (ineg (i32 Int32Regs:$src)))]>;
def INEG64 :
  NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
            "neg.s64 \t$dst, $src;",
            [(set Int64Regs:$dst, (ineg Int64Regs:$src))]>;

//-----------------------------------
// Floating Point Arithmetic
//-----------------------------------

// Constant 1.0f
def FloatConst1 : PatLeaf<(fpimm), [{
  return &N->getValueAPF().getSemantics() == &llvm::APFloat::IEEEsingle() &&
         N->getValueAPF().convertToFloat() == 1.0f;
}]>;
// Constant 1.0 (double)
def DoubleConst1 : PatLeaf<(fpimm), [{
  return &N->getValueAPF().getSemantics() == &llvm::APFloat::IEEEdouble() &&
         N->getValueAPF().convertToDouble() == 1.0;
}]>;

// Loads FP16 constant into a register.
//
// ptxas does not have hex representation for fp16, so we can't use
// fp16 immediate values in .f16 instructions. Instead we have to load
// the constant into a register using mov.b16.
def LOAD_CONST_F16 :
  NVPTXInst<(outs Int16Regs:$dst), (ins f16imm:$a),
            "mov.b16 \t$dst, $a;", []>;
def LOAD_CONST_BF16 :
  NVPTXInst<(outs Int16Regs:$dst), (ins bf16imm:$a),
            "mov.b16 \t$dst, $a;", []>;
defm FADD : F3_fma_component<"add", fadd>;
defm FSUB : F3_fma_component<"sub", fsub>;
defm FMUL : F3_fma_component<"mul", fmul>;

defm FMIN : F3<"min", fminnum>;
defm FMAX : F3<"max", fmaxnum>;
// Note: min.NaN.f64 and max.NaN.f64 do not actually exist.
defm FMINNAN : F3<"min.NaN", fminimum>;
defm FMAXNAN : F3<"max.NaN", fmaximum>;

defm FABS : F2<"abs", fabs>;
defm FNEG : F2<"neg", fneg>;
defm FABS_H: F2_Support_Half<"abs", fabs>;
defm FNEG_H: F2_Support_Half<"neg", fneg>;

defm FSQRT : F2<"sqrt.rn", fsqrt>;

//
// F16 NEG
//
// OpcStr is the full mnemonic, T the value type matched by the pattern, RC
// the register class holding it, and Pred an extra predicate added to the
// fixed f16 requirements.
class FNEG_F16_F16X2<string OpcStr, ValueType T, RegisterClass RC, Predicate Pred> :
      NVPTXInst<(outs RC:$dst), (ins RC:$src),
                OpcStr # " \t$dst, $src;",
                [(set RC:$dst, (fneg (T RC:$src)))]>,
                Requires<[useFP16Math, hasPTX<60>, hasSM<53>, Pred]>;
def FNEG16_ftz   : FNEG_F16_F16X2<"neg.ftz.f16", f16, Int16Regs, doF32FTZ>;
def FNEG16       : FNEG_F16_F16X2<"neg.f16", f16, Int16Regs, True>;
def FNEG16x2_ftz : FNEG_F16_F16X2<"neg.ftz.f16x2", v2f16, Int32Regs, doF32FTZ>;
def FNEG16x2     : FNEG_F16_F16X2<"neg.f16x2", v2f16, Int32Regs, True>;

//
// BF16 NEG
//
// Same shape as FNEG_F16_F16X2, with the bf16 requirements instead.
class FNEG_BF16_F16X2<string OpcStr, ValueType T, RegisterClass RC, Predicate Pred> :
      NVPTXInst<(outs RC:$dst), (ins RC:$src),
                OpcStr # " \t$dst, $src;",
                [(set RC:$dst, (fneg (T RC:$src)))]>,
                Requires<[hasBF16Math, hasPTX<70>, hasSM<80>, Pred]>;
def BFNEG16_ftz   : FNEG_BF16_F16X2<"neg.ftz.bf16", bf16, Int16Regs, doF32FTZ>;
def BFNEG16       : FNEG_BF16_F16X2<"neg.bf16", bf16, Int16Regs, True>;
def BFNEG16x2_ftz : FNEG_BF16_F16X2<"neg.ftz.bf16x2", v2bf16, Int32Regs, doF32FTZ>;
def BFNEG16x2     : FNEG_BF16_F16X2<"neg.bf16x2", v2bf16, Int32Regs, True>;

//
// F64 division
//
// FDIV641r matches a division whose numerator is the constant 1.0 and prints
// a reciprocal; $a is kept as an operand but does not appear in the asm.
def FDIV641r :
  NVPTXInst<(outs Float64Regs:$dst),
            (ins f64imm:$a, Float64Regs:$b),
            "rcp.rn.f64 \t$dst, $b;",
            [(set Float64Regs:$dst, (fdiv DoubleConst1:$a, Float64Regs:$b))]>;
def FDIV64rr :
  NVPTXInst<(outs Float64Regs:$dst),
            (ins Float64Regs:$a, Float64Regs:$b),
            "div.rn.f64 \t$dst, $a, $b;",
            [(set Float64Regs:$dst, (fdiv Float64Regs:$a, Float64Regs:$b))]>;
def FDIV64ri :
  NVPTXInst<(outs Float64Regs:$dst),
            (ins Float64Regs:$a, f64imm:$b),
            "div.rn.f64 \t$dst, $a, $b;",
            [(set Float64Regs:$dst, (fdiv Float64Regs:$a, fpimm:$b))]>;

//
// F32 Approximate reciprocal
//
def FDIV321r_ftz :
  NVPTXInst<(outs Float32Regs:$dst),
            (ins f32imm:$a, Float32Regs:$b),
            "rcp.approx.ftz.f32 \t$dst, $b;",
            [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
            Requires<[do_DIVF32_APPROX, doF32FTZ]>;
def FDIV321r :
  NVPTXInst<(outs Float32Regs:$dst),
            (ins f32imm:$a, Float32Regs:$b),
            "rcp.approx.f32 \t$dst, $b;",
            [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
            Requires<[do_DIVF32_APPROX]>;
//
// F32 Approximate division
//
def FDIV32approxrr_ftz :
  NVPTXInst<(outs Float32Regs:$dst),
            (ins Float32Regs:$a, Float32Regs:$b),
            "div.approx.ftz.f32 \t$dst, $a, $b;",
            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
            Requires<[do_DIVF32_APPROX, doF32FTZ]>;
def FDIV32approxri_ftz :
  NVPTXInst<(outs Float32Regs:$dst),
            (ins Float32Regs:$a, f32imm:$b),
            "div.approx.ftz.f32 \t$dst, $a, $b;",
            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
            Requires<[do_DIVF32_APPROX, doF32FTZ]>;
def FDIV32approxrr :
  NVPTXInst<(outs Float32Regs:$dst),
            (ins Float32Regs:$a, Float32Regs:$b),
            "div.approx.f32 \t$dst, $a, $b;",
            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
            Requires<[do_DIVF32_APPROX]>;
def FDIV32approxri :
  NVPTXInst<(outs Float32Regs:$dst),
            (ins Float32Regs:$a, f32imm:$b),
            "div.approx.f32 \t$dst, $a, $b;",
            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
            Requires<[do_DIVF32_APPROX]>;
//
// F32 Semi-accurate reciprocal
//
// rcp.approx gives the same result as div.full(1.0f, a) and is faster.
//
def FDIV321r_approx_ftz :
  NVPTXInst<(outs Float32Regs:$dst),
            (ins f32imm:$a, Float32Regs:$b),
            "rcp.approx.ftz.f32 \t$dst, $b;",
            [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
            Requires<[do_DIVF32_FULL, doF32FTZ]>;
def FDIV321r_approx :
  NVPTXInst<(outs Float32Regs:$dst),
            (ins f32imm:$a, Float32Regs:$b),
            "rcp.approx.f32 \t$dst, $b;",
            [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
            Requires<[do_DIVF32_FULL]>;
//
// F32 Semi-accurate division
//
def FDIV32rr_ftz :
  NVPTXInst<(outs Float32Regs:$dst),
            (ins Float32Regs:$a, Float32Regs:$b),
            "div.full.ftz.f32 \t$dst, $a, $b;",
            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
            Requires<[do_DIVF32_FULL, doF32FTZ]>;
def FDIV32ri_ftz :
  NVPTXInst<(outs Float32Regs:$dst),
            (ins Float32Regs:$a, f32imm:$b),
            "div.full.ftz.f32 \t$dst, $a, $b;",
            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
            Requires<[do_DIVF32_FULL, doF32FTZ]>;
def FDIV32rr :
  NVPTXInst<(outs Float32Regs:$dst),
            (ins Float32Regs:$a, Float32Regs:$b),
            "div.full.f32 \t$dst, $a, $b;",
            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
            Requires<[do_DIVF32_FULL]>;
def FDIV32ri :
  NVPTXInst<(outs Float32Regs:$dst),
            (ins Float32Regs:$a, f32imm:$b),
            "div.full.f32 \t$dst, $a, $b;",
            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
            Requires<[do_DIVF32_FULL]>;
//
// F32 Accurate reciprocal
//
def FDIV321r_prec_ftz :
  NVPTXInst<(outs Float32Regs:$dst),
            (ins f32imm:$a, Float32Regs:$b),
            "rcp.rn.ftz.f32 \t$dst, $b;",
            [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
            Requires<[doF32FTZ]>;
def FDIV321r_prec :
  NVPTXInst<(outs Float32Regs:$dst),
            (ins f32imm:$a, Float32Regs:$b),
            "rcp.rn.f32 \t$dst, $b;",
            [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>;
//
// F32 Accurate division
//
def FDIV32rr_prec_ftz :
  NVPTXInst<(outs Float32Regs:$dst),
            (ins Float32Regs:$a, Float32Regs:$b),
            "div.rn.ftz.f32 \t$dst, $a, $b;",
            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
            Requires<[doF32FTZ]>;
def FDIV32ri_prec_ftz :
  NVPTXInst<(outs Float32Regs:$dst),
            (ins Float32Regs:$a, f32imm:$b),
            "div.rn.ftz.f32 \t$dst, $a, $b;",
            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
            Requires<[doF32FTZ]>;
def FDIV32rr_prec :
  NVPTXInst<(outs Float32Regs:$dst),
            (ins Float32Regs:$a, Float32Regs:$b),
            "div.rn.f32 \t$dst, $a, $b;",
            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>;
def FDIV32ri_prec :
  NVPTXInst<(outs Float32Regs:$dst),
            (ins Float32Regs:$a, f32imm:$b),
            "div.rn.f32 \t$dst, $a, $b;",
            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>;

//
// FMA
//

// Four variants named after the kinds of $a, $b, $c (r = register,
// i = immediate).  $a is always a register; every variant requires Pred.
multiclass FMA<string OpcStr, RegisterClass RC, Operand ImmCls, Predicate Pred> {
   def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
                       OpcStr # " \t$dst, $a, $b, $c;",
                       [(set RC:$dst, (fma RC:$a, RC:$b, RC:$c))]>,
                       Requires<[Pred]>;
   def rri : NVPTXInst<(outs RC:$dst),
                       (ins RC:$a, RC:$b, ImmCls:$c),
                       OpcStr # " \t$dst, $a, $b, $c;",
                       [(set RC:$dst, (fma RC:$a, RC:$b, fpimm:$c))]>,
                       Requires<[Pred]>;
   def rir : NVPTXInst<(outs RC:$dst),
                       (ins RC:$a, ImmCls:$b, RC:$c),
                       OpcStr # " \t$dst, $a, $b, $c;",
                       [(set RC:$dst, (fma RC:$a, fpimm:$b, RC:$c))]>,
                       Requires<[Pred]>;
   def rii : NVPTXInst<(outs RC:$dst),
                       (ins RC:$a, ImmCls:$b, ImmCls:$c),
                       OpcStr # " \t$dst, $a, $b, $c;",
                       [(set RC:$dst, (fma RC:$a, fpimm:$b, fpimm:$c))]>,
                       Requires<[Pred]>;
}

// Register-only FMA for f16 / v2f16; requires useFP16Math in addition to
// Pred.
multiclass FMA_F16<string OpcStr, ValueType T, RegisterClass RC, Predicate Pred> {
   def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
                       OpcStr # " \t$dst, $a, $b, $c;",
                       [(set RC:$dst, (fma (T RC:$a), (T RC:$b), (T RC:$c)))]>,
                       Requires<[useFP16Math, Pred]>;
}

// Register-only FMA for bf16 / v2bf16; requires hasBF16Math in addition to
// Pred.
multiclass FMA_BF16<string OpcStr, ValueType T, RegisterClass RC, Predicate Pred> {
   def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
                       OpcStr # " \t$dst, $a, $b, $c;",
                       [(set RC:$dst, (fma (T RC:$a), (T RC:$b), (T RC:$c)))]>,
                       Requires<[hasBF16Math, Pred]>;
}

defm FMA16_ftz    : FMA_F16<"fma.rn.ftz.f16", f16, Int16Regs, doF32FTZ>;
defm FMA16        : FMA_F16<"fma.rn.f16", f16, Int16Regs, True>;
defm FMA16x2_ftz  : FMA_F16<"fma.rn.ftz.f16x2", v2f16, Int32Regs, doF32FTZ>;
defm FMA16x2      : FMA_F16<"fma.rn.f16x2", v2f16, Int32Regs, True>;
defm BFMA16_ftz   : FMA_BF16<"fma.rn.ftz.bf16", bf16, Int16Regs, doF32FTZ>;
defm BFMA16       : FMA_BF16<"fma.rn.bf16", bf16, Int16Regs, True>;
defm BFMA16x2_ftz : FMA_BF16<"fma.rn.ftz.bf16x2", v2bf16, Int32Regs, doF32FTZ>;
defm BFMA16x2     : FMA_BF16<"fma.rn.bf16x2", v2bf16, Int32Regs, True>;
defm FMA32_ftz    : FMA<"fma.rn.ftz.f32", Float32Regs, f32imm, doF32FTZ>;
defm FMA32        : FMA<"fma.rn.f32", Float32Regs, f32imm, True>;
defm FMA64        : FMA<"fma.rn.f64", Float64Regs, f64imm, True>;

// sin/cos
// Only the approximate f32 forms exist here, gated on allowUnsafeFPMath.
def SINF:  NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
                      "sin.approx.f32 \t$dst, $src;",
                      [(set Float32Regs:$dst, (fsin Float32Regs:$src))]>,
                      Requires<[allowUnsafeFPMath]>;
def COSF:  NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
                      "cos.approx.f32 \t$dst, $src;",
                      [(set Float32Regs:$dst, (fcos Float32Regs:$src))]>,
                      Requires<[allowUnsafeFPMath]>;

// Lower (frem x, y) into (sub x, (mul (ftrunc (div x, y)) y)),
// i.e. "poor man's fmod()". When y is infinite, x is returned. This matches the
// semantics of LLVM's frem.
//
// Each type has two pattern pairs.  Under allowUnsafeFPMath only the
// sub/mul/trunc/div sequence is emitted.  Under noUnsafeFPMath the sequence
// is wrapped in a selp that returns $x when testp.infinite reports that $y
// is infinite.  The infinity test must use the TESTINF variant matching the
// kind of $y: the "r" form for a register, the "i" form for an immediate.

// frem - f32 FTZ
def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
          (FSUBf32rr_ftz Float32Regs:$x, (FMULf32rr_ftz (CVT_f32_f32
            (FDIV32rr_prec_ftz Float32Regs:$x, Float32Regs:$y), CvtRZI_FTZ),
             Float32Regs:$y))>,
          Requires<[doF32FTZ, allowUnsafeFPMath]>;
def : Pat<(frem Float32Regs:$x, fpimm:$y),
          (FSUBf32rr_ftz Float32Regs:$x, (FMULf32ri_ftz (CVT_f32_f32
            (FDIV32ri_prec_ftz Float32Regs:$x, fpimm:$y), CvtRZI_FTZ),
             fpimm:$y))>,
          Requires<[doF32FTZ, allowUnsafeFPMath]>;

def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
          (SELP_f32rr Float32Regs:$x,
            (FSUBf32rr_ftz Float32Regs:$x, (FMULf32rr_ftz (CVT_f32_f32
              (FDIV32rr_prec_ftz Float32Regs:$x, Float32Regs:$y), CvtRZI_FTZ),
              Float32Regs:$y)),
            (TESTINF_f32r Float32Regs:$y))>,
          Requires<[doF32FTZ, noUnsafeFPMath]>;
def : Pat<(frem Float32Regs:$x, fpimm:$y),
          (SELP_f32rr Float32Regs:$x,
            (FSUBf32rr_ftz Float32Regs:$x, (FMULf32ri_ftz (CVT_f32_f32
              (FDIV32ri_prec_ftz Float32Regs:$x, fpimm:$y), CvtRZI_FTZ),
              fpimm:$y)),
            (TESTINF_f32i fpimm:$y))>,
          Requires<[doF32FTZ, noUnsafeFPMath]>;

// frem - f32
def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
          (FSUBf32rr Float32Regs:$x, (FMULf32rr (CVT_f32_f32
            (FDIV32rr_prec Float32Regs:$x, Float32Regs:$y), CvtRZI),
             Float32Regs:$y))>,
          Requires<[allowUnsafeFPMath]>;
def : Pat<(frem Float32Regs:$x, fpimm:$y),
          (FSUBf32rr Float32Regs:$x, (FMULf32ri (CVT_f32_f32
            (FDIV32ri_prec Float32Regs:$x, fpimm:$y), CvtRZI),
             fpimm:$y))>,
          Requires<[allowUnsafeFPMath]>;

def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
          (SELP_f32rr Float32Regs:$x,
            (FSUBf32rr Float32Regs:$x, (FMULf32rr (CVT_f32_f32
              (FDIV32rr_prec Float32Regs:$x, Float32Regs:$y), CvtRZI),
              Float32Regs:$y)),
            (TESTINF_f32r Float32Regs:$y))>,
          Requires<[noUnsafeFPMath]>;
def : Pat<(frem Float32Regs:$x, fpimm:$y),
          (SELP_f32rr Float32Regs:$x,
            (FSUBf32rr Float32Regs:$x, (FMULf32ri (CVT_f32_f32
              (FDIV32ri_prec Float32Regs:$x, fpimm:$y), CvtRZI),
              fpimm:$y)),
            (TESTINF_f32i fpimm:$y))>,
          Requires<[noUnsafeFPMath]>;

// frem - f64
def : Pat<(frem Float64Regs:$x, Float64Regs:$y),
          (FSUBf64rr Float64Regs:$x, (FMULf64rr (CVT_f64_f64
            (FDIV64rr Float64Regs:$x, Float64Regs:$y), CvtRZI),
             Float64Regs:$y))>,
          Requires<[allowUnsafeFPMath]>;
def : Pat<(frem Float64Regs:$x, fpimm:$y),
          (FSUBf64rr Float64Regs:$x, (FMULf64ri (CVT_f64_f64
            (FDIV64ri Float64Regs:$x, fpimm:$y), CvtRZI),
             fpimm:$y))>,
          Requires<[allowUnsafeFPMath]>;

def : Pat<(frem Float64Regs:$x, Float64Regs:$y),
          (SELP_f64rr Float64Regs:$x,
            (FSUBf64rr Float64Regs:$x, (FMULf64rr (CVT_f64_f64
              (FDIV64rr Float64Regs:$x, Float64Regs:$y), CvtRZI),
              Float64Regs:$y)),
            (TESTINF_f64r Float64Regs:$y))>,
          Requires<[noUnsafeFPMath]>;
// $y is an immediate here, so the infinity test uses the immediate form of
// testp (TESTINF_f64i), mirroring the f32 immediate patterns above.
def : Pat<(frem Float64Regs:$x, fpimm:$y),
          (SELP_f64rr Float64Regs:$x,
            (FSUBf64rr Float64Regs:$x, (FMULf64ri (CVT_f64_f64
              (FDIV64ri Float64Regs:$x, fpimm:$y), CvtRZI),
              fpimm:$y)),
            (TESTINF_f64i fpimm:$y))>,
          Requires<[noUnsafeFPMath]>;

//-----------------------------------
// Bitwise operations
//-----------------------------------

// Template for three-arg bitwise operations.  Takes three args, Creates .b16,
// .b32, .b64, and .pred (predicate registers -- i.e., i1) versions of OpcStr.
// Every width gets a register-register (rr) and a register-immediate (ri)
// form, each matching OpNode.
multiclass BITWISE<string OpcStr, SDNode OpNode> {
  // Predicate (i1) forms.
  def b1rr :
    NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
              OpcStr # ".pred  \t$dst, $a, $b;",
              [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>;
  def b1ri :
    NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
              OpcStr # ".pred  \t$dst, $a, $b;",
              [(set Int1Regs:$dst, (OpNode Int1Regs:$a, imm:$b))]>;
  // 16-bit forms.
  def b16rr :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
              OpcStr # ".b16  \t$dst, $a, $b;",
              [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>;
  def b16ri :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
              OpcStr # ".b16  \t$dst, $a, $b;",
              [(set Int16Regs:$dst, (OpNode Int16Regs:$a, imm:$b))]>;
  // 32-bit forms; register inputs are typed explicitly as i32.
  def b32rr :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
              OpcStr # ".b32  \t$dst, $a, $b;",
              [(set Int32Regs:$dst,
                    (OpNode (i32 Int32Regs:$a), (i32 Int32Regs:$b)))]>;
  def b32ri :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
              OpcStr # ".b32  \t$dst, $a, $b;",
              [(set Int32Regs:$dst, (OpNode (i32 Int32Regs:$a), imm:$b))]>;
  // 64-bit forms.
  def b64rr :
    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
              OpcStr # ".b64  \t$dst, $a, $b;",
              [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>;
  def b64ri :
    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
              OpcStr # ".b64  \t$dst, $a, $b;",
              [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;
}

defm OR  : BITWISE<"or", or>;
defm AND : BITWISE<"and", and>;
defm XOR : BITWISE<"xor", xor>;

// Lower logical v2i16/v4i8 ops as bitwise ops on b32.
foreach vt = [v2i16, v4i8] in {
  // Register-register forms.
  def : Pat<(or (vt Int32Regs:$a), (vt Int32Regs:$b)),
            (ORb32rr Int32Regs:$a, Int32Regs:$b)>;
  def : Pat<(xor (vt Int32Regs:$a), (vt Int32Regs:$b)),
            (XORb32rr Int32Regs:$a, Int32Regs:$b)>;
  def : Pat<(and (vt Int32Regs:$a), (vt Int32Regs:$b)),
            (ANDb32rr Int32Regs:$a, Int32Regs:$b)>;

  // The constants get legalized into a bitcast from i32, so that's what we need
  // to match here.
  def : Pat<(or Int32Regs:$a, (vt (bitconvert (i32 imm:$b)))),
            (ORb32ri Int32Regs:$a, imm:$b)>;
  def : Pat<(xor Int32Regs:$a, (vt (bitconvert (i32 imm:$b)))),
            (XORb32ri Int32Regs:$a, imm:$b)>;
  def : Pat<(and Int32Regs:$a, (vt (bitconvert (i32 imm:$b)))),
            (ANDb32ri Int32Regs:$a, imm:$b)>;
}

// Bitwise complement for predicates and for 16/32/64-bit integers.
def NOT1 :
  NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$src),
            "not.pred \t$dst, $src;",
            [(set Int1Regs:$dst, (not Int1Regs:$src))]>;
def NOT16 :
  NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
            "not.b16 \t$dst, $src;",
            [(set Int16Regs:$dst, (not Int16Regs:$src))]>;
def NOT32 :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
            "not.b32 \t$dst, $src;",
            [(set (i32 Int32Regs:$dst), (not (i32 Int32Regs:$src)))]>;
def NOT64 :
  NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
            "not.b64 \t$dst, $src;",
            [(set Int64Regs:$dst, (not Int64Regs:$src))]>;

// Template for left/right shifts.  Takes three operands,
//   [dest (reg), src (reg), shift (reg or imm)].
// dest and src may be int64, int32, or int16, but shift is always int32.
//
// This template also defines a 32-bit shift (imm, imm) instruction.
// OpcStr is the mnemonic up to and including the type letter (e.g. "shl.b");
// the bit width is appended per variant.
multiclass SHIFT<string OpcStr, SDNode OpNode> {
   // 64-bit value, 32-bit shift amount.
   def i64rr :
     NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int32Regs:$b),
               OpcStr # "64 \t$dst, $a, $b;",
               [(set Int64Regs:$dst,
                     (OpNode Int64Regs:$a, (i32 Int32Regs:$b)))]>;
   def i64ri :
     NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i32imm:$b),
               OpcStr # "64 \t$dst, $a, $b;",
               [(set Int64Regs:$dst, (OpNode Int64Regs:$a, (i32 imm:$b)))]>;
   // 32-bit value, including the (imm, imm) form.
   def i32rr :
     NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
               OpcStr # "32 \t$dst, $a, $b;",
               [(set Int32Regs:$dst,
                     (OpNode (i32 Int32Regs:$a), (i32 Int32Regs:$b)))]>;
   def i32ri :
     NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
               OpcStr # "32 \t$dst, $a, $b;",
               [(set Int32Regs:$dst,
                     (OpNode (i32 Int32Regs:$a), (i32 imm:$b)))]>;
   def i32ii :
     NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$a, i32imm:$b),
               OpcStr # "32 \t$dst, $a, $b;",
               [(set Int32Regs:$dst, (OpNode (i32 imm:$a), (i32 imm:$b)))]>;
   // 16-bit value, 32-bit shift amount.
   def i16rr :
     NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int32Regs:$b),
               OpcStr # "16 \t$dst, $a, $b;",
               [(set Int16Regs:$dst,
                     (OpNode Int16Regs:$a, (i32 Int32Regs:$b)))]>;
   def i16ri :
     NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
               OpcStr # "16 \t$dst, $a, $b;",
               [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (i32 imm:$b)))]>;
}

defm SHL : SHIFT<"shl.b", shl>;
defm SRA : SHIFT<"shr.s", sra>;
defm SRL : SHIFT<"shr.u", srl>;

// Bit-reverse
def BREV32 :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a),
            "brev.b32 \t$dst, $a;",
            [(set Int32Regs:$dst, (bitreverse (i32 Int32Regs:$a)))]>;
def BREV64 :
  NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a),
            "brev.b64 \t$dst, $a;",
            [(set Int64Regs:$dst, (bitreverse Int64Regs:$a))]>;

//
// Rotate:  Use ptx shf instruction if available.
//

// 32 bit r2 = rotl r1, n
//    =>
//        r2 = shf.l r1, r1, n
def ROTL32imm_hw :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt),
            "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
            [(set Int32Regs:$dst,
                  (rotl (i32 Int32Regs:$src), (i32 imm:$amt)))]>,
  Requires<[hasHWROT32]>;

def ROTL32reg_hw :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
            "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
            [(set Int32Regs:$dst,
                  (rotl (i32 Int32Regs:$src), (i32 Int32Regs:$amt)))]>,
  Requires<[hasHWROT32]>;

// 32 bit r2 = rotr r1, n
//    =>
//        r2 = shf.r r1, r1, n
def ROTR32imm_hw :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt),
            "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
            [(set Int32Regs:$dst,
                  (rotr (i32 Int32Regs:$src), (i32 imm:$amt)))]>,
  Requires<[hasHWROT32]>;

def ROTR32reg_hw :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
            "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
            [(set Int32Regs:$dst,
                  (rotr (i32 Int32Regs:$src), (i32 Int32Regs:$amt)))]>,
  Requires<[hasHWROT32]>;

// 32-bit software rotate by immediate.  $amt2 should equal 32 - $amt1.
// Emits a scoped PTX sequence: shift left by $amt1, shift right by $amt2, and
// add the two halves.  It carries no selection pattern of its own; the
// rotl/rotr patterns below choose the operand order.
def ROT32imm_sw :
  NVPTXInst<(outs Int32Regs:$dst),
            (ins Int32Regs:$src, i32imm:$amt1, i32imm:$amt2),
            "{{\n\t"
            ".reg .b32 %lhs;\n\t"
            ".reg .b32 %rhs;\n\t"
            "shl.b32 \t%lhs, $src, $amt1;\n\t"
            "shr.b32 \t%rhs, $src, $amt2;\n\t"
            "add.u32 \t$dst, %lhs, %rhs;\n\t"
            "}}",
            []>;

// Transforms an immediate N into the i32 target constant (32 - N).
def SUB_FRM_32 : SDNodeXForm<imm, [{
  return CurDAG->getTargetConstant(32 - N->getZExtValue(), SDLoc(N), MVT::i32);
}]>;

// Without hardware rotate support, a rotate by immediate becomes ROT32imm_sw.
// rotl shifts left by amt and right by (32 - amt); rotr swaps the two amounts.
def : Pat<(rotl (i32 Int32Regs:$src), (i32 imm:$amt)),
          (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>,
      Requires<[noHWROT32]>;
def : Pat<(rotr (i32 Int32Regs:$src), (i32 imm:$amt)),
          (ROT32imm_sw Int32Regs:$src, (SUB_FRM_32 node:$amt), imm:$amt)>,
      Requires<[noHWROT32]>;

// 32-bit software rotate left by register.
// The complementary amount (32 - $amt) is computed in a scratch register.
def ROTL32reg_sw :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
            "{{\n\t"
            ".reg .b32 %lhs;\n\t"
            ".reg .b32 %rhs;\n\t"
            ".reg .b32 %amt2;\n\t"
            "shl.b32 \t%lhs, $src, $amt;\n\t"
            "sub.s32 \t%amt2, 32, $amt;\n\t"
            "shr.b32 \t%rhs, $src, %amt2;\n\t"
            "add.u32 \t$dst, %lhs, %rhs;\n\t"
            "}}",
            [(set Int32Regs:$dst,
                  (rotl (i32 Int32Regs:$src), (i32 Int32Regs:$amt)))]>,
  Requires<[noHWROT32]>;

// 32-bit software rotate right by register.
// Same sequence as the left rotate with the two shift directions exchanged.
def ROTR32reg_sw :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
            "{{\n\t"
            ".reg .b32 %lhs;\n\t"
            ".reg .b32 %rhs;\n\t"
            ".reg .b32 %amt2;\n\t"
            "shr.b32 \t%lhs, $src, $amt;\n\t"
            "sub.s32 \t%amt2, 32, $amt;\n\t"
            "shl.b32 \t%rhs, $src, %amt2;\n\t"
            "add.u32 \t$dst, %lhs, %rhs;\n\t"
            "}}",
            [(set Int32Regs:$dst,
                  (rotr (i32 Int32Regs:$src), (i32 Int32Regs:$amt)))]>,
  Requires<[noHWROT32]>;

// 64-bit software rotate by immediate.  $amt2 should equal 64 - $amt1.
// Emits a scoped PTX sequence: shift left by $amt1, shift right by $amt2, and
// add the two halves.  It has no pattern of its own; the rotl/rotr patterns
// below pick the operand order.
def ROT64imm_sw :
  NVPTXInst<(outs Int64Regs:$dst),
            (ins Int64Regs:$src, i32imm:$amt1, i32imm:$amt2),
            "{{\n\t"
            ".reg .b64 %lhs;\n\t"
            ".reg .b64 %rhs;\n\t"
            "shl.b64 \t%lhs, $src, $amt1;\n\t"
            "shr.b64 \t%rhs, $src, $amt2;\n\t"
            "add.u64 \t$dst, %lhs, %rhs;\n\t"
            "}}",
            []>;

// Transforms an immediate N into the i32 target constant (64 - N).
def SUB_FRM_64 : SDNodeXForm<imm, [{
  return CurDAG->getTargetConstant(64-N->getZExtValue(), SDLoc(N), MVT::i32);
}]>;

// 64-bit rotate by immediate.  rotl shifts left by amt and right by
// (64 - amt); rotr swaps the two amounts.
def : Pat<(rotl Int64Regs:$src, (i32 imm:$amt)),
          (ROT64imm_sw Int64Regs:$src, imm:$amt, (SUB_FRM_64 node:$amt))>;
def : Pat<(rotr Int64Regs:$src, (i32 imm:$amt)),
          (ROT64imm_sw Int64Regs:$src, (SUB_FRM_64 node:$amt), imm:$amt)>;

// 64-bit software rotate left by register.
// The complementary amount (64 - $amt) is computed in a scratch register.
def ROTL64reg_sw :
  NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt),
            "{{\n\t"
            ".reg .b64 %lhs;\n\t"
            ".reg .b64 %rhs;\n\t"
            ".reg .u32 %amt2;\n\t"
            "shl.b64 \t%lhs, $src, $amt;\n\t"
            "sub.u32 \t%amt2, 64, $amt;\n\t"
            "shr.b64 \t%rhs, $src, %amt2;\n\t"
            "add.u64 \t$dst, %lhs, %rhs;\n\t"
            "}}",
            [(set Int64Regs:$dst,
                  (rotl Int64Regs:$src, (i32 Int32Regs:$amt)))]>;

// 64-bit software rotate right by register.
// Same sequence as the left rotate with the two shift directions exchanged.
def ROTR64reg_sw :
  NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt),
            "{{\n\t"
            ".reg .b64 %lhs;\n\t"
            ".reg .b64 %rhs;\n\t"
            ".reg .u32 %amt2;\n\t"
            "shr.b64 \t%lhs, $src, $amt;\n\t"
            "sub.u32 \t%amt2, 64, $amt;\n\t"
            "shl.b64 \t%rhs, $src, %amt2;\n\t"
            "add.u64 \t$dst, %lhs, %rhs;\n\t"
            "}}",
            [(set Int64Regs:$dst,
                  (rotr Int64Regs:$src, (i32 Int32Regs:$amt)))]>;

//
// Funnel shift in clamp mode
//

// Create SDNodes so they can be used in the DAG code, e.g.
// NVPTXISelLowering (LowerShiftLeftParts and LowerShiftRightParts)
def FUN_SHFL_CLAMP : SDNode<"NVPTXISD::FUN_SHFL_CLAMP", SDTIntShiftDOp, []>;
def FUN_SHFR_CLAMP : SDNode<"NVPTXISD::FUN_SHFR_CLAMP", SDTIntShiftDOp, []>;

// shf.l.clamp.b32 applied to the ($lo, $hi) register pair.
def FUNSHFLCLAMP :
  NVPTXInst<(outs Int32Regs:$dst),
            (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
            "shf.l.clamp.b32 \t$dst, $lo, $hi, $amt;",
            [(set Int32Regs:$dst,
                  (FUN_SHFL_CLAMP (i32 Int32Regs:$lo),
                                  (i32 Int32Regs:$hi),
                                  (i32 Int32Regs:$amt)))]>;

// shf.r.clamp.b32 applied to the ($lo, $hi) register pair.
def FUNSHFRCLAMP :
  NVPTXInst<(outs Int32Regs:$dst),
            (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
            "shf.r.clamp.b32 \t$dst, $lo, $hi, $amt;",
            [(set Int32Regs:$dst,
                  (FUN_SHFR_CLAMP (i32 Int32Regs:$lo),
                                  (i32 Int32Regs:$hi),
                                  (i32 Int32Regs:$amt)))]>;

//
// BFE - bit-field extract
//

// Template for BFE/BFI instructions.
// Args: [dest (reg), src (reg), start (reg or imm), end (reg or imm)].
// Start may be an imm only if end is also an imm.  FIXME: Is this a
// restriction in PTX?
//
// dest and src may be int32 or int64, but start and end are always int32.
// bfe: one integer result with the same type as operand 1; operands 2 and 3
// are i32.
def SDTBFE :
  SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisInt<0>,
                       SDTCisVT<2, i32>, SDTCisVT<3, i32>]>;
def bfe : SDNode<"NVPTXISD::BFE", SDTBFE>;

// bfi: one integer result with the same type as operands 1 and 2; operands 3
// and 4 are i32.
def SDTBFI :
  SDTypeProfile<1, 4, [SDTCisInt<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
                       SDTCisVT<3, i32>, SDTCisVT<4, i32>]>;
def bfi : SDNode<"NVPTXISD::BFI", SDTBFI>;

// prmt: result and all four operands are i32.
def SDTPRMT :
  SDTypeProfile<1, 4, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
                       SDTCisVT<2, i32>, SDTCisVT<3, i32>,
                       SDTCisVT<4, i32>]>;
def prmt : SDNode<"NVPTXISD::PRMT", SDTPRMT>;

// Bit-field extract variants.  Instr is the full mnemonic, T the value type
// and RC the matching register class.  The suffix gives the kind of the
// source, $b and $c operands in that order (r = register, i = immediate).
multiclass BFE<string Instr, ValueType T, RegisterClass RC> {
  def rrr
    : NVPTXInst<(outs RC:$d),
                (ins RC:$a, Int32Regs:$b, Int32Regs:$c),
                Instr # " \t$d, $a, $b, $c;",
                [(set (T RC:$d),
                      (bfe (T RC:$a), (i32 Int32Regs:$b), (i32 Int32Regs:$c)))]>;
  def rri
    : NVPTXInst<(outs RC:$d),
                (ins RC:$a, Int32Regs:$b, i32imm:$c),
                Instr # " \t$d, $a, $b, $c;",
                [(set (T RC:$d),
                      (bfe (T RC:$a), (i32 Int32Regs:$b), (i32 imm:$c)))]>;
  def rii
    : NVPTXInst<(outs RC:$d),
                (ins RC:$a, i32imm:$b, i32imm:$c),
                Instr # " \t$d, $a, $b, $c;",
                [(set (T RC:$d),
                      (bfe (T RC:$a), (i32 imm:$b), (i32 imm:$c)))]>;
}

// Bit-field insert variants.  ImmCls is the immediate operand class used when
// the first source ($a) is a constant.  The four suffix letters give the kind
// of $a, $b, $c and $d in that order (r = register, i = immediate).
multiclass BFI<string Instr, ValueType T, RegisterClass RC, Operand ImmCls> {
  def rrrr
    : NVPTXInst<(outs RC:$f),
                (ins RC:$a, RC:$b, Int32Regs:$c, Int32Regs:$d),
                Instr # " \t$f, $a, $b, $c, $d;",
                [(set (T RC:$f),
                      (bfi (T RC:$a), (T RC:$b),
                           (i32 Int32Regs:$c), (i32 Int32Regs:$d)))]>;
  def rrri
    : NVPTXInst<(outs RC:$f),
                (ins RC:$a, RC:$b, Int32Regs:$c, i32imm:$d),
                Instr # " \t$f, $a, $b, $c, $d;",
                [(set (T RC:$f),
                      (bfi (T RC:$a), (T RC:$b),
                           (i32 Int32Regs:$c), (i32 imm:$d)))]>;
  def rrii
    : NVPTXInst<(outs RC:$f),
                (ins RC:$a, RC:$b, i32imm:$c, i32imm:$d),
                Instr # " \t$f, $a, $b, $c, $d;",
                [(set (T RC:$f),
                      (bfi (T RC:$a), (T RC:$b),
                           (i32 imm:$c), (i32 imm:$d)))]>;
  def irrr
    : NVPTXInst<(outs RC:$f),
                (ins ImmCls:$a, RC:$b, Int32Regs:$c, Int32Regs:$d),
                Instr # " \t$f, $a, $b, $c, $d;",
                [(set (T RC:$f),
                      (bfi (T imm:$a), (T RC:$b),
                           (i32 Int32Regs:$c), (i32 Int32Regs:$d)))]>;
  def irri
    : NVPTXInst<(outs RC:$f),
                (ins ImmCls:$a, RC:$b, Int32Regs:$c, i32imm:$d),
                Instr # " \t$f, $a, $b, $c, $d;",
                [(set (T RC:$f),
                      (bfi (T imm:$a), (T RC:$b),
                           (i32 Int32Regs:$c), (i32 imm:$d)))]>;
  def irii
    : NVPTXInst<(outs RC:$f),
                (ins ImmCls:$a, RC:$b, i32imm:$c, i32imm:$d),
                Instr # " \t$f, $a, $b, $c, $d;",
                [(set (T RC:$f),
                      (bfi (T imm:$a), (T RC:$b),
                           (i32 imm:$c), (i32 imm:$d)))]>;
}

// Byte-permute variants.  The trailing PrmtMode operand is printed as a
// mnemonic suffix via ${mode}.  The suffix gives the kind of $a, $b and $c.
multiclass PRMT<ValueType T, RegisterClass RC> {
  def rrr
    : NVPTXInst<(outs RC:$d),
                (ins RC:$a, Int32Regs:$b, Int32Regs:$c, PrmtMode:$mode),
                "prmt.b32${mode} \t$d, $a, $b, $c;",
                [(set (T RC:$d),
                      (prmt (T RC:$a), (T RC:$b),
                            (i32 Int32Regs:$c), imm:$mode))]>;
  def rri
    : NVPTXInst<(outs RC:$d),
                (ins RC:$a, Int32Regs:$b, i32imm:$c, PrmtMode:$mode),
                "prmt.b32${mode} \t$d, $a, $b, $c;",
                [(set (T RC:$d),
                      (prmt (T RC:$a), (T RC:$b),
                            (i32 imm:$c), imm:$mode))]>;
  def rii
    : NVPTXInst<(outs RC:$d),
                (ins RC:$a, i32imm:$b, i32imm:$c, PrmtMode:$mode),
                "prmt.b32${mode} \t$d, $a, $b, $c;",
                [(set (T RC:$d),
                      (prmt (T RC:$a), (T imm:$b),
                            (i32 imm:$c), imm:$mode))]>;
}

// None of the bit-field / permute instructions has side effects.
let hasSideEffects = false in {
  defm BFE_S32 : BFE<"bfe.s32", i32, Int32Regs>;
  defm BFE_U32 : BFE<"bfe.u32", i32, Int32Regs>;
  defm BFE_S64 : BFE<"bfe.s64", i64, Int64Regs>;
  defm BFE_U64 : BFE<"bfe.u64", i64, Int64Regs>;

  defm BFI_B32 : BFI<"bfi.b32", i32, Int32Regs, i32imm>;
  defm BFI_B64 : BFI<"bfi.b64", i64, Int64Regs, i64imm>;

  defm PRMT_B32 : PRMT<i32, Int32Regs>;
}


// byte extraction + signed/unsigned extension to i32.
// An 8-bit bfe followed by sext_inreg/and-255 folds into a single signed or
// unsigned bfe.
def : Pat<(i32 (sext_inreg (bfe (i32 Int32Regs:$s), (i32 Int32Regs:$o), 8), i8)),
          (BFE_S32rri Int32Regs:$s, Int32Regs:$o, 8)>;
def : Pat<(i32 (sext_inreg (bfe (i32 Int32Regs:$s), (i32 imm:$o), 8), i8)),
          (BFE_S32rii Int32Regs:$s, imm:$o, 8)>;
def : Pat<(i32 (and (bfe (i32 Int32Regs:$s), (i32 Int32Regs:$o), 8), 255)),
          (BFE_U32rri Int32Regs:$s, Int32Regs:$o, 8)>;
def : Pat<(i32 (and (bfe (i32 Int32Regs:$s), (i32 imm:$o), 8), 255)),
          (BFE_U32rii Int32Regs:$s, imm:$o, 8)>;

// byte extraction + signed extension to i16
def : Pat<(i16 (sext_inreg (trunc (bfe (i32 Int32Regs:$s), (i32 imm:$o), 8)), i8)),
          (CVT_s8_s32 (BFE_S32rii Int32Regs:$s, imm:$o, 8), CvtNONE)>;


// Byte extraction via shift/trunc/sext
def : Pat<(i16 (sext_inreg (trunc Int32Regs:$s), i8)),
          (CVT_s8_s32 Int32Regs:$s, CvtNONE)>;
def : Pat<(i16 (sext_inreg (trunc (srl (i32 Int32Regs:$s), (i32 imm:$o))), i8)),
          (CVT_s8_s32 (BFE_S32rii Int32Regs:$s, imm:$o, 8), CvtNONE)>;
def : Pat<(sext_inreg (srl (i32 Int32Regs:$s), (i32 imm:$o)), i8),
          (BFE_S32rii Int32Regs:$s, imm:$o, 8)>;
def : Pat<(i16 (sra (i16 (trunc Int32Regs:$s)), (i32 8))),
          (CVT_s8_s32 (BFE_S32rii Int32Regs:$s, 8, 8), CvtNONE)>;
def : Pat<(sext_inreg (srl (i64 Int64Regs:$s), (i32 imm:$o)), i8),
          (BFE_S64rii Int64Regs:$s, imm:$o, 8)>;
def : Pat<(i16 (sext_inreg (trunc Int64Regs:$s), i8)),
          (CVT_s8_s64 Int64Regs:$s, CvtNONE)>;
def : Pat<(i16 (sext_inreg (trunc (srl (i64 Int64Regs:$s), (i32 imm:$o))), i8)),
          (CVT_s8_s64 (BFE_S64rii Int64Regs:$s, imm:$o, 8), CvtNONE)>;

//-----------------------------------
// Comparison instructions (setp, set)
//-----------------------------------

// FIXME: This doesn't cover versions of set and setp that combine with a
// boolean predicate, e.g. setp.eq.and.b16.

// setp: compare two operands and write a predicate register.  The comparison
// is carried in the CmpMode operand and printed as part of the mnemonic; these
// records have no patterns and are referenced explicitly elsewhere.
//   rr = both operands in registers, ri = immediate RHS, ir = immediate LHS.
let hasSideEffects = false in {
  multiclass SETP<string TypeStr, RegisterClass RC, Operand ImmCls> {
    def rr :
      NVPTXInst<(outs Int1Regs:$dst), (ins RC:$a, RC:$b, CmpMode:$cmp),
                "setp${cmp:base}${cmp:ftz}." # TypeStr # " \t$dst, $a, $b;",
                []>;
    def ri :
      NVPTXInst<(outs Int1Regs:$dst), (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
                "setp${cmp:base}${cmp:ftz}." # TypeStr # " \t$dst, $a, $b;",
                []>;
    def ir :
      NVPTXInst<(outs Int1Regs:$dst), (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
                "setp${cmp:base}${cmp:ftz}." # TypeStr # " \t$dst, $a, $b;",
                []>;
  }
}

defm SETP_b16 : SETP<"b16", Int16Regs, i16imm>;
defm SETP_s16 : SETP<"s16", Int16Regs, i16imm>;
defm SETP_u16 : SETP<"u16", Int16Regs, i16imm>;
defm SETP_b32 : SETP<"b32", Int32Regs, i32imm>;
defm SETP_s32 : SETP<"s32", Int32Regs, i32imm>;
defm SETP_u32 : SETP<"u32", Int32Regs, i32imm>;
defm SETP_b64 : SETP<"b64", Int64Regs, i64imm>;
defm SETP_s64 : SETP<"s64", Int64Regs, i64imm>;
defm SETP_u64 : SETP<"u64", Int64Regs, i64imm>;
defm SETP_f32 : SETP<"f32", Float32Regs, f32imm>;
defm SETP_f64 : SETP<"f64", Float64Regs, f64imm>;

// Half-precision compares exist only in register-register form.  The packed
// x2 forms write two predicates, printed as "$p|$q".
def SETP_f16rr :
  NVPTXInst<(outs Int1Regs:$dst),
            (ins Int16Regs:$a, Int16Regs:$b, CmpMode:$cmp),
            "setp${cmp:base}${cmp:ftz}.f16 \t$dst, $a, $b;",
            []>,
  Requires<[useFP16Math]>;

def SETP_f16x2rr :
  NVPTXInst<(outs Int1Regs:$p, Int1Regs:$q),
            (ins Int32Regs:$a, Int32Regs:$b, CmpMode:$cmp),
            "setp${cmp:base}${cmp:ftz}.f16x2 \t$p|$q, $a, $b;",
            []>,
  Requires<[useFP16Math]>;

def SETP_bf16rr :
  NVPTXInst<(outs Int1Regs:$dst),
            (ins Int16Regs:$a, Int16Regs:$b, CmpMode:$cmp),
            "setp${cmp:base}${cmp:ftz}.bf16 \t$dst, $a, $b;",
            []>,
  Requires<[hasBF16Math, hasPTX<78>, hasSM<90>]>;

def SETP_bf16x2rr :
  NVPTXInst<(outs Int1Regs:$p, Int1Regs:$q),
            (ins Int32Regs:$a, Int32Regs:$b, CmpMode:$cmp),
            "setp${cmp:base}${cmp:ftz}.bf16x2 \t$p|$q, $a, $b;",
            []>,
  Requires<[hasBF16Math, hasPTX<78>, hasSM<90>]>;


// FIXME: This doesn't appear to be correct.  The "set" mnemonic has the form
// "set.CmpOp{.ftz}.dtype.stype", where dtype is the type of the destination
// reg, either u32, s32, or f32.  Anyway these aren't used at the moment.

// set: like setp, but the result is written to a 32-bit register.
let hasSideEffects = false in {
  multiclass SET<string TypeStr, RegisterClass RC, Operand ImmCls> {
    def rr :
      NVPTXInst<(outs Int32Regs:$dst), (ins RC:$a, RC:$b, CmpMode:$cmp),
                "set$cmp." # TypeStr # " \t$dst, $a, $b;", []>;
    def ri :
      NVPTXInst<(outs Int32Regs:$dst), (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
                "set$cmp." # TypeStr # " \t$dst, $a, $b;", []>;
    def ir :
      NVPTXInst<(outs Int32Regs:$dst), (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
                "set$cmp." # TypeStr # " \t$dst, $a, $b;", []>;
  }
}

defm SET_b16 : SET<"b16", Int16Regs, i16imm>;
defm SET_s16 : SET<"s16", Int16Regs, i16imm>;
defm SET_u16 : SET<"u16", Int16Regs, i16imm>;
defm SET_b32 : SET<"b32", Int32Regs, i32imm>;
defm SET_s32 : SET<"s32", Int32Regs, i32imm>;
defm SET_u32 : SET<"u32", Int32Regs, i32imm>;
defm SET_b64 : SET<"b64", Int64Regs, i64imm>;
defm SET_s64 : SET<"s64", Int64Regs, i64imm>;
defm SET_u64 : SET<"u64", Int64Regs, i64imm>;
defm SET_f16 : SET<"f16", Int16Regs, f16imm>;
defm SET_bf16 : SET<"bf16", Int16Regs, bf16imm>,
                Requires<[hasPTX<78>, hasSM<90>]>;
defm SET_f32 : SET<"f32", Float32Regs, f32imm>;
defm SET_f64 : SET<"f64", Float64Regs, f64imm>;

//-----------------------------------
// Data Movement (Load / Store, Move)
//-----------------------------------

// Address-matching complex patterns; each names the C++ selection routine
// that implements it.
def ADDRri : ComplexPattern<i32, 2, "SelectADDRri", [frameindex],
                            [SDNPWantRoot]>;
def ADDRri64 : ComplexPattern<i64, 2, "SelectADDRri64", [frameindex],
                              [SDNPWantRoot]>;
def ADDRvar : ComplexPattern<iPTR, 1, "SelectDirectAddr", [], []>;

// Memory operands made of a base register plus an immediate.
def MEMri : Operand<i32> {
  let PrintMethod = "printMemOperand";
  let MIOperandInfo = (ops Int32Regs, i32imm);
}
def MEMri64 : Operand<i64> {
  let PrintMethod = "printMemOperand";
  let MIOperandInfo = (ops Int64Regs, i64imm);
}

// Pointer-typed operands printed with the generic operand printer.
def imem : Operand<iPTR> {
  let PrintMethod = "printOperand";
}

def imemAny : Operand<iPTRAny> {
  let PrintMethod = "printOperand";
}

// i32 operands with dedicated print methods.
def LdStCode : Operand<i32> {
  let PrintMethod = "printLdStCode";
}

def MmaCode : Operand<i32> {
  let PrintMethod = "printMmaCode";
}

// Wrapper node: pointer-typed result with the same type as its operand.
def SDTWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
def Wrapper : SDNode<"NVPTXISD::Wrapper", SDTWrapper>;

// Load a memory address into a u32 or u64 register.
def MOV_ADDR :
  NVPTXInst<(outs Int32Regs:$dst), (ins imem:$a),
            "mov.u32 \t$dst, $a;",
            [(set Int32Regs:$dst, (Wrapper tglobaladdr:$a))]>;
def MOV_ADDR64 :
  NVPTXInst<(outs Int64Regs:$dst), (ins imem:$a),
            "mov.u64 \t$dst, $a;",
            [(set Int64Regs:$dst, (Wrapper tglobaladdr:$a))]>;

// Get pointer to local stack.
// The depot symbol is printed as "__local_depot" followed by $num.
let hasSideEffects = false in {
  def MOV_DEPOT_ADDR :
    NVPTXInst<(outs Int32Regs:$d), (ins i32imm:$num),
              "mov.u32 \t$d, __local_depot$num;", []>;
  def MOV_DEPOT_ADDR_64 :
    NVPTXInst<(outs Int64Regs:$d), (ins i32imm:$num),
              "mov.u64 \t$d, __local_depot$num;", []>;
}


// copyPhysreg is hard-coded in NVPTXInstrInfo.cpp
// Register-to-register moves.  None carries a selection pattern.
let IsSimpleMove=1, hasSideEffects=0 in {
  def IMOV1rr :
    NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$sss),
              "mov.pred \t$dst, $sss;", []>;
  def IMOV16rr :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$sss),
              "mov.u16 \t$dst, $sss;", []>;
  def IMOV32rr :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$sss),
              "mov.u32 \t$dst, $sss;", []>;
  def IMOV64rr :
    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$sss),
              "mov.u64 \t$dst, $sss;", []>;

  def IMOVB16rr :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$sss),
              "mov.b16 \t$dst, $sss;", []>;
  def IMOVB32rr :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$sss),
              "mov.b32 \t$dst, $sss;", []>;
  def IMOVB64rr :
    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$sss),
              "mov.b64 \t$dst, $sss;", []>;

  def FMOV16rr :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
              // We have to use .b16 here as there's no mov.f16.
              "mov.b16 \t$dst, $src;", []>;
  def FMOV32rr :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
              "mov.f32 \t$dst, $src;", []>;
  def FMOV64rr :
    NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$src),
              "mov.f64 \t$dst, $src;", []>;
}

// Integer immediate moves; these are the patterns that materialize integer
// constants.
def IMOV1ri :
  NVPTXInst<(outs Int1Regs:$dst), (ins i1imm:$src),
            "mov.pred \t$dst, $src;",
            [(set Int1Regs:$dst, imm:$src)]>;
def IMOV16ri :
  NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src),
            "mov.u16 \t$dst, $src;",
            [(set Int16Regs:$dst, imm:$src)]>;
def IMOV32ri :
  NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src),
            "mov.u32 \t$dst, $src;",
            [(set (i32 Int32Regs:$dst), imm:$src)]>;
def IMOV64ri :
  NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src),
            "mov.u64 \t$dst, $src;",
            [(set Int64Regs:$dst, imm:$src)]>;

// Untyped (.bNN) immediate moves; no selection patterns.
def IMOVB16ri :
  NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src),
            "mov.b16 \t$dst, $src;", []>;
def IMOVB32ri :
  NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src),
            "mov.b32 \t$dst, $src;", []>;
def IMOVB64ri :
  NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src),
            "mov.b64 \t$dst, $src;", []>;

// Floating-point immediate moves.
def FMOV32ri :
  NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$src),
            "mov.f32 \t$dst, $src;",
            [(set Float32Regs:$dst, fpimm:$src)]>;
def FMOV64ri :
  NVPTXInst<(outs Float64Regs:$dst), (ins f64imm:$src),
            "mov.f64 \t$dst, $src;",
            [(set Float64Regs:$dst, fpimm:$src)]>;

// A wrapped external symbol is materialized with an immediate move.
def : Pat<(i32 (Wrapper texternalsym:$dst)), (IMOV32ri texternalsym:$dst)>;
def : Pat<(i64 (Wrapper texternalsym:$dst)), (IMOV64ri texternalsym:$dst)>;

//---- Copy Frame Index ----
// The address operand is printed through its "add" modifier.
def LEA_ADDRi :
  NVPTXInst<(outs Int32Regs:$dst), (ins MEMri:$addr),
            "add.u32 \t$dst, ${addr:add};",
            [(set Int32Regs:$dst, ADDRri:$addr)]>;
def LEA_ADDRi64 :
  NVPTXInst<(outs Int64Regs:$dst), (ins MEMri64:$addr),
            "add.u64 \t$dst, ${addr:add};",
            [(set Int64Regs:$dst, ADDRri64:$addr)]>;

//-----------------------------------
// Comparison and Selection
//-----------------------------------

// Selection patterns for one integer comparison.
//   OpNode - the setcc pattern fragment being matched.
//   Mode   - the compare-mode leaf handed to the selected instruction.
// The remaining parameters are the setp (predicate result) and set (i32
// result) instructions to use for 16-, 32- and 64-bit operands, each in
// rr / ri / ir form (register-register, immediate RHS, immediate LHS).
multiclass ISET_FORMAT<PatFrag OpNode, PatLeaf Mode,
                       Instruction setp_16rr,
                       Instruction setp_16ri,
                       Instruction setp_16ir,
                       Instruction setp_32rr,
                       Instruction setp_32ri,
                       Instruction setp_32ir,
                       Instruction setp_64rr,
                       Instruction setp_64ri,
                       Instruction setp_64ir,
                       Instruction set_16rr,
                       Instruction set_16ri,
                       Instruction set_16ir,
                       Instruction set_32rr,
                       Instruction set_32ri,
                       Instruction set_32ir,
                       Instruction set_64rr,
                       Instruction set_64ri,
                       Instruction set_64ir> {
  // i16 -> pred
  def : Pat<(i1 (OpNode i16:$a, i16:$b)),
            (setp_16rr Int16Regs:$a, Int16Regs:$b, Mode)>;
  def : Pat<(i1 (OpNode Int16Regs:$a, imm:$b)),
            (setp_16ri Int16Regs:$a, imm:$b, Mode)>;
  def : Pat<(i1 (OpNode imm:$a, Int16Regs:$b)),
            (setp_16ir imm:$a, Int16Regs:$b, Mode)>;
  // i32 -> pred
  def : Pat<(i1 (OpNode i32:$a, i32:$b)),
            (setp_32rr Int32Regs:$a, Int32Regs:$b, Mode)>;
  def : Pat<(i1 (OpNode (i32 Int32Regs:$a), imm:$b)),
            (setp_32ri Int32Regs:$a, imm:$b, Mode)>;
  def : Pat<(i1 (OpNode imm:$a, (i32 Int32Regs:$b))),
            (setp_32ir imm:$a, Int32Regs:$b, Mode)>;
  // i64 -> pred
  def : Pat<(i1 (OpNode Int64Regs:$a, Int64Regs:$b)),
            (setp_64rr Int64Regs:$a, Int64Regs:$b, Mode)>;
  def : Pat<(i1 (OpNode Int64Regs:$a, imm:$b)),
            (setp_64ri Int64Regs:$a, imm:$b, Mode)>;
  def : Pat<(i1 (OpNode imm:$a, Int64Regs:$b)),
            (setp_64ir imm:$a, Int64Regs:$b, Mode)>;

  // i16 -> i32
  def : Pat<(i32 (OpNode i16:$a, i16:$b)),
            (set_16rr Int16Regs:$a, Int16Regs:$b, Mode)>;
  def : Pat<(i32 (OpNode Int16Regs:$a, imm:$b)),
            (set_16ri Int16Regs:$a, imm:$b, Mode)>;
  def : Pat<(i32 (OpNode imm:$a, Int16Regs:$b)),
            (set_16ir imm:$a, Int16Regs:$b, Mode)>;
  // i32 -> i32
  def : Pat<(i32 (OpNode i32:$a, i32:$b)),
            (set_32rr Int32Regs:$a, Int32Regs:$b, Mode)>;
  def : Pat<(i32 (OpNode (i32 Int32Regs:$a), imm:$b)),
            (set_32ri Int32Regs:$a, imm:$b, Mode)>;
  def : Pat<(i32 (OpNode imm:$a, (i32 Int32Regs:$b))),
            (set_32ir imm:$a, Int32Regs:$b, Mode)>;
  // i64 -> i32
  def : Pat<(i32 (OpNode Int64Regs:$a, Int64Regs:$b)),
            (set_64rr Int64Regs:$a, Int64Regs:$b, Mode)>;
  def : Pat<(i32 (OpNode Int64Regs:$a, imm:$b)),
            (set_64ri Int64Regs:$a, imm:$b, Mode)>;
  def : Pat<(i32 (OpNode imm:$a, Int64Regs:$b)),
            (set_64ir imm:$a, Int64Regs:$b, Mode)>;
}

// ISET_FORMAT bound to the signed (.sNN) setp/set instructions.
multiclass ISET_FORMAT_SIGNED<PatFrag OpNode, PatLeaf Mode>
  : ISET_FORMAT<OpNode, Mode,
                SETP_s16rr, SETP_s16ri, SETP_s16ir,
                SETP_s32rr, SETP_s32ri, SETP_s32ir,
                SETP_s64rr, SETP_s64ri, SETP_s64ir,
                SET_s16rr, SET_s16ri, SET_s16ir,
                SET_s32rr, SET_s32ri, SET_s32ir,
                SET_s64rr, SET_s64ri, SET_s64ir> {
  // TableGen doesn't like empty multiclasses.
  def : PatLeaf<(i32 0)>;
}

// ISET_FORMAT bound to the unsigned (.uNN) setp/set instructions.
multiclass ISET_FORMAT_UNSIGNED<PatFrag OpNode, PatLeaf Mode>
  : ISET_FORMAT<OpNode, Mode,
                SETP_u16rr, SETP_u16ri, SETP_u16ir,
                SETP_u32rr, SETP_u32ri, SETP_u32ir,
                SETP_u64rr, SETP_u64ri, SETP_u64ir,
                SET_u16rr, SET_u16ri, SET_u16ir,
                SET_u32rr, SET_u32ri, SET_u32ir,
                SET_u64rr, SET_u64ri, SET_u64ir> {
  // TableGen doesn't like empty multiclasses.
  def : PatLeaf<(i32 0)>;
}

defm : ISET_FORMAT_SIGNED<setgt, CmpGT>;
defm : ISET_FORMAT_SIGNED<setlt, CmpLT>;
defm : ISET_FORMAT_SIGNED<setge, CmpGE>;
defm : ISET_FORMAT_SIGNED<setle, CmpLE>;
defm : ISET_FORMAT_SIGNED<seteq, CmpEQ>;
defm : ISET_FORMAT_SIGNED<setne, CmpNE>;
defm : ISET_FORMAT_UNSIGNED<setugt, CmpGT>;
defm : ISET_FORMAT_UNSIGNED<setult, CmpLT>;
defm : ISET_FORMAT_UNSIGNED<setuge, CmpGE>;
defm : ISET_FORMAT_UNSIGNED<setule, CmpLE>;
defm : ISET_FORMAT_UNSIGNED<setueq, CmpEQ>;
defm : ISET_FORMAT_UNSIGNED<setune, CmpNE>;

// i1 compares
// Inequality of two predicates is their XOR; equality is the negated XOR.
def : Pat<(setne Int1Regs:$a, Int1Regs:$b),
          (XORb1rr Int1Regs:$a, Int1Regs:$b)>;
def : Pat<(setune Int1Regs:$a, Int1Regs:$b),
          (XORb1rr Int1Regs:$a, Int1Regs:$b)>;

def : Pat<(seteq Int1Regs:$a, Int1Regs:$b),
          (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
def : Pat<(setueq Int1Regs:$a, Int1Regs:$b),
          (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>;

// comparisons of i8 extracted with BFE as i32
// Signed: both sides are sext_inreg(trunc x, i8), compared as 32-bit values.
def: Pat<(setgt (sext_inreg (trunc Int32Regs:$a), i8), (sext_inreg (trunc Int32Regs:$b), i8)),
         (SETP_s32rr Int32Regs:$a, Int32Regs:$b, CmpGT)>;
def: Pat<(setge (sext_inreg (trunc Int32Regs:$a), i8), (sext_inreg (trunc Int32Regs:$b), i8)),
         (SETP_s32rr Int32Regs:$a, Int32Regs:$b, CmpGE)>;
def: Pat<(setlt (sext_inreg (trunc Int32Regs:$a), i8), (sext_inreg (trunc Int32Regs:$b), i8)),
         (SETP_s32rr Int32Regs:$a, Int32Regs:$b, CmpLT)>;
def: Pat<(setle (sext_inreg (trunc Int32Regs:$a), i8), (sext_inreg (trunc Int32Regs:$b), i8)),
         (SETP_s32rr Int32Regs:$a, Int32Regs:$b, CmpLE)>;

// Unsigned / equality: both sides are (trunc x) & 255.
def: Pat<(setugt (i16 (and (trunc Int32Regs:$a), 255)), (i16 (and (trunc Int32Regs:$b), 255))),
         (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpHI)>;
def: Pat<(setuge (i16 (and (trunc Int32Regs:$a), 255)), (i16 (and (trunc Int32Regs:$b), 255))),
         (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpHS)>;
def: Pat<(setult (i16 (and (trunc Int32Regs:$a), 255)), (i16 (and (trunc Int32Regs:$b), 255))),
         (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpLO)>;
def: Pat<(setule (i16 (and (trunc Int32Regs:$a), 255)), (i16 (and (trunc Int32Regs:$b), 255))),
         (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpLS)>;
def: Pat<(seteq (i16 (and (trunc Int32Regs:$a), 255)), (i16 (and (trunc Int32Regs:$b), 255))),
         (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpEQ)>;
def: Pat<(setne (i16 (and (trunc Int32Regs:$a), 255)), (i16 (and (trunc Int32Regs:$b), 255))),
         (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpNE)>;

// i1 compare -> i32
// The XOR of the two predicates is true exactly when they differ, so:
//   setne selects -1 when the XOR is set, 0 otherwise;
//   seteq selects 0 when the XOR is set, -1 otherwise.
// The second pattern used to repeat the setne source pattern, which made it
// unreachable and left seteq without an i32 form.
def : Pat<(i32 (setne Int1Regs:$a, Int1Regs:$b)),
          (SELP_u32ii -1, 0, (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
def : Pat<(i32 (seteq Int1Regs:$a, Int1Regs:$b)),
          (SELP_u32ii 0, -1, (XORb1rr Int1Regs:$a, Int1Regs:$b))>;



// Selection patterns for one floating-point comparison.
//   OpNode  - the setcc pattern fragment being matched.
//   Mode    - compare-mode leaf used by default.
//   ModeFTZ - compare-mode leaf used by the patterns guarded by doF32FTZ.
// f16 and bf16 constants are first materialized into a register with
// LOAD_CONST_F16 / LOAD_CONST_BF16, so those patterns always select the
// register-register instruction.  f64 has no FTZ variants.
multiclass FSET_FORMAT<PatFrag OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {
  // f16 -> pred
  def : Pat<(i1 (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b))),
            (SETP_f16rr Int16Regs:$a, Int16Regs:$b, ModeFTZ)>,
        Requires<[useFP16Math, doF32FTZ]>;
  def : Pat<(i1 (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b))),
            (SETP_f16rr Int16Regs:$a, Int16Regs:$b, Mode)>,
        Requires<[useFP16Math]>;
  def : Pat<(i1 (OpNode (f16 Int16Regs:$a), fpimm:$b)),
            (SETP_f16rr Int16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>,
        Requires<[useFP16Math, doF32FTZ]>;
  def : Pat<(i1 (OpNode (f16 Int16Regs:$a), fpimm:$b)),
            (SETP_f16rr Int16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>,
        Requires<[useFP16Math]>;
  def : Pat<(i1 (OpNode fpimm:$a, (f16 Int16Regs:$b))),
            (SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Int16Regs:$b, ModeFTZ)>,
        Requires<[useFP16Math, doF32FTZ]>;
  def : Pat<(i1 (OpNode fpimm:$a, (f16 Int16Regs:$b))),
            (SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Int16Regs:$b, Mode)>,
        Requires<[useFP16Math]>;

  // bf16 -> pred
  def : Pat<(i1 (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b))),
            (SETP_bf16rr Int16Regs:$a, Int16Regs:$b, ModeFTZ)>,
        Requires<[hasBF16Math, doF32FTZ]>;
  def : Pat<(i1 (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b))),
            (SETP_bf16rr Int16Regs:$a, Int16Regs:$b, Mode)>,
        Requires<[hasBF16Math]>;
  def : Pat<(i1 (OpNode (bf16 Int16Regs:$a), fpimm:$b)),
            (SETP_bf16rr Int16Regs:$a, (LOAD_CONST_BF16 fpimm:$b), ModeFTZ)>,
        Requires<[hasBF16Math, doF32FTZ]>;
  def : Pat<(i1 (OpNode (bf16 Int16Regs:$a), fpimm:$b)),
            (SETP_bf16rr Int16Regs:$a, (LOAD_CONST_BF16 fpimm:$b), Mode)>,
        Requires<[hasBF16Math]>;
  def : Pat<(i1 (OpNode fpimm:$a, (bf16 Int16Regs:$b))),
            (SETP_bf16rr (LOAD_CONST_BF16 fpimm:$a), Int16Regs:$b, ModeFTZ)>,
        Requires<[hasBF16Math, doF32FTZ]>;
  def : Pat<(i1 (OpNode fpimm:$a, (bf16 Int16Regs:$b))),
            (SETP_bf16rr (LOAD_CONST_BF16 fpimm:$a), Int16Regs:$b, Mode)>,
        Requires<[hasBF16Math]>;

  // f32 -> pred
  def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)),
            (SETP_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>,
        Requires<[doF32FTZ]>;
  def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)),
            (SETP_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>;
  def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)),
            (SETP_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>,
        Requires<[doF32FTZ]>;
  def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)),
            (SETP_f32ri Float32Regs:$a, fpimm:$b, Mode)>;
  def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)),
            (SETP_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>,
        Requires<[doF32FTZ]>;
  def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)),
            (SETP_f32ir fpimm:$a, Float32Regs:$b, Mode)>;

  // f64 -> pred
  def : Pat<(i1 (OpNode Float64Regs:$a, Float64Regs:$b)),
            (SETP_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>;
  def : Pat<(i1 (OpNode Float64Regs:$a, fpimm:$b)),
            (SETP_f64ri Float64Regs:$a, fpimm:$b, Mode)>;
  def : Pat<(i1 (OpNode fpimm:$a, Float64Regs:$b)),
            (SETP_f64ir fpimm:$a, Float64Regs:$b, Mode)>;

  // f16 -> i32
  def : Pat<(i32 (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b))),
            (SET_f16rr Int16Regs:$a, Int16Regs:$b, ModeFTZ)>,
        Requires<[useFP16Math, doF32FTZ]>;
  def : Pat<(i32 (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b))),
            (SET_f16rr Int16Regs:$a, Int16Regs:$b, Mode)>,
        Requires<[useFP16Math]>;
  def : Pat<(i32 (OpNode (f16 Int16Regs:$a), fpimm:$b)),
            (SET_f16rr Int16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>,
        Requires<[useFP16Math, doF32FTZ]>;
  def : Pat<(i32 (OpNode (f16 Int16Regs:$a), fpimm:$b)),
            (SET_f16rr Int16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>,
        Requires<[useFP16Math]>;
  // The constant LHS is materialized into a register by LOAD_CONST_F16, so
  // the register-register form is the matching instruction (the "ir" form
  // expects an immediate in that slot).
  def : Pat<(i32 (OpNode fpimm:$a, (f16 Int16Regs:$b))),
            (SET_f16rr (LOAD_CONST_F16 fpimm:$a), Int16Regs:$b, ModeFTZ)>,
        Requires<[useFP16Math, doF32FTZ]>;
  def : Pat<(i32 (OpNode fpimm:$a, (f16 Int16Regs:$b))),
            (SET_f16rr (LOAD_CONST_F16 fpimm:$a), Int16Regs:$b, Mode)>,
        Requires<[useFP16Math]>;

  // bf16 -> i32
  def : Pat<(i32 (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b))),
            (SET_bf16rr Int16Regs:$a, Int16Regs:$b, ModeFTZ)>,
        Requires<[hasBF16Math, doF32FTZ]>;
  def : Pat<(i32 (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b))),
            (SET_bf16rr Int16Regs:$a, Int16Regs:$b, Mode)>,
        Requires<[hasBF16Math]>;
  def : Pat<(i32 (OpNode (bf16 Int16Regs:$a), fpimm:$b)),
            (SET_bf16rr Int16Regs:$a, (LOAD_CONST_BF16 fpimm:$b), ModeFTZ)>,
        Requires<[hasBF16Math, doF32FTZ]>;
  def : Pat<(i32 (OpNode (bf16 Int16Regs:$a), fpimm:$b)),
            (SET_bf16rr Int16Regs:$a, (LOAD_CONST_BF16 fpimm:$b), Mode)>,
        Requires<[hasBF16Math]>;
  // As for f16: the LHS constant lives in a register after LOAD_CONST_BF16.
  def : Pat<(i32 (OpNode fpimm:$a, (bf16 Int16Regs:$b))),
            (SET_bf16rr (LOAD_CONST_BF16 fpimm:$a), Int16Regs:$b, ModeFTZ)>,
        Requires<[hasBF16Math, doF32FTZ]>;
  def : Pat<(i32 (OpNode fpimm:$a, (bf16 Int16Regs:$b))),
            (SET_bf16rr (LOAD_CONST_BF16 fpimm:$a), Int16Regs:$b, Mode)>,
        Requires<[hasBF16Math]>;

  // f32 -> i32
  def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)),
            (SET_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>,
        Requires<[doF32FTZ]>;
  def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)),
            (SET_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>;
  def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)),
            (SET_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>,
        Requires<[doF32FTZ]>;
  def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)),
            (SET_f32ri Float32Regs:$a, fpimm:$b, Mode)>;
  def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)),
            (SET_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>,
        Requires<[doF32FTZ]>;
  def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)),
            (SET_f32ir fpimm:$a, Float32Regs:$b, Mode)>;

  // f64 -> i32
  def : Pat<(i32 (OpNode Float64Regs:$a, Float64Regs:$b)),
            (SET_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>;
  def : Pat<(i32 (OpNode Float64Regs:$a, fpimm:$b)),
            (SET_f64ri Float64Regs:$a, fpimm:$b, Mode)>;
  def : Pat<(i32 (OpNode fpimm:$a, Float64Regs:$b)),
            (SET_f64ir fpimm:$a, Float64Regs:$b, Mode)>;
}

// Ordered comparisons.
defm FSetOGT : FSET_FORMAT<setogt, CmpGT, CmpGT_FTZ>;
defm FSetOLT : FSET_FORMAT<setolt, CmpLT, CmpLT_FTZ>;
defm FSetOGE : FSET_FORMAT<setoge, CmpGE, CmpGE_FTZ>;
defm FSetOLE : FSET_FORMAT<setole, CmpLE, CmpLE_FTZ>;
defm FSetOEQ : FSET_FORMAT<setoeq, CmpEQ, CmpEQ_FTZ>;
defm FSetONE : FSET_FORMAT<setone, CmpNE, CmpNE_FTZ>;

// Unordered comparisons.
defm FSetUGT : FSET_FORMAT<setugt, CmpGTU, CmpGTU_FTZ>;
defm FSetULT : FSET_FORMAT<setult, CmpLTU, CmpLTU_FTZ>;
defm FSetUGE : FSET_FORMAT<setuge, CmpGEU, CmpGEU_FTZ>;
defm FSetULE : FSET_FORMAT<setule, CmpLEU, CmpLEU_FTZ>;
defm FSetUEQ : FSET_FORMAT<setueq, CmpEQU, CmpEQU_FTZ>;
defm FSetUNE : FSET_FORMAT<setune, CmpNEU, CmpNEU_FTZ>;

// Comparisons that specify neither ordered nor unordered use the same modes
// as the ordered forms.
defm FSetGT : FSET_FORMAT<setgt, CmpGT, CmpGT_FTZ>;
defm FSetLT : FSET_FORMAT<setlt, CmpLT, CmpLT_FTZ>;
defm FSetGE : FSET_FORMAT<setge, CmpGE, CmpGE_FTZ>;
defm FSetLE : FSET_FORMAT<setle, CmpLE, CmpLE_FTZ>;
defm FSetEQ : FSET_FORMAT<seteq, CmpEQ, CmpEQ_FTZ>;
defm FSetNE : FSET_FORMAT<setne, CmpNE, CmpNE_FTZ>;

defm FSetNUM : FSET_FORMAT<seto, CmpNUM, CmpNUM_FTZ>;
defm FSetNAN : FSET_FORMAT<setuo, CmpNAN, CmpNAN_FTZ>;

// FIXME: What is this doing here?  Can it be deleted?
// def ld_param         : SDNode<"NVPTXISD::LOAD_PARAM", SDTLoad,
//                         [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;

//===----------------------------------------------------------------------===//
// Type profiles for the NVPTXISD call / parameter / return-value nodes.
// SDTypeProfile<NumResults, NumOperands, Constraints>.
//===----------------------------------------------------------------------===//

def SDTDeclareParamProfile :
  SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>;
def SDTDeclareScalarParamProfile :
  SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>;
def SDTLoadParamProfile :
  SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>;
def SDTLoadParamV2Profile :
  SDTypeProfile<2, 2, [SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisInt<3>]>;
def SDTLoadParamV4Profile :
  SDTypeProfile<4, 2, [SDTCisInt<4>, SDTCisInt<5>]>;
def SDTPrintCallProfile :
  SDTypeProfile<0, 1, [SDTCisInt<0>]>;
def SDTPrintCallUniProfile :
  SDTypeProfile<0, 1, [SDTCisInt<0>]>;
def SDTStoreParamProfile :
  SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;
def SDTStoreParamV2Profile :
  SDTypeProfile<0, 4, [SDTCisInt<0>, SDTCisInt<1>]>;
def SDTStoreParamV4Profile :
  SDTypeProfile<0, 6, [SDTCisInt<0>, SDTCisInt<1>]>;
def SDTStoreParam32Profile :
  SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;
def SDTCallArgProfile :
  SDTypeProfile<0, 2, [SDTCisInt<0>]>;
def SDTCallArgMarkProfile :
  SDTypeProfile<0, 0, []>;
def SDTCallVoidProfile :
  SDTypeProfile<0, 1, []>;
def SDTCallValProfile :
  SDTypeProfile<1, 0, []>;
def SDTMoveParamProfile :
  SDTypeProfile<1, 1, []>;
def SDTStoreRetvalProfile :
  SDTypeProfile<0, 2, [SDTCisInt<0>]>;
def SDTStoreRetvalV2Profile :
  SDTypeProfile<0, 3, [SDTCisInt<0>]>;
def SDTStoreRetvalV4Profile :
  SDTypeProfile<0, 5, [SDTCisInt<0>]>;
def SDTPseudoUseParamProfile :
  SDTypeProfile<0, 1, []>;
def SDTProxyRegProfile :
  SDTypeProfile<1, 1, []>;

//===----------------------------------------------------------------------===//
// SDNode wrappers for the NVPTXISD opcodes named in each string below.
//===----------------------------------------------------------------------===//

// Parameter / return-value declarations.
def DeclareParam :
  SDNode<"NVPTXISD::DeclareParam", SDTDeclareParamProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def DeclareScalarParam :
  SDNode<"NVPTXISD::DeclareScalarParam", SDTDeclareScalarParamProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def DeclareRetParam :
  SDNode<"NVPTXISD::DeclareRetParam", SDTDeclareParamProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def DeclareRet :
  SDNode<"NVPTXISD::DeclareRet", SDTDeclareScalarParamProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;

// Parameter loads (scalar, 2-element and 4-element forms).
def LoadParam :
  SDNode<"NVPTXISD::LoadParam", SDTLoadParamProfile,
         [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
def LoadParamV2 :
  SDNode<"NVPTXISD::LoadParamV2", SDTLoadParamV2Profile,
         [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
def LoadParamV4 :
  SDNode<"NVPTXISD::LoadParamV4", SDTLoadParamV4Profile,
         [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;

// Nodes that print the leading "call ..." text.
def PrintCall :
  SDNode<"NVPTXISD::PrintCall", SDTPrintCallProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def PrintConvergentCall :
  SDNode<"NVPTXISD::PrintConvergentCall", SDTPrintCallProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def PrintCallUni :
  SDNode<"NVPTXISD::PrintCallUni", SDTPrintCallUniProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def PrintConvergentCallUni :
  SDNode<"NVPTXISD::PrintConvergentCallUni", SDTPrintCallUniProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;

// Parameter stores.
def StoreParam :
  SDNode<"NVPTXISD::StoreParam", SDTStoreParamProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def StoreParamV2 :
  SDNode<"NVPTXISD::StoreParamV2", SDTStoreParamV2Profile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def StoreParamV4 :
  SDNode<"NVPTXISD::StoreParamV4", SDTStoreParamV4Profile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def StoreParamU32 :
  SDNode<"NVPTXISD::StoreParamU32", SDTStoreParam32Profile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def StoreParamS32 :
  SDNode<"NVPTXISD::StoreParamS32", SDTStoreParam32Profile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;

// Pieces of the printed call statement: argument list, callee, prototype.
def CallArgBegin :
  SDNode<"NVPTXISD::CallArgBegin", SDTCallArgMarkProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def CallArg :
  SDNode<"NVPTXISD::CallArg", SDTCallArgProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def LastCallArg :
  SDNode<"NVPTXISD::LastCallArg", SDTCallArgProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def CallArgEnd :
  SDNode<"NVPTXISD::CallArgEnd", SDTCallVoidProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def CallVoid :
  SDNode<"NVPTXISD::CallVoid", SDTCallVoidProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def Prototype :
  SDNode<"NVPTXISD::Prototype", SDTCallVoidProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def CallVal :
  SDNode<"NVPTXISD::CallVal", SDTCallValProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;

// MoveParam carries no node properties (no chain, no glue).
def MoveParam :
  SDNode<"NVPTXISD::MoveParam", SDTMoveParamProfile, []>;

// Return-value stores are chained but not glued.
def StoreRetval :
  SDNode<"NVPTXISD::StoreRetval", SDTStoreRetvalProfile,
         [SDNPHasChain, SDNPSideEffect]>;
def StoreRetvalV2 :
  SDNode<"NVPTXISD::StoreRetvalV2", SDTStoreRetvalV2Profile,
         [SDNPHasChain, SDNPSideEffect]>;
def StoreRetvalV4 :
  SDNode<"NVPTXISD::StoreRetvalV4", SDTStoreRetvalV4Profile,
         [SDNPHasChain, SDNPSideEffect]>;

def PseudoUseParam :
  SDNode<"NVPTXISD::PseudoUseParam", SDTPseudoUseParamProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def RETURNNode :
  SDNode<"NVPTXISD::RETURN", SDTCallArgMarkProfile,
         [SDNPHasChain, SDNPSideEffect]>;
def ProxyReg :
  SDNode<"NVPTXISD::ProxyReg", SDTProxyRegProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;

//===----------------------------------------------------------------------===//
// Instruction classes for reading the callee's return value (retval0) and
// for writing outgoing parameters (param<N>) / the function's own return
// value (func_retval0).  None of these classes carries a selection pattern
// except LoadParamRegInst.
//===----------------------------------------------------------------------===//

let mayLoad = true in {
  // ld.param<opstr> $dst, [retval0+$b]
  class LoadParamMemInst<NVPTXRegClass regclass, string opstr> :
    NVPTXInst<(outs regclass:$dst),
              (ins i32imm:$b),
              "ld.param" # opstr # " \t$dst, [retval0+$b];",
              []>;

  // ld.param.v2<opstr> {$dst, $dst2}, [retval0+$b]
  class LoadParamV2MemInst<NVPTXRegClass regclass, string opstr> :
    NVPTXInst<(outs regclass:$dst, regclass:$dst2),
              (ins i32imm:$b),
              "ld.param.v2" # opstr # " \t{{$dst, $dst2}}, [retval0+$b];",
              []>;

  // ld.param.v4<opstr> {$dst, $dst2, $dst3, $dst4}, [retval0+$b]
  class LoadParamV4MemInst<NVPTXRegClass regclass, string opstr> :
    NVPTXInst<(outs regclass:$dst, regclass:$dst2, regclass:$dst3,
                    regclass:$dst4),
              (ins i32imm:$b),
              "ld.param.v4" # opstr
                # " \t{{$dst, $dst2, $dst3, $dst4}}, [retval0+$b];",
              []>;
}

// mov<opstr> $dst, retval$b -- selected for (LoadParam 0, imm).
class LoadParamRegInst<NVPTXRegClass regclass, string opstr> :
  NVPTXInst<(outs regclass:$dst),
            (ins i32imm:$b),
            "mov" # opstr # " \t$dst, retval$b;",
            [(set regclass:$dst, (LoadParam (i32 0), (i32 imm:$b)))]>;

let mayStore = true in {
  // st.param<opstr> [param$a+$b], $val
  class StoreParamInst<NVPTXRegClass regclass, string opstr> :
    NVPTXInst<(outs),
              (ins regclass:$val, i32imm:$a, i32imm:$b),
              "st.param" # opstr # " \t[param$a+$b], $val;",
              []>;

  // st.param.v2<opstr> [param$a+$b], {$val, $val2}
  class StoreParamV2Inst<NVPTXRegClass regclass, string opstr> :
    NVPTXInst<(outs),
              (ins regclass:$val, regclass:$val2, i32imm:$a, i32imm:$b),
              "st.param.v2" # opstr # " \t[param$a+$b], {{$val, $val2}};",
              []>;

  // st.param.v4<opstr> [param$a+$b], {$val, $val2, $val3, $val4}
  class StoreParamV4Inst<NVPTXRegClass regclass, string opstr> :
    NVPTXInst<(outs),
              (ins regclass:$val, regclass:$val2, regclass:$val3,
                   regclass:$val4, i32imm:$a, i32imm:$b),
              "st.param.v4" # opstr
                # " \t[param$a+$b], {{$val, $val2, $val3, $val4}};",
              []>;

  // st.param<opstr> [func_retval0+$a], $val
  class StoreRetvalInst<NVPTXRegClass regclass, string opstr> :
    NVPTXInst<(outs),
              (ins regclass:$val, i32imm:$a),
              "st.param" # opstr # " \t[func_retval0+$a], $val;",
              []>;

  // st.param.v2<opstr> [func_retval0+$a], {$val, $val2}
  class StoreRetvalV2Inst<NVPTXRegClass regclass, string opstr> :
    NVPTXInst<(outs),
              (ins regclass:$val, regclass:$val2, i32imm:$a),
              "st.param.v2" # opstr # " \t[func_retval0+$a], {{$val, $val2}};",
              []>;

  // st.param.v4<opstr> [func_retval0+$a], {$val, $val2, $val3, $val4}
  class StoreRetvalV4Inst<NVPTXRegClass regclass, string opstr> :
    NVPTXInst<(outs),
              (ins regclass:$val, regclass:$val2, regclass:$val3,
                   regclass:$val4, i32imm:$a),
              "st.param.v4" # opstr
                # " \t[func_retval0+$a], {{$val, $val2, $val3, $val4}};",
              []>;
}

// Pseudo-instructions that print the head of a call statement.  The integer
// operand of OpNode (0..8) selects how many retval<N> names are listed in the
// parenthesised return list; 0 prints no return list at all.
let isCall=1 in {
  multiclass CALL<string OpcStr, SDNode OpNode> {
    def PrintCallNoRetInst : NVPTXInst<(outs), (ins),
      OpcStr # " ",
      [(OpNode (i32 0))]>;
    def PrintCallRetInst1 : NVPTXInst<(outs), (ins),
      OpcStr # " (retval0), ",
      [(OpNode (i32 1))]>;
    def PrintCallRetInst2 : NVPTXInst<(outs), (ins),
      OpcStr # " (retval0, retval1), ",
      [(OpNode (i32 2))]>;
    def PrintCallRetInst3 : NVPTXInst<(outs), (ins),
      OpcStr # " (retval0, retval1, retval2), ",
      [(OpNode (i32 3))]>;
    def PrintCallRetInst4 : NVPTXInst<(outs), (ins),
      OpcStr # " (retval0, retval1, retval2, retval3), ",
      [(OpNode (i32 4))]>;
    def PrintCallRetInst5 : NVPTXInst<(outs), (ins),
      OpcStr # " (retval0, retval1, retval2, retval3, retval4), ",
      [(OpNode (i32 5))]>;
    def PrintCallRetInst6 : NVPTXInst<(outs), (ins),
      OpcStr # " (retval0, retval1, retval2, retval3, retval4, "
                "retval5), ",
      [(OpNode (i32 6))]>;
    def PrintCallRetInst7 : NVPTXInst<(outs), (ins),
      OpcStr # " (retval0, retval1, retval2, retval3, retval4, "
                "retval5, retval6), ",
      [(OpNode (i32 7))]>;
    def PrintCallRetInst8 : NVPTXInst<(outs), (ins),
      OpcStr # " (retval0, retval1, retval2, retval3, retval4, "
                "retval5, retval6, retval7), ",
      [(OpNode (i32 8))]>;
  }
}

defm Call : CALL<"call", PrintCall>;
defm CallUni : CALL<"call.uni", PrintCallUni>;

// Convergent call instructions.  These are identical to regular calls, except
// they have the isConvergent bit set.
let isConvergent=1 in {
  defm ConvergentCall : CALL<"call", PrintConvergentCall>;
  defm ConvergentCallUni : CALL<"call.uni", PrintConvergentCallUni>;
}

// Loads from retval0.  The 8-bit variants use Int16Regs as their register
// class, as do the 16-bit ones.
def LoadParamMemI64    : LoadParamMemInst<Int64Regs, ".b64">;
def LoadParamMemI32    : LoadParamMemInst<Int32Regs, ".b32">;
def LoadParamMemI16    : LoadParamMemInst<Int16Regs, ".b16">;
def LoadParamMemI8     : LoadParamMemInst<Int16Regs, ".b8">;
def LoadParamMemV2I64  : LoadParamV2MemInst<Int64Regs, ".b64">;
def LoadParamMemV2I32  : LoadParamV2MemInst<Int32Regs, ".b32">;
def LoadParamMemV2I16  : LoadParamV2MemInst<Int16Regs, ".b16">;
def LoadParamMemV2I8   : LoadParamV2MemInst<Int16Regs, ".b8">;
def LoadParamMemV4I32  : LoadParamV4MemInst<Int32Regs, ".b32">;
def LoadParamMemV4I16  : LoadParamV4MemInst<Int16Regs, ".b16">;
def LoadParamMemV4I8   : LoadParamV4MemInst<Int16Regs, ".b8">;
def LoadParamMemF32    : LoadParamMemInst<Float32Regs, ".f32">;
def LoadParamMemF64    : LoadParamMemInst<Float64Regs, ".f64">;
def LoadParamMemV2F32  : LoadParamV2MemInst<Float32Regs, ".f32">;
def LoadParamMemV2F64  : LoadParamV2MemInst<Float64Regs, ".f64">;
def LoadParamMemV4F32  : LoadParamV4MemInst<Float32Regs, ".f32">;

// Stores to outgoing call parameters.
def StoreParamI64    : StoreParamInst<Int64Regs, ".b64">;
def StoreParamI32    : StoreParamInst<Int32Regs, ".b32">;

def StoreParamI16    : StoreParamInst<Int16Regs, ".b16">;
def StoreParamI8     : StoreParamInst<Int16Regs, ".b8">;
def StoreParamV2I64  : StoreParamV2Inst<Int64Regs, ".b64">;
def StoreParamV2I32  : StoreParamV2Inst<Int32Regs, ".b32">;
def StoreParamV2I16  : StoreParamV2Inst<Int16Regs, ".b16">;
def StoreParamV2I8   : StoreParamV2Inst<Int16Regs, ".b8">;

def StoreParamV4I32  : StoreParamV4Inst<Int32Regs, ".b32">;
def StoreParamV4I16  : StoreParamV4Inst<Int16Regs, ".b16">;
def StoreParamV4I8   : StoreParamV4Inst<Int16Regs, ".b8">;

def StoreParamF32    : StoreParamInst<Float32Regs, ".f32">;
def StoreParamF64    : StoreParamInst<Float64Regs, ".f64">;
def StoreParamV2F32  : StoreParamV2Inst<Float32Regs, ".f32">;
def StoreParamV2F64  : StoreParamV2Inst<Float64Regs, ".f64">;
def StoreParamV4F32  : StoreParamV4Inst<Float32Regs, ".f32">;

// Stores to func_retval0.
def StoreRetvalI64    : StoreRetvalInst<Int64Regs, ".b64">;
def StoreRetvalI32    : StoreRetvalInst<Int32Regs, ".b32">;
def StoreRetvalI16    : StoreRetvalInst<Int16Regs, ".b16">;
def StoreRetvalI8     : StoreRetvalInst<Int16Regs, ".b8">;
def StoreRetvalV2I64  : StoreRetvalV2Inst<Int64Regs, ".b64">;
def StoreRetvalV2I32  : StoreRetvalV2Inst<Int32Regs, ".b32">;
def StoreRetvalV2I16  : StoreRetvalV2Inst<Int16Regs, ".b16">;
def StoreRetvalV2I8   : StoreRetvalV2Inst<Int16Regs, ".b8">;
def StoreRetvalV4I32  : StoreRetvalV4Inst<Int32Regs, ".b32">;
def StoreRetvalV4I16  : StoreRetvalV4Inst<Int16Regs, ".b16">;
def StoreRetvalV4I8   : StoreRetvalV4Inst<Int16Regs, ".b8">;

def StoreRetvalF64    : StoreRetvalInst<Float64Regs, ".f64">;
def StoreRetvalF32    : StoreRetvalInst<Float32Regs, ".f32">;
def StoreRetvalV2F64  : StoreRetvalV2Inst<Float64Regs, ".f64">;
def StoreRetvalV2F32  : StoreRetvalV2Inst<Float32Regs, ".f32">;
def StoreRetvalV4F32  : StoreRetvalV4Inst<Float32Regs, ".f32">;

// Punctuation of the printed argument list.  CallArgEnd with operand 1 prints
// the closing parenthesis followed by ';', with operand 0 only the
// parenthesis.
def CallArgBeginInst : NVPTXInst<(outs), (ins), "(", [(CallArgBegin)]>;
def CallArgEndInst1  : NVPTXInst<(outs), (ins), ");", [(CallArgEnd (i32 1))]>;
def CallArgEndInst0  : NVPTXInst<(outs), (ins), ")", [(CallArgEnd (i32 0))]>;
def RETURNInst       : NVPTXInst<(outs), (ins), "ret;", [(RETURNNode)]>;

// A non-final register argument: printed as "$a, ".
class CallArgInst<NVPTXRegClass regclass> :
  NVPTXInst<(outs), (ins regclass:$a),
            "$a, ",
            [(CallArg (i32 0), regclass:$a)]>;

// Same as CallArgInst, but the pattern names the value type explicitly.
class CallArgInstVT<NVPTXRegClass regclass, ValueType vt> :
  NVPTXInst<(outs), (ins regclass:$a),
            "$a, ",
            [(CallArg (i32 0), vt:$a)]>;

// The final register argument: printed as "$a" with no trailing separator.
class LastCallArgInst<NVPTXRegClass regclass> :
  NVPTXInst<(outs), (ins regclass:$a),
            "$a",
            [(LastCallArg (i32 0), regclass:$a)]>;
class LastCallArgInstVT<NVPTXRegClass regclass, ValueType vt> :
  NVPTXInst<(outs), (ins regclass:$a),
            "$a",
            [(LastCallArg (i32 0), vt:$a)]>;

def CallArgI64 : CallArgInst<Int64Regs>;
def CallArgI32 : CallArgInstVT<Int32Regs, i32>;
def CallArgI16 : CallArgInstVT<Int16Regs, i16>;
def CallArgF64 : CallArgInst<Float64Regs>;
def CallArgF32 : CallArgInst<Float32Regs>;

def LastCallArgI64 : LastCallArgInst<Int64Regs>;
def LastCallArgI32 : LastCallArgInstVT<Int32Regs, i32>;
def LastCallArgI16 : LastCallArgInstVT<Int16Regs, i16>;
def LastCallArgF64 : LastCallArgInst<Float64Regs>;
def LastCallArgF32 : LastCallArgInst<Float32Regs>;

// Immediate arguments: first operand 0 prints the raw immediate.
def CallArgI32imm :
  NVPTXInst<(outs), (ins i32imm:$a),
            "$a, ",
            [(CallArg (i32 0), (i32 imm:$a))]>;
def LastCallArgI32imm :
  NVPTXInst<(outs), (ins i32imm:$a),
            "$a",
            [(LastCallArg (i32 0), (i32 imm:$a))]>;

// First operand 1 prints the immediate as a parameter name, "param<N>".
def CallArgParam :
  NVPTXInst<(outs), (ins i32imm:$a),
            "param$a, ",
            [(CallArg (i32 1), (i32 imm:$a))]>;
def LastCallArgParam :
  NVPTXInst<(outs), (ins i32imm:$a),
            "param$a",
            [(LastCallArg (i32 1), (i32 imm:$a))]>;

// Callee operand of the call: a global symbol, or a 32-/64-bit register.
def CallVoidInst :
  NVPTXInst<(outs), (ins imem:$addr),
            "$addr, ",
            [(CallVoid (Wrapper tglobaladdr:$addr))]>;
def CallVoidInstReg :
  NVPTXInst<(outs), (ins Int32Regs:$addr),
            "$addr, ",
            [(CallVoid i32:$addr)]>;
def CallVoidInstReg64 :
  NVPTXInst<(outs), (ins Int64Regs:$addr),
            "$addr, ",
            [(CallVoid Int64Regs:$addr)]>;
def PrototypeInst :
  NVPTXInst<(outs), (ins i32imm:$val),
            ", prototype_$val;",
            [(Prototype (i32 imm:$val))]>;

// Declarations of the callee's return slot.  The first DeclareRet operand
// selects the printed form: 1 -> ".param", 2 -> ".reg".
def DeclareRetMemInst :
  NVPTXInst<(outs), (ins i32imm:$align, i32imm:$size, i32imm:$num),
            ".param .align $align .b8 retval$num[$size];",
            [(DeclareRetParam (i32 imm:$align), (i32 imm:$size),
                              (i32 imm:$num))]>;
def DeclareRetScalarInst :
  NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num),
            ".param .b$size retval$num;",
            [(DeclareRet (i32 1), (i32 imm:$size), (i32 imm:$num))]>;
def DeclareRetRegInst :
  NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num),
            ".reg .b$size retval$num;",
            [(DeclareRet (i32 2), (i32 imm:$size), (i32 imm:$num))]>;

// Declarations of outgoing parameters.  The last DeclareScalarParam operand
// selects the printed form: 0 -> ".param", 1 -> ".reg".
def DeclareParamInst :
  NVPTXInst<(outs), (ins i32imm:$align, i32imm:$a, i32imm:$size),
            ".param .align $align .b8 param$a[$size];",
            [(DeclareParam (i32 imm:$align), (i32 imm:$a),
                           (i32 imm:$size))]>;
def DeclareScalarParamInst :
  NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size),
            ".param .b$size param$a;",
            [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 0))]>;
def DeclareScalarRegInst :
  NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size),
            ".reg .b$size param$a;",
            [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 1))]>;

// mov<asmstr> $dst, $src -- register-to-register form of MoveParam.
class MoveParamInst<ValueType T, NVPTXRegClass regclass, string asmstr> :
  NVPTXInst<(outs regclass:$dst), (ins regclass:$src),
            "mov" # asmstr # " \t$dst, $src;",
            [(set (T regclass:$dst), (MoveParam (T regclass:$src)))]>;

// mov<asmstr> $dst, $src -- MoveParam whose source is an external symbol.
class MoveParamSymbolInst<NVPTXRegClass regclass, Operand srcty, ValueType vt,
                          string asmstr> :
  NVPTXInst<(outs regclass:$dst), (ins srcty:$src),
            "mov" # asmstr # " \t$dst, $src;",
            [(set vt:$dst, (MoveParam texternalsym:$src))]>;
// MoveParam instantiations for registers of each scalar type.
def MoveParamI64 : MoveParamInst<i64, Int64Regs, ".b64">;
def MoveParamI32 : MoveParamInst<i32, Int32Regs, ".b32">;

// MoveParam instantiations whose source is an external symbol.
def MoveParamSymbolI64 : MoveParamSymbolInst<Int64Regs, i64imm, i64, ".b64">;
def MoveParamSymbolI32 : MoveParamSymbolInst<Int32Regs, i32imm, i32, ".b32">;

// The 16-bit variant is spelled out by hand because it prints a cvt rather
// than a mov.
def MoveParamI16 :
  NVPTXInst<(outs Int16Regs:$dst),
            (ins Int16Regs:$src),
            "cvt.u16.u32 \t$dst, $src;", // ??? Why cvt.u16.u32 ?
            [(set i16:$dst, (MoveParam i16:$src))]>;
def MoveParamF64 : MoveParamInst<f64, Float64Regs, ".f64">;
def MoveParamF32 : MoveParamInst<f32, Float32Regs, ".f32">;

// Selected for PseudoUseParam; the asm string it prints is only a "//"
// comment naming $src.
class PseudoUseParamInst<NVPTXRegClass regclass, ValueType vt> :
  NVPTXInst<(outs),
            (ins regclass:$src),
            "// Pseudo use of $src",
            [(PseudoUseParam vt:$src)]>;

def PseudoUseParamI64 : PseudoUseParamInst<Int64Regs, i64>;
def PseudoUseParamI32 : PseudoUseParamInst<Int32Regs, i32>;
def PseudoUseParamI16 : PseudoUseParamInst<Int16Regs, i16>;
def PseudoUseParamF64 : PseudoUseParamInst<Float64Regs, f64>;
def PseudoUseParamF32 : PseudoUseParamInst<Float32Regs, f32>;

// mov.<SzStr> $dst, $src -- selected for ProxyReg on a value of type T.
class ProxyRegInst<string SzStr, ValueType T, NVPTXRegClass regclass> :
  NVPTXInst<(outs regclass:$dst),
            (ins regclass:$src),
            "mov." # SzStr # " \t$dst, $src;",
            [(set (T regclass:$dst), (ProxyReg (T regclass:$src)))]>;

def ProxyRegI1  : ProxyRegInst<"pred", i1,  Int1Regs>;
def ProxyRegI16 : ProxyRegInst<"b16",  i16, Int16Regs>;
def ProxyRegI32 : ProxyRegInst<"b32",  i32, Int32Regs>;
def ProxyRegI64 : ProxyRegInst<"b64",  i64, Int64Regs>;
def ProxyRegF32 : ProxyRegInst<"f32",  f32, Float32Regs>;
def ProxyRegF64 : ProxyRegInst<"f64",  f64, Float64Regs>;

// 16-bit float types reuse the i16 proxy instruction.
foreach vt = [f16, bf16] in
  def : Pat<(vt (ProxyReg vt:$src)), (ProxyRegI16 Int16Regs:$src)>;

// Packed 32-bit vector types reuse the i32 proxy instruction.
foreach vt = [v2f16, v2bf16, v2i16, v4i8] in
  def : Pat<(vt (ProxyReg vt:$src)), (ProxyRegI32 Int32Regs:$src)>;

//
// Load / Store Handling
//
// Each multiclass below produces one instruction per addressing form:
//   _avar     symbol                      [$addr]
//   _areg     32-bit register             [$addr]
//   _areg_64  64-bit register             [$addr]
//   _ari      32-bit register + immediate [$addr+$offset]
//   _ari_64   64-bit register + immediate [$addr+$offset]
//   _asi      symbol + immediate          [$addr+$offset]
// None of these instructions carries a selection pattern.
multiclass LD<NVPTXRegClass regclass> {
  def _avar : NVPTXInst<
    (outs regclass:$dst),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, imem:$addr),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t$dst, [$addr];",
    []>;
  def _areg : NVPTXInst<
    (outs regclass:$dst),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, Int32Regs:$addr),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t$dst, [$addr];",
    []>;
  def _areg_64 : NVPTXInst<
    (outs regclass:$dst),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, Int64Regs:$addr),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t$dst, [$addr];",
    []>;
  def _ari : NVPTXInst<
    (outs regclass:$dst),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t$dst, [$addr+$offset];",
    []>;
  def _ari_64 : NVPTXInst<
    (outs regclass:$dst),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t$dst, [$addr+$offset];",
    []>;
  def _asi : NVPTXInst<
    (outs regclass:$dst),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, imem:$addr, i32imm:$offset),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t$dst, [$addr+$offset];",
    []>;
}

let mayLoad=1, hasSideEffects=0 in {
  defm LD_i8  : LD<Int16Regs>;
  defm LD_i16 : LD<Int16Regs>;
  defm LD_i32 : LD<Int32Regs>;
  defm LD_i64 : LD<Int64Regs>;
  defm LD_f32 : LD<Float32Regs>;
  defm LD_f64 : LD<Float64Regs>;
}

// Scalar stores; same six addressing forms as LD.
multiclass ST<NVPTXRegClass regclass> {
  def _avar : NVPTXInst<
    (outs),
    (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
         LdStCode:$Sign, i32imm:$toWidth, imem:$addr),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
    " \t[$addr], $src;",
    []>;
  def _areg : NVPTXInst<
    (outs),
    (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
         LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
    " \t[$addr], $src;",
    []>;
  def _areg_64 : NVPTXInst<
    (outs),
    (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
         LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
    " \t[$addr], $src;",
    []>;
  def _ari : NVPTXInst<
    (outs),
    (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
         LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr, i32imm:$offset),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
    " \t[$addr+$offset], $src;",
    []>;
  def _ari_64 : NVPTXInst<
    (outs),
    (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
         LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr, i32imm:$offset),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
    " \t[$addr+$offset], $src;",
    []>;
  def _asi : NVPTXInst<
    (outs),
    (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
         LdStCode:$Sign, i32imm:$toWidth, imem:$addr, i32imm:$offset),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
    " \t[$addr+$offset], $src;",
    []>;
}

let mayStore=1, hasSideEffects=0 in {
  defm ST_i8  : ST<Int16Regs>;
  defm ST_i16 : ST<Int16Regs>;
  defm ST_i32 : ST<Int32Regs>;
  defm ST_i64 : ST<Int64Regs>;
  defm ST_f32 : ST<Float32Regs>;
  defm ST_f64 : ST<Float64Regs>;
}

// The following is used only in and after vector elementizations.  Vector
// elementization happens at the machine instruction level, so the following
// instructions never appear in the DAG.
//
// _v2_* forms define two destination registers, _v4_* forms define four.
multiclass LD_VEC<NVPTXRegClass regclass> {
  def _v2_avar : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, imem:$addr),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t{{$dst1, $dst2}}, [$addr];",
    []>;
  def _v2_areg : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, Int32Regs:$addr),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t{{$dst1, $dst2}}, [$addr];",
    []>;
  def _v2_areg_64 : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, Int64Regs:$addr),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t{{$dst1, $dst2}}, [$addr];",
    []>;
  def _v2_ari : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t{{$dst1, $dst2}}, [$addr+$offset];",
    []>;
  def _v2_ari_64 : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t{{$dst1, $dst2}}, [$addr+$offset];",
    []>;
  def _v2_asi : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, imem:$addr, i32imm:$offset),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t{{$dst1, $dst2}}, [$addr+$offset];",
    []>;
  def _v4_avar : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, imem:$addr),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];",
    []>;
  def _v4_areg : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, Int32Regs:$addr),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];",
    []>;
  def _v4_areg_64 : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, Int64Regs:$addr),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];",
    []>;
  def _v4_ari : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];",
    []>;
  def _v4_ari_64 : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];",
    []>;
  def _v4_asi : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, imem:$addr, i32imm:$offset),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];",
    []>;
}
let mayLoad=1, hasSideEffects=0 in {
  defm LDV_i8  : LD_VEC<Int16Regs>;
  defm LDV_i16 : LD_VEC<Int16Regs>;
  defm LDV_i32 : LD_VEC<Int32Regs>;
  defm LDV_i64 : LD_VEC<Int64Regs>;
  defm LDV_f32 : LD_VEC<Float32Regs>;
  defm LDV_f64 : LD_VEC<Float64Regs>;
}

// Vector stores.  Note that the width operand is named $fromWidth here, while
// the scalar ST multiclass calls it $toWidth.
multiclass ST_VEC<NVPTXRegClass regclass> {
  def _v2_avar : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2,
         LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, imem:$addr),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t[$addr], {{$src1, $src2}};",
    []>;
  def _v2_areg : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2,
         LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, Int32Regs:$addr),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t[$addr], {{$src1, $src2}};",
    []>;
  def _v2_areg_64 : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2,
         LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, Int64Regs:$addr),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t[$addr], {{$src1, $src2}};",
    []>;
  def _v2_ari : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2,
         LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t[$addr+$offset], {{$src1, $src2}};",
    []>;
  def _v2_ari_64 : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2,
         LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t[$addr+$offset], {{$src1, $src2}};",
    []>;
  def _v2_asi : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2,
         LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, imem:$addr, i32imm:$offset),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t[$addr+$offset], {{$src1, $src2}};",
    []>;
  def _v4_avar : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
         LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, imem:$addr),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t[$addr], {{$src1, $src2, $src3, $src4}};",
    []>;
  def _v4_areg : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
         LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, Int32Regs:$addr),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t[$addr], {{$src1, $src2, $src3, $src4}};",
    []>;
  def _v4_areg_64 : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
         LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, Int64Regs:$addr),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t[$addr], {{$src1, $src2, $src3, $src4}};",
    []>;
  def _v4_ari : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
         LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};",
    []>;
  def _v4_ari_64 : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
         LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};",
    []>;
  def _v4_asi : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
         LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth, imem:$addr, i32imm:$offset),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};",
    []>;
}

let mayStore=1, hasSideEffects=0 in {
  defm STV_i8  : ST_VEC<Int16Regs>;
  defm STV_i16 : ST_VEC<Int16Regs>;
  defm STV_i32 : ST_VEC<Int32Regs>;
  defm STV_i64 : ST_VEC<Int64Regs>;
  defm STV_f32 : ST_VEC<Float32Regs>;
  defm STV_f64 : ST_VEC<Float64Regs>;
}

//---- Conversion ----

// mov.b<SzStr> $d, $a -- selected for a bitconvert from TIn to TOut.  The
// register classes default to whatever ValueToRegClass yields for each type.
class F_BITCONVERT<string SzStr, ValueType TIn, ValueType TOut,
  NVPTXRegClass regclassIn = ValueToRegClass<TIn>.ret,
  NVPTXRegClass regclassOut = ValueToRegClass<TOut>.ret> :
           NVPTXInst<(outs regclassOut:$d), (ins regclassIn:$a),
           "mov.b" # SzStr # " \t$d, $a;",
     [(set (TOut regclassOut:$d), (bitconvert (TIn regclassIn:$a)))]>;

def BITCONVERT_32_I2F : F_BITCONVERT<"32", i32, f32>;
def BITCONVERT_32_F2I : F_BITCONVERT<"32", f32, i32>;
def BITCONVERT_64_I2F : F_BITCONVERT<"64", i64, f64>;
def BITCONVERT_64_F2I : F_BITCONVERT<"64", f64, i64>;

// Packed 32-bit vector types <-> f32 reuse the i32 <-> f32 moves.
foreach vt = [v2f16, v2bf16, v2i16, v4i8] in {
  def : Pat<(vt (bitconvert (f32 Float32Regs:$a))),
            (BITCONVERT_32_F2I Float32Regs:$a)>;
  def : Pat<(f32 (bitconvert (vt Int32Regs:$a))),
            (BITCONVERT_32_I2F Int32Regs:$a)>;
}

// 16-bit float types <-> i16: a constant becomes an immediate move, a
// register is passed through ProxyRegI16.
foreach vt = [f16, bf16] in {
  def : Pat<(vt (bitconvert (i16 UInt16Const:$a))),
            (IMOVB16ri UInt16Const:$a)>;
  def : Pat<(vt (bitconvert (i16 Int16Regs:$a))),
            (ProxyRegI16 Int16Regs:$a)>;
  def : Pat<(i16 (bitconvert (vt Int16Regs:$a))),
            (ProxyRegI16 Int16Regs:$a)>;
}

// Bitconverts among the 32-bit types: a constant becomes an immediate move,
// and every ordered pair of distinct types goes through ProxyRegI32.
foreach ta = [v2f16, v2bf16, v2i16, v4i8, i32] in {
  def : Pat<(ta (bitconvert (i32 UInt32Const:$a))),
            (IMOVB32ri UInt32Const:$a)>;
  foreach tb = [v2f16, v2bf16, v2i16, v4i8, i32] in {
    if !ne(ta, tb) then {
      def : Pat<(ta (bitconvert (tb Int32Regs:$a))),
                (ProxyRegI32 Int32Regs:$a)>;
    }
  }
}

// NOTE: pred->fp are currently sub-optimal due to an issue in TableGen where
// we cannot specify floating-point literals in isel patterns.  Therefore, we
// use an integer selp to select either 1 or 0 and then cvt to floating-point.

// Integer -> floating-point conversions.
//
// Isel patterns cannot carry floating-point literals, so an i1 source is first
// widened to a 32-bit integer with selp and the result is then converted with
// cvt. All conversions use round-to-nearest (CvtRN).
foreach fp = ["f16", "bf16", "f32", "f64"] in {
  defvar FpVT = !cast<ValueType>(fp);

  // sint -> fp
  // sint_to_fp reads an i1 'true' as the signed value -1, so the selp has to
  // materialise -1 (not 1); this mirrors the 'sext i1' patterns below.
  def : Pat<(FpVT (sint_to_fp Int1Regs:$a)),
            (!cast<Instruction>("CVT_" # fp # "_s32")
               (SELP_s32ii -1, 0, Int1Regs:$a), CvtRN)>;
  def : Pat<(FpVT (sint_to_fp Int16Regs:$a)),
            (!cast<Instruction>("CVT_" # fp # "_s16") Int16Regs:$a, CvtRN)>;
  def : Pat<(FpVT (sint_to_fp Int32Regs:$a)),
            (!cast<Instruction>("CVT_" # fp # "_s32") Int32Regs:$a, CvtRN)>;
  def : Pat<(FpVT (sint_to_fp Int64Regs:$a)),
            (!cast<Instruction>("CVT_" # fp # "_s64") Int64Regs:$a, CvtRN)>;

  // uint -> fp
  // An unsigned i1 'true' is 1, so selecting between 1 and 0 is correct here.
  def : Pat<(FpVT (uint_to_fp Int1Regs:$a)),
            (!cast<Instruction>("CVT_" # fp # "_u32")
               (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
  def : Pat<(FpVT (uint_to_fp Int16Regs:$a)),
            (!cast<Instruction>("CVT_" # fp # "_u16") Int16Regs:$a, CvtRN)>;
  def : Pat<(FpVT (uint_to_fp Int32Regs:$a)),
            (!cast<Instruction>("CVT_" # fp # "_u32") Int32Regs:$a, CvtRN)>;
  def : Pat<(FpVT (uint_to_fp Int64Regs:$a)),
            (!cast<Instruction>("CVT_" # fp # "_u64") Int64Regs:$a, CvtRN)>;
}


// Floating-point -> integer conversions. Every cvt form rounds toward zero
// (CvtRZI, or CvtRZI_FTZ when f32 flush-to-zero is requested).
//
// NOTE(review): the i1 results below are produced by comparing the raw bits of
// the source against 0 with CmpEQ. Confirm that this is the intended lowering
// of fp_to_sint/fp_to_uint to i1.

// f16 -> sint
def : Pat<(i1 (fp_to_sint (f16 Int16Regs:$a))),
          (SETP_b16ri Int16Regs:$a, 0, CmpEQ)>;
def : Pat<(i16 (fp_to_sint (f16 Int16Regs:$a))),
          (CVT_s16_f16 (f16 Int16Regs:$a), CvtRZI)>;
def : Pat<(i32 (fp_to_sint (f16 Int16Regs:$a))),
          (CVT_s32_f16 (f16 Int16Regs:$a), CvtRZI)>;
def : Pat<(i64 (fp_to_sint (f16 Int16Regs:$a))),
          (CVT_s64_f16 Int16Regs:$a, CvtRZI)>;

// f16 -> uint
def : Pat<(i1 (fp_to_uint (f16 Int16Regs:$a))),
          (SETP_b16ri Int16Regs:$a, 0, CmpEQ)>;
def : Pat<(i16 (fp_to_uint (f16 Int16Regs:$a))),
          (CVT_u16_f16 Int16Regs:$a, CvtRZI)>;
def : Pat<(i32 (fp_to_uint (f16 Int16Regs:$a))),
          (CVT_u32_f16 Int16Regs:$a, CvtRZI)>;
def : Pat<(i64 (fp_to_uint (f16 Int16Regs:$a))),
          (CVT_u64_f16 Int16Regs:$a, CvtRZI)>;

// bf16 -> sint
def : Pat<(i1 (fp_to_sint (bf16 Int16Regs:$a))),
          (SETP_b16ri Int16Regs:$a, 0, CmpEQ)>;
def : Pat<(i16 (fp_to_sint (bf16 Int16Regs:$a))),
          (CVT_s16_bf16 (bf16 Int16Regs:$a), CvtRZI)>;
def : Pat<(i32 (fp_to_sint (bf16 Int16Regs:$a))),
          (CVT_s32_bf16 (bf16 Int16Regs:$a), CvtRZI)>;
def : Pat<(i64 (fp_to_sint (bf16 Int16Regs:$a))),
          (CVT_s64_bf16 Int16Regs:$a, CvtRZI)>;

// bf16 -> uint
def : Pat<(i1 (fp_to_uint (bf16 Int16Regs:$a))),
          (SETP_b16ri Int16Regs:$a, 0, CmpEQ)>;
def : Pat<(i16 (fp_to_uint (bf16 Int16Regs:$a))),
          (CVT_u16_bf16 Int16Regs:$a, CvtRZI)>;
def : Pat<(i32 (fp_to_uint (bf16 Int16Regs:$a))),
          (CVT_u32_bf16 Int16Regs:$a, CvtRZI)>;
def : Pat<(i64 (fp_to_uint (bf16 Int16Regs:$a))),
          (CVT_u64_bf16 Int16Regs:$a, CvtRZI)>;

// f32 -> sint
// Each width has two patterns: the FTZ form, guarded by doF32FTZ, is listed
// ahead of the unguarded non-FTZ form.
def : Pat<(i1 (fp_to_sint Float32Regs:$a)),
          (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>;
def : Pat<(i16 (fp_to_sint Float32Regs:$a)),
          (CVT_s16_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(i16 (fp_to_sint Float32Regs:$a)),
          (CVT_s16_f32 Float32Regs:$a, CvtRZI)>;
def : Pat<(i32 (fp_to_sint Float32Regs:$a)),
          (CVT_s32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(i32 (fp_to_sint Float32Regs:$a)),
          (CVT_s32_f32 Float32Regs:$a, CvtRZI)>;
def : Pat<(i64 (fp_to_sint Float32Regs:$a)),
          (CVT_s64_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(i64 (fp_to_sint Float32Regs:$a)),
          (CVT_s64_f32 Float32Regs:$a, CvtRZI)>;

// f32 -> uint
def : Pat<(i1 (fp_to_uint Float32Regs:$a)),
          (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>;
def : Pat<(i16 (fp_to_uint Float32Regs:$a)),
          (CVT_u16_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(i16 (fp_to_uint Float32Regs:$a)),
          (CVT_u16_f32 Float32Regs:$a, CvtRZI)>;
def : Pat<(i32 (fp_to_uint Float32Regs:$a)),
          (CVT_u32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(i32 (fp_to_uint Float32Regs:$a)),
          (CVT_u32_f32 Float32Regs:$a, CvtRZI)>;
def : Pat<(i64 (fp_to_uint Float32Regs:$a)),
          (CVT_u64_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(i64 (fp_to_uint Float32Regs:$a)),
          (CVT_u64_f32 Float32Regs:$a, CvtRZI)>;

// f64 -> sint
def : Pat<(i1 (fp_to_sint Float64Regs:$a)),
          (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>;
def : Pat<(i16 (fp_to_sint Float64Regs:$a)),
          (CVT_s16_f64 Float64Regs:$a, CvtRZI)>;
def : Pat<(i32 (fp_to_sint Float64Regs:$a)),
          (CVT_s32_f64 Float64Regs:$a, CvtRZI)>;
def : Pat<(i64 (fp_to_sint Float64Regs:$a)),
          (CVT_s64_f64 Float64Regs:$a, CvtRZI)>;

// f64 -> uint
def : Pat<(i1 (fp_to_uint Float64Regs:$a)),
          (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>;
def : Pat<(i16 (fp_to_uint Float64Regs:$a)),
          (CVT_u16_f64 Float64Regs:$a, CvtRZI)>;
def : Pat<(i32 (fp_to_uint Float64Regs:$a)),
          (CVT_u32_f64 Float64Regs:$a, CvtRZI)>;
def : Pat<(i64 (fp_to_uint Float64Regs:$a)),
          (CVT_u64_f64 Float64Regs:$a, CvtRZI)>;

// Extending an i1 predicate is done with selp on two immediates.

// sext i1: true -> -1 (all ones), false -> 0
def : Pat<(i16 (sext Int1Regs:$a)),
          (SELP_s16ii -1, 0, Int1Regs:$a)>;
def : Pat<(i32 (sext Int1Regs:$a)),
          (SELP_s32ii -1, 0, Int1Regs:$a)>;
def : Pat<(i64 (sext Int1Regs:$a)),
          (SELP_s64ii -1, 0, Int1Regs:$a)>;

// zext i1: true -> 1, false -> 0
def : Pat<(i16 (zext Int1Regs:$a)),
          (SELP_u16ii 1, 0, Int1Regs:$a)>;
def : Pat<(i32 (zext Int1Regs:$a)),
          (SELP_u32ii 1, 0, Int1Regs:$a)>;
def : Pat<(i64 (zext Int1Regs:$a)),
          (SELP_u64ii 1, 0, Int1Regs:$a)>;

// anyext i1: selects -1 for true and 0 for false
def : Pat<(i16 (anyext Int1Regs:$a)),
          (SELP_u16ii -1, 0, Int1Regs:$a)>;
def : Pat<(i32 (anyext Int1Regs:$a)),
          (SELP_u32ii -1, 0, Int1Regs:$a)>;
def : Pat<(i64 (anyext Int1Regs:$a)),
          (SELP_u64ii -1, 0, Int1Regs:$a)>;

// sext i16
def : Pat<(i32 (sext Int16Regs:$a)), (CVT_s32_s16 Int16Regs:$a, CvtNONE)>;
def : Pat<(i64 (sext Int16Regs:$a)), (CVT_s64_s16 Int16Regs:$a, CvtNONE)>;

// zext i16 and anyext i16 both lower to the unsigned widening cvt.
foreach ext = [zext, anyext] in {
  def : Pat<(i32 (ext Int16Regs:$a)), (CVT_u32_u16 Int16Regs:$a, CvtNONE)>;
  def : Pat<(i64 (ext Int16Regs:$a)), (CVT_u64_u16 Int16Regs:$a, CvtNONE)>;
}

// sext i32
def : Pat<(i64 (sext Int32Regs:$a)), (CVT_s64_s32 Int32Regs:$a, CvtNONE)>;

// zext i32 and anyext i32 both lower to the unsigned widening cvt.
foreach ext = [zext, anyext] in
  def : Pat<(i64 (ext Int32Regs:$a)), (CVT_u64_u32 Int32Regs:$a, CvtNONE)>;


// Truncation to i32/i16 is a narrowing cvt. Truncation to i1 keeps only bit 0:
// the source is masked with 1 and the result compared for equality with 1.

// truncate i64
def : Pat<(i32 (trunc Int64Regs:$a)), (CVT_u32_u64 Int64Regs:$a, CvtNONE)>;
def : Pat<(i16 (trunc Int64Regs:$a)), (CVT_u16_u64 Int64Regs:$a, CvtNONE)>;
def : Pat<(i1 (trunc Int64Regs:$a)),
          (SETP_b64ri (ANDb64ri Int64Regs:$a, 1), 1, CmpEQ)>;

// truncate i32
def : Pat<(i16 (trunc Int32Regs:$a)), (CVT_u16_u32 Int32Regs:$a, CvtNONE)>;
def : Pat<(i1 (trunc Int32Regs:$a)),
          (SETP_b32ri (ANDb32ri Int32Regs:$a, 1), 1, CmpEQ)>;

// truncate i16
def : Pat<(i1 (trunc Int16Regs:$a)),
          (SETP_b16ri (ANDb16ri Int16Regs:$a, 1), 1, CmpEQ)>;

// sext_inreg: one CVT_INREG instruction per (register width, source width).
def : Pat<(sext_inreg Int16Regs:$a, i8),  (CVT_INREG_s16_s8 Int16Regs:$a)>;
def : Pat<(sext_inreg Int32Regs:$a, i8),  (CVT_INREG_s32_s8 Int32Regs:$a)>;
def : Pat<(sext_inreg Int32Regs:$a, i16), (CVT_INREG_s32_s16 Int32Regs:$a)>;
def : Pat<(sext_inreg Int64Regs:$a, i8),  (CVT_INREG_s64_s8 Int64Regs:$a)>;
def : Pat<(sext_inreg Int64Regs:$a, i16), (CVT_INREG_s64_s16 Int64Regs:$a)>;
def : Pat<(sext_inreg Int64Regs:$a, i32), (CVT_INREG_s64_s32 Int64Regs:$a)>;


// Select instructions with 32-bit predicates.
// The i32 condition is reduced to a real predicate by testing its low bit
// (and with 1, then setp equal to 1) before feeding selp.
def : Pat<(select (i32 Int32Regs:$pred), i16:$a, i16:$b),
          (SELP_b16rr
             Int16Regs:$a, Int16Regs:$b,
             (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
def : Pat<(select (i32 Int32Regs:$pred), i32:$a, i32:$b),
          (SELP_b32rr
             Int32Regs:$a, Int32Regs:$b,
             (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
def : Pat<(select (i32 Int32Regs:$pred), Int64Regs:$a, Int64Regs:$b),
          (SELP_b64rr
             Int64Regs:$a, Int64Regs:$b,
             (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
def : Pat<(select (i32 Int32Regs:$pred),
                  (f16 Int16Regs:$a), (f16 Int16Regs:$b)),
          (SELP_f16rr
             Int16Regs:$a, Int16Regs:$b,
             (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
def : Pat<(select (i32 Int32Regs:$pred),
                  (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)),
          (SELP_bf16rr
             Int16Regs:$a, Int16Regs:$b,
             (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
def : Pat<(select (i32 Int32Regs:$pred), Float32Regs:$a, Float32Regs:$b),
          (SELP_f32rr
             Float32Regs:$a, Float32Regs:$b,
             (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
def : Pat<(select (i32 Int32Regs:$pred), Float64Regs:$a, Float64Regs:$b),
          (SELP_f64rr
             Float64Regs:$a, Float64Regs:$b,
             (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;


// Vector-style mov instructions. None of them carries a selection pattern;
// they are only referenced explicitly from other patterns or from C++ code.
let hasSideEffects = false in {
  // Pack a set of smaller registers into one larger register.
  def V4I16toI64 :
    NVPTXInst<(outs Int64Regs:$d),
              (ins Int16Regs:$s1, Int16Regs:$s2, Int16Regs:$s3, Int16Regs:$s4),
              "mov.b64 \t$d, {{$s1, $s2, $s3, $s4}};", []>;
  def V2I16toI32 :
    NVPTXInst<(outs Int32Regs:$d),
              (ins Int16Regs:$s1, Int16Regs:$s2),
              "mov.b32 \t$d, {{$s1, $s2}};", []>;
  def V2I32toI64 :
    NVPTXInst<(outs Int64Regs:$d),
              (ins Int32Regs:$s1, Int32Regs:$s2),
              "mov.b64 \t$d, {{$s1, $s2}};", []>;
  def V2F32toF64 :
    NVPTXInst<(outs Float64Regs:$d),
              (ins Float32Regs:$s1, Float32Regs:$s2),
              "mov.b64 \t$d, {{$s1, $s2}};", []>;

  // Unpack one larger register into a set of smaller registers.
  def I64toV4I16 :
    NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2, Int16Regs:$d3, Int16Regs:$d4),
              (ins Int64Regs:$s),
              "mov.b64 \t{{$d1, $d2, $d3, $d4}}, $s;", []>;
  def I32toV2I16 :
    NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2),
              (ins Int32Regs:$s),
              "mov.b32 \t{{$d1, $d2}}, $s;", []>;
  def I64toV2I32 :
    NVPTXInst<(outs Int32Regs:$d1, Int32Regs:$d2),
              (ins Int64Regs:$s),
              "mov.b64 \t{{$d1, $d2}}, $s;", []>;
  def F64toV2F32 :
    NVPTXInst<(outs Float32Regs:$d1, Float32Regs:$d2),
              (ins Float64Regs:$s),
              "mov.b64 \t{{$d1, $d2}}, $s;", []>;

  // Extract a single half of a register. The unwanted half is written to a
  // scratch register declared inside the emitted asm scope.
  def I32toI16H :
    NVPTXInst<(outs Int16Regs:$high), (ins Int32Regs:$s),
              "{{ .reg .b16 tmp; mov.b32 {tmp, $high}, $s; }}", []>;
  def I32toI16L :
    NVPTXInst<(outs Int16Regs:$low), (ins Int32Regs:$s),
              "{{ .reg .b16 tmp; mov.b32 {$low, tmp}, $s; }}", []>;
  def I64toI32H :
    NVPTXInst<(outs Int32Regs:$high), (ins Int64Regs:$s),
              "{{ .reg .b32 tmp; mov.b64 {tmp, $high}, $s; }}", []>;
}

// Using partial vectorized move produces better SASS code for extraction of
// upper/lower parts of an integer.
// Truncating after a right shift by half the width (logical or arithmetic)
// yields the upper half, so both shift kinds map to the same instruction.
foreach shift = [srl, sra] in {
  def : Pat<(i16 (trunc (shift Int32Regs:$s, (i32 16)))),
            (I32toI16H Int32Regs:$s)>;
  def : Pat<(i32 (trunc (shift Int64Regs:$s, (i32 32)))),
            (I64toI32H Int64Regs:$s)>;
}

// Sign-extending element 0 of a v2i16 is a sign-extension of the low 16 bits
// of the containing 32-bit register.
def : Pat<(i32 (sext (extractelt (v2i16 Int32Regs:$src), 0))),
          (CVT_INREG_s32_s16 Int32Regs:$src)>;

// Two-element 16-bit vectors live in a 32-bit register: element 0 is the low
// half and element 1 is the high half.
foreach vt = [v2f16, v2bf16, v2i16] in {
  def : Pat<(extractelt (vt Int32Regs:$src), 0), (I32toI16L Int32Regs:$src)>;
  def : Pat<(extractelt (vt Int32Regs:$src), 1), (I32toI16H Int32Regs:$src)>;
}

// Building such a vector packs the two 16-bit registers with a vector mov.
def : Pat<(v2f16 (build_vector (f16 Int16Regs:$a), (f16 Int16Regs:$b))),
          (V2I16toI32 Int16Regs:$a, Int16Regs:$b)>;
def : Pat<(v2bf16 (build_vector (bf16 Int16Regs:$a), (bf16 Int16Regs:$b))),
          (V2I16toI32 Int16Regs:$a, Int16Regs:$b)>;
def : Pat<(v2i16 (build_vector (i16 Int16Regs:$a), (i16 Int16Regs:$b))),
          (V2I16toI32 Int16Regs:$a, Int16Regs:$b)>;

def : Pat<(v2i16 (scalar_to_vector (i16 Int16Regs:$a))),
          (CVT_u32_u16 Int16Regs:$a, CvtNONE)>;

// Count leading zeros
// Both forms write a 32-bit destination, including the 64-bit-source form.
let hasSideEffects = false in {
  def CLZr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
                         "clz.b32 \t$d, $a;", []>;
  def CLZr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
                         "clz.b64 \t$d, $a;", []>;
}

// 32-bit has a direct PTX instruction
def : Pat<(i32 (ctlz (i32 Int32Regs:$a))), (CLZr32 Int32Regs:$a)>;

// The return type of the ctlz ISD node is the same as its input, but the PTX
// clz instruction always returns a 32-bit value. For ctlz.i64, convert the
// PTX value to 64 bits to match the ISD node's semantics, unless we know we're
// truncating back down to 32 bits.
def : Pat<(i64 (ctlz Int64Regs:$a)),
          (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>;
def : Pat<(i32 (trunc (ctlz Int64Regs:$a))),
          (CLZr64 Int64Regs:$a)>;

// For 16-bit ctlz, we zero-extend to 32-bit, perform the count, then trunc the
// result back to 16-bits if necessary. We also need to subtract 16 because
// the high-order 16 zeros were counted.
//
// TODO: NVPTX has a mov.b32 b32reg, {imm, b16reg} instruction, which we could
// use to save one SASS instruction (on sm_35 anyway):
//
//   mov.b32 $tmp, {0xffff, $a}
//   clz.b32 $result, $tmp
//
// That is, instead of zero-extending the input to 32 bits, we'd "one-extend"
// and then count the leading zeros of that value. This way we don't have to
// subtract 16 from the result. Unfortunately today we don't have a way to
// generate "mov b32reg, {b16imm, b16reg}", so we don't do this optimization.
def : Pat<(i16 (ctlz Int16Regs:$a)),
          (SUBi16ri
             (CVT_u16_u32
                (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE),
             16)>;
def : Pat<(i32 (zext (i16 (ctlz Int16Regs:$a)))),
          (SUBi32ri (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), 16)>;

// Population count
// As with clz, the destination is 32 bits wide for both source widths.
let hasSideEffects = false in {
  def POPCr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
                          "popc.b32 \t$d, $a;", []>;
  def POPCr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
                          "popc.b64 \t$d, $a;", []>;
}

// 32-bit has a direct PTX instruction
def : Pat<(i32 (ctpop (i32 Int32Regs:$a))), (POPCr32 Int32Regs:$a)>;

// For 64-bit, the result in PTX is actually 32-bit so we zero-extend to 64-bit
// to match the LLVM semantics. Just as with ctlz.i64, we provide a second
// pattern that avoids the type conversion if we're truncating the result to
// i32 anyway.
def : Pat<(ctpop Int64Regs:$a),
          (CVT_u64_u32 (POPCr64 Int64Regs:$a), CvtNONE)>;
def : Pat<(i32 (trunc (ctpop Int64Regs:$a))),
          (POPCr64 Int64Regs:$a)>;

// For 16-bit, we zero-extend to 32-bit, then trunc the result back to 16-bits.
// If we know that we're storing into an i32, we can avoid the final trunc.
def : Pat<(ctpop Int16Regs:$a),
          (CVT_u16_u32
             (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE)>;
def : Pat<(i32 (zext (i16 (ctpop Int16Regs:$a)))),
          (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE))>;

// Floating-point narrowing. Every fpround uses round-to-nearest (CvtRN); the
// f32-producing form additionally has an FTZ variant guarded by doF32FTZ,
// listed ahead of the unguarded one.

// fpround f32 -> f16
def : Pat<(f16 (fpround Float32Regs:$a)), (CVT_f16_f32 Float32Regs:$a, CvtRN)>;

// fpround f32 -> bf16
def : Pat<(bf16 (fpround Float32Regs:$a)), (CVT_bf16_f32 Float32Regs:$a, CvtRN)>;

// fpround f64 -> f16
def : Pat<(f16 (fpround Float64Regs:$a)), (CVT_f16_f64 Float64Regs:$a, CvtRN)>;

// fpround f64 -> bf16
def : Pat<(bf16 (fpround Float64Regs:$a)), (CVT_bf16_f64 Float64Regs:$a, CvtRN)>;

// fpround f64 -> f32
def : Pat<(f32 (fpround Float64Regs:$a)),
          (CVT_f32_f64 Float64Regs:$a, CvtRN_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(f32 (fpround Float64Regs:$a)),
          (CVT_f32_f64 Float64Regs:$a, CvtRN)>;

// Floating-point widening. No rounding mode is needed (CvtNONE); conversions
// that produce or consume f32 have an FTZ variant guarded by doF32FTZ.

// fpextend f16 -> f32
def : Pat<(f32 (fpextend (f16 Int16Regs:$a))),
          (CVT_f32_f16 Int16Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(f32 (fpextend (f16 Int16Regs:$a))),
          (CVT_f32_f16 Int16Regs:$a, CvtNONE)>;

// fpextend bf16 -> f32
def : Pat<(f32 (fpextend (bf16 Int16Regs:$a))),
          (CVT_f32_bf16 Int16Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(f32 (fpextend (bf16 Int16Regs:$a))),
          (CVT_f32_bf16 Int16Regs:$a, CvtNONE)>;

// fpextend f16 -> f64
def : Pat<(f64 (fpextend (f16 Int16Regs:$a))),
          (CVT_f64_f16 Int16Regs:$a, CvtNONE)>;

// fpextend bf16 -> f64
def : Pat<(f64 (fpextend (bf16 Int16Regs:$a))),
          (CVT_f64_bf16 Int16Regs:$a, CvtNONE)>;

// fpextend f32 -> f64
def : Pat<(f64 (fpextend Float32Regs:$a)),
          (CVT_f64_f32 Float32Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(f64 (fpextend Float32Regs:$a)),
          (CVT_f64_f32 Float32Regs:$a, CvtNONE)>;

def retglue :
  SDNode<"NVPTXISD::RET_GLUE", SDTNone, [SDNPHasChain, SDNPOptInGlue]>;

// fceil, ffloor, froundeven, ftrunc.

// Lowers a round-to-integral node to a same-type cvt with an integer rounding
// mode.
//   OpNode  - the rounding SDNode to match.
//   Mode    - cvt mode used for f16, bf16, f64 and non-FTZ f32.
//   ModeFTZ - cvt mode used for f32 when doF32FTZ holds.
multiclass CVT_ROUND<SDNode OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {
  def : Pat<(OpNode (f16 Int16Regs:$a)), (CVT_f16_f16 Int16Regs:$a, Mode)>;
  def : Pat<(OpNode (bf16 Int16Regs:$a)), (CVT_bf16_bf16 Int16Regs:$a, Mode)>;

  // The two f32 patterns are mutually exclusive via their predicates.
  def : Pat<(OpNode Float32Regs:$a), (CVT_f32_f32 Float32Regs:$a, ModeFTZ)>,
        Requires<[doF32FTZ]>;
  def : Pat<(OpNode Float32Regs:$a), (CVT_f32_f32 Float32Regs:$a, Mode)>,
        Requires<[doNoF32FTZ]>;

  def : Pat<(OpNode Float64Regs:$a), (CVT_f64_f64 Float64Regs:$a, Mode)>;
}

defm : CVT_ROUND<fceil,      CvtRPI, CvtRPI_FTZ>;
defm : CVT_ROUND<ffloor,     CvtRMI, CvtRMI_FTZ>;
defm : CVT_ROUND<froundeven, CvtRNI, CvtRNI_FTZ>;
defm : CVT_ROUND<ftrunc,     CvtRZI, CvtRZI_FTZ>;

// nearbyint and rint are implemented as rounding to nearest even. This isn't
// strictly correct, because it causes us to ignore the rounding mode. But it
// matches what CUDA's "libm" does.

defm : CVT_ROUND<fnearbyint, CvtRNI, CvtRNI_FTZ>;
defm : CVT_ROUND<frint,      CvtRNI, CvtRNI_FTZ>;

//-----------------------------------
// Control-flow
//-----------------------------------

let isTerminator = 1 in {
  // Function return.
  let isReturn = 1, isBarrier = 1 in
  def Return : NVPTXInst<(outs), (ins), "ret;", [(retglue)]>;

  let isBranch = 1 in {
    // Branch taken when the predicate is true.
    def CBranch :
      NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
                "@$a bra \t$target;",
                [(brcond Int1Regs:$a, bb:$target)]>;

    // Branch taken when the predicate is false ('@!'). It has no pattern of
    // its own and is selected through an explicit Pat.
    def CBranchOther :
      NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
                "@!$a bra \t$target;", []>;

    // Unconditional branch.
    let isBarrier = 1 in
    def GOTO :
      NVPTXInst<(outs), (ins brtarget:$target),
                "bra.uni \t$target;", [(br bb:$target)]>;
  }
}

// A 32-bit branch condition is turned into a predicate by comparing it
// against zero.
def : Pat<(brcond (i32 Int32Regs:$a), bb:$target),
          (CBranch (SETP_u32ri Int32Regs:$a, 0, CmpNE), bb:$target)>;

// SelectionDAGBuilder::visitSwitchCase() will invert the condition of a
// conditional branch if the target block is the next block so that the code
// can fall through to the target block. The inversion is done by 'xor
// condition, 1', which will be translated to (setne condition, -1). Since ptx
// supports '@!pred bra target', we should use it.
def : Pat<(brcond (i1 (setne Int1Regs:$a, -1)), bb:$target),
          (CBranchOther Int1Regs:$a, bb:$target)>;

// Call

// Type profiles for the call-sequence markers: both operands are i32.
def SDT_NVPTXCallSeqStart :
  SDCallSeqStart<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
def SDT_NVPTXCallSeqEnd :
  SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;

def callseq_start :
  SDNode<"ISD::CALLSEQ_START", SDT_NVPTXCallSeqStart,
         [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
def callseq_end :
  SDNode<"ISD::CALLSEQ_END", SDT_NVPTXCallSeqEnd,
         [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPSideEffect]>;

// The call node itself: no results, one i32 operand.
def SDT_NVPTXCall : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
def call :
  SDNode<"NVPTXISD::CALL", SDT_NVPTXCall,
         [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;

def calltarget : Operand<i32>;

let isCall = 1 in
def CALL : NVPTXInst<(outs), (ins calltarget:$dst), "call \t$dst, (1);", []>;

// Calls to global addresses and to external symbols select the same
// instruction.
foreach sym = [tglobaladdr, texternalsym] in
  def : Pat<(call sym:$dst), (CALL sym:$dst)>;

// Pseudo instructions.
class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
  : NVPTXInst<outs, ins, asmstr, pattern>;

// Opens the brace scope of a call sequence and declares temp_param_reg in it.
def Callseq_Start :
  NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2),
            "\\{ // callseq $amt1, $amt2\n"
            "\t.reg .b32 temp_param_reg;",
            [(callseq_start timm:$amt1, timm:$amt2)]>;

// Closes the brace scope opened by Callseq_Start.
def Callseq_End :
  NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2),
            "\\} // callseq $amt1",
            [(callseq_end timm:$amt1, timm:$amt2)]>;

// trap instruction
// Emit an `exit` as well to convey to ptxas that `trap` exits the CFG.
// This won't be necessary in a future version of ptxas.
def trapinst : NVPTXInst<(outs), (ins), "trap; exit;", [(trap)]>;

// Call prototype wrapper
// The node takes a single integer operand and no results.
def SDTCallPrototype : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
def CallPrototype :
  SDNode<"NVPTXISD::CallPrototype", SDTCallPrototype,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;

// Operand printed through the custom printProtoIdent method.
def ProtoIdent : Operand<i32> {
  let PrintMethod = "printProtoIdent";
}

// The emitted text is just the printed prototype identifier.
def CALL_PROTOTYPE :
  NVPTXInst<(outs), (ins ProtoIdent:$ident),
            "$ident",
            [(CallPrototype (i32 texternalsym:$ident))]>;


include "NVPTXIntrinsics.td"


//-----------------------------------
// Notes
//-----------------------------------
// BSWAP is currently expanded. The following would be more efficient:
// - for < sm_20, use vector scalar mov, as Tesla supports native 16-bit
//   registers
// - for sm_20, use prmt (use vector scalar mov to get the pack and
//   unpack). sm_20 supports native 32-bit registers, but not native 16-bit
//   registers.