1//===- NVPTXInstrInfo.td - NVPTX Instruction defs -------------*- tblgen-*-===// 2// 3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4// See https://llvm.org/LICENSE.txt for license information. 5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6// 7//===----------------------------------------------------------------------===// 8// 9// This file describes the PTX instructions in TableGen format. 10// 11//===----------------------------------------------------------------------===// 12 13include "NVPTXInstrFormats.td" 14 15let OperandType = "OPERAND_IMMEDIATE" in { 16 def f16imm : Operand<f16>; 17 def bf16imm : Operand<bf16>; 18 19} 20 21// List of vector specific properties 22def isVecLD : VecInstTypeEnum<1>; 23def isVecST : VecInstTypeEnum<2>; 24def isVecBuild : VecInstTypeEnum<3>; 25def isVecShuffle : VecInstTypeEnum<4>; 26def isVecExtract : VecInstTypeEnum<5>; 27def isVecInsert : VecInstTypeEnum<6>; 28def isVecDest : VecInstTypeEnum<7>; 29def isVecOther : VecInstTypeEnum<15>; 30 31//===----------------------------------------------------------------------===// 32// NVPTX Operand Definitions. 33//===----------------------------------------------------------------------===// 34 35def brtarget : Operand<OtherVT>; 36 37// CVT conversion modes 38// These must match the enum in NVPTX.h 39def CvtNONE : PatLeaf<(i32 0x0)>; 40def CvtRNI : PatLeaf<(i32 0x1)>; 41def CvtRZI : PatLeaf<(i32 0x2)>; 42def CvtRMI : PatLeaf<(i32 0x3)>; 43def CvtRPI : PatLeaf<(i32 0x4)>; 44def CvtRN : PatLeaf<(i32 0x5)>; 45def CvtRZ : PatLeaf<(i32 0x6)>; 46def CvtRM : PatLeaf<(i32 0x7)>; 47def CvtRP : PatLeaf<(i32 0x8)>; 48def CvtRNA : PatLeaf<(i32 0x9)>; 49 50def CvtNONE_FTZ : PatLeaf<(i32 0x10)>; 51def CvtRNI_FTZ : PatLeaf<(i32 0x11)>; 52def CvtRZI_FTZ : PatLeaf<(i32 0x12)>; 53def CvtRMI_FTZ : PatLeaf<(i32 0x13)>; 54def CvtRPI_FTZ : PatLeaf<(i32 0x14)>; 55def CvtRN_FTZ : PatLeaf<(i32 0x15)>; 56def CvtRZ_FTZ : PatLeaf<(i32 0x16)>; 57def CvtRM_FTZ : PatLeaf<(i32 0x17)>; 58def CvtRP_FTZ : PatLeaf<(i32 0x18)>; 59 60def CvtSAT : PatLeaf<(i32 0x20)>; 61def CvtSAT_FTZ : PatLeaf<(i32 0x30)>; 62 63def CvtNONE_RELU : PatLeaf<(i32 0x40)>; 64def CvtRN_RELU : PatLeaf<(i32 0x45)>; 65def CvtRZ_RELU : PatLeaf<(i32 0x46)>; 66 67def CvtMode : Operand<i32> { 68 let PrintMethod = "printCvtMode"; 69} 70 71// Compare modes 72// These must match the enum in NVPTX.h 73def CmpEQ : PatLeaf<(i32 0)>; 74def CmpNE : PatLeaf<(i32 1)>; 75def CmpLT : PatLeaf<(i32 2)>; 76def CmpLE : PatLeaf<(i32 3)>; 77def CmpGT : PatLeaf<(i32 4)>; 78def CmpGE : PatLeaf<(i32 5)>; 79def CmpLO : PatLeaf<(i32 6)>; 80def CmpLS : PatLeaf<(i32 7)>; 81def CmpHI : PatLeaf<(i32 8)>; 82def CmpHS : PatLeaf<(i32 9)>; 83def CmpEQU : PatLeaf<(i32 10)>; 84def CmpNEU : PatLeaf<(i32 11)>; 85def CmpLTU : PatLeaf<(i32 12)>; 86def CmpLEU : PatLeaf<(i32 13)>; 87def CmpGTU : PatLeaf<(i32 14)>; 88def CmpGEU : PatLeaf<(i32 15)>; 89def CmpNUM : PatLeaf<(i32 16)>; 90def CmpNAN : PatLeaf<(i32 17)>; 91 92def CmpEQ_FTZ : PatLeaf<(i32 0x100)>; 93def CmpNE_FTZ : PatLeaf<(i32 0x101)>; 94def CmpLT_FTZ : PatLeaf<(i32 0x102)>; 95def CmpLE_FTZ : PatLeaf<(i32 0x103)>; 96def CmpGT_FTZ : PatLeaf<(i32 0x104)>; 97def CmpGE_FTZ : PatLeaf<(i32 0x105)>; 98def CmpEQU_FTZ : PatLeaf<(i32 0x10A)>; 99def CmpNEU_FTZ : PatLeaf<(i32 0x10B)>; 100def CmpLTU_FTZ : PatLeaf<(i32 0x10C)>; 101def CmpLEU_FTZ : PatLeaf<(i32 0x10D)>; 102def CmpGTU_FTZ : PatLeaf<(i32 0x10E)>; 103def CmpGEU_FTZ : PatLeaf<(i32 0x10F)>; 104def CmpNUM_FTZ : PatLeaf<(i32 0x110)>; 105def CmpNAN_FTZ : PatLeaf<(i32 0x111)>; 106 107def CmpMode : Operand<i32> { 108 let PrintMethod = "printCmpMode"; 109} 110def VecElement : Operand<i32> { 111 let PrintMethod = "printVecElement"; 112} 113 114// PRMT modes 115// These must match the enum in NVPTX.h 116def PrmtNONE : PatLeaf<(i32 0x0)>; 117def PrmtF4E : PatLeaf<(i32 0x1)>; 118def PrmtB4E : PatLeaf<(i32 0x2)>; 119def PrmtRC8 : PatLeaf<(i32 0x3)>; 120def PrmtECL : PatLeaf<(i32 0x4)>; 121def PrmtECR : PatLeaf<(i32 0x5)>; 122def PrmtRC16 : PatLeaf<(i32 0x6)>; 123 124def PrmtMode : Operand<i32> { 125 let PrintMethod = "printPrmtMode"; 126} 127 128 129//===----------------------------------------------------------------------===// 130// NVPTX Instruction Predicate Definitions 131//===----------------------------------------------------------------------===// 132 133 134def hasAtomAddF64 : Predicate<"Subtarget->hasAtomAddF64()">; 135def hasAtomScope : Predicate<"Subtarget->hasAtomScope()">; 136def hasAtomBitwise64 : Predicate<"Subtarget->hasAtomBitwise64()">; 137def hasAtomMinMax64 : Predicate<"Subtarget->hasAtomMinMax64()">; 138def hasVote : Predicate<"Subtarget->hasVote()">; 139def hasDouble : Predicate<"Subtarget->hasDouble()">; 140def hasLDG : Predicate<"Subtarget->hasLDG()">; 141def hasLDU : Predicate<"Subtarget->hasLDU()">; 142 143def doF32FTZ : Predicate<"useF32FTZ()">; 144def doNoF32FTZ : Predicate<"!useF32FTZ()">; 145def doRsqrtOpt : Predicate<"doRsqrtOpt()">; 146 147def doMulWide : Predicate<"doMulWide">; 148 149def allowFMA : Predicate<"allowFMA()">; 150def noFMA : Predicate<"!allowFMA()">; 151def allowUnsafeFPMath : Predicate<"allowUnsafeFPMath()">; 152def noUnsafeFPMath : Predicate<"!allowUnsafeFPMath()">; 153 154def do_DIVF32_APPROX : Predicate<"getDivF32Level()==0">; 155def do_DIVF32_FULL : Predicate<"getDivF32Level()==1">; 156 157def do_SQRTF32_APPROX : Predicate<"!usePrecSqrtF32()">; 158def do_SQRTF32_RN : Predicate<"usePrecSqrtF32()">; 159 160def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">; 161def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">; 162 163def True : Predicate<"true">; 164def False : Predicate<"false">; 165 166class hasPTX<int version>: Predicate<"Subtarget->getPTXVersion() >= " # version>; 167class hasSM<int version>: Predicate<"Subtarget->getSmVersion() >= " # version>; 168 169// Explicit records for arch-accelerated SM versions 170def hasSM90a : Predicate<"Subtarget->getFullSmVersion() == 901">; 171 172// non-sync shfl instructions are not available on sm_70+ in PTX6.4+ 173def hasSHFL : Predicate<"!(Subtarget->getSmVersion() >= 70" 174 "&& Subtarget->getPTXVersion() >= 64)">; 175 176def useShortPtrLocal : Predicate<"TM.is64Bit() && TM.getPointerSizeInBits(ADDRESS_SPACE_LOCAL) == 32">; 177def useShortPtrShared : Predicate<"TM.is64Bit() && TM.getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32">; 178def useShortPtrConst : Predicate<"TM.is64Bit() && TM.getPointerSizeInBits(ADDRESS_SPACE_CONST) == 32">; 179 180def useFP16Math: Predicate<"Subtarget->allowFP16Math()">; 181def hasBF16Math: Predicate<"Subtarget->hasBF16Math()">; 182 183// Helper class to aid conversion between ValueType and a matching RegisterClass. 184 185class ValueToRegClass<ValueType T> { 186 string name = !cast<string>(T); 187 NVPTXRegClass ret = !cond( 188 !eq(name, "i1"): Int1Regs, 189 !eq(name, "i16"): Int16Regs, 190 !eq(name, "v2i16"): Int32Regs, 191 !eq(name, "i32"): Int32Regs, 192 !eq(name, "i64"): Int64Regs, 193 !eq(name, "f16"): Int16Regs, 194 !eq(name, "v2f16"): Int32Regs, 195 !eq(name, "bf16"): Int16Regs, 196 !eq(name, "v2bf16"): Int32Regs, 197 !eq(name, "f32"): Float32Regs, 198 !eq(name, "f64"): Float64Regs, 199 !eq(name, "ai32"): Int32ArgRegs, 200 !eq(name, "ai64"): Int64ArgRegs, 201 !eq(name, "af32"): Float32ArgRegs, 202 !eq(name, "if64"): Float64ArgRegs, 203 ); 204} 205 206 207//===----------------------------------------------------------------------===// 208// Some Common Instruction Class Templates 209//===----------------------------------------------------------------------===// 210 211// Template for instructions which take three int64, int32, or int16 args. 212// The instructions are named "<OpcStr><Width>" (e.g. "add.s64"). 213multiclass I3<string OpcStr, SDNode OpNode> { 214 def i64rr : 215 NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b), 216 !strconcat(OpcStr, "64 \t$dst, $a, $b;"), 217 [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>; 218 def i64ri : 219 NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b), 220 !strconcat(OpcStr, "64 \t$dst, $a, $b;"), 221 [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>; 222 def i32rr : 223 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), 224 !strconcat(OpcStr, "32 \t$dst, $a, $b;"), 225 [(set Int32Regs:$dst, (OpNode (i32 Int32Regs:$a), (i32 Int32Regs:$b)))]>; 226 def i32ri : 227 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b), 228 !strconcat(OpcStr, "32 \t$dst, $a, $b;"), 229 [(set Int32Regs:$dst, (OpNode (i32 Int32Regs:$a), imm:$b))]>; 230 def i16rr : 231 NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b), 232 !strconcat(OpcStr, "16 \t$dst, $a, $b;"), 233 [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>; 234 def i16ri : 235 NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b), 236 !strconcat(OpcStr, "16 \t$dst, $a, $b;"), 237 [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (imm):$b))]>; 238} 239 240class I16x2<string OpcStr, SDNode OpNode> : 241 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), 242 !strconcat(OpcStr, "16x2 \t$dst, $a, $b;"), 243 [(set Int32Regs:$dst, (OpNode (v2i16 Int32Regs:$a), (v2i16 Int32Regs:$b)))]>, 244 Requires<[hasPTX<80>, hasSM<90>]>; 245 246// Template for instructions which take 3 int args. The instructions are 247// named "<OpcStr>.s32" (e.g. "addc.cc.s32"). 248multiclass ADD_SUB_INT_CARRY<string OpcStr, SDNode OpNode> { 249 let hasSideEffects = 1 in { 250 def i32rr : 251 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), 252 !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"), 253 [(set Int32Regs:$dst, (OpNode (i32 Int32Regs:$a), (i32 Int32Regs:$b)))]>; 254 def i32ri : 255 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b), 256 !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"), 257 [(set Int32Regs:$dst, (OpNode (i32 Int32Regs:$a), imm:$b))]>; 258 def i64rr : 259 NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b), 260 !strconcat(OpcStr, ".s64 \t$dst, $a, $b;"), 261 [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>, 262 Requires<[hasPTX<43>]>; 263 def i64ri : 264 NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b), 265 !strconcat(OpcStr, ".s64 \t$dst, $a, $b;"), 266 [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>, 267 Requires<[hasPTX<43>]>; 268 } 269} 270 271// Template for instructions which take three fp64 or fp32 args. The 272// instructions are named "<OpcStr>.f<Width>" (e.g. "min.f64"). 273// 274// Also defines ftz (flush subnormal inputs and results to sign-preserving 275// zero) variants for fp32 functions. 276// 277// This multiclass should be used for nodes that cannot be folded into FMAs. 278// For nodes that can be folded into FMAs (i.e. adds and muls), use 279// F3_fma_component. 280multiclass F3<string OpcStr, SDNode OpNode> { 281 def f64rr : 282 NVPTXInst<(outs Float64Regs:$dst), 283 (ins Float64Regs:$a, Float64Regs:$b), 284 !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"), 285 [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>; 286 def f64ri : 287 NVPTXInst<(outs Float64Regs:$dst), 288 (ins Float64Regs:$a, f64imm:$b), 289 !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"), 290 [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>; 291 def f32rr_ftz : 292 NVPTXInst<(outs Float32Regs:$dst), 293 (ins Float32Regs:$a, Float32Regs:$b), 294 !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"), 295 [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>, 296 Requires<[doF32FTZ]>; 297 def f32ri_ftz : 298 NVPTXInst<(outs Float32Regs:$dst), 299 (ins Float32Regs:$a, f32imm:$b), 300 !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"), 301 [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>, 302 Requires<[doF32FTZ]>; 303 def f32rr : 304 NVPTXInst<(outs Float32Regs:$dst), 305 (ins Float32Regs:$a, Float32Regs:$b), 306 !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"), 307 [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>; 308 def f32ri : 309 NVPTXInst<(outs Float32Regs:$dst), 310 (ins Float32Regs:$a, f32imm:$b), 311 !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"), 312 [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>; 313 314 def f16rr_ftz : 315 NVPTXInst<(outs Int16Regs:$dst), 316 (ins Int16Regs:$a, Int16Regs:$b), 317 !strconcat(OpcStr, ".ftz.f16 \t$dst, $a, $b;"), 318 [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>, 319 Requires<[useFP16Math, doF32FTZ]>; 320 def f16rr : 321 NVPTXInst<(outs Int16Regs:$dst), 322 (ins Int16Regs:$a, Int16Regs:$b), 323 !strconcat(OpcStr, ".f16 \t$dst, $a, $b;"), 324 [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>, 325 Requires<[useFP16Math]>; 326 327 def f16x2rr_ftz : 328 NVPTXInst<(outs Int32Regs:$dst), 329 (ins Int32Regs:$a, Int32Regs:$b), 330 !strconcat(OpcStr, ".ftz.f16x2 \t$dst, $a, $b;"), 331 [(set Int32Regs:$dst, (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>, 332 Requires<[useFP16Math, doF32FTZ]>; 333 def f16x2rr : 334 NVPTXInst<(outs Int32Regs:$dst), 335 (ins Int32Regs:$a, Int32Regs:$b), 336 !strconcat(OpcStr, ".f16x2 \t$dst, $a, $b;"), 337 [(set Int32Regs:$dst, (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>, 338 Requires<[useFP16Math]>; 339 def bf16rr_ftz : 340 NVPTXInst<(outs Int16Regs:$dst), 341 (ins Int16Regs:$a, Int16Regs:$b), 342 !strconcat(OpcStr, ".ftz.bf16 \t$dst, $a, $b;"), 343 [(set Int16Regs:$dst, (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>, 344 Requires<[hasBF16Math, doF32FTZ]>; 345 def bf16rr : 346 NVPTXInst<(outs Int16Regs:$dst), 347 (ins Int16Regs:$a, Int16Regs:$b), 348 !strconcat(OpcStr, ".bf16 \t$dst, $a, $b;"), 349 [(set Int16Regs:$dst, (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>, 350 Requires<[hasBF16Math]>; 351 352 def bf16x2rr_ftz : 353 NVPTXInst<(outs Int32Regs:$dst), 354 (ins Int32Regs:$a, Int32Regs:$b), 355 !strconcat(OpcStr, ".ftz.bf16x2 \t$dst, $a, $b;"), 356 [(set Int32Regs:$dst, (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>, 357 Requires<[hasBF16Math, doF32FTZ]>; 358 def bf16x2rr : 359 NVPTXInst<(outs Int32Regs:$dst), 360 (ins Int32Regs:$a, Int32Regs:$b), 361 !strconcat(OpcStr, ".bf16x2 \t$dst, $a, $b;"), 362 [(set Int32Regs:$dst, (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>, 363 Requires<[hasBF16Math]>; 364} 365 366// Template for instructions which take three FP args. The 367// instructions are named "<OpcStr>.f<Width>" (e.g. "add.f64"). 368// 369// Also defines ftz (flush subnormal inputs and results to sign-preserving 370// zero) variants for fp32/fp16 functions. 371// 372// This multiclass should be used for nodes that can be folded to make fma ops. 373// In this case, we use the ".rn" variant when FMA is disabled, as this behaves 374// just like the non ".rn" op, but prevents ptxas from creating FMAs. 375multiclass F3_fma_component<string OpcStr, SDNode OpNode> { 376 def f64rr : 377 NVPTXInst<(outs Float64Regs:$dst), 378 (ins Float64Regs:$a, Float64Regs:$b), 379 !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"), 380 [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>, 381 Requires<[allowFMA]>; 382 def f64ri : 383 NVPTXInst<(outs Float64Regs:$dst), 384 (ins Float64Regs:$a, f64imm:$b), 385 !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"), 386 [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>, 387 Requires<[allowFMA]>; 388 def f32rr_ftz : 389 NVPTXInst<(outs Float32Regs:$dst), 390 (ins Float32Regs:$a, Float32Regs:$b), 391 !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"), 392 [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>, 393 Requires<[allowFMA, doF32FTZ]>; 394 def f32ri_ftz : 395 NVPTXInst<(outs Float32Regs:$dst), 396 (ins Float32Regs:$a, f32imm:$b), 397 !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"), 398 [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>, 399 Requires<[allowFMA, doF32FTZ]>; 400 def f32rr : 401 NVPTXInst<(outs Float32Regs:$dst), 402 (ins Float32Regs:$a, Float32Regs:$b), 403 !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"), 404 [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>, 405 Requires<[allowFMA]>; 406 def f32ri : 407 NVPTXInst<(outs Float32Regs:$dst), 408 (ins Float32Regs:$a, f32imm:$b), 409 !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"), 410 [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>, 411 Requires<[allowFMA]>; 412 413 def f16rr_ftz : 414 NVPTXInst<(outs Int16Regs:$dst), 415 (ins Int16Regs:$a, Int16Regs:$b), 416 !strconcat(OpcStr, ".ftz.f16 \t$dst, $a, $b;"), 417 [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>, 418 Requires<[useFP16Math, allowFMA, doF32FTZ]>; 419 def f16rr : 420 NVPTXInst<(outs Int16Regs:$dst), 421 (ins Int16Regs:$a, Int16Regs:$b), 422 !strconcat(OpcStr, ".f16 \t$dst, $a, $b;"), 423 [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>, 424 Requires<[useFP16Math, allowFMA]>; 425 426 def f16x2rr_ftz : 427 NVPTXInst<(outs Int32Regs:$dst), 428 (ins Int32Regs:$a, Int32Regs:$b), 429 !strconcat(OpcStr, ".ftz.f16x2 \t$dst, $a, $b;"), 430 [(set (v2f16 Int32Regs:$dst), (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>, 431 Requires<[useFP16Math, allowFMA, doF32FTZ]>; 432 def f16x2rr : 433 NVPTXInst<(outs Int32Regs:$dst), 434 (ins Int32Regs:$a, Int32Regs:$b), 435 !strconcat(OpcStr, ".f16x2 \t$dst, $a, $b;"), 436 [(set Int32Regs:$dst, (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>, 437 Requires<[useFP16Math, allowFMA]>; 438 def bf16rr_ftz : 439 NVPTXInst<(outs Int16Regs:$dst), 440 (ins Int16Regs:$a, Int16Regs:$b), 441 !strconcat(OpcStr, ".ftz.bf16 \t$dst, $a, $b;"), 442 [(set Int16Regs:$dst, (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>, 443 Requires<[hasBF16Math, allowFMA, doF32FTZ]>; 444 def bf16rr : 445 NVPTXInst<(outs Int16Regs:$dst), 446 (ins Int16Regs:$a, Int16Regs:$b), 447 !strconcat(OpcStr, ".bf16 \t$dst, $a, $b;"), 448 [(set Int16Regs:$dst, (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>, 449 Requires<[hasBF16Math, allowFMA]>; 450 451 def bf16x2rr_ftz : 452 NVPTXInst<(outs Int32Regs:$dst), 453 (ins Int32Regs:$a, Int32Regs:$b), 454 !strconcat(OpcStr, ".ftz.bf16x2 \t$dst, $a, $b;"), 455 [(set (v2bf16 Int32Regs:$dst), (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>, 456 Requires<[hasBF16Math, allowFMA, doF32FTZ]>; 457 def bf16x2rr : 458 NVPTXInst<(outs Int32Regs:$dst), 459 (ins Int32Regs:$a, Int32Regs:$b), 460 !strconcat(OpcStr, ".bf16x2 \t$dst, $a, $b;"), 461 [(set Int32Regs:$dst, (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>, 462 Requires<[hasBF16Math, allowFMA]>; 463 // These have strange names so we don't perturb existing mir tests. 464 def _rnf64rr : 465 NVPTXInst<(outs Float64Regs:$dst), 466 (ins Float64Regs:$a, Float64Regs:$b), 467 !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"), 468 [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>, 469 Requires<[noFMA]>; 470 def _rnf64ri : 471 NVPTXInst<(outs Float64Regs:$dst), 472 (ins Float64Regs:$a, f64imm:$b), 473 !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"), 474 [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>, 475 Requires<[noFMA]>; 476 def _rnf32rr_ftz : 477 NVPTXInst<(outs Float32Regs:$dst), 478 (ins Float32Regs:$a, Float32Regs:$b), 479 !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"), 480 [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>, 481 Requires<[noFMA, doF32FTZ]>; 482 def _rnf32ri_ftz : 483 NVPTXInst<(outs Float32Regs:$dst), 484 (ins Float32Regs:$a, f32imm:$b), 485 !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"), 486 [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>, 487 Requires<[noFMA, doF32FTZ]>; 488 def _rnf32rr : 489 NVPTXInst<(outs Float32Regs:$dst), 490 (ins Float32Regs:$a, Float32Regs:$b), 491 !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"), 492 [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>, 493 Requires<[noFMA]>; 494 def _rnf32ri : 495 NVPTXInst<(outs Float32Regs:$dst), 496 (ins Float32Regs:$a, f32imm:$b), 497 !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"), 498 [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>, 499 Requires<[noFMA]>; 500 def _rnf16rr_ftz : 501 NVPTXInst<(outs Int16Regs:$dst), 502 (ins Int16Regs:$a, Int16Regs:$b), 503 !strconcat(OpcStr, ".rn.ftz.f16 \t$dst, $a, $b;"), 504 [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>, 505 Requires<[useFP16Math, noFMA, doF32FTZ]>; 506 def _rnf16rr : 507 NVPTXInst<(outs Int16Regs:$dst), 508 (ins Int16Regs:$a, Int16Regs:$b), 509 !strconcat(OpcStr, ".rn.f16 \t$dst, $a, $b;"), 510 [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>, 511 Requires<[useFP16Math, noFMA]>; 512 def _rnf16x2rr_ftz : 513 NVPTXInst<(outs Int32Regs:$dst), 514 (ins Int32Regs:$a, Int32Regs:$b), 515 !strconcat(OpcStr, ".rn.ftz.f16x2 \t$dst, $a, $b;"), 516 [(set Int32Regs:$dst, (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>, 517 Requires<[useFP16Math, noFMA, doF32FTZ]>; 518 def _rnf16x2rr : 519 NVPTXInst<(outs Int32Regs:$dst), 520 (ins Int32Regs:$a, Int32Regs:$b), 521 !strconcat(OpcStr, ".rn.f16x2 \t$dst, $a, $b;"), 522 [(set Int32Regs:$dst, (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>, 523 Requires<[useFP16Math, noFMA]>; 524 def _rnbf16rr_ftz : 525 NVPTXInst<(outs Int16Regs:$dst), 526 (ins Int16Regs:$a, Int16Regs:$b), 527 !strconcat(OpcStr, ".rn.ftz.bf16 \t$dst, $a, $b;"), 528 [(set Int16Regs:$dst, (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>, 529 Requires<[hasBF16Math, noFMA, doF32FTZ]>; 530 def _rnbf16rr : 531 NVPTXInst<(outs Int16Regs:$dst), 532 (ins Int16Regs:$a, Int16Regs:$b), 533 !strconcat(OpcStr, ".rn.bf16 \t$dst, $a, $b;"), 534 [(set Int16Regs:$dst, (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>, 535 Requires<[hasBF16Math, noFMA]>; 536 def _rnbf16x2rr_ftz : 537 NVPTXInst<(outs Int32Regs:$dst), 538 (ins Int32Regs:$a, Int32Regs:$b), 539 !strconcat(OpcStr, ".rn.ftz.bf16x2 \t$dst, $a, $b;"), 540 [(set Int32Regs:$dst, (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>, 541 Requires<[hasBF16Math, noFMA, doF32FTZ]>; 542 def _rnbf16x2rr : 543 NVPTXInst<(outs Int32Regs:$dst), 544 (ins Int32Regs:$a, Int32Regs:$b), 545 !strconcat(OpcStr, ".rn.bf16x2 \t$dst, $a, $b;"), 546 [(set Int32Regs:$dst, (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>, 547 Requires<[hasBF16Math, noFMA]>; 548} 549 550// Template for operations which take two f32 or f64 operands. Provides three 551// instructions: <OpcStr>.f64, <OpcStr>.f32, and <OpcStr>.ftz.f32 (flush 552// subnormal inputs and results to zero). 553multiclass F2<string OpcStr, SDNode OpNode> { 554 def f64 : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a), 555 !strconcat(OpcStr, ".f64 \t$dst, $a;"), 556 [(set Float64Regs:$dst, (OpNode Float64Regs:$a))]>; 557 def f32_ftz : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a), 558 !strconcat(OpcStr, ".ftz.f32 \t$dst, $a;"), 559 [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>, 560 Requires<[doF32FTZ]>; 561 def f32 : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a), 562 !strconcat(OpcStr, ".f32 \t$dst, $a;"), 563 [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>; 564} 565 566multiclass F2_Support_Half<string OpcStr, SDNode OpNode> { 567 def bf16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a), 568 !strconcat(OpcStr, ".bf16 \t$dst, $a;"), 569 [(set Int16Regs:$dst, (OpNode (bf16 Int16Regs:$a)))]>, 570 Requires<[hasSM<80>, hasPTX<70>]>; 571 def bf16x2 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a), 572 !strconcat(OpcStr, ".bf16x2 \t$dst, $a;"), 573 [(set Int32Regs:$dst, (OpNode (v2bf16 Int32Regs:$a)))]>, 574 Requires<[hasSM<80>, hasPTX<70>]>; 575 def f16_ftz : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a), 576 !strconcat(OpcStr, ".ftz.f16 \t$dst, $a;"), 577 [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a)))]>, 578 Requires<[hasSM<53>, hasPTX<65>, doF32FTZ]>; 579 def f16x2_ftz : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a), 580 !strconcat(OpcStr, ".ftz.f16x2 \t$dst, $a;"), 581 [(set Int32Regs:$dst, (OpNode (v2f16 Int32Regs:$a)))]>, 582 Requires<[hasSM<53>, hasPTX<65>, doF32FTZ]>; 583 def f16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a), 584 !strconcat(OpcStr, ".f16 \t$dst, $a;"), 585 [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a)))]>, 586 Requires<[hasSM<53>, hasPTX<65>]>; 587 def f16x2 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a), 588 !strconcat(OpcStr, ".f16x2 \t$dst, $a;"), 589 [(set Int32Regs:$dst, (OpNode (v2f16 Int32Regs:$a)))]>, 590 Requires<[hasSM<53>, hasPTX<65>]>; 591 592} 593 594//===----------------------------------------------------------------------===// 595// NVPTX Instructions. 596//===----------------------------------------------------------------------===// 597 598//----------------------------------- 599// Type Conversion 600//----------------------------------- 601 602let hasSideEffects = false in { 603 // Generate a cvt to the given type from all possible types. Each instance 604 // takes a CvtMode immediate that defines the conversion mode to use. It can 605 // be CvtNONE to omit a conversion mode. 606 multiclass CVT_FROM_ALL<string ToType, RegisterClass RC, list<Predicate> Preds = []> { 607 def _s8 : 608 NVPTXInst<(outs RC:$dst), 609 (ins Int16Regs:$src, CvtMode:$mode), 610 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", 611 ToType, ".s8 \t$dst, $src;"), []>, 612 Requires<Preds>; 613 def _u8 : 614 NVPTXInst<(outs RC:$dst), 615 (ins Int16Regs:$src, CvtMode:$mode), 616 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", 617 ToType, ".u8 \t$dst, $src;"), []>, 618 Requires<Preds>; 619 def _s16 : 620 NVPTXInst<(outs RC:$dst), 621 (ins Int16Regs:$src, CvtMode:$mode), 622 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", 623 ToType, ".s16 \t$dst, $src;"), []>, 624 Requires<Preds>; 625 def _u16 : 626 NVPTXInst<(outs RC:$dst), 627 (ins Int16Regs:$src, CvtMode:$mode), 628 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", 629 ToType, ".u16 \t$dst, $src;"), []>, 630 Requires<Preds>; 631 def _s32 : 632 NVPTXInst<(outs RC:$dst), 633 (ins Int32Regs:$src, CvtMode:$mode), 634 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", 635 ToType, ".s32 \t$dst, $src;"), []>, 636 Requires<Preds>; 637 def _u32 : 638 NVPTXInst<(outs RC:$dst), 639 (ins Int32Regs:$src, CvtMode:$mode), 640 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", 641 ToType, ".u32 \t$dst, $src;"), []>, 642 Requires<Preds>; 643 def _s64 : 644 NVPTXInst<(outs RC:$dst), 645 (ins Int64Regs:$src, CvtMode:$mode), 646 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", 647 ToType, ".s64 \t$dst, $src;"), []>, 648 Requires<Preds>; 649 def _u64 : 650 NVPTXInst<(outs RC:$dst), 651 (ins Int64Regs:$src, CvtMode:$mode), 652 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", 653 ToType, ".u64 \t$dst, $src;"), []>, 654 Requires<Preds>; 655 def _f16 : 656 NVPTXInst<(outs RC:$dst), 657 (ins Int16Regs:$src, CvtMode:$mode), 658 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", 659 ToType, ".f16 \t$dst, $src;"), []>, 660 Requires<Preds>; 661 def _bf16 : 662 NVPTXInst<(outs RC:$dst), 663 (ins Int16Regs:$src, CvtMode:$mode), 664 !strconcat("cvt${mode:base}${mode:ftz}${mode:relu}${mode:sat}.", 665 ToType, ".bf16 \t$dst, $src;"), []>, 666 Requires<!if(!eq(ToType, "f32"), 667 // bf16->f32 was introduced early. 668 [hasPTX<71>, hasSM<80>], 669 // bf16->everything else needs sm90/ptx78 670 [hasPTX<78>, hasSM<90>])>; 671 def _f32 : 672 NVPTXInst<(outs RC:$dst), 673 (ins Float32Regs:$src, CvtMode:$mode), 674 !strconcat("cvt${mode:base}${mode:ftz}${mode:relu}${mode:sat}.", 675 ToType, ".f32 \t$dst, $src;"), []>, 676 Requires<!if(!eq(ToType, "bf16"), 677 // f32->bf16 was introduced early. 678 [hasPTX<70>, hasSM<80>], 679 Preds)>; 680 def _f64 : 681 NVPTXInst<(outs RC:$dst), 682 (ins Float64Regs:$src, CvtMode:$mode), 683 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", 684 ToType, ".f64 \t$dst, $src;"), []>, 685 Requires<Preds>; 686 } 687 688 // Generate cvts from all types to all types. 689 defm CVT_s8 : CVT_FROM_ALL<"s8", Int16Regs>; 690 defm CVT_u8 : CVT_FROM_ALL<"u8", Int16Regs>; 691 defm CVT_s16 : CVT_FROM_ALL<"s16", Int16Regs>; 692 defm CVT_u16 : CVT_FROM_ALL<"u16", Int16Regs>; 693 defm CVT_s32 : CVT_FROM_ALL<"s32", Int32Regs>; 694 defm CVT_u32 : CVT_FROM_ALL<"u32", Int32Regs>; 695 defm CVT_s64 : CVT_FROM_ALL<"s64", Int64Regs>; 696 defm CVT_u64 : CVT_FROM_ALL<"u64", Int64Regs>; 697 defm CVT_f16 : CVT_FROM_ALL<"f16", Int16Regs>; 698 defm CVT_bf16 : CVT_FROM_ALL<"bf16", Int16Regs, [hasPTX<78>, hasSM<90>]>; 699 defm CVT_f32 : CVT_FROM_ALL<"f32", Float32Regs>; 700 defm CVT_f64 : CVT_FROM_ALL<"f64", Float64Regs>; 701 702 // These cvts are different from those above: The source and dest registers 703 // are of the same type. 704 def CVT_INREG_s16_s8 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src), 705 "cvt.s16.s8 \t$dst, $src;", []>; 706 def CVT_INREG_s32_s8 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src), 707 "cvt.s32.s8 \t$dst, $src;", []>; 708 def CVT_INREG_s32_s16 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src), 709 "cvt.s32.s16 \t$dst, $src;", []>; 710 def CVT_INREG_s64_s8 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src), 711 "cvt.s64.s8 \t$dst, $src;", []>; 712 def CVT_INREG_s64_s16 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src), 713 "cvt.s64.s16 \t$dst, $src;", []>; 714 def CVT_INREG_s64_s32 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src), 715 "cvt.s64.s32 \t$dst, $src;", []>; 716 717 multiclass CVT_FROM_FLOAT_V2_SM80<string FromName, RegisterClass RC> { 718 def _f32 : 719 NVPTXInst<(outs RC:$dst), 720 (ins Float32Regs:$src1, Float32Regs:$src2, CvtMode:$mode), 721 !strconcat("cvt${mode:base}${mode:relu}.", 722 FromName, ".f32 \t$dst, $src1, $src2;"), []>, 723 Requires<[hasPTX<70>, hasSM<80>]>; 724 } 725 726 defm CVT_f16x2 : CVT_FROM_FLOAT_V2_SM80<"f16x2", Int32Regs>; 727 defm CVT_bf16x2 : CVT_FROM_FLOAT_V2_SM80<"bf16x2", Int32Regs>; 728} 729 730//----------------------------------- 731// Selection instructions (selp) 732//----------------------------------- 733 734// TODO: Missing slct 735 736// selp instructions that don't have any pattern matches; we explicitly use 737// them within this file. 738let hasSideEffects = false in { 739 multiclass SELP<string TypeStr, RegisterClass RC, Operand ImmCls> { 740 def rr : NVPTXInst<(outs RC:$dst), 741 (ins RC:$a, RC:$b, Int1Regs:$p), 742 !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>; 743 def ri : NVPTXInst<(outs RC:$dst), 744 (ins RC:$a, ImmCls:$b, Int1Regs:$p), 745 !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>; 746 def ir : NVPTXInst<(outs RC:$dst), 747 (ins ImmCls:$a, RC:$b, Int1Regs:$p), 748 !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>; 749 def ii : NVPTXInst<(outs RC:$dst), 750 (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p), 751 !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>; 752 } 753 754 multiclass SELP_PATTERN<string TypeStr, ValueType T, RegisterClass RC, 755 Operand ImmCls, SDNode ImmNode> { 756 def rr : 757 NVPTXInst<(outs RC:$dst), 758 (ins RC:$a, RC:$b, Int1Regs:$p), 759 !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), 760 [(set (T RC:$dst), (select Int1Regs:$p, (T RC:$a), (T RC:$b)))]>; 761 def ri : 762 NVPTXInst<(outs RC:$dst), 763 (ins RC:$a, ImmCls:$b, Int1Regs:$p), 764 !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), 765 [(set (T RC:$dst), (select Int1Regs:$p, (T RC:$a), (T ImmNode:$b)))]>; 766 def ir : 767 NVPTXInst<(outs RC:$dst), 768 (ins ImmCls:$a, RC:$b, Int1Regs:$p), 769 !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), 770 [(set (T RC:$dst), (select Int1Regs:$p, ImmNode:$a, (T RC:$b)))]>; 771 def ii : 772 NVPTXInst<(outs RC:$dst), 773 (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p), 774 !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), 775 [(set (T RC:$dst), (select Int1Regs:$p, ImmNode:$a, ImmNode:$b))]>; 776 } 777} 778 779// Don't pattern match on selp.{s,u}{16,32,64} -- selp.b{16,32,64} is just as 780// good. 781defm SELP_b16 : SELP_PATTERN<"b16", i16, Int16Regs, i16imm, imm>; 782defm SELP_s16 : SELP<"s16", Int16Regs, i16imm>; 783defm SELP_u16 : SELP<"u16", Int16Regs, i16imm>; 784defm SELP_b32 : SELP_PATTERN<"b32", i32, Int32Regs, i32imm, imm>; 785defm SELP_s32 : SELP<"s32", Int32Regs, i32imm>; 786defm SELP_u32 : SELP<"u32", Int32Regs, i32imm>; 787defm SELP_b64 : SELP_PATTERN<"b64", i64, Int64Regs, i64imm, imm>; 788defm SELP_s64 : SELP<"s64", Int64Regs, i64imm>; 789defm SELP_u64 : SELP<"u64", Int64Regs, i64imm>; 790defm SELP_f16 : SELP_PATTERN<"b16", f16, Int16Regs, f16imm, fpimm>; 791defm SELP_bf16 : SELP_PATTERN<"b16", bf16, Int16Regs, bf16imm, fpimm>; 792 793defm SELP_f32 : SELP_PATTERN<"f32", f32, Float32Regs, f32imm, fpimm>; 794defm SELP_f64 : SELP_PATTERN<"f64", f64, Float64Regs, f64imm, fpimm>; 795 796// This does not work as tablegen fails to infer the type of 'imm'. 797// def v2f16imm : Operand<v2f16>; 798// defm SELP_f16x2 : SELP_PATTERN<"b32", v2f16, Int32Regs, v2f16imm, imm>; 799 800foreach vt = [v2f16, v2bf16, v2i16, v4i8] in { 801def : Pat<(vt (select Int1Regs:$p, (vt Int32Regs:$a), (vt Int32Regs:$b))), 802 (SELP_b32rr Int32Regs:$a, Int32Regs:$b, Int1Regs:$p)>; 803} 804 805//----------------------------------- 806// Test Instructions 807//----------------------------------- 808 809def TESTINF_f32r : NVPTXInst<(outs Int1Regs:$p), (ins Float32Regs:$a), 810 "testp.infinite.f32 \t$p, $a;", 811 []>; 812def TESTINF_f32i : NVPTXInst<(outs Int1Regs:$p), (ins f32imm:$a), 813 "testp.infinite.f32 \t$p, $a;", 814 []>; 815def TESTINF_f64r : NVPTXInst<(outs Int1Regs:$p), (ins Float64Regs:$a), 816 "testp.infinite.f64 \t$p, $a;", 817 []>; 818def TESTINF_f64i : NVPTXInst<(outs Int1Regs:$p), (ins f64imm:$a), 819 "testp.infinite.f64 \t$p, $a;", 820 []>; 821 822//----------------------------------- 823// Integer Arithmetic 824//----------------------------------- 825 826// Template for xor masquerading as int1 arithmetic. 827multiclass ADD_SUB_i1<SDNode OpNode> { 828 def _rr: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b), 829 "xor.pred \t$dst, $a, $b;", 830 [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>; 831 def _ri: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b), 832 "xor.pred \t$dst, $a, $b;", 833 [(set Int1Regs:$dst, (OpNode Int1Regs:$a, (imm):$b))]>; 834} 835 836// int1 addition and subtraction are both just xor. 837defm ADD_i1 : ADD_SUB_i1<add>; 838defm SUB_i1 : ADD_SUB_i1<sub>; 839 840// int16, int32, and int64 signed addition. Since nvptx is 2's complement, we 841// also use these for unsigned arithmetic. 842defm ADD : I3<"add.s", add>; 843defm SUB : I3<"sub.s", sub>; 844 845def ADD16x2 : I16x2<"add.s", add>; 846 847// in32 and int64 addition and subtraction with carry-out. 848defm ADDCC : ADD_SUB_INT_CARRY<"add.cc", addc>; 849defm SUBCC : ADD_SUB_INT_CARRY<"sub.cc", subc>; 850 851// int32 and int64 addition and subtraction with carry-in and carry-out. 852defm ADDCCC : ADD_SUB_INT_CARRY<"addc.cc", adde>; 853defm SUBCCC : ADD_SUB_INT_CARRY<"subc.cc", sube>; 854 855defm MULT : I3<"mul.lo.s", mul>; 856 857defm MULTHS : I3<"mul.hi.s", mulhs>; 858defm MULTHU : I3<"mul.hi.u", mulhu>; 859 860defm SDIV : I3<"div.s", sdiv>; 861defm UDIV : I3<"div.u", udiv>; 862 863// The ri versions of rem.s and rem.u won't be selected; DAGCombiner::visitSREM 864// will lower it. 865defm SREM : I3<"rem.s", srem>; 866defm UREM : I3<"rem.u", urem>; 867 868// Integer absolute value. NumBits should be one minus the bit width of RC. 869// This idiom implements the algorithm at 870// http://graphics.stanford.edu/~seander/bithacks.html#IntegerAbs. 871multiclass ABS<ValueType T, RegisterClass RC, string SizeName> { 872 def : NVPTXInst<(outs RC:$dst), (ins RC:$a), 873 !strconcat("abs", SizeName, " \t$dst, $a;"), 874 [(set (T RC:$dst), (abs (T RC:$a)))]>; 875} 876defm ABS_16 : ABS<i16, Int16Regs, ".s16">; 877defm ABS_32 : ABS<i32, Int32Regs, ".s32">; 878defm ABS_64 : ABS<i64, Int64Regs, ".s64">; 879 880// Integer min/max. 881defm SMAX : I3<"max.s", smax>; 882defm UMAX : I3<"max.u", umax>; 883defm SMIN : I3<"min.s", smin>; 884defm UMIN : I3<"min.u", umin>; 885 886def SMAX16x2 : I16x2<"max.s", smax>; 887def UMAX16x2 : I16x2<"max.u", umax>; 888def SMIN16x2 : I16x2<"min.s", smin>; 889def UMIN16x2 : I16x2<"min.u", umin>; 890 891 892// 893// Wide multiplication 894// 895def MULWIDES64 : 896 NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), 897 "mul.wide.s32 \t$dst, $a, $b;", []>; 898def MULWIDES64Imm : 899 NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b), 900 "mul.wide.s32 \t$dst, $a, $b;", []>; 901def MULWIDES64Imm64 : 902 NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b), 903 "mul.wide.s32 \t$dst, $a, $b;", []>; 904 905def MULWIDEU64 : 906 NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), 907 "mul.wide.u32 \t$dst, $a, $b;", []>; 908def MULWIDEU64Imm : 909 NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b), 910 "mul.wide.u32 \t$dst, $a, $b;", []>; 911def MULWIDEU64Imm64 : 912 NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b), 913 "mul.wide.u32 \t$dst, $a, $b;", []>; 914 915def MULWIDES32 : 916 NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b), 917 "mul.wide.s16 \t$dst, $a, $b;", []>; 918def MULWIDES32Imm : 919 NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b), 920 "mul.wide.s16 \t$dst, $a, $b;", []>; 921def MULWIDES32Imm32 : 922 NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b), 923 "mul.wide.s16 \t$dst, $a, $b;", []>; 924 925def MULWIDEU32 : 926 NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b), 927 "mul.wide.u16 \t$dst, $a, $b;", []>; 928def MULWIDEU32Imm : 929 NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b), 930 "mul.wide.u16 \t$dst, $a, $b;", []>; 931def MULWIDEU32Imm32 : 932 NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b), 933 "mul.wide.u16 \t$dst, $a, $b;", []>; 934 935def SDTMulWide : SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>]>; 936def mul_wide_signed : SDNode<"NVPTXISD::MUL_WIDE_SIGNED", SDTMulWide>; 937def mul_wide_unsigned : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide>; 938 939// Matchers for signed, unsigned mul.wide ISD nodes. 940def : Pat<(i32 (mul_wide_signed i16:$a, i16:$b)), 941 (MULWIDES32 i16:$a, i16:$b)>, 942 Requires<[doMulWide]>; 943def : Pat<(i32 (mul_wide_signed Int16Regs:$a, imm:$b)), 944 (MULWIDES32Imm Int16Regs:$a, imm:$b)>, 945 Requires<[doMulWide]>; 946def : Pat<(i32 (mul_wide_unsigned i16:$a, i16:$b)), 947 (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>, 948 Requires<[doMulWide]>; 949def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, imm:$b)), 950 (MULWIDEU32Imm Int16Regs:$a, imm:$b)>, 951 Requires<[doMulWide]>; 952 953def : Pat<(i64 (mul_wide_signed i32:$a, i32:$b)), 954 (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>, 955 Requires<[doMulWide]>; 956def : Pat<(i64 (mul_wide_signed (i32 Int32Regs:$a), imm:$b)), 957 (MULWIDES64Imm Int32Regs:$a, imm:$b)>, 958 Requires<[doMulWide]>; 959def : Pat<(i64 (mul_wide_unsigned i32:$a, i32:$b)), 960 (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>, 961 Requires<[doMulWide]>; 962def : Pat<(i64 (mul_wide_unsigned (i32 Int32Regs:$a), imm:$b)), 963 (MULWIDEU64Imm Int32Regs:$a, imm:$b)>, 964 Requires<[doMulWide]>; 965 966// Predicates used for converting some patterns to mul.wide. 967def SInt32Const : PatLeaf<(imm), [{ 968 const APInt &v = N->getAPIntValue(); 969 return v.isSignedIntN(32); 970}]>; 971 972def UInt32Const : PatLeaf<(imm), [{ 973 const APInt &v = N->getAPIntValue(); 974 return v.isIntN(32); 975}]>; 976 977def SInt16Const : PatLeaf<(imm), [{ 978 const APInt &v = N->getAPIntValue(); 979 return v.isSignedIntN(16); 980}]>; 981 982def UInt16Const : PatLeaf<(imm), [{ 983 const APInt &v = N->getAPIntValue(); 984 return v.isIntN(16); 985}]>; 986 987def IntConst_0_30 : PatLeaf<(imm), [{ 988 // Check if 0 <= v < 31; only then will the result of (x << v) be an int32. 989 const APInt &v = N->getAPIntValue(); 990 return v.sge(0) && v.slt(31); 991}]>; 992 993def IntConst_0_14 : PatLeaf<(imm), [{ 994 // Check if 0 <= v < 15; only then will the result of (x << v) be an int16. 995 const APInt &v = N->getAPIntValue(); 996 return v.sge(0) && v.slt(15); 997}]>; 998 999def SHL2MUL32 : SDNodeXForm<imm, [{ 1000 const APInt &v = N->getAPIntValue(); 1001 APInt temp(32, 1); 1002 return CurDAG->getTargetConstant(temp.shl(v), SDLoc(N), MVT::i32); 1003}]>; 1004 1005def SHL2MUL16 : SDNodeXForm<imm, [{ 1006 const APInt &v = N->getAPIntValue(); 1007 APInt temp(16, 1); 1008 return CurDAG->getTargetConstant(temp.shl(v), SDLoc(N), MVT::i16); 1009}]>; 1010 1011// Convert "sign/zero-extend, then shift left by an immediate" to mul.wide. 1012def : Pat<(shl (sext Int32Regs:$a), (i32 IntConst_0_30:$b)), 1013 (MULWIDES64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>, 1014 Requires<[doMulWide]>; 1015def : Pat<(shl (zext Int32Regs:$a), (i32 IntConst_0_30:$b)), 1016 (MULWIDEU64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>, 1017 Requires<[doMulWide]>; 1018 1019def : Pat<(shl (sext Int16Regs:$a), (i16 IntConst_0_14:$b)), 1020 (MULWIDES32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>, 1021 Requires<[doMulWide]>; 1022def : Pat<(shl (zext Int16Regs:$a), (i16 IntConst_0_14:$b)), 1023 (MULWIDEU32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>, 1024 Requires<[doMulWide]>; 1025 1026// Convert "sign/zero-extend then multiply" to mul.wide. 1027def : Pat<(mul (sext Int32Regs:$a), (sext Int32Regs:$b)), 1028 (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>, 1029 Requires<[doMulWide]>; 1030def : Pat<(mul (sext Int32Regs:$a), (i64 SInt32Const:$b)), 1031 (MULWIDES64Imm64 Int32Regs:$a, (i64 SInt32Const:$b))>, 1032 Requires<[doMulWide]>; 1033 1034def : Pat<(mul (zext Int32Regs:$a), (zext Int32Regs:$b)), 1035 (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>, 1036 Requires<[doMulWide]>; 1037def : Pat<(mul (zext Int32Regs:$a), (i64 UInt32Const:$b)), 1038 (MULWIDEU64Imm64 Int32Regs:$a, (i64 UInt32Const:$b))>, 1039 Requires<[doMulWide]>; 1040 1041def : Pat<(mul (sext Int16Regs:$a), (sext Int16Regs:$b)), 1042 (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>, 1043 Requires<[doMulWide]>; 1044def : Pat<(mul (sext Int16Regs:$a), (i32 SInt16Const:$b)), 1045 (MULWIDES32Imm32 Int16Regs:$a, (i32 SInt16Const:$b))>, 1046 Requires<[doMulWide]>; 1047 1048def : Pat<(mul (zext Int16Regs:$a), (zext Int16Regs:$b)), 1049 (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>, 1050 Requires<[doMulWide]>; 1051def : Pat<(mul (zext Int16Regs:$a), (i32 UInt16Const:$b)), 1052 (MULWIDEU32Imm32 Int16Regs:$a, (i32 UInt16Const:$b))>, 1053 Requires<[doMulWide]>; 1054 1055// 1056// Integer multiply-add 1057// 1058def SDTIMAD : 1059 SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisInt<0>, SDTCisInt<2>, 1060 SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>]>; 1061def imad : SDNode<"NVPTXISD::IMAD", SDTIMAD>; 1062 1063def MAD16rrr : 1064 NVPTXInst<(outs Int16Regs:$dst), 1065 (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c), 1066 "mad.lo.s16 \t$dst, $a, $b, $c;", 1067 [(set Int16Regs:$dst, (imad Int16Regs:$a, Int16Regs:$b, Int16Regs:$c))]>; 1068def MAD16rri : 1069 NVPTXInst<(outs Int16Regs:$dst), 1070 (ins Int16Regs:$a, Int16Regs:$b, i16imm:$c), 1071 "mad.lo.s16 \t$dst, $a, $b, $c;", 1072 [(set Int16Regs:$dst, (imad Int16Regs:$a, Int16Regs:$b, imm:$c))]>; 1073def MAD16rir : 1074 NVPTXInst<(outs Int16Regs:$dst), 1075 (ins Int16Regs:$a, i16imm:$b, Int16Regs:$c), 1076 "mad.lo.s16 \t$dst, $a, $b, $c;", 1077 [(set Int16Regs:$dst, (imad Int16Regs:$a, imm:$b, Int16Regs:$c))]>; 1078def MAD16rii : 1079 NVPTXInst<(outs Int16Regs:$dst), 1080 (ins Int16Regs:$a, i16imm:$b, i16imm:$c), 1081 "mad.lo.s16 \t$dst, $a, $b, $c;", 1082 [(set Int16Regs:$dst, (imad Int16Regs:$a, imm:$b, imm:$c))]>; 1083 1084def MAD32rrr : 1085 NVPTXInst<(outs Int32Regs:$dst), 1086 (ins Int32Regs:$a, Int32Regs:$b, Int32Regs:$c), 1087 "mad.lo.s32 \t$dst, $a, $b, $c;", 1088 [(set (i32 Int32Regs:$dst), (imad (i32 Int32Regs:$a), (i32 Int32Regs:$b), (i32 Int32Regs:$c)))]>; 1089def MAD32rri : 1090 NVPTXInst<(outs Int32Regs:$dst), 1091 (ins Int32Regs:$a, Int32Regs:$b, i32imm:$c), 1092 "mad.lo.s32 \t$dst, $a, $b, $c;", 1093 [(set (i32 Int32Regs:$dst), (imad (i32 Int32Regs:$a), (i32 Int32Regs:$b), imm:$c))]>; 1094def MAD32rir : 1095 NVPTXInst<(outs Int32Regs:$dst), 1096 (ins Int32Regs:$a, i32imm:$b, Int32Regs:$c), 1097 "mad.lo.s32 \t$dst, $a, $b, $c;", 1098 [(set (i32 Int32Regs:$dst), (imad (i32 Int32Regs:$a), imm:$b, (i32 Int32Regs:$c)))]>; 1099def MAD32rii : 1100 NVPTXInst<(outs Int32Regs:$dst), 1101 (ins Int32Regs:$a, i32imm:$b, i32imm:$c), 1102 "mad.lo.s32 \t$dst, $a, $b, $c;", 1103 [(set (i32 Int32Regs:$dst), (imad (i32 Int32Regs:$a), imm:$b, imm:$c))]>; 1104 1105def MAD64rrr : 1106 NVPTXInst<(outs Int64Regs:$dst), 1107 (ins Int64Regs:$a, Int64Regs:$b, Int64Regs:$c), 1108 "mad.lo.s64 \t$dst, $a, $b, $c;", 1109 [(set Int64Regs:$dst, (imad Int64Regs:$a, Int64Regs:$b, Int64Regs:$c))]>; 1110def MAD64rri : 1111 NVPTXInst<(outs Int64Regs:$dst), 1112 (ins Int64Regs:$a, Int64Regs:$b, i64imm:$c), 1113 "mad.lo.s64 \t$dst, $a, $b, $c;", 1114 [(set Int64Regs:$dst, (imad Int64Regs:$a, Int64Regs:$b, imm:$c))]>; 1115def MAD64rir : 1116 NVPTXInst<(outs Int64Regs:$dst), 1117 (ins Int64Regs:$a, i64imm:$b, Int64Regs:$c), 1118 "mad.lo.s64 \t$dst, $a, $b, $c;", 1119 [(set Int64Regs:$dst, (imad Int64Regs:$a, imm:$b, Int64Regs:$c))]>; 1120def MAD64rii : 1121 NVPTXInst<(outs Int64Regs:$dst), 1122 (ins Int64Regs:$a, i64imm:$b, i64imm:$c), 1123 "mad.lo.s64 \t$dst, $a, $b, $c;", 1124 [(set Int64Regs:$dst, (imad Int64Regs:$a, imm:$b, imm:$c))]>; 1125 1126def INEG16 : 1127 NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src), 1128 "neg.s16 \t$dst, $src;", 1129 [(set Int16Regs:$dst, (ineg Int16Regs:$src))]>; 1130def INEG32 : 1131 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src), 1132 "neg.s32 \t$dst, $src;", 1133 [(set (i32 Int32Regs:$dst), (ineg (i32 Int32Regs:$src)))]>; 1134def INEG64 : 1135 NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src), 1136 "neg.s64 \t$dst, $src;", 1137 [(set Int64Regs:$dst, (ineg Int64Regs:$src))]>; 1138 1139//----------------------------------- 1140// Floating Point Arithmetic 1141//----------------------------------- 1142 1143// Constant 1.0f 1144def FloatConst1 : PatLeaf<(fpimm), [{ 1145 return &N->getValueAPF().getSemantics() == &llvm::APFloat::IEEEsingle() && 1146 N->getValueAPF().convertToFloat() == 1.0f; 1147}]>; 1148// Constant 1.0 (double) 1149def DoubleConst1 : PatLeaf<(fpimm), [{ 1150 return &N->getValueAPF().getSemantics() == &llvm::APFloat::IEEEdouble() && 1151 N->getValueAPF().convertToDouble() == 1.0; 1152}]>; 1153// Constant -1.0 (double) 1154def DoubleConstNeg1 : PatLeaf<(fpimm), [{ 1155 return &N->getValueAPF().getSemantics() == &llvm::APFloat::IEEEdouble() && 1156 N->getValueAPF().convertToDouble() == -1.0; 1157}]>; 1158 1159 1160// Constant -X -> X (double) 1161def NegDoubleConst : SDNodeXForm<fpimm, [{ 1162 return CurDAG->getTargetConstantFP(-(N->getValueAPF()), 1163 SDLoc(N), MVT::f64); 1164}]>; 1165 1166// Loads FP16 constant into a register. 1167// 1168// ptxas does not have hex representation for fp16, so we can't use 1169// fp16 immediate values in .f16 instructions. Instead we have to load 1170// the constant into a register using mov.b16. 1171def LOAD_CONST_F16 : 1172 NVPTXInst<(outs Int16Regs:$dst), (ins f16imm:$a), 1173 "mov.b16 \t$dst, $a;", []>; 1174def LOAD_CONST_BF16 : 1175 NVPTXInst<(outs Int16Regs:$dst), (ins bf16imm:$a), 1176 "mov.b16 \t$dst, $a;", []>; 1177defm FADD : F3_fma_component<"add", fadd>; 1178defm FSUB : F3_fma_component<"sub", fsub>; 1179defm FMUL : F3_fma_component<"mul", fmul>; 1180 1181defm FMIN : F3<"min", fminnum>; 1182defm FMAX : F3<"max", fmaxnum>; 1183// Note: min.NaN.f64 and max.NaN.f64 do not actually exist. 1184defm FMINNAN : F3<"min.NaN", fminimum>; 1185defm FMAXNAN : F3<"max.NaN", fmaximum>; 1186 1187defm FABS : F2<"abs", fabs>; 1188defm FNEG : F2<"neg", fneg>; 1189defm FABS_H: F2_Support_Half<"abs", fabs>; 1190defm FNEG_H: F2_Support_Half<"neg", fneg>; 1191 1192defm FSQRT : F2<"sqrt.rn", fsqrt>; 1193 1194// 1195// F16 NEG 1196// 1197class FNEG_F16_F16X2<string OpcStr, ValueType T, RegisterClass RC, Predicate Pred> : 1198 NVPTXInst<(outs RC:$dst), (ins RC:$src), 1199 !strconcat(OpcStr, " \t$dst, $src;"), 1200 [(set RC:$dst, (fneg (T RC:$src)))]>, 1201 Requires<[useFP16Math, hasPTX<60>, hasSM<53>, Pred]>; 1202def FNEG16_ftz : FNEG_F16_F16X2<"neg.ftz.f16", f16, Int16Regs, doF32FTZ>; 1203def FNEG16 : FNEG_F16_F16X2<"neg.f16", f16, Int16Regs, True>; 1204def FNEG16x2_ftz : FNEG_F16_F16X2<"neg.ftz.f16x2", v2f16, Int32Regs, doF32FTZ>; 1205def FNEG16x2 : FNEG_F16_F16X2<"neg.f16x2", v2f16, Int32Regs, True>; 1206 1207// 1208// BF16 NEG 1209// 1210 1211class FNEG_BF16_F16X2<string OpcStr, ValueType T, RegisterClass RC, Predicate Pred> : 1212 NVPTXInst<(outs RC:$dst), (ins RC:$src), 1213 !strconcat(OpcStr, " \t$dst, $src;"), 1214 [(set RC:$dst, (fneg (T RC:$src)))]>, 1215 Requires<[hasBF16Math, hasPTX<70>, hasSM<80>, Pred]>; 1216def BFNEG16_ftz : FNEG_BF16_F16X2<"neg.ftz.bf16", bf16, Int16Regs, doF32FTZ>; 1217def BFNEG16 : FNEG_BF16_F16X2<"neg.bf16", bf16, Int16Regs, True>; 1218def BFNEG16x2_ftz : FNEG_BF16_F16X2<"neg.ftz.bf16x2", v2bf16, Int32Regs, doF32FTZ>; 1219def BFNEG16x2 : FNEG_BF16_F16X2<"neg.bf16x2", v2bf16, Int32Regs, True>; 1220 1221// 1222// F64 division 1223// 1224def FDIV641r : 1225 NVPTXInst<(outs Float64Regs:$dst), 1226 (ins f64imm:$a, Float64Regs:$b), 1227 "rcp.rn.f64 \t$dst, $b;", 1228 [(set Float64Regs:$dst, (fdiv DoubleConst1:$a, Float64Regs:$b))]>; 1229def FDIV64rr : 1230 NVPTXInst<(outs Float64Regs:$dst), 1231 (ins Float64Regs:$a, Float64Regs:$b), 1232 "div.rn.f64 \t$dst, $a, $b;", 1233 [(set Float64Regs:$dst, (fdiv Float64Regs:$a, Float64Regs:$b))]>; 1234def FDIV64ri : 1235 NVPTXInst<(outs Float64Regs:$dst), 1236 (ins Float64Regs:$a, f64imm:$b), 1237 "div.rn.f64 \t$dst, $a, $b;", 1238 [(set Float64Regs:$dst, (fdiv Float64Regs:$a, fpimm:$b))]>; 1239 1240// fdiv will be converted to rcp 1241// fneg (fdiv 1.0, X) => fneg (rcp.rn X) 1242def : Pat<(fdiv DoubleConstNeg1:$a, Float64Regs:$b), 1243 (FNEGf64 (FDIV641r (NegDoubleConst node:$a), Float64Regs:$b))>; 1244 1245// 1246// F32 Approximate reciprocal 1247// 1248def FDIV321r_ftz : 1249 NVPTXInst<(outs Float32Regs:$dst), 1250 (ins f32imm:$a, Float32Regs:$b), 1251 "rcp.approx.ftz.f32 \t$dst, $b;", 1252 [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>, 1253 Requires<[do_DIVF32_APPROX, doF32FTZ]>; 1254def FDIV321r : 1255 NVPTXInst<(outs Float32Regs:$dst), 1256 (ins f32imm:$a, Float32Regs:$b), 1257 "rcp.approx.f32 \t$dst, $b;", 1258 [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>, 1259 Requires<[do_DIVF32_APPROX]>; 1260// 1261// F32 Approximate division 1262// 1263def FDIV32approxrr_ftz : 1264 NVPTXInst<(outs Float32Regs:$dst), 1265 (ins Float32Regs:$a, Float32Regs:$b), 1266 "div.approx.ftz.f32 \t$dst, $a, $b;", 1267 [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>, 1268 Requires<[do_DIVF32_APPROX, doF32FTZ]>; 1269def FDIV32approxri_ftz : 1270 NVPTXInst<(outs Float32Regs:$dst), 1271 (ins Float32Regs:$a, f32imm:$b), 1272 "div.approx.ftz.f32 \t$dst, $a, $b;", 1273 [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>, 1274 Requires<[do_DIVF32_APPROX, doF32FTZ]>; 1275def FDIV32approxrr : 1276 NVPTXInst<(outs Float32Regs:$dst), 1277 (ins Float32Regs:$a, Float32Regs:$b), 1278 "div.approx.f32 \t$dst, $a, $b;", 1279 [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>, 1280 Requires<[do_DIVF32_APPROX]>; 1281def FDIV32approxri : 1282 NVPTXInst<(outs Float32Regs:$dst), 1283 (ins Float32Regs:$a, f32imm:$b), 1284 "div.approx.f32 \t$dst, $a, $b;", 1285 [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>, 1286 Requires<[do_DIVF32_APPROX]>; 1287// 1288// F32 Semi-accurate reciprocal 1289// 1290// rcp.approx gives the same result as div.full(1.0f, a) and is faster. 1291// 1292def FDIV321r_approx_ftz : 1293 NVPTXInst<(outs Float32Regs:$dst), 1294 (ins f32imm:$a, Float32Regs:$b), 1295 "rcp.approx.ftz.f32 \t$dst, $b;", 1296 [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>, 1297 Requires<[do_DIVF32_FULL, doF32FTZ]>; 1298def FDIV321r_approx : 1299 NVPTXInst<(outs Float32Regs:$dst), 1300 (ins f32imm:$a, Float32Regs:$b), 1301 "rcp.approx.f32 \t$dst, $b;", 1302 [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>, 1303 Requires<[do_DIVF32_FULL]>; 1304// 1305// F32 Semi-accurate division 1306// 1307def FDIV32rr_ftz : 1308 NVPTXInst<(outs Float32Regs:$dst), 1309 (ins Float32Regs:$a, Float32Regs:$b), 1310 "div.full.ftz.f32 \t$dst, $a, $b;", 1311 [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>, 1312 Requires<[do_DIVF32_FULL, doF32FTZ]>; 1313def FDIV32ri_ftz : 1314 NVPTXInst<(outs Float32Regs:$dst), 1315 (ins Float32Regs:$a, f32imm:$b), 1316 "div.full.ftz.f32 \t$dst, $a, $b;", 1317 [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>, 1318 Requires<[do_DIVF32_FULL, doF32FTZ]>; 1319def FDIV32rr : 1320 NVPTXInst<(outs Float32Regs:$dst), 1321 (ins Float32Regs:$a, Float32Regs:$b), 1322 "div.full.f32 \t$dst, $a, $b;", 1323 [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>, 1324 Requires<[do_DIVF32_FULL]>; 1325def FDIV32ri : 1326 NVPTXInst<(outs Float32Regs:$dst), 1327 (ins Float32Regs:$a, f32imm:$b), 1328 "div.full.f32 \t$dst, $a, $b;", 1329 [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>, 1330 Requires<[do_DIVF32_FULL]>; 1331// 1332// F32 Accurate reciprocal 1333// 1334def FDIV321r_prec_ftz : 1335 NVPTXInst<(outs Float32Regs:$dst), 1336 (ins f32imm:$a, Float32Regs:$b), 1337 "rcp.rn.ftz.f32 \t$dst, $b;", 1338 [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>, 1339 Requires<[doF32FTZ]>; 1340def FDIV321r_prec : 1341 NVPTXInst<(outs Float32Regs:$dst), 1342 (ins f32imm:$a, Float32Regs:$b), 1343 "rcp.rn.f32 \t$dst, $b;", 1344 [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>; 1345// 1346// F32 Accurate division 1347// 1348def FDIV32rr_prec_ftz : 1349 NVPTXInst<(outs Float32Regs:$dst), 1350 (ins Float32Regs:$a, Float32Regs:$b), 1351 "div.rn.ftz.f32 \t$dst, $a, $b;", 1352 [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>, 1353 Requires<[doF32FTZ]>; 1354def FDIV32ri_prec_ftz : 1355 NVPTXInst<(outs Float32Regs:$dst), 1356 (ins Float32Regs:$a, f32imm:$b), 1357 "div.rn.ftz.f32 \t$dst, $a, $b;", 1358 [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>, 1359 Requires<[doF32FTZ]>; 1360def FDIV32rr_prec : 1361 NVPTXInst<(outs Float32Regs:$dst), 1362 (ins Float32Regs:$a, Float32Regs:$b), 1363 "div.rn.f32 \t$dst, $a, $b;", 1364 [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>; 1365def FDIV32ri_prec : 1366 NVPTXInst<(outs Float32Regs:$dst), 1367 (ins Float32Regs:$a, f32imm:$b), 1368 "div.rn.f32 \t$dst, $a, $b;", 1369 [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>; 1370 1371// 1372// FMA 1373// 1374 1375multiclass FMA<string OpcStr, RegisterClass RC, Operand ImmCls, Predicate Pred> { 1376 def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c), 1377 !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), 1378 [(set RC:$dst, (fma RC:$a, RC:$b, RC:$c))]>, 1379 Requires<[Pred]>; 1380 def rri : NVPTXInst<(outs RC:$dst), 1381 (ins RC:$a, RC:$b, ImmCls:$c), 1382 !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), 1383 [(set RC:$dst, (fma RC:$a, RC:$b, fpimm:$c))]>, 1384 Requires<[Pred]>; 1385 def rir : NVPTXInst<(outs RC:$dst), 1386 (ins RC:$a, ImmCls:$b, RC:$c), 1387 !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), 1388 [(set RC:$dst, (fma RC:$a, fpimm:$b, RC:$c))]>, 1389 Requires<[Pred]>; 1390 def rii : NVPTXInst<(outs RC:$dst), 1391 (ins RC:$a, ImmCls:$b, ImmCls:$c), 1392 !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), 1393 [(set RC:$dst, (fma RC:$a, fpimm:$b, fpimm:$c))]>, 1394 Requires<[Pred]>; 1395} 1396 1397multiclass FMA_F16<string OpcStr, ValueType T, RegisterClass RC, Predicate Pred> { 1398 def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c), 1399 !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), 1400 [(set RC:$dst, (fma (T RC:$a), (T RC:$b), (T RC:$c)))]>, 1401 Requires<[useFP16Math, Pred]>; 1402} 1403 1404multiclass FMA_BF16<string OpcStr, ValueType T, RegisterClass RC, Predicate Pred> { 1405 def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c), 1406 !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), 1407 [(set RC:$dst, (fma (T RC:$a), (T RC:$b), (T RC:$c)))]>, 1408 Requires<[hasBF16Math, Pred]>; 1409} 1410 1411defm FMA16_ftz : FMA_F16<"fma.rn.ftz.f16", f16, Int16Regs, doF32FTZ>; 1412defm FMA16 : FMA_F16<"fma.rn.f16", f16, Int16Regs, True>; 1413defm FMA16x2_ftz : FMA_F16<"fma.rn.ftz.f16x2", v2f16, Int32Regs, doF32FTZ>; 1414defm FMA16x2 : FMA_F16<"fma.rn.f16x2", v2f16, Int32Regs, True>; 1415defm BFMA16_ftz : FMA_BF16<"fma.rn.ftz.bf16", bf16, Int16Regs, doF32FTZ>; 1416defm BFMA16 : FMA_BF16<"fma.rn.bf16", bf16, Int16Regs, True>; 1417defm BFMA16x2_ftz : FMA_BF16<"fma.rn.ftz.bf16x2", v2bf16, Int32Regs, doF32FTZ>; 1418defm BFMA16x2 : FMA_BF16<"fma.rn.bf16x2", v2bf16, Int32Regs, True>; 1419defm FMA32_ftz : FMA<"fma.rn.ftz.f32", Float32Regs, f32imm, doF32FTZ>; 1420defm FMA32 : FMA<"fma.rn.f32", Float32Regs, f32imm, True>; 1421defm FMA64 : FMA<"fma.rn.f64", Float64Regs, f64imm, True>; 1422 1423// sin/cos 1424def SINF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src), 1425 "sin.approx.f32 \t$dst, $src;", 1426 [(set Float32Regs:$dst, (fsin Float32Regs:$src))]>, 1427 Requires<[allowUnsafeFPMath]>; 1428def COSF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src), 1429 "cos.approx.f32 \t$dst, $src;", 1430 [(set Float32Regs:$dst, (fcos Float32Regs:$src))]>, 1431 Requires<[allowUnsafeFPMath]>; 1432 1433// Lower (frem x, y) into (sub x, (mul (ftrunc (div x, y)) y)), 1434// i.e. "poor man's fmod()". When y is infinite, x is returned. This matches the 1435// semantics of LLVM's frem. 1436 1437// frem - f32 FTZ 1438def : Pat<(frem Float32Regs:$x, Float32Regs:$y), 1439 (FSUBf32rr_ftz Float32Regs:$x, (FMULf32rr_ftz (CVT_f32_f32 1440 (FDIV32rr_prec_ftz Float32Regs:$x, Float32Regs:$y), CvtRZI_FTZ), 1441 Float32Regs:$y))>, 1442 Requires<[doF32FTZ, allowUnsafeFPMath]>; 1443def : Pat<(frem Float32Regs:$x, fpimm:$y), 1444 (FSUBf32rr_ftz Float32Regs:$x, (FMULf32ri_ftz (CVT_f32_f32 1445 (FDIV32ri_prec_ftz Float32Regs:$x, fpimm:$y), CvtRZI_FTZ), 1446 fpimm:$y))>, 1447 Requires<[doF32FTZ, allowUnsafeFPMath]>; 1448 1449def : Pat<(frem Float32Regs:$x, Float32Regs:$y), 1450 (SELP_f32rr Float32Regs:$x, 1451 (FSUBf32rr_ftz Float32Regs:$x, (FMULf32rr_ftz (CVT_f32_f32 1452 (FDIV32rr_prec_ftz Float32Regs:$x, Float32Regs:$y), CvtRZI_FTZ), 1453 Float32Regs:$y)), 1454 (TESTINF_f32r Float32Regs:$y))>, 1455 Requires<[doF32FTZ, noUnsafeFPMath]>; 1456def : Pat<(frem Float32Regs:$x, fpimm:$y), 1457 (SELP_f32rr Float32Regs:$x, 1458 (FSUBf32rr_ftz Float32Regs:$x, (FMULf32ri_ftz (CVT_f32_f32 1459 (FDIV32ri_prec_ftz Float32Regs:$x, fpimm:$y), CvtRZI_FTZ), 1460 fpimm:$y)), 1461 (TESTINF_f32i fpimm:$y))>, 1462 Requires<[doF32FTZ, noUnsafeFPMath]>; 1463 1464// frem - f32 1465def : Pat<(frem Float32Regs:$x, Float32Regs:$y), 1466 (FSUBf32rr Float32Regs:$x, (FMULf32rr (CVT_f32_f32 1467 (FDIV32rr_prec Float32Regs:$x, Float32Regs:$y), CvtRZI), 1468 Float32Regs:$y))>, 1469 Requires<[allowUnsafeFPMath]>; 1470def : Pat<(frem Float32Regs:$x, fpimm:$y), 1471 (FSUBf32rr Float32Regs:$x, (FMULf32ri (CVT_f32_f32 1472 (FDIV32ri_prec Float32Regs:$x, fpimm:$y), CvtRZI), 1473 fpimm:$y))>, 1474 Requires<[allowUnsafeFPMath]>; 1475 1476def : Pat<(frem Float32Regs:$x, Float32Regs:$y), 1477 (SELP_f32rr Float32Regs:$x, 1478 (FSUBf32rr Float32Regs:$x, (FMULf32rr (CVT_f32_f32 1479 (FDIV32rr_prec Float32Regs:$x, Float32Regs:$y), CvtRZI), 1480 Float32Regs:$y)), 1481 (TESTINF_f32r Float32Regs:$y))>, 1482 Requires<[noUnsafeFPMath]>; 1483def : Pat<(frem Float32Regs:$x, fpimm:$y), 1484 (SELP_f32rr Float32Regs:$x, 1485 (FSUBf32rr Float32Regs:$x, (FMULf32ri (CVT_f32_f32 1486 (FDIV32ri_prec Float32Regs:$x, fpimm:$y), CvtRZI), 1487 fpimm:$y)), 1488 (TESTINF_f32i fpimm:$y))>, 1489 Requires<[noUnsafeFPMath]>; 1490 1491// frem - f64 1492def : Pat<(frem Float64Regs:$x, Float64Regs:$y), 1493 (FSUBf64rr Float64Regs:$x, (FMULf64rr (CVT_f64_f64 1494 (FDIV64rr Float64Regs:$x, Float64Regs:$y), CvtRZI), 1495 Float64Regs:$y))>, 1496 Requires<[allowUnsafeFPMath]>; 1497def : Pat<(frem Float64Regs:$x, fpimm:$y), 1498 (FSUBf64rr Float64Regs:$x, (FMULf64ri (CVT_f64_f64 1499 (FDIV64ri Float64Regs:$x, fpimm:$y), CvtRZI), 1500 fpimm:$y))>, 1501 Requires<[allowUnsafeFPMath]>; 1502 1503def : Pat<(frem Float64Regs:$x, Float64Regs:$y), 1504 (SELP_f64rr Float64Regs:$x, 1505 (FSUBf64rr Float64Regs:$x, (FMULf64rr (CVT_f64_f64 1506 (FDIV64rr Float64Regs:$x, Float64Regs:$y), CvtRZI), 1507 Float64Regs:$y)), 1508 (TESTINF_f64r Float64Regs:$y))>, 1509 Requires<[noUnsafeFPMath]>; 1510def : Pat<(frem Float64Regs:$x, fpimm:$y), 1511 (SELP_f64rr Float64Regs:$x, 1512 (FSUBf64rr Float64Regs:$x, (FMULf64ri (CVT_f64_f64 1513 (FDIV64ri Float64Regs:$x, fpimm:$y), CvtRZI), 1514 fpimm:$y)), 1515 (TESTINF_f64r Float64Regs:$y))>, 1516 Requires<[noUnsafeFPMath]>; 1517 1518//----------------------------------- 1519// Bitwise operations 1520//----------------------------------- 1521 1522// Template for three-arg bitwise operations. Takes three args, Creates .b16, 1523// .b32, .b64, and .pred (predicate registers -- i.e., i1) versions of OpcStr. 1524multiclass BITWISE<string OpcStr, SDNode OpNode> { 1525 def b1rr : 1526 NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b), 1527 !strconcat(OpcStr, ".pred \t$dst, $a, $b;"), 1528 [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>; 1529 def b1ri : 1530 NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b), 1531 !strconcat(OpcStr, ".pred \t$dst, $a, $b;"), 1532 [(set Int1Regs:$dst, (OpNode Int1Regs:$a, imm:$b))]>; 1533 def b16rr : 1534 NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b), 1535 !strconcat(OpcStr, ".b16 \t$dst, $a, $b;"), 1536 [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>; 1537 def b16ri : 1538 NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b), 1539 !strconcat(OpcStr, ".b16 \t$dst, $a, $b;"), 1540 [(set Int16Regs:$dst, (OpNode Int16Regs:$a, imm:$b))]>; 1541 def b32rr : 1542 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), 1543 !strconcat(OpcStr, ".b32 \t$dst, $a, $b;"), 1544 [(set Int32Regs:$dst, (OpNode (i32 Int32Regs:$a), (i32 Int32Regs:$b)))]>; 1545 def b32ri : 1546 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b), 1547 !strconcat(OpcStr, ".b32 \t$dst, $a, $b;"), 1548 [(set Int32Regs:$dst, (OpNode (i32 Int32Regs:$a), imm:$b))]>; 1549 def b64rr : 1550 NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b), 1551 !strconcat(OpcStr, ".b64 \t$dst, $a, $b;"), 1552 [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>; 1553 def b64ri : 1554 NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b), 1555 !strconcat(OpcStr, ".b64 \t$dst, $a, $b;"), 1556 [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>; 1557} 1558 1559defm OR : BITWISE<"or", or>; 1560defm AND : BITWISE<"and", and>; 1561defm XOR : BITWISE<"xor", xor>; 1562 1563// PTX does not support mul on predicates, convert to and instructions 1564def : Pat<(mul Int1Regs:$a, Int1Regs:$b), (ANDb1rr Int1Regs:$a, Int1Regs:$b)>; 1565def : Pat<(mul Int1Regs:$a, (i1 imm:$b)), (ANDb1ri Int1Regs:$a, imm:$b)>; 1566 1567// These transformations were once reliably performed by instcombine, but thanks 1568// to poison semantics they are no longer safe for LLVM IR, perform them here 1569// instead. 1570def : Pat<(select Int1Regs:$a, Int1Regs:$b, 0), (ANDb1rr Int1Regs:$a, Int1Regs:$b)>; 1571def : Pat<(select Int1Regs:$a, 1, Int1Regs:$b), (ORb1rr Int1Regs:$a, Int1Regs:$b)>; 1572 1573// Lower logical v2i16/v4i8 ops as bitwise ops on b32. 1574foreach vt = [v2i16, v4i8] in { 1575 def: Pat<(or (vt Int32Regs:$a), (vt Int32Regs:$b)), 1576 (ORb32rr Int32Regs:$a, Int32Regs:$b)>; 1577 def: Pat<(xor (vt Int32Regs:$a), (vt Int32Regs:$b)), 1578 (XORb32rr Int32Regs:$a, Int32Regs:$b)>; 1579 def: Pat<(and (vt Int32Regs:$a), (vt Int32Regs:$b)), 1580 (ANDb32rr Int32Regs:$a, Int32Regs:$b)>; 1581 1582 // The constants get legalized into a bitcast from i32, so that's what we need 1583 // to match here. 1584 def: Pat<(or Int32Regs:$a, (vt (bitconvert (i32 imm:$b)))), 1585 (ORb32ri Int32Regs:$a, imm:$b)>; 1586 def: Pat<(xor Int32Regs:$a, (vt (bitconvert (i32 imm:$b)))), 1587 (XORb32ri Int32Regs:$a, imm:$b)>; 1588 def: Pat<(and Int32Regs:$a, (vt (bitconvert (i32 imm:$b)))), 1589 (ANDb32ri Int32Regs:$a, imm:$b)>; 1590} 1591 1592def NOT1 : NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$src), 1593 "not.pred \t$dst, $src;", 1594 [(set Int1Regs:$dst, (not Int1Regs:$src))]>; 1595def NOT16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src), 1596 "not.b16 \t$dst, $src;", 1597 [(set Int16Regs:$dst, (not Int16Regs:$src))]>; 1598def NOT32 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src), 1599 "not.b32 \t$dst, $src;", 1600 [(set (i32 Int32Regs:$dst), (not (i32 Int32Regs:$src)))]>; 1601def NOT64 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src), 1602 "not.b64 \t$dst, $src;", 1603 [(set Int64Regs:$dst, (not Int64Regs:$src))]>; 1604 1605// Template for left/right shifts. Takes three operands, 1606// [dest (reg), src (reg), shift (reg or imm)]. 1607// dest and src may be int64, int32, or int16, but shift is always int32. 1608// 1609// This template also defines a 32-bit shift (imm, imm) instruction. 1610multiclass SHIFT<string OpcStr, SDNode OpNode> { 1611 def i64rr : 1612 NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int32Regs:$b), 1613 !strconcat(OpcStr, "64 \t$dst, $a, $b;"), 1614 [(set Int64Regs:$dst, (OpNode Int64Regs:$a, (i32 Int32Regs:$b)))]>; 1615 def i64ri : 1616 NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i32imm:$b), 1617 !strconcat(OpcStr, "64 \t$dst, $a, $b;"), 1618 [(set Int64Regs:$dst, (OpNode Int64Regs:$a, (i32 imm:$b)))]>; 1619 def i32rr : 1620 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), 1621 !strconcat(OpcStr, "32 \t$dst, $a, $b;"), 1622 [(set Int32Regs:$dst, (OpNode (i32 Int32Regs:$a), (i32 Int32Regs:$b)))]>; 1623 def i32ri : 1624 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b), 1625 !strconcat(OpcStr, "32 \t$dst, $a, $b;"), 1626 [(set Int32Regs:$dst, (OpNode (i32 Int32Regs:$a), (i32 imm:$b)))]>; 1627 def i32ii : 1628 NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$a, i32imm:$b), 1629 !strconcat(OpcStr, "32 \t$dst, $a, $b;"), 1630 [(set Int32Regs:$dst, (OpNode (i32 imm:$a), (i32 imm:$b)))]>; 1631 def i16rr : 1632 NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int32Regs:$b), 1633 !strconcat(OpcStr, "16 \t$dst, $a, $b;"), 1634 [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (i32 Int32Regs:$b)))]>; 1635 def i16ri : 1636 NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i32imm:$b), 1637 !strconcat(OpcStr, "16 \t$dst, $a, $b;"), 1638 [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (i32 imm:$b)))]>; 1639} 1640 1641defm SHL : SHIFT<"shl.b", shl>; 1642defm SRA : SHIFT<"shr.s", sra>; 1643defm SRL : SHIFT<"shr.u", srl>; 1644 1645// Bit-reverse 1646def BREV32 : 1647 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a), 1648 "brev.b32 \t$dst, $a;", 1649 [(set Int32Regs:$dst, (bitreverse (i32 Int32Regs:$a)))]>; 1650def BREV64 : 1651 NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a), 1652 "brev.b64 \t$dst, $a;", 1653 [(set Int64Regs:$dst, (bitreverse Int64Regs:$a))]>; 1654 1655// 1656// Rotate: Use ptx shf instruction if available. 1657// 1658 1659// 32 bit r2 = rotl r1, n 1660// => 1661// r2 = shf.l r1, r1, n 1662def ROTL32imm_hw : 1663 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt), 1664 "shf.l.wrap.b32 \t$dst, $src, $src, $amt;", 1665 [(set Int32Regs:$dst, (rotl (i32 Int32Regs:$src), (i32 imm:$amt)))]>, 1666 Requires<[hasHWROT32]>; 1667 1668def ROTL32reg_hw : 1669 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt), 1670 "shf.l.wrap.b32 \t$dst, $src, $src, $amt;", 1671 [(set Int32Regs:$dst, (rotl (i32 Int32Regs:$src), (i32 Int32Regs:$amt)))]>, 1672 Requires<[hasHWROT32]>; 1673 1674// 32 bit r2 = rotr r1, n 1675// => 1676// r2 = shf.r r1, r1, n 1677def ROTR32imm_hw : 1678 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt), 1679 "shf.r.wrap.b32 \t$dst, $src, $src, $amt;", 1680 [(set Int32Regs:$dst, (rotr (i32 Int32Regs:$src), (i32 imm:$amt)))]>, 1681 Requires<[hasHWROT32]>; 1682 1683def ROTR32reg_hw : 1684 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt), 1685 "shf.r.wrap.b32 \t$dst, $src, $src, $amt;", 1686 [(set Int32Regs:$dst, (rotr (i32 Int32Regs:$src), (i32 Int32Regs:$amt)))]>, 1687 Requires<[hasHWROT32]>; 1688 1689// 32-bit software rotate by immediate. $amt2 should equal 32 - $amt1. 1690def ROT32imm_sw : 1691 NVPTXInst<(outs Int32Regs:$dst), 1692 (ins Int32Regs:$src, i32imm:$amt1, i32imm:$amt2), 1693 "{{\n\t" 1694 ".reg .b32 %lhs;\n\t" 1695 ".reg .b32 %rhs;\n\t" 1696 "shl.b32 \t%lhs, $src, $amt1;\n\t" 1697 "shr.b32 \t%rhs, $src, $amt2;\n\t" 1698 "add.u32 \t$dst, %lhs, %rhs;\n\t" 1699 "}}", 1700 []>; 1701 1702def SUB_FRM_32 : SDNodeXForm<imm, [{ 1703 return CurDAG->getTargetConstant(32 - N->getZExtValue(), SDLoc(N), MVT::i32); 1704}]>; 1705 1706def : Pat<(rotl (i32 Int32Regs:$src), (i32 imm:$amt)), 1707 (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>, 1708 Requires<[noHWROT32]>; 1709def : Pat<(rotr (i32 Int32Regs:$src), (i32 imm:$amt)), 1710 (ROT32imm_sw Int32Regs:$src, (SUB_FRM_32 node:$amt), imm:$amt)>, 1711 Requires<[noHWROT32]>; 1712 1713// 32-bit software rotate left by register. 1714def ROTL32reg_sw : 1715 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt), 1716 "{{\n\t" 1717 ".reg .b32 %lhs;\n\t" 1718 ".reg .b32 %rhs;\n\t" 1719 ".reg .b32 %amt2;\n\t" 1720 "shl.b32 \t%lhs, $src, $amt;\n\t" 1721 "sub.s32 \t%amt2, 32, $amt;\n\t" 1722 "shr.b32 \t%rhs, $src, %amt2;\n\t" 1723 "add.u32 \t$dst, %lhs, %rhs;\n\t" 1724 "}}", 1725 [(set Int32Regs:$dst, (rotl (i32 Int32Regs:$src), (i32 Int32Regs:$amt)))]>, 1726 Requires<[noHWROT32]>; 1727 1728// 32-bit software rotate right by register. 1729def ROTR32reg_sw : 1730 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt), 1731 "{{\n\t" 1732 ".reg .b32 %lhs;\n\t" 1733 ".reg .b32 %rhs;\n\t" 1734 ".reg .b32 %amt2;\n\t" 1735 "shr.b32 \t%lhs, $src, $amt;\n\t" 1736 "sub.s32 \t%amt2, 32, $amt;\n\t" 1737 "shl.b32 \t%rhs, $src, %amt2;\n\t" 1738 "add.u32 \t$dst, %lhs, %rhs;\n\t" 1739 "}}", 1740 [(set Int32Regs:$dst, (rotr (i32 Int32Regs:$src), (i32 Int32Regs:$amt)))]>, 1741 Requires<[noHWROT32]>; 1742 1743// 64-bit software rotate by immediate. $amt2 should equal 64 - $amt1. 1744def ROT64imm_sw : 1745 NVPTXInst<(outs Int64Regs:$dst), 1746 (ins Int64Regs:$src, i32imm:$amt1, i32imm:$amt2), 1747 "{{\n\t" 1748 ".reg .b64 %lhs;\n\t" 1749 ".reg .b64 %rhs;\n\t" 1750 "shl.b64 \t%lhs, $src, $amt1;\n\t" 1751 "shr.b64 \t%rhs, $src, $amt2;\n\t" 1752 "add.u64 \t$dst, %lhs, %rhs;\n\t" 1753 "}}", 1754 []>; 1755 1756def SUB_FRM_64 : SDNodeXForm<imm, [{ 1757 return CurDAG->getTargetConstant(64-N->getZExtValue(), SDLoc(N), MVT::i32); 1758}]>; 1759 1760def : Pat<(rotl Int64Regs:$src, (i32 imm:$amt)), 1761 (ROT64imm_sw Int64Regs:$src, imm:$amt, (SUB_FRM_64 node:$amt))>; 1762def : Pat<(rotr Int64Regs:$src, (i32 imm:$amt)), 1763 (ROT64imm_sw Int64Regs:$src, (SUB_FRM_64 node:$amt), imm:$amt)>; 1764 1765// 64-bit software rotate left by register. 1766def ROTL64reg_sw : 1767 NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt), 1768 "{{\n\t" 1769 ".reg .b64 %lhs;\n\t" 1770 ".reg .b64 %rhs;\n\t" 1771 ".reg .u32 %amt2;\n\t" 1772 "and.b32 \t%amt2, $amt, 63;\n\t" 1773 "shl.b64 \t%lhs, $src, %amt2;\n\t" 1774 "sub.u32 \t%amt2, 64, %amt2;\n\t" 1775 "shr.b64 \t%rhs, $src, %amt2;\n\t" 1776 "add.u64 \t$dst, %lhs, %rhs;\n\t" 1777 "}}", 1778 [(set Int64Regs:$dst, (rotl Int64Regs:$src, (i32 Int32Regs:$amt)))]>; 1779 1780def ROTR64reg_sw : 1781 NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt), 1782 "{{\n\t" 1783 ".reg .b64 %lhs;\n\t" 1784 ".reg .b64 %rhs;\n\t" 1785 ".reg .u32 %amt2;\n\t" 1786 "and.b32 \t%amt2, $amt, 63;\n\t" 1787 "shr.b64 \t%lhs, $src, %amt2;\n\t" 1788 "sub.u32 \t%amt2, 64, %amt2;\n\t" 1789 "shl.b64 \t%rhs, $src, %amt2;\n\t" 1790 "add.u64 \t$dst, %lhs, %rhs;\n\t" 1791 "}}", 1792 [(set Int64Regs:$dst, (rotr Int64Regs:$src, (i32 Int32Regs:$amt)))]>; 1793 1794// 1795// Funnnel shift in clamp mode 1796// 1797 1798// Create SDNodes so they can be used in the DAG code, e.g. 1799// NVPTXISelLowering (LowerShiftLeftParts and LowerShiftRightParts) 1800def FUN_SHFL_CLAMP : SDNode<"NVPTXISD::FUN_SHFL_CLAMP", SDTIntShiftDOp, []>; 1801def FUN_SHFR_CLAMP : SDNode<"NVPTXISD::FUN_SHFR_CLAMP", SDTIntShiftDOp, []>; 1802 1803def FUNSHFLCLAMP : 1804 NVPTXInst<(outs Int32Regs:$dst), 1805 (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt), 1806 "shf.l.clamp.b32 \t$dst, $lo, $hi, $amt;", 1807 [(set Int32Regs:$dst, 1808 (FUN_SHFL_CLAMP (i32 Int32Regs:$lo), (i32 Int32Regs:$hi), (i32 Int32Regs:$amt)))]>; 1809 1810def FUNSHFRCLAMP : 1811 NVPTXInst<(outs Int32Regs:$dst), 1812 (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt), 1813 "shf.r.clamp.b32 \t$dst, $lo, $hi, $amt;", 1814 [(set Int32Regs:$dst, 1815 (FUN_SHFR_CLAMP (i32 Int32Regs:$lo), (i32 Int32Regs:$hi), (i32 Int32Regs:$amt)))]>; 1816 1817// 1818// BFE - bit-field extract 1819// 1820 1821// Template for BFE/BFI instructions. 1822// Args: [dest (reg), src (reg), start (reg or imm), end (reg or imm)]. 1823// Start may be an imm only if end is also an imm. FIXME: Is this a 1824// restriction in PTX? 1825// 1826// dest and src may be int32 or int64, but start and end are always int32. 1827def SDTBFE : 1828 SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisInt<0>, 1829 SDTCisVT<2, i32>, SDTCisVT<3, i32>]>; 1830def bfe : SDNode<"NVPTXISD::BFE", SDTBFE>; 1831 1832def SDTBFI : 1833 SDTypeProfile<1, 4, [SDTCisInt<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, 1834 SDTCisVT<3, i32>, SDTCisVT<4, i32>]>; 1835def bfi : SDNode<"NVPTXISD::BFI", SDTBFI>; 1836 1837def SDTPRMT : 1838 SDTypeProfile<1, 4, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, 1839 SDTCisVT<2, i32>, SDTCisVT<3, i32>, SDTCisVT<4, i32>,]>; 1840def prmt : SDNode<"NVPTXISD::PRMT", SDTPRMT>; 1841 1842multiclass BFE<string Instr, ValueType T, RegisterClass RC> { 1843 def rrr 1844 : NVPTXInst<(outs RC:$d), 1845 (ins RC:$a, Int32Regs:$b, Int32Regs:$c), 1846 !strconcat(Instr, " \t$d, $a, $b, $c;"), 1847 [(set (T RC:$d), (bfe (T RC:$a), (i32 Int32Regs:$b), (i32 Int32Regs:$c)))]>; 1848 def rri 1849 : NVPTXInst<(outs RC:$d), 1850 (ins RC:$a, Int32Regs:$b, i32imm:$c), 1851 !strconcat(Instr, " \t$d, $a, $b, $c;"), 1852 [(set (T RC:$d), (bfe (T RC:$a), (i32 Int32Regs:$b), (i32 imm:$c)))]>; 1853 def rii 1854 : NVPTXInst<(outs RC:$d), 1855 (ins RC:$a, i32imm:$b, i32imm:$c), 1856 !strconcat(Instr, " \t$d, $a, $b, $c;"), 1857 [(set (T RC:$d), (bfe (T RC:$a), (i32 imm:$b), (i32 imm:$c)))]>; 1858} 1859 1860multiclass BFI<string Instr, ValueType T, RegisterClass RC, Operand ImmCls> { 1861 def rrrr 1862 : NVPTXInst<(outs RC:$f), 1863 (ins RC:$a, RC:$b, Int32Regs:$c, Int32Regs:$d), 1864 !strconcat(Instr, " \t$f, $a, $b, $c, $d;"), 1865 [(set (T RC:$f), (bfi (T RC:$a), (T RC:$b), (i32 Int32Regs:$c), (i32 Int32Regs:$d)))]>; 1866 def rrri 1867 : NVPTXInst<(outs RC:$f), 1868 (ins RC:$a, RC:$b, Int32Regs:$c, i32imm:$d), 1869 !strconcat(Instr, " \t$f, $a, $b, $c, $d;"), 1870 [(set (T RC:$f), (bfi (T RC:$a), (T RC:$b), (i32 Int32Regs:$c), (i32 imm:$d)))]>; 1871 def rrii 1872 : NVPTXInst<(outs RC:$f), 1873 (ins RC:$a, RC:$b, i32imm:$c, i32imm:$d), 1874 !strconcat(Instr, " \t$f, $a, $b, $c, $d;"), 1875 [(set (T RC:$f), (bfi (T RC:$a), (T RC:$b), (i32 imm:$c), (i32 imm:$d)))]>; 1876 def irrr 1877 : NVPTXInst<(outs RC:$f), 1878 (ins ImmCls:$a, RC:$b, Int32Regs:$c, Int32Regs:$d), 1879 !strconcat(Instr, " \t$f, $a, $b, $c, $d;"), 1880 [(set (T RC:$f), (bfi (T imm:$a), (T RC:$b), (i32 Int32Regs:$c), (i32 Int32Regs:$d)))]>; 1881 def irri 1882 : NVPTXInst<(outs RC:$f), 1883 (ins ImmCls:$a, RC:$b, Int32Regs:$c, i32imm:$d), 1884 !strconcat(Instr, " \t$f, $a, $b, $c, $d;"), 1885 [(set (T RC:$f), (bfi (T imm:$a), (T RC:$b), (i32 Int32Regs:$c), (i32 imm:$d)))]>; 1886 def irii 1887 : NVPTXInst<(outs RC:$f), 1888 (ins ImmCls:$a, RC:$b, i32imm:$c, i32imm:$d), 1889 !strconcat(Instr, " \t$f, $a, $b, $c, $d;"), 1890 [(set (T RC:$f), (bfi (T imm:$a), (T RC:$b), (i32 imm:$c), (i32 imm:$d)))]>; 1891} 1892 1893multiclass PRMT<ValueType T, RegisterClass RC> { 1894 def rrr 1895 : NVPTXInst<(outs RC:$d), 1896 (ins RC:$a, Int32Regs:$b, Int32Regs:$c, PrmtMode:$mode), 1897 !strconcat("prmt.b32${mode}", " \t$d, $a, $b, $c;"), 1898 [(set (T RC:$d), (prmt (T RC:$a), (T RC:$b), (i32 Int32Regs:$c), imm:$mode))]>; 1899 def rri 1900 : NVPTXInst<(outs RC:$d), 1901 (ins RC:$a, Int32Regs:$b, i32imm:$c, PrmtMode:$mode), 1902 !strconcat("prmt.b32${mode}", " \t$d, $a, $b, $c;"), 1903 [(set (T RC:$d), (prmt (T RC:$a), (T RC:$b), (i32 imm:$c), imm:$mode))]>; 1904 def rii 1905 : NVPTXInst<(outs RC:$d), 1906 (ins RC:$a, i32imm:$b, i32imm:$c, PrmtMode:$mode), 1907 !strconcat("prmt.b32${mode}", " \t$d, $a, $b, $c;"), 1908 [(set (T RC:$d), (prmt (T RC:$a), (T imm:$b), (i32 imm:$c), imm:$mode))]>; 1909} 1910 1911let hasSideEffects = false in { 1912 // order is somewhat important here. signed/unsigned variants match 1913 // the same patterns, so the first one wins. Having unsigned byte extraction 1914 // has the benefit of always having zero in unused bits, which makes some 1915 // optimizations easier (e.g. no need to mask them). 1916 defm BFE_U32 : BFE<"bfe.u32", i32, Int32Regs>; 1917 defm BFE_S32 : BFE<"bfe.s32", i32, Int32Regs>; 1918 defm BFE_U64 : BFE<"bfe.u64", i64, Int64Regs>; 1919 defm BFE_S64 : BFE<"bfe.s64", i64, Int64Regs>; 1920 1921 defm BFI_B32 : BFI<"bfi.b32", i32, Int32Regs, i32imm>; 1922 defm BFI_B64 : BFI<"bfi.b64", i64, Int64Regs, i64imm>; 1923 1924 defm PRMT_B32 : PRMT<i32, Int32Regs>; 1925} 1926 1927 1928// byte extraction + signed/unsigned extension to i32. 1929def : Pat<(i32 (sext_inreg (bfe (i32 Int32Regs:$s), (i32 Int32Regs:$o), 8), i8)), 1930 (BFE_S32rri Int32Regs:$s, Int32Regs:$o, 8)>; 1931def : Pat<(i32 (sext_inreg (bfe (i32 Int32Regs:$s), (i32 imm:$o), 8), i8)), 1932 (BFE_S32rii Int32Regs:$s, imm:$o, 8)>; 1933def : Pat<(i32 (and (bfe (i32 Int32Regs:$s), (i32 Int32Regs:$o), 8), 255)), 1934 (BFE_U32rri Int32Regs:$s, Int32Regs:$o, 8)>; 1935def : Pat<(i32 (and (bfe (i32 Int32Regs:$s), (i32 imm:$o), 8), 255)), 1936 (BFE_U32rii Int32Regs:$s, imm:$o, 8)>; 1937 1938// byte extraction + signed extension to i16 1939def : Pat<(i16 (sext_inreg (trunc (bfe (i32 Int32Regs:$s), (i32 imm:$o), 8)), i8)), 1940 (CVT_s8_s32 (BFE_S32rii Int32Regs:$s, imm:$o, 8), CvtNONE)>; 1941 1942 1943// Byte extraction via shift/trunc/sext 1944def : Pat<(i16 (sext_inreg (trunc Int32Regs:$s), i8)), 1945 (CVT_s8_s32 Int32Regs:$s, CvtNONE)>; 1946def : Pat<(i16 (sext_inreg (trunc (srl (i32 Int32Regs:$s), (i32 imm:$o))), i8)), 1947 (CVT_s8_s32 (BFE_S32rii Int32Regs:$s, imm:$o, 8), CvtNONE)>; 1948def : Pat<(sext_inreg (srl (i32 Int32Regs:$s), (i32 imm:$o)), i8), 1949 (BFE_S32rii Int32Regs:$s, imm:$o, 8)>; 1950def : Pat<(i16 (sra (i16 (trunc Int32Regs:$s)), (i32 8))), 1951 (CVT_s8_s32 (BFE_S32rii Int32Regs:$s, 8, 8), CvtNONE)>; 1952def : Pat<(sext_inreg (srl (i64 Int64Regs:$s), (i32 imm:$o)), i8), 1953 (BFE_S64rii Int64Regs:$s, imm:$o, 8)>; 1954def : Pat<(i16 (sext_inreg (trunc Int64Regs:$s), i8)), 1955 (CVT_s8_s64 Int64Regs:$s, CvtNONE)>; 1956def : Pat<(i16 (sext_inreg (trunc (srl (i64 Int64Regs:$s), (i32 imm:$o))), i8)), 1957 (CVT_s8_s64 (BFE_S64rii Int64Regs:$s, imm:$o, 8), CvtNONE)>; 1958 1959//----------------------------------- 1960// Comparison instructions (setp, set) 1961//----------------------------------- 1962 1963// FIXME: This doesn't cover versions of set and setp that combine with a 1964// boolean predicate, e.g. setp.eq.and.b16. 1965 1966let hasSideEffects = false in { 1967 multiclass SETP<string TypeStr, RegisterClass RC, Operand ImmCls> { 1968 def rr : 1969 NVPTXInst<(outs Int1Regs:$dst), (ins RC:$a, RC:$b, CmpMode:$cmp), 1970 !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr, 1971 " \t$dst, $a, $b;"), []>; 1972 def ri : 1973 NVPTXInst<(outs Int1Regs:$dst), (ins RC:$a, ImmCls:$b, CmpMode:$cmp), 1974 !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr, 1975 " \t$dst, $a, $b;"), []>; 1976 def ir : 1977 NVPTXInst<(outs Int1Regs:$dst), (ins ImmCls:$a, RC:$b, CmpMode:$cmp), 1978 !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr, 1979 " \t$dst, $a, $b;"), []>; 1980 } 1981} 1982 1983defm SETP_b16 : SETP<"b16", Int16Regs, i16imm>; 1984defm SETP_s16 : SETP<"s16", Int16Regs, i16imm>; 1985defm SETP_u16 : SETP<"u16", Int16Regs, i16imm>; 1986defm SETP_b32 : SETP<"b32", Int32Regs, i32imm>; 1987defm SETP_s32 : SETP<"s32", Int32Regs, i32imm>; 1988defm SETP_u32 : SETP<"u32", Int32Regs, i32imm>; 1989defm SETP_b64 : SETP<"b64", Int64Regs, i64imm>; 1990defm SETP_s64 : SETP<"s64", Int64Regs, i64imm>; 1991defm SETP_u64 : SETP<"u64", Int64Regs, i64imm>; 1992defm SETP_f32 : SETP<"f32", Float32Regs, f32imm>; 1993defm SETP_f64 : SETP<"f64", Float64Regs, f64imm>; 1994def SETP_f16rr : 1995 NVPTXInst<(outs Int1Regs:$dst), 1996 (ins Int16Regs:$a, Int16Regs:$b, CmpMode:$cmp), 1997 "setp${cmp:base}${cmp:ftz}.f16 \t$dst, $a, $b;", 1998 []>, Requires<[useFP16Math]>; 1999 2000def SETP_f16x2rr : 2001 NVPTXInst<(outs Int1Regs:$p, Int1Regs:$q), 2002 (ins Int32Regs:$a, Int32Regs:$b, CmpMode:$cmp), 2003 "setp${cmp:base}${cmp:ftz}.f16x2 \t$p|$q, $a, $b;", 2004 []>, 2005 Requires<[useFP16Math]>; 2006def SETP_bf16rr : 2007 NVPTXInst<(outs Int1Regs:$dst), 2008 (ins Int16Regs:$a, Int16Regs:$b, CmpMode:$cmp), 2009 "setp${cmp:base}${cmp:ftz}.bf16 \t$dst, $a, $b;", 2010 []>, Requires<[hasBF16Math, hasPTX<78>, hasSM<90>]>; 2011 2012def SETP_bf16x2rr : 2013 NVPTXInst<(outs Int1Regs:$p, Int1Regs:$q), 2014 (ins Int32Regs:$a, Int32Regs:$b, CmpMode:$cmp), 2015 "setp${cmp:base}${cmp:ftz}.bf16x2 \t$p|$q, $a, $b;", 2016 []>, 2017 Requires<[hasBF16Math, hasPTX<78>, hasSM<90>]>; 2018 2019 2020// FIXME: This doesn't appear to be correct. The "set" mnemonic has the form 2021// "set.CmpOp{.ftz}.dtype.stype", where dtype is the type of the destination 2022// reg, either u32, s32, or f32. Anyway these aren't used at the moment. 2023 2024let hasSideEffects = false in { 2025 multiclass SET<string TypeStr, RegisterClass RC, Operand ImmCls> { 2026 def rr : NVPTXInst<(outs Int32Regs:$dst), 2027 (ins RC:$a, RC:$b, CmpMode:$cmp), 2028 !strconcat("set$cmp.", TypeStr, " \t$dst, $a, $b;"), []>; 2029 def ri : NVPTXInst<(outs Int32Regs:$dst), 2030 (ins RC:$a, ImmCls:$b, CmpMode:$cmp), 2031 !strconcat("set$cmp.", TypeStr, " \t$dst, $a, $b;"), []>; 2032 def ir : NVPTXInst<(outs Int32Regs:$dst), 2033 (ins ImmCls:$a, RC:$b, CmpMode:$cmp), 2034 !strconcat("set$cmp.", TypeStr, " \t$dst, $a, $b;"), []>; 2035 } 2036} 2037 2038defm SET_b16 : SET<"b16", Int16Regs, i16imm>; 2039defm SET_s16 : SET<"s16", Int16Regs, i16imm>; 2040defm SET_u16 : SET<"u16", Int16Regs, i16imm>; 2041defm SET_b32 : SET<"b32", Int32Regs, i32imm>; 2042defm SET_s32 : SET<"s32", Int32Regs, i32imm>; 2043defm SET_u32 : SET<"u32", Int32Regs, i32imm>; 2044defm SET_b64 : SET<"b64", Int64Regs, i64imm>; 2045defm SET_s64 : SET<"s64", Int64Regs, i64imm>; 2046defm SET_u64 : SET<"u64", Int64Regs, i64imm>; 2047defm SET_f16 : SET<"f16", Int16Regs, f16imm>; 2048defm SET_bf16 : SET<"bf16", Int16Regs, bf16imm>, Requires<[hasPTX<78>, hasSM<90>]>; 2049defm SET_f32 : SET<"f32", Float32Regs, f32imm>; 2050defm SET_f64 : SET<"f64", Float64Regs, f64imm>; 2051 2052//----------------------------------- 2053// Data Movement (Load / Store, Move) 2054//----------------------------------- 2055 2056def ADDRri : ComplexPattern<i32, 2, "SelectADDRri", [frameindex], 2057 [SDNPWantRoot]>; 2058def ADDRri64 : ComplexPattern<i64, 2, "SelectADDRri64", [frameindex], 2059 [SDNPWantRoot]>; 2060def ADDRvar : ComplexPattern<iPTR, 1, "SelectDirectAddr", [], []>; 2061 2062def MEMri : Operand<i32> { 2063 let PrintMethod = "printMemOperand"; 2064 let MIOperandInfo = (ops Int32Regs, i32imm); 2065} 2066def MEMri64 : Operand<i64> { 2067 let PrintMethod = "printMemOperand"; 2068 let MIOperandInfo = (ops Int64Regs, i64imm); 2069} 2070 2071def imem : Operand<iPTR> { 2072 let PrintMethod = "printOperand"; 2073} 2074 2075def imemAny : Operand<iPTRAny> { 2076 let PrintMethod = "printOperand"; 2077} 2078 2079def LdStCode : Operand<i32> { 2080 let PrintMethod = "printLdStCode"; 2081} 2082 2083def MmaCode : Operand<i32> { 2084 let PrintMethod = "printMmaCode"; 2085} 2086 2087def SDTWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>; 2088def Wrapper : SDNode<"NVPTXISD::Wrapper", SDTWrapper>; 2089 2090// Load a memory address into a u32 or u64 register. 2091def MOV_ADDR : NVPTXInst<(outs Int32Regs:$dst), (ins imem:$a), 2092 "mov.u32 \t$dst, $a;", 2093 [(set Int32Regs:$dst, (Wrapper tglobaladdr:$a))]>; 2094def MOV_ADDR64 : NVPTXInst<(outs Int64Regs:$dst), (ins imem:$a), 2095 "mov.u64 \t$dst, $a;", 2096 [(set Int64Regs:$dst, (Wrapper tglobaladdr:$a))]>; 2097 2098// Get pointer to local stack. 2099let hasSideEffects = false in { 2100 def MOV_DEPOT_ADDR : NVPTXInst<(outs Int32Regs:$d), (ins i32imm:$num), 2101 "mov.u32 \t$d, __local_depot$num;", []>; 2102 def MOV_DEPOT_ADDR_64 : NVPTXInst<(outs Int64Regs:$d), (ins i32imm:$num), 2103 "mov.u64 \t$d, __local_depot$num;", []>; 2104} 2105 2106 2107// copyPhysreg is hard-coded in NVPTXInstrInfo.cpp 2108let IsSimpleMove=1, hasSideEffects=0 in { 2109 def IMOV1rr : NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$sss), 2110 "mov.pred \t$dst, $sss;", []>; 2111 def IMOV16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$sss), 2112 "mov.u16 \t$dst, $sss;", []>; 2113 def IMOV32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$sss), 2114 "mov.u32 \t$dst, $sss;", []>; 2115 def IMOV64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$sss), 2116 "mov.u64 \t$dst, $sss;", []>; 2117 def IMOV128rr : NVPTXInst<(outs Int128Regs:$dst), (ins Int128Regs:$sss), 2118 "mov.b128 \t$dst, $sss;", []>; 2119 2120 def IMOVB16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$sss), 2121 "mov.b16 \t$dst, $sss;", []>; 2122 def IMOVB32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$sss), 2123 "mov.b32 \t$dst, $sss;", []>; 2124 def IMOVB64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$sss), 2125 "mov.b64 \t$dst, $sss;", []>; 2126 2127 def FMOV16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src), 2128 // We have to use .b16 here as there's no mov.f16. 2129 "mov.b16 \t$dst, $src;", []>; 2130 def FMOV32rr : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src), 2131 "mov.f32 \t$dst, $src;", []>; 2132 def FMOV64rr : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$src), 2133 "mov.f64 \t$dst, $src;", []>; 2134} 2135 2136def IMOV1ri : NVPTXInst<(outs Int1Regs:$dst), (ins i1imm:$src), 2137 "mov.pred \t$dst, $src;", 2138 [(set Int1Regs:$dst, imm:$src)]>; 2139def IMOV16ri : NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src), 2140 "mov.u16 \t$dst, $src;", 2141 [(set Int16Regs:$dst, imm:$src)]>; 2142def IMOV32ri : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src), 2143 "mov.u32 \t$dst, $src;", 2144 [(set (i32 Int32Regs:$dst), imm:$src)]>; 2145def IMOV64ri : NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src), 2146 "mov.u64 \t$dst, $src;", 2147 [(set Int64Regs:$dst, imm:$src)]>; 2148 2149def IMOVB16ri : NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src), 2150 "mov.b16 \t$dst, $src;", []>; 2151def IMOVB32ri : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src), 2152 "mov.b32 \t$dst, $src;", []>; 2153def IMOVB64ri : NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src), 2154 "mov.b64 \t$dst, $src;", []>; 2155 2156def FMOV32ri : NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$src), 2157 "mov.f32 \t$dst, $src;", 2158 [(set Float32Regs:$dst, fpimm:$src)]>; 2159def FMOV64ri : NVPTXInst<(outs Float64Regs:$dst), (ins f64imm:$src), 2160 "mov.f64 \t$dst, $src;", 2161 [(set Float64Regs:$dst, fpimm:$src)]>; 2162 2163def : Pat<(i32 (Wrapper texternalsym:$dst)), (IMOV32ri texternalsym:$dst)>; 2164def : Pat<(i64 (Wrapper texternalsym:$dst)), (IMOV64ri texternalsym:$dst)>; 2165 2166//---- Copy Frame Index ---- 2167def LEA_ADDRi : NVPTXInst<(outs Int32Regs:$dst), (ins MEMri:$addr), 2168 "add.u32 \t$dst, ${addr:add};", 2169 [(set Int32Regs:$dst, ADDRri:$addr)]>; 2170def LEA_ADDRi64 : NVPTXInst<(outs Int64Regs:$dst), (ins MEMri64:$addr), 2171 "add.u64 \t$dst, ${addr:add};", 2172 [(set Int64Regs:$dst, ADDRri64:$addr)]>; 2173 2174//----------------------------------- 2175// Comparison and Selection 2176//----------------------------------- 2177 2178multiclass ISET_FORMAT<PatFrag OpNode, PatLeaf Mode, 2179 Instruction setp_16rr, 2180 Instruction setp_16ri, 2181 Instruction setp_16ir, 2182 Instruction setp_32rr, 2183 Instruction setp_32ri, 2184 Instruction setp_32ir, 2185 Instruction setp_64rr, 2186 Instruction setp_64ri, 2187 Instruction setp_64ir, 2188 Instruction set_16rr, 2189 Instruction set_16ri, 2190 Instruction set_16ir, 2191 Instruction set_32rr, 2192 Instruction set_32ri, 2193 Instruction set_32ir, 2194 Instruction set_64rr, 2195 Instruction set_64ri, 2196 Instruction set_64ir> { 2197 // i16 -> pred 2198 def : Pat<(i1 (OpNode i16:$a, i16:$b)), 2199 (setp_16rr Int16Regs:$a, Int16Regs:$b, Mode)>; 2200 def : Pat<(i1 (OpNode Int16Regs:$a, imm:$b)), 2201 (setp_16ri Int16Regs:$a, imm:$b, Mode)>; 2202 def : Pat<(i1 (OpNode imm:$a, Int16Regs:$b)), 2203 (setp_16ir imm:$a, Int16Regs:$b, Mode)>; 2204 // i32 -> pred 2205 def : Pat<(i1 (OpNode i32:$a, i32:$b)), 2206 (setp_32rr Int32Regs:$a, Int32Regs:$b, Mode)>; 2207 def : Pat<(i1 (OpNode (i32 Int32Regs:$a), imm:$b)), 2208 (setp_32ri Int32Regs:$a, imm:$b, Mode)>; 2209 def : Pat<(i1 (OpNode imm:$a, (i32 Int32Regs:$b))), 2210 (setp_32ir imm:$a, Int32Regs:$b, Mode)>; 2211 // i64 -> pred 2212 def : Pat<(i1 (OpNode Int64Regs:$a, Int64Regs:$b)), 2213 (setp_64rr Int64Regs:$a, Int64Regs:$b, Mode)>; 2214 def : Pat<(i1 (OpNode Int64Regs:$a, imm:$b)), 2215 (setp_64ri Int64Regs:$a, imm:$b, Mode)>; 2216 def : Pat<(i1 (OpNode imm:$a, Int64Regs:$b)), 2217 (setp_64ir imm:$a, Int64Regs:$b, Mode)>; 2218 2219 // i16 -> i32 2220 def : Pat<(i32 (OpNode i16:$a, i16:$b)), 2221 (set_16rr Int16Regs:$a, Int16Regs:$b, Mode)>; 2222 def : Pat<(i32 (OpNode Int16Regs:$a, imm:$b)), 2223 (set_16ri Int16Regs:$a, imm:$b, Mode)>; 2224 def : Pat<(i32 (OpNode imm:$a, Int16Regs:$b)), 2225 (set_16ir imm:$a, Int16Regs:$b, Mode)>; 2226 // i32 -> i32 2227 def : Pat<(i32 (OpNode i32:$a, i32:$b)), 2228 (set_32rr Int32Regs:$a, Int32Regs:$b, Mode)>; 2229 def : Pat<(i32 (OpNode (i32 Int32Regs:$a), imm:$b)), 2230 (set_32ri Int32Regs:$a, imm:$b, Mode)>; 2231 def : Pat<(i32 (OpNode imm:$a, (i32 Int32Regs:$b))), 2232 (set_32ir imm:$a, Int32Regs:$b, Mode)>; 2233 // i64 -> i32 2234 def : Pat<(i32 (OpNode Int64Regs:$a, Int64Regs:$b)), 2235 (set_64rr Int64Regs:$a, Int64Regs:$b, Mode)>; 2236 def : Pat<(i32 (OpNode Int64Regs:$a, imm:$b)), 2237 (set_64ri Int64Regs:$a, imm:$b, Mode)>; 2238 def : Pat<(i32 (OpNode imm:$a, Int64Regs:$b)), 2239 (set_64ir imm:$a, Int64Regs:$b, Mode)>; 2240} 2241 2242multiclass ISET_FORMAT_SIGNED<PatFrag OpNode, PatLeaf Mode> 2243 : ISET_FORMAT<OpNode, Mode, 2244 SETP_s16rr, SETP_s16ri, SETP_s16ir, 2245 SETP_s32rr, SETP_s32ri, SETP_s32ir, 2246 SETP_s64rr, SETP_s64ri, SETP_s64ir, 2247 SET_s16rr, SET_s16ri, SET_s16ir, 2248 SET_s32rr, SET_s32ri, SET_s32ir, 2249 SET_s64rr, SET_s64ri, SET_s64ir> { 2250 // TableGen doesn't like empty multiclasses. 2251 def : PatLeaf<(i32 0)>; 2252} 2253 2254multiclass ISET_FORMAT_UNSIGNED<PatFrag OpNode, PatLeaf Mode> 2255 : ISET_FORMAT<OpNode, Mode, 2256 SETP_u16rr, SETP_u16ri, SETP_u16ir, 2257 SETP_u32rr, SETP_u32ri, SETP_u32ir, 2258 SETP_u64rr, SETP_u64ri, SETP_u64ir, 2259 SET_u16rr, SET_u16ri, SET_u16ir, 2260 SET_u32rr, SET_u32ri, SET_u32ir, 2261 SET_u64rr, SET_u64ri, SET_u64ir> { 2262 // TableGen doesn't like empty multiclasses. 2263 def : PatLeaf<(i32 0)>; 2264} 2265 2266defm : ISET_FORMAT_SIGNED<setgt, CmpGT>; 2267defm : ISET_FORMAT_SIGNED<setlt, CmpLT>; 2268defm : ISET_FORMAT_SIGNED<setge, CmpGE>; 2269defm : ISET_FORMAT_SIGNED<setle, CmpLE>; 2270defm : ISET_FORMAT_SIGNED<seteq, CmpEQ>; 2271defm : ISET_FORMAT_SIGNED<setne, CmpNE>; 2272defm : ISET_FORMAT_UNSIGNED<setugt, CmpGT>; 2273defm : ISET_FORMAT_UNSIGNED<setult, CmpLT>; 2274defm : ISET_FORMAT_UNSIGNED<setuge, CmpGE>; 2275defm : ISET_FORMAT_UNSIGNED<setule, CmpLE>; 2276defm : ISET_FORMAT_UNSIGNED<setueq, CmpEQ>; 2277defm : ISET_FORMAT_UNSIGNED<setune, CmpNE>; 2278 2279// i1 compares 2280def : Pat<(setne Int1Regs:$a, Int1Regs:$b), 2281 (XORb1rr Int1Regs:$a, Int1Regs:$b)>; 2282def : Pat<(setune Int1Regs:$a, Int1Regs:$b), 2283 (XORb1rr Int1Regs:$a, Int1Regs:$b)>; 2284 2285def : Pat<(seteq Int1Regs:$a, Int1Regs:$b), 2286 (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>; 2287def : Pat<(setueq Int1Regs:$a, Int1Regs:$b), 2288 (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>; 2289 2290// comparisons of i8 extracted with BFE as i32 2291// It's faster to do comparison directly on i32 extracted by BFE, 2292// instead of the long conversion and sign extending. 2293def: Pat<(setgt (i16 (sext_inreg (i16 (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8))), i8)), 2294 (i16 (sext_inreg (i16 (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8))), i8))), 2295 (SETP_s32rr (BFE_S32rri $a, $oa, 8), (BFE_S32rri $b, $ob, 8), CmpGT)>; 2296def: Pat<(setgt (i16 (sext_inreg (trunc (bfe Int32Regs:$a, imm:$oa, 8)), i8)), 2297 (i16 (sext_inreg (trunc (bfe Int32Regs:$b, imm:$ob, 8)), i8))), 2298 (SETP_s32rr (BFE_S32rii $a, imm:$oa, 8), (BFE_S32rii $b, imm:$ob, 8), CmpGT)>; 2299def: Pat<(setge (i16 (sext_inreg (i16 (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8))), i8)), 2300 (i16 (sext_inreg (i16 (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8))), i8))), 2301 (SETP_s32rr (BFE_S32rri $a, $oa, 8), (BFE_S32rri $b, $ob, 8), CmpGE)>; 2302def: Pat<(setge (i16 (sext_inreg (trunc (bfe Int32Regs:$a, imm:$oa, 8)), i8)), 2303 (i16 (sext_inreg (trunc (bfe Int32Regs:$b, imm:$ob, 8)), i8))), 2304 (SETP_s32rr (BFE_S32rii $a, imm:$oa, 8), (BFE_S32rii $b, imm:$ob, 8), CmpGE)>; 2305def: Pat<(setlt (i16 (sext_inreg (i16 (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8))), i8)), 2306 (i16 (sext_inreg (i16 (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8))), i8))), 2307 (SETP_s32rr (BFE_S32rri $a, $oa, 8), (BFE_S32rri $b, $ob, 8), CmpLT)>; 2308def: Pat<(setlt (i16 (sext_inreg (trunc (bfe Int32Regs:$a, imm:$oa, 8)), i8)), 2309 (i16 (sext_inreg (trunc (bfe Int32Regs:$b, imm:$ob, 8)), i8))), 2310 (SETP_s32rr (BFE_S32rii $a, imm:$oa, 8), (BFE_S32rii $b, imm:$ob, 8), CmpLT)>; 2311def: Pat<(setle (i16 (sext_inreg (i16 (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8))), i8)), 2312 (i16 (sext_inreg (i16 (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8))), i8))), 2313 (SETP_s32rr (BFE_S32rri $a, $oa, 8), (BFE_S32rri $b, $ob, 8), CmpLE)>; 2314def: Pat<(setle (i16 (sext_inreg (trunc (bfe Int32Regs:$a, imm:$oa, 8)), i8)), 2315 (i16 (sext_inreg (trunc (bfe Int32Regs:$b, imm:$ob, 8)), i8))), 2316 (SETP_s32rr (BFE_S32rii $a, imm:$oa, 8), (BFE_S32rii $b, imm:$ob, 8), CmpLE)>; 2317 2318def: Pat<(setugt (i16 (and (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8)), 255)), 2319 (i16 (and (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8)), 255))), 2320 (SETP_u32rr (BFE_U32rri $a, $oa, 8), (BFE_U32rri $b, $ob, 8), CmpHI)>; 2321def: Pat<(setugt (i16 (and (trunc (bfe Int32Regs:$a, imm:$oa, 8)), 255)), 2322 (i16 (and (trunc (bfe Int32Regs:$b, imm:$ob, 8)), 255))), 2323 (SETP_u32rr (BFE_U32rii $a, imm:$oa, 8), (BFE_U32rii $b, imm:$ob, 8), CmpHI)>; 2324def: Pat<(setuge (i16 (and (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8)), 255)), 2325 (i16 (and (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8)), 255))), 2326 (SETP_u32rr (BFE_U32rri $a, $oa, 8), (BFE_U32rri $b, $ob, 8), CmpHS)>; 2327def: Pat<(setuge (i16 (and (trunc (bfe Int32Regs:$a, imm:$oa, 8)), 255)), 2328 (i16 (and (trunc (bfe Int32Regs:$b, imm:$ob, 8)), 255))), 2329 (SETP_u32rr (BFE_U32rii $a, imm:$oa, 8), (BFE_U32rii $b, imm:$ob, 8), CmpHS)>; 2330def: Pat<(setult (i16 (and (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8)), 255)), 2331 (i16 (and (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8)), 255))), 2332 (SETP_u32rr (BFE_U32rri $a, $oa, 8), (BFE_U32rri $b, $ob, 8), CmpLO)>; 2333def: Pat<(setult (i16 (and (trunc (bfe Int32Regs:$a, imm:$oa, 8)), 255)), 2334 (i16 (and (trunc (bfe Int32Regs:$b, imm:$ob, 8)), 255))), 2335 (SETP_u32rr (BFE_U32rii $a, imm:$oa, 8), (BFE_U32rii $b, imm:$ob, 8), CmpLO)>; 2336def: Pat<(setule (i16 (and (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8)), 255)), 2337 (i16 (and (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8)), 255))), 2338 (SETP_u32rr (BFE_U32rri $a, $oa, 8), (BFE_U32rri $b, $ob, 8), CmpLS)>; 2339def: Pat<(setule (i16 (and (trunc (bfe Int32Regs:$a, imm:$oa, 8)), 255)), 2340 (i16 (and (trunc (bfe Int32Regs:$b, imm:$ob, 8)), 255))), 2341 (SETP_u32rr (BFE_U32rii $a, imm:$oa, 8), (BFE_U32rii $b, imm:$ob, 8), CmpLS)>; 2342def: Pat<(seteq (i16 (and (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8)), 255)), 2343 (i16 (and (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8)), 255))), 2344 (SETP_u32rr (BFE_U32rri $a, $oa, 8), (BFE_U32rri $b, $ob, 8), CmpEQ)>; 2345def: Pat<(seteq (i16 (and (trunc (bfe Int32Regs:$a, imm:$oa, 8)), 255)), 2346 (i16 (and (trunc (bfe Int32Regs:$b, imm:$ob, 8)), 255))), 2347 (SETP_u32rr (BFE_U32rii $a, imm:$oa, 8), (BFE_U32rii $b, imm:$ob, 8), CmpEQ)>; 2348def: Pat<(setne (i16 (and (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8)), 255)), 2349 (i16 (and (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8)), 255))), 2350 (SETP_u32rr (BFE_U32rri $a, $oa, 8), (BFE_U32rri $b, $ob, 8), CmpNE)>; 2351def: Pat<(setne (i16 (and (trunc (bfe Int32Regs:$a, imm:$oa, 8)), 255)), 2352 (i16 (and (trunc (bfe Int32Regs:$b, imm:$ob, 8)), 255))), 2353 (SETP_u32rr (BFE_U32rii $a, imm:$oa, 8), (BFE_U32rii $b, imm:$ob, 8), CmpNE)>; 2354 2355// i1 compare -> i32 2356def : Pat<(i32 (setne Int1Regs:$a, Int1Regs:$b)), 2357 (SELP_u32ii -1, 0, (XORb1rr Int1Regs:$a, Int1Regs:$b))>; 2358def : Pat<(i32 (setne Int1Regs:$a, Int1Regs:$b)), 2359 (SELP_u32ii 0, -1, (XORb1rr Int1Regs:$a, Int1Regs:$b))>; 2360 2361 2362 2363multiclass FSET_FORMAT<PatFrag OpNode, PatLeaf Mode, PatLeaf ModeFTZ> { 2364 // f16 -> pred 2365 def : Pat<(i1 (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b))), 2366 (SETP_f16rr Int16Regs:$a, Int16Regs:$b, ModeFTZ)>, 2367 Requires<[useFP16Math,doF32FTZ]>; 2368 def : Pat<(i1 (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b))), 2369 (SETP_f16rr Int16Regs:$a, Int16Regs:$b, Mode)>, 2370 Requires<[useFP16Math]>; 2371 def : Pat<(i1 (OpNode (f16 Int16Regs:$a), fpimm:$b)), 2372 (SETP_f16rr Int16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>, 2373 Requires<[useFP16Math,doF32FTZ]>; 2374 def : Pat<(i1 (OpNode (f16 Int16Regs:$a), fpimm:$b)), 2375 (SETP_f16rr Int16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>, 2376 Requires<[useFP16Math]>; 2377 def : Pat<(i1 (OpNode fpimm:$a, (f16 Int16Regs:$b))), 2378 (SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Int16Regs:$b, ModeFTZ)>, 2379 Requires<[useFP16Math,doF32FTZ]>; 2380 def : Pat<(i1 (OpNode fpimm:$a, (f16 Int16Regs:$b))), 2381 (SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Int16Regs:$b, Mode)>, 2382 Requires<[useFP16Math]>; 2383 2384 // bf16 -> pred 2385 def : Pat<(i1 (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b))), 2386 (SETP_bf16rr Int16Regs:$a, Int16Regs:$b, ModeFTZ)>, 2387 Requires<[hasBF16Math,doF32FTZ]>; 2388 def : Pat<(i1 (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b))), 2389 (SETP_bf16rr Int16Regs:$a, Int16Regs:$b, Mode)>, 2390 Requires<[hasBF16Math]>; 2391 def : Pat<(i1 (OpNode (bf16 Int16Regs:$a), fpimm:$b)), 2392 (SETP_bf16rr Int16Regs:$a, (LOAD_CONST_BF16 fpimm:$b), ModeFTZ)>, 2393 Requires<[hasBF16Math,doF32FTZ]>; 2394 def : Pat<(i1 (OpNode (bf16 Int16Regs:$a), fpimm:$b)), 2395 (SETP_bf16rr Int16Regs:$a, (LOAD_CONST_BF16 fpimm:$b), Mode)>, 2396 Requires<[hasBF16Math]>; 2397 def : Pat<(i1 (OpNode fpimm:$a, (bf16 Int16Regs:$b))), 2398 (SETP_bf16rr (LOAD_CONST_BF16 fpimm:$a), Int16Regs:$b, ModeFTZ)>, 2399 Requires<[hasBF16Math,doF32FTZ]>; 2400 def : Pat<(i1 (OpNode fpimm:$a, (bf16 Int16Regs:$b))), 2401 (SETP_bf16rr (LOAD_CONST_BF16 fpimm:$a), Int16Regs:$b, Mode)>, 2402 Requires<[hasBF16Math]>; 2403 2404 // f32 -> pred 2405 def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)), 2406 (SETP_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>, 2407 Requires<[doF32FTZ]>; 2408 def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)), 2409 (SETP_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>; 2410 def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)), 2411 (SETP_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>, 2412 Requires<[doF32FTZ]>; 2413 def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)), 2414 (SETP_f32ri Float32Regs:$a, fpimm:$b, Mode)>; 2415 def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)), 2416 (SETP_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>, 2417 Requires<[doF32FTZ]>; 2418 def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)), 2419 (SETP_f32ir fpimm:$a, Float32Regs:$b, Mode)>; 2420 2421 // f64 -> pred 2422 def : Pat<(i1 (OpNode Float64Regs:$a, Float64Regs:$b)), 2423 (SETP_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>; 2424 def : Pat<(i1 (OpNode Float64Regs:$a, fpimm:$b)), 2425 (SETP_f64ri Float64Regs:$a, fpimm:$b, Mode)>; 2426 def : Pat<(i1 (OpNode fpimm:$a, Float64Regs:$b)), 2427 (SETP_f64ir fpimm:$a, Float64Regs:$b, Mode)>; 2428 2429 // f16 -> i32 2430 def : Pat<(i32 (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b))), 2431 (SET_f16rr Int16Regs:$a, Int16Regs:$b, ModeFTZ)>, 2432 Requires<[useFP16Math, doF32FTZ]>; 2433 def : Pat<(i32 (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b))), 2434 (SET_f16rr Int16Regs:$a, Int16Regs:$b, Mode)>, 2435 Requires<[useFP16Math]>; 2436 def : Pat<(i32 (OpNode (f16 Int16Regs:$a), fpimm:$b)), 2437 (SET_f16rr Int16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>, 2438 Requires<[useFP16Math, doF32FTZ]>; 2439 def : Pat<(i32 (OpNode (f16 Int16Regs:$a), fpimm:$b)), 2440 (SET_f16rr Int16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>, 2441 Requires<[useFP16Math]>; 2442 def : Pat<(i32 (OpNode fpimm:$a, (f16 Int16Regs:$b))), 2443 (SET_f16ir (LOAD_CONST_F16 fpimm:$a), Int16Regs:$b, ModeFTZ)>, 2444 Requires<[useFP16Math, doF32FTZ]>; 2445 def : Pat<(i32 (OpNode fpimm:$a, (f16 Int16Regs:$b))), 2446 (SET_f16ir (LOAD_CONST_F16 fpimm:$a), Int16Regs:$b, Mode)>, 2447 Requires<[useFP16Math]>; 2448 2449 // bf16 -> i32 2450 def : Pat<(i32 (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b))), 2451 (SET_bf16rr Int16Regs:$a, Int16Regs:$b, ModeFTZ)>, 2452 Requires<[hasBF16Math, doF32FTZ]>; 2453 def : Pat<(i32 (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b))), 2454 (SET_bf16rr Int16Regs:$a, Int16Regs:$b, Mode)>, 2455 Requires<[hasBF16Math]>; 2456 def : Pat<(i32 (OpNode (bf16 Int16Regs:$a), fpimm:$b)), 2457 (SET_bf16rr Int16Regs:$a, (LOAD_CONST_BF16 fpimm:$b), ModeFTZ)>, 2458 Requires<[hasBF16Math, doF32FTZ]>; 2459 def : Pat<(i32 (OpNode (bf16 Int16Regs:$a), fpimm:$b)), 2460 (SET_bf16rr Int16Regs:$a, (LOAD_CONST_BF16 fpimm:$b), Mode)>, 2461 Requires<[hasBF16Math]>; 2462 def : Pat<(i32 (OpNode fpimm:$a, (bf16 Int16Regs:$b))), 2463 (SET_bf16ir (LOAD_CONST_BF16 fpimm:$a), Int16Regs:$b, ModeFTZ)>, 2464 Requires<[hasBF16Math, doF32FTZ]>; 2465 def : Pat<(i32 (OpNode fpimm:$a, (bf16 Int16Regs:$b))), 2466 (SET_bf16ir (LOAD_CONST_BF16 fpimm:$a), Int16Regs:$b, Mode)>, 2467 Requires<[hasBF16Math]>; 2468 2469 // f32 -> i32 2470 def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)), 2471 (SET_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>, 2472 Requires<[doF32FTZ]>; 2473 def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)), 2474 (SET_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>; 2475 def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)), 2476 (SET_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>, 2477 Requires<[doF32FTZ]>; 2478 def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)), 2479 (SET_f32ri Float32Regs:$a, fpimm:$b, Mode)>; 2480 def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)), 2481 (SET_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>, 2482 Requires<[doF32FTZ]>; 2483 def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)), 2484 (SET_f32ir fpimm:$a, Float32Regs:$b, Mode)>; 2485 2486 // f64 -> i32 2487 def : Pat<(i32 (OpNode Float64Regs:$a, Float64Regs:$b)), 2488 (SET_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>; 2489 def : Pat<(i32 (OpNode Float64Regs:$a, fpimm:$b)), 2490 (SET_f64ri Float64Regs:$a, fpimm:$b, Mode)>; 2491 def : Pat<(i32 (OpNode fpimm:$a, Float64Regs:$b)), 2492 (SET_f64ir fpimm:$a, Float64Regs:$b, Mode)>; 2493} 2494 2495defm FSetOGT : FSET_FORMAT<setogt, CmpGT, CmpGT_FTZ>; 2496defm FSetOLT : FSET_FORMAT<setolt, CmpLT, CmpLT_FTZ>; 2497defm FSetOGE : FSET_FORMAT<setoge, CmpGE, CmpGE_FTZ>; 2498defm FSetOLE : FSET_FORMAT<setole, CmpLE, CmpLE_FTZ>; 2499defm FSetOEQ : FSET_FORMAT<setoeq, CmpEQ, CmpEQ_FTZ>; 2500defm FSetONE : FSET_FORMAT<setone, CmpNE, CmpNE_FTZ>; 2501 2502defm FSetUGT : FSET_FORMAT<setugt, CmpGTU, CmpGTU_FTZ>; 2503defm FSetULT : FSET_FORMAT<setult, CmpLTU, CmpLTU_FTZ>; 2504defm FSetUGE : FSET_FORMAT<setuge, CmpGEU, CmpGEU_FTZ>; 2505defm FSetULE : FSET_FORMAT<setule, CmpLEU, CmpLEU_FTZ>; 2506defm FSetUEQ : FSET_FORMAT<setueq, CmpEQU, CmpEQU_FTZ>; 2507defm FSetUNE : FSET_FORMAT<setune, CmpNEU, CmpNEU_FTZ>; 2508 2509defm FSetGT : FSET_FORMAT<setgt, CmpGT, CmpGT_FTZ>; 2510defm FSetLT : FSET_FORMAT<setlt, CmpLT, CmpLT_FTZ>; 2511defm FSetGE : FSET_FORMAT<setge, CmpGE, CmpGE_FTZ>; 2512defm FSetLE : FSET_FORMAT<setle, CmpLE, CmpLE_FTZ>; 2513defm FSetEQ : FSET_FORMAT<seteq, CmpEQ, CmpEQ_FTZ>; 2514defm FSetNE : FSET_FORMAT<setne, CmpNE, CmpNE_FTZ>; 2515 2516defm FSetNUM : FSET_FORMAT<seto, CmpNUM, CmpNUM_FTZ>; 2517defm FSetNAN : FSET_FORMAT<setuo, CmpNAN, CmpNAN_FTZ>; 2518 2519def SDTDeclareParamProfile : 2520 SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>; 2521def SDTDeclareScalarParamProfile : 2522 SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>; 2523def SDTLoadParamProfile : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>; 2524def SDTLoadParamV2Profile : SDTypeProfile<2, 2, [SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisInt<3>]>; 2525def SDTLoadParamV4Profile : SDTypeProfile<4, 2, [SDTCisInt<4>, SDTCisInt<5>]>; 2526def SDTPrintCallProfile : SDTypeProfile<0, 1, [SDTCisInt<0>]>; 2527def SDTPrintCallUniProfile : SDTypeProfile<0, 1, [SDTCisInt<0>]>; 2528def SDTStoreParamProfile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>; 2529def SDTStoreParamV2Profile : SDTypeProfile<0, 4, [SDTCisInt<0>, SDTCisInt<1>]>; 2530def SDTStoreParamV4Profile : SDTypeProfile<0, 6, [SDTCisInt<0>, SDTCisInt<1>]>; 2531def SDTStoreParam32Profile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>; 2532def SDTCallArgProfile : SDTypeProfile<0, 2, [SDTCisInt<0>]>; 2533def SDTCallArgMarkProfile : SDTypeProfile<0, 0, []>; 2534def SDTCallVoidProfile : SDTypeProfile<0, 1, []>; 2535def SDTCallValProfile : SDTypeProfile<1, 0, []>; 2536def SDTMoveParamProfile : SDTypeProfile<1, 1, []>; 2537def SDTStoreRetvalProfile : SDTypeProfile<0, 2, [SDTCisInt<0>]>; 2538def SDTStoreRetvalV2Profile : SDTypeProfile<0, 3, [SDTCisInt<0>]>; 2539def SDTStoreRetvalV4Profile : SDTypeProfile<0, 5, [SDTCisInt<0>]>; 2540def SDTPseudoUseParamProfile : SDTypeProfile<0, 1, []>; 2541def SDTProxyRegProfile : SDTypeProfile<1, 1, []>; 2542 2543def DeclareParam : 2544 SDNode<"NVPTXISD::DeclareParam", SDTDeclareParamProfile, 2545 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; 2546def DeclareScalarParam : 2547 SDNode<"NVPTXISD::DeclareScalarParam", SDTDeclareScalarParamProfile, 2548 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; 2549def DeclareRetParam : 2550 SDNode<"NVPTXISD::DeclareRetParam", SDTDeclareParamProfile, 2551 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; 2552def DeclareRet : 2553 SDNode<"NVPTXISD::DeclareRet", SDTDeclareScalarParamProfile, 2554 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; 2555def LoadParam : 2556 SDNode<"NVPTXISD::LoadParam", SDTLoadParamProfile, 2557 [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>; 2558def LoadParamV2 : 2559 SDNode<"NVPTXISD::LoadParamV2", SDTLoadParamV2Profile, 2560 [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>; 2561def LoadParamV4 : 2562 SDNode<"NVPTXISD::LoadParamV4", SDTLoadParamV4Profile, 2563 [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>; 2564def PrintCall : 2565 SDNode<"NVPTXISD::PrintCall", SDTPrintCallProfile, 2566 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; 2567def PrintConvergentCall : 2568 SDNode<"NVPTXISD::PrintConvergentCall", SDTPrintCallProfile, 2569 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; 2570def PrintCallUni : 2571 SDNode<"NVPTXISD::PrintCallUni", SDTPrintCallUniProfile, 2572 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; 2573def PrintConvergentCallUni : 2574 SDNode<"NVPTXISD::PrintConvergentCallUni", SDTPrintCallUniProfile, 2575 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; 2576def StoreParam : 2577 SDNode<"NVPTXISD::StoreParam", SDTStoreParamProfile, 2578 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; 2579def StoreParamV2 : 2580 SDNode<"NVPTXISD::StoreParamV2", SDTStoreParamV2Profile, 2581 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; 2582def StoreParamV4 : 2583 SDNode<"NVPTXISD::StoreParamV4", SDTStoreParamV4Profile, 2584 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; 2585def StoreParamU32 : 2586 SDNode<"NVPTXISD::StoreParamU32", SDTStoreParam32Profile, 2587 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; 2588def StoreParamS32 : 2589 SDNode<"NVPTXISD::StoreParamS32", SDTStoreParam32Profile, 2590 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; 2591def CallArgBegin : 2592 SDNode<"NVPTXISD::CallArgBegin", SDTCallArgMarkProfile, 2593 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; 2594def CallArg : 2595 SDNode<"NVPTXISD::CallArg", SDTCallArgProfile, 2596 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; 2597def LastCallArg : 2598 SDNode<"NVPTXISD::LastCallArg", SDTCallArgProfile, 2599 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; 2600def CallArgEnd : 2601 SDNode<"NVPTXISD::CallArgEnd", SDTCallVoidProfile, 2602 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; 2603def CallVoid : 2604 SDNode<"NVPTXISD::CallVoid", SDTCallVoidProfile, 2605 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; 2606def Prototype : 2607 SDNode<"NVPTXISD::Prototype", SDTCallVoidProfile, 2608 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; 2609def CallVal : 2610 SDNode<"NVPTXISD::CallVal", SDTCallValProfile, 2611 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; 2612def MoveParam : 2613 SDNode<"NVPTXISD::MoveParam", SDTMoveParamProfile, []>; 2614def StoreRetval : 2615 SDNode<"NVPTXISD::StoreRetval", SDTStoreRetvalProfile, 2616 [SDNPHasChain, SDNPSideEffect]>; 2617def StoreRetvalV2 : 2618 SDNode<"NVPTXISD::StoreRetvalV2", SDTStoreRetvalV2Profile, 2619 [SDNPHasChain, SDNPSideEffect]>; 2620def StoreRetvalV4 : 2621 SDNode<"NVPTXISD::StoreRetvalV4", SDTStoreRetvalV4Profile, 2622 [SDNPHasChain, SDNPSideEffect]>; 2623def PseudoUseParam : 2624 SDNode<"NVPTXISD::PseudoUseParam", SDTPseudoUseParamProfile, 2625 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; 2626def RETURNNode : 2627 SDNode<"NVPTXISD::RETURN", SDTCallArgMarkProfile, 2628 [SDNPHasChain, SDNPSideEffect]>; 2629def ProxyReg : 2630 SDNode<"NVPTXISD::ProxyReg", SDTProxyRegProfile, 2631 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; 2632 2633let mayLoad = true in { 2634 class LoadParamMemInst<NVPTXRegClass regclass, string opstr> : 2635 NVPTXInst<(outs regclass:$dst), (ins i32imm:$b), 2636 !strconcat("ld.param", opstr, " \t$dst, [retval0+$b];"), 2637 []>; 2638 2639 class LoadParamV2MemInst<NVPTXRegClass regclass, string opstr> : 2640 NVPTXInst<(outs regclass:$dst, regclass:$dst2), (ins i32imm:$b), 2641 !strconcat("ld.param.v2", opstr, 2642 " \t{{$dst, $dst2}}, [retval0+$b];"), []>; 2643 2644 class LoadParamV4MemInst<NVPTXRegClass regclass, string opstr> : 2645 NVPTXInst<(outs regclass:$dst, regclass:$dst2, regclass:$dst3, 2646 regclass:$dst4), 2647 (ins i32imm:$b), 2648 !strconcat("ld.param.v4", opstr, 2649 " \t{{$dst, $dst2, $dst3, $dst4}}, [retval0+$b];"), 2650 []>; 2651} 2652 2653class LoadParamRegInst<NVPTXRegClass regclass, string opstr> : 2654 NVPTXInst<(outs regclass:$dst), (ins i32imm:$b), 2655 !strconcat("mov", opstr, " \t$dst, retval$b;"), 2656 [(set regclass:$dst, (LoadParam (i32 0), (i32 imm:$b)))]>; 2657 2658let mayStore = true in { 2659 2660 multiclass StoreParamInst<NVPTXRegClass regclass, Operand IMMType, string opstr, bit support_imm = true> { 2661 foreach op = [IMMType, regclass] in 2662 if !or(support_imm, !isa<NVPTXRegClass>(op)) then 2663 def _ # !if(!isa<NVPTXRegClass>(op), "r", "i") 2664 : NVPTXInst<(outs), 2665 (ins op:$val, i32imm:$a, i32imm:$b), 2666 "st.param" # opstr # " \t[param$a+$b], $val;", 2667 []>; 2668 } 2669 2670 multiclass StoreParamV2Inst<NVPTXRegClass regclass, Operand IMMType, string opstr> { 2671 foreach op1 = [IMMType, regclass] in 2672 foreach op2 = [IMMType, regclass] in 2673 def _ # !if(!isa<NVPTXRegClass>(op1), "r", "i") 2674 # !if(!isa<NVPTXRegClass>(op2), "r", "i") 2675 : NVPTXInst<(outs), 2676 (ins op1:$val1, op2:$val2, 2677 i32imm:$a, i32imm:$b), 2678 "st.param.v2" # opstr # " \t[param$a+$b], {{$val1, $val2}};", 2679 []>; 2680 } 2681 2682 multiclass StoreParamV4Inst<NVPTXRegClass regclass, Operand IMMType, string opstr> { 2683 foreach op1 = [IMMType, regclass] in 2684 foreach op2 = [IMMType, regclass] in 2685 foreach op3 = [IMMType, regclass] in 2686 foreach op4 = [IMMType, regclass] in 2687 def _ # !if(!isa<NVPTXRegClass>(op1), "r", "i") 2688 # !if(!isa<NVPTXRegClass>(op2), "r", "i") 2689 # !if(!isa<NVPTXRegClass>(op3), "r", "i") 2690 # !if(!isa<NVPTXRegClass>(op4), "r", "i") 2691 2692 : NVPTXInst<(outs), 2693 (ins op1:$val1, op2:$val2, op3:$val3, op4:$val4, 2694 i32imm:$a, i32imm:$b), 2695 "st.param.v4" # opstr # 2696 " \t[param$a+$b], {{$val1, $val2, $val3, $val4}};", 2697 []>; 2698 } 2699 2700 class StoreRetvalInst<NVPTXRegClass regclass, string opstr> : 2701 NVPTXInst<(outs), (ins regclass:$val, i32imm:$a), 2702 !strconcat("st.param", opstr, " \t[func_retval0+$a], $val;"), 2703 []>; 2704 2705 class StoreRetvalV2Inst<NVPTXRegClass regclass, string opstr> : 2706 NVPTXInst<(outs), (ins regclass:$val, regclass:$val2, i32imm:$a), 2707 !strconcat("st.param.v2", opstr, 2708 " \t[func_retval0+$a], {{$val, $val2}};"), 2709 []>; 2710 2711 class StoreRetvalV4Inst<NVPTXRegClass regclass, string opstr> : 2712 NVPTXInst<(outs), 2713 (ins regclass:$val, regclass:$val2, regclass:$val3, 2714 regclass:$val4, i32imm:$a), 2715 !strconcat("st.param.v4", opstr, 2716 " \t[func_retval0+$a], {{$val, $val2, $val3, $val4}};"), 2717 []>; 2718} 2719 2720let isCall=1 in { 2721 multiclass CALL<string OpcStr, SDNode OpNode> { 2722 def PrintCallNoRetInst : NVPTXInst<(outs), (ins), 2723 !strconcat(OpcStr, " "), [(OpNode (i32 0))]>; 2724 def PrintCallRetInst1 : NVPTXInst<(outs), (ins), 2725 !strconcat(OpcStr, " (retval0), "), [(OpNode (i32 1))]>; 2726 def PrintCallRetInst2 : NVPTXInst<(outs), (ins), 2727 !strconcat(OpcStr, " (retval0, retval1), "), [(OpNode (i32 2))]>; 2728 def PrintCallRetInst3 : NVPTXInst<(outs), (ins), 2729 !strconcat(OpcStr, " (retval0, retval1, retval2), "), [(OpNode (i32 3))]>; 2730 def PrintCallRetInst4 : NVPTXInst<(outs), (ins), 2731 !strconcat(OpcStr, " (retval0, retval1, retval2, retval3), "), 2732 [(OpNode (i32 4))]>; 2733 def PrintCallRetInst5 : NVPTXInst<(outs), (ins), 2734 !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4), "), 2735 [(OpNode (i32 5))]>; 2736 def PrintCallRetInst6 : NVPTXInst<(outs), (ins), 2737 !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4, " 2738 "retval5), "), 2739 [(OpNode (i32 6))]>; 2740 def PrintCallRetInst7 : NVPTXInst<(outs), (ins), 2741 !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4, " 2742 "retval5, retval6), "), 2743 [(OpNode (i32 7))]>; 2744 def PrintCallRetInst8 : NVPTXInst<(outs), (ins), 2745 !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4, " 2746 "retval5, retval6, retval7), "), 2747 [(OpNode (i32 8))]>; 2748 } 2749} 2750 2751defm Call : CALL<"call", PrintCall>; 2752defm CallUni : CALL<"call.uni", PrintCallUni>; 2753 2754// Convergent call instructions. These are identical to regular calls, except 2755// they have the isConvergent bit set. 2756let isConvergent=1 in { 2757 defm ConvergentCall : CALL<"call", PrintConvergentCall>; 2758 defm ConvergentCallUni : CALL<"call.uni", PrintConvergentCallUni>; 2759} 2760 2761def LoadParamMemI64 : LoadParamMemInst<Int64Regs, ".b64">; 2762def LoadParamMemI32 : LoadParamMemInst<Int32Regs, ".b32">; 2763def LoadParamMemI16 : LoadParamMemInst<Int16Regs, ".b16">; 2764def LoadParamMemI8 : LoadParamMemInst<Int16Regs, ".b8">; 2765def LoadParamMemV2I64 : LoadParamV2MemInst<Int64Regs, ".b64">; 2766def LoadParamMemV2I32 : LoadParamV2MemInst<Int32Regs, ".b32">; 2767def LoadParamMemV2I16 : LoadParamV2MemInst<Int16Regs, ".b16">; 2768def LoadParamMemV2I8 : LoadParamV2MemInst<Int16Regs, ".b8">; 2769def LoadParamMemV4I32 : LoadParamV4MemInst<Int32Regs, ".b32">; 2770def LoadParamMemV4I16 : LoadParamV4MemInst<Int16Regs, ".b16">; 2771def LoadParamMemV4I8 : LoadParamV4MemInst<Int16Regs, ".b8">; 2772def LoadParamMemF32 : LoadParamMemInst<Float32Regs, ".f32">; 2773def LoadParamMemF64 : LoadParamMemInst<Float64Regs, ".f64">; 2774def LoadParamMemV2F32 : LoadParamV2MemInst<Float32Regs, ".f32">; 2775def LoadParamMemV2F64 : LoadParamV2MemInst<Float64Regs, ".f64">; 2776def LoadParamMemV4F32 : LoadParamV4MemInst<Float32Regs, ".f32">; 2777 2778defm StoreParamI64 : StoreParamInst<Int64Regs, i64imm, ".b64">; 2779defm StoreParamI32 : StoreParamInst<Int32Regs, i32imm, ".b32">; 2780defm StoreParamI16 : StoreParamInst<Int16Regs, i16imm, ".b16">; 2781defm StoreParamI8 : StoreParamInst<Int16Regs, i8imm, ".b8">; 2782 2783defm StoreParamI8TruncI32 : StoreParamInst<Int32Regs, i8imm, ".b8", /* support_imm */ false>; 2784defm StoreParamI8TruncI64 : StoreParamInst<Int64Regs, i8imm, ".b8", /* support_imm */ false>; 2785 2786defm StoreParamV2I64 : StoreParamV2Inst<Int64Regs, i64imm, ".b64">; 2787defm StoreParamV2I32 : StoreParamV2Inst<Int32Regs, i32imm, ".b32">; 2788defm StoreParamV2I16 : StoreParamV2Inst<Int16Regs, i16imm, ".b16">; 2789defm StoreParamV2I8 : StoreParamV2Inst<Int16Regs, i8imm, ".b8">; 2790 2791defm StoreParamV4I32 : StoreParamV4Inst<Int32Regs, i32imm, ".b32">; 2792defm StoreParamV4I16 : StoreParamV4Inst<Int16Regs, i16imm, ".b16">; 2793defm StoreParamV4I8 : StoreParamV4Inst<Int16Regs, i8imm, ".b8">; 2794 2795defm StoreParamF32 : StoreParamInst<Float32Regs, f32imm, ".f32">; 2796defm StoreParamF64 : StoreParamInst<Float64Regs, f64imm, ".f64">; 2797 2798defm StoreParamV2F32 : StoreParamV2Inst<Float32Regs, f32imm, ".f32">; 2799defm StoreParamV2F64 : StoreParamV2Inst<Float64Regs, f64imm, ".f64">; 2800 2801defm StoreParamV4F32 : StoreParamV4Inst<Float32Regs, f32imm, ".f32">; 2802 2803def StoreRetvalI64 : StoreRetvalInst<Int64Regs, ".b64">; 2804def StoreRetvalI32 : StoreRetvalInst<Int32Regs, ".b32">; 2805def StoreRetvalI16 : StoreRetvalInst<Int16Regs, ".b16">; 2806def StoreRetvalI8 : StoreRetvalInst<Int16Regs, ".b8">; 2807def StoreRetvalI8TruncI32 : StoreRetvalInst<Int32Regs, ".b8">; 2808def StoreRetvalI8TruncI64 : StoreRetvalInst<Int64Regs, ".b8">; 2809def StoreRetvalV2I64 : StoreRetvalV2Inst<Int64Regs, ".b64">; 2810def StoreRetvalV2I32 : StoreRetvalV2Inst<Int32Regs, ".b32">; 2811def StoreRetvalV2I16 : StoreRetvalV2Inst<Int16Regs, ".b16">; 2812def StoreRetvalV2I8 : StoreRetvalV2Inst<Int16Regs, ".b8">; 2813def StoreRetvalV4I32 : StoreRetvalV4Inst<Int32Regs, ".b32">; 2814def StoreRetvalV4I16 : StoreRetvalV4Inst<Int16Regs, ".b16">; 2815def StoreRetvalV4I8 : StoreRetvalV4Inst<Int16Regs, ".b8">; 2816 2817def StoreRetvalF64 : StoreRetvalInst<Float64Regs, ".f64">; 2818def StoreRetvalF32 : StoreRetvalInst<Float32Regs, ".f32">; 2819def StoreRetvalV2F64 : StoreRetvalV2Inst<Float64Regs, ".f64">; 2820def StoreRetvalV2F32 : StoreRetvalV2Inst<Float32Regs, ".f32">; 2821def StoreRetvalV4F32 : StoreRetvalV4Inst<Float32Regs, ".f32">; 2822 2823def CallArgBeginInst : NVPTXInst<(outs), (ins), "(", [(CallArgBegin)]>; 2824def CallArgEndInst1 : NVPTXInst<(outs), (ins), ");", [(CallArgEnd (i32 1))]>; 2825def CallArgEndInst0 : NVPTXInst<(outs), (ins), ")", [(CallArgEnd (i32 0))]>; 2826def RETURNInst : NVPTXInst<(outs), (ins), "ret;", [(RETURNNode)]>; 2827 2828class CallArgInst<NVPTXRegClass regclass> : 2829 NVPTXInst<(outs), (ins regclass:$a), "$a, ", 2830 [(CallArg (i32 0), regclass:$a)]>; 2831 2832class CallArgInstVT<NVPTXRegClass regclass, ValueType vt> : 2833 NVPTXInst<(outs), (ins regclass:$a), "$a, ", 2834 [(CallArg (i32 0), vt:$a)]>; 2835 2836class LastCallArgInst<NVPTXRegClass regclass> : 2837 NVPTXInst<(outs), (ins regclass:$a), "$a", 2838 [(LastCallArg (i32 0), regclass:$a)]>; 2839class LastCallArgInstVT<NVPTXRegClass regclass, ValueType vt> : 2840 NVPTXInst<(outs), (ins regclass:$a), "$a", 2841 [(LastCallArg (i32 0), vt:$a)]>; 2842 2843def CallArgI64 : CallArgInst<Int64Regs>; 2844def CallArgI32 : CallArgInstVT<Int32Regs, i32>; 2845def CallArgI16 : CallArgInstVT<Int16Regs, i16>; 2846def CallArgF64 : CallArgInst<Float64Regs>; 2847def CallArgF32 : CallArgInst<Float32Regs>; 2848 2849def LastCallArgI64 : LastCallArgInst<Int64Regs>; 2850def LastCallArgI32 : LastCallArgInstVT<Int32Regs, i32>; 2851def LastCallArgI16 : LastCallArgInstVT<Int16Regs, i16>; 2852def LastCallArgF64 : LastCallArgInst<Float64Regs>; 2853def LastCallArgF32 : LastCallArgInst<Float32Regs>; 2854 2855def CallArgI32imm : NVPTXInst<(outs), (ins i32imm:$a), "$a, ", 2856 [(CallArg (i32 0), (i32 imm:$a))]>; 2857def LastCallArgI32imm : NVPTXInst<(outs), (ins i32imm:$a), "$a", 2858 [(LastCallArg (i32 0), (i32 imm:$a))]>; 2859 2860def CallArgParam : NVPTXInst<(outs), (ins i32imm:$a), "param$a, ", 2861 [(CallArg (i32 1), (i32 imm:$a))]>; 2862def LastCallArgParam : NVPTXInst<(outs), (ins i32imm:$a), "param$a", 2863 [(LastCallArg (i32 1), (i32 imm:$a))]>; 2864 2865def CallVoidInst : NVPTXInst<(outs), (ins imem:$addr), "$addr, ", 2866 [(CallVoid (Wrapper tglobaladdr:$addr))]>; 2867def CallVoidInstReg : NVPTXInst<(outs), (ins Int32Regs:$addr), "$addr, ", 2868 [(CallVoid i32:$addr)]>; 2869def CallVoidInstReg64 : NVPTXInst<(outs), (ins Int64Regs:$addr), "$addr, ", 2870 [(CallVoid Int64Regs:$addr)]>; 2871def PrototypeInst : NVPTXInst<(outs), (ins i32imm:$val), ", prototype_$val;", 2872 [(Prototype (i32 imm:$val))]>; 2873 2874def DeclareRetMemInst : 2875 NVPTXInst<(outs), (ins i32imm:$align, i32imm:$size, i32imm:$num), 2876 ".param .align $align .b8 retval$num[$size];", 2877 [(DeclareRetParam (i32 imm:$align), (i32 imm:$size), (i32 imm:$num))]>; 2878def DeclareRetScalarInst : 2879 NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num), 2880 ".param .b$size retval$num;", 2881 [(DeclareRet (i32 1), (i32 imm:$size), (i32 imm:$num))]>; 2882def DeclareRetRegInst : 2883 NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num), 2884 ".reg .b$size retval$num;", 2885 [(DeclareRet (i32 2), (i32 imm:$size), (i32 imm:$num))]>; 2886 2887def DeclareParamInst : 2888 NVPTXInst<(outs), (ins i32imm:$align, i32imm:$a, i32imm:$size), 2889 ".param .align $align .b8 param$a[$size];", 2890 [(DeclareParam (i32 imm:$align), (i32 imm:$a), (i32 imm:$size))]>; 2891def DeclareScalarParamInst : 2892 NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size), 2893 ".param .b$size param$a;", 2894 [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 0))]>; 2895def DeclareScalarRegInst : 2896 NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size), 2897 ".reg .b$size param$a;", 2898 [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 1))]>; 2899 2900class MoveParamInst<ValueType T, NVPTXRegClass regclass, string asmstr> : 2901 NVPTXInst<(outs regclass:$dst), (ins regclass:$src), 2902 !strconcat("mov", asmstr, " \t$dst, $src;"), 2903 [(set (T regclass:$dst), (MoveParam (T regclass:$src)))]>; 2904 2905class MoveParamSymbolInst<NVPTXRegClass regclass, Operand srcty, ValueType vt, 2906 string asmstr> : 2907 NVPTXInst<(outs regclass:$dst), (ins srcty:$src), 2908 !strconcat("mov", asmstr, " \t$dst, $src;"), 2909 [(set vt:$dst, (MoveParam texternalsym:$src))]>; 2910 2911def MoveParamI64 : MoveParamInst<i64, Int64Regs, ".b64">; 2912def MoveParamI32 : MoveParamInst<i32, Int32Regs, ".b32">; 2913 2914def MoveParamSymbolI64 : MoveParamSymbolInst<Int64Regs, i64imm, i64, ".b64">; 2915def MoveParamSymbolI32 : MoveParamSymbolInst<Int32Regs, i32imm, i32, ".b32">; 2916 2917def MoveParamI16 : 2918 NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src), 2919 "cvt.u16.u32 \t$dst, $src;", // ??? Why cvt.u16.u32 ? 2920 [(set i16:$dst, (MoveParam i16:$src))]>; 2921def MoveParamF64 : MoveParamInst<f64, Float64Regs, ".f64">; 2922def MoveParamF32 : MoveParamInst<f32, Float32Regs, ".f32">; 2923 2924class PseudoUseParamInst<NVPTXRegClass regclass, ValueType vt> : 2925 NVPTXInst<(outs), (ins regclass:$src), 2926 "// Pseudo use of $src", 2927 [(PseudoUseParam vt:$src)]>; 2928 2929def PseudoUseParamI64 : PseudoUseParamInst<Int64Regs, i64>; 2930def PseudoUseParamI32 : PseudoUseParamInst<Int32Regs, i32>; 2931def PseudoUseParamI16 : PseudoUseParamInst<Int16Regs, i16>; 2932def PseudoUseParamF64 : PseudoUseParamInst<Float64Regs, f64>; 2933def PseudoUseParamF32 : PseudoUseParamInst<Float32Regs, f32>; 2934 2935class ProxyRegInst<string SzStr, ValueType T, NVPTXRegClass regclass> : 2936 NVPTXInst<(outs regclass:$dst), (ins regclass:$src), 2937 !strconcat("mov.", SzStr, " \t$dst, $src;"), 2938 [(set (T regclass:$dst), (ProxyReg (T regclass:$src)))]>; 2939 2940def ProxyRegI1 : ProxyRegInst<"pred", i1, Int1Regs>; 2941def ProxyRegI16 : ProxyRegInst<"b16", i16, Int16Regs>; 2942def ProxyRegI32 : ProxyRegInst<"b32", i32, Int32Regs>; 2943def ProxyRegI64 : ProxyRegInst<"b64", i64, Int64Regs>; 2944def ProxyRegF32 : ProxyRegInst<"f32", f32, Float32Regs>; 2945def ProxyRegF64 : ProxyRegInst<"f64", f64, Float64Regs>; 2946 2947foreach vt = [f16, bf16] in { 2948 def: Pat<(vt (ProxyReg vt:$src)), (ProxyRegI16 Int16Regs:$src)>; 2949} 2950 2951foreach vt = [v2f16, v2bf16, v2i16, v4i8] in { 2952 def: Pat<(vt (ProxyReg vt:$src)), (ProxyRegI32 Int32Regs:$src)>; 2953} 2954 2955// 2956// Load / Store Handling 2957// 2958multiclass LD<NVPTXRegClass regclass> { 2959 def _avar : NVPTXInst< 2960 (outs regclass:$dst), 2961 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 2962 i32imm:$fromWidth, imem:$addr), 2963 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2964 "\t$dst, [$addr];", []>; 2965 def _areg : NVPTXInst< 2966 (outs regclass:$dst), 2967 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 2968 i32imm:$fromWidth, Int32Regs:$addr), 2969 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2970 "\t$dst, [$addr];", []>; 2971 def _areg_64 : NVPTXInst< 2972 (outs regclass:$dst), 2973 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 2974 i32imm:$fromWidth, Int64Regs:$addr), 2975 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2976 "\t$dst, [$addr];", []>; 2977 def _ari : NVPTXInst< 2978 (outs regclass:$dst), 2979 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 2980 i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), 2981 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2982 "\t$dst, [$addr+$offset];", []>; 2983 def _ari_64 : NVPTXInst< 2984 (outs regclass:$dst), 2985 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, 2986 LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), 2987 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2988 "\t$dst, [$addr+$offset];", []>; 2989 def _asi : NVPTXInst< 2990 (outs regclass:$dst), 2991 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, 2992 LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset), 2993 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2994 "\t$dst, [$addr+$offset];", []>; 2995} 2996 2997let mayLoad=1, hasSideEffects=0 in { 2998 defm LD_i8 : LD<Int16Regs>; 2999 defm LD_i16 : LD<Int16Regs>; 3000 defm LD_i32 : LD<Int32Regs>; 3001 defm LD_i64 : LD<Int64Regs>; 3002 defm LD_f32 : LD<Float32Regs>; 3003 defm LD_f64 : LD<Float64Regs>; 3004} 3005 3006multiclass ST<NVPTXRegClass regclass> { 3007 def _avar : NVPTXInst< 3008 (outs), 3009 (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, 3010 LdStCode:$Sign, i32imm:$toWidth, imem:$addr), 3011 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" 3012 " \t[$addr], $src;", []>; 3013 def _areg : NVPTXInst< 3014 (outs), 3015 (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, 3016 LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr), 3017 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" 3018 " \t[$addr], $src;", []>; 3019 def _areg_64 : NVPTXInst< 3020 (outs), 3021 (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, 3022 LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr), 3023 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" 3024 " \t[$addr], $src;", []>; 3025 def _ari : NVPTXInst< 3026 (outs), 3027 (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, 3028 LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr, i32imm:$offset), 3029 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" 3030 " \t[$addr+$offset], $src;", []>; 3031 def _ari_64 : NVPTXInst< 3032 (outs), 3033 (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, 3034 LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr, i32imm:$offset), 3035 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" 3036 " \t[$addr+$offset], $src;", []>; 3037 def _asi : NVPTXInst< 3038 (outs), 3039 (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, 3040 LdStCode:$Sign, i32imm:$toWidth, imem:$addr, i32imm:$offset), 3041 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" 3042 " \t[$addr+$offset], $src;", []>; 3043} 3044 3045let mayStore=1, hasSideEffects=0 in { 3046 defm ST_i8 : ST<Int16Regs>; 3047 defm ST_i16 : ST<Int16Regs>; 3048 defm ST_i32 : ST<Int32Regs>; 3049 defm ST_i64 : ST<Int64Regs>; 3050 defm ST_f32 : ST<Float32Regs>; 3051 defm ST_f64 : ST<Float64Regs>; 3052} 3053 3054// The following is used only in and after vector elementizations. Vector 3055// elementization happens at the machine instruction level, so the following 3056// instructions never appear in the DAG. 3057multiclass LD_VEC<NVPTXRegClass regclass> { 3058 def _v2_avar : NVPTXInst< 3059 (outs regclass:$dst1, regclass:$dst2), 3060 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 3061 i32imm:$fromWidth, imem:$addr), 3062 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 3063 "\t{{$dst1, $dst2}}, [$addr];", []>; 3064 def _v2_areg : NVPTXInst< 3065 (outs regclass:$dst1, regclass:$dst2), 3066 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 3067 i32imm:$fromWidth, Int32Regs:$addr), 3068 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 3069 "\t{{$dst1, $dst2}}, [$addr];", []>; 3070 def _v2_areg_64 : NVPTXInst< 3071 (outs regclass:$dst1, regclass:$dst2), 3072 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 3073 i32imm:$fromWidth, Int64Regs:$addr), 3074 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 3075 "\t{{$dst1, $dst2}}, [$addr];", []>; 3076 def _v2_ari : NVPTXInst< 3077 (outs regclass:$dst1, regclass:$dst2), 3078 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 3079 i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), 3080 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 3081 "\t{{$dst1, $dst2}}, [$addr+$offset];", []>; 3082 def _v2_ari_64 : NVPTXInst< 3083 (outs regclass:$dst1, regclass:$dst2), 3084 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 3085 i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), 3086 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 3087 "\t{{$dst1, $dst2}}, [$addr+$offset];", []>; 3088 def _v2_asi : NVPTXInst< 3089 (outs regclass:$dst1, regclass:$dst2), 3090 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 3091 i32imm:$fromWidth, imem:$addr, i32imm:$offset), 3092 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 3093 "\t{{$dst1, $dst2}}, [$addr+$offset];", []>; 3094 def _v4_avar : NVPTXInst< 3095 (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), 3096 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 3097 i32imm:$fromWidth, imem:$addr), 3098 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 3099 "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>; 3100 def _v4_areg : NVPTXInst< 3101 (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), 3102 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 3103 i32imm:$fromWidth, Int32Regs:$addr), 3104 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 3105 "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>; 3106 def _v4_areg_64 : NVPTXInst< 3107 (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), 3108 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 3109 i32imm:$fromWidth, Int64Regs:$addr), 3110 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 3111 "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>; 3112 def _v4_ari : NVPTXInst< 3113 (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), 3114 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 3115 i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), 3116 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 3117 "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>; 3118 def _v4_ari_64 : NVPTXInst< 3119 (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), 3120 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 3121 i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), 3122 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 3123 "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>; 3124 def _v4_asi : NVPTXInst< 3125 (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), 3126 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 3127 i32imm:$fromWidth, imem:$addr, i32imm:$offset), 3128 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 3129 "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>; 3130} 3131let mayLoad=1, hasSideEffects=0 in { 3132 defm LDV_i8 : LD_VEC<Int16Regs>; 3133 defm LDV_i16 : LD_VEC<Int16Regs>; 3134 defm LDV_i32 : LD_VEC<Int32Regs>; 3135 defm LDV_i64 : LD_VEC<Int64Regs>; 3136 defm LDV_f32 : LD_VEC<Float32Regs>; 3137 defm LDV_f64 : LD_VEC<Float64Regs>; 3138} 3139 3140multiclass ST_VEC<NVPTXRegClass regclass> { 3141 def _v2_avar : NVPTXInst< 3142 (outs), 3143 (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, 3144 LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr), 3145 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 3146 "\t[$addr], {{$src1, $src2}};", []>; 3147 def _v2_areg : NVPTXInst< 3148 (outs), 3149 (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, 3150 LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr), 3151 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 3152 "\t[$addr], {{$src1, $src2}};", []>; 3153 def _v2_areg_64 : NVPTXInst< 3154 (outs), 3155 (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, 3156 LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr), 3157 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 3158 "\t[$addr], {{$src1, $src2}};", []>; 3159 def _v2_ari : NVPTXInst< 3160 (outs), 3161 (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, 3162 LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, 3163 i32imm:$offset), 3164 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 3165 "\t[$addr+$offset], {{$src1, $src2}};", []>; 3166 def _v2_ari_64 : NVPTXInst< 3167 (outs), 3168 (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, 3169 LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, 3170 i32imm:$offset), 3171 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 3172 "\t[$addr+$offset], {{$src1, $src2}};", []>; 3173 def _v2_asi : NVPTXInst< 3174 (outs), 3175 (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, 3176 LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, 3177 i32imm:$offset), 3178 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 3179 "\t[$addr+$offset], {{$src1, $src2}};", []>; 3180 def _v4_avar : NVPTXInst< 3181 (outs), 3182 (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, 3183 LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 3184 i32imm:$fromWidth, imem:$addr), 3185 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 3186 "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>; 3187 def _v4_areg : NVPTXInst< 3188 (outs), 3189 (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, 3190 LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 3191 i32imm:$fromWidth, Int32Regs:$addr), 3192 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 3193 "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>; 3194 def _v4_areg_64 : NVPTXInst< 3195 (outs), 3196 (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, 3197 LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 3198 i32imm:$fromWidth, Int64Regs:$addr), 3199 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 3200 "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>; 3201 def _v4_ari : NVPTXInst< 3202 (outs), 3203 (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, 3204 LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 3205 i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), 3206 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 3207 "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>; 3208 def _v4_ari_64 : NVPTXInst< 3209 (outs), 3210 (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, 3211 LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 3212 i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), 3213 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 3214 "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>; 3215 def _v4_asi : NVPTXInst< 3216 (outs), 3217 (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, 3218 LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 3219 i32imm:$fromWidth, imem:$addr, i32imm:$offset), 3220 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}" 3221 "$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>; 3222} 3223 3224let mayStore=1, hasSideEffects=0 in { 3225 defm STV_i8 : ST_VEC<Int16Regs>; 3226 defm STV_i16 : ST_VEC<Int16Regs>; 3227 defm STV_i32 : ST_VEC<Int32Regs>; 3228 defm STV_i64 : ST_VEC<Int64Regs>; 3229 defm STV_f32 : ST_VEC<Float32Regs>; 3230 defm STV_f64 : ST_VEC<Float64Regs>; 3231} 3232 3233//---- Conversion ---- 3234 3235class F_BITCONVERT<string SzStr, ValueType TIn, ValueType TOut, 3236 NVPTXRegClass regclassIn = ValueToRegClass<TIn>.ret, 3237 NVPTXRegClass regclassOut = ValueToRegClass<TOut>.ret> : 3238 NVPTXInst<(outs regclassOut:$d), (ins regclassIn:$a), 3239 !strconcat("mov.b", SzStr, " \t$d, $a;"), 3240 [(set (TOut regclassOut:$d), (bitconvert (TIn regclassIn:$a)))]>; 3241 3242def BITCONVERT_32_I2F : F_BITCONVERT<"32", i32, f32>; 3243def BITCONVERT_32_F2I : F_BITCONVERT<"32", f32, i32>; 3244def BITCONVERT_64_I2F : F_BITCONVERT<"64", i64, f64>; 3245def BITCONVERT_64_F2I : F_BITCONVERT<"64", f64, i64>; 3246 3247foreach vt = [v2f16, v2bf16, v2i16, v4i8] in { 3248def: Pat<(vt (bitconvert (f32 Float32Regs:$a))), 3249 (BITCONVERT_32_F2I Float32Regs:$a)>; 3250def: Pat<(f32 (bitconvert (vt Int32Regs:$a))), 3251 (BITCONVERT_32_I2F Int32Regs:$a)>; 3252} 3253foreach vt = [f16, bf16] in { 3254def: Pat<(vt (bitconvert (i16 UInt16Const:$a))), 3255 (IMOVB16ri UInt16Const:$a)>; 3256def: Pat<(vt (bitconvert (i16 Int16Regs:$a))), 3257 (ProxyRegI16 Int16Regs:$a)>; 3258def: Pat<(i16 (bitconvert (vt Int16Regs:$a))), 3259 (ProxyRegI16 Int16Regs:$a)>; 3260} 3261 3262foreach ta = [v2f16, v2bf16, v2i16, v4i8, i32] in { 3263 def: Pat<(ta (bitconvert (i32 UInt32Const:$a))), 3264 (IMOVB32ri UInt32Const:$a)>; 3265 foreach tb = [v2f16, v2bf16, v2i16, v4i8, i32] in { 3266 if !ne(ta, tb) then { 3267 def: Pat<(ta (bitconvert (tb Int32Regs:$a))), 3268 (ProxyRegI32 Int32Regs:$a)>; 3269 } 3270 } 3271} 3272 3273// NOTE: pred->fp are currently sub-optimal due to an issue in TableGen where 3274// we cannot specify floating-point literals in isel patterns. Therefore, we 3275// use an integer selp to select either 1 or 0 and then cvt to floating-point. 3276 3277// sint -> f16 3278def : Pat<(f16 (sint_to_fp Int1Regs:$a)), 3279 (CVT_f16_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>; 3280def : Pat<(f16 (sint_to_fp Int16Regs:$a)), 3281 (CVT_f16_s16 Int16Regs:$a, CvtRN)>; 3282def : Pat<(f16 (sint_to_fp Int32Regs:$a)), 3283 (CVT_f16_s32 Int32Regs:$a, CvtRN)>; 3284def : Pat<(f16 (sint_to_fp Int64Regs:$a)), 3285 (CVT_f16_s64 Int64Regs:$a, CvtRN)>; 3286 3287// uint -> f16 3288def : Pat<(f16 (uint_to_fp Int1Regs:$a)), 3289 (CVT_f16_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>; 3290def : Pat<(f16 (uint_to_fp Int16Regs:$a)), 3291 (CVT_f16_u16 Int16Regs:$a, CvtRN)>; 3292def : Pat<(f16 (uint_to_fp Int32Regs:$a)), 3293 (CVT_f16_u32 Int32Regs:$a, CvtRN)>; 3294def : Pat<(f16 (uint_to_fp Int64Regs:$a)), 3295 (CVT_f16_u64 Int64Regs:$a, CvtRN)>; 3296 3297// sint -> bf16 3298def : Pat<(bf16 (sint_to_fp Int1Regs:$a)), 3299 (CVT_bf16_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>; 3300def : Pat<(bf16 (sint_to_fp Int16Regs:$a)), 3301 (CVT_bf16_s16 Int16Regs:$a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>; 3302def : Pat<(bf16 (sint_to_fp Int32Regs:$a)), 3303 (CVT_bf16_s32 Int32Regs:$a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>; 3304def : Pat<(bf16 (sint_to_fp Int64Regs:$a)), 3305 (CVT_bf16_s64 Int64Regs:$a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>; 3306 3307// uint -> bf16 3308def : Pat<(bf16 (uint_to_fp Int1Regs:$a)), 3309 (CVT_bf16_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>; 3310def : Pat<(bf16 (uint_to_fp Int16Regs:$a)), 3311 (CVT_bf16_u16 Int16Regs:$a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>; 3312def : Pat<(bf16 (uint_to_fp Int32Regs:$a)), 3313 (CVT_bf16_u32 Int32Regs:$a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>; 3314def : Pat<(bf16 (uint_to_fp Int64Regs:$a)), 3315 (CVT_bf16_u64 Int64Regs:$a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>; 3316 3317// sint -> f32 3318def : Pat<(f32 (sint_to_fp Int1Regs:$a)), 3319 (CVT_f32_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>; 3320def : Pat<(f32 (sint_to_fp Int16Regs:$a)), 3321 (CVT_f32_s16 Int16Regs:$a, CvtRN)>; 3322def : Pat<(f32 (sint_to_fp Int32Regs:$a)), 3323 (CVT_f32_s32 Int32Regs:$a, CvtRN)>; 3324def : Pat<(f32 (sint_to_fp Int64Regs:$a)), 3325 (CVT_f32_s64 Int64Regs:$a, CvtRN)>; 3326 3327// uint -> f32 3328def : Pat<(f32 (uint_to_fp Int1Regs:$a)), 3329 (CVT_f32_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>; 3330def : Pat<(f32 (uint_to_fp Int16Regs:$a)), 3331 (CVT_f32_u16 Int16Regs:$a, CvtRN)>; 3332def : Pat<(f32 (uint_to_fp Int32Regs:$a)), 3333 (CVT_f32_u32 Int32Regs:$a, CvtRN)>; 3334def : Pat<(f32 (uint_to_fp Int64Regs:$a)), 3335 (CVT_f32_u64 Int64Regs:$a, CvtRN)>; 3336 3337// sint -> f64 3338def : Pat<(f64 (sint_to_fp Int1Regs:$a)), 3339 (CVT_f64_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>; 3340def : Pat<(f64 (sint_to_fp Int16Regs:$a)), 3341 (CVT_f64_s16 Int16Regs:$a, CvtRN)>; 3342def : Pat<(f64 (sint_to_fp Int32Regs:$a)), 3343 (CVT_f64_s32 Int32Regs:$a, CvtRN)>; 3344def : Pat<(f64 (sint_to_fp Int64Regs:$a)), 3345 (CVT_f64_s64 Int64Regs:$a, CvtRN)>; 3346 3347// uint -> f64 3348def : Pat<(f64 (uint_to_fp Int1Regs:$a)), 3349 (CVT_f64_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>; 3350def : Pat<(f64 (uint_to_fp Int16Regs:$a)), 3351 (CVT_f64_u16 Int16Regs:$a, CvtRN)>; 3352def : Pat<(f64 (uint_to_fp Int32Regs:$a)), 3353 (CVT_f64_u32 Int32Regs:$a, CvtRN)>; 3354def : Pat<(f64 (uint_to_fp Int64Regs:$a)), 3355 (CVT_f64_u64 Int64Regs:$a, CvtRN)>; 3356 3357 3358// f16 -> sint 3359def : Pat<(i1 (fp_to_sint (f16 Int16Regs:$a))), 3360 (SETP_b16ri Int16Regs:$a, 0, CmpEQ)>; 3361def : Pat<(i16 (fp_to_sint (f16 Int16Regs:$a))), 3362 (CVT_s16_f16 (f16 Int16Regs:$a), CvtRZI)>; 3363def : Pat<(i32 (fp_to_sint (f16 Int16Regs:$a))), 3364 (CVT_s32_f16 (f16 Int16Regs:$a), CvtRZI)>; 3365def : Pat<(i64 (fp_to_sint (f16 Int16Regs:$a))), 3366 (CVT_s64_f16 Int16Regs:$a, CvtRZI)>; 3367 3368// f16 -> uint 3369def : Pat<(i1 (fp_to_uint (f16 Int16Regs:$a))), 3370 (SETP_b16ri Int16Regs:$a, 0, CmpEQ)>; 3371def : Pat<(i16 (fp_to_uint (f16 Int16Regs:$a))), 3372 (CVT_u16_f16 Int16Regs:$a, CvtRZI)>; 3373def : Pat<(i32 (fp_to_uint (f16 Int16Regs:$a))), 3374 (CVT_u32_f16 Int16Regs:$a, CvtRZI)>; 3375def : Pat<(i64 (fp_to_uint (f16 Int16Regs:$a))), 3376 (CVT_u64_f16 Int16Regs:$a, CvtRZI)>; 3377 3378// bf16 -> sint 3379def : Pat<(i1 (fp_to_sint (bf16 Int16Regs:$a))), 3380 (SETP_b16ri Int16Regs:$a, 0, CmpEQ)>; 3381def : Pat<(i16 (fp_to_sint (bf16 Int16Regs:$a))), 3382 (CVT_s16_bf16 (bf16 Int16Regs:$a), CvtRZI)>; 3383def : Pat<(i32 (fp_to_sint (bf16 Int16Regs:$a))), 3384 (CVT_s32_bf16 (bf16 Int16Regs:$a), CvtRZI)>; 3385def : Pat<(i64 (fp_to_sint (bf16 Int16Regs:$a))), 3386 (CVT_s64_bf16 Int16Regs:$a, CvtRZI)>; 3387 3388// bf16 -> uint 3389def : Pat<(i1 (fp_to_uint (bf16 Int16Regs:$a))), 3390 (SETP_b16ri Int16Regs:$a, 0, CmpEQ)>; 3391def : Pat<(i16 (fp_to_uint (bf16 Int16Regs:$a))), 3392 (CVT_u16_bf16 Int16Regs:$a, CvtRZI)>; 3393def : Pat<(i32 (fp_to_uint (bf16 Int16Regs:$a))), 3394 (CVT_u32_bf16 Int16Regs:$a, CvtRZI)>; 3395def : Pat<(i64 (fp_to_uint (bf16 Int16Regs:$a))), 3396 (CVT_u64_bf16 Int16Regs:$a, CvtRZI)>; 3397// f32 -> sint 3398def : Pat<(i1 (fp_to_sint Float32Regs:$a)), 3399 (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>; 3400def : Pat<(i16 (fp_to_sint Float32Regs:$a)), 3401 (CVT_s16_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>; 3402def : Pat<(i16 (fp_to_sint Float32Regs:$a)), 3403 (CVT_s16_f32 Float32Regs:$a, CvtRZI)>; 3404def : Pat<(i32 (fp_to_sint Float32Regs:$a)), 3405 (CVT_s32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>; 3406def : Pat<(i32 (fp_to_sint Float32Regs:$a)), 3407 (CVT_s32_f32 Float32Regs:$a, CvtRZI)>; 3408def : Pat<(i64 (fp_to_sint Float32Regs:$a)), 3409 (CVT_s64_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>; 3410def : Pat<(i64 (fp_to_sint Float32Regs:$a)), 3411 (CVT_s64_f32 Float32Regs:$a, CvtRZI)>; 3412 3413// f32 -> uint 3414def : Pat<(i1 (fp_to_uint Float32Regs:$a)), 3415 (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>; 3416def : Pat<(i16 (fp_to_uint Float32Regs:$a)), 3417 (CVT_u16_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>; 3418def : Pat<(i16 (fp_to_uint Float32Regs:$a)), 3419 (CVT_u16_f32 Float32Regs:$a, CvtRZI)>; 3420def : Pat<(i32 (fp_to_uint Float32Regs:$a)), 3421 (CVT_u32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>; 3422def : Pat<(i32 (fp_to_uint Float32Regs:$a)), 3423 (CVT_u32_f32 Float32Regs:$a, CvtRZI)>; 3424def : Pat<(i64 (fp_to_uint Float32Regs:$a)), 3425 (CVT_u64_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>; 3426def : Pat<(i64 (fp_to_uint Float32Regs:$a)), 3427 (CVT_u64_f32 Float32Regs:$a, CvtRZI)>; 3428 3429// f64 -> sint 3430def : Pat<(i1 (fp_to_sint Float64Regs:$a)), 3431 (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>; 3432def : Pat<(i16 (fp_to_sint Float64Regs:$a)), 3433 (CVT_s16_f64 Float64Regs:$a, CvtRZI)>; 3434def : Pat<(i32 (fp_to_sint Float64Regs:$a)), 3435 (CVT_s32_f64 Float64Regs:$a, CvtRZI)>; 3436def : Pat<(i64 (fp_to_sint Float64Regs:$a)), 3437 (CVT_s64_f64 Float64Regs:$a, CvtRZI)>; 3438 3439// f64 -> uint 3440def : Pat<(i1 (fp_to_uint Float64Regs:$a)), 3441 (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>; 3442def : Pat<(i16 (fp_to_uint Float64Regs:$a)), 3443 (CVT_u16_f64 Float64Regs:$a, CvtRZI)>; 3444def : Pat<(i32 (fp_to_uint Float64Regs:$a)), 3445 (CVT_u32_f64 Float64Regs:$a, CvtRZI)>; 3446def : Pat<(i64 (fp_to_uint Float64Regs:$a)), 3447 (CVT_u64_f64 Float64Regs:$a, CvtRZI)>; 3448 3449// sext i1 3450def : Pat<(i16 (sext Int1Regs:$a)), 3451 (SELP_s16ii -1, 0, Int1Regs:$a)>; 3452def : Pat<(i32 (sext Int1Regs:$a)), 3453 (SELP_s32ii -1, 0, Int1Regs:$a)>; 3454def : Pat<(i64 (sext Int1Regs:$a)), 3455 (SELP_s64ii -1, 0, Int1Regs:$a)>; 3456 3457// zext i1 3458def : Pat<(i16 (zext Int1Regs:$a)), 3459 (SELP_u16ii 1, 0, Int1Regs:$a)>; 3460def : Pat<(i32 (zext Int1Regs:$a)), 3461 (SELP_u32ii 1, 0, Int1Regs:$a)>; 3462def : Pat<(i64 (zext Int1Regs:$a)), 3463 (SELP_u64ii 1, 0, Int1Regs:$a)>; 3464 3465// anyext i1 3466def : Pat<(i16 (anyext Int1Regs:$a)), 3467 (SELP_u16ii -1, 0, Int1Regs:$a)>; 3468def : Pat<(i32 (anyext Int1Regs:$a)), 3469 (SELP_u32ii -1, 0, Int1Regs:$a)>; 3470def : Pat<(i64 (anyext Int1Regs:$a)), 3471 (SELP_u64ii -1, 0, Int1Regs:$a)>; 3472 3473// sext i16 3474def : Pat<(i32 (sext Int16Regs:$a)), 3475 (CVT_s32_s16 Int16Regs:$a, CvtNONE)>; 3476def : Pat<(i64 (sext Int16Regs:$a)), 3477 (CVT_s64_s16 Int16Regs:$a, CvtNONE)>; 3478 3479// zext i16 3480def : Pat<(i32 (zext Int16Regs:$a)), 3481 (CVT_u32_u16 Int16Regs:$a, CvtNONE)>; 3482def : Pat<(i64 (zext Int16Regs:$a)), 3483 (CVT_u64_u16 Int16Regs:$a, CvtNONE)>; 3484 3485// anyext i16 3486def : Pat<(i32 (anyext Int16Regs:$a)), 3487 (CVT_u32_u16 Int16Regs:$a, CvtNONE)>; 3488def : Pat<(i64 (anyext Int16Regs:$a)), 3489 (CVT_u64_u16 Int16Regs:$a, CvtNONE)>; 3490 3491// sext i32 3492def : Pat<(i64 (sext Int32Regs:$a)), 3493 (CVT_s64_s32 Int32Regs:$a, CvtNONE)>; 3494 3495// zext i32 3496def : Pat<(i64 (zext Int32Regs:$a)), 3497 (CVT_u64_u32 Int32Regs:$a, CvtNONE)>; 3498 3499// anyext i32 3500def : Pat<(i64 (anyext Int32Regs:$a)), 3501 (CVT_u64_u32 Int32Regs:$a, CvtNONE)>; 3502 3503 3504// truncate i64 3505def : Pat<(i32 (trunc Int64Regs:$a)), 3506 (CVT_u32_u64 Int64Regs:$a, CvtNONE)>; 3507def : Pat<(i16 (trunc Int64Regs:$a)), 3508 (CVT_u16_u64 Int64Regs:$a, CvtNONE)>; 3509def : Pat<(i1 (trunc Int64Regs:$a)), 3510 (SETP_b64ri (ANDb64ri Int64Regs:$a, 1), 1, CmpEQ)>; 3511 3512// truncate i32 3513def : Pat<(i16 (trunc Int32Regs:$a)), 3514 (CVT_u16_u32 Int32Regs:$a, CvtNONE)>; 3515def : Pat<(i1 (trunc Int32Regs:$a)), 3516 (SETP_b32ri (ANDb32ri Int32Regs:$a, 1), 1, CmpEQ)>; 3517 3518// truncate i16 3519def : Pat<(i1 (trunc Int16Regs:$a)), 3520 (SETP_b16ri (ANDb16ri Int16Regs:$a, 1), 1, CmpEQ)>; 3521 3522// sext_inreg 3523def : Pat<(sext_inreg Int16Regs:$a, i8), (CVT_INREG_s16_s8 Int16Regs:$a)>; 3524def : Pat<(sext_inreg Int32Regs:$a, i8), (CVT_INREG_s32_s8 Int32Regs:$a)>; 3525def : Pat<(sext_inreg Int32Regs:$a, i16), (CVT_INREG_s32_s16 Int32Regs:$a)>; 3526def : Pat<(sext_inreg Int64Regs:$a, i8), (CVT_INREG_s64_s8 Int64Regs:$a)>; 3527def : Pat<(sext_inreg Int64Regs:$a, i16), (CVT_INREG_s64_s16 Int64Regs:$a)>; 3528def : Pat<(sext_inreg Int64Regs:$a, i32), (CVT_INREG_s64_s32 Int64Regs:$a)>; 3529 3530 3531// Select instructions with 32-bit predicates 3532def : Pat<(select (i32 Int32Regs:$pred), i16:$a, i16:$b), 3533 (SELP_b16rr Int16Regs:$a, Int16Regs:$b, 3534 (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>; 3535def : Pat<(select (i32 Int32Regs:$pred), i32:$a, i32:$b), 3536 (SELP_b32rr Int32Regs:$a, Int32Regs:$b, 3537 (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>; 3538def : Pat<(select (i32 Int32Regs:$pred), Int64Regs:$a, Int64Regs:$b), 3539 (SELP_b64rr Int64Regs:$a, Int64Regs:$b, 3540 (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>; 3541def : Pat<(select (i32 Int32Regs:$pred), (f16 Int16Regs:$a), (f16 Int16Regs:$b)), 3542 (SELP_f16rr Int16Regs:$a, Int16Regs:$b, 3543 (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>; 3544def : Pat<(select (i32 Int32Regs:$pred), (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)), 3545 (SELP_bf16rr Int16Regs:$a, Int16Regs:$b, 3546 (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>; 3547def : Pat<(select (i32 Int32Regs:$pred), Float32Regs:$a, Float32Regs:$b), 3548 (SELP_f32rr Float32Regs:$a, Float32Regs:$b, 3549 (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>; 3550def : Pat<(select (i32 Int32Regs:$pred), Float64Regs:$a, Float64Regs:$b), 3551 (SELP_f64rr Float64Regs:$a, Float64Regs:$b, 3552 (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>; 3553 3554 3555let hasSideEffects = false in { 3556 // pack a set of smaller int registers to a larger int register 3557 def V4I16toI64 : NVPTXInst<(outs Int64Regs:$d), 3558 (ins Int16Regs:$s1, Int16Regs:$s2, 3559 Int16Regs:$s3, Int16Regs:$s4), 3560 "mov.b64 \t$d, {{$s1, $s2, $s3, $s4}};", []>; 3561 def V2I16toI32 : NVPTXInst<(outs Int32Regs:$d), 3562 (ins Int16Regs:$s1, Int16Regs:$s2), 3563 "mov.b32 \t$d, {{$s1, $s2}};", []>; 3564 def V2I32toI64 : NVPTXInst<(outs Int64Regs:$d), 3565 (ins Int32Regs:$s1, Int32Regs:$s2), 3566 "mov.b64 \t$d, {{$s1, $s2}};", []>; 3567 def V2I64toI128 : NVPTXInst<(outs Int128Regs:$d), 3568 (ins Int64Regs:$s1, Int64Regs:$s2), 3569 "mov.b128 \t$d, {{$s1, $s2}};", []>; 3570 def V2F32toF64 : NVPTXInst<(outs Float64Regs:$d), 3571 (ins Float32Regs:$s1, Float32Regs:$s2), 3572 "mov.b64 \t$d, {{$s1, $s2}};", []>; 3573 3574 // unpack a larger int register to a set of smaller int registers 3575 def I64toV4I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2, 3576 Int16Regs:$d3, Int16Regs:$d4), 3577 (ins Int64Regs:$s), 3578 "mov.b64 \t{{$d1, $d2, $d3, $d4}}, $s;", []>; 3579 def I32toV2I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2), 3580 (ins Int32Regs:$s), 3581 "mov.b32 \t{{$d1, $d2}}, $s;", []>; 3582 def I64toV2I32 : NVPTXInst<(outs Int32Regs:$d1, Int32Regs:$d2), 3583 (ins Int64Regs:$s), 3584 "mov.b64 \t{{$d1, $d2}}, $s;", []>; 3585 def I128toV2I64: NVPTXInst<(outs Int64Regs:$d1, Int64Regs:$d2), 3586 (ins Int128Regs:$s), 3587 "mov.b128 \t{{$d1, $d2}}, $s;", []>; 3588 def F64toV2F32 : NVPTXInst<(outs Float32Regs:$d1, Float32Regs:$d2), 3589 (ins Float64Regs:$s), 3590 "mov.b64 \t{{$d1, $d2}}, $s;", []>; 3591 3592 def I32toI16H : NVPTXInst<(outs Int16Regs:$high), 3593 (ins Int32Regs:$s), 3594 "{{ .reg .b16 tmp; mov.b32 {tmp, $high}, $s; }}", 3595 []>; 3596 def I32toI16L : NVPTXInst<(outs Int16Regs:$low), 3597 (ins Int32Regs:$s), 3598 "{{ .reg .b16 tmp; mov.b32 {$low, tmp}, $s; }}", 3599 []>; 3600 def I64toI32H : NVPTXInst<(outs Int32Regs:$high), 3601 (ins Int64Regs:$s), 3602 "{{ .reg .b32 tmp; mov.b64 {tmp, $high}, $s; }}", 3603 []>; 3604 def I64toI32L : NVPTXInst<(outs Int32Regs:$low), 3605 (ins Int64Regs:$s), 3606 "{{ .reg .b32 tmp; mov.b64 {$low, tmp}, $s; }}", 3607 []>; 3608 3609} 3610 3611// Using partial vectorized move produces better SASS code for extraction of 3612// upper/lower parts of an integer. 3613def : Pat<(i16 (trunc (srl Int32Regs:$s, (i32 16)))), 3614 (I32toI16H Int32Regs:$s)>; 3615def : Pat<(i16 (trunc (sra Int32Regs:$s, (i32 16)))), 3616 (I32toI16H Int32Regs:$s)>; 3617def : Pat<(i32 (trunc (srl Int64Regs:$s, (i32 32)))), 3618 (I64toI32H Int64Regs:$s)>; 3619def : Pat<(i32 (trunc (sra Int64Regs:$s, (i32 32)))), 3620 (I64toI32H Int64Regs:$s)>; 3621 3622def: Pat<(i32 (sext (extractelt (v2i16 Int32Regs:$src), 0))), 3623 (CVT_INREG_s32_s16 Int32Regs:$src)>; 3624 3625foreach vt = [v2f16, v2bf16, v2i16] in { 3626def : Pat<(extractelt (vt Int32Regs:$src), 0), 3627 (I32toI16L Int32Regs:$src)>; 3628def : Pat<(extractelt (vt Int32Regs:$src), 1), 3629 (I32toI16H Int32Regs:$src)>; 3630} 3631def : Pat<(v2f16 (build_vector (f16 Int16Regs:$a), (f16 Int16Regs:$b))), 3632 (V2I16toI32 Int16Regs:$a, Int16Regs:$b)>; 3633def : Pat<(v2bf16 (build_vector (bf16 Int16Regs:$a), (bf16 Int16Regs:$b))), 3634 (V2I16toI32 Int16Regs:$a, Int16Regs:$b)>; 3635def : Pat<(v2i16 (build_vector (i16 Int16Regs:$a), (i16 Int16Regs:$b))), 3636 (V2I16toI32 Int16Regs:$a, Int16Regs:$b)>; 3637 3638def: Pat<(v2i16 (scalar_to_vector (i16 Int16Regs:$a))), 3639 (CVT_u32_u16 Int16Regs:$a, CvtNONE)>; 3640 3641// Count leading zeros 3642let hasSideEffects = false in { 3643 def CLZr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a), 3644 "clz.b32 \t$d, $a;", []>; 3645 def CLZr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), 3646 "clz.b64 \t$d, $a;", []>; 3647} 3648 3649// 32-bit has a direct PTX instruction 3650def : Pat<(i32 (ctlz (i32 Int32Regs:$a))), (CLZr32 Int32Regs:$a)>; 3651 3652// The return type of the ctlz ISD node is the same as its input, but the PTX 3653// ctz instruction always returns a 32-bit value. For ctlz.i64, convert the 3654// ptx value to 64 bits to match the ISD node's semantics, unless we know we're 3655// truncating back down to 32 bits. 3656def : Pat<(i64 (ctlz Int64Regs:$a)), (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>; 3657def : Pat<(i32 (trunc (i64 (ctlz Int64Regs:$a)))), (CLZr64 Int64Regs:$a)>; 3658 3659// For 16-bit ctlz, we zero-extend to 32-bit, perform the count, then trunc the 3660// result back to 16-bits if necessary. We also need to subtract 16 because 3661// the high-order 16 zeros were counted. 3662// 3663// TODO: NVPTX has a mov.b32 b32reg, {imm, b16reg} instruction, which we could 3664// use to save one SASS instruction (on sm_35 anyway): 3665// 3666// mov.b32 $tmp, {0xffff, $a} 3667// ctlz.b32 $result, $tmp 3668// 3669// That is, instead of zero-extending the input to 32 bits, we'd "one-extend" 3670// and then ctlz that value. This way we don't have to subtract 16 from the 3671// result. Unfortunately today we don't have a way to generate 3672// "mov b32reg, {b16imm, b16reg}", so we don't do this optimization. 3673def : Pat<(i16 (ctlz Int16Regs:$a)), 3674 (SUBi16ri (CVT_u16_u32 3675 (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE), 16)>; 3676def : Pat<(i32 (zext (i16 (ctlz Int16Regs:$a)))), 3677 (SUBi32ri (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), 16)>; 3678 3679// Population count 3680let hasSideEffects = false in { 3681 def POPCr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a), 3682 "popc.b32 \t$d, $a;", []>; 3683 def POPCr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), 3684 "popc.b64 \t$d, $a;", []>; 3685} 3686 3687// 32-bit has a direct PTX instruction 3688def : Pat<(i32 (ctpop (i32 Int32Regs:$a))), (POPCr32 Int32Regs:$a)>; 3689 3690// For 64-bit, the result in PTX is actually 32-bit so we zero-extend to 64-bit 3691// to match the LLVM semantics. Just as with ctlz.i64, we provide a second 3692// pattern that avoids the type conversion if we're truncating the result to 3693// i32 anyway. 3694def : Pat<(ctpop Int64Regs:$a), (CVT_u64_u32 (POPCr64 Int64Regs:$a), CvtNONE)>; 3695def : Pat<(i32 (trunc (i64 (ctpop Int64Regs:$a)))), (POPCr64 Int64Regs:$a)>; 3696 3697// For 16-bit, we zero-extend to 32-bit, then trunc the result back to 16-bits. 3698// If we know that we're storing into an i32, we can avoid the final trunc. 3699def : Pat<(ctpop Int16Regs:$a), 3700 (CVT_u16_u32 (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE)>; 3701def : Pat<(i32 (zext (i16 (ctpop Int16Regs:$a)))), 3702 (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE))>; 3703 3704// fpround f32 -> f16 3705def : Pat<(f16 (fpround Float32Regs:$a)), 3706 (CVT_f16_f32 Float32Regs:$a, CvtRN)>; 3707 3708// fpround f32 -> bf16 3709def : Pat<(bf16 (fpround Float32Regs:$a)), 3710 (CVT_bf16_f32 Float32Regs:$a, CvtRN)>, Requires<[hasPTX<70>, hasSM<80>]>; 3711 3712// fpround f64 -> f16 3713def : Pat<(f16 (fpround Float64Regs:$a)), 3714 (CVT_f16_f64 Float64Regs:$a, CvtRN)>; 3715 3716// fpround f64 -> bf16 3717def : Pat<(bf16 (fpround Float64Regs:$a)), 3718 (CVT_bf16_f64 Float64Regs:$a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>; 3719// fpround f64 -> f32 3720def : Pat<(f32 (fpround Float64Regs:$a)), 3721 (CVT_f32_f64 Float64Regs:$a, CvtRN_FTZ)>, Requires<[doF32FTZ]>; 3722def : Pat<(f32 (fpround Float64Regs:$a)), 3723 (CVT_f32_f64 Float64Regs:$a, CvtRN)>; 3724 3725// fpextend f16 -> f32 3726def : Pat<(f32 (fpextend (f16 Int16Regs:$a))), 3727 (CVT_f32_f16 Int16Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>; 3728def : Pat<(f32 (fpextend (f16 Int16Regs:$a))), 3729 (CVT_f32_f16 Int16Regs:$a, CvtNONE)>; 3730// fpextend bf16 -> f32 3731def : Pat<(f32 (fpextend (bf16 Int16Regs:$a))), 3732 (CVT_f32_bf16 Int16Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>; 3733def : Pat<(f32 (fpextend (bf16 Int16Regs:$a))), 3734 (CVT_f32_bf16 Int16Regs:$a, CvtNONE)>, Requires<[hasPTX<71>, hasSM<80>]>; 3735 3736// fpextend f16 -> f64 3737def : Pat<(f64 (fpextend (f16 Int16Regs:$a))), 3738 (CVT_f64_f16 Int16Regs:$a, CvtNONE)>; 3739 3740// fpextend bf16 -> f64 3741def : Pat<(f64 (fpextend (bf16 Int16Regs:$a))), 3742 (CVT_f64_bf16 Int16Regs:$a, CvtNONE)>, Requires<[hasPTX<78>, hasSM<90>]>; 3743 3744// fpextend f32 -> f64 3745def : Pat<(f64 (fpextend Float32Regs:$a)), 3746 (CVT_f64_f32 Float32Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>; 3747def : Pat<(f64 (fpextend Float32Regs:$a)), 3748 (CVT_f64_f32 Float32Regs:$a, CvtNONE)>; 3749 3750def retglue : SDNode<"NVPTXISD::RET_GLUE", SDTNone, 3751 [SDNPHasChain, SDNPOptInGlue]>; 3752 3753// fceil, ffloor, froundeven, ftrunc. 3754 3755multiclass CVT_ROUND<SDNode OpNode, PatLeaf Mode, PatLeaf ModeFTZ> { 3756 def : Pat<(OpNode (f16 Int16Regs:$a)), 3757 (CVT_f16_f16 Int16Regs:$a, Mode)>; 3758 def : Pat<(OpNode (bf16 Int16Regs:$a)), 3759 (CVT_bf16_bf16 Int16Regs:$a, Mode)>; 3760 def : Pat<(OpNode Float32Regs:$a), 3761 (CVT_f32_f32 Float32Regs:$a, ModeFTZ)>, Requires<[doF32FTZ]>; 3762 def : Pat<(OpNode Float32Regs:$a), 3763 (CVT_f32_f32 Float32Regs:$a, Mode)>, Requires<[doNoF32FTZ]>; 3764 def : Pat<(OpNode Float64Regs:$a), 3765 (CVT_f64_f64 Float64Regs:$a, Mode)>; 3766} 3767 3768defm : CVT_ROUND<fceil, CvtRPI, CvtRPI_FTZ>; 3769defm : CVT_ROUND<ffloor, CvtRMI, CvtRMI_FTZ>; 3770defm : CVT_ROUND<froundeven, CvtRNI, CvtRNI_FTZ>; 3771defm : CVT_ROUND<ftrunc, CvtRZI, CvtRZI_FTZ>; 3772 3773// nearbyint and rint are implemented as rounding to nearest even. This isn't 3774// strictly correct, because it causes us to ignore the rounding mode. But it 3775// matches what CUDA's "libm" does. 3776 3777defm : CVT_ROUND<fnearbyint, CvtRNI, CvtRNI_FTZ>; 3778defm : CVT_ROUND<frint, CvtRNI, CvtRNI_FTZ>; 3779 3780//----------------------------------- 3781// Control-flow 3782//----------------------------------- 3783 3784let isTerminator=1 in { 3785 let isReturn=1, isBarrier=1 in 3786 def Return : NVPTXInst<(outs), (ins), "ret;", [(retglue)]>; 3787 3788 let isBranch=1 in 3789 def CBranch : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target), 3790 "@$a bra \t$target;", 3791 [(brcond Int1Regs:$a, bb:$target)]>; 3792 let isBranch=1 in 3793 def CBranchOther : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target), 3794 "@!$a bra \t$target;", []>; 3795 3796 let isBranch=1, isBarrier=1 in 3797 def GOTO : NVPTXInst<(outs), (ins brtarget:$target), 3798 "bra.uni \t$target;", [(br bb:$target)]>; 3799} 3800 3801def : Pat<(brcond (i32 Int32Regs:$a), bb:$target), 3802 (CBranch (SETP_u32ri Int32Regs:$a, 0, CmpNE), bb:$target)>; 3803 3804// SelectionDAGBuilder::visitSWitchCase() will invert the condition of a 3805// conditional branch if the target block is the next block so that the code 3806// can fall through to the target block. The invertion is done by 'xor 3807// condition, 1', which will be translated to (setne condition, -1). Since ptx 3808// supports '@!pred bra target', we should use it. 3809def : Pat<(brcond (i1 (setne Int1Regs:$a, -1)), bb:$target), 3810 (CBranchOther Int1Regs:$a, bb:$target)>; 3811 3812// Call 3813def SDT_NVPTXCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>, 3814 SDTCisVT<1, i32>]>; 3815def SDT_NVPTXCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>; 3816 3817def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_NVPTXCallSeqStart, 3818 [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>; 3819def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_NVPTXCallSeqEnd, 3820 [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, 3821 SDNPSideEffect]>; 3822 3823def SDT_NVPTXCall : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>; 3824def call : SDNode<"NVPTXISD::CALL", SDT_NVPTXCall, 3825 [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; 3826def calltarget : Operand<i32>; 3827let isCall=1 in { 3828 def CALL : NVPTXInst<(outs), (ins calltarget:$dst), "call \t$dst, (1);", []>; 3829} 3830 3831def : Pat<(call tglobaladdr:$dst), (CALL tglobaladdr:$dst)>; 3832def : Pat<(call texternalsym:$dst), (CALL texternalsym:$dst)>; 3833 3834// Pseudo instructions. 3835class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern> 3836 : NVPTXInst<outs, ins, asmstr, pattern>; 3837 3838def Callseq_Start : 3839 NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2), 3840 "\\{ // callseq $amt1, $amt2", 3841 [(callseq_start timm:$amt1, timm:$amt2)]>; 3842def Callseq_End : 3843 NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2), 3844 "\\} // callseq $amt1", 3845 [(callseq_end timm:$amt1, timm:$amt2)]>; 3846 3847// trap instruction 3848// Emit an `exit` as well to convey to ptxas that `trap` exits the CFG. 3849// This won't be necessary in a future version of ptxas. 3850def trapinst : NVPTXInst<(outs), (ins), "trap; exit;", [(trap)]>; 3851 3852// Call prototype wrapper 3853def SDTCallPrototype : SDTypeProfile<0, 1, [SDTCisInt<0>]>; 3854def CallPrototype : 3855 SDNode<"NVPTXISD::CallPrototype", SDTCallPrototype, 3856 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; 3857def ProtoIdent : Operand<i32> { 3858 let PrintMethod = "printProtoIdent"; 3859} 3860def CALL_PROTOTYPE : 3861 NVPTXInst<(outs), (ins ProtoIdent:$ident), 3862 "$ident", [(CallPrototype (i32 texternalsym:$ident))]>; 3863 3864def SDTDynAllocaOp : 3865 SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, SDTCisInt<1>, SDTCisInt<2>]>; 3866 3867def dyn_alloca : 3868 SDNode<"NVPTXISD::DYNAMIC_STACKALLOC", SDTDynAllocaOp, 3869 [SDNPHasChain, SDNPSideEffect]>; 3870 3871def DYNAMIC_STACKALLOC32 : 3872 NVPTXInst<(outs Int32Regs:$ptr), 3873 (ins Int32Regs:$size, i32imm:$align), 3874 "alloca.u32 \t$ptr, $size, $align;\n\t" 3875 "cvta.local.u32 \t$ptr, $ptr;", 3876 [(set (i32 Int32Regs:$ptr), (dyn_alloca Int32Regs:$size, (i32 timm:$align)))]>, 3877 Requires<[hasPTX<73>, hasSM<52>]>; 3878 3879def DYNAMIC_STACKALLOC64 : 3880 NVPTXInst<(outs Int64Regs:$ptr), 3881 (ins Int64Regs:$size, i32imm:$align), 3882 "alloca.u64 \t$ptr, $size, $align;\n\t" 3883 "cvta.local.u64 \t$ptr, $ptr;", 3884 [(set Int64Regs:$ptr, (dyn_alloca Int64Regs:$size, (i32 timm:$align)))]>, 3885 Requires<[hasPTX<73>, hasSM<52>]>; 3886 3887include "NVPTXIntrinsics.td" 3888 3889//----------------------------------- 3890// Notes 3891//----------------------------------- 3892// BSWAP is currently expanded. The following is a more efficient 3893// - for < sm_20, use vector scalar mov, as tesla support native 16-bit register 3894// - for sm_20, use pmpt (use vector scalar mov to get the pack and 3895// unpack). sm_20 supports native 32-bit register, but not native 16-bit 3896// register. 3897 3898def : Pat < 3899 (i32 (bswap i32:$a)), 3900 (INT_NVVM_PRMT Int32Regs:$a, (i32 0), (i32 0x0123))>; 3901 3902def : Pat < 3903 (v2i16 (bswap v2i16:$a)), 3904 (INT_NVVM_PRMT Int32Regs:$a, (i32 0), (i32 0x2301))>; 3905 3906def : Pat < 3907 (i64 (bswap i64:$a)), 3908 (V2I32toI64 3909 (INT_NVVM_PRMT (I64toI32H Int64Regs:$a), (i32 0), (i32 0x0123)), 3910 (INT_NVVM_PRMT (I64toI32L Int64Regs:$a), (i32 0), (i32 0x0123)))>; 3911