//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Custom DAG lowering for SI
//
//===----------------------------------------------------------------------===//

#include "SIISelLowering.h"
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/FloatingPointMode.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/ByteProvider.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/ModRef.h"
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "si-lower"

STATISTIC(NumTailCalls, "Number of tail calls");

static cl::opt<bool> DisableLoopAlignment(
    "amdgpu-disable-loop-alignment",
    cl::desc("Do not align and prefetch loops"),
    cl::init(false));

static cl::opt<bool> UseDivergentRegisterIndexing(
    "amdgpu-use-divergent-register-indexing",
    cl::Hidden,
    cl::desc("Use indirect register addressing for divergent indexes"),
    cl::init(false));

static bool denormalModeIsFlushAllF32(const MachineFunction &MF) {
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
}

static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF) {
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
}

static unsigned findFirstFreeSGPR(CCState &CCInfo) {
  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
    if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
      return AMDGPU::SGPR0 + Reg;
    }
  }
  llvm_unreachable("Cannot allocate sgpr");
}

SITargetLowering::SITargetLowering(const TargetMachine &TM,
                                   const GCNSubtarget &STI)
    : AMDGPUTargetLowering(TM, STI),
      Subtarget(&STI) {
  addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
  addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);

  addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
  addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);

  addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);

  const SIRegisterInfo *TRI = STI.getRegisterInfo();
  const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();

  addRegisterClass(MVT::f64, V64RegClass);
  addRegisterClass(MVT::v2f32, V64RegClass);
  addRegisterClass(MVT::Untyped, V64RegClass);

  addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
  addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));

  addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
  addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);

  addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
  addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));

  addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
  addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));

  addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
  addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));

  addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
  addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));

  addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
  addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));

  addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
  addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));

  addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
  addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));

  addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
  addRegisterClass(MVT::v9f32, TRI->getVGPRClassForBitWidth(288));

  addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
  addRegisterClass(MVT::v10f32, TRI->getVGPRClassForBitWidth(320));

  addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
  addRegisterClass(MVT::v11f32, TRI->getVGPRClassForBitWidth(352));

  addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
  addRegisterClass(MVT::v12f32, TRI->getVGPRClassForBitWidth(384));

  addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
  addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));

  addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
  addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));

  addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
  addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));

  if (Subtarget->has16BitInsts()) {
    if (Subtarget->useRealTrue16Insts()) {
      addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
      addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
      addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
    } else {
      addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
      addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
      addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
    }

    // Unless there are also VOP3P operations, not all operations are really
    // legal.
    addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
    addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
    addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
    addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
    addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
    addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
    addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
    addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
    addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
    addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
    addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
    addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
    addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
    addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
    addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
  }

  addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
  addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));

  computeRegisterProperties(Subtarget->getRegisterInfo());

  // The boolean content concept here is too inflexible. Compares only ever
  // really produce a 1-bit result. Any copy/extend from these will turn into a
  // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
  // it's what most targets use.
  setBooleanContents(ZeroOrOneBooleanContent);
  setBooleanVectorContents(ZeroOrOneBooleanContent);

  // We need to custom lower vector stores from local memory.
  setOperationAction(ISD::LOAD,
                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
                      MVT::i1, MVT::v32i32},
                     Custom);

  setOperationAction(ISD::STORE,
                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
                      MVT::i1, MVT::v32i32},
                     Custom);

  if (isTypeLegal(MVT::bf16)) {
    for (unsigned Opc :
         {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV,
          ISD::FREM, ISD::FMA, ISD::FMINNUM, ISD::FMAXNUM,
          ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FSQRT, ISD::FCBRT,
          ISD::FSIN, ISD::FCOS, ISD::FPOW, ISD::FPOWI,
          ISD::FLDEXP, ISD::FFREXP, ISD::FLOG, ISD::FLOG2,
          ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10,
          ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FNEARBYINT,
          ISD::FROUND, ISD::FROUNDEVEN, ISD::FFLOOR, ISD::FCANONICALIZE,
          ISD::SETCC}) {
      // FIXME: The promoted-to type shouldn't need to be explicit.
      setOperationAction(Opc, MVT::bf16, Promote);
      AddPromotedToType(Opc, MVT::bf16, MVT::f32);
    }

    setOperationAction(ISD::FP_ROUND, MVT::bf16, Expand);

    setOperationAction(ISD::SELECT, MVT::bf16, Promote);
    AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);

    setOperationAction(ISD::FABS, MVT::bf16, Legal);
    setOperationAction(ISD::FNEG, MVT::bf16, Legal);
    setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Legal);

    // We only need to custom lower because we can't specify an action for bf16
    // sources.
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
  }

  setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
  setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
  setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
  setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
  setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
  setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
  setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
  setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
  setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
  setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
  setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
  setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
  setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
  setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
  setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
  setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);

  setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
  setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
  setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
  setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
  setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
  setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
  setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);

  setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);

  setOperationAction(ISD::SELECT, MVT::i1, Promote);
  setOperationAction(ISD::SELECT, MVT::i64, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Promote);
  AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);

  setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);

  setOperationAction(ISD::SELECT_CC,
                     {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);

  setOperationAction(ISD::SETCC, MVT::i1, Promote);
  setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
  AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);

  setOperationAction(ISD::TRUNCATE,
                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
                     Expand);
  setOperationAction(ISD::FP_ROUND,
                     {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
                      MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
                      MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
                     Expand);

  setOperationAction(ISD::SIGN_EXTEND_INREG,
                     {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
                      MVT::v3i16, MVT::v4i16, MVT::Other},
                     Custom);

  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  setOperationAction(ISD::BR_CC,
                     {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);

  setOperationAction({ISD::UADDO, ISD::USUBO}, MVT::i32, Legal);

  setOperationAction({ISD::UADDO_CARRY, ISD::USUBO_CARRY}, MVT::i32, Legal);

  setOperationAction({ISD::SHL_PARTS, ISD::SRA_PARTS, ISD::SRL_PARTS}, MVT::i64,
                     Expand);

#if 0
  setOperationAction({ISD::UADDO_CARRY, ISD::USUBO_CARRY}, MVT::i64, Legal);
#endif

  // We only support LOAD/STORE and vector manipulation ops for vectors
  // with > 4 elements.
  for (MVT VT :
       {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
        MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
        MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
        MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
        MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
        MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
        MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
        MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
    for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
      switch (Op) {
      case ISD::LOAD:
      case ISD::STORE:
      case ISD::BUILD_VECTOR:
      case ISD::BITCAST:
      case ISD::UNDEF:
      case ISD::EXTRACT_VECTOR_ELT:
      case ISD::INSERT_VECTOR_ELT:
      case ISD::SCALAR_TO_VECTOR:
      case ISD::IS_FPCLASS:
        break;
      case ISD::EXTRACT_SUBVECTOR:
      case ISD::INSERT_SUBVECTOR:
      case ISD::CONCAT_VECTORS:
        setOperationAction(Op, VT, Custom);
        break;
      default:
        setOperationAction(Op, VT, Expand);
        break;
      }
    }
  }

  setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Expand);

  // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
  // is expanded to avoid having two separate loops in case the index is a VGPR.

  // Most operations are naturally 32-bit vector operations. We only support
  // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
  for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
    setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);

    setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);

    setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
  }

  for (MVT Vec64 : { MVT::v3i64, MVT::v3f64 }) {
    setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);

    setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);

    setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
  }

  for (MVT Vec64 : { MVT::v4i64, MVT::v4f64 }) {
    setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);

    setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);

    setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
  }

  for (MVT Vec64 : { MVT::v8i64, MVT::v8f64 }) {
    setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);

    setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);

    setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
  }

  for (MVT Vec64 : { MVT::v16i64, MVT::v16f64 }) {
    setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);

    setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);

    setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
  }

  setOperationAction(ISD::VECTOR_SHUFFLE,
                     {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32},
                     Expand);

  setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
                     Custom);

  // Avoid stack access for these.
  // TODO: Generalize to more vector types.
  setOperationAction({ISD::EXTRACT_VECTOR_ELT, ISD::INSERT_VECTOR_ELT},
                     {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
                      MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
                     Custom);

  // Deal with vec3 vector operations when widened to vec4.
  setOperationAction(ISD::INSERT_SUBVECTOR,
                     {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);

  // Deal with vec5/6/7 vector operations when widened to vec8.
  setOperationAction(ISD::INSERT_SUBVECTOR,
                     {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
                      MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
                      MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
                      MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
                     Custom);

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and
  // output demarshalling.
  setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);

  // We can't return success/failure, only the old value, so let LLVM add the
  // comparison.
  setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, {MVT::i32, MVT::i64},
                     Expand);

  setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);

  setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);

  // FIXME: This should be narrowed to i32, but that only happens if i64 is
  // illegal.
  // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
  setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);

  // This is s_memtime on SI and s_memrealtime on VI.
  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);

  if (Subtarget->hasSMemRealTime() ||
      Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11)
    setOperationAction(ISD::READSTEADYCOUNTER, MVT::i64, Legal);
  setOperationAction({ISD::TRAP, ISD::DEBUGTRAP}, MVT::Other, Custom);

  if (Subtarget->has16BitInsts()) {
    setOperationAction({ISD::FPOW, ISD::FPOWI}, MVT::f16, Promote);
    setOperationAction({ISD::FLOG, ISD::FEXP, ISD::FLOG10}, MVT::f16, Custom);
  } else {
    setOperationAction(ISD::FSQRT, MVT::f16, Custom);
  }

  if (Subtarget->hasMadMacF32Insts())
    setOperationAction(ISD::FMAD, MVT::f32, Legal);

  if (!Subtarget->hasBFI())
    // fcopysign can be done in a single instruction with BFI.
    setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand);

  if (!Subtarget->hasBCNT(32))
    setOperationAction(ISD::CTPOP, MVT::i32, Expand);

  if (!Subtarget->hasBCNT(64))
    setOperationAction(ISD::CTPOP, MVT::i64, Expand);

  if (Subtarget->hasFFBH())
    setOperationAction({ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, MVT::i32, Custom);

  if (Subtarget->hasFFBL())
    setOperationAction({ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF}, MVT::i32, Custom);

  // We only really have 32-bit BFE instructions (and 16-bit on VI).
  //
  // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
  // effort to match them now. We want this to be false for i64 cases when the
  // extraction isn't restricted to the upper or lower half. Ideally we would
  // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
  // span the midpoint are probably relatively rare, so don't worry about them
  // for now.
  if (Subtarget->hasBFE())
    setHasExtractBitsInsn(true);

  // Clamp modifier on add/sub.
  if (Subtarget->hasIntClamp())
    setOperationAction({ISD::UADDSAT, ISD::USUBSAT}, MVT::i32, Legal);

  if (Subtarget->hasAddNoCarry())
    setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
                       Legal);

  setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, {MVT::f32, MVT::f64},
                     Custom);

  // These are really only legal for ieee_mode functions. We should be avoiding
  // them for functions that don't have ieee_mode enabled, so just say they are
  // legal.
  setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE},
                     {MVT::f32, MVT::f64}, Legal);

  if (Subtarget->haveRoundOpsF64())
    setOperationAction({ISD::FTRUNC, ISD::FCEIL, ISD::FROUNDEVEN}, MVT::f64,
                       Legal);
  else
    setOperationAction({ISD::FCEIL, ISD::FTRUNC, ISD::FROUNDEVEN, ISD::FFLOOR},
                       MVT::f64, Custom);

  setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
  setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
                     Legal);
  setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);

  setOperationAction({ISD::FSIN, ISD::FCOS, ISD::FDIV}, MVT::f32, Custom);
  setOperationAction(ISD::FDIV, MVT::f64, Custom);

  setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
  setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);

  // Custom lower these because we can't specify a rule based on an illegal
  // source bf16.
  setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f32, Custom);
  setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f64, Custom);

  if (Subtarget->has16BitInsts()) {
    setOperationAction({ISD::Constant, ISD::SMIN, ISD::SMAX, ISD::UMIN,
                        ISD::UMAX, ISD::UADDSAT, ISD::USUBSAT},
                       MVT::i16, Legal);

    AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);

    setOperationAction({ISD::ROTR, ISD::ROTL, ISD::SELECT_CC, ISD::BR_CC},
                       MVT::i16, Expand);

    setOperationAction({ISD::SIGN_EXTEND, ISD::SDIV, ISD::UDIV, ISD::SREM,
                        ISD::UREM, ISD::BITREVERSE, ISD::CTTZ,
                        ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF,
                        ISD::CTPOP},
                       MVT::i16, Promote);

    setOperationAction(ISD::LOAD, MVT::i16, Custom);

    setTruncStoreAction(MVT::i64, MVT::i16, Expand);

    setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote);
    AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
    setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
    AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);

    setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::i16, Custom);
    setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i16, Custom);

    setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i32, Custom);

    // F16 - Constant Actions.
    setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
    setOperationAction(ISD::ConstantFP, MVT::bf16, Legal);

    // F16 - Load/Store Actions.
    setOperationAction(ISD::LOAD, MVT::f16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
    setOperationAction(ISD::STORE, MVT::f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);

    // BF16 - Load/Store Actions.
    setOperationAction(ISD::LOAD, MVT::bf16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
    setOperationAction(ISD::STORE, MVT::bf16, Promote);
    AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);

    // F16 - VOP1 Actions.
    setOperationAction({ISD::FP_ROUND, ISD::STRICT_FP_ROUND, ISD::FCOS,
                        ISD::FSIN, ISD::FROUND, ISD::FPTRUNC_ROUND},
                       MVT::f16, Custom);

    setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::f16, Promote);
    setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::bf16, Promote);

    // F16 - VOP2 Actions.
    setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
                       Expand);
    setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, MVT::f16, Custom);
    setOperationAction(ISD::FFREXP, MVT::f16, Custom);
    setOperationAction(ISD::FDIV, MVT::f16, Custom);

    // F16 - VOP3 Actions.
    setOperationAction(ISD::FMA, MVT::f16, Legal);
    if (STI.hasMadF16())
      setOperationAction(ISD::FMAD, MVT::f16, Legal);

    for (MVT VT :
         {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
          MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
          MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
      for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
        switch (Op) {
        case ISD::LOAD:
        case ISD::STORE:
        case ISD::BUILD_VECTOR:
        case ISD::BITCAST:
        case ISD::UNDEF:
        case ISD::EXTRACT_VECTOR_ELT:
        case ISD::INSERT_VECTOR_ELT:
        case ISD::INSERT_SUBVECTOR:
        case ISD::EXTRACT_SUBVECTOR:
        case ISD::SCALAR_TO_VECTOR:
        case ISD::IS_FPCLASS:
          break;
        case ISD::CONCAT_VECTORS:
          setOperationAction(Op, VT, Custom);
          break;
        default:
          setOperationAction(Op, VT, Expand);
          break;
        }
      }
    }

    // v_perm_b32 can handle either of these.
    setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
    setOperationAction(ISD::BSWAP, MVT::v4i16, Custom);

    // XXX - Do these do anything? Vector constants turn into build_vector.
    setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);

    setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
                       Legal);

    setOperationAction(ISD::STORE, MVT::v2i16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::STORE, MVT::v2f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);

    setOperationAction(ISD::LOAD, MVT::v2i16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::LOAD, MVT::v2f16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);

    setOperationAction(ISD::AND, MVT::v2i16, Promote);
    AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::OR, MVT::v2i16, Promote);
    AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::XOR, MVT::v2i16, Promote);
    AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);

    setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
    setOperationAction(ISD::LOAD, MVT::v4f16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
    setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);

    setOperationAction(ISD::STORE, MVT::v4i16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
    setOperationAction(ISD::STORE, MVT::v4f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
    setOperationAction(ISD::STORE, MVT::v4bf16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);

    setOperationAction(ISD::LOAD, MVT::v8i16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
    setOperationAction(ISD::LOAD, MVT::v8f16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
    setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);

    setOperationAction(ISD::STORE, MVT::v4i16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
    setOperationAction(ISD::STORE, MVT::v4f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);

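    // Note (illustrative, added for clarity): the wider packed 16-bit types
    // below follow the same promote-to-an-equal-width-i32-vector pattern, so
    // e.g. a v8f16 store is issued through the existing v4i32 store lowering.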
    setOperationAction(ISD::STORE, MVT::v8i16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
    setOperationAction(ISD::STORE, MVT::v8f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
    setOperationAction(ISD::STORE, MVT::v8bf16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);

    setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
    setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
    setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);

    setOperationAction(ISD::STORE, MVT::v16i16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
    setOperationAction(ISD::STORE, MVT::v16f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
    setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);

    setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
    setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
    setOperationAction(ISD::LOAD, MVT::v32bf16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);

    setOperationAction(ISD::STORE, MVT::v32i16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
    setOperationAction(ISD::STORE, MVT::v32f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
    setOperationAction(ISD::STORE, MVT::v32bf16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);

    setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
                       MVT::v2i32, Expand);
    setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);

    setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
                       MVT::v4i32, Expand);

    setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
                       MVT::v8i32, Expand);

    setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
                       Subtarget->hasVOP3PInsts() ? Legal : Custom);

    setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
    // This isn't really legal, but this avoids the legalizer unrolling it (and
    // allows matching fneg (fabs x) patterns).
    setOperationAction(ISD::FABS, MVT::v2f16, Legal);

    setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, MVT::f16, Custom);
    setOperationAction({ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, MVT::f16, Legal);

    setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE},
                       {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
                       Custom);

    setOperationAction({ISD::FMINNUM, ISD::FMAXNUM},
                       {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
                       Expand);

    for (MVT Vec16 :
         {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
          MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
      setOperationAction(
          {ISD::BUILD_VECTOR, ISD::EXTRACT_VECTOR_ELT, ISD::SCALAR_TO_VECTOR},
          Vec16, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT, Vec16, Expand);
    }
  }

  if (Subtarget->hasVOP3PInsts()) {
    setOperationAction({ISD::ADD, ISD::SUB, ISD::MUL, ISD::SHL, ISD::SRL,
                        ISD::SRA, ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX,
                        ISD::UADDSAT, ISD::USUBSAT, ISD::SADDSAT, ISD::SSUBSAT},
                       MVT::v2i16, Legal);

    setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FMINNUM_IEEE,
                        ISD::FMAXNUM_IEEE, ISD::FCANONICALIZE},
                       MVT::v2f16, Legal);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT,
                       {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);

    setOperationAction(ISD::VECTOR_SHUFFLE,
                       {MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::v8i16,
                        MVT::v16f16, MVT::v16i16, MVT::v32f16, MVT::v32i16},
                       Custom);

    for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
      // Split vector operations.
      setOperationAction({ISD::SHL, ISD::SRA, ISD::SRL, ISD::ADD, ISD::SUB,
                          ISD::MUL, ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN,
                          ISD::UMAX, ISD::UADDSAT, ISD::SADDSAT, ISD::USUBSAT,
                          ISD::SSUBSAT},
                         VT, Custom);

    for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
      // Split vector operations.
      setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FCANONICALIZE},
                         VT, Custom);

    setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, {MVT::v2f16, MVT::v4f16},
                       Custom);

    setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
    setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
                       Custom);

    if (Subtarget->hasPackedFP32Ops()) {
      setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FNEG},
                         MVT::v2f32, Legal);
      setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA},
                         {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
                         Custom);
    }
  }

  setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v4f16, Custom);

  if (Subtarget->has16BitInsts()) {
    setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
    AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::SELECT, MVT::v2f16, Promote);
    AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
  } else {
    // Legalization hack.
    setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);

    setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v2f16, Custom);
  }

  setOperationAction(ISD::SELECT,
                     {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
                      MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
                      MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
                      MVT::v32f16, MVT::v32bf16},
                     Custom);

  setOperationAction({ISD::SMULO, ISD::UMULO}, MVT::i64, Custom);

  if (Subtarget->hasScalarSMulU64())
    setOperationAction(ISD::MUL, MVT::i64, Custom);

  if (Subtarget->hasMad64_32())
    setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, MVT::i32, Custom);

  if (Subtarget->hasPrefetch())
    setOperationAction(ISD::PREFETCH, MVT::Other, Custom);

  if (Subtarget->hasIEEEMinMax()) {
    setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM},
                       {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
    setOperationAction({ISD::FMINIMUM, ISD::FMAXIMUM},
                       {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
                       Custom);
  }

  setOperationAction(ISD::INTRINSIC_WO_CHAIN,
                     {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
                      MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
                      MVT::i8},
                     Custom);

  setOperationAction(ISD::INTRINSIC_W_CHAIN,
                     {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
                      MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
                      MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
                      MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
                     Custom);

  setOperationAction(ISD::INTRINSIC_VOID,
                     {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
                      MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
                      MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
                      MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
                     Custom);

  setOperationAction(ISD::STACKSAVE, MVT::Other, Custom);
  setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom);
  setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
  setOperationAction(ISD::GET_FPENV, MVT::i64, Custom);
  setOperationAction(ISD::SET_FPENV, MVT::i64, Custom);

  // TODO: Could move this to custom lowering, could benefit from combines on
  // extract of relevant bits.
  setOperationAction(ISD::GET_FPMODE, MVT::i32, Legal);

  setOperationAction(ISD::MUL, MVT::i1, Promote);

  setTargetDAGCombine({ISD::ADD,
                       ISD::UADDO_CARRY,
                       ISD::SUB,
                       ISD::USUBO_CARRY,
                       ISD::FADD,
                       ISD::FSUB,
                       ISD::FDIV,
                       ISD::FMINNUM,
                       ISD::FMAXNUM,
                       ISD::FMINNUM_IEEE,
                       ISD::FMAXNUM_IEEE,
                       ISD::FMINIMUM,
                       ISD::FMAXIMUM,
                       ISD::FMA,
                       ISD::SMIN,
                       ISD::SMAX,
                       ISD::UMIN,
                       ISD::UMAX,
                       ISD::SETCC,
                       ISD::AND,
                       ISD::OR,
                       ISD::XOR,
                       ISD::FSHR,
                       ISD::SINT_TO_FP,
                       ISD::UINT_TO_FP,
                       ISD::FCANONICALIZE,
                       ISD::SCALAR_TO_VECTOR,
                       ISD::ZERO_EXTEND,
                       ISD::SIGN_EXTEND_INREG,
                       ISD::EXTRACT_VECTOR_ELT,
                       ISD::INSERT_VECTOR_ELT,
                       ISD::FCOPYSIGN});

  if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
    setTargetDAGCombine(ISD::FP_ROUND);

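  // Note (added for clarity): nodes registered with setTargetDAGCombine above
  // are dispatched to this target's PerformDAGCombine hook by the generic
  // DAG combiner.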
  // All memory operations. Some folding on the pointer operand is done to help
  // matching the constant offsets in the addressing modes.
  setTargetDAGCombine({ISD::LOAD,
                       ISD::STORE,
                       ISD::ATOMIC_LOAD,
                       ISD::ATOMIC_STORE,
                       ISD::ATOMIC_CMP_SWAP,
                       ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS,
                       ISD::ATOMIC_SWAP,
                       ISD::ATOMIC_LOAD_ADD,
                       ISD::ATOMIC_LOAD_SUB,
                       ISD::ATOMIC_LOAD_AND,
                       ISD::ATOMIC_LOAD_OR,
                       ISD::ATOMIC_LOAD_XOR,
                       ISD::ATOMIC_LOAD_NAND,
                       ISD::ATOMIC_LOAD_MIN,
                       ISD::ATOMIC_LOAD_MAX,
                       ISD::ATOMIC_LOAD_UMIN,
                       ISD::ATOMIC_LOAD_UMAX,
                       ISD::ATOMIC_LOAD_FADD,
                       ISD::ATOMIC_LOAD_FMIN,
                       ISD::ATOMIC_LOAD_FMAX,
                       ISD::ATOMIC_LOAD_UINC_WRAP,
                       ISD::ATOMIC_LOAD_UDEC_WRAP,
                       ISD::INTRINSIC_VOID,
                       ISD::INTRINSIC_W_CHAIN});

  // FIXME: In other contexts we pretend this is a per-function property.
  setStackPointerRegisterToSaveRestore(AMDGPU::SGPR32);

  setSchedulingPreference(Sched::RegPressure);
}

const GCNSubtarget *SITargetLowering::getSubtarget() const {
  return Subtarget;
}

ArrayRef<MCPhysReg> SITargetLowering::getRoundingControlRegisters() const {
  static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
  return RCRegs;
}

//===----------------------------------------------------------------------===//
// TargetLowering queries
//===----------------------------------------------------------------------===//

// v_mad_mix* support a conversion from f16 to f32.
//
// There is only one special case where this would also be OK to use with
// denormals enabled, and we don't currently handle it.
bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
                                       EVT DestVT, EVT SrcVT) const {
  return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
          (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
         DestVT.getScalarType() == MVT::f32 &&
         SrcVT.getScalarType() == MVT::f16 &&
         // TODO: This probably only requires no input flushing?
         denormalModeIsFlushAllF32(DAG.getMachineFunction());
}

bool SITargetLowering::isFPExtFoldable(const MachineInstr &MI, unsigned Opcode,
                                       LLT DestTy, LLT SrcTy) const {
  return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
          (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
         DestTy.getScalarSizeInBits() == 32 &&
         SrcTy.getScalarSizeInBits() == 16 &&
         // TODO: This probably only requires no input flushing?
         denormalModeIsFlushAllF32(*MI.getMF());
}

bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
  // SI has some legal vector types, but no legal vector operations. Say no
  // shuffles are legal in order to prefer scalarizing some vector operations.
  return false;
}

MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
                                                    CallingConv::ID CC,
                                                    EVT VT) const {
  if (CC == CallingConv::AMDGPU_KERNEL)
    return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);

  if (VT.isVector()) {
    EVT ScalarVT = VT.getScalarType();
    unsigned Size = ScalarVT.getSizeInBits();
    if (Size == 16) {
      if (Subtarget->has16BitInsts()) {
        if (VT.isInteger())
          return MVT::v2i16;
        return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
      }
      return VT.isInteger() ? MVT::i32 : MVT::f32;
    }

    if (Size < 16)
      return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
    return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
  }

  if (VT.getSizeInBits() > 32)
    return MVT::i32;

  return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
}
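// Worked example (illustrative, not exhaustive): for a non-kernel calling
// convention on a subtarget with 16-bit instructions, a v4f16 argument reports
// MVT::v2f16 above, and getNumRegistersForCallingConv below returns
// (4 + 1) / 2 == 2, i.e. the value is passed in two packed half registers.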
unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
                                                         CallingConv::ID CC,
                                                         EVT VT) const {
  if (CC == CallingConv::AMDGPU_KERNEL)
    return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);

  if (VT.isVector()) {
    unsigned NumElts = VT.getVectorNumElements();
    EVT ScalarVT = VT.getScalarType();
    unsigned Size = ScalarVT.getSizeInBits();

    // FIXME: Should probably promote 8-bit vectors to i16.
    if (Size == 16 && Subtarget->has16BitInsts())
      return (NumElts + 1) / 2;

    if (Size <= 32)
      return NumElts;

    if (Size > 32)
      return NumElts * ((Size + 31) / 32);
  } else if (VT.getSizeInBits() > 32)
    return (VT.getSizeInBits() + 31) / 32;

  return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
}

unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
    LLVMContext &Context, CallingConv::ID CC,
    EVT VT, EVT &IntermediateVT,
    unsigned &NumIntermediates, MVT &RegisterVT) const {
  if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
    unsigned NumElts = VT.getVectorNumElements();
    EVT ScalarVT = VT.getScalarType();
    unsigned Size = ScalarVT.getSizeInBits();
    // FIXME: We should fix the ABI to be the same on targets without 16-bit
    // support, but unless we can properly handle 3-vectors, it will still be
    // inconsistent.
    if (Size == 16 && Subtarget->has16BitInsts()) {
      if (ScalarVT == MVT::bf16) {
        RegisterVT = MVT::i32;
        IntermediateVT = MVT::v2bf16;
      } else {
        RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
        IntermediateVT = RegisterVT;
      }
      NumIntermediates = (NumElts + 1) / 2;
      return NumIntermediates;
    }

    if (Size == 32) {
      RegisterVT = ScalarVT.getSimpleVT();
      IntermediateVT = RegisterVT;
      NumIntermediates = NumElts;
      return NumIntermediates;
    }

    if (Size < 16 && Subtarget->has16BitInsts()) {
      // FIXME: Should probably form v2i16 pieces
      RegisterVT = MVT::i16;
      IntermediateVT = ScalarVT;
      NumIntermediates = NumElts;
      return NumIntermediates;
    }

    if (Size != 16 && Size <= 32) {
      RegisterVT = MVT::i32;
      IntermediateVT = ScalarVT;
      NumIntermediates = NumElts;
      return NumIntermediates;
    }

    if (Size > 32) {
      RegisterVT = MVT::i32;
      IntermediateVT = RegisterVT;
      NumIntermediates = NumElts * ((Size + 31) / 32);
      return NumIntermediates;
    }
  }

  return TargetLowering::getVectorTypeBreakdownForCallingConv(
      Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
}

static EVT memVTFromLoadIntrData(const SITargetLowering &TLI,
                                 const DataLayout &DL, Type *Ty,
                                 unsigned MaxNumLanes) {
  assert(MaxNumLanes != 0);

  LLVMContext &Ctx = Ty->getContext();
  if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
    unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
    return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()),
                            NumElts);
  }

  return TLI.getValueType(DL, Ty);
}

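// For example (illustrative): memVTFromLoadIntrData clamps a <4 x float>
// return type to v2f32 when MaxNumLanes == 2; non-vector types pass through
// unchanged.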
// Peek through TFE struct returns to only use the data size.
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI,
                                   const DataLayout &DL, Type *Ty,
                                   unsigned MaxNumLanes) {
  auto *ST = dyn_cast<StructType>(Ty);
  if (!ST)
    return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);

  // TFE intrinsics return an aggregate type.
  assert(ST->getNumContainedTypes() == 2 &&
         ST->getContainedType(1)->isIntegerTy(32));
  return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
}

/// Map address space 7 to MVT::v5i32 because that's its in-memory
/// representation. This return value is vector-typed because there is no
/// MVT::i160 and it is not clear if one can be added. While this could
/// cause issues during codegen, these address space 7 pointers will be
/// rewritten away by then. Therefore, we can return MVT::v5i32 in order
/// to allow pre-codegen passes that query TargetTransformInfo, often for cost
/// modeling, to work.
MVT SITargetLowering::getPointerTy(const DataLayout &DL, unsigned AS) const {
  if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
    return MVT::v5i32;
  if (AMDGPUAS::BUFFER_STRIDED_POINTER == AS &&
      DL.getPointerSizeInBits(AS) == 192)
    return MVT::v6i32;
  return AMDGPUTargetLowering::getPointerTy(DL, AS);
}

/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
/// v8i32 when padding is added.
/// The in-memory representation of a p9 is {p8, i32, i32}, which is
/// also v8i32 with padding.
MVT SITargetLowering::getPointerMemTy(const DataLayout &DL, unsigned AS) const {
  if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
       DL.getPointerSizeInBits(AS) == 160) ||
      (AMDGPUAS::BUFFER_STRIDED_POINTER == AS &&
       DL.getPointerSizeInBits(AS) == 192))
    return MVT::v8i32;
  return AMDGPUTargetLowering::getPointerMemTy(DL, AS);
}

bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                          const CallInst &CI,
                                          MachineFunction &MF,
                                          unsigned IntrID) const {
  Info.flags = MachineMemOperand::MONone;
  if (CI.hasMetadata(LLVMContext::MD_invariant_load))
    Info.flags |= MachineMemOperand::MOInvariant;

  if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
          AMDGPU::lookupRsrcIntrinsic(IntrID)) {
    AttributeList Attr = Intrinsic::getAttributes(CI.getContext(),
                                                  (Intrinsic::ID)IntrID);
    MemoryEffects ME = Attr.getMemoryEffects();
    if (ME.doesNotAccessMemory())
      return false;

    // TODO: Should images get their own address space?
    Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;

    const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = nullptr;
    if (RsrcIntr->IsImage) {
      const AMDGPU::ImageDimIntrinsicInfo *Intr =
          AMDGPU::getImageDimIntrinsicInfo(IntrID);
      BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
      Info.align.reset();
    }

    Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
    if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
      if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
        // We conservatively set the memory operand of a buffer intrinsic to the
        // base resource pointer, so that we can access alias information about
        // those pointers. Cases like "this points at the same value
        // but with a different offset" are handled in
        // areMemAccessesTriviallyDisjoint.
        Info.ptrVal = RsrcArg;
    }

    auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
    if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
      Info.flags |= MachineMemOperand::MOVolatile;
    Info.flags |= MachineMemOperand::MODereferenceable;
    if (ME.onlyReadsMemory()) {
      if (RsrcIntr->IsImage) {
        unsigned MaxNumLanes = 4;

        if (!BaseOpcode->Gather4) {
          // If this isn't a gather, we may have excess loaded elements in the
          // IR type. Check the dmask for the real number of elements loaded.
          unsigned DMask
            = cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
          MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
        }

        Info.memVT = memVTFromLoadIntrReturn(*this, MF.getDataLayout(),
                                             CI.getType(), MaxNumLanes);
      } else {
        Info.memVT =
            memVTFromLoadIntrReturn(*this, MF.getDataLayout(), CI.getType(),
                                    std::numeric_limits<unsigned>::max());
      }

      // FIXME: What does alignment mean for an image?
      Info.opc = ISD::INTRINSIC_W_CHAIN;
      Info.flags |= MachineMemOperand::MOLoad;
    } else if (ME.onlyWritesMemory()) {
      Info.opc = ISD::INTRINSIC_VOID;

      Type *DataTy = CI.getArgOperand(0)->getType();
      if (RsrcIntr->IsImage) {
        unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
        unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
        Info.memVT = memVTFromLoadIntrData(*this, MF.getDataLayout(), DataTy,
                                           DMaskLanes);
      } else
        Info.memVT = getValueType(MF.getDataLayout(), DataTy);

      Info.flags |= MachineMemOperand::MOStore;
    } else {
      // Atomic or NoReturn Sampler
      Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID :
                                            ISD::INTRINSIC_W_CHAIN;
      Info.flags |= MachineMemOperand::MOLoad |
                    MachineMemOperand::MOStore |
                    MachineMemOperand::MODereferenceable;

      switch (IntrID) {
      default:
        if (RsrcIntr->IsImage && BaseOpcode->NoReturn) {
          // Fake memory access type for no return sampler intrinsics.
          Info.memVT = MVT::i32;
        } else {
          // XXX - Should this be volatile without known ordering?
          Info.flags |= MachineMemOperand::MOVolatile;
          Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
        }
        break;
      case Intrinsic::amdgcn_raw_buffer_load_lds:
      case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
      case Intrinsic::amdgcn_struct_buffer_load_lds:
      case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
        unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
        Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
        Info.ptrVal = CI.getArgOperand(1);
        return true;
      }
      case Intrinsic::amdgcn_raw_atomic_buffer_load:
      case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load: {
        Info.memVT =
            memVTFromLoadIntrReturn(*this, MF.getDataLayout(), CI.getType(),
                                    std::numeric_limits<unsigned>::max());
        Info.flags &= ~MachineMemOperand::MOStore;
        return true;
      }
      }
    }
    return true;
  }

  switch (IntrID) {
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(CI.getType());
    Info.ptrVal = CI.getOperand(0);
    Info.align.reset();
    Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;

    const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
    if (!Vol->isZero())
      Info.flags |= MachineMemOperand::MOVolatile;

    return true;
  }
  case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
  case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
    Info.ptrVal = nullptr;
    Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
    return true;
  }
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(CI.getType());
    Info.ptrVal = CI.getOperand(0);
    Info.align.reset();
    Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;

    const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
    if (!Vol->isZero())
      Info.flags |= MachineMemOperand::MOVolatile;

    return true;
  }
  case Intrinsic::amdgcn_global_atomic_csub: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(CI.getType());
    Info.ptrVal = CI.getOperand(0);
    Info.align.reset();
    Info.flags |= MachineMemOperand::MOLoad |
                  MachineMemOperand::MOStore |
                  MachineMemOperand::MOVolatile;
    return true;
  }
  case Intrinsic::amdgcn_image_bvh_intersect_ray: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(CI.getType()); // XXX: what is correct VT?

    Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
    Info.align.reset();
    Info.flags |= MachineMemOperand::MOLoad |
                  MachineMemOperand::MODereferenceable;
    return true;
  }
  case Intrinsic::amdgcn_global_atomic_fadd:
  case Intrinsic::amdgcn_global_atomic_fmin:
  case Intrinsic::amdgcn_global_atomic_fmax:
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
  case Intrinsic::amdgcn_flat_atomic_fadd:
  case Intrinsic::amdgcn_flat_atomic_fmin:
  case Intrinsic::amdgcn_flat_atomic_fmax:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_flat_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
  case Intrinsic::amdgcn_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(CI.getType());
    Info.ptrVal = CI.getOperand(0);
    Info.align.reset();
    Info.flags |= MachineMemOperand::MOLoad |
                  MachineMemOperand::MOStore |
                  MachineMemOperand::MODereferenceable |
                  MachineMemOperand::MOVolatile;
    return true;
  }
  case Intrinsic::amdgcn_global_load_tr_b64:
  case Intrinsic::amdgcn_global_load_tr_b128: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(CI.getType());
    Info.ptrVal = CI.getOperand(0);
    Info.align.reset();
    Info.flags |= MachineMemOperand::MOLoad;
    return true;
  }
  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all: {
    Info.opc = ISD::INTRINSIC_VOID;

    const GCNTargetMachine &TM =
        static_cast<const GCNTargetMachine &>(getTargetMachine());

    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    Info.ptrVal = MFI->getGWSPSV(TM);

    // This is an abstract access, but we need to specify a type and size.
    Info.memVT = MVT::i32;
    Info.size = 4;
    Info.align = Align(4);

    if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
      Info.flags |= MachineMemOperand::MOLoad;
    else
      Info.flags |= MachineMemOperand::MOStore;
    return true;
  }
  case Intrinsic::amdgcn_global_load_lds: {
    Info.opc = ISD::INTRINSIC_VOID;
    unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
    Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
    Info.ptrVal = CI.getArgOperand(1);
    Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
    return true;
  }
  case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;

    const GCNTargetMachine &TM =
        static_cast<const GCNTargetMachine &>(getTargetMachine());

    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    Info.ptrVal = MFI->getGWSPSV(TM);

    // This is an abstract access, but we need to specify a type and size.
    Info.memVT = MVT::i32;
    Info.size = 4;
    Info.align = Align(4);

    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
    return true;
  }
  default:
    return false;
  }
}

void SITargetLowering::CollectTargetIntrinsicOperands(
    const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
  switch (cast<IntrinsicInst>(I).getIntrinsicID()) {
  case Intrinsic::amdgcn_addrspacecast_nonnull: {
    // The DAG's ValueType loses the addrspaces.
    // Add them as 2 extra Constant operands "from" and "to".
    unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
    unsigned DstAS = I.getType()->getPointerAddressSpace();
    Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
    Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
    break;
  }
  default:
    break;
  }
}

bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
                                            SmallVectorImpl<Value*> &Ops,
                                            Type *&AccessTy) const {
  Value *Ptr = nullptr;
  switch (II->getIntrinsicID()) {
  case Intrinsic::amdgcn_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume:
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_flat_atomic_fadd:
  case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
  case Intrinsic::amdgcn_flat_atomic_fmax:
  case Intrinsic::amdgcn_flat_atomic_fmax_num:
  case Intrinsic::amdgcn_flat_atomic_fmin:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_csub:
  case Intrinsic::amdgcn_global_atomic_fadd:
  case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
  case Intrinsic::amdgcn_global_atomic_fmax:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_fmin:
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
  case Intrinsic::amdgcn_global_load_tr_b64:
  case Intrinsic::amdgcn_global_load_tr_b128:
    Ptr = II->getArgOperand(0);
    break;
  case Intrinsic::amdgcn_global_load_lds:
    Ptr = II->getArgOperand(1);
    break;
  default:
    return false;
  }
  AccessTy = II->getType();
  Ops.push_back(Ptr);
  return true;
}

bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM,
                                                 unsigned AddrSpace) const {
  if (!Subtarget->hasFlatInstOffsets()) {
    // Flat instructions do not have offsets, and only have the register
    // address.
    return AM.BaseOffs == 0 && AM.Scale == 0;
  }

  decltype(SIInstrFlags::FLAT) FlatVariant =
      AddrSpace == AMDGPUAS::GLOBAL_ADDRESS    ? SIInstrFlags::FlatGlobal
      : AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ? SIInstrFlags::FlatScratch
                                               : SIInstrFlags::FLAT;

  return AM.Scale == 0 &&
         (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
                                  AM.BaseOffs, AddrSpace, FlatVariant));
}
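// Note (illustrative summary): on subtargets with flat-global instructions,
// global addressing legality below reduces to the flat offset check above;
// otherwise the MUBUF addressing rules apply.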
On VI we still use 1521 // MUBUF instructions for the r + i addressing mode. As currently 1522 // implemented, the MUBUF instructions only work on buffer < 4GB. 1523 // It may be possible to support > 4GB buffers with MUBUF instructions, 1524 // by setting the stride value in the resource descriptor which would 1525 // increase the size limit to (stride * 4GB). However, this is risky, 1526 // because it has never been validated. 1527 return isLegalFlatAddressingMode(AM, AMDGPUAS::FLAT_ADDRESS); 1528 } 1529 1530 return isLegalMUBUFAddressingMode(AM); 1531 } 1532 1533 bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const { 1534 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and 1535 // additionally can do r + r + i with addr64. 32-bit has more addressing 1536 // mode options. Depending on the resource constant, it can also do 1537 // (i64 r0) + (i32 r1) * (i14 i). 1538 // 1539 // Private arrays end up using a scratch buffer most of the time, so also 1540 // assume those use MUBUF instructions. Scratch loads / stores are currently 1541 // implemented as mubuf instructions with offen bit set, so slightly 1542 // different than the normal addr64. 1543 const SIInstrInfo *TII = Subtarget->getInstrInfo(); 1544 if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs)) 1545 return false; 1546 1547 // FIXME: Since we can split immediate into soffset and immediate offset, 1548 // would it make sense to allow any immediate? 1549 1550 switch (AM.Scale) { 1551 case 0: // r + i or just i, depending on HasBaseReg. 1552 return true; 1553 case 1: 1554 return true; // We have r + r or r + i. 1555 case 2: 1556 if (AM.HasBaseReg) { 1557 // Reject 2 * r + r. 1558 return false; 1559 } 1560 1561 // Allow 2 * r as r + r 1562 // Or 2 * r + i is allowed as r + r + i. 1563 return true; 1564 default: // Don't allow n * r 1565 return false; 1566 } 1567 } 1568 1569 bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, 1570 const AddrMode &AM, Type *Ty, 1571 unsigned AS, Instruction *I) const { 1572 // No global is ever allowed as a base. 1573 if (AM.BaseGV) 1574 return false; 1575 1576 if (AS == AMDGPUAS::GLOBAL_ADDRESS) 1577 return isLegalGlobalAddressingMode(AM); 1578 1579 if (AS == AMDGPUAS::CONSTANT_ADDRESS || 1580 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT || 1581 AS == AMDGPUAS::BUFFER_FAT_POINTER || AS == AMDGPUAS::BUFFER_RESOURCE || 1582 AS == AMDGPUAS::BUFFER_STRIDED_POINTER) { 1583 // If the offset isn't a multiple of 4, it probably isn't going to be 1584 // correctly aligned. 1585 // FIXME: Can we get the real alignment here? 1586 if (AM.BaseOffs % 4 != 0) 1587 return isLegalMUBUFAddressingMode(AM); 1588 1589 if (!Subtarget->hasScalarSubwordLoads()) { 1590 // There are no SMRD extloads, so if we have to do a small type access we 1591 // will use a MUBUF load. 1592 // FIXME?: We also need to do this if unaligned, but we don't know the 1593 // alignment here. 1594 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4) 1595 return isLegalGlobalAddressingMode(AM); 1596 } 1597 1598 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) { 1599 // SMRD instructions have an 8-bit, dword offset on SI. 1600 if (!isUInt<8>(AM.BaseOffs / 4)) 1601 return false; 1602 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) { 1603 // On CI+, this can also be a 32-bit literal constant offset. If it fits 1604 // in 8-bits, it can use a smaller encoding. 
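      // For example, a byte offset of 1020 is dword offset 255 and still fits
      // the short 8-bit encoding, while a byte offset of 4096 (dword offset
      // 1024) needs the 32-bit literal form.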
1605 if (!isUInt<32>(AM.BaseOffs / 4)) 1606 return false; 1607 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) { 1608 // On VI, these use the SMEM format and the offset is 20-bit in bytes. 1609 if (!isUInt<20>(AM.BaseOffs)) 1610 return false; 1611 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) { 1612 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative 1613 // for S_BUFFER_* instructions). 1614 if (!isInt<21>(AM.BaseOffs)) 1615 return false; 1616 } else { 1617 // On GFX12, all offsets are signed 24-bit in bytes. 1618 if (!isInt<24>(AM.BaseOffs)) 1619 return false; 1620 } 1621 1622 if ((AS == AMDGPUAS::CONSTANT_ADDRESS || 1623 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) && 1624 AM.BaseOffs < 0) { 1625 // Scalar (non-buffer) loads can only use a negative offset if 1626 // soffset+offset is non-negative. Since the compiler can only prove that 1627 // in a few special cases, it is safer to claim that negative offsets are 1628 // not supported. 1629 return false; 1630 } 1631 1632 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg. 1633 return true; 1634 1635 if (AM.Scale == 1 && AM.HasBaseReg) 1636 return true; 1637 1638 return false; 1639 } 1640 1641 if (AS == AMDGPUAS::PRIVATE_ADDRESS) 1642 return Subtarget->enableFlatScratch() 1643 ? isLegalFlatAddressingMode(AM, AMDGPUAS::PRIVATE_ADDRESS) 1644 : isLegalMUBUFAddressingMode(AM); 1645 1646 if (AS == AMDGPUAS::LOCAL_ADDRESS || 1647 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) { 1648 // Basic, single offset DS instructions allow a 16-bit unsigned immediate 1649 // field. 1650 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have 1651 // an 8-bit dword offset but we don't know the alignment here. 1652 if (!isUInt<16>(AM.BaseOffs)) 1653 return false; 1654 1655 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg. 1656 return true; 1657 1658 if (AM.Scale == 1 && AM.HasBaseReg) 1659 return true; 1660 1661 return false; 1662 } 1663 1664 if (AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::UNKNOWN_ADDRESS_SPACE) { 1665 // For an unknown address space, this usually means that this is for some 1666 // reason being used for pure arithmetic, and not based on some addressing 1667 // computation. We don't have instructions that compute pointers with any 1668 // addressing modes, so treat them as having no offset like flat 1669 // instructions. 1670 return isLegalFlatAddressingMode(AM, AMDGPUAS::FLAT_ADDRESS); 1671 } 1672 1673 // Assume a user alias of global for unknown address spaces. 
1674     return isLegalGlobalAddressingMode(AM);
1675 }
1676
1677 bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
1678                                         const MachineFunction &MF) const {
1679   if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
1680     return (MemVT.getSizeInBits() <= 4 * 32);
1681   if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1682     unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1683     return (MemVT.getSizeInBits() <= MaxPrivateBits);
1684   }
1685   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
1686     return (MemVT.getSizeInBits() <= 2 * 32);
1687   return true;
1688 }
1689
1690 bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
1691     unsigned Size, unsigned AddrSpace, Align Alignment,
1692     MachineMemOperand::Flags Flags, unsigned *IsFast) const {
1693   if (IsFast)
1694     *IsFast = 0;
1695
1696   if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1697       AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1698     // Check if alignment requirements for ds_read/write instructions are
1699     // disabled.
1700     if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1701       return false;
1702
1703     Align RequiredAlignment(PowerOf2Ceil(Size/8)); // Natural alignment.
1704     if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1705         Alignment < RequiredAlignment)
1706       return false;
1707
1708     // Either the alignment requirements are "enabled", or there is an
1709     // unaligned-LDS-access-related hardware bug even though the alignment
1710     // requirements are "disabled". In either case, we need to check for
1711     // proper alignment.
1712     //
1713     switch (Size) {
1714     case 64:
1715       // SI has a hardware bug in the LDS / GDS bounds checking: if the base
1716       // address is negative, then the instruction is incorrectly treated as
1717       // out-of-bounds even if base + offsets is in bounds. Split vectorized
1718       // loads here to avoid emitting ds_read2_b32. We may re-combine the
1719       // load later in the SILoadStoreOptimizer.
1720       if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
1721         return false;
1722
1723       // 8-byte accesses via ds_read/write_b64 require 8-byte alignment, but
1724       // we can do a 4 byte aligned, 8 byte access in a single operation using
1725       // ds_read2/write2_b32 with adjacent offsets.
1726       RequiredAlignment = Align(4);
1727
1728       if (Subtarget->hasUnalignedDSAccessEnabled()) {
1729         // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
1730         // ds_write2_b32 depending on the alignment. In either case with either
1731         // alignment there is no faster way of doing this.
1732
1733         // The numbers returned here and below are not additive, it is a 'speed
1734         // rank'. They are just meant to be compared to decide if a certain way
1735         // of lowering an operation is faster than another. For that purpose a
1736         // naturally aligned operation gets its bitsize to indicate that "it
1737         // operates with a speed comparable to an N-bit wide load". With the
1738         // full alignment ds128 is slower than ds96 for example. If underaligned
1739         // it is comparable to the speed of a single dword access, which would
1740         // then mean 32 < 128 and it is faster to issue a wide load regardless.
1741         // 1 is simply "slow, don't do it". I.e. when comparing an aligned load
1742         // to a wider load that will no longer be aligned, the latter is slower.
1743         if (IsFast)
1744           *IsFast = (Alignment >= RequiredAlignment) ? 64
1745                     : (Alignment < Align(4)) ?
32
1746                     : 1;
1747         return true;
1748       }
1749
1750       break;
1751     case 96:
1752       if (!Subtarget->hasDS96AndDS128())
1753         return false;
1754
1755       // 12-byte accesses via ds_read/write_b96 require 16-byte alignment on
1756       // gfx8 and older.
1757
1758       if (Subtarget->hasUnalignedDSAccessEnabled()) {
1759         // Naturally aligned access is fastest. However, also report it is Fast
1760         // if memory is aligned less than DWORD. A narrow load or store will
1761         // be equally slow as a single ds_read_b96/ds_write_b96, but there will
1762         // be more of them, so overall we will pay less penalty issuing a single
1763         // instruction.
1764
1765         // See comment on the values above.
1766         if (IsFast)
1767           *IsFast = (Alignment >= RequiredAlignment) ? 96
1768                     : (Alignment < Align(4)) ? 32
1769                     : 1;
1770         return true;
1771       }
1772
1773       break;
1774     case 128:
1775       if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
1776         return false;
1777
1778       // 16-byte accesses via ds_read/write_b128 require 16-byte alignment on
1779       // gfx8 and older, but we can do an 8-byte aligned, 16-byte access in a
1780       // single operation using ds_read2/write2_b64.
1781       RequiredAlignment = Align(8);
1782
1783       if (Subtarget->hasUnalignedDSAccessEnabled()) {
1784         // Naturally aligned access is fastest. However, also report it is Fast
1785         // if memory is aligned less than DWORD. A narrow load or store will
1786         // be equally slow as a single ds_read_b128/ds_write_b128, but there
1787         // will be more of them, so overall we will pay less penalty issuing a
1788         // single instruction.
1789
1790         // See comment on the values above.
1791         if (IsFast)
1792           *IsFast = (Alignment >= RequiredAlignment) ? 128
1793                     : (Alignment < Align(4)) ? 32
1794                     : 1;
1795         return true;
1796       }
1797
1798       break;
1799     default:
1800       if (Size > 32)
1801         return false;
1802
1803       break;
1804     }
1805
1806     // See comment on the values above.
1807     // Note that we have a single-dword or sub-dword here, so if underaligned
1808     // it is the slowest possible access, hence the returned value is 0.
1809     if (IsFast)
1810       *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
1811
1812     return Alignment >= RequiredAlignment ||
1813            Subtarget->hasUnalignedDSAccessEnabled();
1814   }
1815
1816   if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
1817     bool AlignedBy4 = Alignment >= Align(4);
1818     if (IsFast)
1819       *IsFast = AlignedBy4;
1820
1821     return AlignedBy4 ||
1822            Subtarget->enableFlatScratch() ||
1823            Subtarget->hasUnalignedScratchAccess();
1824   }
1825
1826   // FIXME: We have to be conservative here and assume that flat operations
1827   // will access scratch. If we had access to the IR function, then we
1828   // could determine if any private memory was used in the function.
1829   if (AddrSpace == AMDGPUAS::FLAT_ADDRESS &&
1830       !Subtarget->hasUnalignedScratchAccess()) {
1831     bool AlignedBy4 = Alignment >= Align(4);
1832     if (IsFast)
1833       *IsFast = AlignedBy4;
1834
1835     return AlignedBy4;
1836   }
1837
1838   // So long as they are correct, wide global memory operations perform better
1839   // than multiple smaller memory ops -- even when misaligned.
1840   if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
1841     if (IsFast)
1842       *IsFast = Size;
1843
1844     return Alignment >= Align(4) ||
1845            Subtarget->hasUnalignedBufferAccessEnabled();
1846   }
1847
1848   // Smaller-than-dword values must be aligned.
1849   if (Size < 32)
1850     return false;
1851
1852   // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
1853   // byte-address are ignored, thus forcing Dword alignment.
1854 // This applies to private, global, and constant memory. 1855 if (IsFast) 1856 *IsFast = 1; 1857 1858 return Size >= 32 && Alignment >= Align(4); 1859 } 1860 1861 bool SITargetLowering::allowsMisalignedMemoryAccesses( 1862 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags, 1863 unsigned *IsFast) const { 1864 return allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace, 1865 Alignment, Flags, IsFast); 1866 } 1867 1868 EVT SITargetLowering::getOptimalMemOpType( 1869 const MemOp &Op, const AttributeList &FuncAttributes) const { 1870 // FIXME: Should account for address space here. 1871 1872 // The default fallback uses the private pointer size as a guess for a type to 1873 // use. Make sure we switch these to 64-bit accesses. 1874 1875 if (Op.size() >= 16 && 1876 Op.isDstAligned(Align(4))) // XXX: Should only do for global 1877 return MVT::v4i32; 1878 1879 if (Op.size() >= 8 && Op.isDstAligned(Align(4))) 1880 return MVT::v2i32; 1881 1882 // Use the default. 1883 return MVT::Other; 1884 } 1885 1886 bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const { 1887 const MemSDNode *MemNode = cast<MemSDNode>(N); 1888 return MemNode->getMemOperand()->getFlags() & MONoClobber; 1889 } 1890 1891 bool SITargetLowering::isNonGlobalAddrSpace(unsigned AS) { 1892 return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS || 1893 AS == AMDGPUAS::PRIVATE_ADDRESS; 1894 } 1895 1896 bool SITargetLowering::isFreeAddrSpaceCast(unsigned SrcAS, 1897 unsigned DestAS) const { 1898 // Flat -> private/local is a simple truncate. 1899 // Flat -> global is no-op 1900 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) 1901 return true; 1902 1903 const GCNTargetMachine &TM = 1904 static_cast<const GCNTargetMachine &>(getTargetMachine()); 1905 return TM.isNoopAddrSpaceCast(SrcAS, DestAS); 1906 } 1907 1908 bool SITargetLowering::isMemOpUniform(const SDNode *N) const { 1909 const MemSDNode *MemNode = cast<MemSDNode>(N); 1910 1911 return AMDGPUInstrInfo::isUniformMMO(MemNode->getMemOperand()); 1912 } 1913 1914 TargetLoweringBase::LegalizeTypeAction 1915 SITargetLowering::getPreferredVectorAction(MVT VT) const { 1916 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 && 1917 VT.getScalarType().bitsLE(MVT::i16)) 1918 return VT.isPow2VectorType() ? TypeSplitVector : TypeWidenVector; 1919 return TargetLoweringBase::getPreferredVectorAction(VT); 1920 } 1921 1922 bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, 1923 Type *Ty) const { 1924 // FIXME: Could be smarter if called for vector constants. 1925 return true; 1926 } 1927 1928 bool SITargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, 1929 unsigned Index) const { 1930 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT)) 1931 return false; 1932 1933 // TODO: Add more cases that are cheap. 1934 return Index == 0; 1935 } 1936 1937 bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const { 1938 if (Subtarget->has16BitInsts() && VT == MVT::i16) { 1939 switch (Op) { 1940 case ISD::LOAD: 1941 case ISD::STORE: 1942 1943 // These operations are done with 32-bit instructions anyway. 1944 case ISD::AND: 1945 case ISD::OR: 1946 case ISD::XOR: 1947 case ISD::SELECT: 1948 // TODO: Extensions? 1949 return true; 1950 default: 1951 return false; 1952 } 1953 } 1954 1955 // SimplifySetCC uses this function to determine whether or not it should 1956 // create setcc with i1 operands. We don't have instructions for i1 setcc. 
1957 if (VT == MVT::i1 && Op == ISD::SETCC) 1958 return false; 1959 1960 return TargetLowering::isTypeDesirableForOp(Op, VT); 1961 } 1962 1963 SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG, 1964 const SDLoc &SL, 1965 SDValue Chain, 1966 uint64_t Offset) const { 1967 const DataLayout &DL = DAG.getDataLayout(); 1968 MachineFunction &MF = DAG.getMachineFunction(); 1969 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1970 1971 const ArgDescriptor *InputPtrReg; 1972 const TargetRegisterClass *RC; 1973 LLT ArgTy; 1974 MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS); 1975 1976 std::tie(InputPtrReg, RC, ArgTy) = 1977 Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 1978 1979 // We may not have the kernarg segment argument if we have no kernel 1980 // arguments. 1981 if (!InputPtrReg) 1982 return DAG.getConstant(Offset, SL, PtrVT); 1983 1984 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); 1985 SDValue BasePtr = DAG.getCopyFromReg(Chain, SL, 1986 MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT); 1987 1988 return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset)); 1989 } 1990 1991 SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG, 1992 const SDLoc &SL) const { 1993 uint64_t Offset = getImplicitParameterOffset(DAG.getMachineFunction(), 1994 FIRST_IMPLICIT); 1995 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset); 1996 } 1997 1998 SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG, 1999 const SDLoc &SL) const { 2000 2001 Function &F = DAG.getMachineFunction().getFunction(); 2002 std::optional<uint32_t> KnownSize = 2003 AMDGPUMachineFunction::getLDSKernelIdMetadata(F); 2004 if (KnownSize.has_value()) 2005 return DAG.getConstant(*KnownSize, SL, MVT::i32); 2006 return SDValue(); 2007 } 2008 2009 SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT, 2010 const SDLoc &SL, SDValue Val, 2011 bool Signed, 2012 const ISD::InputArg *Arg) const { 2013 // First, if it is a widened vector, narrow it. 2014 if (VT.isVector() && 2015 VT.getVectorNumElements() != MemVT.getVectorNumElements()) { 2016 EVT NarrowedVT = 2017 EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(), 2018 VT.getVectorNumElements()); 2019 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val, 2020 DAG.getConstant(0, SL, MVT::i32)); 2021 } 2022 2023 // Then convert the vector elements or scalar value. 2024 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && 2025 VT.bitsLT(MemVT)) { 2026 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext; 2027 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT)); 2028 } 2029 2030 if (MemVT.isFloatingPoint()) 2031 Val = getFPExtOrFPRound(DAG, Val, SL, VT); 2032 else if (Signed) 2033 Val = DAG.getSExtOrTrunc(Val, SL, VT); 2034 else 2035 Val = DAG.getZExtOrTrunc(Val, SL, VT); 2036 2037 return Val; 2038 } 2039 2040 SDValue SITargetLowering::lowerKernargMemParameter( 2041 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain, 2042 uint64_t Offset, Align Alignment, bool Signed, 2043 const ISD::InputArg *Arg) const { 2044 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 2045 2046 // Try to avoid using an extload by loading earlier than the argument address, 2047 // and extracting the relevant bits. The load should hopefully be merged with 2048 // the previous argument. 
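  // For example, an i16 argument at byte offset 2 is handled below by loading
  // the aligned i32 dword at offset 0, shifting right by OffsetDiff * 8 = 16,
  // and truncating, instead of emitting a 2-byte extending load.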
2049 if (MemVT.getStoreSize() < 4 && Alignment < 4) { 2050 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs). 2051 int64_t AlignDownOffset = alignDown(Offset, 4); 2052 int64_t OffsetDiff = Offset - AlignDownOffset; 2053 2054 EVT IntVT = MemVT.changeTypeToInteger(); 2055 2056 // TODO: If we passed in the base kernel offset we could have a better 2057 // alignment than 4, but we don't really need it. 2058 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset); 2059 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4), 2060 MachineMemOperand::MODereferenceable | 2061 MachineMemOperand::MOInvariant); 2062 2063 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32); 2064 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt); 2065 2066 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract); 2067 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal); 2068 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg); 2069 2070 2071 return DAG.getMergeValues({ ArgVal, Load.getValue(1) }, SL); 2072 } 2073 2074 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset); 2075 SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment, 2076 MachineMemOperand::MODereferenceable | 2077 MachineMemOperand::MOInvariant); 2078 2079 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg); 2080 return DAG.getMergeValues({ Val, Load.getValue(1) }, SL); 2081 } 2082 2083 SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA, 2084 const SDLoc &SL, SDValue Chain, 2085 const ISD::InputArg &Arg) const { 2086 MachineFunction &MF = DAG.getMachineFunction(); 2087 MachineFrameInfo &MFI = MF.getFrameInfo(); 2088 2089 if (Arg.Flags.isByVal()) { 2090 unsigned Size = Arg.Flags.getByValSize(); 2091 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false); 2092 return DAG.getFrameIndex(FrameIdx, MVT::i32); 2093 } 2094 2095 unsigned ArgOffset = VA.getLocMemOffset(); 2096 unsigned ArgSize = VA.getValVT().getStoreSize(); 2097 2098 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true); 2099 2100 // Create load nodes to retrieve arguments from the stack. 
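  // Roughly, for a 32-bit value at a fixed stack offset this produces
  // something like:
  //   t1 = FrameIndex<FI>
  //   t2 = load<(load (s32) from %fixed-stack.FI)> Chain, t1
  // where the ext type chosen below selects a plain, sign-, zero-, or
  // any-extending load when the value was promoted to a wider LocVT.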
2101 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32); 2102 SDValue ArgValue; 2103 2104 // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT) 2105 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD; 2106 MVT MemVT = VA.getValVT(); 2107 2108 switch (VA.getLocInfo()) { 2109 default: 2110 break; 2111 case CCValAssign::BCvt: 2112 MemVT = VA.getLocVT(); 2113 break; 2114 case CCValAssign::SExt: 2115 ExtType = ISD::SEXTLOAD; 2116 break; 2117 case CCValAssign::ZExt: 2118 ExtType = ISD::ZEXTLOAD; 2119 break; 2120 case CCValAssign::AExt: 2121 ExtType = ISD::EXTLOAD; 2122 break; 2123 } 2124 2125 ArgValue = DAG.getExtLoad( 2126 ExtType, SL, VA.getLocVT(), Chain, FIN, 2127 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), 2128 MemVT); 2129 return ArgValue; 2130 } 2131 2132 SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG, 2133 const SIMachineFunctionInfo &MFI, 2134 EVT VT, 2135 AMDGPUFunctionArgInfo::PreloadedValue PVID) const { 2136 const ArgDescriptor *Reg = nullptr; 2137 const TargetRegisterClass *RC; 2138 LLT Ty; 2139 2140 CallingConv::ID CC = DAG.getMachineFunction().getFunction().getCallingConv(); 2141 const ArgDescriptor WorkGroupIDX = 2142 ArgDescriptor::createRegister(AMDGPU::TTMP9); 2143 // If GridZ is not programmed in an entry function then the hardware will set 2144 // it to all zeros, so there is no need to mask the GridY value in the low 2145 // order bits. 2146 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister( 2147 AMDGPU::TTMP7, 2148 AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu); 2149 const ArgDescriptor WorkGroupIDZ = 2150 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u); 2151 if (Subtarget->hasArchitectedSGPRs() && 2152 (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx)) { 2153 switch (PVID) { 2154 case AMDGPUFunctionArgInfo::WORKGROUP_ID_X: 2155 Reg = &WorkGroupIDX; 2156 RC = &AMDGPU::SReg_32RegClass; 2157 Ty = LLT::scalar(32); 2158 break; 2159 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y: 2160 Reg = &WorkGroupIDY; 2161 RC = &AMDGPU::SReg_32RegClass; 2162 Ty = LLT::scalar(32); 2163 break; 2164 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z: 2165 Reg = &WorkGroupIDZ; 2166 RC = &AMDGPU::SReg_32RegClass; 2167 Ty = LLT::scalar(32); 2168 break; 2169 default: 2170 break; 2171 } 2172 } 2173 2174 if (!Reg) 2175 std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID); 2176 if (!Reg) { 2177 if (PVID == AMDGPUFunctionArgInfo::PreloadedValue::KERNARG_SEGMENT_PTR) { 2178 // It's possible for a kernarg intrinsic call to appear in a kernel with 2179 // no allocated segment, in which case we do not add the user sgpr 2180 // argument, so just return null. 2181 return DAG.getConstant(0, SDLoc(), VT); 2182 } 2183 2184 // It's undefined behavior if a function marked with the amdgpu-no-* 2185 // attributes uses the corresponding intrinsic. 2186 return DAG.getUNDEF(VT); 2187 } 2188 2189 return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg); 2190 } 2191 2192 static void processPSInputArgs(SmallVectorImpl<ISD::InputArg> &Splits, 2193 CallingConv::ID CallConv, 2194 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped, 2195 FunctionType *FType, 2196 SIMachineFunctionInfo *Info) { 2197 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) { 2198 const ISD::InputArg *Arg = &Ins[I]; 2199 2200 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) && 2201 "vector type argument should have been split"); 2202 2203 // First check if it's a PS input addr. 
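    // PSInputNum indexes the per-pixel input enable bits that end up in
    // SPI_PS_INPUT_ADDR / SPI_PS_INPUT_ENA; e.g. bit 0 corresponds to
    // PERSP_SAMPLE_ENA and bit 11 to POS_W_FLOAT_ENA, which is why unused
    // inputs can simply be skipped below.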
2204     if (CallConv == CallingConv::AMDGPU_PS &&
2205         !Arg->Flags.isInReg() && PSInputNum <= 15) {
2206       bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
2207
2208       // Inconveniently only the first part of the split is marked as isSplit,
2209       // so skip to the end. We only want to increment PSInputNum once for the
2210       // entire split argument.
2211       if (Arg->Flags.isSplit()) {
2212         while (!Arg->Flags.isSplitEnd()) {
2213           assert((!Arg->VT.isVector() ||
2214                   Arg->VT.getScalarSizeInBits() == 16) &&
2215                  "unexpected vector split in ps argument type");
2216           if (!SkipArg)
2217             Splits.push_back(*Arg);
2218           Arg = &Ins[++I];
2219         }
2220       }
2221
2222       if (SkipArg) {
2223         // We can safely skip PS inputs.
2224         Skipped.set(Arg->getOrigArgIndex());
2225         ++PSInputNum;
2226         continue;
2227       }
2228
2229       Info->markPSInputAllocated(PSInputNum);
2230       if (Arg->Used)
2231         Info->markPSInputEnabled(PSInputNum);
2232
2233       ++PSInputNum;
2234     }
2235
2236     Splits.push_back(*Arg);
2237   }
2238 }
2239
2240 // Allocate special inputs passed in VGPRs.
2241 void SITargetLowering::allocateSpecialEntryInputVGPRs(CCState &CCInfo,
2242                                                       MachineFunction &MF,
2243                                                       const SIRegisterInfo &TRI,
2244                                                       SIMachineFunctionInfo &Info) const {
2245   const LLT S32 = LLT::scalar(32);
2246   MachineRegisterInfo &MRI = MF.getRegInfo();
2247
2248   if (Info.hasWorkItemIDX()) {
2249     Register Reg = AMDGPU::VGPR0;
2250     MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2251
2252     CCInfo.AllocateReg(Reg);
2253     unsigned Mask = (Subtarget->hasPackedTID() &&
2254                      Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2255     Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2256   }
2257
2258   if (Info.hasWorkItemIDY()) {
2259     assert(Info.hasWorkItemIDX());
2260     if (Subtarget->hasPackedTID()) {
2261       Info.setWorkItemIDY(ArgDescriptor::createRegister(AMDGPU::VGPR0,
2262                                                         0x3ff << 10));
2263     } else {
2264       unsigned Reg = AMDGPU::VGPR1;
2265       MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2266
2267       CCInfo.AllocateReg(Reg);
2268       Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2269     }
2270   }
2271
2272   if (Info.hasWorkItemIDZ()) {
2273     assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2274     if (Subtarget->hasPackedTID()) {
2275       Info.setWorkItemIDZ(ArgDescriptor::createRegister(AMDGPU::VGPR0,
2276                                                         0x3ff << 20));
2277     } else {
2278       unsigned Reg = AMDGPU::VGPR2;
2279       MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2280
2281       CCInfo.AllocateReg(Reg);
2282       Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2283     }
2284   }
2285 }
2286
2287 // Try to allocate a VGPR at the end of the argument list, or if no argument
2288 // VGPRs are left, allocate a stack slot.
2289 // If \p Mask is given, it indicates the bitfield position in the register.
2290 // If \p Arg is given, use it with the new \p Mask instead of allocating a new one.
2291 static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2292                                          ArgDescriptor Arg = ArgDescriptor()) {
2293   if (Arg.isSet())
2294     return ArgDescriptor::createArg(Arg, Mask);
2295
2296   ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2297   unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
2298   if (RegIdx == ArgVGPRs.size()) {
2299     // Spill to stack required.
2300 int64_t Offset = CCInfo.AllocateStack(4, Align(4)); 2301 2302 return ArgDescriptor::createStack(Offset, Mask); 2303 } 2304 2305 unsigned Reg = ArgVGPRs[RegIdx]; 2306 Reg = CCInfo.AllocateReg(Reg); 2307 assert(Reg != AMDGPU::NoRegister); 2308 2309 MachineFunction &MF = CCInfo.getMachineFunction(); 2310 Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); 2311 MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32)); 2312 return ArgDescriptor::createRegister(Reg, Mask); 2313 } 2314 2315 static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, 2316 const TargetRegisterClass *RC, 2317 unsigned NumArgRegs) { 2318 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32); 2319 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs); 2320 if (RegIdx == ArgSGPRs.size()) 2321 report_fatal_error("ran out of SGPRs for arguments"); 2322 2323 unsigned Reg = ArgSGPRs[RegIdx]; 2324 Reg = CCInfo.AllocateReg(Reg); 2325 assert(Reg != AMDGPU::NoRegister); 2326 2327 MachineFunction &MF = CCInfo.getMachineFunction(); 2328 MF.addLiveIn(Reg, RC); 2329 return ArgDescriptor::createRegister(Reg); 2330 } 2331 2332 // If this has a fixed position, we still should allocate the register in the 2333 // CCInfo state. Technically we could get away with this for values passed 2334 // outside of the normal argument range. 2335 static void allocateFixedSGPRInputImpl(CCState &CCInfo, 2336 const TargetRegisterClass *RC, 2337 MCRegister Reg) { 2338 Reg = CCInfo.AllocateReg(Reg); 2339 assert(Reg != AMDGPU::NoRegister); 2340 MachineFunction &MF = CCInfo.getMachineFunction(); 2341 MF.addLiveIn(Reg, RC); 2342 } 2343 2344 static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) { 2345 if (Arg) { 2346 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 2347 Arg.getRegister()); 2348 } else 2349 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32); 2350 } 2351 2352 static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) { 2353 if (Arg) { 2354 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 2355 Arg.getRegister()); 2356 } else 2357 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16); 2358 } 2359 2360 /// Allocate implicit function VGPR arguments at the end of allocated user 2361 /// arguments. 2362 void SITargetLowering::allocateSpecialInputVGPRs( 2363 CCState &CCInfo, MachineFunction &MF, 2364 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const { 2365 const unsigned Mask = 0x3ff; 2366 ArgDescriptor Arg; 2367 2368 if (Info.hasWorkItemIDX()) { 2369 Arg = allocateVGPR32Input(CCInfo, Mask); 2370 Info.setWorkItemIDX(Arg); 2371 } 2372 2373 if (Info.hasWorkItemIDY()) { 2374 Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg); 2375 Info.setWorkItemIDY(Arg); 2376 } 2377 2378 if (Info.hasWorkItemIDZ()) 2379 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg)); 2380 } 2381 2382 /// Allocate implicit function VGPR arguments in fixed registers. 
2383 void SITargetLowering::allocateSpecialInputVGPRsFixed( 2384 CCState &CCInfo, MachineFunction &MF, 2385 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const { 2386 Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31); 2387 if (!Reg) 2388 report_fatal_error("failed to allocated VGPR for implicit arguments"); 2389 2390 const unsigned Mask = 0x3ff; 2391 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask)); 2392 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10)); 2393 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20)); 2394 } 2395 2396 void SITargetLowering::allocateSpecialInputSGPRs( 2397 CCState &CCInfo, 2398 MachineFunction &MF, 2399 const SIRegisterInfo &TRI, 2400 SIMachineFunctionInfo &Info) const { 2401 auto &ArgInfo = Info.getArgInfo(); 2402 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo(); 2403 2404 // TODO: Unify handling with private memory pointers. 2405 if (UserSGPRInfo.hasDispatchPtr()) 2406 allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr); 2407 2408 const Module *M = MF.getFunction().getParent(); 2409 if (UserSGPRInfo.hasQueuePtr() && 2410 AMDGPU::getAMDHSACodeObjectVersion(*M) < AMDGPU::AMDHSA_COV5) 2411 allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr); 2412 2413 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a 2414 // constant offset from the kernarg segment. 2415 if (Info.hasImplicitArgPtr()) 2416 allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr); 2417 2418 if (UserSGPRInfo.hasDispatchID()) 2419 allocateSGPR64Input(CCInfo, ArgInfo.DispatchID); 2420 2421 // flat_scratch_init is not applicable for non-kernel functions. 2422 2423 if (Info.hasWorkGroupIDX()) 2424 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX); 2425 2426 if (Info.hasWorkGroupIDY()) 2427 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY); 2428 2429 if (Info.hasWorkGroupIDZ()) 2430 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ); 2431 2432 if (Info.hasLDSKernelId()) 2433 allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId); 2434 } 2435 2436 // Allocate special inputs passed in user SGPRs. 2437 void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo, 2438 MachineFunction &MF, 2439 const SIRegisterInfo &TRI, 2440 SIMachineFunctionInfo &Info) const { 2441 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo(); 2442 if (UserSGPRInfo.hasImplicitBufferPtr()) { 2443 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI); 2444 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass); 2445 CCInfo.AllocateReg(ImplicitBufferPtrReg); 2446 } 2447 2448 // FIXME: How should these inputs interact with inreg / custom SGPR inputs? 
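  // For illustration, if every HSA input below is requested, the user SGPRs
  // are typically handed out in order as roughly:
  //   s[0:3]   private segment buffer
  //   s[4:5]   dispatch ptr
  //   s[6:7]   queue ptr
  //   s[8:9]   kernarg segment ptr
  //   s[10:11] dispatch id
  //   s[12:13] flat scratch init
  //   s14      private segment size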
2449   if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2450     Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2451     MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2452     CCInfo.AllocateReg(PrivateSegmentBufferReg);
2453   }
2454
2455   if (UserSGPRInfo.hasDispatchPtr()) {
2456     Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2457     MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2458     CCInfo.AllocateReg(DispatchPtrReg);
2459   }
2460
2461   const Module *M = MF.getFunction().getParent();
2462   if (UserSGPRInfo.hasQueuePtr() &&
2463       AMDGPU::getAMDHSACodeObjectVersion(*M) < AMDGPU::AMDHSA_COV5) {
2464     Register QueuePtrReg = Info.addQueuePtr(TRI);
2465     MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2466     CCInfo.AllocateReg(QueuePtrReg);
2467   }
2468
2469   if (UserSGPRInfo.hasKernargSegmentPtr()) {
2470     MachineRegisterInfo &MRI = MF.getRegInfo();
2471     Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2472     CCInfo.AllocateReg(InputPtrReg);
2473
2474     Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2475     MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2476   }
2477
2478   if (UserSGPRInfo.hasDispatchID()) {
2479     Register DispatchIDReg = Info.addDispatchID(TRI);
2480     MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2481     CCInfo.AllocateReg(DispatchIDReg);
2482   }
2483
2484   if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2485     Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2486     MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2487     CCInfo.AllocateReg(FlatScratchInitReg);
2488   }
2489
2490   if (UserSGPRInfo.hasPrivateSegmentSize()) {
2491     Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
2492     MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
2493     CCInfo.AllocateReg(PrivateSegmentSizeReg);
2494   }
2495
2496   // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2497   // these from the dispatch pointer.
2498 }
2499
2500 // Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
2501 // sequential starting from the first argument.
2502 void SITargetLowering::allocatePreloadKernArgSGPRs(
2503     CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
2504     const SmallVectorImpl<ISD::InputArg> &Ins, MachineFunction &MF,
2505     const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2506   Function &F = MF.getFunction();
2507   unsigned LastExplicitArgOffset =
2508       MF.getSubtarget<GCNSubtarget>().getExplicitKernelArgOffset();
2509   GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
2510   bool InPreloadSequence = true;
2511   unsigned InIdx = 0;
2512   for (auto &Arg : F.args()) {
2513     if (!InPreloadSequence || !Arg.hasInRegAttr())
2514       break;
2515
2516     int ArgIdx = Arg.getArgNo();
2517     // Don't preload non-original args or parts not in the current preload
2518     // sequence.
2519     if (InIdx < Ins.size() && (!Ins[InIdx].isOrigArg() ||
2520                                (int)Ins[InIdx].getOrigArgIndex() != ArgIdx))
2521       break;
2522
2523     for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2524            (int)Ins[InIdx].getOrigArgIndex() == ArgIdx;
2525          InIdx++) {
2526       assert(ArgLocs[ArgIdx].isMemLoc());
2527       auto &ArgLoc = ArgLocs[InIdx];
2528       const Align KernelArgBaseAlign = Align(16);
2529       unsigned ArgOffset = ArgLoc.getLocMemOffset();
2530       Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
2531       unsigned NumAllocSGPRs =
2532           alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
2533
2534       // Arg is preloaded into the previous SGPR.
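      // e.g. for kernel(i16 inreg %a, i16 inreg %b), %b lives at byte offset 2
      // of the same kernarg dword as %a, so it reuses the SGPR already
      // allocated for %a instead of getting one of its own.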
2535 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) { 2536 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back( 2537 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]); 2538 continue; 2539 } 2540 2541 unsigned Padding = ArgOffset - LastExplicitArgOffset; 2542 unsigned PaddingSGPRs = alignTo(Padding, 4) / 4; 2543 // Check for free user SGPRs for preloading. 2544 if (PaddingSGPRs + NumAllocSGPRs + 1 /*Synthetic SGPRs*/ > 2545 SGPRInfo.getNumFreeUserSGPRs()) { 2546 InPreloadSequence = false; 2547 break; 2548 } 2549 2550 // Preload this argument. 2551 const TargetRegisterClass *RC = 2552 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32); 2553 SmallVectorImpl<MCRegister> *PreloadRegs = 2554 Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs); 2555 2556 if (PreloadRegs->size() > 1) 2557 RC = &AMDGPU::SGPR_32RegClass; 2558 for (auto &Reg : *PreloadRegs) { 2559 assert(Reg); 2560 MF.addLiveIn(Reg, RC); 2561 CCInfo.AllocateReg(Reg); 2562 } 2563 2564 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset; 2565 } 2566 } 2567 } 2568 2569 void SITargetLowering::allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, 2570 const SIRegisterInfo &TRI, 2571 SIMachineFunctionInfo &Info) const { 2572 // Always allocate this last since it is a synthetic preload. 2573 if (Info.hasLDSKernelId()) { 2574 Register Reg = Info.addLDSKernelId(); 2575 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); 2576 CCInfo.AllocateReg(Reg); 2577 } 2578 } 2579 2580 // Allocate special input registers that are initialized per-wave. 2581 void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo, 2582 MachineFunction &MF, 2583 SIMachineFunctionInfo &Info, 2584 CallingConv::ID CallConv, 2585 bool IsShader) const { 2586 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs(); 2587 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) { 2588 // Note: user SGPRs are handled by the front-end for graphics shaders 2589 // Pad up the used user SGPRs with dead inputs. 2590 2591 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately 2592 // before enabling architected SGPRs for workgroup IDs. 2593 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget"); 2594 2595 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs(); 2596 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to 2597 // rely on it to reach 16 since if we end up having no stack usage, it will 2598 // not really be added. 
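    // Worked example: with 6 preloaded user SGPRs and only workgroup ID X and
    // Y enabled (2 system SGPRs), the loop below reserves 16 - (6 + 2) = 8
    // dead inputs so that at least 16 SGPRs are always initialized.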
2599 unsigned NumRequiredSystemSGPRs = Info.hasWorkGroupIDX() + 2600 Info.hasWorkGroupIDY() + 2601 Info.hasWorkGroupIDZ() + 2602 Info.hasWorkGroupInfo(); 2603 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) { 2604 Register Reg = Info.addReservedUserSGPR(); 2605 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); 2606 CCInfo.AllocateReg(Reg); 2607 } 2608 } 2609 2610 if (!HasArchitectedSGPRs) { 2611 if (Info.hasWorkGroupIDX()) { 2612 Register Reg = Info.addWorkGroupIDX(); 2613 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); 2614 CCInfo.AllocateReg(Reg); 2615 } 2616 2617 if (Info.hasWorkGroupIDY()) { 2618 Register Reg = Info.addWorkGroupIDY(); 2619 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); 2620 CCInfo.AllocateReg(Reg); 2621 } 2622 2623 if (Info.hasWorkGroupIDZ()) { 2624 Register Reg = Info.addWorkGroupIDZ(); 2625 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); 2626 CCInfo.AllocateReg(Reg); 2627 } 2628 } 2629 2630 if (Info.hasWorkGroupInfo()) { 2631 Register Reg = Info.addWorkGroupInfo(); 2632 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); 2633 CCInfo.AllocateReg(Reg); 2634 } 2635 2636 if (Info.hasPrivateSegmentWaveByteOffset()) { 2637 // Scratch wave offset passed in system SGPR. 2638 unsigned PrivateSegmentWaveByteOffsetReg; 2639 2640 if (IsShader) { 2641 PrivateSegmentWaveByteOffsetReg = 2642 Info.getPrivateSegmentWaveByteOffsetSystemSGPR(); 2643 2644 // This is true if the scratch wave byte offset doesn't have a fixed 2645 // location. 2646 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) { 2647 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo); 2648 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg); 2649 } 2650 } else 2651 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset(); 2652 2653 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass); 2654 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg); 2655 } 2656 2657 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader || 2658 Info.getNumPreloadedSGPRs() >= 16); 2659 } 2660 2661 static void reservePrivateMemoryRegs(const TargetMachine &TM, 2662 MachineFunction &MF, 2663 const SIRegisterInfo &TRI, 2664 SIMachineFunctionInfo &Info) { 2665 // Now that we've figured out where the scratch register inputs are, see if 2666 // should reserve the arguments and use them directly. 2667 MachineFrameInfo &MFI = MF.getFrameInfo(); 2668 bool HasStackObjects = MFI.hasStackObjects(); 2669 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 2670 2671 // Record that we know we have non-spill stack objects so we don't need to 2672 // check all stack objects later. 2673 if (HasStackObjects) 2674 Info.setHasNonSpillStackObjects(true); 2675 2676 // Everything live out of a block is spilled with fast regalloc, so it's 2677 // almost certain that spilling will be required. 2678 if (TM.getOptLevel() == CodeGenOptLevel::None) 2679 HasStackObjects = true; 2680 2681 // For now assume stack access is needed in any callee functions, so we need 2682 // the scratch registers to pass in. 2683 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls(); 2684 2685 if (!ST.enableFlatScratch()) { 2686 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) { 2687 // If we have stack objects, we unquestionably need the private buffer 2688 // resource. For the Code Object V2 ABI, this will be the first 4 user 2689 // SGPR inputs. We can reserve those and use them directly. 
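      // In the common case that means the scratch resource descriptor simply
      // stays in s[0:3] (the first four user SGPRs) for the life of the
      // kernel rather than being copied anywhere else.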
2690 2691 Register PrivateSegmentBufferReg = 2692 Info.getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER); 2693 Info.setScratchRSrcReg(PrivateSegmentBufferReg); 2694 } else { 2695 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF); 2696 // We tentatively reserve the last registers (skipping the last registers 2697 // which may contain VCC, FLAT_SCR, and XNACK). After register allocation, 2698 // we'll replace these with the ones immediately after those which were 2699 // really allocated. In the prologue copies will be inserted from the 2700 // argument to these reserved registers. 2701 2702 // Without HSA, relocations are used for the scratch pointer and the 2703 // buffer resource setup is always inserted in the prologue. Scratch wave 2704 // offset is still in an input SGPR. 2705 Info.setScratchRSrcReg(ReservedBufferReg); 2706 } 2707 } 2708 2709 MachineRegisterInfo &MRI = MF.getRegInfo(); 2710 2711 // For entry functions we have to set up the stack pointer if we use it, 2712 // whereas non-entry functions get this "for free". This means there is no 2713 // intrinsic advantage to using S32 over S34 in cases where we do not have 2714 // calls but do need a frame pointer (i.e. if we are requested to have one 2715 // because frame pointer elimination is disabled). To keep things simple we 2716 // only ever use S32 as the call ABI stack pointer, and so using it does not 2717 // imply we need a separate frame pointer. 2718 // 2719 // Try to use s32 as the SP, but move it if it would interfere with input 2720 // arguments. This won't work with calls though. 2721 // 2722 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input 2723 // registers. 2724 if (!MRI.isLiveIn(AMDGPU::SGPR32)) { 2725 Info.setStackPtrOffsetReg(AMDGPU::SGPR32); 2726 } else { 2727 assert(AMDGPU::isShader(MF.getFunction().getCallingConv())); 2728 2729 if (MFI.hasCalls()) 2730 report_fatal_error("call in graphics shader with too many input SGPRs"); 2731 2732 for (unsigned Reg : AMDGPU::SGPR_32RegClass) { 2733 if (!MRI.isLiveIn(Reg)) { 2734 Info.setStackPtrOffsetReg(Reg); 2735 break; 2736 } 2737 } 2738 2739 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG) 2740 report_fatal_error("failed to find register for SP"); 2741 } 2742 2743 // hasFP should be accurate for entry functions even before the frame is 2744 // finalized, because it does not rely on the known stack size, only 2745 // properties like whether variable sized objects are present. 
2746 if (ST.getFrameLowering()->hasFP(MF)) { 2747 Info.setFrameOffsetReg(AMDGPU::SGPR33); 2748 } 2749 } 2750 2751 bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const { 2752 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); 2753 return !Info->isEntryFunction(); 2754 } 2755 2756 void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { 2757 2758 } 2759 2760 void SITargetLowering::insertCopiesSplitCSR( 2761 MachineBasicBlock *Entry, 2762 const SmallVectorImpl<MachineBasicBlock *> &Exits) const { 2763 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); 2764 2765 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent()); 2766 if (!IStart) 2767 return; 2768 2769 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 2770 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); 2771 MachineBasicBlock::iterator MBBI = Entry->begin(); 2772 for (const MCPhysReg *I = IStart; *I; ++I) { 2773 const TargetRegisterClass *RC = nullptr; 2774 if (AMDGPU::SReg_64RegClass.contains(*I)) 2775 RC = &AMDGPU::SGPR_64RegClass; 2776 else if (AMDGPU::SReg_32RegClass.contains(*I)) 2777 RC = &AMDGPU::SGPR_32RegClass; 2778 else 2779 llvm_unreachable("Unexpected register class in CSRsViaCopy!"); 2780 2781 Register NewVR = MRI->createVirtualRegister(RC); 2782 // Create copy from CSR to a virtual register. 2783 Entry->addLiveIn(*I); 2784 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR) 2785 .addReg(*I); 2786 2787 // Insert the copy-back instructions right before the terminator. 2788 for (auto *Exit : Exits) 2789 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(), 2790 TII->get(TargetOpcode::COPY), *I) 2791 .addReg(NewVR); 2792 } 2793 } 2794 2795 SDValue SITargetLowering::LowerFormalArguments( 2796 SDValue Chain, CallingConv::ID CallConv, bool isVarArg, 2797 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL, 2798 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { 2799 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); 2800 2801 MachineFunction &MF = DAG.getMachineFunction(); 2802 const Function &Fn = MF.getFunction(); 2803 FunctionType *FType = MF.getFunction().getFunctionType(); 2804 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 2805 2806 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) { 2807 DiagnosticInfoUnsupported NoGraphicsHSA( 2808 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()); 2809 DAG.getContext()->diagnose(NoGraphicsHSA); 2810 return DAG.getEntryNode(); 2811 } 2812 2813 SmallVector<ISD::InputArg, 16> Splits; 2814 SmallVector<CCValAssign, 16> ArgLocs; 2815 BitVector Skipped(Ins.size()); 2816 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, 2817 *DAG.getContext()); 2818 2819 bool IsGraphics = AMDGPU::isGraphics(CallConv); 2820 bool IsKernel = AMDGPU::isKernel(CallConv); 2821 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv); 2822 2823 if (IsGraphics) { 2824 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo(); 2825 assert(!UserSGPRInfo.hasDispatchPtr() && 2826 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() && 2827 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() && 2828 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ()); 2829 (void)UserSGPRInfo; 2830 if (!Subtarget->enableFlatScratch()) 2831 assert(!UserSGPRInfo.hasFlatScratchInit()); 2832 if ((CallConv != CallingConv::AMDGPU_CS && 2833 CallConv != CallingConv::AMDGPU_Gfx) || 2834 
!Subtarget->hasArchitectedSGPRs())
2835       assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
2836              !Info->hasWorkGroupIDZ());
2837   }
2838
2839   if (CallConv == CallingConv::AMDGPU_PS) {
2840     processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
2841
2842     // At least one interpolation mode must be enabled or else the GPU will
2843     // hang.
2844     //
2845     // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
2846     // sets PSInputAddr, they want to enable some bits after compilation based
2847     // on run-time states. Since we can't know what the final PSInputEna will
2848     // look like, we shouldn't do anything here and the user should take
2849     // responsibility for the correct programming.
2850     //
2851     // Otherwise, the following restrictions apply:
2852     // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
2853     // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
2854     //   enabled too.
2855     if ((Info->getPSInputAddr() & 0x7F) == 0 ||
2856         ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
2857       CCInfo.AllocateReg(AMDGPU::VGPR0);
2858       CCInfo.AllocateReg(AMDGPU::VGPR1);
2859       Info->markPSInputAllocated(0);
2860       Info->markPSInputEnabled(0);
2861     }
2862     if (Subtarget->isAmdPalOS()) {
2863       // For isAmdPalOS, the user does not enable some bits after compilation
2864       // based on run-time states; the register values being generated here are
2865       // the final ones set in hardware. Therefore we need to apply the
2866       // workaround to PSInputAddr and PSInputEnable together. (The case where
2867       // a bit is set in PSInputAddr but not PSInputEnable is where the
2868       // frontend set up an input arg for a particular interpolation mode, but
2869       // nothing uses that input arg. Really we should have an earlier pass
2870       // that removes such an arg.)
2871       unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
2872       if ((PsInputBits & 0x7F) == 0 ||
2873           ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
2874         Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
2875     }
2876   } else if (IsKernel) {
2877     assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
2878   } else {
2879     Splits.append(Ins.begin(), Ins.end());
2880   }
2881
2882   if (IsKernel)
2883     analyzeFormalArgumentsCompute(CCInfo, Ins);
2884
2885   if (IsEntryFunc) {
2886     allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
2887     allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
2888     if (IsKernel && Subtarget->hasKernargPreload())
2889       allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
2890
2891     allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
2892   } else if (!IsGraphics) {
2893     // For the fixed ABI, pass workitem IDs in the last argument register.
2894     allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
2895
2896     // FIXME: Sink this into allocateSpecialInputSGPRs
2897     if (!Subtarget->enableFlatScratch())
2898       CCInfo.AllocateReg(Info->getScratchRSrcReg());
2899
2900     allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
2901   }
2902
2903   if (!IsKernel) {
2904     CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
2905     CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
2906   }
2907
2908   SmallVector<SDValue, 16> Chains;
2909
2910   // FIXME: This is the minimum kernel argument alignment. We should improve
2911   // this to the maximum alignment of the arguments.
2912   //
2913   // FIXME: Alignment of explicit arguments is totally broken with a non-0
2914   // explicit kern arg offset.
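  // e.g. commonAlignment(Align(16), /*Offset=*/4) == Align(4), so an argument
  // at byte offset 4 is only assumed to be 4-byte aligned below even though
  // the kernarg segment itself is 16-byte aligned.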
2915 const Align KernelArgBaseAlign = Align(16); 2916 2917 for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) { 2918 const ISD::InputArg &Arg = Ins[i]; 2919 if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) { 2920 InVals.push_back(DAG.getUNDEF(Arg.VT)); 2921 continue; 2922 } 2923 2924 CCValAssign &VA = ArgLocs[ArgIdx++]; 2925 MVT VT = VA.getLocVT(); 2926 2927 if (IsEntryFunc && VA.isMemLoc()) { 2928 VT = Ins[i].VT; 2929 EVT MemVT = VA.getLocVT(); 2930 2931 const uint64_t Offset = VA.getLocMemOffset(); 2932 Align Alignment = commonAlignment(KernelArgBaseAlign, Offset); 2933 2934 if (Arg.Flags.isByRef()) { 2935 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset); 2936 2937 const GCNTargetMachine &TM = 2938 static_cast<const GCNTargetMachine &>(getTargetMachine()); 2939 if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS, 2940 Arg.Flags.getPointerAddrSpace())) { 2941 Ptr = DAG.getAddrSpaceCast(DL, VT, Ptr, AMDGPUAS::CONSTANT_ADDRESS, 2942 Arg.Flags.getPointerAddrSpace()); 2943 } 2944 2945 InVals.push_back(Ptr); 2946 continue; 2947 } 2948 2949 SDValue NewArg; 2950 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) { 2951 if (MemVT.getStoreSize() < 4 && Alignment < 4) { 2952 // In this case the argument is packed into the previous preload SGPR. 2953 int64_t AlignDownOffset = alignDown(Offset, 4); 2954 int64_t OffsetDiff = Offset - AlignDownOffset; 2955 EVT IntVT = MemVT.changeTypeToInteger(); 2956 2957 const SIMachineFunctionInfo *Info = 2958 MF.getInfo<SIMachineFunctionInfo>(); 2959 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); 2960 Register Reg = 2961 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0]; 2962 2963 assert(Reg); 2964 Register VReg = MRI.getLiveInVirtReg(Reg); 2965 SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32); 2966 2967 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32); 2968 SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt); 2969 2970 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract); 2971 ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal); 2972 NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal, 2973 Ins[i].Flags.isSExt(), &Ins[i]); 2974 2975 NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL); 2976 } else { 2977 const SIMachineFunctionInfo *Info = 2978 MF.getInfo<SIMachineFunctionInfo>(); 2979 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); 2980 const SmallVectorImpl<MCRegister> &PreloadRegs = 2981 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs; 2982 2983 SDValue Copy; 2984 if (PreloadRegs.size() == 1) { 2985 Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]); 2986 const TargetRegisterClass *RC = MRI.getRegClass(VReg); 2987 NewArg = DAG.getCopyFromReg( 2988 Chain, DL, VReg, 2989 EVT::getIntegerVT(*DAG.getContext(), 2990 TRI->getRegSizeInBits(*RC))); 2991 2992 } else { 2993 // If the kernarg alignment does not match the alignment of the SGPR 2994 // tuple RC that can accommodate this argument, it will be built up 2995 // via copies from from the individual SGPRs that the argument was 2996 // preloaded to. 
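              // e.g. a 64-bit argument preloaded into an odd-aligned SGPR
              // pair such as s5,s6 cannot be copied as a single SReg_64, so
              // it is reassembled below as a build_vector of two i32 copies
              // and then bitcast to MemVT.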
2997 SmallVector<SDValue, 4> Elts; 2998 for (auto Reg : PreloadRegs) { 2999 Register VReg = MRI.getLiveInVirtReg(Reg); 3000 Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32); 3001 Elts.push_back(Copy); 3002 } 3003 NewArg = 3004 DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32, 3005 PreloadRegs.size()), 3006 DL, Elts); 3007 } 3008 3009 // If the argument was preloaded to multiple consecutive 32-bit 3010 // registers because of misalignment between addressable SGPR tuples 3011 // and the argument size, we can still assume that because of kernarg 3012 // segment alignment restrictions that NewArg's size is the same as 3013 // MemVT and just do a bitcast. If MemVT is less than 32-bits we add a 3014 // truncate since we cannot preload to less than a single SGPR and the 3015 // MemVT may be smaller. 3016 EVT MemVTInt = 3017 EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()); 3018 if (MemVT.bitsLT(NewArg.getSimpleValueType())) 3019 NewArg = DAG.getNode(ISD::TRUNCATE, DL, MemVTInt, NewArg); 3020 3021 NewArg = DAG.getBitcast(MemVT, NewArg); 3022 NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg, 3023 Ins[i].Flags.isSExt(), &Ins[i]); 3024 NewArg = DAG.getMergeValues({NewArg, Chain}, DL); 3025 } 3026 } else { 3027 NewArg = 3028 lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset, 3029 Alignment, Ins[i].Flags.isSExt(), &Ins[i]); 3030 } 3031 Chains.push_back(NewArg.getValue(1)); 3032 3033 auto *ParamTy = 3034 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex())); 3035 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS && 3036 ParamTy && (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || 3037 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) { 3038 // On SI local pointers are just offsets into LDS, so they are always 3039 // less than 16-bits. On CI and newer they could potentially be 3040 // real pointers, so we can't guarantee their size. 3041 NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg, 3042 DAG.getValueType(MVT::i16)); 3043 } 3044 3045 InVals.push_back(NewArg); 3046 continue; 3047 } 3048 if (!IsEntryFunc && VA.isMemLoc()) { 3049 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg); 3050 InVals.push_back(Val); 3051 if (!Arg.Flags.isByVal()) 3052 Chains.push_back(Val.getValue(1)); 3053 continue; 3054 } 3055 3056 assert(VA.isRegLoc() && "Parameter must be in a register!"); 3057 3058 Register Reg = VA.getLocReg(); 3059 const TargetRegisterClass *RC = nullptr; 3060 if (AMDGPU::VGPR_32RegClass.contains(Reg)) 3061 RC = &AMDGPU::VGPR_32RegClass; 3062 else if (AMDGPU::SGPR_32RegClass.contains(Reg)) 3063 RC = &AMDGPU::SGPR_32RegClass; 3064 else 3065 llvm_unreachable("Unexpected register class in LowerFormalArguments!"); 3066 EVT ValVT = VA.getValVT(); 3067 3068 Reg = MF.addLiveIn(Reg, RC); 3069 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT); 3070 3071 if (Arg.Flags.isSRet()) { 3072 // The return object should be reasonably addressable. 3073 3074 // FIXME: This helps when the return is a real sret. If it is a 3075 // automatically inserted sret (i.e. CanLowerReturn returns false), an 3076 // extra copy is inserted in SelectionDAGBuilder which obscures this. 3077 unsigned NumBits 3078 = 32 - getSubtarget()->getKnownHighZeroBitsForFrameIndex(); 3079 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val, 3080 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits))); 3081 } 3082 3083 // If this is an 8 or 16-bit value, it is really passed promoted 3084 // to 32 bits. 
Insert an assert[sz]ext to capture this, then 3085 // truncate to the right size. 3086 switch (VA.getLocInfo()) { 3087 case CCValAssign::Full: 3088 break; 3089 case CCValAssign::BCvt: 3090 Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val); 3091 break; 3092 case CCValAssign::SExt: 3093 Val = DAG.getNode(ISD::AssertSext, DL, VT, Val, 3094 DAG.getValueType(ValVT)); 3095 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val); 3096 break; 3097 case CCValAssign::ZExt: 3098 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val, 3099 DAG.getValueType(ValVT)); 3100 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val); 3101 break; 3102 case CCValAssign::AExt: 3103 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val); 3104 break; 3105 default: 3106 llvm_unreachable("Unknown loc info!"); 3107 } 3108 3109 InVals.push_back(Val); 3110 } 3111 3112 // Start adding system SGPRs. 3113 if (IsEntryFunc) 3114 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics); 3115 3116 // DAG.getPass() returns nullptr when using new pass manager. 3117 // TODO: Use DAG.getMFAM() to access analysis result. 3118 if (DAG.getPass()) { 3119 auto &ArgUsageInfo = DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>(); 3120 ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo()); 3121 } 3122 3123 unsigned StackArgSize = CCInfo.getStackSize(); 3124 Info->setBytesInStackArgArea(StackArgSize); 3125 3126 return Chains.empty() ? Chain : 3127 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); 3128 } 3129 3130 // TODO: If return values can't fit in registers, we should return as many as 3131 // possible in registers before passing on stack. 3132 bool SITargetLowering::CanLowerReturn( 3133 CallingConv::ID CallConv, 3134 MachineFunction &MF, bool IsVarArg, 3135 const SmallVectorImpl<ISD::OutputArg> &Outs, 3136 LLVMContext &Context) const { 3137 // Replacing returns with sret/stack usage doesn't make sense for shaders. 3138 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn 3139 // for shaders. Vector types should be explicitly handled by CC. 3140 if (AMDGPU::isEntryFunctionCC(CallConv)) 3141 return true; 3142 3143 SmallVector<CCValAssign, 16> RVLocs; 3144 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context); 3145 if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg))) 3146 return false; 3147 3148 // We must use the stack if return would require unavailable registers. 3149 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF); 3150 unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs(); 3151 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) 3152 if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i))) 3153 return false; 3154 3155 return true; 3156 } 3157 3158 SDValue 3159 SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, 3160 bool isVarArg, 3161 const SmallVectorImpl<ISD::OutputArg> &Outs, 3162 const SmallVectorImpl<SDValue> &OutVals, 3163 const SDLoc &DL, SelectionDAG &DAG) const { 3164 MachineFunction &MF = DAG.getMachineFunction(); 3165 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 3166 3167 if (AMDGPU::isKernel(CallConv)) { 3168 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs, 3169 OutVals, DL, DAG); 3170 } 3171 3172 bool IsShader = AMDGPU::isShader(CallConv); 3173 3174 Info->setIfReturnsVoid(Outs.empty()); 3175 bool IsWaveEnd = Info->returnsVoid() && IsShader; 3176 3177 // CCValAssign - represent the assignment of the return value to a location. 
3178 SmallVector<CCValAssign, 48> RVLocs; 3179 SmallVector<ISD::OutputArg, 48> Splits; 3180 3181 // CCState - Info about the registers and stack slots. 3182 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, 3183 *DAG.getContext()); 3184 3185 // Analyze outgoing return values. 3186 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg)); 3187 3188 SDValue Glue; 3189 SmallVector<SDValue, 48> RetOps; 3190 RetOps.push_back(Chain); // Operand #0 = Chain (updated below) 3191 3192 // Copy the result values into the output registers. 3193 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E; 3194 ++I, ++RealRVLocIdx) { 3195 CCValAssign &VA = RVLocs[I]; 3196 assert(VA.isRegLoc() && "Can only return in registers!"); 3197 // TODO: Partially return in registers if return values don't fit. 3198 SDValue Arg = OutVals[RealRVLocIdx]; 3199 3200 // Copied from other backends. 3201 switch (VA.getLocInfo()) { 3202 case CCValAssign::Full: 3203 break; 3204 case CCValAssign::BCvt: 3205 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); 3206 break; 3207 case CCValAssign::SExt: 3208 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg); 3209 break; 3210 case CCValAssign::ZExt: 3211 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg); 3212 break; 3213 case CCValAssign::AExt: 3214 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg); 3215 break; 3216 default: 3217 llvm_unreachable("Unknown loc info!"); 3218 } 3219 3220 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue); 3221 Glue = Chain.getValue(1); 3222 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); 3223 } 3224 3225 // FIXME: Does sret work properly? 3226 if (!Info->isEntryFunction()) { 3227 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); 3228 const MCPhysReg *I = 3229 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); 3230 if (I) { 3231 for (; *I; ++I) { 3232 if (AMDGPU::SReg_64RegClass.contains(*I)) 3233 RetOps.push_back(DAG.getRegister(*I, MVT::i64)); 3234 else if (AMDGPU::SReg_32RegClass.contains(*I)) 3235 RetOps.push_back(DAG.getRegister(*I, MVT::i32)); 3236 else 3237 llvm_unreachable("Unexpected register class in CSRsViaCopy!"); 3238 } 3239 } 3240 } 3241 3242 // Update chain and glue. 3243 RetOps[0] = Chain; 3244 if (Glue.getNode()) 3245 RetOps.push_back(Glue); 3246 3247 unsigned Opc = AMDGPUISD::ENDPGM; 3248 if (!IsWaveEnd) 3249 Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_GLUE; 3250 return DAG.getNode(Opc, DL, MVT::Other, RetOps); 3251 } 3252 3253 SDValue SITargetLowering::LowerCallResult( 3254 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg, 3255 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL, 3256 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn, 3257 SDValue ThisVal) const { 3258 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg); 3259 3260 // Assign locations to each value returned by this call. 3261 SmallVector<CCValAssign, 16> RVLocs; 3262 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs, 3263 *DAG.getContext()); 3264 CCInfo.AnalyzeCallResult(Ins, RetCC); 3265 3266 // Copy all of the result registers out of their specified physreg. 
3267 for (CCValAssign VA : RVLocs) { 3268 SDValue Val; 3269 3270 if (VA.isRegLoc()) { 3271 Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue); 3272 Chain = Val.getValue(1); 3273 InGlue = Val.getValue(2); 3274 } else if (VA.isMemLoc()) { 3275 report_fatal_error("TODO: return values in memory"); 3276 } else 3277 llvm_unreachable("unknown argument location type"); 3278 3279 switch (VA.getLocInfo()) { 3280 case CCValAssign::Full: 3281 break; 3282 case CCValAssign::BCvt: 3283 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val); 3284 break; 3285 case CCValAssign::ZExt: 3286 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val, 3287 DAG.getValueType(VA.getValVT())); 3288 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val); 3289 break; 3290 case CCValAssign::SExt: 3291 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val, 3292 DAG.getValueType(VA.getValVT())); 3293 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val); 3294 break; 3295 case CCValAssign::AExt: 3296 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val); 3297 break; 3298 default: 3299 llvm_unreachable("Unknown loc info!"); 3300 } 3301 3302 InVals.push_back(Val); 3303 } 3304 3305 return Chain; 3306 } 3307 3308 // Add code to pass special inputs required depending on used features separate 3309 // from the explicit user arguments present in the IR. 3310 void SITargetLowering::passSpecialInputs( 3311 CallLoweringInfo &CLI, 3312 CCState &CCInfo, 3313 const SIMachineFunctionInfo &Info, 3314 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass, 3315 SmallVectorImpl<SDValue> &MemOpChains, 3316 SDValue Chain) const { 3317 // If we don't have a call site, this was a call inserted by 3318 // legalization. These can never use special inputs. 3319 if (!CLI.CB) 3320 return; 3321 3322 SelectionDAG &DAG = CLI.DAG; 3323 const SDLoc &DL = CLI.DL; 3324 const Function &F = DAG.getMachineFunction().getFunction(); 3325 3326 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); 3327 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo(); 3328 3329 const AMDGPUFunctionArgInfo *CalleeArgInfo 3330 = &AMDGPUArgumentUsageInfo::FixedABIFunctionInfo; 3331 if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) { 3332 // DAG.getPass() returns nullptr when using new pass manager. 3333 // TODO: Use DAG.getMFAM() to access analysis result. 3334 if (DAG.getPass()) { 3335 auto &ArgUsageInfo = 3336 DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>(); 3337 CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc); 3338 } 3339 } 3340 3341 // TODO: Unify with private memory register handling. This is complicated by 3342 // the fact that at least in kernels, the input argument is not necessarily 3343 // in the same location as the input. 
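  // The table below pairs each implicit input with the call-site attribute
  // that marks it as unused by the callee. When the attribute is present the
  // copy is skipped entirely; otherwise the value is forwarded from the
  // caller's own preloaded input, or synthesized where no such input exists
  // (e.g. the implicit arg pointer, which is derived from the kernarg
  // segment pointer).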
3344 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue, 3345 StringLiteral> ImplicitAttrs[] = { 3346 {AMDGPUFunctionArgInfo::DISPATCH_PTR, "amdgpu-no-dispatch-ptr"}, 3347 {AMDGPUFunctionArgInfo::QUEUE_PTR, "amdgpu-no-queue-ptr" }, 3348 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"}, 3349 {AMDGPUFunctionArgInfo::DISPATCH_ID, "amdgpu-no-dispatch-id"}, 3350 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"}, 3351 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,"amdgpu-no-workgroup-id-y"}, 3352 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,"amdgpu-no-workgroup-id-z"}, 3353 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID,"amdgpu-no-lds-kernel-id"}, 3354 }; 3355 3356 for (auto Attr : ImplicitAttrs) { 3357 const ArgDescriptor *OutgoingArg; 3358 const TargetRegisterClass *ArgRC; 3359 LLT ArgTy; 3360 3361 AMDGPUFunctionArgInfo::PreloadedValue InputID = Attr.first; 3362 3363 // If the callee does not use the attribute value, skip copying the value. 3364 if (CLI.CB->hasFnAttr(Attr.second)) 3365 continue; 3366 3367 std::tie(OutgoingArg, ArgRC, ArgTy) = 3368 CalleeArgInfo->getPreloadedValue(InputID); 3369 if (!OutgoingArg) 3370 continue; 3371 3372 const ArgDescriptor *IncomingArg; 3373 const TargetRegisterClass *IncomingArgRC; 3374 LLT Ty; 3375 std::tie(IncomingArg, IncomingArgRC, Ty) = 3376 CallerArgInfo.getPreloadedValue(InputID); 3377 assert(IncomingArgRC == ArgRC); 3378 3379 // All special arguments are ints for now. 3380 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32; 3381 SDValue InputReg; 3382 3383 if (IncomingArg) { 3384 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg); 3385 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) { 3386 // The implicit arg ptr is special because it doesn't have a corresponding 3387 // input for kernels, and is computed from the kernarg segment pointer. 3388 InputReg = getImplicitArgPtr(DAG, DL); 3389 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) { 3390 std::optional<uint32_t> Id = 3391 AMDGPUMachineFunction::getLDSKernelIdMetadata(F); 3392 if (Id.has_value()) { 3393 InputReg = DAG.getConstant(*Id, DL, ArgVT); 3394 } else { 3395 InputReg = DAG.getUNDEF(ArgVT); 3396 } 3397 } else { 3398 // We may have proven the input wasn't needed, although the ABI is 3399 // requiring it. We just need to allocate the register appropriately. 3400 InputReg = DAG.getUNDEF(ArgVT); 3401 } 3402 3403 if (OutgoingArg->isRegister()) { 3404 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg); 3405 if (!CCInfo.AllocateReg(OutgoingArg->getRegister())) 3406 report_fatal_error("failed to allocate implicit input argument"); 3407 } else { 3408 unsigned SpecialArgOffset = 3409 CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4)); 3410 SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg, 3411 SpecialArgOffset); 3412 MemOpChains.push_back(ArgStore); 3413 } 3414 } 3415 3416 // Pack workitem IDs into a single register or pass it as is if already 3417 // packed. 
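  // The packed value built below mirrors the hardware layout: bits [9:0]
  // carry the X id, bits [19:10] the Y id and bits [29:20] the Z id, which is
  // why Y and Z are shifted by 10 and 20 before being ORed in.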
3418 const ArgDescriptor *OutgoingArg; 3419 const TargetRegisterClass *ArgRC; 3420 LLT Ty; 3421 3422 std::tie(OutgoingArg, ArgRC, Ty) = 3423 CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X); 3424 if (!OutgoingArg) 3425 std::tie(OutgoingArg, ArgRC, Ty) = 3426 CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 3427 if (!OutgoingArg) 3428 std::tie(OutgoingArg, ArgRC, Ty) = 3429 CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 3430 if (!OutgoingArg) 3431 return; 3432 3433 const ArgDescriptor *IncomingArgX = std::get<0>( 3434 CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X)); 3435 const ArgDescriptor *IncomingArgY = std::get<0>( 3436 CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y)); 3437 const ArgDescriptor *IncomingArgZ = std::get<0>( 3438 CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z)); 3439 3440 SDValue InputReg; 3441 SDLoc SL; 3442 3443 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x"); 3444 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y"); 3445 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z"); 3446 3447 // If incoming ids are not packed we need to pack them. 3448 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX && 3449 NeedWorkItemIDX) { 3450 if (Subtarget->getMaxWorkitemID(F, 0) != 0) { 3451 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX); 3452 } else { 3453 InputReg = DAG.getConstant(0, DL, MVT::i32); 3454 } 3455 } 3456 3457 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY && 3458 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) { 3459 SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY); 3460 Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y, 3461 DAG.getShiftAmountConstant(10, MVT::i32, SL)); 3462 InputReg = InputReg.getNode() ? 3463 DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y) : Y; 3464 } 3465 3466 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ && 3467 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) { 3468 SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ); 3469 Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z, 3470 DAG.getShiftAmountConstant(20, MVT::i32, SL)); 3471 InputReg = InputReg.getNode() ? 3472 DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z) : Z; 3473 } 3474 3475 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) { 3476 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) { 3477 // We're in a situation where the outgoing function requires the workitem 3478 // ID, but the calling function does not have it (e.g a graphics function 3479 // calling a C calling convention function). This is illegal, but we need 3480 // to produce something. 3481 InputReg = DAG.getUNDEF(MVT::i32); 3482 } else { 3483 // Workitem ids are already packed, any of present incoming arguments 3484 // will carry all required fields. 3485 ArgDescriptor IncomingArg = ArgDescriptor::createArg( 3486 IncomingArgX ? *IncomingArgX : 3487 IncomingArgY ? 
*IncomingArgY : 3488 *IncomingArgZ, ~0u); 3489 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg); 3490 } 3491 } 3492 3493 if (OutgoingArg->isRegister()) { 3494 if (InputReg) 3495 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg); 3496 3497 CCInfo.AllocateReg(OutgoingArg->getRegister()); 3498 } else { 3499 unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4)); 3500 if (InputReg) { 3501 SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg, 3502 SpecialArgOffset); 3503 MemOpChains.push_back(ArgStore); 3504 } 3505 } 3506 } 3507 3508 static bool canGuaranteeTCO(CallingConv::ID CC) { 3509 return CC == CallingConv::Fast; 3510 } 3511 3512 /// Return true if we might ever do TCO for calls with this calling convention. 3513 static bool mayTailCallThisCC(CallingConv::ID CC) { 3514 switch (CC) { 3515 case CallingConv::C: 3516 case CallingConv::AMDGPU_Gfx: 3517 return true; 3518 default: 3519 return canGuaranteeTCO(CC); 3520 } 3521 } 3522 3523 bool SITargetLowering::isEligibleForTailCallOptimization( 3524 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg, 3525 const SmallVectorImpl<ISD::OutputArg> &Outs, 3526 const SmallVectorImpl<SDValue> &OutVals, 3527 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const { 3528 if (AMDGPU::isChainCC(CalleeCC)) 3529 return true; 3530 3531 if (!mayTailCallThisCC(CalleeCC)) 3532 return false; 3533 3534 // For a divergent call target, we need to do a waterfall loop over the 3535 // possible callees which precludes us from using a simple jump. 3536 if (Callee->isDivergent()) 3537 return false; 3538 3539 MachineFunction &MF = DAG.getMachineFunction(); 3540 const Function &CallerF = MF.getFunction(); 3541 CallingConv::ID CallerCC = CallerF.getCallingConv(); 3542 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); 3543 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC); 3544 3545 // Kernels aren't callable, and don't have a live in return address so it 3546 // doesn't make sense to do a tail call with entry functions. 3547 if (!CallerPreserved) 3548 return false; 3549 3550 bool CCMatch = CallerCC == CalleeCC; 3551 3552 if (DAG.getTarget().Options.GuaranteedTailCallOpt) { 3553 if (canGuaranteeTCO(CalleeCC) && CCMatch) 3554 return true; 3555 return false; 3556 } 3557 3558 // TODO: Can we handle var args? 3559 if (IsVarArg) 3560 return false; 3561 3562 for (const Argument &Arg : CallerF.args()) { 3563 if (Arg.hasByValAttr()) 3564 return false; 3565 } 3566 3567 LLVMContext &Ctx = *DAG.getContext(); 3568 3569 // Check that the call results are passed in the same way. 3570 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins, 3571 CCAssignFnForCall(CalleeCC, IsVarArg), 3572 CCAssignFnForCall(CallerCC, IsVarArg))) 3573 return false; 3574 3575 // The callee has to preserve all registers the caller needs to preserve. 3576 if (!CCMatch) { 3577 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC); 3578 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) 3579 return false; 3580 } 3581 3582 // Nothing more to check if the callee is taking no arguments. 
3583 if (Outs.empty()) 3584 return true; 3585 3586 SmallVector<CCValAssign, 16> ArgLocs; 3587 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx); 3588 3589 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg)); 3590 3591 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); 3592 // If the stack arguments for this call do not fit into our own save area then 3593 // the call cannot be made tail. 3594 // TODO: Is this really necessary? 3595 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea()) 3596 return false; 3597 3598 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3599 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals); 3600 } 3601 3602 bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { 3603 if (!CI->isTailCall()) 3604 return false; 3605 3606 const Function *ParentFn = CI->getParent()->getParent(); 3607 if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv())) 3608 return false; 3609 return true; 3610 } 3611 3612 // The wave scratch offset register is used as the global base pointer. 3613 SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, 3614 SmallVectorImpl<SDValue> &InVals) const { 3615 CallingConv::ID CallConv = CLI.CallConv; 3616 bool IsChainCallConv = AMDGPU::isChainCC(CallConv); 3617 3618 SelectionDAG &DAG = CLI.DAG; 3619 3620 TargetLowering::ArgListEntry RequestedExec; 3621 if (IsChainCallConv) { 3622 // The last argument should be the value that we need to put in EXEC. 3623 // Pop it out of CLI.Outs and CLI.OutVals before we do any processing so we 3624 // don't treat it like the rest of the arguments. 3625 RequestedExec = CLI.Args.back(); 3626 assert(RequestedExec.Node && "No node for EXEC"); 3627 3628 if (!RequestedExec.Ty->isIntegerTy(Subtarget->getWavefrontSize())) 3629 return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC"); 3630 3631 assert(CLI.Outs.back().OrigArgIndex == 2 && "Unexpected last arg"); 3632 CLI.Outs.pop_back(); 3633 CLI.OutVals.pop_back(); 3634 3635 if (RequestedExec.Ty->isIntegerTy(64)) { 3636 assert(CLI.Outs.back().OrigArgIndex == 2 && "Exec wasn't split up"); 3637 CLI.Outs.pop_back(); 3638 CLI.OutVals.pop_back(); 3639 } 3640 3641 assert(CLI.Outs.back().OrigArgIndex != 2 && 3642 "Haven't popped all the pieces of the EXEC mask"); 3643 } 3644 3645 const SDLoc &DL = CLI.DL; 3646 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs; 3647 SmallVector<SDValue, 32> &OutVals = CLI.OutVals; 3648 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins; 3649 SDValue Chain = CLI.Chain; 3650 SDValue Callee = CLI.Callee; 3651 bool &IsTailCall = CLI.IsTailCall; 3652 bool IsVarArg = CLI.IsVarArg; 3653 bool IsSibCall = false; 3654 MachineFunction &MF = DAG.getMachineFunction(); 3655 3656 if (Callee.isUndef() || isNullConstant(Callee)) { 3657 if (!CLI.IsTailCall) { 3658 for (ISD::InputArg &Arg : CLI.Ins) 3659 InVals.push_back(DAG.getUNDEF(Arg.VT)); 3660 } 3661 3662 return Chain; 3663 } 3664 3665 if (IsVarArg) { 3666 return lowerUnhandledCall(CLI, InVals, 3667 "unsupported call to variadic function "); 3668 } 3669 3670 if (!CLI.CB) 3671 report_fatal_error("unsupported libcall legalization"); 3672 3673 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) { 3674 return lowerUnhandledCall(CLI, InVals, 3675 "unsupported required tail call to function "); 3676 } 3677 3678 if (IsTailCall) { 3679 IsTailCall = isEligibleForTailCallOptimization( 3680 Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG); 3681 if (!IsTailCall && 3682 ((CLI.CB && CLI.CB->isMustTailCall()) || 
IsChainCallConv)) { 3683 report_fatal_error("failed to perform tail call elimination on a call " 3684 "site marked musttail or on llvm.amdgcn.cs.chain"); 3685 } 3686 3687 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; 3688 3689 // A sibling call is one where we're under the usual C ABI and not planning 3690 // to change that but can still do a tail call: 3691 if (!TailCallOpt && IsTailCall) 3692 IsSibCall = true; 3693 3694 if (IsTailCall) 3695 ++NumTailCalls; 3696 } 3697 3698 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 3699 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; 3700 SmallVector<SDValue, 8> MemOpChains; 3701 3702 // Analyze operands of the call, assigning locations to each operand. 3703 SmallVector<CCValAssign, 16> ArgLocs; 3704 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); 3705 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg); 3706 3707 if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv)) { 3708 // With a fixed ABI, allocate fixed registers before user arguments. 3709 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain); 3710 } 3711 3712 CCInfo.AnalyzeCallOperands(Outs, AssignFn); 3713 3714 // Get a count of how many bytes are to be pushed on the stack. 3715 unsigned NumBytes = CCInfo.getStackSize(); 3716 3717 if (IsSibCall) { 3718 // Since we're not changing the ABI to make this a tail call, the memory 3719 // operands are already available in the caller's incoming argument space. 3720 NumBytes = 0; 3721 } 3722 3723 // FPDiff is the byte offset of the call's argument area from the callee's. 3724 // Stores to callee stack arguments will be placed in FixedStackSlots offset 3725 // by this amount for a tail call. In a sibling call it must be 0 because the 3726 // caller will deallocate the entire stack and the callee still expects its 3727 // arguments to begin at SP+0. Completely unused for non-tail calls. 3728 int32_t FPDiff = 0; 3729 MachineFrameInfo &MFI = MF.getFrameInfo(); 3730 3731 // Adjust the stack pointer for the new arguments... 3732 // These operations are automatically eliminated by the prolog/epilog pass 3733 if (!IsSibCall) 3734 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL); 3735 3736 if (!IsSibCall || IsChainCallConv) { 3737 if (!Subtarget->enableFlatScratch()) { 3738 SmallVector<SDValue, 4> CopyFromChains; 3739 3740 // In the HSA case, this should be an identity copy. 3741 SDValue ScratchRSrcReg 3742 = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32); 3743 RegsToPass.emplace_back(IsChainCallConv 3744 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51 3745 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, 3746 ScratchRSrcReg); 3747 CopyFromChains.push_back(ScratchRSrcReg.getValue(1)); 3748 Chain = DAG.getTokenFactor(DL, CopyFromChains); 3749 } 3750 } 3751 3752 MVT PtrVT = MVT::i32; 3753 3754 // Walk the register/memloc assignments, inserting copies/loads. 3755 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 3756 CCValAssign &VA = ArgLocs[i]; 3757 SDValue Arg = OutVals[i]; 3758 3759 // Promote the value if needed. 
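    // Full requires no change; BCvt reinterprets the bits in the location
    // type; SExt/ZExt/AExt integer-extend a narrow value to the location
    // type; FPExt widens a floating-point value.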
3760 switch (VA.getLocInfo()) { 3761 case CCValAssign::Full: 3762 break; 3763 case CCValAssign::BCvt: 3764 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); 3765 break; 3766 case CCValAssign::ZExt: 3767 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg); 3768 break; 3769 case CCValAssign::SExt: 3770 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg); 3771 break; 3772 case CCValAssign::AExt: 3773 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg); 3774 break; 3775 case CCValAssign::FPExt: 3776 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg); 3777 break; 3778 default: 3779 llvm_unreachable("Unknown loc info!"); 3780 } 3781 3782 if (VA.isRegLoc()) { 3783 RegsToPass.push_back(std::pair(VA.getLocReg(), Arg)); 3784 } else { 3785 assert(VA.isMemLoc()); 3786 3787 SDValue DstAddr; 3788 MachinePointerInfo DstInfo; 3789 3790 unsigned LocMemOffset = VA.getLocMemOffset(); 3791 int32_t Offset = LocMemOffset; 3792 3793 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT); 3794 MaybeAlign Alignment; 3795 3796 if (IsTailCall) { 3797 ISD::ArgFlagsTy Flags = Outs[i].Flags; 3798 unsigned OpSize = Flags.isByVal() ? 3799 Flags.getByValSize() : VA.getValVT().getStoreSize(); 3800 3801 // FIXME: We can have better than the minimum byval required alignment. 3802 Alignment = 3803 Flags.isByVal() 3804 ? Flags.getNonZeroByValAlign() 3805 : commonAlignment(Subtarget->getStackAlignment(), Offset); 3806 3807 Offset = Offset + FPDiff; 3808 int FI = MFI.CreateFixedObject(OpSize, Offset, true); 3809 3810 DstAddr = DAG.getFrameIndex(FI, PtrVT); 3811 DstInfo = MachinePointerInfo::getFixedStack(MF, FI); 3812 3813 // Make sure any stack arguments overlapping with where we're storing 3814 // are loaded before this eventual operation. Otherwise they'll be 3815 // clobbered. 3816 3817 // FIXME: Why is this really necessary? This seems to just result in a 3818 // lot of code to copy the stack and write them back to the same 3819 // locations, which are supposed to be immutable? 3820 Chain = addTokenForArgument(Chain, DAG, MFI, FI); 3821 } else { 3822 // Stores to the argument stack area are relative to the stack pointer. 3823 SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(), 3824 MVT::i32); 3825 DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff); 3826 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset); 3827 Alignment = 3828 commonAlignment(Subtarget->getStackAlignment(), LocMemOffset); 3829 } 3830 3831 if (Outs[i].Flags.isByVal()) { 3832 SDValue SizeNode = 3833 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32); 3834 SDValue Cpy = 3835 DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode, 3836 Outs[i].Flags.getNonZeroByValAlign(), 3837 /*isVol = */ false, /*AlwaysInline = */ true, 3838 /*CI=*/nullptr, std::nullopt, DstInfo, 3839 MachinePointerInfo(AMDGPUAS::PRIVATE_ADDRESS)); 3840 3841 MemOpChains.push_back(Cpy); 3842 } else { 3843 SDValue Store = 3844 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment); 3845 MemOpChains.push_back(Store); 3846 } 3847 } 3848 } 3849 3850 if (!MemOpChains.empty()) 3851 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains); 3852 3853 // Build a sequence of copy-to-reg nodes chained together with token chain 3854 // and flag operands which copy the outgoing args into the appropriate regs. 
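  // The glue value threads each CopyToReg into the next and ultimately into
  // the call node, keeping the argument copies scheduled immediately before
  // the call.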
3855 SDValue InGlue; 3856 for (auto &RegToPass : RegsToPass) { 3857 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first, 3858 RegToPass.second, InGlue); 3859 InGlue = Chain.getValue(1); 3860 } 3861 3862 3863 // We don't usually want to end the call-sequence here because we would tidy 3864 // the frame up *after* the call; however, in the ABI-changing tail-call case 3865 // we've carefully laid out the parameters so that when sp is reset they'll be 3866 // in the correct location. 3867 if (IsTailCall && !IsSibCall) { 3868 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL); 3869 InGlue = Chain.getValue(1); 3870 } 3871 3872 std::vector<SDValue> Ops; 3873 Ops.push_back(Chain); 3874 Ops.push_back(Callee); 3875 // Add a redundant copy of the callee global which will not be legalized, as 3876 // we need direct access to the callee later. 3877 if (GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Callee)) { 3878 const GlobalValue *GV = GSD->getGlobal(); 3879 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64)); 3880 } else { 3881 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64)); 3882 } 3883 3884 if (IsTailCall) { 3885 // Each tail call may have to adjust the stack by a different amount, so 3886 // this information must travel along with the operation for eventual 3887 // consumption by emitEpilogue. 3888 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32)); 3889 } 3890 3891 if (IsChainCallConv) 3892 Ops.push_back(RequestedExec.Node); 3893 3894 // Add argument registers to the end of the list so that they are known live 3895 // into the call. 3896 for (auto &RegToPass : RegsToPass) { 3897 Ops.push_back(DAG.getRegister(RegToPass.first, 3898 RegToPass.second.getValueType())); 3899 } 3900 3901 // Add a register mask operand representing the call-preserved registers. 3902 auto *TRI = static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo()); 3903 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv); 3904 assert(Mask && "Missing call preserved mask for calling convention"); 3905 Ops.push_back(DAG.getRegisterMask(Mask)); 3906 3907 if (SDValue Token = CLI.ConvergenceControlToken) { 3908 SmallVector<SDValue, 2> GlueOps; 3909 GlueOps.push_back(Token); 3910 if (InGlue) 3911 GlueOps.push_back(InGlue); 3912 3913 InGlue = SDValue(DAG.getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, DL, 3914 MVT::Glue, GlueOps), 3915 0); 3916 } 3917 3918 if (InGlue) 3919 Ops.push_back(InGlue); 3920 3921 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 3922 3923 // If we're doing a tail call, use a TC_RETURN here rather than an 3924 // actual call instruction. 3925 if (IsTailCall) { 3926 MFI.setHasTailCall(); 3927 unsigned OPC = AMDGPUISD::TC_RETURN; 3928 switch (CallConv) { 3929 case CallingConv::AMDGPU_Gfx: 3930 OPC = AMDGPUISD::TC_RETURN_GFX; 3931 break; 3932 case CallingConv::AMDGPU_CS_Chain: 3933 case CallingConv::AMDGPU_CS_ChainPreserve: 3934 OPC = AMDGPUISD::TC_RETURN_CHAIN; 3935 break; 3936 } 3937 3938 return DAG.getNode(OPC, DL, NodeTys, Ops); 3939 } 3940 3941 // Returns a chain and a flag for retval copy to use. 3942 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops); 3943 Chain = Call.getValue(0); 3944 InGlue = Call.getValue(1); 3945 3946 uint64_t CalleePopBytes = NumBytes; 3947 Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL); 3948 if (!Ins.empty()) 3949 InGlue = Chain.getValue(1); 3950 3951 // Handle result values, copying them out of physregs into vregs that we 3952 // return.
3953 return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG, 3954 InVals, /*IsThisReturn=*/false, SDValue()); 3955 } 3956 3957 // This is identical to the default implementation in ExpandDYNAMIC_STACKALLOC, 3958 // except for applying the wave size scale to the increment amount. 3959 SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl( 3960 SDValue Op, SelectionDAG &DAG) const { 3961 const MachineFunction &MF = DAG.getMachineFunction(); 3962 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 3963 3964 SDLoc dl(Op); 3965 EVT VT = Op.getValueType(); 3966 SDValue Tmp1 = Op; 3967 SDValue Tmp2 = Op.getValue(1); 3968 SDValue Tmp3 = Op.getOperand(2); 3969 SDValue Chain = Tmp1.getOperand(0); 3970 3971 Register SPReg = Info->getStackPtrOffsetReg(); 3972 3973 // Chain the dynamic stack allocation so that it doesn't modify the stack 3974 // pointer when other instructions are using the stack. 3975 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl); 3976 3977 SDValue Size = Tmp2.getOperand(1); 3978 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT); 3979 Chain = SP.getValue(1); 3980 MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue(); 3981 const TargetFrameLowering *TFL = Subtarget->getFrameLowering(); 3982 unsigned Opc = 3983 TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp ? 3984 ISD::ADD : ISD::SUB; 3985 3986 SDValue ScaledSize = DAG.getNode( 3987 ISD::SHL, dl, VT, Size, 3988 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32)); 3989 3990 Align StackAlign = TFL->getStackAlign(); 3991 Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize); // Value 3992 if (Alignment && *Alignment > StackAlign) { 3993 Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1, 3994 DAG.getConstant(-(uint64_t)Alignment->value() 3995 << Subtarget->getWavefrontSizeLog2(), 3996 dl, VT)); 3997 } 3998 3999 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain 4000 Tmp2 = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl); 4001 4002 return DAG.getMergeValues({Tmp1, Tmp2}, dl); 4003 } 4004 4005 SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, 4006 SelectionDAG &DAG) const { 4007 // We only handle constant sizes here to allow non-entry block, static sized 4008 // allocas. A truly dynamic value is more difficult to support because we 4009 // don't know if the size value is uniform or not. If the size isn't uniform, 4010 // we would need to do a wave reduction to get the maximum size to know how 4011 // much to increment the uniform stack pointer. 4012 SDValue Size = Op.getOperand(1); 4013 if (isa<ConstantSDNode>(Size)) 4014 return lowerDYNAMIC_STACKALLOCImpl(Op, DAG); // Use "generic" expansion. 4015 4016 return AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(Op, DAG); 4017 } 4018 4019 SDValue SITargetLowering::LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const { 4020 if (Op.getValueType() != MVT::i32) 4021 return Op; // Defer to cannot select error. 4022 4023 Register SP = getStackPointerRegisterToSaveRestore(); 4024 SDLoc SL(Op); 4025 4026 SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32); 4027 4028 // Convert from wave uniform to swizzled vector address. This should protect 4029 // from any edge cases where the stacksave result isn't directly used with 4030 // stackrestore. 
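  // The SGPR stack pointer is kept as a wave-scaled offset (note how
  // lowerDYNAMIC_STACKALLOCImpl above shifts allocation sizes by
  // getWavefrontSizeLog2() before adding them to SP), and WAVE_ADDRESS
  // represents the conversion between that wave-uniform value and the
  // per-lane swizzled address a lane can actually dereference.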
4031 SDValue VectorAddress = 4032 DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP); 4033 return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL); 4034 } 4035 4036 SDValue SITargetLowering::lowerGET_ROUNDING(SDValue Op, 4037 SelectionDAG &DAG) const { 4038 SDLoc SL(Op); 4039 assert(Op.getValueType() == MVT::i32); 4040 4041 uint32_t BothRoundHwReg = 4042 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 4); 4043 SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32); 4044 4045 SDValue IntrinID = 4046 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32); 4047 SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(), 4048 Op.getOperand(0), IntrinID, GetRoundBothImm); 4049 4050 // There are two rounding modes, one for f32 and one for f64/f16. We only 4051 // report in the standard value range if both are the same. 4052 // 4053 // The raw values also differ from the expected FLT_ROUNDS values. Nearest 4054 // ties away from zero is not supported, and the other values are rotated by 4055 // 1. 4056 // 4057 // If the two rounding modes are not the same, report a target defined value. 4058 4059 // Mode register rounding mode fields: 4060 // 4061 // [1:0] Single-precision round mode. 4062 // [3:2] Double/Half-precision round mode. 4063 // 4064 // 0=nearest even; 1= +infinity; 2= -infinity, 3= toward zero. 4065 // 4066 // Hardware Spec 4067 // Toward-0 3 0 4068 // Nearest Even 0 1 4069 // +Inf 1 2 4070 // -Inf 2 3 4071 // NearestAway0 N/A 4 4072 // 4073 // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit 4074 // table we can index by the raw hardware mode. 4075 // 4076 // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf 4077 4078 SDValue BitTable = 4079 DAG.getConstant(AMDGPU::FltRoundConversionTable, SL, MVT::i64); 4080 4081 SDValue Two = DAG.getConstant(2, SL, MVT::i32); 4082 SDValue RoundModeTimesNumBits = 4083 DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two); 4084 4085 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we 4086 // knew only one mode was demanded. 4087 SDValue TableValue = 4088 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits); 4089 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue); 4090 4091 SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32); 4092 SDValue TableEntry = 4093 DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask); 4094 4095 // There's a gap in the 4-bit encoded table and actual enum values, so offset 4096 // if it's an extended value. 4097 SDValue Four = DAG.getConstant(4, SL, MVT::i32); 4098 SDValue IsStandardValue = 4099 DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT); 4100 SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four); 4101 SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue, 4102 TableEntry, EnumOffset); 4103 4104 return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL); 4105 } 4106 4107 SDValue SITargetLowering::lowerSET_ROUNDING(SDValue Op, 4108 SelectionDAG &DAG) const { 4109 SDLoc SL(Op); 4110 4111 SDValue NewMode = Op.getOperand(1); 4112 assert(NewMode.getValueType() == MVT::i32); 4113 4114 // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the 4115 // hardware MODE.fp_round values. 
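  // Worked example, using the hardware/spec correspondence documented in
  // lowerGET_ROUNDING above: the C value 1 (to nearest) uses hardware mode 0
  // in both the [1:0] and [3:2] fields, so its 4-bit table entry is 0b0000,
  // while the C value 0 (toward zero) uses hardware mode 3 in both fields,
  // giving 0b1111.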
4116 if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) { 4117 uint32_t ClampedVal = std::min( 4118 static_cast<uint32_t>(ConstMode->getZExtValue()), 4119 static_cast<uint32_t>(AMDGPU::TowardZeroF32_TowardNegativeF64)); 4120 NewMode = DAG.getConstant( 4121 AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32); 4122 } else { 4123 // If we know the input can only be one of the supported standard modes in 4124 // the range 0-3, we can use a simplified mapping to hardware values. 4125 KnownBits KB = DAG.computeKnownBits(NewMode); 4126 const bool UseReducedTable = KB.countMinLeadingZeros() >= 30; 4127 // The supported standard values are 0-3. The extended values start at 8. We 4128 // need to offset by 4 if the value is in the extended range. 4129 4130 if (UseReducedTable) { 4131 // Truncate to the low 32-bits. 4132 SDValue BitTable = DAG.getConstant( 4133 AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32); 4134 4135 SDValue Two = DAG.getConstant(2, SL, MVT::i32); 4136 SDValue RoundModeTimesNumBits = 4137 DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two); 4138 4139 NewMode = 4140 DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits); 4141 4142 // TODO: SimplifyDemandedBits on the setreg source here can likely reduce 4143 // the table extracted bits into inline immediates. 4144 } else { 4145 // table_index = umin(value, value - 4) 4146 // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf 4147 SDValue BitTable = 4148 DAG.getConstant(AMDGPU::FltRoundToHWConversionTable, SL, MVT::i64); 4149 4150 SDValue Four = DAG.getConstant(4, SL, MVT::i32); 4151 SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four); 4152 SDValue IndexVal = 4153 DAG.getNode(ISD::UMIN, SL, MVT::i32, NewMode, OffsetEnum); 4154 4155 SDValue Two = DAG.getConstant(2, SL, MVT::i32); 4156 SDValue RoundModeTimesNumBits = 4157 DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two); 4158 4159 SDValue TableValue = 4160 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits); 4161 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue); 4162 4163 // No need to mask out the high bits since the setreg will ignore them 4164 // anyway. 4165 NewMode = TruncTable; 4166 } 4167 4168 // Insert a readfirstlane in case the value is a VGPR. We could do this 4169 // earlier and keep more operations scalar, but that interferes with 4170 // combining the source. 4171 SDValue ReadFirstLaneID = 4172 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32); 4173 NewMode = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, 4174 ReadFirstLaneID, NewMode); 4175 } 4176 4177 // N.B. The setreg will be later folded into s_round_mode on supported 4178 // targets. 
4179 SDValue IntrinID = 4180 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32); 4181 uint32_t BothRoundHwReg = 4182 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 4); 4183 SDValue RoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32); 4184 4185 SDValue SetReg = 4186 DAG.getNode(ISD::INTRINSIC_VOID, SL, Op->getVTList(), Op.getOperand(0), 4187 IntrinID, RoundBothImm, NewMode); 4188 4189 return SetReg; 4190 } 4191 4192 SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const { 4193 if (Op->isDivergent()) 4194 return SDValue(); 4195 4196 switch (cast<MemSDNode>(Op)->getAddressSpace()) { 4197 case AMDGPUAS::FLAT_ADDRESS: 4198 case AMDGPUAS::GLOBAL_ADDRESS: 4199 case AMDGPUAS::CONSTANT_ADDRESS: 4200 case AMDGPUAS::CONSTANT_ADDRESS_32BIT: 4201 break; 4202 default: 4203 return SDValue(); 4204 } 4205 4206 return Op; 4207 } 4208 4209 // Work around DAG legality rules only based on the result type. 4210 SDValue SITargetLowering::lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { 4211 bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND; 4212 SDValue Src = Op.getOperand(IsStrict ? 1 : 0); 4213 EVT SrcVT = Src.getValueType(); 4214 4215 if (SrcVT.getScalarType() != MVT::bf16) 4216 return Op; 4217 4218 SDLoc SL(Op); 4219 SDValue BitCast = 4220 DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src); 4221 4222 EVT DstVT = Op.getValueType(); 4223 if (IsStrict) 4224 llvm_unreachable("Need STRICT_BF16_TO_FP"); 4225 4226 return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast); 4227 } 4228 4229 SDValue SITargetLowering::lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const { 4230 SDLoc SL(Op); 4231 if (Op.getValueType() != MVT::i64) 4232 return Op; 4233 4234 uint32_t ModeHwReg = 4235 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 23); 4236 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32); 4237 uint32_t TrapHwReg = 4238 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_TRAPSTS, 0, 5); 4239 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32); 4240 4241 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other); 4242 SDValue IntrinID = 4243 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32); 4244 SDValue GetModeReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList, 4245 Op.getOperand(0), IntrinID, ModeHwRegImm); 4246 SDValue GetTrapReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList, 4247 Op.getOperand(0), IntrinID, TrapHwRegImm); 4248 SDValue TokenReg = 4249 DAG.getNode(ISD::TokenFactor, SL, MVT::Other, GetModeReg.getValue(1), 4250 GetTrapReg.getValue(1)); 4251 4252 SDValue CvtPtr = 4253 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, GetModeReg, GetTrapReg); 4254 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr); 4255 4256 return DAG.getMergeValues({Result, TokenReg}, SL); 4257 } 4258 4259 SDValue SITargetLowering::lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const { 4260 SDLoc SL(Op); 4261 if (Op.getOperand(1).getValueType() != MVT::i64) 4262 return Op; 4263 4264 SDValue Input = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op.getOperand(1)); 4265 SDValue NewModeReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input, 4266 DAG.getConstant(0, SL, MVT::i32)); 4267 SDValue NewTrapReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input, 4268 DAG.getConstant(1, SL, MVT::i32)); 4269 4270 SDValue ReadFirstLaneID = 4271 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32); 4272 NewModeReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, 
MVT::i32, 4273 ReadFirstLaneID, NewModeReg); 4274 NewTrapReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, 4275 ReadFirstLaneID, NewTrapReg); 4276 4277 unsigned ModeHwReg = 4278 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 23); 4279 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32); 4280 unsigned TrapHwReg = 4281 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_TRAPSTS, 0, 5); 4282 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32); 4283 4284 SDValue IntrinID = 4285 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32); 4286 SDValue SetModeReg = 4287 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0), 4288 IntrinID, ModeHwRegImm, NewModeReg); 4289 SDValue SetTrapReg = 4290 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0), 4291 IntrinID, TrapHwRegImm, NewTrapReg); 4292 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, SetTrapReg, SetModeReg); 4293 } 4294 4295 Register SITargetLowering::getRegisterByName(const char* RegName, LLT VT, 4296 const MachineFunction &MF) const { 4297 Register Reg = StringSwitch<Register>(RegName) 4298 .Case("m0", AMDGPU::M0) 4299 .Case("exec", AMDGPU::EXEC) 4300 .Case("exec_lo", AMDGPU::EXEC_LO) 4301 .Case("exec_hi", AMDGPU::EXEC_HI) 4302 .Case("flat_scratch", AMDGPU::FLAT_SCR) 4303 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO) 4304 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI) 4305 .Default(Register()); 4306 4307 if (Reg == AMDGPU::NoRegister) { 4308 report_fatal_error(Twine("invalid register name \"" 4309 + StringRef(RegName) + "\".")); 4310 4311 } 4312 4313 if (!Subtarget->hasFlatScrRegister() && 4314 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) { 4315 report_fatal_error(Twine("invalid register \"" 4316 + StringRef(RegName) + "\" for subtarget.")); 4317 } 4318 4319 switch (Reg) { 4320 case AMDGPU::M0: 4321 case AMDGPU::EXEC_LO: 4322 case AMDGPU::EXEC_HI: 4323 case AMDGPU::FLAT_SCR_LO: 4324 case AMDGPU::FLAT_SCR_HI: 4325 if (VT.getSizeInBits() == 32) 4326 return Reg; 4327 break; 4328 case AMDGPU::EXEC: 4329 case AMDGPU::FLAT_SCR: 4330 if (VT.getSizeInBits() == 64) 4331 return Reg; 4332 break; 4333 default: 4334 llvm_unreachable("missing register type checking"); 4335 } 4336 4337 report_fatal_error(Twine("invalid type for register \"" 4338 + StringRef(RegName) + "\".")); 4339 } 4340 4341 // If kill is not the last instruction, split the block so kill is always a 4342 // proper terminator. 4343 MachineBasicBlock * 4344 SITargetLowering::splitKillBlock(MachineInstr &MI, 4345 MachineBasicBlock *BB) const { 4346 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/); 4347 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 4348 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode())); 4349 return SplitBB; 4350 } 4351 4352 // Split block \p MBB at \p MI, as to insert a loop. If \p InstInLoop is true, 4353 // \p MI will be the only instruction in the loop body block. Otherwise, it will 4354 // be the first instruction in the remainder block. 4355 // 4356 /// \returns { LoopBody, Remainder } 4357 static std::pair<MachineBasicBlock *, MachineBasicBlock *> 4358 splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop) { 4359 MachineFunction *MF = MBB.getParent(); 4360 MachineBasicBlock::iterator I(&MI); 4361 4362 // To insert the loop we need to split the block. Move everything after this 4363 // point to a new block, and insert a new empty block between the two. 
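  // Roughly, the control flow produced here is:
  //
  //   MBB --> LoopBB --> RemainderBB
  //             ^   |
  //             +---+  (LoopBB branches back to itself until done)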
4364 MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock(); 4365 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock(); 4366 MachineFunction::iterator MBBI(MBB); 4367 ++MBBI; 4368 4369 MF->insert(MBBI, LoopBB); 4370 MF->insert(MBBI, RemainderBB); 4371 4372 LoopBB->addSuccessor(LoopBB); 4373 LoopBB->addSuccessor(RemainderBB); 4374 4375 // Move the rest of the block into a new block. 4376 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB); 4377 4378 if (InstInLoop) { 4379 auto Next = std::next(I); 4380 4381 // Move instruction to loop body. 4382 LoopBB->splice(LoopBB->begin(), &MBB, I, Next); 4383 4384 // Move the rest of the block. 4385 RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end()); 4386 } else { 4387 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end()); 4388 } 4389 4390 MBB.addSuccessor(LoopBB); 4391 4392 return std::pair(LoopBB, RemainderBB); 4393 } 4394 4395 /// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it. 4396 void SITargetLowering::bundleInstWithWaitcnt(MachineInstr &MI) const { 4397 MachineBasicBlock *MBB = MI.getParent(); 4398 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 4399 auto I = MI.getIterator(); 4400 auto E = std::next(I); 4401 4402 BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT)) 4403 .addImm(0); 4404 4405 MIBundleBuilder Bundler(*MBB, I, E); 4406 finalizeBundle(*MBB, Bundler.begin()); 4407 } 4408 4409 MachineBasicBlock * 4410 SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI, 4411 MachineBasicBlock *BB) const { 4412 const DebugLoc &DL = MI.getDebugLoc(); 4413 4414 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); 4415 4416 MachineBasicBlock *LoopBB; 4417 MachineBasicBlock *RemainderBB; 4418 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 4419 4420 // Apparently kill flags are only valid if the def is in the same block? 4421 if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) 4422 Src->setIsKill(false); 4423 4424 std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, *BB, true); 4425 4426 MachineBasicBlock::iterator I = LoopBB->end(); 4427 4428 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode( 4429 AMDGPU::Hwreg::ID_TRAPSTS, AMDGPU::Hwreg::OFFSET_MEM_VIOL, 1); 4430 4431 // Clear TRAP_STS.MEM_VIOL 4432 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32)) 4433 .addImm(0) 4434 .addImm(EncodedReg); 4435 4436 bundleInstWithWaitcnt(MI); 4437 4438 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 4439 4440 // Load and check TRAP_STS.MEM_VIOL 4441 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg) 4442 .addImm(EncodedReg); 4443 4444 // FIXME: Do we need to use an isel pseudo that may clobber scc? 4445 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32)) 4446 .addReg(Reg, RegState::Kill) 4447 .addImm(0); 4448 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1)) 4449 .addMBB(LoopBB); 4450 4451 return RemainderBB; 4452 } 4453 4454 // Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the 4455 // wavefront. If the value is uniform and just happens to be in a VGPR, this 4456 // will only do one iteration. In the worst case, this will loop 64 times. 4457 // 4458 // TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value. 
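// The loop built below is the usual waterfall idiom: V_READFIRSTLANE_B32
// picks one lane's index, V_CMP_EQ finds every active lane using that same
// index, S_AND_SAVEEXEC restricts EXEC to those lanes for the indexed access,
// and the terminating S_XOR/S_CBRANCH_EXECNZ pair removes them from EXEC and
// loops while any lanes remain.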
4459 static MachineBasicBlock::iterator 4460 emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, 4461 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, 4462 const DebugLoc &DL, const MachineOperand &Idx, 4463 unsigned InitReg, unsigned ResultReg, unsigned PhiReg, 4464 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, 4465 Register &SGPRIdxReg) { 4466 4467 MachineFunction *MF = OrigBB.getParent(); 4468 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); 4469 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 4470 MachineBasicBlock::iterator I = LoopBB.begin(); 4471 4472 const TargetRegisterClass *BoolRC = TRI->getBoolRC(); 4473 Register PhiExec = MRI.createVirtualRegister(BoolRC); 4474 Register NewExec = MRI.createVirtualRegister(BoolRC); 4475 Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 4476 Register CondReg = MRI.createVirtualRegister(BoolRC); 4477 4478 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg) 4479 .addReg(InitReg) 4480 .addMBB(&OrigBB) 4481 .addReg(ResultReg) 4482 .addMBB(&LoopBB); 4483 4484 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec) 4485 .addReg(InitSaveExecReg) 4486 .addMBB(&OrigBB) 4487 .addReg(NewExec) 4488 .addMBB(&LoopBB); 4489 4490 // Read the next variant <- also loop target. 4491 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg) 4492 .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef())); 4493 4494 // Compare the just read M0 value to all possible Idx values. 4495 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg) 4496 .addReg(CurrentIdxReg) 4497 .addReg(Idx.getReg(), 0, Idx.getSubReg()); 4498 4499 // Update EXEC, save the original EXEC value to VCC. 4500 BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 4501 : AMDGPU::S_AND_SAVEEXEC_B64), 4502 NewExec) 4503 .addReg(CondReg, RegState::Kill); 4504 4505 MRI.setSimpleHint(NewExec, CondReg); 4506 4507 if (UseGPRIdxMode) { 4508 if (Offset == 0) { 4509 SGPRIdxReg = CurrentIdxReg; 4510 } else { 4511 SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 4512 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg) 4513 .addReg(CurrentIdxReg, RegState::Kill) 4514 .addImm(Offset); 4515 } 4516 } else { 4517 // Move index from VCC into M0 4518 if (Offset == 0) { 4519 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) 4520 .addReg(CurrentIdxReg, RegState::Kill); 4521 } else { 4522 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0) 4523 .addReg(CurrentIdxReg, RegState::Kill) 4524 .addImm(Offset); 4525 } 4526 } 4527 4528 // Update EXEC, switch all done bits to 0 and all todo bits to 1. 4529 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; 4530 MachineInstr *InsertPt = 4531 BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term 4532 : AMDGPU::S_XOR_B64_term), Exec) 4533 .addReg(Exec) 4534 .addReg(NewExec); 4535 4536 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use 4537 // s_cbranch_scc0? 4538 4539 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover. 4540 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) 4541 .addMBB(&LoopBB); 4542 4543 return InsertPt->getIterator(); 4544 } 4545 4546 // This has slightly sub-optimal regalloc when the source vector is killed by 4547 // the read. 
The register allocator does not understand that the kill is 4548 // per-workitem, so is kept alive for the whole loop so we end up not re-using a 4549 // subregister from it, using 1 more VGPR than necessary. This was saved when 4550 // this was expanded after register allocation. 4551 static MachineBasicBlock::iterator 4552 loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, 4553 unsigned InitResultReg, unsigned PhiReg, int Offset, 4554 bool UseGPRIdxMode, Register &SGPRIdxReg) { 4555 MachineFunction *MF = MBB.getParent(); 4556 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); 4557 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 4558 MachineRegisterInfo &MRI = MF->getRegInfo(); 4559 const DebugLoc &DL = MI.getDebugLoc(); 4560 MachineBasicBlock::iterator I(&MI); 4561 4562 const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); 4563 Register DstReg = MI.getOperand(0).getReg(); 4564 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC); 4565 Register TmpExec = MRI.createVirtualRegister(BoolXExecRC); 4566 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; 4567 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; 4568 4569 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec); 4570 4571 // Save the EXEC mask 4572 BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec) 4573 .addReg(Exec); 4574 4575 MachineBasicBlock *LoopBB; 4576 MachineBasicBlock *RemainderBB; 4577 std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, MBB, false); 4578 4579 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx); 4580 4581 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx, 4582 InitResultReg, DstReg, PhiReg, TmpExec, 4583 Offset, UseGPRIdxMode, SGPRIdxReg); 4584 4585 MachineBasicBlock* LandingPad = MF->CreateMachineBasicBlock(); 4586 MachineFunction::iterator MBBI(LoopBB); 4587 ++MBBI; 4588 MF->insert(MBBI, LandingPad); 4589 LoopBB->removeSuccessor(RemainderBB); 4590 LandingPad->addSuccessor(RemainderBB); 4591 LoopBB->addSuccessor(LandingPad); 4592 MachineBasicBlock::iterator First = LandingPad->begin(); 4593 BuildMI(*LandingPad, First, DL, TII->get(MovExecOpc), Exec) 4594 .addReg(SaveExec); 4595 4596 return InsPt; 4597 } 4598 4599 // Returns subreg index, offset 4600 static std::pair<unsigned, int> 4601 computeIndirectRegAndOffset(const SIRegisterInfo &TRI, 4602 const TargetRegisterClass *SuperRC, 4603 unsigned VecReg, 4604 int Offset) { 4605 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32; 4606 4607 // Skip out of bounds offsets, or else we would end up using an undefined 4608 // register. 
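  // For example, a 128-bit super-register class gives NumElts = 4: a constant
  // Offset of 2 selects sub2 with a residual offset of 0, while an Offset of
  // 7 is out of range and is returned unchanged alongside sub0, leaving the
  // caller to add it to the dynamic index instead.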
4609 if (Offset >= NumElts || Offset < 0) 4610 return std::pair(AMDGPU::sub0, Offset); 4611 4612 return std::pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0); 4613 } 4614 4615 static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, 4616 MachineRegisterInfo &MRI, MachineInstr &MI, 4617 int Offset) { 4618 MachineBasicBlock *MBB = MI.getParent(); 4619 const DebugLoc &DL = MI.getDebugLoc(); 4620 MachineBasicBlock::iterator I(&MI); 4621 4622 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx); 4623 4624 assert(Idx->getReg() != AMDGPU::NoRegister); 4625 4626 if (Offset == 0) { 4627 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0).add(*Idx); 4628 } else { 4629 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0) 4630 .add(*Idx) 4631 .addImm(Offset); 4632 } 4633 } 4634 4635 static Register getIndirectSGPRIdx(const SIInstrInfo *TII, 4636 MachineRegisterInfo &MRI, MachineInstr &MI, 4637 int Offset) { 4638 MachineBasicBlock *MBB = MI.getParent(); 4639 const DebugLoc &DL = MI.getDebugLoc(); 4640 MachineBasicBlock::iterator I(&MI); 4641 4642 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx); 4643 4644 if (Offset == 0) 4645 return Idx->getReg(); 4646 4647 Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 4648 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp) 4649 .add(*Idx) 4650 .addImm(Offset); 4651 return Tmp; 4652 } 4653 4654 static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI, 4655 MachineBasicBlock &MBB, 4656 const GCNSubtarget &ST) { 4657 const SIInstrInfo *TII = ST.getInstrInfo(); 4658 const SIRegisterInfo &TRI = TII->getRegisterInfo(); 4659 MachineFunction *MF = MBB.getParent(); 4660 MachineRegisterInfo &MRI = MF->getRegInfo(); 4661 4662 Register Dst = MI.getOperand(0).getReg(); 4663 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx); 4664 Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg(); 4665 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm(); 4666 4667 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg); 4668 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg()); 4669 4670 unsigned SubReg; 4671 std::tie(SubReg, Offset) 4672 = computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset); 4673 4674 const bool UseGPRIdxMode = ST.useVGPRIndexMode(); 4675 4676 // Check for a SGPR index. 4677 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) { 4678 MachineBasicBlock::iterator I(&MI); 4679 const DebugLoc &DL = MI.getDebugLoc(); 4680 4681 if (UseGPRIdxMode) { 4682 // TODO: Look at the uses to avoid the copy. This may require rescheduling 4683 // to avoid interfering with other uses, so probably requires a new 4684 // optimization pass. 4685 Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset); 4686 4687 const MCInstrDesc &GPRIDXDesc = 4688 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true); 4689 BuildMI(MBB, I, DL, GPRIDXDesc, Dst) 4690 .addReg(SrcReg) 4691 .addReg(Idx) 4692 .addImm(SubReg); 4693 } else { 4694 setM0ToIndexFromSGPR(TII, MRI, MI, Offset); 4695 4696 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst) 4697 .addReg(SrcReg, 0, SubReg) 4698 .addReg(SrcReg, RegState::Implicit); 4699 } 4700 4701 MI.eraseFromParent(); 4702 4703 return &MBB; 4704 } 4705 4706 // Control flow needs to be inserted if indexing with a VGPR. 
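// Rough shape of the waterfall loop emitted below (schematic, not exact MIR):
//   save_exec = s_mov exec
// loop:
//   cur_idx  = v_readfirstlane_b32 idx
//   cond     = v_cmp_eq_u32 cur_idx, idx
//   new_exec = s_and_saveexec cond
//   <indexed v_movrels / GPR-idx read for the matching lanes>
//   exec     = s_xor_term exec, new_exec
//   s_cbranch_execnz loop
// landing pad:
//   exec     = s_mov save_exec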
4707 const DebugLoc &DL = MI.getDebugLoc(); 4708 MachineBasicBlock::iterator I(&MI); 4709 4710 Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 4711 Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 4712 4713 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg); 4714 4715 Register SGPRIdxReg; 4716 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset, 4717 UseGPRIdxMode, SGPRIdxReg); 4718 4719 MachineBasicBlock *LoopBB = InsPt->getParent(); 4720 4721 if (UseGPRIdxMode) { 4722 const MCInstrDesc &GPRIDXDesc = 4723 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true); 4724 4725 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst) 4726 .addReg(SrcReg) 4727 .addReg(SGPRIdxReg) 4728 .addImm(SubReg); 4729 } else { 4730 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst) 4731 .addReg(SrcReg, 0, SubReg) 4732 .addReg(SrcReg, RegState::Implicit); 4733 } 4734 4735 MI.eraseFromParent(); 4736 4737 return LoopBB; 4738 } 4739 4740 static MachineBasicBlock *emitIndirectDst(MachineInstr &MI, 4741 MachineBasicBlock &MBB, 4742 const GCNSubtarget &ST) { 4743 const SIInstrInfo *TII = ST.getInstrInfo(); 4744 const SIRegisterInfo &TRI = TII->getRegisterInfo(); 4745 MachineFunction *MF = MBB.getParent(); 4746 MachineRegisterInfo &MRI = MF->getRegInfo(); 4747 4748 Register Dst = MI.getOperand(0).getReg(); 4749 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src); 4750 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx); 4751 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val); 4752 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm(); 4753 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg()); 4754 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg()); 4755 4756 // This can be an immediate, but will be folded later. 4757 assert(Val->getReg()); 4758 4759 unsigned SubReg; 4760 std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC, 4761 SrcVec->getReg(), 4762 Offset); 4763 const bool UseGPRIdxMode = ST.useVGPRIndexMode(); 4764 4765 if (Idx->getReg() == AMDGPU::NoRegister) { 4766 MachineBasicBlock::iterator I(&MI); 4767 const DebugLoc &DL = MI.getDebugLoc(); 4768 4769 assert(Offset == 0); 4770 4771 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst) 4772 .add(*SrcVec) 4773 .add(*Val) 4774 .addImm(SubReg); 4775 4776 MI.eraseFromParent(); 4777 return &MBB; 4778 } 4779 4780 // Check for a SGPR index. 4781 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) { 4782 MachineBasicBlock::iterator I(&MI); 4783 const DebugLoc &DL = MI.getDebugLoc(); 4784 4785 if (UseGPRIdxMode) { 4786 Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset); 4787 4788 const MCInstrDesc &GPRIDXDesc = 4789 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false); 4790 BuildMI(MBB, I, DL, GPRIDXDesc, Dst) 4791 .addReg(SrcVec->getReg()) 4792 .add(*Val) 4793 .addReg(Idx) 4794 .addImm(SubReg); 4795 } else { 4796 setM0ToIndexFromSGPR(TII, MRI, MI, Offset); 4797 4798 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo( 4799 TRI.getRegSizeInBits(*VecRC), 32, false); 4800 BuildMI(MBB, I, DL, MovRelDesc, Dst) 4801 .addReg(SrcVec->getReg()) 4802 .add(*Val) 4803 .addImm(SubReg); 4804 } 4805 MI.eraseFromParent(); 4806 return &MBB; 4807 } 4808 4809 // Control flow needs to be inserted if indexing with a VGPR. 
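// The same waterfall loop as in emitIndirectSrc is built here; the inserted
// value may be read on several loop iterations, so its kill flag (if any) is
// cleared below to keep it live across the whole loop.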
4810 if (Val->isReg()) 4811 MRI.clearKillFlags(Val->getReg()); 4812 4813 const DebugLoc &DL = MI.getDebugLoc(); 4814 4815 Register PhiReg = MRI.createVirtualRegister(VecRC); 4816 4817 Register SGPRIdxReg; 4818 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset, 4819 UseGPRIdxMode, SGPRIdxReg); 4820 MachineBasicBlock *LoopBB = InsPt->getParent(); 4821 4822 if (UseGPRIdxMode) { 4823 const MCInstrDesc &GPRIDXDesc = 4824 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false); 4825 4826 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst) 4827 .addReg(PhiReg) 4828 .add(*Val) 4829 .addReg(SGPRIdxReg) 4830 .addImm(SubReg); 4831 } else { 4832 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo( 4833 TRI.getRegSizeInBits(*VecRC), 32, false); 4834 BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst) 4835 .addReg(PhiReg) 4836 .add(*Val) 4837 .addImm(SubReg); 4838 } 4839 4840 MI.eraseFromParent(); 4841 return LoopBB; 4842 } 4843 4844 static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, 4845 MachineBasicBlock &BB, 4846 const GCNSubtarget &ST, 4847 unsigned Opc) { 4848 MachineRegisterInfo &MRI = BB.getParent()->getRegInfo(); 4849 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 4850 const DebugLoc &DL = MI.getDebugLoc(); 4851 const SIInstrInfo *TII = ST.getInstrInfo(); 4852 4853 // Reduction operations depend on whether the input operand is SGPR or VGPR. 4854 Register SrcReg = MI.getOperand(1).getReg(); 4855 bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg)); 4856 Register DstReg = MI.getOperand(0).getReg(); 4857 MachineBasicBlock *RetBB = nullptr; 4858 if (isSGPR) { 4859 // These operations with a uniform value i.e. SGPR are idempotent. 4860 // Reduced value will be same as given sgpr. 4861 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg); 4862 RetBB = &BB; 4863 } else { 4864 // TODO: Implement DPP Strategy and switch based on immediate strategy 4865 // operand. For now, for all the cases (default, Iterative and DPP we use 4866 // iterative approach by default.) 4867 4868 // To reduce the VGPR using iterative approach, we need to iterate 4869 // over all the active lanes. Lowering consists of ComputeLoop, 4870 // which iterate over only active lanes. We use copy of EXEC register 4871 // as induction variable and every active lane modifies it using bitset0 4872 // so that we will get the next active lane for next iteration. 4873 MachineBasicBlock::iterator I = BB.end(); 4874 Register SrcReg = MI.getOperand(1).getReg(); 4875 4876 // Create Control flow for loop 4877 // Split MI's Machine Basic block into For loop 4878 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true); 4879 4880 // Create virtual registers required for lowering. 4881 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass(); 4882 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg); 4883 Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass); 4884 Register InitalValReg = MRI.createVirtualRegister(DstRegClass); 4885 4886 Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass); 4887 Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass); 4888 Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass); 4889 4890 Register FF1Reg = MRI.createVirtualRegister(DstRegClass); 4891 Register LaneValueReg = MRI.createVirtualRegister(DstRegClass); 4892 4893 bool IsWave32 = ST.isWave32(); 4894 unsigned MovOpc = IsWave32 ? 
AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; 4895 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC; 4896 4897 // Create initial values of the induction variable from Exec and the Accumulator, 4898 // and insert a branch to the newly created ComputeLoop block. 4899 uint32_t InitalValue = 4900 (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0; 4901 auto TmpSReg = 4902 BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg); 4903 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg) 4904 .addImm(InitalValue); 4905 BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH)).addMBB(ComputeLoop); 4906 4907 // Start constructing ComputeLoop 4908 I = ComputeLoop->end(); 4909 auto Accumulator = 4910 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg) 4911 .addReg(InitalValReg) 4912 .addMBB(&BB); 4913 auto ActiveBits = 4914 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg) 4915 .addReg(TmpSReg->getOperand(0).getReg()) 4916 .addMBB(&BB); 4917 4918 // Perform the computations 4919 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64; 4920 auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg) 4921 .addReg(ActiveBits->getOperand(0).getReg()); 4922 auto LaneValue = BuildMI(*ComputeLoop, I, DL, 4923 TII->get(AMDGPU::V_READLANE_B32), LaneValueReg) 4924 .addReg(SrcReg) 4925 .addReg(FF1->getOperand(0).getReg()); 4926 auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg) 4927 .addReg(Accumulator->getOperand(0).getReg()) 4928 .addReg(LaneValue->getOperand(0).getReg()); 4929 4930 // Manipulate the iterator to get the next active lane 4931 unsigned BITSETOpc = 4932 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64; 4933 auto NewActiveBits = 4934 BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg) 4935 .addReg(FF1->getOperand(0).getReg()) 4936 .addReg(ActiveBits->getOperand(0).getReg()); 4937 4938 // Add phi nodes 4939 Accumulator.addReg(NewAccumulator->getOperand(0).getReg()) 4940 .addMBB(ComputeLoop); 4941 ActiveBits.addReg(NewActiveBits->getOperand(0).getReg()) 4942 .addMBB(ComputeLoop); 4943 4944 // Create the loop branch 4945 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64; 4946 BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc)) 4947 .addReg(NewActiveBits->getOperand(0).getReg()) 4948 .addImm(0); 4949 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1)) 4950 .addMBB(ComputeLoop); 4951 4952 RetBB = ComputeEnd; 4953 } 4954 MI.eraseFromParent(); 4955 return RetBB; 4956 } 4957 4958 MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( 4959 MachineInstr &MI, MachineBasicBlock *BB) const { 4960 4961 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 4962 MachineFunction *MF = BB->getParent(); 4963 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 4964 4965 switch (MI.getOpcode()) { 4966 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32: 4967 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32); 4968 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32: 4969 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32); 4970 case AMDGPU::S_UADDO_PSEUDO: 4971 case AMDGPU::S_USUBO_PSEUDO: { 4972 const DebugLoc &DL = MI.getDebugLoc(); 4973 MachineOperand &Dest0 = MI.getOperand(0); 4974 MachineOperand &Dest1 = MI.getOperand(1); 4975 MachineOperand &Src0 = MI.getOperand(2); 4976 MachineOperand &Src1 = MI.getOperand(3); 4977 4978 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO) 4979 ?
AMDGPU::S_ADD_I32 4980 : AMDGPU::S_SUB_I32; 4981 BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg()).add(Src0).add(Src1); 4982 4983 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B64), Dest1.getReg()) 4984 .addImm(1) 4985 .addImm(0); 4986 4987 MI.eraseFromParent(); 4988 return BB; 4989 } 4990 case AMDGPU::S_ADD_U64_PSEUDO: 4991 case AMDGPU::S_SUB_U64_PSEUDO: { 4992 // For targets older than GFX12, we emit a sequence of 32-bit operations. 4993 // For GFX12, we emit s_add_u64 and s_sub_u64. 4994 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); 4995 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); 4996 const DebugLoc &DL = MI.getDebugLoc(); 4997 MachineOperand &Dest = MI.getOperand(0); 4998 MachineOperand &Src0 = MI.getOperand(1); 4999 MachineOperand &Src1 = MI.getOperand(2); 5000 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO); 5001 if (Subtarget->hasScalarAddSub64()) { 5002 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64; 5003 BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg()) 5004 .add(Src0) 5005 .add(Src1); 5006 } else { 5007 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 5008 const TargetRegisterClass *BoolRC = TRI->getBoolRC(); 5009 5010 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 5011 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 5012 5013 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm( 5014 MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass); 5015 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm( 5016 MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass); 5017 5018 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm( 5019 MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass); 5020 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm( 5021 MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass); 5022 5023 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32; 5024 unsigned HiOpc = IsAdd ? 
AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32; 5025 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0) 5026 .add(Src0Sub0) 5027 .add(Src1Sub0); 5028 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1) 5029 .add(Src0Sub1) 5030 .add(Src1Sub1); 5031 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg()) 5032 .addReg(DestSub0) 5033 .addImm(AMDGPU::sub0) 5034 .addReg(DestSub1) 5035 .addImm(AMDGPU::sub1); 5036 } 5037 MI.eraseFromParent(); 5038 return BB; 5039 } 5040 case AMDGPU::V_ADD_U64_PSEUDO: 5041 case AMDGPU::V_SUB_U64_PSEUDO: { 5042 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); 5043 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); 5044 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 5045 const DebugLoc &DL = MI.getDebugLoc(); 5046 5047 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO); 5048 5049 MachineOperand &Dest = MI.getOperand(0); 5050 MachineOperand &Src0 = MI.getOperand(1); 5051 MachineOperand &Src1 = MI.getOperand(2); 5052 5053 if (IsAdd && ST.hasLshlAddB64()) { 5054 auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64), 5055 Dest.getReg()) 5056 .add(Src0) 5057 .addImm(0) 5058 .add(Src1); 5059 TII->legalizeOperands(*Add); 5060 MI.eraseFromParent(); 5061 return BB; 5062 } 5063 5064 const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); 5065 5066 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 5067 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 5068 5069 Register CarryReg = MRI.createVirtualRegister(CarryRC); 5070 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC); 5071 5072 const TargetRegisterClass *Src0RC = Src0.isReg() 5073 ? MRI.getRegClass(Src0.getReg()) 5074 : &AMDGPU::VReg_64RegClass; 5075 const TargetRegisterClass *Src1RC = Src1.isReg() 5076 ? MRI.getRegClass(Src1.getReg()) 5077 : &AMDGPU::VReg_64RegClass; 5078 5079 const TargetRegisterClass *Src0SubRC = 5080 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0); 5081 const TargetRegisterClass *Src1SubRC = 5082 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1); 5083 5084 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm( 5085 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC); 5086 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm( 5087 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC); 5088 5089 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm( 5090 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC); 5091 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm( 5092 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC); 5093 5094 unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64; 5095 MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0) 5096 .addReg(CarryReg, RegState::Define) 5097 .add(SrcReg0Sub0) 5098 .add(SrcReg1Sub0) 5099 .addImm(0); // clamp bit 5100 5101 unsigned HiOpc = IsAdd ? 
AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64; 5102 MachineInstr *HiHalf = 5103 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1) 5104 .addReg(DeadCarryReg, RegState::Define | RegState::Dead) 5105 .add(SrcReg0Sub1) 5106 .add(SrcReg1Sub1) 5107 .addReg(CarryReg, RegState::Kill) 5108 .addImm(0); // clamp bit 5109 5110 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg()) 5111 .addReg(DestSub0) 5112 .addImm(AMDGPU::sub0) 5113 .addReg(DestSub1) 5114 .addImm(AMDGPU::sub1); 5115 TII->legalizeOperands(*LoHalf); 5116 TII->legalizeOperands(*HiHalf); 5117 MI.eraseFromParent(); 5118 return BB; 5119 } 5120 case AMDGPU::S_ADD_CO_PSEUDO: 5121 case AMDGPU::S_SUB_CO_PSEUDO: { 5122 // This pseudo has a chance to be selected 5123 // only from uniform add/subcarry node. All the VGPR operands 5124 // therefore assumed to be splat vectors. 5125 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); 5126 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); 5127 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 5128 MachineBasicBlock::iterator MII = MI; 5129 const DebugLoc &DL = MI.getDebugLoc(); 5130 MachineOperand &Dest = MI.getOperand(0); 5131 MachineOperand &CarryDest = MI.getOperand(1); 5132 MachineOperand &Src0 = MI.getOperand(2); 5133 MachineOperand &Src1 = MI.getOperand(3); 5134 MachineOperand &Src2 = MI.getOperand(4); 5135 unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO) 5136 ? AMDGPU::S_ADDC_U32 5137 : AMDGPU::S_SUBB_U32; 5138 if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) { 5139 Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 5140 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0) 5141 .addReg(Src0.getReg()); 5142 Src0.setReg(RegOp0); 5143 } 5144 if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) { 5145 Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 5146 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1) 5147 .addReg(Src1.getReg()); 5148 Src1.setReg(RegOp1); 5149 } 5150 Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 5151 if (TRI->isVectorRegister(MRI, Src2.getReg())) { 5152 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2) 5153 .addReg(Src2.getReg()); 5154 Src2.setReg(RegOp2); 5155 } 5156 5157 const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg()); 5158 unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC); 5159 assert(WaveSize == 64 || WaveSize == 32); 5160 5161 if (WaveSize == 64) { 5162 if (ST.hasScalarCompareEq64()) { 5163 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64)) 5164 .addReg(Src2.getReg()) 5165 .addImm(0); 5166 } else { 5167 const TargetRegisterClass *SubRC = 5168 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0); 5169 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm( 5170 MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC); 5171 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm( 5172 MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC); 5173 Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 5174 5175 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32) 5176 .add(Src2Sub0) 5177 .add(Src2Sub1); 5178 5179 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32)) 5180 .addReg(Src2_32, RegState::Kill) 5181 .addImm(0); 5182 } 5183 } else { 5184 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32)) 5185 .addReg(Src2.getReg()) 5186 .addImm(0); 5187 } 5188 5189 BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg()).add(Src0).add(Src1); 5190 5191 unsigned SelOpc = 5192 (WaveSize == 
64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32; 5193 5194 BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg()) 5195 .addImm(-1) 5196 .addImm(0); 5197 5198 MI.eraseFromParent(); 5199 return BB; 5200 } 5201 case AMDGPU::SI_INIT_M0: { 5202 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(), 5203 TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) 5204 .add(MI.getOperand(0)); 5205 MI.eraseFromParent(); 5206 return BB; 5207 } 5208 case AMDGPU::GET_GROUPSTATICSIZE: { 5209 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA || 5210 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL); 5211 DebugLoc DL = MI.getDebugLoc(); 5212 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32)) 5213 .add(MI.getOperand(0)) 5214 .addImm(MFI->getLDSSize()); 5215 MI.eraseFromParent(); 5216 return BB; 5217 } 5218 case AMDGPU::GET_SHADERCYCLESHILO: { 5219 assert(MF->getSubtarget<GCNSubtarget>().hasShaderCyclesHiLoRegisters()); 5220 MachineRegisterInfo &MRI = MF->getRegInfo(); 5221 const DebugLoc &DL = MI.getDebugLoc(); 5222 // The algorithm is: 5223 // 5224 // hi1 = getreg(SHADER_CYCLES_HI) 5225 // lo1 = getreg(SHADER_CYCLES_LO) 5226 // hi2 = getreg(SHADER_CYCLES_HI) 5227 // 5228 // If hi1 == hi2 then there was no overflow and the result is hi2:lo1. 5229 // Otherwise there was overflow and the result is hi2:0. In both cases the 5230 // result should represent the actual time at some point during the sequence 5231 // of three getregs. 5232 using namespace AMDGPU::Hwreg; 5233 Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 5234 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1) 5235 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32)); 5236 Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 5237 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1) 5238 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32)); 5239 Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 5240 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2) 5241 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32)); 5242 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32)) 5243 .addReg(RegHi1) 5244 .addReg(RegHi2); 5245 Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 5246 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo) 5247 .addReg(RegLo1) 5248 .addImm(0); 5249 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE)) 5250 .add(MI.getOperand(0)) 5251 .addReg(RegLo) 5252 .addImm(AMDGPU::sub0) 5253 .addReg(RegHi2) 5254 .addImm(AMDGPU::sub1); 5255 MI.eraseFromParent(); 5256 return BB; 5257 } 5258 case AMDGPU::SI_INDIRECT_SRC_V1: 5259 case AMDGPU::SI_INDIRECT_SRC_V2: 5260 case AMDGPU::SI_INDIRECT_SRC_V4: 5261 case AMDGPU::SI_INDIRECT_SRC_V8: 5262 case AMDGPU::SI_INDIRECT_SRC_V9: 5263 case AMDGPU::SI_INDIRECT_SRC_V10: 5264 case AMDGPU::SI_INDIRECT_SRC_V11: 5265 case AMDGPU::SI_INDIRECT_SRC_V12: 5266 case AMDGPU::SI_INDIRECT_SRC_V16: 5267 case AMDGPU::SI_INDIRECT_SRC_V32: 5268 return emitIndirectSrc(MI, *BB, *getSubtarget()); 5269 case AMDGPU::SI_INDIRECT_DST_V1: 5270 case AMDGPU::SI_INDIRECT_DST_V2: 5271 case AMDGPU::SI_INDIRECT_DST_V4: 5272 case AMDGPU::SI_INDIRECT_DST_V8: 5273 case AMDGPU::SI_INDIRECT_DST_V9: 5274 case AMDGPU::SI_INDIRECT_DST_V10: 5275 case AMDGPU::SI_INDIRECT_DST_V11: 5276 case AMDGPU::SI_INDIRECT_DST_V12: 5277 case AMDGPU::SI_INDIRECT_DST_V16: 5278 case AMDGPU::SI_INDIRECT_DST_V32: 5279 return emitIndirectDst(MI, *BB, *getSubtarget()); 5280 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO: 5281 case 
AMDGPU::SI_KILL_I1_PSEUDO: 5282 return splitKillBlock(MI, BB); 5283 case AMDGPU::V_CNDMASK_B64_PSEUDO: { 5284 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); 5285 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); 5286 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 5287 5288 Register Dst = MI.getOperand(0).getReg(); 5289 const MachineOperand &Src0 = MI.getOperand(1); 5290 const MachineOperand &Src1 = MI.getOperand(2); 5291 const DebugLoc &DL = MI.getDebugLoc(); 5292 Register SrcCond = MI.getOperand(3).getReg(); 5293 5294 Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 5295 Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 5296 const auto *CondRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); 5297 Register SrcCondCopy = MRI.createVirtualRegister(CondRC); 5298 5299 const TargetRegisterClass *Src0RC = Src0.isReg() 5300 ? MRI.getRegClass(Src0.getReg()) 5301 : &AMDGPU::VReg_64RegClass; 5302 const TargetRegisterClass *Src1RC = Src1.isReg() 5303 ? MRI.getRegClass(Src1.getReg()) 5304 : &AMDGPU::VReg_64RegClass; 5305 5306 const TargetRegisterClass *Src0SubRC = 5307 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0); 5308 const TargetRegisterClass *Src1SubRC = 5309 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1); 5310 5311 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm( 5312 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC); 5313 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm( 5314 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC); 5315 5316 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm( 5317 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC); 5318 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm( 5319 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC); 5320 5321 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy) 5322 .addReg(SrcCond); 5323 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo) 5324 .addImm(0) 5325 .add(Src0Sub0) 5326 .addImm(0) 5327 .add(Src1Sub0) 5328 .addReg(SrcCondCopy); 5329 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi) 5330 .addImm(0) 5331 .add(Src0Sub1) 5332 .addImm(0) 5333 .add(Src1Sub1) 5334 .addReg(SrcCondCopy); 5335 5336 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst) 5337 .addReg(DstLo) 5338 .addImm(AMDGPU::sub0) 5339 .addReg(DstHi) 5340 .addImm(AMDGPU::sub1); 5341 MI.eraseFromParent(); 5342 return BB; 5343 } 5344 case AMDGPU::SI_BR_UNDEF: { 5345 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 5346 const DebugLoc &DL = MI.getDebugLoc(); 5347 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1)) 5348 .add(MI.getOperand(0)); 5349 Br->getOperand(1).setIsUndef(); // read undef SCC 5350 MI.eraseFromParent(); 5351 return BB; 5352 } 5353 case AMDGPU::ADJCALLSTACKUP: 5354 case AMDGPU::ADJCALLSTACKDOWN: { 5355 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); 5356 MachineInstrBuilder MIB(*MF, &MI); 5357 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine) 5358 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit); 5359 return BB; 5360 } 5361 case AMDGPU::SI_CALL_ISEL: { 5362 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 5363 const DebugLoc &DL = MI.getDebugLoc(); 5364 5365 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF); 5366 5367 MachineInstrBuilder MIB; 5368 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg); 5369 5370 for (const MachineOperand &MO : MI.operands()) 5371 MIB.add(MO); 5372 5373 MIB.cloneMemRefs(MI); 5374 
MI.eraseFromParent(); 5375 return BB; 5376 } 5377 case AMDGPU::V_ADD_CO_U32_e32: 5378 case AMDGPU::V_SUB_CO_U32_e32: 5379 case AMDGPU::V_SUBREV_CO_U32_e32: { 5380 // TODO: Define distinct V_*_I32_Pseudo instructions instead. 5381 const DebugLoc &DL = MI.getDebugLoc(); 5382 unsigned Opc = MI.getOpcode(); 5383 5384 bool NeedClampOperand = false; 5385 if (TII->pseudoToMCOpcode(Opc) == -1) { 5386 Opc = AMDGPU::getVOPe64(Opc); 5387 NeedClampOperand = true; 5388 } 5389 5390 auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg()); 5391 if (TII->isVOP3(*I)) { 5392 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); 5393 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 5394 I.addReg(TRI->getVCC(), RegState::Define); 5395 } 5396 I.add(MI.getOperand(1)) 5397 .add(MI.getOperand(2)); 5398 if (NeedClampOperand) 5399 I.addImm(0); // clamp bit for e64 encoding 5400 5401 TII->legalizeOperands(*I); 5402 5403 MI.eraseFromParent(); 5404 return BB; 5405 } 5406 case AMDGPU::V_ADDC_U32_e32: 5407 case AMDGPU::V_SUBB_U32_e32: 5408 case AMDGPU::V_SUBBREV_U32_e32: 5409 // These instructions have an implicit use of vcc which counts towards the 5410 // constant bus limit. 5411 TII->legalizeOperands(MI); 5412 return BB; 5413 case AMDGPU::DS_GWS_INIT: 5414 case AMDGPU::DS_GWS_SEMA_BR: 5415 case AMDGPU::DS_GWS_BARRIER: 5416 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0); 5417 [[fallthrough]]; 5418 case AMDGPU::DS_GWS_SEMA_V: 5419 case AMDGPU::DS_GWS_SEMA_P: 5420 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL: 5421 // A s_waitcnt 0 is required to be the instruction immediately following. 5422 if (getSubtarget()->hasGWSAutoReplay()) { 5423 bundleInstWithWaitcnt(MI); 5424 return BB; 5425 } 5426 5427 return emitGWSMemViolTestLoop(MI, BB); 5428 case AMDGPU::S_SETREG_B32: { 5429 // Try to optimize cases that only set the denormal mode or rounding mode. 5430 // 5431 // If the s_setreg_b32 fully sets all of the bits in the rounding mode or 5432 // denormal mode to a constant, we can use s_round_mode or s_denorm_mode 5433 // instead. 5434 // 5435 // FIXME: This could be predicates on the immediate, but tablegen doesn't 5436 // allow you to have a no side effect instruction in the output of a 5437 // sideeffecting pattern. 5438 auto [ID, Offset, Width] = 5439 AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm()); 5440 if (ID != AMDGPU::Hwreg::ID_MODE) 5441 return BB; 5442 5443 const unsigned WidthMask = maskTrailingOnes<unsigned>(Width); 5444 const unsigned SetMask = WidthMask << Offset; 5445 5446 if (getSubtarget()->hasDenormModeInst()) { 5447 unsigned SetDenormOp = 0; 5448 unsigned SetRoundOp = 0; 5449 5450 // The dedicated instructions can only set the whole denorm or round mode 5451 // at once, not a subset of bits in either. 5452 if (SetMask == 5453 (AMDGPU::Hwreg::FP_ROUND_MASK | AMDGPU::Hwreg::FP_DENORM_MASK)) { 5454 // If this fully sets both the round and denorm mode, emit the two 5455 // dedicated instructions for these. 
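// For example (illustrative only): s_setreg_b32 hwreg(HW_REG_MODE, 0, 8), x
// with a known immediate x becomes s_round_mode (x & 0xf) followed by
// s_denorm_mode (x >> 4).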
5456 SetRoundOp = AMDGPU::S_ROUND_MODE; 5457 SetDenormOp = AMDGPU::S_DENORM_MODE; 5458 } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) { 5459 SetRoundOp = AMDGPU::S_ROUND_MODE; 5460 } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) { 5461 SetDenormOp = AMDGPU::S_DENORM_MODE; 5462 } 5463 5464 if (SetRoundOp || SetDenormOp) { 5465 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); 5466 MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg()); 5467 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) { 5468 unsigned ImmVal = Def->getOperand(1).getImm(); 5469 if (SetRoundOp) { 5470 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp)) 5471 .addImm(ImmVal & 0xf); 5472 5473 // If we also have the denorm mode, get just the denorm mode bits. 5474 ImmVal >>= 4; 5475 } 5476 5477 if (SetDenormOp) { 5478 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp)) 5479 .addImm(ImmVal & 0xf); 5480 } 5481 5482 MI.eraseFromParent(); 5483 return BB; 5484 } 5485 } 5486 } 5487 5488 // If only FP bits are touched, use the no side effects pseudo. 5489 if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK | 5490 AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask) 5491 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode)); 5492 5493 return BB; 5494 } 5495 case AMDGPU::S_INVERSE_BALLOT_U32: 5496 case AMDGPU::S_INVERSE_BALLOT_U64: 5497 // These opcodes only exist to let SIFixSGPRCopies insert a readfirstlane if 5498 // necessary. After that they are equivalent to a COPY. 5499 MI.setDesc(TII->get(AMDGPU::COPY)); 5500 return BB; 5501 case AMDGPU::ENDPGM_TRAP: { 5502 const DebugLoc &DL = MI.getDebugLoc(); 5503 if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) { 5504 MI.setDesc(TII->get(AMDGPU::S_ENDPGM)); 5505 MI.addOperand(MachineOperand::CreateImm(0)); 5506 return BB; 5507 } 5508 5509 // We need a block split to make the real endpgm a terminator. We also don't 5510 // want to break phis in successor blocks, so we can't just delete to the 5511 // end of the block. 5512 5513 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/); 5514 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); 5515 MF->push_back(TrapBB); 5516 BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM)) 5517 .addImm(0); 5518 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) 5519 .addMBB(TrapBB); 5520 5521 BB->addSuccessor(TrapBB); 5522 MI.eraseFromParent(); 5523 return SplitBB; 5524 } 5525 case AMDGPU::SIMULATED_TRAP: { 5526 assert(Subtarget->hasPrivEnabledTrap2NopBug()); 5527 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); 5528 MachineBasicBlock *SplitBB = 5529 TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc()); 5530 MI.eraseFromParent(); 5531 return SplitBB; 5532 } 5533 default: 5534 if (TII->isImage(MI) || TII->isMUBUF(MI)) { 5535 if (!MI.mayStore()) 5536 AddMemOpInit(MI); 5537 return BB; 5538 } 5539 return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); 5540 } 5541 } 5542 5543 bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const { 5544 // This currently forces unfolding various combinations of fsub into fma with 5545 // free fneg'd operands. As long as we have fast FMA (controlled by 5546 // isFMAFasterThanFMulAndFAdd), we should perform these. 5547 5548 // When fma is quarter rate (e.g. for f64, where add / sub are at best half 5549 // rate), most of these combines appear to be cycle neutral but save on 5550 // instruction count / code size.
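// For instance (illustrative), one such combine turns (fsub x, (fmul y, z))
// into (fma (fneg y), z, x), with the fneg folded for free into the fma's
// source modifiers.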
5551 return true; 5552 } 5553 5554 bool SITargetLowering::enableAggressiveFMAFusion(LLT Ty) const { return true; } 5555 5556 EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx, 5557 EVT VT) const { 5558 if (!VT.isVector()) { 5559 return MVT::i1; 5560 } 5561 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements()); 5562 } 5563 5564 MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const { 5565 // TODO: Should i16 be used always if legal? For now it would force VALU 5566 // shifts. 5567 return (VT == MVT::i16) ? MVT::i16 : MVT::i32; 5568 } 5569 5570 LLT SITargetLowering::getPreferredShiftAmountTy(LLT Ty) const { 5571 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts()) 5572 ? Ty.changeElementSize(16) 5573 : Ty.changeElementSize(32); 5574 } 5575 5576 // Answering this is somewhat tricky and depends on the specific device which 5577 // have different rates for fma or all f64 operations. 5578 // 5579 // v_fma_f64 and v_mul_f64 always take the same number of cycles as each other 5580 // regardless of which device (although the number of cycles differs between 5581 // devices), so it is always profitable for f64. 5582 // 5583 // v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable 5584 // only on full rate devices. Normally, we should prefer selecting v_mad_f32 5585 // which we can always do even without fused FP ops since it returns the same 5586 // result as the separate operations and since it is always full 5587 // rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32 5588 // however does not support denormals, so we do report fma as faster if we have 5589 // a fast fma device and require denormals. 5590 // 5591 bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, 5592 EVT VT) const { 5593 VT = VT.getScalarType(); 5594 5595 switch (VT.getSimpleVT().SimpleTy) { 5596 case MVT::f32: { 5597 // If mad is not available this depends only on if f32 fma is full rate. 5598 if (!Subtarget->hasMadMacF32Insts()) 5599 return Subtarget->hasFastFMAF32(); 5600 5601 // Otherwise f32 mad is always full rate and returns the same result as 5602 // the separate operations so should be preferred over fma. 5603 // However does not support denormals. 5604 if (!denormalModeIsFlushAllF32(MF)) 5605 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts(); 5606 5607 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32. 
5608 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts(); 5609 } 5610 case MVT::f64: 5611 return true; 5612 case MVT::f16: 5613 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF); 5614 default: 5615 break; 5616 } 5617 5618 return false; 5619 } 5620 5621 bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, 5622 LLT Ty) const { 5623 switch (Ty.getScalarSizeInBits()) { 5624 case 16: 5625 return isFMAFasterThanFMulAndFAdd(MF, MVT::f16); 5626 case 32: 5627 return isFMAFasterThanFMulAndFAdd(MF, MVT::f32); 5628 case 64: 5629 return isFMAFasterThanFMulAndFAdd(MF, MVT::f64); 5630 default: 5631 break; 5632 } 5633 5634 return false; 5635 } 5636 5637 bool SITargetLowering::isFMADLegal(const MachineInstr &MI, LLT Ty) const { 5638 if (!Ty.isScalar()) 5639 return false; 5640 5641 if (Ty.getScalarSizeInBits() == 16) 5642 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(*MI.getMF()); 5643 if (Ty.getScalarSizeInBits() == 32) 5644 return Subtarget->hasMadMacF32Insts() && 5645 denormalModeIsFlushAllF32(*MI.getMF()); 5646 5647 return false; 5648 } 5649 5650 bool SITargetLowering::isFMADLegal(const SelectionDAG &DAG, 5651 const SDNode *N) const { 5652 // TODO: Check future ftz flag 5653 // v_mad_f32/v_mac_f32 do not support denormals. 5654 EVT VT = N->getValueType(0); 5655 if (VT == MVT::f32) 5656 return Subtarget->hasMadMacF32Insts() && 5657 denormalModeIsFlushAllF32(DAG.getMachineFunction()); 5658 if (VT == MVT::f16) { 5659 return Subtarget->hasMadF16() && 5660 denormalModeIsFlushAllF64F16(DAG.getMachineFunction()); 5661 } 5662 5663 return false; 5664 } 5665 5666 //===----------------------------------------------------------------------===// 5667 // Custom DAG Lowering Operations 5668 //===----------------------------------------------------------------------===// 5669 5670 // Work around LegalizeDAG doing the wrong thing and fully scalarizing if the 5671 // wider vector type is legal. 5672 SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op, 5673 SelectionDAG &DAG) const { 5674 unsigned Opc = Op.getOpcode(); 5675 EVT VT = Op.getValueType(); 5676 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 || 5677 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 || 5678 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 || 5679 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16); 5680 5681 SDValue Lo, Hi; 5682 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0); 5683 5684 SDLoc SL(Op); 5685 SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo, 5686 Op->getFlags()); 5687 SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi, 5688 Op->getFlags()); 5689 5690 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi); 5691 } 5692 5693 // Work around LegalizeDAG doing the wrong thing and fully scalarizing if the 5694 // wider vector type is legal. 
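// E.g. a v8i16 add is lowered here as two v4i16 adds whose results are
// rejoined with CONCAT_VECTORS, rather than being scalarized into eight
// i16 adds.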
5695 SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op, 5696 SelectionDAG &DAG) const { 5697 unsigned Opc = Op.getOpcode(); 5698 EVT VT = Op.getValueType(); 5699 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 || 5700 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 || 5701 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 || 5702 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16); 5703 5704 SDValue Lo0, Hi0; 5705 std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0); 5706 SDValue Lo1, Hi1; 5707 std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1); 5708 5709 SDLoc SL(Op); 5710 5711 SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, 5712 Op->getFlags()); 5713 SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, 5714 Op->getFlags()); 5715 5716 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi); 5717 } 5718 5719 SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op, 5720 SelectionDAG &DAG) const { 5721 unsigned Opc = Op.getOpcode(); 5722 EVT VT = Op.getValueType(); 5723 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 || 5724 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 || 5725 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 || 5726 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 || 5727 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 || 5728 VT == MVT::v32bf16); 5729 5730 SDValue Lo0, Hi0; 5731 SDValue Op0 = Op.getOperand(0); 5732 std::tie(Lo0, Hi0) = Op0.getValueType().isVector() 5733 ? DAG.SplitVectorOperand(Op.getNode(), 0) 5734 : std::pair(Op0, Op0); 5735 SDValue Lo1, Hi1; 5736 std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1); 5737 SDValue Lo2, Hi2; 5738 std::tie(Lo2, Hi2) = DAG.SplitVectorOperand(Op.getNode(), 2); 5739 5740 SDLoc SL(Op); 5741 auto ResVT = DAG.GetSplitDestVTs(VT); 5742 5743 SDValue OpLo = DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, 5744 Op->getFlags()); 5745 SDValue OpHi = DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, 5746 Op->getFlags()); 5747 5748 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi); 5749 } 5750 5751 5752 SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 5753 switch (Op.getOpcode()) { 5754 default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); 5755 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 5756 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); 5757 case ISD::LOAD: { 5758 SDValue Result = LowerLOAD(Op, DAG); 5759 assert((!Result.getNode() || 5760 Result.getNode()->getNumValues() == 2) && 5761 "Load should return a value and a chain"); 5762 return Result; 5763 } 5764 case ISD::FSQRT: { 5765 EVT VT = Op.getValueType(); 5766 if (VT == MVT::f32) 5767 return lowerFSQRTF32(Op, DAG); 5768 if (VT == MVT::f64) 5769 return lowerFSQRTF64(Op, DAG); 5770 return SDValue(); 5771 } 5772 case ISD::FSIN: 5773 case ISD::FCOS: 5774 return LowerTrig(Op, DAG); 5775 case ISD::SELECT: return LowerSELECT(Op, DAG); 5776 case ISD::FDIV: return LowerFDIV(Op, DAG); 5777 case ISD::FFREXP: return LowerFFREXP(Op, DAG); 5778 case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG); 5779 case ISD::STORE: return LowerSTORE(Op, DAG); 5780 case ISD::GlobalAddress: { 5781 MachineFunction &MF = DAG.getMachineFunction(); 5782 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 5783 return LowerGlobalAddress(MFI, Op, DAG); 5784 } 5785 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); 5786 case 
ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG); 5787 case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG); 5788 case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG); 5789 case ISD::INSERT_SUBVECTOR: 5790 return lowerINSERT_SUBVECTOR(Op, DAG); 5791 case ISD::INSERT_VECTOR_ELT: 5792 return lowerINSERT_VECTOR_ELT(Op, DAG); 5793 case ISD::EXTRACT_VECTOR_ELT: 5794 return lowerEXTRACT_VECTOR_ELT(Op, DAG); 5795 case ISD::VECTOR_SHUFFLE: 5796 return lowerVECTOR_SHUFFLE(Op, DAG); 5797 case ISD::SCALAR_TO_VECTOR: 5798 return lowerSCALAR_TO_VECTOR(Op, DAG); 5799 case ISD::BUILD_VECTOR: 5800 return lowerBUILD_VECTOR(Op, DAG); 5801 case ISD::FP_ROUND: 5802 case ISD::STRICT_FP_ROUND: 5803 return lowerFP_ROUND(Op, DAG); 5804 case ISD::FPTRUNC_ROUND: { 5805 unsigned Opc; 5806 SDLoc DL(Op); 5807 5808 if (Op.getOperand(0)->getValueType(0) != MVT::f32) 5809 return SDValue(); 5810 5811 // Get the rounding mode from the last operand 5812 int RoundMode = Op.getConstantOperandVal(1); 5813 if (RoundMode == (int)RoundingMode::TowardPositive) 5814 Opc = AMDGPUISD::FPTRUNC_ROUND_UPWARD; 5815 else if (RoundMode == (int)RoundingMode::TowardNegative) 5816 Opc = AMDGPUISD::FPTRUNC_ROUND_DOWNWARD; 5817 else 5818 return SDValue(); 5819 5820 return DAG.getNode(Opc, DL, Op.getNode()->getVTList(), Op->getOperand(0)); 5821 } 5822 case ISD::TRAP: 5823 return lowerTRAP(Op, DAG); 5824 case ISD::DEBUGTRAP: 5825 return lowerDEBUGTRAP(Op, DAG); 5826 case ISD::ABS: 5827 case ISD::FABS: 5828 case ISD::FNEG: 5829 case ISD::FCANONICALIZE: 5830 case ISD::BSWAP: 5831 return splitUnaryVectorOp(Op, DAG); 5832 case ISD::FMINNUM: 5833 case ISD::FMAXNUM: 5834 return lowerFMINNUM_FMAXNUM(Op, DAG); 5835 case ISD::FLDEXP: 5836 case ISD::STRICT_FLDEXP: 5837 return lowerFLDEXP(Op, DAG); 5838 case ISD::FMA: 5839 return splitTernaryVectorOp(Op, DAG); 5840 case ISD::FP_TO_SINT: 5841 case ISD::FP_TO_UINT: 5842 return LowerFP_TO_INT(Op, DAG); 5843 case ISD::SHL: 5844 case ISD::SRA: 5845 case ISD::SRL: 5846 case ISD::ADD: 5847 case ISD::SUB: 5848 case ISD::SMIN: 5849 case ISD::SMAX: 5850 case ISD::UMIN: 5851 case ISD::UMAX: 5852 case ISD::FADD: 5853 case ISD::FMUL: 5854 case ISD::FMINNUM_IEEE: 5855 case ISD::FMAXNUM_IEEE: 5856 case ISD::FMINIMUM: 5857 case ISD::FMAXIMUM: 5858 case ISD::UADDSAT: 5859 case ISD::USUBSAT: 5860 case ISD::SADDSAT: 5861 case ISD::SSUBSAT: 5862 return splitBinaryVectorOp(Op, DAG); 5863 case ISD::MUL: 5864 return lowerMUL(Op, DAG); 5865 case ISD::SMULO: 5866 case ISD::UMULO: 5867 return lowerXMULO(Op, DAG); 5868 case ISD::SMUL_LOHI: 5869 case ISD::UMUL_LOHI: 5870 return lowerXMUL_LOHI(Op, DAG); 5871 case ISD::DYNAMIC_STACKALLOC: 5872 return LowerDYNAMIC_STACKALLOC(Op, DAG); 5873 case ISD::STACKSAVE: 5874 return LowerSTACKSAVE(Op, DAG); 5875 case ISD::GET_ROUNDING: 5876 return lowerGET_ROUNDING(Op, DAG); 5877 case ISD::SET_ROUNDING: 5878 return lowerSET_ROUNDING(Op, DAG); 5879 case ISD::PREFETCH: 5880 return lowerPREFETCH(Op, DAG); 5881 case ISD::FP_EXTEND: 5882 case ISD::STRICT_FP_EXTEND: 5883 return lowerFP_EXTEND(Op, DAG); 5884 case ISD::GET_FPENV: 5885 return lowerGET_FPENV(Op, DAG); 5886 case ISD::SET_FPENV: 5887 return lowerSET_FPENV(Op, DAG); 5888 } 5889 return SDValue(); 5890 } 5891 5892 // Used for D16: Casts the result of an instruction into the right vector, 5893 // packs values if loads return unpacked values. 
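// For example, on subtargets with unpacked D16 memory instructions a v4f16
// load is returned as v4i32 (one element per dword); the helper below
// truncates each element to i16, rebuilds a v4i16 and bitcasts it back to
// v4f16. Odd element counts (e.g. v3f16) are first widened by one element.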
5894 static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, 5895 const SDLoc &DL, 5896 SelectionDAG &DAG, bool Unpacked) { 5897 if (!LoadVT.isVector()) 5898 return Result; 5899 5900 // Cast back to the original packed type or to a larger type that is a 5901 // multiple of 32 bit for D16. Widening the return type is required for 5902 // legalization. 5903 EVT FittingLoadVT = LoadVT; 5904 if ((LoadVT.getVectorNumElements() % 2) == 1) { 5905 FittingLoadVT = 5906 EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(), 5907 LoadVT.getVectorNumElements() + 1); 5908 } 5909 5910 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16. 5911 // Truncate to v2i16/v4i16. 5912 EVT IntLoadVT = FittingLoadVT.changeTypeToInteger(); 5913 5914 // Work around the legalizer not scalarizing the truncate after vector op 5915 // legalization and not creating an intermediate vector trunc. 5916 SmallVector<SDValue, 4> Elts; 5917 DAG.ExtractVectorElements(Result, Elts); 5918 for (SDValue &Elt : Elts) 5919 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt); 5920 5921 // Pad illegal v1i16/v3i16 to v4i16 5922 if ((LoadVT.getVectorNumElements() % 2) == 1) 5923 Elts.push_back(DAG.getUNDEF(MVT::i16)); 5924 5925 Result = DAG.getBuildVector(IntLoadVT, DL, Elts); 5926 5927 // Bitcast to original type (v2f16/v4f16). 5928 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result); 5929 } 5930 5931 // Cast back to the original packed type. 5932 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result); 5933 } 5934 5935 SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, 5936 MemSDNode *M, 5937 SelectionDAG &DAG, 5938 ArrayRef<SDValue> Ops, 5939 bool IsIntrinsic) const { 5940 SDLoc DL(M); 5941 5942 bool Unpacked = Subtarget->hasUnpackedD16VMem(); 5943 EVT LoadVT = M->getValueType(0); 5944 5945 EVT EquivLoadVT = LoadVT; 5946 if (LoadVT.isVector()) { 5947 if (Unpacked) { 5948 EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, 5949 LoadVT.getVectorNumElements()); 5950 } else if ((LoadVT.getVectorNumElements() % 2) == 1) { 5951 // Widen v3f16 to legal type 5952 EquivLoadVT = 5953 EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(), 5954 LoadVT.getVectorNumElements() + 1); 5955 } 5956 } 5957 5958 // Change from v4f16/v2f16 to EquivLoadVT. 5959 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other); 5960 5961 SDValue Load 5962 = DAG.getMemIntrinsicNode( 5963 IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL, 5964 VTList, Ops, M->getMemoryVT(), 5965 M->getMemOperand()); 5966 5967 SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked); 5968 5969 return DAG.getMergeValues({ Adjusted, Load.getValue(1) }, DL); 5970 } 5971 5972 SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat, 5973 SelectionDAG &DAG, 5974 ArrayRef<SDValue> Ops) const { 5975 SDLoc DL(M); 5976 EVT LoadVT = M->getValueType(0); 5977 EVT EltType = LoadVT.getScalarType(); 5978 EVT IntVT = LoadVT.changeTypeToInteger(); 5979 5980 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16); 5981 5982 assert(M->getNumValues() == 2 || M->getNumValues() == 3); 5983 bool IsTFE = M->getNumValues() == 3; 5984 5985 unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE 5986 : AMDGPUISD::BUFFER_LOAD_FORMAT) 5987 : IsTFE ?
AMDGPUISD::BUFFER_LOAD_TFE 5988 : AMDGPUISD::BUFFER_LOAD; 5989 5990 if (IsD16) { 5991 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops); 5992 } 5993 5994 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics 5995 if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32) 5996 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(), 5997 IsTFE); 5998 5999 if (isTypeLegal(LoadVT)) { 6000 return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT, 6001 M->getMemOperand(), DAG); 6002 } 6003 6004 EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT); 6005 SDVTList VTList = DAG.getVTList(CastVT, MVT::Other); 6006 SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT, 6007 M->getMemOperand(), DAG); 6008 return DAG.getMergeValues( 6009 {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)}, 6010 DL); 6011 } 6012 6013 static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, 6014 SDNode *N, SelectionDAG &DAG) { 6015 EVT VT = N->getValueType(0); 6016 unsigned CondCode = N->getConstantOperandVal(3); 6017 if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode))) 6018 return DAG.getUNDEF(VT); 6019 6020 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode); 6021 6022 SDValue LHS = N->getOperand(1); 6023 SDValue RHS = N->getOperand(2); 6024 6025 SDLoc DL(N); 6026 6027 EVT CmpVT = LHS.getValueType(); 6028 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) { 6029 unsigned PromoteOp = ICmpInst::isSigned(IcInput) ? 6030 ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 6031 LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS); 6032 RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS); 6033 } 6034 6035 ISD::CondCode CCOpcode = getICmpCondCode(IcInput); 6036 6037 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize(); 6038 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize); 6039 6040 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS, 6041 DAG.getCondCode(CCOpcode)); 6042 if (VT.bitsEq(CCVT)) 6043 return SetCC; 6044 return DAG.getZExtOrTrunc(SetCC, DL, VT); 6045 } 6046 6047 static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, 6048 SDNode *N, SelectionDAG &DAG) { 6049 EVT VT = N->getValueType(0); 6050 6051 unsigned CondCode = N->getConstantOperandVal(3); 6052 if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode))) 6053 return DAG.getUNDEF(VT); 6054 6055 SDValue Src0 = N->getOperand(1); 6056 SDValue Src1 = N->getOperand(2); 6057 EVT CmpVT = Src0.getValueType(); 6058 SDLoc SL(N); 6059 6060 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) { 6061 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0); 6062 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1); 6063 } 6064 6065 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode); 6066 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput); 6067 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize(); 6068 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize); 6069 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0, 6070 Src1, DAG.getCondCode(CCOpcode)); 6071 if (VT.bitsEq(CCVT)) 6072 return SetCC; 6073 return DAG.getZExtOrTrunc(SetCC, SL, VT); 6074 } 6075 6076 static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, 6077 SelectionDAG &DAG) { 6078 EVT VT = N->getValueType(0); 6079 SDValue Src = N->getOperand(1); 6080 SDLoc SL(N); 6081 6082 if (Src.getOpcode() == ISD::SETCC) { 6083 // (ballot (ISD::SETCC ...)) -> 
(AMDGPUISD::SETCC ...) 6084 return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0), 6085 Src.getOperand(1), Src.getOperand(2)); 6086 } 6087 if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) { 6088 // (ballot 0) -> 0 6089 if (Arg->isZero()) 6090 return DAG.getConstant(0, SL, VT); 6091 6092 // (ballot 1) -> EXEC/EXEC_LO 6093 if (Arg->isOne()) { 6094 Register Exec; 6095 if (VT.getScalarSizeInBits() == 32) 6096 Exec = AMDGPU::EXEC_LO; 6097 else if (VT.getScalarSizeInBits() == 64) 6098 Exec = AMDGPU::EXEC; 6099 else 6100 return SDValue(); 6101 6102 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT); 6103 } 6104 } 6105 6106 // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0) 6107 // ISD::SETNE) 6108 return DAG.getNode( 6109 AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32), 6110 DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE)); 6111 } 6112 6113 static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, 6114 SelectionDAG &DAG) { 6115 EVT VT = N->getValueType(0); 6116 unsigned ValSize = VT.getSizeInBits(); 6117 unsigned IID = N->getConstantOperandVal(0); 6118 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 || 6119 IID == Intrinsic::amdgcn_permlanex16; 6120 SDLoc SL(N); 6121 MVT IntVT = MVT::getIntegerVT(ValSize); 6122 6123 auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1, 6124 SDValue Src2, MVT ValT) -> SDValue { 6125 SmallVector<SDValue, 8> Operands; 6126 switch (IID) { 6127 case Intrinsic::amdgcn_permlane16: 6128 case Intrinsic::amdgcn_permlanex16: 6129 Operands.push_back(N->getOperand(6)); 6130 Operands.push_back(N->getOperand(5)); 6131 Operands.push_back(N->getOperand(4)); 6132 [[fallthrough]]; 6133 case Intrinsic::amdgcn_writelane: 6134 Operands.push_back(Src2); 6135 [[fallthrough]]; 6136 case Intrinsic::amdgcn_readlane: 6137 Operands.push_back(Src1); 6138 [[fallthrough]]; 6139 case Intrinsic::amdgcn_readfirstlane: 6140 case Intrinsic::amdgcn_permlane64: 6141 Operands.push_back(Src0); 6142 break; 6143 default: 6144 llvm_unreachable("unhandled lane op"); 6145 } 6146 6147 Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32)); 6148 std::reverse(Operands.begin(), Operands.end()); 6149 6150 if (SDNode *GL = N->getGluedNode()) { 6151 assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE); 6152 GL = GL->getOperand(0).getNode(); 6153 Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue, 6154 SDValue(GL, 0))); 6155 } 6156 6157 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, ValT, Operands); 6158 }; 6159 6160 SDValue Src0 = N->getOperand(1); 6161 SDValue Src1, Src2; 6162 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane || 6163 IsPermLane16) { 6164 Src1 = N->getOperand(2); 6165 if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) 6166 Src2 = N->getOperand(3); 6167 } 6168 6169 if (ValSize == 32) { 6170 // Already legal 6171 return SDValue(); 6172 } 6173 6174 if (ValSize < 32) { 6175 bool IsFloat = VT.isFloatingPoint(); 6176 Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0, 6177 SL, MVT::i32); 6178 6179 if (IsPermLane16) { 6180 Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1, 6181 SL, MVT::i32); 6182 } 6183 6184 if (IID == Intrinsic::amdgcn_writelane) { 6185 Src2 = DAG.getAnyExtOrTrunc(IsFloat ? 
DAG.getBitcast(IntVT, Src2) : Src2, 6186 SL, MVT::i32); 6187 } 6188 6189 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32); 6190 SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT); 6191 return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc; 6192 } 6193 6194 if (ValSize % 32 != 0) 6195 return SDValue(); 6196 6197 auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue { 6198 EVT VT = N->getValueType(0); 6199 unsigned NE = VT.getVectorNumElements(); 6200 EVT EltVT = VT.getVectorElementType(); 6201 SmallVector<SDValue, 8> Scalars; 6202 unsigned NumOperands = N->getNumOperands(); 6203 SmallVector<SDValue, 4> Operands(NumOperands); 6204 SDNode *GL = N->getGluedNode(); 6205 6206 // only handle convergencectrl_glue 6207 assert(!GL || GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE); 6208 6209 for (unsigned i = 0; i != NE; ++i) { 6210 for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e; 6211 ++j) { 6212 SDValue Operand = N->getOperand(j); 6213 EVT OperandVT = Operand.getValueType(); 6214 if (OperandVT.isVector()) { 6215 // A vector operand; extract a single element. 6216 EVT OperandEltVT = OperandVT.getVectorElementType(); 6217 Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, OperandEltVT, 6218 Operand, DAG.getVectorIdxConstant(i, SL)); 6219 } else { 6220 // A scalar operand; just use it as is. 6221 Operands[j] = Operand; 6222 } 6223 } 6224 6225 if (GL) 6226 Operands[NumOperands - 1] = 6227 DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue, 6228 SDValue(GL->getOperand(0).getNode(), 0)); 6229 6230 Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands)); 6231 } 6232 6233 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NE); 6234 return DAG.getBuildVector(VecVT, SL, Scalars); 6235 }; 6236 6237 if (VT.isVector()) { 6238 switch (MVT::SimpleValueType EltTy = 6239 VT.getVectorElementType().getSimpleVT().SimpleTy) { 6240 case MVT::i32: 6241 case MVT::f32: { 6242 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT()); 6243 return unrollLaneOp(LaneOp.getNode()); 6244 } 6245 case MVT::i16: 6246 case MVT::f16: 6247 case MVT::bf16: { 6248 MVT SubVecVT = MVT::getVectorVT(EltTy, 2); 6249 SmallVector<SDValue, 4> Pieces; 6250 SDValue Src0SubVec, Src1SubVec, Src2SubVec; 6251 for (unsigned i = 0, EltIdx = 0; i < ValSize / 32; i++) { 6252 Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0, 6253 DAG.getConstant(EltIdx, SL, MVT::i32)); 6254 6255 if (IsPermLane16) 6256 Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1, 6257 DAG.getConstant(EltIdx, SL, MVT::i32)); 6258 6259 if (IID == Intrinsic::amdgcn_writelane) 6260 Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2, 6261 DAG.getConstant(EltIdx, SL, MVT::i32)); 6262 6263 Pieces.push_back( 6264 IsPermLane16 6265 ? 
createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT) 6266 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT)); 6267 EltIdx += 2; 6268 } 6269 return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces); 6270 } 6271 default: 6272 // Handle all other cases by bitcasting to i32 vectors 6273 break; 6274 } 6275 } 6276 6277 MVT VecVT = MVT::getVectorVT(MVT::i32, ValSize / 32); 6278 Src0 = DAG.getBitcast(VecVT, Src0); 6279 6280 if (IsPermLane16) 6281 Src1 = DAG.getBitcast(VecVT, Src1); 6282 6283 if (IID == Intrinsic::amdgcn_writelane) 6284 Src2 = DAG.getBitcast(VecVT, Src2); 6285 6286 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT); 6287 SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode()); 6288 return DAG.getBitcast(VT, UnrolledLaneOp); 6289 } 6290 6291 void SITargetLowering::ReplaceNodeResults(SDNode *N, 6292 SmallVectorImpl<SDValue> &Results, 6293 SelectionDAG &DAG) const { 6294 switch (N->getOpcode()) { 6295 case ISD::INSERT_VECTOR_ELT: { 6296 if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG)) 6297 Results.push_back(Res); 6298 return; 6299 } 6300 case ISD::EXTRACT_VECTOR_ELT: { 6301 if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG)) 6302 Results.push_back(Res); 6303 return; 6304 } 6305 case ISD::INTRINSIC_WO_CHAIN: { 6306 unsigned IID = N->getConstantOperandVal(0); 6307 switch (IID) { 6308 case Intrinsic::amdgcn_make_buffer_rsrc: 6309 Results.push_back(lowerPointerAsRsrcIntrin(N, DAG)); 6310 return; 6311 case Intrinsic::amdgcn_cvt_pkrtz: { 6312 SDValue Src0 = N->getOperand(1); 6313 SDValue Src1 = N->getOperand(2); 6314 SDLoc SL(N); 6315 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, 6316 Src0, Src1); 6317 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt)); 6318 return; 6319 } 6320 case Intrinsic::amdgcn_cvt_pknorm_i16: 6321 case Intrinsic::amdgcn_cvt_pknorm_u16: 6322 case Intrinsic::amdgcn_cvt_pk_i16: 6323 case Intrinsic::amdgcn_cvt_pk_u16: { 6324 SDValue Src0 = N->getOperand(1); 6325 SDValue Src1 = N->getOperand(2); 6326 SDLoc SL(N); 6327 unsigned Opcode; 6328 6329 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16) 6330 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32; 6331 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16) 6332 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32; 6333 else if (IID == Intrinsic::amdgcn_cvt_pk_i16) 6334 Opcode = AMDGPUISD::CVT_PK_I16_I32; 6335 else 6336 Opcode = AMDGPUISD::CVT_PK_U16_U32; 6337 6338 EVT VT = N->getValueType(0); 6339 if (isTypeLegal(VT)) 6340 Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1)); 6341 else { 6342 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1); 6343 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt)); 6344 } 6345 return; 6346 } 6347 case Intrinsic::amdgcn_s_buffer_load: { 6348 // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate 6349 // s_buffer_load_u8 for signed and unsigned load instructions. Next, DAG 6350 // combiner tries to merge the s_buffer_load_u8 with a sext instruction 6351 // (performSignExtendInRegCombine()) and it replaces s_buffer_load_u8 with 6352 // s_buffer_load_i8. 
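// Illustrative only (the IR below is an assumed example, not taken from a
// test): a pattern such as
//   %b = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %rsrc, i32 %off, i32 0)
//   %s = sext i8 %b to i32
// is first emitted here as an s_buffer_load_u8 (SBUFFER_LOAD_UBYTE) and later
// rewritten to s_buffer_load_i8 by the sign-extend combine mentioned above.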
6353 if (!Subtarget->hasScalarSubwordLoads()) 6354 return; 6355 SDValue Op = SDValue(N, 0); 6356 SDValue Rsrc = Op.getOperand(1); 6357 SDValue Offset = Op.getOperand(2); 6358 SDValue CachePolicy = Op.getOperand(3); 6359 EVT VT = Op.getValueType(); 6360 assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n"); 6361 SDLoc DL(Op); 6362 MachineFunction &MF = DAG.getMachineFunction(); 6363 const DataLayout &DataLayout = DAG.getDataLayout(); 6364 Align Alignment = 6365 DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext())); 6366 MachineMemOperand *MMO = MF.getMachineMemOperand( 6367 MachinePointerInfo(), 6368 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 6369 MachineMemOperand::MOInvariant, 6370 VT.getStoreSize(), Alignment); 6371 SDValue LoadVal; 6372 if (!Offset->isDivergent()) { 6373 SDValue Ops[] = {Rsrc, // source register 6374 Offset, CachePolicy}; 6375 SDValue BufferLoad = 6376 DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_UBYTE, DL, 6377 DAG.getVTList(MVT::i32), Ops, VT, MMO); 6378 LoadVal = DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad); 6379 } else { 6380 SDValue Ops[] = { 6381 DAG.getEntryNode(), // Chain 6382 Rsrc, // rsrc 6383 DAG.getConstant(0, DL, MVT::i32), // vindex 6384 {}, // voffset 6385 {}, // soffset 6386 {}, // offset 6387 CachePolicy, // cachepolicy 6388 DAG.getTargetConstant(0, DL, MVT::i1), // idxen 6389 }; 6390 setBufferOffsets(Offset, DAG, &Ops[3], Align(4)); 6391 LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO); 6392 } 6393 Results.push_back(LoadVal); 6394 return; 6395 } 6396 } 6397 break; 6398 } 6399 case ISD::INTRINSIC_W_CHAIN: { 6400 if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) { 6401 if (Res.getOpcode() == ISD::MERGE_VALUES) { 6402 // FIXME: Hacky 6403 for (unsigned I = 0; I < Res.getNumOperands(); I++) { 6404 Results.push_back(Res.getOperand(I)); 6405 } 6406 } else { 6407 Results.push_back(Res); 6408 Results.push_back(Res.getValue(1)); 6409 } 6410 return; 6411 } 6412 6413 break; 6414 } 6415 case ISD::SELECT: { 6416 SDLoc SL(N); 6417 EVT VT = N->getValueType(0); 6418 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT); 6419 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1)); 6420 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2)); 6421 6422 EVT SelectVT = NewVT; 6423 if (NewVT.bitsLT(MVT::i32)) { 6424 LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS); 6425 RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS); 6426 SelectVT = MVT::i32; 6427 } 6428 6429 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, SelectVT, 6430 N->getOperand(0), LHS, RHS); 6431 6432 if (NewVT != SelectVT) 6433 NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect); 6434 Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect)); 6435 return; 6436 } 6437 case ISD::FNEG: { 6438 if (N->getValueType(0) != MVT::v2f16) 6439 break; 6440 6441 SDLoc SL(N); 6442 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0)); 6443 6444 SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32, 6445 BC, 6446 DAG.getConstant(0x80008000, SL, MVT::i32)); 6447 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op)); 6448 return; 6449 } 6450 case ISD::FABS: { 6451 if (N->getValueType(0) != MVT::v2f16) 6452 break; 6453 6454 SDLoc SL(N); 6455 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0)); 6456 6457 SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32, 6458 BC, 6459 DAG.getConstant(0x7fff7fff, SL, MVT::i32)); 6460 Results.push_back(DAG.getNode(ISD::BITCAST, 
SL, MVT::v2f16, Op)); 6461 return; 6462 } 6463 case ISD::FSQRT: { 6464 if (N->getValueType(0) != MVT::f16) 6465 break; 6466 Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG)); 6467 break; 6468 } 6469 default: 6470 AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG); 6471 break; 6472 } 6473 } 6474 6475 /// Helper function for LowerBRCOND 6476 static SDNode *findUser(SDValue Value, unsigned Opcode) { 6477 6478 SDNode *Parent = Value.getNode(); 6479 for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end(); 6480 I != E; ++I) { 6481 6482 if (I.getUse().get() != Value) 6483 continue; 6484 6485 if (I->getOpcode() == Opcode) 6486 return *I; 6487 } 6488 return nullptr; 6489 } 6490 6491 unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const { 6492 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) { 6493 switch (Intr->getConstantOperandVal(1)) { 6494 case Intrinsic::amdgcn_if: 6495 return AMDGPUISD::IF; 6496 case Intrinsic::amdgcn_else: 6497 return AMDGPUISD::ELSE; 6498 case Intrinsic::amdgcn_loop: 6499 return AMDGPUISD::LOOP; 6500 case Intrinsic::amdgcn_end_cf: 6501 llvm_unreachable("should not occur"); 6502 default: 6503 return 0; 6504 } 6505 } 6506 6507 // break, if_break, else_break are all only used as inputs to loop, not 6508 // directly as branch conditions. 6509 return 0; 6510 } 6511 6512 bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const { 6513 const Triple &TT = getTargetMachine().getTargetTriple(); 6514 return (GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS || 6515 GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) && 6516 AMDGPU::shouldEmitConstantsToTextSection(TT); 6517 } 6518 6519 bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const { 6520 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) 6521 return false; 6522 6523 // FIXME: Either avoid relying on address space here or change the default 6524 // address space for functions to avoid the explicit check. 
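// Roughly: calls to functions, and accesses to global/constant address space
// globals, that are neither emitted into .text (shouldEmitFixup) nor known to
// be DSO-local go through the GOT.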
6525 return (GV->getValueType()->isFunctionTy() ||
6526 !isNonGlobalAddrSpace(GV->getAddressSpace())) &&
6527 !shouldEmitFixup(GV) && !getTargetMachine().shouldAssumeDSOLocal(GV);
6528 }
6529
6530 bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
6531 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
6532 }
6533
6534 bool SITargetLowering::shouldUseLDSConstAddress(const GlobalValue *GV) const {
6535 if (!GV->hasExternalLinkage())
6536 return true;
6537
6538 const auto OS = getTargetMachine().getTargetTriple().getOS();
6539 return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
6540 }
6541
6542 /// This transforms the control flow intrinsics to get the branch destination as
6543 /// the last parameter, and also switches the branch target with BR if the need arises.
6544 SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
6545 SelectionDAG &DAG) const {
6546 SDLoc DL(BRCOND);
6547
6548 SDNode *Intr = BRCOND.getOperand(1).getNode();
6549 SDValue Target = BRCOND.getOperand(2);
6550 SDNode *BR = nullptr;
6551 SDNode *SetCC = nullptr;
6552
6553 if (Intr->getOpcode() == ISD::SETCC) {
6554 // As long as we negate the condition everything is fine
6555 SetCC = Intr;
6556 Intr = SetCC->getOperand(0).getNode();
6557
6558 } else {
6559 // Get the target from BR if we don't negate the condition
6560 BR = findUser(BRCOND, ISD::BR);
6561 assert(BR && "brcond missing unconditional branch user");
6562 Target = BR->getOperand(1);
6563 }
6564
6565 unsigned CFNode = isCFIntrinsic(Intr);
6566 if (CFNode == 0) {
6567 // This is a uniform branch so we don't need to legalize.
6568 return BRCOND;
6569 }
6570
6571 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
6572 Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
6573
6574 assert(!SetCC ||
6575 (SetCC->getConstantOperandVal(1) == 1 &&
6576 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
6577 ISD::SETNE));
6578
6579 // operands of the new intrinsic call
6580 SmallVector<SDValue, 4> Ops;
6581 if (HaveChain)
6582 Ops.push_back(BRCOND.getOperand(0));
6583
6584 Ops.append(Intr->op_begin() + (HaveChain ?
2 : 1), Intr->op_end()); 6585 Ops.push_back(Target); 6586 6587 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end()); 6588 6589 // build the new intrinsic call 6590 SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode(); 6591 6592 if (!HaveChain) { 6593 SDValue Ops[] = { 6594 SDValue(Result, 0), 6595 BRCOND.getOperand(0) 6596 }; 6597 6598 Result = DAG.getMergeValues(Ops, DL).getNode(); 6599 } 6600 6601 if (BR) { 6602 // Give the branch instruction our target 6603 SDValue Ops[] = { 6604 BR->getOperand(0), 6605 BRCOND.getOperand(2) 6606 }; 6607 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops); 6608 DAG.ReplaceAllUsesWith(BR, NewBR.getNode()); 6609 } 6610 6611 SDValue Chain = SDValue(Result, Result->getNumValues() - 1); 6612 6613 // Copy the intrinsic results to registers 6614 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) { 6615 SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg); 6616 if (!CopyToReg) 6617 continue; 6618 6619 Chain = DAG.getCopyToReg( 6620 Chain, DL, 6621 CopyToReg->getOperand(1), 6622 SDValue(Result, i - 1), 6623 SDValue()); 6624 6625 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0)); 6626 } 6627 6628 // Remove the old intrinsic from the chain 6629 DAG.ReplaceAllUsesOfValueWith( 6630 SDValue(Intr, Intr->getNumValues() - 1), 6631 Intr->getOperand(0)); 6632 6633 return Chain; 6634 } 6635 6636 SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, 6637 SelectionDAG &DAG) const { 6638 MVT VT = Op.getSimpleValueType(); 6639 SDLoc DL(Op); 6640 // Checking the depth 6641 if (Op.getConstantOperandVal(0) != 0) 6642 return DAG.getConstant(0, DL, VT); 6643 6644 MachineFunction &MF = DAG.getMachineFunction(); 6645 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 6646 // Check for kernel and shader functions 6647 if (Info->isEntryFunction()) 6648 return DAG.getConstant(0, DL, VT); 6649 6650 MachineFrameInfo &MFI = MF.getFrameInfo(); 6651 // There is a call to @llvm.returnaddress in this function 6652 MFI.setReturnAddressIsTaken(true); 6653 6654 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); 6655 // Get the return address reg and mark it as an implicit live-in 6656 Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF), getRegClassFor(VT, Op.getNode()->isDivergent())); 6657 6658 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT); 6659 } 6660 6661 SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, 6662 SDValue Op, 6663 const SDLoc &DL, 6664 EVT VT) const { 6665 return Op.getValueType().bitsLE(VT) ? 
6666 DAG.getNode(ISD::FP_EXTEND, DL, VT, Op) : 6667 DAG.getNode(ISD::FP_ROUND, DL, VT, Op, 6668 DAG.getTargetConstant(0, DL, MVT::i32)); 6669 } 6670 6671 SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { 6672 assert(Op.getValueType() == MVT::f16 && 6673 "Do not know how to custom lower FP_ROUND for non-f16 type"); 6674 6675 SDValue Src = Op.getOperand(0); 6676 EVT SrcVT = Src.getValueType(); 6677 if (SrcVT != MVT::f64) 6678 return Op; 6679 6680 // TODO: Handle strictfp 6681 if (Op.getOpcode() != ISD::FP_ROUND) 6682 return Op; 6683 6684 SDLoc DL(Op); 6685 6686 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src); 6687 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16); 6688 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc); 6689 } 6690 6691 SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op, 6692 SelectionDAG &DAG) const { 6693 EVT VT = Op.getValueType(); 6694 const MachineFunction &MF = DAG.getMachineFunction(); 6695 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 6696 bool IsIEEEMode = Info->getMode().IEEE; 6697 6698 // FIXME: Assert during selection that this is only selected for 6699 // ieee_mode. Currently a combine can produce the ieee version for non-ieee 6700 // mode functions, but this happens to be OK since it's only done in cases 6701 // where there is known no sNaN. 6702 if (IsIEEEMode) 6703 return expandFMINNUM_FMAXNUM(Op.getNode(), DAG); 6704 6705 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 || 6706 VT == MVT::v16bf16) 6707 return splitBinaryVectorOp(Op, DAG); 6708 return Op; 6709 } 6710 6711 SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const { 6712 bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP; 6713 EVT VT = Op.getValueType(); 6714 assert(VT == MVT::f16); 6715 6716 SDValue Exp = Op.getOperand(IsStrict ? 2 : 1); 6717 EVT ExpVT = Exp.getValueType(); 6718 if (ExpVT == MVT::i16) 6719 return Op; 6720 6721 SDLoc DL(Op); 6722 6723 // Correct the exponent type for f16 to i16. 6724 // Clamp the range of the exponent to the instruction's range. 6725 6726 // TODO: This should be a generic narrowing legalization, and can easily be 6727 // for GlobalISel. 6728 6729 SDValue MinExp = DAG.getConstant(minIntN(16), DL, ExpVT); 6730 SDValue ClampMin = DAG.getNode(ISD::SMAX, DL, ExpVT, Exp, MinExp); 6731 6732 SDValue MaxExp = DAG.getConstant(maxIntN(16), DL, ExpVT); 6733 SDValue Clamp = DAG.getNode(ISD::SMIN, DL, ExpVT, ClampMin, MaxExp); 6734 6735 SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp); 6736 6737 if (IsStrict) { 6738 return DAG.getNode(ISD::STRICT_FLDEXP, DL, {VT, MVT::Other}, 6739 {Op.getOperand(0), Op.getOperand(1), TruncExp}); 6740 } 6741 6742 return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp); 6743 } 6744 6745 // Custom lowering for vector multiplications and s_mul_u64. 6746 SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const { 6747 EVT VT = Op.getValueType(); 6748 6749 // Split vector operands. 6750 if (VT.isVector()) 6751 return splitBinaryVectorOp(Op, DAG); 6752 6753 assert(VT == MVT::i64 && "The following code is a special for s_mul_u64"); 6754 6755 // There are four ways to lower s_mul_u64: 6756 // 6757 // 1. If all the operands are uniform, then we lower it as it is. 6758 // 6759 // 2. If the operands are divergent, then we have to split s_mul_u64 in 32-bit 6760 // multiplications because there is not a vector equivalent of s_mul_u64. 6761 // 6762 // 3. 
If the cost model decides that it is more efficient to use vector
6763 // registers, then we have to split s_mul_u64 into 32-bit multiplications.
6764 // This happens in splitScalarSMULU64() in SIInstrInfo.cpp.
6765 //
6766 // 4. If the cost model decides to use vector registers and both of the
6767 // operands are zero-extended/sign-extended from 32-bits, then we split the
6768 // s_mul_u64 into two 32-bit multiplications. The problem is that it is not
6769 // possible to check if the operands are zero-extended or sign-extended in
6770 // SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with
6771 // s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
6772 // s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
6773 // If the cost model decides that we have to use vector registers, then
6774 // splitScalarSMulPseudo() (in SIInstrInfo.cpp) splits s_mul_u64_u32/
6775 // s_mul_i64_i32_pseudo into two vector multiplications. If the cost model
6776 // decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
6777 // s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
6778 // SIInstrInfo.cpp.
6779
6780 if (Op->isDivergent())
6781 return SDValue();
6782
6783 SDValue Op0 = Op.getOperand(0);
6784 SDValue Op1 = Op.getOperand(1);
6785 // If all the operands are zero-extended to 32-bits, then we replace s_mul_u64
6786 // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to
6787 // 32-bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
6788 KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);
6789 unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
6790 KnownBits Op1KnownBits = DAG.computeKnownBits(Op1);
6791 unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
6792 SDLoc SL(Op);
6793 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
6794 return SDValue(
6795 DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
6796 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op0);
6797 unsigned Op1SignBits = DAG.ComputeNumSignBits(Op1);
6798 if (Op0SignBits >= 33 && Op1SignBits >= 33)
6799 return SDValue(
6800 DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
6801 // If all the operands are uniform, then we lower s_mul_u64 as it is.
6802 return Op;
6803 }
6804
6805 SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
6806 EVT VT = Op.getValueType();
6807 SDLoc SL(Op);
6808 SDValue LHS = Op.getOperand(0);
6809 SDValue RHS = Op.getOperand(1);
6810 bool isSigned = Op.getOpcode() == ISD::SMULO;
6811
6812 if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
6813 const APInt &C = RHSC->getAPIntValue();
6814 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
6815 if (C.isPowerOf2()) {
6816 // smulo(x, signed_min) is the same as umulo(x, signed_min).
6817 bool UseArithShift = isSigned && !C.isMinSignedValue();
6818 SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
6819 SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt);
6820 SDValue Overflow = DAG.getSetCC(SL, MVT::i1,
6821 DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL,
6822 SL, VT, Result, ShiftAmt),
6823 LHS, ISD::SETNE);
6824 return DAG.getMergeValues({ Result, Overflow }, SL);
6825 }
6826 }
6827
6828 SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
6829 SDValue Top = DAG.getNode(isSigned ? ISD::MULHS : ISD::MULHU,
6830 SL, VT, LHS, RHS);
6831
6832 SDValue Sign = isSigned
6833 ?
DAG.getNode(ISD::SRA, SL, VT, Result, 6834 DAG.getConstant(VT.getScalarSizeInBits() - 1, SL, MVT::i32)) 6835 : DAG.getConstant(0, SL, VT); 6836 SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE); 6837 6838 return DAG.getMergeValues({ Result, Overflow }, SL); 6839 } 6840 6841 SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const { 6842 if (Op->isDivergent()) { 6843 // Select to V_MAD_[IU]64_[IU]32. 6844 return Op; 6845 } 6846 if (Subtarget->hasSMulHi()) { 6847 // Expand to S_MUL_I32 + S_MUL_HI_[IU]32. 6848 return SDValue(); 6849 } 6850 // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to 6851 // calculate the high part, so we might as well do the whole thing with 6852 // V_MAD_[IU]64_[IU]32. 6853 return Op; 6854 } 6855 6856 SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const { 6857 if (!Subtarget->isTrapHandlerEnabled() || 6858 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) 6859 return lowerTrapEndpgm(Op, DAG); 6860 6861 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG) : 6862 lowerTrapHsaQueuePtr(Op, DAG); 6863 } 6864 6865 SDValue SITargetLowering::lowerTrapEndpgm( 6866 SDValue Op, SelectionDAG &DAG) const { 6867 SDLoc SL(Op); 6868 SDValue Chain = Op.getOperand(0); 6869 return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain); 6870 } 6871 6872 SDValue SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT, 6873 const SDLoc &DL, Align Alignment, ImplicitParameter Param) const { 6874 MachineFunction &MF = DAG.getMachineFunction(); 6875 uint64_t Offset = getImplicitParameterOffset(MF, Param); 6876 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset); 6877 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 6878 return DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, PtrInfo, Alignment, 6879 MachineMemOperand::MODereferenceable | 6880 MachineMemOperand::MOInvariant); 6881 } 6882 6883 SDValue SITargetLowering::lowerTrapHsaQueuePtr( 6884 SDValue Op, SelectionDAG &DAG) const { 6885 SDLoc SL(Op); 6886 SDValue Chain = Op.getOperand(0); 6887 6888 SDValue QueuePtr; 6889 // For code object version 5, QueuePtr is passed through implicit kernarg. 6890 const Module *M = DAG.getMachineFunction().getFunction().getParent(); 6891 if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) { 6892 QueuePtr = 6893 loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR); 6894 } else { 6895 MachineFunction &MF = DAG.getMachineFunction(); 6896 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 6897 Register UserSGPR = Info->getQueuePtrUserSGPR(); 6898 6899 if (UserSGPR == AMDGPU::NoRegister) { 6900 // We probably are in a function incorrectly marked with 6901 // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the 6902 // trap, so just use a null pointer. 
6903 QueuePtr = DAG.getConstant(0, SL, MVT::i64); 6904 } else { 6905 QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR, 6906 MVT::i64); 6907 } 6908 } 6909 6910 SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64); 6911 SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01, 6912 QueuePtr, SDValue()); 6913 6914 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap); 6915 SDValue Ops[] = { 6916 ToReg, 6917 DAG.getTargetConstant(TrapID, SL, MVT::i16), 6918 SGPR01, 6919 ToReg.getValue(1) 6920 }; 6921 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops); 6922 } 6923 6924 SDValue SITargetLowering::lowerTrapHsa( 6925 SDValue Op, SelectionDAG &DAG) const { 6926 SDLoc SL(Op); 6927 SDValue Chain = Op.getOperand(0); 6928 6929 // We need to simulate the 's_trap 2' instruction on targets that run in 6930 // PRIV=1 (where it is treated as a nop). 6931 if (Subtarget->hasPrivEnabledTrap2NopBug()) 6932 return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain); 6933 6934 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap); 6935 SDValue Ops[] = { 6936 Chain, 6937 DAG.getTargetConstant(TrapID, SL, MVT::i16) 6938 }; 6939 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops); 6940 } 6941 6942 SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const { 6943 SDLoc SL(Op); 6944 SDValue Chain = Op.getOperand(0); 6945 MachineFunction &MF = DAG.getMachineFunction(); 6946 6947 if (!Subtarget->isTrapHandlerEnabled() || 6948 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { 6949 DiagnosticInfoUnsupported NoTrap(MF.getFunction(), 6950 "debugtrap handler not supported", 6951 Op.getDebugLoc(), 6952 DS_Warning); 6953 LLVMContext &Ctx = MF.getFunction().getContext(); 6954 Ctx.diagnose(NoTrap); 6955 return Chain; 6956 } 6957 6958 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap); 6959 SDValue Ops[] = { 6960 Chain, 6961 DAG.getTargetConstant(TrapID, SL, MVT::i16) 6962 }; 6963 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops); 6964 } 6965 6966 SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL, 6967 SelectionDAG &DAG) const { 6968 if (Subtarget->hasApertureRegs()) { 6969 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS) 6970 ? AMDGPU::SRC_SHARED_BASE 6971 : AMDGPU::SRC_PRIVATE_BASE; 6972 // Note: this feature (register) is broken. When used as a 32-bit operand, 6973 // it returns a wrong value (all zeroes?). The real value is in the upper 32 6974 // bits. 6975 // 6976 // To work around the issue, directly emit a 64 bit mov from this register 6977 // then extract the high bits. Note that this shouldn't even result in a 6978 // shift being emitted and simply become a pair of registers (e.g.): 6979 // s_mov_b64 s[6:7], src_shared_base 6980 // v_mov_b32_e32 v1, s7 6981 // 6982 // FIXME: It would be more natural to emit a CopyFromReg here, but then copy 6983 // coalescing would kick in and it would think it's okay to use the "HI" 6984 // subregister directly (instead of extracting the HI 32 bits) which is an 6985 // artificial (unusable) register. 6986 // Register TableGen definitions would need an overhaul to get rid of the 6987 // artificial "HI" aperture registers and prevent this kind of issue from 6988 // happening. 
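// The node sequence built below is therefore, conceptually:
//   (truncate (srl (S_MOV_B64 $aperture_reg), 32))
// i.e. read the whole 64-bit register and keep only its high half.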
6989 SDNode *Mov = DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64, 6990 DAG.getRegister(ApertureRegNo, MVT::i64)); 6991 return DAG.getNode( 6992 ISD::TRUNCATE, DL, MVT::i32, 6993 DAG.getNode(ISD::SRL, DL, MVT::i64, 6994 {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)})); 6995 } 6996 6997 // For code object version 5, private_base and shared_base are passed through 6998 // implicit kernargs. 6999 const Module *M = DAG.getMachineFunction().getFunction().getParent(); 7000 if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) { 7001 ImplicitParameter Param = 7002 (AS == AMDGPUAS::LOCAL_ADDRESS) ? SHARED_BASE : PRIVATE_BASE; 7003 return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param); 7004 } 7005 7006 MachineFunction &MF = DAG.getMachineFunction(); 7007 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 7008 Register UserSGPR = Info->getQueuePtrUserSGPR(); 7009 if (UserSGPR == AMDGPU::NoRegister) { 7010 // We probably are in a function incorrectly marked with 7011 // amdgpu-no-queue-ptr. This is undefined. 7012 return DAG.getUNDEF(MVT::i32); 7013 } 7014 7015 SDValue QueuePtr = CreateLiveInRegister( 7016 DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64); 7017 7018 // Offset into amd_queue_t for group_segment_aperture_base_hi / 7019 // private_segment_aperture_base_hi. 7020 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 7021 7022 SDValue Ptr = 7023 DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset)); 7024 7025 // TODO: Use custom target PseudoSourceValue. 7026 // TODO: We should use the value from the IR intrinsic call, but it might not 7027 // be available and how do we get it? 7028 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 7029 return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo, 7030 commonAlignment(Align(64), StructOffset), 7031 MachineMemOperand::MODereferenceable | 7032 MachineMemOperand::MOInvariant); 7033 } 7034 7035 /// Return true if the value is a known valid address, such that a null check is 7036 /// not necessary. 7037 static bool isKnownNonNull(SDValue Val, SelectionDAG &DAG, 7038 const AMDGPUTargetMachine &TM, unsigned AddrSpace) { 7039 if (isa<FrameIndexSDNode>(Val) || isa<GlobalAddressSDNode>(Val) || 7040 isa<BasicBlockSDNode>(Val)) 7041 return true; 7042 7043 if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val)) 7044 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace); 7045 7046 // TODO: Search through arithmetic, handle arguments and loads 7047 // marked nonnull. 
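// Anything else (e.g. a plain pointer argument or the result of a load) is
// conservatively treated as possibly null for now.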
7048 return false; 7049 } 7050 7051 SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op, 7052 SelectionDAG &DAG) const { 7053 SDLoc SL(Op); 7054 7055 const AMDGPUTargetMachine &TM = 7056 static_cast<const AMDGPUTargetMachine &>(getTargetMachine()); 7057 7058 unsigned DestAS, SrcAS; 7059 SDValue Src; 7060 bool IsNonNull = false; 7061 if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) { 7062 SrcAS = ASC->getSrcAddressSpace(); 7063 Src = ASC->getOperand(0); 7064 DestAS = ASC->getDestAddressSpace(); 7065 } else { 7066 assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN && 7067 Op.getConstantOperandVal(0) == 7068 Intrinsic::amdgcn_addrspacecast_nonnull); 7069 Src = Op->getOperand(1); 7070 SrcAS = Op->getConstantOperandVal(2); 7071 DestAS = Op->getConstantOperandVal(3); 7072 IsNonNull = true; 7073 } 7074 7075 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64); 7076 7077 // flat -> local/private 7078 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 7079 if (DestAS == AMDGPUAS::LOCAL_ADDRESS || 7080 DestAS == AMDGPUAS::PRIVATE_ADDRESS) { 7081 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src); 7082 7083 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS)) 7084 return Ptr; 7085 7086 unsigned NullVal = TM.getNullPointerValue(DestAS); 7087 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32); 7088 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE); 7089 7090 return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr, 7091 SegmentNullPtr); 7092 } 7093 } 7094 7095 // local/private -> flat 7096 if (DestAS == AMDGPUAS::FLAT_ADDRESS) { 7097 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS || 7098 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) { 7099 7100 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG); 7101 SDValue CvtPtr = 7102 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture); 7103 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr); 7104 7105 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS)) 7106 return CvtPtr; 7107 7108 unsigned NullVal = TM.getNullPointerValue(SrcAS); 7109 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32); 7110 7111 SDValue NonNull 7112 = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE); 7113 7114 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr, 7115 FlatNullPtr); 7116 } 7117 } 7118 7119 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT && 7120 Op.getValueType() == MVT::i64) { 7121 const SIMachineFunctionInfo *Info = 7122 DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>(); 7123 SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32); 7124 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi); 7125 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec); 7126 } 7127 7128 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT && 7129 Src.getValueType() == MVT::i64) 7130 return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src); 7131 7132 // global <-> flat are no-ops and never emitted. 7133 7134 const MachineFunction &MF = DAG.getMachineFunction(); 7135 DiagnosticInfoUnsupported InvalidAddrSpaceCast( 7136 MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc()); 7137 DAG.getContext()->diagnose(InvalidAddrSpaceCast); 7138 7139 return DAG.getUNDEF(Op->getValueType(0)); 7140 } 7141 7142 // This lowers an INSERT_SUBVECTOR by extracting the individual elements from 7143 // the small vector and inserting them into the big vector. That is better than 7144 // the default expansion of doing it via a stack slot. 
Even though the use of 7145 // the stack slot would be optimized away afterwards, the stack slot itself 7146 // remains. 7147 SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op, 7148 SelectionDAG &DAG) const { 7149 SDValue Vec = Op.getOperand(0); 7150 SDValue Ins = Op.getOperand(1); 7151 SDValue Idx = Op.getOperand(2); 7152 EVT VecVT = Vec.getValueType(); 7153 EVT InsVT = Ins.getValueType(); 7154 EVT EltVT = VecVT.getVectorElementType(); 7155 unsigned InsNumElts = InsVT.getVectorNumElements(); 7156 unsigned IdxVal = Idx->getAsZExtVal(); 7157 SDLoc SL(Op); 7158 7159 if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) { 7160 // Insert 32-bit registers at a time. 7161 assert(InsNumElts % 2 == 0 && "expect legal vector types"); 7162 7163 unsigned VecNumElts = VecVT.getVectorNumElements(); 7164 EVT NewVecVT = 7165 EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2); 7166 EVT NewInsVT = InsNumElts == 2 ? MVT::i32 7167 : EVT::getVectorVT(*DAG.getContext(), 7168 MVT::i32, InsNumElts / 2); 7169 7170 Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec); 7171 Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins); 7172 7173 for (unsigned I = 0; I != InsNumElts / 2; ++I) { 7174 SDValue Elt; 7175 if (InsNumElts == 2) { 7176 Elt = Ins; 7177 } else { 7178 Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins, 7179 DAG.getConstant(I, SL, MVT::i32)); 7180 } 7181 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt, 7182 DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32)); 7183 } 7184 7185 return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec); 7186 } 7187 7188 for (unsigned I = 0; I != InsNumElts; ++I) { 7189 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins, 7190 DAG.getConstant(I, SL, MVT::i32)); 7191 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt, 7192 DAG.getConstant(IdxVal + I, SL, MVT::i32)); 7193 } 7194 return Vec; 7195 } 7196 7197 SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op, 7198 SelectionDAG &DAG) const { 7199 SDValue Vec = Op.getOperand(0); 7200 SDValue InsVal = Op.getOperand(1); 7201 SDValue Idx = Op.getOperand(2); 7202 EVT VecVT = Vec.getValueType(); 7203 EVT EltVT = VecVT.getVectorElementType(); 7204 unsigned VecSize = VecVT.getSizeInBits(); 7205 unsigned EltSize = EltVT.getSizeInBits(); 7206 SDLoc SL(Op); 7207 7208 // Specially handle the case of v4i16 with static indexing. 7209 unsigned NumElts = VecVT.getVectorNumElements(); 7210 auto KIdx = dyn_cast<ConstantSDNode>(Idx); 7211 if (NumElts == 4 && EltSize == 16 && KIdx) { 7212 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec); 7213 7214 SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec, 7215 DAG.getConstant(0, SL, MVT::i32)); 7216 SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec, 7217 DAG.getConstant(1, SL, MVT::i32)); 7218 7219 SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf); 7220 SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf); 7221 7222 unsigned Idx = KIdx->getZExtValue(); 7223 bool InsertLo = Idx < 2; 7224 SDValue InsHalf = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16, 7225 InsertLo ? LoVec : HiVec, 7226 DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal), 7227 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32)); 7228 7229 InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf); 7230 7231 SDValue Concat = InsertLo ? 
7232 DAG.getBuildVector(MVT::v2i32, SL, { InsHalf, HiHalf }) : 7233 DAG.getBuildVector(MVT::v2i32, SL, { LoHalf, InsHalf }); 7234 7235 return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat); 7236 } 7237 7238 // Static indexing does not lower to stack access, and hence there is no need 7239 // for special custom lowering to avoid stack access. 7240 if (isa<ConstantSDNode>(Idx)) 7241 return SDValue(); 7242 7243 // Avoid stack access for dynamic indexing by custom lowering to 7244 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec 7245 7246 assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits"); 7247 7248 MVT IntVT = MVT::getIntegerVT(VecSize); 7249 7250 // Convert vector index to bit-index and get the required bit mask. 7251 assert(isPowerOf2_32(EltSize)); 7252 const auto EltMask = maskTrailingOnes<uint64_t>(EltSize); 7253 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32); 7254 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor); 7255 SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT, 7256 DAG.getConstant(EltMask, SL, IntVT), ScaledIdx); 7257 7258 // 1. Create a congruent vector with the target value in each element. 7259 SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT, 7260 DAG.getSplatBuildVector(VecVT, SL, InsVal)); 7261 7262 // 2. Mask off all other indices except the required index within (1). 7263 SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal); 7264 7265 // 3. Mask off the required index within the target vector. 7266 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec); 7267 SDValue RHS = DAG.getNode(ISD::AND, SL, IntVT, 7268 DAG.getNOT(SL, BFM, IntVT), BCVec); 7269 7270 // 4. Get (2) and (3) ORed into the target vector. 7271 SDValue BFI = DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS); 7272 7273 return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI); 7274 } 7275 7276 SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, 7277 SelectionDAG &DAG) const { 7278 SDLoc SL(Op); 7279 7280 EVT ResultVT = Op.getValueType(); 7281 SDValue Vec = Op.getOperand(0); 7282 SDValue Idx = Op.getOperand(1); 7283 EVT VecVT = Vec.getValueType(); 7284 unsigned VecSize = VecVT.getSizeInBits(); 7285 EVT EltVT = VecVT.getVectorElementType(); 7286 7287 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr); 7288 7289 // Make sure we do any optimizations that will make it easier to fold 7290 // source modifiers before obscuring it with bit operations. 7291 7292 // XXX - Why doesn't this get called when vector_shuffle is expanded? 
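// For the wide (128/256/512-bit) vectors handled below, the strategy is,
// roughly: rebuild the vector as a low half and a high half, select the half
// containing the requested element based on the index, and re-index into the
// selected half with (Idx & (NElem / 2 - 1)).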
7293 if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI)) 7294 return Combined; 7295 7296 if (VecSize == 128 || VecSize == 256 || VecSize == 512) { 7297 SDValue Lo, Hi; 7298 EVT LoVT, HiVT; 7299 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VecVT); 7300 7301 if (VecSize == 128) { 7302 SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec); 7303 Lo = DAG.getBitcast(LoVT, 7304 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2, 7305 DAG.getConstant(0, SL, MVT::i32))); 7306 Hi = DAG.getBitcast(HiVT, 7307 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2, 7308 DAG.getConstant(1, SL, MVT::i32))); 7309 } else if (VecSize == 256) { 7310 SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec); 7311 SDValue Parts[4]; 7312 for (unsigned P = 0; P < 4; ++P) { 7313 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2, 7314 DAG.getConstant(P, SL, MVT::i32)); 7315 } 7316 7317 Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64, 7318 Parts[0], Parts[1])); 7319 Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64, 7320 Parts[2], Parts[3])); 7321 } else { 7322 assert(VecSize == 512); 7323 7324 SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec); 7325 SDValue Parts[8]; 7326 for (unsigned P = 0; P < 8; ++P) { 7327 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2, 7328 DAG.getConstant(P, SL, MVT::i32)); 7329 } 7330 7331 Lo = DAG.getBitcast(LoVT, 7332 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64, 7333 Parts[0], Parts[1], Parts[2], Parts[3])); 7334 Hi = DAG.getBitcast(HiVT, 7335 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64, 7336 Parts[4], Parts[5],Parts[6], Parts[7])); 7337 } 7338 7339 EVT IdxVT = Idx.getValueType(); 7340 unsigned NElem = VecVT.getVectorNumElements(); 7341 assert(isPowerOf2_32(NElem)); 7342 SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT); 7343 SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask); 7344 SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT); 7345 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx); 7346 } 7347 7348 assert(VecSize <= 64); 7349 7350 MVT IntVT = MVT::getIntegerVT(VecSize); 7351 7352 // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly. 
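// (At this point VecSize <= 64, so after the SCALAR_TO_VECTOR peephole below
// the element is ultimately selected with plain integer ops rather than
// memory: roughly Elt = trunc (srl (bitcast Vec to iN), Idx * EltSize).)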
7353 SDValue VecBC = peekThroughBitcasts(Vec); 7354 if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) { 7355 SDValue Src = VecBC.getOperand(0); 7356 Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src); 7357 Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT); 7358 } 7359 7360 unsigned EltSize = EltVT.getSizeInBits(); 7361 assert(isPowerOf2_32(EltSize)); 7362 7363 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32); 7364 7365 // Convert vector index to bit-index (* EltSize) 7366 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor); 7367 7368 SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec); 7369 SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx); 7370 7371 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) { 7372 SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt); 7373 return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result); 7374 } 7375 7376 return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT); 7377 } 7378 7379 static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) { 7380 assert(Elt % 2 == 0); 7381 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0); 7382 } 7383 7384 SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op, 7385 SelectionDAG &DAG) const { 7386 SDLoc SL(Op); 7387 EVT ResultVT = Op.getValueType(); 7388 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op); 7389 7390 EVT PackVT = ResultVT.isInteger() ? MVT::v2i16 : MVT::v2f16; 7391 EVT EltVT = PackVT.getVectorElementType(); 7392 int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements(); 7393 7394 // vector_shuffle <0,1,6,7> lhs, rhs 7395 // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2) 7396 // 7397 // vector_shuffle <6,7,2,3> lhs, rhs 7398 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2) 7399 // 7400 // vector_shuffle <6,7,0,1> lhs, rhs 7401 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0) 7402 7403 // Avoid scalarizing when both halves are reading from consecutive elements. 7404 SmallVector<SDValue, 4> Pieces; 7405 for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) { 7406 if (elementPairIsContiguous(SVN->getMask(), I)) { 7407 const int Idx = SVN->getMaskElt(I); 7408 int VecIdx = Idx < SrcNumElts ? 0 : 1; 7409 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts; 7410 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, 7411 PackVT, SVN->getOperand(VecIdx), 7412 DAG.getConstant(EltIdx, SL, MVT::i32)); 7413 Pieces.push_back(SubVec); 7414 } else { 7415 const int Idx0 = SVN->getMaskElt(I); 7416 const int Idx1 = SVN->getMaskElt(I + 1); 7417 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1; 7418 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1; 7419 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts; 7420 int EltIdx1 = Idx1 < SrcNumElts ? 
Idx1 : Idx1 - SrcNumElts; 7421 7422 SDValue Vec0 = SVN->getOperand(VecIdx0); 7423 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, 7424 Vec0, DAG.getConstant(EltIdx0, SL, MVT::i32)); 7425 7426 SDValue Vec1 = SVN->getOperand(VecIdx1); 7427 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, 7428 Vec1, DAG.getConstant(EltIdx1, SL, MVT::i32)); 7429 Pieces.push_back(DAG.getBuildVector(PackVT, SL, { Elt0, Elt1 })); 7430 } 7431 } 7432 7433 return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces); 7434 } 7435 7436 SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op, 7437 SelectionDAG &DAG) const { 7438 SDValue SVal = Op.getOperand(0); 7439 EVT ResultVT = Op.getValueType(); 7440 EVT SValVT = SVal.getValueType(); 7441 SDValue UndefVal = DAG.getUNDEF(SValVT); 7442 SDLoc SL(Op); 7443 7444 SmallVector<SDValue, 8> VElts; 7445 VElts.push_back(SVal); 7446 for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I) 7447 VElts.push_back(UndefVal); 7448 7449 return DAG.getBuildVector(ResultVT, SL, VElts); 7450 } 7451 7452 SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op, 7453 SelectionDAG &DAG) const { 7454 SDLoc SL(Op); 7455 EVT VT = Op.getValueType(); 7456 7457 if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 || 7458 VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) { 7459 EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 7460 VT.getVectorNumElements() / 2); 7461 MVT HalfIntVT = MVT::getIntegerVT(HalfVT.getSizeInBits()); 7462 7463 // Turn into pair of packed build_vectors. 7464 // TODO: Special case for constants that can be materialized with s_mov_b64. 7465 SmallVector<SDValue, 4> LoOps, HiOps; 7466 for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I != E; ++I) { 7467 LoOps.push_back(Op.getOperand(I)); 7468 HiOps.push_back(Op.getOperand(I + E)); 7469 } 7470 SDValue Lo = DAG.getBuildVector(HalfVT, SL, LoOps); 7471 SDValue Hi = DAG.getBuildVector(HalfVT, SL, HiOps); 7472 7473 SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, HalfIntVT, Lo); 7474 SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, HalfIntVT, Hi); 7475 7476 SDValue Blend = DAG.getBuildVector(MVT::getVectorVT(HalfIntVT, 2), SL, 7477 { CastLo, CastHi }); 7478 return DAG.getNode(ISD::BITCAST, SL, VT, Blend); 7479 } 7480 7481 if (VT == MVT::v16i16 || VT == MVT::v16f16 || VT == MVT::v16bf16) { 7482 EVT QuarterVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 7483 VT.getVectorNumElements() / 4); 7484 MVT QuarterIntVT = MVT::getIntegerVT(QuarterVT.getSizeInBits()); 7485 7486 SmallVector<SDValue, 4> Parts[4]; 7487 for (unsigned I = 0, E = VT.getVectorNumElements() / 4; I != E; ++I) { 7488 for (unsigned P = 0; P < 4; ++P) 7489 Parts[P].push_back(Op.getOperand(I + P * E)); 7490 } 7491 SDValue Casts[4]; 7492 for (unsigned P = 0; P < 4; ++P) { 7493 SDValue Vec = DAG.getBuildVector(QuarterVT, SL, Parts[P]); 7494 Casts[P] = DAG.getNode(ISD::BITCAST, SL, QuarterIntVT, Vec); 7495 } 7496 7497 SDValue Blend = 7498 DAG.getBuildVector(MVT::getVectorVT(QuarterIntVT, 4), SL, Casts); 7499 return DAG.getNode(ISD::BITCAST, SL, VT, Blend); 7500 } 7501 7502 if (VT == MVT::v32i16 || VT == MVT::v32f16 || VT == MVT::v32bf16) { 7503 EVT QuarterVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 7504 VT.getVectorNumElements() / 8); 7505 MVT QuarterIntVT = MVT::getIntegerVT(QuarterVT.getSizeInBits()); 7506 7507 SmallVector<SDValue, 8> Parts[8]; 7508 for (unsigned I = 0, E = VT.getVectorNumElements() / 8; I != E; ++I) { 7509 for (unsigned P = 0; P < 8; ++P) 7510 
Parts[P].push_back(Op.getOperand(I + P * E)); 7511 } 7512 SDValue Casts[8]; 7513 for (unsigned P = 0; P < 8; ++P) { 7514 SDValue Vec = DAG.getBuildVector(QuarterVT, SL, Parts[P]); 7515 Casts[P] = DAG.getNode(ISD::BITCAST, SL, QuarterIntVT, Vec); 7516 } 7517 7518 SDValue Blend = 7519 DAG.getBuildVector(MVT::getVectorVT(QuarterIntVT, 8), SL, Casts); 7520 return DAG.getNode(ISD::BITCAST, SL, VT, Blend); 7521 } 7522 7523 assert(VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16); 7524 assert(!Subtarget->hasVOP3PInsts() && "this should be legal"); 7525 7526 SDValue Lo = Op.getOperand(0); 7527 SDValue Hi = Op.getOperand(1); 7528 7529 // Avoid adding defined bits with the zero_extend. 7530 if (Hi.isUndef()) { 7531 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo); 7532 SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo); 7533 return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo); 7534 } 7535 7536 Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi); 7537 Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi); 7538 7539 SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi, 7540 DAG.getConstant(16, SL, MVT::i32)); 7541 if (Lo.isUndef()) 7542 return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi); 7543 7544 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo); 7545 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo); 7546 7547 SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi); 7548 return DAG.getNode(ISD::BITCAST, SL, VT, Or); 7549 } 7550 7551 bool 7552 SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { 7553 // OSes that use ELF REL relocations (instead of RELA) can only store a 7554 // 32-bit addend in the instruction, so it is not safe to allow offset folding 7555 // which can create arbitrary 64-bit addends. (This is only a problem for 7556 // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by 7557 // the high 32 bits of the addend.) 7558 // 7559 // This should be kept in sync with how HasRelocationAddend is initialized in 7560 // the constructor of ELFAMDGPUAsmBackend. 7561 if (!Subtarget->isAmdHsaOS()) 7562 return false; 7563 7564 // We can fold offsets for anything that doesn't require a GOT relocation. 7565 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS || 7566 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS || 7567 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) && 7568 !shouldEmitGOTReloc(GA->getGlobal()); 7569 } 7570 7571 static SDValue 7572 buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, 7573 const SDLoc &DL, int64_t Offset, EVT PtrVT, 7574 unsigned GAFlags = SIInstrInfo::MO_NONE) { 7575 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 7576 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is 7577 // lowered to the following code sequence: 7578 // 7579 // For constant address space: 7580 // s_getpc_b64 s[0:1] 7581 // s_add_u32 s0, s0, $symbol 7582 // s_addc_u32 s1, s1, 0 7583 // 7584 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 7585 // a fixup or relocation is emitted to replace $symbol with a literal 7586 // constant, which is a pc-relative offset from the encoding of the $symbol 7587 // operand to the global variable. 
7588 //
7589 // For global address space:
7590 // s_getpc_b64 s[0:1]
7591 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
7592 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
7593 //
7594 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
7595 // fixups or relocations are emitted to replace $symbol@*@lo and
7596 // $symbol@*@hi with the lower and upper 32 bits of a literal constant,
7597 // which is a 64-bit pc-relative offset from the encoding of the $symbol
7598 // operand to the global variable.
7599 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags);
7600 SDValue PtrHi;
7601 if (GAFlags == SIInstrInfo::MO_NONE)
7602 PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
7603 else
7604 PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1);
7605 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
7606 }
7607
7608 SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
7609 SDValue Op,
7610 SelectionDAG &DAG) const {
7611 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
7612 SDLoc DL(GSD);
7613 EVT PtrVT = Op.getValueType();
7614
7615 const GlobalValue *GV = GSD->getGlobal();
7616 if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
7617 shouldUseLDSConstAddress(GV)) ||
7618 GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
7619 GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
7620 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
7621 GV->hasExternalLinkage()) {
7622 Type *Ty = GV->getValueType();
7623 // HIP uses an unsized array `extern __shared__ T s[]`, or a similar
7624 // zero-sized type in other languages, to declare dynamic shared memory
7625 // whose size is not known at compile time. Such arrays are allocated by
7626 // the runtime and placed directly after the statically allocated ones,
7627 // so they all share the same offset.
7628 if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
7629 assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
7630 // Adjust alignment for that dynamic shared memory array.
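// For example (assumed HIP source, shown only for illustration):
//   extern __shared__ float dynSmem[];
// becomes a zero-sized external LDS global; its address is simply the total
// static LDS size, which GET_GROUPSTATICSIZE materializes below.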
7631 Function &F = DAG.getMachineFunction().getFunction(); 7632 MFI->setDynLDSAlign(F, *cast<GlobalVariable>(GV)); 7633 MFI->setUsesDynamicLDS(true); 7634 return SDValue( 7635 DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0); 7636 } 7637 } 7638 return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG); 7639 } 7640 7641 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { 7642 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(), 7643 SIInstrInfo::MO_ABS32_LO); 7644 return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA); 7645 } 7646 7647 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) { 7648 SDValue AddrLo = DAG.getTargetGlobalAddress( 7649 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO); 7650 AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0}; 7651 7652 SDValue AddrHi = DAG.getTargetGlobalAddress( 7653 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI); 7654 AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0}; 7655 7656 return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi); 7657 } 7658 7659 if (shouldEmitFixup(GV)) 7660 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT); 7661 7662 if (shouldEmitPCReloc(GV)) 7663 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT, 7664 SIInstrInfo::MO_REL32); 7665 7666 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT, 7667 SIInstrInfo::MO_GOTPCREL32); 7668 7669 Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext()); 7670 PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS); 7671 const DataLayout &DataLayout = DAG.getDataLayout(); 7672 Align Alignment = DataLayout.getABITypeAlign(PtrTy); 7673 MachinePointerInfo PtrInfo 7674 = MachinePointerInfo::getGOT(DAG.getMachineFunction()); 7675 7676 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment, 7677 MachineMemOperand::MODereferenceable | 7678 MachineMemOperand::MOInvariant); 7679 } 7680 7681 SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, 7682 const SDLoc &DL, SDValue V) const { 7683 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as 7684 // the destination register. 7685 // 7686 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions, 7687 // so we will end up with redundant moves to m0. 7688 // 7689 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result. 7690 7691 // A Null SDValue creates a glue result. 7692 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue, 7693 V, Chain); 7694 return SDValue(M0, 0); 7695 } 7696 7697 SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, 7698 SDValue Op, 7699 MVT VT, 7700 unsigned Offset) const { 7701 SDLoc SL(Op); 7702 SDValue Param = lowerKernargMemParameter( 7703 DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false); 7704 // The local size values will have the hi 16-bits as zero. 
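// The AssertZext below records that guarantee in the DAG: e.g. for VT ==
// MVT::i16 it asserts that bits 16..31 of the loaded i32 are zero, so later
// zero-extensions of this parameter can fold away.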
7705 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param, 7706 DAG.getValueType(VT)); 7707 } 7708 7709 static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, 7710 EVT VT) { 7711 DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(), 7712 "non-hsa intrinsic with hsa target", 7713 DL.getDebugLoc()); 7714 DAG.getContext()->diagnose(BadIntrin); 7715 return DAG.getUNDEF(VT); 7716 } 7717 7718 static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, 7719 EVT VT) { 7720 DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(), 7721 "intrinsic not supported on subtarget", 7722 DL.getDebugLoc()); 7723 DAG.getContext()->diagnose(BadIntrin); 7724 return DAG.getUNDEF(VT); 7725 } 7726 7727 static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, 7728 ArrayRef<SDValue> Elts) { 7729 assert(!Elts.empty()); 7730 MVT Type; 7731 unsigned NumElts = Elts.size(); 7732 7733 if (NumElts <= 12) { 7734 Type = MVT::getVectorVT(MVT::f32, NumElts); 7735 } else { 7736 assert(Elts.size() <= 16); 7737 Type = MVT::v16f32; 7738 NumElts = 16; 7739 } 7740 7741 SmallVector<SDValue, 16> VecElts(NumElts); 7742 for (unsigned i = 0; i < Elts.size(); ++i) { 7743 SDValue Elt = Elts[i]; 7744 if (Elt.getValueType() != MVT::f32) 7745 Elt = DAG.getBitcast(MVT::f32, Elt); 7746 VecElts[i] = Elt; 7747 } 7748 for (unsigned i = Elts.size(); i < NumElts; ++i) 7749 VecElts[i] = DAG.getUNDEF(MVT::f32); 7750 7751 if (NumElts == 1) 7752 return VecElts[0]; 7753 return DAG.getBuildVector(Type, DL, VecElts); 7754 } 7755 7756 static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, 7757 SDValue Src, int ExtraElts) { 7758 EVT SrcVT = Src.getValueType(); 7759 7760 SmallVector<SDValue, 8> Elts; 7761 7762 if (SrcVT.isVector()) 7763 DAG.ExtractVectorElements(Src, Elts); 7764 else 7765 Elts.push_back(Src); 7766 7767 SDValue Undef = DAG.getUNDEF(SrcVT.getScalarType()); 7768 while (ExtraElts--) 7769 Elts.push_back(Undef); 7770 7771 return DAG.getBuildVector(CastVT, DL, Elts); 7772 } 7773 7774 // Re-construct the required return value for an image load intrinsic. 7775 // This is more complicated due to the optional use of TexFailCtrl, which means 7776 // the required return type is an aggregate. 7777 static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, 7778 ArrayRef<EVT> ResultTypes, bool IsTexFail, 7779 bool Unpacked, bool IsD16, int DMaskPop, 7780 int NumVDataDwords, bool IsAtomicPacked16Bit, 7781 const SDLoc &DL) { 7782 // Determine the required return type. This is the same regardless of the IsTexFail flag. 7783 EVT ReqRetVT = ResultTypes[0]; 7784 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1; 7785 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit) 7786 ? (ReqRetNumElts + 1) / 2 7787 : ReqRetNumElts; 7788 7789 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2; 7790 7791 MVT DataDwordVT = NumDataDwords == 1 ? 7792 MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords); 7793 7794 MVT MaskPopVT = MaskPopDwords == 1 ? 
7795 MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords); 7796 7797 SDValue Data(Result, 0); 7798 SDValue TexFail; 7799 7800 if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) { 7801 SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32); 7802 if (MaskPopVT.isVector()) { 7803 Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT, 7804 SDValue(Result, 0), ZeroIdx); 7805 } else { 7806 Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT, 7807 SDValue(Result, 0), ZeroIdx); 7808 } 7809 } 7810 7811 if (DataDwordVT.isVector() && !IsAtomicPacked16Bit) 7812 Data = padEltsToUndef(DAG, DL, DataDwordVT, Data, 7813 NumDataDwords - MaskPopDwords); 7814 7815 if (IsD16) 7816 Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked); 7817 7818 EVT LegalReqRetVT = ReqRetVT; 7819 if (!ReqRetVT.isVector()) { 7820 if (!Data.getValueType().isInteger()) 7821 Data = DAG.getNode(ISD::BITCAST, DL, 7822 Data.getValueType().changeTypeToInteger(), Data); 7823 Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data); 7824 } else { 7825 // We need to widen the return vector to a legal type 7826 if ((ReqRetVT.getVectorNumElements() % 2) == 1 && 7827 ReqRetVT.getVectorElementType().getSizeInBits() == 16) { 7828 LegalReqRetVT = 7829 EVT::getVectorVT(*DAG.getContext(), ReqRetVT.getVectorElementType(), 7830 ReqRetVT.getVectorNumElements() + 1); 7831 } 7832 } 7833 Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data); 7834 7835 if (IsTexFail) { 7836 TexFail = 7837 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0), 7838 DAG.getConstant(MaskPopDwords, DL, MVT::i32)); 7839 7840 return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL); 7841 } 7842 7843 if (Result->getNumValues() == 1) 7844 return Data; 7845 7846 return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL); 7847 } 7848 7849 static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE, 7850 SDValue *LWE, bool &IsTexFail) { 7851 auto TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode()); 7852 7853 uint64_t Value = TexFailCtrlConst->getZExtValue(); 7854 if (Value) { 7855 IsTexFail = true; 7856 } 7857 7858 SDLoc DL(TexFailCtrlConst); 7859 *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32); 7860 Value &= ~(uint64_t)0x1; 7861 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32); 7862 Value &= ~(uint64_t)0x2; 7863 7864 return Value == 0; 7865 } 7866 7867 static void packImage16bitOpsToDwords(SelectionDAG &DAG, SDValue Op, 7868 MVT PackVectorVT, 7869 SmallVectorImpl<SDValue> &PackedAddrs, 7870 unsigned DimIdx, unsigned EndIdx, 7871 unsigned NumGradients) { 7872 SDLoc DL(Op); 7873 for (unsigned I = DimIdx; I < EndIdx; I++) { 7874 SDValue Addr = Op.getOperand(I); 7875 7876 // Gradients are packed with undef for each coordinate. 
7877 // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this: 7878 // 1D: undef,dx/dh; undef,dx/dv 7879 // 2D: dy/dh,dx/dh; dy/dv,dx/dv 7880 // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv 7881 if (((I + 1) >= EndIdx) || 7882 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 || 7883 I == DimIdx + NumGradients - 1))) { 7884 if (Addr.getValueType() != MVT::i16) 7885 Addr = DAG.getBitcast(MVT::i16, Addr); 7886 Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr); 7887 } else { 7888 Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)}); 7889 I++; 7890 } 7891 Addr = DAG.getBitcast(MVT::f32, Addr); 7892 PackedAddrs.push_back(Addr); 7893 } 7894 } 7895 7896 SDValue SITargetLowering::lowerImage(SDValue Op, 7897 const AMDGPU::ImageDimIntrinsicInfo *Intr, 7898 SelectionDAG &DAG, bool WithChain) const { 7899 SDLoc DL(Op); 7900 MachineFunction &MF = DAG.getMachineFunction(); 7901 const GCNSubtarget* ST = &MF.getSubtarget<GCNSubtarget>(); 7902 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 7903 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); 7904 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim); 7905 unsigned IntrOpcode = Intr->BaseOpcode; 7906 bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget); 7907 bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget); 7908 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget); 7909 7910 SmallVector<EVT, 3> ResultTypes(Op->values()); 7911 SmallVector<EVT, 3> OrigResultTypes(Op->values()); 7912 bool IsD16 = false; 7913 bool IsG16 = false; 7914 bool IsA16 = false; 7915 SDValue VData; 7916 int NumVDataDwords = 0; 7917 bool AdjustRetType = false; 7918 bool IsAtomicPacked16Bit = false; 7919 7920 // Offset of intrinsic arguments 7921 const unsigned ArgOffset = WithChain ? 2 : 1; 7922 7923 unsigned DMask; 7924 unsigned DMaskLanes = 0; 7925 7926 if (BaseOpcode->Atomic) { 7927 VData = Op.getOperand(2); 7928 7929 IsAtomicPacked16Bit = 7930 (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 || 7931 Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16); 7932 7933 bool Is64Bit = VData.getValueSizeInBits() == 64; 7934 if (BaseOpcode->AtomicX2) { 7935 SDValue VData2 = Op.getOperand(3); 7936 VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL, 7937 {VData, VData2}); 7938 if (Is64Bit) 7939 VData = DAG.getBitcast(MVT::v4i32, VData); 7940 7941 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32; 7942 DMask = Is64Bit ? 0xf : 0x3; 7943 NumVDataDwords = Is64Bit ? 4 : 2; 7944 } else { 7945 DMask = Is64Bit ? 0x3 : 0x1; 7946 NumVDataDwords = Is64Bit ? 2 : 1; 7947 } 7948 } else { 7949 DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex); 7950 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask); 7951 7952 if (BaseOpcode->Store) { 7953 VData = Op.getOperand(2); 7954 7955 MVT StoreVT = VData.getSimpleValueType(); 7956 if (StoreVT.getScalarType() == MVT::f16) { 7957 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16) 7958 return Op; // D16 is unsupported for this instruction 7959 7960 IsD16 = true; 7961 VData = handleD16VData(VData, DAG, true); 7962 } 7963 7964 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32; 7965 } else if (!BaseOpcode->NoReturn) { 7966 // Work out the num dwords based on the dmask popcount and underlying type 7967 // and whether packing is supported. 
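// For example (illustrative values): dmask = 0b1011 gives DMaskLanes = 3, so a
// packed d16 load needs (3 + 1) / 2 = 2 dwords, while an unpacked or 32-bit load
// needs 3 dwords.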
7968 MVT LoadVT = ResultTypes[0].getSimpleVT(); 7969 if (LoadVT.getScalarType() == MVT::f16) { 7970 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16) 7971 return Op; // D16 is unsupported for this instruction 7972 7973 IsD16 = true; 7974 } 7975 7976 // Confirm that the return type is large enough for the dmask specified 7977 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) || 7978 (!LoadVT.isVector() && DMaskLanes > 1)) 7979 return Op; 7980 7981 // The sq block of gfx8 and gfx9 does not estimate register use correctly 7982 // for d16 image_gather4, image_gather4_l, and image_gather4_lz 7983 // instructions. 7984 if (IsD16 && !Subtarget->hasUnpackedD16VMem() && 7985 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug())) 7986 NumVDataDwords = (DMaskLanes + 1) / 2; 7987 else 7988 NumVDataDwords = DMaskLanes; 7989 7990 AdjustRetType = true; 7991 } 7992 } 7993 7994 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd; 7995 SmallVector<SDValue, 4> VAddrs; 7996 7997 // Check for 16 bit addresses or derivatives and pack if true. 7998 MVT VAddrVT = 7999 Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType(); 8000 MVT VAddrScalarVT = VAddrVT.getScalarType(); 8001 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16; 8002 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16; 8003 8004 VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType(); 8005 VAddrScalarVT = VAddrVT.getScalarType(); 8006 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16; 8007 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16; 8008 8009 // Push back extra arguments. 8010 for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) { 8011 if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) { 8012 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument"); 8013 // Special handling of bias when A16 is on. Bias is of type half but 8014 // occupies a full 32 bits.
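// For example, an f16 bias value b is widened here to the v2f16 <b, undef>, which
// is later bitcast to f32 so that it still occupies one full address dword.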
8015 SDValue Bias = DAG.getBuildVector( 8016 MVT::v2f16, DL, 8017 {Op.getOperand(ArgOffset + I), DAG.getUNDEF(MVT::f16)}); 8018 VAddrs.push_back(Bias); 8019 } else { 8020 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) && 8021 "Bias needs to be converted to 16 bit in A16 mode"); 8022 VAddrs.push_back(Op.getOperand(ArgOffset + I)); 8023 } 8024 } 8025 8026 if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) { 8027 // 16 bit gradients are supported, but are tied to the A16 control 8028 // so both gradients and addresses must be 16 bit 8029 LLVM_DEBUG( 8030 dbgs() << "Failed to lower image intrinsic: 16 bit addresses " 8031 "require 16 bit args for both gradients and addresses"); 8032 return Op; 8033 } 8034 8035 if (IsA16) { 8036 if (!ST->hasA16()) { 8037 LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not " 8038 "support 16 bit addresses\n"); 8039 return Op; 8040 } 8041 } 8042 8043 // We've dealt with incorrect input so we know that if IsA16, IsG16 8044 // are set then we have to compress/pack operands (either address, 8045 // gradient or both) 8046 // In the case where a16 and gradients are tied (no G16 support) then we 8047 // have already verified that both IsA16 and IsG16 are true 8048 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) { 8049 // Activate g16 8050 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo = 8051 AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode); 8052 IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16 8053 } 8054 8055 // Add gradients (packed or unpacked) 8056 if (IsG16) { 8057 // Pack the gradients 8058 // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart); 8059 packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs, 8060 ArgOffset + Intr->GradientStart, 8061 ArgOffset + Intr->CoordStart, Intr->NumGradients); 8062 } else { 8063 for (unsigned I = ArgOffset + Intr->GradientStart; 8064 I < ArgOffset + Intr->CoordStart; I++) 8065 VAddrs.push_back(Op.getOperand(I)); 8066 } 8067 8068 // Add addresses (packed or unpacked) 8069 if (IsA16) { 8070 packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs, 8071 ArgOffset + Intr->CoordStart, VAddrEnd, 8072 0 /* No gradients */); 8073 } else { 8074 // Add uncompressed address 8075 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++) 8076 VAddrs.push_back(Op.getOperand(I)); 8077 } 8078 8079 // If the register allocator cannot place the address registers contiguously 8080 // without introducing moves, then using the non-sequential address encoding 8081 // is always preferable, since it saves VALU instructions and is usually a 8082 // wash in terms of code size or even better. 8083 // 8084 // However, we currently have no way of hinting to the register allocator that 8085 // MIMG addresses should be placed contiguously when it is possible to do so, 8086 // so force non-NSA for the common 2-address case as a heuristic. 8087 // 8088 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 8089 // allocation when possible. 8090 // 8091 // Partial NSA is allowed on GFX11+ where the final register is a contiguous 8092 // set of the remaining addresses. 
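// Illustrative example (hypothetical sizes): with an NSA max size of 5, partial
// NSA support, and 7 address dwords, the first 4 addresses stay in separate
// registers and the remaining 3 are merged below into one contiguous register.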
8093 const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler); 8094 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding(); 8095 const bool UseNSA = ST->hasNSAEncoding() && 8096 VAddrs.size() >= ST->getNSAThreshold(MF) && 8097 (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding); 8098 const bool UsePartialNSA = 8099 UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize; 8100 8101 SDValue VAddr; 8102 if (UsePartialNSA) { 8103 VAddr = getBuildDwordsVector(DAG, DL, 8104 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1)); 8105 } 8106 else if (!UseNSA) { 8107 VAddr = getBuildDwordsVector(DAG, DL, VAddrs); 8108 } 8109 8110 SDValue True = DAG.getTargetConstant(1, DL, MVT::i1); 8111 SDValue False = DAG.getTargetConstant(0, DL, MVT::i1); 8112 SDValue Unorm; 8113 if (!BaseOpcode->Sampler) { 8114 Unorm = True; 8115 } else { 8116 uint64_t UnormConst = 8117 Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex); 8118 8119 Unorm = UnormConst ? True : False; 8120 } 8121 8122 SDValue TFE; 8123 SDValue LWE; 8124 SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex); 8125 bool IsTexFail = false; 8126 if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail)) 8127 return Op; 8128 8129 if (IsTexFail) { 8130 if (!DMaskLanes) { 8131 // Expecting to get an error flag since TFC is on - and dmask is 0 8132 // Force dmask to be at least 1 otherwise the instruction will fail 8133 DMask = 0x1; 8134 DMaskLanes = 1; 8135 NumVDataDwords = 1; 8136 } 8137 NumVDataDwords += 1; 8138 AdjustRetType = true; 8139 } 8140 8141 // Has something earlier tagged that the return type needs adjusting 8142 // This happens if the instruction is a load or has set TexFailCtrl flags 8143 if (AdjustRetType) { 8144 // NumVDataDwords reflects the true number of dwords required in the return type 8145 if (DMaskLanes == 0 && !BaseOpcode->Store) { 8146 // This is a no-op load. This can be eliminated 8147 SDValue Undef = DAG.getUNDEF(Op.getValueType()); 8148 if (isa<MemSDNode>(Op)) 8149 return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL); 8150 return Undef; 8151 } 8152 8153 EVT NewVT = NumVDataDwords > 1 ? 8154 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumVDataDwords) 8155 : MVT::i32; 8156 8157 ResultTypes[0] = NewVT; 8158 if (ResultTypes.size() == 3) { 8159 // Original result was aggregate type used for TexFailCtrl results 8160 // The actual instruction returns as a vector type which has now been 8161 // created. Remove the aggregate result. 8162 ResultTypes.erase(&ResultTypes[1]); 8163 } 8164 } 8165 8166 unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex); 8167 if (BaseOpcode->Atomic) 8168 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization 8169 if (CPol & ~((IsGFX12Plus ? 
AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) | 8170 AMDGPU::CPol::VOLATILE)) 8171 return Op; 8172 8173 SmallVector<SDValue, 26> Ops; 8174 if (BaseOpcode->Store || BaseOpcode->Atomic) 8175 Ops.push_back(VData); // vdata 8176 if (UsePartialNSA) { 8177 append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1)); 8178 Ops.push_back(VAddr); 8179 } 8180 else if (UseNSA) 8181 append_range(Ops, VAddrs); 8182 else 8183 Ops.push_back(VAddr); 8184 Ops.push_back(Op.getOperand(ArgOffset + Intr->RsrcIndex)); 8185 if (BaseOpcode->Sampler) 8186 Ops.push_back(Op.getOperand(ArgOffset + Intr->SampIndex)); 8187 Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32)); 8188 if (IsGFX10Plus) 8189 Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32)); 8190 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA) 8191 Ops.push_back(Unorm); 8192 Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32)); 8193 Ops.push_back(IsA16 && // r128, a16 for gfx9 8194 ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False); 8195 if (IsGFX10Plus) 8196 Ops.push_back(IsA16 ? True : False); 8197 if (!Subtarget->hasGFX90AInsts()) { 8198 Ops.push_back(TFE); //tfe 8199 } else if (TFE->getAsZExtVal()) { 8200 report_fatal_error("TFE is not supported on this GPU"); 8201 } 8202 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA) 8203 Ops.push_back(LWE); // lwe 8204 if (!IsGFX10Plus) 8205 Ops.push_back(DimInfo->DA ? True : False); 8206 if (BaseOpcode->HasD16) 8207 Ops.push_back(IsD16 ? True : False); 8208 if (isa<MemSDNode>(Op)) 8209 Ops.push_back(Op.getOperand(0)); // chain 8210 8211 int NumVAddrDwords = 8212 UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32; 8213 int Opcode = -1; 8214 8215 if (IsGFX12Plus) { 8216 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12, 8217 NumVDataDwords, NumVAddrDwords); 8218 } else if (IsGFX11Plus) { 8219 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, 8220 UseNSA ? AMDGPU::MIMGEncGfx11NSA 8221 : AMDGPU::MIMGEncGfx11Default, 8222 NumVDataDwords, NumVAddrDwords); 8223 } else if (IsGFX10Plus) { 8224 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, 8225 UseNSA ? 
AMDGPU::MIMGEncGfx10NSA 8226 : AMDGPU::MIMGEncGfx10Default, 8227 NumVDataDwords, NumVAddrDwords); 8228 } else { 8229 if (Subtarget->hasGFX90AInsts()) { 8230 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a, 8231 NumVDataDwords, NumVAddrDwords); 8232 if (Opcode == -1) 8233 report_fatal_error( 8234 "requested image instruction is not supported on this GPU"); 8235 } 8236 if (Opcode == -1 && 8237 Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) 8238 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8, 8239 NumVDataDwords, NumVAddrDwords); 8240 if (Opcode == -1) 8241 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6, 8242 NumVDataDwords, NumVAddrDwords); 8243 } 8244 if (Opcode == -1) 8245 return Op; 8246 8247 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops); 8248 if (auto MemOp = dyn_cast<MemSDNode>(Op)) { 8249 MachineMemOperand *MemRef = MemOp->getMemOperand(); 8250 DAG.setNodeMemRefs(NewNode, {MemRef}); 8251 } 8252 8253 if (BaseOpcode->AtomicX2) { 8254 SmallVector<SDValue, 1> Elt; 8255 DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1); 8256 return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL); 8257 } 8258 if (BaseOpcode->NoReturn) 8259 return SDValue(NewNode, 0); 8260 return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail, 8261 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes, 8262 NumVDataDwords, IsAtomicPacked16Bit, DL); 8263 } 8264 8265 SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, 8266 SDValue Offset, SDValue CachePolicy, 8267 SelectionDAG &DAG) const { 8268 MachineFunction &MF = DAG.getMachineFunction(); 8269 8270 const DataLayout &DataLayout = DAG.getDataLayout(); 8271 Align Alignment = 8272 DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext())); 8273 8274 MachineMemOperand *MMO = MF.getMachineMemOperand( 8275 MachinePointerInfo(), 8276 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 8277 MachineMemOperand::MOInvariant, 8278 VT.getStoreSize(), Alignment); 8279 8280 if (!Offset->isDivergent()) { 8281 SDValue Ops[] = {Rsrc, Offset, CachePolicy}; 8282 8283 // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the 8284 // s_buffer_load_u16 instruction is emitted for both signed and unsigned 8285 // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext 8286 // and generates s_buffer_load_i16 (performSignExtendInRegCombine). 8287 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) { 8288 SDValue BufferLoad = 8289 DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_USHORT, DL, 8290 DAG.getVTList(MVT::i32), Ops, VT, MMO); 8291 return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad); 8292 } 8293 8294 // Widen vec3 load to vec4. 8295 if (VT.isVector() && VT.getVectorNumElements() == 3 && 8296 !Subtarget->hasScalarDwordx3Loads()) { 8297 EVT WidenedVT = 8298 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4); 8299 auto WidenedOp = DAG.getMemIntrinsicNode( 8300 AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT, 8301 MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize())); 8302 auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp, 8303 DAG.getVectorIdxConstant(0, DL)); 8304 return Subvector; 8305 } 8306 8307 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL, 8308 DAG.getVTList(VT), Ops, VT, MMO); 8309 } 8310 8311 // We have a divergent offset. Emit a MUBUF buffer load instead. We can 8312 // assume that the buffer is unswizzled. 
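// The three empty initializers below (voffset, soffset, offset) are filled in by
// setBufferOffsets before the load nodes are created.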
8313 SDValue Ops[] = { 8314 DAG.getEntryNode(), // Chain 8315 Rsrc, // rsrc 8316 DAG.getConstant(0, DL, MVT::i32), // vindex 8317 {}, // voffset 8318 {}, // soffset 8319 {}, // offset 8320 CachePolicy, // cachepolicy 8321 DAG.getTargetConstant(0, DL, MVT::i1), // idxen 8322 }; 8323 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) { 8324 setBufferOffsets(Offset, DAG, &Ops[3], Align(4)); 8325 return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO); 8326 } 8327 8328 SmallVector<SDValue, 4> Loads; 8329 unsigned NumLoads = 1; 8330 MVT LoadVT = VT.getSimpleVT(); 8331 unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1; 8332 assert((LoadVT.getScalarType() == MVT::i32 || 8333 LoadVT.getScalarType() == MVT::f32)); 8334 8335 if (NumElts == 8 || NumElts == 16) { 8336 NumLoads = NumElts / 4; 8337 LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4); 8338 } 8339 8340 SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue}); 8341 8342 // Use the alignment to ensure that the required offsets will fit into the 8343 // immediate offsets. 8344 setBufferOffsets(Offset, DAG, &Ops[3], 8345 NumLoads > 1 ? Align(16 * NumLoads) : Align(4)); 8346 8347 uint64_t InstOffset = Ops[5]->getAsZExtVal(); 8348 for (unsigned i = 0; i < NumLoads; ++i) { 8349 Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32); 8350 Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops, 8351 LoadVT, MMO, DAG)); 8352 } 8353 8354 if (NumElts == 8 || NumElts == 16) 8355 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads); 8356 8357 return Loads[0]; 8358 } 8359 8360 SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const { 8361 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25]. 8362 if (!Subtarget->hasArchitectedSGPRs()) 8363 return {}; 8364 SDLoc SL(Op); 8365 MVT VT = MVT::i32; 8366 SDValue TTMP8 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::TTMP8, VT); 8367 return DAG.getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8, 8368 DAG.getConstant(25, SL, VT), DAG.getConstant(5, SL, VT)); 8369 } 8370 8371 SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op, 8372 unsigned Dim, 8373 const ArgDescriptor &Arg) const { 8374 SDLoc SL(Op); 8375 MachineFunction &MF = DAG.getMachineFunction(); 8376 unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim); 8377 if (MaxID == 0) 8378 return DAG.getConstant(0, SL, MVT::i32); 8379 8380 SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32, 8381 SDLoc(DAG.getEntryNode()), Arg); 8382 8383 // Don't bother inserting AssertZext for packed IDs since we're emitting the 8384 // masking operations anyway. 8385 // 8386 // TODO: We could assert the top bit is 0 for the source copy. 8387 if (Arg.isMasked()) 8388 return Val; 8389 8390 // Preserve the known bits after expansion to a copy. 8391 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), llvm::bit_width(MaxID)); 8392 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val, 8393 DAG.getValueType(SmallVT)); 8394 } 8395 8396 SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, 8397 SelectionDAG &DAG) const { 8398 MachineFunction &MF = DAG.getMachineFunction(); 8399 auto MFI = MF.getInfo<SIMachineFunctionInfo>(); 8400 8401 EVT VT = Op.getValueType(); 8402 SDLoc DL(Op); 8403 unsigned IntrinsicID = Op.getConstantOperandVal(0); 8404 8405 // TODO: Should this propagate fast-math-flags? 
8406 8407 switch (IntrinsicID) { 8408 case Intrinsic::amdgcn_implicit_buffer_ptr: { 8409 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction())) 8410 return emitNonHSAIntrinsicError(DAG, DL, VT); 8411 return getPreloadedValue(DAG, *MFI, VT, 8412 AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 8413 } 8414 case Intrinsic::amdgcn_dispatch_ptr: 8415 case Intrinsic::amdgcn_queue_ptr: { 8416 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) { 8417 DiagnosticInfoUnsupported BadIntrin( 8418 MF.getFunction(), "unsupported hsa intrinsic without hsa target", 8419 DL.getDebugLoc()); 8420 DAG.getContext()->diagnose(BadIntrin); 8421 return DAG.getUNDEF(VT); 8422 } 8423 8424 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ? 8425 AMDGPUFunctionArgInfo::DISPATCH_PTR : AMDGPUFunctionArgInfo::QUEUE_PTR; 8426 return getPreloadedValue(DAG, *MFI, VT, RegID); 8427 } 8428 case Intrinsic::amdgcn_implicitarg_ptr: { 8429 if (MFI->isEntryFunction()) 8430 return getImplicitArgPtr(DAG, DL); 8431 return getPreloadedValue(DAG, *MFI, VT, 8432 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 8433 } 8434 case Intrinsic::amdgcn_kernarg_segment_ptr: { 8435 if (!AMDGPU::isKernel(MF.getFunction().getCallingConv())) { 8436 // This only makes sense to call in a kernel, so just lower to null. 8437 return DAG.getConstant(0, DL, VT); 8438 } 8439 8440 return getPreloadedValue(DAG, *MFI, VT, 8441 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 8442 } 8443 case Intrinsic::amdgcn_dispatch_id: { 8444 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID); 8445 } 8446 case Intrinsic::amdgcn_rcp: 8447 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1)); 8448 case Intrinsic::amdgcn_rsq: 8449 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); 8450 case Intrinsic::amdgcn_rsq_legacy: 8451 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) 8452 return emitRemovedIntrinsicError(DAG, DL, VT); 8453 return SDValue(); 8454 case Intrinsic::amdgcn_rcp_legacy: 8455 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) 8456 return emitRemovedIntrinsicError(DAG, DL, VT); 8457 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1)); 8458 case Intrinsic::amdgcn_rsq_clamp: { 8459 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 8460 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1)); 8461 8462 Type *Type = VT.getTypeForEVT(*DAG.getContext()); 8463 APFloat Max = APFloat::getLargest(Type->getFltSemantics()); 8464 APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true); 8465 8466 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); 8467 SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, 8468 DAG.getConstantFP(Max, DL, VT)); 8469 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp, 8470 DAG.getConstantFP(Min, DL, VT)); 8471 } 8472 case Intrinsic::r600_read_ngroups_x: 8473 if (Subtarget->isAmdHsaOS()) 8474 return emitNonHSAIntrinsicError(DAG, DL, VT); 8475 8476 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 8477 SI::KernelInputOffsets::NGROUPS_X, Align(4), 8478 false); 8479 case Intrinsic::r600_read_ngroups_y: 8480 if (Subtarget->isAmdHsaOS()) 8481 return emitNonHSAIntrinsicError(DAG, DL, VT); 8482 8483 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 8484 SI::KernelInputOffsets::NGROUPS_Y, Align(4), 8485 false); 8486 case Intrinsic::r600_read_ngroups_z: 8487 if (Subtarget->isAmdHsaOS()) 8488 return emitNonHSAIntrinsicError(DAG, DL, VT); 8489 8490 return lowerKernargMemParameter(DAG, 
VT, VT, DL, DAG.getEntryNode(), 8491 SI::KernelInputOffsets::NGROUPS_Z, Align(4), 8492 false); 8493 case Intrinsic::r600_read_global_size_x: 8494 if (Subtarget->isAmdHsaOS()) 8495 return emitNonHSAIntrinsicError(DAG, DL, VT); 8496 8497 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 8498 SI::KernelInputOffsets::GLOBAL_SIZE_X, 8499 Align(4), false); 8500 case Intrinsic::r600_read_global_size_y: 8501 if (Subtarget->isAmdHsaOS()) 8502 return emitNonHSAIntrinsicError(DAG, DL, VT); 8503 8504 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 8505 SI::KernelInputOffsets::GLOBAL_SIZE_Y, 8506 Align(4), false); 8507 case Intrinsic::r600_read_global_size_z: 8508 if (Subtarget->isAmdHsaOS()) 8509 return emitNonHSAIntrinsicError(DAG, DL, VT); 8510 8511 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 8512 SI::KernelInputOffsets::GLOBAL_SIZE_Z, 8513 Align(4), false); 8514 case Intrinsic::r600_read_local_size_x: 8515 if (Subtarget->isAmdHsaOS()) 8516 return emitNonHSAIntrinsicError(DAG, DL, VT); 8517 8518 return lowerImplicitZextParam(DAG, Op, MVT::i16, 8519 SI::KernelInputOffsets::LOCAL_SIZE_X); 8520 case Intrinsic::r600_read_local_size_y: 8521 if (Subtarget->isAmdHsaOS()) 8522 return emitNonHSAIntrinsicError(DAG, DL, VT); 8523 8524 return lowerImplicitZextParam(DAG, Op, MVT::i16, 8525 SI::KernelInputOffsets::LOCAL_SIZE_Y); 8526 case Intrinsic::r600_read_local_size_z: 8527 if (Subtarget->isAmdHsaOS()) 8528 return emitNonHSAIntrinsicError(DAG, DL, VT); 8529 8530 return lowerImplicitZextParam(DAG, Op, MVT::i16, 8531 SI::KernelInputOffsets::LOCAL_SIZE_Z); 8532 case Intrinsic::amdgcn_workgroup_id_x: 8533 return getPreloadedValue(DAG, *MFI, VT, 8534 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 8535 case Intrinsic::amdgcn_workgroup_id_y: 8536 return getPreloadedValue(DAG, *MFI, VT, 8537 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 8538 case Intrinsic::amdgcn_workgroup_id_z: 8539 return getPreloadedValue(DAG, *MFI, VT, 8540 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 8541 case Intrinsic::amdgcn_wave_id: 8542 return lowerWaveID(DAG, Op); 8543 case Intrinsic::amdgcn_lds_kernel_id: { 8544 if (MFI->isEntryFunction()) 8545 return getLDSKernelId(DAG, DL); 8546 return getPreloadedValue(DAG, *MFI, VT, 8547 AMDGPUFunctionArgInfo::LDS_KERNEL_ID); 8548 } 8549 case Intrinsic::amdgcn_workitem_id_x: 8550 return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX); 8551 case Intrinsic::amdgcn_workitem_id_y: 8552 return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY); 8553 case Intrinsic::amdgcn_workitem_id_z: 8554 return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ); 8555 case Intrinsic::amdgcn_wavefrontsize: 8556 return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(), 8557 SDLoc(Op), MVT::i32); 8558 case Intrinsic::amdgcn_s_buffer_load: { 8559 unsigned CPol = Op.getConstantOperandVal(3); 8560 // s_buffer_load, because of how it's optimized, can't be volatile 8561 // so reject ones with the volatile bit set. 8562 if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12) 8563 ? 
AMDGPU::CPol::ALL 8564 : AMDGPU::CPol::ALL_pregfx12)) 8565 return Op; 8566 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3), 8567 DAG); 8568 } 8569 case Intrinsic::amdgcn_fdiv_fast: 8570 return lowerFDIV_FAST(Op, DAG); 8571 case Intrinsic::amdgcn_sin: 8572 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1)); 8573 8574 case Intrinsic::amdgcn_cos: 8575 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1)); 8576 8577 case Intrinsic::amdgcn_mul_u24: 8578 return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1), Op.getOperand(2)); 8579 case Intrinsic::amdgcn_mul_i24: 8580 return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1), Op.getOperand(2)); 8581 8582 case Intrinsic::amdgcn_log_clamp: { 8583 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 8584 return SDValue(); 8585 8586 return emitRemovedIntrinsicError(DAG, DL, VT); 8587 } 8588 case Intrinsic::amdgcn_fract: 8589 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1)); 8590 8591 case Intrinsic::amdgcn_class: 8592 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, 8593 Op.getOperand(1), Op.getOperand(2)); 8594 case Intrinsic::amdgcn_div_fmas: 8595 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, 8596 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3), 8597 Op.getOperand(4)); 8598 8599 case Intrinsic::amdgcn_div_fixup: 8600 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, 8601 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); 8602 8603 case Intrinsic::amdgcn_div_scale: { 8604 const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3)); 8605 8606 // Translate to the operands expected by the machine instruction. The 8607 // first parameter must be the same as the first instruction. 8608 SDValue Numerator = Op.getOperand(1); 8609 SDValue Denominator = Op.getOperand(2); 8610 8611 // Note this order is opposite of the machine instruction's operations, 8612 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The 8613 // intrinsic has the numerator as the first operand to match a normal 8614 // division operation. 8615 8616 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator; 8617 8618 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0, 8619 Denominator, Numerator); 8620 } 8621 case Intrinsic::amdgcn_icmp: { 8622 // There is a Pat that handles this variant, so return it as-is. 
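// (The variant in question is llvm.amdgcn.icmp with an i1 operand, a zero second
// operand, and an ICMP_NE predicate, i.e. a plain lane-mask query of the i1 value,
// which the condition below detects.)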
8623 if (Op.getOperand(1).getValueType() == MVT::i1 && 8624 Op.getConstantOperandVal(2) == 0 && 8625 Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE) 8626 return Op; 8627 return lowerICMPIntrinsic(*this, Op.getNode(), DAG); 8628 } 8629 case Intrinsic::amdgcn_fcmp: { 8630 return lowerFCMPIntrinsic(*this, Op.getNode(), DAG); 8631 } 8632 case Intrinsic::amdgcn_ballot: 8633 return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG); 8634 case Intrinsic::amdgcn_fmed3: 8635 return DAG.getNode(AMDGPUISD::FMED3, DL, VT, 8636 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); 8637 case Intrinsic::amdgcn_fdot2: 8638 return DAG.getNode(AMDGPUISD::FDOT2, DL, VT, 8639 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3), 8640 Op.getOperand(4)); 8641 case Intrinsic::amdgcn_fmul_legacy: 8642 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, 8643 Op.getOperand(1), Op.getOperand(2)); 8644 case Intrinsic::amdgcn_sffbh: 8645 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1)); 8646 case Intrinsic::amdgcn_sbfe: 8647 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, 8648 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); 8649 case Intrinsic::amdgcn_ubfe: 8650 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, 8651 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); 8652 case Intrinsic::amdgcn_cvt_pkrtz: 8653 case Intrinsic::amdgcn_cvt_pknorm_i16: 8654 case Intrinsic::amdgcn_cvt_pknorm_u16: 8655 case Intrinsic::amdgcn_cvt_pk_i16: 8656 case Intrinsic::amdgcn_cvt_pk_u16: { 8657 // FIXME: Stop adding cast if v2f16/v2i16 are legal. 8658 EVT VT = Op.getValueType(); 8659 unsigned Opcode; 8660 8661 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz) 8662 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32; 8663 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16) 8664 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32; 8665 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16) 8666 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32; 8667 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16) 8668 Opcode = AMDGPUISD::CVT_PK_I16_I32; 8669 else 8670 Opcode = AMDGPUISD::CVT_PK_U16_U32; 8671 8672 if (isTypeLegal(VT)) 8673 return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2)); 8674 8675 SDValue Node = DAG.getNode(Opcode, DL, MVT::i32, 8676 Op.getOperand(1), Op.getOperand(2)); 8677 return DAG.getNode(ISD::BITCAST, DL, VT, Node); 8678 } 8679 case Intrinsic::amdgcn_fmad_ftz: 8680 return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1), 8681 Op.getOperand(2), Op.getOperand(3)); 8682 8683 case Intrinsic::amdgcn_if_break: 8684 return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT, 8685 Op->getOperand(1), Op->getOperand(2)), 0); 8686 8687 case Intrinsic::amdgcn_groupstaticsize: { 8688 Triple::OSType OS = getTargetMachine().getTargetTriple().getOS(); 8689 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) 8690 return Op; 8691 8692 const Module *M = MF.getFunction().getParent(); 8693 const GlobalValue *GV = 8694 M->getNamedValue(Intrinsic::getName(Intrinsic::amdgcn_groupstaticsize)); 8695 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0, 8696 SIInstrInfo::MO_ABS32_LO); 8697 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0}; 8698 } 8699 case Intrinsic::amdgcn_is_shared: 8700 case Intrinsic::amdgcn_is_private: { 8701 SDLoc SL(Op); 8702 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared) ? 
8703 AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS; 8704 SDValue Aperture = getSegmentAperture(AS, SL, DAG); 8705 SDValue SrcVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, 8706 Op.getOperand(1)); 8707 8708 SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec, 8709 DAG.getConstant(1, SL, MVT::i32)); 8710 return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ); 8711 } 8712 case Intrinsic::amdgcn_perm: 8713 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1), 8714 Op.getOperand(2), Op.getOperand(3)); 8715 case Intrinsic::amdgcn_reloc_constant: { 8716 Module *M = const_cast<Module *>(MF.getFunction().getParent()); 8717 const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD(); 8718 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString(); 8719 auto RelocSymbol = cast<GlobalVariable>( 8720 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext()))); 8721 SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0, 8722 SIInstrInfo::MO_ABS32_LO); 8723 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0}; 8724 } 8725 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16: 8726 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16: 8727 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16: 8728 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16: 8729 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8: 8730 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8: 8731 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8: 8732 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: { 8733 if (Op.getOperand(4).getValueType() == MVT::i32) 8734 return SDValue(); 8735 8736 SDLoc SL(Op); 8737 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32); 8738 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(), 8739 Op.getOperand(0), Op.getOperand(1), Op.getOperand(2), 8740 Op.getOperand(3), IndexKeyi32); 8741 } 8742 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4: 8743 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8: 8744 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: { 8745 if (Op.getOperand(6).getValueType() == MVT::i32) 8746 return SDValue(); 8747 8748 SDLoc SL(Op); 8749 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32); 8750 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(), 8751 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2), 8752 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5), 8753 IndexKeyi32, Op.getOperand(7)}); 8754 } 8755 case Intrinsic::amdgcn_addrspacecast_nonnull: 8756 return lowerADDRSPACECAST(Op, DAG); 8757 case Intrinsic::amdgcn_readlane: 8758 case Intrinsic::amdgcn_readfirstlane: 8759 case Intrinsic::amdgcn_writelane: 8760 case Intrinsic::amdgcn_permlane16: 8761 case Intrinsic::amdgcn_permlanex16: 8762 case Intrinsic::amdgcn_permlane64: 8763 return lowerLaneOp(*this, Op.getNode(), DAG); 8764 default: 8765 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 8766 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID)) 8767 return lowerImage(Op, ImageDimIntr, DAG, false); 8768 8769 return Op; 8770 } 8771 } 8772 8773 // On targets not supporting constant in soffset field, turn zero to 8774 // SGPR_NULL to avoid generating an extra s_mov with zero. 
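// For example (on such a target), a buffer intrinsic invoked with an soffset of
// zero selects the SGPR_NULL register, which reads as zero, instead of
// materializing the constant with an s_mov_b32.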
8775 static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, 8776 const GCNSubtarget *Subtarget) { 8777 if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset)) 8778 return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32); 8779 return SOffset; 8780 } 8781 8782 SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op, 8783 SelectionDAG &DAG, 8784 unsigned NewOpcode) const { 8785 SDLoc DL(Op); 8786 8787 SDValue VData = Op.getOperand(2); 8788 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG); 8789 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG); 8790 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget); 8791 SDValue Ops[] = { 8792 Op.getOperand(0), // Chain 8793 VData, // vdata 8794 Rsrc, // rsrc 8795 DAG.getConstant(0, DL, MVT::i32), // vindex 8796 Offsets.first, // voffset 8797 SOffset, // soffset 8798 Offsets.second, // offset 8799 Op.getOperand(6), // cachepolicy 8800 DAG.getTargetConstant(0, DL, MVT::i1), // idxen 8801 }; 8802 8803 auto *M = cast<MemSDNode>(Op); 8804 8805 EVT MemVT = VData.getValueType(); 8806 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT, 8807 M->getMemOperand()); 8808 } 8809 8810 SDValue 8811 SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG, 8812 unsigned NewOpcode) const { 8813 SDLoc DL(Op); 8814 8815 SDValue VData = Op.getOperand(2); 8816 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG); 8817 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG); 8818 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget); 8819 SDValue Ops[] = { 8820 Op.getOperand(0), // Chain 8821 VData, // vdata 8822 Rsrc, // rsrc 8823 Op.getOperand(4), // vindex 8824 Offsets.first, // voffset 8825 SOffset, // soffset 8826 Offsets.second, // offset 8827 Op.getOperand(7), // cachepolicy 8828 DAG.getTargetConstant(1, DL, MVT::i1), // idxen 8829 }; 8830 8831 auto *M = cast<MemSDNode>(Op); 8832 8833 EVT MemVT = VData.getValueType(); 8834 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT, 8835 M->getMemOperand()); 8836 } 8837 8838 SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, 8839 SelectionDAG &DAG) const { 8840 unsigned IntrID = Op.getConstantOperandVal(1); 8841 SDLoc DL(Op); 8842 8843 switch (IntrID) { 8844 case Intrinsic::amdgcn_ds_ordered_add: 8845 case Intrinsic::amdgcn_ds_ordered_swap: { 8846 MemSDNode *M = cast<MemSDNode>(Op); 8847 SDValue Chain = M->getOperand(0); 8848 SDValue M0 = M->getOperand(2); 8849 SDValue Value = M->getOperand(3); 8850 unsigned IndexOperand = M->getConstantOperandVal(7); 8851 unsigned WaveRelease = M->getConstantOperandVal(8); 8852 unsigned WaveDone = M->getConstantOperandVal(9); 8853 8854 unsigned OrderedCountIndex = IndexOperand & 0x3f; 8855 IndexOperand &= ~0x3f; 8856 unsigned CountDw = 0; 8857 8858 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) { 8859 CountDw = (IndexOperand >> 24) & 0xf; 8860 IndexOperand &= ~(0xf << 24); 8861 8862 if (CountDw < 1 || CountDw > 4) { 8863 report_fatal_error( 8864 "ds_ordered_count: dword count must be between 1 and 4"); 8865 } 8866 } 8867 8868 if (IndexOperand) 8869 report_fatal_error("ds_ordered_count: bad index operand"); 8870 8871 if (WaveDone && !WaveRelease) 8872 report_fatal_error("ds_ordered_count: wave_done requires wave_release"); 8873 8874 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 
0 : 1; 8875 unsigned ShaderType = 8876 SIInstrInfo::getDSShaderTypeValue(DAG.getMachineFunction()); 8877 unsigned Offset0 = OrderedCountIndex << 2; 8878 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4); 8879 8880 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) 8881 Offset1 |= (CountDw - 1) << 6; 8882 8883 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11) 8884 Offset1 |= ShaderType << 2; 8885 8886 unsigned Offset = Offset0 | (Offset1 << 8); 8887 8888 SDValue Ops[] = { 8889 Chain, 8890 Value, 8891 DAG.getTargetConstant(Offset, DL, MVT::i16), 8892 copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue 8893 }; 8894 return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL, 8895 M->getVTList(), Ops, M->getMemoryVT(), 8896 M->getMemOperand()); 8897 } 8898 case Intrinsic::amdgcn_raw_buffer_load: 8899 case Intrinsic::amdgcn_raw_ptr_buffer_load: 8900 case Intrinsic::amdgcn_raw_atomic_buffer_load: 8901 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load: 8902 case Intrinsic::amdgcn_raw_buffer_load_format: 8903 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: { 8904 const bool IsFormat = 8905 IntrID == Intrinsic::amdgcn_raw_buffer_load_format || 8906 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format; 8907 8908 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG); 8909 auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG); 8910 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget); 8911 SDValue Ops[] = { 8912 Op.getOperand(0), // Chain 8913 Rsrc, // rsrc 8914 DAG.getConstant(0, DL, MVT::i32), // vindex 8915 Offsets.first, // voffset 8916 SOffset, // soffset 8917 Offsets.second, // offset 8918 Op.getOperand(5), // cachepolicy, swizzled buffer 8919 DAG.getTargetConstant(0, DL, MVT::i1), // idxen 8920 }; 8921 8922 auto *M = cast<MemSDNode>(Op); 8923 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops); 8924 } 8925 case Intrinsic::amdgcn_struct_buffer_load: 8926 case Intrinsic::amdgcn_struct_ptr_buffer_load: 8927 case Intrinsic::amdgcn_struct_buffer_load_format: 8928 case Intrinsic::amdgcn_struct_ptr_buffer_load_format: { 8929 const bool IsFormat = 8930 IntrID == Intrinsic::amdgcn_struct_buffer_load_format || 8931 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format; 8932 8933 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG); 8934 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG); 8935 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget); 8936 SDValue Ops[] = { 8937 Op.getOperand(0), // Chain 8938 Rsrc, // rsrc 8939 Op.getOperand(3), // vindex 8940 Offsets.first, // voffset 8941 SOffset, // soffset 8942 Offsets.second, // offset 8943 Op.getOperand(6), // cachepolicy, swizzled buffer 8944 DAG.getTargetConstant(1, DL, MVT::i1), // idxen 8945 }; 8946 8947 return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops); 8948 } 8949 case Intrinsic::amdgcn_raw_tbuffer_load: 8950 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: { 8951 MemSDNode *M = cast<MemSDNode>(Op); 8952 EVT LoadVT = Op.getValueType(); 8953 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG); 8954 auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG); 8955 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget); 8956 8957 SDValue Ops[] = { 8958 Op.getOperand(0), // Chain 8959 Rsrc, // rsrc 8960 DAG.getConstant(0, DL, MVT::i32), // vindex 8961 Offsets.first, // voffset 8962 SOffset, // soffset 8963 Offsets.second, // offset 8964 Op.getOperand(5), // format 8965 Op.getOperand(6), // cachepolicy, swizzled buffer 8966 
DAG.getTargetConstant(0, DL, MVT::i1), // idxen 8967 }; 8968 8969 if (LoadVT.getScalarType() == MVT::f16) 8970 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, 8971 M, DAG, Ops); 8972 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL, 8973 Op->getVTList(), Ops, LoadVT, M->getMemOperand(), 8974 DAG); 8975 } 8976 case Intrinsic::amdgcn_struct_tbuffer_load: 8977 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: { 8978 MemSDNode *M = cast<MemSDNode>(Op); 8979 EVT LoadVT = Op.getValueType(); 8980 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG); 8981 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG); 8982 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget); 8983 8984 SDValue Ops[] = { 8985 Op.getOperand(0), // Chain 8986 Rsrc, // rsrc 8987 Op.getOperand(3), // vindex 8988 Offsets.first, // voffset 8989 SOffset, // soffset 8990 Offsets.second, // offset 8991 Op.getOperand(6), // format 8992 Op.getOperand(7), // cachepolicy, swizzled buffer 8993 DAG.getTargetConstant(1, DL, MVT::i1), // idxen 8994 }; 8995 8996 if (LoadVT.getScalarType() == MVT::f16) 8997 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, 8998 M, DAG, Ops); 8999 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL, 9000 Op->getVTList(), Ops, LoadVT, M->getMemOperand(), 9001 DAG); 9002 } 9003 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 9004 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd: 9005 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD); 9006 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 9007 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd: 9008 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD); 9009 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 9010 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin: 9011 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN); 9012 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 9013 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin: 9014 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN); 9015 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 9016 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax: 9017 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX); 9018 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 9019 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax: 9020 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX); 9021 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 9022 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap: 9023 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP); 9024 case Intrinsic::amdgcn_raw_buffer_atomic_add: 9025 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add: 9026 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD); 9027 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 9028 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub: 9029 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB); 9030 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 9031 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin: 9032 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN); 9033 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 9034 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin: 9035 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN); 9036 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 9037 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax: 9038 return 
lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX); 9039 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 9040 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax: 9041 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX); 9042 case Intrinsic::amdgcn_raw_buffer_atomic_and: 9043 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and: 9044 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND); 9045 case Intrinsic::amdgcn_raw_buffer_atomic_or: 9046 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or: 9047 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR); 9048 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 9049 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor: 9050 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR); 9051 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 9052 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc: 9053 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC); 9054 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 9055 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec: 9056 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC); 9057 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32: 9058 return lowerRawBufferAtomicIntrin(Op, DAG, 9059 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32); 9060 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 9061 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap: 9062 return lowerStructBufferAtomicIntrin(Op, DAG, 9063 AMDGPUISD::BUFFER_ATOMIC_SWAP); 9064 case Intrinsic::amdgcn_struct_buffer_atomic_add: 9065 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add: 9066 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD); 9067 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 9068 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub: 9069 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB); 9070 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 9071 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin: 9072 return lowerStructBufferAtomicIntrin(Op, DAG, 9073 AMDGPUISD::BUFFER_ATOMIC_SMIN); 9074 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 9075 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin: 9076 return lowerStructBufferAtomicIntrin(Op, DAG, 9077 AMDGPUISD::BUFFER_ATOMIC_UMIN); 9078 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 9079 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax: 9080 return lowerStructBufferAtomicIntrin(Op, DAG, 9081 AMDGPUISD::BUFFER_ATOMIC_SMAX); 9082 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 9083 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax: 9084 return lowerStructBufferAtomicIntrin(Op, DAG, 9085 AMDGPUISD::BUFFER_ATOMIC_UMAX); 9086 case Intrinsic::amdgcn_struct_buffer_atomic_and: 9087 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and: 9088 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND); 9089 case Intrinsic::amdgcn_struct_buffer_atomic_or: 9090 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or: 9091 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR); 9092 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 9093 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor: 9094 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR); 9095 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 9096 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc: 9097 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC); 9098 case 
Intrinsic::amdgcn_struct_buffer_atomic_dec: 9099 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec: 9100 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC); 9101 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32: 9102 return lowerStructBufferAtomicIntrin(Op, DAG, 9103 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32); 9104 9105 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 9106 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: { 9107 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG); 9108 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG); 9109 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget); 9110 SDValue Ops[] = { 9111 Op.getOperand(0), // Chain 9112 Op.getOperand(2), // src 9113 Op.getOperand(3), // cmp 9114 Rsrc, // rsrc 9115 DAG.getConstant(0, DL, MVT::i32), // vindex 9116 Offsets.first, // voffset 9117 SOffset, // soffset 9118 Offsets.second, // offset 9119 Op.getOperand(7), // cachepolicy 9120 DAG.getTargetConstant(0, DL, MVT::i1), // idxen 9121 }; 9122 EVT VT = Op.getValueType(); 9123 auto *M = cast<MemSDNode>(Op); 9124 9125 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL, 9126 Op->getVTList(), Ops, VT, M->getMemOperand()); 9127 } 9128 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 9129 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: { 9130 SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG); 9131 auto Offsets = splitBufferOffsets(Op.getOperand(6), DAG); 9132 auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget); 9133 SDValue Ops[] = { 9134 Op.getOperand(0), // Chain 9135 Op.getOperand(2), // src 9136 Op.getOperand(3), // cmp 9137 Rsrc, // rsrc 9138 Op.getOperand(5), // vindex 9139 Offsets.first, // voffset 9140 SOffset, // soffset 9141 Offsets.second, // offset 9142 Op.getOperand(8), // cachepolicy 9143 DAG.getTargetConstant(1, DL, MVT::i1), // idxen 9144 }; 9145 EVT VT = Op.getValueType(); 9146 auto *M = cast<MemSDNode>(Op); 9147 9148 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL, 9149 Op->getVTList(), Ops, VT, M->getMemOperand()); 9150 } 9151 case Intrinsic::amdgcn_image_bvh_intersect_ray: { 9152 MemSDNode *M = cast<MemSDNode>(Op); 9153 SDValue NodePtr = M->getOperand(2); 9154 SDValue RayExtent = M->getOperand(3); 9155 SDValue RayOrigin = M->getOperand(4); 9156 SDValue RayDir = M->getOperand(5); 9157 SDValue RayInvDir = M->getOperand(6); 9158 SDValue TDescr = M->getOperand(7); 9159 9160 assert(NodePtr.getValueType() == MVT::i32 || 9161 NodePtr.getValueType() == MVT::i64); 9162 assert(RayDir.getValueType() == MVT::v3f16 || 9163 RayDir.getValueType() == MVT::v3f32); 9164 9165 if (!Subtarget->hasGFX10_AEncoding()) { 9166 emitRemovedIntrinsicError(DAG, DL, Op.getValueType()); 9167 return SDValue(); 9168 } 9169 9170 const bool IsGFX11 = AMDGPU::isGFX11(*Subtarget); 9171 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget); 9172 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget); 9173 const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16; 9174 const bool Is64 = NodePtr.getValueType() == MVT::i64; 9175 const unsigned NumVDataDwords = 4; 9176 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11); 9177 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 
4 : 5) : NumVAddrDwords; 9178 const bool UseNSA = (Subtarget->hasNSAEncoding() && 9179 NumVAddrs <= Subtarget->getNSAMaxSize()) || 9180 IsGFX12Plus; 9181 const unsigned BaseOpcodes[2][2] = { 9182 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16}, 9183 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY, 9184 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}}; 9185 int Opcode; 9186 if (UseNSA) { 9187 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], 9188 IsGFX12Plus ? AMDGPU::MIMGEncGfx12 9189 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA 9190 : AMDGPU::MIMGEncGfx10NSA, 9191 NumVDataDwords, NumVAddrDwords); 9192 } else { 9193 assert(!IsGFX12Plus); 9194 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], 9195 IsGFX11 ? AMDGPU::MIMGEncGfx11Default 9196 : AMDGPU::MIMGEncGfx10Default, 9197 NumVDataDwords, NumVAddrDwords); 9198 } 9199 assert(Opcode != -1); 9200 9201 SmallVector<SDValue, 16> Ops; 9202 9203 auto packLanes = [&DAG, &Ops, &DL] (SDValue Op, bool IsAligned) { 9204 SmallVector<SDValue, 3> Lanes; 9205 DAG.ExtractVectorElements(Op, Lanes, 0, 3); 9206 if (Lanes[0].getValueSizeInBits() == 32) { 9207 for (unsigned I = 0; I < 3; ++I) 9208 Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I])); 9209 } else { 9210 if (IsAligned) { 9211 Ops.push_back( 9212 DAG.getBitcast(MVT::i32, 9213 DAG.getBuildVector(MVT::v2f16, DL, 9214 { Lanes[0], Lanes[1] }))); 9215 Ops.push_back(Lanes[2]); 9216 } else { 9217 SDValue Elt0 = Ops.pop_back_val(); 9218 Ops.push_back( 9219 DAG.getBitcast(MVT::i32, 9220 DAG.getBuildVector(MVT::v2f16, DL, 9221 { Elt0, Lanes[0] }))); 9222 Ops.push_back( 9223 DAG.getBitcast(MVT::i32, 9224 DAG.getBuildVector(MVT::v2f16, DL, 9225 { Lanes[1], Lanes[2] }))); 9226 } 9227 } 9228 }; 9229 9230 if (UseNSA && IsGFX11Plus) { 9231 Ops.push_back(NodePtr); 9232 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent)); 9233 Ops.push_back(RayOrigin); 9234 if (IsA16) { 9235 SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes; 9236 DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3); 9237 DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3); 9238 for (unsigned I = 0; I < 3; ++I) { 9239 MergedLanes.push_back(DAG.getBitcast( 9240 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL, 9241 {DirLanes[I], InvDirLanes[I]}))); 9242 } 9243 Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes)); 9244 } else { 9245 Ops.push_back(RayDir); 9246 Ops.push_back(RayInvDir); 9247 } 9248 } else { 9249 if (Is64) 9250 DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0, 9251 2); 9252 else 9253 Ops.push_back(NodePtr); 9254 9255 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent)); 9256 packLanes(RayOrigin, true); 9257 packLanes(RayDir, true); 9258 packLanes(RayInvDir, false); 9259 } 9260 9261 if (!UseNSA) { 9262 // Build a single vector containing all the operands so far prepared. 
9263 if (NumVAddrDwords > 12) { 9264 SDValue Undef = DAG.getUNDEF(MVT::i32); 9265 Ops.append(16 - Ops.size(), Undef); 9266 } 9267 assert(Ops.size() >= 8 && Ops.size() <= 12); 9268 SDValue MergedOps = DAG.getBuildVector( 9269 MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops); 9270 Ops.clear(); 9271 Ops.push_back(MergedOps); 9272 } 9273 9274 Ops.push_back(TDescr); 9275 Ops.push_back(DAG.getTargetConstant(IsA16, DL, MVT::i1)); 9276 Ops.push_back(M->getChain()); 9277 9278 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops); 9279 MachineMemOperand *MemRef = M->getMemOperand(); 9280 DAG.setNodeMemRefs(NewNode, {MemRef}); 9281 return SDValue(NewNode, 0); 9282 } 9283 case Intrinsic::amdgcn_global_atomic_fmin: 9284 case Intrinsic::amdgcn_global_atomic_fmax: 9285 case Intrinsic::amdgcn_global_atomic_fmin_num: 9286 case Intrinsic::amdgcn_global_atomic_fmax_num: 9287 case Intrinsic::amdgcn_flat_atomic_fmin: 9288 case Intrinsic::amdgcn_flat_atomic_fmax: 9289 case Intrinsic::amdgcn_flat_atomic_fmin_num: 9290 case Intrinsic::amdgcn_flat_atomic_fmax_num: { 9291 MemSDNode *M = cast<MemSDNode>(Op); 9292 SDValue Ops[] = { 9293 M->getOperand(0), // Chain 9294 M->getOperand(2), // Ptr 9295 M->getOperand(3) // Value 9296 }; 9297 unsigned Opcode = 0; 9298 switch (IntrID) { 9299 case Intrinsic::amdgcn_global_atomic_fmin: 9300 case Intrinsic::amdgcn_global_atomic_fmin_num: 9301 case Intrinsic::amdgcn_flat_atomic_fmin: 9302 case Intrinsic::amdgcn_flat_atomic_fmin_num: { 9303 Opcode = ISD::ATOMIC_LOAD_FMIN; 9304 break; 9305 } 9306 case Intrinsic::amdgcn_global_atomic_fmax: 9307 case Intrinsic::amdgcn_global_atomic_fmax_num: 9308 case Intrinsic::amdgcn_flat_atomic_fmax: 9309 case Intrinsic::amdgcn_flat_atomic_fmax_num: { 9310 Opcode = ISD::ATOMIC_LOAD_FMAX; 9311 break; 9312 } 9313 default: 9314 llvm_unreachable("unhandled atomic opcode"); 9315 } 9316 return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(), 9317 Ops, M->getMemOperand()); 9318 } 9319 case Intrinsic::amdgcn_s_get_barrier_state: { 9320 SDValue Chain = Op->getOperand(0); 9321 SmallVector<SDValue, 2> Ops; 9322 unsigned Opc; 9323 bool IsInlinableBarID = false; 9324 int64_t BarID; 9325 9326 if (isa<ConstantSDNode>(Op->getOperand(2))) { 9327 BarID = cast<ConstantSDNode>(Op->getOperand(2))->getSExtValue(); 9328 IsInlinableBarID = AMDGPU::isInlinableIntLiteral(BarID); 9329 } 9330 9331 if (IsInlinableBarID) { 9332 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM; 9333 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32); 9334 Ops.push_back(K); 9335 } else { 9336 Opc = AMDGPU::S_GET_BARRIER_STATE_M0; 9337 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(2)); 9338 Ops.push_back(M0Val.getValue(0)); 9339 } 9340 9341 auto NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops); 9342 return SDValue(NewMI, 0); 9343 } 9344 default: 9345 9346 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 9347 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 9348 return lowerImage(Op, ImageDimIntr, DAG, true); 9349 9350 return SDValue(); 9351 } 9352 } 9353 9354 // Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to 9355 // dwordx4 if on SI and handle TFE loads. 
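// For TFE the result list carries three values (data, status, chain): an extra
// status dword is loaded together with the data, and the widened i32 result is
// split back into the original value type plus the status word below.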
SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
                                              SDVTList VTList,
                                              ArrayRef<SDValue> Ops, EVT MemVT,
                                              MachineMemOperand *MMO,
                                              SelectionDAG &DAG) const {
  LLVMContext &C = *DAG.getContext();
  MachineFunction &MF = DAG.getMachineFunction();
  EVT VT = VTList.VTs[0];

  assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
  bool IsTFE = VTList.NumVTs == 3;
  if (IsTFE) {
    unsigned NumValueDWords = divideCeil(VT.getSizeInBits(), 32);
    unsigned NumOpDWords = NumValueDWords + 1;
    EVT OpDWordsVT = EVT::getVectorVT(C, MVT::i32, NumOpDWords);
    SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
    MachineMemOperand *OpDWordsMMO =
        MF.getMachineMemOperand(MMO, 0, NumOpDWords * 4);
    SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
                                     OpDWordsVT, OpDWordsMMO, DAG);
    SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
                                 DAG.getVectorIdxConstant(NumValueDWords, DL));
    SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
    SDValue ValueDWords =
        NumValueDWords == 1
            ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, ZeroIdx)
            : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
                          EVT::getVectorVT(C, MVT::i32, NumValueDWords), Op,
                          ZeroIdx);
    SDValue Value = DAG.getNode(ISD::BITCAST, DL, VT, ValueDWords);
    return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
  }

  if (!Subtarget->hasDwordx3LoadStores() &&
      (VT == MVT::v3i32 || VT == MVT::v3f32)) {
    EVT WidenedVT = EVT::getVectorVT(C, VT.getVectorElementType(), 4);
    EVT WidenedMemVT = EVT::getVectorVT(C, MemVT.getVectorElementType(), 4);
    MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, 0, 16);
    SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
    SDValue Op = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
                                         WidenedMemVT, WidenedMMO);
    SDValue Value = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Op,
                                DAG.getVectorIdxConstant(0, DL));
    return DAG.getMergeValues({Value, SDValue(Op.getNode(), 1)}, DL);
  }

  return DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops, MemVT, MMO);
}

SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
                                         bool ImageStore) const {
  EVT StoreVT = VData.getValueType();

  // No change for f16 and legal vector D16 types.
  if (!StoreVT.isVector())
    return VData;

  SDLoc DL(VData);
  unsigned NumElements = StoreVT.getVectorNumElements();

  if (Subtarget->hasUnpackedD16VMem()) {
    // We need to unpack the packed data to store.
    EVT IntStoreVT = StoreVT.changeTypeToInteger();
    SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);

    EVT EquivStoreVT =
        EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
    SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
    return DAG.UnrollVectorOp(ZExt.getNode());
  }

  // The sq block of gfx8.1 does not estimate register use correctly for d16
  // image store instructions. The data operand is computed as if it were not a
  // d16 image instruction.
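  // To work around this, the store data is rebuilt below: it is bitcast to i16
  // elements, pairs of elements are repacked into i32 values (via v2i16), an
  // odd trailing element is paired with undef, and the result is padded with
  // undef i32s back out to the original element count.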
9430 if (ImageStore && Subtarget->hasImageStoreD16Bug()) { 9431 // Bitcast to i16 9432 EVT IntStoreVT = StoreVT.changeTypeToInteger(); 9433 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData); 9434 9435 // Decompose into scalars 9436 SmallVector<SDValue, 4> Elts; 9437 DAG.ExtractVectorElements(IntVData, Elts); 9438 9439 // Group pairs of i16 into v2i16 and bitcast to i32 9440 SmallVector<SDValue, 4> PackedElts; 9441 for (unsigned I = 0; I < Elts.size() / 2; I += 1) { 9442 SDValue Pair = 9443 DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]}); 9444 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair); 9445 PackedElts.push_back(IntPair); 9446 } 9447 if ((NumElements % 2) == 1) { 9448 // Handle v3i16 9449 unsigned I = Elts.size() / 2; 9450 SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL, 9451 {Elts[I * 2], DAG.getUNDEF(MVT::i16)}); 9452 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair); 9453 PackedElts.push_back(IntPair); 9454 } 9455 9456 // Pad using UNDEF 9457 PackedElts.resize(Elts.size(), DAG.getUNDEF(MVT::i32)); 9458 9459 // Build final vector 9460 EVT VecVT = 9461 EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size()); 9462 return DAG.getBuildVector(VecVT, DL, PackedElts); 9463 } 9464 9465 if (NumElements == 3) { 9466 EVT IntStoreVT = 9467 EVT::getIntegerVT(*DAG.getContext(), StoreVT.getStoreSizeInBits()); 9468 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData); 9469 9470 EVT WidenedStoreVT = EVT::getVectorVT( 9471 *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1); 9472 EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(), 9473 WidenedStoreVT.getStoreSizeInBits()); 9474 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData); 9475 return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt); 9476 } 9477 9478 assert(isTypeLegal(StoreVT)); 9479 return VData; 9480 } 9481 9482 SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, 9483 SelectionDAG &DAG) const { 9484 SDLoc DL(Op); 9485 SDValue Chain = Op.getOperand(0); 9486 unsigned IntrinsicID = Op.getConstantOperandVal(1); 9487 MachineFunction &MF = DAG.getMachineFunction(); 9488 9489 switch (IntrinsicID) { 9490 case Intrinsic::amdgcn_exp_compr: { 9491 if (!Subtarget->hasCompressedExport()) { 9492 DiagnosticInfoUnsupported BadIntrin( 9493 DAG.getMachineFunction().getFunction(), 9494 "intrinsic not supported on subtarget", DL.getDebugLoc()); 9495 DAG.getContext()->diagnose(BadIntrin); 9496 } 9497 SDValue Src0 = Op.getOperand(4); 9498 SDValue Src1 = Op.getOperand(5); 9499 // Hack around illegal type on SI by directly selecting it. 9500 if (isTypeLegal(Src0.getValueType())) 9501 return SDValue(); 9502 9503 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6)); 9504 SDValue Undef = DAG.getUNDEF(MVT::f32); 9505 const SDValue Ops[] = { 9506 Op.getOperand(2), // tgt 9507 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0 9508 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1 9509 Undef, // src2 9510 Undef, // src3 9511 Op.getOperand(7), // vm 9512 DAG.getTargetConstant(1, DL, MVT::i1), // compr 9513 Op.getOperand(3), // en 9514 Op.getOperand(0) // Chain 9515 }; 9516 9517 unsigned Opc = Done->isZero() ? 
AMDGPU::EXP : AMDGPU::EXP_DONE; 9518 return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0); 9519 } 9520 case Intrinsic::amdgcn_s_barrier: { 9521 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 9522 if (getTargetMachine().getOptLevel() > CodeGenOptLevel::None) { 9523 unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second; 9524 if (WGSize <= ST.getWavefrontSize()) 9525 return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other, 9526 Op.getOperand(0)), 0); 9527 } 9528 9529 // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait 9530 if (ST.hasSplitBarriers()) { 9531 SDValue K = 9532 DAG.getTargetConstant(AMDGPU::Barrier::WORKGROUP, DL, MVT::i32); 9533 SDValue BarSignal = 9534 SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_IMM, DL, 9535 MVT::Other, K, Op.getOperand(0)), 9536 0); 9537 SDValue BarWait = 9538 SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_WAIT, DL, MVT::Other, K, 9539 BarSignal.getValue(0)), 9540 0); 9541 return BarWait; 9542 } 9543 9544 return SDValue(); 9545 }; 9546 9547 case Intrinsic::amdgcn_struct_tbuffer_store: 9548 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: { 9549 SDValue VData = Op.getOperand(2); 9550 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16); 9551 if (IsD16) 9552 VData = handleD16VData(VData, DAG); 9553 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG); 9554 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG); 9555 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget); 9556 SDValue Ops[] = { 9557 Chain, 9558 VData, // vdata 9559 Rsrc, // rsrc 9560 Op.getOperand(4), // vindex 9561 Offsets.first, // voffset 9562 SOffset, // soffset 9563 Offsets.second, // offset 9564 Op.getOperand(7), // format 9565 Op.getOperand(8), // cachepolicy, swizzled buffer 9566 DAG.getTargetConstant(1, DL, MVT::i1), // idxen 9567 }; 9568 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 : 9569 AMDGPUISD::TBUFFER_STORE_FORMAT; 9570 MemSDNode *M = cast<MemSDNode>(Op); 9571 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, 9572 M->getMemoryVT(), M->getMemOperand()); 9573 } 9574 9575 case Intrinsic::amdgcn_raw_tbuffer_store: 9576 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: { 9577 SDValue VData = Op.getOperand(2); 9578 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16); 9579 if (IsD16) 9580 VData = handleD16VData(VData, DAG); 9581 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG); 9582 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG); 9583 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget); 9584 SDValue Ops[] = { 9585 Chain, 9586 VData, // vdata 9587 Rsrc, // rsrc 9588 DAG.getConstant(0, DL, MVT::i32), // vindex 9589 Offsets.first, // voffset 9590 SOffset, // soffset 9591 Offsets.second, // offset 9592 Op.getOperand(6), // format 9593 Op.getOperand(7), // cachepolicy, swizzled buffer 9594 DAG.getTargetConstant(0, DL, MVT::i1), // idxen 9595 }; 9596 unsigned Opc = IsD16 ? 
AMDGPUISD::TBUFFER_STORE_FORMAT_D16 : 9597 AMDGPUISD::TBUFFER_STORE_FORMAT; 9598 MemSDNode *M = cast<MemSDNode>(Op); 9599 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, 9600 M->getMemoryVT(), M->getMemOperand()); 9601 } 9602 9603 case Intrinsic::amdgcn_raw_buffer_store: 9604 case Intrinsic::amdgcn_raw_ptr_buffer_store: 9605 case Intrinsic::amdgcn_raw_buffer_store_format: 9606 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: { 9607 const bool IsFormat = 9608 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format || 9609 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format; 9610 9611 SDValue VData = Op.getOperand(2); 9612 EVT VDataVT = VData.getValueType(); 9613 EVT EltType = VDataVT.getScalarType(); 9614 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16); 9615 if (IsD16) { 9616 VData = handleD16VData(VData, DAG); 9617 VDataVT = VData.getValueType(); 9618 } 9619 9620 if (!isTypeLegal(VDataVT)) { 9621 VData = 9622 DAG.getNode(ISD::BITCAST, DL, 9623 getEquivalentMemType(*DAG.getContext(), VDataVT), VData); 9624 } 9625 9626 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG); 9627 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG); 9628 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget); 9629 SDValue Ops[] = { 9630 Chain, 9631 VData, 9632 Rsrc, 9633 DAG.getConstant(0, DL, MVT::i32), // vindex 9634 Offsets.first, // voffset 9635 SOffset, // soffset 9636 Offsets.second, // offset 9637 Op.getOperand(6), // cachepolicy, swizzled buffer 9638 DAG.getTargetConstant(0, DL, MVT::i1), // idxen 9639 }; 9640 unsigned Opc = 9641 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE; 9642 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc; 9643 MemSDNode *M = cast<MemSDNode>(Op); 9644 9645 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics 9646 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32) 9647 return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M); 9648 9649 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, 9650 M->getMemoryVT(), M->getMemOperand()); 9651 } 9652 9653 case Intrinsic::amdgcn_struct_buffer_store: 9654 case Intrinsic::amdgcn_struct_ptr_buffer_store: 9655 case Intrinsic::amdgcn_struct_buffer_store_format: 9656 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: { 9657 const bool IsFormat = 9658 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format || 9659 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format; 9660 9661 SDValue VData = Op.getOperand(2); 9662 EVT VDataVT = VData.getValueType(); 9663 EVT EltType = VDataVT.getScalarType(); 9664 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16); 9665 9666 if (IsD16) { 9667 VData = handleD16VData(VData, DAG); 9668 VDataVT = VData.getValueType(); 9669 } 9670 9671 if (!isTypeLegal(VDataVT)) { 9672 VData = 9673 DAG.getNode(ISD::BITCAST, DL, 9674 getEquivalentMemType(*DAG.getContext(), VDataVT), VData); 9675 } 9676 9677 auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG); 9678 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG); 9679 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget); 9680 SDValue Ops[] = { 9681 Chain, 9682 VData, 9683 Rsrc, 9684 Op.getOperand(4), // vindex 9685 Offsets.first, // voffset 9686 SOffset, // soffset 9687 Offsets.second, // offset 9688 Op.getOperand(7), // cachepolicy, swizzled buffer 9689 DAG.getTargetConstant(1, DL, MVT::i1), // idxen 9690 }; 9691 unsigned Opc = 9692 !IsFormat ? AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT; 9693 Opc = IsD16 ? 
AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc; 9694 MemSDNode *M = cast<MemSDNode>(Op); 9695 9696 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics 9697 EVT VDataType = VData.getValueType().getScalarType(); 9698 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32) 9699 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M); 9700 9701 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, 9702 M->getMemoryVT(), M->getMemOperand()); 9703 } 9704 case Intrinsic::amdgcn_raw_buffer_load_lds: 9705 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: 9706 case Intrinsic::amdgcn_struct_buffer_load_lds: 9707 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: { 9708 assert(!AMDGPU::isGFX12Plus(*Subtarget)); 9709 unsigned Opc; 9710 bool HasVIndex = 9711 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds || 9712 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds; 9713 unsigned OpOffset = HasVIndex ? 1 : 0; 9714 SDValue VOffset = Op.getOperand(5 + OpOffset); 9715 bool HasVOffset = !isNullConstant(VOffset); 9716 unsigned Size = Op->getConstantOperandVal(4); 9717 9718 switch (Size) { 9719 default: 9720 return SDValue(); 9721 case 1: 9722 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN 9723 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN 9724 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN 9725 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET; 9726 break; 9727 case 2: 9728 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN 9729 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN 9730 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN 9731 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET; 9732 break; 9733 case 4: 9734 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN 9735 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN 9736 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN 9737 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET; 9738 break; 9739 } 9740 9741 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3)); 9742 9743 SmallVector<SDValue, 8> Ops; 9744 9745 if (HasVIndex && HasVOffset) 9746 Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL, 9747 { Op.getOperand(5), // VIndex 9748 VOffset })); 9749 else if (HasVIndex) 9750 Ops.push_back(Op.getOperand(5)); 9751 else if (HasVOffset) 9752 Ops.push_back(VOffset); 9753 9754 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG); 9755 Ops.push_back(Rsrc); 9756 Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset 9757 Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset 9758 unsigned Aux = Op.getConstantOperandVal(8 + OpOffset); 9759 Ops.push_back( 9760 DAG.getTargetConstant(Aux & AMDGPU::CPol::ALL, DL, MVT::i8)); // cpol 9761 Ops.push_back(DAG.getTargetConstant( 9762 Aux & AMDGPU::CPol::SWZ_pregfx12 ? 1 : 0, DL, MVT::i8)); // swz 9763 Ops.push_back(M0Val.getValue(0)); // Chain 9764 Ops.push_back(M0Val.getValue(1)); // Glue 9765 9766 auto *M = cast<MemSDNode>(Op); 9767 MachineMemOperand *LoadMMO = M->getMemOperand(); 9768 // Don't set the offset value here because the pointer points to the base of 9769 // the buffer. 
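    // This operation reads from the buffer and writes the result into LDS, so
    // two memory operands are attached to the machine node below: a load MMO
    // for the global/buffer side and a store MMO for the LDS side.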
9770 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo(); 9771 9772 MachinePointerInfo StorePtrI = LoadPtrI; 9773 LoadPtrI.V = PoisonValue::get( 9774 PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS)); 9775 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS; 9776 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS; 9777 9778 auto F = LoadMMO->getFlags() & 9779 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad); 9780 LoadMMO = 9781 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size, 9782 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo()); 9783 9784 MachineMemOperand *StoreMMO = MF.getMachineMemOperand( 9785 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), 9786 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo()); 9787 9788 auto Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops); 9789 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO}); 9790 9791 return SDValue(Load, 0); 9792 } 9793 case Intrinsic::amdgcn_global_load_lds: { 9794 unsigned Opc; 9795 unsigned Size = Op->getConstantOperandVal(4); 9796 switch (Size) { 9797 default: 9798 return SDValue(); 9799 case 1: 9800 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE; 9801 break; 9802 case 2: 9803 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT; 9804 break; 9805 case 4: 9806 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD; 9807 break; 9808 } 9809 9810 auto *M = cast<MemSDNode>(Op); 9811 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3)); 9812 9813 SmallVector<SDValue, 6> Ops; 9814 9815 SDValue Addr = Op.getOperand(2); // Global ptr 9816 SDValue VOffset; 9817 // Try to split SAddr and VOffset. Global and LDS pointers share the same 9818 // immediate offset, so we cannot use a regular SelectGlobalSAddr(). 9819 if (Addr->isDivergent() && Addr.getOpcode() == ISD::ADD) { 9820 SDValue LHS = Addr.getOperand(0); 9821 SDValue RHS = Addr.getOperand(1); 9822 9823 if (LHS->isDivergent()) 9824 std::swap(LHS, RHS); 9825 9826 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND && 9827 RHS.getOperand(0).getValueType() == MVT::i32) { 9828 // add (i64 sgpr), (zero_extend (i32 vgpr)) 9829 Addr = LHS; 9830 VOffset = RHS.getOperand(0); 9831 } 9832 } 9833 9834 Ops.push_back(Addr); 9835 if (!Addr->isDivergent()) { 9836 Opc = AMDGPU::getGlobalSaddrOp(Opc); 9837 if (!VOffset) 9838 VOffset = SDValue( 9839 DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, 9840 DAG.getTargetConstant(0, DL, MVT::i32)), 0); 9841 Ops.push_back(VOffset); 9842 } 9843 9844 Ops.push_back(Op.getOperand(5)); // Offset 9845 Ops.push_back(Op.getOperand(6)); // CPol 9846 Ops.push_back(M0Val.getValue(0)); // Chain 9847 Ops.push_back(M0Val.getValue(1)); // Glue 9848 9849 MachineMemOperand *LoadMMO = M->getMemOperand(); 9850 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo(); 9851 LoadPtrI.Offset = Op->getConstantOperandVal(5); 9852 MachinePointerInfo StorePtrI = LoadPtrI; 9853 LoadPtrI.V = PoisonValue::get( 9854 PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS)); 9855 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS; 9856 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS; 9857 auto F = LoadMMO->getFlags() & 9858 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad); 9859 LoadMMO = 9860 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size, 9861 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo()); 9862 MachineMemOperand *StoreMMO = MF.getMachineMemOperand( 9863 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4), 9864 LoadMMO->getAAInfo()); 9865 9866 auto Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops); 9867 DAG.setNodeMemRefs(Load, 
{LoadMMO, StoreMMO}); 9868 9869 return SDValue(Load, 0); 9870 } 9871 case Intrinsic::amdgcn_end_cf: 9872 return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other, 9873 Op->getOperand(2), Chain), 0); 9874 case Intrinsic::amdgcn_s_barrier_init: 9875 case Intrinsic::amdgcn_s_barrier_join: 9876 case Intrinsic::amdgcn_s_wakeup_barrier: { 9877 SDValue Chain = Op->getOperand(0); 9878 SmallVector<SDValue, 2> Ops; 9879 SDValue BarOp = Op->getOperand(2); 9880 unsigned Opc; 9881 bool IsInlinableBarID = false; 9882 int64_t BarVal; 9883 9884 if (isa<ConstantSDNode>(BarOp)) { 9885 BarVal = cast<ConstantSDNode>(BarOp)->getSExtValue(); 9886 IsInlinableBarID = AMDGPU::isInlinableIntLiteral(BarVal); 9887 } 9888 9889 if (IsInlinableBarID) { 9890 switch (IntrinsicID) { 9891 default: 9892 return SDValue(); 9893 case Intrinsic::amdgcn_s_barrier_init: 9894 Opc = AMDGPU::S_BARRIER_INIT_IMM; 9895 break; 9896 case Intrinsic::amdgcn_s_barrier_join: 9897 Opc = AMDGPU::S_BARRIER_JOIN_IMM; 9898 break; 9899 case Intrinsic::amdgcn_s_wakeup_barrier: 9900 Opc = AMDGPU::S_WAKEUP_BARRIER_IMM; 9901 break; 9902 } 9903 9904 SDValue K = DAG.getTargetConstant(BarVal, DL, MVT::i32); 9905 Ops.push_back(K); 9906 } else { 9907 switch (IntrinsicID) { 9908 default: 9909 return SDValue(); 9910 case Intrinsic::amdgcn_s_barrier_init: 9911 Opc = AMDGPU::S_BARRIER_INIT_M0; 9912 break; 9913 case Intrinsic::amdgcn_s_barrier_join: 9914 Opc = AMDGPU::S_BARRIER_JOIN_M0; 9915 break; 9916 case Intrinsic::amdgcn_s_wakeup_barrier: 9917 Opc = AMDGPU::S_WAKEUP_BARRIER_M0; 9918 break; 9919 } 9920 } 9921 9922 if (IntrinsicID == Intrinsic::amdgcn_s_barrier_init) { 9923 SDValue M0Val; 9924 // Member count will be read from M0[16:22] 9925 M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, Op.getOperand(3), 9926 DAG.getShiftAmountConstant(16, MVT::i32, DL)); 9927 9928 if (!IsInlinableBarID) { 9929 // If reference to barrier id is not an inline constant then it must be 9930 // referenced with M0[4:0]. Perform an OR with the member count to 9931 // include it in M0. 9932 M0Val = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, 9933 Op.getOperand(2), M0Val), 9934 0); 9935 } 9936 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0)); 9937 } else if (!IsInlinableBarID) { 9938 Ops.push_back(copyToM0(DAG, Chain, DL, BarOp).getValue(0)); 9939 } 9940 9941 auto NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops); 9942 return SDValue(NewMI, 0); 9943 } 9944 default: { 9945 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 9946 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID)) 9947 return lowerImage(Op, ImageDimIntr, DAG, true); 9948 9949 return Op; 9950 } 9951 } 9952 } 9953 9954 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 9955 // offset (the offset that is included in bounds checking and swizzling, to be 9956 // split between the instruction's voffset and immoffset fields) and soffset 9957 // (the offset that is excluded from bounds checking and swizzling, to go in 9958 // the instruction's soffset field). This function takes the first kind of 9959 // offset and figures out how to split it between voffset and immoffset. 
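// For example, assuming a maximum immediate offset of 4095 (the actual value
// comes from SIInstrInfo::getMaxMUBUFImmOffset for the subtarget), a combined
// offset of 4100 is split into an immediate offset of 4 plus a voffset of
// 4096, while a combined offset of 8 fits entirely in the immediate field and
// yields a zero voffset.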
std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
    SDValue Offset, SelectionDAG &DAG) const {
  SDLoc DL(Offset);
  const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
  SDValue N0 = Offset;
  ConstantSDNode *C1 = nullptr;

  if ((C1 = dyn_cast<ConstantSDNode>(N0)))
    N0 = SDValue();
  else if (DAG.isBaseWithConstantOffset(N0)) {
    C1 = cast<ConstantSDNode>(N0.getOperand(1));
    N0 = N0.getOperand(0);
  }

  if (C1) {
    unsigned ImmOffset = C1->getZExtValue();
    // If the immediate value is too big for the immoffset field, put only bits
    // that would normally fit in the immoffset field. The remaining value that
    // is copied/added for the voffset field is a large power of 2, and it
    // stands more chance of being CSEd with the copy/add for another similar
    // load/store.
    // However, do not do that rounding down if that is a negative
    // number, as it appears to be illegal to have a negative offset in the
    // vgpr, even if adding the immediate offset makes it positive.
    unsigned Overflow = ImmOffset & ~MaxImm;
    ImmOffset -= Overflow;
    if ((int32_t)Overflow < 0) {
      Overflow += ImmOffset;
      ImmOffset = 0;
    }
    C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
    if (Overflow) {
      auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
      if (!N0)
        N0 = OverflowVal;
      else {
        SDValue Ops[] = { N0, OverflowVal };
        N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
      }
    }
  }
  if (!N0)
    N0 = DAG.getConstant(0, DL, MVT::i32);
  if (!C1)
    C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
  return {N0, SDValue(C1, 0)};
}

// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
// the three offsets (voffset, soffset and instoffset) into the SDValue[3]
// array pointed to by Offsets.
void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
                                        SelectionDAG &DAG, SDValue *Offsets,
                                        Align Alignment) const {
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  SDLoc DL(CombinedOffset);
  if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
    uint32_t Imm = C->getZExtValue();
    uint32_t SOffset, ImmOffset;
    if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
      Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
      Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
      Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
      return;
    }
  }
  if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
    SDValue N0 = CombinedOffset.getOperand(0);
    SDValue N1 = CombinedOffset.getOperand(1);
    uint32_t SOffset, ImmOffset;
    int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
    if (Offset >= 0 &&
        TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
      Offsets[0] = N0;
      Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
      Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
      return;
    }
  }

  SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
                            ?
DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32) 10042 : DAG.getConstant(0, DL, MVT::i32); 10043 10044 Offsets[0] = CombinedOffset; 10045 Offsets[1] = SOffsetZero; 10046 Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32); 10047 } 10048 10049 SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer, 10050 SelectionDAG &DAG) const { 10051 if (!MaybePointer.getValueType().isScalarInteger()) 10052 return MaybePointer; 10053 10054 SDLoc DL(MaybePointer); 10055 10056 SDValue Rsrc = DAG.getBitcast(MVT::v4i32, MaybePointer); 10057 return Rsrc; 10058 } 10059 10060 // Wrap a global or flat pointer into a buffer intrinsic using the flags 10061 // specified in the intrinsic. 10062 SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op, 10063 SelectionDAG &DAG) const { 10064 SDLoc Loc(Op); 10065 10066 SDValue Pointer = Op->getOperand(1); 10067 SDValue Stride = Op->getOperand(2); 10068 SDValue NumRecords = Op->getOperand(3); 10069 SDValue Flags = Op->getOperand(4); 10070 10071 auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32); 10072 SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32); 10073 SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask); 10074 std::optional<uint32_t> ConstStride = std::nullopt; 10075 if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride)) 10076 ConstStride = ConstNode->getZExtValue(); 10077 10078 SDValue NewHighHalf = Masked; 10079 if (!ConstStride || *ConstStride != 0) { 10080 SDValue ShiftedStride; 10081 if (ConstStride) { 10082 ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32); 10083 } else { 10084 SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32); 10085 ShiftedStride = 10086 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride, 10087 DAG.getShiftAmountConstant(16, MVT::i32, Loc)); 10088 } 10089 NewHighHalf = DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride); 10090 } 10091 10092 SDValue Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf, 10093 NewHighHalf, NumRecords, Flags); 10094 SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc); 10095 return RsrcPtr; 10096 } 10097 10098 // Handle 8 bit and 16 bit buffer loads 10099 SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG, 10100 EVT LoadVT, SDLoc DL, 10101 ArrayRef<SDValue> Ops, 10102 MachineMemOperand *MMO, 10103 bool IsTFE) const { 10104 EVT IntVT = LoadVT.changeTypeToInteger(); 10105 10106 if (IsTFE) { 10107 unsigned Opc = (LoadVT.getScalarType() == MVT::i8) 10108 ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE 10109 : AMDGPUISD::BUFFER_LOAD_USHORT_TFE; 10110 MachineFunction &MF = DAG.getMachineFunction(); 10111 MachineMemOperand *OpMMO = MF.getMachineMemOperand(MMO, 0, 8); 10112 SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other); 10113 SDValue Op = getMemIntrinsicNode(Opc, DL, VTs, Ops, MVT::v2i32, OpMMO, DAG); 10114 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, 10115 DAG.getConstant(1, DL, MVT::i32)); 10116 SDValue Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, 10117 DAG.getConstant(0, DL, MVT::i32)); 10118 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Data); 10119 SDValue Value = DAG.getNode(ISD::BITCAST, DL, LoadVT, Trunc); 10120 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL); 10121 } 10122 10123 unsigned Opc = (LoadVT.getScalarType() == MVT::i8) ? 
                     AMDGPUISD::BUFFER_LOAD_UBYTE : AMDGPUISD::BUFFER_LOAD_USHORT;

  SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
  SDValue BufferLoad =
      DAG.getMemIntrinsicNode(Opc, DL, ResList, Ops, IntVT, MMO);
  SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
  LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);

  return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
}

// Handle 8 bit and 16 bit buffer stores
SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
                                                      EVT VDataType, SDLoc DL,
                                                      SDValue Ops[],
                                                      MemSDNode *M) const {
  if (VDataType == MVT::f16 || VDataType == MVT::bf16)
    Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);

  SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
  Ops[1] = BufferStoreExt;
  unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE :
                                          AMDGPUISD::BUFFER_STORE_SHORT;
  ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
  return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
                                 M->getMemOperand());
}

static SDValue getLoadExtOrTrunc(SelectionDAG &DAG,
                                 ISD::LoadExtType ExtType, SDValue Op,
                                 const SDLoc &SL, EVT VT) {
  if (VT.bitsLT(Op.getValueType()))
    return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);

  switch (ExtType) {
  case ISD::SEXTLOAD:
    return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
  case ISD::ZEXTLOAD:
    return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
  case ISD::EXTLOAD:
    return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
  case ISD::NON_EXTLOAD:
    return Op;
  }

  llvm_unreachable("invalid ext type");
}

// Try to turn 8 and 16-bit scalar loads into SMEM eligible 32-bit loads.
// TODO: Skip this on GFX12 which does have scalar sub-dword loads.
SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  if (Ld->getAlign() < Align(4) || Ld->isDivergent())
    return SDValue();

  // FIXME: Constant loads should all be marked invariant.
  unsigned AS = Ld->getAddressSpace();
  if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
      AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
      (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
    return SDValue();

  // Don't do this early, since it may interfere with adjacent load merging for
  // illegal types. We can avoid losing alignment information for exotic types
  // pre-legalize.
  EVT MemVT = Ld->getMemoryVT();
  if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
      MemVT.getSizeInBits() >= 32)
    return SDValue();

  SDLoc SL(Ld);

  assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
         "unexpected vector extload");

  // TODO: Drop only high part of range.
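  // The dword load below reuses the original chain, pointer and alignment; the
  // original extension semantics (sext/zext) are then re-applied to the low
  // bits before converting to the final result type.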
10200 SDValue Ptr = Ld->getBasePtr(); 10201 SDValue NewLoad = DAG.getLoad( 10202 ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr, 10203 Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(), 10204 Ld->getMemOperand()->getFlags(), Ld->getAAInfo(), 10205 nullptr); // Drop ranges 10206 10207 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()); 10208 if (MemVT.isFloatingPoint()) { 10209 assert(Ld->getExtensionType() == ISD::NON_EXTLOAD && 10210 "unexpected fp extload"); 10211 TruncVT = MemVT.changeTypeToInteger(); 10212 } 10213 10214 SDValue Cvt = NewLoad; 10215 if (Ld->getExtensionType() == ISD::SEXTLOAD) { 10216 Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad, 10217 DAG.getValueType(TruncVT)); 10218 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD || 10219 Ld->getExtensionType() == ISD::NON_EXTLOAD) { 10220 Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT); 10221 } else { 10222 assert(Ld->getExtensionType() == ISD::EXTLOAD); 10223 } 10224 10225 EVT VT = Ld->getValueType(0); 10226 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits()); 10227 10228 DCI.AddToWorklist(Cvt.getNode()); 10229 10230 // We may need to handle exotic cases, such as i16->i64 extloads, so insert 10231 // the appropriate extension from the 32-bit load. 10232 Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT); 10233 DCI.AddToWorklist(Cvt.getNode()); 10234 10235 // Handle conversion back to floating point if necessary. 10236 Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt); 10237 10238 return DAG.getMergeValues({ Cvt, NewLoad.getValue(1) }, SL); 10239 } 10240 10241 static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, 10242 const SIMachineFunctionInfo &Info) { 10243 // TODO: Should check if the address can definitely not access stack. 10244 if (Info.isEntryFunction()) 10245 return Info.getUserSGPRInfo().hasFlatScratchInit(); 10246 return true; 10247 } 10248 10249 SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { 10250 SDLoc DL(Op); 10251 LoadSDNode *Load = cast<LoadSDNode>(Op); 10252 ISD::LoadExtType ExtType = Load->getExtensionType(); 10253 EVT MemVT = Load->getMemoryVT(); 10254 10255 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) { 10256 if (MemVT == MVT::i16 && isTypeLegal(MVT::i16)) 10257 return SDValue(); 10258 10259 // FIXME: Copied from PPC 10260 // First, load into 32 bits, then truncate to 1 bit. 10261 10262 SDValue Chain = Load->getChain(); 10263 SDValue BasePtr = Load->getBasePtr(); 10264 MachineMemOperand *MMO = Load->getMemOperand(); 10265 10266 EVT RealMemVT = (MemVT == MVT::i1) ? 
MVT::i8 : MVT::i16; 10267 10268 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, 10269 BasePtr, RealMemVT, MMO); 10270 10271 if (!MemVT.isVector()) { 10272 SDValue Ops[] = { 10273 DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD), 10274 NewLD.getValue(1) 10275 }; 10276 10277 return DAG.getMergeValues(Ops, DL); 10278 } 10279 10280 SmallVector<SDValue, 3> Elts; 10281 for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) { 10282 SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD, 10283 DAG.getConstant(I, DL, MVT::i32)); 10284 10285 Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt)); 10286 } 10287 10288 SDValue Ops[] = { 10289 DAG.getBuildVector(MemVT, DL, Elts), 10290 NewLD.getValue(1) 10291 }; 10292 10293 return DAG.getMergeValues(Ops, DL); 10294 } 10295 10296 if (!MemVT.isVector()) 10297 return SDValue(); 10298 10299 assert(Op.getValueType().getVectorElementType() == MVT::i32 && 10300 "Custom lowering for non-i32 vectors hasn't been implemented."); 10301 10302 Align Alignment = Load->getAlign(); 10303 unsigned AS = Load->getAddressSpace(); 10304 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS && 10305 Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) { 10306 return SplitVectorLoad(Op, DAG); 10307 } 10308 10309 MachineFunction &MF = DAG.getMachineFunction(); 10310 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 10311 // If there is a possibility that flat instruction access scratch memory 10312 // then we need to use the same legalization rules we use for private. 10313 if (AS == AMDGPUAS::FLAT_ADDRESS && 10314 !Subtarget->hasMultiDwordFlatScratchAddressing()) 10315 AS = addressMayBeAccessedAsPrivate(Load->getMemOperand(), *MFI) ? 10316 AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS; 10317 10318 unsigned NumElements = MemVT.getVectorNumElements(); 10319 10320 if (AS == AMDGPUAS::CONSTANT_ADDRESS || 10321 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 10322 if (!Op->isDivergent() && Alignment >= Align(4) && NumElements < 32) { 10323 if (MemVT.isPow2VectorType() || 10324 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3)) 10325 return SDValue(); 10326 return WidenOrSplitVectorLoad(Op, DAG); 10327 } 10328 // Non-uniform loads will be selected to MUBUF instructions, so they 10329 // have the same legalization requirements as global and private 10330 // loads. 10331 // 10332 } 10333 10334 if (AS == AMDGPUAS::CONSTANT_ADDRESS || 10335 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT || 10336 AS == AMDGPUAS::GLOBAL_ADDRESS) { 10337 if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() && 10338 Load->isSimple() && isMemOpHasNoClobberedMemOperand(Load) && 10339 Alignment >= Align(4) && NumElements < 32) { 10340 if (MemVT.isPow2VectorType() || 10341 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3)) 10342 return SDValue(); 10343 return WidenOrSplitVectorLoad(Op, DAG); 10344 } 10345 // Non-uniform loads will be selected to MUBUF instructions, so they 10346 // have the same legalization requirements as global and private 10347 // loads. 10348 // 10349 } 10350 if (AS == AMDGPUAS::CONSTANT_ADDRESS || 10351 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT || 10352 AS == AMDGPUAS::GLOBAL_ADDRESS || 10353 AS == AMDGPUAS::FLAT_ADDRESS) { 10354 if (NumElements > 4) 10355 return SplitVectorLoad(Op, DAG); 10356 // v3 loads not supported on SI. 
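    // Such loads are widened to dwordx4 or split instead (see
    // WidenOrSplitVectorLoad below).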
10357 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores()) 10358 return WidenOrSplitVectorLoad(Op, DAG); 10359 10360 // v3 and v4 loads are supported for private and global memory. 10361 return SDValue(); 10362 } 10363 if (AS == AMDGPUAS::PRIVATE_ADDRESS) { 10364 // Depending on the setting of the private_element_size field in the 10365 // resource descriptor, we can only make private accesses up to a certain 10366 // size. 10367 switch (Subtarget->getMaxPrivateElementSize()) { 10368 case 4: { 10369 SDValue Ops[2]; 10370 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG); 10371 return DAG.getMergeValues(Ops, DL); 10372 } 10373 case 8: 10374 if (NumElements > 2) 10375 return SplitVectorLoad(Op, DAG); 10376 return SDValue(); 10377 case 16: 10378 // Same as global/flat 10379 if (NumElements > 4) 10380 return SplitVectorLoad(Op, DAG); 10381 // v3 loads not supported on SI. 10382 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores()) 10383 return WidenOrSplitVectorLoad(Op, DAG); 10384 10385 return SDValue(); 10386 default: 10387 llvm_unreachable("unsupported private_element_size"); 10388 } 10389 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 10390 unsigned Fast = 0; 10391 auto Flags = Load->getMemOperand()->getFlags(); 10392 if (allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS, 10393 Load->getAlign(), Flags, &Fast) && 10394 Fast > 1) 10395 return SDValue(); 10396 10397 if (MemVT.isVector()) 10398 return SplitVectorLoad(Op, DAG); 10399 } 10400 10401 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), 10402 MemVT, *Load->getMemOperand())) { 10403 SDValue Ops[2]; 10404 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG); 10405 return DAG.getMergeValues(Ops, DL); 10406 } 10407 10408 return SDValue(); 10409 } 10410 10411 SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { 10412 EVT VT = Op.getValueType(); 10413 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 || 10414 VT.getSizeInBits() == 512) 10415 return splitTernaryVectorOp(Op, DAG); 10416 10417 assert(VT.getSizeInBits() == 64); 10418 10419 SDLoc DL(Op); 10420 SDValue Cond = Op.getOperand(0); 10421 10422 SDValue Zero = DAG.getConstant(0, DL, MVT::i32); 10423 SDValue One = DAG.getConstant(1, DL, MVT::i32); 10424 10425 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1)); 10426 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2)); 10427 10428 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero); 10429 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero); 10430 10431 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1); 10432 10433 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One); 10434 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One); 10435 10436 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1); 10437 10438 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi}); 10439 return DAG.getNode(ISD::BITCAST, DL, VT, Res); 10440 } 10441 10442 // Catch division cases where we can use shortcuts with rcp and rsq 10443 // instructions. 
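// In summary: 1.0 / x becomes rcp(x), -1.0 / x becomes rcp(-x), and otherwise
// x / y becomes x * rcp(y); each shortcut is only taken when the fast-math
// flags on the node (afn, or arcp for f16) or global unsafe-fp-math allow the
// reduced accuracy.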
SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
                                              SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  EVT VT = Op.getValueType();
  const SDNodeFlags Flags = Op->getFlags();

  bool AllowInaccurateRcp = Flags.hasApproximateFuncs() ||
                            DAG.getTarget().Options.UnsafeFPMath;

  if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
    // Without !fpmath accuracy information, we can't do more because we don't
    // know exactly whether rcp is accurate enough to meet !fpmath requirement.
    // f16 is always accurate enough.
    if (!AllowInaccurateRcp && VT != MVT::f16)
      return SDValue();

    if (CLHS->isExactlyValue(1.0)) {
      // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
      // the CI documentation have a worst case error of 1 ulp.
      // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
      // use it as long as we aren't trying to use denormals.
      //
      // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.

      // 1.0 / sqrt(x) -> rsq(x)

      // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
      // error seems really high at 2^29 ULP.
      // 1.0 / x -> rcp(x)
      return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
    }

    // Same as for 1.0, but expand the sign out of the constant.
    if (CLHS->isExactlyValue(-1.0)) {
      // -1.0 / x -> rcp (fneg x)
      SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
      return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
    }
  }

  // For f16 require afn or arcp.
  // For f32 require afn.
  if (!AllowInaccurateRcp && (VT != MVT::f16 || !Flags.hasAllowReciprocal()))
    return SDValue();

  // Turn into multiply by the reciprocal.
10492 // x / y -> x * (1.0 / y) 10493 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); 10494 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags); 10495 } 10496 10497 SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op, 10498 SelectionDAG &DAG) const { 10499 SDLoc SL(Op); 10500 SDValue X = Op.getOperand(0); 10501 SDValue Y = Op.getOperand(1); 10502 EVT VT = Op.getValueType(); 10503 const SDNodeFlags Flags = Op->getFlags(); 10504 10505 bool AllowInaccurateDiv = Flags.hasApproximateFuncs() || 10506 DAG.getTarget().Options.UnsafeFPMath; 10507 if (!AllowInaccurateDiv) 10508 return SDValue(); 10509 10510 SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y); 10511 SDValue One = DAG.getConstantFP(1.0, SL, VT); 10512 10513 SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y); 10514 SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One); 10515 10516 R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R); 10517 SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One); 10518 R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R); 10519 SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R); 10520 SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X); 10521 return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret); 10522 } 10523 10524 static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, 10525 EVT VT, SDValue A, SDValue B, SDValue GlueChain, 10526 SDNodeFlags Flags) { 10527 if (GlueChain->getNumValues() <= 1) { 10528 return DAG.getNode(Opcode, SL, VT, A, B, Flags); 10529 } 10530 10531 assert(GlueChain->getNumValues() == 3); 10532 10533 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue); 10534 switch (Opcode) { 10535 default: llvm_unreachable("no chain equivalent for opcode"); 10536 case ISD::FMUL: 10537 Opcode = AMDGPUISD::FMUL_W_CHAIN; 10538 break; 10539 } 10540 10541 return DAG.getNode(Opcode, SL, VTList, 10542 {GlueChain.getValue(1), A, B, GlueChain.getValue(2)}, 10543 Flags); 10544 } 10545 10546 static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, 10547 EVT VT, SDValue A, SDValue B, SDValue C, 10548 SDValue GlueChain, SDNodeFlags Flags) { 10549 if (GlueChain->getNumValues() <= 1) { 10550 return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags); 10551 } 10552 10553 assert(GlueChain->getNumValues() == 3); 10554 10555 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue); 10556 switch (Opcode) { 10557 default: llvm_unreachable("no chain equivalent for opcode"); 10558 case ISD::FMA: 10559 Opcode = AMDGPUISD::FMA_W_CHAIN; 10560 break; 10561 } 10562 10563 return DAG.getNode(Opcode, SL, VTList, 10564 {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)}, 10565 Flags); 10566 } 10567 10568 SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const { 10569 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG)) 10570 return FastLowered; 10571 10572 SDLoc SL(Op); 10573 SDValue Src0 = Op.getOperand(0); 10574 SDValue Src1 = Op.getOperand(1); 10575 10576 SDValue CvtSrc0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0); 10577 SDValue CvtSrc1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1); 10578 10579 SDValue RcpSrc1 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, CvtSrc1); 10580 SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, CvtSrc0, RcpSrc1); 10581 10582 SDValue FPRoundFlag = DAG.getTargetConstant(0, SL, MVT::i32); 10583 SDValue BestQuot = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag); 10584 10585 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, BestQuot, Src1, Src0); 10586 } 10587 10588 // Faster 2.5 ULP 
// division that does not support denormals.
SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
  SDNodeFlags Flags = Op->getFlags();
  SDLoc SL(Op);
  SDValue LHS = Op.getOperand(1);
  SDValue RHS = Op.getOperand(2);

  SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);

  const APFloat K0Val(0x1p+96f);
  const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);

  const APFloat K1Val(0x1p-32f);
  const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);

  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);

  EVT SetCCVT =
      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);

  SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);

  SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One, Flags);

  r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3, Flags);

  // rcp does not support denormals.
  SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1, Flags);

  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0, Flags);

  return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul, Flags);
}

// Returns immediate value for setting the F32 denorm mode when using the
// S_DENORM_MODE instruction.
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG,
                                    const SIMachineFunctionInfo *Info,
                                    const GCNSubtarget *ST) {
  assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
  uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
  uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
  return DAG.getTargetConstant(Mode, SDLoc(), MVT::i32);
}

SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
  if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
    return FastLowered;

  // The selection matcher assumes that anything with a chain selects to a
  // mayRaiseFPException machine instruction. Since we're introducing a chain
  // here, we need to explicitly report nofpexcept for the regular fdiv
  // lowering.
  SDNodeFlags Flags = Op->getFlags();
  Flags.setNoFPExcept(true);

  SDLoc SL(Op);
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);

  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);

  SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);

  SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
                                          {RHS, RHS, LHS}, Flags);
  SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
                                        {LHS, RHS, LHS}, Flags);

  // Denominator is scaled to not be denormal, so using rcp is ok.
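  // The approximate reciprocal below is refined with a chain of FMAs and fed
  // to DIV_FMAS/DIV_FIXUP; when the function does not preserve f32 denormals,
  // denormal flushing is temporarily disabled around that FMA sequence (via
  // S_DENORM_MODE or S_SETREG).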
10658 SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, 10659 DenominatorScaled, Flags); 10660 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32, 10661 DenominatorScaled, Flags); 10662 10663 using namespace AMDGPU::Hwreg; 10664 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2); 10665 const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32); 10666 10667 const MachineFunction &MF = DAG.getMachineFunction(); 10668 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 10669 const DenormalMode DenormMode = Info->getMode().FP32Denormals; 10670 10671 const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE(); 10672 const bool HasDynamicDenormals = 10673 (DenormMode.Input == DenormalMode::Dynamic) || 10674 (DenormMode.Output == DenormalMode::Dynamic); 10675 10676 SDValue SavedDenormMode; 10677 10678 if (!PreservesDenormals) { 10679 // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV 10680 // lowering. The chain dependence is insufficient, and we need glue. We do 10681 // not need the glue variants in a strictfp function. 10682 10683 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); 10684 10685 SDValue Glue = DAG.getEntryNode(); 10686 if (HasDynamicDenormals) { 10687 SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL, 10688 DAG.getVTList(MVT::i32, MVT::Glue), 10689 {BitField, Glue}); 10690 SavedDenormMode = SDValue(GetReg, 0); 10691 10692 Glue = DAG.getMergeValues( 10693 {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL); 10694 } 10695 10696 SDNode *EnableDenorm; 10697 if (Subtarget->hasDenormModeInst()) { 10698 const SDValue EnableDenormValue = 10699 getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, Subtarget); 10700 10701 EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue, 10702 EnableDenormValue) 10703 .getNode(); 10704 } else { 10705 const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE, 10706 SL, MVT::i32); 10707 EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs, 10708 {EnableDenormValue, BitField, Glue}); 10709 } 10710 10711 SDValue Ops[3] = { 10712 NegDivScale0, 10713 SDValue(EnableDenorm, 0), 10714 SDValue(EnableDenorm, 1) 10715 }; 10716 10717 NegDivScale0 = DAG.getMergeValues(Ops, SL); 10718 } 10719 10720 SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, 10721 ApproxRcp, One, NegDivScale0, Flags); 10722 10723 SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp, 10724 ApproxRcp, Fma0, Flags); 10725 10726 SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled, 10727 Fma1, Fma1, Flags); 10728 10729 SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul, 10730 NumeratorScaled, Mul, Flags); 10731 10732 SDValue Fma3 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, 10733 Fma2, Fma1, Mul, Fma2, Flags); 10734 10735 SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3, 10736 NumeratorScaled, Fma3, Flags); 10737 10738 if (!PreservesDenormals) { 10739 SDNode *DisableDenorm; 10740 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) { 10741 const SDValue DisableDenormValue = getSPDenormModeValue( 10742 FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget); 10743 10744 DisableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, MVT::Other, 10745 Fma4.getValue(1), DisableDenormValue, 10746 Fma4.getValue(2)).getNode(); 10747 } else { 10748 assert(HasDynamicDenormals == (bool)SavedDenormMode); 10749 const 
SDValue DisableDenormValue = 10750 HasDynamicDenormals 10751 ? SavedDenormMode 10752 : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32); 10753 10754 DisableDenorm = DAG.getMachineNode( 10755 AMDGPU::S_SETREG_B32, SL, MVT::Other, 10756 {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)}); 10757 } 10758 10759 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other, 10760 SDValue(DisableDenorm, 0), DAG.getRoot()); 10761 DAG.setRoot(OutputChain); 10762 } 10763 10764 SDValue Scale = NumeratorScaled.getValue(1); 10765 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32, 10766 {Fma4, Fma1, Fma3, Scale}, Flags); 10767 10768 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags); 10769 } 10770 10771 SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const { 10772 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG)) 10773 return FastLowered; 10774 10775 SDLoc SL(Op); 10776 SDValue X = Op.getOperand(0); 10777 SDValue Y = Op.getOperand(1); 10778 10779 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64); 10780 10781 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1); 10782 10783 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X); 10784 10785 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0); 10786 10787 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0); 10788 10789 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One); 10790 10791 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp); 10792 10793 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One); 10794 10795 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X); 10796 10797 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1); 10798 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3); 10799 10800 SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64, 10801 NegDivScale0, Mul, DivScale1); 10802 10803 SDValue Scale; 10804 10805 if (!Subtarget->hasUsableDivScaleConditionOutput()) { 10806 // Workaround a hardware bug on SI where the condition output from div_scale 10807 // is not usable. 10808 10809 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32); 10810 10811 // Figure out if the scale to use for div_fmas. 
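    // DIV_SCALE leaves an operand's bits unchanged unless it actually applied
    // a scale, so comparing the high dword of each scaled value against the
    // corresponding original operand tells us whether that operand was
    // scaled; the XOR of the two compares stands in for the unusable
    // condition output.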
10812 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X); 10813 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y); 10814 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0); 10815 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1); 10816 10817 SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi); 10818 SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi); 10819 10820 SDValue Scale0Hi 10821 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi); 10822 SDValue Scale1Hi 10823 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi); 10824 10825 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ); 10826 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ); 10827 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen); 10828 } else { 10829 Scale = DivScale1.getValue(1); 10830 } 10831 10832 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, 10833 Fma4, Fma3, Mul, Scale); 10834 10835 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X); 10836 } 10837 10838 SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const { 10839 EVT VT = Op.getValueType(); 10840 10841 if (VT == MVT::f32) 10842 return LowerFDIV32(Op, DAG); 10843 10844 if (VT == MVT::f64) 10845 return LowerFDIV64(Op, DAG); 10846 10847 if (VT == MVT::f16) 10848 return LowerFDIV16(Op, DAG); 10849 10850 llvm_unreachable("Unexpected type for fdiv"); 10851 } 10852 10853 SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const { 10854 SDLoc dl(Op); 10855 SDValue Val = Op.getOperand(0); 10856 EVT VT = Val.getValueType(); 10857 EVT ResultExpVT = Op->getValueType(1); 10858 EVT InstrExpVT = VT == MVT::f16 ? 
MVT::i16 : MVT::i32; 10859 10860 SDValue Mant = DAG.getNode( 10861 ISD::INTRINSIC_WO_CHAIN, dl, VT, 10862 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_mant, dl, MVT::i32), Val); 10863 10864 SDValue Exp = DAG.getNode( 10865 ISD::INTRINSIC_WO_CHAIN, dl, InstrExpVT, 10866 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_exp, dl, MVT::i32), Val); 10867 10868 if (Subtarget->hasFractBug()) { 10869 SDValue Fabs = DAG.getNode(ISD::FABS, dl, VT, Val); 10870 SDValue Inf = DAG.getConstantFP( 10871 APFloat::getInf(SelectionDAG::EVTToAPFloatSemantics(VT)), dl, VT); 10872 10873 SDValue IsFinite = DAG.getSetCC(dl, MVT::i1, Fabs, Inf, ISD::SETOLT); 10874 SDValue Zero = DAG.getConstant(0, dl, InstrExpVT); 10875 Exp = DAG.getNode(ISD::SELECT, dl, InstrExpVT, IsFinite, Exp, Zero); 10876 Mant = DAG.getNode(ISD::SELECT, dl, VT, IsFinite, Mant, Val); 10877 } 10878 10879 SDValue CastExp = DAG.getSExtOrTrunc(Exp, dl, ResultExpVT); 10880 return DAG.getMergeValues({Mant, CastExp}, dl); 10881 } 10882 10883 SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { 10884 SDLoc DL(Op); 10885 StoreSDNode *Store = cast<StoreSDNode>(Op); 10886 EVT VT = Store->getMemoryVT(); 10887 10888 if (VT == MVT::i1) { 10889 return DAG.getTruncStore(Store->getChain(), DL, 10890 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32), 10891 Store->getBasePtr(), MVT::i1, Store->getMemOperand()); 10892 } 10893 10894 assert(VT.isVector() && 10895 Store->getValue().getValueType().getScalarType() == MVT::i32); 10896 10897 unsigned AS = Store->getAddressSpace(); 10898 if (Subtarget->hasLDSMisalignedBug() && 10899 AS == AMDGPUAS::FLAT_ADDRESS && 10900 Store->getAlign().value() < VT.getStoreSize() && VT.getSizeInBits() > 32) { 10901 return SplitVectorStore(Op, DAG); 10902 } 10903 10904 MachineFunction &MF = DAG.getMachineFunction(); 10905 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 10906 // If there is a possibility that flat instruction access scratch memory 10907 // then we need to use the same legalization rules we use for private. 10908 if (AS == AMDGPUAS::FLAT_ADDRESS && 10909 !Subtarget->hasMultiDwordFlatScratchAddressing()) 10910 AS = addressMayBeAccessedAsPrivate(Store->getMemOperand(), *MFI) ? 10911 AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS; 10912 10913 unsigned NumElements = VT.getVectorNumElements(); 10914 if (AS == AMDGPUAS::GLOBAL_ADDRESS || 10915 AS == AMDGPUAS::FLAT_ADDRESS) { 10916 if (NumElements > 4) 10917 return SplitVectorStore(Op, DAG); 10918 // v3 stores not supported on SI. 
10919 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores()) 10920 return SplitVectorStore(Op, DAG); 10921 10922 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), 10923 VT, *Store->getMemOperand())) 10924 return expandUnalignedStore(Store, DAG); 10925 10926 return SDValue(); 10927 } 10928 if (AS == AMDGPUAS::PRIVATE_ADDRESS) { 10929 switch (Subtarget->getMaxPrivateElementSize()) { 10930 case 4: 10931 return scalarizeVectorStore(Store, DAG); 10932 case 8: 10933 if (NumElements > 2) 10934 return SplitVectorStore(Op, DAG); 10935 return SDValue(); 10936 case 16: 10937 if (NumElements > 4 || 10938 (NumElements == 3 && !Subtarget->enableFlatScratch())) 10939 return SplitVectorStore(Op, DAG); 10940 return SDValue(); 10941 default: 10942 llvm_unreachable("unsupported private_element_size"); 10943 } 10944 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 10945 unsigned Fast = 0; 10946 auto Flags = Store->getMemOperand()->getFlags(); 10947 if (allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS, 10948 Store->getAlign(), Flags, &Fast) && 10949 Fast > 1) 10950 return SDValue(); 10951 10952 if (VT.isVector()) 10953 return SplitVectorStore(Op, DAG); 10954 10955 return expandUnalignedStore(Store, DAG); 10956 } 10957 10958 // Probably an invalid store. If so we'll end up emitting a selection error. 10959 return SDValue(); 10960 } 10961 10962 // Avoid the full correct expansion for f32 sqrt when promoting from f16. 10963 SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const { 10964 SDLoc SL(Op); 10965 assert(!Subtarget->has16BitInsts()); 10966 SDNodeFlags Flags = Op->getFlags(); 10967 SDValue Ext = 10968 DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags); 10969 10970 SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32); 10971 SDValue Sqrt = 10972 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags); 10973 10974 return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt, 10975 DAG.getTargetConstant(0, SL, MVT::i32), Flags); 10976 } 10977 10978 SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const { 10979 SDLoc DL(Op); 10980 SDNodeFlags Flags = Op->getFlags(); 10981 MVT VT = Op.getValueType().getSimpleVT(); 10982 const SDValue X = Op.getOperand(0); 10983 10984 if (allowApproxFunc(DAG, Flags)) { 10985 // Instruction is 1ulp but ignores denormals. 
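    // With approximation allowed we can return the raw amdgcn.sqrt result
    // directly; the expansion below only exists to rescale inputs that may be
    // flushed as denormals and to refine the result.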
10986 return DAG.getNode( 10987 ISD::INTRINSIC_WO_CHAIN, DL, VT, 10988 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags); 10989 } 10990 10991 SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT); 10992 SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT); 10993 10994 SDValue ScaleUpFactor = DAG.getConstantFP(0x1.0p+32f, DL, VT); 10995 10996 SDValue ScaledX = DAG.getNode(ISD::FMUL, DL, VT, X, ScaleUpFactor, Flags); 10997 10998 SDValue SqrtX = 10999 DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledX, X, Flags); 11000 11001 SDValue SqrtS; 11002 if (needsDenormHandlingF32(DAG, X, Flags)) { 11003 SDValue SqrtID = 11004 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32); 11005 SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags); 11006 11007 SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS); 11008 SDValue SqrtSNextDownInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt, 11009 DAG.getConstant(-1, DL, MVT::i32)); 11010 SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt); 11011 11012 SDValue NegSqrtSNextDown = 11013 DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags); 11014 11015 SDValue SqrtVP = 11016 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextDown, SqrtS, SqrtX, Flags); 11017 11018 SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt, 11019 DAG.getConstant(1, DL, MVT::i32)); 11020 SDValue SqrtSNextUp = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextUpInt); 11021 11022 SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags); 11023 SDValue SqrtVS = 11024 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextUp, SqrtS, SqrtX, Flags); 11025 11026 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT); 11027 SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE); 11028 11029 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPLE0, SqrtSNextDown, SqrtS, 11030 Flags); 11031 11032 SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT); 11033 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPVSGT0, SqrtSNextUp, SqrtS, 11034 Flags); 11035 } else { 11036 SDValue SqrtR = DAG.getNode(AMDGPUISD::RSQ, DL, VT, SqrtX, Flags); 11037 11038 SqrtS = DAG.getNode(ISD::FMUL, DL, VT, SqrtX, SqrtR, Flags); 11039 11040 SDValue Half = DAG.getConstantFP(0.5f, DL, VT); 11041 SDValue SqrtH = DAG.getNode(ISD::FMUL, DL, VT, SqrtR, Half, Flags); 11042 SDValue NegSqrtH = DAG.getNode(ISD::FNEG, DL, VT, SqrtH, Flags); 11043 11044 SDValue SqrtE = DAG.getNode(ISD::FMA, DL, VT, NegSqrtH, SqrtS, Half, Flags); 11045 SqrtH = DAG.getNode(ISD::FMA, DL, VT, SqrtH, SqrtE, SqrtH, Flags); 11046 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtS, SqrtE, SqrtS, Flags); 11047 11048 SDValue NegSqrtS = DAG.getNode(ISD::FNEG, DL, VT, SqrtS, Flags); 11049 SDValue SqrtD = 11050 DAG.getNode(ISD::FMA, DL, VT, NegSqrtS, SqrtS, SqrtX, Flags); 11051 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtD, SqrtH, SqrtS, Flags); 11052 } 11053 11054 SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT); 11055 11056 SDValue ScaledDown = 11057 DAG.getNode(ISD::FMUL, DL, VT, SqrtS, ScaleDownFactor, Flags); 11058 11059 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledDown, SqrtS, Flags); 11060 SDValue IsZeroOrInf = 11061 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX, 11062 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32)); 11063 11064 return DAG.getNode(ISD::SELECT, DL, VT, IsZeroOrInf, SqrtX, SqrtS, Flags); 11065 } 11066 11067 SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, 
SelectionDAG &DAG) const { 11068 // For double type, the SQRT and RSQ instructions don't have required 11069 // precision, we apply Goldschmidt's algorithm to improve the result: 11070 // 11071 // y0 = rsq(x) 11072 // g0 = x * y0 11073 // h0 = 0.5 * y0 11074 // 11075 // r0 = 0.5 - h0 * g0 11076 // g1 = g0 * r0 + g0 11077 // h1 = h0 * r0 + h0 11078 // 11079 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1 11080 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1 11081 // h2 = h1 * r1 + h1 11082 // 11083 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2 11084 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2 11085 // 11086 // sqrt(x) = g3 11087 11088 SDNodeFlags Flags = Op->getFlags(); 11089 11090 SDLoc DL(Op); 11091 11092 SDValue X = Op.getOperand(0); 11093 SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64); 11094 11095 SDValue Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT); 11096 11097 SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32); 11098 11099 // Scale up input if it is too small. 11100 SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32); 11101 SDValue ScaleUp = 11102 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt); 11103 SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags); 11104 11105 SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX); 11106 11107 SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY); 11108 11109 SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64); 11110 SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half); 11111 11112 SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0); 11113 SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half); 11114 11115 SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0); 11116 11117 SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0); 11118 11119 SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1); 11120 SDValue SqrtD0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX); 11121 11122 SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1); 11123 11124 SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2); 11125 SDValue SqrtD1 = 11126 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX); 11127 11128 SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2); 11129 11130 SDValue ScaleDownFactor = DAG.getConstant(-128, DL, MVT::i32); 11131 SDValue ScaleDown = 11132 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt); 11133 SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags); 11134 11135 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check 11136 // with finite only or nsz because rsq(+/-0) = +/-inf 11137 11138 // TODO: Check for DAZ and expand to subnormals 11139 SDValue IsZeroOrInf = 11140 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX, 11141 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32)); 11142 11143 // If x is +INF, +0, or -0, use its original value 11144 return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet, 11145 Flags); 11146 } 11147 11148 SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { 11149 SDLoc DL(Op); 11150 EVT VT = Op.getValueType(); 11151 SDValue Arg = Op.getOperand(0); 11152 SDValue TrigVal; 11153 11154 // Propagate fast-math flags so that the multiply we introduce can be folded 11155 // if Arg is already the result of a multiply by constant. 
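  // The multiply in question is the 1/(2*pi) scaling below; in effect
  //   sin(x) ~> sin_hw(fract(x * 1/(2*pi)))  when only a reduced input range
  //                                          is supported, and
  //   sin(x) ~> sin_hw(x * 1/(2*pi))         otherwise,
  // since the hardware sin/cos interpret their operand as a fraction of a
  // full period.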
11156 auto Flags = Op->getFlags(); 11157 11158 SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT); 11159 11160 if (Subtarget->hasTrigReducedRange()) { 11161 SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags); 11162 TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags); 11163 } else { 11164 TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags); 11165 } 11166 11167 switch (Op.getOpcode()) { 11168 case ISD::FCOS: 11169 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags); 11170 case ISD::FSIN: 11171 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags); 11172 default: 11173 llvm_unreachable("Wrong trig opcode"); 11174 } 11175 } 11176 11177 SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const { 11178 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op); 11179 assert(AtomicNode->isCompareAndSwap()); 11180 unsigned AS = AtomicNode->getAddressSpace(); 11181 11182 // No custom lowering required for local address space 11183 if (!AMDGPU::isFlatGlobalAddrSpace(AS)) 11184 return Op; 11185 11186 // Non-local address space requires custom lowering for atomic compare 11187 // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2 11188 SDLoc DL(Op); 11189 SDValue ChainIn = Op.getOperand(0); 11190 SDValue Addr = Op.getOperand(1); 11191 SDValue Old = Op.getOperand(2); 11192 SDValue New = Op.getOperand(3); 11193 EVT VT = Op.getValueType(); 11194 MVT SimpleVT = VT.getSimpleVT(); 11195 MVT VecType = MVT::getVectorVT(SimpleVT, 2); 11196 11197 SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old}); 11198 SDValue Ops[] = { ChainIn, Addr, NewOld }; 11199 11200 return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL, Op->getVTList(), 11201 Ops, VT, AtomicNode->getMemOperand()); 11202 } 11203 11204 //===----------------------------------------------------------------------===// 11205 // Custom DAG optimizations 11206 //===----------------------------------------------------------------------===// 11207 11208 SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N, 11209 DAGCombinerInfo &DCI) const { 11210 EVT VT = N->getValueType(0); 11211 EVT ScalarVT = VT.getScalarType(); 11212 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16) 11213 return SDValue(); 11214 11215 SelectionDAG &DAG = DCI.DAG; 11216 SDLoc DL(N); 11217 11218 SDValue Src = N->getOperand(0); 11219 EVT SrcVT = Src.getValueType(); 11220 11221 // TODO: We could try to match extracting the higher bytes, which would be 11222 // easier if i8 vectors weren't promoted to i32 vectors, particularly after 11223 // types are legalized. v4i8 -> v4f32 is probably the only case to worry 11224 // about in practice. 11225 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) { 11226 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) { 11227 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src); 11228 DCI.AddToWorklist(Cvt.getNode()); 11229 11230 // For the f16 case, fold to a cast to f32 and then cast back to f16. 
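      // This round trip is lossless: every value a u8 can take is exactly
      // representable in f16, so the f32->f16 round cannot change the result.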
11231 if (ScalarVT != MVT::f32) { 11232 Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt, 11233 DAG.getTargetConstant(0, DL, MVT::i32)); 11234 } 11235 return Cvt; 11236 } 11237 } 11238 11239 return SDValue(); 11240 } 11241 11242 SDValue SITargetLowering::performFCopySignCombine(SDNode *N, 11243 DAGCombinerInfo &DCI) const { 11244 SDValue MagnitudeOp = N->getOperand(0); 11245 SDValue SignOp = N->getOperand(1); 11246 SelectionDAG &DAG = DCI.DAG; 11247 SDLoc DL(N); 11248 11249 // f64 fcopysign is really an f32 copysign on the high bits, so replace the 11250 // lower half with a copy. 11251 // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y) 11252 if (MagnitudeOp.getValueType() == MVT::f64) { 11253 SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, MagnitudeOp); 11254 SDValue MagLo = 11255 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector, 11256 DAG.getConstant(0, DL, MVT::i32)); 11257 SDValue MagHi = 11258 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector, 11259 DAG.getConstant(1, DL, MVT::i32)); 11260 11261 SDValue HiOp = 11262 DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOp); 11263 11264 SDValue Vector = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp); 11265 11266 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector); 11267 } 11268 11269 if (SignOp.getValueType() != MVT::f64) 11270 return SDValue(); 11271 11272 // Reduce width of sign operand, we only need the highest bit. 11273 // 11274 // fcopysign f64:x, f64:y -> 11275 // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1) 11276 // TODO: In some cases it might make sense to go all the way to f16. 11277 SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, SignOp); 11278 SDValue SignAsF32 = 11279 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector, 11280 DAG.getConstant(1, DL, MVT::i32)); 11281 11282 return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0), 11283 SignAsF32); 11284 } 11285 11286 // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2) 11287 // (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no 11288 // bits 11289 11290 // This is a variant of 11291 // (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2), 11292 // 11293 // The normal DAG combiner will do this, but only if the add has one use since 11294 // that would increase the number of instructions. 11295 // 11296 // This prevents us from seeing a constant offset that can be folded into a 11297 // memory instruction's addressing mode. If we know the resulting add offset of 11298 // a pointer can be folded into an addressing offset, we can replace the pointer 11299 // operand with the add of new constant offset. This eliminates one of the uses, 11300 // and may allow the remaining use to also be simplified. 11301 // 11302 SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, 11303 unsigned AddrSpace, 11304 EVT MemVT, 11305 DAGCombinerInfo &DCI) const { 11306 SDValue N0 = N->getOperand(0); 11307 SDValue N1 = N->getOperand(1); 11308 11309 // We only do this to handle cases where it's profitable when there are 11310 // multiple uses of the add, so defer to the standard combine. 
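  // For example, with a pointer computed as (shl (add x, 16), 2), rewriting
  // to (add (shl x, 2), 64) lets the 64 fold into the memory instruction's
  // immediate offset (subject to the legality check below), even though the
  // add has other users.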
11311 if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) || 11312 N0->hasOneUse()) 11313 return SDValue(); 11314 11315 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1); 11316 if (!CN1) 11317 return SDValue(); 11318 11319 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1)); 11320 if (!CAdd) 11321 return SDValue(); 11322 11323 SelectionDAG &DAG = DCI.DAG; 11324 11325 if (N0->getOpcode() == ISD::OR && 11326 !DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1))) 11327 return SDValue(); 11328 11329 // If the resulting offset is too large, we can't fold it into the 11330 // addressing mode offset. 11331 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue(); 11332 Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext()); 11333 11334 AddrMode AM; 11335 AM.HasBaseReg = true; 11336 AM.BaseOffs = Offset.getSExtValue(); 11337 if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace)) 11338 return SDValue(); 11339 11340 SDLoc SL(N); 11341 EVT VT = N->getValueType(0); 11342 11343 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1); 11344 SDValue COffset = DAG.getConstant(Offset, SL, VT); 11345 11346 SDNodeFlags Flags; 11347 Flags.setNoUnsignedWrap(N->getFlags().hasNoUnsignedWrap() && 11348 (N0.getOpcode() == ISD::OR || 11349 N0->getFlags().hasNoUnsignedWrap())); 11350 11351 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags); 11352 } 11353 11354 /// MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset 11355 /// by the chain and intrinsic ID. Theoretically we would also need to check the 11356 /// specific intrinsic, but they all place the pointer operand first. 11357 static unsigned getBasePtrIndex(const MemSDNode *N) { 11358 switch (N->getOpcode()) { 11359 case ISD::STORE: 11360 case ISD::INTRINSIC_W_CHAIN: 11361 case ISD::INTRINSIC_VOID: 11362 return 2; 11363 default: 11364 return 1; 11365 } 11366 } 11367 11368 SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N, 11369 DAGCombinerInfo &DCI) const { 11370 SelectionDAG &DAG = DCI.DAG; 11371 SDLoc SL(N); 11372 11373 unsigned PtrIdx = getBasePtrIndex(N); 11374 SDValue Ptr = N->getOperand(PtrIdx); 11375 11376 // TODO: We could also do this for multiplies. 11377 if (Ptr.getOpcode() == ISD::SHL) { 11378 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(), 11379 N->getMemoryVT(), DCI); 11380 if (NewPtr) { 11381 SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end()); 11382 11383 NewOps[PtrIdx] = NewPtr; 11384 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); 11385 } 11386 } 11387 11388 return SDValue(); 11389 } 11390 11391 static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) { 11392 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) || 11393 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) || 11394 (Opc == ISD::XOR && Val == 0); 11395 } 11396 11397 // Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This 11398 // will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit 11399 // integer combine opportunities since most 64-bit operations are decomposed 11400 // this way. TODO: We won't want this for SALU especially if it is an inline 11401 // immediate. 
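// For example, (and i64:x, 0x00000000ffffffff) is split into an and of the
// low half with -1 and an and of the high half with 0, both of which can then
// fold away, instead of materializing the full 64-bit mask. The actual
// splitting is done by splitBinaryBitConstantOpImpl.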
11402 SDValue SITargetLowering::splitBinaryBitConstantOp( 11403 DAGCombinerInfo &DCI, 11404 const SDLoc &SL, 11405 unsigned Opc, SDValue LHS, 11406 const ConstantSDNode *CRHS) const { 11407 uint64_t Val = CRHS->getZExtValue(); 11408 uint32_t ValLo = Lo_32(Val); 11409 uint32_t ValHi = Hi_32(Val); 11410 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 11411 11412 if ((bitOpWithConstantIsReducible(Opc, ValLo) || 11413 bitOpWithConstantIsReducible(Opc, ValHi)) || 11414 (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) { 11415 // If we need to materialize a 64-bit immediate, it will be split up later 11416 // anyway. Avoid creating the harder to understand 64-bit immediate 11417 // materialization. 11418 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi); 11419 } 11420 11421 return SDValue(); 11422 } 11423 11424 bool llvm::isBoolSGPR(SDValue V) { 11425 if (V.getValueType() != MVT::i1) 11426 return false; 11427 switch (V.getOpcode()) { 11428 default: 11429 break; 11430 case ISD::SETCC: 11431 case AMDGPUISD::FP_CLASS: 11432 return true; 11433 case ISD::AND: 11434 case ISD::OR: 11435 case ISD::XOR: 11436 return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1)); 11437 } 11438 return false; 11439 } 11440 11441 // If a constant has all zeroes or all ones within each byte return it. 11442 // Otherwise return 0. 11443 static uint32_t getConstantPermuteMask(uint32_t C) { 11444 // 0xff for any zero byte in the mask 11445 uint32_t ZeroByteMask = 0; 11446 if (!(C & 0x000000ff)) ZeroByteMask |= 0x000000ff; 11447 if (!(C & 0x0000ff00)) ZeroByteMask |= 0x0000ff00; 11448 if (!(C & 0x00ff0000)) ZeroByteMask |= 0x00ff0000; 11449 if (!(C & 0xff000000)) ZeroByteMask |= 0xff000000; 11450 uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte 11451 if ((NonZeroByteMask & C) != NonZeroByteMask) 11452 return 0; // Partial bytes selected. 11453 return C; 11454 } 11455 11456 // Check if a node selects whole bytes from its operand 0 starting at a byte 11457 // boundary while masking the rest. Returns select mask as in the v_perm_b32 11458 // or -1 if not succeeded. 11459 // Note byte select encoding: 11460 // value 0-3 selects corresponding source byte; 11461 // value 0xc selects zero; 11462 // value 0xff selects 0xff. 
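// Some worked examples of the masks produced below (illustrative values):
//   (and x, 0x0000ffff) -> 0x0c0c0100 ; bytes 0-1 from x, bytes 2-3 are zero
//   (shl x, 16)         -> 0x01000c0c ; bytes 2-3 from x's bytes 0-1
//   (srl x, 8)          -> 0x0c030201 ; bytes 0-2 from x's bytes 1-3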
11463 static uint32_t getPermuteMask(SDValue V) { 11464 assert(V.getValueSizeInBits() == 32); 11465 11466 if (V.getNumOperands() != 2) 11467 return ~0; 11468 11469 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1)); 11470 if (!N1) 11471 return ~0; 11472 11473 uint32_t C = N1->getZExtValue(); 11474 11475 switch (V.getOpcode()) { 11476 default: 11477 break; 11478 case ISD::AND: 11479 if (uint32_t ConstMask = getConstantPermuteMask(C)) 11480 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask); 11481 break; 11482 11483 case ISD::OR: 11484 if (uint32_t ConstMask = getConstantPermuteMask(C)) 11485 return (0x03020100 & ~ConstMask) | ConstMask; 11486 break; 11487 11488 case ISD::SHL: 11489 if (C % 8) 11490 return ~0; 11491 11492 return uint32_t((0x030201000c0c0c0cull << C) >> 32); 11493 11494 case ISD::SRL: 11495 if (C % 8) 11496 return ~0; 11497 11498 return uint32_t(0x0c0c0c0c03020100ull >> C); 11499 } 11500 11501 return ~0; 11502 } 11503 11504 SDValue SITargetLowering::performAndCombine(SDNode *N, 11505 DAGCombinerInfo &DCI) const { 11506 if (DCI.isBeforeLegalize()) 11507 return SDValue(); 11508 11509 SelectionDAG &DAG = DCI.DAG; 11510 EVT VT = N->getValueType(0); 11511 SDValue LHS = N->getOperand(0); 11512 SDValue RHS = N->getOperand(1); 11513 11514 11515 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS); 11516 if (VT == MVT::i64 && CRHS) { 11517 if (SDValue Split 11518 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS)) 11519 return Split; 11520 } 11521 11522 if (CRHS && VT == MVT::i32) { 11523 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb 11524 // nb = number of trailing zeroes in mask 11525 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass, 11526 // given that we are selecting 8 or 16 bit fields starting at byte boundary. 11527 uint64_t Mask = CRHS->getZExtValue(); 11528 unsigned Bits = llvm::popcount(Mask); 11529 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL && 11530 (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) { 11531 if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) { 11532 unsigned Shift = CShift->getZExtValue(); 11533 unsigned NB = CRHS->getAPIntValue().countr_zero(); 11534 unsigned Offset = NB + Shift; 11535 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary. 
11536 SDLoc SL(N); 11537 SDValue BFE = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, 11538 LHS->getOperand(0), 11539 DAG.getConstant(Offset, SL, MVT::i32), 11540 DAG.getConstant(Bits, SL, MVT::i32)); 11541 EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits); 11542 SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE, 11543 DAG.getValueType(NarrowVT)); 11544 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext, 11545 DAG.getConstant(NB, SDLoc(CRHS), MVT::i32)); 11546 return Shl; 11547 } 11548 } 11549 } 11550 11551 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2) 11552 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM && 11553 isa<ConstantSDNode>(LHS.getOperand(2))) { 11554 uint32_t Sel = getConstantPermuteMask(Mask); 11555 if (!Sel) 11556 return SDValue(); 11557 11558 // Select 0xc for all zero bytes 11559 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c); 11560 SDLoc DL(N); 11561 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0), 11562 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32)); 11563 } 11564 } 11565 11566 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) -> 11567 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity) 11568 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) { 11569 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get(); 11570 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get(); 11571 11572 SDValue X = LHS.getOperand(0); 11573 SDValue Y = RHS.getOperand(0); 11574 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X || 11575 !isTypeLegal(X.getValueType())) 11576 return SDValue(); 11577 11578 if (LCC == ISD::SETO) { 11579 if (X != LHS.getOperand(1)) 11580 return SDValue(); 11581 11582 if (RCC == ISD::SETUNE) { 11583 const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1)); 11584 if (!C1 || !C1->isInfinity() || C1->isNegative()) 11585 return SDValue(); 11586 11587 const uint32_t Mask = SIInstrFlags::N_NORMAL | 11588 SIInstrFlags::N_SUBNORMAL | 11589 SIInstrFlags::N_ZERO | 11590 SIInstrFlags::P_ZERO | 11591 SIInstrFlags::P_SUBNORMAL | 11592 SIInstrFlags::P_NORMAL; 11593 11594 static_assert(((~(SIInstrFlags::S_NAN | 11595 SIInstrFlags::Q_NAN | 11596 SIInstrFlags::N_INFINITY | 11597 SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask, 11598 "mask not equal"); 11599 11600 SDLoc DL(N); 11601 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, 11602 X, DAG.getConstant(Mask, DL, MVT::i32)); 11603 } 11604 } 11605 } 11606 11607 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS) 11608 std::swap(LHS, RHS); 11609 11610 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS && 11611 RHS.hasOneUse()) { 11612 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get(); 11613 // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan | n_nan) 11614 // and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan | n_nan) 11615 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1)); 11616 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask && 11617 (RHS.getOperand(0) == LHS.getOperand(0) && 11618 LHS.getOperand(0) == LHS.getOperand(1))) { 11619 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN; 11620 unsigned NewMask = LCC == ISD::SETO ? 
11621 Mask->getZExtValue() & ~OrdMask : 11622 Mask->getZExtValue() & OrdMask; 11623 11624 SDLoc DL(N); 11625 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0), 11626 DAG.getConstant(NewMask, DL, MVT::i32)); 11627 } 11628 } 11629 11630 if (VT == MVT::i32 && 11631 (RHS.getOpcode() == ISD::SIGN_EXTEND || LHS.getOpcode() == ISD::SIGN_EXTEND)) { 11632 // and x, (sext cc from i1) => select cc, x, 0 11633 if (RHS.getOpcode() != ISD::SIGN_EXTEND) 11634 std::swap(LHS, RHS); 11635 if (isBoolSGPR(RHS.getOperand(0))) 11636 return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0), 11637 LHS, DAG.getConstant(0, SDLoc(N), MVT::i32)); 11638 } 11639 11640 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2) 11641 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 11642 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() && 11643 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) { 11644 uint32_t LHSMask = getPermuteMask(LHS); 11645 uint32_t RHSMask = getPermuteMask(RHS); 11646 if (LHSMask != ~0u && RHSMask != ~0u) { 11647 // Canonicalize the expression in an attempt to have fewer unique masks 11648 // and therefore fewer registers used to hold the masks. 11649 if (LHSMask > RHSMask) { 11650 std::swap(LHSMask, RHSMask); 11651 std::swap(LHS, RHS); 11652 } 11653 11654 // Select 0xc for each lane used from source operand. Zero has 0xc mask 11655 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range. 11656 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c; 11657 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c; 11658 11659 // Check of we need to combine values from two sources within a byte. 11660 if (!(LHSUsedLanes & RHSUsedLanes) && 11661 // If we select high and lower word keep it for SDWA. 11662 // TODO: teach SDWA to work with v_perm_b32 and remove the check. 11663 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) { 11664 // Each byte in each mask is either selector mask 0-3, or has higher 11665 // bits set in either of masks, which can be 0xff for 0xff or 0x0c for 11666 // zero. If 0x0c is in either mask it shall always be 0x0c. Otherwise 11667 // mask which is not 0xff wins. By anding both masks we have a correct 11668 // result except that 0x0c shall be corrected to give 0x0c only. 11669 uint32_t Mask = LHSMask & RHSMask; 11670 for (unsigned I = 0; I < 32; I += 8) { 11671 uint32_t ByteSel = 0xff << I; 11672 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c) 11673 Mask &= (0x0c << I) & 0xffffffff; 11674 } 11675 11676 // Add 4 to each active LHS lane. It will not affect any existing 0xff 11677 // or 0x0c. 11678 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404); 11679 SDLoc DL(N); 11680 11681 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, 11682 LHS.getOperand(0), RHS.getOperand(0), 11683 DAG.getConstant(Sel, DL, MVT::i32)); 11684 } 11685 } 11686 } 11687 11688 return SDValue(); 11689 } 11690 11691 // A key component of v_perm is a mapping between byte position of the src 11692 // operands, and the byte position of the dest. To provide such, we need: 1. the 11693 // node that provides x byte of the dest of the OR, and 2. the byte of the node 11694 // used to provide that x byte. 
calculateByteProvider finds which node provides 11695 // a certain byte of the dest of the OR, and calculateSrcByte takes that node, 11696 // and finds an ultimate src and byte position For example: The supported 11697 // LoadCombine pattern for vector loads is as follows 11698 // t1 11699 // or 11700 // / \ 11701 // t2 t3 11702 // zext shl 11703 // | | \ 11704 // t4 t5 16 11705 // or anyext 11706 // / \ | 11707 // t6 t7 t8 11708 // srl shl or 11709 // / | / \ / \ 11710 // t9 t10 t11 t12 t13 t14 11711 // trunc* 8 trunc* 8 and and 11712 // | | / | | \ 11713 // t15 t16 t17 t18 t19 t20 11714 // trunc* 255 srl -256 11715 // | / \ 11716 // t15 t15 16 11717 // 11718 // *In this example, the truncs are from i32->i16 11719 // 11720 // calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3 11721 // respectively. calculateSrcByte would find (given node) -> ultimate src & 11722 // byteposition: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3. 11723 // After finding the mapping, we can combine the tree into vperm t15, t16, 11724 // 0x05000407 11725 11726 // Find the source and byte position from a node. 11727 // \p DestByte is the byte position of the dest of the or that the src 11728 // ultimately provides. \p SrcIndex is the byte of the src that maps to this 11729 // dest of the or byte. \p Depth tracks how many recursive iterations we have 11730 // performed. 11731 static const std::optional<ByteProvider<SDValue>> 11732 calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0, 11733 unsigned Depth = 0) { 11734 // We may need to recursively traverse a series of SRLs 11735 if (Depth >= 6) 11736 return std::nullopt; 11737 11738 if (Op.getValueSizeInBits() < 8) 11739 return std::nullopt; 11740 11741 if (Op.getValueType().isVector()) 11742 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex); 11743 11744 switch (Op->getOpcode()) { 11745 case ISD::TRUNCATE: { 11746 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1); 11747 } 11748 11749 case ISD::SIGN_EXTEND: 11750 case ISD::ZERO_EXTEND: 11751 case ISD::SIGN_EXTEND_INREG: { 11752 SDValue NarrowOp = Op->getOperand(0); 11753 auto NarrowVT = NarrowOp.getValueType(); 11754 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) { 11755 auto *VTSign = cast<VTSDNode>(Op->getOperand(1)); 11756 NarrowVT = VTSign->getVT(); 11757 } 11758 if (!NarrowVT.isByteSized()) 11759 return std::nullopt; 11760 uint64_t NarrowByteWidth = NarrowVT.getStoreSize(); 11761 11762 if (SrcIndex >= NarrowByteWidth) 11763 return std::nullopt; 11764 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1); 11765 } 11766 11767 case ISD::SRA: 11768 case ISD::SRL: { 11769 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1)); 11770 if (!ShiftOp) 11771 return std::nullopt; 11772 11773 uint64_t BitShift = ShiftOp->getZExtValue(); 11774 11775 if (BitShift % 8 != 0) 11776 return std::nullopt; 11777 11778 SrcIndex += BitShift / 8; 11779 11780 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1); 11781 } 11782 11783 default: { 11784 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex); 11785 } 11786 } 11787 llvm_unreachable("fully handled switch"); 11788 } 11789 11790 // For a byte position in the result of an Or, traverse the tree and find the 11791 // node (and the byte of the node) which ultimately provides this {Or, 11792 // BytePosition}. \p Op is the operand we are currently examining. 
\p Index is 11793 // the byte position of the Op that corresponds with the originally requested 11794 // byte of the Or \p Depth tracks how many recursive iterations we have 11795 // performed. \p StartingIndex is the originally requested byte of the Or 11796 static const std::optional<ByteProvider<SDValue>> 11797 calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth, 11798 unsigned StartingIndex = 0) { 11799 // Finding Src tree of RHS of or typically requires at least 1 additional 11800 // depth 11801 if (Depth > 6) 11802 return std::nullopt; 11803 11804 unsigned BitWidth = Op.getScalarValueSizeInBits(); 11805 if (BitWidth % 8 != 0) 11806 return std::nullopt; 11807 if (Index > BitWidth / 8 - 1) 11808 return std::nullopt; 11809 11810 bool IsVec = Op.getValueType().isVector(); 11811 switch (Op.getOpcode()) { 11812 case ISD::OR: { 11813 if (IsVec) 11814 return std::nullopt; 11815 11816 auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1, 11817 StartingIndex); 11818 if (!RHS) 11819 return std::nullopt; 11820 auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1, 11821 StartingIndex); 11822 if (!LHS) 11823 return std::nullopt; 11824 // A well formed Or will have two ByteProviders for each byte, one of which 11825 // is constant zero 11826 if (!LHS->isConstantZero() && !RHS->isConstantZero()) 11827 return std::nullopt; 11828 if (!LHS || LHS->isConstantZero()) 11829 return RHS; 11830 if (!RHS || RHS->isConstantZero()) 11831 return LHS; 11832 return std::nullopt; 11833 } 11834 11835 case ISD::AND: { 11836 if (IsVec) 11837 return std::nullopt; 11838 11839 auto BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1)); 11840 if (!BitMaskOp) 11841 return std::nullopt; 11842 11843 uint32_t BitMask = BitMaskOp->getZExtValue(); 11844 // Bits we expect for our StartingIndex 11845 uint32_t IndexMask = 0xFF << (Index * 8); 11846 11847 if ((IndexMask & BitMask) != IndexMask) { 11848 // If the result of the and partially provides the byte, then it 11849 // is not well formatted 11850 if (IndexMask & BitMask) 11851 return std::nullopt; 11852 return ByteProvider<SDValue>::getConstantZero(); 11853 } 11854 11855 return calculateSrcByte(Op->getOperand(0), StartingIndex, Index); 11856 } 11857 11858 case ISD::FSHR: { 11859 if (IsVec) 11860 return std::nullopt; 11861 11862 // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW)) 11863 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2)); 11864 if (!ShiftOp || Op.getValueType().isVector()) 11865 return std::nullopt; 11866 11867 uint64_t BitsProvided = Op.getValueSizeInBits(); 11868 if (BitsProvided % 8 != 0) 11869 return std::nullopt; 11870 11871 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided); 11872 if (BitShift % 8) 11873 return std::nullopt; 11874 11875 uint64_t ConcatSizeInBytes = BitsProvided / 4; 11876 uint64_t ByteShift = BitShift / 8; 11877 11878 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes; 11879 uint64_t BytesProvided = BitsProvided / 8; 11880 SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 
0 : 1); 11881 NewIndex %= BytesProvided; 11882 return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex); 11883 } 11884 11885 case ISD::SRA: 11886 case ISD::SRL: { 11887 if (IsVec) 11888 return std::nullopt; 11889 11890 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1)); 11891 if (!ShiftOp) 11892 return std::nullopt; 11893 11894 uint64_t BitShift = ShiftOp->getZExtValue(); 11895 if (BitShift % 8) 11896 return std::nullopt; 11897 11898 auto BitsProvided = Op.getScalarValueSizeInBits(); 11899 if (BitsProvided % 8 != 0) 11900 return std::nullopt; 11901 11902 uint64_t BytesProvided = BitsProvided / 8; 11903 uint64_t ByteShift = BitShift / 8; 11904 // The dest of shift will have good [0 : (BytesProvided - ByteShift)] bytes. 11905 // If the byte we are trying to provide (as tracked by index) falls in this 11906 // range, then the SRL provides the byte. The byte of interest of the src of 11907 // the SRL is Index + ByteShift. 11908 return BytesProvided - ByteShift > Index 11909 ? calculateSrcByte(Op->getOperand(0), StartingIndex, 11910 Index + ByteShift) 11911 : ByteProvider<SDValue>::getConstantZero(); 11912 } 11913 11914 case ISD::SHL: { 11915 if (IsVec) 11916 return std::nullopt; 11917 11918 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1)); 11919 if (!ShiftOp) 11920 return std::nullopt; 11921 11922 uint64_t BitShift = ShiftOp->getZExtValue(); 11923 if (BitShift % 8 != 0) 11924 return std::nullopt; 11925 uint64_t ByteShift = BitShift / 8; 11926 11927 // If we are shifting by an amount greater than (or equal to) 11928 // the index we are trying to provide, then it provides 0s. If not, 11929 // then these bytes are not definitively 0s, and the corresponding byte 11930 // of interest is Index - ByteShift of the src. 11931 return Index < ByteShift 11932 ? ByteProvider<SDValue>::getConstantZero() 11933 : calculateByteProvider(Op.getOperand(0), Index - ByteShift, 11934 Depth + 1, StartingIndex); 11935 } 11936 case ISD::ANY_EXTEND: 11937 case ISD::SIGN_EXTEND: 11938 case ISD::ZERO_EXTEND: 11939 case ISD::SIGN_EXTEND_INREG: 11940 case ISD::AssertZext: 11941 case ISD::AssertSext: { 11942 if (IsVec) 11943 return std::nullopt; 11944 11945 SDValue NarrowOp = Op->getOperand(0); 11946 unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits(); 11947 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG || 11948 Op->getOpcode() == ISD::AssertZext || 11949 Op->getOpcode() == ISD::AssertSext) { 11950 auto *VTSign = cast<VTSDNode>(Op->getOperand(1)); 11951 NarrowBitWidth = VTSign->getVT().getSizeInBits(); 11952 } 11953 if (NarrowBitWidth % 8 != 0) 11954 return std::nullopt; 11955 uint64_t NarrowByteWidth = NarrowBitWidth / 8; 11956 11957 if (Index >= NarrowByteWidth) 11958 return Op.getOpcode() == ISD::ZERO_EXTEND ?
std::optional<ByteProvider<SDValue>>( 11960 ByteProvider<SDValue>::getConstantZero()) 11961 : std::nullopt; 11962 return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex); 11963 } 11964 11965 case ISD::TRUNCATE: { 11966 if (IsVec) 11967 return std::nullopt; 11968 11969 uint64_t NarrowByteWidth = BitWidth / 8; 11970 11971 if (NarrowByteWidth >= Index) { 11972 return calculateByteProvider(Op.getOperand(0), Index, Depth + 1, 11973 StartingIndex); 11974 } 11975 11976 return std::nullopt; 11977 } 11978 11979 case ISD::CopyFromReg: { 11980 if (BitWidth / 8 > Index) 11981 return calculateSrcByte(Op, StartingIndex, Index); 11982 11983 return std::nullopt; 11984 } 11985 11986 case ISD::LOAD: { 11987 auto L = cast<LoadSDNode>(Op.getNode()); 11988 11989 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits(); 11990 if (NarrowBitWidth % 8 != 0) 11991 return std::nullopt; 11992 uint64_t NarrowByteWidth = NarrowBitWidth / 8; 11993 11994 // If the width of the load does not reach byte we are trying to provide for 11995 // and it is not a ZEXTLOAD, then the load does not provide for the byte in 11996 // question 11997 if (Index >= NarrowByteWidth) { 11998 return L->getExtensionType() == ISD::ZEXTLOAD 11999 ? std::optional<ByteProvider<SDValue>>( 12000 ByteProvider<SDValue>::getConstantZero()) 12001 : std::nullopt; 12002 } 12003 12004 if (NarrowByteWidth > Index) { 12005 return calculateSrcByte(Op, StartingIndex, Index); 12006 } 12007 12008 return std::nullopt; 12009 } 12010 12011 case ISD::BSWAP: { 12012 if (IsVec) 12013 return std::nullopt; 12014 12015 return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1, 12016 Depth + 1, StartingIndex); 12017 } 12018 12019 case ISD::EXTRACT_VECTOR_ELT: { 12020 auto IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1)); 12021 if (!IdxOp) 12022 return std::nullopt; 12023 auto VecIdx = IdxOp->getZExtValue(); 12024 auto ScalarSize = Op.getScalarValueSizeInBits(); 12025 if (ScalarSize < 32) 12026 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index; 12027 return calculateSrcByte(ScalarSize >= 32 ? Op : Op.getOperand(0), 12028 StartingIndex, Index); 12029 } 12030 12031 case AMDGPUISD::PERM: { 12032 if (IsVec) 12033 return std::nullopt; 12034 12035 auto PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2)); 12036 if (!PermMask) 12037 return std::nullopt; 12038 12039 auto IdxMask = 12040 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8); 12041 if (IdxMask > 0x07 && IdxMask != 0x0c) 12042 return std::nullopt; 12043 12044 auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1); 12045 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask; 12046 12047 return IdxMask != 0x0c ? 
calculateSrcByte(NextOp, StartingIndex, NextIndex) 12048 : ByteProvider<SDValue>( 12049 ByteProvider<SDValue>::getConstantZero()); 12050 } 12051 12052 default: { 12053 return std::nullopt; 12054 } 12055 } 12056 12057 llvm_unreachable("fully handled switch"); 12058 } 12059 12060 // Returns true if the Operand is a scalar and is 16 bits 12061 static bool isExtendedFrom16Bits(SDValue &Operand) { 12062 12063 switch (Operand.getOpcode()) { 12064 case ISD::ANY_EXTEND: 12065 case ISD::SIGN_EXTEND: 12066 case ISD::ZERO_EXTEND: { 12067 auto OpVT = Operand.getOperand(0).getValueType(); 12068 return !OpVT.isVector() && OpVT.getSizeInBits() == 16; 12069 } 12070 case ISD::LOAD: { 12071 LoadSDNode *L = cast<LoadSDNode>(Operand.getNode()); 12072 auto ExtType = cast<LoadSDNode>(L)->getExtensionType(); 12073 if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD || 12074 ExtType == ISD::EXTLOAD) { 12075 auto MemVT = L->getMemoryVT(); 12076 return !MemVT.isVector() && MemVT.getSizeInBits() == 16; 12077 } 12078 return L->getMemoryVT().getSizeInBits() == 16; 12079 } 12080 default: 12081 return false; 12082 } 12083 } 12084 12085 // Returns true if the mask matches consecutive bytes, and the first byte 12086 // begins at a power of 2 byte offset from 0th byte 12087 static bool addresses16Bits(int Mask) { 12088 int Low8 = Mask & 0xff; 12089 int Hi8 = (Mask & 0xff00) >> 8; 12090 12091 assert(Low8 < 8 && Hi8 < 8); 12092 // Are the bytes contiguous in the order of increasing addresses. 12093 bool IsConsecutive = (Hi8 - Low8 == 1); 12094 // Is the first byte at location that is aligned for 16 bit instructions. 12095 // A counter example is taking 2 consecutive bytes starting at the 8th bit. 12096 // In this case, we still need code to extract the 16 bit operand, so it 12097 // is better to use i8 v_perm 12098 bool Is16Aligned = !(Low8 % 2); 12099 12100 return IsConsecutive && Is16Aligned; 12101 } 12102 12103 // Do not lower into v_perm if the operands are actually 16 bit 12104 // and the selected bits (based on PermMask) correspond with two 12105 // easily addressable 16 bit operands. 
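// For example, a PermMask of 0x05040100 with two 16-bit operands just selects
// the low half of each source; that is better handled as 16-bit ops / SDWA
// than as a v_perm, so this returns false for it.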
12106 static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, 12107 SDValue &OtherOp) { 12108 int Low16 = PermMask & 0xffff; 12109 int Hi16 = (PermMask & 0xffff0000) >> 16; 12110 12111 auto TempOp = peekThroughBitcasts(Op); 12112 auto TempOtherOp = peekThroughBitcasts(OtherOp); 12113 12114 auto OpIs16Bit = 12115 TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp); 12116 if (!OpIs16Bit) 12117 return true; 12118 12119 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 || 12120 isExtendedFrom16Bits(TempOtherOp); 12121 if (!OtherOpIs16Bit) 12122 return true; 12123 12124 // Do we cleanly address both 12125 return !addresses16Bits(Low16) || !addresses16Bits(Hi16); 12126 } 12127 12128 static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, 12129 unsigned DWordOffset) { 12130 SDValue Ret; 12131 12132 auto TypeSize = Src.getValueSizeInBits().getFixedValue(); 12133 // ByteProvider must be at least 8 bits 12134 assert(Src.getValueSizeInBits().isKnownMultipleOf(8)); 12135 12136 if (TypeSize <= 32) 12137 return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32); 12138 12139 if (Src.getValueType().isVector()) { 12140 auto ScalarTySize = Src.getScalarValueSizeInBits(); 12141 auto ScalarTy = Src.getValueType().getScalarType(); 12142 if (ScalarTySize == 32) { 12143 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Src, 12144 DAG.getConstant(DWordOffset, SL, MVT::i32)); 12145 } 12146 if (ScalarTySize > 32) { 12147 Ret = DAG.getNode( 12148 ISD::EXTRACT_VECTOR_ELT, SL, ScalarTy, Src, 12149 DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32)); 12150 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32)); 12151 if (ShiftVal) 12152 Ret = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret, 12153 DAG.getConstant(ShiftVal, SL, MVT::i32)); 12154 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32); 12155 } 12156 12157 assert(ScalarTySize < 32); 12158 auto NumElements = TypeSize / ScalarTySize; 12159 auto Trunc32Elements = (ScalarTySize * NumElements) / 32; 12160 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize; 12161 auto NumElementsIn32 = 32 / ScalarTySize; 12162 auto NumAvailElements = DWordOffset < Trunc32Elements 12163 ? NumElementsIn32 12164 : NumElements - NormalizedTrunc; 12165 12166 SmallVector<SDValue, 4> VecSrcs; 12167 DAG.ExtractVectorElements(Src, VecSrcs, DWordOffset * NumElementsIn32, 12168 NumAvailElements); 12169 12170 Ret = DAG.getBuildVector( 12171 MVT::getVectorVT(MVT::getIntegerVT(ScalarTySize), NumAvailElements), SL, 12172 VecSrcs); 12173 return Ret = DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32); 12174 } 12175 12176 /// Scalar Type 12177 auto ShiftVal = 32 * DWordOffset; 12178 Ret = DAG.getNode(ISD::SRL, SL, Src.getValueType(), Src, 12179 DAG.getConstant(ShiftVal, SL, MVT::i32)); 12180 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32); 12181 } 12182 12183 static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { 12184 SelectionDAG &DAG = DCI.DAG; 12185 [[maybe_unused]] EVT VT = N->getValueType(0); 12186 SmallVector<ByteProvider<SDValue>, 8> PermNodes; 12187 12188 // VT is known to be MVT::i32, so we need to provide 4 bytes. 
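  // Selector values 4-7 in the mask built below pick bytes from the first
  // distinct source (which becomes the first v_perm operand), and values 0-3
  // pick bytes from the second distinct source; see SrcByteAdjust.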
12189 assert(VT == MVT::i32); 12190 for (int i = 0; i < 4; i++) { 12191 // Find the ByteProvider that provides the ith byte of the result of OR 12192 std::optional<ByteProvider<SDValue>> P = 12193 calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i); 12194 // TODO support constantZero 12195 if (!P || P->isConstantZero()) 12196 return SDValue(); 12197 12198 PermNodes.push_back(*P); 12199 } 12200 if (PermNodes.size() != 4) 12201 return SDValue(); 12202 12203 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4); 12204 std::optional<std::pair<unsigned, unsigned>> SecondSrc; 12205 uint64_t PermMask = 0x00000000; 12206 for (size_t i = 0; i < PermNodes.size(); i++) { 12207 auto PermOp = PermNodes[i]; 12208 // Since the mask is applied to Src1:Src2, Src1 bytes must be offset 12209 // by sizeof(Src2) = 4 12210 int SrcByteAdjust = 4; 12211 12212 // If the Src uses a byte from a different DWORD, then it corresponds 12213 // with a different source 12214 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) || 12215 ((PermOp.SrcOffset / 4) != FirstSrc.second)) { 12216 if (SecondSrc) 12217 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) || 12218 ((PermOp.SrcOffset / 4) != SecondSrc->second)) 12219 return SDValue(); 12220 12221 // Set the index of the second distinct Src node 12222 SecondSrc = {i, PermNodes[i].SrcOffset / 4}; 12223 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8)); 12224 SrcByteAdjust = 0; 12225 } 12226 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8); 12227 assert(!DAG.getDataLayout().isBigEndian()); 12228 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8); 12229 } 12230 SDLoc DL(N); 12231 SDValue Op = *PermNodes[FirstSrc.first].Src; 12232 Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second); 12233 assert(Op.getValueSizeInBits() == 32); 12234 12235 // Check that we are not just extracting the bytes in order from an op 12236 if (!SecondSrc) { 12237 int Low16 = PermMask & 0xffff; 12238 int Hi16 = (PermMask & 0xffff0000) >> 16; 12239 12240 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100); 12241 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302); 12242 12243 // The perm op would really just produce Op. So combine into Op 12244 if (WellFormedLow && WellFormedHi) 12245 return DAG.getBitcast(MVT::getIntegerVT(32), Op); 12246 } 12247 12248 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op; 12249 12250 if (SecondSrc) { 12251 OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second); 12252 assert(OtherOp.getValueSizeInBits() == 32); 12253 } 12254 12255 if (hasNon16BitAccesses(PermMask, Op, OtherOp)) { 12256 12257 assert(Op.getValueType().isByteSized() && 12258 OtherOp.getValueType().isByteSized()); 12259 12260 // If the ultimate src is less than 32 bits, then we will only be 12261 // using bytes 0: Op.getValueSizeInBytes() - 1 in the or. 12262 // CalculateByteProvider would not have returned Op as source if we 12263 // used a byte that is outside its ValueType. Thus, we are free to 12264 // ANY_EXTEND as the extended bits are don't cares.
12265 Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32); 12266 OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32); 12267 12268 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp, 12269 DAG.getConstant(PermMask, DL, MVT::i32)); 12270 } 12271 return SDValue(); 12272 } 12273 12274 SDValue SITargetLowering::performOrCombine(SDNode *N, 12275 DAGCombinerInfo &DCI) const { 12276 SelectionDAG &DAG = DCI.DAG; 12277 SDValue LHS = N->getOperand(0); 12278 SDValue RHS = N->getOperand(1); 12279 12280 EVT VT = N->getValueType(0); 12281 if (VT == MVT::i1) { 12282 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2) 12283 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS && 12284 RHS.getOpcode() == AMDGPUISD::FP_CLASS) { 12285 SDValue Src = LHS.getOperand(0); 12286 if (Src != RHS.getOperand(0)) 12287 return SDValue(); 12288 12289 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1)); 12290 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1)); 12291 if (!CLHS || !CRHS) 12292 return SDValue(); 12293 12294 // Only 10 bits are used. 12295 static const uint32_t MaxMask = 0x3ff; 12296 12297 uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask; 12298 SDLoc DL(N); 12299 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, 12300 Src, DAG.getConstant(NewMask, DL, MVT::i32)); 12301 } 12302 12303 return SDValue(); 12304 } 12305 12306 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2) 12307 if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() && 12308 LHS.getOpcode() == AMDGPUISD::PERM && 12309 isa<ConstantSDNode>(LHS.getOperand(2))) { 12310 uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1)); 12311 if (!Sel) 12312 return SDValue(); 12313 12314 Sel |= LHS.getConstantOperandVal(2); 12315 SDLoc DL(N); 12316 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0), 12317 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32)); 12318 } 12319 12320 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2) 12321 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 12322 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() && 12323 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) { 12324 12325 // If all the uses of an or need to extract the individual elements, do not 12326 // attempt to lower into v_perm 12327 auto usesCombinedOperand = [](SDNode *OrUse) { 12328 // If we have any non-vectorized use, then it is a candidate for v_perm 12329 if (OrUse->getOpcode() != ISD::BITCAST || 12330 !OrUse->getValueType(0).isVector()) 12331 return true; 12332 12333 // If we have any non-vectorized use, then it is a candidate for v_perm 12334 for (auto VUse : OrUse->uses()) { 12335 if (!VUse->getValueType(0).isVector()) 12336 return true; 12337 12338 // If the use of a vector is a store, then combining via a v_perm 12339 // is beneficial. 12340 // TODO -- whitelist more uses 12341 for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg}) 12342 if (VUse->getOpcode() == VectorwiseOp) 12343 return true; 12344 } 12345 return false; 12346 }; 12347 12348 if (!any_of(N->uses(), usesCombinedOperand)) 12349 return SDValue(); 12350 12351 uint32_t LHSMask = getPermuteMask(LHS); 12352 uint32_t RHSMask = getPermuteMask(RHS); 12353 12354 if (LHSMask != ~0u && RHSMask != ~0u) { 12355 // Canonicalize the expression in an attempt to have fewer unique masks 12356 // and therefore fewer registers used to hold the masks. 
      if (LHSMask > RHSMask) {
        std::swap(LHSMask, RHSMask);
        std::swap(LHS, RHS);
      }

      // Mark with 0x0c each lane that is actually used from the source
      // operand. In the permute mask a zeroed byte is encoded as 0x0c and a
      // 0xff byte as 0xff, while real source lanes are in the 0-3 range.
      uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
      uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;

      // Check if we need to combine values from two sources within a byte.
      if (!(LHSUsedLanes & RHSUsedLanes) &&
          // If we select high and lower word keep it for SDWA.
          // TODO: teach SDWA to work with v_perm_b32 and remove the check.
          !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
        // Kill zero bytes selected by other mask. Zero value is 0xc.
        LHSMask &= ~RHSUsedLanes;
        RHSMask &= ~LHSUsedLanes;
        // Add 4 to each active LHS lane
        LHSMask |= LHSUsedLanes & 0x04040404;
        // Combine masks
        uint32_t Sel = LHSMask | RHSMask;
        SDLoc DL(N);

        return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
                           LHS.getOperand(0), RHS.getOperand(0),
                           DAG.getConstant(Sel, DL, MVT::i32));
      }
    }
    if (LHSMask == ~0u || RHSMask == ~0u) {
      if (SDValue Perm = matchPERM(N, DCI))
        return Perm;
    }
  }

  if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
    return SDValue();

  // TODO: This could be a generic combine with a predicate for extracting the
  // high half of an integer being free.

  // (or i64:x, (zero_extend i32:y)) ->
  //   i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
  if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
      RHS.getOpcode() != ISD::ZERO_EXTEND)
    std::swap(LHS, RHS);

  if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
    SDValue ExtSrc = RHS.getOperand(0);
    EVT SrcVT = ExtSrc.getValueType();
    if (SrcVT == MVT::i32) {
      SDLoc SL(N);
      SDValue LowLHS, HiBits;
      std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG);
      SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);

      DCI.AddToWorklist(LowOr.getNode());
      DCI.AddToWorklist(HiBits.getNode());

      SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
                                LowOr, HiBits);
      return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
    }
  }

  const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (CRHS) {
    if (SDValue Split
          = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
                                     N->getOperand(0), CRHS))
      return Split;
  }

  return SDValue();
}

SDValue SITargetLowering::performXorCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
    return RV;

  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
  SelectionDAG &DAG = DCI.DAG;

  EVT VT = N->getValueType(0);
  if (CRHS && VT == MVT::i64) {
    if (SDValue Split
          = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
      return Split;
  }

  // Make sure to apply the 64-bit constant splitting fold before trying to fold
  // fneg-like xors into 64-bit select.
  if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
    // This looks like an fneg, try to fold as a source modifier.
12455 if (CRHS && CRHS->getAPIntValue().isSignMask() && 12456 shouldFoldFNegIntoSrc(N, LHS)) { 12457 // xor (select c, a, b), 0x80000000 -> 12458 // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b))) 12459 SDLoc DL(N); 12460 SDValue CastLHS = 12461 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1)); 12462 SDValue CastRHS = 12463 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2)); 12464 SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastLHS); 12465 SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastRHS); 12466 SDValue NewSelect = DAG.getNode(ISD::SELECT, DL, MVT::f32, 12467 LHS->getOperand(0), FNegLHS, FNegRHS); 12468 return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect); 12469 } 12470 } 12471 12472 return SDValue(); 12473 } 12474 12475 SDValue SITargetLowering::performZeroExtendCombine(SDNode *N, 12476 DAGCombinerInfo &DCI) const { 12477 if (!Subtarget->has16BitInsts() || 12478 DCI.getDAGCombineLevel() < AfterLegalizeDAG) 12479 return SDValue(); 12480 12481 EVT VT = N->getValueType(0); 12482 if (VT != MVT::i32) 12483 return SDValue(); 12484 12485 SDValue Src = N->getOperand(0); 12486 if (Src.getValueType() != MVT::i16) 12487 return SDValue(); 12488 12489 return SDValue(); 12490 } 12491 12492 SDValue 12493 SITargetLowering::performSignExtendInRegCombine(SDNode *N, 12494 DAGCombinerInfo &DCI) const { 12495 SDValue Src = N->getOperand(0); 12496 auto *VTSign = cast<VTSDNode>(N->getOperand(1)); 12497 12498 // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them 12499 // with s_buffer_load_i8 and s_buffer_load_i16 respectively. 12500 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE && 12501 VTSign->getVT() == MVT::i8) || 12502 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT && 12503 VTSign->getVT() == MVT::i16))) { 12504 assert(Subtarget->hasScalarSubwordLoads() && 12505 "s_buffer_load_{u8, i8} are supported " 12506 "in GFX12 (or newer) architectures."); 12507 EVT VT = Src.getValueType(); 12508 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE) 12509 ? AMDGPUISD::SBUFFER_LOAD_BYTE 12510 : AMDGPUISD::SBUFFER_LOAD_SHORT; 12511 SDLoc DL(N); 12512 SDVTList ResList = DCI.DAG.getVTList(MVT::i32); 12513 SDValue Ops[] = { 12514 Src.getOperand(0), // source register 12515 Src.getOperand(1), // offset 12516 Src.getOperand(2) // cachePolicy 12517 }; 12518 auto *M = cast<MemSDNode>(Src); 12519 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode( 12520 Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand()); 12521 SDValue LoadVal = DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad); 12522 return LoadVal; 12523 } 12524 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE && 12525 VTSign->getVT() == MVT::i8) || 12526 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT && 12527 VTSign->getVT() == MVT::i16)) && 12528 Src.hasOneUse()) { 12529 auto *M = cast<MemSDNode>(Src); 12530 SDValue Ops[] = { 12531 Src.getOperand(0), // Chain 12532 Src.getOperand(1), // rsrc 12533 Src.getOperand(2), // vindex 12534 Src.getOperand(3), // voffset 12535 Src.getOperand(4), // soffset 12536 Src.getOperand(5), // offset 12537 Src.getOperand(6), 12538 Src.getOperand(7) 12539 }; 12540 // replace with BUFFER_LOAD_BYTE/SHORT 12541 SDVTList ResList = DCI.DAG.getVTList(MVT::i32, 12542 Src.getOperand(0).getValueType()); 12543 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE) ? 
12544 AMDGPUISD::BUFFER_LOAD_BYTE : AMDGPUISD::BUFFER_LOAD_SHORT; 12545 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(Opc, SDLoc(N), 12546 ResList, 12547 Ops, M->getMemoryVT(), 12548 M->getMemOperand()); 12549 return DCI.DAG.getMergeValues({BufferLoadSignExt, 12550 BufferLoadSignExt.getValue(1)}, SDLoc(N)); 12551 } 12552 return SDValue(); 12553 } 12554 12555 SDValue SITargetLowering::performClassCombine(SDNode *N, 12556 DAGCombinerInfo &DCI) const { 12557 SelectionDAG &DAG = DCI.DAG; 12558 SDValue Mask = N->getOperand(1); 12559 12560 // fp_class x, 0 -> false 12561 if (isNullConstant(Mask)) 12562 return DAG.getConstant(0, SDLoc(N), MVT::i1); 12563 12564 if (N->getOperand(0).isUndef()) 12565 return DAG.getUNDEF(MVT::i1); 12566 12567 return SDValue(); 12568 } 12569 12570 SDValue SITargetLowering::performRcpCombine(SDNode *N, 12571 DAGCombinerInfo &DCI) const { 12572 EVT VT = N->getValueType(0); 12573 SDValue N0 = N->getOperand(0); 12574 12575 if (N0.isUndef()) { 12576 return DCI.DAG.getConstantFP( 12577 APFloat::getQNaN(SelectionDAG::EVTToAPFloatSemantics(VT)), SDLoc(N), 12578 VT); 12579 } 12580 12581 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP || 12582 N0.getOpcode() == ISD::SINT_TO_FP)) { 12583 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0, 12584 N->getFlags()); 12585 } 12586 12587 // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here. 12588 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) && 12589 N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) { 12590 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT, 12591 N0.getOperand(0), N->getFlags()); 12592 } 12593 12594 return AMDGPUTargetLowering::performRcpCombine(N, DCI); 12595 } 12596 12597 bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op, 12598 unsigned MaxDepth) const { 12599 unsigned Opcode = Op.getOpcode(); 12600 if (Opcode == ISD::FCANONICALIZE) 12601 return true; 12602 12603 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) { 12604 const auto &F = CFP->getValueAPF(); 12605 if (F.isNaN() && F.isSignaling()) 12606 return false; 12607 if (!F.isDenormal()) 12608 return true; 12609 12610 DenormalMode Mode = 12611 DAG.getMachineFunction().getDenormalMode(F.getSemantics()); 12612 return Mode == DenormalMode::getIEEE(); 12613 } 12614 12615 // If source is a result of another standard FP operation it is already in 12616 // canonical form. 12617 if (MaxDepth == 0) 12618 return false; 12619 12620 switch (Opcode) { 12621 // These will flush denorms if required. 
12622 case ISD::FADD: 12623 case ISD::FSUB: 12624 case ISD::FMUL: 12625 case ISD::FCEIL: 12626 case ISD::FFLOOR: 12627 case ISD::FMA: 12628 case ISD::FMAD: 12629 case ISD::FSQRT: 12630 case ISD::FDIV: 12631 case ISD::FREM: 12632 case ISD::FP_ROUND: 12633 case ISD::FP_EXTEND: 12634 case ISD::FP16_TO_FP: 12635 case ISD::FP_TO_FP16: 12636 case ISD::BF16_TO_FP: 12637 case ISD::FP_TO_BF16: 12638 case ISD::FLDEXP: 12639 case AMDGPUISD::FMUL_LEGACY: 12640 case AMDGPUISD::FMAD_FTZ: 12641 case AMDGPUISD::RCP: 12642 case AMDGPUISD::RSQ: 12643 case AMDGPUISD::RSQ_CLAMP: 12644 case AMDGPUISD::RCP_LEGACY: 12645 case AMDGPUISD::RCP_IFLAG: 12646 case AMDGPUISD::LOG: 12647 case AMDGPUISD::EXP: 12648 case AMDGPUISD::DIV_SCALE: 12649 case AMDGPUISD::DIV_FMAS: 12650 case AMDGPUISD::DIV_FIXUP: 12651 case AMDGPUISD::FRACT: 12652 case AMDGPUISD::CVT_PKRTZ_F16_F32: 12653 case AMDGPUISD::CVT_F32_UBYTE0: 12654 case AMDGPUISD::CVT_F32_UBYTE1: 12655 case AMDGPUISD::CVT_F32_UBYTE2: 12656 case AMDGPUISD::CVT_F32_UBYTE3: 12657 case AMDGPUISD::FP_TO_FP16: 12658 case AMDGPUISD::SIN_HW: 12659 case AMDGPUISD::COS_HW: 12660 return true; 12661 12662 // It can/will be lowered or combined as a bit operation. 12663 // Need to check their input recursively to handle. 12664 case ISD::FNEG: 12665 case ISD::FABS: 12666 case ISD::FCOPYSIGN: 12667 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1); 12668 12669 case ISD::AND: 12670 if (Op.getValueType() == MVT::i32) { 12671 // Be careful as we only know it is a bitcast floating point type. It 12672 // could be f32, v2f16, we have no way of knowing. Luckily the constant 12673 // value that we optimize for, which comes up in fp32 to bf16 conversions, 12674 // is valid to optimize for all types. 12675 if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { 12676 if (RHS->getZExtValue() == 0xffff0000) { 12677 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1); 12678 } 12679 } 12680 } 12681 break; 12682 12683 case ISD::FSIN: 12684 case ISD::FCOS: 12685 case ISD::FSINCOS: 12686 return Op.getValueType().getScalarType() != MVT::f16; 12687 12688 case ISD::FMINNUM: 12689 case ISD::FMAXNUM: 12690 case ISD::FMINNUM_IEEE: 12691 case ISD::FMAXNUM_IEEE: 12692 case ISD::FMINIMUM: 12693 case ISD::FMAXIMUM: 12694 case AMDGPUISD::CLAMP: 12695 case AMDGPUISD::FMED3: 12696 case AMDGPUISD::FMAX3: 12697 case AMDGPUISD::FMIN3: 12698 case AMDGPUISD::FMAXIMUM3: 12699 case AMDGPUISD::FMINIMUM3: { 12700 // FIXME: Shouldn't treat the generic operations different based these. 12701 // However, we aren't really required to flush the result from 12702 // minnum/maxnum.. 12703 12704 // snans will be quieted, so we only need to worry about denormals. 12705 if (Subtarget->supportsMinMaxDenormModes() || 12706 // FIXME: denormalsEnabledForType is broken for dynamic 12707 denormalsEnabledForType(DAG, Op.getValueType())) 12708 return true; 12709 12710 // Flushing may be required. 12711 // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such 12712 // targets need to check their input recursively. 12713 12714 // FIXME: Does this apply with clamp? It's implemented with max. 
12715 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) { 12716 if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1)) 12717 return false; 12718 } 12719 12720 return true; 12721 } 12722 case ISD::SELECT: { 12723 return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) && 12724 isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1); 12725 } 12726 case ISD::BUILD_VECTOR: { 12727 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) { 12728 SDValue SrcOp = Op.getOperand(i); 12729 if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1)) 12730 return false; 12731 } 12732 12733 return true; 12734 } 12735 case ISD::EXTRACT_VECTOR_ELT: 12736 case ISD::EXTRACT_SUBVECTOR: { 12737 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1); 12738 } 12739 case ISD::INSERT_VECTOR_ELT: { 12740 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) && 12741 isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1); 12742 } 12743 case ISD::UNDEF: 12744 // Could be anything. 12745 return false; 12746 12747 case ISD::BITCAST: 12748 // TODO: This is incorrect as it loses track of the operand's type. We may 12749 // end up effectively bitcasting from f32 to v2f16 or vice versa, and the 12750 // same bits that are canonicalized in one type need not be in the other. 12751 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1); 12752 case ISD::TRUNCATE: { 12753 // Hack round the mess we make when legalizing extract_vector_elt 12754 if (Op.getValueType() == MVT::i16) { 12755 SDValue TruncSrc = Op.getOperand(0); 12756 if (TruncSrc.getValueType() == MVT::i32 && 12757 TruncSrc.getOpcode() == ISD::BITCAST && 12758 TruncSrc.getOperand(0).getValueType() == MVT::v2f16) { 12759 return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1); 12760 } 12761 } 12762 return false; 12763 } 12764 case ISD::INTRINSIC_WO_CHAIN: { 12765 unsigned IntrinsicID = Op.getConstantOperandVal(0); 12766 // TODO: Handle more intrinsics 12767 switch (IntrinsicID) { 12768 case Intrinsic::amdgcn_cvt_pkrtz: 12769 case Intrinsic::amdgcn_cubeid: 12770 case Intrinsic::amdgcn_frexp_mant: 12771 case Intrinsic::amdgcn_fdot2: 12772 case Intrinsic::amdgcn_rcp: 12773 case Intrinsic::amdgcn_rsq: 12774 case Intrinsic::amdgcn_rsq_clamp: 12775 case Intrinsic::amdgcn_rcp_legacy: 12776 case Intrinsic::amdgcn_rsq_legacy: 12777 case Intrinsic::amdgcn_trig_preop: 12778 case Intrinsic::amdgcn_log: 12779 case Intrinsic::amdgcn_exp2: 12780 case Intrinsic::amdgcn_sqrt: 12781 return true; 12782 default: 12783 break; 12784 } 12785 12786 break; 12787 } 12788 default: 12789 break; 12790 } 12791 12792 // FIXME: denormalsEnabledForType is broken for dynamic 12793 return denormalsEnabledForType(DAG, Op.getValueType()) && 12794 DAG.isKnownNeverSNaN(Op); 12795 } 12796 12797 bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF, 12798 unsigned MaxDepth) const { 12799 const MachineRegisterInfo &MRI = MF.getRegInfo(); 12800 MachineInstr *MI = MRI.getVRegDef(Reg); 12801 unsigned Opcode = MI->getOpcode(); 12802 12803 if (Opcode == AMDGPU::G_FCANONICALIZE) 12804 return true; 12805 12806 std::optional<FPValueAndVReg> FCR; 12807 // Constant splat (can be padded with undef) or scalar constant. 
12808 if (mi_match(Reg, MRI, MIPatternMatch::m_GFCstOrSplat(FCR))) { 12809 if (FCR->Value.isSignaling()) 12810 return false; 12811 if (!FCR->Value.isDenormal()) 12812 return true; 12813 12814 DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics()); 12815 return Mode == DenormalMode::getIEEE(); 12816 } 12817 12818 if (MaxDepth == 0) 12819 return false; 12820 12821 switch (Opcode) { 12822 case AMDGPU::G_FADD: 12823 case AMDGPU::G_FSUB: 12824 case AMDGPU::G_FMUL: 12825 case AMDGPU::G_FCEIL: 12826 case AMDGPU::G_FFLOOR: 12827 case AMDGPU::G_FRINT: 12828 case AMDGPU::G_FNEARBYINT: 12829 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND: 12830 case AMDGPU::G_INTRINSIC_TRUNC: 12831 case AMDGPU::G_INTRINSIC_ROUNDEVEN: 12832 case AMDGPU::G_FMA: 12833 case AMDGPU::G_FMAD: 12834 case AMDGPU::G_FSQRT: 12835 case AMDGPU::G_FDIV: 12836 case AMDGPU::G_FREM: 12837 case AMDGPU::G_FPOW: 12838 case AMDGPU::G_FPEXT: 12839 case AMDGPU::G_FLOG: 12840 case AMDGPU::G_FLOG2: 12841 case AMDGPU::G_FLOG10: 12842 case AMDGPU::G_FPTRUNC: 12843 case AMDGPU::G_AMDGPU_RCP_IFLAG: 12844 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0: 12845 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1: 12846 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2: 12847 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3: 12848 return true; 12849 case AMDGPU::G_FNEG: 12850 case AMDGPU::G_FABS: 12851 case AMDGPU::G_FCOPYSIGN: 12852 return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1); 12853 case AMDGPU::G_FMINNUM: 12854 case AMDGPU::G_FMAXNUM: 12855 case AMDGPU::G_FMINNUM_IEEE: 12856 case AMDGPU::G_FMAXNUM_IEEE: 12857 case AMDGPU::G_FMINIMUM: 12858 case AMDGPU::G_FMAXIMUM: { 12859 if (Subtarget->supportsMinMaxDenormModes() || 12860 // FIXME: denormalsEnabledForType is broken for dynamic 12861 denormalsEnabledForType(MRI.getType(Reg), MF)) 12862 return true; 12863 12864 [[fallthrough]]; 12865 } 12866 case AMDGPU::G_BUILD_VECTOR: 12867 for (const MachineOperand &MO : llvm::drop_begin(MI->operands())) 12868 if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1)) 12869 return false; 12870 return true; 12871 case AMDGPU::G_INTRINSIC: 12872 case AMDGPU::G_INTRINSIC_CONVERGENT: 12873 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) { 12874 case Intrinsic::amdgcn_fmul_legacy: 12875 case Intrinsic::amdgcn_fmad_ftz: 12876 case Intrinsic::amdgcn_sqrt: 12877 case Intrinsic::amdgcn_fmed3: 12878 case Intrinsic::amdgcn_sin: 12879 case Intrinsic::amdgcn_cos: 12880 case Intrinsic::amdgcn_log: 12881 case Intrinsic::amdgcn_exp2: 12882 case Intrinsic::amdgcn_log_clamp: 12883 case Intrinsic::amdgcn_rcp: 12884 case Intrinsic::amdgcn_rcp_legacy: 12885 case Intrinsic::amdgcn_rsq: 12886 case Intrinsic::amdgcn_rsq_clamp: 12887 case Intrinsic::amdgcn_rsq_legacy: 12888 case Intrinsic::amdgcn_div_scale: 12889 case Intrinsic::amdgcn_div_fmas: 12890 case Intrinsic::amdgcn_div_fixup: 12891 case Intrinsic::amdgcn_fract: 12892 case Intrinsic::amdgcn_cvt_pkrtz: 12893 case Intrinsic::amdgcn_cubeid: 12894 case Intrinsic::amdgcn_cubema: 12895 case Intrinsic::amdgcn_cubesc: 12896 case Intrinsic::amdgcn_cubetc: 12897 case Intrinsic::amdgcn_frexp_mant: 12898 case Intrinsic::amdgcn_fdot2: 12899 case Intrinsic::amdgcn_trig_preop: 12900 return true; 12901 default: 12902 break; 12903 } 12904 12905 [[fallthrough]]; 12906 default: 12907 return false; 12908 } 12909 12910 llvm_unreachable("invalid operation"); 12911 } 12912 12913 // Constant fold canonicalize. 
12914 SDValue SITargetLowering::getCanonicalConstantFP( 12915 SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const { 12916 // Flush denormals to 0 if not enabled. 12917 if (C.isDenormal()) { 12918 DenormalMode Mode = 12919 DAG.getMachineFunction().getDenormalMode(C.getSemantics()); 12920 if (Mode == DenormalMode::getPreserveSign()) { 12921 return DAG.getConstantFP( 12922 APFloat::getZero(C.getSemantics(), C.isNegative()), SL, VT); 12923 } 12924 12925 if (Mode != DenormalMode::getIEEE()) 12926 return SDValue(); 12927 } 12928 12929 if (C.isNaN()) { 12930 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics()); 12931 if (C.isSignaling()) { 12932 // Quiet a signaling NaN. 12933 // FIXME: Is this supposed to preserve payload bits? 12934 return DAG.getConstantFP(CanonicalQNaN, SL, VT); 12935 } 12936 12937 // Make sure it is the canonical NaN bitpattern. 12938 // 12939 // TODO: Can we use -1 as the canonical NaN value since it's an inline 12940 // immediate? 12941 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt()) 12942 return DAG.getConstantFP(CanonicalQNaN, SL, VT); 12943 } 12944 12945 // Already canonical. 12946 return DAG.getConstantFP(C, SL, VT); 12947 } 12948 12949 static bool vectorEltWillFoldAway(SDValue Op) { 12950 return Op.isUndef() || isa<ConstantFPSDNode>(Op); 12951 } 12952 12953 SDValue SITargetLowering::performFCanonicalizeCombine( 12954 SDNode *N, 12955 DAGCombinerInfo &DCI) const { 12956 SelectionDAG &DAG = DCI.DAG; 12957 SDValue N0 = N->getOperand(0); 12958 EVT VT = N->getValueType(0); 12959 12960 // fcanonicalize undef -> qnan 12961 if (N0.isUndef()) { 12962 APFloat QNaN = APFloat::getQNaN(SelectionDAG::EVTToAPFloatSemantics(VT)); 12963 return DAG.getConstantFP(QNaN, SDLoc(N), VT); 12964 } 12965 12966 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) { 12967 EVT VT = N->getValueType(0); 12968 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF()); 12969 } 12970 12971 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x), 12972 // (fcanonicalize k) 12973 // 12974 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0 12975 12976 // TODO: This could be better with wider vectors that will be split to v2f16, 12977 // and to consider uses since there aren't that many packed operations. 12978 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 && 12979 isTypeLegal(MVT::v2f16)) { 12980 SDLoc SL(N); 12981 SDValue NewElts[2]; 12982 SDValue Lo = N0.getOperand(0); 12983 SDValue Hi = N0.getOperand(1); 12984 EVT EltVT = Lo.getValueType(); 12985 12986 if (vectorEltWillFoldAway(Lo) || vectorEltWillFoldAway(Hi)) { 12987 for (unsigned I = 0; I != 2; ++I) { 12988 SDValue Op = N0.getOperand(I); 12989 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) { 12990 NewElts[I] = getCanonicalConstantFP(DAG, SL, EltVT, 12991 CFP->getValueAPF()); 12992 } else if (Op.isUndef()) { 12993 // Handled below based on what the other operand is. 12994 NewElts[I] = Op; 12995 } else { 12996 NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op); 12997 } 12998 } 12999 13000 // If one half is undef, and one is constant, prefer a splat vector rather 13001 // than the normal qNaN. If it's a register, prefer 0.0 since that's 13002 // cheaper to use and may be free with a packed operation. 13003 if (NewElts[0].isUndef()) { 13004 if (isa<ConstantFPSDNode>(NewElts[1])) 13005 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1]) ? 
13006 NewElts[1]: DAG.getConstantFP(0.0f, SL, EltVT); 13007 } 13008 13009 if (NewElts[1].isUndef()) { 13010 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0]) ? 13011 NewElts[0] : DAG.getConstantFP(0.0f, SL, EltVT); 13012 } 13013 13014 return DAG.getBuildVector(VT, SL, NewElts); 13015 } 13016 } 13017 13018 return SDValue(); 13019 } 13020 13021 static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) { 13022 switch (Opc) { 13023 case ISD::FMAXNUM: 13024 case ISD::FMAXNUM_IEEE: 13025 return AMDGPUISD::FMAX3; 13026 case ISD::FMAXIMUM: 13027 return AMDGPUISD::FMAXIMUM3; 13028 case ISD::SMAX: 13029 return AMDGPUISD::SMAX3; 13030 case ISD::UMAX: 13031 return AMDGPUISD::UMAX3; 13032 case ISD::FMINNUM: 13033 case ISD::FMINNUM_IEEE: 13034 return AMDGPUISD::FMIN3; 13035 case ISD::FMINIMUM: 13036 return AMDGPUISD::FMINIMUM3; 13037 case ISD::SMIN: 13038 return AMDGPUISD::SMIN3; 13039 case ISD::UMIN: 13040 return AMDGPUISD::UMIN3; 13041 default: 13042 llvm_unreachable("Not a min/max opcode"); 13043 } 13044 } 13045 13046 SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG, 13047 const SDLoc &SL, SDValue Src, 13048 SDValue MinVal, 13049 SDValue MaxVal, 13050 bool Signed) const { 13051 13052 // med3 comes from 13053 // min(max(x, K0), K1), K0 < K1 13054 // max(min(x, K0), K1), K1 < K0 13055 // 13056 // "MinVal" and "MaxVal" respectively refer to the rhs of the 13057 // min/max op. 13058 ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(MinVal); 13059 ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(MaxVal); 13060 13061 if (!MinK || !MaxK) 13062 return SDValue(); 13063 13064 if (Signed) { 13065 if (MaxK->getAPIntValue().sge(MinK->getAPIntValue())) 13066 return SDValue(); 13067 } else { 13068 if (MaxK->getAPIntValue().uge(MinK->getAPIntValue())) 13069 return SDValue(); 13070 } 13071 13072 EVT VT = MinK->getValueType(0); 13073 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3; 13074 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16())) 13075 return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal); 13076 13077 // Note: we could also extend to i32 and use i32 med3 if i16 med3 is 13078 // not available, but this is unlikely to be profitable as constants 13079 // will often need to be materialized & extended, especially on 13080 // pre-GFX10 where VOP3 instructions couldn't take literal operands. 13081 return SDValue(); 13082 } 13083 13084 static ConstantFPSDNode *getSplatConstantFP(SDValue Op) { 13085 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) 13086 return C; 13087 13088 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) { 13089 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode()) 13090 return C; 13091 } 13092 13093 return nullptr; 13094 } 13095 13096 SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG, 13097 const SDLoc &SL, 13098 SDValue Op0, 13099 SDValue Op1) const { 13100 ConstantFPSDNode *K1 = getSplatConstantFP(Op1); 13101 if (!K1) 13102 return SDValue(); 13103 13104 ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1)); 13105 if (!K0) 13106 return SDValue(); 13107 13108 // Ordered >= (although NaN inputs should have folded away by now). 13109 if (K0->getValueAPF() > K1->getValueAPF()) 13110 return SDValue(); 13111 13112 const MachineFunction &MF = DAG.getMachineFunction(); 13113 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 13114 13115 // TODO: Check IEEE bit enabled? 13116 EVT VT = Op0.getValueType(); 13117 if (Info->getMode().DX10Clamp) { 13118 // If dx10_clamp is enabled, NaNs clamp to 0.0. 
This is the same as the 13119 // hardware fmed3 behavior converting to a min. 13120 // FIXME: Should this be allowing -0.0? 13121 if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0)) 13122 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0)); 13123 } 13124 13125 // med3 for f16 is only available on gfx9+, and not available for v2f16. 13126 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) { 13127 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a 13128 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would 13129 // then give the other result, which is different from med3 with a NaN 13130 // input. 13131 SDValue Var = Op0.getOperand(0); 13132 if (!DAG.isKnownNeverSNaN(Var)) 13133 return SDValue(); 13134 13135 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 13136 13137 if ((!K0->hasOneUse() || TII->isInlineConstant(K0->getValueAPF())) && 13138 (!K1->hasOneUse() || TII->isInlineConstant(K1->getValueAPF()))) { 13139 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0), 13140 Var, SDValue(K0, 0), SDValue(K1, 0)); 13141 } 13142 } 13143 13144 return SDValue(); 13145 } 13146 13147 /// \return true if the subtarget supports minimum3 and maximum3 with the given 13148 /// base min/max opcode \p Opc for type \p VT. 13149 static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, 13150 EVT VT) { 13151 switch (Opc) { 13152 case ISD::FMINNUM: 13153 case ISD::FMAXNUM: 13154 case ISD::FMINNUM_IEEE: 13155 case ISD::FMAXNUM_IEEE: 13156 case AMDGPUISD::FMIN_LEGACY: 13157 case AMDGPUISD::FMAX_LEGACY: 13158 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16()); 13159 case ISD::FMINIMUM: 13160 case ISD::FMAXIMUM: 13161 return (VT == MVT::f32 || VT == MVT::f16) && Subtarget.hasIEEEMinMax3(); 13162 case ISD::SMAX: 13163 case ISD::SMIN: 13164 case ISD::UMAX: 13165 case ISD::UMIN: 13166 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16()); 13167 default: 13168 return false; 13169 } 13170 13171 llvm_unreachable("not a min/max opcode"); 13172 } 13173 13174 SDValue SITargetLowering::performMinMaxCombine(SDNode *N, 13175 DAGCombinerInfo &DCI) const { 13176 SelectionDAG &DAG = DCI.DAG; 13177 13178 EVT VT = N->getValueType(0); 13179 unsigned Opc = N->getOpcode(); 13180 SDValue Op0 = N->getOperand(0); 13181 SDValue Op1 = N->getOperand(1); 13182 13183 // Only do this if the inner op has one use since this will just increases 13184 // register pressure for no benefit. 13185 13186 if (supportsMin3Max3(*Subtarget, Opc, VT)) { 13187 // max(max(a, b), c) -> max3(a, b, c) 13188 // min(min(a, b), c) -> min3(a, b, c) 13189 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) { 13190 SDLoc DL(N); 13191 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), 13192 DL, 13193 N->getValueType(0), 13194 Op0.getOperand(0), 13195 Op0.getOperand(1), 13196 Op1); 13197 } 13198 13199 // Try commuted. 
13200 // max(a, max(b, c)) -> max3(a, b, c) 13201 // min(a, min(b, c)) -> min3(a, b, c) 13202 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) { 13203 SDLoc DL(N); 13204 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), 13205 DL, 13206 N->getValueType(0), 13207 Op0, 13208 Op1.getOperand(0), 13209 Op1.getOperand(1)); 13210 } 13211 } 13212 13213 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1) 13214 // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0) 13215 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) { 13216 if (SDValue Med3 = performIntMed3ImmCombine( 13217 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true)) 13218 return Med3; 13219 } 13220 if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) { 13221 if (SDValue Med3 = performIntMed3ImmCombine( 13222 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true)) 13223 return Med3; 13224 } 13225 13226 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) { 13227 if (SDValue Med3 = performIntMed3ImmCombine( 13228 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false)) 13229 return Med3; 13230 } 13231 if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) { 13232 if (SDValue Med3 = performIntMed3ImmCombine( 13233 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false)) 13234 return Med3; 13235 } 13236 13237 // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1) 13238 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) || 13239 (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) || 13240 (Opc == AMDGPUISD::FMIN_LEGACY && 13241 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) && 13242 (VT == MVT::f32 || VT == MVT::f64 || 13243 (VT == MVT::f16 && Subtarget->has16BitInsts()) || 13244 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) && 13245 Op0.hasOneUse()) { 13246 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1)) 13247 return Res; 13248 } 13249 13250 return SDValue(); 13251 } 13252 13253 static bool isClampZeroToOne(SDValue A, SDValue B) { 13254 if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) { 13255 if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) { 13256 // FIXME: Should this be allowing -0.0? 13257 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) || 13258 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0)); 13259 } 13260 } 13261 13262 return false; 13263 } 13264 13265 // FIXME: Should only worry about snans for version with chain. 13266 SDValue SITargetLowering::performFMed3Combine(SDNode *N, 13267 DAGCombinerInfo &DCI) const { 13268 EVT VT = N->getValueType(0); 13269 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and 13270 // NaNs. With a NaN input, the order of the operands may change the result. 13271 13272 SelectionDAG &DAG = DCI.DAG; 13273 SDLoc SL(N); 13274 13275 SDValue Src0 = N->getOperand(0); 13276 SDValue Src1 = N->getOperand(1); 13277 SDValue Src2 = N->getOperand(2); 13278 13279 if (isClampZeroToOne(Src0, Src1)) { 13280 // const_a, const_b, x -> clamp is safe in all cases including signaling 13281 // nans. 13282 // FIXME: Should this be allowing -0.0? 13283 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2); 13284 } 13285 13286 const MachineFunction &MF = DAG.getMachineFunction(); 13287 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 13288 13289 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother 13290 // handling no dx10-clamp? 
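  // As a purely illustrative trace of the reordering below: fmed3(0.0, x, 1.0)
  // first swaps the constant 0.0 past the non-constant x, giving
  // fmed3(x, 0.0, 1.0); the isClampZeroToOne(Src1, Src2) check then matches
  // and the node folds to clamp(x).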
  if (Info->getMode().DX10Clamp) {
    // If NaNs are clamped to 0, we are free to reorder the inputs.

    if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
      std::swap(Src0, Src1);

    if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
      std::swap(Src1, Src2);

    if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
      std::swap(Src0, Src1);

    if (isClampZeroToOne(Src1, Src2))
      return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
  }

  return SDValue();
}

SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
  SDValue Src0 = N->getOperand(0);
  SDValue Src1 = N->getOperand(1);
  if (Src0.isUndef() && Src1.isUndef())
    return DCI.DAG.getUNDEF(N->getValueType(0));
  return SDValue();
}

// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
// expanded into a set of cmp/select instructions.
bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize,
                                                unsigned NumElem,
                                                bool IsDivergentIdx,
                                                const GCNSubtarget *Subtarget) {
  if (UseDivergentRegisterIndexing)
    return false;

  unsigned VecSize = EltSize * NumElem;

  // Sub-dword vectors with a total size of two dwords or less have a better
  // implementation.
  if (VecSize <= 64 && EltSize < 32)
    return false;

  // Always expand the rest of sub-dword instructions, otherwise they will be
  // lowered via memory.
  if (EltSize < 32)
    return true;

  // Always do this if var-idx is divergent, otherwise it will become a loop.
  if (IsDivergentIdx)
    return true;

  // Large vectors would yield too many compares and v_cndmask_b32 instructions.
  unsigned NumInsts = NumElem /* Number of compares */ +
                      ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;

  // On some architectures (GFX9) movrel is not available and it's better
  // to expand.
  if (!Subtarget->hasMovrel())
    return NumInsts <= 16;

  // If movrel is available, use it instead of expanding for vectors of 8
  // elements.
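  // For example (hypothetical shapes): a v8i32 (EltSize = 32, NumElem = 8)
  // costs 8 compares + 8 cndmasks = 16 > 15, so movrel is preferred, while a
  // v4i64 (EltSize = 64, NumElem = 4) costs 4 + 8 = 12 and is still expanded.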
13354 return NumInsts <= 15; 13355 } 13356 13357 bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const { 13358 SDValue Idx = N->getOperand(N->getNumOperands() - 1); 13359 if (isa<ConstantSDNode>(Idx)) 13360 return false; 13361 13362 SDValue Vec = N->getOperand(0); 13363 EVT VecVT = Vec.getValueType(); 13364 EVT EltVT = VecVT.getVectorElementType(); 13365 unsigned EltSize = EltVT.getSizeInBits(); 13366 unsigned NumElem = VecVT.getVectorNumElements(); 13367 13368 return SITargetLowering::shouldExpandVectorDynExt( 13369 EltSize, NumElem, Idx->isDivergent(), getSubtarget()); 13370 } 13371 13372 SDValue SITargetLowering::performExtractVectorEltCombine( 13373 SDNode *N, DAGCombinerInfo &DCI) const { 13374 SDValue Vec = N->getOperand(0); 13375 SelectionDAG &DAG = DCI.DAG; 13376 13377 EVT VecVT = Vec.getValueType(); 13378 EVT VecEltVT = VecVT.getVectorElementType(); 13379 EVT ResVT = N->getValueType(0); 13380 13381 unsigned VecSize = VecVT.getSizeInBits(); 13382 unsigned VecEltSize = VecEltVT.getSizeInBits(); 13383 13384 if ((Vec.getOpcode() == ISD::FNEG || 13385 Vec.getOpcode() == ISD::FABS) && allUsesHaveSourceMods(N)) { 13386 SDLoc SL(N); 13387 SDValue Idx = N->getOperand(1); 13388 SDValue Elt = 13389 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec.getOperand(0), Idx); 13390 return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt); 13391 } 13392 13393 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx) 13394 // => 13395 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx) 13396 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx) 13397 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt 13398 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) { 13399 SDLoc SL(N); 13400 SDValue Idx = N->getOperand(1); 13401 unsigned Opc = Vec.getOpcode(); 13402 13403 switch(Opc) { 13404 default: 13405 break; 13406 // TODO: Support other binary operations. 13407 case ISD::FADD: 13408 case ISD::FSUB: 13409 case ISD::FMUL: 13410 case ISD::ADD: 13411 case ISD::UMIN: 13412 case ISD::UMAX: 13413 case ISD::SMIN: 13414 case ISD::SMAX: 13415 case ISD::FMAXNUM: 13416 case ISD::FMINNUM: 13417 case ISD::FMAXNUM_IEEE: 13418 case ISD::FMINNUM_IEEE: 13419 case ISD::FMAXIMUM: 13420 case ISD::FMINIMUM: { 13421 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, 13422 Vec.getOperand(0), Idx); 13423 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, 13424 Vec.getOperand(1), Idx); 13425 13426 DCI.AddToWorklist(Elt0.getNode()); 13427 DCI.AddToWorklist(Elt1.getNode()); 13428 return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags()); 13429 } 13430 } 13431 } 13432 13433 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx) 13434 if (shouldExpandVectorDynExt(N)) { 13435 SDLoc SL(N); 13436 SDValue Idx = N->getOperand(1); 13437 SDValue V; 13438 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) { 13439 SDValue IC = DAG.getVectorIdxConstant(I, SL); 13440 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC); 13441 if (I == 0) 13442 V = Elt; 13443 else 13444 V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ); 13445 } 13446 return V; 13447 } 13448 13449 if (!DCI.isBeforeLegalize()) 13450 return SDValue(); 13451 13452 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit 13453 // elements. This exposes more load reduction opportunities by replacing 13454 // multiple small extract_vector_elements with a single 32-bit extract. 
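  // A sketch of the transform below with hypothetical types: extracting
  // element 5 of a loaded v8i16 gives BitIndex = 80, so EltIdx = 2 and
  // LeftoverBitIdx = 16; the vector is bitcast to the equivalent v4i32,
  // element 2 is extracted, shifted right by 16 and truncated back to i16.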
13455 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1)); 13456 if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() && 13457 VecSize > 32 && VecSize % 32 == 0 && Idx) { 13458 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT); 13459 13460 unsigned BitIndex = Idx->getZExtValue() * VecEltSize; 13461 unsigned EltIdx = BitIndex / 32; 13462 unsigned LeftoverBitIdx = BitIndex % 32; 13463 SDLoc SL(N); 13464 13465 SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec); 13466 DCI.AddToWorklist(Cast.getNode()); 13467 13468 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast, 13469 DAG.getConstant(EltIdx, SL, MVT::i32)); 13470 DCI.AddToWorklist(Elt.getNode()); 13471 SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt, 13472 DAG.getConstant(LeftoverBitIdx, SL, MVT::i32)); 13473 DCI.AddToWorklist(Srl.getNode()); 13474 13475 EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger(); 13476 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl); 13477 DCI.AddToWorklist(Trunc.getNode()); 13478 13479 if (VecEltVT == ResVT) { 13480 return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc); 13481 } 13482 13483 assert(ResVT.isScalarInteger()); 13484 return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT); 13485 } 13486 13487 return SDValue(); 13488 } 13489 13490 SDValue 13491 SITargetLowering::performInsertVectorEltCombine(SDNode *N, 13492 DAGCombinerInfo &DCI) const { 13493 SDValue Vec = N->getOperand(0); 13494 SDValue Idx = N->getOperand(2); 13495 EVT VecVT = Vec.getValueType(); 13496 EVT EltVT = VecVT.getVectorElementType(); 13497 13498 // INSERT_VECTOR_ELT (<n x e>, var-idx) 13499 // => BUILD_VECTOR n x select (e, const-idx) 13500 if (!shouldExpandVectorDynExt(N)) 13501 return SDValue(); 13502 13503 SelectionDAG &DAG = DCI.DAG; 13504 SDLoc SL(N); 13505 SDValue Ins = N->getOperand(1); 13506 EVT IdxVT = Idx.getValueType(); 13507 13508 SmallVector<SDValue, 16> Ops; 13509 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) { 13510 SDValue IC = DAG.getConstant(I, SL, IdxVT); 13511 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC); 13512 SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ); 13513 Ops.push_back(V); 13514 } 13515 13516 return DAG.getBuildVector(VecVT, SL, Ops); 13517 } 13518 13519 /// Return the source of an fp_extend from f16 to f32, or a converted FP 13520 /// constant. 
13521 static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src) { 13522 if (Src.getOpcode() == ISD::FP_EXTEND && 13523 Src.getOperand(0).getValueType() == MVT::f16) { 13524 return Src.getOperand(0); 13525 } 13526 13527 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) { 13528 APFloat Val = CFP->getValueAPF(); 13529 bool LosesInfo = true; 13530 Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo); 13531 if (!LosesInfo) 13532 return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16); 13533 } 13534 13535 return SDValue(); 13536 } 13537 13538 SDValue SITargetLowering::performFPRoundCombine(SDNode *N, 13539 DAGCombinerInfo &DCI) const { 13540 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() && 13541 "combine only useful on gfx8"); 13542 13543 SDValue TruncSrc = N->getOperand(0); 13544 EVT VT = N->getValueType(0); 13545 if (VT != MVT::f16) 13546 return SDValue(); 13547 13548 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 || 13549 TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse()) 13550 return SDValue(); 13551 13552 SelectionDAG &DAG = DCI.DAG; 13553 SDLoc SL(N); 13554 13555 // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3, 13556 // and expanding it with min/max saves 1 instruction vs. casting to f32 and 13557 // casting back. 13558 13559 // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) => 13560 // fmin(fmax(a, b), fmax(fmin(a, b), c)) 13561 SDValue A = strictFPExtFromF16(DAG, TruncSrc.getOperand(0)); 13562 if (!A) 13563 return SDValue(); 13564 13565 SDValue B = strictFPExtFromF16(DAG, TruncSrc.getOperand(1)); 13566 if (!B) 13567 return SDValue(); 13568 13569 SDValue C = strictFPExtFromF16(DAG, TruncSrc.getOperand(2)); 13570 if (!C) 13571 return SDValue(); 13572 13573 // This changes signaling nan behavior. If an input is a signaling nan, it 13574 // would have been quieted by the fpext originally. We don't care because 13575 // these are unconstrained ops. If we needed to insert quieting canonicalizes 13576 // we would be worse off than just doing the promotion. 13577 SDValue A1 = DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, A, B); 13578 SDValue B1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A, B); 13579 SDValue C1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A1, C); 13580 return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1); 13581 } 13582 13583 unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG, 13584 const SDNode *N0, 13585 const SDNode *N1) const { 13586 EVT VT = N0->getValueType(0); 13587 13588 // Only do this if we are not trying to support denormals. v_mad_f32 does not 13589 // support denormals ever. 
13590 if (((VT == MVT::f32 && 13591 denormalModeIsFlushAllF32(DAG.getMachineFunction())) || 13592 (VT == MVT::f16 && Subtarget->hasMadF16() && 13593 denormalModeIsFlushAllF64F16(DAG.getMachineFunction()))) && 13594 isOperationLegal(ISD::FMAD, VT)) 13595 return ISD::FMAD; 13596 13597 const TargetOptions &Options = DAG.getTarget().Options; 13598 if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath || 13599 (N0->getFlags().hasAllowContract() && 13600 N1->getFlags().hasAllowContract())) && 13601 isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) { 13602 return ISD::FMA; 13603 } 13604 13605 return 0; 13606 } 13607 13608 // For a reassociatable opcode perform: 13609 // op x, (op y, z) -> op (op x, z), y, if x and z are uniform 13610 SDValue SITargetLowering::reassociateScalarOps(SDNode *N, 13611 SelectionDAG &DAG) const { 13612 EVT VT = N->getValueType(0); 13613 if (VT != MVT::i32 && VT != MVT::i64) 13614 return SDValue(); 13615 13616 if (DAG.isBaseWithConstantOffset(SDValue(N, 0))) 13617 return SDValue(); 13618 13619 unsigned Opc = N->getOpcode(); 13620 SDValue Op0 = N->getOperand(0); 13621 SDValue Op1 = N->getOperand(1); 13622 13623 if (!(Op0->isDivergent() ^ Op1->isDivergent())) 13624 return SDValue(); 13625 13626 if (Op0->isDivergent()) 13627 std::swap(Op0, Op1); 13628 13629 if (Op1.getOpcode() != Opc || !Op1.hasOneUse()) 13630 return SDValue(); 13631 13632 SDValue Op2 = Op1.getOperand(1); 13633 Op1 = Op1.getOperand(0); 13634 if (!(Op1->isDivergent() ^ Op2->isDivergent())) 13635 return SDValue(); 13636 13637 if (Op1->isDivergent()) 13638 std::swap(Op1, Op2); 13639 13640 SDLoc SL(N); 13641 SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1); 13642 return DAG.getNode(Opc, SL, VT, Add1, Op2); 13643 } 13644 13645 static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, 13646 EVT VT, 13647 SDValue N0, SDValue N1, SDValue N2, 13648 bool Signed) { 13649 unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32; 13650 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1); 13651 SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2); 13652 return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad); 13653 } 13654 13655 // Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high 13656 // multiplies, if any. 13657 // 13658 // Full 64-bit multiplies that feed into an addition are lowered here instead 13659 // of using the generic expansion. The generic expansion ends up with 13660 // a tree of ADD nodes that prevents us from using the "add" part of the 13661 // MAD instruction. The expansion produced here results in a chain of ADDs 13662 // instead of a tree. 13663 SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N, 13664 DAGCombinerInfo &DCI) const { 13665 assert(N->getOpcode() == ISD::ADD); 13666 13667 SelectionDAG &DAG = DCI.DAG; 13668 EVT VT = N->getValueType(0); 13669 SDLoc SL(N); 13670 SDValue LHS = N->getOperand(0); 13671 SDValue RHS = N->getOperand(1); 13672 13673 if (VT.isVector()) 13674 return SDValue(); 13675 13676 // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall 13677 // result in scalar registers for uniform values. 
13678 if (!N->isDivergent() && Subtarget->hasSMulHi()) 13679 return SDValue(); 13680 13681 unsigned NumBits = VT.getScalarSizeInBits(); 13682 if (NumBits <= 32 || NumBits > 64) 13683 return SDValue(); 13684 13685 if (LHS.getOpcode() != ISD::MUL) { 13686 assert(RHS.getOpcode() == ISD::MUL); 13687 std::swap(LHS, RHS); 13688 } 13689 13690 // Avoid the fold if it would unduly increase the number of multiplies due to 13691 // multiple uses, except on hardware with full-rate multiply-add (which is 13692 // part of full-rate 64-bit ops). 13693 if (!Subtarget->hasFullRate64Ops()) { 13694 unsigned NumUsers = 0; 13695 for (SDNode *Use : LHS->uses()) { 13696 // There is a use that does not feed into addition, so the multiply can't 13697 // be removed. We prefer MUL + ADD + ADDC over MAD + MUL. 13698 if (Use->getOpcode() != ISD::ADD) 13699 return SDValue(); 13700 13701 // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer 13702 // MUL + 3xADD + 3xADDC over 3xMAD. 13703 ++NumUsers; 13704 if (NumUsers >= 3) 13705 return SDValue(); 13706 } 13707 } 13708 13709 SDValue MulLHS = LHS.getOperand(0); 13710 SDValue MulRHS = LHS.getOperand(1); 13711 SDValue AddRHS = RHS; 13712 13713 // Always check whether operands are small unsigned values, since that 13714 // knowledge is useful in more cases. Check for small signed values only if 13715 // doing so can unlock a shorter code sequence. 13716 bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32; 13717 bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32; 13718 13719 bool MulSignedLo = false; 13720 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) { 13721 MulSignedLo = numBitsSigned(MulLHS, DAG) <= 32 && 13722 numBitsSigned(MulRHS, DAG) <= 32; 13723 } 13724 13725 // The operands and final result all have the same number of bits. If 13726 // operands need to be extended, they can be extended with garbage. The 13727 // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is 13728 // truncated away in the end. 13729 if (VT != MVT::i64) { 13730 MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS); 13731 MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS); 13732 AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS); 13733 } 13734 13735 // The basic code generated is conceptually straightforward. Pseudo code: 13736 // 13737 // accum = mad_64_32 lhs.lo, rhs.lo, accum 13738 // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi 13739 // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi 13740 // 13741 // The second and third lines are optional, depending on whether the factors 13742 // are {sign,zero}-extended or not. 13743 // 13744 // The actual DAG is noisier than the pseudo code, but only due to 13745 // instructions that disassemble values into low and high parts, and 13746 // assemble the final result. 
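  // As a worked version of the pseudo code above: writing lhs as
  // lhs.hi * 2^32 + lhs.lo (and rhs likewise), then modulo 2^64
  //   lhs * rhs + accum = lhs.lo * rhs.lo + accum
  //                       + (lhs.hi * rhs.lo + lhs.lo * rhs.hi) * 2^32,
  // so the mad_64_32 covers the first two terms and the optional 32-bit
  // multiplies below add the remaining high-half contributions.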
  SDValue One = DAG.getConstant(1, SL, MVT::i32);

  auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
  auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS);
  SDValue Accum =
      getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);

  if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
    SDValue AccumLo, AccumHi;
    std::tie(AccumLo, AccumHi) = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);

    if (!MulLHSUnsigned32) {
      auto MulLHSHi =
          DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One);
      SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo);
      AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
    }

    if (!MulRHSUnsigned32) {
      auto MulRHSHi =
          DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One);
      SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi);
      AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
    }

    Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi});
    Accum = DAG.getBitcast(MVT::i64, Accum);
  }

  if (VT != MVT::i64)
    Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum);
  return Accum;
}

// Collect the ultimate src of each of the mul node's operands, and confirm
// each operand is at most 8 bits.
static std::optional<ByteProvider<SDValue>>
handleMulOperand(const SDValue &MulOperand) {
  auto Byte0 = calculateByteProvider(MulOperand, 0, 0);
  if (!Byte0 || Byte0->isConstantZero()) {
    return std::nullopt;
  }
  auto Byte1 = calculateByteProvider(MulOperand, 1, 0);
  if (Byte1 && !Byte1->isConstantZero()) {
    return std::nullopt;
  }
  return Byte0;
}

static unsigned addPermMasks(unsigned First, unsigned Second) {
  unsigned FirstCs = First & 0x0c0c0c0c;
  unsigned SecondCs = Second & 0x0c0c0c0c;
  unsigned FirstNoCs = First & ~0x0c0c0c0c;
  unsigned SecondNoCs = Second & ~0x0c0c0c0c;

  assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
  assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
  assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
  assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));

  return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
}

struct DotSrc {
  SDValue SrcOp;
  int64_t PermMask;
  int64_t DWordOffset;
};

static void placeSources(ByteProvider<SDValue> &Src0,
                         ByteProvider<SDValue> &Src1,
                         SmallVectorImpl<DotSrc> &Src0s,
                         SmallVectorImpl<DotSrc> &Src1s, int Step) {

  assert(Src0.Src.has_value() && Src1.Src.has_value());
  // Src0s and Src1s are empty, just place arbitrarily.
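  // For illustration (hypothetical offsets): at Step == 0 a byte provider with
  // SrcOffset == 6 is recorded with DWordOffset 1 and PermMask 0x020c0c0c,
  // i.e. only the most significant mask lane is populated so far. Later steps
  // fill in the remaining lanes, and fixMasks() shifts the masks of chains
  // shorter than 4.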
  if (Step == 0) {
    Src0s.push_back({*Src0.Src, ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
                     Src0.SrcOffset / 4});
    Src1s.push_back({*Src1.Src, ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
                     Src1.SrcOffset / 4});
    return;
  }

  for (int BPI = 0; BPI < 2; BPI++) {
    std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
    if (BPI == 1) {
      BPP = {Src1, Src0};
    }
    unsigned ZeroMask = 0x0c0c0c0c;
    unsigned FMask = 0xFF << (8 * (3 - Step));

    unsigned FirstMask =
        (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
    unsigned SecondMask =
        (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
    // Attempt to find Src vector which contains our SDValue, if so, add our
    // perm mask to the existing one. If we are unable to find a match for the
    // first SDValue, attempt to find match for the second.
    int FirstGroup = -1;
    for (int I = 0; I < 2; I++) {
      SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
      auto MatchesFirst = [&BPP](DotSrc &IterElt) {
        return IterElt.SrcOp == *BPP.first.Src &&
               (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
      };

      auto Match = llvm::find_if(Srcs, MatchesFirst);
      if (Match != Srcs.end()) {
        Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
        FirstGroup = I;
        break;
      }
    }
    if (FirstGroup != -1) {
      SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
      auto MatchesSecond = [&BPP](DotSrc &IterElt) {
        return IterElt.SrcOp == *BPP.second.Src &&
               (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
      };
      auto Match = llvm::find_if(Srcs, MatchesSecond);
      if (Match != Srcs.end()) {
        Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
      } else
        Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
      return;
    }
  }

  // If we have made it here, then we could not find a match in Src0s or Src1s
  // for either Src0 or Src1, so just place them arbitrarily.

  unsigned ZeroMask = 0x0c0c0c0c;
  unsigned FMask = 0xFF << (8 * (3 - Step));

  Src0s.push_back(
      {*Src0.Src,
       ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
       Src0.SrcOffset / 4});
  Src1s.push_back(
      {*Src1.Src,
       ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
       Src1.SrcOffset / 4});

  return;
}

static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL,
                              SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
                              bool IsAny) {

  // If we just have one source, just permute it accordingly.
  if (Srcs.size() == 1) {
    auto Elt = Srcs.begin();
    auto EltOp = getDWordFromOffset(DAG, SL, Elt->SrcOp, Elt->DWordOffset);

    // v_perm will produce the original value
    if (Elt->PermMask == 0x3020100)
      return EltOp;

    return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
                       DAG.getConstant(Elt->PermMask, SL, MVT::i32));
  }

  auto FirstElt = Srcs.begin();
  auto SecondElt = std::next(FirstElt);

  SmallVector<SDValue, 2> Perms;

  // If we have multiple sources in the chain, combine them via perms (using
  // calculated perm mask) and Ors.
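  // Hypothetical example of one iteration below: a FirstElt mask of 0x0c0c0100
  // (result bytes 0 and 1 taken from FirstVal) is rebased to 0x0c0c0504 so
  // that those lanes address the first operand of the new perm, and merging it
  // with a SecondElt mask of 0x03020c0c via addPermMasks gives 0x03020504.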
  while (true) {
    auto FirstMask = FirstElt->PermMask;
    auto SecondMask = SecondElt->PermMask;

    unsigned FirstCs = FirstMask & 0x0c0c0c0c;
    unsigned FirstPlusFour = FirstMask | 0x04040404;
    // 0x0c + 0x04 = 0x10, so ANDing with 0x0F will produce 0x00 for any
    // original 0x0C.
    FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;

    auto PermMask = addPermMasks(FirstMask, SecondMask);
    auto FirstVal =
        getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
    auto SecondVal =
        getDWordFromOffset(DAG, SL, SecondElt->SrcOp, SecondElt->DWordOffset);

    Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
                                SecondVal,
                                DAG.getConstant(PermMask, SL, MVT::i32)));

    FirstElt = std::next(SecondElt);
    if (FirstElt == Srcs.end())
      break;

    SecondElt = std::next(FirstElt);
    // If we only have a FirstElt, then just combine that into the cumulative
    // source node.
    if (SecondElt == Srcs.end()) {
      auto EltOp =
          getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);

      Perms.push_back(
          DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
                      DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
      break;
    }
  }

  assert(Perms.size() == 1 || Perms.size() == 2);
  return Perms.size() == 2
             ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1])
             : Perms[0];
}

static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
  for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
    EntryMask = EntryMask >> ((4 - ChainLength) * 8);
    auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
    EntryMask += ZeroMask;
  }
}

static bool isMul(const SDValue Op) {
  auto Opcode = Op.getOpcode();

  return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
          Opcode == AMDGPUISD::MUL_I24);
}

static std::optional<bool>
checkDot4MulSignedness(const SDValue &N, ByteProvider<SDValue> &Src0,
                       ByteProvider<SDValue> &Src1, const SDValue &S0Op,
                       const SDValue &S1Op, const SelectionDAG &DAG) {
  // If both ops are i8s (pre legalize-dag), then the signedness semantics
  // of the dot4 is irrelevant.
  if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
    return false;

  auto Known0 = DAG.computeKnownBits(S0Op, 0);
  bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
  bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
  auto Known1 = DAG.computeKnownBits(S1Op, 0);
  bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
  bool S1IsSigned = Known1.countMinLeadingOnes() > 0;

  assert(!(S0IsUnsigned && S0IsSigned));
  assert(!(S1IsUnsigned && S1IsSigned));

  // There are 9 possible permutations of
  // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}

  // In two permutations, the sign bits are known to be the same for both Ops,
  // so simply return Signed / Unsigned corresponding to the MSB

  if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
    return S0IsSigned;

  // In another two permutations, the sign bits are known to be opposite. In
  // this case return std::nullopt to indicate a bad match.
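  // For example (hypothetical operands): if S0 is known non-negative (say it
  // was zero extended) while S1 is known negative, neither amdgcn_sdot4 nor
  // amdgcn_udot4 treats the two operands consistently with the original muls,
  // so the combine has to be abandoned for this chain.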

  if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
    return std::nullopt;

  // In the remaining five permutations, we don't know the value of the sign
  // bit for at least one Op. Since we have a valid ByteProvider, we know that
  // the upper bits must be extension bits. Thus, the only ways for the sign
  // bit to be unknown are if it was sign extended from an unknown value, or if
  // it was any extended. In either case, it is correct to use the signed
  // version of the signedness semantics of dot4.

  // In two of such permutations, we know the sign bit is set for
  // one op, and the other is unknown. It is okay to use the signed version of
  // dot4.
  if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
      ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
    return true;

  // In one such permutation, we don't know either of the sign bits. It is okay
  // to use the signed version of dot4.
  if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
    return true;

  // In two of such permutations, we know the sign bit is unset for
  // one op, and the other is unknown. Return std::nullopt to indicate a
  // bad match.
  if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
      ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
    return std::nullopt;

  llvm_unreachable("Fully covered condition");
}

SDValue SITargetLowering::performAddCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  SDLoc SL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
    if (Subtarget->hasMad64_32()) {
      if (SDValue Folded = tryFoldToMad64_32(N, DCI))
        return Folded;
    }
  }

  if (SDValue V = reassociateScalarOps(N, DAG)) {
    return V;
  }

  if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
      (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
    SDValue TempNode(N, 0);
    std::optional<bool> IsSigned;
    SmallVector<DotSrc, 4> Src0s;
    SmallVector<DotSrc, 4> Src1s;
    SmallVector<SDValue, 4> Src2s;

    // Match the v_dot4 tree, while collecting src nodes.
    int ChainLength = 0;
    for (int I = 0; I < 4; I++) {
      auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
      if (MulIdx == -1)
        break;
      auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
      if (!Src0)
        break;
      auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
      if (!Src1)
        break;

      auto IterIsSigned = checkDot4MulSignedness(
          TempNode->getOperand(MulIdx), *Src0, *Src1,
          TempNode->getOperand(MulIdx)->getOperand(0),
          TempNode->getOperand(MulIdx)->getOperand(1), DAG);
      if (!IterIsSigned)
        break;
      if (!IsSigned)
        IsSigned = *IterIsSigned;
      if (*IterIsSigned != *IsSigned)
        break;
      placeSources(*Src0, *Src1, Src0s, Src1s, I);
      auto AddIdx = 1 - MulIdx;
      // Allow the special case where add (add (mul24, 0), mul24) became ->
      // add (mul24, mul24).
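      // For illustration, the full chain being matched has the shape
      //   add(mul(a0, b0), add(mul(a1, b1), add(mul(a2, b2), mul(a3, b3))))
      // with one byte pair collected per iteration. When that special case is
      // hit, the remaining addend is itself a mul rather than another add, so
      // both muls are consumed in this step and a zero accumulator is recorded
      // for Src2.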
14094 if (I == 2 && isMul(TempNode->getOperand(AddIdx))) { 14095 Src2s.push_back(TempNode->getOperand(AddIdx)); 14096 auto Src0 = 14097 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0)); 14098 if (!Src0) 14099 break; 14100 auto Src1 = 14101 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1)); 14102 if (!Src1) 14103 break; 14104 auto IterIsSigned = checkDot4MulSignedness( 14105 TempNode->getOperand(AddIdx), *Src0, *Src1, 14106 TempNode->getOperand(AddIdx)->getOperand(0), 14107 TempNode->getOperand(AddIdx)->getOperand(1), DAG); 14108 if (!IterIsSigned) 14109 break; 14110 assert(IsSigned); 14111 if (*IterIsSigned != *IsSigned) 14112 break; 14113 placeSources(*Src0, *Src1, Src0s, Src1s, I + 1); 14114 Src2s.push_back(DAG.getConstant(0, SL, MVT::i32)); 14115 ChainLength = I + 2; 14116 break; 14117 } 14118 14119 TempNode = TempNode->getOperand(AddIdx); 14120 Src2s.push_back(TempNode); 14121 ChainLength = I + 1; 14122 if (TempNode->getNumOperands() < 2) 14123 break; 14124 LHS = TempNode->getOperand(0); 14125 RHS = TempNode->getOperand(1); 14126 } 14127 14128 if (ChainLength < 2) 14129 return SDValue(); 14130 14131 // Masks were constructed with assumption that we would find a chain of 14132 // length 4. If not, then we need to 0 out the MSB bits (via perm mask of 14133 // 0x0c) so they do not affect dot calculation. 14134 if (ChainLength < 4) { 14135 fixMasks(Src0s, ChainLength); 14136 fixMasks(Src1s, ChainLength); 14137 } 14138 14139 SDValue Src0, Src1; 14140 14141 // If we are just using a single source for both, and have permuted the 14142 // bytes consistently, we can just use the sources without permuting 14143 // (commutation). 14144 bool UseOriginalSrc = false; 14145 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 && 14146 Src0s.begin()->PermMask == Src1s.begin()->PermMask && 14147 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 && 14148 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) { 14149 SmallVector<unsigned, 4> SrcBytes; 14150 auto Src0Mask = Src0s.begin()->PermMask; 14151 SrcBytes.push_back(Src0Mask & 0xFF000000); 14152 bool UniqueEntries = true; 14153 for (auto I = 1; I < 4; I++) { 14154 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8)); 14155 14156 if (is_contained(SrcBytes, NextByte)) { 14157 UniqueEntries = false; 14158 break; 14159 } 14160 SrcBytes.push_back(NextByte); 14161 } 14162 14163 if (UniqueEntries) { 14164 UseOriginalSrc = true; 14165 14166 auto FirstElt = Src0s.begin(); 14167 auto FirstEltOp = 14168 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset); 14169 14170 auto SecondElt = Src1s.begin(); 14171 auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp, 14172 SecondElt->DWordOffset); 14173 14174 Src0 = DAG.getBitcastedAnyExtOrTrunc(FirstEltOp, SL, 14175 MVT::getIntegerVT(32)); 14176 Src1 = DAG.getBitcastedAnyExtOrTrunc(SecondEltOp, SL, 14177 MVT::getIntegerVT(32)); 14178 } 14179 } 14180 14181 if (!UseOriginalSrc) { 14182 Src0 = resolveSources(DAG, SL, Src0s, false, true); 14183 Src1 = resolveSources(DAG, SL, Src1s, false, true); 14184 } 14185 14186 assert(IsSigned); 14187 SDValue Src2 = 14188 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32); 14189 14190 SDValue IID = DAG.getTargetConstant(*IsSigned ? 
Intrinsic::amdgcn_sdot4 14191 : Intrinsic::amdgcn_udot4, 14192 SL, MVT::i64); 14193 14194 assert(!VT.isVector()); 14195 auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0, 14196 Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1)); 14197 14198 return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT); 14199 } 14200 14201 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG()) 14202 return SDValue(); 14203 14204 // add x, zext (setcc) => uaddo_carry x, 0, setcc 14205 // add x, sext (setcc) => usubo_carry x, 0, setcc 14206 unsigned Opc = LHS.getOpcode(); 14207 if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND || 14208 Opc == ISD::ANY_EXTEND || Opc == ISD::UADDO_CARRY) 14209 std::swap(RHS, LHS); 14210 14211 Opc = RHS.getOpcode(); 14212 switch (Opc) { 14213 default: break; 14214 case ISD::ZERO_EXTEND: 14215 case ISD::SIGN_EXTEND: 14216 case ISD::ANY_EXTEND: { 14217 auto Cond = RHS.getOperand(0); 14218 // If this won't be a real VOPC output, we would still need to insert an 14219 // extra instruction anyway. 14220 if (!isBoolSGPR(Cond)) 14221 break; 14222 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1); 14223 SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond }; 14224 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::USUBO_CARRY : ISD::UADDO_CARRY; 14225 return DAG.getNode(Opc, SL, VTList, Args); 14226 } 14227 case ISD::UADDO_CARRY: { 14228 // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc 14229 if (!isNullConstant(RHS.getOperand(1))) 14230 break; 14231 SDValue Args[] = { LHS, RHS.getOperand(0), RHS.getOperand(2) }; 14232 return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args); 14233 } 14234 } 14235 return SDValue(); 14236 } 14237 14238 SDValue SITargetLowering::performSubCombine(SDNode *N, 14239 DAGCombinerInfo &DCI) const { 14240 SelectionDAG &DAG = DCI.DAG; 14241 EVT VT = N->getValueType(0); 14242 14243 if (VT != MVT::i32) 14244 return SDValue(); 14245 14246 SDLoc SL(N); 14247 SDValue LHS = N->getOperand(0); 14248 SDValue RHS = N->getOperand(1); 14249 14250 // sub x, zext (setcc) => usubo_carry x, 0, setcc 14251 // sub x, sext (setcc) => uaddo_carry x, 0, setcc 14252 unsigned Opc = RHS.getOpcode(); 14253 switch (Opc) { 14254 default: break; 14255 case ISD::ZERO_EXTEND: 14256 case ISD::SIGN_EXTEND: 14257 case ISD::ANY_EXTEND: { 14258 auto Cond = RHS.getOperand(0); 14259 // If this won't be a real VOPC output, we would still need to insert an 14260 // extra instruction anyway. 14261 if (!isBoolSGPR(Cond)) 14262 break; 14263 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1); 14264 SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond }; 14265 Opc = (Opc == ISD::SIGN_EXTEND) ? 
ISD::UADDO_CARRY : ISD::USUBO_CARRY; 14266 return DAG.getNode(Opc, SL, VTList, Args); 14267 } 14268 } 14269 14270 if (LHS.getOpcode() == ISD::USUBO_CARRY) { 14271 // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc 14272 if (!isNullConstant(LHS.getOperand(1))) 14273 return SDValue(); 14274 SDValue Args[] = { LHS.getOperand(0), RHS, LHS.getOperand(2) }; 14275 return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args); 14276 } 14277 return SDValue(); 14278 } 14279 14280 SDValue SITargetLowering::performAddCarrySubCarryCombine(SDNode *N, 14281 DAGCombinerInfo &DCI) const { 14282 14283 if (N->getValueType(0) != MVT::i32) 14284 return SDValue(); 14285 14286 if (!isNullConstant(N->getOperand(1))) 14287 return SDValue(); 14288 14289 SelectionDAG &DAG = DCI.DAG; 14290 SDValue LHS = N->getOperand(0); 14291 14292 // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc 14293 // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc 14294 unsigned LHSOpc = LHS.getOpcode(); 14295 unsigned Opc = N->getOpcode(); 14296 if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) || 14297 (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) { 14298 SDValue Args[] = { LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2) }; 14299 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args); 14300 } 14301 return SDValue(); 14302 } 14303 14304 SDValue SITargetLowering::performFAddCombine(SDNode *N, 14305 DAGCombinerInfo &DCI) const { 14306 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) 14307 return SDValue(); 14308 14309 SelectionDAG &DAG = DCI.DAG; 14310 EVT VT = N->getValueType(0); 14311 14312 SDLoc SL(N); 14313 SDValue LHS = N->getOperand(0); 14314 SDValue RHS = N->getOperand(1); 14315 14316 // These should really be instruction patterns, but writing patterns with 14317 // source modifiers is a pain. 14318 14319 // fadd (fadd (a, a), b) -> mad 2.0, a, b 14320 if (LHS.getOpcode() == ISD::FADD) { 14321 SDValue A = LHS.getOperand(0); 14322 if (A == LHS.getOperand(1)) { 14323 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode()); 14324 if (FusedOp != 0) { 14325 const SDValue Two = DAG.getConstantFP(2.0, SL, VT); 14326 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS); 14327 } 14328 } 14329 } 14330 14331 // fadd (b, fadd (a, a)) -> mad 2.0, a, b 14332 if (RHS.getOpcode() == ISD::FADD) { 14333 SDValue A = RHS.getOperand(0); 14334 if (A == RHS.getOperand(1)) { 14335 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode()); 14336 if (FusedOp != 0) { 14337 const SDValue Two = DAG.getConstantFP(2.0, SL, VT); 14338 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS); 14339 } 14340 } 14341 } 14342 14343 return SDValue(); 14344 } 14345 14346 SDValue SITargetLowering::performFSubCombine(SDNode *N, 14347 DAGCombinerInfo &DCI) const { 14348 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) 14349 return SDValue(); 14350 14351 SelectionDAG &DAG = DCI.DAG; 14352 SDLoc SL(N); 14353 EVT VT = N->getValueType(0); 14354 assert(!VT.isVector()); 14355 14356 // Try to get the fneg to fold into the source modifier. This undoes generic 14357 // DAG combines and folds them into the mad. 14358 // 14359 // Only do this if we are not trying to support denormals. v_mad_f32 does 14360 // not support denormals ever. 
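  // For example, (fsub (fadd a, a), c) becomes (mad a, 2.0, (fneg c)) below;
  // the explicit fneg is then free because it can be folded into a source
  // modifier on the fused operand rather than emitted as a separate
  // instruction. Which fused opcode is used (a MAD or FMA variant) is decided
  // by getFusedOpcode from the subtarget and the nodes' flags.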
14361 SDValue LHS = N->getOperand(0); 14362 SDValue RHS = N->getOperand(1); 14363 if (LHS.getOpcode() == ISD::FADD) { 14364 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c) 14365 SDValue A = LHS.getOperand(0); 14366 if (A == LHS.getOperand(1)) { 14367 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode()); 14368 if (FusedOp != 0){ 14369 const SDValue Two = DAG.getConstantFP(2.0, SL, VT); 14370 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); 14371 14372 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS); 14373 } 14374 } 14375 } 14376 14377 if (RHS.getOpcode() == ISD::FADD) { 14378 // (fsub c, (fadd a, a)) -> mad -2.0, a, c 14379 14380 SDValue A = RHS.getOperand(0); 14381 if (A == RHS.getOperand(1)) { 14382 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode()); 14383 if (FusedOp != 0){ 14384 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT); 14385 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS); 14386 } 14387 } 14388 } 14389 14390 return SDValue(); 14391 } 14392 14393 SDValue SITargetLowering::performFDivCombine(SDNode *N, 14394 DAGCombinerInfo &DCI) const { 14395 SelectionDAG &DAG = DCI.DAG; 14396 SDLoc SL(N); 14397 EVT VT = N->getValueType(0); 14398 if (VT != MVT::f16 || !Subtarget->has16BitInsts()) 14399 return SDValue(); 14400 14401 SDValue LHS = N->getOperand(0); 14402 SDValue RHS = N->getOperand(1); 14403 14404 SDNodeFlags Flags = N->getFlags(); 14405 SDNodeFlags RHSFlags = RHS->getFlags(); 14406 if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() || 14407 !RHS->hasOneUse()) 14408 return SDValue(); 14409 14410 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) { 14411 bool IsNegative = false; 14412 if (CLHS->isExactlyValue(1.0) || 14413 (IsNegative = CLHS->isExactlyValue(-1.0))) { 14414 // fdiv contract 1.0, (sqrt contract x) -> rsq for f16 14415 // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16 14416 if (RHS.getOpcode() == ISD::FSQRT) { 14417 // TODO: Or in RHS flags, somehow missing from SDNodeFlags 14418 SDValue Rsq = 14419 DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags); 14420 return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq; 14421 } 14422 } 14423 } 14424 14425 return SDValue(); 14426 } 14427 14428 SDValue SITargetLowering::performFMACombine(SDNode *N, 14429 DAGCombinerInfo &DCI) const { 14430 SelectionDAG &DAG = DCI.DAG; 14431 EVT VT = N->getValueType(0); 14432 SDLoc SL(N); 14433 14434 if (!Subtarget->hasDot7Insts() || VT != MVT::f32) 14435 return SDValue(); 14436 14437 // FMA((F32)S0.x, (F32)S1. x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) -> 14438 // FDOT2((V2F16)S0, (V2F16)S1, (F32)z)) 14439 SDValue Op1 = N->getOperand(0); 14440 SDValue Op2 = N->getOperand(1); 14441 SDValue FMA = N->getOperand(2); 14442 14443 if (FMA.getOpcode() != ISD::FMA || 14444 Op1.getOpcode() != ISD::FP_EXTEND || 14445 Op2.getOpcode() != ISD::FP_EXTEND) 14446 return SDValue(); 14447 14448 // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero, 14449 // regardless of the denorm mode setting. Therefore, 14450 // unsafe-fp-math/fp-contract is sufficient to allow generating fdot2. 
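  // Concretely, the pattern being matched below looks like (sketch):
  //   fma(fpext(extractelt v2f16:a, 0), fpext(extractelt v2f16:b, 0),
  //       fma(fpext(extractelt v2f16:a, 1), fpext(extractelt v2f16:b, 1), z))
  // i.e. both f16 lanes of the same two vectors multiplied and accumulated
  // into z, which is what the FDOT2 node computes. The index and vector
  // equality checks below enforce this shape (the two lanes must differ, and
  // the same two vectors must appear in either order).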
14451 const TargetOptions &Options = DAG.getTarget().Options; 14452 if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath || 14453 (N->getFlags().hasAllowContract() && 14454 FMA->getFlags().hasAllowContract())) { 14455 Op1 = Op1.getOperand(0); 14456 Op2 = Op2.getOperand(0); 14457 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT || 14458 Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT) 14459 return SDValue(); 14460 14461 SDValue Vec1 = Op1.getOperand(0); 14462 SDValue Idx1 = Op1.getOperand(1); 14463 SDValue Vec2 = Op2.getOperand(0); 14464 14465 SDValue FMAOp1 = FMA.getOperand(0); 14466 SDValue FMAOp2 = FMA.getOperand(1); 14467 SDValue FMAAcc = FMA.getOperand(2); 14468 14469 if (FMAOp1.getOpcode() != ISD::FP_EXTEND || 14470 FMAOp2.getOpcode() != ISD::FP_EXTEND) 14471 return SDValue(); 14472 14473 FMAOp1 = FMAOp1.getOperand(0); 14474 FMAOp2 = FMAOp2.getOperand(0); 14475 if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT || 14476 FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT) 14477 return SDValue(); 14478 14479 SDValue Vec3 = FMAOp1.getOperand(0); 14480 SDValue Vec4 = FMAOp2.getOperand(0); 14481 SDValue Idx2 = FMAOp1.getOperand(1); 14482 14483 if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) || 14484 // Idx1 and Idx2 cannot be the same. 14485 Idx1 == Idx2) 14486 return SDValue(); 14487 14488 if (Vec1 == Vec2 || Vec3 == Vec4) 14489 return SDValue(); 14490 14491 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16) 14492 return SDValue(); 14493 14494 if ((Vec1 == Vec3 && Vec2 == Vec4) || 14495 (Vec1 == Vec4 && Vec2 == Vec3)) { 14496 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc, 14497 DAG.getTargetConstant(0, SL, MVT::i1)); 14498 } 14499 } 14500 return SDValue(); 14501 } 14502 14503 SDValue SITargetLowering::performSetCCCombine(SDNode *N, 14504 DAGCombinerInfo &DCI) const { 14505 SelectionDAG &DAG = DCI.DAG; 14506 SDLoc SL(N); 14507 14508 SDValue LHS = N->getOperand(0); 14509 SDValue RHS = N->getOperand(1); 14510 EVT VT = LHS.getValueType(); 14511 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); 14512 14513 auto CRHS = dyn_cast<ConstantSDNode>(RHS); 14514 if (!CRHS) { 14515 CRHS = dyn_cast<ConstantSDNode>(LHS); 14516 if (CRHS) { 14517 std::swap(LHS, RHS); 14518 CC = getSetCCSwappedOperands(CC); 14519 } 14520 } 14521 14522 if (CRHS) { 14523 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND && 14524 isBoolSGPR(LHS.getOperand(0))) { 14525 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1 14526 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc 14527 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1 14528 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc 14529 if ((CRHS->isAllOnes() && 14530 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) || 14531 (CRHS->isZero() && 14532 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE))) 14533 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0), 14534 DAG.getConstant(-1, SL, MVT::i1)); 14535 if ((CRHS->isAllOnes() && 14536 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) || 14537 (CRHS->isZero() && 14538 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT))) 14539 return LHS.getOperand(0); 14540 } 14541 14542 const APInt &CRHSVal = CRHS->getAPIntValue(); 14543 if ((CC == ISD::SETEQ || CC == ISD::SETNE) && 14544 LHS.getOpcode() == ISD::SELECT && 14545 isa<ConstantSDNode>(LHS.getOperand(1)) && 14546 isa<ConstantSDNode>(LHS.getOperand(2)) && 14547 LHS.getConstantOperandVal(1) != 
LHS.getConstantOperandVal(2) && 14548 isBoolSGPR(LHS.getOperand(0))) { 14549 // Given CT != FT: 14550 // setcc (select cc, CT, CF), CF, eq => xor cc, -1 14551 // setcc (select cc, CT, CF), CF, ne => cc 14552 // setcc (select cc, CT, CF), CT, ne => xor cc, -1 14553 // setcc (select cc, CT, CF), CT, eq => cc 14554 const APInt &CT = LHS.getConstantOperandAPInt(1); 14555 const APInt &CF = LHS.getConstantOperandAPInt(2); 14556 14557 if ((CF == CRHSVal && CC == ISD::SETEQ) || 14558 (CT == CRHSVal && CC == ISD::SETNE)) 14559 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0), 14560 DAG.getConstant(-1, SL, MVT::i1)); 14561 if ((CF == CRHSVal && CC == ISD::SETNE) || 14562 (CT == CRHSVal && CC == ISD::SETEQ)) 14563 return LHS.getOperand(0); 14564 } 14565 } 14566 14567 if (VT != MVT::f32 && VT != MVT::f64 && 14568 (!Subtarget->has16BitInsts() || VT != MVT::f16)) 14569 return SDValue(); 14570 14571 // Match isinf/isfinite pattern 14572 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity)) 14573 // (fcmp one (fabs x), inf) -> (fp_class x, 14574 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero) 14575 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) && LHS.getOpcode() == ISD::FABS) { 14576 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS); 14577 if (!CRHS) 14578 return SDValue(); 14579 14580 const APFloat &APF = CRHS->getValueAPF(); 14581 if (APF.isInfinity() && !APF.isNegative()) { 14582 const unsigned IsInfMask = SIInstrFlags::P_INFINITY | 14583 SIInstrFlags::N_INFINITY; 14584 const unsigned IsFiniteMask = SIInstrFlags::N_ZERO | 14585 SIInstrFlags::P_ZERO | 14586 SIInstrFlags::N_NORMAL | 14587 SIInstrFlags::P_NORMAL | 14588 SIInstrFlags::N_SUBNORMAL | 14589 SIInstrFlags::P_SUBNORMAL; 14590 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask; 14591 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0), 14592 DAG.getConstant(Mask, SL, MVT::i32)); 14593 } 14594 } 14595 14596 return SDValue(); 14597 } 14598 14599 SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N, 14600 DAGCombinerInfo &DCI) const { 14601 SelectionDAG &DAG = DCI.DAG; 14602 SDLoc SL(N); 14603 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0; 14604 14605 SDValue Src = N->getOperand(0); 14606 SDValue Shift = N->getOperand(0); 14607 14608 // TODO: Extend type shouldn't matter (assuming legal types). 
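  // Worked example for the fold below: for cvt_f32_ubyte1 (srl x, 16), Offset
  // is 1, so ShiftOffset = 8 * 1 + 16 = 24; that is a multiple of 8 and less
  // than 32, so the node becomes cvt_f32_ubyte3 x, reading the same byte
  // without the shift.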
14609 if (Shift.getOpcode() == ISD::ZERO_EXTEND) 14610 Shift = Shift.getOperand(0); 14611 14612 if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) { 14613 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x 14614 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x 14615 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x 14616 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x 14617 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x 14618 if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) { 14619 SDValue Shifted = DAG.getZExtOrTrunc(Shift.getOperand(0), 14620 SDLoc(Shift.getOperand(0)), MVT::i32); 14621 14622 unsigned ShiftOffset = 8 * Offset; 14623 if (Shift.getOpcode() == ISD::SHL) 14624 ShiftOffset -= C->getZExtValue(); 14625 else 14626 ShiftOffset += C->getZExtValue(); 14627 14628 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) { 14629 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL, 14630 MVT::f32, Shifted); 14631 } 14632 } 14633 } 14634 14635 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 14636 APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8); 14637 if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) { 14638 // We simplified Src. If this node is not dead, visit it again so it is 14639 // folded properly. 14640 if (N->getOpcode() != ISD::DELETED_NODE) 14641 DCI.AddToWorklist(N); 14642 return SDValue(N, 0); 14643 } 14644 14645 // Handle (or x, (srl y, 8)) pattern when known bits are zero. 14646 if (SDValue DemandedSrc = 14647 TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG)) 14648 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc); 14649 14650 return SDValue(); 14651 } 14652 14653 SDValue SITargetLowering::performClampCombine(SDNode *N, 14654 DAGCombinerInfo &DCI) const { 14655 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0)); 14656 if (!CSrc) 14657 return SDValue(); 14658 14659 const MachineFunction &MF = DCI.DAG.getMachineFunction(); 14660 const APFloat &F = CSrc->getValueAPF(); 14661 APFloat Zero = APFloat::getZero(F.getSemantics()); 14662 if (F < Zero || 14663 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) { 14664 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0)); 14665 } 14666 14667 APFloat One(F.getSemantics(), "1.0"); 14668 if (F > One) 14669 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0)); 14670 14671 return SDValue(CSrc, 0); 14672 } 14673 14674 14675 SDValue SITargetLowering::PerformDAGCombine(SDNode *N, 14676 DAGCombinerInfo &DCI) const { 14677 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None) 14678 return SDValue(); 14679 switch (N->getOpcode()) { 14680 case ISD::ADD: 14681 return performAddCombine(N, DCI); 14682 case ISD::SUB: 14683 return performSubCombine(N, DCI); 14684 case ISD::UADDO_CARRY: 14685 case ISD::USUBO_CARRY: 14686 return performAddCarrySubCarryCombine(N, DCI); 14687 case ISD::FADD: 14688 return performFAddCombine(N, DCI); 14689 case ISD::FSUB: 14690 return performFSubCombine(N, DCI); 14691 case ISD::FDIV: 14692 return performFDivCombine(N, DCI); 14693 case ISD::SETCC: 14694 return performSetCCCombine(N, DCI); 14695 case ISD::FMAXNUM: 14696 case ISD::FMINNUM: 14697 case ISD::FMAXNUM_IEEE: 14698 case ISD::FMINNUM_IEEE: 14699 case ISD::FMAXIMUM: 14700 case ISD::FMINIMUM: 14701 case ISD::SMAX: 14702 case ISD::SMIN: 14703 case ISD::UMAX: 14704 case ISD::UMIN: 14705 case AMDGPUISD::FMIN_LEGACY: 14706 case AMDGPUISD::FMAX_LEGACY: 14707 return performMinMaxCombine(N, DCI); 14708 case 
ISD::FMA: 14709 return performFMACombine(N, DCI); 14710 case ISD::AND: 14711 return performAndCombine(N, DCI); 14712 case ISD::OR: 14713 return performOrCombine(N, DCI); 14714 case ISD::FSHR: { 14715 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 14716 if (N->getValueType(0) == MVT::i32 && N->isDivergent() && 14717 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) { 14718 return matchPERM(N, DCI); 14719 } 14720 break; 14721 } 14722 case ISD::XOR: 14723 return performXorCombine(N, DCI); 14724 case ISD::ZERO_EXTEND: 14725 return performZeroExtendCombine(N, DCI); 14726 case ISD::SIGN_EXTEND_INREG: 14727 return performSignExtendInRegCombine(N , DCI); 14728 case AMDGPUISD::FP_CLASS: 14729 return performClassCombine(N, DCI); 14730 case ISD::FCANONICALIZE: 14731 return performFCanonicalizeCombine(N, DCI); 14732 case AMDGPUISD::RCP: 14733 return performRcpCombine(N, DCI); 14734 case ISD::FLDEXP: 14735 case AMDGPUISD::FRACT: 14736 case AMDGPUISD::RSQ: 14737 case AMDGPUISD::RCP_LEGACY: 14738 case AMDGPUISD::RCP_IFLAG: 14739 case AMDGPUISD::RSQ_CLAMP: { 14740 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted 14741 SDValue Src = N->getOperand(0); 14742 if (Src.isUndef()) 14743 return Src; 14744 break; 14745 } 14746 case ISD::SINT_TO_FP: 14747 case ISD::UINT_TO_FP: 14748 return performUCharToFloatCombine(N, DCI); 14749 case ISD::FCOPYSIGN: 14750 return performFCopySignCombine(N, DCI); 14751 case AMDGPUISD::CVT_F32_UBYTE0: 14752 case AMDGPUISD::CVT_F32_UBYTE1: 14753 case AMDGPUISD::CVT_F32_UBYTE2: 14754 case AMDGPUISD::CVT_F32_UBYTE3: 14755 return performCvtF32UByteNCombine(N, DCI); 14756 case AMDGPUISD::FMED3: 14757 return performFMed3Combine(N, DCI); 14758 case AMDGPUISD::CVT_PKRTZ_F16_F32: 14759 return performCvtPkRTZCombine(N, DCI); 14760 case AMDGPUISD::CLAMP: 14761 return performClampCombine(N, DCI); 14762 case ISD::SCALAR_TO_VECTOR: { 14763 SelectionDAG &DAG = DCI.DAG; 14764 EVT VT = N->getValueType(0); 14765 14766 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x)) 14767 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) { 14768 SDLoc SL(N); 14769 SDValue Src = N->getOperand(0); 14770 EVT EltVT = Src.getValueType(); 14771 if (EltVT != MVT::i16) 14772 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src); 14773 14774 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src); 14775 return DAG.getNode(ISD::BITCAST, SL, VT, Ext); 14776 } 14777 14778 break; 14779 } 14780 case ISD::EXTRACT_VECTOR_ELT: 14781 return performExtractVectorEltCombine(N, DCI); 14782 case ISD::INSERT_VECTOR_ELT: 14783 return performInsertVectorEltCombine(N, DCI); 14784 case ISD::FP_ROUND: 14785 return performFPRoundCombine(N, DCI); 14786 case ISD::LOAD: { 14787 if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI)) 14788 return Widened; 14789 [[fallthrough]]; 14790 } 14791 default: { 14792 if (!DCI.isBeforeLegalize()) { 14793 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N)) 14794 return performMemSDNodeCombine(MemNode, DCI); 14795 } 14796 14797 break; 14798 } 14799 } 14800 14801 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); 14802 } 14803 14804 /// Helper function for adjustWritemask 14805 static unsigned SubIdx2Lane(unsigned Idx) { 14806 switch (Idx) { 14807 default: return ~0u; 14808 case AMDGPU::sub0: return 0; 14809 case AMDGPU::sub1: return 1; 14810 case AMDGPU::sub2: return 2; 14811 case AMDGPU::sub3: return 3; 14812 case AMDGPU::sub4: return 4; // Possible with TFE/LWE 14813 } 14814 } 14815 14816 /// Adjust the writemask of MIMG, VIMAGE 
or VSAMPLE instructions 14817 SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node, 14818 SelectionDAG &DAG) const { 14819 unsigned Opcode = Node->getMachineOpcode(); 14820 14821 // Subtract 1 because the vdata output is not a MachineSDNode operand. 14822 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1; 14823 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx)) 14824 return Node; // not implemented for D16 14825 14826 SDNode *Users[5] = { nullptr }; 14827 unsigned Lane = 0; 14828 unsigned DmaskIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1; 14829 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx); 14830 unsigned NewDmask = 0; 14831 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1; 14832 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1; 14833 bool UsesTFC = ((int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) || 14834 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx))) 14835 ? true 14836 : false; 14837 unsigned TFCLane = 0; 14838 bool HasChain = Node->getNumValues() > 1; 14839 14840 if (OldDmask == 0) { 14841 // These are folded out, but on the chance it happens don't assert. 14842 return Node; 14843 } 14844 14845 unsigned OldBitsSet = llvm::popcount(OldDmask); 14846 // Work out which is the TFE/LWE lane if that is enabled. 14847 if (UsesTFC) { 14848 TFCLane = OldBitsSet; 14849 } 14850 14851 // Try to figure out the used register components 14852 for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end(); 14853 I != E; ++I) { 14854 14855 // Don't look at users of the chain. 14856 if (I.getUse().getResNo() != 0) 14857 continue; 14858 14859 // Abort if we can't understand the usage 14860 if (!I->isMachineOpcode() || 14861 I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG) 14862 return Node; 14863 14864 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used. 14865 // Note that subregs are packed, i.e. Lane==0 is the first bit set 14866 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit 14867 // set, etc. 14868 Lane = SubIdx2Lane(I->getConstantOperandVal(1)); 14869 if (Lane == ~0u) 14870 return Node; 14871 14872 // Check if the use is for the TFE/LWE generated result at VGPRn+1. 14873 if (UsesTFC && Lane == TFCLane) { 14874 Users[Lane] = *I; 14875 } else { 14876 // Set which texture component corresponds to the lane. 14877 unsigned Comp; 14878 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) { 14879 Comp = llvm::countr_zero(Dmask); 14880 Dmask &= ~(1 << Comp); 14881 } 14882 14883 // Abort if we have more than one user per component. 14884 if (Users[Lane]) 14885 return Node; 14886 14887 Users[Lane] = *I; 14888 NewDmask |= 1 << Comp; 14889 } 14890 } 14891 14892 // Don't allow 0 dmask, as hardware assumes one channel enabled. 14893 bool NoChannels = !NewDmask; 14894 if (NoChannels) { 14895 if (!UsesTFC) { 14896 // No uses of the result and not using TFC. Then do nothing. 
14897 return Node; 14898 } 14899 // If the original dmask has one channel - then nothing to do 14900 if (OldBitsSet == 1) 14901 return Node; 14902 // Use an arbitrary dmask - required for the instruction to work 14903 NewDmask = 1; 14904 } 14905 // Abort if there's no change 14906 if (NewDmask == OldDmask) 14907 return Node; 14908 14909 unsigned BitsSet = llvm::popcount(NewDmask); 14910 14911 // Check for TFE or LWE - increase the number of channels by one to account 14912 // for the extra return value 14913 // This will need adjustment for D16 if this is also included in 14914 // adjustWriteMask (this function) but at present D16 are excluded. 14915 unsigned NewChannels = BitsSet + UsesTFC; 14916 14917 int NewOpcode = 14918 AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels); 14919 assert(NewOpcode != -1 && 14920 NewOpcode != static_cast<int>(Node->getMachineOpcode()) && 14921 "failed to find equivalent MIMG op"); 14922 14923 // Adjust the writemask in the node 14924 SmallVector<SDValue, 12> Ops; 14925 Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx); 14926 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32)); 14927 Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end()); 14928 14929 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT(); 14930 14931 MVT ResultVT = NewChannels == 1 ? 14932 SVT : MVT::getVectorVT(SVT, NewChannels == 3 ? 4 : 14933 NewChannels == 5 ? 8 : NewChannels); 14934 SDVTList NewVTList = HasChain ? 14935 DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT); 14936 14937 14938 MachineSDNode *NewNode = DAG.getMachineNode(NewOpcode, SDLoc(Node), 14939 NewVTList, Ops); 14940 14941 if (HasChain) { 14942 // Update chain. 14943 DAG.setNodeMemRefs(NewNode, Node->memoperands()); 14944 DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1)); 14945 } 14946 14947 if (NewChannels == 1) { 14948 assert(Node->hasNUsesOfValue(1, 0)); 14949 SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY, 14950 SDLoc(Node), Users[Lane]->getValueType(0), 14951 SDValue(NewNode, 0)); 14952 DAG.ReplaceAllUsesWith(Users[Lane], Copy); 14953 return nullptr; 14954 } 14955 14956 // Update the users of the node with the new indices 14957 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) { 14958 SDNode *User = Users[i]; 14959 if (!User) { 14960 // Handle the special case of NoChannels. We set NewDmask to 1 above, but 14961 // Users[0] is still nullptr because channel 0 doesn't really have a use. 14962 if (i || !NoChannels) 14963 continue; 14964 } else { 14965 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32); 14966 SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op); 14967 if (NewUser != User) { 14968 DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0)); 14969 DAG.RemoveDeadNode(User); 14970 } 14971 } 14972 14973 switch (Idx) { 14974 default: break; 14975 case AMDGPU::sub0: Idx = AMDGPU::sub1; break; 14976 case AMDGPU::sub1: Idx = AMDGPU::sub2; break; 14977 case AMDGPU::sub2: Idx = AMDGPU::sub3; break; 14978 case AMDGPU::sub3: Idx = AMDGPU::sub4; break; 14979 } 14980 } 14981 14982 DAG.RemoveDeadNode(Node); 14983 return nullptr; 14984 } 14985 14986 static bool isFrameIndexOp(SDValue Op) { 14987 if (Op.getOpcode() == ISD::AssertZext) 14988 Op = Op.getOperand(0); 14989 14990 return isa<FrameIndexSDNode>(Op); 14991 } 14992 14993 /// Legalize target independent instructions (e.g. INSERT_SUBREG) 14994 /// with frame index operands. 
14995 /// LLVM assumes that inputs are to these instructions are registers. 14996 SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node, 14997 SelectionDAG &DAG) const { 14998 if (Node->getOpcode() == ISD::CopyToReg) { 14999 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1)); 15000 SDValue SrcVal = Node->getOperand(2); 15001 15002 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have 15003 // to try understanding copies to physical registers. 15004 if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) { 15005 SDLoc SL(Node); 15006 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); 15007 SDValue VReg = DAG.getRegister( 15008 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1); 15009 15010 SDNode *Glued = Node->getGluedNode(); 15011 SDValue ToVReg 15012 = DAG.getCopyToReg(Node->getOperand(0), SL, VReg, SrcVal, 15013 SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0)); 15014 SDValue ToResultReg 15015 = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0), 15016 VReg, ToVReg.getValue(1)); 15017 DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode()); 15018 DAG.RemoveDeadNode(Node); 15019 return ToResultReg.getNode(); 15020 } 15021 } 15022 15023 SmallVector<SDValue, 8> Ops; 15024 for (unsigned i = 0; i < Node->getNumOperands(); ++i) { 15025 if (!isFrameIndexOp(Node->getOperand(i))) { 15026 Ops.push_back(Node->getOperand(i)); 15027 continue; 15028 } 15029 15030 SDLoc DL(Node); 15031 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, 15032 Node->getOperand(i).getValueType(), 15033 Node->getOperand(i)), 0)); 15034 } 15035 15036 return DAG.UpdateNodeOperands(Node, Ops); 15037 } 15038 15039 /// Fold the instructions after selecting them. 15040 /// Returns null if users were already updated. 15041 SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, 15042 SelectionDAG &DAG) const { 15043 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 15044 unsigned Opcode = Node->getMachineOpcode(); 15045 15046 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() && 15047 !TII->isGather4(Opcode) && 15048 AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) { 15049 return adjustWritemask(Node, DAG); 15050 } 15051 15052 if (Opcode == AMDGPU::INSERT_SUBREG || 15053 Opcode == AMDGPU::REG_SEQUENCE) { 15054 legalizeTargetIndependentNode(Node, DAG); 15055 return Node; 15056 } 15057 15058 switch (Opcode) { 15059 case AMDGPU::V_DIV_SCALE_F32_e64: 15060 case AMDGPU::V_DIV_SCALE_F64_e64: { 15061 // Satisfy the operand register constraint when one of the inputs is 15062 // undefined. Ordinarily each undef value will have its own implicit_def of 15063 // a vreg, so force these to use a single register. 
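    // For instance (sketch), if src0 is an IMPLICIT_DEF while src1 holds the
    // real value, the code below rebinds src0 (and, if needed, src1) to a
    // single shared virtual register so that the "src0 must equal src1 or
    // src2" constraint of V_DIV_SCALE is still satisfied after selection.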
15064 SDValue Src0 = Node->getOperand(1); 15065 SDValue Src1 = Node->getOperand(3); 15066 SDValue Src2 = Node->getOperand(5); 15067 15068 if ((Src0.isMachineOpcode() && 15069 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) && 15070 (Src0 == Src1 || Src0 == Src2)) 15071 break; 15072 15073 MVT VT = Src0.getValueType().getSimpleVT(); 15074 const TargetRegisterClass *RC = 15075 getRegClassFor(VT, Src0.getNode()->isDivergent()); 15076 15077 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); 15078 SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT); 15079 15080 SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node), 15081 UndefReg, Src0, SDValue()); 15082 15083 // src0 must be the same register as src1 or src2, even if the value is 15084 // undefined, so make sure we don't violate this constraint. 15085 if (Src0.isMachineOpcode() && 15086 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) { 15087 if (Src1.isMachineOpcode() && 15088 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) 15089 Src0 = Src1; 15090 else if (Src2.isMachineOpcode() && 15091 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) 15092 Src0 = Src2; 15093 else { 15094 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF); 15095 Src0 = UndefReg; 15096 Src1 = UndefReg; 15097 } 15098 } else 15099 break; 15100 15101 SmallVector<SDValue, 9> Ops(Node->op_begin(), Node->op_end()); 15102 Ops[1] = Src0; 15103 Ops[3] = Src1; 15104 Ops[5] = Src2; 15105 Ops.push_back(ImpDef.getValue(1)); 15106 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops); 15107 } 15108 default: 15109 break; 15110 } 15111 15112 return Node; 15113 } 15114 15115 // Any MIMG instructions that use tfe or lwe require an initialization of the 15116 // result register that will be written in the case of a memory access failure. 15117 // The required code is also added to tie this init code to the result of the 15118 // img instruction. 15119 void SITargetLowering::AddMemOpInit(MachineInstr &MI) const { 15120 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 15121 const SIRegisterInfo &TRI = TII->getRegisterInfo(); 15122 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); 15123 MachineBasicBlock &MBB = *MI.getParent(); 15124 15125 int DstIdx = 15126 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata); 15127 unsigned InitIdx = 0; 15128 15129 if (TII->isImage(MI)) { 15130 MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe); 15131 MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe); 15132 MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16); 15133 15134 if (!TFE && !LWE) // intersect_ray 15135 return; 15136 15137 unsigned TFEVal = TFE ? TFE->getImm() : 0; 15138 unsigned LWEVal = LWE ? LWE->getImm() : 0; 15139 unsigned D16Val = D16 ? D16->getImm() : 0; 15140 15141 if (!TFEVal && !LWEVal) 15142 return; 15143 15144 // At least one of TFE or LWE are non-zero 15145 // We have to insert a suitable initialization of the result value and 15146 // tie this to the dest of the image instruction. 15147 15148 // Calculate which dword we have to initialize to 0. 15149 MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask); 15150 15151 // check that dmask operand is found. 15152 assert(MO_Dmask && "Expected dmask operand in instruction"); 15153 15154 unsigned dmask = MO_Dmask->getImm(); 15155 // Determine the number of active lanes taking into account the 15156 // Gather4 special case 15157 unsigned ActiveLanes = TII->isGather4(MI) ? 
4 : llvm::popcount(dmask); 15158 15159 bool Packed = !Subtarget->hasUnpackedD16VMem(); 15160 15161 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1; 15162 15163 // Abandon attempt if the dst size isn't large enough 15164 // - this is in fact an error but this is picked up elsewhere and 15165 // reported correctly. 15166 uint32_t DstSize = 15167 TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32; 15168 if (DstSize < InitIdx) 15169 return; 15170 } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) { 15171 InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32; 15172 } else { 15173 return; 15174 } 15175 15176 const DebugLoc &DL = MI.getDebugLoc(); 15177 15178 // Create a register for the initialization value. 15179 Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg()); 15180 unsigned NewDst = 0; // Final initialized value will be in here 15181 15182 // If PRTStrictNull feature is enabled (the default) then initialize 15183 // all the result registers to 0, otherwise just the error indication 15184 // register (VGPRn+1) 15185 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1; 15186 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1); 15187 15188 BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst); 15189 for (; SizeLeft; SizeLeft--, CurrIdx++) { 15190 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx)); 15191 // Initialize dword 15192 Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 15193 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg) 15194 .addImm(0); 15195 // Insert into the super-reg 15196 BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst) 15197 .addReg(PrevDst) 15198 .addReg(SubReg) 15199 .addImm(SIRegisterInfo::getSubRegFromChannel(CurrIdx)); 15200 15201 PrevDst = NewDst; 15202 } 15203 15204 // Add as an implicit operand 15205 MI.addOperand(MachineOperand::CreateReg(NewDst, false, true)); 15206 15207 // Tie the just added implicit operand to the dst 15208 MI.tieOperands(DstIdx, MI.getNumOperands() - 1); 15209 } 15210 15211 /// Assign the register class depending on the number of 15212 /// bits set in the writemask 15213 void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, 15214 SDNode *Node) const { 15215 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 15216 15217 MachineFunction *MF = MI.getParent()->getParent(); 15218 MachineRegisterInfo &MRI = MF->getRegInfo(); 15219 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); 15220 15221 if (TII->isVOP3(MI.getOpcode())) { 15222 // Make sure constant bus requirements are respected. 15223 TII->legalizeOperandsVOP3(MRI, MI); 15224 15225 // Prefer VGPRs over AGPRs in mAI instructions where possible. 15226 // This saves a chain-copy of registers and better balance register 15227 // use between vgpr and agpr as agpr tuples tend to be big. 
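    // As a sketch of the loop below: for each of src0/src1 (and src2 only when
    // AGPRs are not otherwise needed), a virtual register that currently has
    // an AGPR-capable class but is only defined by a copy from an SGPR is
    // constrained to the equivalent VGPR class instead.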
15228 if (!MI.getDesc().operands().empty()) { 15229 unsigned Opc = MI.getOpcode(); 15230 bool HasAGPRs = Info->mayNeedAGPRs(); 15231 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); 15232 int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); 15233 for (auto I : 15234 {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0), 15235 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) { 15236 if (I == -1) 15237 break; 15238 if ((I == Src2Idx) && (HasAGPRs)) 15239 break; 15240 MachineOperand &Op = MI.getOperand(I); 15241 if (!Op.isReg() || !Op.getReg().isVirtual()) 15242 continue; 15243 auto *RC = TRI->getRegClassForReg(MRI, Op.getReg()); 15244 if (!TRI->hasAGPRs(RC)) 15245 continue; 15246 auto *Src = MRI.getUniqueVRegDef(Op.getReg()); 15247 if (!Src || !Src->isCopy() || 15248 !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg())) 15249 continue; 15250 auto *NewRC = TRI->getEquivalentVGPRClass(RC); 15251 // All uses of agpr64 and agpr32 can also accept vgpr except for 15252 // v_accvgpr_read, but we do not produce agpr reads during selection, 15253 // so no use checks are needed. 15254 MRI.setRegClass(Op.getReg(), NewRC); 15255 } 15256 15257 if (!HasAGPRs) 15258 return; 15259 15260 // Resolve the rest of AV operands to AGPRs. 15261 if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) { 15262 if (Src2->isReg() && Src2->getReg().isVirtual()) { 15263 auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg()); 15264 if (TRI->isVectorSuperClass(RC)) { 15265 auto *NewRC = TRI->getEquivalentAGPRClass(RC); 15266 MRI.setRegClass(Src2->getReg(), NewRC); 15267 if (Src2->isTied()) 15268 MRI.setRegClass(MI.getOperand(0).getReg(), NewRC); 15269 } 15270 } 15271 } 15272 } 15273 15274 return; 15275 } 15276 15277 if (TII->isImage(MI)) 15278 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr); 15279 } 15280 15281 static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, 15282 uint64_t Val) { 15283 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32); 15284 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0); 15285 } 15286 15287 MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG, 15288 const SDLoc &DL, 15289 SDValue Ptr) const { 15290 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 15291 15292 // Build the half of the subregister with the constants before building the 15293 // full 128-bit register. If we are building multiple resource descriptors, 15294 // this will allow CSEing of the 2-component register. 15295 const SDValue Ops0[] = { 15296 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32), 15297 buildSMovImm32(DAG, DL, 0), 15298 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32), 15299 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32), 15300 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32) 15301 }; 15302 15303 SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, 15304 MVT::v2i32, Ops0), 0); 15305 15306 // Combine the constants and the pointer. 
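  // The resulting descriptor layout (sketch): sub0_sub1 holds the 64-bit
  // pointer, sub2 holds 0, and sub3 holds the upper half of
  // getDefaultRsrcDataFormat(), matching the SubRegHi pair built above.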
15307 const SDValue Ops1[] = { 15308 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32), 15309 Ptr, 15310 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), 15311 SubRegHi, 15312 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32) 15313 }; 15314 15315 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1); 15316 } 15317 15318 /// Return a resource descriptor with the 'Add TID' bit enabled 15319 /// The TID (Thread ID) is multiplied by the stride value (bits [61:48] 15320 /// of the resource descriptor) to create an offset, which is added to 15321 /// the resource pointer. 15322 MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL, 15323 SDValue Ptr, uint32_t RsrcDword1, 15324 uint64_t RsrcDword2And3) const { 15325 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr); 15326 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr); 15327 if (RsrcDword1) { 15328 PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi, 15329 DAG.getConstant(RsrcDword1, DL, MVT::i32)), 15330 0); 15331 } 15332 15333 SDValue DataLo = buildSMovImm32(DAG, DL, 15334 RsrcDword2And3 & UINT64_C(0xFFFFFFFF)); 15335 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32); 15336 15337 const SDValue Ops[] = { 15338 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32), 15339 PtrLo, 15340 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32), 15341 PtrHi, 15342 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32), 15343 DataLo, 15344 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32), 15345 DataHi, 15346 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32) 15347 }; 15348 15349 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops); 15350 } 15351 15352 //===----------------------------------------------------------------------===// 15353 // SI Inline Assembly Support 15354 //===----------------------------------------------------------------------===// 15355 15356 std::pair<unsigned, const TargetRegisterClass *> 15357 SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_, 15358 StringRef Constraint, 15359 MVT VT) const { 15360 const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_); 15361 15362 const TargetRegisterClass *RC = nullptr; 15363 if (Constraint.size() == 1) { 15364 const unsigned BitWidth = VT.getSizeInBits(); 15365 switch (Constraint[0]) { 15366 default: 15367 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); 15368 case 's': 15369 case 'r': 15370 switch (BitWidth) { 15371 case 16: 15372 RC = &AMDGPU::SReg_32RegClass; 15373 break; 15374 case 64: 15375 RC = &AMDGPU::SGPR_64RegClass; 15376 break; 15377 default: 15378 RC = SIRegisterInfo::getSGPRClassForBitWidth(BitWidth); 15379 if (!RC) 15380 return std::pair(0U, nullptr); 15381 break; 15382 } 15383 break; 15384 case 'v': 15385 switch (BitWidth) { 15386 case 16: 15387 RC = &AMDGPU::VGPR_32RegClass; 15388 break; 15389 default: 15390 RC = TRI->getVGPRClassForBitWidth(BitWidth); 15391 if (!RC) 15392 return std::pair(0U, nullptr); 15393 break; 15394 } 15395 break; 15396 case 'a': 15397 if (!Subtarget->hasMAIInsts()) 15398 break; 15399 switch (BitWidth) { 15400 case 16: 15401 RC = &AMDGPU::AGPR_32RegClass; 15402 break; 15403 default: 15404 RC = TRI->getAGPRClassForBitWidth(BitWidth); 15405 if (!RC) 15406 return std::pair(0U, nullptr); 15407 break; 15408 } 15409 break; 15410 } 15411 // We actually support i128, i16 and f16 as inline parameters 15412 // even if they are not 
reported as legal 15413 if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 || 15414 VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16)) 15415 return std::pair(0U, RC); 15416 } 15417 15418 if (Constraint.starts_with("{") && Constraint.ends_with("}")) { 15419 StringRef RegName(Constraint.data() + 1, Constraint.size() - 2); 15420 if (RegName.consume_front("v")) { 15421 RC = &AMDGPU::VGPR_32RegClass; 15422 } else if (RegName.consume_front("s")) { 15423 RC = &AMDGPU::SGPR_32RegClass; 15424 } else if (RegName.consume_front("a")) { 15425 RC = &AMDGPU::AGPR_32RegClass; 15426 } 15427 15428 if (RC) { 15429 uint32_t Idx; 15430 if (RegName.consume_front("[")) { 15431 uint32_t End; 15432 bool Failed = RegName.consumeInteger(10, Idx); 15433 Failed |= !RegName.consume_front(":"); 15434 Failed |= RegName.consumeInteger(10, End); 15435 Failed |= !RegName.consume_back("]"); 15436 if (!Failed) { 15437 uint32_t Width = (End - Idx + 1) * 32; 15438 MCRegister Reg = RC->getRegister(Idx); 15439 if (SIRegisterInfo::isVGPRClass(RC)) 15440 RC = TRI->getVGPRClassForBitWidth(Width); 15441 else if (SIRegisterInfo::isSGPRClass(RC)) 15442 RC = TRI->getSGPRClassForBitWidth(Width); 15443 else if (SIRegisterInfo::isAGPRClass(RC)) 15444 RC = TRI->getAGPRClassForBitWidth(Width); 15445 if (RC) { 15446 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC); 15447 return std::pair(Reg, RC); 15448 } 15449 } 15450 } else { 15451 bool Failed = RegName.getAsInteger(10, Idx); 15452 if (!Failed && Idx < RC->getNumRegs()) 15453 return std::pair(RC->getRegister(Idx), RC); 15454 } 15455 } 15456 } 15457 15458 auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); 15459 if (Ret.first) 15460 Ret.second = TRI->getPhysRegBaseClass(Ret.first); 15461 15462 return Ret; 15463 } 15464 15465 static bool isImmConstraint(StringRef Constraint) { 15466 if (Constraint.size() == 1) { 15467 switch (Constraint[0]) { 15468 default: break; 15469 case 'I': 15470 case 'J': 15471 case 'A': 15472 case 'B': 15473 case 'C': 15474 return true; 15475 } 15476 } else if (Constraint == "DA" || 15477 Constraint == "DB") { 15478 return true; 15479 } 15480 return false; 15481 } 15482 15483 SITargetLowering::ConstraintType 15484 SITargetLowering::getConstraintType(StringRef Constraint) const { 15485 if (Constraint.size() == 1) { 15486 switch (Constraint[0]) { 15487 default: break; 15488 case 's': 15489 case 'v': 15490 case 'a': 15491 return C_RegisterClass; 15492 } 15493 } 15494 if (isImmConstraint(Constraint)) { 15495 return C_Other; 15496 } 15497 return TargetLowering::getConstraintType(Constraint); 15498 } 15499 15500 static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) { 15501 if (!AMDGPU::isInlinableIntLiteral(Val)) { 15502 Val = Val & maskTrailingOnes<uint64_t>(Size); 15503 } 15504 return Val; 15505 } 15506 15507 void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op, 15508 StringRef Constraint, 15509 std::vector<SDValue> &Ops, 15510 SelectionDAG &DAG) const { 15511 if (isImmConstraint(Constraint)) { 15512 uint64_t Val; 15513 if (getAsmOperandConstVal(Op, Val) && 15514 checkAsmConstraintVal(Op, Constraint, Val)) { 15515 Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits()); 15516 Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64)); 15517 } 15518 } else { 15519 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); 15520 } 15521 } 15522 15523 bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const { 15524 unsigned Size = Op.getScalarValueSizeInBits(); 15525 if (Size > 64) 
15526 return false; 15527 15528 if (Size == 16 && !Subtarget->has16BitInsts()) 15529 return false; 15530 15531 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 15532 Val = C->getSExtValue(); 15533 return true; 15534 } 15535 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) { 15536 Val = C->getValueAPF().bitcastToAPInt().getSExtValue(); 15537 return true; 15538 } 15539 if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Op)) { 15540 if (Size != 16 || Op.getNumOperands() != 2) 15541 return false; 15542 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef()) 15543 return false; 15544 if (ConstantSDNode *C = V->getConstantSplatNode()) { 15545 Val = C->getSExtValue(); 15546 return true; 15547 } 15548 if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) { 15549 Val = C->getValueAPF().bitcastToAPInt().getSExtValue(); 15550 return true; 15551 } 15552 } 15553 15554 return false; 15555 } 15556 15557 bool SITargetLowering::checkAsmConstraintVal(SDValue Op, StringRef Constraint, 15558 uint64_t Val) const { 15559 if (Constraint.size() == 1) { 15560 switch (Constraint[0]) { 15561 case 'I': 15562 return AMDGPU::isInlinableIntLiteral(Val); 15563 case 'J': 15564 return isInt<16>(Val); 15565 case 'A': 15566 return checkAsmConstraintValA(Op, Val); 15567 case 'B': 15568 return isInt<32>(Val); 15569 case 'C': 15570 return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) || 15571 AMDGPU::isInlinableIntLiteral(Val); 15572 default: 15573 break; 15574 } 15575 } else if (Constraint.size() == 2) { 15576 if (Constraint == "DA") { 15577 int64_t HiBits = static_cast<int32_t>(Val >> 32); 15578 int64_t LoBits = static_cast<int32_t>(Val); 15579 return checkAsmConstraintValA(Op, HiBits, 32) && 15580 checkAsmConstraintValA(Op, LoBits, 32); 15581 } 15582 if (Constraint == "DB") { 15583 return true; 15584 } 15585 } 15586 llvm_unreachable("Invalid asm constraint"); 15587 } 15588 15589 bool SITargetLowering::checkAsmConstraintValA(SDValue Op, uint64_t Val, 15590 unsigned MaxSize) const { 15591 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize); 15592 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm(); 15593 if (Size == 16) { 15594 MVT VT = Op.getSimpleValueType(); 15595 switch (VT.SimpleTy) { 15596 default: 15597 return false; 15598 case MVT::i16: 15599 return AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi); 15600 case MVT::f16: 15601 return AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi); 15602 case MVT::bf16: 15603 return AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi); 15604 case MVT::v2i16: 15605 return AMDGPU::getInlineEncodingV2I16(Val).has_value(); 15606 case MVT::v2f16: 15607 return AMDGPU::getInlineEncodingV2F16(Val).has_value(); 15608 case MVT::v2bf16: 15609 return AMDGPU::getInlineEncodingV2BF16(Val).has_value(); 15610 } 15611 } 15612 if ((Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) || 15613 (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi))) 15614 return true; 15615 return false; 15616 } 15617 15618 static int getAlignedAGPRClassID(unsigned UnalignedClassID) { 15619 switch (UnalignedClassID) { 15620 case AMDGPU::VReg_64RegClassID: 15621 return AMDGPU::VReg_64_Align2RegClassID; 15622 case AMDGPU::VReg_96RegClassID: 15623 return AMDGPU::VReg_96_Align2RegClassID; 15624 case AMDGPU::VReg_128RegClassID: 15625 return AMDGPU::VReg_128_Align2RegClassID; 15626 case AMDGPU::VReg_160RegClassID: 15627 return AMDGPU::VReg_160_Align2RegClassID; 15628 case AMDGPU::VReg_192RegClassID: 15629 return AMDGPU::VReg_192_Align2RegClassID; 15630 case 
AMDGPU::VReg_224RegClassID: 15631 return AMDGPU::VReg_224_Align2RegClassID; 15632 case AMDGPU::VReg_256RegClassID: 15633 return AMDGPU::VReg_256_Align2RegClassID; 15634 case AMDGPU::VReg_288RegClassID: 15635 return AMDGPU::VReg_288_Align2RegClassID; 15636 case AMDGPU::VReg_320RegClassID: 15637 return AMDGPU::VReg_320_Align2RegClassID; 15638 case AMDGPU::VReg_352RegClassID: 15639 return AMDGPU::VReg_352_Align2RegClassID; 15640 case AMDGPU::VReg_384RegClassID: 15641 return AMDGPU::VReg_384_Align2RegClassID; 15642 case AMDGPU::VReg_512RegClassID: 15643 return AMDGPU::VReg_512_Align2RegClassID; 15644 case AMDGPU::VReg_1024RegClassID: 15645 return AMDGPU::VReg_1024_Align2RegClassID; 15646 case AMDGPU::AReg_64RegClassID: 15647 return AMDGPU::AReg_64_Align2RegClassID; 15648 case AMDGPU::AReg_96RegClassID: 15649 return AMDGPU::AReg_96_Align2RegClassID; 15650 case AMDGPU::AReg_128RegClassID: 15651 return AMDGPU::AReg_128_Align2RegClassID; 15652 case AMDGPU::AReg_160RegClassID: 15653 return AMDGPU::AReg_160_Align2RegClassID; 15654 case AMDGPU::AReg_192RegClassID: 15655 return AMDGPU::AReg_192_Align2RegClassID; 15656 case AMDGPU::AReg_256RegClassID: 15657 return AMDGPU::AReg_256_Align2RegClassID; 15658 case AMDGPU::AReg_512RegClassID: 15659 return AMDGPU::AReg_512_Align2RegClassID; 15660 case AMDGPU::AReg_1024RegClassID: 15661 return AMDGPU::AReg_1024_Align2RegClassID; 15662 default: 15663 return -1; 15664 } 15665 } 15666 15667 // Figure out which registers should be reserved for stack access. Only after 15668 // the function is legalized do we know all of the non-spill stack objects or if 15669 // calls are present. 15670 void SITargetLowering::finalizeLowering(MachineFunction &MF) const { 15671 MachineRegisterInfo &MRI = MF.getRegInfo(); 15672 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 15673 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 15674 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); 15675 const SIInstrInfo *TII = ST.getInstrInfo(); 15676 15677 if (Info->isEntryFunction()) { 15678 // Callable functions have fixed registers used for stack access. 15679 reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info); 15680 } 15681 15682 // TODO: Move this logic to getReservedRegs() 15683 // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling. 15684 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF); 15685 Register SReg = ST.isWave32() 15686 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1) 15687 : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2, 15688 &AMDGPU::SGPR_64RegClass); 15689 Info->setSGPRForEXECCopy(SReg); 15690 15691 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(), 15692 Info->getStackPtrOffsetReg())); 15693 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG) 15694 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg()); 15695 15696 // We need to worry about replacing the default register with itself in case 15697 // of MIR testcases missing the MFI. 
15698 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG) 15699 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg()); 15700 15701 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG) 15702 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg()); 15703 15704 Info->limitOccupancy(MF); 15705 15706 if (ST.isWave32() && !MF.empty()) { 15707 for (auto &MBB : MF) { 15708 for (auto &MI : MBB) { 15709 TII->fixImplicitOperands(MI); 15710 } 15711 } 15712 } 15713 15714 // FIXME: This is a hack to fixup AGPR classes to use the properly aligned 15715 // classes if required. Ideally the register class constraints would differ 15716 // per-subtarget, but there's no easy way to achieve that right now. This is 15717 // not a problem for VGPRs because the correctly aligned VGPR class is implied 15718 // from using them as the register class for legal types. 15719 if (ST.needsAlignedVGPRs()) { 15720 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { 15721 const Register Reg = Register::index2VirtReg(I); 15722 const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg); 15723 if (!RC) 15724 continue; 15725 int NewClassID = getAlignedAGPRClassID(RC->getID()); 15726 if (NewClassID != -1) 15727 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID)); 15728 } 15729 } 15730 15731 TargetLoweringBase::finalizeLowering(MF); 15732 } 15733 15734 void SITargetLowering::computeKnownBitsForTargetNode(const SDValue Op, 15735 KnownBits &Known, 15736 const APInt &DemandedElts, 15737 const SelectionDAG &DAG, 15738 unsigned Depth) const { 15739 Known.resetAll(); 15740 unsigned Opc = Op.getOpcode(); 15741 switch (Opc) { 15742 case ISD::INTRINSIC_WO_CHAIN: { 15743 unsigned IID = Op.getConstantOperandVal(0); 15744 switch (IID) { 15745 case Intrinsic::amdgcn_mbcnt_lo: 15746 case Intrinsic::amdgcn_mbcnt_hi: { 15747 const GCNSubtarget &ST = 15748 DAG.getMachineFunction().getSubtarget<GCNSubtarget>(); 15749 // These return at most the (wavefront size - 1) + src1 15750 // As long as src1 is an immediate we can calc known bits 15751 KnownBits Src1Known = DAG.computeKnownBits(Op.getOperand(2), Depth + 1); 15752 unsigned Src1ValBits = Src1Known.countMaxActiveBits(); 15753 unsigned MaxActiveBits = std::max(Src1ValBits, ST.getWavefrontSizeLog2()); 15754 // Cater for potential carry 15755 MaxActiveBits += Src1ValBits ? 1 : 0; 15756 unsigned Size = Op.getValueType().getSizeInBits(); 15757 if (MaxActiveBits < Size) 15758 Known.Zero.setHighBits(Size - MaxActiveBits); 15759 return; 15760 } 15761 } 15762 break; 15763 } 15764 } 15765 return AMDGPUTargetLowering::computeKnownBitsForTargetNode( 15766 Op, Known, DemandedElts, DAG, Depth); 15767 } 15768 15769 void SITargetLowering::computeKnownBitsForFrameIndex( 15770 const int FI, KnownBits &Known, const MachineFunction &MF) const { 15771 TargetLowering::computeKnownBitsForFrameIndex(FI, Known, MF); 15772 15773 // Set the high bits to zero based on the maximum allowed scratch size per 15774 // wave. We can't use vaddr in MUBUF instructions if we don't know the address 15775 // calculation won't overflow, so assume the sign bit is never set. 
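  // For example, if the maximum per-wave scratch size implies that frame
  // offsets fit in N bits, the top (32 - N) bits of a frame index are known
  // to be zero (and, in particular, the sign bit is clear); the exact count
  // comes from getKnownHighZeroBitsForFrameIndex() below.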
15776 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex()); 15777 } 15778 15779 static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB, 15780 KnownBits &Known, unsigned Dim) { 15781 unsigned MaxValue = 15782 ST.getMaxWorkitemID(KB.getMachineFunction().getFunction(), Dim); 15783 Known.Zero.setHighBits(llvm::countl_zero(MaxValue)); 15784 } 15785 15786 void SITargetLowering::computeKnownBitsForTargetInstr( 15787 GISelKnownBits &KB, Register R, KnownBits &Known, const APInt &DemandedElts, 15788 const MachineRegisterInfo &MRI, unsigned Depth) const { 15789 const MachineInstr *MI = MRI.getVRegDef(R); 15790 switch (MI->getOpcode()) { 15791 case AMDGPU::G_INTRINSIC: 15792 case AMDGPU::G_INTRINSIC_CONVERGENT: { 15793 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) { 15794 case Intrinsic::amdgcn_workitem_id_x: 15795 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 0); 15796 break; 15797 case Intrinsic::amdgcn_workitem_id_y: 15798 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 1); 15799 break; 15800 case Intrinsic::amdgcn_workitem_id_z: 15801 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 2); 15802 break; 15803 case Intrinsic::amdgcn_mbcnt_lo: 15804 case Intrinsic::amdgcn_mbcnt_hi: { 15805 // These return at most the wavefront size - 1. 15806 unsigned Size = MRI.getType(R).getSizeInBits(); 15807 Known.Zero.setHighBits(Size - getSubtarget()->getWavefrontSizeLog2()); 15808 break; 15809 } 15810 case Intrinsic::amdgcn_groupstaticsize: { 15811 // We can report everything over the maximum size as 0. We can't report 15812 // based on the actual size because we don't know if it's accurate or not 15813 // at any given point. 15814 Known.Zero.setHighBits( 15815 llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize())); 15816 break; 15817 } 15818 } 15819 break; 15820 } 15821 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE: 15822 Known.Zero.setHighBits(24); 15823 break; 15824 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT: 15825 Known.Zero.setHighBits(16); 15826 break; 15827 case AMDGPU::G_AMDGPU_SMED3: 15828 case AMDGPU::G_AMDGPU_UMED3: { 15829 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs(); 15830 15831 KnownBits Known2; 15832 KB.computeKnownBitsImpl(Src2, Known2, DemandedElts, Depth + 1); 15833 if (Known2.isUnknown()) 15834 break; 15835 15836 KnownBits Known1; 15837 KB.computeKnownBitsImpl(Src1, Known1, DemandedElts, Depth + 1); 15838 if (Known1.isUnknown()) 15839 break; 15840 15841 KnownBits Known0; 15842 KB.computeKnownBitsImpl(Src0, Known0, DemandedElts, Depth + 1); 15843 if (Known0.isUnknown()) 15844 break; 15845 15846 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling. 15847 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero; 15848 Known.One = Known0.One & Known1.One & Known2.One; 15849 break; 15850 } 15851 } 15852 } 15853 15854 Align SITargetLowering::computeKnownAlignForTargetInstr( 15855 GISelKnownBits &KB, Register R, const MachineRegisterInfo &MRI, 15856 unsigned Depth) const { 15857 const MachineInstr *MI = MRI.getVRegDef(R); 15858 if (auto *GI = dyn_cast<GIntrinsic>(MI)) { 15859 // FIXME: Can this move to generic code? What about the case where the call 15860 // site specifies a lower alignment? 
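    // For example, if the intrinsic is declared with an `align 4` return
    // attribute, Align(4) is returned here, regardless of any alignment the
    // call site itself specifies.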
    Intrinsic::ID IID = GI->getIntrinsicID();
    LLVMContext &Ctx = KB.getMachineFunction().getFunction().getContext();
    AttributeList Attrs = Intrinsic::getAttributes(Ctx, IID);
    if (MaybeAlign RetAlign = Attrs.getRetAlignment())
      return *RetAlign;
  }
  return Align(1);
}

Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
  const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
  const Align CacheLineAlign = Align(64);

  // Pre-GFX10 targets did not benefit from loop alignment.
  if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
      getSubtarget()->hasInstFwdPrefetchBug())
    return PrefAlign;

  // On GFX10 the I$ consists of 4 x 64-byte cache lines.
  // By default the prefetcher keeps one cache line behind and reads two
  // ahead. We can modify it with S_INST_PREFETCH for larger loops to keep
  // two lines behind and one ahead.
  // Therefore we can benefit from aligning loop headers if the loop fits in
  // 192 bytes.
  // If the loop fits in 64 bytes it always spans no more than two cache lines
  // and does not need an alignment.
  // Otherwise, if the loop is at most 128 bytes we do not need to modify the
  // prefetch; if it is at most 192 bytes we need two lines behind.

  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  const MachineBasicBlock *Header = ML->getHeader();
  if (Header->getAlignment() != PrefAlign)
    return Header->getAlignment(); // Already processed.

  unsigned LoopSize = 0;
  for (const MachineBasicBlock *MBB : ML->blocks()) {
    // If an inner loop block is aligned, assume on average half of the
    // alignment size is added as nops.
    if (MBB != Header)
      LoopSize += MBB->getAlignment().value() / 2;

    for (const MachineInstr &MI : *MBB) {
      LoopSize += TII->getInstSizeInBytes(MI);
      if (LoopSize > 192)
        return PrefAlign;
    }
  }

  if (LoopSize <= 64)
    return PrefAlign;

  if (LoopSize <= 128)
    return CacheLineAlign;

  // If any of the parent loops is surrounded by prefetch instructions, do not
  // insert new ones for the inner loop; that would reset the parent's
  // settings.
  for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
    if (MachineBasicBlock *Exit = P->getExitBlock()) {
      auto I = Exit->getFirstNonDebugInstr();
      if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
        return CacheLineAlign;
    }
  }

  MachineBasicBlock *Pre = ML->getLoopPreheader();
  MachineBasicBlock *Exit = ML->getExitBlock();

  if (Pre && Exit) {
    auto PreTerm = Pre->getFirstTerminator();
    if (PreTerm == Pre->begin() ||
        std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
      BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
          .addImm(1); // prefetch 2 lines behind PC

    auto ExitHead = Exit->getFirstNonDebugInstr();
    if (ExitHead == Exit->end() ||
        ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
      BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
          .addImm(2); // prefetch 1 line behind PC
  }

  return CacheLineAlign;
}

LLVM_ATTRIBUTE_UNUSED
static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
  assert(N->getOpcode() == ISD::CopyFromReg);
  do {
    // Follow the chain until we find an INLINEASM node.
15949 N = N->getOperand(0).getNode(); 15950 if (N->getOpcode() == ISD::INLINEASM || 15951 N->getOpcode() == ISD::INLINEASM_BR) 15952 return true; 15953 } while (N->getOpcode() == ISD::CopyFromReg); 15954 return false; 15955 } 15956 15957 bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N, 15958 FunctionLoweringInfo *FLI, 15959 UniformityInfo *UA) const { 15960 switch (N->getOpcode()) { 15961 case ISD::CopyFromReg: { 15962 const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1)); 15963 const MachineRegisterInfo &MRI = FLI->MF->getRegInfo(); 15964 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); 15965 Register Reg = R->getReg(); 15966 15967 // FIXME: Why does this need to consider isLiveIn? 15968 if (Reg.isPhysical() || MRI.isLiveIn(Reg)) 15969 return !TRI->isSGPRReg(MRI, Reg); 15970 15971 if (const Value *V = FLI->getValueFromVirtualReg(R->getReg())) 15972 return UA->isDivergent(V); 15973 15974 assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N)); 15975 return !TRI->isSGPRReg(MRI, Reg); 15976 } 15977 case ISD::LOAD: { 15978 const LoadSDNode *L = cast<LoadSDNode>(N); 15979 unsigned AS = L->getAddressSpace(); 15980 // A flat load may access private memory. 15981 return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS; 15982 } 15983 case ISD::CALLSEQ_END: 15984 return true; 15985 case ISD::INTRINSIC_WO_CHAIN: 15986 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(0)); 15987 case ISD::INTRINSIC_W_CHAIN: 15988 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1)); 15989 case AMDGPUISD::ATOMIC_CMP_SWAP: 15990 case AMDGPUISD::BUFFER_ATOMIC_SWAP: 15991 case AMDGPUISD::BUFFER_ATOMIC_ADD: 15992 case AMDGPUISD::BUFFER_ATOMIC_SUB: 15993 case AMDGPUISD::BUFFER_ATOMIC_SMIN: 15994 case AMDGPUISD::BUFFER_ATOMIC_UMIN: 15995 case AMDGPUISD::BUFFER_ATOMIC_SMAX: 15996 case AMDGPUISD::BUFFER_ATOMIC_UMAX: 15997 case AMDGPUISD::BUFFER_ATOMIC_AND: 15998 case AMDGPUISD::BUFFER_ATOMIC_OR: 15999 case AMDGPUISD::BUFFER_ATOMIC_XOR: 16000 case AMDGPUISD::BUFFER_ATOMIC_INC: 16001 case AMDGPUISD::BUFFER_ATOMIC_DEC: 16002 case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP: 16003 case AMDGPUISD::BUFFER_ATOMIC_CSUB: 16004 case AMDGPUISD::BUFFER_ATOMIC_FADD: 16005 case AMDGPUISD::BUFFER_ATOMIC_FMIN: 16006 case AMDGPUISD::BUFFER_ATOMIC_FMAX: 16007 // Target-specific read-modify-write atomics are sources of divergence. 16008 return true; 16009 default: 16010 if (auto *A = dyn_cast<AtomicSDNode>(N)) { 16011 // Generic read-modify-write atomics are sources of divergence. 
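      // For example, the old value returned by
      //   %old = atomicrmw add ptr addrspace(1) %p, i32 1 seq_cst
      // generally differs between lanes, so it must be treated as divergent.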
16012 return A->readMem() && A->writeMem(); 16013 } 16014 return false; 16015 } 16016 } 16017 16018 bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG, 16019 EVT VT) const { 16020 switch (VT.getScalarType().getSimpleVT().SimpleTy) { 16021 case MVT::f32: 16022 return !denormalModeIsFlushAllF32(DAG.getMachineFunction()); 16023 case MVT::f64: 16024 case MVT::f16: 16025 return !denormalModeIsFlushAllF64F16(DAG.getMachineFunction()); 16026 default: 16027 return false; 16028 } 16029 } 16030 16031 bool SITargetLowering::denormalsEnabledForType( 16032 LLT Ty, const MachineFunction &MF) const { 16033 switch (Ty.getScalarSizeInBits()) { 16034 case 32: 16035 return !denormalModeIsFlushAllF32(MF); 16036 case 64: 16037 case 16: 16038 return !denormalModeIsFlushAllF64F16(MF); 16039 default: 16040 return false; 16041 } 16042 } 16043 16044 bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op, 16045 const SelectionDAG &DAG, 16046 bool SNaN, 16047 unsigned Depth) const { 16048 if (Op.getOpcode() == AMDGPUISD::CLAMP) { 16049 const MachineFunction &MF = DAG.getMachineFunction(); 16050 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 16051 16052 if (Info->getMode().DX10Clamp) 16053 return true; // Clamped to 0. 16054 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1); 16055 } 16056 16057 return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DAG, 16058 SNaN, Depth); 16059 } 16060 16061 #if 0 16062 // FIXME: This should be checked before unsafe fp atomics are enabled 16063 // Global FP atomic instructions have a hardcoded FP mode and do not support 16064 // FP32 denormals, and only support v2f16 denormals. 16065 static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW) { 16066 const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics(); 16067 auto DenormMode = RMW->getParent()->getParent()->getDenormalMode(Flt); 16068 if (&Flt == &APFloat::IEEEsingle()) 16069 return DenormMode == DenormalMode::getPreserveSign(); 16070 return DenormMode == DenormalMode::getIEEE(); 16071 } 16072 #endif 16073 16074 // The amdgpu-unsafe-fp-atomics attribute enables generation of unsafe 16075 // floating point atomic instructions. May generate more efficient code, 16076 // but may not respect rounding and denormal modes, and may give incorrect 16077 // results for certain memory destinations. 16078 bool unsafeFPAtomicsDisabled(Function *F) { 16079 return F->getFnAttribute("amdgpu-unsafe-fp-atomics").getValueAsString() != 16080 "true"; 16081 } 16082 16083 static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW) { 16084 LLVMContext &Ctx = RMW->getContext(); 16085 SmallVector<StringRef> SSNs; 16086 Ctx.getSyncScopeNames(SSNs); 16087 StringRef MemScope = SSNs[RMW->getSyncScopeID()].empty() 16088 ? 
"system" 16089 : SSNs[RMW->getSyncScopeID()]; 16090 16091 return OptimizationRemark(DEBUG_TYPE, "Passed", RMW) 16092 << "Hardware instruction generated for atomic " 16093 << RMW->getOperationName(RMW->getOperation()) 16094 << " operation at memory scope " << MemScope; 16095 } 16096 16097 static bool isHalf2OrBFloat2(Type *Ty) { 16098 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) { 16099 Type *EltTy = VT->getElementType(); 16100 return VT->getNumElements() == 2 && 16101 (EltTy->isHalfTy() || EltTy->isBFloatTy()); 16102 } 16103 16104 return false; 16105 } 16106 16107 static bool isHalf2(Type *Ty) { 16108 FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty); 16109 return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy(); 16110 } 16111 16112 static bool isBFloat2(Type *Ty) { 16113 FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty); 16114 return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy(); 16115 } 16116 16117 TargetLowering::AtomicExpansionKind 16118 SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { 16119 unsigned AS = RMW->getPointerAddressSpace(); 16120 if (AS == AMDGPUAS::PRIVATE_ADDRESS) 16121 return AtomicExpansionKind::NotAtomic; 16122 16123 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) { 16124 OptimizationRemarkEmitter ORE(RMW->getFunction()); 16125 ORE.emit([=]() { 16126 return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request."; 16127 }); 16128 return Kind; 16129 }; 16130 16131 auto SSID = RMW->getSyncScopeID(); 16132 bool HasSystemScope = 16133 SSID == SyncScope::System || 16134 SSID == RMW->getContext().getOrInsertSyncScopeID("one-as"); 16135 16136 switch (RMW->getOperation()) { 16137 case AtomicRMWInst::Sub: 16138 case AtomicRMWInst::Or: 16139 case AtomicRMWInst::Xor: { 16140 // Atomic sub/or/xor do not work over PCI express, but atomic add 16141 // does. InstCombine transforms these with 0 to or, so undo that. 16142 if (HasSystemScope && AMDGPU::isFlatGlobalAddrSpace(AS)) { 16143 if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand()); 16144 ConstVal && ConstVal->isNullValue()) 16145 return AtomicExpansionKind::Expand; 16146 } 16147 16148 break; 16149 } 16150 case AtomicRMWInst::FAdd: { 16151 Type *Ty = RMW->getType(); 16152 16153 // TODO: Handle REGION_ADDRESS 16154 if (AS == AMDGPUAS::LOCAL_ADDRESS) { 16155 // DS F32 FP atomics do respect the denormal mode, but the rounding mode 16156 // is fixed to round-to-nearest-even. 16157 // 16158 // F64 / PK_F16 / PK_BF16 never flush and are also fixed to 16159 // round-to-nearest-even. 16160 // 16161 // We ignore the rounding mode problem, even in strictfp. The C++ standard 16162 // suggests it is OK if the floating-point mode may not match the calling 16163 // thread. 16164 if (Ty->isFloatTy()) { 16165 return Subtarget->hasLDSFPAtomicAddF32() ? AtomicExpansionKind::None 16166 : AtomicExpansionKind::CmpXChg; 16167 } 16168 16169 if (Ty->isDoubleTy()) { 16170 // Ignores denormal mode, but we don't consider flushing mandatory. 16171 return Subtarget->hasLDSFPAtomicAddF64() ? 
AtomicExpansionKind::None 16172 : AtomicExpansionKind::CmpXChg; 16173 } 16174 16175 if (Subtarget->hasAtomicDsPkAdd16Insts() && isHalf2OrBFloat2(Ty)) 16176 return AtomicExpansionKind::None; 16177 16178 return AtomicExpansionKind::CmpXChg; 16179 } 16180 16181 if (!AMDGPU::isFlatGlobalAddrSpace(AS) && 16182 AS != AMDGPUAS::BUFFER_FAT_POINTER) 16183 return AtomicExpansionKind::CmpXChg; 16184 16185 if (Subtarget->hasGFX940Insts() && (Ty->isFloatTy() || Ty->isDoubleTy())) 16186 return AtomicExpansionKind::None; 16187 16188 if (AS == AMDGPUAS::FLAT_ADDRESS) { 16189 // gfx940, gfx12 16190 // FIXME: Needs to account for no fine-grained memory 16191 if (Subtarget->hasAtomicFlatPkAdd16Insts() && isHalf2OrBFloat2(Ty)) 16192 return AtomicExpansionKind::None; 16193 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) { 16194 // gfx90a, gfx940, gfx12 16195 // FIXME: Needs to account for no fine-grained memory 16196 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty)) 16197 return AtomicExpansionKind::None; 16198 16199 // gfx940, gfx12 16200 // FIXME: Needs to account for no fine-grained memory 16201 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isBFloat2(Ty)) 16202 return AtomicExpansionKind::None; 16203 } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) { 16204 // gfx90a, gfx940, gfx12 16205 // FIXME: Needs to account for no fine-grained memory 16206 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty)) 16207 return AtomicExpansionKind::None; 16208 16209 // While gfx90a/gfx940 supports v2bf16 for global/flat, it does not for 16210 // buffer. gfx12 does have the buffer version. 16211 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isBFloat2(Ty)) 16212 return AtomicExpansionKind::None; 16213 } 16214 16215 if (unsafeFPAtomicsDisabled(RMW->getFunction())) 16216 return AtomicExpansionKind::CmpXChg; 16217 16218 // Always expand system scope fp atomics. 16219 if (HasSystemScope) 16220 return AtomicExpansionKind::CmpXChg; 16221 16222 // global and flat atomic fadd f64: gfx90a, gfx940. 16223 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy()) 16224 return ReportUnsafeHWInst(AtomicExpansionKind::None); 16225 16226 if (AS != AMDGPUAS::FLAT_ADDRESS) { 16227 if (Ty->isFloatTy()) { 16228 // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940, gfx11+. 16229 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts()) 16230 return ReportUnsafeHWInst(AtomicExpansionKind::None); 16231 // global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+. 16232 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts()) 16233 return ReportUnsafeHWInst(AtomicExpansionKind::None); 16234 } else { 16235 // gfx908 16236 if (RMW->use_empty() && 16237 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() && isHalf2(Ty)) 16238 return ReportUnsafeHWInst(AtomicExpansionKind::None); 16239 } 16240 } 16241 16242 // flat atomic fadd f32: gfx940, gfx11+. 16243 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) { 16244 if (Subtarget->hasFlatAtomicFaddF32Inst()) 16245 return ReportUnsafeHWInst(AtomicExpansionKind::None); 16246 16247 // If it is in flat address space, and the type is float, we will try to 16248 // expand it, if the target supports global and lds atomic fadd. The 16249 // reason we need that is, in the expansion, we emit the check of address 16250 // space. If it is in global address space, we emit the global atomic 16251 // fadd; if it is in shared address space, we emit the LDS atomic fadd. 
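    // The IR produced by this expansion is shown in the block comment inside
    // emitExpandAtomicRMW() later in this file.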
16252 if (Subtarget->hasLDSFPAtomicAddF32()) { 16253 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts()) 16254 return AtomicExpansionKind::Expand; 16255 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts()) 16256 return AtomicExpansionKind::Expand; 16257 } 16258 } 16259 16260 return AtomicExpansionKind::CmpXChg; 16261 } 16262 case AtomicRMWInst::FMin: 16263 case AtomicRMWInst::FMax: { 16264 Type *Ty = RMW->getType(); 16265 16266 // LDS float and double fmin/fmax were always supported. 16267 if (AS == AMDGPUAS::LOCAL_ADDRESS && (Ty->isFloatTy() || Ty->isDoubleTy())) 16268 return AtomicExpansionKind::None; 16269 16270 if (unsafeFPAtomicsDisabled(RMW->getFunction())) 16271 return AtomicExpansionKind::CmpXChg; 16272 16273 // Always expand system scope fp atomics. 16274 if (HasSystemScope) 16275 return AtomicExpansionKind::CmpXChg; 16276 16277 // For flat and global cases: 16278 // float, double in gfx7. Manual claims denormal support. 16279 // Removed in gfx8. 16280 // float, double restored in gfx10. 16281 // double removed again in gfx11, so only f32 for gfx11/gfx12. 16282 // 16283 // For gfx9, gfx90a and gfx940 support f64 for global (same as fadd), but no 16284 // f32. 16285 // 16286 // FIXME: Check scope and fine grained memory 16287 if (AS == AMDGPUAS::FLAT_ADDRESS) { 16288 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy()) 16289 return ReportUnsafeHWInst(AtomicExpansionKind::None); 16290 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy()) 16291 return ReportUnsafeHWInst(AtomicExpansionKind::None); 16292 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) || 16293 AS == AMDGPUAS::BUFFER_FAT_POINTER) { 16294 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy()) 16295 return ReportUnsafeHWInst(AtomicExpansionKind::None); 16296 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy()) 16297 return ReportUnsafeHWInst(AtomicExpansionKind::None); 16298 } 16299 16300 return AtomicExpansionKind::CmpXChg; 16301 } 16302 case AtomicRMWInst::Min: 16303 case AtomicRMWInst::Max: 16304 case AtomicRMWInst::UMin: 16305 case AtomicRMWInst::UMax: { 16306 if (AMDGPU::isFlatGlobalAddrSpace(AS) || 16307 AS == AMDGPUAS::BUFFER_FAT_POINTER) { 16308 // Always expand system scope min/max atomics. 16309 if (HasSystemScope) 16310 return AtomicExpansionKind::CmpXChg; 16311 } 16312 break; 16313 } 16314 default: 16315 break; 16316 } 16317 16318 return AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(RMW); 16319 } 16320 16321 TargetLowering::AtomicExpansionKind 16322 SITargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { 16323 return LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS 16324 ? AtomicExpansionKind::NotAtomic 16325 : AtomicExpansionKind::None; 16326 } 16327 16328 TargetLowering::AtomicExpansionKind 16329 SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { 16330 return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS 16331 ? AtomicExpansionKind::NotAtomic 16332 : AtomicExpansionKind::None; 16333 } 16334 16335 TargetLowering::AtomicExpansionKind 16336 SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const { 16337 return CmpX->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS 16338 ? 
AtomicExpansionKind::NotAtomic 16339 : AtomicExpansionKind::None; 16340 } 16341 16342 const TargetRegisterClass * 16343 SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const { 16344 const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, false); 16345 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); 16346 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent) 16347 return Subtarget->getWavefrontSize() == 64 ? &AMDGPU::SReg_64RegClass 16348 : &AMDGPU::SReg_32RegClass; 16349 if (!TRI->isSGPRClass(RC) && !isDivergent) 16350 return TRI->getEquivalentSGPRClass(RC); 16351 if (TRI->isSGPRClass(RC) && isDivergent) 16352 return TRI->getEquivalentVGPRClass(RC); 16353 16354 return RC; 16355 } 16356 16357 // FIXME: This is a workaround for DivergenceAnalysis not understanding always 16358 // uniform values (as produced by the mask results of control flow intrinsics) 16359 // used outside of divergent blocks. The phi users need to also be treated as 16360 // always uniform. 16361 // 16362 // FIXME: DA is no longer in-use. Does this still apply to UniformityAnalysis? 16363 static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited, 16364 unsigned WaveSize) { 16365 // FIXME: We assume we never cast the mask results of a control flow 16366 // intrinsic. 16367 // Early exit if the type won't be consistent as a compile time hack. 16368 IntegerType *IT = dyn_cast<IntegerType>(V->getType()); 16369 if (!IT || IT->getBitWidth() != WaveSize) 16370 return false; 16371 16372 if (!isa<Instruction>(V)) 16373 return false; 16374 if (!Visited.insert(V).second) 16375 return false; 16376 bool Result = false; 16377 for (const auto *U : V->users()) { 16378 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) { 16379 if (V == U->getOperand(1)) { 16380 switch (Intrinsic->getIntrinsicID()) { 16381 default: 16382 Result = false; 16383 break; 16384 case Intrinsic::amdgcn_if_break: 16385 case Intrinsic::amdgcn_if: 16386 case Intrinsic::amdgcn_else: 16387 Result = true; 16388 break; 16389 } 16390 } 16391 if (V == U->getOperand(0)) { 16392 switch (Intrinsic->getIntrinsicID()) { 16393 default: 16394 Result = false; 16395 break; 16396 case Intrinsic::amdgcn_end_cf: 16397 case Intrinsic::amdgcn_loop: 16398 Result = true; 16399 break; 16400 } 16401 } 16402 } else { 16403 Result = hasCFUser(U, Visited, WaveSize); 16404 } 16405 if (Result) 16406 break; 16407 } 16408 return Result; 16409 } 16410 16411 bool SITargetLowering::requiresUniformRegister(MachineFunction &MF, 16412 const Value *V) const { 16413 if (const CallInst *CI = dyn_cast<CallInst>(V)) { 16414 if (CI->isInlineAsm()) { 16415 // FIXME: This cannot give a correct answer. This should only trigger in 16416 // the case where inline asm returns mixed SGPR and VGPR results, used 16417 // outside the defining block. We don't have a specific result to 16418 // consider, so this assumes if any value is SGPR, the overall register 16419 // also needs to be SGPR. 
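      // For example, for an inline asm such as
      //   %r = call { i32, i32 } asm "...", "=s,=v"()
      // the "=s" output maps to an SGPR class, so the whole result is
      // conservatively treated as requiring a uniform register.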
16420 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo(); 16421 TargetLowering::AsmOperandInfoVector TargetConstraints = ParseConstraints( 16422 MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI); 16423 for (auto &TC : TargetConstraints) { 16424 if (TC.Type == InlineAsm::isOutput) { 16425 ComputeConstraintToUse(TC, SDValue()); 16426 const TargetRegisterClass *RC = getRegForInlineAsmConstraint( 16427 SIRI, TC.ConstraintCode, TC.ConstraintVT).second; 16428 if (RC && SIRI->isSGPRClass(RC)) 16429 return true; 16430 } 16431 } 16432 } 16433 } 16434 SmallPtrSet<const Value *, 16> Visited; 16435 return hasCFUser(V, Visited, Subtarget->getWavefrontSize()); 16436 } 16437 16438 bool SITargetLowering::hasMemSDNodeUser(SDNode *N) const { 16439 SDNode::use_iterator I = N->use_begin(), E = N->use_end(); 16440 for (; I != E; ++I) { 16441 if (MemSDNode *M = dyn_cast<MemSDNode>(*I)) { 16442 if (getBasePtrIndex(M) == I.getOperandNo()) 16443 return true; 16444 } 16445 } 16446 return false; 16447 } 16448 16449 bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0, 16450 SDValue N1) const { 16451 if (!N0.hasOneUse()) 16452 return false; 16453 // Take care of the opportunity to keep N0 uniform 16454 if (N0->isDivergent() || !N1->isDivergent()) 16455 return true; 16456 // Check if we have a good chance to form the memory access pattern with the 16457 // base and offset 16458 return (DAG.isBaseWithConstantOffset(N0) && 16459 hasMemSDNodeUser(*N0->use_begin())); 16460 } 16461 16462 bool SITargetLowering::isReassocProfitable(MachineRegisterInfo &MRI, 16463 Register N0, Register N1) const { 16464 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks 16465 } 16466 16467 MachineMemOperand::Flags 16468 SITargetLowering::getTargetMMOFlags(const Instruction &I) const { 16469 // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load. 
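  // For example, a load tagged by that pass with the noclobber metadata:
  //   %v = load i32, ptr addrspace(1) %p, !amdgpu.noclobber !0
  // gets the MONoClobber flag on its MachineMemOperand here.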
16470 MachineMemOperand::Flags Flags = MachineMemOperand::MONone; 16471 if (I.getMetadata("amdgpu.noclobber")) 16472 Flags |= MONoClobber; 16473 if (I.getMetadata("amdgpu.last.use")) 16474 Flags |= MOLastUse; 16475 return Flags; 16476 } 16477 16478 bool SITargetLowering::checkForPhysRegDependency( 16479 SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, 16480 const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const { 16481 if (User->getOpcode() != ISD::CopyToReg) 16482 return false; 16483 if (!Def->isMachineOpcode()) 16484 return false; 16485 MachineSDNode *MDef = dyn_cast<MachineSDNode>(Def); 16486 if (!MDef) 16487 return false; 16488 16489 unsigned ResNo = User->getOperand(Op).getResNo(); 16490 if (User->getOperand(Op)->getValueType(ResNo) != MVT::i1) 16491 return false; 16492 const MCInstrDesc &II = TII->get(MDef->getMachineOpcode()); 16493 if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) { 16494 PhysReg = AMDGPU::SCC; 16495 const TargetRegisterClass *RC = 16496 TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo)); 16497 Cost = RC->getCopyCost(); 16498 return true; 16499 } 16500 return false; 16501 } 16502 16503 void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const { 16504 AtomicRMWInst::BinOp Op = AI->getOperation(); 16505 16506 if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or || 16507 Op == AtomicRMWInst::Xor) { 16508 // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0 16509 assert(cast<Constant>(AI->getValOperand())->isNullValue() && 16510 "this cannot be replaced with add"); 16511 AI->setOperation(AtomicRMWInst::Add); 16512 return; 16513 } 16514 16515 assert(Subtarget->hasAtomicFaddInsts() && 16516 "target should have atomic fadd instructions"); 16517 assert(AI->getType()->isFloatTy() && 16518 AI->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS && 16519 "generic atomicrmw expansion only supports FP32 operand in flat " 16520 "address space"); 16521 assert(Op == AtomicRMWInst::FAdd && "only fadd is supported for now"); 16522 16523 // Given: atomicrmw fadd ptr %addr, float %val ordering 16524 // 16525 // With this expansion we produce the following code: 16526 // [...] 
16527 // br label %atomicrmw.check.shared 16528 // 16529 // atomicrmw.check.shared: 16530 // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr) 16531 // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private 16532 // 16533 // atomicrmw.shared: 16534 // %cast.shared = addrspacecast ptr %addr to ptr addrspace(3) 16535 // %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared, 16536 // float %val ordering 16537 // br label %atomicrmw.phi 16538 // 16539 // atomicrmw.check.private: 16540 // %is.private = call i1 @llvm.amdgcn.is.private(ptr %int8ptr) 16541 // br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global 16542 // 16543 // atomicrmw.private: 16544 // %cast.private = addrspacecast ptr %addr to ptr addrspace(5) 16545 // %loaded.private = load float, ptr addrspace(5) %cast.private 16546 // %val.new = fadd float %loaded.private, %val 16547 // store float %val.new, ptr addrspace(5) %cast.private 16548 // br label %atomicrmw.phi 16549 // 16550 // atomicrmw.global: 16551 // %cast.global = addrspacecast ptr %addr to ptr addrspace(1) 16552 // %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global, 16553 // float %val ordering 16554 // br label %atomicrmw.phi 16555 // 16556 // atomicrmw.phi: 16557 // %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ], 16558 // [ %loaded.private, %atomicrmw.private ], 16559 // [ %loaded.global, %atomicrmw.global ] 16560 // br label %atomicrmw.end 16561 // 16562 // atomicrmw.end: 16563 // [...] 16564 16565 IRBuilder<> Builder(AI); 16566 LLVMContext &Ctx = Builder.getContext(); 16567 16568 BasicBlock *BB = Builder.GetInsertBlock(); 16569 Function *F = BB->getParent(); 16570 BasicBlock *ExitBB = 16571 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end"); 16572 BasicBlock *CheckSharedBB = 16573 BasicBlock::Create(Ctx, "atomicrmw.check.shared", F, ExitBB); 16574 BasicBlock *SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB); 16575 BasicBlock *CheckPrivateBB = 16576 BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB); 16577 BasicBlock *PrivateBB = 16578 BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB); 16579 BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB); 16580 BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB); 16581 16582 Value *Val = AI->getValOperand(); 16583 Type *ValTy = Val->getType(); 16584 Value *Addr = AI->getPointerOperand(); 16585 16586 auto CreateNewAtomicRMW = [AI](IRBuilder<> &Builder, Value *Addr, 16587 Value *Val) -> Value * { 16588 AtomicRMWInst *OldVal = 16589 Builder.CreateAtomicRMW(AI->getOperation(), Addr, Val, AI->getAlign(), 16590 AI->getOrdering(), AI->getSyncScopeID()); 16591 SmallVector<std::pair<unsigned, MDNode *>> MDs; 16592 AI->getAllMetadata(MDs); 16593 for (auto &P : MDs) 16594 OldVal->setMetadata(P.first, P.second); 16595 return OldVal; 16596 }; 16597 16598 std::prev(BB->end())->eraseFromParent(); 16599 Builder.SetInsertPoint(BB); 16600 Builder.CreateBr(CheckSharedBB); 16601 16602 Builder.SetInsertPoint(CheckSharedBB); 16603 CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared, {}, 16604 {Addr}, nullptr, "is.shared"); 16605 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB); 16606 16607 Builder.SetInsertPoint(SharedBB); 16608 Value *CastToLocal = Builder.CreateAddrSpaceCast( 16609 Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS)); 16610 Value *LoadedShared = CreateNewAtomicRMW(Builder, CastToLocal, Val); 16611 Builder.CreateBr(PhiBB); 16612 16613 
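  // If the pointer is not to LDS, check for private (scratch) memory next.
  // Private memory is only visible to the current lane, so a plain
  // load/fadd/store sequence is sufficient there.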
  Builder.SetInsertPoint(CheckPrivateBB);
  CallInst *IsPrivate = Builder.CreateIntrinsic(
      Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private");
  Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);

  Builder.SetInsertPoint(PrivateBB);
  Value *CastToPrivate = Builder.CreateAddrSpaceCast(
      Addr, PointerType::get(Ctx, AMDGPUAS::PRIVATE_ADDRESS));
  Value *LoadedPrivate =
      Builder.CreateLoad(ValTy, CastToPrivate, "loaded.private");
  Value *NewVal = Builder.CreateFAdd(LoadedPrivate, Val, "val.new");
  Builder.CreateStore(NewVal, CastToPrivate);
  Builder.CreateBr(PhiBB);

  Builder.SetInsertPoint(GlobalBB);
  Value *CastToGlobal = Builder.CreateAddrSpaceCast(
      Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
  Value *LoadedGlobal = CreateNewAtomicRMW(Builder, CastToGlobal, Val);
  Builder.CreateBr(PhiBB);

  Builder.SetInsertPoint(PhiBB);
  PHINode *Loaded = Builder.CreatePHI(ValTy, 3, "loaded.phi");
  Loaded->addIncoming(LoadedShared, SharedBB);
  Loaded->addIncoming(LoadedPrivate, PrivateBB);
  Loaded->addIncoming(LoadedGlobal, GlobalBB);
  Builder.CreateBr(ExitBB);

  AI->replaceAllUsesWith(Loaded);
  AI->eraseFromParent();
}

LoadInst *
SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
  IRBuilder<> Builder(AI);
  auto Order = AI->getOrdering();

  // The optimization removes the store aspect of the atomicrmw. Therefore,
  // the cache must be flushed if the atomic ordering had release semantics.
  // This does not necessarily require a fence; a release fence just happens
  // to perform that flush. Avoid replacing an atomicrmw that has release
  // semantics.
  if (isReleaseOrStronger(Order))
    return nullptr;

  LoadInst *LI = Builder.CreateAlignedLoad(
      AI->getType(), AI->getPointerOperand(), AI->getAlign());
  LI->setAtomic(Order, AI->getSyncScopeID());
  LI->copyMetadata(*AI);
  LI->takeName(AI);
  AI->replaceAllUsesWith(LI);
  AI->eraseFromParent();
  return LI;
}
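// As an illustration, lowerIdempotentRMWIntoFencedLoad() turns an idempotent
// RMW such as
//   %old = atomicrmw or ptr addrspace(1) %p, i32 0 monotonic
// into an atomic load that keeps the original ordering, sync scope and
// alignment, roughly:
//   %old = load atomic i32, ptr addrspace(1) %p monotonic, align 4
// Release-or-stronger orderings are not converted, as explained above.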