//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

/// \returns true if this is an odd sized vector which should be widened by
/// adding an additional element. This is mostly to handle <3 x s16> -> <4 x
/// s16>. This excludes s1 vectors, which should always be scalarized.
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (!Ty.isVector())
      return false;

    const LLT EltTy = Ty.getElementType();
    const unsigned EltSize = EltTy.getSizeInBits();
    return Ty.getNumElements() % 2 != 0 &&
           EltSize > 1 && EltSize < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::pair(TypeIdx,
                     LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::pair(TypeIdx, LLT::scalarOrVector(
                                  ElementCount::getFixed(NewNumElts), EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next legal RegClass.
static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const unsigned NumElts = Ty.getNumElements();
    const unsigned EltSize = Ty.getElementType().getSizeInBits();
    const unsigned MaxNumElts = MaxRegisterSize / EltSize;

    assert(EltSize == 32 || EltSize == 64);
    assert(Ty.getSizeInBits() < MaxRegisterSize);

    unsigned NewNumElts;
    // Find the nearest legal RegClass that is larger than the current type.
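    // E.g. a width with no matching SGPR class is padded with extra elements
    // until getSGPRClassForBitWidth succeeds for NewNumElts * EltSize
    // (assuming such a class exists below MaxRegisterSize).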
    for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
      if (SIRegisterInfo::getSGPRClassForBitWidth(NewNumElts * EltSize))
        break;
    }

    return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltSize));
  };
}

static LLT getBufferRsrcScalarType(const LLT Ty) {
  if (!Ty.isVector())
    return LLT::scalar(128);
  const ElementCount NumElems = Ty.getElementCount();
  return LLT::vector(NumElems, LLT::scalar(128));
}

static LLT getBufferRsrcRegisterType(const LLT Ty) {
  if (!Ty.isVector())
    return LLT::fixed_vector(4, LLT::scalar(32));
  const unsigned NumElems = Ty.getElementCount().getFixedValue();
  return LLT::fixed_vector(NumElems * 4, LLT::scalar(32));
}

static LLT getBitcastRegisterType(const LLT Ty) {
  const unsigned Size = Ty.getSizeInBits();

  if (Size <= 32) {
    // <2 x s8> -> s16
    // <4 x s8> -> s32
    return LLT::scalar(Size);
  }

  return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
}

static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return std::pair(TypeIdx, getBitcastRegisterType(Ty));
  };
}

static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();
    assert(Size % 32 == 0);
    return std::pair(
        TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}

// RegisterType that doesn't have a corresponding RegClass.
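// Such types are later padded up to a width that does have an SGPR class (see
// moreElementsToNextExistingRegClass above).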
static LegalityPredicate isIllegalRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    LLT Ty = Query.Types[TypeIdx];
    return isRegisterType(Ty) &&
           !SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits());
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

// If we have a truncating store or an extending load with a data size larger
// than 32-bits, we need to reduce to a 32-bit type.
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
  };
}

// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad, bool IsAtomic) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return ST.enableFlatScratch() ? 128 : 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
  case AMDGPUAS::BUFFER_RESOURCE:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    return IsLoad ? 512 : 128;
  default:
    // FIXME: Flat addresses may contextually need to be split to 32-bit parts
    // if they may alias scratch depending on the subtarget. This needs to be
    // moved to custom handling to use addressMayBeAccessedAsPrivate.
    return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
  }
}

static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
  uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // Do not handle extending vector loads.
  if (Ty.isVector() && MemSize != RegSize)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough,
  // but we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
                                    Query.MMODescrs[0].Ordering !=
                                        AtomicOrdering::NotAtomic))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}

// The newer buffer intrinsic forms take their resource arguments as
// pointers in address space 8, aka s128 values. However, in order to not break
// SelectionDAG, the underlying operations have to continue to take v4i32
// arguments. Therefore, we convert resource pointers - or vectors of them -
// to integer values here.
static bool hasBufferRsrcWorkaround(const LLT Ty) {
  if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
    return true;
  if (Ty.isVector()) {
    const LLT ElemTy = Ty.getElementType();
    return hasBufferRsrcWorkaround(ElemTy);
  }
  return false;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// workaround this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will workaround this for
// now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 64)
    return false;
  // Address space 8 pointers get their own workaround.
  if (hasBufferRsrcWorkaround(Ty))
    return false;
  if (!Ty.isVector())
    return true;

  LLT EltTy = Ty.getElementType();
  if (EltTy.isPointer())
    return true;

  unsigned EltSize = EltTy.getSizeInBits();
  return EltSize != 32 && EltSize != 64;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST,
                             const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
         !hasBufferRsrcWorkaround(Ty) && !loadStoreBitcastWorkaround(Ty);
}

/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
                                       const LLT MemTy) {
  const unsigned MemSizeInBits = MemTy.getSizeInBits();
  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
    return true;

  // Don't try to handle bitcasting vector ext loads for now.
  return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
         (Size <= 32 || isRegisterSize(Size)) &&
         !isRegisterVectorElementType(Ty.getElementType());
}

/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment.
/// Note this is the case where the memory access itself changes, not the size
/// of the result register.
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
                            uint64_t AlignInBits, unsigned AddrSpace,
                            unsigned Opcode) {
  unsigned SizeInBits = MemoryTy.getSizeInBits();
  // We don't want to widen cases that are naturally legal.
  if (isPowerOf2_32(SizeInBits))
    return false;

  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
  // end up widening these for a scalar load during RegBankSelect, if we don't
  // have 96-bit scalar loads.
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
    return false;

  if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode, false))
    return false;

  // A load is known dereferenceable up to the alignment, so it's legal to
  // widen to it.
  //
  // TODO: Could check dereferenceable for less aligned cases.
  unsigned RoundedSize = NextPowerOf2(SizeInBits);
  if (AlignInBits < RoundedSize)
    return false;

  // Do not widen if it would introduce a slow unaligned load.
  const SITargetLowering *TLI = ST.getTargetLowering();
  unsigned Fast = 0;
  return TLI->allowsMisalignedMemoryAccessesImpl(
             RoundedSize, AddrSpace, Align(AlignInBits / 8),
             MachineMemOperand::MOLoad, &Fast) &&
         Fast;
}

static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
                            unsigned Opcode) {
  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
    return false;

  return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
                         Query.MMODescrs[0].AlignInBits,
                         Query.Types[1].getAddressSpace(), Opcode);
}

/// Mutates IR (typically a load instruction) to use a <4 x s32> as the initial
/// type of the operand `idx` and then to transform it to a `p8` via bitcasts
/// and inttoptr. In addition, handle vectors of p8. Returns the new type.
static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B,
                                   MachineRegisterInfo &MRI, unsigned Idx) {
  MachineOperand &MO = MI.getOperand(Idx);

  const LLT PointerTy = MRI.getType(MO.getReg());

  // Paranoidly prevent us from doing this multiple times.
  if (!hasBufferRsrcWorkaround(PointerTy))
    return PointerTy;

  const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
  const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
  if (!PointerTy.isVector()) {
    // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
    const unsigned NumParts = PointerTy.getSizeInBits() / 32;
    const LLT S32 = LLT::scalar(32);

    Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
    std::array<Register, 4> VectorElems;
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    for (unsigned I = 0; I < NumParts; ++I)
      VectorElems[I] =
          B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
    B.buildMergeValues(MO, VectorElems);
    MO.setReg(VectorReg);
    return VectorTy;
  }
  Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
  B.setInsertPt(B.getMBB(), ++B.getInsertPt());
  auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
  B.buildIntToPtr(MO, Scalar);
  MO.setReg(BitcastReg);

  return VectorTy;
}

/// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is
/// the form in which the value must be in order to be passed to the low-level
/// representations used for MUBUF/MTBUF intrinsics.
/// This is a hack, which is needed in order to account for the fact that we
/// can't define a register class for s128 without breaking SelectionDAG.
static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B) {
  MachineRegisterInfo &MRI = *B.getMRI();
  const LLT PointerTy = MRI.getType(Pointer);
  const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
  const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);

  if (!PointerTy.isVector()) {
    // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32)
    SmallVector<Register, 4> PointerParts;
    const unsigned NumParts = PointerTy.getSizeInBits() / 32;
    auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
    for (unsigned I = 0; I < NumParts; ++I)
      PointerParts.push_back(Unmerged.getReg(I));
    return B.buildBuildVector(VectorTy, PointerParts).getReg(0);
  }
  Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
  return B.buildBitcast(VectorTy, Scalar).getReg(0);
}

static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B,
                                     unsigned Idx) {
  MachineOperand &MO = MI.getOperand(Idx);

  const LLT PointerTy = B.getMRI()->getType(MO.getReg());
  // Paranoidly prevent us from doing this multiple times.
  if (!hasBufferRsrcWorkaround(PointerTy))
    return;
  MO.setReg(castBufferRsrcToV4I32(MO.getReg(), B));
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
    : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);
  const LLT MaxScalar = LLT::scalar(MaxRegisterSize);

  const LLT V2S8 = LLT::fixed_vector(2, 8);
  const LLT V2S16 = LLT::fixed_vector(2, 16);
  const LLT V4S16 = LLT::fixed_vector(4, 16);

  const LLT V2S32 = LLT::fixed_vector(2, 32);
  const LLT V3S32 = LLT::fixed_vector(3, 32);
  const LLT V4S32 = LLT::fixed_vector(4, 32);
  const LLT V5S32 = LLT::fixed_vector(5, 32);
  const LLT V6S32 = LLT::fixed_vector(6, 32);
  const LLT V7S32 = LLT::fixed_vector(7, 32);
  const LLT V8S32 = LLT::fixed_vector(8, 32);
  const LLT V9S32 = LLT::fixed_vector(9, 32);
  const LLT V10S32 = LLT::fixed_vector(10, 32);
  const LLT V11S32 = LLT::fixed_vector(11, 32);
  const LLT V12S32 = LLT::fixed_vector(12, 32);
  const LLT V13S32 = LLT::fixed_vector(13, 32);
  const LLT V14S32 = LLT::fixed_vector(14, 32);
  const LLT V15S32 = LLT::fixed_vector(15, 32);
  const LLT V16S32 = LLT::fixed_vector(16, 32);
  const LLT V32S32 = LLT::fixed_vector(32, 32);

  const LLT V2S64 = LLT::fixed_vector(2, 64);
  const LLT V3S64 = LLT::fixed_vector(3, 64);
  const LLT V4S64 = LLT::fixed_vector(4, 64);
  const LLT V5S64 = LLT::fixed_vector(5, 64);
  const LLT V6S64 = LLT::fixed_vector(6, 64);
  const LLT V7S64 = LLT::fixed_vector(7, 64);
  const LLT V8S64 = LLT::fixed_vector(8, 64);
  const LLT V16S64 = LLT::fixed_vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
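  // The corresponding list of 64-bit element vector types; both lists are used
  // below (e.g. for G_PHI and G_BUILD_VECTOR) to mark whole families of vector
  // types legal at once.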
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
  const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
  const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE);
  const LLT BufferStridedPtr =
      GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;

  // s1 for VCC branches, s32 for SCC branches.
  getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .legalFor(AddrSpaces128)
    .legalIf(isPointer(0))
    .clampScalar(0, S16, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .scalarize(0);

  if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
    // Full set of gfx9 features.
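    // On these subtargets 16-bit and packed 16-bit arithmetic is available, so
    // G_ADD/G_SUB/G_MUL are kept legal for s16 and v2s16, and only wider or
    // odd-sized types are widened/scalarized below.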
    if (ST.hasScalarAddSub64()) {
      getActionDefinitionsBuilder({G_ADD, G_SUB})
        .legalFor({S64, S32, S16, V2S16})
        .clampMaxNumElementsStrict(0, S16, 2)
        .scalarize(0)
        .minScalar(0, S16)
        .widenScalarToNextMultipleOf(0, 32)
        .maxScalar(0, S32);
    } else {
      getActionDefinitionsBuilder({G_ADD, G_SUB})
        .legalFor({S32, S16, V2S16})
        .clampMaxNumElementsStrict(0, S16, 2)
        .scalarize(0)
        .minScalar(0, S16)
        .widenScalarToNextMultipleOf(0, 32)
        .maxScalar(0, S32);
    }

    if (ST.hasScalarSMulU64()) {
      getActionDefinitionsBuilder(G_MUL)
        .legalFor({S64, S32, S16, V2S16})
        .clampMaxNumElementsStrict(0, S16, 2)
        .scalarize(0)
        .minScalar(0, S16)
        .widenScalarToNextMultipleOf(0, 32)
        .custom();
    } else {
      getActionDefinitionsBuilder(G_MUL)
        .legalFor({S32, S16, V2S16})
        .clampMaxNumElementsStrict(0, S16, 2)
        .scalarize(0)
        .minScalar(0, S16)
        .widenScalarToNextMultipleOf(0, 32)
        .custom();
    }
    assert(ST.hasMad64_32());

    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
      .legalFor({S32, S16, V2S16}) // Clamp modifier
      .minScalarOrElt(0, S16)
      .clampMaxNumElementsStrict(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32)
      .lower();
  } else if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB})
      .legalFor({S32, S16})
      .minScalar(0, S16)
      .widenScalarToNextMultipleOf(0, 32)
      .maxScalar(0, S32)
      .scalarize(0);

    getActionDefinitionsBuilder(G_MUL)
      .legalFor({S32, S16})
      .scalarize(0)
      .minScalar(0, S16)
      .widenScalarToNextMultipleOf(0, 32)
      .custom();
    assert(ST.hasMad64_32());

    // Technically the saturating operations require clamp bit support, but
    // this was introduced at the same time as 16-bit operations.
    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
      .legalFor({S32, S16}) // Clamp modifier
      .minScalar(0, S16)
      .scalarize(0)
      .widenScalarToNextPow2(0, 16)
      .lower();

    // We're just lowering this, but it helps get a better result to try to
    // coerce to the desired type first.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB})
      .legalFor({S32})
      .widenScalarToNextMultipleOf(0, 32)
      .clampScalar(0, S32, S32)
      .scalarize(0);

    auto &Mul = getActionDefinitionsBuilder(G_MUL)
      .legalFor({S32})
      .scalarize(0)
      .minScalar(0, S32)
      .widenScalarToNextMultipleOf(0, 32);

    if (ST.hasMad64_32())
      Mul.custom();
    else
      Mul.maxScalar(0, S32);

    if (ST.hasIntClamp()) {
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .legalFor({S32}) // Clamp modifier.
        .scalarize(0)
        .minScalarOrElt(0, S32)
        .lower();
    } else {
      // Clamp bit support was added in VI, along with 16-bit operations.
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .minScalar(0, S32)
        .scalarize(0)
        .lower();
    }

    // FIXME: DAG expansion gets better results. The widening uses the smaller
    // range values and goes for the min/max lowering directly.
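    // For now the signed saturating forms are simply lowered to the generic
    // expansion after clamping to s32.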
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(
    {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
    .customFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
                   .legalFor({S32})
                   .maxScalar(0, S32);

  if (ST.hasVOP3PInsts()) {
    Mulh
      .clampMaxNumElements(0, S8, 2)
      .lowerFor({V2S8});
  }

  Mulh
    .scalarize(0)
    .lower();

  // Report legal for any types we can handle anywhere. For the cases only
  // legal on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder(
    {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();

  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalIf(isPointer(0))
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0);

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
    .legalIf(isRegisterType(0))
    // s1 and s16 are special cases because they have legal operations on
    // them, but don't really occupy registers in the normal way.
    .legalFor({S1, S16})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, MaxScalar)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});

  // If the amount is divergent, we have to do a wave reduction to get the
  // maximum value, so this is expanded during RegBankSelect.
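  // At this level G_DYN_STACKALLOC is simply accepted for a private pointer
  // result with an s32 size operand.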
  getActionDefinitionsBuilder(G_DYN_STACKALLOC)
    .legalFor({{PrivatePtr, S32}});

  getActionDefinitionsBuilder(G_STACKSAVE)
    .customFor({PrivatePtr});
  getActionDefinitionsBuilder(G_STACKRESTORE)
    .legalFor({PrivatePtr});

  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .customIf(typeIsNot(0, PrivatePtr));

  getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});

  auto &FPOpActions = getActionDefinitionsBuilder(
    {G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
     G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  if (ST.hasPackedFP32Ops()) {
    FPOpActions.legalFor({V2S32});
    FPOpActions.clampMaxNumElementsStrict(0, S32, 2);
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
    G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElementsStrict(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElementsStrict(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S16})
      .customFor({S32, S64})
      .scalarize(0)
      .unsupported();
    getActionDefinitionsBuilder(G_FFLOOR)
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);

    getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
      .legalFor({{S32, S32}, {S64, S32}, {S16, S16}})
      .scalarize(0)
      .maxScalarIf(typeIs(0, S16), 1, S16)
      .clampScalar(1, S32, S32)
      .lower();

    getActionDefinitionsBuilder(G_FFREXP)
      .customFor({{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}})
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .customFor({S32, S64, S16})
      .scalarize(0)
      .unsupported();

    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    }

    getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
      .legalFor({{S32, S32}, {S64, S32}})
      .scalarize(0)
      .clampScalar(0, S32, S64)
      .clampScalar(1, S32, S32)
      .lower();

    getActionDefinitionsBuilder(G_FFREXP)
      .customFor({{S32, S32}, {S64, S32}})
      .scalarize(0)
      .minScalar(0, S32)
      .clampScalar(1, S32, S32)
      .lower();
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
    .scalarize(0);

  auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB});
  if (ST.has16BitInsts()) {
    FSubActions
      // Use actual fsub instruction
      .legalFor({S32, S16})
      // Must use fadd + fneg
      .lowerFor({S64, V2S16});
  } else {
    FSubActions
      // Use actual fsub instruction
      .legalFor({S32})
      // Must use fadd + fneg
      .lowerFor({S64, S16, V2S16});
  }

  FSubActions
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
    FMad.customFor({S32, S16});
  else if (ST.hasMadMacF32Insts())
    FMad.customFor({S32});
  else if (ST.hasMadF16())
    FMad.customFor({S16});
  FMad.scalarize(0)
      .lower();

  auto &FRem = getActionDefinitionsBuilder(G_FREM);
  if (ST.has16BitInsts()) {
    FRem.customFor({S16, S32, S64});
  } else {
    FRem.minScalar(0, S32)
        .customFor({S32, S64});
  }
  FRem.scalarize(0);

  // TODO: Do we need to clamp maximum bitwidth?
  getActionDefinitionsBuilder(G_TRUNC)
    .legalIf(isScalar(0))
    .legalFor({{V2S16, V2S32}})
    .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In unresolvable
    // situations (like an invalid implicit use), we don't want to infinite
    // loop in the legalizer.
    .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S32, S64}, {S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .minScalar(0, S32)
       .scalarize(0)
       .widenScalarToNextPow2(1);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S32}, {S64, S64}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .widenScalarToNextPow2(0, 32)
       .scalarize(0)
       .lower();

  getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
    .customFor({S16, S32})
    .scalarize(0)
    .lower();

  // Lower G_FNEARBYINT and G_FRINT into G_INTRINSIC_ROUNDEVEN
  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(
      {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder(
      {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder(
      {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_PTR_ADD)
    .unsupportedFor({BufferFatPtr, BufferStridedPtr, RsrcPtr})
    .legalIf(all(isPointer(0), sameSize(0, 1)))
    .scalarize(0)
    .scalarSameSizeAs(1, 0);

  getActionDefinitionsBuilder(G_PTRMASK)
    .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
    .scalarSameSizeAs(1, 0)
    .scalarize(0);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
      // The compare output type differs based on the register bank of the
      // output, so make both s1 and s32 legal.
      //
      // Scalar compares producing output in scc will be promoted to s32, as
      // that is the allocatable register type that will be needed for the copy
      // from scc. This will be promoted during RegBankSelect, and we assume
      // something before that won't try to use s32 result types.
      //
      // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
      // bank.
      .legalForCartesianProduct(
        {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
      .legalForCartesianProduct(
        {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  auto &FCmpBuilder =
    getActionDefinitionsBuilder(G_FCMP).legalForCartesianProduct(
      {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);

  if (ST.hasSALUFloatInsts())
    FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32});

  FCmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fpow has a selection pattern that should move to custom lowering.
  auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
        .scalarize(0);

  getActionDefinitionsBuilder(G_FPOWI)
    .clampScalar(0, MinScalarFPTy, S32)
    .lower();

  auto &Log2Ops = getActionDefinitionsBuilder({G_FLOG2, G_FEXP2});
  Log2Ops.customFor({S32});
  if (ST.has16BitInsts())
    Log2Ops.legalFor({S16});
  else
    Log2Ops.customFor({S16});
  Log2Ops.scalarize(0)
         .lower();

  auto &LogOps =
    getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
  LogOps.customFor({S32, S16});
  LogOps.clampScalar(0, MinScalarFPTy, S32)
        .scalarize(0);

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .widenScalarToNextPow2(1, 32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32);

  // If no 16 bit instr is available, lower into different instructions.
  if (ST.has16BitInsts())
    getActionDefinitionsBuilder(G_IS_FPCLASS)
      .legalForCartesianProduct({S1}, FPTypes16)
      .widenScalarToNextPow2(1)
      .scalarize(0)
      .lower();
  else
    getActionDefinitionsBuilder(G_IS_FPCLASS)
      .legalForCartesianProduct({S1}, FPTypesBase)
      .lowerFor({S1, S16})
      .widenScalarToNextPow2(1)
      .scalarize(0)
      .lower();

  // The hardware instructions return a different result on 0 than the generic
  // instructions expect. The hardware produces -1, but these produce the
  // bitwidth.
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .scalarize(0)
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)
    .custom();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
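  // So both s32 and s64 are accepted here, and other widths are clamped into
  // that range and scalarized.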
  getActionDefinitionsBuilder(G_BITREVERSE)
    .legalFor({S32, S64})
    .clampScalar(0, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S16, S32, V2S16})
      .clampMaxNumElementsStrict(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)
      .scalarize(0);

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .legalFor({S32, S16, V2S16})
        .clampMaxNumElements(0, S16, 2)
        .minScalar(0, S16)
        .widenScalarToNextPow2(0)
        .scalarize(0)
        .lower();
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .minScalar(0, S16)
        .scalarize(0)
        .lower();
    }
  } else {
    // TODO: Should have same legality without v_perm_b32
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S32})
      .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .maxScalar(0, S32)
      .scalarize(0)
      .lower();

    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
      .legalFor({S32})
      .minScalar(0, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::pair(
            1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) {
      return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
    });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::pair(
            0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) {
      return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
    });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
                                      Query.MMODescrs[0].Ordering !=
                                          AtomicOrdering::NotAtomic))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
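    // e.g. a 96-bit (3 dword) access is only directly usable on subtargets
    // with dwordx3 load/store support; other non-power-of-2 register counts
    // must be split.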
    unsigned NumRegs = (MemSize + 31) / 32;
    if (NumRegs == 3) {
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    return false;
  };

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Explicitly list some common cases.
    // TODO: Does this help compile time at all?
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
                                      {V2S32, GlobalPtr, V2S32, GlobalAlign32},
                                      {V4S32, GlobalPtr, V4S32, GlobalAlign32},
                                      {S64, GlobalPtr, S64, GlobalAlign32},
                                      {V2S64, GlobalPtr, V2S64, GlobalAlign32},
                                      {V2S16, GlobalPtr, V2S16, GlobalAlign32},
                                      {S32, GlobalPtr, S8, GlobalAlign8},
                                      {S32, GlobalPtr, S16, GlobalAlign16},

                                      {S32, LocalPtr, S32, 32},
                                      {S64, LocalPtr, S64, 32},
                                      {V2S32, LocalPtr, V2S32, 32},
                                      {S32, LocalPtr, S8, 8},
                                      {S32, LocalPtr, S16, 16},
                                      {V2S16, LocalPtr, S32, 32},

                                      {S32, PrivatePtr, S32, 32},
                                      {S32, PrivatePtr, S8, 8},
                                      {S32, PrivatePtr, S16, 16},
                                      {V2S16, PrivatePtr, S32, 32},

                                      {S32, ConstantPtr, S32, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32},
                                      {V4S32, ConstantPtr, V4S32, GlobalAlign32},
                                      {S64, ConstantPtr, S64, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
    Actions.legalIf(
      [=](const LegalityQuery &Query) -> bool {
        return isLoadStoreLegal(ST, Query);
      });

    // The custom pointers (fat pointers, buffer resources) don't work with
    // load and store at this level. Fat pointers should have been lowered to
    // intrinsics before the translation to MIR.
    Actions.unsupportedIf(
      typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));

    // Address space 8 pointers are handled by a 4xs32 load, bitcast, and
    // ptrtoint. This is needed to account for the fact that we can't have i128
    // as a register class for SelectionDAG reasons.
    Actions.customIf([=](const LegalityQuery &Query) -> bool {
      return hasBufferRsrcWorkaround(Query.Types[0]);
    });

    // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
    // 64-bits.
    //
    // TODO: Should generalize bitcast action into coerce, which will also
    // cover inserting addrspacecasts.
    Actions.customIf(typeIs(1, Constant32Ptr));

    // Turn any illegal element vectors into something easier to deal
    // with. These will ultimately produce 32-bit scalar shifts to extract the
    // parts anyway.
    //
    // For odd 16-bit element vectors, prefer to split those into pieces with
    // 16-bit vector parts.
    Actions.bitcastIf(
      [=](const LegalityQuery &Query) -> bool {
        return shouldBitcastLoadStoreType(ST, Query.Types[0],
                                          Query.MMODescrs[0].MemoryTy);
      }, bitcastToRegisterType(0));

    if (!IsStore) {
      // Widen suitably aligned loads by loading extra bytes.
      // The standard legalization actions can't properly express widening
      // memory operands.
      Actions.customIf([=](const LegalityQuery &Query) -> bool {
        return shouldWidenLoad(ST, Query, G_LOAD);
      });
    }

    // FIXME: load/store narrowing should be moved to lower action
    Actions
        .narrowScalarIf(
            [=](const LegalityQuery &Query) -> bool {
              return !Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              const unsigned DstSize = DstTy.getSizeInBits();
              unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

              // Split extloads.
              if (DstSize > MemSize)
                return std::pair(0, LLT::scalar(MemSize));

              unsigned MaxSize = maxSizeForAddrSpace(
                  ST, PtrTy.getAddressSpace(), Op == G_LOAD,
                  Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
              if (MemSize > MaxSize)
                return std::pair(0, LLT::scalar(MaxSize));

              uint64_t Align = Query.MMODescrs[0].AlignInBits;
              return std::pair(0, LLT::scalar(Align));
            })
        .fewerElementsIf(
            [=](const LegalityQuery &Query) -> bool {
              return Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              LLT EltTy = DstTy.getElementType();
              unsigned MaxSize = maxSizeForAddrSpace(
                  ST, PtrTy.getAddressSpace(), Op == G_LOAD,
                  Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);

              // FIXME: Handle widened to power of 2 results better. This ends
              // up scalarizing.
              // FIXME: 3 element stores scalarized on SI

              // Split if it's too large for the address space.
              unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
              if (MemSize > MaxSize) {
                unsigned NumElts = DstTy.getNumElements();
                unsigned EltSize = EltTy.getSizeInBits();

                if (MaxSize % EltSize == 0) {
                  return std::pair(
                      0, LLT::scalarOrVector(
                             ElementCount::getFixed(MaxSize / EltSize), EltTy));
                }

                unsigned NumPieces = MemSize / MaxSize;

                // FIXME: Refine when odd breakdowns handled
                // The scalars will need to be re-legalized.
                if (NumPieces == 1 || NumPieces >= NumElts ||
                    NumElts % NumPieces != 0)
                  return std::pair(0, EltTy);

                return std::pair(0,
                                 LLT::fixed_vector(NumElts / NumPieces, EltTy));
              }

              // FIXME: We could probably handle weird extending loads better.
              if (DstTy.getSizeInBits() > MemSize)
                return std::pair(0, EltTy);

              unsigned EltSize = EltTy.getSizeInBits();
              unsigned DstSize = DstTy.getSizeInBits();
              if (!isPowerOf2_32(DstSize)) {
                // We're probably decomposing an odd sized store. Try to split
                // to the widest type. TODO: Account for alignment. As-is it
                // should be OK, since the new parts will be further legalized.
                unsigned FloorSize = llvm::bit_floor(DstSize);
                return std::pair(
                    0, LLT::scalarOrVector(
                           ElementCount::getFixed(FloorSize / EltSize), EltTy));
              }

              // May need relegalization for the scalars.
              return std::pair(0, EltTy);
            })
        .minScalar(0, S32)
        .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
        .widenScalarToNextPow2(0)
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
        .lower();
  }

  // FIXME: Unaligned accesses not lowered.
  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
                                                  {S32, GlobalPtr, S16, 2 * 8},
                                                  {S32, LocalPtr, S8, 8},
                                                  {S32, LocalPtr, S16, 16},
                                                  {S32, PrivatePtr, S8, 8},
                                                  {S32, PrivatePtr, S16, 16},
                                                  {S32, ConstantPtr, S8, 8},
                                                  {S32, ConstantPtr, S16, 2 * 8}})
                       .legalIf(
                         [=](const LegalityQuery &Query) -> bool {
                           return isLoadStoreLegal(ST, Query);
                         });

  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
  }

  // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
  // 64-bits.
  //
  // TODO: Should generalize bitcast action into coerce, which will also cover
  // inserting addrspacecasts.
  ExtLoads.customIf(typeIs(1, Constant32Ptr));

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
  if (ST.hasLDSFPAtomicAdd()) {
    Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
    if (ST.hasGFX90AInsts())
      Atomic.legalFor({{S64, LocalPtr}});
    if (ST.hasAtomicDsPkAdd16Insts())
      Atomic.legalFor({{V2S16, LocalPtr}});
  }
  if (ST.hasAtomicFaddInsts())
    Atomic.legalFor({{S32, GlobalPtr}});
  if (ST.hasFlatAtomicFaddF32Inst())
    Atomic.legalFor({{S32, FlatPtr}});

  if (ST.hasGFX90AInsts()) {
    // These are legal with some caveats, and should have undergone expansion
    // in the IR in most situations
    // TODO: Move atomic expansion into legalizer
    Atomic.legalFor({
        {S32, GlobalPtr},
        {S64, GlobalPtr},
        {S64, FlatPtr}
      });
  }

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and
  // output demarshalling
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
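  // (s32 for the SCC/SGPR case, s1 for the VCC case, mirroring the G_ICMP
  // handling above.)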
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
                               LocalPtr, FlatPtr, PrivatePtr,
                               LLT::fixed_vector(2, LocalPtr),
                               LLT::fixed_vector(2, PrivatePtr)},
                              {S1, S32})
    .clampScalar(0, S16, S64)
    .scalarize(1)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
                     .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S16}});

    // TODO: Support 16-bit shift amounts for all types
    Shifts.widenScalarIf(
      [=](const LegalityQuery &Query) {
        // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
        // 32-bit amount.
        const LLT ValTy = Query.Types[0];
        const LLT AmountTy = Query.Types[1];
        return ValTy.getSizeInBits() <= 16 &&
               AmountTy.getSizeInBits() < 16;
      }, changeTo(1, S16));
    Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 16);
    Shifts.clampScalar(0, S16, S64);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 32);
    Shifts.clampScalar(0, S32, S64);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
        const LLT EltTy = Query.Types[EltTypeIdx];
        const LLT VecTy = Query.Types[VecTypeIdx];
        const LLT IdxTy = Query.Types[IdxTypeIdx];
        const unsigned EltSize = EltTy.getSizeInBits();
        const bool isLegalVecType =
            !!SIRegisterInfo::getSGPRClassForBitWidth(VecTy.getSizeInBits());
        // Address space 8 pointers are 128-bit wide values, but the logic
        // below will try to bitcast them to 2N x s64, which will fail.
        // Therefore, as an intermediate step, handle extracts/insertions by
        // ptrtoint-ing the vector and scalar arguments (or inttoptr-ing the
        // extraction result) in order to produce a vector operation that can
        // be handled by the logic below.
1708 if (EltTy.isPointer() && EltSize > 64) 1709 return true; 1710 return (EltSize == 32 || EltSize == 64) && 1711 VecTy.getSizeInBits() % 32 == 0 && 1712 VecTy.getSizeInBits() <= MaxRegisterSize && 1713 IdxTy.getSizeInBits() == 32 && 1714 isLegalVecType; 1715 }) 1716 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)), 1717 bitcastToVectorElement32(VecTypeIdx)) 1718 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1)) 1719 .bitcastIf( 1720 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)), 1721 [=](const LegalityQuery &Query) { 1722 // For > 64-bit element types, try to turn this into a 64-bit 1723 // element vector since we may be able to do better indexing 1724 // if this is scalar. If not, fall back to 32. 1725 const LLT EltTy = Query.Types[EltTypeIdx]; 1726 const LLT VecTy = Query.Types[VecTypeIdx]; 1727 const unsigned DstEltSize = EltTy.getSizeInBits(); 1728 const unsigned VecSize = VecTy.getSizeInBits(); 1729 1730 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32; 1731 return std::pair( 1732 VecTypeIdx, 1733 LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize)); 1734 }) 1735 .clampScalar(EltTypeIdx, S32, S64) 1736 .clampScalar(VecTypeIdx, S32, S64) 1737 .clampScalar(IdxTypeIdx, S32, S32) 1738 .clampMaxNumElements(VecTypeIdx, S32, 32) 1739 // TODO: Clamp elements for 64-bit vectors? 1740 .moreElementsIf( 1741 isIllegalRegisterType(VecTypeIdx), 1742 moreElementsToNextExistingRegClass(VecTypeIdx)) 1743 // It should only be necessary with variable indexes. 1744 // As a last resort, lower to the stack 1745 .lower(); 1746 } 1747 1748 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 1749 .unsupportedIf([=](const LegalityQuery &Query) { 1750 const LLT &EltTy = Query.Types[1].getElementType(); 1751 return Query.Types[0] != EltTy; 1752 }); 1753 1754 for (unsigned Op : {G_EXTRACT, G_INSERT}) { 1755 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 1756 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 1757 1758 // FIXME: Doesn't handle extract of illegal sizes. 1759 getActionDefinitionsBuilder(Op) 1760 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 1761 .lowerIf([=](const LegalityQuery &Query) { 1762 // Sub-vector(or single element) insert and extract. 1763 // TODO: verify immediate offset here since lower only works with 1764 // whole elements. 1765 const LLT BigTy = Query.Types[BigTyIdx]; 1766 return BigTy.isVector(); 1767 }) 1768 // FIXME: Multiples of 16 should not be legal. 
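      // With the legalIf predicate below, e.g. extracting an s48 piece from
      // an s96 value is accepted as legal (96 is a multiple of 32 and 48 of
      // 16), even though only 32-bit multiples are really wanted per the
      // FIXME above.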
1769 .legalIf([=](const LegalityQuery &Query) { 1770 const LLT BigTy = Query.Types[BigTyIdx]; 1771 const LLT LitTy = Query.Types[LitTyIdx]; 1772 return (BigTy.getSizeInBits() % 32 == 0) && 1773 (LitTy.getSizeInBits() % 16 == 0); 1774 }) 1775 .widenScalarIf( 1776 [=](const LegalityQuery &Query) { 1777 const LLT BigTy = Query.Types[BigTyIdx]; 1778 return (BigTy.getScalarSizeInBits() < 16); 1779 }, 1780 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16)) 1781 .widenScalarIf( 1782 [=](const LegalityQuery &Query) { 1783 const LLT LitTy = Query.Types[LitTyIdx]; 1784 return (LitTy.getScalarSizeInBits() < 16); 1785 }, 1786 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16)) 1787 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1788 .widenScalarToNextPow2(BigTyIdx, 32); 1789 1790 } 1791 1792 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR) 1793 .legalForCartesianProduct(AllS32Vectors, {S32}) 1794 .legalForCartesianProduct(AllS64Vectors, {S64}) 1795 .clampNumElements(0, V16S32, V32S32) 1796 .clampNumElements(0, V2S64, V16S64) 1797 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)) 1798 .moreElementsIf( 1799 isIllegalRegisterType(0), 1800 moreElementsToNextExistingRegClass(0)); 1801 1802 if (ST.hasScalarPackInsts()) { 1803 BuildVector 1804 // FIXME: Should probably widen s1 vectors straight to s32 1805 .minScalarOrElt(0, S16) 1806 .minScalar(1, S16); 1807 1808 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1809 .legalFor({V2S16, S32}) 1810 .lower(); 1811 } else { 1812 BuildVector.customFor({V2S16, S16}); 1813 BuildVector.minScalarOrElt(0, S32); 1814 1815 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1816 .customFor({V2S16, S32}) 1817 .lower(); 1818 } 1819 1820 BuildVector.legalIf(isRegisterType(0)); 1821 1822 // FIXME: Clamp maximum size 1823 getActionDefinitionsBuilder(G_CONCAT_VECTORS) 1824 .legalIf(all(isRegisterType(0), isRegisterType(1))) 1825 .clampMaxNumElements(0, S32, 32) 1826 .clampMaxNumElements(1, S16, 2) // TODO: Make 4? 1827 .clampMaxNumElements(0, S16, 64); 1828 1829 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); 1830 1831 // Merge/Unmerge 1832 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { 1833 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; 1834 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; 1835 1836 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { 1837 const LLT Ty = Query.Types[TypeIdx]; 1838 if (Ty.isVector()) { 1839 const LLT &EltTy = Ty.getElementType(); 1840 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512) 1841 return true; 1842 if (!llvm::has_single_bit<uint32_t>(EltTy.getSizeInBits())) 1843 return true; 1844 } 1845 return false; 1846 }; 1847 1848 auto &Builder = getActionDefinitionsBuilder(Op) 1849 .legalIf(all(isRegisterType(0), isRegisterType(1))) 1850 .lowerFor({{S16, V2S16}}) 1851 .lowerIf([=](const LegalityQuery &Query) { 1852 const LLT BigTy = Query.Types[BigTyIdx]; 1853 return BigTy.getSizeInBits() == 32; 1854 }) 1855 // Try to widen to s16 first for small types. 1856 // TODO: Only do this on targets with legal s16 shifts 1857 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16) 1858 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 1859 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1860 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 1861 elementTypeIs(1, S16)), 1862 changeTo(1, V2S16)) 1863 // Clamp the little scalar to s8-s256 and make it a power of 2. 
It's not
        // worth considering the multiples of 64 since 2*192 and 2*384 are not
        // valid.
        .clampScalar(LitTyIdx, S32, S512)
        .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
        // Break up vectors with weird elements into scalars
        .fewerElementsIf(
            [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
            scalarize(0))
        .fewerElementsIf(
            [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
            scalarize(1))
        .clampScalar(BigTyIdx, S32, MaxScalar);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
          // TODO: Use 16-bit shifts if legal for 8-bit values?
          [=](const LegalityQuery &Query) {
            const LLT Ty = Query.Types[LitTyIdx];
            return Ty.getSizeInBits() < 32;
          },
          changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[BigTyIdx];
          return Ty.getSizeInBits() % 16 != 0;
        },
        [=](const LegalityQuery &Query) {
          // Pick the next power of 2, or a multiple of 64 over 128,
          // whichever is smaller.
          const LLT &Ty = Query.Types[BigTyIdx];
          unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
          if (NewSizeInBits >= 256) {
            unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
            if (RoundedTo < NewSizeInBits)
              NewSizeInBits = RoundedTo;
          }
          return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
        })
        // Any vectors left are the wrong size. Scalarize them.
        .scalarize(0)
        .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
                        .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
        // Prefer to reduce vector widths for 16-bit vectors before lowering,
        // to get more vector shift opportunities, since we'll get those when
        // expanded.
        .clampMaxNumElementsStrict(0, S16, 2);
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend
    // operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
      .scalarize(0)
      .clampScalar(0, S32, S64)
      .lower();

  getActionDefinitionsBuilder({G_ROTR, G_ROTL})
      .scalarize(0)
      .lower();

  // TODO: Only try to form v2s16 with legal packed instructions.
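  // G_FSHR is kept legal only for {s32, s32}; the v2s16 form is lowered
  // directly, and any other vector case is clamped and scalarized before
  // being lowered.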
1938 getActionDefinitionsBuilder(G_FSHR) 1939 .legalFor({{S32, S32}}) 1940 .lowerFor({{V2S16, V2S16}}) 1941 .clampMaxNumElementsStrict(0, S16, 2) 1942 .scalarize(0) 1943 .lower(); 1944 1945 if (ST.hasVOP3PInsts()) { 1946 getActionDefinitionsBuilder(G_FSHL) 1947 .lowerFor({{V2S16, V2S16}}) 1948 .clampMaxNumElementsStrict(0, S16, 2) 1949 .scalarize(0) 1950 .lower(); 1951 } else { 1952 getActionDefinitionsBuilder(G_FSHL) 1953 .scalarize(0) 1954 .lower(); 1955 } 1956 1957 getActionDefinitionsBuilder(G_READCYCLECOUNTER) 1958 .legalFor({S64}); 1959 1960 getActionDefinitionsBuilder(G_FENCE) 1961 .alwaysLegal(); 1962 1963 getActionDefinitionsBuilder({G_SMULO, G_UMULO}) 1964 .scalarize(0) 1965 .minScalar(0, S32) 1966 .lower(); 1967 1968 getActionDefinitionsBuilder({G_SBFX, G_UBFX}) 1969 .legalFor({{S32, S32}, {S64, S32}}) 1970 .clampScalar(1, S32, S32) 1971 .clampScalar(0, S32, S64) 1972 .widenScalarToNextPow2(0) 1973 .scalarize(0); 1974 1975 getActionDefinitionsBuilder( 1976 {// TODO: Verify V_BFI_B32 is generated from expanded bit ops 1977 G_FCOPYSIGN, 1978 1979 G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB, 1980 G_READ_REGISTER, G_WRITE_REGISTER, 1981 1982 G_SADDO, G_SSUBO}) 1983 .lower(); 1984 1985 if (ST.hasIEEEMinMax()) { 1986 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}) 1987 .legalFor(FPTypesPK16) 1988 .clampMaxNumElements(0, S16, 2) 1989 .scalarize(0); 1990 } else { 1991 // TODO: Implement 1992 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower(); 1993 } 1994 1995 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET}) 1996 .lower(); 1997 1998 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, 1999 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, 2000 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) 2001 .unsupported(); 2002 2003 getActionDefinitionsBuilder(G_PREFETCH).alwaysLegal(); 2004 2005 getLegacyLegalizerInfo().computeTables(); 2006 verify(*ST.getInstrInfo()); 2007 } 2008 2009 bool AMDGPULegalizerInfo::legalizeCustom( 2010 LegalizerHelper &Helper, MachineInstr &MI, 2011 LostDebugLocObserver &LocObserver) const { 2012 MachineIRBuilder &B = Helper.MIRBuilder; 2013 MachineRegisterInfo &MRI = *B.getMRI(); 2014 2015 switch (MI.getOpcode()) { 2016 case TargetOpcode::G_ADDRSPACE_CAST: 2017 return legalizeAddrSpaceCast(MI, MRI, B); 2018 case TargetOpcode::G_INTRINSIC_ROUNDEVEN: 2019 return legalizeFroundeven(MI, MRI, B); 2020 case TargetOpcode::G_FCEIL: 2021 return legalizeFceil(MI, MRI, B); 2022 case TargetOpcode::G_FREM: 2023 return legalizeFrem(MI, MRI, B); 2024 case TargetOpcode::G_INTRINSIC_TRUNC: 2025 return legalizeIntrinsicTrunc(MI, MRI, B); 2026 case TargetOpcode::G_SITOFP: 2027 return legalizeITOFP(MI, MRI, B, true); 2028 case TargetOpcode::G_UITOFP: 2029 return legalizeITOFP(MI, MRI, B, false); 2030 case TargetOpcode::G_FPTOSI: 2031 return legalizeFPTOI(MI, MRI, B, true); 2032 case TargetOpcode::G_FPTOUI: 2033 return legalizeFPTOI(MI, MRI, B, false); 2034 case TargetOpcode::G_FMINNUM: 2035 case TargetOpcode::G_FMAXNUM: 2036 case TargetOpcode::G_FMINNUM_IEEE: 2037 case TargetOpcode::G_FMAXNUM_IEEE: 2038 return legalizeMinNumMaxNum(Helper, MI); 2039 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 2040 return legalizeExtractVectorElt(MI, MRI, B); 2041 case TargetOpcode::G_INSERT_VECTOR_ELT: 2042 return legalizeInsertVectorElt(MI, MRI, B); 2043 case TargetOpcode::G_FSIN: 2044 case TargetOpcode::G_FCOS: 2045 return legalizeSinCos(MI, MRI, B); 2046 case TargetOpcode::G_GLOBAL_VALUE: 2047 return legalizeGlobalValue(MI, MRI, B); 2048 case 
TargetOpcode::G_LOAD:
  case TargetOpcode::G_SEXTLOAD:
  case TargetOpcode::G_ZEXTLOAD:
    return legalizeLoad(Helper, MI);
  case TargetOpcode::G_STORE:
    return legalizeStore(Helper, MI);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_FFREXP:
    return legalizeFFREXP(MI, MRI, B);
  case TargetOpcode::G_FSQRT:
    return legalizeFSQRT(MI, MRI, B);
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
  case TargetOpcode::G_UDIVREM:
    return legalizeUnsignedDIV_REM(MI, MRI, B);
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
  case TargetOpcode::G_SDIVREM:
    return legalizeSignedDIV_REM(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  case TargetOpcode::G_FLOG2:
    return legalizeFlog2(MI, B);
  case TargetOpcode::G_FLOG:
  case TargetOpcode::G_FLOG10:
    return legalizeFlogCommon(MI, B);
  case TargetOpcode::G_FEXP2:
    return legalizeFExp2(MI, B);
  case TargetOpcode::G_FEXP:
  case TargetOpcode::G_FEXP10:
    return legalizeFExp(MI, B);
  case TargetOpcode::G_FPOW:
    return legalizeFPow(MI, B);
  case TargetOpcode::G_FFLOOR:
    return legalizeFFloor(MI, MRI, B);
  case TargetOpcode::G_BUILD_VECTOR:
  case TargetOpcode::G_BUILD_VECTOR_TRUNC:
    return legalizeBuildVector(MI, MRI, B);
  case TargetOpcode::G_MUL:
    return legalizeMul(Helper, MI);
  case TargetOpcode::G_CTLZ:
  case TargetOpcode::G_CTTZ:
    return legalizeCTLZ_CTTZ(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
    return legalizeFPTruncRound(MI, B);
  case TargetOpcode::G_STACKSAVE:
    return legalizeStackSave(MI, B);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}

Register AMDGPULegalizerInfo::getSegmentAperture(
    unsigned AS,
    MachineRegisterInfo &MRI,
    MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (ST.hasApertureRegs()) {
    // Note: this register is somewhat broken. When used as a 32-bit operand,
    // it only returns zeroes. The real value is in the upper 32 bits.
    // Thus, we must extract the high 32 bits.
    const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
                                       ? AMDGPU::SRC_SHARED_BASE
                                       : AMDGPU::SRC_PRIVATE_BASE;
    // FIXME: It would be more natural to emit a COPY here, but then copy
    // coalescing would kick in and it would think it's okay to use the "HI"
    // subregister (instead of extracting the HI 32 bits) which is an artificial
    // (unusable) register.
    // Register TableGen definitions would need an overhaul to get rid of the
    // artificial "HI" aperture registers and prevent this kind of issue from
    // happening.
    Register Dst = MRI.createGenericVirtualRegister(S64);
    MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
    B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)});
    return B.buildUnmerge(S32, Dst).getReg(1);
  }

  // TODO: can we be smarter about machine pointer info?
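  // Without aperture registers the aperture base is instead loaded from
  // memory: from the implicit kernel arguments on code object v5 and newer,
  // or from the amd_queue_t reached through the queue pointer otherwise.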
2137 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 2138 Register LoadAddr = MRI.createGenericVirtualRegister( 2139 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 2140 // For code object version 5, private_base and shared_base are passed through 2141 // implicit kernargs. 2142 if (AMDGPU::getCodeObjectVersion(*MF.getFunction().getParent()) >= 2143 AMDGPU::AMDHSA_COV5) { 2144 AMDGPUTargetLowering::ImplicitParameter Param = 2145 AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE 2146 : AMDGPUTargetLowering::PRIVATE_BASE; 2147 uint64_t Offset = 2148 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param); 2149 2150 Register KernargPtrReg = MRI.createGenericVirtualRegister( 2151 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 2152 2153 if (!loadInputValue(KernargPtrReg, B, 2154 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 2155 return Register(); 2156 2157 MachineMemOperand *MMO = MF.getMachineMemOperand( 2158 PtrInfo, 2159 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2160 MachineMemOperand::MOInvariant, 2161 LLT::scalar(32), commonAlignment(Align(64), Offset)); 2162 2163 // Pointer address 2164 B.buildPtrAdd(LoadAddr, KernargPtrReg, 2165 B.buildConstant(LLT::scalar(64), Offset).getReg(0)); 2166 // Load address 2167 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 2168 } 2169 2170 Register QueuePtr = MRI.createGenericVirtualRegister( 2171 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 2172 2173 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 2174 return Register(); 2175 2176 // Offset into amd_queue_t for group_segment_aperture_base_hi / 2177 // private_segment_aperture_base_hi. 2178 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 2179 2180 MachineMemOperand *MMO = MF.getMachineMemOperand( 2181 PtrInfo, 2182 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2183 MachineMemOperand::MOInvariant, 2184 LLT::scalar(32), commonAlignment(Align(64), StructOffset)); 2185 2186 B.buildPtrAdd(LoadAddr, QueuePtr, 2187 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0)); 2188 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 2189 } 2190 2191 /// Return true if the value is a known valid address, such that a null check is 2192 /// not necessary. 2193 static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, 2194 const AMDGPUTargetMachine &TM, unsigned AddrSpace) { 2195 MachineInstr *Def = MRI.getVRegDef(Val); 2196 switch (Def->getOpcode()) { 2197 case AMDGPU::G_FRAME_INDEX: 2198 case AMDGPU::G_GLOBAL_VALUE: 2199 case AMDGPU::G_BLOCK_ADDR: 2200 return true; 2201 case AMDGPU::G_CONSTANT: { 2202 const ConstantInt *CI = Def->getOperand(1).getCImm(); 2203 return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace); 2204 } 2205 default: 2206 return false; 2207 } 2208 2209 return false; 2210 } 2211 2212 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 2213 MachineInstr &MI, MachineRegisterInfo &MRI, 2214 MachineIRBuilder &B) const { 2215 MachineFunction &MF = B.getMF(); 2216 2217 const LLT S32 = LLT::scalar(32); 2218 Register Dst = MI.getOperand(0).getReg(); 2219 Register Src = MI.getOperand(1).getReg(); 2220 2221 LLT DstTy = MRI.getType(Dst); 2222 LLT SrcTy = MRI.getType(Src); 2223 unsigned DestAS = DstTy.getAddressSpace(); 2224 unsigned SrcAS = SrcTy.getAddressSpace(); 2225 2226 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 2227 // vector element. 
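  // The cases handled below are: no-op casts (rewritten to G_BITCAST),
  // flat -> local/private (keep the low 32 bits, with a null check),
  // local/private -> flat (merge in the aperture base, with a null check),
  // and casts to and from the 32-bit constant address space. Anything else
  // is diagnosed as an invalid addrspacecast.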
2228 assert(!DstTy.isVector()); 2229 2230 const AMDGPUTargetMachine &TM 2231 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 2232 2233 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 2234 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 2235 return true; 2236 } 2237 2238 if (SrcAS == AMDGPUAS::FLAT_ADDRESS && 2239 (DestAS == AMDGPUAS::LOCAL_ADDRESS || 2240 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) { 2241 if (isKnownNonNull(Src, MRI, TM, SrcAS)) { 2242 // Extract low 32-bits of the pointer. 2243 B.buildExtract(Dst, Src, 0); 2244 MI.eraseFromParent(); 2245 return true; 2246 } 2247 2248 unsigned NullVal = TM.getNullPointerValue(DestAS); 2249 2250 auto SegmentNull = B.buildConstant(DstTy, NullVal); 2251 auto FlatNull = B.buildConstant(SrcTy, 0); 2252 2253 // Extract low 32-bits of the pointer. 2254 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 2255 2256 auto CmpRes = 2257 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 2258 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 2259 2260 MI.eraseFromParent(); 2261 return true; 2262 } 2263 2264 if (DestAS == AMDGPUAS::FLAT_ADDRESS && 2265 (SrcAS == AMDGPUAS::LOCAL_ADDRESS || 2266 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) { 2267 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 2268 if (!ApertureReg.isValid()) 2269 return false; 2270 2271 // Coerce the type of the low half of the result so we can use merge_values. 2272 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 2273 2274 // TODO: Should we allow mismatched types but matching sizes in merges to 2275 // avoid the ptrtoint? 2276 auto BuildPtr = B.buildMergeLikeInstr(DstTy, {SrcAsInt, ApertureReg}); 2277 2278 if (isKnownNonNull(Src, MRI, TM, SrcAS)) { 2279 B.buildCopy(Dst, BuildPtr); 2280 MI.eraseFromParent(); 2281 return true; 2282 } 2283 2284 auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 2285 auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 2286 2287 auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, 2288 SegmentNull.getReg(0)); 2289 2290 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 2291 2292 MI.eraseFromParent(); 2293 return true; 2294 } 2295 2296 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT && 2297 SrcTy.getSizeInBits() == 64) { 2298 // Truncate. 
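  // A 64-bit constant pointer is cast to the 32-bit constant address space by
  // keeping only its low 32 bits; the reverse cast below reattaches the known
  // high bits.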
2299 B.buildExtract(Dst, Src, 0); 2300 MI.eraseFromParent(); 2301 return true; 2302 } 2303 2304 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT && 2305 DstTy.getSizeInBits() == 64) { 2306 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 2307 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 2308 auto PtrLo = B.buildPtrToInt(S32, Src); 2309 auto HighAddr = B.buildConstant(S32, AddrHiVal); 2310 B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr}); 2311 MI.eraseFromParent(); 2312 return true; 2313 } 2314 2315 DiagnosticInfoUnsupported InvalidAddrSpaceCast( 2316 MF.getFunction(), "invalid addrspacecast", B.getDebugLoc()); 2317 2318 LLVMContext &Ctx = MF.getFunction().getContext(); 2319 Ctx.diagnose(InvalidAddrSpaceCast); 2320 B.buildUndef(Dst); 2321 MI.eraseFromParent(); 2322 return true; 2323 } 2324 2325 bool AMDGPULegalizerInfo::legalizeFroundeven(MachineInstr &MI, 2326 MachineRegisterInfo &MRI, 2327 MachineIRBuilder &B) const { 2328 Register Src = MI.getOperand(1).getReg(); 2329 LLT Ty = MRI.getType(Src); 2330 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 2331 2332 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 2333 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 2334 2335 auto C1 = B.buildFConstant(Ty, C1Val); 2336 auto CopySign = B.buildFCopysign(Ty, C1, Src); 2337 2338 // TODO: Should this propagate fast-math-flags? 2339 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 2340 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 2341 2342 auto C2 = B.buildFConstant(Ty, C2Val); 2343 auto Fabs = B.buildFAbs(Ty, Src); 2344 2345 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 2346 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 2347 MI.eraseFromParent(); 2348 return true; 2349 } 2350 2351 bool AMDGPULegalizerInfo::legalizeFceil( 2352 MachineInstr &MI, MachineRegisterInfo &MRI, 2353 MachineIRBuilder &B) const { 2354 2355 const LLT S1 = LLT::scalar(1); 2356 const LLT S64 = LLT::scalar(64); 2357 2358 Register Src = MI.getOperand(1).getReg(); 2359 assert(MRI.getType(Src) == S64); 2360 2361 // result = trunc(src) 2362 // if (src > 0.0 && src != result) 2363 // result += 1.0 2364 2365 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 2366 2367 const auto Zero = B.buildFConstant(S64, 0.0); 2368 const auto One = B.buildFConstant(S64, 1.0); 2369 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 2370 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 2371 auto And = B.buildAnd(S1, Lt0, NeTrunc); 2372 auto Add = B.buildSelect(S64, And, One, Zero); 2373 2374 // TODO: Should this propagate fast-math-flags? 
2375 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 2376 MI.eraseFromParent(); 2377 return true; 2378 } 2379 2380 bool AMDGPULegalizerInfo::legalizeFrem( 2381 MachineInstr &MI, MachineRegisterInfo &MRI, 2382 MachineIRBuilder &B) const { 2383 Register DstReg = MI.getOperand(0).getReg(); 2384 Register Src0Reg = MI.getOperand(1).getReg(); 2385 Register Src1Reg = MI.getOperand(2).getReg(); 2386 auto Flags = MI.getFlags(); 2387 LLT Ty = MRI.getType(DstReg); 2388 2389 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); 2390 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); 2391 auto Neg = B.buildFNeg(Ty, Trunc, Flags); 2392 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); 2393 MI.eraseFromParent(); 2394 return true; 2395 } 2396 2397 static MachineInstrBuilder extractF64Exponent(Register Hi, 2398 MachineIRBuilder &B) { 2399 const unsigned FractBits = 52; 2400 const unsigned ExpBits = 11; 2401 LLT S32 = LLT::scalar(32); 2402 2403 auto Const0 = B.buildConstant(S32, FractBits - 32); 2404 auto Const1 = B.buildConstant(S32, ExpBits); 2405 2406 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}) 2407 .addUse(Hi) 2408 .addUse(Const0.getReg(0)) 2409 .addUse(Const1.getReg(0)); 2410 2411 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 2412 } 2413 2414 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 2415 MachineInstr &MI, MachineRegisterInfo &MRI, 2416 MachineIRBuilder &B) const { 2417 const LLT S1 = LLT::scalar(1); 2418 const LLT S32 = LLT::scalar(32); 2419 const LLT S64 = LLT::scalar(64); 2420 2421 Register Src = MI.getOperand(1).getReg(); 2422 assert(MRI.getType(Src) == S64); 2423 2424 // TODO: Should this use extract since the low half is unused? 2425 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2426 Register Hi = Unmerge.getReg(1); 2427 2428 // Extract the upper half, since this is where we will find the sign and 2429 // exponent. 2430 auto Exp = extractF64Exponent(Hi, B); 2431 2432 const unsigned FractBits = 52; 2433 2434 // Extract the sign bit. 2435 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 2436 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 2437 2438 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 2439 2440 const auto Zero32 = B.buildConstant(S32, 0); 2441 2442 // Extend back to 64-bits. 2443 auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit}); 2444 2445 auto Shr = B.buildAShr(S64, FractMask, Exp); 2446 auto Not = B.buildNot(S64, Shr); 2447 auto Tmp0 = B.buildAnd(S64, Src, Not); 2448 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 2449 2450 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 2451 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 2452 2453 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 2454 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 2455 MI.eraseFromParent(); 2456 return true; 2457 } 2458 2459 bool AMDGPULegalizerInfo::legalizeITOFP( 2460 MachineInstr &MI, MachineRegisterInfo &MRI, 2461 MachineIRBuilder &B, bool Signed) const { 2462 2463 Register Dst = MI.getOperand(0).getReg(); 2464 Register Src = MI.getOperand(1).getReg(); 2465 2466 const LLT S64 = LLT::scalar(64); 2467 const LLT S32 = LLT::scalar(32); 2468 2469 assert(MRI.getType(Src) == S64); 2470 2471 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2472 auto ThirtyTwo = B.buildConstant(S32, 32); 2473 2474 if (MRI.getType(Dst) == S64) { 2475 auto CvtHi = Signed ? 
B.buildSITOFP(S64, Unmerge.getReg(1))
                        : B.buildUITOFP(S64, Unmerge.getReg(1));

    auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
    auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);

    // TODO: Should this propagate fast-math-flags?
    B.buildFAdd(Dst, LdExp, CvtLo);
    MI.eraseFromParent();
    return true;
  }

  assert(MRI.getType(Dst) == S32);

  auto One = B.buildConstant(S32, 1);

  MachineInstrBuilder ShAmt;
  if (Signed) {
    auto ThirtyOne = B.buildConstant(S32, 31);
    auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
    auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
    auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
    auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
                  .addUse(Unmerge.getReg(1));
    auto LS2 = B.buildSub(S32, LS, One);
    ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
  } else
    ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
  auto Norm = B.buildShl(S64, Src, ShAmt);
  auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
  auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
  auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
  auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
  auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
  B.buildFLdexp(Dst, FVal, Scale);
  MI.eraseFromParent();
  return true;
}

// TODO: Copied from DAG implementation. Verify logic and document how this
// actually works.
bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
                                        MachineRegisterInfo &MRI,
                                        MachineIRBuilder &B,
                                        bool Signed) const {

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  const LLT SrcLT = MRI.getType(Src);
  assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);

  unsigned Flags = MI.getFlags();

  // The basic idea of converting a floating point number into a pair of 32-bit
  // integers is illustrated as follows:
  //
  //   tf := trunc(val);
  //  hif := floor(tf * 2^-32);
  //  lof := tf - hif * 2^32; // lof is always positive due to floor.
  //   hi := fptoi(hif);
  //   lo := fptoi(lof);
  //
  auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
  MachineInstrBuilder Sign;
  if (Signed && SrcLT == S32) {
    // However, a 32-bit floating point number has only a 23-bit mantissa,
    // which is not enough to hold all the significant bits of `lof` if val is
    // negative. To avoid the loss of precision, we need to take the absolute
    // value after truncating and flip the result back based on the original
    // signedness.
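    // That is, the conversion below is performed on |trunc(val)|, and the
    // result is restored at the end as (r ^ sign) - sign, where sign is
    // either 0 or -1.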
    Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
    Trunc = B.buildFAbs(S32, Trunc, Flags);
  }
  MachineInstrBuilder K0, K1;
  if (SrcLT == S64) {
    K0 = B.buildFConstant(
        S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
    K1 = B.buildFConstant(
        S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
  } else {
    K0 = B.buildFConstant(
        S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
    K1 = B.buildFConstant(
        S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
  }

  auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
  auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
  auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);

  auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
                                     : B.buildFPTOUI(S32, FloorMul);
  auto Lo = B.buildFPTOUI(S32, Fma);

  if (Signed && SrcLT == S32) {
    // Flip the result based on the signedness, which is either all 0s or 1s.
    Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
    // r := xor({lo, hi}, sign) - sign;
    B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
               Sign);
  } else
    B.buildMergeLikeInstr(Dst, {Lo, Hi});
  MI.eraseFromParent();

  return true;
}

bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
                                               MachineInstr &MI) const {
  MachineFunction &MF = Helper.MIRBuilder.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
                        MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;

  // With ieee_mode disabled, the instructions already have the correct
  // behavior for G_FMINNUM/G_FMAXNUM.
  if (!MFI->getMode().IEEE)
    return !IsIEEEOp;

  if (IsIEEEOp)
    return true;

  return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
}

bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
    MachineInstr &MI, MachineRegisterInfo &MRI,
    MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Dst));

  // Other legalization maps vector<? x [type bigger than 64 bits]> via
  // bitcasts, but we can't go directly to that logic because you can't
  // bitcast a vector of pointers to a vector of integers. Therefore,
  // introduce an intermediate vector of integers using ptrtoint (and inttoptr
  // on the output) in order to drive the legalization forward.
  if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
    LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
    LLT IntVecTy = VecTy.changeElementType(IntTy);

    auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
    auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
    B.buildIntToPtr(Dst, IntElt);

    MI.eraseFromParent();
    return true;
  }

  // FIXME: Artifact combiner probably should have replaced the truncated
  // constant before this, so we shouldn't need
  // getIConstantVRegValWithLookThrough.
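  // For a constant in-bounds index this becomes an unmerge of the vector
  // followed by a copy of the selected piece, e.g. element 1 of <4 x s32> is
  // the second result of a G_UNMERGE_VALUES of the source vector.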
  std::optional<ValueAndVReg> MaybeIdxVal =
      getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
  if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
    return true;
  const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();

  if (IdxVal < VecTy.getNumElements()) {
    auto Unmerge = B.buildUnmerge(EltTy, Vec);
    B.buildCopy(Dst, Unmerge.getReg(IdxVal));
  } else {
    B.buildUndef(Dst);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
    MachineInstr &MI, MachineRegisterInfo &MRI,
    MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();
  Register Ins = MI.getOperand(2).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Ins));

  // Other legalization maps vector<? x [type bigger than 64 bits]> via
  // bitcasts, but we can't go directly to that logic because you can't
  // bitcast a vector of pointers to a vector of integers. Therefore, make the
  // pointer vector into an equivalent vector of integers with ptrtoint,
  // insert the ptrtoint'd new value, and then inttoptr the result vector
  // back. This will then allow the rest of legalization to take over.
  if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
    LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
    LLT IntVecTy = VecTy.changeElementType(IntTy);

    auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
    auto IntIns = B.buildPtrToInt(IntTy, Ins);
    auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
                                                 MI.getOperand(3));
    B.buildIntToPtr(Dst, IntVecDest);
    MI.eraseFromParent();
    return true;
  }

  // FIXME: Artifact combiner probably should have replaced the truncated
  // constant before this, so we shouldn't need
  // getIConstantVRegValWithLookThrough.
  std::optional<ValueAndVReg> MaybeIdxVal =
      getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
  if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2696 return true; 2697 2698 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue(); 2699 2700 unsigned NumElts = VecTy.getNumElements(); 2701 if (IdxVal < NumElts) { 2702 SmallVector<Register, 8> SrcRegs; 2703 for (unsigned i = 0; i < NumElts; ++i) 2704 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy)); 2705 B.buildUnmerge(SrcRegs, Vec); 2706 2707 SrcRegs[IdxVal] = MI.getOperand(2).getReg(); 2708 B.buildMergeLikeInstr(Dst, SrcRegs); 2709 } else { 2710 B.buildUndef(Dst); 2711 } 2712 2713 MI.eraseFromParent(); 2714 return true; 2715 } 2716 2717 bool AMDGPULegalizerInfo::legalizeSinCos( 2718 MachineInstr &MI, MachineRegisterInfo &MRI, 2719 MachineIRBuilder &B) const { 2720 2721 Register DstReg = MI.getOperand(0).getReg(); 2722 Register SrcReg = MI.getOperand(1).getReg(); 2723 LLT Ty = MRI.getType(DstReg); 2724 unsigned Flags = MI.getFlags(); 2725 2726 Register TrigVal; 2727 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2728 if (ST.hasTrigReducedRange()) { 2729 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2730 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}) 2731 .addUse(MulVal.getReg(0)) 2732 .setMIFlags(Flags) 2733 .getReg(0); 2734 } else 2735 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2736 2737 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 2738 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2739 B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg)) 2740 .addUse(TrigVal) 2741 .setMIFlags(Flags); 2742 MI.eraseFromParent(); 2743 return true; 2744 } 2745 2746 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2747 MachineIRBuilder &B, 2748 const GlobalValue *GV, 2749 int64_t Offset, 2750 unsigned GAFlags) const { 2751 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2752 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2753 // to the following code sequence: 2754 // 2755 // For constant address space: 2756 // s_getpc_b64 s[0:1] 2757 // s_add_u32 s0, s0, $symbol 2758 // s_addc_u32 s1, s1, 0 2759 // 2760 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2761 // a fixup or relocation is emitted to replace $symbol with a literal 2762 // constant, which is a pc-relative offset from the encoding of the $symbol 2763 // operand to the global variable. 2764 // 2765 // For global address space: 2766 // s_getpc_b64 s[0:1] 2767 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2768 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2769 // 2770 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2771 // fixups or relocations are emitted to replace $symbol@*@lo and 2772 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2773 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2774 // operand to the global variable. 2775 2776 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2777 2778 Register PCReg = PtrTy.getSizeInBits() != 32 ? 
DstReg : 2779 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2780 2781 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2782 .addDef(PCReg); 2783 2784 MIB.addGlobalAddress(GV, Offset, GAFlags); 2785 if (GAFlags == SIInstrInfo::MO_NONE) 2786 MIB.addImm(0); 2787 else 2788 MIB.addGlobalAddress(GV, Offset, GAFlags + 1); 2789 2790 if (!B.getMRI()->getRegClassOrNull(PCReg)) 2791 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2792 2793 if (PtrTy.getSizeInBits() == 32) 2794 B.buildExtract(DstReg, PCReg, 0); 2795 return true; 2796 } 2797 2798 // Emit a ABS32_LO / ABS32_HI relocation stub. 2799 void AMDGPULegalizerInfo::buildAbsGlobalAddress( 2800 Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, 2801 MachineRegisterInfo &MRI) const { 2802 bool RequiresHighHalf = PtrTy.getSizeInBits() != 32; 2803 2804 LLT S32 = LLT::scalar(32); 2805 2806 // Use the destination directly, if and only if we store the lower address 2807 // part only and we don't have a register class being set. 2808 Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg) 2809 ? DstReg 2810 : MRI.createGenericVirtualRegister(S32); 2811 2812 if (!MRI.getRegClassOrNull(AddrLo)) 2813 MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass); 2814 2815 // Write the lower half. 2816 B.buildInstr(AMDGPU::S_MOV_B32) 2817 .addDef(AddrLo) 2818 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO); 2819 2820 // If required, write the upper half as well. 2821 if (RequiresHighHalf) { 2822 assert(PtrTy.getSizeInBits() == 64 && 2823 "Must provide a 64-bit pointer type!"); 2824 2825 Register AddrHi = MRI.createGenericVirtualRegister(S32); 2826 MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass); 2827 2828 B.buildInstr(AMDGPU::S_MOV_B32) 2829 .addDef(AddrHi) 2830 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_HI); 2831 2832 // Use the destination directly, if and only if we don't have a register 2833 // class being set. 2834 Register AddrDst = !MRI.getRegClassOrNull(DstReg) 2835 ? DstReg 2836 : MRI.createGenericVirtualRegister(LLT::scalar(64)); 2837 2838 if (!MRI.getRegClassOrNull(AddrDst)) 2839 MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass); 2840 2841 B.buildMergeValues(AddrDst, {AddrLo, AddrHi}); 2842 2843 // If we created a new register for the destination, cast the result into 2844 // the final output. 2845 if (AddrDst != DstReg) 2846 B.buildCast(DstReg, AddrDst); 2847 } else if (AddrLo != DstReg) { 2848 // If we created a new register for the destination, cast the result into 2849 // the final output. 
    B.buildCast(DstReg, AddrLo);
  }
}

bool AMDGPULegalizerInfo::legalizeGlobalValue(
    MachineInstr &MI, MachineRegisterInfo &MRI,
    MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned AS = Ty.getAddressSpace();

  const GlobalValue *GV = MI.getOperand(1).getGlobal();
  MachineFunction &MF = B.getMF();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    if (!MFI->isModuleEntryFunction() &&
        !GV->getName().equals("llvm.amdgcn.module.lds")) {
      const Function &Fn = MF.getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
          Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
          DS_Warning);
      Fn.getContext().diagnose(BadLDSDecl);

      // We currently don't have a way to correctly allocate LDS objects that
      // aren't directly associated with a kernel. We do force inlining of
      // functions that use local objects. However, if these dead functions are
      // not eliminated, we don't want a compile time error. Just emit a warning
      // and a trap, since there should be no callable path here.
      B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>());
      B.buildUndef(DstReg);
      MI.eraseFromParent();
      return true;
    }

    // TODO: We could emit code to handle the initialization somewhere.
    // We ignore the initializer for now and legalize it to allow selection.
    // The initializer will anyway be diagnosed as an error during assembly
    // emission.
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->shouldUseLDSConstAddress(GV)) {
      MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
      return true; // Leave in place.
    }

    if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
      Type *Ty = GV->getValueType();
      // HIP uses an unsized array `extern __shared__ T s[]` or a similar
      // zero-sized type in other languages to declare dynamic shared memory
      // whose size is not known at compile time. Such arrays are allocated by
      // the runtime and placed directly after the statically allocated ones.
      // They all share the same offset.
      if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
        // Adjust alignment for that dynamic shared memory array.
2903 MFI->setDynLDSAlign(MF.getFunction(), *cast<GlobalVariable>(GV)); 2904 LLT S32 = LLT::scalar(32); 2905 auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}); 2906 B.buildIntToPtr(DstReg, Sz); 2907 MI.eraseFromParent(); 2908 return true; 2909 } 2910 } 2911 2912 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), 2913 *cast<GlobalVariable>(GV))); 2914 MI.eraseFromParent(); 2915 return true; 2916 } 2917 2918 if (ST.isAmdPalOS() || ST.isMesa3DOS()) { 2919 buildAbsGlobalAddress(DstReg, Ty, B, GV, MRI); 2920 MI.eraseFromParent(); 2921 return true; 2922 } 2923 2924 const SITargetLowering *TLI = ST.getTargetLowering(); 2925 2926 if (TLI->shouldEmitFixup(GV)) { 2927 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 2928 MI.eraseFromParent(); 2929 return true; 2930 } 2931 2932 if (TLI->shouldEmitPCReloc(GV)) { 2933 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 2934 MI.eraseFromParent(); 2935 return true; 2936 } 2937 2938 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2939 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 2940 2941 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty; 2942 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 2943 MachinePointerInfo::getGOT(MF), 2944 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2945 MachineMemOperand::MOInvariant, 2946 LoadTy, Align(8)); 2947 2948 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 2949 2950 if (Ty.getSizeInBits() == 32) { 2951 // Truncate if this is a 32-bit constant address. 2952 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2953 B.buildExtract(DstReg, Load, 0); 2954 } else 2955 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2956 2957 MI.eraseFromParent(); 2958 return true; 2959 } 2960 2961 static LLT widenToNextPowerOf2(LLT Ty) { 2962 if (Ty.isVector()) 2963 return Ty.changeElementCount( 2964 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements()))); 2965 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); 2966 } 2967 2968 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, 2969 MachineInstr &MI) const { 2970 MachineIRBuilder &B = Helper.MIRBuilder; 2971 MachineRegisterInfo &MRI = *B.getMRI(); 2972 GISelChangeObserver &Observer = Helper.Observer; 2973 2974 Register PtrReg = MI.getOperand(1).getReg(); 2975 LLT PtrTy = MRI.getType(PtrReg); 2976 unsigned AddrSpace = PtrTy.getAddressSpace(); 2977 2978 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 2979 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2980 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); 2981 Observer.changingInstr(MI); 2982 MI.getOperand(1).setReg(Cast.getReg(0)); 2983 Observer.changedInstr(MI); 2984 return true; 2985 } 2986 2987 if (MI.getOpcode() != AMDGPU::G_LOAD) 2988 return false; 2989 2990 Register ValReg = MI.getOperand(0).getReg(); 2991 LLT ValTy = MRI.getType(ValReg); 2992 2993 if (hasBufferRsrcWorkaround(ValTy)) { 2994 Observer.changingInstr(MI); 2995 castBufferRsrcFromV4I32(MI, B, MRI, 0); 2996 Observer.changedInstr(MI); 2997 return true; 2998 } 2999 3000 MachineMemOperand *MMO = *MI.memoperands_begin(); 3001 const unsigned ValSize = ValTy.getSizeInBits(); 3002 const LLT MemTy = MMO->getMemoryType(); 3003 const Align MemAlign = MMO->getAlign(); 3004 const unsigned MemSize = MemTy.getSizeInBits(); 3005 const uint64_t AlignInBits = 8 * MemAlign.value(); 3006 3007 // Widen non-power-of-2 loads to the alignment if needed 3008 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) { 3009 const 
unsigned WideMemSize = PowerOf2Ceil(MemSize); 3010 3011 // This was already the correct extending load result type, so just adjust 3012 // the memory type. 3013 if (WideMemSize == ValSize) { 3014 MachineFunction &MF = B.getMF(); 3015 3016 MachineMemOperand *WideMMO = 3017 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); 3018 Observer.changingInstr(MI); 3019 MI.setMemRefs(MF, {WideMMO}); 3020 Observer.changedInstr(MI); 3021 return true; 3022 } 3023 3024 // Don't bother handling edge case that should probably never be produced. 3025 if (ValSize > WideMemSize) 3026 return false; 3027 3028 LLT WideTy = widenToNextPowerOf2(ValTy); 3029 3030 Register WideLoad; 3031 if (!WideTy.isVector()) { 3032 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 3033 B.buildTrunc(ValReg, WideLoad).getReg(0); 3034 } else { 3035 // Extract the subvector. 3036 3037 if (isRegisterType(ValTy)) { 3038 // If this a case where G_EXTRACT is legal, use it. 3039 // (e.g. <3 x s32> -> <4 x s32>) 3040 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 3041 B.buildExtract(ValReg, WideLoad, 0); 3042 } else { 3043 // For cases where the widened type isn't a nice register value, unmerge 3044 // from a widened register (e.g. <3 x s16> -> <4 x s16>) 3045 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 3046 B.buildDeleteTrailingVectorElements(ValReg, WideLoad); 3047 } 3048 } 3049 3050 MI.eraseFromParent(); 3051 return true; 3052 } 3053 3054 return false; 3055 } 3056 3057 bool AMDGPULegalizerInfo::legalizeStore(LegalizerHelper &Helper, 3058 MachineInstr &MI) const { 3059 MachineIRBuilder &B = Helper.MIRBuilder; 3060 MachineRegisterInfo &MRI = *B.getMRI(); 3061 GISelChangeObserver &Observer = Helper.Observer; 3062 3063 Register DataReg = MI.getOperand(0).getReg(); 3064 LLT DataTy = MRI.getType(DataReg); 3065 3066 if (hasBufferRsrcWorkaround(DataTy)) { 3067 Observer.changingInstr(MI); 3068 castBufferRsrcArgToV4I32(MI, B, 0); 3069 Observer.changedInstr(MI); 3070 return true; 3071 } 3072 return false; 3073 } 3074 3075 bool AMDGPULegalizerInfo::legalizeFMad( 3076 MachineInstr &MI, MachineRegisterInfo &MRI, 3077 MachineIRBuilder &B) const { 3078 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3079 assert(Ty.isScalar()); 3080 3081 MachineFunction &MF = B.getMF(); 3082 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 3083 3084 // TODO: Always legal with future ftz flag. 3085 // FIXME: Do we need just output? 
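  // G_FMAD is only kept legal when denormals for the result type are flushed
  // (preserve-sign mode); otherwise it is expanded by
  // LegalizerHelper::lowerFMad.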
3086 if (Ty == LLT::float32() && 3087 MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign()) 3088 return true; 3089 if (Ty == LLT::float16() && 3090 MFI->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign()) 3091 return true; 3092 3093 MachineIRBuilder HelperBuilder(MI); 3094 GISelObserverWrapper DummyObserver; 3095 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 3096 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 3097 } 3098 3099 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 3100 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 3101 Register DstReg = MI.getOperand(0).getReg(); 3102 Register PtrReg = MI.getOperand(1).getReg(); 3103 Register CmpVal = MI.getOperand(2).getReg(); 3104 Register NewVal = MI.getOperand(3).getReg(); 3105 3106 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 3107 "this should not have been custom lowered"); 3108 3109 LLT ValTy = MRI.getType(CmpVal); 3110 LLT VecTy = LLT::fixed_vector(2, ValTy); 3111 3112 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 3113 3114 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 3115 .addDef(DstReg) 3116 .addUse(PtrReg) 3117 .addUse(PackedVal) 3118 .setMemRefs(MI.memoperands()); 3119 3120 MI.eraseFromParent(); 3121 return true; 3122 } 3123 3124 /// Return true if it's known that \p Src can never be an f32 denormal value. 3125 static bool valueIsKnownNeverF32Denorm(const MachineRegisterInfo &MRI, 3126 Register Src) { 3127 const MachineInstr *DefMI = MRI.getVRegDef(Src); 3128 switch (DefMI->getOpcode()) { 3129 case TargetOpcode::G_INTRINSIC: { 3130 switch (cast<GIntrinsic>(DefMI)->getIntrinsicID()) { 3131 case Intrinsic::amdgcn_frexp_mant: 3132 return true; 3133 default: 3134 break; 3135 } 3136 3137 break; 3138 } 3139 case TargetOpcode::G_FFREXP: { 3140 if (DefMI->getOperand(0).getReg() == Src) 3141 return true; 3142 break; 3143 } 3144 case TargetOpcode::G_FPEXT: { 3145 return MRI.getType(DefMI->getOperand(1).getReg()) == LLT::scalar(16); 3146 } 3147 default: 3148 return false; 3149 } 3150 3151 return false; 3152 } 3153 3154 static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) { 3155 if (Flags & MachineInstr::FmAfn) 3156 return true; 3157 const auto &Options = MF.getTarget().Options; 3158 return Options.UnsafeFPMath || Options.ApproxFuncFPMath; 3159 } 3160 3161 static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src, 3162 unsigned Flags) { 3163 return !valueIsKnownNeverF32Denorm(MF.getRegInfo(), Src) && 3164 MF.getDenormalMode(APFloat::IEEEsingle()).Input != 3165 DenormalMode::PreserveSign; 3166 } 3167 3168 std::pair<Register, Register> 3169 AMDGPULegalizerInfo::getScaledLogInput(MachineIRBuilder &B, Register Src, 3170 unsigned Flags) const { 3171 if (!needsDenormHandlingF32(B.getMF(), Src, Flags)) 3172 return {}; 3173 3174 const LLT F32 = LLT::scalar(32); 3175 auto SmallestNormal = B.buildFConstant( 3176 F32, APFloat::getSmallestNormalized(APFloat::IEEEsingle())); 3177 auto IsLtSmallestNormal = 3178 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, SmallestNormal); 3179 3180 auto Scale32 = B.buildFConstant(F32, 0x1.0p+32); 3181 auto One = B.buildFConstant(F32, 1.0); 3182 auto ScaleFactor = 3183 B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags); 3184 auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags); 3185 3186 return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)}; 3187 } 3188 3189 bool AMDGPULegalizerInfo::legalizeFlog2(MachineInstr &MI, 3190 
MachineIRBuilder &B) const { 3191 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals. 3192 // If we have to handle denormals, scale up the input and adjust the result. 3193 3194 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0) 3195 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0) 3196 3197 Register Dst = MI.getOperand(0).getReg(); 3198 Register Src = MI.getOperand(1).getReg(); 3199 LLT Ty = B.getMRI()->getType(Dst); 3200 unsigned Flags = MI.getFlags(); 3201 3202 if (Ty == LLT::scalar(16)) { 3203 const LLT F32 = LLT::scalar(32); 3204 // Nothing in half is a denormal when promoted to f32. 3205 auto Ext = B.buildFPExt(F32, Src, Flags); 3206 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32}) 3207 .addUse(Ext.getReg(0)) 3208 .setMIFlags(Flags); 3209 B.buildFPTrunc(Dst, Log2, Flags); 3210 MI.eraseFromParent(); 3211 return true; 3212 } 3213 3214 assert(Ty == LLT::scalar(32)); 3215 3216 auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags); 3217 if (!ScaledInput) { 3218 B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)}) 3219 .addUse(Src) 3220 .setMIFlags(Flags); 3221 MI.eraseFromParent(); 3222 return true; 3223 } 3224 3225 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}) 3226 .addUse(ScaledInput) 3227 .setMIFlags(Flags); 3228 3229 auto ThirtyTwo = B.buildFConstant(Ty, 32.0); 3230 auto Zero = B.buildFConstant(Ty, 0.0); 3231 auto ResultOffset = 3232 B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags); 3233 B.buildFSub(Dst, Log2, ResultOffset, Flags); 3234 3235 MI.eraseFromParent(); 3236 return true; 3237 } 3238 3239 static Register getMad(MachineIRBuilder &B, LLT Ty, Register X, Register Y, 3240 Register Z, unsigned Flags) { 3241 auto FMul = B.buildFMul(Ty, X, Y, Flags); 3242 return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0); 3243 } 3244 3245 bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI, 3246 MachineIRBuilder &B) const { 3247 const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10; 3248 assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG); 3249 3250 MachineRegisterInfo &MRI = *B.getMRI(); 3251 Register Dst = MI.getOperand(0).getReg(); 3252 Register X = MI.getOperand(1).getReg(); 3253 unsigned Flags = MI.getFlags(); 3254 const LLT Ty = MRI.getType(X); 3255 MachineFunction &MF = B.getMF(); 3256 3257 const LLT F32 = LLT::scalar(32); 3258 const LLT F16 = LLT::scalar(16); 3259 3260 const AMDGPUTargetMachine &TM = 3261 static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 3262 3263 if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn) || 3264 TM.Options.ApproxFuncFPMath || TM.Options.UnsafeFPMath) { 3265 if (Ty == F16 && !ST.has16BitInsts()) { 3266 Register LogVal = MRI.createGenericVirtualRegister(F32); 3267 auto PromoteSrc = B.buildFPExt(F32, X); 3268 legalizeFlogUnsafe(B, LogVal, PromoteSrc.getReg(0), IsLog10, Flags); 3269 B.buildFPTrunc(Dst, LogVal); 3270 } else { 3271 legalizeFlogUnsafe(B, Dst, X, IsLog10, Flags); 3272 } 3273 3274 MI.eraseFromParent(); 3275 return true; 3276 } 3277 3278 auto [ScaledInput, IsScaled] = getScaledLogInput(B, X, Flags); 3279 if (ScaledInput) 3280 X = ScaledInput; 3281 3282 auto Y = 3283 B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags); 3284 3285 Register R; 3286 if (ST.hasFastFMAF32()) { 3287 // c+cc are ln(2)/ln(10) to more than 49 bits 3288 const float c_log10 = 0x1.344134p-2f; 3289 const float cc_log10 = 0x1.09f79ep-26f; 3290 3291 // c + cc is ln(2) to more than 49 bits 3292 const float c_log = 0x1.62e42ep-1f; 3293 const float cc_log = 
0x1.efa39ep-25f; 3294 3295 auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log); 3296 auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log); 3297 3298 R = B.buildFMul(Ty, Y, C, Flags).getReg(0); 3299 auto NegR = B.buildFNeg(Ty, R, Flags); 3300 auto FMA0 = B.buildFMA(Ty, Y, C, NegR, Flags); 3301 auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, Flags); 3302 R = B.buildFAdd(Ty, R, FMA1, Flags).getReg(0); 3303 } else { 3304 // ch+ct is ln(2)/ln(10) to more than 36 bits 3305 const float ch_log10 = 0x1.344000p-2f; 3306 const float ct_log10 = 0x1.3509f6p-18f; 3307 3308 // ch + ct is ln(2) to more than 36 bits 3309 const float ch_log = 0x1.62e000p-1f; 3310 const float ct_log = 0x1.0bfbe8p-15f; 3311 3312 auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log); 3313 auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log); 3314 3315 auto MaskConst = B.buildConstant(Ty, 0xfffff000); 3316 auto YH = B.buildAnd(Ty, Y, MaskConst); 3317 auto YT = B.buildFSub(Ty, Y, YH, Flags); 3318 auto YTCT = B.buildFMul(Ty, YT, CT, Flags); 3319 3320 Register Mad0 = 3321 getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), Flags); 3322 Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, Flags); 3323 R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, Flags); 3324 } 3325 3326 const bool IsFiniteOnly = 3327 (MI.getFlag(MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath) && 3328 (MI.getFlag(MachineInstr::FmNoInfs) || TM.Options.NoInfsFPMath); 3329 3330 if (!IsFiniteOnly) { 3331 // Expand isfinite(x) => fabs(x) < inf 3332 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle())); 3333 auto Fabs = B.buildFAbs(Ty, Y); 3334 auto IsFinite = 3335 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags); 3336 R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0); 3337 } 3338 3339 if (ScaledInput) { 3340 auto Zero = B.buildFConstant(Ty, 0.0); 3341 auto ShiftK = 3342 B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f); 3343 auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags); 3344 B.buildFSub(Dst, R, Shift, Flags); 3345 } else { 3346 B.buildCopy(Dst, R); 3347 } 3348 3349 MI.eraseFromParent(); 3350 return true; 3351 } 3352 3353 bool AMDGPULegalizerInfo::legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst, 3354 Register Src, bool IsLog10, 3355 unsigned Flags) const { 3356 const double Log2BaseInverted = 3357 IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2; 3358 3359 LLT Ty = B.getMRI()->getType(Dst); 3360 3361 if (Ty == LLT::scalar(32)) { 3362 auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags); 3363 if (ScaledInput) { 3364 auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}) 3365 .addUse(Src) 3366 .setMIFlags(Flags); 3367 auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted); 3368 auto Zero = B.buildFConstant(Ty, 0.0); 3369 auto ResultOffset = 3370 B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags); 3371 auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted); 3372 3373 if (ST.hasFastFMAF32()) 3374 B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags); 3375 else { 3376 auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags); 3377 B.buildFAdd(Dst, Mul, ResultOffset, Flags); 3378 } 3379 3380 return true; 3381 } 3382 } 3383 3384 auto Log2Operand = Ty == LLT::scalar(16) 3385 ? 
B.buildFLog2(Ty, Src, Flags) 3386 : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}) 3387 .addUse(Src) 3388 .setMIFlags(Flags); 3389 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 3390 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 3391 return true; 3392 } 3393 3394 bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI, 3395 MachineIRBuilder &B) const { 3396 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals. 3397 // If we have to handle denormals, scale up the input and adjust the result. 3398 3399 Register Dst = MI.getOperand(0).getReg(); 3400 Register Src = MI.getOperand(1).getReg(); 3401 unsigned Flags = MI.getFlags(); 3402 LLT Ty = B.getMRI()->getType(Dst); 3403 const LLT F16 = LLT::scalar(16); 3404 const LLT F32 = LLT::scalar(32); 3405 3406 if (Ty == F16) { 3407 // Nothing in half is a denormal when promoted to f32. 3408 auto Ext = B.buildFPExt(F32, Src, Flags); 3409 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32}) 3410 .addUse(Ext.getReg(0)) 3411 .setMIFlags(Flags); 3412 B.buildFPTrunc(Dst, Log2, Flags); 3413 MI.eraseFromParent(); 3414 return true; 3415 } 3416 3417 assert(Ty == F32); 3418 3419 if (!needsDenormHandlingF32(B.getMF(), Src, Flags)) { 3420 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst}) 3421 .addUse(Src) 3422 .setMIFlags(Flags); 3423 MI.eraseFromParent(); 3424 return true; 3425 } 3426 3427 // bool needs_scaling = x < -0x1.f80000p+6f; 3428 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f); 3429 3430 // -nextafter(128.0, -1) 3431 auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f); 3432 auto NeedsScaling = B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, 3433 RangeCheckConst, Flags); 3434 3435 auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f); 3436 auto Zero = B.buildFConstant(Ty, 0.0); 3437 auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags); 3438 auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags); 3439 3440 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty}) 3441 .addUse(AddInput.getReg(0)) 3442 .setMIFlags(Flags); 3443 3444 auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f); 3445 auto One = B.buildFConstant(Ty, 1.0); 3446 auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags); 3447 B.buildFMul(Dst, Exp2, ResultScale, Flags); 3448 MI.eraseFromParent(); 3449 return true; 3450 } 3451 3452 bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst, 3453 Register X, unsigned Flags) const { 3454 LLT Ty = B.getMRI()->getType(Dst); 3455 LLT F32 = LLT::scalar(32); 3456 3457 if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) { 3458 auto Log2E = B.buildFConstant(Ty, numbers::log2e); 3459 auto Mul = B.buildFMul(Ty, X, Log2E, Flags); 3460 3461 if (Ty == F32) { 3462 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst}) 3463 .addUse(Mul.getReg(0)) 3464 .setMIFlags(Flags); 3465 } else { 3466 B.buildFExp2(Dst, Mul.getReg(0), Flags); 3467 } 3468 3469 return true; 3470 } 3471 3472 auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f); 3473 auto NeedsScaling = 3474 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold, Flags); 3475 auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f); 3476 auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags); 3477 auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags); 3478 3479 auto Log2E = B.buildFConstant(Ty, numbers::log2e); 3480 auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags); 3481 3482 auto Exp2 = 
B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty}) 3483 .addUse(ExpInput.getReg(0)) 3484 .setMIFlags(Flags); 3485 3486 auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f); 3487 auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags); 3488 B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags); 3489 return true; 3490 } 3491 3492 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 3493 MachineIRBuilder &B) const { 3494 Register Dst = MI.getOperand(0).getReg(); 3495 Register X = MI.getOperand(1).getReg(); 3496 const unsigned Flags = MI.getFlags(); 3497 MachineFunction &MF = B.getMF(); 3498 MachineRegisterInfo &MRI = *B.getMRI(); 3499 LLT Ty = MRI.getType(Dst); 3500 const LLT F16 = LLT::scalar(16); 3501 const LLT F32 = LLT::scalar(32); 3502 const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10; 3503 3504 if (Ty == F16) { 3505 // v_exp_f16 (fmul x, log2e) 3506 if (allowApproxFunc(MF, Flags)) { 3507 // TODO: Does this really require fast? 3508 legalizeFExpUnsafe(B, Dst, X, Flags); 3509 MI.eraseFromParent(); 3510 return true; 3511 } 3512 3513 // exp(f16 x) -> 3514 // fptrunc (v_exp_f32 (fmul (fpext x), log2e)) 3515 3516 // Nothing in half is a denormal when promoted to f32. 3517 auto Ext = B.buildFPExt(F32, X, Flags); 3518 Register Lowered = MRI.createGenericVirtualRegister(F32); 3519 legalizeFExpUnsafe(B, Lowered, Ext.getReg(0), Flags); 3520 B.buildFPTrunc(Dst, Lowered, Flags); 3521 MI.eraseFromParent(); 3522 return true; 3523 } 3524 3525 assert(Ty == F32); 3526 3527 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying 3528 // library behavior. Also, is known-not-daz source sufficient? 3529 if (allowApproxFunc(MF, Flags)) { 3530 legalizeFExpUnsafe(B, Dst, X, Flags); 3531 MI.eraseFromParent(); 3532 return true; 3533 } 3534 3535 // Algorithm: 3536 // 3537 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64) 3538 // 3539 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer 3540 // n = 64*m + j, 0 <= j < 64 3541 // 3542 // e^x = 2^((64*m + j + f)/64) 3543 // = (2^m) * (2^(j/64)) * 2^(f/64) 3544 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64)) 3545 // 3546 // f = x*(64/ln(2)) - n 3547 // r = f*(ln(2)/64) = x - n*(ln(2)/64) 3548 // 3549 // e^x = (2^m) * (2^(j/64)) * e^r 3550 // 3551 // (2^(j/64)) is precomputed 3552 // 3553 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5! 3554 // e^r = 1 + q 3555 // 3556 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5! 3557 // 3558 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) ) 3559 const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract; 3560 Register PH, PL; 3561 3562 if (ST.hasFastFMAF32()) { 3563 const float c_exp = numbers::log2ef; 3564 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits 3565 const float c_exp10 = 0x1.a934f0p+1f; 3566 const float cc_exp10 = 0x1.2f346ep-24f; 3567 3568 auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp); 3569 PH = B.buildFMul(Ty, X, C, Flags).getReg(0); 3570 auto NegPH = B.buildFNeg(Ty, PH, Flags); 3571 auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags); 3572 3573 auto CC = B.buildFConstant(Ty, IsExp10 ? 
cc_exp10 : cc_exp); 3574 PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0); 3575 } else { 3576 const float ch_exp = 0x1.714000p+0f; 3577 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits 3578 3579 const float ch_exp10 = 0x1.a92000p+1f; 3580 const float cl_exp10 = 0x1.4f0978p-11f; 3581 3582 auto MaskConst = B.buildConstant(Ty, 0xfffff000); 3583 auto XH = B.buildAnd(Ty, X, MaskConst); 3584 auto XL = B.buildFSub(Ty, X, XH, Flags); 3585 3586 auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp); 3587 PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0); 3588 3589 auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp); 3590 auto XLCL = B.buildFMul(Ty, XL, CL, Flags); 3591 3592 Register Mad0 = 3593 getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags); 3594 PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags); 3595 } 3596 3597 auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags); 3598 3599 // It is unsafe to contract this fsub into the PH multiply. 3600 auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract); 3601 auto A = B.buildFAdd(Ty, PHSubE, PL, Flags); 3602 auto IntE = B.buildFPTOSI(LLT::scalar(32), E); 3603 3604 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty}) 3605 .addUse(A.getReg(0)) 3606 .setMIFlags(Flags); 3607 auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags); 3608 3609 auto UnderflowCheckConst = 3610 B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f); 3611 auto Zero = B.buildFConstant(Ty, 0.0); 3612 auto Underflow = 3613 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, UnderflowCheckConst); 3614 3615 R = B.buildSelect(Ty, Underflow, Zero, R); 3616 3617 const auto &Options = MF.getTarget().Options; 3618 3619 if (!(Flags & MachineInstr::FmNoInfs) && !Options.NoInfsFPMath) { 3620 auto OverflowCheckConst = 3621 B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f); 3622 3623 auto Overflow = 3624 B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), X, OverflowCheckConst); 3625 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle())); 3626 R = B.buildSelect(Ty, Overflow, Inf, R, Flags); 3627 } 3628 3629 B.buildCopy(Dst, R); 3630 MI.eraseFromParent(); 3631 return true; 3632 } 3633 3634 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 3635 MachineIRBuilder &B) const { 3636 Register Dst = MI.getOperand(0).getReg(); 3637 Register Src0 = MI.getOperand(1).getReg(); 3638 Register Src1 = MI.getOperand(2).getReg(); 3639 unsigned Flags = MI.getFlags(); 3640 LLT Ty = B.getMRI()->getType(Dst); 3641 const LLT F16 = LLT::float16(); 3642 const LLT F32 = LLT::float32(); 3643 3644 if (Ty == F32) { 3645 auto Log = B.buildFLog2(F32, Src0, Flags); 3646 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32}) 3647 .addUse(Log.getReg(0)) 3648 .addUse(Src1) 3649 .setMIFlags(Flags); 3650 B.buildFExp2(Dst, Mul, Flags); 3651 } else if (Ty == F16) { 3652 // There's no f16 fmul_legacy, so we need to convert for it. 3653 auto Log = B.buildFLog2(F16, Src0, Flags); 3654 auto Ext0 = B.buildFPExt(F32, Log, Flags); 3655 auto Ext1 = B.buildFPExt(F32, Src1, Flags); 3656 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32}) 3657 .addUse(Ext0.getReg(0)) 3658 .addUse(Ext1.getReg(0)) 3659 .setMIFlags(Flags); 3660 B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags); 3661 } else 3662 return false; 3663 3664 MI.eraseFromParent(); 3665 return true; 3666 } 3667 3668 // Find a source register, ignoring any possible source modifiers. 
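// The walk looks through at most one G_FNEG and one G_FABS: given
// %a = G_FABS %x and %m = G_FNEG %a, passing %m returns %x; a lone G_FNEG or
// G_FABS is looked through the same way.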
3669 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 3670 Register ModSrc = OrigSrc; 3671 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 3672 ModSrc = SrcFNeg->getOperand(1).getReg(); 3673 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 3674 ModSrc = SrcFAbs->getOperand(1).getReg(); 3675 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 3676 ModSrc = SrcFAbs->getOperand(1).getReg(); 3677 return ModSrc; 3678 } 3679 3680 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 3681 MachineRegisterInfo &MRI, 3682 MachineIRBuilder &B) const { 3683 3684 const LLT S1 = LLT::scalar(1); 3685 const LLT F64 = LLT::float64(); 3686 Register Dst = MI.getOperand(0).getReg(); 3687 Register OrigSrc = MI.getOperand(1).getReg(); 3688 unsigned Flags = MI.getFlags(); 3689 assert(ST.hasFractBug() && MRI.getType(Dst) == F64 && 3690 "this should not have been custom lowered"); 3691 3692 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 3693 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 3694 // efficient way to implement it is using V_FRACT_F64. The workaround for the 3695 // V_FRACT bug is: 3696 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 3697 // 3698 // Convert floor(x) to (x - fract(x)) 3699 3700 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64}) 3701 .addUse(OrigSrc) 3702 .setMIFlags(Flags); 3703 3704 // Give source modifier matching some assistance before obscuring a foldable 3705 // pattern. 3706 3707 // TODO: We can avoid the neg on the fract? The input sign to fract 3708 // shouldn't matter? 3709 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 3710 3711 auto Const = 3712 B.buildFConstant(F64, llvm::bit_cast<double>(0x3fefffffffffffff)); 3713 3714 Register Min = MRI.createGenericVirtualRegister(F64); 3715 3716 // We don't need to concern ourselves with the snan handling difference, so 3717 // use the one which will directly select. 3718 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3719 if (MFI->getMode().IEEE) 3720 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 3721 else 3722 B.buildFMinNum(Min, Fract, Const, Flags); 3723 3724 Register CorrectedFract = Min; 3725 if (!MI.getFlag(MachineInstr::FmNoNans)) { 3726 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 3727 CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0); 3728 } 3729 3730 auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags); 3731 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 3732 3733 MI.eraseFromParent(); 3734 return true; 3735 } 3736 3737 // Turn an illegal packed v2s16 build vector into bit operations. 3738 // TODO: This should probably be a bitcast action in LegalizerHelper. 
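// The lowering below merges the two s16 sources into an s32 and bitcasts the
// result to <2 x s16>; for G_BUILD_VECTOR_TRUNC the s32 sources are first
// truncated to s16.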
3739 bool AMDGPULegalizerInfo::legalizeBuildVector( 3740 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 3741 Register Dst = MI.getOperand(0).getReg(); 3742 const LLT S32 = LLT::scalar(32); 3743 const LLT S16 = LLT::scalar(16); 3744 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16)); 3745 3746 Register Src0 = MI.getOperand(1).getReg(); 3747 Register Src1 = MI.getOperand(2).getReg(); 3748 3749 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) { 3750 assert(MRI.getType(Src0) == S32); 3751 Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0); 3752 Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0); 3753 } 3754 3755 auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1}); 3756 B.buildBitcast(Dst, Merge); 3757 3758 MI.eraseFromParent(); 3759 return true; 3760 } 3761 3762 // Build a big integer multiply or multiply-add using MAD_64_32 instructions. 3763 // 3764 // Source and accumulation registers must all be 32-bits. 3765 // 3766 // TODO: When the multiply is uniform, we should produce a code sequence 3767 // that is better suited to instruction selection on the SALU. Instead of 3768 // the outer loop going over parts of the result, the outer loop should go 3769 // over parts of one of the factors. This should result in instruction 3770 // selection that makes full use of S_ADDC_U32 instructions. 3771 void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper, 3772 MutableArrayRef<Register> Accum, 3773 ArrayRef<Register> Src0, 3774 ArrayRef<Register> Src1, 3775 bool UsePartialMad64_32, 3776 bool SeparateOddAlignedProducts) const { 3777 // Use (possibly empty) vectors of S1 registers to represent the set of 3778 // carries from one pair of positions to the next. 3779 using Carry = SmallVector<Register, 2>; 3780 3781 MachineIRBuilder &B = Helper.MIRBuilder; 3782 GISelKnownBits &KB = *Helper.getKnownBits(); 3783 3784 const LLT S1 = LLT::scalar(1); 3785 const LLT S32 = LLT::scalar(32); 3786 const LLT S64 = LLT::scalar(64); 3787 3788 Register Zero32; 3789 Register Zero64; 3790 3791 auto getZero32 = [&]() -> Register { 3792 if (!Zero32) 3793 Zero32 = B.buildConstant(S32, 0).getReg(0); 3794 return Zero32; 3795 }; 3796 auto getZero64 = [&]() -> Register { 3797 if (!Zero64) 3798 Zero64 = B.buildConstant(S64, 0).getReg(0); 3799 return Zero64; 3800 }; 3801 3802 SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros; 3803 for (unsigned i = 0; i < Src0.size(); ++i) { 3804 Src0KnownZeros.push_back(KB.getKnownBits(Src0[i]).isZero()); 3805 Src1KnownZeros.push_back(KB.getKnownBits(Src1[i]).isZero()); 3806 } 3807 3808 // Merge the given carries into the 32-bit LocalAccum, which is modified 3809 // in-place. 3810 // 3811 // Returns the carry-out, which is a single S1 register or null. 
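// Carries are zero-extended to 32 bits and folded in with G_UADDE. If
// LocalAccum starts out null, it is seeded with a zero-extended carry (or
// zero) and no carry-out is reported.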
3812 auto mergeCarry = 3813 [&](Register &LocalAccum, const Carry &CarryIn) -> Register { 3814 if (CarryIn.empty()) 3815 return Register(); 3816 3817 bool HaveCarryOut = true; 3818 Register CarryAccum; 3819 if (CarryIn.size() == 1) { 3820 if (!LocalAccum) { 3821 LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0); 3822 return Register(); 3823 } 3824 3825 CarryAccum = getZero32(); 3826 } else { 3827 CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0); 3828 for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) { 3829 CarryAccum = 3830 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i]) 3831 .getReg(0); 3832 } 3833 3834 if (!LocalAccum) { 3835 LocalAccum = getZero32(); 3836 HaveCarryOut = false; 3837 } 3838 } 3839 3840 auto Add = 3841 B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back()); 3842 LocalAccum = Add.getReg(0); 3843 return HaveCarryOut ? Add.getReg(1) : Register(); 3844 }; 3845 3846 // Build a multiply-add chain to compute 3847 // 3848 // LocalAccum + (partial products at DstIndex) 3849 // + (opportunistic subset of CarryIn) 3850 // 3851 // LocalAccum is an array of one or two 32-bit registers that are updated 3852 // in-place. The incoming registers may be null. 3853 // 3854 // In some edge cases, carry-ins can be consumed "for free". In that case, 3855 // the consumed carry bits are removed from CarryIn in-place. 3856 auto buildMadChain = 3857 [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn) 3858 -> Carry { 3859 assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) || 3860 (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1)); 3861 3862 Carry CarryOut; 3863 unsigned j0 = 0; 3864 3865 // Use plain 32-bit multiplication for the most significant part of the 3866 // result by default. 3867 if (LocalAccum.size() == 1 && 3868 (!UsePartialMad64_32 || !CarryIn.empty())) { 3869 do { 3870 // Skip multiplication if one of the operands is 0 3871 unsigned j1 = DstIndex - j0; 3872 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) { 3873 ++j0; 3874 continue; 3875 } 3876 auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]); 3877 if (!LocalAccum[0] || KB.getKnownBits(LocalAccum[0]).isZero()) { 3878 LocalAccum[0] = Mul.getReg(0); 3879 } else { 3880 if (CarryIn.empty()) { 3881 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0); 3882 } else { 3883 LocalAccum[0] = 3884 B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back()) 3885 .getReg(0); 3886 CarryIn.pop_back(); 3887 } 3888 } 3889 ++j0; 3890 } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty())); 3891 } 3892 3893 // Build full 64-bit multiplies. 
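// The running accumulator is widened to 64 bits and threaded through
// G_AMDGPU_MAD_U64_U32; the MAD's carry-out is only collected once the
// accumulator is no longer known to fit in the low 32 bits.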
3894 if (j0 <= DstIndex) { 3895 bool HaveSmallAccum = false; 3896 Register Tmp; 3897 3898 if (LocalAccum[0]) { 3899 if (LocalAccum.size() == 1) { 3900 Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0); 3901 HaveSmallAccum = true; 3902 } else if (LocalAccum[1]) { 3903 Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0); 3904 HaveSmallAccum = false; 3905 } else { 3906 Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0); 3907 HaveSmallAccum = true; 3908 } 3909 } else { 3910 assert(LocalAccum.size() == 1 || !LocalAccum[1]); 3911 Tmp = getZero64(); 3912 HaveSmallAccum = true; 3913 } 3914 3915 do { 3916 unsigned j1 = DstIndex - j0; 3917 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) { 3918 ++j0; 3919 continue; 3920 } 3921 auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1}, 3922 {Src0[j0], Src1[j1], Tmp}); 3923 Tmp = Mad.getReg(0); 3924 if (!HaveSmallAccum) 3925 CarryOut.push_back(Mad.getReg(1)); 3926 HaveSmallAccum = false; 3927 3928 ++j0; 3929 } while (j0 <= DstIndex); 3930 3931 auto Unmerge = B.buildUnmerge(S32, Tmp); 3932 LocalAccum[0] = Unmerge.getReg(0); 3933 if (LocalAccum.size() > 1) 3934 LocalAccum[1] = Unmerge.getReg(1); 3935 } 3936 3937 return CarryOut; 3938 }; 3939 3940 // Outer multiply loop, iterating over destination parts from least 3941 // significant to most significant parts. 3942 // 3943 // The columns of the following diagram correspond to the destination parts 3944 // affected by one iteration of the outer loop (ignoring boundary 3945 // conditions). 3946 // 3947 // Dest index relative to 2 * i: 1 0 -1 3948 // ------ 3949 // Carries from previous iteration: e o 3950 // Even-aligned partial product sum: E E . 3951 // Odd-aligned partial product sum: O O 3952 // 3953 // 'o' is OddCarry, 'e' is EvenCarry. 3954 // EE and OO are computed from partial products via buildMadChain and use 3955 // accumulation where possible and appropriate. 3956 // 3957 Register SeparateOddCarry; 3958 Carry EvenCarry; 3959 Carry OddCarry; 3960 3961 for (unsigned i = 0; i <= Accum.size() / 2; ++i) { 3962 Carry OddCarryIn = std::move(OddCarry); 3963 Carry EvenCarryIn = std::move(EvenCarry); 3964 OddCarry.clear(); 3965 EvenCarry.clear(); 3966 3967 // Partial products at offset 2 * i. 3968 if (2 * i < Accum.size()) { 3969 auto LocalAccum = Accum.drop_front(2 * i).take_front(2); 3970 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn); 3971 } 3972 3973 // Partial products at offset 2 * i - 1. 3974 if (i > 0) { 3975 if (!SeparateOddAlignedProducts) { 3976 auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2); 3977 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn); 3978 } else { 3979 bool IsHighest = 2 * i >= Accum.size(); 3980 Register SeparateOddOut[2]; 3981 auto LocalAccum = MutableArrayRef(SeparateOddOut) 3982 .take_front(IsHighest ? 
1 : 2); 3983 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn); 3984 3985 MachineInstr *Lo; 3986 3987 if (i == 1) { 3988 if (!IsHighest) 3989 Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]); 3990 else 3991 Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]); 3992 } else { 3993 Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0], 3994 SeparateOddCarry); 3995 } 3996 Accum[2 * i - 1] = Lo->getOperand(0).getReg(); 3997 3998 if (!IsHighest) { 3999 auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1], 4000 Lo->getOperand(1).getReg()); 4001 Accum[2 * i] = Hi.getReg(0); 4002 SeparateOddCarry = Hi.getReg(1); 4003 } 4004 } 4005 } 4006 4007 // Add in the carries from the previous iteration 4008 if (i > 0) { 4009 if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn)) 4010 EvenCarryIn.push_back(CarryOut); 4011 4012 if (2 * i < Accum.size()) { 4013 if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn)) 4014 OddCarry.push_back(CarryOut); 4015 } 4016 } 4017 } 4018 } 4019 4020 // Custom narrowing of wide multiplies using wide multiply-add instructions. 4021 // 4022 // TODO: If the multiply is followed by an addition, we should attempt to 4023 // integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities. 4024 bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper, 4025 MachineInstr &MI) const { 4026 assert(ST.hasMad64_32()); 4027 assert(MI.getOpcode() == TargetOpcode::G_MUL); 4028 4029 MachineIRBuilder &B = Helper.MIRBuilder; 4030 MachineRegisterInfo &MRI = *B.getMRI(); 4031 4032 Register DstReg = MI.getOperand(0).getReg(); 4033 Register Src0 = MI.getOperand(1).getReg(); 4034 Register Src1 = MI.getOperand(2).getReg(); 4035 4036 LLT Ty = MRI.getType(DstReg); 4037 assert(Ty.isScalar()); 4038 4039 unsigned Size = Ty.getSizeInBits(); 4040 unsigned NumParts = Size / 32; 4041 assert((Size % 32) == 0); 4042 assert(NumParts >= 2); 4043 4044 // Whether to use MAD_64_32 for partial products whose high half is 4045 // discarded. This avoids some ADD instructions but risks false dependency 4046 // stalls on some subtargets in some cases. 4047 const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10; 4048 4049 // Whether to compute odd-aligned partial products separately. This is 4050 // advisable on subtargets where the accumulator of MAD_64_32 must be placed 4051 // in an even-aligned VGPR. 4052 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops(); 4053 4054 LLT S32 = LLT::scalar(32); 4055 SmallVector<Register, 2> Src0Parts, Src1Parts; 4056 for (unsigned i = 0; i < NumParts; ++i) { 4057 Src0Parts.push_back(MRI.createGenericVirtualRegister(S32)); 4058 Src1Parts.push_back(MRI.createGenericVirtualRegister(S32)); 4059 } 4060 B.buildUnmerge(Src0Parts, Src0); 4061 B.buildUnmerge(Src1Parts, Src1); 4062 4063 SmallVector<Register, 2> AccumRegs(NumParts); 4064 buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32, 4065 SeparateOddAlignedProducts); 4066 4067 B.buildMergeLikeInstr(DstReg, AccumRegs); 4068 MI.eraseFromParent(); 4069 return true; 4070 } 4071 4072 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to 4073 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input 4074 // case with a single min instruction instead of a compare+select. 
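// For a 32-bit G_CTLZ this produces
//   %tmp = G_AMDGPU_FFBH_U32 %src
//   %dst = G_UMIN %tmp, 32
// relying on ffbh/ffbl returning all ones for a zero input so the min clamps
// it to the bit width.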
4075 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI, 4076 MachineRegisterInfo &MRI, 4077 MachineIRBuilder &B) const { 4078 Register Dst = MI.getOperand(0).getReg(); 4079 Register Src = MI.getOperand(1).getReg(); 4080 LLT DstTy = MRI.getType(Dst); 4081 LLT SrcTy = MRI.getType(Src); 4082 4083 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ 4084 ? AMDGPU::G_AMDGPU_FFBH_U32 4085 : AMDGPU::G_AMDGPU_FFBL_B32; 4086 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src}); 4087 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits())); 4088 4089 MI.eraseFromParent(); 4090 return true; 4091 } 4092 4093 // Check that this is a G_XOR x, -1 4094 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { 4095 if (MI.getOpcode() != TargetOpcode::G_XOR) 4096 return false; 4097 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); 4098 return ConstVal && *ConstVal == -1; 4099 } 4100 4101 // Return the use branch instruction, otherwise null if the usage is invalid. 4102 static MachineInstr * 4103 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, 4104 MachineBasicBlock *&UncondBrTarget, bool &Negated) { 4105 Register CondDef = MI.getOperand(0).getReg(); 4106 if (!MRI.hasOneNonDBGUse(CondDef)) 4107 return nullptr; 4108 4109 MachineBasicBlock *Parent = MI.getParent(); 4110 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); 4111 4112 if (isNot(MRI, *UseMI)) { 4113 Register NegatedCond = UseMI->getOperand(0).getReg(); 4114 if (!MRI.hasOneNonDBGUse(NegatedCond)) 4115 return nullptr; 4116 4117 // We're deleting the def of this value, so we need to remove it. 4118 eraseInstr(*UseMI, MRI); 4119 4120 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); 4121 Negated = true; 4122 } 4123 4124 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) 4125 return nullptr; 4126 4127 // Make sure the cond br is followed by a G_BR, or is the last instruction. 4128 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); 4129 if (Next == Parent->end()) { 4130 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 4131 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 4132 return nullptr; 4133 UncondBrTarget = &*NextMBB; 4134 } else { 4135 if (Next->getOpcode() != AMDGPU::G_BR) 4136 return nullptr; 4137 Br = &*Next; 4138 UncondBrTarget = Br->getOperand(0).getMBB(); 4139 } 4140 4141 return UseMI; 4142 } 4143 4144 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 4145 const ArgDescriptor *Arg, 4146 const TargetRegisterClass *ArgRC, 4147 LLT ArgTy) const { 4148 MCRegister SrcReg = Arg->getRegister(); 4149 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); 4150 assert(DstReg.isVirtual() && "Virtual register expected"); 4151 4152 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, 4153 *ArgRC, B.getDebugLoc(), ArgTy); 4154 if (Arg->isMasked()) { 4155 // TODO: Should we try to emit this once in the entry block? 4156 const LLT S32 = LLT::scalar(32); 4157 const unsigned Mask = Arg->getMask(); 4158 const unsigned Shift = llvm::countr_zero<unsigned>(Mask); 4159 4160 Register AndMaskSrc = LiveIn; 4161 4162 // TODO: Avoid clearing the high bits if we know workitem id y/z are always 4163 // 0. 
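// The field is recovered as (LiveIn >> Shift) & (Mask >> Shift), where Shift
// is the number of trailing zeros in the mask.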
4164 if (Shift != 0) { 4165 auto ShiftAmt = B.buildConstant(S32, Shift); 4166 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 4167 } 4168 4169 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 4170 } else { 4171 B.buildCopy(DstReg, LiveIn); 4172 } 4173 4174 return true; 4175 } 4176 4177 bool AMDGPULegalizerInfo::loadInputValue( 4178 Register DstReg, MachineIRBuilder &B, 4179 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 4180 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 4181 const ArgDescriptor *Arg; 4182 const TargetRegisterClass *ArgRC; 4183 LLT ArgTy; 4184 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 4185 4186 if (!Arg) { 4187 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) { 4188 // The intrinsic may appear when we have a 0 sized kernarg segment, in which 4189 // case the pointer argument may be missing and we use null. 4190 B.buildConstant(DstReg, 0); 4191 return true; 4192 } 4193 4194 // It's undefined behavior if a function marked with the amdgpu-no-* 4195 // attributes uses the corresponding intrinsic. 4196 B.buildUndef(DstReg); 4197 return true; 4198 } 4199 4200 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 4201 return false; // TODO: Handle these 4202 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 4203 } 4204 4205 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 4206 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 4207 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 4208 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 4209 return false; 4210 4211 MI.eraseFromParent(); 4212 return true; 4213 } 4214 4215 static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, 4216 int64_t C) { 4217 B.buildConstant(MI.getOperand(0).getReg(), C); 4218 MI.eraseFromParent(); 4219 return true; 4220 } 4221 4222 bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic( 4223 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 4224 unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 4225 unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim); 4226 if (MaxID == 0) 4227 return replaceWithConstant(B, MI, 0); 4228 4229 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 4230 const ArgDescriptor *Arg; 4231 const TargetRegisterClass *ArgRC; 4232 LLT ArgTy; 4233 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 4234 4235 Register DstReg = MI.getOperand(0).getReg(); 4236 if (!Arg) { 4237 // It's undefined behavior if a function marked with the amdgpu-no-* 4238 // attributes uses the corresponding intrinsic. 4239 B.buildUndef(DstReg); 4240 MI.eraseFromParent(); 4241 return true; 4242 } 4243 4244 if (Arg->isMasked()) { 4245 // Don't bother inserting AssertZext for packed IDs since we're emitting the 4246 // masking operations anyway. 4247 // 4248 // TODO: We could assert the top bit is 0 for the source copy.
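// For unpacked IDs the else branch below wraps the loaded value in a
// G_ASSERT_ZEXT to bit_width(MaxID) so the known range is preserved.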
4249 if (!loadInputValue(DstReg, B, ArgType)) 4250 return false; 4251 } else { 4252 Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32)); 4253 if (!loadInputValue(TmpReg, B, ArgType)) 4254 return false; 4255 B.buildAssertZExt(DstReg, TmpReg, llvm::bit_width(MaxID)); 4256 } 4257 4258 MI.eraseFromParent(); 4259 return true; 4260 } 4261 4262 Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B, 4263 int64_t Offset) const { 4264 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 4265 Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy); 4266 4267 // TODO: If we passed in the base kernel offset we could have a better 4268 // alignment than 4, but we don't really need it. 4269 if (!loadInputValue(KernArgReg, B, 4270 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 4271 llvm_unreachable("failed to find kernarg segment ptr"); 4272 4273 auto COffset = B.buildConstant(LLT::scalar(64), Offset); 4274 // TODO: Should get nuw 4275 return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0); 4276 } 4277 4278 /// Legalize a value that's loaded from kernel arguments. This is only used by 4279 /// legacy intrinsics. 4280 bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI, 4281 MachineIRBuilder &B, 4282 uint64_t Offset, 4283 Align Alignment) const { 4284 Register DstReg = MI.getOperand(0).getReg(); 4285 4286 assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) && 4287 "unexpected kernarg parameter type"); 4288 4289 Register Ptr = getKernargParameterPtr(B, Offset); 4290 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 4291 B.buildLoad(DstReg, Ptr, PtrInfo, Align(4), 4292 MachineMemOperand::MODereferenceable | 4293 MachineMemOperand::MOInvariant); 4294 MI.eraseFromParent(); 4295 return true; 4296 } 4297 4298 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 4299 MachineRegisterInfo &MRI, 4300 MachineIRBuilder &B) const { 4301 Register Dst = MI.getOperand(0).getReg(); 4302 LLT DstTy = MRI.getType(Dst); 4303 LLT S16 = LLT::scalar(16); 4304 LLT S32 = LLT::scalar(32); 4305 LLT S64 = LLT::scalar(64); 4306 4307 if (DstTy == S16) 4308 return legalizeFDIV16(MI, MRI, B); 4309 if (DstTy == S32) 4310 return legalizeFDIV32(MI, MRI, B); 4311 if (DstTy == S64) 4312 return legalizeFDIV64(MI, MRI, B); 4313 4314 return false; 4315 } 4316 4317 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, 4318 Register DstDivReg, 4319 Register DstRemReg, 4320 Register X, 4321 Register Y) const { 4322 const LLT S1 = LLT::scalar(1); 4323 const LLT S32 = LLT::scalar(32); 4324 4325 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 4326 // algorithm used here. 4327 4328 // Initial estimate of inv(y). 4329 auto FloatY = B.buildUITOFP(S32, Y); 4330 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 4331 auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe)); 4332 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 4333 auto Z = B.buildFPTOUI(S32, ScaledY); 4334 4335 // One round of UNR. 4336 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 4337 auto NegYZ = B.buildMul(S32, NegY, Z); 4338 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 4339 4340 // Quotient/remainder estimate. 4341 auto Q = B.buildUMulH(S32, X, Z); 4342 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 4343 4344 // First quotient/remainder refinement. 
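// If the remainder estimate is still >= Y, bump the quotient by one and
// subtract Y from the remainder; the second refinement below applies the same
// correction again.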
4345 auto One = B.buildConstant(S32, 1); 4346 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 4347 if (DstDivReg) 4348 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 4349 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 4350 4351 // Second quotient/remainder refinement. 4352 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 4353 if (DstDivReg) 4354 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q); 4355 4356 if (DstRemReg) 4357 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R); 4358 } 4359 4360 // Build integer reciprocal sequence around V_RCP_IFLAG_F32 4361 // 4362 // Return lo, hi of result 4363 // 4364 // %cvt.lo = G_UITOFP Val.lo 4365 // %cvt.hi = G_UITOFP Val.hi 4366 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 4367 // %rcp = G_AMDGPU_RCP_IFLAG %mad 4368 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 4369 // %mul2 = G_FMUL %mul1, 2**(-32) 4370 // %trunc = G_INTRINSIC_TRUNC %mul2 4371 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 4372 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 4373 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 4374 Register Val) { 4375 const LLT S32 = LLT::scalar(32); 4376 auto Unmerge = B.buildUnmerge(S32, Val); 4377 4378 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 4379 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 4380 4381 auto Mad = B.buildFMAD( 4382 S32, CvtHi, // 2**32 4383 B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo); 4384 4385 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 4386 auto Mul1 = B.buildFMul( 4387 S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc))); 4388 4389 // 2**(-32) 4390 auto Mul2 = B.buildFMul( 4391 S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000))); 4392 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 4393 4394 // -(2**32) 4395 auto Mad2 = B.buildFMAD( 4396 S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)), 4397 Mul1); 4398 4399 auto ResultLo = B.buildFPTOUI(S32, Mad2); 4400 auto ResultHi = B.buildFPTOUI(S32, Trunc); 4401 4402 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 4403 } 4404 4405 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, 4406 Register DstDivReg, 4407 Register DstRemReg, 4408 Register Numer, 4409 Register Denom) const { 4410 const LLT S32 = LLT::scalar(32); 4411 const LLT S64 = LLT::scalar(64); 4412 const LLT S1 = LLT::scalar(1); 4413 Register RcpLo, RcpHi; 4414 4415 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 4416 4417 auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi}); 4418 4419 auto Zero64 = B.buildConstant(S64, 0); 4420 auto NegDenom = B.buildSub(S64, Zero64, Denom); 4421 4422 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 4423 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 4424 4425 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 4426 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 4427 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 4428 4429 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 4430 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 4431 auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi}); 4432 4433 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 4434 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 4435 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 4436 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 4437 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 4438 4439 auto Zero32 = B.buildConstant(S32, 0); 4440 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 4441 auto Add2_Hi = 
B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1)); 4442 auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi}); 4443 4444 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 4445 Register NumerLo = UnmergeNumer.getReg(0); 4446 Register NumerHi = UnmergeNumer.getReg(1); 4447 4448 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 4449 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 4450 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 4451 Register Mul3_Lo = UnmergeMul3.getReg(0); 4452 Register Mul3_Hi = UnmergeMul3.getReg(1); 4453 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 4454 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 4455 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 4456 auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi}); 4457 4458 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 4459 Register DenomLo = UnmergeDenom.getReg(0); 4460 Register DenomHi = UnmergeDenom.getReg(1); 4461 4462 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 4463 auto C1 = B.buildSExt(S32, CmpHi); 4464 4465 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 4466 auto C2 = B.buildSExt(S32, CmpLo); 4467 4468 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 4469 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 4470 4471 // TODO: Here and below portions of the code can be enclosed into if/endif. 4472 // Currently control flow is unconditional and we have 4 selects after 4473 // potential endif to substitute PHIs. 4474 4475 // if C3 != 0 ... 4476 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 4477 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 4478 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 4479 auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi}); 4480 4481 auto One64 = B.buildConstant(S64, 1); 4482 auto Add3 = B.buildAdd(S64, MulHi3, One64); 4483 4484 auto C4 = 4485 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 4486 auto C5 = 4487 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 4488 auto C6 = B.buildSelect( 4489 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 4490 4491 // if (C6 != 0) 4492 auto Add4 = B.buildAdd(S64, Add3, One64); 4493 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 4494 4495 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 4496 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 4497 auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi}); 4498 4499 // endif C6 4500 // endif C3 4501 4502 if (DstDivReg) { 4503 auto Sel1 = B.buildSelect( 4504 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 4505 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 4506 Sel1, MulHi3); 4507 } 4508 4509 if (DstRemReg) { 4510 auto Sel2 = B.buildSelect( 4511 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 4512 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 4513 Sel2, Sub1); 4514 } 4515 } 4516 4517 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI, 4518 MachineRegisterInfo &MRI, 4519 MachineIRBuilder &B) const { 4520 Register DstDivReg, DstRemReg; 4521 switch (MI.getOpcode()) { 4522 default: 4523 llvm_unreachable("Unexpected opcode!"); 4524 case AMDGPU::G_UDIV: { 4525 DstDivReg = MI.getOperand(0).getReg(); 4526 break; 4527 } 4528 case AMDGPU::G_UREM: { 4529 DstRemReg = MI.getOperand(0).getReg(); 4530 break; 4531 } 4532 case 
AMDGPU::G_UDIVREM: { 4533 DstDivReg = MI.getOperand(0).getReg(); 4534 DstRemReg = MI.getOperand(1).getReg(); 4535 break; 4536 } 4537 } 4538 4539 const LLT S64 = LLT::scalar(64); 4540 const LLT S32 = LLT::scalar(32); 4541 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 4542 Register Num = MI.getOperand(FirstSrcOpIdx).getReg(); 4543 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 4544 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 4545 4546 if (Ty == S32) 4547 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den); 4548 else if (Ty == S64) 4549 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den); 4550 else 4551 return false; 4552 4553 MI.eraseFromParent(); 4554 return true; 4555 } 4556 4557 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI, 4558 MachineRegisterInfo &MRI, 4559 MachineIRBuilder &B) const { 4560 const LLT S64 = LLT::scalar(64); 4561 const LLT S32 = LLT::scalar(32); 4562 4563 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 4564 if (Ty != S32 && Ty != S64) 4565 return false; 4566 4567 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 4568 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg(); 4569 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 4570 4571 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 4572 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 4573 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 4574 4575 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 4576 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 4577 4578 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 4579 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 4580 4581 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg; 4582 switch (MI.getOpcode()) { 4583 default: 4584 llvm_unreachable("Unexpected opcode!"); 4585 case AMDGPU::G_SDIV: { 4586 DstDivReg = MI.getOperand(0).getReg(); 4587 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 4588 break; 4589 } 4590 case AMDGPU::G_SREM: { 4591 DstRemReg = MI.getOperand(0).getReg(); 4592 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 4593 break; 4594 } 4595 case AMDGPU::G_SDIVREM: { 4596 DstDivReg = MI.getOperand(0).getReg(); 4597 DstRemReg = MI.getOperand(1).getReg(); 4598 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 4599 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 4600 break; 4601 } 4602 } 4603 4604 if (Ty == S32) 4605 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 4606 else 4607 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 4608 4609 if (DstDivReg) { 4610 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 4611 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0); 4612 B.buildSub(DstDivReg, SignXor, Sign); 4613 } 4614 4615 if (DstRemReg) { 4616 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 4617 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0); 4618 B.buildSub(DstRemReg, SignXor, Sign); 4619 } 4620 4621 MI.eraseFromParent(); 4622 return true; 4623 } 4624 4625 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 4626 MachineRegisterInfo &MRI, 4627 MachineIRBuilder &B) const { 4628 Register Res = MI.getOperand(0).getReg(); 4629 Register LHS = MI.getOperand(1).getReg(); 4630 Register RHS = MI.getOperand(2).getReg(); 4631 uint16_t Flags = MI.getFlags(); 4632 LLT ResTy = MRI.getType(Res); 4633 4634 const MachineFunction &MF = B.getMF(); 4635 bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn) || 4636 MF.getTarget().Options.UnsafeFPMath; 4637 4638 if (auto CLHS = getConstantFPVRegVal(LHS, 
MRI)) { 4639 if (!AllowInaccurateRcp && ResTy != LLT::scalar(16)) 4640 return false; 4641 4642 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to 4643 // the CI documentation has a worst case error of 1 ulp. 4644 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to 4645 // use it as long as we aren't trying to use denormals. 4646 // 4647 // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp. 4648 4649 // 1 / x -> RCP(x) 4650 if (CLHS->isExactlyValue(1.0)) { 4651 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res) 4652 .addUse(RHS) 4653 .setMIFlags(Flags); 4654 4655 MI.eraseFromParent(); 4656 return true; 4657 } 4658 4659 // -1 / x -> RCP( FNEG(x) ) 4660 if (CLHS->isExactlyValue(-1.0)) { 4661 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 4662 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res) 4663 .addUse(FNeg.getReg(0)) 4664 .setMIFlags(Flags); 4665 4666 MI.eraseFromParent(); 4667 return true; 4668 } 4669 } 4670 4671 // For f16 require afn or arcp. 4672 // For f32 require afn. 4673 if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) || 4674 !MI.getFlag(MachineInstr::FmArcp))) 4675 return false; 4676 4677 // x / y -> x * (1.0 / y) 4678 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}) 4679 .addUse(RHS) 4680 .setMIFlags(Flags); 4681 B.buildFMul(Res, LHS, RCP, Flags); 4682 4683 MI.eraseFromParent(); 4684 return true; 4685 } 4686 4687 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, 4688 MachineRegisterInfo &MRI, 4689 MachineIRBuilder &B) const { 4690 Register Res = MI.getOperand(0).getReg(); 4691 Register X = MI.getOperand(1).getReg(); 4692 Register Y = MI.getOperand(2).getReg(); 4693 uint16_t Flags = MI.getFlags(); 4694 LLT ResTy = MRI.getType(Res); 4695 4696 const MachineFunction &MF = B.getMF(); 4697 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 4698 MI.getFlag(MachineInstr::FmAfn); 4699 4700 if (!AllowInaccurateRcp) 4701 return false; 4702 4703 auto NegY = B.buildFNeg(ResTy, Y); 4704 auto One = B.buildFConstant(ResTy, 1.0); 4705 4706 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}) 4707 .addUse(Y) 4708 .setMIFlags(Flags); 4709 4710 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); 4711 R = B.buildFMA(ResTy, Tmp0, R, R); 4712 4713 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); 4714 R = B.buildFMA(ResTy, Tmp1, R, R); 4715 4716 auto Ret = B.buildFMul(ResTy, X, R); 4717 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); 4718 4719 B.buildFMA(Res, Tmp2, R, Ret); 4720 MI.eraseFromParent(); 4721 return true; 4722 } 4723 4724 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 4725 MachineRegisterInfo &MRI, 4726 MachineIRBuilder &B) const { 4727 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 4728 return true; 4729 4730 Register Res = MI.getOperand(0).getReg(); 4731 Register LHS = MI.getOperand(1).getReg(); 4732 Register RHS = MI.getOperand(2).getReg(); 4733 4734 uint16_t Flags = MI.getFlags(); 4735 4736 LLT S16 = LLT::scalar(16); 4737 LLT S32 = LLT::scalar(32); 4738 4739 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 4740 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 4741 4742 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}) 4743 .addUse(RHSExt.getReg(0)) 4744 .setMIFlags(Flags); 4745 4746 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 4747 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 4748 4749 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res) 4750 .addUse(RDst.getReg(0)) 4751 .addUse(RHS) 4752 .addUse(LHS) 4753 .setMIFlags(Flags); 4754 4755 MI.eraseFromParent(); 4756 return true; 4757 } 4758 4759 static const 
unsigned SPDenormModeBitField = 4760 AMDGPU::Hwreg::ID_MODE | (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 4761 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 4762 4763 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 4764 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 4765 static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B, 4766 const GCNSubtarget &ST, 4767 SIModeRegisterDefaults Mode) { 4768 // Set SP denorm mode to this value. 4769 unsigned SPDenormMode = 4770 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 4771 4772 if (ST.hasDenormModeInst()) { 4773 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 4774 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 4775 4776 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 4777 B.buildInstr(AMDGPU::S_DENORM_MODE) 4778 .addImm(NewDenormModeValue); 4779 4780 } else { 4781 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 4782 .addImm(SPDenormMode) 4783 .addImm(SPDenormModeBitField); 4784 } 4785 } 4786 4787 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 4788 MachineRegisterInfo &MRI, 4789 MachineIRBuilder &B) const { 4790 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 4791 return true; 4792 4793 Register Res = MI.getOperand(0).getReg(); 4794 Register LHS = MI.getOperand(1).getReg(); 4795 Register RHS = MI.getOperand(2).getReg(); 4796 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 4797 SIModeRegisterDefaults Mode = MFI->getMode(); 4798 4799 uint16_t Flags = MI.getFlags(); 4800 4801 LLT S32 = LLT::scalar(32); 4802 LLT S1 = LLT::scalar(1); 4803 4804 auto One = B.buildFConstant(S32, 1.0f); 4805 4806 auto DenominatorScaled = 4807 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}) 4808 .addUse(LHS) 4809 .addUse(RHS) 4810 .addImm(0) 4811 .setMIFlags(Flags); 4812 auto NumeratorScaled = 4813 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}) 4814 .addUse(LHS) 4815 .addUse(RHS) 4816 .addImm(1) 4817 .setMIFlags(Flags); 4818 4819 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}) 4820 .addUse(DenominatorScaled.getReg(0)) 4821 .setMIFlags(Flags); 4822 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 4823 4824 const bool PreservesDenormals = Mode.FP32Denormals == DenormalMode::getIEEE(); 4825 const bool HasDynamicDenormals = 4826 (Mode.FP32Denormals.Input == DenormalMode::Dynamic) || 4827 (Mode.FP32Denormals.Output == DenormalMode::Dynamic); 4828 4829 Register SavedSPDenormMode; 4830 if (!PreservesDenormals) { 4831 if (HasDynamicDenormals) { 4832 SavedSPDenormMode = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 4833 B.buildInstr(AMDGPU::S_GETREG_B32) 4834 .addDef(SavedSPDenormMode) 4835 .addImm(SPDenormModeBitField); 4836 } 4837 toggleSPDenormMode(true, B, ST, Mode); 4838 } 4839 4840 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 4841 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 4842 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 4843 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 4844 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 4845 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 4846 4847 if (!PreservesDenormals) { 4848 if (HasDynamicDenormals) { 4849 assert(SavedSPDenormMode); 4850 B.buildInstr(AMDGPU::S_SETREG_B32) 4851 .addReg(SavedSPDenormMode) 4852 .addImm(SPDenormModeBitField); 4853 } else 4854 toggleSPDenormMode(false, B, ST, Mode); 4855 } 4856 4857 auto Fmas = 
B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}) 4858 .addUse(Fma4.getReg(0)) 4859 .addUse(Fma1.getReg(0)) 4860 .addUse(Fma3.getReg(0)) 4861 .addUse(NumeratorScaled.getReg(1)) 4862 .setMIFlags(Flags); 4863 4864 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res) 4865 .addUse(Fmas.getReg(0)) 4866 .addUse(RHS) 4867 .addUse(LHS) 4868 .setMIFlags(Flags); 4869 4870 MI.eraseFromParent(); 4871 return true; 4872 } 4873 4874 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 4875 MachineRegisterInfo &MRI, 4876 MachineIRBuilder &B) const { 4877 if (legalizeFastUnsafeFDIV64(MI, MRI, B)) 4878 return true; 4879 4880 Register Res = MI.getOperand(0).getReg(); 4881 Register LHS = MI.getOperand(1).getReg(); 4882 Register RHS = MI.getOperand(2).getReg(); 4883 4884 uint16_t Flags = MI.getFlags(); 4885 4886 LLT S64 = LLT::scalar(64); 4887 LLT S1 = LLT::scalar(1); 4888 4889 auto One = B.buildFConstant(S64, 1.0); 4890 4891 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}) 4892 .addUse(LHS) 4893 .addUse(RHS) 4894 .addImm(0) 4895 .setMIFlags(Flags); 4896 4897 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 4898 4899 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}) 4900 .addUse(DivScale0.getReg(0)) 4901 .setMIFlags(Flags); 4902 4903 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 4904 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 4905 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 4906 4907 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}) 4908 .addUse(LHS) 4909 .addUse(RHS) 4910 .addImm(1) 4911 .setMIFlags(Flags); 4912 4913 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 4914 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 4915 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 4916 4917 Register Scale; 4918 if (!ST.hasUsableDivScaleConditionOutput()) { 4919 // Workaround a hardware bug on SI where the condition output from div_scale 4920 // is not usable. 4921 4922 LLT S32 = LLT::scalar(32); 4923 4924 auto NumUnmerge = B.buildUnmerge(S32, LHS); 4925 auto DenUnmerge = B.buildUnmerge(S32, RHS); 4926 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 4927 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 4928 4929 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 4930 Scale1Unmerge.getReg(1)); 4931 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 4932 Scale0Unmerge.getReg(1)); 4933 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 4934 } else { 4935 Scale = DivScale1.getReg(1); 4936 } 4937 4938 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}) 4939 .addUse(Fma4.getReg(0)) 4940 .addUse(Fma3.getReg(0)) 4941 .addUse(Mul.getReg(0)) 4942 .addUse(Scale) 4943 .setMIFlags(Flags); 4944 4945 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res)) 4946 .addUse(Fmas.getReg(0)) 4947 .addUse(RHS) 4948 .addUse(LHS) 4949 .setMIFlags(Flags); 4950 4951 MI.eraseFromParent(); 4952 return true; 4953 } 4954 4955 bool AMDGPULegalizerInfo::legalizeFFREXP(MachineInstr &MI, 4956 MachineRegisterInfo &MRI, 4957 MachineIRBuilder &B) const { 4958 Register Res0 = MI.getOperand(0).getReg(); 4959 Register Res1 = MI.getOperand(1).getReg(); 4960 Register Val = MI.getOperand(2).getReg(); 4961 uint16_t Flags = MI.getFlags(); 4962 4963 LLT Ty = MRI.getType(Res0); 4964 LLT InstrExpTy = Ty == LLT::scalar(16) ? 
                         LLT::scalar(16) : LLT::scalar(32);

  auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})
                  .addUse(Val)
                  .setMIFlags(Flags);
  auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})
                 .addUse(Val)
                 .setMIFlags(Flags);

  if (ST.hasFractBug()) {
    auto Fabs = B.buildFAbs(Ty, Val);
    auto Inf = B.buildFConstant(Ty, APFloat::getInf(getFltSemanticForLLT(Ty)));
    auto IsFinite =
        B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
    auto Zero = B.buildConstant(InstrExpTy, 0);
    Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
    Mant = B.buildSelect(Ty, IsFinite, Mant, Val);
  }

  B.buildCopy(Res0, Mant);
  B.buildSExtOrTrunc(Res1, Exp);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(2).getReg();
  Register RHS = MI.getOperand(3).getReg();
  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto Abs = B.buildFAbs(S32, RHS, Flags);
  const APFloat C0Val(1.0f);

  auto C0 = B.buildFConstant(S32, 0x1p+96f);
  auto C1 = B.buildFConstant(S32, 0x1p-32f);
  auto C2 = B.buildFConstant(S32, 1.0f);

  auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
  auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);

  auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
                 .addUse(Mul0.getReg(0))
                 .setMIFlags(Flags);

  auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);

  B.buildFMul(Res, Sel, Mul1, Flags);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFSQRTF16(MachineInstr &MI,
                                           MachineRegisterInfo &MRI,
                                           MachineIRBuilder &B) const {
  // Bypass the correct expansion that a standard promotion through G_FSQRT
  // would get. The f32 op is accurate enough for the f16 case.
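  //
  // Illustrative sketch of the sequence built below (register names are just
  // for exposition, not taken from actual output):
  //   %ext:_(s32)  = G_FPEXT %src:_(s16)
  //   %sqrt:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sqrt), %ext
  //   %dst:_(s16)  = G_FPTRUNC %sqrt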
5030 unsigned Flags = MI.getFlags(); 5031 assert(!ST.has16BitInsts()); 5032 const LLT F32 = LLT::scalar(32); 5033 auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags); 5034 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32}) 5035 .addUse(Ext.getReg(0)) 5036 .setMIFlags(Flags); 5037 B.buildFPTrunc(MI.getOperand(0), Log2, Flags); 5038 MI.eraseFromParent(); 5039 return true; 5040 } 5041 5042 bool AMDGPULegalizerInfo::legalizeFSQRTF32(MachineInstr &MI, 5043 MachineRegisterInfo &MRI, 5044 MachineIRBuilder &B) const { 5045 MachineFunction &MF = B.getMF(); 5046 Register Dst = MI.getOperand(0).getReg(); 5047 Register X = MI.getOperand(1).getReg(); 5048 const unsigned Flags = MI.getFlags(); 5049 const LLT S1 = LLT::scalar(1); 5050 const LLT F32 = LLT::scalar(32); 5051 const LLT I32 = LLT::scalar(32); 5052 5053 if (allowApproxFunc(MF, Flags)) { 5054 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({Dst})) 5055 .addUse(X) 5056 .setMIFlags(Flags); 5057 MI.eraseFromParent(); 5058 return true; 5059 } 5060 5061 auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f); 5062 auto NeedScale = B.buildFCmp(CmpInst::FCMP_OGT, S1, ScaleThreshold, X, Flags); 5063 auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f); 5064 auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags); 5065 auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags); 5066 5067 Register SqrtS = MRI.createGenericVirtualRegister(F32); 5068 if (needsDenormHandlingF32(MF, X, Flags)) { 5069 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({SqrtS})) 5070 .addUse(SqrtX.getReg(0)) 5071 .setMIFlags(Flags); 5072 5073 auto NegOne = B.buildConstant(I32, -1); 5074 auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne); 5075 5076 auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags); 5077 auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags); 5078 5079 auto PosOne = B.buildConstant(I32, 1); 5080 auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne); 5081 5082 auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags); 5083 auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags); 5084 5085 auto Zero = B.buildFConstant(F32, 0.0f); 5086 auto SqrtVPLE0 = B.buildFCmp(CmpInst::FCMP_OLE, S1, SqrtVP, Zero, Flags); 5087 5088 SqrtS = 5089 B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0); 5090 5091 auto SqrtVPVSGT0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, SqrtVS, Zero, Flags); 5092 SqrtS = 5093 B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0); 5094 } else { 5095 auto SqrtR = 5096 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0)); 5097 B.buildFMul(SqrtS, SqrtX, SqrtR, Flags); 5098 5099 auto Half = B.buildFConstant(F32, 0.5f); 5100 auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags); 5101 auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags); 5102 auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags); 5103 SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags); 5104 SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0); 5105 auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags); 5106 auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, SqrtX, Flags); 5107 SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0); 5108 } 5109 5110 auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f); 5111 5112 auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags); 5113 5114 SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0); 5115 5116 auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf); 5117 B.buildSelect(Dst, 
IsZeroOrInf, SqrtX, SqrtS, Flags); 5118 5119 MI.eraseFromParent(); 5120 return true; 5121 } 5122 5123 bool AMDGPULegalizerInfo::legalizeFSQRTF64(MachineInstr &MI, 5124 MachineRegisterInfo &MRI, 5125 MachineIRBuilder &B) const { 5126 // For double type, the SQRT and RSQ instructions don't have required 5127 // precision, we apply Goldschmidt's algorithm to improve the result: 5128 // 5129 // y0 = rsq(x) 5130 // g0 = x * y0 5131 // h0 = 0.5 * y0 5132 // 5133 // r0 = 0.5 - h0 * g0 5134 // g1 = g0 * r0 + g0 5135 // h1 = h0 * r0 + h0 5136 // 5137 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1 5138 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1 5139 // h2 = h1 * r1 + h1 5140 // 5141 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2 5142 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2 5143 // 5144 // sqrt(x) = g3 5145 5146 const LLT S1 = LLT::scalar(1); 5147 const LLT S32 = LLT::scalar(32); 5148 const LLT F64 = LLT::scalar(64); 5149 5150 Register Dst = MI.getOperand(0).getReg(); 5151 assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt"); 5152 5153 Register X = MI.getOperand(1).getReg(); 5154 unsigned Flags = MI.getFlags(); 5155 5156 auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767); 5157 5158 auto ZeroInt = B.buildConstant(S32, 0); 5159 auto Scaling = B.buildFCmp(FCmpInst::FCMP_OLT, S1, X, ScaleConstant); 5160 5161 // Scale up input if it is too small. 5162 auto ScaleUpFactor = B.buildConstant(S32, 256); 5163 auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt); 5164 auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags); 5165 5166 auto SqrtY = 5167 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX.getReg(0)); 5168 5169 auto Half = B.buildFConstant(F64, 0.5); 5170 auto SqrtH0 = B.buildFMul(F64, SqrtY, Half); 5171 auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY); 5172 5173 auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0); 5174 auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half); 5175 5176 auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0); 5177 auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0); 5178 5179 auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1); 5180 auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX); 5181 5182 auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1); 5183 5184 auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2); 5185 auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX); 5186 5187 auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2); 5188 5189 // Scale down the result. 5190 auto ScaleDownFactor = B.buildConstant(S32, -128); 5191 auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt); 5192 SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags); 5193 5194 // TODO: Switch to fcmp oeq 0 for finite only. 
Can't fully remove this check 5195 // with finite only or nsz because rsq(+/-0) = +/-inf 5196 5197 // TODO: Check for DAZ and expand to subnormals 5198 auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf); 5199 5200 // If x is +INF, +0, or -0, use its original value 5201 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags); 5202 5203 MI.eraseFromParent(); 5204 return true; 5205 } 5206 5207 bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI, 5208 MachineRegisterInfo &MRI, 5209 MachineIRBuilder &B) const { 5210 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 5211 if (Ty == LLT::scalar(32)) 5212 return legalizeFSQRTF32(MI, MRI, B); 5213 if (Ty == LLT::scalar(64)) 5214 return legalizeFSQRTF64(MI, MRI, B); 5215 if (Ty == LLT::scalar(16)) 5216 return legalizeFSQRTF16(MI, MRI, B); 5217 return false; 5218 } 5219 5220 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 5221 // FIXME: Why do we handle this one but not other removed instructions? 5222 // 5223 // Reciprocal square root. The clamp prevents infinite results, clamping 5224 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 5225 // +-max_float. 5226 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 5227 MachineRegisterInfo &MRI, 5228 MachineIRBuilder &B) const { 5229 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 5230 return true; 5231 5232 Register Dst = MI.getOperand(0).getReg(); 5233 Register Src = MI.getOperand(2).getReg(); 5234 auto Flags = MI.getFlags(); 5235 5236 LLT Ty = MRI.getType(Dst); 5237 5238 const fltSemantics *FltSemantics; 5239 if (Ty == LLT::scalar(32)) 5240 FltSemantics = &APFloat::IEEEsingle(); 5241 else if (Ty == LLT::scalar(64)) 5242 FltSemantics = &APFloat::IEEEdouble(); 5243 else 5244 return false; 5245 5246 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}) 5247 .addUse(Src) 5248 .setMIFlags(Flags); 5249 5250 // We don't need to concern ourselves with the snan handling difference, since 5251 // the rsq quieted (or not) so use the one which will directly select. 5252 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 5253 const bool UseIEEE = MFI->getMode().IEEE; 5254 5255 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 5256 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 5257 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 5258 5259 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 5260 5261 if (UseIEEE) 5262 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 5263 else 5264 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 5265 MI.eraseFromParent(); 5266 return true; 5267 } 5268 5269 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { 5270 switch (IID) { 5271 case Intrinsic::amdgcn_ds_fadd: 5272 return AMDGPU::G_ATOMICRMW_FADD; 5273 case Intrinsic::amdgcn_ds_fmin: 5274 return AMDGPU::G_AMDGPU_ATOMIC_FMIN; 5275 case Intrinsic::amdgcn_ds_fmax: 5276 return AMDGPU::G_AMDGPU_ATOMIC_FMAX; 5277 default: 5278 llvm_unreachable("not a DS FP intrinsic"); 5279 } 5280 } 5281 5282 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, 5283 MachineInstr &MI, 5284 Intrinsic::ID IID) const { 5285 GISelChangeObserver &Observer = Helper.Observer; 5286 Observer.changingInstr(MI); 5287 5288 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); 5289 5290 // The remaining operands were used to set fields in the MemOperand on 5291 // construction. 
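  //
  // Rough before/after sketch; the meaning of the dropped trailing operands
  // (ordering/scope/volatile) is an assumption based on the ds.fadd/fmin/fmax
  // intrinsic signatures and is not re-checked here:
  //   before: %r = G_INTRINSIC_W_SIDE_EFFECTS <id>, %ptr, %val, ord, scope, vol
  //   after:  %r = G_ATOMICRMW_FADD %ptr, %val   (or the FMIN/FMAX pseudo);
  //           the memory details are carried by the MMO.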
5292 for (int I = 6; I > 3; --I) 5293 MI.removeOperand(I); 5294 5295 MI.removeOperand(1); // Remove the intrinsic ID. 5296 Observer.changedInstr(MI); 5297 return true; 5298 } 5299 5300 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 5301 MachineRegisterInfo &MRI, 5302 MachineIRBuilder &B) const { 5303 uint64_t Offset = 5304 ST.getTargetLowering()->getImplicitParameterOffset( 5305 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 5306 LLT DstTy = MRI.getType(DstReg); 5307 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 5308 5309 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 5310 if (!loadInputValue(KernargPtrReg, B, 5311 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 5312 return false; 5313 5314 // FIXME: This should be nuw 5315 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 5316 return true; 5317 } 5318 5319 /// To create a buffer resource from a 64-bit pointer, mask off the upper 32 5320 /// bits of the pointer and replace them with the stride argument, then 5321 /// merge_values everything together. In the common case of a raw buffer (the 5322 /// stride component is 0), we can just AND off the upper half. 5323 bool AMDGPULegalizerInfo::legalizePointerAsRsrcIntrin( 5324 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 5325 Register Result = MI.getOperand(0).getReg(); 5326 Register Pointer = MI.getOperand(2).getReg(); 5327 Register Stride = MI.getOperand(3).getReg(); 5328 Register NumRecords = MI.getOperand(4).getReg(); 5329 Register Flags = MI.getOperand(5).getReg(); 5330 5331 LLT S32 = LLT::scalar(32); 5332 5333 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 5334 auto Unmerge = B.buildUnmerge(S32, Pointer); 5335 Register LowHalf = Unmerge.getReg(0); 5336 Register HighHalf = Unmerge.getReg(1); 5337 5338 auto AndMask = B.buildConstant(S32, 0x0000ffff); 5339 auto Masked = B.buildAnd(S32, HighHalf, AndMask); 5340 5341 MachineInstrBuilder NewHighHalf = Masked; 5342 std::optional<ValueAndVReg> StrideConst = 5343 getIConstantVRegValWithLookThrough(Stride, MRI); 5344 if (!StrideConst || !StrideConst->Value.isZero()) { 5345 MachineInstrBuilder ShiftedStride; 5346 if (StrideConst) { 5347 uint32_t StrideVal = StrideConst->Value.getZExtValue(); 5348 uint32_t ShiftedStrideVal = StrideVal << 16; 5349 ShiftedStride = B.buildConstant(S32, ShiftedStrideVal); 5350 } else { 5351 auto ExtStride = B.buildAnyExt(S32, Stride); 5352 auto ShiftConst = B.buildConstant(S32, 16); 5353 ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst); 5354 } 5355 NewHighHalf = B.buildOr(S32, Masked, ShiftedStride); 5356 } 5357 Register NewHighHalfReg = NewHighHalf.getReg(0); 5358 B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags}); 5359 MI.eraseFromParent(); 5360 return true; 5361 } 5362 5363 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 5364 MachineRegisterInfo &MRI, 5365 MachineIRBuilder &B) const { 5366 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 5367 if (!MFI->isEntryFunction()) { 5368 return legalizePreloadedArgIntrin(MI, MRI, B, 5369 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 5370 } 5371 5372 Register DstReg = MI.getOperand(0).getReg(); 5373 if (!getImplicitArgPtr(DstReg, MRI, B)) 5374 return false; 5375 5376 MI.eraseFromParent(); 5377 return true; 5378 } 5379 5380 bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg, 5381 MachineRegisterInfo &MRI, 5382 MachineIRBuilder &B) const { 5383 Function &F = B.getMF().getFunction(); 5384 std::optional<uint32_t> 
KnownSize = 5385 AMDGPUMachineFunction::getLDSKernelIdMetadata(F); 5386 if (KnownSize.has_value()) 5387 B.buildConstant(DstReg, *KnownSize); 5388 return false; 5389 } 5390 5391 bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI, 5392 MachineRegisterInfo &MRI, 5393 MachineIRBuilder &B) const { 5394 5395 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 5396 if (!MFI->isEntryFunction()) { 5397 return legalizePreloadedArgIntrin(MI, MRI, B, 5398 AMDGPUFunctionArgInfo::LDS_KERNEL_ID); 5399 } 5400 5401 Register DstReg = MI.getOperand(0).getReg(); 5402 if (!getLDSKernelId(DstReg, MRI, B)) 5403 return false; 5404 5405 MI.eraseFromParent(); 5406 return true; 5407 } 5408 5409 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 5410 MachineRegisterInfo &MRI, 5411 MachineIRBuilder &B, 5412 unsigned AddrSpace) const { 5413 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 5414 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); 5415 Register Hi32 = Unmerge.getReg(1); 5416 5417 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 5418 MI.eraseFromParent(); 5419 return true; 5420 } 5421 5422 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 5423 // offset (the offset that is included in bounds checking and swizzling, to be 5424 // split between the instruction's voffset and immoffset fields) and soffset 5425 // (the offset that is excluded from bounds checking and swizzling, to go in 5426 // the instruction's soffset field). This function takes the first kind of 5427 // offset and figures out how to split it between voffset and immoffset. 5428 std::pair<Register, unsigned> 5429 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 5430 Register OrigOffset) const { 5431 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST); 5432 Register BaseReg; 5433 unsigned ImmOffset; 5434 const LLT S32 = LLT::scalar(32); 5435 MachineRegisterInfo &MRI = *B.getMRI(); 5436 5437 std::tie(BaseReg, ImmOffset) = 5438 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset); 5439 5440 // If BaseReg is a pointer, convert it to int. 5441 if (MRI.getType(BaseReg).isPointer()) 5442 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0); 5443 5444 // If the immediate value is too big for the immoffset field, put only bits 5445 // that would normally fit in the immoffset field. The remaining value that 5446 // is copied/added for the voffset field is a large power of 2, and it 5447 // stands more chance of being CSEd with the copy/add for another similar 5448 // load/store. 5449 // However, do not do that rounding down if that is a negative 5450 // number, as it appears to be illegal to have a negative offset in the 5451 // vgpr, even if adding the immediate offset makes it positive. 5452 unsigned Overflow = ImmOffset & ~MaxImm; 5453 ImmOffset -= Overflow; 5454 if ((int32_t)Overflow < 0) { 5455 Overflow += ImmOffset; 5456 ImmOffset = 0; 5457 } 5458 5459 if (Overflow != 0) { 5460 if (!BaseReg) { 5461 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 5462 } else { 5463 auto OverflowVal = B.buildConstant(S32, Overflow); 5464 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 5465 } 5466 } 5467 5468 if (!BaseReg) 5469 BaseReg = B.buildConstant(S32, 0).getReg(0); 5470 5471 return std::pair(BaseReg, ImmOffset); 5472 } 5473 5474 /// Handle register layout difference for f16 images for some subtargets. 
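///
/// Illustrative examples of what the body below produces (layouts inferred
/// from the code; element values are placeholders):
///   * unpacked subtargets:  <4 x s16> data is widened to <4 x s32>, one
///     16-bit element per dword (G_ANYEXT of each unmerged element).
///   * hasImageStoreD16Bug:  <3 x s16> is padded with undef to <6 x s16> and
///     bitcast to <3 x s32>.
///   * otherwise:            <3 x s16> is simply padded to <4 x s16>.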
5475 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 5476 MachineRegisterInfo &MRI, 5477 Register Reg, 5478 bool ImageStore) const { 5479 const LLT S16 = LLT::scalar(16); 5480 const LLT S32 = LLT::scalar(32); 5481 LLT StoreVT = MRI.getType(Reg); 5482 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 5483 5484 if (ST.hasUnpackedD16VMem()) { 5485 auto Unmerge = B.buildUnmerge(S16, Reg); 5486 5487 SmallVector<Register, 4> WideRegs; 5488 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 5489 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 5490 5491 int NumElts = StoreVT.getNumElements(); 5492 5493 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs) 5494 .getReg(0); 5495 } 5496 5497 if (ImageStore && ST.hasImageStoreD16Bug()) { 5498 if (StoreVT.getNumElements() == 2) { 5499 SmallVector<Register, 4> PackedRegs; 5500 Reg = B.buildBitcast(S32, Reg).getReg(0); 5501 PackedRegs.push_back(Reg); 5502 PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); 5503 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs) 5504 .getReg(0); 5505 } 5506 5507 if (StoreVT.getNumElements() == 3) { 5508 SmallVector<Register, 4> PackedRegs; 5509 auto Unmerge = B.buildUnmerge(S16, Reg); 5510 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 5511 PackedRegs.push_back(Unmerge.getReg(I)); 5512 PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); 5513 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0); 5514 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0); 5515 } 5516 5517 if (StoreVT.getNumElements() == 4) { 5518 SmallVector<Register, 4> PackedRegs; 5519 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0); 5520 auto Unmerge = B.buildUnmerge(S32, Reg); 5521 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 5522 PackedRegs.push_back(Unmerge.getReg(I)); 5523 PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); 5524 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs) 5525 .getReg(0); 5526 } 5527 5528 llvm_unreachable("invalid data type"); 5529 } 5530 5531 if (StoreVT == LLT::fixed_vector(3, S16)) { 5532 Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg) 5533 .getReg(0); 5534 } 5535 return Reg; 5536 } 5537 5538 Register AMDGPULegalizerInfo::fixStoreSourceType( 5539 MachineIRBuilder &B, Register VData, bool IsFormat) const { 5540 MachineRegisterInfo *MRI = B.getMRI(); 5541 LLT Ty = MRI->getType(VData); 5542 5543 const LLT S16 = LLT::scalar(16); 5544 5545 // Fixup buffer resources themselves needing to be v4i128. 5546 if (hasBufferRsrcWorkaround(Ty)) 5547 return castBufferRsrcToV4I32(VData, B); 5548 5549 // Fixup illegal register types for i8 stores. 
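  // For example, an s8 or s16 store payload %v is widened with
  //   %wide:_(s32) = G_ANYEXT %v
  // and the buffer store pseudo consumes the 32-bit register instead; the
  // narrow-store opcode is chosen later from the memory size in the MMO.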
5550 if (Ty == LLT::scalar(8) || Ty == S16) { 5551 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 5552 return AnyExt; 5553 } 5554 5555 if (Ty.isVector()) { 5556 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 5557 if (IsFormat) 5558 return handleD16VData(B, *MRI, VData); 5559 } 5560 } 5561 5562 return VData; 5563 } 5564 5565 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 5566 MachineRegisterInfo &MRI, 5567 MachineIRBuilder &B, 5568 bool IsTyped, 5569 bool IsFormat) const { 5570 Register VData = MI.getOperand(1).getReg(); 5571 LLT Ty = MRI.getType(VData); 5572 LLT EltTy = Ty.getScalarType(); 5573 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 5574 const LLT S32 = LLT::scalar(32); 5575 5576 VData = fixStoreSourceType(B, VData, IsFormat); 5577 castBufferRsrcArgToV4I32(MI, B, 2); 5578 Register RSrc = MI.getOperand(2).getReg(); 5579 5580 MachineMemOperand *MMO = *MI.memoperands_begin(); 5581 const int MemSize = MMO->getSize(); 5582 5583 unsigned ImmOffset; 5584 5585 // The typed intrinsics add an immediate after the registers. 5586 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 5587 5588 // The struct intrinsic variants add one additional operand over raw. 5589 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 5590 Register VIndex; 5591 int OpOffset = 0; 5592 if (HasVIndex) { 5593 VIndex = MI.getOperand(3).getReg(); 5594 OpOffset = 1; 5595 } else { 5596 VIndex = B.buildConstant(S32, 0).getReg(0); 5597 } 5598 5599 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 5600 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 5601 5602 unsigned Format = 0; 5603 if (IsTyped) { 5604 Format = MI.getOperand(5 + OpOffset).getImm(); 5605 ++OpOffset; 5606 } 5607 5608 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 5609 5610 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 5611 5612 unsigned Opc; 5613 if (IsTyped) { 5614 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 5615 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 5616 } else if (IsFormat) { 5617 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 5618 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 5619 } else { 5620 switch (MemSize) { 5621 case 1: 5622 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 5623 break; 5624 case 2: 5625 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 5626 break; 5627 default: 5628 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 5629 break; 5630 } 5631 } 5632 5633 auto MIB = B.buildInstr(Opc) 5634 .addUse(VData) // vdata 5635 .addUse(RSrc) // rsrc 5636 .addUse(VIndex) // vindex 5637 .addUse(VOffset) // voffset 5638 .addUse(SOffset) // soffset 5639 .addImm(ImmOffset); // offset(imm) 5640 5641 if (IsTyped) 5642 MIB.addImm(Format); 5643 5644 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 5645 .addImm(HasVIndex ? 
-1 : 0) // idxen(imm) 5646 .addMemOperand(MMO); 5647 5648 MI.eraseFromParent(); 5649 return true; 5650 } 5651 5652 static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc, 5653 Register VIndex, Register VOffset, Register SOffset, 5654 unsigned ImmOffset, unsigned Format, 5655 unsigned AuxiliaryData, MachineMemOperand *MMO, 5656 bool IsTyped, bool HasVIndex, MachineIRBuilder &B) { 5657 auto MIB = B.buildInstr(Opc) 5658 .addDef(LoadDstReg) // vdata 5659 .addUse(RSrc) // rsrc 5660 .addUse(VIndex) // vindex 5661 .addUse(VOffset) // voffset 5662 .addUse(SOffset) // soffset 5663 .addImm(ImmOffset); // offset(imm) 5664 5665 if (IsTyped) 5666 MIB.addImm(Format); 5667 5668 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 5669 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 5670 .addMemOperand(MMO); 5671 } 5672 5673 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 5674 MachineRegisterInfo &MRI, 5675 MachineIRBuilder &B, 5676 bool IsFormat, 5677 bool IsTyped) const { 5678 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 5679 MachineMemOperand *MMO = *MI.memoperands_begin(); 5680 const LLT MemTy = MMO->getMemoryType(); 5681 const LLT S32 = LLT::scalar(32); 5682 5683 Register Dst = MI.getOperand(0).getReg(); 5684 5685 Register StatusDst; 5686 int OpOffset = 0; 5687 assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2); 5688 bool IsTFE = MI.getNumExplicitDefs() == 2; 5689 if (IsTFE) { 5690 StatusDst = MI.getOperand(1).getReg(); 5691 ++OpOffset; 5692 } 5693 5694 castBufferRsrcArgToV4I32(MI, B, 2 + OpOffset); 5695 Register RSrc = MI.getOperand(2 + OpOffset).getReg(); 5696 5697 // The typed intrinsics add an immediate after the registers. 5698 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 5699 5700 // The struct intrinsic variants add one additional operand over raw. 5701 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset; 5702 Register VIndex; 5703 if (HasVIndex) { 5704 VIndex = MI.getOperand(3 + OpOffset).getReg(); 5705 ++OpOffset; 5706 } else { 5707 VIndex = B.buildConstant(S32, 0).getReg(0); 5708 } 5709 5710 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 5711 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 5712 5713 unsigned Format = 0; 5714 if (IsTyped) { 5715 Format = MI.getOperand(5 + OpOffset).getImm(); 5716 ++OpOffset; 5717 } 5718 5719 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 5720 unsigned ImmOffset; 5721 5722 LLT Ty = MRI.getType(Dst); 5723 // Make addrspace 8 pointers loads into 4xs32 loads here, so the rest of the 5724 // logic doesn't have to handle that case. 5725 if (hasBufferRsrcWorkaround(Ty)) { 5726 Ty = castBufferRsrcFromV4I32(MI, B, MRI, 0); 5727 Dst = MI.getOperand(0).getReg(); 5728 } 5729 LLT EltTy = Ty.getScalarType(); 5730 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 5731 const bool Unpacked = ST.hasUnpackedD16VMem(); 5732 5733 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 5734 5735 unsigned Opc; 5736 5737 // TODO: Support TFE for typed and narrow loads. 5738 if (IsTyped) { 5739 if (IsTFE) 5740 return false; 5741 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 5742 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 5743 } else if (IsFormat) { 5744 if (IsD16) { 5745 if (IsTFE) 5746 return false; 5747 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16; 5748 } else { 5749 Opc = IsTFE ? 
AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE 5750 : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 5751 } 5752 } else { 5753 if (IsTFE) 5754 return false; 5755 switch (MemTy.getSizeInBits()) { 5756 case 8: 5757 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 5758 break; 5759 case 16: 5760 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 5761 break; 5762 default: 5763 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 5764 break; 5765 } 5766 } 5767 5768 if (IsTFE) { 5769 unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32); 5770 unsigned NumLoadDWords = NumValueDWords + 1; 5771 LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32); 5772 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy); 5773 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset, 5774 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B); 5775 if (NumValueDWords == 1) { 5776 B.buildUnmerge({Dst, StatusDst}, LoadDstReg); 5777 } else { 5778 SmallVector<Register, 5> LoadElts; 5779 for (unsigned I = 0; I != NumValueDWords; ++I) 5780 LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32)); 5781 LoadElts.push_back(StatusDst); 5782 B.buildUnmerge(LoadElts, LoadDstReg); 5783 LoadElts.truncate(NumValueDWords); 5784 B.buildMergeLikeInstr(Dst, LoadElts); 5785 } 5786 } else if ((!IsD16 && MemTy.getSizeInBits() < 32) || 5787 (IsD16 && !Ty.isVector())) { 5788 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 5789 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset, 5790 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B); 5791 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 5792 B.buildTrunc(Dst, LoadDstReg); 5793 } else if (Unpacked && IsD16 && Ty.isVector()) { 5794 LLT UnpackedTy = Ty.changeElementSize(32); 5795 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 5796 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset, 5797 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B); 5798 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 5799 // FIXME: G_TRUNC should work, but legalization currently fails 5800 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 5801 SmallVector<Register, 4> Repack; 5802 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 5803 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 5804 B.buildMergeLikeInstr(Dst, Repack); 5805 } else { 5806 buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format, 5807 AuxiliaryData, MMO, IsTyped, HasVIndex, B); 5808 } 5809 5810 MI.eraseFromParent(); 5811 return true; 5812 } 5813 5814 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 5815 switch (IntrID) { 5816 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 5817 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap: 5818 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 5819 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap: 5820 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 5821 case Intrinsic::amdgcn_raw_buffer_atomic_add: 5822 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add: 5823 case Intrinsic::amdgcn_struct_buffer_atomic_add: 5824 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add: 5825 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 5826 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 5827 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub: 5828 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 5829 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub: 5830 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 5831 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 5832 case 
Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin: 5833 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 5834 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin: 5835 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 5836 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 5837 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin: 5838 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 5839 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin: 5840 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 5841 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 5842 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax: 5843 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 5844 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax: 5845 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 5846 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 5847 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax: 5848 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 5849 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax: 5850 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 5851 case Intrinsic::amdgcn_raw_buffer_atomic_and: 5852 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and: 5853 case Intrinsic::amdgcn_struct_buffer_atomic_and: 5854 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and: 5855 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 5856 case Intrinsic::amdgcn_raw_buffer_atomic_or: 5857 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or: 5858 case Intrinsic::amdgcn_struct_buffer_atomic_or: 5859 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or: 5860 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 5861 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 5862 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor: 5863 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 5864 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor: 5865 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 5866 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 5867 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc: 5868 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 5869 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc: 5870 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 5871 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 5872 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec: 5873 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 5874 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec: 5875 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 5876 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 5877 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: 5878 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 5879 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: 5880 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 5881 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 5882 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd: 5883 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 5884 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd: 5885 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 5886 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 5887 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin: 5888 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 5889 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin: 5890 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN; 5891 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 5892 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax: 5893 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 5894 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax: 5895 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; 5896 default: 5897 llvm_unreachable("unhandled atomic opcode"); 5898 } 5899 } 5900 5901 bool 
AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
                                          MachineIRBuilder &B,
                                          Intrinsic::ID IID) const {
  const bool IsCmpSwap =
      IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
      IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
      IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
      IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;

  Register Dst = MI.getOperand(0).getReg();
  // Since we don't have 128-bit atomics, we don't need to handle the case of
  // p8 arguments to the atomic itself.
  Register VData = MI.getOperand(2).getReg();

  Register CmpVal;
  int OpOffset = 0;

  if (IsCmpSwap) {
    CmpVal = MI.getOperand(3).getReg();
    ++OpOffset;
  }

  castBufferRsrcArgToV4I32(MI, B, 3 + OpOffset);
  Register RSrc = MI.getOperand(3 + OpOffset).getReg();
  const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  if (HasVIndex) {
    VIndex = MI.getOperand(4 + OpOffset).getReg();
    ++OpOffset;
  } else {
    VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
  }

  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
  Register SOffset = MI.getOperand(5 + OpOffset).getReg();
  unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();

  MachineMemOperand *MMO = *MI.memoperands_begin();

  unsigned ImmOffset;
  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);

  auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
                 .addDef(Dst)
                 .addUse(VData); // vdata

  if (IsCmpSwap)
    MIB.addReg(CmpVal);

  MIB.addUse(RSrc)                // rsrc
      .addUse(VIndex)             // vindex
      .addUse(VOffset)            // voffset
      .addUse(SOffset)            // soffset
      .addImm(ImmOffset)          // offset(imm)
      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
      .addMemOperand(MMO);

  MI.eraseFromParent();
  return true;
}

/// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
/// vector with s16 typed elements.
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
                                      SmallVectorImpl<Register> &PackedAddrs,
                                      unsigned ArgOffset,
                                      const AMDGPU::ImageDimIntrinsicInfo *Intr,
                                      bool IsA16, bool IsG16) {
  const LLT S16 = LLT::scalar(16);
  const LLT V2S16 = LLT::fixed_vector(2, 16);
  auto EndIdx = Intr->VAddrEnd;

  for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
    MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
    if (!SrcOp.isReg())
      continue; // _L to _LZ may have eliminated this.

    Register AddrReg = SrcOp.getReg();

    if ((I < Intr->GradientStart) ||
        (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
        (I >= Intr->CoordStart && !IsA16)) {
      if ((I < Intr->GradientStart) && IsA16 &&
          (B.getMRI()->getType(AddrReg) == S16)) {
        assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
        // Special handling of bias when A16 is on. Bias is of type half but
        // occupies a full 32-bit dword.
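        // Illustrative: a 16-bit bias %bias ends up as
        //   %slot:_(<2 x s16>) = G_BUILD_VECTOR %bias:_(s16), %undef:_(s16)
        // so it still consumes a full dword vaddr slot, as built just below.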
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
                .getReg(0));
      } else {
        assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
               "Bias needs to be converted to 16 bit in A16 mode");
        // Handle any gradient or coordinate operands that should not be packed.
        AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
        PackedAddrs.push_back(AddrReg);
      }
    } else {
      // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in
      // 1D, derivatives dx/dh and dx/dv are packed with undef.
      if (((I + 1) >= EndIdx) ||
          ((Intr->NumGradients / 2) % 2 == 1 &&
           (I == static_cast<unsigned>(Intr->GradientStart +
                                       (Intr->NumGradients / 2) - 1) ||
            I == static_cast<unsigned>(Intr->GradientStart +
                                       Intr->NumGradients - 1))) ||
          // Check for _L to _LZ optimization
          !MI.getOperand(ArgOffset + I + 1).isReg()) {
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
                .getReg(0));
      } else {
        PackedAddrs.push_back(
            B.buildBuildVector(
                 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
                .getReg(0));
        ++I;
      }
    }
  }
}

/// Convert from separate vaddr components to a single vector address register,
/// and replace the remaining operands with $noreg.
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
                                     int DimIdx, int NumVAddrs) {
  const LLT S32 = LLT::scalar(32);
  (void)S32;
  SmallVector<Register, 8> AddrRegs;
  for (int I = 0; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg()) {
      AddrRegs.push_back(SrcOp.getReg());
      assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
    }
  }

  int NumAddrRegs = AddrRegs.size();
  if (NumAddrRegs != 1) {
    auto VAddr =
        B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
    MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
  }

  for (int I = 1; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg())
      MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
  }
}

/// Rewrite image intrinsics to use register layouts expected by the subtarget.
///
/// Depending on the subtarget, loads and stores with 16-bit element data need
/// to be rewritten to use the low half of 32-bit registers, or to directly use
/// a packed layout. 16-bit addresses should also sometimes be packed into
/// 32-bit registers.
///
/// We don't want to directly select image instructions just yet, but we also
/// want to expose all register repacking to the legalizer/combiners. We also
/// don't want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires
/// padding now-unnecessary arguments with $noreg.
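///
/// As a purely illustrative example (operand counts depend on the dimension
/// info), a 2D a16 sample with coordinates %u:_(s16) and %v:_(s16) has them
/// repacked as
///   %uv:_(<2 x s16>) = G_BUILD_VECTOR %u, %v
/// and the vaddr operand that previously held %v is replaced with $noreg.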
6069 bool AMDGPULegalizerInfo::legalizeImageIntrinsic( 6070 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, 6071 const AMDGPU::ImageDimIntrinsicInfo *Intr) const { 6072 6073 const MachineFunction &MF = *MI.getMF(); 6074 const unsigned NumDefs = MI.getNumExplicitDefs(); 6075 const unsigned ArgOffset = NumDefs + 1; 6076 bool IsTFE = NumDefs == 2; 6077 // We are only processing the operands of d16 image operations on subtargets 6078 // that use the unpacked register layout, or need to repack the TFE result. 6079 6080 // TODO: Do we need to guard against already legalized intrinsics? 6081 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 6082 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); 6083 6084 MachineRegisterInfo *MRI = B.getMRI(); 6085 const LLT S32 = LLT::scalar(32); 6086 const LLT S16 = LLT::scalar(16); 6087 const LLT V2S16 = LLT::fixed_vector(2, 16); 6088 6089 unsigned DMask = 0; 6090 Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg(); 6091 LLT Ty = MRI->getType(VData); 6092 6093 // Check for 16 bit addresses and pack if true. 6094 LLT GradTy = 6095 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); 6096 LLT AddrTy = 6097 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); 6098 const bool IsG16 = 6099 ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16; 6100 const bool IsA16 = AddrTy == S16; 6101 const bool IsD16 = Ty.getScalarType() == S16; 6102 6103 int DMaskLanes = 0; 6104 if (!BaseOpcode->Atomic) { 6105 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); 6106 if (BaseOpcode->Gather4) { 6107 DMaskLanes = 4; 6108 } else if (DMask != 0) { 6109 DMaskLanes = llvm::popcount(DMask); 6110 } else if (!IsTFE && !BaseOpcode->Store) { 6111 // If dmask is 0, this is a no-op load. This can be eliminated. 6112 B.buildUndef(MI.getOperand(0)); 6113 MI.eraseFromParent(); 6114 return true; 6115 } 6116 } 6117 6118 Observer.changingInstr(MI); 6119 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 6120 6121 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16 6122 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE; 6123 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 6124 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 6125 unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode; 6126 6127 // Track that we legalized this 6128 MI.setDesc(B.getTII().get(NewOpcode)); 6129 6130 // Expecting to get an error flag since TFC is on - and dmask is 0 Force 6131 // dmask to be at least 1 otherwise the instruction will fail 6132 if (IsTFE && DMask == 0) { 6133 DMask = 0x1; 6134 DMaskLanes = 1; 6135 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask); 6136 } 6137 6138 if (BaseOpcode->Atomic) { 6139 Register VData0 = MI.getOperand(2).getReg(); 6140 LLT Ty = MRI->getType(VData0); 6141 6142 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 6143 if (Ty.isVector()) 6144 return false; 6145 6146 if (BaseOpcode->AtomicX2) { 6147 Register VData1 = MI.getOperand(3).getReg(); 6148 // The two values are packed in one register. 6149 LLT PackedTy = LLT::fixed_vector(2, Ty); 6150 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 6151 MI.getOperand(2).setReg(Concat.getReg(0)); 6152 MI.getOperand(3).setReg(AMDGPU::NoRegister); 6153 } 6154 } 6155 6156 unsigned CorrectedNumVAddrs = Intr->NumVAddrs; 6157 6158 // Rewrite the addressing register layout before doing anything else. 
6159 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) { 6160 // 16 bit gradients are supported, but are tied to the A16 control 6161 // so both gradients and addresses must be 16 bit 6162 return false; 6163 } 6164 6165 if (IsA16 && !ST.hasA16()) { 6166 // A16 not supported 6167 return false; 6168 } 6169 6170 const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->Sampler); 6171 const unsigned HasPartialNSA = ST.hasPartialNSAEncoding(); 6172 6173 if (IsA16 || IsG16) { 6174 // Even if NumVAddrs == 1 we should pack it into a 32-bit value, because the 6175 // instructions expect VGPR_32 6176 SmallVector<Register, 4> PackedRegs; 6177 6178 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, IsG16); 6179 6180 // See also below in the non-a16 branch 6181 const bool UseNSA = ST.hasNSAEncoding() && 6182 PackedRegs.size() >= ST.getNSAThreshold(MF) && 6183 (PackedRegs.size() <= NSAMaxSize || HasPartialNSA); 6184 const bool UsePartialNSA = 6185 UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize; 6186 6187 if (UsePartialNSA) { 6188 // Pack registers that would go over NSAMaxSize into last VAddr register 6189 LLT PackedAddrTy = 6190 LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16); 6191 auto Concat = B.buildConcatVectors( 6192 PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1)); 6193 PackedRegs[NSAMaxSize - 1] = Concat.getReg(0); 6194 PackedRegs.resize(NSAMaxSize); 6195 } else if (!UseNSA && PackedRegs.size() > 1) { 6196 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16); 6197 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 6198 PackedRegs[0] = Concat.getReg(0); 6199 PackedRegs.resize(1); 6200 } 6201 6202 const unsigned NumPacked = PackedRegs.size(); 6203 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { 6204 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 6205 if (!SrcOp.isReg()) { 6206 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 6207 continue; 6208 } 6209 6210 assert(SrcOp.getReg() != AMDGPU::NoRegister); 6211 6212 if (I - Intr->VAddrStart < NumPacked) 6213 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); 6214 else 6215 SrcOp.setReg(AMDGPU::NoRegister); 6216 } 6217 } else { 6218 // If the register allocator cannot place the address registers contiguously 6219 // without introducing moves, then using the non-sequential address encoding 6220 // is always preferable, since it saves VALU instructions and is usually a 6221 // wash in terms of code size or even better. 6222 // 6223 // However, we currently have no way of hinting to the register allocator 6224 // that MIMG addresses should be placed contiguously when it is possible to 6225 // do so, so force non-NSA for the common 2-address case as a heuristic. 6226 // 6227 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 6228 // allocation when possible. 6229 // 6230 // Partial NSA is allowed on GFX11+ where the final register is a contiguous 6231 // set of the remaining addresses. 
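    //
    // Worked example (NSAMaxSize is target-dependent; 5 is illustrative):
    // with 7 address operands and NSAMaxSize == 5, operands 0..3 stay as
    // separate registers and operands 4..6 are merged into one contiguous
    // register, giving 5 vaddr operands in total.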
6232 const bool UseNSA = ST.hasNSAEncoding() && 6233 CorrectedNumVAddrs >= ST.getNSAThreshold(MF) && 6234 (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA); 6235 const bool UsePartialNSA = 6236 UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize; 6237 6238 if (UsePartialNSA) { 6239 convertImageAddrToPacked(B, MI, 6240 ArgOffset + Intr->VAddrStart + NSAMaxSize - 1, 6241 Intr->NumVAddrs - NSAMaxSize + 1); 6242 } else if (!UseNSA && Intr->NumVAddrs > 1) { 6243 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart, 6244 Intr->NumVAddrs); 6245 } 6246 } 6247 6248 int Flags = 0; 6249 if (IsA16) 6250 Flags |= 1; 6251 if (IsG16) 6252 Flags |= 2; 6253 MI.addOperand(MachineOperand::CreateImm(Flags)); 6254 6255 if (BaseOpcode->Store) { // No TFE for stores? 6256 // TODO: Handle dmask trim 6257 if (!Ty.isVector() || !IsD16) 6258 return true; 6259 6260 Register RepackedReg = handleD16VData(B, *MRI, VData, true); 6261 if (RepackedReg != VData) { 6262 MI.getOperand(1).setReg(RepackedReg); 6263 } 6264 6265 return true; 6266 } 6267 6268 Register DstReg = MI.getOperand(0).getReg(); 6269 const LLT EltTy = Ty.getScalarType(); 6270 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 6271 6272 // Confirm that the return type is large enough for the dmask specified 6273 if (NumElts < DMaskLanes) 6274 return false; 6275 6276 if (NumElts > 4 || DMaskLanes > 4) 6277 return false; 6278 6279 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; 6280 const LLT AdjustedTy = 6281 Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts)); 6282 6283 // The raw dword aligned data component of the load. The only legal cases 6284 // where this matters should be when using the packed D16 format, for 6285 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>, 6286 LLT RoundedTy; 6287 6288 // S32 vector to cover all data, plus TFE result element. 6289 LLT TFETy; 6290 6291 // Register type to use for each loaded component. Will be S32 or V2S16. 6292 LLT RegTy; 6293 6294 if (IsD16 && ST.hasUnpackedD16VMem()) { 6295 RoundedTy = 6296 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32); 6297 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32); 6298 RegTy = S32; 6299 } else { 6300 unsigned EltSize = EltTy.getSizeInBits(); 6301 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; 6302 unsigned RoundedSize = 32 * RoundedElts; 6303 RoundedTy = LLT::scalarOrVector( 6304 ElementCount::getFixed(RoundedSize / EltSize), EltSize); 6305 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32); 6306 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; 6307 } 6308 6309 // The return type does not need adjustment. 6310 // TODO: Should we change s16 case to s32 or <2 x s16>? 6311 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) 6312 return true; 6313 6314 Register Dst1Reg; 6315 6316 // Insert after the instruction. 6317 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 6318 6319 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x 6320 // s16> instead of s32, we would only need 1 bitcast instead of multiple. 6321 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy; 6322 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; 6323 6324 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); 6325 6326 MI.getOperand(0).setReg(NewResultReg); 6327 6328 // In the IR, TFE is supposed to be used with a 2 element struct return 6329 // type. 
The instruction really returns these two values in one contiguous 6330 // register, with one additional dword beyond the loaded data. Rewrite the 6331 // return type to use a single register result. 6332 6333 if (IsTFE) { 6334 Dst1Reg = MI.getOperand(1).getReg(); 6335 if (MRI->getType(Dst1Reg) != S32) 6336 return false; 6337 6338 // TODO: Make sure the TFE operand bit is set. 6339 MI.removeOperand(1); 6340 6341 // Handle the easy case that requires no repack instructions. 6342 if (Ty == S32) { 6343 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); 6344 return true; 6345 } 6346 } 6347 6348 // Now figure out how to copy the new result register back into the old 6349 // result. 6350 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 6351 6352 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs; 6353 6354 if (ResultNumRegs == 1) { 6355 assert(!IsTFE); 6356 ResultRegs[0] = NewResultReg; 6357 } else { 6358 // We have to repack into a new vector of some kind. 6359 for (int I = 0; I != NumDataRegs; ++I) 6360 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 6361 B.buildUnmerge(ResultRegs, NewResultReg); 6362 6363 // Drop the final TFE element to get the data part. The TFE result is 6364 // directly written to the right place already. 6365 if (IsTFE) 6366 ResultRegs.resize(NumDataRegs); 6367 } 6368 6369 // For an s16 scalar result, we form an s32 result with a truncate regardless 6370 // of packed vs. unpacked. 6371 if (IsD16 && !Ty.isVector()) { 6372 B.buildTrunc(DstReg, ResultRegs[0]); 6373 return true; 6374 } 6375 6376 // Avoid a build/concat_vector of 1 entry. 6377 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 6378 B.buildBitcast(DstReg, ResultRegs[0]); 6379 return true; 6380 } 6381 6382 assert(Ty.isVector()); 6383 6384 if (IsD16) { 6385 // For packed D16 results with TFE enabled, all the data components are 6386 // S32. Cast back to the expected type. 6387 // 6388 // TODO: We don't really need to use load s32 elements. We would only need one 6389 // cast for the TFE result if a multiple of v2s16 was used. 6390 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 6391 for (Register &Reg : ResultRegs) 6392 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 6393 } else if (ST.hasUnpackedD16VMem()) { 6394 for (Register &Reg : ResultRegs) 6395 Reg = B.buildTrunc(S16, Reg).getReg(0); 6396 } 6397 } 6398 6399 auto padWithUndef = [&](LLT Ty, int NumElts) { 6400 if (NumElts == 0) 6401 return; 6402 Register Undef = B.buildUndef(Ty).getReg(0); 6403 for (int I = 0; I != NumElts; ++I) 6404 ResultRegs.push_back(Undef); 6405 }; 6406 6407 // Pad out any elements eliminated due to the dmask. 6408 LLT ResTy = MRI->getType(ResultRegs[0]); 6409 if (!ResTy.isVector()) { 6410 padWithUndef(ResTy, NumElts - ResultRegs.size()); 6411 B.buildBuildVector(DstReg, ResultRegs); 6412 return true; 6413 } 6414 6415 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 6416 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 6417 6418 // Deal with the one annoying legal case. 
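  // For instance (illustrative), when the destination is <3 x s16> and the
  // repacked result covers <4 x s16>, the extra trailing element is dropped
  // with buildDeleteTrailingVectorElements; if the repacked result is narrower
  // (e.g. a single <2 x s16> piece in the TFE path), it is padded with undef
  // elements instead.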
6419 const LLT V3S16 = LLT::fixed_vector(3, 16); 6420 if (Ty == V3S16) { 6421 if (IsTFE) { 6422 if (ResultRegs.size() == 1) { 6423 NewResultReg = ResultRegs[0]; 6424 } else if (ResultRegs.size() == 2) { 6425 LLT V4S16 = LLT::fixed_vector(4, 16); 6426 NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0); 6427 } else { 6428 return false; 6429 } 6430 } 6431 6432 if (MRI->getType(DstReg).getNumElements() < 6433 MRI->getType(NewResultReg).getNumElements()) { 6434 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg); 6435 } else { 6436 B.buildPadVectorWithUndefElements(DstReg, NewResultReg); 6437 } 6438 return true; 6439 } 6440 6441 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 6442 B.buildConcatVectors(DstReg, ResultRegs); 6443 return true; 6444 } 6445 6446 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 6447 LegalizerHelper &Helper, MachineInstr &MI) const { 6448 MachineIRBuilder &B = Helper.MIRBuilder; 6449 GISelChangeObserver &Observer = Helper.Observer; 6450 6451 Register Dst = MI.getOperand(0).getReg(); 6452 LLT Ty = B.getMRI()->getType(Dst); 6453 unsigned Size = Ty.getSizeInBits(); 6454 MachineFunction &MF = B.getMF(); 6455 6456 Observer.changingInstr(MI); 6457 6458 // Handle needing to s.buffer.load() a p8 value. 6459 if (hasBufferRsrcWorkaround(Ty)) { 6460 Ty = castBufferRsrcFromV4I32(MI, B, *B.getMRI(), 0); 6461 B.setInsertPt(B.getMBB(), MI); 6462 } 6463 if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) { 6464 Ty = getBitcastRegisterType(Ty); 6465 Helper.bitcastDst(MI, Ty, 0); 6466 B.setInsertPt(B.getMBB(), MI); 6467 } 6468 6469 // FIXME: We don't really need this intermediate instruction. The intrinsic 6470 // should be fixed to have a memory operand. Since it's readnone, we're not 6471 // allowed to add one. 6472 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 6473 MI.removeOperand(1); // Remove intrinsic ID 6474 6475 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 6476 // TODO: Should this use datalayout alignment? 6477 const unsigned MemSize = (Size + 7) / 8; 6478 const Align MemAlign(4); 6479 MachineMemOperand *MMO = MF.getMachineMemOperand( 6480 MachinePointerInfo(), 6481 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 6482 MachineMemOperand::MOInvariant, 6483 MemSize, MemAlign); 6484 MI.addMemOperand(MF, MMO); 6485 6486 // If we don't have 96-bit result scalar loads, widening to 128-bit should 6487 // always be legal. We may need to restore this to a 96-bit result if it turns 6488 // out this needs to be converted to a vector load during RegBankSelect. 6489 if (!isPowerOf2_32(Size) && (Size != 96 || !ST.hasScalarDwordx3Loads())) { 6490 if (Ty.isVector()) 6491 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 6492 else 6493 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 6494 } 6495 6496 Observer.changedInstr(MI); 6497 return true; 6498 } 6499 6500 // TODO: Move to selection 6501 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 6502 MachineRegisterInfo &MRI, 6503 MachineIRBuilder &B) const { 6504 if (!ST.isTrapHandlerEnabled() || 6505 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) 6506 return legalizeTrapEndpgm(MI, MRI, B); 6507 6508 return ST.supportsGetDoorbellID() ? 
bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock &BB = B.getMBB();
  MachineFunction *MF = BB.getParent();

  if (BB.succ_empty() && std::next(MI.getIterator()) == BB.end()) {
    BuildMI(BB, BB.end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
        .addImm(0);
    MI.eraseFromParent();
    return true;
  }

  // We need a block split to make the real endpgm a terminator. We also don't
  // want to break phis in successor blocks, so we can't just delete to the
  // end of the block.
  BB.splitAt(MI, false /*UpdateLiveIns*/);
  MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
  MF->push_back(TrapBB);
  BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
      .addImm(0);
  BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))
      .addMBB(TrapBB);

  BB.addSuccessor(TrapBB);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const LLT S64 = LLT::scalar(64);

  Register SGPR01(AMDGPU::SGPR0_SGPR1);
  // For code object version 5, queue_ptr is passed through implicit kernarg.
  if (AMDGPU::getCodeObjectVersion(*MF.getFunction().getParent()) >=
      AMDGPU::AMDHSA_COV5) {
    AMDGPUTargetLowering::ImplicitParameter Param =
        AMDGPUTargetLowering::QUEUE_PTR;
    uint64_t Offset =
        ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);

    Register KernargPtrReg = MRI.createGenericVirtualRegister(
        LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

    if (!loadInputValue(KernargPtrReg, B,
                        AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
      return false;

    // TODO: can we be smarter about machine pointer info?
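    // Load the 64-bit queue pointer from the implicit kernarg segment at the
    // computed offset and hand it to the trap handler in SGPR0_SGPR1.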
    MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
    MachineMemOperand *MMO = MF.getMachineMemOperand(
        PtrInfo,
        MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
            MachineMemOperand::MOInvariant,
        LLT::scalar(64), commonAlignment(Align(64), Offset));

    // Address of the queue_ptr implicit kernarg.
    Register LoadAddr = MRI.createGenericVirtualRegister(
        LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
    B.buildPtrAdd(LoadAddr, KernargPtrReg,
                  B.buildConstant(LLT::scalar(64), Offset).getReg(0));
    // Load the queue pointer from that address.
    Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
    B.buildCopy(SGPR01, Temp);
    B.buildInstr(AMDGPU::S_TRAP)
        .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
        .addReg(SGPR01, RegState::Implicit);
    MI.eraseFromParent();
    return true;
  }

  // Pass queue pointer to trap handler as input, and insert trap instruction
  // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
  Register LiveIn = MRI.createGenericVirtualRegister(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
  if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
    return false;

  B.buildCopy(SGPR01, LiveIn);
  B.buildInstr(AMDGPU::S_TRAP)
      .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
      .addReg(SGPR01, RegState::Implicit);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeTrapHsa(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  B.buildInstr(AMDGPU::S_TRAP)
      .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // If this is not the HSA path or the trap handler is disabled, just report
  // a warning instead of emitting a trap.
  if (!ST.isTrapHandlerEnabled() ||
      ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
    DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
                                     "debugtrap handler not supported",
                                     MI.getDebugLoc(), DS_Warning);
    LLVMContext &Ctx = B.getMF().getFunction().getContext();
    Ctx.diagnose(NoTrap);
  } else {
    // Insert debug-trap instruction
    B.buildInstr(AMDGPU::S_TRAP)
        .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
                                               MachineIRBuilder &B) const {
  MachineRegisterInfo &MRI = *B.getMRI();
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT V2S16 = LLT::fixed_vector(2, 16);
  const LLT V3S32 = LLT::fixed_vector(3, 32);

  Register DstReg = MI.getOperand(0).getReg();
  Register NodePtr = MI.getOperand(2).getReg();
  Register RayExtent = MI.getOperand(3).getReg();
  Register RayOrigin = MI.getOperand(4).getReg();
  Register RayDir = MI.getOperand(5).getReg();
  Register RayInvDir = MI.getOperand(6).getReg();
  Register TDescr = MI.getOperand(7).getReg();

  if (!ST.hasGFX10_AEncoding()) {
    DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
                                        "intrinsic not supported on subtarget",
                                        MI.getDebugLoc());
    B.getMF().getFunction().getContext().diagnose(BadIntrin);
    return false;
  }

  const bool IsGFX11 = AMDGPU::isGFX11(ST);
  const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
  const bool IsGFX12Plus = AMDGPU::isGFX12Plus(ST);
  const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
  const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
  const unsigned NumVDataDwords = 4;
  const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
  const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
  const bool UseNSA =
      IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());

  const unsigned BaseOpcodes[2][2] = {
      {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
      {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
       AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
  int Opcode;
  if (UseNSA) {
    Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
                                   IsGFX12Plus ? AMDGPU::MIMGEncGfx12
                                   : IsGFX11   ? AMDGPU::MIMGEncGfx11NSA
                                               : AMDGPU::MIMGEncGfx10NSA,
                                   NumVDataDwords, NumVAddrDwords);
  } else {
    assert(!IsGFX12Plus);
    Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
                                   IsGFX11 ? AMDGPU::MIMGEncGfx11Default
                                           : AMDGPU::MIMGEncGfx10Default,
                                   NumVDataDwords, NumVAddrDwords);
  }
  assert(Opcode != -1);

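  // Collect the VADDR operands. On GFX11+ NSA encodings the ray origin,
  // direction and inverse direction are each packed into a v3s32; with A16
  // the half-precision direction and inverse-direction components are first
  // interleaved into v2s16 pairs. Older encodings push individual dwords.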
  SmallVector<Register, 12> Ops;
  if (UseNSA && IsGFX11Plus) {
    auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
      auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
      auto Merged = B.buildMergeLikeInstr(
          V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
      Ops.push_back(Merged.getReg(0));
    };

    Ops.push_back(NodePtr);
    Ops.push_back(RayExtent);
    packLanes(RayOrigin);

    if (IsA16) {
      auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
      auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
      auto MergedDir = B.buildMergeLikeInstr(
          V3S32,
          {B.buildBitcast(
                S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0),
                                                   UnmergeRayDir.getReg(0)}))
               .getReg(0),
           B.buildBitcast(
                S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1),
                                                   UnmergeRayDir.getReg(1)}))
               .getReg(0),
           B.buildBitcast(
                S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2),
                                                   UnmergeRayDir.getReg(2)}))
               .getReg(0)});
      Ops.push_back(MergedDir.getReg(0));
    } else {
      packLanes(RayDir);
      packLanes(RayInvDir);
    }
  } else {
    if (Is64) {
      auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
      Ops.push_back(Unmerge.getReg(0));
      Ops.push_back(Unmerge.getReg(1));
    } else {
      Ops.push_back(NodePtr);
    }
    Ops.push_back(RayExtent);

    auto packLanes = [&Ops, &S32, &B](Register Src) {
      auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
      Ops.push_back(Unmerge.getReg(0));
      Ops.push_back(Unmerge.getReg(1));
      Ops.push_back(Unmerge.getReg(2));
    };

    packLanes(RayOrigin);
    if (IsA16) {
      auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
      auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
      Register R1 = MRI.createGenericVirtualRegister(S32);
      Register R2 = MRI.createGenericVirtualRegister(S32);
      Register R3 = MRI.createGenericVirtualRegister(S32);
      B.buildMergeLikeInstr(R1,
                            {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
      B.buildMergeLikeInstr(
          R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
      B.buildMergeLikeInstr(
          R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
      Ops.push_back(R1);
      Ops.push_back(R2);
      Ops.push_back(R3);
    } else {
      packLanes(RayDir);
      packLanes(RayInvDir);
    }
  }

  if (!UseNSA) {
    // Build a single vector containing all the operands so far prepared.
    LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
    Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0);
    Ops.clear();
    Ops.push_back(MergedOps);
  }

  auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
                 .addDef(DstReg)
                 .addImm(Opcode);

  for (Register R : Ops) {
    MIB.addUse(R);
  }

  MIB.addUse(TDescr)
      .addImm(IsA16 ? 1 : 0)
      .cloneMemRefs(MI);

  MI.eraseFromParent();
  return true;
}

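// Only the statically rounded fptrunc forms are handled here: rounding toward
// +infinity and toward -infinity map onto the UPWARD/DOWNWARD pseudos; any
// other rounding mode is rejected.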
bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI,
                                               MachineIRBuilder &B) const {
  unsigned Opc;
  int RoundMode = MI.getOperand(2).getImm();

  if (RoundMode == (int)RoundingMode::TowardPositive)
    Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD;
  else if (RoundMode == (int)RoundingMode::TowardNegative)
    Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD;
  else
    return false;

  B.buildInstr(Opc)
      .addDef(MI.getOperand(0).getReg())
      .addUse(MI.getOperand(1).getReg());

  MI.eraseFromParent();

  return true;
}

bool AMDGPULegalizerInfo::legalizeStackSave(MachineInstr &MI,
                                            MachineIRBuilder &B) const {
  const SITargetLowering *TLI = ST.getTargetLowering();
  Register StackPtr = TLI->getStackPointerRegisterToSaveRestore();
  Register DstReg = MI.getOperand(0).getReg();
  B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
                                            MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  // Replace the G_BRCOND use with the exec-manipulating branch pseudos.
  auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI =
          static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
            .addDef(Def)
            .addUse(Use)
            .addMBB(UncondBrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
            .addDef(Def)
            .addUse(Use)
            .addMBB(UncondBrTarget);
      }

      if (Br) {
        Br->getOperand(0).setMBB(CondBrTarget);
      } else {
        // The IRTranslator skips inserting the G_BR for fallthrough cases, but
        // since we're swapping branch targets it needs to be reinserted.
        // FIXME: IRTranslator should probably not do this
        B.buildBr(*CondBrTarget);
      }

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI =
          static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      Register Reg = MI.getOperand(2).getReg();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      B.buildInstr(AMDGPU::SI_LOOP)
          .addUse(Reg)
          .addMBB(UncondBrTarget);

      if (Br)
        Br->getOperand(0).setMBB(CondBrTarget);
      else
        B.buildBr(*CondBrTarget);

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_make_buffer_rsrc:
    return legalizePointerAsRsrcIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
        MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_lds_kernel_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
        MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::r600_read_ngroups_x:
    // TODO: Emit error for hsa
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_X);
  case Intrinsic::r600_read_ngroups_y:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_Y);
  case Intrinsic::r600_read_ngroups_z:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_Z);
  case Intrinsic::r600_read_local_size_x:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::LOCAL_SIZE_X);
  case Intrinsic::r600_read_local_size_y:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::LOCAL_SIZE_Y);
  case Intrinsic::r600_read_local_size_z:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::LOCAL_SIZE_Z);
  case Intrinsic::r600_read_global_size_x:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::GLOBAL_SIZE_X);
  case Intrinsic::r600_read_global_size_y:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::GLOBAL_SIZE_Y);
  case Intrinsic::r600_read_global_size_z:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::GLOBAL_SIZE_Z);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(Helper, MI);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_raw_ptr_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
  case Intrinsic::amdgcn_struct_ptr_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_rsq_clamp:
    return legalizeRsqClampIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
    return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
    return legalizeBVHIntrinsic(MI, B);
  case Intrinsic::amdgcn_fmed3: {
    GISelChangeObserver &Observer = Helper.Observer;

    // FIXME: This is to work around the inability of tablegen match combiners
    // to match intrinsics in patterns.
    Observer.changingInstr(MI);
    MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
    MI.removeOperand(1);
    Observer.changedInstr(MI);
    return true;
  }
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}