1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/ADT/ScopeExit.h"
23 #include "llvm/BinaryFormat/ELF.h"
24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/CodeGen/GlobalISel/Utils.h"
28 #include "llvm/IR/DiagnosticInfo.h"
29 #include "llvm/IR/IntrinsicsAMDGPU.h"
30 #include "llvm/IR/IntrinsicsR600.h"
31 
32 #define DEBUG_TYPE "amdgpu-legalinfo"
33 
34 using namespace llvm;
35 using namespace LegalizeActions;
36 using namespace LegalizeMutations;
37 using namespace LegalityPredicates;
38 using namespace MIPatternMatch;
39 
40 // Hack until load/store selection patterns support any tuple of legal types.
41 static cl::opt<bool> EnableNewLegality(
42 "amdgpu-global-isel-new-legality",
43 cl::desc("Use GlobalISel desired legality, rather than try to use "
44 "rules compatible with selection patterns"),
45 cl::init(false),
46 cl::ReallyHidden);
47 
48 static constexpr unsigned MaxRegisterSize = 1024;
49 
50 // Round the number of elements to the next power of two elements
51 static LLT getPow2VectorType(LLT Ty) {
52 unsigned NElts = Ty.getNumElements();
53 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
54 return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
55 }
56 
57 // Round the number of bits to the next power of two bits
58 static LLT getPow2ScalarType(LLT Ty) {
59 unsigned Bits = Ty.getSizeInBits();
60 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
61 return LLT::scalar(Pow2Bits);
62 }
63 
64 /// \returns true if this is an odd sized vector which should be widened by adding an
65 /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
66 /// excludes s1 vectors, which should always be scalarized.
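/// For example, <3 x s16> (48 bits) matches: odd element count, 16-bit elements,
/// and a total size that is not a multiple of 32, so it is widened to <4 x s16>.
/// A <3 x s32>, by contrast, is already a multiple of 32 bits and is left to other rules.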
67 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { 68 return [=](const LegalityQuery &Query) { 69 const LLT Ty = Query.Types[TypeIdx]; 70 if (!Ty.isVector()) 71 return false; 72 73 const LLT EltTy = Ty.getElementType(); 74 const unsigned EltSize = EltTy.getSizeInBits(); 75 return Ty.getNumElements() % 2 != 0 && 76 EltSize > 1 && EltSize < 32 && 77 Ty.getSizeInBits() % 32 != 0; 78 }; 79 } 80 81 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) { 82 return [=](const LegalityQuery &Query) { 83 const LLT Ty = Query.Types[TypeIdx]; 84 return Ty.getSizeInBits() % 32 == 0; 85 }; 86 } 87 88 static LegalityPredicate isWideVec16(unsigned TypeIdx) { 89 return [=](const LegalityQuery &Query) { 90 const LLT Ty = Query.Types[TypeIdx]; 91 const LLT EltTy = Ty.getScalarType(); 92 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2; 93 }; 94 } 95 96 static LegalizeMutation oneMoreElement(unsigned TypeIdx) { 97 return [=](const LegalityQuery &Query) { 98 const LLT Ty = Query.Types[TypeIdx]; 99 const LLT EltTy = Ty.getElementType(); 100 return std::pair(TypeIdx, 101 LLT::fixed_vector(Ty.getNumElements() + 1, EltTy)); 102 }; 103 } 104 105 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { 106 return [=](const LegalityQuery &Query) { 107 const LLT Ty = Query.Types[TypeIdx]; 108 const LLT EltTy = Ty.getElementType(); 109 unsigned Size = Ty.getSizeInBits(); 110 unsigned Pieces = (Size + 63) / 64; 111 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; 112 return std::pair(TypeIdx, LLT::scalarOrVector( 113 ElementCount::getFixed(NewNumElts), EltTy)); 114 }; 115 } 116 117 // Increase the number of vector elements to reach the next multiple of 32-bit 118 // type. 119 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { 120 return [=](const LegalityQuery &Query) { 121 const LLT Ty = Query.Types[TypeIdx]; 122 123 const LLT EltTy = Ty.getElementType(); 124 const int Size = Ty.getSizeInBits(); 125 const int EltSize = EltTy.getSizeInBits(); 126 const int NextMul32 = (Size + 31) / 32; 127 128 assert(EltSize < 32); 129 130 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; 131 return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy)); 132 }; 133 } 134 135 // Increase the number of vector elements to reach the next legal RegClass. 136 static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) { 137 return [=](const LegalityQuery &Query) { 138 const LLT Ty = Query.Types[TypeIdx]; 139 const unsigned NumElts = Ty.getNumElements(); 140 const unsigned EltSize = Ty.getElementType().getSizeInBits(); 141 const unsigned MaxNumElts = MaxRegisterSize / EltSize; 142 143 assert(EltSize == 32 || EltSize == 64); 144 assert(Ty.getSizeInBits() < MaxRegisterSize); 145 146 unsigned NewNumElts; 147 // Find the nearest legal RegClass that is larger than the current type. 
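// Starting from the current element count, add one element at a time until
// NumElts * EltSize reaches a bit width for which an SGPR class exists; the
// search is bounded by MaxNumElts (MaxRegisterSize / EltSize).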
148 for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) { 149 if (SIRegisterInfo::getSGPRClassForBitWidth(NewNumElts * EltSize)) 150 break; 151 } 152 153 return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltSize)); 154 }; 155 } 156 157 static LLT getBufferRsrcScalarType(const LLT Ty) { 158 if (!Ty.isVector()) 159 return LLT::scalar(128); 160 const ElementCount NumElems = Ty.getElementCount(); 161 return LLT::vector(NumElems, LLT::scalar(128)); 162 } 163 164 static LLT getBufferRsrcRegisterType(const LLT Ty) { 165 if (!Ty.isVector()) 166 return LLT::fixed_vector(4, LLT::scalar(32)); 167 const unsigned NumElems = Ty.getElementCount().getFixedValue(); 168 return LLT::fixed_vector(NumElems * 4, LLT::scalar(32)); 169 } 170 171 static LLT getBitcastRegisterType(const LLT Ty) { 172 const unsigned Size = Ty.getSizeInBits(); 173 174 if (Size <= 32) { 175 // <2 x s8> -> s16 176 // <4 x s8> -> s32 177 return LLT::scalar(Size); 178 } 179 180 return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32); 181 } 182 183 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) { 184 return [=](const LegalityQuery &Query) { 185 const LLT Ty = Query.Types[TypeIdx]; 186 return std::pair(TypeIdx, getBitcastRegisterType(Ty)); 187 }; 188 } 189 190 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) { 191 return [=](const LegalityQuery &Query) { 192 const LLT Ty = Query.Types[TypeIdx]; 193 unsigned Size = Ty.getSizeInBits(); 194 assert(Size % 32 == 0); 195 return std::pair( 196 TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32)); 197 }; 198 } 199 200 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { 201 return [=](const LegalityQuery &Query) { 202 const LLT QueryTy = Query.Types[TypeIdx]; 203 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; 204 }; 205 } 206 207 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { 208 return [=](const LegalityQuery &Query) { 209 const LLT QueryTy = Query.Types[TypeIdx]; 210 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; 211 }; 212 } 213 214 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { 215 return [=](const LegalityQuery &Query) { 216 const LLT QueryTy = Query.Types[TypeIdx]; 217 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0; 218 }; 219 } 220 221 static bool isRegisterSize(unsigned Size) { 222 return Size % 32 == 0 && Size <= MaxRegisterSize; 223 } 224 225 static bool isRegisterVectorElementType(LLT EltTy) { 226 const int EltSize = EltTy.getSizeInBits(); 227 return EltSize == 16 || EltSize % 32 == 0; 228 } 229 230 static bool isRegisterVectorType(LLT Ty) { 231 const int EltSize = Ty.getElementType().getSizeInBits(); 232 return EltSize == 32 || EltSize == 64 || 233 (EltSize == 16 && Ty.getNumElements() % 2 == 0) || 234 EltSize == 128 || EltSize == 256; 235 } 236 237 static bool isRegisterType(LLT Ty) { 238 if (!isRegisterSize(Ty.getSizeInBits())) 239 return false; 240 241 if (Ty.isVector()) 242 return isRegisterVectorType(Ty); 243 244 return true; 245 } 246 247 // Any combination of 32 or 64-bit elements up the maximum register size, and 248 // multiples of v2s16. 249 static LegalityPredicate isRegisterType(unsigned TypeIdx) { 250 return [=](const LegalityQuery &Query) { 251 return isRegisterType(Query.Types[TypeIdx]); 252 }; 253 } 254 255 // RegisterType that doesn't have a corresponding RegClass. 
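// Such a type passes isRegisterType (a multiple of 32 bits up to
// MaxRegisterSize), but no SGPR class covers its exact width, so it gets
// padded out via moreElementsToNextExistingRegClass below.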
256 static LegalityPredicate isIllegalRegisterType(unsigned TypeIdx) { 257 return [=](const LegalityQuery &Query) { 258 LLT Ty = Query.Types[TypeIdx]; 259 return isRegisterType(Ty) && 260 !SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits()); 261 }; 262 } 263 264 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) { 265 return [=](const LegalityQuery &Query) { 266 const LLT QueryTy = Query.Types[TypeIdx]; 267 if (!QueryTy.isVector()) 268 return false; 269 const LLT EltTy = QueryTy.getElementType(); 270 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32; 271 }; 272 } 273 274 // If we have a truncating store or an extending load with a data size larger 275 // than 32-bits, we need to reduce to a 32-bit type. 276 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) { 277 return [=](const LegalityQuery &Query) { 278 const LLT Ty = Query.Types[TypeIdx]; 279 return !Ty.isVector() && Ty.getSizeInBits() > 32 && 280 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits(); 281 }; 282 } 283 284 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we 285 // handle some operations by just promoting the register during 286 // selection. There are also d16 loads on GFX9+ which preserve the high bits. 287 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, 288 bool IsLoad, bool IsAtomic) { 289 switch (AS) { 290 case AMDGPUAS::PRIVATE_ADDRESS: 291 // FIXME: Private element size. 292 return ST.enableFlatScratch() ? 128 : 32; 293 case AMDGPUAS::LOCAL_ADDRESS: 294 return ST.useDS128() ? 128 : 64; 295 case AMDGPUAS::GLOBAL_ADDRESS: 296 case AMDGPUAS::CONSTANT_ADDRESS: 297 case AMDGPUAS::CONSTANT_ADDRESS_32BIT: 298 case AMDGPUAS::BUFFER_RESOURCE: 299 // Treat constant and global as identical. SMRD loads are sometimes usable for 300 // global loads (ideally constant address space should be eliminated) 301 // depending on the context. Legality cannot be context dependent, but 302 // RegBankSelect can split the load as necessary depending on the pointer 303 // register bank/uniformity and if the memory is invariant or not written in a 304 // kernel. 305 return IsLoad ? 512 : 128; 306 default: 307 // FIXME: Flat addresses may contextually need to be split to 32-bit parts 308 // if they may alias scratch depending on the subtarget. This needs to be 309 // moved to custom handling to use addressMayBeAccessedAsPrivate 310 return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32; 311 } 312 } 313 314 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, 315 const LegalityQuery &Query) { 316 const LLT Ty = Query.Types[0]; 317 318 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD 319 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE; 320 321 unsigned RegSize = Ty.getSizeInBits(); 322 uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 323 uint64_t AlignBits = Query.MMODescrs[0].AlignInBits; 324 unsigned AS = Query.Types[1].getAddressSpace(); 325 326 // All of these need to be custom lowered to cast the pointer operand. 327 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) 328 return false; 329 330 // Do not handle extending vector loads. 331 if (Ty.isVector() && MemSize != RegSize) 332 return false; 333 334 // TODO: We should be able to widen loads if the alignment is high enough, but 335 // we also need to modify the memory access size. 336 #if 0 337 // Accept widening loads based on alignment. 
338 if (IsLoad && MemSize < Size) 339 MemSize = std::max(MemSize, Align); 340 #endif 341 342 // Only 1-byte and 2-byte to 32-bit extloads are valid. 343 if (MemSize != RegSize && RegSize != 32) 344 return false; 345 346 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad, 347 Query.MMODescrs[0].Ordering != 348 AtomicOrdering::NotAtomic)) 349 return false; 350 351 switch (MemSize) { 352 case 8: 353 case 16: 354 case 32: 355 case 64: 356 case 128: 357 break; 358 case 96: 359 if (!ST.hasDwordx3LoadStores()) 360 return false; 361 break; 362 case 256: 363 case 512: 364 // These may contextually need to be broken down. 365 break; 366 default: 367 return false; 368 } 369 370 assert(RegSize >= MemSize); 371 372 if (AlignBits < MemSize) { 373 const SITargetLowering *TLI = ST.getTargetLowering(); 374 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, 375 Align(AlignBits / 8))) 376 return false; 377 } 378 379 return true; 380 } 381 382 // The newer buffer intrinsic forms take their resource arguments as 383 // pointers in address space 8, aka s128 values. However, in order to not break 384 // SelectionDAG, the underlying operations have to continue to take v4i32 385 // arguments. Therefore, we convert resource pointers - or vectors of them 386 // to integer values here. 387 static bool hasBufferRsrcWorkaround(const LLT Ty) { 388 if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE) 389 return true; 390 if (Ty.isVector()) { 391 const LLT ElemTy = Ty.getElementType(); 392 return hasBufferRsrcWorkaround(ElemTy); 393 } 394 return false; 395 } 396 397 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so 398 // workaround this. Eventually it should ignore the type for loads and only care 399 // about the size. Return true in cases where we will workaround this for now by 400 // bitcasting. 401 static bool loadStoreBitcastWorkaround(const LLT Ty) { 402 if (EnableNewLegality) 403 return false; 404 405 const unsigned Size = Ty.getSizeInBits(); 406 if (Size <= 64) 407 return false; 408 // Address space 8 pointers get their own workaround. 409 if (hasBufferRsrcWorkaround(Ty)) 410 return false; 411 if (!Ty.isVector()) 412 return true; 413 414 LLT EltTy = Ty.getElementType(); 415 if (EltTy.isPointer()) 416 return true; 417 418 unsigned EltSize = EltTy.getSizeInBits(); 419 return EltSize != 32 && EltSize != 64; 420 } 421 422 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) { 423 const LLT Ty = Query.Types[0]; 424 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) && 425 !hasBufferRsrcWorkaround(Ty) && !loadStoreBitcastWorkaround(Ty); 426 } 427 428 /// Return true if a load or store of the type should be lowered with a bitcast 429 /// to a different type. 430 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, 431 const LLT MemTy) { 432 const unsigned MemSizeInBits = MemTy.getSizeInBits(); 433 const unsigned Size = Ty.getSizeInBits(); 434 if (Size != MemSizeInBits) 435 return Size <= 32 && Ty.isVector(); 436 437 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty)) 438 return true; 439 440 // Don't try to handle bitcasting vector ext loads for now. 441 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) && 442 (Size <= 32 || isRegisterSize(Size)) && 443 !isRegisterVectorElementType(Ty.getElementType()); 444 } 445 446 /// Return true if we should legalize a load by widening an odd sized memory 447 /// access up to the alignment. 
Note that in this case it is the memory access itself
448 /// that changes, not the size of the result register.
449 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
450 uint64_t AlignInBits, unsigned AddrSpace,
451 unsigned Opcode) {
452 unsigned SizeInBits = MemoryTy.getSizeInBits();
453 // We don't want to widen cases that are naturally legal.
454 if (isPowerOf2_32(SizeInBits))
455 return false;
456 
457 // If we have 96-bit memory operations, we shouldn't touch them. Note we may
458 // end up widening these for a scalar load during RegBankSelect, since there
459 // aren't 96-bit scalar loads.
460 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
461 return false;
462 
463 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode, false))
464 return false;
465 
466 // A load is known dereferenceable up to the alignment, so it's legal to widen
467 // to it.
468 //
469 // TODO: Could check dereferenceable for less aligned cases.
470 unsigned RoundedSize = NextPowerOf2(SizeInBits);
471 if (AlignInBits < RoundedSize)
472 return false;
473 
474 // Do not widen if it would introduce a slow unaligned load.
475 const SITargetLowering *TLI = ST.getTargetLowering();
476 unsigned Fast = 0;
477 return TLI->allowsMisalignedMemoryAccessesImpl(
478 RoundedSize, AddrSpace, Align(AlignInBits / 8),
479 MachineMemOperand::MOLoad, &Fast) &&
480 Fast;
481 }
482 
483 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
484 unsigned Opcode) {
485 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
486 return false;
487 
488 return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
489 Query.MMODescrs[0].AlignInBits,
490 Query.Types[1].getAddressSpace(), Opcode);
491 }
492 
493 /// Mutates IR (typically a load instruction) to use a <4 x s32> as the initial
494 /// type of the operand `idx` and then to transform it to a `p8` via bitcasts
495 /// and inttoptr. In addition, handle vectors of p8. Returns the new type.
496 static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B,
497 MachineRegisterInfo &MRI, unsigned Idx) {
498 MachineOperand &MO = MI.getOperand(Idx);
499 
500 const LLT PointerTy = MRI.getType(MO.getReg());
501 
502 // Paranoidly prevent us from doing this multiple times.
503 if (!hasBufferRsrcWorkaround(PointerTy))
504 return PointerTy;
505 
506 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
507 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
508 if (!PointerTy.isVector()) {
509 // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
510 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
511 const LLT S32 = LLT::scalar(32);
512 
513 Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
514 std::array<Register, 4> VectorElems;
515 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
516 for (unsigned I = 0; I < NumParts; ++I)
517 VectorElems[I] =
518 B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
519 B.buildMergeValues(MO, VectorElems);
520 MO.setReg(VectorReg);
521 return VectorTy;
522 }
523 Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
524 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
525 auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
526 B.buildIntToPtr(MO, Scalar);
527 MO.setReg(BitcastReg);
528 
529 return VectorTy;
530 }
531 
532 /// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is
533 /// the form in which the value must be in order to be passed to the low-level
534 /// representations used for MUBUF/MTBUF intrinsics.
This is a hack, which is 535 /// needed in order to account for the fact that we can't define a register 536 /// class for s128 without breaking SelectionDAG. 537 static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B) { 538 MachineRegisterInfo &MRI = *B.getMRI(); 539 const LLT PointerTy = MRI.getType(Pointer); 540 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy); 541 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy); 542 543 if (!PointerTy.isVector()) { 544 // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32) 545 SmallVector<Register, 4> PointerParts; 546 const unsigned NumParts = PointerTy.getSizeInBits() / 32; 547 auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer); 548 for (unsigned I = 0; I < NumParts; ++I) 549 PointerParts.push_back(Unmerged.getReg(I)); 550 return B.buildBuildVector(VectorTy, PointerParts).getReg(0); 551 } 552 Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0); 553 return B.buildBitcast(VectorTy, Scalar).getReg(0); 554 } 555 556 static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B, 557 unsigned Idx) { 558 MachineOperand &MO = MI.getOperand(Idx); 559 560 const LLT PointerTy = B.getMRI()->getType(MO.getReg()); 561 // Paranoidly prevent us from doing this multiple times. 562 if (!hasBufferRsrcWorkaround(PointerTy)) 563 return; 564 MO.setReg(castBufferRsrcToV4I32(MO.getReg(), B)); 565 } 566 567 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 568 const GCNTargetMachine &TM) 569 : ST(ST_) { 570 using namespace TargetOpcode; 571 572 auto GetAddrSpacePtr = [&TM](unsigned AS) { 573 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 574 }; 575 576 const LLT S1 = LLT::scalar(1); 577 const LLT S8 = LLT::scalar(8); 578 const LLT S16 = LLT::scalar(16); 579 const LLT S32 = LLT::scalar(32); 580 const LLT S64 = LLT::scalar(64); 581 const LLT S128 = LLT::scalar(128); 582 const LLT S256 = LLT::scalar(256); 583 const LLT S512 = LLT::scalar(512); 584 const LLT MaxScalar = LLT::scalar(MaxRegisterSize); 585 586 const LLT V2S8 = LLT::fixed_vector(2, 8); 587 const LLT V2S16 = LLT::fixed_vector(2, 16); 588 const LLT V4S16 = LLT::fixed_vector(4, 16); 589 590 const LLT V2S32 = LLT::fixed_vector(2, 32); 591 const LLT V3S32 = LLT::fixed_vector(3, 32); 592 const LLT V4S32 = LLT::fixed_vector(4, 32); 593 const LLT V5S32 = LLT::fixed_vector(5, 32); 594 const LLT V6S32 = LLT::fixed_vector(6, 32); 595 const LLT V7S32 = LLT::fixed_vector(7, 32); 596 const LLT V8S32 = LLT::fixed_vector(8, 32); 597 const LLT V9S32 = LLT::fixed_vector(9, 32); 598 const LLT V10S32 = LLT::fixed_vector(10, 32); 599 const LLT V11S32 = LLT::fixed_vector(11, 32); 600 const LLT V12S32 = LLT::fixed_vector(12, 32); 601 const LLT V13S32 = LLT::fixed_vector(13, 32); 602 const LLT V14S32 = LLT::fixed_vector(14, 32); 603 const LLT V15S32 = LLT::fixed_vector(15, 32); 604 const LLT V16S32 = LLT::fixed_vector(16, 32); 605 const LLT V32S32 = LLT::fixed_vector(32, 32); 606 607 const LLT V2S64 = LLT::fixed_vector(2, 64); 608 const LLT V3S64 = LLT::fixed_vector(3, 64); 609 const LLT V4S64 = LLT::fixed_vector(4, 64); 610 const LLT V5S64 = LLT::fixed_vector(5, 64); 611 const LLT V6S64 = LLT::fixed_vector(6, 64); 612 const LLT V7S64 = LLT::fixed_vector(7, 64); 613 const LLT V8S64 = LLT::fixed_vector(8, 64); 614 const LLT V16S64 = LLT::fixed_vector(16, 64); 615 616 std::initializer_list<LLT> AllS32Vectors = 617 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 618 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 619 
std::initializer_list<LLT> AllS64Vectors = 620 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 621 622 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 623 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 624 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 625 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 626 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 627 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 628 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 629 const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER); 630 const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE); 631 632 const LLT CodePtr = FlatPtr; 633 634 const std::initializer_list<LLT> AddrSpaces64 = { 635 GlobalPtr, ConstantPtr, FlatPtr 636 }; 637 638 const std::initializer_list<LLT> AddrSpaces32 = { 639 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 640 }; 641 642 const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr}; 643 644 const std::initializer_list<LLT> FPTypesBase = { 645 S32, S64 646 }; 647 648 const std::initializer_list<LLT> FPTypes16 = { 649 S32, S64, S16 650 }; 651 652 const std::initializer_list<LLT> FPTypesPK16 = { 653 S32, S64, S16, V2S16 654 }; 655 656 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; 657 658 // s1 for VCC branches, s32 for SCC branches. 659 getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32}); 660 661 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 662 // elements for v3s16 663 getActionDefinitionsBuilder(G_PHI) 664 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256}) 665 .legalFor(AllS32Vectors) 666 .legalFor(AllS64Vectors) 667 .legalFor(AddrSpaces64) 668 .legalFor(AddrSpaces32) 669 .legalFor(AddrSpaces128) 670 .legalIf(isPointer(0)) 671 .clampScalar(0, S16, S256) 672 .widenScalarToNextPow2(0, 32) 673 .clampMaxNumElements(0, S32, 16) 674 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 675 .scalarize(0); 676 677 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) { 678 // Full set of gfx9 features. 679 getActionDefinitionsBuilder({G_ADD, G_SUB}) 680 .legalFor({S32, S16, V2S16}) 681 .clampMaxNumElementsStrict(0, S16, 2) 682 .scalarize(0) 683 .minScalar(0, S16) 684 .widenScalarToNextMultipleOf(0, 32) 685 .maxScalar(0, S32); 686 687 getActionDefinitionsBuilder(G_MUL) 688 .legalFor({S32, S16, V2S16}) 689 .clampMaxNumElementsStrict(0, S16, 2) 690 .scalarize(0) 691 .minScalar(0, S16) 692 .widenScalarToNextMultipleOf(0, 32) 693 .custom(); 694 assert(ST.hasMad64_32()); 695 696 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) 697 .legalFor({S32, S16, V2S16}) // Clamp modifier 698 .minScalarOrElt(0, S16) 699 .clampMaxNumElementsStrict(0, S16, 2) 700 .scalarize(0) 701 .widenScalarToNextPow2(0, 32) 702 .lower(); 703 } else if (ST.has16BitInsts()) { 704 getActionDefinitionsBuilder({G_ADD, G_SUB}) 705 .legalFor({S32, S16}) 706 .minScalar(0, S16) 707 .widenScalarToNextMultipleOf(0, 32) 708 .maxScalar(0, S32) 709 .scalarize(0); 710 711 getActionDefinitionsBuilder(G_MUL) 712 .legalFor({S32, S16}) 713 .scalarize(0) 714 .minScalar(0, S16) 715 .widenScalarToNextMultipleOf(0, 32) 716 .custom(); 717 assert(ST.hasMad64_32()); 718 719 // Technically the saturating operations require clamp bit support, but this 720 // was introduced at the same time as 16-bit operations. 
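// Any subtarget reaching this branch therefore has the clamp bit, so the
// unsigned saturating forms are legal here and only the signed forms below
// need to be lowered.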
721 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 722 .legalFor({S32, S16}) // Clamp modifier 723 .minScalar(0, S16) 724 .scalarize(0) 725 .widenScalarToNextPow2(0, 16) 726 .lower(); 727 728 // We're just lowering this, but it helps get a better result to try to 729 // coerce to the desired type first. 730 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 731 .minScalar(0, S16) 732 .scalarize(0) 733 .lower(); 734 } else { 735 getActionDefinitionsBuilder({G_ADD, G_SUB}) 736 .legalFor({S32}) 737 .widenScalarToNextMultipleOf(0, 32) 738 .clampScalar(0, S32, S32) 739 .scalarize(0); 740 741 auto &Mul = getActionDefinitionsBuilder(G_MUL) 742 .legalFor({S32}) 743 .scalarize(0) 744 .minScalar(0, S32) 745 .widenScalarToNextMultipleOf(0, 32); 746 747 if (ST.hasMad64_32()) 748 Mul.custom(); 749 else 750 Mul.maxScalar(0, S32); 751 752 if (ST.hasIntClamp()) { 753 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 754 .legalFor({S32}) // Clamp modifier. 755 .scalarize(0) 756 .minScalarOrElt(0, S32) 757 .lower(); 758 } else { 759 // Clamp bit support was added in VI, along with 16-bit operations. 760 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 761 .minScalar(0, S32) 762 .scalarize(0) 763 .lower(); 764 } 765 766 // FIXME: DAG expansion gets better results. The widening uses the smaller 767 // range values and goes for the min/max lowering directly. 768 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 769 .minScalar(0, S32) 770 .scalarize(0) 771 .lower(); 772 } 773 774 getActionDefinitionsBuilder( 775 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM}) 776 .customFor({S32, S64}) 777 .clampScalar(0, S32, S64) 778 .widenScalarToNextPow2(0, 32) 779 .scalarize(0); 780 781 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 782 .legalFor({S32}) 783 .maxScalar(0, S32); 784 785 if (ST.hasVOP3PInsts()) { 786 Mulh 787 .clampMaxNumElements(0, S8, 2) 788 .lowerFor({V2S8}); 789 } 790 791 Mulh 792 .scalarize(0) 793 .lower(); 794 795 // Report legal for any types we can handle anywhere. For the cases only legal 796 // on the SALU, RegBankSelect will be able to re-legalize. 797 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 798 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 799 .clampScalar(0, S32, S64) 800 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 801 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 802 .widenScalarToNextPow2(0) 803 .scalarize(0); 804 805 getActionDefinitionsBuilder( 806 {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 807 .legalFor({{S32, S1}, {S32, S32}}) 808 .clampScalar(0, S32, S32) 809 .scalarize(0); 810 811 getActionDefinitionsBuilder(G_BITCAST) 812 // Don't worry about the size constraint. 813 .legalIf(all(isRegisterType(0), isRegisterType(1))) 814 .lower(); 815 816 817 getActionDefinitionsBuilder(G_CONSTANT) 818 .legalFor({S1, S32, S64, S16, GlobalPtr, 819 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 820 .legalIf(isPointer(0)) 821 .clampScalar(0, S32, S64) 822 .widenScalarToNextPow2(0); 823 824 getActionDefinitionsBuilder(G_FCONSTANT) 825 .legalFor({S32, S64, S16}) 826 .clampScalar(0, S16, S64); 827 828 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) 829 .legalIf(isRegisterType(0)) 830 // s1 and s16 are special cases because they have legal operations on 831 // them, but don't really occupy registers in the normal way. 
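// They fail isRegisterType (their sizes are not multiples of 32), so they are
// listed explicitly here rather than being covered by the legalIf above.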
832 .legalFor({S1, S16}) 833 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 834 .clampScalarOrElt(0, S32, MaxScalar) 835 .widenScalarToNextPow2(0, 32) 836 .clampMaxNumElements(0, S32, 16); 837 838 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr}); 839 840 // If the amount is divergent, we have to do a wave reduction to get the 841 // maximum value, so this is expanded during RegBankSelect. 842 getActionDefinitionsBuilder(G_DYN_STACKALLOC) 843 .legalFor({{PrivatePtr, S32}}); 844 845 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 846 .customIf(typeIsNot(0, PrivatePtr)); 847 848 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr}); 849 850 auto &FPOpActions = getActionDefinitionsBuilder( 851 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE, 852 G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA}) 853 .legalFor({S32, S64}); 854 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 855 .customFor({S32, S64}); 856 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 857 .customFor({S32, S64}); 858 859 if (ST.has16BitInsts()) { 860 if (ST.hasVOP3PInsts()) 861 FPOpActions.legalFor({S16, V2S16}); 862 else 863 FPOpActions.legalFor({S16}); 864 865 TrigActions.customFor({S16}); 866 FDIVActions.customFor({S16}); 867 } 868 869 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 870 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 871 872 if (ST.hasVOP3PInsts()) { 873 MinNumMaxNum.customFor(FPTypesPK16) 874 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 875 .clampMaxNumElements(0, S16, 2) 876 .clampScalar(0, S16, S64) 877 .scalarize(0); 878 } else if (ST.has16BitInsts()) { 879 MinNumMaxNum.customFor(FPTypes16) 880 .clampScalar(0, S16, S64) 881 .scalarize(0); 882 } else { 883 MinNumMaxNum.customFor(FPTypesBase) 884 .clampScalar(0, S32, S64) 885 .scalarize(0); 886 } 887 888 if (ST.hasVOP3PInsts()) 889 FPOpActions.clampMaxNumElementsStrict(0, S16, 2); 890 891 FPOpActions 892 .scalarize(0) 893 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 894 895 TrigActions 896 .scalarize(0) 897 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 898 899 FDIVActions 900 .scalarize(0) 901 .clampScalar(0, ST.has16BitInsts() ? 
S16 : S32, S64); 902 903 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 904 .legalFor(FPTypesPK16) 905 .clampMaxNumElementsStrict(0, S16, 2) 906 .scalarize(0) 907 .clampScalar(0, S16, S64); 908 909 if (ST.has16BitInsts()) { 910 getActionDefinitionsBuilder(G_FSQRT) 911 .legalFor({S32, S16}) 912 .customFor({S64}) 913 .scalarize(0) 914 .clampScalar(0, S16, S64); 915 getActionDefinitionsBuilder(G_FFLOOR) 916 .legalFor({S32, S64, S16}) 917 .scalarize(0) 918 .clampScalar(0, S16, S64); 919 920 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP}) 921 .legalFor({{S32, S32}, {S64, S32}, {S16, S16}}) 922 .scalarize(0) 923 .maxScalarIf(typeIs(0, S16), 1, S16) 924 .clampScalar(1, S32, S32) 925 .lower(); 926 927 getActionDefinitionsBuilder(G_FFREXP) 928 .customFor({{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}}) 929 .scalarize(0) 930 .lower(); 931 } else { 932 getActionDefinitionsBuilder(G_FSQRT) 933 .legalFor({S32}) 934 .customFor({S64}) 935 .scalarize(0) 936 .clampScalar(0, S32, S64); 937 938 if (ST.hasFractBug()) { 939 getActionDefinitionsBuilder(G_FFLOOR) 940 .customFor({S64}) 941 .legalFor({S32, S64}) 942 .scalarize(0) 943 .clampScalar(0, S32, S64); 944 } else { 945 getActionDefinitionsBuilder(G_FFLOOR) 946 .legalFor({S32, S64}) 947 .scalarize(0) 948 .clampScalar(0, S32, S64); 949 } 950 951 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP}) 952 .legalFor({{S32, S32}, {S64, S32}}) 953 .scalarize(0) 954 .clampScalar(0, S32, S64) 955 .clampScalar(1, S32, S32) 956 .lower(); 957 958 getActionDefinitionsBuilder(G_FFREXP) 959 .customFor({{S32, S32}, {S64, S32}}) 960 .scalarize(0) 961 .minScalar(0, S32) 962 .clampScalar(1, S32, S32) 963 .lower(); 964 } 965 966 getActionDefinitionsBuilder(G_FPTRUNC) 967 .legalFor({{S32, S64}, {S16, S32}}) 968 .scalarize(0) 969 .lower(); 970 971 getActionDefinitionsBuilder(G_FPEXT) 972 .legalFor({{S64, S32}, {S32, S16}}) 973 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)) 974 .scalarize(0); 975 976 auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB}); 977 if (ST.has16BitInsts()) { 978 FSubActions 979 // Use actual fsub instruction 980 .legalFor({S32, S16}) 981 // Must use fadd + fneg 982 .lowerFor({S64, V2S16}); 983 } else { 984 FSubActions 985 // Use actual fsub instruction 986 .legalFor({S32}) 987 // Must use fadd + fneg 988 .lowerFor({S64, S16, V2S16}); 989 } 990 991 FSubActions 992 .scalarize(0) 993 .clampScalar(0, S32, S64); 994 995 // Whether this is legal depends on the floating point mode for the function. 996 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 997 if (ST.hasMadF16() && ST.hasMadMacF32Insts()) 998 FMad.customFor({S32, S16}); 999 else if (ST.hasMadMacF32Insts()) 1000 FMad.customFor({S32}); 1001 else if (ST.hasMadF16()) 1002 FMad.customFor({S16}); 1003 FMad.scalarize(0) 1004 .lower(); 1005 1006 auto &FRem = getActionDefinitionsBuilder(G_FREM); 1007 if (ST.has16BitInsts()) { 1008 FRem.customFor({S16, S32, S64}); 1009 } else { 1010 FRem.minScalar(0, S32) 1011 .customFor({S32, S64}); 1012 } 1013 FRem.scalarize(0); 1014 1015 // TODO: Do we need to clamp maximum bitwidth? 1016 getActionDefinitionsBuilder(G_TRUNC) 1017 .legalIf(isScalar(0)) 1018 .legalFor({{V2S16, V2S32}}) 1019 .clampMaxNumElements(0, S16, 2) 1020 // Avoid scalarizing in cases that should be truly illegal. In unresolvable 1021 // situations (like an invalid implicit use), we don't want to infinite loop 1022 // in the legalizer. 
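// Only scalarize when the element type is itself usable (s16 or at least
// 32 bits, per elementTypeIsLegal); anything else falls through to the
// alwaysLegal below.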
1023 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) 1024 .alwaysLegal(); 1025 1026 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 1027 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 1028 {S32, S1}, {S64, S1}, {S16, S1}}) 1029 .scalarize(0) 1030 .clampScalar(0, S32, S64) 1031 .widenScalarToNextPow2(1, 32); 1032 1033 // TODO: Split s1->s64 during regbankselect for VALU. 1034 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 1035 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 1036 .lowerIf(typeIs(1, S1)) 1037 .customFor({{S32, S64}, {S64, S64}}); 1038 if (ST.has16BitInsts()) 1039 IToFP.legalFor({{S16, S16}}); 1040 IToFP.clampScalar(1, S32, S64) 1041 .minScalar(0, S32) 1042 .scalarize(0) 1043 .widenScalarToNextPow2(1); 1044 1045 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 1046 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 1047 .customFor({{S64, S32}, {S64, S64}}) 1048 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)); 1049 if (ST.has16BitInsts()) 1050 FPToI.legalFor({{S16, S16}}); 1051 else 1052 FPToI.minScalar(1, S32); 1053 1054 FPToI.minScalar(0, S32) 1055 .widenScalarToNextPow2(0, 32) 1056 .scalarize(0) 1057 .lower(); 1058 1059 getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND) 1060 .customFor({S16, S32}) 1061 .scalarize(0) 1062 .lower(); 1063 1064 // Lower roundeven into G_FRINT 1065 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN}) 1066 .scalarize(0) 1067 .lower(); 1068 1069 if (ST.has16BitInsts()) { 1070 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 1071 .legalFor({S16, S32, S64}) 1072 .clampScalar(0, S16, S64) 1073 .scalarize(0); 1074 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 1075 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 1076 .legalFor({S32, S64}) 1077 .clampScalar(0, S32, S64) 1078 .scalarize(0); 1079 } else { 1080 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 1081 .legalFor({S32}) 1082 .customFor({S64}) 1083 .clampScalar(0, S32, S64) 1084 .scalarize(0); 1085 } 1086 1087 getActionDefinitionsBuilder(G_PTR_ADD) 1088 .unsupportedFor({BufferFatPtr, RsrcPtr}) 1089 .legalIf(all(isPointer(0), sameSize(0, 1))) 1090 .scalarize(0) 1091 .scalarSameSizeAs(1, 0); 1092 1093 getActionDefinitionsBuilder(G_PTRMASK) 1094 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32}))) 1095 .scalarSameSizeAs(1, 0) 1096 .scalarize(0); 1097 1098 auto &CmpBuilder = 1099 getActionDefinitionsBuilder(G_ICMP) 1100 // The compare output type differs based on the register bank of the output, 1101 // so make both s1 and s32 legal. 1102 // 1103 // Scalar compares producing output in scc will be promoted to s32, as that 1104 // is the allocatable register type that will be needed for the copy from 1105 // scc. This will be promoted during RegBankSelect, and we assume something 1106 // before that won't try to use s32 result types. 1107 // 1108 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 1109 // bank. 
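// Hence both result types are listed below: s1 for the VCC-bank result and
// s32 for the SCC case, each against the same set of scalar and pointer
// operand types.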
1110 .legalForCartesianProduct( 1111 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 1112 .legalForCartesianProduct( 1113 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 1114 if (ST.has16BitInsts()) { 1115 CmpBuilder.legalFor({{S1, S16}}); 1116 } 1117 1118 CmpBuilder 1119 .widenScalarToNextPow2(1) 1120 .clampScalar(1, S32, S64) 1121 .scalarize(0) 1122 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 1123 1124 getActionDefinitionsBuilder(G_FCMP) 1125 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 1126 .widenScalarToNextPow2(1) 1127 .clampScalar(1, S32, S64) 1128 .scalarize(0); 1129 1130 // FIXME: fpow has a selection pattern that should move to custom lowering. 1131 auto &ExpOps = getActionDefinitionsBuilder(G_FPOW); 1132 if (ST.has16BitInsts()) 1133 ExpOps.customFor({{S32}, {S16}}); 1134 else 1135 ExpOps.customFor({S32}); 1136 ExpOps.clampScalar(0, MinScalarFPTy, S32) 1137 .scalarize(0); 1138 1139 getActionDefinitionsBuilder(G_FPOWI) 1140 .clampScalar(0, MinScalarFPTy, S32) 1141 .lower(); 1142 1143 auto &Log2Ops = getActionDefinitionsBuilder({G_FLOG2, G_FEXP2}); 1144 Log2Ops.customFor({S32}); 1145 if (ST.has16BitInsts()) 1146 Log2Ops.legalFor({S16}); 1147 else 1148 Log2Ops.customFor({S16}); 1149 Log2Ops.scalarize(0) 1150 .lower(); 1151 1152 auto &LogOps = getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP}); 1153 LogOps.customFor({S32, S16}); 1154 LogOps.clampScalar(0, MinScalarFPTy, S32) 1155 .scalarize(0); 1156 1157 // The 64-bit versions produce 32-bit results, but only on the SALU. 1158 getActionDefinitionsBuilder(G_CTPOP) 1159 .legalFor({{S32, S32}, {S32, S64}}) 1160 .clampScalar(0, S32, S32) 1161 .widenScalarToNextPow2(1, 32) 1162 .clampScalar(1, S32, S64) 1163 .scalarize(0) 1164 .widenScalarToNextPow2(0, 32); 1165 1166 // If no 16 bit instr is available, lower into different instructions. 1167 if (ST.has16BitInsts()) 1168 getActionDefinitionsBuilder(G_IS_FPCLASS) 1169 .legalForCartesianProduct({S1}, FPTypes16) 1170 .widenScalarToNextPow2(1) 1171 .scalarize(0) 1172 .lower(); 1173 else 1174 getActionDefinitionsBuilder(G_IS_FPCLASS) 1175 .legalForCartesianProduct({S1}, FPTypesBase) 1176 .lowerFor({S1, S16}) 1177 .widenScalarToNextPow2(1) 1178 .scalarize(0) 1179 .lower(); 1180 1181 // The hardware instructions return a different result on 0 than the generic 1182 // instructions expect. The hardware produces -1, but these produce the 1183 // bitwidth. 1184 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 1185 .scalarize(0) 1186 .clampScalar(0, S32, S32) 1187 .clampScalar(1, S32, S64) 1188 .widenScalarToNextPow2(0, 32) 1189 .widenScalarToNextPow2(1, 32) 1190 .custom(); 1191 1192 // The 64-bit versions produce 32-bit results, but only on the SALU. 1193 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 1194 .legalFor({{S32, S32}, {S32, S64}}) 1195 .clampScalar(0, S32, S32) 1196 .clampScalar(1, S32, S64) 1197 .scalarize(0) 1198 .widenScalarToNextPow2(0, 32) 1199 .widenScalarToNextPow2(1, 32); 1200 1201 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 1202 // RegBankSelect. 
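// (On the VALU side this typically becomes two 32-bit bitreverses with the
// result halves swapped, but that split happens in RegBankSelect, not here.)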
1203 getActionDefinitionsBuilder(G_BITREVERSE) 1204 .legalFor({S32, S64}) 1205 .clampScalar(0, S32, S64) 1206 .scalarize(0) 1207 .widenScalarToNextPow2(0); 1208 1209 if (ST.has16BitInsts()) { 1210 getActionDefinitionsBuilder(G_BSWAP) 1211 .legalFor({S16, S32, V2S16}) 1212 .clampMaxNumElementsStrict(0, S16, 2) 1213 // FIXME: Fixing non-power-of-2 before clamp is workaround for 1214 // narrowScalar limitation. 1215 .widenScalarToNextPow2(0) 1216 .clampScalar(0, S16, S32) 1217 .scalarize(0); 1218 1219 if (ST.hasVOP3PInsts()) { 1220 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 1221 .legalFor({S32, S16, V2S16}) 1222 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1223 .clampMaxNumElements(0, S16, 2) 1224 .minScalar(0, S16) 1225 .widenScalarToNextPow2(0) 1226 .scalarize(0) 1227 .lower(); 1228 } else { 1229 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 1230 .legalFor({S32, S16}) 1231 .widenScalarToNextPow2(0) 1232 .minScalar(0, S16) 1233 .scalarize(0) 1234 .lower(); 1235 } 1236 } else { 1237 // TODO: Should have same legality without v_perm_b32 1238 getActionDefinitionsBuilder(G_BSWAP) 1239 .legalFor({S32}) 1240 .lowerIf(scalarNarrowerThan(0, 32)) 1241 // FIXME: Fixing non-power-of-2 before clamp is workaround for 1242 // narrowScalar limitation. 1243 .widenScalarToNextPow2(0) 1244 .maxScalar(0, S32) 1245 .scalarize(0) 1246 .lower(); 1247 1248 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 1249 .legalFor({S32}) 1250 .minScalar(0, S32) 1251 .widenScalarToNextPow2(0) 1252 .scalarize(0) 1253 .lower(); 1254 } 1255 1256 getActionDefinitionsBuilder(G_INTTOPTR) 1257 // List the common cases 1258 .legalForCartesianProduct(AddrSpaces64, {S64}) 1259 .legalForCartesianProduct(AddrSpaces32, {S32}) 1260 .scalarize(0) 1261 // Accept any address space as long as the size matches 1262 .legalIf(sameSize(0, 1)) 1263 .widenScalarIf(smallerThan(1, 0), 1264 [](const LegalityQuery &Query) { 1265 return std::pair( 1266 1, LLT::scalar(Query.Types[0].getSizeInBits())); 1267 }) 1268 .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) { 1269 return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 1270 }); 1271 1272 getActionDefinitionsBuilder(G_PTRTOINT) 1273 // List the common cases 1274 .legalForCartesianProduct(AddrSpaces64, {S64}) 1275 .legalForCartesianProduct(AddrSpaces32, {S32}) 1276 .scalarize(0) 1277 // Accept any address space as long as the size matches 1278 .legalIf(sameSize(0, 1)) 1279 .widenScalarIf(smallerThan(0, 1), 1280 [](const LegalityQuery &Query) { 1281 return std::pair( 1282 0, LLT::scalar(Query.Types[1].getSizeInBits())); 1283 }) 1284 .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) { 1285 return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1286 }); 1287 1288 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 1289 .scalarize(0) 1290 .custom(); 1291 1292 const auto needToSplitMemOp = [=](const LegalityQuery &Query, 1293 bool IsLoad) -> bool { 1294 const LLT DstTy = Query.Types[0]; 1295 1296 // Split vector extloads. 
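// i.e. any vector result wider than the memory type being loaded; such loads
// are not handled directly (see isLoadStoreSizeLegal) and must be split.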
1297 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1298 1299 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 1300 return true; 1301 1302 const LLT PtrTy = Query.Types[1]; 1303 unsigned AS = PtrTy.getAddressSpace(); 1304 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad, 1305 Query.MMODescrs[0].Ordering != 1306 AtomicOrdering::NotAtomic)) 1307 return true; 1308 1309 // Catch weird sized loads that don't evenly divide into the access sizes 1310 // TODO: May be able to widen depending on alignment etc. 1311 unsigned NumRegs = (MemSize + 31) / 32; 1312 if (NumRegs == 3) { 1313 if (!ST.hasDwordx3LoadStores()) 1314 return true; 1315 } else { 1316 // If the alignment allows, these should have been widened. 1317 if (!isPowerOf2_32(NumRegs)) 1318 return true; 1319 } 1320 1321 return false; 1322 }; 1323 1324 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32; 1325 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16; 1326 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8; 1327 1328 // TODO: Refine based on subtargets which support unaligned access or 128-bit 1329 // LDS 1330 // TODO: Unsupported flat for SI. 1331 1332 for (unsigned Op : {G_LOAD, G_STORE}) { 1333 const bool IsStore = Op == G_STORE; 1334 1335 auto &Actions = getActionDefinitionsBuilder(Op); 1336 // Explicitly list some common cases. 1337 // TODO: Does this help compile time at all? 1338 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32}, 1339 {V2S32, GlobalPtr, V2S32, GlobalAlign32}, 1340 {V4S32, GlobalPtr, V4S32, GlobalAlign32}, 1341 {S64, GlobalPtr, S64, GlobalAlign32}, 1342 {V2S64, GlobalPtr, V2S64, GlobalAlign32}, 1343 {V2S16, GlobalPtr, V2S16, GlobalAlign32}, 1344 {S32, GlobalPtr, S8, GlobalAlign8}, 1345 {S32, GlobalPtr, S16, GlobalAlign16}, 1346 1347 {S32, LocalPtr, S32, 32}, 1348 {S64, LocalPtr, S64, 32}, 1349 {V2S32, LocalPtr, V2S32, 32}, 1350 {S32, LocalPtr, S8, 8}, 1351 {S32, LocalPtr, S16, 16}, 1352 {V2S16, LocalPtr, S32, 32}, 1353 1354 {S32, PrivatePtr, S32, 32}, 1355 {S32, PrivatePtr, S8, 8}, 1356 {S32, PrivatePtr, S16, 16}, 1357 {V2S16, PrivatePtr, S32, 32}, 1358 1359 {S32, ConstantPtr, S32, GlobalAlign32}, 1360 {V2S32, ConstantPtr, V2S32, GlobalAlign32}, 1361 {V4S32, ConstantPtr, V4S32, GlobalAlign32}, 1362 {S64, ConstantPtr, S64, GlobalAlign32}, 1363 {V2S32, ConstantPtr, V2S32, GlobalAlign32}}); 1364 Actions.legalIf( 1365 [=](const LegalityQuery &Query) -> bool { 1366 return isLoadStoreLegal(ST, Query); 1367 }); 1368 1369 // The custom pointers (fat pointers, buffer resources) don't work with load 1370 // and store at this level. Fat pointers should have been lowered to 1371 // intrinsics before the translation to MIR. 1372 Actions.unsupportedIf(typeInSet(1, {BufferFatPtr, RsrcPtr})); 1373 1374 // Address space 8 pointers are handled by a 4xs32 load, bitcast, and 1375 // ptrtoint. This is needed to account for the fact that we can't have i128 1376 // as a register class for SelectionDAG reasons. 1377 Actions.customIf([=](const LegalityQuery &Query) -> bool { 1378 return hasBufferRsrcWorkaround(Query.Types[0]); 1379 }); 1380 1381 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1382 // 64-bits. 1383 // 1384 // TODO: Should generalize bitcast action into coerce, which will also cover 1385 // inserting addrspacecasts. 1386 Actions.customIf(typeIs(1, Constant32Ptr)); 1387 1388 // Turn any illegal element vectors into something easier to deal 1389 // with. 
These will ultimately produce 32-bit scalar shifts to extract the 1390 // parts anyway. 1391 // 1392 // For odd 16-bit element vectors, prefer to split those into pieces with 1393 // 16-bit vector parts. 1394 Actions.bitcastIf( 1395 [=](const LegalityQuery &Query) -> bool { 1396 return shouldBitcastLoadStoreType(ST, Query.Types[0], 1397 Query.MMODescrs[0].MemoryTy); 1398 }, bitcastToRegisterType(0)); 1399 1400 if (!IsStore) { 1401 // Widen suitably aligned loads by loading extra bytes. The standard 1402 // legalization actions can't properly express widening memory operands. 1403 Actions.customIf([=](const LegalityQuery &Query) -> bool { 1404 return shouldWidenLoad(ST, Query, G_LOAD); 1405 }); 1406 } 1407 1408 // FIXME: load/store narrowing should be moved to lower action 1409 Actions 1410 .narrowScalarIf( 1411 [=](const LegalityQuery &Query) -> bool { 1412 return !Query.Types[0].isVector() && 1413 needToSplitMemOp(Query, Op == G_LOAD); 1414 }, 1415 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1416 const LLT DstTy = Query.Types[0]; 1417 const LLT PtrTy = Query.Types[1]; 1418 1419 const unsigned DstSize = DstTy.getSizeInBits(); 1420 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1421 1422 // Split extloads. 1423 if (DstSize > MemSize) 1424 return std::pair(0, LLT::scalar(MemSize)); 1425 1426 unsigned MaxSize = maxSizeForAddrSpace( 1427 ST, PtrTy.getAddressSpace(), Op == G_LOAD, 1428 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic); 1429 if (MemSize > MaxSize) 1430 return std::pair(0, LLT::scalar(MaxSize)); 1431 1432 uint64_t Align = Query.MMODescrs[0].AlignInBits; 1433 return std::pair(0, LLT::scalar(Align)); 1434 }) 1435 .fewerElementsIf( 1436 [=](const LegalityQuery &Query) -> bool { 1437 return Query.Types[0].isVector() && 1438 needToSplitMemOp(Query, Op == G_LOAD); 1439 }, 1440 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1441 const LLT DstTy = Query.Types[0]; 1442 const LLT PtrTy = Query.Types[1]; 1443 1444 LLT EltTy = DstTy.getElementType(); 1445 unsigned MaxSize = maxSizeForAddrSpace( 1446 ST, PtrTy.getAddressSpace(), Op == G_LOAD, 1447 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic); 1448 1449 // FIXME: Handle widened to power of 2 results better. This ends 1450 // up scalarizing. 1451 // FIXME: 3 element stores scalarized on SI 1452 1453 // Split if it's too large for the address space. 1454 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1455 if (MemSize > MaxSize) { 1456 unsigned NumElts = DstTy.getNumElements(); 1457 unsigned EltSize = EltTy.getSizeInBits(); 1458 1459 if (MaxSize % EltSize == 0) { 1460 return std::pair( 1461 0, LLT::scalarOrVector( 1462 ElementCount::getFixed(MaxSize / EltSize), EltTy)); 1463 } 1464 1465 unsigned NumPieces = MemSize / MaxSize; 1466 1467 // FIXME: Refine when odd breakdowns handled 1468 // The scalars will need to be re-legalized. 1469 if (NumPieces == 1 || NumPieces >= NumElts || 1470 NumElts % NumPieces != 0) 1471 return std::pair(0, EltTy); 1472 1473 return std::pair(0, 1474 LLT::fixed_vector(NumElts / NumPieces, EltTy)); 1475 } 1476 1477 // FIXME: We could probably handle weird extending loads better. 1478 if (DstTy.getSizeInBits() > MemSize) 1479 return std::pair(0, EltTy); 1480 1481 unsigned EltSize = EltTy.getSizeInBits(); 1482 unsigned DstSize = DstTy.getSizeInBits(); 1483 if (!isPowerOf2_32(DstSize)) { 1484 // We're probably decomposing an odd sized store. Try to split 1485 // to the widest type. TODO: Account for alignment. 
As-is it 1486 // should be OK, since the new parts will be further legalized. 1487 unsigned FloorSize = llvm::bit_floor(DstSize); 1488 return std::pair( 1489 0, LLT::scalarOrVector( 1490 ElementCount::getFixed(FloorSize / EltSize), EltTy)); 1491 } 1492 1493 // May need relegalization for the scalars. 1494 return std::pair(0, EltTy); 1495 }) 1496 .minScalar(0, S32) 1497 .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32)) 1498 .widenScalarToNextPow2(0) 1499 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)) 1500 .lower(); 1501 } 1502 1503 // FIXME: Unaligned accesses not lowered. 1504 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 1505 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8}, 1506 {S32, GlobalPtr, S16, 2 * 8}, 1507 {S32, LocalPtr, S8, 8}, 1508 {S32, LocalPtr, S16, 16}, 1509 {S32, PrivatePtr, S8, 8}, 1510 {S32, PrivatePtr, S16, 16}, 1511 {S32, ConstantPtr, S8, 8}, 1512 {S32, ConstantPtr, S16, 2 * 8}}) 1513 .legalIf( 1514 [=](const LegalityQuery &Query) -> bool { 1515 return isLoadStoreLegal(ST, Query); 1516 }); 1517 1518 if (ST.hasFlatAddressSpace()) { 1519 ExtLoads.legalForTypesWithMemDesc( 1520 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}}); 1521 } 1522 1523 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1524 // 64-bits. 1525 // 1526 // TODO: Should generalize bitcast action into coerce, which will also cover 1527 // inserting addrspacecasts. 1528 ExtLoads.customIf(typeIs(1, Constant32Ptr)); 1529 1530 ExtLoads.clampScalar(0, S32, S32) 1531 .widenScalarToNextPow2(0) 1532 .lower(); 1533 1534 auto &Atomics = getActionDefinitionsBuilder( 1535 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 1536 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 1537 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1538 G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP}) 1539 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1540 {S64, GlobalPtr}, {S64, LocalPtr}, 1541 {S32, RegionPtr}, {S64, RegionPtr}}); 1542 if (ST.hasFlatAddressSpace()) { 1543 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 1544 } 1545 1546 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD); 1547 if (ST.hasLDSFPAtomicAdd()) { 1548 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); 1549 if (ST.hasGFX90AInsts()) 1550 Atomic.legalFor({{S64, LocalPtr}}); 1551 if (ST.hasAtomicDsPkAdd16Insts()) 1552 Atomic.legalFor({{V2S16, LocalPtr}}); 1553 } 1554 if (ST.hasAtomicFaddInsts()) 1555 Atomic.legalFor({{S32, GlobalPtr}}); 1556 if (ST.hasFlatAtomicFaddF32Inst()) 1557 Atomic.legalFor({{S32, FlatPtr}}); 1558 1559 if (ST.hasGFX90AInsts()) { 1560 // These are legal with some caveats, and should have undergone expansion in 1561 // the IR in most situations 1562 // TODO: Move atomic expansion into legalizer 1563 Atomic.legalFor({ 1564 {S32, GlobalPtr}, 1565 {S64, GlobalPtr}, 1566 {S64, FlatPtr} 1567 }); 1568 } 1569 1570 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1571 // demarshalling 1572 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1573 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1574 {S32, FlatPtr}, {S64, FlatPtr}}) 1575 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1576 {S32, RegionPtr}, {S64, RegionPtr}}); 1577 // TODO: Pointer types, any 32-bit or 64-bit vector 1578 1579 // Condition should be s32 for scalar, s1 for vector. 
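// This mirrors the G_ICMP rules above: selects on an SCC condition use s32,
// while divergent selects on a VCC condition use s1.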
1580 getActionDefinitionsBuilder(G_SELECT) 1581 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr, 1582 LocalPtr, FlatPtr, PrivatePtr, 1583 LLT::fixed_vector(2, LocalPtr), 1584 LLT::fixed_vector(2, PrivatePtr)}, 1585 {S1, S32}) 1586 .clampScalar(0, S16, S64) 1587 .scalarize(1) 1588 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1589 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 1590 .clampMaxNumElements(0, S32, 2) 1591 .clampMaxNumElements(0, LocalPtr, 2) 1592 .clampMaxNumElements(0, PrivatePtr, 2) 1593 .scalarize(0) 1594 .widenScalarToNextPow2(0) 1595 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 1596 1597 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 1598 // be more flexible with the shift amount type. 1599 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 1600 .legalFor({{S32, S32}, {S64, S32}}); 1601 if (ST.has16BitInsts()) { 1602 if (ST.hasVOP3PInsts()) { 1603 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) 1604 .clampMaxNumElements(0, S16, 2); 1605 } else 1606 Shifts.legalFor({{S16, S16}}); 1607 1608 // TODO: Support 16-bit shift amounts for all types 1609 Shifts.widenScalarIf( 1610 [=](const LegalityQuery &Query) { 1611 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a 1612 // 32-bit amount. 1613 const LLT ValTy = Query.Types[0]; 1614 const LLT AmountTy = Query.Types[1]; 1615 return ValTy.getSizeInBits() <= 16 && 1616 AmountTy.getSizeInBits() < 16; 1617 }, changeTo(1, S16)); 1618 Shifts.maxScalarIf(typeIs(0, S16), 1, S16); 1619 Shifts.clampScalar(1, S32, S32); 1620 Shifts.widenScalarToNextPow2(0, 16); 1621 Shifts.clampScalar(0, S16, S64); 1622 1623 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1624 .minScalar(0, S16) 1625 .scalarize(0) 1626 .lower(); 1627 } else { 1628 // Make sure we legalize the shift amount type first, as the general 1629 // expansion for the shifted type will produce much worse code if it hasn't 1630 // been truncated already. 1631 Shifts.clampScalar(1, S32, S32); 1632 Shifts.widenScalarToNextPow2(0, 32); 1633 Shifts.clampScalar(0, S32, S64); 1634 1635 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1636 .minScalar(0, S32) 1637 .scalarize(0) 1638 .lower(); 1639 } 1640 Shifts.scalarize(0); 1641 1642 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 1643 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 1644 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 1645 unsigned IdxTypeIdx = 2; 1646 1647 getActionDefinitionsBuilder(Op) 1648 .customIf([=](const LegalityQuery &Query) { 1649 const LLT EltTy = Query.Types[EltTypeIdx]; 1650 const LLT VecTy = Query.Types[VecTypeIdx]; 1651 const LLT IdxTy = Query.Types[IdxTypeIdx]; 1652 const unsigned EltSize = EltTy.getSizeInBits(); 1653 const bool isLegalVecType = 1654 !!SIRegisterInfo::getSGPRClassForBitWidth(VecTy.getSizeInBits()); 1655 // Address space 8 pointers are 128-bit wide values, but the logic 1656 // below will try to bitcast them to 2N x s64, which will fail. 1657 // Therefore, as an intermediate step, wrap extracts/insertions from a 1658 // ptrtoint-ing the vector and scalar arguments (or inttoptring the 1659 // extraction result) in order to produce a vector operation that can 1660 // be handled by the logic below. 
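// The legality check only needs to flag such wide pointer elements as custom;
// the custom lowering takes care of the ptrtoint/inttoptr wrapping described
// above.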
1661 if (EltTy.isPointer() && EltSize > 64) 1662 return true; 1663 return (EltSize == 32 || EltSize == 64) && 1664 VecTy.getSizeInBits() % 32 == 0 && 1665 VecTy.getSizeInBits() <= MaxRegisterSize && 1666 IdxTy.getSizeInBits() == 32 && 1667 isLegalVecType; 1668 }) 1669 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)), 1670 bitcastToVectorElement32(VecTypeIdx)) 1671 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1)) 1672 .bitcastIf( 1673 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)), 1674 [=](const LegalityQuery &Query) { 1675 // For > 64-bit element types, try to turn this into a 64-bit 1676 // element vector since we may be able to do better indexing 1677 // if this is scalar. If not, fall back to 32. 1678 const LLT EltTy = Query.Types[EltTypeIdx]; 1679 const LLT VecTy = Query.Types[VecTypeIdx]; 1680 const unsigned DstEltSize = EltTy.getSizeInBits(); 1681 const unsigned VecSize = VecTy.getSizeInBits(); 1682 1683 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32; 1684 return std::pair( 1685 VecTypeIdx, 1686 LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize)); 1687 }) 1688 .clampScalar(EltTypeIdx, S32, S64) 1689 .clampScalar(VecTypeIdx, S32, S64) 1690 .clampScalar(IdxTypeIdx, S32, S32) 1691 .clampMaxNumElements(VecTypeIdx, S32, 32) 1692 // TODO: Clamp elements for 64-bit vectors? 1693 .moreElementsIf( 1694 isIllegalRegisterType(VecTypeIdx), 1695 moreElementsToNextExistingRegClass(VecTypeIdx)) 1696 // It should only be necessary with variable indexes. 1697 // As a last resort, lower to the stack 1698 .lower(); 1699 } 1700 1701 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 1702 .unsupportedIf([=](const LegalityQuery &Query) { 1703 const LLT &EltTy = Query.Types[1].getElementType(); 1704 return Query.Types[0] != EltTy; 1705 }); 1706 1707 for (unsigned Op : {G_EXTRACT, G_INSERT}) { 1708 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 1709 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 1710 1711 // FIXME: Doesn't handle extract of illegal sizes. 1712 getActionDefinitionsBuilder(Op) 1713 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 1714 .lowerIf([=](const LegalityQuery &Query) { 1715 // Sub-vector(or single element) insert and extract. 1716 // TODO: verify immediate offset here since lower only works with 1717 // whole elements. 1718 const LLT BigTy = Query.Types[BigTyIdx]; 1719 return BigTy.isVector(); 1720 }) 1721 // FIXME: Multiples of 16 should not be legal. 
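// (The legalIf below accepts any extract/insert where the big type is a
// multiple of 32 bits and the small type is a multiple of 16 bits, e.g.
// extracting an s16 from an s96; per the FIXME above this is probably more
// permissive than what can actually be selected.)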
1722 .legalIf([=](const LegalityQuery &Query) { 1723 const LLT BigTy = Query.Types[BigTyIdx]; 1724 const LLT LitTy = Query.Types[LitTyIdx]; 1725 return (BigTy.getSizeInBits() % 32 == 0) && 1726 (LitTy.getSizeInBits() % 16 == 0); 1727 }) 1728 .widenScalarIf( 1729 [=](const LegalityQuery &Query) { 1730 const LLT BigTy = Query.Types[BigTyIdx]; 1731 return (BigTy.getScalarSizeInBits() < 16); 1732 }, 1733 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16)) 1734 .widenScalarIf( 1735 [=](const LegalityQuery &Query) { 1736 const LLT LitTy = Query.Types[LitTyIdx]; 1737 return (LitTy.getScalarSizeInBits() < 16); 1738 }, 1739 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16)) 1740 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1741 .widenScalarToNextPow2(BigTyIdx, 32); 1742 1743 } 1744 1745 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR) 1746 .legalForCartesianProduct(AllS32Vectors, {S32}) 1747 .legalForCartesianProduct(AllS64Vectors, {S64}) 1748 .clampNumElements(0, V16S32, V32S32) 1749 .clampNumElements(0, V2S64, V16S64) 1750 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)) 1751 .moreElementsIf( 1752 isIllegalRegisterType(0), 1753 moreElementsToNextExistingRegClass(0)); 1754 1755 if (ST.hasScalarPackInsts()) { 1756 BuildVector 1757 // FIXME: Should probably widen s1 vectors straight to s32 1758 .minScalarOrElt(0, S16) 1759 .minScalar(1, S16); 1760 1761 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1762 .legalFor({V2S16, S32}) 1763 .lower(); 1764 } else { 1765 BuildVector.customFor({V2S16, S16}); 1766 BuildVector.minScalarOrElt(0, S32); 1767 1768 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1769 .customFor({V2S16, S32}) 1770 .lower(); 1771 } 1772 1773 BuildVector.legalIf(isRegisterType(0)); 1774 1775 // FIXME: Clamp maximum size 1776 getActionDefinitionsBuilder(G_CONCAT_VECTORS) 1777 .legalIf(all(isRegisterType(0), isRegisterType(1))) 1778 .clampMaxNumElements(0, S32, 32) 1779 .clampMaxNumElements(1, S16, 2) // TODO: Make 4? 1780 .clampMaxNumElements(0, S16, 64); 1781 1782 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); 1783 1784 // Merge/Unmerge 1785 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { 1786 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; 1787 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; 1788 1789 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { 1790 const LLT Ty = Query.Types[TypeIdx]; 1791 if (Ty.isVector()) { 1792 const LLT &EltTy = Ty.getElementType(); 1793 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512) 1794 return true; 1795 if (!llvm::has_single_bit<uint32_t>(EltTy.getSizeInBits())) 1796 return true; 1797 } 1798 return false; 1799 }; 1800 1801 auto &Builder = getActionDefinitionsBuilder(Op) 1802 .legalIf(all(isRegisterType(0), isRegisterType(1))) 1803 .lowerFor({{S16, V2S16}}) 1804 .lowerIf([=](const LegalityQuery &Query) { 1805 const LLT BigTy = Query.Types[BigTyIdx]; 1806 return BigTy.getSizeInBits() == 32; 1807 }) 1808 // Try to widen to s16 first for small types. 1809 // TODO: Only do this on targets with legal s16 shifts 1810 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16) 1811 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 1812 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1813 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 1814 elementTypeIs(1, S16)), 1815 changeTo(1, V2S16)) 1816 // Clamp the little scalar to s8-s256 and make it a power of 2. 
It's not 1817 // worth considering the multiples of 64 since 2*192 and 2*384 are not 1818 // valid. 1819 .clampScalar(LitTyIdx, S32, S512) 1820 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) 1821 // Break up vectors with weird elements into scalars 1822 .fewerElementsIf( 1823 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); }, 1824 scalarize(0)) 1825 .fewerElementsIf( 1826 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); }, 1827 scalarize(1)) 1828 .clampScalar(BigTyIdx, S32, MaxScalar); 1829 1830 if (Op == G_MERGE_VALUES) { 1831 Builder.widenScalarIf( 1832 // TODO: Use 16-bit shifts if legal for 8-bit values? 1833 [=](const LegalityQuery &Query) { 1834 const LLT Ty = Query.Types[LitTyIdx]; 1835 return Ty.getSizeInBits() < 32; 1836 }, 1837 changeTo(LitTyIdx, S32)); 1838 } 1839 1840 Builder.widenScalarIf( 1841 [=](const LegalityQuery &Query) { 1842 const LLT Ty = Query.Types[BigTyIdx]; 1843 return Ty.getSizeInBits() % 16 != 0; 1844 }, 1845 [=](const LegalityQuery &Query) { 1846 // Pick the next power of 2, or a multiple of 64 over 128, 1847 // whichever is smaller. 1848 const LLT &Ty = Query.Types[BigTyIdx]; 1849 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); 1850 if (NewSizeInBits >= 256) { 1851 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); 1852 if (RoundedTo < NewSizeInBits) 1853 NewSizeInBits = RoundedTo; 1854 } 1855 return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits)); 1856 }) 1857 // Any vectors left are the wrong size. Scalarize them. 1858 .scalarize(0) 1859 .scalarize(1); 1860 } 1861 1862 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 1863 // RegBankSelect. 1864 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG) 1865 .legalFor({{S32}, {S64}}); 1866 1867 if (ST.hasVOP3PInsts()) { 1868 SextInReg.lowerFor({{V2S16}}) 1869 // Prefer to reduce vector widths for 16-bit vectors before lowering, to 1870 // get more vector shift opportunities, since we'll get those when 1871 // expanded. 1872 .clampMaxNumElementsStrict(0, S16, 2); 1873 } else if (ST.has16BitInsts()) { 1874 SextInReg.lowerFor({{S32}, {S64}, {S16}}); 1875 } else { 1876 // Prefer to promote to s32 before lowering if we don't have 16-bit 1877 // shifts. This avoids a lot of intermediate truncate and extend operations. 1878 SextInReg.lowerFor({{S32}, {S64}}); 1879 } 1880 1881 SextInReg 1882 .scalarize(0) 1883 .clampScalar(0, S32, S64) 1884 .lower(); 1885 1886 getActionDefinitionsBuilder({G_ROTR, G_ROTL}) 1887 .scalarize(0) 1888 .lower(); 1889 1890 // TODO: Only try to form v2s16 with legal packed instructions.
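// Summary of the funnel-shift handling below: only the {s32, s32} form of
// G_FSHR stays legal; v2s16 is lowered directly, wider 16-bit vectors are
// first clamped to two elements, and all remaining cases are scalarized and
// expanded. G_FSHL has no legal form and is always lowered, going through
// the v2s16 vector lowering only when packed (VOP3P) instructions exist.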
1891 getActionDefinitionsBuilder(G_FSHR) 1892 .legalFor({{S32, S32}}) 1893 .lowerFor({{V2S16, V2S16}}) 1894 .clampMaxNumElementsStrict(0, S16, 2) 1895 .scalarize(0) 1896 .lower(); 1897 1898 if (ST.hasVOP3PInsts()) { 1899 getActionDefinitionsBuilder(G_FSHL) 1900 .lowerFor({{V2S16, V2S16}}) 1901 .clampMaxNumElementsStrict(0, S16, 2) 1902 .scalarize(0) 1903 .lower(); 1904 } else { 1905 getActionDefinitionsBuilder(G_FSHL) 1906 .scalarize(0) 1907 .lower(); 1908 } 1909 1910 getActionDefinitionsBuilder(G_READCYCLECOUNTER) 1911 .legalFor({S64}); 1912 1913 getActionDefinitionsBuilder(G_FENCE) 1914 .alwaysLegal(); 1915 1916 getActionDefinitionsBuilder({G_SMULO, G_UMULO}) 1917 .scalarize(0) 1918 .minScalar(0, S32) 1919 .lower(); 1920 1921 getActionDefinitionsBuilder({G_SBFX, G_UBFX}) 1922 .legalFor({{S32, S32}, {S64, S32}}) 1923 .clampScalar(1, S32, S32) 1924 .clampScalar(0, S32, S64) 1925 .widenScalarToNextPow2(0) 1926 .scalarize(0); 1927 1928 getActionDefinitionsBuilder({ 1929 // TODO: Verify V_BFI_B32 is generated from expanded bit ops 1930 G_FCOPYSIGN, 1931 1932 G_ATOMIC_CMPXCHG_WITH_SUCCESS, 1933 G_ATOMICRMW_NAND, 1934 G_ATOMICRMW_FSUB, 1935 G_READ_REGISTER, 1936 G_WRITE_REGISTER, 1937 1938 G_SADDO, G_SSUBO, 1939 1940 // TODO: Implement 1941 G_FMINIMUM, G_FMAXIMUM}).lower(); 1942 1943 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET}) 1944 .lower(); 1945 1946 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, 1947 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, 1948 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) 1949 .unsupported(); 1950 1951 getLegacyLegalizerInfo().computeTables(); 1952 verify(*ST.getInstrInfo()); 1953 } 1954 1955 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, 1956 MachineInstr &MI) const { 1957 MachineIRBuilder &B = Helper.MIRBuilder; 1958 MachineRegisterInfo &MRI = *B.getMRI(); 1959 1960 switch (MI.getOpcode()) { 1961 case TargetOpcode::G_ADDRSPACE_CAST: 1962 return legalizeAddrSpaceCast(MI, MRI, B); 1963 case TargetOpcode::G_FRINT: 1964 return legalizeFrint(MI, MRI, B); 1965 case TargetOpcode::G_FCEIL: 1966 return legalizeFceil(MI, MRI, B); 1967 case TargetOpcode::G_FREM: 1968 return legalizeFrem(MI, MRI, B); 1969 case TargetOpcode::G_INTRINSIC_TRUNC: 1970 return legalizeIntrinsicTrunc(MI, MRI, B); 1971 case TargetOpcode::G_SITOFP: 1972 return legalizeITOFP(MI, MRI, B, true); 1973 case TargetOpcode::G_UITOFP: 1974 return legalizeITOFP(MI, MRI, B, false); 1975 case TargetOpcode::G_FPTOSI: 1976 return legalizeFPTOI(MI, MRI, B, true); 1977 case TargetOpcode::G_FPTOUI: 1978 return legalizeFPTOI(MI, MRI, B, false); 1979 case TargetOpcode::G_FMINNUM: 1980 case TargetOpcode::G_FMAXNUM: 1981 case TargetOpcode::G_FMINNUM_IEEE: 1982 case TargetOpcode::G_FMAXNUM_IEEE: 1983 return legalizeMinNumMaxNum(Helper, MI); 1984 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1985 return legalizeExtractVectorElt(MI, MRI, B); 1986 case TargetOpcode::G_INSERT_VECTOR_ELT: 1987 return legalizeInsertVectorElt(MI, MRI, B); 1988 case TargetOpcode::G_FSIN: 1989 case TargetOpcode::G_FCOS: 1990 return legalizeSinCos(MI, MRI, B); 1991 case TargetOpcode::G_GLOBAL_VALUE: 1992 return legalizeGlobalValue(MI, MRI, B); 1993 case TargetOpcode::G_LOAD: 1994 case TargetOpcode::G_SEXTLOAD: 1995 case TargetOpcode::G_ZEXTLOAD: 1996 return legalizeLoad(Helper, MI); 1997 case TargetOpcode::G_STORE: 1998 return legalizeStore(Helper, MI); 1999 case TargetOpcode::G_FMAD: 2000 return legalizeFMad(MI, MRI, B); 2001 case TargetOpcode::G_FDIV: 2002 return legalizeFDIV(MI, MRI, B); 2003 
case TargetOpcode::G_FFREXP: 2004 return legalizeFFREXP(MI, MRI, B); 2005 case TargetOpcode::G_FSQRT: 2006 return legalizeFSQRT(MI, MRI, B); 2007 case TargetOpcode::G_UDIV: 2008 case TargetOpcode::G_UREM: 2009 case TargetOpcode::G_UDIVREM: 2010 return legalizeUnsignedDIV_REM(MI, MRI, B); 2011 case TargetOpcode::G_SDIV: 2012 case TargetOpcode::G_SREM: 2013 case TargetOpcode::G_SDIVREM: 2014 return legalizeSignedDIV_REM(MI, MRI, B); 2015 case TargetOpcode::G_ATOMIC_CMPXCHG: 2016 return legalizeAtomicCmpXChg(MI, MRI, B); 2017 case TargetOpcode::G_FLOG2: 2018 return legalizeFlog2(MI, B); 2019 case TargetOpcode::G_FLOG: 2020 case TargetOpcode::G_FLOG10: 2021 return legalizeFlogCommon(MI, B); 2022 case TargetOpcode::G_FEXP2: 2023 return legalizeFExp2(MI, B); 2024 case TargetOpcode::G_FEXP: 2025 return legalizeFExp(MI, B); 2026 case TargetOpcode::G_FPOW: 2027 return legalizeFPow(MI, B); 2028 case TargetOpcode::G_FFLOOR: 2029 return legalizeFFloor(MI, MRI, B); 2030 case TargetOpcode::G_BUILD_VECTOR: 2031 case TargetOpcode::G_BUILD_VECTOR_TRUNC: 2032 return legalizeBuildVector(MI, MRI, B); 2033 case TargetOpcode::G_MUL: 2034 return legalizeMul(Helper, MI); 2035 case TargetOpcode::G_CTLZ: 2036 case TargetOpcode::G_CTTZ: 2037 return legalizeCTLZ_CTTZ(MI, MRI, B); 2038 case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND: 2039 return legalizeFPTruncRound(MI, B); 2040 default: 2041 return false; 2042 } 2043 2044 llvm_unreachable("expected switch to return"); 2045 } 2046 2047 Register AMDGPULegalizerInfo::getSegmentAperture( 2048 unsigned AS, 2049 MachineRegisterInfo &MRI, 2050 MachineIRBuilder &B) const { 2051 MachineFunction &MF = B.getMF(); 2052 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 2053 const LLT S32 = LLT::scalar(32); 2054 const LLT S64 = LLT::scalar(64); 2055 2056 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 2057 2058 if (ST.hasApertureRegs()) { 2059 // Note: this register is somewhat broken. When used as a 32-bit operand, 2060 // it only returns zeroes. The real value is in the upper 32 bits. 2061 // Thus, we must emit extract the high 32 bits. 2062 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS) 2063 ? AMDGPU::SRC_SHARED_BASE 2064 : AMDGPU::SRC_PRIVATE_BASE; 2065 // FIXME: It would be more natural to emit a COPY here, but then copy 2066 // coalescing would kick in and it would think it's okay to use the "HI" 2067 // subregister (instead of extracting the HI 32 bits) which is an artificial 2068 // (unusable) register. 2069 // Register TableGen definitions would need an overhaul to get rid of the 2070 // artificial "HI" aperture registers and prevent this kind of issue from 2071 // happening. 2072 Register Dst = MRI.createGenericVirtualRegister(S64); 2073 MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass); 2074 B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)}); 2075 return B.buildUnmerge(S32, Dst).getReg(1); 2076 } 2077 2078 // TODO: can we be smarter about machine pointer info? 2079 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 2080 Register LoadAddr = MRI.createGenericVirtualRegister( 2081 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 2082 // For code object version 5, private_base and shared_base are passed through 2083 // implicit kernargs. 2084 if (AMDGPU::getCodeObjectVersion(*MF.getFunction().getParent()) >= 2085 AMDGPU::AMDHSA_COV5) { 2086 AMDGPUTargetLowering::ImplicitParameter Param = 2087 AS == AMDGPUAS::LOCAL_ADDRESS ? 
AMDGPUTargetLowering::SHARED_BASE 2088 : AMDGPUTargetLowering::PRIVATE_BASE; 2089 uint64_t Offset = 2090 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param); 2091 2092 Register KernargPtrReg = MRI.createGenericVirtualRegister( 2093 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 2094 2095 if (!loadInputValue(KernargPtrReg, B, 2096 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 2097 return Register(); 2098 2099 MachineMemOperand *MMO = MF.getMachineMemOperand( 2100 PtrInfo, 2101 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2102 MachineMemOperand::MOInvariant, 2103 LLT::scalar(32), commonAlignment(Align(64), Offset)); 2104 2105 // Pointer address 2106 B.buildPtrAdd(LoadAddr, KernargPtrReg, 2107 B.buildConstant(LLT::scalar(64), Offset).getReg(0)); 2108 // Load address 2109 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 2110 } 2111 2112 Register QueuePtr = MRI.createGenericVirtualRegister( 2113 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 2114 2115 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 2116 return Register(); 2117 2118 // Offset into amd_queue_t for group_segment_aperture_base_hi / 2119 // private_segment_aperture_base_hi. 2120 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 2121 2122 MachineMemOperand *MMO = MF.getMachineMemOperand( 2123 PtrInfo, 2124 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2125 MachineMemOperand::MOInvariant, 2126 LLT::scalar(32), commonAlignment(Align(64), StructOffset)); 2127 2128 B.buildPtrAdd(LoadAddr, QueuePtr, 2129 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0)); 2130 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 2131 } 2132 2133 /// Return true if the value is a known valid address, such that a null check is 2134 /// not necessary. 2135 static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, 2136 const AMDGPUTargetMachine &TM, unsigned AddrSpace) { 2137 MachineInstr *Def = MRI.getVRegDef(Val); 2138 switch (Def->getOpcode()) { 2139 case AMDGPU::G_FRAME_INDEX: 2140 case AMDGPU::G_GLOBAL_VALUE: 2141 case AMDGPU::G_BLOCK_ADDR: 2142 return true; 2143 case AMDGPU::G_CONSTANT: { 2144 const ConstantInt *CI = Def->getOperand(1).getCImm(); 2145 return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace); 2146 } 2147 default: 2148 return false; 2149 } 2150 2151 return false; 2152 } 2153 2154 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 2155 MachineInstr &MI, MachineRegisterInfo &MRI, 2156 MachineIRBuilder &B) const { 2157 MachineFunction &MF = B.getMF(); 2158 2159 const LLT S32 = LLT::scalar(32); 2160 Register Dst = MI.getOperand(0).getReg(); 2161 Register Src = MI.getOperand(1).getReg(); 2162 2163 LLT DstTy = MRI.getType(Dst); 2164 LLT SrcTy = MRI.getType(Src); 2165 unsigned DestAS = DstTy.getAddressSpace(); 2166 unsigned SrcAS = SrcTy.getAddressSpace(); 2167 2168 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 2169 // vector element. 2170 assert(!DstTy.isVector()); 2171 2172 const AMDGPUTargetMachine &TM 2173 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 2174 2175 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 2176 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 2177 return true; 2178 } 2179 2180 if (SrcAS == AMDGPUAS::FLAT_ADDRESS && 2181 (DestAS == AMDGPUAS::LOCAL_ADDRESS || 2182 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) { 2183 if (isKnownNonNull(Src, MRI, TM, SrcAS)) { 2184 // Extract low 32-bits of the pointer. 
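// (When the source is provably non-null, truncating to the low 32 bits is
// the entire flat->LDS/private cast; the compare-and-select against the
// segment null value further down is only needed for possibly-null inputs.)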
2185 B.buildExtract(Dst, Src, 0); 2186 MI.eraseFromParent(); 2187 return true; 2188 } 2189 2190 unsigned NullVal = TM.getNullPointerValue(DestAS); 2191 2192 auto SegmentNull = B.buildConstant(DstTy, NullVal); 2193 auto FlatNull = B.buildConstant(SrcTy, 0); 2194 2195 // Extract low 32-bits of the pointer. 2196 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 2197 2198 auto CmpRes = 2199 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 2200 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 2201 2202 MI.eraseFromParent(); 2203 return true; 2204 } 2205 2206 if (DestAS == AMDGPUAS::FLAT_ADDRESS && 2207 (SrcAS == AMDGPUAS::LOCAL_ADDRESS || 2208 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) { 2209 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 2210 if (!ApertureReg.isValid()) 2211 return false; 2212 2213 // Coerce the type of the low half of the result so we can use merge_values. 2214 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 2215 2216 // TODO: Should we allow mismatched types but matching sizes in merges to 2217 // avoid the ptrtoint? 2218 auto BuildPtr = B.buildMergeLikeInstr(DstTy, {SrcAsInt, ApertureReg}); 2219 2220 if (isKnownNonNull(Src, MRI, TM, SrcAS)) { 2221 B.buildCopy(Dst, BuildPtr); 2222 MI.eraseFromParent(); 2223 return true; 2224 } 2225 2226 auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 2227 auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 2228 2229 auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, 2230 SegmentNull.getReg(0)); 2231 2232 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 2233 2234 MI.eraseFromParent(); 2235 return true; 2236 } 2237 2238 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT && 2239 SrcTy.getSizeInBits() == 64) { 2240 // Truncate. 2241 B.buildExtract(Dst, Src, 0); 2242 MI.eraseFromParent(); 2243 return true; 2244 } 2245 2246 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT && 2247 DstTy.getSizeInBits() == 64) { 2248 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 2249 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 2250 auto PtrLo = B.buildPtrToInt(S32, Src); 2251 auto HighAddr = B.buildConstant(S32, AddrHiVal); 2252 B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr}); 2253 MI.eraseFromParent(); 2254 return true; 2255 } 2256 2257 DiagnosticInfoUnsupported InvalidAddrSpaceCast( 2258 MF.getFunction(), "invalid addrspacecast", B.getDebugLoc()); 2259 2260 LLVMContext &Ctx = MF.getFunction().getContext(); 2261 Ctx.diagnose(InvalidAddrSpaceCast); 2262 B.buildUndef(Dst); 2263 MI.eraseFromParent(); 2264 return true; 2265 } 2266 2267 bool AMDGPULegalizerInfo::legalizeFrint( 2268 MachineInstr &MI, MachineRegisterInfo &MRI, 2269 MachineIRBuilder &B) const { 2270 Register Src = MI.getOperand(1).getReg(); 2271 LLT Ty = MRI.getType(Src); 2272 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 2273 2274 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 2275 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 2276 2277 auto C1 = B.buildFConstant(Ty, C1Val); 2278 auto CopySign = B.buildFCopysign(Ty, C1, Src); 2279 2280 // TODO: Should this propagate fast-math-flags? 
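// The add/sub of copysign(2^52, src) below relies on the default
// round-to-nearest mode: any f64 with magnitude >= 2^52 has no fraction
// bits, so the addition rounds src to a whole number and the subtraction
// recovers it. The compare against the largest double below 2^52 then keeps
// the original value when it is already too big to carry a fraction.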
2281 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 2282 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 2283 2284 auto C2 = B.buildFConstant(Ty, C2Val); 2285 auto Fabs = B.buildFAbs(Ty, Src); 2286 2287 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 2288 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 2289 MI.eraseFromParent(); 2290 return true; 2291 } 2292 2293 bool AMDGPULegalizerInfo::legalizeFceil( 2294 MachineInstr &MI, MachineRegisterInfo &MRI, 2295 MachineIRBuilder &B) const { 2296 2297 const LLT S1 = LLT::scalar(1); 2298 const LLT S64 = LLT::scalar(64); 2299 2300 Register Src = MI.getOperand(1).getReg(); 2301 assert(MRI.getType(Src) == S64); 2302 2303 // result = trunc(src) 2304 // if (src > 0.0 && src != result) 2305 // result += 1.0 2306 2307 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 2308 2309 const auto Zero = B.buildFConstant(S64, 0.0); 2310 const auto One = B.buildFConstant(S64, 1.0); 2311 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 2312 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 2313 auto And = B.buildAnd(S1, Lt0, NeTrunc); 2314 auto Add = B.buildSelect(S64, And, One, Zero); 2315 2316 // TODO: Should this propagate fast-math-flags? 2317 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 2318 MI.eraseFromParent(); 2319 return true; 2320 } 2321 2322 bool AMDGPULegalizerInfo::legalizeFrem( 2323 MachineInstr &MI, MachineRegisterInfo &MRI, 2324 MachineIRBuilder &B) const { 2325 Register DstReg = MI.getOperand(0).getReg(); 2326 Register Src0Reg = MI.getOperand(1).getReg(); 2327 Register Src1Reg = MI.getOperand(2).getReg(); 2328 auto Flags = MI.getFlags(); 2329 LLT Ty = MRI.getType(DstReg); 2330 2331 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); 2332 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); 2333 auto Neg = B.buildFNeg(Ty, Trunc, Flags); 2334 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); 2335 MI.eraseFromParent(); 2336 return true; 2337 } 2338 2339 static MachineInstrBuilder extractF64Exponent(Register Hi, 2340 MachineIRBuilder &B) { 2341 const unsigned FractBits = 52; 2342 const unsigned ExpBits = 11; 2343 LLT S32 = LLT::scalar(32); 2344 2345 auto Const0 = B.buildConstant(S32, FractBits - 32); 2346 auto Const1 = B.buildConstant(S32, ExpBits); 2347 2348 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 2349 .addUse(Hi) 2350 .addUse(Const0.getReg(0)) 2351 .addUse(Const1.getReg(0)); 2352 2353 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 2354 } 2355 2356 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 2357 MachineInstr &MI, MachineRegisterInfo &MRI, 2358 MachineIRBuilder &B) const { 2359 const LLT S1 = LLT::scalar(1); 2360 const LLT S32 = LLT::scalar(32); 2361 const LLT S64 = LLT::scalar(64); 2362 2363 Register Src = MI.getOperand(1).getReg(); 2364 assert(MRI.getType(Src) == S64); 2365 2366 // TODO: Should this use extract since the low half is unused? 2367 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2368 Register Hi = Unmerge.getReg(1); 2369 2370 // Extract the upper half, since this is where we will find the sign and 2371 // exponent. 2372 auto Exp = extractF64Exponent(Hi, B); 2373 2374 const unsigned FractBits = 52; 2375 2376 // Extract the sign bit. 2377 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 2378 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 2379 2380 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 2381 2382 const auto Zero32 = B.buildConstant(S32, 0); 2383 2384 // Extend back to 64-bits. 
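// (The sign bit sits in the high word, so the low word is zero.) The mask
// computed next, ~(((1 << 52) - 1) >> exp), clears exactly the fractional
// mantissa bits for exponents in [0, 51]; exp < 0 means |src| < 1 and the
// result collapses to a signed zero, while exp > 51 means src is already an
// integer and is returned unchanged.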
2385 auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit}); 2386 2387 auto Shr = B.buildAShr(S64, FractMask, Exp); 2388 auto Not = B.buildNot(S64, Shr); 2389 auto Tmp0 = B.buildAnd(S64, Src, Not); 2390 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 2391 2392 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 2393 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 2394 2395 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 2396 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 2397 MI.eraseFromParent(); 2398 return true; 2399 } 2400 2401 bool AMDGPULegalizerInfo::legalizeITOFP( 2402 MachineInstr &MI, MachineRegisterInfo &MRI, 2403 MachineIRBuilder &B, bool Signed) const { 2404 2405 Register Dst = MI.getOperand(0).getReg(); 2406 Register Src = MI.getOperand(1).getReg(); 2407 2408 const LLT S64 = LLT::scalar(64); 2409 const LLT S32 = LLT::scalar(32); 2410 2411 assert(MRI.getType(Src) == S64); 2412 2413 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2414 auto ThirtyTwo = B.buildConstant(S32, 32); 2415 2416 if (MRI.getType(Dst) == S64) { 2417 auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1)) 2418 : B.buildUITOFP(S64, Unmerge.getReg(1)); 2419 2420 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 2421 auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo); 2422 2423 // TODO: Should this propagate fast-math-flags? 2424 B.buildFAdd(Dst, LdExp, CvtLo); 2425 MI.eraseFromParent(); 2426 return true; 2427 } 2428 2429 assert(MRI.getType(Dst) == S32); 2430 2431 auto One = B.buildConstant(S32, 1); 2432 2433 MachineInstrBuilder ShAmt; 2434 if (Signed) { 2435 auto ThirtyOne = B.buildConstant(S32, 31); 2436 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1)); 2437 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne); 2438 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign); 2439 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32}, 2440 /*HasSideEffects=*/false) 2441 .addUse(Unmerge.getReg(1)); 2442 auto LS2 = B.buildSub(S32, LS, One); 2443 ShAmt = B.buildUMin(S32, LS2, MaxShAmt); 2444 } else 2445 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1)); 2446 auto Norm = B.buildShl(S64, Src, ShAmt); 2447 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm); 2448 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0)); 2449 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust); 2450 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2); 2451 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt); 2452 B.buildFLdexp(Dst, FVal, Scale); 2453 MI.eraseFromParent(); 2454 return true; 2455 } 2456 2457 // TODO: Copied from DAG implementation. Verify logic and document how this 2458 // actually works. 2459 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI, 2460 MachineRegisterInfo &MRI, 2461 MachineIRBuilder &B, 2462 bool Signed) const { 2463 2464 Register Dst = MI.getOperand(0).getReg(); 2465 Register Src = MI.getOperand(1).getReg(); 2466 2467 const LLT S64 = LLT::scalar(64); 2468 const LLT S32 = LLT::scalar(32); 2469 2470 const LLT SrcLT = MRI.getType(Src); 2471 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64); 2472 2473 unsigned Flags = MI.getFlags(); 2474 2475 // The basic idea of converting a floating point number into a pair of 32-bit 2476 // integers is illustrated as follows: 2477 // 2478 // tf := trunc(val); 2479 // hif := floor(tf * 2^-32); 2480 // lof := tf - hif * 2^32; // lof is always positive due to floor. 
2481 // hi := fptoi(hif); 2482 // lo := fptoi(lof); 2483 // 2484 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags); 2485 MachineInstrBuilder Sign; 2486 if (Signed && SrcLT == S32) { 2487 // However, a 32-bit floating point number has only 23 bits mantissa and 2488 // it's not enough to hold all the significant bits of `lof` if val is 2489 // negative. To avoid the loss of precision, We need to take the absolute 2490 // value after truncating and flip the result back based on the original 2491 // signedness. 2492 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31)); 2493 Trunc = B.buildFAbs(S32, Trunc, Flags); 2494 } 2495 MachineInstrBuilder K0, K1; 2496 if (SrcLT == S64) { 2497 K0 = B.buildFConstant( 2498 S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000))); 2499 K1 = B.buildFConstant( 2500 S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000))); 2501 } else { 2502 K0 = B.buildFConstant( 2503 S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000))); 2504 K1 = B.buildFConstant( 2505 S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000))); 2506 } 2507 2508 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags); 2509 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags); 2510 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags); 2511 2512 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul) 2513 : B.buildFPTOUI(S32, FloorMul); 2514 auto Lo = B.buildFPTOUI(S32, Fma); 2515 2516 if (Signed && SrcLT == S32) { 2517 // Flip the result based on the signedness, which is either all 0s or 1s. 2518 Sign = B.buildMergeLikeInstr(S64, {Sign, Sign}); 2519 // r := xor({lo, hi}, sign) - sign; 2520 B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign), 2521 Sign); 2522 } else 2523 B.buildMergeLikeInstr(Dst, {Lo, Hi}); 2524 MI.eraseFromParent(); 2525 2526 return true; 2527 } 2528 2529 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 2530 MachineInstr &MI) const { 2531 MachineFunction &MF = Helper.MIRBuilder.getMF(); 2532 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2533 2534 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 2535 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 2536 2537 // With ieee_mode disabled, the instructions have the correct behavior 2538 // already for G_FMINNUM/G_FMAXNUM 2539 if (!MFI->getMode().IEEE) 2540 return !IsIEEEOp; 2541 2542 if (IsIEEEOp) 2543 return true; 2544 2545 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 2546 } 2547 2548 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 2549 MachineInstr &MI, MachineRegisterInfo &MRI, 2550 MachineIRBuilder &B) const { 2551 // TODO: Should move some of this into LegalizerHelper. 2552 2553 // TODO: Promote dynamic indexing of s16 to s32 2554 2555 Register Dst = MI.getOperand(0).getReg(); 2556 Register Vec = MI.getOperand(1).getReg(); 2557 2558 LLT VecTy = MRI.getType(Vec); 2559 LLT EltTy = VecTy.getElementType(); 2560 assert(EltTy == MRI.getType(Dst)); 2561 2562 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts 2563 // but we can't go directly to that logic becasue you can't bitcast a vector 2564 // of pointers to a vector of integers. Therefore, introduce an intermediate 2565 // vector of integers using ptrtoint (and inttoptr on the output) in order to 2566 // drive the legalization forward. 
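// For example, an extract from <4 x p8> (128-bit buffer resource pointers)
// becomes an extract from <4 x s128> followed by an inttoptr of the element.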
2567 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) { 2568 LLT IntTy = LLT::scalar(EltTy.getSizeInBits()); 2569 LLT IntVecTy = VecTy.changeElementType(IntTy); 2570 2571 auto IntVec = B.buildPtrToInt(IntVecTy, Vec); 2572 auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2)); 2573 B.buildIntToPtr(Dst, IntElt); 2574 2575 MI.eraseFromParent(); 2576 return true; 2577 } 2578 2579 // FIXME: Artifact combiner probably should have replaced the truncated 2580 // constant before this, so we shouldn't need 2581 // getIConstantVRegValWithLookThrough. 2582 std::optional<ValueAndVReg> MaybeIdxVal = 2583 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); 2584 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2585 return true; 2586 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue(); 2587 2588 if (IdxVal < VecTy.getNumElements()) { 2589 auto Unmerge = B.buildUnmerge(EltTy, Vec); 2590 B.buildCopy(Dst, Unmerge.getReg(IdxVal)); 2591 } else { 2592 B.buildUndef(Dst); 2593 } 2594 2595 MI.eraseFromParent(); 2596 return true; 2597 } 2598 2599 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2600 MachineInstr &MI, MachineRegisterInfo &MRI, 2601 MachineIRBuilder &B) const { 2602 // TODO: Should move some of this into LegalizerHelper. 2603 2604 // TODO: Promote dynamic indexing of s16 to s32 2605 2606 Register Dst = MI.getOperand(0).getReg(); 2607 Register Vec = MI.getOperand(1).getReg(); 2608 Register Ins = MI.getOperand(2).getReg(); 2609 2610 LLT VecTy = MRI.getType(Vec); 2611 LLT EltTy = VecTy.getElementType(); 2612 assert(EltTy == MRI.getType(Ins)); 2613 2614 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts 2615 // but we can't go directly to that logic because you can't bitcast a vector 2616 // of pointers to a vector of integers. Therefore, make the pointer vector 2617 // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd 2618 // new value, and then inttoptr the result vector back. This will then allow 2619 // the rest of legalization to take over. 2620 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) { 2621 LLT IntTy = LLT::scalar(EltTy.getSizeInBits()); 2622 LLT IntVecTy = VecTy.changeElementType(IntTy); 2623 2624 auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec); 2625 auto IntIns = B.buildPtrToInt(IntTy, Ins); 2626 auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns, 2627 MI.getOperand(3)); 2628 B.buildIntToPtr(Dst, IntVecDest); 2629 MI.eraseFromParent(); 2630 return true; 2631 } 2632 2633 // FIXME: Artifact combiner probably should have replaced the truncated 2634 // constant before this, so we shouldn't need 2635 // getIConstantVRegValWithLookThrough. 2636 std::optional<ValueAndVReg> MaybeIdxVal = 2637 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); 2638 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2639 return true; 2640 2641 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue(); 2642 2643 unsigned NumElts = VecTy.getNumElements(); 2644 if (IdxVal < NumElts) { 2645 SmallVector<Register, 8> SrcRegs; 2646 for (unsigned i = 0; i < NumElts; ++i) 2647 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy)); 2648 B.buildUnmerge(SrcRegs, Vec); 2649 2650 SrcRegs[IdxVal] = MI.getOperand(2).getReg(); 2651 B.buildMergeLikeInstr(Dst, SrcRegs); 2652 } else { 2653 B.buildUndef(Dst); 2654 } 2655 2656 MI.eraseFromParent(); 2657 return true; 2658 } 2659 2660 bool AMDGPULegalizerInfo::legalizeSinCos( 2661 MachineInstr &MI, MachineRegisterInfo &MRI, 2662 MachineIRBuilder &B) const { 2663 2664 Register DstReg = MI.getOperand(0).getReg(); 2665 Register SrcReg = MI.getOperand(1).getReg(); 2666 LLT Ty = MRI.getType(DstReg); 2667 unsigned Flags = MI.getFlags(); 2668 2669 Register TrigVal; 2670 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2671 if (ST.hasTrigReducedRange()) { 2672 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2673 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2674 .addUse(MulVal.getReg(0)) 2675 .setMIFlags(Flags).getReg(0); 2676 } else 2677 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2678 2679 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 2680 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2681 B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg), false) 2682 .addUse(TrigVal) 2683 .setMIFlags(Flags); 2684 MI.eraseFromParent(); 2685 return true; 2686 } 2687 2688 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2689 MachineIRBuilder &B, 2690 const GlobalValue *GV, 2691 int64_t Offset, 2692 unsigned GAFlags) const { 2693 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2694 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2695 // to the following code sequence: 2696 // 2697 // For constant address space: 2698 // s_getpc_b64 s[0:1] 2699 // s_add_u32 s0, s0, $symbol 2700 // s_addc_u32 s1, s1, 0 2701 // 2702 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2703 // a fixup or relocation is emitted to replace $symbol with a literal 2704 // constant, which is a pc-relative offset from the encoding of the $symbol 2705 // operand to the global variable. 2706 // 2707 // For global address space: 2708 // s_getpc_b64 s[0:1] 2709 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2710 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2711 // 2712 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2713 // fixups or relocations are emitted to replace $symbol@*@lo and 2714 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2715 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2716 // operand to the global variable. 2717 // 2718 // What we want here is an offset from the value returned by s_getpc 2719 // (which is the address of the s_add_u32 instruction) to the global 2720 // variable, but since the encoding of $symbol starts 4 bytes after the start 2721 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2722 // small. This requires us to add 4 to the global variable offset in order to 2723 // compute the correct address. Similarly for the s_addc_u32 instruction, the 2724 // encoding of $symbol starts 12 bytes after the start of the s_add_u32 2725 // instruction. 
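// Concretely, that is why the code below adds 4 to the offset used for the
// s_add_u32 operand and 12 to the offset used for the s_addc_u32 operand.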
2726 2727 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2728 2729 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 2730 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2731 2732 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2733 .addDef(PCReg); 2734 2735 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2736 if (GAFlags == SIInstrInfo::MO_NONE) 2737 MIB.addImm(0); 2738 else 2739 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1); 2740 2741 if (!B.getMRI()->getRegClassOrNull(PCReg)) 2742 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2743 2744 if (PtrTy.getSizeInBits() == 32) 2745 B.buildExtract(DstReg, PCReg, 0); 2746 return true; 2747 } 2748 2749 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2750 MachineInstr &MI, MachineRegisterInfo &MRI, 2751 MachineIRBuilder &B) const { 2752 Register DstReg = MI.getOperand(0).getReg(); 2753 LLT Ty = MRI.getType(DstReg); 2754 unsigned AS = Ty.getAddressSpace(); 2755 2756 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2757 MachineFunction &MF = B.getMF(); 2758 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2759 2760 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2761 if (!MFI->isModuleEntryFunction() && 2762 !GV->getName().equals("llvm.amdgcn.module.lds")) { 2763 const Function &Fn = MF.getFunction(); 2764 DiagnosticInfoUnsupported BadLDSDecl( 2765 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2766 DS_Warning); 2767 Fn.getContext().diagnose(BadLDSDecl); 2768 2769 // We currently don't have a way to correctly allocate LDS objects that 2770 // aren't directly associated with a kernel. We do force inlining of 2771 // functions that use local objects. However, if these dead functions are 2772 // not eliminated, we don't want a compile time error. Just emit a warning 2773 // and a trap, since there should be no callable path here. 2774 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2775 B.buildUndef(DstReg); 2776 MI.eraseFromParent(); 2777 return true; 2778 } 2779 2780 // TODO: We could emit code to handle the initialization somewhere. 2781 // We ignore the initializer for now and legalize it to allow selection. 2782 // The initializer will be rejected with an error during assembly emission anyway. 2783 const SITargetLowering *TLI = ST.getTargetLowering(); 2784 if (!TLI->shouldUseLDSConstAddress(GV)) { 2785 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); 2786 return true; // Leave in place. 2787 } 2788 2789 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) { 2790 Type *Ty = GV->getValueType(); 2791 // HIP uses an unsized array `extern __shared__ T s[]` or a similar 2792 // zero-sized type in other languages to declare dynamic shared 2793 // memory whose size is not known at compile time. Such arrays are 2794 // allocated by the runtime and placed directly after the statically 2795 // allocated ones. They all share the same offset. 2796 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) { 2797 // Adjust alignment for that dynamic shared memory array.
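// The address of such a dynamic array is simply the kernel's total static
// LDS size, which is what the amdgcn.groupstaticsize intrinsic below
// materializes before being cast back to a pointer.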
2798 MFI->setDynLDSAlign(MF.getFunction(), *cast<GlobalVariable>(GV)); 2799 LLT S32 = LLT::scalar(32); 2800 auto Sz = 2801 B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false); 2802 B.buildIntToPtr(DstReg, Sz); 2803 MI.eraseFromParent(); 2804 return true; 2805 } 2806 } 2807 2808 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), 2809 *cast<GlobalVariable>(GV))); 2810 MI.eraseFromParent(); 2811 return true; 2812 } 2813 2814 const SITargetLowering *TLI = ST.getTargetLowering(); 2815 2816 if (TLI->shouldEmitFixup(GV)) { 2817 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 2818 MI.eraseFromParent(); 2819 return true; 2820 } 2821 2822 if (TLI->shouldEmitPCReloc(GV)) { 2823 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 2824 MI.eraseFromParent(); 2825 return true; 2826 } 2827 2828 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2829 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 2830 2831 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty; 2832 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 2833 MachinePointerInfo::getGOT(MF), 2834 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2835 MachineMemOperand::MOInvariant, 2836 LoadTy, Align(8)); 2837 2838 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 2839 2840 if (Ty.getSizeInBits() == 32) { 2841 // Truncate if this is a 32-bit constant address. 2842 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2843 B.buildExtract(DstReg, Load, 0); 2844 } else 2845 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2846 2847 MI.eraseFromParent(); 2848 return true; 2849 } 2850 2851 static LLT widenToNextPowerOf2(LLT Ty) { 2852 if (Ty.isVector()) 2853 return Ty.changeElementCount( 2854 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements()))); 2855 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); 2856 } 2857 2858 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, 2859 MachineInstr &MI) const { 2860 MachineIRBuilder &B = Helper.MIRBuilder; 2861 MachineRegisterInfo &MRI = *B.getMRI(); 2862 GISelChangeObserver &Observer = Helper.Observer; 2863 2864 Register PtrReg = MI.getOperand(1).getReg(); 2865 LLT PtrTy = MRI.getType(PtrReg); 2866 unsigned AddrSpace = PtrTy.getAddressSpace(); 2867 2868 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 2869 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2870 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); 2871 Observer.changingInstr(MI); 2872 MI.getOperand(1).setReg(Cast.getReg(0)); 2873 Observer.changedInstr(MI); 2874 return true; 2875 } 2876 2877 if (MI.getOpcode() != AMDGPU::G_LOAD) 2878 return false; 2879 2880 Register ValReg = MI.getOperand(0).getReg(); 2881 LLT ValTy = MRI.getType(ValReg); 2882 2883 if (hasBufferRsrcWorkaround(ValTy)) { 2884 Observer.changingInstr(MI); 2885 castBufferRsrcFromV4I32(MI, B, MRI, 0); 2886 Observer.changedInstr(MI); 2887 return true; 2888 } 2889 2890 MachineMemOperand *MMO = *MI.memoperands_begin(); 2891 const unsigned ValSize = ValTy.getSizeInBits(); 2892 const LLT MemTy = MMO->getMemoryType(); 2893 const Align MemAlign = MMO->getAlign(); 2894 const unsigned MemSize = MemTy.getSizeInBits(); 2895 const uint64_t AlignInBits = 8 * MemAlign.value(); 2896 2897 // Widen non-power-of-2 loads to the alignment if needed 2898 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) { 2899 const unsigned WideMemSize = PowerOf2Ceil(MemSize); 2900 2901 // This was already the correct extending load result type, so just adjust 2902 // the 
memory type. 2903 if (WideMemSize == ValSize) { 2904 MachineFunction &MF = B.getMF(); 2905 2906 MachineMemOperand *WideMMO = 2907 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); 2908 Observer.changingInstr(MI); 2909 MI.setMemRefs(MF, {WideMMO}); 2910 Observer.changedInstr(MI); 2911 return true; 2912 } 2913 2914 // Don't bother handling edge case that should probably never be produced. 2915 if (ValSize > WideMemSize) 2916 return false; 2917 2918 LLT WideTy = widenToNextPowerOf2(ValTy); 2919 2920 Register WideLoad; 2921 if (!WideTy.isVector()) { 2922 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2923 B.buildTrunc(ValReg, WideLoad).getReg(0); 2924 } else { 2925 // Extract the subvector. 2926 2927 if (isRegisterType(ValTy)) { 2928 // If this a case where G_EXTRACT is legal, use it. 2929 // (e.g. <3 x s32> -> <4 x s32>) 2930 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2931 B.buildExtract(ValReg, WideLoad, 0); 2932 } else { 2933 // For cases where the widened type isn't a nice register value, unmerge 2934 // from a widened register (e.g. <3 x s16> -> <4 x s16>) 2935 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2936 B.buildDeleteTrailingVectorElements(ValReg, WideLoad); 2937 } 2938 } 2939 2940 MI.eraseFromParent(); 2941 return true; 2942 } 2943 2944 return false; 2945 } 2946 2947 bool AMDGPULegalizerInfo::legalizeStore(LegalizerHelper &Helper, 2948 MachineInstr &MI) const { 2949 MachineIRBuilder &B = Helper.MIRBuilder; 2950 MachineRegisterInfo &MRI = *B.getMRI(); 2951 GISelChangeObserver &Observer = Helper.Observer; 2952 2953 Register DataReg = MI.getOperand(0).getReg(); 2954 LLT DataTy = MRI.getType(DataReg); 2955 2956 if (hasBufferRsrcWorkaround(DataTy)) { 2957 Observer.changingInstr(MI); 2958 castBufferRsrcArgToV4I32(MI, B, 0); 2959 Observer.changedInstr(MI); 2960 return true; 2961 } 2962 return false; 2963 } 2964 2965 bool AMDGPULegalizerInfo::legalizeFMad( 2966 MachineInstr &MI, MachineRegisterInfo &MRI, 2967 MachineIRBuilder &B) const { 2968 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2969 assert(Ty.isScalar()); 2970 2971 MachineFunction &MF = B.getMF(); 2972 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2973 2974 // TODO: Always legal with future ftz flag. 2975 // FIXME: Do we need just output? 
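// G_FMAD is only kept legal here when denormals are flushed (preserve-sign)
// for the result type, presumably because the underlying mad/mac instructions
// flush denormals; otherwise it is expanded into separate multiply and add by
// the generic lowerFMad path below.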
2976 if (Ty == LLT::scalar(32) && 2977 MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign()) 2978 return true; 2979 if (Ty == LLT::scalar(16) && 2980 MFI->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign()) 2981 return true; 2982 2983 MachineIRBuilder HelperBuilder(MI); 2984 GISelObserverWrapper DummyObserver; 2985 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2986 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2987 } 2988 2989 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2990 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2991 Register DstReg = MI.getOperand(0).getReg(); 2992 Register PtrReg = MI.getOperand(1).getReg(); 2993 Register CmpVal = MI.getOperand(2).getReg(); 2994 Register NewVal = MI.getOperand(3).getReg(); 2995 2996 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2997 "this should not have been custom lowered"); 2998 2999 LLT ValTy = MRI.getType(CmpVal); 3000 LLT VecTy = LLT::fixed_vector(2, ValTy); 3001 3002 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 3003 3004 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 3005 .addDef(DstReg) 3006 .addUse(PtrReg) 3007 .addUse(PackedVal) 3008 .setMemRefs(MI.memoperands()); 3009 3010 MI.eraseFromParent(); 3011 return true; 3012 } 3013 3014 /// Return true if it's known that \p Src can never be an f32 denormal value. 3015 static bool valueIsKnownNeverF32Denorm(const MachineRegisterInfo &MRI, 3016 Register Src) { 3017 Register ExtSrc; 3018 if (mi_match(Src, MRI, m_GFPExt(m_Reg(ExtSrc)))) 3019 return MRI.getType(ExtSrc) == LLT::scalar(16); 3020 return false; 3021 } 3022 3023 static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) { 3024 if (Flags & MachineInstr::FmAfn) 3025 return true; 3026 const auto &Options = MF.getTarget().Options; 3027 return Options.UnsafeFPMath || Options.ApproxFuncFPMath; 3028 } 3029 3030 static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src, 3031 unsigned Flags) { 3032 return !valueIsKnownNeverF32Denorm(MF.getRegInfo(), Src) && 3033 MF.getDenormalMode(APFloat::IEEEsingle()).Input != 3034 DenormalMode::PreserveSign; 3035 } 3036 3037 std::pair<Register, Register> 3038 AMDGPULegalizerInfo::getScaledLogInput(MachineIRBuilder &B, Register Src, 3039 unsigned Flags) const { 3040 if (allowApproxFunc(B.getMF(), Flags) || 3041 !needsDenormHandlingF32(B.getMF(), Src, Flags)) 3042 return {}; 3043 3044 const LLT F32 = LLT::scalar(32); 3045 auto SmallestNormal = B.buildFConstant( 3046 F32, APFloat::getSmallestNormalized(APFloat::IEEEsingle())); 3047 auto IsLtSmallestNormal = 3048 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, SmallestNormal); 3049 3050 auto Scale32 = B.buildFConstant(F32, 0x1.0p+32); 3051 auto One = B.buildFConstant(F32, 1.0); 3052 auto ScaleFactor = 3053 B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags); 3054 auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags); 3055 3056 return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)}; 3057 } 3058 3059 bool AMDGPULegalizerInfo::legalizeFlog2(MachineInstr &MI, 3060 MachineIRBuilder &B) const { 3061 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals. 3062 // If we have to handle denormals, scale up the input and adjust the result. 3063 3064 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0) 3065 // log2 = amdgpu_log2 - (is_denormal ? 
32.0 : 0.0) 3066 3067 Register Dst = MI.getOperand(0).getReg(); 3068 Register Src = MI.getOperand(1).getReg(); 3069 LLT Ty = B.getMRI()->getType(Dst); 3070 unsigned Flags = MI.getFlags(); 3071 3072 if (Ty == LLT::scalar(16)) { 3073 const LLT F32 = LLT::scalar(32); 3074 // Nothing in half is a denormal when promoted to f32. 3075 auto Ext = B.buildFPExt(F32, Src, Flags); 3076 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32}, false) 3077 .addUse(Ext.getReg(0)) 3078 .setMIFlags(Flags); 3079 B.buildFPTrunc(Dst, Log2, Flags); 3080 MI.eraseFromParent(); 3081 return true; 3082 } 3083 3084 assert(Ty == LLT::scalar(32)); 3085 3086 auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags); 3087 if (!ScaledInput) { 3088 B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)}, false) 3089 .addUse(Src) 3090 .setMIFlags(Flags); 3091 MI.eraseFromParent(); 3092 return true; 3093 } 3094 3095 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}, false) 3096 .addUse(ScaledInput) 3097 .setMIFlags(Flags); 3098 3099 auto ThirtyTwo = B.buildFConstant(Ty, 32.0); 3100 auto Zero = B.buildFConstant(Ty, 0.0); 3101 auto ResultOffset = 3102 B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags); 3103 B.buildFSub(Dst, Log2, ResultOffset, Flags); 3104 3105 MI.eraseFromParent(); 3106 return true; 3107 } 3108 3109 static Register getMad(MachineIRBuilder &B, LLT Ty, Register X, Register Y, 3110 Register Z, unsigned Flags) { 3111 auto FMul = B.buildFMul(Ty, X, Y, Flags); 3112 return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0); 3113 } 3114 3115 bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI, 3116 MachineIRBuilder &B) const { 3117 const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10; 3118 assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG); 3119 3120 MachineRegisterInfo &MRI = *B.getMRI(); 3121 Register Dst = MI.getOperand(0).getReg(); 3122 Register X = MI.getOperand(1).getReg(); 3123 unsigned Flags = MI.getFlags(); 3124 const LLT Ty = MRI.getType(X); 3125 MachineFunction &MF = B.getMF(); 3126 3127 const LLT F32 = LLT::scalar(32); 3128 const LLT F16 = LLT::scalar(16); 3129 3130 const AMDGPUTargetMachine &TM = 3131 static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 3132 3133 if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn) || 3134 TM.Options.ApproxFuncFPMath || TM.Options.UnsafeFPMath) { 3135 const double Log2BaseInv = 3136 IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2; 3137 3138 if (Ty == F16 && !ST.has16BitInsts()) { 3139 Register LogVal = MRI.createGenericVirtualRegister(F32); 3140 auto PromoteSrc = B.buildFPExt(F32, X); 3141 legalizeFlogUnsafe(B, LogVal, PromoteSrc.getReg(0), Log2BaseInv, Flags); 3142 B.buildFPTrunc(Dst, LogVal); 3143 } else { 3144 legalizeFlogUnsafe(B, Dst, X, Log2BaseInv, Flags); 3145 } 3146 3147 MI.eraseFromParent(); 3148 return true; 3149 } 3150 3151 auto [ScaledInput, IsScaled] = getScaledLogInput(B, X, Flags); 3152 if (ScaledInput) 3153 X = ScaledInput; 3154 3155 auto Y = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}, false) 3156 .addUse(X) 3157 .setMIFlags(Flags); 3158 3159 Register R; 3160 if (ST.hasFastFMAF32()) { 3161 // c+cc are ln(2)/ln(10) to more than 49 bits 3162 const float c_log10 = 0x1.344134p-2f; 3163 const float cc_log10 = 0x1.09f79ep-26f; 3164 3165 // c + cc is ln(2) to more than 49 bits 3166 const float c_log = 0x1.62e42ep-1f; 3167 const float cc_log = 0x1.efa39ep-25f; 3168 3169 auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log); 3170 auto CC = B.buildFConstant(Ty, IsLog10 ? 
cc_log10 : cc_log); 3171 3172 R = B.buildFMul(Ty, Y, C, Flags).getReg(0); 3173 auto NegR = B.buildFNeg(Ty, R, Flags); 3174 auto FMA0 = B.buildFMA(Ty, Y, C, NegR, Flags); 3175 auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, Flags); 3176 R = B.buildFAdd(Ty, R, FMA1, Flags).getReg(0); 3177 } else { 3178 // ch+ct is ln(2)/ln(10) to more than 36 bits 3179 const float ch_log10 = 0x1.344000p-2f; 3180 const float ct_log10 = 0x1.3509f6p-18f; 3181 3182 // ch + ct is ln(2) to more than 36 bits 3183 const float ch_log = 0x1.62e000p-1f; 3184 const float ct_log = 0x1.0bfbe8p-15f; 3185 3186 auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log); 3187 auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log); 3188 3189 auto MaskConst = B.buildConstant(Ty, 0xfffff000); 3190 auto YH = B.buildAnd(Ty, Y, MaskConst); 3191 auto YT = B.buildFSub(Ty, Y, YH, Flags); 3192 auto YTCT = B.buildFMul(Ty, YT, CT, Flags); 3193 3194 Register Mad0 = 3195 getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), Flags); 3196 Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, Flags); 3197 R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, Flags); 3198 } 3199 3200 const bool IsFiniteOnly = 3201 (MI.getFlag(MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath) && 3202 (MI.getFlag(MachineInstr::FmNoInfs) || TM.Options.NoInfsFPMath); 3203 3204 if (!IsFiniteOnly) { 3205 // Expand isfinite(x) => fabs(x) < inf 3206 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle())); 3207 auto Fabs = B.buildFAbs(Ty, Y); 3208 auto IsFinite = 3209 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags); 3210 R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0); 3211 } 3212 3213 if (ScaledInput) { 3214 auto Zero = B.buildFConstant(Ty, 0.0); 3215 auto ShiftK = 3216 B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f); 3217 auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags); 3218 B.buildFSub(Dst, R, Shift, Flags); 3219 } else { 3220 B.buildCopy(Dst, R); 3221 } 3222 3223 MI.eraseFromParent(); 3224 return true; 3225 } 3226 3227 bool AMDGPULegalizerInfo::legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst, 3228 Register Src, 3229 double Log2BaseInverted, 3230 unsigned Flags) const { 3231 LLT Ty = B.getMRI()->getType(Dst); 3232 auto Log2Operand = Ty == LLT::scalar(16) 3233 ? B.buildFLog2(Ty, Src, Flags) 3234 : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}, false) 3235 .addUse(Src) 3236 .setMIFlags(Flags); 3237 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 3238 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 3239 return true; 3240 } 3241 3242 bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI, 3243 MachineIRBuilder &B) const { 3244 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals. 3245 // If we have to handle denormals, scale up the input and adjust the result. 3246 3247 Register Dst = MI.getOperand(0).getReg(); 3248 Register Src = MI.getOperand(1).getReg(); 3249 unsigned Flags = MI.getFlags(); 3250 LLT Ty = B.getMRI()->getType(Dst); 3251 const LLT F16 = LLT::scalar(16); 3252 const LLT F32 = LLT::scalar(32); 3253 3254 if (Ty == F16) { 3255 // Nothing in half is a denormal when promoted to f32. 
3256 auto Ext = B.buildFPExt(F32, Src, Flags); 3257 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32}, false) 3258 .addUse(Ext.getReg(0)) 3259 .setMIFlags(Flags); 3260 B.buildFPTrunc(Dst, Log2, Flags); 3261 MI.eraseFromParent(); 3262 return true; 3263 } 3264 3265 assert(Ty == F32); 3266 3267 if (allowApproxFunc(B.getMF(), Flags) || 3268 !needsDenormHandlingF32(B.getMF(), Src, Flags)) { 3269 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst}, false) 3270 .addUse(Src) 3271 .setMIFlags(Flags); 3272 MI.eraseFromParent(); 3273 return true; 3274 } 3275 3276 // bool needs_scaling = x < -0x1.f80000p+6f; 3277 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f); 3278 3279 // -nextafter(128.0, -1) 3280 auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f); 3281 auto NeedsScaling = B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, 3282 RangeCheckConst, Flags); 3283 3284 auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f); 3285 auto Zero = B.buildFConstant(Ty, 0.0); 3286 auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags); 3287 auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags); 3288 3289 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty}, false) 3290 .addUse(AddInput.getReg(0)) 3291 .setMIFlags(Flags); 3292 3293 auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f); 3294 auto One = B.buildFConstant(Ty, 1.0); 3295 auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags); 3296 B.buildFMul(Dst, Exp2, ResultScale, Flags); 3297 MI.eraseFromParent(); 3298 return true; 3299 } 3300 3301 bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst, 3302 Register Src, 3303 unsigned Flags) const { 3304 LLT Ty = B.getMRI()->getType(Dst); 3305 auto K = B.buildFConstant(Ty, numbers::log2e); 3306 auto Mul = B.buildFMul(Ty, Src, K, Flags); 3307 3308 if (Ty == LLT::scalar(32)) { 3309 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst}, false) 3310 .addUse(Mul.getReg(0)) 3311 .setMIFlags(Flags); 3312 } else { 3313 B.buildFExp2(Dst, Mul.getReg(0), Flags); 3314 } 3315 3316 return true; 3317 } 3318 3319 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 3320 MachineIRBuilder &B) const { 3321 Register Dst = MI.getOperand(0).getReg(); 3322 Register X = MI.getOperand(1).getReg(); 3323 const unsigned Flags = MI.getFlags(); 3324 MachineFunction &MF = B.getMF(); 3325 MachineRegisterInfo &MRI = *B.getMRI(); 3326 LLT Ty = MRI.getType(Dst); 3327 const LLT F16 = LLT::scalar(16); 3328 const LLT F32 = LLT::scalar(32); 3329 const bool IsExp10 = false; // TODO: For some reason exp10 is missing 3330 3331 if (Ty == F16) { 3332 // v_exp_f16 (fmul x, log2e) 3333 if (allowApproxFunc(MF, Flags)) { 3334 // TODO: Does this really require fast? 3335 legalizeFExpUnsafe(B, Dst, X, Flags); 3336 MI.eraseFromParent(); 3337 return true; 3338 } 3339 3340 // exp(f16 x) -> 3341 // fptrunc (v_exp_f32 (fmul (fpext x), log2e)) 3342 3343 // Nothing in half is a denormal when promoted to f32. 3344 auto Ext = B.buildFPExt(F32, X, Flags); 3345 Register Lowered = MRI.createGenericVirtualRegister(F32); 3346 legalizeFExpUnsafe(B, Lowered, Ext.getReg(0), Flags); 3347 B.buildFPTrunc(Dst, Lowered, Flags); 3348 MI.eraseFromParent(); 3349 return true; 3350 } 3351 3352 assert(Ty == F32); 3353 3354 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying 3355 // library behavior. Also, is known-not-daz source sufficient? 
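  // Sketch of the fast path taken below for f32 (illustrative only);
  // legalizeFExpUnsafe emits roughly:
  //   %k = G_FCONSTANT log2(e)
  //   %m = G_FMUL %x, %k
  //   %r = int_amdgcn_exp2 %m
  // i.e. exp(x) is approximated as exp2(x * log2(e)).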
3356 if (allowApproxFunc(MF, Flags) && !needsDenormHandlingF32(MF, X, Flags)) { 3357 legalizeFExpUnsafe(B, Dst, X, Flags); 3358 MI.eraseFromParent(); 3359 return true; 3360 } 3361 3362 // Algorithm: 3363 // 3364 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64) 3365 // 3366 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer 3367 // n = 64*m + j, 0 <= j < 64 3368 // 3369 // e^x = 2^((64*m + j + f)/64) 3370 // = (2^m) * (2^(j/64)) * 2^(f/64) 3371 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64)) 3372 // 3373 // f = x*(64/ln(2)) - n 3374 // r = f*(ln(2)/64) = x - n*(ln(2)/64) 3375 // 3376 // e^x = (2^m) * (2^(j/64)) * e^r 3377 // 3378 // (2^(j/64)) is precomputed 3379 // 3380 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5! 3381 // e^r = 1 + q 3382 // 3383 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5! 3384 // 3385 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) ) 3386 const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract; 3387 Register PH, PL; 3388 3389 if (ST.hasFastFMAF32()) { 3390 const float c_exp = numbers::log2ef; 3391 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits 3392 const float c_exp10 = 0x1.a934f0p+1f; 3393 const float cc_exp10 = 0x1.2f346ep-24f; 3394 3395 auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp); 3396 PH = B.buildFMul(Ty, X, C, Flags).getReg(0); 3397 auto NegPH = B.buildFNeg(Ty, PH, Flags); 3398 auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags); 3399 3400 auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp); 3401 PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0); 3402 } else { 3403 const float ch_exp = 0x1.714000p+0f; 3404 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits 3405 3406 const float ch_exp10 = 0x1.a92000p+1f; 3407 const float cl_exp10 = 0x1.4f0978p-11f; 3408 3409 auto MaskConst = B.buildConstant(Ty, 0xfffff000); 3410 auto XH = B.buildAnd(Ty, X, MaskConst); 3411 auto XL = B.buildFSub(Ty, X, XH, Flags); 3412 3413 auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp); 3414 PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0); 3415 3416 auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp); 3417 auto XLCL = B.buildFMul(Ty, XL, CL, Flags); 3418 3419 Register Mad0 = 3420 getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags); 3421 PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags); 3422 } 3423 3424 auto E = B.buildFRint(Ty, PH, Flags); 3425 3426 // It is unsafe to contract this fsub into the PH multiply. 3427 auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract); 3428 auto A = B.buildFAdd(Ty, PHSubE, PL, Flags); 3429 auto IntE = B.buildFPTOSI(LLT::scalar(32), E); 3430 3431 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty}, false) 3432 .addUse(A.getReg(0)) 3433 .setMIFlags(Flags); 3434 auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags); 3435 3436 auto UnderflowCheckConst = 3437 B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f); 3438 auto Zero = B.buildFConstant(Ty, 0.0); 3439 auto Underflow = 3440 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, UnderflowCheckConst); 3441 3442 R = B.buildSelect(Ty, Underflow, Zero, R); 3443 3444 const auto &Options = MF.getTarget().Options; 3445 3446 if (!(Flags & MachineInstr::FmNoInfs) && !Options.NoInfsFPMath) { 3447 auto OverflowCheckConst = 3448 B.buildFConstant(Ty, IsExp10 ? 
0x1.344136p+5f : 0x1.62e430p+6f); 3449 3450 auto Overflow = 3451 B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), X, OverflowCheckConst); 3452 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle())); 3453 R = B.buildSelect(Ty, Overflow, Inf, R, Flags); 3454 } 3455 3456 B.buildCopy(Dst, R); 3457 MI.eraseFromParent(); 3458 return true; 3459 } 3460 3461 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 3462 MachineIRBuilder &B) const { 3463 Register Dst = MI.getOperand(0).getReg(); 3464 Register Src0 = MI.getOperand(1).getReg(); 3465 Register Src1 = MI.getOperand(2).getReg(); 3466 unsigned Flags = MI.getFlags(); 3467 LLT Ty = B.getMRI()->getType(Dst); 3468 const LLT S16 = LLT::scalar(16); 3469 const LLT S32 = LLT::scalar(32); 3470 3471 if (Ty == S32) { 3472 auto Log = B.buildFLog2(S32, Src0, Flags); 3473 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 3474 .addUse(Log.getReg(0)) 3475 .addUse(Src1) 3476 .setMIFlags(Flags); 3477 B.buildFExp2(Dst, Mul, Flags); 3478 } else if (Ty == S16) { 3479 // There's no f16 fmul_legacy, so we need to convert for it. 3480 auto Log = B.buildFLog2(S16, Src0, Flags); 3481 auto Ext0 = B.buildFPExt(S32, Log, Flags); 3482 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 3483 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 3484 .addUse(Ext0.getReg(0)) 3485 .addUse(Ext1.getReg(0)) 3486 .setMIFlags(Flags); 3487 3488 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 3489 } else 3490 return false; 3491 3492 MI.eraseFromParent(); 3493 return true; 3494 } 3495 3496 // Find a source register, ignoring any possible source modifiers. 3497 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 3498 Register ModSrc = OrigSrc; 3499 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 3500 ModSrc = SrcFNeg->getOperand(1).getReg(); 3501 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 3502 ModSrc = SrcFAbs->getOperand(1).getReg(); 3503 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 3504 ModSrc = SrcFAbs->getOperand(1).getReg(); 3505 return ModSrc; 3506 } 3507 3508 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 3509 MachineRegisterInfo &MRI, 3510 MachineIRBuilder &B) const { 3511 3512 const LLT S1 = LLT::scalar(1); 3513 const LLT S64 = LLT::scalar(64); 3514 Register Dst = MI.getOperand(0).getReg(); 3515 Register OrigSrc = MI.getOperand(1).getReg(); 3516 unsigned Flags = MI.getFlags(); 3517 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 3518 "this should not have been custom lowered"); 3519 3520 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 3521 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 3522 // efficient way to implement it is using V_FRACT_F64. The workaround for the 3523 // V_FRACT bug is: 3524 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 3525 // 3526 // Convert floor(x) to (x - fract(x)) 3527 3528 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 3529 .addUse(OrigSrc) 3530 .setMIFlags(Flags); 3531 3532 // Give source modifier matching some assistance before obscuring a foldable 3533 // pattern. 3534 3535 // TODO: We can avoid the neg on the fract? The input sign to fract 3536 // shouldn't matter? 
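  // Roughly, the sequence built below is:
  //   %fract = int_amdgcn_fract %src
  //   %min   = fminnum(_ieee) %fract, 0x3FEFFFFFFFFFFFFF ; largest double < 1.0
  //   %sel   = isnan(%src) ? %src : %min                 ; per the workaround above
  //   %floor = %src + (-%sel)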
3537 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 3538 3539 auto Const = 3540 B.buildFConstant(S64, llvm::bit_cast<double>(0x3fefffffffffffff)); 3541 3542 Register Min = MRI.createGenericVirtualRegister(S64); 3543 3544 // We don't need to concern ourselves with the snan handling difference, so 3545 // use the one which will directly select. 3546 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3547 if (MFI->getMode().IEEE) 3548 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 3549 else 3550 B.buildFMinNum(Min, Fract, Const, Flags); 3551 3552 Register CorrectedFract = Min; 3553 if (!MI.getFlag(MachineInstr::FmNoNans)) { 3554 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 3555 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 3556 } 3557 3558 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 3559 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 3560 3561 MI.eraseFromParent(); 3562 return true; 3563 } 3564 3565 // Turn an illegal packed v2s16 build vector into bit operations. 3566 // TODO: This should probably be a bitcast action in LegalizerHelper. 3567 bool AMDGPULegalizerInfo::legalizeBuildVector( 3568 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 3569 Register Dst = MI.getOperand(0).getReg(); 3570 const LLT S32 = LLT::scalar(32); 3571 const LLT S16 = LLT::scalar(16); 3572 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16)); 3573 3574 Register Src0 = MI.getOperand(1).getReg(); 3575 Register Src1 = MI.getOperand(2).getReg(); 3576 3577 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) { 3578 assert(MRI.getType(Src0) == S32); 3579 Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0); 3580 Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0); 3581 } 3582 3583 auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1}); 3584 B.buildBitcast(Dst, Merge); 3585 3586 MI.eraseFromParent(); 3587 return true; 3588 } 3589 3590 // Build a big integer multiply or multiply-add using MAD_64_32 instructions. 3591 // 3592 // Source and accumulation registers must all be 32-bits. 3593 // 3594 // TODO: When the multiply is uniform, we should produce a code sequence 3595 // that is better suited to instruction selection on the SALU. Instead of 3596 // the outer loop going over parts of the result, the outer loop should go 3597 // over parts of one of the factors. This should result in instruction 3598 // selection that makes full use of S_ADDC_U32 instructions. 3599 void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper, 3600 MutableArrayRef<Register> Accum, 3601 ArrayRef<Register> Src0, 3602 ArrayRef<Register> Src1, 3603 bool UsePartialMad64_32, 3604 bool SeparateOddAlignedProducts) const { 3605 // Use (possibly empty) vectors of S1 registers to represent the set of 3606 // carries from one pair of positions to the next. 
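  // As a concrete illustration (ignoring any incoming accumulator), a
  // 64 x 64 -> 64 bit multiply with two 32-bit parts per operand produces
  //   Accum[0] = lo32(Src0[0]*Src1[0])
  //   Accum[1] = hi32(Src0[0]*Src1[0]) + lo32(Src0[0]*Src1[1])
  //                                    + lo32(Src0[1]*Src1[0])
  // and partial products whose weight is 2^64 or above are simply dropped.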
3607 using Carry = SmallVector<Register, 2>; 3608 3609 MachineIRBuilder &B = Helper.MIRBuilder; 3610 GISelKnownBits &KB = *Helper.getKnownBits(); 3611 3612 const LLT S1 = LLT::scalar(1); 3613 const LLT S32 = LLT::scalar(32); 3614 const LLT S64 = LLT::scalar(64); 3615 3616 Register Zero32; 3617 Register Zero64; 3618 3619 auto getZero32 = [&]() -> Register { 3620 if (!Zero32) 3621 Zero32 = B.buildConstant(S32, 0).getReg(0); 3622 return Zero32; 3623 }; 3624 auto getZero64 = [&]() -> Register { 3625 if (!Zero64) 3626 Zero64 = B.buildConstant(S64, 0).getReg(0); 3627 return Zero64; 3628 }; 3629 3630 SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros; 3631 for (unsigned i = 0; i < Src0.size(); ++i) { 3632 Src0KnownZeros.push_back(KB.getKnownBits(Src0[i]).isZero()); 3633 Src1KnownZeros.push_back(KB.getKnownBits(Src1[i]).isZero()); 3634 } 3635 3636 // Merge the given carries into the 32-bit LocalAccum, which is modified 3637 // in-place. 3638 // 3639 // Returns the carry-out, which is a single S1 register or null. 3640 auto mergeCarry = 3641 [&](Register &LocalAccum, const Carry &CarryIn) -> Register { 3642 if (CarryIn.empty()) 3643 return Register(); 3644 3645 bool HaveCarryOut = true; 3646 Register CarryAccum; 3647 if (CarryIn.size() == 1) { 3648 if (!LocalAccum) { 3649 LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0); 3650 return Register(); 3651 } 3652 3653 CarryAccum = getZero32(); 3654 } else { 3655 CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0); 3656 for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) { 3657 CarryAccum = 3658 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i]) 3659 .getReg(0); 3660 } 3661 3662 if (!LocalAccum) { 3663 LocalAccum = getZero32(); 3664 HaveCarryOut = false; 3665 } 3666 } 3667 3668 auto Add = 3669 B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back()); 3670 LocalAccum = Add.getReg(0); 3671 return HaveCarryOut ? Add.getReg(1) : Register(); 3672 }; 3673 3674 // Build a multiply-add chain to compute 3675 // 3676 // LocalAccum + (partial products at DstIndex) 3677 // + (opportunistic subset of CarryIn) 3678 // 3679 // LocalAccum is an array of one or two 32-bit registers that are updated 3680 // in-place. The incoming registers may be null. 3681 // 3682 // In some edge cases, carry-ins can be consumed "for free". In that case, 3683 // the consumed carry bits are removed from CarryIn in-place. 3684 auto buildMadChain = 3685 [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn) 3686 -> Carry { 3687 assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) || 3688 (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1)); 3689 3690 Carry CarryOut; 3691 unsigned j0 = 0; 3692 3693 // Use plain 32-bit multiplication for the most significant part of the 3694 // result by default. 
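    // (The high halves of those products lie beyond the most significant
    // result limb, so a plain G_MUL per product is sufficient here.)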
3695 if (LocalAccum.size() == 1 && 3696 (!UsePartialMad64_32 || !CarryIn.empty())) { 3697 do { 3698 // Skip multiplication if one of the operands is 0 3699 unsigned j1 = DstIndex - j0; 3700 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) { 3701 ++j0; 3702 continue; 3703 } 3704 auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]); 3705 if (!LocalAccum[0] || KB.getKnownBits(LocalAccum[0]).isZero()) { 3706 LocalAccum[0] = Mul.getReg(0); 3707 } else { 3708 if (CarryIn.empty()) { 3709 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0); 3710 } else { 3711 LocalAccum[0] = 3712 B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back()) 3713 .getReg(0); 3714 CarryIn.pop_back(); 3715 } 3716 } 3717 ++j0; 3718 } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty())); 3719 } 3720 3721 // Build full 64-bit multiplies. 3722 if (j0 <= DstIndex) { 3723 bool HaveSmallAccum = false; 3724 Register Tmp; 3725 3726 if (LocalAccum[0]) { 3727 if (LocalAccum.size() == 1) { 3728 Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0); 3729 HaveSmallAccum = true; 3730 } else if (LocalAccum[1]) { 3731 Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0); 3732 HaveSmallAccum = false; 3733 } else { 3734 Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0); 3735 HaveSmallAccum = true; 3736 } 3737 } else { 3738 assert(LocalAccum.size() == 1 || !LocalAccum[1]); 3739 Tmp = getZero64(); 3740 HaveSmallAccum = true; 3741 } 3742 3743 do { 3744 unsigned j1 = DstIndex - j0; 3745 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) { 3746 ++j0; 3747 continue; 3748 } 3749 auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1}, 3750 {Src0[j0], Src1[j1], Tmp}); 3751 Tmp = Mad.getReg(0); 3752 if (!HaveSmallAccum) 3753 CarryOut.push_back(Mad.getReg(1)); 3754 HaveSmallAccum = false; 3755 3756 ++j0; 3757 } while (j0 <= DstIndex); 3758 3759 auto Unmerge = B.buildUnmerge(S32, Tmp); 3760 LocalAccum[0] = Unmerge.getReg(0); 3761 if (LocalAccum.size() > 1) 3762 LocalAccum[1] = Unmerge.getReg(1); 3763 } 3764 3765 return CarryOut; 3766 }; 3767 3768 // Outer multiply loop, iterating over destination parts from least 3769 // significant to most significant parts. 3770 // 3771 // The columns of the following diagram correspond to the destination parts 3772 // affected by one iteration of the outer loop (ignoring boundary 3773 // conditions). 3774 // 3775 // Dest index relative to 2 * i: 1 0 -1 3776 // ------ 3777 // Carries from previous iteration: e o 3778 // Even-aligned partial product sum: E E . 3779 // Odd-aligned partial product sum: O O 3780 // 3781 // 'o' is OddCarry, 'e' is EvenCarry. 3782 // EE and OO are computed from partial products via buildMadChain and use 3783 // accumulation where possible and appropriate. 3784 // 3785 Register SeparateOddCarry; 3786 Carry EvenCarry; 3787 Carry OddCarry; 3788 3789 for (unsigned i = 0; i <= Accum.size() / 2; ++i) { 3790 Carry OddCarryIn = std::move(OddCarry); 3791 Carry EvenCarryIn = std::move(EvenCarry); 3792 OddCarry.clear(); 3793 EvenCarry.clear(); 3794 3795 // Partial products at offset 2 * i. 3796 if (2 * i < Accum.size()) { 3797 auto LocalAccum = Accum.drop_front(2 * i).take_front(2); 3798 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn); 3799 } 3800 3801 // Partial products at offset 2 * i - 1. 
3802 if (i > 0) { 3803 if (!SeparateOddAlignedProducts) { 3804 auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2); 3805 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn); 3806 } else { 3807 bool IsHighest = 2 * i >= Accum.size(); 3808 Register SeparateOddOut[2]; 3809 auto LocalAccum = MutableArrayRef(SeparateOddOut) 3810 .take_front(IsHighest ? 1 : 2); 3811 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn); 3812 3813 MachineInstr *Lo; 3814 3815 if (i == 1) { 3816 if (!IsHighest) 3817 Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]); 3818 else 3819 Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]); 3820 } else { 3821 Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0], 3822 SeparateOddCarry); 3823 } 3824 Accum[2 * i - 1] = Lo->getOperand(0).getReg(); 3825 3826 if (!IsHighest) { 3827 auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1], 3828 Lo->getOperand(1).getReg()); 3829 Accum[2 * i] = Hi.getReg(0); 3830 SeparateOddCarry = Hi.getReg(1); 3831 } 3832 } 3833 } 3834 3835 // Add in the carries from the previous iteration 3836 if (i > 0) { 3837 if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn)) 3838 EvenCarryIn.push_back(CarryOut); 3839 3840 if (2 * i < Accum.size()) { 3841 if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn)) 3842 OddCarry.push_back(CarryOut); 3843 } 3844 } 3845 } 3846 } 3847 3848 // Custom narrowing of wide multiplies using wide multiply-add instructions. 3849 // 3850 // TODO: If the multiply is followed by an addition, we should attempt to 3851 // integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities. 3852 bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper, 3853 MachineInstr &MI) const { 3854 assert(ST.hasMad64_32()); 3855 assert(MI.getOpcode() == TargetOpcode::G_MUL); 3856 3857 MachineIRBuilder &B = Helper.MIRBuilder; 3858 MachineRegisterInfo &MRI = *B.getMRI(); 3859 3860 Register DstReg = MI.getOperand(0).getReg(); 3861 Register Src0 = MI.getOperand(1).getReg(); 3862 Register Src1 = MI.getOperand(2).getReg(); 3863 3864 LLT Ty = MRI.getType(DstReg); 3865 assert(Ty.isScalar()); 3866 3867 unsigned Size = Ty.getSizeInBits(); 3868 unsigned NumParts = Size / 32; 3869 assert((Size % 32) == 0); 3870 assert(NumParts >= 2); 3871 3872 // Whether to use MAD_64_32 for partial products whose high half is 3873 // discarded. This avoids some ADD instructions but risks false dependency 3874 // stalls on some subtargets in some cases. 3875 const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10; 3876 3877 // Whether to compute odd-aligned partial products separately. This is 3878 // advisable on subtargets where the accumulator of MAD_64_32 must be placed 3879 // in an even-aligned VGPR. 
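  // When enabled, buildMultiply computes those products into scratch registers
  // and folds them into Accum with 32-bit uaddo/uadde chains instead of
  // accumulating them in place.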
3880 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops(); 3881 3882 LLT S32 = LLT::scalar(32); 3883 SmallVector<Register, 2> Src0Parts, Src1Parts; 3884 for (unsigned i = 0; i < NumParts; ++i) { 3885 Src0Parts.push_back(MRI.createGenericVirtualRegister(S32)); 3886 Src1Parts.push_back(MRI.createGenericVirtualRegister(S32)); 3887 } 3888 B.buildUnmerge(Src0Parts, Src0); 3889 B.buildUnmerge(Src1Parts, Src1); 3890 3891 SmallVector<Register, 2> AccumRegs(NumParts); 3892 buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32, 3893 SeparateOddAlignedProducts); 3894 3895 B.buildMergeLikeInstr(DstReg, AccumRegs); 3896 MI.eraseFromParent(); 3897 return true; 3898 } 3899 3900 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to 3901 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input 3902 // case with a single min instruction instead of a compare+select. 3903 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI, 3904 MachineRegisterInfo &MRI, 3905 MachineIRBuilder &B) const { 3906 Register Dst = MI.getOperand(0).getReg(); 3907 Register Src = MI.getOperand(1).getReg(); 3908 LLT DstTy = MRI.getType(Dst); 3909 LLT SrcTy = MRI.getType(Src); 3910 3911 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ 3912 ? AMDGPU::G_AMDGPU_FFBH_U32 3913 : AMDGPU::G_AMDGPU_FFBL_B32; 3914 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src}); 3915 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits())); 3916 3917 MI.eraseFromParent(); 3918 return true; 3919 } 3920 3921 // Check that this is a G_XOR x, -1 3922 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { 3923 if (MI.getOpcode() != TargetOpcode::G_XOR) 3924 return false; 3925 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); 3926 return ConstVal && *ConstVal == -1; 3927 } 3928 3929 // Return the use branch instruction, otherwise null if the usage is invalid. 3930 static MachineInstr * 3931 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, 3932 MachineBasicBlock *&UncondBrTarget, bool &Negated) { 3933 Register CondDef = MI.getOperand(0).getReg(); 3934 if (!MRI.hasOneNonDBGUse(CondDef)) 3935 return nullptr; 3936 3937 MachineBasicBlock *Parent = MI.getParent(); 3938 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); 3939 3940 if (isNot(MRI, *UseMI)) { 3941 Register NegatedCond = UseMI->getOperand(0).getReg(); 3942 if (!MRI.hasOneNonDBGUse(NegatedCond)) 3943 return nullptr; 3944 3945 // We're deleting the def of this value, so we need to remove it. 3946 eraseInstr(*UseMI, MRI); 3947 3948 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); 3949 Negated = true; 3950 } 3951 3952 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) 3953 return nullptr; 3954 3955 // Make sure the cond br is followed by a G_BR, or is the last instruction. 3956 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); 3957 if (Next == Parent->end()) { 3958 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 3959 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 
3960 return nullptr; 3961 UncondBrTarget = &*NextMBB; 3962 } else { 3963 if (Next->getOpcode() != AMDGPU::G_BR) 3964 return nullptr; 3965 Br = &*Next; 3966 UncondBrTarget = Br->getOperand(0).getMBB(); 3967 } 3968 3969 return UseMI; 3970 } 3971 3972 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 3973 const ArgDescriptor *Arg, 3974 const TargetRegisterClass *ArgRC, 3975 LLT ArgTy) const { 3976 MCRegister SrcReg = Arg->getRegister(); 3977 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); 3978 assert(DstReg.isVirtual() && "Virtual register expected"); 3979 3980 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, 3981 *ArgRC, B.getDebugLoc(), ArgTy); 3982 if (Arg->isMasked()) { 3983 // TODO: Should we try to emit this once in the entry block? 3984 const LLT S32 = LLT::scalar(32); 3985 const unsigned Mask = Arg->getMask(); 3986 const unsigned Shift = llvm::countr_zero<unsigned>(Mask); 3987 3988 Register AndMaskSrc = LiveIn; 3989 3990 // TODO: Avoid clearing the high bits if we know workitem id y/z are always 3991 // 0. 3992 if (Shift != 0) { 3993 auto ShiftAmt = B.buildConstant(S32, Shift); 3994 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 3995 } 3996 3997 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 3998 } else { 3999 B.buildCopy(DstReg, LiveIn); 4000 } 4001 4002 return true; 4003 } 4004 4005 bool AMDGPULegalizerInfo::loadInputValue( 4006 Register DstReg, MachineIRBuilder &B, 4007 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 4008 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 4009 const ArgDescriptor *Arg; 4010 const TargetRegisterClass *ArgRC; 4011 LLT ArgTy; 4012 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 4013 4014 if (!Arg) { 4015 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) { 4016 // The intrinsic may appear when we have a 0 sized kernarg segment, in which 4017 // case the pointer argument may be missing and we use null. 4018 B.buildConstant(DstReg, 0); 4019 return true; 4020 } 4021 4022 // It's undefined behavior if a function marked with the amdgpu-no-* 4023 // attributes uses the corresponding intrinsic. 
4024 B.buildUndef(DstReg); 4025 return true; 4026 } 4027 4028 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 4029 return false; // TODO: Handle these 4030 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 4031 } 4032 4033 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 4034 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 4035 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 4036 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 4037 return false; 4038 4039 MI.eraseFromParent(); 4040 return true; 4041 } 4042 4043 static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, 4044 int64_t C) { 4045 B.buildConstant(MI.getOperand(0).getReg(), C); 4046 MI.eraseFromParent(); 4047 return true; 4048 } 4049 4050 bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic( 4051 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 4052 unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 4053 unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim); 4054 if (MaxID == 0) 4055 return replaceWithConstant(B, MI, 0); 4056 4057 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 4058 const ArgDescriptor *Arg; 4059 const TargetRegisterClass *ArgRC; 4060 LLT ArgTy; 4061 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 4062 4063 Register DstReg = MI.getOperand(0).getReg(); 4064 if (!Arg) { 4065 // It's undefined behavior if a function marked with the amdgpu-no-* 4066 // attributes uses the corresponding intrinsic. 4067 B.buildUndef(DstReg); 4068 MI.eraseFromParent(); 4069 return true; 4070 } 4071 4072 if (Arg->isMasked()) { 4073 // Don't bother inserting AssertZext for packed IDs since we're emitting the 4074 // masking operations anyway. 4075 // 4076 // TODO: We could assert the top bit is 0 for the source copy. 4077 if (!loadInputValue(DstReg, B, ArgType)) 4078 return false; 4079 } else { 4080 Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32)); 4081 if (!loadInputValue(TmpReg, B, ArgType)) 4082 return false; 4083 B.buildAssertZExt(DstReg, TmpReg, llvm::bit_width(MaxID)); 4084 } 4085 4086 MI.eraseFromParent(); 4087 return true; 4088 } 4089 4090 Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B, 4091 int64_t Offset) const { 4092 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 4093 Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy); 4094 4095 // TODO: If we passed in the base kernel offset we could have a better 4096 // alignment than 4, but we don't really need it. 4097 if (!loadInputValue(KernArgReg, B, 4098 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 4099 llvm_unreachable("failed to find kernarg segment ptr"); 4100 4101 auto COffset = B.buildConstant(LLT::scalar(64), Offset); 4102 // TODO: Should get nuw 4103 return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0); 4104 } 4105 4106 /// Legalize a value that's loaded from kernel arguments. This is only used by 4107 /// legacy intrinsics. 
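/// The value is materialized as a 4-byte aligned, dereferenceable, invariant
/// s32 load from (kernarg.segment.ptr + Offset).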
4108 bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI, 4109 MachineIRBuilder &B, 4110 uint64_t Offset, 4111 Align Alignment) const { 4112 Register DstReg = MI.getOperand(0).getReg(); 4113 4114 assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) && 4115 "unexpected kernarg parameter type"); 4116 4117 Register Ptr = getKernargParameterPtr(B, Offset); 4118 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 4119 B.buildLoad(DstReg, Ptr, PtrInfo, Align(4), 4120 MachineMemOperand::MODereferenceable | 4121 MachineMemOperand::MOInvariant); 4122 MI.eraseFromParent(); 4123 return true; 4124 } 4125 4126 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 4127 MachineRegisterInfo &MRI, 4128 MachineIRBuilder &B) const { 4129 Register Dst = MI.getOperand(0).getReg(); 4130 LLT DstTy = MRI.getType(Dst); 4131 LLT S16 = LLT::scalar(16); 4132 LLT S32 = LLT::scalar(32); 4133 LLT S64 = LLT::scalar(64); 4134 4135 if (DstTy == S16) 4136 return legalizeFDIV16(MI, MRI, B); 4137 if (DstTy == S32) 4138 return legalizeFDIV32(MI, MRI, B); 4139 if (DstTy == S64) 4140 return legalizeFDIV64(MI, MRI, B); 4141 4142 return false; 4143 } 4144 4145 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, 4146 Register DstDivReg, 4147 Register DstRemReg, 4148 Register X, 4149 Register Y) const { 4150 const LLT S1 = LLT::scalar(1); 4151 const LLT S32 = LLT::scalar(32); 4152 4153 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 4154 // algorithm used here. 4155 4156 // Initial estimate of inv(y). 4157 auto FloatY = B.buildUITOFP(S32, Y); 4158 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 4159 auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe)); 4160 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 4161 auto Z = B.buildFPTOUI(S32, ScaledY); 4162 4163 // One round of UNR. 4164 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 4165 auto NegYZ = B.buildMul(S32, NegY, Z); 4166 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 4167 4168 // Quotient/remainder estimate. 4169 auto Q = B.buildUMulH(S32, X, Z); 4170 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 4171 4172 // First quotient/remainder refinement. 4173 auto One = B.buildConstant(S32, 1); 4174 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 4175 if (DstDivReg) 4176 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 4177 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 4178 4179 // Second quotient/remainder refinement. 
4180 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 4181 if (DstDivReg) 4182 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q); 4183 4184 if (DstRemReg) 4185 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R); 4186 } 4187 4188 // Build integer reciprocal sequence around V_RCP_IFLAG_F32 4189 // 4190 // Return lo, hi of result 4191 // 4192 // %cvt.lo = G_UITOFP Val.lo 4193 // %cvt.hi = G_UITOFP Val.hi 4194 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 4195 // %rcp = G_AMDGPU_RCP_IFLAG %mad 4196 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 4197 // %mul2 = G_FMUL %mul1, 2**(-32) 4198 // %trunc = G_INTRINSIC_TRUNC %mul2 4199 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 4200 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 4201 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 4202 Register Val) { 4203 const LLT S32 = LLT::scalar(32); 4204 auto Unmerge = B.buildUnmerge(S32, Val); 4205 4206 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 4207 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 4208 4209 auto Mad = B.buildFMAD( 4210 S32, CvtHi, // 2**32 4211 B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo); 4212 4213 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 4214 auto Mul1 = B.buildFMul( 4215 S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc))); 4216 4217 // 2**(-32) 4218 auto Mul2 = B.buildFMul( 4219 S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000))); 4220 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 4221 4222 // -(2**32) 4223 auto Mad2 = B.buildFMAD( 4224 S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)), 4225 Mul1); 4226 4227 auto ResultLo = B.buildFPTOUI(S32, Mad2); 4228 auto ResultHi = B.buildFPTOUI(S32, Trunc); 4229 4230 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 4231 } 4232 4233 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, 4234 Register DstDivReg, 4235 Register DstRemReg, 4236 Register Numer, 4237 Register Denom) const { 4238 const LLT S32 = LLT::scalar(32); 4239 const LLT S64 = LLT::scalar(64); 4240 const LLT S1 = LLT::scalar(1); 4241 Register RcpLo, RcpHi; 4242 4243 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 4244 4245 auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi}); 4246 4247 auto Zero64 = B.buildConstant(S64, 0); 4248 auto NegDenom = B.buildSub(S64, Zero64, Denom); 4249 4250 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 4251 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 4252 4253 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 4254 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 4255 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 4256 4257 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 4258 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 4259 auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi}); 4260 4261 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 4262 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 4263 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 4264 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 4265 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 4266 4267 auto Zero32 = B.buildConstant(S32, 0); 4268 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 4269 auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1)); 4270 auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi}); 4271 4272 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 4273 Register NumerLo = UnmergeNumer.getReg(0); 4274 Register NumerHi = UnmergeNumer.getReg(1); 4275 4276 auto 
MulHi3 = B.buildUMulH(S64, Numer, Add2); 4277 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 4278 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 4279 Register Mul3_Lo = UnmergeMul3.getReg(0); 4280 Register Mul3_Hi = UnmergeMul3.getReg(1); 4281 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 4282 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 4283 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 4284 auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi}); 4285 4286 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 4287 Register DenomLo = UnmergeDenom.getReg(0); 4288 Register DenomHi = UnmergeDenom.getReg(1); 4289 4290 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 4291 auto C1 = B.buildSExt(S32, CmpHi); 4292 4293 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 4294 auto C2 = B.buildSExt(S32, CmpLo); 4295 4296 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 4297 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 4298 4299 // TODO: Here and below portions of the code can be enclosed into if/endif. 4300 // Currently control flow is unconditional and we have 4 selects after 4301 // potential endif to substitute PHIs. 4302 4303 // if C3 != 0 ... 4304 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 4305 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 4306 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 4307 auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi}); 4308 4309 auto One64 = B.buildConstant(S64, 1); 4310 auto Add3 = B.buildAdd(S64, MulHi3, One64); 4311 4312 auto C4 = 4313 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 4314 auto C5 = 4315 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 4316 auto C6 = B.buildSelect( 4317 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 4318 4319 // if (C6 != 0) 4320 auto Add4 = B.buildAdd(S64, Add3, One64); 4321 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 4322 4323 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 4324 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 4325 auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi}); 4326 4327 // endif C6 4328 // endif C3 4329 4330 if (DstDivReg) { 4331 auto Sel1 = B.buildSelect( 4332 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 4333 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 4334 Sel1, MulHi3); 4335 } 4336 4337 if (DstRemReg) { 4338 auto Sel2 = B.buildSelect( 4339 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 4340 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 4341 Sel2, Sub1); 4342 } 4343 } 4344 4345 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI, 4346 MachineRegisterInfo &MRI, 4347 MachineIRBuilder &B) const { 4348 Register DstDivReg, DstRemReg; 4349 switch (MI.getOpcode()) { 4350 default: 4351 llvm_unreachable("Unexpected opcode!"); 4352 case AMDGPU::G_UDIV: { 4353 DstDivReg = MI.getOperand(0).getReg(); 4354 break; 4355 } 4356 case AMDGPU::G_UREM: { 4357 DstRemReg = MI.getOperand(0).getReg(); 4358 break; 4359 } 4360 case AMDGPU::G_UDIVREM: { 4361 DstDivReg = MI.getOperand(0).getReg(); 4362 DstRemReg = MI.getOperand(1).getReg(); 4363 break; 4364 } 4365 } 4366 4367 const LLT S64 = LLT::scalar(64); 4368 const LLT S32 = LLT::scalar(32); 4369 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 4370 Register Num = 
MI.getOperand(FirstSrcOpIdx).getReg(); 4371 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 4372 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 4373 4374 if (Ty == S32) 4375 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den); 4376 else if (Ty == S64) 4377 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den); 4378 else 4379 return false; 4380 4381 MI.eraseFromParent(); 4382 return true; 4383 } 4384 4385 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI, 4386 MachineRegisterInfo &MRI, 4387 MachineIRBuilder &B) const { 4388 const LLT S64 = LLT::scalar(64); 4389 const LLT S32 = LLT::scalar(32); 4390 4391 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 4392 if (Ty != S32 && Ty != S64) 4393 return false; 4394 4395 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 4396 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg(); 4397 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 4398 4399 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 4400 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 4401 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 4402 4403 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 4404 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 4405 4406 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 4407 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 4408 4409 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg; 4410 switch (MI.getOpcode()) { 4411 default: 4412 llvm_unreachable("Unexpected opcode!"); 4413 case AMDGPU::G_SDIV: { 4414 DstDivReg = MI.getOperand(0).getReg(); 4415 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 4416 break; 4417 } 4418 case AMDGPU::G_SREM: { 4419 DstRemReg = MI.getOperand(0).getReg(); 4420 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 4421 break; 4422 } 4423 case AMDGPU::G_SDIVREM: { 4424 DstDivReg = MI.getOperand(0).getReg(); 4425 DstRemReg = MI.getOperand(1).getReg(); 4426 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 4427 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 4428 break; 4429 } 4430 } 4431 4432 if (Ty == S32) 4433 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 4434 else 4435 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 4436 4437 if (DstDivReg) { 4438 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 4439 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0); 4440 B.buildSub(DstDivReg, SignXor, Sign); 4441 } 4442 4443 if (DstRemReg) { 4444 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 4445 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0); 4446 B.buildSub(DstRemReg, SignXor, Sign); 4447 } 4448 4449 MI.eraseFromParent(); 4450 return true; 4451 } 4452 4453 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 4454 MachineRegisterInfo &MRI, 4455 MachineIRBuilder &B) const { 4456 Register Res = MI.getOperand(0).getReg(); 4457 Register LHS = MI.getOperand(1).getReg(); 4458 Register RHS = MI.getOperand(2).getReg(); 4459 uint16_t Flags = MI.getFlags(); 4460 LLT ResTy = MRI.getType(Res); 4461 4462 const MachineFunction &MF = B.getMF(); 4463 bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn) || 4464 MF.getTarget().Options.UnsafeFPMath; 4465 4466 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 4467 if (!AllowInaccurateRcp && ResTy != LLT::scalar(16)) 4468 return false; 4469 4470 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to 4471 // the CI documentation has a worst case error of 1 ulp. 
4472 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to 4473 // use it as long as we aren't trying to use denormals. 4474 // 4475 // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp. 4476 4477 // 1 / x -> RCP(x) 4478 if (CLHS->isExactlyValue(1.0)) { 4479 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 4480 .addUse(RHS) 4481 .setMIFlags(Flags); 4482 4483 MI.eraseFromParent(); 4484 return true; 4485 } 4486 4487 // TODO: Match rsq 4488 4489 // -1 / x -> RCP( FNEG(x) ) 4490 if (CLHS->isExactlyValue(-1.0)) { 4491 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 4492 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 4493 .addUse(FNeg.getReg(0)) 4494 .setMIFlags(Flags); 4495 4496 MI.eraseFromParent(); 4497 return true; 4498 } 4499 } 4500 4501 // For f16 require arcp only. 4502 // For f32 require afn+arcp. 4503 if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) || 4504 !MI.getFlag(MachineInstr::FmArcp))) 4505 return false; 4506 4507 // x / y -> x * (1.0 / y) 4508 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 4509 .addUse(RHS) 4510 .setMIFlags(Flags); 4511 B.buildFMul(Res, LHS, RCP, Flags); 4512 4513 MI.eraseFromParent(); 4514 return true; 4515 } 4516 4517 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, 4518 MachineRegisterInfo &MRI, 4519 MachineIRBuilder &B) const { 4520 Register Res = MI.getOperand(0).getReg(); 4521 Register X = MI.getOperand(1).getReg(); 4522 Register Y = MI.getOperand(2).getReg(); 4523 uint16_t Flags = MI.getFlags(); 4524 LLT ResTy = MRI.getType(Res); 4525 4526 const MachineFunction &MF = B.getMF(); 4527 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 4528 MI.getFlag(MachineInstr::FmAfn); 4529 4530 if (!AllowInaccurateRcp) 4531 return false; 4532 4533 auto NegY = B.buildFNeg(ResTy, Y); 4534 auto One = B.buildFConstant(ResTy, 1.0); 4535 4536 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 4537 .addUse(Y) 4538 .setMIFlags(Flags); 4539 4540 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); 4541 R = B.buildFMA(ResTy, Tmp0, R, R); 4542 4543 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); 4544 R = B.buildFMA(ResTy, Tmp1, R, R); 4545 4546 auto Ret = B.buildFMul(ResTy, X, R); 4547 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); 4548 4549 B.buildFMA(Res, Tmp2, R, Ret); 4550 MI.eraseFromParent(); 4551 return true; 4552 } 4553 4554 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 4555 MachineRegisterInfo &MRI, 4556 MachineIRBuilder &B) const { 4557 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 4558 return true; 4559 4560 Register Res = MI.getOperand(0).getReg(); 4561 Register LHS = MI.getOperand(1).getReg(); 4562 Register RHS = MI.getOperand(2).getReg(); 4563 4564 uint16_t Flags = MI.getFlags(); 4565 4566 LLT S16 = LLT::scalar(16); 4567 LLT S32 = LLT::scalar(32); 4568 4569 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 4570 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 4571 4572 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 4573 .addUse(RHSExt.getReg(0)) 4574 .setMIFlags(Flags); 4575 4576 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 4577 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 4578 4579 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 4580 .addUse(RDst.getReg(0)) 4581 .addUse(RHS) 4582 .addUse(LHS) 4583 .setMIFlags(Flags); 4584 4585 MI.eraseFromParent(); 4586 return true; 4587 } 4588 4589 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 4590 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 
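// On targets with S_DENORM_MODE the whole denorm field is rewritten at once as
// (sp_mode | dp_f16_mode << 2); otherwise only the two FP32 denorm bits of the
// MODE register are updated through S_SETREG.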
4591 static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B, 4592 const GCNSubtarget &ST, 4593 SIModeRegisterDefaults Mode) { 4594 // Set SP denorm mode to this value. 4595 unsigned SPDenormMode = 4596 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 4597 4598 if (ST.hasDenormModeInst()) { 4599 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 4600 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 4601 4602 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 4603 B.buildInstr(AMDGPU::S_DENORM_MODE) 4604 .addImm(NewDenormModeValue); 4605 4606 } else { 4607 // Select FP32 bit field in mode register. 4608 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 4609 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 4610 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 4611 4612 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 4613 .addImm(SPDenormMode) 4614 .addImm(SPDenormModeBitField); 4615 } 4616 } 4617 4618 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 4619 MachineRegisterInfo &MRI, 4620 MachineIRBuilder &B) const { 4621 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 4622 return true; 4623 4624 Register Res = MI.getOperand(0).getReg(); 4625 Register LHS = MI.getOperand(1).getReg(); 4626 Register RHS = MI.getOperand(2).getReg(); 4627 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 4628 SIModeRegisterDefaults Mode = MFI->getMode(); 4629 4630 uint16_t Flags = MI.getFlags(); 4631 4632 LLT S32 = LLT::scalar(32); 4633 LLT S1 = LLT::scalar(1); 4634 4635 auto One = B.buildFConstant(S32, 1.0f); 4636 4637 auto DenominatorScaled = 4638 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 4639 .addUse(LHS) 4640 .addUse(RHS) 4641 .addImm(0) 4642 .setMIFlags(Flags); 4643 auto NumeratorScaled = 4644 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 4645 .addUse(LHS) 4646 .addUse(RHS) 4647 .addImm(1) 4648 .setMIFlags(Flags); 4649 4650 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 4651 .addUse(DenominatorScaled.getReg(0)) 4652 .setMIFlags(Flags); 4653 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 4654 4655 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 4656 // aren't modeled as reading it. 4657 if (Mode.FP32Denormals != DenormalMode::getIEEE()) 4658 toggleSPDenormMode(true, B, ST, Mode); 4659 4660 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 4661 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 4662 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 4663 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 4664 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 4665 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 4666 4667 // FIXME: This mishandles dynamic denormal mode. We need to query the 4668 // current mode and restore the original. 
4669 if (Mode.FP32Denormals != DenormalMode::getIEEE()) 4670 toggleSPDenormMode(false, B, ST, Mode); 4671 4672 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 4673 .addUse(Fma4.getReg(0)) 4674 .addUse(Fma1.getReg(0)) 4675 .addUse(Fma3.getReg(0)) 4676 .addUse(NumeratorScaled.getReg(1)) 4677 .setMIFlags(Flags); 4678 4679 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 4680 .addUse(Fmas.getReg(0)) 4681 .addUse(RHS) 4682 .addUse(LHS) 4683 .setMIFlags(Flags); 4684 4685 MI.eraseFromParent(); 4686 return true; 4687 } 4688 4689 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 4690 MachineRegisterInfo &MRI, 4691 MachineIRBuilder &B) const { 4692 if (legalizeFastUnsafeFDIV64(MI, MRI, B)) 4693 return true; 4694 4695 Register Res = MI.getOperand(0).getReg(); 4696 Register LHS = MI.getOperand(1).getReg(); 4697 Register RHS = MI.getOperand(2).getReg(); 4698 4699 uint16_t Flags = MI.getFlags(); 4700 4701 LLT S64 = LLT::scalar(64); 4702 LLT S1 = LLT::scalar(1); 4703 4704 auto One = B.buildFConstant(S64, 1.0); 4705 4706 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 4707 .addUse(LHS) 4708 .addUse(RHS) 4709 .addImm(0) 4710 .setMIFlags(Flags); 4711 4712 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 4713 4714 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 4715 .addUse(DivScale0.getReg(0)) 4716 .setMIFlags(Flags); 4717 4718 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 4719 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 4720 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 4721 4722 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 4723 .addUse(LHS) 4724 .addUse(RHS) 4725 .addImm(1) 4726 .setMIFlags(Flags); 4727 4728 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 4729 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 4730 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 4731 4732 Register Scale; 4733 if (!ST.hasUsableDivScaleConditionOutput()) { 4734 // Workaround a hardware bug on SI where the condition output from div_scale 4735 // is not usable. 
4736 4737 LLT S32 = LLT::scalar(32); 4738 4739 auto NumUnmerge = B.buildUnmerge(S32, LHS); 4740 auto DenUnmerge = B.buildUnmerge(S32, RHS); 4741 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 4742 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 4743 4744 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 4745 Scale1Unmerge.getReg(1)); 4746 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 4747 Scale0Unmerge.getReg(1)); 4748 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 4749 } else { 4750 Scale = DivScale1.getReg(1); 4751 } 4752 4753 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 4754 .addUse(Fma4.getReg(0)) 4755 .addUse(Fma3.getReg(0)) 4756 .addUse(Mul.getReg(0)) 4757 .addUse(Scale) 4758 .setMIFlags(Flags); 4759 4760 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res), false) 4761 .addUse(Fmas.getReg(0)) 4762 .addUse(RHS) 4763 .addUse(LHS) 4764 .setMIFlags(Flags); 4765 4766 MI.eraseFromParent(); 4767 return true; 4768 } 4769 4770 bool AMDGPULegalizerInfo::legalizeFFREXP(MachineInstr &MI, 4771 MachineRegisterInfo &MRI, 4772 MachineIRBuilder &B) const { 4773 Register Res0 = MI.getOperand(0).getReg(); 4774 Register Res1 = MI.getOperand(1).getReg(); 4775 Register Val = MI.getOperand(2).getReg(); 4776 uint16_t Flags = MI.getFlags(); 4777 4778 LLT Ty = MRI.getType(Res0); 4779 LLT InstrExpTy = Ty == LLT::scalar(16) ? LLT::scalar(16) : LLT::scalar(32); 4780 4781 auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty}, false) 4782 .addUse(Val) 4783 .setMIFlags(Flags); 4784 auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy}, false) 4785 .addUse(Val) 4786 .setMIFlags(Flags); 4787 4788 if (ST.hasFractBug()) { 4789 auto Fabs = B.buildFAbs(Ty, Val); 4790 auto Inf = B.buildFConstant(Ty, APFloat::getInf(getFltSemanticForLLT(Ty))); 4791 auto IsFinite = 4792 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags); 4793 auto Zero = B.buildConstant(InstrExpTy, 0); 4794 Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero); 4795 Mant = B.buildSelect(Ty, IsFinite, Mant, Val); 4796 } 4797 4798 B.buildCopy(Res0, Mant); 4799 B.buildSExtOrTrunc(Res1, Exp); 4800 4801 MI.eraseFromParent(); 4802 return true; 4803 } 4804 4805 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 4806 MachineRegisterInfo &MRI, 4807 MachineIRBuilder &B) const { 4808 Register Res = MI.getOperand(0).getReg(); 4809 Register LHS = MI.getOperand(2).getReg(); 4810 Register RHS = MI.getOperand(3).getReg(); 4811 uint16_t Flags = MI.getFlags(); 4812 4813 LLT S32 = LLT::scalar(32); 4814 LLT S1 = LLT::scalar(1); 4815 4816 auto Abs = B.buildFAbs(S32, RHS, Flags); 4817 const APFloat C0Val(1.0f); 4818 4819 auto C0 = B.buildFConstant(S32, 0x1p+96f); 4820 auto C1 = B.buildFConstant(S32, 0x1p-32f); 4821 auto C2 = B.buildFConstant(S32, 1.0f); 4822 4823 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 4824 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 4825 4826 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 4827 4828 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 4829 .addUse(Mul0.getReg(0)) 4830 .setMIFlags(Flags); 4831 4832 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 4833 4834 B.buildFMul(Res, Sel, Mul1, Flags); 4835 4836 MI.eraseFromParent(); 4837 return true; 4838 } 4839 4840 bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI, 4841 MachineRegisterInfo &MRI, 4842 MachineIRBuilder &B) const { 4843 // For double type, the SQRT and RSQ instructions don't have required 
4844 // precision, we apply Goldschmidt's algorithm to improve the result: 4845 // 4846 // y0 = rsq(x) 4847 // g0 = x * y0 4848 // h0 = 0.5 * y0 4849 // 4850 // r0 = 0.5 - h0 * g0 4851 // g1 = g0 * r0 + g0 4852 // h1 = h0 * r0 + h0 4853 // 4854 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1 4855 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1 4856 // h2 = h1 * r1 + h1 4857 // 4858 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2 4859 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2 4860 // 4861 // sqrt(x) = g3 4862 4863 const LLT S1 = LLT::scalar(1); 4864 const LLT S32 = LLT::scalar(32); 4865 const LLT F64 = LLT::scalar(64); 4866 4867 Register Dst = MI.getOperand(0).getReg(); 4868 assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt"); 4869 4870 Register X = MI.getOperand(1).getReg(); 4871 unsigned Flags = MI.getFlags(); 4872 4873 auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767); 4874 4875 auto ZeroInt = B.buildConstant(S32, 0); 4876 auto Scaling = B.buildFCmp(FCmpInst::FCMP_OLT, S1, X, ScaleConstant); 4877 4878 // Scale up input if it is too small. 4879 auto ScaleUpFactor = B.buildConstant(S32, 256); 4880 auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt); 4881 auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags); 4882 4883 auto SqrtY = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}, false) 4884 .addReg(SqrtX.getReg(0)); 4885 4886 auto Half = B.buildFConstant(F64, 0.5); 4887 auto SqrtH0 = B.buildFMul(F64, SqrtY, Half); 4888 auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY); 4889 4890 auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0); 4891 auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half); 4892 4893 auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0); 4894 auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0); 4895 4896 auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1); 4897 auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX); 4898 4899 auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1); 4900 4901 auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2); 4902 auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX); 4903 4904 auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2); 4905 4906 // Scale down the result. 4907 auto ScaleDownFactor = B.buildConstant(S32, -128); 4908 auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt); 4909 SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags); 4910 4911 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check 4912 // with finite only or nsz because rsq(+/-0) = +/-inf 4913 4914 // TODO: Check for DAZ and expand to subnormals 4915 auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf); 4916 4917 // If x is +INF, +0, or -0, use its original value 4918 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags); 4919 4920 MI.eraseFromParent(); 4921 return true; 4922 } 4923 4924 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 4925 // FIXME: Why do we handle this one but not other removed instructions? 4926 // 4927 // Reciprocal square root. The clamp prevents infinite results, clamping 4928 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 4929 // +-max_float. 
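// The expansion below is essentially:
//   %t = fminnum(_ieee) (int_amdgcn_rsq %x), +max_float  ; clamps rsq(+0) = +inf
//   %r = fmaxnum(_ieee) %t, -max_float                   ; clamps rsq(-0) = -inf
// using the IEEE or non-IEEE min/max form to match the function's FP mode.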
4930 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 4931 MachineRegisterInfo &MRI, 4932 MachineIRBuilder &B) const { 4933 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 4934 return true; 4935 4936 Register Dst = MI.getOperand(0).getReg(); 4937 Register Src = MI.getOperand(2).getReg(); 4938 auto Flags = MI.getFlags(); 4939 4940 LLT Ty = MRI.getType(Dst); 4941 4942 const fltSemantics *FltSemantics; 4943 if (Ty == LLT::scalar(32)) 4944 FltSemantics = &APFloat::IEEEsingle(); 4945 else if (Ty == LLT::scalar(64)) 4946 FltSemantics = &APFloat::IEEEdouble(); 4947 else 4948 return false; 4949 4950 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) 4951 .addUse(Src) 4952 .setMIFlags(Flags); 4953 4954 // We don't need to concern ourselves with the snan handling difference, since 4955 // the rsq quieted (or not) so use the one which will directly select. 4956 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 4957 const bool UseIEEE = MFI->getMode().IEEE; 4958 4959 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 4960 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 4961 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 4962 4963 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 4964 4965 if (UseIEEE) 4966 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 4967 else 4968 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 4969 MI.eraseFromParent(); 4970 return true; 4971 } 4972 4973 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { 4974 switch (IID) { 4975 case Intrinsic::amdgcn_ds_fadd: 4976 return AMDGPU::G_ATOMICRMW_FADD; 4977 case Intrinsic::amdgcn_ds_fmin: 4978 return AMDGPU::G_AMDGPU_ATOMIC_FMIN; 4979 case Intrinsic::amdgcn_ds_fmax: 4980 return AMDGPU::G_AMDGPU_ATOMIC_FMAX; 4981 default: 4982 llvm_unreachable("not a DS FP intrinsic"); 4983 } 4984 } 4985 4986 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, 4987 MachineInstr &MI, 4988 Intrinsic::ID IID) const { 4989 GISelChangeObserver &Observer = Helper.Observer; 4990 Observer.changingInstr(MI); 4991 4992 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); 4993 4994 // The remaining operands were used to set fields in the MemOperand on 4995 // construction. 4996 for (int I = 6; I > 3; --I) 4997 MI.removeOperand(I); 4998 4999 MI.removeOperand(1); // Remove the intrinsic ID. 5000 Observer.changedInstr(MI); 5001 return true; 5002 } 5003 5004 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 5005 MachineRegisterInfo &MRI, 5006 MachineIRBuilder &B) const { 5007 uint64_t Offset = 5008 ST.getTargetLowering()->getImplicitParameterOffset( 5009 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 5010 LLT DstTy = MRI.getType(DstReg); 5011 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 5012 5013 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 5014 if (!loadInputValue(KernargPtrReg, B, 5015 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 5016 return false; 5017 5018 // FIXME: This should be nuw 5019 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 5020 return true; 5021 } 5022 5023 /// To create a buffer resource from a 64-bit pointer, mask off the upper 32 5024 /// bits of the pointer and replace them with the stride argument, then 5025 /// merge_values everything together. In the common case of a raw buffer (the 5026 /// stride component is 0), we can just AND off the upper half. 
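///
/// Sketch of the descriptor words assembled by this lowering (a description of
/// the code below, not of the hardware descriptor format):
///   word0 = pointer[31:0]
///   word1 = (pointer[63:32] & 0xffff) | (stride << 16)
///   word2 = numRecords
///   word3 = flags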
5027 bool AMDGPULegalizerInfo::legalizePointerAsRsrcIntrin( 5028 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 5029 Register Result = MI.getOperand(0).getReg(); 5030 Register Pointer = MI.getOperand(2).getReg(); 5031 Register Stride = MI.getOperand(3).getReg(); 5032 Register NumRecords = MI.getOperand(4).getReg(); 5033 Register Flags = MI.getOperand(5).getReg(); 5034 5035 LLT S32 = LLT::scalar(32); 5036 5037 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 5038 auto Unmerge = B.buildUnmerge(S32, Pointer); 5039 Register LowHalf = Unmerge.getReg(0); 5040 Register HighHalf = Unmerge.getReg(1); 5041 5042 auto AndMask = B.buildConstant(S32, 0x0000ffff); 5043 auto Masked = B.buildAnd(S32, HighHalf, AndMask); 5044 5045 MachineInstrBuilder NewHighHalf = Masked; 5046 std::optional<ValueAndVReg> StrideConst = 5047 getIConstantVRegValWithLookThrough(Stride, MRI); 5048 if (!StrideConst || !StrideConst->Value.isZero()) { 5049 MachineInstrBuilder ShiftedStride; 5050 if (StrideConst) { 5051 uint32_t StrideVal = StrideConst->Value.getZExtValue(); 5052 uint32_t ShiftedStrideVal = StrideVal << 16; 5053 ShiftedStride = B.buildConstant(S32, ShiftedStrideVal); 5054 } else { 5055 auto ExtStride = B.buildAnyExt(S32, Stride); 5056 auto ShiftConst = B.buildConstant(S32, 16); 5057 ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst); 5058 } 5059 NewHighHalf = B.buildOr(S32, Masked, ShiftedStride); 5060 } 5061 Register NewHighHalfReg = NewHighHalf.getReg(0); 5062 B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags}); 5063 MI.eraseFromParent(); 5064 return true; 5065 } 5066 5067 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 5068 MachineRegisterInfo &MRI, 5069 MachineIRBuilder &B) const { 5070 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 5071 if (!MFI->isEntryFunction()) { 5072 return legalizePreloadedArgIntrin(MI, MRI, B, 5073 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 5074 } 5075 5076 Register DstReg = MI.getOperand(0).getReg(); 5077 if (!getImplicitArgPtr(DstReg, MRI, B)) 5078 return false; 5079 5080 MI.eraseFromParent(); 5081 return true; 5082 } 5083 5084 bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg, 5085 MachineRegisterInfo &MRI, 5086 MachineIRBuilder &B) const { 5087 Function &F = B.getMF().getFunction(); 5088 std::optional<uint32_t> KnownSize = 5089 AMDGPUMachineFunction::getLDSKernelIdMetadata(F); 5090 if (KnownSize.has_value()) 5091 B.buildConstant(DstReg, *KnownSize); 5092 return false; 5093 } 5094 5095 bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI, 5096 MachineRegisterInfo &MRI, 5097 MachineIRBuilder &B) const { 5098 5099 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 5100 if (!MFI->isEntryFunction()) { 5101 return legalizePreloadedArgIntrin(MI, MRI, B, 5102 AMDGPUFunctionArgInfo::LDS_KERNEL_ID); 5103 } 5104 5105 Register DstReg = MI.getOperand(0).getReg(); 5106 if (!getLDSKernelId(DstReg, MRI, B)) 5107 return false; 5108 5109 MI.eraseFromParent(); 5110 return true; 5111 } 5112 5113 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 5114 MachineRegisterInfo &MRI, 5115 MachineIRBuilder &B, 5116 unsigned AddrSpace) const { 5117 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 5118 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); 5119 Register Hi32 = Unmerge.getReg(1); 5120 5121 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 5122 MI.eraseFromParent(); 5123 return true; 5124 } 5125 
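// The next helper splits a buffer offset between the instruction's voffset
// register and its immediate offset field. Worked example, assuming a 12-bit
// immediate field (MaxImm == 4095): an incoming constant offset of 4104 is
// split into Overflow = 4096, which is added to the voffset register, and
// ImmOffset = 8, which stays in the immediate field.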
5126 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 5127 // offset (the offset that is included in bounds checking and swizzling, to be 5128 // split between the instruction's voffset and immoffset fields) and soffset 5129 // (the offset that is excluded from bounds checking and swizzling, to go in 5130 // the instruction's soffset field). This function takes the first kind of 5131 // offset and figures out how to split it between voffset and immoffset. 5132 std::pair<Register, unsigned> 5133 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 5134 Register OrigOffset) const { 5135 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(); 5136 Register BaseReg; 5137 unsigned ImmOffset; 5138 const LLT S32 = LLT::scalar(32); 5139 MachineRegisterInfo &MRI = *B.getMRI(); 5140 5141 std::tie(BaseReg, ImmOffset) = 5142 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset); 5143 5144 // If BaseReg is a pointer, convert it to int. 5145 if (MRI.getType(BaseReg).isPointer()) 5146 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0); 5147 5148 // If the immediate value is too big for the immoffset field, put only bits 5149 // that would normally fit in the immoffset field. The remaining value that 5150 // is copied/added for the voffset field is a large power of 2, and it 5151 // stands more chance of being CSEd with the copy/add for another similar 5152 // load/store. 5153 // However, do not do that rounding down if that is a negative 5154 // number, as it appears to be illegal to have a negative offset in the 5155 // vgpr, even if adding the immediate offset makes it positive. 5156 unsigned Overflow = ImmOffset & ~MaxImm; 5157 ImmOffset -= Overflow; 5158 if ((int32_t)Overflow < 0) { 5159 Overflow += ImmOffset; 5160 ImmOffset = 0; 5161 } 5162 5163 if (Overflow != 0) { 5164 if (!BaseReg) { 5165 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 5166 } else { 5167 auto OverflowVal = B.buildConstant(S32, Overflow); 5168 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 5169 } 5170 } 5171 5172 if (!BaseReg) 5173 BaseReg = B.buildConstant(S32, 0).getReg(0); 5174 5175 return std::pair(BaseReg, ImmOffset); 5176 } 5177 5178 /// Handle register layout difference for f16 images for some subtargets. 
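///
/// For example, on subtargets with unpacked D16 memory instructions each s16
/// element is any-extended into its own 32-bit register, while packed
/// subtargets keep two s16 elements per 32-bit register (with extra handling
/// for the image-store D16 bug and for <3 x s16> values).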
5179 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 5180 MachineRegisterInfo &MRI, 5181 Register Reg, 5182 bool ImageStore) const { 5183 const LLT S16 = LLT::scalar(16); 5184 const LLT S32 = LLT::scalar(32); 5185 LLT StoreVT = MRI.getType(Reg); 5186 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 5187 5188 if (ST.hasUnpackedD16VMem()) { 5189 auto Unmerge = B.buildUnmerge(S16, Reg); 5190 5191 SmallVector<Register, 4> WideRegs; 5192 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 5193 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 5194 5195 int NumElts = StoreVT.getNumElements(); 5196 5197 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs) 5198 .getReg(0); 5199 } 5200 5201 if (ImageStore && ST.hasImageStoreD16Bug()) { 5202 if (StoreVT.getNumElements() == 2) { 5203 SmallVector<Register, 4> PackedRegs; 5204 Reg = B.buildBitcast(S32, Reg).getReg(0); 5205 PackedRegs.push_back(Reg); 5206 PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); 5207 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs) 5208 .getReg(0); 5209 } 5210 5211 if (StoreVT.getNumElements() == 3) { 5212 SmallVector<Register, 4> PackedRegs; 5213 auto Unmerge = B.buildUnmerge(S16, Reg); 5214 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 5215 PackedRegs.push_back(Unmerge.getReg(I)); 5216 PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); 5217 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0); 5218 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0); 5219 } 5220 5221 if (StoreVT.getNumElements() == 4) { 5222 SmallVector<Register, 4> PackedRegs; 5223 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0); 5224 auto Unmerge = B.buildUnmerge(S32, Reg); 5225 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 5226 PackedRegs.push_back(Unmerge.getReg(I)); 5227 PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); 5228 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs) 5229 .getReg(0); 5230 } 5231 5232 llvm_unreachable("invalid data type"); 5233 } 5234 5235 if (StoreVT == LLT::fixed_vector(3, S16)) { 5236 Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg) 5237 .getReg(0); 5238 } 5239 return Reg; 5240 } 5241 5242 Register AMDGPULegalizerInfo::fixStoreSourceType( 5243 MachineIRBuilder &B, Register VData, bool IsFormat) const { 5244 MachineRegisterInfo *MRI = B.getMRI(); 5245 LLT Ty = MRI->getType(VData); 5246 5247 const LLT S16 = LLT::scalar(16); 5248 5249 // Fixup buffer resources themselves needing to be v4i128. 5250 if (hasBufferRsrcWorkaround(Ty)) 5251 return castBufferRsrcToV4I32(VData, B); 5252 5253 // Fixup illegal register types for i8 stores. 
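// An s8 or s16 source is any-extended to s32 here; the narrower store opcodes
// selected later only use the low bits of the register, so the extended high
// bits are don't-care.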
5254 if (Ty == LLT::scalar(8) || Ty == S16) { 5255 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 5256 return AnyExt; 5257 } 5258 5259 if (Ty.isVector()) { 5260 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 5261 if (IsFormat) 5262 return handleD16VData(B, *MRI, VData); 5263 } 5264 } 5265 5266 return VData; 5267 } 5268 5269 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 5270 MachineRegisterInfo &MRI, 5271 MachineIRBuilder &B, 5272 bool IsTyped, 5273 bool IsFormat) const { 5274 Register VData = MI.getOperand(1).getReg(); 5275 LLT Ty = MRI.getType(VData); 5276 LLT EltTy = Ty.getScalarType(); 5277 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 5278 const LLT S32 = LLT::scalar(32); 5279 5280 VData = fixStoreSourceType(B, VData, IsFormat); 5281 castBufferRsrcArgToV4I32(MI, B, 2); 5282 Register RSrc = MI.getOperand(2).getReg(); 5283 5284 MachineMemOperand *MMO = *MI.memoperands_begin(); 5285 const int MemSize = MMO->getSize(); 5286 5287 unsigned ImmOffset; 5288 5289 // The typed intrinsics add an immediate after the registers. 5290 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 5291 5292 // The struct intrinsic variants add one additional operand over raw. 5293 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 5294 Register VIndex; 5295 int OpOffset = 0; 5296 if (HasVIndex) { 5297 VIndex = MI.getOperand(3).getReg(); 5298 OpOffset = 1; 5299 } else { 5300 VIndex = B.buildConstant(S32, 0).getReg(0); 5301 } 5302 5303 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 5304 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 5305 5306 unsigned Format = 0; 5307 if (IsTyped) { 5308 Format = MI.getOperand(5 + OpOffset).getImm(); 5309 ++OpOffset; 5310 } 5311 5312 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 5313 5314 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 5315 5316 unsigned Opc; 5317 if (IsTyped) { 5318 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 5319 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 5320 } else if (IsFormat) { 5321 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 5322 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 5323 } else { 5324 switch (MemSize) { 5325 case 1: 5326 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 5327 break; 5328 case 2: 5329 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 5330 break; 5331 default: 5332 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 5333 break; 5334 } 5335 } 5336 5337 auto MIB = B.buildInstr(Opc) 5338 .addUse(VData) // vdata 5339 .addUse(RSrc) // rsrc 5340 .addUse(VIndex) // vindex 5341 .addUse(VOffset) // voffset 5342 .addUse(SOffset) // soffset 5343 .addImm(ImmOffset); // offset(imm) 5344 5345 if (IsTyped) 5346 MIB.addImm(Format); 5347 5348 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 5349 .addImm(HasVIndex ? 
-1 : 0) // idxen(imm) 5350 .addMemOperand(MMO); 5351 5352 MI.eraseFromParent(); 5353 return true; 5354 } 5355 5356 static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc, 5357 Register VIndex, Register VOffset, Register SOffset, 5358 unsigned ImmOffset, unsigned Format, 5359 unsigned AuxiliaryData, MachineMemOperand *MMO, 5360 bool IsTyped, bool HasVIndex, MachineIRBuilder &B) { 5361 auto MIB = B.buildInstr(Opc) 5362 .addDef(LoadDstReg) // vdata 5363 .addUse(RSrc) // rsrc 5364 .addUse(VIndex) // vindex 5365 .addUse(VOffset) // voffset 5366 .addUse(SOffset) // soffset 5367 .addImm(ImmOffset); // offset(imm) 5368 5369 if (IsTyped) 5370 MIB.addImm(Format); 5371 5372 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 5373 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 5374 .addMemOperand(MMO); 5375 } 5376 5377 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 5378 MachineRegisterInfo &MRI, 5379 MachineIRBuilder &B, 5380 bool IsFormat, 5381 bool IsTyped) const { 5382 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 5383 MachineMemOperand *MMO = *MI.memoperands_begin(); 5384 const LLT MemTy = MMO->getMemoryType(); 5385 const LLT S32 = LLT::scalar(32); 5386 5387 Register Dst = MI.getOperand(0).getReg(); 5388 5389 Register StatusDst; 5390 int OpOffset = 0; 5391 assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2); 5392 bool IsTFE = MI.getNumExplicitDefs() == 2; 5393 if (IsTFE) { 5394 StatusDst = MI.getOperand(1).getReg(); 5395 ++OpOffset; 5396 } 5397 5398 castBufferRsrcArgToV4I32(MI, B, 2 + OpOffset); 5399 Register RSrc = MI.getOperand(2 + OpOffset).getReg(); 5400 5401 // The typed intrinsics add an immediate after the registers. 5402 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 5403 5404 // The struct intrinsic variants add one additional operand over raw. 5405 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset; 5406 Register VIndex; 5407 if (HasVIndex) { 5408 VIndex = MI.getOperand(3 + OpOffset).getReg(); 5409 ++OpOffset; 5410 } else { 5411 VIndex = B.buildConstant(S32, 0).getReg(0); 5412 } 5413 5414 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 5415 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 5416 5417 unsigned Format = 0; 5418 if (IsTyped) { 5419 Format = MI.getOperand(5 + OpOffset).getImm(); 5420 ++OpOffset; 5421 } 5422 5423 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 5424 unsigned ImmOffset; 5425 5426 LLT Ty = MRI.getType(Dst); 5427 // Make addrspace 8 pointers loads into 4xs32 loads here, so the rest of the 5428 // logic doesn't have to handle that case. 5429 if (hasBufferRsrcWorkaround(Ty)) { 5430 Ty = castBufferRsrcFromV4I32(MI, B, MRI, 0); 5431 Dst = MI.getOperand(0).getReg(); 5432 } 5433 LLT EltTy = Ty.getScalarType(); 5434 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 5435 const bool Unpacked = ST.hasUnpackedD16VMem(); 5436 5437 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 5438 5439 unsigned Opc; 5440 5441 // TODO: Support TFE for typed and narrow loads. 5442 if (IsTyped) { 5443 if (IsTFE) 5444 return false; 5445 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 5446 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 5447 } else if (IsFormat) { 5448 if (IsD16) { 5449 if (IsTFE) 5450 return false; 5451 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16; 5452 } else { 5453 Opc = IsTFE ? 
AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE 5454 : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 5455 } 5456 } else { 5457 if (IsTFE) 5458 return false; 5459 switch (MemTy.getSizeInBits()) { 5460 case 8: 5461 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 5462 break; 5463 case 16: 5464 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 5465 break; 5466 default: 5467 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 5468 break; 5469 } 5470 } 5471 5472 if (IsTFE) { 5473 unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32); 5474 unsigned NumLoadDWords = NumValueDWords + 1; 5475 LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32); 5476 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy); 5477 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset, 5478 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B); 5479 if (NumValueDWords == 1) { 5480 B.buildUnmerge({Dst, StatusDst}, LoadDstReg); 5481 } else { 5482 SmallVector<Register, 5> LoadElts; 5483 for (unsigned I = 0; I != NumValueDWords; ++I) 5484 LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32)); 5485 LoadElts.push_back(StatusDst); 5486 B.buildUnmerge(LoadElts, LoadDstReg); 5487 LoadElts.truncate(NumValueDWords); 5488 B.buildMergeLikeInstr(Dst, LoadElts); 5489 } 5490 } else if ((!IsD16 && MemTy.getSizeInBits() < 32) || 5491 (IsD16 && !Ty.isVector())) { 5492 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 5493 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset, 5494 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B); 5495 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 5496 B.buildTrunc(Dst, LoadDstReg); 5497 } else if (Unpacked && IsD16 && Ty.isVector()) { 5498 LLT UnpackedTy = Ty.changeElementSize(32); 5499 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 5500 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset, 5501 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B); 5502 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 5503 // FIXME: G_TRUNC should work, but legalization currently fails 5504 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 5505 SmallVector<Register, 4> Repack; 5506 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 5507 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 5508 B.buildMergeLikeInstr(Dst, Repack); 5509 } else { 5510 buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format, 5511 AuxiliaryData, MMO, IsTyped, HasVIndex, B); 5512 } 5513 5514 MI.eraseFromParent(); 5515 return true; 5516 } 5517 5518 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 5519 switch (IntrID) { 5520 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 5521 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap: 5522 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 5523 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap: 5524 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 5525 case Intrinsic::amdgcn_raw_buffer_atomic_add: 5526 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add: 5527 case Intrinsic::amdgcn_struct_buffer_atomic_add: 5528 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add: 5529 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 5530 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 5531 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub: 5532 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 5533 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub: 5534 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 5535 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 5536 case 
Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin: 5537 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 5538 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin: 5539 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 5540 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 5541 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin: 5542 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 5543 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin: 5544 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 5545 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 5546 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax: 5547 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 5548 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax: 5549 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 5550 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 5551 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax: 5552 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 5553 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax: 5554 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 5555 case Intrinsic::amdgcn_raw_buffer_atomic_and: 5556 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and: 5557 case Intrinsic::amdgcn_struct_buffer_atomic_and: 5558 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and: 5559 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 5560 case Intrinsic::amdgcn_raw_buffer_atomic_or: 5561 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or: 5562 case Intrinsic::amdgcn_struct_buffer_atomic_or: 5563 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or: 5564 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 5565 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 5566 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor: 5567 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 5568 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor: 5569 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 5570 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 5571 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc: 5572 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 5573 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc: 5574 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 5575 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 5576 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec: 5577 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 5578 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec: 5579 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 5580 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 5581 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: 5582 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 5583 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: 5584 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 5585 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 5586 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd: 5587 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 5588 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd: 5589 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 5590 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 5591 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin: 5592 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 5593 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin: 5594 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN; 5595 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 5596 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax: 5597 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 5598 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax: 5599 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; 5600 default: 5601 llvm_unreachable("unhandled atomic opcode"); 5602 } 5603 } 5604 5605 bool 
AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
5606                                           MachineIRBuilder &B,
5607                                           Intrinsic::ID IID) const {
5608   const bool IsCmpSwap =
5609       IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
5610       IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
5611       IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
5612       IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
5613   const bool HasReturn = MI.getNumExplicitDefs() != 0;
5614 
5615   Register Dst;
5616 
5617   int OpOffset = 0;
5618   if (HasReturn) {
5619     // A few FP atomics do not support return values.
5620     Dst = MI.getOperand(0).getReg();
5621   } else {
5622     OpOffset = -1;
5623   }
5624 
5625   // Since we don't have 128-bit atomics, we don't need to handle the case of
5626   // p8 arguments to the atomic itself.
5627   Register VData = MI.getOperand(2 + OpOffset).getReg();
5628   Register CmpVal;
5629 
5630   if (IsCmpSwap) {
5631     CmpVal = MI.getOperand(3 + OpOffset).getReg();
5632     ++OpOffset;
5633   }
5634 
5635   castBufferRsrcArgToV4I32(MI, B, 3 + OpOffset);
5636   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
5637   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
5638 
5639   // The struct intrinsic variants add one additional operand over raw.
5640   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
5641   Register VIndex;
5642   if (HasVIndex) {
5643     VIndex = MI.getOperand(4 + OpOffset).getReg();
5644     ++OpOffset;
5645   } else {
5646     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
5647   }
5648 
5649   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
5650   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
5651   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
5652 
5653   MachineMemOperand *MMO = *MI.memoperands_begin();
5654 
5655   unsigned ImmOffset;
5656   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
5657 
5658   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
5659 
5660   if (HasReturn)
5661     MIB.addDef(Dst);
5662 
5663   MIB.addUse(VData); // vdata
5664 
5665   if (IsCmpSwap)
5666     MIB.addReg(CmpVal);
5667 
5668   MIB.addUse(RSrc)                // rsrc
5669       .addUse(VIndex)             // vindex
5670       .addUse(VOffset)            // voffset
5671       .addUse(SOffset)            // soffset
5672       .addImm(ImmOffset)          // offset(imm)
5673       .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
5674       .addImm(HasVIndex ? -1 : 0) // idxen(imm)
5675       .addMemOperand(MMO);
5676 
5677   MI.eraseFromParent();
5678   return true;
5679 }
5680 
5681 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
5682 /// vector with s16 typed elements.
5683 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
5684                                       SmallVectorImpl<Register> &PackedAddrs,
5685                                       unsigned ArgOffset,
5686                                       const AMDGPU::ImageDimIntrinsicInfo *Intr,
5687                                       bool IsA16, bool IsG16) {
5688   const LLT S16 = LLT::scalar(16);
5689   const LLT V2S16 = LLT::fixed_vector(2, 16);
5690   auto EndIdx = Intr->VAddrEnd;
5691 
5692   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
5693     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
5694     if (!SrcOp.isReg())
5695       continue; // _L to _LZ may have eliminated this.
5696 
5697     Register AddrReg = SrcOp.getReg();
5698 
5699     if ((I < Intr->GradientStart) ||
5700         (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
5701         (I >= Intr->CoordStart && !IsA16)) {
5702       if ((I < Intr->GradientStart) && IsA16 &&
5703           (B.getMRI()->getType(AddrReg) == S16)) {
5704         assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
5705         // Special handling of bias when A16 is on. Bias is of type half but
5706         // occupies a full 32 bits.
5707         PackedAddrs.push_back(
5708             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
5709                 .getReg(0));
5710       } else {
5711         assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
5712                "Bias needs to be converted to 16 bit in A16 mode");
5713         // Handle any gradient or coordinate operands that should not be packed.
5714         AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
5715         PackedAddrs.push_back(AddrReg);
5716       }
5717     } else {
5718       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
5719       // derivatives dx/dh and dx/dv are packed with undef.
5720       if (((I + 1) >= EndIdx) ||
5721           ((Intr->NumGradients / 2) % 2 == 1 &&
5722            (I == static_cast<unsigned>(Intr->GradientStart +
5723                                        (Intr->NumGradients / 2) - 1) ||
5724             I == static_cast<unsigned>(Intr->GradientStart +
5725                                        Intr->NumGradients - 1))) ||
5726           // Check for _L to _LZ optimization
5727           !MI.getOperand(ArgOffset + I + 1).isReg()) {
5728         PackedAddrs.push_back(
5729             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
5730                 .getReg(0));
5731       } else {
5732         PackedAddrs.push_back(
5733             B.buildBuildVector(
5734                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
5735                 .getReg(0));
5736         ++I;
5737       }
5738     }
5739   }
5740 }
5741 
5742 /// Convert from separate vaddr components to a single vector address register,
5743 /// and replace the remaining operands with $noreg.
5744 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
5745                                      int DimIdx, int NumVAddrs) {
5746   const LLT S32 = LLT::scalar(32);
5747   (void)S32;
5748   SmallVector<Register, 8> AddrRegs;
5749   for (int I = 0; I != NumVAddrs; ++I) {
5750     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
5751     if (SrcOp.isReg()) {
5752       AddrRegs.push_back(SrcOp.getReg());
5753       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
5754     }
5755   }
5756 
5757   int NumAddrRegs = AddrRegs.size();
5758   if (NumAddrRegs != 1) {
5759     auto VAddr =
5760         B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
5761     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
5762   }
5763 
5764   for (int I = 1; I != NumVAddrs; ++I) {
5765     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
5766     if (SrcOp.isReg())
5767       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
5768   }
5769 }
5770 
5771 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
5772 ///
5773 /// Depending on the subtarget, loads/stores with 16-bit element data need to be
5774 /// rewritten to use the low half of 32-bit registers, or directly use a packed
5775 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
5776 /// registers.
5777 ///
5778 /// We don't want to directly select image instructions just yet, but also want
5779 /// to expose all register repacking to the legalizer/combiners. We also don't
5780 /// want a selected instruction entering RegBankSelect. In order to avoid
5781 /// defining a multitude of intermediate image instructions, directly hack on
5782 /// the intrinsic's arguments. In cases like a16 addresses, this requires
5783 /// padding the now-unnecessary arguments with $noreg.
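///
/// Illustrative example: with A16 enabled, two s16 coordinates are packed into
/// a single <2 x s16> vaddr operand, and the operand slot that previously held
/// the second coordinate is replaced with $noreg.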
5784 bool AMDGPULegalizerInfo::legalizeImageIntrinsic( 5785 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, 5786 const AMDGPU::ImageDimIntrinsicInfo *Intr) const { 5787 5788 const MachineFunction &MF = *MI.getMF(); 5789 const unsigned NumDefs = MI.getNumExplicitDefs(); 5790 const unsigned ArgOffset = NumDefs + 1; 5791 bool IsTFE = NumDefs == 2; 5792 // We are only processing the operands of d16 image operations on subtargets 5793 // that use the unpacked register layout, or need to repack the TFE result. 5794 5795 // TODO: Do we need to guard against already legalized intrinsics? 5796 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 5797 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); 5798 5799 MachineRegisterInfo *MRI = B.getMRI(); 5800 const LLT S32 = LLT::scalar(32); 5801 const LLT S16 = LLT::scalar(16); 5802 const LLT V2S16 = LLT::fixed_vector(2, 16); 5803 5804 unsigned DMask = 0; 5805 Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg(); 5806 LLT Ty = MRI->getType(VData); 5807 5808 // Check for 16 bit addresses and pack if true. 5809 LLT GradTy = 5810 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); 5811 LLT AddrTy = 5812 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); 5813 const bool IsG16 = 5814 ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16; 5815 const bool IsA16 = AddrTy == S16; 5816 const bool IsD16 = Ty.getScalarType() == S16; 5817 5818 int DMaskLanes = 0; 5819 if (!BaseOpcode->Atomic) { 5820 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); 5821 if (BaseOpcode->Gather4) { 5822 DMaskLanes = 4; 5823 } else if (DMask != 0) { 5824 DMaskLanes = llvm::popcount(DMask); 5825 } else if (!IsTFE && !BaseOpcode->Store) { 5826 // If dmask is 0, this is a no-op load. This can be eliminated. 5827 B.buildUndef(MI.getOperand(0)); 5828 MI.eraseFromParent(); 5829 return true; 5830 } 5831 } 5832 5833 Observer.changingInstr(MI); 5834 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 5835 5836 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16 5837 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE; 5838 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 5839 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 5840 unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode; 5841 5842 // Track that we legalized this 5843 MI.setDesc(B.getTII().get(NewOpcode)); 5844 5845 // Expecting to get an error flag since TFC is on - and dmask is 0 Force 5846 // dmask to be at least 1 otherwise the instruction will fail 5847 if (IsTFE && DMask == 0) { 5848 DMask = 0x1; 5849 DMaskLanes = 1; 5850 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask); 5851 } 5852 5853 if (BaseOpcode->Atomic) { 5854 Register VData0 = MI.getOperand(2).getReg(); 5855 LLT Ty = MRI->getType(VData0); 5856 5857 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 5858 if (Ty.isVector()) 5859 return false; 5860 5861 if (BaseOpcode->AtomicX2) { 5862 Register VData1 = MI.getOperand(3).getReg(); 5863 // The two values are packed in one register. 5864 LLT PackedTy = LLT::fixed_vector(2, Ty); 5865 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 5866 MI.getOperand(2).setReg(Concat.getReg(0)); 5867 MI.getOperand(3).setReg(AMDGPU::NoRegister); 5868 } 5869 } 5870 5871 unsigned CorrectedNumVAddrs = Intr->NumVAddrs; 5872 5873 // Rewrite the addressing register layout before doing anything else. 
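// Depending on A16/G16 support and the NSA checks below, this either packs
// 16-bit operands into dwords, concatenates the address operands into a single
// vector register, or leaves them as separate dword operands.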
5874 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) { 5875 // 16 bit gradients are supported, but are tied to the A16 control 5876 // so both gradients and addresses must be 16 bit 5877 return false; 5878 } 5879 5880 if (IsA16 && !ST.hasA16()) { 5881 // A16 not supported 5882 return false; 5883 } 5884 5885 const unsigned NSAMaxSize = ST.getNSAMaxSize(); 5886 const unsigned HasPartialNSA = ST.hasPartialNSAEncoding(); 5887 5888 if (IsA16 || IsG16) { 5889 if (Intr->NumVAddrs > 1) { 5890 SmallVector<Register, 4> PackedRegs; 5891 5892 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, 5893 IsG16); 5894 5895 // See also below in the non-a16 branch 5896 const bool UseNSA = ST.hasNSAEncoding() && 5897 PackedRegs.size() >= ST.getNSAThreshold(MF) && 5898 (PackedRegs.size() <= NSAMaxSize || HasPartialNSA); 5899 const bool UsePartialNSA = 5900 UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize; 5901 5902 if (UsePartialNSA) { 5903 // Pack registers that would go over NSAMaxSize into last VAddr register 5904 LLT PackedAddrTy = 5905 LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16); 5906 auto Concat = B.buildConcatVectors( 5907 PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1)); 5908 PackedRegs[NSAMaxSize - 1] = Concat.getReg(0); 5909 PackedRegs.resize(NSAMaxSize); 5910 } else if (!UseNSA && PackedRegs.size() > 1) { 5911 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16); 5912 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 5913 PackedRegs[0] = Concat.getReg(0); 5914 PackedRegs.resize(1); 5915 } 5916 5917 const unsigned NumPacked = PackedRegs.size(); 5918 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { 5919 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 5920 if (!SrcOp.isReg()) { 5921 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 5922 continue; 5923 } 5924 5925 assert(SrcOp.getReg() != AMDGPU::NoRegister); 5926 5927 if (I - Intr->VAddrStart < NumPacked) 5928 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); 5929 else 5930 SrcOp.setReg(AMDGPU::NoRegister); 5931 } 5932 } 5933 } else { 5934 // If the register allocator cannot place the address registers contiguously 5935 // without introducing moves, then using the non-sequential address encoding 5936 // is always preferable, since it saves VALU instructions and is usually a 5937 // wash in terms of code size or even better. 5938 // 5939 // However, we currently have no way of hinting to the register allocator 5940 // that MIMG addresses should be placed contiguously when it is possible to 5941 // do so, so force non-NSA for the common 2-address case as a heuristic. 5942 // 5943 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 5944 // allocation when possible. 5945 // 5946 // Partial NSA is allowed on GFX11 where the final register is a contiguous 5947 // set of the remaining addresses. 
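// For example (hypothetical sizes): with NSAMaxSize == 5 and 7 address dwords,
// partial NSA packs the last 3 dwords into one vector register, so the
// instruction ends up with exactly 5 vaddr operands.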
5948 const bool UseNSA = ST.hasNSAEncoding() && 5949 CorrectedNumVAddrs >= ST.getNSAThreshold(MF) && 5950 (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA); 5951 const bool UsePartialNSA = 5952 UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize; 5953 5954 if (UsePartialNSA) { 5955 convertImageAddrToPacked(B, MI, 5956 ArgOffset + Intr->VAddrStart + NSAMaxSize - 1, 5957 Intr->NumVAddrs - NSAMaxSize + 1); 5958 } else if (!UseNSA && Intr->NumVAddrs > 1) { 5959 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart, 5960 Intr->NumVAddrs); 5961 } 5962 } 5963 5964 int Flags = 0; 5965 if (IsA16) 5966 Flags |= 1; 5967 if (IsG16) 5968 Flags |= 2; 5969 MI.addOperand(MachineOperand::CreateImm(Flags)); 5970 5971 if (BaseOpcode->Store) { // No TFE for stores? 5972 // TODO: Handle dmask trim 5973 if (!Ty.isVector() || !IsD16) 5974 return true; 5975 5976 Register RepackedReg = handleD16VData(B, *MRI, VData, true); 5977 if (RepackedReg != VData) { 5978 MI.getOperand(1).setReg(RepackedReg); 5979 } 5980 5981 return true; 5982 } 5983 5984 Register DstReg = MI.getOperand(0).getReg(); 5985 const LLT EltTy = Ty.getScalarType(); 5986 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 5987 5988 // Confirm that the return type is large enough for the dmask specified 5989 if (NumElts < DMaskLanes) 5990 return false; 5991 5992 if (NumElts > 4 || DMaskLanes > 4) 5993 return false; 5994 5995 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; 5996 const LLT AdjustedTy = 5997 Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts)); 5998 5999 // The raw dword aligned data component of the load. The only legal cases 6000 // where this matters should be when using the packed D16 format, for 6001 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>, 6002 LLT RoundedTy; 6003 6004 // S32 vector to cover all data, plus TFE result element. 6005 LLT TFETy; 6006 6007 // Register type to use for each loaded component. Will be S32 or V2S16. 6008 LLT RegTy; 6009 6010 if (IsD16 && ST.hasUnpackedD16VMem()) { 6011 RoundedTy = 6012 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32); 6013 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32); 6014 RegTy = S32; 6015 } else { 6016 unsigned EltSize = EltTy.getSizeInBits(); 6017 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; 6018 unsigned RoundedSize = 32 * RoundedElts; 6019 RoundedTy = LLT::scalarOrVector( 6020 ElementCount::getFixed(RoundedSize / EltSize), EltSize); 6021 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32); 6022 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; 6023 } 6024 6025 // The return type does not need adjustment. 6026 // TODO: Should we change s16 case to s32 or <2 x s16>? 6027 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) 6028 return true; 6029 6030 Register Dst1Reg; 6031 6032 // Insert after the instruction. 6033 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 6034 6035 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x 6036 // s16> instead of s32, we would only need 1 bitcast instead of multiple. 6037 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy; 6038 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; 6039 6040 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); 6041 6042 MI.getOperand(0).setReg(NewResultReg); 6043 6044 // In the IR, TFE is supposed to be used with a 2 element struct return 6045 // type. 
The instruction really returns these two values in one contiguous 6046 // register, with one additional dword beyond the loaded data. Rewrite the 6047 // return type to use a single register result. 6048 6049 if (IsTFE) { 6050 Dst1Reg = MI.getOperand(1).getReg(); 6051 if (MRI->getType(Dst1Reg) != S32) 6052 return false; 6053 6054 // TODO: Make sure the TFE operand bit is set. 6055 MI.removeOperand(1); 6056 6057 // Handle the easy case that requires no repack instructions. 6058 if (Ty == S32) { 6059 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); 6060 return true; 6061 } 6062 } 6063 6064 // Now figure out how to copy the new result register back into the old 6065 // result. 6066 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 6067 6068 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs; 6069 6070 if (ResultNumRegs == 1) { 6071 assert(!IsTFE); 6072 ResultRegs[0] = NewResultReg; 6073 } else { 6074 // We have to repack into a new vector of some kind. 6075 for (int I = 0; I != NumDataRegs; ++I) 6076 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 6077 B.buildUnmerge(ResultRegs, NewResultReg); 6078 6079 // Drop the final TFE element to get the data part. The TFE result is 6080 // directly written to the right place already. 6081 if (IsTFE) 6082 ResultRegs.resize(NumDataRegs); 6083 } 6084 6085 // For an s16 scalar result, we form an s32 result with a truncate regardless 6086 // of packed vs. unpacked. 6087 if (IsD16 && !Ty.isVector()) { 6088 B.buildTrunc(DstReg, ResultRegs[0]); 6089 return true; 6090 } 6091 6092 // Avoid a build/concat_vector of 1 entry. 6093 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 6094 B.buildBitcast(DstReg, ResultRegs[0]); 6095 return true; 6096 } 6097 6098 assert(Ty.isVector()); 6099 6100 if (IsD16) { 6101 // For packed D16 results with TFE enabled, all the data components are 6102 // S32. Cast back to the expected type. 6103 // 6104 // TODO: We don't really need to use load s32 elements. We would only need one 6105 // cast for the TFE result if a multiple of v2s16 was used. 6106 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 6107 for (Register &Reg : ResultRegs) 6108 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 6109 } else if (ST.hasUnpackedD16VMem()) { 6110 for (Register &Reg : ResultRegs) 6111 Reg = B.buildTrunc(S16, Reg).getReg(0); 6112 } 6113 } 6114 6115 auto padWithUndef = [&](LLT Ty, int NumElts) { 6116 if (NumElts == 0) 6117 return; 6118 Register Undef = B.buildUndef(Ty).getReg(0); 6119 for (int I = 0; I != NumElts; ++I) 6120 ResultRegs.push_back(Undef); 6121 }; 6122 6123 // Pad out any elements eliminated due to the dmask. 6124 LLT ResTy = MRI->getType(ResultRegs[0]); 6125 if (!ResTy.isVector()) { 6126 padWithUndef(ResTy, NumElts - ResultRegs.size()); 6127 B.buildBuildVector(DstReg, ResultRegs); 6128 return true; 6129 } 6130 6131 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 6132 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 6133 6134 // Deal with the one annoying legal case. 
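// A <3 x s16> result does not fill a whole number of dwords; the code below
// widens the repacked value to <4 x s16> when needed, then drops or pads
// trailing elements to match the 3-element destination.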
6135 const LLT V3S16 = LLT::fixed_vector(3, 16); 6136 if (Ty == V3S16) { 6137 if (IsTFE) { 6138 if (ResultRegs.size() == 1) { 6139 NewResultReg = ResultRegs[0]; 6140 } else if (ResultRegs.size() == 2) { 6141 LLT V4S16 = LLT::fixed_vector(4, 16); 6142 NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0); 6143 } else { 6144 return false; 6145 } 6146 } 6147 6148 if (MRI->getType(DstReg).getNumElements() < 6149 MRI->getType(NewResultReg).getNumElements()) { 6150 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg); 6151 } else { 6152 B.buildPadVectorWithUndefElements(DstReg, NewResultReg); 6153 } 6154 return true; 6155 } 6156 6157 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 6158 B.buildConcatVectors(DstReg, ResultRegs); 6159 return true; 6160 } 6161 6162 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 6163 LegalizerHelper &Helper, MachineInstr &MI) const { 6164 MachineIRBuilder &B = Helper.MIRBuilder; 6165 GISelChangeObserver &Observer = Helper.Observer; 6166 6167 Register Dst = MI.getOperand(0).getReg(); 6168 LLT Ty = B.getMRI()->getType(Dst); 6169 unsigned Size = Ty.getSizeInBits(); 6170 MachineFunction &MF = B.getMF(); 6171 6172 Observer.changingInstr(MI); 6173 6174 // Handle needing to s.buffer.load() a p8 value. 6175 if (hasBufferRsrcWorkaround(Ty)) { 6176 Ty = castBufferRsrcFromV4I32(MI, B, *B.getMRI(), 0); 6177 Dst = MI.getOperand(0).getReg(); 6178 B.setInsertPt(B.getMBB(), MI); 6179 } 6180 if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) { 6181 Ty = getBitcastRegisterType(Ty); 6182 Helper.bitcastDst(MI, Ty, 0); 6183 Dst = MI.getOperand(0).getReg(); 6184 B.setInsertPt(B.getMBB(), MI); 6185 } 6186 6187 // FIXME: We don't really need this intermediate instruction. The intrinsic 6188 // should be fixed to have a memory operand. Since it's readnone, we're not 6189 // allowed to add one. 6190 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 6191 MI.removeOperand(1); // Remove intrinsic ID 6192 6193 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 6194 // TODO: Should this use datalayout alignment? 6195 const unsigned MemSize = (Size + 7) / 8; 6196 const Align MemAlign(4); 6197 MachineMemOperand *MMO = MF.getMachineMemOperand( 6198 MachinePointerInfo(), 6199 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 6200 MachineMemOperand::MOInvariant, 6201 MemSize, MemAlign); 6202 MI.addMemOperand(MF, MMO); 6203 6204 // There are no 96-bit result scalar loads, but widening to 128-bit should 6205 // always be legal. We may need to restore this to a 96-bit result if it turns 6206 // out this needs to be converted to a vector load during RegBankSelect. 6207 if (!isPowerOf2_32(Size)) { 6208 if (Ty.isVector()) 6209 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 6210 else 6211 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 6212 } 6213 6214 Observer.changedInstr(MI); 6215 return true; 6216 } 6217 6218 // TODO: Move to selection 6219 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 6220 MachineRegisterInfo &MRI, 6221 MachineIRBuilder &B) const { 6222 if (!ST.isTrapHandlerEnabled() || 6223 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) 6224 return legalizeTrapEndpgm(MI, MRI, B); 6225 6226 const Module *M = B.getMF().getFunction().getParent(); 6227 unsigned CodeObjectVersion = AMDGPU::getCodeObjectVersion(*M); 6228 if (CodeObjectVersion <= AMDGPU::AMDHSA_COV3) 6229 return legalizeTrapHsaQueuePtr(MI, MRI, B); 6230 6231 return ST.supportsGetDoorbellID() ? 
6232 legalizeTrapHsa(MI, MRI, B) : legalizeTrapHsaQueuePtr(MI, MRI, B); 6233 } 6234 6235 bool AMDGPULegalizerInfo::legalizeTrapEndpgm( 6236 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 6237 const DebugLoc &DL = MI.getDebugLoc(); 6238 MachineBasicBlock &BB = B.getMBB(); 6239 MachineFunction *MF = BB.getParent(); 6240 6241 if (BB.succ_empty() && std::next(MI.getIterator()) == BB.end()) { 6242 BuildMI(BB, BB.end(), DL, B.getTII().get(AMDGPU::S_ENDPGM)) 6243 .addImm(0); 6244 MI.eraseFromParent(); 6245 return true; 6246 } 6247 6248 // We need a block split to make the real endpgm a terminator. We also don't 6249 // want to break phis in successor blocks, so we can't just delete to the 6250 // end of the block. 6251 BB.splitAt(MI, false /*UpdateLiveIns*/); 6252 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); 6253 MF->push_back(TrapBB); 6254 BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM)) 6255 .addImm(0); 6256 BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ)) 6257 .addMBB(TrapBB); 6258 6259 BB.addSuccessor(TrapBB); 6260 MI.eraseFromParent(); 6261 return true; 6262 } 6263 6264 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( 6265 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 6266 MachineFunction &MF = B.getMF(); 6267 const LLT S64 = LLT::scalar(64); 6268 6269 Register SGPR01(AMDGPU::SGPR0_SGPR1); 6270 // For code object version 5, queue_ptr is passed through implicit kernarg. 6271 if (AMDGPU::getCodeObjectVersion(*MF.getFunction().getParent()) >= 6272 AMDGPU::AMDHSA_COV5) { 6273 AMDGPUTargetLowering::ImplicitParameter Param = 6274 AMDGPUTargetLowering::QUEUE_PTR; 6275 uint64_t Offset = 6276 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param); 6277 6278 Register KernargPtrReg = MRI.createGenericVirtualRegister( 6279 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 6280 6281 if (!loadInputValue(KernargPtrReg, B, 6282 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 6283 return false; 6284 6285 // TODO: can we be smarter about machine pointer info? 
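// The queue pointer lives at a fixed offset in the implicit kernarg segment
// here, so emit an invariant, dereferenceable s64 load from
// kernarg_segment_ptr + Offset and hand the loaded value to the trap handler
// in SGPR0_SGPR1.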
6286 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 6287 MachineMemOperand *MMO = MF.getMachineMemOperand( 6288 PtrInfo, 6289 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 6290 MachineMemOperand::MOInvariant, 6291 LLT::scalar(64), commonAlignment(Align(64), Offset)); 6292 6293 // Pointer address 6294 Register LoadAddr = MRI.createGenericVirtualRegister( 6295 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 6296 B.buildPtrAdd(LoadAddr, KernargPtrReg, 6297 B.buildConstant(LLT::scalar(64), Offset).getReg(0)); 6298 // Load address 6299 Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0); 6300 B.buildCopy(SGPR01, Temp); 6301 B.buildInstr(AMDGPU::S_TRAP) 6302 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) 6303 .addReg(SGPR01, RegState::Implicit); 6304 MI.eraseFromParent(); 6305 return true; 6306 } 6307 6308 // Pass queue pointer to trap handler as input, and insert trap instruction 6309 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 6310 Register LiveIn = 6311 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 6312 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 6313 return false; 6314 6315 B.buildCopy(SGPR01, LiveIn); 6316 B.buildInstr(AMDGPU::S_TRAP) 6317 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) 6318 .addReg(SGPR01, RegState::Implicit); 6319 6320 MI.eraseFromParent(); 6321 return true; 6322 } 6323 6324 bool AMDGPULegalizerInfo::legalizeTrapHsa( 6325 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 6326 B.buildInstr(AMDGPU::S_TRAP) 6327 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)); 6328 MI.eraseFromParent(); 6329 return true; 6330 } 6331 6332 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 6333 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 6334 // Is non-HSA path or trap-handler disabled? 
Then, report a warning 6335 // accordingly 6336 if (!ST.isTrapHandlerEnabled() || 6337 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { 6338 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 6339 "debugtrap handler not supported", 6340 MI.getDebugLoc(), DS_Warning); 6341 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 6342 Ctx.diagnose(NoTrap); 6343 } else { 6344 // Insert debug-trap instruction 6345 B.buildInstr(AMDGPU::S_TRAP) 6346 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap)); 6347 } 6348 6349 MI.eraseFromParent(); 6350 return true; 6351 } 6352 6353 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, 6354 MachineIRBuilder &B) const { 6355 MachineRegisterInfo &MRI = *B.getMRI(); 6356 const LLT S16 = LLT::scalar(16); 6357 const LLT S32 = LLT::scalar(32); 6358 const LLT V2S16 = LLT::fixed_vector(2, 16); 6359 const LLT V3S32 = LLT::fixed_vector(3, 32); 6360 6361 Register DstReg = MI.getOperand(0).getReg(); 6362 Register NodePtr = MI.getOperand(2).getReg(); 6363 Register RayExtent = MI.getOperand(3).getReg(); 6364 Register RayOrigin = MI.getOperand(4).getReg(); 6365 Register RayDir = MI.getOperand(5).getReg(); 6366 Register RayInvDir = MI.getOperand(6).getReg(); 6367 Register TDescr = MI.getOperand(7).getReg(); 6368 6369 if (!ST.hasGFX10_AEncoding()) { 6370 DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(), 6371 "intrinsic not supported on subtarget", 6372 MI.getDebugLoc()); 6373 B.getMF().getFunction().getContext().diagnose(BadIntrin); 6374 return false; 6375 } 6376 6377 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST); 6378 const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; 6379 const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; 6380 const unsigned NumVDataDwords = 4; 6381 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11); 6382 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords; 6383 const bool UseNSA = ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize(); 6384 const unsigned BaseOpcodes[2][2] = { 6385 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16}, 6386 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY, 6387 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}}; 6388 int Opcode; 6389 if (UseNSA) { 6390 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], 6391 IsGFX11Plus ? AMDGPU::MIMGEncGfx11NSA 6392 : AMDGPU::MIMGEncGfx10NSA, 6393 NumVDataDwords, NumVAddrDwords); 6394 } else { 6395 Opcode = AMDGPU::getMIMGOpcode( 6396 BaseOpcodes[Is64][IsA16], 6397 IsGFX11Plus ? 
AMDGPU::MIMGEncGfx11Default : AMDGPU::MIMGEncGfx10Default, 6398 NumVDataDwords, NumVAddrDwords); 6399 } 6400 assert(Opcode != -1); 6401 6402 SmallVector<Register, 12> Ops; 6403 if (UseNSA && IsGFX11Plus) { 6404 auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) { 6405 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src); 6406 auto Merged = B.buildMergeLikeInstr( 6407 V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)}); 6408 Ops.push_back(Merged.getReg(0)); 6409 }; 6410 6411 Ops.push_back(NodePtr); 6412 Ops.push_back(RayExtent); 6413 packLanes(RayOrigin); 6414 6415 if (IsA16) { 6416 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir); 6417 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir); 6418 auto MergedDir = B.buildMergeLikeInstr( 6419 V3S32, 6420 {B.buildBitcast( 6421 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0), 6422 UnmergeRayDir.getReg(0)})) 6423 .getReg(0), 6424 B.buildBitcast( 6425 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1), 6426 UnmergeRayDir.getReg(1)})) 6427 .getReg(0), 6428 B.buildBitcast( 6429 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2), 6430 UnmergeRayDir.getReg(2)})) 6431 .getReg(0)}); 6432 Ops.push_back(MergedDir.getReg(0)); 6433 } else { 6434 packLanes(RayDir); 6435 packLanes(RayInvDir); 6436 } 6437 } else { 6438 if (Is64) { 6439 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); 6440 Ops.push_back(Unmerge.getReg(0)); 6441 Ops.push_back(Unmerge.getReg(1)); 6442 } else { 6443 Ops.push_back(NodePtr); 6444 } 6445 Ops.push_back(RayExtent); 6446 6447 auto packLanes = [&Ops, &S32, &B](Register Src) { 6448 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src); 6449 Ops.push_back(Unmerge.getReg(0)); 6450 Ops.push_back(Unmerge.getReg(1)); 6451 Ops.push_back(Unmerge.getReg(2)); 6452 }; 6453 6454 packLanes(RayOrigin); 6455 if (IsA16) { 6456 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir); 6457 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir); 6458 Register R1 = MRI.createGenericVirtualRegister(S32); 6459 Register R2 = MRI.createGenericVirtualRegister(S32); 6460 Register R3 = MRI.createGenericVirtualRegister(S32); 6461 B.buildMergeLikeInstr(R1, 6462 {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); 6463 B.buildMergeLikeInstr( 6464 R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); 6465 B.buildMergeLikeInstr( 6466 R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); 6467 Ops.push_back(R1); 6468 Ops.push_back(R2); 6469 Ops.push_back(R3); 6470 } else { 6471 packLanes(RayDir); 6472 packLanes(RayInvDir); 6473 } 6474 } 6475 6476 if (!UseNSA) { 6477 // Build a single vector containing all the operands so far prepared. 6478 LLT OpTy = LLT::fixed_vector(Ops.size(), 32); 6479 Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0); 6480 Ops.clear(); 6481 Ops.push_back(MergedOps); 6482 } 6483 6484 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY) 6485 .addDef(DstReg) 6486 .addImm(Opcode); 6487 6488 for (Register R : Ops) { 6489 MIB.addUse(R); 6490 } 6491 6492 MIB.addUse(TDescr) 6493 .addImm(IsA16 ? 
bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI,
                                               MachineIRBuilder &B) const {
  unsigned Opc;
  int RoundMode = MI.getOperand(2).getImm();

  if (RoundMode == (int)RoundingMode::TowardPositive)
    Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD;
  else if (RoundMode == (int)RoundingMode::TowardNegative)
    Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD;
  else
    return false;

  B.buildInstr(Opc)
      .addDef(MI.getOperand(0).getReg())
      .addUse(MI.getOperand(1).getReg());

  MI.eraseFromParent();

  return true;
}

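// Custom legalization for target intrinsics: control-flow intrinsics are
// rewritten to SI exec-mask pseudos, argument/ID intrinsics are lowered to
// preloaded registers or kernarg loads, and buffer, atomic, and image
// intrinsics are handed off to their dedicated legalizers.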
bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
                                            MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  // Replace the use of G_BRCOND with the exec manipulate and branch pseudos.
  auto IntrID = MI.getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
            .addDef(Def)
            .addUse(Use)
            .addMBB(UncondBrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
            .addDef(Def)
            .addUse(Use)
            .addMBB(UncondBrTarget);
      }

      if (Br) {
        Br->getOperand(0).setMBB(CondBrTarget);
      } else {
        // The IRTranslator skips inserting the G_BR for fallthrough cases, but
        // since we're swapping branch targets it needs to be reinserted.
        // FIXME: IRTranslator should probably not do this
        B.buildBr(*CondBrTarget);
      }

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      Register Reg = MI.getOperand(2).getReg();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      B.buildInstr(AMDGPU::SI_LOOP)
          .addUse(Reg)
          .addMBB(UncondBrTarget);

      if (Br)
        Br->getOperand(0).setMBB(CondBrTarget);
      else
        B.buildBr(*CondBrTarget);

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_make_buffer_rsrc:
    return legalizePointerAsRsrcIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
        MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_lds_kernel_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
        MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
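  // The r600 grid- and group-size queries are loaded from the kernel input
  // area at fixed offsets (SI::KernelInputOffsets).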
  case Intrinsic::r600_read_ngroups_x:
    // TODO: Emit error for hsa
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_X);
  case Intrinsic::r600_read_ngroups_y:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_Y);
  case Intrinsic::r600_read_ngroups_z:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_Z);
  case Intrinsic::r600_read_local_size_x:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_X);
  case Intrinsic::r600_read_local_size_y:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Y);
  case Intrinsic::r600_read_local_size_z:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Z);
  case Intrinsic::r600_read_global_size_x:
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_X);
  case Intrinsic::r600_read_global_size_y:
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Y);
  case Intrinsic::r600_read_global_size_z:
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Z);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(Helper, MI);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_raw_ptr_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
  case Intrinsic::amdgcn_struct_ptr_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
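  // All raw/struct buffer atomics share one legalization path; the intrinsic
  // ID is forwarded so legalizeBufferAtomic can select the matching atomic
  // operation.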
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_rsq_clamp:
    return legalizeRsqClampIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
    return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
    return legalizeBVHIntrinsic(MI, B);
  case Intrinsic::amdgcn_fmed3: {
    GISelChangeObserver &Observer = Helper.Observer;

    // FIXME: This is to work around the inability of tablegen match combiners
    // to match intrinsics in patterns.
    Observer.changingInstr(MI);
    MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
    MI.removeOperand(1);
    Observer.changedInstr(MI);
    return true;
  }
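  // Anything else is either an image-dimension (MIMG) intrinsic, which has its
  // operands rewritten by legalizeImageIntrinsic, or is already legal.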
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}