//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

/// \returns true if this is an odd sized vector which should widen by adding an
/// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
/// excludes s1 vectors, which should always be scalarized.
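/// For example, <5 x s16> (80 bits) also qualifies and is widened by one
/// element (LLT::fixed_vector(5, 16) -> LLT::fixed_vector(6, 16)), while
/// <3 x s32> does not since its elements are already 32 bits wide.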
72 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { 73 return [=](const LegalityQuery &Query) { 74 const LLT Ty = Query.Types[TypeIdx]; 75 if (!Ty.isVector()) 76 return false; 77 78 const LLT EltTy = Ty.getElementType(); 79 const unsigned EltSize = EltTy.getSizeInBits(); 80 return Ty.getNumElements() % 2 != 0 && 81 EltSize > 1 && EltSize < 32 && 82 Ty.getSizeInBits() % 32 != 0; 83 }; 84 } 85 86 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) { 87 return [=](const LegalityQuery &Query) { 88 const LLT Ty = Query.Types[TypeIdx]; 89 return Ty.getSizeInBits() % 32 == 0; 90 }; 91 } 92 93 static LegalityPredicate isWideVec16(unsigned TypeIdx) { 94 return [=](const LegalityQuery &Query) { 95 const LLT Ty = Query.Types[TypeIdx]; 96 const LLT EltTy = Ty.getScalarType(); 97 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2; 98 }; 99 } 100 101 static LegalizeMutation oneMoreElement(unsigned TypeIdx) { 102 return [=](const LegalityQuery &Query) { 103 const LLT Ty = Query.Types[TypeIdx]; 104 const LLT EltTy = Ty.getElementType(); 105 return std::pair(TypeIdx, 106 LLT::fixed_vector(Ty.getNumElements() + 1, EltTy)); 107 }; 108 } 109 110 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { 111 return [=](const LegalityQuery &Query) { 112 const LLT Ty = Query.Types[TypeIdx]; 113 const LLT EltTy = Ty.getElementType(); 114 unsigned Size = Ty.getSizeInBits(); 115 unsigned Pieces = (Size + 63) / 64; 116 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; 117 return std::pair(TypeIdx, LLT::scalarOrVector( 118 ElementCount::getFixed(NewNumElts), EltTy)); 119 }; 120 } 121 122 // Increase the number of vector elements to reach the next multiple of 32-bit 123 // type. 124 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { 125 return [=](const LegalityQuery &Query) { 126 const LLT Ty = Query.Types[TypeIdx]; 127 128 const LLT EltTy = Ty.getElementType(); 129 const int Size = Ty.getSizeInBits(); 130 const int EltSize = EltTy.getSizeInBits(); 131 const int NextMul32 = (Size + 31) / 32; 132 133 assert(EltSize < 32); 134 135 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; 136 return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy)); 137 }; 138 } 139 140 // Increase the number of vector elements to reach the next legal RegClass. 141 static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) { 142 return [=](const LegalityQuery &Query) { 143 const LLT Ty = Query.Types[TypeIdx]; 144 const unsigned NumElts = Ty.getNumElements(); 145 const unsigned EltSize = Ty.getElementType().getSizeInBits(); 146 const unsigned MaxNumElts = MaxRegisterSize / EltSize; 147 148 assert(EltSize == 32 || EltSize == 64); 149 assert(Ty.getSizeInBits() < MaxRegisterSize); 150 151 unsigned NewNumElts; 152 // Find the nearest legal RegClass that is larger than the current type. 
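    // For example, a 416-bit <13 x s32> has no matching SGPR class and is
    // widened to the 512-bit <16 x s32>, while a 160-bit <5 x s32> already has
    // one and is left unchanged.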
153 for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) { 154 if (SIRegisterInfo::getSGPRClassForBitWidth(NewNumElts * EltSize)) 155 break; 156 } 157 158 return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltSize)); 159 }; 160 } 161 162 static LLT getBufferRsrcScalarType(const LLT Ty) { 163 if (!Ty.isVector()) 164 return LLT::scalar(128); 165 const ElementCount NumElems = Ty.getElementCount(); 166 return LLT::vector(NumElems, LLT::scalar(128)); 167 } 168 169 static LLT getBufferRsrcRegisterType(const LLT Ty) { 170 if (!Ty.isVector()) 171 return LLT::fixed_vector(4, LLT::scalar(32)); 172 const unsigned NumElems = Ty.getElementCount().getFixedValue(); 173 return LLT::fixed_vector(NumElems * 4, LLT::scalar(32)); 174 } 175 176 static LLT getBitcastRegisterType(const LLT Ty) { 177 const unsigned Size = Ty.getSizeInBits(); 178 179 if (Size <= 32) { 180 // <2 x s8> -> s16 181 // <4 x s8> -> s32 182 return LLT::scalar(Size); 183 } 184 185 return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32); 186 } 187 188 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) { 189 return [=](const LegalityQuery &Query) { 190 const LLT Ty = Query.Types[TypeIdx]; 191 return std::pair(TypeIdx, getBitcastRegisterType(Ty)); 192 }; 193 } 194 195 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) { 196 return [=](const LegalityQuery &Query) { 197 const LLT Ty = Query.Types[TypeIdx]; 198 unsigned Size = Ty.getSizeInBits(); 199 assert(Size % 32 == 0); 200 return std::pair( 201 TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32)); 202 }; 203 } 204 205 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { 206 return [=](const LegalityQuery &Query) { 207 const LLT QueryTy = Query.Types[TypeIdx]; 208 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; 209 }; 210 } 211 212 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { 213 return [=](const LegalityQuery &Query) { 214 const LLT QueryTy = Query.Types[TypeIdx]; 215 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; 216 }; 217 } 218 219 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { 220 return [=](const LegalityQuery &Query) { 221 const LLT QueryTy = Query.Types[TypeIdx]; 222 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0; 223 }; 224 } 225 226 static bool isRegisterSize(unsigned Size) { 227 return Size % 32 == 0 && Size <= MaxRegisterSize; 228 } 229 230 static bool isRegisterVectorElementType(LLT EltTy) { 231 const int EltSize = EltTy.getSizeInBits(); 232 return EltSize == 16 || EltSize % 32 == 0; 233 } 234 235 static bool isRegisterVectorType(LLT Ty) { 236 const int EltSize = Ty.getElementType().getSizeInBits(); 237 return EltSize == 32 || EltSize == 64 || 238 (EltSize == 16 && Ty.getNumElements() % 2 == 0) || 239 EltSize == 128 || EltSize == 256; 240 } 241 242 // TODO: replace all uses of isRegisterType with isRegisterClassType 243 static bool isRegisterType(LLT Ty) { 244 if (!isRegisterSize(Ty.getSizeInBits())) 245 return false; 246 247 if (Ty.isVector()) 248 return isRegisterVectorType(Ty); 249 250 return true; 251 } 252 253 // Any combination of 32 or 64-bit elements up the maximum register size, and 254 // multiples of v2s16. 255 static LegalityPredicate isRegisterType(unsigned TypeIdx) { 256 return [=](const LegalityQuery &Query) { 257 return isRegisterType(Query.Types[TypeIdx]); 258 }; 259 } 260 261 // RegisterType that doesn't have a corresponding RegClass. 
262 // TODO: Once `isRegisterType` is replaced with `isRegisterClassType` this 263 // should be removed. 264 static LegalityPredicate isIllegalRegisterType(unsigned TypeIdx) { 265 return [=](const LegalityQuery &Query) { 266 LLT Ty = Query.Types[TypeIdx]; 267 return isRegisterType(Ty) && 268 !SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits()); 269 }; 270 } 271 272 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) { 273 return [=](const LegalityQuery &Query) { 274 const LLT QueryTy = Query.Types[TypeIdx]; 275 if (!QueryTy.isVector()) 276 return false; 277 const LLT EltTy = QueryTy.getElementType(); 278 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32; 279 }; 280 } 281 282 static const LLT S1 = LLT::scalar(1); 283 static const LLT S8 = LLT::scalar(8); 284 static const LLT S16 = LLT::scalar(16); 285 static const LLT S32 = LLT::scalar(32); 286 static const LLT F32 = LLT::float32(); 287 static const LLT S64 = LLT::scalar(64); 288 static const LLT F64 = LLT::float64(); 289 static const LLT S96 = LLT::scalar(96); 290 static const LLT S128 = LLT::scalar(128); 291 static const LLT S160 = LLT::scalar(160); 292 static const LLT S224 = LLT::scalar(224); 293 static const LLT S256 = LLT::scalar(256); 294 static const LLT S512 = LLT::scalar(512); 295 static const LLT MaxScalar = LLT::scalar(MaxRegisterSize); 296 297 static const LLT V2S8 = LLT::fixed_vector(2, 8); 298 static const LLT V2S16 = LLT::fixed_vector(2, 16); 299 static const LLT V4S16 = LLT::fixed_vector(4, 16); 300 static const LLT V6S16 = LLT::fixed_vector(6, 16); 301 static const LLT V8S16 = LLT::fixed_vector(8, 16); 302 static const LLT V10S16 = LLT::fixed_vector(10, 16); 303 static const LLT V12S16 = LLT::fixed_vector(12, 16); 304 static const LLT V16S16 = LLT::fixed_vector(16, 16); 305 306 static const LLT V2F16 = LLT::fixed_vector(2, LLT::float16()); 307 static const LLT V2BF16 = V2F16; // FIXME 308 309 static const LLT V2S32 = LLT::fixed_vector(2, 32); 310 static const LLT V3S32 = LLT::fixed_vector(3, 32); 311 static const LLT V4S32 = LLT::fixed_vector(4, 32); 312 static const LLT V5S32 = LLT::fixed_vector(5, 32); 313 static const LLT V6S32 = LLT::fixed_vector(6, 32); 314 static const LLT V7S32 = LLT::fixed_vector(7, 32); 315 static const LLT V8S32 = LLT::fixed_vector(8, 32); 316 static const LLT V9S32 = LLT::fixed_vector(9, 32); 317 static const LLT V10S32 = LLT::fixed_vector(10, 32); 318 static const LLT V11S32 = LLT::fixed_vector(11, 32); 319 static const LLT V12S32 = LLT::fixed_vector(12, 32); 320 static const LLT V16S32 = LLT::fixed_vector(16, 32); 321 static const LLT V32S32 = LLT::fixed_vector(32, 32); 322 323 static const LLT V2S64 = LLT::fixed_vector(2, 64); 324 static const LLT V3S64 = LLT::fixed_vector(3, 64); 325 static const LLT V4S64 = LLT::fixed_vector(4, 64); 326 static const LLT V5S64 = LLT::fixed_vector(5, 64); 327 static const LLT V6S64 = LLT::fixed_vector(6, 64); 328 static const LLT V7S64 = LLT::fixed_vector(7, 64); 329 static const LLT V8S64 = LLT::fixed_vector(8, 64); 330 static const LLT V16S64 = LLT::fixed_vector(16, 64); 331 332 static const LLT V2S128 = LLT::fixed_vector(2, 128); 333 static const LLT V4S128 = LLT::fixed_vector(4, 128); 334 335 static std::initializer_list<LLT> AllScalarTypes = {S32, S64, S96, S128, 336 S160, S224, S256, S512}; 337 338 static std::initializer_list<LLT> AllS16Vectors{ 339 V2S16, V4S16, V6S16, V8S16, V10S16, V12S16, V16S16, V2S128, V4S128}; 340 341 static std::initializer_list<LLT> AllS32Vectors = { 342 V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, 
V8S32, 343 V9S32, V10S32, V11S32, V12S32, V16S32, V32S32}; 344 345 static std::initializer_list<LLT> AllS64Vectors = {V2S64, V3S64, V4S64, V5S64, 346 V6S64, V7S64, V8S64, V16S64}; 347 348 // Checks whether a type is in the list of legal register types. 349 static bool isRegisterClassType(LLT Ty) { 350 if (Ty.isPointerOrPointerVector()) 351 Ty = Ty.changeElementType(LLT::scalar(Ty.getScalarSizeInBits())); 352 353 return is_contained(AllS32Vectors, Ty) || is_contained(AllS64Vectors, Ty) || 354 is_contained(AllScalarTypes, Ty) || is_contained(AllS16Vectors, Ty); 355 } 356 357 static LegalityPredicate isRegisterClassType(unsigned TypeIdx) { 358 return [TypeIdx](const LegalityQuery &Query) { 359 return isRegisterClassType(Query.Types[TypeIdx]); 360 }; 361 } 362 363 // If we have a truncating store or an extending load with a data size larger 364 // than 32-bits, we need to reduce to a 32-bit type. 365 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) { 366 return [=](const LegalityQuery &Query) { 367 const LLT Ty = Query.Types[TypeIdx]; 368 return !Ty.isVector() && Ty.getSizeInBits() > 32 && 369 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits(); 370 }; 371 } 372 373 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we 374 // handle some operations by just promoting the register during 375 // selection. There are also d16 loads on GFX9+ which preserve the high bits. 376 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, 377 bool IsLoad, bool IsAtomic) { 378 switch (AS) { 379 case AMDGPUAS::PRIVATE_ADDRESS: 380 // FIXME: Private element size. 381 return ST.enableFlatScratch() ? 128 : 32; 382 case AMDGPUAS::LOCAL_ADDRESS: 383 return ST.useDS128() ? 128 : 64; 384 case AMDGPUAS::GLOBAL_ADDRESS: 385 case AMDGPUAS::CONSTANT_ADDRESS: 386 case AMDGPUAS::CONSTANT_ADDRESS_32BIT: 387 case AMDGPUAS::BUFFER_RESOURCE: 388 // Treat constant and global as identical. SMRD loads are sometimes usable for 389 // global loads (ideally constant address space should be eliminated) 390 // depending on the context. Legality cannot be context dependent, but 391 // RegBankSelect can split the load as necessary depending on the pointer 392 // register bank/uniformity and if the memory is invariant or not written in a 393 // kernel. 394 return IsLoad ? 512 : 128; 395 default: 396 // FIXME: Flat addresses may contextually need to be split to 32-bit parts 397 // if they may alias scratch depending on the subtarget. This needs to be 398 // moved to custom handling to use addressMayBeAccessedAsPrivate 399 return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32; 400 } 401 } 402 403 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, 404 const LegalityQuery &Query) { 405 const LLT Ty = Query.Types[0]; 406 407 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD 408 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE; 409 410 unsigned RegSize = Ty.getSizeInBits(); 411 uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 412 uint64_t AlignBits = Query.MMODescrs[0].AlignInBits; 413 unsigned AS = Query.Types[1].getAddressSpace(); 414 415 // All of these need to be custom lowered to cast the pointer operand. 416 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) 417 return false; 418 419 // Do not handle extending vector loads. 420 if (Ty.isVector() && MemSize != RegSize) 421 return false; 422 423 // TODO: We should be able to widen loads if the alignment is high enough, but 424 // we also need to modify the memory access size. 
425 #if 0 426 // Accept widening loads based on alignment. 427 if (IsLoad && MemSize < Size) 428 MemSize = std::max(MemSize, Align); 429 #endif 430 431 // Only 1-byte and 2-byte to 32-bit extloads are valid. 432 if (MemSize != RegSize && RegSize != 32) 433 return false; 434 435 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad, 436 Query.MMODescrs[0].Ordering != 437 AtomicOrdering::NotAtomic)) 438 return false; 439 440 switch (MemSize) { 441 case 8: 442 case 16: 443 case 32: 444 case 64: 445 case 128: 446 break; 447 case 96: 448 if (!ST.hasDwordx3LoadStores()) 449 return false; 450 break; 451 case 256: 452 case 512: 453 // These may contextually need to be broken down. 454 break; 455 default: 456 return false; 457 } 458 459 assert(RegSize >= MemSize); 460 461 if (AlignBits < MemSize) { 462 const SITargetLowering *TLI = ST.getTargetLowering(); 463 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, 464 Align(AlignBits / 8))) 465 return false; 466 } 467 468 return true; 469 } 470 471 // The newer buffer intrinsic forms take their resource arguments as 472 // pointers in address space 8, aka s128 values. However, in order to not break 473 // SelectionDAG, the underlying operations have to continue to take v4i32 474 // arguments. Therefore, we convert resource pointers - or vectors of them 475 // to integer values here. 476 static bool hasBufferRsrcWorkaround(const LLT Ty) { 477 if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE) 478 return true; 479 if (Ty.isVector()) { 480 const LLT ElemTy = Ty.getElementType(); 481 return hasBufferRsrcWorkaround(ElemTy); 482 } 483 return false; 484 } 485 486 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so 487 // workaround this. Eventually it should ignore the type for loads and only care 488 // about the size. Return true in cases where we will workaround this for now by 489 // bitcasting. 490 static bool loadStoreBitcastWorkaround(const LLT Ty) { 491 if (EnableNewLegality) 492 return false; 493 494 const unsigned Size = Ty.getSizeInBits(); 495 if (Size <= 64) 496 return false; 497 // Address space 8 pointers get their own workaround. 498 if (hasBufferRsrcWorkaround(Ty)) 499 return false; 500 if (!Ty.isVector()) 501 return true; 502 503 if (Ty.isPointerVector()) 504 return true; 505 506 unsigned EltSize = Ty.getScalarSizeInBits(); 507 return EltSize != 32 && EltSize != 64; 508 } 509 510 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) { 511 const LLT Ty = Query.Types[0]; 512 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) && 513 !hasBufferRsrcWorkaround(Ty) && !loadStoreBitcastWorkaround(Ty); 514 } 515 516 /// Return true if a load or store of the type should be lowered with a bitcast 517 /// to a different type. 518 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, 519 const LLT MemTy) { 520 const unsigned MemSizeInBits = MemTy.getSizeInBits(); 521 const unsigned Size = Ty.getSizeInBits(); 522 if (Size != MemSizeInBits) 523 return Size <= 32 && Ty.isVector(); 524 525 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty)) 526 return true; 527 528 // Don't try to handle bitcasting vector ext loads for now. 529 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) && 530 (Size <= 32 || isRegisterSize(Size)) && 531 !isRegisterVectorElementType(Ty.getElementType()); 532 } 533 534 /// Return true if we should legalize a load by widening an odd sized memory 535 /// access up to the alignment. 
Note that this is the case where the memory access itself changes, not the size of the result register.
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
                            uint64_t AlignInBits, unsigned AddrSpace,
                            unsigned Opcode) {
  unsigned SizeInBits = MemoryTy.getSizeInBits();
  // We don't want to widen cases that are naturally legal.
  if (isPowerOf2_32(SizeInBits))
    return false;

  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
  // end up widening these for a scalar load during RegBankSelect, if we don't
  // have 96-bit scalar loads.
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
    return false;

  if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode, false))
    return false;

  // A load is known dereferenceable up to the alignment, so it's legal to widen
  // to it.
  //
  // TODO: Could check dereferenceable for less aligned cases.
  unsigned RoundedSize = NextPowerOf2(SizeInBits);
  if (AlignInBits < RoundedSize)
    return false;

  // Do not widen if it would introduce a slow unaligned load.
  const SITargetLowering *TLI = ST.getTargetLowering();
  unsigned Fast = 0;
  return TLI->allowsMisalignedMemoryAccessesImpl(
             RoundedSize, AddrSpace, Align(AlignInBits / 8),
             MachineMemOperand::MOLoad, &Fast) &&
         Fast;
}

static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
                            unsigned Opcode) {
  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
    return false;

  return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
                         Query.MMODescrs[0].AlignInBits,
                         Query.Types[1].getAddressSpace(), Opcode);
}

/// Mutates IR (typically a load instruction) to use a <4 x s32> as the initial
/// type of the operand `Idx` and then to transform it to a `p8` via bitcasts
/// and inttoptr. In addition, handle vectors of p8. Returns the new type.
static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B,
                                   MachineRegisterInfo &MRI, unsigned Idx) {
  MachineOperand &MO = MI.getOperand(Idx);

  const LLT PointerTy = MRI.getType(MO.getReg());

  // Paranoidly prevent us from doing this multiple times.
  if (!hasBufferRsrcWorkaround(PointerTy))
    return PointerTy;

  const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
  const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
  if (!PointerTy.isVector()) {
    // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
    const unsigned NumParts = PointerTy.getSizeInBits() / 32;
    const LLT S32 = LLT::scalar(32);

    Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
    std::array<Register, 4> VectorElems;
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    for (unsigned I = 0; I < NumParts; ++I)
      VectorElems[I] =
          B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
    B.buildMergeValues(MO, VectorElems);
    MO.setReg(VectorReg);
    return VectorTy;
  }
  Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
  B.setInsertPt(B.getMBB(), ++B.getInsertPt());
  auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
  B.buildIntToPtr(MO, Scalar);
  MO.setReg(BitcastReg);

  return VectorTy;
}

/// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is
/// the form in which the value must be in order to be passed to the low-level
/// representations used for MUBUF/MTBUF intrinsics.
This is a hack, which is 623 /// needed in order to account for the fact that we can't define a register 624 /// class for s128 without breaking SelectionDAG. 625 static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B) { 626 MachineRegisterInfo &MRI = *B.getMRI(); 627 const LLT PointerTy = MRI.getType(Pointer); 628 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy); 629 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy); 630 631 if (!PointerTy.isVector()) { 632 // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32) 633 SmallVector<Register, 4> PointerParts; 634 const unsigned NumParts = PointerTy.getSizeInBits() / 32; 635 auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer); 636 for (unsigned I = 0; I < NumParts; ++I) 637 PointerParts.push_back(Unmerged.getReg(I)); 638 return B.buildBuildVector(VectorTy, PointerParts).getReg(0); 639 } 640 Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0); 641 return B.buildBitcast(VectorTy, Scalar).getReg(0); 642 } 643 644 static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B, 645 unsigned Idx) { 646 MachineOperand &MO = MI.getOperand(Idx); 647 648 const LLT PointerTy = B.getMRI()->getType(MO.getReg()); 649 // Paranoidly prevent us from doing this multiple times. 650 if (!hasBufferRsrcWorkaround(PointerTy)) 651 return; 652 MO.setReg(castBufferRsrcToV4I32(MO.getReg(), B)); 653 } 654 655 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 656 const GCNTargetMachine &TM) 657 : ST(ST_) { 658 using namespace TargetOpcode; 659 660 auto GetAddrSpacePtr = [&TM](unsigned AS) { 661 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 662 }; 663 664 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 665 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 666 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 667 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 668 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 669 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 670 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 671 const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER); 672 const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE); 673 const LLT BufferStridedPtr = 674 GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER); 675 676 const LLT CodePtr = FlatPtr; 677 678 const std::initializer_list<LLT> AddrSpaces64 = { 679 GlobalPtr, ConstantPtr, FlatPtr 680 }; 681 682 const std::initializer_list<LLT> AddrSpaces32 = { 683 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 684 }; 685 686 const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr}; 687 688 const std::initializer_list<LLT> FPTypesBase = { 689 S32, S64 690 }; 691 692 const std::initializer_list<LLT> FPTypes16 = { 693 S32, S64, S16 694 }; 695 696 const std::initializer_list<LLT> FPTypesPK16 = { 697 S32, S64, S16, V2S16 698 }; 699 700 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; 701 702 // s1 for VCC branches, s32 for SCC branches. 
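  // Whether the condition ends up in VCC (divergent) or SCC (uniform) is only
  // known at RegBankSelect, so both condition types are accepted here.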
703 getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32}); 704 705 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 706 // elements for v3s16 707 getActionDefinitionsBuilder(G_PHI) 708 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256}) 709 .legalFor(AllS32Vectors) 710 .legalFor(AllS64Vectors) 711 .legalFor(AddrSpaces64) 712 .legalFor(AddrSpaces32) 713 .legalFor(AddrSpaces128) 714 .legalIf(isPointer(0)) 715 .clampScalar(0, S16, S256) 716 .widenScalarToNextPow2(0, 32) 717 .clampMaxNumElements(0, S32, 16) 718 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 719 .scalarize(0); 720 721 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) { 722 // Full set of gfx9 features. 723 if (ST.hasScalarAddSub64()) { 724 getActionDefinitionsBuilder({G_ADD, G_SUB}) 725 .legalFor({S64, S32, S16, V2S16}) 726 .clampMaxNumElementsStrict(0, S16, 2) 727 .scalarize(0) 728 .minScalar(0, S16) 729 .widenScalarToNextMultipleOf(0, 32) 730 .maxScalar(0, S32); 731 } else { 732 getActionDefinitionsBuilder({G_ADD, G_SUB}) 733 .legalFor({S32, S16, V2S16}) 734 .clampMaxNumElementsStrict(0, S16, 2) 735 .scalarize(0) 736 .minScalar(0, S16) 737 .widenScalarToNextMultipleOf(0, 32) 738 .maxScalar(0, S32); 739 } 740 741 if (ST.hasScalarSMulU64()) { 742 getActionDefinitionsBuilder(G_MUL) 743 .legalFor({S64, S32, S16, V2S16}) 744 .clampMaxNumElementsStrict(0, S16, 2) 745 .scalarize(0) 746 .minScalar(0, S16) 747 .widenScalarToNextMultipleOf(0, 32) 748 .custom(); 749 } else { 750 getActionDefinitionsBuilder(G_MUL) 751 .legalFor({S32, S16, V2S16}) 752 .clampMaxNumElementsStrict(0, S16, 2) 753 .scalarize(0) 754 .minScalar(0, S16) 755 .widenScalarToNextMultipleOf(0, 32) 756 .custom(); 757 } 758 assert(ST.hasMad64_32()); 759 760 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) 761 .legalFor({S32, S16, V2S16}) // Clamp modifier 762 .minScalarOrElt(0, S16) 763 .clampMaxNumElementsStrict(0, S16, 2) 764 .scalarize(0) 765 .widenScalarToNextPow2(0, 32) 766 .lower(); 767 } else if (ST.has16BitInsts()) { 768 getActionDefinitionsBuilder({G_ADD, G_SUB}) 769 .legalFor({S32, S16}) 770 .minScalar(0, S16) 771 .widenScalarToNextMultipleOf(0, 32) 772 .maxScalar(0, S32) 773 .scalarize(0); 774 775 getActionDefinitionsBuilder(G_MUL) 776 .legalFor({S32, S16}) 777 .scalarize(0) 778 .minScalar(0, S16) 779 .widenScalarToNextMultipleOf(0, 32) 780 .custom(); 781 assert(ST.hasMad64_32()); 782 783 // Technically the saturating operations require clamp bit support, but this 784 // was introduced at the same time as 16-bit operations. 785 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 786 .legalFor({S32, S16}) // Clamp modifier 787 .minScalar(0, S16) 788 .scalarize(0) 789 .widenScalarToNextPow2(0, 16) 790 .lower(); 791 792 // We're just lowering this, but it helps get a better result to try to 793 // coerce to the desired type first. 794 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 795 .minScalar(0, S16) 796 .scalarize(0) 797 .lower(); 798 } else { 799 getActionDefinitionsBuilder({G_ADD, G_SUB}) 800 .legalFor({S32}) 801 .widenScalarToNextMultipleOf(0, 32) 802 .clampScalar(0, S32, S32) 803 .scalarize(0); 804 805 auto &Mul = getActionDefinitionsBuilder(G_MUL) 806 .legalFor({S32}) 807 .scalarize(0) 808 .minScalar(0, S32) 809 .widenScalarToNextMultipleOf(0, 32); 810 811 if (ST.hasMad64_32()) 812 Mul.custom(); 813 else 814 Mul.maxScalar(0, S32); 815 816 if (ST.hasIntClamp()) { 817 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 818 .legalFor({S32}) // Clamp modifier. 
819 .scalarize(0) 820 .minScalarOrElt(0, S32) 821 .lower(); 822 } else { 823 // Clamp bit support was added in VI, along with 16-bit operations. 824 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 825 .minScalar(0, S32) 826 .scalarize(0) 827 .lower(); 828 } 829 830 // FIXME: DAG expansion gets better results. The widening uses the smaller 831 // range values and goes for the min/max lowering directly. 832 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 833 .minScalar(0, S32) 834 .scalarize(0) 835 .lower(); 836 } 837 838 getActionDefinitionsBuilder( 839 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM}) 840 .customFor({S32, S64}) 841 .clampScalar(0, S32, S64) 842 .widenScalarToNextPow2(0, 32) 843 .scalarize(0); 844 845 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 846 .legalFor({S32}) 847 .maxScalar(0, S32); 848 849 if (ST.hasVOP3PInsts()) { 850 Mulh 851 .clampMaxNumElements(0, S8, 2) 852 .lowerFor({V2S8}); 853 } 854 855 Mulh 856 .scalarize(0) 857 .lower(); 858 859 // Report legal for any types we can handle anywhere. For the cases only legal 860 // on the SALU, RegBankSelect will be able to re-legalize. 861 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 862 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 863 .clampScalar(0, S32, S64) 864 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 865 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 866 .widenScalarToNextPow2(0) 867 .scalarize(0); 868 869 getActionDefinitionsBuilder( 870 {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 871 .legalFor({{S32, S1}, {S32, S32}}) 872 .clampScalar(0, S32, S32) 873 .scalarize(0); 874 875 getActionDefinitionsBuilder(G_BITCAST) 876 // Don't worry about the size constraint. 877 .legalIf(all(isRegisterClassType(0), isRegisterClassType(1))) 878 .lower(); 879 880 getActionDefinitionsBuilder(G_CONSTANT) 881 .legalFor({S1, S32, S64, S16, GlobalPtr, 882 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 883 .legalIf(isPointer(0)) 884 .clampScalar(0, S32, S64) 885 .widenScalarToNextPow2(0); 886 887 getActionDefinitionsBuilder(G_FCONSTANT) 888 .legalFor({S32, S64, S16}) 889 .clampScalar(0, S16, S64); 890 891 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) 892 .legalIf(isRegisterType(0)) 893 // s1 and s16 are special cases because they have legal operations on 894 // them, but don't really occupy registers in the normal way. 895 .legalFor({S1, S16}) 896 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 897 .clampScalarOrElt(0, S32, MaxScalar) 898 .widenScalarToNextPow2(0, 32) 899 .clampMaxNumElements(0, S32, 16); 900 901 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr}); 902 903 // If the amount is divergent, we have to do a wave reduction to get the 904 // maximum value, so this is expanded during RegBankSelect. 
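  // With a uniform size no wave reduction is needed and the {PrivatePtr, S32}
  // form below is selectable directly.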
905 getActionDefinitionsBuilder(G_DYN_STACKALLOC) 906 .legalFor({{PrivatePtr, S32}}); 907 908 getActionDefinitionsBuilder(G_STACKSAVE) 909 .customFor({PrivatePtr}); 910 getActionDefinitionsBuilder(G_STACKRESTORE) 911 .legalFor({PrivatePtr}); 912 913 getActionDefinitionsBuilder({G_GET_FPENV, G_SET_FPENV}).customFor({S64}); 914 915 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 916 .customIf(typeIsNot(0, PrivatePtr)); 917 918 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr}); 919 920 auto &FPOpActions = getActionDefinitionsBuilder( 921 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE, 922 G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA}) 923 .legalFor({S32, S64}); 924 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 925 .customFor({S32, S64}); 926 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 927 .customFor({S32, S64}); 928 929 if (ST.has16BitInsts()) { 930 if (ST.hasVOP3PInsts()) 931 FPOpActions.legalFor({S16, V2S16}); 932 else 933 FPOpActions.legalFor({S16}); 934 935 TrigActions.customFor({S16}); 936 FDIVActions.customFor({S16}); 937 } 938 939 if (ST.hasPackedFP32Ops()) { 940 FPOpActions.legalFor({V2S32}); 941 FPOpActions.clampMaxNumElementsStrict(0, S32, 2); 942 } 943 944 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 945 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 946 947 if (ST.hasVOP3PInsts()) { 948 MinNumMaxNum.customFor(FPTypesPK16) 949 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 950 .clampMaxNumElements(0, S16, 2) 951 .clampScalar(0, S16, S64) 952 .scalarize(0); 953 } else if (ST.has16BitInsts()) { 954 MinNumMaxNum.customFor(FPTypes16) 955 .clampScalar(0, S16, S64) 956 .scalarize(0); 957 } else { 958 MinNumMaxNum.customFor(FPTypesBase) 959 .clampScalar(0, S32, S64) 960 .scalarize(0); 961 } 962 963 if (ST.hasVOP3PInsts()) 964 FPOpActions.clampMaxNumElementsStrict(0, S16, 2); 965 966 FPOpActions 967 .scalarize(0) 968 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 969 970 TrigActions 971 .scalarize(0) 972 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 973 974 FDIVActions 975 .scalarize(0) 976 .clampScalar(0, ST.has16BitInsts() ? 
S16 : S32, S64); 977 978 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 979 .legalFor(FPTypesPK16) 980 .clampMaxNumElementsStrict(0, S16, 2) 981 .scalarize(0) 982 .clampScalar(0, S16, S64); 983 984 if (ST.has16BitInsts()) { 985 getActionDefinitionsBuilder(G_FSQRT) 986 .legalFor({S16}) 987 .customFor({S32, S64}) 988 .scalarize(0) 989 .unsupported(); 990 getActionDefinitionsBuilder(G_FFLOOR) 991 .legalFor({S32, S64, S16}) 992 .scalarize(0) 993 .clampScalar(0, S16, S64); 994 995 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP}) 996 .legalFor({{S32, S32}, {S64, S32}, {S16, S16}}) 997 .scalarize(0) 998 .maxScalarIf(typeIs(0, S16), 1, S16) 999 .clampScalar(1, S32, S32) 1000 .lower(); 1001 1002 getActionDefinitionsBuilder(G_FFREXP) 1003 .customFor({{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}}) 1004 .scalarize(0) 1005 .lower(); 1006 } else { 1007 getActionDefinitionsBuilder(G_FSQRT) 1008 .customFor({S32, S64, S16}) 1009 .scalarize(0) 1010 .unsupported(); 1011 1012 1013 if (ST.hasFractBug()) { 1014 getActionDefinitionsBuilder(G_FFLOOR) 1015 .customFor({S64}) 1016 .legalFor({S32, S64}) 1017 .scalarize(0) 1018 .clampScalar(0, S32, S64); 1019 } else { 1020 getActionDefinitionsBuilder(G_FFLOOR) 1021 .legalFor({S32, S64}) 1022 .scalarize(0) 1023 .clampScalar(0, S32, S64); 1024 } 1025 1026 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP}) 1027 .legalFor({{S32, S32}, {S64, S32}}) 1028 .scalarize(0) 1029 .clampScalar(0, S32, S64) 1030 .clampScalar(1, S32, S32) 1031 .lower(); 1032 1033 getActionDefinitionsBuilder(G_FFREXP) 1034 .customFor({{S32, S32}, {S64, S32}}) 1035 .scalarize(0) 1036 .minScalar(0, S32) 1037 .clampScalar(1, S32, S32) 1038 .lower(); 1039 } 1040 1041 getActionDefinitionsBuilder(G_FPTRUNC) 1042 .legalFor({{S32, S64}, {S16, S32}}) 1043 .scalarize(0) 1044 .lower(); 1045 1046 getActionDefinitionsBuilder(G_FPEXT) 1047 .legalFor({{S64, S32}, {S32, S16}}) 1048 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)) 1049 .scalarize(0); 1050 1051 auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB}); 1052 if (ST.has16BitInsts()) { 1053 FSubActions 1054 // Use actual fsub instruction 1055 .legalFor({S32, S16}) 1056 // Must use fadd + fneg 1057 .lowerFor({S64, V2S16}); 1058 } else { 1059 FSubActions 1060 // Use actual fsub instruction 1061 .legalFor({S32}) 1062 // Must use fadd + fneg 1063 .lowerFor({S64, S16, V2S16}); 1064 } 1065 1066 FSubActions 1067 .scalarize(0) 1068 .clampScalar(0, S32, S64); 1069 1070 // Whether this is legal depends on the floating point mode for the function. 1071 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 1072 if (ST.hasMadF16() && ST.hasMadMacF32Insts()) 1073 FMad.customFor({S32, S16}); 1074 else if (ST.hasMadMacF32Insts()) 1075 FMad.customFor({S32}); 1076 else if (ST.hasMadF16()) 1077 FMad.customFor({S16}); 1078 FMad.scalarize(0) 1079 .lower(); 1080 1081 auto &FRem = getActionDefinitionsBuilder(G_FREM); 1082 if (ST.has16BitInsts()) { 1083 FRem.customFor({S16, S32, S64}); 1084 } else { 1085 FRem.minScalar(0, S32) 1086 .customFor({S32, S64}); 1087 } 1088 FRem.scalarize(0); 1089 1090 // TODO: Do we need to clamp maximum bitwidth? 1091 getActionDefinitionsBuilder(G_TRUNC) 1092 .legalIf(isScalar(0)) 1093 .legalFor({{V2S16, V2S32}}) 1094 .clampMaxNumElements(0, S16, 2) 1095 // Avoid scalarizing in cases that should be truly illegal. In unresolvable 1096 // situations (like an invalid implicit use), we don't want to infinite loop 1097 // in the legalizer. 
1098 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) 1099 .alwaysLegal(); 1100 1101 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 1102 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 1103 {S32, S1}, {S64, S1}, {S16, S1}}) 1104 .scalarize(0) 1105 .clampScalar(0, S32, S64) 1106 .widenScalarToNextPow2(1, 32); 1107 1108 // TODO: Split s1->s64 during regbankselect for VALU. 1109 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 1110 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 1111 .lowerIf(typeIs(1, S1)) 1112 .customFor({{S32, S64}, {S64, S64}}); 1113 if (ST.has16BitInsts()) 1114 IToFP.legalFor({{S16, S16}}); 1115 IToFP.clampScalar(1, S32, S64) 1116 .minScalar(0, S32) 1117 .scalarize(0) 1118 .widenScalarToNextPow2(1); 1119 1120 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 1121 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 1122 .customFor({{S64, S32}, {S64, S64}}) 1123 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)); 1124 if (ST.has16BitInsts()) 1125 FPToI.legalFor({{S16, S16}}); 1126 else 1127 FPToI.minScalar(1, S32); 1128 1129 FPToI.minScalar(0, S32) 1130 .widenScalarToNextPow2(0, 32) 1131 .scalarize(0) 1132 .lower(); 1133 1134 getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND) 1135 .customFor({S16, S32}) 1136 .scalarize(0) 1137 .lower(); 1138 1139 // Lower G_FNEARBYINT and G_FRINT into G_INTRINSIC_ROUNDEVEN 1140 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT}) 1141 .scalarize(0) 1142 .lower(); 1143 1144 if (ST.has16BitInsts()) { 1145 getActionDefinitionsBuilder( 1146 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN}) 1147 .legalFor({S16, S32, S64}) 1148 .clampScalar(0, S16, S64) 1149 .scalarize(0); 1150 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 1151 getActionDefinitionsBuilder( 1152 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN}) 1153 .legalFor({S32, S64}) 1154 .clampScalar(0, S32, S64) 1155 .scalarize(0); 1156 } else { 1157 getActionDefinitionsBuilder( 1158 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN}) 1159 .legalFor({S32}) 1160 .customFor({S64}) 1161 .clampScalar(0, S32, S64) 1162 .scalarize(0); 1163 } 1164 1165 getActionDefinitionsBuilder(G_PTR_ADD) 1166 .unsupportedFor({BufferFatPtr, BufferStridedPtr, RsrcPtr}) 1167 .legalIf(all(isPointer(0), sameSize(0, 1))) 1168 .scalarize(0) 1169 .scalarSameSizeAs(1, 0); 1170 1171 getActionDefinitionsBuilder(G_PTRMASK) 1172 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32}))) 1173 .scalarSameSizeAs(1, 0) 1174 .scalarize(0); 1175 1176 auto &CmpBuilder = 1177 getActionDefinitionsBuilder(G_ICMP) 1178 // The compare output type differs based on the register bank of the output, 1179 // so make both s1 and s32 legal. 1180 // 1181 // Scalar compares producing output in scc will be promoted to s32, as that 1182 // is the allocatable register type that will be needed for the copy from 1183 // scc. This will be promoted during RegBankSelect, and we assume something 1184 // before that won't try to use s32 result types. 1185 // 1186 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 1187 // bank. 
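      // For example, a compare of two uniform 32-bit values selects to an
      // S_CMP instruction writing SCC, while a divergent compare selects to a
      // V_CMP instruction producing a lane mask.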
1188 .legalForCartesianProduct( 1189 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 1190 .legalForCartesianProduct( 1191 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 1192 if (ST.has16BitInsts()) { 1193 CmpBuilder.legalFor({{S1, S16}}); 1194 } 1195 1196 CmpBuilder 1197 .widenScalarToNextPow2(1) 1198 .clampScalar(1, S32, S64) 1199 .scalarize(0) 1200 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 1201 1202 auto &FCmpBuilder = 1203 getActionDefinitionsBuilder(G_FCMP).legalForCartesianProduct( 1204 {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase); 1205 1206 if (ST.hasSALUFloatInsts()) 1207 FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32}); 1208 1209 FCmpBuilder 1210 .widenScalarToNextPow2(1) 1211 .clampScalar(1, S32, S64) 1212 .scalarize(0); 1213 1214 // FIXME: fpow has a selection pattern that should move to custom lowering. 1215 auto &ExpOps = getActionDefinitionsBuilder(G_FPOW); 1216 if (ST.has16BitInsts()) 1217 ExpOps.customFor({{S32}, {S16}}); 1218 else 1219 ExpOps.customFor({S32}); 1220 ExpOps.clampScalar(0, MinScalarFPTy, S32) 1221 .scalarize(0); 1222 1223 getActionDefinitionsBuilder(G_FPOWI) 1224 .clampScalar(0, MinScalarFPTy, S32) 1225 .lower(); 1226 1227 auto &Log2Ops = getActionDefinitionsBuilder({G_FLOG2, G_FEXP2}); 1228 Log2Ops.customFor({S32}); 1229 if (ST.has16BitInsts()) 1230 Log2Ops.legalFor({S16}); 1231 else 1232 Log2Ops.customFor({S16}); 1233 Log2Ops.scalarize(0) 1234 .lower(); 1235 1236 auto &LogOps = 1237 getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10}); 1238 LogOps.customFor({S32, S16}); 1239 LogOps.clampScalar(0, MinScalarFPTy, S32) 1240 .scalarize(0); 1241 1242 // The 64-bit versions produce 32-bit results, but only on the SALU. 1243 getActionDefinitionsBuilder(G_CTPOP) 1244 .legalFor({{S32, S32}, {S32, S64}}) 1245 .clampScalar(0, S32, S32) 1246 .widenScalarToNextPow2(1, 32) 1247 .clampScalar(1, S32, S64) 1248 .scalarize(0) 1249 .widenScalarToNextPow2(0, 32); 1250 1251 // If no 16 bit instr is available, lower into different instructions. 1252 if (ST.has16BitInsts()) 1253 getActionDefinitionsBuilder(G_IS_FPCLASS) 1254 .legalForCartesianProduct({S1}, FPTypes16) 1255 .widenScalarToNextPow2(1) 1256 .scalarize(0) 1257 .lower(); 1258 else 1259 getActionDefinitionsBuilder(G_IS_FPCLASS) 1260 .legalForCartesianProduct({S1}, FPTypesBase) 1261 .lowerFor({S1, S16}) 1262 .widenScalarToNextPow2(1) 1263 .scalarize(0) 1264 .lower(); 1265 1266 // The hardware instructions return a different result on 0 than the generic 1267 // instructions expect. The hardware produces -1, but these produce the 1268 // bitwidth. 1269 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 1270 .scalarize(0) 1271 .clampScalar(0, S32, S32) 1272 .clampScalar(1, S32, S64) 1273 .widenScalarToNextPow2(0, 32) 1274 .widenScalarToNextPow2(1, 32) 1275 .custom(); 1276 1277 // The 64-bit versions produce 32-bit results, but only on the SALU. 
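  // For the VALU, a 64-bit source is instead handled by RegBankSelect, which
  // decomposes it into 32-bit pieces.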
1278 getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF) 1279 .legalFor({{S32, S32}, {S32, S64}}) 1280 .customIf(scalarNarrowerThan(1, 32)) 1281 .clampScalar(0, S32, S32) 1282 .clampScalar(1, S32, S64) 1283 .scalarize(0) 1284 .widenScalarToNextPow2(0, 32) 1285 .widenScalarToNextPow2(1, 32); 1286 1287 getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF) 1288 .legalFor({{S32, S32}, {S32, S64}}) 1289 .clampScalar(0, S32, S32) 1290 .clampScalar(1, S32, S64) 1291 .scalarize(0) 1292 .widenScalarToNextPow2(0, 32) 1293 .widenScalarToNextPow2(1, 32); 1294 1295 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 1296 // RegBankSelect. 1297 getActionDefinitionsBuilder(G_BITREVERSE) 1298 .legalFor({S32, S64}) 1299 .clampScalar(0, S32, S64) 1300 .scalarize(0) 1301 .widenScalarToNextPow2(0); 1302 1303 if (ST.has16BitInsts()) { 1304 getActionDefinitionsBuilder(G_BSWAP) 1305 .legalFor({S16, S32, V2S16}) 1306 .clampMaxNumElementsStrict(0, S16, 2) 1307 // FIXME: Fixing non-power-of-2 before clamp is workaround for 1308 // narrowScalar limitation. 1309 .widenScalarToNextPow2(0) 1310 .clampScalar(0, S16, S32) 1311 .scalarize(0); 1312 1313 if (ST.hasVOP3PInsts()) { 1314 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 1315 .legalFor({S32, S16, V2S16}) 1316 .clampMaxNumElements(0, S16, 2) 1317 .minScalar(0, S16) 1318 .widenScalarToNextPow2(0) 1319 .scalarize(0) 1320 .lower(); 1321 } else { 1322 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 1323 .legalFor({S32, S16}) 1324 .widenScalarToNextPow2(0) 1325 .minScalar(0, S16) 1326 .scalarize(0) 1327 .lower(); 1328 } 1329 } else { 1330 // TODO: Should have same legality without v_perm_b32 1331 getActionDefinitionsBuilder(G_BSWAP) 1332 .legalFor({S32}) 1333 .lowerIf(scalarNarrowerThan(0, 32)) 1334 // FIXME: Fixing non-power-of-2 before clamp is workaround for 1335 // narrowScalar limitation. 
1336 .widenScalarToNextPow2(0) 1337 .maxScalar(0, S32) 1338 .scalarize(0) 1339 .lower(); 1340 1341 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 1342 .legalFor({S32}) 1343 .minScalar(0, S32) 1344 .widenScalarToNextPow2(0) 1345 .scalarize(0) 1346 .lower(); 1347 } 1348 1349 getActionDefinitionsBuilder(G_INTTOPTR) 1350 // List the common cases 1351 .legalForCartesianProduct(AddrSpaces64, {S64}) 1352 .legalForCartesianProduct(AddrSpaces32, {S32}) 1353 .scalarize(0) 1354 // Accept any address space as long as the size matches 1355 .legalIf(sameSize(0, 1)) 1356 .widenScalarIf(smallerThan(1, 0), 1357 [](const LegalityQuery &Query) { 1358 return std::pair( 1359 1, LLT::scalar(Query.Types[0].getSizeInBits())); 1360 }) 1361 .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) { 1362 return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 1363 }); 1364 1365 getActionDefinitionsBuilder(G_PTRTOINT) 1366 // List the common cases 1367 .legalForCartesianProduct(AddrSpaces64, {S64}) 1368 .legalForCartesianProduct(AddrSpaces32, {S32}) 1369 .scalarize(0) 1370 // Accept any address space as long as the size matches 1371 .legalIf(sameSize(0, 1)) 1372 .widenScalarIf(smallerThan(0, 1), 1373 [](const LegalityQuery &Query) { 1374 return std::pair( 1375 0, LLT::scalar(Query.Types[1].getSizeInBits())); 1376 }) 1377 .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) { 1378 return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1379 }); 1380 1381 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 1382 .scalarize(0) 1383 .custom(); 1384 1385 const auto needToSplitMemOp = [=](const LegalityQuery &Query, 1386 bool IsLoad) -> bool { 1387 const LLT DstTy = Query.Types[0]; 1388 1389 // Split vector extloads. 1390 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1391 1392 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 1393 return true; 1394 1395 const LLT PtrTy = Query.Types[1]; 1396 unsigned AS = PtrTy.getAddressSpace(); 1397 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad, 1398 Query.MMODescrs[0].Ordering != 1399 AtomicOrdering::NotAtomic)) 1400 return true; 1401 1402 // Catch weird sized loads that don't evenly divide into the access sizes 1403 // TODO: May be able to widen depending on alignment etc. 1404 unsigned NumRegs = (MemSize + 31) / 32; 1405 if (NumRegs == 3) { 1406 if (!ST.hasDwordx3LoadStores()) 1407 return true; 1408 } else { 1409 // If the alignment allows, these should have been widened. 1410 if (!isPowerOf2_32(NumRegs)) 1411 return true; 1412 } 1413 1414 return false; 1415 }; 1416 1417 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32; 1418 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16; 1419 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8; 1420 1421 // TODO: Refine based on subtargets which support unaligned access or 128-bit 1422 // LDS 1423 // TODO: Unsupported flat for SI. 1424 1425 for (unsigned Op : {G_LOAD, G_STORE}) { 1426 const bool IsStore = Op == G_STORE; 1427 1428 auto &Actions = getActionDefinitionsBuilder(Op); 1429 // Explicitly list some common cases. 1430 // TODO: Does this help compile time at all? 
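    // Each entry below is {result type, pointer type, memory type, minimum
    // alignment in bits}; a minimum of 0 (used when unaligned buffer access is
    // enabled) imposes no extra alignment restriction.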
1431 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32}, 1432 {V2S32, GlobalPtr, V2S32, GlobalAlign32}, 1433 {V4S32, GlobalPtr, V4S32, GlobalAlign32}, 1434 {S64, GlobalPtr, S64, GlobalAlign32}, 1435 {V2S64, GlobalPtr, V2S64, GlobalAlign32}, 1436 {V2S16, GlobalPtr, V2S16, GlobalAlign32}, 1437 {S32, GlobalPtr, S8, GlobalAlign8}, 1438 {S32, GlobalPtr, S16, GlobalAlign16}, 1439 1440 {S32, LocalPtr, S32, 32}, 1441 {S64, LocalPtr, S64, 32}, 1442 {V2S32, LocalPtr, V2S32, 32}, 1443 {S32, LocalPtr, S8, 8}, 1444 {S32, LocalPtr, S16, 16}, 1445 {V2S16, LocalPtr, S32, 32}, 1446 1447 {S32, PrivatePtr, S32, 32}, 1448 {S32, PrivatePtr, S8, 8}, 1449 {S32, PrivatePtr, S16, 16}, 1450 {V2S16, PrivatePtr, S32, 32}, 1451 1452 {S32, ConstantPtr, S32, GlobalAlign32}, 1453 {V2S32, ConstantPtr, V2S32, GlobalAlign32}, 1454 {V4S32, ConstantPtr, V4S32, GlobalAlign32}, 1455 {S64, ConstantPtr, S64, GlobalAlign32}, 1456 {V2S32, ConstantPtr, V2S32, GlobalAlign32}}); 1457 Actions.legalIf( 1458 [=](const LegalityQuery &Query) -> bool { 1459 return isLoadStoreLegal(ST, Query); 1460 }); 1461 1462 // The custom pointers (fat pointers, buffer resources) don't work with load 1463 // and store at this level. Fat pointers should have been lowered to 1464 // intrinsics before the translation to MIR. 1465 Actions.unsupportedIf( 1466 typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr})); 1467 1468 // Address space 8 pointers are handled by a 4xs32 load, bitcast, and 1469 // ptrtoint. This is needed to account for the fact that we can't have i128 1470 // as a register class for SelectionDAG reasons. 1471 Actions.customIf([=](const LegalityQuery &Query) -> bool { 1472 return hasBufferRsrcWorkaround(Query.Types[0]); 1473 }); 1474 1475 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1476 // 64-bits. 1477 // 1478 // TODO: Should generalize bitcast action into coerce, which will also cover 1479 // inserting addrspacecasts. 1480 Actions.customIf(typeIs(1, Constant32Ptr)); 1481 1482 // Turn any illegal element vectors into something easier to deal 1483 // with. These will ultimately produce 32-bit scalar shifts to extract the 1484 // parts anyway. 1485 // 1486 // For odd 16-bit element vectors, prefer to split those into pieces with 1487 // 16-bit vector parts. 1488 Actions.bitcastIf( 1489 [=](const LegalityQuery &Query) -> bool { 1490 return shouldBitcastLoadStoreType(ST, Query.Types[0], 1491 Query.MMODescrs[0].MemoryTy); 1492 }, bitcastToRegisterType(0)); 1493 1494 if (!IsStore) { 1495 // Widen suitably aligned loads by loading extra bytes. The standard 1496 // legalization actions can't properly express widening memory operands. 1497 Actions.customIf([=](const LegalityQuery &Query) -> bool { 1498 return shouldWidenLoad(ST, Query, G_LOAD); 1499 }); 1500 } 1501 1502 // FIXME: load/store narrowing should be moved to lower action 1503 Actions 1504 .narrowScalarIf( 1505 [=](const LegalityQuery &Query) -> bool { 1506 return !Query.Types[0].isVector() && 1507 needToSplitMemOp(Query, Op == G_LOAD); 1508 }, 1509 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1510 const LLT DstTy = Query.Types[0]; 1511 const LLT PtrTy = Query.Types[1]; 1512 1513 const unsigned DstSize = DstTy.getSizeInBits(); 1514 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1515 1516 // Split extloads. 
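            // For example, a G_LOAD producing s64 from a 16-bit access is
            // narrowed so that the load itself produces s16, with the
            // extension reintroduced afterwards.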
1517 if (DstSize > MemSize) 1518 return std::pair(0, LLT::scalar(MemSize)); 1519 1520 unsigned MaxSize = maxSizeForAddrSpace( 1521 ST, PtrTy.getAddressSpace(), Op == G_LOAD, 1522 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic); 1523 if (MemSize > MaxSize) 1524 return std::pair(0, LLT::scalar(MaxSize)); 1525 1526 uint64_t Align = Query.MMODescrs[0].AlignInBits; 1527 return std::pair(0, LLT::scalar(Align)); 1528 }) 1529 .fewerElementsIf( 1530 [=](const LegalityQuery &Query) -> bool { 1531 return Query.Types[0].isVector() && 1532 needToSplitMemOp(Query, Op == G_LOAD); 1533 }, 1534 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1535 const LLT DstTy = Query.Types[0]; 1536 const LLT PtrTy = Query.Types[1]; 1537 1538 LLT EltTy = DstTy.getElementType(); 1539 unsigned MaxSize = maxSizeForAddrSpace( 1540 ST, PtrTy.getAddressSpace(), Op == G_LOAD, 1541 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic); 1542 1543 // FIXME: Handle widened to power of 2 results better. This ends 1544 // up scalarizing. 1545 // FIXME: 3 element stores scalarized on SI 1546 1547 // Split if it's too large for the address space. 1548 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1549 if (MemSize > MaxSize) { 1550 unsigned NumElts = DstTy.getNumElements(); 1551 unsigned EltSize = EltTy.getSizeInBits(); 1552 1553 if (MaxSize % EltSize == 0) { 1554 return std::pair( 1555 0, LLT::scalarOrVector( 1556 ElementCount::getFixed(MaxSize / EltSize), EltTy)); 1557 } 1558 1559 unsigned NumPieces = MemSize / MaxSize; 1560 1561 // FIXME: Refine when odd breakdowns handled 1562 // The scalars will need to be re-legalized. 1563 if (NumPieces == 1 || NumPieces >= NumElts || 1564 NumElts % NumPieces != 0) 1565 return std::pair(0, EltTy); 1566 1567 return std::pair(0, 1568 LLT::fixed_vector(NumElts / NumPieces, EltTy)); 1569 } 1570 1571 // FIXME: We could probably handle weird extending loads better. 1572 if (DstTy.getSizeInBits() > MemSize) 1573 return std::pair(0, EltTy); 1574 1575 unsigned EltSize = EltTy.getSizeInBits(); 1576 unsigned DstSize = DstTy.getSizeInBits(); 1577 if (!isPowerOf2_32(DstSize)) { 1578 // We're probably decomposing an odd sized store. Try to split 1579 // to the widest type. TODO: Account for alignment. As-is it 1580 // should be OK, since the new parts will be further legalized. 1581 unsigned FloorSize = llvm::bit_floor(DstSize); 1582 return std::pair( 1583 0, LLT::scalarOrVector( 1584 ElementCount::getFixed(FloorSize / EltSize), EltTy)); 1585 } 1586 1587 // May need relegalization for the scalars. 1588 return std::pair(0, EltTy); 1589 }) 1590 .minScalar(0, S32) 1591 .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32)) 1592 .widenScalarToNextPow2(0) 1593 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)) 1594 .lower(); 1595 } 1596 1597 // FIXME: Unaligned accesses not lowered. 
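  // G_SEXTLOAD/G_ZEXTLOAD only ever produce a 32-bit result here; wider
  // results are clamped to s32 below and extended as a separate operation.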
1598 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 1599 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8}, 1600 {S32, GlobalPtr, S16, 2 * 8}, 1601 {S32, LocalPtr, S8, 8}, 1602 {S32, LocalPtr, S16, 16}, 1603 {S32, PrivatePtr, S8, 8}, 1604 {S32, PrivatePtr, S16, 16}, 1605 {S32, ConstantPtr, S8, 8}, 1606 {S32, ConstantPtr, S16, 2 * 8}}) 1607 .legalIf( 1608 [=](const LegalityQuery &Query) -> bool { 1609 return isLoadStoreLegal(ST, Query); 1610 }); 1611 1612 if (ST.hasFlatAddressSpace()) { 1613 ExtLoads.legalForTypesWithMemDesc( 1614 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}}); 1615 } 1616 1617 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1618 // 64-bits. 1619 // 1620 // TODO: Should generalize bitcast action into coerce, which will also cover 1621 // inserting addrspacecasts. 1622 ExtLoads.customIf(typeIs(1, Constant32Ptr)); 1623 1624 ExtLoads.clampScalar(0, S32, S32) 1625 .widenScalarToNextPow2(0) 1626 .lower(); 1627 1628 auto &Atomics = getActionDefinitionsBuilder( 1629 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 1630 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 1631 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1632 G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP}) 1633 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1634 {S64, GlobalPtr}, {S64, LocalPtr}, 1635 {S32, RegionPtr}, {S64, RegionPtr}}); 1636 if (ST.hasFlatAddressSpace()) { 1637 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 1638 } 1639 1640 // TODO: v2bf16 operations, and fat buffer pointer support. 1641 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD); 1642 if (ST.hasLDSFPAtomicAddF32()) { 1643 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); 1644 if (ST.hasLdsAtomicAddF64()) 1645 Atomic.legalFor({{S64, LocalPtr}}); 1646 if (ST.hasAtomicDsPkAdd16Insts()) 1647 Atomic.legalFor({{V2F16, LocalPtr}, {V2BF16, LocalPtr}}); 1648 } 1649 if (ST.hasAtomicFaddInsts()) 1650 Atomic.legalFor({{S32, GlobalPtr}}); 1651 if (ST.hasFlatAtomicFaddF32Inst()) 1652 Atomic.legalFor({{S32, FlatPtr}}); 1653 1654 if (ST.hasGFX90AInsts()) { 1655 // These are legal with some caveats, and should have undergone expansion in 1656 // the IR in most situations 1657 // TODO: Move atomic expansion into legalizer 1658 Atomic.legalFor({ 1659 {S32, GlobalPtr}, 1660 {S64, GlobalPtr}, 1661 {S64, FlatPtr} 1662 }); 1663 } 1664 1665 if (ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() || 1666 ST.hasAtomicBufferGlobalPkAddF16Insts()) 1667 Atomic.legalFor({{V2F16, GlobalPtr}, {V2F16, BufferFatPtr}}); 1668 if (ST.hasAtomicGlobalPkAddBF16Inst()) 1669 Atomic.legalFor({{V2BF16, GlobalPtr}}); 1670 if (ST.hasAtomicFlatPkAdd16Insts()) 1671 Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}}); 1672 1673 1674 // Most of the legalization work here is done by AtomicExpand. We could 1675 // probably use a simpler legality rule that just assumes anything is OK. 
1676 auto &AtomicFMinFMax = 1677 getActionDefinitionsBuilder({G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX}) 1678 .legalFor({{F32, LocalPtr}, {F64, LocalPtr}}); 1679 1680 if (ST.hasAtomicFMinFMaxF32GlobalInsts()) 1681 AtomicFMinFMax.legalFor({{F32, GlobalPtr},{F32, BufferFatPtr}}); 1682 if (ST.hasAtomicFMinFMaxF64GlobalInsts()) 1683 AtomicFMinFMax.legalFor({{F64, GlobalPtr}, {F64, BufferFatPtr}}); 1684 if (ST.hasAtomicFMinFMaxF32FlatInsts()) 1685 AtomicFMinFMax.legalFor({F32, FlatPtr}); 1686 if (ST.hasAtomicFMinFMaxF64FlatInsts()) 1687 AtomicFMinFMax.legalFor({F64, FlatPtr}); 1688 1689 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1690 // demarshalling 1691 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1692 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1693 {S32, FlatPtr}, {S64, FlatPtr}}) 1694 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1695 {S32, RegionPtr}, {S64, RegionPtr}}); 1696 // TODO: Pointer types, any 32-bit or 64-bit vector 1697 1698 // Condition should be s32 for scalar, s1 for vector. 1699 getActionDefinitionsBuilder(G_SELECT) 1700 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr, 1701 LocalPtr, FlatPtr, PrivatePtr, 1702 LLT::fixed_vector(2, LocalPtr), 1703 LLT::fixed_vector(2, PrivatePtr)}, 1704 {S1, S32}) 1705 .clampScalar(0, S16, S64) 1706 .scalarize(1) 1707 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1708 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 1709 .clampMaxNumElements(0, S32, 2) 1710 .clampMaxNumElements(0, LocalPtr, 2) 1711 .clampMaxNumElements(0, PrivatePtr, 2) 1712 .scalarize(0) 1713 .widenScalarToNextPow2(0) 1714 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 1715 1716 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 1717 // be more flexible with the shift amount type. 1718 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 1719 .legalFor({{S32, S32}, {S64, S32}}); 1720 if (ST.has16BitInsts()) { 1721 if (ST.hasVOP3PInsts()) { 1722 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) 1723 .clampMaxNumElements(0, S16, 2); 1724 } else 1725 Shifts.legalFor({{S16, S16}}); 1726 1727 // TODO: Support 16-bit shift amounts for all types 1728 Shifts.widenScalarIf( 1729 [=](const LegalityQuery &Query) { 1730 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a 1731 // 32-bit amount. 1732 const LLT ValTy = Query.Types[0]; 1733 const LLT AmountTy = Query.Types[1]; 1734 return ValTy.getSizeInBits() <= 16 && 1735 AmountTy.getSizeInBits() < 16; 1736 }, changeTo(1, S16)); 1737 Shifts.maxScalarIf(typeIs(0, S16), 1, S16); 1738 Shifts.clampScalar(1, S32, S32); 1739 Shifts.widenScalarToNextPow2(0, 16); 1740 Shifts.clampScalar(0, S16, S64); 1741 1742 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1743 .minScalar(0, S16) 1744 .scalarize(0) 1745 .lower(); 1746 } else { 1747 // Make sure we legalize the shift amount type first, as the general 1748 // expansion for the shifted type will produce much worse code if it hasn't 1749 // been truncated already. 1750 Shifts.clampScalar(1, S32, S32); 1751 Shifts.widenScalarToNextPow2(0, 32); 1752 Shifts.clampScalar(0, S32, S64); 1753 1754 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1755 .minScalar(0, S32) 1756 .scalarize(0) 1757 .lower(); 1758 } 1759 Shifts.scalarize(0); 1760 1761 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 1762 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 1763 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 
0 : 1; 1764 unsigned IdxTypeIdx = 2; 1765 1766 getActionDefinitionsBuilder(Op) 1767 .customIf([=](const LegalityQuery &Query) { 1768 const LLT EltTy = Query.Types[EltTypeIdx]; 1769 const LLT VecTy = Query.Types[VecTypeIdx]; 1770 const LLT IdxTy = Query.Types[IdxTypeIdx]; 1771 const unsigned EltSize = EltTy.getSizeInBits(); 1772 const bool isLegalVecType = 1773 !!SIRegisterInfo::getSGPRClassForBitWidth(VecTy.getSizeInBits()); 1774 // Address space 8 pointers are 128-bit wide values, but the logic 1775 // below will try to bitcast them to 2N x s64, which will fail. 1776 // Therefore, as an intermediate step, wrap extracts/insertions from a 1777 // ptrtoint-ing the vector and scalar arguments (or inttoptring the 1778 // extraction result) in order to produce a vector operation that can 1779 // be handled by the logic below. 1780 if (EltTy.isPointer() && EltSize > 64) 1781 return true; 1782 return (EltSize == 32 || EltSize == 64) && 1783 VecTy.getSizeInBits() % 32 == 0 && 1784 VecTy.getSizeInBits() <= MaxRegisterSize && 1785 IdxTy.getSizeInBits() == 32 && 1786 isLegalVecType; 1787 }) 1788 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)), 1789 bitcastToVectorElement32(VecTypeIdx)) 1790 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1)) 1791 .bitcastIf( 1792 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)), 1793 [=](const LegalityQuery &Query) { 1794 // For > 64-bit element types, try to turn this into a 64-bit 1795 // element vector since we may be able to do better indexing 1796 // if this is scalar. If not, fall back to 32. 1797 const LLT EltTy = Query.Types[EltTypeIdx]; 1798 const LLT VecTy = Query.Types[VecTypeIdx]; 1799 const unsigned DstEltSize = EltTy.getSizeInBits(); 1800 const unsigned VecSize = VecTy.getSizeInBits(); 1801 1802 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32; 1803 return std::pair( 1804 VecTypeIdx, 1805 LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize)); 1806 }) 1807 .clampScalar(EltTypeIdx, S32, S64) 1808 .clampScalar(VecTypeIdx, S32, S64) 1809 .clampScalar(IdxTypeIdx, S32, S32) 1810 .clampMaxNumElements(VecTypeIdx, S32, 32) 1811 // TODO: Clamp elements for 64-bit vectors? 1812 .moreElementsIf( 1813 isIllegalRegisterType(VecTypeIdx), 1814 moreElementsToNextExistingRegClass(VecTypeIdx)) 1815 // It should only be necessary with variable indexes. 1816 // As a last resort, lower to the stack 1817 .lower(); 1818 } 1819 1820 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 1821 .unsupportedIf([=](const LegalityQuery &Query) { 1822 const LLT &EltTy = Query.Types[1].getElementType(); 1823 return Query.Types[0] != EltTy; 1824 }); 1825 1826 for (unsigned Op : {G_EXTRACT, G_INSERT}) { 1827 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 1828 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 1829 1830 // FIXME: Doesn't handle extract of illegal sizes. 1831 getActionDefinitionsBuilder(Op) 1832 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 1833 .lowerIf([=](const LegalityQuery &Query) { 1834 // Sub-vector(or single element) insert and extract. 1835 // TODO: verify immediate offset here since lower only works with 1836 // whole elements. 1837 const LLT BigTy = Query.Types[BigTyIdx]; 1838 return BigTy.isVector(); 1839 }) 1840 // FIXME: Multiples of 16 should not be legal. 
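      // The rule below accepts any source whose total size is a multiple of
      // 32 bits paired with an extracted/inserted piece that is a multiple of
      // 16 bits, which is exactly what the FIXME above refers to.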
1841 .legalIf([=](const LegalityQuery &Query) { 1842 const LLT BigTy = Query.Types[BigTyIdx]; 1843 const LLT LitTy = Query.Types[LitTyIdx]; 1844 return (BigTy.getSizeInBits() % 32 == 0) && 1845 (LitTy.getSizeInBits() % 16 == 0); 1846 }) 1847 .widenScalarIf( 1848 [=](const LegalityQuery &Query) { 1849 const LLT BigTy = Query.Types[BigTyIdx]; 1850 return (BigTy.getScalarSizeInBits() < 16); 1851 }, 1852 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16)) 1853 .widenScalarIf( 1854 [=](const LegalityQuery &Query) { 1855 const LLT LitTy = Query.Types[LitTyIdx]; 1856 return (LitTy.getScalarSizeInBits() < 16); 1857 }, 1858 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16)) 1859 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1860 .widenScalarToNextPow2(BigTyIdx, 32); 1861 1862 } 1863 1864 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR) 1865 .legalForCartesianProduct(AllS32Vectors, {S32}) 1866 .legalForCartesianProduct(AllS64Vectors, {S64}) 1867 .clampNumElements(0, V16S32, V32S32) 1868 .clampNumElements(0, V2S64, V16S64) 1869 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)) 1870 .moreElementsIf( 1871 isIllegalRegisterType(0), 1872 moreElementsToNextExistingRegClass(0)); 1873 1874 if (ST.hasScalarPackInsts()) { 1875 BuildVector 1876 // FIXME: Should probably widen s1 vectors straight to s32 1877 .minScalarOrElt(0, S16) 1878 .minScalar(1, S16); 1879 1880 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1881 .legalFor({V2S16, S32}) 1882 .lower(); 1883 } else { 1884 BuildVector.customFor({V2S16, S16}); 1885 BuildVector.minScalarOrElt(0, S32); 1886 1887 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1888 .customFor({V2S16, S32}) 1889 .lower(); 1890 } 1891 1892 BuildVector.legalIf(isRegisterType(0)); 1893 1894 // FIXME: Clamp maximum size 1895 getActionDefinitionsBuilder(G_CONCAT_VECTORS) 1896 .legalIf(all(isRegisterType(0), isRegisterType(1))) 1897 .clampMaxNumElements(0, S32, 32) 1898 .clampMaxNumElements(1, S16, 2) // TODO: Make 4? 1899 .clampMaxNumElements(0, S16, 64); 1900 1901 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); 1902 1903 // Merge/Unmerge 1904 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { 1905 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; 1906 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; 1907 1908 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { 1909 const LLT Ty = Query.Types[TypeIdx]; 1910 if (Ty.isVector()) { 1911 const LLT &EltTy = Ty.getElementType(); 1912 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512) 1913 return true; 1914 if (!llvm::has_single_bit<uint32_t>(EltTy.getSizeInBits())) 1915 return true; 1916 } 1917 return false; 1918 }; 1919 1920 auto &Builder = getActionDefinitionsBuilder(Op) 1921 .legalIf(all(isRegisterType(0), isRegisterType(1))) 1922 .lowerFor({{S16, V2S16}}) 1923 .lowerIf([=](const LegalityQuery &Query) { 1924 const LLT BigTy = Query.Types[BigTyIdx]; 1925 return BigTy.getSizeInBits() == 32; 1926 }) 1927 // Try to widen to s16 first for small types. 1928 // TODO: Only do this on targets with legal s16 shifts 1929 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16) 1930 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 1931 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1932 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 1933 elementTypeIs(1, S16)), 1934 changeTo(1, V2S16)) 1935 // Clamp the little scalar to s8-s256 and make it a power of 2. 
It's not 1936 // worth considering the multiples of 64 since 2*192 and 2*384 are not 1937 // valid. 1938 .clampScalar(LitTyIdx, S32, S512) 1939 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) 1940 // Break up vectors with weird elements into scalars 1941 .fewerElementsIf( 1942 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); }, 1943 scalarize(0)) 1944 .fewerElementsIf( 1945 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); }, 1946 scalarize(1)) 1947 .clampScalar(BigTyIdx, S32, MaxScalar); 1948 1949 if (Op == G_MERGE_VALUES) { 1950 Builder.widenScalarIf( 1951 // TODO: Use 16-bit shifts if legal for 8-bit values? 1952 [=](const LegalityQuery &Query) { 1953 const LLT Ty = Query.Types[LitTyIdx]; 1954 return Ty.getSizeInBits() < 32; 1955 }, 1956 changeTo(LitTyIdx, S32)); 1957 } 1958 1959 Builder.widenScalarIf( 1960 [=](const LegalityQuery &Query) { 1961 const LLT Ty = Query.Types[BigTyIdx]; 1962 return Ty.getSizeInBits() % 16 != 0; 1963 }, 1964 [=](const LegalityQuery &Query) { 1965 // Pick the next power of 2, or a multiple of 64 over 128. 1966 // Whichever is smaller. 1967 const LLT &Ty = Query.Types[BigTyIdx]; 1968 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); 1969 if (NewSizeInBits >= 256) { 1970 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); 1971 if (RoundedTo < NewSizeInBits) 1972 NewSizeInBits = RoundedTo; 1973 } 1974 return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits)); 1975 }) 1976 // Any vectors left are the wrong size. Scalarize them. 1977 .scalarize(0) 1978 .scalarize(1); 1979 } 1980 1981 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 1982 // RegBankSelect. 1983 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG) 1984 .legalFor({{S32}, {S64}}); 1985 1986 if (ST.hasVOP3PInsts()) { 1987 SextInReg.lowerFor({{V2S16}}) 1988 // Prefer to reduce vector widths for 16-bit vectors before lowering, to 1989 // get more vector shift opportunities, since we'll get those when 1990 // expanded. 1991 .clampMaxNumElementsStrict(0, S16, 2); 1992 } else if (ST.has16BitInsts()) { 1993 SextInReg.lowerFor({{S32}, {S64}, {S16}}); 1994 } else { 1995 // Prefer to promote to s32 before lowering if we don't have 16-bit 1996 // shifts. This avoid a lot of intermediate truncate and extend operations. 1997 SextInReg.lowerFor({{S32}, {S64}}); 1998 } 1999 2000 SextInReg 2001 .scalarize(0) 2002 .clampScalar(0, S32, S64) 2003 .lower(); 2004 2005 getActionDefinitionsBuilder({G_ROTR, G_ROTL}) 2006 .scalarize(0) 2007 .lower(); 2008 2009 // TODO: Only Try to form v2s16 with legal packed instructions. 
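  // Only the 32-bit form of G_FSHR is legal: its v2s16 form is lowered as a
  // whole vector and anything else is scalarized and then lowered. G_FSHL has
  // no legal form and is always broken down by lower().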
2010 getActionDefinitionsBuilder(G_FSHR) 2011 .legalFor({{S32, S32}}) 2012 .lowerFor({{V2S16, V2S16}}) 2013 .clampMaxNumElementsStrict(0, S16, 2) 2014 .scalarize(0) 2015 .lower(); 2016 2017 if (ST.hasVOP3PInsts()) { 2018 getActionDefinitionsBuilder(G_FSHL) 2019 .lowerFor({{V2S16, V2S16}}) 2020 .clampMaxNumElementsStrict(0, S16, 2) 2021 .scalarize(0) 2022 .lower(); 2023 } else { 2024 getActionDefinitionsBuilder(G_FSHL) 2025 .scalarize(0) 2026 .lower(); 2027 } 2028 2029 getActionDefinitionsBuilder(G_READCYCLECOUNTER) 2030 .legalFor({S64}); 2031 2032 getActionDefinitionsBuilder(G_READSTEADYCOUNTER).legalFor({S64}); 2033 2034 getActionDefinitionsBuilder(G_FENCE) 2035 .alwaysLegal(); 2036 2037 getActionDefinitionsBuilder({G_SMULO, G_UMULO}) 2038 .scalarize(0) 2039 .minScalar(0, S32) 2040 .lower(); 2041 2042 getActionDefinitionsBuilder({G_SBFX, G_UBFX}) 2043 .legalFor({{S32, S32}, {S64, S32}}) 2044 .clampScalar(1, S32, S32) 2045 .clampScalar(0, S32, S64) 2046 .widenScalarToNextPow2(0) 2047 .scalarize(0); 2048 2049 getActionDefinitionsBuilder( 2050 {// TODO: Verify V_BFI_B32 is generated from expanded bit ops 2051 G_FCOPYSIGN, 2052 2053 G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB, 2054 G_READ_REGISTER, G_WRITE_REGISTER, 2055 2056 G_SADDO, G_SSUBO}) 2057 .lower(); 2058 2059 if (ST.hasIEEEMinMax()) { 2060 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}) 2061 .legalFor(FPTypesPK16) 2062 .clampMaxNumElements(0, S16, 2) 2063 .scalarize(0); 2064 } else { 2065 // TODO: Implement 2066 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower(); 2067 } 2068 2069 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET}) 2070 .lower(); 2071 2072 getActionDefinitionsBuilder({G_TRAP, G_DEBUGTRAP}).custom(); 2073 2074 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, 2075 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, 2076 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) 2077 .unsupported(); 2078 2079 getActionDefinitionsBuilder(G_PREFETCH).alwaysLegal(); 2080 2081 getLegacyLegalizerInfo().computeTables(); 2082 verify(*ST.getInstrInfo()); 2083 } 2084 2085 bool AMDGPULegalizerInfo::legalizeCustom( 2086 LegalizerHelper &Helper, MachineInstr &MI, 2087 LostDebugLocObserver &LocObserver) const { 2088 MachineIRBuilder &B = Helper.MIRBuilder; 2089 MachineRegisterInfo &MRI = *B.getMRI(); 2090 2091 switch (MI.getOpcode()) { 2092 case TargetOpcode::G_ADDRSPACE_CAST: 2093 return legalizeAddrSpaceCast(MI, MRI, B); 2094 case TargetOpcode::G_INTRINSIC_ROUNDEVEN: 2095 return legalizeFroundeven(MI, MRI, B); 2096 case TargetOpcode::G_FCEIL: 2097 return legalizeFceil(MI, MRI, B); 2098 case TargetOpcode::G_FREM: 2099 return legalizeFrem(MI, MRI, B); 2100 case TargetOpcode::G_INTRINSIC_TRUNC: 2101 return legalizeIntrinsicTrunc(MI, MRI, B); 2102 case TargetOpcode::G_SITOFP: 2103 return legalizeITOFP(MI, MRI, B, true); 2104 case TargetOpcode::G_UITOFP: 2105 return legalizeITOFP(MI, MRI, B, false); 2106 case TargetOpcode::G_FPTOSI: 2107 return legalizeFPTOI(MI, MRI, B, true); 2108 case TargetOpcode::G_FPTOUI: 2109 return legalizeFPTOI(MI, MRI, B, false); 2110 case TargetOpcode::G_FMINNUM: 2111 case TargetOpcode::G_FMAXNUM: 2112 case TargetOpcode::G_FMINNUM_IEEE: 2113 case TargetOpcode::G_FMAXNUM_IEEE: 2114 return legalizeMinNumMaxNum(Helper, MI); 2115 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 2116 return legalizeExtractVectorElt(MI, MRI, B); 2117 case TargetOpcode::G_INSERT_VECTOR_ELT: 2118 return legalizeInsertVectorElt(MI, MRI, B); 2119 case TargetOpcode::G_FSIN: 2120 case 
TargetOpcode::G_FCOS: 2121 return legalizeSinCos(MI, MRI, B); 2122 case TargetOpcode::G_GLOBAL_VALUE: 2123 return legalizeGlobalValue(MI, MRI, B); 2124 case TargetOpcode::G_LOAD: 2125 case TargetOpcode::G_SEXTLOAD: 2126 case TargetOpcode::G_ZEXTLOAD: 2127 return legalizeLoad(Helper, MI); 2128 case TargetOpcode::G_STORE: 2129 return legalizeStore(Helper, MI); 2130 case TargetOpcode::G_FMAD: 2131 return legalizeFMad(MI, MRI, B); 2132 case TargetOpcode::G_FDIV: 2133 return legalizeFDIV(MI, MRI, B); 2134 case TargetOpcode::G_FFREXP: 2135 return legalizeFFREXP(MI, MRI, B); 2136 case TargetOpcode::G_FSQRT: 2137 return legalizeFSQRT(MI, MRI, B); 2138 case TargetOpcode::G_UDIV: 2139 case TargetOpcode::G_UREM: 2140 case TargetOpcode::G_UDIVREM: 2141 return legalizeUnsignedDIV_REM(MI, MRI, B); 2142 case TargetOpcode::G_SDIV: 2143 case TargetOpcode::G_SREM: 2144 case TargetOpcode::G_SDIVREM: 2145 return legalizeSignedDIV_REM(MI, MRI, B); 2146 case TargetOpcode::G_ATOMIC_CMPXCHG: 2147 return legalizeAtomicCmpXChg(MI, MRI, B); 2148 case TargetOpcode::G_FLOG2: 2149 return legalizeFlog2(MI, B); 2150 case TargetOpcode::G_FLOG: 2151 case TargetOpcode::G_FLOG10: 2152 return legalizeFlogCommon(MI, B); 2153 case TargetOpcode::G_FEXP2: 2154 return legalizeFExp2(MI, B); 2155 case TargetOpcode::G_FEXP: 2156 case TargetOpcode::G_FEXP10: 2157 return legalizeFExp(MI, B); 2158 case TargetOpcode::G_FPOW: 2159 return legalizeFPow(MI, B); 2160 case TargetOpcode::G_FFLOOR: 2161 return legalizeFFloor(MI, MRI, B); 2162 case TargetOpcode::G_BUILD_VECTOR: 2163 case TargetOpcode::G_BUILD_VECTOR_TRUNC: 2164 return legalizeBuildVector(MI, MRI, B); 2165 case TargetOpcode::G_MUL: 2166 return legalizeMul(Helper, MI); 2167 case TargetOpcode::G_CTLZ: 2168 case TargetOpcode::G_CTTZ: 2169 return legalizeCTLZ_CTTZ(MI, MRI, B); 2170 case TargetOpcode::G_CTLZ_ZERO_UNDEF: 2171 return legalizeCTLZ_ZERO_UNDEF(MI, MRI, B); 2172 case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND: 2173 return legalizeFPTruncRound(MI, B); 2174 case TargetOpcode::G_STACKSAVE: 2175 return legalizeStackSave(MI, B); 2176 case TargetOpcode::G_GET_FPENV: 2177 return legalizeGetFPEnv(MI, MRI, B); 2178 case TargetOpcode::G_SET_FPENV: 2179 return legalizeSetFPEnv(MI, MRI, B); 2180 case TargetOpcode::G_TRAP: 2181 return legalizeTrap(MI, MRI, B); 2182 case TargetOpcode::G_DEBUGTRAP: 2183 return legalizeDebugTrap(MI, MRI, B); 2184 default: 2185 return false; 2186 } 2187 2188 llvm_unreachable("expected switch to return"); 2189 } 2190 2191 Register AMDGPULegalizerInfo::getSegmentAperture( 2192 unsigned AS, 2193 MachineRegisterInfo &MRI, 2194 MachineIRBuilder &B) const { 2195 MachineFunction &MF = B.getMF(); 2196 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 2197 const LLT S32 = LLT::scalar(32); 2198 const LLT S64 = LLT::scalar(64); 2199 2200 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 2201 2202 if (ST.hasApertureRegs()) { 2203 // Note: this register is somewhat broken. When used as a 32-bit operand, 2204 // it only returns zeroes. The real value is in the upper 32 bits. 2205 // Thus, we must emit extract the high 32 bits. 2206 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS) 2207 ? AMDGPU::SRC_SHARED_BASE 2208 : AMDGPU::SRC_PRIVATE_BASE; 2209 // FIXME: It would be more natural to emit a COPY here, but then copy 2210 // coalescing would kick in and it would think it's okay to use the "HI" 2211 // subregister (instead of extracting the HI 32 bits) which is an artificial 2212 // (unusable) register. 
2213 // Register TableGen definitions would need an overhaul to get rid of the 2214 // artificial "HI" aperture registers and prevent this kind of issue from 2215 // happening. 2216 Register Dst = MRI.createGenericVirtualRegister(S64); 2217 MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass); 2218 B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)}); 2219 return B.buildUnmerge(S32, Dst).getReg(1); 2220 } 2221 2222 // TODO: can we be smarter about machine pointer info? 2223 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 2224 Register LoadAddr = MRI.createGenericVirtualRegister( 2225 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 2226 // For code object version 5, private_base and shared_base are passed through 2227 // implicit kernargs. 2228 if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >= 2229 AMDGPU::AMDHSA_COV5) { 2230 AMDGPUTargetLowering::ImplicitParameter Param = 2231 AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE 2232 : AMDGPUTargetLowering::PRIVATE_BASE; 2233 uint64_t Offset = 2234 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param); 2235 2236 Register KernargPtrReg = MRI.createGenericVirtualRegister( 2237 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 2238 2239 if (!loadInputValue(KernargPtrReg, B, 2240 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 2241 return Register(); 2242 2243 MachineMemOperand *MMO = MF.getMachineMemOperand( 2244 PtrInfo, 2245 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2246 MachineMemOperand::MOInvariant, 2247 LLT::scalar(32), commonAlignment(Align(64), Offset)); 2248 2249 // Pointer address 2250 B.buildPtrAdd(LoadAddr, KernargPtrReg, 2251 B.buildConstant(LLT::scalar(64), Offset).getReg(0)); 2252 // Load address 2253 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 2254 } 2255 2256 Register QueuePtr = MRI.createGenericVirtualRegister( 2257 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 2258 2259 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 2260 return Register(); 2261 2262 // Offset into amd_queue_t for group_segment_aperture_base_hi / 2263 // private_segment_aperture_base_hi. 2264 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 2265 2266 MachineMemOperand *MMO = MF.getMachineMemOperand( 2267 PtrInfo, 2268 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2269 MachineMemOperand::MOInvariant, 2270 LLT::scalar(32), commonAlignment(Align(64), StructOffset)); 2271 2272 B.buildPtrAdd(LoadAddr, QueuePtr, 2273 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0)); 2274 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 2275 } 2276 2277 /// Return true if the value is a known valid address, such that a null check is 2278 /// not necessary. 
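/// Frame indexes, global values and block addresses are never null in their
/// address space, and a G_CONSTANT is non-null as long as its value differs
/// from the address space's null value; anything else is conservatively
/// treated as possibly null.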
2279 static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, 2280 const AMDGPUTargetMachine &TM, unsigned AddrSpace) { 2281 MachineInstr *Def = MRI.getVRegDef(Val); 2282 switch (Def->getOpcode()) { 2283 case AMDGPU::G_FRAME_INDEX: 2284 case AMDGPU::G_GLOBAL_VALUE: 2285 case AMDGPU::G_BLOCK_ADDR: 2286 return true; 2287 case AMDGPU::G_CONSTANT: { 2288 const ConstantInt *CI = Def->getOperand(1).getCImm(); 2289 return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace); 2290 } 2291 default: 2292 return false; 2293 } 2294 2295 return false; 2296 } 2297 2298 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 2299 MachineInstr &MI, MachineRegisterInfo &MRI, 2300 MachineIRBuilder &B) const { 2301 MachineFunction &MF = B.getMF(); 2302 2303 // MI can either be a G_ADDRSPACE_CAST or a 2304 // G_INTRINSIC @llvm.amdgcn.addrspacecast.nonnull 2305 assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST || 2306 (isa<GIntrinsic>(MI) && cast<GIntrinsic>(MI).getIntrinsicID() == 2307 Intrinsic::amdgcn_addrspacecast_nonnull)); 2308 2309 const LLT S32 = LLT::scalar(32); 2310 Register Dst = MI.getOperand(0).getReg(); 2311 Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg() 2312 : MI.getOperand(1).getReg(); 2313 LLT DstTy = MRI.getType(Dst); 2314 LLT SrcTy = MRI.getType(Src); 2315 unsigned DestAS = DstTy.getAddressSpace(); 2316 unsigned SrcAS = SrcTy.getAddressSpace(); 2317 2318 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 2319 // vector element. 2320 assert(!DstTy.isVector()); 2321 2322 const AMDGPUTargetMachine &TM 2323 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 2324 2325 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 2326 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 2327 return true; 2328 } 2329 2330 if (SrcAS == AMDGPUAS::FLAT_ADDRESS && 2331 (DestAS == AMDGPUAS::LOCAL_ADDRESS || 2332 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) { 2333 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for 2334 // G_ADDRSPACE_CAST we need to guess. 2335 if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) { 2336 // Extract low 32-bits of the pointer. 2337 B.buildExtract(Dst, Src, 0); 2338 MI.eraseFromParent(); 2339 return true; 2340 } 2341 2342 unsigned NullVal = TM.getNullPointerValue(DestAS); 2343 2344 auto SegmentNull = B.buildConstant(DstTy, NullVal); 2345 auto FlatNull = B.buildConstant(SrcTy, 0); 2346 2347 // Extract low 32-bits of the pointer. 2348 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 2349 2350 auto CmpRes = 2351 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 2352 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 2353 2354 MI.eraseFromParent(); 2355 return true; 2356 } 2357 2358 if (DestAS == AMDGPUAS::FLAT_ADDRESS && 2359 (SrcAS == AMDGPUAS::LOCAL_ADDRESS || 2360 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) { 2361 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 2362 if (!ApertureReg.isValid()) 2363 return false; 2364 2365 // Coerce the type of the low half of the result so we can use merge_values. 2366 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 2367 2368 // TODO: Should we allow mismatched types but matching sizes in merges to 2369 // avoid the ptrtoint? 2370 auto BuildPtr = B.buildMergeLikeInstr(DstTy, {SrcAsInt, ApertureReg}); 2371 2372 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for 2373 // G_ADDRSPACE_CAST we need to guess. 
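    // When the source is provably non-null (or the nonnull intrinsic form was
    // used), the aperture-based pointer built above can be used directly.
    // Otherwise a compare-and-select is emitted below so that a null segment
    // pointer still maps to the flat null value instead of the aperture base.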
2374 if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) { 2375 B.buildCopy(Dst, BuildPtr); 2376 MI.eraseFromParent(); 2377 return true; 2378 } 2379 2380 auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 2381 auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 2382 2383 auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, 2384 SegmentNull.getReg(0)); 2385 2386 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 2387 2388 MI.eraseFromParent(); 2389 return true; 2390 } 2391 2392 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT && 2393 SrcTy.getSizeInBits() == 64) { 2394 // Truncate. 2395 B.buildExtract(Dst, Src, 0); 2396 MI.eraseFromParent(); 2397 return true; 2398 } 2399 2400 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT && 2401 DstTy.getSizeInBits() == 64) { 2402 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 2403 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 2404 auto PtrLo = B.buildPtrToInt(S32, Src); 2405 auto HighAddr = B.buildConstant(S32, AddrHiVal); 2406 B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr}); 2407 MI.eraseFromParent(); 2408 return true; 2409 } 2410 2411 DiagnosticInfoUnsupported InvalidAddrSpaceCast( 2412 MF.getFunction(), "invalid addrspacecast", B.getDebugLoc()); 2413 2414 LLVMContext &Ctx = MF.getFunction().getContext(); 2415 Ctx.diagnose(InvalidAddrSpaceCast); 2416 B.buildUndef(Dst); 2417 MI.eraseFromParent(); 2418 return true; 2419 } 2420 2421 bool AMDGPULegalizerInfo::legalizeFroundeven(MachineInstr &MI, 2422 MachineRegisterInfo &MRI, 2423 MachineIRBuilder &B) const { 2424 Register Src = MI.getOperand(1).getReg(); 2425 LLT Ty = MRI.getType(Src); 2426 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 2427 2428 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 2429 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 2430 2431 auto C1 = B.buildFConstant(Ty, C1Val); 2432 auto CopySign = B.buildFCopysign(Ty, C1, Src); 2433 2434 // TODO: Should this propagate fast-math-flags? 2435 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 2436 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 2437 2438 auto C2 = B.buildFConstant(Ty, C2Val); 2439 auto Fabs = B.buildFAbs(Ty, Src); 2440 2441 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 2442 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 2443 MI.eraseFromParent(); 2444 return true; 2445 } 2446 2447 bool AMDGPULegalizerInfo::legalizeFceil( 2448 MachineInstr &MI, MachineRegisterInfo &MRI, 2449 MachineIRBuilder &B) const { 2450 2451 const LLT S1 = LLT::scalar(1); 2452 const LLT S64 = LLT::scalar(64); 2453 2454 Register Src = MI.getOperand(1).getReg(); 2455 assert(MRI.getType(Src) == S64); 2456 2457 // result = trunc(src) 2458 // if (src > 0.0 && src != result) 2459 // result += 1.0 2460 2461 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 2462 2463 const auto Zero = B.buildFConstant(S64, 0.0); 2464 const auto One = B.buildFConstant(S64, 1.0); 2465 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 2466 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 2467 auto And = B.buildAnd(S1, Lt0, NeTrunc); 2468 auto Add = B.buildSelect(S64, And, One, Zero); 2469 2470 // TODO: Should this propagate fast-math-flags? 
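  // Add is either 0.0 or 1.0 from the select above, so this final add
  // implements the conditional "result += 1.0" from the pseudocode.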
2471 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 2472 MI.eraseFromParent(); 2473 return true; 2474 } 2475 2476 bool AMDGPULegalizerInfo::legalizeFrem( 2477 MachineInstr &MI, MachineRegisterInfo &MRI, 2478 MachineIRBuilder &B) const { 2479 Register DstReg = MI.getOperand(0).getReg(); 2480 Register Src0Reg = MI.getOperand(1).getReg(); 2481 Register Src1Reg = MI.getOperand(2).getReg(); 2482 auto Flags = MI.getFlags(); 2483 LLT Ty = MRI.getType(DstReg); 2484 2485 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); 2486 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); 2487 auto Neg = B.buildFNeg(Ty, Trunc, Flags); 2488 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); 2489 MI.eraseFromParent(); 2490 return true; 2491 } 2492 2493 static MachineInstrBuilder extractF64Exponent(Register Hi, 2494 MachineIRBuilder &B) { 2495 const unsigned FractBits = 52; 2496 const unsigned ExpBits = 11; 2497 LLT S32 = LLT::scalar(32); 2498 2499 auto Const0 = B.buildConstant(S32, FractBits - 32); 2500 auto Const1 = B.buildConstant(S32, ExpBits); 2501 2502 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}) 2503 .addUse(Hi) 2504 .addUse(Const0.getReg(0)) 2505 .addUse(Const1.getReg(0)); 2506 2507 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 2508 } 2509 2510 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 2511 MachineInstr &MI, MachineRegisterInfo &MRI, 2512 MachineIRBuilder &B) const { 2513 const LLT S1 = LLT::scalar(1); 2514 const LLT S32 = LLT::scalar(32); 2515 const LLT S64 = LLT::scalar(64); 2516 2517 Register Src = MI.getOperand(1).getReg(); 2518 assert(MRI.getType(Src) == S64); 2519 2520 // TODO: Should this use extract since the low half is unused? 2521 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2522 Register Hi = Unmerge.getReg(1); 2523 2524 // Extract the upper half, since this is where we will find the sign and 2525 // exponent. 2526 auto Exp = extractF64Exponent(Hi, B); 2527 2528 const unsigned FractBits = 52; 2529 2530 // Extract the sign bit. 2531 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 2532 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 2533 2534 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 2535 2536 const auto Zero32 = B.buildConstant(S32, 0); 2537 2538 // Extend back to 64-bits. 2539 auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit}); 2540 2541 auto Shr = B.buildAShr(S64, FractMask, Exp); 2542 auto Not = B.buildNot(S64, Shr); 2543 auto Tmp0 = B.buildAnd(S64, Src, Not); 2544 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 2545 2546 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 2547 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 2548 2549 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 2550 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 2551 MI.eraseFromParent(); 2552 return true; 2553 } 2554 2555 bool AMDGPULegalizerInfo::legalizeITOFP( 2556 MachineInstr &MI, MachineRegisterInfo &MRI, 2557 MachineIRBuilder &B, bool Signed) const { 2558 2559 Register Dst = MI.getOperand(0).getReg(); 2560 Register Src = MI.getOperand(1).getReg(); 2561 2562 const LLT S64 = LLT::scalar(64); 2563 const LLT S32 = LLT::scalar(32); 2564 2565 assert(MRI.getType(Src) == S64); 2566 2567 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2568 auto ThirtyTwo = B.buildConstant(S32, 32); 2569 2570 if (MRI.getType(Dst) == S64) { 2571 auto CvtHi = Signed ? 
B.buildSITOFP(S64, Unmerge.getReg(1)) 2572 : B.buildUITOFP(S64, Unmerge.getReg(1)); 2573 2574 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 2575 auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo); 2576 2577 // TODO: Should this propagate fast-math-flags? 2578 B.buildFAdd(Dst, LdExp, CvtLo); 2579 MI.eraseFromParent(); 2580 return true; 2581 } 2582 2583 assert(MRI.getType(Dst) == S32); 2584 2585 auto One = B.buildConstant(S32, 1); 2586 2587 MachineInstrBuilder ShAmt; 2588 if (Signed) { 2589 auto ThirtyOne = B.buildConstant(S32, 31); 2590 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1)); 2591 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne); 2592 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign); 2593 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32}) 2594 .addUse(Unmerge.getReg(1)); 2595 auto LS2 = B.buildSub(S32, LS, One); 2596 ShAmt = B.buildUMin(S32, LS2, MaxShAmt); 2597 } else 2598 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1)); 2599 auto Norm = B.buildShl(S64, Src, ShAmt); 2600 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm); 2601 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0)); 2602 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust); 2603 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2); 2604 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt); 2605 B.buildFLdexp(Dst, FVal, Scale); 2606 MI.eraseFromParent(); 2607 return true; 2608 } 2609 2610 // TODO: Copied from DAG implementation. Verify logic and document how this 2611 // actually works. 2612 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI, 2613 MachineRegisterInfo &MRI, 2614 MachineIRBuilder &B, 2615 bool Signed) const { 2616 2617 Register Dst = MI.getOperand(0).getReg(); 2618 Register Src = MI.getOperand(1).getReg(); 2619 2620 const LLT S64 = LLT::scalar(64); 2621 const LLT S32 = LLT::scalar(32); 2622 2623 const LLT SrcLT = MRI.getType(Src); 2624 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64); 2625 2626 unsigned Flags = MI.getFlags(); 2627 2628 // The basic idea of converting a floating point number into a pair of 32-bit 2629 // integers is illustrated as follows: 2630 // 2631 // tf := trunc(val); 2632 // hif := floor(tf * 2^-32); 2633 // lof := tf - hif * 2^32; // lof is always positive due to floor. 2634 // hi := fptoi(hif); 2635 // lo := fptoi(lof); 2636 // 2637 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags); 2638 MachineInstrBuilder Sign; 2639 if (Signed && SrcLT == S32) { 2640 // However, a 32-bit floating point number has only 23 bits mantissa and 2641 // it's not enough to hold all the significant bits of `lof` if val is 2642 // negative. To avoid the loss of precision, We need to take the absolute 2643 // value after truncating and flip the result back based on the original 2644 // signedness. 
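    // The arithmetic shift below turns the sign bit into an all-zeros or
    // all-ones mask; the later "xor, then subtract" is the standard
    // mask-based conditional negation.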
    Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
    Trunc = B.buildFAbs(S32, Trunc, Flags);
  }
  MachineInstrBuilder K0, K1;
  if (SrcLT == S64) {
    K0 = B.buildFConstant(
        S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
    K1 = B.buildFConstant(
        S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
  } else {
    K0 = B.buildFConstant(
        S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
    K1 = B.buildFConstant(
        S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
  }

  auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
  auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
  auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);

  auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
                                     : B.buildFPTOUI(S32, FloorMul);
  auto Lo = B.buildFPTOUI(S32, Fma);

  if (Signed && SrcLT == S32) {
    // Flip the result based on the signedness, which is either all 0s or 1s.
    Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
    // r := xor({lo, hi}, sign) - sign;
    B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
               Sign);
  } else
    B.buildMergeLikeInstr(Dst, {Lo, Hi});
  MI.eraseFromParent();

  return true;
}

bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
                                               MachineInstr &MI) const {
  MachineFunction &MF = Helper.MIRBuilder.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
                        MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;

  // With ieee_mode disabled, the instructions have the correct behavior
  // already for G_FMINNUM/G_FMAXNUM.
  if (!MFI->getMode().IEEE)
    return !IsIEEEOp;

  if (IsIEEEOp)
    return true;

  return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
}

bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Dst));

  // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
  // but we can't go directly to that logic because you can't bitcast a vector
  // of pointers to a vector of integers. Therefore, introduce an intermediate
  // vector of integers using ptrtoint (and inttoptr on the output) in order to
  // drive the legalization forward.
  if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
    LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
    LLT IntVecTy = VecTy.changeElementType(IntTy);

    auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
    auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
    B.buildIntToPtr(Dst, IntElt);

    MI.eraseFromParent();
    return true;
  }

  // FIXME: Artifact combiner probably should have replaced the truncated
  // constant before this, so we shouldn't need
  // getIConstantVRegValWithLookThrough.
  std::optional<ValueAndVReg> MaybeIdxVal =
      getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
  if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
    return true;
  const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();

  if (IdxVal < VecTy.getNumElements()) {
    auto Unmerge = B.buildUnmerge(EltTy, Vec);
    B.buildCopy(Dst, Unmerge.getReg(IdxVal));
  } else {
    B.buildUndef(Dst);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();
  Register Ins = MI.getOperand(2).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Ins));

  // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
  // but we can't go directly to that logic because you can't bitcast a vector
  // of pointers to a vector of integers. Therefore, make the pointer vector
  // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd
  // new value, and then inttoptr the result vector back. This will then allow
  // the rest of legalization to take over.
  if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
    LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
    LLT IntVecTy = VecTy.changeElementType(IntTy);

    auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
    auto IntIns = B.buildPtrToInt(IntTy, Ins);
    auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
                                                 MI.getOperand(3));
    B.buildIntToPtr(Dst, IntVecDest);
    MI.eraseFromParent();
    return true;
  }

  // FIXME: Artifact combiner probably should have replaced the truncated
  // constant before this, so we shouldn't need
  // getIConstantVRegValWithLookThrough.
  std::optional<ValueAndVReg> MaybeIdxVal =
      getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
  if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2792 return true; 2793 2794 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue(); 2795 2796 unsigned NumElts = VecTy.getNumElements(); 2797 if (IdxVal < NumElts) { 2798 SmallVector<Register, 8> SrcRegs; 2799 for (unsigned i = 0; i < NumElts; ++i) 2800 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy)); 2801 B.buildUnmerge(SrcRegs, Vec); 2802 2803 SrcRegs[IdxVal] = MI.getOperand(2).getReg(); 2804 B.buildMergeLikeInstr(Dst, SrcRegs); 2805 } else { 2806 B.buildUndef(Dst); 2807 } 2808 2809 MI.eraseFromParent(); 2810 return true; 2811 } 2812 2813 bool AMDGPULegalizerInfo::legalizeSinCos( 2814 MachineInstr &MI, MachineRegisterInfo &MRI, 2815 MachineIRBuilder &B) const { 2816 2817 Register DstReg = MI.getOperand(0).getReg(); 2818 Register SrcReg = MI.getOperand(1).getReg(); 2819 LLT Ty = MRI.getType(DstReg); 2820 unsigned Flags = MI.getFlags(); 2821 2822 Register TrigVal; 2823 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2824 if (ST.hasTrigReducedRange()) { 2825 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2826 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}) 2827 .addUse(MulVal.getReg(0)) 2828 .setMIFlags(Flags) 2829 .getReg(0); 2830 } else 2831 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2832 2833 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 2834 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2835 B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg)) 2836 .addUse(TrigVal) 2837 .setMIFlags(Flags); 2838 MI.eraseFromParent(); 2839 return true; 2840 } 2841 2842 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2843 MachineIRBuilder &B, 2844 const GlobalValue *GV, 2845 int64_t Offset, 2846 unsigned GAFlags) const { 2847 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2848 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2849 // to the following code sequence: 2850 // 2851 // For constant address space: 2852 // s_getpc_b64 s[0:1] 2853 // s_add_u32 s0, s0, $symbol 2854 // s_addc_u32 s1, s1, 0 2855 // 2856 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2857 // a fixup or relocation is emitted to replace $symbol with a literal 2858 // constant, which is a pc-relative offset from the encoding of the $symbol 2859 // operand to the global variable. 2860 // 2861 // For global address space: 2862 // s_getpc_b64 s[0:1] 2863 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2864 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2865 // 2866 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2867 // fixups or relocations are emitted to replace $symbol@*@lo and 2868 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2869 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2870 // operand to the global variable. 2871 2872 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2873 2874 Register PCReg = PtrTy.getSizeInBits() != 32 ? 
DstReg : 2875 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2876 2877 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2878 .addDef(PCReg); 2879 2880 MIB.addGlobalAddress(GV, Offset, GAFlags); 2881 if (GAFlags == SIInstrInfo::MO_NONE) 2882 MIB.addImm(0); 2883 else 2884 MIB.addGlobalAddress(GV, Offset, GAFlags + 1); 2885 2886 if (!B.getMRI()->getRegClassOrNull(PCReg)) 2887 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2888 2889 if (PtrTy.getSizeInBits() == 32) 2890 B.buildExtract(DstReg, PCReg, 0); 2891 return true; 2892 } 2893 2894 // Emit a ABS32_LO / ABS32_HI relocation stub. 2895 void AMDGPULegalizerInfo::buildAbsGlobalAddress( 2896 Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, 2897 MachineRegisterInfo &MRI) const { 2898 bool RequiresHighHalf = PtrTy.getSizeInBits() != 32; 2899 2900 LLT S32 = LLT::scalar(32); 2901 2902 // Use the destination directly, if and only if we store the lower address 2903 // part only and we don't have a register class being set. 2904 Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg) 2905 ? DstReg 2906 : MRI.createGenericVirtualRegister(S32); 2907 2908 if (!MRI.getRegClassOrNull(AddrLo)) 2909 MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass); 2910 2911 // Write the lower half. 2912 B.buildInstr(AMDGPU::S_MOV_B32) 2913 .addDef(AddrLo) 2914 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO); 2915 2916 // If required, write the upper half as well. 2917 if (RequiresHighHalf) { 2918 assert(PtrTy.getSizeInBits() == 64 && 2919 "Must provide a 64-bit pointer type!"); 2920 2921 Register AddrHi = MRI.createGenericVirtualRegister(S32); 2922 MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass); 2923 2924 B.buildInstr(AMDGPU::S_MOV_B32) 2925 .addDef(AddrHi) 2926 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_HI); 2927 2928 // Use the destination directly, if and only if we don't have a register 2929 // class being set. 2930 Register AddrDst = !MRI.getRegClassOrNull(DstReg) 2931 ? DstReg 2932 : MRI.createGenericVirtualRegister(LLT::scalar(64)); 2933 2934 if (!MRI.getRegClassOrNull(AddrDst)) 2935 MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass); 2936 2937 B.buildMergeValues(AddrDst, {AddrLo, AddrHi}); 2938 2939 // If we created a new register for the destination, cast the result into 2940 // the final output. 2941 if (AddrDst != DstReg) 2942 B.buildCast(DstReg, AddrDst); 2943 } else if (AddrLo != DstReg) { 2944 // If we created a new register for the destination, cast the result into 2945 // the final output. 
2946 B.buildCast(DstReg, AddrLo); 2947 } 2948 } 2949 2950 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2951 MachineInstr &MI, MachineRegisterInfo &MRI, 2952 MachineIRBuilder &B) const { 2953 Register DstReg = MI.getOperand(0).getReg(); 2954 LLT Ty = MRI.getType(DstReg); 2955 unsigned AS = Ty.getAddressSpace(); 2956 2957 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2958 MachineFunction &MF = B.getMF(); 2959 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2960 2961 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2962 if (!MFI->isModuleEntryFunction() && 2963 GV->getName() != "llvm.amdgcn.module.lds") { 2964 const Function &Fn = MF.getFunction(); 2965 DiagnosticInfoUnsupported BadLDSDecl( 2966 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2967 DS_Warning); 2968 Fn.getContext().diagnose(BadLDSDecl); 2969 2970 // We currently don't have a way to correctly allocate LDS objects that 2971 // aren't directly associated with a kernel. We do force inlining of 2972 // functions that use local objects. However, if these dead functions are 2973 // not eliminated, we don't want a compile time error. Just emit a warning 2974 // and a trap, since there should be no callable path here. 2975 B.buildTrap(); 2976 B.buildUndef(DstReg); 2977 MI.eraseFromParent(); 2978 return true; 2979 } 2980 2981 // TODO: We could emit code to handle the initialization somewhere. 2982 // We ignore the initializer for now and legalize it to allow selection. 2983 // The initializer will anyway get errored out during assembly emission. 2984 const SITargetLowering *TLI = ST.getTargetLowering(); 2985 if (!TLI->shouldUseLDSConstAddress(GV)) { 2986 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); 2987 return true; // Leave in place; 2988 } 2989 2990 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) { 2991 Type *Ty = GV->getValueType(); 2992 // HIP uses an unsized array `extern __shared__ T s[]` or similar 2993 // zero-sized type in other languages to declare the dynamic shared 2994 // memory which size is not known at the compile time. They will be 2995 // allocated by the runtime and placed directly after the static 2996 // allocated ones. They all share the same offset. 2997 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) { 2998 // Adjust alignment for that dynamic shared memory array. 2999 MFI->setDynLDSAlign(MF.getFunction(), *cast<GlobalVariable>(GV)); 3000 LLT S32 = LLT::scalar(32); 3001 auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}); 3002 B.buildIntToPtr(DstReg, Sz); 3003 MI.eraseFromParent(); 3004 return true; 3005 } 3006 } 3007 3008 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), 3009 *cast<GlobalVariable>(GV))); 3010 MI.eraseFromParent(); 3011 return true; 3012 } 3013 3014 if (ST.isAmdPalOS() || ST.isMesa3DOS()) { 3015 buildAbsGlobalAddress(DstReg, Ty, B, GV, MRI); 3016 MI.eraseFromParent(); 3017 return true; 3018 } 3019 3020 const SITargetLowering *TLI = ST.getTargetLowering(); 3021 3022 if (TLI->shouldEmitFixup(GV)) { 3023 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 3024 MI.eraseFromParent(); 3025 return true; 3026 } 3027 3028 if (TLI->shouldEmitPCReloc(GV)) { 3029 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 3030 MI.eraseFromParent(); 3031 return true; 3032 } 3033 3034 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 3035 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 3036 3037 LLT LoadTy = Ty.getSizeInBits() == 32 ? 
PtrTy : Ty; 3038 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 3039 MachinePointerInfo::getGOT(MF), 3040 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 3041 MachineMemOperand::MOInvariant, 3042 LoadTy, Align(8)); 3043 3044 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 3045 3046 if (Ty.getSizeInBits() == 32) { 3047 // Truncate if this is a 32-bit constant address. 3048 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 3049 B.buildExtract(DstReg, Load, 0); 3050 } else 3051 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 3052 3053 MI.eraseFromParent(); 3054 return true; 3055 } 3056 3057 static LLT widenToNextPowerOf2(LLT Ty) { 3058 if (Ty.isVector()) 3059 return Ty.changeElementCount( 3060 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements()))); 3061 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); 3062 } 3063 3064 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, 3065 MachineInstr &MI) const { 3066 MachineIRBuilder &B = Helper.MIRBuilder; 3067 MachineRegisterInfo &MRI = *B.getMRI(); 3068 GISelChangeObserver &Observer = Helper.Observer; 3069 3070 Register PtrReg = MI.getOperand(1).getReg(); 3071 LLT PtrTy = MRI.getType(PtrReg); 3072 unsigned AddrSpace = PtrTy.getAddressSpace(); 3073 3074 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 3075 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 3076 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); 3077 Observer.changingInstr(MI); 3078 MI.getOperand(1).setReg(Cast.getReg(0)); 3079 Observer.changedInstr(MI); 3080 return true; 3081 } 3082 3083 if (MI.getOpcode() != AMDGPU::G_LOAD) 3084 return false; 3085 3086 Register ValReg = MI.getOperand(0).getReg(); 3087 LLT ValTy = MRI.getType(ValReg); 3088 3089 if (hasBufferRsrcWorkaround(ValTy)) { 3090 Observer.changingInstr(MI); 3091 castBufferRsrcFromV4I32(MI, B, MRI, 0); 3092 Observer.changedInstr(MI); 3093 return true; 3094 } 3095 3096 MachineMemOperand *MMO = *MI.memoperands_begin(); 3097 const unsigned ValSize = ValTy.getSizeInBits(); 3098 const LLT MemTy = MMO->getMemoryType(); 3099 const Align MemAlign = MMO->getAlign(); 3100 const unsigned MemSize = MemTy.getSizeInBits(); 3101 const uint64_t AlignInBits = 8 * MemAlign.value(); 3102 3103 // Widen non-power-of-2 loads to the alignment if needed 3104 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) { 3105 const unsigned WideMemSize = PowerOf2Ceil(MemSize); 3106 3107 // This was already the correct extending load result type, so just adjust 3108 // the memory type. 3109 if (WideMemSize == ValSize) { 3110 MachineFunction &MF = B.getMF(); 3111 3112 MachineMemOperand *WideMMO = 3113 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); 3114 Observer.changingInstr(MI); 3115 MI.setMemRefs(MF, {WideMMO}); 3116 Observer.changedInstr(MI); 3117 return true; 3118 } 3119 3120 // Don't bother handling edge case that should probably never be produced. 3121 if (ValSize > WideMemSize) 3122 return false; 3123 3124 LLT WideTy = widenToNextPowerOf2(ValTy); 3125 3126 Register WideLoad; 3127 if (!WideTy.isVector()) { 3128 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 3129 B.buildTrunc(ValReg, WideLoad).getReg(0); 3130 } else { 3131 // Extract the subvector. 3132 3133 if (isRegisterType(ValTy)) { 3134 // If this a case where G_EXTRACT is legal, use it. 3135 // (e.g. 
<3 x s32> -> <4 x s32>) 3136 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 3137 B.buildExtract(ValReg, WideLoad, 0); 3138 } else { 3139 // For cases where the widened type isn't a nice register value, unmerge 3140 // from a widened register (e.g. <3 x s16> -> <4 x s16>) 3141 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 3142 B.buildDeleteTrailingVectorElements(ValReg, WideLoad); 3143 } 3144 } 3145 3146 MI.eraseFromParent(); 3147 return true; 3148 } 3149 3150 return false; 3151 } 3152 3153 bool AMDGPULegalizerInfo::legalizeStore(LegalizerHelper &Helper, 3154 MachineInstr &MI) const { 3155 MachineIRBuilder &B = Helper.MIRBuilder; 3156 MachineRegisterInfo &MRI = *B.getMRI(); 3157 GISelChangeObserver &Observer = Helper.Observer; 3158 3159 Register DataReg = MI.getOperand(0).getReg(); 3160 LLT DataTy = MRI.getType(DataReg); 3161 3162 if (hasBufferRsrcWorkaround(DataTy)) { 3163 Observer.changingInstr(MI); 3164 castBufferRsrcArgToV4I32(MI, B, 0); 3165 Observer.changedInstr(MI); 3166 return true; 3167 } 3168 return false; 3169 } 3170 3171 bool AMDGPULegalizerInfo::legalizeFMad( 3172 MachineInstr &MI, MachineRegisterInfo &MRI, 3173 MachineIRBuilder &B) const { 3174 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3175 assert(Ty.isScalar()); 3176 3177 MachineFunction &MF = B.getMF(); 3178 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 3179 3180 // TODO: Always legal with future ftz flag. 3181 // FIXME: Do we need just output? 3182 if (Ty == LLT::float32() && 3183 MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign()) 3184 return true; 3185 if (Ty == LLT::float16() && 3186 MFI->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign()) 3187 return true; 3188 3189 MachineIRBuilder HelperBuilder(MI); 3190 GISelObserverWrapper DummyObserver; 3191 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 3192 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 3193 } 3194 3195 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 3196 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 3197 Register DstReg = MI.getOperand(0).getReg(); 3198 Register PtrReg = MI.getOperand(1).getReg(); 3199 Register CmpVal = MI.getOperand(2).getReg(); 3200 Register NewVal = MI.getOperand(3).getReg(); 3201 3202 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 3203 "this should not have been custom lowered"); 3204 3205 LLT ValTy = MRI.getType(CmpVal); 3206 LLT VecTy = LLT::fixed_vector(2, ValTy); 3207 3208 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 3209 3210 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 3211 .addDef(DstReg) 3212 .addUse(PtrReg) 3213 .addUse(PackedVal) 3214 .setMemRefs(MI.memoperands()); 3215 3216 MI.eraseFromParent(); 3217 return true; 3218 } 3219 3220 /// Return true if it's known that \p Src can never be an f32 denormal value. 
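/// Currently this recognizes the mantissa returned by llvm.amdgcn.frexp.mant,
/// the fraction result of G_FFREXP, and a G_FPEXT from f16, since any f16
/// input widens to a value outside the f32 denormal range.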
static bool valueIsKnownNeverF32Denorm(const MachineRegisterInfo &MRI,
                                       Register Src) {
  const MachineInstr *DefMI = MRI.getVRegDef(Src);
  switch (DefMI->getOpcode()) {
  case TargetOpcode::G_INTRINSIC: {
    switch (cast<GIntrinsic>(DefMI)->getIntrinsicID()) {
    case Intrinsic::amdgcn_frexp_mant:
      return true;
    default:
      break;
    }

    break;
  }
  case TargetOpcode::G_FFREXP: {
    if (DefMI->getOperand(0).getReg() == Src)
      return true;
    break;
  }
  case TargetOpcode::G_FPEXT: {
    return MRI.getType(DefMI->getOperand(1).getReg()) == LLT::scalar(16);
  }
  default:
    return false;
  }

  return false;
}

static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
  if (Flags & MachineInstr::FmAfn)
    return true;
  const auto &Options = MF.getTarget().Options;
  return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
}

static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src,
                                   unsigned Flags) {
  return !valueIsKnownNeverF32Denorm(MF.getRegInfo(), Src) &&
         MF.getDenormalMode(APFloat::IEEEsingle()).Input !=
             DenormalMode::PreserveSign;
}

std::pair<Register, Register>
AMDGPULegalizerInfo::getScaledLogInput(MachineIRBuilder &B, Register Src,
                                       unsigned Flags) const {
  if (!needsDenormHandlingF32(B.getMF(), Src, Flags))
    return {};

  const LLT F32 = LLT::scalar(32);
  auto SmallestNormal = B.buildFConstant(
      F32, APFloat::getSmallestNormalized(APFloat::IEEEsingle()));
  auto IsLtSmallestNormal =
      B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, SmallestNormal);

  auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
  auto One = B.buildFConstant(F32, 1.0);
  auto ScaleFactor =
      B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
  auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);

  return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
}

bool AMDGPULegalizerInfo::legalizeFlog2(MachineInstr &MI,
                                        MachineIRBuilder &B) const {
  // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
  // If we have to handle denormals, scale up the input and adjust the result.

  // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
  // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  LLT Ty = B.getMRI()->getType(Dst);
  unsigned Flags = MI.getFlags();

  if (Ty == LLT::scalar(16)) {
    const LLT F32 = LLT::scalar(32);
    // Nothing in half is a denormal when promoted to f32.
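    // Promote to f32, use the f32 intrinsic directly, and truncate the result
    // back to f16; no denormal scaling is needed on this path.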
3301 auto Ext = B.buildFPExt(F32, Src, Flags); 3302 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32}) 3303 .addUse(Ext.getReg(0)) 3304 .setMIFlags(Flags); 3305 B.buildFPTrunc(Dst, Log2, Flags); 3306 MI.eraseFromParent(); 3307 return true; 3308 } 3309 3310 assert(Ty == LLT::scalar(32)); 3311 3312 auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags); 3313 if (!ScaledInput) { 3314 B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)}) 3315 .addUse(Src) 3316 .setMIFlags(Flags); 3317 MI.eraseFromParent(); 3318 return true; 3319 } 3320 3321 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}) 3322 .addUse(ScaledInput) 3323 .setMIFlags(Flags); 3324 3325 auto ThirtyTwo = B.buildFConstant(Ty, 32.0); 3326 auto Zero = B.buildFConstant(Ty, 0.0); 3327 auto ResultOffset = 3328 B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags); 3329 B.buildFSub(Dst, Log2, ResultOffset, Flags); 3330 3331 MI.eraseFromParent(); 3332 return true; 3333 } 3334 3335 static Register getMad(MachineIRBuilder &B, LLT Ty, Register X, Register Y, 3336 Register Z, unsigned Flags) { 3337 auto FMul = B.buildFMul(Ty, X, Y, Flags); 3338 return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0); 3339 } 3340 3341 bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI, 3342 MachineIRBuilder &B) const { 3343 const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10; 3344 assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG); 3345 3346 MachineRegisterInfo &MRI = *B.getMRI(); 3347 Register Dst = MI.getOperand(0).getReg(); 3348 Register X = MI.getOperand(1).getReg(); 3349 unsigned Flags = MI.getFlags(); 3350 const LLT Ty = MRI.getType(X); 3351 MachineFunction &MF = B.getMF(); 3352 3353 const LLT F32 = LLT::scalar(32); 3354 const LLT F16 = LLT::scalar(16); 3355 3356 const AMDGPUTargetMachine &TM = 3357 static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 3358 3359 if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn) || 3360 TM.Options.ApproxFuncFPMath || TM.Options.UnsafeFPMath) { 3361 if (Ty == F16 && !ST.has16BitInsts()) { 3362 Register LogVal = MRI.createGenericVirtualRegister(F32); 3363 auto PromoteSrc = B.buildFPExt(F32, X); 3364 legalizeFlogUnsafe(B, LogVal, PromoteSrc.getReg(0), IsLog10, Flags); 3365 B.buildFPTrunc(Dst, LogVal); 3366 } else { 3367 legalizeFlogUnsafe(B, Dst, X, IsLog10, Flags); 3368 } 3369 3370 MI.eraseFromParent(); 3371 return true; 3372 } 3373 3374 auto [ScaledInput, IsScaled] = getScaledLogInput(B, X, Flags); 3375 if (ScaledInput) 3376 X = ScaledInput; 3377 3378 auto Y = 3379 B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags); 3380 3381 Register R; 3382 if (ST.hasFastFMAF32()) { 3383 // c+cc are ln(2)/ln(10) to more than 49 bits 3384 const float c_log10 = 0x1.344134p-2f; 3385 const float cc_log10 = 0x1.09f79ep-26f; 3386 3387 // c + cc is ln(2) to more than 49 bits 3388 const float c_log = 0x1.62e42ep-1f; 3389 const float cc_log = 0x1.efa39ep-25f; 3390 3391 auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log); 3392 auto CC = B.buildFConstant(Ty, IsLog10 ? 
cc_log10 : cc_log); 3393 3394 R = B.buildFMul(Ty, Y, C, Flags).getReg(0); 3395 auto NegR = B.buildFNeg(Ty, R, Flags); 3396 auto FMA0 = B.buildFMA(Ty, Y, C, NegR, Flags); 3397 auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, Flags); 3398 R = B.buildFAdd(Ty, R, FMA1, Flags).getReg(0); 3399 } else { 3400 // ch+ct is ln(2)/ln(10) to more than 36 bits 3401 const float ch_log10 = 0x1.344000p-2f; 3402 const float ct_log10 = 0x1.3509f6p-18f; 3403 3404 // ch + ct is ln(2) to more than 36 bits 3405 const float ch_log = 0x1.62e000p-1f; 3406 const float ct_log = 0x1.0bfbe8p-15f; 3407 3408 auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log); 3409 auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log); 3410 3411 auto MaskConst = B.buildConstant(Ty, 0xfffff000); 3412 auto YH = B.buildAnd(Ty, Y, MaskConst); 3413 auto YT = B.buildFSub(Ty, Y, YH, Flags); 3414 auto YTCT = B.buildFMul(Ty, YT, CT, Flags); 3415 3416 Register Mad0 = 3417 getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), Flags); 3418 Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, Flags); 3419 R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, Flags); 3420 } 3421 3422 const bool IsFiniteOnly = 3423 (MI.getFlag(MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath) && 3424 (MI.getFlag(MachineInstr::FmNoInfs) || TM.Options.NoInfsFPMath); 3425 3426 if (!IsFiniteOnly) { 3427 // Expand isfinite(x) => fabs(x) < inf 3428 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle())); 3429 auto Fabs = B.buildFAbs(Ty, Y); 3430 auto IsFinite = 3431 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags); 3432 R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0); 3433 } 3434 3435 if (ScaledInput) { 3436 auto Zero = B.buildFConstant(Ty, 0.0); 3437 auto ShiftK = 3438 B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f); 3439 auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags); 3440 B.buildFSub(Dst, R, Shift, Flags); 3441 } else { 3442 B.buildCopy(Dst, R); 3443 } 3444 3445 MI.eraseFromParent(); 3446 return true; 3447 } 3448 3449 bool AMDGPULegalizerInfo::legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst, 3450 Register Src, bool IsLog10, 3451 unsigned Flags) const { 3452 const double Log2BaseInverted = 3453 IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2; 3454 3455 LLT Ty = B.getMRI()->getType(Dst); 3456 3457 if (Ty == LLT::scalar(32)) { 3458 auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags); 3459 if (ScaledInput) { 3460 auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}) 3461 .addUse(Src) 3462 .setMIFlags(Flags); 3463 auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted); 3464 auto Zero = B.buildFConstant(Ty, 0.0); 3465 auto ResultOffset = 3466 B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags); 3467 auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted); 3468 3469 if (ST.hasFastFMAF32()) 3470 B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags); 3471 else { 3472 auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags); 3473 B.buildFAdd(Dst, Mul, ResultOffset, Flags); 3474 } 3475 3476 return true; 3477 } 3478 } 3479 3480 auto Log2Operand = Ty == LLT::scalar(16) 3481 ? 
B.buildFLog2(Ty, Src, Flags) 3482 : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}) 3483 .addUse(Src) 3484 .setMIFlags(Flags); 3485 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 3486 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 3487 return true; 3488 } 3489 3490 bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI, 3491 MachineIRBuilder &B) const { 3492 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals. 3493 // If we have to handle denormals, scale up the input and adjust the result. 3494 3495 Register Dst = MI.getOperand(0).getReg(); 3496 Register Src = MI.getOperand(1).getReg(); 3497 unsigned Flags = MI.getFlags(); 3498 LLT Ty = B.getMRI()->getType(Dst); 3499 const LLT F16 = LLT::scalar(16); 3500 const LLT F32 = LLT::scalar(32); 3501 3502 if (Ty == F16) { 3503 // Nothing in half is a denormal when promoted to f32. 3504 auto Ext = B.buildFPExt(F32, Src, Flags); 3505 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32}) 3506 .addUse(Ext.getReg(0)) 3507 .setMIFlags(Flags); 3508 B.buildFPTrunc(Dst, Log2, Flags); 3509 MI.eraseFromParent(); 3510 return true; 3511 } 3512 3513 assert(Ty == F32); 3514 3515 if (!needsDenormHandlingF32(B.getMF(), Src, Flags)) { 3516 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst}) 3517 .addUse(Src) 3518 .setMIFlags(Flags); 3519 MI.eraseFromParent(); 3520 return true; 3521 } 3522 3523 // bool needs_scaling = x < -0x1.f80000p+6f; 3524 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f); 3525 3526 // -nextafter(128.0, -1) 3527 auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f); 3528 auto NeedsScaling = B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, 3529 RangeCheckConst, Flags); 3530 3531 auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f); 3532 auto Zero = B.buildFConstant(Ty, 0.0); 3533 auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags); 3534 auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags); 3535 3536 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty}) 3537 .addUse(AddInput.getReg(0)) 3538 .setMIFlags(Flags); 3539 3540 auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f); 3541 auto One = B.buildFConstant(Ty, 1.0); 3542 auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags); 3543 B.buildFMul(Dst, Exp2, ResultScale, Flags); 3544 MI.eraseFromParent(); 3545 return true; 3546 } 3547 3548 bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst, 3549 Register X, unsigned Flags) const { 3550 LLT Ty = B.getMRI()->getType(Dst); 3551 LLT F32 = LLT::scalar(32); 3552 3553 if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) { 3554 auto Log2E = B.buildFConstant(Ty, numbers::log2e); 3555 auto Mul = B.buildFMul(Ty, X, Log2E, Flags); 3556 3557 if (Ty == F32) { 3558 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst}) 3559 .addUse(Mul.getReg(0)) 3560 .setMIFlags(Flags); 3561 } else { 3562 B.buildFExp2(Dst, Mul.getReg(0), Flags); 3563 } 3564 3565 return true; 3566 } 3567 3568 auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f); 3569 auto NeedsScaling = 3570 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold, Flags); 3571 auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f); 3572 auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags); 3573 auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags); 3574 3575 auto Log2E = B.buildFConstant(Ty, numbers::log2e); 3576 auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags); 3577 3578 auto Exp2 = 
B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty}) 3579 .addUse(ExpInput.getReg(0)) 3580 .setMIFlags(Flags); 3581 3582 auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f); 3583 auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags); 3584 B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags); 3585 return true; 3586 } 3587 3588 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 3589 MachineIRBuilder &B) const { 3590 Register Dst = MI.getOperand(0).getReg(); 3591 Register X = MI.getOperand(1).getReg(); 3592 const unsigned Flags = MI.getFlags(); 3593 MachineFunction &MF = B.getMF(); 3594 MachineRegisterInfo &MRI = *B.getMRI(); 3595 LLT Ty = MRI.getType(Dst); 3596 const LLT F16 = LLT::scalar(16); 3597 const LLT F32 = LLT::scalar(32); 3598 const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10; 3599 3600 if (Ty == F16) { 3601 // v_exp_f16 (fmul x, log2e) 3602 if (allowApproxFunc(MF, Flags)) { 3603 // TODO: Does this really require fast? 3604 legalizeFExpUnsafe(B, Dst, X, Flags); 3605 MI.eraseFromParent(); 3606 return true; 3607 } 3608 3609 // exp(f16 x) -> 3610 // fptrunc (v_exp_f32 (fmul (fpext x), log2e)) 3611 3612 // Nothing in half is a denormal when promoted to f32. 3613 auto Ext = B.buildFPExt(F32, X, Flags); 3614 Register Lowered = MRI.createGenericVirtualRegister(F32); 3615 legalizeFExpUnsafe(B, Lowered, Ext.getReg(0), Flags); 3616 B.buildFPTrunc(Dst, Lowered, Flags); 3617 MI.eraseFromParent(); 3618 return true; 3619 } 3620 3621 assert(Ty == F32); 3622 3623 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying 3624 // library behavior. Also, is known-not-daz source sufficient? 3625 if (allowApproxFunc(MF, Flags)) { 3626 legalizeFExpUnsafe(B, Dst, X, Flags); 3627 MI.eraseFromParent(); 3628 return true; 3629 } 3630 3631 // Algorithm: 3632 // 3633 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64) 3634 // 3635 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer 3636 // n = 64*m + j, 0 <= j < 64 3637 // 3638 // e^x = 2^((64*m + j + f)/64) 3639 // = (2^m) * (2^(j/64)) * 2^(f/64) 3640 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64)) 3641 // 3642 // f = x*(64/ln(2)) - n 3643 // r = f*(ln(2)/64) = x - n*(ln(2)/64) 3644 // 3645 // e^x = (2^m) * (2^(j/64)) * e^r 3646 // 3647 // (2^(j/64)) is precomputed 3648 // 3649 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5! 3650 // e^r = 1 + q 3651 // 3652 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5! 3653 // 3654 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) ) 3655 const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract; 3656 Register PH, PL; 3657 3658 if (ST.hasFastFMAF32()) { 3659 const float c_exp = numbers::log2ef; 3660 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits 3661 const float c_exp10 = 0x1.a934f0p+1f; 3662 const float cc_exp10 = 0x1.2f346ep-24f; 3663 3664 auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp); 3665 PH = B.buildFMul(Ty, X, C, Flags).getReg(0); 3666 auto NegPH = B.buildFNeg(Ty, PH, Flags); 3667 auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags); 3668 3669 auto CC = B.buildFConstant(Ty, IsExp10 ? 
cc_exp10 : cc_exp); 3670 PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0); 3671 } else { 3672 const float ch_exp = 0x1.714000p+0f; 3673 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits 3674 3675 const float ch_exp10 = 0x1.a92000p+1f; 3676 const float cl_exp10 = 0x1.4f0978p-11f; 3677 3678 auto MaskConst = B.buildConstant(Ty, 0xfffff000); 3679 auto XH = B.buildAnd(Ty, X, MaskConst); 3680 auto XL = B.buildFSub(Ty, X, XH, Flags); 3681 3682 auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp); 3683 PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0); 3684 3685 auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp); 3686 auto XLCL = B.buildFMul(Ty, XL, CL, Flags); 3687 3688 Register Mad0 = 3689 getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags); 3690 PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags); 3691 } 3692 3693 auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags); 3694 3695 // It is unsafe to contract this fsub into the PH multiply. 3696 auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract); 3697 auto A = B.buildFAdd(Ty, PHSubE, PL, Flags); 3698 auto IntE = B.buildFPTOSI(LLT::scalar(32), E); 3699 3700 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty}) 3701 .addUse(A.getReg(0)) 3702 .setMIFlags(Flags); 3703 auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags); 3704 3705 auto UnderflowCheckConst = 3706 B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f); 3707 auto Zero = B.buildFConstant(Ty, 0.0); 3708 auto Underflow = 3709 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, UnderflowCheckConst); 3710 3711 R = B.buildSelect(Ty, Underflow, Zero, R); 3712 3713 const auto &Options = MF.getTarget().Options; 3714 3715 if (!(Flags & MachineInstr::FmNoInfs) && !Options.NoInfsFPMath) { 3716 auto OverflowCheckConst = 3717 B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f); 3718 3719 auto Overflow = 3720 B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), X, OverflowCheckConst); 3721 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle())); 3722 R = B.buildSelect(Ty, Overflow, Inf, R, Flags); 3723 } 3724 3725 B.buildCopy(Dst, R); 3726 MI.eraseFromParent(); 3727 return true; 3728 } 3729 3730 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 3731 MachineIRBuilder &B) const { 3732 Register Dst = MI.getOperand(0).getReg(); 3733 Register Src0 = MI.getOperand(1).getReg(); 3734 Register Src1 = MI.getOperand(2).getReg(); 3735 unsigned Flags = MI.getFlags(); 3736 LLT Ty = B.getMRI()->getType(Dst); 3737 const LLT F16 = LLT::float16(); 3738 const LLT F32 = LLT::float32(); 3739 3740 if (Ty == F32) { 3741 auto Log = B.buildFLog2(F32, Src0, Flags); 3742 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32}) 3743 .addUse(Log.getReg(0)) 3744 .addUse(Src1) 3745 .setMIFlags(Flags); 3746 B.buildFExp2(Dst, Mul, Flags); 3747 } else if (Ty == F16) { 3748 // There's no f16 fmul_legacy, so we need to convert for it. 3749 auto Log = B.buildFLog2(F16, Src0, Flags); 3750 auto Ext0 = B.buildFPExt(F32, Log, Flags); 3751 auto Ext1 = B.buildFPExt(F32, Src1, Flags); 3752 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32}) 3753 .addUse(Ext0.getReg(0)) 3754 .addUse(Ext1.getReg(0)) 3755 .setMIFlags(Flags); 3756 B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags); 3757 } else 3758 return false; 3759 3760 MI.eraseFromParent(); 3761 return true; 3762 } 3763 3764 // Find a source register, ignoring any possible source modifiers. 
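// Looks through a G_FNEG, a G_FABS, or a G_FNEG of a G_FABS and returns the
// value underneath, so that source-modifier folding still sees the original
// pattern.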
3765 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 3766 Register ModSrc = OrigSrc; 3767 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 3768 ModSrc = SrcFNeg->getOperand(1).getReg(); 3769 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 3770 ModSrc = SrcFAbs->getOperand(1).getReg(); 3771 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 3772 ModSrc = SrcFAbs->getOperand(1).getReg(); 3773 return ModSrc; 3774 } 3775 3776 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 3777 MachineRegisterInfo &MRI, 3778 MachineIRBuilder &B) const { 3779 3780 const LLT S1 = LLT::scalar(1); 3781 const LLT F64 = LLT::float64(); 3782 Register Dst = MI.getOperand(0).getReg(); 3783 Register OrigSrc = MI.getOperand(1).getReg(); 3784 unsigned Flags = MI.getFlags(); 3785 assert(ST.hasFractBug() && MRI.getType(Dst) == F64 && 3786 "this should not have been custom lowered"); 3787 3788 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 3789 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 3790 // efficient way to implement it is using V_FRACT_F64. The workaround for the 3791 // V_FRACT bug is: 3792 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 3793 // 3794 // Convert floor(x) to (x - fract(x)) 3795 3796 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64}) 3797 .addUse(OrigSrc) 3798 .setMIFlags(Flags); 3799 3800 // Give source modifier matching some assistance before obscuring a foldable 3801 // pattern. 3802 3803 // TODO: We can avoid the neg on the fract? The input sign to fract 3804 // shouldn't matter? 3805 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 3806 3807 auto Const = 3808 B.buildFConstant(F64, llvm::bit_cast<double>(0x3fefffffffffffff)); 3809 3810 Register Min = MRI.createGenericVirtualRegister(F64); 3811 3812 // We don't need to concern ourselves with the snan handling difference, so 3813 // use the one which will directly select. 3814 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3815 if (MFI->getMode().IEEE) 3816 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 3817 else 3818 B.buildFMinNum(Min, Fract, Const, Flags); 3819 3820 Register CorrectedFract = Min; 3821 if (!MI.getFlag(MachineInstr::FmNoNans)) { 3822 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 3823 CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0); 3824 } 3825 3826 auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags); 3827 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 3828 3829 MI.eraseFromParent(); 3830 return true; 3831 } 3832 3833 // Turn an illegal packed v2s16 build vector into bit operations. 3834 // TODO: This should probably be a bitcast action in LegalizerHelper. 
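//
// The lowering packs the two 16-bit elements into a 32-bit scalar and casts
// back, roughly:
//   %merge:_(s32) = G_MERGE_VALUES %lo:_(s16), %hi:_(s16)
//   %dst:_(<2 x s16>) = G_BITCAST %merge:_(s32)
// For G_BUILD_VECTOR_TRUNC the s32 sources are truncated to s16 first.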
3835 bool AMDGPULegalizerInfo::legalizeBuildVector( 3836 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 3837 Register Dst = MI.getOperand(0).getReg(); 3838 const LLT S32 = LLT::scalar(32); 3839 const LLT S16 = LLT::scalar(16); 3840 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16)); 3841 3842 Register Src0 = MI.getOperand(1).getReg(); 3843 Register Src1 = MI.getOperand(2).getReg(); 3844 3845 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) { 3846 assert(MRI.getType(Src0) == S32); 3847 Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0); 3848 Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0); 3849 } 3850 3851 auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1}); 3852 B.buildBitcast(Dst, Merge); 3853 3854 MI.eraseFromParent(); 3855 return true; 3856 } 3857 3858 // Build a big integer multiply or multiply-add using MAD_64_32 instructions. 3859 // 3860 // Source and accumulation registers must all be 32-bits. 3861 // 3862 // TODO: When the multiply is uniform, we should produce a code sequence 3863 // that is better suited to instruction selection on the SALU. Instead of 3864 // the outer loop going over parts of the result, the outer loop should go 3865 // over parts of one of the factors. This should result in instruction 3866 // selection that makes full use of S_ADDC_U32 instructions. 3867 void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper, 3868 MutableArrayRef<Register> Accum, 3869 ArrayRef<Register> Src0, 3870 ArrayRef<Register> Src1, 3871 bool UsePartialMad64_32, 3872 bool SeparateOddAlignedProducts) const { 3873 // Use (possibly empty) vectors of S1 registers to represent the set of 3874 // carries from one pair of positions to the next. 3875 using Carry = SmallVector<Register, 2>; 3876 3877 MachineIRBuilder &B = Helper.MIRBuilder; 3878 GISelKnownBits &KB = *Helper.getKnownBits(); 3879 3880 const LLT S1 = LLT::scalar(1); 3881 const LLT S32 = LLT::scalar(32); 3882 const LLT S64 = LLT::scalar(64); 3883 3884 Register Zero32; 3885 Register Zero64; 3886 3887 auto getZero32 = [&]() -> Register { 3888 if (!Zero32) 3889 Zero32 = B.buildConstant(S32, 0).getReg(0); 3890 return Zero32; 3891 }; 3892 auto getZero64 = [&]() -> Register { 3893 if (!Zero64) 3894 Zero64 = B.buildConstant(S64, 0).getReg(0); 3895 return Zero64; 3896 }; 3897 3898 SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros; 3899 for (unsigned i = 0; i < Src0.size(); ++i) { 3900 Src0KnownZeros.push_back(KB.getKnownBits(Src0[i]).isZero()); 3901 Src1KnownZeros.push_back(KB.getKnownBits(Src1[i]).isZero()); 3902 } 3903 3904 // Merge the given carries into the 32-bit LocalAccum, which is modified 3905 // in-place. 3906 // 3907 // Returns the carry-out, which is a single S1 register or null. 
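  //
  // A single carry with no existing accumulator is just zero-extended into
  // LocalAccum; otherwise the carries are zero-extended and chained with
  // G_UADDE before being added to LocalAccum.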
3908 auto mergeCarry = 3909 [&](Register &LocalAccum, const Carry &CarryIn) -> Register { 3910 if (CarryIn.empty()) 3911 return Register(); 3912 3913 bool HaveCarryOut = true; 3914 Register CarryAccum; 3915 if (CarryIn.size() == 1) { 3916 if (!LocalAccum) { 3917 LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0); 3918 return Register(); 3919 } 3920 3921 CarryAccum = getZero32(); 3922 } else { 3923 CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0); 3924 for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) { 3925 CarryAccum = 3926 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i]) 3927 .getReg(0); 3928 } 3929 3930 if (!LocalAccum) { 3931 LocalAccum = getZero32(); 3932 HaveCarryOut = false; 3933 } 3934 } 3935 3936 auto Add = 3937 B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back()); 3938 LocalAccum = Add.getReg(0); 3939 return HaveCarryOut ? Add.getReg(1) : Register(); 3940 }; 3941 3942 // Build a multiply-add chain to compute 3943 // 3944 // LocalAccum + (partial products at DstIndex) 3945 // + (opportunistic subset of CarryIn) 3946 // 3947 // LocalAccum is an array of one or two 32-bit registers that are updated 3948 // in-place. The incoming registers may be null. 3949 // 3950 // In some edge cases, carry-ins can be consumed "for free". In that case, 3951 // the consumed carry bits are removed from CarryIn in-place. 3952 auto buildMadChain = 3953 [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn) 3954 -> Carry { 3955 assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) || 3956 (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1)); 3957 3958 Carry CarryOut; 3959 unsigned j0 = 0; 3960 3961 // Use plain 32-bit multiplication for the most significant part of the 3962 // result by default. 3963 if (LocalAccum.size() == 1 && 3964 (!UsePartialMad64_32 || !CarryIn.empty())) { 3965 do { 3966 // Skip multiplication if one of the operands is 0 3967 unsigned j1 = DstIndex - j0; 3968 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) { 3969 ++j0; 3970 continue; 3971 } 3972 auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]); 3973 if (!LocalAccum[0] || KB.getKnownBits(LocalAccum[0]).isZero()) { 3974 LocalAccum[0] = Mul.getReg(0); 3975 } else { 3976 if (CarryIn.empty()) { 3977 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0); 3978 } else { 3979 LocalAccum[0] = 3980 B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back()) 3981 .getReg(0); 3982 CarryIn.pop_back(); 3983 } 3984 } 3985 ++j0; 3986 } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty())); 3987 } 3988 3989 // Build full 64-bit multiplies. 
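        // Each iteration folds Src0[j0] * Src1[j1] into the 64-bit accumulator
        // with G_AMDGPU_MAD_U64_U32; once the accumulator is no longer known
        // to be small, the instruction's carry-out is collected into CarryOut.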
3990 if (j0 <= DstIndex) { 3991 bool HaveSmallAccum = false; 3992 Register Tmp; 3993 3994 if (LocalAccum[0]) { 3995 if (LocalAccum.size() == 1) { 3996 Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0); 3997 HaveSmallAccum = true; 3998 } else if (LocalAccum[1]) { 3999 Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0); 4000 HaveSmallAccum = false; 4001 } else { 4002 Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0); 4003 HaveSmallAccum = true; 4004 } 4005 } else { 4006 assert(LocalAccum.size() == 1 || !LocalAccum[1]); 4007 Tmp = getZero64(); 4008 HaveSmallAccum = true; 4009 } 4010 4011 do { 4012 unsigned j1 = DstIndex - j0; 4013 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) { 4014 ++j0; 4015 continue; 4016 } 4017 auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1}, 4018 {Src0[j0], Src1[j1], Tmp}); 4019 Tmp = Mad.getReg(0); 4020 if (!HaveSmallAccum) 4021 CarryOut.push_back(Mad.getReg(1)); 4022 HaveSmallAccum = false; 4023 4024 ++j0; 4025 } while (j0 <= DstIndex); 4026 4027 auto Unmerge = B.buildUnmerge(S32, Tmp); 4028 LocalAccum[0] = Unmerge.getReg(0); 4029 if (LocalAccum.size() > 1) 4030 LocalAccum[1] = Unmerge.getReg(1); 4031 } 4032 4033 return CarryOut; 4034 }; 4035 4036 // Outer multiply loop, iterating over destination parts from least 4037 // significant to most significant parts. 4038 // 4039 // The columns of the following diagram correspond to the destination parts 4040 // affected by one iteration of the outer loop (ignoring boundary 4041 // conditions). 4042 // 4043 // Dest index relative to 2 * i: 1 0 -1 4044 // ------ 4045 // Carries from previous iteration: e o 4046 // Even-aligned partial product sum: E E . 4047 // Odd-aligned partial product sum: O O 4048 // 4049 // 'o' is OddCarry, 'e' is EvenCarry. 4050 // EE and OO are computed from partial products via buildMadChain and use 4051 // accumulation where possible and appropriate. 4052 // 4053 Register SeparateOddCarry; 4054 Carry EvenCarry; 4055 Carry OddCarry; 4056 4057 for (unsigned i = 0; i <= Accum.size() / 2; ++i) { 4058 Carry OddCarryIn = std::move(OddCarry); 4059 Carry EvenCarryIn = std::move(EvenCarry); 4060 OddCarry.clear(); 4061 EvenCarry.clear(); 4062 4063 // Partial products at offset 2 * i. 4064 if (2 * i < Accum.size()) { 4065 auto LocalAccum = Accum.drop_front(2 * i).take_front(2); 4066 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn); 4067 } 4068 4069 // Partial products at offset 2 * i - 1. 4070 if (i > 0) { 4071 if (!SeparateOddAlignedProducts) { 4072 auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2); 4073 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn); 4074 } else { 4075 bool IsHighest = 2 * i >= Accum.size(); 4076 Register SeparateOddOut[2]; 4077 auto LocalAccum = MutableArrayRef(SeparateOddOut) 4078 .take_front(IsHighest ? 
1 : 2); 4079 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn); 4080 4081 MachineInstr *Lo; 4082 4083 if (i == 1) { 4084 if (!IsHighest) 4085 Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]); 4086 else 4087 Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]); 4088 } else { 4089 Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0], 4090 SeparateOddCarry); 4091 } 4092 Accum[2 * i - 1] = Lo->getOperand(0).getReg(); 4093 4094 if (!IsHighest) { 4095 auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1], 4096 Lo->getOperand(1).getReg()); 4097 Accum[2 * i] = Hi.getReg(0); 4098 SeparateOddCarry = Hi.getReg(1); 4099 } 4100 } 4101 } 4102 4103 // Add in the carries from the previous iteration 4104 if (i > 0) { 4105 if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn)) 4106 EvenCarryIn.push_back(CarryOut); 4107 4108 if (2 * i < Accum.size()) { 4109 if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn)) 4110 OddCarry.push_back(CarryOut); 4111 } 4112 } 4113 } 4114 } 4115 4116 // Custom narrowing of wide multiplies using wide multiply-add instructions. 4117 // 4118 // TODO: If the multiply is followed by an addition, we should attempt to 4119 // integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities. 4120 bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper, 4121 MachineInstr &MI) const { 4122 assert(ST.hasMad64_32()); 4123 assert(MI.getOpcode() == TargetOpcode::G_MUL); 4124 4125 MachineIRBuilder &B = Helper.MIRBuilder; 4126 MachineRegisterInfo &MRI = *B.getMRI(); 4127 4128 Register DstReg = MI.getOperand(0).getReg(); 4129 Register Src0 = MI.getOperand(1).getReg(); 4130 Register Src1 = MI.getOperand(2).getReg(); 4131 4132 LLT Ty = MRI.getType(DstReg); 4133 assert(Ty.isScalar()); 4134 4135 unsigned Size = Ty.getSizeInBits(); 4136 unsigned NumParts = Size / 32; 4137 assert((Size % 32) == 0); 4138 assert(NumParts >= 2); 4139 4140 // Whether to use MAD_64_32 for partial products whose high half is 4141 // discarded. This avoids some ADD instructions but risks false dependency 4142 // stalls on some subtargets in some cases. 4143 const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10; 4144 4145 // Whether to compute odd-aligned partial products separately. This is 4146 // advisable on subtargets where the accumulator of MAD_64_32 must be placed 4147 // in an even-aligned VGPR. 4148 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops(); 4149 4150 LLT S32 = LLT::scalar(32); 4151 SmallVector<Register, 2> Src0Parts, Src1Parts; 4152 for (unsigned i = 0; i < NumParts; ++i) { 4153 Src0Parts.push_back(MRI.createGenericVirtualRegister(S32)); 4154 Src1Parts.push_back(MRI.createGenericVirtualRegister(S32)); 4155 } 4156 B.buildUnmerge(Src0Parts, Src0); 4157 B.buildUnmerge(Src1Parts, Src1); 4158 4159 SmallVector<Register, 2> AccumRegs(NumParts); 4160 buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32, 4161 SeparateOddAlignedProducts); 4162 4163 B.buildMergeLikeInstr(DstReg, AccumRegs); 4164 MI.eraseFromParent(); 4165 return true; 4166 } 4167 4168 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to 4169 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input 4170 // case with a single min instruction instead of a compare+select. 
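//
// The ffbh/ffbl pseudos return -1 (all bits set) for a zero input, so e.g.
//   ctlz(x:s32) -> umin(ffbh(x), 32)
// produces 32, the required result for x == 0, without an explicit compare.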
bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);

  unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
                        ? AMDGPU::G_AMDGPU_FFBH_U32
                        : AMDGPU::G_AMDGPU_FFBL_B32;
  auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
  B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI,
                                                  MachineRegisterInfo &MRI,
                                                  MachineIRBuilder &B) const {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  LLT SrcTy = MRI.getType(Src);
  TypeSize NumBits = SrcTy.getSizeInBits();

  assert(NumBits < 32u);

  auto ShiftAmt = B.buildConstant(S32, 32u - NumBits);
  auto Extend = B.buildAnyExt(S32, {Src}).getReg(0u);
  auto Shift = B.buildShl(S32, Extend, ShiftAmt);
  auto Ctlz = B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {S32}, {Shift});
  B.buildTrunc(Dst, Ctlz);
  MI.eraseFromParent();
  return true;
}

// Check that this is a G_XOR x, -1
static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
  if (MI.getOpcode() != TargetOpcode::G_XOR)
    return false;
  auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
  return ConstVal && *ConstVal == -1;
}

// Return the use branch instruction, or null if the usage is invalid.
static MachineInstr *
verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
                  MachineBasicBlock *&UncondBrTarget, bool &Negated) {
  Register CondDef = MI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(CondDef))
    return nullptr;

  MachineBasicBlock *Parent = MI.getParent();
  MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);

  if (isNot(MRI, *UseMI)) {
    Register NegatedCond = UseMI->getOperand(0).getReg();
    if (!MRI.hasOneNonDBGUse(NegatedCond))
      return nullptr;

    // We're deleting the def of this value, so we need to remove it.
    eraseInstr(*UseMI, MRI);

    UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
    Negated = true;
  }

  if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
    return nullptr;

  // Make sure the cond br is followed by a G_BR, or is the last instruction.
  MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
  if (Next == Parent->end()) {
    MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
    if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
4247 return nullptr; 4248 UncondBrTarget = &*NextMBB; 4249 } else { 4250 if (Next->getOpcode() != AMDGPU::G_BR) 4251 return nullptr; 4252 Br = &*Next; 4253 UncondBrTarget = Br->getOperand(0).getMBB(); 4254 } 4255 4256 return UseMI; 4257 } 4258 4259 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 4260 const ArgDescriptor *Arg, 4261 const TargetRegisterClass *ArgRC, 4262 LLT ArgTy) const { 4263 MCRegister SrcReg = Arg->getRegister(); 4264 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); 4265 assert(DstReg.isVirtual() && "Virtual register expected"); 4266 4267 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, 4268 *ArgRC, B.getDebugLoc(), ArgTy); 4269 if (Arg->isMasked()) { 4270 // TODO: Should we try to emit this once in the entry block? 4271 const LLT S32 = LLT::scalar(32); 4272 const unsigned Mask = Arg->getMask(); 4273 const unsigned Shift = llvm::countr_zero<unsigned>(Mask); 4274 4275 Register AndMaskSrc = LiveIn; 4276 4277 // TODO: Avoid clearing the high bits if we know workitem id y/z are always 4278 // 0. 4279 if (Shift != 0) { 4280 auto ShiftAmt = B.buildConstant(S32, Shift); 4281 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 4282 } 4283 4284 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 4285 } else { 4286 B.buildCopy(DstReg, LiveIn); 4287 } 4288 4289 return true; 4290 } 4291 4292 bool AMDGPULegalizerInfo::loadInputValue( 4293 Register DstReg, MachineIRBuilder &B, 4294 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 4295 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 4296 const ArgDescriptor *Arg = nullptr; 4297 const TargetRegisterClass *ArgRC; 4298 LLT ArgTy; 4299 4300 CallingConv::ID CC = B.getMF().getFunction().getCallingConv(); 4301 const ArgDescriptor WorkGroupIDX = 4302 ArgDescriptor::createRegister(AMDGPU::TTMP9); 4303 // If GridZ is not programmed in an entry function then the hardware will set 4304 // it to all zeros, so there is no need to mask the GridY value in the low 4305 // order bits. 4306 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister( 4307 AMDGPU::TTMP7, 4308 AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu); 4309 const ArgDescriptor WorkGroupIDZ = 4310 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u); 4311 if (ST.hasArchitectedSGPRs() && 4312 (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx)) { 4313 switch (ArgType) { 4314 case AMDGPUFunctionArgInfo::WORKGROUP_ID_X: 4315 Arg = &WorkGroupIDX; 4316 ArgRC = &AMDGPU::SReg_32RegClass; 4317 ArgTy = LLT::scalar(32); 4318 break; 4319 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y: 4320 Arg = &WorkGroupIDY; 4321 ArgRC = &AMDGPU::SReg_32RegClass; 4322 ArgTy = LLT::scalar(32); 4323 break; 4324 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z: 4325 Arg = &WorkGroupIDZ; 4326 ArgRC = &AMDGPU::SReg_32RegClass; 4327 ArgTy = LLT::scalar(32); 4328 break; 4329 default: 4330 break; 4331 } 4332 } 4333 4334 if (!Arg) 4335 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 4336 4337 if (!Arg) { 4338 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) { 4339 // The intrinsic may appear when we have a 0 sized kernarg segment, in which 4340 // case the pointer argument may be missing and we use null. 4341 B.buildConstant(DstReg, 0); 4342 return true; 4343 } 4344 4345 // It's undefined behavior if a function marked with the amdgpu-no-* 4346 // attributes uses the corresponding intrinsic. 
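    // Rather than diagnosing the violation here, just fold the read to undef.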
4347 B.buildUndef(DstReg); 4348 return true; 4349 } 4350 4351 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 4352 return false; // TODO: Handle these 4353 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 4354 } 4355 4356 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 4357 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 4358 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 4359 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 4360 return false; 4361 4362 MI.eraseFromParent(); 4363 return true; 4364 } 4365 4366 static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, 4367 int64_t C) { 4368 B.buildConstant(MI.getOperand(0).getReg(), C); 4369 MI.eraseFromParent(); 4370 return true; 4371 } 4372 4373 bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic( 4374 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 4375 unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 4376 unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim); 4377 if (MaxID == 0) 4378 return replaceWithConstant(B, MI, 0); 4379 4380 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 4381 const ArgDescriptor *Arg; 4382 const TargetRegisterClass *ArgRC; 4383 LLT ArgTy; 4384 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 4385 4386 Register DstReg = MI.getOperand(0).getReg(); 4387 if (!Arg) { 4388 // It's undefined behavior if a function marked with the amdgpu-no-* 4389 // attributes uses the corresponding intrinsic. 4390 B.buildUndef(DstReg); 4391 MI.eraseFromParent(); 4392 return true; 4393 } 4394 4395 if (Arg->isMasked()) { 4396 // Don't bother inserting AssertZext for packed IDs since we're emitting the 4397 // masking operations anyway. 4398 // 4399 // TODO: We could assert the top bit is 0 for the source copy. 4400 if (!loadInputValue(DstReg, B, ArgType)) 4401 return false; 4402 } else { 4403 Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32)); 4404 if (!loadInputValue(TmpReg, B, ArgType)) 4405 return false; 4406 B.buildAssertZExt(DstReg, TmpReg, llvm::bit_width(MaxID)); 4407 } 4408 4409 MI.eraseFromParent(); 4410 return true; 4411 } 4412 4413 Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B, 4414 int64_t Offset) const { 4415 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 4416 Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy); 4417 4418 // TODO: If we passed in the base kernel offset we could have a better 4419 // alignment than 4, but we don't really need it. 4420 if (!loadInputValue(KernArgReg, B, 4421 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 4422 llvm_unreachable("failed to find kernarg segment ptr"); 4423 4424 auto COffset = B.buildConstant(LLT::scalar(64), Offset); 4425 // TODO: Should get nuw 4426 return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0); 4427 } 4428 4429 /// Legalize a value that's loaded from kernel arguments. This is only used by 4430 /// legacy intrinsics. 
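///
/// The value is read with a dereferenceable, invariant load from the kernarg
/// segment pointer plus \p Offset.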
bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
                                                      MachineIRBuilder &B,
                                                      uint64_t Offset,
                                                      Align Alignment) const {
  Register DstReg = MI.getOperand(0).getReg();

  assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
         "unexpected kernarg parameter type");

  Register Ptr = getKernargParameterPtr(B, Offset);
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
              MachineMemOperand::MODereferenceable |
                  MachineMemOperand::MOInvariant);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
                                       MachineRegisterInfo &MRI,
                                       MachineIRBuilder &B) const {
  Register Dst = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT S16 = LLT::scalar(16);
  LLT S32 = LLT::scalar(32);
  LLT S64 = LLT::scalar(64);

  if (DstTy == S16)
    return legalizeFDIV16(MI, MRI, B);
  if (DstTy == S32)
    return legalizeFDIV32(MI, MRI, B);
  if (DstTy == S64)
    return legalizeFDIV64(MI, MRI, B);

  return false;
}

void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
                                                        Register DstDivReg,
                                                        Register DstRemReg,
                                                        Register X,
                                                        Register Y) const {
  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);

  // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
  // algorithm used here.

  // Initial estimate of inv(y).
  auto FloatY = B.buildUITOFP(S32, Y);
  auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
  auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe));
  auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
  auto Z = B.buildFPTOUI(S32, ScaledY);

  // One round of UNR (unsigned Newton-Raphson).
  auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
  auto NegYZ = B.buildMul(S32, NegY, Z);
  Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));

  // Quotient/remainder estimate.
  auto Q = B.buildUMulH(S32, X, Z);
  auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));

  // First quotient/remainder refinement.
  auto One = B.buildConstant(S32, 1);
  auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
  if (DstDivReg)
    Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
  R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);

  // Second quotient/remainder refinement.
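  // After the first correction the estimate should be off by at most one, so a
  // single additional compare-and-select pair per result is enough.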
4503 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 4504 if (DstDivReg) 4505 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q); 4506 4507 if (DstRemReg) 4508 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R); 4509 } 4510 4511 // Build integer reciprocal sequence around V_RCP_IFLAG_F32 4512 // 4513 // Return lo, hi of result 4514 // 4515 // %cvt.lo = G_UITOFP Val.lo 4516 // %cvt.hi = G_UITOFP Val.hi 4517 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 4518 // %rcp = G_AMDGPU_RCP_IFLAG %mad 4519 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 4520 // %mul2 = G_FMUL %mul1, 2**(-32) 4521 // %trunc = G_INTRINSIC_TRUNC %mul2 4522 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 4523 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 4524 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 4525 Register Val) { 4526 const LLT S32 = LLT::scalar(32); 4527 auto Unmerge = B.buildUnmerge(S32, Val); 4528 4529 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 4530 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 4531 4532 auto Mad = B.buildFMAD( 4533 S32, CvtHi, // 2**32 4534 B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo); 4535 4536 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 4537 auto Mul1 = B.buildFMul( 4538 S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc))); 4539 4540 // 2**(-32) 4541 auto Mul2 = B.buildFMul( 4542 S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000))); 4543 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 4544 4545 // -(2**32) 4546 auto Mad2 = B.buildFMAD( 4547 S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)), 4548 Mul1); 4549 4550 auto ResultLo = B.buildFPTOUI(S32, Mad2); 4551 auto ResultHi = B.buildFPTOUI(S32, Trunc); 4552 4553 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 4554 } 4555 4556 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, 4557 Register DstDivReg, 4558 Register DstRemReg, 4559 Register Numer, 4560 Register Denom) const { 4561 const LLT S32 = LLT::scalar(32); 4562 const LLT S64 = LLT::scalar(64); 4563 const LLT S1 = LLT::scalar(1); 4564 Register RcpLo, RcpHi; 4565 4566 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 4567 4568 auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi}); 4569 4570 auto Zero64 = B.buildConstant(S64, 0); 4571 auto NegDenom = B.buildSub(S64, Zero64, Denom); 4572 4573 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 4574 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 4575 4576 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 4577 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 4578 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 4579 4580 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 4581 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 4582 auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi}); 4583 4584 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 4585 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 4586 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 4587 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 4588 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 4589 4590 auto Zero32 = B.buildConstant(S32, 0); 4591 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 4592 auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1)); 4593 auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi}); 4594 4595 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 4596 Register NumerLo = UnmergeNumer.getReg(0); 4597 Register NumerHi = UnmergeNumer.getReg(1); 4598 4599 auto 
MulHi3 = B.buildUMulH(S64, Numer, Add2); 4600 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 4601 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 4602 Register Mul3_Lo = UnmergeMul3.getReg(0); 4603 Register Mul3_Hi = UnmergeMul3.getReg(1); 4604 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 4605 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 4606 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 4607 auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi}); 4608 4609 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 4610 Register DenomLo = UnmergeDenom.getReg(0); 4611 Register DenomHi = UnmergeDenom.getReg(1); 4612 4613 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 4614 auto C1 = B.buildSExt(S32, CmpHi); 4615 4616 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 4617 auto C2 = B.buildSExt(S32, CmpLo); 4618 4619 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 4620 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 4621 4622 // TODO: Here and below portions of the code can be enclosed into if/endif. 4623 // Currently control flow is unconditional and we have 4 selects after 4624 // potential endif to substitute PHIs. 4625 4626 // if C3 != 0 ... 4627 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 4628 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 4629 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 4630 auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi}); 4631 4632 auto One64 = B.buildConstant(S64, 1); 4633 auto Add3 = B.buildAdd(S64, MulHi3, One64); 4634 4635 auto C4 = 4636 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 4637 auto C5 = 4638 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 4639 auto C6 = B.buildSelect( 4640 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 4641 4642 // if (C6 != 0) 4643 auto Add4 = B.buildAdd(S64, Add3, One64); 4644 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 4645 4646 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 4647 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 4648 auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi}); 4649 4650 // endif C6 4651 // endif C3 4652 4653 if (DstDivReg) { 4654 auto Sel1 = B.buildSelect( 4655 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 4656 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 4657 Sel1, MulHi3); 4658 } 4659 4660 if (DstRemReg) { 4661 auto Sel2 = B.buildSelect( 4662 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 4663 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 4664 Sel2, Sub1); 4665 } 4666 } 4667 4668 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI, 4669 MachineRegisterInfo &MRI, 4670 MachineIRBuilder &B) const { 4671 Register DstDivReg, DstRemReg; 4672 switch (MI.getOpcode()) { 4673 default: 4674 llvm_unreachable("Unexpected opcode!"); 4675 case AMDGPU::G_UDIV: { 4676 DstDivReg = MI.getOperand(0).getReg(); 4677 break; 4678 } 4679 case AMDGPU::G_UREM: { 4680 DstRemReg = MI.getOperand(0).getReg(); 4681 break; 4682 } 4683 case AMDGPU::G_UDIVREM: { 4684 DstDivReg = MI.getOperand(0).getReg(); 4685 DstRemReg = MI.getOperand(1).getReg(); 4686 break; 4687 } 4688 } 4689 4690 const LLT S64 = LLT::scalar(64); 4691 const LLT S32 = LLT::scalar(32); 4692 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 4693 Register Num = 
MI.getOperand(FirstSrcOpIdx).getReg(); 4694 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 4695 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 4696 4697 if (Ty == S32) 4698 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den); 4699 else if (Ty == S64) 4700 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den); 4701 else 4702 return false; 4703 4704 MI.eraseFromParent(); 4705 return true; 4706 } 4707 4708 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI, 4709 MachineRegisterInfo &MRI, 4710 MachineIRBuilder &B) const { 4711 const LLT S64 = LLT::scalar(64); 4712 const LLT S32 = LLT::scalar(32); 4713 4714 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 4715 if (Ty != S32 && Ty != S64) 4716 return false; 4717 4718 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 4719 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg(); 4720 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 4721 4722 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 4723 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 4724 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 4725 4726 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 4727 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 4728 4729 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 4730 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 4731 4732 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg; 4733 switch (MI.getOpcode()) { 4734 default: 4735 llvm_unreachable("Unexpected opcode!"); 4736 case AMDGPU::G_SDIV: { 4737 DstDivReg = MI.getOperand(0).getReg(); 4738 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 4739 break; 4740 } 4741 case AMDGPU::G_SREM: { 4742 DstRemReg = MI.getOperand(0).getReg(); 4743 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 4744 break; 4745 } 4746 case AMDGPU::G_SDIVREM: { 4747 DstDivReg = MI.getOperand(0).getReg(); 4748 DstRemReg = MI.getOperand(1).getReg(); 4749 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 4750 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 4751 break; 4752 } 4753 } 4754 4755 if (Ty == S32) 4756 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 4757 else 4758 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 4759 4760 if (DstDivReg) { 4761 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 4762 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0); 4763 B.buildSub(DstDivReg, SignXor, Sign); 4764 } 4765 4766 if (DstRemReg) { 4767 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 4768 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0); 4769 B.buildSub(DstRemReg, SignXor, Sign); 4770 } 4771 4772 MI.eraseFromParent(); 4773 return true; 4774 } 4775 4776 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 4777 MachineRegisterInfo &MRI, 4778 MachineIRBuilder &B) const { 4779 Register Res = MI.getOperand(0).getReg(); 4780 Register LHS = MI.getOperand(1).getReg(); 4781 Register RHS = MI.getOperand(2).getReg(); 4782 uint16_t Flags = MI.getFlags(); 4783 LLT ResTy = MRI.getType(Res); 4784 4785 const MachineFunction &MF = B.getMF(); 4786 bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn) || 4787 MF.getTarget().Options.UnsafeFPMath; 4788 4789 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 4790 if (!AllowInaccurateRcp && ResTy != LLT::scalar(16)) 4791 return false; 4792 4793 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to 4794 // the CI documentation has a worst case error of 1 ulp. 
4795 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to 4796 // use it as long as we aren't trying to use denormals. 4797 // 4798 // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp. 4799 4800 // 1 / x -> RCP(x) 4801 if (CLHS->isExactlyValue(1.0)) { 4802 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res) 4803 .addUse(RHS) 4804 .setMIFlags(Flags); 4805 4806 MI.eraseFromParent(); 4807 return true; 4808 } 4809 4810 // -1 / x -> RCP( FNEG(x) ) 4811 if (CLHS->isExactlyValue(-1.0)) { 4812 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 4813 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res) 4814 .addUse(FNeg.getReg(0)) 4815 .setMIFlags(Flags); 4816 4817 MI.eraseFromParent(); 4818 return true; 4819 } 4820 } 4821 4822 // For f16 require afn or arcp. 4823 // For f32 require afn. 4824 if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) || 4825 !MI.getFlag(MachineInstr::FmArcp))) 4826 return false; 4827 4828 // x / y -> x * (1.0 / y) 4829 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}) 4830 .addUse(RHS) 4831 .setMIFlags(Flags); 4832 B.buildFMul(Res, LHS, RCP, Flags); 4833 4834 MI.eraseFromParent(); 4835 return true; 4836 } 4837 4838 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, 4839 MachineRegisterInfo &MRI, 4840 MachineIRBuilder &B) const { 4841 Register Res = MI.getOperand(0).getReg(); 4842 Register X = MI.getOperand(1).getReg(); 4843 Register Y = MI.getOperand(2).getReg(); 4844 uint16_t Flags = MI.getFlags(); 4845 LLT ResTy = MRI.getType(Res); 4846 4847 const MachineFunction &MF = B.getMF(); 4848 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 4849 MI.getFlag(MachineInstr::FmAfn); 4850 4851 if (!AllowInaccurateRcp) 4852 return false; 4853 4854 auto NegY = B.buildFNeg(ResTy, Y); 4855 auto One = B.buildFConstant(ResTy, 1.0); 4856 4857 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}) 4858 .addUse(Y) 4859 .setMIFlags(Flags); 4860 4861 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); 4862 R = B.buildFMA(ResTy, Tmp0, R, R); 4863 4864 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); 4865 R = B.buildFMA(ResTy, Tmp1, R, R); 4866 4867 auto Ret = B.buildFMul(ResTy, X, R); 4868 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); 4869 4870 B.buildFMA(Res, Tmp2, R, Ret); 4871 MI.eraseFromParent(); 4872 return true; 4873 } 4874 4875 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 4876 MachineRegisterInfo &MRI, 4877 MachineIRBuilder &B) const { 4878 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 4879 return true; 4880 4881 Register Res = MI.getOperand(0).getReg(); 4882 Register LHS = MI.getOperand(1).getReg(); 4883 Register RHS = MI.getOperand(2).getReg(); 4884 4885 uint16_t Flags = MI.getFlags(); 4886 4887 LLT S16 = LLT::scalar(16); 4888 LLT S32 = LLT::scalar(32); 4889 4890 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 4891 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 4892 4893 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}) 4894 .addUse(RHSExt.getReg(0)) 4895 .setMIFlags(Flags); 4896 4897 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 4898 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 4899 4900 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res) 4901 .addUse(RDst.getReg(0)) 4902 .addUse(RHS) 4903 .addUse(LHS) 4904 .setMIFlags(Flags); 4905 4906 MI.eraseFromParent(); 4907 return true; 4908 } 4909 4910 static constexpr unsigned SPDenormModeBitField = 4911 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 4, 2); 4912 4913 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 4914 // to enable denorm mode. 
When 'Enable' is false, disable denorm mode. 4915 static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B, 4916 const GCNSubtarget &ST, 4917 SIModeRegisterDefaults Mode) { 4918 // Set SP denorm mode to this value. 4919 unsigned SPDenormMode = 4920 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 4921 4922 if (ST.hasDenormModeInst()) { 4923 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 4924 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 4925 4926 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 4927 B.buildInstr(AMDGPU::S_DENORM_MODE) 4928 .addImm(NewDenormModeValue); 4929 4930 } else { 4931 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 4932 .addImm(SPDenormMode) 4933 .addImm(SPDenormModeBitField); 4934 } 4935 } 4936 4937 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 4938 MachineRegisterInfo &MRI, 4939 MachineIRBuilder &B) const { 4940 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 4941 return true; 4942 4943 Register Res = MI.getOperand(0).getReg(); 4944 Register LHS = MI.getOperand(1).getReg(); 4945 Register RHS = MI.getOperand(2).getReg(); 4946 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 4947 SIModeRegisterDefaults Mode = MFI->getMode(); 4948 4949 uint16_t Flags = MI.getFlags(); 4950 4951 LLT S32 = LLT::scalar(32); 4952 LLT S1 = LLT::scalar(1); 4953 4954 auto One = B.buildFConstant(S32, 1.0f); 4955 4956 auto DenominatorScaled = 4957 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}) 4958 .addUse(LHS) 4959 .addUse(RHS) 4960 .addImm(0) 4961 .setMIFlags(Flags); 4962 auto NumeratorScaled = 4963 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}) 4964 .addUse(LHS) 4965 .addUse(RHS) 4966 .addImm(1) 4967 .setMIFlags(Flags); 4968 4969 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}) 4970 .addUse(DenominatorScaled.getReg(0)) 4971 .setMIFlags(Flags); 4972 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 4973 4974 const bool PreservesDenormals = Mode.FP32Denormals == DenormalMode::getIEEE(); 4975 const bool HasDynamicDenormals = 4976 (Mode.FP32Denormals.Input == DenormalMode::Dynamic) || 4977 (Mode.FP32Denormals.Output == DenormalMode::Dynamic); 4978 4979 Register SavedSPDenormMode; 4980 if (!PreservesDenormals) { 4981 if (HasDynamicDenormals) { 4982 SavedSPDenormMode = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 4983 B.buildInstr(AMDGPU::S_GETREG_B32) 4984 .addDef(SavedSPDenormMode) 4985 .addImm(SPDenormModeBitField); 4986 } 4987 toggleSPDenormMode(true, B, ST, Mode); 4988 } 4989 4990 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 4991 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 4992 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 4993 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 4994 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 4995 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 4996 4997 if (!PreservesDenormals) { 4998 if (HasDynamicDenormals) { 4999 assert(SavedSPDenormMode); 5000 B.buildInstr(AMDGPU::S_SETREG_B32) 5001 .addReg(SavedSPDenormMode) 5002 .addImm(SPDenormModeBitField); 5003 } else 5004 toggleSPDenormMode(false, B, ST, Mode); 5005 } 5006 5007 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}) 5008 .addUse(Fma4.getReg(0)) 5009 .addUse(Fma1.getReg(0)) 5010 .addUse(Fma3.getReg(0)) 5011 .addUse(NumeratorScaled.getReg(1)) 5012 .setMIFlags(Flags); 5013 5014 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res) 
5015 .addUse(Fmas.getReg(0)) 5016 .addUse(RHS) 5017 .addUse(LHS) 5018 .setMIFlags(Flags); 5019 5020 MI.eraseFromParent(); 5021 return true; 5022 } 5023 5024 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 5025 MachineRegisterInfo &MRI, 5026 MachineIRBuilder &B) const { 5027 if (legalizeFastUnsafeFDIV64(MI, MRI, B)) 5028 return true; 5029 5030 Register Res = MI.getOperand(0).getReg(); 5031 Register LHS = MI.getOperand(1).getReg(); 5032 Register RHS = MI.getOperand(2).getReg(); 5033 5034 uint16_t Flags = MI.getFlags(); 5035 5036 LLT S64 = LLT::scalar(64); 5037 LLT S1 = LLT::scalar(1); 5038 5039 auto One = B.buildFConstant(S64, 1.0); 5040 5041 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}) 5042 .addUse(LHS) 5043 .addUse(RHS) 5044 .addImm(0) 5045 .setMIFlags(Flags); 5046 5047 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 5048 5049 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}) 5050 .addUse(DivScale0.getReg(0)) 5051 .setMIFlags(Flags); 5052 5053 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 5054 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 5055 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 5056 5057 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}) 5058 .addUse(LHS) 5059 .addUse(RHS) 5060 .addImm(1) 5061 .setMIFlags(Flags); 5062 5063 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 5064 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 5065 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 5066 5067 Register Scale; 5068 if (!ST.hasUsableDivScaleConditionOutput()) { 5069 // Workaround a hardware bug on SI where the condition output from div_scale 5070 // is not usable. 5071 5072 LLT S32 = LLT::scalar(32); 5073 5074 auto NumUnmerge = B.buildUnmerge(S32, LHS); 5075 auto DenUnmerge = B.buildUnmerge(S32, RHS); 5076 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 5077 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 5078 5079 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 5080 Scale1Unmerge.getReg(1)); 5081 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 5082 Scale0Unmerge.getReg(1)); 5083 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 5084 } else { 5085 Scale = DivScale1.getReg(1); 5086 } 5087 5088 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}) 5089 .addUse(Fma4.getReg(0)) 5090 .addUse(Fma3.getReg(0)) 5091 .addUse(Mul.getReg(0)) 5092 .addUse(Scale) 5093 .setMIFlags(Flags); 5094 5095 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res)) 5096 .addUse(Fmas.getReg(0)) 5097 .addUse(RHS) 5098 .addUse(LHS) 5099 .setMIFlags(Flags); 5100 5101 MI.eraseFromParent(); 5102 return true; 5103 } 5104 5105 bool AMDGPULegalizerInfo::legalizeFFREXP(MachineInstr &MI, 5106 MachineRegisterInfo &MRI, 5107 MachineIRBuilder &B) const { 5108 Register Res0 = MI.getOperand(0).getReg(); 5109 Register Res1 = MI.getOperand(1).getReg(); 5110 Register Val = MI.getOperand(2).getReg(); 5111 uint16_t Flags = MI.getFlags(); 5112 5113 LLT Ty = MRI.getType(Res0); 5114 LLT InstrExpTy = Ty == LLT::scalar(16) ? 
LLT::scalar(16) : LLT::scalar(32);
5115
5116 auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})
5117 .addUse(Val)
5118 .setMIFlags(Flags);
5119 auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})
5120 .addUse(Val)
5121 .setMIFlags(Flags);
5122
5123 if (ST.hasFractBug()) {
5124 auto Fabs = B.buildFAbs(Ty, Val);
5125 auto Inf = B.buildFConstant(Ty, APFloat::getInf(getFltSemanticForLLT(Ty)));
5126 auto IsFinite =
5127 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
5128 auto Zero = B.buildConstant(InstrExpTy, 0);
5129 Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
5130 Mant = B.buildSelect(Ty, IsFinite, Mant, Val);
5131 }
5132
5133 B.buildCopy(Res0, Mant);
5134 B.buildSExtOrTrunc(Res1, Exp);
5135
5136 MI.eraseFromParent();
5137 return true;
5138 }
5139
5140 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
5141 MachineRegisterInfo &MRI,
5142 MachineIRBuilder &B) const {
5143 Register Res = MI.getOperand(0).getReg();
5144 Register LHS = MI.getOperand(2).getReg();
5145 Register RHS = MI.getOperand(3).getReg();
5146 uint16_t Flags = MI.getFlags();
5147
5148 LLT S32 = LLT::scalar(32);
5149 LLT S1 = LLT::scalar(1);
5150
5151 auto Abs = B.buildFAbs(S32, RHS, Flags);
5152 const APFloat C0Val(1.0f);
5153
5154 auto C0 = B.buildFConstant(S32, 0x1p+96f);
5155 auto C1 = B.buildFConstant(S32, 0x1p-32f);
5156 auto C2 = B.buildFConstant(S32, 1.0f);
5157
5158 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
5159 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
5160
5161 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
5162
5163 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5164 .addUse(Mul0.getReg(0))
5165 .setMIFlags(Flags);
5166
5167 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
5168
5169 B.buildFMul(Res, Sel, Mul1, Flags);
5170
5171 MI.eraseFromParent();
5172 return true;
5173 }
5174
5175 bool AMDGPULegalizerInfo::legalizeFSQRTF16(MachineInstr &MI,
5176 MachineRegisterInfo &MRI,
5177 MachineIRBuilder &B) const {
5178 // Bypass the correct expansion a standard promotion through G_FSQRT would
5179 // get. The f32 op is accurate enough for the f16 case.
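// As a rough sketch (not literal output), for an f16 input %x the builder
// calls below produce something like:
//   %ext:_(s32) = G_FPEXT %x:_(s16)
//   %sqrt:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sqrt), %ext:_(s32)
//   %res:_(s16) = G_FPTRUNC %sqrt:_(s32)
// i.e. the f16 square root is computed entirely in f32 and narrowed at the
// end.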
5180 unsigned Flags = MI.getFlags(); 5181 assert(!ST.has16BitInsts()); 5182 const LLT F32 = LLT::scalar(32); 5183 auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags); 5184 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32}) 5185 .addUse(Ext.getReg(0)) 5186 .setMIFlags(Flags); 5187 B.buildFPTrunc(MI.getOperand(0), Log2, Flags); 5188 MI.eraseFromParent(); 5189 return true; 5190 } 5191 5192 bool AMDGPULegalizerInfo::legalizeFSQRTF32(MachineInstr &MI, 5193 MachineRegisterInfo &MRI, 5194 MachineIRBuilder &B) const { 5195 MachineFunction &MF = B.getMF(); 5196 Register Dst = MI.getOperand(0).getReg(); 5197 Register X = MI.getOperand(1).getReg(); 5198 const unsigned Flags = MI.getFlags(); 5199 const LLT S1 = LLT::scalar(1); 5200 const LLT F32 = LLT::scalar(32); 5201 const LLT I32 = LLT::scalar(32); 5202 5203 if (allowApproxFunc(MF, Flags)) { 5204 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({Dst})) 5205 .addUse(X) 5206 .setMIFlags(Flags); 5207 MI.eraseFromParent(); 5208 return true; 5209 } 5210 5211 auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f); 5212 auto NeedScale = B.buildFCmp(CmpInst::FCMP_OGT, S1, ScaleThreshold, X, Flags); 5213 auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f); 5214 auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags); 5215 auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags); 5216 5217 Register SqrtS = MRI.createGenericVirtualRegister(F32); 5218 if (needsDenormHandlingF32(MF, X, Flags)) { 5219 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({SqrtS})) 5220 .addUse(SqrtX.getReg(0)) 5221 .setMIFlags(Flags); 5222 5223 auto NegOne = B.buildConstant(I32, -1); 5224 auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne); 5225 5226 auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags); 5227 auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags); 5228 5229 auto PosOne = B.buildConstant(I32, 1); 5230 auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne); 5231 5232 auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags); 5233 auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags); 5234 5235 auto Zero = B.buildFConstant(F32, 0.0f); 5236 auto SqrtVPLE0 = B.buildFCmp(CmpInst::FCMP_OLE, S1, SqrtVP, Zero, Flags); 5237 5238 SqrtS = 5239 B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0); 5240 5241 auto SqrtVPVSGT0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, SqrtVS, Zero, Flags); 5242 SqrtS = 5243 B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0); 5244 } else { 5245 auto SqrtR = 5246 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0)); 5247 B.buildFMul(SqrtS, SqrtX, SqrtR, Flags); 5248 5249 auto Half = B.buildFConstant(F32, 0.5f); 5250 auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags); 5251 auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags); 5252 auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags); 5253 SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags); 5254 SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0); 5255 auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags); 5256 auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, SqrtX, Flags); 5257 SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0); 5258 } 5259 5260 auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f); 5261 5262 auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags); 5263 5264 SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0); 5265 5266 auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf); 5267 B.buildSelect(Dst, 
IsZeroOrInf, SqrtX, SqrtS, Flags); 5268 5269 MI.eraseFromParent(); 5270 return true; 5271 } 5272 5273 bool AMDGPULegalizerInfo::legalizeFSQRTF64(MachineInstr &MI, 5274 MachineRegisterInfo &MRI, 5275 MachineIRBuilder &B) const { 5276 // For double type, the SQRT and RSQ instructions don't have required 5277 // precision, we apply Goldschmidt's algorithm to improve the result: 5278 // 5279 // y0 = rsq(x) 5280 // g0 = x * y0 5281 // h0 = 0.5 * y0 5282 // 5283 // r0 = 0.5 - h0 * g0 5284 // g1 = g0 * r0 + g0 5285 // h1 = h0 * r0 + h0 5286 // 5287 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1 5288 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1 5289 // h2 = h1 * r1 + h1 5290 // 5291 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2 5292 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2 5293 // 5294 // sqrt(x) = g3 5295 5296 const LLT S1 = LLT::scalar(1); 5297 const LLT S32 = LLT::scalar(32); 5298 const LLT F64 = LLT::scalar(64); 5299 5300 Register Dst = MI.getOperand(0).getReg(); 5301 assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt"); 5302 5303 Register X = MI.getOperand(1).getReg(); 5304 unsigned Flags = MI.getFlags(); 5305 5306 auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767); 5307 5308 auto ZeroInt = B.buildConstant(S32, 0); 5309 auto Scaling = B.buildFCmp(FCmpInst::FCMP_OLT, S1, X, ScaleConstant); 5310 5311 // Scale up input if it is too small. 5312 auto ScaleUpFactor = B.buildConstant(S32, 256); 5313 auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt); 5314 auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags); 5315 5316 auto SqrtY = 5317 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX.getReg(0)); 5318 5319 auto Half = B.buildFConstant(F64, 0.5); 5320 auto SqrtH0 = B.buildFMul(F64, SqrtY, Half); 5321 auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY); 5322 5323 auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0); 5324 auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half); 5325 5326 auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0); 5327 auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0); 5328 5329 auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1); 5330 auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX); 5331 5332 auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1); 5333 5334 auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2); 5335 auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX); 5336 5337 auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2); 5338 5339 // Scale down the result. 5340 auto ScaleDownFactor = B.buildConstant(S32, -128); 5341 auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt); 5342 SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags); 5343 5344 // TODO: Switch to fcmp oeq 0 for finite only. 
Can't fully remove this check 5345 // with finite only or nsz because rsq(+/-0) = +/-inf 5346 5347 // TODO: Check for DAZ and expand to subnormals 5348 auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf); 5349 5350 // If x is +INF, +0, or -0, use its original value 5351 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags); 5352 5353 MI.eraseFromParent(); 5354 return true; 5355 } 5356 5357 bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI, 5358 MachineRegisterInfo &MRI, 5359 MachineIRBuilder &B) const { 5360 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 5361 if (Ty == LLT::scalar(32)) 5362 return legalizeFSQRTF32(MI, MRI, B); 5363 if (Ty == LLT::scalar(64)) 5364 return legalizeFSQRTF64(MI, MRI, B); 5365 if (Ty == LLT::scalar(16)) 5366 return legalizeFSQRTF16(MI, MRI, B); 5367 return false; 5368 } 5369 5370 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 5371 // FIXME: Why do we handle this one but not other removed instructions? 5372 // 5373 // Reciprocal square root. The clamp prevents infinite results, clamping 5374 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 5375 // +-max_float. 5376 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 5377 MachineRegisterInfo &MRI, 5378 MachineIRBuilder &B) const { 5379 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 5380 return true; 5381 5382 Register Dst = MI.getOperand(0).getReg(); 5383 Register Src = MI.getOperand(2).getReg(); 5384 auto Flags = MI.getFlags(); 5385 5386 LLT Ty = MRI.getType(Dst); 5387 5388 const fltSemantics *FltSemantics; 5389 if (Ty == LLT::scalar(32)) 5390 FltSemantics = &APFloat::IEEEsingle(); 5391 else if (Ty == LLT::scalar(64)) 5392 FltSemantics = &APFloat::IEEEdouble(); 5393 else 5394 return false; 5395 5396 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}) 5397 .addUse(Src) 5398 .setMIFlags(Flags); 5399 5400 // We don't need to concern ourselves with the snan handling difference, since 5401 // the rsq quieted (or not) so use the one which will directly select. 5402 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 5403 const bool UseIEEE = MFI->getMode().IEEE; 5404 5405 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 5406 auto ClampMax = UseIEEE ? 
B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
5407 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
5408
5409 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
5410
5411 if (UseIEEE)
5412 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
5413 else
5414 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
5415 MI.eraseFromParent();
5416 return true;
5417 }
5418
5419 // TODO: Fix pointer type handling
5420 bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
5421 MachineInstr &MI,
5422 Intrinsic::ID IID) const {
5423
5424 MachineIRBuilder &B = Helper.MIRBuilder;
5425 MachineRegisterInfo &MRI = *B.getMRI();
5426
5427 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
5428 IID == Intrinsic::amdgcn_permlanex16;
5429
5430 auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1,
5431 Register Src2, LLT VT) -> Register {
5432 auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0);
5433 switch (IID) {
5434 case Intrinsic::amdgcn_readfirstlane:
5435 case Intrinsic::amdgcn_permlane64:
5436 return LaneOp.getReg(0);
5437 case Intrinsic::amdgcn_readlane:
5438 return LaneOp.addUse(Src1).getReg(0);
5439 case Intrinsic::amdgcn_writelane:
5440 return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
5441 case Intrinsic::amdgcn_permlane16:
5442 case Intrinsic::amdgcn_permlanex16: {
5443 Register Src3 = MI.getOperand(5).getReg();
5444 int64_t Src4 = MI.getOperand(6).getImm();
5445 int64_t Src5 = MI.getOperand(7).getImm();
5446 return LaneOp.addUse(Src1)
5447 .addUse(Src2)
5448 .addUse(Src3)
5449 .addImm(Src4)
5450 .addImm(Src5)
5451 .getReg(0);
5452 }
5453 default:
5454 llvm_unreachable("unhandled lane op");
5455 }
5456 };
5457
5458 Register DstReg = MI.getOperand(0).getReg();
5459 Register Src0 = MI.getOperand(2).getReg();
5460 Register Src1, Src2;
5461 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
5462 IsPermLane16) {
5463 Src1 = MI.getOperand(3).getReg();
5464 if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
5465 Src2 = MI.getOperand(4).getReg();
5466 }
5467 }
5468
5469 LLT Ty = MRI.getType(DstReg);
5470 unsigned Size = Ty.getSizeInBits();
5471
5472 if (Size == 32) {
5473 // Already legal
5474 return true;
5475 }
5476
5477 if (Size < 32) {
5478 Src0 = B.buildAnyExt(S32, Src0).getReg(0);
5479
5480 if (IsPermLane16)
5481 Src1 = B.buildAnyExt(LLT::scalar(32), Src1).getReg(0);
5482
5483 if (IID == Intrinsic::amdgcn_writelane)
5484 Src2 = B.buildAnyExt(LLT::scalar(32), Src2).getReg(0);
5485
5486 Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
5487 B.buildTrunc(DstReg, LaneOpDst);
5488 MI.eraseFromParent();
5489 return true;
5490 }
5491
5492 if (Size % 32 != 0)
5493 return false;
5494
5495 LLT PartialResTy = S32;
5496 if (Ty.isVector()) {
5497 LLT EltTy = Ty.getElementType();
5498 switch (EltTy.getSizeInBits()) {
5499 case 16:
5500 PartialResTy = Ty.changeElementCount(ElementCount::getFixed(2));
5501 break;
5502 case 32:
5503 PartialResTy = EltTy;
5504 break;
5505 default:
5506 // Handle all other cases via S32 pieces;
5507 break;
5508 }
5509 }
5510
5511 SmallVector<Register, 2> PartialRes;
5512 unsigned NumParts = Size / 32;
5513 MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0);
5514 MachineInstrBuilder Src1Parts, Src2Parts;
5515
5516 if (IsPermLane16)
5517 Src1Parts = B.buildUnmerge(PartialResTy, Src1);
5518
5519 if (IID == Intrinsic::amdgcn_writelane)
5520 Src2Parts = B.buildUnmerge(PartialResTy, Src2);
5521
5522 for (unsigned i = 0; i < NumParts; ++i) {
5523 Src0 = Src0Parts.getReg(i);
5524
5525 if
(IsPermLane16) 5526 Src1 = Src1Parts.getReg(i); 5527 5528 if (IID == Intrinsic::amdgcn_writelane) 5529 Src2 = Src2Parts.getReg(i); 5530 5531 PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy)); 5532 } 5533 5534 B.buildMergeLikeInstr(DstReg, PartialRes); 5535 MI.eraseFromParent(); 5536 return true; 5537 } 5538 5539 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 5540 MachineRegisterInfo &MRI, 5541 MachineIRBuilder &B) const { 5542 uint64_t Offset = 5543 ST.getTargetLowering()->getImplicitParameterOffset( 5544 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 5545 LLT DstTy = MRI.getType(DstReg); 5546 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 5547 5548 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 5549 if (!loadInputValue(KernargPtrReg, B, 5550 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 5551 return false; 5552 5553 // FIXME: This should be nuw 5554 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 5555 return true; 5556 } 5557 5558 /// To create a buffer resource from a 64-bit pointer, mask off the upper 32 5559 /// bits of the pointer and replace them with the stride argument, then 5560 /// merge_values everything together. In the common case of a raw buffer (the 5561 /// stride component is 0), we can just AND off the upper half. 5562 bool AMDGPULegalizerInfo::legalizePointerAsRsrcIntrin( 5563 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 5564 Register Result = MI.getOperand(0).getReg(); 5565 Register Pointer = MI.getOperand(2).getReg(); 5566 Register Stride = MI.getOperand(3).getReg(); 5567 Register NumRecords = MI.getOperand(4).getReg(); 5568 Register Flags = MI.getOperand(5).getReg(); 5569 5570 LLT S32 = LLT::scalar(32); 5571 5572 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 5573 auto Unmerge = B.buildUnmerge(S32, Pointer); 5574 Register LowHalf = Unmerge.getReg(0); 5575 Register HighHalf = Unmerge.getReg(1); 5576 5577 auto AndMask = B.buildConstant(S32, 0x0000ffff); 5578 auto Masked = B.buildAnd(S32, HighHalf, AndMask); 5579 5580 MachineInstrBuilder NewHighHalf = Masked; 5581 std::optional<ValueAndVReg> StrideConst = 5582 getIConstantVRegValWithLookThrough(Stride, MRI); 5583 if (!StrideConst || !StrideConst->Value.isZero()) { 5584 MachineInstrBuilder ShiftedStride; 5585 if (StrideConst) { 5586 uint32_t StrideVal = StrideConst->Value.getZExtValue(); 5587 uint32_t ShiftedStrideVal = StrideVal << 16; 5588 ShiftedStride = B.buildConstant(S32, ShiftedStrideVal); 5589 } else { 5590 auto ExtStride = B.buildAnyExt(S32, Stride); 5591 auto ShiftConst = B.buildConstant(S32, 16); 5592 ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst); 5593 } 5594 NewHighHalf = B.buildOr(S32, Masked, ShiftedStride); 5595 } 5596 Register NewHighHalfReg = NewHighHalf.getReg(0); 5597 B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags}); 5598 MI.eraseFromParent(); 5599 return true; 5600 } 5601 5602 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 5603 MachineRegisterInfo &MRI, 5604 MachineIRBuilder &B) const { 5605 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 5606 if (!MFI->isEntryFunction()) { 5607 return legalizePreloadedArgIntrin(MI, MRI, B, 5608 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 5609 } 5610 5611 Register DstReg = MI.getOperand(0).getReg(); 5612 if (!getImplicitArgPtr(DstReg, MRI, B)) 5613 return false; 5614 5615 MI.eraseFromParent(); 5616 return true; 5617 } 5618 5619 bool AMDGPULegalizerInfo::getLDSKernelId(Register 
DstReg,
5620 MachineRegisterInfo &MRI,
5621 MachineIRBuilder &B) const {
5622 Function &F = B.getMF().getFunction();
5623 std::optional<uint32_t> KnownSize =
5624 AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
5625 if (KnownSize.has_value())
5626 B.buildConstant(DstReg, *KnownSize);
5627 return KnownSize.has_value();
5628 }
5629
5630 bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI,
5631 MachineRegisterInfo &MRI,
5632 MachineIRBuilder &B) const {
5633
5634 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5635 if (!MFI->isEntryFunction()) {
5636 return legalizePreloadedArgIntrin(MI, MRI, B,
5637 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
5638 }
5639
5640 Register DstReg = MI.getOperand(0).getReg();
5641 if (!getLDSKernelId(DstReg, MRI, B))
5642 return false;
5643
5644 MI.eraseFromParent();
5645 return true;
5646 }
5647
5648 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
5649 MachineRegisterInfo &MRI,
5650 MachineIRBuilder &B,
5651 unsigned AddrSpace) const {
5652 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
5653 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
5654 Register Hi32 = Unmerge.getReg(1);
5655
5656 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
5657 MI.eraseFromParent();
5658 return true;
5659 }
5660
5661 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
5662 // offset (the offset that is included in bounds checking and swizzling, to be
5663 // split between the instruction's voffset and immoffset fields) and soffset
5664 // (the offset that is excluded from bounds checking and swizzling, to go in
5665 // the instruction's soffset field). This function takes the first kind of
5666 // offset and figures out how to split it between voffset and immoffset.
5667 std::pair<Register, unsigned>
5668 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
5669 Register OrigOffset) const {
5670 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST);
5671 Register BaseReg;
5672 unsigned ImmOffset;
5673 const LLT S32 = LLT::scalar(32);
5674 MachineRegisterInfo &MRI = *B.getMRI();
5675
5676 std::tie(BaseReg, ImmOffset) =
5677 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
5678
5679 // If BaseReg is a pointer, convert it to int.
5680 if (MRI.getType(BaseReg).isPointer())
5681 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
5682
5683 // If the immediate value is too big for the immoffset field, put only bits
5684 // that would normally fit in the immoffset field. The remaining value that
5685 // is copied/added for the voffset field is a large power of 2, and it
5686 // stands more chance of being CSEd with the copy/add for another similar
5687 // load/store.
5688 // However, do not do that rounding down if that is a negative
5689 // number, as it appears to be illegal to have a negative offset in the
5690 // vgpr, even if adding the immediate offset makes it positive.
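// Illustrative example (assuming a subtarget where the maximum MUBUF
// immediate is 4095, i.e. MaxImm == 0xfff): an incoming constant offset of
// 4100 splits into Overflow = 4096, which is added to the voffset register
// below, and ImmOffset = 4, which stays in the instruction's immediate field.
// The real limit comes from SIInstrInfo::getMaxMUBUFImmOffset and varies by
// subtarget.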
5691 unsigned Overflow = ImmOffset & ~MaxImm; 5692 ImmOffset -= Overflow; 5693 if ((int32_t)Overflow < 0) { 5694 Overflow += ImmOffset; 5695 ImmOffset = 0; 5696 } 5697 5698 if (Overflow != 0) { 5699 if (!BaseReg) { 5700 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 5701 } else { 5702 auto OverflowVal = B.buildConstant(S32, Overflow); 5703 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 5704 } 5705 } 5706 5707 if (!BaseReg) 5708 BaseReg = B.buildConstant(S32, 0).getReg(0); 5709 5710 return std::pair(BaseReg, ImmOffset); 5711 } 5712 5713 /// Handle register layout difference for f16 images for some subtargets. 5714 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 5715 MachineRegisterInfo &MRI, 5716 Register Reg, 5717 bool ImageStore) const { 5718 const LLT S16 = LLT::scalar(16); 5719 const LLT S32 = LLT::scalar(32); 5720 LLT StoreVT = MRI.getType(Reg); 5721 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 5722 5723 if (ST.hasUnpackedD16VMem()) { 5724 auto Unmerge = B.buildUnmerge(S16, Reg); 5725 5726 SmallVector<Register, 4> WideRegs; 5727 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 5728 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 5729 5730 int NumElts = StoreVT.getNumElements(); 5731 5732 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs) 5733 .getReg(0); 5734 } 5735 5736 if (ImageStore && ST.hasImageStoreD16Bug()) { 5737 if (StoreVT.getNumElements() == 2) { 5738 SmallVector<Register, 4> PackedRegs; 5739 Reg = B.buildBitcast(S32, Reg).getReg(0); 5740 PackedRegs.push_back(Reg); 5741 PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); 5742 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs) 5743 .getReg(0); 5744 } 5745 5746 if (StoreVT.getNumElements() == 3) { 5747 SmallVector<Register, 4> PackedRegs; 5748 auto Unmerge = B.buildUnmerge(S16, Reg); 5749 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 5750 PackedRegs.push_back(Unmerge.getReg(I)); 5751 PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); 5752 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0); 5753 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0); 5754 } 5755 5756 if (StoreVT.getNumElements() == 4) { 5757 SmallVector<Register, 4> PackedRegs; 5758 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0); 5759 auto Unmerge = B.buildUnmerge(S32, Reg); 5760 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 5761 PackedRegs.push_back(Unmerge.getReg(I)); 5762 PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); 5763 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs) 5764 .getReg(0); 5765 } 5766 5767 llvm_unreachable("invalid data type"); 5768 } 5769 5770 if (StoreVT == LLT::fixed_vector(3, S16)) { 5771 Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg) 5772 .getReg(0); 5773 } 5774 return Reg; 5775 } 5776 5777 Register AMDGPULegalizerInfo::fixStoreSourceType( 5778 MachineIRBuilder &B, Register VData, bool IsFormat) const { 5779 MachineRegisterInfo *MRI = B.getMRI(); 5780 LLT Ty = MRI->getType(VData); 5781 5782 const LLT S16 = LLT::scalar(16); 5783 5784 // Fixup buffer resources themselves needing to be v4i128. 5785 if (hasBufferRsrcWorkaround(Ty)) 5786 return castBufferRsrcToV4I32(VData, B); 5787 5788 // Fixup illegal register types for i8 stores. 
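// Note that only the register type is widened here; the store width is still
// taken from the memory operand, so an s8 or s16 source is later emitted as a
// BYTE or SHORT buffer store in legalizeBufferStore.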
5789 if (Ty == LLT::scalar(8) || Ty == S16) { 5790 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 5791 return AnyExt; 5792 } 5793 5794 if (Ty.isVector()) { 5795 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 5796 if (IsFormat) 5797 return handleD16VData(B, *MRI, VData); 5798 } 5799 } 5800 5801 return VData; 5802 } 5803 5804 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 5805 MachineRegisterInfo &MRI, 5806 MachineIRBuilder &B, 5807 bool IsTyped, 5808 bool IsFormat) const { 5809 Register VData = MI.getOperand(1).getReg(); 5810 LLT Ty = MRI.getType(VData); 5811 LLT EltTy = Ty.getScalarType(); 5812 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 5813 const LLT S32 = LLT::scalar(32); 5814 5815 VData = fixStoreSourceType(B, VData, IsFormat); 5816 castBufferRsrcArgToV4I32(MI, B, 2); 5817 Register RSrc = MI.getOperand(2).getReg(); 5818 5819 MachineMemOperand *MMO = *MI.memoperands_begin(); 5820 const int MemSize = MMO->getSize().getValue(); 5821 5822 unsigned ImmOffset; 5823 5824 // The typed intrinsics add an immediate after the registers. 5825 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 5826 5827 // The struct intrinsic variants add one additional operand over raw. 5828 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 5829 Register VIndex; 5830 int OpOffset = 0; 5831 if (HasVIndex) { 5832 VIndex = MI.getOperand(3).getReg(); 5833 OpOffset = 1; 5834 } else { 5835 VIndex = B.buildConstant(S32, 0).getReg(0); 5836 } 5837 5838 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 5839 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 5840 5841 unsigned Format = 0; 5842 if (IsTyped) { 5843 Format = MI.getOperand(5 + OpOffset).getImm(); 5844 ++OpOffset; 5845 } 5846 5847 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 5848 5849 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 5850 5851 unsigned Opc; 5852 if (IsTyped) { 5853 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 5854 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 5855 } else if (IsFormat) { 5856 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 5857 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 5858 } else { 5859 switch (MemSize) { 5860 case 1: 5861 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 5862 break; 5863 case 2: 5864 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 5865 break; 5866 default: 5867 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 5868 break; 5869 } 5870 } 5871 5872 auto MIB = B.buildInstr(Opc) 5873 .addUse(VData) // vdata 5874 .addUse(RSrc) // rsrc 5875 .addUse(VIndex) // vindex 5876 .addUse(VOffset) // voffset 5877 .addUse(SOffset) // soffset 5878 .addImm(ImmOffset); // offset(imm) 5879 5880 if (IsTyped) 5881 MIB.addImm(Format); 5882 5883 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 5884 .addImm(HasVIndex ? 
-1 : 0) // idxen(imm) 5885 .addMemOperand(MMO); 5886 5887 MI.eraseFromParent(); 5888 return true; 5889 } 5890 5891 static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc, 5892 Register VIndex, Register VOffset, Register SOffset, 5893 unsigned ImmOffset, unsigned Format, 5894 unsigned AuxiliaryData, MachineMemOperand *MMO, 5895 bool IsTyped, bool HasVIndex, MachineIRBuilder &B) { 5896 auto MIB = B.buildInstr(Opc) 5897 .addDef(LoadDstReg) // vdata 5898 .addUse(RSrc) // rsrc 5899 .addUse(VIndex) // vindex 5900 .addUse(VOffset) // voffset 5901 .addUse(SOffset) // soffset 5902 .addImm(ImmOffset); // offset(imm) 5903 5904 if (IsTyped) 5905 MIB.addImm(Format); 5906 5907 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 5908 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 5909 .addMemOperand(MMO); 5910 } 5911 5912 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 5913 MachineRegisterInfo &MRI, 5914 MachineIRBuilder &B, 5915 bool IsFormat, 5916 bool IsTyped) const { 5917 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 5918 MachineMemOperand *MMO = *MI.memoperands_begin(); 5919 const LLT MemTy = MMO->getMemoryType(); 5920 const LLT S32 = LLT::scalar(32); 5921 5922 Register Dst = MI.getOperand(0).getReg(); 5923 5924 Register StatusDst; 5925 int OpOffset = 0; 5926 assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2); 5927 bool IsTFE = MI.getNumExplicitDefs() == 2; 5928 if (IsTFE) { 5929 StatusDst = MI.getOperand(1).getReg(); 5930 ++OpOffset; 5931 } 5932 5933 castBufferRsrcArgToV4I32(MI, B, 2 + OpOffset); 5934 Register RSrc = MI.getOperand(2 + OpOffset).getReg(); 5935 5936 // The typed intrinsics add an immediate after the registers. 5937 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 5938 5939 // The struct intrinsic variants add one additional operand over raw. 5940 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset; 5941 Register VIndex; 5942 if (HasVIndex) { 5943 VIndex = MI.getOperand(3 + OpOffset).getReg(); 5944 ++OpOffset; 5945 } else { 5946 VIndex = B.buildConstant(S32, 0).getReg(0); 5947 } 5948 5949 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 5950 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 5951 5952 unsigned Format = 0; 5953 if (IsTyped) { 5954 Format = MI.getOperand(5 + OpOffset).getImm(); 5955 ++OpOffset; 5956 } 5957 5958 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 5959 unsigned ImmOffset; 5960 5961 LLT Ty = MRI.getType(Dst); 5962 // Make addrspace 8 pointers loads into 4xs32 loads here, so the rest of the 5963 // logic doesn't have to handle that case. 5964 if (hasBufferRsrcWorkaround(Ty)) { 5965 Ty = castBufferRsrcFromV4I32(MI, B, MRI, 0); 5966 Dst = MI.getOperand(0).getReg(); 5967 } 5968 LLT EltTy = Ty.getScalarType(); 5969 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 5970 const bool Unpacked = ST.hasUnpackedD16VMem(); 5971 5972 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 5973 5974 unsigned Opc; 5975 5976 // TODO: Support TFE for typed and narrow loads. 5977 if (IsTyped) { 5978 if (IsTFE) 5979 return false; 5980 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 5981 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 5982 } else if (IsFormat) { 5983 if (IsD16) { 5984 if (IsTFE) 5985 return false; 5986 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16; 5987 } else { 5988 Opc = IsTFE ? 
AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE 5989 : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 5990 } 5991 } else { 5992 switch (MemTy.getSizeInBits()) { 5993 case 8: 5994 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE 5995 : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 5996 break; 5997 case 16: 5998 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE 5999 : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 6000 break; 6001 default: 6002 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE 6003 : AMDGPU::G_AMDGPU_BUFFER_LOAD; 6004 break; 6005 } 6006 } 6007 6008 if (IsTFE) { 6009 unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32); 6010 unsigned NumLoadDWords = NumValueDWords + 1; 6011 LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32); 6012 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy); 6013 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset, 6014 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B); 6015 if (MemTy.getSizeInBits() < 32) { 6016 Register ExtDst = B.getMRI()->createGenericVirtualRegister(S32); 6017 B.buildUnmerge({ExtDst, StatusDst}, LoadDstReg); 6018 B.buildTrunc(Dst, ExtDst); 6019 } else if (NumValueDWords == 1) { 6020 B.buildUnmerge({Dst, StatusDst}, LoadDstReg); 6021 } else { 6022 SmallVector<Register, 5> LoadElts; 6023 for (unsigned I = 0; I != NumValueDWords; ++I) 6024 LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32)); 6025 LoadElts.push_back(StatusDst); 6026 B.buildUnmerge(LoadElts, LoadDstReg); 6027 LoadElts.truncate(NumValueDWords); 6028 B.buildMergeLikeInstr(Dst, LoadElts); 6029 } 6030 } else if ((!IsD16 && MemTy.getSizeInBits() < 32) || 6031 (IsD16 && !Ty.isVector())) { 6032 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 6033 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset, 6034 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B); 6035 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 6036 B.buildTrunc(Dst, LoadDstReg); 6037 } else if (Unpacked && IsD16 && Ty.isVector()) { 6038 LLT UnpackedTy = Ty.changeElementSize(32); 6039 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 6040 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset, 6041 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B); 6042 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 6043 // FIXME: G_TRUNC should work, but legalization currently fails 6044 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 6045 SmallVector<Register, 4> Repack; 6046 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 6047 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 6048 B.buildMergeLikeInstr(Dst, Repack); 6049 } else { 6050 buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format, 6051 AuxiliaryData, MMO, IsTyped, HasVIndex, B); 6052 } 6053 6054 MI.eraseFromParent(); 6055 return true; 6056 } 6057 6058 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 6059 switch (IntrID) { 6060 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 6061 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap: 6062 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 6063 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap: 6064 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 6065 case Intrinsic::amdgcn_raw_buffer_atomic_add: 6066 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add: 6067 case Intrinsic::amdgcn_struct_buffer_atomic_add: 6068 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add: 6069 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 6070 case 
Intrinsic::amdgcn_raw_buffer_atomic_sub: 6071 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub: 6072 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 6073 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub: 6074 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 6075 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 6076 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin: 6077 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 6078 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin: 6079 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 6080 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 6081 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin: 6082 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 6083 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin: 6084 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 6085 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 6086 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax: 6087 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 6088 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax: 6089 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 6090 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 6091 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax: 6092 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 6093 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax: 6094 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 6095 case Intrinsic::amdgcn_raw_buffer_atomic_and: 6096 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and: 6097 case Intrinsic::amdgcn_struct_buffer_atomic_and: 6098 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and: 6099 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 6100 case Intrinsic::amdgcn_raw_buffer_atomic_or: 6101 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or: 6102 case Intrinsic::amdgcn_struct_buffer_atomic_or: 6103 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or: 6104 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 6105 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 6106 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor: 6107 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 6108 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor: 6109 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 6110 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 6111 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc: 6112 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 6113 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc: 6114 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 6115 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 6116 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec: 6117 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 6118 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec: 6119 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 6120 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 6121 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: 6122 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 6123 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: 6124 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 6125 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 6126 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd: 6127 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 6128 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd: 6129 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 6130 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 6131 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin: 6132 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 6133 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin: 6134 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN; 6135 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 6136 case 
Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
6137 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
6138 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
6139 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
6140 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
6141 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
6142 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
6143 default:
6144 llvm_unreachable("unhandled atomic opcode");
6145 }
6146 }
6147
6148 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
6149 MachineIRBuilder &B,
6150 Intrinsic::ID IID) const {
6151 const bool IsCmpSwap =
6152 IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
6153 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
6154 IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
6155 IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
6156
6157 Register Dst = MI.getOperand(0).getReg();
6158 // Since we don't have 128-bit atomics, we don't need to handle the case of
6159 // p8 arguments to the atomic itself
6160 Register VData = MI.getOperand(2).getReg();
6161
6162 Register CmpVal;
6163 int OpOffset = 0;
6164
6165 if (IsCmpSwap) {
6166 CmpVal = MI.getOperand(3).getReg();
6167 ++OpOffset;
6168 }
6169
6170 castBufferRsrcArgToV4I32(MI, B, 3 + OpOffset);
6171 Register RSrc = MI.getOperand(3 + OpOffset).getReg();
6172 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
6173
6174 // The struct intrinsic variants add one additional operand over raw.
6175 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
6176 Register VIndex;
6177 if (HasVIndex) {
6178 VIndex = MI.getOperand(4 + OpOffset).getReg();
6179 ++OpOffset;
6180 } else {
6181 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
6182 }
6183
6184 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
6185 Register SOffset = MI.getOperand(5 + OpOffset).getReg();
6186 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
6187
6188 MachineMemOperand *MMO = *MI.memoperands_begin();
6189
6190 unsigned ImmOffset;
6191 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
6192
6193 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
6194 .addDef(Dst)
6195 .addUse(VData); // vdata
6196
6197 if (IsCmpSwap)
6198 MIB.addReg(CmpVal);
6199
6200 MIB.addUse(RSrc) // rsrc
6201 .addUse(VIndex) // vindex
6202 .addUse(VOffset) // voffset
6203 .addUse(SOffset) // soffset
6204 .addImm(ImmOffset) // offset(imm)
6205 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6206 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
6207 .addMemOperand(MMO);
6208
6209 MI.eraseFromParent();
6210 return true;
6211 }
6212
6213 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
6214 /// vector with s16 typed elements.
6215 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
6216 SmallVectorImpl<Register> &PackedAddrs,
6217 unsigned ArgOffset,
6218 const AMDGPU::ImageDimIntrinsicInfo *Intr,
6219 bool IsA16, bool IsG16) {
6220 const LLT S16 = LLT::scalar(16);
6221 const LLT V2S16 = LLT::fixed_vector(2, 16);
6222 auto EndIdx = Intr->VAddrEnd;
6223
6224 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
6225 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
6226 if (!SrcOp.isReg())
6227 continue; // _L to _LZ may have eliminated this.
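// Roughly, the rest of the loop body packs this address operand into
// dword-sized pieces: operands that stay 32-bit are simply bitcast to
// <2 x s16>, a 16-bit bias operand under A16 is widened with an undef high
// half, and 16-bit gradients/coordinates are paired two at a time, padding
// the last odd element with undef.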
6228
6229 Register AddrReg = SrcOp.getReg();
6230
6231 if ((I < Intr->GradientStart) ||
6232 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
6233 (I >= Intr->CoordStart && !IsA16)) {
6234 if ((I < Intr->GradientStart) && IsA16 &&
6235 (B.getMRI()->getType(AddrReg) == S16)) {
6236 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
6237 // Special handling of bias when A16 is on. Bias is of type half but
6238 // occupies full 32-bit.
6239 PackedAddrs.push_back(
6240 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
6241 .getReg(0));
6242 } else {
6243 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
6244 "Bias needs to be converted to 16 bit in A16 mode");
6245 // Handle any gradient or coordinate operands that should not be packed
6246 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
6247 PackedAddrs.push_back(AddrReg);
6248 }
6249 } else {
6250 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
6251 // derivatives dx/dh and dx/dv are packed with undef.
6252 if (((I + 1) >= EndIdx) ||
6253 ((Intr->NumGradients / 2) % 2 == 1 &&
6254 (I == static_cast<unsigned>(Intr->GradientStart +
6255 (Intr->NumGradients / 2) - 1) ||
6256 I == static_cast<unsigned>(Intr->GradientStart +
6257 Intr->NumGradients - 1))) ||
6258 // Check for _L to _LZ optimization
6259 !MI.getOperand(ArgOffset + I + 1).isReg()) {
6260 PackedAddrs.push_back(
6261 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
6262 .getReg(0));
6263 } else {
6264 PackedAddrs.push_back(
6265 B.buildBuildVector(
6266 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
6267 .getReg(0));
6268 ++I;
6269 }
6270 }
6271 }
6272 }
6273
6274 /// Convert from separate vaddr components to a single vector address register,
6275 /// and replace the remaining operands with $noreg.
6276 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
6277 int DimIdx, int NumVAddrs) {
6278 const LLT S32 = LLT::scalar(32);
6279 (void)S32;
6280 SmallVector<Register, 8> AddrRegs;
6281 for (int I = 0; I != NumVAddrs; ++I) {
6282 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
6283 if (SrcOp.isReg()) {
6284 AddrRegs.push_back(SrcOp.getReg());
6285 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
6286 }
6287 }
6288
6289 int NumAddrRegs = AddrRegs.size();
6290 if (NumAddrRegs != 1) {
6291 auto VAddr =
6292 B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
6293 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
6294 }
6295
6296 for (int I = 1; I != NumVAddrs; ++I) {
6297 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
6298 if (SrcOp.isReg())
6299 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
6300 }
6301 }
6302
6303 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
6304 ///
6305 /// Depending on the subtarget, load/store with 16-bit element data need to be
6306 /// rewritten to use the low half of 32-bit registers, or directly use a packed
6307 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
6308 /// registers.
6309 ///
6310 /// We don't want to directly select image instructions just yet, but also want
6311 /// to expose all register repacking to the legalizer/combiners. We also don't
6312 /// want a selected instruction entering RegBankSelect. In order to avoid
6313 /// defining a multitude of intermediate image instructions, directly hack on
6314 /// the intrinsic's arguments.
In cases like a16 addresses, this requires 6315 /// padding now unnecessary arguments with $noreg. 6316 bool AMDGPULegalizerInfo::legalizeImageIntrinsic( 6317 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, 6318 const AMDGPU::ImageDimIntrinsicInfo *Intr) const { 6319 6320 const MachineFunction &MF = *MI.getMF(); 6321 const unsigned NumDefs = MI.getNumExplicitDefs(); 6322 const unsigned ArgOffset = NumDefs + 1; 6323 bool IsTFE = NumDefs == 2; 6324 // We are only processing the operands of d16 image operations on subtargets 6325 // that use the unpacked register layout, or need to repack the TFE result. 6326 6327 // TODO: Do we need to guard against already legalized intrinsics? 6328 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 6329 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); 6330 6331 MachineRegisterInfo *MRI = B.getMRI(); 6332 const LLT S32 = LLT::scalar(32); 6333 const LLT S16 = LLT::scalar(16); 6334 const LLT V2S16 = LLT::fixed_vector(2, 16); 6335 6336 unsigned DMask = 0; 6337 Register VData; 6338 LLT Ty; 6339 6340 if (!BaseOpcode->NoReturn || BaseOpcode->Store) { 6341 VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg(); 6342 Ty = MRI->getType(VData); 6343 } 6344 6345 const bool IsAtomicPacked16Bit = 6346 (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 || 6347 BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16); 6348 6349 // Check for 16 bit addresses and pack if true. 6350 LLT GradTy = 6351 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); 6352 LLT AddrTy = 6353 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); 6354 const bool IsG16 = 6355 ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16; 6356 const bool IsA16 = AddrTy == S16; 6357 const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16; 6358 6359 int DMaskLanes = 0; 6360 if (!BaseOpcode->Atomic) { 6361 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); 6362 if (BaseOpcode->Gather4) { 6363 DMaskLanes = 4; 6364 } else if (DMask != 0) { 6365 DMaskLanes = llvm::popcount(DMask); 6366 } else if (!IsTFE && !BaseOpcode->Store) { 6367 // If dmask is 0, this is a no-op load. This can be eliminated. 6368 B.buildUndef(MI.getOperand(0)); 6369 MI.eraseFromParent(); 6370 return true; 6371 } 6372 } 6373 6374 Observer.changingInstr(MI); 6375 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 6376 6377 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16 6378 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE; 6379 const unsigned LoadOpcode = IsD16 ? 
AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 6380 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 6381 unsigned NewOpcode = LoadOpcode; 6382 if (BaseOpcode->Store) 6383 NewOpcode = StoreOpcode; 6384 else if (BaseOpcode->NoReturn) 6385 NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET; 6386 6387 // Track that we legalized this 6388 MI.setDesc(B.getTII().get(NewOpcode)); 6389 6390 // Expecting to get an error flag since TFC is on - and dmask is 0 Force 6391 // dmask to be at least 1 otherwise the instruction will fail 6392 if (IsTFE && DMask == 0) { 6393 DMask = 0x1; 6394 DMaskLanes = 1; 6395 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask); 6396 } 6397 6398 if (BaseOpcode->Atomic) { 6399 Register VData0 = MI.getOperand(2).getReg(); 6400 LLT Ty = MRI->getType(VData0); 6401 6402 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 6403 if (Ty.isVector() && !IsAtomicPacked16Bit) 6404 return false; 6405 6406 if (BaseOpcode->AtomicX2) { 6407 Register VData1 = MI.getOperand(3).getReg(); 6408 // The two values are packed in one register. 6409 LLT PackedTy = LLT::fixed_vector(2, Ty); 6410 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 6411 MI.getOperand(2).setReg(Concat.getReg(0)); 6412 MI.getOperand(3).setReg(AMDGPU::NoRegister); 6413 } 6414 } 6415 6416 unsigned CorrectedNumVAddrs = Intr->NumVAddrs; 6417 6418 // Rewrite the addressing register layout before doing anything else. 6419 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) { 6420 // 16 bit gradients are supported, but are tied to the A16 control 6421 // so both gradients and addresses must be 16 bit 6422 return false; 6423 } 6424 6425 if (IsA16 && !ST.hasA16()) { 6426 // A16 not supported 6427 return false; 6428 } 6429 6430 const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->Sampler); 6431 const unsigned HasPartialNSA = ST.hasPartialNSAEncoding(); 6432 6433 if (IsA16 || IsG16) { 6434 // Even if NumVAddrs == 1 we should pack it into a 32-bit value, because the 6435 // instructions expect VGPR_32 6436 SmallVector<Register, 4> PackedRegs; 6437 6438 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, IsG16); 6439 6440 // See also below in the non-a16 branch 6441 const bool UseNSA = ST.hasNSAEncoding() && 6442 PackedRegs.size() >= ST.getNSAThreshold(MF) && 6443 (PackedRegs.size() <= NSAMaxSize || HasPartialNSA); 6444 const bool UsePartialNSA = 6445 UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize; 6446 6447 if (UsePartialNSA) { 6448 // Pack registers that would go over NSAMaxSize into last VAddr register 6449 LLT PackedAddrTy = 6450 LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16); 6451 auto Concat = B.buildConcatVectors( 6452 PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1)); 6453 PackedRegs[NSAMaxSize - 1] = Concat.getReg(0); 6454 PackedRegs.resize(NSAMaxSize); 6455 } else if (!UseNSA && PackedRegs.size() > 1) { 6456 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16); 6457 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 6458 PackedRegs[0] = Concat.getReg(0); 6459 PackedRegs.resize(1); 6460 } 6461 6462 const unsigned NumPacked = PackedRegs.size(); 6463 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { 6464 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 6465 if (!SrcOp.isReg()) { 6466 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 6467 continue; 6468 } 6469 6470 assert(SrcOp.getReg() != AMDGPU::NoRegister); 6471 6472 if (I - Intr->VAddrStart < NumPacked) 6473 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); 6474 else 
    const unsigned NumPacked = PackedRegs.size();
    for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
      MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
      if (!SrcOp.isReg()) {
        assert(SrcOp.isImm() && SrcOp.getImm() == 0);
        continue;
      }

      assert(SrcOp.getReg() != AMDGPU::NoRegister);

      if (I - Intr->VAddrStart < NumPacked)
        SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
      else
        SrcOp.setReg(AMDGPU::NoRegister);
    }
  } else {
    // If the register allocator cannot place the address registers
    // contiguously without introducing moves, then using the non-sequential
    // address encoding is always preferable, since it saves VALU instructions
    // and is usually a wash in terms of code size or even better.
    //
    // However, we currently have no way of hinting to the register allocator
    // that MIMG addresses should be placed contiguously when it is possible
    // to do so, so force non-NSA for the common 2-address case as a
    // heuristic.
    //
    // SIShrinkInstructions will convert NSA encodings to non-NSA after
    // register allocation when possible.
    //
    // Partial NSA is allowed on GFX11+ where the final register is a
    // contiguous set of the remaining addresses.
    const bool UseNSA = ST.hasNSAEncoding() &&
                        CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
                        (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
    const bool UsePartialNSA =
        UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;

    if (UsePartialNSA) {
      convertImageAddrToPacked(B, MI,
                               ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
                               Intr->NumVAddrs - NSAMaxSize + 1);
    } else if (!UseNSA && Intr->NumVAddrs > 1) {
      convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
                               Intr->NumVAddrs);
    }
  }

  int Flags = 0;
  if (IsA16)
    Flags |= 1;
  if (IsG16)
    Flags |= 2;
  MI.addOperand(MachineOperand::CreateImm(Flags));

  if (BaseOpcode->NoReturn) { // No TFE for stores?
    // TODO: Handle dmask trim
    if (!Ty.isVector() || !IsD16)
      return true;

    Register RepackedReg = handleD16VData(B, *MRI, VData, true);
    if (RepackedReg != VData) {
      MI.getOperand(1).setReg(RepackedReg);
    }

    return true;
  }

  Register DstReg = MI.getOperand(0).getReg();
  const LLT EltTy = Ty.getScalarType();
  const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;

  // Confirm that the return type is large enough for the dmask specified.
  if (NumElts < DMaskLanes)
    return false;

  if (NumElts > 4 || DMaskLanes > 4)
    return false;

  // Image atomic instructions use DMask to specify how many bits the
  // input/output data will have: 32 bits (s32, v2s16) or 64 bits (s64, v4s16).
  // DMaskLanes for image atomics has the default value '0'.
  // We must be sure that atomic variants (especially packed) will not be
  // truncated from v2s16 or v4s16 to s16 type.
  //
  // ChangeElementCount will be needed for image load where Ty is always
  // scalar.
  const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
  const LLT AdjustedTy =
      DMaskLanes == 0
          ? Ty
          : Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));

  // The raw dword aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
  LLT RoundedTy;

  // S32 vector to cover all data, plus TFE result element.
  LLT TFETy;

  // Register type to use for each loaded component. Will be S32 or V2S16.
  LLT RegTy;
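
  // Illustrative example (not exhaustive): a packed-d16 TFE load returning
  // <3 x s16> has AdjustedTy = <3 x s16> (48 bits), giving
  // RoundedTy = <4 x s16>, TFETy = <3 x s32>, and RegTy = S32 (it would be
  // V2S16 without a TFE result).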
  if (IsD16 && ST.hasUnpackedD16VMem()) {
    RoundedTy =
        LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
    TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
    RegTy = S32;
  } else {
    unsigned EltSize = EltTy.getSizeInBits();
    unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
    unsigned RoundedSize = 32 * RoundedElts;
    RoundedTy = LLT::scalarOrVector(
        ElementCount::getFixed(RoundedSize / EltSize), EltSize);
    TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
    RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
  }

  // The return type does not need adjustment.
  // TODO: Should we change s16 case to s32 or <2 x s16>?
  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
    return true;

  Register Dst1Reg;

  // Insert after the instruction.
  B.setInsertPt(*MI.getParent(), ++MI.getIterator());

  // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
  // s16> instead of s32, we would only need 1 bitcast instead of multiple.
  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;

  Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);

  MI.getOperand(0).setReg(NewResultReg);

  // In the IR, TFE is supposed to be used with a 2 element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.

  if (IsTFE) {
    Dst1Reg = MI.getOperand(1).getReg();
    if (MRI->getType(Dst1Reg) != S32)
      return false;

    // TODO: Make sure the TFE operand bit is set.
    MI.removeOperand(1);

    // Handle the easy case that requires no repack instructions.
    if (Ty == S32) {
      B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
      return true;
    }
  }

  // Now figure out how to copy the new result register back into the old
  // result.
  SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);

  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;

  if (ResultNumRegs == 1) {
    assert(!IsTFE);
    ResultRegs[0] = NewResultReg;
  } else {
    // We have to repack into a new vector of some kind.
    for (int I = 0; I != NumDataRegs; ++I)
      ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
    B.buildUnmerge(ResultRegs, NewResultReg);

    // Drop the final TFE element to get the data part. The TFE result is
    // directly written to the right place already.
    if (IsTFE)
      ResultRegs.resize(NumDataRegs);
  }

  // For an s16 scalar result, we form an s32 result with a truncate regardless
  // of packed vs. unpacked.
  if (IsD16 && !Ty.isVector()) {
    B.buildTrunc(DstReg, ResultRegs[0]);
    return true;
  }

  // Avoid a build/concat_vector of 1 entry.
  if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
    B.buildBitcast(DstReg, ResultRegs[0]);
    return true;
  }

  assert(Ty.isVector());

  if (IsD16) {
    // For packed D16 results with TFE enabled, all the data components are
    // S32. Cast back to the expected type.
    //
    // TODO: We don't really need to load s32 elements. We would only need one
    // cast for the TFE result if a multiple of v2s16 was used.
    if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
      for (Register &Reg : ResultRegs)
        Reg = B.buildBitcast(V2S16, Reg).getReg(0);
    } else if (ST.hasUnpackedD16VMem()) {
      for (Register &Reg : ResultRegs)
        Reg = B.buildTrunc(S16, Reg).getReg(0);
    }
  }

  auto padWithUndef = [&](LLT Ty, int NumElts) {
    if (NumElts == 0)
      return;
    Register Undef = B.buildUndef(Ty).getReg(0);
    for (int I = 0; I != NumElts; ++I)
      ResultRegs.push_back(Undef);
  };

  // Pad out any elements eliminated due to the dmask.
  LLT ResTy = MRI->getType(ResultRegs[0]);
  if (!ResTy.isVector()) {
    padWithUndef(ResTy, NumElts - ResultRegs.size());
    B.buildBuildVector(DstReg, ResultRegs);
    return true;
  }

  assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
  const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;

  // Deal with the one annoying legal case.
  const LLT V3S16 = LLT::fixed_vector(3, 16);
  if (Ty == V3S16) {
    if (IsTFE) {
      if (ResultRegs.size() == 1) {
        NewResultReg = ResultRegs[0];
      } else if (ResultRegs.size() == 2) {
        LLT V4S16 = LLT::fixed_vector(4, 16);
        NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
      } else {
        return false;
      }
    }

    if (MRI->getType(DstReg).getNumElements() <
        MRI->getType(NewResultReg).getNumElements()) {
      B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
    } else {
      B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
    }
    return true;
  }

  padWithUndef(ResTy, RegsToCover - ResultRegs.size());
  B.buildConcatVectors(DstReg, ResultRegs);
  return true;
}

bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper,
                                              MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  GISelChangeObserver &Observer = Helper.Observer;

  Register OrigDst = MI.getOperand(0).getReg();
  Register Dst;
  LLT Ty = B.getMRI()->getType(OrigDst);
  unsigned Size = Ty.getSizeInBits();
  MachineFunction &MF = B.getMF();
  unsigned Opc = 0;
  if (Size < 32 && ST.hasScalarSubwordLoads()) {
    assert(Size == 8 || Size == 16);
    Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
                    : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
    // The 8-bit and 16-bit scalar buffer load instructions have a 32-bit
    // destination register.
    Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32));
  } else {
    Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
    Dst = OrigDst;
  }

  Observer.changingInstr(MI);

  // Handle needing to s.buffer.load() a p8 value.
  if (hasBufferRsrcWorkaround(Ty)) {
    Ty = castBufferRsrcFromV4I32(MI, B, *B.getMRI(), 0);
    B.setInsertPt(B.getMBB(), MI);
  }
  if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
    Ty = getBitcastRegisterType(Ty);
    Helper.bitcastDst(MI, Ty, 0);
    B.setInsertPt(B.getMBB(), MI);
  }

  // FIXME: We don't really need this intermediate instruction. The intrinsic
  // should be fixed to have a memory operand. Since it's readnone, we're not
  // allowed to add one.
  MI.setDesc(B.getTII().get(Opc));
  MI.removeOperand(1); // Remove intrinsic ID

  // FIXME: When intrinsic definition is fixed, this should have an MMO
  // already.
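  // Until then, synthesize one here: s.buffer.load reads read-only,
  // dereferenceable memory, so the access is marked load + invariant, with
  // the size rounded up to whole bytes.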
  const unsigned MemSize = (Size + 7) / 8;
  const Align MemAlign = B.getDataLayout().getABITypeAlign(
      getTypeForLLT(Ty, MF.getFunction().getContext()));
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      MachinePointerInfo(),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      MemSize, MemAlign);
  MI.addMemOperand(MF, MMO);
  if (Dst != OrigDst) {
    MI.getOperand(0).setReg(Dst);
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    B.buildTrunc(OrigDst, Dst);
  }

  // If we don't have 96-bit result scalar loads, widening to 128-bit should
  // always be legal. We may need to restore this to a 96-bit result if it
  // turns out this needs to be converted to a vector load during
  // RegBankSelect.
  if (!isPowerOf2_32(Size) && (Size != 96 || !ST.hasScalarDwordx3Loads())) {
    if (Ty.isVector())
      Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
    else
      Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
  }

  Observer.changedInstr(MI);
  return true;
}

// TODO: Move to selection
bool AMDGPULegalizerInfo::legalizeTrap(MachineInstr &MI,
                                       MachineRegisterInfo &MRI,
                                       MachineIRBuilder &B) const {
  if (!ST.isTrapHandlerEnabled() ||
      ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
    return legalizeTrapEndpgm(MI, MRI, B);

  return ST.supportsGetDoorbellID() ?
    legalizeTrapHsa(MI, MRI, B) : legalizeTrapHsaQueuePtr(MI, MRI, B);
}

bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock &BB = B.getMBB();
  MachineFunction *MF = BB.getParent();

  if (BB.succ_empty() && std::next(MI.getIterator()) == BB.end()) {
    BuildMI(BB, BB.end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
      .addImm(0);
    MI.eraseFromParent();
    return true;
  }

  // We need a block split to make the real endpgm a terminator. We also don't
  // want to break phis in successor blocks, so we can't just delete to the
  // end of the block.
  BB.splitAt(MI, false /*UpdateLiveIns*/);
  MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
  MF->push_back(TrapBB);
  BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
    .addImm(0);
  BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))
    .addMBB(TrapBB);

  BB.addSuccessor(TrapBB);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const LLT S64 = LLT::scalar(64);

  Register SGPR01(AMDGPU::SGPR0_SGPR1);
  // For code object version 5, queue_ptr is passed through implicit kernarg.
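  // The queue pointer lives at a fixed offset in the implicit kernel argument
  // block, so load it from the kernarg segment pointer and hand it to the
  // trap handler in SGPR0_SGPR1, as the AMDHSA trap handler ABI expects.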
  if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >=
      AMDGPU::AMDHSA_COV5) {
    AMDGPUTargetLowering::ImplicitParameter Param =
        AMDGPUTargetLowering::QUEUE_PTR;
    uint64_t Offset =
        ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);

    Register KernargPtrReg = MRI.createGenericVirtualRegister(
        LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

    if (!loadInputValue(KernargPtrReg, B,
                        AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
      return false;

    // TODO: can we be smarter about machine pointer info?
    MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
    MachineMemOperand *MMO = MF.getMachineMemOperand(
        PtrInfo,
        MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
            MachineMemOperand::MOInvariant,
        LLT::scalar(64), commonAlignment(Align(64), Offset));

    // Pointer address
    Register LoadAddr = MRI.createGenericVirtualRegister(
        LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
    B.buildPtrAdd(LoadAddr, KernargPtrReg,
                  B.buildConstant(LLT::scalar(64), Offset).getReg(0));
    // Load address
    Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
    B.buildCopy(SGPR01, Temp);
    B.buildInstr(AMDGPU::S_TRAP)
        .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
        .addReg(SGPR01, RegState::Implicit);
    MI.eraseFromParent();
    return true;
  }

  // Pass queue pointer to trap handler as input, and insert trap instruction
  // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
  Register LiveIn =
      MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
  if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
    return false;

  B.buildCopy(SGPR01, LiveIn);
  B.buildInstr(AMDGPU::S_TRAP)
      .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
      .addReg(SGPR01, RegState::Implicit);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeTrapHsa(MachineInstr &MI,
                                          MachineRegisterInfo &MRI,
                                          MachineIRBuilder &B) const {
  // We need to simulate the 's_trap 2' instruction on targets that run in
  // PRIV=1 (where it is treated as a nop).
  if (ST.hasPrivEnabledTrap2NopBug()) {
    ST.getInstrInfo()->insertSimulatedTrap(MRI, B.getMBB(), MI,
                                           MI.getDebugLoc());
    MI.eraseFromParent();
    return true;
  }

  B.buildInstr(AMDGPU::S_TRAP)
      .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
  MI.eraseFromParent();
  return true;
}
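
// llvm.debugtrap only emits an s_trap when the HSA trap handler is actually
// available; otherwise the intrinsic is dropped with a warning rather than
// failing legalization.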
bool AMDGPULegalizerInfo::legalizeDebugTrap(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  // If this is a non-HSA path or the trap handler is disabled, report a
  // warning accordingly.
  if (!ST.isTrapHandlerEnabled() ||
      ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
    DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
                                     "debugtrap handler not supported",
                                     MI.getDebugLoc(), DS_Warning);
    LLVMContext &Ctx = B.getMF().getFunction().getContext();
    Ctx.diagnose(NoTrap);
  } else {
    // Insert debug-trap instruction.
    B.buildInstr(AMDGPU::S_TRAP)
        .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
                                               MachineIRBuilder &B) const {
  MachineRegisterInfo &MRI = *B.getMRI();
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT V2S16 = LLT::fixed_vector(2, 16);
  const LLT V3S32 = LLT::fixed_vector(3, 32);

  Register DstReg = MI.getOperand(0).getReg();
  Register NodePtr = MI.getOperand(2).getReg();
  Register RayExtent = MI.getOperand(3).getReg();
  Register RayOrigin = MI.getOperand(4).getReg();
  Register RayDir = MI.getOperand(5).getReg();
  Register RayInvDir = MI.getOperand(6).getReg();
  Register TDescr = MI.getOperand(7).getReg();

  if (!ST.hasGFX10_AEncoding()) {
    DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
                                        "intrinsic not supported on subtarget",
                                        MI.getDebugLoc());
    B.getMF().getFunction().getContext().diagnose(BadIntrin);
    return false;
  }

  const bool IsGFX11 = AMDGPU::isGFX11(ST);
  const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
  const bool IsGFX12Plus = AMDGPU::isGFX12Plus(ST);
  const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
  const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
  const unsigned NumVDataDwords = 4;
  const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
  const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
  const bool UseNSA =
      IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());

  const unsigned BaseOpcodes[2][2] = {
      {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
      {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
       AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
  int Opcode;
  if (UseNSA) {
    Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
                                   IsGFX12Plus ? AMDGPU::MIMGEncGfx12
                                   : IsGFX11   ? AMDGPU::MIMGEncGfx11NSA
                                               : AMDGPU::MIMGEncGfx10NSA,
                                   NumVDataDwords, NumVAddrDwords);
  } else {
    assert(!IsGFX12Plus);
    Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
                                   IsGFX11 ? AMDGPU::MIMGEncGfx11Default
                                           : AMDGPU::MIMGEncGfx10Default,
                                   NumVDataDwords, NumVAddrDwords);
  }
  assert(Opcode != -1);
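
  // Gather the VADDR operands. On GFX11+ with NSA, the ray origin, direction,
  // and inverse direction each become a packed <3 x s32> operand (with A16,
  // the direction and inverse direction are interleaved into <2 x s16>
  // halves); otherwise the individual dwords are collected and, without NSA,
  // merged into a single contiguous vector operand at the end.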
  SmallVector<Register, 12> Ops;
  if (UseNSA && IsGFX11Plus) {
    auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
      auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
      auto Merged = B.buildMergeLikeInstr(
          V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
      Ops.push_back(Merged.getReg(0));
    };

    Ops.push_back(NodePtr);
    Ops.push_back(RayExtent);
    packLanes(RayOrigin);

    if (IsA16) {
      auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
      auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
      auto MergedDir = B.buildMergeLikeInstr(
          V3S32,
          {B.buildBitcast(
                S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0),
                                                   UnmergeRayDir.getReg(0)}))
               .getReg(0),
           B.buildBitcast(
                S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1),
                                                   UnmergeRayDir.getReg(1)}))
               .getReg(0),
           B.buildBitcast(
                S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2),
                                                   UnmergeRayDir.getReg(2)}))
               .getReg(0)});
      Ops.push_back(MergedDir.getReg(0));
    } else {
      packLanes(RayDir);
      packLanes(RayInvDir);
    }
  } else {
    if (Is64) {
      auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
      Ops.push_back(Unmerge.getReg(0));
      Ops.push_back(Unmerge.getReg(1));
    } else {
      Ops.push_back(NodePtr);
    }
    Ops.push_back(RayExtent);

    auto packLanes = [&Ops, &S32, &B](Register Src) {
      auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
      Ops.push_back(Unmerge.getReg(0));
      Ops.push_back(Unmerge.getReg(1));
      Ops.push_back(Unmerge.getReg(2));
    };

    packLanes(RayOrigin);
    if (IsA16) {
      auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
      auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
      Register R1 = MRI.createGenericVirtualRegister(S32);
      Register R2 = MRI.createGenericVirtualRegister(S32);
      Register R3 = MRI.createGenericVirtualRegister(S32);
      B.buildMergeLikeInstr(R1,
                            {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
      B.buildMergeLikeInstr(
          R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
      B.buildMergeLikeInstr(
          R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
      Ops.push_back(R1);
      Ops.push_back(R2);
      Ops.push_back(R3);
    } else {
      packLanes(RayDir);
      packLanes(RayInvDir);
    }
  }

  if (!UseNSA) {
    // Build a single vector containing all the operands so far prepared.
    LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
    Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0);
    Ops.clear();
    Ops.push_back(MergedOps);
  }

  auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
    .addDef(DstReg)
    .addImm(Opcode);

  for (Register R : Ops) {
    MIB.addUse(R);
  }

  MIB.addUse(TDescr)
     .addImm(IsA16 ? 1 : 0)
     .cloneMemRefs(MI);

  MI.eraseFromParent();
  return true;
}
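
// fptrunc.round only has target pseudos for the upward and downward rounding
// modes; any other requested mode is rejected here and fails legalization.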
bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI,
                                               MachineIRBuilder &B) const {
  unsigned Opc;
  int RoundMode = MI.getOperand(2).getImm();

  if (RoundMode == (int)RoundingMode::TowardPositive)
    Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD;
  else if (RoundMode == (int)RoundingMode::TowardNegative)
    Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD;
  else
    return false;

  B.buildInstr(Opc)
      .addDef(MI.getOperand(0).getReg())
      .addUse(MI.getOperand(1).getReg());

  MI.eraseFromParent();

  return true;
}

bool AMDGPULegalizerInfo::legalizeStackSave(MachineInstr &MI,
                                            MachineIRBuilder &B) const {
  const SITargetLowering *TLI = ST.getTargetLowering();
  Register StackPtr = TLI->getStackPointerRegisterToSaveRestore();
  Register DstReg = MI.getOperand(0).getReg();
  B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeWaveID(MachineInstr &MI,
                                         MachineIRBuilder &B) const {
  // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
  if (!ST.hasArchitectedSGPRs())
    return false;
  LLT S32 = LLT::scalar(32);
  Register DstReg = MI.getOperand(0).getReg();
  auto TTMP8 = B.buildCopy(S32, Register(AMDGPU::TTMP8));
  auto LSB = B.buildConstant(S32, 25);
  auto Width = B.buildConstant(S32, 5);
  B.buildUbfx(DstReg, TTMP8, LSB, Width);
  MI.eraseFromParent();
  return true;
}

static constexpr unsigned FPEnvModeBitField =
    AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 23);

static constexpr unsigned FPEnvTrapBitField =
    AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_TRAPSTS, 0, 5);

bool AMDGPULegalizerInfo::legalizeGetFPEnv(MachineInstr &MI,
                                           MachineRegisterInfo &MRI,
                                           MachineIRBuilder &B) const {
  Register Src = MI.getOperand(0).getReg();
  if (MRI.getType(Src) != S64)
    return false;

  auto ModeReg =
      B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
                       /*HasSideEffects=*/true, /*isConvergent=*/false)
          .addImm(FPEnvModeBitField);
  auto TrapReg =
      B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
                       /*HasSideEffects=*/true, /*isConvergent=*/false)
          .addImm(FPEnvTrapBitField);
  B.buildMergeLikeInstr(Src, {ModeReg, TrapReg});
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeSetFPEnv(MachineInstr &MI,
                                           MachineRegisterInfo &MRI,
                                           MachineIRBuilder &B) const {
  Register Src = MI.getOperand(0).getReg();
  if (MRI.getType(Src) != S64)
    return false;

  auto Unmerge = B.buildUnmerge({S32, S32}, MI.getOperand(0));
  B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
                   /*HasSideEffects=*/true, /*isConvergent=*/false)
      .addImm(static_cast<int16_t>(FPEnvModeBitField))
      .addReg(Unmerge.getReg(0));
  B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
                   /*HasSideEffects=*/true, /*isConvergent=*/false)
      .addImm(static_cast<int16_t>(FPEnvTrapBitField))
      .addReg(Unmerge.getReg(1));
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
                                            MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  // Replace the G_BRCOND that consumes the intrinsic result with the exec
  // manipulation and branch pseudos.
  auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
          .addDef(Def)
          .addUse(Use)
          .addMBB(UncondBrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
          .addDef(Def)
          .addUse(Use)
          .addMBB(UncondBrTarget);
      }

      if (Br) {
        Br->getOperand(0).setMBB(CondBrTarget);
      } else {
        // The IRTranslator skips inserting the G_BR for fallthrough cases, but
        // since we're swapping branch targets it needs to be reinserted.
        // FIXME: IRTranslator should probably not do this
        B.buildBr(*CondBrTarget);
      }

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
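  // amdgcn.loop is handled the same way: the G_BRCOND on the intrinsic result
  // becomes an SI_LOOP pseudo, with the branch targets swapped if the
  // condition was negated.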
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      Register Reg = MI.getOperand(2).getReg();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(UncondBrTarget);

      if (Br)
        Br->getOperand(0).setMBB(CondBrTarget);
      else
        B.buildBr(*CondBrTarget);

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_addrspacecast_nonnull:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case Intrinsic::amdgcn_make_buffer_rsrc:
    return legalizePointerAsRsrcIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_wave_id:
    return legalizeWaveID(MI, B);
  case Intrinsic::amdgcn_lds_kernel_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::r600_read_ngroups_x:
    // TODO: Emit error for hsa
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_X);
  case Intrinsic::r600_read_ngroups_y:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_Y);
  case Intrinsic::r600_read_ngroups_z:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_Z);
  case Intrinsic::r600_read_local_size_x:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_X);
  case Intrinsic::r600_read_local_size_y:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Y);
  case Intrinsic::r600_read_local_size_z:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Z);
  case Intrinsic::r600_read_global_size_x:
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_X);
  case Intrinsic::r600_read_global_size_y:
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Y);
  case Intrinsic::r600_read_global_size_z:
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Z);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(Helper, MI);
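  // The buffer load/store families below share a small set of legalizers;
  // the boolean arguments distinguish the plain, format, and typed (tbuffer)
  // variants of the intrinsics.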
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_raw_ptr_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
  case Intrinsic::amdgcn_struct_ptr_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_buffer_load:
  case Intrinsic::amdgcn_raw_atomic_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_rsq_clamp:
    return legalizeRsqClampIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
    return legalizeBVHIntrinsic(MI, B);
  case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
  case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
    Register Index = MI.getOperand(5).getReg();
    LLT S32 = LLT::scalar(32);
    if (MRI.getType(Index) != S32)
      MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0));
    return true;
  }
  case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
  case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
  case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
    Register Index = MI.getOperand(7).getReg();
    LLT S32 = LLT::scalar(32);
    if (MRI.getType(Index) != S32)
      MI.getOperand(7).setReg(B.buildAnyExt(S32, Index).getReg(0));
    return true;
  }
  case Intrinsic::amdgcn_fmed3: {
    GISelChangeObserver &Observer = Helper.Observer;

    // FIXME: This is to work around the inability of tablegen match combiners
    // to match intrinsics in patterns.
    Observer.changingInstr(MI);
    MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
    MI.removeOperand(1);
    Observer.changedInstr(MI);
    return true;
  }
  case Intrinsic::amdgcn_readlane:
  case Intrinsic::amdgcn_writelane:
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlanex16:
  case Intrinsic::amdgcn_permlane64:
    return legalizeLaneOp(Helper, MI, IntrID);
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}